Merge pull request #2311 from stanmoore1/kk_update_3.2
Update Kokkos library in LAMMPS to v3.2
commit d00807ee9a
@@ -35,8 +35,8 @@ if(DOWNLOAD_KOKKOS)
     list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}")
   include(ExternalProject)
   ExternalProject_Add(kokkos_build
-    URL https://github.com/kokkos/kokkos/archive/3.1.01.tar.gz
-    URL_MD5 3ccb2100f7fc316891e7dad3bc33fa37
+    URL https://github.com/kokkos/kokkos/archive/3.2.00.tar.gz
+    URL_MD5 81569170fe232e5e64ab074f7cca5e50
     CMAKE_ARGS ${KOKKOS_LIB_BUILD_ARGS}
     BUILD_BYPRODUCTS <INSTALL_DIR>/lib/libkokkoscore.a
   )
@@ -50,7 +50,7 @@ if(DOWNLOAD_KOKKOS)
   target_link_libraries(lammps PRIVATE LAMMPS::KOKKOS)
   add_dependencies(LAMMPS::KOKKOS kokkos_build)
 elseif(EXTERNAL_KOKKOS)
-  find_package(Kokkos 3.1.01 REQUIRED CONFIG)
+  find_package(Kokkos 3.2.00 REQUIRED CONFIG)
   target_link_libraries(lammps PRIVATE Kokkos::kokkos)
 else()
   set(LAMMPS_LIB_KOKKOS_SRC_DIR ${LAMMPS_LIB_SOURCE_DIR}/kokkos)
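For orientation, the `EXTERNAL_KOKKOS` branch above is the standard modern-CMake consumption pattern. A minimal downstream sketch, assuming a placeholder project `my_app` with a single `main.cpp` (names not taken from the LAMMPS sources), would look like this:

```cmake
# Minimal sketch of the EXTERNAL_KOKKOS path shown in the hunk above:
# require the Kokkos release this PR pins (3.2.00) and let the imported
# Kokkos::kokkos target propagate all compile and link flags.
cmake_minimum_required(VERSION 3.12)
project(my_app CXX)                      # placeholder project name

find_package(Kokkos 3.2.00 REQUIRED CONFIG)

add_executable(my_app main.cpp)          # placeholder source file
target_link_libraries(my_app PRIVATE Kokkos::kokkos)
```

Pointing CMake at a non-default install is then just a matter of passing `-DKokkos_ROOT=<prefix>` (or extending `CMAKE_PREFIX_PATH`) at configure time.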
@@ -361,9 +361,12 @@ be specified in uppercase.
 * - AMDAVX
   - HOST
   - AMD 64-bit x86 CPU (AVX 1)
-* - EPYC
+* - ZEN
   - HOST
-  - AMD EPYC Zen class CPU (AVX 2)
+  - AMD Zen class CPU (AVX 2)
+* - ZEN2
+  - HOST
+  - AMD Zen2 class CPU (AVX 2)
 * - ARMV80
   - HOST
   - ARMv8.0 Compatible CPU
@@ -445,12 +448,18 @@ be specified in uppercase.
 * - TURING75
   - GPU
   - NVIDIA Turing generation CC 7.5 GPU
+* - AMPERE80
+  - GPU
+  - NVIDIA Ampere generation CC 8.0 GPU
 * - VEGA900
   - GPU
   - AMD GPU MI25 GFX900
 * - VEGA906
   - GPU
   - AMD GPU MI50/MI60 GFX906
+* - INTEL_GEN
+  - GPU
+  - Intel GPUs Gen9+
 
 Basic CMake build settings:
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
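To make the keyword-to-option mapping concrete: each uppercase table entry corresponds to a `Kokkos_ARCH_<KEYWORD>` CMake option (the option list appears further down in this diff). The sketch below is a hypothetical initial-cache file, not part of this PR, that enables one of the newly documented host entries and the new Ampere GPU entry:

```cmake
# kokkos_arch.cmake -- hypothetical initial-cache file, used e.g. as
#   cmake -C kokkos_arch.cmake ${srcdir} ...
# Host target from the table above (AMD Zen2 class CPU, AVX 2).
set(Kokkos_ARCH_ZEN2 ON CACHE BOOL "Optimize for AMD Zen2 CPUs")
# GPU target from the table above (NVIDIA Ampere, compute capability 8.0);
# a GPU architecture only takes effect together with the matching backend.
set(Kokkos_ENABLE_CUDA ON CACHE BOOL "Build the CUDA backend")
set(Kokkos_ARCH_AMPERE80 ON CACHE BOOL "Optimize for NVIDIA Ampere CC 8.0 GPUs")
```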
@@ -10,33 +10,45 @@ for C++. Applications heavily leveraging Kokkos are strongly encouraged to use
 You can either use Kokkos as an installed package (encouraged) or use Kokkos in-tree in your project.
 Modern CMake is exceedingly simple at a high-level (with the devil in the details).
 Once Kokkos is installed In your `CMakeLists.txt` simply use:
-````
+````cmake
 find_package(Kokkos REQUIRED)
 ````
 Then for every executable or library in your project:
-````
+````cmake
 target_link_libraries(myTarget Kokkos::kokkos)
 ````
 That's it! There is no checking Kokkos preprocessor, compiler, or linker flags.
 Kokkos propagates all the necessary flags to your project.
 This means not only is linking to Kokkos easy, but Kokkos itself can actually configure compiler and linker flags for *your*
-project. If building in-tree, there is no `find_package` and you link with `target_link_libraries(kokkos)`.
+project.
+When configuring your project just set:
+````bash
+> cmake ${srcdir} \
+  -DKokkos_ROOT=${kokkos_install_prefix} \
+  -DCMAKE_CXX_COMPILER=${compiler_used_to_build_kokkos}
+````
+Note: You may need the following if using some versions of CMake (e.g. 3.12):
+````cmake
+cmake_policy(SET CMP0074 NEW)
+````
+If building in-tree, there is no `find_package`. You can use `add_subdirectory(kokkos)` with the Kokkos source and again just link with `target_link_libraries(Kokkos::kokkos)`.
+The examples in `examples/cmake_build_installed` and `examples/cmake_build_in_tree` can help get you started.
 
 
 ## Configuring CMake
-A very basic installation is done with:
-````
-cmake ${srcdir} \
+A very basic installation of Kokkos is done with:
+````bash
+> cmake ${srcdir} \
   -DCMAKE_CXX_COMPILER=g++ \
-  -DCMAKE_INSTALL_PREFIX=${my_install_folder}
+  -DCMAKE_INSTALL_PREFIX=${kokkos_install_folder}
 ````
 which builds and installed a default Kokkos when you run `make install`.
 There are numerous device backends, options, and architecture-specific optimizations that can be configured, e.g.
-````
-cmake ${srcdir} \
+````bash
+> cmake ${srcdir} \
   -DCMAKE_CXX_COMPILER=g++ \
-  -DCMAKE_INSTALL_PREFIX=${my_install_folder} \
-  -DKokkos_ENABLE_OPENMP=On
+  -DCMAKE_INSTALL_PREFIX=${kokkos_install_folder} \
+  -DKokkos_ENABLE_OPENMP=ON
 ````
 which activates the OpenMP backend. All of the options controlling device backends, options, architectures, and third-party libraries (TPLs) are given below.
 
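The in-tree route described in the added README text can be sketched as follows; the vendored `kokkos/` subdirectory and the `my_app` target are assumptions for illustration, not files touched by this PR:

```cmake
# Sketch of the in-tree (add_subdirectory) route described above.
# Assumes a Kokkos source checkout has been placed in ./kokkos.
cmake_minimum_required(VERSION 3.12)
project(my_app CXX)

add_subdirectory(kokkos)          # builds Kokkos as part of this project

add_executable(my_app main.cpp)
target_link_libraries(my_app Kokkos::kokkos)
```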
@@ -50,16 +62,16 @@ which activates the OpenMP backend. All of the options controlling device backen
 ## Spack
 An alternative to manually building with the CMake is to use the Spack package manager.
 To do so, download the `kokkos-spack` git repo and add to the package list:
-````
-spack repo add $path-to-kokkos-spack
+````bash
+> spack repo add $path-to-kokkos-spack
 ````
 A basic installation would be done as:
-````
-spack install kokkos
+````bash
+> spack install kokkos
 ````
 Spack allows options and and compilers to be tuned in the install command.
-````
-spack install kokkos@3.0 %gcc@7.3.0 +openmp
+````bash
+> spack install kokkos@3.0 %gcc@7.3.0 +openmp
 ````
 This example illustrates the three most common parameters to Spack:
 * Variants: specified with, e.g. `+openmp`, this activates (or deactivates with, e.g. `~openmp`) certain options.
@@ -67,17 +79,17 @@ This example illustrates the three most common parameters to Spack:
 * Compiler: a default compiler will be chosen if not specified, but an exact compiler version can be given with the `%`option.
 
 For a complete list of Kokkos options, run:
+````bash
+> spack info kokkos
 ````
-spack info kokkos
-````
-More details can be found in the kokkos-spack repository [README](https://github.com/kokkos/kokkos-spack/blob/master/README.md).
+More details can be found in the [Spack README](Spack.md)
 
 #### Spack Development
 Spack currently installs packages to a location determined by a unique hash. This hash name is not really "human readable".
 Generally, Spack usage should never really require you to reference the computer-generated unique install folder.
 If you must know, you can locate Spack Kokkos installations with:
-````
-spack find -p kokkos ...
+````bash
+> spack find -p kokkos ...
 ````
 where `...` is the unique spec identifying the particular Kokkos configuration and version.
 
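One way to tie the Spack instructions above back to the `Kokkos_ROOT`/`CMP0074` notes earlier in this README diff is sketched below; `KOKKOS_PREFIX` is a placeholder for the install path reported by `spack find -p kokkos ...`, not a variable defined by Kokkos or Spack:

```cmake
# Sketch: pointing find_package() at a Spack-installed Kokkos.
cmake_policy(SET CMP0074 NEW)            # let <Pkg>_ROOT variables steer find_package
set(Kokkos_ROOT "$ENV{KOKKOS_PREFIX}")   # or pass -DKokkos_ROOT=... on the command line
find_package(Kokkos REQUIRED CONFIG)
```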
@@ -102,8 +114,14 @@ Device backends can be enabled by specifying `-DKokkos_ENABLE_X`.
   * Whether to build Pthread backend
   * BOOL Default: OFF
 * Kokkos_ENABLE_SERIAL
-  * Whether to build serial backend
+  * Whether to build serial backend
   * BOOL Default: ON
+* Kokkos_ENABLE_HIP (Experimental)
+  * Whether to build HIP backend
+  * BOOL Default: OFF
+* Kokkos_ENABLE_OPENMPTARGET (Experimental)
+  * Whether to build the OpenMP target backend
+  * BOOL Default: OFF
 
 ## Enable Options
 Options can be enabled by specifying `-DKokkos_ENABLE_X`.
@@ -138,9 +156,6 @@ Options can be enabled by specifying `-DKokkos_ENABLE_X`.
 * Kokkos_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK
   * Debug check on dual views
   * BOOL Default: OFF
-* Kokkos_ENABLE_DEPRECATED_CODE
-  * Whether to enable deprecated code
-  * BOOL Default: OFF
 * Kokkos_ENABLE_EXAMPLES
   * Whether to enable building examples
   * BOOL Default: OFF
@@ -150,9 +165,6 @@ Options can be enabled by specifying `-DKokkos_ENABLE_X`.
 * Kokkos_ENABLE_LARGE_MEM_TESTS
   * Whether to perform extra large memory tests
   * BOOL_Default: OFF
-* Kokkos_ENABLE_PROFILING
-  * Whether to create bindings for profiling tools
-  * BOOL Default: ON
 * Kokkos_ENABLE_PROFILING_LOAD_PRINT
   * Whether to print information about which profiling tools gotloaded
   * BOOL Default: OFF
@@ -235,8 +247,11 @@ Architecture-specific optimizations can be enabled by specifying `-DKokkos_ARCH_
 * Kokkos_ARCH_BGQ
   * Whether to optimize for the BGQ architecture
   * BOOL Default: OFF
-* Kokkos_ARCH_EPYC
-  * Whether to optimize for the EPYC architecture
+* Kokkos_ARCH_ZEN
+  * Whether to optimize for the Zen architecture
   * BOOL Default: OFF
+* Kokkos_ARCH_ZEN2
+  * Whether to optimize for the Zen2 architecture
+  * BOOL Default: OFF
 * Kokkos_ARCH_HSW
   * Whether to optimize for the HSW architecture
 
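Since all of these switches are ordinary CMake cache options (`-DKokkos_ENABLE_X=ON/OFF` at configure time), they can also be collected in an initial-cache file. The sketch below is hypothetical and only uses option names from the lists above; note that, per the Makefile changes later in this commit, the OpenMPTarget backend additionally requires C++17:

```cmake
# kokkos_options.cmake -- hypothetical initial-cache file,
# used e.g. as `cmake -C kokkos_options.cmake ${srcdir}`.
set(Kokkos_ENABLE_SERIAL ON CACHE BOOL "Serial backend (default ON)")
set(Kokkos_ENABLE_OPENMP ON CACHE BOOL "OpenMP backend")
# The experimental backends documented above are opt-in:
set(Kokkos_ENABLE_HIP OFF CACHE BOOL "HIP backend (experimental)")
set(Kokkos_ENABLE_OPENMPTARGET OFF CACHE BOOL "OpenMP target backend (experimental, needs C++17)")
set(Kokkos_ENABLE_EXAMPLES OFF CACHE BOOL "Build Kokkos examples")
```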
@ -1,6 +1,113 @@
|
|||
# Change Log
|
||||
|
||||
## [3.1.1](https://github.com/kokkos/kokkos/tree/3.1.1) (2020-04-14)
|
||||
## [3.2.00](https://github.com/kokkos/kokkos/tree/3.2.00) (2020-08-19)
|
||||
[Full Changelog](https://github.com/kokkos/kokkos/compare/3.1.01...3.2.00)
|
||||
|
||||
**Implemented enhancements:**
|
||||
|
||||
- HIP:Enable stream in HIP [\#3163](https://github.com/kokkos/kokkos/issues/3163)
|
||||
- HIP:Add support for shuffle reduction for the HIP backend [\#3154](https://github.com/kokkos/kokkos/issues/3154)
|
||||
- HIP:Add implementations of missing HIPHostPinnedSpace methods for LAMMPS [\#3137](https://github.com/kokkos/kokkos/issues/3137)
|
||||
- HIP:Require HIP 3.5.0 or higher [\#3099](https://github.com/kokkos/kokkos/issues/3099)
|
||||
- HIP:WorkGraphPolicy for HIP [\#3096](https://github.com/kokkos/kokkos/issues/3096)
|
||||
- OpenMPTarget: Significant update to the new experimental backend. Requires C++17, works on Intel GPUs, reference counting fixes. [\#3169](https://github.com/kokkos/kokkos/issues/3169)
|
||||
- Windows Cuda support [\#3018](https://github.com/kokkos/kokkos/issues/3018)
|
||||
- Pass `-Wext-lambda-captures-this` to NVCC when support for `__host__ __device__` lambda is enabled from CUDA 11 [\#3241](https://github.com/kokkos/kokkos/issues/3241)
|
||||
- Use explicit staging buffer for constant memory kernel launches and cleanup host/device synchronization [\#3234](https://github.com/kokkos/kokkos/issues/3234)
|
||||
- Various fixup to policies including making TeamPolicy default constructible and making RangePolicy and TeamPolicy assignable 1: [\#3202](https://github.com/kokkos/kokkos/issues/3202)
|
||||
- Various fixup to policies including making TeamPolicy default constructible and making RangePolicy and TeamPolicy assignable 2: [\#3203](https://github.com/kokkos/kokkos/issues/3203)
|
||||
- Various fixup to policies including making TeamPolicy default constructible and making RangePolicy and TeamPolicy assignable 3: [\#3196](https://github.com/kokkos/kokkos/issues/3196)
|
||||
- Annotations for `DefaultExectutionSpace` and `DefaultHostExectutionSpace` to use in static analysis [\#3189](https://github.com/kokkos/kokkos/issues/3189)
|
||||
- Add documentation on using Spack to install Kokkos and developing packages that depend on Kokkos [\#3187](https://github.com/kokkos/kokkos/issues/3187)
|
||||
- Improve support for nvcc\_wrapper with exotic host compiler [\#3186](https://github.com/kokkos/kokkos/issues/3186)
|
||||
- Add OpenMPTarget backend flags for NVC++ compiler [\#3185](https://github.com/kokkos/kokkos/issues/3185)
|
||||
- Move deep\_copy/create\_mirror\_view on Experimental::OffsetView into Kokkos:: namespace [\#3166](https://github.com/kokkos/kokkos/issues/3166)
|
||||
- Allow for larger block size in HIP [\#3165](https://github.com/kokkos/kokkos/issues/3165)
|
||||
- View: Added names of Views to the different View initialize/free kernels [\#3159](https://github.com/kokkos/kokkos/issues/3159)
|
||||
- Cuda: Caching cudaFunctorAttributes and whether L1/Shmem prefer was set [\#3151](https://github.com/kokkos/kokkos/issues/3151)
|
||||
- BuildSystem: Provide an explicit default CMAKE\_BUILD\_TYPE [\#3131](https://github.com/kokkos/kokkos/issues/3131)
|
||||
- Cuda: Update CUDA occupancy calculation [\#3124](https://github.com/kokkos/kokkos/issues/3124)
|
||||
- Vector: Adding data() to Vector [\#3123](https://github.com/kokkos/kokkos/issues/3123)
|
||||
- BuildSystem: Add CUDA Ampere configuration support [\#3122](https://github.com/kokkos/kokkos/issues/3122)
|
||||
- General: Apply [[noreturn]] to Kokkos::abort when applicable [\#3106](https://github.com/kokkos/kokkos/issues/3106)
|
||||
- TeamPolicy: Validate storage level argument passed to TeamPolicy::set\_scratch\_size() [\#3098](https://github.com/kokkos/kokkos/issues/3098)
|
||||
- nvcc\_wrapper: send --cudart to nvcc instead of host compiler [\#3092](https://github.com/kokkos/kokkos/issues/3092)
|
||||
- BuildSystem: Make kokkos\_has\_string() function in Makefile.kokkos case insensitive [\#3091](https://github.com/kokkos/kokkos/issues/3091)
|
||||
- Modify KOKKOS\_FUNCTION macro for clang-tidy analysis [\#3087](https://github.com/kokkos/kokkos/issues/3087)
|
||||
- Move allocation profiling to allocate/deallocate calls [\#3084](https://github.com/kokkos/kokkos/issues/3084)
|
||||
- BuildSystem: FATAL\_ERROR when attempting in-source build [\#3082](https://github.com/kokkos/kokkos/issues/3082)
|
||||
- Change enums in ScatterView to types [\#3076](https://github.com/kokkos/kokkos/issues/3076)
|
||||
- HIP: Changes for new compiler/runtime [\#3067](https://github.com/kokkos/kokkos/issues/3067)
|
||||
- Extract and use get\_gpu [\#3061](https://github.com/kokkos/kokkos/issues/3061)
|
||||
- Extract and use get\_gpu [\#3048](https://github.com/kokkos/kokkos/issues/3048)
|
||||
- Add is\_allocated to View-like containers [\#3059](https://github.com/kokkos/kokkos/issues/3059)
|
||||
- Combined reducers for scalar references [\#3052](https://github.com/kokkos/kokkos/issues/3052)
|
||||
- Add configurable capacity for UniqueToken [\#3051](https://github.com/kokkos/kokkos/issues/3051)
|
||||
- Add installation testing [\#3034](https://github.com/kokkos/kokkos/issues/3034)
|
||||
- BuildSystem: Add -expt-relaxed-constexpr flag to nvcc\_wrapper [\#3021](https://github.com/kokkos/kokkos/issues/3021)
|
||||
- HIP: Add UniqueToken [\#3020](https://github.com/kokkos/kokkos/issues/3020)
|
||||
- Autodetect number of devices [\#3013](https://github.com/kokkos/kokkos/issues/3013)
|
||||
|
||||
|
||||
**Fixed bugs:**
|
||||
|
||||
- Check error code from `cudaStreamSynchronize` in CUDA fences [\#3255](https://github.com/kokkos/kokkos/issues/3255)
|
||||
- Fix issue with C++ standard flags when using `nvcc\_wrapper` with PGI [\#3254](https://github.com/kokkos/kokkos/issues/3254)
|
||||
- Add missing threadfence in lock-based atomics [\#3208](https://github.com/kokkos/kokkos/issues/3208)
|
||||
- Fix dedup of linker flags for shared lib on CMake <=3.12 [\#3176](https://github.com/kokkos/kokkos/issues/3176)
|
||||
- Fix memory leak with CUDA streams [\#3170](https://github.com/kokkos/kokkos/issues/3170)
|
||||
- BuildSystem: Fix OpenMP Target flags for Cray [\#3161](https://github.com/kokkos/kokkos/issues/3161)
|
||||
- ScatterView: fix for OpenmpTarget remove inheritance from reducers [\#3162](https://github.com/kokkos/kokkos/issues/3162)
|
||||
- BuildSystem: Set OpenMP flags according to host compiler [\#3127](https://github.com/kokkos/kokkos/issues/3127)
|
||||
- OpenMP: Fix logic for nested omp in partition\_master bug [\#3101](https://github.com/kokkos/kokkos/issues/3101)
|
||||
- BuildSystem: Fixes for Cuda/11 and c++17 [\#3085](https://github.com/kokkos/kokkos/issues/3085)
|
||||
- HIP: Fix print\_configuration [\#3080](https://github.com/kokkos/kokkos/issues/3080)
|
||||
- Conditionally define get\_gpu [\#3072](https://github.com/kokkos/kokkos/issues/3072)
|
||||
- Fix bounds for ranges in random number generator [\#3069](https://github.com/kokkos/kokkos/issues/3069)
|
||||
- Fix Cuda minor arch check [\#3035](https://github.com/kokkos/kokkos/issues/3035)
|
||||
|
||||
**Incompatibilities:**
|
||||
|
||||
- Remove ETI support [\#3157](https://github.com/kokkos/kokkos/issues/3157)
|
||||
- Remove KOKKOS\_INTERNAL\_ENABLE\_NON\_CUDA\_BACKEND [\#3147](https://github.com/kokkos/kokkos/issues/3147)
|
||||
- Remove core/unit\_test/config [\#3146](https://github.com/kokkos/kokkos/issues/3146)
|
||||
- Removed the preprocessor branch for KOKKOS\_ENABLE\_PROFILING [\#3115](https://github.com/kokkos/kokkos/issues/3115)
|
||||
- Disable profiling with MSVC [\#3066](https://github.com/kokkos/kokkos/issues/3066)
|
||||
|
||||
**Closed issues:**
|
||||
|
||||
- Silent error (Validate storage level arg to set_scratch_size) [\#3097](https://github.com/kokkos/kokkos/issues/3097)
|
||||
- Remove KOKKKOS\_ENABLE\_PROFILING Option [\#3095](https://github.com/kokkos/kokkos/issues/3095)
|
||||
- Cuda 11 -\> allow C++17 [\#3083](https://github.com/kokkos/kokkos/issues/3083)
|
||||
- In source build failure not explained [\#3081](https://github.com/kokkos/kokkos/issues/3081)
|
||||
- Allow naming of Views for initialization kernel [\#3070](https://github.com/kokkos/kokkos/issues/3070)
|
||||
- DefaultInit tests failing when using CTest resource allocation feature [\#3040](https://github.com/kokkos/kokkos/issues/3040)
|
||||
- Add installation testing. [\#3037](https://github.com/kokkos/kokkos/issues/3037)
|
||||
- nvcc\_wrapper needs to handle `-expt-relaxed-constexpr` flag [\#3017](https://github.com/kokkos/kokkos/issues/3017)
|
||||
- CPU core oversubscription warning on macOS with OpenMP backend [\#2996](https://github.com/kokkos/kokkos/issues/2996)
|
||||
- Default behavior of KOKKOS\_NUM\_DEVICES to use all devices available [\#2975](https://github.com/kokkos/kokkos/issues/2975)
|
||||
- Assert blocksize \> 0 [\#2974](https://github.com/kokkos/kokkos/issues/2974)
|
||||
- Add ability to assign kokkos profile function from executable [\#2973](https://github.com/kokkos/kokkos/issues/2973)
|
||||
- ScatterView Support for the pre/post increment operator [\#2967](https://github.com/kokkos/kokkos/issues/2967)
|
||||
|
||||
- Compiler issue: Cuda build with clang 10 has errors with the atomic unit tests [\#3237](https://github.com/kokkos/kokkos/issues/3237)
|
||||
- Incompatibility of flags for C++ standard with PGI v20.4 on Power9/NVIDIA V100 system [\#3252](https://github.com/kokkos/kokkos/issues/3252)
|
||||
- Error configuring as subproject [\#3140](https://github.com/kokkos/kokkos/issues/3140)
|
||||
- CMake fails with Nvidia compilers when the GPU architecture option is not supplied (Fix configure with OMPT and Cuda) [\#3207](https://github.com/kokkos/kokkos/issues/3207)
|
||||
- PGI compiler being passed the gcc -fopenmp flag [\#3125](https://github.com/kokkos/kokkos/issues/3125)
|
||||
- Cuda: Memory leak when using CUDA stream [\#3167](https://github.com/kokkos/kokkos/issues/3167)
|
||||
- RangePolicy has an implicitly deleted assignment operator [\#3192](https://github.com/kokkos/kokkos/issues/3192)
|
||||
- MemorySpace::allocate needs to have memory pool counting. [\#3064](https://github.com/kokkos/kokkos/issues/3064)
|
||||
- Missing write fence for lock based atomics on CUDA [\#3038](https://github.com/kokkos/kokkos/issues/3038)
|
||||
- CUDA compute capability version check problem [\#3026](https://github.com/kokkos/kokkos/issues/3026)
|
||||
- Make DynRankView fencing consistent [\#3014](https://github.com/kokkos/kokkos/issues/3014)
|
||||
- nvcc\_wrapper cant handle -Xcompiler -o out.o [\#2993](https://github.com/kokkos/kokkos/issues/2993)
|
||||
- Reductions of non-trivial types of size 4 fail in CUDA shfl operations [\#2990](https://github.com/kokkos/kokkos/issues/2990)
|
||||
- complex\_double misalignment in reduce, clang+CUDA [\#2989](https://github.com/kokkos/kokkos/issues/2989)
|
||||
- Span of degenerated \(zero-length\) subviews is not zero in some special cases [\#2979](https://github.com/kokkos/kokkos/issues/2979)
|
||||
- Rank 1 custom layouts dont work as expected. [\#2840](https://github.com/kokkos/kokkos/issues/2840)
|
||||
|
||||
## [3.1.01](https://github.com/kokkos/kokkos/tree/3.1.1) (2020-04-14)
|
||||
[Full Changelog](https://github.com/kokkos/kokkos/compare/3.1.00...3.1.1)
|
||||
|
||||
**Fixed bugs:**
|
||||
|
|
|
@ -1,4 +1,9 @@
|
|||
|
||||
# Disable in-source builds to prevent source tree corruption.
|
||||
if( "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}" )
|
||||
message( FATAL_ERROR "FATAL: In-source builds are not allowed. You should create a separate directory for build files." )
|
||||
endif()
|
||||
|
||||
# We want to determine if options are given with the wrong case
|
||||
# In order to detect which arguments are given to compare against
|
||||
# the list of valid arguments, at the beginning here we need to
|
||||
|
@ -34,6 +39,9 @@ IF(COMMAND TRIBITS_PACKAGE_DECL)
|
|||
ELSE()
|
||||
SET(KOKKOS_HAS_TRILINOS OFF)
|
||||
ENDIF()
|
||||
# Is this build a subdirectory of another project
|
||||
GET_DIRECTORY_PROPERTY(HAS_PARENT PARENT_DIRECTORY)
|
||||
|
||||
|
||||
INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_functions.cmake)
|
||||
INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_pick_cxx_std.cmake)
|
||||
|
@ -75,16 +83,17 @@ IF(NOT KOKKOS_HAS_TRILINOS)
|
|||
SET(CMAKE_CXX_COMPILER ${SPACK_CXX} CACHE STRING "the C++ compiler" FORCE)
|
||||
SET(ENV{CXX} ${SPACK_CXX})
|
||||
ENDIF()
|
||||
ENDif()
|
||||
IF(NOT DEFINED ${PROJECT_NAME})
|
||||
# WORKAROUND FOR HIPCC
|
||||
IF(Kokkos_ENABLE_HIP)
|
||||
SET(KOKKOS_INTERNAL_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
|
||||
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --amdgpu-target=gfx906")
|
||||
ENDIF()
|
||||
PROJECT(Kokkos CXX)
|
||||
IF(Kokkos_ENABLE_HIP)
|
||||
SET(CMAKE_CXX_FLAGS ${KOKKOS_INTERNAL_CMAKE_CXX_FLAGS})
|
||||
ENDIF()
|
||||
# Always call the project command to define Kokkos_ variables
|
||||
# and to make sure that C++ is an enabled language
|
||||
PROJECT(Kokkos CXX)
|
||||
IF(NOT HAS_PARENT)
|
||||
IF (NOT CMAKE_BUILD_TYPE)
|
||||
SET(DEFAULT_BUILD_TYPE "RelWithDebInfo")
|
||||
MESSAGE(STATUS "Setting build type to '${DEFAULT_BUILD_TYPE}' as none was specified.")
|
||||
SET(CMAKE_BUILD_TYPE "${DEFAULT_BUILD_TYPE}" CACHE STRING
|
||||
"Choose the type of build, options are: Debug, Release, RelWithDebInfo and MinSizeRel."
|
||||
FORCE)
|
||||
ENDIF()
|
||||
ENDIF()
|
||||
ENDIF()
|
||||
|
@ -102,8 +111,8 @@ ENDIF()
|
|||
|
||||
|
||||
set(Kokkos_VERSION_MAJOR 3)
|
||||
set(Kokkos_VERSION_MINOR 1)
|
||||
set(Kokkos_VERSION_PATCH 1)
|
||||
set(Kokkos_VERSION_MINOR 2)
|
||||
set(Kokkos_VERSION_PATCH 0)
|
||||
set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}")
|
||||
math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}")
|
||||
|
||||
|
@ -147,6 +156,7 @@ INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_tribits.cmake)
|
|||
# Check the environment and set certain variables
|
||||
# to allow platform-specific checks
|
||||
INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_check_env.cmake)
|
||||
|
||||
# The build environment setup goes in the following steps
|
||||
# 1) Check all the enable options. This includes checking Kokkos_DEVICES
|
||||
# 2) Check the compiler ID (type and version)
|
||||
|
@ -169,7 +179,6 @@ SET(KOKKOS_EXT_LIBRARIES Kokkos::kokkos Kokkos::kokkoscore Kokkos::kokkoscontain
|
|||
SET(KOKKOS_INT_LIBRARIES kokkos kokkoscore kokkoscontainers kokkosalgorithms)
|
||||
SET_PROPERTY(GLOBAL PROPERTY KOKKOS_INT_LIBRARIES ${KOKKOS_INT_LIBRARIES})
|
||||
|
||||
GET_DIRECTORY_PROPERTY(HAS_PARENT PARENT_DIRECTORY)
|
||||
IF (KOKKOS_HAS_TRILINOS)
|
||||
SET(TRILINOS_INCDIR ${CMAKE_INSTALL_PREFIX}/${${PROJECT_NAME}_INSTALL_INCLUDE_DIR})
|
||||
SET(KOKKOS_HEADER_DIR ${TRILINOS_INCDIR})
|
||||
|
@ -203,7 +212,7 @@ IF (KOKKOS_HAS_TRILINOS)
|
|||
SET(KOKKOSCORE_XCOMPILER_OPTIONS "${KOKKOSCORE_XCOMPILER_OPTIONS} -Xcompiler ${XCOMP_FLAG}")
|
||||
LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcompiler ${XCOMP_FLAG})
|
||||
ENDFOREACH()
|
||||
SET(KOKKOSCORE_CXX_FLAGS "${KOKKOSCORE_COMPILE_OPTIONS} ${CMAKE_CXX${KOKKOS_CXX_STANDARD}_STANDARD_COMPILE_OPTION} ${KOKKOSCORE_XCOMPILER_OPTIONS}")
|
||||
SET(KOKKOSCORE_CXX_FLAGS "${KOKKOSCORE_COMPILE_OPTIONS} ${KOKKOSCORE_XCOMPILER_OPTIONS}")
|
||||
IF (KOKKOS_ENABLE_CUDA)
|
||||
STRING(REPLACE ";" " " KOKKOSCORE_CUDA_OPTIONS "${KOKKOS_CUDA_OPTIONS}")
|
||||
FOREACH(CUDAFE_FLAG ${KOKKOS_CUDAFE_OPTIONS})
|
||||
|
@ -246,7 +255,7 @@ KOKKOS_PACKAGE_POSTPROCESS()
|
|||
#We are ready to configure the header
|
||||
CONFIGURE_FILE(cmake/KokkosCore_config.h.in KokkosCore_config.h @ONLY)
|
||||
|
||||
IF (NOT KOKKOS_HAS_TRILINOS)
|
||||
IF (NOT KOKKOS_HAS_TRILINOS AND NOT Kokkos_INSTALL_TESTING)
|
||||
ADD_LIBRARY(kokkos INTERFACE)
|
||||
#Make sure in-tree projects can reference this as Kokkos::
|
||||
#to match the installed target names
|
||||
|
@ -262,8 +271,6 @@ INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_install.cmake)
|
|||
# If the argument of DESTINATION is a relative path, CMake computes it
|
||||
# as relative to ${CMAKE_INSTALL_PATH}.
|
||||
INSTALL(PROGRAMS ${CMAKE_CURRENT_SOURCE_DIR}/bin/nvcc_wrapper DESTINATION ${CMAKE_INSTALL_BINDIR})
|
||||
INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h" DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
|
||||
|
||||
# Finally - if we are a subproject - make sure the enabled devices are visible
|
||||
IF (HAS_PARENT)
|
||||
|
|
|
@ -11,20 +11,20 @@ CXXFLAGS += $(SHFLAGS)
|
|||
endif
|
||||
|
||||
KOKKOS_VERSION_MAJOR = 3
|
||||
KOKKOS_VERSION_MINOR = 1
|
||||
KOKKOS_VERSION_PATCH = 1
|
||||
KOKKOS_VERSION_MINOR = 2
|
||||
KOKKOS_VERSION_PATCH = 0
|
||||
KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc)
|
||||
|
||||
# Options: Cuda,HIP,ROCm,OpenMP,Pthread,Serial
|
||||
KOKKOS_DEVICES ?= "OpenMP"
|
||||
#KOKKOS_DEVICES ?= "Pthread"
|
||||
# Options:
|
||||
# Options:
|
||||
# Intel: KNC,KNL,SNB,HSW,BDW,SKX
|
||||
# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75
|
||||
# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80
|
||||
# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2
|
||||
# IBM: BGQ,Power7,Power8,Power9
|
||||
# AMD-GPUS: Vega900,Vega906
|
||||
# AMD-CPUS: AMDAVX,EPYC
|
||||
# AMD-CPUS: AMDAVX,Zen,Zen2
|
||||
KOKKOS_ARCH ?= ""
|
||||
# Options: yes,no
|
||||
KOKKOS_DEBUG ?= "no"
|
||||
|
@ -32,10 +32,8 @@ KOKKOS_DEBUG ?= "no"
|
|||
KOKKOS_USE_TPLS ?= ""
|
||||
# Options: c++11,c++14,c++1y,c++17,c++1z,c++2a
|
||||
KOKKOS_CXX_STANDARD ?= "c++11"
|
||||
# Options: aggressive_vectorization,disable_profiling,enable_deprecated_code,disable_deprecated_code,enable_large_mem_tests,disable_complex_align
|
||||
# Options: aggressive_vectorization,disable_profiling,enable_large_mem_tests,disable_complex_align
|
||||
KOKKOS_OPTIONS ?= ""
|
||||
# Option for setting ETI path
|
||||
KOKKOS_ETI_PATH ?= ${KOKKOS_PATH}/core/src/eti
|
||||
KOKKOS_CMAKE ?= "no"
|
||||
KOKKOS_TRIBITS ?= "no"
|
||||
KOKKOS_STANDALONE_CMAKE ?= "no"
|
||||
|
@ -74,6 +72,7 @@ KOKKOS_INTERNAL_ENABLE_CXX1Y := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),
|
|||
KOKKOS_INTERNAL_ENABLE_CXX17 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++17)
|
||||
KOKKOS_INTERNAL_ENABLE_CXX1Z := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++1z)
|
||||
KOKKOS_INTERNAL_ENABLE_CXX2A := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++2a)
|
||||
KOKKOS_INTERNAL_ENABLE_CXX20 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++20)
|
||||
|
||||
# Check for external libraries.
|
||||
KOKKOS_INTERNAL_USE_HWLOC := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),hwloc)
|
||||
|
@ -83,9 +82,7 @@ KOKKOS_INTERNAL_USE_MEMKIND := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),exper
|
|||
# Check for advanced settings.
|
||||
KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),compiler_warnings)
|
||||
KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(call kokkos_has_string,$(KOKKOS_OPTIONS),aggressive_vectorization)
|
||||
KOKKOS_INTERNAL_DISABLE_PROFILING := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_profiling)
|
||||
KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_deprecated_code)
|
||||
KOKKOS_INTERNAL_ENABLE_DEPRECATED_CODE := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_deprecated_code)
|
||||
KOKKOS_INTERNAL_ENABLE_TUNING := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_tuning)
|
||||
KOKKOS_INTERNAL_DISABLE_COMPLEX_ALIGN := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_complex_align)
|
||||
KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_dualview_modify_check)
|
||||
KOKKOS_INTERNAL_ENABLE_PROFILING_LOAD_PRINT := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_profile_load_print)
|
||||
|
@ -96,7 +93,6 @@ KOKKOS_INTERNAL_CUDA_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS
|
|||
KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_lambda)
|
||||
KOKKOS_INTERNAL_CUDA_USE_CONSTEXPR := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_constexpr)
|
||||
KOKKOS_INTERNAL_HPX_ENABLE_ASYNC_DISPATCH := $(call kokkos_has_string,$(KOKKOS_HPX_OPTIONS),enable_async_dispatch)
|
||||
KOKKOS_INTERNAL_ENABLE_ETI := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_eti)
|
||||
|
||||
KOKKOS_INTERNAL_HIP_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_HIP_OPTIONS),rdc)
|
||||
|
||||
|
@ -140,6 +136,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
|
|||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
|
||||
KOKKOS_DEVICELIST += OPENMPTARGET
|
||||
KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER := $(shell expr $(KOKKOS_INTERNAL_ENABLE_CXX17) \
|
||||
+ $(KOKKOS_INTERNAL_ENABLE_CXX20) \
|
||||
+ $(KOKKOS_INTERNAL_ENABLE_CXX2A))
|
||||
ifneq ($(KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER), 1)
|
||||
$(error OpenMPTarget backend requires C++17 or newer)
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
|
@ -281,7 +283,7 @@ endif
|
|||
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
||||
KOKKOS_INTERNAL_CXX11_FLAG := --c++11
|
||||
KOKKOS_INTERNAL_CXX14_FLAG := --c++14
|
||||
#KOKKOS_INTERNAL_CXX17_FLAG := --c++17
|
||||
KOKKOS_INTERNAL_CXX17_FLAG := --c++17
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
|
||||
KOKKOS_INTERNAL_CXX11_FLAG := -std=c++11
|
||||
|
@ -338,35 +340,27 @@ KOKKOS_INTERNAL_USE_ARCH_PASCAL60 := $(call kokkos_has_string,$(KOKKOS_ARCH),Pas
|
|||
KOKKOS_INTERNAL_USE_ARCH_VOLTA70 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volta70)
|
||||
KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volta72)
|
||||
KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75)
|
||||
KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80)
|
||||
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA70) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53))
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80))
|
||||
|
||||
#SEK: This seems like a bug to me
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
|
||||
KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(call kokkos_has_string,$(KOKKOS_ARCH),Maxwell)
|
||||
KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(call kokkos_has_string,$(KOKKOS_ARCH),Kepler)
|
||||
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA70) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53))
|
||||
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50))
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 1)
|
||||
|
@ -394,19 +388,20 @@ KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_
|
|||
|
||||
# AMD based.
|
||||
KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(call kokkos_has_string,$(KOKKOS_ARCH),AMDAVX)
|
||||
KOKKOS_INTERNAL_USE_ARCH_EPYC := $(call kokkos_has_string,$(KOKKOS_ARCH),EPYC)
|
||||
KOKKOS_INTERNAL_USE_ARCH_ZEN2 := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen2)
|
||||
KOKKOS_INTERNAL_USE_ARCH_ZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen)
|
||||
KOKKOS_INTERNAL_USE_ARCH_VEGA900 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega900)
|
||||
KOKKOS_INTERNAL_USE_ARCH_VEGA906 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega906)
|
||||
|
||||
# Any AVX?
|
||||
KOKKOS_INTERNAL_USE_ARCH_SSE42 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_EPYC))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNL))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SKX))
|
||||
|
||||
# Decide what ISA level we are able to support.
|
||||
KOKKOS_INTERNAL_USE_ISA_X86_64 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX) + $(KOKKOS_INTERNAL_USE_ARCH_EPYC))
|
||||
KOKKOS_INTERNAL_USE_ISA_X86_64 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2))
|
||||
KOKKOS_INTERNAL_USE_ISA_KNC := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNC))
|
||||
KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER8) + $(KOKKOS_INTERNAL_USE_ARCH_POWER9))
|
||||
KOKKOS_INTERNAL_USE_ISA_POWERPCBE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER7))
|
||||
|
@ -430,7 +425,7 @@ endif
|
|||
KOKKOS_CPPFLAGS =
|
||||
KOKKOS_LIBDIRS =
|
||||
ifneq ($(KOKKOS_CMAKE), yes)
|
||||
KOKKOS_CPPFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src -I$(KOKKOS_ETI_PATH)
|
||||
KOKKOS_CPPFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src
|
||||
endif
|
||||
KOKKOS_TPL_INCLUDE_DIRS =
|
||||
KOKKOS_TPL_LIBRARY_DIRS =
|
||||
|
@ -458,88 +453,91 @@ KOKKOS_CONFIG_HEADER=KokkosCore_config.h
|
|||
# Functions for generating config header file
|
||||
kokkos_append_header = $(shell echo $1 >> $(KOKKOS_INTERNAL_CONFIG_TMP))
|
||||
|
||||
# assign hash sign to variable for compat. with make 4.3
|
||||
H := \#
|
||||
|
||||
# Do not append first line
|
||||
tmp := $(shell echo "/* ---------------------------------------------" > KokkosCore_config.tmp)
|
||||
tmp := $(call kokkos_append_header,"Makefile constructed configuration:")
|
||||
tmp := $(call kokkos_append_header,"$(shell date)")
|
||||
tmp := $(call kokkos_append_header,"----------------------------------------------*/")
|
||||
|
||||
tmp := $(call kokkos_append_header,'\#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)')
|
||||
tmp := $(call kokkos_append_header,'\#error "Do not include $(KOKKOS_CONFIG_HEADER) directly; include Kokkos_Macros.hpp instead."')
|
||||
tmp := $(call kokkos_append_header,'\#else')
|
||||
tmp := $(call kokkos_append_header,'\#define KOKKOS_CORE_CONFIG_H')
|
||||
tmp := $(call kokkos_append_header,'\#endif')
|
||||
tmp := $(call kokkos_append_header,'$H''if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)')
|
||||
tmp := $(call kokkos_append_header,'$H''error "Do not include $(KOKKOS_CONFIG_HEADER) directly; include Kokkos_Macros.hpp instead."')
|
||||
tmp := $(call kokkos_append_header,'$H''else')
|
||||
tmp := $(call kokkos_append_header,'$H''define KOKKOS_CORE_CONFIG_H')
|
||||
tmp := $(call kokkos_append_header,'$H''endif')
|
||||
|
||||
tmp := $(call kokkos_append_header,"")
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_VERSION $(KOKKOS_VERSION)")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_VERSION $(KOKKOS_VERSION)")
|
||||
tmp := $(call kokkos_append_header,"")
|
||||
|
||||
|
||||
tmp := $(call kokkos_append_header,"/* Execution Spaces */")
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA")
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_COMPILER_CUDA_VERSION $(KOKKOS_INTERNAL_COMPILER_NVCC_VERSION)")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_COMPILER_CUDA_VERSION $(KOKKOS_INTERNAL_COMPILER_NVCC_VERSION)")
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
|
||||
tmp := $(call kokkos_append_header,'\#define KOKKOS_ENABLE_ROCM')
|
||||
tmp := $(call kokkos_append_header,'\#define KOKKOS_IMPL_ROCM_CLANG_WORKAROUND 1')
|
||||
tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_ROCM')
|
||||
tmp := $(call kokkos_append_header,'$H''define KOKKOS_IMPL_ROCM_CLANG_WORKAROUND 1')
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
|
||||
tmp := $(call kokkos_append_header,'\#define KOKKOS_ENABLE_HIP')
|
||||
tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_HIP')
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
|
||||
tmp := $(call kokkos_append_header,'\#define KOKKOS_ENABLE_OPENMPTARGET')
|
||||
tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_OPENMPTARGET')
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_WORKAROUND_OPENMPTARGET_GCC")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_WORKAROUND_OPENMPTARGET_GCC")
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
|
||||
tmp := $(call kokkos_append_header,'\#define KOKKOS_ENABLE_OPENMP')
|
||||
tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_OPENMP')
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_THREADS")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_THREADS")
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_HPX")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HPX")
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_SERIAL")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_SERIAL")
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_TM), 1)
|
||||
tmp := $(call kokkos_append_header,"\#ifndef __CUDA_ARCH__")
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_TM")
|
||||
tmp := $(call kokkos_append_header,"\#endif")
|
||||
tmp := $(call kokkos_append_header,"$H""ifndef __CUDA_ARCH__")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_TM")
|
||||
tmp := $(call kokkos_append_header,"$H""endif")
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ISA_X86_64), 1)
|
||||
tmp := $(call kokkos_append_header,"\#ifndef __CUDA_ARCH__")
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_USE_ISA_X86_64")
|
||||
tmp := $(call kokkos_append_header,"\#endif")
|
||||
tmp := $(call kokkos_append_header,"$H""ifndef __CUDA_ARCH__")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_USE_ISA_X86_64")
|
||||
tmp := $(call kokkos_append_header,"$H""endif")
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ISA_KNC), 1)
|
||||
tmp := $(call kokkos_append_header,"\#ifndef __CUDA_ARCH__")
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_USE_ISA_KNC")
|
||||
tmp := $(call kokkos_append_header,"\#endif")
|
||||
tmp := $(call kokkos_append_header,"$H""ifndef __CUDA_ARCH__")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_USE_ISA_KNC")
|
||||
tmp := $(call kokkos_append_header,"$H""endif")
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ISA_POWERPCLE), 1)
|
||||
tmp := $(call kokkos_append_header,"\#ifndef __CUDA_ARCH__")
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_USE_ISA_POWERPCLE")
|
||||
tmp := $(call kokkos_append_header,"\#endif")
|
||||
tmp := $(call kokkos_append_header,"$H""ifndef __CUDA_ARCH__")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_USE_ISA_POWERPCLE")
|
||||
tmp := $(call kokkos_append_header,"$H""endif")
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ISA_POWERPCBE), 1)
|
||||
tmp := $(call kokkos_append_header,"\#ifndef __CUDA_ARCH__")
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_USE_ISA_POWERPCBE")
|
||||
tmp := $(call kokkos_append_header,"\#endif")
|
||||
tmp := $(call kokkos_append_header,"$H""ifndef __CUDA_ARCH__")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_USE_ISA_POWERPCBE")
|
||||
tmp := $(call kokkos_append_header,"$H""endif")
|
||||
endif
|
||||
|
||||
#only add the c++ standard flags if this is not CMake
|
||||
|
@ -548,34 +546,39 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX11), 1)
|
|||
ifneq ($(KOKKOS_STANDALONE_CMAKE), yes)
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX11_FLAG)
|
||||
endif
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX11")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX11")
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX14), 1)
|
||||
ifneq ($(KOKKOS_STANDALONE_CMAKE), yes)
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX14_FLAG)
|
||||
endif
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX14")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX14")
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX1Y), 1)
|
||||
#I cannot make CMake add this in a good way - so add it here
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX1Y_FLAG)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX14")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX14")
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX17), 1)
|
||||
ifneq ($(KOKKOS_STANDALONE_CMAKE), yes)
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX17_FLAG)
|
||||
endif
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX17")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX17")
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX1Z), 1)
|
||||
#I cannot make CMake add this in a good way - so add it here
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX1Z_FLAG)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX17")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX17")
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX2A), 1)
|
||||
#I cannot make CMake add this in a good way - so add it here
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX2A_FLAG)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX20")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX20")
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX20), 1)
|
||||
#I cannot make CMake add this in a good way - so add it here
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX20_FLAG)
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX20")
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1)
|
||||
|
@ -585,20 +588,26 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1)
|
|||
|
||||
KOKKOS_CXXFLAGS += -g
|
||||
KOKKOS_LDFLAGS += -g
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK")
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_DEBUG")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_DEBUG")
|
||||
ifeq ($(KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK), 0)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK")
|
||||
endif
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_DISABLE_COMPLEX_ALIGN), 0)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_COMPLEX_ALIGN")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_COMPLEX_ALIGN")
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_PROFILING_LOAD_PRINT), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_PROFILING_LOAD_PRINT")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_PROFILING_LOAD_PRINT")
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_TUNING), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_TUNING")
|
||||
endif
|
||||
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_LIBDL")
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_HWLOC), 1)
|
||||
ifneq ($(KOKKOS_CMAKE), yes)
|
||||
ifneq ($(HWLOC_PATH),)
|
||||
|
@ -611,11 +620,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_HWLOC), 1)
|
|||
KOKKOS_LIBS += -lhwloc
|
||||
KOKKOS_TPL_LIBRARY_NAMES += hwloc
|
||||
endif
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_HWLOC")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HWLOC")
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_LIBRT), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_USE_LIBRT")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_USE_LIBRT")
|
||||
KOKKOS_LIBS += -lrt
|
||||
KOKKOS_TPL_LIBRARY_NAMES += rt
|
||||
endif
|
||||
|
@ -632,50 +641,36 @@ ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
|
|||
KOKKOS_LIBS += -lmemkind -lnuma
|
||||
KOKKOS_TPL_LIBRARY_NAMES += memkind numa
|
||||
endif
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_HBWSPACE")
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_DISABLE_PROFILING), 0)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_PROFILING")
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_HPX), 0)
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_DEPRECATED_CODE), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_DEPRECATED_CODE")
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_ETI")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HBWSPACE")
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_LARGE_MEM_TESTS), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_LARGE_MEM_TESTS")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_LARGE_MEM_TESTS")
|
||||
endif
|
||||
|
||||
tmp := $(call kokkos_append_header,"/* Optimization Settings */")
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION")
|
||||
endif
|
||||
|
||||
tmp := $(call kokkos_append_header,"/* Cuda Settings */")
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LDG), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA_LDG_INTRINSIC")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_LDG_INTRINSIC")
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA_LDG_INTRINSIC")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_LDG_INTRINSIC")
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA_UVM")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_UVM")
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_CUDA_USE_RELOC), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE")
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
|
||||
KOKKOS_CXXFLAGS += -fcuda-rdc
|
||||
KOKKOS_LDFLAGS += -fcuda-rdc
|
||||
|
@ -696,7 +691,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
|||
ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LAMBDA), 1)
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
|
||||
ifeq ($(shell test $(KOKKOS_INTERNAL_COMPILER_NVCC_VERSION) -gt 70; echo $$?),0)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA_LAMBDA")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_LAMBDA")
|
||||
KOKKOS_CXXFLAGS += -expt-extended-lambda
|
||||
else
|
||||
$(warning Warning: Cuda Lambda support was requested but NVCC version is too low. This requires NVCC for Cuda version 7.5 or higher. Disabling Lambda support now.)
|
||||
|
@ -704,14 +699,14 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
|||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA_LAMBDA")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_LAMBDA")
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_CUDA_USE_CONSTEXPR), 1)
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
|
||||
ifeq ($(shell test $(KOKKOS_INTERNAL_COMPILER_NVCC_VERSION) -ge 80; echo $$?),0)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA_CONSTEXPR")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_CONSTEXPR")
|
||||
KOKKOS_CXXFLAGS += -expt-relaxed-constexpr
|
||||
else
|
||||
$(warning Warning: Cuda relaxed constexpr support was requested but NVCC version is too low. This requires NVCC for Cuda version 8.0 or higher. Disabling relaxed constexpr support now.)
|
||||
|
@ -719,25 +714,25 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
|||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA_CONSTEXPR")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_CONSTEXPR")
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_IMPL_CUDA_CLANG_WORKAROUND")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_IMPL_CUDA_CLANG_WORKAROUND")
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
|
||||
ifeq ($(KOKKOS_INTERNAL_HPX_ENABLE_ASYNC_DISPATCH), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_HPX_ASYNC_DISPATCH")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HPX_ASYNC_DISPATCH")
|
||||
endif
|
||||
endif
|
||||
|
||||
# Add Architecture flags.
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV80), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ARMV80")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV80")
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
|
||||
KOKKOS_CXXFLAGS +=
|
||||
|
@ -754,7 +749,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV80), 1)
|
|||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV81), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ARMV81")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV81")
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
|
||||
KOKKOS_CXXFLAGS +=
|
||||
|
@ -770,9 +765,9 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV81), 1)
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_EPYC), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_AMD_EPYC")
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_AMD_AVX2")
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN), 1)
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_ZEN")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_AVX2")
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
|
||||
KOKKOS_CXXFLAGS += -mavx2
|
||||
|
@ -783,9 +778,22 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_EPYC), 1)
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN2), 1)
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_ZEN2")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_AVX2")
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
|
||||
KOKKOS_CXXFLAGS += -mavx2
|
||||
KOKKOS_LDFLAGS += -mavx2
|
||||
else
|
||||
KOKKOS_CXXFLAGS += -march=znver2 -mtune=znver2
|
||||
KOKKOS_LDFLAGS += -march=znver2 -mtune=znver2
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ARMV80")
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ARMV8_THUNDERX")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV80")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV8_THUNDERX")
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
|
||||
KOKKOS_CXXFLAGS +=
|
||||
|
@ -802,8 +810,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1)
|
|||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ARMV81")
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ARMV8_THUNDERX2")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV81")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV8_THUNDERX2")
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
|
||||
KOKKOS_CXXFLAGS +=
|
||||
|
@ -820,7 +828,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2), 1)
|
|||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SSE42), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_SSE42")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_SSE42")
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
|
||||
KOKKOS_CXXFLAGS += -xSSE4.2
|
||||
|
@ -842,7 +850,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SSE42), 1)
|
|||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_AVX")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX")
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
|
||||
KOKKOS_CXXFLAGS += -mavx
|
||||
|
@ -864,7 +872,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1)
|
|||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER7), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_POWER7")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_POWER7")
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
||||
|
||||
|
@ -876,7 +884,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER7), 1)
|
|||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_POWER8")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_POWER8")
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
||||
|
||||
|
@ -897,7 +905,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1)
|
|||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER9), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_POWER9")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_POWER9")
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
||||
|
||||
|
@ -918,7 +926,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER9), 1)
|
|||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HSW), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_AVX2")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX2")
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
|
||||
KOKKOS_CXXFLAGS += -xCORE-AVX2
|
||||
|
@ -940,7 +948,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HSW), 1)
|
|||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_BDW), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_AVX2")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX2")
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
|
||||
KOKKOS_CXXFLAGS += -xCORE-AVX2
|
||||
|
@ -962,7 +970,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_BDW), 1)
|
|||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_AVX512MIC")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX512MIC")
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
|
||||
KOKKOS_CXXFLAGS += -xMIC-AVX512
|
||||
|
@ -983,7 +991,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC), 1)
|
|||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_AVX512XEON")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX512XEON")
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
|
||||
KOKKOS_CXXFLAGS += -xCORE-AVX512
|
||||
|
@ -1004,7 +1012,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON), 1)
|
|||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KNC), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KNC")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KNC")
|
||||
KOKKOS_CXXFLAGS += -mmic
|
||||
KOKKOS_LDFLAGS += -mmic
|
||||
endif
|
||||
|
@ -1022,8 +1030,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1)
|
|||
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
|
||||
KOKKOS_INTERNAL_CUDA_ARCH_FLAG=-arch
|
||||
else ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
|
||||
KOKKOS_INTERNAL_CUDA_ARCH_FLAG=--cuda-gpu-arch
|
||||
KOKKOS_CXXFLAGS += -x cuda
|
||||
KOKKOS_INTERNAL_CUDA_ARCH_FLAG=--cuda-gpu-arch
|
||||
KOKKOS_CXXFLAGS += -x cuda
|
||||
else
|
||||
$(error Makefile.kokkos: CUDA is enabled but the compiler is neither NVCC nor Clang (got version string $(KOKKOS_CXX_VERSION)) )
|
||||
endif
|
||||
|
@ -1039,65 +1047,70 @@ endif
|
|||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1)
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER")
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER30")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER30")
|
||||
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_30
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER")
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER32")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER32")
|
||||
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_32
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER")
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER35")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER35")
|
||||
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_35
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER")
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER37")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER37")
|
||||
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_37
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_MAXWELL")
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_MAXWELL50")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL50")
|
||||
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_50
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_MAXWELL")
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_MAXWELL52")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL52")
|
||||
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_52
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_MAXWELL")
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_MAXWELL53")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL53")
|
||||
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_53
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL60), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_PASCAL")
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_PASCAL60")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL60")
|
||||
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_60
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_PASCAL")
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_PASCAL61")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL61")
|
||||
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_61
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA70), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_VOLTA")
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_VOLTA70")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA70")
|
||||
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_70
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA72), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_VOLTA")
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_VOLTA72")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA72")
|
||||
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_72
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_TURING75), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_TURING")
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_TURING75")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_TURING")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_TURING75")
|
||||
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_75
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE80), 1)
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE80")
|
||||
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_80
|
||||
endif
|
||||
|
||||
ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
|
||||
|
@ -1121,13 +1134,13 @@ endif
|
|||
ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
|
||||
# Lets start with adding architecture defines
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA900), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_HIP 900")
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_VEGA900")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HIP 900")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA900")
|
||||
KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx900
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA906), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_HIP 906")
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_VEGA906")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HIP 906")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA906")
|
||||
KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx906
|
||||
endif
|
||||
|
||||
|
@ -1138,7 +1151,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
|
|||
KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_HIP_ARCH_FLAG)
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_HIP_USE_RELOC), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE")
|
||||
KOKKOS_CXXFLAGS+=-fgpu-rdc
|
||||
KOKKOS_LDFLAGS+=-fgpu-rdc
|
||||
else
|
||||
|
@ -1171,9 +1184,6 @@ KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.cpp)
|
|||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cpp)
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
|
||||
KOKKOS_SRC += $(wildcard $(KOKKOS_ETI_PATH)/Cuda/*.cpp)
|
||||
endif
|
||||
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
|
||||
ifneq ($(CUDA_PATH),)
|
||||
KOKKOS_CPPLAGS += -I$(CUDA_PATH)/include
|
||||
|
@ -1211,9 +1221,6 @@ endif
|
|||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
|
||||
KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.cpp)
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
|
||||
KOKKOS_SRC += $(wildcard $(KOKKOS_ETI_PATH)/OpenMP/*.cpp)
|
||||
endif
|
||||
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.hpp)
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
|
||||
|
@ -1228,9 +1235,6 @@ endif
|
|||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
|
||||
KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.cpp)
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
|
||||
KOKKOS_SRC += $(wildcard $(KOKKOS_ETI_PATH)/Threads/*.cpp)
|
||||
endif
|
||||
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.hpp)
|
||||
KOKKOS_LIBS += -lpthread
|
||||
KOKKOS_TPL_LIBRARY_NAMES += pthread
|
||||
|
@ -1279,9 +1283,6 @@ endif
|
|||
# Don't include Kokkos_Serial.cpp or Kokkos_Serial_Task.cpp if not using Serial
|
||||
# device to avoid a link warning.
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
|
||||
KOKKOS_SRC += $(wildcard $(KOKKOS_ETI_PATH)/Serial/*.cpp)
|
||||
endif
|
||||
endif
|
||||
ifneq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
|
||||
KOKKOS_SRC := $(filter-out $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp,$(KOKKOS_SRC))
|
||||
|
|
|
@ -26,21 +26,17 @@ Kokkos_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Spi
|
|||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp
|
||||
Kokkos_HostBarrier.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp
|
||||
Kokkos_Profiling_Interface.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp
|
||||
Kokkos_Profiling.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling.cpp
|
||||
Kokkos_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp
|
||||
Kokkos_MemoryPool.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp
|
||||
Kokkos_MemorySpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_MemorySpace.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemorySpace.cpp
|
||||
Kokkos_HostSpace_deepcopy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
|
||||
include $(KOKKOS_ETI_PATH)/Serial/Makefile.eti_Serial
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
Kokkos_Cuda_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp
|
||||
|
@ -50,9 +46,6 @@ Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cu
|
|||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
|
||||
Kokkos_Cuda_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
|
||||
include $(KOKKOS_ETI_PATH)/Cuda/Makefile.eti_Cuda
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
|
||||
|
@ -75,9 +68,6 @@ Kokkos_ROCm_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/ROCm/Kokkos_RO
|
|||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Task.cpp
|
||||
Kokkos_ROCm_Impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Impl.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Impl.cpp
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
|
||||
include $(KOKKOS_ETI_PATH)/ROCm/Makefile.eti_ROCm
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
|
||||
|
@ -85,9 +75,6 @@ Kokkos_ThreadsExec_base.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads
|
|||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec_base.cpp
|
||||
Kokkos_ThreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
|
||||
include $(KOKKOS_ETI_PATH)/Threads/Makefile.eti_Threads
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
|
||||
|
@ -95,9 +82,6 @@ Kokkos_OpenMP_Exec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokko
|
|||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp
|
||||
Kokkos_OpenMP_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
|
||||
include $(KOKKOS_ETI_PATH)/OpenMP/Makefile.eti_OpenMP
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
|
||||
|
|
|
@ -151,7 +151,7 @@ Full details are given in the [build instructions](BUILD.md). Basic setups are s
|
|||
## CMake
|
||||
|
||||
The best way to install Kokkos is using the CMake build system. Assuming Kokkos lives in `$srcdir`:
|
||||
````
|
||||
````bash
|
||||
cmake $srcdir \
|
||||
-DCMAKE_CXX_COMPILER=$path_to_compiler \
|
||||
-DCMAKE_INSTALL_PREFIX=$path_to_install \
|
||||
|
@ -170,7 +170,7 @@ and run `make test` after completing the build.
|
|||
|
||||
For your CMake project using Kokkos, add code such as the following:
|
||||
|
||||
````
|
||||
````cmake
|
||||
find_package(Kokkos)
|
||||
...
|
||||
target_link_libraries(myTarget Kokkos::kokkos)
|
||||
|
@ -187,17 +187,15 @@ for the install location given above.
|
|||
|
||||
## Spack
|
||||
An alternative to manually building with CMake is to use the Spack package manager.
|
||||
To do so, download the `kokkos-spack` git repo and add to the package list:
|
||||
````
|
||||
spack repo add $path-to-kokkos-spack
|
||||
To get started, download the Spack [repo](https://github.com/spack/spack).
|
||||
````
|
||||
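Getting the `spack` command available is typically just a clone plus sourcing the environment script; a minimal sketch (the clone location is arbitrary):
````bash
> git clone https://github.com/spack/spack.git
> source spack/share/spack/setup-env.sh
````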
A basic installation would be done as:
|
||||
````
|
||||
spack install kokkos
|
||||
````bash
|
||||
> spack install kokkos
|
||||
````
|
||||
Spack allows options and compilers to be tuned in the install command.
|
||||
````
|
||||
spack install kokkos@3.0 %gcc@7.3.0 +openmp
|
||||
````bash
|
||||
> spack install kokkos@3.0 %gcc@7.3.0 +openmp
|
||||
````
|
||||
This example illustrates the three most common parameters to Spack:
|
||||
* Variants: specified with, e.g., `+openmp`; this activates (or deactivates with, e.g., `~openmp`) certain options.
|
||||
|
@ -205,33 +203,33 @@ This example illustrates the three most common parameters to Spack:
|
|||
* Compiler: a default compiler will be chosen if not specified, but an exact compiler version can be given with the `%` option.
|
||||
|
||||
For a complete list of Kokkos options, run:
|
||||
````
|
||||
spack info kokkos
|
||||
````bash
|
||||
> spack info kokkos
|
||||
````
|
||||
Spack currently installs packages to a location determined by a unique hash. This hash name is not really "human readable".
|
||||
Generally, Spack usage should never really require you to reference the computer-generated unique install folder.
|
||||
More details are given in the [build instructions](BUILD.md). If you must know, you can locate Spack Kokkos installations with:
|
||||
````
|
||||
spack find -p kokkos ...
|
||||
````bash
|
||||
> spack find -p kokkos ...
|
||||
````
|
||||
where `...` is the unique spec identifying the particular Kokkos configuration and version.
|
||||
|
||||
Some more details can be found in the Kokkos Spack [documentation](Spack.md) or on the Spack [website](https://spack.readthedocs.io/en/latest).
|
||||
|
||||
## Raw Makefile
|
||||
A bash script is provided to generate raw makefiles.
|
||||
To install Kokkos as a library, create a build directory and run the following:
|
||||
````
|
||||
$KOKKOS_PATH/generate_makefile.bash --prefix=$path_to_install
|
||||
````bash
|
||||
> $KOKKOS_PATH/generate_makefile.bash --prefix=$path_to_install
|
||||
````
|
||||
Once the Makefile is generated, run:
|
||||
````
|
||||
make kokkoslib
|
||||
make install
|
||||
````bash
|
||||
> make kokkoslib
|
||||
> make install
|
||||
````
|
||||
To additionally run the unit tests:
|
||||
````
|
||||
make build-test
|
||||
make test
|
||||
````bash
|
||||
> make build-test
|
||||
> make test
|
||||
````
|
||||
Run `generate_makefile.bash --help` for more detailed options such as
|
||||
changing the device type for which to build.
|
||||
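For instance, a CUDA-enabled build for a Volta GPU might look roughly like the sketch below; the flag spellings here (`--with-cuda`, `--arch=Volta70`) are assumptions, so confirm them against `--help` for your version:
````bash
> $KOKKOS_PATH/generate_makefile.bash --with-cuda --arch=Volta70 --prefix=$path_to_install
> make kokkoslib
> make install
````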
|
@ -274,7 +272,7 @@ more than a single GPU is used by a single process.
|
|||
|
||||
If you publish work which mentions Kokkos, please cite the following paper:
|
||||
|
||||
````
|
||||
````BibTeX
|
||||
@article{CarterEdwards20143202,
|
||||
title = "Kokkos: Enabling manycore performance portability through polymorphic memory access patterns ",
|
||||
journal = "Journal of Parallel and Distributed Computing ",
|
||||
|
|
|
@ -0,0 +1,267 @@
|
|||

|
||||
|
||||
# Kokkos Spack
|
||||
|
||||
This gives instructions for using Spack to install Kokkos and to develop packages that depend on Kokkos.
|
||||
|
||||
## Getting Started
|
||||
|
||||
Make sure you have downloaded [Spack](https://github.com/spack/spack).
|
||||
The easiest way to configure the Spack environment is:
|
||||
````bash
|
||||
> source spack/share/spack/setup-env.sh
|
||||
````
|
||||
with other scripts available for other shells.
|
||||
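For example, csh/tcsh users would source the corresponding script shipped alongside it (filename assumed from the Spack distribution):
````bash
> source spack/share/spack/setup-env.csh
````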
You can display information about how to install packages with:
|
||||
````bash
|
||||
> spack info kokkos
|
||||
````
|
||||
This will print all the information about how to install Kokkos with Spack.
|
||||
For detailed instructions on how to use Spack, see the [User Manual](https://spack.readthedocs.io).
|
||||
|
||||
## Setting Up Spack: Avoiding the Package Cascade
|
||||
By default, Spack doesn't 'see' anything on your system, including things like CMake and CUDA.
|
||||
You can limit this cascade by adding a `packages.yaml` to your `$HOME/.spack` folder that includes the CMake (and CUDA, if applicable) already on your system. For example, your `packages.yaml` file could be:
|
||||
````yaml
|
||||
packages:
|
||||
cuda:
|
||||
modules:
|
||||
cuda@10.1.243: [cuda/10.1.243]
|
||||
paths:
|
||||
cuda@10.1.243:
|
||||
/opt/local/ppc64le-pwr8-nvidia/cuda/10.1.243
|
||||
buildable: false
|
||||
cmake:
|
||||
modules:
|
||||
cmake: [cmake/3.16.8]
|
||||
paths:
|
||||
cmake:
|
||||
/opt/local/ppc64le/cmake/3.16.8
|
||||
buildable: false
|
||||
````
|
||||
The `modules` entry is only necessary on systems that require loading Modules (e.g. most DOE systems).
|
||||
The `buildable` flag is useful to make sure Spack crashes if there is a path error,
|
||||
rather than a typo silently causing Spack to rebuild everything because `cmake` isn't found.
|
||||
You can verify your environment is set up correctly by running `spack graph` or `spack spec`.
|
||||
For example:
|
||||
````bash
|
||||
> spack graph kokkos +cuda
|
||||
o kokkos
|
||||
|\
|
||||
o | cuda
|
||||
/
|
||||
o cmake
|
||||
````
|
||||
Without the existing CUDA and CMake identified in `packages.yaml`, a subset (!) of the output would be:
|
||||
````bash
|
||||
o kokkos
|
||||
|\
|
||||
| o cmake
|
||||
| |\
|
||||
| | | |\
|
||||
| | | | | |\
|
||||
| | | | | | | |\
|
||||
| | | | | | | | | |\
|
||||
| | | | | | | o | | | libarchive
|
||||
| | | | | | | |\ \ \ \
|
||||
| | | | | | | | | |\ \ \ \
|
||||
| | | | | | | | | | | | |_|/
|
||||
| | | | | | | | | | | |/| |
|
||||
| | | | | | | | | | | | | o curl
|
||||
| | |_|_|_|_|_|_|_|_|_|_|/|
|
||||
| |/| | | |_|_|_|_|_|_|_|/
|
||||
| | | | |/| | | | | | | |
|
||||
| | | | o | | | | | | | | openssl
|
||||
| |/| | | | | | | | | | |
|
||||
| | | | | | | | | | o | | libxml2
|
||||
| | |_|_|_|_|_|_|_|/| | |
|
||||
| | | | | | | | | | |\ \ \
|
||||
| o | | | | | | | | | | | | zlib
|
||||
| / / / / / / / / / / / /
|
||||
| o | | | | | | | | | | | xz
|
||||
| / / / / / / / / / / /
|
||||
| o | | | | | | | | | | rhash
|
||||
| / / / / / / / / / /
|
||||
| | | | o | | | | | | nettle
|
||||
| | | | |\ \ \ \ \ \ \
|
||||
| | | o | | | | | | | | libuv
|
||||
| | | | o | | | | | | | autoconf
|
||||
| | |_|/| | | | | | | |
|
||||
| | | | |/ / / / / / /
|
||||
| o | | | | | | | | | perl
|
||||
| o | | | | | | | | | gdbm
|
||||
| o | | | | | | | | | readline
|
||||
````
|
||||
|
||||
## Configuring Kokkos as a Project Dependency
|
||||
Say you have a project "SuperScience" which needs to use Kokkos.
|
||||
In your `package.py` file, you would generally include something like:
|
||||
````python
|
||||
class SuperScience(CMakePackage):
|
||||
...
|
||||
depends_on("kokkos")
|
||||
````
|
||||
Often projects want to tweak behavior when using certain features, e.g.
|
||||
````python
|
||||
depends_on("kokkos+cuda", when="+cuda")
|
||||
````
|
||||
if your project needs CUDA-specific logic to configure and build.
|
||||
This illustrates the general principle in Spack of "flowing-up".
|
||||
A user requests a feature in the final app:
|
||||
````bash
|
||||
> spack install superscience+cuda
|
||||
````
|
||||
This flows upstream to the Kokkos dependency, causing the `kokkos+cuda` variant to build.
|
||||
The downstream app (SuperScience) tells the upstream app (Kokkos) how to build.
|
||||
|
||||
Because Kokkos is a performance portability library, it somewhat inverts this principle.
|
||||
Kokkos "flows-down", telling your application how best to configure for performance.
|
||||
Rather than a downstream app (SuperScience) telling the upstream (Kokkos) what variants to build,
|
||||
a pre-built Kokkos should be telling the downstream app SuperScience what variants to use.
|
||||
Kokkos works best when there is an "expert" configuration installed on your system.
|
||||
Your build should simply request `-DKokkos_ROOT=<BEST_KOKKOS_FOR_MY_SYSTEM>` and configure appropriately based on the Kokkos it finds.
|
||||
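As a minimal sketch of that hand-off, a downstream CMake build can simply be pointed at whatever Kokkos Spack installed; `spack location -i` prints the install prefix, and the project source path here is hypothetical:
````bash
> cmake $my_project_src -DKokkos_ROOT=$(spack location -i kokkos)
````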
|
||||
Kokkos has many, many build variants.
|
||||
Where possible, projects should only depend on a general Kokkos, not specific variants.
|
||||
We recommend instead adding, for each system you build on, a Kokkos configuration to your `packages.yaml` file (usually found in `~/.spack` for individual users).
|
||||
For a Xeon + Volta system, this could look like:
|
||||
````yaml
|
||||
kokkos:
|
||||
variants: +cuda +openmp +cuda_lambda +wrapper ^cuda@10.1 cuda_arch=70
|
||||
compiler: [gcc@7.2.0]
|
||||
````
|
||||
which gives the "best" Kokkos configuration as CUDA+OpenMP optimized for a Volta 70 architecture using CUDA 10.1.
|
||||
It also enables support for CUDA Lambdas.
|
||||
The `+wrapper` option tells Kokkos to build with the special `nvcc_wrapper` (more below).
|
||||
Note here that we use the built-in `cuda_arch` variant of Spack to specify the architecture.
|
||||
For a Haswell system, we use
|
||||
````yaml
|
||||
kokkos:
|
||||
variants: +openmp std=14 target=haswell
|
||||
compiler: [intel@18]
|
||||
````
|
||||
which uses the built-in microarchitecture variants of Spack.
|
||||
Consult the Spack documentation for more details of Spack microarchitectures
|
||||
and CUDA architectures.
|
||||
Spack does not currently provide an AMD GPU microarchitecture option.
|
||||
If building for HIP or an AMD GPU, Kokkos provides an `amd_gpu_arch` similar to `cuda_arch`.
|
||||
````yaml
|
||||
kokkos:
|
||||
variants: +hip amd_gpu_arch=vega900
|
||||
````
|
||||
|
||||
Without an optimal default in your `packages.yaml` file, it is highly likely that the default Kokkos configuration you get will not be what you want.
|
||||
For example, CUDA is not enabled by default (there is no easy logic to conditionally activate this for CUDA-enabled systems).
|
||||
If you don't specify a CUDA build variant in a `packages.yaml` and you build your Kokkos-dependent project:
|
||||
````bash
|
||||
> spack install superscience
|
||||
````
|
||||
you may end up just getting the default Kokkos (i.e. Serial).
|
||||
Some examples are included in the `config/yaml` folder for common platforms.
|
||||
Before running `spack install <package>`, we recommend running `spack spec <package>` to confirm your dependency tree is correct.
|
||||
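The query itself is just the following (append whatever variants you intend to install; this is only a sketch):
````bash
> spack spec kokkos-kernels
````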
For example, with Kokkos Kernels:
|
||||
````bash
|
||||
kokkos-kernels@3.0%gcc@8.3.0~blas build_type=RelWithDebInfo ~cblas~complex_double~complex_float~cublas~cuda cuda_arch=none ~cusparse~diy+double execspace_cuda=auto execspace_openmp=auto execspace_serial=auto execspace_threads=auto ~float~lapack~lapacke+layoutleft~layoutright memspace_cudaspace=auto memspace_cudauvmspace=auto +memspace_hostspace~mkl+offset_int+offset_size_t~openmp+ordinal_int~ordinal_int64_t~serial~superlu arch=linux-rhel7-skylake_avx512
|
||||
^cmake@3.16.2%gcc@8.3.0~doc+ncurses+openssl+ownlibs~qt arch=linux-rhel7-skylake_avx512
|
||||
^kokkos@3.0%gcc@8.3.0~aggressive_vectorization~amdavx~armv80~armv81~armv8_thunderx~armv8_tx2~bdw~bgq build_type=RelWithDebInfo ~carrizo~compiler_warnings+cuda cuda_arch=none +cuda_lambda~cuda_ldg_intrinsic~cuda_relocatable_device_code~cuda_uvm~debug~debug_bounds_check~debug_dualview_modify_check~deprecated_code~diy~epyc~examples~explicit_instantiation~fiji~gfx901~hpx~hpx_async_dispatch~hsw~hwloc~kaveri~kepler30~kepler32~kepler35~kepler37~knc~knl~maxwell50~maxwell52~maxwell53~memkind~numactl+openmp~pascal60~pascal61~power7~power8~power9+profiling~profiling_load_print~pthread~qthread~rocm~ryzen~serial~skx~snb std=14 ~tests~turing75~vega+volta70~volta72+wrapper~wsm arch=linux-rhel7-skylake_avx512
|
||||
^cuda@10.1%gcc@8.3.0 arch=linux-rhel7-skylake_avx512
|
||||
^kokkos-nvcc-wrapper@old%gcc@8.3.0 build_type=RelWithDebInfo +mpi arch=linux-rhel7-skylake_avx512
|
||||
^openmpi@4.0.2%gcc@8.3.0~cuda+cxx_exceptions fabrics=none ~java~legacylaunchers~memchecker patches=073477a76bba780c67c36e959cd3ee6910743e2735c7e76850ffba6791d498e4 ~pmi schedulers=none ~sqlite3~thread_multiple+vt arch=linux-rhel7-skylake_avx512
|
||||
````
|
||||
The output can be very verbose, but we can verify the expected `kokkos`:
|
||||
````bash
|
||||
kokkos@3.0%gcc@8.3.0~aggressive_vectorization~amdavx~armv80~armv81~armv8_thunderx~armv8_tx2~bdw~bgq build_type=RelWithDebInfo ~carrizo~compiler_warnings+cuda cuda_arch=none +cuda_lambda~cuda_ldg_intrinsic~cuda_relocatable_device_code~cuda_uvm~debug~debug_bounds_check~debug_dualview_modify_check~deprecated_code~diy~epyc~examples~explicit_instantiation~fiji~gfx901~hpx~hpx_async_dispatch~hsw~hwloc~kaveri~kepler30~kepler32~kepler35~kepler37~knc~knl~maxwell50~maxwell52~maxwell53~memkind~numactl+openmp~pascal60~pascal61~power7~power8~power9+profiling~profiling_load_print~pthread~qthread~rocm~ryzen~serial~skx~snb std=11 ~tests~turing75~vega+volta70~volta72+wrapper~wsm arch=linux-rhel7-skylake_avx512
|
||||
````
|
||||
We see that we do indeed have `+volta70` and `+wrapper`.
|
||||
|
||||
### Spack Environments
|
||||
The encouraged way to use Spack is with Spack environments ([more details here](https://spack-tutorial.readthedocs.io/en/latest/tutorial_environments.html#dealing-with-many-specs-at-once)).
|
||||
Rather than installing packages one-at-a-time, you add packages to an environment.
|
||||
After adding all packages, you concretize and install them all.
|
||||
Using environments, one can explicitly add a desired Kokkos for the environment, e.g.
|
||||
````bash
|
||||
> spack add kokkos +cuda +cuda_lambda +volta70
|
||||
> spack add my_project +my_variant
|
||||
> ...
|
||||
> spack install
|
||||
````
|
||||
All packages within the environment will build against the CUDA-enabled Kokkos,
|
||||
even if they only request a default Kokkos.
|
||||
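Spelled out a bit more fully, a sketch of the whole environment workflow might look like this (the environment name is arbitrary):
````bash
> spack env create kokkos-cuda
> spack env activate kokkos-cuda
> spack add kokkos +cuda +cuda_lambda +volta70
> spack add my_project +my_variant
> spack concretize
> spack install
````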
|
||||
## NVCC Wrapper
|
||||
Kokkos is a C++ project, but often builds for the CUDA backend.
|
||||
This is particularly problematic with CMake. At this point, `nvcc` does not accept all the flags that normally get passed to a C++ compiler.
|
||||
Kokkos provides `nvcc_wrapper`, which identifies itself to CMake as a C++ compiler and accepts C++ flags, but uses `nvcc` as the underlying compiler.
|
||||
`nvcc` itself also uses an underlying host compiler, e.g. GCC.
|
||||
|
||||
In Spack, the underlying host compiler is specified as below, e.g.:
|
||||
````bash
|
||||
> spack install package %gcc@8.0.0
|
||||
````
|
||||
This is still valid for Kokkos. To use the special wrapper for CUDA builds, request a desired compiler and simply add the `+wrapper` variant.
|
||||
````bash
|
||||
> spack install kokkos +cuda +wrapper %gcc@7.2.0
|
||||
````
|
||||
Downstream projects depending on Kokkos need to override their compiler.
|
||||
Kokkos provides the compiler in a `kokkos_cxx` variable,
|
||||
which points to `nvcc_wrapper` when needed and to the regular compiler otherwise.
|
||||
Spack projects already do this to use MPI compiler wrappers.
|
||||
````python
|
||||
def cmake_args(self):
|
||||
options = []
|
||||
...
|
||||
options.append("-DCMAKE_CXX_COMPILER=%s" % self.spec["kokkos"].kokkos_cxx)
|
||||
...
|
||||
return options
|
||||
````
|
||||
Note: `nvcc_wrapper` works with the MPI compiler wrappers.
|
||||
If building your project with MPI, do NOT set your compiler to `nvcc_wrapper`.
|
||||
Instead set your compiler to `mpicxx` and `nvcc_wrapper` will be used under the hood.
|
||||
````python
|
||||
def cmake_args(self):
|
||||
options = []
|
||||
...
|
||||
options.append("-DCMAKE_CXX_COMPILER=%s" % self.spec["mpi"].mpicxx)
|
||||
...
|
||||
return options
|
||||
````
|
||||
To accomplish this, `nvcc_wrapper` must depend on MPI (even though it uses no MPI).
|
||||
This has the unfortunate consequence that Kokkos CUDA projects not using MPI will implicitly depend on MPI anyway.
|
||||
This behavior is necessary for now, but will hopefully be removed later.
|
||||
When using environments, if MPI is not needed, you can remove the MPI dependency with:
|
||||
````bash
|
||||
> spack add kokkos-nvcc-wrapper ~mpi
|
||||
````
|
||||
|
||||
## Developing With Spack
|
||||
|
||||
Spack has historically been much more suited to *deployment* of mature packages than to active testing or development.
|
||||
However, recent features have improved support for development.
|
||||
Future releases are likely to make this even easier and incorporate Git integration.
|
||||
The most common commands will do a full build and install of the packages.
|
||||
If doing development, you may wish to merely set up a build environment.
|
||||
This allows you to modify the source and re-build.
|
||||
In this case, you can stop after configuring.
|
||||
Suppose you have a Kokkos checkout in the folder `kokkos-src`:
|
||||
````bash
|
||||
> spack dev-build -d kokkos-src -u cmake kokkos@develop +wrapper +openmp
|
||||
````
|
||||
This sets up a development environment for you in `kokkos-src`, which you can then use (Bash example shown below):
|
||||
Note: Always specify `develop` as the version when doing `dev-build`, except in rare cases.
|
||||
You are usually developing a feature branch that will merge into `develop`,
|
||||
hence, from Spack's perspective, you are building a new `develop`.
|
||||
|
||||
````bash
|
||||
> cd kokkos-src
|
||||
> source spack-build-env.txt
|
||||
> cd spack-build
|
||||
> make
|
||||
````
|
||||
Before sourcing the Spack development environment, you may wish to save your current environment:
|
||||
````bash
|
||||
> declare -px > myenv.sh
|
||||
````
|
||||
When done with Spack, you can then restore your original environment:
|
||||
````bash
|
||||
> source myenv.sh
|
||||
````
|
|
@ -2,7 +2,9 @@
|
|||
|
||||
KOKKOS_SUBPACKAGE(Algorithms)
|
||||
|
||||
ADD_SUBDIRECTORY(src)
|
||||
IF (NOT Kokkos_INSTALL_TESTING)
|
||||
ADD_SUBDIRECTORY(src)
|
||||
ENDIF()
|
||||
|
||||
KOKKOS_ADD_TEST_DIRECTORIES(unit_tests)
|
||||
|
||||
|
|
|
@ -7,9 +7,15 @@ KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
|
|||
|
||||
#-----------------------------------------------------------------------------
|
||||
|
||||
FILE(GLOB HEADERS *.hpp)
|
||||
FILE(GLOB SOURCES *.cpp)
|
||||
LIST(APPEND HEADERS ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h)
|
||||
FILE(GLOB ALGO_HEADERS *.hpp)
|
||||
FILE(GLOB ALGO_SOURCES *.cpp)
|
||||
LIST(APPEND ALGO_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h)
|
||||
|
||||
INSTALL (
|
||||
DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/"
|
||||
DESTINATION ${KOKKOS_HEADER_DIR}
|
||||
FILES_MATCHING PATTERN "*.hpp"
|
||||
)
|
||||
|
||||
#-----------------------------------------------------------------------------
|
||||
|
||||
|
@ -17,8 +23,8 @@ LIST(APPEND HEADERS ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h)
|
|||
# These will get ignored for standalone CMake and a true interface library made
|
||||
KOKKOS_ADD_INTERFACE_LIBRARY(
|
||||
kokkosalgorithms
|
||||
HEADERS ${HEADERS}
|
||||
SOURCES ${SOURCES}
|
||||
HEADERS ${ALGO_HEADERS}
|
||||
SOURCES ${ALGO_SOURCES}
|
||||
)
|
||||
KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkosalgorithms
|
||||
${KOKKOS_TOP_BUILD_DIR}
|
||||
|
|
|
@ -94,9 +94,9 @@ namespace Kokkos {
|
|||
class Pool {
|
||||
public:
|
||||
//The Kokkos device type
|
||||
typedef Device device_type;
|
||||
using device_type = Device;
|
||||
//The actual generator type
|
||||
typedef Generator<Device> generator_type;
|
||||
using generator_type = Generator<Device>;
|
||||
|
||||
//Default constructor: does not initialize a pool
|
||||
Pool();
|
||||
|
@ -124,7 +124,7 @@ namespace Kokkos {
|
|||
class Generator {
|
||||
public:
|
||||
//The Kokkos device type
|
||||
typedef DeviceType device_type;
|
||||
using device_type = DeviceType;
|
||||
|
||||
//Max return values of respective [X]rand[S]() functions
|
||||
enum {MAX_URAND = 0xffffffffU};
|
||||
|
@ -138,75 +138,75 @@ namespace Kokkos {
|
|||
KOKKOS_INLINE_FUNCTION
|
||||
Generator (STATE_ARGUMENTS, int state_idx = 0);
|
||||
|
||||
//Draw a equidistributed uint32_t in the range (0,MAX_URAND]
|
||||
//Draw a equidistributed uint32_t in the range [0,MAX_URAND)
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
uint32_t urand();
|
||||
|
||||
//Draw a equidistributed uint64_t in the range (0,MAX_URAND64]
|
||||
//Draw a equidistributed uint64_t in the range [0,MAX_URAND64)
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
uint64_t urand64();
|
||||
|
||||
//Draw a equidistributed uint32_t in the range (0,range]
|
||||
//Draw a equidistributed uint32_t in the range [0,range)
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
uint32_t urand(const uint32_t& range);
|
||||
|
||||
//Draw a equidistributed uint32_t in the range (start,end]
|
||||
//Draw a equidistributed uint32_t in the range [start,end)
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
uint32_t urand(const uint32_t& start, const uint32_t& end );
|
||||
|
||||
//Draw a equidistributed uint64_t in the range (0,range]
|
||||
//Draw a equidistributed uint64_t in the range [0,range)
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
uint64_t urand64(const uint64_t& range);
|
||||
|
||||
//Draw a equidistributed uint64_t in the range (start,end]
|
||||
//Draw a equidistributed uint64_t in the range [start,end)
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
uint64_t urand64(const uint64_t& start, const uint64_t& end );
|
||||
|
||||
//Draw a equidistributed int in the range (0,MAX_RAND]
|
||||
//Draw a equidistributed int in the range [0,MAX_RAND)
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int rand();
|
||||
|
||||
//Draw a equidistributed int in the range (0,range]
|
||||
//Draw a equidistributed int in the range [0,range)
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int rand(const int& range);
|
||||
|
||||
//Draw a equidistributed int in the range (start,end]
|
||||
//Draw a equidistributed int in the range [start,end)
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int rand(const int& start, const int& end );
|
||||
|
||||
//Draw a equidistributed int64_t in the range (0,MAX_RAND64]
|
||||
//Draw a equidistributed int64_t in the range [0,MAX_RAND64)
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int64_t rand64();
|
||||
|
||||
//Draw a equidistributed int64_t in the range (0,range]
|
||||
//Draw a equidistributed int64_t in the range [0,range)
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int64_t rand64(const int64_t& range);
|
||||
|
||||
//Draw a equidistributed int64_t in the range (start,end]
|
||||
//Draw a equidistributed int64_t in the range [start,end)
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int64_t rand64(const int64_t& start, const int64_t& end );
|
||||
|
||||
//Draw a equidistributed float in the range (0,1.0]
|
||||
//Draw a equidistributed float in the range [0,1.0)
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
float frand();
|
||||
|
||||
//Draw a equidistributed float in the range (0,range]
|
||||
//Draw a equidistributed float in the range [0,range)
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
float frand(const float& range);
|
||||
|
||||
//Draw a equidistributed float in the range (start,end]
|
||||
//Draw a equidistributed float in the range [start,end)
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
float frand(const float& start, const float& end );
|
||||
|
||||
//Draw a equidistributed double in the range (0,1.0]
|
||||
//Draw a equidistributed double in the range [0,1.0)
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
double drand();
|
||||
|
||||
//Draw a equidistributed double in the range (0,range]
|
||||
//Draw a equidistributed double in the range [0,range)
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
double drand(const double& range);
|
||||
|
||||
//Draw a equidistributed double in the range (start,end]
|
||||
//Draw a equidistributed double in the range [start,end)
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
double drand(const double& start, const double& end );
|
||||
|
||||
|
@ -221,11 +221,11 @@ namespace Kokkos {
|
|||
|
||||
//Additional Functions:
|
||||
|
||||
//Fills view with random numbers in the range (0,range]
|
||||
//Fills view with random numbers in the range [0,range)
|
||||
template<class ViewType, class PoolType>
|
||||
void fill_random(ViewType view, PoolType pool, ViewType::value_type range);
|
||||
|
||||
//Fills view with random numbers in the range (start,end]
|
||||
//Fills view with random numbers in the range [start,end)
|
||||
template<class ViewType, class PoolType>
|
||||
void fill_random(ViewType view, PoolType pool,
|
||||
ViewType::value_type start, ViewType::value_type end);
|
||||
|
@ -381,7 +381,7 @@ struct rand<Generator, unsigned long> {
|
|||
// NOTE (mfh 26 oct 2014) This is a partial specialization for long
|
||||
// long, a C99 / C++11 signed type which is guaranteed to be at
|
||||
// least 64 bits. Do NOT write a partial specialization for
|
||||
// int64_t!!! This is just a typedef! It could be either long or
|
||||
// int64_t!!! This is just an alias! It could be either long or
|
||||
// long long. We don't know which a priori, and I've seen both.
|
||||
// The types long and long long are guaranteed to differ, so it's
|
||||
// always safe to specialize for both.
|
||||
|
@ -413,7 +413,7 @@ struct rand<Generator, long long> {
|
|||
// NOTE (mfh 26 oct 2014) This is a partial specialization for
|
||||
// unsigned long long, a C99 / C++11 unsigned type which is
|
||||
// guaranteed to be at least 64 bits. Do NOT write a partial
|
||||
// specialization for uint64_t!!! This is just a typedef! It could
|
||||
// specialization for uint64_t!!! This is just an alias! It could
|
||||
// be either unsigned long or unsigned long long. We don't know
|
||||
// which a priori, and I've seen both. The types unsigned long and
|
||||
// unsigned long long are guaranteed to differ, so it's always safe
|
||||
|
@ -604,11 +604,7 @@ struct Random_UniqueIndex {
|
|||
KOKKOS_FUNCTION
|
||||
static int get_state_idx(const locks_view_type) {
|
||||
#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
const int i = ExecutionSpace::hardware_thread_id();
|
||||
#else
|
||||
const int i = ExecutionSpace::impl_hardware_thread_id();
|
||||
#endif
|
||||
return i;
|
||||
#else
|
||||
return 0;
|
||||
|
@ -652,15 +648,13 @@ struct Random_UniqueIndex<Kokkos::Experimental::HIP> {
|
|||
static int get_state_idx(const locks_view_type& locks_) {
|
||||
#ifdef __HIP_DEVICE_COMPILE__
|
||||
const int i_offset =
|
||||
(hipThreadIdx_x * hipBlockDim_y + hipThreadIdx_y) * hipBlockDim_z +
|
||||
hipThreadIdx_z;
|
||||
int i = (((hipBlockIdx_x * hipGridDim_y + hipBlockIdx_y) * hipGridDim_z +
|
||||
hipBlockIdx_z) *
|
||||
hipBlockDim_x * hipBlockDim_y * hipBlockDim_z +
|
||||
(threadIdx.x * blockDim.y + threadIdx.y) * blockDim.z + threadIdx.z;
|
||||
int i = (((blockIdx.x * gridDim.y + blockIdx.y) * gridDim.z + blockIdx.z) *
|
||||
blockDim.x * blockDim.y * blockDim.z +
|
||||
i_offset) %
|
||||
locks_.extent(0);
|
||||
while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
|
||||
i += hipBlockDim_x * hipBlockDim_y * hipBlockDim_z;
|
||||
i += blockDim.x * blockDim.y * blockDim.z;
|
||||
if (i >= static_cast<int>(locks_.extent(0))) {
|
||||
i = i_offset;
|
||||
}
|
||||
|
@ -687,7 +681,7 @@ class Random_XorShift64 {
|
|||
friend class Random_XorShift64_Pool<DeviceType>;
|
||||
|
||||
public:
|
||||
typedef DeviceType device_type;
|
||||
using device_type = DeviceType;
|
||||
|
||||
constexpr static uint32_t MAX_URAND = std::numeric_limits<uint32_t>::max();
|
||||
constexpr static uint64_t MAX_URAND64 = std::numeric_limits<uint64_t>::max();
|
||||
|
@ -805,11 +799,6 @@ class Random_XorShift64 {
|
|||
// number
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
double normal() {
|
||||
#ifndef __HIP_DEVICE_COMPILE__ // FIXME_HIP
|
||||
using std::sqrt;
|
||||
#else
|
||||
using ::sqrt;
|
||||
#endif
|
||||
double S = 2.0;
|
||||
double U;
|
||||
while (S >= 1.0) {
|
||||
|
@ -817,7 +806,7 @@ class Random_XorShift64 {
|
|||
const double V = 2.0 * drand() - 1.0;
|
||||
S = U * U + V * V;
|
||||
}
|
||||
return U * sqrt(-2.0 * log(S) / S);
|
||||
return U * std::sqrt(-2.0 * log(S) / S);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
|
@ -830,15 +819,15 @@ template <class DeviceType = Kokkos::DefaultExecutionSpace>
|
|||
class Random_XorShift64_Pool {
|
||||
private:
|
||||
using execution_space = typename DeviceType::execution_space;
|
||||
typedef View<int*, execution_space> locks_type;
|
||||
typedef View<uint64_t*, DeviceType> state_data_type;
|
||||
using locks_type = View<int*, execution_space>;
|
||||
using state_data_type = View<uint64_t*, DeviceType>;
|
||||
locks_type locks_;
|
||||
state_data_type state_;
|
||||
int num_states_;
|
||||
|
||||
public:
|
||||
typedef Random_XorShift64<DeviceType> generator_type;
|
||||
typedef DeviceType device_type;
|
||||
using generator_type = Random_XorShift64<DeviceType>;
|
||||
using device_type = DeviceType;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Random_XorShift64_Pool() { num_states_ = 0; }
|
||||
|
@ -923,8 +912,8 @@ class Random_XorShift1024 {
|
|||
friend class Random_XorShift1024_Pool<DeviceType>;
|
||||
|
||||
public:
|
||||
typedef Random_XorShift1024_Pool<DeviceType> pool_type;
|
||||
typedef DeviceType device_type;
|
||||
using pool_type = Random_XorShift1024_Pool<DeviceType>;
|
||||
using device_type = DeviceType;
|
||||
|
||||
constexpr static uint32_t MAX_URAND = std::numeric_limits<uint32_t>::max();
|
||||
constexpr static uint64_t MAX_URAND64 = std::numeric_limits<uint64_t>::max();
|
||||
|
@ -1046,11 +1035,6 @@ class Random_XorShift1024 {
|
|||
// number
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
double normal() {
|
||||
#ifndef KOKKOS_ENABLE_HIP // FIXME_HIP
|
||||
using std::sqrt;
|
||||
#else
|
||||
using ::sqrt;
|
||||
#endif
|
||||
double S = 2.0;
|
||||
double U;
|
||||
while (S >= 1.0) {
|
||||
|
@ -1058,7 +1042,7 @@ class Random_XorShift1024 {
|
|||
const double V = 2.0 * drand() - 1.0;
|
||||
S = U * U + V * V;
|
||||
}
|
||||
return U * sqrt(-2.0 * log(S) / S);
|
||||
return U * std::sqrt(-2.0 * log(S) / S);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
|
@ -1071,9 +1055,9 @@ template <class DeviceType = Kokkos::DefaultExecutionSpace>
|
|||
class Random_XorShift1024_Pool {
|
||||
private:
|
||||
using execution_space = typename DeviceType::execution_space;
|
||||
typedef View<int*, execution_space> locks_type;
|
||||
typedef View<int*, DeviceType> int_view_type;
|
||||
typedef View<uint64_t * [16], DeviceType> state_data_type;
|
||||
using locks_type = View<int*, execution_space>;
|
||||
using int_view_type = View<int*, DeviceType>;
|
||||
using state_data_type = View<uint64_t * [16], DeviceType>;
|
||||
|
||||
locks_type locks_;
|
||||
state_data_type state_;
|
||||
|
@ -1082,9 +1066,9 @@ class Random_XorShift1024_Pool {
|
|||
friend class Random_XorShift1024<DeviceType>;
|
||||
|
||||
public:
|
||||
typedef Random_XorShift1024<DeviceType> generator_type;
|
||||
using generator_type = Random_XorShift1024<DeviceType>;
|
||||
|
||||
typedef DeviceType device_type;
|
||||
using device_type = DeviceType;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Random_XorShift1024_Pool() { num_states_ = 0; }
|
||||
|
@ -1176,14 +1160,13 @@ struct fill_random_functor_begin_end;
|
|||
|
||||
template <class ViewType, class RandomPool, int loops, class IndexType>
|
||||
struct fill_random_functor_range<ViewType, RandomPool, loops, 1, IndexType> {
|
||||
typedef typename ViewType::execution_space execution_space;
|
||||
using execution_space = typename ViewType::execution_space;
|
||||
ViewType a;
|
||||
RandomPool rand_pool;
|
||||
typename ViewType::const_value_type range;
|
||||
|
||||
typedef rand<typename RandomPool::generator_type,
|
||||
typename ViewType::non_const_value_type>
|
||||
Rand;
|
||||
using Rand = rand<typename RandomPool::generator_type,
|
||||
typename ViewType::non_const_value_type>;
|
||||
|
||||
fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
|
||||
typename ViewType::const_value_type range_)
|
||||
|
@ -1203,14 +1186,13 @@ struct fill_random_functor_range<ViewType, RandomPool, loops, 1, IndexType> {
|
|||
|
||||
template <class ViewType, class RandomPool, int loops, class IndexType>
|
||||
struct fill_random_functor_range<ViewType, RandomPool, loops, 2, IndexType> {
|
||||
typedef typename ViewType::execution_space execution_space;
|
||||
using execution_space = typename ViewType::execution_space;
|
||||
ViewType a;
|
||||
RandomPool rand_pool;
|
||||
typename ViewType::const_value_type range;
|
||||
|
||||
typedef rand<typename RandomPool::generator_type,
|
||||
typename ViewType::non_const_value_type>
|
||||
Rand;
|
||||
using Rand = rand<typename RandomPool::generator_type,
|
||||
typename ViewType::non_const_value_type>;
|
||||
|
||||
fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
|
||||
typename ViewType::const_value_type range_)
|
||||
|
@ -1232,14 +1214,13 @@ struct fill_random_functor_range<ViewType, RandomPool, loops, 2, IndexType> {
|
|||
|
||||
template <class ViewType, class RandomPool, int loops, class IndexType>
|
||||
struct fill_random_functor_range<ViewType, RandomPool, loops, 3, IndexType> {
|
||||
typedef typename ViewType::execution_space execution_space;
|
||||
using execution_space = typename ViewType::execution_space;
|
||||
ViewType a;
|
||||
RandomPool rand_pool;
|
||||
typename ViewType::const_value_type range;
|
||||
|
||||
typedef rand<typename RandomPool::generator_type,
|
||||
typename ViewType::non_const_value_type>
|
||||
Rand;
|
||||
using Rand = rand<typename RandomPool::generator_type,
|
||||
typename ViewType::non_const_value_type>;
|
||||
|
||||
fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
|
||||
typename ViewType::const_value_type range_)
|
||||
|
@ -1262,14 +1243,13 @@ struct fill_random_functor_range<ViewType, RandomPool, loops, 3, IndexType> {
|
|||
|
||||
template <class ViewType, class RandomPool, int loops, class IndexType>
|
||||
struct fill_random_functor_range<ViewType, RandomPool, loops, 4, IndexType> {
|
||||
typedef typename ViewType::execution_space execution_space;
|
||||
using execution_space = typename ViewType::execution_space;
|
||||
ViewType a;
|
||||
RandomPool rand_pool;
|
||||
typename ViewType::const_value_type range;
|
||||
|
||||
typedef rand<typename RandomPool::generator_type,
|
||||
typename ViewType::non_const_value_type>
|
||||
Rand;
|
||||
using Rand = rand<typename RandomPool::generator_type,
|
||||
typename ViewType::non_const_value_type>;
|
||||
|
||||
fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
|
||||
typename ViewType::const_value_type range_)
|
||||
|
@ -1293,14 +1273,13 @@ struct fill_random_functor_range<ViewType, RandomPool, loops, 4, IndexType> {
|
|||
|
||||
template <class ViewType, class RandomPool, int loops, class IndexType>
|
||||
struct fill_random_functor_range<ViewType, RandomPool, loops, 5, IndexType> {
|
||||
typedef typename ViewType::execution_space execution_space;
|
||||
using execution_space = typename ViewType::execution_space;
|
||||
ViewType a;
|
||||
RandomPool rand_pool;
|
||||
typename ViewType::const_value_type range;
|
||||
|
||||
typedef rand<typename RandomPool::generator_type,
|
||||
typename ViewType::non_const_value_type>
|
||||
Rand;
|
||||
using Rand = rand<typename RandomPool::generator_type,
|
||||
typename ViewType::non_const_value_type>;
|
||||
|
||||
fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
|
||||
typename ViewType::const_value_type range_)
|
||||
|
@ -1326,14 +1305,13 @@ struct fill_random_functor_range<ViewType, RandomPool, loops, 5, IndexType> {
|
|||
|
||||
template <class ViewType, class RandomPool, int loops, class IndexType>
|
||||
struct fill_random_functor_range<ViewType, RandomPool, loops, 6, IndexType> {
|
||||
typedef typename ViewType::execution_space execution_space;
|
||||
using execution_space = typename ViewType::execution_space;
|
||||
ViewType a;
|
||||
RandomPool rand_pool;
|
||||
typename ViewType::const_value_type range;
typedef rand<typename RandomPool::generator_type,
typename ViewType::non_const_value_type>
Rand;
using Rand = rand<typename RandomPool::generator_type,
typename ViewType::non_const_value_type>;
fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
typename ViewType::const_value_type range_)

@@ -1361,14 +1339,13 @@ struct fill_random_functor_range<ViewType, RandomPool, loops, 6, IndexType> {

template <class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_range<ViewType, RandomPool, loops, 7, IndexType> {
typedef typename ViewType::execution_space execution_space;
using execution_space = typename ViewType::execution_space;
ViewType a;
RandomPool rand_pool;
typename ViewType::const_value_type range;
typedef rand<typename RandomPool::generator_type,
typename ViewType::non_const_value_type>
Rand;
using Rand = rand<typename RandomPool::generator_type,
typename ViewType::non_const_value_type>;
fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
typename ViewType::const_value_type range_)

@@ -1398,14 +1375,13 @@ struct fill_random_functor_range<ViewType, RandomPool, loops, 7, IndexType> {

template <class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_range<ViewType, RandomPool, loops, 8, IndexType> {
typedef typename ViewType::execution_space execution_space;
using execution_space = typename ViewType::execution_space;
ViewType a;
RandomPool rand_pool;
typename ViewType::const_value_type range;
typedef rand<typename RandomPool::generator_type,
typename ViewType::non_const_value_type>
Rand;
using Rand = rand<typename RandomPool::generator_type,
typename ViewType::non_const_value_type>;
fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
typename ViewType::const_value_type range_)

@@ -1437,14 +1413,13 @@ struct fill_random_functor_range<ViewType, RandomPool, loops, 8, IndexType> {

template <class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 1,
IndexType> {
typedef typename ViewType::execution_space execution_space;
using execution_space = typename ViewType::execution_space;
ViewType a;
RandomPool rand_pool;
typename ViewType::const_value_type begin, end;
typedef rand<typename RandomPool::generator_type,
typename ViewType::non_const_value_type>
Rand;
using Rand = rand<typename RandomPool::generator_type,
typename ViewType::non_const_value_type>;
fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
typename ViewType::const_value_type begin_,

@@ -1466,14 +1441,13 @@ struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 1,

template <class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 2,
IndexType> {
typedef typename ViewType::execution_space execution_space;
using execution_space = typename ViewType::execution_space;
ViewType a;
RandomPool rand_pool;
typename ViewType::const_value_type begin, end;
typedef rand<typename RandomPool::generator_type,
typename ViewType::non_const_value_type>
Rand;
using Rand = rand<typename RandomPool::generator_type,
typename ViewType::non_const_value_type>;
fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
typename ViewType::const_value_type begin_,

@@ -1497,14 +1471,13 @@ struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 2,

template <class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 3,
IndexType> {
typedef typename ViewType::execution_space execution_space;
using execution_space = typename ViewType::execution_space;
ViewType a;
RandomPool rand_pool;
typename ViewType::const_value_type begin, end;
typedef rand<typename RandomPool::generator_type,
typename ViewType::non_const_value_type>
Rand;
using Rand = rand<typename RandomPool::generator_type,
typename ViewType::non_const_value_type>;
fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
typename ViewType::const_value_type begin_,

@@ -1529,14 +1502,13 @@ struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 3,

template <class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 4,
IndexType> {
typedef typename ViewType::execution_space execution_space;
using execution_space = typename ViewType::execution_space;
ViewType a;
RandomPool rand_pool;
typename ViewType::const_value_type begin, end;
typedef rand<typename RandomPool::generator_type,
typename ViewType::non_const_value_type>
Rand;
using Rand = rand<typename RandomPool::generator_type,
typename ViewType::non_const_value_type>;
fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
typename ViewType::const_value_type begin_,

@@ -1562,14 +1534,13 @@ struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 4,

template <class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 5,
IndexType> {
typedef typename ViewType::execution_space execution_space;
using execution_space = typename ViewType::execution_space;
ViewType a;
RandomPool rand_pool;
typename ViewType::const_value_type begin, end;
typedef rand<typename RandomPool::generator_type,
typename ViewType::non_const_value_type>
Rand;
using Rand = rand<typename RandomPool::generator_type,
typename ViewType::non_const_value_type>;
fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
typename ViewType::const_value_type begin_,

@@ -1597,14 +1568,13 @@ struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 5,

template <class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 6,
IndexType> {
typedef typename ViewType::execution_space execution_space;
using execution_space = typename ViewType::execution_space;
ViewType a;
RandomPool rand_pool;
typename ViewType::const_value_type begin, end;
typedef rand<typename RandomPool::generator_type,
typename ViewType::non_const_value_type>
Rand;
using Rand = rand<typename RandomPool::generator_type,
typename ViewType::non_const_value_type>;
fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
typename ViewType::const_value_type begin_,

@@ -1634,14 +1604,13 @@ struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 6,

template <class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 7,
IndexType> {
typedef typename ViewType::execution_space execution_space;
using execution_space = typename ViewType::execution_space;
ViewType a;
RandomPool rand_pool;
typename ViewType::const_value_type begin, end;
typedef rand<typename RandomPool::generator_type,
typename ViewType::non_const_value_type>
Rand;
using Rand = rand<typename RandomPool::generator_type,
typename ViewType::non_const_value_type>;
fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
typename ViewType::const_value_type begin_,

@@ -1673,14 +1642,13 @@ struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 7,

template <class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 8,
IndexType> {
typedef typename ViewType::execution_space execution_space;
using execution_space = typename ViewType::execution_space;
ViewType a;
RandomPool rand_pool;
typename ViewType::const_value_type begin, end;
typedef rand<typename RandomPool::generator_type,
typename ViewType::non_const_value_type>
Rand;
using Rand = rand<typename RandomPool::generator_type,
typename ViewType::non_const_value_type>;
fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
typename ViewType::const_value_type begin_,
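The bulk of this hunk (and of most hunks below) is a mechanical modernization: `typedef` declarations are replaced by C++11 `using` aliases, which read left-to-right and, unlike `typedef`, also support alias templates. A minimal illustration of the pattern, not taken from the Kokkos sources:

````cpp
#include <vector>

// Old style: the alias name is buried inside the declarator.
typedef std::vector<int>::const_iterator ConstIntIter;

// New style: name on the left, aliased type on the right.
using ConstIntIterAlias = std::vector<int>::const_iterator;

// 'using' additionally supports alias templates, which 'typedef' cannot express.
template <class T>
using Vec = std::vector<T>;

int main() {
  Vec<double> v{1.0, 2.0, 3.0};           // std::vector<double>
  return static_cast<int>(v.size()) - 3;  // 0
}
````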
@@ -95,9 +95,9 @@ class BinSort {

public:
template <class DstViewType, class SrcViewType>
struct copy_functor {
typedef typename SrcViewType::const_type src_view_type;
using src_view_type = typename SrcViewType::const_type;
typedef Impl::CopyOp<DstViewType, src_view_type> copy_op;
using copy_op = Impl::CopyOp<DstViewType, src_view_type>;
DstViewType dst_values;
src_view_type src_values;

@@ -120,17 +120,17 @@ class BinSort {

// If a Kokkos::View then can generate constant random access
// otherwise can only use the constant type.
typedef typename std::conditional<
using src_view_type = typename std::conditional<
Kokkos::is_view<SrcViewType>::value,
Kokkos::View<typename SrcViewType::const_data_type,
typename SrcViewType::array_layout,
typename SrcViewType::device_type,
Kokkos::MemoryTraits<Kokkos::RandomAccess> >,
typename SrcViewType::const_type>::type src_view_type;
typename SrcViewType::const_type>::type;
typedef typename PermuteViewType::const_type perm_view_type;
using perm_view_type = typename PermuteViewType::const_type;
typedef Impl::CopyOp<DstViewType, src_view_type> copy_op;
using copy_op = Impl::CopyOp<DstViewType, src_view_type>;
DstViewType dst_values;
perm_view_type sort_order;

@@ -151,8 +151,8 @@ class BinSort {

}
};
typedef typename Space::execution_space execution_space;
typedef BinSortOp bin_op_type;
using execution_space = typename Space::execution_space;
using bin_op_type = BinSortOp;
struct bin_count_tag {};
struct bin_offset_tag {};

@@ -160,30 +160,30 @@ class BinSort {

struct bin_sort_bins_tag {};
public:
typedef SizeType size_type;
typedef size_type value_type;
using size_type = SizeType;
using value_type = size_type;
typedef Kokkos::View<size_type*, Space> offset_type;
typedef Kokkos::View<const int*, Space> bin_count_type;
using offset_type = Kokkos::View<size_type*, Space>;
using bin_count_type = Kokkos::View<const int*, Space>;
typedef typename KeyViewType::const_type const_key_view_type;
using const_key_view_type = typename KeyViewType::const_type;
// If a Kokkos::View then can generate constant random access
// otherwise can only use the constant type.
typedef typename std::conditional<
using const_rnd_key_view_type = typename std::conditional<
Kokkos::is_view<KeyViewType>::value,
Kokkos::View<typename KeyViewType::const_data_type,
typename KeyViewType::array_layout,
typename KeyViewType::device_type,
Kokkos::MemoryTraits<Kokkos::RandomAccess> >,
const_key_view_type>::type const_rnd_key_view_type;
const_key_view_type>::type;
typedef typename KeyViewType::non_const_value_type non_const_key_scalar;
typedef typename KeyViewType::const_value_type const_key_scalar;
using non_const_key_scalar = typename KeyViewType::non_const_value_type;
using const_key_scalar = typename KeyViewType::const_value_type;
typedef Kokkos::View<int*, Space, Kokkos::MemoryTraits<Kokkos::Atomic> >
bin_count_atomic_type;
using bin_count_atomic_type =
Kokkos::View<int*, Space, Kokkos::MemoryTraits<Kokkos::Atomic> >;
private:
const_key_view_type keys;
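The `std::conditional` blocks above pick, at compile time, either a `RandomAccess`-annotated `Kokkos::View` or the plain const type, depending on whether the source really is a `Kokkos::View`. A stripped-down sketch of the same compile-time selection using only the standard library (toy types, not Kokkos code):

````cpp
#include <type_traits>
#include <vector>

// Toy trait standing in for Kokkos::is_view.
template <class T> struct is_vector : std::false_type {};
template <class T> struct is_vector<std::vector<T>> : std::true_type {};

// Pick a reference wrapper only when the input is a vector,
// otherwise fall back to the type itself.
template <class T>
using storage_t =
    typename std::conditional<is_vector<T>::value,
                              const T&,  // "preferred" fast-path type
                              T>::type;  // fallback

static_assert(std::is_same<storage_t<std::vector<int>>,
                           const std::vector<int>&>::value, "");
static_assert(std::is_same<storage_t<int>, int>::value, "");

int main() { return 0; }
````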
@@ -266,10 +266,10 @@ class BinSort {

template <class ValuesViewType>
void sort(ValuesViewType const& values, int values_range_begin,
int values_range_end) const {
typedef Kokkos::View<typename ValuesViewType::data_type,
typename ValuesViewType::array_layout,
typename ValuesViewType::device_type>
scratch_view_type;
using scratch_view_type =
Kokkos::View<typename ValuesViewType::data_type,
typename ValuesViewType::array_layout,
typename ValuesViewType::device_type>;
const size_t len = range_end - range_begin;
const size_t values_len = values_range_end - values_range_begin;

@@ -278,13 +278,6 @@ class BinSort {

"BinSort::sort: values range length != permutation vector length");
}
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
scratch_view_type sorted_values(
ViewAllocateWithoutInitializing(
"Kokkos::SortImpl::BinSortFunctor::sorted_values"),
len, values.extent(1), values.extent(2), values.extent(3),
values.extent(4), values.extent(5), values.extent(6), values.extent(7));
#else
scratch_view_type sorted_values(
ViewAllocateWithoutInitializing(
"Kokkos::SortImpl::BinSortFunctor::sorted_values"),

@@ -303,7 +296,6 @@ class BinSort {

: KOKKOS_IMPL_CTOR_DEFAULT_ARG,
values.rank_dynamic > 7 ? values.extent(7)
: KOKKOS_IMPL_CTOR_DEFAULT_ARG);
#endif
{
copy_permute_functor<scratch_view_type /* DstViewType */

@@ -511,8 +503,8 @@ bool try_std_sort(ViewType view) {

template <class ViewType>
struct min_max_functor {
typedef Kokkos::MinMaxScalar<typename ViewType::non_const_value_type>
minmax_scalar;
using minmax_scalar =
Kokkos::MinMaxScalar<typename ViewType::non_const_value_type>;
ViewType view;
min_max_functor(const ViewType& view_) : view(view_) {}

@@ -531,7 +523,7 @@ void sort(ViewType const& view, bool const always_use_kokkos_sort = false) {

if (!always_use_kokkos_sort) {
if (Impl::try_std_sort(view)) return;
}
typedef BinOp1D<ViewType> CompType;
using CompType = BinOp1D<ViewType>;
Kokkos::MinMaxScalar<typename ViewType::non_const_value_type> result;
Kokkos::MinMax<typename ViewType::non_const_value_type> reducer(result);

@@ -548,8 +540,8 @@ void sort(ViewType const& view, bool const always_use_kokkos_sort = false) {

template <class ViewType>
void sort(ViewType view, size_t const begin, size_t const end) {
typedef Kokkos::RangePolicy<typename ViewType::execution_space> range_policy;
typedef BinOp1D<ViewType> CompType;
using range_policy = Kokkos::RangePolicy<typename ViewType::execution_space>;
using CompType = BinOp1D<ViewType>;
Kokkos::MinMaxScalar<typename ViewType::non_const_value_type> result;
Kokkos::MinMax<typename ViewType::non_const_value_type> reducer(result);
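For context, the `Kokkos::MinMax` reducer that `sort()` sets up above is used to find the key range before binning. A minimal, hedged sketch of that reduction pattern (assuming an initialized Kokkos and a 1-D `View<double*>`; not the library's exact internal functor):

````cpp
#include <Kokkos_Core.hpp>
#include <cstdio>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<double*> v("v", 100);
    // Fill with something non-trivial.
    Kokkos::parallel_for(
        "fill", v.extent(0), KOKKOS_LAMBDA(const int i) { v(i) = (i % 7) - 3.0; });

    // Reduce to the simultaneous min and max, as the BinSort-based sort()
    // does before constructing its 1-D binning operator.
    Kokkos::MinMaxScalar<double> result;
    Kokkos::MinMax<double> reducer(result);
    Kokkos::parallel_reduce(
        "minmax", v.extent(0),
        KOKKOS_LAMBDA(const int i, Kokkos::MinMaxScalar<double>& lminmax) {
          if (v(i) < lminmax.min_val) lminmax.min_val = v(i);
          if (v(i) > lminmax.max_val) lminmax.max_val = v(i);
        },
        reducer);
    printf("min=%f max=%f\n", result.min_val, result.max_val);
  }
  Kokkos::finalize();
  return 0;
}
````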
@@ -20,14 +20,18 @@ KOKKOS_ADD_TEST_LIBRARY(

HEADERS ${GTEST_SOURCE_DIR}/gtest/gtest.h
SOURCES ${GTEST_SOURCE_DIR}/gtest/gtest-all.cc
)
# WORKAROUND FOR HIPCC
IF(Kokkos_ENABLE_HIP)
TARGET_COMPILE_DEFINITIONS(kokkosalgorithms_gtest PUBLIC "-DGTEST_HAS_PTHREAD=0 --amdgpu-target=gfx906")
ELSE()
TARGET_COMPILE_DEFINITIONS(kokkosalgorithms_gtest PUBLIC "-DGTEST_HAS_PTHREAD=0")
# avoid deprecation warnings from MSVC
TARGET_COMPILE_DEFINITIONS(kokkosalgorithms_gtest PUBLIC GTEST_HAS_TR1_TUPLE=0 GTEST_HAS_PTHREAD=0)
IF(NOT (Kokkos_ENABLE_CUDA AND WIN32))
TARGET_COMPILE_FEATURES(kokkosalgorithms_gtest PUBLIC cxx_std_11)
ENDIF()
TARGET_COMPILE_FEATURES(kokkosalgorithms_gtest PUBLIC cxx_std_11)
# Suppress clang-tidy diagnostics on code that we do not have control over
IF(CMAKE_CXX_CLANG_TIDY)
SET_TARGET_PROPERTIES(kokkosalgorithms_gtest PROPERTIES CXX_CLANG_TIDY "")
ENDIF()
SET(SOURCES
UnitTestMain.cpp
@@ -111,10 +111,10 @@ struct RandomProperties {

template <class GeneratorPool, class Scalar>
struct test_random_functor {
typedef typename GeneratorPool::generator_type rnd_type;
using rnd_type = typename GeneratorPool::generator_type;
typedef RandomProperties value_type;
typedef typename GeneratorPool::device_type device_type;
using value_type = RandomProperties;
using device_type = typename GeneratorPool::device_type;
GeneratorPool rand_pool;
const double mean;

@@ -125,12 +125,12 @@ struct test_random_functor {

// implementations might violate this upper bound, due to rounding
// error. Just in case, we leave an extra space at the end of each
// dimension, in the View types below.
typedef Kokkos::View<int[HIST_DIM1D + 1], typename GeneratorPool::device_type>
type_1d;
using type_1d =
Kokkos::View<int[HIST_DIM1D + 1], typename GeneratorPool::device_type>;
type_1d density_1d;
typedef Kokkos::View<int[HIST_DIM3D + 1][HIST_DIM3D + 1][HIST_DIM3D + 1],
typename GeneratorPool::device_type>
type_3d;
using type_3d =
Kokkos::View<int[HIST_DIM3D + 1][HIST_DIM3D + 1][HIST_DIM3D + 1],
typename GeneratorPool::device_type>;
type_3d density_3d;
test_random_functor(GeneratorPool rand_pool_, type_1d d1d, type_3d d3d)

@@ -200,9 +200,9 @@ struct test_random_functor {

template <class DeviceType>
struct test_histogram1d_functor {
typedef RandomProperties value_type;
typedef typename DeviceType::execution_space execution_space;
typedef typename DeviceType::memory_space memory_space;
using value_type = RandomProperties;
using execution_space = typename DeviceType::execution_space;
using memory_space = typename DeviceType::memory_space;
// NOTE (mfh 03 Nov 2014): Kokkos::rand::max() is supposed to define
// an exclusive upper bound on the range of random numbers that

@@ -210,7 +210,7 @@ struct test_histogram1d_functor {

// implementations might violate this upper bound, due to rounding
// error. Just in case, we leave an extra space at the end of each
// dimension, in the View type below.
typedef Kokkos::View<int[HIST_DIM1D + 1], memory_space> type_1d;
using type_1d = Kokkos::View<int[HIST_DIM1D + 1], memory_space>;
type_1d density_1d;
double mean;

@@ -219,7 +219,7 @@ struct test_histogram1d_functor {

KOKKOS_INLINE_FUNCTION void operator()(
const typename memory_space::size_type i, RandomProperties& prop) const {
typedef typename memory_space::size_type size_type;
using size_type = typename memory_space::size_type;
const double count = density_1d(i);
prop.mean += count;
prop.variance += 1.0 * (count - mean) * (count - mean);

@@ -234,9 +234,9 @@ struct test_histogram1d_functor {

template <class DeviceType>
struct test_histogram3d_functor {
typedef RandomProperties value_type;
typedef typename DeviceType::execution_space execution_space;
typedef typename DeviceType::memory_space memory_space;
using value_type = RandomProperties;
using execution_space = typename DeviceType::execution_space;
using memory_space = typename DeviceType::memory_space;
// NOTE (mfh 03 Nov 2014): Kokkos::rand::max() is supposed to define
// an exclusive upper bound on the range of random numbers that

@@ -244,9 +244,9 @@ struct test_histogram3d_functor {

// implementations might violate this upper bound, due to rounding
// error. Just in case, we leave an extra space at the end of each
// dimension, in the View type below.
typedef Kokkos::View<int[HIST_DIM3D + 1][HIST_DIM3D + 1][HIST_DIM3D + 1],
memory_space>
type_3d;
using type_3d =
Kokkos::View<int[HIST_DIM3D + 1][HIST_DIM3D + 1][HIST_DIM3D + 1],
memory_space>;
type_3d density_3d;
double mean;

@@ -255,7 +255,7 @@ struct test_histogram3d_functor {

KOKKOS_INLINE_FUNCTION void operator()(
const typename memory_space::size_type i, RandomProperties& prop) const {
typedef typename memory_space::size_type size_type;
using size_type = typename memory_space::size_type;
const double count = density_3d(
i / (HIST_DIM3D * HIST_DIM3D),
(i % (HIST_DIM3D * HIST_DIM3D)) / HIST_DIM3D, i % HIST_DIM3D);

@@ -276,7 +276,7 @@ struct test_histogram3d_functor {

//
template <class RandomGenerator, class Scalar>
struct test_random_scalar {
typedef typename RandomGenerator::generator_type rnd_type;
using rnd_type = typename RandomGenerator::generator_type;
int pass_mean, pass_var, pass_covar;
int pass_hist1d_mean, pass_hist1d_var, pass_hist1d_covar;

@@ -294,7 +294,7 @@ struct test_random_scalar {

cout << " -- Testing randomness properties" << endl;
RandomProperties result;
typedef test_random_functor<RandomGenerator, Scalar> functor_type;
using functor_type = test_random_functor<RandomGenerator, Scalar>;
parallel_reduce(num_draws / 1024,
functor_type(pool, density_1d, density_3d), result);

@@ -325,8 +325,8 @@ struct test_random_scalar {

cout << " -- Testing 1-D histogram" << endl;
RandomProperties result;
typedef test_histogram1d_functor<typename RandomGenerator::device_type>
functor_type;
using functor_type =
test_histogram1d_functor<typename RandomGenerator::device_type>;
parallel_reduce(HIST_DIM1D, functor_type(density_1d, num_draws), result);
double tolerance = 6 * std::sqrt(1.0 / HIST_DIM1D);

@@ -357,8 +357,8 @@ struct test_random_scalar {

cout << " -- Testing 3-D histogram" << endl;
RandomProperties result;
typedef test_histogram3d_functor<typename RandomGenerator::device_type>
functor_type;
using functor_type =
test_histogram3d_functor<typename RandomGenerator::device_type>;
parallel_reduce(HIST_DIM1D, functor_type(density_3d, num_draws), result);
double tolerance = 6 * std::sqrt(1.0 / HIST_DIM1D);
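These TestRandom hunks only touch type aliases, but for orientation, the generator-pool pattern they exercise looks roughly like the following sketch (assumed usage, not the test itself): acquire a per-thread generator from the pool, draw values, and return the generator.

````cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_Random.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::Random_XorShift64_Pool<> pool(/*seed=*/12371);
    Kokkos::View<double*> samples("samples", 1 << 20);

    Kokkos::parallel_for(
        "draw", samples.extent(0), KOKKOS_LAMBDA(const int i) {
          // Each thread checks a generator out of the pool ...
          auto gen = pool.get_state();
          // ... draws a uniform double in [0,1) ...
          samples(i) = gen.drand();
          // ... and must hand the generator back.
          pool.free_state(gen);
        });
  }
  Kokkos::finalize();
  return 0;
}
````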
@@ -55,8 +55,8 @@ namespace Impl {

template <class ExecutionSpace, class Scalar>
struct is_sorted_struct {
typedef unsigned int value_type;
typedef ExecutionSpace execution_space;
using value_type = unsigned int;
using execution_space = ExecutionSpace;
Kokkos::View<Scalar*, ExecutionSpace> keys;

@@ -69,8 +69,8 @@ struct is_sorted_struct {

template <class ExecutionSpace, class Scalar>
struct sum {
typedef double value_type;
typedef ExecutionSpace execution_space;
using value_type = double;
using execution_space = ExecutionSpace;
Kokkos::View<Scalar*, ExecutionSpace> keys;

@@ -81,8 +81,8 @@ struct sum {

template <class ExecutionSpace, class Scalar>
struct bin3d_is_sorted_struct {
typedef unsigned int value_type;
typedef ExecutionSpace execution_space;
using value_type = unsigned int;
using execution_space = ExecutionSpace;
Kokkos::View<Scalar * [3], ExecutionSpace> keys;

@@ -115,8 +115,8 @@ struct bin3d_is_sorted_struct {

template <class ExecutionSpace, class Scalar>
struct sum3D {
typedef double value_type;
typedef ExecutionSpace execution_space;
using value_type = double;
using execution_space = ExecutionSpace;
Kokkos::View<Scalar * [3], ExecutionSpace> keys;

@@ -131,7 +131,7 @@ struct sum3D {

template <class ExecutionSpace, typename KeyType>
void test_1D_sort_impl(unsigned int n, bool force_kokkos) {
typedef Kokkos::View<KeyType*, ExecutionSpace> KeyViewType;
using KeyViewType = Kokkos::View<KeyType*, ExecutionSpace>;
KeyViewType keys("Keys", n);
// Test sorting array with all numbers equal

@@ -166,7 +166,7 @@ void test_1D_sort_impl(unsigned int n, bool force_kokkos) {

template <class ExecutionSpace, typename KeyType>
void test_3D_sort_impl(unsigned int n) {
typedef Kokkos::View<KeyType * [3], ExecutionSpace> KeyViewType;
using KeyViewType = Kokkos::View<KeyType * [3], ExecutionSpace>;
KeyViewType keys("Keys", n * n * n);

@@ -186,7 +186,7 @@ void test_3D_sort_impl(unsigned int n) {

typename KeyViewType::value_type min[3] = {0, 0, 0};
typename KeyViewType::value_type max[3] = {100, 100, 100};
typedef Kokkos::BinOp3D<KeyViewType> BinOp;
using BinOp = Kokkos::BinOp3D<KeyViewType>;
BinOp bin_op(bin_max, min, max);
Kokkos::BinSort<KeyViewType, BinOp> Sorter(keys, bin_op, false);
Sorter.create_permute_vector();

@@ -215,9 +215,9 @@ void test_3D_sort_impl(unsigned int n) {

template <class ExecutionSpace, typename KeyType>
void test_dynamic_view_sort_impl(unsigned int n) {
typedef Kokkos::Experimental::DynamicView<KeyType*, ExecutionSpace>
KeyDynamicViewType;
typedef Kokkos::View<KeyType*, ExecutionSpace> KeyViewType;
using KeyDynamicViewType =
Kokkos::Experimental::DynamicView<KeyType*, ExecutionSpace>;
using KeyViewType = Kokkos::View<KeyType*, ExecutionSpace>;
const size_t upper_bound = 2 * n;
const size_t min_chunk_size = 1024;

@@ -305,8 +305,8 @@ void test_issue_1160_impl() {

Kokkos::deep_copy(x_, h_x);
Kokkos::deep_copy(v_, h_v);
typedef decltype(element_) KeyViewType;
typedef Kokkos::BinOp1D<KeyViewType> BinOp;
using KeyViewType = decltype(element_);
using BinOp = Kokkos::BinOp1D<KeyViewType>;
int begin = 3;
int end = 8;
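The `test_3D_sort_impl` hunk above shows the typical `BinSort` workflow: build a binning operator, construct the sorter, create the permutation vector, then apply it. A condensed sketch of the same sequence for 1-D keys, offered as an assumption-laden example rather than the test's exact code:

````cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_Sort.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    using KeyViewType = Kokkos::View<float*>;
    KeyViewType keys("Keys", 1000);
    // ... fill keys, e.g. from a Random_XorShift64_Pool ...

    // 1-D binning operator over an assumed key range [0, 100).
    using BinOp = Kokkos::BinOp1D<KeyViewType>;
    BinOp bin_op(/*max_bins=*/128, /*min=*/0.0f, /*max=*/100.0f);

    Kokkos::BinSort<KeyViewType, BinOp> sorter(keys, bin_op, false);
    sorter.create_permute_vector();  // compute the sorted ordering
    sorter.sort(keys);               // apply it to the key view itself
  }
  Kokkos::finalize();
  return 0;
}
````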
@@ -5,6 +5,6 @@ build_script:

- cmd: >-
mkdir build &&
cd build &&
cmake c:\projects\source -DKokkos_ENABLE_TESTS=ON -DKokkos_ENABLE_LIBDL=OFF -DKokkos_ENABLE_PROFILING=OFF &&
cmake c:\projects\source -DKokkos_ENABLE_TESTS=ON &&
cmake --build . --target install &&
ctest -C Debug -V
@@ -69,13 +69,13 @@ int main(int argc, char* argv[]) {

return 0;
}
int L = atoi(argv[1]);
int N = atoi(argv[2]);
int M = atoi(argv[3]);
int D = atoi(argv[4]);
int K = atoi(argv[5]);
int R = atoi(argv[6]);
int type = atoi(argv[7]);
int L = std::stoi(argv[1]);
int N = std::stoi(argv[2]);
int M = std::stoi(argv[3]);
int D = std::stoi(argv[4]);
int K = std::stoi(argv[5]);
int R = std::stoi(argv[6]);
int type = std::stoi(argv[7]);
Kokkos::View<int*> offsets("Offsets", L, M);
Kokkos::Random_XorShift64_Pool<> pool(12371);

@@ -73,15 +73,15 @@ int main(int argc, char* argv[]) {

return 0;
}
int P = atoi(argv[1]);
int N = atoi(argv[2]);
int K = atoi(argv[3]);
int R = atoi(argv[4]);
int D = atoi(argv[5]);
int U = atoi(argv[6]);
int F = atoi(argv[7]);
int T = atoi(argv[8]);
int S = atoi(argv[9]);
int P = std::stoi(argv[1]);
int N = std::stoi(argv[2]);
int K = std::stoi(argv[3]);
int R = std::stoi(argv[4]);
int D = std::stoi(argv[5]);
int U = std::stoi(argv[6]);
int F = std::stoi(argv[7]);
int T = std::stoi(argv[8]);
int S = std::stoi(argv[9]);
if (U > 8) {
printf("U must be 1-8\n");

@@ -72,13 +72,13 @@ int main(int argc, char* argv[]) {

return 0;
}
int S = atoi(argv[1]);
int N = atoi(argv[2]);
int K = atoi(argv[3]);
int D = atoi(argv[4]);
int R = atoi(argv[5]);
int U = atoi(argv[6]);
int F = atoi(argv[7]);
int S = std::stoi(argv[1]);
int N = std::stoi(argv[2]);
int K = std::stoi(argv[3]);
int D = std::stoi(argv[4]);
int R = std::stoi(argv[5]);
int U = std::stoi(argv[6]);
int F = std::stoi(argv[7]);
if ((S != 1) && (S != 2) && (S != 4)) {
printf("S must be one of 1,2,4\n");
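These hunks replace `atoi` with `std::stoi` when parsing the benchmark command-line arguments. The behavioral difference is that `std::stoi` reports bad input by throwing instead of silently returning 0. A small standalone illustration (not part of the benchmarks):

````cpp
#include <cstdlib>
#include <iostream>
#include <stdexcept>
#include <string>

int main() {
  const char* good = "42";
  const char* bad  = "not-a-number";

  std::cout << std::atoi(good) << "\n";  // 42
  std::cout << std::atoi(bad)  << "\n";  // 0, indistinguishable from parsing "0"

  std::cout << std::stoi(good) << "\n";  // 42
  try {
    std::cout << std::stoi(bad) << "\n";
  } catch (const std::invalid_argument& e) {
    std::cerr << "std::stoi rejected the input: " << e.what() << "\n";
  }
  return 0;
}
````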
@ -50,151 +50,152 @@
|
|||
#define HLINE "-------------------------------------------------------------\n"
|
||||
|
||||
#if defined(KOKKOS_ENABLE_CUDA)
|
||||
typedef Kokkos::View<int64_t*, Kokkos::CudaSpace>::HostMirror GUPSHostArray;
|
||||
typedef Kokkos::View<int64_t*, Kokkos::CudaSpace> GUPSDeviceArray;
|
||||
using GUPSHostArray = Kokkos::View<int64_t*, Kokkos::CudaSpace>::HostMirror;
|
||||
using GUPSDeviceArray = Kokkos::View<int64_t*, Kokkos::CudaSpace>;
|
||||
#else
|
||||
typedef Kokkos::View<int64_t*, Kokkos::HostSpace>::HostMirror GUPSHostArray;
|
||||
typedef Kokkos::View<int64_t*, Kokkos::HostSpace> GUPSDeviceArray;
|
||||
using GUPSHostArray = Kokkos::View<int64_t*, Kokkos::HostSpace>::HostMirror;
|
||||
using GUPSDeviceArray = Kokkos::View<int64_t*, Kokkos::HostSpace>;
|
||||
#endif
|
||||
|
||||
typedef int GUPSIndex;
|
||||
using GUPSIndex = int;
|
||||
|
||||
double now() {
|
||||
struct timeval now;
|
||||
gettimeofday(&now, nullptr);
|
||||
struct timeval now;
|
||||
gettimeofday(&now, nullptr);
|
||||
|
||||
return (double) now.tv_sec + ((double) now.tv_usec * 1.0e-6);
|
||||
return (double)now.tv_sec + ((double)now.tv_usec * 1.0e-6);
|
||||
}
|
||||
|
||||
void randomize_indices(GUPSHostArray& indices, GUPSDeviceArray& dev_indices, const int64_t dataCount) {
|
||||
for( GUPSIndex i = 0; i < indices.extent(0); ++i ) {
|
||||
indices[i] = lrand48() % dataCount;
|
||||
}
|
||||
void randomize_indices(GUPSHostArray& indices, GUPSDeviceArray& dev_indices,
|
||||
const int64_t dataCount) {
|
||||
for (GUPSIndex i = 0; i < indices.extent(0); ++i) {
|
||||
indices[i] = lrand48() % dataCount;
|
||||
}
|
||||
|
||||
Kokkos::deep_copy(dev_indices, indices);
|
||||
Kokkos::deep_copy(dev_indices, indices);
|
||||
}
|
||||
|
||||
void run_gups(GUPSDeviceArray& indices, GUPSDeviceArray& data, const int64_t datum,
|
||||
const bool performAtomics) {
|
||||
void run_gups(GUPSDeviceArray& indices, GUPSDeviceArray& data,
|
||||
const int64_t datum, const bool performAtomics) {
|
||||
if (performAtomics) {
|
||||
Kokkos::parallel_for(
|
||||
"bench-gups-atomic", indices.extent(0),
|
||||
KOKKOS_LAMBDA(const GUPSIndex i) {
|
||||
Kokkos::atomic_fetch_xor(&data[indices[i]], datum);
|
||||
});
|
||||
} else {
|
||||
Kokkos::parallel_for(
|
||||
"bench-gups-non-atomic", indices.extent(0),
|
||||
KOKKOS_LAMBDA(const GUPSIndex i) { data[indices[i]] ^= datum; });
|
||||
}
|
||||
|
||||
if( performAtomics ) {
|
||||
Kokkos::parallel_for("bench-gups-atomic", indices.extent(0), KOKKOS_LAMBDA(const GUPSIndex i) {
|
||||
Kokkos::atomic_fetch_xor( &data[indices[i]], datum );
|
||||
});
|
||||
} else {
|
||||
Kokkos::parallel_for("bench-gups-non-atomic", indices.extent(0), KOKKOS_LAMBDA(const GUPSIndex i) {
|
||||
data[indices[i]] ^= datum;
|
||||
});
|
||||
}
|
||||
|
||||
Kokkos::fence();
|
||||
Kokkos::fence();
|
||||
}
|
||||
|
||||
int run_benchmark(const GUPSIndex indicesCount, const GUPSIndex dataCount, const int repeats,
|
||||
const bool useAtomics) {
|
||||
int run_benchmark(const GUPSIndex indicesCount, const GUPSIndex dataCount,
|
||||
const int repeats, const bool useAtomics) {
|
||||
printf("Reports fastest timing per kernel\n");
|
||||
printf("Creating Views...\n");
|
||||
|
||||
printf("Reports fastest timing per kernel\n");
|
||||
printf("Creating Views...\n");
|
||||
printf("Memory Sizes:\n");
|
||||
printf("- Elements: %15" PRIu64 " (%12.4f MB)\n",
|
||||
static_cast<uint64_t>(dataCount),
|
||||
1.0e-6 * ((double)dataCount * (double)sizeof(int64_t)));
|
||||
printf("- Indices: %15" PRIu64 " (%12.4f MB)\n",
|
||||
static_cast<uint64_t>(indicesCount),
|
||||
1.0e-6 * ((double)indicesCount * (double)sizeof(int64_t)));
|
||||
printf(" - Atomics: %15s\n", (useAtomics ? "Yes" : "No"));
|
||||
printf("Benchmark kernels will be performed for %d iterations.\n", repeats);
|
||||
|
||||
printf("Memory Sizes:\n");
|
||||
printf("- Elements: %15" PRIu64 " (%12.4f MB)\n", static_cast<uint64_t>(dataCount),
|
||||
1.0e-6 * ((double) dataCount * (double) sizeof(int64_t)));
|
||||
printf("- Indices: %15" PRIu64 " (%12.4f MB)\n", static_cast<uint64_t>(indicesCount),
|
||||
1.0e-6 * ((double) indicesCount * (double) sizeof(int64_t)));
|
||||
printf(" - Atomics: %15s\n", (useAtomics ? "Yes" : "No") );
|
||||
printf("Benchmark kernels will be performed for %d iterations.\n", repeats);
|
||||
printf(HLINE);
|
||||
|
||||
printf(HLINE);
|
||||
GUPSDeviceArray dev_indices("indices", indicesCount);
|
||||
GUPSDeviceArray dev_data("data", dataCount);
|
||||
int64_t datum = -1;
|
||||
|
||||
GUPSDeviceArray dev_indices("indices", indicesCount);
|
||||
GUPSDeviceArray dev_data("data", dataCount);
|
||||
int64_t datum = -1;
|
||||
GUPSHostArray indices = Kokkos::create_mirror_view(dev_indices);
|
||||
GUPSHostArray data = Kokkos::create_mirror_view(dev_data);
|
||||
|
||||
GUPSHostArray indices = Kokkos::create_mirror_view(dev_indices);
|
||||
GUPSHostArray data = Kokkos::create_mirror_view(dev_data);
|
||||
double gupsTime = 0.0;
|
||||
|
||||
double gupsTime = 0.0;
|
||||
|
||||
printf("Initializing Views...\n");
|
||||
printf("Initializing Views...\n");
|
||||
|
||||
#if defined(KOKKOS_HAVE_OPENMP)
|
||||
Kokkos::parallel_for("init-data", Kokkos::RangePolicy<Kokkos::OpenMP>(0, dataCount),
|
||||
Kokkos::parallel_for(
|
||||
"init-data", Kokkos::RangePolicy<Kokkos::OpenMP>(0, dataCount),
|
||||
#else
|
||||
Kokkos::parallel_for("init-data", Kokkos::RangePolicy<Kokkos::Serial>(0, dataCount),
|
||||
Kokkos::parallel_for(
|
||||
"init-data", Kokkos::RangePolicy<Kokkos::Serial>(0, dataCount),
|
||||
#endif
|
||||
KOKKOS_LAMBDA(const int i) {
|
||||
|
||||
data[i] = 10101010101;
|
||||
});
|
||||
KOKKOS_LAMBDA(const int i) { data[i] = 10101010101; });
|
||||
|
||||
#if defined(KOKKOS_HAVE_OPENMP)
|
||||
Kokkos::parallel_for("init-indices", Kokkos::RangePolicy<Kokkos::OpenMP>(0, indicesCount),
|
||||
Kokkos::parallel_for(
|
||||
"init-indices", Kokkos::RangePolicy<Kokkos::OpenMP>(0, indicesCount),
|
||||
#else
|
||||
Kokkos::parallel_for("init-indices", Kokkos::RangePolicy<Kokkos::Serial>(0, indicesCount),
|
||||
Kokkos::parallel_for(
|
||||
"init-indices", Kokkos::RangePolicy<Kokkos::Serial>(0, indicesCount),
|
||||
#endif
|
||||
KOKKOS_LAMBDA(const int i) {
|
||||
KOKKOS_LAMBDA(const int i) { indices[i] = 0; });
|
||||
|
||||
indices[i] = 0;
|
||||
});
|
||||
Kokkos::deep_copy(dev_data, data);
|
||||
Kokkos::deep_copy(dev_indices, indices);
|
||||
double start;
|
||||
|
||||
Kokkos::deep_copy(dev_data, data);
|
||||
Kokkos::deep_copy(dev_indices, indices);
|
||||
double start;
|
||||
printf("Starting benchmarking...\n");
|
||||
|
||||
printf("Starting benchmarking...\n");
|
||||
for (GUPSIndex k = 0; k < repeats; ++k) {
|
||||
randomize_indices(indices, dev_indices, data.extent(0));
|
||||
|
||||
for( GUPSIndex k = 0; k < repeats; ++k ) {
|
||||
randomize_indices(indices, dev_indices, data.extent(0));
|
||||
start = now();
|
||||
run_gups(dev_indices, dev_data, datum, useAtomics);
|
||||
gupsTime += now() - start;
|
||||
}
|
||||
|
||||
start = now();
|
||||
run_gups(dev_indices, dev_data, datum, useAtomics);
|
||||
gupsTime += now() - start;
|
||||
}
|
||||
Kokkos::deep_copy(indices, dev_indices);
|
||||
Kokkos::deep_copy(data, dev_data);
|
||||
|
||||
Kokkos::deep_copy(indices, dev_indices);
|
||||
Kokkos::deep_copy(data, dev_data);
|
||||
printf(HLINE);
|
||||
printf(
|
||||
"GUP/s Random: %18.6f\n",
|
||||
(1.0e-9 * ((double)repeats) * (double)dev_indices.extent(0)) / gupsTime);
|
||||
printf(HLINE);
|
||||
|
||||
printf(HLINE);
|
||||
printf("GUP/s Random: %18.6f\n",
|
||||
(1.0e-9 * ((double) repeats) * (double) dev_indices.extent(0)) / gupsTime);
|
||||
printf(HLINE);
|
||||
|
||||
return 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
printf(HLINE);
|
||||
printf("Kokkos GUPS Benchmark\n");
|
||||
printf(HLINE);
|
||||
|
||||
printf(HLINE);
|
||||
printf("Kokkos GUPS Benchmark\n");
|
||||
printf(HLINE);
|
||||
srand48(1010101);
|
||||
|
||||
srand48(1010101);
|
||||
Kokkos::initialize(argc, argv);
|
||||
|
||||
Kokkos::initialize(argc, argv);
|
||||
int64_t indices = 8192;
|
||||
int64_t data = 33554432;
|
||||
int64_t repeats = 10;
|
||||
bool useAtomics = false;
|
||||
|
||||
int64_t indices = 8192;
|
||||
int64_t data = 33554432;
|
||||
int64_t repeats = 10;
|
||||
bool useAtomics = false;
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
if (strcmp(argv[i], "--indices") == 0) {
|
||||
indices = std::atoll(argv[i + 1]);
|
||||
++i;
|
||||
} else if (strcmp(argv[i], "--data") == 0) {
|
||||
data = std::atoll(argv[i + 1]);
|
||||
++i;
|
||||
} else if (strcmp(argv[i], "--repeats") == 0) {
|
||||
repeats = std::atoll(argv[i + 1]);
|
||||
++i;
|
||||
} else if (strcmp(argv[i], "--atomics") == 0) {
|
||||
useAtomics = true;
|
||||
}
|
||||
}
|
||||
|
||||
for( int i = 1; i < argc; ++i ) {
|
||||
if( strcmp( argv[i], "--indices" ) == 0 ) {
|
||||
indices = std::atoll(argv[i+1]);
|
||||
++i;
|
||||
} else if( strcmp( argv[i], "--data" ) == 0 ) {
|
||||
data = std::atoll(argv[i+1]);
|
||||
++i;
|
||||
} else if( strcmp( argv[i], "--repeats" ) == 0 ) {
|
||||
repeats = std::atoll(argv[i+1]);
|
||||
++i;
|
||||
} else if( strcmp( argv[i], "--atomics" ) == 0 ) {
|
||||
useAtomics = true;
|
||||
}
|
||||
}
|
||||
const int rc = run_benchmark(indices, data, repeats, useAtomics);
|
||||
|
||||
const int rc = run_benchmark(indices, data, repeats, useAtomics);
|
||||
Kokkos::finalize();
|
||||
|
||||
Kokkos::finalize();
|
||||
|
||||
return rc;
|
||||
return rc;
|
||||
}
|
||||
|
|
|
@@ -94,22 +94,22 @@ int main(int argc, char* argv[]) {

return 0;
}
int team_range = atoi(argv[1]);
int thread_range = atoi(argv[2]);
int vector_range = atoi(argv[3]);
int team_range = std::stoi(argv[1]);
int thread_range = std::stoi(argv[2]);
int vector_range = std::stoi(argv[3]);
int outer_repeat = atoi(argv[4]);
int thread_repeat = atoi(argv[5]);
int vector_repeat = atoi(argv[6]);
int outer_repeat = std::stoi(argv[4]);
int thread_repeat = std::stoi(argv[5]);
int vector_repeat = std::stoi(argv[6]);
int team_size = atoi(argv[7]);
int vector_size = atoi(argv[8]);
int schedule = atoi(argv[9]);
int test_type = atoi(argv[10]);
int team_size = std::stoi(argv[7]);
int vector_size = std::stoi(argv[8]);
int schedule = std::stoi(argv[9]);
int test_type = std::stoi(argv[10]);
int disable_verbose_output = 0;
if (argc > 11) {
disable_verbose_output = atoi(argv[11]);
disable_verbose_output = std::stoi(argv[11]);
}
if (schedule != 1 && schedule != 2) {

@@ -138,9 +138,9 @@ int main(int argc, char* argv[]) {

double& lval) { lval += 1; },
result);
typedef Kokkos::View<double*, Kokkos::LayoutRight> view_type_1d;
typedef Kokkos::View<double**, Kokkos::LayoutRight> view_type_2d;
typedef Kokkos::View<double***, Kokkos::LayoutRight> view_type_3d;
using view_type_1d = Kokkos::View<double*, Kokkos::LayoutRight>;
using view_type_2d = Kokkos::View<double**, Kokkos::LayoutRight>;
using view_type_3d = Kokkos::View<double***, Kokkos::LayoutRight>;
// Allocate view without initializing
// Call a 'warmup' test with 1 repeat - this will initialize the corresponding

@@ -68,8 +68,8 @@ void test_policy(int team_range, int thread_range, int vector_range,

int team_size, int vector_size, int test_type, ViewType1& v1,
ViewType2& v2, ViewType3& v3, double& result,
double& result_expect, double& time) {
typedef Kokkos::TeamPolicy<ScheduleType, IndexType> t_policy;
typedef typename t_policy::member_type t_team;
using t_policy = Kokkos::TeamPolicy<ScheduleType, IndexType>;
using t_team = typename t_policy::member_type;
Kokkos::Timer timer;
for (int orep = 0; orep < outer_repeat; orep++) {
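The `t_policy`/`t_team` aliases above come from Kokkos' hierarchical parallelism. A minimal hedged sketch of a team-based `parallel_for` using those types (illustrative only; the benchmark's real kernels are more elaborate):

````cpp
#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    using t_policy = Kokkos::TeamPolicy<>;
    using t_team   = t_policy::member_type;

    const int league_size = 64;  // number of teams
    const int team_size   = 8;   // threads per team

    Kokkos::parallel_for(
        "team-loop", t_policy(league_size, team_size),
        KOKKOS_LAMBDA(const t_team& team) {
          // Nested parallelism over the members of this team.
          Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 16),
                               [&](const int i) {
                                 (void)i;  // per-thread work would go here
                               });
        });
  }
  Kokkos::finalize();
  return 0;
}
````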
@ -48,219 +48,224 @@
|
|||
#include <sys/time.h>
|
||||
|
||||
#define STREAM_ARRAY_SIZE 100000000
|
||||
#define STREAM_NTIMES 20
|
||||
#define STREAM_NTIMES 20
|
||||
|
||||
#define HLINE "-------------------------------------------------------------\n"
|
||||
|
||||
#if defined(KOKKOS_ENABLE_CUDA)
|
||||
typedef Kokkos::View<double*, Kokkos::CudaSpace>::HostMirror StreamHostArray;
|
||||
typedef Kokkos::View<double*, Kokkos::CudaSpace> StreamDeviceArray;
|
||||
using StreamHostArray = Kokkos::View<double*, Kokkos::CudaSpace>::HostMirror;
|
||||
using StreamDeviceArray = Kokkos::View<double*, Kokkos::CudaSpace>;
|
||||
#else
|
||||
typedef Kokkos::View<double*, Kokkos::HostSpace>::HostMirror StreamHostArray;
|
||||
typedef Kokkos::View<double*, Kokkos::HostSpace> StreamDeviceArray;
|
||||
using StreamHostArray = Kokkos::View<double*, Kokkos::HostSpace>::HostMirror;
|
||||
using StreamDeviceArray = Kokkos::View<double*, Kokkos::HostSpace>;
|
||||
#endif
|
||||
|
||||
typedef int StreamIndex;
|
||||
using StreamIndex = int;
|
||||
|
||||
double now() {
|
||||
struct timeval now;
|
||||
gettimeofday(&now, nullptr);
|
||||
struct timeval now;
|
||||
gettimeofday(&now, nullptr);
|
||||
|
||||
return (double) now.tv_sec + ((double) now.tv_usec * 1.0e-6);
|
||||
return (double)now.tv_sec + ((double)now.tv_usec * 1.0e-6);
|
||||
}
|
||||
|
||||
void perform_copy(StreamDeviceArray& a, StreamDeviceArray& b, StreamDeviceArray& c) {
|
||||
void perform_copy(StreamDeviceArray& a, StreamDeviceArray& b,
|
||||
StreamDeviceArray& c) {
|
||||
Kokkos::parallel_for(
|
||||
"copy", a.extent(0), KOKKOS_LAMBDA(const StreamIndex i) { c[i] = a[i]; });
|
||||
|
||||
Kokkos::parallel_for("copy", a.extent(0), KOKKOS_LAMBDA(const StreamIndex i) {
|
||||
c[i] = a[i];
|
||||
});
|
||||
|
||||
Kokkos::fence();
|
||||
Kokkos::fence();
|
||||
}
|
||||
|
||||
void perform_scale(StreamDeviceArray& a, StreamDeviceArray& b, StreamDeviceArray& c,
|
||||
const double scalar) {
|
||||
void perform_scale(StreamDeviceArray& a, StreamDeviceArray& b,
|
||||
StreamDeviceArray& c, const double scalar) {
|
||||
Kokkos::parallel_for(
|
||||
"copy", a.extent(0),
|
||||
KOKKOS_LAMBDA(const StreamIndex i) { b[i] = scalar * c[i]; });
|
||||
|
||||
Kokkos::parallel_for("copy", a.extent(0), KOKKOS_LAMBDA(const StreamIndex i) {
|
||||
b[i] = scalar * c[i];
|
||||
});
|
||||
|
||||
Kokkos::fence();
|
||||
Kokkos::fence();
|
||||
}
|
||||
|
||||
void perform_add(StreamDeviceArray& a, StreamDeviceArray& b, StreamDeviceArray& c) {
|
||||
Kokkos::parallel_for("add", a.extent(0), KOKKOS_LAMBDA(const StreamIndex i) {
|
||||
c[i] = a[i] + b[i];
|
||||
});
|
||||
void perform_add(StreamDeviceArray& a, StreamDeviceArray& b,
|
||||
StreamDeviceArray& c) {
|
||||
Kokkos::parallel_for(
|
||||
"add", a.extent(0),
|
||||
KOKKOS_LAMBDA(const StreamIndex i) { c[i] = a[i] + b[i]; });
|
||||
|
||||
Kokkos::fence();
|
||||
Kokkos::fence();
|
||||
}
|
||||
|
||||
void perform_triad(StreamDeviceArray& a, StreamDeviceArray& b, StreamDeviceArray& c,
|
||||
const double scalar) {
|
||||
void perform_triad(StreamDeviceArray& a, StreamDeviceArray& b,
|
||||
StreamDeviceArray& c, const double scalar) {
|
||||
Kokkos::parallel_for(
|
||||
"triad", a.extent(0),
|
||||
KOKKOS_LAMBDA(const StreamIndex i) { a[i] = b[i] + scalar * c[i]; });
|
||||
|
||||
Kokkos::parallel_for("triad", a.extent(0), KOKKOS_LAMBDA(const StreamIndex i) {
|
||||
a[i] = b[i] + scalar * c[i];
|
||||
});
|
||||
|
||||
Kokkos::fence();
|
||||
Kokkos::fence();
|
||||
}
|
||||
|
||||
int perform_validation(StreamHostArray& a, StreamHostArray& b, StreamHostArray& c,
|
||||
const StreamIndex arraySize, const double scalar) {
|
||||
int perform_validation(StreamHostArray& a, StreamHostArray& b,
|
||||
StreamHostArray& c, const StreamIndex arraySize,
|
||||
const double scalar) {
|
||||
double ai = 1.0;
|
||||
double bi = 2.0;
|
||||
double ci = 0.0;
|
||||
|
||||
double ai = 1.0;
|
||||
double bi = 2.0;
|
||||
double ci = 0.0;
|
||||
for (StreamIndex i = 0; i < arraySize; ++i) {
|
||||
ci = ai;
|
||||
bi = scalar * ci;
|
||||
ci = ai + bi;
|
||||
ai = bi + scalar * ci;
|
||||
};
|
||||
|
||||
for( StreamIndex i = 0; i < arraySize; ++i ) {
|
||||
ci = ai;
|
||||
bi = scalar * ci;
|
||||
ci = ai + bi;
|
||||
ai = bi + scalar * ci;
|
||||
};
|
||||
double aError = 0.0;
|
||||
double bError = 0.0;
|
||||
double cError = 0.0;
|
||||
|
||||
double aError = 0.0;
|
||||
double bError = 0.0;
|
||||
double cError = 0.0;
|
||||
for (StreamIndex i = 0; i < arraySize; ++i) {
|
||||
aError = std::abs(a[i] - ai);
|
||||
bError = std::abs(b[i] - bi);
|
||||
cError = std::abs(c[i] - ci);
|
||||
}
|
||||
|
||||
for( StreamIndex i = 0; i < arraySize; ++i ) {
|
||||
aError = std::abs( a[i] - ai );
|
||||
bError = std::abs( b[i] - bi );
|
||||
cError = std::abs( c[i] - ci );
|
||||
}
|
||||
double aAvgError = aError / (double)arraySize;
|
||||
double bAvgError = bError / (double)arraySize;
|
||||
double cAvgError = cError / (double)arraySize;
|
||||
|
||||
double aAvgError = aError / (double) arraySize;
|
||||
double bAvgError = bError / (double) arraySize;
|
||||
double cAvgError = cError / (double) arraySize;
|
||||
const double epsilon = 1.0e-13;
|
||||
int errorCount = 0;
|
||||
|
||||
const double epsilon = 1.0e-13;
|
||||
int errorCount = 0;
|
||||
if (std::abs(aAvgError / ai) > epsilon) {
|
||||
fprintf(stderr, "Error: validation check on View a failed.\n");
|
||||
errorCount++;
|
||||
}
|
||||
|
||||
if( std::abs( aAvgError / ai ) > epsilon ) {
|
||||
fprintf(stderr, "Error: validation check on View a failed.\n");
|
||||
errorCount++;
|
||||
}
|
||||
if (std::abs(bAvgError / bi) > epsilon) {
|
||||
fprintf(stderr, "Error: validation check on View b failed.\n");
|
||||
errorCount++;
|
||||
}
|
||||
|
||||
if( std::abs( bAvgError / bi ) > epsilon ) {
|
||||
fprintf(stderr, "Error: validation check on View b failed.\n");
|
||||
errorCount++;
|
||||
}
|
||||
if (std::abs(cAvgError / ci) > epsilon) {
|
||||
fprintf(stderr, "Error: validation check on View c failed.\n");
|
||||
errorCount++;
|
||||
}
|
||||
|
||||
if( std::abs( cAvgError / ci ) > epsilon ) {
|
||||
fprintf(stderr, "Error: validation check on View c failed.\n");
|
||||
errorCount++;
|
||||
}
|
||||
if (errorCount == 0) {
|
||||
printf("All solutions checked and verified.\n");
|
||||
}
|
||||
|
||||
if( errorCount == 0 ) {
|
||||
printf("All solutions checked and verified.\n");
|
||||
}
|
||||
|
||||
return errorCount;
|
||||
return errorCount;
|
||||
}
|
||||
|
||||
int run_benchmark() {
|
||||
printf("Reports fastest timing per kernel\n");
|
||||
printf("Creating Views...\n");
|
||||
|
||||
printf("Reports fastest timing per kernel\n");
|
||||
printf("Creating Views...\n");
|
||||
printf("Memory Sizes:\n");
|
||||
printf("- Array Size: %" PRIu64 "\n",
|
||||
static_cast<uint64_t>(STREAM_ARRAY_SIZE));
|
||||
printf("- Per Array: %12.2f MB\n",
|
||||
1.0e-6 * (double)STREAM_ARRAY_SIZE * (double)sizeof(double));
|
||||
printf("- Total: %12.2f MB\n",
|
||||
3.0e-6 * (double)STREAM_ARRAY_SIZE * (double)sizeof(double));
|
||||
|
||||
printf("Memory Sizes:\n");
|
||||
printf("- Array Size: %" PRIu64 "\n", static_cast<uint64_t>(STREAM_ARRAY_SIZE));
|
||||
printf("- Per Array: %12.2f MB\n", 1.0e-6 * (double) STREAM_ARRAY_SIZE * (double) sizeof(double));
|
||||
printf("- Total: %12.2f MB\n", 3.0e-6 * (double) STREAM_ARRAY_SIZE * (double) sizeof(double));
|
||||
printf("Benchmark kernels will be performed for %d iterations.\n",
|
||||
STREAM_NTIMES);
|
||||
|
||||
printf("Benchmark kernels will be performed for %d iterations.\n", STREAM_NTIMES);
|
||||
printf(HLINE);
|
||||
|
||||
printf(HLINE);
|
||||
StreamDeviceArray dev_a("a", STREAM_ARRAY_SIZE);
|
||||
StreamDeviceArray dev_b("b", STREAM_ARRAY_SIZE);
|
||||
StreamDeviceArray dev_c("c", STREAM_ARRAY_SIZE);
|
||||
|
||||
StreamDeviceArray dev_a("a", STREAM_ARRAY_SIZE);
|
||||
StreamDeviceArray dev_b("b", STREAM_ARRAY_SIZE);
|
||||
StreamDeviceArray dev_c("c", STREAM_ARRAY_SIZE);
|
||||
StreamHostArray a = Kokkos::create_mirror_view(dev_a);
|
||||
StreamHostArray b = Kokkos::create_mirror_view(dev_b);
|
||||
StreamHostArray c = Kokkos::create_mirror_view(dev_c);
|
||||
|
||||
StreamHostArray a = Kokkos::create_mirror_view(dev_a);
|
||||
StreamHostArray b = Kokkos::create_mirror_view(dev_b);
|
||||
StreamHostArray c = Kokkos::create_mirror_view(dev_c);
|
||||
const double scalar = 3.0;
|
||||
|
||||
const double scalar = 3.0;
|
||||
double copyTime = std::numeric_limits<double>::max();
|
||||
double scaleTime = std::numeric_limits<double>::max();
|
||||
double addTime = std::numeric_limits<double>::max();
|
||||
double triadTime = std::numeric_limits<double>::max();
|
||||
|
||||
double copyTime = std::numeric_limits<double>::max();
|
||||
double scaleTime = std::numeric_limits<double>::max();
|
||||
double addTime = std::numeric_limits<double>::max();
|
||||
double triadTime = std::numeric_limits<double>::max();
|
||||
|
||||
printf("Initializing Views...\n");
|
||||
printf("Initializing Views...\n");
|
||||
|
||||
#if defined(KOKKOS_HAVE_OPENMP)
|
||||
Kokkos::parallel_for("init", Kokkos::RangePolicy<Kokkos::OpenMP>(0, STREAM_ARRAY_SIZE),
|
||||
Kokkos::parallel_for(
|
||||
"init", Kokkos::RangePolicy<Kokkos::OpenMP>(0, STREAM_ARRAY_SIZE),
|
||||
#else
|
||||
Kokkos::parallel_for("init", Kokkos::RangePolicy<Kokkos::Serial>(0, STREAM_ARRAY_SIZE),
|
||||
Kokkos::parallel_for(
|
||||
"init", Kokkos::RangePolicy<Kokkos::Serial>(0, STREAM_ARRAY_SIZE),
|
||||
#endif
|
||||
KOKKOS_LAMBDA(const int i) {
|
||||
KOKKOS_LAMBDA(const int i) {
|
||||
a[i] = 1.0;
|
||||
b[i] = 2.0;
|
||||
c[i] = 0.0;
|
||||
});
|
||||
|
||||
a[i] = 1.0;
|
||||
b[i] = 2.0;
|
||||
c[i] = 0.0;
|
||||
});
|
||||
// Copy contents of a (from the host) to the dev_a (device)
|
||||
Kokkos::deep_copy(dev_a, a);
|
||||
Kokkos::deep_copy(dev_b, b);
|
||||
Kokkos::deep_copy(dev_c, c);
|
||||
|
||||
// Copy contents of a (from the host) to the dev_a (device)
|
||||
Kokkos::deep_copy(dev_a, a);
|
||||
Kokkos::deep_copy(dev_b, b);
|
||||
Kokkos::deep_copy(dev_c, c);
|
||||
double start;
|
||||
|
||||
double start;
|
||||
printf("Starting benchmarking...\n");
|
||||
|
||||
printf("Starting benchmarking...\n");
|
||||
for (StreamIndex k = 0; k < STREAM_NTIMES; ++k) {
|
||||
start = now();
|
||||
perform_copy(dev_a, dev_b, dev_c);
|
||||
copyTime = std::min(copyTime, (now() - start));
|
||||
|
||||
for( StreamIndex k = 0; k < STREAM_NTIMES; ++k ) {
|
||||
start = now();
|
||||
perform_copy(dev_a, dev_b, dev_c);
|
||||
copyTime = std::min( copyTime, (now() - start) );
|
||||
start = now();
|
||||
perform_scale(dev_a, dev_b, dev_c, scalar);
|
||||
scaleTime = std::min(scaleTime, (now() - start));
|
||||
|
||||
start = now();
|
||||
perform_scale(dev_a, dev_b, dev_c, scalar);
|
||||
scaleTime = std::min( scaleTime, (now() - start) );
|
||||
start = now();
|
||||
perform_add(dev_a, dev_b, dev_c);
|
||||
addTime = std::min(addTime, (now() - start));
|
||||
|
||||
start = now();
|
||||
perform_add(dev_a, dev_b, dev_c);
|
||||
addTime = std::min( addTime, (now() - start) );
|
||||
start = now();
|
||||
perform_triad(dev_a, dev_b, dev_c, scalar);
|
||||
triadTime = std::min(triadTime, (now() - start));
|
||||
}
|
||||
|
||||
start = now();
|
||||
perform_triad(dev_a, dev_b, dev_c, scalar);
|
||||
triadTime = std::min( triadTime, (now() - start) );
|
||||
}
|
||||
Kokkos::deep_copy(a, dev_a);
|
||||
Kokkos::deep_copy(b, dev_b);
|
||||
Kokkos::deep_copy(c, dev_c);
|
||||
|
||||
Kokkos::deep_copy(a, dev_a);
|
||||
Kokkos::deep_copy(b, dev_b);
|
||||
Kokkos::deep_copy(c, dev_c);
|
||||
printf("Performing validation...\n");
|
||||
int rc = perform_validation(a, b, c, STREAM_ARRAY_SIZE, scalar);
|
||||
|
||||
printf("Performing validation...\n");
|
||||
int rc = perform_validation(a, b, c, STREAM_ARRAY_SIZE, scalar);
|
||||
printf(HLINE);
|
||||
|
||||
printf(HLINE);
|
||||
printf("Copy %11.2f MB/s\n",
|
||||
(1.0e-06 * 2.0 * (double)sizeof(double) * (double)STREAM_ARRAY_SIZE) /
|
||||
copyTime);
|
||||
printf("Scale %11.2f MB/s\n",
|
||||
(1.0e-06 * 2.0 * (double)sizeof(double) * (double)STREAM_ARRAY_SIZE) /
|
||||
scaleTime);
|
||||
printf("Add %11.2f MB/s\n",
|
||||
(1.0e-06 * 3.0 * (double)sizeof(double) * (double)STREAM_ARRAY_SIZE) /
|
||||
addTime);
|
||||
printf("Triad %11.2f MB/s\n",
|
||||
(1.0e-06 * 3.0 * (double)sizeof(double) * (double)STREAM_ARRAY_SIZE) /
|
||||
triadTime);
|
||||
|
||||
printf("Copy %11.2f MB/s\n",
|
||||
( 1.0e-06 * 2.0 * (double) sizeof(double) * (double) STREAM_ARRAY_SIZE) / copyTime );
|
||||
printf("Scale %11.2f MB/s\n",
|
||||
( 1.0e-06 * 2.0 * (double) sizeof(double) * (double) STREAM_ARRAY_SIZE) / scaleTime );
|
||||
printf("Add %11.2f MB/s\n",
|
||||
( 1.0e-06 * 3.0 * (double) sizeof(double) * (double) STREAM_ARRAY_SIZE) / addTime );
|
||||
printf("Triad %11.2f MB/s\n",
|
||||
( 1.0e-06 * 3.0 * (double) sizeof(double) * (double) STREAM_ARRAY_SIZE) / triadTime );
|
||||
printf(HLINE);
|
||||
|
||||
printf(HLINE);
|
||||
|
||||
return rc;
|
||||
return rc;
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
printf(HLINE);
|
||||
printf("Kokkos STREAM Benchmark\n");
|
||||
printf(HLINE);
|
||||
|
||||
printf(HLINE);
|
||||
printf("Kokkos STREAM Benchmark\n");
|
||||
printf(HLINE);
|
||||
Kokkos::initialize(argc, argv);
|
||||
const int rc = run_benchmark();
|
||||
Kokkos::finalize();
|
||||
|
||||
Kokkos::initialize(argc, argv);
|
||||
const int rc = run_benchmark();
|
||||
Kokkos::finalize();
|
||||
|
||||
return rc;
|
||||
return rc;
|
||||
}
|
||||
|
|
|
@ -19,6 +19,13 @@ default_arch="sm_35"
|
|||
# The default C++ compiler.
|
||||
#
|
||||
host_compiler=${NVCC_WRAPPER_DEFAULT_COMPILER:-"g++"}
|
||||
|
||||
# Default to whatever is in the path
|
||||
nvcc_compiler=nvcc
|
||||
if [ ! -z $CUDA_ROOT ]; then
|
||||
nvcc_compiler="$CUDA_ROOT/bin/nvcc"
|
||||
fi
|
||||
|
||||
#host_compiler="icpc"
|
||||
#host_compiler="/usr/local/gcc/4.8.3/bin/g++"
|
||||
#host_compiler="/usr/local/gcc/4.9.1/bin/g++"
|
||||
|
@ -58,7 +65,7 @@ object_files_xlinker=""
|
|||
shared_versioned_libraries_host=""
|
||||
shared_versioned_libraries=""
|
||||
|
||||
# Does the User set the architecture
|
||||
# Does the User set the architecture
|
||||
arch_set=0
|
||||
|
||||
# Does the user overwrite the host compiler
|
||||
|
@ -77,7 +84,7 @@ host_only_args=""
|
|||
# Just run version on host compiler
|
||||
get_host_version=0
|
||||
|
||||
# Enable workaround for CUDA 6.5 for pragma ident
|
||||
# Enable workaround for CUDA 6.5 for pragma ident
|
||||
replace_pragma_ident=0
|
||||
|
||||
# Mark first host compiler argument
|
||||
|
@ -179,7 +186,7 @@ do
|
|||
shift
|
||||
;;
|
||||
#Handle known nvcc args
|
||||
--dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|--resource-usage|-Xptxas*|--fmad*)
|
||||
--dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|-Xptxas*|--fmad*|--Wext-lambda-captures-this|-Wext-lambda-captures-this)
|
||||
cuda_args="$cuda_args $1"
|
||||
;;
|
||||
#Handle more known nvcc args
|
||||
|
@ -187,7 +194,7 @@ do
|
|||
cuda_args="$cuda_args $1"
|
||||
;;
|
||||
#Handle known nvcc args that have an argument
|
||||
-rdc|-maxrregcount|--default-stream|-Xnvlink|--fmad)
|
||||
-rdc|-maxrregcount|--default-stream|-Xnvlink|--fmad|-cudart|--cudart)
|
||||
cuda_args="$cuda_args $1 $2"
|
||||
shift
|
||||
;;
|
||||
|
@ -195,11 +202,11 @@ do
|
|||
cuda_args="$cuda_args $1"
|
||||
;;
|
||||
#Handle unsupported standard flags
|
||||
--std=c++1y|-std=c++1y|--std=c++1z|-std=c++1z|--std=gnu++1y|-std=gnu++1y|--std=gnu++1z|-std=gnu++1z|--std=c++2a|-std=c++2a|--std=c++17|-std=c++17)
|
||||
--std=c++1y|-std=c++1y|--std=gnu++1y|-std=gnu++1y|--std=c++1z|-std=c++1z|--std=gnu++1z|-std=gnu++1z|--std=c++2a|-std=c++2a)
|
||||
fallback_std_flag="-std=c++14"
|
||||
# this is hopefully just occurring in a downstream project during CMake feature tests
|
||||
# we really have no choice here but to accept the flag and change to an accepted C++ standard
|
||||
echo "nvcc_wrapper does not accept standard flags $1 since partial standard flags and standards after C++14 are not supported. nvcc_wrapper will use $fallback_std_flag instead. It is undefined behavior to use this flag. This should only be occurring during CMake configuration."
|
||||
echo "nvcc_wrapper does not accept standard flags $1 since partial standard flags and standards after C++17 are not supported. nvcc_wrapper will use $fallback_std_flag instead. It is undefined behavior to use this flag. This should only be occurring during CMake configuration."
|
||||
if [ -n "$std_flag" ]; then
|
||||
warn_std_flag
|
||||
shared_args=${shared_args/ $std_flag/}
|
||||
|
@ -216,7 +223,25 @@ do
|
|||
fi
|
||||
std_flag=$corrected_std_flag
|
||||
shared_args="$shared_args $std_flag"
|
||||
;;
|
||||
;;
|
||||
--std=c++17|-std=c++17)
|
||||
if [ -n "$std_flag" ]; then
|
||||
warn_std_flag
|
||||
shared_args=${shared_args/ $std_flag/}
|
||||
fi
|
||||
# NVCC only has C++17 from version 11 on
|
||||
cuda_main_version=$([[ $(${nvcc_compiler} --version) =~ V([0-9]+) ]] && echo ${BASH_REMATCH[1]})
|
||||
if [ ${cuda_main_version} -lt 11 ]; then
|
||||
fallback_std_flag="-std=c++14"
|
||||
# this is hopefully just occurring in a downstream project during CMake feature tests
|
||||
# we really have no choice here but to accept the flag and change to an accepted C++ standard
|
||||
echo "nvcc_wrapper does not accept standard flags $1 since partial standard flags and standards after C++14 are not supported. nvcc_wrapper will use $fallback_std_flag instead. It is undefined behavior to use this flag. This should only be occurring during CMake configuration."
|
||||
std_flag=$fallback_std_flag
|
||||
else
|
||||
std_flag=$1
|
||||
fi
|
||||
shared_args="$shared_args $std_flag"
|
||||
;;
|
||||
--std=c++11|-std=c++11|--std=c++14|-std=c++14)
|
||||
if [ -n "$std_flag" ]; then
|
||||
warn_std_flag
|
||||
|
@ -226,6 +251,20 @@ do
|
|||
shared_args="$shared_args $std_flag"
|
||||
;;
|
||||
|
||||
#convert PGI standard flags to something nvcc can handle
|
||||
--c++11|--c++14|--c++17)
|
||||
if [ -n "$std_flag" ]; then
|
||||
warn_std_flag
|
||||
shared_args=${shared_args/ $std_flag/}
|
||||
fi
|
||||
std_flag="-std=${1#--}"
|
||||
shared_args="$shared_args $std_flag"
|
||||
;;
|
||||
|
||||
#ignore PGI forcing ISO C++-conforming code
|
||||
-A)
|
||||
;;
|
||||
|
||||
#strip of -std=c++98 due to nvcc warnings and Tribits will place both -std=c++11 and -std=c++98
|
||||
-std=c++98|--std=c++98)
|
||||
;;
|
||||
|
@ -237,13 +276,17 @@ do
|
|||
;;
|
||||
#strip -Xcompiler because we add it
|
||||
-Xcompiler)
|
||||
if [ $first_xcompiler_arg -eq 1 ]; then
|
||||
xcompiler_args="$2"
|
||||
first_xcompiler_arg=0
|
||||
else
|
||||
xcompiler_args="$xcompiler_args,$2"
|
||||
if [[ $2 != "-o" ]]; then
|
||||
if [ $first_xcompiler_arg -eq 1 ]; then
|
||||
xcompiler_args="$2"
|
||||
first_xcompiler_arg=0
|
||||
else
|
||||
xcompiler_args="$xcompiler_args,$2"
|
||||
fi
|
||||
shift
|
||||
fi
|
||||
shift
|
||||
# else we have -Xcompiler -o <filename>; in this case just drop -Xcompiler and process
|
||||
# the -o flag with the filename (done above)
|
||||
;;
|
||||
#strip of "-x cu" because we add that
|
||||
-x)
|
||||
|
@ -329,7 +372,7 @@ do
|
|||
if [ $first_xcompiler_arg -eq 1 ]; then
|
||||
xcompiler_args=$1
|
||||
first_xcompiler_arg=0
|
||||
else
|
||||
else
|
||||
xcompiler_args="$xcompiler_args,$1"
|
||||
fi
|
||||
;;
|
||||
|
@ -387,7 +430,7 @@ if [ $arch_set -ne 1 ]; then
|
|||
fi
|
||||
|
||||
#Compose compilation command
|
||||
nvcc_command="nvcc $cuda_args $shared_args $xlinker_args $shared_versioned_libraries"
|
||||
nvcc_command="$nvcc_compiler $cuda_args $shared_args $xlinker_args $shared_versioned_libraries"
|
||||
if [ $first_xcompiler_arg -eq 0 ]; then
|
||||
nvcc_command="$nvcc_command -Xcompiler $xcompiler_args"
|
||||
fi
|
||||
|
|
|
@ -2,6 +2,7 @@ SET(Kokkos_DEVICES @KOKKOS_ENABLED_DEVICES@)
|
|||
SET(Kokkos_OPTIONS @KOKKOS_ENABLED_OPTIONS@)
|
||||
SET(Kokkos_TPLS @KOKKOS_ENABLED_TPLS@)
|
||||
SET(Kokkos_ARCH @KOKKOS_ENABLED_ARCH_LIST@)
|
||||
SET(Kokkos_CXX_COMPILER "@CMAKE_CXX_COMPILER@")
|
||||
|
||||
# These are needed by KokkosKernels
|
||||
FOREACH(DEV ${Kokkos_DEVICES})
|
||||
|
@ -38,7 +39,7 @@ include(FindPackageHandleStandardArgs)
|
|||
# kokkos_check(
|
||||
# [DEVICES <devices>...] # Set of backends (e.g. "OpenMP" and/or "Cuda")
|
||||
# [ARCH <archs>...] # Target architectures (e.g. "Power9" and/or "Volta70")
|
||||
# [OPTIONS <options>...] # Optional settings (e.g. "PROFILING")
|
||||
# [OPTIONS <options>...] # Optional settings (e.g. "TUNING")
|
||||
# [TPLS <tpls>...] # Third party libraries
|
||||
# [RETURN_VALUE <result>] # Set a variable that indicates the result of the
|
||||
# # check instead of a fatal error
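A hedged downstream usage sketch of ``kokkos_check``, matching the signature documented above (run after ``find_package(Kokkos)`` in a consuming project; the result variable name is a placeholder):
````cmake
# Hard-fail configuration if the installed Kokkos lacks the OpenMP backend.
kokkos_check(DEVICES OpenMP)
# Only record whether TUNING was enabled instead of erroring out.
kokkos_check(OPTIONS TUNING RETURN_VALUE Kokkos_HAS_TUNING)
````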
|
|
|
@ -1,6 +1,7 @@
|
|||
|
||||
#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
|
||||
#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
|
||||
#error \
|
||||
"Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
|
||||
#else
|
||||
#define KOKKOS_CORE_CONFIG_H
|
||||
#endif
|
||||
|
@ -10,7 +11,6 @@
|
|||
// KOKKOS_VERSION / 10000 is the major version
|
||||
#cmakedefine KOKKOS_VERSION @KOKKOS_VERSION@
|
||||
|
||||
|
||||
/* Execution Spaces */
|
||||
#cmakedefine KOKKOS_ENABLE_SERIAL
|
||||
#cmakedefine KOKKOS_ENABLE_OPENMP
|
||||
|
@ -47,10 +47,9 @@
|
|||
#cmakedefine KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK
|
||||
#cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
|
||||
#cmakedefine KOKKOS_ENABLE_COMPILER_WARNINGS
|
||||
#cmakedefine KOKKOS_ENABLE_PROFILING
|
||||
#cmakedefine KOKKOS_ENABLE_PROFILING_LOAD_PRINT
|
||||
#cmakedefine KOKKOS_ENABLE_TUNING
|
||||
#cmakedefine KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
#cmakedefine KOKKOS_ENABLE_ETI
|
||||
#cmakedefine KOKKOS_ENABLE_LARGE_MEM_TESTS
|
||||
#cmakedefine KOKKOS_ENABLE_DUALVIEW_MODIFY_CHECK
|
||||
#cmakedefine KOKKOS_ENABLE_COMPLEX_ALIGN
|
||||
|
@ -60,7 +59,7 @@
|
|||
#cmakedefine KOKKOS_ENABLE_HWLOC
|
||||
#cmakedefine KOKKOS_USE_LIBRT
|
||||
#cmakedefine KOKKOS_ENABLE_HWBSPACE
|
||||
|
||||
#cmakedefine KOKKOS_ENABLE_LIBDL
|
||||
#cmakedefine KOKKOS_IMPL_CUDA_CLANG_WORKAROUND
|
||||
|
||||
#cmakedefine KOKKOS_COMPILER_CUDA_VERSION @KOKKOS_COMPILER_CUDA_VERSION@
|
||||
|
@ -95,4 +94,6 @@
|
|||
#cmakedefine KOKKOS_ARCH_VOLTA70
|
||||
#cmakedefine KOKKOS_ARCH_VOLTA72
|
||||
#cmakedefine KOKKOS_ARCH_TURING75
|
||||
#cmakedefine KOKKOS_ARCH_AMD_EPYC
|
||||
#cmakedefine KOKKOS_ARCH_AMPERE80
|
||||
#cmakedefine KOKKOS_ARCH_AMD_ZEN
|
||||
#cmakedefine KOKKOS_ARCH_AMD_ZEN2
|
||||
|
|
|
@ -0,0 +1,958 @@
|
|||
# Distributed under the OSI-approved BSD 3-Clause License. See accompanying
|
||||
# file Copyright.txt or https://cmake.org/licensing for details.
|
||||
|
||||
#[=======================================================================[.rst:
|
||||
FindCUDAToolkit
|
||||
---------------
|
||||
|
||||
This script locates the NVIDIA CUDA toolkit and the associated libraries, but
|
||||
does not require the ``CUDA`` language be enabled for a given project. This
|
||||
module does not search for the NVIDIA CUDA Samples.
|
||||
|
||||
Search Behavior
|
||||
^^^^^^^^^^^^^^^
|
||||
|
||||
Finding the CUDA Toolkit requires finding the ``nvcc`` executable, which is
|
||||
searched for in the following order:
|
||||
|
||||
1. If the ``CUDA`` language has been enabled we will use the directory
|
||||
containing the compiler as the first search location for ``nvcc``.
|
||||
|
||||
2. If the ``CUDAToolkit_ROOT`` cmake configuration variable (e.g.,
|
||||
``-DCUDAToolkit_ROOT=/some/path``) *or* environment variable is defined, it
|
||||
will be searched. If both an environment variable **and** a
|
||||
configuration variable are specified, the *configuration* variable takes
|
||||
precedence.
|
||||
|
||||
The directory specified here must be such that the executable ``nvcc`` can be
|
||||
found underneath the directory specified by ``CUDAToolkit_ROOT``. If
|
||||
``CUDAToolkit_ROOT`` is specified, but no ``nvcc`` is found underneath, this
|
||||
package is marked as **not** found. No subsequent search attempts are
|
||||
performed.
|
||||
|
||||
3. If the ``CUDA_PATH`` environment variable is defined, it will be searched.
|
||||
|
||||
4. The user's path is searched for ``nvcc`` using :command:`find_program`. If
|
||||
this is found, no subsequent search attempts are performed. Users are
|
||||
responsible for ensuring that the first ``nvcc`` to show up in the path is
|
||||
the desired path in the event that multiple CUDA Toolkits are installed.
|
||||
|
||||
5. On Unix systems, if the symbolic link ``/usr/local/cuda`` exists, this is
|
||||
used. No subsequent search attempts are performed. No default symbolic link
|
||||
location exists for the Windows platform.
|
||||
|
||||
6. The platform specific default install locations are searched. If exactly one
|
||||
candidate is found, this is used. The default CUDA Toolkit install locations
|
||||
searched are:
|
||||
|
||||
+-------------+-------------------------------------------------------------+
|
||||
| Platform | Search Pattern |
|
||||
+=============+=============================================================+
|
||||
| macOS | ``/Developer/NVIDIA/CUDA-X.Y`` |
|
||||
+-------------+-------------------------------------------------------------+
|
||||
| Other Unix | ``/usr/local/cuda-X.Y`` |
|
||||
+-------------+-------------------------------------------------------------+
|
||||
| Windows | ``C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y`` |
|
||||
+-------------+-------------------------------------------------------------+
|
||||
|
||||
Where ``X.Y`` would be a specific version of the CUDA Toolkit, such as
|
||||
``/usr/local/cuda-9.0`` or
|
||||
``C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0``
|
||||
|
||||
.. note::
|
||||
|
||||
When multiple CUDA Toolkits are installed in the default location of a
|
||||
system (e.g., both ``/usr/local/cuda-9.0`` and ``/usr/local/cuda-10.0``
|
||||
exist but the ``/usr/local/cuda`` symbolic link does **not** exist), this
|
||||
package is marked as **not** found.
|
||||
|
||||
There are too many factors involved in making an automatic decision in
|
||||
the presence of multiple CUDA Toolkits being installed. In this
|
||||
situation, users are encouraged to either (1) set ``CUDAToolkit_ROOT`` or
|
||||
(2) ensure that the correct ``nvcc`` executable shows up in ``$PATH`` for
|
||||
:command:`find_program` to find.
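As a hedged illustration of point (2) above, a project can pin the toolkit it wants before the find call; the install path below is a placeholder, not taken from this changeset.
````cmake
# Point the module at a specific toolkit install (placeholder path).
set(CUDAToolkit_ROOT "/opt/cuda-10.2" CACHE PATH "CUDA Toolkit to use")
find_package(CUDAToolkit REQUIRED)
message(STATUS "Using CUDA ${CUDAToolkit_VERSION} from ${CUDAToolkit_BIN_DIR}")
````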
Options
|
||||
^^^^^^^
|
||||
|
||||
``VERSION``
|
||||
If specified, describes the version of the CUDA Toolkit to search for.
|
||||
|
||||
``REQUIRED``
|
||||
If specified, configuration will error if a suitable CUDA Toolkit is not
|
||||
found.
|
||||
|
||||
``QUIET``
|
||||
If specified, the search for a suitable CUDA Toolkit will not produce any
|
||||
messages.
|
||||
|
||||
``EXACT``
|
||||
If specified, the CUDA Toolkit is considered found only if the exact
|
||||
``VERSION`` specified is recovered.
|
||||
|
||||
Imported targets
|
||||
^^^^^^^^^^^^^^^^
|
||||
|
||||
An :ref:`imported target <Imported targets>` named ``CUDA::toolkit`` is provided.
|
||||
|
||||
This module defines :prop_tgt:`IMPORTED` targets for each
|
||||
of the following libraries that are part of the CUDAToolkit:
|
||||
|
||||
- :ref:`CUDA Runtime Library<cuda_toolkit_rt_lib>`
|
||||
- :ref:`CUDA Driver Library<cuda_toolkit_driver_lib>`
|
||||
- :ref:`cuBLAS<cuda_toolkit_cuBLAS>`
|
||||
- :ref:`cuFFT<cuda_toolkit_cuFFT>`
|
||||
- :ref:`cuRAND<cuda_toolkit_cuRAND>`
|
||||
- :ref:`cuSOLVER<cuda_toolkit_cuSOLVER>`
|
||||
- :ref:`cuSPARSE<cuda_toolkit_cuSPARSE>`
|
||||
- :ref:`cuPTI<cuda_toolkit_cupti>`
|
||||
- :ref:`NPP<cuda_toolkit_NPP>`
|
||||
- :ref:`nvBLAS<cuda_toolkit_nvBLAS>`
|
||||
- :ref:`nvGRAPH<cuda_toolkit_nvGRAPH>`
|
||||
- :ref:`nvJPEG<cuda_toolkit_nvJPEG>`
|
||||
- :ref:`nvidia-ML<cuda_toolkit_nvML>`
|
||||
- :ref:`nvRTC<cuda_toolkit_nvRTC>`
|
||||
- :ref:`nvToolsExt<cuda_toolkit_nvToolsExt>`
|
||||
- :ref:`OpenCL<cuda_toolkit_opencl>`
|
||||
- :ref:`cuLIBOS<cuda_toolkit_cuLIBOS>`
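A minimal consumer sketch linking against the runtime target created by this module; the target and source names are hypothetical.
````cmake
find_package(CUDAToolkit REQUIRED)
add_executable(my_app main.cpp)  # placeholder target and source
# Link the shared CUDA runtime; CUDA::cudart_static is the static
# alternative and pulls in its extra system dependencies automatically.
target_link_libraries(my_app PRIVATE CUDA::cudart)
````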
.. _`cuda_toolkit_rt_lib`:
|
||||
|
||||
CUDA Runtime Library
|
||||
""""""""""""""""""""
|
||||
|
||||
The CUDA Runtime library (cudart) is what most applications will typically
need to link against to make calls such as `cudaMalloc` and `cudaFree`.
|
||||
|
||||
Targets Created:
|
||||
|
||||
- ``CUDA::cudart``
|
||||
- ``CUDA::cudart_static``
|
||||
|
||||
.. _`cuda_toolkit_driver_lib`:
|
||||
|
||||
CUDA Driver Library
|
||||
""""""""""""""""""""
|
||||
|
||||
The CUDA Driver library (cuda) is used by applications that make calls
such as `cuMemAlloc` and `cuMemFree`. This is generally used by advanced users.

Targets Created:

- ``CUDA::cuda_driver``
|
||||
|
||||
.. _`cuda_toolkit_cuBLAS`:
|
||||
|
||||
cuBLAS
|
||||
""""""
|
||||
|
||||
The `cuBLAS <https://docs.nvidia.com/cuda/cublas/index.html>`_ library.
|
||||
|
||||
Targets Created:
|
||||
|
||||
- ``CUDA::cublas``
|
||||
- ``CUDA::cublas_static``
|
||||
|
||||
.. _`cuda_toolkit_cuFFT`:
|
||||
|
||||
cuFFT
|
||||
"""""
|
||||
|
||||
The `cuFFT <https://docs.nvidia.com/cuda/cufft/index.html>`_ library.
|
||||
|
||||
Targets Created:
|
||||
|
||||
- ``CUDA::cufft``
|
||||
- ``CUDA::cufftw``
|
||||
- ``CUDA::cufft_static``
|
||||
- ``CUDA::cufftw_static``
|
||||
|
||||
cuRAND
|
||||
""""""
|
||||
|
||||
The `cuRAND <https://docs.nvidia.com/cuda/curand/index.html>`_ library.
|
||||
|
||||
Targets Created:
|
||||
|
||||
- ``CUDA::curand``
|
||||
- ``CUDA::curand_static``
|
||||
|
||||
.. _`cuda_toolkit_cuSOLVER`:
|
||||
|
||||
cuSOLVER
|
||||
""""""""
|
||||
|
||||
The `cuSOLVER <https://docs.nvidia.com/cuda/cusolver/index.html>`_ library.
|
||||
|
||||
Targets Created:
|
||||
|
||||
- ``CUDA::cusolver``
|
||||
- ``CUDA::cusolver_static``
|
||||
|
||||
.. _`cuda_toolkit_cuSPARSE`:
|
||||
|
||||
cuSPARSE
|
||||
""""""""
|
||||
|
||||
The `cuSPARSE <https://docs.nvidia.com/cuda/cusparse/index.html>`_ library.
|
||||
|
||||
Targets Created:
|
||||
|
||||
- ``CUDA::cusparse``
|
||||
- ``CUDA::cusparse_static``
|
||||
|
||||
.. _`cuda_toolkit_cupti`:
|
||||
|
||||
cupti
|
||||
"""""
|
||||
|
||||
The `NVIDIA CUDA Profiling Tools Interface <https://developer.nvidia.com/CUPTI>`_.
|
||||
|
||||
Targets Created:
|
||||
|
||||
- ``CUDA::cupti``
|
||||
- ``CUDA::cupti_static``
|
||||
|
||||
.. _`cuda_toolkit_NPP`:
|
||||
|
||||
NPP
|
||||
"""
|
||||
|
||||
The `NPP <https://docs.nvidia.com/cuda/npp/index.html>`_ libraries.
|
||||
|
||||
Targets Created:
|
||||
|
||||
- `nppc`:
|
||||
|
||||
- ``CUDA::nppc``
|
||||
- ``CUDA::nppc_static``
|
||||
|
||||
- `nppial`: Arithmetic and logical operation functions in `nppi_arithmetic_and_logical_operations.h`
|
||||
|
||||
- ``CUDA::nppial``
|
||||
- ``CUDA::nppial_static``
|
||||
|
||||
- `nppicc`: Color conversion and sampling functions in `nppi_color_conversion.h`
|
||||
|
||||
- ``CUDA::nppicc``
|
||||
- ``CUDA::nppicc_static``
|
||||
|
||||
- `nppicom`: JPEG compression and decompression functions in `nppi_compression_functions.h`
|
||||
|
||||
- ``CUDA::nppicom``
|
||||
- ``CUDA::nppicom_static``
|
||||
|
||||
- `nppidei`: Data exchange and initialization functions in `nppi_data_exchange_and_initialization.h`
|
||||
|
||||
- ``CUDA::nppidei``
|
||||
- ``CUDA::nppidei_static``
|
||||
|
||||
- `nppif`: Filtering and computer vision functions in `nppi_filter_functions.h`
|
||||
|
||||
- ``CUDA::nppif``
|
||||
- ``CUDA::nppif_static``
|
||||
|
||||
- `nppig`: Geometry transformation functions found in `nppi_geometry_transforms.h`
|
||||
|
||||
- ``CUDA::nppig``
|
||||
- ``CUDA::nppig_static``
|
||||
|
||||
- `nppim`: Morphological operation functions found in `nppi_morphological_operations.h`
|
||||
|
||||
- ``CUDA::nppim``
|
||||
- ``CUDA::nppim_static``
|
||||
|
||||
- `nppist`: Statistics and linear transform in `nppi_statistics_functions.h` and `nppi_linear_transforms.h`
|
||||
|
||||
- ``CUDA::nppist``
|
||||
- ``CUDA::nppist_static``
|
||||
|
||||
- `nppisu`: Memory support functions in `nppi_support_functions.h`
|
||||
|
||||
- ``CUDA::nppisu``
|
||||
- ``CUDA::nppisu_static``
|
||||
|
||||
- `nppitc`: Threshold and compare operation functions in `nppi_threshold_and_compare_operations.h`
|
||||
|
||||
- ``CUDA::nppitc``
|
||||
- ``CUDA::nppitc_static``
|
||||
|
||||
- `npps`:
|
||||
|
||||
- ``CUDA::npps``
|
||||
- ``CUDA::npps_static``
|
||||
|
||||
.. _`cuda_toolkit_nvBLAS`:
|
||||
|
||||
nvBLAS
|
||||
""""""
|
||||
|
||||
The `nvBLAS <https://docs.nvidia.com/cuda/nvblas/index.html>`_ libraries.
|
||||
This is a shared library only.
|
||||
|
||||
Targets Created:
|
||||
|
||||
- ``CUDA::nvblas``
|
||||
|
||||
.. _`cuda_toolkit_nvGRAPH`:
|
||||
|
||||
nvGRAPH
|
||||
"""""""
|
||||
|
||||
The `nvGRAPH <https://docs.nvidia.com/cuda/nvgraph/index.html>`_ library.
|
||||
|
||||
Targets Created:
|
||||
|
||||
- ``CUDA::nvgraph``
|
||||
- ``CUDA::nvgraph_static``
|
||||
|
||||
|
||||
.. _`cuda_toolkit_nvJPEG`:
|
||||
|
||||
nvJPEG
|
||||
""""""
|
||||
|
||||
The `nvJPEG <https://docs.nvidia.com/cuda/nvjpeg/index.html>`_ library.
|
||||
Introduced in CUDA 10.
|
||||
|
||||
Targets Created:
|
||||
|
||||
- ``CUDA::nvjpeg``
|
||||
- ``CUDA::nvjpeg_static``
|
||||
|
||||
.. _`cuda_toolkit_nvRTC`:
|
||||
|
||||
nvRTC
|
||||
"""""
|
||||
|
||||
The `nvRTC <https://docs.nvidia.com/cuda/nvrtc/index.html>`_ (Runtime Compilation) library.
|
||||
This is a shared library only.
|
||||
|
||||
Targets Created:
|
||||
|
||||
- ``CUDA::nvrtc``
|
||||
|
||||
.. _`cuda_toolkit_nvml`:
|
||||
|
||||
nvidia-ML
|
||||
"""""""""
|
||||
|
||||
The `NVIDIA Management Library <https://developer.nvidia.com/nvidia-management-library-nvml>`_.
|
||||
This is a shared library only.
|
||||
|
||||
Targets Created:
|
||||
|
||||
- ``CUDA::nvml``
|
||||
|
||||
.. _`cuda_toolkit_nvToolsExt`:
|
||||
|
||||
nvToolsExt
|
||||
""""""""""
|
||||
|
||||
The `NVIDIA Tools Extension <https://docs.nvidia.com/gameworks/content/gameworkslibrary/nvtx/nvidia_tools_extension_library_nvtx.htm>`_.
|
||||
This is a shared library only.
|
||||
|
||||
Targets Created:
|
||||
|
||||
- ``CUDA::nvToolsExt``
|
||||
|
||||
.. _`cuda_toolkit_opencl`:
|
||||
|
||||
OpenCL
|
||||
""""""
|
||||
|
||||
The `NVIDIA OpenCL Library <https://developer.nvidia.com/opencl>`_.
|
||||
This is a shared library only.
|
||||
|
||||
Targets Created:
|
||||
|
||||
- ``CUDA::OpenCL``
|
||||
|
||||
.. _`cuda_toolkit_cuLIBOS`:
|
||||
|
||||
cuLIBOS
|
||||
"""""""
|
||||
|
||||
The cuLIBOS library is a backend thread abstraction layer library which is
|
||||
static only. The ``CUDA::cublas_static``, ``CUDA::cusparse_static``,
|
||||
``CUDA::cufft_static``, ``CUDA::curand_static``, and (when implemented) NPP
|
||||
libraries all automatically have this dependency linked.
|
||||
|
||||
Target Created:
|
||||
|
||||
- ``CUDA::culibos``
|
||||
|
||||
**Note**: direct usage of this target by consumers should not be necessary.
|
||||
|
||||
.. _`cuda_toolkit_cuRAND`:
|
||||
|
||||
|
||||
|
||||
Result variables
|
||||
^^^^^^^^^^^^^^^^
|
||||
|
||||
``CUDAToolkit_FOUND``
|
||||
A boolean specifying whether or not the CUDA Toolkit was found.
|
||||
|
||||
``CUDAToolkit_VERSION``
|
||||
The exact version of the CUDA Toolkit found (as reported by
|
||||
``nvcc --version``).
|
||||
|
||||
``CUDAToolkit_VERSION_MAJOR``
|
||||
The major version of the CUDA Toolkit.
|
||||
|
||||
``CUDAToolkit_VERSION_MINOR``
The minor version of the CUDA Toolkit.
|
||||
|
||||
``CUDAToolkit_VERSION_PATCH``
|
||||
The patch version of the CUDA Toolkit.
|
||||
|
||||
``CUDAToolkit_BIN_DIR``
|
||||
The path to the CUDA Toolkit ``bin`` directory that contains the CUDA
|
||||
executable ``nvcc``.
|
||||
|
||||
``CUDAToolkit_INCLUDE_DIRS``
|
||||
The path to the CUDA Toolkit ``include`` folder containing the header files
|
||||
required to compile a project linking against CUDA.
|
||||
|
||||
``CUDAToolkit_LIBRARY_DIR``
|
||||
The path to the CUDA Toolkit library directory that contains the CUDA
|
||||
Runtime library ``cudart``.
|
||||
|
||||
``CUDAToolkit_TARGET_DIR``
|
||||
The path to the CUDA Toolkit directory including the target architecture
|
||||
when cross-compiling. When not cross-compiling this will be equivalent to
|
||||
``CUDAToolkit_ROOT_DIR``.
|
||||
|
||||
``CUDAToolkit_NVCC_EXECUTABLE``
|
||||
The path to the NVIDIA CUDA compiler ``nvcc``. Note that this path may
|
||||
**not** be the same as
|
||||
:variable:`CMAKE_CUDA_COMPILER <CMAKE_<LANG>_COMPILER>`. ``nvcc`` must be
|
||||
found to determine the CUDA Toolkit version as well as determining other
|
||||
features of the Toolkit. This variable is set for the convenience of
|
||||
modules that depend on this one.
|
||||
|
||||
|
||||
#]=======================================================================]
|
||||
|
||||
# NOTE: much of this was simply extracted from FindCUDA.cmake.
|
||||
|
||||
# James Bigler, NVIDIA Corp (nvidia.com - jbigler)
|
||||
# Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html
|
||||
#
|
||||
# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved.
|
||||
#
|
||||
# Copyright (c) 2007-2009
|
||||
# Scientific Computing and Imaging Institute, University of Utah
|
||||
#
|
||||
# This code is licensed under the MIT License. See the FindCUDA.cmake script
|
||||
# for the text of the license.
|
||||
|
||||
# The MIT License
|
||||
#
|
||||
# License for the specific language governing rights and limitations under
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a
|
||||
# copy of this software and associated documentation files (the "Software"),
|
||||
# to deal in the Software without restriction, including without limitation
|
||||
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
# and/or sell copies of the Software, and to permit persons to whom the
|
||||
# Software is furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included
|
||||
# in all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
||||
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
# DEALINGS IN THE SOFTWARE.
|
||||
#
|
||||
###############################################################################
|
||||
|
||||
# For NVCC we can easily deduce the SDK binary directory from the compiler path.
|
||||
if(CMAKE_CUDA_COMPILER_LOADED AND NOT CUDAToolkit_BIN_DIR AND CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA")
|
||||
get_filename_component(cuda_dir "${CMAKE_CUDA_COMPILER}" DIRECTORY)
|
||||
set(CUDAToolkit_BIN_DIR "${cuda_dir}" CACHE PATH "")
|
||||
mark_as_advanced(CUDAToolkit_BIN_DIR)
|
||||
unset(cuda_dir)
|
||||
endif()
|
||||
|
||||
IF(CMAKE_VERSION VERSION_LESS "3.12.0")
|
||||
function(import_target_link_libraries target)
|
||||
cmake_parse_arguments(HACK
|
||||
"SYSTEM;INTERFACE;PUBLIC"
|
||||
""
|
||||
""
|
||||
${ARGN}
|
||||
)
|
||||
get_target_property(LIBS ${target} INTERFACE_LINK_LIBRARIES)
|
||||
if (LIBS)
|
||||
list(APPEND LIBS ${HACK_UNPARSED_ARGUMENTS})
|
||||
else()
|
||||
set(LIBS ${HACK_UNPARSED_ARGUMENTS})
|
||||
endif()
|
||||
set_target_properties(${target} PROPERTIES
|
||||
INTERFACE_LINK_LIBRARIES "${LIBS}")
|
||||
endfunction()
|
||||
ELSE()
|
||||
function(import_target_link_libraries)
|
||||
target_link_libraries(${ARGN})
|
||||
endfunction()
|
||||
ENDIF()
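Both branches expose the same call signature, so the code further down can attach interface link libraries to imported targets without caring which CMake version drives the build. A hedged usage sketch with placeholder names:
````cmake
# "CUDA::example" is a placeholder imported target, mirroring how the
# helper is used later in this file.
add_library(CUDA::example IMPORTED INTERFACE)
import_target_link_libraries(CUDA::example INTERFACE "${CMAKE_DL_LIBS}")
````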
IF(CMAKE_VERSION VERSION_LESS "3.13.0")
|
||||
function(import_target_link_directories target)
|
||||
cmake_parse_arguments(HACK
|
||||
"SYSTEM;INTERFACE;PUBLIC"
|
||||
""
|
||||
""
|
||||
${ARGN}
|
||||
)
|
||||
get_target_property(LINK_LIBS ${target} INTERFACE_LINK_LIBRARIES)
|
||||
if (LINK_LIBS) #could be not-found
|
||||
set(LINK_LIBS_LIST ${LINK_LIBS})
|
||||
endif()
|
||||
foreach(LIB ${HACK_UNPARSED_ARGUMENTS})
|
||||
list(APPEND LINK_LIBS_LIST -L${LIB})
|
||||
endforeach()
|
||||
set_target_properties(${target} PROPERTIES
|
||||
INTERFACE_LINK_LIBRARIES "${LINK_LIBS_LIST}")
|
||||
endfunction()
|
||||
ELSE()
|
||||
function(import_target_link_directories)
|
||||
target_link_directories(${ARGN})
|
||||
endfunction()
|
||||
ENDIF()
|
||||
|
||||
IF(CMAKE_VERSION VERSION_LESS "3.12.0")
|
||||
function(import_target_include_directories target)
|
||||
cmake_parse_arguments(HACK
|
||||
"SYSTEM;INTERFACE;PUBLIC"
|
||||
""
|
||||
""
|
||||
${ARGN}
|
||||
)
|
||||
get_target_property(INCLUDE_DIRS ${target} INTERFACE_INCLUDE_DIRECTORIES)
|
||||
if (INCLUDE_DIRS)
|
||||
list(APPEND INCLUDE_DIRS ${HACK_UNPARSED_ARGUMENTS})
|
||||
else()
|
||||
set(INCLUDE_DIRS ${HACK_UNPARSED_ARGUMENTS})
|
||||
endif()
|
||||
set_target_properties(${target} PROPERTIES
|
||||
INTERFACE_INCLUDE_DIRECTORIES "${INCLUDE_DIRS}")
|
||||
endfunction()
|
||||
ELSE()
|
||||
function(import_target_include_directories)
|
||||
target_include_directories(${ARGN})
|
||||
endfunction()
|
||||
ENDIF()
|
||||
|
||||
# Try language- or user-provided path first.
|
||||
if(CUDAToolkit_BIN_DIR)
|
||||
find_program(CUDAToolkit_NVCC_EXECUTABLE
|
||||
NAMES nvcc nvcc.exe
|
||||
PATHS ${CUDAToolkit_BIN_DIR}
|
||||
NO_DEFAULT_PATH
|
||||
)
|
||||
endif()
|
||||
|
||||
# Search using CUDAToolkit_ROOT
|
||||
find_program(CUDAToolkit_NVCC_EXECUTABLE
|
||||
NAMES nvcc nvcc.exe
|
||||
PATHS ENV CUDA_PATH
|
||||
PATH_SUFFIXES bin
|
||||
)
|
||||
|
||||
# If the user specified CUDAToolkit_ROOT but nvcc could not be found, this is an error.
|
||||
if (NOT CUDAToolkit_NVCC_EXECUTABLE AND (DEFINED CUDAToolkit_ROOT OR DEFINED ENV{CUDAToolkit_ROOT}))
|
||||
# Declare error messages now, print later depending on find_package args.
|
||||
set(fail_base "Could not find nvcc executable in path specified by")
|
||||
set(cuda_root_fail "${fail_base} CUDAToolkit_ROOT=${CUDAToolkit_ROOT}")
|
||||
set(env_cuda_root_fail "${fail_base} environment variable CUDAToolkit_ROOT=$ENV{CUDAToolkit_ROOT}")
|
||||
|
||||
if (CUDAToolkit_FIND_REQUIRED)
|
||||
if (DEFINED CUDAToolkit_ROOT)
|
||||
message(FATAL_ERROR ${cuda_root_fail})
|
||||
elseif (DEFINED ENV{CUDAToolkit_ROOT})
|
||||
message(FATAL_ERROR ${env_cuda_root_fail})
|
||||
endif()
|
||||
else()
|
||||
if (NOT CUDAToolkit_FIND_QUIETLY)
|
||||
if (DEFINED CUDAToolkit_ROOT)
|
||||
message(STATUS ${cuda_root_fail})
|
||||
elseif (DEFINED ENV{CUDAToolkit_ROOT})
|
||||
message(STATUS ${env_cuda_root_fail})
|
||||
endif()
|
||||
endif()
|
||||
set(CUDAToolkit_FOUND FALSE)
|
||||
unset(fail_base)
|
||||
unset(cuda_root_fail)
|
||||
unset(env_cuda_root_fail)
|
||||
return()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# CUDAToolkit_ROOT cmake / env variable not specified, try platform defaults.
|
||||
#
|
||||
# - Linux: /usr/local/cuda-X.Y
|
||||
# - macOS: /Developer/NVIDIA/CUDA-X.Y
|
||||
# - Windows: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y
|
||||
#
|
||||
# We will also search the default symlink location /usr/local/cuda first since
|
||||
# if CUDAToolkit_ROOT is not specified, it is assumed that the symlinked
|
||||
# directory is the desired location.
|
||||
if (NOT CUDAToolkit_NVCC_EXECUTABLE)
|
||||
if (UNIX)
|
||||
if (NOT APPLE)
|
||||
set(platform_base "/usr/local/cuda-")
|
||||
else()
|
||||
set(platform_base "/Developer/NVIDIA/CUDA-")
|
||||
endif()
|
||||
else()
|
||||
set(platform_base "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v")
|
||||
endif()
|
||||
|
||||
# Build out a descending list of possible cuda installations, e.g.
|
||||
file(GLOB possible_paths "${platform_base}*")
|
||||
# Iterate the glob results and create a descending list.
|
||||
set(possible_versions)
|
||||
foreach (p ${possible_paths})
|
||||
# Extract version number from end of string
|
||||
string(REGEX MATCH "[0-9][0-9]?\\.[0-9]$" p_version ${p})
|
||||
if (IS_DIRECTORY ${p} AND p_version)
|
||||
list(APPEND possible_versions ${p_version})
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
# Cannot use list(SORT) because that is alphabetical, we need numerical.
|
||||
# NOTE: this is not an efficient sorting strategy. But even if a user had
|
||||
# every possible version of CUDA installed, this wouldn't create any
|
||||
# significant overhead.
|
||||
set(versions)
|
||||
foreach (v ${possible_versions})
|
||||
list(LENGTH versions num_versions)
|
||||
# First version, nothing to compare with so just append.
|
||||
if (num_versions EQUAL 0)
|
||||
list(APPEND versions ${v})
|
||||
else()
|
||||
# Loop through list. Insert at an index when comparison is
|
||||
# VERSION_GREATER since we want a descending list. Duplicates will not
|
||||
# happen since this came from a glob list of directories.
|
||||
set(i 0)
|
||||
set(early_terminate FALSE)
|
||||
while (i LESS num_versions)
|
||||
list(GET versions ${i} curr)
|
||||
if (v VERSION_GREATER curr)
|
||||
list(INSERT versions ${i} ${v})
|
||||
set(early_terminate TRUE)
|
||||
break()
|
||||
endif()
|
||||
math(EXPR i "${i} + 1")
|
||||
endwhile()
|
||||
# If it did not get inserted, place it at the end.
|
||||
if (NOT early_terminate)
|
||||
list(APPEND versions ${v})
|
||||
endif()
|
||||
endif()
|
||||
endforeach()
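For reference only, and not something this module can rely on while it supports older CMake: newer CMake (3.18 or later, to the best of my knowledge) can do a natural, descending sort directly, which would replace the manual insertion sort above.
````cmake
# Hedged alternative for CMake >= 3.18: sort the detected version strings
# numerically in descending order in one call.
list(SORT possible_versions COMPARE NATURAL ORDER DESCENDING)
set(versions ${possible_versions})
````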
# With a descending list of versions, populate possible paths to search.
|
||||
set(search_paths)
|
||||
foreach (v ${versions})
|
||||
list(APPEND search_paths "${platform_base}${v}")
|
||||
endforeach()
|
||||
|
||||
# Force the global default /usr/local/cuda to the front on Unix.
|
||||
if (UNIX)
|
||||
list(INSERT search_paths 0 "/usr/local/cuda")
|
||||
endif()
|
||||
|
||||
# Now search for nvcc again using the platform default search paths.
|
||||
find_program(CUDAToolkit_NVCC_EXECUTABLE
|
||||
NAMES nvcc nvcc.exe
|
||||
PATHS ${search_paths}
|
||||
PATH_SUFFIXES bin
|
||||
)
|
||||
|
||||
# We are done with these variables now, cleanup for caller.
|
||||
unset(platform_base)
|
||||
unset(possible_paths)
|
||||
unset(possible_versions)
|
||||
unset(versions)
|
||||
unset(i)
|
||||
unset(early_terminate)
|
||||
unset(search_paths)
|
||||
|
||||
if (NOT CUDAToolkit_NVCC_EXECUTABLE)
|
||||
if (CUDAToolkit_FIND_REQUIRED)
|
||||
message(FATAL_ERROR "Could not find nvcc, please set CUDAToolkit_ROOT.")
|
||||
elseif(NOT CUDAToolkit_FIND_QUIETLY)
|
||||
message(STATUS "Could not find nvcc, please set CUDAToolkit_ROOT.")
|
||||
endif()
|
||||
|
||||
set(CUDAToolkit_FOUND FALSE)
|
||||
return()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(NOT CUDAToolkit_BIN_DIR AND CUDAToolkit_NVCC_EXECUTABLE)
|
||||
get_filename_component(cuda_dir "${CUDAToolkit_NVCC_EXECUTABLE}" DIRECTORY)
|
||||
set(CUDAToolkit_BIN_DIR "${cuda_dir}" CACHE PATH "" FORCE)
|
||||
mark_as_advanced(CUDAToolkit_BIN_DIR)
|
||||
unset(cuda_dir)
|
||||
endif()
|
||||
|
||||
if(CUDAToolkit_NVCC_EXECUTABLE AND
|
||||
CUDAToolkit_NVCC_EXECUTABLE STREQUAL CMAKE_CUDA_COMPILER)
|
||||
# Need to set these based off the already computed CMAKE_CUDA_COMPILER_VERSION value
|
||||
# This if statement will always match, but is used to provide variables for MATCH 1,2,3...
|
||||
if(CMAKE_CUDA_COMPILER_VERSION MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=])
|
||||
set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}")
|
||||
set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}")
|
||||
set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}")
|
||||
set(CUDAToolkit_VERSION "${CMAKE_CUDA_COMPILER_VERSION}")
|
||||
endif()
|
||||
else()
|
||||
# Compute the version by invoking nvcc
|
||||
execute_process (COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT)
|
||||
if(NVCC_OUT MATCHES [=[ V([0-9]+)\.([0-9]+)\.([0-9]+)]=])
|
||||
set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}")
|
||||
set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}")
|
||||
set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}")
|
||||
set(CUDAToolkit_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}")
|
||||
endif()
|
||||
unset(NVCC_OUT)
|
||||
endif()
|
||||
|
||||
|
||||
get_filename_component(CUDAToolkit_ROOT_DIR ${CUDAToolkit_BIN_DIR} DIRECTORY ABSOLUTE)
|
||||
|
||||
# Handle cross compilation
|
||||
if(CMAKE_CROSSCOMPILING)
|
||||
if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7-a")
|
||||
# Support for NVPACK
|
||||
set (CUDAToolkit_TARGET_NAME "armv7-linux-androideabi")
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm")
|
||||
# Support for arm cross compilation
|
||||
set(CUDAToolkit_TARGET_NAME "armv7-linux-gnueabihf")
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
|
||||
# Support for aarch64 cross compilation
|
||||
if (ANDROID_ARCH_NAME STREQUAL "arm64")
|
||||
set(CUDAToolkit_TARGET_NAME "aarch64-linux-androideabi")
|
||||
else()
|
||||
set(CUDAToolkit_TARGET_NAME "aarch64-linux")
|
||||
endif (ANDROID_ARCH_NAME STREQUAL "arm64")
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
|
||||
set(CUDAToolkit_TARGET_NAME "x86_64-linux")
|
||||
endif()
|
||||
|
||||
if (EXISTS "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}")
|
||||
set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}")
|
||||
# add known CUDA target root path to the set of directories we search for programs, libraries and headers
|
||||
list(PREPEND CMAKE_FIND_ROOT_PATH "${CUDAToolkit_TARGET_DIR}")
|
||||
|
||||
# Mark that we need to pop the root search path changes after we have
|
||||
# found all cuda libraries so that searches for our cross-compilation
|
||||
# libraries work when another cuda sdk is in CMAKE_PREFIX_PATH or
|
||||
# PATH.
|
||||
set(_CUDAToolkit_Pop_ROOT_PATH True)
|
||||
endif()
|
||||
else()
|
||||
# Not cross compiling
|
||||
set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}")
|
||||
# Now that we have the real ROOT_DIR, find components inside it.
|
||||
list(APPEND CMAKE_PREFIX_PATH ${CUDAToolkit_ROOT_DIR})
|
||||
|
||||
# Mark that we need to pop the prefix path changes after we have
|
||||
# found the cudart library.
|
||||
set(_CUDAToolkit_Pop_Prefix True)
|
||||
endif()
|
||||
|
||||
|
||||
# Find the include/ directory
|
||||
find_path(CUDAToolkit_INCLUDE_DIR
|
||||
NAMES cuda_runtime.h
|
||||
)
|
||||
|
||||
# And find the CUDA Runtime Library libcudart
|
||||
find_library(CUDA_CUDART
|
||||
NAMES cudart
|
||||
PATH_SUFFIXES lib64 lib/x64
|
||||
)
|
||||
if (NOT CUDA_CUDART)
|
||||
find_library(CUDA_CUDART
|
||||
NAMES cudart
|
||||
PATH_SUFFIXES lib64/stubs lib/x64/stubs
|
||||
)
|
||||
endif()
|
||||
|
||||
if (NOT CUDA_CUDART AND NOT CUDAToolkit_FIND_QUIETLY)
|
||||
message(STATUS "Unable to find cudart library.")
|
||||
endif()
|
||||
|
||||
unset(CUDAToolkit_ROOT_DIR)
|
||||
if(_CUDAToolkit_Pop_Prefix)
|
||||
list(REMOVE_AT CMAKE_PREFIX_PATH -1)
|
||||
unset(_CUDAToolkit_Pop_Prefix)
|
||||
endif()
|
||||
|
||||
#-----------------------------------------------------------------------------
|
||||
# Perform version comparison and validate all required variables are set.
|
||||
include(FindPackageHandleStandardArgs)
|
||||
find_package_handle_standard_args(CUDAToolkit
|
||||
REQUIRED_VARS
|
||||
CUDAToolkit_INCLUDE_DIR
|
||||
CUDA_CUDART
|
||||
CUDAToolkit_NVCC_EXECUTABLE
|
||||
VERSION_VAR
|
||||
CUDAToolkit_VERSION
|
||||
)
|
||||
mark_as_advanced(CUDA_CUDART
|
||||
CUDAToolkit_INCLUDE_DIR
|
||||
CUDAToolkit_NVCC_EXECUTABLE
|
||||
)
|
||||
|
||||
#-----------------------------------------------------------------------------
|
||||
# Construct result variables
|
||||
if(CUDAToolkit_FOUND)
|
||||
set(CUDAToolkit_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIR})
|
||||
get_filename_component(CUDAToolkit_LIBRARY_DIR ${CUDA_CUDART} DIRECTORY ABSOLUTE)
|
||||
endif()
|
||||
|
||||
#-----------------------------------------------------------------------------
|
||||
# Construct import targets
|
||||
if(CUDAToolkit_FOUND)
|
||||
|
||||
function(_CUDAToolkit_find_and_add_import_lib lib_name)
|
||||
cmake_parse_arguments(arg "" "" "ALT;DEPS;EXTRA_PATH_SUFFIXES" ${ARGN})
|
||||
|
||||
set(search_names ${lib_name} ${arg_ALT})
|
||||
|
||||
find_library(CUDA_${lib_name}_LIBRARY
|
||||
NAMES ${search_names}
|
||||
HINTS ${CUDAToolkit_LIBRARY_DIR}
|
||||
ENV CUDA_PATH
|
||||
PATH_SUFFIXES nvidia/current lib64 lib/x64 lib
|
||||
${arg_EXTRA_PATH_SUFFIXES}
|
||||
)
|
||||
# Don't try any stub directories until we have exhausted all other
|
||||
# search locations.
|
||||
if(NOT CUDA_${lib_name}_LIBRARY)
|
||||
find_library(CUDA_${lib_name}_LIBRARY
|
||||
NAMES ${search_names}
|
||||
HINTS ${CUDAToolkit_LIBRARY_DIR}
|
||||
ENV CUDA_PATH
|
||||
PATH_SUFFIXES lib64/stubs lib/x64/stubs lib/stubs stubs
|
||||
)
|
||||
endif()
|
||||
|
||||
mark_as_advanced(CUDA_${lib_name}_LIBRARY)
|
||||
|
||||
if (NOT TARGET CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY)
|
||||
add_library(CUDA::${lib_name} IMPORTED INTERFACE)
|
||||
import_target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}")
|
||||
import_target_link_libraries(CUDA::${lib_name} INTERFACE "${CUDA_${lib_name}_LIBRARY}")
|
||||
foreach(dep ${arg_DEPS})
|
||||
if(TARGET CUDA::${dep})
|
||||
import_target_link_libraries(CUDA::${lib_name} INTERFACE CUDA::${dep})
|
||||
endif()
|
||||
endforeach()
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
if(NOT TARGET CUDA::toolkit)
|
||||
add_library(CUDA::toolkit IMPORTED INTERFACE)
|
||||
import_target_include_directories(CUDA::toolkit SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}")
|
||||
import_target_link_directories(CUDA::toolkit INTERFACE "${CUDAToolkit_LIBRARY_DIR}")
|
||||
endif()
|
||||
|
||||
_CUDAToolkit_find_and_add_import_lib(cuda_driver ALT cuda)
|
||||
|
||||
_CUDAToolkit_find_and_add_import_lib(cudart)
|
||||
_CUDAToolkit_find_and_add_import_lib(cudart_static)
|
||||
|
||||
# setup dependencies that are required for cudart_static when building
|
||||
# on linux. These are generally only required when using the CUDA toolkit
|
||||
# when CUDA language is disabled
|
||||
if(NOT TARGET CUDA::cudart_static_deps
|
||||
AND TARGET CUDA::cudart_static)
|
||||
|
||||
add_library(CUDA::cudart_static_deps IMPORTED INTERFACE)
|
||||
import_target_link_libraries(CUDA::cudart_static INTERFACE CUDA::cudart_static_deps)
|
||||
|
||||
if(UNIX AND (CMAKE_C_COMPILER OR CMAKE_CXX_COMPILER))
|
||||
find_package(Threads REQUIRED)
|
||||
import_target_link_libraries(CUDA::cudart_static_deps INTERFACE Threads::Threads ${CMAKE_DL_LIBS})
|
||||
endif()
|
||||
|
||||
if(UNIX AND NOT APPLE)
|
||||
# On Linux, you must link against librt when using the static cuda runtime.
|
||||
find_library(CUDAToolkit_rt_LIBRARY rt)
|
||||
mark_as_advanced(CUDAToolkit_rt_LIBRARY)
|
||||
if(NOT CUDAToolkit_rt_LIBRARY)
|
||||
message(WARNING "Could not find librt library, needed by CUDA::cudart_static")
|
||||
else()
|
||||
import_target_link_libraries(CUDA::cudart_static_deps INTERFACE ${CUDAToolkit_rt_LIBRARY})
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
_CUDAToolkit_find_and_add_import_lib(culibos) # it's a static library
|
||||
foreach (cuda_lib cublas cufft curand cusparse nppc nvjpeg)
|
||||
_CUDAToolkit_find_and_add_import_lib(${cuda_lib})
|
||||
_CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS culibos)
|
||||
endforeach()
|
||||
|
||||
# cuFFTW depends on cuFFT
|
||||
_CUDAToolkit_find_and_add_import_lib(cufftw DEPS cufft)
|
||||
_CUDAToolkit_find_and_add_import_lib(cufftw DEPS cufft_static)
|
||||
|
||||
# cuSOLVER depends on cuBLAS, and cuSPARSE
|
||||
_CUDAToolkit_find_and_add_import_lib(cusolver DEPS cublas cusparse)
|
||||
_CUDAToolkit_find_and_add_import_lib(cusolver_static DEPS cublas_static cusparse_static culibos)
|
||||
|
||||
# nvGRAPH depends on cuRAND, and cuSOLVER.
|
||||
_CUDAToolkit_find_and_add_import_lib(nvgraph DEPS curand cusolver)
|
||||
_CUDAToolkit_find_and_add_import_lib(nvgraph_static DEPS curand_static cusolver_static)
|
||||
|
||||
# Process the majority of the NPP libraries.
|
||||
foreach (cuda_lib nppial nppicc nppidei nppif nppig nppim nppist nppitc npps nppicom nppisu)
|
||||
_CUDAToolkit_find_and_add_import_lib(${cuda_lib} DEPS nppc)
|
||||
_CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS nppc_static)
|
||||
endforeach()
|
||||
|
||||
_CUDAToolkit_find_and_add_import_lib(cupti
|
||||
EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/
|
||||
../extras/CUPTI/lib/)
|
||||
_CUDAToolkit_find_and_add_import_lib(cupti_static
|
||||
EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/
|
||||
../extras/CUPTI/lib/)
|
||||
|
||||
_CUDAToolkit_find_and_add_import_lib(nvrtc DEPS cuda_driver)
|
||||
|
||||
_CUDAToolkit_find_and_add_import_lib(nvml ALT nvidia-ml nvml)
|
||||
|
||||
if(WIN32)
|
||||
# nvtools can be installed outside the CUDA toolkit directory
|
||||
# so prefer the NVTOOLSEXT_PATH windows only environment variable
|
||||
# In addition on windows the most common name is nvToolsExt64_1
|
||||
find_library(CUDA_nvToolsExt_LIBRARY
|
||||
NAMES nvToolsExt64_1 nvToolsExt64 nvToolsExt
|
||||
PATHS ENV NVTOOLSEXT_PATH
|
||||
ENV CUDA_PATH
|
||||
PATH_SUFFIXES lib/x64 lib
|
||||
)
|
||||
endif()
|
||||
_CUDAToolkit_find_and_add_import_lib(nvToolsExt ALT nvToolsExt64)
|
||||
|
||||
_CUDAToolkit_find_and_add_import_lib(OpenCL)
|
||||
endif()
|
||||
|
||||
if(_CUDAToolkit_Pop_ROOT_PATH)
|
||||
list(REMOVE_AT CMAKE_FIND_ROOT_PATH 0)
|
||||
unset(_CUDAToolkit_Pop_ROOT_PATH)
|
||||
endif()
|
|
@ -1,17 +1,37 @@
|
|||
|
||||
IF (KOKKOS_CXX_COMPILER_ID STREQUAL Clang)
|
||||
# Note: "stubs" suffix allows CMake to find the dummy
|
||||
# libcuda.so provided by the NVIDIA CUDA Toolkit for
|
||||
# cross-compiling CUDA on a host without a GPU.
|
||||
KOKKOS_FIND_IMPORTED(CUDA INTERFACE
|
||||
LIBRARIES cudart cuda
|
||||
LIBRARY_PATHS ENV LD_LIBRARY_PATH ENV CUDA_PATH /usr/local/cuda
|
||||
LIBRARY_SUFFIXES lib lib64 lib/stubs lib64/stubs
|
||||
ALLOW_SYSTEM_PATH_FALLBACK
|
||||
)
|
||||
ELSE()
|
||||
KOKKOS_CREATE_IMPORTED_TPL(CUDA INTERFACE
|
||||
LINK_LIBRARIES cuda
|
||||
)
|
||||
IF (NOT CUDAToolkit_ROOT)
|
||||
IF (NOT CUDA_ROOT)
|
||||
SET(CUDA_ROOT $ENV{CUDA_ROOT})
|
||||
ENDIF()
|
||||
IF(CUDA_ROOT)
|
||||
SET(CUDAToolkit_ROOT ${CUDA_ROOT})
|
||||
ENDIF()
|
||||
ENDIF()
|
||||
|
||||
IF(CMAKE_VERSION VERSION_GREATER_EQUAL "3.17.0")
|
||||
find_package(CUDAToolkit)
|
||||
ELSE()
|
||||
include(${CMAKE_CURRENT_LIST_DIR}/CudaToolkit.cmake)
|
||||
ENDIF()
|
||||
|
||||
|
||||
IF (TARGET CUDA::cudart)
|
||||
SET(FOUND_CUDART TRUE)
|
||||
KOKKOS_EXPORT_IMPORTED_TPL(CUDA::cudart)
|
||||
ELSE()
|
||||
SET(FOUND_CUDART FALSE)
|
||||
ENDIF()
|
||||
|
||||
IF (TARGET CUDA::cuda_driver)
|
||||
SET(FOUND_CUDA_DRIVER TRUE)
|
||||
KOKKOS_EXPORT_IMPORTED_TPL(CUDA::cuda_driver)
|
||||
ELSE()
|
||||
SET(FOUND_CUDA_DRIVER FALSE)
|
||||
ENDIF()
|
||||
|
||||
include(FindPackageHandleStandardArgs)
|
||||
FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUDA DEFAULT_MSG FOUND_CUDART FOUND_CUDA_DRIVER)
|
||||
IF (FOUND_CUDA_DRIVER AND FOUND_CUDART)
|
||||
KOKKOS_CREATE_IMPORTED_TPL(CUDA INTERFACE
|
||||
LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart
|
||||
)
|
||||
ENDIF()
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
|
@ -8,8 +9,6 @@
|
|||
// Under the terms of Contract DE-NA0003525 with NTESS,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Kokkos is licensed under 3-clause BSD terms of use:
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
|
@ -41,18 +40,43 @@
|
|||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#define KOKKOS_IMPL_COMPILING_LIBRARY true
|
||||
#include <Kokkos_Core.hpp>
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
KOKKOS_IMPL_VIEWCOPY_ETI_INST(int64_t*****, LayoutStride, LayoutRight, OpenMP,
|
||||
int64_t)
|
||||
KOKKOS_IMPL_VIEWCOPY_ETI_INST(int64_t*****, LayoutStride, LayoutLeft, OpenMP,
|
||||
int64_t)
|
||||
KOKKOS_IMPL_VIEWCOPY_ETI_INST(int64_t*****, LayoutStride, LayoutStride, OpenMP,
|
||||
int64_t)
|
||||
KOKKOS_IMPL_VIEWFILL_ETI_INST(int64_t*****, LayoutStride, OpenMP, int64_t)
|
||||
#include <iostream>
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
int main() {
|
||||
cudaDeviceProp device_properties;
|
||||
const cudaError_t error = cudaGetDeviceProperties(&device_properties,
|
||||
/*device*/ 0);
|
||||
if (error != cudaSuccess) {
|
||||
std::cout << "CUDA error: " << cudaGetErrorString(error) << '\n';
|
||||
return error;
|
||||
}
|
||||
unsigned int const compute_capability =
|
||||
device_properties.major * 10 + device_properties.minor;
|
||||
#ifdef SM_ONLY
|
||||
std::cout << compute_capability;
|
||||
#else
|
||||
switch (compute_capability) {
|
||||
// clang-format off
|
||||
case 30: std::cout << "Set -DKokkos_ARCH_KEPLER30=ON ." << std::endl; break;
|
||||
case 32: std::cout << "Set -DKokkos_ARCH_KEPLER32=ON ." << std::endl; break;
|
||||
case 35: std::cout << "Set -DKokkos_ARCH_KEPLER35=ON ." << std::endl; break;
|
||||
case 37: std::cout << "Set -DKokkos_ARCH_KEPLER37=ON ." << std::endl; break;
|
||||
case 50: std::cout << "Set -DKokkos_ARCH_MAXWELL50=ON ." << std::endl; break;
|
||||
case 52: std::cout << "Set -DKokkos_ARCH_MAXWELL52=ON ." << std::endl; break;
|
||||
case 53: std::cout << "Set -DKokkos_ARCH_MAXWELL53=ON ." << std::endl; break;
|
||||
case 60: std::cout << "Set -DKokkos_ARCH_PASCAL60=ON ." << std::endl; break;
|
||||
case 61: std::cout << "Set -DKokkos_ARCH_PASCAL61=ON ." << std::endl; break;
|
||||
case 70: std::cout << "Set -DKokkos_ARCH_VOLTA70=ON ." << std::endl; break;
|
||||
case 72: std::cout << "Set -DKokkos_ARCH_VOLTA72=ON ." << std::endl; break;
|
||||
case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break;
|
||||
case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break;
|
||||
default:
|
||||
std::cout << "Compute capability " << compute_capability
|
||||
<< " is not supported" << std::endl;
|
||||
// clang-format on
|
||||
}
|
||||
#endif
|
||||
return 0;
|
||||
}
|
|
@ -88,7 +88,7 @@ FUNCTION(KOKKOS_ADD_TEST)
|
|||
if (KOKKOS_HAS_TRILINOS)
|
||||
CMAKE_PARSE_ARGUMENTS(TEST
|
||||
""
|
||||
"EXE;NAME"
|
||||
"EXE;NAME;TOOL"
|
||||
""
|
||||
${ARGN})
|
||||
IF(TEST_EXE)
|
||||
|
@ -104,10 +104,15 @@ FUNCTION(KOKKOS_ADD_TEST)
|
|||
NUM_MPI_PROCS 1
|
||||
${TEST_UNPARSED_ARGUMENTS}
|
||||
)
|
||||
|
||||
if(TEST_TOOL)
|
||||
add_dependencies(${EXE} ${TEST_TOOL}) # make sure building the exe also builds the tool
|
||||
set_property(TEST ${TEST_NAME} APPEND_STRING PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$<TARGET_FILE:${TEST_TOOL}>")
|
||||
endif()
|
||||
else()
|
||||
CMAKE_PARSE_ARGUMENTS(TEST
|
||||
"WILL_FAIL"
|
||||
"FAIL_REGULAR_EXPRESSION;PASS_REGULAR_EXPRESSION;EXE;NAME"
|
||||
"FAIL_REGULAR_EXPRESSION;PASS_REGULAR_EXPRESSION;EXE;NAME;TOOL"
|
||||
"CATEGORIES;CMD_ARGS"
|
||||
${ARGN})
|
||||
# To match Tribits, we should always be receiving
|
||||
|
@ -135,6 +140,10 @@ FUNCTION(KOKKOS_ADD_TEST)
|
|||
IF(TEST_PASS_REGULAR_EXPRESSION)
|
||||
SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION})
|
||||
ENDIF()
|
||||
if(TEST_TOOL)
|
||||
add_dependencies(${EXE} ${TEST_TOOL}) # make sure building the exe also builds the tool
|
||||
set_property(TEST ${TEST_NAME} APPEND_STRING PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$<TARGET_FILE:${TEST_TOOL}>")
|
||||
endif()
|
||||
VERIFY_EMPTY(KOKKOS_ADD_TEST ${TEST_UNPARSED_ARGUMENTS})
|
||||
endif()
|
||||
ENDFUNCTION()
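A hedged sketch of the new ``TOOL`` argument (test and tool names are placeholders): the test executable gains a build dependency on the tool target, and ``KOKKOS_PROFILE_LIBRARY`` is pointed at the built tool when the test runs.
````cmake
# "kokkosp_sample_tool" must be a library target defined elsewhere in the
# build; the test then runs with KOKKOS_PROFILE_LIBRARY set to its location.
KOKKOS_ADD_TEST(
  NAME ProfilingSmoke
  EXE  ProfilingSmoke
  TOOL kokkosp_sample_tool
)
````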
|
|
|
@ -2,11 +2,14 @@
|
|||
FUNCTION(KOKKOS_ARCH_OPTION SUFFIX DEV_TYPE DESCRIPTION)
|
||||
#all optimizations off by default
|
||||
KOKKOS_OPTION(ARCH_${SUFFIX} OFF BOOL "Optimize for ${DESCRIPTION} (${DEV_TYPE})")
|
||||
IF (KOKKOS_ARCH_${SUFFIX})
|
||||
SET(KOKKOS_ARCH_${SUFFIX} ${KOKKOS_ARCH_${SUFFIX}} PARENT_SCOPE)
|
||||
SET(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE)
|
||||
SET(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE)
|
||||
SET(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE)
|
||||
IF(KOKKOS_ARCH_${SUFFIX})
|
||||
LIST(APPEND KOKKOS_ENABLED_ARCH_LIST ${SUFFIX})
|
||||
SET(KOKKOS_ENABLED_ARCH_LIST ${KOKKOS_ENABLED_ARCH_LIST} PARENT_SCOPE)
|
||||
ENDIF()
|
||||
SET(KOKKOS_ARCH_${SUFFIX} ${KOKKOS_ARCH_${SUFFIX}} PARENT_SCOPE)
|
||||
ENDFUNCTION()
|
||||
|
||||
|
||||
|
@ -15,6 +18,10 @@ KOKKOS_CFG_DEPENDS(ARCH COMPILER_ID)
|
|||
KOKKOS_CFG_DEPENDS(ARCH DEVICES)
|
||||
KOKKOS_CFG_DEPENDS(ARCH OPTIONS)
|
||||
|
||||
KOKKOS_CHECK_DEPRECATED_OPTIONS(
|
||||
ARCH_EPYC "Please replace EPYC with ZEN or ZEN2, depending on your platform"
|
||||
ARCH_RYZEN "Please replace RYZEN with ZEN or ZEN2, depending on your platform"
|
||||
)
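A hedged migration sketch for downstream cache scripts that previously enabled the removed option; whether ZEN or ZEN2 is correct depends on the actual CPU generation.
````cmake
# Before (now rejected with the deprecation message above):
#   set(Kokkos_ARCH_EPYC ON CACHE BOOL "")
# After, for a Zen2 (Rome) machine:
set(Kokkos_ARCH_ZEN2 ON CACHE BOOL "Optimize for AMD Zen2")
````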
#-------------------------------------------------------------------------------
|
||||
# List of possible host architectures.
|
||||
|
@ -51,9 +58,12 @@ KOKKOS_ARCH_OPTION(PASCAL61 GPU "NVIDIA Pascal generation CC 6.1")
|
|||
KOKKOS_ARCH_OPTION(VOLTA70 GPU "NVIDIA Volta generation CC 7.0")
|
||||
KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2")
|
||||
KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5")
|
||||
KOKKOS_ARCH_OPTION(EPYC HOST "AMD Epyc architecture")
|
||||
KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0")
|
||||
KOKKOS_ARCH_OPTION(ZEN HOST "AMD Zen architecture")
|
||||
KOKKOS_ARCH_OPTION(ZEN2 HOST "AMD Zen2 architecture")
|
||||
KOKKOS_ARCH_OPTION(VEGA900 GPU "AMD GPU MI25 GFX900")
|
||||
KOKKOS_ARCH_OPTION(VEGA906 GPU "AMD GPU MI50/MI60 GFX906")
|
||||
KOKKOS_ARCH_OPTION(INTEL_GEN GPU "Intel GPUs Gen9+")
|
||||
|
||||
IF (KOKKOS_ENABLE_CUDA)
|
||||
#Regardless of version, make sure we define the general architecture name
|
||||
|
@ -75,6 +85,10 @@ IF (KOKKOS_ENABLE_CUDA)
|
|||
IF (KOKKOS_ARCH_VOLTA70 OR KOKKOS_ARCH_VOLTA72)
|
||||
SET(KOKKOS_ARCH_VOLTA ON)
|
||||
ENDIF()
|
||||
|
||||
IF (KOKKOS_ARCH_AMPERE80)
|
||||
SET(KOKKOS_ARCH_AMPERE ON)
|
||||
ENDIF()
|
||||
ENDIF()
|
||||
|
||||
|
||||
|
@ -88,9 +102,10 @@ IF(KOKKOS_ENABLE_COMPILER_WARNINGS)
|
|||
${COMMON_WARNINGS})
|
||||
|
||||
COMPILER_SPECIFIC_FLAGS(
|
||||
PGI NO-VALUE-SPECIFIED
|
||||
GNU ${GNU_WARNINGS}
|
||||
DEFAULT ${COMMON_WARNINGS}
|
||||
COMPILER_ID CMAKE_CXX_COMPILER_ID
|
||||
PGI NO-VALUE-SPECIFIED
|
||||
GNU ${GNU_WARNINGS}
|
||||
DEFAULT ${COMMON_WARNINGS}
|
||||
)
|
||||
ENDIF()
|
||||
|
||||
|
@ -102,6 +117,9 @@ GLOBAL_SET(KOKKOS_CUDA_OPTIONS)
|
|||
IF (KOKKOS_ENABLE_CUDA_LAMBDA)
|
||||
IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
|
||||
GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-expt-extended-lambda")
|
||||
IF(KOKKOS_COMPILER_CUDA_VERSION GREATER_EQUAL 110)
|
||||
GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-Wext-lambda-captures-this")
|
||||
ENDIF()
|
||||
ENDIF()
|
||||
ENDIF()
|
||||
|
||||
|
@ -113,7 +131,6 @@ ENDIF()
|
|||
|
||||
IF (KOKKOS_CXX_COMPILER_ID STREQUAL Clang)
|
||||
SET(CUDA_ARCH_FLAG "--cuda-gpu-arch")
|
||||
SET(AMDGPU_ARCH_FLAG "--amdgpu-target")
|
||||
GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -x cuda)
|
||||
IF (KOKKOS_ENABLE_CUDA)
|
||||
SET(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND ON CACHE BOOL "enable CUDA Clang workarounds" FORCE)
|
||||
|
@ -133,6 +150,15 @@ IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
|
|||
ENDIF()
|
||||
ENDIF()
|
||||
|
||||
|
||||
#------------------------------- KOKKOS_HIP_OPTIONS ---------------------------
|
||||
#clear anything that might be in the cache
|
||||
GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS)
|
||||
IF(KOKKOS_CXX_COMPILER_ID STREQUAL HIP)
|
||||
SET(AMDGPU_ARCH_FLAG "--amdgpu-target")
|
||||
ENDIF()
|
||||
|
||||
|
||||
IF (KOKKOS_ARCH_ARMV80)
|
||||
COMPILER_SPECIFIC_FLAGS(
|
||||
Cray NO-VALUE-SPECIFIED
|
||||
|
@ -167,12 +193,21 @@ IF (KOKKOS_ARCH_ARMV8_THUNDERX2)
|
|||
)
|
||||
ENDIF()
|
||||
|
||||
IF (KOKKOS_ARCH_EPYC)
|
||||
IF (KOKKOS_ARCH_ZEN)
|
||||
COMPILER_SPECIFIC_FLAGS(
|
||||
Intel -mavx2
|
||||
DEFAULT -march=znver1 -mtune=znver1
|
||||
)
|
||||
SET(KOKKOS_ARCH_AMD_EPYC ON)
|
||||
SET(KOKKOS_ARCH_AMD_ZEN ON)
|
||||
SET(KOKKOS_ARCH_AMD_AVX2 ON)
|
||||
ENDIF()
|
||||
|
||||
IF (KOKKOS_ARCH_ZEN2)
|
||||
COMPILER_SPECIFIC_FLAGS(
|
||||
Intel -mavx2
|
||||
DEFAULT -march=znver2 -mtune=znver2
|
||||
)
|
||||
SET(KOKKOS_ARCH_AMD_ZEN2 ON)
|
||||
SET(KOKKOS_ARCH_AMD_AVX2 ON)
|
||||
ENDIF()
|
||||
|
||||
|
@ -216,14 +251,6 @@ IF (KOKKOS_ARCH_BDW)
|
|||
)
|
||||
ENDIF()
|
||||
|
||||
IF (KOKKOS_ARCH_EPYC)
|
||||
SET(KOKKOS_ARCH_AMD_AVX2 ON)
|
||||
COMPILER_SPECIFIC_FLAGS(
|
||||
Intel -mvax2
|
||||
DEFAULT -march=znver1 -mtune=znver1
|
||||
)
|
||||
ENDIF()
|
||||
|
||||
IF (KOKKOS_ARCH_KNL)
|
||||
#avx512-mic
|
||||
SET(KOKKOS_ARCH_AVX512MIC ON) #not a cache variable
|
||||
|
@ -253,7 +280,7 @@ IF (KOKKOS_ARCH_SKX)
|
|||
)
|
||||
ENDIF()
|
||||
|
||||
IF (KOKKOS_ARCH_WSM OR KOKKOS_ARCH_SNB OR KOKKOS_ARCH_HSW OR KOKKOS_ARCH_BDW OR KOKKOS_ARCH_KNL OR KOKKOS_ARCH_SKX OR KOKKOS_ARCH_EPYC)
|
||||
IF (KOKKOS_ARCH_WSM OR KOKKOS_ARCH_SNB OR KOKKOS_ARCH_HSW OR KOKKOS_ARCH_BDW OR KOKKOS_ARCH_KNL OR KOKKOS_ARCH_SKX OR KOKKOS_ARCH_ZEN OR KOKKOS_ARCH_ZEN2)
|
||||
SET(KOKKOS_USE_ISA_X86_64 ON)
|
||||
ENDIF()
|
||||
|
||||
|
@ -296,6 +323,21 @@ IF (Kokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE)
|
|||
)
|
||||
ENDIF()
|
||||
|
||||
# Clang needs mcx16 option enabled for Windows atomic functions
|
||||
IF (CMAKE_CXX_COMPILER_ID STREQUAL Clang AND WIN32)
|
||||
COMPILER_SPECIFIC_OPTIONS(
|
||||
Clang -mcx16
|
||||
)
|
||||
ENDIF()
|
||||
|
||||
# MSVC ABI has many deprecation warnings, so ignore them
|
||||
IF (CMAKE_CXX_COMPILER_ID STREQUAL MSVC OR "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC")
|
||||
COMPILER_SPECIFIC_DEFS(
|
||||
Clang _CRT_SECURE_NO_WARNINGS
|
||||
)
|
||||
ENDIF()
|
||||
|
||||
|
||||
#Right now we cannot get the compiler ID when cross-compiling, so just check
|
||||
#that HIP is enabled
|
||||
IF (Kokkos_ENABLE_HIP)
|
||||
|
@ -324,11 +366,15 @@ FUNCTION(CHECK_CUDA_ARCH ARCH FLAG)
|
|||
ELSE()
|
||||
SET(KOKKOS_CUDA_ARCH_FLAG ${FLAG} PARENT_SCOPE)
|
||||
GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}")
|
||||
IF(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE)
|
||||
IF(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE OR KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
|
||||
GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}")
|
||||
ENDIF()
|
||||
ENDIF()
|
||||
ENDIF()
|
||||
LIST(APPEND KOKKOS_CUDA_ARCH_FLAGS ${FLAG})
|
||||
SET(KOKKOS_CUDA_ARCH_FLAGS ${KOKKOS_CUDA_ARCH_FLAGS} PARENT_SCOPE)
|
||||
LIST(APPEND KOKKOS_CUDA_ARCH_LIST ${ARCH})
|
||||
SET(KOKKOS_CUDA_ARCH_LIST ${KOKKOS_CUDA_ARCH_LIST} PARENT_SCOPE)
|
||||
ENDFUNCTION()
|
||||
|
||||
|
||||
|
@ -346,6 +392,7 @@ CHECK_CUDA_ARCH(PASCAL61 sm_61)
|
|||
CHECK_CUDA_ARCH(VOLTA70 sm_70)
|
||||
CHECK_CUDA_ARCH(VOLTA72 sm_72)
|
||||
CHECK_CUDA_ARCH(TURING75 sm_75)
|
||||
CHECK_CUDA_ARCH(AMPERE80 sm_80)
|
||||
|
||||
SET(AMDGPU_ARCH_ALREADY_SPECIFIED "")
|
||||
FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG)
|
||||
|
@ -372,12 +419,19 @@ ENDFUNCTION()
|
|||
CHECK_AMDGPU_ARCH(VEGA900 gfx900) # Radeon Instinct MI25
|
||||
CHECK_AMDGPU_ARCH(VEGA906 gfx906) # Radeon Instinct MI50 and MI60
|
||||
|
||||
IF(KOKKOS_ENABLE_HIP AND NOT AMDGPU_ARCH_ALREADY_SPECIFIED)
|
||||
MESSAGE(SEND_ERROR "HIP enabled but no AMD GPU architecture currently enabled. "
|
||||
"Please enable one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'.")
|
||||
ENDIF()
|
||||
|
||||
IF (KOKKOS_ENABLE_OPENMPTARGET)
|
||||
SET(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG})
|
||||
IF (CLANG_CUDA_ARCH)
|
||||
STRING(REPLACE "sm_" "cc" PGI_CUDA_ARCH ${CLANG_CUDA_ARCH})
|
||||
COMPILER_SPECIFIC_FLAGS(
|
||||
Clang -Xopenmp-target -march=${CLANG_CUDA_ARCH} -fopenmp-targets=nvptx64-nvidia-cuda
|
||||
XL -qtgtarch=${KOKKOS_CUDA_ARCH_FLAG}
|
||||
PGI -gpu=${PGI_CUDA_ARCH}
|
||||
)
|
||||
ENDIF()
|
||||
SET(CLANG_AMDGPU_ARCH ${KOKKOS_AMDGPU_ARCH_FLAG})
|
||||
|
@ -386,10 +440,39 @@ IF (KOKKOS_ENABLE_OPENMPTARGET)
|
|||
Clang -Xopenmp-target=amdgcn-amd-amdhsa -march=${CLANG_AMDGPU_ARCH} -fopenmp-targets=amdgcn-amd-amdhsa
|
||||
)
|
||||
ENDIF()
|
||||
IF (KOKKOS_ARCH_INTEL_GEN)
|
||||
COMPILER_SPECIFIC_FLAGS(
|
||||
IntelClang -fopenmp-targets=spir64 -D__STRICT_ANSI__
|
||||
)
|
||||
ENDIF()
|
||||
ENDIF()
|
||||
|
||||
IF(KOKKOS_ENABLE_CUDA AND NOT CUDA_ARCH_ALREADY_SPECIFIED)
|
||||
MESSAGE(SEND_ERROR "CUDA enabled but no NVIDIA GPU architecture currently enabled. Please give one -DKokkos_ARCH_{..}=ON' to enable an NVIDIA GPU architecture.")
|
||||
# Try to autodetect the CUDA Compute Capability by asking the device
|
||||
SET(_BINARY_TEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/cmake/compile_tests/CUDAComputeCapabilityWorkdir)
|
||||
FILE(REMOVE_RECURSE ${_BINARY_TEST_DIR})
|
||||
FILE(MAKE_DIRECTORY ${_BINARY_TEST_DIR})
|
||||
|
||||
TRY_RUN(
|
||||
_RESULT
|
||||
_COMPILE_RESULT
|
||||
${_BINARY_TEST_DIR}
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc
|
||||
COMPILE_DEFINITIONS -DSM_ONLY
|
||||
RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY)
|
||||
LIST(FIND KOKKOS_CUDA_ARCH_FLAGS sm_${_CUDA_COMPUTE_CAPABILITY} FLAG_INDEX)
|
||||
IF(_COMPILE_RESULT AND _RESULT EQUAL 0 AND NOT FLAG_INDEX EQUAL -1)
|
||||
MESSAGE(STATUS "Detected CUDA Compute Capability ${_CUDA_COMPUTE_CAPABILITY}")
|
||||
LIST(GET KOKKOS_CUDA_ARCH_LIST ${FLAG_INDEX} ARCHITECTURE)
|
||||
KOKKOS_SET_OPTION(ARCH_${ARCHITECTURE} ON)
|
||||
CHECK_CUDA_ARCH(${ARCHITECTURE} sm_${_CUDA_COMPUTE_CAPABILITY})
|
||||
LIST(APPEND KOKKOS_ENABLED_ARCH_LIST ${ARCHITECTURE})
|
||||
ELSE()
|
||||
MESSAGE(SEND_ERROR "CUDA enabled but no NVIDIA GPU architecture currently enabled and auto-detection failed. "
|
||||
"Please give one -DKokkos_ARCH_{..}=ON' to enable an NVIDIA GPU architecture.\n"
|
||||
"You can yourself try to compile ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc and run the executable. "
|
||||
"If you are cross-compiling, you should try to do this on a compute node.")
|
||||
ENDIF()
|
||||
ENDIF()
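When configuring on a machine without a visible GPU (so the ``TRY_RUN`` probe above cannot execute), the architecture still has to be named explicitly; a hedged sketch for a Volta system, where VOLTA70 is only an example entry.
````cmake
# Pass on the command line or from a cache/preset file; pick the entry
# matching the target GPU instead of VOLTA70 if needed.
set(Kokkos_ARCH_VOLTA70 ON CACHE BOOL "Target NVIDIA Volta CC 7.0")
````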
#CMake verbose is kind of pointless
|
||||
|
@ -453,4 +536,3 @@ MESSAGE(STATUS "Architectures:")
|
|||
FOREACH(Arch ${KOKKOS_ENABLED_ARCH_LIST})
|
||||
MESSAGE(STATUS " ${Arch}")
|
||||
ENDFOREACH()
|
||||
|
||||

@@ -4,33 +4,54 @@ SET(KOKKOS_CXX_COMPILER ${CMAKE_CXX_COMPILER})
SET(KOKKOS_CXX_COMPILER_ID ${CMAKE_CXX_COMPILER_ID})
SET(KOKKOS_CXX_COMPILER_VERSION ${CMAKE_CXX_COMPILER_VERSION})

# Check if the compiler is nvcc (which really means nvcc_wrapper).
EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version
COMMAND grep nvcc
COMMAND wc -l
OUTPUT_VARIABLE INTERNAL_HAVE_COMPILER_NVCC
OUTPUT_STRIP_TRAILING_WHITESPACE)
IF(Kokkos_ENABLE_CUDA)
# Check if the compiler is nvcc (which really means nvcc_wrapper).
EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version
OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION
OUTPUT_STRIP_TRAILING_WHITESPACE)

STRING(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION} )

STRING(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "nvcc" INTERNAL_COMPILER_VERSION_CONTAINS_NVCC)

STRING(REGEX REPLACE "^ +" ""
INTERNAL_HAVE_COMPILER_NVCC "${INTERNAL_HAVE_COMPILER_NVCC}")

STRING(REGEX REPLACE "^ +" ""
INTERNAL_HAVE_COMPILER_NVCC "${INTERNAL_HAVE_COMPILER_NVCC}")
IF(${INTERNAL_COMPILER_VERSION_CONTAINS_NVCC} GREATER -1)
SET(INTERNAL_HAVE_COMPILER_NVCC true)
ELSE()
SET(INTERNAL_HAVE_COMPILER_NVCC false)
ENDIF()
ENDIF()

IF(INTERNAL_HAVE_COMPILER_NVCC)
# Save the host compiler id before overwriting it.
SET(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID})

# SET the compiler id to nvcc. We use the value used by CMake 3.8.
SET(KOKKOS_CXX_COMPILER_ID NVIDIA CACHE STRING INTERNAL FORCE)

# SET nvcc's compiler version.
EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version
COMMAND grep release
OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION
OUTPUT_STRIP_TRAILING_WHITESPACE)

STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+$"
TEMP_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION})
STRING(REGEX MATCH "V[0-9]+\\.[0-9]+\\.[0-9]+"
TEMP_CXX_COMPILER_VERSION ${INTERNAL_COMPILER_VERSION_ONE_LINE})
STRING(SUBSTRING ${TEMP_CXX_COMPILER_VERSION} 1 -1 TEMP_CXX_COMPILER_VERSION)
SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE)
MESSAGE(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}")
ENDIF()

IF(Kokkos_ENABLE_HIP)
# get HIP version
EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version
OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION
OUTPUT_STRIP_TRAILING_WHITESPACE)

STRING(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION} )
SET(KOKKOS_CXX_COMPILER_ID HIP CACHE STRING INTERNAL FORCE)

STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+"
TEMP_CXX_COMPILER_VERSION ${INTERNAL_COMPILER_VERSION_ONE_LINE})
SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE)
MESSAGE(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}")
ENDIF()

IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang)
# The Cray compiler reports as Clang to most versions of CMake
@@ -42,6 +63,16 @@ IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang)
IF (INTERNAL_HAVE_CRAY_COMPILER) #not actually Clang
SET(KOKKOS_CLANG_IS_CRAY TRUE)
ENDIF()
# The clang based Intel compiler reports as Clang to most versions of CMake
EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version
COMMAND grep icpx
COMMAND wc -l
OUTPUT_VARIABLE INTERNAL_HAVE_INTEL_COMPILER
OUTPUT_STRIP_TRAILING_WHITESPACE)
IF (INTERNAL_HAVE_INTEL_COMPILER) #not actually Clang
SET(KOKKOS_CLANG_IS_INTEL TRUE)
SET(KOKKOS_CXX_COMPILER_ID IntelClang CACHE STRING INTERNAL FORCE)
ENDIF()
ENDIF()

IF(KOKKOS_CXX_COMPILER_ID STREQUAL Cray OR KOKKOS_CLANG_IS_CRAY)
@@ -65,6 +96,7 @@ SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang 3.5.2 or higher"
SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n GCC 4.8.4 or higher")
SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Intel 15.0.2 or higher")
SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVCC 9.0.69 or higher")
SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n HIPCC 3.5.0 or higher")
SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n PGI 17.1 or higher\n")

IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang)
@@ -84,6 +116,10 @@ ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}")
ENDIF()
SET(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Kokkos turns off CXX extensions" FORCE)
ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL HIP)
IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 3.5.0)
MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}")
ENDIF()
ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL PGI)
IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 17.1)
MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}")

@@ -1,4 +1,4 @@
IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND KOKKOS_ENABLE_OPENMP AND NOT KOKKOS_CLANG_IS_CRAY)
IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND KOKKOS_ENABLE_OPENMP AND NOT KOKKOS_CLANG_IS_CRAY AND NOT "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC")
# The clang "version" doesn't actually tell you what runtimes and tools
# were built into Clang. We should therefore make sure that libomp
# was actually built into Clang. Otherwise the user will get nonsensical
@@ -49,11 +49,11 @@ ENDIF()

IF (KOKKOS_CXX_STANDARD STREQUAL 17)
IF (KOKKOS_CXX_COMPILER_ID STREQUAL GNU AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 7)
MESSAGE(FATAL_ERROR "You have requested c++17 support for GCC ${KOKKOS_CXX_COMPILER_VERSION}. Although CMake has allowed this and GCC accepts -std=c++1z/c++17, GCC <= 6 does not properly support *this capture. Please reduce the C++ standard to 14 or upgrade the compiler if you do need 17 support")
MESSAGE(FATAL_ERROR "You have requested c++17 support for GCC ${KOKKOS_CXX_COMPILER_VERSION}. Although CMake has allowed this and GCC accepts -std=c++1z/c++17, GCC <= 6 does not properly support *this capture. Please reduce the C++ standard to 14 or upgrade the compiler if you do need C++17 support.")
ENDIF()

IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
MESSAGE(FATAL_ERROR "You have requested c++17 support for NVCC. Please reduce the C++ standard to 14. No versions of NVCC currently support 17.")
IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 11)
MESSAGE(FATAL_ERROR "You have requested c++17 support for NVCC ${KOKKOS_CXX_COMPILER_VERSION}. NVCC only supports C++17 from version 11 on. Please reduce the C++ standard to 14 or upgrade the compiler if you need C++17 support.")
ENDIF()
ENDIF()

@@ -36,25 +36,51 @@ IF(KOKKOS_ENABLE_OPENMP)
IF(KOKKOS_CLANG_IS_CRAY)
SET(ClangOpenMPFlag -fopenmp)
ENDIF()
COMPILER_SPECIFIC_FLAGS(
Clang ${ClangOpenMPFlag}
AppleClang -Xpreprocessor -fopenmp
PGI -mp
NVIDIA -Xcompiler -fopenmp
Cray NO-VALUE-SPECIFIED
XL -qsmp=omp
DEFAULT -fopenmp
)
COMPILER_SPECIFIC_LIBS(
AppleClang -lomp
)
IF(KOKKOS_CLANG_IS_INTEL)
SET(ClangOpenMPFlag -fiopenmp)
ENDIF()
IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC")
#expression /openmp yields error, so add a specific Clang flag
COMPILER_SPECIFIC_OPTIONS(Clang /clang:-fopenmp)
#link omp library from LLVM lib dir
get_filename_component(LLVM_BIN_DIR ${CMAKE_CXX_COMPILER_AR} DIRECTORY)
COMPILER_SPECIFIC_LIBS(Clang "${LLVM_BIN_DIR}/../lib/libomp.lib")
ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
COMPILER_SPECIFIC_FLAGS(
COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
Clang -Xcompiler ${ClangOpenMPFlag}
PGI -Xcompiler -mp
Cray NO-VALUE-SPECIFIED
XL -Xcompiler -qsmp=omp
DEFAULT -Xcompiler -fopenmp
)
ELSE()
COMPILER_SPECIFIC_FLAGS(
Clang ${ClangOpenMPFlag}
AppleClang -Xpreprocessor -fopenmp
PGI -mp
Cray NO-VALUE-SPECIFIED
XL -qsmp=omp
DEFAULT -fopenmp
)
COMPILER_SPECIFIC_LIBS(
AppleClang -lomp
)
ENDIF()
ENDIF()

KOKKOS_DEVICE_OPTION(OPENMPTARGET OFF DEVICE "Whether to build the OpenMP target backend")
IF (KOKKOS_ENABLE_OPENMPTARGET)
SET(ClangOpenMPFlag -fopenmp=libomp)
IF(KOKKOS_CLANG_IS_CRAY)
SET(ClangOpenMPFlag -fopenmp)
ENDIF()

COMPILER_SPECIFIC_FLAGS(
Clang -fopenmp -fopenmp=libomp
Clang ${ClangOpenMPFlag} -Wno-openmp-mapping
IntelClang -fiopenmp -Wno-openmp-mapping
XL -qsmp=omp -qoffload -qnoeh
PGI -mp=gpu
DEFAULT -fopenmp
)
COMPILER_SPECIFIC_DEFS(
@@ -65,6 +91,9 @@ IF (KOKKOS_ENABLE_OPENMPTARGET)
# COMPILER_SPECIFIC_LIBS(
#  Clang -lopenmptarget
# )
IF(KOKKOS_CXX_STANDARD LESS 17)
MESSAGE(FATAL_ERROR "OpenMPTarget backend requires C++17 or newer")
ENDIF()
ENDIF()

IF(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_CUDA)
@@ -76,6 +105,9 @@ KOKKOS_DEVICE_OPTION(CUDA ${CUDA_DEFAULT} DEVICE "Whether to build CUDA backend"

IF (KOKKOS_ENABLE_CUDA)
GLOBAL_SET(KOKKOS_DONT_ALLOW_EXTENSIONS "CUDA enabled")
IF(WIN32)
GLOBAL_APPEND(KOKKOS_COMPILE_OPTIONS -x cu)
ENDIF()
ENDIF()

# We want this to default to OFF for cache reasons, but if no

@@ -45,10 +45,9 @@ UNSET(_UPPERCASE_CMAKE_BUILD_TYPE)
KOKKOS_ENABLE_OPTION(LARGE_MEM_TESTS OFF "Whether to perform extra large memory tests")
KOKKOS_ENABLE_OPTION(DEBUG_BOUNDS_CHECK OFF "Whether to use bounds checking - will increase runtime")
KOKKOS_ENABLE_OPTION(COMPILER_WARNINGS OFF "Whether to print all compiler warnings")
KOKKOS_ENABLE_OPTION(PROFILING ON "Whether to create bindings for profiling tools")
KOKKOS_ENABLE_OPTION(PROFILING_LOAD_PRINT OFF "Whether to print information about which profiling tools got loaded")
KOKKOS_ENABLE_OPTION(TUNING OFF "Whether to create bindings for tuning tools")
KOKKOS_ENABLE_OPTION(AGGRESSIVE_VECTORIZATION OFF "Whether to aggressively vectorize loops")
KOKKOS_ENABLE_OPTION(DEPRECATED_CODE OFF "Whether to enable deprecated code")

IF (KOKKOS_ENABLE_CUDA)
SET(KOKKOS_COMPILER_CUDA_VERSION "${KOKKOS_COMPILER_VERSION_MAJOR}${KOKKOS_COMPILER_VERSION_MINOR}")

@@ -47,6 +47,13 @@ FUNCTION(kokkos_option CAMEL_SUFFIX DEFAULT TYPE DOCSTRING)
SET(CAMEL_NAME Kokkos_${CAMEL_SUFFIX})
STRING(TOUPPER ${CAMEL_NAME} UC_NAME)

LIST(APPEND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX})
SET(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE)
LIST(APPEND KOKKOS_OPTION_VALUES "${DOCSTRING}")
SET(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE)
LIST(APPEND KOKKOS_OPTION_TYPES ${TYPE})
SET(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE)

# Make sure this appears in the cache with the appropriate DOCSTRING
SET(${CAMEL_NAME} ${DEFAULT} CACHE ${TYPE} ${DOCSTRING})

@@ -73,7 +80,21 @@ FUNCTION(kokkos_option CAMEL_SUFFIX DEFAULT TYPE DOCSTRING)
ELSE()
SET(${UC_NAME} ${DEFAULT} PARENT_SCOPE)
ENDIF()
ENDFUNCTION()

FUNCTION(kokkos_set_option CAMEL_SUFFIX VALUE)
LIST(FIND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX} OPTION_INDEX)
IF(OPTION_INDEX EQUAL -1)
MESSAGE(FATAL_ERROR "Couldn't set value for Kokkos_${CAMEL_SUFFIX}")
ENDIF()
SET(CAMEL_NAME Kokkos_${CAMEL_SUFFIX})
STRING(TOUPPER ${CAMEL_NAME} UC_NAME)

LIST(GET KOKKOS_OPTION_VALUES ${OPTION_INDEX} DOCSTRING)
LIST(GET KOKKOS_OPTION_TYPES ${OPTION_INDEX} TYPE)
SET(${CAMEL_NAME} ${VALUE} CACHE ${TYPE} ${DOCSTRING} FORCE)
MESSAGE(STATUS "Setting ${CAMEL_NAME}=${VALUE}")
SET(${UC_NAME} ${VALUE} PARENT_SCOPE)
ENDFUNCTION()
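A short sketch of how these two helpers are meant to be called (the option name here is illustrative, not taken from the patch): `kokkos_option` registers the camel-case cache variable and its uppercase shadow, and `kokkos_set_option` later forces a new value while reusing the recorded type and docstring.
````cmake
# Illustrative only: registers Kokkos_ENABLE_EXAMPLE (cache variable) and
# KOKKOS_ENABLE_EXAMPLE (uppercase shadow in the parent scope).
kokkos_option(ENABLE_EXAMPLE OFF BOOL "Whether to build the example")

# Later, e.g. after auto-detection, overwrite the recorded option.
kokkos_set_option(ENABLE_EXAMPLE ON)
````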

FUNCTION(kokkos_append_config_line LINE)
@@ -109,8 +130,8 @@ ENDMACRO()

MACRO(kokkos_export_imported_tpl NAME)
IF (NOT KOKKOS_HAS_TRILINOS)
GET_TARGET_PROPERTY(LIB_TYPE ${NAME} TYPE)
IF (${LIB_TYPE} STREQUAL "INTERFACE_LIBRARY")
GET_TARGET_PROPERTY(LIB_IMPORTED ${NAME} IMPORTED)
IF (NOT LIB_IMPORTED)
# This is not an imported target
# This an interface library that we created
INSTALL(
@@ -123,12 +144,18 @@ MACRO(kokkos_export_imported_tpl NAME)
ELSE()
#make sure this also gets "exported" in the config file
KOKKOS_APPEND_CONFIG_LINE("IF(NOT TARGET ${NAME})")
KOKKOS_APPEND_CONFIG_LINE("ADD_LIBRARY(${NAME} UNKNOWN IMPORTED)")
KOKKOS_APPEND_CONFIG_LINE("SET_TARGET_PROPERTIES(${NAME} PROPERTIES")

GET_TARGET_PROPERTY(TPL_LIBRARY ${NAME} IMPORTED_LOCATION)
IF(TPL_LIBRARY)
KOKKOS_APPEND_CONFIG_LINE("IMPORTED_LOCATION ${TPL_LIBRARY}")
GET_TARGET_PROPERTY(LIB_TYPE ${NAME} TYPE)
IF (${LIB_TYPE} STREQUAL "INTERFACE_LIBRARY")
KOKKOS_APPEND_CONFIG_LINE("ADD_LIBRARY(${NAME} INTERFACE IMPORTED)")
KOKKOS_APPEND_CONFIG_LINE("SET_TARGET_PROPERTIES(${NAME} PROPERTIES")
ELSE()
KOKKOS_APPEND_CONFIG_LINE("ADD_LIBRARY(${NAME} UNKNOWN IMPORTED)")
KOKKOS_APPEND_CONFIG_LINE("SET_TARGET_PROPERTIES(${NAME} PROPERTIES")
GET_TARGET_PROPERTY(TPL_LIBRARY ${NAME} IMPORTED_LOCATION)
IF(TPL_LIBRARY)
KOKKOS_APPEND_CONFIG_LINE("IMPORTED_LOCATION ${TPL_LIBRARY}")
ENDIF()
ENDIF()

GET_TARGET_PROPERTY(TPL_INCLUDES ${NAME} INTERFACE_INCLUDE_DIRECTORIES)
@@ -737,18 +764,22 @@ FUNCTION(kokkos_link_tpl TARGET)
ENDFUNCTION()

FUNCTION(COMPILER_SPECIFIC_OPTIONS_HELPER)
SET(COMPILERS NVIDIA PGI XL DEFAULT Cray Intel Clang AppleClang GNU)
SET(COMPILERS NVIDIA PGI XL DEFAULT Cray Intel Clang AppleClang IntelClang GNU HIP)
CMAKE_PARSE_ARGUMENTS(
PARSE
"LINK_OPTIONS;COMPILE_OPTIONS;COMPILE_DEFINITIONS;LINK_LIBRARIES"
""
"COMPILER_ID"
"${COMPILERS}"
${ARGN})
IF(PARSE_UNPARSED_ARGUMENTS)
MESSAGE(SEND_ERROR "'${PARSE_UNPARSED_ARGUMENTS}' argument(s) not recognized when providing compiler specific options")
ENDIF()

SET(COMPILER ${KOKKOS_CXX_COMPILER_ID})
IF(PARSE_COMPILER_ID)
SET(COMPILER ${${PARSE_COMPILER_ID}})
ELSE()
SET(COMPILER ${KOKKOS_CXX_COMPILER_ID})
ENDIF()

SET(COMPILER_SPECIFIC_FLAGS_TMP)
FOREACH(COMP ${COMPILERS})
@@ -792,6 +823,14 @@ FUNCTION(COMPILER_SPECIFIC_FLAGS)
COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} COMPILE_OPTIONS LINK_OPTIONS)
ENDFUNCTION(COMPILER_SPECIFIC_FLAGS)

FUNCTION(COMPILER_SPECIFIC_OPTIONS)
COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} COMPILE_OPTIONS)
ENDFUNCTION(COMPILER_SPECIFIC_OPTIONS)

FUNCTION(COMPILER_SPECIFIC_LINK_OPTIONS)
COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} LINK_OPTIONS)
ENDFUNCTION(COMPILER_SPECIFIC_LINK_OPTIONS)

FUNCTION(COMPILER_SPECIFIC_DEFS)
COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} COMPILE_DEFINITIONS)
ENDFUNCTION(COMPILER_SPECIFIC_DEFS)
@@ -799,3 +838,36 @@ ENDFUNCTION(COMPILER_SPECIFIC_DEFS)
FUNCTION(COMPILER_SPECIFIC_LIBS)
COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} LINK_LIBRARIES)
ENDFUNCTION(COMPILER_SPECIFIC_LIBS)
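As a usage sketch, the new `COMPILER_ID` keyword lets a caller match against a compiler variable other than `KOKKOS_CXX_COMPILER_ID`, which is how the OpenMP hunk earlier in this patch forwards host-compiler flags through nvcc:
````cmake
# Sketch based on the OpenMP hunk above: when nvcc_wrapper is the
# top-level compiler, match on the host compiler ID instead.
COMPILER_SPECIFIC_FLAGS(
  COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
  Clang   -Xcompiler -fopenmp
  DEFAULT -Xcompiler -fopenmp
)
````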

# Given a list of the form
#  key1;value1;key2;value2,...
# Create a list of all keys in a variable named ${KEY_LIST_NAME}
# and set the value for each key in a variable ${VAR_PREFIX}key1,...
# kokkos_key_value_map(ARCH ALL_ARCHES key1;value1;key2;value2)
# would produce a list variable ALL_ARCHES=key1;key2
# and individual variables ARCHkey1=value1 and ARCHkey2=value2
MACRO(KOKKOS_KEY_VALUE_MAP VAR_PREFIX KEY_LIST_NAME)
SET(PARSE_KEY ON)
SET(${KEY_LIST_NAME})
FOREACH(ENTRY ${ARGN})
IF(PARSE_KEY)
SET(CURRENT_KEY ${ENTRY})
SET(PARSE_KEY OFF)
LIST(APPEND ${KEY_LIST_NAME} ${CURRENT_KEY})
ELSE()
SET(${VAR_PREFIX}${CURRENT_KEY} ${ENTRY})
SET(PARSE_KEY ON)
ENDIF()
ENDFOREACH()
ENDMACRO()

FUNCTION(KOKKOS_CHECK_DEPRECATED_OPTIONS)
KOKKOS_KEY_VALUE_MAP(DEPRECATED_MSG_ DEPRECATED_LIST ${ARGN})
FOREACH(OPTION_SUFFIX ${DEPRECATED_LIST})
SET(OPTION_NAME Kokkos_${OPTION_SUFFIX})
SET(OPTION_MESSAGE ${DEPRECATED_MSG_${OPTION_SUFFIX}})
IF(DEFINED ${OPTION_NAME}) # This variable has been given by the user as on or off
MESSAGE(SEND_ERROR "Removed option ${OPTION_NAME} has been given with value ${${OPTION_NAME}}. ${OPT_MESSAGE}")
ENDIF()
ENDFOREACH()
ENDFUNCTION()
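A sketch of the intended call pattern for this key/value helper pair (the option name and message below are hypothetical, not taken from the patch): each removed option is paired with the message to report if a user still passes it.
````cmake
# Hypothetical example: error out if a removed option is still given.
KOKKOS_CHECK_DEPRECATED_OPTIONS(
  ENABLE_OLD_FEATURE "Use Kokkos_ENABLE_NEW_FEATURE instead"
)
````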

@@ -1,5 +1,5 @@
INCLUDE(CMakePackageConfigHelpers)
IF (NOT KOKKOS_HAS_TRILINOS)
IF (NOT KOKKOS_HAS_TRILINOS AND NOT Kokkos_INSTALL_TESTING)
INCLUDE(GNUInstallDirs)

#Set all the variables needed for KokkosConfig.cmake

@@ -28,19 +28,30 @@ FUNCTION(kokkos_set_cxx_standard_feature standard)
GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME})
ELSEIF(NOT KOKKOS_USE_CXX_EXTENSIONS AND ${STANDARD_NAME})
MESSAGE(STATUS "Using ${${STANDARD_NAME}} for C++${standard} standard as feature")
IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND (KOKKOS_CXX_HOST_COMPILER_ID STREQUAL GNU OR KOKKOS_CXX_HOST_COMPILER_ID STREQUAL Clang))
SET(SUPPORTED_NVCC_FLAGS "-std=c++11;-std=c++14;-std=c++17")
IF (NOT ${${STANDARD_NAME}} IN_LIST SUPPORTED_NVCC_FLAGS)
MESSAGE(FATAL_ERROR "CMake wants to use ${${STANDARD_NAME}} which is not supported by NVCC. Using a more recent host compiler or a more recent CMake version might help.")
ENDIF()
ENDIF()
GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME})
ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC")
ELSEIF (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC")
#MSVC doesn't need a command line flag, that doesn't mean it has no support
MESSAGE(STATUS "Using no flag for C++${standard} standard as feature")
GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME})
ELSEIF((KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA") AND WIN32)
MESSAGE(STATUS "Using no flag for C++${standard} standard as feature")
GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE "")
ELSE()
#nope, we can't do anything here
MESSAGE(WARNING "C++${standard} is not supported as a compiler feature. We will choose custom flags for now, but this behavior has been deprecated. Please open an issue at https://github.com/kokkos/kokkos/issues reporting that ${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION} failed for ${KOKKOS_CXX_STANDARD}, preferrably including your CMake command.")
GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE "")
ENDIF()

IF(NOT ${FEATURE_NAME} IN_LIST CMAKE_CXX_COMPILE_FEATURES)
MESSAGE(FATAL_ERROR "Compiler ${KOKKOS_CXX_COMPILER_ID} should support ${FEATURE_NAME}, but CMake reports feature not supported")
IF(NOT WIN32)
IF(NOT ${FEATURE_NAME} IN_LIST CMAKE_CXX_COMPILE_FEATURES)
MESSAGE(FATAL_ERROR "Compiler ${KOKKOS_CXX_COMPILER_ID} should support ${FEATURE_NAME}, but CMake reports feature not supported")
ENDIF()
ENDIF()
ENDFUNCTION()

@@ -123,7 +134,7 @@ IF (NOT KOKKOS_CXX_STANDARD_FEATURE)
ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Intel)
INCLUDE(${KOKKOS_SRC_PATH}/cmake/intel.cmake)
kokkos_set_intel_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD})
ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC")
ELSEIF((KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") OR ((KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA") AND WIN32))
INCLUDE(${KOKKOS_SRC_PATH}/cmake/msvc.cmake)
kokkos_set_msvc_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD})
ELSE()

@@ -13,10 +13,10 @@ KOKKOS_TPL_OPTION(LIBNUMA Off)
KOKKOS_TPL_OPTION(MEMKIND Off)
KOKKOS_TPL_OPTION(CUDA Off)
KOKKOS_TPL_OPTION(LIBRT Off)
KOKKOS_TPL_OPTION(LIBDL On)

IF(KOKKOS_ENABLE_PROFILING AND NOT KOKKOS_ENABLE_LIBDL)
MESSAGE(SEND_ERROR "Kokkos_ENABLE_PROFILING requires Kokkos_ENABLE_LIBDL=ON")
IF (WIN32)
KOKKOS_TPL_OPTION(LIBDL Off)
ELSE()
KOKKOS_TPL_OPTION(LIBDL On)
ENDIF()

IF(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_HPX)

@@ -21,10 +21,6 @@ IF (KOKKOS_HAS_TRILINOS)
SET(${PROJECT_NAME}_ENABLE_DEBUG OFF)
ENDIF()

IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_CXX11)
SET(${PROJECT_NAME}_ENABLE_CXX11 ON)
ENDIF()

IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_TESTS)
SET(${PROJECT_NAME}_ENABLE_TESTS OFF)
ENDIF()
@@ -134,7 +130,7 @@ FUNCTION(KOKKOS_ADD_EXECUTABLE ROOT_NAME)
VERIFY_EMPTY(KOKKOS_ADD_EXECUTABLE ${PARSE_UNPARSED_ARGUMENTS})
#All executables must link to all the kokkos targets
#This is just private linkage because exe is final
TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE kokkos)
TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE Kokkos::kokkos)
endif()
ENDFUNCTION()

@@ -174,16 +170,42 @@ FUNCTION(KOKKOS_SET_EXE_PROPERTY ROOT_NAME)
ENDFUNCTION()

MACRO(KOKKOS_SETUP_BUILD_ENVIRONMENT)
INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_compiler_id.cmake)
INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_devices.cmake)
INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_options.cmake)
INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_test_cxx_std.cmake)
INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_arch.cmake)
IF (NOT KOKKOS_HAS_TRILINOS)
SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${Kokkos_SOURCE_DIR}/cmake/Modules/")
INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_tpls.cmake)
ENDIF()
INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_corner_cases.cmake)
# This is needed for both regular build and install tests
INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_compiler_id.cmake)
#set an internal option, if not already set
SET(Kokkos_INSTALL_TESTING OFF CACHE INTERNAL "Whether to build tests and examples against installation")
IF (Kokkos_INSTALL_TESTING)
SET(KOKKOS_ENABLE_TESTS ON)
SET(KOKKOS_ENABLE_EXAMPLES ON)
# This looks a little weird, but what we are doing
# is to NOT build Kokkos but instead look for an
# installed Kokkos - then build examples and tests
# against that installed Kokkos
FIND_PACKAGE(Kokkos REQUIRED)
# Just grab the configuration from the installation
FOREACH(DEV ${Kokkos_DEVICES})
SET(KOKKOS_ENABLE_${DEV} ON)
ENDFOREACH()
FOREACH(OPT ${Kokkos_OPTIONS})
SET(KOKKOS_ENABLE_${OPT} ON)
ENDFOREACH()
FOREACH(TPL ${Kokkos_TPLS})
SET(KOKKOS_ENABLE_${TPL} ON)
ENDFOREACH()
FOREACH(ARCH ${Kokkos_ARCH})
SET(KOKKOS_ARCH_${ARCH} ON)
ENDFOREACH()
ELSE()
INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_devices.cmake)
INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_options.cmake)
INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_test_cxx_std.cmake)
INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_arch.cmake)
IF (NOT KOKKOS_HAS_TRILINOS)
SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${Kokkos_SOURCE_DIR}/cmake/Modules/")
INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_tpls.cmake)
ENDIF()
INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_corner_cases.cmake)
ENDIF()
ENDMACRO()
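A minimal sketch of how the install-testing branch is driven from a fresh build tree (the install path is a placeholder): the macro then skips the in-tree configuration, finds the installed Kokkos, and mirrors its exported `Kokkos_DEVICES`, `Kokkos_OPTIONS`, `Kokkos_TPLS`, and `Kokkos_ARCH` lists.
````cmake
# Sketch of driving the install-test path; values are placeholders.
set(Kokkos_INSTALL_TESTING ON CACHE INTERNAL "")
set(Kokkos_ROOT "/path/to/kokkos/install" CACHE PATH "")
# KOKKOS_SETUP_BUILD_ENVIRONMENT() then calls FIND_PACKAGE(Kokkos REQUIRED)
# instead of re-running the device/option/arch detection above.
````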

MACRO(KOKKOS_ADD_TEST_EXECUTABLE ROOT_NAME)
@@ -310,28 +332,40 @@ FUNCTION(KOKKOS_INTERNAL_ADD_LIBRARY LIBRARY_NAME)
LIST(REMOVE_DUPLICATES PARSE_SOURCES)
ENDIF()

IF(PARSE_STATIC)
SET(LINK_TYPE STATIC)
ENDIF()

IF(PARSE_SHARED)
SET(LINK_TYPE SHARED)
ENDIF()

# MSVC and other platforms want to have
# the headers included as source files
# for better dependency detection
ADD_LIBRARY(
${LIBRARY_NAME}
${LINK_TYPE}
${PARSE_HEADERS}
${PARSE_SOURCES}
)

KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(${LIBRARY_NAME})

INSTALL(
FILES ${PARSE_HEADERS}
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
COMPONENT ${PACKAGE_NAME}
)

#In case we are building in-tree, add an alias name
#that matches the install Kokkos:: name
ADD_LIBRARY(Kokkos::${LIBRARY_NAME} ALIAS ${LIBRARY_NAME})
ENDFUNCTION()

FUNCTION(KOKKOS_ADD_LIBRARY LIBRARY_NAME)
CMAKE_PARSE_ARGUMENTS(PARSE
"ADD_BUILD_OPTIONS"
""
""
${ARGN}
)
IF (KOKKOS_HAS_TRILINOS)
TRIBITS_ADD_LIBRARY(${LIBRARY_NAME} ${ARGN})
TRIBITS_ADD_LIBRARY(${LIBRARY_NAME} ${PARSE_UNPARSED_ARGUMENTS})
#Stolen from Tribits - it can add prefixes
SET(TRIBITS_LIBRARY_NAME_PREFIX "${${PROJECT_NAME}_LIBRARY_NAME_PREFIX}")
SET(TRIBITS_LIBRARY_NAME ${TRIBITS_LIBRARY_NAME_PREFIX}${LIBRARY_NAME})
@@ -346,8 +380,10 @@ FUNCTION(KOKKOS_ADD_LIBRARY LIBRARY_NAME)
#KOKKOS_SET_LIBRARY_PROPERTIES(${TRIBITS_LIBRARY_NAME} PLAIN_STYLE)
ELSE()
KOKKOS_INTERNAL_ADD_LIBRARY(
${LIBRARY_NAME} ${ARGN})
KOKKOS_SET_LIBRARY_PROPERTIES(${LIBRARY_NAME})
${LIBRARY_NAME} ${PARSE_UNPARSED_ARGUMENTS})
IF (PARSE_ADD_BUILD_OPTIONS)
KOKKOS_SET_LIBRARY_PROPERTIES(${LIBRARY_NAME})
ENDIF()
ENDIF()
ENDFUNCTION()
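A sketch of the updated calling convention: only the new `ADD_BUILD_OPTIONS` keyword is consumed here; everything else travels through `PARSE_UNPARSED_ARGUMENTS` to the underlying add-library helper (the target and variable names below are illustrative).
````cmake
# Illustrative call: attach Kokkos' global build options to this target.
KOKKOS_ADD_LIBRARY(
  examplelib
  ADD_BUILD_OPTIONS
  SOURCES ${EXAMPLE_SRCS}
  HEADERS ${EXAMPLE_HEADERS}
)
````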

@@ -364,17 +400,6 @@ ELSE()

ADD_LIBRARY(${NAME} INTERFACE)
KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(${NAME})

INSTALL(
FILES ${PARSE_HEADERS}
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
)

INSTALL(
FILES ${PARSE_HEADERS}
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
COMPONENT ${PACKAGE_NAME}
)
ENDIF()
ENDFUNCTION()

@@ -0,0 +1,4 @@
packages:
kokkos:
variants: +cuda +openmp +volta70 +cuda_lambda +wrapper ^cuda@10.1
compiler: [gcc@7.2.0]
@@ -2,7 +2,9 @@

KOKKOS_SUBPACKAGE(Containers)

ADD_SUBDIRECTORY(src)
IF (NOT Kokkos_INSTALL_TESTING)
ADD_SUBDIRECTORY(src)
ENDIF()

KOKKOS_ADD_TEST_DIRECTORIES(unit_tests)
KOKKOS_ADD_TEST_DIRECTORIES(performance_tests)

@@ -31,10 +31,10 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
TEST_TARGETS += test-cuda
endif

ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
OBJ_ROCM = TestROCm.o TestMain.o gtest-all.o
TARGETS += KokkosContainers_PerformanceTest_ROCm
TEST_TARGETS += test-rocm
ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
OBJ_HIP = TestHIP.o TestMain.o gtest-all.o
TARGETS += KokkosContainers_PerformanceTest_HIP
TEST_TARGETS += test-hip
endif

ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)

@@ -58,7 +58,7 @@ namespace Performance {
// View functor
template <typename DeviceType>
struct InitViewFunctor {
typedef Kokkos::View<double ***, DeviceType> inviewtype;
using inviewtype = Kokkos::View<double ***, DeviceType>;
inviewtype _inview;

InitViewFunctor(inviewtype &inview_) : _inview(inview_) {}
@@ -73,10 +73,10 @@ struct InitViewFunctor {
}

struct SumComputationTest {
typedef Kokkos::View<double ***, DeviceType> inviewtype;
using inviewtype = Kokkos::View<double ***, DeviceType>;
inviewtype _inview;

typedef Kokkos::View<double *, DeviceType> outviewtype;
using outviewtype = Kokkos::View<double *, DeviceType>;
outviewtype _outview;

KOKKOS_INLINE_FUNCTION
@@ -96,7 +96,7 @@ struct InitViewFunctor {

template <typename DeviceType>
struct InitStrideViewFunctor {
typedef Kokkos::View<double ***, Kokkos::LayoutStride, DeviceType> inviewtype;
using inviewtype = Kokkos::View<double ***, Kokkos::LayoutStride, DeviceType>;
inviewtype _inview;

InitStrideViewFunctor(inviewtype &inview_) : _inview(inview_) {}
@@ -113,7 +113,7 @@ struct InitStrideViewFunctor {

template <typename DeviceType>
struct InitViewRank7Functor {
typedef Kokkos::View<double *******, DeviceType> inviewtype;
using inviewtype = Kokkos::View<double *******, DeviceType>;
inviewtype _inview;

InitViewRank7Functor(inviewtype &inview_) : _inview(inview_) {}
@@ -131,7 +131,7 @@ struct InitViewRank7Functor {
// DynRankView functor
template <typename DeviceType>
struct InitDynRankViewFunctor {
typedef Kokkos::DynRankView<double, DeviceType> inviewtype;
using inviewtype = Kokkos::DynRankView<double, DeviceType>;
inviewtype _inview;

InitDynRankViewFunctor(inviewtype &inview_) : _inview(inview_) {}
@@ -146,10 +146,10 @@ struct InitDynRankViewFunctor {
}

struct SumComputationTest {
typedef Kokkos::DynRankView<double, DeviceType> inviewtype;
using inviewtype = Kokkos::DynRankView<double, DeviceType>;
inviewtype _inview;

typedef Kokkos::DynRankView<double, DeviceType> outviewtype;
using outviewtype = Kokkos::DynRankView<double, DeviceType>;
outviewtype _outview;

KOKKOS_INLINE_FUNCTION
@@ -169,8 +169,8 @@ struct InitDynRankViewFunctor {

template <typename DeviceType>
void test_dynrankview_op_perf(const int par_size) {
typedef DeviceType execution_space;
typedef typename execution_space::size_type size_type;
using execution_space = DeviceType;
using size_type = typename execution_space::size_type;
const size_type dim_2 = 90;
const size_type dim_3 = 30;

@@ -184,7 +184,7 @@ void test_dynrankview_op_perf(const int par_size) {
{
Kokkos::View<double ***, DeviceType> testview("testview", par_size, dim_2,
dim_3);
typedef InitViewFunctor<DeviceType> FunctorType;
using FunctorType = InitViewFunctor<DeviceType>;

timer.reset();
Kokkos::RangePolicy<DeviceType> policy(0, par_size);
@@ -204,7 +204,7 @@ void test_dynrankview_op_perf(const int par_size) {

Kokkos::View<double ***, Kokkos::LayoutStride, DeviceType> teststrideview =
Kokkos::subview(testview, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL);
typedef InitStrideViewFunctor<DeviceType> FunctorStrideType;
using FunctorStrideType = InitStrideViewFunctor<DeviceType>;

timer.reset();
Kokkos::parallel_for(policy, FunctorStrideType(teststrideview));
@@ -216,7 +216,7 @@ void test_dynrankview_op_perf(const int par_size) {
{
Kokkos::View<double *******, DeviceType> testview("testview", par_size,
dim_2, dim_3, 1, 1, 1, 1);
typedef InitViewRank7Functor<DeviceType> FunctorType;
using FunctorType = InitViewRank7Functor<DeviceType>;

timer.reset();
Kokkos::RangePolicy<DeviceType> policy(0, par_size);
@@ -229,7 +229,7 @@ void test_dynrankview_op_perf(const int par_size) {
{
Kokkos::DynRankView<double, DeviceType> testdrview("testdrview", par_size,
dim_2, dim_3);
typedef InitDynRankViewFunctor<DeviceType> FunctorType;
using FunctorType = InitDynRankViewFunctor<DeviceType>;

timer.reset();
Kokkos::RangePolicy<DeviceType> policy(0, par_size);

@@ -65,9 +65,9 @@ union helper {

template <typename Device>
struct generate_ids {
typedef Device execution_space;
typedef typename execution_space::size_type size_type;
typedef Kokkos::View<uint32_t*, execution_space> local_id_view;
using execution_space = Device;
using size_type = typename execution_space::size_type;
using local_id_view = Kokkos::View<uint32_t*, execution_space>;

local_id_view local_2_global;

@@ -96,13 +96,12 @@ struct generate_ids {

template <typename Device>
struct fill_map {
typedef Device execution_space;
typedef typename execution_space::size_type size_type;
typedef Kokkos::View<const uint32_t*, execution_space,
Kokkos::MemoryRandomAccess>
local_id_view;
typedef Kokkos::UnorderedMap<uint32_t, size_type, execution_space>
global_id_view;
using execution_space = Device;
using size_type = typename execution_space::size_type;
using local_id_view = Kokkos::View<const uint32_t*, execution_space,
Kokkos::MemoryRandomAccess>;
using global_id_view =
Kokkos::UnorderedMap<uint32_t, size_type, execution_space>;

global_id_view global_2_local;
local_id_view local_2_global;
@@ -120,18 +119,17 @@ struct fill_map {

template <typename Device>
struct find_test {
typedef Device execution_space;
typedef typename execution_space::size_type size_type;
typedef Kokkos::View<const uint32_t*, execution_space,
Kokkos::MemoryRandomAccess>
local_id_view;
typedef Kokkos::UnorderedMap<const uint32_t, const size_type, execution_space>
global_id_view;
using execution_space = Device;
using size_type = typename execution_space::size_type;
using local_id_view = Kokkos::View<const uint32_t*, execution_space,
Kokkos::MemoryRandomAccess>;
using global_id_view =
Kokkos::UnorderedMap<const uint32_t, const size_type, execution_space>;

global_id_view global_2_local;
local_id_view local_2_global;

typedef size_t value_type;
using value_type = size_t;

find_test(global_id_view gIds, local_id_view lIds, value_type& num_errors)
: global_2_local(gIds), local_2_global(lIds) {
@@ -156,12 +154,12 @@ struct find_test {

template <typename Device>
void test_global_to_local_ids(unsigned num_ids) {
typedef Device execution_space;
typedef typename execution_space::size_type size_type;
using execution_space = Device;
using size_type = typename execution_space::size_type;

typedef Kokkos::View<uint32_t*, execution_space> local_id_view;
typedef Kokkos::UnorderedMap<uint32_t, size_type, execution_space>
global_id_view;
using local_id_view = Kokkos::View<uint32_t*, execution_space>;
using global_id_view =
Kokkos::UnorderedMap<uint32_t, size_type, execution_space>;

// size
std::cout << num_ids << ", ";

@@ -50,14 +50,14 @@

namespace Perf {

template <typename ExecSpace, typename Layout, int duplication,
int contribution>
template <typename ExecSpace, typename Layout, typename Duplication,
typename Contribution>
void test_scatter_view(int m, int n) {
Kokkos::View<double * [3], Layout, ExecSpace> original_view("original_view",
n);
{
auto scatter_view = Kokkos::Experimental::create_scatter_view<
Kokkos::Experimental::ScatterSum, duplication, contribution>(
Kokkos::Experimental::ScatterSum, Duplication, Contribution>(
original_view);
Kokkos::Experimental::UniqueToken<
ExecSpace, Kokkos::Experimental::UniqueTokenScope::Global>

@@ -55,9 +55,9 @@ namespace Perf {

template <typename Device, bool Near>
struct UnorderedMapTest {
typedef Device execution_space;
typedef Kokkos::UnorderedMap<uint32_t, uint32_t, execution_space> map_type;
typedef typename map_type::histogram_type histogram_type;
using execution_space = Device;
using map_type = Kokkos::UnorderedMap<uint32_t, uint32_t, execution_space>;
using histogram_type = typename map_type::histogram_type;

struct value_type {
uint32_t failed_count;

@@ -9,6 +9,10 @@ KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})

SET(KOKKOS_CONTAINERS_SRCS)
APPEND_GLOB(KOKKOS_CONTAINERS_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp)
SET(KOKKOS_CONTAINER_HEADERS)
APPEND_GLOB(KOKKOS_CONTAINERS_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.hpp)
APPEND_GLOB(KOKKOS_CONTAINERS_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp)

INSTALL (
DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/"
@@ -19,6 +23,7 @@ INSTALL (
KOKKOS_ADD_LIBRARY(
kokkoscontainers
SOURCES ${KOKKOS_CONTAINERS_SRCS}
HEADERS ${KOKKOS_CONTAINER_HEADERS}
)

SET_TARGET_PROPERTIES(kokkoscontainers PROPERTIES VERSION ${Kokkos_VERSION})
@ -73,8 +73,8 @@ void deep_copy(ConstBitset<DstDevice>& dst, ConstBitset<SrcDevice> const& src);
|
|||
template <typename Device>
|
||||
class Bitset {
|
||||
public:
|
||||
typedef Device execution_space;
|
||||
typedef unsigned size_type;
|
||||
using execution_space = Device;
|
||||
using size_type = unsigned;
|
||||
|
||||
enum { BIT_SCAN_REVERSE = 1u };
|
||||
enum { MOVE_HINT_BACKWARD = 2u };
|
||||
|
@ -137,9 +137,9 @@ class Bitset {
|
|||
|
||||
if (m_last_block_mask) {
|
||||
// clear the unused bits in the last block
|
||||
typedef Kokkos::Impl::DeepCopy<typename execution_space::memory_space,
|
||||
Kokkos::HostSpace>
|
||||
raw_deep_copy;
|
||||
using raw_deep_copy =
|
||||
Kokkos::Impl::DeepCopy<typename execution_space::memory_space,
|
||||
Kokkos::HostSpace>;
|
||||
raw_deep_copy(m_blocks.data() + (m_blocks.extent(0) - 1u),
|
||||
&m_last_block_mask, sizeof(unsigned));
|
||||
}
|
||||
|
@ -234,6 +234,10 @@ class Bitset {
|
|||
return find_any_helper(block_idx, offset, block, scan_direction);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const {
|
||||
return m_blocks.is_allocated();
|
||||
}
|
||||
|
||||
private:
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
Kokkos::pair<bool, unsigned> find_any_helper(unsigned block_idx,
|
||||
|
@ -304,8 +308,8 @@ class Bitset {
|
|||
template <typename Device>
|
||||
class ConstBitset {
|
||||
public:
|
||||
typedef Device execution_space;
|
||||
typedef unsigned size_type;
|
||||
using execution_space = Device;
|
||||
using size_type = unsigned;
|
||||
|
||||
private:
|
||||
enum { block_size = static_cast<unsigned>(sizeof(unsigned) * CHAR_BIT) };
|
||||
|
@ -380,9 +384,9 @@ void deep_copy(Bitset<DstDevice>& dst, Bitset<SrcDevice> const& src) {
|
|||
"Error: Cannot deep_copy bitsets of different sizes!");
|
||||
}
|
||||
|
||||
typedef Kokkos::Impl::DeepCopy<typename DstDevice::memory_space,
|
||||
typename SrcDevice::memory_space>
|
||||
raw_deep_copy;
|
||||
using raw_deep_copy =
|
||||
Kokkos::Impl::DeepCopy<typename DstDevice::memory_space,
|
||||
typename SrcDevice::memory_space>;
|
||||
raw_deep_copy(dst.m_blocks.data(), src.m_blocks.data(),
|
||||
sizeof(unsigned) * src.m_blocks.extent(0));
|
||||
}
|
||||
|
@ -394,9 +398,9 @@ void deep_copy(Bitset<DstDevice>& dst, ConstBitset<SrcDevice> const& src) {
|
|||
"Error: Cannot deep_copy bitsets of different sizes!");
|
||||
}
|
||||
|
||||
typedef Kokkos::Impl::DeepCopy<typename DstDevice::memory_space,
|
||||
typename SrcDevice::memory_space>
|
||||
raw_deep_copy;
|
||||
using raw_deep_copy =
|
||||
Kokkos::Impl::DeepCopy<typename DstDevice::memory_space,
|
||||
typename SrcDevice::memory_space>;
|
||||
raw_deep_copy(dst.m_blocks.data(), src.m_blocks.data(),
|
||||
sizeof(unsigned) * src.m_blocks.extent(0));
|
||||
}
|
||||
|
@ -408,9 +412,9 @@ void deep_copy(ConstBitset<DstDevice>& dst, ConstBitset<SrcDevice> const& src) {
|
|||
"Error: Cannot deep_copy bitsets of different sizes!");
|
||||
}
|
||||
|
||||
typedef Kokkos::Impl::DeepCopy<typename DstDevice::memory_space,
|
||||
typename SrcDevice::memory_space>
|
||||
raw_deep_copy;
|
||||
using raw_deep_copy =
|
||||
Kokkos::Impl::DeepCopy<typename DstDevice::memory_space,
|
||||
typename SrcDevice::memory_space>;
|
||||
raw_deep_copy(dst.m_blocks.data(), src.m_blocks.data(),
|
||||
sizeof(unsigned) * src.m_blocks.extent(0));
|
||||
}
|
||||
|
|
|
@ -100,99 +100,91 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
|
|||
public:
|
||||
//! \name Typedefs for device types and various Kokkos::View specializations.
|
||||
//@{
|
||||
typedef ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> traits;
|
||||
using traits = ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type>;
|
||||
|
||||
//! The Kokkos Host Device type;
|
||||
typedef typename traits::host_mirror_space host_mirror_space;
|
||||
using host_mirror_space = typename traits::host_mirror_space;
|
||||
|
||||
//! The type of a Kokkos::View on the device.
|
||||
typedef View<typename traits::data_type, Arg1Type, Arg2Type, Arg3Type> t_dev;
|
||||
using t_dev = View<typename traits::data_type, Arg1Type, Arg2Type, Arg3Type>;
|
||||
|
||||
/// \typedef t_host
|
||||
/// \brief The type of a Kokkos::View host mirror of \c t_dev.
|
||||
typedef typename t_dev::HostMirror t_host;
|
||||
using t_host = typename t_dev::HostMirror;
|
||||
|
||||
//! The type of a const View on the device.
|
||||
//! The type of a Kokkos::View on the device.
|
||||
typedef View<typename traits::const_data_type, Arg1Type, Arg2Type, Arg3Type>
|
||||
t_dev_const;
|
||||
using t_dev_const =
|
||||
View<typename traits::const_data_type, Arg1Type, Arg2Type, Arg3Type>;
|
||||
|
||||
/// \typedef t_host_const
|
||||
/// \brief The type of a const View host mirror of \c t_dev_const.
|
||||
typedef typename t_dev_const::HostMirror t_host_const;
|
||||
using t_host_const = typename t_dev_const::HostMirror;
|
||||
|
||||
//! The type of a const, random-access View on the device.
|
||||
typedef View<typename traits::const_data_type, typename traits::array_layout,
|
||||
typename traits::device_type,
|
||||
Kokkos::MemoryTraits<Kokkos::RandomAccess> >
|
||||
t_dev_const_randomread;
|
||||
using t_dev_const_randomread =
|
||||
View<typename traits::const_data_type, typename traits::array_layout,
|
||||
typename traits::device_type,
|
||||
Kokkos::MemoryTraits<Kokkos::RandomAccess> >;
|
||||
|
||||
/// \typedef t_host_const_randomread
|
||||
/// \brief The type of a const, random-access View host mirror of
|
||||
/// \c t_dev_const_randomread.
|
||||
typedef typename t_dev_const_randomread::HostMirror t_host_const_randomread;
|
||||
using t_host_const_randomread = typename t_dev_const_randomread::HostMirror;
|
||||
|
||||
//! The type of an unmanaged View on the device.
|
||||
typedef View<typename traits::data_type, typename traits::array_layout,
|
||||
typename traits::device_type, MemoryUnmanaged>
|
||||
t_dev_um;
|
||||
using t_dev_um =
|
||||
View<typename traits::data_type, typename traits::array_layout,
|
||||
typename traits::device_type, MemoryUnmanaged>;
|
||||
|
||||
//! The type of an unmanaged View host mirror of \c t_dev_um.
|
||||
typedef View<typename t_host::data_type, typename t_host::array_layout,
|
||||
typename t_host::device_type, MemoryUnmanaged>
|
||||
t_host_um;
|
||||
using t_host_um =
|
||||
View<typename t_host::data_type, typename t_host::array_layout,
|
||||
typename t_host::device_type, MemoryUnmanaged>;
|
||||
|
||||
//! The type of a const unmanaged View on the device.
|
||||
typedef View<typename traits::const_data_type, typename traits::array_layout,
|
||||
typename traits::device_type, MemoryUnmanaged>
|
||||
t_dev_const_um;
|
||||
using t_dev_const_um =
|
||||
View<typename traits::const_data_type, typename traits::array_layout,
|
||||
typename traits::device_type, MemoryUnmanaged>;
|
||||
|
||||
//! The type of a const unmanaged View host mirror of \c t_dev_const_um.
|
||||
typedef View<typename t_host::const_data_type, typename t_host::array_layout,
|
||||
typename t_host::device_type, MemoryUnmanaged>
|
||||
t_host_const_um;
|
||||
using t_host_const_um =
|
||||
View<typename t_host::const_data_type, typename t_host::array_layout,
|
||||
typename t_host::device_type, MemoryUnmanaged>;
|
||||
|
||||
//! The type of a const, random-access View on the device.
|
||||
typedef View<typename t_host::const_data_type, typename t_host::array_layout,
|
||||
typename t_host::device_type,
|
||||
Kokkos::MemoryTraits<Kokkos::Unmanaged | Kokkos::RandomAccess> >
|
||||
t_dev_const_randomread_um;
|
||||
using t_dev_const_randomread_um =
|
||||
View<typename t_host::const_data_type, typename t_host::array_layout,
|
||||
typename t_host::device_type,
|
||||
Kokkos::MemoryTraits<Kokkos::Unmanaged | Kokkos::RandomAccess> >;
|
||||
|
||||
/// \typedef t_host_const_randomread
|
||||
/// \brief The type of a const, random-access View host mirror of
|
||||
/// \c t_dev_const_randomread.
|
||||
typedef
|
||||
typename t_dev_const_randomread::HostMirror t_host_const_randomread_um;
|
||||
|
||||
//@}
|
||||
//! \name The two View instances.
|
||||
//@{
|
||||
|
||||
t_dev d_view;
|
||||
t_host h_view;
|
||||
using t_host_const_randomread_um =
|
||||
typename t_dev_const_randomread::HostMirror;
|
||||
|
||||
//@}
|
||||
//! \name Counters to keep track of changes ("modified" flags)
|
||||
//@{
|
||||
|
||||
#ifndef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
protected:
|
||||
// modified_flags[0] -> host
|
||||
// modified_flags[1] -> device
|
||||
typedef View<unsigned int[2], LayoutLeft, Kokkos::HostSpace> t_modified_flags;
|
||||
using t_modified_flags = View<unsigned int[2], LayoutLeft, Kokkos::HostSpace>;
|
||||
t_modified_flags modified_flags;
|
||||
|
||||
public:
|
||||
#else
|
||||
typedef View<unsigned int[2], LayoutLeft, typename t_host::execution_space>
|
||||
t_modified_flags;
|
||||
typedef View<unsigned int, LayoutLeft, typename t_host::execution_space>
|
||||
t_modified_flag;
|
||||
t_modified_flags modified_flags;
|
||||
t_modified_flag modified_host, modified_device;
|
||||
#endif
|
||||
|
||||
//@}
|
||||
|
||||
// Moved this specifically after modified_flags to resolve an alignment issue
|
||||
// on MSVC/NVCC
|
||||
//! \name The two View instances.
|
||||
//@{
|
||||
t_dev d_view;
|
||||
t_host h_view;
|
||||
//@}
|
||||
|
||||
//! \name Constructors
|
||||
//@{
|
||||
|
||||
|
@ -201,14 +193,7 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
|
|||
/// Both device and host View objects are constructed using their
|
||||
/// default constructors. The "modified" flags are both initialized
|
||||
/// to "unmodified."
|
||||
#ifndef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
DualView() = default;
|
||||
#else
|
||||
DualView() : modified_flags(t_modified_flags("DualView::modified_flags")) {
|
||||
modified_host = t_modified_flag(modified_flags, 0);
|
||||
modified_device = t_modified_flag(modified_flags, 1);
|
||||
}
|
||||
#endif
|
||||
|
||||
/// \brief Constructor that allocates View objects on both host and device.
|
||||
///
|
||||
|
@ -228,15 +213,10 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
|
|||
const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
|
||||
const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
|
||||
const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG)
|
||||
: d_view(label, n0, n1, n2, n3, n4, n5, n6, n7),
|
||||
: modified_flags(t_modified_flags("DualView::modified_flags")),
|
||||
d_view(label, n0, n1, n2, n3, n4, n5, n6, n7),
|
||||
h_view(create_mirror_view(d_view)) // without UVM, host View mirrors
|
||||
,
|
||||
modified_flags(t_modified_flags("DualView::modified_flags")) {
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
modified_host = t_modified_flag(modified_flags, 0);
|
||||
modified_device = t_modified_flag(modified_flags, 1);
|
||||
#endif
|
||||
}
|
||||
{}
|
||||
|
||||
/// \brief Constructor that allocates View objects on both host and device.
|
||||
///
|
||||
|
@ -260,15 +240,10 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
|
|||
const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
|
||||
const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
|
||||
const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG)
|
||||
: d_view(arg_prop, n0, n1, n2, n3, n4, n5, n6, n7),
|
||||
: modified_flags(t_modified_flags("DualView::modified_flags")),
|
||||
d_view(arg_prop, n0, n1, n2, n3, n4, n5, n6, n7),
|
||||
h_view(create_mirror_view(d_view)) // without UVM, host View mirrors
|
||||
,
|
||||
modified_flags(t_modified_flags("DualView::modified_flags")) {
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
modified_host = t_modified_flag(modified_flags, 0);
|
||||
modified_device = t_modified_flag(modified_flags, 1);
|
||||
#endif
|
||||
}
|
||||
{}
|
||||
|
||||
explicit inline DualView(const ViewAllocateWithoutInitializing& arg_prop,
|
||||
const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
|
||||
|
@ -288,30 +263,16 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
|
|||
//! Copy constructor (shallow copy)
|
||||
template <class SS, class LS, class DS, class MS>
|
||||
DualView(const DualView<SS, LS, DS, MS>& src)
|
||||
: d_view(src.d_view),
|
||||
h_view(src.h_view),
|
||||
modified_flags(src.modified_flags)
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
,
|
||||
modified_host(src.modified_host),
|
||||
modified_device(src.modified_device)
|
||||
#endif
|
||||
{
|
||||
}
|
||||
: modified_flags(src.modified_flags),
|
||||
d_view(src.d_view),
|
||||
h_view(src.h_view) {}
|
||||
|
||||
//! Subview constructor
|
||||
template <class SD, class S1, class S2, class S3, class Arg0, class... Args>
|
||||
DualView(const DualView<SD, S1, S2, S3>& src, const Arg0& arg0, Args... args)
|
||||
: d_view(Kokkos::subview(src.d_view, arg0, args...)),
|
||||
h_view(Kokkos::subview(src.h_view, arg0, args...)),
|
||||
modified_flags(src.modified_flags)
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
,
|
||||
modified_host(src.modified_host),
|
||||
modified_device(src.modified_device)
|
||||
#endif
|
||||
{
|
||||
}
|
||||
: modified_flags(src.modified_flags),
|
||||
d_view(Kokkos::subview(src.d_view, arg0, args...)),
|
||||
h_view(Kokkos::subview(src.h_view, arg0, args...)) {}
|
||||
|
||||
/// \brief Create DualView from existing device and host View objects.
|
||||
///
|
||||
|
@ -324,9 +285,9 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
|
|||
/// \param d_view_ Device View
|
||||
/// \param h_view_ Host View (must have type t_host = t_dev::HostMirror)
|
||||
DualView(const t_dev& d_view_, const t_host& h_view_)
|
||||
: d_view(d_view_),
|
||||
h_view(h_view_),
|
||||
modified_flags(t_modified_flags("DualView::modified_flags")) {
|
||||
: modified_flags(t_modified_flags("DualView::modified_flags")),
|
||||
d_view(d_view_),
|
||||
h_view(h_view_) {
|
||||
if (int(d_view.rank) != int(h_view.rank) ||
|
||||
d_view.extent(0) != h_view.extent(0) ||
|
||||
d_view.extent(1) != h_view.extent(1) ||
|
||||
|
@ -348,10 +309,6 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
|
|||
Kokkos::Impl::throw_runtime_exception(
|
||||
"DualView constructed with incompatible views");
|
||||
}
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
modified_host = t_modified_flag(modified_flags, 0);
|
||||
modified_device = t_modified_flag(modified_flags, 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
//@}
|
||||
|
@ -367,20 +324,25 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
|
|||
///
|
||||
/// For example, suppose you create a DualView on Cuda, like this:
|
||||
/// \code
|
||||
/// typedef Kokkos::DualView<float, Kokkos::LayoutRight, Kokkos::Cuda>
|
||||
/// dual_view_type; dual_view_type DV ("my dual view", 100); \endcode If you
|
||||
/// want to get the CUDA device View, do this: \code typename
|
||||
/// dual_view_type::t_dev cudaView = DV.view<Kokkos::Cuda> (); \endcode and if
|
||||
/// you want to get the host mirror of that View, do this: \code typedef
|
||||
/// typename Kokkos::HostSpace::execution_space host_device_type; typename
|
||||
/// dual_view_type::t_host hostView = DV.view<host_device_type> (); \endcode
|
||||
/// using dual_view_type =
|
||||
/// Kokkos::DualView<float, Kokkos::LayoutRight, Kokkos::Cuda>;
|
||||
/// dual_view_type DV ("my dual view", 100);
|
||||
/// \endcode
|
||||
/// If you want to get the CUDA device View, do this:
|
||||
/// \code
|
||||
/// typename dual_view_type::t_dev cudaView = DV.view<Kokkos::Cuda> ();
|
||||
/// \endcode
|
||||
/// and if you want to get the host mirror of that View, do this:
|
||||
/// \code
|
||||
/// using host_device_type = typename Kokkos::HostSpace::execution_space;
|
||||
/// typename dual_view_type::t_host hostView = DV.view<host_device_type> ();
|
||||
/// \endcode
|
||||
template <class Device>
|
||||
KOKKOS_INLINE_FUNCTION const typename Impl::if_c<
|
||||
std::is_same<typename t_dev::memory_space,
|
||||
typename Device::memory_space>::value,
|
||||
t_dev, t_host>::type&
|
||||
view() const {
|
||||
#ifndef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
constexpr bool device_is_memspace =
|
||||
std::is_same<Device, typename Device::memory_space>::value;
|
||||
constexpr bool device_is_execspace =
|
||||
|
@ -415,7 +377,6 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
|
|||
(device_exec_is_t_dev_exec || device_exec_is_t_host_exec))),
|
||||
"Template parameter to .view() must exactly match one of the "
|
||||
"DualView's device types or one of the execution or memory spaces");
|
||||
#endif
|
||||
|
||||
return Impl::if_c<std::is_same<typename t_dev::memory_space,
|
||||
typename Device::memory_space>::value,
|
||||
|
@ -428,6 +389,10 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
|
|||
KOKKOS_INLINE_FUNCTION
|
||||
t_dev view_device() const { return d_view; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const {
|
||||
return (d_view.is_allocated() && h_view.is_allocated());
|
||||
}
|
||||
|
||||
template <class Device>
|
||||
static int get_device_side() {
|
||||
constexpr bool device_is_memspace =
|
||||
|
@ -453,7 +418,6 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
|
|||
std::is_same<typename Device::memory_space,
|
||||
typename t_host::device_type>::value;
|
||||
|
||||
#ifndef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
static_assert(
|
||||
device_is_t_dev_device || device_is_t_host_device ||
|
||||
(device_is_memspace &&
|
||||
|
@ -465,13 +429,8 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
|
|||
(device_exec_is_t_dev_exec || device_exec_is_t_host_exec))),
|
||||
"Template parameter to .sync() must exactly match one of the "
|
||||
"DualView's device types or one of the execution or memory spaces");
|
||||
#endif
|
||||
|
||||
#ifndef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
int dev = -1;
|
||||
#else
|
||||
int dev = 0;
|
||||
#endif
|
||||
if (device_is_t_dev_device)
|
||||
dev = 1;
|
||||
else if (device_is_t_host_device)
|
||||
|
@ -822,11 +781,6 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
|
|||
//! \name Methods for getting capacity, stride, or dimension(s).
|
||||
//@{
|
||||
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
//! The allocation size (same as Kokkos::View::capacity).
|
||||
size_t capacity() const { return d_view.span(); }
|
||||
#endif
|
||||
|
||||
//! The allocation size (same as Kokkos::View::span).
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return d_view.span(); }
|
||||
|
||||
|
@ -854,29 +808,6 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
|
|||
return static_cast<int>(d_view.extent(r));
|
||||
}
|
||||
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
/* Deprecate all 'dimension' functions in favor of
|
||||
* ISO/C++ vocabulary 'extent'.
|
||||
*/
|
||||
|
||||
/* \brief return size of dimension 0 */
|
||||
size_t dimension_0() const { return d_view.extent(0); }
|
||||
/* \brief return size of dimension 1 */
|
||||
size_t dimension_1() const { return d_view.extent(1); }
|
||||
/* \brief return size of dimension 2 */
|
||||
size_t dimension_2() const { return d_view.extent(2); }
|
||||
/* \brief return size of dimension 3 */
|
||||
size_t dimension_3() const { return d_view.extent(3); }
|
||||
/* \brief return size of dimension 4 */
|
||||
size_t dimension_4() const { return d_view.extent(4); }
|
||||
/* \brief return size of dimension 5 */
|
||||
size_t dimension_5() const { return d_view.extent(5); }
|
||||
/* \brief return size of dimension 6 */
|
||||
size_t dimension_6() const { return d_view.extent(6); }
|
||||
/* \brief return size of dimension 7 */
|
||||
size_t dimension_7() const { return d_view.extent(7); }
|
||||
#endif
|
||||
|
||||
//@}
|
||||
};
|
||||
|
||||
|
@@ -893,13 +824,12 @@ namespace Impl {

template <class D, class A1, class A2, class A3, class... Args>
struct DualViewSubview {
  typedef typename Kokkos::Impl::ViewMapping<
      void, Kokkos::ViewTraits<D, A1, A2, A3>, Args...>::traits_type dst_traits;
  using dst_traits = typename Kokkos::Impl::ViewMapping<
      void, Kokkos::ViewTraits<D, A1, A2, A3>, Args...>::traits_type;

  typedef Kokkos::DualView<
  using type = Kokkos::DualView<
      typename dst_traits::data_type, typename dst_traits::array_layout,
      typename dst_traits::device_type, typename dst_traits::memory_traits>
      type;
      typename dst_traits::device_type, typename dst_traits::memory_traits>;
};

} /* namespace Impl */
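`DualViewSubview` is the metafunction behind `Kokkos::subview` applied to a DualView: slicing returns another DualView. A small sketch; the extent values in the comment are assumptions about this particular example.

````cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_DualView.hpp>
#include <utility>

void slice_dualview() {
  Kokkos::DualView<double**> m("m", 8, 4);

  // A column and a block of rows; both results are DualViews whose exact
  // type is computed by Impl::DualViewSubview above.
  auto col0 = Kokkos::subview(m, Kokkos::ALL(), 0);
  auto rows = Kokkos::subview(m, std::make_pair(2, 6), Kokkos::ALL());

  // col0.view_host().extent(0) == 8, rows.view_host().extent(0) == 4
  (void)col0;
  (void)rows;
}
````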
@ -349,8 +349,8 @@ class ViewMapping<
|
|||
public:
|
||||
enum { is_assignable = is_assignable_value_type && is_assignable_layout };
|
||||
|
||||
typedef ViewMapping<DstTraits, typename DstTraits::specialize> DstType;
|
||||
typedef ViewMapping<SrcTraits, typename SrcTraits::specialize> SrcType;
|
||||
using DstType = ViewMapping<DstTraits, typename DstTraits::specialize>;
|
||||
using SrcType = ViewMapping<SrcTraits, typename SrcTraits::specialize>;
|
||||
|
||||
template <typename DT, typename... DP, typename ST, typename... SP>
|
||||
KOKKOS_INLINE_FUNCTION static void assign(
|
||||
|
@ -365,13 +365,13 @@ class ViewMapping<
|
|||
|
||||
// Removed dimension checks...
|
||||
|
||||
typedef typename DstType::offset_type dst_offset_type;
|
||||
using dst_offset_type = typename DstType::offset_type;
|
||||
dst.m_map.m_impl_offset = dst_offset_type(
|
||||
std::integral_constant<unsigned, 0>(),
|
||||
src.layout()); // Check this for integer input1 for padding, etc
|
||||
dst.m_map.m_impl_handle = Kokkos::Impl::ViewDataHandle<DstTraits>::assign(
|
||||
src.m_map.m_impl_handle, src.m_track);
|
||||
dst.m_track.assign(src.m_track, DstTraits::is_managed);
|
||||
src.m_map.m_impl_handle, src.m_track.m_tracker);
|
||||
dst.m_track.assign(src.m_track.m_tracker, DstTraits::is_managed);
|
||||
dst.m_rank = src.Rank;
|
||||
}
|
||||
};
|
||||
|
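Most of this commit is a mechanical `typedef` to `using` conversion like the one above. The two spellings declare the same alias; the `using` form reads left-to-right and, unlike `typedef`, also supports alias templates. A standalone illustration with made-up names:

````cpp
#include <type_traits>
#include <utility>
#include <vector>

// Equivalent alias declarations.
typedef std::vector<double> vec_typedef;
using vec_using = std::vector<double>;
static_assert(std::is_same<vec_typedef, vec_using>::value, "same type");

// Only the alias form can be templated, which is what makes the nested
// trait aliases in ViewMapping and friends easier to express.
template <class T>
using pair_with_int = std::pair<T, int>;

pair_with_int<double> p{3.0, 1};
````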
@ -415,16 +415,16 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
|
|||
friend class Kokkos::Impl::ViewMapping;
|
||||
|
||||
public:
|
||||
typedef ViewTraits<DataType, Properties...> drvtraits;
|
||||
using drvtraits = ViewTraits<DataType, Properties...>;
|
||||
|
||||
typedef View<DataType*******, Properties...> view_type;
|
||||
using view_type = View<DataType*******, Properties...>;
|
||||
|
||||
typedef ViewTraits<DataType*******, Properties...> traits;
|
||||
using traits = ViewTraits<DataType*******, Properties...>;
|
||||
|
||||
private:
|
||||
typedef Kokkos::Impl::ViewMapping<traits, typename traits::specialize>
|
||||
map_type;
|
||||
typedef Kokkos::Impl::SharedAllocationTracker track_type;
|
||||
using map_type =
|
||||
Kokkos::Impl::ViewMapping<traits, typename traits::specialize>;
|
||||
using track_type = Kokkos::Impl::SharedAllocationTracker;
|
||||
|
||||
track_type m_track;
|
||||
map_type m_map;
|
||||
|
@ -440,28 +440,24 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
|
|||
// 7 data_type of the traits
|
||||
|
||||
/** \brief Compatible view of array of scalar types */
|
||||
typedef DynRankView<
|
||||
using array_type = DynRankView<
|
||||
typename drvtraits::scalar_array_type, typename drvtraits::array_layout,
|
||||
typename drvtraits::device_type, typename drvtraits::memory_traits>
|
||||
array_type;
|
||||
typename drvtraits::device_type, typename drvtraits::memory_traits>;
|
||||
|
||||
/** \brief Compatible view of const data type */
|
||||
typedef DynRankView<
|
||||
using const_type = DynRankView<
|
||||
typename drvtraits::const_data_type, typename drvtraits::array_layout,
|
||||
typename drvtraits::device_type, typename drvtraits::memory_traits>
|
||||
const_type;
|
||||
typename drvtraits::device_type, typename drvtraits::memory_traits>;
|
||||
|
||||
/** \brief Compatible view of non-const data type */
|
||||
typedef DynRankView<
|
||||
using non_const_type = DynRankView<
|
||||
typename drvtraits::non_const_data_type, typename drvtraits::array_layout,
|
||||
typename drvtraits::device_type, typename drvtraits::memory_traits>
|
||||
non_const_type;
|
||||
typename drvtraits::device_type, typename drvtraits::memory_traits>;
|
||||
|
||||
  /** \brief Compatible HostMirror view */
  typedef DynRankView<typename drvtraits::non_const_data_type,
                      typename drvtraits::array_layout,
                      typename drvtraits::host_mirror_space>
      HostMirror;
  using HostMirror = DynRankView<typename drvtraits::non_const_data_type,
                                 typename drvtraits::array_layout,
                                 typename drvtraits::host_mirror_space>;

  //----------------------------------------
  // Domain rank and extents
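`HostMirror` above is the host-accessible counterpart handed back by `create_mirror_view`. For orientation, a minimal DynRankView sketch (assumes an initialized Kokkos runtime; the extents are arbitrary):

````cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_DynRankView.hpp>

void dynrank_basics() {
  // The rank is chosen at construction time: three extents -> rank 3.
  Kokkos::DynRankView<double> a("a", 3, 4, 5);

  auto   r = a.rank();                                 // 3
  size_t n = a.extent(0) * a.extent(1) * a.extent(2);  // 60

  // HostMirror, as declared above, is what create_mirror_view returns.
  auto h = Kokkos::create_mirror_view(a);
  Kokkos::deep_copy(h, a);

  (void)r;
  (void)n;
}
````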
@ -493,42 +489,6 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
|
|||
* ISO/C++ vocabulary 'extent'.
|
||||
*/
|
||||
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
template <typename iType>
|
||||
KOKKOS_INLINE_FUNCTION constexpr
|
||||
typename std::enable_if<std::is_integral<iType>::value, size_t>::type
|
||||
dimension(const iType& r) const {
|
||||
return extent(r);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_t dimension_0() const {
|
||||
return m_map.dimension_0();
|
||||
}
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const {
|
||||
return m_map.dimension_1();
|
||||
}
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const {
|
||||
return m_map.dimension_2();
|
||||
}
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const {
|
||||
return m_map.dimension_3();
|
||||
}
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_t dimension_4() const {
|
||||
return m_map.dimension_4();
|
||||
}
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_t dimension_5() const {
|
||||
return m_map.dimension_5();
|
||||
}
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_t dimension_6() const {
|
||||
return m_map.dimension_6();
|
||||
}
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_t dimension_7() const {
|
||||
return m_map.dimension_7();
|
||||
}
|
||||
#endif
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_t size() const {
|
||||
return m_map.extent(0) * m_map.extent(1) * m_map.extent(2) *
|
||||
m_map.extent(3) * m_map.extent(4) * m_map.extent(5) *
|
||||
|
@ -568,8 +528,8 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
|
|||
//----------------------------------------
|
||||
// Range span is the span which contains all members.
|
||||
|
||||
typedef typename map_type::reference_type reference_type;
|
||||
typedef typename map_type::pointer_type pointer_type;
|
||||
using reference_type = typename map_type::reference_type;
|
||||
using pointer_type = typename map_type::pointer_type;
|
||||
|
||||
enum {
|
||||
reference_type_is_lvalue_reference =
|
||||
|
@ -577,39 +537,18 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
|
|||
};
|
||||
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); }
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
// Deprecated, use 'span()' instead
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_t capacity() const {
|
||||
return m_map.span();
|
||||
}
|
||||
#endif
|
||||
KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const {
|
||||
return m_map.span_is_contiguous();
|
||||
}
|
||||
KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const {
|
||||
return m_map.data();
|
||||
}
|
||||
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
// Deprecated, use 'span_is_contigous()' instead
|
||||
KOKKOS_INLINE_FUNCTION constexpr bool is_contiguous() const {
|
||||
return m_map.span_is_contiguous();
|
||||
KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const {
|
||||
return (m_map.data() != nullptr);
|
||||
}
|
||||
// Deprecated, use 'data()' instead
|
||||
KOKKOS_INLINE_FUNCTION constexpr pointer_type ptr_on_device() const {
|
||||
return m_map.data();
|
||||
}
|
||||
#endif
|
||||
|
||||
//----------------------------------------
|
||||
// Allow specializations to query their specialized map
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const Kokkos::Impl::ViewMapping<traits, typename traits::specialize>&
|
||||
implementation_map() const {
|
||||
return m_map;
|
||||
}
|
||||
#endif
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const Kokkos::Impl::ViewMapping<traits, typename traits::specialize>&
|
||||
impl_map() const {
|
||||
|
@ -709,12 +648,11 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
|
|||
const size_t dim_scalar = m_map.dimension_scalar();
|
||||
const size_t bytes = this->span() / dim_scalar;
|
||||
|
||||
typedef Kokkos::View<
|
||||
using tmp_view_type = Kokkos::View<
|
||||
DataType*, typename traits::array_layout, typename traits::device_type,
|
||||
Kokkos::MemoryTraits<traits::memory_traits::is_unmanaged |
|
||||
traits::memory_traits::is_random_access |
|
||||
traits::memory_traits::is_atomic> >
|
||||
tmp_view_type;
|
||||
traits::memory_traits::is_atomic> >;
|
||||
tmp_view_type rankone_view(this->data(), bytes, dim_scalar);
|
||||
return rankone_view(i0);
|
||||
}
|
||||
|
@ -1102,10 +1040,9 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
|
|||
template <class RT, class... RP>
|
||||
KOKKOS_INLINE_FUNCTION DynRankView(const DynRankView<RT, RP...>& rhs)
|
||||
: m_track(rhs.m_track, traits::is_managed), m_map(), m_rank(rhs.m_rank) {
|
||||
typedef typename DynRankView<RT, RP...>::traits SrcTraits;
|
||||
typedef Kokkos::Impl::ViewMapping<traits, SrcTraits,
|
||||
typename traits::specialize>
|
||||
Mapping;
|
||||
using SrcTraits = typename DynRankView<RT, RP...>::traits;
|
||||
using Mapping = Kokkos::Impl::ViewMapping<traits, SrcTraits,
|
||||
typename traits::specialize>;
|
||||
static_assert(Mapping::is_assignable,
|
||||
"Incompatible DynRankView copy construction");
|
||||
Mapping::assign(m_map, rhs.m_map, rhs.m_track);
|
||||
|
@ -1114,10 +1051,9 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
|
|||
template <class RT, class... RP>
|
||||
KOKKOS_INLINE_FUNCTION DynRankView& operator=(
|
||||
const DynRankView<RT, RP...>& rhs) {
|
||||
typedef typename DynRankView<RT, RP...>::traits SrcTraits;
|
||||
typedef Kokkos::Impl::ViewMapping<traits, SrcTraits,
|
||||
typename traits::specialize>
|
||||
Mapping;
|
||||
using SrcTraits = typename DynRankView<RT, RP...>::traits;
|
||||
using Mapping = Kokkos::Impl::ViewMapping<traits, SrcTraits,
|
||||
typename traits::specialize>;
|
||||
static_assert(Mapping::is_assignable,
|
||||
"Incompatible DynRankView copy construction");
|
||||
Mapping::assign(m_map, rhs.m_map, rhs.m_track);
|
||||
|
@ -1130,10 +1066,10 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
|
|||
template <class RT, class... RP>
|
||||
KOKKOS_INLINE_FUNCTION DynRankView(const View<RT, RP...>& rhs)
|
||||
: m_track(), m_map(), m_rank(rhs.Rank) {
|
||||
typedef typename View<RT, RP...>::traits SrcTraits;
|
||||
typedef Kokkos::Impl::ViewMapping<traits, SrcTraits,
|
||||
Kokkos::Impl::ViewToDynRankViewTag>
|
||||
Mapping;
|
||||
using SrcTraits = typename View<RT, RP...>::traits;
|
||||
using Mapping =
|
||||
Kokkos::Impl::ViewMapping<traits, SrcTraits,
|
||||
Kokkos::Impl::ViewToDynRankViewTag>;
|
||||
static_assert(Mapping::is_assignable,
|
||||
"Incompatible View to DynRankView copy construction");
|
||||
Mapping::assign(*this, rhs);
|
||||
|
@ -1141,10 +1077,10 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
|
|||
|
||||
template <class RT, class... RP>
|
||||
KOKKOS_INLINE_FUNCTION DynRankView& operator=(const View<RT, RP...>& rhs) {
|
||||
typedef typename View<RT, RP...>::traits SrcTraits;
|
||||
typedef Kokkos::Impl::ViewMapping<traits, SrcTraits,
|
||||
Kokkos::Impl::ViewToDynRankViewTag>
|
||||
Mapping;
|
||||
using SrcTraits = typename View<RT, RP...>::traits;
|
||||
using Mapping =
|
||||
Kokkos::Impl::ViewMapping<traits, SrcTraits,
|
||||
Kokkos::Impl::ViewToDynRankViewTag>;
|
||||
static_assert(Mapping::is_assignable,
|
||||
"Incompatible View to DynRankView copy assignment");
|
||||
Mapping::assign(*this, rhs);
|
||||
|
@ -1177,11 +1113,11 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
|
|||
template computeRank<typename traits::array_layout, P...>(
|
||||
arg_prop, arg_layout)) {
|
||||
// Append layout and spaces if not input
|
||||
typedef Kokkos::Impl::ViewCtorProp<P...> alloc_prop_input;
|
||||
using alloc_prop_input = Kokkos::Impl::ViewCtorProp<P...>;
|
||||
|
||||
// use 'std::integral_constant<unsigned,I>' for non-types
|
||||
// to avoid duplicate class error.
|
||||
typedef Kokkos::Impl::ViewCtorProp<
|
||||
using alloc_prop = Kokkos::Impl::ViewCtorProp<
|
||||
P...,
|
||||
typename std::conditional<alloc_prop_input::has_label,
|
||||
std::integral_constant<unsigned, 0>,
|
||||
|
@ -1193,19 +1129,13 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
|
|||
typename std::conditional<
|
||||
alloc_prop_input::has_execution_space,
|
||||
std::integral_constant<unsigned, 2>,
|
||||
typename traits::device_type::execution_space>::type>
|
||||
alloc_prop;
|
||||
typename traits::device_type::execution_space>::type>;
|
||||
|
||||
static_assert(traits::is_managed,
|
||||
"View allocation constructor requires managed memory");
|
||||
|
||||
if (alloc_prop::initialize &&
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
!alloc_prop::execution_space::is_initialized()
|
||||
#else
|
||||
!alloc_prop::execution_space::impl_is_initialized()
|
||||
#endif
|
||||
) {
|
||||
!alloc_prop::execution_space::impl_is_initialized()) {
|
||||
// If initializing view data then
|
||||
// the execution space must be initialized.
|
||||
Kokkos::Impl::throw_runtime_exception(
|
||||
|
@ -1499,36 +1429,34 @@ struct ViewMapping<
|
|||
unsigned(R4) + unsigned(R5) + unsigned(R6)
|
||||
};
|
||||
|
||||
typedef Kokkos::LayoutStride array_layout;
|
||||
using array_layout = Kokkos::LayoutStride;
|
||||
|
||||
typedef typename SrcTraits::value_type value_type;
|
||||
using value_type = typename SrcTraits::value_type;
|
||||
|
||||
typedef value_type******* data_type;
|
||||
using data_type = value_type*******;
|
||||
|
||||
public:
|
||||
typedef Kokkos::ViewTraits<data_type, array_layout,
|
||||
typename SrcTraits::device_type,
|
||||
typename SrcTraits::memory_traits>
|
||||
traits_type;
|
||||
using traits_type = Kokkos::ViewTraits<data_type, array_layout,
|
||||
typename SrcTraits::device_type,
|
||||
typename SrcTraits::memory_traits>;
|
||||
|
||||
typedef Kokkos::View<data_type, array_layout, typename SrcTraits::device_type,
|
||||
typename SrcTraits::memory_traits>
|
||||
type;
|
||||
using type =
|
||||
Kokkos::View<data_type, array_layout, typename SrcTraits::device_type,
|
||||
typename SrcTraits::memory_traits>;
|
||||
|
||||
template <class MemoryTraits>
|
||||
struct apply {
|
||||
static_assert(Kokkos::Impl::is_memory_traits<MemoryTraits>::value, "");
|
||||
|
||||
typedef Kokkos::ViewTraits<data_type, array_layout,
|
||||
typename SrcTraits::device_type, MemoryTraits>
|
||||
traits_type;
|
||||
using traits_type =
|
||||
Kokkos::ViewTraits<data_type, array_layout,
|
||||
typename SrcTraits::device_type, MemoryTraits>;
|
||||
|
||||
typedef Kokkos::View<data_type, array_layout,
|
||||
typename SrcTraits::device_type, MemoryTraits>
|
||||
type;
|
||||
using type = Kokkos::View<data_type, array_layout,
|
||||
typename SrcTraits::device_type, MemoryTraits>;
|
||||
};
|
||||
|
||||
typedef typename SrcTraits::dimension dimension;
|
||||
using dimension = typename SrcTraits::dimension;
|
||||
|
||||
template <class Arg0 = int, class Arg1 = int, class Arg2 = int,
|
||||
class Arg3 = int, class Arg4 = int, class Arg5 = int,
|
||||
|
@ -1544,18 +1472,17 @@ struct ViewMapping<
|
|||
}
|
||||
};
|
||||
|
||||
typedef Kokkos::DynRankView<value_type, array_layout,
|
||||
typename SrcTraits::device_type,
|
||||
typename SrcTraits::memory_traits>
|
||||
ret_type;
|
||||
using ret_type = Kokkos::DynRankView<value_type, array_layout,
|
||||
typename SrcTraits::device_type,
|
||||
typename SrcTraits::memory_traits>;
|
||||
|
||||
template <typename T, class... P>
|
||||
KOKKOS_INLINE_FUNCTION static ret_type subview(
|
||||
const unsigned src_rank, Kokkos::DynRankView<T, P...> const& src,
|
||||
Args... args) {
|
||||
typedef ViewMapping<traits_type, typename traits_type::specialize> DstType;
|
||||
using DstType = ViewMapping<traits_type, typename traits_type::specialize>;
|
||||
|
||||
typedef typename std::conditional<
|
||||
using DstDimType = typename std::conditional<
|
||||
(rank == 0), ViewDimension<>,
|
||||
typename std::conditional<
|
||||
(rank == 1), ViewDimension<0>,
|
||||
|
@ -1570,10 +1497,10 @@ struct ViewMapping<
|
|||
typename std::conditional<
|
||||
(rank == 6), ViewDimension<0, 0, 0, 0, 0, 0>,
|
||||
ViewDimension<0, 0, 0, 0, 0, 0, 0> >::type>::
|
||||
type>::type>::type>::type>::type>::type DstDimType;
|
||||
type>::type>::type>::type>::type>::type;
|
||||
|
||||
typedef ViewOffset<DstDimType, Kokkos::LayoutStride> dst_offset_type;
|
||||
typedef typename DstType::handle_type dst_handle_type;
|
||||
using dst_offset_type = ViewOffset<DstDimType, Kokkos::LayoutStride>;
|
||||
using dst_handle_type = typename DstType::handle_type;
|
||||
|
||||
ret_type dst;
|
||||
|
||||
|
@@ -1636,9 +1563,9 @@ subdynrankview(const Kokkos::DynRankView<D, P...>& src, Args... args) {
        "DynRankView");
  }

  typedef Kokkos::Impl::ViewMapping<Kokkos::Impl::DynRankSubviewTag,
                                    Kokkos::ViewTraits<D*******, P...>, Args...>
      metafcn;
  using metafcn =
      Kokkos::Impl::ViewMapping<Kokkos::Impl::DynRankSubviewTag,
                                Kokkos::ViewTraits<D*******, P...>, Args...>;

  return metafcn::subview(src.rank(), src, args...);
}
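`subdynrankview` slices with one argument per rank of the source; range arguments keep a dimension, integer arguments drop it, and the result carries LayoutStride. A hedged sketch:

````cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_DynRankView.hpp>
#include <utility>

void slice_dynrank() {
  Kokkos::DynRankView<double> a("a", 10, 10);  // rank 2 at runtime

  // One argument per rank: an index drops a dimension, a range keeps it.
  auto row3 = Kokkos::subdynrankview(a, 3, Kokkos::ALL());
  auto tile = Kokkos::subdynrankview(a, std::make_pair(0, 5),
                                        std::make_pair(0, 5));

  // row3.rank() == 1, tile.rank() == 2 (both LayoutStride)
  (void)row3;
  (void)tile;
}
````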
@ -1659,8 +1586,8 @@ template <class LT, class... LP, class RT, class... RP>
|
|||
KOKKOS_INLINE_FUNCTION bool operator==(const DynRankView<LT, LP...>& lhs,
|
||||
const DynRankView<RT, RP...>& rhs) {
|
||||
// Same data, layout, dimensions
|
||||
typedef ViewTraits<LT, LP...> lhs_traits;
|
||||
typedef ViewTraits<RT, RP...> rhs_traits;
|
||||
using lhs_traits = ViewTraits<LT, LP...>;
|
||||
using rhs_traits = ViewTraits<RT, RP...>;
|
||||
|
||||
return std::is_same<typename lhs_traits::const_value_type,
|
||||
typename rhs_traits::const_value_type>::value &&
|
||||
|
@ -1691,7 +1618,7 @@ namespace Impl {
|
|||
|
||||
template <class OutputView, typename Enable = void>
|
||||
struct DynRankViewFill {
|
||||
typedef typename OutputView::traits::const_value_type const_value_type;
|
||||
using const_value_type = typename OutputView::traits::const_value_type;
|
||||
|
||||
const OutputView output;
|
||||
const_value_type input;
|
||||
|
@ -1722,15 +1649,11 @@ struct DynRankViewFill {
|
|||
|
||||
DynRankViewFill(const OutputView& arg_out, const_value_type& arg_in)
|
||||
: output(arg_out), input(arg_in) {
|
||||
typedef typename OutputView::execution_space execution_space;
|
||||
typedef Kokkos::RangePolicy<execution_space> Policy;
|
||||
using execution_space = typename OutputView::execution_space;
|
||||
using Policy = Kokkos::RangePolicy<execution_space>;
|
||||
|
||||
const Kokkos::Impl::ParallelFor<DynRankViewFill, Policy> closure(
|
||||
*this, Policy(0, output.extent(0)));
|
||||
|
||||
closure.execute();
|
||||
|
||||
execution_space().fence();
|
||||
Kokkos::parallel_for("Kokkos::DynRankViewFill", Policy(0, output.extent(0)),
|
||||
*this);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -1770,11 +1693,9 @@ struct DynRankViewRemap {
|
|||
n5(std::min((size_t)arg_out.extent(5), (size_t)arg_in.extent(5))),
|
||||
n6(std::min((size_t)arg_out.extent(6), (size_t)arg_in.extent(6))),
|
||||
n7(std::min((size_t)arg_out.extent(7), (size_t)arg_in.extent(7))) {
|
||||
typedef Kokkos::RangePolicy<ExecSpace> Policy;
|
||||
const Kokkos::Impl::ParallelFor<DynRankViewRemap, Policy> closure(
|
||||
*this, Policy(0, n0));
|
||||
closure.execute();
|
||||
// ExecSpace().fence(); // ??
|
||||
using Policy = Kokkos::RangePolicy<ExecSpace>;
|
||||
|
||||
Kokkos::parallel_for("Kokkos::DynRankViewRemap", Policy(0, n0), *this);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
|
@@ -1814,7 +1735,9 @@ inline void deep_copy(
                   typename ViewTraits<DT, DP...>::value_type>::value,
      "deep_copy requires non-const type");

  Kokkos::fence();
  Kokkos::Impl::DynRankViewFill<DynRankView<DT, DP...> >(dst, value);
  Kokkos::fence();
}

/** \brief Deep copy into a value in Host memory from a view. */
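The scalar overload above is the rank-erased fill path: `DynRankViewFill` now launches a labeled `parallel_for` (see the hunk further down), and the surrounding fences order it against other work in the default execution space. Typical use is simply:

````cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_DynRankView.hpp>

void fill_with_value() {
  Kokkos::DynRankView<double> a("a", 4, 4, 4);

  // Assigns 3.14 to every entry, whatever the runtime rank of 'a' is.
  Kokkos::deep_copy(a, 3.14);
}
````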
@ -1828,10 +1751,12 @@ inline void deep_copy(
|
|||
Kokkos::abort("");
|
||||
}
|
||||
|
||||
typedef ViewTraits<ST, SP...> src_traits;
|
||||
typedef typename src_traits::memory_space src_memory_space;
|
||||
using src_traits = ViewTraits<ST, SP...>;
|
||||
using src_memory_space = typename src_traits::memory_space;
|
||||
Kokkos::fence();
|
||||
Kokkos::Impl::DeepCopy<HostSpace, src_memory_space>(&dst, src.data(),
|
||||
sizeof(ST));
|
||||
Kokkos::fence();
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
@ -1851,13 +1776,13 @@ inline void deep_copy(
|
|||
typename DstType::traits::non_const_value_type>::value,
|
||||
"deep_copy requires non-const destination type");
|
||||
|
||||
typedef DstType dst_type;
|
||||
typedef SrcType src_type;
|
||||
using dst_type = DstType;
|
||||
using src_type = SrcType;
|
||||
|
||||
typedef typename dst_type::execution_space dst_execution_space;
|
||||
typedef typename src_type::execution_space src_execution_space;
|
||||
typedef typename dst_type::memory_space dst_memory_space;
|
||||
typedef typename src_type::memory_space src_memory_space;
|
||||
using dst_execution_space = typename dst_type::execution_space;
|
||||
using src_execution_space = typename src_type::execution_space;
|
||||
using dst_memory_space = typename dst_type::memory_space;
|
||||
using src_memory_space = typename src_type::memory_space;
|
||||
|
||||
enum {
|
||||
DstExecCanAccessSrc =
|
||||
|
@ -1878,9 +1803,11 @@ inline void deep_copy(
|
|||
// If same type, equal layout, equal dimensions, equal span, and contiguous
|
||||
// memory then can byte-wise copy
|
||||
if (rank(src) == 0 && rank(dst) == 0) {
|
||||
typedef typename dst_type::value_type value_type;
|
||||
using value_type = typename dst_type::value_type;
|
||||
Kokkos::fence();
|
||||
Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space>(
|
||||
dst.data(), src.data(), sizeof(value_type));
|
||||
Kokkos::fence();
|
||||
} else if (std::is_same<
|
||||
typename DstType::traits::value_type,
|
||||
typename SrcType::traits::non_const_value_type>::value &&
|
||||
|
@ -1902,9 +1829,10 @@ inline void deep_copy(
|
|||
dst.extent(6) == src.extent(6) &&
|
||||
dst.extent(7) == src.extent(7)) {
|
||||
const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
|
||||
|
||||
Kokkos::fence();
|
||||
Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space>(
|
||||
dst.data(), src.data(), nbytes);
|
||||
Kokkos::fence();
|
||||
} else if (std::is_same<
|
||||
typename DstType::traits::value_type,
|
||||
typename SrcType::traits::non_const_value_type>::value &&
|
||||
|
@ -1931,22 +1859,29 @@ inline void deep_copy(
|
|||
dst.stride_6() == src.stride_6() &&
|
||||
dst.stride_7() == src.stride_7()) {
|
||||
const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
|
||||
|
||||
Kokkos::fence();
|
||||
Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space>(
|
||||
dst.data(), src.data(), nbytes);
|
||||
Kokkos::fence();
|
||||
} else if (DstExecCanAccessSrc) {
|
||||
// Copying data between views in accessible memory spaces and either
|
||||
// non-contiguous or incompatible shape.
|
||||
Kokkos::fence();
|
||||
Kokkos::Impl::DynRankViewRemap<dst_type, src_type>(dst, src);
|
||||
Kokkos::fence();
|
||||
} else if (SrcExecCanAccessDst) {
|
||||
// Copying data between views in accessible memory spaces and either
|
||||
// non-contiguous or incompatible shape.
|
||||
Kokkos::fence();
|
||||
Kokkos::Impl::DynRankViewRemap<dst_type, src_type, src_execution_space>(
|
||||
dst, src);
|
||||
Kokkos::fence();
|
||||
} else {
|
||||
Kokkos::Impl::throw_runtime_exception(
|
||||
"deep_copy given views that would require a temporary allocation");
|
||||
}
|
||||
} else {
|
||||
Kokkos::fence();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1962,45 +1897,45 @@ namespace Impl {
|
|||
template <class Space, class T, class... P>
|
||||
struct MirrorDRViewType {
|
||||
// The incoming view_type
|
||||
typedef typename Kokkos::DynRankView<T, P...> src_view_type;
|
||||
using src_view_type = typename Kokkos::DynRankView<T, P...>;
|
||||
// The memory space for the mirror view
|
||||
typedef typename Space::memory_space memory_space;
|
||||
using memory_space = typename Space::memory_space;
|
||||
// Check whether it is the same memory space
|
||||
enum {
|
||||
is_same_memspace =
|
||||
std::is_same<memory_space, typename src_view_type::memory_space>::value
|
||||
};
|
||||
// The array_layout
|
||||
typedef typename src_view_type::array_layout array_layout;
|
||||
using array_layout = typename src_view_type::array_layout;
|
||||
// The data type (we probably want it non-const since otherwise we can't even
|
||||
// deep_copy to it.
|
||||
typedef typename src_view_type::non_const_data_type data_type;
|
||||
using data_type = typename src_view_type::non_const_data_type;
|
||||
// The destination view type if it is not the same memory space
|
||||
typedef Kokkos::DynRankView<data_type, array_layout, Space> dest_view_type;
|
||||
using dest_view_type = Kokkos::DynRankView<data_type, array_layout, Space>;
|
||||
// If it is the same memory_space return the existsing view_type
|
||||
// This will also keep the unmanaged trait if necessary
|
||||
typedef typename std::conditional<is_same_memspace, src_view_type,
|
||||
dest_view_type>::type view_type;
|
||||
using view_type = typename std::conditional<is_same_memspace, src_view_type,
|
||||
dest_view_type>::type;
|
||||
};
|
||||
|
||||
template <class Space, class T, class... P>
|
||||
struct MirrorDRVType {
|
||||
// The incoming view_type
|
||||
typedef typename Kokkos::DynRankView<T, P...> src_view_type;
|
||||
using src_view_type = typename Kokkos::DynRankView<T, P...>;
|
||||
// The memory space for the mirror view
|
||||
typedef typename Space::memory_space memory_space;
|
||||
using memory_space = typename Space::memory_space;
|
||||
// Check whether it is the same memory space
|
||||
enum {
|
||||
is_same_memspace =
|
||||
std::is_same<memory_space, typename src_view_type::memory_space>::value
|
||||
};
|
||||
// The array_layout
|
||||
typedef typename src_view_type::array_layout array_layout;
|
||||
using array_layout = typename src_view_type::array_layout;
|
||||
// The data type (we probably want it non-const since otherwise we can't even
|
||||
// deep_copy to it.
|
||||
typedef typename src_view_type::non_const_data_type data_type;
|
||||
using data_type = typename src_view_type::non_const_data_type;
|
||||
// The destination view type if it is not the same memory space
|
||||
typedef Kokkos::DynRankView<data_type, array_layout, Space> view_type;
|
||||
using view_type = Kokkos::DynRankView<data_type, array_layout, Space>;
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
|
@ -2012,8 +1947,8 @@ inline typename DynRankView<T, P...>::HostMirror create_mirror(
|
|||
std::is_same<typename ViewTraits<T, P...>::specialize, void>::value &&
|
||||
!std::is_same<typename Kokkos::ViewTraits<T, P...>::array_layout,
|
||||
Kokkos::LayoutStride>::value>::type* = nullptr) {
|
||||
typedef DynRankView<T, P...> src_type;
|
||||
typedef typename src_type::HostMirror dst_type;
|
||||
using src_type = DynRankView<T, P...>;
|
||||
using dst_type = typename src_type::HostMirror;
|
||||
|
||||
return dst_type(std::string(src.label()).append("_mirror"),
|
||||
Impl::reconstructLayout(src.layout(), src.rank()));
|
||||
|
@ -2026,8 +1961,8 @@ inline typename DynRankView<T, P...>::HostMirror create_mirror(
|
|||
std::is_same<typename ViewTraits<T, P...>::specialize, void>::value &&
|
||||
std::is_same<typename Kokkos::ViewTraits<T, P...>::array_layout,
|
||||
Kokkos::LayoutStride>::value>::type* = 0) {
|
||||
typedef DynRankView<T, P...> src_type;
|
||||
typedef typename src_type::HostMirror dst_type;
|
||||
using src_type = DynRankView<T, P...>;
|
||||
using dst_type = typename src_type::HostMirror;
|
||||
|
||||
return dst_type(std::string(src.label()).append("_mirror"),
|
||||
Impl::reconstructLayout(src.layout(), src.rank()));
|
||||
|
@ -2066,7 +2001,7 @@ inline typename DynRankView<T, P...>::HostMirror create_mirror_view(
|
|||
typename DynRankView<T, P...>::HostMirror::memory_space>::value &&
|
||||
std::is_same<typename DynRankView<T, P...>::data_type,
|
||||
typename DynRankView<T, P...>::HostMirror::data_type>::
|
||||
value)>::type* = 0) {
|
||||
value)>::type* = nullptr) {
|
||||
return Kokkos::create_mirror(src);
|
||||
}
|
||||
|
||||
|
@ -2085,7 +2020,8 @@ template <class Space, class T, class... P>
|
|||
typename Impl::MirrorDRViewType<Space, T, P...>::view_type create_mirror_view(
|
||||
const Space&, const Kokkos::DynRankView<T, P...>& src,
|
||||
typename std::enable_if<
|
||||
!Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace>::type* = 0) {
|
||||
!Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace>::type* =
|
||||
nullptr) {
|
||||
return typename Impl::MirrorDRViewType<Space, T, P...>::view_type(
|
||||
src.label(), Impl::reconstructLayout(src.layout(), src.rank()));
|
||||
}
|
||||
|
@@ -2112,7 +2048,8 @@ create_mirror_view_and_copy(
    const Space&, const Kokkos::DynRankView<T, P...>& src,
    std::string const& name = "",
    typename std::enable_if<
        !Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace>::type* = 0) {
        !Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace>::type* =
        nullptr) {
  using Mirror = typename Impl::MirrorDRViewType<Space, T, P...>::view_type;
  std::string label = name.empty() ? src.label() : name;
  auto mirror = Mirror(Kokkos::ViewAllocateWithoutInitializing(label),
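A usage sketch for the overload above: one call allocates a mirror in the requested space only when needed and deep-copies into it, and the optional name labels the new allocation. The names below are illustrative.

````cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_DynRankView.hpp>

void bring_to_host() {
  Kokkos::DynRankView<double> d("d", 5, 5);

  // If 'd' already lives in HostSpace the result aliases it; otherwise a
  // host copy labeled "d_host" is created and filled.
  auto h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), d, "d_host");

  (void)h;
}
````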
@@ -2139,7 +2076,7 @@ inline void resize(DynRankView<T, P...>& v,
                   const size_t n5 = KOKKOS_INVALID_INDEX,
                   const size_t n6 = KOKKOS_INVALID_INDEX,
                   const size_t n7 = KOKKOS_INVALID_INDEX) {
  typedef DynRankView<T, P...> drview_type;
  using drview_type = DynRankView<T, P...>;

  static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
                "Can only resize managed views");
@@ -2163,7 +2100,7 @@ inline void realloc(DynRankView<T, P...>& v,
                    const size_t n5 = KOKKOS_INVALID_INDEX,
                    const size_t n6 = KOKKOS_INVALID_INDEX,
                    const size_t n7 = KOKKOS_INVALID_INDEX) {
  typedef DynRankView<T, P...> drview_type;
  using drview_type = DynRankView<T, P...>;

  static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
                "Can only realloc managed views");
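Both functions take up to eight extents that default to `KOKKOS_INVALID_INDEX`, so unspecified dimensions stay untouched and the rank is preserved. Roughly:

````cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_DynRankView.hpp>

void grow_and_reset() {
  Kokkos::DynRankView<double> a("a", 10, 10);  // rank 2

  // resize keeps the existing entries (DynRankViewRemap copies the
  // overlapping region into the new allocation).
  Kokkos::resize(a, 20, 10);

  // realloc discards the contents and hands back a freshly initialized array.
  Kokkos::realloc(a, 5, 5);
}
````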
@ -85,13 +85,13 @@ struct ChunkArraySpace<Kokkos::Experimental::HIPSpace> {
|
|||
template <typename DataType, typename... P>
|
||||
class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
|
||||
public:
|
||||
typedef Kokkos::ViewTraits<DataType, P...> traits;
|
||||
using traits = Kokkos::ViewTraits<DataType, P...>;
|
||||
|
||||
private:
|
||||
template <class, class...>
|
||||
friend class DynamicView;
|
||||
|
||||
typedef Kokkos::Impl::SharedAllocationTracker track_type;
|
||||
using track_type = Kokkos::Impl::SharedAllocationTracker;
|
||||
|
||||
static_assert(traits::rank == 1 && traits::rank_dynamic == 1,
|
||||
"DynamicView must be rank-one");
|
||||
|
@ -118,8 +118,8 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
|
|||
|
||||
private:
|
||||
track_type m_track;
|
||||
typename traits::value_type**
|
||||
m_chunks; // array of pointers to 'chunks' of memory
|
||||
typename traits::value_type** m_chunks =
|
||||
nullptr; // array of pointers to 'chunks' of memory
|
||||
unsigned m_chunk_shift; // ceil(log2(m_chunk_size))
|
||||
unsigned m_chunk_mask; // m_chunk_size - 1
|
||||
unsigned m_chunk_max; // number of entries in the chunk array - each pointing
|
||||
|
@ -130,38 +130,36 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
|
|||
//----------------------------------------------------------------------
|
||||
|
||||
/** \brief Compatible view of array of scalar types */
|
||||
typedef DynamicView<typename traits::data_type, typename traits::device_type>
|
||||
array_type;
|
||||
using array_type =
|
||||
DynamicView<typename traits::data_type, typename traits::device_type>;
|
||||
|
||||
/** \brief Compatible view of const data type */
|
||||
typedef DynamicView<typename traits::const_data_type,
|
||||
typename traits::device_type>
|
||||
const_type;
|
||||
using const_type = DynamicView<typename traits::const_data_type,
|
||||
typename traits::device_type>;
|
||||
|
||||
/** \brief Compatible view of non-const data type */
|
||||
typedef DynamicView<typename traits::non_const_data_type,
|
||||
typename traits::device_type>
|
||||
non_const_type;
|
||||
using non_const_type = DynamicView<typename traits::non_const_data_type,
|
||||
typename traits::device_type>;
|
||||
|
||||
/** \brief Must be accessible everywhere */
|
||||
typedef DynamicView HostMirror;
|
||||
using HostMirror = DynamicView;
|
||||
|
||||
/** \brief Unified types */
|
||||
typedef Kokkos::Device<typename traits::device_type::execution_space,
|
||||
Kokkos::AnonymousSpace>
|
||||
uniform_device;
|
||||
typedef array_type uniform_type;
|
||||
typedef const_type uniform_const_type;
|
||||
typedef array_type uniform_runtime_type;
|
||||
typedef const_type uniform_runtime_const_type;
|
||||
typedef DynamicView<typename traits::data_type, uniform_device>
|
||||
uniform_nomemspace_type;
|
||||
typedef DynamicView<typename traits::const_data_type, uniform_device>
|
||||
uniform_const_nomemspace_type;
|
||||
typedef DynamicView<typename traits::data_type, uniform_device>
|
||||
uniform_runtime_nomemspace_type;
|
||||
typedef DynamicView<typename traits::const_data_type, uniform_device>
|
||||
uniform_runtime_const_nomemspace_type;
|
||||
using uniform_device =
|
||||
Kokkos::Device<typename traits::device_type::execution_space,
|
||||
Kokkos::AnonymousSpace>;
|
||||
using uniform_type = array_type;
|
||||
using uniform_const_type = const_type;
|
||||
using uniform_runtime_type = array_type;
|
||||
using uniform_runtime_const_type = const_type;
|
||||
using uniform_nomemspace_type =
|
||||
DynamicView<typename traits::data_type, uniform_device>;
|
||||
using uniform_const_nomemspace_type =
|
||||
DynamicView<typename traits::const_data_type, uniform_device>;
|
||||
using uniform_runtime_nomemspace_type =
|
||||
DynamicView<typename traits::data_type, uniform_device>;
|
||||
using uniform_runtime_const_nomemspace_type =
|
||||
DynamicView<typename traits::const_data_type, uniform_device>;
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
|
@ -193,17 +191,6 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
|
|||
return r == 0 ? size() : 1;
|
||||
}
|
||||
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
KOKKOS_INLINE_FUNCTION size_t dimension_0() const { return size(); }
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const { return 1; }
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const { return 1; }
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const { return 1; }
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_t dimension_4() const { return 1; }
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_t dimension_5() const { return 1; }
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_t dimension_6() const { return 1; }
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_t dimension_7() const { return 1; }
|
||||
#endif
|
||||
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { return 0; }
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { return 0; }
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { return 0; }
|
||||
|
@ -231,8 +218,8 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
|
|||
//----------------------------------------------------------------------
|
||||
// Range span is the span which contains all members.
|
||||
|
||||
typedef typename traits::value_type& reference_type;
|
||||
typedef typename traits::value_type* pointer_type;
|
||||
using reference_type = typename traits::value_type&;
|
||||
using pointer_type = typename traits::value_type*;
|
||||
|
||||
enum {
|
||||
reference_type_is_lvalue_reference =
|
||||
|
@ -299,8 +286,8 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
|
|||
typename Impl::ChunkArraySpace<
|
||||
typename traits::memory_space>::memory_space>::accessible>::type
|
||||
resize_serial(IntType const& n) {
|
||||
typedef typename traits::value_type local_value_type;
|
||||
typedef local_value_type* value_pointer_type;
|
||||
using local_value_type = typename traits::value_type;
|
||||
using value_pointer_type = local_value_type*;
|
||||
|
||||
const uintptr_t NC =
|
||||
(n + m_chunk_mask) >>
|
||||
|
@@ -332,6 +319,17 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
      *(pc + 1) = n;
  }

  KOKKOS_INLINE_FUNCTION bool is_allocated() const {
    if (m_chunks == nullptr) {
      return false;
    } else {
      // *m_chunks[m_chunk_max] stores the current number of chunks being used
      uintptr_t* const pc =
          reinterpret_cast<uintptr_t*>(m_chunks + m_chunk_max);
      return (*(pc + 1) > 0);
    }
  }

  //----------------------------------------------------------------------

  ~DynamicView() = default;
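A hedged sketch of the chunked DynamicView in use. The constructor arguments (label, minimum chunk size, maximum extent bound) and the host-side `resize_serial` follow the code above; `is_allocated()` reports whether any chunks are currently in use, so the expected values in the comments are assumptions about this sequence of calls.

````cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_DynamicView.hpp>

void dynamic_grow() {
  // label, minimum chunk size (rounded to a power of two), max extent bound
  Kokkos::Experimental::DynamicView<double*> v("v", 1024, 1 << 20);

  bool before = v.is_allocated();  // expected: false, no chunks in use yet

  // Grow from host code; chunks are attached on demand.
  v.resize_serial(5000);
  bool after = v.is_allocated();   // expected: true

  (void)before;
  (void)after;
}
````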
@ -349,8 +347,8 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
|
|||
m_chunk_mask(rhs.m_chunk_mask),
|
||||
m_chunk_max(rhs.m_chunk_max),
|
||||
m_chunk_size(rhs.m_chunk_size) {
|
||||
typedef typename DynamicView<RT, RP...>::traits SrcTraits;
|
||||
typedef Kokkos::Impl::ViewMapping<traits, SrcTraits, void> Mapping;
|
||||
using SrcTraits = typename DynamicView<RT, RP...>::traits;
|
||||
using Mapping = Kokkos::Impl::ViewMapping<traits, SrcTraits, void>;
|
||||
static_assert(Mapping::is_assignable,
|
||||
"Incompatible DynamicView copy construction");
|
||||
}
|
||||
|
@ -373,9 +371,7 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
|
|||
}
|
||||
|
||||
void execute(bool arg_destroy) {
|
||||
typedef Kokkos::RangePolicy<typename HostSpace::execution_space> Range;
|
||||
// typedef Kokkos::RangePolicy< typename Impl::ChunkArraySpace< typename
|
||||
// traits::memory_space >::memory_space::execution_space > Range ;
|
||||
using Range = Kokkos::RangePolicy<typename HostSpace::execution_space>;
|
||||
|
||||
m_destroy = arg_destroy;
|
||||
|
||||
|
@ -431,12 +427,11 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
|
|||
m_chunk_shift) // max num pointers-to-chunks in array
|
||||
,
|
||||
m_chunk_size(2 << (m_chunk_shift - 1)) {
|
||||
typedef typename Impl::ChunkArraySpace<
|
||||
typename traits::memory_space>::memory_space chunk_array_memory_space;
|
||||
using chunk_array_memory_space = typename Impl::ChunkArraySpace<
|
||||
typename traits::memory_space>::memory_space;
|
||||
// A functor to deallocate all of the chunks upon final destruction
|
||||
typedef Kokkos::Impl::SharedAllocationRecord<chunk_array_memory_space,
|
||||
Destroy>
|
||||
record_type;
|
||||
using record_type =
|
||||
Kokkos::Impl::SharedAllocationRecord<chunk_array_memory_space, Destroy>;
|
||||
|
||||
// Allocate chunk pointers and allocation counter
|
||||
record_type* const record =
|
||||
|
@ -471,11 +466,11 @@ create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src) {
|
|||
template <class T, class... DP, class... SP>
|
||||
inline void deep_copy(const View<T, DP...>& dst,
|
||||
const Kokkos::Experimental::DynamicView<T, SP...>& src) {
|
||||
typedef View<T, DP...> dst_type;
|
||||
typedef Kokkos::Experimental::DynamicView<T, SP...> src_type;
|
||||
using dst_type = View<T, DP...>;
|
||||
using src_type = Kokkos::Experimental::DynamicView<T, SP...>;
|
||||
|
||||
typedef typename ViewTraits<T, DP...>::execution_space dst_execution_space;
|
||||
typedef typename ViewTraits<T, SP...>::memory_space src_memory_space;
|
||||
using dst_execution_space = typename ViewTraits<T, DP...>::execution_space;
|
||||
using src_memory_space = typename ViewTraits<T, SP...>::memory_space;
|
||||
|
||||
enum {
|
||||
DstExecCanAccessSrc =
|
||||
|
@ -496,11 +491,11 @@ inline void deep_copy(const View<T, DP...>& dst,
|
|||
template <class T, class... DP, class... SP>
|
||||
inline void deep_copy(const Kokkos::Experimental::DynamicView<T, DP...>& dst,
|
||||
const View<T, SP...>& src) {
|
||||
typedef Kokkos::Experimental::DynamicView<T, SP...> dst_type;
|
||||
typedef View<T, DP...> src_type;
|
||||
using dst_type = Kokkos::Experimental::DynamicView<T, SP...>;
|
||||
using src_type = View<T, DP...>;
|
||||
|
||||
typedef typename ViewTraits<T, DP...>::execution_space dst_execution_space;
|
||||
typedef typename ViewTraits<T, SP...>::memory_space src_memory_space;
|
||||
using dst_execution_space = typename ViewTraits<T, DP...>::execution_space;
|
||||
using src_memory_space = typename ViewTraits<T, SP...>::memory_space;
|
||||
|
||||
enum {
|
||||
DstExecCanAccessSrc =
|
||||
|
@ -522,10 +517,10 @@ namespace Impl {
|
|||
template <class Arg0, class... DP, class... SP>
|
||||
struct CommonSubview<Kokkos::Experimental::DynamicView<DP...>,
|
||||
Kokkos::Experimental::DynamicView<SP...>, 1, Arg0> {
|
||||
typedef Kokkos::Experimental::DynamicView<DP...> DstType;
|
||||
typedef Kokkos::Experimental::DynamicView<SP...> SrcType;
|
||||
typedef DstType dst_subview_type;
|
||||
typedef SrcType src_subview_type;
|
||||
using DstType = Kokkos::Experimental::DynamicView<DP...>;
|
||||
using SrcType = Kokkos::Experimental::DynamicView<SP...>;
|
||||
using dst_subview_type = DstType;
|
||||
using src_subview_type = SrcType;
|
||||
dst_subview_type dst_sub;
|
||||
src_subview_type src_sub;
|
||||
CommonSubview(const DstType& dst, const SrcType& src, const Arg0& /*arg0*/)
|
||||
|
@ -535,9 +530,9 @@ struct CommonSubview<Kokkos::Experimental::DynamicView<DP...>,
|
|||
template <class... DP, class SrcType, class Arg0>
|
||||
struct CommonSubview<Kokkos::Experimental::DynamicView<DP...>, SrcType, 1,
|
||||
Arg0> {
|
||||
typedef Kokkos::Experimental::DynamicView<DP...> DstType;
|
||||
typedef DstType dst_subview_type;
|
||||
typedef typename Kokkos::Subview<SrcType, Arg0> src_subview_type;
|
||||
using DstType = Kokkos::Experimental::DynamicView<DP...>;
|
||||
using dst_subview_type = DstType;
|
||||
using src_subview_type = typename Kokkos::Subview<SrcType, Arg0>;
|
||||
dst_subview_type dst_sub;
|
||||
src_subview_type src_sub;
|
||||
CommonSubview(const DstType& dst, const SrcType& src, const Arg0& arg0)
|
||||
|
@ -547,9 +542,9 @@ struct CommonSubview<Kokkos::Experimental::DynamicView<DP...>, SrcType, 1,
|
|||
template <class DstType, class... SP, class Arg0>
|
||||
struct CommonSubview<DstType, Kokkos::Experimental::DynamicView<SP...>, 1,
|
||||
Arg0> {
|
||||
typedef Kokkos::Experimental::DynamicView<SP...> SrcType;
|
||||
typedef typename Kokkos::Subview<DstType, Arg0> dst_subview_type;
|
||||
typedef SrcType src_subview_type;
|
||||
using SrcType = Kokkos::Experimental::DynamicView<SP...>;
|
||||
using dst_subview_type = typename Kokkos::Subview<DstType, Arg0>;
|
||||
using src_subview_type = SrcType;
|
||||
dst_subview_type dst_sub;
|
||||
src_subview_type src_sub;
|
||||
CommonSubview(const DstType& dst, const SrcType& src, const Arg0& arg0)
|
||||
|
@ -559,11 +554,11 @@ struct CommonSubview<DstType, Kokkos::Experimental::DynamicView<SP...>, 1,
|
|||
template <class... DP, class ViewTypeB, class Layout, class ExecSpace,
|
||||
typename iType>
|
||||
struct ViewCopy<Kokkos::Experimental::DynamicView<DP...>, ViewTypeB, Layout,
|
||||
ExecSpace, 1, iType, false> {
|
||||
ExecSpace, 1, iType> {
|
||||
Kokkos::Experimental::DynamicView<DP...> a;
|
||||
ViewTypeB b;
|
||||
|
||||
typedef Kokkos::RangePolicy<ExecSpace, Kokkos::IndexType<iType>> policy_type;
|
||||
using policy_type = Kokkos::RangePolicy<ExecSpace, Kokkos::IndexType<iType>>;
|
||||
|
||||
ViewCopy(const Kokkos::Experimental::DynamicView<DP...>& a_,
|
||||
const ViewTypeB& b_)
|
||||
|
@ -580,11 +575,11 @@ template <class... DP, class... SP, class Layout, class ExecSpace,
|
|||
typename iType>
|
||||
struct ViewCopy<Kokkos::Experimental::DynamicView<DP...>,
|
||||
Kokkos::Experimental::DynamicView<SP...>, Layout, ExecSpace, 1,
|
||||
iType, false> {
|
||||
iType> {
|
||||
Kokkos::Experimental::DynamicView<DP...> a;
|
||||
Kokkos::Experimental::DynamicView<SP...> b;
|
||||
|
||||
typedef Kokkos::RangePolicy<ExecSpace, Kokkos::IndexType<iType>> policy_type;
|
||||
using policy_type = Kokkos::RangePolicy<ExecSpace, Kokkos::IndexType<iType>>;
|
||||
|
||||
ViewCopy(const Kokkos::Experimental::DynamicView<DP...>& a_,
|
||||
const Kokkos::Experimental::DynamicView<SP...>& b_)
|
||||
|
|
|
@ -56,9 +56,9 @@ namespace Experimental {
|
|||
template <typename ReportType, typename DeviceType>
|
||||
class ErrorReporter {
|
||||
public:
|
||||
typedef ReportType report_type;
|
||||
typedef DeviceType device_type;
|
||||
typedef typename device_type::execution_space execution_space;
|
||||
using report_type = ReportType;
|
||||
using device_type = DeviceType;
|
||||
using execution_space = typename device_type::execution_space;
|
||||
|
||||
ErrorReporter(int max_results)
|
||||
: m_numReportsAttempted(""),
|
||||
|
@ -103,10 +103,10 @@ class ErrorReporter {
|
|||
}
|
||||
|
||||
private:
|
||||
typedef Kokkos::View<report_type *, execution_space> reports_view_t;
|
||||
typedef Kokkos::DualView<report_type *, execution_space> reports_dualview_t;
|
||||
using reports_view_t = Kokkos::View<report_type *, execution_space>;
|
||||
using reports_dualview_t = Kokkos::DualView<report_type *, execution_space>;
|
||||
|
||||
typedef typename reports_dualview_t::host_mirror_space host_mirror_space;
|
||||
using host_mirror_space = typename reports_dualview_t::host_mirror_space;
|
||||
Kokkos::View<int, execution_space> m_numReportsAttempted;
|
||||
reports_dualview_t m_reports;
|
||||
Kokkos::DualView<int *, execution_space> m_reporters;
|
||||
|
|
|
@@ -52,10 +52,10 @@ namespace Kokkos {

template <typename T>
struct pod_hash {
  typedef T argument_type;
  typedef T first_argument_type;
  typedef uint32_t second_argument_type;
  typedef uint32_t result_type;
  using argument_type = T;
  using first_argument_type = T;
  using second_argument_type = uint32_t;
  using result_type = uint32_t;

  KOKKOS_FORCEINLINE_FUNCTION
  uint32_t operator()(T const& t) const {
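These functors (and the comparators further down: `equal_to`, `less`, and friends) are device-callable counterparts of their `std::` namesakes, used for example by the unordered containers. A small host-side sketch; the `<Kokkos_Functional.hpp>` include path is an assumption about where this header is installed.

````cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_Functional.hpp>
#include <cstdint>
#include <cstdio>

void functors_on_host() {
  // pod_hash hashes the bytes of a trivially copyable key.
  Kokkos::pod_hash<int> hasher;
  uint32_t h = hasher(42);

  // The comparators are plain function objects, also usable in kernels.
  bool eq = Kokkos::equal_to<int>()(1, 1);  // true
  bool lt = Kokkos::less<int>()(1, 2);      // true

  std::printf("hash=%u eq=%d lt=%d\n", h, int(eq), int(lt));
}
````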
@ -70,9 +70,9 @@ struct pod_hash {
|
|||
|
||||
template <typename T>
|
||||
struct pod_equal_to {
|
||||
typedef T first_argument_type;
|
||||
typedef T second_argument_type;
|
||||
typedef bool result_type;
|
||||
using first_argument_type = T;
|
||||
using second_argument_type = T;
|
||||
using result_type = bool;
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
bool operator()(T const& a, T const& b) const {
|
||||
|
@ -82,9 +82,9 @@ struct pod_equal_to {
|
|||
|
||||
template <typename T>
|
||||
struct pod_not_equal_to {
|
||||
typedef T first_argument_type;
|
||||
typedef T second_argument_type;
|
||||
typedef bool result_type;
|
||||
using first_argument_type = T;
|
||||
using second_argument_type = T;
|
||||
using result_type = bool;
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
bool operator()(T const& a, T const& b) const {
|
||||
|
@ -94,9 +94,9 @@ struct pod_not_equal_to {
|
|||
|
||||
template <typename T>
|
||||
struct equal_to {
|
||||
typedef T first_argument_type;
|
||||
typedef T second_argument_type;
|
||||
typedef bool result_type;
|
||||
using first_argument_type = T;
|
||||
using second_argument_type = T;
|
||||
using result_type = bool;
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
bool operator()(T const& a, T const& b) const { return a == b; }
|
||||
|
@ -104,9 +104,9 @@ struct equal_to {
|
|||
|
||||
template <typename T>
|
||||
struct not_equal_to {
|
||||
typedef T first_argument_type;
|
||||
typedef T second_argument_type;
|
||||
typedef bool result_type;
|
||||
using first_argument_type = T;
|
||||
using second_argument_type = T;
|
||||
using result_type = bool;
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
bool operator()(T const& a, T const& b) const { return a != b; }
|
||||
|
@ -114,9 +114,9 @@ struct not_equal_to {
|
|||
|
||||
template <typename T>
|
||||
struct greater {
|
||||
typedef T first_argument_type;
|
||||
typedef T second_argument_type;
|
||||
typedef bool result_type;
|
||||
using first_argument_type = T;
|
||||
using second_argument_type = T;
|
||||
using result_type = bool;
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
bool operator()(T const& a, T const& b) const { return a > b; }
|
||||
|
@ -124,9 +124,9 @@ struct greater {
|
|||
|
||||
template <typename T>
|
||||
struct less {
|
||||
typedef T first_argument_type;
|
||||
typedef T second_argument_type;
|
||||
typedef bool result_type;
|
||||
using first_argument_type = T;
|
||||
using second_argument_type = T;
|
||||
using result_type = bool;
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
bool operator()(T const& a, T const& b) const { return a < b; }
|
||||
|
@ -134,9 +134,9 @@ struct less {
|
|||
|
||||
template <typename T>
|
||||
struct greater_equal {
|
||||
typedef T first_argument_type;
|
||||
typedef T second_argument_type;
|
||||
typedef bool result_type;
|
||||
using first_argument_type = T;
|
||||
using second_argument_type = T;
|
||||
using result_type = bool;
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
bool operator()(T const& a, T const& b) const { return a >= b; }
|
||||
|
@ -144,9 +144,9 @@ struct greater_equal {
|
|||
|
||||
template <typename T>
|
||||
struct less_equal {
|
||||
typedef T first_argument_type;
|
||||
typedef T second_argument_type;
|
||||
typedef bool result_type;
|
||||
using first_argument_type = T;
|
||||
using second_argument_type = T;
|
||||
using result_type = bool;
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
bool operator()(T const& a, T const& b) const { return a <= b; }
|
||||
|
|
|
@ -51,10 +51,10 @@ namespace Impl {
|
|||
|
||||
template <class ViewType>
|
||||
struct GetOffsetViewTypeFromViewType {
|
||||
typedef OffsetView<
|
||||
typename ViewType::data_type, typename ViewType::array_layout,
|
||||
typename ViewType::device_type, typename ViewType::memory_traits>
|
||||
type;
|
||||
using type =
|
||||
OffsetView<typename ViewType::data_type, typename ViewType::array_layout,
|
||||
typename ViewType::device_type,
|
||||
typename ViewType::memory_traits>;
|
||||
};
|
||||
|
||||
template <unsigned, class MapType, class BeginsType>
|
||||
|
@ -180,7 +180,7 @@ void runtime_check_rank_device(const size_t rank_dynamic, const size_t rank,
|
|||
template <class DataType, class... Properties>
|
||||
class OffsetView : public ViewTraits<DataType, Properties...> {
|
||||
public:
|
||||
typedef ViewTraits<DataType, Properties...> traits;
|
||||
using traits = ViewTraits<DataType, Properties...>;
|
||||
|
||||
private:
|
||||
template <class, class...>
|
||||
|
@ -190,12 +190,12 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
|
|||
template <class, class...>
|
||||
friend class Kokkos::Impl::ViewMapping;
|
||||
|
||||
typedef Kokkos::Impl::ViewMapping<traits, void> map_type;
|
||||
typedef Kokkos::Impl::SharedAllocationTracker track_type;
|
||||
using map_type = Kokkos::Impl::ViewMapping<traits, void>;
|
||||
using track_type = Kokkos::Impl::SharedAllocationTracker;
|
||||
|
||||
public:
|
||||
enum { Rank = map_type::Rank };
|
||||
typedef Kokkos::Array<int64_t, Rank> begins_type;
|
||||
using begins_type = Kokkos::Array<int64_t, Rank>;
|
||||
|
||||
template <
|
||||
typename iType,
|
||||
|
@ -223,28 +223,27 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
|
|||
public:
|
||||
//----------------------------------------
|
||||
/** \brief Compatible view of array of scalar types */
|
||||
typedef OffsetView<
|
||||
typename traits::scalar_array_type, typename traits::array_layout,
|
||||
typename traits::device_type, typename traits::memory_traits>
|
||||
array_type;
|
||||
using array_type =
|
||||
OffsetView<typename traits::scalar_array_type,
|
||||
typename traits::array_layout, typename traits::device_type,
|
||||
typename traits::memory_traits>;
|
||||
|
||||
/** \brief Compatible view of const data type */
|
||||
typedef OffsetView<
|
||||
typename traits::const_data_type, typename traits::array_layout,
|
||||
typename traits::device_type, typename traits::memory_traits>
|
||||
const_type;
|
||||
using const_type =
|
||||
OffsetView<typename traits::const_data_type,
|
||||
typename traits::array_layout, typename traits::device_type,
|
||||
typename traits::memory_traits>;
|
||||
|
||||
/** \brief Compatible view of non-const data type */
|
||||
typedef OffsetView<
|
||||
typename traits::non_const_data_type, typename traits::array_layout,
|
||||
typename traits::device_type, typename traits::memory_traits>
|
||||
non_const_type;
|
||||
using non_const_type =
|
||||
OffsetView<typename traits::non_const_data_type,
|
||||
typename traits::array_layout, typename traits::device_type,
|
||||
typename traits::memory_traits>;
|
||||
|
||||
/** \brief Compatible HostMirror view */
|
||||
typedef OffsetView<typename traits::non_const_data_type,
|
||||
typename traits::array_layout,
|
||||
typename traits::host_mirror_space>
|
||||
HostMirror;
|
||||
using HostMirror = OffsetView<typename traits::non_const_data_type,
|
||||
typename traits::array_layout,
|
||||
typename traits::host_mirror_space>;
|
||||
|
||||
//----------------------------------------
|
||||
// Domain rank and extents
|
||||
|
@ -335,8 +334,8 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
|
|||
//----------------------------------------
|
||||
// Range span is the span which contains all members.
|
||||
|
||||
typedef typename map_type::reference_type reference_type;
|
||||
typedef typename map_type::pointer_type pointer_type;
|
||||
using reference_type = typename map_type::reference_type;
|
||||
using pointer_type = typename map_type::pointer_type;
|
||||
|
||||
enum {
|
||||
reference_type_is_lvalue_reference =
|
||||
|
@@ -347,6 +346,9 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
  KOKKOS_INLINE_FUNCTION bool span_is_contiguous() const {
    return m_map.span_is_contiguous();
  }
  KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const {
    return m_map.data() != nullptr;
  }
  KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const {
    return m_map.data();
  }
@ -841,10 +843,9 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
|
|||
|
||||
// interoperability with View
|
||||
private:
|
||||
typedef View<typename traits::scalar_array_type,
|
||||
typename traits::array_layout, typename traits::device_type,
|
||||
typename traits::memory_traits>
|
||||
view_type;
|
||||
using view_type =
|
||||
View<typename traits::scalar_array_type, typename traits::array_layout,
|
||||
typename traits::device_type, typename traits::memory_traits>;
|
||||
|
||||
public:
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
|
@ -856,8 +857,8 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
|
|||
template <class RT, class... RP>
|
||||
KOKKOS_INLINE_FUNCTION OffsetView(const View<RT, RP...>& aview)
|
||||
: m_track(aview.impl_track()), m_map() {
|
||||
typedef typename OffsetView<RT, RP...>::traits SrcTraits;
|
||||
typedef Kokkos::Impl::ViewMapping<traits, SrcTraits, void> Mapping;
|
||||
using SrcTraits = typename OffsetView<RT, RP...>::traits;
|
||||
using Mapping = Kokkos::Impl::ViewMapping<traits, SrcTraits, void>;
|
||||
static_assert(Mapping::is_assignable,
|
||||
"Incompatible OffsetView copy construction");
|
||||
Mapping::assign(m_map, aview.impl_map(), m_track);
|
||||
|
@@ -871,8 +872,8 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
  KOKKOS_INLINE_FUNCTION OffsetView(const View<RT, RP...>& aview,
                                    const index_list_type& minIndices)
      : m_track(aview.impl_track()), m_map() {
    typedef typename OffsetView<RT, RP...>::traits SrcTraits;
    typedef Kokkos::Impl::ViewMapping<traits, SrcTraits, void> Mapping;
    using SrcTraits = typename OffsetView<RT, RP...>::traits;
    using Mapping = Kokkos::Impl::ViewMapping<traits, SrcTraits, void>;
    static_assert(Mapping::is_assignable,
                  "Incompatible OffsetView copy construction");
    Mapping::assign(m_map, aview.impl_map(), m_track);
@ -894,8 +895,8 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
|
|||
KOKKOS_INLINE_FUNCTION OffsetView(const View<RT, RP...>& aview,
|
||||
const begins_type& beg)
|
||||
: m_track(aview.impl_track()), m_map(), m_begins(beg) {
|
||||
typedef typename OffsetView<RT, RP...>::traits SrcTraits;
|
||||
typedef Kokkos::Impl::ViewMapping<traits, SrcTraits, void> Mapping;
|
||||
using SrcTraits = typename OffsetView<RT, RP...>::traits;
|
||||
using Mapping = Kokkos::Impl::ViewMapping<traits, SrcTraits, void>;
|
||||
static_assert(Mapping::is_assignable,
|
||||
"Incompatible OffsetView copy construction");
|
||||
Mapping::assign(m_map, aview.impl_map(), m_track);
|
||||
|
@ -917,8 +918,8 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
|
|||
: m_track(rhs.m_track, traits::is_managed),
|
||||
m_map(),
|
||||
m_begins(rhs.m_begins) {
|
||||
typedef typename OffsetView<RT, RP...>::traits SrcTraits;
|
||||
typedef Kokkos::Impl::ViewMapping<traits, SrcTraits, void> Mapping;
|
||||
using SrcTraits = typename OffsetView<RT, RP...>::traits;
|
||||
using Mapping = Kokkos::Impl::ViewMapping<traits, SrcTraits, void>;
|
||||
static_assert(Mapping::is_assignable,
|
||||
"Incompatible OffsetView copy construction");
|
||||
Mapping::assign(m_map, rhs.m_map, rhs.m_track); // swb what about assign?
|
||||
|
@@ -1215,11 +1216,11 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
    for (size_t i = 0; i < Rank; ++i) m_begins[i] = minIndices.begin()[i];

    // Append layout and spaces if not input
    typedef Kokkos::Impl::ViewCtorProp<P...> alloc_prop_input;
    using alloc_prop_input = Kokkos::Impl::ViewCtorProp<P...>;

    // use 'std::integral_constant<unsigned,I>' for non-types
    // to avoid duplicate class error.
    typedef Kokkos::Impl::ViewCtorProp<
    using alloc_prop = Kokkos::Impl::ViewCtorProp<
        P...,
        typename std::conditional<alloc_prop_input::has_label,
                                  std::integral_constant<unsigned, 0>,

@@ -1231,19 +1232,13 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
        typename std::conditional<
            alloc_prop_input::has_execution_space,
            std::integral_constant<unsigned, 2>,
            typename traits::device_type::execution_space>::type>
        alloc_prop;
            typename traits::device_type::execution_space>::type>;

    static_assert(traits::is_managed,
                  "OffsetView allocation constructor requires managed memory");

    if (alloc_prop::initialize &&
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
        !alloc_prop::execution_space::is_initialized()
#else
        !alloc_prop::execution_space::impl_is_initialized()
#endif
    ) {
        !alloc_prop::execution_space::impl_is_initialized()) {
      // If initializing view data then
      // the execution space must be initialized.
      Kokkos::Impl::throw_runtime_exception(
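This hunk drops the `KOKKOS_ENABLE_DEPRECATED_CODE` branch, so the allocation path now always asks the execution space whether it is live through the internal `impl_is_initialized()` query. The user-facing analogue of that guard is the public `Kokkos::is_initialized()`; a minimal sketch of that pattern, assuming only the standard Kokkos core API:
````cpp
#include <Kokkos_Core.hpp>
#include <stdexcept>

int main(int argc, char* argv[]) {
  // Allocating a default-initialized View requires a live execution space;
  // the public query for that state is Kokkos::is_initialized().
  if (!Kokkos::is_initialized()) Kokkos::initialize(argc, argv);

  {
    Kokkos::View<double*> v("v", 100);  // zero-initialized, needs the space
    if (v.size() != 100) throw std::runtime_error("unexpected size");
  }

  Kokkos::finalize();
  return 0;
}
````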
@@ -1764,8 +1759,8 @@ template <class LT, class... LP, class RT, class... RP>
KOKKOS_INLINE_FUNCTION bool operator==(const OffsetView<LT, LP...>& lhs,
                                       const OffsetView<RT, RP...>& rhs) {
  // Same data, layout, dimensions
  typedef ViewTraits<LT, LP...> lhs_traits;
  typedef ViewTraits<RT, RP...> rhs_traits;
  using lhs_traits = ViewTraits<LT, LP...>;
  using rhs_traits = ViewTraits<RT, RP...>;

  return std::is_same<typename lhs_traits::const_value_type,
                      typename rhs_traits::const_value_type>::value &&

@@ -1795,8 +1790,8 @@ template <class LT, class... LP, class RT, class... RP>
KOKKOS_INLINE_FUNCTION bool operator==(const View<LT, LP...>& lhs,
                                       const OffsetView<RT, RP...>& rhs) {
  // Same data, layout, dimensions
  typedef ViewTraits<LT, LP...> lhs_traits;
  typedef ViewTraits<RT, RP...> rhs_traits;
  using lhs_traits = ViewTraits<LT, LP...>;
  using rhs_traits = ViewTraits<RT, RP...>;

  return std::is_same<typename lhs_traits::const_value_type,
                      typename rhs_traits::const_value_type>::value &&
@@ -1825,10 +1820,10 @@ KOKKOS_INLINE_FUNCTION bool operator==(const OffsetView<LT, LP...>& lhs,
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Experimental {

template <class DT, class... DP>
inline void deep_copy(
    const OffsetView<DT, DP...>& dst,
    const Experimental::OffsetView<DT, DP...>& dst,
    typename ViewTraits<DT, DP...>::const_value_type& value,
    typename std::enable_if<std::is_same<
        typename ViewTraits<DT, DP...>::specialize, void>::value>::type* =

@@ -1844,7 +1839,8 @@ inline void deep_copy(

template <class DT, class... DP, class ST, class... SP>
inline void deep_copy(
    const OffsetView<DT, DP...>& dst, const OffsetView<ST, SP...>& value,
    const Experimental::OffsetView<DT, DP...>& dst,
    const Experimental::OffsetView<ST, SP...>& value,
    typename std::enable_if<std::is_same<
        typename ViewTraits<DT, DP...>::specialize, void>::value>::type* =
        nullptr) {

@@ -1858,7 +1854,8 @@ inline void deep_copy(
}
template <class DT, class... DP, class ST, class... SP>
inline void deep_copy(
    const OffsetView<DT, DP...>& dst, const View<ST, SP...>& value,
    const Experimental::OffsetView<DT, DP...>& dst,
    const View<ST, SP...>& value,
    typename std::enable_if<std::is_same<
        typename ViewTraits<DT, DP...>::specialize, void>::value>::type* =
        nullptr) {

@@ -1873,7 +1870,8 @@ inline void deep_copy(

template <class DT, class... DP, class ST, class... SP>
inline void deep_copy(
    const View<DT, DP...>& dst, const OffsetView<ST, SP...>& value,
    const View<DT, DP...>& dst,
    const Experimental::OffsetView<ST, SP...>& value,
    typename std::enable_if<std::is_same<
        typename ViewTraits<DT, DP...>::specialize, void>::value>::type* =
        nullptr) {
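With these hunks the overloads live in namespace `Kokkos` and spell out `Experimental::OffsetView`, so `deep_copy` can mix offset and plain views freely. A minimal usage sketch, assuming the `OffsetView(View, min-indices)` constructor and the `view()` accessor that appear in the hunks above:
````cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_OffsetView.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<double*> flat("flat", 10);
    // Re-index the same allocation so valid indices start at -5
    // (the View + minimum-indices constructor shown above).
    Kokkos::Experimental::OffsetView<double*> shifted(flat, {-5});

    Kokkos::deep_copy(shifted, 1.0);          // scalar fill via the OffsetView overload
    Kokkos::deep_copy(flat, shifted.view());  // plain View-to-View copy back
  }
  Kokkos::finalize();
  return 0;
}
````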
@@ -1884,53 +1882,54 @@ inline void deep_copy(

  Kokkos::deep_copy(dst, value.view());
}

namespace Impl {

// Deduce Mirror Types
template <class Space, class T, class... P>
struct MirrorOffsetViewType {
  // The incoming view_type
  typedef typename Kokkos::Experimental::OffsetView<T, P...> src_view_type;
  using src_view_type = typename Kokkos::Experimental::OffsetView<T, P...>;
  // The memory space for the mirror view
  typedef typename Space::memory_space memory_space;
  using memory_space = typename Space::memory_space;
  // Check whether it is the same memory space
  enum {
    is_same_memspace =
        std::is_same<memory_space, typename src_view_type::memory_space>::value
  };
  // The array_layout
  typedef typename src_view_type::array_layout array_layout;
  using array_layout = typename src_view_type::array_layout;
  // The data type (we probably want it non-const since otherwise we can't even
  // deep_copy to it.
  typedef typename src_view_type::non_const_data_type data_type;
  using data_type = typename src_view_type::non_const_data_type;
  // The destination view type if it is not the same memory space
  typedef Kokkos::Experimental::OffsetView<data_type, array_layout, Space>
      dest_view_type;
  using dest_view_type =
      Kokkos::Experimental::OffsetView<data_type, array_layout, Space>;
  // If it is the same memory_space return the existsing view_type
  // This will also keep the unmanaged trait if necessary
  typedef typename std::conditional<is_same_memspace, src_view_type,
                                    dest_view_type>::type view_type;
  using view_type = typename std::conditional<is_same_memspace, src_view_type,
                                              dest_view_type>::type;
};

template <class Space, class T, class... P>
struct MirrorOffsetType {
  // The incoming view_type
  typedef typename Kokkos::Experimental::OffsetView<T, P...> src_view_type;
  using src_view_type = typename Kokkos::Experimental::OffsetView<T, P...>;
  // The memory space for the mirror view
  typedef typename Space::memory_space memory_space;
  using memory_space = typename Space::memory_space;
  // Check whether it is the same memory space
  enum {
    is_same_memspace =
        std::is_same<memory_space, typename src_view_type::memory_space>::value
  };
  // The array_layout
  typedef typename src_view_type::array_layout array_layout;
  using array_layout = typename src_view_type::array_layout;
  // The data type (we probably want it non-const since otherwise we can't even
  // deep_copy to it.
  typedef typename src_view_type::non_const_data_type data_type;
  using data_type = typename src_view_type::non_const_data_type;
  // The destination view type if it is not the same memory space
  typedef Kokkos::Experimental::OffsetView<data_type, array_layout, Space>
      view_type;
  using view_type =
      Kokkos::Experimental::OffsetView<data_type, array_layout, Space>;
};

} // namespace Impl
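`MirrorOffsetViewType` keeps the source type when the requested space already is the source's memory space and otherwise switches to a freshly deduced mirror type, all through `std::conditional`. A generic, Kokkos-free sketch of that selection pattern (every type name below is a placeholder):
````cpp
#include <type_traits>

// "Same space -> reuse the source type, otherwise use the mirror type",
// the same compile-time selection MirrorOffsetViewType performs above.
template <class SrcView, class MirrorView, class SrcSpace, class DstSpace>
struct pick_mirror {
  static constexpr bool same_space = std::is_same<SrcSpace, DstSpace>::value;
  using type = typename std::conditional<same_space, SrcView, MirrorView>::type;
};

struct HostSpace {};
struct DeviceSpace {};
struct HostVec {};
struct DeviceVec {};

static_assert(
    std::is_same<pick_mirror<DeviceVec, HostVec, DeviceSpace, DeviceSpace>::type,
                 DeviceVec>::value,
    "same space keeps the source type");
static_assert(
    std::is_same<pick_mirror<DeviceVec, HostVec, DeviceSpace, HostSpace>::type,
                 HostVec>::value,
    "different space selects the mirror type");

int main() { return 0; }
````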
@@ -1942,8 +1941,8 @@ create_mirror(
    typename std::enable_if<
        !std::is_same<typename Kokkos::ViewTraits<T, P...>::array_layout,
                      Kokkos::LayoutStride>::value>::type* = 0) {
  typedef OffsetView<T, P...> src_type;
  typedef typename src_type::HostMirror dst_type;
  using src_type = Experimental::OffsetView<T, P...>;
  using dst_type = typename src_type::HostMirror;

  return dst_type(
      Kokkos::Impl::ViewCtorProp<std::string>(

@@ -1962,8 +1961,8 @@ create_mirror(
    typename std::enable_if<
        std::is_same<typename Kokkos::ViewTraits<T, P...>::array_layout,
                     Kokkos::LayoutStride>::value>::type* = 0) {
  typedef OffsetView<T, P...> src_type;
  typedef typename src_type::HostMirror dst_type;
  using src_type = Experimental::OffsetView<T, P...>;
  using dst_type = typename src_type::HostMirror;

  Kokkos::LayoutStride layout;

@@ -1992,14 +1991,13 @@ create_mirror(

// Create a mirror in a new space (specialization for different space)
template <class Space, class T, class... P>
typename Kokkos::Experimental::Impl::MirrorOffsetType<Space, T, P...>::view_type
typename Kokkos::Impl::MirrorOffsetType<Space, T, P...>::view_type
create_mirror(const Space&,
              const Kokkos::Experimental::OffsetView<T, P...>& src) {
  return typename Kokkos::Experimental::Impl::MirrorOffsetType<
      Space, T, P...>::view_type(src.label(), src.layout(),
                                 {src.begin(0), src.begin(1), src.begin(2),
                                  src.begin(3), src.begin(4), src.begin(5),
                                  src.begin(6), src.begin(7)});
  return typename Kokkos::Impl::MirrorOffsetType<Space, T, P...>::view_type(
      src.label(), src.layout(),
      {src.begin(0), src.begin(1), src.begin(2), src.begin(3), src.begin(4),
       src.begin(5), src.begin(6), src.begin(7)});
}

template <class T, class... P>
@@ -2031,13 +2029,12 @@ create_mirror_view(
            typename Kokkos::Experimental::OffsetView<T, P...>::data_type,
            typename Kokkos::Experimental::OffsetView<
                T, P...>::HostMirror::data_type>::value)>::type* = 0) {
  return Kokkos::Experimental::create_mirror(src);
  return Kokkos::create_mirror(src);
}

// Create a mirror view in a new space (specialization for same space)
template <class Space, class T, class... P>
typename Kokkos::Experimental::Impl::MirrorOffsetViewType<Space, T,
                                                          P...>::view_type
typename Kokkos::Impl::MirrorOffsetViewType<Space, T, P...>::view_type
create_mirror_view(const Space&,
                   const Kokkos::Experimental::OffsetView<T, P...>& src,
                   typename std::enable_if<Impl::MirrorOffsetViewType<

@@ -2047,17 +2044,15 @@ create_mirror_view(const Space&,

// Create a mirror view in a new space (specialization for different space)
template <class Space, class T, class... P>
typename Kokkos::Experimental::Impl::MirrorOffsetViewType<Space, T,
                                                          P...>::view_type
typename Kokkos::Impl::MirrorOffsetViewType<Space, T, P...>::view_type
create_mirror_view(const Space&,
                   const Kokkos::Experimental::OffsetView<T, P...>& src,
                   typename std::enable_if<!Impl::MirrorOffsetViewType<
                       Space, T, P...>::is_same_memspace>::type* = 0) {
  return typename Kokkos::Experimental::Impl::MirrorOffsetViewType<
      Space, T, P...>::view_type(src.label(), src.layout(),
                                 {src.begin(0), src.begin(1), src.begin(2),
                                  src.begin(3), src.begin(4), src.begin(5),
                                  src.begin(6), src.begin(7)});
  return typename Kokkos::Impl::MirrorOffsetViewType<Space, T, P...>::view_type(
      src.label(), src.layout(),
      {src.begin(0), src.begin(1), src.begin(2), src.begin(3), src.begin(4),
       src.begin(5), src.begin(6), src.begin(7)});
}
//
// // Create a mirror view and deep_copy in a new space (specialization for

@@ -2093,7 +2088,6 @@ create_mirror_view(const Space&,
//   return mirror;
// }

} // namespace Experimental
} /* namespace Kokkos */

//----------------------------------------------------------------------------
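The `create_mirror`/`create_mirror_view` overloads above back the usual host-mirror workflow for `OffsetView`. A hedged usage sketch; the `{first, last}` constructor arguments are assumed to follow the OffsetView range convention, and the copy uses the OffsetView-to-OffsetView `deep_copy` overload from the earlier hunks:
````cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_OffsetView.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // Hypothetical label and bounds; row indices are negative on purpose.
    Kokkos::Experimental::OffsetView<double**> field("field", {-2, 2}, {0, 9});

    // Host mirror, fill on the host, copy back to the device allocation.
    auto h_field = Kokkos::Experimental::create_mirror_view(field);
    for (auto i = field.begin(0); i < field.end(0); ++i)
      for (auto j = field.begin(1); j < field.end(1); ++j)
        h_field(i, j) = static_cast<double>(i + j);
    Kokkos::deep_copy(field, h_field);
  }
  Kokkos::finalize();
  return 0;
}
````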
(File diff suppressed because it is too large.)
@@ -57,7 +57,7 @@ namespace Kokkos {
namespace Impl {
template <class RowOffsetsType, class RowBlockOffsetsType>
struct StaticCrsGraphBalancerFunctor {
  typedef typename RowOffsetsType::non_const_value_type int_type;
  using int_type = typename RowOffsetsType::non_const_value_type;
  RowOffsetsType row_offsets;
  RowBlockOffsetsType row_block_offsets;

@@ -148,7 +148,7 @@ struct StaticCrsGraphBalancerFunctor {
///
/// Here is an example loop over the entries in the row:
/// \code
/// typedef typename GraphRowViewConst<MatrixType>::ordinal_type ordinal_type;
/// using ordinal_type = typename GraphRowViewConst<MatrixType>::ordinal_type;
///
/// GraphRowView<GraphType> G_i = ...;
/// const ordinal_type numEntries = G_i.length;

@@ -159,7 +159,7 @@ struct StaticCrsGraphBalancerFunctor {
/// \endcode
///
/// GraphType must provide the \c data_type
/// typedefs. In addition, it must make sense to use GraphRowViewConst to
/// aliases. In addition, it must make sense to use GraphRowViewConst to
/// view a row of GraphType. In particular, column
/// indices of a row must be accessible using the <tt>entries</tt>
/// resp. <tt>colidx</tt> arrays given to the constructor of this
@@ -170,7 +170,7 @@ struct StaticCrsGraphBalancerFunctor {
template <class GraphType>
struct GraphRowViewConst {
  //! The type of the column indices in the row.
  typedef const typename GraphType::data_type ordinal_type;
  using ordinal_type = const typename GraphType::data_type;

 private:
  //! Array of (local) column indices in the row.

@@ -279,49 +279,33 @@ struct GraphRowViewConst {
///   <li> <tt> entries( row_map[i0] + i1 , i2 , i3 , ... ); </tt> </li>
/// </ul>
template <class DataType, class Arg1Type, class Arg2Type = void,
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
          typename SizeType =
              typename ViewTraits<DataType*, Arg1Type, Arg2Type>::size_type,
          class Arg3Type = void>
#else
          class Arg3Type = void,
          typename SizeType = typename ViewTraits<DataType*, Arg1Type, Arg2Type,
                                                  Arg3Type>::size_type>
#endif
class StaticCrsGraph {
 private:
  typedef ViewTraits<DataType*, Arg1Type, Arg2Type, Arg3Type> traits;
  using traits = ViewTraits<DataType*, Arg1Type, Arg2Type, Arg3Type>;

 public:
  typedef DataType data_type;
  typedef typename traits::array_layout array_layout;
  typedef typename traits::execution_space execution_space;
  typedef typename traits::device_type device_type;
  typedef typename traits::memory_traits memory_traits;
  typedef SizeType size_type;
  using data_type = DataType;
  using array_layout = typename traits::array_layout;
  using execution_space = typename traits::execution_space;
  using device_type = typename traits::device_type;
  using memory_traits = typename traits::memory_traits;
  using size_type = SizeType;

#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
  typedef StaticCrsGraph<DataType, Arg1Type, Arg2Type, SizeType, Arg3Type>
      staticcrsgraph_type;
  typedef StaticCrsGraph<data_type, array_layout,
                         typename traits::host_mirror_space, size_type,
                         memory_traits>
      HostMirror;
#else
  typedef StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type, SizeType>
      staticcrsgraph_type;
  typedef StaticCrsGraph<data_type, array_layout,
                         typename traits::host_mirror_space, memory_traits,
                         size_type>
      HostMirror;
#endif
  using staticcrsgraph_type =
      StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type, SizeType>;
  using HostMirror = StaticCrsGraph<data_type, array_layout,
                                    typename traits::host_mirror_space,
                                    memory_traits, size_type>;

  typedef View<const size_type*, array_layout, device_type, memory_traits>
      row_map_type;
  typedef View<data_type*, array_layout, device_type, memory_traits>
      entries_type;
  typedef View<const size_type*, array_layout, device_type, memory_traits>
      row_block_type;
  using row_map_type =
      View<const size_type*, array_layout, device_type, memory_traits>;
  using entries_type =
      View<data_type*, array_layout, device_type, memory_traits>;
  using row_block_type =
      View<const size_type*, array_layout, device_type, memory_traits>;

  entries_type entries;
  row_map_type row_map;
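`row_map` and `entries` are the classic compressed-row-storage pair: `row_map` has one more entry than there are rows, and row `i` owns `entries[row_map[i]] .. entries[row_map[i+1]-1]`. A plain-C++ sketch of that layout, with no Kokkos types involved:
````cpp
#include <cstdio>
#include <vector>

// CPU-only illustration of the CRS layout used by StaticCrsGraph: row_map has
// length nrows+1 and entries stores all column indices back to back.
int main() {
  std::vector<std::size_t> row_map = {0, 2, 3, 6};  // 3 rows
  std::vector<int> entries = {1, 2, 0, 0, 1, 2};    // their column indices

  for (std::size_t i = 0; i + 1 < row_map.size(); ++i) {
    std::printf("row %zu:", i);
    for (std::size_t k = row_map[i]; k < row_map[i + 1]; ++k)
      std::printf(" %d", entries[k]);
    std::printf("\n");
  }
  return 0;
}
````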
@@ -370,6 +354,10 @@ class StaticCrsGraph {
               : static_cast<size_type>(0);
  }

  KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const {
    return (row_map.is_allocated() && entries.is_allocated());
  }

  /// \brief Return a const view of row i of the graph.
  ///
  /// If row i does not belong to the graph, return an empty view.
@@ -436,35 +424,19 @@ typename StaticCrsGraphType::staticcrsgraph_type create_staticcrsgraph(

//----------------------------------------------------------------------------

template <class DataType, class Arg1Type, class Arg2Type,
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
          typename SizeType, class Arg3Type>
typename StaticCrsGraph<DataType, Arg1Type, Arg2Type, SizeType,
                        Arg3Type>::HostMirror
create_mirror_view(const StaticCrsGraph<DataType, Arg1Type, Arg2Type, SizeType,
                                        Arg3Type>& input);
#else
          class Arg3Type, typename SizeType>
template <class DataType, class Arg1Type, class Arg2Type, class Arg3Type,
          typename SizeType>
typename StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type,
                        SizeType>::HostMirror
create_mirror_view(const StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type,
                                        SizeType>& input);
#endif

template <class DataType, class Arg1Type, class Arg2Type,
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
          typename SizeType, class Arg3Type>
typename StaticCrsGraph<DataType, Arg1Type, Arg2Type, SizeType,
                        Arg3Type>::HostMirror
create_mirror_view(const StaticCrsGraph<DataType, Arg1Type, Arg2Type, SizeType,
                                        Arg3Type>& input);
#else
          class Arg3Type, typename SizeType>
template <class DataType, class Arg1Type, class Arg2Type, class Arg3Type,
          typename SizeType>
typename StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type,
                        SizeType>::HostMirror
create_mirror(const StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type,
                                   SizeType>& input);
#endif

} // namespace Kokkos

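The declarations in this hunk pair with `create_staticcrsgraph`, which packs a ragged `std::vector<std::vector<...>>` into the `row_map`/`entries` representation shown earlier in the header. A sketch of that round trip, assuming the factory and the `numRows()` accessor keep their usual signatures:
````cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_StaticCrsGraph.hpp>
#include <vector>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // Adjacency of three vertices, given row by row.
    std::vector<std::vector<int> > adj = {{1, 2}, {0}, {0, 1, 2}};

    using graph_type =
        Kokkos::StaticCrsGraph<int, Kokkos::DefaultExecutionSpace>;
    graph_type graph =
        Kokkos::create_staticcrsgraph<graph_type>("adjacency", adj);

    // Mirror to the host for inspection, matching the declarations above.
    auto h_graph = Kokkos::create_mirror(graph);
    (void)h_graph.numRows();
  }
  Kokkos::finalize();
  return 0;
}
````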
@@ -481,8 +453,8 @@ namespace Impl {

template <class GraphType>
struct StaticCrsGraphMaximumEntry {
  typedef typename GraphType::execution_space execution_space;
  typedef typename GraphType::data_type value_type;
  using execution_space = typename GraphType::execution_space;
  using value_type = typename GraphType::data_type;

  const typename GraphType::entries_type entries;

@@ -505,22 +477,13 @@ struct StaticCrsGraphMaximumEntry {

} // namespace Impl

#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
template <class DataType, class Arg1Type, class Arg2Type, typename SizeType,
          class Arg3Type>
DataType maximum_entry(const StaticCrsGraph<DataType, Arg1Type, Arg2Type,
                                            SizeType, Arg3Type>& graph) {
  typedef StaticCrsGraph<DataType, Arg1Type, Arg2Type, SizeType, Arg3Type>
      GraphType;
#else
template <class DataType, class Arg1Type, class Arg2Type, class Arg3Type,
          typename SizeType>
DataType maximum_entry(const StaticCrsGraph<DataType, Arg1Type, Arg2Type,
                                            Arg3Type, SizeType>& graph) {
  typedef StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type, SizeType>
      GraphType;
#endif
  typedef Impl::StaticCrsGraphMaximumEntry<GraphType> FunctorType;
  using GraphType =
      StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type, SizeType>;
  using FunctorType = Impl::StaticCrsGraphMaximumEntry<GraphType>;

  DataType result = 0;
  Kokkos::parallel_reduce("Kokkos::maximum_entry", graph.entries.extent(0),
@ -66,7 +66,7 @@
|
|||
|
||||
namespace Kokkos {
|
||||
|
||||
enum { UnorderedMapInvalidIndex = ~0u };
|
||||
enum : unsigned { UnorderedMapInvalidIndex = ~0u };
|
||||
|
||||
/// \brief First element of the return value of UnorderedMap::insert().
|
||||
///
|
||||
|
@ -84,7 +84,7 @@ enum { UnorderedMapInvalidIndex = ~0u };
|
|||
|
||||
class UnorderedMapInsertResult {
|
||||
private:
|
||||
enum Status {
|
||||
enum Status : uint32_t {
|
||||
SUCCESS = 1u << 31,
|
||||
EXISTING = 1u << 30,
|
||||
FREED_EXISTING = 1u << 29,
|
||||
|
@ -206,42 +206,40 @@ template <typename Key, typename Value,
|
|||
pod_equal_to<typename std::remove_const<Key>::type> >
|
||||
class UnorderedMap {
|
||||
private:
|
||||
typedef typename ViewTraits<Key, Device, void, void>::host_mirror_space
|
||||
host_mirror_space;
|
||||
using host_mirror_space =
|
||||
typename ViewTraits<Key, Device, void, void>::host_mirror_space;
|
||||
|
||||
public:
|
||||
//! \name Public types and constants
|
||||
//@{
|
||||
|
||||
// key_types
|
||||
typedef Key declared_key_type;
|
||||
typedef typename std::remove_const<declared_key_type>::type key_type;
|
||||
typedef typename std::add_const<key_type>::type const_key_type;
|
||||
using declared_key_type = Key;
|
||||
using key_type = typename std::remove_const<declared_key_type>::type;
|
||||
using const_key_type = typename std::add_const<key_type>::type;
|
||||
|
||||
// value_types
|
||||
typedef Value declared_value_type;
|
||||
typedef typename std::remove_const<declared_value_type>::type value_type;
|
||||
typedef typename std::add_const<value_type>::type const_value_type;
|
||||
using declared_value_type = Value;
|
||||
using value_type = typename std::remove_const<declared_value_type>::type;
|
||||
using const_value_type = typename std::add_const<value_type>::type;
|
||||
|
||||
typedef Device device_type;
|
||||
typedef typename Device::execution_space execution_space;
|
||||
typedef Hasher hasher_type;
|
||||
typedef EqualTo equal_to_type;
|
||||
typedef uint32_t size_type;
|
||||
using device_type = Device;
|
||||
using execution_space = typename Device::execution_space;
|
||||
using hasher_type = Hasher;
|
||||
using equal_to_type = EqualTo;
|
||||
using size_type = uint32_t;
|
||||
|
||||
// map_types
|
||||
typedef UnorderedMap<declared_key_type, declared_value_type, device_type,
|
||||
hasher_type, equal_to_type>
|
||||
declared_map_type;
|
||||
typedef UnorderedMap<key_type, value_type, device_type, hasher_type,
|
||||
equal_to_type>
|
||||
insertable_map_type;
|
||||
typedef UnorderedMap<const_key_type, value_type, device_type, hasher_type,
|
||||
equal_to_type>
|
||||
modifiable_map_type;
|
||||
typedef UnorderedMap<const_key_type, const_value_type, device_type,
|
||||
hasher_type, equal_to_type>
|
||||
const_map_type;
|
||||
using declared_map_type =
|
||||
UnorderedMap<declared_key_type, declared_value_type, device_type,
|
||||
hasher_type, equal_to_type>;
|
||||
using insertable_map_type = UnorderedMap<key_type, value_type, device_type,
|
||||
hasher_type, equal_to_type>;
|
||||
using modifiable_map_type =
|
||||
UnorderedMap<const_key_type, value_type, device_type, hasher_type,
|
||||
equal_to_type>;
|
||||
using const_map_type = UnorderedMap<const_key_type, const_value_type,
|
||||
device_type, hasher_type, equal_to_type>;
|
||||
|
||||
static const bool is_set = std::is_same<void, value_type>::value;
|
||||
static const bool has_const_key =
|
||||
|
@ -254,43 +252,42 @@ class UnorderedMap {
|
|||
static const bool is_modifiable_map = has_const_key && !has_const_value;
|
||||
static const bool is_const_map = has_const_key && has_const_value;
|
||||
|
||||
typedef UnorderedMapInsertResult insert_result;
|
||||
using insert_result = UnorderedMapInsertResult;
|
||||
|
||||
typedef UnorderedMap<Key, Value, host_mirror_space, Hasher, EqualTo>
|
||||
HostMirror;
|
||||
using HostMirror =
|
||||
UnorderedMap<Key, Value, host_mirror_space, Hasher, EqualTo>;
|
||||
|
||||
typedef Impl::UnorderedMapHistogram<const_map_type> histogram_type;
|
||||
using histogram_type = Impl::UnorderedMapHistogram<const_map_type>;
|
||||
|
||||
//@}
|
||||
|
||||
private:
|
||||
enum { invalid_index = ~static_cast<size_type>(0) };
|
||||
enum : size_type { invalid_index = ~static_cast<size_type>(0) };
|
||||
|
||||
typedef typename Impl::if_c<is_set, int, declared_value_type>::type
|
||||
impl_value_type;
|
||||
using impl_value_type =
|
||||
typename Impl::if_c<is_set, int, declared_value_type>::type;
|
||||
|
||||
typedef typename Impl::if_c<
|
||||
using key_type_view = typename Impl::if_c<
|
||||
is_insertable_map, View<key_type *, device_type>,
|
||||
View<const key_type *, device_type, MemoryTraits<RandomAccess> > >::type
|
||||
key_type_view;
|
||||
View<const key_type *, device_type, MemoryTraits<RandomAccess> > >::type;
|
||||
|
||||
typedef typename Impl::if_c<is_insertable_map || is_modifiable_map,
|
||||
View<impl_value_type *, device_type>,
|
||||
View<const impl_value_type *, device_type,
|
||||
MemoryTraits<RandomAccess> > >::type
|
||||
value_type_view;
|
||||
using value_type_view =
|
||||
typename Impl::if_c<is_insertable_map || is_modifiable_map,
|
||||
View<impl_value_type *, device_type>,
|
||||
View<const impl_value_type *, device_type,
|
||||
MemoryTraits<RandomAccess> > >::type;
|
||||
|
||||
typedef typename Impl::if_c<
|
||||
using size_type_view = typename Impl::if_c<
|
||||
is_insertable_map, View<size_type *, device_type>,
|
||||
View<const size_type *, device_type, MemoryTraits<RandomAccess> > >::type
|
||||
size_type_view;
|
||||
View<const size_type *, device_type, MemoryTraits<RandomAccess> > >::type;
|
||||
|
||||
typedef typename Impl::if_c<is_insertable_map, Bitset<execution_space>,
|
||||
ConstBitset<execution_space> >::type bitset_type;
|
||||
using bitset_type =
|
||||
typename Impl::if_c<is_insertable_map, Bitset<execution_space>,
|
||||
ConstBitset<execution_space> >::type;
|
||||
|
||||
enum { modified_idx = 0, erasable_idx = 1, failed_insert_idx = 2 };
|
||||
enum { num_scalars = 3 };
|
||||
typedef View<int[num_scalars], LayoutLeft, device_type> scalars_view;
|
||||
using scalars_view = View<int[num_scalars], LayoutLeft, device_type>;
|
||||
|
||||
public:
|
||||
//! \name Public member functions
|
||||
|
@ -353,6 +350,11 @@ class UnorderedMap {
|
|||
{ Kokkos::deep_copy(m_scalars, 0); }
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const {
|
||||
return (m_keys.is_allocated() && m_values.is_allocated() &&
|
||||
m_scalars.is_allocated());
|
||||
}
|
||||
|
||||
/// \brief Change the capacity of the the map
|
||||
///
|
||||
/// If there are no failed inserts the current size of the map will
|
||||
|
@ -742,9 +744,9 @@ class UnorderedMap {
|
|||
|
||||
Kokkos::deep_copy(tmp.m_available_indexes, src.m_available_indexes);
|
||||
|
||||
typedef Kokkos::Impl::DeepCopy<typename device_type::memory_space,
|
||||
typename SDevice::memory_space>
|
||||
raw_deep_copy;
|
||||
using raw_deep_copy =
|
||||
Kokkos::Impl::DeepCopy<typename device_type::memory_space,
|
||||
typename SDevice::memory_space>;
|
||||
|
||||
raw_deep_copy(tmp.m_hash_lists.data(), src.m_hash_lists.data(),
|
||||
sizeof(size_type) * src.m_hash_lists.extent(0));
|
||||
|
@ -768,25 +770,25 @@ class UnorderedMap {
|
|||
bool modified() const { return get_flag(modified_idx); }
|
||||
|
||||
void set_flag(int flag) const {
|
||||
typedef Kokkos::Impl::DeepCopy<typename device_type::memory_space,
|
||||
Kokkos::HostSpace>
|
||||
raw_deep_copy;
|
||||
using raw_deep_copy =
|
||||
Kokkos::Impl::DeepCopy<typename device_type::memory_space,
|
||||
Kokkos::HostSpace>;
|
||||
const int true_ = true;
|
||||
raw_deep_copy(m_scalars.data() + flag, &true_, sizeof(int));
|
||||
}
|
||||
|
||||
void reset_flag(int flag) const {
|
||||
typedef Kokkos::Impl::DeepCopy<typename device_type::memory_space,
|
||||
Kokkos::HostSpace>
|
||||
raw_deep_copy;
|
||||
using raw_deep_copy =
|
||||
Kokkos::Impl::DeepCopy<typename device_type::memory_space,
|
||||
Kokkos::HostSpace>;
|
||||
const int false_ = false;
|
||||
raw_deep_copy(m_scalars.data() + flag, &false_, sizeof(int));
|
||||
}
|
||||
|
||||
bool get_flag(int flag) const {
|
||||
typedef Kokkos::Impl::DeepCopy<Kokkos::HostSpace,
|
||||
typename device_type::memory_space>
|
||||
raw_deep_copy;
|
||||
using raw_deep_copy =
|
||||
Kokkos::Impl::DeepCopy<Kokkos::HostSpace,
|
||||
typename device_type::memory_space>;
|
||||
int result = false;
|
||||
raw_deep_copy(&result, m_scalars.data() + flag, sizeof(int));
|
||||
return result;
|
||||
|
|
|
@ -58,19 +58,19 @@ namespace Kokkos {
|
|||
template <class Scalar, class Arg1Type = void>
|
||||
class vector : public DualView<Scalar*, LayoutLeft, Arg1Type> {
|
||||
public:
|
||||
typedef Scalar value_type;
|
||||
typedef Scalar* pointer;
|
||||
typedef const Scalar* const_pointer;
|
||||
typedef Scalar& reference;
|
||||
typedef const Scalar& const_reference;
|
||||
typedef Scalar* iterator;
|
||||
typedef const Scalar* const_iterator;
|
||||
typedef size_t size_type;
|
||||
using value_type = Scalar;
|
||||
using pointer = Scalar*;
|
||||
using const_pointer = const Scalar*;
|
||||
using reference = Scalar&;
|
||||
using const_reference = const Scalar&;
|
||||
using iterator = Scalar*;
|
||||
using const_iterator = const Scalar*;
|
||||
using size_type = size_t;
|
||||
|
||||
private:
|
||||
size_t _size;
|
||||
float _extra_storage;
|
||||
typedef DualView<Scalar*, LayoutLeft, Arg1Type> DV;
|
||||
using DV = DualView<Scalar*, LayoutLeft, Arg1Type>;
|
||||
|
||||
public:
|
||||
#ifdef KOKKOS_ENABLE_CUDA_UVM
|
||||
|
@ -212,14 +212,17 @@ class vector : public DualView<Scalar*, LayoutLeft, Arg1Type> {
|
|||
return begin() + start;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const {
|
||||
return DV::is_allocated();
|
||||
}
|
||||
|
||||
size_type size() const { return _size; }
|
||||
size_type max_size() const { return 2000000000; }
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
size_type capacity() const { return DV::capacity(); }
|
||||
#endif
|
||||
size_type span() const { return DV::span(); }
|
||||
bool empty() const { return _size == 0; }
|
||||
|
||||
pointer data() const { return DV::h_view.data(); }
|
||||
|
||||
iterator begin() const { return DV::h_view.data(); }
|
||||
|
||||
iterator end() const {
|
||||
|
@ -310,7 +313,7 @@ class vector : public DualView<Scalar*, LayoutLeft, Arg1Type> {
|
|||
|
||||
public:
|
||||
struct set_functor {
|
||||
typedef typename DV::t_dev::execution_space execution_space;
|
||||
using execution_space = typename DV::t_dev::execution_space;
|
||||
typename DV::t_dev _data;
|
||||
Scalar _val;
|
||||
|
||||
|
@ -321,7 +324,7 @@ class vector : public DualView<Scalar*, LayoutLeft, Arg1Type> {
|
|||
};
|
||||
|
||||
struct set_functor_host {
|
||||
typedef typename DV::t_host::execution_space execution_space;
|
||||
using execution_space = typename DV::t_host::execution_space;
|
||||
typename DV::t_host _data;
|
||||
Scalar _val;
|
||||
|
||||
|
|
|
@ -65,11 +65,11 @@ unsigned rotate_right(unsigned i, int r) {
|
|||
|
||||
template <typename Bitset>
|
||||
struct BitsetCount {
|
||||
typedef Bitset bitset_type;
|
||||
typedef
|
||||
typename bitset_type::execution_space::execution_space execution_space;
|
||||
typedef typename bitset_type::size_type size_type;
|
||||
typedef size_type value_type;
|
||||
using bitset_type = Bitset;
|
||||
using execution_space =
|
||||
typename bitset_type::execution_space::execution_space;
|
||||
using size_type = typename bitset_type::size_type;
|
||||
using value_type = size_type;
|
||||
|
||||
bitset_type m_bitset;
|
||||
|
||||
|
|
|
@ -140,10 +140,10 @@ uint32_t MurmurHash3_x86_32(const void* key, int len, uint32_t seed) {
|
|||
template <typename T>
|
||||
KOKKOS_FORCEINLINE_FUNCTION bool bitwise_equal(T const* const a_ptr,
|
||||
T const* const b_ptr) {
|
||||
typedef uint64_t KOKKOS_IMPL_MAY_ALIAS T64;
|
||||
typedef uint32_t KOKKOS_IMPL_MAY_ALIAS T32;
|
||||
typedef uint16_t KOKKOS_IMPL_MAY_ALIAS T16;
|
||||
typedef uint8_t KOKKOS_IMPL_MAY_ALIAS T8;
|
||||
typedef uint64_t KOKKOS_IMPL_MAY_ALIAS T64; // NOLINT(modernize-use-using)
|
||||
typedef uint32_t KOKKOS_IMPL_MAY_ALIAS T32; // NOLINT(modernize-use-using)
|
||||
typedef uint16_t KOKKOS_IMPL_MAY_ALIAS T16; // NOLINT(modernize-use-using)
|
||||
typedef uint8_t KOKKOS_IMPL_MAY_ALIAS T8; // NOLINT(modernize-use-using)
|
||||
|
||||
enum {
|
||||
NUM_8 = sizeof(T),
|
||||
|
|
|
@ -50,19 +50,6 @@
|
|||
|
||||
namespace Kokkos {
|
||||
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
template <class DataType, class Arg1Type, class Arg2Type, typename SizeType,
|
||||
class Arg3Type>
|
||||
inline typename StaticCrsGraph<DataType, Arg1Type, Arg2Type, SizeType,
|
||||
Arg3Type>::HostMirror
|
||||
create_mirror_view(
|
||||
const StaticCrsGraph<DataType, Arg1Type, Arg2Type, SizeType, Arg3Type>&
|
||||
view,
|
||||
typename std::enable_if<ViewTraits<DataType, Arg1Type, Arg2Type,
|
||||
Arg3Type>::is_hostspace>::type* = 0) {
|
||||
return view;
|
||||
}
|
||||
#else
|
||||
template <class DataType, class Arg1Type, class Arg2Type, class Arg3Type,
|
||||
typename SizeType>
|
||||
inline typename StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type,
|
||||
|
@ -74,20 +61,7 @@ create_mirror_view(
|
|||
Arg3Type>::is_hostspace>::type* = 0) {
|
||||
return view;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
template <class DataType, class Arg1Type, class Arg2Type, typename SizeType,
|
||||
class Arg3Type>
|
||||
inline typename StaticCrsGraph<DataType, Arg1Type, Arg2Type, SizeType,
|
||||
Arg3Type>::HostMirror
|
||||
create_mirror(const StaticCrsGraph<DataType, Arg1Type, Arg2Type, SizeType,
|
||||
Arg3Type>& view) {
|
||||
// Force copy:
|
||||
// typedef Impl::ViewAssignment< Impl::ViewDefault > alloc ; // unused
|
||||
typedef StaticCrsGraph<DataType, Arg1Type, Arg2Type, SizeType, Arg3Type>
|
||||
staticcrsgraph_type;
|
||||
#else
|
||||
template <class DataType, class Arg1Type, class Arg2Type, class Arg3Type,
|
||||
typename SizeType>
|
||||
inline typename StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type,
|
||||
|
@ -95,10 +69,9 @@ inline typename StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type,
|
|||
create_mirror(const StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type,
|
||||
SizeType>& view) {
|
||||
// Force copy:
|
||||
// typedef Impl::ViewAssignment< Impl::ViewDefault > alloc ; // unused
|
||||
typedef StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type, SizeType>
|
||||
staticcrsgraph_type;
|
||||
#endif
|
||||
// using alloc = Impl::ViewAssignment<Impl::ViewDefault>; // unused
|
||||
using staticcrsgraph_type =
|
||||
StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type, SizeType>;
|
||||
|
||||
typename staticcrsgraph_type::HostMirror tmp;
|
||||
typename staticcrsgraph_type::row_map_type::HostMirror tmp_row_map =
|
||||
|
@ -120,17 +93,6 @@ create_mirror(const StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type,
|
|||
return tmp;
|
||||
}
|
||||
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
template <class DataType, class Arg1Type, class Arg2Type, typename SizeType,
|
||||
class Arg3Type>
|
||||
inline typename StaticCrsGraph<DataType, Arg1Type, Arg2Type, SizeType,
|
||||
Arg3Type>::HostMirror
|
||||
create_mirror_view(
|
||||
const StaticCrsGraph<DataType, Arg1Type, Arg2Type, SizeType, Arg3Type>&
|
||||
view,
|
||||
typename std::enable_if<!ViewTraits<DataType, Arg1Type, Arg2Type,
|
||||
Arg3Type>::is_hostspace>::type* = 0)
|
||||
#else
|
||||
template <class DataType, class Arg1Type, class Arg2Type, class Arg3Type,
|
||||
typename SizeType>
|
||||
inline typename StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type,
|
||||
|
@ -139,9 +101,7 @@ create_mirror_view(
|
|||
const StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type, SizeType>&
|
||||
view,
|
||||
typename std::enable_if<!ViewTraits<DataType, Arg1Type, Arg2Type,
|
||||
Arg3Type>::is_hostspace>::type* = 0)
|
||||
#endif
|
||||
{
|
||||
Arg3Type>::is_hostspace>::type* = 0) {
|
||||
return create_mirror(view);
|
||||
}
|
||||
} // namespace Kokkos
|
||||
|
@ -154,16 +114,15 @@ namespace Kokkos {
|
|||
template <class StaticCrsGraphType, class InputSizeType>
|
||||
inline typename StaticCrsGraphType::staticcrsgraph_type create_staticcrsgraph(
|
||||
const std::string& label, const std::vector<InputSizeType>& input) {
|
||||
typedef StaticCrsGraphType output_type;
|
||||
// typedef std::vector< InputSizeType > input_type ; // unused
|
||||
using output_type = StaticCrsGraphType;
|
||||
// using input_type = std::vector<InputSizeType>; // unused
|
||||
|
||||
typedef typename output_type::entries_type entries_type;
|
||||
using entries_type = typename output_type::entries_type;
|
||||
|
||||
typedef View<typename output_type::size_type[],
|
||||
typename output_type::array_layout,
|
||||
typename output_type::execution_space,
|
||||
typename output_type::memory_traits>
|
||||
work_type;
|
||||
using work_type = View<typename output_type::size_type[],
|
||||
typename output_type::array_layout,
|
||||
typename output_type::execution_space,
|
||||
typename output_type::memory_traits>;
|
||||
|
||||
output_type output;
|
||||
|
||||
|
@ -197,16 +156,15 @@ template <class StaticCrsGraphType, class InputSizeType>
|
|||
inline typename StaticCrsGraphType::staticcrsgraph_type create_staticcrsgraph(
|
||||
const std::string& label,
|
||||
const std::vector<std::vector<InputSizeType> >& input) {
|
||||
typedef StaticCrsGraphType output_type;
|
||||
typedef typename output_type::entries_type entries_type;
|
||||
using output_type = StaticCrsGraphType;
|
||||
using entries_type = typename output_type::entries_type;
|
||||
|
||||
static_assert(entries_type::rank == 1, "Graph entries view must be rank one");
|
||||
|
||||
typedef View<typename output_type::size_type[],
|
||||
typename output_type::array_layout,
|
||||
typename output_type::execution_space,
|
||||
typename output_type::memory_traits>
|
||||
work_type;
|
||||
using work_type = View<typename output_type::size_type[],
|
||||
typename output_type::array_layout,
|
||||
typename output_type::execution_space,
|
||||
typename output_type::memory_traits>;
|
||||
|
||||
output_type output;
|
||||
|
||||
|
|
|
@ -60,10 +60,10 @@ uint32_t find_hash_size(uint32_t size);
|
|||
|
||||
template <typename Map>
|
||||
struct UnorderedMapRehash {
|
||||
typedef Map map_type;
|
||||
typedef typename map_type::const_map_type const_map_type;
|
||||
typedef typename map_type::execution_space execution_space;
|
||||
typedef typename map_type::size_type size_type;
|
||||
using map_type = Map;
|
||||
using const_map_type = typename map_type::const_map_type;
|
||||
using execution_space = typename map_type::execution_space;
|
||||
using size_type = typename map_type::size_type;
|
||||
|
||||
map_type m_dst;
|
||||
const_map_type m_src;
|
||||
|
@ -84,11 +84,11 @@ struct UnorderedMapRehash {
|
|||
|
||||
template <typename UMap>
|
||||
struct UnorderedMapErase {
|
||||
typedef UMap map_type;
|
||||
typedef typename map_type::execution_space execution_space;
|
||||
typedef typename map_type::size_type size_type;
|
||||
typedef typename map_type::key_type key_type;
|
||||
typedef typename map_type::impl_value_type value_type;
|
||||
using map_type = UMap;
|
||||
using execution_space = typename map_type::execution_space;
|
||||
using size_type = typename map_type::size_type;
|
||||
using key_type = typename map_type::key_type;
|
||||
using value_type = typename map_type::impl_value_type;
|
||||
|
||||
map_type m_map;
|
||||
|
||||
|
@ -140,12 +140,12 @@ struct UnorderedMapErase {
|
|||
|
||||
template <typename UMap>
|
||||
struct UnorderedMapHistogram {
|
||||
typedef UMap map_type;
|
||||
typedef typename map_type::execution_space execution_space;
|
||||
typedef typename map_type::size_type size_type;
|
||||
using map_type = UMap;
|
||||
using execution_space = typename map_type::execution_space;
|
||||
using size_type = typename map_type::size_type;
|
||||
|
||||
typedef View<int[100], execution_space> histogram_view;
|
||||
typedef typename histogram_view::HostMirror host_histogram_view;
|
||||
using histogram_view = View<int[100], execution_space>;
|
||||
using host_histogram_view = typename histogram_view::HostMirror;
|
||||
|
||||
map_type m_map;
|
||||
histogram_view m_length;
|
||||
|
@ -230,9 +230,9 @@ struct UnorderedMapHistogram {
|
|||
|
||||
template <typename UMap>
|
||||
struct UnorderedMapPrint {
|
||||
typedef UMap map_type;
|
||||
typedef typename map_type::execution_space execution_space;
|
||||
typedef typename map_type::size_type size_type;
|
||||
using map_type = UMap;
|
||||
using execution_space = typename map_type::execution_space;
|
||||
using size_type = typename map_type::size_type;
|
||||
|
||||
map_type m_map;
|
||||
|
||||
|
|
|
@ -47,6 +47,7 @@
|
|||
#include <iostream>
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <Kokkos_Bitset.hpp>
|
||||
#include <array>
|
||||
|
||||
namespace Test {
|
||||
|
||||
|
@ -54,9 +55,9 @@ namespace Impl {
|
|||
|
||||
template <typename Bitset, bool Set>
|
||||
struct TestBitset {
|
||||
typedef Bitset bitset_type;
|
||||
typedef typename bitset_type::execution_space execution_space;
|
||||
typedef uint32_t value_type;
|
||||
using bitset_type = Bitset;
|
||||
using execution_space = typename bitset_type::execution_space;
|
||||
using value_type = uint32_t;
|
||||
|
||||
bitset_type m_bitset;
|
||||
|
||||
|
@ -95,9 +96,9 @@ struct TestBitset {
|
|||
|
||||
template <typename Bitset>
|
||||
struct TestBitsetTest {
|
||||
typedef Bitset bitset_type;
|
||||
typedef typename bitset_type::execution_space execution_space;
|
||||
typedef uint32_t value_type;
|
||||
using bitset_type = Bitset;
|
||||
using execution_space = typename bitset_type::execution_space;
|
||||
using value_type = uint32_t;
|
||||
|
||||
bitset_type m_bitset;
|
||||
|
||||
|
@ -127,9 +128,9 @@ struct TestBitsetTest {
|
|||
|
||||
template <typename Bitset, bool Set>
|
||||
struct TestBitsetAny {
|
||||
typedef Bitset bitset_type;
|
||||
typedef typename bitset_type::execution_space execution_space;
|
||||
typedef uint32_t value_type;
|
||||
using bitset_type = Bitset;
|
||||
using execution_space = typename bitset_type::execution_space;
|
||||
using value_type = uint32_t;
|
||||
|
||||
bitset_type m_bitset;
|
||||
|
||||
|
@ -181,16 +182,30 @@ struct TestBitsetAny {
|
|||
|
||||
template <typename Device>
|
||||
void test_bitset() {
|
||||
typedef Kokkos::Bitset<Device> bitset_type;
|
||||
typedef Kokkos::ConstBitset<Device> const_bitset_type;
|
||||
using bitset_type = Kokkos::Bitset<Device>;
|
||||
using const_bitset_type = Kokkos::ConstBitset<Device>;
|
||||
|
||||
// unsigned test_sizes[] = { 0u, 1000u, 1u<<14, 1u<<16, 10000001 };
|
||||
unsigned test_sizes[] = {1000u, 1u << 14, 1u << 16, 10000001};
|
||||
{
|
||||
unsigned ts = 100u;
|
||||
bitset_type b1;
|
||||
ASSERT_TRUE(b1.is_allocated());
|
||||
|
||||
for (int i = 0, end = sizeof(test_sizes) / sizeof(unsigned); i < end; ++i) {
|
||||
b1 = bitset_type(ts);
|
||||
bitset_type b2(b1);
|
||||
bitset_type b3(ts);
|
||||
|
||||
ASSERT_TRUE(b1.is_allocated());
|
||||
ASSERT_TRUE(b2.is_allocated());
|
||||
ASSERT_TRUE(b3.is_allocated());
|
||||
}
|
||||
|
||||
std::array<unsigned, 7> test_sizes = {
|
||||
{0u, 10u, 100u, 1000u, 1u << 14, 1u << 16, 10000001}};
|
||||
|
||||
for (const auto test_size : test_sizes) {
|
||||
// std::cout << "Bitset " << test_sizes[i] << std::endl;
|
||||
|
||||
bitset_type bitset(test_sizes[i]);
|
||||
bitset_type bitset(test_size);
|
||||
|
||||
// std::cout << " Check initial count " << std::endl;
|
||||
// nothing should be set
|
||||
|
@ -253,10 +268,7 @@ void test_bitset() {
|
|||
}
|
||||
}
|
||||
|
||||
// FIXME_HIP deadlock
|
||||
#ifndef KOKKOS_ENABLE_HIP
|
||||
TEST(TEST_CATEGORY, bitset) { test_bitset<TEST_EXECSPACE>(); }
|
||||
#endif
|
||||
} // namespace Test
|
||||
|
||||
#endif // KOKKOS_TEST_BITSET_HPP
|
||||
|
|
|
@ -55,13 +55,45 @@
|
|||
namespace Test {
|
||||
|
||||
namespace Impl {
|
||||
template <typename Scalar, class Device>
|
||||
struct test_dualview_alloc {
|
||||
using scalar_type = Scalar;
|
||||
using execution_space = Device;
|
||||
|
||||
template <typename ViewType>
|
||||
bool run_me(unsigned int n, unsigned int m) {
|
||||
if (n < 10) n = 10;
|
||||
if (m < 3) m = 3;
|
||||
|
||||
{
|
||||
ViewType b1;
|
||||
if (b1.is_allocated() == true) return false;
|
||||
|
||||
b1 = ViewType("B1", n, m);
|
||||
ViewType b2(b1);
|
||||
ViewType b3("B3", n, m);
|
||||
|
||||
if (b1.is_allocated() == false) return false;
|
||||
if (b2.is_allocated() == false) return false;
|
||||
if (b3.is_allocated() == false) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool result = false;
|
||||
|
||||
test_dualview_alloc(unsigned int size) {
|
||||
result = run_me<Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device> >(
|
||||
size, 3);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Scalar, class Device>
|
||||
struct test_dualview_combinations {
|
||||
typedef test_dualview_combinations<Scalar, Device> self_type;
|
||||
using self_type = test_dualview_combinations<Scalar, Device>;
|
||||
|
||||
typedef Scalar scalar_type;
|
||||
typedef Device execution_space;
|
||||
using scalar_type = Scalar;
|
||||
using execution_space = Device;
|
||||
|
||||
Scalar reference;
|
||||
Scalar result;
|
||||
|
@ -110,7 +142,7 @@ struct test_dualview_combinations {
|
|||
|
||||
template <typename Scalar, class ViewType>
|
||||
struct SumViewEntriesFunctor {
|
||||
typedef Scalar value_type;
|
||||
using value_type = Scalar;
|
||||
|
||||
ViewType fv;
|
||||
|
||||
|
@ -126,8 +158,8 @@ struct SumViewEntriesFunctor {
|
|||
|
||||
template <typename Scalar, class Device>
|
||||
struct test_dual_view_deep_copy {
|
||||
typedef Scalar scalar_type;
|
||||
typedef Device execution_space;
|
||||
using scalar_type = Scalar;
|
||||
using execution_space = Device;
|
||||
|
||||
template <typename ViewType>
|
||||
void run_me(int n, const int m, const bool use_templ_sync) {
|
||||
|
@ -153,8 +185,8 @@ struct test_dual_view_deep_copy {
|
|||
// Check device view is initialized as expected
|
||||
scalar_type a_d_sum = 0;
|
||||
// Execute on the execution_space associated with t_dev's memory space
|
||||
typedef typename ViewType::t_dev::memory_space::execution_space
|
||||
t_dev_exec_space;
|
||||
using t_dev_exec_space =
|
||||
typename ViewType::t_dev::memory_space::execution_space;
|
||||
Kokkos::parallel_reduce(
|
||||
Kokkos::RangePolicy<t_dev_exec_space>(0, n),
|
||||
SumViewEntriesFunctor<scalar_type, typename ViewType::t_dev>(a.d_view),
|
||||
|
@ -220,8 +252,8 @@ struct test_dual_view_deep_copy {
|
|||
|
||||
template <typename Scalar, class Device>
|
||||
struct test_dualview_resize {
|
||||
typedef Scalar scalar_type;
|
||||
typedef Device execution_space;
|
||||
using scalar_type = Scalar;
|
||||
using execution_space = Device;
|
||||
|
||||
template <typename ViewType>
|
||||
void run_me() {
|
||||
|
@ -244,8 +276,8 @@ struct test_dualview_resize {
|
|||
// Check device view is initialized as expected
|
||||
scalar_type a_d_sum = 0;
|
||||
// Execute on the execution_space associated with t_dev's memory space
|
||||
typedef typename ViewType::t_dev::memory_space::execution_space
|
||||
t_dev_exec_space;
|
||||
using t_dev_exec_space =
|
||||
typename ViewType::t_dev::memory_space::execution_space;
|
||||
Kokkos::parallel_reduce(
|
||||
Kokkos::RangePolicy<t_dev_exec_space>(0, a.d_view.extent(0)),
|
||||
SumViewEntriesFunctor<scalar_type, typename ViewType::t_dev>(a.d_view),
|
||||
|
@ -274,8 +306,8 @@ struct test_dualview_resize {
|
|||
// Check device view is initialized as expected
|
||||
a_d_sum = 0;
|
||||
// Execute on the execution_space associated with t_dev's memory space
|
||||
typedef typename ViewType::t_dev::memory_space::execution_space
|
||||
t_dev_exec_space;
|
||||
using t_dev_exec_space =
|
||||
typename ViewType::t_dev::memory_space::execution_space;
|
||||
Kokkos::parallel_reduce(
|
||||
Kokkos::RangePolicy<t_dev_exec_space>(0, a.d_view.extent(0)),
|
||||
SumViewEntriesFunctor<scalar_type, typename ViewType::t_dev>(a.d_view),
|
||||
|
@ -301,8 +333,8 @@ struct test_dualview_resize {
|
|||
|
||||
template <typename Scalar, class Device>
|
||||
struct test_dualview_realloc {
|
||||
typedef Scalar scalar_type;
|
||||
typedef Device execution_space;
|
||||
using scalar_type = Scalar;
|
||||
using execution_space = Device;
|
||||
|
||||
template <typename ViewType>
|
||||
void run_me() {
|
||||
|
@ -319,8 +351,8 @@ struct test_dualview_realloc {
|
|||
// Check device view is initialized as expected
|
||||
scalar_type a_d_sum = 0;
|
||||
// Execute on the execution_space associated with t_dev's memory space
|
||||
typedef typename ViewType::t_dev::memory_space::execution_space
|
||||
t_dev_exec_space;
|
||||
using t_dev_exec_space =
|
||||
typename ViewType::t_dev::memory_space::execution_space;
|
||||
Kokkos::parallel_reduce(
|
||||
Kokkos::RangePolicy<t_dev_exec_space>(0, a.d_view.extent(0)),
|
||||
SumViewEntriesFunctor<scalar_type, typename ViewType::t_dev>(a.d_view),
|
||||
|
@ -351,6 +383,12 @@ void test_dualview_combinations(unsigned int size, bool with_init) {
|
|||
ASSERT_EQ(test.result, 0);
|
||||
}
|
||||
|
||||
template <typename Scalar, typename Device>
|
||||
void test_dualview_alloc(unsigned int size) {
|
||||
Impl::test_dualview_alloc<Scalar, Device> test(size);
|
||||
ASSERT_TRUE(test.result);
|
||||
}
|
||||
|
||||
template <typename Scalar, typename Device>
|
||||
void test_dualview_deep_copy() {
|
||||
Impl::test_dual_view_deep_copy<Scalar, Device>();
|
||||
|
@ -370,6 +408,10 @@ TEST(TEST_CATEGORY, dualview_combination) {
|
|||
test_dualview_combinations<int, TEST_EXECSPACE>(10, true);
|
||||
}
|
||||
|
||||
TEST(TEST_CATEGORY, dualview_alloc) {
|
||||
test_dualview_alloc<int, TEST_EXECSPACE>(10);
|
||||
}
|
||||
|
||||
TEST(TEST_CATEGORY, dualview_combinations_without_init) {
|
||||
test_dualview_combinations<int, TEST_EXECSPACE>(10, false);
|
||||
}
|
||||
|
|
|
@ -68,12 +68,12 @@ size_t allocation_count(const Kokkos::DynRankView<T, P...>& view) {
|
|||
|
||||
template <typename T, class DeviceType>
|
||||
struct TestViewOperator {
|
||||
typedef DeviceType execution_space;
|
||||
using execution_space = DeviceType;
|
||||
|
||||
static const unsigned N = 100;
|
||||
static const unsigned D = 3;
|
||||
|
||||
typedef Kokkos::DynRankView<T, execution_space> view_type;
|
||||
using view_type = Kokkos::DynRankView<T, execution_space>;
|
||||
|
||||
const view_type v1;
|
||||
const view_type v2;
|
||||
|
@ -101,11 +101,11 @@ struct TestViewOperator_LeftAndRight;
|
|||
|
||||
template <class DataType, class DeviceType>
|
||||
struct TestViewOperator_LeftAndRight<DataType, DeviceType, 7> {
|
||||
typedef DeviceType execution_space;
|
||||
typedef typename execution_space::memory_space memory_space;
|
||||
typedef typename execution_space::size_type size_type;
|
||||
using execution_space = DeviceType;
using memory_space = typename execution_space::memory_space;
using size_type = typename execution_space::size_type;

typedef int value_type;
using value_type = int;

KOKKOS_INLINE_FUNCTION
static void join(volatile value_type& update,
@@ -116,11 +116,11 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 7> {
KOKKOS_INLINE_FUNCTION
static void init(value_type& update) { update = 0; }

typedef Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>
left_view;
using left_view =
Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>;

typedef Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>
right_view;
using right_view =
Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>;

left_view left;
right_view right;
@@ -186,11 +186,11 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 7> {

template <class DataType, class DeviceType>
struct TestViewOperator_LeftAndRight<DataType, DeviceType, 6> {
typedef DeviceType execution_space;
typedef typename execution_space::memory_space memory_space;
typedef typename execution_space::size_type size_type;
using execution_space = DeviceType;
using memory_space = typename execution_space::memory_space;
using size_type = typename execution_space::size_type;

typedef int value_type;
using value_type = int;

KOKKOS_INLINE_FUNCTION
static void join(volatile value_type& update,
@@ -201,11 +201,11 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 6> {
KOKKOS_INLINE_FUNCTION
static void init(value_type& update) { update = 0; }

typedef Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>
left_view;
using left_view =
Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>;

typedef Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>
right_view;
using right_view =
Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>;

left_view left;
right_view right;
@@ -268,11 +268,11 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 6> {

template <class DataType, class DeviceType>
struct TestViewOperator_LeftAndRight<DataType, DeviceType, 5> {
typedef DeviceType execution_space;
typedef typename execution_space::memory_space memory_space;
typedef typename execution_space::size_type size_type;
using execution_space = DeviceType;
using memory_space = typename execution_space::memory_space;
using size_type = typename execution_space::size_type;

typedef int value_type;
using value_type = int;

KOKKOS_INLINE_FUNCTION
static void join(volatile value_type& update,
@@ -283,14 +283,14 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 5> {
KOKKOS_INLINE_FUNCTION
static void init(value_type& update) { update = 0; }

typedef Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>
left_view;
using left_view =
Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>;

typedef Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>
right_view;
using right_view =
Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>;

typedef Kokkos::DynRankView<DataType, Kokkos::LayoutStride, execution_space>
stride_view;
using stride_view =
Kokkos::DynRankView<DataType, Kokkos::LayoutStride, execution_space>;

left_view left;
right_view right;
@@ -363,11 +363,11 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 5> {

template <class DataType, class DeviceType>
struct TestViewOperator_LeftAndRight<DataType, DeviceType, 4> {
typedef DeviceType execution_space;
typedef typename execution_space::memory_space memory_space;
typedef typename execution_space::size_type size_type;
using execution_space = DeviceType;
using memory_space = typename execution_space::memory_space;
using size_type = typename execution_space::size_type;

typedef int value_type;
using value_type = int;

KOKKOS_INLINE_FUNCTION
static void join(volatile value_type& update,
@@ -378,11 +378,11 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 4> {
KOKKOS_INLINE_FUNCTION
static void init(value_type& update) { update = 0; }

typedef Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>
left_view;
using left_view =
Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>;

typedef Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>
right_view;
using right_view =
Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>;

left_view left;
right_view right;
@@ -438,11 +438,11 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 4> {

template <class DataType, class DeviceType>
struct TestViewOperator_LeftAndRight<DataType, DeviceType, 3> {
typedef DeviceType execution_space;
typedef typename execution_space::memory_space memory_space;
typedef typename execution_space::size_type size_type;
using execution_space = DeviceType;
using memory_space = typename execution_space::memory_space;
using size_type = typename execution_space::size_type;

typedef int value_type;
using value_type = int;

KOKKOS_INLINE_FUNCTION
static void join(volatile value_type& update,
@@ -453,14 +453,14 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 3> {
KOKKOS_INLINE_FUNCTION
static void init(value_type& update) { update = 0; }

typedef Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>
left_view;
using left_view =
Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>;

typedef Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>
right_view;
using right_view =
Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>;

typedef Kokkos::DynRankView<DataType, Kokkos::LayoutStride, execution_space>
stride_view;
using stride_view =
Kokkos::DynRankView<DataType, Kokkos::LayoutStride, execution_space>;

left_view left;
right_view right;
@@ -536,11 +536,11 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 3> {

template <class DataType, class DeviceType>
struct TestViewOperator_LeftAndRight<DataType, DeviceType, 2> {
typedef DeviceType execution_space;
typedef typename execution_space::memory_space memory_space;
typedef typename execution_space::size_type size_type;
using execution_space = DeviceType;
using memory_space = typename execution_space::memory_space;
using size_type = typename execution_space::size_type;

typedef int value_type;
using value_type = int;

KOKKOS_INLINE_FUNCTION
static void join(volatile value_type& update,
@@ -551,11 +551,11 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 2> {
KOKKOS_INLINE_FUNCTION
static void init(value_type& update) { update = 0; }

typedef Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>
left_view;
using left_view =
Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>;

typedef Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>
right_view;
using right_view =
Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>;

left_view left;
right_view right;
@@ -616,11 +616,11 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 2> {

template <class DataType, class DeviceType>
struct TestViewOperator_LeftAndRight<DataType, DeviceType, 1> {
typedef DeviceType execution_space;
typedef typename execution_space::memory_space memory_space;
typedef typename execution_space::size_type size_type;
using execution_space = DeviceType;
using memory_space = typename execution_space::memory_space;
using size_type = typename execution_space::size_type;

typedef int value_type;
using value_type = int;

KOKKOS_INLINE_FUNCTION
static void join(volatile value_type& update,
@@ -631,14 +631,14 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 1> {
KOKKOS_INLINE_FUNCTION
static void init(value_type& update) { update = 0; }

typedef Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>
left_view;
using left_view =
Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>;

typedef Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>
right_view;
using right_view =
Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>;

typedef Kokkos::DynRankView<DataType, Kokkos::LayoutStride, execution_space>
stride_view;
using stride_view =
Kokkos::DynRankView<DataType, Kokkos::LayoutStride, execution_space>;

left_view left;
right_view right;
@@ -689,22 +689,22 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 1> {

template <typename T, class DeviceType>
class TestDynViewAPI {
public:
typedef DeviceType device;
using device = DeviceType;

enum { N0 = 1000, N1 = 3, N2 = 5, N3 = 7 };

typedef Kokkos::DynRankView<T, device> dView0;
typedef Kokkos::DynRankView<const T, device> const_dView0;
using dView0 = Kokkos::DynRankView<T, device>;
using const_dView0 = Kokkos::DynRankView<const T, device>;

typedef Kokkos::DynRankView<T, device, Kokkos::MemoryUnmanaged>
dView0_unmanaged;
typedef typename dView0::host_mirror_space host_drv_space;
using dView0_unmanaged =
Kokkos::DynRankView<T, device, Kokkos::MemoryUnmanaged>;
using host_drv_space = typename dView0::host_mirror_space;

typedef Kokkos::View<T, device> View0;
typedef Kokkos::View<T*, device> View1;
typedef Kokkos::View<T*******, device> View7;
using View0 = Kokkos::View<T, device>;
using View1 = Kokkos::View<T*, device>;
using View7 = Kokkos::View<T*******, device>;

typedef typename View0::host_mirror_space host_view_space;
using host_view_space = typename View0::host_mirror_space;

static void run_tests() {
run_test_resize_realloc();
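Note: the recurring pattern in the hunks above (and through the rest of this commit) is a mechanical modernization of the test headers from `typedef` declarations to C++11 alias declarations; the old and new spellings are shown back to back in each hunk. The two forms are equivalent. A minimal illustrative sketch, with hypothetical names not taken from the diff:
````cpp
#include <Kokkos_DynRankView.hpp>

template <class DeviceType>
struct AliasExample {
  // Old style: the alias name comes last, after the (often long) type.
  typedef Kokkos::DynRankView<double, Kokkos::LayoutLeft, DeviceType>
      left_view_typedef;

  // New style: the alias name comes first, which reads better for long
  // template types and also extends naturally to alias templates.
  using left_view =
      Kokkos::DynRankView<double, Kokkos::LayoutLeft, DeviceType>;
};
````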
@@ -712,6 +712,7 @@ class TestDynViewAPI {
run_test_mirror_and_copy();
run_test_scalar();
run_test();
run_test_allocated();
run_test_const();
run_test_subview();
run_test_subview_strided();
@@ -750,8 +751,8 @@ class TestDynViewAPI {
}

static void run_test_mirror() {
typedef Kokkos::DynRankView<int, host_drv_space> view_type;
typedef typename view_type::HostMirror mirror_type;
using view_type = Kokkos::DynRankView<int, host_drv_space>;
using mirror_type = typename view_type::HostMirror;
view_type a("a");
mirror_type am = Kokkos::create_mirror_view(a);
mirror_type ax = Kokkos::create_mirror(a);
@@ -851,8 +852,8 @@ class TestDynViewAPI {
ASSERT_EQ(a_h.rank(), a_d.rank());
}
{
typedef Kokkos::DynRankView<int, Kokkos::LayoutStride, Kokkos::HostSpace>
view_stride_type;
using view_stride_type =
Kokkos::DynRankView<int, Kokkos::LayoutStride, Kokkos::HostSpace>;
unsigned order[] = {6, 5, 4, 3, 2, 1, 0},
dimen[] = {N0, N1, N2, 2, 2, 2, 2}; // LayoutRight equivalent
view_stride_type a_h(
@@ -956,8 +957,8 @@ class TestDynViewAPI {
}

static void run_test_scalar() {
typedef typename dView0::HostMirror
hView0; // HostMirror of DynRankView is a DynRankView
using hView0 = typename dView0::HostMirror; // HostMirror of DynRankView is
// a DynRankView

dView0 dx, dy;
hView0 hx, hy;
@@ -1050,12 +1051,12 @@ class TestDynViewAPI {

static void run_test() {
// mfh 14 Feb 2014: This test doesn't actually create instances of
// these types. In order to avoid "declared but unused typedef"
// these types. In order to avoid "unused type alias"
// warnings, we declare empty instances of these types, with the
// usual "(void)" marker to avoid compiler warnings for unused
// variables.

typedef typename dView0::HostMirror hView0;
using hView0 = typename dView0::HostMirror;

{
hView0 thing;
@@ -1361,7 +1362,7 @@ class TestDynViewAPI {
}
}

typedef T DataType;
using DataType = T;

static void check_auto_conversion_to_const(
const Kokkos::DynRankView<const DataType, device>& arg_const,
@@ -1369,12 +1370,28 @@ class TestDynViewAPI {
ASSERT_TRUE(arg_const == arg);
}

static void run_test_allocated() {
using device_type = Kokkos::DynRankView<DataType, device>;

const int N1 = 100;
const int N2 = 10;

device_type d1;
ASSERT_FALSE(d1.is_allocated());

d1 = device_type("d1", N1, N2);
device_type d2(d1);
device_type d3("d3", N1);
ASSERT_TRUE(d1.is_allocated());
ASSERT_TRUE(d2.is_allocated());
ASSERT_TRUE(d3.is_allocated());
}

static void run_test_const() {
typedef Kokkos::DynRankView<DataType, device> typeX;
typedef Kokkos::DynRankView<const DataType, device> const_typeX;
typedef Kokkos::DynRankView<const DataType, device,
Kokkos::MemoryRandomAccess>
const_typeR;
using typeX = Kokkos::DynRankView<DataType, device>;
using const_typeX = Kokkos::DynRankView<const DataType, device>;
using const_typeR =
Kokkos::DynRankView<const DataType, device, Kokkos::MemoryRandomAccess>;
typeX x("X", 2);
const_typeX xc = x;
const_typeR xr = x;
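The `run_test_allocated()` block added above is part of a commit-wide pattern: each container test now checks `is_allocated()` before and after storage is bound to the object. A minimal sketch of the same check outside the test fixture, assuming Kokkos has been initialized and using the default execution space (the names here are illustrative):
````cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_DynRankView.hpp>
#include <cstdio>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // A default-constructed DynRankView owns no memory yet ...
    Kokkos::DynRankView<double> d1;
    const bool before = d1.is_allocated();  // expected: false

    // ... and reports is_allocated() == true once an allocation is attached.
    d1 = Kokkos::DynRankView<double>("d1", 100, 10);
    const bool after = d1.is_allocated();   // expected: true

    std::printf("before=%d after=%d\n", int(before), int(after));
  }
  Kokkos::finalize();
  return 0;
}
````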
@@ -1398,10 +1415,10 @@ class TestDynViewAPI {
}

static void run_test_subview() {
typedef Kokkos::DynRankView<const T, device> cdView;
typedef Kokkos::DynRankView<T, device> dView;
using cdView = Kokkos::DynRankView<const T, device>;
using dView = Kokkos::DynRankView<T, device>;
// LayoutStride required for all returned DynRankView subdynrankview's
typedef Kokkos::DynRankView<T, Kokkos::LayoutStride, device> sdView;
using sdView = Kokkos::DynRankView<T, Kokkos::LayoutStride, device>;

dView0 d0("d0");
cdView s0 = d0;
@@ -1452,7 +1469,7 @@ class TestDynViewAPI {
ASSERT_EQ(dv6.rank(), 6);

// DynRankView with LayoutRight
typedef Kokkos::DynRankView<T, Kokkos::LayoutRight, device> drView;
using drView = Kokkos::DynRankView<T, Kokkos::LayoutRight, device>;
drView dr5("dr5", N0, N1, N2, 2, 2);
ASSERT_EQ(dr5.rank(), 5);

@@ -1514,7 +1531,8 @@ class TestDynViewAPI {
ASSERT_EQ(ds5.extent(4), ds5plus.extent(4));
ASSERT_EQ(ds5.extent(5), ds5plus.extent(5));

#if !defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_UVM)
#if (!defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_UVM)) && \
!defined(KOKKOS_ENABLE_HIP)
ASSERT_EQ(&ds5(1, 1, 1, 1, 0) - &ds5plus(1, 1, 1, 1, 0), 0);
ASSERT_EQ(&ds5(1, 1, 1, 1, 0, 0) - &ds5plus(1, 1, 1, 1, 0, 0),
0); // passing argument to rank beyond the view's rank is allowed
@@ -1538,12 +1556,12 @@ class TestDynViewAPI {
}

static void run_test_subview_strided() {
typedef Kokkos::DynRankView<int, Kokkos::LayoutLeft, host_drv_space>
drview_left;
typedef Kokkos::DynRankView<int, Kokkos::LayoutRight, host_drv_space>
drview_right;
typedef Kokkos::DynRankView<int, Kokkos::LayoutStride, host_drv_space>
drview_stride;
using drview_left =
Kokkos::DynRankView<int, Kokkos::LayoutLeft, host_drv_space>;
using drview_right =
Kokkos::DynRankView<int, Kokkos::LayoutRight, host_drv_space>;
using drview_stride =
Kokkos::DynRankView<int, Kokkos::LayoutStride, host_drv_space>;

drview_left xl2("xl2", 100, 200);
drview_right xr2("xr2", 100, 200);
@@ -1588,31 +1606,29 @@ class TestDynViewAPI {
static void run_test_vector() {
static const unsigned Length = 1000, Count = 8;

typedef typename Kokkos::DynRankView<T, Kokkos::LayoutLeft, host_drv_space>
multivector_type;
using multivector_type =
typename Kokkos::DynRankView<T, Kokkos::LayoutLeft, host_drv_space>;

typedef typename Kokkos::DynRankView<T, Kokkos::LayoutRight, host_drv_space>
multivector_right_type;
using multivector_right_type =
typename Kokkos::DynRankView<T, Kokkos::LayoutRight, host_drv_space>;

multivector_type mv = multivector_type("mv", Length, Count);
multivector_right_type mv_right =
multivector_right_type("mv", Length, Count);

typedef
typename Kokkos::DynRankView<T, Kokkos::LayoutStride, host_drv_space>
svector_type;
typedef
typename Kokkos::DynRankView<T, Kokkos::LayoutStride, host_drv_space>
smultivector_type;
typedef typename Kokkos::DynRankView<const T, Kokkos::LayoutStride,
host_drv_space>
const_svector_right_type;
typedef typename Kokkos::DynRankView<const T, Kokkos::LayoutStride,
host_drv_space>
const_svector_type;
typedef typename Kokkos::DynRankView<const T, Kokkos::LayoutStride,
host_drv_space>
const_smultivector_type;
using svector_type =
typename Kokkos::DynRankView<T, Kokkos::LayoutStride, host_drv_space>;
using smultivector_type =
typename Kokkos::DynRankView<T, Kokkos::LayoutStride, host_drv_space>;
using const_svector_right_type =
typename Kokkos::DynRankView<const T, Kokkos::LayoutStride,
host_drv_space>;
using const_svector_type =
typename Kokkos::DynRankView<const T, Kokkos::LayoutStride,
host_drv_space>;
using const_smultivector_type =
typename Kokkos::DynRankView<const T, Kokkos::LayoutStride,
host_drv_space>;

svector_type v1 = Kokkos::subdynrankview(mv, Kokkos::ALL(), 0);
svector_type v2 = Kokkos::subdynrankview(mv, Kokkos::ALL(), 1);
@@ -44,10 +44,7 @@

#include <TestDynViewAPI.hpp>
namespace Test {
// FIXME_HIP attempt to access inaccessible memory space
#ifndef KOKKOS_ENABLE_HIP
TEST(TEST_CATEGORY, dyn_rank_view_api_generic) {
TestDynViewAPI<double, TEST_EXECSPACE>::run_tests();
}
#endif
} // namespace Test

@@ -45,10 +45,7 @@
#include <TestDynViewAPI.hpp>

namespace Test {
// FIXME_HIP failing with wrong value
#ifndef KOKKOS_ENABLE_HIP
TEST(TEST_CATEGORY, dyn_rank_view_api_operator_rank12345) {
TestDynViewAPI<double, TEST_EXECSPACE>::run_operator_test_rank12345();
}
#endif
} // namespace Test

@@ -58,12 +58,12 @@ namespace Test {

template <typename Scalar, class Space>
struct TestDynamicView {
typedef typename Space::execution_space execution_space;
typedef typename Space::memory_space memory_space;
using execution_space = typename Space::execution_space;
using memory_space = typename Space::memory_space;

typedef Kokkos::Experimental::DynamicView<Scalar*, Space> view_type;
using view_type = Kokkos::Experimental::DynamicView<Scalar*, Space>;

typedef double value_type;
using value_type = double;

static void run(unsigned arg_total_size) {
// Test: Create DynamicView, initialize size (via resize), run through
@@ -71,6 +71,27 @@ struct TestDynamicView {
// values and repeat
// Case 1: min_chunk_size is a power of 2
{
{
view_type d1;
ASSERT_FALSE(d1.is_allocated());

d1 = view_type("d1", 1024, arg_total_size);
view_type d2(d1);
view_type d3("d3", 1024, arg_total_size);

ASSERT_FALSE(d1.is_allocated());
ASSERT_FALSE(d2.is_allocated());
ASSERT_FALSE(d3.is_allocated());

unsigned d_size = arg_total_size / 8;
d1.resize_serial(d_size);
d2.resize_serial(d_size);
d3.resize_serial(d_size);

ASSERT_TRUE(d1.is_allocated());
ASSERT_TRUE(d2.is_allocated());
ASSERT_TRUE(d3.is_allocated());
}
view_type da("da", 1024, arg_total_size);
ASSERT_EQ(da.size(), 0);
// Init
@@ -223,7 +244,7 @@ struct TestDynamicView {
};

TEST(TEST_CATEGORY, dynamic_view) {
typedef TestDynamicView<double, TEST_EXECSPACE> TestDynView;
using TestDynView = TestDynamicView<double, TEST_EXECSPACE>;

for (int i = 0; i < 10; ++i) {
TestDynView::run(100000 + 100 * i);
@@ -84,9 +84,9 @@ void checkReportersAndReportsAgree(const std::vector<int> &reporters,

template <typename DeviceType>
struct ErrorReporterDriverBase {
typedef ThreeValReport<int, int, double> report_type;
typedef Kokkos::Experimental::ErrorReporter<report_type, DeviceType>
error_reporter_type;
using report_type = ThreeValReport<int, int, double>;
using error_reporter_type =
Kokkos::Experimental::ErrorReporter<report_type, DeviceType>;
error_reporter_type m_errorReporter;

ErrorReporterDriverBase(int reporter_capacity, int /*test_size*/)
@@ -97,10 +97,11 @@ struct ErrorReporterDriverBase {
}

void check_expectations(int reporter_capacity, int test_size) {
using namespace std;
int num_reported = m_errorReporter.getNumReports();
int num_attempts = m_errorReporter.getNumReportAttempts();

int expected_num_reports = std::min(reporter_capacity, test_size / 2);
int expected_num_reports = min(reporter_capacity, test_size / 2);
EXPECT_EQ(expected_num_reports, num_reported);
EXPECT_EQ(test_size / 2, num_attempts);

@@ -112,7 +113,7 @@ struct ErrorReporterDriverBase {

template <typename ErrorReporterDriverType>
void TestErrorReporter() {
typedef ErrorReporterDriverType tester_type;
using tester_type = ErrorReporterDriverType;
std::vector<int> reporters;
std::vector<typename tester_type::report_type> reports;

@@ -147,9 +148,9 @@ void TestErrorReporter() {

template <typename DeviceType>
struct ErrorReporterDriver : public ErrorReporterDriverBase<DeviceType> {
typedef ErrorReporterDriverBase<DeviceType> driver_base;
typedef typename driver_base::error_reporter_type::execution_space
execution_space;
using driver_base = ErrorReporterDriverBase<DeviceType>;
using execution_space =
typename driver_base::error_reporter_type::execution_space;

ErrorReporterDriver(int reporter_capacity, int test_size)
: driver_base(reporter_capacity, test_size) {

@@ -185,12 +186,16 @@ struct ErrorReporterDriver : public ErrorReporterDriverBase<DeviceType> {
template <typename DeviceType>
struct ErrorReporterDriverUseLambda
: public ErrorReporterDriverBase<DeviceType> {
typedef ErrorReporterDriverBase<DeviceType> driver_base;
typedef typename driver_base::error_reporter_type::execution_space
execution_space;
using driver_base = ErrorReporterDriverBase<DeviceType>;
using execution_space =
typename driver_base::error_reporter_type::execution_space;

ErrorReporterDriverUseLambda(int reporter_capacity, int test_size)
: driver_base(reporter_capacity, test_size) {
execute(reporter_capacity, test_size);
}

void execute(int reporter_capacity, int test_size) {
Kokkos::parallel_for(
Kokkos::RangePolicy<execution_space>(0, test_size),
KOKKOS_CLASS_LAMBDA(const int work_idx) {

@@ -210,9 +215,9 @@ struct ErrorReporterDriverUseLambda
#ifdef KOKKOS_ENABLE_OPENMP
struct ErrorReporterDriverNativeOpenMP
: public ErrorReporterDriverBase<Kokkos::OpenMP> {
typedef ErrorReporterDriverBase<Kokkos::OpenMP> driver_base;
typedef typename driver_base::error_reporter_type::execution_space
execution_space;
using driver_base = ErrorReporterDriverBase<Kokkos::OpenMP>;
using execution_space =
typename driver_base::error_reporter_type::execution_space;

ErrorReporterDriverNativeOpenMP(int reporter_capacity, int test_size)
: driver_base(reporter_capacity, test_size) {
@@ -61,12 +61,25 @@ namespace Test {

template <typename Scalar, typename Device>
void test_offsetview_construction() {
typedef Kokkos::Experimental::OffsetView<Scalar**, Device> offset_view_type;
typedef Kokkos::View<Scalar**, Device> view_type;
using offset_view_type = Kokkos::Experimental::OffsetView<Scalar**, Device>;
using view_type = Kokkos::View<Scalar**, Device>;

Kokkos::Experimental::index_list_type range0 = {-1, 3};
Kokkos::Experimental::index_list_type range1 = {-2, 2};

{
offset_view_type o1;
ASSERT_FALSE(o1.is_allocated());

o1 = offset_view_type("o1", range0, range1);
offset_view_type o2(o1);
offset_view_type o3("o3", range0, range1);

ASSERT_TRUE(o1.is_allocated());
ASSERT_TRUE(o2.is_allocated());
ASSERT_TRUE(o3.is_allocated());
}

offset_view_type ov("firstOV", range0, range1);

ASSERT_EQ("firstOV", ov.label());

@@ -109,9 +122,9 @@ void test_offsetview_construction() {
{ // test deep copy of scalar const value into mirro
const int constVal = 6;
typename offset_view_type::HostMirror hostOffsetView =
Kokkos::Experimental::create_mirror_view(ov);
Kokkos::create_mirror_view(ov);

Kokkos::Experimental::deep_copy(hostOffsetView, constVal);
Kokkos::deep_copy(hostOffsetView, constVal);

for (int i = hostOffsetView.begin(0); i < hostOffsetView.end(0); ++i) {
for (int j = hostOffsetView.begin(1); j < hostOffsetView.end(1); ++j) {

@@ -121,10 +134,9 @@ void test_offsetview_construction() {
}
}

typedef Kokkos::MDRangePolicy<Device, Kokkos::Rank<2>,
Kokkos::IndexType<int> >
range_type;
typedef typename range_type::point_type point_type;
using range_type =
Kokkos::MDRangePolicy<Device, Kokkos::Rank<2>, Kokkos::IndexType<int> >;
using point_type = typename range_type::point_type;

range_type rangePolicy2D(point_type{{ovmin0, ovmin1}},
point_type{{ovend0, ovend1}});

@@ -136,9 +148,9 @@ void test_offsetview_construction() {

// test offsetview to offsetviewmirror deep copy
typename offset_view_type::HostMirror hostOffsetView =
Kokkos::Experimental::create_mirror_view(ov);
Kokkos::create_mirror_view(ov);

Kokkos::Experimental::deep_copy(hostOffsetView, ov);
Kokkos::deep_copy(hostOffsetView, ov);

for (int i = hostOffsetView.begin(0); i < hostOffsetView.end(0); ++i) {
for (int j = hostOffsetView.begin(1); j < hostOffsetView.end(1); ++j) {

@@ -185,10 +197,9 @@ void test_offsetview_construction() {

Kokkos::deep_copy(view3D, 1);

typedef Kokkos::MDRangePolicy<Device, Kokkos::Rank<3>,
Kokkos::IndexType<int64_t> >
range3_type;
typedef typename range3_type::point_type point3_type;
using range3_type = Kokkos::MDRangePolicy<Device, Kokkos::Rank<3>,
Kokkos::IndexType<int64_t> >;
using point3_type = typename range3_type::point_type;

typename point3_type::value_type begins0 = -10, begins1 = -20,
begins2 = -30;

@@ -245,7 +256,7 @@ void test_offsetview_construction() {

{ // test offsetview to view deep copy
view_type aView("aView", ov.extent(0), ov.extent(1));
Kokkos::Experimental::deep_copy(aView, ov);
Kokkos::deep_copy(aView, ov);

#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
int sum = 0;

@@ -264,7 +275,7 @@ void test_offsetview_construction() {
view_type aView("aView", ov.extent(0), ov.extent(1));

Kokkos::deep_copy(aView, 99);
Kokkos::Experimental::deep_copy(ov, aView);
Kokkos::deep_copy(ov, aView);

#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
int sum = 0;

@@ -447,10 +458,9 @@ void test_offsetview_subview() {
ASSERT_EQ(offsetSubview.end(1), 9);

#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
typedef Kokkos::MDRangePolicy<Device, Kokkos::Rank<2>,
Kokkos::IndexType<int> >
range_type;
typedef typename range_type::point_type point_type;
using range_type = Kokkos::MDRangePolicy<Device, Kokkos::Rank<2>,
Kokkos::IndexType<int> >;
using point_type = typename range_type::point_type;

const int b0 = offsetSubview.begin(0);
const int b1 = offsetSubview.begin(1);
@@ -50,21 +50,22 @@

namespace Test {

template <typename DeviceType, typename Layout, int duplication,
int contribution, int op>
template <typename DeviceType, typename Layout, typename Duplication,
typename Contribution, typename Op, typename NumberType>
struct test_scatter_view_impl_cls;

template <typename DeviceType, typename Layout, int duplication,
int contribution>
struct test_scatter_view_impl_cls<DeviceType, Layout, duplication, contribution,
Kokkos::Experimental::ScatterSum> {
template <typename DeviceType, typename Layout, typename Duplication,
typename Contribution, typename NumberType>
struct test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution,
Kokkos::Experimental::ScatterSum,
NumberType> {
public:
typedef Kokkos::Experimental::ScatterView<double * [3], Layout, DeviceType,
Kokkos::Experimental::ScatterSum,
duplication, contribution>
scatter_view_type;
using scatter_view_type =
Kokkos::Experimental::ScatterView<NumberType * [12], Layout, DeviceType,
Kokkos::Experimental::ScatterSum,
Duplication, Contribution>;

typedef Kokkos::View<double * [3], Layout, DeviceType> orig_view_type;
using orig_view_type = Kokkos::View<NumberType * [12], Layout, DeviceType>;

scatter_view_type scatter_view;
int scatterSize;

@@ -80,9 +81,18 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, duplication, contribution,
Kokkos::fence();
for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0);
++i) {
host_view(i, 0) = 0.0;
host_view(i, 1) = 0.0;
host_view(i, 2) = 0.0;
host_view(i, 0) = 0.0;
host_view(i, 1) = 0.0;
host_view(i, 2) = 0.0;
host_view(i, 3) = 0.0;
host_view(i, 4) = 0.0;
host_view(i, 5) = 0.0;
host_view(i, 6) = 0.0;
host_view(i, 7) = 0.0;
host_view(i, 8) = 0.0;
host_view(i, 9) = 0.0;
host_view(i, 10) = 0.0;
host_view(i, 11) = 0.0;
}
Kokkos::fence();
Kokkos::deep_copy(orig, host_view);

@@ -102,9 +112,18 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, duplication, contribution,
scatter_view.template access<Kokkos::Experimental::ScatterAtomic>();
for (int j = 0; j < 10; ++j) {
auto k = (i + j) % scatterSize;
scatter_access(k, 0) += 4.2;
scatter_access_atomic(k, 1) += 2.0;
scatter_access(k, 2) += 1.0;
scatter_access(k, 0) += 4;
++scatter_access(k, 1);
--scatter_access(k, 2);
scatter_access(k, 3)++;
scatter_access(k, 4)--;
scatter_access(k, 5) -= 5;
scatter_access_atomic(k, 6) += 2;
scatter_access_atomic(k, 7)++;
scatter_access_atomic(k, 8)--;
--scatter_access_atomic(k, 9);
++scatter_access_atomic(k, 10);
scatter_access(k, 11) -= 3;
}
}

@@ -114,27 +133,46 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, duplication, contribution,
Kokkos::fence();
for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0);
++i) {
auto val0 = host_view(i, 0);
auto val1 = host_view(i, 1);
auto val2 = host_view(i, 2);
EXPECT_TRUE(std::fabs((val0 - 84.0) / 84.0) < 1e-14);
EXPECT_TRUE(std::fabs((val1 - 40.0) / 40.0) < 1e-14);
EXPECT_TRUE(std::fabs((val2 - 20.0) / 20.0) < 1e-14);
auto val0 = host_view(i, 0);
auto val1 = host_view(i, 1);
auto val2 = host_view(i, 2);
auto val3 = host_view(i, 3);
auto val4 = host_view(i, 4);
auto val5 = host_view(i, 5);
auto val6 = host_view(i, 6);
auto val7 = host_view(i, 7);
auto val8 = host_view(i, 8);
auto val9 = host_view(i, 9);
auto val10 = host_view(i, 10);
auto val11 = host_view(i, 11);
EXPECT_NEAR(val0, NumberType(80), 1e-14);
EXPECT_NEAR(val1, NumberType(20), 1e-14);
EXPECT_NEAR(val2, NumberType(-20), 1e-14);
EXPECT_NEAR(val3, NumberType(20), 1e-14);
EXPECT_NEAR(val4, NumberType(-20), 1e-14);
EXPECT_NEAR(val5, NumberType(-100), 1e-14);
EXPECT_NEAR(val6, NumberType(40), 1e-14);
EXPECT_NEAR(val7, NumberType(20), 1e-14);
EXPECT_NEAR(val8, NumberType(-20), 1e-14);
EXPECT_NEAR(val9, NumberType(-20), 1e-14);
EXPECT_NEAR(val10, NumberType(20), 1e-14);
EXPECT_NEAR(val11, NumberType(-60), 1e-14);
}
}
};

template <typename DeviceType, typename Layout, int duplication,
int contribution>
struct test_scatter_view_impl_cls<DeviceType, Layout, duplication, contribution,
Kokkos::Experimental::ScatterProd> {
template <typename DeviceType, typename Layout, typename Duplication,
typename Contribution, typename NumberType>
struct test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution,
Kokkos::Experimental::ScatterProd,
NumberType> {
public:
typedef Kokkos::Experimental::ScatterView<double * [3], Layout, DeviceType,
Kokkos::Experimental::ScatterProd,
duplication, contribution>
scatter_view_type;
using scatter_view_type =
Kokkos::Experimental::ScatterView<NumberType * [3], Layout, DeviceType,
Kokkos::Experimental::ScatterProd,
Duplication, Contribution>;

typedef Kokkos::View<double * [3], Layout, DeviceType> orig_view_type;
using orig_view_type = Kokkos::View<NumberType * [3], Layout, DeviceType>;

scatter_view_type scatter_view;
int scatterSize;

@@ -194,17 +232,18 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, duplication, contribution,
}
};

template <typename DeviceType, typename Layout, int duplication,
int contribution>
struct test_scatter_view_impl_cls<DeviceType, Layout, duplication, contribution,
Kokkos::Experimental::ScatterMin> {
template <typename DeviceType, typename Layout, typename Duplication,
typename Contribution, typename NumberType>
struct test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution,
Kokkos::Experimental::ScatterMin,
NumberType> {
public:
typedef Kokkos::Experimental::ScatterView<double * [3], Layout, DeviceType,
Kokkos::Experimental::ScatterMin,
duplication, contribution>
scatter_view_type;
using scatter_view_type =
Kokkos::Experimental::ScatterView<NumberType * [3], Layout, DeviceType,
Kokkos::Experimental::ScatterMin,
Duplication, Contribution>;

typedef Kokkos::View<double * [3], Layout, DeviceType> orig_view_type;
using orig_view_type = Kokkos::View<NumberType * [3], Layout, DeviceType>;

scatter_view_type scatter_view;
int scatterSize;

@@ -242,9 +281,9 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, duplication, contribution,
scatter_view.template access<Kokkos::Experimental::ScatterAtomic>();
for (int j = 0; j < 4; ++j) {
auto k = (i + j) % scatterSize;
scatter_access(k, 0).update((double)(j + 1) * 4);
scatter_access_atomic(k, 1).update((double)(j + 1) * 2.0);
scatter_access(k, 2).update((double)(j + 1) * 1.0);
scatter_access(k, 0).update((NumberType)(j + 1) * 4);
scatter_access_atomic(k, 1).update((NumberType)(j + 1) * 2.0);
scatter_access(k, 2).update((NumberType)(j + 1) * 1.0);
}
}

@@ -264,17 +303,18 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, duplication, contribution,
}
};

template <typename DeviceType, typename Layout, int duplication,
int contribution>
struct test_scatter_view_impl_cls<DeviceType, Layout, duplication, contribution,
Kokkos::Experimental::ScatterMax> {
template <typename DeviceType, typename Layout, typename Duplication,
typename Contribution, typename NumberType>
struct test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution,
Kokkos::Experimental::ScatterMax,
NumberType> {
public:
typedef Kokkos::Experimental::ScatterView<double * [3], Layout, DeviceType,
Kokkos::Experimental::ScatterMax,
duplication, contribution>
scatter_view_type;
using scatter_view_type =
Kokkos::Experimental::ScatterView<NumberType * [3], Layout, DeviceType,
Kokkos::Experimental::ScatterMax,
Duplication, Contribution>;

typedef Kokkos::View<double * [3], Layout, DeviceType> orig_view_type;
using orig_view_type = Kokkos::View<NumberType * [3], Layout, DeviceType>;

scatter_view_type scatter_view;
int scatterSize;

@@ -311,9 +351,9 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, duplication, contribution,
scatter_view.template access<Kokkos::Experimental::ScatterAtomic>();
for (int j = 0; j < 4; ++j) {
auto k = (i + j) % scatterSize;
scatter_access(k, 0).update((double)(j + 1) * 4);
scatter_access_atomic(k, 1).update((double)(j + 1) * 2.0);
scatter_access(k, 2).update((double)(j + 1) * 1.0);
scatter_access(k, 0).update((NumberType)(j + 1) * 4);
scatter_access_atomic(k, 1).update((NumberType)(j + 1) * 2.0);
scatter_access(k, 2).update((NumberType)(j + 1) * 1.0);
}
}

@@ -333,27 +373,126 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, duplication, contribution,
}
};

template <typename DeviceType, typename Layout, int duplication,
int contribution, int op>
struct test_scatter_view_config {
template <typename DeviceType, typename Layout, typename Op,
typename NumberType>
struct test_default_scatter_view {
public:
typedef
typename test_scatter_view_impl_cls<DeviceType, Layout, duplication,
contribution, op>::scatter_view_type
scatter_view_def;
typedef typename test_scatter_view_impl_cls<DeviceType, Layout, duplication,
contribution, op>::orig_view_type
orig_view_def;
using default_duplication = Kokkos::Impl::Experimental::DefaultDuplication<
typename DeviceType::execution_space>;
using Duplication = typename default_duplication::type;
using Contribution = typename Kokkos::Impl::Experimental::DefaultContribution<
typename DeviceType::execution_space, Duplication>::type;
using scatter_view_def =
typename test_scatter_view_impl_cls<DeviceType, Layout, Duplication,
Contribution, Op,
NumberType>::scatter_view_type;
using orig_view_def =
typename test_scatter_view_impl_cls<DeviceType, Layout, Duplication,
Contribution, Op,
NumberType>::orig_view_type;

void run_test(int n) {
// Test creation via create_scatter_view overload 1
{
orig_view_def original_view("original_view", n);
scatter_view_def scatter_view =
Kokkos::Experimental::create_scatter_view(Op{}, original_view);

test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution,
Op, NumberType>
scatter_view_test_impl(scatter_view);
scatter_view_test_impl.initialize(original_view);
scatter_view_test_impl.run_parallel(n);

Kokkos::Experimental::contribute(original_view, scatter_view);
scatter_view.reset_except(original_view);

scatter_view_test_impl.run_parallel(n);

Kokkos::Experimental::contribute(original_view, scatter_view);
Kokkos::fence();

scatter_view_test_impl.validateResults(original_view);

{
scatter_view_def persistent_view("persistent", n);
auto result_view = persistent_view.subview();
contribute(result_view, persistent_view);
Kokkos::fence();
}
}
}
};

template <typename DeviceType, typename Layout, typename Duplication,
typename Contribution, typename Op, typename NumberType>
struct test_scatter_view_config {
public:
using scatter_view_def =
typename test_scatter_view_impl_cls<DeviceType, Layout, Duplication,
Contribution, Op,
NumberType>::scatter_view_type;
using orig_view_def =
typename test_scatter_view_impl_cls<DeviceType, Layout, Duplication,
Contribution, Op,
NumberType>::orig_view_type;

void run_test(int n) {
// test allocation
{
orig_view_def ov1("ov1", n);
scatter_view_def sv1;

ASSERT_FALSE(sv1.is_allocated());

sv1 = Kokkos::Experimental::create_scatter_view<Op, Duplication,
Contribution>(ov1);

scatter_view_def sv2(sv1);
scatter_view_def sv3("sv3", n);

ASSERT_TRUE(sv1.is_allocated());
ASSERT_TRUE(sv2.is_allocated());
ASSERT_TRUE(sv3.is_allocated());
}

// Test creation via create_scatter_view
{
orig_view_def original_view("original_view", n);
scatter_view_def scatter_view = Kokkos::Experimental::create_scatter_view<
op, duplication, contribution>(original_view);
Op, Duplication, Contribution>(original_view);

test_scatter_view_impl_cls<DeviceType, Layout, duplication, contribution,
op>
test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution,
Op, NumberType>
scatter_view_test_impl(scatter_view);
scatter_view_test_impl.initialize(original_view);
scatter_view_test_impl.run_parallel(n);

Kokkos::Experimental::contribute(original_view, scatter_view);
scatter_view.reset_except(original_view);

scatter_view_test_impl.run_parallel(n);

Kokkos::Experimental::contribute(original_view, scatter_view);
Kokkos::fence();

scatter_view_test_impl.validateResults(original_view);

{
scatter_view_def persistent_view("persistent", n);
auto result_view = persistent_view.subview();
contribute(result_view, persistent_view);
Kokkos::fence();
}
}
// Test creation via create_scatter_view overload 2
{
orig_view_def original_view("original_view", n);
scatter_view_def scatter_view = Kokkos::Experimental::create_scatter_view(
Op{}, Duplication{}, Contribution{}, original_view);

test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution,
Op, NumberType>
scatter_view_test_impl(scatter_view);
scatter_view_test_impl.initialize(original_view);
scatter_view_test_impl.run_parallel(n);
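The hunks above exercise the ScatterView creation paths this test now covers: deducing duplication/contribution defaults from the operation and the target View, or naming them explicitly. A condensed, hedged sketch of the round trip the tests drive (the view shape and names are illustrative; the calls mirror the ones appearing in the diff):
````cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_ScatterView.hpp>

// Create a ScatterView from an existing results View, accumulate into it from
// a parallel kernel, then reduce the (possibly duplicated) copies back.
void scatter_sum_sketch(int n) {
  Kokkos::View<double * [3]> results("results", n);

  // "Overload 1" creation path from the diff: defaults are deduced from the
  // operation tag and the View.
  auto scatter = Kokkos::Experimental::create_scatter_view(
      Kokkos::Experimental::ScatterSum{}, results);

  Kokkos::parallel_for(
      "fill", n, KOKKOS_LAMBDA(int i) {
        auto access = scatter.access();
        access(i, 0) += 1.0;  // per-thread contribution
      });

  // Reduce the duplicated copies (if any) back into the original View.
  Kokkos::Experimental::contribute(results, scatter);
}
````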
@@ -380,8 +519,8 @@ struct test_scatter_view_config {
orig_view_def original_view("original_view", n);
scatter_view_def scatter_view(original_view);

test_scatter_view_impl_cls<DeviceType, Layout, duplication, contribution,
op>
test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution,
Op, NumberType>
scatter_view_test_impl(scatter_view);
scatter_view_test_impl.initialize(original_view);
scatter_view_test_impl.run_parallel(n);

@@ -406,19 +545,19 @@ struct test_scatter_view_config {
}
};

template <typename DeviceType, int ScatterType>
template <typename DeviceType, typename ScatterType, typename NumberType>
struct TestDuplicatedScatterView {
TestDuplicatedScatterView(int n) {
// ScatterSum test
test_scatter_view_config<DeviceType, Kokkos::LayoutRight,
Kokkos::Experimental::ScatterDuplicated,
Kokkos::Experimental::ScatterNonAtomic,
ScatterType>
ScatterType, NumberType>
test_sv_right_config;
test_sv_right_config.run_test(n);
test_scatter_view_config<
DeviceType, Kokkos::LayoutLeft, Kokkos::Experimental::ScatterDuplicated,
Kokkos::Experimental::ScatterNonAtomic, ScatterType>
Kokkos::Experimental::ScatterNonAtomic, ScatterType, NumberType>
test_sv_left_config;
test_sv_left_config.run_test(n);
}

@@ -427,18 +566,19 @@ struct TestDuplicatedScatterView {
#ifdef KOKKOS_ENABLE_CUDA
// disable duplicated instantiation with CUDA until
// UniqueToken can support it
template <int ScatterType>
struct TestDuplicatedScatterView<Kokkos::Cuda, ScatterType> {
template <typename ScatterType, typename NumberType>
struct TestDuplicatedScatterView<Kokkos::Cuda, ScatterType, NumberType> {
TestDuplicatedScatterView(int) {}
};
template <int ScatterType>
template <typename ScatterType, typename NumberType>
struct TestDuplicatedScatterView<
Kokkos::Device<Kokkos::Cuda, Kokkos::CudaSpace>, ScatterType> {
Kokkos::Device<Kokkos::Cuda, Kokkos::CudaSpace>, ScatterType, NumberType> {
TestDuplicatedScatterView(int) {}
};
template <int ScatterType>
template <typename ScatterType, typename NumberType>
struct TestDuplicatedScatterView<
Kokkos::Device<Kokkos::Cuda, Kokkos::CudaUVMSpace>, ScatterType> {
Kokkos::Device<Kokkos::Cuda, Kokkos::CudaUVMSpace>, ScatterType,
NumberType> {
TestDuplicatedScatterView(int) {}
};
#endif

@@ -446,13 +586,14 @@ struct TestDuplicatedScatterView<
#ifdef KOKKOS_ENABLE_ROCM
// disable duplicated instantiation with ROCm until
// UniqueToken can support it
template <int ScatterType>
template <typename ScatterType>
struct TestDuplicatedScatterView<Kokkos::Experimental::ROCm, ScatterType> {
TestDuplicatedScatterView(int) {}
};
#endif

template <typename DeviceType, int ScatterType>
template <typename DeviceType, typename ScatterType,
typename NumberType = double>
void test_scatter_view(int n) {
using execution_space = typename DeviceType::execution_space;

@@ -463,7 +604,7 @@ void test_scatter_view(int n) {
test_scatter_view_config<DeviceType, Kokkos::LayoutRight,
Kokkos::Experimental::ScatterNonDuplicated,
Kokkos::Experimental::ScatterNonAtomic,
ScatterType>
ScatterType, NumberType>
test_sv_config;
test_sv_config.run_test(n);
}

@@ -472,30 +613,40 @@ void test_scatter_view(int n) {
#endif
test_scatter_view_config<DeviceType, Kokkos::LayoutRight,
Kokkos::Experimental::ScatterNonDuplicated,
Kokkos::Experimental::ScatterAtomic, ScatterType>
Kokkos::Experimental::ScatterAtomic, ScatterType,
NumberType>
test_sv_config;
test_sv_config.run_test(n);
#ifdef KOKKOS_ENABLE_SERIAL
}
#endif
// with hundreds of threads we were running out of memory.
// limit (n) so that duplication doesn't exceed 8GB
// limit (n) so that duplication doesn't exceed 4GB
constexpr std::size_t maximum_allowed_total_bytes =
8ull * 1024ull * 1024ull * 1024ull;
4ull * 1024ull * 1024ull * 1024ull;
std::size_t const maximum_allowed_copy_bytes =
maximum_allowed_total_bytes /
std::size_t(execution_space().concurrency());
constexpr std::size_t bytes_per_value = sizeof(double) * 3;
constexpr std::size_t bytes_per_value = sizeof(NumberType) * 12;
std::size_t const maximum_allowed_copy_values =
maximum_allowed_copy_bytes / bytes_per_value;
n = std::min(n, int(maximum_allowed_copy_values));
TestDuplicatedScatterView<DeviceType, ScatterType> duptest(n);

// if the default is duplicated, this needs to follow the limit
{
test_default_scatter_view<DeviceType, Kokkos::LayoutRight, ScatterType,
NumberType>
test_default_sv;
test_default_sv.run_test(n);
}
TestDuplicatedScatterView<DeviceType, ScatterType, NumberType> duptest(n);
}

// FIXME_HIP ScatterView requires UniqueToken
#ifndef KOKKOS_ENABLE_HIP
TEST(TEST_CATEGORY, scatterview) {
test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterSum>(10);
test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterSum, double>(
10);
test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterSum,
unsigned int>(10);
test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterProd>(10);
test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterMin>(10);
test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterMax>(10);
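As the hunks above show, the test driver gained a `NumberType` parameter (defaulting to `double`), and the ScatterSum path is now instantiated for both `double` and `unsigned int`, which is why the kernel was rewritten around `++`, `--`, `+=`, and `-=` updates. A hedged sketch of what an integer ScatterSum accumulation looks like in user code (view names and shapes are illustrative):
````cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_ScatterView.hpp>

// Histogram-style counting: integer sums use the increment/decrement
// operators the updated test now exercises.
void count_hits(Kokkos::View<unsigned int*> histogram,
                Kokkos::View<int*> bins, int n) {
  auto scatter = Kokkos::Experimental::create_scatter_view(
      Kokkos::Experimental::ScatterSum{}, histogram);

  Kokkos::parallel_for(
      "count", n, KOKKOS_LAMBDA(int i) {
        auto access = scatter.access();
        ++access(bins(i));  // unsigned int contribution
      });

  Kokkos::Experimental::contribute(histogram, scatter);
}
````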
@@ -512,7 +663,10 @@ TEST(TEST_CATEGORY, scatterview) {
#endif

#endif
test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterSum>(big_n);
test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterSum, double>(
big_n);
test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterSum,
unsigned int>(big_n);
test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterProd>(big_n);
test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterMin>(big_n);
test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterMax>(big_n);

@@ -522,7 +676,9 @@ TEST(TEST_CATEGORY, scatterview_devicetype) {
using device_type =
Kokkos::Device<TEST_EXECSPACE, typename TEST_EXECSPACE::memory_space>;

test_scatter_view<device_type, Kokkos::Experimental::ScatterSum>(10);
test_scatter_view<device_type, Kokkos::Experimental::ScatterSum, double>(10);
test_scatter_view<device_type, Kokkos::Experimental::ScatterSum,
unsigned int>(10);
test_scatter_view<device_type, Kokkos::Experimental::ScatterProd>(10);
test_scatter_view<device_type, Kokkos::Experimental::ScatterMin>(10);
test_scatter_view<device_type, Kokkos::Experimental::ScatterMax>(10);

@@ -530,14 +686,19 @@ TEST(TEST_CATEGORY, scatterview_devicetype) {
#ifdef KOKKOS_ENABLE_CUDA
if (std::is_same<TEST_EXECSPACE, Kokkos::Cuda>::value) {
using cuda_device_type = Kokkos::Device<Kokkos::Cuda, Kokkos::CudaSpace>;
test_scatter_view<cuda_device_type, Kokkos::Experimental::ScatterSum>(10);
test_scatter_view<cuda_device_type, Kokkos::Experimental::ScatterSum,
double>(10);
test_scatter_view<cuda_device_type, Kokkos::Experimental::ScatterSum,
unsigned int>(10);
test_scatter_view<cuda_device_type, Kokkos::Experimental::ScatterProd>(10);
test_scatter_view<cuda_device_type, Kokkos::Experimental::ScatterMin>(10);
test_scatter_view<cuda_device_type, Kokkos::Experimental::ScatterMax>(10);
using cudauvm_device_type =
Kokkos::Device<Kokkos::Cuda, Kokkos::CudaUVMSpace>;
test_scatter_view<cudauvm_device_type, Kokkos::Experimental::ScatterSum>(
10);
test_scatter_view<cudauvm_device_type, Kokkos::Experimental::ScatterSum,
double>(10);
test_scatter_view<cudauvm_device_type, Kokkos::Experimental::ScatterSum,
unsigned int>(10);
test_scatter_view<cudauvm_device_type, Kokkos::Experimental::ScatterProd>(
10);
test_scatter_view<cudauvm_device_type, Kokkos::Experimental::ScatterMin>(

@@ -547,7 +708,6 @@ TEST(TEST_CATEGORY, scatterview_devicetype) {
}
#endif
}
#endif

} // namespace Test
@@ -55,12 +55,10 @@ namespace TestStaticCrsGraph {

template <class Space>
void run_test_graph() {
typedef Kokkos::StaticCrsGraph<unsigned, Space> dView;
typedef typename dView::HostMirror hView;
using dView = Kokkos::StaticCrsGraph<unsigned, Space>;
using hView = typename dView::HostMirror;

const unsigned LENGTH = 1000;
dView dx;
hView hx;

std::vector<std::vector<int> > graph(LENGTH);

@@ -71,6 +69,23 @@ void run_test_graph() {
}
}

{
dView d1;
ASSERT_FALSE(d1.is_allocated());

d1 = Kokkos::create_staticcrsgraph<dView>("d1", graph);

dView d2(d1);
dView d3(d1.entries, d1.row_map);

ASSERT_TRUE(d1.is_allocated());
ASSERT_TRUE(d2.is_allocated());
ASSERT_TRUE(d3.is_allocated());
}

dView dx;
hView hx;

dx = Kokkos::create_staticcrsgraph<dView>("dx", graph);
hx = Kokkos::create_mirror(dx);
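The added block above also doubles as a reminder of how a StaticCrsGraph is built from nested `std::vector` adjacency data via `create_staticcrsgraph`, which is the call the test uses. A minimal hedged sketch of that pattern (sizes, labels, and the choice of DefaultExecutionSpace are illustrative):
````cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_StaticCrsGraph.hpp>
#include <vector>

void build_graph_sketch() {
  using graph_type =
      Kokkos::StaticCrsGraph<unsigned, Kokkos::DefaultExecutionSpace>;

  // Adjacency lists on the host: row i lists its neighbors.
  std::vector<std::vector<int> > adjacency(4);
  adjacency[0] = {1, 2};
  adjacency[1] = {0, 3};
  adjacency[2] = {0};
  adjacency[3] = {1};

  // create_staticcrsgraph packs row_map + entries and allocates storage, so
  // is_allocated() flips from false to true, as the new test asserts.
  graph_type g = Kokkos::create_staticcrsgraph<graph_type>("graph", adjacency);
  (void)g;
}
````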
@@ -98,8 +113,8 @@ void run_test_graph() {

template <class Space>
void run_test_graph2() {
typedef Kokkos::StaticCrsGraph<unsigned[3], Space> dView;
typedef typename dView::HostMirror hView;
using dView = Kokkos::StaticCrsGraph<unsigned[3], Space>;
using hView = typename dView::HostMirror;

const unsigned LENGTH = 10;

@@ -158,8 +173,8 @@ template <class Space>
void run_test_graph3(size_t B, size_t N) {
srand(10310);

typedef Kokkos::StaticCrsGraph<int, Space> dView;
typedef typename dView::HostMirror hView;
using dView = Kokkos::StaticCrsGraph<int, Space>;
using hView = typename dView::HostMirror;

const unsigned LENGTH = 2000;

@@ -197,20 +212,13 @@ void run_test_graph3(size_t B, size_t N) {

template <class Space>
void run_test_graph4() {
typedef unsigned ordinal_type;
typedef Kokkos::LayoutRight layout_type;
typedef Space space_type;
typedef Kokkos::MemoryUnmanaged memory_traits_type;
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
typedef Kokkos::StaticCrsGraph<ordinal_type, layout_type, space_type,
ordinal_type, memory_traits_type>
dView;
#else
typedef Kokkos::StaticCrsGraph<ordinal_type, layout_type, space_type,
memory_traits_type>
dView;
#endif
typedef typename dView::HostMirror hView;
using ordinal_type = unsigned;
using layout_type = Kokkos::LayoutRight;
using space_type = Space;
using memory_traits_type = Kokkos::MemoryUnmanaged;
using dView = Kokkos::StaticCrsGraph<ordinal_type, layout_type, space_type,
memory_traits_type>;
using hView = typename dView::HostMirror;

dView dx;

@@ -227,8 +235,8 @@ void run_test_graph4() {
// of the unmanaged StaticCrsGraph

// Data types for raw pointers storing StaticCrsGraph info
typedef typename dView::size_type ptr_row_map_type;
typedef typename dView::data_type ptr_entries_type;
using ptr_row_map_type = typename dView::size_type;
using ptr_entries_type = typename dView::data_type;

const ordinal_type numRows = 8;
const ordinal_type nnz = 24;

@@ -237,8 +245,8 @@ void run_test_graph4() {
4, 5, 4, 5, 2, 3, 6, 7, 2, 3, 6, 7};

// Wrap pointers in unmanaged host views
typedef typename hView::row_map_type local_row_map_type;
typedef typename hView::entries_type local_entries_type;
using local_row_map_type = typename hView::row_map_type;
using local_entries_type = typename hView::entries_type;
local_row_map_type unman_row_map(&(ptrRaw[0]), numRows + 1);
local_entries_type unman_entries(&(indRaw[0]), nnz);

@@ -248,10 +256,10 @@ void run_test_graph4() {
// Create the device Views for copying the host arrays into
// An allocation is needed on the device for the unmanaged StaticCrsGraph to
// wrap the pointer
typedef typename Kokkos::View<ptr_row_map_type*, layout_type, space_type>
d_row_map_view_type;
typedef typename Kokkos::View<ptr_entries_type*, layout_type, space_type>
d_entries_view_type;
using d_row_map_view_type =
typename Kokkos::View<ptr_row_map_type*, layout_type, space_type>;
using d_entries_view_type =
typename Kokkos::View<ptr_entries_type*, layout_type, space_type>;

d_row_map_view_type tmp_row_map("tmp_row_map", numRows + 1);
d_entries_view_type tmp_entries("tmp_entries", nnz);
@ -53,9 +53,9 @@ namespace Impl {
|
|||
|
||||
template <typename MapType, bool Near = false>
|
||||
struct TestInsert {
|
||||
typedef MapType map_type;
|
||||
typedef typename map_type::execution_space execution_space;
|
||||
typedef uint32_t value_type;
|
||||
using map_type = MapType;
|
||||
using execution_space = typename map_type::execution_space;
|
||||
using value_type = uint32_t;
|
||||
|
||||
map_type map;
|
||||
uint32_t inserts;
|
||||
|
@ -101,10 +101,10 @@ struct TestInsert {
|
|||
|
||||
template <typename MapType, bool Near>
|
||||
struct TestErase {
|
||||
typedef TestErase<MapType, Near> self_type;
|
||||
using self_type = TestErase<MapType, Near>;
|
||||
|
||||
typedef MapType map_type;
|
||||
typedef typename MapType::execution_space execution_space;
|
||||
using map_type = MapType;
|
||||
using execution_space = typename MapType::execution_space;
|
||||
|
||||
map_type m_map;
|
||||
uint32_t m_num_erase;
|
||||
|
@ -131,9 +131,9 @@ struct TestErase {
|
|||
|
||||
template <typename MapType>
|
||||
struct TestFind {
|
||||
typedef MapType map_type;
|
||||
typedef typename MapType::execution_space::execution_space execution_space;
|
||||
typedef uint32_t value_type;
|
||||
using map_type = MapType;
|
||||
using execution_space = typename MapType::execution_space::execution_space;
|
||||
using value_type = uint32_t;
|
||||
|
||||
map_type m_map;
|
||||
uint32_t m_num_insert;
|
||||
|
@ -180,9 +180,9 @@ struct TestFind {
|
|||
template <typename Device>
|
||||
void test_insert(uint32_t num_nodes, uint32_t num_inserts,
|
||||
uint32_t num_duplicates, bool near) {
|
||||
typedef Kokkos::UnorderedMap<uint32_t, uint32_t, Device> map_type;
|
||||
typedef Kokkos::UnorderedMap<const uint32_t, const uint32_t, Device>
|
||||
const_map_type;
|
||||
using map_type = Kokkos::UnorderedMap<uint32_t, uint32_t, Device>;
|
||||
using const_map_type =
|
||||
Kokkos::UnorderedMap<const uint32_t, const uint32_t, Device>;
|
||||
|
||||
const uint32_t expected_inserts =
|
||||
(num_inserts + num_duplicates - 1u) / num_duplicates;
|
||||
|
@ -232,7 +232,7 @@ void test_insert(uint32_t num_nodes, uint32_t num_inserts,
|
|||
|
||||
template <typename Device>
|
||||
void test_failed_insert(uint32_t num_nodes) {
|
||||
typedef Kokkos::UnorderedMap<uint32_t, uint32_t, Device> map_type;
|
||||
using map_type = Kokkos::UnorderedMap<uint32_t, uint32_t, Device>;
|
||||
|
||||
map_type map(num_nodes);
|
||||
Impl::TestInsert<map_type> test_insert(map, 2u * num_nodes, 1u);
|
||||
|
@ -244,13 +244,11 @@ void test_failed_insert(uint32_t num_nodes) {
|
|||
|
||||
template <typename Device>
|
||||
void test_deep_copy(uint32_t num_nodes) {
|
||||
typedef Kokkos::UnorderedMap<uint32_t, uint32_t, Device> map_type;
|
||||
typedef Kokkos::UnorderedMap<const uint32_t, const uint32_t, Device>
|
||||
const_map_type;
|
||||
using map_type = Kokkos::UnorderedMap<uint32_t, uint32_t, Device>;
|
||||
using const_map_type =
|
||||
Kokkos::UnorderedMap<const uint32_t, const uint32_t, Device>;
|
||||
|
||||
typedef typename map_type::HostMirror host_map_type;
|
||||
// typedef Kokkos::UnorderedMap<uint32_t, uint32_t, typename
|
||||
// Device::host_mirror_execution_space > host_map_type;
|
||||
using host_map_type = typename map_type::HostMirror;
|
||||
|
||||
map_type map;
|
||||
map.rehash(num_nodes, false);
|
||||
|
@ -295,7 +293,7 @@ void test_deep_copy(uint32_t num_nodes) {
|
|||
}
|
||||
}
|
||||
|
||||
// FIXME_HIP deadlock
|
||||
// FIXME_HIP wrong result in CI but works locally
|
||||
#ifndef KOKKOS_ENABLE_HIP
|
||||
// WORKAROUND MSVC
|
||||
#ifndef _WIN32
|
||||
|
@ -306,6 +304,7 @@ TEST(TEST_CATEGORY, UnorderedMap_insert) {
|
|||
}
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
TEST(TEST_CATEGORY, UnorderedMap_failed_insert) {
|
||||
for (int i = 0; i < 1000; ++i) test_failed_insert<TEST_EXECSPACE>(10000);
|
||||
|
@ -314,7 +313,6 @@ TEST(TEST_CATEGORY, UnorderedMap_failed_insert) {
|
|||
TEST(TEST_CATEGORY, UnorderedMap_deep_copy) {
|
||||
for (int i = 0; i < 2; ++i) test_deep_copy<TEST_EXECSPACE>(10000);
|
||||
}
|
||||
#endif
|
||||
|
||||
TEST(TEST_CATEGORY, UnorderedMap_valid_empty) {
|
||||
using Key = int;
|
||||
|
@ -326,6 +324,8 @@ TEST(TEST_CATEGORY, UnorderedMap_valid_empty) {
|
|||
n = Map{m.capacity()};
|
||||
n.rehash(m.capacity());
|
||||
Kokkos::deep_copy(n, m);
|
||||
ASSERT_TRUE(m.is_allocated());
|
||||
ASSERT_TRUE(n.is_allocated());
|
||||
}
|
||||
|
||||
} // namespace Test
|
||||
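
The new UnorderedMap_valid_empty test sizes a destination map to match the source and then deep-copies it. A hedged, stand-alone sketch of that pattern (key/value types and the capacity of 128 are arbitrary choices):

````cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_UnorderedMap.hpp>

// Sketch of the copy pattern behind the UnorderedMap_valid_empty test:
// size the destination to match the source, then deep_copy.
void clone_map() {
  using map_type =
      Kokkos::UnorderedMap<int, int, Kokkos::DefaultExecutionSpace>;

  map_type src(128);  // allocated with capacity for roughly 128 entries
  map_type dst;       // default-constructed, no allocation yet

  dst.rehash(src.capacity());   // allocate the destination to match
  Kokkos::deep_copy(dst, src);  // copy the contents across
}
````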
|
|
|
@ -55,14 +55,17 @@ namespace Impl {
|
|||
|
||||
template <typename Scalar, class Device>
|
||||
struct test_vector_insert {
|
||||
typedef Scalar scalar_type;
|
||||
typedef Device execution_space;
|
||||
using scalar_type = Scalar;
|
||||
using execution_space = Device;
|
||||
|
||||
template <typename Vector>
|
||||
void run_test(Vector& a) {
|
||||
int n = a.size();
|
||||
|
||||
auto it = a.begin();
|
||||
if (n > 0) {
|
||||
ASSERT_EQ(a.data(), &a[0]);
|
||||
}
|
||||
it += 15;
|
||||
ASSERT_EQ(*it, scalar_type(1));
|
||||
|
||||
|
@ -173,11 +176,42 @@ struct test_vector_insert {
|
|||
};
|
||||
|
||||
template <typename Scalar, class Device>
|
||||
struct test_vector_combinations {
|
||||
typedef test_vector_combinations<Scalar, Device> self_type;
|
||||
struct test_vector_allocate {
|
||||
using self_type = test_vector_allocate<Scalar, Device>;
|
||||
|
||||
typedef Scalar scalar_type;
|
||||
typedef Device execution_space;
|
||||
using scalar_type = Scalar;
|
||||
using execution_space = Device;
|
||||
|
||||
bool result = false;
|
||||
|
||||
template <typename Vector>
|
||||
Scalar run_me(unsigned int n) {
|
||||
{
|
||||
Vector v1;
|
||||
if (v1.is_allocated() == true) return false;
|
||||
|
||||
v1 = Vector(n, 1);
|
||||
Vector v2(v1);
|
||||
Vector v3(n, 1);
|
||||
|
||||
if (v1.is_allocated() == false) return false;
|
||||
if (v2.is_allocated() == false) return false;
|
||||
if (v3.is_allocated() == false) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
test_vector_allocate(unsigned int size) {
|
||||
result = run_me<Kokkos::vector<Scalar, Device> >(size);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Scalar, class Device>
|
||||
struct test_vector_combinations {
|
||||
using self_type = test_vector_combinations<Scalar, Device>;
|
||||
|
||||
using scalar_type = Scalar;
|
||||
using execution_space = Device;
|
||||
|
||||
Scalar reference;
|
||||
Scalar result;
|
||||
|
@ -231,7 +265,14 @@ void test_vector_combinations(unsigned int size) {
|
|||
ASSERT_EQ(test.reference, test.result);
|
||||
}
|
||||
|
||||
template <typename Scalar, typename Device>
|
||||
void test_vector_allocate(unsigned int size) {
|
||||
Impl::test_vector_allocate<Scalar, Device> test(size);
|
||||
ASSERT_TRUE(test.result);
|
||||
}
|
||||
|
||||
TEST(TEST_CATEGORY, vector_combination) {
|
||||
test_vector_allocate<int, TEST_EXECSPACE>(10);
|
||||
test_vector_combinations<int, TEST_EXECSPACE>(10);
|
||||
test_vector_combinations<int, TEST_EXECSPACE>(3057);
|
||||
}
|
||||
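
The new test_vector_allocate checks that a default-constructed Kokkos::vector holds no allocation until it is assigned or sized. A small sketch of the same idea, with an arbitrary element count of 10, assuming the container header `Kokkos_Vector.hpp`:

````cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_Vector.hpp>

// What test_vector_allocate checks, in isolation: allocation happens on
// sized construction or assignment, not on default construction.
void vector_allocation_demo() {
  Kokkos::vector<int> v1;           // default-constructed: nothing allocated yet
  // v1.is_allocated() == false at this point (the query this change adds)

  v1 = Kokkos::vector<int>(10, 1);  // ten elements, all initialized to 1
  Kokkos::vector<int> v2(v1);       // copy construction also allocates
  // v1.is_allocated() and v2.is_allocated() now both report true
}
````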
|
|
|
@ -91,10 +91,10 @@ struct TestViewCtorProp_EmbeddedDim {
|
|||
{
|
||||
// Two views
|
||||
auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1, vd1);
|
||||
typedef
|
||||
typename decltype(view_alloc_arg)::value_type CommonViewValueType;
|
||||
typedef typename Kokkos::View<CommonViewValueType*, ExecSpace> CVT;
|
||||
typedef typename CVT::HostMirror HostCVT;
|
||||
using CommonViewValueType =
|
||||
typename decltype(view_alloc_arg)::value_type;
|
||||
using CVT = typename Kokkos::View<CommonViewValueType*, ExecSpace>;
|
||||
using HostCVT = typename CVT::HostMirror;
|
||||
|
||||
// Construct View using the common type; for case of specialization, an
|
||||
// 'embedded_dim' would be stored by view_alloc_arg
|
||||
|
@ -128,10 +128,10 @@ struct TestViewCtorProp_EmbeddedDim {
|
|||
{
|
||||
// Single view
|
||||
auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1);
|
||||
typedef
|
||||
typename decltype(view_alloc_arg)::value_type CommonViewValueType;
|
||||
typedef typename Kokkos::View<CommonViewValueType*, ExecSpace> CVT;
|
||||
typedef typename CVT::HostMirror HostCVT;
|
||||
using CommonViewValueType =
|
||||
typename decltype(view_alloc_arg)::value_type;
|
||||
using CVT = typename Kokkos::View<CommonViewValueType*, ExecSpace>;
|
||||
using HostCVT = typename CVT::HostMirror;
|
||||
|
||||
// Construct View using the common type; for case of specialization, an
|
||||
// 'embedded_dim' would be stored by view_alloc_arg
|
||||
|
@ -161,10 +161,10 @@ struct TestViewCtorProp_EmbeddedDim {
|
|||
{
|
||||
// Two views
|
||||
auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1, vd1);
|
||||
typedef
|
||||
typename decltype(view_alloc_arg)::value_type CommonViewValueType;
|
||||
typedef typename Kokkos::View<CommonViewValueType*, ExecSpace> CVT;
|
||||
typedef typename CVT::HostMirror HostCVT;
|
||||
using CommonViewValueType =
|
||||
typename decltype(view_alloc_arg)::value_type;
|
||||
using CVT = typename Kokkos::View<CommonViewValueType*, ExecSpace>;
|
||||
using HostCVT = typename CVT::HostMirror;
|
||||
|
||||
// Construct View using the common type; for case of specialization, an
|
||||
// 'embedded_dim' would be stored by view_alloc_arg
|
||||
|
@ -182,10 +182,10 @@ struct TestViewCtorProp_EmbeddedDim {
|
|||
{
|
||||
// Single views
|
||||
auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1);
|
||||
typedef
|
||||
typename decltype(view_alloc_arg)::value_type CommonViewValueType;
|
||||
typedef typename Kokkos::View<CommonViewValueType*, ExecSpace> CVT;
|
||||
typedef typename CVT::HostMirror HostCVT;
|
||||
using CommonViewValueType =
|
||||
typename decltype(view_alloc_arg)::value_type;
|
||||
using CVT = typename Kokkos::View<CommonViewValueType*, ExecSpace>;
|
||||
using HostCVT = typename CVT::HostMirror;
|
||||
|
||||
// Construct View using the common type; for case of specialization, an
|
||||
// 'embedded_dim' would be stored by view_alloc_arg
|
||||
|
|
|
@ -2,7 +2,9 @@

KOKKOS_SUBPACKAGE(Core)

ADD_SUBDIRECTORY(src)
IF (NOT Kokkos_INSTALL_TESTING)
ADD_SUBDIRECTORY(src)
ENDIF()

KOKKOS_ADD_TEST_DIRECTORIES(unit_test)
KOKKOS_ADD_TEST_DIRECTORIES(perf_test)
|
@ -6,7 +6,8 @@
#if !defined(KOKKOS_FOR_SIERRA)

#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
#error "Don't include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
#error \
"Don't include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
#else
#define KOKKOS_CORE_CONFIG_H
#endif

@ -25,8 +26,8 @@
#cmakedefine KOKKOS_ENABLE_DEBUG
#cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
#cmakedefine KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK
#cmakedefine KOKKOS_ENABLE_PROFILING
#cmakedefine KOKKOS_ENABLE_PROFILING_LOAD_PRINT
#cmakedefine KOKKOS_ENABLE_TUNING

#cmakedefine KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION

@ -38,7 +39,8 @@
// any value of KOKKOS_USE_CUDA_UVM here. Doing this should prevent build
// warnings like this one:
//
// packages/kokkos/core/src/KokkosCore_config.h:13:1: warning: "KOKKOS_USE_CUDA_UVM" redefined
// packages/kokkos/core/src/KokkosCore_config.h:13:1: warning:
// "KOKKOS_USE_CUDA_UVM" redefined
//
// At some point, we should edit the test-build scripts in
// Trilinos/cmake/ctest/drivers/perseus/, and take

@ -100,4 +102,4 @@
#cmakedefine KOKKOS_USING_DEPRECATED_VIEW
#cmakedefine KOKKOS_ENABLE_CXX11

#endif // !defined(KOKKOS_FOR_SIERRA)
#endif // !defined(KOKKOS_FOR_SIERRA)
|
@ -49,11 +49,19 @@ SET(SOURCES
|
|||
)
|
||||
|
||||
IF(Kokkos_ENABLE_HIP)
|
||||
# FIXME requires TeamPolicy
|
||||
# FIXME HIP requires TeamPolicy
|
||||
LIST(REMOVE_ITEM SOURCES
|
||||
PerfTest_CustomReduction.cpp
|
||||
PerfTest_ExecSpacePartitioning.cpp
|
||||
)
|
||||
)
|
||||
ENDIF()
|
||||
|
||||
IF(Kokkos_ENABLE_OPENMPTARGET)
|
||||
# FIXME OPENMPTARGET requires TeamPolicy Reductions and Custom Reduction
|
||||
LIST(REMOVE_ITEM SOURCES
|
||||
PerfTest_CustomReduction.cpp
|
||||
PerfTest_ExecSpacePartitioning.cpp
|
||||
)
|
||||
ENDIF()
|
||||
|
||||
# Per #374, we always want to build this test, but we only want to run
|
||||
|
@ -76,7 +84,22 @@ IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC")
|
|||
ENDIF()
|
||||
|
||||
KOKKOS_ADD_EXECUTABLE_AND_TEST(
|
||||
PerformanceTest_TaskDag
|
||||
SOURCES test_taskdag.cpp
|
||||
PerformanceTest_Atomic
|
||||
SOURCES test_atomic.cpp
|
||||
CATEGORIES PERFORMANCE
|
||||
)
|
||||
|
||||
KOKKOS_ADD_EXECUTABLE_AND_TEST(
|
||||
PerformanceTest_Mempool
|
||||
SOURCES test_mempool.cpp
|
||||
CATEGORIES PERFORMANCE
|
||||
)
|
||||
|
||||
IF(NOT Kokkos_ENABLE_OPENMPTARGET)
|
||||
# FIXME OPENMPTARGET needs tasking
|
||||
KOKKOS_ADD_EXECUTABLE_AND_TEST(
|
||||
PerformanceTest_TaskDag
|
||||
SOURCES test_taskdag.cpp
|
||||
CATEGORIES PERFORMANCE
|
||||
)
|
||||
ENDIF()
|
||||
|
|
|
@ -53,7 +53,6 @@ TEST_TARGETS += test-atomic
|
|||
|
||||
#
|
||||
|
||||
ifneq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
|
||||
OBJ_MEMPOOL = test_mempool.o
|
||||
TARGETS += KokkosCore_PerformanceTest_Mempool
|
||||
TEST_TARGETS += test-mempool
|
||||
|
@ -63,7 +62,6 @@ TEST_TARGETS += test-mempool
|
|||
OBJ_TASKDAG = test_taskdag.o
|
||||
TARGETS += KokkosCore_PerformanceTest_TaskDAG
|
||||
TEST_TARGETS += test-taskdag
|
||||
endif
|
||||
|
||||
#
|
||||
|
||||
|
|
|
@ -51,12 +51,12 @@ namespace Kokkos {
|
|||
|
||||
template <class Type>
|
||||
struct Dot {
|
||||
typedef typename Type::execution_space execution_space;
|
||||
using execution_space = typename Type::execution_space;
|
||||
|
||||
static_assert(static_cast<unsigned>(Type::Rank) == static_cast<unsigned>(1),
|
||||
"Dot static_assert Fail: Rank != 1");
|
||||
|
||||
typedef double value_type;
|
||||
using value_type = double;
|
||||
|
||||
#if 1
|
||||
typename Type::const_type X;
|
||||
|
@ -83,12 +83,12 @@ struct Dot {
|
|||
|
||||
template <class Type>
|
||||
struct DotSingle {
|
||||
typedef typename Type::execution_space execution_space;
|
||||
using execution_space = typename Type::execution_space;
|
||||
|
||||
static_assert(static_cast<unsigned>(Type::Rank) == static_cast<unsigned>(1),
|
||||
"DotSingle static_assert Fail: Rank != 1");
|
||||
|
||||
typedef double value_type;
|
||||
using value_type = double;
|
||||
|
||||
#if 1
|
||||
typename Type::const_type X;
|
||||
|
@ -116,7 +116,7 @@ struct DotSingle {
|
|||
|
||||
template <class ScalarType, class VectorType>
|
||||
struct Scale {
|
||||
typedef typename VectorType::execution_space execution_space;
|
||||
using execution_space = typename VectorType::execution_space;
|
||||
|
||||
static_assert(static_cast<unsigned>(ScalarType::Rank) ==
|
||||
static_cast<unsigned>(0),
|
||||
|
@ -143,7 +143,7 @@ struct Scale {
|
|||
|
||||
template <class ScalarType, class ConstVectorType, class VectorType>
|
||||
struct AXPBY {
|
||||
typedef typename VectorType::execution_space execution_space;
|
||||
using execution_space = typename VectorType::execution_space;
|
||||
|
||||
static_assert(static_cast<unsigned>(ScalarType::Rank) ==
|
||||
static_cast<unsigned>(0),
|
||||
|
@ -185,7 +185,7 @@ namespace Kokkos {
|
|||
template <class ConstScalarType, class ConstVectorType, class VectorType>
|
||||
void axpby(const ConstScalarType& alpha, const ConstVectorType& X,
|
||||
const ConstScalarType& beta, const VectorType& Y) {
|
||||
typedef AXPBY<ConstScalarType, ConstVectorType, VectorType> functor;
|
||||
using functor = AXPBY<ConstScalarType, ConstVectorType, VectorType>;
|
||||
|
||||
parallel_for(Y.extent(0), functor(alpha, X, beta, Y));
|
||||
}
|
||||
|
@ -193,7 +193,7 @@ void axpby(const ConstScalarType& alpha, const ConstVectorType& X,
|
|||
/** \brief Y *= alpha */
|
||||
template <class ConstScalarType, class VectorType>
|
||||
void scale(const ConstScalarType& alpha, const VectorType& Y) {
|
||||
typedef Scale<ConstScalarType, VectorType> functor;
|
||||
using functor = Scale<ConstScalarType, VectorType>;
|
||||
|
||||
parallel_for(Y.extent(0), functor(alpha, Y));
|
||||
}
|
||||
|
@ -201,14 +201,14 @@ void scale(const ConstScalarType& alpha, const VectorType& Y) {
template <class ConstVectorType, class Finalize>
void dot(const ConstVectorType& X, const ConstVectorType& Y,
const Finalize& finalize) {
typedef Dot<ConstVectorType> functor;
using functor = Dot<ConstVectorType>;

parallel_reduce(X.extent(0), functor(X, Y), finalize);
}

template <class ConstVectorType, class Finalize>
void dot(const ConstVectorType& X, const Finalize& finalize) {
typedef DotSingle<ConstVectorType> functor;
using functor = DotSingle<ConstVectorType>;

parallel_reduce(X.extent(0), functor(X), finalize);
}
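
The Dot functor driven by the dot() wrappers above reduces over X.extent(0). For reference, the same reduction written with a lambda (the free function and its View parameters are placeholders for this sketch, not part of the benchmark):

````cpp
#include <Kokkos_Core.hpp>

// Lambda-based equivalent of the Dot functor: reduce the element-wise
// product of two rank-1 Views into a double.
double dot_product(const Kokkos::View<const double*>& X,
                   const Kokkos::View<const double*>& Y) {
  double result = 0.0;
  Kokkos::parallel_reduce(
      "dot_product", X.extent(0),
      KOKKOS_LAMBDA(const int i, double& update) { update += X(i) * Y(i); },
      result);
  return result;
}
````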
|
|
@ -58,7 +58,7 @@ namespace Test {
|
|||
// PostProcess : R(j,j) = result ; inv = 1 / result ;
|
||||
template <class VectorView, class ValueView>
|
||||
struct InvNorm2 : public Kokkos::DotSingle<VectorView> {
|
||||
typedef typename Kokkos::DotSingle<VectorView>::value_type value_type;
|
||||
using value_type = typename Kokkos::DotSingle<VectorView>::value_type;
|
||||
|
||||
ValueView Rjj;
|
||||
ValueView inv;
|
||||
|
@ -69,10 +69,7 @@ struct InvNorm2 : public Kokkos::DotSingle<VectorView> {
|
|||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void final(value_type& result) const {
|
||||
#ifndef KOKKOS_ENABLE_HIP // FIXME_HIP
|
||||
using std::sqrt;
|
||||
#endif
|
||||
result = sqrt(result);
|
||||
result = std::sqrt(result);
|
||||
Rjj() = result;
|
||||
inv() = (0 < result) ? 1.0 / result : 0;
|
||||
}
|
||||
|
@ -88,7 +85,7 @@ inline void invnorm2(const VectorView& x, const ValueView& r,
|
|||
// PostProcess : tmp = - ( R(j,k) = result );
|
||||
template <class VectorView, class ValueView>
|
||||
struct DotM : public Kokkos::Dot<VectorView> {
|
||||
typedef typename Kokkos::Dot<VectorView>::value_type value_type;
|
||||
using value_type = typename Kokkos::Dot<VectorView>::value_type;
|
||||
|
||||
ValueView Rjk;
|
||||
ValueView tmp;
|
||||
|
@ -113,16 +110,16 @@ inline void dot_neg(const VectorView& x, const VectorView& y,
|
|||
|
||||
template <typename Scalar, class DeviceType>
|
||||
struct ModifiedGramSchmidt {
|
||||
typedef DeviceType execution_space;
|
||||
typedef typename execution_space::size_type size_type;
|
||||
using execution_space = DeviceType;
|
||||
using size_type = typename execution_space::size_type;
|
||||
|
||||
typedef Kokkos::View<Scalar**, Kokkos::LayoutLeft, execution_space>
|
||||
multivector_type;
|
||||
using multivector_type =
|
||||
Kokkos::View<Scalar**, Kokkos::LayoutLeft, execution_space>;
|
||||
|
||||
typedef Kokkos::View<Scalar*, Kokkos::LayoutLeft, execution_space>
|
||||
vector_type;
|
||||
using vector_type =
|
||||
Kokkos::View<Scalar*, Kokkos::LayoutLeft, execution_space>;
|
||||
|
||||
typedef Kokkos::View<Scalar, Kokkos::LayoutLeft, execution_space> value_view;
|
||||
using value_view = Kokkos::View<Scalar, Kokkos::LayoutLeft, execution_space>;
|
||||
|
||||
multivector_type Q;
|
||||
multivector_type R;
|
||||
|
@ -243,9 +240,9 @@ TEST(default_exec, gramschmidt) {
int exp_end = 20;
int num_trials = 5;

if (command_line_num_args() > 1) exp_beg = atoi(command_line_arg(1));
if (command_line_num_args() > 2) exp_end = atoi(command_line_arg(2));
if (command_line_num_args() > 3) num_trials = atoi(command_line_arg(3));
if (command_line_num_args() > 1) exp_beg = std::stoi(command_line_arg(1));
if (command_line_num_args() > 2) exp_end = std::stoi(command_line_arg(2));
if (command_line_num_args() > 3) num_trials = std::stoi(command_line_arg(3));

EXPECT_NO_THROW(run_test_gramschmidt<Kokkos::DefaultExecutionSpace>(
exp_beg, exp_end, num_trials, Kokkos::DefaultExecutionSpace::name()));
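
The change swaps atoi for std::stoi throughout these tests. Unlike atoi, which silently returns 0 on malformed input, std::stoi reports failure by throwing, so bad command-line arguments are caught instead of becoming zeros. A stand-alone illustration (the function name and fallback handling are made up for the example):

````cpp
#include <iostream>
#include <stdexcept>
#include <string>

// Parse an integer argument, falling back to a default on bad input.
int parse_count(const char* arg, int fallback) {
  try {
    return std::stoi(arg);
  } catch (const std::invalid_argument&) {
    std::cerr << "not a number: " << arg << ", using " << fallback << "\n";
  } catch (const std::out_of_range&) {
    std::cerr << "out of range: " << arg << ", using " << fallback << "\n";
  }
  return fallback;
}
````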
|
@ -51,20 +51,20 @@ namespace Test {
|
|||
template <class DeviceType, typename CoordScalarType = double,
|
||||
typename GradScalarType = float>
|
||||
struct HexGrad {
|
||||
typedef DeviceType execution_space;
|
||||
typedef typename execution_space::size_type size_type;
|
||||
using execution_space = DeviceType;
|
||||
using size_type = typename execution_space::size_type;
|
||||
|
||||
typedef HexGrad<DeviceType, CoordScalarType, GradScalarType> self_type;
|
||||
using self_type = HexGrad<DeviceType, CoordScalarType, GradScalarType>;
|
||||
|
||||
// 3D array : ( ParallelWork , Space , Node )
|
||||
|
||||
enum { NSpace = 3, NNode = 8 };
|
||||
|
||||
typedef Kokkos::View<CoordScalarType * [NSpace][NNode], execution_space>
|
||||
elem_coord_type;
|
||||
using elem_coord_type =
|
||||
Kokkos::View<CoordScalarType * [NSpace][NNode], execution_space>;
|
||||
|
||||
typedef Kokkos::View<GradScalarType * [NSpace][NNode], execution_space>
|
||||
elem_grad_type;
|
||||
using elem_grad_type =
|
||||
Kokkos::View<GradScalarType * [NSpace][NNode], execution_space>;
|
||||
|
||||
elem_coord_type coords;
|
||||
elem_grad_type grad_op;
|
||||
|
@ -179,7 +179,7 @@ struct HexGrad {
|
|||
//--------------------------------------------------------------------------
|
||||
|
||||
struct Init {
|
||||
typedef typename self_type::execution_space execution_space;
|
||||
using execution_space = typename self_type::execution_space;
|
||||
|
||||
elem_coord_type coords;
|
||||
|
||||
|
@ -289,9 +289,9 @@ TEST(default_exec, hexgrad) {
|
|||
int exp_end = 20;
|
||||
int num_trials = 5;
|
||||
|
||||
if (command_line_num_args() > 1) exp_beg = atoi(command_line_arg(1));
|
||||
if (command_line_num_args() > 2) exp_end = atoi(command_line_arg(2));
|
||||
if (command_line_num_args() > 3) num_trials = atoi(command_line_arg(3));
|
||||
if (command_line_num_args() > 1) exp_beg = std::stoi(command_line_arg(1));
|
||||
if (command_line_num_args() > 2) exp_end = std::stoi(command_line_arg(2));
|
||||
if (command_line_num_args() > 3) num_trials = std::stoi(command_line_arg(3));
|
||||
|
||||
EXPECT_NO_THROW(run_test_hexgrad<Kokkos::DefaultExecutionSpace>(
|
||||
exp_beg, exp_end, num_trials, Kokkos::DefaultExecutionSpace::name()));
|
||||
|
|
|
@ -46,13 +46,13 @@ namespace Test {
|
|||
template <class DeviceType, typename ScalarType = double,
|
||||
typename TestLayout = Kokkos::LayoutRight>
|
||||
struct MultiDimRangePerf3D {
|
||||
typedef DeviceType execution_space;
|
||||
typedef typename execution_space::size_type size_type;
|
||||
using execution_space = DeviceType;
|
||||
using size_type = typename execution_space::size_type;
|
||||
|
||||
using iterate_type = Kokkos::Iterate;
|
||||
|
||||
typedef Kokkos::View<ScalarType ***, TestLayout, DeviceType> view_type;
|
||||
typedef typename view_type::HostMirror host_view_type;
|
||||
using view_type = Kokkos::View<ScalarType ***, TestLayout, DeviceType>;
|
||||
using host_view_type = typename view_type::HostMirror;
|
||||
|
||||
view_type A;
|
||||
view_type B;
|
||||
|
@ -108,8 +108,8 @@ struct MultiDimRangePerf3D {
|
|||
// This test performs multidim range over all dims
|
||||
view_type Atest("Atest", icount, jcount, kcount);
|
||||
view_type Btest("Btest", icount + 2, jcount + 2, kcount + 2);
|
||||
typedef MultiDimRangePerf3D<execution_space, ScalarType, TestLayout>
|
||||
FunctorType;
|
||||
using FunctorType =
|
||||
MultiDimRangePerf3D<execution_space, ScalarType, TestLayout>;
|
||||
|
||||
double dt_min = 0;
|
||||
|
||||
|
@ -125,10 +125,9 @@ struct MultiDimRangePerf3D {
policy_initB({{0, 0, 0}}, {{icount + 2, jcount + 2, kcount + 2}},
{{Ti, Tj, Tk}});

typedef typename Kokkos::MDRangePolicy<
using MDRangeType = typename Kokkos::MDRangePolicy<
Kokkos::Rank<3, iterate_type::Right, iterate_type::Right>,
execution_space>
MDRangeType;
execution_space>;
using tile_type = typename MDRangeType::tile_type;
using point_type = typename MDRangeType::point_type;
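
For orientation, a sketch of how an MDRangePolicy like the MDRangeType alias above is used: a rank-3 iteration space with explicit tile sizes. The view name, bounds, and 4x4x4 tiles are arbitrary example values, not taken from the benchmark:

````cpp
#include <cstdint>

#include <Kokkos_Core.hpp>

// Fill a rank-3 View over a tiled 3D iteration space.
void fill_3d(const Kokkos::View<double***>& A) {
  using policy3d = Kokkos::MDRangePolicy<Kokkos::Rank<3>>;

  const std::int64_t n0 = A.extent(0);
  const std::int64_t n1 = A.extent(1);
  const std::int64_t n2 = A.extent(2);

  // begin point, end point, tile sizes (the policy's point_type / tile_type)
  policy3d policy({{0, 0, 0}}, {{n0, n1, n2}}, {{4, 4, 4}});

  Kokkos::parallel_for(
      "fill_3d", policy, KOKKOS_LAMBDA(const int i, const int j, const int k) {
        A(i, j, k) = i + j + k;
      });
}
````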
|
@ -216,14 +215,15 @@ struct MultiDimRangePerf3D {
|
|||
policy_initB({{0, 0, 0}}, {{icount + 2, jcount + 2, kcount + 2}},
|
||||
{{Ti, Tj, Tk}});
|
||||
|
||||
// typedef typename Kokkos::MDRangePolicy<Kokkos::Rank<3,
|
||||
// iterate_type::Left, iterate_type::Left>, execution_space > MDRangeType;
|
||||
// using MDRangeType =
|
||||
// typename Kokkos::MDRangePolicy<
|
||||
// Kokkos::Rank<3, iterate_type::Left, iterate_type::Left>,
|
||||
// execution_space >;
|
||||
// using tile_type = typename MDRangeType::tile_type;
|
||||
// using point_type = typename MDRangeType::point_type;
|
||||
// Kokkos::MDRangePolicy<Kokkos::Rank<3, iterate_type::Left,
|
||||
// iterate_type::Left>, execution_space >
|
||||
// policy(point_type{{0,0,0}},point_type{{icount,jcount,kcount}},tile_type{{Ti,Tj,Tk}}
|
||||
// );
|
||||
// MDRangeType policy(point_type{{0,0,0}},
|
||||
// point_type{{icount,jcount,kcount}},
|
||||
// tile_type{{Ti,Tj,Tk}});
|
||||
Kokkos::MDRangePolicy<
|
||||
Kokkos::Rank<3, iterate_type::Left, iterate_type::Left>,
|
||||
execution_space>
|
||||
|
@ -306,14 +306,14 @@ struct RangePolicyCollapseTwo {
|
|||
// RangePolicy for 3D range, but will collapse only 2 dims => like Rank<2> for
|
||||
// multi-dim; unroll 2 dims in one-dim
|
||||
|
||||
typedef DeviceType execution_space;
|
||||
typedef typename execution_space::size_type size_type;
|
||||
typedef TestLayout layout;
|
||||
using execution_space = DeviceType;
|
||||
using size_type = typename execution_space::size_type;
|
||||
using layout = TestLayout;
|
||||
|
||||
using iterate_type = Kokkos::Iterate;
|
||||
|
||||
typedef Kokkos::View<ScalarType ***, TestLayout, DeviceType> view_type;
|
||||
typedef typename view_type::HostMirror host_view_type;
|
||||
using view_type = Kokkos::View<ScalarType ***, TestLayout, DeviceType>;
|
||||
using host_view_type = typename view_type::HostMirror;
|
||||
|
||||
view_type A;
|
||||
view_type B;
|
||||
|
@ -388,8 +388,8 @@ struct RangePolicyCollapseTwo {
|
|||
// This test refers to collapsing two dims while using the RangePolicy
|
||||
view_type Atest("Atest", icount, jcount, kcount);
|
||||
view_type Btest("Btest", icount + 2, jcount + 2, kcount + 2);
|
||||
typedef RangePolicyCollapseTwo<execution_space, ScalarType, TestLayout>
|
||||
FunctorType;
|
||||
using FunctorType =
|
||||
RangePolicyCollapseTwo<execution_space, ScalarType, TestLayout>;
|
||||
|
||||
long collapse_index_rangeA = 0;
|
||||
long collapse_index_rangeB = 0;
|
||||
|
@ -480,12 +480,12 @@ template <class DeviceType, typename ScalarType = double,
|
|||
struct RangePolicyCollapseAll {
|
||||
// RangePolicy for 3D range, but will collapse all dims
|
||||
|
||||
typedef DeviceType execution_space;
|
||||
typedef typename execution_space::size_type size_type;
|
||||
typedef TestLayout layout;
|
||||
using execution_space = DeviceType;
|
||||
using size_type = typename execution_space::size_type;
|
||||
using layout = TestLayout;
|
||||
|
||||
typedef Kokkos::View<ScalarType ***, TestLayout, DeviceType> view_type;
|
||||
typedef typename view_type::HostMirror host_view_type;
|
||||
using view_type = Kokkos::View<ScalarType ***, TestLayout, DeviceType>;
|
||||
using host_view_type = typename view_type::HostMirror;
|
||||
|
||||
view_type A;
|
||||
view_type B;
|
||||
|
@ -552,8 +552,8 @@ struct RangePolicyCollapseAll {
|
|||
// This test refers to collapsing all dims using the RangePolicy
|
||||
view_type Atest("Atest", icount, jcount, kcount);
|
||||
view_type Btest("Btest", icount + 2, jcount + 2, kcount + 2);
|
||||
typedef RangePolicyCollapseAll<execution_space, ScalarType, TestLayout>
|
||||
FunctorType;
|
||||
using FunctorType =
|
||||
RangePolicyCollapseAll<execution_space, ScalarType, TestLayout>;
|
||||
|
||||
const long flat_index_range = icount * jcount * kcount;
|
||||
Kokkos::RangePolicy<execution_space> policy(0, flat_index_range);
|
||||
|
|
|
@ -129,9 +129,9 @@ TEST(default_exec, custom_reduction) {
|
|||
int R = 1000;
|
||||
int num_trials = 1;
|
||||
|
||||
if (command_line_num_args() > 1) N = atoi(command_line_arg(1));
|
||||
if (command_line_num_args() > 2) R = atoi(command_line_arg(2));
|
||||
if (command_line_num_args() > 3) num_trials = atoi(command_line_arg(3));
|
||||
if (command_line_num_args() > 1) N = std::stoi(command_line_arg(1));
|
||||
if (command_line_num_args() > 2) R = std::stoi(command_line_arg(2));
|
||||
if (command_line_num_args() > 3) num_trials = std::stoi(command_line_arg(3));
|
||||
custom_reduction_test<double>(N, R, num_trials);
|
||||
}
|
||||
} // namespace Test
|
||||
|
|
|
@ -29,7 +29,7 @@ struct SpaceInstance<Kokkos::Cuda> {
|
|||
bool value = true;
|
||||
auto local_rank_str = std::getenv("CUDA_LAUNCH_BLOCKING");
|
||||
if (local_rank_str) {
|
||||
value = (std::atoi(local_rank_str) == 0);
|
||||
value = (std::stoi(local_rank_str) == 0);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
|
|
@ -49,7 +49,7 @@
|
|||
#include <Kokkos_Core.hpp>
|
||||
#include <impl/Kokkos_Timer.hpp>
|
||||
|
||||
typedef Kokkos::DefaultExecutionSpace exec_space;
|
||||
using exec_space = Kokkos::DefaultExecutionSpace;
|
||||
|
||||
#define RESET 0
|
||||
#define BRIGHT 1
|
||||
|
@ -80,9 +80,9 @@ void textcolor_standard() { textcolor(RESET, BLACK, WHITE); }
|
|||
|
||||
template <class T, class DEVICE_TYPE>
|
||||
struct ZeroFunctor {
|
||||
typedef DEVICE_TYPE execution_space;
|
||||
typedef typename Kokkos::View<T, execution_space> type;
|
||||
typedef typename Kokkos::View<T, execution_space>::HostMirror h_type;
|
||||
using execution_space = DEVICE_TYPE;
|
||||
using type = typename Kokkos::View<T, execution_space>;
|
||||
using h_type = typename Kokkos::View<T, execution_space>::HostMirror;
|
||||
type data;
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(int) const { data() = 0; }
|
||||
|
@ -94,8 +94,8 @@ struct ZeroFunctor {

template <class T, class DEVICE_TYPE>
struct AddFunctor {
typedef DEVICE_TYPE execution_space;
typedef Kokkos::View<T, execution_space> type;
using execution_space = DEVICE_TYPE;
using type = Kokkos::View<T, execution_space>;
type data;

KOKKOS_INLINE_FUNCTION
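
The AddFunctor benchmark has many threads accumulate into a single View element, which is only correct through an atomic update. A compact sketch of that core pattern (the function name and loop count are illustrative, not part of the benchmark):

````cpp
#include <Kokkos_Core.hpp>

// Accumulate into one rank-0 View element from many threads via an atomic.
long atomic_sum_demo(int loop) {
  Kokkos::View<long> total("total");  // rank-0 View holding one counter

  Kokkos::parallel_for(
      "atomic_add", loop,
      KOKKOS_LAMBDA(const int) { Kokkos::atomic_add(&total(), 1L); });

  long result = 0;
  Kokkos::deep_copy(result, total);  // bring the device-side counter back
  return result;
}
````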
@ -123,8 +123,8 @@ T AddLoop(int loop) {
|
|||
|
||||
template <class T, class DEVICE_TYPE>
|
||||
struct AddNonAtomicFunctor {
|
||||
typedef DEVICE_TYPE execution_space;
|
||||
typedef Kokkos::View<T, execution_space> type;
|
||||
using execution_space = DEVICE_TYPE;
|
||||
using type = Kokkos::View<T, execution_space>;
|
||||
type data;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
|
@ -166,8 +166,8 @@ T AddLoopSerial(int loop) {
|
|||
|
||||
template <class T, class DEVICE_TYPE>
|
||||
struct CASFunctor {
|
||||
typedef DEVICE_TYPE execution_space;
|
||||
typedef Kokkos::View<T, execution_space> type;
|
||||
using execution_space = DEVICE_TYPE;
|
||||
using type = Kokkos::View<T, execution_space>;
|
||||
type data;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
|
@ -204,8 +204,8 @@ T CASLoop(int loop) {
|
|||
|
||||
template <class T, class DEVICE_TYPE>
|
||||
struct CASNonAtomicFunctor {
|
||||
typedef DEVICE_TYPE execution_space;
|
||||
typedef Kokkos::View<T, execution_space> type;
|
||||
using execution_space = DEVICE_TYPE;
|
||||
using type = Kokkos::View<T, execution_space>;
|
||||
type data;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
|
@ -268,8 +268,8 @@ T CASLoopSerial(int loop) {
|
|||
|
||||
template <class T, class DEVICE_TYPE>
|
||||
struct ExchFunctor {
|
||||
typedef DEVICE_TYPE execution_space;
|
||||
typedef Kokkos::View<T, execution_space> type;
|
||||
using execution_space = DEVICE_TYPE;
|
||||
using type = Kokkos::View<T, execution_space>;
|
||||
type data, data2;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
|
@ -309,8 +309,8 @@ T ExchLoop(int loop) {
|
|||
|
||||
template <class T, class DEVICE_TYPE>
|
||||
struct ExchNonAtomicFunctor {
|
||||
typedef DEVICE_TYPE execution_space;
|
||||
typedef Kokkos::View<T, execution_space> type;
|
||||
using execution_space = DEVICE_TYPE;
|
||||
using type = Kokkos::View<T, execution_space>;
|
||||
type data, data2;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
|
@ -448,15 +448,15 @@ int main(int argc, char* argv[]) {
|
|||
|
||||
for (int i = 0; i < argc; i++) {
|
||||
if ((strcmp(argv[i], "--test") == 0)) {
|
||||
test = atoi(argv[++i]);
|
||||
test = std::stoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if ((strcmp(argv[i], "--type") == 0)) {
|
||||
type = atoi(argv[++i]);
|
||||
type = std::stoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if ((strcmp(argv[i], "-l") == 0) || (strcmp(argv[i], "--loop") == 0)) {
|
||||
loop = atoi(argv[++i]);
|
||||
loop = std::stoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -56,7 +56,7 @@ using MemorySpace = Kokkos::DefaultExecutionSpace::memory_space;
|
|||
using MemoryPool = Kokkos::MemoryPool<ExecSpace>;
|
||||
|
||||
struct TestFunctor {
|
||||
typedef Kokkos::View<uintptr_t*, ExecSpace> ptrs_type;
|
||||
using ptrs_type = Kokkos::View<uintptr_t*, ExecSpace>;
|
||||
|
||||
enum : unsigned { chunk = 32 };
|
||||
|
||||
|
@ -87,7 +87,7 @@ struct TestFunctor {
|
|||
|
||||
//----------------------------------------
|
||||
|
||||
typedef long value_type;
|
||||
using value_type = long;
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
|
@ -107,7 +107,7 @@ struct TestFunctor {
}

bool test_fill() {
typedef Kokkos::RangePolicy<ExecSpace, TagFill> policy;
using policy = Kokkos::RangePolicy<ExecSpace, TagFill>;

long result = 0;

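
The TagFill parameter in the policy above selects which operator() overload of the functor runs. A minimal illustration of that tag-dispatch mechanism, with tags and the view name invented for the example:

````cpp
#include <Kokkos_Core.hpp>

// The tag in the RangePolicy picks the matching operator() overload.
struct TagFill {};
struct TagClear {};

struct Worker {
  Kokkos::View<int*> data;

  KOKKOS_INLINE_FUNCTION
  void operator()(TagFill, const int i) const { data(i) = i; }

  KOKKOS_INLINE_FUNCTION
  void operator()(TagClear, const int i) const { data(i) = 0; }
};

void run(const Kokkos::View<int*>& v) {
  Worker w{v};
  Kokkos::parallel_for(Kokkos::RangePolicy<TagFill>(0, v.extent(0)), w);
  Kokkos::parallel_for(Kokkos::RangePolicy<TagClear>(0, v.extent(0)), w);
}
````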
@ -134,7 +134,7 @@ struct TestFunctor {
|
|||
}
|
||||
|
||||
void test_del() {
|
||||
typedef Kokkos::RangePolicy<ExecSpace, TagDel> policy;
|
||||
using policy = Kokkos::RangePolicy<ExecSpace, TagDel>;
|
||||
|
||||
Kokkos::parallel_for(policy(0, range_iter), *this);
|
||||
Kokkos::fence();
|
||||
|
@ -164,7 +164,7 @@ struct TestFunctor {
|
|||
}
|
||||
|
||||
bool test_alloc_dealloc() {
|
||||
typedef Kokkos::RangePolicy<ExecSpace, TagAllocDealloc> policy;
|
||||
using policy = Kokkos::RangePolicy<ExecSpace, TagAllocDealloc>;
|
||||
|
||||
long error_count = 0;
|
||||
|
||||
|
@ -203,22 +203,22 @@ int main(int argc, char* argv[]) {
|
|||
total_alloc_size = atol(a + strlen(alloc_size_flag));
|
||||
|
||||
if (!strncmp(a, super_size_flag, strlen(super_size_flag)))
|
||||
min_superblock_size = atoi(a + strlen(super_size_flag));
|
||||
min_superblock_size = std::stoi(a + strlen(super_size_flag));
|
||||
|
||||
if (!strncmp(a, fill_stride_flag, strlen(fill_stride_flag)))
|
||||
fill_stride = atoi(a + strlen(fill_stride_flag));
|
||||
fill_stride = std::stoi(a + strlen(fill_stride_flag));
|
||||
|
||||
if (!strncmp(a, fill_level_flag, strlen(fill_level_flag)))
|
||||
fill_level = atoi(a + strlen(fill_level_flag));
|
||||
fill_level = std::stoi(a + strlen(fill_level_flag));
|
||||
|
||||
if (!strncmp(a, chunk_span_flag, strlen(chunk_span_flag)))
|
||||
chunk_span = atoi(a + strlen(chunk_span_flag));
|
||||
chunk_span = std::stoi(a + strlen(chunk_span_flag));
|
||||
|
||||
if (!strncmp(a, repeat_outer_flag, strlen(repeat_outer_flag)))
|
||||
repeat_outer = atoi(a + strlen(repeat_outer_flag));
|
||||
repeat_outer = std::stoi(a + strlen(repeat_outer_flag));
|
||||
|
||||
if (!strncmp(a, repeat_inner_flag, strlen(repeat_inner_flag)))
|
||||
repeat_inner = atoi(a + strlen(repeat_inner_flag));
|
||||
repeat_inner = std::stoi(a + strlen(repeat_inner_flag));
|
||||
}
|
||||
|
||||
int chunk_span_bytes = 0;
|
||||
|
|
|
@ -91,7 +91,7 @@ struct TestFib {
|
|||
using MemberType = typename Scheduler::member_type;
|
||||
using FutureType = Kokkos::BasicFuture<long, Scheduler>;
|
||||
|
||||
typedef long value_type;
|
||||
using value_type = long;
|
||||
|
||||
FutureType dep[2];
|
||||
const value_type n;
|
||||
|
@ -152,13 +152,13 @@ int main(int argc, char* argv[]) {
|
|||
total_alloc_size = atol(a + strlen(alloc_size));
|
||||
|
||||
if (!strncmp(a, super_size, strlen(super_size)))
|
||||
min_superblock_size = atoi(a + strlen(super_size));
|
||||
min_superblock_size = std::stoi(a + strlen(super_size));
|
||||
|
||||
if (!strncmp(a, repeat_outer, strlen(repeat_outer)))
|
||||
test_repeat_outer = atoi(a + strlen(repeat_outer));
|
||||
test_repeat_outer = std::stoi(a + strlen(repeat_outer));
|
||||
|
||||
if (!strncmp(a, input_value, strlen(input_value)))
|
||||
fib_input = atoi(a + strlen(input_value));
|
||||
fib_input = std::stoi(a + strlen(input_value));
|
||||
}
|
||||
|
||||
const long fib_output = eval_fib(fib_input);
|
||||
|
@ -182,7 +182,7 @@ int main(int argc, char* argv[]) {
|
|||
|
||||
using Scheduler = Kokkos::TaskSchedulerMultiple<ExecSpace>;
|
||||
|
||||
typedef TestFib<Scheduler> Functor;
|
||||
using Functor = TestFib<Scheduler>;
|
||||
|
||||
Kokkos::initialize(argc, argv);
|
||||
|
||||
|
|
|
@ -8,50 +8,49 @@ KOKKOS_INCLUDE_DIRECTORIES(
|
|||
INSTALL (DIRECTORY
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/"
|
||||
DESTINATION ${KOKKOS_HEADER_DIR}
|
||||
FILES_MATCHING PATTERN "*.hpp"
|
||||
FILES_MATCHING
|
||||
PATTERN "*.hpp"
|
||||
PATTERN "*.h"
|
||||
)
|
||||
|
||||
SET(KOKKOS_CORE_SRCS)
|
||||
APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp)
|
||||
SET(KOKKOS_CORE_HEADERS)
|
||||
APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp)
|
||||
APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.hpp)
|
||||
|
||||
IF (KOKKOS_ENABLE_ROCM)
|
||||
APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/ROCm/*.cpp)
|
||||
IF (KOKKOS_ENABLE_ETI)
|
||||
APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/eti/ROCm/*.cpp)
|
||||
ENDIF()
|
||||
ENDIF()
|
||||
|
||||
IF (KOKKOS_ENABLE_CUDA)
|
||||
APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.cpp)
|
||||
IF (KOKKOS_ENABLE_ETI)
|
||||
APPEND_GLOB(KOKKOS_CORE_SRC ${CMAKE_CURRENT_SOURCE_DIR/eti/Cuda/*.cpp)
|
||||
ENDIF()
|
||||
APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.hpp)
|
||||
ENDIF()
|
||||
|
||||
IF (KOKKOS_ENABLE_OPENMP)
|
||||
APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/*.cpp)
|
||||
IF (KOKKOS_ENABLE_ETI)
|
||||
APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/eti/OpenMP/*.cpp)
|
||||
ENDIF()
|
||||
APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/*.hpp)
|
||||
ENDIF()
|
||||
|
||||
IF (KOKKOS_ENABLE_OPENMPTARGET)
|
||||
APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.cpp)
|
||||
APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.hpp)
|
||||
ENDIF()
|
||||
|
||||
IF (KOKKOS_ENABLE_PTHREAD)
|
||||
APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Threads/*.cpp)
|
||||
IF (KOKKOS_ENABLE_ETI)
|
||||
APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/eti/Threads/*.cpp)
|
||||
ENDIF()
|
||||
APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Threads/*.hpp)
|
||||
ENDIF()
|
||||
|
||||
IF (KOKKOS_ENABLE_HIP)
|
||||
APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HIP/*.cpp)
|
||||
APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HIP/*.hpp)
|
||||
ENDIF()
|
||||
|
||||
IF (KOKKOS_ENABLE_HPX)
|
||||
APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.cpp)
|
||||
APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.hpp)
|
||||
ENDIF()
|
||||
|
||||
IF (NOT KOKKOS_ENABLE_MEMKIND)
|
||||
|
@ -59,9 +58,7 @@ IF (NOT KOKKOS_ENABLE_MEMKIND)
|
|||
ENDIF()
|
||||
|
||||
IF (KOKKOS_ENABLE_SERIAL)
|
||||
IF (KOKKOS_ENABLE_ETI)
|
||||
APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/eti/Serial/*.cpp)
|
||||
ENDIF()
|
||||
APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.hpp)
|
||||
ELSE()
|
||||
LIST(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/Kokkos_Serial.cpp)
|
||||
LIST(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/Kokkos_Serial_task.cpp)
|
||||
|
@ -70,6 +67,8 @@ ENDIF()
|
|||
KOKKOS_ADD_LIBRARY(
|
||||
kokkoscore
|
||||
SOURCES ${KOKKOS_CORE_SRCS}
|
||||
HEADERS ${KOKKOS_CORE_HEADERS}
|
||||
ADD_BUILD_OPTIONS # core should be given all the necessary compiler/linker flags
|
||||
)
|
||||
|
||||
SET_TARGET_PROPERTIES(kokkoscore PROPERTIES VERSION ${Kokkos_VERSION})
|
||||
|
|
|
@ -48,7 +48,6 @@
|
|||
#include <Kokkos_Macros.hpp>
|
||||
#if defined(__CUDACC__) && defined(KOKKOS_ENABLE_CUDA)
|
||||
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
#include <cstdio>
|
||||
|
||||
|
@ -60,10 +59,8 @@
|
|||
// type is not allowed As a result, recreate cuda_parallel_launch and associated
|
||||
// code
|
||||
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
#include <impl/Kokkos_Profiling_Interface.hpp>
|
||||
#include <impl/Kokkos_Tools.hpp>
|
||||
#include <typeinfo>
|
||||
#endif
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
@ -1291,8 +1288,8 @@ struct DeviceIterateTile {
using point_type = typename RP::point_type;

struct VoidDummy {};
typedef typename std::conditional<std::is_same<Tag, void>::value, VoidDummy,
Tag>::type usable_tag;
using usable_tag = typename std::conditional<std::is_same<Tag, void>::value,
VoidDummy, Tag>::type;

DeviceIterateTile(const RP& rp, const Functor& func)
: m_rp{rp}, m_func{func} {}
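
The usable_tag alias above substitutes a dummy type when the work tag is void, so the alias always names a valid type. The std::conditional mechanism in isolation, with stand-in types:

````cpp
#include <type_traits>

// Map a possibly-void work tag onto a type that is always nameable.
struct VoidDummy {};

template <class Tag>
using usable_tag_t =
    typename std::conditional<std::is_same<Tag, void>::value, VoidDummy,
                              Tag>::type;

static_assert(std::is_same<usable_tag_t<void>, VoidDummy>::value, "");
static_assert(std::is_same<usable_tag_t<int>, int>::value, "");
````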
@ -1310,6 +1307,8 @@ struct DeviceIterateTile {
65535; // not true for blockIdx.x for newer archs
if (RP::rank == 2) {
const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], 1);
KOKKOS_ASSERT(block.x > 0);
KOKKOS_ASSERT(block.y > 0);
const dim3 grid(
std::min((m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x,
maxblocks),

@ -1319,6 +1318,9 @@ struct DeviceIterateTile {
CudaLaunch<DeviceIterateTile>(*this, grid, block);
} else if (RP::rank == 3) {
const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]);
KOKKOS_ASSERT(block.x > 0);
KOKKOS_ASSERT(block.y > 0);
KOKKOS_ASSERT(block.z > 0);
const dim3 grid(
std::min((m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x,
maxblocks),
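
Each grid dimension above is the extent rounded up to whole blocks and clamped to the hardware limit. The arithmetic in isolation, with no CUDA types needed for the illustration:

````cpp
#include <algorithm>

// Round the extent up to whole blocks, then clamp to the hardware limit.
inline unsigned blocks_for(unsigned extent, unsigned block,
                           unsigned maxblocks) {
  const unsigned needed = (extent + block - 1) / block;  // ceil(extent/block)
  return std::min(needed, maxblocks);
}
// e.g. blocks_for(1000, 128, 65535) == 8; a very large extent clamps to 65535
````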
@ -1332,6 +1334,8 @@ struct DeviceIterateTile {
|
|||
// threadIdx.z
|
||||
const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], m_rp.m_tile[2],
|
||||
m_rp.m_tile[3]);
|
||||
KOKKOS_ASSERT(block.y > 0);
|
||||
KOKKOS_ASSERT(block.z > 0);
|
||||
const dim3 grid(
|
||||
std::min(
|
||||
static_cast<index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1]),
|
||||
|
@ -1346,6 +1350,7 @@ struct DeviceIterateTile {
|
|||
// threadIdx.z
|
||||
const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1],
|
||||
m_rp.m_tile[2] * m_rp.m_tile[3], m_rp.m_tile[4]);
|
||||
KOKKOS_ASSERT(block.z > 0);
|
||||
const dim3 grid(
|
||||
std::min(
|
||||
static_cast<index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1]),
|
||||
|
|
|
@ -48,9 +48,7 @@
|
|||
#include <Kokkos_Macros.hpp>
|
||||
#if defined(__CUDACC__) && defined(KOKKOS_ENABLE_CUDA)
|
||||
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
#include <cstdio>
|
||||
|
||||
#include <utility>
|
||||
|
||||
|
@ -60,10 +58,8 @@
|
|||
// type is not allowed use existing Kokkos functionality, e.g. max blocks, once
|
||||
// resolved
|
||||
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
#include <impl/Kokkos_Profiling_Interface.hpp>
|
||||
#include <impl/Kokkos_Tools.hpp>
|
||||
#include <typeinfo>
|
||||
#endif
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
|
|
@ -60,9 +60,7 @@
|
|||
#include <impl/Kokkos_Error.hpp>
|
||||
#include <impl/Kokkos_MemorySpace.hpp>
|
||||
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
#include <impl/Kokkos_Profiling_Interface.hpp>
|
||||
#endif
|
||||
#include <impl/Kokkos_Tools.hpp>
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
@ -75,8 +73,8 @@ namespace {
|
|||
static std::atomic<int> num_uvm_allocations(0);
|
||||
|
||||
cudaStream_t get_deep_copy_stream() {
|
||||
static cudaStream_t s = 0;
|
||||
if (s == 0) {
|
||||
static cudaStream_t s = nullptr;
|
||||
if (s == nullptr) {
|
||||
cudaStreamCreate(&s);
|
||||
}
|
||||
return s;
|
||||
|
@ -201,6 +199,10 @@ CudaHostPinnedSpace::CudaHostPinnedSpace() {}
|
|||
// <editor-fold desc="allocate()"> {{{1
|
||||
|
||||
void *CudaSpace::allocate(const size_t arg_alloc_size) const {
|
||||
return allocate("[unlabeled]", arg_alloc_size);
|
||||
}
|
||||
void *CudaSpace::allocate(const char *arg_label, const size_t arg_alloc_size,
|
||||
const size_t arg_logical_size) const {
|
||||
void *ptr = nullptr;
|
||||
|
||||
auto error_code = cudaMalloc(&ptr, arg_alloc_size);
|
||||
|
@ -213,10 +215,22 @@ void *CudaSpace::allocate(const size_t arg_alloc_size) const {
|
|||
Experimental::RawMemoryAllocationFailure::AllocationMechanism::
|
||||
CudaMalloc);
|
||||
}
|
||||
|
||||
if (Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
const size_t reported_size =
|
||||
(arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
|
||||
Kokkos::Profiling::allocateData(
|
||||
Kokkos::Profiling::make_space_handle(name()), arg_label, ptr,
|
||||
reported_size);
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void *CudaUVMSpace::allocate(const size_t arg_alloc_size) const {
|
||||
return allocate("[unlabeled]", arg_alloc_size);
|
||||
}
|
||||
void *CudaUVMSpace::allocate(const char *arg_label, const size_t arg_alloc_size,
|
||||
const size_t arg_logical_size) const {
|
||||
void *ptr = nullptr;
|
||||
|
||||
Cuda::impl_static_fence();
|
||||
|
@ -243,11 +257,22 @@ void *CudaUVMSpace::allocate(const size_t arg_alloc_size) const {
|
|||
}
|
||||
}
|
||||
Cuda::impl_static_fence();
|
||||
|
||||
if (Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
const size_t reported_size =
|
||||
(arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
|
||||
Kokkos::Profiling::allocateData(
|
||||
Kokkos::Profiling::make_space_handle(name()), arg_label, ptr,
|
||||
reported_size);
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void *CudaHostPinnedSpace::allocate(const size_t arg_alloc_size) const {
|
||||
return allocate("[unlabeled]", arg_alloc_size);
|
||||
}
|
||||
void *CudaHostPinnedSpace::allocate(const char *arg_label,
|
||||
const size_t arg_alloc_size,
|
||||
const size_t arg_logical_size) const {
|
||||
void *ptr = nullptr;
|
||||
|
||||
auto error_code = cudaHostAlloc(&ptr, arg_alloc_size, cudaHostAllocDefault);
|
||||
|
@ -260,24 +285,56 @@ void *CudaHostPinnedSpace::allocate(const size_t arg_alloc_size) const {
Experimental::RawMemoryAllocationFailure::AllocationMechanism::
CudaHostAlloc);
}

if (Kokkos::Profiling::profileLibraryLoaded()) {
const size_t reported_size =
(arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
Kokkos::Profiling::allocateData(
Kokkos::Profiling::make_space_handle(name()), arg_label, ptr,
reported_size);
}
return ptr;
}

// </editor-fold> end allocate() }}}1
//==============================================================================

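
Each allocate() above now reports to the profiling interface when a tool is loaded, preferring the caller-supplied logical size over the raw allocation size. The shared shape of those hooks, as a sketch: the free function, the "ExampleSpace" name, and the label are placeholders, while the Profiling calls are the ones used in the diff.

````cpp
#include <Kokkos_Core.hpp>

// Report an allocation to a loaded profiling tool, using the logical size
// if the caller supplied one, otherwise the raw allocation size.
void report_allocation(const char* arg_label, void* ptr,
                       size_t arg_alloc_size, size_t arg_logical_size) {
  if (Kokkos::Profiling::profileLibraryLoaded()) {
    const size_t reported_size =
        (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
    Kokkos::Profiling::allocateData(
        Kokkos::Profiling::make_space_handle("ExampleSpace"), arg_label, ptr,
        reported_size);
  }
}
````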
void CudaSpace::deallocate(void *const arg_alloc_ptr,
|
||||
const size_t /* arg_alloc_size */) const {
|
||||
const size_t arg_alloc_size) const {
|
||||
deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size);
|
||||
}
|
||||
void CudaSpace::deallocate(const char *arg_label, void *const arg_alloc_ptr,
|
||||
const size_t arg_alloc_size,
|
||||
const size_t arg_logical_size) const {
|
||||
if (Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
const size_t reported_size =
|
||||
(arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
|
||||
Kokkos::Profiling::deallocateData(
|
||||
Kokkos::Profiling::make_space_handle(name()), arg_label, arg_alloc_ptr,
|
||||
reported_size);
|
||||
}
|
||||
|
||||
try {
|
||||
CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
|
||||
} catch (...) {
|
||||
}
|
||||
}
|
||||
|
||||
void CudaUVMSpace::deallocate(void *const arg_alloc_ptr,
|
||||
const size_t /* arg_alloc_size */) const {
|
||||
const size_t arg_alloc_size) const {
|
||||
deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size);
|
||||
}
|
||||
|
||||
void CudaUVMSpace::deallocate(const char *arg_label, void *const arg_alloc_ptr,
|
||||
const size_t arg_alloc_size
|
||||
|
||||
,
|
||||
const size_t arg_logical_size) const {
|
||||
Cuda::impl_static_fence();
|
||||
if (Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
const size_t reported_size =
|
||||
(arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
|
||||
Kokkos::Profiling::deallocateData(
|
||||
Kokkos::Profiling::make_space_handle(name()), arg_label, arg_alloc_ptr,
|
||||
reported_size);
|
||||
}
|
||||
try {
|
||||
if (arg_alloc_ptr != nullptr) {
|
||||
Kokkos::Impl::num_uvm_allocations--;
|
||||
|
@ -289,7 +346,21 @@ void CudaUVMSpace::deallocate(void *const arg_alloc_ptr,
|
|||
}
|
||||
|
||||
void CudaHostPinnedSpace::deallocate(void *const arg_alloc_ptr,
|
||||
const size_t /* arg_alloc_size */) const {
|
||||
const size_t arg_alloc_size) const {
|
||||
deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size);
|
||||
}
|
||||
|
||||
void CudaHostPinnedSpace::deallocate(const char *arg_label,
|
||||
void *const arg_alloc_ptr,
|
||||
const size_t arg_alloc_size,
|
||||
const size_t arg_logical_size) const {
|
||||
if (Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
const size_t reported_size =
|
||||
(arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
|
||||
Kokkos::Profiling::deallocateData(
|
||||
Kokkos::Profiling::make_space_handle(name()), arg_label, arg_alloc_ptr,
|
||||
reported_size);
|
||||
}
|
||||
try {
|
||||
CUDA_SAFE_CALL(cudaFreeHost(arg_alloc_ptr));
|
||||
} catch (...) {
|
||||
|
@ -321,7 +392,8 @@ SharedAllocationRecord<Kokkos::CudaSpace, void>::attach_texture_object(
|
|||
size_t const alloc_size) {
|
||||
enum { TEXTURE_BOUND_1D = 1u << 27 };
|
||||
|
||||
if ((alloc_ptr == 0) || (sizeof_alias * TEXTURE_BOUND_1D <= alloc_size)) {
|
||||
if ((alloc_ptr == nullptr) ||
|
||||
(sizeof_alias * TEXTURE_BOUND_1D <= alloc_size)) {
|
||||
std::ostringstream msg;
|
||||
msg << "Kokkos::CudaSpace ERROR: Cannot attach texture object to"
|
||||
<< " alloc_ptr(" << alloc_ptr << ")"
|
||||
|
@ -434,48 +506,36 @@ void SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>::deallocate(
|
|||
// <editor-fold desc="SharedAllocationRecord destructors"> {{{1
|
||||
|
||||
SharedAllocationRecord<Kokkos::CudaSpace, void>::~SharedAllocationRecord() {
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
const char *label = nullptr;
|
||||
if (Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
SharedAllocationHeader header;
|
||||
Kokkos::Impl::DeepCopy<CudaSpace, HostSpace>(
|
||||
Kokkos::Impl::DeepCopy<Kokkos::CudaSpace, HostSpace>(
|
||||
&header, RecordBase::m_alloc_ptr, sizeof(SharedAllocationHeader));
|
||||
|
||||
Kokkos::Profiling::deallocateData(
|
||||
Kokkos::Profiling::SpaceHandle(Kokkos::CudaSpace::name()),
|
||||
header.m_label, data(), size());
|
||||
label = header.label();
|
||||
}
|
||||
#endif
|
||||
|
||||
m_space.deallocate(SharedAllocationRecord<void, void>::m_alloc_ptr,
|
||||
SharedAllocationRecord<void, void>::m_alloc_size);
|
||||
auto alloc_size = SharedAllocationRecord<void, void>::m_alloc_size;
|
||||
m_space.deallocate(label, SharedAllocationRecord<void, void>::m_alloc_ptr,
|
||||
alloc_size, (alloc_size - sizeof(SharedAllocationHeader)));
|
||||
}
|
||||
|
||||
SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::~SharedAllocationRecord() {
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
const char *label = nullptr;
|
||||
if (Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Cuda::impl_static_fence(); // Make sure I can access the label ...
|
||||
Kokkos::Profiling::deallocateData(
|
||||
Kokkos::Profiling::SpaceHandle(Kokkos::CudaUVMSpace::name()),
|
||||
RecordBase::m_alloc_ptr->m_label, data(), size());
|
||||
label = RecordBase::m_alloc_ptr->m_label;
|
||||
}
|
||||
#endif
|
||||
|
||||
m_space.deallocate(SharedAllocationRecord<void, void>::m_alloc_ptr,
|
||||
SharedAllocationRecord<void, void>::m_alloc_size);
|
||||
m_space.deallocate(label, SharedAllocationRecord<void, void>::m_alloc_ptr,
|
||||
SharedAllocationRecord<void, void>::m_alloc_size,
|
||||
(SharedAllocationRecord<void, void>::m_alloc_size -
|
||||
sizeof(SharedAllocationHeader)));
|
||||
}
|
||||
|
||||
SharedAllocationRecord<Kokkos::CudaHostPinnedSpace,
|
||||
void>::~SharedAllocationRecord() {
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
if (Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::deallocateData(
|
||||
Kokkos::Profiling::SpaceHandle(Kokkos::CudaHostPinnedSpace::name()),
|
||||
RecordBase::m_alloc_ptr->m_label, data(), size());
|
||||
}
|
||||
#endif
|
||||
|
||||
m_space.deallocate(SharedAllocationRecord<void, void>::m_alloc_ptr,
|
||||
SharedAllocationRecord<void, void>::m_alloc_size);
|
||||
m_space.deallocate(RecordBase::m_alloc_ptr->m_label,
|
||||
SharedAllocationRecord<void, void>::m_alloc_ptr,
|
||||
SharedAllocationRecord<void, void>::m_alloc_size,
|
||||
(SharedAllocationRecord<void, void>::m_alloc_size -
|
||||
sizeof(SharedAllocationHeader)));
|
||||
}
|
||||
|
||||
// </editor-fold> end SharedAllocationRecord destructors }}}1
|
||||
|
@ -499,13 +559,6 @@ SharedAllocationRecord<Kokkos::CudaSpace, void>::SharedAllocationRecord(
|
|||
sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc),
|
||||
m_tex_obj(0),
|
||||
m_space(arg_space) {
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
if (Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::allocateData(
|
||||
Kokkos::Profiling::SpaceHandle(arg_space.name()), arg_label, data(),
|
||||
arg_alloc_size);
|
||||
}
|
||||
#endif
|
||||
|
||||
SharedAllocationHeader header;
|
||||
|
||||
|
@ -537,13 +590,6 @@ SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::SharedAllocationRecord(
|
|||
sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc),
|
||||
m_tex_obj(0),
|
||||
m_space(arg_space) {
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
if (Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::allocateData(
|
||||
Kokkos::Profiling::SpaceHandle(arg_space.name()), arg_label, data(),
|
||||
arg_alloc_size);
|
||||
}
|
||||
#endif
|
||||
// Fill in the Header information, directly accessible via UVM
|
||||
|
||||
RecordBase::m_alloc_ptr->m_record = this;
|
||||
|
@ -572,13 +618,6 @@ SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>::
|
|||
arg_alloc_size),
|
||||
sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc),
|
||||
m_space(arg_space) {
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
if (Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::allocateData(
|
||||
Kokkos::Profiling::SpaceHandle(arg_space.name()), arg_label, data(),
|
||||
arg_alloc_size);
|
||||
}
|
||||
#endif
|
||||
// Fill in the Header information, directly accessible on the host
|
||||
|
||||
RecordBase::m_alloc_ptr->m_record = this;
|
||||
|
@@ -599,7 +638,7 @@ SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>::
void *SharedAllocationRecord<Kokkos::CudaSpace, void>::allocate_tracked(
    const Kokkos::CudaSpace &arg_space, const std::string &arg_alloc_label,
    const size_t arg_alloc_size) {
  if (!arg_alloc_size) return (void *)0;
  if (!arg_alloc_size) return nullptr;

  SharedAllocationRecord *const r =
      allocate(arg_space, arg_alloc_label, arg_alloc_size);

@@ -611,7 +650,7 @@ void *SharedAllocationRecord<Kokkos::CudaSpace, void>::allocate_tracked(

void SharedAllocationRecord<Kokkos::CudaSpace, void>::deallocate_tracked(
    void *const arg_alloc_ptr) {
  if (arg_alloc_ptr != 0) {
  if (arg_alloc_ptr != nullptr) {
    SharedAllocationRecord *const r = get_record(arg_alloc_ptr);

    RecordBase::decrement(r);

@@ -636,7 +675,7 @@ void *SharedAllocationRecord<Kokkos::CudaSpace, void>::reallocate_tracked(
void *SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::allocate_tracked(
    const Kokkos::CudaUVMSpace &arg_space, const std::string &arg_alloc_label,
    const size_t arg_alloc_size) {
  if (!arg_alloc_size) return (void *)0;
  if (!arg_alloc_size) return nullptr;

  SharedAllocationRecord *const r =
      allocate(arg_space, arg_alloc_label, arg_alloc_size);

@@ -648,7 +687,7 @@ void *SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::allocate_tracked(

void SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::deallocate_tracked(
    void *const arg_alloc_ptr) {
  if (arg_alloc_ptr != 0) {
  if (arg_alloc_ptr != nullptr) {
    SharedAllocationRecord *const r = get_record(arg_alloc_ptr);

    RecordBase::decrement(r);

@@ -674,7 +713,7 @@ void *
SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>::allocate_tracked(
    const Kokkos::CudaHostPinnedSpace &arg_space,
    const std::string &arg_alloc_label, const size_t arg_alloc_size) {
  if (!arg_alloc_size) return (void *)0;
  if (!arg_alloc_size) return nullptr;

  SharedAllocationRecord *const r =
      allocate(arg_space, arg_alloc_label, arg_alloc_size);

@@ -687,7 +726,7 @@ SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>::allocate_tracked(
void SharedAllocationRecord<Kokkos::CudaHostPinnedSpace,
                            void>::deallocate_tracked(void *const
                                                          arg_alloc_ptr) {
  if (arg_alloc_ptr != 0) {
  if (arg_alloc_ptr != nullptr) {
    SharedAllocationRecord *const r = get_record(arg_alloc_ptr);

    RecordBase::decrement(r);

@@ -726,7 +765,7 @@ SharedAllocationRecord<Kokkos::CudaSpace, void>::get_record(void *alloc_ptr) {
  Header head;

  Header const *const head_cuda =
      alloc_ptr ? Header::get_header(alloc_ptr) : (Header *)0;
      alloc_ptr ? Header::get_header(alloc_ptr) : nullptr;

  if (alloc_ptr) {
    Kokkos::Impl::DeepCopy<HostSpace, CudaSpace>(

@@ -734,7 +773,7 @@ SharedAllocationRecord<Kokkos::CudaSpace, void>::get_record(void *alloc_ptr) {
  }

  RecordCuda *const record =
      alloc_ptr ? static_cast<RecordCuda *>(head.m_record) : (RecordCuda *)0;
      alloc_ptr ? static_cast<RecordCuda *>(head.m_record) : nullptr;

  if (!alloc_ptr || record->m_alloc_ptr != head_cuda) {
    Kokkos::Impl::throw_runtime_exception(

@@ -751,7 +790,7 @@ SharedAllocationRecord<Kokkos::CudaUVMSpace, void> *SharedAllocationRecord<
  using RecordCuda = SharedAllocationRecord<Kokkos::CudaUVMSpace, void>;

  Header *const h =
      alloc_ptr ? reinterpret_cast<Header *>(alloc_ptr) - 1 : (Header *)0;
      alloc_ptr ? reinterpret_cast<Header *>(alloc_ptr) - 1 : nullptr;

  if (!alloc_ptr || h->m_record->m_alloc_ptr != h) {
    Kokkos::Impl::throw_runtime_exception(

@@ -769,7 +808,7 @@ SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>
  using RecordCuda = SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>;

  Header *const h =
      alloc_ptr ? reinterpret_cast<Header *>(alloc_ptr) - 1 : (Header *)0;
      alloc_ptr ? reinterpret_cast<Header *>(alloc_ptr) - 1 : nullptr;

  if (!alloc_ptr || h->m_record->m_alloc_ptr != h) {
    Kokkos::Impl::throw_runtime_exception(
@@ -48,22 +48,102 @@
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_CUDA

#include <iostream>
#include <Cuda/Kokkos_Cuda_Error.hpp>

namespace Kokkos {
namespace Impl {

template <class DriverType, class LaunchBounds, bool Large>
struct CudaGetMaxBlockSize;
inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties,
                                         cudaFuncAttributes const& attributes,
                                         int block_size, size_t dynamic_shmem) {
  // Limits due do registers/SM
  int const regs_per_sm     = properties.regsPerMultiprocessor;
  int const regs_per_thread = attributes.numRegs;
  int const max_blocks_regs = regs_per_sm / (regs_per_thread * block_size);

template <class DriverType, class LaunchBounds>
int cuda_get_max_block_size(const typename DriverType::functor_type& f,
                            const size_t vector_length,
                            const size_t shmem_extra_block,
                            const size_t shmem_extra_thread) {
  return CudaGetMaxBlockSize<DriverType, LaunchBounds, true>::get_block_size(
      f, vector_length, shmem_extra_block, shmem_extra_thread);
  // Limits due to shared memory/SM
  size_t const shmem_per_sm            = properties.sharedMemPerMultiprocessor;
  size_t const shmem_per_block         = properties.sharedMemPerBlock;
  size_t const static_shmem            = attributes.sharedSizeBytes;
  size_t const dynamic_shmem_per_block = attributes.maxDynamicSharedSizeBytes;
  size_t const total_shmem             = static_shmem + dynamic_shmem;

  int const max_blocks_shmem =
      total_shmem > shmem_per_block || dynamic_shmem > dynamic_shmem_per_block
          ? 0
          : (total_shmem > 0 ? (int)shmem_per_sm / total_shmem
                             : max_blocks_regs);

  // Limits due to blocks/SM
#if CUDA_VERSION >= 11000
  int const max_blocks_per_sm = properties.maxBlocksPerMultiProcessor;
#else
  int const max_blocks_per_sm = [&properties]() {
    switch (properties.major) {
      case 3: return 16;
      case 5:
      case 6: return 32;
      case 7: {
        int isTuring = properties.minor == 5;
        return (isTuring) ? 16 : 32;
      }
      default:
        throw_runtime_exception("Unknown device in cuda block size deduction");
        return 0;
    }
  }();
#endif

  // Overall occupancy in blocks
  return std::min({max_blocks_regs, max_blocks_shmem, max_blocks_per_sm});
}

template <typename UnaryFunction, typename LaunchBounds>
inline int cuda_deduce_block_size(bool early_termination,
                                  cudaDeviceProp const& properties,
                                  cudaFuncAttributes const& attributes,
                                  UnaryFunction block_size_to_dynamic_shmem,
                                  LaunchBounds) {
  // Limits
  int const max_threads_per_sm = properties.maxThreadsPerMultiProcessor;
  // unsure if I need to do that or if this is already accounted for in the
  // functor attributes
  int const max_threads_per_block =
      std::min(LaunchBounds::maxTperB == 0 ? (int)properties.maxThreadsPerBlock
                                           : (int)LaunchBounds::maxTperB,
               attributes.maxThreadsPerBlock);
  int const min_blocks_per_sm =
      LaunchBounds::minBperSM == 0 ? 1 : LaunchBounds::minBperSM;

  // Recorded maximum
  int opt_block_size     = 0;
  int opt_threads_per_sm = 0;

  for (int block_size = max_threads_per_block; block_size > 0;
       block_size -= 32) {
    size_t const dynamic_shmem = block_size_to_dynamic_shmem(block_size);

    int blocks_per_sm = cuda_max_active_blocks_per_sm(
        properties, attributes, block_size, dynamic_shmem);

    int threads_per_sm = blocks_per_sm * block_size;

    if (threads_per_sm > max_threads_per_sm) {
      blocks_per_sm  = max_threads_per_sm / block_size;
      threads_per_sm = blocks_per_sm * block_size;
    }

    if (blocks_per_sm >= min_blocks_per_sm) {
      if (threads_per_sm >= opt_threads_per_sm) {
        opt_block_size     = block_size;
        opt_threads_per_sm = threads_per_sm;
      }
    }

    if (early_termination && blocks_per_sm != 0) break;
  }

  return opt_block_size;
}

template <class FunctorType, class LaunchBounds>
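For orientation, here is a minimal, hypothetical sketch of how the new `cuda_deduce_block_size` helper above could be driven. The kernel, the wrapper function, and the zero-shared-memory lambda are invented for illustration only; the real callers are the rewritten `cuda_get_max_block_size` / `cuda_get_opt_block_size` shown further down in this diff, and the helper is assumed to be visible through the Kokkos CUDA backend headers.

````cpp
#include <Kokkos_Core.hpp>
#include <cuda_runtime.h>

__global__ void my_kernel() {}  // hypothetical stand-in for a Kokkos driver kernel

int pick_block_size_example() {
  // Query device and kernel properties, as the real callers do via CudaInternal.
  cudaDeviceProp props;
  cudaGetDeviceProperties(&props, 0);
  cudaFuncAttributes attr;
  cudaFuncGetAttributes(&attr, my_kernel);

  // This toy case requests no dynamic shared memory for any block size.
  auto no_dynamic_shmem = [](int /*block_size*/) -> size_t { return 0; };

  // early_termination == true mirrors the "max" flavor of the deduction;
  // passing false scans all candidate block sizes for the best occupancy.
  return Kokkos::Impl::cuda_deduce_block_size(
      true, props, attr, no_dynamic_shmem, Kokkos::LaunchBounds<>{});
}
````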
@ -72,295 +152,24 @@ int cuda_get_max_block_size(const CudaInternal* cuda_instance,
|
|||
const FunctorType& f, const size_t vector_length,
|
||||
const size_t shmem_block,
|
||||
const size_t shmem_thread) {
|
||||
const int min_blocks_per_sm =
|
||||
LaunchBounds::minBperSM == 0 ? 1 : LaunchBounds::minBperSM;
|
||||
const int max_threads_per_block = LaunchBounds::maxTperB == 0
|
||||
? cuda_instance->m_maxThreadsPerBlock
|
||||
: LaunchBounds::maxTperB;
|
||||
(void)cuda_instance;
|
||||
|
||||
const int regs_per_thread = attr.numRegs;
|
||||
const int regs_per_sm = cuda_instance->m_regsPerSM;
|
||||
const int shmem_per_sm = cuda_instance->m_shmemPerSM;
|
||||
const int max_shmem_per_block = cuda_instance->m_maxShmemPerBlock;
|
||||
const int max_blocks_per_sm = cuda_instance->m_maxBlocksPerSM;
|
||||
const int max_threads_per_sm = cuda_instance->m_maxThreadsPerSM;
|
||||
auto const& prop = Kokkos::Cuda().cuda_device_prop();
|
||||
|
||||
int block_size = std::min(attr.maxThreadsPerBlock, max_threads_per_block);
|
||||
auto const block_size_to_dynamic_shmem = [&f, vector_length, shmem_block,
|
||||
shmem_thread](int block_size) {
|
||||
size_t const functor_shmem =
|
||||
Kokkos::Impl::FunctorTeamShmemSize<FunctorType>::value(
|
||||
f, block_size / vector_length);
|
||||
|
||||
int functor_shmem =
|
||||
FunctorTeamShmemSize<FunctorType>::value(f, block_size / vector_length);
|
||||
int total_shmem = shmem_block + shmem_thread * (block_size / vector_length) +
|
||||
functor_shmem + attr.sharedSizeBytes;
|
||||
int max_blocks_regs = regs_per_sm / (regs_per_thread * block_size);
|
||||
int max_blocks_shmem =
|
||||
(total_shmem < max_shmem_per_block)
|
||||
? (total_shmem > 0 ? shmem_per_sm / total_shmem : max_blocks_regs)
|
||||
: 0;
|
||||
int blocks_per_sm = std::min(max_blocks_regs, max_blocks_shmem);
|
||||
int threads_per_sm = blocks_per_sm * block_size;
|
||||
if (threads_per_sm > max_threads_per_sm) {
|
||||
blocks_per_sm = max_threads_per_sm / block_size;
|
||||
threads_per_sm = blocks_per_sm * block_size;
|
||||
}
|
||||
int opt_block_size = (blocks_per_sm >= min_blocks_per_sm) ? block_size : 0;
|
||||
int opt_threads_per_sm = threads_per_sm;
|
||||
// printf("BlockSizeMax: %i Shmem: %i %i %i %i Regs: %i %i Blocks: %i %i
|
||||
// Achieved: %i %i Opt: %i %i\n",block_size,
|
||||
// shmem_per_sm,max_shmem_per_block,functor_shmem,total_shmem,
|
||||
// regs_per_sm,regs_per_thread,max_blocks_shmem,max_blocks_regs,blocks_per_sm,threads_per_sm,opt_block_size,opt_threads_per_sm);
|
||||
block_size -= 32;
|
||||
while ((blocks_per_sm == 0) && (block_size >= 32)) {
|
||||
functor_shmem =
|
||||
FunctorTeamShmemSize<FunctorType>::value(f, block_size / vector_length);
|
||||
total_shmem = shmem_block + shmem_thread * (block_size / vector_length) +
|
||||
functor_shmem + attr.sharedSizeBytes;
|
||||
max_blocks_regs = regs_per_sm / (regs_per_thread * block_size);
|
||||
max_blocks_shmem =
|
||||
(total_shmem < max_shmem_per_block)
|
||||
? (total_shmem > 0 ? shmem_per_sm / total_shmem : max_blocks_regs)
|
||||
: 0;
|
||||
blocks_per_sm = std::min(max_blocks_regs, max_blocks_shmem);
|
||||
threads_per_sm = blocks_per_sm * block_size;
|
||||
if (threads_per_sm > max_threads_per_sm) {
|
||||
blocks_per_sm = max_threads_per_sm / block_size;
|
||||
threads_per_sm = blocks_per_sm * block_size;
|
||||
}
|
||||
if ((blocks_per_sm >= min_blocks_per_sm) &&
|
||||
(blocks_per_sm <= max_blocks_per_sm)) {
|
||||
if (threads_per_sm >= opt_threads_per_sm) {
|
||||
opt_block_size = block_size;
|
||||
opt_threads_per_sm = threads_per_sm;
|
||||
}
|
||||
}
|
||||
// printf("BlockSizeMax: %i Shmem: %i %i %i %i Regs: %i %i Blocks: %i %i
|
||||
// Achieved: %i %i Opt: %i %i\n",block_size,
|
||||
// shmem_per_sm,max_shmem_per_block,functor_shmem,total_shmem,
|
||||
// regs_per_sm,regs_per_thread,max_blocks_shmem,max_blocks_regs,blocks_per_sm,threads_per_sm,opt_block_size,opt_threads_per_sm);
|
||||
block_size -= 32;
|
||||
}
|
||||
return opt_block_size;
|
||||
}
|
||||
size_t const dynamic_shmem = shmem_block +
|
||||
shmem_thread * (block_size / vector_length) +
|
||||
functor_shmem;
|
||||
return dynamic_shmem;
|
||||
};
|
||||
|
||||
template <class DriverType>
|
||||
struct CudaGetMaxBlockSize<DriverType, Kokkos::LaunchBounds<>, true> {
|
||||
static int get_block_size(const typename DriverType::functor_type& f,
|
||||
const size_t vector_length,
|
||||
const size_t shmem_extra_block,
|
||||
const size_t shmem_extra_thread) {
|
||||
int numBlocks;
|
||||
int blockSize = 1024;
|
||||
int sharedmem =
|
||||
shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
|
||||
FunctorTeamShmemSize<typename DriverType::functor_type>::value(
|
||||
f, blockSize / vector_length);
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks, cuda_parallel_launch_constant_memory<DriverType>, blockSize,
|
||||
sharedmem);
|
||||
|
||||
if (numBlocks > 0) return blockSize;
|
||||
while (blockSize > 32 && numBlocks == 0) {
|
||||
blockSize /= 2;
|
||||
sharedmem =
|
||||
shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
|
||||
FunctorTeamShmemSize<typename DriverType::functor_type>::value(
|
||||
f, blockSize / vector_length);
|
||||
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks, cuda_parallel_launch_constant_memory<DriverType>,
|
||||
blockSize, sharedmem);
|
||||
}
|
||||
int blockSizeUpperBound = blockSize * 2;
|
||||
while (blockSize < blockSizeUpperBound && numBlocks > 0) {
|
||||
blockSize += 32;
|
||||
sharedmem =
|
||||
shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
|
||||
FunctorTeamShmemSize<typename DriverType::functor_type>::value(
|
||||
f, blockSize / vector_length);
|
||||
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks, cuda_parallel_launch_constant_memory<DriverType>,
|
||||
blockSize, sharedmem);
|
||||
}
|
||||
return blockSize - 32;
|
||||
}
|
||||
};
|
||||
|
||||
template <class DriverType>
|
||||
struct CudaGetMaxBlockSize<DriverType, Kokkos::LaunchBounds<>, false> {
|
||||
static int get_block_size(const typename DriverType::functor_type& f,
|
||||
const size_t vector_length,
|
||||
const size_t shmem_extra_block,
|
||||
const size_t shmem_extra_thread) {
|
||||
int numBlocks;
|
||||
|
||||
unsigned int blockSize = 1024;
|
||||
unsigned int sharedmem =
|
||||
shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
|
||||
FunctorTeamShmemSize<typename DriverType::functor_type>::value(
|
||||
f, blockSize / vector_length);
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks, cuda_parallel_launch_local_memory<DriverType>, blockSize,
|
||||
sharedmem);
|
||||
|
||||
if (numBlocks > 0) return blockSize;
|
||||
while (blockSize > 32 && numBlocks == 0) {
|
||||
blockSize /= 2;
|
||||
sharedmem =
|
||||
shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
|
||||
FunctorTeamShmemSize<typename DriverType::functor_type>::value(
|
||||
f, blockSize / vector_length);
|
||||
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks, cuda_parallel_launch_local_memory<DriverType>, blockSize,
|
||||
sharedmem);
|
||||
}
|
||||
unsigned int blockSizeUpperBound = blockSize * 2;
|
||||
while (blockSize < blockSizeUpperBound && numBlocks > 0) {
|
||||
blockSize += 32;
|
||||
sharedmem =
|
||||
shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
|
||||
FunctorTeamShmemSize<typename DriverType::functor_type>::value(
|
||||
f, blockSize / vector_length);
|
||||
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks, cuda_parallel_launch_local_memory<DriverType>, blockSize,
|
||||
sharedmem);
|
||||
}
|
||||
return blockSize - 32;
|
||||
}
|
||||
};
|
||||
|
||||
template <class DriverType, unsigned int MaxThreadsPerBlock,
|
||||
unsigned int MinBlocksPerSM>
|
||||
struct CudaGetMaxBlockSize<
|
||||
DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
|
||||
true> {
|
||||
static int get_block_size(const typename DriverType::functor_type& f,
|
||||
const size_t vector_length,
|
||||
const size_t shmem_extra_block,
|
||||
const size_t shmem_extra_thread) {
|
||||
int numBlocks = 0, oldNumBlocks = 0;
|
||||
unsigned int blockSize = MaxThreadsPerBlock;
|
||||
unsigned int sharedmem =
|
||||
shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
|
||||
FunctorTeamShmemSize<typename DriverType::functor_type>::value(
|
||||
f, blockSize / vector_length);
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks,
|
||||
cuda_parallel_launch_constant_memory<DriverType, MaxThreadsPerBlock,
|
||||
MinBlocksPerSM>,
|
||||
blockSize, sharedmem);
|
||||
|
||||
if (static_cast<unsigned int>(numBlocks) >= MinBlocksPerSM)
|
||||
return blockSize;
|
||||
|
||||
while (blockSize > 32 &&
|
||||
static_cast<unsigned int>(numBlocks) < MinBlocksPerSM) {
|
||||
blockSize /= 2;
|
||||
sharedmem =
|
||||
shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
|
||||
FunctorTeamShmemSize<typename DriverType::functor_type>::value(
|
||||
f, blockSize / vector_length);
|
||||
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks, cuda_parallel_launch_constant_memory<DriverType>,
|
||||
blockSize, sharedmem);
|
||||
}
|
||||
unsigned int blockSizeUpperBound =
|
||||
(blockSize * 2 < MaxThreadsPerBlock ? blockSize * 2
|
||||
: MaxThreadsPerBlock);
|
||||
while (blockSize<blockSizeUpperBound&& static_cast<unsigned int>(numBlocks)>
|
||||
MinBlocksPerSM) {
|
||||
blockSize += 32;
|
||||
sharedmem =
|
||||
shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
|
||||
FunctorTeamShmemSize<typename DriverType::functor_type>::value(
|
||||
f, blockSize / vector_length);
|
||||
oldNumBlocks = numBlocks;
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks, cuda_parallel_launch_constant_memory<DriverType>,
|
||||
blockSize, sharedmem);
|
||||
}
|
||||
if (static_cast<unsigned int>(oldNumBlocks) >= MinBlocksPerSM)
|
||||
return blockSize - 32;
|
||||
return -1;
|
||||
}
|
||||
};
|
||||
|
||||
template <class DriverType, unsigned int MaxThreadsPerBlock,
|
||||
unsigned int MinBlocksPerSM>
|
||||
struct CudaGetMaxBlockSize<
|
||||
DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
|
||||
false> {
|
||||
static int get_block_size(const typename DriverType::functor_type& f,
|
||||
const size_t vector_length,
|
||||
const size_t shmem_extra_block,
|
||||
const size_t shmem_extra_thread) {
|
||||
int numBlocks = 0, oldNumBlocks = 0;
|
||||
unsigned int blockSize = MaxThreadsPerBlock;
|
||||
int sharedmem =
|
||||
shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
|
||||
FunctorTeamShmemSize<typename DriverType::functor_type>::value(
|
||||
f, blockSize / vector_length);
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks,
|
||||
cuda_parallel_launch_local_memory<DriverType, MaxThreadsPerBlock,
|
||||
MinBlocksPerSM>,
|
||||
blockSize, sharedmem);
|
||||
if (static_cast<unsigned int>(numBlocks) >= MinBlocksPerSM)
|
||||
return blockSize;
|
||||
|
||||
while (blockSize > 32 &&
|
||||
static_cast<unsigned int>(numBlocks) < MinBlocksPerSM) {
|
||||
blockSize /= 2;
|
||||
sharedmem =
|
||||
shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
|
||||
FunctorTeamShmemSize<typename DriverType::functor_type>::value(
|
||||
f, blockSize / vector_length);
|
||||
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks, cuda_parallel_launch_local_memory<DriverType>, blockSize,
|
||||
sharedmem);
|
||||
}
|
||||
unsigned int blockSizeUpperBound =
|
||||
(blockSize * 2 < MaxThreadsPerBlock ? blockSize * 2
|
||||
: MaxThreadsPerBlock);
|
||||
while (blockSize < blockSizeUpperBound &&
|
||||
static_cast<unsigned int>(numBlocks) >= MinBlocksPerSM) {
|
||||
blockSize += 32;
|
||||
sharedmem =
|
||||
shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
|
||||
FunctorTeamShmemSize<typename DriverType::functor_type>::value(
|
||||
f, blockSize / vector_length);
|
||||
oldNumBlocks = numBlocks;
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks, cuda_parallel_launch_local_memory<DriverType>, blockSize,
|
||||
sharedmem);
|
||||
}
|
||||
if (static_cast<unsigned int>(oldNumBlocks) >= MinBlocksPerSM)
|
||||
return blockSize - 32;
|
||||
return -1;
|
||||
}
|
||||
};
|
||||
|
||||
template <class DriverType, class LaunchBounds, bool Large>
|
||||
struct CudaGetOptBlockSize;
|
||||
|
||||
template <class DriverType, class LaunchBounds>
|
||||
int cuda_get_opt_block_size(const typename DriverType::functor_type& f,
|
||||
const size_t vector_length,
|
||||
const size_t shmem_extra_block,
|
||||
const size_t shmem_extra_thread) {
|
||||
return CudaGetOptBlockSize<
|
||||
DriverType, LaunchBounds,
|
||||
// LaunchBounds::launch_mechanism == Kokkos::Experimental::LaunchDefault ?
|
||||
// (( CudaTraits::ConstantMemoryUseThreshold <
|
||||
// sizeof(DriverType) )?
|
||||
// Kokkos::Experimental::CudaLaunchConstantMemory:Kokkos::Experimental::CudaLaunchLocalMemory):
|
||||
// LaunchBounds::launch_mechanism
|
||||
(CudaTraits::ConstantMemoryUseThreshold <
|
||||
sizeof(DriverType))>::get_block_size(f, vector_length, shmem_extra_block,
|
||||
shmem_extra_thread);
|
||||
return cuda_deduce_block_size(true, prop, attr, block_size_to_dynamic_shmem,
|
||||
LaunchBounds{});
|
||||
}
|
||||
|
||||
template <class FunctorType, class LaunchBounds>
|
||||
|
@ -369,221 +178,26 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
|
|||
const FunctorType& f, const size_t vector_length,
|
||||
const size_t shmem_block,
|
||||
const size_t shmem_thread) {
|
||||
const int min_blocks_per_sm =
|
||||
LaunchBounds::minBperSM == 0 ? 1 : LaunchBounds::minBperSM;
|
||||
const int max_threads_per_block = LaunchBounds::maxTperB == 0
|
||||
? cuda_instance->m_maxThreadsPerBlock
|
||||
: LaunchBounds::maxTperB;
|
||||
(void)cuda_instance;
|
||||
|
||||
const int regs_per_thread = attr.numRegs;
|
||||
const int regs_per_sm = cuda_instance->m_regsPerSM;
|
||||
const int shmem_per_sm = cuda_instance->m_shmemPerSM;
|
||||
const int max_shmem_per_block = cuda_instance->m_maxShmemPerBlock;
|
||||
const int max_blocks_per_sm = cuda_instance->m_maxBlocksPerSM;
|
||||
const int max_threads_per_sm = cuda_instance->m_maxThreadsPerSM;
|
||||
auto const& prop = Kokkos::Cuda().cuda_device_prop();
|
||||
|
||||
int block_size = std::min(attr.maxThreadsPerBlock, max_threads_per_block);
|
||||
auto const block_size_to_dynamic_shmem = [&f, vector_length, shmem_block,
|
||||
shmem_thread](int block_size) {
|
||||
size_t const functor_shmem =
|
||||
Kokkos::Impl::FunctorTeamShmemSize<FunctorType>::value(
|
||||
f, block_size / vector_length);
|
||||
|
||||
int functor_shmem =
|
||||
FunctorTeamShmemSize<FunctorType>::value(f, block_size / vector_length);
|
||||
int total_shmem = shmem_block + shmem_thread * (block_size / vector_length) +
|
||||
functor_shmem + attr.sharedSizeBytes;
|
||||
int max_blocks_regs = regs_per_sm / (regs_per_thread * block_size);
|
||||
int max_blocks_shmem =
|
||||
(total_shmem < max_shmem_per_block)
|
||||
? (total_shmem > 0 ? shmem_per_sm / total_shmem : max_blocks_regs)
|
||||
: 0;
|
||||
int blocks_per_sm = std::min(max_blocks_regs, max_blocks_shmem);
|
||||
int threads_per_sm = blocks_per_sm * block_size;
|
||||
if (threads_per_sm > max_threads_per_sm) {
|
||||
blocks_per_sm = max_threads_per_sm / block_size;
|
||||
threads_per_sm = blocks_per_sm * block_size;
|
||||
}
|
||||
int opt_block_size = (blocks_per_sm >= min_blocks_per_sm) ? block_size : 0;
|
||||
int opt_threads_per_sm = threads_per_sm;
|
||||
size_t const dynamic_shmem = shmem_block +
|
||||
shmem_thread * (block_size / vector_length) +
|
||||
functor_shmem;
|
||||
return dynamic_shmem;
|
||||
};
|
||||
|
||||
block_size -= 32;
|
||||
while ((block_size >= 32)) {
|
||||
functor_shmem =
|
||||
FunctorTeamShmemSize<FunctorType>::value(f, block_size / vector_length);
|
||||
total_shmem = shmem_block + shmem_thread * (block_size / vector_length) +
|
||||
functor_shmem + attr.sharedSizeBytes;
|
||||
max_blocks_regs = regs_per_sm / (regs_per_thread * block_size);
|
||||
max_blocks_shmem =
|
||||
(total_shmem < max_shmem_per_block)
|
||||
? (total_shmem > 0 ? shmem_per_sm / total_shmem : max_blocks_regs)
|
||||
: 0;
|
||||
blocks_per_sm = std::min(max_blocks_regs, max_blocks_shmem);
|
||||
threads_per_sm = blocks_per_sm * block_size;
|
||||
if (threads_per_sm > max_threads_per_sm) {
|
||||
blocks_per_sm = max_threads_per_sm / block_size;
|
||||
threads_per_sm = blocks_per_sm * block_size;
|
||||
}
|
||||
if ((blocks_per_sm >= min_blocks_per_sm) &&
|
||||
(blocks_per_sm <= max_blocks_per_sm)) {
|
||||
if (threads_per_sm >= opt_threads_per_sm) {
|
||||
opt_block_size = block_size;
|
||||
opt_threads_per_sm = threads_per_sm;
|
||||
}
|
||||
}
|
||||
block_size -= 32;
|
||||
}
|
||||
return opt_block_size;
|
||||
return cuda_deduce_block_size(false, prop, attr, block_size_to_dynamic_shmem,
|
||||
LaunchBounds{});
|
||||
}
|
||||
|
||||
template <class DriverType>
|
||||
struct CudaGetOptBlockSize<DriverType, Kokkos::LaunchBounds<0, 0>, true> {
|
||||
static int get_block_size(const typename DriverType::functor_type& f,
|
||||
const size_t vector_length,
|
||||
const size_t shmem_extra_block,
|
||||
const size_t shmem_extra_thread) {
|
||||
int blockSize = 16;
|
||||
int numBlocks;
|
||||
int sharedmem;
|
||||
int maxOccupancy = 0;
|
||||
int bestBlockSize = 0;
|
||||
|
||||
while (blockSize < 1024) {
|
||||
blockSize *= 2;
|
||||
|
||||
// calculate the occupancy with that optBlockSize and check whether its
|
||||
// larger than the largest one found so far
|
||||
sharedmem =
|
||||
shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
|
||||
FunctorTeamShmemSize<typename DriverType::functor_type>::value(
|
||||
f, blockSize / vector_length);
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks, cuda_parallel_launch_constant_memory<DriverType>,
|
||||
blockSize, sharedmem);
|
||||
if (maxOccupancy < numBlocks * blockSize) {
|
||||
maxOccupancy = numBlocks * blockSize;
|
||||
bestBlockSize = blockSize;
|
||||
}
|
||||
}
|
||||
return bestBlockSize;
|
||||
}
|
||||
};
|
||||
|
||||
template <class DriverType>
|
||||
struct CudaGetOptBlockSize<DriverType, Kokkos::LaunchBounds<0, 0>, false> {
|
||||
static int get_block_size(const typename DriverType::functor_type& f,
|
||||
const size_t vector_length,
|
||||
const size_t shmem_extra_block,
|
||||
const size_t shmem_extra_thread) {
|
||||
int blockSize = 16;
|
||||
int numBlocks;
|
||||
int sharedmem;
|
||||
int maxOccupancy = 0;
|
||||
int bestBlockSize = 0;
|
||||
|
||||
while (blockSize < 1024) {
|
||||
blockSize *= 2;
|
||||
sharedmem =
|
||||
shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
|
||||
FunctorTeamShmemSize<typename DriverType::functor_type>::value(
|
||||
f, blockSize / vector_length);
|
||||
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks, cuda_parallel_launch_local_memory<DriverType>, blockSize,
|
||||
sharedmem);
|
||||
|
||||
if (maxOccupancy < numBlocks * blockSize) {
|
||||
maxOccupancy = numBlocks * blockSize;
|
||||
bestBlockSize = blockSize;
|
||||
}
|
||||
}
|
||||
return bestBlockSize;
|
||||
}
|
||||
};
|
||||
|
||||
template <class DriverType, unsigned int MaxThreadsPerBlock,
|
||||
unsigned int MinBlocksPerSM>
|
||||
struct CudaGetOptBlockSize<
|
||||
DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
|
||||
true> {
|
||||
static int get_block_size(const typename DriverType::functor_type& f,
|
||||
const size_t vector_length,
|
||||
const size_t shmem_extra_block,
|
||||
const size_t shmem_extra_thread) {
|
||||
int blockSize = 16;
|
||||
int numBlocks;
|
||||
int sharedmem;
|
||||
int maxOccupancy = 0;
|
||||
int bestBlockSize = 0;
|
||||
int max_threads_per_block =
|
||||
std::min(MaxThreadsPerBlock,
|
||||
cuda_internal_maximum_warp_count() * CudaTraits::WarpSize);
|
||||
|
||||
while (blockSize < max_threads_per_block) {
|
||||
blockSize *= 2;
|
||||
|
||||
// calculate the occupancy with that optBlockSize and check whether its
|
||||
// larger than the largest one found so far
|
||||
sharedmem =
|
||||
shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
|
||||
FunctorTeamShmemSize<typename DriverType::functor_type>::value(
|
||||
f, blockSize / vector_length);
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks,
|
||||
cuda_parallel_launch_constant_memory<DriverType, MaxThreadsPerBlock,
|
||||
MinBlocksPerSM>,
|
||||
blockSize, sharedmem);
|
||||
if (numBlocks >= int(MinBlocksPerSM) &&
|
||||
blockSize <= int(MaxThreadsPerBlock)) {
|
||||
if (maxOccupancy < numBlocks * blockSize) {
|
||||
maxOccupancy = numBlocks * blockSize;
|
||||
bestBlockSize = blockSize;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (maxOccupancy > 0) return bestBlockSize;
|
||||
return -1;
|
||||
}
|
||||
};
|
||||
|
||||
template <class DriverType, unsigned int MaxThreadsPerBlock,
|
||||
unsigned int MinBlocksPerSM>
|
||||
struct CudaGetOptBlockSize<
|
||||
DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
|
||||
false> {
|
||||
static int get_block_size(const typename DriverType::functor_type& f,
|
||||
const size_t vector_length,
|
||||
const size_t shmem_extra_block,
|
||||
const size_t shmem_extra_thread) {
|
||||
int blockSize = 16;
|
||||
int numBlocks;
|
||||
int sharedmem;
|
||||
int maxOccupancy = 0;
|
||||
int bestBlockSize = 0;
|
||||
int max_threads_per_block =
|
||||
std::min(MaxThreadsPerBlock,
|
||||
cuda_internal_maximum_warp_count() * CudaTraits::WarpSize);
|
||||
|
||||
while (blockSize < max_threads_per_block) {
|
||||
blockSize *= 2;
|
||||
sharedmem =
|
||||
shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
|
||||
FunctorTeamShmemSize<typename DriverType::functor_type>::value(
|
||||
f, blockSize / vector_length);
|
||||
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks,
|
||||
cuda_parallel_launch_local_memory<DriverType, MaxThreadsPerBlock,
|
||||
MinBlocksPerSM>,
|
||||
blockSize, sharedmem);
|
||||
if (numBlocks >= int(MinBlocksPerSM) &&
|
||||
blockSize <= int(MaxThreadsPerBlock)) {
|
||||
if (maxOccupancy < numBlocks * blockSize) {
|
||||
maxOccupancy = numBlocks * blockSize;
|
||||
bestBlockSize = blockSize;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (maxOccupancy > 0) return bestBlockSize;
|
||||
return -1;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
|
|
|
@@ -50,7 +50,7 @@

#include <impl/Kokkos_Error.hpp>

#include <iostream>
#include <iosfwd>

namespace Kokkos {
namespace Impl {

@@ -113,12 +113,7 @@ class CudaRawMemoryAllocationFailure : public RawMemoryAllocationFailure {
            get_failure_mode(arg_error_code), arg_mechanism),
        m_error_code(arg_error_code) {}

  void append_additional_error_information(std::ostream& o) const override {
    if (m_error_code != cudaSuccess) {
      o << " The Cuda allocation returned the error code \"\""
        << cudaGetErrorName(m_error_code) << "\".";
    }
  }
  void append_additional_error_information(std::ostream& o) const override;
};

}  // end namespace Experimental
@@ -55,7 +55,7 @@
#include <Cuda/Kokkos_Cuda_Instance.hpp>
#include <Cuda/Kokkos_Cuda_Locks.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_Profiling_Interface.hpp>
#include <impl/Kokkos_Tools.hpp>

/*--------------------------------------------------------------------------*/
/* Standard 'C' libraries */

@@ -134,7 +134,7 @@ bool cuda_launch_blocking() {

  if (env == 0) return false;

  return atoi(env);
  return std::stoi(env);
}
#endif
@@ -239,8 +239,9 @@ const CudaInternalDevices &CudaInternalDevices::singleton() {

}  // namespace

int CudaInternal::was_initialized = 0;
int CudaInternal::was_finalized   = 0;
unsigned long *CudaInternal::constantMemHostStaging = nullptr;
cudaEvent_t CudaInternal::constantMemReusable       = nullptr;

//----------------------------------------------------------------------------

void CudaInternal::print_configuration(std::ostream &s) const {

@@ -288,11 +289,11 @@ CudaInternal::~CudaInternal() {
  m_scratchUnifiedCount     = 0;
  m_scratchUnifiedSupported = 0;
  m_streamCount             = 0;
  m_scratchSpace            = 0;
  m_scratchFlags            = 0;
  m_scratchUnified          = 0;
  m_scratchConcurrentBitset = 0;
  m_stream                  = 0;
  m_scratchSpace            = nullptr;
  m_scratchFlags            = nullptr;
  m_scratchUnified          = nullptr;
  m_scratchConcurrentBitset = nullptr;
  m_stream                  = nullptr;
}

int CudaInternal::verify_is_initialized(const char *const label) const {
@@ -307,22 +308,20 @@ CudaInternal &CudaInternal::singleton() {
  static CudaInternal self;
  return self;
}
void CudaInternal::fence() const { cudaStreamSynchronize(m_stream); }
void CudaInternal::fence() const {
  CUDA_SAFE_CALL(cudaStreamSynchronize(m_stream));
}

void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) {
  if (was_finalized)
    Kokkos::abort("Calling Cuda::initialize after Cuda::finalize is illegal\n");
  was_initialized = 1;
  was_initialized = true;
  if (is_initialized()) return;

  enum { WordSize = sizeof(size_type) };

#ifndef KOKKOS_IMPL_TURN_OFF_CUDA_HOST_INIT_CHECK
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
  if (!HostSpace::execution_space::is_initialized()) {
#else
  if (!HostSpace::execution_space::impl_is_initialized()) {
#endif
    const std::string msg(
        "Cuda::initialize ERROR : HostSpace::execution_space is not "
        "initialized");
@@ -332,7 +331,7 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) {

  const CudaInternalDevices &dev_info = CudaInternalDevices::singleton();

  const bool ok_init = 0 == m_scratchSpace || 0 == m_scratchFlags;
  const bool ok_init = nullptr == m_scratchSpace || nullptr == m_scratchFlags;

  const bool ok_id =
      0 <= cuda_device_id && cuda_device_id < dev_info.m_cudaDevCount;

@@ -366,7 +365,7 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) {
  int compiled_major = m_cudaArch / 100;
  int compiled_minor = (m_cudaArch % 100) / 10;

  if (compiled_major != cudaProp.major || compiled_minor < cudaProp.minor) {
  if (compiled_major != cudaProp.major || compiled_minor > cudaProp.minor) {
    std::stringstream ss;
    ss << "Kokkos::Cuda::initialize ERROR: running kernels compiled for "
          "compute capability "
@@ -453,8 +452,8 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) {

    // Allocate and initialize uint32_t[ buffer_bound ]

    typedef Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>
        Record;
    using Record =
        Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>;

    Record *const r =
        Record::allocate(Kokkos::CudaSpace(), "InternalScratchBitset",

@@ -511,7 +510,7 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) {
    if (env_force_device_alloc == 0)
      force_device_alloc = false;
    else
      force_device_alloc = atoi(env_force_device_alloc) != 0;
      force_device_alloc = std::stoi(env_force_device_alloc) != 0;

    const char *env_visible_devices = getenv("CUDA_VISIBLE_DEVICES");
    bool visible_devices_one        = true;

@@ -542,14 +541,23 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) {
#endif

  // Init the array for used for arbitrarily sized atomics
  if (stream == 0) Impl::initialize_host_cuda_lock_arrays();
  if (stream == nullptr) Impl::initialize_host_cuda_lock_arrays();

  // Allocate a staging buffer for constant mem in pinned host memory
  // and an event to avoid overwriting driver for previous kernel launches
  if (stream == nullptr) {
    CUDA_SAFE_CALL(cudaMallocHost((void **)&constantMemHostStaging,
                                  CudaTraits::ConstantMemoryUsage));

    CUDA_SAFE_CALL(cudaEventCreate(&constantMemReusable));
  }

  m_stream = stream;
}

//----------------------------------------------------------------------------

typedef Cuda::size_type ScratchGrain[Impl::CudaTraits::WarpSize];
using ScratchGrain = Cuda::size_type[Impl::CudaTraits::WarpSize];
enum { sizeScratchGrain = sizeof(ScratchGrain) };

Cuda::size_type *CudaInternal::scratch_flags(const Cuda::size_type size) const {
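The pinned staging buffer and event allocated above are consumed later in this diff by `CudaParallelLaunch`: the functor is `memcpy`'d into the pinned host buffer, pushed to device constant memory with `cudaMemcpyToSymbolAsync`, and the event guards when the buffer may be overwritten again. The following is a rough, self-contained illustration of that reuse-guarded staging pattern; all names are invented and this is not the Kokkos implementation itself.

````cpp
#include <cstring>
#include <cuda_runtime.h>

// Hypothetical sketch: a pinned host staging buffer plus an event that marks
// when the buffer may be overwritten again (cf. constantMemHostStaging and
// constantMemReusable above).
struct StagedUpload {
  void* host_staging = nullptr;
  cudaEvent_t reusable{};

  void init(size_t bytes) {
    cudaMallocHost(&host_staging, bytes);  // pinned host memory
    cudaEventCreate(&reusable);
  }

  void upload(const void* payload, size_t bytes, void* device_dst,
              cudaStream_t stream) {
    cudaEventSynchronize(reusable);             // wait until the previous consumer finished
    std::memcpy(host_staging, payload, bytes);  // synchronous copy into the pinned buffer
    cudaMemcpyAsync(device_dst, host_staging, bytes, cudaMemcpyHostToDevice,
                    stream);                    // asynchronous H2D copy on the stream
    // ... enqueue the kernel that consumes device_dst on the same stream ...
    cudaEventRecord(reusable, stream);          // buffer is reusable once this event completes
  }
};
````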
@ -557,8 +565,8 @@ Cuda::size_type *CudaInternal::scratch_flags(const Cuda::size_type size) const {
|
|||
m_scratchFlagsCount * sizeScratchGrain < size) {
|
||||
m_scratchFlagsCount = (size + sizeScratchGrain - 1) / sizeScratchGrain;
|
||||
|
||||
typedef Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>
|
||||
Record;
|
||||
using Record =
|
||||
Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>;
|
||||
|
||||
if (m_scratchFlags) Record::decrement(Record::get_record(m_scratchFlags));
|
||||
|
||||
|
@ -582,8 +590,8 @@ Cuda::size_type *CudaInternal::scratch_space(const Cuda::size_type size) const {
|
|||
m_scratchSpaceCount * sizeScratchGrain < size) {
|
||||
m_scratchSpaceCount = (size + sizeScratchGrain - 1) / sizeScratchGrain;
|
||||
|
||||
typedef Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>
|
||||
Record;
|
||||
using Record =
|
||||
Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>;
|
||||
|
||||
if (m_scratchSpace) Record::decrement(Record::get_record(m_scratchSpace));
|
||||
|
||||
|
@ -605,9 +613,8 @@ Cuda::size_type *CudaInternal::scratch_unified(
|
|||
m_scratchUnifiedCount * sizeScratchGrain < size) {
|
||||
m_scratchUnifiedCount = (size + sizeScratchGrain - 1) / sizeScratchGrain;
|
||||
|
||||
typedef Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaHostPinnedSpace,
|
||||
void>
|
||||
Record;
|
||||
using Record =
|
||||
Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>;
|
||||
|
||||
if (m_scratchUnified)
|
||||
Record::decrement(Record::get_record(m_scratchUnified));
|
||||
|
@ -629,8 +636,8 @@ Cuda::size_type *CudaInternal::scratch_functor(
|
|||
if (verify_is_initialized("scratch_functor") && m_scratchFunctorSize < size) {
|
||||
m_scratchFunctorSize = size;
|
||||
|
||||
typedef Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>
|
||||
Record;
|
||||
using Record =
|
||||
Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>;
|
||||
|
||||
if (m_scratchFunctor)
|
||||
Record::decrement(Record::get_record(m_scratchFunctor));
|
||||
|
@@ -649,15 +656,13 @@ Cuda::size_type *CudaInternal::scratch_functor(
//----------------------------------------------------------------------------

void CudaInternal::finalize() {
  was_finalized = 1;
  if (0 != m_scratchSpace || 0 != m_scratchFlags) {
  was_finalized = true;
  if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) {
    Impl::finalize_host_cuda_lock_arrays();

    if (m_stream != 0) cudaStreamDestroy(m_stream);

    typedef Kokkos::Impl::SharedAllocationRecord<CudaSpace> RecordCuda;
    typedef Kokkos::Impl::SharedAllocationRecord<CudaHostPinnedSpace>
        RecordHost;
    using RecordCuda = Kokkos::Impl::SharedAllocationRecord<CudaSpace>;
    using RecordHost =
        Kokkos::Impl::SharedAllocationRecord<CudaHostPinnedSpace>;

    RecordCuda::decrement(RecordCuda::get_record(m_scratchFlags));
    RecordCuda::decrement(RecordCuda::get_record(m_scratchSpace));

@@ -675,11 +680,17 @@ void CudaInternal::finalize() {
    m_scratchFlagsCount       = 0;
    m_scratchUnifiedCount     = 0;
    m_streamCount             = 0;
    m_scratchSpace            = 0;
    m_scratchFlags            = 0;
    m_scratchUnified          = 0;
    m_scratchConcurrentBitset = 0;
    m_stream                  = 0;
    m_scratchSpace            = nullptr;
    m_scratchFlags            = nullptr;
    m_scratchUnified          = nullptr;
    m_scratchConcurrentBitset = nullptr;
    m_stream                  = nullptr;
  }

  // only destroy these if we're finalizing the singleton
  if (this == &singleton()) {
    cudaFreeHost(constantMemHostStaging);
    cudaEventDestroy(constantMemReusable);
  }
}
@@ -743,27 +754,13 @@ int Cuda::concurrency() {
  return Impl::CudaInternal::singleton().m_maxConcurrency;
}

#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
int Cuda::is_initialized()
#else
int Cuda::impl_is_initialized()
#endif
{
int Cuda::impl_is_initialized() {
  return Impl::CudaInternal::singleton().is_initialized();
}

#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
void Cuda::initialize(const Cuda::SelectDevice config, size_t num_instances)
#else
void Cuda::impl_initialize(const Cuda::SelectDevice config,
                           size_t /*num_instances*/)
#endif
{
  Impl::CudaInternal::singleton().initialize(config.cuda_device_id, 0);

#if defined(KOKKOS_ENABLE_PROFILING)
  Kokkos::Profiling::initialize();
#endif
                           size_t /*num_instances*/) {
  Impl::CudaInternal::singleton().initialize(config.cuda_device_id, nullptr);
}

std::vector<unsigned> Cuda::detect_device_arch() {

@@ -793,48 +790,72 @@ Cuda::size_type Cuda::device_arch() {
  return dev_arch;
}

#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
void Cuda::finalize()
#else
void Cuda::impl_finalize()
#endif
{
  Impl::CudaInternal::singleton().finalize();
void Cuda::impl_finalize() { Impl::CudaInternal::singleton().finalize(); }

#if defined(KOKKOS_ENABLE_PROFILING)
  Kokkos::Profiling::finalize();
#endif
}

Cuda::Cuda() : m_space_instance(&Impl::CudaInternal::singleton()) {
Cuda::Cuda()
    : m_space_instance(&Impl::CudaInternal::singleton()), m_counter(nullptr) {
  Impl::CudaInternal::singleton().verify_is_initialized(
      "Cuda instance constructor");
}

Cuda::Cuda(cudaStream_t stream) : m_space_instance(new Impl::CudaInternal) {
Cuda::Cuda(cudaStream_t stream)
    : m_space_instance(new Impl::CudaInternal), m_counter(new int(1)) {
  Impl::CudaInternal::singleton().verify_is_initialized(
      "Cuda instance constructor");
  m_space_instance->initialize(Impl::CudaInternal::singleton().m_cudaDev,
                               stream);
}

KOKKOS_FUNCTION Cuda::Cuda(Cuda &&other) noexcept {
  m_space_instance       = other.m_space_instance;
  other.m_space_instance = nullptr;
  m_counter              = other.m_counter;
  other.m_counter        = nullptr;
}

KOKKOS_FUNCTION Cuda::Cuda(const Cuda &other)
    : m_space_instance(other.m_space_instance), m_counter(other.m_counter) {
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA
  if (m_counter) Kokkos::atomic_add(m_counter, 1);
#endif
}

KOKKOS_FUNCTION Cuda &Cuda::operator=(Cuda &&other) noexcept {
  m_space_instance       = other.m_space_instance;
  other.m_space_instance = nullptr;
  m_counter              = other.m_counter;
  other.m_counter        = nullptr;
  return *this;
}

KOKKOS_FUNCTION Cuda &Cuda::operator=(const Cuda &other) {
  m_space_instance = other.m_space_instance;
  m_counter        = other.m_counter;
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA
  if (m_counter) Kokkos::atomic_add(m_counter, 1);
#endif
  return *this;
}

KOKKOS_FUNCTION Cuda::~Cuda() noexcept {
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA
  if (m_counter == nullptr) return;
  int const count = Kokkos::atomic_fetch_sub(m_counter, 1);
  if (count == 1) {
    delete m_counter;
    m_space_instance->finalize();
    delete m_space_instance;
  }
#endif
}

void Cuda::print_configuration(std::ostream &s, const bool) {
  Impl::CudaInternal::singleton().print_configuration(s);
}

#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
bool Cuda::sleep() { return false; }

bool Cuda::wake() { return true; }
#endif

void Cuda::impl_static_fence() { Kokkos::Impl::cuda_device_synchronize(); }

#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
void Cuda::fence() { impl_static_fence(); }
#else
void Cuda::fence() const { m_space_instance->fence(); }
#endif

const char *Cuda::name() { return "Cuda"; }
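The reference counting added to the constructors and destructor above means that stream-backed `Kokkos::Cuda` instances share one internal state object and the last surviving copy tears it down. A hypothetical usage sketch follows (it assumes Kokkos has already been initialized; the function and variable names are illustrative only):

````cpp
#include <Kokkos_Core.hpp>
#include <cuda_runtime.h>

void stream_instance_example() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  {
    Kokkos::Cuda exec1(stream);  // counter starts at 1; wraps the user-provided stream
    Kokkos::Cuda exec2 = exec1;  // copy shares the internal state, counter -> 2
    // ... dispatch work on exec1 / exec2 ...
  }  // both copies destroyed; the last one finalizes and deletes the shared state
  cudaStreamDestroy(stream);
}
````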
@ -1,6 +1,8 @@
|
|||
#ifndef KOKKOS_CUDA_INSTANCE_HPP_
|
||||
#define KOKKOS_CUDA_INSTANCE_HPP_
|
||||
|
||||
#include <vector>
|
||||
#include <impl/Kokkos_Tools.hpp>
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
// These functions fulfill the purpose of allowing to work around
|
||||
|
@ -15,25 +17,28 @@ namespace Kokkos {
|
|||
namespace Impl {
|
||||
|
||||
struct CudaTraits {
|
||||
enum { WarpSize = 32 /* 0x0020 */ };
|
||||
enum { WarpIndexMask = 0x001f /* Mask for warpindex */ };
|
||||
enum { WarpIndexShift = 5 /* WarpSize == 1 << WarpShift */ };
|
||||
|
||||
enum { ConstantMemoryUsage = 0x008000 /* 32k bytes */ };
|
||||
enum { ConstantMemoryCache = 0x002000 /* 8k bytes */ };
|
||||
enum { KernelArgumentLimit = 0x001000 /* 4k bytes */ };
|
||||
|
||||
typedef unsigned long
|
||||
ConstantGlobalBufferType[ConstantMemoryUsage / sizeof(unsigned long)];
|
||||
|
||||
#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_PASCAL)
|
||||
enum {
|
||||
ConstantMemoryUseThreshold =
|
||||
0x000200 /* 0 bytes -> always use constant (or global)*/
|
||||
enum : CudaSpace::size_type { WarpSize = 32 /* 0x0020 */ };
|
||||
enum : CudaSpace::size_type {
|
||||
WarpIndexMask = 0x001f /* Mask for warpindex */
|
||||
};
|
||||
#else
|
||||
enum : CudaSpace::size_type {
|
||||
WarpIndexShift = 5 /* WarpSize == 1 << WarpShift */
|
||||
};
|
||||
|
||||
enum : CudaSpace::size_type {
|
||||
ConstantMemoryUsage = 0x008000 /* 32k bytes */
|
||||
};
|
||||
enum : CudaSpace::size_type {
|
||||
ConstantMemoryCache = 0x002000 /* 8k bytes */
|
||||
};
|
||||
enum : CudaSpace::size_type {
|
||||
KernelArgumentLimit = 0x001000 /* 4k bytes */
|
||||
};
|
||||
|
||||
using ConstantGlobalBufferType =
|
||||
unsigned long[ConstantMemoryUsage / sizeof(unsigned long)];
|
||||
|
||||
enum { ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */ };
|
||||
#endif
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static CudaSpace::size_type warp_count(
|
||||
CudaSpace::size_type i) {
|
||||
|
@ -42,7 +47,7 @@ struct CudaTraits {
|
|||
|
||||
KOKKOS_INLINE_FUNCTION static CudaSpace::size_type warp_align(
|
||||
CudaSpace::size_type i) {
|
||||
enum { Mask = ~CudaSpace::size_type(WarpIndexMask) };
|
||||
constexpr CudaSpace::size_type Mask = ~WarpIndexMask;
|
||||
return (i + WarpIndexMask) & Mask;
|
||||
}
|
||||
};
|
||||
|
@ -79,7 +84,7 @@ class CudaInternal {
|
|||
#endif
|
||||
|
||||
public:
|
||||
typedef Cuda::size_type size_type;
|
||||
using size_type = Cuda::size_type;
|
||||
|
||||
int m_cudaDev;
|
||||
|
||||
|
@ -112,18 +117,23 @@ class CudaInternal {
|
|||
uint32_t* m_scratchConcurrentBitset;
|
||||
cudaStream_t m_stream;
|
||||
|
||||
static int was_initialized;
|
||||
static int was_finalized;
|
||||
bool was_initialized = false;
|
||||
bool was_finalized = false;
|
||||
|
||||
// FIXME_CUDA: these want to be per-device, not per-stream... use of 'static'
|
||||
// here will break once there are multiple devices though
|
||||
static unsigned long* constantMemHostStaging;
|
||||
static cudaEvent_t constantMemReusable;
|
||||
|
||||
static CudaInternal& singleton();
|
||||
|
||||
int verify_is_initialized(const char* const label) const;
|
||||
|
||||
int is_initialized() const {
|
||||
return 0 != m_scratchSpace && 0 != m_scratchFlags;
|
||||
return nullptr != m_scratchSpace && nullptr != m_scratchFlags;
|
||||
}
|
||||
|
||||
void initialize(int cuda_device_id, cudaStream_t stream = 0);
|
||||
void initialize(int cuda_device_id, cudaStream_t stream = nullptr);
|
||||
void finalize();
|
||||
|
||||
void print_configuration(std::ostream&) const;
|
||||
|
@ -157,12 +167,12 @@ class CudaInternal {
|
|||
m_scratchFunctorSize(0),
|
||||
m_scratchUnifiedSupported(0),
|
||||
m_streamCount(0),
|
||||
m_scratchSpace(0),
|
||||
m_scratchFlags(0),
|
||||
m_scratchUnified(0),
|
||||
m_scratchFunctor(0),
|
||||
m_scratchConcurrentBitset(0),
|
||||
m_stream(0) {}
|
||||
m_scratchSpace(nullptr),
|
||||
m_scratchFlags(nullptr),
|
||||
m_scratchUnified(nullptr),
|
||||
m_scratchFunctor(nullptr),
|
||||
m_scratchConcurrentBitset(nullptr),
|
||||
m_stream(nullptr) {}
|
||||
|
||||
size_type* scratch_space(const size_type size) const;
|
||||
size_type* scratch_flags(const size_type size) const;
|
||||
|
|
|
@ -244,9 +244,6 @@ struct CudaParallelLaunch<
|
|||
const CudaInternal* cuda_instance,
|
||||
const bool prefer_shmem) {
|
||||
if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) {
|
||||
// Fence before changing settings and copying closure
|
||||
Kokkos::Cuda().fence();
|
||||
|
||||
if (cuda_instance->m_maxShmemPerBlock < shmem) {
|
||||
Kokkos::Impl::throw_runtime_exception(std::string(
|
||||
"CudaParallelLaunch FAILED: shared memory request is too large"));
|
||||
|
@ -254,28 +251,43 @@ struct CudaParallelLaunch<
|
|||
#ifndef KOKKOS_ARCH_KEPLER
|
||||
// On Kepler the L1 has no benefit since it doesn't cache reads
|
||||
else {
|
||||
CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
|
||||
cuda_parallel_launch_constant_memory<DriverType, MaxThreadsPerBlock,
|
||||
MinBlocksPerSM>,
|
||||
(prefer_shmem ? cudaFuncCachePreferShared
|
||||
: cudaFuncCachePreferL1)));
|
||||
static bool cache_config_set = false;
|
||||
if (!cache_config_set) {
|
||||
CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
|
||||
cuda_parallel_launch_constant_memory<
|
||||
DriverType, MaxThreadsPerBlock, MinBlocksPerSM>,
|
||||
(prefer_shmem ? cudaFuncCachePreferShared
|
||||
: cudaFuncCachePreferL1)));
|
||||
cache_config_set = true;
|
||||
}
|
||||
}
|
||||
#else
|
||||
(void)prefer_shmem;
|
||||
#endif
|
||||
|
||||
// Copy functor to constant memory on the device
|
||||
cudaMemcpyToSymbolAsync(kokkos_impl_cuda_constant_memory_buffer, &driver,
|
||||
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
|
||||
|
||||
// Wait until the previous kernel that uses the constant buffer is done
|
||||
CUDA_SAFE_CALL(cudaEventSynchronize(cuda_instance->constantMemReusable));
|
||||
|
||||
// Copy functor (synchronously) to staging buffer in pinned host memory
|
||||
unsigned long* staging = cuda_instance->constantMemHostStaging;
|
||||
memcpy(staging, &driver, sizeof(DriverType));
|
||||
|
||||
// Copy functor asynchronously from there to constant memory on the device
|
||||
cudaMemcpyToSymbolAsync(kokkos_impl_cuda_constant_memory_buffer, staging,
|
||||
sizeof(DriverType), 0, cudaMemcpyHostToDevice,
|
||||
cudaStream_t(cuda_instance->m_stream));
|
||||
|
||||
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
|
||||
|
||||
// Invoke the driver function on the device
|
||||
cuda_parallel_launch_constant_memory<DriverType, MaxThreadsPerBlock,
|
||||
MinBlocksPerSM>
|
||||
<<<grid, block, shmem, cuda_instance->m_stream>>>();
|
||||
|
||||
// Record an event that says when the constant buffer can be reused
|
||||
CUDA_SAFE_CALL(cudaEventRecord(cuda_instance->constantMemReusable,
|
||||
cudaStream_t(cuda_instance->m_stream)));
|
||||
|
||||
#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)
|
||||
CUDA_SAFE_CALL(cudaGetLastError());
|
||||
Kokkos::Cuda().fence();
|
||||
|
@ -284,11 +296,15 @@ struct CudaParallelLaunch<
|
|||
}
|
||||
|
||||
static cudaFuncAttributes get_cuda_func_attributes() {
|
||||
cudaFuncAttributes attr;
|
||||
CUDA_SAFE_CALL(cudaFuncGetAttributes(
|
||||
&attr,
|
||||
cuda_parallel_launch_constant_memory<DriverType, MaxThreadsPerBlock,
|
||||
MinBlocksPerSM>));
|
||||
static cudaFuncAttributes attr;
|
||||
static bool attr_set = false;
|
||||
if (!attr_set) {
|
||||
CUDA_SAFE_CALL(cudaFuncGetAttributes(
|
||||
&attr,
|
||||
cuda_parallel_launch_constant_memory<DriverType, MaxThreadsPerBlock,
|
||||
MinBlocksPerSM>));
|
||||
attr_set = true;
|
||||
}
|
||||
return attr;
|
||||
}
|
||||
};
|
||||
|
@ -304,9 +320,6 @@ struct CudaParallelLaunch<DriverType, Kokkos::LaunchBounds<0, 0>,
|
|||
const CudaInternal* cuda_instance,
|
||||
const bool prefer_shmem) {
|
||||
if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) {
|
||||
// Fence before changing settings and copying closure
|
||||
Kokkos::Cuda().fence();
|
||||
|
||||
if (cuda_instance->m_maxShmemPerBlock < shmem) {
|
||||
Kokkos::Impl::throw_runtime_exception(std::string(
|
||||
"CudaParallelLaunch FAILED: shared memory request is too large"));
|
||||
|
@ -314,26 +327,41 @@ struct CudaParallelLaunch<DriverType, Kokkos::LaunchBounds<0, 0>,
|
|||
#ifndef KOKKOS_ARCH_KEPLER
|
||||
// On Kepler the L1 has no benefit since it doesn't cache reads
|
||||
else {
|
||||
CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
|
||||
cuda_parallel_launch_constant_memory<DriverType>,
|
||||
(prefer_shmem ? cudaFuncCachePreferShared
|
||||
: cudaFuncCachePreferL1)));
|
||||
static bool cache_config_set = false;
|
||||
if (!cache_config_set) {
|
||||
CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
|
||||
cuda_parallel_launch_constant_memory<DriverType>,
|
||||
(prefer_shmem ? cudaFuncCachePreferShared
|
||||
: cudaFuncCachePreferL1)));
|
||||
cache_config_set = true;
|
||||
}
|
||||
}
|
||||
#else
|
||||
(void)prefer_shmem;
|
||||
#endif
|
||||
|
||||
// Copy functor to constant memory on the device
|
||||
cudaMemcpyToSymbolAsync(kokkos_impl_cuda_constant_memory_buffer, &driver,
|
||||
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
|
||||
|
||||
// Wait until the previous kernel that uses the constant buffer is done
|
||||
CUDA_SAFE_CALL(cudaEventSynchronize(cuda_instance->constantMemReusable));
|
||||
|
||||
// Copy functor (synchronously) to staging buffer in pinned host memory
|
||||
unsigned long* staging = cuda_instance->constantMemHostStaging;
|
||||
memcpy(staging, &driver, sizeof(DriverType));
|
||||
|
||||
// Copy functor asynchronously from there to constant memory on the device
|
||||
cudaMemcpyToSymbolAsync(kokkos_impl_cuda_constant_memory_buffer, staging,
|
||||
sizeof(DriverType), 0, cudaMemcpyHostToDevice,
|
||||
cudaStream_t(cuda_instance->m_stream));
|
||||
|
||||
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
|
||||
|
||||
// Invoke the driver function on the device
|
||||
cuda_parallel_launch_constant_memory<DriverType>
|
||||
<<<grid, block, shmem, cuda_instance->m_stream>>>();
|
||||
|
||||
// Record an event that says when the constant buffer can be reused
|
||||
CUDA_SAFE_CALL(cudaEventRecord(cuda_instance->constantMemReusable,
|
||||
cudaStream_t(cuda_instance->m_stream)));
|
||||
|
||||
#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)
|
||||
CUDA_SAFE_CALL(cudaGetLastError());
|
||||
Kokkos::Cuda().fence();
|
||||
|
@ -342,9 +370,13 @@ struct CudaParallelLaunch<DriverType, Kokkos::LaunchBounds<0, 0>,
|
|||
}
|
||||
|
||||
static cudaFuncAttributes get_cuda_func_attributes() {
|
||||
cudaFuncAttributes attr;
|
||||
CUDA_SAFE_CALL(cudaFuncGetAttributes(
|
||||
&attr, cuda_parallel_launch_constant_memory<DriverType>));
|
||||
static cudaFuncAttributes attr;
|
||||
static bool attr_set = false;
|
||||
if (!attr_set) {
|
||||
CUDA_SAFE_CALL(cudaFuncGetAttributes(
|
||||
&attr, cuda_parallel_launch_constant_memory<DriverType>));
|
||||
attr_set = true;
|
||||
}
|
||||
return attr;
|
||||
}
|
||||
};
|
||||
|
@ -369,11 +401,15 @@ struct CudaParallelLaunch<
|
|||
#ifndef KOKKOS_ARCH_KEPLER
|
||||
// On Kepler the L1 has no benefit since it doesn't cache reads
|
||||
else {
|
||||
CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
|
||||
cuda_parallel_launch_local_memory<DriverType, MaxThreadsPerBlock,
|
||||
MinBlocksPerSM>,
|
||||
(prefer_shmem ? cudaFuncCachePreferShared
|
||||
: cudaFuncCachePreferL1)));
|
||||
static bool cache_config_set = false;
|
||||
if (!cache_config_set) {
|
||||
CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
|
||||
cuda_parallel_launch_local_memory<DriverType, MaxThreadsPerBlock,
|
||||
MinBlocksPerSM>,
|
||||
(prefer_shmem ? cudaFuncCachePreferShared
|
||||
: cudaFuncCachePreferL1)));
|
||||
cache_config_set = true;
|
||||
}
|
||||
}
|
||||
#else
|
||||
(void)prefer_shmem;
|
||||
|
@ -394,10 +430,15 @@ struct CudaParallelLaunch<
|
|||
}
|
||||
|
||||
static cudaFuncAttributes get_cuda_func_attributes() {
|
||||
cudaFuncAttributes attr;
|
||||
CUDA_SAFE_CALL(cudaFuncGetAttributes(
|
||||
&attr, cuda_parallel_launch_local_memory<DriverType, MaxThreadsPerBlock,
|
||||
MinBlocksPerSM>));
|
||||
static cudaFuncAttributes attr;
|
||||
static bool attr_set = false;
|
||||
if (!attr_set) {
|
||||
CUDA_SAFE_CALL(cudaFuncGetAttributes(
|
||||
&attr,
|
||||
cuda_parallel_launch_local_memory<DriverType, MaxThreadsPerBlock,
|
||||
MinBlocksPerSM>));
|
||||
attr_set = true;
|
||||
}
|
||||
return attr;
|
||||
}
|
||||
};
|
||||
|
@ -420,10 +461,14 @@ struct CudaParallelLaunch<DriverType, Kokkos::LaunchBounds<0, 0>,
|
|||
#ifndef KOKKOS_ARCH_KEPLER
|
||||
// On Kepler the L1 has no benefit since it doesn't cache reads
|
||||
else {
|
||||
CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
|
||||
cuda_parallel_launch_local_memory<DriverType>,
|
||||
(prefer_shmem ? cudaFuncCachePreferShared
|
||||
: cudaFuncCachePreferL1)));
|
||||
static bool cache_config_set = false;
|
||||
if (!cache_config_set) {
|
||||
CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
|
||||
cuda_parallel_launch_local_memory<DriverType>,
|
||||
(prefer_shmem ? cudaFuncCachePreferShared
|
||||
: cudaFuncCachePreferL1)));
|
||||
cache_config_set = true;
|
||||
}
|
||||
}
|
||||
#else
|
||||
(void)prefer_shmem;
|
||||
|
@ -443,9 +488,13 @@ struct CudaParallelLaunch<DriverType, Kokkos::LaunchBounds<0, 0>,
|
|||
}
|
||||
|
||||
static cudaFuncAttributes get_cuda_func_attributes() {
|
||||
cudaFuncAttributes attr;
|
||||
CUDA_SAFE_CALL(cudaFuncGetAttributes(
|
||||
&attr, cuda_parallel_launch_local_memory<DriverType>));
|
||||
static cudaFuncAttributes attr;
|
||||
static bool attr_set = false;
|
||||
if (!attr_set) {
|
||||
CUDA_SAFE_CALL(cudaFuncGetAttributes(
|
||||
&attr, cuda_parallel_launch_local_memory<DriverType>));
|
||||
attr_set = true;
|
||||
}
|
||||
return attr;
|
||||
}
|
||||
};
|
||||
|
@ -467,11 +516,15 @@ struct CudaParallelLaunch<
|
|||
#ifndef KOKKOS_ARCH_KEPLER
|
||||
// On Kepler the L1 has no benefit since it doesn't cache reads
|
||||
else {
|
||||
CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
|
||||
cuda_parallel_launch_global_memory<DriverType, MaxThreadsPerBlock,
|
||||
MinBlocksPerSM>,
|
||||
(prefer_shmem ? cudaFuncCachePreferShared
|
||||
: cudaFuncCachePreferL1)));
|
||||
static bool cache_config_set = false;
|
||||
if (!cache_config_set) {
|
||||
CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
|
||||
cuda_parallel_launch_global_memory<DriverType, MaxThreadsPerBlock,
|
||||
MinBlocksPerSM>,
|
||||
(prefer_shmem ? cudaFuncCachePreferShared
|
||||
: cudaFuncCachePreferL1)));
|
||||
cache_config_set = true;
|
||||
}
|
||||
}
|
||||
#else
|
||||
(void)prefer_shmem;
|
||||
|
@ -497,11 +550,15 @@ struct CudaParallelLaunch<
|
|||
}
|
||||
}
|
||||
static cudaFuncAttributes get_cuda_func_attributes() {
|
||||
cudaFuncAttributes attr;
|
||||
CUDA_SAFE_CALL(cudaFuncGetAttributes(
|
||||
&attr,
|
||||
cuda_parallel_launch_global_memory<DriverType, MaxThreadsPerBlock,
|
||||
MinBlocksPerSM>));
|
||||
static cudaFuncAttributes attr;
|
||||
static bool attr_set = false;
|
||||
if (!attr_set) {
|
||||
CUDA_SAFE_CALL(cudaFuncGetAttributes(
|
||||
&attr,
|
||||
cuda_parallel_launch_global_memory<DriverType, MaxThreadsPerBlock,
|
||||
MinBlocksPerSM>));
|
||||
attr_set = true;
|
||||
}
|
||||
return attr;
|
||||
}
|
||||
};
|
||||
|
@ -521,10 +578,14 @@ struct CudaParallelLaunch<DriverType, Kokkos::LaunchBounds<0, 0>,
|
|||
#ifndef KOKKOS_ARCH_KEPLER
|
||||
// On Kepler the L1 has no benefit since it doesn't cache reads
|
||||
else {
|
||||
CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
|
||||
cuda_parallel_launch_global_memory<DriverType>,
|
||||
(prefer_shmem ? cudaFuncCachePreferShared
|
||||
: cudaFuncCachePreferL1)));
|
||||
static bool cache_config_set = false;
|
||||
if (!cache_config_set) {
|
||||
CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
|
||||
cuda_parallel_launch_global_memory<DriverType>,
|
||||
(prefer_shmem ? cudaFuncCachePreferShared
|
||||
: cudaFuncCachePreferL1)));
|
||||
cache_config_set = true;
|
||||
}
|
||||
}
|
||||
#else
|
||||
(void)prefer_shmem;
|
||||
|
@ -549,9 +610,13 @@ struct CudaParallelLaunch<DriverType, Kokkos::LaunchBounds<0, 0>,
|
|||
}
|
||||
|
||||
static cudaFuncAttributes get_cuda_func_attributes() {
|
||||
cudaFuncAttributes attr;
|
||||
CUDA_SAFE_CALL(cudaFuncGetAttributes(
|
||||
&attr, cuda_parallel_launch_global_memory<DriverType>));
|
||||
static cudaFuncAttributes attr;
|
||||
static bool attr_set = false;
|
||||
if (!attr_set) {
|
||||
CUDA_SAFE_CALL(cudaFuncGetAttributes(
|
||||
&attr, cuda_parallel_launch_global_memory<DriverType>));
|
||||
attr_set = true;
|
||||
}
|
||||
return attr;
|
||||
}
|
||||
};
|
||||
|
|
|
@ -48,8 +48,8 @@
|
|||
#include <Kokkos_Macros.hpp>
|
||||
#if defined(__CUDACC__) && defined(KOKKOS_ENABLE_CUDA)
|
||||
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
#include <cstdio>
|
||||
#include <cstdint>
|
||||
|
||||
|
@ -63,10 +63,8 @@
|
|||
#include <Kokkos_Vectorization.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
|
||||
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
#include <impl/Kokkos_Profiling_Interface.hpp>
|
||||
#include <impl/Kokkos_Tools.hpp>
|
||||
#include <typeinfo>
|
||||
#endif
|
||||
|
||||
#include <KokkosExp_MDRangePolicy.hpp>
|
||||
|
||||
|
@ -84,9 +82,9 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
|
|||
: public PolicyTraits<Properties...> {
|
||||
public:
|
||||
//! Tag this class as a kokkos execution policy
|
||||
typedef TeamPolicyInternal execution_policy;
|
||||
using execution_policy = TeamPolicyInternal;
|
||||
|
||||
typedef PolicyTraits<Properties...> traits;
|
||||
using traits = PolicyTraits<Properties...>;
|
||||
|
||||
template <class ExecSpace, class... OtherProperties>
|
||||
friend class TeamPolicyInternal;
|
||||
|
@ -104,7 +102,7 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
|
|||
|
||||
public:
|
||||
//! Execution space of this execution policy
|
||||
typedef Kokkos::Cuda execution_space;
|
||||
using execution_space = Kokkos::Cuda;
|
||||
|
||||
template <class... OtherProperties>
|
||||
TeamPolicyInternal(const TeamPolicyInternal<OtherProperties...>& p) {
|
||||
|
@ -119,50 +117,12 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
|
|||
m_space = p.m_space;
|
||||
}
|
||||
|
||||
TeamPolicyInternal& operator=(const TeamPolicyInternal& p) {
|
||||
m_league_size = p.m_league_size;
|
||||
m_team_size = p.m_team_size;
|
||||
m_vector_length = p.m_vector_length;
|
||||
m_team_scratch_size[0] = p.m_team_scratch_size[0];
|
||||
m_team_scratch_size[1] = p.m_team_scratch_size[1];
|
||||
m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
|
||||
m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
|
||||
m_chunk_size = p.m_chunk_size;
|
||||
m_space = p.m_space;
|
||||
return *this;
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
template <class FunctorType>
|
||||
static inline int team_size_max(const FunctorType& functor) {
|
||||
int n = MAX_WARP * Impl::CudaTraits::WarpSize;
|
||||
|
||||
for (; n; n >>= 1) {
|
||||
const int shmem_size =
|
||||
/* for global reduce */ Impl::
|
||||
cuda_single_inter_block_reduce_scan_shmem<
|
||||
false, FunctorType, typename traits::work_tag>(functor, n)
|
||||
/* for team reduce */
|
||||
+ (n + 2) * sizeof(double)
|
||||
/* for team shared */
|
||||
+ Impl::FunctorTeamShmemSize<FunctorType>::value(functor, n);
|
||||
|
||||
if (shmem_size < typename traits::execution_space()
|
||||
.impl_internal_space_instance()
|
||||
->m_maxShmemPerBlock)
|
||||
break;
|
||||
}
|
||||
|
||||
return n;
|
||||
}
|
||||
#endif
|
||||
|
||||
template <class FunctorType>
|
||||
int team_size_max(const FunctorType& f, const ParallelForTag&) const {
|
||||
typedef Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>
|
||||
closure_type;
|
||||
using closure_type =
|
||||
Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>;
|
||||
cudaFuncAttributes attr =
|
||||
CudaParallelLaunch<closure_type, typename traits::launch_bounds>::
|
||||
get_cuda_func_attributes();
|
||||
|
@ -179,15 +139,15 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
|
|||
template <class FunctorType>
|
||||
inline int team_size_max(const FunctorType& f,
|
||||
const ParallelReduceTag&) const {
|
||||
typedef Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
|
||||
TeamPolicyInternal, FunctorType>
|
||||
functor_analysis_type;
|
||||
typedef typename Impl::ParallelReduceReturnValue<
|
||||
using functor_analysis_type =
|
||||
Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
|
||||
TeamPolicyInternal, FunctorType>;
|
||||
using reducer_type = typename Impl::ParallelReduceReturnValue<
|
||||
void, typename functor_analysis_type::value_type,
|
||||
FunctorType>::reducer_type reducer_type;
|
||||
typedef Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
|
||||
reducer_type>
|
||||
closure_type;
|
||||
FunctorType>::reducer_type;
|
||||
using closure_type =
|
||||
Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
|
||||
reducer_type>;
|
||||
return internal_team_size_max<closure_type>(f);
|
||||
}
|
||||
|
||||
|
@ -200,25 +160,10 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
|
|||
return internal_team_size_max<closure_type>(f);
|
||||
}
|
||||
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
template <class FunctorType>
|
||||
static int team_size_recommended(const FunctorType& functor) {
|
||||
return team_size_max(functor);
|
||||
}
|
||||
|
||||
template <class FunctorType>
|
||||
static int team_size_recommended(const FunctorType& functor,
|
||||
const int vector_length) {
|
||||
int max = team_size_max(functor) / vector_length;
|
||||
if (max < 1) max = 1;
|
||||
return max;
|
||||
}
|
||||
#endif
|
||||
|
||||
template <class FunctorType>
|
||||
int team_size_recommended(const FunctorType& f, const ParallelForTag&) const {
|
||||
typedef Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>
|
||||
closure_type;
|
||||
using closure_type =
|
||||
Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>;
|
||||
cudaFuncAttributes attr =
|
||||
CudaParallelLaunch<closure_type, typename traits::launch_bounds>::
|
||||
get_cuda_func_attributes();
|
||||
|
@ -235,24 +180,24 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
|
|||
template <class FunctorType>
|
||||
inline int team_size_recommended(const FunctorType& f,
|
||||
const ParallelReduceTag&) const {
|
||||
typedef Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
|
||||
TeamPolicyInternal, FunctorType>
|
||||
functor_analysis_type;
|
||||
typedef typename Impl::ParallelReduceReturnValue<
|
||||
using functor_analysis_type =
|
||||
Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
|
||||
TeamPolicyInternal, FunctorType>;
|
||||
using reducer_type = typename Impl::ParallelReduceReturnValue<
|
||||
void, typename functor_analysis_type::value_type,
|
||||
FunctorType>::reducer_type reducer_type;
|
||||
typedef Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
|
||||
reducer_type>
|
||||
closure_type;
|
||||
FunctorType>::reducer_type;
|
||||
using closure_type =
|
||||
Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
|
||||
reducer_type>;
|
||||
return internal_team_size_recommended<closure_type>(f);
|
||||
}
|
||||
|
||||
template <class FunctorType, class ReducerType>
|
||||
int team_size_recommended(const FunctorType& f, const ReducerType&,
|
||||
const ParallelReduceTag&) const {
|
||||
typedef Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
|
||||
ReducerType>
|
||||
closure_type;
|
||||
using closure_type =
|
||||
Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
|
||||
ReducerType>;
|
||||
return internal_team_size_recommended<closure_type>(f);
|
||||
}
|
||||
|
||||
|
@ -401,44 +346,6 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
|
|||
|
||||
inline int chunk_size() const { return m_chunk_size; }
|
||||
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
/** \brief set chunk_size to a discrete value*/
|
||||
inline TeamPolicyInternal set_chunk_size(
|
||||
typename traits::index_type chunk_size_) const {
|
||||
TeamPolicyInternal p = *this;
|
||||
p.m_chunk_size = chunk_size_;
|
||||
return p;
|
||||
}
|
||||
|
||||
/** \brief set per team scratch size for a specific level of the scratch
|
||||
* hierarchy */
|
||||
inline TeamPolicyInternal set_scratch_size(
|
||||
const int& level, const PerTeamValue& per_team) const {
|
||||
TeamPolicyInternal p = *this;
|
||||
p.m_team_scratch_size[level] = per_team.value;
|
||||
return p;
|
||||
};
|
||||
|
||||
/** \brief set per thread scratch size for a specific level of the scratch
|
||||
* hierarchy */
|
||||
inline TeamPolicyInternal set_scratch_size(
|
||||
const int& level, const PerThreadValue& per_thread) const {
|
||||
TeamPolicyInternal p = *this;
|
||||
p.m_thread_scratch_size[level] = per_thread.value;
|
||||
return p;
|
||||
};
|
||||
|
||||
/** \brief set per thread and per team scratch size for a specific level of
|
||||
* the scratch hierarchy */
|
||||
inline TeamPolicyInternal set_scratch_size(
|
||||
const int& level, const PerTeamValue& per_team,
|
||||
const PerThreadValue& per_thread) const {
|
||||
TeamPolicyInternal p = *this;
|
||||
p.m_team_scratch_size[level] = per_team.value;
|
||||
p.m_thread_scratch_size[level] = per_thread.value;
|
||||
return p;
|
||||
};
|
||||
#else
|
||||
/** \brief set chunk_size to a discrete value*/
|
||||
inline TeamPolicyInternal& set_chunk_size(
|
||||
typename traits::index_type chunk_size_) {
|
||||
|
@ -471,46 +378,10 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
|
|||
m_thread_scratch_size[level] = per_thread.value;
|
||||
return *this;
|
||||
}
|
||||
#endif
|
||||
|
||||
typedef Kokkos::Impl::CudaTeamMember member_type;
|
||||
using member_type = Kokkos::Impl::CudaTeamMember;
|
||||
|
||||
protected:
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
/** \brief set chunk_size to a discrete value*/
|
||||
inline TeamPolicyInternal internal_set_chunk_size(
|
||||
typename traits::index_type chunk_size_) {
|
||||
m_chunk_size = chunk_size_;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/** \brief set per team scratch size for a specific level of the scratch
|
||||
* hierarchy */
|
||||
inline TeamPolicyInternal internal_set_scratch_size(
|
||||
const int& level, const PerTeamValue& per_team) {
|
||||
m_team_scratch_size[level] = per_team.value;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/** \brief set per thread scratch size for a specific level of the scratch
|
||||
* hierarchy */
|
||||
inline TeamPolicyInternal internal_set_scratch_size(
|
||||
const int& level, const PerThreadValue& per_thread) {
|
||||
m_thread_scratch_size[level] = per_thread.value;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/** \brief set per thread and per team scratch size for a specific level of
|
||||
* the scratch hierarchy */
|
||||
inline TeamPolicyInternal internal_set_scratch_size(
|
||||
const int& level, const PerTeamValue& per_team,
|
||||
const PerThreadValue& per_thread) {
|
||||
m_team_scratch_size[level] = per_team.value;
|
||||
m_thread_scratch_size[level] = per_thread.value;
|
||||
return *this;
|
||||
}
|
||||
#endif
|
||||
|
||||
template <class ClosureType, class FunctorType, class BlockSizeCallable>
|
||||
int internal_team_size_common(const FunctorType& f,
|
||||
BlockSizeCallable&& block_size_callable) const {
|
||||
|
@ -567,12 +438,12 @@ namespace Impl {
|
|||
template <class FunctorType, class... Traits>
|
||||
class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
|
||||
public:
|
||||
typedef Kokkos::RangePolicy<Traits...> Policy;
|
||||
using Policy = Kokkos::RangePolicy<Traits...>;
|
||||
|
||||
private:
|
||||
typedef typename Policy::member_type Member;
|
||||
typedef typename Policy::work_tag WorkTag;
|
||||
typedef typename Policy::launch_bounds LaunchBounds;
|
||||
using Member = typename Policy::member_type;
|
||||
using WorkTag = typename Policy::work_tag;
|
||||
using LaunchBounds = typename Policy::launch_bounds;
|
||||
|
||||
const FunctorType m_functor;
|
||||
const Policy m_policy;
|
||||
|
@ -595,7 +466,7 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
|
|||
}
|
||||
|
||||
public:
|
||||
typedef FunctorType functor_type;
|
||||
using functor_type = FunctorType;
|
||||
|
||||
inline __device__ void operator()(void) const {
|
||||
const Member work_stride = blockDim.y * gridDim.x;
|
||||
|
@ -620,6 +491,7 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
|
|||
Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>(
|
||||
m_policy.space().impl_internal_space_instance(), attr, m_functor, 1,
|
||||
0, 0);
|
||||
KOKKOS_ASSERT(block_size > 0);
|
||||
dim3 block(1, block_size, 1);
|
||||
dim3 grid(
|
||||
std::min(
|
||||
|
@ -646,13 +518,13 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
|
|||
template <class FunctorType, class... Traits>
|
||||
class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
|
||||
public:
|
||||
typedef Kokkos::MDRangePolicy<Traits...> Policy;
|
||||
using Policy = Kokkos::MDRangePolicy<Traits...>;
|
||||
|
||||
private:
|
||||
using RP = Policy;
|
||||
typedef typename Policy::array_index_type array_index_type;
|
||||
typedef typename Policy::index_type index_type;
|
||||
typedef typename Policy::launch_bounds LaunchBounds;
|
||||
using RP = Policy;
|
||||
using array_index_type = typename Policy::array_index_type;
|
||||
using index_type = typename Policy::index_type;
|
||||
using LaunchBounds = typename Policy::launch_bounds;
|
||||
|
||||
const FunctorType m_functor;
|
||||
const Policy m_rp;
|
||||
|
@ -666,29 +538,36 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
|
|||
}
|
||||
|
||||
inline void execute() const {
|
||||
using namespace std;
|
||||
|
||||
if (m_rp.m_num_tiles == 0) return;
|
||||
const array_index_type maxblocks = static_cast<array_index_type>(
|
||||
m_rp.space().impl_internal_space_instance()->m_maxBlock);
|
||||
if (RP::rank == 2) {
|
||||
const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], 1);
|
||||
KOKKOS_ASSERT(block.x > 0);
|
||||
KOKKOS_ASSERT(block.y > 0);
|
||||
const dim3 grid(
|
||||
std::min((m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x,
|
||||
maxblocks),
|
||||
std::min((m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1) / block.y,
|
||||
maxblocks),
|
||||
min((m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x,
|
||||
maxblocks),
|
||||
min((m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1) / block.y,
|
||||
maxblocks),
|
||||
1);
|
||||
CudaParallelLaunch<ParallelFor, LaunchBounds>(
|
||||
*this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
|
||||
false);
|
||||
} else if (RP::rank == 3) {
|
||||
const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]);
|
||||
KOKKOS_ASSERT(block.x > 0);
|
||||
KOKKOS_ASSERT(block.y > 0);
|
||||
KOKKOS_ASSERT(block.z > 0);
|
||||
const dim3 grid(
|
||||
std::min((m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x,
|
||||
maxblocks),
|
||||
std::min((m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1) / block.y,
|
||||
maxblocks),
|
||||
std::min((m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z,
|
||||
maxblocks));
|
||||
min((m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x,
|
||||
maxblocks),
|
||||
min((m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1) / block.y,
|
||||
maxblocks),
|
||||
min((m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z,
|
||||
maxblocks));
|
||||
CudaParallelLaunch<ParallelFor, LaunchBounds>(
|
||||
*this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
|
||||
false);
|
||||
|
@ -697,14 +576,15 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
|
|||
// threadIdx.z
|
||||
const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], m_rp.m_tile[2],
|
||||
m_rp.m_tile[3]);
|
||||
KOKKOS_ASSERT(block.y > 0);
|
||||
KOKKOS_ASSERT(block.z > 0);
|
||||
const dim3 grid(
|
||||
std::min(
|
||||
static_cast<index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1]),
|
||||
min(static_cast<index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1]),
|
||||
static_cast<index_type>(maxblocks)),
|
||||
std::min((m_rp.m_upper[2] - m_rp.m_lower[2] + block.y - 1) / block.y,
|
||||
maxblocks),
|
||||
std::min((m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z,
|
||||
maxblocks));
|
||||
min((m_rp.m_upper[2] - m_rp.m_lower[2] + block.y - 1) / block.y,
|
||||
maxblocks),
|
||||
min((m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z,
|
||||
maxblocks));
|
||||
CudaParallelLaunch<ParallelFor, LaunchBounds>(
|
||||
*this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
|
||||
false);
|
||||
|
@ -713,15 +593,14 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
|
|||
// threadIdx.z
|
||||
const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1],
|
||||
m_rp.m_tile[2] * m_rp.m_tile[3], m_rp.m_tile[4]);
|
||||
KOKKOS_ASSERT(block.z > 0);
|
||||
const dim3 grid(
|
||||
std::min(
|
||||
static_cast<index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1]),
|
||||
min(static_cast<index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1]),
|
||||
static_cast<index_type>(maxblocks)),
|
||||
std::min(
|
||||
static_cast<index_type>(m_rp.m_tile_end[2] * m_rp.m_tile_end[3]),
|
||||
min(static_cast<index_type>(m_rp.m_tile_end[2] * m_rp.m_tile_end[3]),
|
||||
static_cast<index_type>(maxblocks)),
|
||||
std::min((m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z,
|
||||
maxblocks));
|
||||
min((m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z,
|
||||
maxblocks));
|
||||
CudaParallelLaunch<ParallelFor, LaunchBounds>(
|
||||
*this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
|
||||
false);
|
||||
|
@ -732,14 +611,11 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
|
|||
m_rp.m_tile[2] * m_rp.m_tile[3],
|
||||
m_rp.m_tile[4] * m_rp.m_tile[5]);
|
||||
const dim3 grid(
|
||||
std::min(
|
||||
static_cast<index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1]),
|
||||
min(static_cast<index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1]),
|
||||
static_cast<index_type>(maxblocks)),
|
||||
std::min(
|
||||
static_cast<index_type>(m_rp.m_tile_end[2] * m_rp.m_tile_end[3]),
|
||||
min(static_cast<index_type>(m_rp.m_tile_end[2] * m_rp.m_tile_end[3]),
|
||||
static_cast<index_type>(maxblocks)),
|
||||
std::min(
|
||||
static_cast<index_type>(m_rp.m_tile_end[4] * m_rp.m_tile_end[5]),
|
||||
min(static_cast<index_type>(m_rp.m_tile_end[4] * m_rp.m_tile_end[5]),
|
||||
static_cast<index_type>(maxblocks)));
|
||||
CudaParallelLaunch<ParallelFor, LaunchBounds>(
|
||||
*this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
|
||||
|
@ -760,16 +636,16 @@ template <class FunctorType, class... Properties>
|
|||
class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
|
||||
Kokkos::Cuda> {
|
||||
public:
|
||||
typedef TeamPolicyInternal<Kokkos::Cuda, Properties...> Policy;
|
||||
using Policy = TeamPolicyInternal<Kokkos::Cuda, Properties...>;
|
||||
|
||||
private:
|
||||
typedef typename Policy::member_type Member;
|
||||
typedef typename Policy::work_tag WorkTag;
|
||||
typedef typename Policy::launch_bounds LaunchBounds;
|
||||
using Member = typename Policy::member_type;
|
||||
using WorkTag = typename Policy::work_tag;
|
||||
using LaunchBounds = typename Policy::launch_bounds;
|
||||
|
||||
public:
|
||||
typedef FunctorType functor_type;
|
||||
typedef Cuda::size_type size_type;
|
||||
using functor_type = FunctorType;
|
||||
using size_type = Cuda::size_type;
|
||||
|
||||
private:
|
||||
// Algorithmic constraints: blockDim.y is a power of two AND blockDim.y ==
|
||||
|
@ -941,34 +817,34 @@ template <class FunctorType, class ReducerType, class... Traits>
|
|||
class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
|
||||
Kokkos::Cuda> {
|
||||
public:
|
||||
typedef Kokkos::RangePolicy<Traits...> Policy;
|
||||
using Policy = Kokkos::RangePolicy<Traits...>;
|
||||
|
||||
private:
|
||||
typedef typename Policy::WorkRange WorkRange;
|
||||
typedef typename Policy::work_tag WorkTag;
|
||||
typedef typename Policy::member_type Member;
|
||||
typedef typename Policy::launch_bounds LaunchBounds;
|
||||
using WorkRange = typename Policy::WorkRange;
|
||||
using WorkTag = typename Policy::work_tag;
|
||||
using Member = typename Policy::member_type;
|
||||
using LaunchBounds = typename Policy::launch_bounds;
|
||||
|
||||
typedef Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
|
||||
FunctorType, ReducerType>
|
||||
ReducerConditional;
|
||||
typedef typename ReducerConditional::type ReducerTypeFwd;
|
||||
typedef
|
||||
using ReducerConditional =
|
||||
Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
|
||||
FunctorType, ReducerType>;
|
||||
using ReducerTypeFwd = typename ReducerConditional::type;
|
||||
using WorkTagFwd =
|
||||
typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
|
||||
WorkTag, void>::type WorkTagFwd;
|
||||
WorkTag, void>::type;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>
|
||||
ValueTraits;
|
||||
typedef Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd> ValueInit;
|
||||
typedef Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd> ValueJoin;
|
||||
using ValueTraits =
|
||||
Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
|
||||
using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
|
||||
using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
|
||||
|
||||
public:
|
||||
typedef typename ValueTraits::pointer_type pointer_type;
|
||||
typedef typename ValueTraits::value_type value_type;
|
||||
typedef typename ValueTraits::reference_type reference_type;
|
||||
typedef FunctorType functor_type;
|
||||
typedef Kokkos::Cuda::size_type size_type;
|
||||
typedef typename Policy::index_type index_type;
|
||||
using pointer_type = typename ValueTraits::pointer_type;
|
||||
using value_type = typename ValueTraits::value_type;
|
||||
using reference_type = typename ValueTraits::reference_type;
|
||||
using functor_type = FunctorType;
|
||||
using size_type = Kokkos::Cuda::size_type;
|
||||
using index_type = typename Policy::index_type;
|
||||
|
||||
// Algorithmic constraints: blockSize is a power of two AND blockDim.y ==
|
||||
// blockDim.z == 1
|
||||
|
@ -990,8 +866,8 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
|
|||
//};
|
||||
// Some crutch to do function overloading
|
||||
private:
|
||||
typedef double DummyShflReductionType;
|
||||
typedef int DummySHMEMReductionType;
|
||||
using DummyShflReductionType = double;
|
||||
using DummySHMEMReductionType = int;
|
||||
|
||||
public:
|
||||
// Make the exec_range calls call to Reduce::DeviceIterateTile
|
||||
|
@ -1124,13 +1000,19 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
|
|||
int shmem_size =
|
||||
cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>(
|
||||
f, n);
|
||||
using closure_type = Impl::ParallelReduce<FunctorType, Policy, ReducerType>;
|
||||
cudaFuncAttributes attr =
|
||||
CudaParallelLaunch<closure_type,
|
||||
LaunchBounds>::get_cuda_func_attributes();
|
||||
while (
|
||||
(n &&
|
||||
(m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
|
||||
shmem_size)) ||
|
||||
(n > static_cast<unsigned>(
|
||||
Kokkos::Impl::cuda_get_max_block_size<
|
||||
ParallelReduce, LaunchBounds>(f, 1, shmem_size, 0)))) {
|
||||
(n >
|
||||
static_cast<unsigned>(
|
||||
Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>(
|
||||
m_policy.space().impl_internal_space_instance(), attr, f, 1,
|
||||
shmem_size, 0)))) {
|
||||
n >>= 1;
|
||||
shmem_size = cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
|
||||
WorkTag>(f, n);
|
||||
|
@ -1142,6 +1024,7 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
|
|||
const index_type nwork = m_policy.end() - m_policy.begin();
|
||||
if (nwork) {
|
||||
const int block_size = local_block_size(m_functor);
|
||||
KOKKOS_ASSERT(block_size > 0);
|
||||
|
||||
m_scratch_space = cuda_internal_scratch_space(
|
||||
m_policy.space(), ValueTraits::value_size(ReducerConditional::select(
|
||||
|
@ -1215,9 +1098,9 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
|
|||
m_result_ptr_device_accessible(
|
||||
MemorySpaceAccess<Kokkos::CudaSpace,
|
||||
typename ViewType::memory_space>::accessible),
|
||||
m_scratch_space(0),
|
||||
m_scratch_flags(0),
|
||||
m_unified_space(0) {}
|
||||
m_scratch_space(nullptr),
|
||||
m_scratch_flags(nullptr),
|
||||
m_unified_space(nullptr) {}
|
||||
|
||||
ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
|
||||
const ReducerType& reducer)
|
||||
|
@ -1229,9 +1112,9 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
|
|||
MemorySpaceAccess<Kokkos::CudaSpace,
|
||||
typename ReducerType::result_view_type::
|
||||
memory_space>::accessible),
|
||||
m_scratch_space(0),
|
||||
m_scratch_flags(0),
|
||||
m_unified_space(0) {}
|
||||
m_scratch_space(nullptr),
|
||||
m_scratch_flags(nullptr),
|
||||
m_unified_space(nullptr) {}
|
||||
};
|
||||
|
||||
// MDRangePolicy impl
|
||||
|
@ -1239,35 +1122,35 @@ template <class FunctorType, class ReducerType, class... Traits>
|
|||
class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
|
||||
Kokkos::Cuda> {
|
||||
public:
|
||||
typedef Kokkos::MDRangePolicy<Traits...> Policy;
|
||||
using Policy = Kokkos::MDRangePolicy<Traits...>;
|
||||
|
||||
private:
|
||||
typedef typename Policy::array_index_type array_index_type;
|
||||
typedef typename Policy::index_type index_type;
|
||||
using array_index_type = typename Policy::array_index_type;
|
||||
using index_type = typename Policy::index_type;
|
||||
|
||||
typedef typename Policy::work_tag WorkTag;
|
||||
typedef typename Policy::member_type Member;
|
||||
typedef typename Policy::launch_bounds LaunchBounds;
|
||||
using WorkTag = typename Policy::work_tag;
|
||||
using Member = typename Policy::member_type;
|
||||
using LaunchBounds = typename Policy::launch_bounds;
|
||||
|
||||
typedef Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
|
||||
FunctorType, ReducerType>
|
||||
ReducerConditional;
|
||||
typedef typename ReducerConditional::type ReducerTypeFwd;
|
||||
typedef
|
||||
using ReducerConditional =
|
||||
Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
|
||||
FunctorType, ReducerType>;
|
||||
using ReducerTypeFwd = typename ReducerConditional::type;
|
||||
using WorkTagFwd =
|
||||
typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
|
||||
WorkTag, void>::type WorkTagFwd;
|
||||
WorkTag, void>::type;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>
|
||||
ValueTraits;
|
||||
typedef Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd> ValueInit;
|
||||
typedef Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd> ValueJoin;
|
||||
using ValueTraits =
|
||||
Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
|
||||
using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
|
||||
using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
|
||||
|
||||
public:
|
||||
typedef typename ValueTraits::pointer_type pointer_type;
|
||||
typedef typename ValueTraits::value_type value_type;
|
||||
typedef typename ValueTraits::reference_type reference_type;
|
||||
typedef FunctorType functor_type;
|
||||
typedef Cuda::size_type size_type;
|
||||
using pointer_type = typename ValueTraits::pointer_type;
|
||||
using value_type = typename ValueTraits::value_type;
|
||||
using reference_type = typename ValueTraits::reference_type;
|
||||
using functor_type = FunctorType;
|
||||
using size_type = Cuda::size_type;
|
||||
|
||||
// Algorithmic constraints: blockSize is a power of two AND blockDim.y ==
|
||||
// blockDim.z == 1
|
||||
|
@ -1281,10 +1164,9 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
|
|||
size_type* m_scratch_flags;
|
||||
size_type* m_unified_space;
|
||||
|
||||
typedef typename Kokkos::Impl::Reduce::DeviceIterateTile<
|
||||
using DeviceIteratePattern = typename Kokkos::Impl::Reduce::DeviceIterateTile<
|
||||
Policy::rank, Policy, FunctorType, typename Policy::work_tag,
|
||||
reference_type>
|
||||
DeviceIteratePattern;
|
||||
reference_type>;
|
||||
|
||||
// Shall we use the shfl based reduction or not (only use it for static sized
|
||||
// types of more than 128bit
|
||||
|
@ -1294,8 +1176,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
|
|||
};
|
||||
// Some crutch to do function overloading
|
||||
private:
|
||||
typedef double DummyShflReductionType;
|
||||
typedef int DummySHMEMReductionType;
|
||||
using DummyShflReductionType = double;
|
||||
using DummySHMEMReductionType = int;
|
||||
|
||||
public:
|
||||
inline __device__ void exec_range(reference_type update) const {
|
||||
|
@ -1414,13 +1296,19 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
|
|||
int shmem_size =
|
||||
cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>(
|
||||
f, n);
|
||||
using closure_type = Impl::ParallelReduce<FunctorType, Policy, ReducerType>;
|
||||
cudaFuncAttributes attr =
|
||||
CudaParallelLaunch<closure_type,
|
||||
LaunchBounds>::get_cuda_func_attributes();
|
||||
while (
|
||||
(n &&
|
||||
(m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
|
||||
shmem_size)) ||
|
||||
(n > static_cast<unsigned>(
|
||||
Kokkos::Impl::cuda_get_max_block_size<
|
||||
ParallelReduce, LaunchBounds>(f, 1, shmem_size, 0)))) {
|
||||
(n >
|
||||
static_cast<unsigned>(
|
||||
Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>(
|
||||
m_policy.space().impl_internal_space_instance(), attr, f, 1,
|
||||
shmem_size, 0)))) {
|
||||
n >>= 1;
|
||||
shmem_size = cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
|
||||
WorkTag>(f, n);
|
||||
|
@ -1507,9 +1395,9 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
|
|||
m_result_ptr_device_accessible(
|
||||
MemorySpaceAccess<Kokkos::CudaSpace,
|
||||
typename ViewType::memory_space>::accessible),
|
||||
m_scratch_space(0),
|
||||
m_scratch_flags(0),
|
||||
m_unified_space(0) {}
|
||||
m_scratch_space(nullptr),
|
||||
m_scratch_flags(nullptr),
|
||||
m_unified_space(nullptr) {}
|
||||
|
||||
ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
|
||||
const ReducerType& reducer)
|
||||
|
@ -1521,9 +1409,9 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
|
|||
MemorySpaceAccess<Kokkos::CudaSpace,
|
||||
typename ReducerType::result_view_type::
|
||||
memory_space>::accessible),
|
||||
m_scratch_space(0),
|
||||
m_scratch_flags(0),
|
||||
m_unified_space(0) {}
|
||||
m_scratch_space(nullptr),
|
||||
m_scratch_flags(nullptr),
|
||||
m_unified_space(nullptr) {}
|
||||
};
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
@ -1532,39 +1420,39 @@ template <class FunctorType, class ReducerType, class... Properties>
|
|||
class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
|
||||
ReducerType, Kokkos::Cuda> {
|
||||
public:
|
||||
typedef TeamPolicyInternal<Kokkos::Cuda, Properties...> Policy;
|
||||
using Policy = TeamPolicyInternal<Kokkos::Cuda, Properties...>;
|
||||
|
||||
private:
|
||||
typedef typename Policy::member_type Member;
|
||||
typedef typename Policy::work_tag WorkTag;
|
||||
typedef typename Policy::launch_bounds LaunchBounds;
|
||||
using Member = typename Policy::member_type;
|
||||
using WorkTag = typename Policy::work_tag;
|
||||
using LaunchBounds = typename Policy::launch_bounds;
|
||||
|
||||
typedef Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
|
||||
FunctorType, ReducerType>
|
||||
ReducerConditional;
|
||||
typedef typename ReducerConditional::type ReducerTypeFwd;
|
||||
typedef
|
||||
using ReducerConditional =
|
||||
Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
|
||||
FunctorType, ReducerType>;
|
||||
using ReducerTypeFwd = typename ReducerConditional::type;
|
||||
using WorkTagFwd =
|
||||
typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
|
||||
WorkTag, void>::type WorkTagFwd;
|
||||
WorkTag, void>::type;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>
|
||||
ValueTraits;
|
||||
typedef Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd> ValueInit;
|
||||
typedef Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd> ValueJoin;
|
||||
using ValueTraits =
|
||||
Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
|
||||
using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
|
||||
using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type;
|
||||
typedef typename ValueTraits::reference_type reference_type;
|
||||
typedef typename ValueTraits::value_type value_type;
|
||||
using pointer_type = typename ValueTraits::pointer_type;
|
||||
using reference_type = typename ValueTraits::reference_type;
|
||||
using value_type = typename ValueTraits::value_type;
|
||||
|
||||
public:
|
||||
typedef FunctorType functor_type;
|
||||
typedef Cuda::size_type size_type;
|
||||
using functor_type = FunctorType;
|
||||
using size_type = Cuda::size_type;
|
||||
|
||||
enum { UseShflReduction = (true && (ValueTraits::StaticValueSize != 0)) };
|
||||
|
||||
private:
|
||||
typedef double DummyShflReductionType;
|
||||
typedef int DummySHMEMReductionType;
|
||||
using DummyShflReductionType = double;
|
||||
using DummySHMEMReductionType = int;
|
||||
|
||||
// Algorithmic constraints: blockDim.y is a power of two AND blockDim.y ==
|
||||
// blockDim.z == 1 shared memory utilization:
|
||||
|
@ -1818,9 +1706,9 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
|
|||
m_result_ptr_device_accessible(
|
||||
MemorySpaceAccess<Kokkos::CudaSpace,
|
||||
typename ViewType::memory_space>::accessible),
|
||||
m_scratch_space(0),
|
||||
m_scratch_flags(0),
|
||||
m_unified_space(0),
|
||||
m_scratch_space(nullptr),
|
||||
m_scratch_flags(nullptr),
|
||||
m_unified_space(nullptr),
|
||||
m_team_begin(0),
|
||||
m_shmem_begin(0),
|
||||
m_shmem_size(0),
|
||||
|
@ -1917,9 +1805,9 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
|
|||
MemorySpaceAccess<Kokkos::CudaSpace,
|
||||
typename ReducerType::result_view_type::
|
||||
memory_space>::accessible),
|
||||
m_scratch_space(0),
|
||||
m_scratch_flags(0),
|
||||
m_unified_space(0),
|
||||
m_scratch_space(nullptr),
|
||||
m_scratch_flags(nullptr),
|
||||
m_unified_space(nullptr),
|
||||
m_team_begin(0),
|
||||
m_shmem_begin(0),
|
||||
m_shmem_size(0),
|
||||
|
@ -2013,23 +1901,23 @@ namespace Impl {
|
|||
template <class FunctorType, class... Traits>
|
||||
class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
|
||||
public:
|
||||
typedef Kokkos::RangePolicy<Traits...> Policy;
|
||||
using Policy = Kokkos::RangePolicy<Traits...>;
|
||||
|
||||
private:
|
||||
typedef typename Policy::member_type Member;
|
||||
typedef typename Policy::work_tag WorkTag;
|
||||
typedef typename Policy::WorkRange WorkRange;
|
||||
typedef typename Policy::launch_bounds LaunchBounds;
|
||||
using Member = typename Policy::member_type;
|
||||
using WorkTag = typename Policy::work_tag;
|
||||
using WorkRange = typename Policy::WorkRange;
|
||||
using LaunchBounds = typename Policy::launch_bounds;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits<FunctorType, WorkTag> ValueTraits;
|
||||
typedef Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag> ValueInit;
|
||||
typedef Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag> ValueOps;
|
||||
using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, WorkTag>;
|
||||
using ValueInit = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>;
|
||||
using ValueOps = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>;
|
||||
|
||||
public:
|
||||
typedef typename ValueTraits::pointer_type pointer_type;
|
||||
typedef typename ValueTraits::reference_type reference_type;
|
||||
typedef FunctorType functor_type;
|
||||
typedef Cuda::size_type size_type;
|
||||
using pointer_type = typename ValueTraits::pointer_type;
|
||||
using reference_type = typename ValueTraits::reference_type;
|
||||
using functor_type = FunctorType;
|
||||
using size_type = Cuda::size_type;
|
||||
|
||||
private:
|
||||
// Algorithmic constraints:
|
||||
|
@ -2233,6 +2121,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
|
|||
enum { GridMaxComputeCapability_2x = 0x0ffff };
|
||||
|
||||
const int block_size = local_block_size(m_functor);
|
||||
KOKKOS_ASSERT(block_size > 0);
|
||||
|
||||
const int grid_max =
|
||||
(block_size * block_size) < GridMaxComputeCapability_2x
|
||||
|
@ -2283,8 +2172,8 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
|
|||
ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy)
|
||||
: m_functor(arg_functor),
|
||||
m_policy(arg_policy),
|
||||
m_scratch_space(0),
|
||||
m_scratch_flags(0),
|
||||
m_scratch_space(nullptr),
|
||||
m_scratch_flags(nullptr),
|
||||
m_final(false)
|
||||
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
|
||||
,
|
||||
|
@ -2299,23 +2188,23 @@ template <class FunctorType, class ReturnType, class... Traits>
|
|||
class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
|
||||
ReturnType, Kokkos::Cuda> {
|
||||
public:
|
||||
typedef Kokkos::RangePolicy<Traits...> Policy;
|
||||
using Policy = Kokkos::RangePolicy<Traits...>;
|
||||
|
||||
private:
|
||||
typedef typename Policy::member_type Member;
|
||||
typedef typename Policy::work_tag WorkTag;
|
||||
typedef typename Policy::WorkRange WorkRange;
|
||||
typedef typename Policy::launch_bounds LaunchBounds;
|
||||
using Member = typename Policy::member_type;
|
||||
using WorkTag = typename Policy::work_tag;
|
||||
using WorkRange = typename Policy::WorkRange;
|
||||
using LaunchBounds = typename Policy::launch_bounds;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits<FunctorType, WorkTag> ValueTraits;
|
||||
typedef Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag> ValueInit;
|
||||
typedef Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag> ValueOps;
|
||||
using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, WorkTag>;
|
||||
using ValueInit = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>;
|
||||
using ValueOps = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>;
|
||||
|
||||
public:
|
||||
typedef typename ValueTraits::pointer_type pointer_type;
|
||||
typedef typename ValueTraits::reference_type reference_type;
|
||||
typedef FunctorType functor_type;
|
||||
typedef Cuda::size_type size_type;
|
||||
using pointer_type = typename ValueTraits::pointer_type;
|
||||
using reference_type = typename ValueTraits::reference_type;
|
||||
using functor_type = FunctorType;
|
||||
using size_type = Cuda::size_type;
|
||||
|
||||
private:
|
||||
// Algorithmic constraints:
|
||||
|
@ -2523,6 +2412,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
|
|||
enum { GridMaxComputeCapability_2x = 0x0ffff };
|
||||
|
||||
const int block_size = local_block_size(m_functor);
|
||||
KOKKOS_ASSERT(block_size > 0);
|
||||
|
||||
const int grid_max =
|
||||
(block_size * block_size) < GridMaxComputeCapability_2x
|
||||
|
@ -2585,8 +2475,8 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
|
|||
const Policy& arg_policy, ReturnType& arg_returnvalue)
|
||||
: m_functor(arg_functor),
|
||||
m_policy(arg_policy),
|
||||
m_scratch_space(0),
|
||||
m_scratch_flags(0),
|
||||
m_scratch_space(nullptr),
|
||||
m_scratch_flags(nullptr),
|
||||
m_final(false),
|
||||
m_returnvalue(arg_returnvalue)
|
||||
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
|
||||
|
@ -2610,7 +2500,7 @@ template <class FunctorType, class ExecPolicy, class ValueType,
|
|||
class Tag = typename ExecPolicy::work_tag>
|
||||
struct CudaFunctorAdapter {
|
||||
const FunctorType f;
|
||||
typedef ValueType value_type;
|
||||
using value_type = ValueType;
|
||||
CudaFunctorAdapter(const FunctorType& f_) : f(f_) {}
|
||||
|
||||
__device__ inline void operator()(typename ExecPolicy::work_tag,
|
||||
|
@ -2680,7 +2570,7 @@ struct CudaFunctorAdapter {
|
|||
template <class FunctorType, class ExecPolicy, class ValueType>
|
||||
struct CudaFunctorAdapter<FunctorType, ExecPolicy, ValueType, void> {
|
||||
const FunctorType f;
|
||||
typedef ValueType value_type;
|
||||
using value_type = ValueType;
|
||||
CudaFunctorAdapter(const FunctorType& f_) : f(f_) {}
|
||||
|
||||
__device__ inline void operator()(const typename ExecPolicy::member_type& i,
|
||||
|
@ -2801,13 +2691,14 @@ struct CudaFunctorAdapter<FunctorType, ExecPolicy, ValueType, void> {
|
|||
template <class FunctorType, class ResultType, class Tag,
|
||||
bool Enable = IsNonTrivialReduceFunctor<FunctorType>::value>
|
||||
struct FunctorReferenceType {
|
||||
typedef ResultType& reference_type;
|
||||
using reference_type = ResultType&;
|
||||
};
|
||||
|
||||
template <class FunctorType, class ResultType, class Tag>
|
||||
struct FunctorReferenceType<FunctorType, ResultType, Tag, true> {
|
||||
typedef typename Kokkos::Impl::FunctorValueTraits<
|
||||
FunctorType, Tag>::reference_type reference_type;
|
||||
using reference_type =
|
||||
typename Kokkos::Impl::FunctorValueTraits<FunctorType,
|
||||
Tag>::reference_type;
|
||||
};
|
||||
|
||||
template <class FunctorTypeIn, class ExecPolicy, class ValueType>
|
||||
|
@ -2815,10 +2706,9 @@ struct ParallelReduceFunctorType<FunctorTypeIn, ExecPolicy, ValueType, Cuda> {
|
|||
enum {
|
||||
FunctorHasValueType = IsNonTrivialReduceFunctor<FunctorTypeIn>::value
|
||||
};
|
||||
typedef typename Kokkos::Impl::if_c<
|
||||
using functor_type = typename Kokkos::Impl::if_c<
|
||||
FunctorHasValueType, FunctorTypeIn,
|
||||
Impl::CudaFunctorAdapter<FunctorTypeIn, ExecPolicy, ValueType>>::type
|
||||
functor_type;
|
||||
Impl::CudaFunctorAdapter<FunctorTypeIn, ExecPolicy, ValueType>>::type;
|
||||
static functor_type functor(const FunctorTypeIn& functor_in) {
|
||||
return Impl::if_c<FunctorHasValueType, FunctorTypeIn, functor_type>::select(
|
||||
functor_in, functor_type(functor_in));
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue