forked from lijiext/lammps
Fix typos
This commit is contained in:
parent
60864e38d1
commit
50935b5cbc
lib/kokkos
BUILD.md
CHANGELOG.md
benchmarks/policy_performance
bin
cmake
containers
core
src
Cuda
Kokkos_Concepts.hpp
Kokkos_CopyViews.hpp
Kokkos_CudaSpace.hpp
Kokkos_HBWSpace.hpp
Kokkos_HostSpace.hpp
Kokkos_OpenMPTargetSpace.hpp
Kokkos_Parallel_Reduce.hpp
Kokkos_ROCmSpace.hpp
OpenMP
Threads
impl
unit_test
example/tutorial
|
@ -18,7 +18,7 @@ Then for every executable or library in your project:
|
||||||
target_link_libraries(myTarget Kokkos::kokkos)
|
target_link_libraries(myTarget Kokkos::kokkos)
|
||||||
````
|
````
|
||||||
That's it! There is no checking Kokkos preprocessor, compiler, or linker flags.
|
That's it! There is no checking Kokkos preprocessor, compiler, or linker flags.
|
||||||
Kokkos propagates all the necesssary flags to your project.
|
Kokkos propagates all the necessary flags to your project.
|
||||||
This means not only is linking to Kokkos easy, but Kokkos itself can actually configure compiler and linker flags for *your*
|
This means not only is linking to Kokkos easy, but Kokkos itself can actually configure compiler and linker flags for *your*
|
||||||
project. If building in-tree, there is no `find_package` and you link with `target_link_libraries(kokkos)`.
|
project. If building in-tree, there is no `find_package` and you link with `target_link_libraries(kokkos)`.
|
||||||
|
|
||||||
|
@ -103,7 +103,7 @@ endif()
|
||||||
# Kokkos Keyword Listing
|
# Kokkos Keyword Listing
|
||||||
|
|
||||||
## Device Backends
|
## Device Backends
|
||||||
Device backends can be enabled by specifiying `-DKokkos_ENABLE_X`.
|
Device backends can be enabled by specifying `-DKokkos_ENABLE_X`.
|
||||||
|
|
||||||
* Kokkos_ENABLE_CUDA
|
* Kokkos_ENABLE_CUDA
|
||||||
* Whether to build CUDA backend
|
* Whether to build CUDA backend
|
||||||
|
@ -122,7 +122,7 @@ Device backends can be enabled by specifiying `-DKokkos_ENABLE_X`.
|
||||||
* BOOL Default: ON
|
* BOOL Default: ON
|
||||||
|
|
||||||
## Enable Options
|
## Enable Options
|
||||||
Options can be enabled by specifiying `-DKokkos_ENABLE_X`.
|
Options can be enabled by specifying `-DKokkos_ENABLE_X`.
|
||||||
|
|
||||||
* Kokkos_ENABLE_AGGRESSIVE_VECTORIZATION
|
* Kokkos_ENABLE_AGGRESSIVE_VECTORIZATION
|
||||||
* Whether to aggressively vectorize loops
|
* Whether to aggressively vectorize loops
|
||||||
|
@ -225,7 +225,7 @@ The following options control `find_package` paths for CMake-based TPLs:
|
||||||
* PATH Default:
|
* PATH Default:
|
||||||
|
|
||||||
## Architecture Keywords
|
## Architecture Keywords
|
||||||
Architecture-specific optimizations can be enabled by specifiying `-DKokkos_ARCH_X`.
|
Architecture-specific optimizations can be enabled by specifying `-DKokkos_ARCH_X`.
|
||||||
|
|
||||||
* Kokkos_ARCH_AMDAVX
|
* Kokkos_ARCH_AMDAVX
|
||||||
* Whether to optimize for the AMDAVX architecture
|
* Whether to optimize for the AMDAVX architecture
|
||||||
|
|
|
@ -31,7 +31,7 @@
|
||||||
- OffsetView: Kokkos::OffsetView missing constructor which takes pointer [\#2247](https://github.com/kokkos/kokkos/issues/2247)
|
- OffsetView: Kokkos::OffsetView missing constructor which takes pointer [\#2247](https://github.com/kokkos/kokkos/issues/2247)
|
||||||
- OffsetView: Kokkos::OffsetView: allow offset=0 [\#2246](https://github.com/kokkos/kokkos/issues/2246)
|
- OffsetView: Kokkos::OffsetView: allow offset=0 [\#2246](https://github.com/kokkos/kokkos/issues/2246)
|
||||||
- DeepCopy: Missing DeepCopy instrumentation in Kokkos [\#2522](https://github.com/kokkos/kokkos/issues/2522)
|
- DeepCopy: Missing DeepCopy instrumentation in Kokkos [\#2522](https://github.com/kokkos/kokkos/issues/2522)
|
||||||
- nvcc\_wrapper: --host-only fails with mutiple -W\* flags [\#2484](https://github.com/kokkos/kokkos/issues/2484)
|
- nvcc\_wrapper: --host-only fails with multiple -W\* flags [\#2484](https://github.com/kokkos/kokkos/issues/2484)
|
||||||
- nvcc\_wrapper: taking first -std option is counterintuitive [\#2553](https://github.com/kokkos/kokkos/issues/2553)
|
- nvcc\_wrapper: taking first -std option is counterintuitive [\#2553](https://github.com/kokkos/kokkos/issues/2553)
|
||||||
- Subview: Error taking subviews of views with static_extents of min rank [\#2448](https://github.com/kokkos/kokkos/issues/2448)
|
- Subview: Error taking subviews of views with static_extents of min rank [\#2448](https://github.com/kokkos/kokkos/issues/2448)
|
||||||
- TeamPolicy: reducers with valuetypes without += broken on CUDA [\#2410](https://github.com/kokkos/kokkos/issues/2410)
|
- TeamPolicy: reducers with valuetypes without += broken on CUDA [\#2410](https://github.com/kokkos/kokkos/issues/2410)
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
# Sample script for benchmarking policy performance
|
# Sample script for benchmarking policy performance
|
||||||
|
|
||||||
# Suggested enviroment variables to export prior to executing script:
|
# Suggested environment variables to export prior to executing script:
|
||||||
# KNL:
|
# KNL:
|
||||||
# OMP_NUM_THREADS=256 KMP_AFFINITY=compact
|
# OMP_NUM_THREADS=256 KMP_AFFINITY=compact
|
||||||
# Power:
|
# Power:
|
||||||
|
|
|
@ -383,7 +383,7 @@ fi
|
||||||
# Check unknown arguments
|
# Check unknown arguments
|
||||||
################################################################################
|
################################################################################
|
||||||
if [[ ${#UNKNOWN_ARGS[*]} > 0 ]]; then
|
if [[ ${#UNKNOWN_ARGS[*]} > 0 ]]; then
|
||||||
echo "HPCBIND Uknown options: ${UNKNOWN_ARGS[*]}" > >(tee -a ${HPCBIND_LOG})
|
echo "HPCBIND Unknown options: ${UNKNOWN_ARGS[*]}" > >(tee -a ${HPCBIND_LOG})
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|
|
@ -81,7 +81,7 @@ add_library(B b.cpp)
|
||||||
target_compile_features(B PUBLIC cxx_std_14)
|
target_compile_features(B PUBLIC cxx_std_14)
|
||||||
target_link_libraries(A B)
|
target_link_libraries(A B)
|
||||||
````
|
````
|
||||||
I have requested two diferent features.
|
I have requested two different features.
|
||||||
CMake understands the requests and knows that `cxx_std_11` is a subset of `cxx_std_14`.
|
CMake understands the requests and knows that `cxx_std_11` is a subset of `cxx_std_14`.
|
||||||
CMake then picks C++14 for library `B`.
|
CMake then picks C++14 for library `B`.
|
||||||
CMake would not have been able to do feature resolution if we had directly done:
|
CMake would not have been able to do feature resolution if we had directly done:
|
||||||
|
@ -179,7 +179,7 @@ If you do not find the TPL where you expect it, you should error out rather than
|
||||||
#### Finding TPLs
|
#### Finding TPLs
|
||||||
|
|
||||||
If finding a TPL that is not a modern CMake project, refer to the `FindHWLOC.cmake` file in `cmake/Modules` for an example.
|
If finding a TPL that is not a modern CMake project, refer to the `FindHWLOC.cmake` file in `cmake/Modules` for an example.
|
||||||
You will ususally need to verify expected headers with `find_path`
|
You will usually need to verify expected headers with `find_path`
|
||||||
````
|
````
|
||||||
find_path(TPL_INCLUDE_DIR mytpl.h PATHS "${KOKKOS_MYTPL_DIR}/include")
|
find_path(TPL_INCLUDE_DIR mytpl.h PATHS "${KOKKOS_MYTPL_DIR}/include")
|
||||||
````
|
````
|
||||||
|
|
|
@ -100,7 +100,7 @@ class UnorderedMapInsertResult {
|
||||||
KOKKOS_FORCEINLINE_FUNCTION
|
KOKKOS_FORCEINLINE_FUNCTION
|
||||||
bool existing() const { return (m_status & EXISTING); }
|
bool existing() const { return (m_status & EXISTING); }
|
||||||
|
|
||||||
/// Did the map fail to insert the key due to insufficent capacity
|
/// Did the map fail to insert the key due to insufficient capacity
|
||||||
KOKKOS_FORCEINLINE_FUNCTION
|
KOKKOS_FORCEINLINE_FUNCTION
|
||||||
bool failed() const { return m_index == UnorderedMapInvalidIndex; }
|
bool failed() const { return m_index == UnorderedMapInvalidIndex; }
|
||||||
|
|
||||||
|
|
|
@ -109,7 +109,7 @@ class vector : public DualView<Scalar*, LayoutLeft, Arg1Type> {
|
||||||
void resize(size_t n, const Scalar& val) { assign(n, val); }
|
void resize(size_t n, const Scalar& val) { assign(n, val); }
|
||||||
|
|
||||||
void assign(size_t n, const Scalar& val) {
|
void assign(size_t n, const Scalar& val) {
|
||||||
/* Resize if necessary (behavour of std:vector) */
|
/* Resize if necessary (behavior of std::vector) */
|
||||||
|
|
||||||
if (n > span()) DV::resize(size_t(n * _extra_storage));
|
if (n > span()) DV::resize(size_t(n * _extra_storage));
|
||||||
_size = n;
|
_size = n;
|
||||||
|
@ -234,7 +234,7 @@ class vector : public DualView<Scalar*, LayoutLeft, Arg1Type> {
|
||||||
|
|
||||||
const_reference back() const { return DV::h_view(_size - 1); }
|
const_reference back() const { return DV::h_view(_size - 1); }
|
||||||
|
|
||||||
/* std::algorithms wich work originally with iterators, here they are
|
/* std::algorithms which work originally with iterators, here they are
|
||||||
* implemented as member functions */
|
* implemented as member functions */
|
||||||
|
|
||||||
size_t lower_bound(const size_t& start, const size_t& theEnd,
|
size_t lower_bound(const size_t& start, const size_t& theEnd,
|
||||||
|
|
|
@ -192,7 +192,7 @@ void test_bitset() {
|
||||||
|
|
||||||
bitset_type bitset(test_sizes[i]);
|
bitset_type bitset(test_sizes[i]);
|
||||||
|
|
||||||
// std::cout << " Check inital count " << std::endl;
|
// std::cout << " Check initial count " << std::endl;
|
||||||
// nothing should be set
|
// nothing should be set
|
||||||
{
|
{
|
||||||
Impl::TestBitsetTest<bitset_type> f(bitset);
|
Impl::TestBitsetTest<bitset_type> f(bitset);
|
||||||
|
|
|
@ -64,7 +64,7 @@ struct TestViewCtorProp_EmbeddedDim {
|
||||||
using DynRankViewIntType = typename Kokkos::DynRankView<int, ExecSpace>;
|
using DynRankViewIntType = typename Kokkos::DynRankView<int, ExecSpace>;
|
||||||
using DynRankViewDoubleType = typename Kokkos::DynRankView<double, ExecSpace>;
|
using DynRankViewDoubleType = typename Kokkos::DynRankView<double, ExecSpace>;
|
||||||
|
|
||||||
// Cuda 7.0 has issues with using a lamda in parallel_for to initialize the
|
// Cuda 7.0 has issues with using a lambda in parallel_for to initialize the
|
||||||
// view - replace with this functor
|
// view - replace with this functor
|
||||||
template <class ViewType>
|
template <class ViewType>
|
||||||
struct Functor {
|
struct Functor {
|
||||||
|
|
|
@ -113,10 +113,10 @@ __device__
|
||||||
|
|
||||||
#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
|
#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
|
||||||
|
|
||||||
/// \brief Aquire a lock for the address
|
/// \brief Acquire a lock for the address
|
||||||
///
|
///
|
||||||
/// This function tries to aquire the lock for the hash value derived
|
/// This function tries to acquire the lock for the hash value derived
|
||||||
/// from the provided ptr. If the lock is successfully aquired the
|
/// from the provided ptr. If the lock is successfully acquired the
|
||||||
/// function returns true. Otherwise it returns false.
|
/// function returns true. Otherwise it returns false.
|
||||||
__device__ inline bool lock_address_cuda_space(void* ptr) {
|
__device__ inline bool lock_address_cuda_space(void* ptr) {
|
||||||
size_t offset = size_t(ptr);
|
size_t offset = size_t(ptr);
|
||||||
|
@ -131,7 +131,7 @@ __device__ inline bool lock_address_cuda_space(void* ptr) {
|
||||||
///
|
///
|
||||||
/// This function releases the lock for the hash value derived
|
/// This function releases the lock for the hash value derived
|
||||||
/// from the provided ptr. This function should only be called
|
/// from the provided ptr. This function should only be called
|
||||||
/// after previously successfully aquiring a lock with
|
/// after previously successfully acquiring a lock with
|
||||||
/// lock_address.
|
/// lock_address.
|
||||||
__device__ inline void unlock_address_cuda_space(void* ptr) {
|
__device__ inline void unlock_address_cuda_space(void* ptr) {
|
||||||
size_t offset = size_t(ptr);
|
size_t offset = size_t(ptr);
|
||||||
|
|
|
@ -325,13 +325,13 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
|
||||||
m_team_scratch_size{0, 0},
|
m_team_scratch_size{0, 0},
|
||||||
m_thread_scratch_size{0, 0},
|
m_thread_scratch_size{0, 0},
|
||||||
m_chunk_size(32) {
|
m_chunk_size(32) {
|
||||||
// Make sure league size is permissable
|
// Make sure league size is permissible
|
||||||
if (league_size_ >= int(Impl::cuda_internal_maximum_grid_count()))
|
if (league_size_ >= int(Impl::cuda_internal_maximum_grid_count()))
|
||||||
Impl::throw_runtime_exception(
|
Impl::throw_runtime_exception(
|
||||||
"Requested too large league_size for TeamPolicy on Cuda execution "
|
"Requested too large league_size for TeamPolicy on Cuda execution "
|
||||||
"space.");
|
"space.");
|
||||||
|
|
||||||
// Make sure total block size is permissable
|
// Make sure total block size is permissible
|
||||||
if (m_team_size * m_vector_length > 1024) {
|
if (m_team_size * m_vector_length > 1024) {
|
||||||
Impl::throw_runtime_exception(
|
Impl::throw_runtime_exception(
|
||||||
std::string("Kokkos::TeamPolicy< Cuda > the team size is too large. "
|
std::string("Kokkos::TeamPolicy< Cuda > the team size is too large. "
|
||||||
|
@ -351,7 +351,7 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
|
||||||
m_team_scratch_size{0, 0},
|
m_team_scratch_size{0, 0},
|
||||||
m_thread_scratch_size{0, 0},
|
m_thread_scratch_size{0, 0},
|
||||||
m_chunk_size(32) {
|
m_chunk_size(32) {
|
||||||
// Make sure league size is permissable
|
// Make sure league size is permissible
|
||||||
if (league_size_ >= int(Impl::cuda_internal_maximum_grid_count()))
|
if (league_size_ >= int(Impl::cuda_internal_maximum_grid_count()))
|
||||||
Impl::throw_runtime_exception(
|
Impl::throw_runtime_exception(
|
||||||
"Requested too large league_size for TeamPolicy on Cuda execution "
|
"Requested too large league_size for TeamPolicy on Cuda execution "
|
||||||
|
@ -367,13 +367,13 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
|
||||||
m_team_scratch_size{0, 0},
|
m_team_scratch_size{0, 0},
|
||||||
m_thread_scratch_size{0, 0},
|
m_thread_scratch_size{0, 0},
|
||||||
m_chunk_size(32) {
|
m_chunk_size(32) {
|
||||||
// Make sure league size is permissable
|
// Make sure league size is permissible
|
||||||
if (league_size_ >= int(Impl::cuda_internal_maximum_grid_count()))
|
if (league_size_ >= int(Impl::cuda_internal_maximum_grid_count()))
|
||||||
Impl::throw_runtime_exception(
|
Impl::throw_runtime_exception(
|
||||||
"Requested too large league_size for TeamPolicy on Cuda execution "
|
"Requested too large league_size for TeamPolicy on Cuda execution "
|
||||||
"space.");
|
"space.");
|
||||||
|
|
||||||
// Make sure total block size is permissable
|
// Make sure total block size is permissible
|
||||||
if (m_team_size * m_vector_length > 1024) {
|
if (m_team_size * m_vector_length > 1024) {
|
||||||
Impl::throw_runtime_exception(
|
Impl::throw_runtime_exception(
|
||||||
std::string("Kokkos::TeamPolicy< Cuda > the team size is too large. "
|
std::string("Kokkos::TeamPolicy< Cuda > the team size is too large. "
|
||||||
|
@ -392,7 +392,7 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
|
||||||
m_team_scratch_size{0, 0},
|
m_team_scratch_size{0, 0},
|
||||||
m_thread_scratch_size{0, 0},
|
m_thread_scratch_size{0, 0},
|
||||||
m_chunk_size(32) {
|
m_chunk_size(32) {
|
||||||
// Make sure league size is permissable
|
// Make sure league size is permissible
|
||||||
if (league_size_ >= int(Impl::cuda_internal_maximum_grid_count()))
|
if (league_size_ >= int(Impl::cuda_internal_maximum_grid_count()))
|
||||||
Impl::throw_runtime_exception(
|
Impl::throw_runtime_exception(
|
||||||
"Requested too large league_size for TeamPolicy on Cuda execution "
|
"Requested too large league_size for TeamPolicy on Cuda execution "
|
||||||
|
|
|
@ -304,7 +304,7 @@ struct is_space {
|
||||||
host_mirror_space;
|
host_mirror_space;
|
||||||
};
|
};
|
||||||
|
|
||||||
// For backward compatiblity
|
// For backward compatibility
|
||||||
|
|
||||||
namespace Impl {
|
namespace Impl {
|
||||||
|
|
||||||
|
|
|
@ -1450,7 +1450,7 @@ inline void deep_copy(
|
||||||
typename ViewType::value_type>::value,
|
typename ViewType::value_type>::value,
|
||||||
"deep_copy requires non-const type");
|
"deep_copy requires non-const type");
|
||||||
|
|
||||||
// If contigous we can simply do a 1D flat loop
|
// If contiguous we can simply do a 1D flat loop
|
||||||
if (dst.span_is_contiguous()) {
|
if (dst.span_is_contiguous()) {
|
||||||
typedef Kokkos::View<
|
typedef Kokkos::View<
|
||||||
typename ViewType::value_type*, Kokkos::LayoutRight,
|
typename ViewType::value_type*, Kokkos::LayoutRight,
|
||||||
|
|
|
@ -130,7 +130,7 @@ int* atomic_lock_array_cuda_space_ptr(bool deallocate = false);
|
||||||
/// global memory.
|
/// global memory.
|
||||||
///
|
///
|
||||||
/// Team and Thread private scratch allocations in
|
/// Team and Thread private scratch allocations in
|
||||||
/// global memory are aquired via locks.
|
/// global memory are acquired via locks.
|
||||||
/// This function retrieves the lock array pointer.
|
/// This function retrieves the lock array pointer.
|
||||||
/// If the array is not yet allocated it will do so.
|
/// If the array is not yet allocated it will do so.
|
||||||
int* scratch_lock_array_cuda_space_ptr(bool deallocate = false);
|
int* scratch_lock_array_cuda_space_ptr(bool deallocate = false);
|
||||||
|
|
|
@ -64,10 +64,10 @@ namespace Impl {
|
||||||
/// This function initializes the locks to zero (unset).
|
/// This function initializes the locks to zero (unset).
|
||||||
void init_lock_array_hbw_space();
|
void init_lock_array_hbw_space();
|
||||||
|
|
||||||
/// \brief Aquire a lock for the address
|
/// \brief Acquire a lock for the address
|
||||||
///
|
///
|
||||||
/// This function tries to aquire the lock for the hash value derived
|
/// This function tries to acquire the lock for the hash value derived
|
||||||
/// from the provided ptr. If the lock is successfully aquired the
|
/// from the provided ptr. If the lock is successfully acquired the
|
||||||
/// function returns true. Otherwise it returns false.
|
/// function returns true. Otherwise it returns false.
|
||||||
bool lock_address_hbw_space(void* ptr);
|
bool lock_address_hbw_space(void* ptr);
|
||||||
|
|
||||||
|
@ -75,7 +75,7 @@ bool lock_address_hbw_space(void* ptr);
|
||||||
///
|
///
|
||||||
/// This function releases the lock for the hash value derived
|
/// This function releases the lock for the hash value derived
|
||||||
/// from the provided ptr. This function should only be called
|
/// from the provided ptr. This function should only be called
|
||||||
/// after previously successfully aquiring a lock with
|
/// after previously successfully acquiring a lock with
|
||||||
/// lock_address.
|
/// lock_address.
|
||||||
void unlock_address_hbw_space(void* ptr);
|
void unlock_address_hbw_space(void* ptr);
|
||||||
|
|
||||||
|
|
|
@ -74,10 +74,10 @@ namespace Impl {
|
||||||
/// This function initializes the locks to zero (unset).
|
/// This function initializes the locks to zero (unset).
|
||||||
void init_lock_array_host_space();
|
void init_lock_array_host_space();
|
||||||
|
|
||||||
/// \brief Aquire a lock for the address
|
/// \brief Acquire a lock for the address
|
||||||
///
|
///
|
||||||
/// This function tries to aquire the lock for the hash value derived
|
/// This function tries to acquire the lock for the hash value derived
|
||||||
/// from the provided ptr. If the lock is successfully aquired the
|
/// from the provided ptr. If the lock is successfully acquired the
|
||||||
/// function returns true. Otherwise it returns false.
|
/// function returns true. Otherwise it returns false.
|
||||||
bool lock_address_host_space(void* ptr);
|
bool lock_address_host_space(void* ptr);
|
||||||
|
|
||||||
|
@ -85,7 +85,7 @@ bool lock_address_host_space(void* ptr);
|
||||||
///
|
///
|
||||||
/// This function releases the lock for the hash value derived
|
/// This function releases the lock for the hash value derived
|
||||||
/// from the provided ptr. This function should only be called
|
/// from the provided ptr. This function should only be called
|
||||||
/// after previously successfully aquiring a lock with
|
/// after previously successfully acquiring a lock with
|
||||||
/// lock_address.
|
/// lock_address.
|
||||||
void unlock_address_host_space(void* ptr);
|
void unlock_address_host_space(void* ptr);
|
||||||
|
|
||||||
|
|
|
@ -69,10 +69,10 @@ namespace Impl {
|
||||||
/// This function initializes the locks to zero (unset).
|
/// This function initializes the locks to zero (unset).
|
||||||
// void init_lock_array_host_space();
|
// void init_lock_array_host_space();
|
||||||
|
|
||||||
/// \brief Aquire a lock for the address
|
/// \brief Acquire a lock for the address
|
||||||
///
|
///
|
||||||
/// This function tries to aquire the lock for the hash value derived
|
/// This function tries to acquire the lock for the hash value derived
|
||||||
/// from the provided ptr. If the lock is successfully aquired the
|
/// from the provided ptr. If the lock is successfully acquired the
|
||||||
/// function returns true. Otherwise it returns false.
|
/// function returns true. Otherwise it returns false.
|
||||||
// bool lock_address_host_space(void* ptr);
|
// bool lock_address_host_space(void* ptr);
|
||||||
|
|
||||||
|
@ -80,7 +80,7 @@ namespace Impl {
|
||||||
///
|
///
|
||||||
/// This function releases the lock for the hash value derived
|
/// This function releases the lock for the hash value derived
|
||||||
/// from the provided ptr. This function should only be called
|
/// from the provided ptr. This function should only be called
|
||||||
/// after previously successfully aquiring a lock with
|
/// after previously successfully acquiring a lock with
|
||||||
/// lock_address.
|
/// lock_address.
|
||||||
// void unlock_address_host_space(void* ptr);
|
// void unlock_address_host_space(void* ptr);
|
||||||
|
|
||||||
|
|
|
@ -1169,7 +1169,7 @@ inline void parallel_reduce(const std::string& label, const size_t& policy,
|
||||||
} // namespace Kokkos
|
} // namespace Kokkos
|
||||||
|
|
||||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||||
// backwards compatiblity for Kokkos::Experimental reducers
|
// backwards compatibility for Kokkos::Experimental reducers
|
||||||
namespace Kokkos {
|
namespace Kokkos {
|
||||||
namespace Experimental {
|
namespace Experimental {
|
||||||
using Kokkos::BAnd;
|
using Kokkos::BAnd;
|
||||||
|
|
|
@ -130,7 +130,7 @@ int* atomic_lock_array_rocm_space_ptr(bool deallocate = false);
|
||||||
/// global memory.
|
/// global memory.
|
||||||
///
|
///
|
||||||
/// Team and Thread private scratch allocations in
|
/// Team and Thread private scratch allocations in
|
||||||
/// global memory are aquired via locks.
|
/// global memory are acquired via locks.
|
||||||
/// This function retrieves the lock array pointer.
|
/// This function retrieves the lock array pointer.
|
||||||
/// If the array is not yet allocated it will do so.
|
/// If the array is not yet allocated it will do so.
|
||||||
int* scratch_lock_array_rocm_space_ptr(bool deallocate = false);
|
int* scratch_lock_array_rocm_space_ptr(bool deallocate = false);
|
||||||
|
|
|
@ -930,7 +930,7 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
|
||||||
|
|
||||||
if (is_dynamic) {
|
if (is_dynamic) {
|
||||||
// Must synchronize to make sure each team has set its
|
// Must synchronize to make sure each team has set its
|
||||||
// partition before begining the work stealing loop.
|
// partition before beginning the work stealing loop.
|
||||||
if (data.pool_rendezvous()) data.pool_rendezvous_release();
|
if (data.pool_rendezvous()) data.pool_rendezvous_release();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1077,7 +1077,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
|
||||||
|
|
||||||
if (is_dynamic) {
|
if (is_dynamic) {
|
||||||
// Must synchronize to make sure each team has set its
|
// Must synchronize to make sure each team has set its
|
||||||
// partition before begining the work stealing loop.
|
// partition before beginning the work stealing loop.
|
||||||
if (data.pool_rendezvous()) data.pool_rendezvous_release();
|
if (data.pool_rendezvous()) data.pool_rendezvous_release();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -611,7 +611,7 @@ void ThreadsExec::initialize(unsigned thread_count, unsigned use_numa_count,
|
||||||
|
|
||||||
// Spawn thread executing the 'driver()' function.
|
// Spawn thread executing the 'driver()' function.
|
||||||
// Wait until spawned thread has attempted to initialize.
|
// Wait until spawned thread has attempted to initialize.
|
||||||
// If spawning and initialization is successfull then
|
// If spawning and initialization is successful then
|
||||||
// an entry in 's_threads_exec' will be assigned.
|
// an entry in 's_threads_exec' will be assigned.
|
||||||
if (ThreadsExec::spawn()) {
|
if (ThreadsExec::spawn()) {
|
||||||
wait_yield(s_threads_process.m_pool_state, ThreadsExec::Inactive);
|
wait_yield(s_threads_process.m_pool_state, ThreadsExec::Inactive);
|
||||||
|
@ -639,7 +639,7 @@ void ThreadsExec::initialize(unsigned thread_count, unsigned use_numa_count,
|
||||||
|
|
||||||
if (!thread_spawn_failed) {
|
if (!thread_spawn_failed) {
|
||||||
// Bind process to the core on which it was located before spawning
|
// Bind process to the core on which it was located before spawning
|
||||||
// occured
|
// occurred
|
||||||
if (hwloc_can_bind) {
|
if (hwloc_can_bind) {
|
||||||
Kokkos::hwloc::bind_this_thread(proc_coord);
|
Kokkos::hwloc::bind_this_thread(proc_coord);
|
||||||
}
|
}
|
||||||
|
|
|
@ -52,7 +52,7 @@
|
||||||
#include <sched.h>
|
#include <sched.h>
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
|
|
||||||
/* Standard C++ libaries */
|
/* Standard C++ libraries */
|
||||||
|
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
@ -153,7 +153,7 @@ void ThreadsExec::wait_yield(volatile int& flag, const int value) {
|
||||||
#include <windows.h>
|
#include <windows.h>
|
||||||
#include <process.h>
|
#include <process.h>
|
||||||
|
|
||||||
/* Standard C++ libaries */
|
/* Standard C++ libraries */
|
||||||
|
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
|
@ -70,7 +70,7 @@ namespace Impl {
|
||||||
// called split_release
|
// called split_release
|
||||||
//
|
//
|
||||||
// The purpose of the split functions is to allow the last thread to arrive
|
// The purpose of the split functions is to allow the last thread to arrive
|
||||||
// an opprotunity to perform some actions before releasing the waiting threads
|
// an opportunity to perform some actions before releasing the waiting threads
|
||||||
//
|
//
|
||||||
// If all threads have arrived (and split_release has been called if using
|
// If all threads have arrived (and split_release has been called if using
|
||||||
// split_arrive) before a wait type call, the wait may return quickly
|
// split_arrive) before a wait type call, the wait may return quickly
|
||||||
|
|
|
@ -240,7 +240,7 @@ int HostThreadTeamData::get_work_stealing() noexcept {
|
||||||
HostThreadTeamData *const *const pool =
|
HostThreadTeamData *const *const pool =
|
||||||
(HostThreadTeamData **)(m_pool_scratch + m_pool_members);
|
(HostThreadTeamData **)(m_pool_scratch + m_pool_members);
|
||||||
|
|
||||||
// Attempt from begining failed, try to steal from end of neighbor
|
// Attempt from beginning failed, try to steal from end of neighbor
|
||||||
|
|
||||||
pair_int_t volatile *steal_range = &(pool[m_steal_rank]->m_work_range);
|
pair_int_t volatile *steal_range = &(pool[m_steal_rank]->m_work_range);
|
||||||
|
|
||||||
|
|
|
@ -267,7 +267,7 @@ union SharedAllocationTracker {
|
||||||
public:
|
public:
|
||||||
// Use macros instead of inline functions to reduce
|
// Use macros instead of inline functions to reduce
|
||||||
// pressure on compiler optimization by reducing
|
// pressure on compiler optimization by reducing
|
||||||
// number of symbols and inline functons.
|
// number of symbols and inline functions.
|
||||||
|
|
||||||
#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
|
#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
|
||||||
|
|
||||||
|
|
|
@ -499,7 +499,7 @@ class RunnableTaskBase
|
||||||
void acquire_predecessor_from(runnable_task_type& other) {
|
void acquire_predecessor_from(runnable_task_type& other) {
|
||||||
KOKKOS_EXPECTS(m_predecessor == nullptr ||
|
KOKKOS_EXPECTS(m_predecessor == nullptr ||
|
||||||
other.m_predecessor == m_predecessor);
|
other.m_predecessor == m_predecessor);
|
||||||
// since we're transfering, no need to modify the reference count
|
// since we're transferring, no need to modify the reference count
|
||||||
m_predecessor = other.m_predecessor;
|
m_predecessor = other.m_predecessor;
|
||||||
other.m_predecessor = nullptr;
|
other.m_predecessor = nullptr;
|
||||||
}
|
}
|
||||||
|
@ -508,7 +508,7 @@ class RunnableTaskBase
|
||||||
void acquire_predecessor_from(runnable_task_type& other) volatile {
|
void acquire_predecessor_from(runnable_task_type& other) volatile {
|
||||||
KOKKOS_EXPECTS(m_predecessor == nullptr ||
|
KOKKOS_EXPECTS(m_predecessor == nullptr ||
|
||||||
other.m_predecessor == m_predecessor);
|
other.m_predecessor == m_predecessor);
|
||||||
// since we're transfering, no need to modify the reference count
|
// since we're transferring, no need to modify the reference count
|
||||||
m_predecessor = other.m_predecessor;
|
m_predecessor = other.m_predecessor;
|
||||||
other.m_predecessor = nullptr;
|
other.m_predecessor = nullptr;
|
||||||
}
|
}
|
||||||
|
|
|
@ -597,7 +597,7 @@ KOKKOS_FUNCTION void TaskQueue<ExecSpace, MemorySpace>::complete(
|
||||||
// If 'task' is an aggregate then any of the runnable tasks that
|
// If 'task' is an aggregate then any of the runnable tasks that
|
||||||
// it depends upon may be attempting to complete this 'task'.
|
// it depends upon may be attempting to complete this 'task'.
|
||||||
// Must only transition a task once to complete status.
|
// Must only transition a task once to complete status.
|
||||||
// This is controled by atomically locking the wait queue.
|
// This is controlled by atomically locking the wait queue.
|
||||||
|
|
||||||
// Stop other tasks from adding themselves to this task's wait queue
|
// Stop other tasks from adding themselves to this task's wait queue
|
||||||
// by locking the head of this task's wait queue.
|
// by locking the head of this task's wait queue.
|
||||||
|
|
|
@ -129,7 +129,7 @@ struct are_integral<T, Args...> {
|
||||||
enum {
|
enum {
|
||||||
value =
|
value =
|
||||||
// Accept std::is_integral OR std::is_enum as an integral value
|
// Accept std::is_integral OR std::is_enum as an integral value
|
||||||
// since a simple enum value is automically convertable to an
|
// since a simple enum value is automatically convertible to an
|
||||||
// integral value.
|
// integral value.
|
||||||
(std::is_integral<T>::value || std::is_enum<T>::value) &&
|
(std::is_integral<T>::value || std::is_enum<T>::value) &&
|
||||||
are_integral<Args...>::value
|
are_integral<Args...>::value
|
||||||
|
|
|
@ -1218,7 +1218,7 @@ struct TestTeamBroadcast {
|
||||||
// expected_result=%d,
|
// expected_result=%d,
|
||||||
// total=%d\n",expected_result, total);
|
// total=%d\n",expected_result, total);
|
||||||
|
|
||||||
// team_broadcast with funtion object
|
// team_broadcast with function object
|
||||||
total = 0;
|
total = 0;
|
||||||
|
|
||||||
Kokkos::parallel_reduce(policy_type_f(league_size, team_size), functor,
|
Kokkos::parallel_reduce(policy_type_f(league_size, team_size), functor,
|
||||||
|
@ -1230,7 +1230,7 @@ struct TestTeamBroadcast {
|
||||||
expected_result += val;
|
expected_result += val;
|
||||||
}
|
}
|
||||||
ASSERT_EQ(size_t(expected_result),
|
ASSERT_EQ(size_t(expected_result),
|
||||||
size_t(total)); // printf("team_broadcast with funtion object --
|
size_t(total)); // printf("team_broadcast with function object --
|
||||||
// expected_result=%d,
|
// expected_result=%d,
|
||||||
// total=%d\n",expected_result, total);
|
// total=%d\n",expected_result, total);
|
||||||
}
|
}
|
||||||
|
|
|
@ -605,7 +605,7 @@ struct functor_vec_single {
|
||||||
|
|
||||||
KOKKOS_INLINE_FUNCTION
|
KOKKOS_INLINE_FUNCTION
|
||||||
void operator()(typename policy_type::member_type team) const {
|
void operator()(typename policy_type::member_type team) const {
|
||||||
// Warning: this test case intentionally violates permissable semantics.
|
// Warning: this test case intentionally violates permissible semantics.
|
||||||
// It is not valid to get references to members of the enclosing region
|
// It is not valid to get references to members of the enclosing region
|
||||||
// inside a parallel_for and write to it.
|
// inside a parallel_for and write to it.
|
||||||
Scalar value = 0;
|
Scalar value = 0;
|
||||||
|
|
|
@ -60,7 +60,7 @@ struct TestViewCtorProp_EmbeddedDim {
|
||||||
using ViewIntType = typename Kokkos::View<int**, ExecSpace>;
|
using ViewIntType = typename Kokkos::View<int**, ExecSpace>;
|
||||||
using ViewDoubleType = typename Kokkos::View<double*, ExecSpace>;
|
using ViewDoubleType = typename Kokkos::View<double*, ExecSpace>;
|
||||||
|
|
||||||
// Cuda 7.0 has issues with using a lamda in parallel_for to initialize the
|
// Cuda 7.0 has issues with using a lambda in parallel_for to initialize the
|
||||||
// view - replace with this functor
|
// view - replace with this functor
|
||||||
template <class ViewType>
|
template <class ViewType>
|
||||||
struct Functor {
|
struct Functor {
|
||||||
|
|
|
@ -56,7 +56,7 @@ namespace {
|
||||||
the N-th fibonacci number as follows:
|
the N-th fibonacci number as follows:
|
||||||
- Each "task" or "work item" computes the i-th fibonacci number
|
- Each "task" or "work item" computes the i-th fibonacci number
|
||||||
- If a task has (i < 2), it will record the known answer ahead of time.
|
- If a task has (i < 2), it will record the known answer ahead of time.
|
||||||
- If a taks has (i >= 2), it will "spawn" two more tasks to compute
|
- If a task has (i >= 2), it will "spawn" two more tasks to compute
|
||||||
the (i - 1) and (i - 2) fibonacci numbers.
|
the (i - 1) and (i - 2) fibonacci numbers.
|
||||||
We do NOT do any de-duplication of these tasks.
|
We do NOT do any de-duplication of these tasks.
|
||||||
De-duplication would result in only (N - 2) tasks which must be run in
|
De-duplication would result in only (N - 2) tasks which must be run in
|
||||||
|
|
|
@ -74,7 +74,7 @@ find cmake/kokkos -name KokkosTargets.cmake -exec grep -h INTERFACE_COMPILE_OPTI
|
||||||
|
|
||||||
#-I flags and -std= flags are not part of CMake's compile options
|
#-I flags and -std= flags are not part of CMake's compile options
|
||||||
#that's fine, let's ignore these below
|
#that's fine, let's ignore these below
|
||||||
#redunant lines - tail the last one
|
#redundant lines - tail the last one
|
||||||
#awk print each on new line
|
#awk print each on new line
|
||||||
#grep out blank lines
|
#grep out blank lines
|
||||||
#grep out include flags
|
#grep out include flags
|
||||||
|
|
|
@ -43,7 +43,7 @@ include $(KOKKOS_PATH)/Makefile.kokkos
|
||||||
|
|
||||||
build: $(EXE)
|
build: $(EXE)
|
||||||
|
|
||||||
#for unit testing only, for best preformance with OpenMP 4.0 or better
|
#for unit testing only, for best performance with OpenMP 4.0 or better
|
||||||
test: $(EXE)
|
test: $(EXE)
|
||||||
./$(EXE)
|
./$(EXE)
|
||||||
|
|
||||||
|
|
|
@ -64,10 +64,10 @@ typedef Kokkos::HostSpace::execution_space DefaultHostType;
|
||||||
// threads can grab their own. On CPU architectures the pool size is equal to
|
// threads can grab their own. On CPU architectures the pool size is equal to
|
||||||
// the thread number, on CUDA about 128k states are generated (enough to give
|
// the thread number, on CUDA about 128k states are generated (enough to give
|
||||||
// every potentially simultaneously running thread its own state). With a kernel
|
// every potentially simultaneously running thread its own state). With a kernel
|
||||||
// a thread is required to aquire a state from the pool and later return it. On
|
// a thread is required to acquire a state from the pool and later return it. On
|
||||||
// CPUs the Random number generator is deterministic if using the same number of
|
// CPUs the Random number generator is deterministic if using the same number of
|
||||||
// threads. On GPUs (i.e. using the CUDA backend) it is not deterministic because
|
// threads. On GPUs (i.e. using the CUDA backend) it is not deterministic because
|
||||||
// threads aquire states via atomics.
|
// threads acquire states via atomics.
|
||||||
|
|
||||||
// A Functor for generating uint64_t random numbers templated on the
|
// A Functor for generating uint64_t random numbers templated on the
|
||||||
// GeneratorPool type
|
// GeneratorPool type
|
||||||
|
@ -97,7 +97,7 @@ struct generate_random {
|
||||||
for (int k = 0; k < samples; k++)
|
for (int k = 0; k < samples; k++)
|
||||||
vals(i * samples + k) = rand_gen.urand64();
|
vals(i * samples + k) = rand_gen.urand64();
|
||||||
|
|
||||||
// Give the state back, which will allow another thread to aquire it
|
// Give the state back, which will allow another thread to acquire it
|
||||||
rand_pool.free_state(rand_gen);
|
rand_pool.free_state(rand_gen);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
Loading…
Reference in New Issue