Update Kokkos library

This commit is contained in:
Stan Moore 2019-02-01 12:45:54 -07:00
parent d6eaf73db1
commit 64834e4a3d
43 changed files with 797 additions and 249 deletions

View File

@ -6,16 +6,16 @@ ifndef KOKKOS_PATH
endif
CXXFLAGS=$(CCFLAGS)
# Options: Cuda,ROCm,OpenMP,Pthread,Qthreads,Serial
# Options: Cuda,ROCm,OpenMP,Pthreads,Qthreads,Serial
KOKKOS_DEVICES ?= "OpenMP"
#KOKKOS_DEVICES ?= "Pthread"
#KOKKOS_DEVICES ?= "Pthreads"
# Options:
# Intel: KNC,KNL,SNB,HSW,BDW,SKX
# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72
# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75
# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2
# IBM: BGQ,Power7,Power8,Power9
# AMD-GPUS: Kaveri,Carrizo,Fiji,Vega
# AMD-CPUS: AMDAVX,Ryzen,Epyc
# AMD-CPUS: AMDAVX,Ryzen,EPYC
KOKKOS_ARCH ?= ""
# Options: yes,no
KOKKOS_DEBUG ?= "no"
@ -224,7 +224,7 @@ ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
KOKKOS_INTERNAL_CXX11_FLAG := -std=c++11
#KOKKOS_INTERNAL_CXX14_FLAG := -std=c++14
KOKKOS_INTERNAL_CXX14_FLAG := -std=c++14
KOKKOS_INTERNAL_CXX1Y_FLAG := -std=c++1y
#KOKKOS_INTERNAL_CXX17_FLAG := -std=c++17
#KOKKOS_INTERNAL_CXX1Z_FLAG := -std=c++1Z
@ -276,6 +276,7 @@ KOKKOS_INTERNAL_USE_ARCH_PASCAL61 := $(call kokkos_has_string,$(KOKKOS_ARCH),Pas
KOKKOS_INTERNAL_USE_ARCH_PASCAL60 := $(call kokkos_has_string,$(KOKKOS_ARCH),Pascal60)
KOKKOS_INTERNAL_USE_ARCH_VOLTA70 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volta70)
KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volta72)
KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75)
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
@ -284,6 +285,7 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE
+ $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60) \
+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA70) \
+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
+ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53))
@ -300,6 +302,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
+ $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60) \
+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA70) \
+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
+ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53))
@ -331,7 +334,7 @@ KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_
# AMD based.
KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(call kokkos_has_string,$(KOKKOS_ARCH),AMDAVX)
KOKKOS_INTERNAL_USE_ARCH_RYZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Ryzen)
KOKKOS_INTERNAL_USE_ARCH_EPYC := $(call kokkos_has_string,$(KOKKOS_ARCH),Epyc)
KOKKOS_INTERNAL_USE_ARCH_EPYC := $(call kokkos_has_string,$(KOKKOS_ARCH),EPYC)
KOKKOS_INTERNAL_USE_ARCH_KAVERI := $(call kokkos_has_string,$(KOKKOS_ARCH),Kaveri)
KOKKOS_INTERNAL_USE_ARCH_CARRIZO := $(call kokkos_has_string,$(KOKKOS_ARCH),Carrizo)
KOKKOS_INTERNAL_USE_ARCH_FIJI := $(call kokkos_has_string,$(KOKKOS_ARCH),Fiji)
@ -341,12 +344,12 @@ KOKKOS_INTERNAL_USE_ARCH_GFX901 := $(call kokkos_has_string,$(KOKKOS_ARCH),gfx90
# Any AVX?
KOKKOS_INTERNAL_USE_ARCH_SSE42 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM))
KOKKOS_INTERNAL_USE_ARCH_AVX := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX))
KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW))
KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_EPYC))
KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNL))
KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SKX))
# Decide what ISA level we are able to support.
KOKKOS_INTERNAL_USE_ISA_X86_64 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX))
KOKKOS_INTERNAL_USE_ISA_X86_64 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX) + $(KOKKOS_INTERNAL_USE_ARCH_EPYC))
KOKKOS_INTERNAL_USE_ISA_KNC := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNC))
KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER8) + $(KOKKOS_INTERNAL_USE_ARCH_POWER9))
KOKKOS_INTERNAL_USE_ISA_POWERPCBE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER7))
@ -658,6 +661,19 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV81), 1)
endif
endif
# AMD EPYC selected: record the architecture macros through
# kokkos_append_header, then pick code-generation flags by host compiler.
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_EPYC), 1)
  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_AMD_EPYC")
  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_AMD_AVX2")

  ifneq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
    # Not the Intel compiler: compile and link with znver1 arch/tune flags.
    KOKKOS_CXXFLAGS += -march=znver1 -mtune=znver1
    KOKKOS_LDFLAGS += -march=znver1 -mtune=znver1
  else
    # Intel compiler: compile and link with -mavx2 instead.
    KOKKOS_CXXFLAGS += -mavx2
    KOKKOS_LDFLAGS += -mavx2
  endif
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1)
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ARMV80")
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ARMV8_THUNDERX")
@ -950,6 +966,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_VOLTA72")
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_72
endif
# Turing75 selected: append both the generation-wide (KOKKOS_ARCH_TURING) and
# the specific (KOKKOS_ARCH_TURING75) macros via kokkos_append_header, then
# complete the CUDA architecture flag by suffixing it with "=sm_75".
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_TURING75), 1)
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_TURING")
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_TURING75")
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_75
endif
ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)

View File

@ -73,6 +73,8 @@ For specifics see the LICENSE file contained in the repository or distribution.
* NVCC 7.5 for CUDA (with gcc 4.8.4)
* NVCC 8.0.44 for CUDA (with gcc 5.3.0)
* NVCC 9.1 for CUDA (with gcc 6.1.0)
* NVCC 9.2 for CUDA (with gcc 7.2.0)
* NVCC 10.0 for CUDA (with gcc 7.4.0)
### Primary tested compilers on Power 8 are:
* GCC 6.4.0 (OpenMP,Serial)
@ -109,7 +111,7 @@ GCC: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits
-Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized
Intel: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized
Clang: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized
NVCC: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized
NVCC: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized
Other compilers are tested occasionally, in particular when pushing from develop to
master branch, without -Werror and only for a select set of backends.

View File

@ -308,6 +308,16 @@ do
shift
done
# Check if nvcc exists.
# Skipped when host_only is 1. Otherwise look nvcc up in PATH and abort with a
# non-zero status when it cannot be found.
if [ $host_only -ne 1 ]; then
  if ! var=$(which nvcc ); then
    echo "Could not find nvcc in PATH"
    # Exit with an explicit failure code: "$?" at this point would be the
    # status of the echo above (0), which would report success to the caller.
    exit 1
  fi
fi
# Only print host compiler version
if [ $get_host_version -eq 1 ]; then
$host_compiler --version

View File

@ -104,6 +104,7 @@ list(APPEND KOKKOS_ARCH_LIST
Pascal61 # (GPU) NVIDIA Pascal generation CC 6.1
Volta70 # (GPU) NVIDIA Volta generation CC 7.0
Volta72 # (GPU) NVIDIA Volta generation CC 7.2
Turing75 # (GPU) NVIDIA Turing generation CC 7.5
)
# List of possible device architectures.

View File

@ -832,16 +832,14 @@ void
deep_copy (DualView<DT,DL,DD,DM> dst, // trust me, this must not be a reference
const DualView<ST,SL,SD,SM>& src )
{
if(src.modified_flags.data()==NULL || dst.modified_flags.data()==NULL) {
return deep_copy(dst.d_view, src.d_view);
}
if (src.modified_flags(1) >= src.modified_flags(0)) {
deep_copy (dst.d_view, src.d_view);
dst.template modify<typename DualView<DT,DL,DD,DM>::device_type> ();
} else {
if ( src.need_sync_device() ) {
deep_copy (dst.h_view, src.h_view);
dst.template modify<typename DualView<DT,DL,DD,DM>::host_mirror_space> ();
dst.modify_host();
}
else {
deep_copy (dst.d_view, src.d_view);
dst.modify_device();
}
}
template< class ExecutionSpace ,
@ -852,15 +850,12 @@ deep_copy (const ExecutionSpace& exec ,
DualView<DT,DL,DD,DM> dst, // trust me, this must not be a reference
const DualView<ST,SL,SD,SM>& src )
{
if(src.modified_flags.data()==NULL || dst.modified_flags.data()==NULL) {
return deep_copy(exec, dst.d_view, src.d_view);
}
if (src.modified_flags(1) >= src.modified_flags(0)) {
deep_copy (exec, dst.d_view, src.d_view);
dst.template modify<typename DualView<DT,DL,DD,DM>::device_type> ();
} else {
if ( src.need_sync_device() ) {
deep_copy (exec, dst.h_view, src.h_view);
dst.template modify<typename DualView<DT,DL,DD,DM>::host_mirror_space> ();
dst.modify_host();
} else {
deep_copy (exec, dst.d_view, src.d_view);
dst.modify_device();
}
}

View File

@ -368,8 +368,8 @@ public:
enum { is_assignable = is_assignable_value_type &&
is_assignable_layout };
typedef ViewMapping< DstTraits , void > DstType ;
typedef ViewMapping< SrcTraits , void > SrcType ;
typedef ViewMapping< DstTraits , typename DstTraits::specialize > DstType ;
typedef ViewMapping< SrcTraits , typename SrcTraits::specialize > SrcType ;
template < typename DT , typename ... DP , typename ST , typename ... SP >
KOKKOS_INLINE_FUNCTION
@ -432,7 +432,7 @@ public:
private:
typedef Kokkos::Impl::ViewMapping< traits , void > map_type ;
typedef Kokkos::Impl::ViewMapping< traits , typename traits::specialize > map_type ;
typedef Kokkos::Impl::SharedAllocationTracker track_type ;
track_type m_track ;
@ -567,11 +567,11 @@ public:
// Allow specializations to query their specialized map
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
KOKKOS_INLINE_FUNCTION
const Kokkos::Impl::ViewMapping< traits , void > &
const Kokkos::Impl::ViewMapping< traits , typename traits::specialize > &
implementation_map() const { return m_map ; }
#endif
KOKKOS_INLINE_FUNCTION
const Kokkos::Impl::ViewMapping< traits , void > &
const Kokkos::Impl::ViewMapping< traits , typename traits::specialize > &
impl_map() const { return m_map ; }
//----------------------------------------
@ -952,7 +952,7 @@ public:
, m_rank(rhs.m_rank)
{
typedef typename DynRankView<RT,RP...> ::traits SrcTraits ;
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , typename traits::specialize > Mapping ;
static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
Mapping::assign( m_map , rhs.m_map , rhs.m_track );
}
@ -962,7 +962,7 @@ public:
DynRankView & operator = (const DynRankView<RT,RP...> & rhs )
{
typedef typename DynRankView<RT,RP...> ::traits SrcTraits ;
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , typename traits::specialize > Mapping ;
static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
Mapping::assign( m_map , rhs.m_map , rhs.m_track );
m_track.assign( rhs.m_track , traits::is_managed );
@ -980,7 +980,7 @@ public:
{
typedef typename View<RT,RP...>::traits SrcTraits ;
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , Kokkos::Impl::ViewToDynRankViewTag > Mapping ;
static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
static_assert( Mapping::is_assignable , "Incompatible View to DynRankView copy construction" );
Mapping::assign( *this , rhs );
}
@ -1432,7 +1432,7 @@ public:
, Args ... args )
{
typedef ViewMapping< traits_type, void > DstType ;
typedef ViewMapping< traits_type, typename traits_type::specialize > DstType ;
typedef typename std::conditional< (rank==0) , ViewDimension<>
, typename std::conditional< (rank==1) , ViewDimension<0>

View File

@ -101,13 +101,98 @@ namespace Impl {
result = run_me< Kokkos::DualView<Scalar**,Kokkos::LayoutLeft,Device> >(size,3);
}
};
};
// Reduction functor over the first index of a two-index view: for a given i,
// every entry fv(i,j) with j in [0, fv.extent(1)) is added to the running total.
template < typename Scalar, class ViewType >
struct SumViewEntriesFunctor {
  typedef Scalar value_type;

  // View whose entries are summed (stored by value).
  ViewType fv;

  SumViewEntriesFunctor ( const ViewType & fv_ )
    : fv(fv_)
  {}

  // Accumulates row i of fv into total.
  KOKKOS_INLINE_FUNCTION
  void operator() ( const int i , value_type & total ) const {
    size_t col = 0;
    while ( col < fv.extent(1) ) {
      total += fv(i,col);
      ++col;
    }
  }
};
template <typename Scalar, class Device>
struct test_dual_view_deep_copy
{
typedef Scalar scalar_type;
typedef Device execution_space;
template <typename ViewType>
void run_me() {
const unsigned int n = 10;
const unsigned int m = 5;
const unsigned int sum_total = n * m;
ViewType a("A",n,m);
ViewType b("B",n,m);
Kokkos::deep_copy( a.d_view , 1 );
a.template modify<typename ViewType::execution_space>();
a.template sync<typename ViewType::host_mirror_space>();
// Check device view is initialized as expected
scalar_type a_d_sum = 0;
// Execute on the execution_space associated with t_dev's memory space
typedef typename ViewType::t_dev::memory_space::execution_space t_dev_exec_space;
Kokkos::parallel_reduce( Kokkos::RangePolicy<t_dev_exec_space>(0,n), SumViewEntriesFunctor<scalar_type, typename ViewType::t_dev>(a.d_view), a_d_sum );
ASSERT_EQ(a_d_sum, sum_total);
// Check host view is synced as expected
scalar_type a_h_sum = 0;
for ( size_t i = 0; i < a.h_view.extent(0); ++i )
for ( size_t j = 0; j < a.h_view.extent(1); ++j ) {
a_h_sum += a.h_view(i,j);
}
ASSERT_EQ(a_h_sum, sum_total);
// Test deep_copy
Kokkos::deep_copy( b, a );
b.template sync<typename ViewType::host_mirror_space>();
// Perform same checks on b as done on a
// Check device view is initialized as expected
scalar_type b_d_sum = 0;
// Execute on the execution_space associated with t_dev's memory space
Kokkos::parallel_reduce( Kokkos::RangePolicy<t_dev_exec_space>(0,n), SumViewEntriesFunctor<scalar_type, typename ViewType::t_dev>(b.d_view), b_d_sum );
ASSERT_EQ(b_d_sum, sum_total);
// Check host view is synced as expected
scalar_type b_h_sum = 0;
for ( size_t i = 0; i < b.h_view.extent(0); ++i )
for ( size_t j = 0; j < b.h_view.extent(1); ++j ) {
b_h_sum += b.h_view(i,j);
}
ASSERT_EQ(b_h_sum, sum_total);
} // end run_me
test_dual_view_deep_copy()
{
run_me< Kokkos::DualView<Scalar**,Kokkos::LayoutLeft,Device> >();
}
};
} // namespace Impl
template <typename Scalar, typename Device>
void test_dualview_combinations(unsigned int size)
{
@ -116,10 +201,21 @@ void test_dualview_combinations(unsigned int size)
}
// Runs the DualView deep_copy test for the given Scalar and Device by
// constructing a temporary Impl::test_dual_view_deep_copy, whose constructor
// performs the checks.
template <typename Scalar, typename Device>
void test_dualview_deep_copy()
{
Impl::test_dual_view_deep_copy<Scalar,Device> ();
}
// DualView combination test with int scalars on TEST_EXECSPACE, size argument 10.
TEST_F( TEST_CATEGORY, dualview_combination) {
test_dualview_combinations<int,TEST_EXECSPACE>(10);
}
// DualView deep_copy test on TEST_EXECSPACE, run once with int and once with
// double scalars.
TEST_F( TEST_CATEGORY, dualview_deep_copy) {
test_dualview_deep_copy<int,TEST_EXECSPACE>();
test_dualview_deep_copy<double,TEST_EXECSPACE>();
}
} // namespace Test

View File

@ -829,7 +829,8 @@ void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink) {
}
if(bytes > current_size) {
current_size = bytes;
ptr = Kokkos::kokkos_realloc<Kokkos::CudaSpace>(ptr,current_size);
Kokkos::kokkos_free<Kokkos::CudaSpace>(ptr);
ptr = Kokkos::kokkos_malloc<Kokkos::CudaSpace>("CudaSpace::ScratchMemory",current_size);
}
if((bytes < current_size) && (force_shrink)) {
current_size = bytes;

View File

@ -561,7 +561,11 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
}
#endif
#ifdef KOKKOS_ENABLE_PRE_CUDA_10_DEPRECATION_API
cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
#else
cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
#endif
// Init the array used for arbitrarily sized atomics
Impl::initialize_host_cuda_lock_arrays();

View File

@ -525,6 +525,7 @@ public:
inline
void execute() const
{
if(m_rp.m_num_tiles==0) return;
const array_index_type maxblocks = static_cast<array_index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
if ( RP::rank == 2 )
{
@ -685,7 +686,7 @@ public:
typename Policy::member_type( kokkos_impl_cuda_shared_memory<void>()
, m_shmem_begin
, m_shmem_size
, (void*) ( ((char*)m_scratch_ptr[1]) + threadid/(blockDim.x*blockDim.y) * m_scratch_size[1])
, (void*) ( ((char*)m_scratch_ptr[1]) + ptrdiff_t(threadid/(blockDim.x*blockDim.y)) * m_scratch_size[1])
, m_scratch_size[1]
, league_rank
, m_league_size ) );
@ -1336,7 +1337,7 @@ public:
( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
, m_shmem_begin
, m_shmem_size
, (void*) ( ((char*)m_scratch_ptr[1]) + threadid/(blockDim.x*blockDim.y) * m_scratch_size[1])
, (void*) ( ((char*)m_scratch_ptr[1]) + ptrdiff_t(threadid/(blockDim.x*blockDim.y)) * m_scratch_size[1])
, m_scratch_size[1]
, league_rank
, m_league_size )
@ -1378,7 +1379,7 @@ public:
( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
, m_shmem_begin
, m_shmem_size
, (void*) ( ((char*)m_scratch_ptr[1]) + threadid/(blockDim.x*blockDim.y) * m_scratch_size[1])
, (void*) ( ((char*)m_scratch_ptr[1]) + ptrdiff_t(threadid/(blockDim.x*blockDim.y)) * m_scratch_size[1])
, m_scratch_size[1]
, league_rank
, m_league_size )
@ -2064,7 +2065,7 @@ private:
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(MASK);
#else
KOKKOS_IMPL_CUDA_SYNCWARP_MASK;
KOKKOS_IMPL_CUDA_SYNCWARP;
#endif
if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } // Protect against large scan values.
@ -2291,7 +2292,7 @@ private:
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(MASK);
#else
KOKKOS_IMPL_CUDA_SYNCWARP_MASK;
KOKKOS_IMPL_CUDA_SYNCWARP;
#endif
if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } // Protect against large scan values.

View File

@ -321,7 +321,7 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK;
int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
int active = KOKKOS_IMPL_CUDA_BALLOT(1);
#endif
if (int(blockDim.x*blockDim.y) > 2) {
value_type tmp = Kokkos::shfl_down(value, 2,32);
@ -331,7 +331,7 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
active += KOKKOS_IMPL_CUDA_BALLOT(1);
#endif
if (int(blockDim.x*blockDim.y) > 4) {
value_type tmp = Kokkos::shfl_down(value, 4,32);
@ -341,7 +341,7 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
active += KOKKOS_IMPL_CUDA_BALLOT(1);
#endif
if (int(blockDim.x*blockDim.y) > 8) {
value_type tmp = Kokkos::shfl_down(value, 8,32);
@ -351,7 +351,7 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
active += KOKKOS_IMPL_CUDA_BALLOT(1);
#endif
if (int(blockDim.x*blockDim.y) > 16) {
value_type tmp = Kokkos::shfl_down(value, 16,32);
@ -361,7 +361,7 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
active += KOKKOS_IMPL_CUDA_BALLOT(1);
#endif
}
}
@ -506,7 +506,7 @@ cuda_inter_block_reduction( const ReducerType& reducer,
unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK;
int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
int active = KOKKOS_IMPL_CUDA_BALLOT(1);
#endif
if (int(blockDim.x*blockDim.y) > 2) {
value_type tmp = Kokkos::shfl_down(value, 2,32);
@ -516,7 +516,7 @@ cuda_inter_block_reduction( const ReducerType& reducer,
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
active += KOKKOS_IMPL_CUDA_BALLOT(1);
#endif
if (int(blockDim.x*blockDim.y) > 4) {
value_type tmp = Kokkos::shfl_down(value, 4,32);
@ -526,7 +526,7 @@ cuda_inter_block_reduction( const ReducerType& reducer,
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
active += KOKKOS_IMPL_CUDA_BALLOT(1);
#endif
if (int(blockDim.x*blockDim.y) > 8) {
value_type tmp = Kokkos::shfl_down(value, 8,32);
@ -536,7 +536,7 @@ cuda_inter_block_reduction( const ReducerType& reducer,
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
active += KOKKOS_IMPL_CUDA_BALLOT(1);
#endif
if (int(blockDim.x*blockDim.y) > 16) {
value_type tmp = Kokkos::shfl_down(value, 16,32);
@ -546,7 +546,7 @@ cuda_inter_block_reduction( const ReducerType& reducer,
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
active += KOKKOS_IMPL_CUDA_BALLOT(1);
#endif
}
}
@ -578,7 +578,7 @@ struct CudaReductionsFunctor<FunctorType, ArgTag, false, true> {
const int width, // How much of the warp participates
Scalar& result)
{
unsigned mask = width==32?0xffffffff:((1<<width)-1)<<((threadIdx.y*blockDim.x+threadIdx.x)%(32/width))*width;
unsigned mask = width==32?0xffffffff:((1<<width)-1)<<((threadIdx.y*blockDim.x+threadIdx.x)/width)*width;
for(int delta=skip_vector?blockDim.x:1; delta<width; delta*=2) {
Scalar tmp;
cuda_shfl_down(tmp,value,delta,width,mask);
@ -683,7 +683,7 @@ struct CudaReductionsFunctor<FunctorType, ArgTag, false, false> {
const int width) // How much of the warp participates
{
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
unsigned mask = width==32?0xffffffff:((1<<width)-1)<<((threadIdx.y*blockDim.x+threadIdx.x)%(32/width))*width;
unsigned mask = width==32?0xffffffff:((1<<width)-1)<<((threadIdx.y*blockDim.x+threadIdx.x)/width)*width;
#endif
const int lane_id = (threadIdx.y*blockDim.x+threadIdx.x)%32;
for(int delta=skip_vector?blockDim.x:1; delta<width; delta*=2) {
@ -693,7 +693,7 @@ struct CudaReductionsFunctor<FunctorType, ArgTag, false, false> {
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(mask);
#else
KOKKOS_IMPL_CUDA_SYNCWARP_MASK;
KOKKOS_IMPL_CUDA_SYNCWARP;
#endif
}
*value=*(value-lane_id);
@ -779,7 +779,7 @@ struct CudaReductionsFunctor<FunctorType, ArgTag, false, false> {
/*
* Algorithmic constraints:
* (a) blockDim.y is a power of two
* (b) blockDim.y <= 512
* (b) blockDim.y <= 1024
* (c) blockDim.x == blockDim.z == 1
*/
@ -828,14 +828,26 @@ void cuda_intra_block_reduce_scan( const FunctorType & functor ,
{ // Inter-warp reduce-scan by a single warp to avoid extra synchronizations
const unsigned rtid_inter = ( threadIdx.y ^ BlockSizeMask ) << CudaTraits::WarpIndexShift ;
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
unsigned inner_mask = KOKKOS_IMPL_CUDA_BALLOT_MASK(0xffffffff,(rtid_inter<blockDim.y));
#endif
if ( rtid_inter < blockDim.y ) {
const pointer_type tdata_inter = base_data + value_count * ( rtid_inter ^ BlockSizeMask );
if ( (1<<5) < BlockSizeMask ) { BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,5) }
if ( (1<<6) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,6) }
if ( (1<<7) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,7) }
if ( (1<<8) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,8) }
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
if ( (1<<5) < BlockSizeMask ) { KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,5) }
if ( (1<<6) < BlockSizeMask ) { KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,6) }
if ( (1<<7) < BlockSizeMask ) { KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,7) }
if ( (1<<8) < BlockSizeMask ) { KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,8) }
if ( (1<<9) < BlockSizeMask ) { KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,9) }
#else
if ( (1<<5) < BlockSizeMask ) { KOKKOS_IMPL_CUDA_SYNCWARP; BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,5) }
if ( (1<<6) < BlockSizeMask ) { KOKKOS_IMPL_CUDA_SYNCWARP; BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,6) }
if ( (1<<7) < BlockSizeMask ) { KOKKOS_IMPL_CUDA_SYNCWARP; BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,7) }
if ( (1<<8) < BlockSizeMask ) { KOKKOS_IMPL_CUDA_SYNCWARP; BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,8) }
if ( (1<<9) < BlockSizeMask ) { KOKKOS_IMPL_CUDA_SYNCWARP; BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,9) }
#endif
if ( DoScan ) {
@ -846,10 +858,17 @@ void cuda_intra_block_reduce_scan( const FunctorType & functor ,
if ( ! ( rtid_inter + n < blockDim.y ) ) n = 0 ;
__threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,8)
__threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,7)
__threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,6)
__threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,5)
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); BLOCK_SCAN_STEP(tdata_inter,n,8)
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); BLOCK_SCAN_STEP(tdata_inter,n,7)
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); BLOCK_SCAN_STEP(tdata_inter,n,6)
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); BLOCK_SCAN_STEP(tdata_inter,n,5)
#else
KOKKOS_IMPL_CUDA_SYNCWARP; BLOCK_SCAN_STEP(tdata_inter,n,8)
KOKKOS_IMPL_CUDA_SYNCWARP; BLOCK_SCAN_STEP(tdata_inter,n,7)
KOKKOS_IMPL_CUDA_SYNCWARP; BLOCK_SCAN_STEP(tdata_inter,n,6)
KOKKOS_IMPL_CUDA_SYNCWARP; BLOCK_SCAN_STEP(tdata_inter,n,5)
#endif
}
}
}
@ -864,19 +883,17 @@ void cuda_intra_block_reduce_scan( const FunctorType & functor ,
( rtid_intra & 16 ) ? 16 : 0 ))));
if ( ! ( rtid_intra + n < blockDim.y ) ) n = 0 ;
#ifdef KOKKOS_IMPL_CUDA_CLANG_WORKAROUND
BLOCK_SCAN_STEP(tdata_intra,n,4) __syncthreads();//__threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,3) __syncthreads();//__threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,2) __syncthreads();//__threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,1) __syncthreads();//__threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,0) __syncthreads();
#else
KOKKOS_IMPL_CUDA_SYNCWARP;
BLOCK_SCAN_STEP(tdata_intra,n,4) __threadfence_block();
KOKKOS_IMPL_CUDA_SYNCWARP;
BLOCK_SCAN_STEP(tdata_intra,n,3) __threadfence_block();
KOKKOS_IMPL_CUDA_SYNCWARP;
BLOCK_SCAN_STEP(tdata_intra,n,2) __threadfence_block();
KOKKOS_IMPL_CUDA_SYNCWARP;
BLOCK_SCAN_STEP(tdata_intra,n,1) __threadfence_block();
KOKKOS_IMPL_CUDA_SYNCWARP;
BLOCK_SCAN_STEP(tdata_intra,n,0) __threadfence_block();
#endif
KOKKOS_IMPL_CUDA_SYNCWARP;
}
#undef BLOCK_SCAN_STEP

View File

@ -290,7 +290,7 @@ public:
// Intra vector lane shuffle reduction:
typename ReducerType::value_type tmp ( reducer.reference() );
unsigned mask = blockDim.x==32?0xffffffff:((1<<blockDim.x)-1)<<(threadIdx.y%(32/blockDim.x))*blockDim.x;
unsigned mask = blockDim.x==32?0xffffffff:((1<<blockDim.x)-1)<<((threadIdx.y%(32/blockDim.x))*blockDim.x);
for ( int i = blockDim.x ; ( i >>= 1 ) ; ) {
cuda_shfl_down( reducer.reference() , tmp , i , blockDim.x , mask );
@ -742,7 +742,7 @@ void parallel_for
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(blockDim.x==32?0xffffffff:((1<<blockDim.x)-1)<<(threadIdx.y%(32/blockDim.x))*blockDim.x);
#else
KOKKOS_IMPL_CUDA_SYNCWARP_MASK;
KOKKOS_IMPL_CUDA_SYNCWARP;
#endif
#endif
}
@ -915,7 +915,7 @@ void single(const Impl::VectorSingleStruct<Impl::CudaTeamMember>& , const Functo
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(blockDim.x==32?0xffffffff:((1<<blockDim.x)-1)<<(threadIdx.y%(32/blockDim.x))*blockDim.x);
#else
KOKKOS_IMPL_CUDA_SYNCWARP_MASK;
KOKKOS_IMPL_CUDA_SYNCWARP;
#endif
#endif
}
@ -928,7 +928,7 @@ void single(const Impl::ThreadSingleStruct<Impl::CudaTeamMember>& , const Functo
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(blockDim.x==32?0xffffffff:((1<<blockDim.x)-1)<<(threadIdx.y%(32/blockDim.x))*blockDim.x);
#else
KOKKOS_IMPL_CUDA_SYNCWARP_MASK;
KOKKOS_IMPL_CUDA_SYNCWARP;
#endif
#endif
}
@ -938,7 +938,7 @@ KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::CudaTeamMember>& , const FunctorType& lambda, ValueType& val) {
#ifdef __CUDA_ARCH__
if(threadIdx.x == 0) lambda(val);
unsigned mask = blockDim.x==32?0xffffffff:((1<<blockDim.x)-1)<<(threadIdx.y%(32/blockDim.x))*blockDim.x;
unsigned mask = blockDim.x==32?0xffffffff:((1<<blockDim.x)-1)<<((threadIdx.y%(32/blockDim.x))*blockDim.x);
Impl::cuda_shfl(val,val,0,blockDim.x,mask);
#endif
}

View File

@ -4,9 +4,9 @@
#if ( CUDA_VERSION < 9000 )
#define KOKKOS_IMPL_CUDA_ACTIVEMASK 0
#define KOKKOS_IMPL_CUDA_SYNCWARP __threadfence_block()
#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK __threadfence_block()
#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(m) if(m)__threadfence_block()
#define KOKKOS_IMPL_CUDA_BALLOT(x) __ballot(x)
#define KOKKOS_IMPL_CUDA_BALLOT_MASK(x) __ballot(x)
#define KOKKOS_IMPL_CUDA_BALLOT_MASK(m,x) __ballot(x)
#define KOKKOS_IMPL_CUDA_SHFL(x,y,z) __shfl(x,y,z)
#define KOKKOS_IMPL_CUDA_SHFL_MASK(m,x,y,z) __shfl(x,y,z)
#define KOKKOS_IMPL_CUDA_SHFL_UP(x,y,z) __shfl_up(x,y,z)
@ -16,7 +16,7 @@
#else
#define KOKKOS_IMPL_CUDA_ACTIVEMASK __activemask()
#define KOKKOS_IMPL_CUDA_SYNCWARP __syncwarp(0xffffffff)
#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(m) __syncwarp(m);
#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(m) __syncwarp(m)
#define KOKKOS_IMPL_CUDA_BALLOT(x) __ballot_sync(__activemask(),x)
#define KOKKOS_IMPL_CUDA_BALLOT_MASK(m,x) __ballot_sync(m,x)
#define KOKKOS_IMPL_CUDA_SHFL(x,y,z) __shfl_sync(0xffffffff,x,y,z)
@ -29,9 +29,9 @@
#else
#define KOKKOS_IMPL_CUDA_ACTIVEMASK 0
#define KOKKOS_IMPL_CUDA_SYNCWARP
#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK
#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(m) (void)m
#define KOKKOS_IMPL_CUDA_BALLOT(x) 0
#define KOKKOS_IMPL_CUDA_BALLOT_MASK(x) 0
#define KOKKOS_IMPL_CUDA_BALLOT_MASK(m,x) 0
#define KOKKOS_IMPL_CUDA_SHFL(x,y,z) 0
#define KOKKOS_IMPL_CUDA_SHFL_MASK(m,x,y,z) 0
#define KOKKOS_IMPL_CUDA_SHFL_UP(x,y,z) 0

View File

@ -1401,7 +1401,33 @@ void deep_copy
typedef typename src_type::memory_space src_memory_space ;
typedef typename dst_type::value_type dst_value_type ;
typedef typename src_type::value_type src_value_type ;
if(dst.data() == NULL && src.data() == NULL) {
if(dst.data() == NULL || src.data() == NULL) {
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
// do nothing
#else
// throw if dimension mismatch
if ( (src.extent(0) != dst.extent(0)) ||
(src.extent(1) != dst.extent(1)) ||
(src.extent(2) != dst.extent(2)) ||
(src.extent(3) != dst.extent(3)) ||
(src.extent(4) != dst.extent(4)) ||
(src.extent(5) != dst.extent(5)) ||
(src.extent(6) != dst.extent(6)) ||
(src.extent(7) != dst.extent(7))
) {
std::string message("Deprecation Error: Kokkos::deep_copy extents of views don't match: ");
message += dst.label(); message += "(";
for(int r = 0; r<dst_type::Rank-1; r++)
{ message+= std::to_string(dst.extent(r)); message += ","; }
message+= std::to_string(dst.extent(dst_type::Rank-1)); message += ") ";
message += src.label(); message += "(";
for(int r = 0; r<src_type::Rank-1; r++)
{ message+= std::to_string(src.extent(r)); message += ","; }
message+= std::to_string(src.extent(src_type::Rank-1)); message += ") ";
Kokkos::Impl::throw_runtime_exception(message);
}
#endif
Kokkos::fence();
return;
}
@ -1646,7 +1672,33 @@ void deep_copy
typedef typename dst_type::value_type dst_value_type ;
typedef typename src_type::value_type src_value_type ;
if(dst.data() == NULL && src.data() == NULL) {
if(dst.data() == NULL || src.data() == NULL) {
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
// do nothing
#else
// throw if dimension mismatch
if ( (src.extent(0) != dst.extent(0)) ||
(src.extent(1) != dst.extent(1)) ||
(src.extent(2) != dst.extent(2)) ||
(src.extent(3) != dst.extent(3)) ||
(src.extent(4) != dst.extent(4)) ||
(src.extent(5) != dst.extent(5)) ||
(src.extent(6) != dst.extent(6)) ||
(src.extent(7) != dst.extent(7))
) {
std::string message("Deprecation Error: Kokkos::deep_copy extents of views don't match: ");
message += dst.label(); message += "(";
for(int r = 0; r<dst_type::Rank-1; r++)
{ message+= std::to_string(dst.extent(r)); message += ","; }
message+= std::to_string(dst.extent(dst_type::Rank-1)); message += ") ";
message += src.label(); message += "(";
for(int r = 0; r<src_type::Rank-1; r++)
{ message+= std::to_string(src.extent(r)); message += ","; }
message+= std::to_string(src.extent(src_type::Rank-1)); message += ") ";
Kokkos::Impl::throw_runtime_exception(message);
}
#endif
exec_space.fence();
return;
}

View File

@ -100,32 +100,27 @@ public:
row_map_type row_map;
entries_type entries;
//! Construct an empty view.
Crs() : row_map(), entries() {}
//! Copy constructor (shallow copy).
Crs(const Crs& rhs) : row_map(rhs.row_map), entries(rhs.entries)
{}
/*
* Default Constructors, operators and destructor
*/
KOKKOS_FUNCTION Crs() = default;
KOKKOS_FUNCTION Crs(Crs const &) = default;
KOKKOS_FUNCTION Crs(Crs &&) = default;
KOKKOS_FUNCTION Crs& operator=(Crs const &) = default;
KOKKOS_FUNCTION Crs& operator=(Crs &&) = default;
KOKKOS_FUNCTION ~Crs() = default;
/** \brief Assign to a view of the rhs array.
* If the old view is the last view
* then allocated memory is deallocated.
*/
template<class EntriesType, class RowMapType>
Crs(const RowMapType& row_map_, const EntriesType& entries_) : row_map(row_map_), entries(entries_)
{}
/** \brief Assign to a view of the rhs array.
* If the old view is the last view
* then allocated memory is deallocated.
*/
Crs& operator= (const Crs& rhs) {
row_map = rhs.row_map;
entries = rhs.entries;
return *this;
KOKKOS_INLINE_FUNCTION
Crs(const RowMapType& row_map_, const EntriesType& entries_)
: row_map(row_map_), entries(entries_)
{
}
/** \brief Destroy this view of the array.
* If the last view then allocated memory is deallocated.
*/
~Crs() {}
/** \brief Return number of rows in the graph
*/
KOKKOS_INLINE_FUNCTION

View File

@ -170,6 +170,10 @@
// see https://github.com/kokkos/kokkos/issues/1470
#define KOKKOS_CUDA_9_DEFAULTED_BUG_WORKAROUND
#endif
#if ( 10000 > CUDA_VERSION )
#define KOKKOS_ENABLE_PRE_CUDA_10_DEPRECATION_API
#endif
#endif // #if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )
//----------------------------------------------------------------------------

View File

@ -505,7 +505,7 @@ public:
}
KOKKOS_INLINE_FUNCTION
value_type& reference() {
value_type& reference() const {
return *value;
}
@ -559,7 +559,7 @@ public:
}
KOKKOS_INLINE_FUNCTION
value_type& reference() {
value_type& reference() const {
return *value;
}
@ -637,7 +637,7 @@ public:
}
KOKKOS_INLINE_FUNCTION
value_type& reference() {
value_type& reference() const {
return *value;
}
@ -727,7 +727,7 @@ public:
}
KOKKOS_INLINE_FUNCTION
value_type& reference() {
value_type& reference() const {
return *value;
}

View File

@ -198,6 +198,7 @@ struct ViewTraits< void >
typedef void HostMirrorSpace ;
typedef void array_layout ;
typedef void memory_traits ;
typedef void specialize ;
};
template< class ... Prop >
@ -209,6 +210,7 @@ struct ViewTraits< void , void , Prop ... >
typedef typename ViewTraits<void,Prop...>::HostMirrorSpace HostMirrorSpace ;
typedef typename ViewTraits<void,Prop...>::array_layout array_layout ;
typedef typename ViewTraits<void,Prop...>::memory_traits memory_traits ;
typedef typename ViewTraits<void,Prop...>::specialize specialize ;
};
template< class ArrayLayout , class ... Prop >
@ -221,6 +223,7 @@ struct ViewTraits< typename std::enable_if< Kokkos::Impl::is_array_layout<ArrayL
typedef typename ViewTraits<void,Prop...>::HostMirrorSpace HostMirrorSpace ;
typedef ArrayLayout array_layout ;
typedef typename ViewTraits<void,Prop...>::memory_traits memory_traits ;
typedef typename ViewTraits<void,Prop...>::specialize specialize ;
};
template< class Space , class ... Prop >
@ -239,6 +242,7 @@ struct ViewTraits< typename std::enable_if< Kokkos::Impl::is_space<Space>::value
typedef typename Kokkos::Impl::HostMirror< Space >::Space HostMirrorSpace ;
typedef typename execution_space::array_layout array_layout ;
typedef typename ViewTraits<void,Prop...>::memory_traits memory_traits ;
typedef typename ViewTraits<void,Prop...>::specialize specialize ;
};
template< class MemoryTraits , class ... Prop >
@ -257,6 +261,7 @@ struct ViewTraits< typename std::enable_if< Kokkos::Impl::is_memory_traits<Memor
typedef void HostMirrorSpace ;
typedef void array_layout ;
typedef MemoryTraits memory_traits ;
typedef void specialize ;
};
@ -335,7 +340,12 @@ public:
typedef ArrayLayout array_layout ;
typedef typename data_analysis::dimension dimension ;
typedef typename data_analysis::specialize specialize /* mapping specialization tag */ ;
typedef typename std::conditional<
std::is_same<typename data_analysis::specialize,void>::value
,typename prop::specialize
,typename data_analysis::specialize>::type
specialize ; /* mapping specialization tag */
enum { rank = dimension::rank };
enum { rank_dynamic = dimension::rank_dynamic };
@ -542,7 +552,7 @@ public:
private:
typedef Kokkos::Impl::ViewMapping< traits , void > map_type ;
typedef Kokkos::Impl::ViewMapping< traits , typename traits::specialize > map_type ;
typedef Kokkos::Impl::SharedAllocationTracker track_type ;
track_type m_track ;
@ -608,13 +618,18 @@ public:
template< typename iType >
KOKKOS_INLINE_FUNCTION constexpr
typename std::enable_if< std::is_integral<iType>::value , size_t >::type
extent( const iType & r ) const
extent( const iType & r ) const noexcept
{ return m_map.extent(r); }
static KOKKOS_INLINE_FUNCTION constexpr
size_t
static_extent( const unsigned r ) noexcept
{ return map_type::static_extent(r); }
template< typename iType >
KOKKOS_INLINE_FUNCTION constexpr
typename std::enable_if< std::is_integral<iType>::value , int >::type
extent_int( const iType & r ) const
extent_int( const iType & r ) const noexcept
{ return static_cast<int>(m_map.extent(r)); }
KOKKOS_INLINE_FUNCTION constexpr
@ -709,11 +724,11 @@ public:
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
KOKKOS_INLINE_FUNCTION
const Kokkos::Impl::ViewMapping< traits , void > &
const Kokkos::Impl::ViewMapping< traits , typename traits::specialize > &
implementation_map() const { return m_map ; }
#endif
KOKKOS_INLINE_FUNCTION
const Kokkos::Impl::ViewMapping< traits , void > &
const Kokkos::Impl::ViewMapping< traits , typename traits::specialize > &
impl_map() const { return m_map ; }
KOKKOS_INLINE_FUNCTION
const Kokkos::Impl::SharedAllocationTracker &
@ -1955,7 +1970,7 @@ public:
, m_map()
{
typedef typename View<RT,RP...>::traits SrcTraits ;
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , typename traits::specialize > Mapping ;
static_assert( Mapping::is_assignable , "Incompatible View copy construction" );
Mapping::assign( m_map , rhs.m_map , rhs.m_track );
}
@ -1965,7 +1980,7 @@ public:
View & operator = ( const View<RT,RP...> & rhs )
{
typedef typename View<RT,RP...>::traits SrcTraits ;
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , typename traits::specialize > Mapping ;
static_assert( Mapping::is_assignable , "Incompatible View copy assignment" );
Mapping::assign( m_map , rhs.m_map , rhs.m_track );
m_track.assign( rhs.m_track , traits::is_managed );
@ -1992,7 +2007,7 @@ public:
typedef typename Mapping::type DstType ;
static_assert( Kokkos::Impl::ViewMapping< traits , typename DstType::traits , void >::is_assignable
static_assert( Kokkos::Impl::ViewMapping< traits , typename DstType::traits , typename traits::specialize >::is_assignable
, "Subview construction requires compatible view and subview arguments" );
Mapping::assign( m_map, src_view.m_map, arg0 , args... );
@ -2266,10 +2281,10 @@ public:
}
template <class Traits>
KOKKOS_INLINE_FUNCTION
View( const track_type & track, const Kokkos::Impl::ViewMapping< Traits , void > &map ) :
View( const track_type & track, const Kokkos::Impl::ViewMapping< Traits , typename Traits::specialize > &map ) :
m_track(track), m_map()
{
typedef Kokkos::Impl::ViewMapping< traits , Traits , void > Mapping ;
typedef Kokkos::Impl::ViewMapping< traits , Traits , typename traits::specialize > Mapping ;
static_assert( Mapping::is_assignable , "Incompatible View copy construction" );
Mapping::assign( m_map , map , track );
}

View File

@ -142,14 +142,14 @@ private:
WorkRange range( self.m_policy , exec.pool_rank() , exec.pool_size() );
exec.set_work_range(range.begin(),range.end(),self.m_policy.chunk_size());
exec.set_work_range(0,range.end()-range.begin(),self.m_policy.chunk_size());
exec.reset_steal_target();
exec.barrier();
long work_index = exec.get_work_index();
while(work_index != -1) {
const Member begin = static_cast<Member>(work_index) * self.m_policy.chunk_size();
const Member begin = static_cast<Member>(work_index) * self.m_policy.chunk_size()+range.begin();
const Member end = begin + self.m_policy.chunk_size() < self.m_policy.end()?begin+self.m_policy.chunk_size():self.m_policy.end();
ParallelFor::template exec_range< WorkTag >
@ -470,14 +470,14 @@ private:
const ParallelReduce & self = * ((const ParallelReduce *) arg );
const WorkRange range( self.m_policy, exec.pool_rank(), exec.pool_size() );
exec.set_work_range(range.begin(),range.end(),self.m_policy.chunk_size());
exec.set_work_range(0,range.end()-range.begin(),self.m_policy.chunk_size());
exec.reset_steal_target();
exec.barrier();
long work_index = exec.get_work_index();
reference_type update = ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() );
while(work_index != -1) {
const Member begin = static_cast<Member>(work_index) * self.m_policy.chunk_size();
const Member begin = static_cast<Member>(work_index) * self.m_policy.chunk_size() + range.begin();
const Member end = begin + self.m_policy.chunk_size() < self.m_policy.end()?begin+self.m_policy.chunk_size():self.m_policy.end();
ParallelReduce::template exec_range< WorkTag >
( self.m_functor , begin , end

View File

@ -111,7 +111,7 @@ T atomic_compare_exchange( volatile T * const dest , const T & compare ,
unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK;
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
#endif
unsigned int done_active = 0;
while (active!=done_active) {
@ -127,7 +127,7 @@ T atomic_compare_exchange( volatile T * const dest , const T & compare ,
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,done);
#else
done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(done);
done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
#endif
}
return return_val;
@ -308,6 +308,16 @@ T atomic_compare_exchange( volatile T * const dest_v, const T compare, const T v
#endif
#endif // !defined ROCM_ATOMICS
// dummy for non-CUDA Kokkos headers being processed by NVCC
// Compiled only when __CUDA_ARCH__ is defined while KOKKOS_ENABLE_CUDA is not.
// The stub leaves all three parameters unnamed and unused, touches no memory,
// and simply returns a value-initialized T.
#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA)
template <typename T>
__inline__ __device__
T atomic_compare_exchange(volatile T * const, const Kokkos::Impl::identity_t<T>, const Kokkos::Impl::identity_t<T>)
{
return T();
}
#endif
template <typename T>
KOKKOS_INLINE_FUNCTION
bool atomic_compare_exchange_strong(volatile T* const dest, const T compare, const T val)

View File

@ -134,7 +134,7 @@ T atomic_exchange( volatile T * const dest ,
unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK;
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
#endif
unsigned int done_active = 0;
while (active!=done_active) {
@ -149,7 +149,7 @@ T atomic_exchange( volatile T * const dest ,
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,done);
#else
done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(done);
done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
#endif
}
return return_val;
@ -418,6 +418,23 @@ void atomic_assign( volatile T * const dest_v , const T val )
#endif
#endif
// dummy for non-CUDA Kokkos headers being processed by NVCC
// Compiled only when __CUDA_ARCH__ is defined while KOKKOS_ENABLE_CUDA is not.
// Both stubs ignore their (unnamed) arguments and perform no memory access.
#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA)
// Placeholder atomic_exchange: returns a value-initialized T.
template <typename T>
__inline__ __device__
T atomic_exchange(volatile T * const, const Kokkos::Impl::identity_t<T>)
{
return T();
}
// Placeholder atomic_assign: intentionally empty body.
template < typename T >
__inline__ __device__
void atomic_assign(volatile T * const, const Kokkos::Impl::identity_t<T>)
{
}
#endif
} // namespace Kokkos
#endif

View File

@ -147,7 +147,7 @@ T atomic_fetch_add( volatile T * const dest ,
unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK;
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
#endif
unsigned int done_active = 0;
while (active!=done_active) {
@ -164,7 +164,7 @@ T atomic_fetch_add( volatile T * const dest ,
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,done);
#else
done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(done);
done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
#endif
}
return return_val;
@ -384,6 +384,15 @@ T atomic_fetch_add( volatile T * const dest_v , typename std::add_const<T>::type
#endif // !defined ROCM_ATOMICS
//----------------------------------------------------------------------------
// dummy for non-CUDA Kokkos headers being processed by NVCC
// Compiled only when __CUDA_ARCH__ is defined while KOKKOS_ENABLE_CUDA is not.
// Placeholder atomic_fetch_add: ignores both arguments, adds nothing, and
// returns a value-initialized T.
#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA)
template< typename T >
__inline__ __device__
T atomic_fetch_add(volatile T* const, Kokkos::Impl::identity_t<T>) {
return T();
}
#endif
// Simpler version of atomic_fetch_add without the fetch
template <typename T>
KOKKOS_INLINE_FUNCTION

View File

@ -149,6 +149,15 @@ T atomic_fetch_and( volatile T * const dest_v , const T val )
#endif
//----------------------------------------------------------------------------
// dummy for non-CUDA Kokkos headers being processed by NVCC
// Compiled only when __CUDA_ARCH__ is defined while KOKKOS_ENABLE_CUDA is not.
// Placeholder atomic_fetch_and: ignores both arguments, modifies nothing, and
// returns a value-initialized T.
#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA)
template< typename T >
__inline__ __device__
T atomic_fetch_and(volatile T* const, Kokkos::Impl::identity_t<T>) {
return T();
}
#endif
// Simpler version of atomic_fetch_and without the fetch
template <typename T>
KOKKOS_INLINE_FUNCTION

View File

@ -149,6 +149,15 @@ T atomic_fetch_or( volatile T * const dest_v , const T val )
#endif
//----------------------------------------------------------------------------
// dummy for non-CUDA Kokkos headers being processed by NVCC
// Compiled only when __CUDA_ARCH__ is defined while KOKKOS_ENABLE_CUDA is not.
// Placeholder atomic_fetch_or: ignores both arguments, modifies nothing, and
// returns a value-initialized T.
#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA)
template< typename T >
__inline__ __device__
T atomic_fetch_or(volatile T* const, Kokkos::Impl::identity_t<T>) {
return T();
}
#endif
// Simpler version of atomic_fetch_or without the fetch
template <typename T>
KOKKOS_INLINE_FUNCTION

View File

@ -139,7 +139,7 @@ T atomic_fetch_sub( volatile T * const dest ,
unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK;
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
#endif
unsigned int done_active = 0;
while (active!=done_active) {
@ -154,7 +154,7 @@ T atomic_fetch_sub( volatile T * const dest ,
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,done);
#else
done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(done);
done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
#endif
}
return return_val;
@ -304,6 +304,15 @@ T atomic_fetch_sub( volatile T * const dest_v , const T val )
#endif
#endif // !defined ROCM_ATOMICS
// dummy for non-CUDA Kokkos headers being processed by NVCC
// Compiled only when __CUDA_ARCH__ is defined while KOKKOS_ENABLE_CUDA is not.
// Placeholder atomic_fetch_sub: ignores both arguments, subtracts nothing, and
// returns a value-initialized T.
#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA)
template< typename T >
__inline__ __device__
T atomic_fetch_sub(volatile T* const, Kokkos::Impl::identity_t<T>) {
return T();
}
#endif
// Simpler version of atomic_fetch_sub without the fetch
template <typename T>
KOKKOS_INLINE_FUNCTION

View File

@ -230,9 +230,6 @@ T atomic_fetch_oper( const Oper& op, volatile T * const dest ,
typename Kokkos::Impl::enable_if<
( sizeof(T) != 4 )
&& ( sizeof(T) != 8 )
#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
&& ( sizeof(T) != 16 )
#endif
, const T >::type val )
{
@ -250,7 +247,7 @@ T atomic_fetch_oper( const Oper& op, volatile T * const dest ,
unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK;
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
#endif
unsigned int done_active = 0;
while (active!=done_active) {
@ -265,7 +262,7 @@ T atomic_fetch_oper( const Oper& op, volatile T * const dest ,
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,done);
#else
done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(done);
done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
#endif
}
return return_val;
@ -298,7 +295,7 @@ T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK;
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
#endif
unsigned int done_active = 0;
while (active!=done_active) {
@ -313,7 +310,7 @@ T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,done);
#else
done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(done);
done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
#endif
}
return return_val;

View File

@ -49,6 +49,7 @@
#include <sstream>
#include <cstdlib>
#include <stack>
#include <cerrno>
//----------------------------------------------------------------------------
@ -70,7 +71,6 @@ bool is_unsigned_int(const char* str)
}
return true;
}
void initialize_internal(const InitArguments& args)
{
// This is an experimental setting
@ -99,6 +99,7 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
if (use_gpu < 0 && ndevices >= 0) {
auto local_rank_str = std::getenv("OMPI_COMM_WORLD_LOCAL_RANK"); //OpenMPI
if (!local_rank_str) local_rank_str = std::getenv("MV2_COMM_WORLD_LOCAL_RANK"); //MVAPICH2
if (!local_rank_str) local_rank_str = std::getenv("SLURM_LOCALID"); //SLURM
if (local_rank_str) {
auto local_rank = std::atoi(local_rank_str);
use_gpu = local_rank % ndevices;
@ -532,6 +533,85 @@ void initialize(int& narg, char* arg[])
iarg++;
}
//Read environment variables
// Each variable below follows the same pattern:
//   * parse the value with strtol in base 10;
//   * throw if no characters could be converted (endptr still points at the
//     start of the string) -- trailing characters after a number are not checked;
//   * throw if strtol reported ERANGE;
//   * throw if the corresponding setting already holds a value other than -1
//     that differs from the environment value; otherwise adopt the environment value.
// NOTE(review): -1 appears to be the "not set" marker for these settings -- confirm
// against their initialisation earlier in this function.
char * endptr;
// KOKKOS_NUM_THREADS -> num_threads
auto env_num_threads_str = std::getenv("KOKKOS_NUM_THREADS");
if (env_num_threads_str!=nullptr) {
errno = 0;
auto env_num_threads = std::strtol(env_num_threads_str,&endptr,10);
if (endptr== env_num_threads_str)
Impl::throw_runtime_exception("Error: cannot convert KOKKOS_NUM_THREADS to an integer. Raised by Kokkos::initialize(int narg, char* argc[]).");
if (errno == ERANGE)
Impl::throw_runtime_exception("Error: KOKKOS_NUM_THREADS out of range of representable values by an integer. Raised by Kokkos::initialize(int narg, char* argc[]).");
if ((num_threads != -1)&&(env_num_threads!=num_threads))
Impl::throw_runtime_exception("Error: expecting a match between --kokkos-threads and KOKKOS_NUM_THREADS if both are set. Raised by Kokkos::initialize(int narg, char* argc[]).");
else
num_threads = env_num_threads;
}
// KOKKOS_NUMA -> numa
auto env_numa_str = std::getenv("KOKKOS_NUMA");
if (env_numa_str!=nullptr) {
errno = 0;
auto env_numa = std::strtol(env_numa_str,&endptr,10);
if (endptr== env_numa_str)
Impl::throw_runtime_exception("Error: cannot convert KOKKOS_NUMA to an integer. Raised by Kokkos::initialize(int narg, char* argc[]).");
if (errno == ERANGE)
Impl::throw_runtime_exception("Error: KOKKOS_NUMA out of range of representable values by an integer. Raised by Kokkos::initialize(int narg, char* argc[]).");
if ((numa != -1)&&(env_numa!=numa))
Impl::throw_runtime_exception("Error: expecting a match between --kokkos-numa and KOKKOS_NUMA if both are set. Raised by Kokkos::initialize(int narg, char* argc[]).");
else
numa = env_numa;
}
// KOKKOS_DEVICE_ID -> device
auto env_device_str = std::getenv("KOKKOS_DEVICE_ID");
if (env_device_str!=nullptr) {
errno = 0;
auto env_device = std::strtol(env_device_str,&endptr,10);
if (endptr== env_device_str)
Impl::throw_runtime_exception("Error: cannot convert KOKKOS_DEVICE_ID to an integer. Raised by Kokkos::initialize(int narg, char* argc[]).");
if (errno == ERANGE)
Impl::throw_runtime_exception("Error: KOKKOS_DEVICE_ID out of range of representable values by an integer. Raised by Kokkos::initialize(int narg, char* argc[]).");
if ((device != -1)&&(env_device!=device))
Impl::throw_runtime_exception("Error: expecting a match between --kokkos-device and KOKKOS_DEVICE_ID if both are set. Raised by Kokkos::initialize(int narg, char* argc[]).");
else
device = env_device;
}
// KOKKOS_NUM_DEVICES -> ndevices.
// Parsed with strtol (base 10); throws when nothing could be converted, when
// strtol reports ERANGE, or when ndevices already holds a value other than -1
// that differs from the environment value.
auto env_ndevices_str = std::getenv("KOKKOS_NUM_DEVICES");
if (env_ndevices_str!=nullptr) {
errno = 0;
auto env_ndevices = std::strtol(env_ndevices_str,&endptr,10);
if (endptr== env_ndevices_str)
Impl::throw_runtime_exception("Error: cannot convert KOKKOS_NUM_DEVICES to an integer. Raised by Kokkos::initialize(int narg, char* argc[]).");
if (errno == ERANGE)
Impl::throw_runtime_exception("Error: KOKKOS_NUM_DEVICES out of range of representable values by an integer. Raised by Kokkos::initialize(int narg, char* argc[]).");
if ((ndevices != -1)&&(env_ndevices!=ndevices))
Impl::throw_runtime_exception("Error: expecting a match between --kokkos-ndevices and KOKKOS_NUM_DEVICES if both are set. Raised by Kokkos::initialize(int narg, char* argc[]).");
else
ndevices = env_ndevices;
//Skip device
// This lookup sits inside the KOKKOS_NUM_DEVICES branch, so KOKKOS_SKIP_DEVICE
// is only consulted when KOKKOS_NUM_DEVICES is set.
// The conflict check compares skip_device against 9999 rather than -1.
// NOTE(review): 9999 looks like the "not set" marker for skip_device, and the
// mismatch message names --kokkos-ndevices, presumably because the skip value
// is supplied through that option -- confirm both against the argument parser.
auto env_skip_device_str = std::getenv("KOKKOS_SKIP_DEVICE");
if (env_skip_device_str!=nullptr) {
errno = 0;
auto env_skip_device = std::strtol(env_skip_device_str,&endptr,10);
if (endptr== env_skip_device_str)
Impl::throw_runtime_exception("Error: cannot convert KOKKOS_SKIP_DEVICE to an integer. Raised by Kokkos::initialize(int narg, char* argc[]).");
if (errno == ERANGE)
Impl::throw_runtime_exception("Error: KOKKOS_SKIP_DEVICE out of range of representable values by an integer. Raised by Kokkos::initialize(int narg, char* argc[]).");
if ((skip_device != 9999)&&(env_skip_device!=skip_device))
Impl::throw_runtime_exception("Error: expecting a match between --kokkos-ndevices and KOKKOS_SKIP_DEVICE if both are set. Raised by Kokkos::initialize(int narg, char* argc[]).");
else
skip_device = env_skip_device;
}
}
// KOKKOS_DISABLE_WARNINGS: the value is compared case-insensitively against
// "TRUE", "ON" and "1"; any of these sets disable_warnings. Any other value
// throws if disable_warnings is already true, and is otherwise ignored.
char * env_disablewarnings_str = std::getenv("KOKKOS_DISABLE_WARNINGS");
if (env_disablewarnings_str!=nullptr) {
  std::string env_str (env_disablewarnings_str); // deep-copies string
  // toupper() is only defined for arguments representable as unsigned char
  // (or EOF). A plain char may be negative for non-ASCII bytes, so convert
  // through unsigned char before the call and back to char afterwards.
  for (char& c : env_str) {
    c = static_cast<char>( toupper( static_cast<unsigned char>(c) ) );
  }
  if ((env_str == "TRUE") || (env_str == "ON") || (env_str == "1"))
    disable_warnings = true;
  else
    if (disable_warnings)
      Impl::throw_runtime_exception("Error: expecting a match between --kokkos-disable-warnings and KOKKOS_DISABLE_WARNINGS if both are set. Raised by Kokkos::initialize(int narg, char* argc[]).");
}
InitArguments arguments;
arguments.num_threads = num_threads;
arguments.num_numa = numa;

View File

@ -409,6 +409,9 @@ struct inclusive_scan_integer_sequence
static constexpr value_type value = helper::value ;
};
// Identity alias template: identity_t<T> is just another name for T.
template <typename T>
using identity_t = T;
}} // namespace Kokkos::Impl

View File

@ -103,13 +103,7 @@ namespace Impl {
/** \brief View mapping for non-specialized data type and standard layout */
template< class Traits >
class ViewMapping< Traits ,
typename std::enable_if<(
std::is_same< typename Traits::specialize , Kokkos::Array<> >::value &&
( std::is_same< typename Traits::array_layout , Kokkos::LayoutLeft >::value ||
std::is_same< typename Traits::array_layout , Kokkos::LayoutRight >::value ||
std::is_same< typename Traits::array_layout , Kokkos::LayoutStride >::value )
)>::type >
class ViewMapping< Traits , Kokkos::Array<> >
{
private:
@ -345,64 +339,6 @@ public:
}
};
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
/** \brief Assign compatible default mappings */
template< class DstTraits , class SrcTraits >
class ViewMapping< DstTraits , SrcTraits ,
typename std::enable_if<(
std::is_same< typename DstTraits::memory_space , typename SrcTraits::memory_space >::value
&&
std::is_same< typename DstTraits::specialize , Kokkos::Array<> >::value
&&
(
std::is_same< typename DstTraits::array_layout , Kokkos::LayoutLeft >::value ||
std::is_same< typename DstTraits::array_layout , Kokkos::LayoutRight >::value ||
std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value
)
&&
std::is_same< typename SrcTraits::specialize , Kokkos::Array<> >::value
&&
(
std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value ||
std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value ||
std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutStride >::value
)
)>::type >
{
public:
enum { is_assignable = true };
typedef Kokkos::Impl::SharedAllocationTracker TrackType ;
typedef ViewMapping< DstTraits , void > DstType ;
typedef ViewMapping< SrcTraits , void > SrcType ;
KOKKOS_INLINE_FUNCTION
static void assign( DstType & dst , const SrcType & src , const TrackType & src_track )
{
static_assert( std::is_same< typename DstTraits::value_type , typename SrcTraits::value_type >::value ||
std::is_same< typename DstTraits::value_type , typename SrcTraits::const_value_type >::value
, "View assignment must have same value type or const = non-const" );
static_assert( ViewDimensionAssignable< typename DstTraits::dimension , typename SrcTraits::dimension >::value
, "View assignment must have compatible dimensions" );
static_assert( std::is_same< typename DstTraits::array_layout , typename SrcTraits::array_layout >::value ||
std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value ||
( DstTraits::dimension::rank == 0 ) ||
( DstTraits::dimension::rank == 1 && DstTraits::dimension::rank_dynamic == 1 )
, "View assignment must have compatible layout or have rank <= 1" );
typedef typename DstType::offset_type dst_offset_type ;
dst.m_impl_offset = dst_offset_type( src.m_impl_offset );
dst.m_impl_handle = src.m_impl_handle ;
dst.m_stride = src.m_stride ;
}
};
/** \brief Assign Array to non-Array */
template< class DstTraits , class SrcTraits >
@ -436,7 +372,7 @@ public:
typedef Kokkos::Impl::SharedAllocationTracker TrackType ;
typedef ViewMapping< DstTraits , void > DstType ;
typedef ViewMapping< SrcTraits , void > SrcType ;
typedef ViewMapping< SrcTraits , Kokkos::Array<> > SrcType ;
KOKKOS_INLINE_FUNCTION
static void assign( DstType & dst , const SrcType & src , const TrackType & src_track )
@ -480,6 +416,7 @@ public:
}
};
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

View File

@ -195,7 +195,7 @@ struct ViewDimension
{}
KOKKOS_INLINE_FUNCTION
constexpr size_t extent( const unsigned r ) const
constexpr size_t extent( const unsigned r ) const noexcept
{
return r == 0 ? N0 : (
r == 1 ? N1 : (
@ -207,6 +207,19 @@ struct ViewDimension
r == 7 ? N7 : 0 )))))));
}
// Static lookup of the ArgN<r> constant for index r: yields ArgN0..ArgN7 for
// r in 0..7 and 0 for any larger r. Being static, it needs no ViewDimension
// instance. Kept as a single return expression (nested conditionals).
// NOTE(review): ArgN0..ArgN7 are declared outside this hunk; presumably they
// are the extents as written in the template arguments -- confirm.
static KOKKOS_INLINE_FUNCTION
constexpr size_t static_extent( const unsigned r ) noexcept
{
return r == 0 ? ArgN0 : (
r == 1 ? ArgN1 : (
r == 2 ? ArgN2 : (
r == 3 ? ArgN3 : (
r == 4 ? ArgN4 : (
r == 5 ? ArgN5 : (
r == 6 ? ArgN6 : (
r == 7 ? ArgN7 : 0 )))))));
}
template< size_t N >
struct prepend { typedef ViewDimension< N , Vals... > type ; };
@ -2640,6 +2653,12 @@ public:
KOKKOS_INLINE_FUNCTION constexpr size_t extent( const iType & r ) const
{ return m_impl_offset.m_dim.extent(r); }
// Static extent query: forwards r to static_extent() of the offset type's
// dimension_type, so it can be called without a mapping instance.
static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( const unsigned r ) noexcept
{ return offset_type::dimension_type::static_extent(r); }
KOKKOS_INLINE_FUNCTION constexpr
typename Traits::array_layout layout() const
{ return m_impl_offset.layout(); }

View File

@ -63,6 +63,86 @@ struct CountFillFunctor {
}
};
/* RunUpdateCrsTest
 * Functor that overwrites the entries of a Crs graph from inside a
 * parallel_for. Four tagged kernels reach the graph in different ways:
 *   1. through the member object, which was itself copy-constructed
 *   2. through a local variable that is explicitly copy-constructed
 *   3. through a default-constructed local that is then assigned to
 *   4. through a local built from the row_map and entries views
 */
template< class CrsType, class ExecSpace, class scalarType >
struct RunUpdateCrsTest {

  // Dispatch tags, one per test case.
  struct TestOne {};
  struct TestTwo {};
  struct TestThree {};
  struct TestFour {};

  CrsType graph;

  RunUpdateCrsTest( CrsType g_in ) : graph(g_in) {}

  // Launch the kernel for case nTest (1..4) over all rows of the graph;
  // any other value of nTest does nothing.
  void run_test(int nTest) {
    if (nTest == 1) {
      parallel_for ("TestCrs1", Kokkos::RangePolicy<ExecSpace, TestOne>(0,graph.numRows()),*this);
    }
    else if (nTest == 2) {
      parallel_for ("TestCrs2", Kokkos::RangePolicy<ExecSpace, TestTwo>(0,graph.numRows()),*this);
    }
    else if (nTest == 3) {
      parallel_for ("TestCrs3", Kokkos::RangePolicy<ExecSpace, TestThree>(0,graph.numRows()),*this);
    }
    else if (nTest == 4) {
      parallel_for ("TestCrs4", Kokkos::RangePolicy<ExecSpace, TestFour>(0,graph.numRows()),*this);
    }
  }

  // Overwrite the entries of one row: the k-th entry (counting from 0)
  // of the row becomes (k+1)*(k+1).
  KOKKOS_INLINE_FUNCTION
  void updateGraph(const CrsType & g_in, const scalarType row) const {
    auto offsets = g_in.row_map;
    auto values  = g_in.entries;
    const auto first = offsets(row);
    const auto count = offsets(row+1) - first;
    for (scalarType k = 0; k < count; ++k) {
      values(first + k) = (k+1)*(k+1);
    }
  }

  // Case 1: use the Crs held as a class member.
  KOKKOS_INLINE_FUNCTION
  void operator()(const TestOne &, const scalarType row) const {
    updateGraph(graph, row);
  }

  // Case 2: use a local Crs copy-constructed from the member.
  KOKKOS_INLINE_FUNCTION
  void operator()(const TestTwo &, const scalarType row) const {
    CrsType copied(graph);
    updateGraph(copied, row);
  }

  // Case 3: default-construct a local Crs, then assign the member to it.
  KOKKOS_INLINE_FUNCTION
  void operator()(const TestThree &, const scalarType row) const {
    CrsType assigned;
    assigned = graph;
    updateGraph(assigned, row);
  }

  // Case 4: build a local Crs from the member's row_map and entries views.
  KOKKOS_INLINE_FUNCTION
  void operator()(const TestFour &, const scalarType row) const {
    CrsType rebuilt(graph.row_map, graph.entries);
    updateGraph(rebuilt, row);
  }
};
template< class ExecSpace >
void test_count_fill(std::int32_t nrows) {
Kokkos::Crs<std::int32_t, ExecSpace, void, std::int32_t> graph;
@ -81,6 +161,38 @@ void test_count_fill(std::int32_t nrows) {
}
}
// Exercise the Crs constructors and assignment operator:
// build and populate a graph with count_and_fill_crs, overwrite its entries
// from a parallel_for that works on the Crs object directly, then copy the
// views to host mirrors and verify row lengths and entry values.
template< class ExecSpace >
void test_constructor(std::int32_t nrows) {
  using crs_int32 = Kokkos::Crs<std::int32_t, ExecSpace, void, std::int32_t>;

  // One freshly built graph per RunUpdateCrsTest case (1 through 4).
  for (int testCase = 1; testCase <= 4; ++testCase) {
    crs_int32 graph;
    Kokkos::count_and_fill_crs(graph, nrows, CountFillFunctor<ExecSpace>());
    ASSERT_EQ(graph.numRows(), nrows);

    RunUpdateCrsTest<crs_int32, ExecSpace, std::int32_t> crstest(graph);
    crstest.run_test(testCase);

    // Bring the results back to the host for checking.
    auto row_map = Kokkos::create_mirror_view(graph.row_map);
    Kokkos::deep_copy(row_map, graph.row_map);
    auto entries = Kokkos::create_mirror_view(graph.entries);
    Kokkos::deep_copy(entries, graph.entries);

    for (std::int32_t row = 0; row < nrows; ++row) {
      // Expected number of entries in this row.
      auto n = (row % 4) + 1;
      ASSERT_EQ(row_map(row + 1) - row_map(row), n);
      for (std::int32_t j = 0; j < n; ++j) {
        ASSERT_EQ(entries(row_map(row) + j), (j + 1)*(j+1));
      }
    }
  }
}
} // anonymous namespace
TEST_F( TEST_CATEGORY, crs_count_fill )
@ -95,4 +207,17 @@ TEST_F( TEST_CATEGORY, crs_count_fill )
test_count_fill<TEST_EXECSPACE>(10000);
}
// Runs test_constructor over a fixed set of row counts, starting with the
// empty graph (0 rows) and growing to 10000 rows.
TEST_F( TEST_CATEGORY, crs_copy_constructor )
{
  const std::int32_t row_counts[] = { 0, 1, 2, 3, 13, 100, 1000, 10000 };
  for (const std::int32_t nrows : row_counts) {
    test_constructor<TEST_EXECSPACE>(nrows);
  }
}
} // namespace Test

View File

@ -956,7 +956,12 @@ struct TestMDRange_3D {
}
, Kokkos::Min<double>(min) );
ASSERT_EQ( min, 8.0 );
if((N0-1)*(N1-1)*(N2-1)>0)
ASSERT_EQ( min, 8.0 );
else {
double min_identity = Kokkos::reduction_identity<double>::min();
ASSERT_EQ( min, min_identity );
}
}
#endif
#endif

View File

@ -46,8 +46,10 @@
namespace Test {
// 3D MDRange test: calls test_for3 and test_reduce3 with the extents
// (1, 10, 100) and (100, 10, 100). The reduce calls are compiled out when
// KOKKOS_ENABLE_ROCM is defined.
TEST_F( TEST_CATEGORY , mdrange_3d) {
TestMDRange_3D< TEST_EXECSPACE >::test_for3( 1, 10, 100 );
TestMDRange_3D< TEST_EXECSPACE >::test_for3( 100, 10, 100 );
#if !defined( KOKKOS_ENABLE_ROCM ) // MDRange Reduced explicitly handled in its own cpp file
TestMDRange_3D< TEST_EXECSPACE >::test_reduce3( 1, 10, 100 );
TestMDRange_3D< TEST_EXECSPACE >::test_reduce3( 100, 10, 100 );
#endif
}

View File

@ -60,8 +60,11 @@ struct TestRange {
struct VerifyInitTag {};
struct ResetTag {};
struct VerifyResetTag {};
struct OffsetTag {};
struct VerifyOffsetTag {};
int N;
int N;
static const int offset = 13;
TestRange( const size_t N_ )
: m_flags( Kokkos::ViewAllocateWithoutInitializing( "flags" ), N_ ), N(N_)
{}
@ -117,6 +120,18 @@ struct TestRange {
if ( int( 2 * i ) != host_flags( i ) ) ++error_count;
}
ASSERT_EQ( error_count, int( 0 ) );
Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace, ScheduleType, OffsetTag >( offset, N + offset ), *this );
Kokkos::parallel_for( std::string("TestKernelFor"), Kokkos::RangePolicy<ExecSpace, ScheduleType, VerifyOffsetTag>( 0, N ), *this);
Kokkos::deep_copy(host_flags, m_flags);
error_count = 0;
for (int i = 0; i < N; ++i) {
if (i + offset != host_flags(i))
++error_count;
}
ASSERT_EQ(error_count, int(0));
}
KOKKOS_INLINE_FUNCTION
@ -144,9 +159,19 @@ struct TestRange {
}
}
//----------------------------------------
// OffsetTag kernel: stores the iteration index i at position i - offset of
// m_flags, so element k ends up holding k + offset.
KOKKOS_INLINE_FUNCTION
void operator()(const OffsetTag &, const int i) const {
m_flags(i - offset) = i;
}
struct OffsetTag {};
// VerifyOffsetTag kernel: checks that m_flags(i) equals i + offset.
// A mismatch is only reported via printf; this functor neither counts
// errors nor fails the test itself.
KOKKOS_INLINE_FUNCTION
void operator()(const VerifyOffsetTag &, const int i) const {
if (i + offset != m_flags(i)) {
printf("TestRange::test_for error at %d != %d\n", i + offset, m_flags(i));
}
}
//----------------------------------------
void test_reduce( )
{
@ -158,7 +183,7 @@ struct TestRange {
// sum( 0 .. N-1 )
ASSERT_EQ( size_t( ( N - 1 ) * ( N ) / 2 ), size_t( total ) );
Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace, ScheduleType, OffsetTag>( 0, N ), *this, total );
Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace, ScheduleType, OffsetTag>( offset, N+offset ), *this, total );
// sum( 1 .. N )
ASSERT_EQ( size_t( ( N ) * ( N + 1 ) / 2 ), size_t( total ) );
}
@ -169,7 +194,7 @@ struct TestRange {
KOKKOS_INLINE_FUNCTION
void operator()( const OffsetTag &, const int i, value_type & update ) const
{ update += 1 + m_flags( i ); }
{ update += 1 + m_flags( i-offset ); }
//----------------------------------------

View File

@ -532,7 +532,11 @@ struct functor_vec_single {
typedef ExecutionSpace execution_space;
Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
functor_vec_single( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
int nStart;
int nEnd;
functor_vec_single( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_, const int start_, const int end_ ) :
flag( flag_ ), nStart(start_), nEnd(end_) {}
KOKKOS_INLINE_FUNCTION
void operator()( typename policy_type::member_type team ) const {
@ -541,7 +545,7 @@ struct functor_vec_single {
// inside a parallel_for and write to it.
Scalar value = 0;
Kokkos::parallel_for( Kokkos::ThreadVectorRange( team, 0, 13 ), [&] ( int i )
Kokkos::parallel_for( Kokkos::ThreadVectorRange( team, nStart, nEnd ), [&] ( int i )
{
value = i; // This write is violating Kokkos semantics for nested parallelism.
});
@ -552,12 +556,12 @@ struct functor_vec_single {
}, value );
Scalar value2 = 0;
Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( team, 0, 13 ), [&] ( int i, Scalar & val )
Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( team, nStart, nEnd ), [&] ( int i, Scalar & val )
{
val += value;
}, value2 );
if ( value2 != ( value * 13 ) ) {
if ( value2 != ( value * (nEnd-nStart) ) ) {
printf( "FAILED vector_single broadcast %i %i %f %f\n",
team.league_rank(), team.team_rank(), (double) value2, (double) value );
@ -746,12 +750,6 @@ bool test_scalar( int nteams, int team_size, int test ) {
functor_vec_red< Scalar, ExecutionSpace >( d_flag ) );
}
else if ( test == 1 ) {
// WORKAROUND CUDA
#if defined(KOKKOS_ENABLE_CUDA)
#if defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) || defined(KOKKOS_ARCH_PASCAL)
if(!std::is_same<ExecutionSpace,Kokkos::Cuda>::value)
#endif
#endif
Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ),
functor_vec_red_reducer< Scalar, ExecutionSpace >( d_flag ) );
}
@ -765,7 +763,7 @@ bool test_scalar( int nteams, int team_size, int test ) {
}
else if ( test == 4 ) {
Kokkos::parallel_for( "B", Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ),
functor_vec_single< Scalar, ExecutionSpace >( d_flag ) );
functor_vec_single< Scalar, ExecutionSpace >( d_flag, 0, 13 ) );
}
else if ( test == 5 ) {
Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size ),
@ -791,6 +789,10 @@ bool test_scalar( int nteams, int team_size, int test ) {
Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ),
functor_team_vector_reduce_reducer< Scalar, ExecutionSpace >( d_flag ) );
}
else if ( test == 11 ) {
Kokkos::parallel_for( "B", Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ),
functor_vec_single< Scalar, ExecutionSpace >( d_flag, 4, 13 ) );
}
Kokkos::deep_copy( h_flag, d_flag );
@ -938,6 +940,7 @@ TEST_F( TEST_CATEGORY, team_vector )
ASSERT_TRUE( ( TestTeamVector::Test< TEST_EXECSPACE >( 8 ) ) );
ASSERT_TRUE( ( TestTeamVector::Test< TEST_EXECSPACE >( 9 ) ) );
ASSERT_TRUE( ( TestTeamVector::Test< TEST_EXECSPACE >( 10 ) ) );
ASSERT_TRUE( ( TestTeamVector::Test< TEST_EXECSPACE >( 11 ) ) );
}
#endif

View File

@ -56,17 +56,13 @@ struct TestViewCopy {
using InExecSpace = ExecSpace;
static void test_view_copy()
static void test_view_copy(const int dim0, const int dim1, const int dim2)
{
#if defined( KOKKOS_ENABLE_CUDA ) || defined( KOKKOS_ENABLE_ROCM )
// ExecSpace = CudaUVM, CudaHostPinned
// This test will fail at runtime with an illegal memory access if something goes wrong
// Test 1: deep_copy from host_mirror_space to ExecSpace and ExecSpace back to host_mirror_space
{
const int dim0 = 4;
const int dim1 = 2;
const int dim2 = 3;
typedef Kokkos::View<double****,InExecSpace> Rank4ViewType;
Rank4ViewType view_4;
view_4 = Rank4ViewType("view_4", dim0, dim1, dim2, dim2);
@ -88,19 +84,21 @@ struct TestViewCopy {
// Test 2: deep_copy from Cuda to ExecSpace and ExecSpace back to Cuda
{
const int dim0 = 4;
const int dim1 = 2;
const int dim2 = 3;
typedef Kokkos::View<double****,InExecSpace> Rank4ViewType;
Rank4ViewType view_4;
view_4 = Rank4ViewType("view_4", dim0, dim1, dim2, dim2);
#if defined( KOKKOS_ENABLE_CUDA )
typedef Kokkos::Cuda space_type;
typedef typename std::conditional<
Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaSpace,typename InExecSpace::memory_space>::accessible,
Kokkos::CudaSpace,
InExecSpace>::type space_type;
#endif
#if defined( KOKKOS_ENABLE_ROCM )
typedef Kokkos::Experimental::ROCm space_type;
typedef typename std::conditional<
Kokkos::Impl::MemorySpaceAccess<Kokkos::ROCmSpace,typename InExecSpace::memory_space>::accessible,
Kokkos::ROCmSpace,
InExecSpace>::type space_type;
#endif
Kokkos::View<double**,Kokkos::LayoutLeft,space_type> srcView("srcView", dim2, dim2);
@ -118,10 +116,6 @@ struct TestViewCopy {
// Test 3: deep_copy from host_space to ExecSpace and ExecSpace back to host_space
{
const int dim0 = 4;
const int dim1 = 2;
const int dim2 = 3;
typedef Kokkos::View<double****,InExecSpace> Rank4ViewType;
Rank4ViewType view_4;
view_4 = Rank4ViewType("view_4", dim0, dim1, dim2, dim2);
@ -149,7 +143,41 @@ struct TestViewCopy {
TEST_F( TEST_CATEGORY , view_copy_tests ) {
//Only include this file to be compiled with CudaUVM and CudaHostPinned
TestViewCopy< TEST_EXECSPACE >::test_view_copy();
TestViewCopy< TEST_EXECSPACE >::test_view_copy(4,2,3);
TestViewCopy< TEST_EXECSPACE >::test_view_copy(4,2,0);
}
// Degenerate deep_copy cases: every destination/source pairing of
// default-constructed and zero-length rank-1 int views, in both managed and
// unmanaged flavours (4 destinations x 4 sources = 16 calls).
// The test contains no assertions; it passes if all calls complete.
TEST_F( TEST_CATEGORY , view_copy_degenerated ) {
// NOTE(review): the comment previously here ("Only include this file to be
// compiled with CudaUVM and CudaHostPinned") looks copied from
// view_copy_tests - verify whether it applies to this test.
// The zero-length unmanaged views wrap the bogus pointer value -1;
// presumably so that any access to the data would fault - confirm.
Kokkos::View<int*, Kokkos::MemoryTraits<Kokkos::Unmanaged>> v_um_def_1;
Kokkos::View<int*, Kokkos::MemoryTraits<Kokkos::Unmanaged>> v_um_1( reinterpret_cast<int*>(-1), 0 );
Kokkos::View<int*> v_m_def_1;
Kokkos::View<int*> v_m_1("v_m_1", 0);
Kokkos::View<int*, Kokkos::MemoryTraits<Kokkos::Unmanaged>> v_um_def_2;
Kokkos::View<int*, Kokkos::MemoryTraits<Kokkos::Unmanaged>> v_um_2( reinterpret_cast<int*>(-1), 0 );
Kokkos::View<int*> v_m_def_2;
Kokkos::View<int*> v_m_2("v_m_2", 0);
// Destination: default-constructed unmanaged view.
Kokkos::deep_copy(v_um_def_1, v_um_def_2);
Kokkos::deep_copy(v_um_def_1, v_um_2);
Kokkos::deep_copy(v_um_def_1, v_m_def_2);
Kokkos::deep_copy(v_um_def_1, v_m_2);
// Destination: zero-length unmanaged view.
Kokkos::deep_copy(v_um_1, v_um_def_2);
Kokkos::deep_copy(v_um_1, v_um_2);
Kokkos::deep_copy(v_um_1, v_m_def_2);
Kokkos::deep_copy(v_um_1, v_m_2);
// Destination: default-constructed managed view.
Kokkos::deep_copy(v_m_def_1, v_um_def_2);
Kokkos::deep_copy(v_m_def_1, v_um_2);
Kokkos::deep_copy(v_m_def_1, v_m_def_2);
Kokkos::deep_copy(v_m_def_1, v_m_2);
// Destination: zero-length managed view.
Kokkos::deep_copy(v_m_1, v_um_def_2);
Kokkos::deep_copy(v_m_1, v_um_2);
Kokkos::deep_copy(v_m_1, v_m_def_2);
Kokkos::deep_copy(v_m_1, v_m_2);
}
} // namespace Test

View File

@ -1245,5 +1245,12 @@ TEST_F( TEST_CATEGORY , view_mapping_operator )
test_view_mapping_operator< TEST_EXECSPACE >();
}
// Checks that View::static_extent reports the compile-time extents of a
// double*[2][3] view for dimensions 1 and 2.
TEST_F( TEST_CATEGORY , static_extent )
{
  typedef Kokkos::View<double*[2][3]> view_type;
  ASSERT_EQ( view_type::static_extent(1), 2 );
  ASSERT_EQ( view_type::static_extent(2), 3 );
}
}

View File

@ -228,6 +228,10 @@ TEST_F( cuda, uvm )
}
}
/* Removing the UVM allocs test because of the time it adds to the overall unit test run.
* The issue verified by this unit test appears to no longer be a
* problem. Refer to GitHub issue 1880 for more details.
*
TEST_F( cuda, uvm_num_allocs )
{
// The max number of UVM allocations allowed is 65536.
@ -288,6 +292,7 @@ TEST_F( cuda, uvm_num_allocs )
#undef MAX_NUM_ALLOCS
}
*/
template< class MemSpace, class ExecSpace >
struct TestViewCudaAccessible {

View File

@ -43,3 +43,4 @@
#include <openmp/TestOpenMP_Category.hpp>
#include <TestViewAPI_e.hpp>
#include <TestViewCopy.hpp>

View File

@ -43,3 +43,5 @@
#include <serial/TestSerial_Category.hpp>
#include <TestViewAPI_e.hpp>
#include <TestViewCopy.hpp>

View File

@ -43,3 +43,4 @@
#include <threads/TestThreads_Category.hpp>
#include <TestViewAPI_e.hpp>
#include <TestViewCopy.hpp>

View File

@ -68,6 +68,9 @@ do
--cxxflags*)
CXXFLAGS="${key#*=}"
;;
--cxxstandard*)
KOKKOS_CXX_STANDARD="${key#*=}"
;;
--ldflags*)
LDFLAGS="${key#*=}"
;;
@ -127,6 +130,7 @@ do
echo "--arch=[OPT]: Set target architectures. Options are:"
echo " [AMD]"
echo " AMDAVX = AMD CPU"
echo " EPYC = AMD EPYC Zen-Core CPU"
echo " [ARM]"
echo " ARMv80 = ARMv8.0 Compatible CPU"
echo " ARMv81 = ARMv8.1 Compatible CPU"
@ -165,6 +169,8 @@ do
echo " build. This will still set certain required"
echo " flags via KOKKOS_CXXFLAGS (such as -fopenmp,"
echo " --std=c++11, etc.)."
echo "--cxxstandard=[FLAGS] Overwrite KOKKOS_CXX_STANDARD for library build and test"
echo " c++11 (default), c++14, c++17, c++1y, c++1z, c++2a"
echo "--ldflags=[FLAGS] Overwrite LDFLAGS for library build and test"
echo " build. This will still set certain required"
echo " flags via KOKKOS_LDFLAGS (such as -fopenmp,"
@ -243,6 +249,10 @@ if [ ${#CXXFLAGS} -gt 0 ]; then
KOKKOS_SETTINGS="${KOKKOS_SETTINGS} CXXFLAGS=\"${CXXFLAGS}\""
fi
# Append KOKKOS_CXX_STANDARD to KOKKOS_SETTINGS only when it is non-empty
# (it is set from the --cxxstandard option during argument parsing).
if [ ${#KOKKOS_CXX_STANDARD} -gt 0 ]; then
KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_CXX_STANDARD=\"${KOKKOS_CXX_STANDARD}\""
fi
if [ ${#LDFLAGS} -gt 0 ]; then
KOKKOS_SETTINGS="${KOKKOS_SETTINGS} LDFLAGS=\"${LDFLAGS}\""
fi

View File

@ -88,6 +88,8 @@ CXX_FLAGS_EXTRA=""
LD_FLAGS_EXTRA=""
KOKKOS_OPTIONS=""
CXX_STANDARD="c++11"
#
# Handle arguments.
#
@ -142,6 +144,9 @@ do
--cxxflags-extra*)
CXX_FLAGS_EXTRA="${key#*=}"
;;
--cxxstandard*)
CXX_STANDARD="${key#*=}"
;;
--ldflags-extra*)
LD_FLAGS_EXTRA="${key#*=}"
;;
@ -227,18 +232,30 @@ elif [ "$MACHINE" = "white" ]; then
export SLURM_TASKS_PER_NODE=32
BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>"
IBM_MODULE_LIST="<COMPILER_NAME>/xl/<COMPILER_VERSION>"
IBM_MODULE_LIST="<COMPILER_NAME>/xl/<COMPILER_VERSION>,gcc/7.2.0"
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/7.2.0,ibm/xl/16.1.0"
CUDA10_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/7.4.0,ibm/xl/16.1.0"
# Don't do pthread on white.
GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("gcc/6.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/7.2.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"ibm/16.1.0 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
"cuda/9.2.88 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
)
# Choose the compiler matrix. With SPOT_CHECK=True a reduced list is used;
# otherwise the full list, which additionally covers ibm/16.1.1 and
# cuda/10.0.130, is used.
if [ "$SPOT_CHECK" = "True" ]; then
  # Format: (compiler module-list build-list exe-name warning-flag)
  # In spot-check mode gcc/6.4.0 is limited to the OpenMP_Serial build.
  # The build name is written inside the surrounding string: nesting a second
  # pair of double quotes would end the string early and leave the word
  # unquoted, which only works while it contains no whitespace or glob
  # characters.
  COMPILERS=("gcc/6.4.0 $BASE_MODULE_LIST OpenMP_Serial g++ $GCC_WARNING_FLAGS"
             "gcc/7.2.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
             "ibm/16.1.0 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
             "cuda/9.2.88 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
  )
else
  # Format: (compiler module-list build-list exe-name warning-flag)
  COMPILERS=("gcc/6.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
             "gcc/7.2.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
             "ibm/16.1.0 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
             "ibm/16.1.1 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
             "cuda/9.2.88 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
             "cuda/10.0.130 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
  )
fi
if [ -z "$ARCH_FLAG" ]; then
ARCH_FLAG="--arch=Power8,Kepler37"
@ -323,6 +340,7 @@ elif [ "$MACHINE" = "apollo" ]; then
BASE_MODULE_LIST="sems-env,kokkos-env,sems-<COMPILER_NAME>/<COMPILER_VERSION>,kokkos-hwloc/1.10.1/base"
CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base"
CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base"
CUDA10_MODULE_LIST="sems-env,kokkos-env,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base"
CLANG_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,<COMPILER_NAME>/<COMPILER_VERSION>,cuda/9.0.69"
CLANG7_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,<COMPILER_NAME>/<COMPILER_VERSION>,cuda/9.1"
@ -344,6 +362,7 @@ elif [ "$MACHINE" = "apollo" ]; then
else
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("cuda/9.1 $CUDA8_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
"cuda/10.0 $CUDA10_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
"clang/6.0 $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS"
"clang/7.0 $CLANG7_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS"
"clang/3.9.0 $CLANG_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS"
@ -629,6 +648,8 @@ single_build_and_test() {
local cxxflags="${cxxflags} ${CXX_FLAGS_EXTRA}"
local ldflags="${ldflags} ${LD_FLAGS_EXTRA}"
local cxx_standard="${CXX_STANDARD}"
if [[ "$KOKKOS_CUDA_OPTIONS" != "" ]]; then
local extra_args="$extra_args $KOKKOS_CUDA_OPTIONS"
fi
@ -650,7 +671,7 @@ single_build_and_test() {
run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; }
fi
else
run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --ldflags=\"$ldflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
local -i build_start_time=$(date +%s)
run_cmd make -j 48 build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
local -i build_end_time=$(date +%s)