forked from lijiext/lammps
Update Kokkos library
This commit is contained in:
parent
d6eaf73db1
commit
64834e4a3d
|
@ -6,16 +6,16 @@ ifndef KOKKOS_PATH
|
|||
endif
|
||||
CXXFLAGS=$(CCFLAGS)
|
||||
|
||||
# Options: Cuda,ROCm,OpenMP,Pthread,Qthreads,Serial
|
||||
# Options: Cuda,ROCm,OpenMP,Pthreads,Qthreads,Serial
|
||||
KOKKOS_DEVICES ?= "OpenMP"
|
||||
#KOKKOS_DEVICES ?= "Pthread"
|
||||
#KOKKOS_DEVICES ?= "Pthreads"
|
||||
# Options:
|
||||
# Intel: KNC,KNL,SNB,HSW,BDW,SKX
|
||||
# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72
|
||||
# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75
|
||||
# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2
|
||||
# IBM: BGQ,Power7,Power8,Power9
|
||||
# AMD-GPUS: Kaveri,Carrizo,Fiji,Vega
|
||||
# AMD-CPUS: AMDAVX,Ryzen,Epyc
|
||||
# AMD-CPUS: AMDAVX,Ryzen,EPYC
|
||||
KOKKOS_ARCH ?= ""
|
||||
# Options: yes,no
|
||||
KOKKOS_DEBUG ?= "no"
|
||||
|
@ -224,7 +224,7 @@ ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
|||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
|
||||
KOKKOS_INTERNAL_CXX11_FLAG := -std=c++11
|
||||
#KOKKOS_INTERNAL_CXX14_FLAG := -std=c++14
|
||||
KOKKOS_INTERNAL_CXX14_FLAG := -std=c++14
|
||||
KOKKOS_INTERNAL_CXX1Y_FLAG := -std=c++1y
|
||||
#KOKKOS_INTERNAL_CXX17_FLAG := -std=c++17
|
||||
#KOKKOS_INTERNAL_CXX1Z_FLAG := -std=c++1Z
|
||||
|
@ -276,6 +276,7 @@ KOKKOS_INTERNAL_USE_ARCH_PASCAL61 := $(call kokkos_has_string,$(KOKKOS_ARCH),Pas
|
|||
KOKKOS_INTERNAL_USE_ARCH_PASCAL60 := $(call kokkos_has_string,$(KOKKOS_ARCH),Pascal60)
|
||||
KOKKOS_INTERNAL_USE_ARCH_VOLTA70 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volta70)
|
||||
KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volta72)
|
||||
KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75)
|
||||
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
|
||||
|
@ -284,6 +285,7 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE
|
|||
+ $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA70) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53))
|
||||
|
@ -300,6 +302,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
|
|||
+ $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA70) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53))
|
||||
|
@ -331,7 +334,7 @@ KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_
|
|||
# AMD based.
|
||||
KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(call kokkos_has_string,$(KOKKOS_ARCH),AMDAVX)
|
||||
KOKKOS_INTERNAL_USE_ARCH_RYZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Ryzen)
|
||||
KOKKOS_INTERNAL_USE_ARCH_EPYC := $(call kokkos_has_string,$(KOKKOS_ARCH),Epyc)
|
||||
KOKKOS_INTERNAL_USE_ARCH_EPYC := $(call kokkos_has_string,$(KOKKOS_ARCH),EPYC)
|
||||
KOKKOS_INTERNAL_USE_ARCH_KAVERI := $(call kokkos_has_string,$(KOKKOS_ARCH),Kaveri)
|
||||
KOKKOS_INTERNAL_USE_ARCH_CARRIZO := $(call kokkos_has_string,$(KOKKOS_ARCH),Carrizo)
|
||||
KOKKOS_INTERNAL_USE_ARCH_FIJI := $(call kokkos_has_string,$(KOKKOS_ARCH),Fiji)
|
||||
|
@ -341,12 +344,12 @@ KOKKOS_INTERNAL_USE_ARCH_GFX901 := $(call kokkos_has_string,$(KOKKOS_ARCH),gfx90
|
|||
# Any AVX?
|
||||
KOKKOS_INTERNAL_USE_ARCH_SSE42 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_EPYC))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNL))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SKX))
|
||||
|
||||
# Decide what ISA level we are able to support.
|
||||
KOKKOS_INTERNAL_USE_ISA_X86_64 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX))
|
||||
KOKKOS_INTERNAL_USE_ISA_X86_64 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX) + $(KOKKOS_INTERNAL_USE_ARCH_EPYC))
|
||||
KOKKOS_INTERNAL_USE_ISA_KNC := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNC))
|
||||
KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER8) + $(KOKKOS_INTERNAL_USE_ARCH_POWER9))
|
||||
KOKKOS_INTERNAL_USE_ISA_POWERPCBE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER7))
|
||||
|
@ -658,6 +661,19 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV81), 1)
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_EPYC), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_AMD_EPYC")
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_AMD_AVX2")
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
|
||||
KOKKOS_CXXFLAGS += -mavx2
|
||||
KOKKOS_LDFLAGS += -mavx2
|
||||
else
|
||||
KOKKOS_CXXFLAGS += -march=znver1 -mtune=znver1
|
||||
KOKKOS_LDFLAGS += -march=znver1 -mtune=znver1
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ARMV80")
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ARMV8_THUNDERX")
|
||||
|
@ -950,6 +966,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
|||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_VOLTA72")
|
||||
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_72
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_TURING75), 1)
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_TURING")
|
||||
tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_TURING75")
|
||||
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_75
|
||||
endif
|
||||
|
||||
ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
|
||||
|
|
|
@ -73,6 +73,8 @@ For specifics see the LICENSE file contained in the repository or distribution.
|
|||
* NVCC 7.5 for CUDA (with gcc 4.8.4)
|
||||
* NVCC 8.0.44 for CUDA (with gcc 5.3.0)
|
||||
* NVCC 9.1 for CUDA (with gcc 6.1.0)
|
||||
* NVCC 9.2 for CUDA (with gcc 7.2.0)
|
||||
* NVCC 10.0 for CUDA (with gcc 7.4.0)
|
||||
|
||||
### Primary tested compilers on Power 8 are:
|
||||
* GCC 6.4.0 (OpenMP,Serial)
|
||||
|
@ -109,7 +111,7 @@ GCC: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits
|
|||
-Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized
|
||||
Intel: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized
|
||||
Clang: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized
|
||||
NVCC: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized
|
||||
NVCC: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized
|
||||
|
||||
Other compilers are tested occasionally, in particular when pushing from develop to
|
||||
master branch, without -Werror and only for a select set of backends.
|
||||
|
|
|
@ -308,6 +308,16 @@ do
|
|||
shift
|
||||
done
|
||||
|
||||
#Check if nvcc exists
|
||||
if [ $host_only -ne 1 ]; then
|
||||
var=$(which nvcc )
|
||||
if [ $? -gt 0 ]; then
|
||||
echo "Could not find nvcc in PATH"
|
||||
exit $?
|
||||
fi
|
||||
fi
|
||||
|
||||
|
||||
# Only print host compiler version
|
||||
if [ $get_host_version -eq 1 ]; then
|
||||
$host_compiler --version
|
||||
|
|
|
@ -104,6 +104,7 @@ list(APPEND KOKKOS_ARCH_LIST
|
|||
Pascal61 # (GPU) NVIDIA Pascal generation CC 6.1
|
||||
Volta70 # (GPU) NVIDIA Volta generation CC 7.0
|
||||
Volta72 # (GPU) NVIDIA Volta generation CC 7.2
|
||||
Turing75 # (GPU) NVIDIA Turing generation CC 7.5
|
||||
)
|
||||
|
||||
# List of possible device architectures.
|
||||
|
|
|
@ -832,16 +832,14 @@ void
|
|||
deep_copy (DualView<DT,DL,DD,DM> dst, // trust me, this must not be a reference
|
||||
const DualView<ST,SL,SD,SM>& src )
|
||||
{
|
||||
if(src.modified_flags.data()==NULL || dst.modified_flags.data()==NULL) {
|
||||
return deep_copy(dst.d_view, src.d_view);
|
||||
}
|
||||
if (src.modified_flags(1) >= src.modified_flags(0)) {
|
||||
deep_copy (dst.d_view, src.d_view);
|
||||
dst.template modify<typename DualView<DT,DL,DD,DM>::device_type> ();
|
||||
} else {
|
||||
if ( src.need_sync_device() ) {
|
||||
deep_copy (dst.h_view, src.h_view);
|
||||
dst.template modify<typename DualView<DT,DL,DD,DM>::host_mirror_space> ();
|
||||
dst.modify_host();
|
||||
}
|
||||
else {
|
||||
deep_copy (dst.d_view, src.d_view);
|
||||
dst.modify_device();
|
||||
}
|
||||
}
|
||||
|
||||
template< class ExecutionSpace ,
|
||||
|
@ -852,15 +850,12 @@ deep_copy (const ExecutionSpace& exec ,
|
|||
DualView<DT,DL,DD,DM> dst, // trust me, this must not be a reference
|
||||
const DualView<ST,SL,SD,SM>& src )
|
||||
{
|
||||
if(src.modified_flags.data()==NULL || dst.modified_flags.data()==NULL) {
|
||||
return deep_copy(exec, dst.d_view, src.d_view);
|
||||
}
|
||||
if (src.modified_flags(1) >= src.modified_flags(0)) {
|
||||
deep_copy (exec, dst.d_view, src.d_view);
|
||||
dst.template modify<typename DualView<DT,DL,DD,DM>::device_type> ();
|
||||
} else {
|
||||
if ( src.need_sync_device() ) {
|
||||
deep_copy (exec, dst.h_view, src.h_view);
|
||||
dst.template modify<typename DualView<DT,DL,DD,DM>::host_mirror_space> ();
|
||||
dst.modify_host();
|
||||
} else {
|
||||
deep_copy (exec, dst.d_view, src.d_view);
|
||||
dst.modify_device();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -368,8 +368,8 @@ public:
|
|||
enum { is_assignable = is_assignable_value_type &&
|
||||
is_assignable_layout };
|
||||
|
||||
typedef ViewMapping< DstTraits , void > DstType ;
|
||||
typedef ViewMapping< SrcTraits , void > SrcType ;
|
||||
typedef ViewMapping< DstTraits , typename DstTraits::specialize > DstType ;
|
||||
typedef ViewMapping< SrcTraits , typename SrcTraits::specialize > SrcType ;
|
||||
|
||||
template < typename DT , typename ... DP , typename ST , typename ... SP >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
|
@ -432,7 +432,7 @@ public:
|
|||
|
||||
|
||||
private:
|
||||
typedef Kokkos::Impl::ViewMapping< traits , void > map_type ;
|
||||
typedef Kokkos::Impl::ViewMapping< traits , typename traits::specialize > map_type ;
|
||||
typedef Kokkos::Impl::SharedAllocationTracker track_type ;
|
||||
|
||||
track_type m_track ;
|
||||
|
@ -567,11 +567,11 @@ public:
|
|||
// Allow specializations to query their specialized map
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const Kokkos::Impl::ViewMapping< traits , void > &
|
||||
const Kokkos::Impl::ViewMapping< traits , typename traits::specialize > &
|
||||
implementation_map() const { return m_map ; }
|
||||
#endif
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const Kokkos::Impl::ViewMapping< traits , void > &
|
||||
const Kokkos::Impl::ViewMapping< traits , typename traits::specialize > &
|
||||
impl_map() const { return m_map ; }
|
||||
|
||||
//----------------------------------------
|
||||
|
@ -952,7 +952,7 @@ public:
|
|||
, m_rank(rhs.m_rank)
|
||||
{
|
||||
typedef typename DynRankView<RT,RP...> ::traits SrcTraits ;
|
||||
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
|
||||
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , typename traits::specialize > Mapping ;
|
||||
static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
|
||||
Mapping::assign( m_map , rhs.m_map , rhs.m_track );
|
||||
}
|
||||
|
@ -962,7 +962,7 @@ public:
|
|||
DynRankView & operator = (const DynRankView<RT,RP...> & rhs )
|
||||
{
|
||||
typedef typename DynRankView<RT,RP...> ::traits SrcTraits ;
|
||||
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
|
||||
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , typename traits::specialize > Mapping ;
|
||||
static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
|
||||
Mapping::assign( m_map , rhs.m_map , rhs.m_track );
|
||||
m_track.assign( rhs.m_track , traits::is_managed );
|
||||
|
@ -980,7 +980,7 @@ public:
|
|||
{
|
||||
typedef typename View<RT,RP...>::traits SrcTraits ;
|
||||
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , Kokkos::Impl::ViewToDynRankViewTag > Mapping ;
|
||||
static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
|
||||
static_assert( Mapping::is_assignable , "Incompatible View to DynRankView copy construction" );
|
||||
Mapping::assign( *this , rhs );
|
||||
}
|
||||
|
||||
|
@ -1432,7 +1432,7 @@ public:
|
|||
, Args ... args )
|
||||
{
|
||||
|
||||
typedef ViewMapping< traits_type, void > DstType ;
|
||||
typedef ViewMapping< traits_type, typename traits_type::specialize > DstType ;
|
||||
|
||||
typedef typename std::conditional< (rank==0) , ViewDimension<>
|
||||
, typename std::conditional< (rank==1) , ViewDimension<0>
|
||||
|
|
|
@ -101,13 +101,98 @@ namespace Impl {
|
|||
result = run_me< Kokkos::DualView<Scalar**,Kokkos::LayoutLeft,Device> >(size,3);
|
||||
}
|
||||
|
||||
};
|
||||
};
|
||||
|
||||
template < typename Scalar, class ViewType >
|
||||
struct SumViewEntriesFunctor {
|
||||
|
||||
typedef Scalar value_type;
|
||||
|
||||
ViewType fv;
|
||||
|
||||
SumViewEntriesFunctor ( const ViewType & fv_ ) : fv(fv_) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() ( const int i , value_type & total ) const {
|
||||
for ( size_t j = 0; j < fv.extent(1); ++j ) {
|
||||
total += fv(i,j);
|
||||
}
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
||||
template <typename Scalar, class Device>
|
||||
struct test_dual_view_deep_copy
|
||||
{
|
||||
typedef Scalar scalar_type;
|
||||
typedef Device execution_space;
|
||||
|
||||
template <typename ViewType>
|
||||
void run_me() {
|
||||
|
||||
const unsigned int n = 10;
|
||||
const unsigned int m = 5;
|
||||
const unsigned int sum_total = n * m;
|
||||
|
||||
ViewType a("A",n,m);
|
||||
ViewType b("B",n,m);
|
||||
|
||||
Kokkos::deep_copy( a.d_view , 1 );
|
||||
|
||||
a.template modify<typename ViewType::execution_space>();
|
||||
a.template sync<typename ViewType::host_mirror_space>();
|
||||
|
||||
// Check device view is initialized as expected
|
||||
scalar_type a_d_sum = 0;
|
||||
// Execute on the execution_space associated with t_dev's memory space
|
||||
typedef typename ViewType::t_dev::memory_space::execution_space t_dev_exec_space;
|
||||
Kokkos::parallel_reduce( Kokkos::RangePolicy<t_dev_exec_space>(0,n), SumViewEntriesFunctor<scalar_type, typename ViewType::t_dev>(a.d_view), a_d_sum );
|
||||
ASSERT_EQ(a_d_sum, sum_total);
|
||||
|
||||
// Check host view is synced as expected
|
||||
scalar_type a_h_sum = 0;
|
||||
for ( size_t i = 0; i < a.h_view.extent(0); ++i )
|
||||
for ( size_t j = 0; j < a.h_view.extent(1); ++j ) {
|
||||
a_h_sum += a.h_view(i,j);
|
||||
}
|
||||
|
||||
ASSERT_EQ(a_h_sum, sum_total);
|
||||
|
||||
|
||||
// Test deep_copy
|
||||
Kokkos::deep_copy( b, a );
|
||||
b.template sync<typename ViewType::host_mirror_space>();
|
||||
|
||||
// Perform same checks on b as done on a
|
||||
// Check device view is initialized as expected
|
||||
scalar_type b_d_sum = 0;
|
||||
// Execute on the execution_space associated with t_dev's memory space
|
||||
Kokkos::parallel_reduce( Kokkos::RangePolicy<t_dev_exec_space>(0,n), SumViewEntriesFunctor<scalar_type, typename ViewType::t_dev>(b.d_view), b_d_sum );
|
||||
ASSERT_EQ(b_d_sum, sum_total);
|
||||
|
||||
// Check host view is synced as expected
|
||||
scalar_type b_h_sum = 0;
|
||||
for ( size_t i = 0; i < b.h_view.extent(0); ++i )
|
||||
for ( size_t j = 0; j < b.h_view.extent(1); ++j ) {
|
||||
b_h_sum += b.h_view(i,j);
|
||||
}
|
||||
|
||||
ASSERT_EQ(b_h_sum, sum_total);
|
||||
|
||||
} // end run_me
|
||||
|
||||
test_dual_view_deep_copy()
|
||||
{
|
||||
run_me< Kokkos::DualView<Scalar**,Kokkos::LayoutLeft,Device> >();
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
|
||||
|
||||
|
||||
|
||||
template <typename Scalar, typename Device>
|
||||
void test_dualview_combinations(unsigned int size)
|
||||
{
|
||||
|
@ -116,10 +201,21 @@ void test_dualview_combinations(unsigned int size)
|
|||
|
||||
}
|
||||
|
||||
template <typename Scalar, typename Device>
|
||||
void test_dualview_deep_copy()
|
||||
{
|
||||
Impl::test_dual_view_deep_copy<Scalar,Device> ();
|
||||
}
|
||||
|
||||
TEST_F( TEST_CATEGORY, dualview_combination) {
|
||||
test_dualview_combinations<int,TEST_EXECSPACE>(10);
|
||||
}
|
||||
|
||||
TEST_F( TEST_CATEGORY, dualview_deep_copy) {
|
||||
test_dualview_deep_copy<int,TEST_EXECSPACE>();
|
||||
test_dualview_deep_copy<double,TEST_EXECSPACE>();
|
||||
}
|
||||
|
||||
|
||||
} // namespace Test
|
||||
|
||||
|
|
|
@ -829,7 +829,8 @@ void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink) {
|
|||
}
|
||||
if(bytes > current_size) {
|
||||
current_size = bytes;
|
||||
ptr = Kokkos::kokkos_realloc<Kokkos::CudaSpace>(ptr,current_size);
|
||||
Kokkos::kokkos_free<Kokkos::CudaSpace>(ptr);
|
||||
ptr = Kokkos::kokkos_malloc<Kokkos::CudaSpace>("CudaSpace::ScratchMemory",current_size);
|
||||
}
|
||||
if((bytes < current_size) && (force_shrink)) {
|
||||
current_size = bytes;
|
||||
|
|
|
@ -561,7 +561,11 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef KOKKOS_ENABLE_PRE_CUDA_10_DEPRECATION_API
|
||||
cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
|
||||
#else
|
||||
cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
|
||||
#endif
|
||||
|
||||
// Init the array for used for arbitrarily sized atomics
|
||||
Impl::initialize_host_cuda_lock_arrays();
|
||||
|
|
|
@ -525,6 +525,7 @@ public:
|
|||
inline
|
||||
void execute() const
|
||||
{
|
||||
if(m_rp.m_num_tiles==0) return;
|
||||
const array_index_type maxblocks = static_cast<array_index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
|
||||
if ( RP::rank == 2 )
|
||||
{
|
||||
|
@ -685,7 +686,7 @@ public:
|
|||
typename Policy::member_type( kokkos_impl_cuda_shared_memory<void>()
|
||||
, m_shmem_begin
|
||||
, m_shmem_size
|
||||
, (void*) ( ((char*)m_scratch_ptr[1]) + threadid/(blockDim.x*blockDim.y) * m_scratch_size[1])
|
||||
, (void*) ( ((char*)m_scratch_ptr[1]) + ptrdiff_t(threadid/(blockDim.x*blockDim.y)) * m_scratch_size[1])
|
||||
, m_scratch_size[1]
|
||||
, league_rank
|
||||
, m_league_size ) );
|
||||
|
@ -1336,7 +1337,7 @@ public:
|
|||
( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
|
||||
, m_shmem_begin
|
||||
, m_shmem_size
|
||||
, (void*) ( ((char*)m_scratch_ptr[1]) + threadid/(blockDim.x*blockDim.y) * m_scratch_size[1])
|
||||
, (void*) ( ((char*)m_scratch_ptr[1]) + ptrdiff_t(threadid/(blockDim.x*blockDim.y)) * m_scratch_size[1])
|
||||
, m_scratch_size[1]
|
||||
, league_rank
|
||||
, m_league_size )
|
||||
|
@ -1378,7 +1379,7 @@ public:
|
|||
( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
|
||||
, m_shmem_begin
|
||||
, m_shmem_size
|
||||
, (void*) ( ((char*)m_scratch_ptr[1]) + threadid/(blockDim.x*blockDim.y) * m_scratch_size[1])
|
||||
, (void*) ( ((char*)m_scratch_ptr[1]) + ptrdiff_t(threadid/(blockDim.x*blockDim.y)) * m_scratch_size[1])
|
||||
, m_scratch_size[1]
|
||||
, league_rank
|
||||
, m_league_size )
|
||||
|
@ -2064,7 +2065,7 @@ private:
|
|||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(MASK);
|
||||
#else
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP_MASK;
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP;
|
||||
#endif
|
||||
if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } // Protect against large scan values.
|
||||
|
||||
|
@ -2291,7 +2292,7 @@ private:
|
|||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(MASK);
|
||||
#else
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP_MASK;
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP;
|
||||
#endif
|
||||
if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } // Protect against large scan values.
|
||||
|
||||
|
|
|
@ -321,7 +321,7 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
|
|||
unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK;
|
||||
int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
|
||||
#else
|
||||
int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
|
||||
int active = KOKKOS_IMPL_CUDA_BALLOT(1);
|
||||
#endif
|
||||
if (int(blockDim.x*blockDim.y) > 2) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 2,32);
|
||||
|
@ -331,7 +331,7 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
|
|||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
|
||||
#else
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT(1);
|
||||
#endif
|
||||
if (int(blockDim.x*blockDim.y) > 4) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 4,32);
|
||||
|
@ -341,7 +341,7 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
|
|||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
|
||||
#else
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT(1);
|
||||
#endif
|
||||
if (int(blockDim.x*blockDim.y) > 8) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 8,32);
|
||||
|
@ -351,7 +351,7 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
|
|||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
|
||||
#else
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT(1);
|
||||
#endif
|
||||
if (int(blockDim.x*blockDim.y) > 16) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 16,32);
|
||||
|
@ -361,7 +361,7 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
|
|||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
|
||||
#else
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT(1);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
@ -506,7 +506,7 @@ cuda_inter_block_reduction( const ReducerType& reducer,
|
|||
unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK;
|
||||
int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
|
||||
#else
|
||||
int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
|
||||
int active = KOKKOS_IMPL_CUDA_BALLOT(1);
|
||||
#endif
|
||||
if (int(blockDim.x*blockDim.y) > 2) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 2,32);
|
||||
|
@ -516,7 +516,7 @@ cuda_inter_block_reduction( const ReducerType& reducer,
|
|||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
|
||||
#else
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT(1);
|
||||
#endif
|
||||
if (int(blockDim.x*blockDim.y) > 4) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 4,32);
|
||||
|
@ -526,7 +526,7 @@ cuda_inter_block_reduction( const ReducerType& reducer,
|
|||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
|
||||
#else
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT(1);
|
||||
#endif
|
||||
if (int(blockDim.x*blockDim.y) > 8) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 8,32);
|
||||
|
@ -536,7 +536,7 @@ cuda_inter_block_reduction( const ReducerType& reducer,
|
|||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
|
||||
#else
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT(1);
|
||||
#endif
|
||||
if (int(blockDim.x*blockDim.y) > 16) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 16,32);
|
||||
|
@ -546,7 +546,7 @@ cuda_inter_block_reduction( const ReducerType& reducer,
|
|||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
|
||||
#else
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
|
||||
active += KOKKOS_IMPL_CUDA_BALLOT(1);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
@ -578,7 +578,7 @@ struct CudaReductionsFunctor<FunctorType, ArgTag, false, true> {
|
|||
const int width, // How much of the warp participates
|
||||
Scalar& result)
|
||||
{
|
||||
unsigned mask = width==32?0xffffffff:((1<<width)-1)<<((threadIdx.y*blockDim.x+threadIdx.x)%(32/width))*width;
|
||||
unsigned mask = width==32?0xffffffff:((1<<width)-1)<<((threadIdx.y*blockDim.x+threadIdx.x)/width)*width;
|
||||
for(int delta=skip_vector?blockDim.x:1; delta<width; delta*=2) {
|
||||
Scalar tmp;
|
||||
cuda_shfl_down(tmp,value,delta,width,mask);
|
||||
|
@ -683,7 +683,7 @@ struct CudaReductionsFunctor<FunctorType, ArgTag, false, false> {
|
|||
const int width) // How much of the warp participates
|
||||
{
|
||||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
unsigned mask = width==32?0xffffffff:((1<<width)-1)<<((threadIdx.y*blockDim.x+threadIdx.x)%(32/width))*width;
|
||||
unsigned mask = width==32?0xffffffff:((1<<width)-1)<<((threadIdx.y*blockDim.x+threadIdx.x)/width)*width;
|
||||
#endif
|
||||
const int lane_id = (threadIdx.y*blockDim.x+threadIdx.x)%32;
|
||||
for(int delta=skip_vector?blockDim.x:1; delta<width; delta*=2) {
|
||||
|
@ -693,7 +693,7 @@ struct CudaReductionsFunctor<FunctorType, ArgTag, false, false> {
|
|||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(mask);
|
||||
#else
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP_MASK;
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP;
|
||||
#endif
|
||||
}
|
||||
*value=*(value-lane_id);
|
||||
|
@ -779,7 +779,7 @@ struct CudaReductionsFunctor<FunctorType, ArgTag, false, false> {
|
|||
/*
|
||||
* Algorithmic constraints:
|
||||
* (a) blockDim.y is a power of two
|
||||
* (b) blockDim.y <= 512
|
||||
* (b) blockDim.y <= 1024
|
||||
* (c) blockDim.x == blockDim.z == 1
|
||||
*/
|
||||
|
||||
|
@ -828,14 +828,26 @@ void cuda_intra_block_reduce_scan( const FunctorType & functor ,
|
|||
{ // Inter-warp reduce-scan by a single warp to avoid extra synchronizations
|
||||
const unsigned rtid_inter = ( threadIdx.y ^ BlockSizeMask ) << CudaTraits::WarpIndexShift ;
|
||||
|
||||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
unsigned inner_mask = KOKKOS_IMPL_CUDA_BALLOT_MASK(0xffffffff,(rtid_inter<blockDim.y));
|
||||
#endif
|
||||
if ( rtid_inter < blockDim.y ) {
|
||||
|
||||
const pointer_type tdata_inter = base_data + value_count * ( rtid_inter ^ BlockSizeMask );
|
||||
|
||||
if ( (1<<5) < BlockSizeMask ) { BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,5) }
|
||||
if ( (1<<6) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,6) }
|
||||
if ( (1<<7) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,7) }
|
||||
if ( (1<<8) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,8) }
|
||||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
if ( (1<<5) < BlockSizeMask ) { KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,5) }
|
||||
if ( (1<<6) < BlockSizeMask ) { KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,6) }
|
||||
if ( (1<<7) < BlockSizeMask ) { KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,7) }
|
||||
if ( (1<<8) < BlockSizeMask ) { KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,8) }
|
||||
if ( (1<<9) < BlockSizeMask ) { KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,9) }
|
||||
#else
|
||||
if ( (1<<5) < BlockSizeMask ) { KOKKOS_IMPL_CUDA_SYNCWARP; BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,5) }
|
||||
if ( (1<<6) < BlockSizeMask ) { KOKKOS_IMPL_CUDA_SYNCWARP; BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,6) }
|
||||
if ( (1<<7) < BlockSizeMask ) { KOKKOS_IMPL_CUDA_SYNCWARP; BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,7) }
|
||||
if ( (1<<8) < BlockSizeMask ) { KOKKOS_IMPL_CUDA_SYNCWARP; BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,8) }
|
||||
if ( (1<<9) < BlockSizeMask ) { KOKKOS_IMPL_CUDA_SYNCWARP; BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,9) }
|
||||
#endif
|
||||
|
||||
if ( DoScan ) {
|
||||
|
||||
|
@ -846,10 +858,17 @@ void cuda_intra_block_reduce_scan( const FunctorType & functor ,
|
|||
|
||||
if ( ! ( rtid_inter + n < blockDim.y ) ) n = 0 ;
|
||||
|
||||
__threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,8)
|
||||
__threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,7)
|
||||
__threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,6)
|
||||
__threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,5)
|
||||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); BLOCK_SCAN_STEP(tdata_inter,n,8)
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); BLOCK_SCAN_STEP(tdata_inter,n,7)
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); BLOCK_SCAN_STEP(tdata_inter,n,6)
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); BLOCK_SCAN_STEP(tdata_inter,n,5)
|
||||
#else
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP; BLOCK_SCAN_STEP(tdata_inter,n,8)
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP; BLOCK_SCAN_STEP(tdata_inter,n,7)
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP; BLOCK_SCAN_STEP(tdata_inter,n,6)
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP; BLOCK_SCAN_STEP(tdata_inter,n,5)
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -864,19 +883,17 @@ void cuda_intra_block_reduce_scan( const FunctorType & functor ,
|
|||
( rtid_intra & 16 ) ? 16 : 0 ))));
|
||||
|
||||
if ( ! ( rtid_intra + n < blockDim.y ) ) n = 0 ;
|
||||
#ifdef KOKKOS_IMPL_CUDA_CLANG_WORKAROUND
|
||||
BLOCK_SCAN_STEP(tdata_intra,n,4) __syncthreads();//__threadfence_block();
|
||||
BLOCK_SCAN_STEP(tdata_intra,n,3) __syncthreads();//__threadfence_block();
|
||||
BLOCK_SCAN_STEP(tdata_intra,n,2) __syncthreads();//__threadfence_block();
|
||||
BLOCK_SCAN_STEP(tdata_intra,n,1) __syncthreads();//__threadfence_block();
|
||||
BLOCK_SCAN_STEP(tdata_intra,n,0) __syncthreads();
|
||||
#else
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP;
|
||||
BLOCK_SCAN_STEP(tdata_intra,n,4) __threadfence_block();
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP;
|
||||
BLOCK_SCAN_STEP(tdata_intra,n,3) __threadfence_block();
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP;
|
||||
BLOCK_SCAN_STEP(tdata_intra,n,2) __threadfence_block();
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP;
|
||||
BLOCK_SCAN_STEP(tdata_intra,n,1) __threadfence_block();
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP;
|
||||
BLOCK_SCAN_STEP(tdata_intra,n,0) __threadfence_block();
|
||||
#endif
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP;
|
||||
}
|
||||
|
||||
#undef BLOCK_SCAN_STEP
|
||||
|
|
|
@ -290,7 +290,7 @@ public:
|
|||
// Intra vector lane shuffle reduction:
|
||||
typename ReducerType::value_type tmp ( reducer.reference() );
|
||||
|
||||
unsigned mask = blockDim.x==32?0xffffffff:((1<<blockDim.x)-1)<<(threadIdx.y%(32/blockDim.x))*blockDim.x;
|
||||
unsigned mask = blockDim.x==32?0xffffffff:((1<<blockDim.x)-1)<<((threadIdx.y%(32/blockDim.x))*blockDim.x);
|
||||
|
||||
for ( int i = blockDim.x ; ( i >>= 1 ) ; ) {
|
||||
cuda_shfl_down( reducer.reference() , tmp , i , blockDim.x , mask );
|
||||
|
@ -742,7 +742,7 @@ void parallel_for
|
|||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(blockDim.x==32?0xffffffff:((1<<blockDim.x)-1)<<(threadIdx.y%(32/blockDim.x))*blockDim.x);
|
||||
#else
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP_MASK;
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
@ -915,7 +915,7 @@ void single(const Impl::VectorSingleStruct<Impl::CudaTeamMember>& , const Functo
|
|||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(blockDim.x==32?0xffffffff:((1<<blockDim.x)-1)<<(threadIdx.y%(32/blockDim.x))*blockDim.x);
|
||||
#else
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP_MASK;
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
@ -928,7 +928,7 @@ void single(const Impl::ThreadSingleStruct<Impl::CudaTeamMember>& , const Functo
|
|||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(blockDim.x==32?0xffffffff:((1<<blockDim.x)-1)<<(threadIdx.y%(32/blockDim.x))*blockDim.x);
|
||||
#else
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP_MASK;
|
||||
KOKKOS_IMPL_CUDA_SYNCWARP;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
@ -938,7 +938,7 @@ KOKKOS_INLINE_FUNCTION
|
|||
void single(const Impl::VectorSingleStruct<Impl::CudaTeamMember>& , const FunctorType& lambda, ValueType& val) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
if(threadIdx.x == 0) lambda(val);
|
||||
unsigned mask = blockDim.x==32?0xffffffff:((1<<blockDim.x)-1)<<(threadIdx.y%(32/blockDim.x))*blockDim.x;
|
||||
unsigned mask = blockDim.x==32?0xffffffff:((1<<blockDim.x)-1)<<((threadIdx.y%(32/blockDim.x))*blockDim.x);
|
||||
Impl::cuda_shfl(val,val,0,blockDim.x,mask);
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -4,9 +4,9 @@
|
|||
#if ( CUDA_VERSION < 9000 )
|
||||
#define KOKKOS_IMPL_CUDA_ACTIVEMASK 0
|
||||
#define KOKKOS_IMPL_CUDA_SYNCWARP __threadfence_block()
|
||||
#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK __threadfence_block()
|
||||
#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(m) if(m)__threadfence_block()
|
||||
#define KOKKOS_IMPL_CUDA_BALLOT(x) __ballot(x)
|
||||
#define KOKKOS_IMPL_CUDA_BALLOT_MASK(x) __ballot(x)
|
||||
#define KOKKOS_IMPL_CUDA_BALLOT_MASK(m,x) __ballot(x)
|
||||
#define KOKKOS_IMPL_CUDA_SHFL(x,y,z) __shfl(x,y,z)
|
||||
#define KOKKOS_IMPL_CUDA_SHFL_MASK(m,x,y,z) __shfl(x,y,z)
|
||||
#define KOKKOS_IMPL_CUDA_SHFL_UP(x,y,z) __shfl_up(x,y,z)
|
||||
|
@ -16,7 +16,7 @@
|
|||
#else
|
||||
#define KOKKOS_IMPL_CUDA_ACTIVEMASK __activemask()
|
||||
#define KOKKOS_IMPL_CUDA_SYNCWARP __syncwarp(0xffffffff)
|
||||
#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(m) __syncwarp(m);
|
||||
#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(m) __syncwarp(m)
|
||||
#define KOKKOS_IMPL_CUDA_BALLOT(x) __ballot_sync(__activemask(),x)
|
||||
#define KOKKOS_IMPL_CUDA_BALLOT_MASK(m,x) __ballot_sync(m,x)
|
||||
#define KOKKOS_IMPL_CUDA_SHFL(x,y,z) __shfl_sync(0xffffffff,x,y,z)
|
||||
|
@ -29,9 +29,9 @@
|
|||
#else
|
||||
#define KOKKOS_IMPL_CUDA_ACTIVEMASK 0
|
||||
#define KOKKOS_IMPL_CUDA_SYNCWARP
|
||||
#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK
|
||||
#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(m) (void)m
|
||||
#define KOKKOS_IMPL_CUDA_BALLOT(x) 0
|
||||
#define KOKKOS_IMPL_CUDA_BALLOT_MASK(x) 0
|
||||
#define KOKKOS_IMPL_CUDA_BALLOT_MASK(m,x) 0
|
||||
#define KOKKOS_IMPL_CUDA_SHFL(x,y,z) 0
|
||||
#define KOKKOS_IMPL_CUDA_SHFL_MASK(m,x,y,z) 0
|
||||
#define KOKKOS_IMPL_CUDA_SHFL_UP(x,y,z) 0
|
||||
|
|
|
@ -1401,7 +1401,33 @@ void deep_copy
|
|||
typedef typename src_type::memory_space src_memory_space ;
|
||||
typedef typename dst_type::value_type dst_value_type ;
|
||||
typedef typename src_type::value_type src_value_type ;
|
||||
if(dst.data() == NULL && src.data() == NULL) {
|
||||
if(dst.data() == NULL || src.data() == NULL) {
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
// do nothing
|
||||
#else
|
||||
// throw if dimension mismatch
|
||||
if ( (src.extent(0) != dst.extent(0)) ||
|
||||
(src.extent(1) != dst.extent(1)) ||
|
||||
(src.extent(2) != dst.extent(2)) ||
|
||||
(src.extent(3) != dst.extent(3)) ||
|
||||
(src.extent(4) != dst.extent(4)) ||
|
||||
(src.extent(5) != dst.extent(5)) ||
|
||||
(src.extent(6) != dst.extent(6)) ||
|
||||
(src.extent(7) != dst.extent(7))
|
||||
) {
|
||||
std::string message("Deprecation Error: Kokkos::deep_copy extents of views don't match: ");
|
||||
message += dst.label(); message += "(";
|
||||
for(int r = 0; r<dst_type::Rank-1; r++)
|
||||
{ message+= std::to_string(dst.extent(r)); message += ","; }
|
||||
message+= std::to_string(dst.extent(dst_type::Rank-1)); message += ") ";
|
||||
message += src.label(); message += "(";
|
||||
for(int r = 0; r<src_type::Rank-1; r++)
|
||||
{ message+= std::to_string(src.extent(r)); message += ","; }
|
||||
message+= std::to_string(src.extent(src_type::Rank-1)); message += ") ";
|
||||
|
||||
Kokkos::Impl::throw_runtime_exception(message);
|
||||
}
|
||||
#endif
|
||||
Kokkos::fence();
|
||||
return;
|
||||
}
|
||||
|
@ -1646,7 +1672,33 @@ void deep_copy
|
|||
typedef typename dst_type::value_type dst_value_type ;
|
||||
typedef typename src_type::value_type src_value_type ;
|
||||
|
||||
if(dst.data() == NULL && src.data() == NULL) {
|
||||
if(dst.data() == NULL || src.data() == NULL) {
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
// do nothing
|
||||
#else
|
||||
// throw if dimension mismatch
|
||||
if ( (src.extent(0) != dst.extent(0)) ||
|
||||
(src.extent(1) != dst.extent(1)) ||
|
||||
(src.extent(2) != dst.extent(2)) ||
|
||||
(src.extent(3) != dst.extent(3)) ||
|
||||
(src.extent(4) != dst.extent(4)) ||
|
||||
(src.extent(5) != dst.extent(5)) ||
|
||||
(src.extent(6) != dst.extent(6)) ||
|
||||
(src.extent(7) != dst.extent(7))
|
||||
) {
|
||||
std::string message("Deprecation Error: Kokkos::deep_copy extents of views don't match: ");
|
||||
message += dst.label(); message += "(";
|
||||
for(int r = 0; r<dst_type::Rank-1; r++)
|
||||
{ message+= std::to_string(dst.extent(r)); message += ","; }
|
||||
message+= std::to_string(dst.extent(dst_type::Rank-1)); message += ") ";
|
||||
message += src.label(); message += "(";
|
||||
for(int r = 0; r<src_type::Rank-1; r++)
|
||||
{ message+= std::to_string(src.extent(r)); message += ","; }
|
||||
message+= std::to_string(src.extent(src_type::Rank-1)); message += ") ";
|
||||
|
||||
Kokkos::Impl::throw_runtime_exception(message);
|
||||
}
|
||||
#endif
|
||||
exec_space.fence();
|
||||
return;
|
||||
}
|
||||
|
|
|
@ -100,32 +100,27 @@ public:
|
|||
row_map_type row_map;
|
||||
entries_type entries;
|
||||
|
||||
//! Construct an empty view.
|
||||
Crs() : row_map(), entries() {}
|
||||
|
||||
//! Copy constructor (shallow copy).
|
||||
Crs(const Crs& rhs) : row_map(rhs.row_map), entries(rhs.entries)
|
||||
{}
|
||||
/*
|
||||
* Default Constructors, operators and destructor
|
||||
*/
|
||||
KOKKOS_FUNCTION Crs() = default;
|
||||
KOKKOS_FUNCTION Crs(Crs const &) = default;
|
||||
KOKKOS_FUNCTION Crs(Crs &&) = default;
|
||||
KOKKOS_FUNCTION Crs& operator=(Crs const &) = default;
|
||||
KOKKOS_FUNCTION Crs& operator=(Crs &&) = default;
|
||||
KOKKOS_FUNCTION ~Crs() = default;
|
||||
|
||||
/** \brief Assign to a view of the rhs array.
|
||||
* If the old view is the last view
|
||||
* then allocated memory is deallocated.
|
||||
*/
|
||||
template<class EntriesType, class RowMapType>
|
||||
Crs(const RowMapType& row_map_, const EntriesType& entries_) : row_map(row_map_), entries(entries_)
|
||||
{}
|
||||
|
||||
/** \brief Assign to a view of the rhs array.
|
||||
* If the old view is the last view
|
||||
* then allocated memory is deallocated.
|
||||
*/
|
||||
Crs& operator= (const Crs& rhs) {
|
||||
row_map = rhs.row_map;
|
||||
entries = rhs.entries;
|
||||
return *this;
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Crs(const RowMapType& row_map_, const EntriesType& entries_)
|
||||
: row_map(row_map_), entries(entries_)
|
||||
{
|
||||
}
|
||||
|
||||
/** \brief Destroy this view of the array.
|
||||
* If the last view then allocated memory is deallocated.
|
||||
*/
|
||||
~Crs() {}
|
||||
|
||||
/** \brief Return number of rows in the graph
|
||||
*/
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
|
|
|
@ -170,6 +170,10 @@
|
|||
// see https://github.com/kokkos/kokkos/issues/1470
|
||||
#define KOKKOS_CUDA_9_DEFAULTED_BUG_WORKAROUND
|
||||
#endif
|
||||
|
||||
#if ( 10000 > CUDA_VERSION )
|
||||
#define KOKKOS_ENABLE_PRE_CUDA_10_DEPRECATION_API
|
||||
#endif
|
||||
#endif // #if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
|
|
@ -505,7 +505,7 @@ public:
|
|||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
value_type& reference() {
|
||||
value_type& reference() const {
|
||||
return *value;
|
||||
}
|
||||
|
||||
|
@ -559,7 +559,7 @@ public:
|
|||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
value_type& reference() {
|
||||
value_type& reference() const {
|
||||
return *value;
|
||||
}
|
||||
|
||||
|
@ -637,7 +637,7 @@ public:
|
|||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
value_type& reference() {
|
||||
value_type& reference() const {
|
||||
return *value;
|
||||
}
|
||||
|
||||
|
@ -727,7 +727,7 @@ public:
|
|||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
value_type& reference() {
|
||||
value_type& reference() const {
|
||||
return *value;
|
||||
}
|
||||
|
||||
|
|
|
@ -198,6 +198,7 @@ struct ViewTraits< void >
|
|||
typedef void HostMirrorSpace ;
|
||||
typedef void array_layout ;
|
||||
typedef void memory_traits ;
|
||||
typedef void specialize ;
|
||||
};
|
||||
|
||||
template< class ... Prop >
|
||||
|
@ -209,6 +210,7 @@ struct ViewTraits< void , void , Prop ... >
|
|||
typedef typename ViewTraits<void,Prop...>::HostMirrorSpace HostMirrorSpace ;
|
||||
typedef typename ViewTraits<void,Prop...>::array_layout array_layout ;
|
||||
typedef typename ViewTraits<void,Prop...>::memory_traits memory_traits ;
|
||||
typedef typename ViewTraits<void,Prop...>::specialize specialize ;
|
||||
};
|
||||
|
||||
template< class ArrayLayout , class ... Prop >
|
||||
|
@ -221,6 +223,7 @@ struct ViewTraits< typename std::enable_if< Kokkos::Impl::is_array_layout<ArrayL
|
|||
typedef typename ViewTraits<void,Prop...>::HostMirrorSpace HostMirrorSpace ;
|
||||
typedef ArrayLayout array_layout ;
|
||||
typedef typename ViewTraits<void,Prop...>::memory_traits memory_traits ;
|
||||
typedef typename ViewTraits<void,Prop...>::specialize specialize ;
|
||||
};
|
||||
|
||||
template< class Space , class ... Prop >
|
||||
|
@ -239,6 +242,7 @@ struct ViewTraits< typename std::enable_if< Kokkos::Impl::is_space<Space>::value
|
|||
typedef typename Kokkos::Impl::HostMirror< Space >::Space HostMirrorSpace ;
|
||||
typedef typename execution_space::array_layout array_layout ;
|
||||
typedef typename ViewTraits<void,Prop...>::memory_traits memory_traits ;
|
||||
typedef typename ViewTraits<void,Prop...>::specialize specialize ;
|
||||
};
|
||||
|
||||
template< class MemoryTraits , class ... Prop >
|
||||
|
@ -257,6 +261,7 @@ struct ViewTraits< typename std::enable_if< Kokkos::Impl::is_memory_traits<Memor
|
|||
typedef void HostMirrorSpace ;
|
||||
typedef void array_layout ;
|
||||
typedef MemoryTraits memory_traits ;
|
||||
typedef void specialize ;
|
||||
};
|
||||
|
||||
|
||||
|
@ -335,7 +340,12 @@ public:
|
|||
|
||||
typedef ArrayLayout array_layout ;
|
||||
typedef typename data_analysis::dimension dimension ;
|
||||
typedef typename data_analysis::specialize specialize /* mapping specialization tag */ ;
|
||||
|
||||
typedef typename std::conditional<
|
||||
std::is_same<typename data_analysis::specialize,void>::value
|
||||
,typename prop::specialize
|
||||
,typename data_analysis::specialize>::type
|
||||
specialize ; /* mapping specialization tag */
|
||||
|
||||
enum { rank = dimension::rank };
|
||||
enum { rank_dynamic = dimension::rank_dynamic };
|
||||
|
@ -542,7 +552,7 @@ public:
|
|||
|
||||
private:
|
||||
|
||||
typedef Kokkos::Impl::ViewMapping< traits , void > map_type ;
|
||||
typedef Kokkos::Impl::ViewMapping< traits , typename traits::specialize > map_type ;
|
||||
typedef Kokkos::Impl::SharedAllocationTracker track_type ;
|
||||
|
||||
track_type m_track ;
|
||||
|
@ -608,13 +618,18 @@ public:
|
|||
template< typename iType >
|
||||
KOKKOS_INLINE_FUNCTION constexpr
|
||||
typename std::enable_if< std::is_integral<iType>::value , size_t >::type
|
||||
extent( const iType & r ) const
|
||||
extent( const iType & r ) const noexcept
|
||||
{ return m_map.extent(r); }
|
||||
|
||||
static KOKKOS_INLINE_FUNCTION constexpr
|
||||
size_t
|
||||
static_extent( const unsigned r ) noexcept
|
||||
{ return map_type::static_extent(r); }
|
||||
|
||||
template< typename iType >
|
||||
KOKKOS_INLINE_FUNCTION constexpr
|
||||
typename std::enable_if< std::is_integral<iType>::value , int >::type
|
||||
extent_int( const iType & r ) const
|
||||
extent_int( const iType & r ) const noexcept
|
||||
{ return static_cast<int>(m_map.extent(r)); }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION constexpr
|
||||
|
@ -709,11 +724,11 @@ public:
|
|||
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const Kokkos::Impl::ViewMapping< traits , void > &
|
||||
const Kokkos::Impl::ViewMapping< traits , typename traits::specialize > &
|
||||
implementation_map() const { return m_map ; }
|
||||
#endif
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const Kokkos::Impl::ViewMapping< traits , void > &
|
||||
const Kokkos::Impl::ViewMapping< traits , typename traits::specialize > &
|
||||
impl_map() const { return m_map ; }
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const Kokkos::Impl::SharedAllocationTracker &
|
||||
|
@ -1955,7 +1970,7 @@ public:
|
|||
, m_map()
|
||||
{
|
||||
typedef typename View<RT,RP...>::traits SrcTraits ;
|
||||
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
|
||||
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , typename traits::specialize > Mapping ;
|
||||
static_assert( Mapping::is_assignable , "Incompatible View copy construction" );
|
||||
Mapping::assign( m_map , rhs.m_map , rhs.m_track );
|
||||
}
|
||||
|
@ -1965,7 +1980,7 @@ public:
|
|||
View & operator = ( const View<RT,RP...> & rhs )
|
||||
{
|
||||
typedef typename View<RT,RP...>::traits SrcTraits ;
|
||||
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
|
||||
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , typename traits::specialize > Mapping ;
|
||||
static_assert( Mapping::is_assignable , "Incompatible View copy assignment" );
|
||||
Mapping::assign( m_map , rhs.m_map , rhs.m_track );
|
||||
m_track.assign( rhs.m_track , traits::is_managed );
|
||||
|
@ -1992,7 +2007,7 @@ public:
|
|||
|
||||
typedef typename Mapping::type DstType ;
|
||||
|
||||
static_assert( Kokkos::Impl::ViewMapping< traits , typename DstType::traits , void >::is_assignable
|
||||
static_assert( Kokkos::Impl::ViewMapping< traits , typename DstType::traits , typename traits::specialize >::is_assignable
|
||||
, "Subview construction requires compatible view and subview arguments" );
|
||||
|
||||
Mapping::assign( m_map, src_view.m_map, arg0 , args... );
|
||||
|
@ -2266,10 +2281,10 @@ public:
|
|||
}
|
||||
template <class Traits>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
View( const track_type & track, const Kokkos::Impl::ViewMapping< Traits , void > &map ) :
|
||||
View( const track_type & track, const Kokkos::Impl::ViewMapping< Traits , typename Traits::specialize > &map ) :
|
||||
m_track(track), m_map()
|
||||
{
|
||||
typedef Kokkos::Impl::ViewMapping< traits , Traits , void > Mapping ;
|
||||
typedef Kokkos::Impl::ViewMapping< traits , Traits , typename traits::specialize > Mapping ;
|
||||
static_assert( Mapping::is_assignable , "Incompatible View copy construction" );
|
||||
Mapping::assign( m_map , map , track );
|
||||
}
|
||||
|
|
|
@ -142,14 +142,14 @@ private:
|
|||
|
||||
WorkRange range( self.m_policy , exec.pool_rank() , exec.pool_size() );
|
||||
|
||||
exec.set_work_range(range.begin(),range.end(),self.m_policy.chunk_size());
|
||||
exec.set_work_range(0,range.end()-range.begin(),self.m_policy.chunk_size());
|
||||
exec.reset_steal_target();
|
||||
exec.barrier();
|
||||
|
||||
long work_index = exec.get_work_index();
|
||||
|
||||
while(work_index != -1) {
|
||||
const Member begin = static_cast<Member>(work_index) * self.m_policy.chunk_size();
|
||||
const Member begin = static_cast<Member>(work_index) * self.m_policy.chunk_size()+range.begin();
|
||||
const Member end = begin + self.m_policy.chunk_size() < self.m_policy.end()?begin+self.m_policy.chunk_size():self.m_policy.end();
|
||||
|
||||
ParallelFor::template exec_range< WorkTag >
|
||||
|
@ -470,14 +470,14 @@ private:
|
|||
const ParallelReduce & self = * ((const ParallelReduce *) arg );
|
||||
const WorkRange range( self.m_policy, exec.pool_rank(), exec.pool_size() );
|
||||
|
||||
exec.set_work_range(range.begin(),range.end(),self.m_policy.chunk_size());
|
||||
exec.set_work_range(0,range.end()-range.begin(),self.m_policy.chunk_size());
|
||||
exec.reset_steal_target();
|
||||
exec.barrier();
|
||||
|
||||
long work_index = exec.get_work_index();
|
||||
reference_type update = ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() );
|
||||
while(work_index != -1) {
|
||||
const Member begin = static_cast<Member>(work_index) * self.m_policy.chunk_size();
|
||||
const Member begin = static_cast<Member>(work_index) * self.m_policy.chunk_size() + range.begin();
|
||||
const Member end = begin + self.m_policy.chunk_size() < self.m_policy.end()?begin+self.m_policy.chunk_size():self.m_policy.end();
|
||||
ParallelReduce::template exec_range< WorkTag >
|
||||
( self.m_functor , begin , end
|
||||
|
|
|
@ -111,7 +111,7 @@ T atomic_compare_exchange( volatile T * const dest , const T & compare ,
|
|||
unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK;
|
||||
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
|
||||
#else
|
||||
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
|
||||
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
|
||||
#endif
|
||||
unsigned int done_active = 0;
|
||||
while (active!=done_active) {
|
||||
|
@ -127,7 +127,7 @@ T atomic_compare_exchange( volatile T * const dest , const T & compare ,
|
|||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,done);
|
||||
#else
|
||||
done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(done);
|
||||
done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
|
||||
#endif
|
||||
}
|
||||
return return_val;
|
||||
|
@ -308,6 +308,16 @@ T atomic_compare_exchange( volatile T * const dest_v, const T compare, const T v
|
|||
#endif
|
||||
#endif // !defined ROCM_ATOMICS
|
||||
|
||||
// dummy for non-CUDA Kokkos headers being processed by NVCC
|
||||
#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA)
|
||||
template <typename T>
|
||||
__inline__ __device__
|
||||
T atomic_compare_exchange(volatile T * const, const Kokkos::Impl::identity_t<T>, const Kokkos::Impl::identity_t<T>)
|
||||
{
|
||||
return T();
|
||||
}
|
||||
#endif
|
||||
|
||||
template <typename T>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool atomic_compare_exchange_strong(volatile T* const dest, const T compare, const T val)
|
||||
|
|
|
@ -134,7 +134,7 @@ T atomic_exchange( volatile T * const dest ,
|
|||
unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK;
|
||||
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
|
||||
#else
|
||||
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
|
||||
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
|
||||
#endif
|
||||
unsigned int done_active = 0;
|
||||
while (active!=done_active) {
|
||||
|
@ -149,7 +149,7 @@ T atomic_exchange( volatile T * const dest ,
|
|||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,done);
|
||||
#else
|
||||
done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(done);
|
||||
done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
|
||||
#endif
|
||||
}
|
||||
return return_val;
|
||||
|
@ -418,6 +418,23 @@ void atomic_assign( volatile T * const dest_v , const T val )
|
|||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// dummy for non-CUDA Kokkos headers being processed by NVCC
|
||||
#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA)
|
||||
template <typename T>
|
||||
__inline__ __device__
|
||||
T atomic_exchange(volatile T * const, const Kokkos::Impl::identity_t<T>)
|
||||
{
|
||||
return T();
|
||||
}
|
||||
|
||||
template < typename T >
|
||||
__inline__ __device__
|
||||
void atomic_assign(volatile T * const, const Kokkos::Impl::identity_t<T>)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif
|
||||
|
|
|
@ -147,7 +147,7 @@ T atomic_fetch_add( volatile T * const dest ,
|
|||
unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK;
|
||||
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
|
||||
#else
|
||||
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
|
||||
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
|
||||
#endif
|
||||
unsigned int done_active = 0;
|
||||
while (active!=done_active) {
|
||||
|
@ -164,7 +164,7 @@ T atomic_fetch_add( volatile T * const dest ,
|
|||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,done);
|
||||
#else
|
||||
done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(done);
|
||||
done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
|
||||
#endif
|
||||
}
|
||||
return return_val;
|
||||
|
@ -384,6 +384,15 @@ T atomic_fetch_add( volatile T * const dest_v , typename std::add_const<T>::type
|
|||
#endif // !defined ROCM_ATOMICS
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
// dummy for non-CUDA Kokkos headers being processed by NVCC
|
||||
#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA)
|
||||
template< typename T >
|
||||
__inline__ __device__
|
||||
T atomic_fetch_add(volatile T* const, Kokkos::Impl::identity_t<T>) {
|
||||
return T();
|
||||
}
|
||||
#endif
|
||||
|
||||
// Simpler version of atomic_fetch_add without the fetch
|
||||
template <typename T>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
|
|
|
@ -149,6 +149,15 @@ T atomic_fetch_and( volatile T * const dest_v , const T val )
|
|||
#endif
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
// dummy for non-CUDA Kokkos headers being processed by NVCC
|
||||
#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA)
|
||||
template< typename T >
|
||||
__inline__ __device__
|
||||
T atomic_fetch_and(volatile T* const, Kokkos::Impl::identity_t<T>) {
|
||||
return T();
|
||||
}
|
||||
#endif
|
||||
|
||||
// Simpler version of atomic_fetch_and without the fetch
|
||||
template <typename T>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
|
|
|
@ -149,6 +149,15 @@ T atomic_fetch_or( volatile T * const dest_v , const T val )
|
|||
#endif
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
// dummy for non-CUDA Kokkos headers being processed by NVCC
|
||||
#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA)
|
||||
template< typename T >
|
||||
__inline__ __device__
|
||||
T atomic_fetch_or(volatile T* const, Kokkos::Impl::identity_t<T>) {
|
||||
return T();
|
||||
}
|
||||
#endif
|
||||
|
||||
// Simpler version of atomic_fetch_or without the fetch
|
||||
template <typename T>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
|
|
|
@ -139,7 +139,7 @@ T atomic_fetch_sub( volatile T * const dest ,
|
|||
unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK;
|
||||
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
|
||||
#else
|
||||
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
|
||||
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
|
||||
#endif
|
||||
unsigned int done_active = 0;
|
||||
while (active!=done_active) {
|
||||
|
@ -154,7 +154,7 @@ T atomic_fetch_sub( volatile T * const dest ,
|
|||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,done);
|
||||
#else
|
||||
done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(done);
|
||||
done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
|
||||
#endif
|
||||
}
|
||||
return return_val;
|
||||
|
@ -304,6 +304,15 @@ T atomic_fetch_sub( volatile T * const dest_v , const T val )
|
|||
#endif
|
||||
#endif // !defined ROCM_ATOMICS
|
||||
|
||||
// dummy for non-CUDA Kokkos headers being processed by NVCC
|
||||
#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA)
|
||||
template< typename T >
|
||||
__inline__ __device__
|
||||
T atomic_fetch_sub(volatile T* const, Kokkos::Impl::identity_t<T>) {
|
||||
return T();
|
||||
}
|
||||
#endif
|
||||
|
||||
// Simpler version of atomic_fetch_sub without the fetch
|
||||
template <typename T>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
|
|
|
@ -230,9 +230,6 @@ T atomic_fetch_oper( const Oper& op, volatile T * const dest ,
|
|||
typename Kokkos::Impl::enable_if<
|
||||
( sizeof(T) != 4 )
|
||||
&& ( sizeof(T) != 8 )
|
||||
#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
|
||||
&& ( sizeof(T) != 16 )
|
||||
#endif
|
||||
, const T >::type val )
|
||||
{
|
||||
|
||||
|
@ -250,7 +247,7 @@ T atomic_fetch_oper( const Oper& op, volatile T * const dest ,
|
|||
unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK;
|
||||
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
|
||||
#else
|
||||
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
|
||||
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
|
||||
#endif
|
||||
unsigned int done_active = 0;
|
||||
while (active!=done_active) {
|
||||
|
@ -265,7 +262,7 @@ T atomic_fetch_oper( const Oper& op, volatile T * const dest ,
|
|||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,done);
|
||||
#else
|
||||
done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(done);
|
||||
done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
|
||||
#endif
|
||||
}
|
||||
return return_val;
|
||||
|
@ -298,7 +295,7 @@ T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
|
|||
unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK;
|
||||
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
|
||||
#else
|
||||
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
|
||||
unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
|
||||
#endif
|
||||
unsigned int done_active = 0;
|
||||
while (active!=done_active) {
|
||||
|
@ -313,7 +310,7 @@ T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
|
|||
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
|
||||
done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,done);
|
||||
#else
|
||||
done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(done);
|
||||
done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
|
||||
#endif
|
||||
}
|
||||
return return_val;
|
||||
|
|
|
@ -49,6 +49,7 @@
|
|||
#include <sstream>
|
||||
#include <cstdlib>
|
||||
#include <stack>
|
||||
#include <cerrno>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
@ -70,7 +71,6 @@ bool is_unsigned_int(const char* str)
|
|||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void initialize_internal(const InitArguments& args)
|
||||
{
|
||||
// This is an experimental setting
|
||||
|
@ -99,6 +99,7 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
|
|||
if (use_gpu < 0 && ndevices >= 0) {
|
||||
auto local_rank_str = std::getenv("OMPI_COMM_WORLD_LOCAL_RANK"); //OpenMPI
|
||||
if (!local_rank_str) local_rank_str = std::getenv("MV2_COMM_WORLD_LOCAL_RANK"); //MVAPICH2
|
||||
if (!local_rank_str) local_rank_str = std::getenv("SLURM_LOCALID"); //SLURM
|
||||
if (local_rank_str) {
|
||||
auto local_rank = std::atoi(local_rank_str);
|
||||
use_gpu = local_rank % ndevices;
|
||||
|
@ -532,6 +533,85 @@ void initialize(int& narg, char* arg[])
|
|||
iarg++;
|
||||
}
|
||||
|
||||
//Read environment variables
|
||||
char * endptr;
|
||||
auto env_num_threads_str = std::getenv("KOKKOS_NUM_THREADS");
|
||||
if (env_num_threads_str!=nullptr) {
|
||||
errno = 0;
|
||||
auto env_num_threads = std::strtol(env_num_threads_str,&endptr,10);
|
||||
if (endptr== env_num_threads_str)
|
||||
Impl::throw_runtime_exception("Error: cannot convert KOKKOS_NUM_THREADS to an integer. Raised by Kokkos::initialize(int narg, char* argc[]).");
|
||||
if (errno == ERANGE)
|
||||
Impl::throw_runtime_exception("Error: KOKKOS_NUM_THREADS out of range of representable values by an integer. Raised by Kokkos::initialize(int narg, char* argc[]).");
|
||||
if ((num_threads != -1)&&(env_num_threads!=num_threads))
|
||||
Impl::throw_runtime_exception("Error: expecting a match between --kokkos-threads and KOKKOS_NUM_THREADS if both are set. Raised by Kokkos::initialize(int narg, char* argc[]).");
|
||||
else
|
||||
num_threads = env_num_threads;
|
||||
}
|
||||
auto env_numa_str = std::getenv("KOKKOS_NUMA");
|
||||
if (env_numa_str!=nullptr) {
|
||||
errno = 0;
|
||||
auto env_numa = std::strtol(env_numa_str,&endptr,10);
|
||||
if (endptr== env_numa_str)
|
||||
Impl::throw_runtime_exception("Error: cannot convert KOKKOS_NUMA to an integer. Raised by Kokkos::initialize(int narg, char* argc[]).");
|
||||
if (errno == ERANGE)
|
||||
Impl::throw_runtime_exception("Error: KOKKOS_NUMA out of range of representable values by an integer. Raised by Kokkos::initialize(int narg, char* argc[]).");
|
||||
if ((numa != -1)&&(env_numa!=numa))
|
||||
Impl::throw_runtime_exception("Error: expecting a match between --kokkos-numa and KOKKOS_NUMA if both are set. Raised by Kokkos::initialize(int narg, char* argc[]).");
|
||||
else
|
||||
numa = env_numa;
|
||||
}
|
||||
auto env_device_str = std::getenv("KOKKOS_DEVICE_ID");
|
||||
if (env_device_str!=nullptr) {
|
||||
errno = 0;
|
||||
auto env_device = std::strtol(env_device_str,&endptr,10);
|
||||
if (endptr== env_device_str)
|
||||
Impl::throw_runtime_exception("Error: cannot convert KOKKOS_DEVICE_ID to an integer. Raised by Kokkos::initialize(int narg, char* argc[]).");
|
||||
if (errno == ERANGE)
|
||||
Impl::throw_runtime_exception("Error: KOKKOS_DEVICE_ID out of range of representable values by an integer. Raised by Kokkos::initialize(int narg, char* argc[]).");
|
||||
if ((device != -1)&&(env_device!=device))
|
||||
Impl::throw_runtime_exception("Error: expecting a match between --kokkos-device and KOKKOS_DEVICE_ID if both are set. Raised by Kokkos::initialize(int narg, char* argc[]).");
|
||||
else
|
||||
device = env_device;
|
||||
}
|
||||
auto env_ndevices_str = std::getenv("KOKKOS_NUM_DEVICES");
|
||||
if (env_ndevices_str!=nullptr) {
|
||||
errno = 0;
|
||||
auto env_ndevices = std::strtol(env_ndevices_str,&endptr,10);
|
||||
if (endptr== env_ndevices_str)
|
||||
Impl::throw_runtime_exception("Error: cannot convert KOKKOS_NUM_DEVICES to an integer. Raised by Kokkos::initialize(int narg, char* argc[]).");
|
||||
if (errno == ERANGE)
|
||||
Impl::throw_runtime_exception("Error: KOKKOS_NUM_DEVICES out of range of representable values by an integer. Raised by Kokkos::initialize(int narg, char* argc[]).");
|
||||
if ((ndevices != -1)&&(env_ndevices!=ndevices))
|
||||
Impl::throw_runtime_exception("Error: expecting a match between --kokkos-ndevices and KOKKOS_NUM_DEVICES if both are set. Raised by Kokkos::initialize(int narg, char* argc[]).");
|
||||
else
|
||||
ndevices = env_ndevices;
|
||||
//Skip device
|
||||
auto env_skip_device_str = std::getenv("KOKKOS_SKIP_DEVICE");
|
||||
if (env_skip_device_str!=nullptr) {
|
||||
errno = 0;
|
||||
auto env_skip_device = std::strtol(env_skip_device_str,&endptr,10);
|
||||
if (endptr== env_skip_device_str)
|
||||
Impl::throw_runtime_exception("Error: cannot convert KOKKOS_SKIP_DEVICE to an integer. Raised by Kokkos::initialize(int narg, char* argc[]).");
|
||||
if (errno == ERANGE)
|
||||
Impl::throw_runtime_exception("Error: KOKKOS_SKIP_DEVICE out of range of representable values by an integer. Raised by Kokkos::initialize(int narg, char* argc[]).");
|
||||
if ((skip_device != 9999)&&(env_skip_device!=skip_device))
|
||||
Impl::throw_runtime_exception("Error: expecting a match between --kokkos-ndevices and KOKKOS_SKIP_DEVICE if both are set. Raised by Kokkos::initialize(int narg, char* argc[]).");
|
||||
else
|
||||
skip_device = env_skip_device;
|
||||
}
|
||||
}
|
||||
char * env_disablewarnings_str = std::getenv("KOKKOS_DISABLE_WARNINGS");
|
||||
if (env_disablewarnings_str!=nullptr) {
|
||||
std::string env_str (env_disablewarnings_str); // deep-copies string
|
||||
for (char& c : env_str) { c = toupper (c); }
|
||||
if ((env_str == "TRUE") || (env_str == "ON") || (env_str == "1"))
|
||||
disable_warnings = true;
|
||||
else
|
||||
if (disable_warnings)
|
||||
Impl::throw_runtime_exception("Error: expecting a match between --kokkos-disable-warnings and KOKKOS_DISABLE_WARNINGS if both are set. Raised by Kokkos::initialize(int narg, char* argc[]).");
|
||||
}
|
||||
|
||||
InitArguments arguments;
|
||||
arguments.num_threads = num_threads;
|
||||
arguments.num_numa = numa;
|
||||
|
|
|
@ -409,6 +409,9 @@ struct inclusive_scan_integer_sequence
|
|||
static constexpr value_type value = helper::value ;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
using identity_t = T;
|
||||
|
||||
}} // namespace Kokkos::Impl
|
||||
|
||||
|
||||
|
|
|
@ -103,13 +103,7 @@ namespace Impl {
|
|||
|
||||
/** \brief View mapping for non-specialized data type and standard layout */
|
||||
template< class Traits >
|
||||
class ViewMapping< Traits ,
|
||||
typename std::enable_if<(
|
||||
std::is_same< typename Traits::specialize , Kokkos::Array<> >::value &&
|
||||
( std::is_same< typename Traits::array_layout , Kokkos::LayoutLeft >::value ||
|
||||
std::is_same< typename Traits::array_layout , Kokkos::LayoutRight >::value ||
|
||||
std::is_same< typename Traits::array_layout , Kokkos::LayoutStride >::value )
|
||||
)>::type >
|
||||
class ViewMapping< Traits , Kokkos::Array<> >
|
||||
{
|
||||
private:
|
||||
|
||||
|
@ -345,64 +339,6 @@ public:
|
|||
}
|
||||
};
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
/** \brief Assign compatible default mappings */
|
||||
|
||||
template< class DstTraits , class SrcTraits >
|
||||
class ViewMapping< DstTraits , SrcTraits ,
|
||||
typename std::enable_if<(
|
||||
std::is_same< typename DstTraits::memory_space , typename SrcTraits::memory_space >::value
|
||||
&&
|
||||
std::is_same< typename DstTraits::specialize , Kokkos::Array<> >::value
|
||||
&&
|
||||
(
|
||||
std::is_same< typename DstTraits::array_layout , Kokkos::LayoutLeft >::value ||
|
||||
std::is_same< typename DstTraits::array_layout , Kokkos::LayoutRight >::value ||
|
||||
std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value
|
||||
)
|
||||
&&
|
||||
std::is_same< typename SrcTraits::specialize , Kokkos::Array<> >::value
|
||||
&&
|
||||
(
|
||||
std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value ||
|
||||
std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value ||
|
||||
std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutStride >::value
|
||||
)
|
||||
)>::type >
|
||||
{
|
||||
public:
|
||||
|
||||
enum { is_assignable = true };
|
||||
|
||||
typedef Kokkos::Impl::SharedAllocationTracker TrackType ;
|
||||
typedef ViewMapping< DstTraits , void > DstType ;
|
||||
typedef ViewMapping< SrcTraits , void > SrcType ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static void assign( DstType & dst , const SrcType & src , const TrackType & src_track )
|
||||
{
|
||||
static_assert( std::is_same< typename DstTraits::value_type , typename SrcTraits::value_type >::value ||
|
||||
std::is_same< typename DstTraits::value_type , typename SrcTraits::const_value_type >::value
|
||||
, "View assignment must have same value type or const = non-const" );
|
||||
|
||||
static_assert( ViewDimensionAssignable< typename DstTraits::dimension , typename SrcTraits::dimension >::value
|
||||
, "View assignment must have compatible dimensions" );
|
||||
|
||||
static_assert( std::is_same< typename DstTraits::array_layout , typename SrcTraits::array_layout >::value ||
|
||||
std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value ||
|
||||
( DstTraits::dimension::rank == 0 ) ||
|
||||
( DstTraits::dimension::rank == 1 && DstTraits::dimension::rank_dynamic == 1 )
|
||||
, "View assignment must have compatible layout or have rank <= 1" );
|
||||
|
||||
typedef typename DstType::offset_type dst_offset_type ;
|
||||
|
||||
dst.m_impl_offset = dst_offset_type( src.m_impl_offset );
|
||||
dst.m_impl_handle = src.m_impl_handle ;
|
||||
dst.m_stride = src.m_stride ;
|
||||
}
|
||||
};
|
||||
|
||||
/** \brief Assign Array to non-Array */
|
||||
|
||||
template< class DstTraits , class SrcTraits >
|
||||
|
@ -436,7 +372,7 @@ public:
|
|||
|
||||
typedef Kokkos::Impl::SharedAllocationTracker TrackType ;
|
||||
typedef ViewMapping< DstTraits , void > DstType ;
|
||||
typedef ViewMapping< SrcTraits , void > SrcType ;
|
||||
typedef ViewMapping< SrcTraits , Kokkos::Array<> > SrcType ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static void assign( DstType & dst , const SrcType & src , const TrackType & src_track )
|
||||
|
@ -480,6 +416,7 @@ public:
|
|||
}
|
||||
};
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
|
|
@ -195,7 +195,7 @@ struct ViewDimension
|
|||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
constexpr size_t extent( const unsigned r ) const
|
||||
constexpr size_t extent( const unsigned r ) const noexcept
|
||||
{
|
||||
return r == 0 ? N0 : (
|
||||
r == 1 ? N1 : (
|
||||
|
@ -207,6 +207,19 @@ struct ViewDimension
|
|||
r == 7 ? N7 : 0 )))))));
|
||||
}
|
||||
|
||||
static KOKKOS_INLINE_FUNCTION
|
||||
constexpr size_t static_extent( const unsigned r ) noexcept
|
||||
{
|
||||
return r == 0 ? ArgN0 : (
|
||||
r == 1 ? ArgN1 : (
|
||||
r == 2 ? ArgN2 : (
|
||||
r == 3 ? ArgN3 : (
|
||||
r == 4 ? ArgN4 : (
|
||||
r == 5 ? ArgN5 : (
|
||||
r == 6 ? ArgN6 : (
|
||||
r == 7 ? ArgN7 : 0 )))))));
|
||||
}
|
||||
|
||||
template< size_t N >
|
||||
struct prepend { typedef ViewDimension< N , Vals... > type ; };
|
||||
|
||||
|
@ -2640,6 +2653,12 @@ public:
|
|||
KOKKOS_INLINE_FUNCTION constexpr size_t extent( const iType & r ) const
|
||||
{ return m_impl_offset.m_dim.extent(r); }
|
||||
|
||||
static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( const unsigned r ) noexcept
|
||||
{
|
||||
using dim_type = typename offset_type::dimension_type;
|
||||
return dim_type::static_extent(r);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION constexpr
|
||||
typename Traits::array_layout layout() const
|
||||
{ return m_impl_offset.layout(); }
|
||||
|
|
|
@ -63,6 +63,86 @@ struct CountFillFunctor {
|
|||
}
|
||||
};
|
||||
|
||||
/* RunUpdateCrsTest
|
||||
* 4 test cases:
|
||||
* 1. use member object version which is constructed directly using the copy constructor
|
||||
* 2. excplicity copy construct in local variable
|
||||
* 3. construct default and assign to input object
|
||||
* 4. construct object from views
|
||||
*/
|
||||
template< class CrsType, class ExecSpace, class scalarType >
|
||||
struct RunUpdateCrsTest {
|
||||
|
||||
struct TestOne {};
|
||||
struct TestTwo {};
|
||||
struct TestThree {};
|
||||
struct TestFour {};
|
||||
|
||||
CrsType graph;
|
||||
RunUpdateCrsTest( CrsType g_in ) : graph(g_in)
|
||||
{
|
||||
}
|
||||
|
||||
void run_test(int nTest) {
|
||||
switch (nTest)
|
||||
{
|
||||
case 1:
|
||||
parallel_for ("TestCrs1", Kokkos::RangePolicy<ExecSpace, TestOne>(0,graph.numRows()),*this);
|
||||
break;
|
||||
case 2:
|
||||
parallel_for ("TestCrs2", Kokkos::RangePolicy<ExecSpace, TestTwo>(0,graph.numRows()),*this);
|
||||
break;
|
||||
case 3:
|
||||
parallel_for ("TestCrs3", Kokkos::RangePolicy<ExecSpace, TestThree>(0,graph.numRows()),*this);
|
||||
break;
|
||||
case 4:
|
||||
parallel_for ("TestCrs4", Kokkos::RangePolicy<ExecSpace, TestFour>(0,graph.numRows()),*this);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void updateGraph(const CrsType & g_in, const scalarType row) const {
|
||||
auto row_map = g_in.row_map;
|
||||
auto entries = g_in.entries;
|
||||
auto j_start = row_map(row);
|
||||
auto j_end = row_map(row+1)-j_start;
|
||||
for (scalarType j = 0; j < j_end; ++j) {
|
||||
entries(j_start+j) = (j+1)*(j+1);
|
||||
}
|
||||
}
|
||||
|
||||
// Test Crs class from class member
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(const TestOne &, const scalarType row) const {
|
||||
updateGraph(graph, row);
|
||||
}
|
||||
|
||||
// Test Crs class from copy constructor (local_graph(graph)
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(const TestTwo &, const scalarType row) const {
|
||||
CrsType local_graph(graph);
|
||||
updateGraph(local_graph, row);
|
||||
}
|
||||
|
||||
// Test Crs class from default constructor assigned to function parameter
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(const TestThree &, const scalarType row) const {
|
||||
CrsType local_graph;
|
||||
local_graph = graph;
|
||||
updateGraph(local_graph, row);
|
||||
}
|
||||
|
||||
// Test Crs class from local graph constructed from row_map and entities access on input parameter)
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(const TestFour &, const scalarType row) const {
|
||||
CrsType local_graph(graph.row_map, graph.entries);
|
||||
updateGraph(local_graph, row);
|
||||
}
|
||||
};
|
||||
|
||||
template< class ExecSpace >
|
||||
void test_count_fill(std::int32_t nrows) {
|
||||
Kokkos::Crs<std::int32_t, ExecSpace, void, std::int32_t> graph;
|
||||
|
@ -81,6 +161,38 @@ void test_count_fill(std::int32_t nrows) {
|
|||
}
|
||||
}
|
||||
|
||||
// Test Crs Constructor / assignment operation by
|
||||
// using count and fill to create/populate initial graph,
|
||||
// then use parallel_for with Crs directly to update content
|
||||
// then verify results
|
||||
template< class ExecSpace >
|
||||
void test_constructor(std::int32_t nrows) {
|
||||
|
||||
for (int nTest = 1; nTest < 5; nTest++)
|
||||
{
|
||||
typedef Kokkos::Crs<std::int32_t, ExecSpace, void, std::int32_t> crs_int32;
|
||||
crs_int32 graph;
|
||||
Kokkos::count_and_fill_crs(graph, nrows, CountFillFunctor<ExecSpace>());
|
||||
ASSERT_EQ(graph.numRows(), nrows);
|
||||
|
||||
RunUpdateCrsTest<crs_int32, ExecSpace, std::int32_t> crstest(graph);
|
||||
crstest.run_test(nTest);
|
||||
|
||||
auto row_map = Kokkos::create_mirror_view(graph.row_map);
|
||||
Kokkos::deep_copy(row_map, graph.row_map);
|
||||
auto entries = Kokkos::create_mirror_view(graph.entries);
|
||||
Kokkos::deep_copy(entries, graph.entries);
|
||||
|
||||
for (std::int32_t row = 0; row < nrows; ++row) {
|
||||
auto n = (row % 4) + 1;
|
||||
ASSERT_EQ(row_map(row + 1) - row_map(row), n);
|
||||
for (std::int32_t j = 0; j < n; ++j) {
|
||||
ASSERT_EQ(entries(row_map(row) + j), (j + 1)*(j+1));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
TEST_F( TEST_CATEGORY, crs_count_fill )
|
||||
|
@ -95,4 +207,17 @@ TEST_F( TEST_CATEGORY, crs_count_fill )
|
|||
test_count_fill<TEST_EXECSPACE>(10000);
|
||||
}
|
||||
|
||||
TEST_F( TEST_CATEGORY, crs_copy_constructor )
|
||||
{
|
||||
test_constructor<TEST_EXECSPACE>(0);
|
||||
test_constructor<TEST_EXECSPACE>(1);
|
||||
test_constructor<TEST_EXECSPACE>(2);
|
||||
test_constructor<TEST_EXECSPACE>(3);
|
||||
test_constructor<TEST_EXECSPACE>(13);
|
||||
test_constructor<TEST_EXECSPACE>(100);
|
||||
test_constructor<TEST_EXECSPACE>(1000);
|
||||
test_constructor<TEST_EXECSPACE>(10000);
|
||||
}
|
||||
|
||||
|
||||
} // namespace Test
|
||||
|
|
|
@ -956,7 +956,12 @@ struct TestMDRange_3D {
|
|||
}
|
||||
, Kokkos::Min<double>(min) );
|
||||
|
||||
ASSERT_EQ( min, 8.0 );
|
||||
if((N0-1)*(N1-1)*(N2-1)>0)
|
||||
ASSERT_EQ( min, 8.0 );
|
||||
else {
|
||||
double min_identity = Kokkos::reduction_identity<double>::min();
|
||||
ASSERT_EQ( min, min_identity );
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
|
|
@ -46,8 +46,10 @@
|
|||
namespace Test {
|
||||
|
||||
TEST_F( TEST_CATEGORY , mdrange_3d) {
|
||||
TestMDRange_3D< TEST_EXECSPACE >::test_for3( 1, 10, 100 );
|
||||
TestMDRange_3D< TEST_EXECSPACE >::test_for3( 100, 10, 100 );
|
||||
#if !defined( KOKKOS_ENABLE_ROCM ) // MDRange Reduced explicitly handled in its own cpp file
|
||||
TestMDRange_3D< TEST_EXECSPACE >::test_reduce3( 1, 10, 100 );
|
||||
TestMDRange_3D< TEST_EXECSPACE >::test_reduce3( 100, 10, 100 );
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -60,8 +60,11 @@ struct TestRange {
|
|||
struct VerifyInitTag {};
|
||||
struct ResetTag {};
|
||||
struct VerifyResetTag {};
|
||||
struct OffsetTag {};
|
||||
struct VerifyOffsetTag {};
|
||||
|
||||
int N;
|
||||
int N;
|
||||
static const int offset = 13;
|
||||
TestRange( const size_t N_ )
|
||||
: m_flags( Kokkos::ViewAllocateWithoutInitializing( "flags" ), N_ ), N(N_)
|
||||
{}
|
||||
|
@ -117,6 +120,18 @@ struct TestRange {
|
|||
if ( int( 2 * i ) != host_flags( i ) ) ++error_count;
|
||||
}
|
||||
ASSERT_EQ( error_count, int( 0 ) );
|
||||
|
||||
Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace, ScheduleType, OffsetTag >( offset, N + offset ), *this );
|
||||
Kokkos::parallel_for( std::string("TestKernelFor"), Kokkos::RangePolicy<ExecSpace, ScheduleType, VerifyOffsetTag>( 0, N ), *this);
|
||||
|
||||
Kokkos::deep_copy(host_flags, m_flags);
|
||||
|
||||
error_count = 0;
|
||||
for (int i = 0; i < N; ++i) {
|
||||
if (i + offset != host_flags(i))
|
||||
++error_count;
|
||||
}
|
||||
ASSERT_EQ(error_count, int(0));
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
|
@ -144,9 +159,19 @@ struct TestRange {
|
|||
}
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(const OffsetTag &, const int i) const {
|
||||
m_flags(i - offset) = i;
|
||||
}
|
||||
|
||||
struct OffsetTag {};
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(const VerifyOffsetTag &, const int i) const {
|
||||
if (i + offset != m_flags(i)) {
|
||||
printf("TestRange::test_for error at %d != %d\n", i + offset, m_flags(i));
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
void test_reduce( )
|
||||
{
|
||||
|
@ -158,7 +183,7 @@ struct TestRange {
|
|||
// sum( 0 .. N-1 )
|
||||
ASSERT_EQ( size_t( ( N - 1 ) * ( N ) / 2 ), size_t( total ) );
|
||||
|
||||
Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace, ScheduleType, OffsetTag>( 0, N ), *this, total );
|
||||
Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace, ScheduleType, OffsetTag>( offset, N+offset ), *this, total );
|
||||
// sum( 1 .. N )
|
||||
ASSERT_EQ( size_t( ( N ) * ( N + 1 ) / 2 ), size_t( total ) );
|
||||
}
|
||||
|
@ -169,7 +194,7 @@ struct TestRange {
|
|||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()( const OffsetTag &, const int i, value_type & update ) const
|
||||
{ update += 1 + m_flags( i ); }
|
||||
{ update += 1 + m_flags( i-offset ); }
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
|
|
|
@ -532,7 +532,11 @@ struct functor_vec_single {
|
|||
typedef ExecutionSpace execution_space;
|
||||
|
||||
Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
|
||||
functor_vec_single( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
|
||||
int nStart;
|
||||
int nEnd;
|
||||
|
||||
functor_vec_single( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_, const int start_, const int end_ ) :
|
||||
flag( flag_ ), nStart(start_), nEnd(end_) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()( typename policy_type::member_type team ) const {
|
||||
|
@ -541,7 +545,7 @@ struct functor_vec_single {
|
|||
// inside a parallel_for and write to it.
|
||||
Scalar value = 0;
|
||||
|
||||
Kokkos::parallel_for( Kokkos::ThreadVectorRange( team, 0, 13 ), [&] ( int i )
|
||||
Kokkos::parallel_for( Kokkos::ThreadVectorRange( team, nStart, nEnd ), [&] ( int i )
|
||||
{
|
||||
value = i; // This write is violating Kokkos semantics for nested parallelism.
|
||||
});
|
||||
|
@ -552,12 +556,12 @@ struct functor_vec_single {
|
|||
}, value );
|
||||
|
||||
Scalar value2 = 0;
|
||||
Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( team, 0, 13 ), [&] ( int i, Scalar & val )
|
||||
Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( team, nStart, nEnd ), [&] ( int i, Scalar & val )
|
||||
{
|
||||
val += value;
|
||||
}, value2 );
|
||||
|
||||
if ( value2 != ( value * 13 ) ) {
|
||||
if ( value2 != ( value * (nEnd-nStart) ) ) {
|
||||
printf( "FAILED vector_single broadcast %i %i %f %f\n",
|
||||
team.league_rank(), team.team_rank(), (double) value2, (double) value );
|
||||
|
||||
|
@ -746,12 +750,6 @@ bool test_scalar( int nteams, int team_size, int test ) {
|
|||
functor_vec_red< Scalar, ExecutionSpace >( d_flag ) );
|
||||
}
|
||||
else if ( test == 1 ) {
|
||||
// WORKAROUND CUDA
|
||||
#if defined(KOKKOS_ENABLE_CUDA)
|
||||
#if defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) || defined(KOKKOS_ARCH_PASCAL)
|
||||
if(!std::is_same<ExecutionSpace,Kokkos::Cuda>::value)
|
||||
#endif
|
||||
#endif
|
||||
Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ),
|
||||
functor_vec_red_reducer< Scalar, ExecutionSpace >( d_flag ) );
|
||||
}
|
||||
|
@ -765,7 +763,7 @@ bool test_scalar( int nteams, int team_size, int test ) {
|
|||
}
|
||||
else if ( test == 4 ) {
|
||||
Kokkos::parallel_for( "B", Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ),
|
||||
functor_vec_single< Scalar, ExecutionSpace >( d_flag ) );
|
||||
functor_vec_single< Scalar, ExecutionSpace >( d_flag, 0, 13 ) );
|
||||
}
|
||||
else if ( test == 5 ) {
|
||||
Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size ),
|
||||
|
@ -791,6 +789,10 @@ bool test_scalar( int nteams, int team_size, int test ) {
|
|||
Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ),
|
||||
functor_team_vector_reduce_reducer< Scalar, ExecutionSpace >( d_flag ) );
|
||||
}
|
||||
else if ( test == 11 ) {
|
||||
Kokkos::parallel_for( "B", Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ),
|
||||
functor_vec_single< Scalar, ExecutionSpace >( d_flag, 4, 13 ) );
|
||||
}
|
||||
|
||||
Kokkos::deep_copy( h_flag, d_flag );
|
||||
|
||||
|
@ -938,6 +940,7 @@ TEST_F( TEST_CATEGORY, team_vector )
|
|||
ASSERT_TRUE( ( TestTeamVector::Test< TEST_EXECSPACE >( 8 ) ) );
|
||||
ASSERT_TRUE( ( TestTeamVector::Test< TEST_EXECSPACE >( 9 ) ) );
|
||||
ASSERT_TRUE( ( TestTeamVector::Test< TEST_EXECSPACE >( 10 ) ) );
|
||||
ASSERT_TRUE( ( TestTeamVector::Test< TEST_EXECSPACE >( 11 ) ) );
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
|
@ -56,17 +56,13 @@ struct TestViewCopy {
|
|||
|
||||
using InExecSpace = ExecSpace;
|
||||
|
||||
static void test_view_copy()
|
||||
static void test_view_copy(const int dim0, const int dim1, const int dim2)
|
||||
{
|
||||
#if defined( KOKKOS_ENABLE_CUDA ) || defined( KOKKOS_ENABLE_ROCM )
|
||||
// ExecSpace = CudaUVM, CudaHostPinned
|
||||
// This test will fail at runtime with an illegal memory access if something goes wrong
|
||||
// Test 1: deep_copy from host_mirror_space to ExecSpace and ExecSpace back to host_mirror_space
|
||||
{
|
||||
const int dim0 = 4;
|
||||
const int dim1 = 2;
|
||||
const int dim2 = 3;
|
||||
|
||||
typedef Kokkos::View<double****,InExecSpace> Rank4ViewType;
|
||||
Rank4ViewType view_4;
|
||||
view_4 = Rank4ViewType("view_4", dim0, dim1, dim2, dim2);
|
||||
|
@ -88,19 +84,21 @@ struct TestViewCopy {
|
|||
|
||||
// Test 2: deep_copy from Cuda to ExecSpace and ExecSpace back to Cuda
|
||||
{
|
||||
const int dim0 = 4;
|
||||
const int dim1 = 2;
|
||||
const int dim2 = 3;
|
||||
|
||||
typedef Kokkos::View<double****,InExecSpace> Rank4ViewType;
|
||||
Rank4ViewType view_4;
|
||||
view_4 = Rank4ViewType("view_4", dim0, dim1, dim2, dim2);
|
||||
|
||||
#if defined( KOKKOS_ENABLE_CUDA )
|
||||
typedef Kokkos::Cuda space_type;
|
||||
typedef typename std::conditional<
|
||||
Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaSpace,typename InExecSpace::memory_space>::accessible,
|
||||
Kokkos::CudaSpace,
|
||||
InExecSpace>::type space_type;
|
||||
#endif
|
||||
#if defined( KOKKOS_ENABLE_ROCM )
|
||||
typedef Kokkos::Experimental::ROCm space_type;
|
||||
typedef typename std::conditional<
|
||||
Kokkos::Impl::MemorySpaceAccess<Kokkos::ROCmSpace,typename InExecSpace::memory_space>::accessible,
|
||||
Kokkos::ROCmSpace,
|
||||
InExecSpace>::type space_type;
|
||||
#endif
|
||||
Kokkos::View<double**,Kokkos::LayoutLeft,space_type> srcView("srcView", dim2, dim2);
|
||||
|
||||
|
@ -118,10 +116,6 @@ struct TestViewCopy {
|
|||
|
||||
// Test 3: deep_copy from host_space to ExecSpace and ExecSpace back to host_space
|
||||
{
|
||||
const int dim0 = 4;
|
||||
const int dim1 = 2;
|
||||
const int dim2 = 3;
|
||||
|
||||
typedef Kokkos::View<double****,InExecSpace> Rank4ViewType;
|
||||
Rank4ViewType view_4;
|
||||
view_4 = Rank4ViewType("view_4", dim0, dim1, dim2, dim2);
|
||||
|
@ -149,7 +143,41 @@ struct TestViewCopy {
|
|||
|
||||
TEST_F( TEST_CATEGORY , view_copy_tests ) {
|
||||
//Only include this file to be compiled with CudaUVM and CudaHostPinned
|
||||
TestViewCopy< TEST_EXECSPACE >::test_view_copy();
|
||||
TestViewCopy< TEST_EXECSPACE >::test_view_copy(4,2,3);
|
||||
TestViewCopy< TEST_EXECSPACE >::test_view_copy(4,2,0);
|
||||
}
|
||||
|
||||
TEST_F( TEST_CATEGORY , view_copy_degenerated ) {
|
||||
//Only include this file to be compiled with CudaUVM and CudaHostPinned
|
||||
Kokkos::View<int*, Kokkos::MemoryTraits<Kokkos::Unmanaged>> v_um_def_1;
|
||||
Kokkos::View<int*, Kokkos::MemoryTraits<Kokkos::Unmanaged>> v_um_1( reinterpret_cast<int*>(-1), 0 );
|
||||
Kokkos::View<int*> v_m_def_1;
|
||||
Kokkos::View<int*> v_m_1("v_m_1", 0);
|
||||
|
||||
Kokkos::View<int*, Kokkos::MemoryTraits<Kokkos::Unmanaged>> v_um_def_2;
|
||||
Kokkos::View<int*, Kokkos::MemoryTraits<Kokkos::Unmanaged>> v_um_2( reinterpret_cast<int*>(-1), 0 );
|
||||
Kokkos::View<int*> v_m_def_2;
|
||||
Kokkos::View<int*> v_m_2("v_m_2", 0);
|
||||
|
||||
Kokkos::deep_copy(v_um_def_1, v_um_def_2);
|
||||
Kokkos::deep_copy(v_um_def_1, v_um_2);
|
||||
Kokkos::deep_copy(v_um_def_1, v_m_def_2);
|
||||
Kokkos::deep_copy(v_um_def_1, v_m_2);
|
||||
|
||||
Kokkos::deep_copy(v_um_1, v_um_def_2);
|
||||
Kokkos::deep_copy(v_um_1, v_um_2);
|
||||
Kokkos::deep_copy(v_um_1, v_m_def_2);
|
||||
Kokkos::deep_copy(v_um_1, v_m_2);
|
||||
|
||||
Kokkos::deep_copy(v_m_def_1, v_um_def_2);
|
||||
Kokkos::deep_copy(v_m_def_1, v_um_2);
|
||||
Kokkos::deep_copy(v_m_def_1, v_m_def_2);
|
||||
Kokkos::deep_copy(v_m_def_1, v_m_2);
|
||||
|
||||
Kokkos::deep_copy(v_m_1, v_um_def_2);
|
||||
Kokkos::deep_copy(v_m_1, v_um_2);
|
||||
Kokkos::deep_copy(v_m_1, v_m_def_2);
|
||||
Kokkos::deep_copy(v_m_1, v_m_2);
|
||||
}
|
||||
|
||||
} // namespace Test
|
||||
|
|
|
@ -1245,5 +1245,12 @@ TEST_F( TEST_CATEGORY , view_mapping_operator )
|
|||
test_view_mapping_operator< TEST_EXECSPACE >();
|
||||
}
|
||||
|
||||
TEST_F( TEST_CATEGORY , static_extent )
|
||||
{
|
||||
using T = Kokkos::View<double*[2][3]>;
|
||||
ASSERT_EQ( T::static_extent(1), 2 );
|
||||
ASSERT_EQ( T::static_extent(2), 3 );
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -228,6 +228,10 @@ TEST_F( cuda, uvm )
|
|||
}
|
||||
}
|
||||
|
||||
/* Removing UVM Allocs Test due to added time to complete overall unit test
|
||||
* The issue verified with this unit test appears to no longer be an
|
||||
* problem. Refer to github issue 1880 for more details
|
||||
*
|
||||
TEST_F( cuda, uvm_num_allocs )
|
||||
{
|
||||
// The max number of UVM allocations allowed is 65536.
|
||||
|
@ -288,6 +292,7 @@ TEST_F( cuda, uvm_num_allocs )
|
|||
|
||||
#undef MAX_NUM_ALLOCS
|
||||
}
|
||||
*/
|
||||
|
||||
template< class MemSpace, class ExecSpace >
|
||||
struct TestViewCudaAccessible {
|
||||
|
|
|
@ -43,3 +43,4 @@
|
|||
|
||||
#include <openmp/TestOpenMP_Category.hpp>
|
||||
#include <TestViewAPI_e.hpp>
|
||||
#include <TestViewCopy.hpp>
|
||||
|
|
|
@ -43,3 +43,5 @@
|
|||
|
||||
#include <serial/TestSerial_Category.hpp>
|
||||
#include <TestViewAPI_e.hpp>
|
||||
#include <TestViewCopy.hpp>
|
||||
|
||||
|
|
|
@ -43,3 +43,4 @@
|
|||
|
||||
#include <threads/TestThreads_Category.hpp>
|
||||
#include <TestViewAPI_e.hpp>
|
||||
#include <TestViewCopy.hpp>
|
||||
|
|
|
@ -68,6 +68,9 @@ do
|
|||
--cxxflags*)
|
||||
CXXFLAGS="${key#*=}"
|
||||
;;
|
||||
--cxxstandard*)
|
||||
KOKKOS_CXX_STANDARD="${key#*=}"
|
||||
;;
|
||||
--ldflags*)
|
||||
LDFLAGS="${key#*=}"
|
||||
;;
|
||||
|
@ -127,6 +130,7 @@ do
|
|||
echo "--arch=[OPT]: Set target architectures. Options are:"
|
||||
echo " [AMD]"
|
||||
echo " AMDAVX = AMD CPU"
|
||||
echo " EPYC = AMD EPYC Zen-Core CPU"
|
||||
echo " [ARM]"
|
||||
echo " ARMv80 = ARMv8.0 Compatible CPU"
|
||||
echo " ARMv81 = ARMv8.1 Compatible CPU"
|
||||
|
@ -165,6 +169,8 @@ do
|
|||
echo " build. This will still set certain required"
|
||||
echo " flags via KOKKOS_CXXFLAGS (such as -fopenmp,"
|
||||
echo " --std=c++11, etc.)."
|
||||
echo "--cxxstandard=[FLAGS] Overwrite KOKKOS_CXX_STANDARD for library build and test"
|
||||
echo " c++11 (default), c++14, c++17, c++1y, c++1z, c++2a"
|
||||
echo "--ldflags=[FLAGS] Overwrite LDFLAGS for library build and test"
|
||||
echo " build. This will still set certain required"
|
||||
echo " flags via KOKKOS_LDFLAGS (such as -fopenmp,"
|
||||
|
@ -243,6 +249,10 @@ if [ ${#CXXFLAGS} -gt 0 ]; then
|
|||
KOKKOS_SETTINGS="${KOKKOS_SETTINGS} CXXFLAGS=\"${CXXFLAGS}\""
|
||||
fi
|
||||
|
||||
if [ ${#KOKKOS_CXX_STANDARD} -gt 0 ]; then
|
||||
KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_CXX_STANDARD=\"${KOKKOS_CXX_STANDARD}\""
|
||||
fi
|
||||
|
||||
if [ ${#LDFLAGS} -gt 0 ]; then
|
||||
KOKKOS_SETTINGS="${KOKKOS_SETTINGS} LDFLAGS=\"${LDFLAGS}\""
|
||||
fi
|
||||
|
|
|
@ -88,6 +88,8 @@ CXX_FLAGS_EXTRA=""
|
|||
LD_FLAGS_EXTRA=""
|
||||
KOKKOS_OPTIONS=""
|
||||
|
||||
CXX_STANDARD="c++11"
|
||||
|
||||
#
|
||||
# Handle arguments.
|
||||
#
|
||||
|
@ -142,6 +144,9 @@ do
|
|||
--cxxflags-extra*)
|
||||
CXX_FLAGS_EXTRA="${key#*=}"
|
||||
;;
|
||||
--cxxstandard*)
|
||||
CXX_STANDARD="${key#*=}"
|
||||
;;
|
||||
--ldflags-extra*)
|
||||
LD_FLAGS_EXTRA="${key#*=}"
|
||||
;;
|
||||
|
@ -227,18 +232,30 @@ elif [ "$MACHINE" = "white" ]; then
|
|||
export SLURM_TASKS_PER_NODE=32
|
||||
|
||||
BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>"
|
||||
IBM_MODULE_LIST="<COMPILER_NAME>/xl/<COMPILER_VERSION>"
|
||||
IBM_MODULE_LIST="<COMPILER_NAME>/xl/<COMPILER_VERSION>,gcc/7.2.0"
|
||||
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/7.2.0,ibm/xl/16.1.0"
|
||||
CUDA10_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/7.4.0,ibm/xl/16.1.0"
|
||||
|
||||
# Don't do pthread on white.
|
||||
GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
|
||||
|
||||
# Format: (compiler module-list build-list exe-name warning-flag)
|
||||
COMPILERS=("gcc/6.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/7.2.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"ibm/16.1.0 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
|
||||
"cuda/9.2.88 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
)
|
||||
if [ "$SPOT_CHECK" = "True" ]; then
|
||||
# Format: (compiler module-list build-list exe-name warning-flag)
|
||||
COMPILERS=("gcc/6.4.0 $BASE_MODULE_LIST "OpenMP_Serial" g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/7.2.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"ibm/16.1.0 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
|
||||
"cuda/9.2.88 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
)
|
||||
else
|
||||
# Format: (compiler module-list build-list exe-name warning-flag)
|
||||
COMPILERS=("gcc/6.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/7.2.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"ibm/16.1.0 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
|
||||
"ibm/16.1.1 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
|
||||
"cuda/9.2.88 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
"cuda/10.0.130 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
)
|
||||
fi
|
||||
|
||||
if [ -z "$ARCH_FLAG" ]; then
|
||||
ARCH_FLAG="--arch=Power8,Kepler37"
|
||||
|
@ -323,6 +340,7 @@ elif [ "$MACHINE" = "apollo" ]; then
|
|||
BASE_MODULE_LIST="sems-env,kokkos-env,sems-<COMPILER_NAME>/<COMPILER_VERSION>,kokkos-hwloc/1.10.1/base"
|
||||
CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base"
|
||||
CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base"
|
||||
CUDA10_MODULE_LIST="sems-env,kokkos-env,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base"
|
||||
|
||||
CLANG_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,<COMPILER_NAME>/<COMPILER_VERSION>,cuda/9.0.69"
|
||||
CLANG7_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,<COMPILER_NAME>/<COMPILER_VERSION>,cuda/9.1"
|
||||
|
@ -344,6 +362,7 @@ elif [ "$MACHINE" = "apollo" ]; then
|
|||
else
|
||||
# Format: (compiler module-list build-list exe-name warning-flag)
|
||||
COMPILERS=("cuda/9.1 $CUDA8_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
"cuda/10.0 $CUDA10_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
"clang/6.0 $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS"
|
||||
"clang/7.0 $CLANG7_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS"
|
||||
"clang/3.9.0 $CLANG_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS"
|
||||
|
@ -629,6 +648,8 @@ single_build_and_test() {
|
|||
local cxxflags="${cxxflags} ${CXX_FLAGS_EXTRA}"
|
||||
local ldflags="${ldflags} ${LD_FLAGS_EXTRA}"
|
||||
|
||||
local cxx_standard="${CXX_STANDARD}"
|
||||
|
||||
if [[ "$KOKKOS_CUDA_OPTIONS" != "" ]]; then
|
||||
local extra_args="$extra_args $KOKKOS_CUDA_OPTIONS"
|
||||
fi
|
||||
|
@ -650,7 +671,7 @@ single_build_and_test() {
|
|||
run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; }
|
||||
fi
|
||||
else
|
||||
run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --ldflags=\"$ldflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
|
||||
run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
|
||||
local -i build_start_time=$(date +%s)
|
||||
run_cmd make -j 48 build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
|
||||
local -i build_end_time=$(date +%s)
|
||||
|
|
Loading…
Reference in New Issue