From 64834e4a3d01fa59bac248185282013e1267e200 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Fri, 1 Feb 2019 12:45:54 -0700 Subject: [PATCH] Update Kokkos library --- lib/kokkos/Makefile.kokkos | 37 ++++-- lib/kokkos/README | 4 +- lib/kokkos/bin/nvcc_wrapper | 10 ++ lib/kokkos/cmake/kokkos_options.cmake | 1 + lib/kokkos/containers/src/Kokkos_DualView.hpp | 27 ++-- .../containers/src/Kokkos_DynRankView.hpp | 18 +-- .../containers/unit_tests/TestDualView.hpp | 100 +++++++++++++- lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp | 3 +- lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp | 4 + .../core/src/Cuda/Kokkos_Cuda_Parallel.hpp | 11 +- .../core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp | 77 ++++++----- lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp | 10 +- .../Kokkos_Cuda_Version_9_8_Compatibility.hpp | 10 +- lib/kokkos/core/src/Kokkos_CopyViews.hpp | 56 +++++++- lib/kokkos/core/src/Kokkos_Crs.hpp | 39 +++--- lib/kokkos/core/src/Kokkos_Macros.hpp | 4 + .../core/src/Kokkos_Parallel_Reduce.hpp | 8 +- lib/kokkos/core/src/Kokkos_View.hpp | 37 ++++-- .../src/Threads/Kokkos_Threads_Parallel.hpp | 8 +- .../Kokkos_Atomic_Compare_Exchange_Strong.hpp | 14 +- .../core/src/impl/Kokkos_Atomic_Exchange.hpp | 21 ++- .../core/src/impl/Kokkos_Atomic_Fetch_Add.hpp | 13 +- .../core/src/impl/Kokkos_Atomic_Fetch_And.hpp | 9 ++ .../core/src/impl/Kokkos_Atomic_Fetch_Or.hpp | 9 ++ .../core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp | 13 +- .../core/src/impl/Kokkos_Atomic_Generic.hpp | 11 +- lib/kokkos/core/src/impl/Kokkos_Core.cpp | 82 +++++++++++- lib/kokkos/core/src/impl/Kokkos_Utilities.hpp | 3 + lib/kokkos/core/src/impl/Kokkos_ViewArray.hpp | 69 +--------- .../core/src/impl/Kokkos_ViewMapping.hpp | 21 ++- lib/kokkos/core/unit_test/TestCrs.hpp | 125 ++++++++++++++++++ lib/kokkos/core/unit_test/TestMDRange.hpp | 7 +- lib/kokkos/core/unit_test/TestMDRange_d.hpp | 2 + lib/kokkos/core/unit_test/TestRange.hpp | 35 ++++- lib/kokkos/core/unit_test/TestTeamVector.hpp | 25 ++-- lib/kokkos/core/unit_test/TestViewCopy.hpp | 60 ++++++--- .../core/unit_test/TestViewMapping_a.hpp | 7 + .../core/unit_test/cuda/TestCuda_Spaces.cpp | 5 + .../unit_test/openmp/TestOpenMP_ViewAPI_e.cpp | 1 + .../unit_test/serial/TestSerial_ViewAPI_e.cpp | 2 + .../threads/TestThreads_ViewAPI_e.cpp | 1 + lib/kokkos/generate_makefile.bash | 10 ++ .../scripts/testing_scripts/test_all_sandia | 37 ++++-- 43 files changed, 797 insertions(+), 249 deletions(-) diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos index 84ce007639..a90e86b9f8 100644 --- a/lib/kokkos/Makefile.kokkos +++ b/lib/kokkos/Makefile.kokkos @@ -6,16 +6,16 @@ ifndef KOKKOS_PATH endif CXXFLAGS=$(CCFLAGS) -# Options: Cuda,ROCm,OpenMP,Pthread,Qthreads,Serial +# Options: Cuda,ROCm,OpenMP,Pthreads,Qthreads,Serial KOKKOS_DEVICES ?= "OpenMP" -#KOKKOS_DEVICES ?= "Pthread" +#KOKKOS_DEVICES ?= "Pthreads" # Options: # Intel: KNC,KNL,SNB,HSW,BDW,SKX -# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72 +# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75 # ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2 # IBM: BGQ,Power7,Power8,Power9 # AMD-GPUS: Kaveri,Carrizo,Fiji,Vega -# AMD-CPUS: AMDAVX,Ryzen,Epyc +# AMD-CPUS: AMDAVX,Ryzen,EPYC KOKKOS_ARCH ?= "" # Options: yes,no KOKKOS_DEBUG ?= "no" @@ -224,7 +224,7 @@ ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) else ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) KOKKOS_INTERNAL_CXX11_FLAG := -std=c++11 - #KOKKOS_INTERNAL_CXX14_FLAG := -std=c++14 + KOKKOS_INTERNAL_CXX14_FLAG := -std=c++14 KOKKOS_INTERNAL_CXX1Y_FLAG := -std=c++1y #KOKKOS_INTERNAL_CXX17_FLAG := -std=c++17 #KOKKOS_INTERNAL_CXX1Z_FLAG := -std=c++1Z @@ -276,6 +276,7 @@ KOKKOS_INTERNAL_USE_ARCH_PASCAL61 := $(call kokkos_has_string,$(KOKKOS_ARCH),Pas KOKKOS_INTERNAL_USE_ARCH_PASCAL60 := $(call kokkos_has_string,$(KOKKOS_ARCH),Pascal60) KOKKOS_INTERNAL_USE_ARCH_VOLTA70 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volta70) KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volta72) +KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75) KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \ + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \ + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \ @@ -284,6 +285,7 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60) \ + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA70) \ + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \ + + $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \ + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \ + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \ + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53)) @@ -300,6 +302,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60) \ + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA70) \ + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \ + + $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \ + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \ + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \ + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53)) @@ -331,7 +334,7 @@ KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_ # AMD based. KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(call kokkos_has_string,$(KOKKOS_ARCH),AMDAVX) KOKKOS_INTERNAL_USE_ARCH_RYZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Ryzen) -KOKKOS_INTERNAL_USE_ARCH_EPYC := $(call kokkos_has_string,$(KOKKOS_ARCH),Epyc) +KOKKOS_INTERNAL_USE_ARCH_EPYC := $(call kokkos_has_string,$(KOKKOS_ARCH),EPYC) KOKKOS_INTERNAL_USE_ARCH_KAVERI := $(call kokkos_has_string,$(KOKKOS_ARCH),Kaveri) KOKKOS_INTERNAL_USE_ARCH_CARRIZO := $(call kokkos_has_string,$(KOKKOS_ARCH),Carrizo) KOKKOS_INTERNAL_USE_ARCH_FIJI := $(call kokkos_has_string,$(KOKKOS_ARCH),Fiji) @@ -341,12 +344,12 @@ KOKKOS_INTERNAL_USE_ARCH_GFX901 := $(call kokkos_has_string,$(KOKKOS_ARCH),gfx90 # Any AVX? KOKKOS_INTERNAL_USE_ARCH_SSE42 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM)) KOKKOS_INTERNAL_USE_ARCH_AVX := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)) -KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW)) +KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_EPYC)) KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNL)) KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SKX)) # Decide what ISA level we are able to support. -KOKKOS_INTERNAL_USE_ISA_X86_64 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX)) +KOKKOS_INTERNAL_USE_ISA_X86_64 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX) + $(KOKKOS_INTERNAL_USE_ARCH_EPYC)) KOKKOS_INTERNAL_USE_ISA_KNC := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNC)) KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER8) + $(KOKKOS_INTERNAL_USE_ARCH_POWER9)) KOKKOS_INTERNAL_USE_ISA_POWERPCBE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER7)) @@ -658,6 +661,19 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV81), 1) endif endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_EPYC), 1) + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_AMD_EPYC") + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_AMD_AVX2") + + ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1) + KOKKOS_CXXFLAGS += -mavx2 + KOKKOS_LDFLAGS += -mavx2 + else + KOKKOS_CXXFLAGS += -march=znver1 -mtune=znver1 + KOKKOS_LDFLAGS += -march=znver1 -mtune=znver1 + endif +endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1) tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ARMV80") tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ARMV8_THUNDERX") @@ -950,6 +966,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_VOLTA72") KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_72 endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_TURING75), 1) + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_TURING") + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_TURING75") + KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_75 + endif ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG) diff --git a/lib/kokkos/README b/lib/kokkos/README index 4b6d4170e0..cb6ceb5581 100644 --- a/lib/kokkos/README +++ b/lib/kokkos/README @@ -73,6 +73,8 @@ For specifics see the LICENSE file contained in the repository or distribution. * NVCC 7.5 for CUDA (with gcc 4.8.4) * NVCC 8.0.44 for CUDA (with gcc 5.3.0) * NVCC 9.1 for CUDA (with gcc 6.1.0) + * NVCC 9.2 for CUDA (with gcc 7.2.0) + * NVCC 10.0 for CUDA (with gcc 7.4.0) ### Primary tested compilers on Power 8 are: * GCC 6.4.0 (OpenMP,Serial) @@ -109,7 +111,7 @@ GCC: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized Intel: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized Clang: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized -NVCC: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized +NVCC: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized Other compilers are tested occasionally, in particular when pushing from develop to master branch, without -Werror and only for a select set of backends. diff --git a/lib/kokkos/bin/nvcc_wrapper b/lib/kokkos/bin/nvcc_wrapper index f926ae024c..94bc72854e 100755 --- a/lib/kokkos/bin/nvcc_wrapper +++ b/lib/kokkos/bin/nvcc_wrapper @@ -308,6 +308,16 @@ do shift done +#Check if nvcc exists +if [ $host_only -ne 1 ]; then + var=$(which nvcc ) + if [ $? -gt 0 ]; then + echo "Could not find nvcc in PATH" + exit $? + fi +fi + + # Only print host compiler version if [ $get_host_version -eq 1 ]; then $host_compiler --version diff --git a/lib/kokkos/cmake/kokkos_options.cmake b/lib/kokkos/cmake/kokkos_options.cmake index 580d1d322b..be494e5df0 100644 --- a/lib/kokkos/cmake/kokkos_options.cmake +++ b/lib/kokkos/cmake/kokkos_options.cmake @@ -104,6 +104,7 @@ list(APPEND KOKKOS_ARCH_LIST Pascal61 # (GPU) NVIDIA Pascal generation CC 6.1 Volta70 # (GPU) NVIDIA Volta generation CC 7.0 Volta72 # (GPU) NVIDIA Volta generation CC 7.2 + Turing75 # (GPU) NVIDIA Turing generation CC 7.5 ) # List of possible device architectures. diff --git a/lib/kokkos/containers/src/Kokkos_DualView.hpp b/lib/kokkos/containers/src/Kokkos_DualView.hpp index adba0c4158..f6631a4149 100644 --- a/lib/kokkos/containers/src/Kokkos_DualView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DualView.hpp @@ -832,16 +832,14 @@ void deep_copy (DualView dst, // trust me, this must not be a reference const DualView& src ) { - if(src.modified_flags.data()==NULL || dst.modified_flags.data()==NULL) { - return deep_copy(dst.d_view, src.d_view); - } - if (src.modified_flags(1) >= src.modified_flags(0)) { - deep_copy (dst.d_view, src.d_view); - dst.template modify::device_type> (); - } else { + if ( src.need_sync_device() ) { deep_copy (dst.h_view, src.h_view); - dst.template modify::host_mirror_space> (); + dst.modify_host(); } + else { + deep_copy (dst.d_view, src.d_view); + dst.modify_device(); + } } template< class ExecutionSpace , @@ -852,15 +850,12 @@ deep_copy (const ExecutionSpace& exec , DualView dst, // trust me, this must not be a reference const DualView& src ) { - if(src.modified_flags.data()==NULL || dst.modified_flags.data()==NULL) { - return deep_copy(exec, dst.d_view, src.d_view); - } - if (src.modified_flags(1) >= src.modified_flags(0)) { - deep_copy (exec, dst.d_view, src.d_view); - dst.template modify::device_type> (); - } else { + if ( src.need_sync_device() ) { deep_copy (exec, dst.h_view, src.h_view); - dst.template modify::host_mirror_space> (); + dst.modify_host(); + } else { + deep_copy (exec, dst.d_view, src.d_view); + dst.modify_device(); } } diff --git a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp index 8be2c49a31..3f284e6a8d 100644 --- a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp @@ -368,8 +368,8 @@ public: enum { is_assignable = is_assignable_value_type && is_assignable_layout }; - typedef ViewMapping< DstTraits , void > DstType ; - typedef ViewMapping< SrcTraits , void > SrcType ; + typedef ViewMapping< DstTraits , typename DstTraits::specialize > DstType ; + typedef ViewMapping< SrcTraits , typename SrcTraits::specialize > SrcType ; template < typename DT , typename ... DP , typename ST , typename ... SP > KOKKOS_INLINE_FUNCTION @@ -432,7 +432,7 @@ public: private: - typedef Kokkos::Impl::ViewMapping< traits , void > map_type ; + typedef Kokkos::Impl::ViewMapping< traits , typename traits::specialize > map_type ; typedef Kokkos::Impl::SharedAllocationTracker track_type ; track_type m_track ; @@ -567,11 +567,11 @@ public: // Allow specializations to query their specialized map #ifdef KOKKOS_ENABLE_DEPRECATED_CODE KOKKOS_INLINE_FUNCTION - const Kokkos::Impl::ViewMapping< traits , void > & + const Kokkos::Impl::ViewMapping< traits , typename traits::specialize > & implementation_map() const { return m_map ; } #endif KOKKOS_INLINE_FUNCTION - const Kokkos::Impl::ViewMapping< traits , void > & + const Kokkos::Impl::ViewMapping< traits , typename traits::specialize > & impl_map() const { return m_map ; } //---------------------------------------- @@ -952,7 +952,7 @@ public: , m_rank(rhs.m_rank) { typedef typename DynRankView ::traits SrcTraits ; - typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ; + typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , typename traits::specialize > Mapping ; static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" ); Mapping::assign( m_map , rhs.m_map , rhs.m_track ); } @@ -962,7 +962,7 @@ public: DynRankView & operator = (const DynRankView & rhs ) { typedef typename DynRankView ::traits SrcTraits ; - typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ; + typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , typename traits::specialize > Mapping ; static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" ); Mapping::assign( m_map , rhs.m_map , rhs.m_track ); m_track.assign( rhs.m_track , traits::is_managed ); @@ -980,7 +980,7 @@ public: { typedef typename View::traits SrcTraits ; typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , Kokkos::Impl::ViewToDynRankViewTag > Mapping ; - static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" ); + static_assert( Mapping::is_assignable , "Incompatible View to DynRankView copy construction" ); Mapping::assign( *this , rhs ); } @@ -1432,7 +1432,7 @@ public: , Args ... args ) { - typedef ViewMapping< traits_type, void > DstType ; + typedef ViewMapping< traits_type, typename traits_type::specialize > DstType ; typedef typename std::conditional< (rank==0) , ViewDimension<> , typename std::conditional< (rank==1) , ViewDimension<0> diff --git a/lib/kokkos/containers/unit_tests/TestDualView.hpp b/lib/kokkos/containers/unit_tests/TestDualView.hpp index cbff27cb39..767f93c093 100644 --- a/lib/kokkos/containers/unit_tests/TestDualView.hpp +++ b/lib/kokkos/containers/unit_tests/TestDualView.hpp @@ -101,13 +101,98 @@ namespace Impl { result = run_me< Kokkos::DualView >(size,3); } - }; + }; + + template < typename Scalar, class ViewType > + struct SumViewEntriesFunctor { + + typedef Scalar value_type; + + ViewType fv; + + SumViewEntriesFunctor ( const ViewType & fv_ ) : fv(fv_) {} + + KOKKOS_INLINE_FUNCTION + void operator() ( const int i , value_type & total ) const { + for ( size_t j = 0; j < fv.extent(1); ++j ) { + total += fv(i,j); + } + } + + }; + + + template + struct test_dual_view_deep_copy + { + typedef Scalar scalar_type; + typedef Device execution_space; + + template + void run_me() { + + const unsigned int n = 10; + const unsigned int m = 5; + const unsigned int sum_total = n * m; + + ViewType a("A",n,m); + ViewType b("B",n,m); + + Kokkos::deep_copy( a.d_view , 1 ); + + a.template modify(); + a.template sync(); + + // Check device view is initialized as expected + scalar_type a_d_sum = 0; + // Execute on the execution_space associated with t_dev's memory space + typedef typename ViewType::t_dev::memory_space::execution_space t_dev_exec_space; + Kokkos::parallel_reduce( Kokkos::RangePolicy(0,n), SumViewEntriesFunctor(a.d_view), a_d_sum ); + ASSERT_EQ(a_d_sum, sum_total); + + // Check host view is synced as expected + scalar_type a_h_sum = 0; + for ( size_t i = 0; i < a.h_view.extent(0); ++i ) + for ( size_t j = 0; j < a.h_view.extent(1); ++j ) { + a_h_sum += a.h_view(i,j); + } + + ASSERT_EQ(a_h_sum, sum_total); + + + // Test deep_copy + Kokkos::deep_copy( b, a ); + b.template sync(); + + // Perform same checks on b as done on a + // Check device view is initialized as expected + scalar_type b_d_sum = 0; + // Execute on the execution_space associated with t_dev's memory space + Kokkos::parallel_reduce( Kokkos::RangePolicy(0,n), SumViewEntriesFunctor(b.d_view), b_d_sum ); + ASSERT_EQ(b_d_sum, sum_total); + + // Check host view is synced as expected + scalar_type b_h_sum = 0; + for ( size_t i = 0; i < b.h_view.extent(0); ++i ) + for ( size_t j = 0; j < b.h_view.extent(1); ++j ) { + b_h_sum += b.h_view(i,j); + } + + ASSERT_EQ(b_h_sum, sum_total); + + } // end run_me + + test_dual_view_deep_copy() + { + run_me< Kokkos::DualView >(); + } + + }; } // namespace Impl - template void test_dualview_combinations(unsigned int size) { @@ -116,10 +201,21 @@ void test_dualview_combinations(unsigned int size) } +template +void test_dualview_deep_copy() +{ + Impl::test_dual_view_deep_copy (); +} + TEST_F( TEST_CATEGORY, dualview_combination) { test_dualview_combinations(10); } +TEST_F( TEST_CATEGORY, dualview_deep_copy) { + test_dualview_deep_copy(); + test_dualview_deep_copy(); +} + } // namespace Test diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp index 4fa4609968..e13744e327 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -829,7 +829,8 @@ void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink) { } if(bytes > current_size) { current_size = bytes; - ptr = Kokkos::kokkos_realloc(ptr,current_size); + Kokkos::kokkos_free(ptr); + ptr = Kokkos::kokkos_malloc("CudaSpace::ScratchMemory",current_size); } if((bytes < current_size) && (force_shrink)) { current_size = bytes; diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp index 16952a3ae4..4fd7a9c69e 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp @@ -561,7 +561,11 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count ) } #endif + #ifdef KOKKOS_ENABLE_PRE_CUDA_10_DEPRECATION_API cudaThreadSetCacheConfig(cudaFuncCachePreferShared); + #else + cudaDeviceSetCacheConfig(cudaFuncCachePreferShared); + #endif // Init the array for used for arbitrarily sized atomics Impl::initialize_host_cuda_lock_arrays(); diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp index 2ae1cc0ddd..665d0732a7 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp @@ -525,6 +525,7 @@ public: inline void execute() const { + if(m_rp.m_num_tiles==0) return; const array_index_type maxblocks = static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); if ( RP::rank == 2 ) { @@ -685,7 +686,7 @@ public: typename Policy::member_type( kokkos_impl_cuda_shared_memory() , m_shmem_begin , m_shmem_size - , (void*) ( ((char*)m_scratch_ptr[1]) + threadid/(blockDim.x*blockDim.y) * m_scratch_size[1]) + , (void*) ( ((char*)m_scratch_ptr[1]) + ptrdiff_t(threadid/(blockDim.x*blockDim.y)) * m_scratch_size[1]) , m_scratch_size[1] , league_rank , m_league_size ) ); @@ -1336,7 +1337,7 @@ public: ( Member( kokkos_impl_cuda_shared_memory() + m_team_begin , m_shmem_begin , m_shmem_size - , (void*) ( ((char*)m_scratch_ptr[1]) + threadid/(blockDim.x*blockDim.y) * m_scratch_size[1]) + , (void*) ( ((char*)m_scratch_ptr[1]) + ptrdiff_t(threadid/(blockDim.x*blockDim.y)) * m_scratch_size[1]) , m_scratch_size[1] , league_rank , m_league_size ) @@ -1378,7 +1379,7 @@ public: ( Member( kokkos_impl_cuda_shared_memory() + m_team_begin , m_shmem_begin , m_shmem_size - , (void*) ( ((char*)m_scratch_ptr[1]) + threadid/(blockDim.x*blockDim.y) * m_scratch_size[1]) + , (void*) ( ((char*)m_scratch_ptr[1]) + ptrdiff_t(threadid/(blockDim.x*blockDim.y)) * m_scratch_size[1]) , m_scratch_size[1] , league_rank , m_league_size ) @@ -2064,7 +2065,7 @@ private: #ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK KOKKOS_IMPL_CUDA_SYNCWARP_MASK(MASK); #else - KOKKOS_IMPL_CUDA_SYNCWARP_MASK; + KOKKOS_IMPL_CUDA_SYNCWARP; #endif if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } // Protect against large scan values. @@ -2291,7 +2292,7 @@ private: #ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK KOKKOS_IMPL_CUDA_SYNCWARP_MASK(MASK); #else - KOKKOS_IMPL_CUDA_SYNCWARP_MASK; + KOKKOS_IMPL_CUDA_SYNCWARP; #endif if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } // Protect against large scan values. diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp index 82d691f7d4..d09854c3a5 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp @@ -321,7 +321,7 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK; int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1); #else - int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(1); + int active = KOKKOS_IMPL_CUDA_BALLOT(1); #endif if (int(blockDim.x*blockDim.y) > 2) { value_type tmp = Kokkos::shfl_down(value, 2,32); @@ -331,7 +331,7 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT #ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1); #else - active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1); + active += KOKKOS_IMPL_CUDA_BALLOT(1); #endif if (int(blockDim.x*blockDim.y) > 4) { value_type tmp = Kokkos::shfl_down(value, 4,32); @@ -341,7 +341,7 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT #ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1); #else - active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1); + active += KOKKOS_IMPL_CUDA_BALLOT(1); #endif if (int(blockDim.x*blockDim.y) > 8) { value_type tmp = Kokkos::shfl_down(value, 8,32); @@ -351,7 +351,7 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT #ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1); #else - active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1); + active += KOKKOS_IMPL_CUDA_BALLOT(1); #endif if (int(blockDim.x*blockDim.y) > 16) { value_type tmp = Kokkos::shfl_down(value, 16,32); @@ -361,7 +361,7 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT #ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1); #else - active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1); + active += KOKKOS_IMPL_CUDA_BALLOT(1); #endif } } @@ -506,7 +506,7 @@ cuda_inter_block_reduction( const ReducerType& reducer, unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK; int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1); #else - int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(1); + int active = KOKKOS_IMPL_CUDA_BALLOT(1); #endif if (int(blockDim.x*blockDim.y) > 2) { value_type tmp = Kokkos::shfl_down(value, 2,32); @@ -516,7 +516,7 @@ cuda_inter_block_reduction( const ReducerType& reducer, #ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1); #else - active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1); + active += KOKKOS_IMPL_CUDA_BALLOT(1); #endif if (int(blockDim.x*blockDim.y) > 4) { value_type tmp = Kokkos::shfl_down(value, 4,32); @@ -526,7 +526,7 @@ cuda_inter_block_reduction( const ReducerType& reducer, #ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1); #else - active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1); + active += KOKKOS_IMPL_CUDA_BALLOT(1); #endif if (int(blockDim.x*blockDim.y) > 8) { value_type tmp = Kokkos::shfl_down(value, 8,32); @@ -536,7 +536,7 @@ cuda_inter_block_reduction( const ReducerType& reducer, #ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1); #else - active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1); + active += KOKKOS_IMPL_CUDA_BALLOT(1); #endif if (int(blockDim.x*blockDim.y) > 16) { value_type tmp = Kokkos::shfl_down(value, 16,32); @@ -546,7 +546,7 @@ cuda_inter_block_reduction( const ReducerType& reducer, #ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1); #else - active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1); + active += KOKKOS_IMPL_CUDA_BALLOT(1); #endif } } @@ -578,7 +578,7 @@ struct CudaReductionsFunctor { const int width, // How much of the warp participates Scalar& result) { - unsigned mask = width==32?0xffffffff:((1< { const int width) // How much of the warp participates { #ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - unsigned mask = width==32?0xffffffff:((1< { #ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK KOKKOS_IMPL_CUDA_SYNCWARP_MASK(mask); #else - KOKKOS_IMPL_CUDA_SYNCWARP_MASK; + KOKKOS_IMPL_CUDA_SYNCWARP; #endif } *value=*(value-lane_id); @@ -779,7 +779,7 @@ struct CudaReductionsFunctor { /* * Algorithmic constraints: * (a) blockDim.y is a power of two - * (b) blockDim.y <= 512 + * (b) blockDim.y <= 1024 * (c) blockDim.x == blockDim.z == 1 */ @@ -828,14 +828,26 @@ void cuda_intra_block_reduce_scan( const FunctorType & functor , { // Inter-warp reduce-scan by a single warp to avoid extra synchronizations const unsigned rtid_inter = ( threadIdx.y ^ BlockSizeMask ) << CudaTraits::WarpIndexShift ; + #ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK + unsigned inner_mask = KOKKOS_IMPL_CUDA_BALLOT_MASK(0xffffffff,(rtid_inter>= 1 ) ; ) { cuda_shfl_down( reducer.reference() , tmp , i , blockDim.x , mask ); @@ -742,7 +742,7 @@ void parallel_for #ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK KOKKOS_IMPL_CUDA_SYNCWARP_MASK(blockDim.x==32?0xffffffff:((1<& , const Functo #ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK KOKKOS_IMPL_CUDA_SYNCWARP_MASK(blockDim.x==32?0xffffffff:((1<& , const Functo #ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK KOKKOS_IMPL_CUDA_SYNCWARP_MASK(blockDim.x==32?0xffffffff:((1<& , const FunctorType& lambda, ValueType& val) { #ifdef __CUDA_ARCH__ if(threadIdx.x == 0) lambda(val); - unsigned mask = blockDim.x==32?0xffffffff:((1< - Crs(const RowMapType& row_map_, const EntriesType& entries_) : row_map(row_map_), entries(entries_) - {} - - /** \brief Assign to a view of the rhs array. - * If the old view is the last view - * then allocated memory is deallocated. - */ - Crs& operator= (const Crs& rhs) { - row_map = rhs.row_map; - entries = rhs.entries; - return *this; + KOKKOS_INLINE_FUNCTION + Crs(const RowMapType& row_map_, const EntriesType& entries_) + : row_map(row_map_), entries(entries_) + { } - /** \brief Destroy this view of the array. - * If the last view then allocated memory is deallocated. - */ - ~Crs() {} - /** \brief Return number of rows in the graph */ KOKKOS_INLINE_FUNCTION diff --git a/lib/kokkos/core/src/Kokkos_Macros.hpp b/lib/kokkos/core/src/Kokkos_Macros.hpp index 96bd23e220..10fc09423e 100644 --- a/lib/kokkos/core/src/Kokkos_Macros.hpp +++ b/lib/kokkos/core/src/Kokkos_Macros.hpp @@ -170,6 +170,10 @@ // see https://github.com/kokkos/kokkos/issues/1470 #define KOKKOS_CUDA_9_DEFAULTED_BUG_WORKAROUND #endif + + #if ( 10000 > CUDA_VERSION ) + #define KOKKOS_ENABLE_PRE_CUDA_10_DEPRECATION_API + #endif #endif // #if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ ) //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp index 1900794514..06aaa6546e 100644 --- a/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp +++ b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp @@ -505,7 +505,7 @@ public: } KOKKOS_INLINE_FUNCTION - value_type& reference() { + value_type& reference() const { return *value; } @@ -559,7 +559,7 @@ public: } KOKKOS_INLINE_FUNCTION - value_type& reference() { + value_type& reference() const { return *value; } @@ -637,7 +637,7 @@ public: } KOKKOS_INLINE_FUNCTION - value_type& reference() { + value_type& reference() const { return *value; } @@ -727,7 +727,7 @@ public: } KOKKOS_INLINE_FUNCTION - value_type& reference() { + value_type& reference() const { return *value; } diff --git a/lib/kokkos/core/src/Kokkos_View.hpp b/lib/kokkos/core/src/Kokkos_View.hpp index da49aff222..754a0ab8c0 100644 --- a/lib/kokkos/core/src/Kokkos_View.hpp +++ b/lib/kokkos/core/src/Kokkos_View.hpp @@ -198,6 +198,7 @@ struct ViewTraits< void > typedef void HostMirrorSpace ; typedef void array_layout ; typedef void memory_traits ; + typedef void specialize ; }; template< class ... Prop > @@ -209,6 +210,7 @@ struct ViewTraits< void , void , Prop ... > typedef typename ViewTraits::HostMirrorSpace HostMirrorSpace ; typedef typename ViewTraits::array_layout array_layout ; typedef typename ViewTraits::memory_traits memory_traits ; + typedef typename ViewTraits::specialize specialize ; }; template< class ArrayLayout , class ... Prop > @@ -221,6 +223,7 @@ struct ViewTraits< typename std::enable_if< Kokkos::Impl::is_array_layout::HostMirrorSpace HostMirrorSpace ; typedef ArrayLayout array_layout ; typedef typename ViewTraits::memory_traits memory_traits ; + typedef typename ViewTraits::specialize specialize ; }; template< class Space , class ... Prop > @@ -239,6 +242,7 @@ struct ViewTraits< typename std::enable_if< Kokkos::Impl::is_space::value typedef typename Kokkos::Impl::HostMirror< Space >::Space HostMirrorSpace ; typedef typename execution_space::array_layout array_layout ; typedef typename ViewTraits::memory_traits memory_traits ; + typedef typename ViewTraits::specialize specialize ; }; template< class MemoryTraits , class ... Prop > @@ -257,6 +261,7 @@ struct ViewTraits< typename std::enable_if< Kokkos::Impl::is_memory_traits::value + ,typename prop::specialize + ,typename data_analysis::specialize>::type + specialize ; /* mapping specialization tag */ enum { rank = dimension::rank }; enum { rank_dynamic = dimension::rank_dynamic }; @@ -542,7 +552,7 @@ public: private: - typedef Kokkos::Impl::ViewMapping< traits , void > map_type ; + typedef Kokkos::Impl::ViewMapping< traits , typename traits::specialize > map_type ; typedef Kokkos::Impl::SharedAllocationTracker track_type ; track_type m_track ; @@ -608,13 +618,18 @@ public: template< typename iType > KOKKOS_INLINE_FUNCTION constexpr typename std::enable_if< std::is_integral::value , size_t >::type - extent( const iType & r ) const + extent( const iType & r ) const noexcept { return m_map.extent(r); } + static KOKKOS_INLINE_FUNCTION constexpr + size_t + static_extent( const unsigned r ) noexcept + { return map_type::static_extent(r); } + template< typename iType > KOKKOS_INLINE_FUNCTION constexpr typename std::enable_if< std::is_integral::value , int >::type - extent_int( const iType & r ) const + extent_int( const iType & r ) const noexcept { return static_cast(m_map.extent(r)); } KOKKOS_INLINE_FUNCTION constexpr @@ -709,11 +724,11 @@ public: #ifdef KOKKOS_ENABLE_DEPRECATED_CODE KOKKOS_INLINE_FUNCTION - const Kokkos::Impl::ViewMapping< traits , void > & + const Kokkos::Impl::ViewMapping< traits , typename traits::specialize > & implementation_map() const { return m_map ; } #endif KOKKOS_INLINE_FUNCTION - const Kokkos::Impl::ViewMapping< traits , void > & + const Kokkos::Impl::ViewMapping< traits , typename traits::specialize > & impl_map() const { return m_map ; } KOKKOS_INLINE_FUNCTION const Kokkos::Impl::SharedAllocationTracker & @@ -1955,7 +1970,7 @@ public: , m_map() { typedef typename View::traits SrcTraits ; - typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ; + typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , typename traits::specialize > Mapping ; static_assert( Mapping::is_assignable , "Incompatible View copy construction" ); Mapping::assign( m_map , rhs.m_map , rhs.m_track ); } @@ -1965,7 +1980,7 @@ public: View & operator = ( const View & rhs ) { typedef typename View::traits SrcTraits ; - typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ; + typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , typename traits::specialize > Mapping ; static_assert( Mapping::is_assignable , "Incompatible View copy assignment" ); Mapping::assign( m_map , rhs.m_map , rhs.m_track ); m_track.assign( rhs.m_track , traits::is_managed ); @@ -1992,7 +2007,7 @@ public: typedef typename Mapping::type DstType ; - static_assert( Kokkos::Impl::ViewMapping< traits , typename DstType::traits , void >::is_assignable + static_assert( Kokkos::Impl::ViewMapping< traits , typename DstType::traits , typename traits::specialize >::is_assignable , "Subview construction requires compatible view and subview arguments" ); Mapping::assign( m_map, src_view.m_map, arg0 , args... ); @@ -2266,10 +2281,10 @@ public: } template KOKKOS_INLINE_FUNCTION - View( const track_type & track, const Kokkos::Impl::ViewMapping< Traits , void > &map ) : + View( const track_type & track, const Kokkos::Impl::ViewMapping< Traits , typename Traits::specialize > &map ) : m_track(track), m_map() { - typedef Kokkos::Impl::ViewMapping< traits , Traits , void > Mapping ; + typedef Kokkos::Impl::ViewMapping< traits , Traits , typename traits::specialize > Mapping ; static_assert( Mapping::is_assignable , "Incompatible View copy construction" ); Mapping::assign( m_map , map , track ); } diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp index 6b3e206f6c..be33ab7034 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp @@ -142,14 +142,14 @@ private: WorkRange range( self.m_policy , exec.pool_rank() , exec.pool_size() ); - exec.set_work_range(range.begin(),range.end(),self.m_policy.chunk_size()); + exec.set_work_range(0,range.end()-range.begin(),self.m_policy.chunk_size()); exec.reset_steal_target(); exec.barrier(); long work_index = exec.get_work_index(); while(work_index != -1) { - const Member begin = static_cast(work_index) * self.m_policy.chunk_size(); + const Member begin = static_cast(work_index) * self.m_policy.chunk_size()+range.begin(); const Member end = begin + self.m_policy.chunk_size() < self.m_policy.end()?begin+self.m_policy.chunk_size():self.m_policy.end(); ParallelFor::template exec_range< WorkTag > @@ -470,14 +470,14 @@ private: const ParallelReduce & self = * ((const ParallelReduce *) arg ); const WorkRange range( self.m_policy, exec.pool_rank(), exec.pool_size() ); - exec.set_work_range(range.begin(),range.end(),self.m_policy.chunk_size()); + exec.set_work_range(0,range.end()-range.begin(),self.m_policy.chunk_size()); exec.reset_steal_target(); exec.barrier(); long work_index = exec.get_work_index(); reference_type update = ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ); while(work_index != -1) { - const Member begin = static_cast(work_index) * self.m_policy.chunk_size(); + const Member begin = static_cast(work_index) * self.m_policy.chunk_size() + range.begin(); const Member end = begin + self.m_policy.chunk_size() < self.m_policy.end()?begin+self.m_policy.chunk_size():self.m_policy.end(); ParallelReduce::template exec_range< WorkTag > ( self.m_functor , begin , end diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp index e2028db8c8..3d99b07568 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp @@ -111,7 +111,7 @@ T atomic_compare_exchange( volatile T * const dest , const T & compare , unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK; unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1); #else - unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(1); + unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1); #endif unsigned int done_active = 0; while (active!=done_active) { @@ -127,7 +127,7 @@ T atomic_compare_exchange( volatile T * const dest , const T & compare , #ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,done); #else - done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(done); + done_active = KOKKOS_IMPL_CUDA_BALLOT(done); #endif } return return_val; @@ -308,6 +308,16 @@ T atomic_compare_exchange( volatile T * const dest_v, const T compare, const T v #endif #endif // !defined ROCM_ATOMICS +// dummy for non-CUDA Kokkos headers being processed by NVCC +#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA) +template +__inline__ __device__ +T atomic_compare_exchange(volatile T * const, const Kokkos::Impl::identity_t, const Kokkos::Impl::identity_t) +{ + return T(); +} +#endif + template KOKKOS_INLINE_FUNCTION bool atomic_compare_exchange_strong(volatile T* const dest, const T compare, const T val) diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp index 4e41cb1258..6ccf35816b 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp @@ -134,7 +134,7 @@ T atomic_exchange( volatile T * const dest , unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK; unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1); #else - unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(1); + unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1); #endif unsigned int done_active = 0; while (active!=done_active) { @@ -149,7 +149,7 @@ T atomic_exchange( volatile T * const dest , #ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,done); #else - done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(done); + done_active = KOKKOS_IMPL_CUDA_BALLOT(done); #endif } return return_val; @@ -418,6 +418,23 @@ void atomic_assign( volatile T * const dest_v , const T val ) #endif #endif + +// dummy for non-CUDA Kokkos headers being processed by NVCC +#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA) +template +__inline__ __device__ +T atomic_exchange(volatile T * const, const Kokkos::Impl::identity_t) +{ + return T(); +} + +template < typename T > +__inline__ __device__ +void atomic_assign(volatile T * const, const Kokkos::Impl::identity_t) +{ +} +#endif + } // namespace Kokkos #endif diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp index e2e23bb5fd..d6fab81133 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp @@ -147,7 +147,7 @@ T atomic_fetch_add( volatile T * const dest , unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK; unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1); #else - unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(1); + unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1); #endif unsigned int done_active = 0; while (active!=done_active) { @@ -164,7 +164,7 @@ T atomic_fetch_add( volatile T * const dest , #ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,done); #else - done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(done); + done_active = KOKKOS_IMPL_CUDA_BALLOT(done); #endif } return return_val; @@ -384,6 +384,15 @@ T atomic_fetch_add( volatile T * const dest_v , typename std::add_const::type #endif // !defined ROCM_ATOMICS //---------------------------------------------------------------------------- +// dummy for non-CUDA Kokkos headers being processed by NVCC +#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA) +template< typename T > +__inline__ __device__ +T atomic_fetch_add(volatile T* const, Kokkos::Impl::identity_t) { + return T(); +} +#endif + // Simpler version of atomic_fetch_add without the fetch template KOKKOS_INLINE_FUNCTION diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp index 044cbdf79a..db0d97ca19 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp @@ -149,6 +149,15 @@ T atomic_fetch_and( volatile T * const dest_v , const T val ) #endif //---------------------------------------------------------------------------- +// dummy for non-CUDA Kokkos headers being processed by NVCC +#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA) +template< typename T > +__inline__ __device__ +T atomic_fetch_and(volatile T* const, Kokkos::Impl::identity_t) { + return T(); +} +#endif + // Simpler version of atomic_fetch_and without the fetch template KOKKOS_INLINE_FUNCTION diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp index 0b8cbb1d8c..d146ef3148 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp @@ -149,6 +149,15 @@ T atomic_fetch_or( volatile T * const dest_v , const T val ) #endif //---------------------------------------------------------------------------- +// dummy for non-CUDA Kokkos headers being processed by NVCC +#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA) +template< typename T > +__inline__ __device__ +T atomic_fetch_or(volatile T* const, Kokkos::Impl::identity_t) { + return T(); +} +#endif + // Simpler version of atomic_fetch_or without the fetch template KOKKOS_INLINE_FUNCTION diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp index dd69c967c5..48dc8731ef 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp @@ -139,7 +139,7 @@ T atomic_fetch_sub( volatile T * const dest , unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK; unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1); #else - unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(1); + unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1); #endif unsigned int done_active = 0; while (active!=done_active) { @@ -154,7 +154,7 @@ T atomic_fetch_sub( volatile T * const dest , #ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,done); #else - done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(done); + done_active = KOKKOS_IMPL_CUDA_BALLOT(done); #endif } return return_val; @@ -304,6 +304,15 @@ T atomic_fetch_sub( volatile T * const dest_v , const T val ) #endif #endif // !defined ROCM_ATOMICS +// dummy for non-CUDA Kokkos headers being processed by NVCC +#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA) +template< typename T > +__inline__ __device__ +T atomic_fetch_sub(volatile T* const, Kokkos::Impl::identity_t) { + return T(); +} +#endif + // Simpler version of atomic_fetch_sub without the fetch template KOKKOS_INLINE_FUNCTION diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp index 74e9db303d..a3a18166af 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp @@ -230,9 +230,6 @@ T atomic_fetch_oper( const Oper& op, volatile T * const dest , typename Kokkos::Impl::enable_if< ( sizeof(T) != 4 ) && ( sizeof(T) != 8 ) - #if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) - && ( sizeof(T) != 16 ) - #endif , const T >::type val ) { @@ -250,7 +247,7 @@ T atomic_fetch_oper( const Oper& op, volatile T * const dest , unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK; unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1); #else - unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(1); + unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1); #endif unsigned int done_active = 0; while (active!=done_active) { @@ -265,7 +262,7 @@ T atomic_fetch_oper( const Oper& op, volatile T * const dest , #ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,done); #else - done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(done); + done_active = KOKKOS_IMPL_CUDA_BALLOT(done); #endif } return return_val; @@ -298,7 +295,7 @@ T atomic_oper_fetch( const Oper& op, volatile T * const dest , unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK; unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1); #else - unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(1); + unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1); #endif unsigned int done_active = 0; while (active!=done_active) { @@ -313,7 +310,7 @@ T atomic_oper_fetch( const Oper& op, volatile T * const dest , #ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,done); #else - done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(done); + done_active = KOKKOS_IMPL_CUDA_BALLOT(done); #endif } return return_val; diff --git a/lib/kokkos/core/src/impl/Kokkos_Core.cpp b/lib/kokkos/core/src/impl/Kokkos_Core.cpp index 628e070a0d..82fdee4399 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Core.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_Core.cpp @@ -49,6 +49,7 @@ #include #include #include +#include //---------------------------------------------------------------------------- @@ -70,7 +71,6 @@ bool is_unsigned_int(const char* str) } return true; } - void initialize_internal(const InitArguments& args) { // This is an experimental setting @@ -99,6 +99,7 @@ setenv("MEMKIND_HBW_NODES", "1", 0); if (use_gpu < 0 && ndevices >= 0) { auto local_rank_str = std::getenv("OMPI_COMM_WORLD_LOCAL_RANK"); //OpenMPI if (!local_rank_str) local_rank_str = std::getenv("MV2_COMM_WORLD_LOCAL_RANK"); //MVAPICH2 + if (!local_rank_str) local_rank_str = std::getenv("SLURM_LOCALID"); //SLURM if (local_rank_str) { auto local_rank = std::atoi(local_rank_str); use_gpu = local_rank % ndevices; @@ -532,6 +533,85 @@ void initialize(int& narg, char* arg[]) iarg++; } + //Read environment variables + char * endptr; + auto env_num_threads_str = std::getenv("KOKKOS_NUM_THREADS"); + if (env_num_threads_str!=nullptr) { + errno = 0; + auto env_num_threads = std::strtol(env_num_threads_str,&endptr,10); + if (endptr== env_num_threads_str) + Impl::throw_runtime_exception("Error: cannot convert KOKKOS_NUM_THREADS to an integer. Raised by Kokkos::initialize(int narg, char* argc[])."); + if (errno == ERANGE) + Impl::throw_runtime_exception("Error: KOKKOS_NUM_THREADS out of range of representable values by an integer. Raised by Kokkos::initialize(int narg, char* argc[])."); + if ((num_threads != -1)&&(env_num_threads!=num_threads)) + Impl::throw_runtime_exception("Error: expecting a match between --kokkos-threads and KOKKOS_NUM_THREADS if both are set. Raised by Kokkos::initialize(int narg, char* argc[])."); + else + num_threads = env_num_threads; + } + auto env_numa_str = std::getenv("KOKKOS_NUMA"); + if (env_numa_str!=nullptr) { + errno = 0; + auto env_numa = std::strtol(env_numa_str,&endptr,10); + if (endptr== env_numa_str) + Impl::throw_runtime_exception("Error: cannot convert KOKKOS_NUMA to an integer. Raised by Kokkos::initialize(int narg, char* argc[])."); + if (errno == ERANGE) + Impl::throw_runtime_exception("Error: KOKKOS_NUMA out of range of representable values by an integer. Raised by Kokkos::initialize(int narg, char* argc[])."); + if ((numa != -1)&&(env_numa!=numa)) + Impl::throw_runtime_exception("Error: expecting a match between --kokkos-numa and KOKKOS_NUMA if both are set. Raised by Kokkos::initialize(int narg, char* argc[])."); + else + numa = env_numa; + } + auto env_device_str = std::getenv("KOKKOS_DEVICE_ID"); + if (env_device_str!=nullptr) { + errno = 0; + auto env_device = std::strtol(env_device_str,&endptr,10); + if (endptr== env_device_str) + Impl::throw_runtime_exception("Error: cannot convert KOKKOS_DEVICE_ID to an integer. Raised by Kokkos::initialize(int narg, char* argc[])."); + if (errno == ERANGE) + Impl::throw_runtime_exception("Error: KOKKOS_DEVICE_ID out of range of representable values by an integer. Raised by Kokkos::initialize(int narg, char* argc[])."); + if ((device != -1)&&(env_device!=device)) + Impl::throw_runtime_exception("Error: expecting a match between --kokkos-device and KOKKOS_DEVICE_ID if both are set. Raised by Kokkos::initialize(int narg, char* argc[])."); + else + device = env_device; + } + auto env_ndevices_str = std::getenv("KOKKOS_NUM_DEVICES"); + if (env_ndevices_str!=nullptr) { + errno = 0; + auto env_ndevices = std::strtol(env_ndevices_str,&endptr,10); + if (endptr== env_ndevices_str) + Impl::throw_runtime_exception("Error: cannot convert KOKKOS_NUM_DEVICES to an integer. Raised by Kokkos::initialize(int narg, char* argc[])."); + if (errno == ERANGE) + Impl::throw_runtime_exception("Error: KOKKOS_NUM_DEVICES out of range of representable values by an integer. Raised by Kokkos::initialize(int narg, char* argc[])."); + if ((ndevices != -1)&&(env_ndevices!=ndevices)) + Impl::throw_runtime_exception("Error: expecting a match between --kokkos-ndevices and KOKKOS_NUM_DEVICES if both are set. Raised by Kokkos::initialize(int narg, char* argc[])."); + else + ndevices = env_ndevices; + //Skip device + auto env_skip_device_str = std::getenv("KOKKOS_SKIP_DEVICE"); + if (env_skip_device_str!=nullptr) { + errno = 0; + auto env_skip_device = std::strtol(env_skip_device_str,&endptr,10); + if (endptr== env_skip_device_str) + Impl::throw_runtime_exception("Error: cannot convert KOKKOS_SKIP_DEVICE to an integer. Raised by Kokkos::initialize(int narg, char* argc[])."); + if (errno == ERANGE) + Impl::throw_runtime_exception("Error: KOKKOS_SKIP_DEVICE out of range of representable values by an integer. Raised by Kokkos::initialize(int narg, char* argc[])."); + if ((skip_device != 9999)&&(env_skip_device!=skip_device)) + Impl::throw_runtime_exception("Error: expecting a match between --kokkos-ndevices and KOKKOS_SKIP_DEVICE if both are set. Raised by Kokkos::initialize(int narg, char* argc[])."); + else + skip_device = env_skip_device; + } + } + char * env_disablewarnings_str = std::getenv("KOKKOS_DISABLE_WARNINGS"); + if (env_disablewarnings_str!=nullptr) { + std::string env_str (env_disablewarnings_str); // deep-copies string + for (char& c : env_str) { c = toupper (c); } + if ((env_str == "TRUE") || (env_str == "ON") || (env_str == "1")) + disable_warnings = true; + else + if (disable_warnings) + Impl::throw_runtime_exception("Error: expecting a match between --kokkos-disable-warnings and KOKKOS_DISABLE_WARNINGS if both are set. Raised by Kokkos::initialize(int narg, char* argc[])."); + } + InitArguments arguments; arguments.num_threads = num_threads; arguments.num_numa = numa; diff --git a/lib/kokkos/core/src/impl/Kokkos_Utilities.hpp b/lib/kokkos/core/src/impl/Kokkos_Utilities.hpp index 868b31861a..611a32c4fe 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Utilities.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Utilities.hpp @@ -409,6 +409,9 @@ struct inclusive_scan_integer_sequence static constexpr value_type value = helper::value ; }; +template +using identity_t = T; + }} // namespace Kokkos::Impl diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewArray.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewArray.hpp index d4e3a03d38..e1539d10b0 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ViewArray.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_ViewArray.hpp @@ -103,13 +103,7 @@ namespace Impl { /** \brief View mapping for non-specialized data type and standard layout */ template< class Traits > -class ViewMapping< Traits , - typename std::enable_if<( - std::is_same< typename Traits::specialize , Kokkos::Array<> >::value && - ( std::is_same< typename Traits::array_layout , Kokkos::LayoutLeft >::value || - std::is_same< typename Traits::array_layout , Kokkos::LayoutRight >::value || - std::is_same< typename Traits::array_layout , Kokkos::LayoutStride >::value ) - )>::type > +class ViewMapping< Traits , Kokkos::Array<> > { private: @@ -345,64 +339,6 @@ public: } }; -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- -/** \brief Assign compatible default mappings */ - -template< class DstTraits , class SrcTraits > -class ViewMapping< DstTraits , SrcTraits , - typename std::enable_if<( - std::is_same< typename DstTraits::memory_space , typename SrcTraits::memory_space >::value - && - std::is_same< typename DstTraits::specialize , Kokkos::Array<> >::value - && - ( - std::is_same< typename DstTraits::array_layout , Kokkos::LayoutLeft >::value || - std::is_same< typename DstTraits::array_layout , Kokkos::LayoutRight >::value || - std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value - ) - && - std::is_same< typename SrcTraits::specialize , Kokkos::Array<> >::value - && - ( - std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value || - std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value || - std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutStride >::value - ) - )>::type > -{ -public: - - enum { is_assignable = true }; - - typedef Kokkos::Impl::SharedAllocationTracker TrackType ; - typedef ViewMapping< DstTraits , void > DstType ; - typedef ViewMapping< SrcTraits , void > SrcType ; - - KOKKOS_INLINE_FUNCTION - static void assign( DstType & dst , const SrcType & src , const TrackType & src_track ) - { - static_assert( std::is_same< typename DstTraits::value_type , typename SrcTraits::value_type >::value || - std::is_same< typename DstTraits::value_type , typename SrcTraits::const_value_type >::value - , "View assignment must have same value type or const = non-const" ); - - static_assert( ViewDimensionAssignable< typename DstTraits::dimension , typename SrcTraits::dimension >::value - , "View assignment must have compatible dimensions" ); - - static_assert( std::is_same< typename DstTraits::array_layout , typename SrcTraits::array_layout >::value || - std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value || - ( DstTraits::dimension::rank == 0 ) || - ( DstTraits::dimension::rank == 1 && DstTraits::dimension::rank_dynamic == 1 ) - , "View assignment must have compatible layout or have rank <= 1" ); - - typedef typename DstType::offset_type dst_offset_type ; - - dst.m_impl_offset = dst_offset_type( src.m_impl_offset ); - dst.m_impl_handle = src.m_impl_handle ; - dst.m_stride = src.m_stride ; - } -}; - /** \brief Assign Array to non-Array */ template< class DstTraits , class SrcTraits > @@ -436,7 +372,7 @@ public: typedef Kokkos::Impl::SharedAllocationTracker TrackType ; typedef ViewMapping< DstTraits , void > DstType ; - typedef ViewMapping< SrcTraits , void > SrcType ; + typedef ViewMapping< SrcTraits , Kokkos::Array<> > SrcType ; KOKKOS_INLINE_FUNCTION static void assign( DstType & dst , const SrcType & src , const TrackType & src_track ) @@ -480,6 +416,7 @@ public: } }; + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp index bb3bcfd334..773f336281 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp @@ -195,7 +195,7 @@ struct ViewDimension {} KOKKOS_INLINE_FUNCTION - constexpr size_t extent( const unsigned r ) const + constexpr size_t extent( const unsigned r ) const noexcept { return r == 0 ? N0 : ( r == 1 ? N1 : ( @@ -207,6 +207,19 @@ struct ViewDimension r == 7 ? N7 : 0 ))))))); } + static KOKKOS_INLINE_FUNCTION + constexpr size_t static_extent( const unsigned r ) noexcept + { + return r == 0 ? ArgN0 : ( + r == 1 ? ArgN1 : ( + r == 2 ? ArgN2 : ( + r == 3 ? ArgN3 : ( + r == 4 ? ArgN4 : ( + r == 5 ? ArgN5 : ( + r == 6 ? ArgN6 : ( + r == 7 ? ArgN7 : 0 ))))))); + } + template< size_t N > struct prepend { typedef ViewDimension< N , Vals... > type ; }; @@ -2640,6 +2653,12 @@ public: KOKKOS_INLINE_FUNCTION constexpr size_t extent( const iType & r ) const { return m_impl_offset.m_dim.extent(r); } + static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( const unsigned r ) noexcept + { + using dim_type = typename offset_type::dimension_type; + return dim_type::static_extent(r); + } + KOKKOS_INLINE_FUNCTION constexpr typename Traits::array_layout layout() const { return m_impl_offset.layout(); } diff --git a/lib/kokkos/core/unit_test/TestCrs.hpp b/lib/kokkos/core/unit_test/TestCrs.hpp index 77ea508b89..08087ae062 100644 --- a/lib/kokkos/core/unit_test/TestCrs.hpp +++ b/lib/kokkos/core/unit_test/TestCrs.hpp @@ -63,6 +63,86 @@ struct CountFillFunctor { } }; +/* RunUpdateCrsTest + * 4 test cases: + * 1. use member object version which is constructed directly using the copy constructor + * 2. excplicity copy construct in local variable + * 3. construct default and assign to input object + * 4. construct object from views + */ +template< class CrsType, class ExecSpace, class scalarType > +struct RunUpdateCrsTest { + + struct TestOne {}; + struct TestTwo {}; + struct TestThree {}; + struct TestFour {}; + + CrsType graph; + RunUpdateCrsTest( CrsType g_in ) : graph(g_in) + { + } + + void run_test(int nTest) { + switch (nTest) + { + case 1: + parallel_for ("TestCrs1", Kokkos::RangePolicy(0,graph.numRows()),*this); + break; + case 2: + parallel_for ("TestCrs2", Kokkos::RangePolicy(0,graph.numRows()),*this); + break; + case 3: + parallel_for ("TestCrs3", Kokkos::RangePolicy(0,graph.numRows()),*this); + break; + case 4: + parallel_for ("TestCrs4", Kokkos::RangePolicy(0,graph.numRows()),*this); + break; + default: + break; + } + } + + KOKKOS_INLINE_FUNCTION + void updateGraph(const CrsType & g_in, const scalarType row) const { + auto row_map = g_in.row_map; + auto entries = g_in.entries; + auto j_start = row_map(row); + auto j_end = row_map(row+1)-j_start; + for (scalarType j = 0; j < j_end; ++j) { + entries(j_start+j) = (j+1)*(j+1); + } + } + + // Test Crs class from class member + KOKKOS_INLINE_FUNCTION + void operator()(const TestOne &, const scalarType row) const { + updateGraph(graph, row); + } + + // Test Crs class from copy constructor (local_graph(graph) + KOKKOS_INLINE_FUNCTION + void operator()(const TestTwo &, const scalarType row) const { + CrsType local_graph(graph); + updateGraph(local_graph, row); + } + + // Test Crs class from default constructor assigned to function parameter + KOKKOS_INLINE_FUNCTION + void operator()(const TestThree &, const scalarType row) const { + CrsType local_graph; + local_graph = graph; + updateGraph(local_graph, row); + } + + // Test Crs class from local graph constructed from row_map and entities access on input parameter) + KOKKOS_INLINE_FUNCTION + void operator()(const TestFour &, const scalarType row) const { + CrsType local_graph(graph.row_map, graph.entries); + updateGraph(local_graph, row); + } +}; + template< class ExecSpace > void test_count_fill(std::int32_t nrows) { Kokkos::Crs graph; @@ -81,6 +161,38 @@ void test_count_fill(std::int32_t nrows) { } } +// Test Crs Constructor / assignment operation by +// using count and fill to create/populate initial graph, +// then use parallel_for with Crs directly to update content +// then verify results +template< class ExecSpace > +void test_constructor(std::int32_t nrows) { + + for (int nTest = 1; nTest < 5; nTest++) + { + typedef Kokkos::Crs crs_int32; + crs_int32 graph; + Kokkos::count_and_fill_crs(graph, nrows, CountFillFunctor()); + ASSERT_EQ(graph.numRows(), nrows); + + RunUpdateCrsTest crstest(graph); + crstest.run_test(nTest); + + auto row_map = Kokkos::create_mirror_view(graph.row_map); + Kokkos::deep_copy(row_map, graph.row_map); + auto entries = Kokkos::create_mirror_view(graph.entries); + Kokkos::deep_copy(entries, graph.entries); + + for (std::int32_t row = 0; row < nrows; ++row) { + auto n = (row % 4) + 1; + ASSERT_EQ(row_map(row + 1) - row_map(row), n); + for (std::int32_t j = 0; j < n; ++j) { + ASSERT_EQ(entries(row_map(row) + j), (j + 1)*(j+1)); + } + } + } +} + } // anonymous namespace TEST_F( TEST_CATEGORY, crs_count_fill ) @@ -95,4 +207,17 @@ TEST_F( TEST_CATEGORY, crs_count_fill ) test_count_fill(10000); } +TEST_F( TEST_CATEGORY, crs_copy_constructor ) +{ + test_constructor(0); + test_constructor(1); + test_constructor(2); + test_constructor(3); + test_constructor(13); + test_constructor(100); + test_constructor(1000); + test_constructor(10000); +} + + } // namespace Test diff --git a/lib/kokkos/core/unit_test/TestMDRange.hpp b/lib/kokkos/core/unit_test/TestMDRange.hpp index 88b3a9b0c6..a382a20700 100644 --- a/lib/kokkos/core/unit_test/TestMDRange.hpp +++ b/lib/kokkos/core/unit_test/TestMDRange.hpp @@ -956,7 +956,12 @@ struct TestMDRange_3D { } , Kokkos::Min(min) ); - ASSERT_EQ( min, 8.0 ); + if((N0-1)*(N1-1)*(N2-1)>0) + ASSERT_EQ( min, 8.0 ); + else { + double min_identity = Kokkos::reduction_identity::min(); + ASSERT_EQ( min, min_identity ); + } } #endif #endif diff --git a/lib/kokkos/core/unit_test/TestMDRange_d.hpp b/lib/kokkos/core/unit_test/TestMDRange_d.hpp index 1a477a228f..e25213a289 100644 --- a/lib/kokkos/core/unit_test/TestMDRange_d.hpp +++ b/lib/kokkos/core/unit_test/TestMDRange_d.hpp @@ -46,8 +46,10 @@ namespace Test { TEST_F( TEST_CATEGORY , mdrange_3d) { + TestMDRange_3D< TEST_EXECSPACE >::test_for3( 1, 10, 100 ); TestMDRange_3D< TEST_EXECSPACE >::test_for3( 100, 10, 100 ); #if !defined( KOKKOS_ENABLE_ROCM ) // MDRange Reduced explicitly handled in its own cpp file + TestMDRange_3D< TEST_EXECSPACE >::test_reduce3( 1, 10, 100 ); TestMDRange_3D< TEST_EXECSPACE >::test_reduce3( 100, 10, 100 ); #endif } diff --git a/lib/kokkos/core/unit_test/TestRange.hpp b/lib/kokkos/core/unit_test/TestRange.hpp index bc0acfb21d..be878046cb 100644 --- a/lib/kokkos/core/unit_test/TestRange.hpp +++ b/lib/kokkos/core/unit_test/TestRange.hpp @@ -60,8 +60,11 @@ struct TestRange { struct VerifyInitTag {}; struct ResetTag {}; struct VerifyResetTag {}; + struct OffsetTag {}; + struct VerifyOffsetTag {}; - int N; + int N; + static const int offset = 13; TestRange( const size_t N_ ) : m_flags( Kokkos::ViewAllocateWithoutInitializing( "flags" ), N_ ), N(N_) {} @@ -117,6 +120,18 @@ struct TestRange { if ( int( 2 * i ) != host_flags( i ) ) ++error_count; } ASSERT_EQ( error_count, int( 0 ) ); + + Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace, ScheduleType, OffsetTag >( offset, N + offset ), *this ); + Kokkos::parallel_for( std::string("TestKernelFor"), Kokkos::RangePolicy( 0, N ), *this); + + Kokkos::deep_copy(host_flags, m_flags); + + error_count = 0; + for (int i = 0; i < N; ++i) { + if (i + offset != host_flags(i)) + ++error_count; + } + ASSERT_EQ(error_count, int(0)); } KOKKOS_INLINE_FUNCTION @@ -144,9 +159,19 @@ struct TestRange { } } - //---------------------------------------- + KOKKOS_INLINE_FUNCTION + void operator()(const OffsetTag &, const int i) const { + m_flags(i - offset) = i; + } - struct OffsetTag {}; + KOKKOS_INLINE_FUNCTION + void operator()(const VerifyOffsetTag &, const int i) const { + if (i + offset != m_flags(i)) { + printf("TestRange::test_for error at %d != %d\n", i + offset, m_flags(i)); + } + } + + //---------------------------------------- void test_reduce( ) { @@ -158,7 +183,7 @@ struct TestRange { // sum( 0 .. N-1 ) ASSERT_EQ( size_t( ( N - 1 ) * ( N ) / 2 ), size_t( total ) ); - Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace, ScheduleType, OffsetTag>( 0, N ), *this, total ); + Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace, ScheduleType, OffsetTag>( offset, N+offset ), *this, total ); // sum( 1 .. N ) ASSERT_EQ( size_t( ( N ) * ( N + 1 ) / 2 ), size_t( total ) ); } @@ -169,7 +194,7 @@ struct TestRange { KOKKOS_INLINE_FUNCTION void operator()( const OffsetTag &, const int i, value_type & update ) const - { update += 1 + m_flags( i ); } + { update += 1 + m_flags( i-offset ); } //---------------------------------------- diff --git a/lib/kokkos/core/unit_test/TestTeamVector.hpp b/lib/kokkos/core/unit_test/TestTeamVector.hpp index 294247a78d..498d156db3 100644 --- a/lib/kokkos/core/unit_test/TestTeamVector.hpp +++ b/lib/kokkos/core/unit_test/TestTeamVector.hpp @@ -532,7 +532,11 @@ struct functor_vec_single { typedef ExecutionSpace execution_space; Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag; - functor_vec_single( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {} + int nStart; + int nEnd; + + functor_vec_single( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_, const int start_, const int end_ ) : + flag( flag_ ), nStart(start_), nEnd(end_) {} KOKKOS_INLINE_FUNCTION void operator()( typename policy_type::member_type team ) const { @@ -541,7 +545,7 @@ struct functor_vec_single { // inside a parallel_for and write to it. Scalar value = 0; - Kokkos::parallel_for( Kokkos::ThreadVectorRange( team, 0, 13 ), [&] ( int i ) + Kokkos::parallel_for( Kokkos::ThreadVectorRange( team, nStart, nEnd ), [&] ( int i ) { value = i; // This write is violating Kokkos semantics for nested parallelism. }); @@ -552,12 +556,12 @@ struct functor_vec_single { }, value ); Scalar value2 = 0; - Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( team, 0, 13 ), [&] ( int i, Scalar & val ) + Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( team, nStart, nEnd ), [&] ( int i, Scalar & val ) { val += value; }, value2 ); - if ( value2 != ( value * 13 ) ) { + if ( value2 != ( value * (nEnd-nStart) ) ) { printf( "FAILED vector_single broadcast %i %i %f %f\n", team.league_rank(), team.team_rank(), (double) value2, (double) value ); @@ -746,12 +750,6 @@ bool test_scalar( int nteams, int team_size, int test ) { functor_vec_red< Scalar, ExecutionSpace >( d_flag ) ); } else if ( test == 1 ) { - // WORKAROUND CUDA - #if defined(KOKKOS_ENABLE_CUDA) - #if defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) || defined(KOKKOS_ARCH_PASCAL) - if(!std::is_same::value) - #endif - #endif Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ), functor_vec_red_reducer< Scalar, ExecutionSpace >( d_flag ) ); } @@ -765,7 +763,7 @@ bool test_scalar( int nteams, int team_size, int test ) { } else if ( test == 4 ) { Kokkos::parallel_for( "B", Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ), - functor_vec_single< Scalar, ExecutionSpace >( d_flag ) ); + functor_vec_single< Scalar, ExecutionSpace >( d_flag, 0, 13 ) ); } else if ( test == 5 ) { Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size ), @@ -791,6 +789,10 @@ bool test_scalar( int nteams, int team_size, int test ) { Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ), functor_team_vector_reduce_reducer< Scalar, ExecutionSpace >( d_flag ) ); } + else if ( test == 11 ) { + Kokkos::parallel_for( "B", Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ), + functor_vec_single< Scalar, ExecutionSpace >( d_flag, 4, 13 ) ); + } Kokkos::deep_copy( h_flag, d_flag ); @@ -938,6 +940,7 @@ TEST_F( TEST_CATEGORY, team_vector ) ASSERT_TRUE( ( TestTeamVector::Test< TEST_EXECSPACE >( 8 ) ) ); ASSERT_TRUE( ( TestTeamVector::Test< TEST_EXECSPACE >( 9 ) ) ); ASSERT_TRUE( ( TestTeamVector::Test< TEST_EXECSPACE >( 10 ) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< TEST_EXECSPACE >( 11 ) ) ); } #endif diff --git a/lib/kokkos/core/unit_test/TestViewCopy.hpp b/lib/kokkos/core/unit_test/TestViewCopy.hpp index 7eab9daa11..ddcd0ae5ba 100644 --- a/lib/kokkos/core/unit_test/TestViewCopy.hpp +++ b/lib/kokkos/core/unit_test/TestViewCopy.hpp @@ -56,17 +56,13 @@ struct TestViewCopy { using InExecSpace = ExecSpace; - static void test_view_copy() + static void test_view_copy(const int dim0, const int dim1, const int dim2) { #if defined( KOKKOS_ENABLE_CUDA ) || defined( KOKKOS_ENABLE_ROCM ) // ExecSpace = CudaUVM, CudaHostPinned // This test will fail at runtime with an illegal memory access if something goes wrong // Test 1: deep_copy from host_mirror_space to ExecSpace and ExecSpace back to host_mirror_space { - const int dim0 = 4; - const int dim1 = 2; - const int dim2 = 3; - typedef Kokkos::View Rank4ViewType; Rank4ViewType view_4; view_4 = Rank4ViewType("view_4", dim0, dim1, dim2, dim2); @@ -88,19 +84,21 @@ struct TestViewCopy { // Test 2: deep_copy from Cuda to ExecSpace and ExecSpace back to Cuda { - const int dim0 = 4; - const int dim1 = 2; - const int dim2 = 3; - typedef Kokkos::View Rank4ViewType; Rank4ViewType view_4; view_4 = Rank4ViewType("view_4", dim0, dim1, dim2, dim2); #if defined( KOKKOS_ENABLE_CUDA ) - typedef Kokkos::Cuda space_type; + typedef typename std::conditional< + Kokkos::Impl::MemorySpaceAccess::accessible, + Kokkos::CudaSpace, + InExecSpace>::type space_type; #endif #if defined( KOKKOS_ENABLE_ROCM ) - typedef Kokkos::Experimental::ROCm space_type; + typedef typename std::conditional< + Kokkos::Impl::MemorySpaceAccess::accessible, + Kokkos::ROCmSpace, + InExecSpace>::type space_type; #endif Kokkos::View srcView("srcView", dim2, dim2); @@ -118,10 +116,6 @@ struct TestViewCopy { // Test 3: deep_copy from host_space to ExecSpace and ExecSpace back to host_space { - const int dim0 = 4; - const int dim1 = 2; - const int dim2 = 3; - typedef Kokkos::View Rank4ViewType; Rank4ViewType view_4; view_4 = Rank4ViewType("view_4", dim0, dim1, dim2, dim2); @@ -149,7 +143,41 @@ struct TestViewCopy { TEST_F( TEST_CATEGORY , view_copy_tests ) { //Only include this file to be compiled with CudaUVM and CudaHostPinned - TestViewCopy< TEST_EXECSPACE >::test_view_copy(); + TestViewCopy< TEST_EXECSPACE >::test_view_copy(4,2,3); + TestViewCopy< TEST_EXECSPACE >::test_view_copy(4,2,0); +} + +TEST_F( TEST_CATEGORY , view_copy_degenerated ) { + //Only include this file to be compiled with CudaUVM and CudaHostPinned + Kokkos::View> v_um_def_1; + Kokkos::View> v_um_1( reinterpret_cast(-1), 0 ); + Kokkos::View v_m_def_1; + Kokkos::View v_m_1("v_m_1", 0); + + Kokkos::View> v_um_def_2; + Kokkos::View> v_um_2( reinterpret_cast(-1), 0 ); + Kokkos::View v_m_def_2; + Kokkos::View v_m_2("v_m_2", 0); + + Kokkos::deep_copy(v_um_def_1, v_um_def_2); + Kokkos::deep_copy(v_um_def_1, v_um_2); + Kokkos::deep_copy(v_um_def_1, v_m_def_2); + Kokkos::deep_copy(v_um_def_1, v_m_2); + + Kokkos::deep_copy(v_um_1, v_um_def_2); + Kokkos::deep_copy(v_um_1, v_um_2); + Kokkos::deep_copy(v_um_1, v_m_def_2); + Kokkos::deep_copy(v_um_1, v_m_2); + + Kokkos::deep_copy(v_m_def_1, v_um_def_2); + Kokkos::deep_copy(v_m_def_1, v_um_2); + Kokkos::deep_copy(v_m_def_1, v_m_def_2); + Kokkos::deep_copy(v_m_def_1, v_m_2); + + Kokkos::deep_copy(v_m_1, v_um_def_2); + Kokkos::deep_copy(v_m_1, v_um_2); + Kokkos::deep_copy(v_m_1, v_m_def_2); + Kokkos::deep_copy(v_m_1, v_m_2); } } // namespace Test diff --git a/lib/kokkos/core/unit_test/TestViewMapping_a.hpp b/lib/kokkos/core/unit_test/TestViewMapping_a.hpp index 365531cb6f..03d5e501b9 100644 --- a/lib/kokkos/core/unit_test/TestViewMapping_a.hpp +++ b/lib/kokkos/core/unit_test/TestViewMapping_a.hpp @@ -1245,5 +1245,12 @@ TEST_F( TEST_CATEGORY , view_mapping_operator ) test_view_mapping_operator< TEST_EXECSPACE >(); } +TEST_F( TEST_CATEGORY , static_extent ) +{ + using T = Kokkos::View; + ASSERT_EQ( T::static_extent(1), 2 ); + ASSERT_EQ( T::static_extent(2), 3 ); +} + } diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp index f7bfdc6787..e871b3c0c0 100644 --- a/lib/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp +++ b/lib/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp @@ -228,6 +228,10 @@ TEST_F( cuda, uvm ) } } +/* Removing UVM Allocs Test due to added time to complete overall unit test + * The issue verified with this unit test appears to no longer be an + * problem. Refer to github issue 1880 for more details + * TEST_F( cuda, uvm_num_allocs ) { // The max number of UVM allocations allowed is 65536. @@ -288,6 +292,7 @@ TEST_F( cuda, uvm_num_allocs ) #undef MAX_NUM_ALLOCS } +*/ template< class MemSpace, class ExecSpace > struct TestViewCudaAccessible { diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewAPI_e.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewAPI_e.cpp index 4249b58e82..2e8134aac4 100644 --- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewAPI_e.cpp +++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewAPI_e.cpp @@ -43,3 +43,4 @@ #include #include +#include diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_ViewAPI_e.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_ViewAPI_e.cpp index 9f0e765aba..5082729789 100644 --- a/lib/kokkos/core/unit_test/serial/TestSerial_ViewAPI_e.cpp +++ b/lib/kokkos/core/unit_test/serial/TestSerial_ViewAPI_e.cpp @@ -43,3 +43,5 @@ #include #include +#include + diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_ViewAPI_e.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_ViewAPI_e.cpp index 2d9b17bc3e..616a923496 100644 --- a/lib/kokkos/core/unit_test/threads/TestThreads_ViewAPI_e.cpp +++ b/lib/kokkos/core/unit_test/threads/TestThreads_ViewAPI_e.cpp @@ -43,3 +43,4 @@ #include #include +#include diff --git a/lib/kokkos/generate_makefile.bash b/lib/kokkos/generate_makefile.bash index 34be03f980..f3c4f16238 100755 --- a/lib/kokkos/generate_makefile.bash +++ b/lib/kokkos/generate_makefile.bash @@ -68,6 +68,9 @@ do --cxxflags*) CXXFLAGS="${key#*=}" ;; + --cxxstandard*) + KOKKOS_CXX_STANDARD="${key#*=}" + ;; --ldflags*) LDFLAGS="${key#*=}" ;; @@ -127,6 +130,7 @@ do echo "--arch=[OPT]: Set target architectures. Options are:" echo " [AMD]" echo " AMDAVX = AMD CPU" + echo " EPYC = AMD EPYC Zen-Core CPU" echo " [ARM]" echo " ARMv80 = ARMv8.0 Compatible CPU" echo " ARMv81 = ARMv8.1 Compatible CPU" @@ -165,6 +169,8 @@ do echo " build. This will still set certain required" echo " flags via KOKKOS_CXXFLAGS (such as -fopenmp," echo " --std=c++11, etc.)." + echo "--cxxstandard=[FLAGS] Overwrite KOKKOS_CXX_STANDARD for library build and test" + echo " c++11 (default), c++14, c++17, c++1y, c++1z, c++2a" echo "--ldflags=[FLAGS] Overwrite LDFLAGS for library build and test" echo " build. This will still set certain required" echo " flags via KOKKOS_LDFLAGS (such as -fopenmp," @@ -243,6 +249,10 @@ if [ ${#CXXFLAGS} -gt 0 ]; then KOKKOS_SETTINGS="${KOKKOS_SETTINGS} CXXFLAGS=\"${CXXFLAGS}\"" fi +if [ ${#KOKKOS_CXX_STANDARD} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_CXX_STANDARD=\"${KOKKOS_CXX_STANDARD}\"" +fi + if [ ${#LDFLAGS} -gt 0 ]; then KOKKOS_SETTINGS="${KOKKOS_SETTINGS} LDFLAGS=\"${LDFLAGS}\"" fi diff --git a/lib/kokkos/scripts/testing_scripts/test_all_sandia b/lib/kokkos/scripts/testing_scripts/test_all_sandia index d1424ade81..d34d04b7ce 100755 --- a/lib/kokkos/scripts/testing_scripts/test_all_sandia +++ b/lib/kokkos/scripts/testing_scripts/test_all_sandia @@ -88,6 +88,8 @@ CXX_FLAGS_EXTRA="" LD_FLAGS_EXTRA="" KOKKOS_OPTIONS="" +CXX_STANDARD="c++11" + # # Handle arguments. # @@ -142,6 +144,9 @@ do --cxxflags-extra*) CXX_FLAGS_EXTRA="${key#*=}" ;; + --cxxstandard*) + CXX_STANDARD="${key#*=}" + ;; --ldflags-extra*) LD_FLAGS_EXTRA="${key#*=}" ;; @@ -227,18 +232,30 @@ elif [ "$MACHINE" = "white" ]; then export SLURM_TASKS_PER_NODE=32 BASE_MODULE_LIST="/" - IBM_MODULE_LIST="/xl/" + IBM_MODULE_LIST="/xl/,gcc/7.2.0" CUDA_MODULE_LIST="/,gcc/7.2.0,ibm/xl/16.1.0" + CUDA10_MODULE_LIST="/,gcc/7.4.0,ibm/xl/16.1.0" # Don't do pthread on white. GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/6.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/7.2.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "ibm/16.1.0 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS" - "cuda/9.2.88 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) + if [ "$SPOT_CHECK" = "True" ]; then + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/6.4.0 $BASE_MODULE_LIST "OpenMP_Serial" g++ $GCC_WARNING_FLAGS" + "gcc/7.2.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "ibm/16.1.0 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS" + "cuda/9.2.88 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + ) + else + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/6.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/7.2.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "ibm/16.1.0 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS" + "ibm/16.1.1 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS" + "cuda/9.2.88 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/10.0.130 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + ) + fi if [ -z "$ARCH_FLAG" ]; then ARCH_FLAG="--arch=Power8,Kepler37" @@ -323,6 +340,7 @@ elif [ "$MACHINE" = "apollo" ]; then BASE_MODULE_LIST="sems-env,kokkos-env,sems-/,kokkos-hwloc/1.10.1/base" CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-/,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base" CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-/,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base" + CUDA10_MODULE_LIST="sems-env,kokkos-env,/,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base" CLANG_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,/,cuda/9.0.69" CLANG7_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,/,cuda/9.1" @@ -344,6 +362,7 @@ elif [ "$MACHINE" = "apollo" ]; then else # Format: (compiler module-list build-list exe-name warning-flag) COMPILERS=("cuda/9.1 $CUDA8_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/10.0 $CUDA10_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "clang/6.0 $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS" "clang/7.0 $CLANG7_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS" "clang/3.9.0 $CLANG_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS" @@ -629,6 +648,8 @@ single_build_and_test() { local cxxflags="${cxxflags} ${CXX_FLAGS_EXTRA}" local ldflags="${ldflags} ${LD_FLAGS_EXTRA}" + local cxx_standard="${CXX_STANDARD}" + if [[ "$KOKKOS_CUDA_OPTIONS" != "" ]]; then local extra_args="$extra_args $KOKKOS_CUDA_OPTIONS" fi @@ -650,7 +671,7 @@ single_build_and_test() { run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; } fi else - run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --ldflags=\"$ldflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } + run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } local -i build_start_time=$(date +%s) run_cmd make -j 48 build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; } local -i build_end_time=$(date +%s)