diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000000..fdee6325d0 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,21 @@ +# This file contains file patterns that trigger automatic
+# code review requests from users that are owners of these files
+# Order matters; the last match has the highest precedence
+
+# library folders
+lib/colvars/* @giacomofiorin
+lib/compress/* @akohlmey
+lib/kokkos/* @stanmoore1
+lib/molfile/* @akohlmey
+lib/qmmm/* @akohlmey
+lib/vtk/* @rbberger
+
+# packages
+src/KOKKOS @stanmoore1
+src/USER-CGSDK @akohlmey
+src/USER-COLVARS @giacomofiorin
+src/USER-OMP @akohlmey
+src/USER-QMMM @akohlmey
+
+# tools
+tools/msi2lmp/* @akohlmey diff --git a/.gitignore b/.gitignore index 74e511515e..50b970249a 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,11 @@ log.cite .Trashes ehthumbs.db Thumbs.db +
+#cmake
+/build*
+/CMakeCache.txt
+/CMakeFiles/
+/Makefile
+/cmake_install.cmake
+/lmp diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt new file mode 100644 index 0000000000..76c28fcb72 --- /dev/null +++ b/cmake/CMakeLists.txt @@ -0,0 +1,547 @@ +########################################
+# CMake build system
+# This file is part of LAMMPS
+# Created by Christoph Junghans and Richard Berger
+cmake_minimum_required(VERSION 3.1)
+
+project(lammps)
+set(SOVERSION 0)
+set(LAMMPS_SOURCE_DIR ${CMAKE_SOURCE_DIR}/../src)
+set(LAMMPS_LIB_SOURCE_DIR ${CMAKE_SOURCE_DIR}/../lib)
+set(LAMMPS_LIB_BINARY_DIR ${CMAKE_BINARY_DIR}/lib)
+
+# To avoid conflicts with the old Makefile build system, we build everything here
+file(GLOB LIB_SOURCES ${LAMMPS_SOURCE_DIR}/*.cpp)
+file(GLOB LMP_SOURCES ${LAMMPS_SOURCE_DIR}/main.cpp)
+list(REMOVE_ITEM LIB_SOURCES ${LMP_SOURCES})
+
+# CMake modules/macros are in a subdirectory to keep this file cleaner
+set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/Modules)
+
+if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CXX_FLAGS)
+  #release comes with -O3 by default
+  set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." FORCE)
+endif(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CXX_FLAGS)
+
+foreach(STYLE_FILE style_angle.h style_atom.h style_body.h style_bond.h style_command.h style_compute.h style_dihedral.h style_dump.h
+  style_fix.h style_improper.h style_integrate.h style_kspace.h style_minimize.h style_nbin.h style_npair.h style_nstencil.h
+  style_ntopo.h style_pair.h style_reader.h style_region.h)
+  if(EXISTS ${LAMMPS_SOURCE_DIR}/${STYLE_FILE})
+    message(FATAL_ERROR "There is a ${STYLE_FILE} in ${LAMMPS_SOURCE_DIR}, please clean up the source directory first")
+  endif()
+endforeach()
+
+enable_language(CXX)
+
+######################################################################
+# compiler tests
+# these need to be done early (before further tests).
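For illustration only (not part of this patch), the CheckCCompilerFlag module included below is typically used as follows; the flag and result variable here are made-up examples:

```
# probe whether the C compiler accepts a given flag;
# the result is stored in a cache variable
include(CheckCCompilerFlag)
check_c_compiler_flag("-ffast-math" COMPILER_SUPPORTS_FFAST_MATH)
if(COMPILER_SUPPORTS_FFAST_MATH)
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffast-math")
endif()
```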
+
#####################################################################
+include(CheckCCompilerFlag)
+
+########################################################################
+# User input options #
+########################################################################
+option(BUILD_SHARED_LIBS "Build shared libs" OFF)
+option(INSTALL_LIB "Install lammps library and header" ON)
+include(GNUInstallDirs)
+
+set(LAMMPS_LINK_LIBS)
+option(ENABLE_MPI "Build MPI version" OFF)
+if(ENABLE_MPI)
+  find_package(MPI REQUIRED)
+  include_directories(${MPI_C_INCLUDE_PATH})
+  list(APPEND LAMMPS_LINK_LIBS ${MPI_CXX_LIBRARIES})
+  option(LAMMPS_LONGLONG_TO_LONG "Workaround if your system or MPI version does not recognize 'long long' data types" OFF)
+  if(LAMMPS_LONGLONG_TO_LONG)
+    add_definitions(-DLAMMPS_LONGLONG_TO_LONG)
+  endif()
+else()
+  file(GLOB MPI_SOURCES ${LAMMPS_SOURCE_DIR}/STUBS/mpi.c)
+  list(APPEND LIB_SOURCES ${MPI_SOURCES})
+  include_directories(${LAMMPS_SOURCE_DIR}/STUBS)
+endif()
+
+set(LAMMPS_SIZE_LIMIT "LAMMPS_SMALLBIG" CACHE STRING "Lammps size limit")
+set_property(CACHE LAMMPS_SIZE_LIMIT PROPERTY STRINGS LAMMPS_SMALLBIG LAMMPS_BIGBIG LAMMPS_SMALLSMALL)
+add_definitions(-D${LAMMPS_SIZE_LIMIT})
+
+set(LAMMPS_MEMALIGN "64" CACHE STRING "enables the use of the posix_memalign() call instead of malloc() when large chunks of memory are allocated by LAMMPS")
+add_definitions(-DLAMMPS_MEMALIGN=${LAMMPS_MEMALIGN})
+
+option(LAMMPS_EXCEPTIONS "enable the use of C++ exceptions for error messages (useful for library interface)" OFF)
+if(LAMMPS_EXCEPTIONS)
+  add_definitions(-DLAMMPS_EXCEPTIONS)
+endif()
+
+option(CMAKE_VERBOSE_MAKEFILE "Verbose makefile" OFF)
+
+option(ENABLE_TESTING "Enable testing" OFF)
+if(ENABLE_TESTING)
+  enable_testing()
+endif(ENABLE_TESTING)
+
+option(ENABLE_ALL "Build all default packages" OFF)
+set(DEFAULT_PACKAGES ASPHERE BODY CLASS2 COLLOID COMPRESS CORESHELL DIPOLE GRANULAR
+  KSPACE MANYBODY MC MEAM MISC MOLECULE PERI QEQ
+  REAX REPLICA RIGID SHOCK SNAP SRD)
+set(OTHER_PACKAGES KIM PYTHON MSCG MPIIO VORONOI POEMS
+  USER-ATC USER-AWPMD USER-CGDNA
+  USER-CGSDK USER-COLVARS USER-DIFFRACTION USER-DPD USER-DRUDE USER-EFF
+  USER-FEP USER-H5MD USER-LB USER-MANIFOLD USER-MEAMC USER-MGPT USER-MISC
+  USER-MOLFILE USER-NETCDF USER-PHONON USER-QTB USER-REAXC USER-SMD
+  USER-SMTBQ USER-SPH USER-TALLY USER-VTK USER-QUIP USER-QMMM)
+set(ACCEL_PACKAGES USER-OMP KOKKOS OPT USER-INTEL GPU)
+foreach(PKG ${DEFAULT_PACKAGES})
+  option(ENABLE_${PKG} "Build ${PKG} Package" ${ENABLE_ALL})
+endforeach()
+foreach(PKG ${ACCEL_PACKAGES} ${OTHER_PACKAGES})
+  option(ENABLE_${PKG} "Build ${PKG} Package" OFF)
+endforeach()
+
+macro(pkg_depends PKG1 PKG2)
+  if(ENABLE_${PKG1} AND NOT ENABLE_${PKG2})
+    message(FATAL_ERROR "${PKG1} package needs LAMMPS to be built with ${PKG2}")
+  endif()
+endmacro()
+
+pkg_depends(MPIIO MPI)
+pkg_depends(QEQ MANYBODY)
+pkg_depends(USER-ATC MANYBODY)
+pkg_depends(USER-H5MD MPI)
+pkg_depends(USER-LB MPI)
+pkg_depends(USER-MISC MANYBODY)
+pkg_depends(USER-PHONON KSPACE)
+
+if(ENABLE_BODY AND ENABLE_POEMS)
+  message(FATAL_ERROR "BODY and POEMS cannot be enabled at the same time")
+endif()
+
+######################################################
+# packages with special compiler needs or external libs
+######################################################
+if(ENABLE_REAX OR ENABLE_MEAM OR ENABLE_USER-QUIP OR ENABLE_USER-QMMM)
+  enable_language(Fortran)
+endif()
+
+if(ENABLE_KOKKOS OR ENABLE_MSCG)
+  # starting with CMake 3.1 this is all you have to do to enforce C++11
+ 
set(CMAKE_CXX_STANDARD 11) # C++11...
+  set(CMAKE_CXX_STANDARD_REQUIRED ON) #...is required...
+  set(CMAKE_CXX_EXTENSIONS OFF) #...without compiler extensions like gnu++11
+endif()
+
+if(ENABLE_USER-OMP OR ENABLE_KOKKOS OR ENABLE_USER-INTEL)
+  find_package(OpenMP REQUIRED)
+  set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+endif()
+
+if(ENABLE_KSPACE)
+  set(FFT "KISSFFT" CACHE STRING "FFT library for KSPACE package")
+  set_property(CACHE FFT PROPERTY STRINGS KISSFFT FFTW3 MKL FFTW2)
+  if(NOT FFT STREQUAL "KISSFFT")
+    find_package(${FFT} REQUIRED)
+    add_definitions(-DFFT_${FFT})
+    include_directories(${${FFT}_INCLUDE_DIRS})
+    list(APPEND LAMMPS_LINK_LIBS ${${FFT}_LIBRARIES})
+  endif()
+  set(PACK_OPTIMIZATION "PACK_ARRAY" CACHE STRING "Optimization for FFT")
+  set_property(CACHE PACK_OPTIMIZATION PROPERTY STRINGS PACK_ARRAY PACK_POINTER PACK_MEMCPY)
+  if(NOT PACK_OPTIMIZATION STREQUAL "PACK_ARRAY")
+    add_definitions(-D${PACK_OPTIMIZATION})
+  endif()
+endif()
+
+if(ENABLE_MISC)
+  option(LAMMPS_XDR "include XDR compatibility files for doing particle dumps in XTC format" OFF)
+  if(LAMMPS_XDR)
+    add_definitions(-DLAMMPS_XDR)
+  endif()
+endif()
+
+if(ENABLE_MSCG OR ENABLE_USER-ATC OR ENABLE_USER-AWPMD OR ENABLE_USER-QUIP)
+  find_package(LAPACK)
+  if(LAPACK_FOUND)
+    list(APPEND LAMMPS_LINK_LIBS ${LAPACK_LIBRARIES})
+  else()
+    enable_language(Fortran)
+    file(GLOB LAPACK_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/linalg/*.f)
+    list(APPEND LIB_SOURCES ${LAPACK_SOURCES})
+  endif()
+endif()
+
+if(ENABLE_PYTHON)
+  find_package(PythonInterp REQUIRED)
+  find_package(PythonLibs REQUIRED)
+  add_definitions(-DLMP_PYTHON)
+  include_directories(${PYTHON_INCLUDE_DIR})
+  list(APPEND LAMMPS_LINK_LIBS ${PYTHON_LIBRARY})
+  if(NOT PYTHON_INSTDIR)
+    execute_process(COMMAND ${PYTHON_EXECUTABLE}
+      -c "import distutils.sysconfig as cg; print(cg.get_python_lib(1,0,prefix='${CMAKE_INSTALL_PREFIX}'))"
+      OUTPUT_VARIABLE PYTHON_INSTDIR OUTPUT_STRIP_TRAILING_WHITESPACE)
+  endif()
+  install(FILES ${CMAKE_SOURCE_DIR}/../python/lammps.py DESTINATION ${PYTHON_INSTDIR})
+  if(NOT BUILD_SHARED_LIBS)
+    message(FATAL_ERROR "Python package needs LAMMPS to be built shared, use -DBUILD_SHARED_LIBS=ON")
+  endif()
+endif()
+
+find_package(JPEG)
+if(JPEG_FOUND)
+  add_definitions(-DLAMMPS_JPEG)
+  include_directories(${JPEG_INCLUDE_DIR})
+  list(APPEND LAMMPS_LINK_LIBS ${JPEG_LIBRARIES})
+endif()
+
+find_package(PNG)
+find_package(ZLIB)
+if(PNG_FOUND AND ZLIB_FOUND)
+  include_directories(${PNG_INCLUDE_DIRS} ${ZLIB_INCLUDE_DIRS})
+  list(APPEND LAMMPS_LINK_LIBS ${PNG_LIBRARIES} ${ZLIB_LIBRARIES})
+  add_definitions(-DLAMMPS_PNG)
+endif()
+
+find_program(GZIP_EXECUTABLE gzip)
+find_package_handle_standard_args(GZIP REQUIRED_VARS GZIP_EXECUTABLE)
+if(GZIP_FOUND)
+  add_definitions(-DLAMMPS_GZIP)
+endif()
+
+find_program(FFMPEG_EXECUTABLE ffmpeg)
+find_package_handle_standard_args(FFMPEG REQUIRED_VARS FFMPEG_EXECUTABLE)
+if(FFMPEG_FOUND)
+  add_definitions(-DLAMMPS_FFMPEG)
+endif()
+
+if(ENABLE_VORONOI)
+  find_package(VORO REQUIRED) #some distros
+  include_directories(${VORO_INCLUDE_DIRS})
+  list(APPEND LAMMPS_LINK_LIBS ${VORO_LIBRARIES})
+endif()
+
+if(ENABLE_USER-MOLFILE)
+  list(APPEND LAMMPS_LINK_LIBS ${CMAKE_DL_LIBS})
+endif()
+
+if(ENABLE_USER-NETCDF)
+  find_package(NetCDF REQUIRED)
+  include_directories(${NETCDF_INCLUDE_DIR})
+  list(APPEND LAMMPS_LINK_LIBS ${NETCDF_LIBRARY})
+  add_definitions(-DLMP_HAS_NETCDF -DNC_64BIT_DATA=0x0020)
+endif()
+
+if(ENABLE_USER-SMD)
+  find_package(Eigen3 
REQUIRED)
+  include_directories(${EIGEN3_INCLUDE_DIR})
+endif()
+
+if(ENABLE_USER-QUIP)
+  find_package(QUIP REQUIRED)
+  list(APPEND LAMMPS_LINK_LIBS ${QUIP_LIBRARIES} ${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES})
+endif()
+
+if(ENABLE_USER-QMMM)
+  find_package(QE REQUIRED)
+  include_directories(${QE_INCLUDE_DIRS})
+  list(APPEND LAMMPS_LINK_LIBS ${QE_LIBRARIES} ${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES})
+endif()
+
+if(ENABLE_USER-AWPMD)
+  include_directories(${LAMMPS_LIB_SOURCE_DIR}/awpmd/systems/interact
+    ${LAMMPS_LIB_SOURCE_DIR}/awpmd/ivutils/include)
+endif()
+
+if(ENABLE_USER-H5MD)
+  find_package(HDF5 REQUIRED)
+  list(APPEND LAMMPS_LINK_LIBS ${HDF5_LIBRARIES})
+  include_directories(${HDF5_INCLUDE_DIRS} ${LAMMPS_LIB_SOURCE_DIR}/h5md/include)
+endif()
+
+if(ENABLE_USER-VTK)
+  find_package(VTK REQUIRED NO_MODULE)
+  include(${VTK_USE_FILE})
+  add_definitions(-DLAMMPS_VTK)
+  list(APPEND LAMMPS_LINK_LIBS ${VTK_LIBRARIES})
+endif()
+
+if(ENABLE_KIM)
+  find_package(KIM REQUIRED)
+  list(APPEND LAMMPS_LINK_LIBS ${KIM_LIBRARIES})
+  include_directories(${KIM_INCLUDE_DIRS})
+endif()
+
+if(ENABLE_MSCG)
+  find_package(GSL REQUIRED)
+  set(LAMMPS_LIB_MSCG_BIN_DIR ${LAMMPS_LIB_BINARY_DIR}/mscg)
+  set(MSCG_TARBALL ${LAMMPS_LIB_MSCG_BIN_DIR}/MS-CG-master.zip)
+  set(LAMMPS_LIB_MSCG_BIN_DIR ${LAMMPS_LIB_MSCG_BIN_DIR}/MSCG-release-master/src)
+  if(NOT EXISTS ${LAMMPS_LIB_MSCG_BIN_DIR})
+    if(NOT EXISTS ${MSCG_TARBALL})
+      message(STATUS "Downloading ${MSCG_TARBALL}")
+      file(DOWNLOAD
+        https://github.com/uchicago-voth/MSCG-release/archive/master.zip
+        ${MSCG_TARBALL} SHOW_PROGRESS) #EXPECTED_MD5 cannot be used since master is a moving target
+    endif()
+    message(STATUS "Unpacking ${MSCG_TARBALL}")
+    execute_process(COMMAND ${CMAKE_COMMAND} -E tar xvf ${MSCG_TARBALL}
+      WORKING_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/mscg)
+  endif()
+  file(GLOB MSCG_SOURCES ${LAMMPS_LIB_MSCG_BIN_DIR}/*.cpp)
+  list(APPEND LIB_SOURCES ${MSCG_SOURCES})
+  foreach(MSCG_SOURCE ${MSCG_SOURCES})
+    set_property(SOURCE ${MSCG_SOURCE} APPEND PROPERTY COMPILE_DEFINITIONS
+      DIMENSION=3 _exclude_gromacs=1)
+  endforeach()
+  include_directories(${LAMMPS_LIB_MSCG_BIN_DIR} ${GSL_INCLUDE_DIRS})
+  list(APPEND LAMMPS_LINK_LIBS ${GSL_LIBRARIES})
+endif()
+
+########################################################################
+# Basic system tests (standard libraries, headers, functions, types) #
+########################################################################
+include(CheckIncludeFile)
+foreach(HEADER math.h)
+  check_include_file(${HEADER} FOUND_${HEADER})
+  if(NOT FOUND_${HEADER})
+    message(FATAL_ERROR "Could not find needed header - ${HEADER}")
+  endif(NOT FOUND_${HEADER})
+endforeach(HEADER)
+
+set(MATH_LIBRARIES "m" CACHE STRING "math library")
+mark_as_advanced( MATH_LIBRARIES )
+include(CheckLibraryExists)
+foreach(FUNC sin cos)
+  check_library_exists(${MATH_LIBRARIES} ${FUNC} "" FOUND_${FUNC}_${MATH_LIBRARIES})
+  if(NOT FOUND_${FUNC}_${MATH_LIBRARIES})
+    message(FATAL_ERROR "Could not find needed math function - ${FUNC}")
+  endif(NOT FOUND_${FUNC}_${MATH_LIBRARIES})
+endforeach(FUNC)
+list(APPEND LAMMPS_LINK_LIBS ${MATH_LIBRARIES})
+
+######################################
+# Generate Basic Style files
+######################################
+include(StyleHeaderUtils)
+RegisterStyles(${LAMMPS_SOURCE_DIR})
+
+##############################################
+# add sources of enabled packages
+############################################
+foreach(PKG ${DEFAULT_PACKAGES} ${OTHER_PACKAGES})
+  if(ENABLE_${PKG})
+    set(${PKG}_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/${PKG})
+
+ 
# detects styles in package and adds them to global list + RegisterStyles(${${PKG}_SOURCES_DIR}) + + file(GLOB ${PKG}_SOURCES ${${PKG}_SOURCES_DIR}/*.cpp) + list(APPEND LIB_SOURCES ${${PKG}_SOURCES}) + include_directories(${${PKG}_SOURCES_DIR}) + endif() +endforeach() + +############################################## +# add lib sources of (simple) enabled packages +############################################ +foreach(SIMPLE_LIB REAX MEAM POEMS USER-ATC USER-AWPMD USER-COLVARS USER-H5MD + USER-MOLFILE USER-QMMM) + if(ENABLE_${SIMPLE_LIB}) + string(REGEX REPLACE "^USER-" "" SIMPLE_LIB "${SIMPLE_LIB}") + string(TOLOWER "${SIMPLE_LIB}" INC_DIR) + file(GLOB_RECURSE ${SIMPLE_LIB}_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/${INC_DIR}/*.F + ${LAMMPS_LIB_SOURCE_DIR}/${INC_DIR}/*.c ${LAMMPS_LIB_SOURCE_DIR}/${INC_DIR}/*.cpp) + list(APPEND LIB_SOURCES ${${SIMPLE_LIB}_SOURCES}) + include_directories(${LAMMPS_LIB_SOURCE_DIR}/${INC_DIR}) + endif() +endforeach() + +###################################################################### +# packages which selectively include variants based on enabled styles +# e.g. accelerator packages +###################################################################### +if(ENABLE_USER-OMP) + set(USER-OMP_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/USER-OMP) + set(USER-OMP_SOURCES ${USER-OMP_SOURCES_DIR}/thr_data.cpp + ${USER-OMP_SOURCES_DIR}/thr_omp.cpp + ${USER-OMP_SOURCES_DIR}/fix_nh_omp.cpp + ${USER-OMP_SOURCES_DIR}/fix_nh_sphere_omp.cpp) + set_property(GLOBAL PROPERTY "OMP_SOURCES" "${USER-OMP_SOURCES}") + + # detects styles which have USER-OMP version + RegisterStylesExt(${USER-OMP_SOURCES_DIR} omp OMP_SOURCES) + + get_property(USER-OMP_SOURCES GLOBAL PROPERTY OMP_SOURCES) + + list(APPEND LIB_SOURCES ${USER-OMP_SOURCES}) + include_directories(${USER-OMP_SOURCES_DIR}) +endif() + +if(ENABLE_KOKKOS) + set(LAMMPS_LIB_KOKKOS_SRC_DIR ${LAMMPS_LIB_SOURCE_DIR}/kokkos) + set(LAMMPS_LIB_KOKKOS_BIN_DIR ${LAMMPS_LIB_BINARY_DIR}/kokkos) + add_definitions(-DLMP_KOKKOS) + add_subdirectory(${LAMMPS_LIB_KOKKOS_SRC_DIR} ${LAMMPS_LIB_KOKKOS_BIN_DIR}) + + set(Kokkos_INCLUDE_DIRS ${LAMMPS_LIB_KOKKOS_SRC_DIR}/core/src + ${LAMMPS_LIB_KOKKOS_SRC_DIR}/containers/src + ${LAMMPS_LIB_KOKKOS_SRC_DIR}/algorithms/src + ${LAMMPS_LIB_KOKKOS_BIN_DIR}) + include_directories(${Kokkos_INCLUDE_DIRS}) + list(APPEND LAMMPS_LINK_LIBS kokkos) + + set(KOKKOS_PKG_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/KOKKOS) + set(KOKKOS_PKG_SOURCES ${KOKKOS_PKG_SOURCES_DIR}/kokkos.cpp + ${KOKKOS_PKG_SOURCES_DIR}/atom_kokkos.cpp + ${KOKKOS_PKG_SOURCES_DIR}/atom_vec_kokkos.cpp + ${KOKKOS_PKG_SOURCES_DIR}/comm_kokkos.cpp + ${KOKKOS_PKG_SOURCES_DIR}/comm_tiled_kokkos.cpp + ${KOKKOS_PKG_SOURCES_DIR}/neighbor_kokkos.cpp + ${KOKKOS_PKG_SOURCES_DIR}/neigh_list_kokkos.cpp + ${KOKKOS_PKG_SOURCES_DIR}/neigh_bond_kokkos.cpp + ${KOKKOS_PKG_SOURCES_DIR}/fix_nh_kokkos.cpp + ${KOKKOS_PKG_SOURCES_DIR}/domain_kokkos.cpp + ${KOKKOS_PKG_SOURCES_DIR}/modify_kokkos.cpp) + set_property(GLOBAL PROPERTY "KOKKOS_PKG_SOURCES" "${KOKKOS_PKG_SOURCES}") + + # detects styles which have KOKKOS version + RegisterStylesExt(${KOKKOS_PKG_SOURCES_DIR} kokkos KOKKOS_PKG_SOURCES) + + get_property(KOKKOS_PKG_SOURCES GLOBAL PROPERTY KOKKOS_PKG_SOURCES) + + list(APPEND LIB_SOURCES ${KOKKOS_PKG_SOURCES}) + include_directories(${KOKKOS_PKG_SOURCES_DIR}) +endif() + +if(ENABLE_OPT) + set(OPT_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/OPT) + set(OPT_SOURCES) + set_property(GLOBAL PROPERTY "OPT_SOURCES" "${OPT_SOURCES}") + + # detects styles which have OPT version + RegisterStylesExt(${OPT_SOURCES_DIR} opt 
OPT_SOURCES)
+
+  get_property(OPT_SOURCES GLOBAL PROPERTY OPT_SOURCES)
+
+  list(APPEND LIB_SOURCES ${OPT_SOURCES})
+  include_directories(${OPT_SOURCES_DIR})
+endif()
+
+if(ENABLE_USER-INTEL)
+  set(USER-INTEL_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/USER-INTEL)
+  set(USER-INTEL_SOURCES ${USER-INTEL_SOURCES_DIR}/intel_preprocess.h
+    ${USER-INTEL_SOURCES_DIR}/intel_buffers.h
+    ${USER-INTEL_SOURCES_DIR}/intel_buffers.cpp
+    ${USER-INTEL_SOURCES_DIR}/math_extra_intel.h
+    ${USER-INTEL_SOURCES_DIR}/nbin_intel.h
+    ${USER-INTEL_SOURCES_DIR}/nbin_intel.cpp
+    ${USER-INTEL_SOURCES_DIR}/npair_intel.h
+    ${USER-INTEL_SOURCES_DIR}/npair_intel.cpp
+    ${USER-INTEL_SOURCES_DIR}/intel_simd.h
+    ${USER-INTEL_SOURCES_DIR}/intel_intrinsics.h)
+
+  set_property(GLOBAL PROPERTY "USER-INTEL_SOURCES" "${USER-INTEL_SOURCES}")
+
+  # detects styles which have USER-INTEL version
+  RegisterStylesExt(${USER-INTEL_SOURCES_DIR} intel USER-INTEL_SOURCES)
+
+  get_property(USER-INTEL_SOURCES GLOBAL PROPERTY USER-INTEL_SOURCES)
+
+  list(APPEND LIB_SOURCES ${USER-INTEL_SOURCES})
+  include_directories(${USER-INTEL_SOURCES_DIR})
+endif()
+
+if(ENABLE_GPU)
+  find_package(CUDA REQUIRED)
+  find_program(BIN2C bin2c)
+  if(NOT BIN2C)
+    message(FATAL_ERROR "Couldn't find bin2c, use -DBIN2C to help cmake find it.")
+  endif()
+  include_directories(${CUDA_INCLUDE_DIRS})
+  list(APPEND LAMMPS_LINK_LIBS ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY})
+  set(GPU_PREC "SINGLE_DOUBLE" CACHE STRING "Lammps gpu precision size")
+  set_property(CACHE GPU_PREC PROPERTY STRINGS SINGLE_DOUBLE SINGLE_SINGLE DOUBLE_DOUBLE)
+  add_definitions(-D_${GPU_PREC})
+  add_definitions(-DNV_KERNEL -DUCL_CUDADR)
+  option(CUDPP_OPT "Enable CUDPP_OPT" ON)
+
+  set(GPU_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/GPU)
+  set(GPU_SOURCES ${GPU_SOURCES_DIR}/gpu_extra.h)
+
+  set_property(GLOBAL PROPERTY "GPU_SOURCES" "${GPU_SOURCES}")
+
+  # detects styles which have GPU version
+  RegisterStylesExt(${GPU_SOURCES_DIR} gpu GPU_SOURCES)
+
+  get_property(GPU_SOURCES GLOBAL PROPERTY GPU_SOURCES)
+
+  file(GLOB GPU_LIB_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/gpu/*.cpp)
+  file(GLOB GPU_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/*.cu ${CMAKE_SOURCE_DIR}/gpu/*.cu)
+  file(GLOB_RECURSE GPU_NOT_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_pppm.cu)
+  list(REMOVE_ITEM GPU_LIB_CU ${GPU_NOT_LIB_CU})
+  include_directories(${GPU_SOURCES_DIR} ${LAMMPS_LIB_SOURCE_DIR}/gpu ${LAMMPS_LIB_BINARY_DIR}/gpu)
+  if(CUDPP_OPT)
+    include_directories(${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini)
+    add_definitions(-DCUDPP_OPT)
+    file(GLOB GPU_LIB_CUDPP_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini/*.cpp)
+    file(GLOB GPU_LIB_CUDPP_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini/*.cu)
+  endif()
+  cuda_compile(GPU_OBJS ${GPU_LIB_CU} ${GPU_LIB_CUDPP_CU} OPTIONS $<$<BOOL:${BUILD_SHARED_LIBS}>:-Xcompiler=-fPIC>)
+  file(MAKE_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/gpu)
+  foreach(CU_OBJ ${GPU_OBJS})
+    get_filename_component(CU_NAME ${CU_OBJ} NAME_WE)
+    string(REGEX REPLACE "^.*_lal_" "" CU_NAME "${CU_NAME}")
+    add_custom_command(OUTPUT ${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}_cubin.h
+      COMMAND ${BIN2C} -c -n ${CU_NAME} ${CU_OBJ} > ${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}_cubin.h
+      DEPENDS ${CU_OBJ}
+      COMMENT "Generating ${CU_NAME}_cubin.h")
+    list(APPEND LIB_SOURCES ${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}_cubin.h)
+    if(${CU_NAME} STREQUAL "pppm_d") #pppm_d doesn't get linked into the lib
+      set(CU_FORBIDDEN_OBJ "${CU_OBJ}")
+    endif()
+  endforeach()
+  list(REMOVE_ITEM GPU_OBJS "${CU_FORBIDDEN_OBJ}")
+  list(APPEND LIB_SOURCES ${GPU_SOURCES} ${GPU_LIB_SOURCES} ${GPU_LIB_CUDPP_SOURCES} ${GPU_OBJS})
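As a usage sketch (paths and values are examples only, assuming a CUDA toolkit is installed), the GPU options defined above would be exercised with a configure line such as:

```
cmake ../cmake -DENABLE_GPU=ON -DGPU_PREC=DOUBLE_DOUBLE \
      -DBIN2C=/usr/local/cuda/bin/bin2c
```
+ 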
set_directory_properties(PROPERTIES ADDITIONAL_MAKE_CLEAN_FILES "${LAMMPS_LIB_BINARY_DIR}/gpu/*_cubin.h")
+endif()
+
+######################################################
+# Generate style headers based on global list of
+# styles registered during package selection
+######################################################
+set(LAMMPS_STYLE_HEADERS_DIR ${CMAKE_CURRENT_BINARY_DIR}/styles)
+
+GenerateStyleHeaders(${LAMMPS_STYLE_HEADERS_DIR})
+
+include_directories(${LAMMPS_SOURCE_DIR})
+include_directories(${LAMMPS_STYLE_HEADERS_DIR})
+
+###########################################
+# Actually add executable and lib to build
+############################################
+add_library(lammps ${LIB_SOURCES})
+target_link_libraries(lammps ${LAMMPS_LINK_LIBS})
+set_target_properties(lammps PROPERTIES SOVERSION ${SOVERSION})
+if(INSTALL_LIB)
+  install(TARGETS lammps LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
+  install(FILES ${LAMMPS_SOURCE_DIR}/lammps.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+elseif(BUILD_SHARED_LIBS)
+  message(FATAL_ERROR "The shared library has to be installed, use -DINSTALL_LIB=ON to install the lammps library")
+endif()
+
+add_executable(lmp ${LMP_SOURCES})
+target_link_libraries(lmp lammps)
+install(TARGETS lmp DESTINATION ${CMAKE_INSTALL_BINDIR})
+if(ENABLE_TESTING)
+  add_test(ShowHelp ${CMAKE_CURRENT_BINARY_DIR}/lmp -help)
+endif()
+
+##################################
+# Print package summary
+##################################
+foreach(PKG ${DEFAULT_PACKAGES} ${OTHER_PACKAGES} ${ACCEL_PACKAGES})
+  if(ENABLE_${PKG})
+    message(STATUS "Building package: ${PKG}")
+  endif()
+endforeach() diff --git a/cmake/Modules/FindFFTW2.cmake b/cmake/Modules/FindFFTW2.cmake new file mode 100644 index 0000000000..c77e6cf8e9 --- /dev/null +++ b/cmake/Modules/FindFFTW2.cmake @@ -0,0 +1,22 @@ +# - Find fftw2
+# Find the native FFTW2 headers and libraries.
+#
+# FFTW2_INCLUDE_DIRS - where to find fftw2.h, etc.
+# FFTW2_LIBRARIES - List of libraries when using fftw2.
+# FFTW2_FOUND - True if fftw2 found.
+#
+
+find_path(FFTW2_INCLUDE_DIR fftw.h)
+
+find_library(FFTW2_LIBRARY NAMES fftw)
+
+set(FFTW2_LIBRARIES ${FFTW2_LIBRARY})
+set(FFTW2_INCLUDE_DIRS ${FFTW2_INCLUDE_DIR})
+
+include(FindPackageHandleStandardArgs)
+# handle the QUIETLY and REQUIRED arguments and set FFTW2_FOUND to TRUE
+# if all listed variables are TRUE
+
+find_package_handle_standard_args(FFTW2 DEFAULT_MSG FFTW2_LIBRARY FFTW2_INCLUDE_DIR)
+
+mark_as_advanced(FFTW2_INCLUDE_DIR FFTW2_LIBRARY ) diff --git a/cmake/Modules/FindFFTW3.cmake b/cmake/Modules/FindFFTW3.cmake new file mode 100644 index 0000000000..552bcc4257 --- /dev/null +++ b/cmake/Modules/FindFFTW3.cmake @@ -0,0 +1,25 @@ +# - Find fftw3
+# Find the native FFTW3 headers and libraries.
+#
+# FFTW3_INCLUDE_DIRS - where to find fftw3.h, etc.
+# FFTW3_LIBRARIES - List of libraries when using fftw3.
+# FFTW3_FOUND - True if fftw3 found.
+# + +find_package(PkgConfig) + +pkg_check_modules(PC_FFTW3 fftw3) +find_path(FFTW3_INCLUDE_DIR fftw3.h HINTS ${PC_FFTW3_INCLUDE_DIRS}) + +find_library(FFTW3_LIBRARY NAMES fftw3 HINTS ${PC_FFTW3_LIBRARY_DIRS}) + +set(FFTW3_LIBRARIES ${FFTW3_LIBRARY}) +set(FFTW3_INCLUDE_DIRS ${FFTW3_INCLUDE_DIR}) + +include(FindPackageHandleStandardArgs) +# handle the QUIETLY and REQUIRED arguments and set FFTW3_FOUND to TRUE +# if all listed variables are TRUE + +find_package_handle_standard_args(FFTW3 DEFAULT_MSG FFTW3_LIBRARY FFTW3_INCLUDE_DIR) + +mark_as_advanced(FFTW3_INCLUDE_DIR FFTW3_LIBRARY ) diff --git a/cmake/Modules/FindKIM.cmake b/cmake/Modules/FindKIM.cmake new file mode 100644 index 0000000000..a01f817cf6 --- /dev/null +++ b/cmake/Modules/FindKIM.cmake @@ -0,0 +1,22 @@ +# - Find kim +# Find the native KIM headers and libraries. +# +# KIM_INCLUDE_DIRS - where to find kim.h, etc. +# KIM_LIBRARIES - List of libraries when using kim. +# KIM_FOUND - True if kim found. +# + +find_path(KIM_INCLUDE_DIR KIM_API.h PATH_SUFFIXES kim-api-v1) + +find_library(KIM_LIBRARY NAMES kim-api-v1) + +set(KIM_LIBRARIES ${KIM_LIBRARY}) +set(KIM_INCLUDE_DIRS ${KIM_INCLUDE_DIR}) + +include(FindPackageHandleStandardArgs) +# handle the QUIETLY and REQUIRED arguments and set KIM_FOUND to TRUE +# if all listed variables are TRUE + +find_package_handle_standard_args(KIM DEFAULT_MSG KIM_LIBRARY KIM_INCLUDE_DIR) + +mark_as_advanced(KIM_INCLUDE_DIR KIM_LIBRARY ) diff --git a/cmake/Modules/FindMKL.cmake b/cmake/Modules/FindMKL.cmake new file mode 100644 index 0000000000..4246062103 --- /dev/null +++ b/cmake/Modules/FindMKL.cmake @@ -0,0 +1,22 @@ +# - Find mkl +# Find the native MKL headers and libraries. +# +# MKL_INCLUDE_DIRS - where to find mkl.h, etc. +# MKL_LIBRARIES - List of libraries when using mkl. +# MKL_FOUND - True if mkl found. +# + +find_path(MKL_INCLUDE_DIR mkl_dfti.h HINTS $ENV{MKLROOT}/include) + +find_library(MKL_LIBRARY NAMES mkl_rt HINTS $ENV{MKLROOT}/lib $ENV{MKLROOT}/lib/intel64) + +set(MKL_LIBRARIES ${MKL_LIBRARY}) +set(MKL_INCLUDE_DIRS ${MKL_INCLUDE_DIR}) + +include(FindPackageHandleStandardArgs) +# handle the QUIETLY and REQUIRED arguments and set MKL_FOUND to TRUE +# if all listed variables are TRUE + +find_package_handle_standard_args(MKL DEFAULT_MSG MKL_LIBRARY MKL_INCLUDE_DIR) + +mark_as_advanced(MKL_INCLUDE_DIR MKL_LIBRARY ) diff --git a/cmake/Modules/FindNetCDF.cmake b/cmake/Modules/FindNetCDF.cmake new file mode 100644 index 0000000000..a28c959acf --- /dev/null +++ b/cmake/Modules/FindNetCDF.cmake @@ -0,0 +1,118 @@ +# - Find NetCDF +# Find the native NetCDF includes and library +# +# NETCDF_INCLUDE_DIR - user modifiable choice of where netcdf headers are +# NETCDF_LIBRARY - user modifiable choice of where netcdf libraries are +# +# Your package can require certain interfaces to be FOUND by setting these +# +# NETCDF_CXX - require the C++ interface and link the C++ library +# NETCDF_F77 - require the F77 interface and link the fortran library +# NETCDF_F90 - require the F90 interface and link the fortran library +# +# Or equivalently by calling FindNetCDF with a COMPONENTS argument containing one or +# more of "CXX;F77;F90". +# +# When interfaces are requested the user has access to interface specific hints: +# +# NETCDF_${LANG}_INCLUDE_DIR - where to search for interface header files +# NETCDF_${LANG}_LIBRARY - where to search for interface libraries +# +# This module returns these variables for the rest of the project to use. 
+#
+# NETCDF_FOUND - True if NetCDF found including required interfaces (see below)
+# NETCDF_LIBRARIES - All netcdf related libraries.
+# NETCDF_INCLUDE_DIRS - All directories to include.
+# NETCDF_HAS_INTERFACES - Whether requested interfaces were found or not.
+# NETCDF_${LANG}_INCLUDE_DIRS/NETCDF_${LANG}_LIBRARIES - C/C++/F77/F90 only interface
+#
+# Normal usage would be:
+# set (NETCDF_F90 "YES")
+# find_package (NetCDF REQUIRED)
+# target_link_libraries (uses_everything ${NETCDF_LIBRARIES})
+# target_link_libraries (only_uses_f90 ${NETCDF_F90_LIBRARIES})
+
+#search starting from user editable cache var
+if (NETCDF_INCLUDE_DIR AND NETCDF_LIBRARY)
+  # Already in cache, be silent
+  set (NETCDF_FIND_QUIETLY TRUE)
+endif ()
+
+set(USE_DEFAULT_PATHS "NO_DEFAULT_PATH")
+if(NETCDF_USE_DEFAULT_PATHS)
+  set(USE_DEFAULT_PATHS "")
+endif()
+
+find_path (NETCDF_INCLUDE_DIR netcdf.h
+  HINTS "${NETCDF_DIR}/include")
+mark_as_advanced (NETCDF_INCLUDE_DIR)
+set (NETCDF_C_INCLUDE_DIRS ${NETCDF_INCLUDE_DIR})
+
+find_library (NETCDF_LIBRARY NAMES netcdf
+  HINTS "${NETCDF_DIR}/lib")
+mark_as_advanced (NETCDF_LIBRARY)
+
+set (NETCDF_C_LIBRARIES ${NETCDF_LIBRARY})
+
+#start finding requested language components
+set (NetCDF_libs "")
+set (NetCDF_includes "${NETCDF_INCLUDE_DIR}")
+
+get_filename_component (NetCDF_lib_dirs "${NETCDF_LIBRARY}" PATH)
+set (NETCDF_HAS_INTERFACES "YES") # will be set to NO if we're missing any interfaces
+
+macro (NetCDF_check_interface lang header libs)
+  if (NETCDF_${lang})
+    #search starting from user modifiable cache var
+    find_path (NETCDF_${lang}_INCLUDE_DIR NAMES ${header}
+      HINTS "${NETCDF_INCLUDE_DIR}"
+      HINTS "${NETCDF_${lang}_ROOT}/include"
+      ${USE_DEFAULT_PATHS})
+
+    find_library (NETCDF_${lang}_LIBRARY NAMES ${libs}
+      HINTS "${NetCDF_lib_dirs}"
+      HINTS "${NETCDF_${lang}_ROOT}/lib"
+      ${USE_DEFAULT_PATHS})
+
+    mark_as_advanced (NETCDF_${lang}_INCLUDE_DIR NETCDF_${lang}_LIBRARY)
+
+    #export to internal vars that the rest of the project can use directly
+    set (NETCDF_${lang}_LIBRARIES ${NETCDF_${lang}_LIBRARY})
+    set (NETCDF_${lang}_INCLUDE_DIRS ${NETCDF_${lang}_INCLUDE_DIR})
+
+    if (NETCDF_${lang}_INCLUDE_DIR AND NETCDF_${lang}_LIBRARY)
+      list (APPEND NetCDF_libs ${NETCDF_${lang}_LIBRARY})
+      list (APPEND NetCDF_includes ${NETCDF_${lang}_INCLUDE_DIR})
+    else ()
+      set (NETCDF_HAS_INTERFACES "NO")
+      message (STATUS "Failed to find NetCDF interface for ${lang}")
+    endif ()
+  endif ()
+endmacro ()
+
+list (FIND NetCDF_FIND_COMPONENTS "CXX" _nextcomp)
+if (_nextcomp GREATER -1)
+  set (NETCDF_CXX 1)
+endif ()
+list (FIND NetCDF_FIND_COMPONENTS "F77" _nextcomp)
+if (_nextcomp GREATER -1)
+  set (NETCDF_F77 1)
+endif ()
+list (FIND NetCDF_FIND_COMPONENTS "F90" _nextcomp)
+if (_nextcomp GREATER -1)
+  set (NETCDF_F90 1)
+endif ()
+NetCDF_check_interface (CXX netcdfcpp.h netcdf_c++)
+NetCDF_check_interface (F77 netcdf.inc netcdff)
+NetCDF_check_interface (F90 netcdf.mod netcdff)
+
+#export accumulated results to internal vars that the rest of the project can depend on
+list (APPEND NetCDF_libs "${NETCDF_C_LIBRARIES}")
+set (NETCDF_LIBRARIES ${NetCDF_libs})
+set (NETCDF_INCLUDE_DIRS ${NetCDF_includes})
+
+# handle the QUIETLY and REQUIRED arguments and set NETCDF_FOUND to TRUE if
+# all listed variables are TRUE
+include (FindPackageHandleStandardArgs)
+find_package_handle_standard_args (NetCDF
+  DEFAULT_MSG NETCDF_LIBRARIES NETCDF_INCLUDE_DIRS NETCDF_HAS_INTERFACES) diff --git a/cmake/Modules/FindQE.cmake b/cmake/Modules/FindQE.cmake new file mode 100644 index 0000000000..4484bd4db2 --- /dev/null 
+++ b/cmake/Modules/FindQE.cmake @@ -0,0 +1,29 @@ +# - Find quantum-espresso
+# Find the native QE headers and libraries.
+#
+# QE_INCLUDE_DIRS - where to find quantum-espresso.h, etc.
+# QE_LIBRARIES - List of libraries when using quantum-espresso.
+# QE_FOUND - True if quantum-espresso found.
+#
+
+find_path(QE_INCLUDE_DIR libqecouple.h PATH_SUFFIXES COUPLE/include)
+
+find_library(QECOUPLE_LIBRARY NAMES qecouple)
+find_library(PW_LIBRARY NAMES pw)
+find_library(QEMOD_LIBRARY NAMES qemod)
+find_library(QEFFT_LIBRARY NAMES qefft)
+find_library(QELA_LIBRARY NAMES qela)
+find_library(CLIB_LIBRARY NAMES clib)
+find_library(IOTK_LIBRARY NAMES iotk)
+
+
+set(QE_LIBRARIES ${QECOUPLE_LIBRARY} ${PW_LIBRARY} ${QEMOD_LIBRARY} ${QEFFT_LIBRARY} ${QELA_LIBRARY} ${CLIB_LIBRARY} ${IOTK_LIBRARY})
+set(QE_INCLUDE_DIRS ${QE_INCLUDE_DIR})
+
+include(FindPackageHandleStandardArgs)
+# handle the QUIETLY and REQUIRED arguments and set QE_FOUND to TRUE
+# if all listed variables are TRUE
+
+find_package_handle_standard_args(QE DEFAULT_MSG QECOUPLE_LIBRARY PW_LIBRARY QEMOD_LIBRARY QEFFT_LIBRARY QELA_LIBRARY CLIB_LIBRARY IOTK_LIBRARY QE_INCLUDE_DIR)
+
+mark_as_advanced(QE_INCLUDE_DIR QECOUPLE_LIBRARY PW_LIBRARY QEMOD_LIBRARY QEFFT_LIBRARY QELA_LIBRARY CLIB_LIBRARY IOTK_LIBRARY) diff --git a/cmake/Modules/FindQUIP.cmake b/cmake/Modules/FindQUIP.cmake new file mode 100644 index 0000000000..4ee1baf4f8 --- /dev/null +++ b/cmake/Modules/FindQUIP.cmake @@ -0,0 +1,18 @@ +# - Find quip
+# Find the native QUIP libraries.
+#
+# QUIP_LIBRARIES - List of libraries when using quip.
+# QUIP_FOUND - True if quip found.
+#
+
+find_library(QUIP_LIBRARY NAMES quip)
+
+set(QUIP_LIBRARIES ${QUIP_LIBRARY})
+
+include(FindPackageHandleStandardArgs)
+# handle the QUIETLY and REQUIRED arguments and set QUIP_FOUND to TRUE
+# if all listed variables are TRUE
+
+find_package_handle_standard_args(QUIP DEFAULT_MSG QUIP_LIBRARY)
+
+mark_as_advanced(QUIP_LIBRARY) diff --git a/cmake/Modules/FindVORO.cmake b/cmake/Modules/FindVORO.cmake new file mode 100644 index 0000000000..b0cccbcd1d --- /dev/null +++ b/cmake/Modules/FindVORO.cmake @@ -0,0 +1,22 @@ +# - Find voro++
+# Find the native VORO headers and libraries.
+#
+# VORO_INCLUDE_DIRS - where to find voro++.hh, etc.
+# VORO_LIBRARIES - List of libraries when using voro++.
+# VORO_FOUND - True if voro++ found.
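As with the other Find modules in this directory, the cached variables of the module below can be preset on the configure command line when the library lives in a non-standard location; the paths here are placeholders only:

```
cmake ../cmake -DENABLE_VORONOI=ON \
      -DVORO_INCLUDE_DIR=$HOME/voro++/include/voro++ \
      -DVORO_LIBRARY=$HOME/voro++/lib/libvoro++.a
```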
+# + +find_path(VORO_INCLUDE_DIR voro++.hh PATH_SUFFIXES voro++) + +find_library(VORO_LIBRARY NAMES voro++) + +set(VORO_LIBRARIES ${VORO_LIBRARY}) +set(VORO_INCLUDE_DIRS ${VORO_INCLUDE_DIR}) + +include(FindPackageHandleStandardArgs) +# handle the QUIETLY and REQUIRED arguments and set VORO_FOUND to TRUE +# if all listed variables are TRUE + +find_package_handle_standard_args(VORO DEFAULT_MSG VORO_LIBRARY VORO_INCLUDE_DIR) + +mark_as_advanced(VORO_INCLUDE_DIR VORO_LIBRARY ) diff --git a/cmake/Modules/StyleHeaderUtils.cmake b/cmake/Modules/StyleHeaderUtils.cmake new file mode 100644 index 0000000000..9939a7505a --- /dev/null +++ b/cmake/Modules/StyleHeaderUtils.cmake @@ -0,0 +1,132 @@ +function(FindStyleHeaders path style_class file_pattern headers) + file(GLOB files "${path}/${file_pattern}*.h") + get_property(hlist GLOBAL PROPERTY ${headers}) + + foreach(file_name ${files}) + file(STRINGS ${file_name} is_style LIMIT_COUNT 1 REGEX ${style_class}) + if(is_style) + list(APPEND hlist ${file_name}) + endif() + endforeach() + set_property(GLOBAL PROPERTY ${headers} "${hlist}") +endfunction(FindStyleHeaders) + +function(FindStyleHeadersExt path style_class extension headers sources) + get_property(hlist GLOBAL PROPERTY ${headers}) + get_property(slist GLOBAL PROPERTY ${sources}) + set(ext_list) + get_filename_component(abs_path "${path}" ABSOLUTE) + + foreach(file_name ${hlist}) + get_filename_component(basename ${file_name} NAME_WE) + set(ext_file_name "${abs_path}/${basename}_${extension}.h") + if(EXISTS "${ext_file_name}") + file(STRINGS ${ext_file_name} is_style LIMIT_COUNT 1 REGEX ${style_class}) + if(is_style) + list(APPEND ext_list ${ext_file_name}) + + set(source_file_name "${abs_path}/${basename}_${extension}.cpp") + if(EXISTS "${source_file_name}") + list(APPEND slist ${source_file_name}) + endif() + endif() + endif() + endforeach() + + list(APPEND hlist ${ext_list}) + set_property(GLOBAL PROPERTY ${headers} "${hlist}") + set_property(GLOBAL PROPERTY ${sources} "${slist}") +endfunction(FindStyleHeadersExt) + +function(CreateStyleHeader path filename) + math(EXPR N "${ARGC}-2") + + set(temp "") + if(N GREATER 0) + math(EXPR ARG_END "${ARGC}-1") + + foreach(IDX RANGE 2 ${ARG_END}) + list(GET ARGV ${IDX} FNAME) + get_filename_component(FNAME ${FNAME} NAME) + set(temp "${temp}#include \"${FNAME}\"\n") + endforeach() + endif() + message(STATUS "Generating ${filename}...") + file(WRITE "${path}/${filename}.tmp" "${temp}" ) + execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different "${path}/${filename}.tmp" "${path}/${filename}") +endfunction(CreateStyleHeader) + +function(GenerateStyleHeader path property style) + get_property(files GLOBAL PROPERTY ${property}) + #message("${property} = ${files}") + CreateStyleHeader("${path}" "style_${style}.h" ${files}) +endfunction(GenerateStyleHeader) + +function(RegisterStyles search_path) + FindStyleHeaders(${search_path} ANGLE_CLASS angle_ ANGLE ) # angle ) # force + FindStyleHeaders(${search_path} ATOM_CLASS atom_vec_ ATOM_VEC ) # atom ) # atom atom_vec_hybrid + FindStyleHeaders(${search_path} BODY_CLASS body_ BODY ) # body ) # atom_vec_body + FindStyleHeaders(${search_path} BOND_CLASS bond_ BOND ) # bond ) # force + FindStyleHeaders(${search_path} COMMAND_CLASS "" COMMAND ) # command ) # input + FindStyleHeaders(${search_path} COMPUTE_CLASS compute_ COMPUTE ) # compute ) # modify + FindStyleHeaders(${search_path} DIHEDRAL_CLASS dihedral_ DIHEDRAL ) # dihedral ) # force + FindStyleHeaders(${search_path} DUMP_CLASS dump_ DUMP ) # dump ) # output 
write_dump + FindStyleHeaders(${search_path} FIX_CLASS fix_ FIX ) # fix ) # modify + FindStyleHeaders(${search_path} IMPROPER_CLASS improper_ IMPROPER ) # improper ) # force + FindStyleHeaders(${search_path} INTEGRATE_CLASS "" INTEGRATE ) # integrate ) # update + FindStyleHeaders(${search_path} KSPACE_CLASS "" KSPACE ) # kspace ) # force + FindStyleHeaders(${search_path} MINIMIZE_CLASS min_ MINIMIZE ) # minimize ) # update + FindStyleHeaders(${search_path} NBIN_CLASS nbin_ NBIN ) # nbin ) # neighbor + FindStyleHeaders(${search_path} NPAIR_CLASS npair_ NPAIR ) # npair ) # neighbor + FindStyleHeaders(${search_path} NSTENCIL_CLASS nstencil_ NSTENCIL ) # nstencil ) # neighbor + FindStyleHeaders(${search_path} NTOPO_CLASS ntopo_ NTOPO ) # ntopo ) # neighbor + FindStyleHeaders(${search_path} PAIR_CLASS pair_ PAIR ) # pair ) # force + FindStyleHeaders(${search_path} READER_CLASS reader_ READER ) # reader ) # read_dump + FindStyleHeaders(${search_path} REGION_CLASS region_ REGION ) # region ) # domain +endfunction(RegisterStyles) + +function(RegisterStylesExt search_path extension sources) + FindStyleHeadersExt(${search_path} ANGLE_CLASS ${extension} ANGLE ${sources}) + FindStyleHeadersExt(${search_path} ATOM_CLASS ${extension} ATOM_VEC ${sources}) + FindStyleHeadersExt(${search_path} BODY_CLASS ${extension} BODY ${sources}) + FindStyleHeadersExt(${search_path} BOND_CLASS ${extension} BOND ${sources}) + FindStyleHeadersExt(${search_path} COMMAND_CLASS ${extension} COMMAND ${sources}) + FindStyleHeadersExt(${search_path} COMPUTE_CLASS ${extension} COMPUTE ${sources}) + FindStyleHeadersExt(${search_path} DIHEDRAL_CLASS ${extension} DIHEDRAL ${sources}) + FindStyleHeadersExt(${search_path} DUMP_CLASS ${extension} DUMP ${sources}) + FindStyleHeadersExt(${search_path} FIX_CLASS ${extension} FIX ${sources}) + FindStyleHeadersExt(${search_path} IMPROPER_CLASS ${extension} IMPROPER ${sources}) + FindStyleHeadersExt(${search_path} INTEGRATE_CLASS ${extension} INTEGRATE ${sources}) + FindStyleHeadersExt(${search_path} KSPACE_CLASS ${extension} KSPACE ${sources}) + FindStyleHeadersExt(${search_path} MINIMIZE_CLASS ${extension} MINIMIZE ${sources}) + FindStyleHeadersExt(${search_path} NBIN_CLASS ${extension} NBIN ${sources}) + FindStyleHeadersExt(${search_path} NPAIR_CLASS ${extension} NPAIR ${sources}) + FindStyleHeadersExt(${search_path} NSTENCIL_CLASS ${extension} NSTENCIL ${sources}) + FindStyleHeadersExt(${search_path} NTOPO_CLASS ${extension} NTOPO ${sources}) + FindStyleHeadersExt(${search_path} PAIR_CLASS ${extension} PAIR ${sources}) + FindStyleHeadersExt(${search_path} READER_CLASS ${extension} READER ${sources}) + FindStyleHeadersExt(${search_path} REGION_CLASS ${extension} REGION ${sources}) +endfunction(RegisterStylesExt) + +function(GenerateStyleHeaders output_path) + GenerateStyleHeader(${output_path} ANGLE angle ) # force + GenerateStyleHeader(${output_path} ATOM_VEC atom ) # atom atom_vec_hybrid + GenerateStyleHeader(${output_path} BODY body ) # atom_vec_body + GenerateStyleHeader(${output_path} BOND bond ) # force + GenerateStyleHeader(${output_path} COMMAND command ) # input + GenerateStyleHeader(${output_path} COMPUTE compute ) # modify + GenerateStyleHeader(${output_path} DIHEDRAL dihedral ) # force + GenerateStyleHeader(${output_path} DUMP dump ) # output write_dump + GenerateStyleHeader(${output_path} FIX fix ) # modify + GenerateStyleHeader(${output_path} IMPROPER improper ) # force + GenerateStyleHeader(${output_path} INTEGRATE integrate ) # update + GenerateStyleHeader(${output_path} 
KSPACE kspace ) # force
+  GenerateStyleHeader(${output_path} MINIMIZE minimize ) # update
+  GenerateStyleHeader(${output_path} NBIN nbin ) # neighbor
+  GenerateStyleHeader(${output_path} NPAIR npair ) # neighbor
+  GenerateStyleHeader(${output_path} NSTENCIL nstencil ) # neighbor
+  GenerateStyleHeader(${output_path} NTOPO ntopo ) # neighbor
+  GenerateStyleHeader(${output_path} PAIR pair ) # force
+  GenerateStyleHeader(${output_path} READER reader ) # read_dump
+  GenerateStyleHeader(${output_path} REGION region ) # domain
+endfunction(GenerateStyleHeaders) diff --git a/cmake/README b/cmake/README new file mode 100644 index 0000000000..cc67cceb52 --- /dev/null +++ b/cmake/README @@ -0,0 +1,19 @@ +cmake-buildsystem
+-----------------
+
+To use the cmake build system instead of the make-driven one, do:
+```
+cmake /path/to/lammps/source/cmake
+```
+(please note the cmake directory at the very end)
+
+To enable a package, e.g. GPU, do
+```
+cmake /path/to/lammps/source/cmake -DENABLE_GPU=ON
+```
+
+cmake has many, many options; to get an overview use the curses-based cmake interface, ccmake:
+```
+ccmake /path/to/lammps/source/cmake
+```
+(Don't forget to press "g" for generate once you are done with configuring) diff --git a/cmake/gpu/lal_pppm_d.cu b/cmake/gpu/lal_pppm_d.cu new file mode 100644 index 0000000000..a49a535013 --- /dev/null +++ b/cmake/gpu/lal_pppm_d.cu @@ -0,0 +1,4 @@ +#define grdtyp double
+#define grdtyp4 double4
+
+#include "lal_pppm.cu" diff --git a/cmake/gpu/lal_pppm_f.cu b/cmake/gpu/lal_pppm_f.cu new file mode 100644 index 0000000000..e7f5116fa0 --- /dev/null +++ b/cmake/gpu/lal_pppm_f.cu @@ -0,0 +1,4 @@ +#define grdtyp float
+#define grdtyp4 float4
+
+#include "lal_pppm.cu" diff --git a/doc/src/JPG/user_intel.png b/doc/src/JPG/user_intel.png index 302b50124a..7ec83b3207 100755 Binary files a/doc/src/JPG/user_intel.png and b/doc/src/JPG/user_intel.png differ diff --git a/doc/src/Manual.txt b/doc/src/Manual.txt index 2af2ffd4b7..bb2e1b8114 100644 --- a/doc/src/Manual.txt +++ b/doc/src/Manual.txt @@ -1,7 +1,7 @@ LAMMPS Users Manual - + @@ -21,7 +21,7 @@

LAMMPS Documentation :c,h3 -10 Aug 2017 version :c,h4 +17 Aug 2017 version :c,h4 Version info: :h4 @@ -79,7 +79,7 @@ bug reports and feature requests are mainly coordinated through the "LAMMPS project on GitHub."_https://github.com/lammps/lammps The lammps.org domain, currently hosting "public continuous integration testing"_https://ci.lammps.org/job/lammps/ and "precompiled Linux -RPM and Windows installer packages"_http://rpm.lammps.org is located +RPM and Windows installer packages"_http://packages.lammps.org is located at Temple University and managed by Richard Berger, richard.berger at temple.edu. diff --git a/doc/src/PDF/colvars-refman-lammps.pdf b/doc/src/PDF/colvars-refman-lammps.pdf index a14d93cd69..ad15752107 100644 Binary files a/doc/src/PDF/colvars-refman-lammps.pdf and b/doc/src/PDF/colvars-refman-lammps.pdf differ diff --git a/doc/src/Section_commands.txt b/doc/src/Section_commands.txt index f1eb225fe5..571c6c4920 100644 --- a/doc/src/Section_commands.txt +++ b/doc/src/Section_commands.txt @@ -892,8 +892,8 @@ KOKKOS, o = USER-OMP, t = OPT. "hybrid"_pair_hybrid.html, "hybrid/overlay"_pair_hybrid.html, "adp (o)"_pair_adp.html, -"airebo (o)"_pair_airebo.html, -"airebo/morse (o)"_pair_airebo.html, +"airebo (oi)"_pair_airebo.html, +"airebo/morse (oi)"_pair_airebo.html, "beck (go)"_pair_beck.html, "body"_pair_body.html, "bop"_pair_bop.html, @@ -927,8 +927,8 @@ KOKKOS, o = USER-OMP, t = OPT. "dpd/tstat (go)"_pair_dpd.html, "dsmc"_pair_dsmc.html, "eam (gkiot)"_pair_eam.html, -"eam/alloy (gkot)"_pair_eam.html, -"eam/fs (gkot)"_pair_eam.html, +"eam/alloy (gkiot)"_pair_eam.html, +"eam/fs (gkiot)"_pair_eam.html, "eim (o)"_pair_eim.html, "gauss (go)"_pair_gauss.html, "gayberne (gio)"_pair_gayberne.html, @@ -942,9 +942,9 @@ KOKKOS, o = USER-OMP, t = OPT. "kim"_pair_kim.html, "lcbop"_pair_lcbop.html, "line/lj"_pair_line_lj.html, -"lj/charmm/coul/charmm (ko)"_pair_charmm.html, +"lj/charmm/coul/charmm (kio)"_pair_charmm.html, "lj/charmm/coul/charmm/implicit (ko)"_pair_charmm.html, -"lj/charmm/coul/long (giko)"_pair_charmm.html, +"lj/charmm/coul/long (gkio)"_pair_charmm.html, "lj/charmm/coul/msm"_pair_charmm.html, "lj/charmmfsw/coul/charmmfsh"_pair_charmm.html, "lj/charmmfsw/coul/long"_pair_charmm.html, @@ -990,7 +990,7 @@ KOKKOS, o = USER-OMP, t = OPT. "polymorphic"_pair_polymorphic.html, "python"_pair_python.html, "reax"_pair_reax.html, -"rebo (o)"_pair_airebo.html, +"rebo (oi)"_pair_airebo.html, "resquared (go)"_pair_resquared.html, "snap"_pair_snap.html, "soft (go)"_pair_soft.html, diff --git a/doc/src/Section_errors.txt b/doc/src/Section_errors.txt index 408c01d52c..f5829f92fb 100644 --- a/doc/src/Section_errors.txt +++ b/doc/src/Section_errors.txt @@ -7886,8 +7886,8 @@ keyword to allow for additional bonds to be formed :dd {New bond exceeded special list size in fix bond/create} :dt -See the "special_bonds extra" command -(or the "read_data extra/special/per/atom" command) +See the "read_data extra/special/per/atom" command +(or the "create_box extra/special/per/atom" command) for info on how to leave space in the special bonds list to allow for additional bonds to be formed. :dd @@ -9666,8 +9666,8 @@ you are running. :dd {Special list size exceeded in fix bond/create} :dt -See the special_bonds extra command -(or the read_data extra/special/per/atom command) +See the "read_data extra/special/per/atom" command +(or the "create_box extra/special/per/atom" command) for info on how to leave space in the special bonds list to allow for additional bonds to be formed. 
:dd diff --git a/doc/src/Section_start.txt b/doc/src/Section_start.txt index 6eef155be2..a25ec11cfe 100644 --- a/doc/src/Section_start.txt +++ b/doc/src/Section_start.txt @@ -662,27 +662,25 @@ your own build system. Due to differences between the Windows OS and Windows system libraries to Unix-like environments like Linux or MacOS, when compiling for Windows a few adjustments may be needed: -Do not set the -DLAMMPS_MEMALIGN define (see LMP_INC makefile variable) +Do [not] set the -DLAMMPS_MEMALIGN define (see LMP_INC makefile variable) Add -lwsock32 -lpsapi to the linker flags (see LIB makefile variable) -Try adding -static-libgcc or -static or both to the linker flags when your -LAMMPS executable complains about missing .dll files :ul +Try adding -static-libgcc or -static or both to the linker flags when your LAMMPS executable complains about missing .dll files :ul -Since none of the current LAMMPS core developers -has significant experience building executables on Windows, we are -happy to distribute contributed instructions and modifications, but -we cannot provide support for those. +Since none of the current LAMMPS core developers has significant +experience building executables on Windows, we are happy to distribute +contributed instructions and modifications to improve the situation, +but we cannot provide support for those. With the so-called "Anniversary Update" to Windows 10, there is a Ubuntu Linux subsystem available for Windows, that can be installed and then used to compile/install LAMMPS as if you are running on a Ubuntu Linux system instead of Windows. -As an alternative, you can download "daily builds" (and some older -versions) of the installer packages from -"rpm.lammps.org/windows.html"_http://rpm.lammps.org/windows.html. -These executables are built with most optional packages and the -download includes documentation, potential files, some tools and -many examples, but no source code. +As an alternative, you can download pre-compiled installer packages from +"packages.lammps.org/windows.html"_http://packages.lammps.org/windows.html. +These executables are built with most optional packages included and the +download includes documentation, potential files, some tools and many +examples, but no source code. :line @@ -1095,7 +1093,7 @@ LAMMPS to be built with one or more of its optional packages. :line On a Windows box, you can skip making LAMMPS and simply download an -installer package from "here"_http://rpm.lammps.org/windows.html +installer package from "here"_http://packages.lammps.org/windows.html For running the non-MPI executable, follow these steps: @@ -1107,18 +1105,27 @@ the [in.lj] input from the bench folder. (e.g. by typing: cd "Documents"). :l At the command prompt, type "lmp_serial -in in.lj", replacing [in.lj] with the name of your LAMMPS input script. :l +
+The serial executable includes support for multi-threading
+parallelization from the styles in the USER-OMP package.
+
+To run with, e.g., 4 threads, type "lmp_serial -in in.lj -pk omp 4 -sf omp" :ule -For the MPI version, which allows you to run LAMMPS under Windows on -multiple processors, follow these steps: +For the MPI version, which allows you to run LAMMPS under Windows with
+the more general message passing parallel library (LAMMPS has been
+designed from the ground up to use MPI efficiently), follow these steps: -Download and install -"MPICH2"_http://www.mcs.anl.gov/research/projects/mpich2/downloads/index.php?s=downloads -for Windows. 
:ulb,l +Download and install a compatible MPI library binary package: +for 32-bit Windows +"mpich2-1.4.1p1-win-ia32.msi"_download.lammps.org/thirdparty/mpich2-1.4.1p1-win-ia32.msi +and for 64-bit Windows +"mpich2-1.4.1p1-win-x86-64.msi"_download.lammps.org/thirdparty/mpich2-1.4.1p1-win-x86-64.msi +:ulb,l The LAMMPS Windows installer packages will automatically adjust your path for the default location of this MPI package. After the installation -of the MPICH software, it needs to be integrated into the system. +of the MPICH2 software, it needs to be integrated into the system. For this you need to start a Command Prompt in {Administrator Mode} (right click on the icon and select it). Change into the MPICH2 installation directory, then into the subdirectory [bin] and execute @@ -1137,7 +1144,7 @@ or mpiexec -np 4 lmp_mpi -in in.lj :pre -replacing in.lj with the name of your LAMMPS input script. For the latter +replacing [in.lj] with the name of your LAMMPS input script. For the latter case, you may be prompted to enter your password. :l In this mode, output may not immediately show up on the screen, so if @@ -1149,6 +1156,11 @@ something like: lmp_mpi -in in.lj :pre +And the parallel executable also includes OpenMP multi-threading, which +can be combined with MPI using something like: + +mpiexec -localonly 2 lmp_mpi -in in.lj -pk omp 2 -sf omp :pre + :ule :line diff --git a/doc/src/accelerate_intel.txt b/doc/src/accelerate_intel.txt index 9eb295e0d0..a7c3382caa 100644 --- a/doc/src/accelerate_intel.txt +++ b/doc/src/accelerate_intel.txt @@ -29,8 +29,10 @@ Bond Styles: fene, harmonic :l Dihedral Styles: charmm, harmonic, opls :l Fixes: nve, npt, nvt, nvt/sllod :l Improper Styles: cvff, harmonic :l -Pair Styles: buck/coul/cut, buck/coul/long, buck, eam, gayberne, -charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, sw, tersoff :l +Pair Styles: airebo, airebo/morse, buck/coul/cut, buck/coul/long, +buck, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm, +lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, rebo, +sw, tersoff :l K-Space Styles: pppm, pppm/disp :l :ule diff --git a/doc/src/fix_bond_create.txt b/doc/src/fix_bond_create.txt index a44c3103dd..c0045ac0f0 100644 --- a/doc/src/fix_bond_create.txt +++ b/doc/src/fix_bond_create.txt @@ -150,10 +150,9 @@ atoms. Note that adding a single bond always adds a new 1st neighbor but may also induce *many* new 2nd and 3rd neighbors, depending on the molecular topology of your system. The "extra special per atom" parameter must typically be set to allow for the new maximum total -size (1st + 2nd + 3rd neighbors) of this per-atom list. There are 3 +size (1st + 2nd + 3rd neighbors) of this per-atom list. There are 2 ways to do this. See the "read_data"_read_data.html or -"create_box"_create_box.html or "special_bonds extra" commands for -details. +"create_box"_create_box.html commands for details. 
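For example (the data file and region names below are placeholders), either of the following reserves room for six additional special neighbors per atom:

read_data polymer.data extra/special/per/atom 6 :pre

create_box 2 mybox extra/special/per/atom 6 :pre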
NOTE: Even if you do not use the {atype}, {dtype}, or {itype} keywords, the list of topological neighbors is updated for atoms diff --git a/doc/src/pair_airebo.txt b/doc/src/pair_airebo.txt index e66ecb637f..1aa017f278 100644 --- a/doc/src/pair_airebo.txt +++ b/doc/src/pair_airebo.txt @@ -7,10 +7,13 @@ :line pair_style airebo command :h3 +pair_style airebo/intel command :h3 pair_style airebo/omp command :h3 pair_style airebo/morse command :h3 +pair_style airebo/morse/intel command :h3 pair_style airebo/morse/omp command :h3 pair_style rebo command :h3 +pair_style rebo/intel command :h3 pair_style rebo/omp command :h3 [Syntax:] diff --git a/doc/src/pair_charmm.txt b/doc/src/pair_charmm.txt index ef4ef41c95..75a8e4bff9 100644 --- a/doc/src/pair_charmm.txt +++ b/doc/src/pair_charmm.txt @@ -7,6 +7,7 @@ :line pair_style lj/charmm/coul/charmm command :h3 +pair_style lj/charmm/coul/charmm/intel command :h3 pair_style lj/charmm/coul/charmm/omp command :h3 pair_style lj/charmm/coul/charmm/implicit command :h3 pair_style lj/charmm/coul/charmm/implicit/omp command :h3 diff --git a/doc/src/pair_eam.txt b/doc/src/pair_eam.txt index ce8495affd..a0026432ec 100644 --- a/doc/src/pair_eam.txt +++ b/doc/src/pair_eam.txt @@ -14,6 +14,7 @@ pair_style eam/omp command :h3 pair_style eam/opt command :h3 pair_style eam/alloy command :h3 pair_style eam/alloy/gpu command :h3 +pair_style eam/alloy/intel command :h3 pair_style eam/alloy/kk command :h3 pair_style eam/alloy/omp command :h3 pair_style eam/alloy/opt command :h3 @@ -21,6 +22,7 @@ pair_style eam/cd command :h3 pair_style eam/cd/omp command :h3 pair_style eam/fs command :h3 pair_style eam/fs/gpu command :h3 +pair_style eam/fs/intel command :h3 pair_style eam/fs/kk command :h3 pair_style eam/fs/omp command :h3 pair_style eam/fs/opt command :h3 diff --git a/doc/src/special_bonds.txt b/doc/src/special_bonds.txt index 6a661015bd..1021c4856b 100644 --- a/doc/src/special_bonds.txt +++ b/doc/src/special_bonds.txt @@ -25,9 +25,7 @@ keyword = {amber} or {charmm} or {dreiding} or {fene} or {lj/coul} or {lj} or {c {coul} values = w1,w2,w3 w1,w2,w3 = weights (0.0 to 1.0) on pairwise Coulombic interactions {angle} value = {yes} or {no} - {dihedral} value = {yes} or {no} - {extra} value = N - N = number of extra 1-2,1-3,1-4 interactions to save space for :pre + {dihedral} value = {yes} or {no} :pre :ule Examples: @@ -36,8 +34,7 @@ special_bonds amber special_bonds charmm special_bonds fene dihedral no special_bonds lj/coul 0.0 0.0 0.5 angle yes dihedral yes -special_bonds lj 0.0 0.0 0.5 coul 0.0 0.0 0.0 dihedral yes -special_bonds lj/coul 0 1 1 extra 2 :pre +special_bonds lj 0.0 0.0 0.5 coul 0.0 0.0 0.0 dihedral yes :pre [Description:] @@ -178,14 +175,6 @@ interaction between atoms 2 and 5 will be unaffected (full weighting of 1.0). If the {dihedral} keyword is specified as {no} which is the default, then the 2,5 interaction will also be weighted by 0.5. -The {extra} keyword can be used when additional bonds will be created -during a simulation run, e.g. by the "fix -bond/create"_fix_bond_create.html command. It can also be used if -molecules will be added to the system, e.g. via the "fix -deposit"_fix_deposit.html, or "fix pour"_fix_pour.html commands, which -will have atoms with more special neighbors than any atom in the -current system has. - :line NOTE: LAMMPS stores and maintains a data structure with a list of the @@ -194,8 +183,9 @@ the system). 
If new bonds are created (or molecules added containing atoms with more special neighbors), the size of this list needs to grow. Note that adding a single bond always adds a new 1st neighbor but may also induce *many* new 2nd and 3rd neighbors, depending on the -molecular topology of your system. Using the {extra} keyword leaves -empty space in the list for this N additional 1st, 2nd, or 3rd +molecular topology of your system. Using the {extra/special/per/atom} +keyword to either "read_data"_read_data.html or "create_box"_create_box.html +reserves empty space in the list for this N additional 1st, 2nd, or 3rd neighbors to be added. If you do not do this, you may get an error when bonds (or molecules) are added. @@ -203,8 +193,7 @@ when bonds (or molecules) are added. NOTE: If you reuse this command in an input script, you should set all the options you need each time. This command cannot be used a 2nd -time incrementally, e.g. to add some extra storage locations via the -{extra} keyword. E.g. these two commands: +time incrementally. E.g. these two commands: special_bonds lj 0.0 1.0 1.0 special_bonds coul 0.0 0.0 1.0 @@ -221,25 +210,6 @@ Coul: coul 0.0 0.0 1.0 because the LJ settings are reset to their default values each time the command is issued. -Likewise - -special_bonds amber -special_bonds extra 2 :pre - -is not the same as this single command: - -special_bonds amber extra 2 :pre - -since in the former case, the 2nd command will reset all the LJ and -Coulombic weights to 0.0 (the default). - -One exception to this rule is the {extra} option itself. It is not -reset to its default value of 0 each time the special_bonds command is -invoked. This is because it can also be set by the -"read_data"_read_data.html and "create_box"_create_box.html commands, -so this command will not override those settings unless you explicitly -use {extra} as an option. - [Restrictions:] none [Related commands:] diff --git a/doc/src/tutorial_bash_on_windows.txt b/doc/src/tutorial_bash_on_windows.txt old mode 100755 new mode 100644 diff --git a/doc/src/tutorial_drude.txt b/doc/src/tutorial_drude.txt index b9a167b804..f6e7eed40b 100644 --- a/doc/src/tutorial_drude.txt +++ b/doc/src/tutorial_drude.txt @@ -176,12 +176,13 @@ By recognizing the fix {drude}, LAMMPS will find and store matching DC-DP pairs and will treat DP as equivalent to their DC in the {special bonds} relations. It may be necessary to extend the space for storing such special relations. In this case extra space should -be reserved by using the {extra} keyword of the {special_bonds} +be reserved by using the {extra/special/per/atom} keyword of either +the "read_data"_read_data.html or "create_box"_create_box.html command. With our phenol, there is 1 more special neighbor for which space is required. Otherwise LAMMPS crashes and gives the required value. -special_bonds lj/coul 0.0 0.0 0.5 extra 1 :pre +read_data data-p.lmp extra/special/per/atom 1 :pre Let us assume we want to run a simple NVT simulation at 300 K. 
Note that Drude oscillators need to be thermalized at a low temperature in diff --git a/doc/src/tutorials.txt b/doc/src/tutorials.txt old mode 100755 new mode 100644 diff --git a/lib/colvars/Install.py b/lib/colvars/Install.py index 01e70543f2..030644ceb5 100644 --- a/lib/colvars/Install.py +++ b/lib/colvars/Install.py @@ -45,12 +45,12 @@ while iarg < nargs: if args[iarg] == "-m": if iarg+2 > len(args): error() machine = args[iarg+1] - iarg += 2 + iarg += 2 elif args[iarg] == "-e": if iarg+2 > len(args): error() extraflag = True suffix = args[iarg+1] - iarg += 2 + iarg += 2 else: error() # set lib from working dir diff --git a/lib/colvars/README b/lib/colvars/README index ce1d319974..5df9612dfa 100644 --- a/lib/colvars/README +++ b/lib/colvars/README @@ -32,7 +32,7 @@ where Makefile.g++ uses the GNU C++ compiler and is a good template to start. **Optional**: if you use the Install.py script provided in this folder, you can give the machine name as the '-m' argument. This can be the suffix of one -of the files from either this folder, or from src/MAKE. +of the files from either this folder, or from src/MAKE/MACHINES. *This is only supported by the Install.py within the lib/colvars folder*. When you are done building this library, two files should @@ -53,10 +53,10 @@ settings in Makefile.common should work. For the reference manual see: http://colvars.github.io/colvars-refman-lammps -A copy of reference manual is also in: +A copy of the reference manual is also in: doc/PDF/colvars-refman-lammps.pdf -Also included is a Doxygen-based developer documentation: +Also available is the Doxygen-based developer documentation: http://colvars.github.io/doxygen/html/ The reference article is: diff --git a/lib/colvars/colvar.h b/lib/colvars/colvar.h index 6113e1678b..dfa9e093a5 100644 --- a/lib/colvars/colvar.h +++ b/lib/colvars/colvar.h @@ -88,7 +88,12 @@ public: static std::vector cv_features; /// \brief Implementation of the feature list accessor for colvar - std::vector &features() { + virtual const std::vector &features() + { + return cv_features; + } + virtual std::vector &modify_features() + { return cv_features; } diff --git a/lib/colvars/colvaratoms.h b/lib/colvars/colvaratoms.h index dba2890abc..6113fb38a9 100644 --- a/lib/colvars/colvaratoms.h +++ b/lib/colvars/colvaratoms.h @@ -206,7 +206,12 @@ public: static std::vector ag_features; /// \brief Implementation of the feature list accessor for atom group - virtual std::vector &features() { + virtual const std::vector &features() + { + return ag_features; + } + virtual std::vector &modify_features() + { return ag_features; } diff --git a/lib/colvars/colvarbias.cpp b/lib/colvars/colvarbias.cpp index e437466be9..636727ca39 100644 --- a/lib/colvars/colvarbias.cpp +++ b/lib/colvars/colvarbias.cpp @@ -384,6 +384,7 @@ std::ostream & colvarbias::write_traj(std::ostream &os) os << " "; if (b_output_energy) os << " " + << std::setprecision(cvm::en_prec) << std::setw(cvm::en_width) << bias_energy; return os; } diff --git a/lib/colvars/colvarbias.h b/lib/colvars/colvarbias.h index 205e761cfc..a147cd3210 100644 --- a/lib/colvars/colvarbias.h +++ b/lib/colvars/colvarbias.h @@ -175,7 +175,11 @@ public: static std::vector cvb_features; /// \brief Implementation of the feature list accessor for colvarbias - virtual std::vector &features() + virtual const std::vector &features() + { + return cvb_features; + } + virtual std::vector &modify_features() { return cvb_features; } diff --git a/lib/colvars/colvarbias_restraint.cpp b/lib/colvars/colvarbias_restraint.cpp
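The colvar.h, colvaratoms.h, and colvarbias.h hunks above (and the colvardeps/colvarcomp hunks further below) all apply the same const-correctness pattern: the per-class static feature table keeps a read-only accessor, features(), while mutation moves to an explicitly named modify_features(). A minimal self-contained sketch of that pattern; the names deps_example and my_features are illustrative, not from the patch:

    #include <string>
    #include <vector>

    struct feature {
      std::string description;
    };

    class deps_example {
    public:
      // Read-only view: query paths (e.g. scripting interfaces) use this
      // and cannot accidentally mutate the shared table.
      virtual const std::vector<feature *> &features() { return my_features; }

      // Explicit mutable access: only one-time initialization should call it.
      virtual std::vector<feature *> &modify_features() { return my_features; }

      virtual ~deps_example() {}

    private:
      static std::vector<feature *> my_features;
    };

    std::vector<feature *> deps_example::my_features;

This split is what lets the colvarscript.cpp hunk below bind the result of obj->features() to a const reference in its "get"/"set" query path.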
index bb6d6164e5..6879190968 100644 --- a/lib/colvars/colvarbias_restraint.cpp +++ b/lib/colvars/colvarbias_restraint.cpp @@ -99,12 +99,9 @@ int colvarbias_restraint_centers::init(std::string const &conf) if (null_centers) { // try to initialize the restraint centers for the first time colvar_centers.resize(num_variables()); - colvar_centers_raw.resize(num_variables()); for (i = 0; i < num_variables(); i++) { colvar_centers[i].type(variables(i)->value()); colvar_centers[i].reset(); - colvar_centers_raw[i].type(variables(i)->value()); - colvar_centers_raw[i].reset(); } } @@ -113,7 +110,6 @@ int colvarbias_restraint_centers::init(std::string const &conf) if (cvm::debug()) { cvm::log("colvarbias_restraint: parsing initial centers, i = "+cvm::to_str(i)+".\n"); } - colvar_centers_raw[i] = colvar_centers[i]; colvar_centers[i].apply_constraints(); } null_centers = false; @@ -141,8 +137,6 @@ int colvarbias_restraint_centers::change_configuration(std::string const &conf) for (size_t i = 0; i < num_variables(); i++) { colvar_centers[i].type(variables(i)->value()); colvar_centers[i].apply_constraints(); - colvar_centers_raw[i].type(variables(i)->value()); - colvar_centers_raw[i] = colvar_centers[i]; } } return COLVARS_OK; @@ -232,7 +226,6 @@ int colvarbias_restraint_moving::set_state_params(std::string const &conf) { if (b_chg_centers || b_chg_force_k) { if (target_nstages) { - // cvm::log ("Reading current stage from the restart.\n"); if (!get_keyval(conf, "stage", stage)) cvm::error("Error: current stage is missing from the restart.\n"); } @@ -265,100 +258,127 @@ int colvarbias_restraint_centers_moving::init(std::string const &conf) size_t i; if (get_keyval(conf, "targetCenters", target_centers, colvar_centers)) { - if (colvar_centers.size() != num_variables()) { + if (target_centers.size() != num_variables()) { cvm::error("Error: number of target centers does not match " - "that of collective variables.\n"); + "that of collective variables.\n", INPUT_ERROR); } b_chg_centers = true; for (i = 0; i < target_centers.size(); i++) { target_centers[i].apply_constraints(); + centers_incr.push_back(colvar_centers[i]); + centers_incr[i].reset(); } } if (b_chg_centers) { - // parse moving restraint options + // parse moving schedule options colvarbias_restraint_moving::init(conf); + if (initial_centers.size() == 0) { + // One-time init + initial_centers = colvar_centers; + } + // Call to check that the definition is correct + for (i = 0; i < num_variables(); i++) { + colvarvalue const midpoint = + colvarvalue::interpolate(initial_centers[i], + target_centers[i], + 0.5); + } } else { target_centers.clear(); return COLVARS_OK; } get_keyval(conf, "outputCenters", b_output_centers, b_output_centers); - get_keyval(conf, "outputAccumulatedWork", b_output_acc_work, b_output_acc_work); + get_keyval(conf, "outputAccumulatedWork", b_output_acc_work, + b_output_acc_work); // TODO this conflicts with stages return COLVARS_OK; } +int colvarbias_restraint_centers_moving::update_centers(cvm::real lambda) +{ + if (cvm::debug()) { + cvm::log("Updating centers for the restraint bias \""+ + this->name+"\": "+cvm::to_str(colvar_centers)+".\n"); + } + size_t i; + for (i = 0; i < num_variables(); i++) { + colvarvalue const c_new = colvarvalue::interpolate(initial_centers[i], + target_centers[i], + lambda); + centers_incr[i] = (c_new).dist2_grad(colvar_centers[i]); + colvar_centers[i] = c_new; + variables(i)->wrap(colvar_centers[i]); + } + if (cvm::debug()) { + cvm::log("New centers for the restraint bias \""+ + this->name+"\": 
"+cvm::to_str(colvar_centers)+".\n"); + } + return cvm::get_error(); +} + + int colvarbias_restraint_centers_moving::update() { if (b_chg_centers) { - if (cvm::debug()) { - cvm::log("Updating centers for the restraint bias \""+ - this->name+"\": "+cvm::to_str(colvar_centers)+".\n"); - } - - if (!centers_incr.size()) { - // if this is the first calculation, calculate the advancement - // at each simulation step (or stage, if applicable) - // (take current stage into account: it can be non-zero - // if we are restarting a staged calculation) - centers_incr.resize(num_variables()); - for (size_t i = 0; i < num_variables(); i++) { - centers_incr[i].type(variables(i)->value()); - centers_incr[i] = (target_centers[i] - colvar_centers_raw[i]) / - cvm::real( target_nstages ? (target_nstages - stage) : - (target_nsteps - cvm::step_absolute())); - } - if (cvm::debug()) { - cvm::log("Center increment for the restraint bias \""+ - this->name+"\": "+cvm::to_str(centers_incr)+" at stage "+cvm::to_str(stage)+ ".\n"); - } - } - if (target_nstages) { - if ((cvm::step_relative() > 0) - && (cvm::step_absolute() % target_nsteps) == 0 - && stage < target_nstages) { - - for (size_t i = 0; i < num_variables(); i++) { - colvar_centers_raw[i] += centers_incr[i]; - colvar_centers[i] = colvar_centers_raw[i]; - variables(i)->wrap(colvar_centers[i]); - colvar_centers[i].apply_constraints(); + // Staged update + if (stage <= target_nstages) { + if ((cvm::step_relative() > 0) && + ((cvm::step_absolute() % target_nsteps) == 1)) { + cvm::real const lambda = + cvm::real(stage)/cvm::real(target_nstages); + update_centers(lambda); + stage++; + cvm::log("Moving restraint \"" + this->name + + "\" stage " + cvm::to_str(stage) + + " : setting centers to " + cvm::to_str(colvar_centers) + + " at step " + cvm::to_str(cvm::step_absolute())); + } else { + for (size_t i = 0; i < num_variables(); i++) { + centers_incr[i].reset(); + } } - stage++; - cvm::log("Moving restraint \"" + this->name + - "\" stage " + cvm::to_str(stage) + - " : setting centers to " + cvm::to_str(colvar_centers) + - " at step " + cvm::to_str(cvm::step_absolute())); } - } else if ((cvm::step_relative() > 0) && (cvm::step_absolute() <= target_nsteps)) { - // move the restraint centers in the direction of the targets - // (slow growth) + } else { + // Continuous update + if (cvm::step_absolute() <= target_nsteps) { + cvm::real const lambda = + cvm::real(cvm::step_absolute())/cvm::real(target_nsteps); + update_centers(lambda); + } else { + for (size_t i = 0; i < num_variables(); i++) { + centers_incr[i].reset(); + } + } + } + + if (cvm::step_relative() == 0) { for (size_t i = 0; i < num_variables(); i++) { - colvar_centers_raw[i] += centers_incr[i]; - colvar_centers[i] = colvar_centers_raw[i]; - variables(i)->wrap(colvar_centers[i]); - colvar_centers[i].apply_constraints(); + // finite differences are undefined when restarting + centers_incr[i].reset(); } } if (cvm::debug()) { - cvm::log("New centers for the restraint bias \""+ - this->name+"\": "+cvm::to_str(colvar_centers)+".\n"); + cvm::log("Center increment for the restraint bias \""+ + this->name+"\": "+cvm::to_str(centers_incr)+ + " at stage "+cvm::to_str(stage)+ ".\n"); } } - return COLVARS_OK; + return cvm::get_error(); } int colvarbias_restraint_centers_moving::update_acc_work() { if (b_output_acc_work) { - if ((cvm::step_relative() > 0) || (cvm::step_absolute() == 0)) { + if ((cvm::step_relative() > 0) && + (cvm::step_absolute() <= target_nsteps)) { for (size_t i = 0; i < num_variables(); i++) { // project 
forces on the calculated increments at this step acc_work += colvar_forces[i] * centers_incr[i]; @@ -383,13 +403,6 @@ std::string const colvarbias_restraint_centers_moving::get_state_params() const << colvar_centers[i]; } os << "\n"; - os << "centers_raw "; - for (i = 0; i < num_variables(); i++) { - os << " " - << std::setprecision(cvm::cv_prec) << std::setw(cvm::cv_width) - << colvar_centers_raw[i]; - } - os << "\n"; if (b_output_acc_work) { os << "accumulatedWork " @@ -398,7 +411,7 @@ std::string const colvarbias_restraint_centers_moving::get_state_params() const } } - return colvarbias_restraint_moving::get_state_params() + os.str(); + return os.str(); } @@ -410,8 +423,6 @@ int colvarbias_restraint_centers_moving::set_state_params(std::string const &con // cvm::log ("Reading the updated restraint centers from the restart.\n"); if (!get_keyval(conf, "centers", colvar_centers)) cvm::error("Error: restraint centers are missing from the restart.\n"); - if (!get_keyval(conf, "centers_raw", colvar_centers_raw)) - cvm::error("Error: \"raw\" restraint centers are missing from the restart.\n"); if (b_output_acc_work) { if (!get_keyval(conf, "accumulatedWork", acc_work)) cvm::error("Error: accumulatedWork is missing from the restart.\n"); @@ -609,7 +620,7 @@ std::string const colvarbias_restraint_k_moving::get_state_params() const << std::setprecision(cvm::en_prec) << std::setw(cvm::en_width) << force_k << "\n"; } - return colvarbias_restraint_moving::get_state_params() + os.str(); + return os.str(); } @@ -770,6 +781,7 @@ cvm::real colvarbias_restraint_harmonic::d_restraint_potential_dk(size_t i) cons std::string const colvarbias_restraint_harmonic::get_state_params() const { return colvarbias_restraint::get_state_params() + + colvarbias_restraint_moving::get_state_params() + colvarbias_restraint_centers_moving::get_state_params() + colvarbias_restraint_k_moving::get_state_params(); } @@ -779,6 +791,7 @@ int colvarbias_restraint_harmonic::set_state_params(std::string const &conf) { int error_code = COLVARS_OK; error_code |= colvarbias_restraint::set_state_params(conf); + error_code |= colvarbias_restraint_moving::set_state_params(conf); error_code |= colvarbias_restraint_centers_moving::set_state_params(conf); error_code |= colvarbias_restraint_k_moving::set_state_params(conf); return error_code; @@ -1037,6 +1050,7 @@ cvm::real colvarbias_restraint_harmonic_walls::d_restraint_potential_dk(size_t i std::string const colvarbias_restraint_harmonic_walls::get_state_params() const { return colvarbias_restraint::get_state_params() + + colvarbias_restraint_moving::get_state_params() + colvarbias_restraint_k_moving::get_state_params(); } @@ -1045,6 +1059,7 @@ int colvarbias_restraint_harmonic_walls::set_state_params(std::string const &con { int error_code = COLVARS_OK; error_code |= colvarbias_restraint::set_state_params(conf); + error_code |= colvarbias_restraint_moving::set_state_params(conf); error_code |= colvarbias_restraint_k_moving::set_state_params(conf); return error_code; } @@ -1164,6 +1179,7 @@ cvm::real colvarbias_restraint_linear::d_restraint_potential_dk(size_t i) const std::string const colvarbias_restraint_linear::get_state_params() const { return colvarbias_restraint::get_state_params() + + colvarbias_restraint_moving::get_state_params() + colvarbias_restraint_centers_moving::get_state_params() + colvarbias_restraint_k_moving::get_state_params(); } @@ -1173,6 +1189,7 @@ int colvarbias_restraint_linear::set_state_params(std::string const &conf) { int error_code = COLVARS_OK; error_code 
|= colvarbias_restraint::set_state_params(conf); + error_code |= colvarbias_restraint_moving::set_state_params(conf); error_code |= colvarbias_restraint_centers_moving::set_state_params(conf); error_code |= colvarbias_restraint_k_moving::set_state_params(conf); return error_code; diff --git a/lib/colvars/colvarbias_restraint.h b/lib/colvars/colvarbias_restraint.h index 98b967abdb..8c3a1537fc 100644 --- a/lib/colvars/colvarbias_restraint.h +++ b/lib/colvars/colvarbias_restraint.h @@ -74,9 +74,6 @@ protected: /// \brief Restraint centers std::vector colvar_centers; - - /// \brief Restraint centers outside the domain of the colvars (no wrapping or constraints applied) - std::vector colvar_centers_raw; }; @@ -156,10 +153,16 @@ protected: /// \brief New restraint centers std::vector target_centers; + /// \brief Initial value of the restraint centers + std::vector initial_centers; + /// \brief Amplitude of the restraint centers' increment at each step - /// (or stage) towards the new values (calculated from target_nsteps) + /// towards the new values (calculated from target_nsteps) std::vector centers_incr; + /// \brief Update the centers by interpolating between initial and target + virtual int update_centers(cvm::real lambda); + /// Whether to write the current restraint centers to the trajectory file bool b_output_centers; diff --git a/lib/colvars/colvarcomp.h b/lib/colvars/colvarcomp.h index 2c865a166b..3c1ec2495c 100644 --- a/lib/colvars/colvarcomp.h +++ b/lib/colvars/colvarcomp.h @@ -132,9 +132,15 @@ public: static std::vector cvc_features; /// \brief Implementation of the feature list accessor for colvar - virtual std::vector &features() { + virtual const std::vector &features() + { return cvc_features; } + virtual std::vector &modify_features() + { + return cvc_features; + } + /// \brief Obtain data needed for the calculation for the backend virtual void read_data(); diff --git a/lib/colvars/colvardeps.cpp b/lib/colvars/colvardeps.cpp index 5402836f53..8f241a6255 100644 --- a/lib/colvars/colvardeps.cpp +++ b/lib/colvars/colvardeps.cpp @@ -374,8 +374,8 @@ int colvardeps::decr_ref_count(int feature_id) { } void colvardeps::init_feature(int feature_id, const char *description, feature_type type) { - features()[feature_id]->description = description; - features()[feature_id]->type = type; + modify_features()[feature_id]->description = description; + modify_features()[feature_id]->type = type; } // Shorthand macros for describing dependencies @@ -401,7 +401,7 @@ void colvardeps::init_cvb_requires() { int i; if (features().size() == 0) { for (i = 0; i < f_cvb_ntot; i++) { - features().push_back(new feature); + modify_features().push_back(new feature); } init_feature(f_cvb_active, "active", f_type_dynamic); @@ -438,7 +438,7 @@ void colvardeps::init_cv_requires() { size_t i; if (features().size() == 0) { for (i = 0; i < f_cv_ntot; i++) { - features().push_back(new feature); + modify_features().push_back(new feature); } init_feature(f_cv_active, "active", f_type_dynamic); @@ -554,7 +554,7 @@ void colvardeps::init_cvc_requires() { // Initialize static array once and for all if (features().size() == 0) { for (i = 0; i < colvardeps::f_cvc_ntot; i++) { - features().push_back(new feature); + modify_features().push_back(new feature); } init_feature(f_cvc_active, "active", f_type_dynamic); @@ -633,7 +633,7 @@ void colvardeps::init_ag_requires() { // Initialize static array once and for all if (features().size() == 0) { for (i = 0; i < f_ag_ntot; i++) { - features().push_back(new feature); + 
modify_features().push_back(new feature); } init_feature(f_ag_active, "active", f_type_dynamic); diff --git a/lib/colvars/colvardeps.h b/lib/colvars/colvardeps.h index b810a5fca1..dfb10d00e4 100644 --- a/lib/colvars/colvardeps.h +++ b/lib/colvars/colvardeps.h @@ -135,7 +135,8 @@ public: // with a non-static array // Intermediate classes (colvarbias and colvarcomp, which are also base classes) // implement this as virtual to allow overriding - virtual std::vector&features() = 0; + virtual const std::vector&features() = 0; + virtual std::vector&modify_features() = 0; void add_child(colvardeps *child); diff --git a/lib/colvars/colvars_version.h b/lib/colvars/colvars_version.h index e544756428..312c0fd1a0 100644 --- a/lib/colvars/colvars_version.h +++ b/lib/colvars/colvars_version.h @@ -1,4 +1,5 @@ -#define COLVARS_VERSION "2017-07-15" +#ifndef COLVARS_VERSION +#define COLVARS_VERSION "2017-08-06" // This file is part of the Collective Variables module (Colvars). // The original version of Colvars and its updates are located at: // https://github.com/colvars/colvars @@ -6,3 +7,4 @@ // If you wish to distribute your changes, please submit them to the // Colvars repository at GitHub. +#endif diff --git a/lib/colvars/colvarscript.cpp b/lib/colvars/colvarscript.cpp index 5bb2faae24..89302a16a2 100644 --- a/lib/colvars/colvarscript.cpp +++ b/lib/colvars/colvarscript.cpp @@ -472,7 +472,7 @@ int colvarscript::proc_features(colvardeps *obj, } if ((subcmd == "get") || (subcmd == "set")) { - std::vector &features = obj->features(); + std::vector const &features = obj->features(); std::string const req_feature(obj_to_str(objv[3])); colvardeps::feature *f = NULL; int fid = 0; diff --git a/lib/colvars/colvartypes.cpp b/lib/colvars/colvartypes.cpp index 5200d4d041..428fe1a4b1 100644 --- a/lib/colvars/colvartypes.cpp +++ b/lib/colvars/colvartypes.cpp @@ -19,6 +19,17 @@ bool colvarmodule::rotation::monitor_crossings = false; cvm::real colvarmodule::rotation::crossing_threshold = 1.0E-02; +/// Numerical recipes diagonalization +static int jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot); + +/// Eigenvector sort +static int eigsrt(cvm::real *d, cvm::real **v); + +/// Transpose the matrix +static int transpose(cvm::real **v); + + + std::string cvm::rvector::to_simple_string() const { std::ostringstream os; @@ -286,7 +297,12 @@ void colvarmodule::rotation::diagonalize_matrix(cvm::matrix2d &S, // diagonalize int jac_nrot = 0; - jacobi(S.c_array(), S_eigval.c_array(), S_eigvec.c_array(), &jac_nrot); + if (jacobi(S.c_array(), S_eigval.c_array(), S_eigvec.c_array(), &jac_nrot) != + COLVARS_OK) { + cvm::error("Too many iterations in routine jacobi.\n" + "This is usually the result of an ill-defined set of atoms for " + "rotational alignment (RMSD, rotateReference, etc).\n"); + } eigsrt(S_eigval.c_array(), S_eigvec.c_array()); // jacobi saves eigenvectors by columns transpose(S_eigvec.c_array()); @@ -528,7 +544,7 @@ void colvarmodule::rotation::calc_optimal_rotation(std::vector co #define n 4 -void jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot) +int jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot) { int j,iq,ip,i; cvm::real tresh,theta,tau,t,sm,s,h,g,c; @@ -554,7 +570,7 @@ void jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot) sm += std::fabs(a[ip][iq]); } if (sm == 0.0) { - return; + return COLVARS_OK; } if (i < 4) tresh=0.2*sm/(n*n); @@ -606,10 +622,11 @@ void jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot) z[ip]=0.0; } } - cvm::error("Too many 
iterations in routine jacobi.\n"); + return COLVARS_ERROR; } -void eigsrt(cvm::real *d, cvm::real **v) + +int eigsrt(cvm::real *d, cvm::real **v) { int k,j,i; cvm::real p; @@ -628,9 +645,11 @@ void eigsrt(cvm::real *d, cvm::real **v) } } } + return COLVARS_OK; } -void transpose(cvm::real **v) + +int transpose(cvm::real **v) { cvm::real p; int i,j; @@ -641,6 +660,7 @@ void transpose(cvm::real **v) v[j][i]=p; } } + return COLVARS_OK; } #undef n diff --git a/lib/colvars/colvartypes.h b/lib/colvars/colvartypes.h index 17c09a5095..fe3160eb4b 100644 --- a/lib/colvars/colvartypes.h +++ b/lib/colvars/colvartypes.h @@ -1020,16 +1020,6 @@ inline cvm::rvector operator * (cvm::rmatrix const &m, } -/// Numerical recipes diagonalization -void jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot); - -/// Eigenvector sort -void eigsrt(cvm::real *d, cvm::real **v); - -/// Transpose the matrix -void transpose(cvm::real **v); - - /// \brief 1-dimensional vector of real numbers with four components and diff --git a/lib/colvars/colvarvalue.cpp b/lib/colvars/colvarvalue.cpp index 7b498be6d6..312d101603 100644 --- a/lib/colvars/colvarvalue.cpp +++ b/lib/colvars/colvarvalue.cpp @@ -570,6 +570,50 @@ colvarvalue colvarvalue::dist2_grad(colvarvalue const &x2) const } +/// Return the interpolation between x1 and x2 with weight lambda +/// (which must be between 0.0 and 1.0; lambda = 0.5 gives the midpoint) +colvarvalue const colvarvalue::interpolate(colvarvalue const &x1, + colvarvalue const &x2, + cvm::real const lambda) +{ + colvarvalue::check_types(x1, x2); + + if ((lambda < 0.0) || (lambda > 1.0)) { + cvm::error("Error: trying to interpolate between two colvarvalues with a " + "lambda outside [0:1].\n", BUG_ERROR); + } + + colvarvalue interp = ((1.0-lambda)*x1 + lambda*x2); + cvm::real const d2 = x1.dist2(x2); + + switch (x1.type()) { + case colvarvalue::type_scalar: + case colvarvalue::type_3vector: + case colvarvalue::type_vector: + case colvarvalue::type_unit3vectorderiv: + case colvarvalue::type_quaternionderiv: + return interp; + break; + case colvarvalue::type_unit3vector: + case colvarvalue::type_quaternion: + if (interp.norm()/std::sqrt(d2) < 1.0e-6) { + cvm::error("Error: interpolation between "+cvm::to_str(x1)+" and "+ + cvm::to_str(x2)+" with lambda = "+cvm::to_str(lambda)+ + " is undefined: result = "+cvm::to_str(interp)+"\n", + INPUT_ERROR); + } + interp.apply_constraints(); + return interp; + break; + case colvarvalue::type_notset: + default: + x1.undef_op(); + break; + } + return colvarvalue(colvarvalue::type_notset); +} + + std::string colvarvalue::to_simple_string() const { switch (type()) { diff --git a/lib/colvars/colvarvalue.h b/lib/colvars/colvarvalue.h index fce0e1a970..41759e92b0 100644 --- a/lib/colvars/colvarvalue.h +++ b/lib/colvars/colvarvalue.h @@ -193,6 +193,12 @@ public: /// Derivative with respect to this \link colvarvalue \endlink of the square distance colvarvalue dist2_grad(colvarvalue const &x2) const; + /// Return the interpolation between x1 and x2 with weight lambda + /// (which must be between 0.0 and 1.0; lambda = 0.5 gives the midpoint) + static colvarvalue const interpolate(colvarvalue const &x1, + colvarvalue const &x2, + cvm::real const lambda = 0.5); + /// Assignment operator (type of x is checked) colvarvalue & operator = (colvarvalue const &x); @@ -285,10 +291,10 @@ public: cvm::real & operator [] (int const i); /// Ensure that the two types are the same within a binary operator - int static check_types(colvarvalue const &x1, colvarvalue
const &x2); /// Ensure that the two types are the same within an assignment, or that the left side is type_notset - int static check_types_assign(Type const &vt1, Type const &vt2); + static int check_types_assign(Type const &vt1, Type const &vt2); /// Undefined operation void undef_op() const; @@ -317,14 +323,14 @@ public: /// \brief Optimized routine for the inner product of one collective /// variable with an array - void static inner_opt(colvarvalue const &x, + static void inner_opt(colvarvalue const &x, std::vector::iterator &xv, std::vector::iterator const &xv_end, std::vector::iterator &result); /// \brief Optimized routine for the inner product of one collective /// variable with an array - void static inner_opt(colvarvalue const &x, + static void inner_opt(colvarvalue const &x, std::list::iterator &xv, std::list::iterator const &xv_end, std::vector::iterator &result); @@ -332,14 +338,14 @@ public: /// \brief Optimized routine for the second order Legendre /// polynomial, (3cos^2(w)-1)/2, of one collective variable with an /// array - void static p2leg_opt(colvarvalue const &x, + static void p2leg_opt(colvarvalue const &x, std::vector::iterator &xv, std::vector::iterator const &xv_end, std::vector::iterator &result); /// \brief Optimized routine for the second order Legendre /// polynomial of one collective variable with an array - void static p2leg_opt(colvarvalue const &x, + static void p2leg_opt(colvarvalue const &x, std::list::iterator &xv, std::list::iterator const &xv_end, std::vector::iterator &result); diff --git a/lib/gpu/Install.py b/lib/gpu/Install.py index 657f1c8fcc..6ea2159de5 100644 --- a/lib/gpu/Install.py +++ b/lib/gpu/Install.py @@ -14,7 +14,7 @@ Syntax from lib dir: python Install.py -m machine -h hdir -a arch -p precision - specify one or more options, order does not matter -copies an existing Makefile.machine in lib/gpu to Makefile.auto +copies an existing Makefile.machine in lib/gpu to Makefile.auto optionally edits these variables in Makefile.auto: CUDA_HOME, CUDA_ARCH, CUDA_PRECISION, EXTRAMAKE optionally uses Makefile.auto to build the GPU library -> libgpu.a @@ -26,7 +26,7 @@ optionally copies Makefile.auto to a new Makefile.osuffix -h = set CUDA_HOME variable in Makefile.auto to hdir hdir = path to NVIDIA Cuda software, e.g. 
/usr/local/cuda -a = set CUDA_ARCH variable in Makefile.auto to arch - use arch = 20 for Tesla C2050/C2070 (Fermi) (deprecated as of CUDA 8.0) + use arch = 20 for Tesla C2050/C2070 (Fermi) (deprecated as of CUDA 8.0) or GeForce GTX 580 or similar use arch = 30 for Tesla K10 (Kepler) use arch = 35 for Tesla K40 (Kepler) or GeForce GTX Titan or similar @@ -108,10 +108,10 @@ if pflag: elif precision == "mixed": precstr = "-D_SINGLE_DOUBLE" elif precision == "single": precstr = "-D_SINGLE_SINGLE" else: error("Invalid precision setting") - + # create Makefile.auto # reset EXTRAMAKE, CUDA_HOME, CUDA_ARCH, CUDA_PRECISION if requested - + if not os.path.exists("Makefile.%s" % isuffix): error("lib/gpu/Makefile.%s does not exist" % isuffix) diff --git a/lib/gpu/lal_aux_fun1.h b/lib/gpu/lal_aux_fun1.h index b40bb7f943..47a216ff6f 100644 --- a/lib/gpu/lal_aux_fun1.h +++ b/lib/gpu/lal_aux_fun1.h @@ -22,21 +22,21 @@ offset=tid & (t_per_atom-1); \ ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom; -#define nbor_info(nbor_mem, packed_mem, nbor_stride, t_per_atom, ii, offset, \ - i, numj, stride, nbor_end, nbor_begin) \ - i=nbor_mem[ii]; \ - nbor_begin=ii+nbor_stride; \ - numj=nbor_mem[nbor_begin]; \ - if (nbor_mem==packed_mem) { \ - nbor_begin+=nbor_stride+fast_mul(ii,t_per_atom-1); \ - stride=fast_mul(t_per_atom,nbor_stride); \ - nbor_end=nbor_begin+fast_mul(numj/t_per_atom,stride)+(numj & (t_per_atom-1)); \ +#define nbor_info(dev_nbor, dev_packed, nbor_pitch, t_per_atom, ii, offset, \ + i, numj, n_stride, nbor_end, nbor_begin) \ + i=dev_nbor[ii]; \ + nbor_begin=ii+nbor_pitch; \ + numj=dev_nbor[nbor_begin]; \ + if (dev_nbor==dev_packed) { \ + nbor_begin+=nbor_pitch+fast_mul(ii,t_per_atom-1); \ + n_stride=fast_mul(t_per_atom,nbor_pitch); \ + nbor_end=nbor_begin+fast_mul(numj/t_per_atom,n_stride)+(numj & (t_per_atom-1)); \ nbor_begin+=offset; \ } else { \ - nbor_begin+=nbor_stride; \ - nbor_begin=nbor_mem[nbor_begin]; \ + nbor_begin+=nbor_pitch; \ + nbor_begin=dev_nbor[nbor_begin]; \ nbor_end=nbor_begin+numj; \ - stride=t_per_atom; \ + n_stride=t_per_atom; \ nbor_begin+=offset; \ } diff --git a/lib/gpu/lal_base_three.cpp b/lib/gpu/lal_base_three.cpp index f772e36295..aa77a48c66 100644 --- a/lib/gpu/lal_base_three.cpp +++ b/lib/gpu/lal_base_three.cpp @@ -20,7 +20,7 @@ using namespace LAMMPS_AL; extern Device global_device; template -BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) { +BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) { device=&global_device; ans=new Answer(); nbor=new Neighbor(); @@ -53,8 +53,8 @@ int BaseThreeT::init_three(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, const void *pair_program, - const char *k_two, const char *k_three_center, - const char *k_three_end) { + const char *two, const char *three_center, + const char *three_end, const char *short_nbor) { screen=_screen; int gpu_nbor=0; @@ -70,10 +70,10 @@ int BaseThreeT::init_three(const int nlocal, const int nall, _gpu_host=1; _threads_per_atom=device->threads_per_atom(); - if (_threads_per_atom>1 && gpu_nbor==0) { + if (_threads_per_atom>1 && gpu_nbor==0) { // neigh no and tpa > 1 nbor->packing(true); _nbor_data=&(nbor->dev_packed); - } else + } else // neigh yes or tpa == 1 _nbor_data=&(nbor->dev_nbor); if (_threads_per_atom*_threads_per_atom>device->warp_size()) return -10; @@ -97,7 +97,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall, _block_pair=device->pair_block_size(); 
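/* The renamed nbor_info macro in lal_aux_fun1.h above packs this logic into
   one expansion; a plain-C++ reading of the same bounds computation (local
   names match the new macro parameters; this is an explanatory sketch, not
   patch code):

   int i     = dev_nbor[ii];            // central atom index
   int begin = ii + nbor_pitch;         // slot holding i's neighbor count
   int numj  = dev_nbor[begin];
   int n_stride, end;
   if (dev_nbor == dev_packed) {        // packed layout, t_per_atom threads
     begin   += nbor_pitch + fast_mul(ii, t_per_atom - 1);
     n_stride = fast_mul(t_per_atom, nbor_pitch);
     end      = begin + fast_mul(numj / t_per_atom, n_stride)
                      + (numj & (t_per_atom - 1));
     begin   += offset;
   } else {                             // unpacked layout: one indirection
     begin    = dev_nbor[begin + nbor_pitch];
     end      = begin + numj;
     n_stride = t_per_atom;
     begin   += offset;
   }
*/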
_block_size=device->block_ellipse(); - compile_kernels(*ucl_device,pair_program,k_two,k_three_center,k_three_end); + compile_kernels(*ucl_device,pair_program,two,three_center,three_end,short_nbor); // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); @@ -113,6 +113,11 @@ int BaseThreeT::init_three(const int nlocal, const int nall, _max_an_bytes+=ans2->gpu_bytes(); #endif + int ef_nall=nall; + if (ef_nall==0) + ef_nall=2000; + dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE); + return 0; } @@ -136,6 +141,7 @@ void BaseThreeT::clear_atomic() { k_three_end.clear(); k_three_end_vatom.clear(); k_pair.clear(); + k_short_nbor.clear(); delete pair_program; _compiled=false; } @@ -143,6 +149,7 @@ void BaseThreeT::clear_atomic() { time_pair.clear(); hd_balancer.clear(); + dev_short_nbor.clear(); nbor->clear(); ans->clear(); #ifdef THREE_CONCURRENT @@ -169,6 +176,8 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist, if (!success) return NULL; + _nall = nall; + // originally the requirement that nall == nlist was enforced // to allow direct indexing neighbors of neighbors after re-arrangement // nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size()); @@ -203,6 +212,8 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum, return 0; atom->cast_copy_x(host_x,host_type); + _nall = nall; + int mn; nbor->build_nbor_list(host_x, nall, host_inum, nall, *atom, sublo, subhi, tag, nspecial, special, success, mn); @@ -247,12 +258,22 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall, reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success); if (!success) return; + _max_nbors = nbor->max_nbor_loop(nlist,numj,ilist); } atom->cast_x_data(host_x,host_type); hd_balancer.start_timer(); atom->add_x_data(host_x,host_type); + // re-allocate dev_short_nbor if necessary + if (nall*(2+_max_nbors) > dev_short_nbor.cols()) { + int _nmax=static_cast(static_cast(nall)*1.10); + dev_short_nbor.resize((2+_max_nbors)*_nmax); + } + + // _ainum to be used in loop() for short neighbor list build + _ainum = nlist; + int evatom=0; if (eatom || vatom) evatom=1; @@ -300,7 +321,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full, // Build neighbor list on GPU if necessary if (ago==0) { - build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + _max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, success); if (!success) return NULL; @@ -313,6 +334,15 @@ int ** BaseThreeT::compute(const int ago, const int inum_full, *ilist=nbor->host_ilist.begin(); *jnum=nbor->host_acc.begin(); + // re-allocate dev_short_nbor if necessary + if (nall*(2+_max_nbors) > dev_short_nbor.cols()) { + int _nmax=static_cast(static_cast(nall)*1.10); + dev_short_nbor.resize((2+_max_nbors)*_nmax); + } + + // _ainum to be used in loop() for short neighbor list build + _ainum = nall; + int evatom=0; if (eatom || vatom) evatom=1; @@ -339,19 +369,20 @@ double BaseThreeT::host_memory_usage_atomic() const { template void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str, - const char *ktwo, const char *kthree_center, - const char *kthree_end) { + const char *two, const char *three_center, + const char *three_end, const char* short_nbor) { if (_compiled) return; - std::string vatom_name=std::string(kthree_end)+"_vatom"; + std::string vatom_name=std::string(three_end)+"_vatom"; pair_program=new UCL_Program(dev); 
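/* The dev_short_nbor buffer resized above backs the new short-neighbor-list
   optimization: once per step a k_*_short_nbor kernel filters the full list
   down to pairs within the short pairwise cutoff of the potential, storing a
   per-atom count followed by the surviving indices (the 2+_max_nbors sizing
   leaves room for the count plus bookkeeping).  A serial C++ outline of that
   filter; rsq(), cutshortsq, slot_of() and count_slot() are placeholders for
   the inline distance test, cutoff, and strided index arithmetic of the real
   kernels:

   int count = 0;
   for (int nbor = nbor_begin; nbor < nbor_end; nbor += n_stride) {
     int j = dev_packed[nbor] & NEIGHMASK;   // strip special-bond bits
     if (rsq(i, j) < cutshortsq)             // inside the short cutoff?
       dev_short_nbor[slot_of(i, count++)] = j;
   }
   dev_short_nbor[count_slot(i)] = count;    // read back by force kernels

   The two-body, zeta, and three-body kernels then loop over this trimmed
   list instead of re-testing every entry of the full neighbor list. */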
pair_program->load_string(pair_str,device->compile_string().c_str()); - k_three_center.set_function(*pair_program,kthree_center); - k_three_end.set_function(*pair_program,kthree_end); + k_three_center.set_function(*pair_program,three_center); + k_three_end.set_function(*pair_program,three_end); k_three_end_vatom.set_function(*pair_program,vatom_name.c_str()); - k_pair.set_function(*pair_program,ktwo); + k_pair.set_function(*pair_program,two); + k_short_nbor.set_function(*pair_program,short_nbor); pos_tex.get_texture(*pair_program,"pos_tex"); #ifdef THREE_CONCURRENT diff --git a/lib/gpu/lal_base_three.h b/lib/gpu/lal_base_three.h index 4f27ecdf92..f5f36863c4 100644 --- a/lib/gpu/lal_base_three.h +++ b/lib/gpu/lal_base_three.h @@ -56,7 +56,8 @@ class BaseThree { const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, const void *pair_program, const char *k_two, - const char *k_three_center, const char *k_three_end); + const char *k_three_center, const char *k_three_end, + const char *k_short_nbor=NULL); /// Estimate the overhead for GPU context changes and CPU driver void estimate_gpu_overhead(); @@ -73,18 +74,18 @@ class BaseThree { } /// Check if there is enough storage for neighbors and realloc if not - /** \param nlocal number of particles whose nbors must be stored on device - * \param host_inum number of particles whose nbors need to copied to host - * \param current maximum number of neighbors + /** \param inum number of particles whose nbors must be stored on device + * \param max_nbors maximum number of neighbors + * \param success set to false if insufficient memory * \note olist_size=total number of local particles **/ inline void resize_local(const int inum, const int max_nbors, bool &success) { nbor->resize(inum,max_nbors,success); } /// Check if there is enough storage for neighbors and realloc if not - /** \param nlocal number of particles whose nbors must be stored on device + /** \param inum number of particles whose nbors must be stored on device * \param host_inum number of particles whose nbors need to copied to host - * \param current maximum number of neighbors + * \param max_nbors current maximum number of neighbors * \note host_inum is 0 if the host is performing neighboring * \note nlocal+host_inum=total number local particles * \note olist_size=0 **/ @@ -143,14 +144,6 @@ class BaseThree { const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success); - /// Pair loop with device neighboring - int * compute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success); - /// Pair loop with device neighboring int ** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, @@ -193,6 +186,9 @@ class BaseThree { /// Neighbor data Neighbor *nbor; + UCL_D_Vec dev_short_nbor; + UCL_Kernel k_short_nbor; + // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *pair_program; UCL_Kernel k_pair, k_three_center, k_three_end, k_three_end_vatom; @@ -207,12 +203,13 @@ class BaseThree { int _block_pair, _block_size, _threads_per_atom, _end_command_queue; int _gpu_nbor; double _max_bytes, _max_an_bytes; + int _max_nbors, _ainum, _nall; double _gpu_overhead, _driver_overhead; UCL_D_Vec *_nbor_data; void 
compile_kernels(UCL_Device &dev, const void *pair_string, - const char *k_two, const char *k_three_center, - const char *k_three_end); + const char *two, const char *three_center, + const char *three_end, const char* short_nbor); virtual void loop(const bool _eflag, const bool _vflag, const int evatom) = 0; diff --git a/lib/gpu/lal_sw.cpp b/lib/gpu/lal_sw.cpp index 3492d7030e..24984e4878 100644 --- a/lib/gpu/lal_sw.cpp +++ b/lib/gpu/lal_sw.cpp @@ -55,7 +55,7 @@ int SWT::init(const int ntypes, const int nlocal, const int nall, const int max_ int success; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,sw,"k_sw","k_sw_three_center", - "k_sw_three_end"); + "k_sw_three_end","k_sw_short_nbor"); if (success!=0) return success; @@ -193,19 +193,30 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ + // build the short neighbor list + int ainum=this->_ainum; + int nbor_pitch=this->nbor->nbor_pitch(); + int GX=static_cast(ceil(static_cast(ainum)/ (BX/this->_threads_per_atom))); + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &sw3, &map, &elem2param, &_nelements, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &ainum, + &nbor_pitch, &this->_threads_per_atom); // this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1 // this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1 - int ainum=this->ans->inum(); - int nbor_pitch=this->nbor->nbor_pitch(); + ainum=this->ans->inum(); + nbor_pitch=this->nbor->nbor_pitch(); + GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); this->time_pair.start(); - + this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3, &map, &elem2param, &_nelements, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); @@ -217,6 +228,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_center.run(&this->atom->x, &sw1, &sw2, &sw3, &map, &elem2param, &_nelements, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &evatom); @@ -231,7 +243,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3, &map, &elem2param, &_nelements, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); @@ -240,7 +252,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3, &map, &elem2param, &_nelements, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); diff --git a/lib/gpu/lal_sw.cu b/lib/gpu/lal_sw.cu index 46330c59e4..a5c9f49d08 100644 --- a/lib/gpu/lal_sw.cu +++ b/lib/gpu/lal_sw.cu @@ -130,6 +130,63 @@ texture sw3_tex; #endif +__kernel void k_sw_short_nbor(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict sw3, + const 
__global int *restrict map, + const __global int *restrict elem2param, + const int nelements, + const __global int * dev_nbor, + const __global int * dev_packed, + __global int * dev_short_nbor, + const int inum, const int nbor_pitch, const int t_per_atom) { + __local int n_stride; + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + if (iiinit_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,tersoff,"k_tersoff_repulsive", - "k_tersoff_three_center", "k_tersoff_three_end"); + "k_tersoff_three_center", "k_tersoff_three_end", + "k_tersoff_short_nbor"); if (success!=0) return success; @@ -157,11 +158,16 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int UCL_H_Vec cutsq_view(nparams,*(this->ucl_device), UCL_WRITE_ONLY); - for (int i=0; i(host_cutsq[i]); + if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i]; + } cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); ucl_copy(cutsq,cutsq_view,false); + _cutshortsq = static_cast(cutsqmax); + UCL_H_Vec dview_elem2param(nelements*nelements*nelements, *(this->ucl_device), UCL_WRITE_ONLY); @@ -219,171 +225,6 @@ double TersoffT::host_memory_usage() const { #define KTHREADS this->_threads_per_atom #define JTHREADS this->_threads_per_atom -// --------------------------------------------------------------------------- -// Copy nbor list from host if necessary and then calculate forces, virials,.. -// --------------------------------------------------------------------------- -template -void TersoffT::compute(const int f_ago, const int inum_full, const int nall, - const int nlist, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, - const double cpu_time, bool &success) { - this->acc_timers(); - if (inum_full==0) { - host_start=0; - // Make sure textures are correct if realloc by a different hybrid style - this->resize_atom(0,nall,success); - this->zero_timers(); - return; - } - - int ago=this->hd_balancer.ago_first(f_ago); - int inum=this->hd_balancer.balance(ago,inum_full,cpu_time); - this->ans->inum(inum); - #ifdef THREE_CONCURRENT - this->ans2->inum(inum); - #endif - host_start=inum; - - if (ago==0) { - this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success); - if (!success) - return; - _max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist); - } - - this->atom->cast_x_data(host_x,host_type); - this->hd_balancer.start_timer(); - this->atom->add_x_data(host_x,host_type); - - // re-allocate zetaij if necessary - if (nall*_max_nbors > _zetaij.cols()) { - int _nmax=static_cast(static_cast(nall)*1.10); - _zetaij.resize(_max_nbors*_nmax); - } - - int _eflag; - if (eflag) - _eflag=1; - else - _eflag=0; - - int ainum=nlist; - int nbor_pitch=this->nbor->nbor_pitch(); - int BX=this->block_pair(); - int GX=static_cast(ceil(static_cast(ainum)/ - (BX/(JTHREADS*KTHREADS)))); - - this->k_zeta.set_size(GX,BX); - this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom); - - int evatom=0; - if (eatom || vatom) - evatom=1; - #ifdef THREE_CONCURRENT - this->ucl_device->sync(); - #endif - loop(eflag,vflag,evatom); - this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist); - this->device->add_ans_object(this->ans); - #ifdef THREE_CONCURRENT - this->ans2->copy_answers(eflag,vflag,eatom,vatom,ilist); - 
this->device->add_ans_object(this->ans2); - #endif - this->hd_balancer.stop_timer(); -} - -// --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary and then compute forces, virials, energies -// --------------------------------------------------------------------------- -template -int ** TersoffT::compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success) { - this->acc_timers(); - - if (inum_full==0) { - host_start=0; - // Make sure textures are correct if realloc by a different hybrid style - this->resize_atom(0,nall,success); - this->zero_timers(); - return NULL; - } - - this->hd_balancer.balance(cpu_time); - int inum=this->hd_balancer.get_gpu_count(ago,inum_full); - this->ans->inum(inum); - #ifdef THREE_CONCURRENT - this->ans2->inum(inum); - #endif - host_start=inum; - - // Build neighbor list on GPU if necessary - if (ago==0) { - _max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, - sublo, subhi, tag, nspecial, special, success); - if (!success) - return NULL; - this->hd_balancer.start_timer(); - } else { - this->atom->cast_x_data(host_x,host_type); - this->hd_balancer.start_timer(); - this->atom->add_x_data(host_x,host_type); - } - *ilist=this->nbor->host_ilist.begin(); - *jnum=this->nbor->host_acc.begin(); - - // re-allocate zetaij if necessary - if (nall*_max_nbors > _zetaij.cols()) { - int _nmax=static_cast(static_cast(nall)*1.10); - _zetaij.resize(_max_nbors*_nmax); - } - - int _eflag; - if (eflag) - _eflag=1; - else - _eflag=0; - - int ainum=nall; - int nbor_pitch=this->nbor->nbor_pitch(); - int BX=this->block_pair(); - int GX=static_cast(ceil(static_cast(ainum)/ - (BX/(JTHREADS*KTHREADS)))); - - this->k_zeta.set_size(GX,BX); - this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom); - - int evatom=0; - if (eatom || vatom) - evatom=1; - #ifdef THREE_CONCURRENT - this->ucl_device->sync(); - #endif - loop(eflag,vflag,evatom); - this->ans->copy_answers(eflag,vflag,eatom,vatom); - this->device->add_ans_object(this->ans); - #ifdef THREE_CONCURRENT - this->ans2->copy_answers(eflag,vflag,eatom,vatom); - this->device->add_ans_object(this->ans2); - #endif - this->hd_balancer.stop_timer(); - - return this->nbor->host_jlist.begin()-host_start; -} - // --------------------------------------------------------------------------- // Calculate energies, forces, and torques // --------------------------------------------------------------------------- @@ -402,9 +243,40 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) { else vflag=0; - int ainum=this->ans->inum(); + // build the short neighbor list + int ainum=this->_ainum; int nbor_pitch=this->nbor->nbor_pitch(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/ + int GX=static_cast(ceil(static_cast(ainum)/ + (BX/this->_threads_per_atom))); + + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &cutsq, &map, + &elem2param, &_nelements, &_nparams, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &ainum, + &nbor_pitch, 
&this->_threads_per_atom); + + // re-allocate zetaij if necessary + int nall = this->_nall; + if (nall*this->_max_nbors > _zetaij.cols()) { + int _nmax=static_cast(static_cast(nall)*1.10); + _zetaij.resize(this->_max_nbors*_nmax); + } + + nbor_pitch=this->nbor->nbor_pitch(); + GX=static_cast(ceil(static_cast(this->_ainum)/ + (BX/(JTHREADS*KTHREADS)))); + + this->k_zeta.set_size(GX,BX); + this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, + &map, &elem2param, &_nelements, &_nparams, &_zetaij, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); + + ainum=this->ans->inum(); + nbor_pitch=this->nbor->nbor_pitch(); + GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); this->time_pair.start(); @@ -412,6 +284,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_pair.run(&this->atom->x, &ts1, &ts2, &cutsq, &map, &elem2param, &_nelements, &_nparams, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); @@ -423,6 +296,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &evatom); @@ -437,7 +311,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); @@ -446,7 +320,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } diff --git a/lib/gpu/lal_tersoff.cu b/lib/gpu/lal_tersoff.cu index b7d48d9e34..cdeb5679d8 100644 --- a/lib/gpu/lal_tersoff.cu +++ b/lib/gpu/lal_tersoff.cu @@ -106,7 +106,7 @@ texture ts5_tex; ans[ii]=old; \ } -#define store_zeta(z, tid, t_per_atom, offset) \ +#define acc_zeta(z, tid, t_per_atom, offset) \ if (t_per_atom>1) { \ __local acctyp red_acc[BLOCK_PAIR]; \ red_acc[tid]=z; \ @@ -155,7 +155,7 @@ texture ts5_tex; ans[ii]=old; \ } -#define store_zeta(z, tid, t_per_atom, offset) \ +#define acc_zeta(z, tid, t_per_atom, offset) \ if (t_per_atom>1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ z += shfl_xor(z, s, t_per_atom); \ @@ -164,6 +164,65 @@ texture ts5_tex; #endif +__kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_, + const __global numtyp *restrict cutsq, + const __global int *restrict map, + const __global int *restrict elem2param, + const int nelements, const int nparams, + const __global int * dev_nbor, + const __global int * dev_packed, + __global int * dev_short_nbor, + 
const int inum, const int nbor_pitch, + const int t_per_atom) { + __local int n_stride; + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + if (ii cutsq[ijparam]) continue; +// if (rsq1 > cutsq[ijparam]) continue; // compute zeta_ij z = (acctyp)0; int nbor_k = nborj_start-offset_j+offset_k; - for ( ; nbor_k < nbor_end; nbor_k+=n_stride) { - int k=dev_packed[nbor_k]; + int k_end = nbor_end; + if (dev_packed==dev_nbor) { + int numk = dev_short_nbor[nbor_k-n_stride]; + k_end = nbor_k+fast_mul(numk,n_stride); + } + + for ( ; nbor_k < k_end; nbor_k+=n_stride) { + int k=nbor_mem[nbor_k]; k &= NEIGHMASK; if (k == j) continue; @@ -284,10 +357,12 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_, //int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride; //int idx = jj*n_stride + i*t_per_atom + offset_j; - int idx; - zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, - i, nbor_j, offset_j, idx); - store_zeta(z, tid, t_per_atom, offset_k); + //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor + int idx = nbor_j; + if (dev_packed==dev_nbor) idx -= n_stride; +// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, +// i, nbor_j, offset_j, idx); + acc_zeta(z, tid, t_per_atom, offset_k); numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex); numtyp ijparam_lam2 = ts1_ijparam.y; @@ -330,6 +405,7 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_, const int nelements, const int nparams, const __global int * dev_nbor, const __global int * dev_packed, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -356,8 +432,8 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_, __syncthreads(); if (ii0) - energy+=feng[1]; - if (vflag>0) { - virial[0] += delx*delx*force; - virial[1] += dely*dely*force; - virial[2] += delz*delz*force; - virial[3] += delx*dely*force; - virial[4] += delx*delz*force; - virial[5] += dely*delz*force; - } + if (eflag>0) + energy+=feng[1]; + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; } } // for nbor @@ -428,6 +511,7 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_, const __global acctyp4 *restrict zetaij, const __global int * dev_nbor, const __global int * dev_packed, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -461,20 +545,28 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_, if (ii cutsq[ijparam]) continue; numtyp r1 = ucl_sqrt(rsq1); numtyp r1inv = ucl_rsqrt(rsq1); @@ -497,9 +588,11 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_, //int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride; //int idx = jj*n_stride + i*t_per_atom + offset_j; - int idx; - zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, - i, nbor_j, offset_j, idx); + //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor + int idx = nbor_j; + if (dev_packed==dev_nbor) idx -= n_stride; +// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, +// i, nbor_j, offset_j, idx); acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex); numtyp force = zeta_ij.x*tpainv; numtyp prefactor = zeta_ij.y; @@ -520,9 +613,15 @@ __kernel void 
k_tersoff_three_center(const __global numtyp4 *restrict x_, virial[5] += delr1[1]*delr1[2]*mforce; } - int nbor_k=nborj_start-offset_j+offset_k; - for ( ; nbor_k cutsq[ijparam]) continue; - numtyp mdelr1[3]; mdelr1[0] = -delr1[0]; mdelr1[1] = -delr1[1]; @@ -683,13 +790,20 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_, k_end=nbor_k+numk; nbor_k+=offset_k; } + + // recalculate numk and k_end for the use of short neighbor list + if (dev_packed==dev_nbor) { + numk = dev_short_nbor[nbor_k]; + nbor_k += n_stride; + k_end = nbor_k+fast_mul(numk,n_stride); + } int nbork_start = nbor_k; // look up for zeta_ji: find i in the j's neighbor list int m = tid / t_per_atom; int ijnum = -1; for ( ; nbor_k cutsq[ijparam]) continue; - numtyp mdelr1[3]; mdelr1[0] = -delr1[0]; mdelr1[1] = -delr1[1]; @@ -909,13 +1035,20 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, k_end=nbor_k+numk; nbor_k+=offset_k; } + + // recalculate numk and k_end for the use of short neighbor list + if (dev_packed==dev_nbor) { + numk = dev_short_nbor[nbor_k]; + nbor_k += n_stride; + k_end = nbor_k+fast_mul(numk,n_stride); + } int nbork_start = nbor_k; // look up for zeta_ji int m = tid / t_per_atom; int ijnum = -1; for ( ; nbor_k { const double* h, const double* gamma, const double* beta, const double* powern, const double* cutsq); - /// Pair loop with host neighboring - void compute(const int f_ago, const int inum_full, const int nall, - const int nlist, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, const bool eflag, - const bool vflag, const bool eatom, const bool vatom, - int &host_start, const double cpu_time, bool &success); - - /// Pair loop with device neighboring - int ** compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **numj, const double cpu_time, bool &success); - /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -104,8 +89,7 @@ class Tersoff : public BaseThree { UCL_Kernel k_zeta; UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex; - - int _max_nbors; + numtyp _cutshortsq; private: bool _allocated; diff --git a/lib/gpu/lal_tersoff_mod.cpp b/lib/gpu/lal_tersoff_mod.cpp index 553dad3583..c37c07f1a1 100644 --- a/lib/gpu/lal_tersoff_mod.cpp +++ b/lib/gpu/lal_tersoff_mod.cpp @@ -55,7 +55,8 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in int success; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,tersoff_mod,"k_tersoff_mod_repulsive", - "k_tersoff_mod_three_center", "k_tersoff_mod_three_end"); + "k_tersoff_mod_three_center", "k_tersoff_mod_three_end", + "k_tersoff_mod_short_nbor"); if (success!=0) return success; @@ -157,11 +158,16 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in UCL_H_Vec cutsq_view(nparams,*(this->ucl_device), UCL_WRITE_ONLY); - for (int i=0; i(host_cutsq[i]); + if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i]; + } cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); ucl_copy(cutsq,cutsq_view,false); + _cutshortsq = static_cast(cutsqmax); + UCL_H_Vec dview_elem2param(nelements*nelements*nelements, *(this->ucl_device), UCL_WRITE_ONLY); @@ -219,171 +225,6 @@ double TersoffMT::host_memory_usage() const { 
#define KTHREADS this->_threads_per_atom #define JTHREADS this->_threads_per_atom -// --------------------------------------------------------------------------- -// Copy nbor list from host if necessary and then calculate forces, virials,.. -// --------------------------------------------------------------------------- -template -void TersoffMT::compute(const int f_ago, const int inum_full, const int nall, - const int nlist, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, - const double cpu_time, bool &success) { - this->acc_timers(); - if (inum_full==0) { - host_start=0; - // Make sure textures are correct if realloc by a different hybrid style - this->resize_atom(0,nall,success); - this->zero_timers(); - return; - } - - int ago=this->hd_balancer.ago_first(f_ago); - int inum=this->hd_balancer.balance(ago,inum_full,cpu_time); - this->ans->inum(inum); - #ifdef THREE_CONCURRENT - this->ans2->inum(inum); - #endif - host_start=inum; - - if (ago==0) { - this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success); - if (!success) - return; - _max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist); - } - - this->atom->cast_x_data(host_x,host_type); - this->hd_balancer.start_timer(); - this->atom->add_x_data(host_x,host_type); - - // re-allocate zetaij if necessary - if (nall*_max_nbors > _zetaij.cols()) { - int _nmax=static_cast(static_cast(nall)*1.10); - _zetaij.resize(_max_nbors*_nmax); - } - - int _eflag; - if (eflag) - _eflag=1; - else - _eflag=0; - - int ainum=nlist; - int nbor_pitch=this->nbor->nbor_pitch(); - int BX=this->block_pair(); - int GX=static_cast(ceil(static_cast(ainum)/ - (BX/(JTHREADS*KTHREADS)))); - - this->k_zeta.set_size(GX,BX); - this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom); - - int evatom=0; - if (eatom || vatom) - evatom=1; - #ifdef THREE_CONCURRENT - this->ucl_device->sync(); - #endif - loop(eflag,vflag,evatom); - this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist); - this->device->add_ans_object(this->ans); - #ifdef THREE_CONCURRENT - this->ans2->copy_answers(eflag,vflag,eatom,vatom,ilist); - this->device->add_ans_object(this->ans2); - #endif - this->hd_balancer.stop_timer(); -} - -// --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary and then compute forces, virials, energies -// --------------------------------------------------------------------------- -template -int ** TersoffMT::compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success) { - this->acc_timers(); - - if (inum_full==0) { - host_start=0; - // Make sure textures are correct if realloc by a different hybrid style - this->resize_atom(0,nall,success); - this->zero_timers(); - return NULL; - } - - this->hd_balancer.balance(cpu_time); - int inum=this->hd_balancer.get_gpu_count(ago,inum_full); - this->ans->inum(inum); - #ifdef THREE_CONCURRENT - this->ans2->inum(inum); - #endif - host_start=inum; - - // Build neighbor list on GPU if necessary - if (ago==0) 
{ - _max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, - sublo, subhi, tag, nspecial, special, success); - if (!success) - return NULL; - this->hd_balancer.start_timer(); - } else { - this->atom->cast_x_data(host_x,host_type); - this->hd_balancer.start_timer(); - this->atom->add_x_data(host_x,host_type); - } - *ilist=this->nbor->host_ilist.begin(); - *jnum=this->nbor->host_acc.begin(); - - // re-allocate zetaij if necessary - if (nall*_max_nbors > _zetaij.cols()) { - int _nmax=static_cast(static_cast(nall)*1.10); - _zetaij.resize(_max_nbors*_nmax); - } - - int _eflag; - if (eflag) - _eflag=1; - else - _eflag=0; - - int ainum=nall; - int nbor_pitch=this->nbor->nbor_pitch(); - int BX=this->block_pair(); - int GX=static_cast(ceil(static_cast(ainum)/ - (BX/(JTHREADS*KTHREADS)))); - - this->k_zeta.set_size(GX,BX); - this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom); - - int evatom=0; - if (eatom || vatom) - evatom=1; - #ifdef THREE_CONCURRENT - this->ucl_device->sync(); - #endif - loop(eflag,vflag,evatom); - this->ans->copy_answers(eflag,vflag,eatom,vatom); - this->device->add_ans_object(this->ans); - #ifdef THREE_CONCURRENT - this->ans2->copy_answers(eflag,vflag,eatom,vatom); - this->device->add_ans_object(this->ans2); - #endif - this->hd_balancer.stop_timer(); - - return this->nbor->host_jlist.begin()-host_start; -} - // --------------------------------------------------------------------------- // Calculate energies, forces, and torques // --------------------------------------------------------------------------- @@ -402,9 +243,40 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { else vflag=0; - int ainum=this->ans->inum(); + // build the short neighbor list + int ainum=this->_ainum; int nbor_pitch=this->nbor->nbor_pitch(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/ + int GX=static_cast(ceil(static_cast(ainum)/ + (BX/this->_threads_per_atom))); + + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &cutsq, &map, + &elem2param, &_nelements, &_nparams, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &ainum, + &nbor_pitch, &this->_threads_per_atom); + + // re-allocate zetaij if necessary + int nall = this->_nall; + if (nall*this->_max_nbors > _zetaij.cols()) { + int _nmax=static_cast(static_cast(nall)*1.10); + _zetaij.resize(this->_max_nbors*_nmax); + } + + nbor_pitch=this->nbor->nbor_pitch(); + GX=static_cast(ceil(static_cast(this->_ainum)/ + (BX/(JTHREADS*KTHREADS)))); + + this->k_zeta.set_size(GX,BX); + this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, + &map, &elem2param, &_nelements, &_nparams, &_zetaij, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); + + ainum=this->ans->inum(); + nbor_pitch=this->nbor->nbor_pitch(); + GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); this->time_pair.start(); @@ -412,6 +284,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_pair.run(&this->atom->x, &ts1, &ts2, &cutsq, &map, &elem2param, &_nelements, &_nparams, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, 
&nbor_pitch, &this->_threads_per_atom); @@ -423,6 +296,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &evatom); @@ -437,7 +311,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); @@ -446,7 +320,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } diff --git a/lib/gpu/lal_tersoff_mod.cu b/lib/gpu/lal_tersoff_mod.cu index 3a81b36941..576359b514 100644 --- a/lib/gpu/lal_tersoff_mod.cu +++ b/lib/gpu/lal_tersoff_mod.cu @@ -106,7 +106,7 @@ texture ts5_tex; ans[ii]=old; \ } -#define store_zeta(z, tid, t_per_atom, offset) \ +#define acc_zeta(z, tid, t_per_atom, offset) \ if (t_per_atom>1) { \ __local acctyp red_acc[BLOCK_PAIR]; \ red_acc[tid]=z; \ @@ -155,7 +155,7 @@ texture ts5_tex; ans[ii]=old; \ } -#define store_zeta(z, tid, t_per_atom, offset) \ +#define acc_zeta(z, tid, t_per_atom, offset) \ if (t_per_atom>1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ z += shfl_xor(z, s, t_per_atom); \ @@ -164,6 +164,65 @@ texture ts5_tex; #endif +__kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_, + const __global numtyp *restrict cutsq, + const __global int *restrict map, + const __global int *restrict elem2param, + const int nelements, const int nparams, + const __global int * dev_nbor, + const __global int * dev_packed, + __global int * dev_short_nbor, + const int inum, const int nbor_pitch, + const int t_per_atom) { + __local int n_stride; + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + if (ii cutsq[ijparam]) continue; - // compute zeta_ij - z = (numtyp)0; + z = (acctyp)0; int nbor_k = nborj_start-offset_j+offset_k; - for ( ; nbor_k < nbor_end; nbor_k+=n_stride) { - int k=dev_packed[nbor_k]; + int k_end = nbor_end; + if (dev_packed==dev_nbor) { + int numk = dev_short_nbor[nbor_k-n_stride]; + k_end = nbor_k+fast_mul(numk,n_stride); + } + + for ( ; nbor_k < k_end; nbor_k+=n_stride) { + int k=nbor_mem[nbor_k]; k &= NEIGHMASK; if (k == j) continue; @@ -287,10 +358,12 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_, //int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride; //int idx = jj*n_stride + i*t_per_atom + offset_j; - int idx; - zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, - i, nbor_j, offset_j, idx); - store_zeta(z, tid, t_per_atom, offset_k); + //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor + int idx = nbor_j; + if (dev_packed==dev_nbor) idx -= n_stride; +// 
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, +// i, nbor_j, offset_j, idx); + acc_zeta(z, tid, t_per_atom, offset_k); numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex); numtyp ijparam_lam2 = ts1_ijparam.y; @@ -331,6 +404,7 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_, const int nelements, const int nparams, const __global int * dev_nbor, const __global int * dev_packed, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -357,8 +431,8 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_, __syncthreads(); if (ii0) - energy+=feng[1]; - if (vflag>0) { - virial[0] += delx*delx*force; - virial[1] += dely*dely*force; - virial[2] += delz*delz*force; - virial[3] += delx*dely*force; - virial[4] += delx*delz*force; - virial[5] += dely*delz*force; - } + if (eflag>0) + energy+=feng[1]; + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; } } // for nbor @@ -430,6 +511,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, const __global acctyp4 *restrict zetaij, const __global int * dev_nbor, const __global int * dev_packed, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -465,20 +547,28 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, if (ii cutsq[ijparam]) continue; numtyp r1 = ucl_sqrt(rsq1); numtyp r1inv = ucl_rsqrt(rsq1); @@ -501,9 +590,11 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, //int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride; //int idx = jj*n_stride + i*t_per_atom + offset_j; - int idx; - zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, - i, nbor_j, offset_j, idx); + //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor + int idx = nbor_j; + if (dev_packed==dev_nbor) idx -= n_stride; +// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, +// i, nbor_j, offset_j, idx); acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex); numtyp force = zeta_ij.x*tpainv; numtyp prefactor = zeta_ij.y; @@ -524,9 +615,15 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, virial[5] += delr1[1]*delr1[2]*mforce; } - int nbor_k=nborj_start-offset_j+offset_k; - for ( ; nbor_k cutsq[ijparam]) continue; - numtyp mdelr1[3]; mdelr1[0] = -delr1[0]; mdelr1[1] = -delr1[1]; @@ -693,13 +798,20 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_, k_end=nbor_k+numk; nbor_k+=offset_k; } + + // recalculate numk and k_end for the use of short neighbor list + if (dev_packed==dev_nbor) { + numk = dev_short_nbor[nbor_k]; + nbor_k += n_stride; + k_end = nbor_k+fast_mul(numk,n_stride); + } int nbork_start = nbor_k; // look up for zeta_ji: find i in the j's neighbor list int m = tid / t_per_atom; int ijnum = -1; for ( ; nbor_k cutsq[ijparam]) continue; - numtyp mdelr1[3]; mdelr1[0] = -delr1[0]; mdelr1[1] = -delr1[1]; @@ -928,13 +1052,20 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_, k_end=nbor_k+numk; nbor_k+=offset_k; } + + // recalculate numk and k_end for the use of short neighbor list + if (dev_packed==dev_nbor) { + numk = dev_short_nbor[nbor_k]; + nbor_k += 
n_stride; + k_end = nbor_k+fast_mul(numk,n_stride); + } int nbork_start = nbor_k; // look up for zeta_ji int m = tid / t_per_atom; int ijnum = -1; for ( ; nbor_k { const double* h, const double* beta, const double* powern, const double* powern_del, const double* ca1, const double* cutsq); - /// Pair loop with host neighboring - void compute(const int f_ago, const int inum_full, const int nall, - const int nlist, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, const bool eflag, - const bool vflag, const bool eatom, const bool vatom, - int &host_start, const double cpu_time, bool &success); - - /// Pair loop with device neighboring - int ** compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **numj, const double cpu_time, bool &success); - /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -104,8 +89,7 @@ class TersoffMod : public BaseThree { UCL_Kernel k_zeta; UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex; - - int _max_nbors; + numtyp _cutshortsq; private: bool _allocated; diff --git a/lib/gpu/lal_tersoff_zbl.cpp b/lib/gpu/lal_tersoff_zbl.cpp index 9cce8a802d..341f663030 100644 --- a/lib/gpu/lal_tersoff_zbl.cpp +++ b/lib/gpu/lal_tersoff_zbl.cpp @@ -62,7 +62,8 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, int success; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,tersoff_zbl,"k_tersoff_zbl_repulsive", - "k_tersoff_zbl_three_center", "k_tersoff_zbl_three_end"); + "k_tersoff_zbl_three_center", "k_tersoff_zbl_three_end", + "k_tersoff_zbl_short_nbor"); if (success!=0) return success; @@ -177,11 +178,16 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, UCL_H_Vec cutsq_view(nparams,*(this->ucl_device), UCL_WRITE_ONLY); - for (int i=0; i(host_cutsq[i]); + if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i]; + } cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); ucl_copy(cutsq,cutsq_view,false); + _cutshortsq = static_cast(cutsqmax); + UCL_H_Vec dview_elem2param(nelements*nelements*nelements, *(this->ucl_device), UCL_WRITE_ONLY); @@ -244,171 +250,6 @@ double TersoffZT::host_memory_usage() const { #define KTHREADS this->_threads_per_atom #define JTHREADS this->_threads_per_atom -// --------------------------------------------------------------------------- -// Copy nbor list from host if necessary and then calculate forces, virials,.. 
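// ---------------------------------------------------------------------------
// Editor's aside (hedged sketch): in the init() hunks above, while host_cutsq
// is copied into the device view, the patch now also records the largest
// entry and stores it as _cutshortsq; that single radius is what the
// short-neighbor kernel filters against. A minimal standalone equivalent
// (the helper name max_cutsq is hypothetical):
// ---------------------------------------------------------------------------
#include <vector>

static double max_cutsq(const std::vector<double> &host_cutsq) {
  double cutsqmax = 0.0;
  for (size_t i = 0; i < host_cutsq.size(); ++i)
    if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i]; // track the max
  return cutsqmax;  // becomes _cutshortsq after a cast to numtyp
}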
-// --------------------------------------------------------------------------- -template -void TersoffZT::compute(const int f_ago, const int inum_full, const int nall, - const int nlist, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, - const double cpu_time, bool &success) { - this->acc_timers(); - if (inum_full==0) { - host_start=0; - // Make sure textures are correct if realloc by a different hybrid style - this->resize_atom(0,nall,success); - this->zero_timers(); - return; - } - - int ago=this->hd_balancer.ago_first(f_ago); - int inum=this->hd_balancer.balance(ago,inum_full,cpu_time); - this->ans->inum(inum); - #ifdef THREE_CONCURRENT - this->ans2->inum(inum); - #endif - host_start=inum; - - if (ago==0) { - this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success); - if (!success) - return; - _max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist); - } - - this->atom->cast_x_data(host_x,host_type); - this->hd_balancer.start_timer(); - this->atom->add_x_data(host_x,host_type); - - // re-allocate zetaij if necessary - if (nall*_max_nbors > _zetaij.cols()) { - int _nmax=static_cast(static_cast(nall)*1.10); - _zetaij.resize(_max_nbors*_nmax); - } - - int _eflag; - if (eflag) - _eflag=1; - else - _eflag=0; - - int ainum=nlist; - int nbor_pitch=this->nbor->nbor_pitch(); - int BX=this->block_pair(); - int GX=static_cast(ceil(static_cast(ainum)/ - (BX/(JTHREADS*KTHREADS)))); - - this->k_zeta.set_size(GX,BX); - this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom); - - int evatom=0; - if (eatom || vatom) - evatom=1; - #ifdef THREE_CONCURRENT - this->ucl_device->sync(); - #endif - loop(eflag,vflag,evatom); - this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist); - this->device->add_ans_object(this->ans); - #ifdef THREE_CONCURRENT - this->ans2->copy_answers(eflag,vflag,eatom,vatom,ilist); - this->device->add_ans_object(this->ans2); - #endif - this->hd_balancer.stop_timer(); -} - -// --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary and then compute forces, virials, energies -// --------------------------------------------------------------------------- -template -int ** TersoffZT::compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success) { - this->acc_timers(); - - if (inum_full==0) { - host_start=0; - // Make sure textures are correct if realloc by a different hybrid style - this->resize_atom(0,nall,success); - this->zero_timers(); - return NULL; - } - - this->hd_balancer.balance(cpu_time); - int inum=this->hd_balancer.get_gpu_count(ago,inum_full); - this->ans->inum(inum); - #ifdef THREE_CONCURRENT - this->ans2->inum(inum); - #endif - host_start=inum; - - // Build neighbor list on GPU if necessary - if (ago==0) { - _max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, - sublo, subhi, tag, nspecial, special, success); - if (!success) - return NULL; - this->hd_balancer.start_timer(); - } else { - 
this->atom->cast_x_data(host_x,host_type); - this->hd_balancer.start_timer(); - this->atom->add_x_data(host_x,host_type); - } - *ilist=this->nbor->host_ilist.begin(); - *jnum=this->nbor->host_acc.begin(); - - // re-allocate zetaij if necessary - if (nall*_max_nbors > _zetaij.cols()) { - int _nmax=static_cast(static_cast(nall)*1.10); - _zetaij.resize(_max_nbors*_nmax); - } - - int _eflag; - if (eflag) - _eflag=1; - else - _eflag=0; - - int ainum=nall; - int nbor_pitch=this->nbor->nbor_pitch(); - int BX=this->block_pair(); - int GX=static_cast(ceil(static_cast(ainum)/ - (BX/(JTHREADS*KTHREADS)))); - - this->k_zeta.set_size(GX,BX); - this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom); - - int evatom=0; - if (eatom || vatom) - evatom=1; - #ifdef THREE_CONCURRENT - this->ucl_device->sync(); - #endif - loop(eflag,vflag,evatom); - this->ans->copy_answers(eflag,vflag,eatom,vatom); - this->device->add_ans_object(this->ans); - #ifdef THREE_CONCURRENT - this->ans2->copy_answers(eflag,vflag,eatom,vatom); - this->device->add_ans_object(this->ans2); - #endif - this->hd_balancer.stop_timer(); - - return this->nbor->host_jlist.begin()-host_start; -} - // --------------------------------------------------------------------------- // Calculate energies, forces, and torques // --------------------------------------------------------------------------- @@ -427,9 +268,40 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { else vflag=0; - int ainum=this->ans->inum(); + // build the short neighbor list + int ainum=this->_ainum; int nbor_pitch=this->nbor->nbor_pitch(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/ + int GX=static_cast(ceil(static_cast(ainum)/ + (BX/this->_threads_per_atom))); + + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &cutsq, &map, + &elem2param, &_nelements, &_nparams, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &ainum, + &nbor_pitch, &this->_threads_per_atom); + + // re-allocate zetaij if necessary + int nall = this->_nall; + if (nall*this->_max_nbors > _zetaij.cols()) { + int _nmax=static_cast(static_cast(nall)*1.10); + _zetaij.resize(this->_max_nbors*_nmax); + } + + nbor_pitch=this->nbor->nbor_pitch(); + GX=static_cast(ceil(static_cast(this->_ainum)/ + (BX/(JTHREADS*KTHREADS)))); + + this->k_zeta.set_size(GX,BX); + this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq, + &map, &elem2param, &_nelements, &_nparams, &_zetaij, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); + + ainum=this->ans->inum(); + nbor_pitch=this->nbor->nbor_pitch(); + GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); this->time_pair.start(); @@ -438,6 +310,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { &_global_e, &_global_a_0, &_global_epsilon_0, &cutsq, &map, &elem2param, &_nelements, &_nparams, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); @@ -449,6 +322,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, 
&map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &evatom); @@ -463,7 +337,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); @@ -472,7 +346,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } diff --git a/lib/gpu/lal_tersoff_zbl.cu b/lib/gpu/lal_tersoff_zbl.cu index 9509b9802c..e8bb017f59 100644 --- a/lib/gpu/lal_tersoff_zbl.cu +++ b/lib/gpu/lal_tersoff_zbl.cu @@ -109,7 +109,7 @@ texture ts6_tex; ans[ii]=old; \ } -#define store_zeta(z, tid, t_per_atom, offset) \ +#define acc_zeta(z, tid, t_per_atom, offset) \ if (t_per_atom>1) { \ __local acctyp red_acc[BLOCK_PAIR]; \ red_acc[tid]=z; \ @@ -158,7 +158,7 @@ texture ts6_tex; ans[ii]=old; \ } -#define store_zeta(z, tid, t_per_atom, offset) \ +#define acc_zeta(z, tid, t_per_atom, offset) \ if (t_per_atom>1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ z += shfl_xor(z, s, t_per_atom); \ @@ -167,6 +167,65 @@ texture ts6_tex; #endif +__kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_, + const __global numtyp *restrict cutsq, + const __global int *restrict map, + const __global int *restrict elem2param, + const int nelements, const int nparams, + const __global int * dev_nbor, + const __global int * dev_packed, + __global int * dev_short_nbor, + const int inum, const int nbor_pitch, + const int t_per_atom) { + __local int n_stride; + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + if (ii cutsq[ijparam]) continue; - // compute zeta_ij z = (acctyp)0; int nbor_k = nborj_start-offset_j+offset_k; - for ( ; nbor_k < nbor_end; nbor_k+=n_stride) { - int k=dev_packed[nbor_k]; + int k_end = nbor_end; + if (dev_packed==dev_nbor) { + int numk = dev_short_nbor[nbor_k-n_stride]; + k_end = nbor_k+fast_mul(numk,n_stride); + } + + for ( ; nbor_k < k_end; nbor_k+=n_stride) { + int k=nbor_mem[nbor_k]; k &= NEIGHMASK; if (k == j) continue; @@ -290,10 +361,12 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_, //int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride; //int idx = jj*n_stride + i*t_per_atom + offset_j; - int idx; - zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, - i, nbor_j, offset_j, idx); - store_zeta(z, tid, t_per_atom, offset_k); + //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor + int idx = nbor_j; + if (dev_packed==dev_nbor) idx -= n_stride; +// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, +// i, nbor_j, offset_j, idx); + acc_zeta(z, tid, t_per_atom, offset_k); numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex); numtyp ijparam_lam2 = ts1_ijparam.y; @@ -342,6 
+415,7 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_, const int nelements, const int nparams, const __global int * dev_nbor, const __global int * dev_packed, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -370,8 +444,8 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_, __syncthreads(); if (ii0) - energy+=feng[1]; - if (vflag>0) { - virial[0] += delx*delx*force; - virial[1] += dely*dely*force; - virial[2] += delz*delz*force; - virial[3] += delx*dely*force; - virial[4] += delx*delz*force; - virial[5] += dely*delz*force; - } + if (eflag>0) + energy+=feng[1]; + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; } } // for nbor @@ -448,6 +529,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, const __global acctyp4 *restrict zetaij, const __global int * dev_nbor, const __global int * dev_packed, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -481,20 +563,28 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, if (ii cutsq[ijparam]) continue; numtyp r1 = ucl_sqrt(rsq1); numtyp r1inv = ucl_rsqrt(rsq1); @@ -517,9 +606,11 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, //int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride; //int idx = jj*n_stride + i*t_per_atom + offset_j; - int idx; - zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, - i, nbor_j, offset_j, idx); + //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor + int idx = nbor_j; + if (dev_packed==dev_nbor) idx -= n_stride; +// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, +// i, nbor_j, offset_j, idx); acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex); numtyp force = zeta_ij.x*tpainv; numtyp prefactor = zeta_ij.y; @@ -540,9 +631,15 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, virial[5] += delr1[1]*delr1[2]*mforce; } - int nbor_k=nborj_start-offset_j+offset_k; - for ( ; nbor_k cutsq[ijparam]) continue; - numtyp mdelr1[3]; mdelr1[0] = -delr1[0]; mdelr1[1] = -delr1[1]; @@ -703,13 +808,20 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_, k_end=nbor_k+numk; nbor_k+=offset_k; } + + // recalculate numk and k_end for the use of short neighbor list + if (dev_packed==dev_nbor) { + numk = dev_short_nbor[nbor_k]; + nbor_k += n_stride; + k_end = nbor_k+fast_mul(numk,n_stride); + } int nbork_start = nbor_k; // look up for zeta_ji: find i in the j's neighbor list int m = tid / t_per_atom; int ijnum = -1; for ( ; nbor_k cutsq[ijparam]) continue; - numtyp mdelr1[3]; mdelr1[0] = -delr1[0]; mdelr1[1] = -delr1[1]; @@ -929,13 +1053,20 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_, k_end=nbor_k+numk; nbor_k+=offset_k; } + + // recalculate numk and k_end for the use of short neighbor list + if (dev_packed==dev_nbor) { + numk = dev_short_nbor[nbor_k]; + nbor_k += n_stride; + k_end = nbor_k+fast_mul(numk,n_stride); + } int nbork_start = nbor_k; // look up for zeta_ji int m = tid / t_per_atom; int ijnum = -1; for ( ; nbor_k { const double* ZBLcut, const double* ZBLexpscale, const double global_e, const double 
global_a_0, const double global_epsilon_0, const double* cutsq); - /// Pair loop with host neighboring - void compute(const int f_ago, const int inum_full, const int nall, - const int nlist, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, const bool eflag, - const bool vflag, const bool eatom, const bool vatom, - int &host_start, const double cpu_time, bool &success); - - /// Pair loop with device neighboring - int ** compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **numj, const double cpu_time, bool &success); - /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -109,8 +94,8 @@ class TersoffZBL : public BaseThree { UCL_Kernel k_zeta; UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex, ts6_tex; - int _max_nbors; numtyp _global_e,_global_a_0,_global_epsilon_0; + numtyp _cutshortsq; private: bool _allocated; diff --git a/lib/gpu/lal_vashishta.cpp b/lib/gpu/lal_vashishta.cpp index 96537e65d3..d03ac992bd 100644 --- a/lib/gpu/lal_vashishta.cpp +++ b/lib/gpu/lal_vashishta.cpp @@ -59,7 +59,7 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i int success; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,vashishta,"k_vashishta","k_vashishta_three_center", - "k_vashishta_three_end"); + "k_vashishta_three_end","k_vashishta_short_nbor"); if (success!=0) return success; @@ -128,15 +128,18 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i param4.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); + double r0sqmax = 0; for (int i=0; i(r0sq); dview[i].y=static_cast(gamma[i]); dview[i].z=static_cast(cutsq[i]); dview[i].w=static_cast(r0[i]); } + _cutshortsq = static_cast(r0sqmax); + ucl_copy(param4,dview,false); param4_tex.get_texture(*(this->pair_program),"param4_tex"); param4_tex.bind_float(param4,4); @@ -223,15 +226,28 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ + // build the short neighbor list + int ainum=this->_ainum; + int nbor_pitch=this->nbor->nbor_pitch(); + int GX=static_cast(ceil(static_cast(ainum)/ (BX/this->_threads_per_atom))); + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, ¶m4, &map, + &elem2param, &_nelements, &_nparams, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &ainum, + &nbor_pitch, &this->_threads_per_atom); + // this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1 // this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1 - int ainum=this->ans->inum(); - int nbor_pitch=this->nbor->nbor_pitch(); + ainum=this->ans->inum(); + nbor_pitch=this->nbor->nbor_pitch(); + GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); this->time_pair.start(); + // note that k_pair does not run with the short neighbor list this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, &map, &elem2param, &_nelements, @@ -248,6 +264,7 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_center.run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, &map, &elem2param, &_nelements, 
&this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &evatom); Answer *end_ans; @@ -257,21 +274,19 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) { end_ans=this->ans; #endif if (evatom!=0) { - this->k_three_end_vatom.set_size(GX,BX); this->k_three_end_vatom.run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, &map, &elem2param, &_nelements, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } else { - this->k_three_end.set_size(GX,BX); this->k_three_end.run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, &map, &elem2param, &_nelements, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } diff --git a/lib/gpu/lal_vashishta.cu b/lib/gpu/lal_vashishta.cu index caa3c03613..fa7f413aa5 100644 --- a/lib/gpu/lal_vashishta.cu +++ b/lib/gpu/lal_vashishta.cu @@ -136,6 +136,64 @@ texture param5_tex; #endif +__kernel void k_vashishta_short_nbor(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict param4, + const __global int *restrict map, + const __global int *restrict elem2param, + const int nelements, const int nparams, + const __global int * dev_nbor, + const __global int * dev_packed, + __global int * dev_short_nbor, + const int inum, const int nbor_pitch, + const int t_per_atom) { + __local int n_stride; + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + if (ii0) energy += (param3_bigh*reta+vc2-vc3-param3_bigw*r6inv-r*param3_dvrc+param3_c0); @@ -255,31 +313,31 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_, numtyp r1 = ucl_sqrt(rsq1); \ numtyp rinvsq1 = ucl_recip(rsq1); \ numtyp rainv1 = ucl_recip(r1 - param_r0_ij); \ - numtyp gsrainv1 = param_gamma_ij * rainv1; \ + numtyp gsrainv1 = param_gamma_ij * rainv1; \ numtyp gsrainvsq1 = gsrainv1*rainv1/r1; \ numtyp expgsrainv1 = ucl_exp(gsrainv1); \ \ numtyp r2 = ucl_sqrt(rsq2); \ numtyp rinvsq2 = ucl_recip(rsq2); \ numtyp rainv2 = ucl_recip(r2 - param_r0_ik); \ - numtyp gsrainv2 = param_gamma_ik * rainv2; \ + numtyp gsrainv2 = param_gamma_ik * rainv2; \ numtyp gsrainvsq2 = gsrainv2*rainv2/r2; \ numtyp expgsrainv2 = ucl_exp(gsrainv2); \ \ numtyp rinv12 = ucl_recip(r1*r2); \ numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12; \ - numtyp delcs = cs - param_costheta_ijk; \ + numtyp delcs = cs - param_costheta_ijk; \ numtyp delcssq = delcs*delcs; \ - numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \ + numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \ numtyp pcsinvsq = pcsinv*pcsinv; \ numtyp pcs = delcssq/pcsinv; \ \ numtyp facexp = expgsrainv1*expgsrainv2; \ \ - numtyp facrad = param_bigb_ijk * facexp*pcs; \ + numtyp facrad = param_bigb_ijk * facexp*pcs; \ numtyp frad1 = facrad*gsrainvsq1; \ numtyp frad2 = facrad*gsrainvsq2; \ - numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \ + numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \ numtyp facang12 = rinv12*facang; \ numtyp csfacang = cs*facang; \ numtyp csfac1 = rinvsq1*csfacang; \ @@ -311,28 +369,28 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_, numtyp r1 = ucl_sqrt(rsq1); \ numtyp rinvsq1 = 
ucl_recip(rsq1); \ numtyp rainv1 = ucl_recip(r1 - param_r0_ij); \ - numtyp gsrainv1 = param_gamma_ij * rainv1; \ + numtyp gsrainv1 = param_gamma_ij * rainv1; \ numtyp gsrainvsq1 = gsrainv1*rainv1/r1; \ numtyp expgsrainv1 = ucl_exp(gsrainv1); \ \ numtyp r2 = ucl_sqrt(rsq2); \ numtyp rainv2 = ucl_recip(r2 - param_r0_ik); \ - numtyp gsrainv2 = param_gamma_ik * rainv2; \ + numtyp gsrainv2 = param_gamma_ik * rainv2; \ numtyp expgsrainv2 = ucl_exp(gsrainv2); \ \ numtyp rinv12 = ucl_recip(r1*r2); \ numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12; \ - numtyp delcs = cs - param_costheta_ijk; \ + numtyp delcs = cs - param_costheta_ijk; \ numtyp delcssq = delcs*delcs; \ - numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \ + numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \ numtyp pcsinvsq = pcsinv*pcsinv; \ numtyp pcs = delcssq/pcsinv; \ \ numtyp facexp = expgsrainv1*expgsrainv2; \ \ - numtyp facrad = param_bigb_ijk * facexp*pcs; \ + numtyp facrad = param_bigb_ijk * facexp*pcs; \ numtyp frad1 = facrad*gsrainvsq1; \ - numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \ + numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \ numtyp facang12 = rinv12*facang; \ numtyp csfacang = cs*facang; \ numtyp csfac1 = rinvsq1*csfacang; \ @@ -353,6 +411,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_, const int nelements, const __global int * dev_nbor, const __global int * dev_packed, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -377,7 +436,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_, if (ii param_r0sq_ij) continue; + if (rsq1 > param_r0sq_ij) continue; // still keep this for neigh no and tpa > 1 param_gamma_ij=param4_ijparam.y; param_r0_ij=param4_ijparam.w; - int nbor_k=nbor_j-offset_j+offset_k; - if (nbor_k<=nbor_j) - nbor_k+=n_stride; + int nbor_k,k_end; + if (dev_packed==dev_nbor) { + nbor_k=nborj_start-offset_j+offset_k; + int numk = dev_short_nbor[nbor_k-n_stride]; + k_end = nbor_k+fast_mul(numk,n_stride); + } else { + nbor_k = nbor_j-offset_j+offset_k; + if (nbor_k<=nbor_j) nbor_k += n_stride; + k_end = nbor_end; + } - for ( ; nbor_k param_r0sq_ij) continue; + if (rsq1 > param_r0sq_ij) continue; // still keep this for neigh no and tpa > 1 param_gamma_ij=param4_ijparam.y; param_r0_ij = param4_ijparam.w; @@ -551,8 +637,15 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_, nbor_k+=offset_k; } + // recalculate numk and k_end for the use of short neighbor list + if (dev_packed==dev_nbor) { + numk = dev_short_nbor[nbor_k]; + nbor_k += n_stride; + k_end = nbor_k+fast_mul(numk,n_stride); + } + for ( ; nbor_k param_r0sq_ij) continue; + if (rsq1 > param_r0sq_ij) continue; // still keep this for neigh no and tpa > 1 param_gamma_ij=param4_ijparam.y; param_r0_ij=param4_ijparam.w; @@ -690,8 +792,15 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_, nbor_k+=offset_k; } + // recalculate numk and k_end for the use of short neighbor list + if (dev_packed==dev_nbor) { + numk = dev_short_nbor[nbor_k]; + nbor_k += n_stride; + k_end = nbor_k+fast_mul(numk,n_stride); + } + for ( ; nbor_k { UCL_D_Vec elem2param; UCL_D_Vec map; int _nparams,_nelements; + numtyp _cutshortsq; UCL_Texture param1_tex, param2_tex, param3_tex, param4_tex, param5_tex; diff --git a/lib/kim/Install.py b/lib/kim/Install.py index 21ea859852..aa244ee6ea 100644 --- a/lib/kim/Install.py +++ b/lib/kim/Install.py @@ -6,6 +6,8 
@@ from __future__ import print_function import sys,os,re,subprocess +# help message + help = """ Syntax from src dir: make lib-kim args="-b -v version -a kim-name" or: make lib-kim args="-b -a everything" @@ -23,7 +25,7 @@ specify one or more options, order does not matter -b = download and build base KIM API library with example Models this will delete any previous installation in the current folder -n = do NOT download and build base KIM API library. - Use an existing installation + Use an existing installation -p = specify location of KIM API installation (implies -n) -a = add single KIM model or model driver with kim-name to existing KIM API lib (see example below). @@ -78,13 +80,27 @@ def which(program): return None def geturl(url,fname): + success = False + if which('curl') != None: cmd = 'curl -L -o "%s" %s' % (fname,url) - elif which('wget') != None: + try: + subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) + success = True + except subprocess.CalledProcessError as e: + print("Calling curl failed with: %s" % e.output.decode('UTF-8')) + + if not success and which('wget') != None: cmd = 'wget -O "%s" %s' % (fname,url) - else: error("cannot find 'wget' or 'curl' to download source code") - txt = subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) - return txt + try: + subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) + success = True + except subprocess.CalledProcessError as e: + print("Calling wget failed with: %s" % e.output.decode('UTF-8')) + + if not success: + error("Failed to download source code with 'curl' or 'wget'") + return # parse args diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md index acb54ff22f..3fe9e46111 100644 --- a/lib/kokkos/CHANGELOG.md +++ b/lib/kokkos/CHANGELOG.md @@ -1,5 +1,46 @@ # Change Log +## [2.03.13](https://github.com/kokkos/kokkos/tree/2.03.13) (2017-07-27) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.05...2.03.13) + +**Implemented enhancements:** + +- Disallow enabling both OpenMP and Threads in the same executable [\#406](https://github.com/kokkos/kokkos/issues/406) +- Make Kokkos::OpenMP respect OMP environment even if hwloc is available [\#630](https://github.com/kokkos/kokkos/issues/630) +- Improve Atomics Performance on KNL/Broadwell where PREFETCHW/RFO is Available [\#898](https://github.com/kokkos/kokkos/issues/898) +- Kokkos::resize should test whether dimensions have changed before resizing [\#904](https://github.com/kokkos/kokkos/issues/904) +- Develop performance-regression/acceptance tests [\#737](https://github.com/kokkos/kokkos/issues/737) +- Make the deep\_copy Profiling hook a start/end system [\#890](https://github.com/kokkos/kokkos/issues/890) +- Add deep\_copy Profiling hook [\#843](https://github.com/kokkos/kokkos/issues/843) +- Append tag name to parallel construct name for Profiling [\#842](https://github.com/kokkos/kokkos/issues/842) +- Add view label to `View bounds error` message for CUDA backend [\#870](https://github.com/kokkos/kokkos/issues/870) +- Disable printing the loaded profiling library [\#824](https://github.com/kokkos/kokkos/issues/824) +- "Declared but never referenced" warnings [\#853](https://github.com/kokkos/kokkos/issues/853) +- Warnings about lock\_address\_cuda\_space [\#852](https://github.com/kokkos/kokkos/issues/852) +- WorkGraph execution policy [\#771](https://github.com/kokkos/kokkos/issues/771) +- Simplify makefiles by guarding compilation with appropriate KOKKOS\_ENABLE\_\#\#\# macros 
[\#716](https://github.com/kokkos/kokkos/issues/716) +- Cmake build: wrong include install directory [\#668](https://github.com/kokkos/kokkos/issues/668) +- Derived View type and allocation [\#566](https://github.com/kokkos/kokkos/issues/566) +- Fix Compiler warnings when compiling core unit tests for Cuda [\#214](https://github.com/kokkos/kokkos/issues/214) + +**Fixed bugs:** + +- Out-of-bounds read in Kokkos\_Layout.hpp [\#975](https://github.com/kokkos/kokkos/issues/975) +- CudaClang: Fix failing test with Clang 4.0 [\#941](https://github.com/kokkos/kokkos/issues/941) +- Respawn when memory pool allocation fails \(not available memory\) [\#940](https://github.com/kokkos/kokkos/issues/940) +- Memory pool aborts on zero allocation request, returns NULL for \< minimum [\#939](https://github.com/kokkos/kokkos/issues/939) +- Error with TaskScheduler query of underlying memory pool [\#917](https://github.com/kokkos/kokkos/issues/917) +- Profiling::\*Callee static variables declared in header [\#863](https://github.com/kokkos/kokkos/issues/863) +- calling \*Space::name\(\) causes compile error [\#862](https://github.com/kokkos/kokkos/issues/862) +- bug in Profiling::deallocateData [\#860](https://github.com/kokkos/kokkos/issues/860) +- task\_depend test failing, CUDA 8.0 + Pascal + RDC [\#829](https://github.com/kokkos/kokkos/issues/829) +- \[develop branch\] Standalone cmake issues [\#826](https://github.com/kokkos/kokkos/issues/826) +- Kokkos CUDA failes to compile with OMPI\_CXX and MPICH\_CXX wrappers [\#776](https://github.com/kokkos/kokkos/issues/776) +- Task Team reduction on Pascal [\#767](https://github.com/kokkos/kokkos/issues/767) +- CUDA stack overflow with TaskDAG test [\#758](https://github.com/kokkos/kokkos/issues/758) +- TeamVector test on Cuda [\#670](https://github.com/kokkos/kokkos/issues/670) +- Clang 4.0 Cuda Build broken again [\#560](https://github.com/kokkos/kokkos/issues/560) + ## [2.03.05](https://github.com/kokkos/kokkos/tree/2.03.05) (2017-05-27) [Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.00...2.03.05) diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos index 24cd772e00..d2967cf9a3 100644 --- a/lib/kokkos/Makefile.kokkos +++ b/lib/kokkos/Makefile.kokkos @@ -33,6 +33,7 @@ KOKKOS_INTERNAL_USE_LIBRT := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "lib KOKKOS_INTERNAL_USE_MEMKIND := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "experimental_memkind" | wc -l)) # Check for advanced settings. 
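# Editor's note (illustrative usage, not taken from the patch itself): these
# "advanced settings" are plain grep tests against the KOKKOS_OPTIONS
# variable, so the new compiler-warnings support added below is switched on
# with something like
#   make KOKKOS_OPTIONS="compiler_warnings"
# which routes the per-compiler warning flags into KOKKOS_CXXFLAGS.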
+KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "compiler_warnings" | wc -l)) KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "aggressive_vectorization" | wc -l)) KOKKOS_INTERNAL_DISABLE_PROFILING := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "disable_profiling" | wc -l)) KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "disable_dualview_modify_check" | wc -l)) @@ -78,14 +79,14 @@ KOKKOS_INTERNAL_COMPILER_PGI := $(strip $(shell $(CXX) --version 2 KOKKOS_INTERNAL_COMPILER_XL := $(strip $(shell $(CXX) -qversion 2>&1 | grep XL | wc -l)) KOKKOS_INTERNAL_COMPILER_CRAY := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep "CC-" | wc -l)) KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(CXX) --version 2>&1 | grep nvcc | wc -l)) -KOKKOS_INTERNAL_COMPILER_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep clang | wc -l)) -KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep "apple-darwin" | wc -l)) ifneq ($(OMPI_CXX),) - KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(OMPI_CXX) --version 2>&1 | grep "nvcc" | wc -l)) + KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(OMPI_CXX) --version 2>&1 | grep nvcc | wc -l)) endif ifneq ($(MPICH_CXX),) - KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(MPICH_CXX) --version 2>&1 | grep "nvcc" | wc -l)) + KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(MPICH_CXX) --version 2>&1 | grep nvcc | wc -l)) endif +KOKKOS_INTERNAL_COMPILER_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep clang | wc -l)) +KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep "apple-darwin" | wc -l)) ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 2) KOKKOS_INTERNAL_COMPILER_CLANG = 1 @@ -111,6 +112,36 @@ ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) endif endif +# Set compiler warnings flags. +ifeq ($(KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS), 1) + ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) + # TODO check if PGI accepts GNU style warnings + KOKKOS_INTERNAL_COMPILER_WARNINGS = + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_APPLE_CLANG), 1) + KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) + KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) + # TODO check if cray accepts GNU style warnings + KOKKOS_INTERNAL_COMPILER_WARNINGS = + else + #gcc + KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized + endif + endif + endif + endif + endif +else + KOKKOS_INTERNAL_COMPILER_WARNINGS = +endif + # Set OpenMP flags. ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) KOKKOS_INTERNAL_OPENMP_FLAG := -mp @@ -162,6 +193,7 @@ endif # Intel based. 
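# Editor's note: WSM (Westmere) is the newly recognized entry in the
# architecture list below; unlike the AVX-class architectures it maps to
# plain SSE4.2 code generation (-msse4.2, -xSSE4.2, or -tp=nehalem,
# depending on the compiler).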
KOKKOS_INTERNAL_USE_ARCH_KNC := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNC | wc -l)) +KOKKOS_INTERNAL_USE_ARCH_WSM := $(strip $(shell echo $(KOKKOS_ARCH) | grep WSM | wc -l)) KOKKOS_INTERNAL_USE_ARCH_SNB := $(strip $(shell echo $(KOKKOS_ARCH) | grep SNB | wc -l)) KOKKOS_INTERNAL_USE_ARCH_HSW := $(strip $(shell echo $(KOKKOS_ARCH) | grep HSW | wc -l)) KOKKOS_INTERNAL_USE_ARCH_BDW := $(strip $(shell echo $(KOKKOS_ARCH) | grep BDW | wc -l)) @@ -229,13 +261,14 @@ KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_ KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(strip $(shell echo $(KOKKOS_ARCH) | grep AMDAVX | wc -l)) # Any AVX? +KOKKOS_INTERNAL_USE_ARCH_SSE42 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_WSM) | bc )) KOKKOS_INTERNAL_USE_ARCH_AVX := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc )) KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW) | bc )) KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc )) KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc )) # Decide what ISA level we are able to support. -KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc )) +KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_WSM)+$(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc )) KOKKOS_INTERNAL_USE_ISA_KNC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNC) | bc )) KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc )) @@ -243,7 +276,7 @@ KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ KOKKOS_INTERNAL_USE_TM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc )) # Incompatible flags? 
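# Editor's note: the new SSE42 count is also added to the MULTIHOST sum
# below, so naming Westmere together with any other host architecture in
# KOKKOS_ARCH trips the same multiple-host-architecture check as before.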
-KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1" | bc ))
+KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1" | bc ))
 KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1" | bc))

 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1)
@@ -257,12 +290,10 @@ endif

 KOKKOS_CPPFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src

-# No warnings:
 KOKKOS_CXXFLAGS =
-# INTEL and CLANG warnings:
-#KOKKOS_CXXFLAGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
-# GCC warnings:
-#KOKKOS_CXXFLAGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized -Wignored-qualifiers -Wempty-body -Wclobbered
+ifeq ($(KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS), 1)
+  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_WARNINGS)
+endif

 KOKKOS_LIBS = -lkokkos -ldl
 KOKKOS_LDFLAGS = -L$(shell pwd)
@@ -486,6 +517,28 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1)
   endif
 endif

+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SSE42), 1)
+  tmp := $(shell echo "\#define KOKKOS_ARCH_SSE42 1" >> KokkosCore_config.tmp )
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
+    KOKKOS_CXXFLAGS += -xSSE4.2
+    KOKKOS_LDFLAGS += -xSSE4.2
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+
+    else
+      ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+        KOKKOS_CXXFLAGS += -tp=nehalem
+        KOKKOS_LDFLAGS += -tp=nehalem
+      else
+        # Assume that this is really a GNU compiler.
+ KOKKOS_CXXFLAGS += -msse4.2 + KOKKOS_LDFLAGS += -msse4.2 + endif + endif + endif +endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1) tmp := $(shell echo "\#define KOKKOS_ARCH_AVX 1" >> KokkosCore_config.tmp ) @@ -689,7 +742,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) endif endif -KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h) +KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h 2>&1) ifeq ($(KOKKOS_INTERNAL_LS_CONFIG), KokkosCore_config.h) KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep define | wc -l)) else diff --git a/lib/kokkos/Makefile.targets b/lib/kokkos/Makefile.targets index 3cb52a04cd..a9341a907c 100644 --- a/lib/kokkos/Makefile.targets +++ b/lib/kokkos/Makefile.targets @@ -20,8 +20,10 @@ Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Ta $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp Kokkos_HostThreadTeam.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp -Kokkos_spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp +Kokkos_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp +Kokkos_Rendezvous.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Rendezvous.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Rendezvous.cpp Kokkos_Profiling_Interface.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp Kokkos_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp @@ -36,6 +38,8 @@ Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cu $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp +Kokkos_Cuda_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp endif ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) diff --git a/lib/kokkos/algorithms/unit_tests/TestOpenMP.cpp b/lib/kokkos/algorithms/unit_tests/TestOpenMP.cpp index 1e7ee68549..c2c118ce1a 100644 --- a/lib/kokkos/algorithms/unit_tests/TestOpenMP.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestOpenMP.cpp @@ -61,14 +61,19 @@ protected: { std::cout << std::setprecision(5) << std::scientific; - unsigned threads_count = omp_get_max_threads(); + int threads_count = 0; + #pragma omp parallel + { + #pragma omp atomic + ++threads_count; + } - if ( Kokkos::hwloc::available() ) { - threads_count = Kokkos::hwloc::get_available_numa_count() * - Kokkos::hwloc::get_available_cores_per_numa(); + if (threads_count > 3) { + threads_count /= 2; } Kokkos::OpenMP::initialize( threads_count ); + 
Kokkos::OpenMP::print_configuration( std::cout ); } static void TearDownTestCase() diff --git a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp index 9cf02f74b4..2771f1793d 100644 --- a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp @@ -1,12 +1,12 @@ //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -35,7 +35,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER @@ -283,12 +283,12 @@ struct test_random_scalar { RandomGenerator& pool, unsigned int num_draws) { - using std::cerr; + using std::cout; using std::endl; using Kokkos::parallel_reduce; { - cerr << " -- Testing randomness properties" << endl; + cout << " -- Testing randomness properties" << endl; RandomProperties result; typedef test_random_functor functor_type; @@ -307,7 +307,7 @@ struct test_random_scalar { ( 1.5*tolerance > variance_eps)) ? 1:0; pass_covar = ((-2.0*tolerance < covariance_eps) && ( 2.0*tolerance > covariance_eps)) ? 1:0; - cerr << "Pass: " << pass_mean + cout << "Pass: " << pass_mean << " " << pass_var << " " << mean_eps << " " << variance_eps @@ -315,7 +315,7 @@ struct test_random_scalar { << " || " << tolerance << endl; } { - cerr << " -- Testing 1-D histogram" << endl; + cout << " -- Testing 1-D histogram" << endl; RandomProperties result; typedef test_histogram1d_functor functor_type; @@ -335,7 +335,7 @@ struct test_random_scalar { pass_hist1d_covar = ((-0.06 < covariance_eps) && ( 0.06 > covariance_eps)) ? 1:0; - cerr << "Density 1D: " << mean_eps + cout << "Density 1D: " << mean_eps << " " << variance_eps << " " << (result.covariance/HIST_DIM1D/HIST_DIM1D) << " || " << tolerance @@ -348,7 +348,7 @@ struct test_random_scalar { << endl; } { - cerr << " -- Testing 3-D histogram" << endl; + cout << " -- Testing 3-D histogram" << endl; RandomProperties result; typedef test_histogram3d_functor functor_type; @@ -368,7 +368,7 @@ struct test_random_scalar { pass_hist3d_covar = ((-tolerance < covariance_eps) && ( tolerance > covariance_eps)) ? 
1:0; - cerr << "Density 3D: " << mean_eps + cout << "Density 3D: " << mean_eps << " " << variance_eps << " " << result.covariance/HIST_DIM1D/HIST_DIM1D << " || " << tolerance @@ -381,18 +381,18 @@ struct test_random_scalar { template void test_random(unsigned int num_draws) { - using std::cerr; + using std::cout; using std::endl; typename test_random_functor::type_1d density_1d("D1d"); typename test_random_functor::type_3d density_3d("D3d"); uint64_t ticks = std::chrono::high_resolution_clock::now().time_since_epoch().count(); - cerr << "Test Seed:" << ticks << endl; + cout << "Test Seed:" << ticks << endl; RandomGenerator pool(ticks); - cerr << "Test Scalar=int" << endl; + cout << "Test Scalar=int" << endl; test_random_scalar test_int(density_1d,density_3d,pool,num_draws); ASSERT_EQ( test_int.pass_mean,1); ASSERT_EQ( test_int.pass_var,1); @@ -406,7 +406,7 @@ void test_random(unsigned int num_draws) deep_copy(density_1d,0); deep_copy(density_3d,0); - cerr << "Test Scalar=unsigned int" << endl; + cout << "Test Scalar=unsigned int" << endl; test_random_scalar test_uint(density_1d,density_3d,pool,num_draws); ASSERT_EQ( test_uint.pass_mean,1); ASSERT_EQ( test_uint.pass_var,1); @@ -420,7 +420,7 @@ void test_random(unsigned int num_draws) deep_copy(density_1d,0); deep_copy(density_3d,0); - cerr << "Test Scalar=int64_t" << endl; + cout << "Test Scalar=int64_t" << endl; test_random_scalar test_int64(density_1d,density_3d,pool,num_draws); ASSERT_EQ( test_int64.pass_mean,1); ASSERT_EQ( test_int64.pass_var,1); @@ -434,7 +434,7 @@ void test_random(unsigned int num_draws) deep_copy(density_1d,0); deep_copy(density_3d,0); - cerr << "Test Scalar=uint64_t" << endl; + cout << "Test Scalar=uint64_t" << endl; test_random_scalar test_uint64(density_1d,density_3d,pool,num_draws); ASSERT_EQ( test_uint64.pass_mean,1); ASSERT_EQ( test_uint64.pass_var,1); @@ -448,7 +448,7 @@ void test_random(unsigned int num_draws) deep_copy(density_1d,0); deep_copy(density_3d,0); - cerr << "Test Scalar=float" << endl; + cout << "Test Scalar=float" << endl; test_random_scalar test_float(density_1d,density_3d,pool,num_draws); ASSERT_EQ( test_float.pass_mean,1); ASSERT_EQ( test_float.pass_var,1); @@ -462,7 +462,7 @@ void test_random(unsigned int num_draws) deep_copy(density_1d,0); deep_copy(density_3d,0); - cerr << "Test Scalar=double" << endl; + cout << "Test Scalar=double" << endl; test_random_scalar test_double(density_1d,density_3d,pool,num_draws); ASSERT_EQ( test_double.pass_mean,1); ASSERT_EQ( test_double.pass_var,1); diff --git a/lib/kokkos/algorithms/unit_tests/UnitTestMain.cpp b/lib/kokkos/algorithms/unit_tests/UnitTestMain.cpp index f952ab3db5..9e75b580bc 100644 --- a/lib/kokkos/algorithms/unit_tests/UnitTestMain.cpp +++ b/lib/kokkos/algorithms/unit_tests/UnitTestMain.cpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ diff --git a/lib/kokkos/benchmarks/bytes_and_flops/main.cpp b/lib/kokkos/benchmarks/bytes_and_flops/main.cpp index f545247212..8db5ce0eb5 100644 --- a/lib/kokkos/benchmarks/bytes_and_flops/main.cpp +++ b/lib/kokkos/benchmarks/bytes_and_flops/main.cpp @@ -44,12 +44,13 @@ #include #include #include +#include int main(int argc, char* argv[]) { Kokkos::initialize(); - - if(argc<10) { + + if(argc<10) { printf("Arguments: N K R D U F T S\n"); printf(" P: Precision (1==float, 2==double)\n"); printf(" N,K: dimensions of the 2D array to allocate\n"); @@ -68,7 +69,7 @@ int main(int argc, char* argv[]) { Kokkos::finalize(); return 0; } - + int P = atoi(argv[1]); int N = atoi(argv[2]); @@ -80,7 +81,7 @@ int main(int argc, char* argv[]) { int T = atoi(argv[8]); int S = atoi(argv[9]); - if(U>8) {printf("U must be 1-8\n"); return 0;} + if(U>8) {printf("U must be 1-8\n"); return 0;} if( (D!=1) && (D!=2) && (D!=4) && (D!=8) && (D!=16) && (D!=32)) {printf("D must be one of 1,2,4,8,16,32\n"); return 0;} if( (P!=1) && (P!=2) ) {printf("P must be one of 1,2\n"); return 0;} diff --git a/lib/kokkos/benchmarks/gather/main.cpp b/lib/kokkos/benchmarks/gather/main.cpp index 161c6f2091..88eb0493c1 100644 --- a/lib/kokkos/benchmarks/gather/main.cpp +++ b/lib/kokkos/benchmarks/gather/main.cpp @@ -44,11 +44,11 @@ #include #include #include +#include int main(int argc, char* argv[]) { Kokkos::initialize(argc,argv); - if(argc<8) { printf("Arguments: S N K D\n"); printf(" S: Scalar Type Size (1==float, 2==double, 4=complex)\n"); diff --git a/lib/kokkos/benchmarks/policy_performance/Makefile b/lib/kokkos/benchmarks/policy_performance/Makefile new file mode 100644 index 0000000000..13aef3209c --- /dev/null +++ b/lib/kokkos/benchmarks/policy_performance/Makefile @@ -0,0 +1,44 @@ +KOKKOS_PATH = ../.. +SRC = $(wildcard *.cpp) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +CXXFLAGS = -O3 -g +LINK = ${CXX} +LINKFLAGS = +EXE = policy_performance.cuda +KOKKOS_DEVICES = "Cuda,OpenMP" +KOKKOS_ARCH = "SNB,Kepler35" +KOKKOS_CUDA_OPTIONS+=enable_lambda +else +CXX = g++ +CXXFLAGS = -O3 -g -Wall -Werror +LINK = ${CXX} +LINKFLAGS = +EXE = policy_performance.host +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +endif + +DEPFLAGS = -M + +OBJ = $(SRC:.cpp=.o) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) main.cpp policy_perf_test.hpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< diff --git a/lib/kokkos/benchmarks/policy_performance/main.cpp b/lib/kokkos/benchmarks/policy_performance/main.cpp new file mode 100644 index 0000000000..b0ed9bb512 --- /dev/null +++ b/lib/kokkos/benchmarks/policy_performance/main.cpp @@ -0,0 +1,170 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include "policy_perf_test.hpp" + +int main(int argc, char* argv[] ) { + Kokkos::initialize(argc,argv); + + if(argc<10) { + printf(" Ten arguments are needed to run this program:\n"); + printf(" (1)team_range, (2)thread_range, (3)vector_range, (4)outer_repeat, (5)thread_repeat, (6)vector_repeat, (7)team_size, (8)vector_size, (9)schedule, (10)test_type\n"); + printf(" team_range: number of teams (league_size)\n"); + printf(" thread_range: range for nested TeamThreadRange parallel_*\n"); + printf(" vector_range: range for nested ThreadVectorRange parallel_*\n"); + printf(" outer_repeat: number of repeats for outer parallel_* call\n"); + printf(" thread_repeat: number of repeats for TeamThreadRange parallel_* call\n"); + printf(" vector_repeat: number of repeats for ThreadVectorRange parallel_* call\n"); + printf(" team_size: number of team members (team_size)\n"); + printf(" vector_size: desired vectorization (if possible)\n"); + printf(" schedule: 1 == Static 2 == Dynamic\n"); + printf(" test_type: 3-digit code XYZ for testing (nested) parallel_*\n"); + printf(" code key: XYZ X in {1,2,3,4,5}, Y in {0,1,2}, Z in {0,1,2}\n"); + printf(" TeamPolicy:\n"); + printf(" X: 0 = none (never used, makes no sense); 1 = parallel_for; 2 = parallel_reduce\n"); + printf(" Y: 0 = none; 1 = parallel_for; 2 = parallel_reduce\n"); + printf(" Z: 0 = none; 1 = parallel_for; 2 = parallel_reduce\n"); + printf(" RangePolicy:\n"); + printf(" X: 3 = parallel_for; 4 = parallel_reduce; 5 = parallel_scan\n"); + printf(" Y: 0 = none\n"); + printf(" Z: 0 = none\n"); + printf(" Example Input:\n"); + printf(" 100000 32 32 100 100 100 8 1 1 100\n"); + Kokkos::finalize(); + return 0; + } + + int team_range = atoi(argv[1]); + int thread_range = atoi(argv[2]); + int vector_range = atoi(argv[3]); + + int outer_repeat = atoi(argv[4]); + int thread_repeat = atoi(argv[5]); + int 
vector_repeat = atoi(argv[6]); + + int team_size = atoi(argv[7]); + int vector_size = atoi(argv[8]); + int schedule = atoi(argv[9]); + int test_type = atoi(argv[10]); + + int disable_verbose_output = 0; + if ( argc > 11 ) { + disable_verbose_output = atoi(argv[11]); + } + + if ( schedule != 1 && schedule != 2 ) { + printf("schedule: %d\n", schedule); + printf("Options for schedule are: 1 == Static 2 == Dynamic\n"); + Kokkos::finalize(); + return -1; + } + + if ( test_type != 100 && test_type != 110 && test_type != 111 && test_type != 112 && test_type != 120 && test_type != 121 && test_type != 122 + && test_type != 200 && test_type != 210 && test_type != 211 && test_type != 212 && test_type != 220 && test_type != 221 && test_type != 222 + && test_type != 300 && test_type != 400 && test_type != 500 + ) + { + printf("Incorrect test_type option\n"); + Kokkos::finalize(); + return -2; + } + + double result = 0.0; + + Kokkos::parallel_reduce( "parallel_reduce warmup", Kokkos::TeamPolicy<>(10,1), + KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type team, double& lval) { + lval += 1; + }, result); + + typedef Kokkos::View view_type_1d; + typedef Kokkos::View view_type_2d; + typedef Kokkos::View view_type_3d; + + // Allocate view without initializing + // Call a 'warmup' test with 1 repeat - this will initialize the corresponding view appropriately for test and should obey first-touch etc + // Second call to test is the one we actually care about and time + view_type_1d v_1( Kokkos::ViewAllocateWithoutInitializing("v_1"), team_range*team_size); + view_type_2d v_2( Kokkos::ViewAllocateWithoutInitializing("v_2"), team_range*team_size, thread_range); + view_type_3d v_3( Kokkos::ViewAllocateWithoutInitializing("v_3"), team_range*team_size, thread_range, vector_range); + + double result_computed = 0.0; + double result_expect = 0.0; + double time = 0.0; + + if(schedule==1) { + if ( test_type != 500 ) { + // warmup - no repeat of loops + test_policy,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time); + test_policy,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time); + } + else { + // parallel_scan: initialize 1d view for parallel_scan + test_policy,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,100,v_1,v_2,v_3,result_computed,result_expect,time); + test_policy,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time); + } + } + if(schedule==2) { + if ( test_type != 500 ) { + // warmup - no repeat of loops + test_policy,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time); + test_policy,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time); + } + else { + // parallel_scan: initialize 1d view for parallel_scan + test_policy,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,100,v_1,v_2,v_3,result_computed,result_expect,time); + test_policy,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time); + } + } + + if ( disable_verbose_output == 0 ) { + printf("%7i %4i %2i %9i %4i %4i %4i %2i %1i 
%3i %e %e %lf\n",team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,schedule,test_type,result_computed,result_expect,time); + } + else { + printf("%lf\n",time); + } + + Kokkos::finalize(); + + return 0; +} diff --git a/lib/kokkos/benchmarks/policy_performance/policy_perf_test.hpp b/lib/kokkos/benchmarks/policy_performance/policy_perf_test.hpp new file mode 100644 index 0000000000..8c79f3b88d --- /dev/null +++ b/lib/kokkos/benchmarks/policy_performance/policy_perf_test.hpp @@ -0,0 +1,354 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include + +template < class ViewType > +struct ParallelScanFunctor { + using value_type = double; + ViewType v; + + ParallelScanFunctor( const ViewType & v_ ) + : v(v_) + {} + + KOKKOS_INLINE_FUNCTION + void operator()( const int idx, value_type& val, const bool& final ) const + { + // inclusive scan + val += v(idx); + if ( final ) { + v(idx) = val; + } + } +}; + +template +void test_policy(int team_range, int thread_range, int vector_range, + int outer_repeat, int thread_repeat, int inner_repeat, + int team_size, int vector_size, int test_type, + ViewType1 &v1, ViewType2 &v2, ViewType3 &v3, + double &result, double &result_expect, double &time) { + + typedef Kokkos::TeamPolicy t_policy; + typedef typename t_policy::member_type t_team; + Kokkos::Timer timer; + + for(int orep = 0; orep(v1) +#if 0 + // This does not compile with pre Cuda 8.0 - see Github Issue #913 for explanation + KOKKOS_LAMBDA (const int idx, double& val, const bool& final) { + // inclusive scan + val += v1(idx); + if ( final ) { + v1(idx) = val; + } + } +#endif + ); + // result = v1( team_size*team_range - 1 ); // won't work with Cuda - need to copy result back to host to print + // result_expect = 0.5*(team_size*team_range)*(team_size*team_range-1); + } + + } // end outer for loop + + time = timer.seconds(); +} //end test_policy diff --git a/lib/kokkos/benchmarks/policy_performance/script_basic_testing.sh b/lib/kokkos/benchmarks/policy_performance/script_basic_testing.sh new file mode 100755 index 0000000000..e621fffbd4 --- /dev/null +++ b/lib/kokkos/benchmarks/policy_performance/script_basic_testing.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# Script to check policy_perf_test code works with each possible combo of options + +echo "Performance test results for parallel_reduce code computing sum of sequence [0,N) with various (nested) policies" + +EXECUTABLE=policy_performance + +TEAMRANGE=1000 +THREADRANGE=4 +VECTORRANGE=32 +TEAMSIZE=4 +VECTORSIZE=1 +OREPEAT=1 +MREPEAT=1 +IREPEAT=1 +SCHEDULE=1 + +SUFFIX=host +if [ -e $EXECUTABLE.$SUFFIX ] +then +SCHEDULE=1 +echo "Host tests Static schedule" +for CODE in {100,110,111,112,120,121,122,200,210,211,212,220,221,222,300,400,500} +do + OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE +done + +SCHEDULE=2 +echo "Host tests Dynamic schedule" +for CODE in {100,110,111,112,120,121,122,200,210,211,212,220,221,222,300,400,500} +do + OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE +done +fi + +SUFFIX=cuda +if [ -e $EXECUTABLE.$SUFFIX ] +then +SCHEDULE=1 +echo "Cuda tests Static schedule" +for CODE in {100,110,111,112,120,121,122,200,210,211,212,220,221,222,300,400,500} +do + ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE +done + +SCHEDULE=2 +echo "Cuda tests Dynamic schedule" +for CODE in {100,110,111,112,120,121,122,200,210,211,212,220,221,222,300,400,500} +do + ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE +done +fi diff --git a/lib/kokkos/benchmarks/policy_performance/script_sample_usage.sh b/lib/kokkos/benchmarks/policy_performance/script_sample_usage.sh new file mode 100755 index 0000000000..f4bfb87f8f --- /dev/null 
+++ b/lib/kokkos/benchmarks/policy_performance/script_sample_usage.sh @@ -0,0 +1,126 @@ +#!/bin/bash + +# Sample script for benchmarking policy performance + +# Suggested enviroment variables to export prior to executing script: +# KNL: +# OMP_NUM_THREADS=256 KMP_AFFINITY=compact +# Power: +# OMP_NUM_THREADS=64 OMP_PROC_BIND=true + +# Constants and Variables: +# Vary: TEAMSIZE, and THREADRANGE +# for TEAMSIZE in {1,2,4,5,8}; do +# for THREADRANGE in {32,41,1000}; do +# Fixed: TEAMRANGE, VECTORRANGE, VECTORSIZE +# System specific: Adjust REPEAT values to architecture tests are run on + +# Tests +# Static SCHEDULE = 1 +# Tier 1: parallel_for + RangePolicy 300 +# Tier 2: parallel_reduce, parallel_scan + RangePolicy 400 500 +# Tier 3: 'outer' parallel_for with TeamPolicy (nested parallelism) 1XY +# Tier 4: 'outer' parallel_reduce with TeamPolicy (nested parallelism) 2XY +# Dynamic SCHEDULE = 2 +# Tier 5: parallel_for + RangePolicy 300 +# Tier 6: parallel_reduce, parallel_scan + RangePolicy 400 500 +# Tier 7: 'outer' parallel_for with TeamPolicy (nested parallelism) 1XY +# Tier 8: 'outer' parallel_reduce with TeamPolicy (nested parallelism) 2XY + +# Results grouped by: +# 0) SCHEDULE 1) CODE (test) 2) TEAMRANGE 3) TEAMSIZE 4) THREADRANGE + +EXECUTABLE=policy_performance + +# Default defined values +TEAMRANGE=1000 +THREADRANGE=1 +VECTORRANGE=32 +TEAMSIZE=1 +VECTORSIZE=1 +OREPEAT=1 +MREPEAT=1 +IREPEAT=1 +SCHEDULE=1 + +# Host tests +SUFFIX=host +if [ -e $EXECUTABLE.$SUFFIX ]; then +echo "Host" + +for SCHEDULE in {1,2}; do + +# Tier 1 and 2, 5 and 6 +for CODE in {300,400,500}; do + for TEAMSIZE in {1,2,4,5,8}; do + OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE + done +done + +# Tier 3, 7 +for CODE in {100,110,111,112,120,121,122}; do + for TEAMSIZE in {1,2,4,5,8}; do + for THREADRANGE in {32,41,1000}; do + OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE + done + done +done + +# Tier 4, 8 +for CODE in {200,210,211,212,220,221,222}; do + for TEAMSIZE in {1,2,4,5,8}; do + for THREADRANGE in {32,41,1000}; do + OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE + done + done +done + +done # end SCHEDULE + +fi # end host + + +# Cuda tests +SUFFIX=cuda +# TEAMRANGE=10000, TEAMSIZE=8 too large +# TEAMRANGE=10000, TEAMSIZE=8, THREADRANGE=1000 too large +if [ -e $EXECUTABLE.$SUFFIX ]; then +echo "Cuda" + +for SCHEDULE in {1,2}; do + +# Reset defaults +TEAMRANGE=1000 +THREADRANGE=1 +VECTORRANGE=32 +TEAMSIZE=1 +VECTORSIZE=1 + +# Tier 1 and 2, 5 and 6 +for CODE in {300,400,500}; do + for TEAMSIZE in {1,2,4,5,8}; do + ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE + done +done + +# Tier 3, 7 +for CODE in {100,110,111,112,120,121,122}; do + for TEAMSIZE in {1,2,4,5,8}; do + for THREADRANGE in {32,41,1000}; do + ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE + done + done +done + +# Tier 4, 8 +for CODE in {200,210,211,212,220,221,222}; do + for TEAMSIZE in {1,2,4,5,8}; do + for THREADRANGE in {32,41,1000}; do + ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE + done + done +done + +done # end SCHEDULE + +fi 
#end cuda diff --git a/lib/kokkos/bin/hpcbind b/lib/kokkos/bin/hpcbind new file mode 100755 index 0000000000..ca34648780 --- /dev/null +++ b/lib/kokkos/bin/hpcbind @@ -0,0 +1,454 @@ +#!/usr/bin/env bash + +################################################################################ +# Check if hwloc commands exist +################################################################################ +declare -i HPCBIND_HAS_HWLOC=1 +type hwloc-bind >/dev/null 2>&1 +HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?)) + +type hwloc-distrib >/dev/null 2>&1 +HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?)) + +type hwloc-ls >/dev/null 2>&1 +HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?)) + +type hwloc-calc >/dev/null 2>&1 +HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?)) + +type hwloc-ps >/dev/null 2>&1 +HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?)) + +if [[ ${HPCBIND_HAS_HWLOC} -eq 0 ]]; then + echo "hwloc not found, no process binding will occur" +fi + +# Get parent cpuset +HPCBIND_HWLOC_PARENT_CPUSET="" +if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then + MY_PID="$BASHPID" + HPCBIND_HWLOC_PARENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2) +fi + +################################################################################ +# Check if nvidia-smi exist +################################################################################ +declare -i HPCBIND_HAS_NVIDIA=0 +type nvidia-smi >/dev/null 2>&1 +HPCBIND_HAS_NVIDIA=$((!$?)) + + +################################################################################ +# Get visible gpu +################################################################################ +declare -i NUM_GPUS=0 +HPCBIND_VISIBLE_GPUS="" +if [[ ${HPCBIND_HAS_NVIDIA} -eq 1 ]]; then + NUM_GPUS=$(nvidia-smi -L | wc -l); + GPU_LIST="$( seq 0 $((NUM_GPUS-1)) )" + HPCBIND_VISIBLE_GPUS=${CUDA_VISIBLE_DEVICES:-${GPU_LIST}} +fi + +declare -i HPCBIND_ENABLE_GPU_MAPPING=$((NUM_GPUS > 0)) + + +################################################################################ +# Get queue id +# supports sbatch, bsub, aprun +################################################################################ +HPCBIND_QUEUE_NAME="" +declare -i HPCBIND_QUEUE_INDEX=0 +declare -i HPCBIND_QUEUE_GPU_MAPPING=0 + +if [[ ! -z "${SLURM_LOCAL_ID}" ]]; then + HPCBIND_QUEUE_GPU_MAPPING=1 + HPCBIND_QUEUE_NAME="sbatch" + HPCBIND_QUEUE_INDEX=${SLURM_LOCAL_ID} +elif [[ ! -z "${LBS_JOBINDEX}" ]]; then + HPCBIND_QUEUE_GPU_MAPPING=1 + HPCBIND_QUEUE_NAME="bsub" + HPCBIND_QUEUE_INDEX=${LBS_JOBINDEX} +elif [[ ! -z "${ALPS_APP_PE}" ]]; then + HPCBIND_QUEUE_GPU_MAPPING=1 + HPCBIND_QUEUE_NAME="aprun" + HPCBIND_QUEUE_INDEX=${ALPS_APP_PE} +fi + + +################################################################################ +# Show help +################################################################################ +function show_help { + local cmd=$(basename "$0") + echo "Usage: ${cmd} -- command ..." + echo " Set the process mask, OMP environment variables and CUDA environment" + echo " variables to sane values if possible. Uses hwloc and nvidia-smi if" + echo " available. Will preserve the current process binding, so it is safe" + echo " to use with a queuing system or mpiexec." 
+  echo ""
+  echo "Options:"
+  echo "  --no-hwloc-bind           Disable binding"
+  echo "  --proc-bind=<LOC>         Set the initial process mask for the script"
+  echo "                            LOC can be any valid location argument for"
+  echo "                            hwloc-calc   Default: all"
+  echo "  --distribute=N            Distribute the current cpuset into N partitions"
+  echo "  --distribute-partition=I"
+  echo "                            Use the i'th partition (zero based)"
+  echo "  --visible-gpus=           Comma separated list of gpu ids"
+  echo "                            Default: CUDA_VISIBLE_DEVICES or all gpus in"
+  echo "                            sequential order"
+  echo "  --gpu-ignore-queue        Ignore queue job id when choosing visible GPU"
+  echo "  --no-gpu-mapping          Do not set CUDA_VISIBLE_DEVICES"
+  echo "  --openmp=M.m              Set env variables for the given OpenMP version"
+  echo "                            Default: 4.0"
+  echo "  --openmp-percent=N        Integer percentage of cpuset to use for OpenMP"
+  echo "                            threads   Default: 100"
+  echo "  --openmp-places=<Op>      Op=threads|cores|sockets. Default: threads"
+  echo "  --no-openmp-proc-bind     Set OMP_PROC_BIND to false and unset OMP_PLACES"
+  echo "  --force-openmp-num-threads=N"
+  echo "                            Override logic for selecting OMP_NUM_THREADS"
+  echo "  --force-openmp-proc-bind="
+  echo "                            Override logic for selecting OMP_PROC_BIND"
+  echo "  --no-openmp-nested        Set OMP_NESTED to false"
+  echo "  --show-bindings           Show the bindings"
+  echo "  --lstopo                  Show bindings in lstopo without executing a command"
+  echo "  -v|--verbose              Show options and relevant environment variables"
+  echo "  -h|--help                 Show this message"
+  echo ""
+  echo "Sample Usage:"
+  echo "  Split the current process cpuset into 4 and use the 3rd partition"
+  echo "    ${cmd} --distribute=4 --distribute-partition=2 -v -- command ..."
+  echo "  Bind the process to all even cores"
+  echo "    ${cmd} --proc-bind=core:even -v -- command ..."
+  echo "  Bind to the first 64 cores and split the current process cpuset into 4"
+  echo "    ${cmd} --proc-bind=core:0-63 --distribute=4 --distribute-partition=0 -- command ..."
+  echo "  Skip GPU 0 when mapping visible devices"
+  echo "    ${cmd} --distribute=4 --distribute-partition=0 --visible-gpus=1,2 -v -- command ..."
+  echo "  Display the current bindings"
+  echo "    ${cmd} --proc-bind=numa:0 --show-bindings -- command"
+  echo "  Display the current bindings using lstopo"
+  echo "    ${cmd} --proc-bind=numa:0.core:odd --lstopo"
+  echo ""
+}
+
+
+################################################################################
+# Parse command line arguments
+################################################################################
+# Show help if no command line arguments given
+if [[ "$#" -eq 0 ]]; then
+  show_help
+  exit 0
+fi
+
+declare -a UNKNOWN_ARGS=()
+declare -i HPCBIND_ENABLE_HWLOC_BIND=${HPCBIND_HAS_HWLOC}
+declare -i HPCBIND_DISTRIBUTE=1
+declare -i HPCBIND_PARTITION=0
+HPCBIND_PROC_BIND="all"
+HPCBIND_OPENMP_VERSION=4.0
+declare -i HPCBIND_OPENMP_PERCENT=100
+HPCBIND_OPENMP_PLACES=${OMP_PLACES:-threads}
+declare -i HPCBIND_OPENMP_PROC_BIND=1
+declare -i HPCBIND_OPENMP_FORCE_NUM_THREADS=-1
+HPCBIND_OPENMP_FORCE_PROC_BIND=""
+HPCBIND_OPENMP_NESTED=${OMP_NESTED:-true}
+declare -i HPCBIND_VERBOSE=0
+
+declare -i HPCBIND_SHOW_BINDINGS=0
+declare -i HPCBIND_LSTOPO=0
+
+for i in $@; do
+  case $i in
+    # number of partitions to create
+    --no-hwloc-bind)
+      HPCBIND_ENABLE_HWLOC_BIND=0
+      shift
+      ;;
+    --proc-bind=*)
+      HPCBIND_PROC_BIND="${i#*=}"
+      shift
+      ;;
+    --distribute=*)
+      HPCBIND_DISTRIBUTE="${i#*=}"
+      shift
+      ;;
+    # which partition to use
+    --distribute-partition=*)
+      HPCBIND_PARTITION="${i#*=}"
+      shift
+      ;;
+    --visible-gpus=*)
+      HPCBIND_VISIBLE_GPUS=$(echo "${i#*=}" | tr ',' ' ')
+      shift
+      ;;
+    --gpu-ignore-queue)
+      HPCBIND_QUEUE_GPU_MAPPING=0
+      shift
+      ;;
+    --no-gpu-mapping)
+      HPCBIND_ENABLE_GPU_MAPPING=0
+      shift
+      ;;
+    --openmp=*)
+      HPCBIND_OPENMP_VERSION="${i#*=}"
+      shift
+      ;;
+    --openmp-percent=*)
+      HPCBIND_OPENMP_PERCENT="${i#*=}"
+      shift
+      ;;
+    --openmp-places=*)
+      HPCBIND_OPENMP_PLACES="${i#*=}"
+      shift
+      ;;
+    --no-openmp-proc-bind)
+      HPCBIND_OPENMP_PROC_BIND=0
+      shift
+      ;;
+    --force-openmp-proc-bind=*)
+      HPCBIND_OPENMP_FORCE_PROC_BIND="${i#*=}"
+      shift
+      ;;
+    --force-openmp-num-threads=*)
+      HPCBIND_OPENMP_FORCE_NUM_THREADS="${i#*=}"
+      shift
+      ;;
+    --no-openmp-nested)
+      HPCBIND_OPENMP_NESTED="false"
+      shift
+      ;;
+    --show-bindings)
+      HPCBIND_VERBOSE=1
+      HPCBIND_SHOW_BINDINGS=1
+      shift
+      ;;
+    --lstopo)
+      HPCBIND_VERBOSE=1
+      HPCBIND_SHOW_BINDINGS=0
+      HPCBIND_LSTOPO=1
+      shift
+      ;;
+    -v|--verbose)
+      HPCBIND_VERBOSE=1
+      shift
+      ;;
+    -h|--help)
+      show_help
+      exit 0
+      ;;
+    # ignore remaining arguments
+    --)
+      shift
+      break
+      ;;
+    # unknown option
+    *)
+      UNKNOWN_ARGS+=("$i")
+      shift
+      ;;
+  esac
+done
+
+
+################################################################################
+# Check unknown arguments
+################################################################################
+if [[ ${#UNKNOWN_ARGS[*]} -gt 0 ]]; then
+  echo "Unknown options: ${UNKNOWN_ARGS[*]}"
+  exit 1
+fi
+
+
+################################################################################
+# Check that visible gpus are valid
+################################################################################
+HPCBIND_VISIBLE_GPUS=(${HPCBIND_VISIBLE_GPUS})
+if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then
+  for ((i=0; i < ${#HPCBIND_VISIBLE_GPUS[*]}; i++)); do
+    if [[ ${HPCBIND_VISIBLE_GPUS[$i]} -ge ${NUM_GPUS} ||
+          ${HPCBIND_VISIBLE_GPUS[$i]} -lt 0 ]]; then
+      echo "Invalid GPU ID ${HPCBIND_VISIBLE_GPUS[$i]}, setting to 0"
+      HPCBIND_VISIBLE_GPUS[$i]=0;
+    fi
+  done
+  NUM_GPUS=${#HPCBIND_VISIBLE_GPUS[@]}
+fi
+
+
+################################################################################
+# Check OpenMP percent
+################################################################################ +if [[ ${HPCBIND_OPENMP_PERCENT} -lt 1 ]]; then + echo "OpenMP percent < 1, setting to 1" + HPCBIND_OPENMP_PERCENT=1 +elif [[ ${HPCBIND_OPENMP_PERCENT} -gt 100 ]]; then + echo "OpenMP percent > 100, setting to 100" + HPCBIND_OPENMP_PERCENT=100 +fi + +################################################################################ +# Check distribute +################################################################################ +if [[ ${HPCBIND_DISTRIBUTE} -le 0 ]]; then + echo "Invalid input for distribute, changing distribute to 1" + HPCBIND_DISTRIBUTE=1 +fi + +if [[ ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then + echo "Invalid input for distribute-partition, changing to 0" + HPCBIND_PARTITION=0 +fi + + +################################################################################ +# Find cpuset and num threads +################################################################################ +HPCBIND_HWLOC_CPUSET="" +declare -i HPCBIND_NUM_PUS=0 + +if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then + if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then + BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND}) + else + BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND}) + fi + + CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE})) + HPCBIND_HWLOC_CPUSET=${CPUSETS[${HPCBIND_PARTITION}]} + HPCBIND_NUM_PUS=$(hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu | wc -l) +else + HPCBIND_NUM_PUS=$(cat /proc/cpuinfo | grep -c processor) +fi + +declare -i HPCBIND_OPENMP_NUM_THREADS=$((HPCBIND_NUM_PUS * HPCBIND_OPENMP_PERCENT)) +HPCBIND_OPENMP_NUM_THREADS=$((HPCBIND_OPENMP_NUM_THREADS / 100)) + + +if [[ ${HPCBIND_OPENMP_NUM_THREADS} -lt 1 ]]; then + HPCBIND_OPENMP_NUM_THREADS=1 +elif [[ ${HPCBIND_OPENMP_NUM_THREADS} -gt ${HPCBIND_NUM_PUS} ]]; then + HPCBIND_OPENMP_NUM_THREADS=${HPCBIND_NUM_PUS} +fi + +if [[ ${HPCBIND_OPENMP_FORCE_NUM_THREADS} -gt 0 ]]; then + HPCBIND_OPENMP_NUM_THREADS=${HPCBIND_OPENMP_FORCE_NUM_THREADS} +fi + +################################################################################ +# Set OpenMP environment variables +################################################################################ + +# set OMP_NUM_THREADS +export OMP_NUM_THREADS=${HPCBIND_OPENMP_NUM_THREADS} + +# set OMP_PROC_BIND and OMP_PLACES +if [[ ${HPCBIND_OPENMP_PROC_BIND} -eq 1 ]]; then + if [[ "${HPCBIND_OPENMP_FORCE_PROC_BIND}" == "" ]]; then + #default proc bind logic + if [[ "${HPCBIND_OPENMP_VERSION}" == "4.0" || "${HPCBIND_OPENMP_VERSION}" > "4.0" ]]; then + export OMP_PLACES="${HPCBIND_OPENMP_PLACES}" + export OMP_PROC_BIND="spread" + else + export OMP_PROC_BIND="true" + unset OMP_PLACES + fi + else + #force proc bind + export OMP_PLACES="${HPCBIND_OPENMP_PLACES}" + export OMP_PROC_BIND="${HPCBIND_OPENMP_FORCE_PROC_BIND}" + fi +else + # no openmp proc bind + unset OMP_PLACES + unset OMP_PROC_BIND +fi + +# set OMP_NESTED +export OMP_NESTED=${HPCBIND_OPENMP_NESTED} + + +################################################################################ +# Set CUDA environment variables +################################################################################ + +if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then + if [[ ${HPCBIND_QUEUE_GPU_MAPPING} -eq 0 ]]; then + declare -i GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS)) + export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]} + else + declare -i MY_TASK_ID=$((HPCBIND_QUEUE_INDEX * 
HPCBIND_DISTRIBUTE + HPCBIND_PARTITION)) + declare -i GPU_ID=$((MY_TASK_ID % NUM_GPUS)) + export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]} + fi +fi + +################################################################################ +# Set hpcbind environment variables +################################################################################ +export HPCBIND_HAS_HWLOC=${HPCBIND_HAS_HWLOC} +export HPCBIND_HAS_NVIDIA=${HPCBIND_HAS_NVIDIA} +export HPCBIND_NUM_PUS=${HPCBIND_NUM_PUS} +export HPCBIND_HWLOC_CPUSET=${HPCBIND_HWLOC_CPUSET} +export HPCBIND_HWLOC_DISTRIBUTE=${HPCBIND_DISTRIBUTE} +export HPCBIND_HWLOC_DISTRIBUTE_PARTITION=${HPCBIND_PARTITION} +if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then + export HPCBIND_HWLOC_PARENT_CPUSET="all" +else + export HPCBIND_HWLOC_PARENT_CPUSET=${HPCBIND_HWLOC_PARENT_CPUSET} +fi +export HPCBIND_HWLOC_PROC_BIND=${HPCBIND_PROC_BIND} +export HPCBIND_NVIDIA_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING} +export HPCBIND_NVIDIA_VISIBLE_GPUS=$(echo "${HPCBIND_VISIBLE_GPUS[*]}" | tr ' ' ',') +export HPCBIND_OPENMP_VERSION=${HPCBIND_OPENMP_VERSION} +if [[ "${HPCBIND_QUEUE_NAME}" != "" ]]; then + export HPCBIND_QUEUE_INDEX=${HPCBIND_QUEUE_INDEX} + export HPCBIND_QUEUE_NAME=${HPCBIND_QUEUE_NAME} + export HPCBIND_QUEUE_GPU_MAPPING=${HPCBIND_QUEUE_GPU_MAPPING} +fi + + +################################################################################ +# Print verbose +################################################################################ + +if [[ ${HPCBIND_VERBOSE} -eq 1 ]]; then + MY_ENV=$(env | sort) + echo "[HPCBIND]" + echo "${MY_ENV}" | grep -E "^HPCBIND_" + echo "[CUDA]" + echo "${MY_ENV}" | grep -E "^CUDA_" + echo "[OPENMP]" + echo "${MY_ENV}" | grep -E "^OMP_" +fi + +if [[ ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then + echo "[BINDINGS]" + hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu +elif [[ ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then + echo "Unable to show bindings, hwloc not available." +fi + +################################################################################ +# Run command +################################################################################ + +if [[ ${HPCBIND_LSTOPO} -eq 0 ]]; then + if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then + hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- $@ + else + eval $@ + fi +else + if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then + if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! -z ${DISPLAY} ]]; then + echo "[BINDINGS]" + hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu + hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- lstopo --pid 0 + else + hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} + fi + else + echo "Unable to show bindings, hwloc not available." + fi +fi diff --git a/lib/kokkos/bin/kokkos-bind b/lib/kokkos/bin/kokkos-bind new file mode 100755 index 0000000000..b6fe07a1bd --- /dev/null +++ b/lib/kokkos/bin/kokkos-bind @@ -0,0 +1,221 @@ +#!/usr/bin/env bash + +# check if hwloc commands exist +declare -i HAS_HWLOC=0 +type hwloc-bind >/dev/null 2>&1 +HAS_HWLOC="${HAS_HWLOC} + $?" + +type hwloc-distrib >/dev/null 2>&1 +HAS_HWLOC="${HAS_HWLOC} + $?" + +type hwloc-ls >/dev/null 2>&1 +HAS_HWLOC="${HAS_HWLOC} + $?" + +type hwloc-calc >/dev/null 2>&1 +HAS_HWLOC="${HAS_HWLOC} + $?" + +type hwloc-ps >/dev/null 2>&1 +HAS_HWLOC="${HAS_HWLOC} + $?" 
+
+
+#parse args
+declare -a UNKNOWN_ARGS=()
+declare -i DISTRIBUTE=1
+declare -i INDEX=0
+PROC_BIND="all"
+CURRENT_CPUSET=""
+OPENMP_VERSION=4.0
+OPENMP_PROC_BIND=True
+OPENMP_NESTED=True
+VERBOSE=False
+
+#get the current process cpuset
+if [[ ${HAS_HWLOC} -eq 0 ]]; then
+  MY_PID="$BASHPID"
+  CURRENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2)
+  echo "$CURRENT_CPUSET"
+fi
+
+function show_help {
+  local cmd=$(basename "$0")
+  echo "Usage: ${cmd} -- command ..."
+  echo "  Uses hwloc to divide the node into the given number of groups,"
+  echo "  set the appropriate OMP_NUM_THREADS and execute the command on the"
+  echo "  selected group."
+  echo ""
+  echo "  NOTE: This command assumes it has exclusive use of the node"
+  echo ""
+  echo "Options:"
+  echo "  --proc-bind=<LOC>       Set the initial process mask for the script."
+  echo "                          LOC can be any valid location argument for"
+  echo "                          hwloc-calc. Defaults to the entire machine"
+  echo "  --distribute=N          Distribute the current proc-bind into N groups"
+  echo "  --index=I               Use the i'th group (zero based)"
+  echo "  --openmp=M.m            Set env variables for the given OpenMP version"
+  echo "                          (default 4.0)"
+  echo "  --no-openmp-proc-bind   Set OMP_PROC_BIND to false and unset OMP_PLACES"
+  echo "  --no-openmp-nested      Set OMP_NESTED to false"
+  echo "  -v|--verbose"
+  echo "  -h|--help"
+  echo ""
+  echo "Sample Usage:"
+  echo "  ${cmd} --distribute=4 --index=2 -v -- command ..."
+  echo ""
+}
+
+if [[ "$#" -eq 0 ]]; then
+  show_help
+  exit 0
+fi
+
+
+for i in $@; do
+  case $i in
+    # number of partitions to create
+    --proc-bind=*)
+      PROC_BIND="${i#*=}"
+      shift
+      ;;
+    --distribute=*)
+      DISTRIBUTE="${i#*=}"
+      shift
+      ;;
+    # which group to use
+    --index=*)
+      INDEX="${i#*=}"
+      shift
+      ;;
+    --openmp=*)
+      OPENMP_VERSION="${i#*=}"
+      shift
+      ;;
+    --no-openmp-proc-bind)
+      OPENMP_PROC_BIND=False
+      shift
+      ;;
+    --no-openmp-nested)
+      OPENMP_NESTED=False
+      shift
+      ;;
+    -v|--verbose)
+      VERBOSE=True
+      shift
+      ;;
+    -h|--help)
+      show_help
+      exit 0
+      ;;
+    # ignore remaining arguments
+    --)
+      shift
+      break
+      ;;
+    # unknown option
+    *)
+      UNKNOWN_ARGS+=("$i")
+      shift
+      ;;
+  esac
+done
+
+if [[ ${#UNKNOWN_ARGS[*]} -gt 0 ]]; then
+  echo "Unknown options: ${UNKNOWN_ARGS[*]}"
+  exit 1
+fi
+
+if [[ ${DISTRIBUTE} -le 0 ]]; then
+  echo "Invalid input for distribute, changing distribute to 1"
+  DISTRIBUTE=1
+fi
+
+if [[ ${INDEX} -ge ${DISTRIBUTE} ]]; then
+  echo "Invalid input for index, changing index to 0"
+  INDEX=0
+fi
+
+if [[ ${HAS_HWLOC} -ne 0 ]]; then
+  echo "hwloc not found, no process binding will occur"
+  DISTRIBUTE=1
+  INDEX=0
+fi
+
+if [[ ${HAS_HWLOC} -eq 0 ]]; then
+
+  if [[ "${CURRENT_CPUSET}" == "" ]]; then
+    BINDING=$(hwloc-calc ${PROC_BIND})
+  else
+    BINDING=$(hwloc-calc --restrict ${CURRENT_CPUSET} ${PROC_BIND})
+  fi
+
+  CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${DISTRIBUTE}))
+  CPUSET=${CPUSETS[${INDEX}]}
+  NUM_THREADS=$(hwloc-ls --restrict ${CPUSET} --only pu | wc -l)
+
+  if [[ "${VERBOSE}" == "True" ]]; then
+    echo "hwloc:           true"
+    echo "  proc_bind:     ${PROC_BIND}"
+    echo "  distribute:    ${DISTRIBUTE}"
+    echo "  index:         ${INDEX}"
+    echo "  parent_cpuset: ${CURRENT_CPUSET}"
+    echo "  cpuset:        ${CPUSET}"
+    echo "omp_num_threads: ${NUM_THREADS}"
+    echo "omp_proc_bind:   ${OPENMP_PROC_BIND}"
+    echo "omp_nested:      ${OPENMP_NESTED}"
+    echo "OpenMP:          ${OPENMP_VERSION}"
+  fi
+
+  # set OMP env
+  if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then
+    if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then
+      export OMP_PLACES="threads"
+      export OMP_PROC_BIND="spread"
+    else
+ export OMP_PROC_BIND="true" + unset OMP_PLACES + fi + else + unset OMP_PLACES + unset OMP_PROC_BIND + fi + if [[ "${OPENMP_NESTED}" == "True" ]]; then + export OMP_NESTED="true" + else + export OMP_NESTED="false" + fi + export OMP_NUM_THREADS="${NUM_THREADS}" + + hwloc-bind ${CPUSET} -- $@ +else + NUM_THREADS=$(cat /proc/cpuinfo | grep -c processor) + + if [[ "${VERBOSE}" == "True" ]]; then + echo "hwloc: false" + echo "omp_num_threads: ${NUM_THREADS}" + echo "omp_proc_bind: ${OPENMP_PROC_BIND}" + echo "omp_nested: ${OPENMP_NESTED}" + echo "OpenMP: ${OPENMP_VERSION}" + fi + + # set OMP env + if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then + if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then + export OMP_PLACES="threads" + export OMP_PROC_BIND="spread" + else + export OMP_PROC_BIND="true" + unset OMP_PLACES + fi + else + unset OMP_PLACES + unset OMP_PROC_BIND + fi + if [[ "${OPENMP_NESTED}" == "True" ]]; then + export OMP_NESTED="true" + else + export OMP_NESTED="false" + fi + export OMP_NUM_THREADS="${NUM_THREADS}" + + eval $@ +fi + diff --git a/lib/kokkos/bin/runtest b/lib/kokkos/bin/runtest new file mode 100755 index 0000000000..92411fe5ba --- /dev/null +++ b/lib/kokkos/bin/runtest @@ -0,0 +1,165 @@ +#!/usr/bin/env bash + +function get_path() { + cd "$(dirname "$0")" + cd .. + echo "$(pwd -P)" +} + +KOKKOS_PATH="$(get_path "$0")" + +function show_help() { + local cmd=$(basename "$0") + echo "Usage: ${cmd} " + echo " Build and run the tests" + echo "" + echo "Options:" + echo " -j=N|--make-j=N Build the tests in parallel" + echo " -c|--clean Clean build and regenerate make files" + echo " --clean-on-pass Clean build when runtest passes" + echo " --output-prefix=
  Prefix of log files  Default: runtest"
+  echo "  --build-only           Only build the tests"
+  echo "  -v|--verbose           Tee STDOUT and STDERR to screen and files"
+  echo "  -h|--help              Show this message"
+  echo ""
+  ${KOKKOS_PATH}/generate_makefile.bash --help
+  return 0
+}
+
+
+declare -a GENERATE_ARGS=()
+declare -i VERBOSE=0
+declare -i CLEAN=0
+declare -i CLEAN_ON_PASS=0
+declare -i BUILD_ONLY=0
+OUTPUT="runtest"
+
+declare -i MAKE_J=${HPCBIND_NUM_PUS:-1}
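+# Default build parallelism: when launched under hpcbind this reuses the
+# processing-unit count it exports (HPCBIND_NUM_PUS); otherwise fall back to 1.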
+
+for i in $@; do
+  case $i in
+    -j=*|--make-j=*)
+      MAKE_J=${i#*=}
+      shift
+      ;;
+    -c|--clean)
+      CLEAN=1
+      shift
+      ;;
+    --clean-on-pass)
+      CLEAN_ON_PASS=1
+      shift
+      ;;
+    --output-prefix=*)
+      OUTPUT=${i#*=}
+      shift
+      ;;
+    --build-only)
+      BUILD_ONLY=1
+      shift
+      ;;
+    -v|--verbose)
+      VERBOSE=1
+      shift
+      ;;
+    -h|--help)
+      show_help
+      exit 0
+      ;;
+    *)
+      GENERATE_ARGS+=("$i")
+      shift
+      ;;
+  esac
+done
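+# Any argument not matched above was collected into GENERATE_ARGS; it is
+# forwarded verbatim to generate_makefile.bash below (the ${i#*=} expansions
+# strip everything up to and including the first '=').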
+
+if [[ "$(pwd -P)" == ${KOKKOS_PATH} ]]; then
+  echo "Cannot call $0 from root repository path ${KOKKOS_PATH}"
+  exit 1
+fi
+
+# Some makefile dependencies are incorrect, so clean needs to force
+# a new call to generate_makefile.bash
+if [[ ${CLEAN} -eq 1 ]]; then
+  START=${SECONDS}
+  echo "Cleaning"
+  /bin/rm -rf algorithms containers core example install Makefile >/dev/null 2>&1
+  END=${SECONDS}
+  echo "    $((END-START)) seconds"
+  if [[ ${VERBOSE} -eq 1 ]]; then
+    echo ""
+    echo ""
+  fi
+fi
+
+declare -i START=${SECONDS}
+echo "Generating Makefile"
+echo "    ${KOKKOS_PATH}/generate_makefile.bash --kokkos-path=${KOKKOS_PATH} ${GENERATE_ARGS[@]}"
+
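+# Non-verbose runs keep stdout in ${OUTPUT}.out; stderr is duplicated into
+# ${OUTPUT}.err via process substitution so failures still reach the terminal.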
+if [[ ${VERBOSE} -eq 0 ]]; then
+  "${KOKKOS_PATH}"/generate_makefile.bash --kokkos-path="${KOKKOS_PATH}" "${GENERATE_ARGS[@]}" > ${OUTPUT}.out 2> >(tee ${OUTPUT}.err >&2)
+else
+  "${KOKKOS_PATH}"/generate_makefile.bash --kokkos-path="${KOKKOS_PATH}" "${GENERATE_ARGS[@]}" > >(tee ${OUTPUT}.out) 2> >(tee ${OUTPUT}.err >&2)
+fi
+declare -i RESULT=$?
+declare -i END=${SECONDS}
+if [[ ${RESULT} -eq 0 ]]; then
+  echo "    PASS:  $((END-START)) seconds"
+  if [[ ${VERBOSE} -eq 1 ]]; then
+    echo ""
+    echo ""
+  fi
+else
+  grep "FAIL" ${OUTPUT}.out
+  grep "FAIL" ${OUTPUT}.err
+  echo "    FAIL:  $((END-START)) seconds"
+  exit 1
+fi
+
+START=${SECONDS}
+echo "Building"
+if [[ ${VERBOSE} -eq 0 ]]; then
+  make --keep-going -j ${MAKE_J} build-test >> ${OUTPUT}.out 2> >(tee -a ${OUTPUT}.err >&2)
+else
+  make --keep-going -j ${MAKE_J} build-test > >(tee -a ${OUTPUT}.out) 2> >(tee -a ${OUTPUT}.err >&2)
+fi
+RESULT=$?
+END=${SECONDS}
+if [[ ${RESULT} -eq 0 ]]; then
+  echo "    PASS:  $((END-START)) seconds"
+  if [[ ${VERBOSE} -eq 1 ]]; then
+    echo ""
+    echo ""
+  fi
+else
+  grep -E "[[:space:]]error:[[:space:]]" ${OUTPUT}.out
+  grep -E "[[:space:]]error:[[:space:]]" ${OUTPUT}.err
+  echo "    FAIL:  $((END-START)) seconds"
+  exit 1
+fi
+
+if [[ ${BUILD_ONLY} -eq 0 ]]; then
+  START=${SECONDS}
+  echo "Testing"
+  if [[ ${VERBOSE} -eq 0 ]]; then
+    make --keep-going test >> ${OUTPUT}.out 2> >(tee -a ${OUTPUT}.err >&2)
+  else
+    make --keep-going test > >(tee -a ${OUTPUT}.out) 2> >(tee -a ${OUTPUT}.err >&2)
+  fi
+  RESULT=$?
+  END=${SECONDS}
+  if [[ ${RESULT} -eq 0 ]]; then
+    echo "    PASS:  $((END-START)) seconds"
+    if [[ ${CLEAN_ON_PASS} -eq 1 ]]; then
+      make clean
+    fi
+  else
+    grep "FAIL" ${OUTPUT}.out
+    grep "FAIL" ${OUTPUT}.err
+    echo "    FAIL:  $((END-START)) seconds"
+    exit 1
+  fi
+fi
+
+exit ${RESULT}
+
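A minimal usage sketch for the `runtest` helper added above. It must be invoked from a scratch directory (it refuses to run from the repository root); the directory layout and the forwarded `--with-devices=OpenMP` option are assumptions based on the `generate_makefile.bash` flags visible elsewhere in this patch, not part of the script itself:

```bash
# Hypothetical out-of-source invocation of lib/kokkos/bin/runtest.
mkdir -p build && cd build
../lib/kokkos/bin/runtest --make-j=8 --output-prefix=omp --with-devices=OpenMP
# Configure, build, and test logs accumulate in omp.out / omp.err;
# a non-zero exit status means one of the three phases failed.
```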
diff --git a/lib/kokkos/cmake/kokkos.cmake b/lib/kokkos/cmake/kokkos.cmake
index 235b7eaba4..396822c7fa 100644
--- a/lib/kokkos/cmake/kokkos.cmake
+++ b/lib/kokkos/cmake/kokkos.cmake
@@ -999,8 +999,12 @@ SET (Kokkos_INCLUDE_DIRS
     ${Kokkos_SOURCE_DIR}/containers/src
     ${Kokkos_SOURCE_DIR}/algorithms/src
     ${Kokkos_BINARY_DIR}  # to find KokkosCore_config.h
+    ${KOKKOS_INCLUDE_DIRS}
 )
 
+# pass include dirs back to parent scope
+SET(Kokkos_INCLUDE_DIRS_RET ${Kokkos_INCLUDE_DIRS} PARENT_SCOPE)
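+# (a parent CMakeLists.txt that pulls this in via ADD_SUBDIRECTORY can read
+#  the include paths back through Kokkos_INCLUDE_DIRS_RET)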
+
 INCLUDE_DIRECTORIES(${Kokkos_INCLUDE_DIRS})
 
 IF(KOKKOS_SEPARATE_LIBS)
diff --git a/lib/kokkos/config/master_history.txt b/lib/kokkos/config/master_history.txt
index cc6f4c97d7..0447db4b2b 100644
--- a/lib/kokkos/config/master_history.txt
+++ b/lib/kokkos/config/master_history.txt
@@ -7,3 +7,4 @@ tag:  2.02.07    date: 12:16:2016    master: 4b4cc4ba    develop: 382c0966
 tag:  2.02.15    date: 02:10:2017    master: 8c64cd93    develop: 28dea8b6
 tag:  2.03.00    date: 04:25:2017    master: 120d9ce7    develop: 015ba641
 tag:  2.03.05    date: 05:27:2017    master: 36b92f43    develop: 79073186
+tag:  2.03.13    date: 07:27:2017    master: da314444    develop: 29ccb58a
diff --git a/lib/kokkos/config/query_cuda_arch.cpp b/lib/kokkos/config/query_cuda_arch.cpp
new file mode 100644
index 0000000000..383f04e34e
--- /dev/null
+++ b/lib/kokkos/config/query_cuda_arch.cpp
@@ -0,0 +1,24 @@
+#include <cuda_runtime_api.h>
+#include <cstdio>
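+
+// Probe CUDA device 0 and print its architecture family and compute capability.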
+int main()
+{
+  cudaDeviceProp prop;
+  const cudaError_t err_code = cudaGetDeviceProperties(&prop, 0);
+  if (cudaSuccess != err_code) {
+    fprintf(stderr, "cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(err_code));
+    return -1;
+  }
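+  // Map the major compute capability to a Kokkos architecture family name.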
+  switch (prop.major) {
+    case 3:
+      printf("Kepler"); break;
+    case 5:
+      printf("Maxwell"); break;
+    case 6:
+      printf("Pascal"); break;
+    default:
+      fprintf(stderr, "Unsupported Device %d%d\n", (int)prop.major, (int)prop.minor);
+      return -1;
+  }
+  printf("%d%d\n", (int)prop.major, (int)prop.minor);
+  return 0;
+}
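The probe prints an architecture family name followed by the raw compute capability on stdout (e.g. `Kepler35` for a compute-capability 3.5 part), which matches the `KOKKOS_ARCH` spellings used elsewhere in this patch. A minimal sketch of how a build script might consume it; the nvcc invocation and make variables shown here are assumptions, not something this patch wires up:

```bash
# Hypothetical: build and run the probe, then feed its output to a Kokkos build.
nvcc -o query_cuda_arch query_cuda_arch.cpp
if ARCH=$(./query_cuda_arch); then
  make KOKKOS_DEVICES=Cuda KOKKOS_ARCH="${ARCH}"  # e.g. KOKKOS_ARCH=Kepler35
else
  echo "no supported CUDA device found" >&2
fi
```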
diff --git a/lib/kokkos/config/test_all_sandia b/lib/kokkos/config/test_all_sandia
index 8e1246bf8b..005cd20721 100755
--- a/lib/kokkos/config/test_all_sandia
+++ b/lib/kokkos/config/test_all_sandia
@@ -160,9 +160,14 @@ if [ "$MACHINE" = "sems" ]; then
     # Format: (compiler module-list build-list exe-name warning-flag)
     COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
                "gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
                "intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
                "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
                "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+               "intel/16.0.3 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+               "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
                "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
                "clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
                "clang/3.8.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
@@ -280,13 +285,13 @@ elif [ "$MACHINE" = "apollo" ]; then
                "gcc/5.1.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS"
                "intel/16.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS"
                "clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS"
-               "clang/head $CLANG_MODULE_LIST "Cuda_Pthread" clang++ $CUDA_WARNING_FLAGS"
+               "clang/4.0.0 $CLANG_MODULE_LIST "Cuda_Pthread" clang++ $CUDA_WARNING_FLAGS"
                "cuda/8.0.44 $CUDA_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
     )
   else
     # Format: (compiler module-list build-list exe-name warning-flag)
     COMPILERS=("cuda/8.0.44 $CUDA8_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
-               "clang/head $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS"
+               "clang/4.0.0 $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS"
                "clang/3.9.0 $CLANG_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS"
                "gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
                "gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
@@ -584,7 +589,7 @@ single_build_and_test() {
   else
     run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
     local -i build_start_time=$(date +%s)
-    run_cmd make build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
+    run_cmd make -j 32 build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
     local -i build_end_time=$(date +%s)
     comment="build_time=$(($build_end_time-$build_start_time))"
 
diff --git a/lib/kokkos/config/trilinos-integration/shepard_jenkins_run_script_pthread_intel b/lib/kokkos/config/trilinos-integration/shepard_jenkins_run_script_pthread_intel
index 23968e8c0f..6527df2eb9 100755
--- a/lib/kokkos/config/trilinos-integration/shepard_jenkins_run_script_pthread_intel
+++ b/lib/kokkos/config/trilinos-integration/shepard_jenkins_run_script_pthread_intel
@@ -28,14 +28,14 @@ export JENKINS_DO_PTHREAD=ON
 export JENKINS_DO_SERIAL=OFF
 export JENKINS_DO_COMPLEX=OFF
 
-export ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
-export ARCH_C_FLAG="-xCORE-AVX2 -mkl"
+export JENKINS_ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
+export JENKINS_ARCH_C_FLAG="-xCORE-AVX2 -mkl"
 export BLAS_LIBRARIES="-mkl;${MKLROOT}/lib/intel64/libmkl_intel_lp64.a;${MKLROOT}/lib/intel64/libmkl_intel_thread.a;${MKLROOT}/lib/intel64/libmkl_core.a"
 export LAPACK_LIBRARIES=${BLAS_LIBRARIES}
 
 export JENKINS_DO_TESTS=ON
 export JENKINS_DO_EXAMPLES=ON
-export JENKINS_DO_SHARED=OFF
+export JENKINS_DO_SHARED=ON
 
 export QUEUE=haswell
 
diff --git a/lib/kokkos/config/trilinos-integration/shepard_jenkins_run_script_serial_intel b/lib/kokkos/config/trilinos-integration/shepard_jenkins_run_script_serial_intel
index 964de3a002..1a306bc2b2 100755
--- a/lib/kokkos/config/trilinos-integration/shepard_jenkins_run_script_serial_intel
+++ b/lib/kokkos/config/trilinos-integration/shepard_jenkins_run_script_serial_intel
@@ -28,14 +28,14 @@ export JENKINS_DO_PTHREAD=OFF
 export JENKINS_DO_SERIAL=ON
 export JENKINS_DO_COMPLEX=ON
 
-export ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
-export ARCH_C_FLAG="-xCORE-AVX2 -mkl"
+export JENKINS_ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
+export JENKINS_ARCH_C_FLAG="-xCORE-AVX2 -mkl"
 export BLAS_LIBRARIES="-mkl;${MKLROOT}/lib/intel64/libmkl_intel_lp64.a;${MKLROOT}/lib/intel64/libmkl_intel_thread.a;${MKLROOT}/lib/intel64/libmkl_core.a"
 export LAPACK_LIBRARIES=${BLAS_LIBRARIES}
 
 export JENKINS_DO_TESTS=ON
 export JENKINS_DO_EXAMPLES=ON
-export JENKINS_DO_SHARED=OFF
+export JENKINS_DO_SHARED=ON
 
 export QUEUE=haswell
 
diff --git a/lib/kokkos/containers/performance_tests/Makefile b/lib/kokkos/containers/performance_tests/Makefile
index edaaf1ee51..ec69363a17 100644
--- a/lib/kokkos/containers/performance_tests/Makefile
+++ b/lib/kokkos/containers/performance_tests/Makefile
@@ -60,7 +60,6 @@ test-threads: KokkosContainers_PerformanceTest_Threads
 test-openmp: KokkosContainers_PerformanceTest_OpenMP
 	./KokkosContainers_PerformanceTest_OpenMP
 
-
 build_all: $(TARGETS)
 
 test: $(TEST_TARGETS)
diff --git a/lib/kokkos/containers/performance_tests/TestMain.cpp b/lib/kokkos/containers/performance_tests/TestMain.cpp
index f952ab3db5..1224af7cdb 100644
--- a/lib/kokkos/containers/performance_tests/TestMain.cpp
+++ b/lib/kokkos/containers/performance_tests/TestMain.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,12 +36,15 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
 
 #include <gtest/gtest.h>
+#include 
+
+#include 
 
 int main(int argc, char *argv[]) {
   ::testing::InitGoogleTest(&argc,argv);
diff --git a/lib/kokkos/containers/performance_tests/TestOpenMP.cpp b/lib/kokkos/containers/performance_tests/TestOpenMP.cpp
index b674ec4a74..6631184624 100644
--- a/lib/kokkos/containers/performance_tests/TestOpenMP.cpp
+++ b/lib/kokkos/containers/performance_tests/TestOpenMP.cpp
@@ -69,30 +69,13 @@ protected:
   {
     std::cout << std::setprecision(5) << std::scientific;
 
-    unsigned num_threads = 4;
-
-    if (Kokkos::hwloc::available()) {
-      num_threads = Kokkos::hwloc::get_available_numa_count()
-                    * Kokkos::hwloc::get_available_cores_per_numa()
-                    * Kokkos::hwloc::get_available_threads_per_core()
-                    ;
-
-    }
-
-    std::cout << "OpenMP: " << num_threads << std::endl;
-
-    Kokkos::OpenMP::initialize( num_threads );
-
-    std::cout << "available threads: " << omp_get_max_threads() << std::endl;
+    Kokkos::OpenMP::initialize();
+    Kokkos::OpenMP::print_configuration( std::cout );
   }
 
   static void TearDownTestCase()
   {
     Kokkos::OpenMP::finalize();
-
-    omp_set_num_threads(1);
-
-    ASSERT_EQ( 1 , omp_get_max_threads() );
   }
 };
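
A note on the OpenMP test-harness hunks above (the containers unit test below gets the same treatment): the hand-rolled hwloc thread count is gone and initialization is left to the backend. A minimal sketch of the resulting pattern, assuming the Kokkos 2.x OpenMP API used in this tree:

    #include <Kokkos_Core.hpp>
    #include <iostream>

    int main() {
      // No explicit thread count: the backend decides (OMP_NUM_THREADS,
      // its own hwloc detection, or its default policy).
      Kokkos::OpenMP::initialize();
      Kokkos::OpenMP::print_configuration( std::cout );
      Kokkos::OpenMP::finalize();
      return 0;
    }
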
 
diff --git a/lib/kokkos/containers/src/Kokkos_DualView.hpp b/lib/kokkos/containers/src/Kokkos_DualView.hpp
index 937eab0d88..35cc8ec753 100644
--- a/lib/kokkos/containers/src/Kokkos_DualView.hpp
+++ b/lib/kokkos/containers/src/Kokkos_DualView.hpp
@@ -564,7 +564,7 @@ namespace Impl {
 template< class D, class A1, class A2, class A3, class ... Args >
 struct DualViewSubview {
 
-  typedef typename Kokkos::Experimental::Impl::ViewMapping
+  typedef typename Kokkos::Impl::ViewMapping
     < void
     , Kokkos::ViewTraits< D, A1, A2, A3 >
     , Args ...
diff --git a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp
index 8e464506f9..d22d6b865d 100644
--- a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp
+++ b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp
@@ -46,19 +46,6 @@
 ///
 /// This header file declares and defines Kokkos::Experimental::DynRankView and its
 /// related nonmember functions.
-/*
- *   Changes from View
- *   1. The rank of the DynRankView is returned by the method rank()
- *   2. Max rank of a DynRankView is 7
- *   3. subview name is subdynrankview
- *   4. Every subdynrankview is returned with LayoutStride
- *
- *   NEW: Redesigned DynRankView
- *   5. subview function name now available
- *   6. Copy and Copy-Assign View to DynRankView
- *   7. deep_copy between Views and DynRankViews
- *   8. rank( view ); returns the rank of View or DynRankView
- */
 
 #ifndef KOKKOS_DYNRANKVIEW_HPP
 #define KOKKOS_DYNRANKVIEW_HPP
@@ -117,6 +104,14 @@ struct DynRankDimTraits {
                       , layout.dimension[7] );
   }
 
+  // Extra overload to match that for specialize types v2
+  template <typename Layout, typename ... P>
+  KOKKOS_INLINE_FUNCTION
+  static size_t computeRank( const Kokkos::Impl::ViewCtorProp<P...>& prop, const Layout& layout )
+  {
+    return computeRank(layout);
+  }
+
   // Create the layout for the rank-7 view.
   // Non-strided Layout
   template <typename Layout>
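
The "extra overload" added above is a forwarding shim: the unspecialized path ignores the ViewCtorProp argument and defers to the layout-only computeRank, so specialized value types can hook in with a prop-aware overload of their own. A self-contained sketch of the idiom with stand-in types (not the Kokkos definitions):

    #include <cstddef>

    struct Layout { std::size_t dimension[8]; };       // stand-in layout
    template <class... P> struct CtorProp {};          // stand-in for ViewCtorProp

    inline std::size_t computeRank( const Layout& layout ) {
      return layout.dimension[0] != 0 ? 1 : 0;         // placeholder rank rule
    }

    // The prop-taking overload drops the prop and forwards.
    template <class... P>
    std::size_t computeRank( const CtorProp<P...>&, const Layout& layout ) {
      return computeRank( layout );
    }
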
@@ -158,8 +153,17 @@ struct DynRankDimTraits {
                  );
   }
 
+  // Extra overload to match that for specialize types
+  template <typename Traits, typename ... P>
+  KOKKOS_INLINE_FUNCTION
+  static typename std::enable_if< (std::is_same<typename Traits::array_layout , Kokkos::LayoutRight>::value || std::is_same<typename Traits::array_layout , Kokkos::LayoutLeft>::value || std::is_same<typename Traits::array_layout , Kokkos::LayoutStride>::value) , typename Traits::array_layout >::type createLayout( const ViewCtorProp<P...>& prop, const typename Traits::array_layout& layout )
+  {
+    return createLayout( layout );
+  }
+
   // Create a view from the given dimension arguments.
   // This is only necessary because the shmem constructor doesn't take a layout.
+  //   NDE shmem Views are not compatible with the added view_alloc value_type / fad_dim deduction functionality
+  template <typename ViewType, typename ViewArg>
   static ViewType createView( const ViewArg& arg
                             , const size_t N0
@@ -186,7 +190,8 @@ struct DynRankDimTraits {
   // Non-strided Layout
   template <typename Layout , typename iType>
   KOKKOS_INLINE_FUNCTION
-  static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutRight>::value || std::is_same<Layout , Kokkos::LayoutLeft>::value) && std::is_integral<iType>::value , Layout >::type reconstructLayout( const Layout& layout , iType dynrank )
+  static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutRight>::value || std::is_same<Layout , Kokkos::LayoutLeft>::value) && std::is_integral<iType>::value , Layout >::type
+  reconstructLayout( const Layout& layout , iType dynrank )
   {
     return Layout( dynrank > 0 ? layout.dimension[0] : ~size_t(0)
                  , dynrank > 1 ? layout.dimension[1] : ~size_t(0)
@@ -202,7 +207,8 @@ struct DynRankDimTraits {
   // LayoutStride
   template <typename Layout , typename iType>
   KOKKOS_INLINE_FUNCTION
-  static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutStride>::value) && std::is_integral<iType>::value , Layout >::type reconstructLayout( const Layout& layout , iType dynrank )
+  static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutStride>::value) && std::is_integral<iType>::value , Layout >::type
+  reconstructLayout( const Layout& layout , iType dynrank )
   {
     return Layout( dynrank > 0 ? layout.dimension[0] : ~size_t(0)
                  , dynrank > 0 ? layout.stride[0] : (0)
@@ -311,6 +317,11 @@ void dyn_rank_view_verify_operator_bounds
 /** \brief  Assign compatible default mappings */
 struct ViewToDynRankViewTag {};
 
+} // namespace Impl
+} // namespace Experimental
+
+namespace Impl {
+
 template< class DstTraits , class SrcTraits >
 class ViewMapping< DstTraits , SrcTraits ,
   typename std::enable_if<(
@@ -337,7 +348,7 @@ class ViewMapping< DstTraits , SrcTraits ,
         )
       )
     )
-  ) , ViewToDynRankViewTag >::type >
+  ) , Kokkos::Experimental::Impl::ViewToDynRankViewTag >::type >
 {
 private:
 
@@ -376,7 +387,7 @@ public:
 
       typedef typename DstType::offset_type  dst_offset_type ;
       dst.m_map.m_offset = dst_offset_type(std::integral_constant<unsigned,0>() , src.layout() ); //Check this for integer input1 for padding, etc
-      dst.m_map.m_handle = Kokkos::Experimental::Impl::ViewDataHandle< DstTraits >::assign( src.m_map.m_handle , src.m_track );
+      dst.m_map.m_handle = Kokkos::Impl::ViewDataHandle< DstTraits >::assign( src.m_map.m_handle , src.m_track );
       dst.m_track.assign( src.m_track , DstTraits::is_managed );
       dst.m_rank = src.Rank ;
     }
@@ -384,22 +395,20 @@ public:
 
 } //end Impl
 
+namespace Experimental {
+
 /* \class DynRankView
  * \brief Container that creates a Kokkos view with rank determined at runtime.
- *   Essentially this is a rank 7 view that wraps the access operators
- *   to yield the functionality of a view
+ *   Essentially this is a rank 7 view
  *
  *   Changes from View
  *   1. The rank of the DynRankView is returned by the method rank()
  *   2. Max rank of a DynRankView is 7
- *   3. subview name is subdynrankview
- *   4. Every subdynrankview is returned with LayoutStride
- *
- *   NEW: Redesigned DynRankView
- *   5. subview function name now available
- *   6. Copy and Copy-Assign View to DynRankView
- *   7. deep_copy between Views and DynRankViews
- *   8. rank( view ); returns the rank of View or DynRankView
+ *   3. subview called with 'subview(...)' or 'subdynrankview(...)' (backward compatibility) 
+ *   4. Every subview is returned with LayoutStride
+ *   5. Copy and Copy-Assign View to DynRankView
+ *   6. deep_copy between Views and DynRankViews
+ *   7. rank( view ); returns the rank of View or DynRankView
  *
  */
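
A minimal usage sketch of the behaviors enumerated in the comment above (runtime rank, copy construction from a View), with spellings taken from this header:

    #include <Kokkos_Core.hpp>
    #include <Kokkos_DynRankView.hpp>

    void example() {
      Kokkos::View<double**, Kokkos::HostSpace> v( "v", 4, 5 );
      // Copy-construct from a compile-time rank-2 View;
      // d.rank() reports 2 at runtime.
      Kokkos::Experimental::DynRankView<double, Kokkos::HostSpace> d( v );
      (void) d.rank();
    }
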
 
@@ -427,7 +436,7 @@ public:
 
 
 private:
-  typedef Kokkos::Experimental::Impl::ViewMapping< traits , void > map_type ;
+  typedef Kokkos::Impl::ViewMapping< traits , void > map_type ;
   typedef Kokkos::Experimental::Impl::SharedAllocationTracker      track_type ;
 
   track_type  m_track ;
@@ -556,7 +565,7 @@ public:
   // Allow specializations to query their specialized map
 
   KOKKOS_INLINE_FUNCTION
-  const Kokkos::Experimental::Impl::ViewMapping< traits , void > &
+  const Kokkos::Impl::ViewMapping< traits , void > &
   implementation_map() const { return m_map ; }
 
   //----------------------------------------
@@ -803,7 +812,7 @@ public:
     , m_rank(rhs.m_rank)
     {
       typedef typename DynRankView<RT,RP...> ::traits SrcTraits ;
-      typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
+      typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
       static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
       Mapping::assign( m_map , rhs.m_map , rhs.m_track );
     }
@@ -813,7 +822,7 @@ public:
   DynRankView & operator = (const DynRankView<RT,RP...> & rhs )
     {
       typedef typename DynRankView<RT,RP...> ::traits SrcTraits ;
-      typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
+      typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
       static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
       Mapping::assign( m_map , rhs.m_map , rhs.m_track );
       m_track.assign( rhs.m_track , traits::is_managed );
@@ -831,7 +840,7 @@ public:
     , m_rank( rhs.Rank )
     {
       typedef typename View<RT,RP...>::traits  SrcTraits ;
-      typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag >  Mapping ;
+      typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag >  Mapping ;
       static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
       Mapping::assign( *this , rhs );
     }
@@ -841,7 +850,7 @@ public:
   DynRankView & operator = ( const View<RT,RP...> & rhs )
     {
       typedef typename View<RT,RP...>::traits  SrcTraits ;
-      typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag >  Mapping ;
+      typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag >  Mapping ;
       static_assert( Mapping::is_assignable , "Incompatible View to DynRankView copy assignment" );
       Mapping::assign( *this , rhs );
       return *this ;
@@ -870,7 +879,7 @@ public:
       )
       : m_track()
       , m_map()
-      , m_rank( Impl::DynRankDimTraits<typename traits::specialize>::computeRank(arg_layout) )
+      , m_rank( Impl::DynRankDimTraits<typename traits::specialize>::template computeRank< typename traits::array_layout, P...>(arg_prop, arg_layout) )
     {
       // Append layout and spaces if not input
       typedef Impl::ViewCtorProp< P ... > alloc_prop_input ;
@@ -923,7 +932,7 @@ public:
 //------------------------------------------------------------
 
       Kokkos::Experimental::Impl::SharedAllocationRecord<> *
-        record = m_map.allocate_shared( prop , Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout) );
+        record = m_map.allocate_shared( prop , Impl::DynRankDimTraits<typename traits::specialize>::template createLayout<traits, P...>(arg_prop, arg_layout) );
 
 //------------------------------------------------------------
 #if defined( KOKKOS_ENABLE_CUDA )
@@ -947,8 +956,8 @@ public:
                                >::type const & arg_layout
       )
       : m_track() // No memory tracking
-      , m_map( arg_prop , Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout) )
-      , m_rank( Impl::DynRankDimTraits<typename traits::specialize>::computeRank(arg_layout) )
+      , m_map( arg_prop , Impl::DynRankDimTraits<typename traits::specialize>::template createLayout<traits, P...>(arg_prop, arg_layout) )
+      , m_rank( Impl::DynRankDimTraits<typename traits::specialize>::template computeRank< typename traits::array_layout, P...>(arg_prop, arg_layout) )
     {
       static_assert(
         std::is_same< pointer_type
@@ -1034,6 +1043,7 @@ public:
     {}
 
   // For backward compatibility
+  // NDE This ctor does not take ViewCtorProp argument - should not use alternative createLayout call
   explicit inline
   DynRankView( const ViewAllocateWithoutInitializing & arg_prop
       , const typename traits::array_layout & arg_layout
@@ -1179,6 +1189,11 @@ namespace Impl {
 
 struct DynRankSubviewTag {};
 
+} // namespace Impl
+} // namespace Experimental
+
+namespace Impl {
+
 template< class SrcTraits , class ... Args >
 struct ViewMapping
   < typename std::enable_if<(
@@ -1192,7 +1207,7 @@ struct ViewMapping
         std::is_same< typename SrcTraits::array_layout
                     , Kokkos::LayoutStride >::value
       )
-    ), DynRankSubviewTag >::type
+    ), Kokkos::Experimental::Impl::DynRankSubviewTag >::type
   , SrcTraits
   , Args ... >
 {
@@ -1264,7 +1279,7 @@ public:
   };
 
 
-  typedef DynRankView< value_type , array_layout , typename SrcTraits::device_type , typename SrcTraits::memory_traits >  ret_type;
+  typedef Kokkos::Experimental::DynRankView< value_type , array_layout , typename SrcTraits::device_type , typename SrcTraits::memory_traits >  ret_type;
 
   template < typename T , class ... P >
   KOKKOS_INLINE_FUNCTION
@@ -1336,9 +1351,10 @@ public:
 
 } // end Impl
 
+namespace Experimental {
 
 template< class V , class ... Args >
-using Subdynrankview = typename Kokkos::Experimental::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , V , Args... >::ret_type ;
+using Subdynrankview = typename Kokkos::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , V , Args... >::ret_type ;
 
 template< class D , class ... P , class ...Args >
 KOKKOS_INLINE_FUNCTION
@@ -1348,7 +1364,7 @@ subdynrankview( const Kokkos::Experimental::DynRankView< D , P... > &src , Args.
     if ( src.rank() > sizeof...(Args) ) //allow sizeof...(Args) >= src.rank(), ignore the remaining args
       { Kokkos::abort("subdynrankview: num of args must be >= rank of the source DynRankView"); }
 
-    typedef Kokkos::Experimental::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , Kokkos::ViewTraits< D*******, P... > , Args... > metafcn ;
+    typedef Kokkos::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , Kokkos::ViewTraits< D*******, P... > , Args... > metafcn ;
 
     return metafcn::subview( src.rank() , src , args... );
   }
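
The recurring edit throughout this header moves the ViewMapping machinery from Kokkos::Experimental::Impl into Kokkos::Impl while DynRankView and subdynrankview stay in Kokkos::Experimental. A usage sketch of the subview behavior listed in the class comment, with the API spelling assumed from the code above:

    #include <Kokkos_Core.hpp>
    #include <Kokkos_DynRankView.hpp>

    void example() {
      Kokkos::Experimental::DynRankView<double, Kokkos::HostSpace> d( "d", 10, 10 );
      // Fixing one index yields a rank-1 DynRankView with LayoutStride.
      auto s = Kokkos::Experimental::subdynrankview( d, Kokkos::ALL(), 3 );
      (void) s;
    }
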
diff --git a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp
index da96db2d6b..e9059d64c4 100644
--- a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp
+++ b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -57,7 +57,7 @@ namespace Experimental {
  */
 template< typename DataType , typename ... P >
 class DynamicView : public Kokkos::ViewTraits< DataType , P ... >
-{ 
+{
 public:
 
   typedef Kokkos::ViewTraits< DataType , P ... >  traits ;
@@ -68,7 +68,7 @@ private:
 
   typedef Kokkos::Experimental::Impl::SharedAllocationTracker   track_type ;
 
-  static_assert( traits::rank == 1 && traits::rank_dynamic == 1 
+  static_assert( traits::rank == 1 && traits::rank_dynamic == 1
                , "DynamicView must be rank-one" );
 
   static_assert( std::is_trivial< typename traits::value_type >::value &&
@@ -216,14 +216,14 @@ public:
         // Verify that allocation of the requested chunk is in progress.
 
         // The allocated chunk counter is m_chunks[ m_chunk_max ]
-        const uintptr_t n = 
+        const uintptr_t n =
           *reinterpret_cast<uintptr_t*>( m_chunks + m_chunk_max );
 
         if ( n <= ic ) {
           Kokkos::abort("Kokkos::DynamicView array bounds error");
         }
 
-        // Allocation of this chunk is in progress 
+        // Allocation of this chunk is in progress
         // so wait for allocation to complete.
         while ( 0 == *ch );
       }
@@ -267,7 +267,7 @@ public:
         const uintptr_t jc_try = jc ;
 
         // Jump iteration to the chunk counter.
-        
+
         jc = atomic_compare_exchange( pc , jc_try , jc_try + 1 );
 
         if ( jc_try == jc ) {
@@ -316,7 +316,7 @@ public:
       }
       else {
         while ( NC + 1 <= *pc ) {
-          --*pc ;        
+          --*pc ;
           m_pool.deallocate( m_chunks[*pc]
                            , sizeof(value_type) << m_chunk_shift );
           m_chunks[*pc] = 0 ;
@@ -331,7 +331,7 @@ public:
     typename traits::value_type ** m_chunks ;
     uintptr_t                    * m_pc ;
     uintptr_t                      m_nc ;
-    unsigned                       m_chunk_shift ;  
+    unsigned                       m_chunk_shift ;
 
     KOKKOS_INLINE_FUNCTION
     void operator()( int ) const
@@ -348,7 +348,7 @@ public:
         }
         else {
           while ( m_nc + 1 <= *m_pc ) {
-            --*m_pc ;        
+            --*m_pc ;
             m_pool.deallocate( m_chunks[*m_pc]
                              , sizeof(value_type) << m_chunk_shift );
             m_chunks[*m_pc] = 0 ;
@@ -482,7 +482,7 @@ public:
   };
 
 
-  /**\brief  Allocation constructor 
+  /**\brief  Allocation constructor
    *
    *  Memory is allocated in chunks from the memory pool.
    *  The chunk size conforms to the memory pool's chunk size.
@@ -557,7 +557,7 @@ void deep_copy( const View & dst
 
   if ( DstExecCanAccessSrc ) {
     // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
-    Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type >( dst , src );
+    Kokkos::Impl::ViewRemap< dst_type , src_type >( dst , src );
   }
   else {
     Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");
@@ -581,7 +581,7 @@ void deep_copy( const DynamicView & dst
 
   if ( DstExecCanAccessSrc ) {
     // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
-    Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type >( dst , src );
+    Kokkos::Impl::ViewRemap< dst_type , src_type >( dst , src );
   }
   else {
     Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");
diff --git a/lib/kokkos/containers/unit_tests/TestCuda.cpp b/lib/kokkos/containers/unit_tests/TestCuda.cpp
index 5a78a5de9e..651a4e7eb8 100644
--- a/lib/kokkos/containers/unit_tests/TestCuda.cpp
+++ b/lib/kokkos/containers/unit_tests/TestCuda.cpp
@@ -69,6 +69,8 @@
 #include 
 #include 
 
+#include <TestViewCtorPropEmbeddedDim.hpp>
+
 //----------------------------------------------------------------------------
 
 
@@ -94,6 +96,10 @@ TEST_F( cuda , dyn_view_api) {
   TestDynViewAPI< double , Kokkos::Cuda >();
 }
 
+TEST_F( cuda, viewctorprop_embedded_dim ) {
+  TestViewCtorProp_EmbeddedDim< Kokkos::Cuda >::test_vcpt( 2, 3 );
+}
+
 TEST_F( cuda , staticcrsgraph )
 {
   TestStaticCrsGraph::run_test_graph< Kokkos::Cuda >();
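
The same viewctorprop_embedded_dim body is registered once per backend via that backend's gtest fixture; the analogous OpenMP, Serial, and Threads hunks follow below. A stripped-down sketch of the pattern, all names illustrative:

    #include <gtest/gtest.h>

    struct FakeExecSpace {};                            // stand-in for a backend

    template <class ExecSpace>
    struct SomeBackendTest {
      static void run( int n0, int n1 ) { ASSERT_GT( n0 * n1, 0 ); }
    };

    class cuda_fixture : public ::testing::Test {};     // stand-in fixture

    TEST_F( cuda_fixture, viewctorprop_embedded_dim ) {
      SomeBackendTest<FakeExecSpace>::run( 2, 3 );
    }
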
diff --git a/lib/kokkos/containers/unit_tests/TestOpenMP.cpp b/lib/kokkos/containers/unit_tests/TestOpenMP.cpp
index 2448bd077b..5365d91361 100644
--- a/lib/kokkos/containers/unit_tests/TestOpenMP.cpp
+++ b/lib/kokkos/containers/unit_tests/TestOpenMP.cpp
@@ -66,6 +66,8 @@
 #include 
 #include 
 
+#include <TestViewCtorPropEmbeddedDim.hpp>
+
 #include 
 
 namespace Test {
@@ -76,14 +78,7 @@ protected:
   {
     std::cout << std::setprecision(5) << std::scientific;
 
-    unsigned threads_count = 4 ;
-
-    if ( Kokkos::hwloc::available() ) {
-      threads_count = Kokkos::hwloc::get_available_numa_count() *
-                      Kokkos::hwloc::get_available_cores_per_numa();
-    }
-
-    Kokkos::OpenMP::initialize( threads_count );
+    Kokkos::OpenMP::initialize();
   }
 
   static void TearDownTestCase()
@@ -96,6 +91,10 @@ TEST_F( openmp, dyn_view_api) {
   TestDynViewAPI< double , Kokkos::OpenMP >();
 }
 
+TEST_F( openmp, viewctorprop_embedded_dim ) {
+  TestViewCtorProp_EmbeddedDim< Kokkos::OpenMP >::test_vcpt( 2, 3 );
+}
+
 TEST_F( openmp, bitset )
 {
   test_bitset();
diff --git a/lib/kokkos/containers/unit_tests/TestSerial.cpp b/lib/kokkos/containers/unit_tests/TestSerial.cpp
index 06c4d9f6ed..1b9b5a2da3 100644
--- a/lib/kokkos/containers/unit_tests/TestSerial.cpp
+++ b/lib/kokkos/containers/unit_tests/TestSerial.cpp
@@ -67,6 +67,8 @@
 #include 
 #include 
 
+#include <TestViewCtorPropEmbeddedDim.hpp>
+
 namespace Test {
 
 class serial : public ::testing::Test {
@@ -85,6 +87,10 @@ TEST_F( serial, dyn_view_api) {
   TestDynViewAPI< double , Kokkos::Serial >();
 }
 
+TEST_F( serial, viewctorprop_embedded_dim ) {
+  TestViewCtorProp_EmbeddedDim< Kokkos::Serial >::test_vcpt( 2, 3 );
+}
+
 TEST_F( serial , staticcrsgraph )
 {
   TestStaticCrsGraph::run_test_graph< Kokkos::Serial >();
diff --git a/lib/kokkos/containers/unit_tests/TestThreads.cpp b/lib/kokkos/containers/unit_tests/TestThreads.cpp
index 938ec88e90..aca0b57d65 100644
--- a/lib/kokkos/containers/unit_tests/TestThreads.cpp
+++ b/lib/kokkos/containers/unit_tests/TestThreads.cpp
@@ -70,6 +70,8 @@
 #include 
 #include 
 
+#include <TestViewCtorPropEmbeddedDim.hpp>
+
 namespace Test {
 
 class threads : public ::testing::Test {
@@ -103,6 +105,10 @@ TEST_F( threads , dyn_view_api) {
   TestDynViewAPI< double , Kokkos::Threads >();
 }
 
+TEST_F( threads, viewctorprop_embedded_dim ) {
+  TestViewCtorProp_EmbeddedDim< Kokkos::Threads >::test_vcpt( 2, 3 );
+}
+
 TEST_F( threads , staticcrsgraph )
 {
   TestStaticCrsGraph::run_test_graph< Kokkos::Threads >();
diff --git a/lib/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp b/lib/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp
new file mode 100644
index 0000000000..1efd1ddc51
--- /dev/null
+++ b/lib/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp
@@ -0,0 +1,213 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cstdio>
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_DynRankView.hpp>
+
+#include <type_traits>
+#include <typeinfo>
+
+namespace Test {
+
+namespace {
+
+template <typename ExecSpace>
+struct TestViewCtorProp_EmbeddedDim {
+
+  using ViewIntType     = typename Kokkos::View< int**, ExecSpace >;
+  using ViewDoubleType     = typename Kokkos::View< double*, ExecSpace >;
+
+  using DynRankViewIntType     = typename Kokkos::DynRankView< int, ExecSpace >;
+  using DynRankViewDoubleType     = typename Kokkos::DynRankView< double, ExecSpace >;
+
+  // Cuda 7.0 has issues with using a lambda in parallel_for to initialize the view - replace with this functor
+  template < class ViewType >
+  struct Functor {
+
+    ViewType v;
+
+    Functor( const ViewType & v_ ) : v(v_) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( const int i ) const {
+      v(i) = i;
+    }
+
+  };
+
+
+  static void test_vcpt( const int N0, const int N1 )
+  {
+
+    // Create two views to test
+    {
+      using VIT = typename TestViewCtorProp_EmbeddedDim::ViewIntType ;
+      using VDT = typename TestViewCtorProp_EmbeddedDim::ViewDoubleType ;
+
+      VIT vi1("vi1", N0, N1);
+      VDT vd1("vd1", N0);
+
+      // TEST: Test for common type between two views, one with type double, other with type int
+      // Deduce common value_type and construct a view with that type
+      {
+        // Two views
+        auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1, vd1);
+        typedef typename decltype( view_alloc_arg )::value_type                    CommonViewValueType;
+        typedef typename Kokkos::View< CommonViewValueType*, ExecSpace >  CVT;
+        typedef typename CVT::HostMirror                                           HostCVT;
+
+        // Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg
+        CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
+
+        Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1),
+          Functor<CVT>(cv1)
+        );
+
+        HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
+        Kokkos::deep_copy( hcv1, cv1 );
+
+        ASSERT_EQ( (std::is_same< CommonViewValueType, double >::value) , true ) ;
+      #if 0
+      // debug output
+      for ( int i = 0; i < N0*N1; ++i ) {
+        printf(" Output check: hcv1(%d) = %lf\n ", i, hcv1(i) );
+      }
+
+      printf( " Common value type view: %s \n", typeid( CVT() ).name() );
+      printf( " Common value type: %s \n", typeid( CommonViewValueType() ).name() );
+      if ( std::is_same< CommonViewValueType, double >::value == true ) {
+        printf("Proper common value_type\n");
+      }
+      else {
+        printf("WRONG common value_type\n");
+      }
+      // end debug output
+      #endif
+      }
+
+      {
+        // Single view
+        auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1);
+        typedef typename decltype( view_alloc_arg )::value_type                    CommonViewValueType;
+        typedef typename Kokkos::View< CommonViewValueType*, ExecSpace >  CVT;
+        typedef typename CVT::HostMirror                                           HostCVT;
+
+        // Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg
+        CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
+
+        Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1),
+          Functor<CVT>(cv1)
+        );
+
+        HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
+        Kokkos::deep_copy( hcv1, cv1 );
+
+        ASSERT_EQ( (std::is_same< CommonViewValueType, int>::value) , true ) ;
+      }
+
+    }
+
+    // Create two dynamic rank views to test
+    {
+      using VIT = typename TestViewCtorProp_EmbeddedDim::DynRankViewIntType ;
+      using VDT = typename TestViewCtorProp_EmbeddedDim::DynRankViewDoubleType ;
+
+      VIT vi1("vi1", N0, N1);
+      VDT vd1("vd1", N0);
+
+      // TEST: Test for common type between two views, one with type double, other with type int
+      // Deduce common value_type and construct a view with that type
+      {
+        // Two views
+        auto view_alloc_arg = Kokkos::common_view_alloc_prop( vi1, vd1 );
+        typedef typename decltype( view_alloc_arg )::value_type                    CommonViewValueType;
+        typedef typename Kokkos::View< CommonViewValueType*, ExecSpace >  CVT;
+        typedef typename CVT::HostMirror                                           HostCVT;
+
+        // Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg
+        CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
+
+
+        Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1),
+          Functor<CVT>(cv1)
+        );
+
+        HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
+        Kokkos::deep_copy( hcv1, cv1 );
+
+        ASSERT_EQ( (std::is_same< CommonViewValueType, double >::value) , true ) ;
+      }
+
+      {
+        // Single views
+        auto view_alloc_arg = Kokkos::common_view_alloc_prop( vi1 );
+        typedef typename decltype( view_alloc_arg )::value_type                    CommonViewValueType;
+        typedef typename Kokkos::View< CommonViewValueType*, ExecSpace >  CVT;
+        typedef typename CVT::HostMirror                                           HostCVT;
+
+        // Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg
+        CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
+
+        Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1),
+          Functor<CVT>(cv1)
+        );
+
+        HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
+        Kokkos::deep_copy( hcv1, cv1 );
+
+        ASSERT_EQ( (std::is_same< CommonViewValueType, int>::value) , true ) ;
+      }
+    }
+
+
+  } // end test_vcpt
+
+}; // end struct
+
+} // namespace
+
+} // namespace Test
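
Condensed, what the new header exercises: common_view_alloc_prop deduces a common value_type across the given views (double wins over int here), and the resulting prop is accepted by view_alloc. A sketch under the same assumptions as the test:

    #include <Kokkos_Core.hpp>

    void example() {
      Kokkos::View<int*>    vi( "vi", 6 );
      Kokkos::View<double*> vd( "vd", 6 );

      auto prop = Kokkos::common_view_alloc_prop( vi, vd );
      using common_t = typename decltype( prop )::value_type;   // double
      Kokkos::View<common_t*> cv( Kokkos::view_alloc( "cv", prop ), 6 );
      (void) cv;
    }
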
diff --git a/lib/kokkos/containers/unit_tests/UnitTestMain.cpp b/lib/kokkos/containers/unit_tests/UnitTestMain.cpp
index f952ab3db5..2b73535c83 100644
--- a/lib/kokkos/containers/unit_tests/UnitTestMain.cpp
+++ b/lib/kokkos/containers/unit_tests/UnitTestMain.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,12 +36,14 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
 
 #include <gtest/gtest.h>
+#include 
+#include 
 
 int main(int argc, char *argv[]) {
   ::testing::InitGoogleTest(&argc,argv);
diff --git a/lib/kokkos/core/perf_test/Makefile b/lib/kokkos/core/perf_test/Makefile
index f59e7bbe1c..bb9353f583 100644
--- a/lib/kokkos/core/perf_test/Makefile
+++ b/lib/kokkos/core/perf_test/Makefile
@@ -79,7 +79,6 @@ test-mempool: KokkosCore_PerformanceTest_Mempool
 test-taskdag: KokkosCore_PerformanceTest_TaskDAG
 	./KokkosCore_PerformanceTest_TaskDAG
 
-
 build_all: $(TARGETS)
 
 test: $(TEST_TARGETS)
diff --git a/lib/kokkos/core/perf_test/PerfTestMain.cpp b/lib/kokkos/core/perf_test/PerfTestMain.cpp
index d80cfab8b5..832f650b9a 100644
--- a/lib/kokkos/core/perf_test/PerfTestMain.cpp
+++ b/lib/kokkos/core/perf_test/PerfTestMain.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,12 +36,14 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
 
 #include <gtest/gtest.h>
+#include 
+
 #include 
 
 namespace Test {
diff --git a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp
new file mode 100644
index 0000000000..46321378d9
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp
@@ -0,0 +1,2715 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_EXP_ITERATE_TILE_REFACTOR_HPP
+#define KOKKOS_CUDA_EXP_ITERATE_TILE_REFACTOR_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
+
+#include 
+#include 
+#include 
+
+#include 
+
+// #include <Cuda/Kokkos_CudaExec.hpp>
+// Including the file above leads to the following type of errors:
+// /home/ndellin/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp(84): error: incomplete type is not allowed
+// use existing Kokkos functionality, e.g. max blocks, once resolved
+
+#if defined(KOKKOS_ENABLE_PROFILING)
+#include <impl/Kokkos_Profiling_Interface.hpp>
+#include <typeinfo>
+#endif
+
+namespace Kokkos { namespace Experimental { namespace Impl {
+
+namespace Refactor {
+
+// ------------------------------------------------------------------ //
+// ParallelFor iteration pattern
+template< int N , typename RP , typename Functor , typename Tag >
+struct DeviceIterateTile;
+
+//Rank 2
+// Specializations for void tag type
+template< typename RP , typename Functor >
+struct DeviceIterateTile<2,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    // LL
+    if (RP::inner_direction == RP::Left) {
+      for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+        const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+        if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
+
+          for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+            const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+            if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
+              m_func(offset_0 , offset_1);
+            }
+          }
+        }
+      }
+    }
+    // LR
+    else {
+      for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+        if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
+
+          for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+            if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
+              m_func(offset_0 , offset_1);
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag >
+struct DeviceIterateTile<2,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  inline __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    if (RP::inner_direction == RP::Left) {
+      // Loop over size maxnumblocks until full range covered
+      for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+        const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+        if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
+
+          for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+            const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+            if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
+              m_func(Tag(), offset_0 , offset_1);
+            }
+          }
+        }
+      }
+    }
+    else {
+      for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+        if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
+
+          for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+            if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
+              m_func(Tag(), offset_0 , offset_1);
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
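
These DeviceIterateTile specializations are the device-side workers behind tiled multidimensional parallel_for. A sketch of the host-side call they serve, assuming the Experimental MDRangePolicy spelling of this Kokkos version:

    #include <Kokkos_Core.hpp>

    void example( const int N0, const int N1 ) {
      using policy_t =
        Kokkos::Experimental::MDRangePolicy< Kokkos::Experimental::Rank<2> >;
      // Bounds {0,0}..{N0,N1}, iterated in 8x8 tiles; each (i,j) visited once.
      policy_t policy( {{0,0}}, {{N0,N1}}, {{8,8}} );
      Kokkos::parallel_for( policy, KOKKOS_LAMBDA( const int i, const int j ) {
        (void) i; (void) j;                              // user body goes here
      } );
    }
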
+
+
+//Rank 3
+// Specializations for void tag type
+template< typename RP , typename Functor >
+struct DeviceIterateTile<3,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    // LL
+    if (RP::inner_direction == RP::Left) {
+      for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
+        const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
+        if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
+
+          for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+            if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+                const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+                if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
+                  m_func(offset_0 , offset_1 , offset_2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // LR
+    else {
+      for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+        if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
+
+          for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+            if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
+                if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
+                  m_func(offset_0 , offset_1 , offset_2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag >
+struct DeviceIterateTile<3,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  inline __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    if (RP::inner_direction == RP::Left) {
+      for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
+        const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
+        if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
+
+          for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+            if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+                const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+                if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
+                  m_func(Tag(), offset_0 , offset_1 , offset_2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    else {
+      for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+        if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
+
+          for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+            if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
+                if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
+                  m_func(Tag(), offset_0 , offset_1 , offset_2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+
+//Rank 4
+// Specializations for void tag type
+template< typename RP , typename Functor >
+struct DeviceIterateTile<4,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    // LL
+    if (RP::inner_direction == RP::Left) {
+      const index_type temp0  =  m_rp.m_tile_end[0];
+      const index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
+      const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
+      const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
+
+      for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
+        const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
+        if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
+
+          for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
+            const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
+            if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
+
+              for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                  for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                    const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                    if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                      m_func(offset_0 , offset_1 , offset_2 , offset_3);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // LR
+    else {
+      const index_type temp0  =  m_rp.m_tile_end[0];
+      const index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+          ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
+      const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
+      const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
+                if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
+
+                  for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
+                    const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
+                    if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
+                      m_func(offset_0 , offset_1 , offset_2 , offset_3);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag >
+struct DeviceIterateTile<4,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  inline __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if (RP::inner_direction == RP::Left) {
+      const index_type temp0  =  m_rp.m_tile_end[0];
+      const index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
+      const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
+      const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
+
+      for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
+        const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
+        if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
+
+          for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
+            const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
+            if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
+
+              for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                  for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                    const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                    if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                      m_func(Tag(), offset_0 , offset_1 , offset_2 , offset_3);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    else {
+      const index_type temp0  =  m_rp.m_tile_end[0];
+      const index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+          ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
+      const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
+      const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
+                if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
+
+                  for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
+                    const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
+                    if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
+                      m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+
+//Rank 5
+// Specializations for void tag type
+template< typename RP , typename Functor >
+struct DeviceIterateTile<5,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
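+  // 65535 is the CUDA upper limit on gridDim.y and gridDim.z; the grid-splitting
+  // logic below uses it to cap the number of blocks assigned per grid dimension.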
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    // LL
+    if (RP::inner_direction == RP::Left) {
+
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
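+      // Split blockIdx.x across the two fastest tile dimensions; each factor is
+      // clamped to max_blocks, and the product numbl0*numbl1 never exceeds it.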
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
+      const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
+      const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id2 = (index_type)blockIdx.y % numbl2;
+      const index_type tile_id3 = (index_type)blockIdx.y / numbl2;
+      const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2];
+      const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
+
+      for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
+        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
+        if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
+
+          for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+            const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+            if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                    const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                    if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                      for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                          m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // LR
+    else {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+          ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
+      const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
+      const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
+          (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id2 = (index_type)blockIdx.y / numbl3;
+      const index_type tile_id3 = (index_type)blockIdx.y % numbl3;
+      const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3];
+      const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                    if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                      for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
+                        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
+                        if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
+                          m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag >
+struct DeviceIterateTile<5,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    // LL
+    if (RP::inner_direction == RP::Left) {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
+      const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
+      const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id2 = (index_type)blockIdx.y % numbl2;
+      const index_type tile_id3 = (index_type)blockIdx.y / numbl2;
+      const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2];
+      const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
+
+      for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
+        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
+        if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
+
+          for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+            const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+            if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                    const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                    if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                      for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                          m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // LR
+    else {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+          ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
+      const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
+      const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
+          (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id2 = (index_type)blockIdx.y / numbl3;
+      const index_type tile_id3 = (index_type)blockIdx.y % numbl3;
+      const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3];
+      const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                    if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                      for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
+                        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
+                        if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
+                          m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+
+//Rank 6
+// Specializations for void tag type
+template< typename RP , typename Functor >
+struct DeviceIterateTile<6,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    // LL
+    if (RP::inner_direction == RP::Left) {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
+      const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
+      const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id2 = (index_type)blockIdx.y % numbl2;
+      const index_type tile_id3 = (index_type)blockIdx.y / numbl2;
+      const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2];
+      const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
+
+      temp0  =  m_rp.m_tile_end[4];
+      temp1  =  m_rp.m_tile_end[5];
+      const index_type numbl4 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl5 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl4 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id4 = (index_type)blockIdx.z % numbl4;
+      const index_type tile_id5 = (index_type)blockIdx.z / numbl4;
+      const index_type thr_id4 = (index_type)threadIdx.z % m_rp.m_tile[4];
+      const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4];
+
+      for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
+        const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
+        if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
+
+          for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
+            const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
+            if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
+
+              for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                  for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                    const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                    if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                      for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                        const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                        if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                          for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                            const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                            if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                              m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // LR
+    else {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+          ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
+      const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
+      const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
+          (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id2 = (index_type)blockIdx.y / numbl3;
+      const index_type tile_id3 = (index_type)blockIdx.y % numbl3;
+      const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3];
+      const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
+
+      temp0  =  m_rp.m_tile_end[4];
+      temp1  =  m_rp.m_tile_end[5];
+      const index_type numbl5 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl4 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl5 ) :
+          (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id4 = (index_type)blockIdx.z / numbl5;
+      const index_type tile_id5 = (index_type)blockIdx.z % numbl5;
+      const index_type thr_id4 = (index_type)threadIdx.z / m_rp.m_tile[5];
+      const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                    if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                      for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
+                        const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
+                        if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
+
+                          for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
+                            const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
+                            if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
+                              m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag >
+struct DeviceIterateTile<6,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    // LL
+    if (RP::inner_direction == RP::Left) {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
+      const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
+      const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id2 = (index_type)blockIdx.y % numbl2;
+      const index_type tile_id3 = (index_type)blockIdx.y / numbl2;
+      const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2];
+      const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
+
+      temp0  =  m_rp.m_tile_end[4];
+      temp1  =  m_rp.m_tile_end[5];
+      const index_type numbl4 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl5 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl4 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id4 = (index_type)blockIdx.z % numbl4;
+      const index_type tile_id5 = (index_type)blockIdx.z / numbl4;
+      const index_type thr_id4 = (index_type)threadIdx.z % m_rp.m_tile[4];
+      const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4];
+
+      for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
+        const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
+        if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
+
+          for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
+            const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
+            if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
+
+              for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                  for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                    const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                    if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                      for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                        const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                        if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                          for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                            const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                            if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                              m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // LR
+    else {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+          ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
+      const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
+      const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
+          (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id2 = (index_type)blockIdx.y / numbl3;
+      const index_type tile_id3 = (index_type)blockIdx.y % numbl3;
+      const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3];
+      const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
+
+      temp0  =  m_rp.m_tile_end[4];
+      temp1  =  m_rp.m_tile_end[5];
+      const index_type numbl5 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl4 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl5 ) :
+          (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id4 = (index_type)blockIdx.z / numbl5;
+      const index_type tile_id5 = (index_type)blockIdx.z % numbl5;
+      const index_type thr_id4 = (index_type)threadIdx.z / m_rp.m_tile[5];
+      const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                    if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                      for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
+                        const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
+                        if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
+
+                          for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
+                            const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
+                            if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
+                              m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+} // Refactor
+
+// ----------------------------------------------------------------------------------
+
+namespace Reduce {
+
+template < typename T >
+using is_void = std::is_same< T, void >;
+
+template < typename T >
+struct is_array_type : std::false_type
+{
+  using value_type = T;
+};
+
+template < typename T >
+struct is_array_type< T* > : std::true_type
+{
+  using value_type = T;
+};
+
+template < typename T >
+struct is_array_type< T[] > : std::true_type
+{
+  using value_type = T;
+};
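+
+// For illustration (these asserts are not part of the source, but follow from the
+// definitions above): a pointer or unbounded-array value type is classified as an
+// array reduction, and value_type strips the pointer/extent:
+//   static_assert( !is_array_type< double   >::value , "" );
+//   static_assert(  is_array_type< double*  >::value , "" );
+//   static_assert(  is_array_type< double[] >::value , "" );
+//   static_assert( std::is_same< is_array_type< double* >::value_type , double >::value , "" );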
+
+// ------------------------------------------------------------------ //
+template< int N , typename RP , typename Functor , typename Tag , typename ValueType , typename Enable = void >
+struct DeviceIterateTile;
+
+// ParallelReduce iteration pattern
+// Scalar reductions
+
+// num_blocks = min( num_tiles, max_num_blocks ); //i.e. determined by number of tiles and reduction algorithm constraints
+// extract n-dim tile offsets (i.e. tile's global starting multi-index) from the tile id = block id using tile dimensions
+// local indices within a tile extracted from (index_type)threadIdx.y using tile dims, constrained by blocksize
+// combine tile and local id info for multi-dim global ids
+
+// Pattern:
+// Each block+thread is responsible for a tile+local_id combo (additional tiles when striding by num_blocks)
+// 1. create offset arrays
+// 2. loop over number of tiles, striding by griddim (equal to num tiles, or max num blocks)
+// 3. temps set for tile_idx and thrd_idx, which will be modified while decoding
+// 4. if LL vs LR:
+//      determine tile starting point offsets (multidim)
+//      determine local index offsets (multidim)
+//      concatenate tile offset + local offset for global multi-dim index
+//    if offset within range bounds AND local offset within tile bounds, call functor
+
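+// Worked example of the decode (illustrative numbers, not taken from the source):
+// rank 2, Left layout, lower = {0,0}, upper = {10,10}, tile = {4,4}
+//   => m_tile_end = {3,3}, m_num_tiles = 9, m_prod_tile_dims = 16
+// for blockIdx.x == 7 and threadIdx.y == 5:
+//   i = 0 : m_offset[0] = (7 % 3)*4 + 0 = 4 ; tile_idx = 7/3 = 2
+//           m_local_offset[0] = 5 % 4 = 1 ; thrd_idx = 5/4 = 1 ; m_offset[0] += 1 -> 5
+//   i = 1 : m_offset[1] = (2 % 3)*4 + 0 = 8 ; m_local_offset[1] = 1 % 4 = 1 ; m_offset[1] += 1 -> 9
+// both global offsets are below m_upper and both local offsets fit in the tile,
+// so the functor is called at the global index (5, 9).
+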
+// ValueType = T
+//Rank 2
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<2,RP,Functor,void,ValueType, typename std::enable_if< !is_array_type< ValueType >::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
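+    // Only blocks that map to a tile (blockIdx.x) and threads that map to a point
+    // inside a tile (threadIdx.y) participate; any excess simply falls through.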
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_v ); }
+        }
+      }
+    }
+
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<2,RP,Functor,Tag, ValueType, typename std::enable_if< !is_array_type< ValueType >::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  inline __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+//Rank 3
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<3,RP,Functor,void,ValueType , typename std::enable_if< !is_array_type< ValueType >::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<3,RP,Functor,Tag, ValueType, typename std::enable_if< !is_array_type< ValueType >::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  inline __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+//Rank 4
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<4,RP,Functor,void,ValueType , typename std::enable_if< !is_array_type< ValueType >::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<4,RP,Functor,Tag,ValueType, typename std::enable_if< !is_array_type< ValueType >::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  inline __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+//Rank 5
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<5,RP,Functor,void,ValueType , typename std::enable_if< !is_array_type< ValueType >::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<5,RP,Functor,Tag,ValueType, typename std::enable_if< !is_array_type< ValueType >::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+//Rank 6
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<6,RP,Functor,void,ValueType , typename std::enable_if< !is_array_type< ValueType >::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<6,RP,Functor,Tag,ValueType, typename std::enable_if< !is_array_type< ValueType >::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+// ValueType = T[], T*
+//Rank 2
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<2,RP,Functor,void,ValueType, typename std::enable_if< is_array_type< ValueType >::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<2,RP,Functor,Tag, ValueType, typename std::enable_if< is_array_type< ValueType >::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  inline __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_v ); }
+        }
+      } //end for loop over num_tiles - product of tiles in each direction
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+//Rank 3
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<3,RP,Functor,void,ValueType , typename std::enable_if< is_array_type<ValueType>::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile starting global id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+// Specializations for void tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<3,RP,Functor,Tag, ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  inline __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile starting global id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+//Rank 4
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<4,RP,Functor,void,ValueType , typename std::enable_if< is_array_type<ValueType>::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile starting global id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+// Specializations for void tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<4,RP,Functor,Tag,ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  inline __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile starting global id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+//Rank 5
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<5,RP,Functor,void,ValueType , typename std::enable_if< is_array_type<ValueType>::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile starting global id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<5,RP,Functor,Tag,ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile starting global id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+//Rank 6
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<6,RP,Functor,void,ValueType , typename std::enable_if< is_array_type<ValueType>::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile starting global id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<6,RP,Functor,Tag,ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile starting global id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+} // Reduce
+
+// ----------------------------------------------------------------------------------
+
+} } } //end namespace Kokkos::Experimental::Impl
+
+#endif
+#endif
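
The DeviceIterateTile specializations above all share one piece of index arithmetic: a flat tile id (blockIdx.x) and a flat intra-tile thread id (threadIdx.y) are peeled apart one dimension at a time by mod/div against the per-dimension tile counts and tile extents, and the functor runs only when the resulting global index is in bounds, which clips the partial tiles at the upper edges. The following host-side sketch reproduces that decomposition for the rank-2, inner_direction == Left case; the bounds and tile sizes are hypothetical values chosen for illustration, not taken from the patch.

    // Host-side analogue of the Left-layout tile decomposition used above.
    #include <cstdio>

    int main() {
      const int rank        = 2;
      const int lower[2]    = { 0, 0 };   // m_rp.m_lower
      const int upper[2]    = { 10, 7 };  // m_rp.m_upper
      const int tile[2]     = { 4, 4 };   // m_rp.m_tile
      const int tile_end[2] = { 3, 2 };   // ceil((upper-lower)/tile) per dimension

      const int num_tiles        = tile_end[0] * tile_end[1]; // m_rp.m_num_tiles
      const int threads_per_tile = tile[0] * tile[1];         // m_rp.m_prod_tile_dims

      for (int tileidx = 0; tileidx < num_tiles; ++tileidx) {   // plays blockIdx.x
        for (int thrd = 0; thrd < threads_per_tile; ++thrd) {   // plays threadIdx.y
          int tile_idx = tileidx;
          int thrd_idx = thrd;
          int offset[2];
          bool in_bounds = true;
          for (int i = 0; i < rank; ++i) {  // Left: dimension 0 varies fastest
            offset[i] = (tile_idx % tile_end[i]) * tile[i] + lower[i];
            tile_idx /= tile_end[i];
            offset[i] += thrd_idx % tile[i]; // add the tile-local index
            thrd_idx  /= tile[i];
            if (!(offset[i] < upper[i])) in_bounds = false; // clip partial edge tiles
          }
          if (in_bounds) std::printf("(%d,%d)\n", offset[0], offset[1]);
        }
      }
      return 0;
    }

Every in-bounds index pair is visited exactly once, which is why the kernels above can stride blockIdx.x across m_num_tiles without double-counting work.
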
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
index 13abcfd93c..cae8ecd489 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
@@ -53,6 +53,7 @@
 #include 
 #include 
 #include 
+#include <Cuda/Kokkos_Cuda_Locks.hpp>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -125,53 +126,12 @@ unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits:
 
 #endif
 
-
-namespace Kokkos {
-namespace Impl {
-  struct CudaLockArraysStruct {
-    int* atomic;
-    int* scratch;
-    int* threadid;
-    int n;
-  };
-}
-}
-__device__ __constant__
-#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-extern
-#endif
-Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ;
-
-#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
-#define CUDA_SPACE_ATOMIC_XOR_MASK 0x15A39
-
 namespace Kokkos {
 namespace Impl {
   void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink = false);
 }
 }
 
-namespace Kokkos {
-namespace Impl {
-__device__ inline
-bool lock_address_cuda_space(void* ptr) {
-  size_t offset = size_t(ptr);
-  offset = offset >> 2;
-  offset = offset & CUDA_SPACE_ATOMIC_MASK;
-  return (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[offset],0,1));
-}
-
-__device__ inline
-void unlock_address_cuda_space(void* ptr) {
-  size_t offset = size_t(ptr);
-  offset = offset >> 2;
-  offset = offset & CUDA_SPACE_ATOMIC_MASK;
-  atomicExch( &kokkos_impl_cuda_lock_arrays.atomic[ offset ], 0);
-}
-
-}
-}
-
 template< typename T >
 inline
 __device__
@@ -192,7 +152,7 @@ namespace Impl {
 // For 2.0 capability: 48 KB L1 and 16 KB shared
 //----------------------------------------------------------------------------
 
-template< class DriverType >
+template< class DriverType>
 __global__
 static void cuda_parallel_launch_constant_memory()
 {
@@ -202,19 +162,39 @@ static void cuda_parallel_launch_constant_memory()
   driver();
 }
 
-template< class DriverType >
+template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
+__global__
+__launch_bounds__(maxTperB, minBperSM)
+static void cuda_parallel_launch_constant_memory()
+{
+  const DriverType & driver =
+    *((const DriverType *) kokkos_impl_cuda_constant_memory_buffer );
+
+  driver();
+}
+
+template< class DriverType>
 __global__
 static void cuda_parallel_launch_local_memory( const DriverType driver )
 {
   driver();
 }
 
-template < class DriverType ,
-           bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) >
+template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
+__global__
+__launch_bounds__(maxTperB, minBperSM)
+static void cuda_parallel_launch_local_memory( const DriverType driver )
+{
+  driver();
+}
+
+template < class DriverType
+         , class LaunchBounds = Kokkos::LaunchBounds<>
+         , bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) >
 struct CudaParallelLaunch ;
 
-template < class DriverType >
-struct CudaParallelLaunch< DriverType , true > {
+template < class DriverType, class LaunchBounds >
+struct CudaParallelLaunch< DriverType, LaunchBounds, true > {
 
   inline
   CudaParallelLaunch( const DriverType & driver
@@ -238,26 +218,19 @@ struct CudaParallelLaunch< DriverType , true > {
       }
       #ifndef KOKKOS_ARCH_KEPLER //On Kepler the L1 has no benefit since it doesn't cache reads
       else if ( shmem ) {
-        CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferShared ) );
+        CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferShared ) );
       } else {
-        CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferL1 ) );
+        CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferL1 ) );
       }
       #endif
 
       // Copy functor to constant memory on the device
       cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) );
 
-      #ifndef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-      Kokkos::Impl::CudaLockArraysStruct locks;
-      locks.atomic = atomic_lock_array_cuda_space_ptr(false);
-      locks.scratch = scratch_lock_array_cuda_space_ptr(false);
-      locks.threadid = threadid_lock_array_cuda_space_ptr(false);
-      locks.n = Kokkos::Cuda::concurrency();
-      cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
-      #endif
+      KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
 
       // Invoke the driver function on the device
-      cuda_parallel_launch_constant_memory< DriverType ><<< grid , block , shmem , stream >>>();
+      cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM ><<< grid , block , shmem , stream >>>();
 
 #if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
       CUDA_SAFE_CALL( cudaGetLastError() );
@@ -267,8 +240,8 @@ struct CudaParallelLaunch< DriverType , true > {
   }
 };
 
-template < class DriverType >
-struct CudaParallelLaunch< DriverType , false > {
+template < class DriverType, class LaunchBounds >
+struct CudaParallelLaunch< DriverType, LaunchBounds, false > {
 
   inline
   CudaParallelLaunch( const DriverType & driver
@@ -284,22 +257,15 @@ struct CudaParallelLaunch< DriverType , false > {
       }
       #ifndef KOKKOS_ARCH_KEPLER //On Kepler the L1 has no benefit since it doesn't cache reads
       else if ( shmem ) {
-        CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferShared ) );
+        CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferShared ) );
       } else {
-        CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferL1 ) );
+        CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferL1 ) );
       }
       #endif
 
-      #ifndef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-      Kokkos::Impl::CudaLockArraysStruct locks;
-      locks.atomic = atomic_lock_array_cuda_space_ptr(false);
-      locks.scratch = scratch_lock_array_cuda_space_ptr(false);
-      locks.threadid = threadid_lock_array_cuda_space_ptr(false);
-      locks.n = Kokkos::Cuda::concurrency();
-      cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
-      #endif
+      KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
 
-      cuda_parallel_launch_local_memory< DriverType ><<< grid , block , shmem , stream >>>( driver );
+      cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM ><<< grid , block , shmem , stream >>>( driver );
 
 #if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
       CUDA_SAFE_CALL( cudaGetLastError() );
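
A note on the LaunchBounds plumbing above: __launch_bounds__ is a compile-time attribute of the __global__ function itself, so every distinct bound needs its own kernel instantiation, which is why CudaParallelLaunch now carries a LaunchBounds type parameter and forwards maxTperB/minBperSM into the kernel templates. A standalone sketch of the same pattern follows; the driver type and the bound values are hypothetical, not part of the patch.

    // Hypothetical example of forwarding launch bounds to a templated kernel.
    #include <cuda_runtime.h>

    template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
    __global__
    __launch_bounds__( maxTperB, minBperSM )
    void launch_local_memory( const DriverType driver )
    {
      driver();  // per-thread body supplied by the driver object
    }

    struct SaxpyDriver {
      float a; const float* x; float* y; int n;
      __device__ void operator()() const {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if ( i < n ) y[i] = a * x[i] + y[i];
      }
    };

    void run_saxpy( SaxpyDriver d )
    {
      // At most 256 threads per block and at least 4 resident blocks per SM:
      // the compiler caps register usage so that occupancy is achievable.
      launch_local_memory< SaxpyDriver, 256, 4 ><<< (d.n + 255) / 256, 256 >>>( d );
      cudaDeviceSynchronize();
    }
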
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
index 406b4f1e22..b699f0d6ba 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
@@ -230,18 +230,6 @@ void CudaHostPinnedSpace::deallocate( void * const arg_alloc_ptr , const size_t
   } catch(...) {}
 }
 
-constexpr const char* CudaSpace::name() {
-  return m_name;
-}
-
-constexpr const char* CudaUVMSpace::name() {
-  return m_name;
-}
-
-constexpr const char* CudaHostPinnedSpace::name() {
-  return m_name;
-}
-
 } // namespace Kokkos
 
 //----------------------------------------------------------------------------
@@ -655,11 +643,12 @@ reallocate_tracked( void * const arg_alloc_ptr
 SharedAllocationRecord< Kokkos::CudaSpace , void > *
 SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record( void * alloc_ptr )
 {
-  using Header     = SharedAllocationHeader ;
   using RecordBase = SharedAllocationRecord< void , void > ;
   using RecordCuda = SharedAllocationRecord< Kokkos::CudaSpace , void > ;
 
 #if 0
+  using Header     = SharedAllocationHeader ;
+
   // Copy the header from the allocation
   Header head ;
 
@@ -812,83 +801,6 @@ print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & space , bo
   SharedAllocationRecord< void , void >::print_host_accessible_records( s , "CudaHostPinned" , & s_root_record , detail );
 }
 
-} // namespace Impl
-} // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-namespace {
-  __global__ void init_lock_array_kernel_atomic() {
-    unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
-
-    if(i<CUDA_SPACE_ATOMIC_MASK+1)
-      kokkos_impl_cuda_lock_arrays.atomic[i] = 0;
-  }
-
-  __global__ void init_lock_array_kernel_scratch_threadid(int N) {
-    unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
-
-    if(i<N) {
-      kokkos_impl_cuda_lock_arrays.scratch[i] = 0;
-      kokkos_impl_cuda_lock_arrays.threadid[i] = 0;
-    }
-  }
-}
-
-namespace Impl {
-
-int* atomic_lock_array_cuda_space_ptr(bool deallocate) {
-  static int* ptr = NULL;
-  if(deallocate) {
-    cudaFree(ptr);
-    ptr = NULL;
-  }
-
-  if(ptr==NULL && !deallocate)
-    cudaMalloc(&ptr,sizeof(int)*(CUDA_SPACE_ATOMIC_MASK+1));
-  return ptr;
-}
-
-int* scratch_lock_array_cuda_space_ptr(bool deallocate) {
-  static int* ptr = NULL;
-  if(deallocate) {
-    cudaFree(ptr);
-    ptr = NULL;
-  }
-
-  if(ptr==NULL && !deallocate)
-    cudaMalloc(&ptr,sizeof(int)*(Cuda::concurrency()));
-  return ptr;
-}
-
-int* threadid_lock_array_cuda_space_ptr(bool deallocate) {
-  static int* ptr = NULL;
-  if(deallocate) {
-    cudaFree(ptr);
-    ptr = NULL;
-  }
-
-  if(ptr==NULL && !deallocate)
-    cudaMalloc(&ptr,sizeof(int)*(Cuda::concurrency()));
-  return ptr;
-}
-
-void init_lock_arrays_cuda_space() {
-  static int is_initialized = 0;
-  if(! is_initialized) {
-    Kokkos::Impl::CudaLockArraysStruct locks ;
-    locks.atomic = atomic_lock_array_cuda_space_ptr(false);
-    locks.scratch = scratch_lock_array_cuda_space_ptr(false);
-    locks.threadid = threadid_lock_array_cuda_space_ptr(false);
-    locks.n = Kokkos::Cuda::concurrency();
-    cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
-    init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK+255)/256,256>>>();
-    init_lock_array_kernel_scratch_threadid<<<(Kokkos::Cuda::concurrency()+255)/256,256>>>(Kokkos::Cuda::concurrency());
-  }
-}
-
 void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink) {
   static void* ptr = NULL;
   static std::int64_t current_size = 0;
@@ -908,8 +820,8 @@ void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink) {
   return ptr;
 }
 
-}
-}
+} // namespace Impl
+} // namespace Kokkos
 #else
 void KOKKOS_CORE_SRC_CUDA_CUDASPACE_PREVENT_LINK_ERROR() {}
 #endif // KOKKOS_ENABLE_CUDA
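
cuda_resize_scratch_space, kept above and now properly closed inside namespace Kokkos::Impl, maintains a single static allocation that only grows unless force_shrink is set. A host-memory stand-in for that policy, under the assumption that the real implementation reallocates CUDA device memory where this sketch uses malloc/free:

    // Grow-only scratch policy: static pointer plus high-water mark.
    #include <cstdint>
    #include <cstdlib>

    void* resize_scratch( std::int64_t bytes, bool force_shrink = false )
    {
      static void* ptr = nullptr;
      static std::int64_t current_size = 0;
      if ( bytes > current_size || ( bytes < current_size && force_shrink ) ) {
        std::free( ptr );
        ptr = std::malloc( static_cast<std::size_t>( bytes ) );
        current_size = bytes;
      }
      return ptr;
    }
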
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp
index daf55cbd97..80e8f9bd8a 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp
@@ -51,6 +51,7 @@
 
 #include 
 #include 
+#include <Cuda/Kokkos_Cuda_Locks.hpp>
 #include 
 #include 
 
@@ -69,9 +70,6 @@
 __device__ __constant__
 unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits::ConstantMemoryUsage / sizeof(unsigned long) ] ;
 
-__device__ __constant__
-Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ;
-
 #endif
 
 /*--------------------------------------------------------------------------*/
@@ -103,6 +101,7 @@ int cuda_kernel_arch()
   return arch ;
 }
 
+#ifdef KOKKOS_ENABLE_CUDA_UVM
 bool cuda_launch_blocking()
 {
   const char * env = getenv("CUDA_LAUNCH_BLOCKING");
@@ -111,16 +110,13 @@ bool cuda_launch_blocking()
 
   return atoi(env);
 }
+#endif
 
 }
 
 void cuda_device_synchronize()
 {
-//  static const bool launch_blocking = cuda_launch_blocking();
-
-//  if (!launch_blocking) {
-    CUDA_SAFE_CALL( cudaDeviceSynchronize() );
-//  }
+  CUDA_SAFE_CALL( cudaDeviceSynchronize() );
 }
 
 void cuda_internal_error_throw( cudaError e , const char * name, const char * file, const int line )
@@ -240,6 +236,7 @@ public:
   unsigned    m_maxWarpCount ;
   unsigned    m_maxBlock ;
   unsigned    m_maxSharedWords ;
+  uint32_t    m_maxConcurrency ;
   size_type   m_scratchSpaceCount ;
   size_type   m_scratchFlagsCount ;
   size_type   m_scratchUnifiedCount ;
@@ -248,6 +245,7 @@ public:
   size_type * m_scratchSpace ;
   size_type * m_scratchFlags ;
   size_type * m_scratchUnified ;
+  uint32_t  * m_scratchConcurrentBitset ;
   cudaStream_t * m_stream ;
 
   static int was_initialized;
@@ -274,6 +272,7 @@ public:
     , m_maxWarpCount( 0 )
     , m_maxBlock( 0 )
     , m_maxSharedWords( 0 )
+    , m_maxConcurrency( 0 )
     , m_scratchSpaceCount( 0 )
     , m_scratchFlagsCount( 0 )
     , m_scratchUnifiedCount( 0 )
@@ -282,6 +281,7 @@ public:
     , m_scratchSpace( 0 )
     , m_scratchFlags( 0 )
     , m_scratchUnified( 0 )
+    , m_scratchConcurrentBitset( 0 )
     , m_stream( 0 )
     {}
 
@@ -327,7 +327,8 @@ CudaInternal::~CudaInternal()
   if ( m_stream ||
        m_scratchSpace ||
        m_scratchFlags ||
-       m_scratchUnified ) {
+       m_scratchUnified ||
+       m_scratchConcurrentBitset ) {
     std::cerr << "Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()"
               << std::endl ;
     std::cerr.flush();
@@ -339,6 +340,7 @@ CudaInternal::~CudaInternal()
   m_maxWarpCount            = 0 ;
   m_maxBlock                = 0 ;
   m_maxSharedWords          = 0 ;
+  m_maxConcurrency          = 0 ;
   m_scratchSpaceCount       = 0 ;
   m_scratchFlagsCount       = 0 ;
   m_scratchUnifiedCount     = 0 ;
@@ -347,6 +349,7 @@ CudaInternal::~CudaInternal()
   m_scratchSpace            = 0 ;
   m_scratchFlags            = 0 ;
   m_scratchUnified          = 0 ;
+  m_scratchConcurrentBitset = 0 ;
   m_stream                  = 0 ;
 }
 
@@ -485,6 +488,33 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
       (void) scratch_space( reduce_block_count * 16 * sizeof(size_type) );
     }
     //----------------------------------
+    // Concurrent bitset for obtaining unique tokens from within
+    // an executing kernel.
+    {
+      const unsigned max_threads_per_sm = 2048 ; // up to capability 7.0
+
+      m_maxConcurrency =
+        max_threads_per_sm * cudaProp.multiProcessorCount ;
+
+      const int32_t buffer_bound =
+         Kokkos::Impl::concurrent_bitset::buffer_bound( m_maxConcurrency );
+
+      // Allocate and initialize uint32_t[ buffer_bound ]
+
+      typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
+
+      Record * const r = Record::allocate( Kokkos::CudaSpace()
+                                         , "InternalScratchBitset"
+                                         , sizeof(uint32_t) * buffer_bound );
+
+      Record::increment( r );
+
+      m_scratchConcurrentBitset = reinterpret_cast<uint32_t*>( r->data() );
+
+      CUDA_SAFE_CALL( cudaMemset( m_scratchConcurrentBitset , 0 , sizeof(uint32_t) * buffer_bound ) );
+
+    }
+    //----------------------------------
 
     if ( stream_count ) {
       m_stream = (cudaStream_t*) ::malloc( stream_count * sizeof(cudaStream_t) );
@@ -543,16 +573,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
   cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
 
   // Init the array for used for arbitrarily sized atomics
-  Impl::init_lock_arrays_cuda_space();
-
-  #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-  Kokkos::Impl::CudaLockArraysStruct locks;
-  locks.atomic = atomic_lock_array_cuda_space_ptr(false);
-  locks.scratch = scratch_lock_array_cuda_space_ptr(false);
-  locks.threadid = threadid_lock_array_cuda_space_ptr(false);
-  locks.n = Kokkos::Cuda::concurrency();
-  cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
-  #endif
+  Impl::initialize_host_cuda_lock_arrays();
 }
 
 //----------------------------------------------------------------------------
@@ -635,9 +656,7 @@ void CudaInternal::finalize()
   was_finalized = 1;
   if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {
 
-    atomic_lock_array_cuda_space_ptr(true);
-    scratch_lock_array_cuda_space_ptr(true);
-    threadid_lock_array_cuda_space_ptr(true);
+    Impl::finalize_host_cuda_lock_arrays();
 
     if ( m_stream ) {
       for ( size_type i = 1 ; i < m_streamCount ; ++i ) {
@@ -653,6 +672,7 @@ void CudaInternal::finalize()
     RecordCuda::decrement( RecordCuda::get_record( m_scratchFlags ) );
     RecordCuda::decrement( RecordCuda::get_record( m_scratchSpace ) );
     RecordHost::decrement( RecordHost::get_record( m_scratchUnified ) );
+    RecordCuda::decrement( RecordCuda::get_record( m_scratchConcurrentBitset ) );
 
     m_cudaDev             = -1 ;
     m_multiProcCount      = 0 ;
@@ -666,6 +686,7 @@ void CudaInternal::finalize()
     m_scratchSpace        = 0 ;
     m_scratchFlags        = 0 ;
     m_scratchUnified      = 0 ;
+    m_scratchConcurrentBitset = 0 ;
     m_stream              = 0 ;
   }
 }
@@ -713,9 +734,8 @@ namespace Kokkos {
 Cuda::size_type Cuda::detect_device_count()
 { return Impl::CudaInternalDevices::singleton().m_cudaDevCount ; }
 
-int Cuda::concurrency() {
-  return 131072;
-}
+int Cuda::concurrency()
+{ return Impl::CudaInternal::singleton().m_maxConcurrency ; }
 
 int Cuda::is_initialized()
 { return Impl::CudaInternal::singleton().is_initialized(); }
@@ -798,7 +818,22 @@ void Cuda::fence()
 const char* Cuda::name() { return "Cuda"; }
 
 } // namespace Kokkos
+
+namespace Kokkos {
+namespace Experimental {
+
+UniqueToken< Kokkos::Cuda , Kokkos::Experimental::UniqueTokenScope::Global >::
+UniqueToken( Kokkos::Cuda const & )
+  : m_buffer( Kokkos::Impl::CudaInternal::singleton().m_scratchConcurrentBitset )
+  , m_count(  Kokkos::Impl::CudaInternal::singleton().m_maxConcurrency )
+  {}
+
+} // namespace Experimental
+} // namespace Kokkos
+
 #else
+
 void KOKKOS_CORE_SRC_CUDA_IMPL_PREVENT_LINK_ERROR() {}
+
 #endif // KOKKOS_ENABLE_CUDA
 
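The concurrent bitset allocated in CudaInternal::initialize above is what backs the new Experimental::UniqueToken constructor at the end of this file: a kernel thread can check out an id that is unique among the at most m_maxConcurrency threads in flight, instead of needing one resource slot per work item. A usage sketch, assuming the standard UniqueToken acquire()/release()/size() interface; the view extent and kernel body are hypothetical.

    // Per-thread scratch sized by concurrency rather than by problem size.
    #include <Kokkos_Core.hpp>

    void concurrency_sized_scratch( Kokkos::View< double*, Kokkos::CudaSpace > slots )
    {
      Kokkos::Experimental::UniqueToken<
          Kokkos::Cuda, Kokkos::Experimental::UniqueTokenScope::Global >
        token( Kokkos::Cuda() );

      // Assumption: slots.extent(0) >= token.size().
      Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Cuda >( 0, 1 << 20 ),
        KOKKOS_LAMBDA( const int ) {
          const int id = token.acquire(); // sets a bit in the concurrent bitset
          slots( id ) += 1.0;             // slot is exclusively owned while held
          token.release( id );            // clears the bit for reuse
        } );
    }
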
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp
new file mode 100644
index 0000000000..237022ad23
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp
@@ -0,0 +1,119 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+
+#ifdef KOKKOS_ENABLE_CUDA
+
+#include <Cuda/Kokkos_Cuda_Locks.hpp>
+#include <Cuda/Kokkos_Cuda_Error.hpp>
+#include <Kokkos_Cuda.hpp>
+
+#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
+namespace Kokkos {
+namespace Impl {
+__device__ __constant__
+CudaLockArrays g_device_cuda_lock_arrays = { nullptr, nullptr, 0 };
+}
+}
+#endif
+
+namespace Kokkos {
+
+namespace {
+
+__global__ void init_lock_array_kernel_atomic() {
+  unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
+  if(i<CUDA_SPACE_ATOMIC_MASK+1) {
+    Kokkos::Impl::g_device_cuda_lock_arrays.atomic[i] = 0;
+  }
+}
+
+__global__ void init_lock_array_kernel_threadid(int N) {
+  unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
+
+  if(i<(unsigned)N) {
+    Kokkos::Impl::g_device_cuda_lock_arrays.scratch[i] = 0;
+  }
+}
+
+} // namespace
+
+namespace Impl {
+
+CudaLockArrays g_host_cuda_lock_arrays = { nullptr, nullptr, 0 };
+
+void initialize_host_cuda_lock_arrays() {
+  if (g_host_cuda_lock_arrays.atomic != nullptr) return;
+  CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.atomic,
+                            sizeof(int)*(CUDA_SPACE_ATOMIC_MASK+1)));
+  CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.scratch,
+                            sizeof(int)*(Kokkos::Cuda::concurrency())));
+  g_host_cuda_lock_arrays.n = Kokkos::Cuda::concurrency();
+  KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
+  init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK+1+255)/256,256>>>();
+  init_lock_array_kernel_threadid<<<(Kokkos::Cuda::concurrency()+255)/256,256>>>(Kokkos::Cuda::concurrency());
+  CUDA_SAFE_CALL(cudaDeviceSynchronize());
+}
+
+void finalize_host_cuda_lock_arrays() {
+  if (g_host_cuda_lock_arrays.atomic == nullptr) return;
+  cudaFree(g_host_cuda_lock_arrays.atomic);
+  g_host_cuda_lock_arrays.atomic = nullptr;
+  cudaFree(g_host_cuda_lock_arrays.scratch);
+  g_host_cuda_lock_arrays.scratch = nullptr;
+  g_host_cuda_lock_arrays.n = 0;
+#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
+  KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
+#endif
+}
+
+} // namespace Impl
+
+} // namespace Kokkos
+
+#else
+
+void KOKKOS_CORE_SRC_CUDA_CUDA_LOCKS_PREVENT_LINK_ERROR() {}
+
+#endif
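
Both entry points in this new file are idempotent: initialize_host_cuda_lock_arrays returns early once g_host_cuda_lock_arrays.atomic is non-null, and finalize_host_cuda_lock_arrays returns early once it is null again. A sketch of the resulting call pattern (in practice CudaInternal::initialize/finalize make these calls, as the Kokkos_Cuda_Impl.cpp hunks above show):

    #include <Cuda/Kokkos_Cuda_Locks.hpp>

    void lock_array_lifecycle()
    {
      Kokkos::Impl::initialize_host_cuda_lock_arrays(); // allocates, zeroes, copies to device
      Kokkos::Impl::initialize_host_cuda_lock_arrays(); // no-op: atomic != nullptr
      // ... run kernels that may take the per-address hash locks ...
      Kokkos::Impl::finalize_host_cuda_lock_arrays();   // frees and nulls both arrays
      Kokkos::Impl::finalize_host_cuda_lock_arrays();   // no-op: atomic == nullptr
    }
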
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp
new file mode 100644
index 0000000000..d01f06fb4f
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp
@@ -0,0 +1,166 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_LOCKS_HPP
+#define KOKKOS_CUDA_LOCKS_HPP
+
+#include <Kokkos_Macros.hpp>
+
+#ifdef KOKKOS_ENABLE_CUDA
+
+#include <cstdint>
+
+#include <Cuda/Kokkos_Cuda_Error.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+struct CudaLockArrays {
+  std::int32_t* atomic;
+  std::int32_t* scratch;
+  std::int32_t n;
+};
+
+/// \brief This global variable in Host space is the central definition
+///        of these arrays.
+extern Kokkos::Impl::CudaLockArrays g_host_cuda_lock_arrays ;
+
+/// \brief After this call, the g_host_cuda_lock_arrays variable has
+///        valid, initialized arrays.
+///
+/// This call is idempotent.
+void initialize_host_cuda_lock_arrays();
+
+/// \brief After this call, the g_host_cuda_lock_arrays variable has
+///        all null pointers, and all array memory has been freed.
+///
+/// This call is idempotent.
+void finalize_host_cuda_lock_arrays();
+
+} // namespace Impl
+} // namespace Kokkos
+
+#if defined( __CUDACC__ )
+
+namespace Kokkos {
+namespace Impl {
+
+/// \brief This global variable in CUDA space is what kernels use
+///        to get access to the lock arrays.
+///
+/// When relocatable device code is enabled, there can be one single
+/// instance of this global variable for the entire executable,
+/// whose definition will be in Kokkos_Cuda_Locks.cpp (and whose declaration
+/// here must then be extern).
+/// This one instance will be initialized by initialize_host_cuda_lock_arrays
+/// and need not be modified afterwards.
+///
+/// When relocatable device code is disabled, an instance of this variable
+/// will be created in every translation unit that sees this header file
+/// (we make this clear by marking it static, meaning no other translation
+///  unit can link to it).
+/// Since the Kokkos_Cuda_Locks.cpp translation unit cannot initialize the
+/// instances in other translation units, we must update this CUDA global
+/// variable based on the Host global variable prior to running any kernels
+/// that will use it.
+/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro.
+__device__ __constant__
+#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
+extern
+#endif
+Kokkos::Impl::CudaLockArrays g_device_cuda_lock_arrays ;
+
+#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
+
+/// \brief Acquire a lock for the address
+///
+/// This function tries to acquire the lock for the hash value derived
+/// from the provided ptr. If the lock is successfully acquired the
+/// function returns true. Otherwise it returns false.
+__device__ inline
+bool lock_address_cuda_space(void* ptr) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & CUDA_SPACE_ATOMIC_MASK;
+  return (0 == atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset],0,1));
+}
+
+/// \brief Release lock for the address
+///
+/// This function releases the lock for the hash value derived
+/// from the provided ptr. This function should only be called
+/// after previously successfully acquiring a lock with
+/// lock_address.
+__device__ inline
+void unlock_address_cuda_space(void* ptr) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & CUDA_SPACE_ATOMIC_MASK;
+  atomicExch( &Kokkos::Impl::g_device_cuda_lock_arrays.atomic[ offset ], 0);
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+/* Dan Ibanez: it is critical that this code be a macro, so that it will
+   capture the right address for Kokkos::Impl::g_device_cuda_lock_arrays!
+   putting this in an inline function will NOT do the right thing! */
+#define KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \
+{ \
+  CUDA_SAFE_CALL(cudaMemcpyToSymbol( \
+        Kokkos::Impl::g_device_cuda_lock_arrays , \
+        & Kokkos::Impl::g_host_cuda_lock_arrays , \
+        sizeof(Kokkos::Impl::CudaLockArrays) ) ); \
+}
+
+#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
+#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
+#else
+#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
+#endif
+
+#endif /* defined( __CUDACC__ ) */
+
+#endif /* defined( KOKKOS_ENABLE_CUDA ) */
+
+#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */
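
lock_address_cuda_space and unlock_address_cuda_space above hash an address into one of CUDA_SPACE_ATOMIC_MASK+1 lock slots, so a failed try-lock means "another thread holds this hash slot, retry", never "error". A sketch of the spin pattern a caller uses; the update functor is a hypothetical stand-in for the lock-based atomic implementations that consume this header.

    // Spin on the per-slot try-lock, apply a plain read-modify-write under the
    // lock, then release. Kokkos' fallback atomics for oversized types follow
    // this shape.
    #include <Cuda/Kokkos_Cuda_Locks.hpp>

    template< class T, class UpdateOp >
    __device__ T locked_fetch_oper( T* ptr, UpdateOp op )
    {
      T old;
      bool done = false;
      while ( !done ) {
        if ( Kokkos::Impl::lock_address_cuda_space( (void*) ptr ) ) {
          old  = *ptr;
          *ptr = op( old );  // safe: this thread holds ptr's hash-slot lock
          Kokkos::Impl::unlock_address_cuda_space( (void*) ptr );
          done = true;
        }
      }
      return old;
    }
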
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
index 0c8c700e8f..e2eab19e45 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
@@ -58,6 +58,7 @@
 #include 
 #include 
 #include 
+#include <Cuda/Kokkos_Cuda_Locks.hpp>
 #include 
 
 #if defined(KOKKOS_ENABLE_PROFILING)
@@ -65,6 +66,8 @@
 #include 
 #endif
 
+#include <KokkosExp_MDRangePolicy.hpp>
+
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
@@ -318,6 +321,7 @@ private:
   typedef Kokkos::RangePolicy< Traits ... > Policy;
   typedef typename Policy::member_type  Member ;
   typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::launch_bounds LaunchBounds ;
 
   const FunctorType  m_functor ;
   const Policy       m_policy ;
@@ -363,7 +367,7 @@ public:
       const dim3 block(  1 , CudaTraits::WarpSize * cuda_internal_maximum_warp_count(), 1);
       const dim3 grid( std::min( ( nwork + block.y - 1 ) / block.y , cuda_internal_maximum_grid_count() ) , 1 , 1);
 
-      CudaParallelLaunch< ParallelFor >( *this , grid , block , 0 );
+      CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
     }
 
   ParallelFor( const FunctorType  & arg_functor ,
@@ -373,6 +377,115 @@ public:
     { }
 };
 
+
+// MDRangePolicy impl
+template< class FunctorType , class ... Traits >
+class ParallelFor< FunctorType
+                 , Kokkos::Experimental::MDRangePolicy< Traits ... >
+                 , Kokkos::Cuda
+                 >
+{
+private:
+  typedef Kokkos::Experimental::MDRangePolicy< Traits ...  > Policy ;
+  using RP = Policy;
+  typedef typename Policy::array_index_type array_index_type;
+  typedef typename Policy::index_type index_type;
+  typedef typename Policy::launch_bounds LaunchBounds;
+
+
+  const FunctorType m_functor ;
+  const Policy      m_rp ;
+
+public:
+
+  inline
+  __device__
+  void operator()(void) const
+    {
+      Kokkos::Experimental::Impl::Refactor::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag>(m_rp,m_functor).exec_range();
+    }
+
+
+  inline
+  void execute() const
+  {
+    const array_index_type maxblocks = static_cast<array_index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+    if ( RP::rank == 2 )
+    {
+      const dim3 block( m_rp.m_tile[0] , m_rp.m_tile[1] , 1);
+      const dim3 grid(
+            std::min( ( m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1 ) / block.x , maxblocks )
+          , std::min( ( m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1 ) / block.y , maxblocks )
+          , 1
+          );
+      CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
+    }
+    else if ( RP::rank == 3 )
+    {
+      const dim3 block( m_rp.m_tile[0] , m_rp.m_tile[1] , m_rp.m_tile[2] );
+      const dim3 grid(
+          std::min( ( m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1 ) / block.x , maxblocks )
+        , std::min( ( m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1 ) / block.y , maxblocks )
+        , std::min( ( m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1 ) / block.z , maxblocks )
+        );
+      CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
+    }
+    else if ( RP::rank == 4 )
+    {
+      // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to threadIdx.z
+      const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2] , m_rp.m_tile[3] );
+      const dim3 grid(
+          std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
+                  , static_cast<index_type>(maxblocks) )
+        , std::min( ( m_rp.m_upper[2] - m_rp.m_lower[2] + block.y - 1 ) / block.y , maxblocks )
+        , std::min( ( m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1 ) / block.z , maxblocks )
+        );
+      CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
+    }
+    else if ( RP::rank == 5 )
+    {
+      // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to threadIdx.z
+      const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2]*m_rp.m_tile[3] , m_rp.m_tile[4] );
+      const dim3 grid(
+          std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
+                  , static_cast<index_type>(maxblocks) )
+        , std::min( static_cast<index_type>( m_rp.m_tile_end[2] * m_rp.m_tile_end[3] )
+                  , static_cast<index_type>(maxblocks) )
+        , std::min( ( m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1 ) / block.z , maxblocks )
+        );
+      CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
+    }
+    else if ( RP::rank == 6 )
+    {
+      // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to threadIdx.z
+      const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2]*m_rp.m_tile[3] , m_rp.m_tile[4]*m_rp.m_tile[5] );
+      const dim3 grid(
+          std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
+                  , static_cast<index_type>(maxblocks) )
+        ,  std::min( static_cast<index_type>( m_rp.m_tile_end[2] * m_rp.m_tile_end[3] )
+                  , static_cast<index_type>(maxblocks) )
+        , std::min( static_cast<index_type>( m_rp.m_tile_end[4] * m_rp.m_tile_end[5] )
+                  , static_cast<index_type>(maxblocks) )
+        );
+      CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
+    }
+    else
+    {
+      printf("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
+      Kokkos::abort("Aborting");
+    }
+
+  } //end execute
+
+//  inline
+  ParallelFor( const FunctorType & arg_functor
+             , Policy arg_policy )
+    : m_functor( arg_functor )
+    , m_rp(  arg_policy )
+    {}
+};
+
+
 template< class FunctorType , class ... Properties >
 class ParallelFor< FunctorType
                  , Kokkos::TeamPolicy< Properties ... >
@@ -384,6 +497,7 @@ private:
   typedef TeamPolicyInternal< Kokkos::Cuda , Properties ... >   Policy ;
   typedef typename Policy::member_type  Member ;
   typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::launch_bounds  LaunchBounds ;
 
 public:
 
@@ -430,15 +544,15 @@ public:
     if ( m_scratch_size[1]>0 ) {
       __shared__ int base_thread_id;
       if (threadIdx.x==0 && threadIdx.y==0 ) {
-        threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % kokkos_impl_cuda_lock_arrays.n;
+        threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % Kokkos::Impl::g_device_cuda_lock_arrays.n;
         threadid = ((threadid + blockDim.x * blockDim.y-1)/(blockDim.x * blockDim.y)) * blockDim.x * blockDim.y;
-        if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
+        if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
         int done = 0;
         while (!done) {
-          done = (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[threadid],0,1));
+          done = (0 == atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid],0,1));
           if(!done) {
             threadid += blockDim.x * blockDim.y;
-            if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid = 0;
+            if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid = 0;
           }
         }
         base_thread_id = threadid;
@@ -448,7 +562,8 @@ public:
     }
 
 
-    for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
+    const int int_league_size = (int)m_league_size;
+    for ( int league_rank = blockIdx.x ; league_rank < int_league_size ; league_rank += gridDim.x ) {
 
       this-> template exec_team< WorkTag >(
         typename Policy::member_type( kokkos_impl_cuda_shared_memory<void>()
@@ -462,7 +577,7 @@ public:
     if ( m_scratch_size[1]>0 ) {
       __syncthreads();
       if (threadIdx.x==0 && threadIdx.y==0 )
-        kokkos_impl_cuda_lock_arrays.atomic[threadid]=0;
+        Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid]=0;
     }
   }
 
@@ -473,7 +588,7 @@ public:
       const dim3 grid( int(m_league_size) , 1 , 1 );
       const dim3 block( int(m_vector_size) , int(m_team_size) , 1 );
 
-      CudaParallelLaunch< ParallelFor >( *this, grid, block, shmem_size_total ); // copy to device and execute
+      CudaParallelLaunch< ParallelFor, LaunchBounds >( *this, grid, block, shmem_size_total ); // copy to device and execute
 
     }
 
@@ -529,6 +644,7 @@ private:
   typedef typename Policy::WorkRange    WorkRange ;
   typedef typename Policy::work_tag     WorkTag ;
   typedef typename Policy::member_type  Member ;
+  typedef typename Policy::launch_bounds LaunchBounds ;
 
   typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
   typedef typename ReducerConditional::type ReducerTypeFwd;
@@ -563,6 +679,7 @@ private:
   typedef int DummySHMEMReductionType;
 
 public:
+  // Make the exec_range calls dispatch to Reduce::DeviceIterateTile
   template< class TagType >
   __device__ inline
   typename std::enable_if< std::is_same< TagType , void >::value >::type
@@ -686,7 +803,7 @@ public:
 
       const int shmem = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( m_functor , block.y );
 
-      CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem ); // copy to device and execute
+      CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
 
       Cuda::fence();
 
@@ -737,6 +854,232 @@ public:
   { }
 };
 
+
+// MDRangePolicy impl
+template< class FunctorType , class ReducerType, class ... Traits >
+class ParallelReduce< FunctorType
+                    , Kokkos::Experimental::MDRangePolicy< Traits ... >
+                    , ReducerType
+                    , Kokkos::Cuda
+                    >
+{
+private:
+
+  typedef Kokkos::Experimental::MDRangePolicy< Traits ... > Policy ;
+  typedef typename Policy::array_index_type                 array_index_type;
+  typedef typename Policy::index_type                       index_type;
+
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::member_type  Member ;
+  typedef typename Policy::launch_bounds LaunchBounds;
+
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd, WorkTag > ValueInit ;
+  typedef Kokkos::Impl::FunctorValueJoin<   ReducerTypeFwd, WorkTag > ValueJoin ;
+
+public:
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::value_type      value_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+  typedef FunctorType                           functor_type ;
+  typedef Cuda::size_type                       size_type ;
+
+  // Algorithmic constraints: blockSize is a power of two AND blockDim.y == blockDim.z == 1
+
+  const FunctorType   m_functor ;
+  const Policy        m_policy ; // used for workrange and nwork
+  const ReducerType   m_reducer ;
+  const pointer_type  m_result_ptr ;
+  size_type *         m_scratch_space ;
+  size_type *         m_scratch_flags ;
+  size_type *         m_unified_space ;
+
+  typedef typename Kokkos::Experimental::Impl::Reduce::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag,reference_type> DeviceIteratePattern;
+
+  // Use the shfl-based reduction only for statically sized value types of more than 128 bits
+  enum { UseShflReduction = ((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) };
+  // Some crutch to do function overloading
+private:
+  typedef double DummyShflReductionType;
+  typedef int DummySHMEMReductionType;
+
+public:
+  inline
+  __device__
+  void
+  exec_range( reference_type update ) const
+  {
+    Kokkos::Experimental::Impl::Reduce::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag,reference_type>(m_policy, m_functor, update).exec_range();
+  }
+
+  inline
+  __device__
+  void operator() (void) const {
+    run(Kokkos::Impl::if_c< UseShflReduction, DummyShflReductionType, DummySHMEMReductionType >::select(1,1.0) );
+  }
+
+  __device__ inline
+  void run(const DummySHMEMReductionType& ) const
+  {
+    const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
+      word_count( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) / sizeof(size_type) );
+
+    {
+      reference_type value =
+        ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value );
+
+      // Number of blocks is bounded so that the reduction can be limited to two passes.
+      // Each thread block is given an approximately equal amount of work to perform.
+      // Accumulate the values for this block.
+      // The accumulation ordering does not match the final pass, but is arithmetically equivalent.
+
+      this-> exec_range( value );
+    }
+
+    // Reduce with final value at blockDim.y - 1 location.
+    // Problem: non power-of-two blockDim
+    if ( cuda_single_inter_block_reduce_scan<false,ReducerTypeFwd,WorkTag>(
+           ReducerConditional::select(m_functor , m_reducer) , blockIdx.x , gridDim.x ,
+           kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags ) ) {
+
+      // This is the final block with the final result at the final threads' location
+      size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.y - 1 ) * word_count.value ;
+      size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
+
+      if ( threadIdx.y == 0 ) {
+        Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , shared );
+      }
+
+      if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); }
+
+      for ( unsigned i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i]; }
+    }
+  }
+
+  __device__ inline
+   void run(const DummyShflReductionType&) const
+   {
+
+     value_type value;
+     ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value);
+     // Number of blocks is bounded so that the reduction can be limited to two passes.
+     // Each thread block is given an approximately equal amount of work to perform.
+     // Accumulate the values for this block.
+     // The accumulation ordering does not match the final pass, but is arithmetically equivalent.
+
+     const Member work_part =
+       ( ( m_policy.m_num_tiles + ( gridDim.x - 1 ) ) / gridDim.x ); //portion of tiles handled by each block
+
+     this-> exec_range( value );
+
+     pointer_type const result = (pointer_type) (m_unified_space ? m_unified_space : m_scratch_space) ;
+
+     int max_active_thread = work_part < blockDim.y ? work_part:blockDim.y;
+     max_active_thread = (max_active_thread == 0)?blockDim.y:max_active_thread;
+
+     value_type init;
+     ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &init);
+     if(Impl::cuda_inter_block_reduction<FunctorType,ValueJoin,WorkTag>
+         (value,init,ValueJoin(ReducerConditional::select(m_functor , m_reducer)),m_scratch_space,result,m_scratch_flags,max_active_thread)) {
+       const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
+       if(id==0) {
+         Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , (void*) &value );
+         *result = value;
+       }
+     }
+   }
+
+  // Determine block size constrained by shared memory:
+  static inline
+  unsigned local_block_size( const FunctorType & f )
+    {
+      unsigned n = CudaTraits::WarpSize * 8 ;
+      while ( n && CudaTraits::SharedMemoryCapacity < cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( f , n ) ) { n >>= 1 ; }
+      return n ;
+    }
+
+  inline
+  void execute()
+    {
+      const int nwork = m_policy.m_num_tiles;
+      if ( nwork ) {
+        int block_size = m_policy.m_prod_tile_dims;
+        // CONSTRAINT: Algorithm requires block_size >= product of tile dimensions
+        // Nearest power of two
+        int exponent_pow_two = std::ceil( std::log2(block_size) );
+        block_size = std::pow(2, exponent_pow_two);
+        int suggested_blocksize = local_block_size( m_functor );
+
+        block_size = (block_size > suggested_blocksize) ? block_size : suggested_blocksize ; //Note: block_size must be less than or equal to 512
+
+
+        m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) * block_size /* block_size == max block_count */ );
+        m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
+        m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) );
+
+        // REQUIRED ( 1 , N , 1 )
+        const dim3 block( 1 , block_size , 1 );
+        // Required grid.x <= block.y
+        const dim3 grid( std::min( int(block.y) , int( nwork ) ) , 1 , 1 );
+
+      const int shmem = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( m_functor , block.y );
+
+      CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
+
+      Cuda::fence();
+
+      if ( m_result_ptr ) {
+        if ( m_unified_space ) {
+          const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer)  );
+          for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
+        }
+        else {
+          const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer)  );
+          DeepCopy<HostSpace,CudaSpace>( m_result_ptr , m_scratch_space , size );
+        }
+      }
+    }
+    else {
+      if (m_result_ptr) {
+        ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , m_result_ptr );
+      }
+    }
+  }
+
+  template< class HostViewType >
+  ParallelReduce( const FunctorType  & arg_functor
+                , const Policy       & arg_policy
+                , const HostViewType & arg_result
+                , typename std::enable_if<
+                   Kokkos::is_view< HostViewType >::value
+                ,void*>::type = NULL)
+  : m_functor( arg_functor )
+  , m_policy(  arg_policy )
+  , m_reducer( InvalidType() )
+  , m_result_ptr( arg_result.ptr_on_device() )
+  , m_scratch_space( 0 )
+  , m_scratch_flags( 0 )
+  , m_unified_space( 0 )
+  {}
+
+  ParallelReduce( const FunctorType  & arg_functor
+                , const Policy       & arg_policy
+                , const ReducerType & reducer)
+  : m_functor( arg_functor )
+  , m_policy(  arg_policy )
+  , m_reducer( reducer )
+  , m_result_ptr( reducer.view().ptr_on_device() )
+  , m_scratch_space( 0 )
+  , m_scratch_flags( 0 )
+  , m_unified_space( 0 )
+  {}
+};
+
+
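For orientation, the net effect of this new specialization: a scalar reduction over a multi-dimensional index range now dispatches natively on Cuda. A minimal user-level sketch (the view, extents, tile sizes, and function name below are illustrative, not part of the patch):

```cpp
#include <Kokkos_Core.hpp>

// Sketch: sum the entries of an n0 x n1 device view. Tiles of {16,16}
// keep the tile product (256) under the 512-thread limit enforced in
// KokkosExp_MDRangePolicy.hpp further down in this patch.
double sum_2d( int n0, int n1,
               Kokkos::View<double**, Kokkos::CudaSpace> a )
{
  double sum = 0.0;
  Kokkos::Experimental::MDRangePolicy<
      Kokkos::Cuda, Kokkos::Experimental::Rank<2> >
    policy( {0, 0}, {n0, n1}, {16, 16} );
  Kokkos::parallel_reduce( policy,
    KOKKOS_LAMBDA( const int i, const int j, double & partial ) {
      partial += a(i, j);  // per-thread partial, combined by the two-pass scheme above
    }, sum );
  return sum;
}
```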
 //----------------------------------------------------------------------------
 
 #if 1
@@ -753,6 +1096,7 @@ private:
   typedef TeamPolicyInternal< Kokkos::Cuda, Properties ... >  Policy ;
   typedef typename Policy::member_type  Member ;
   typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::launch_bounds     LaunchBounds ;
 
   typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
   typedef typename ReducerConditional::type ReducerTypeFwd;
@@ -819,15 +1163,15 @@ public:
     if ( m_scratch_size[1]>0 ) {
       __shared__ int base_thread_id;
       if (threadIdx.x==0 && threadIdx.y==0 ) {
-        threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % kokkos_impl_cuda_lock_arrays.n;
+        threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % Kokkos::Impl::g_device_cuda_lock_arrays.n;
         threadid = ((threadid + blockDim.x * blockDim.y-1)/(blockDim.x * blockDim.y)) * blockDim.x * blockDim.y;
-        if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
+        if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
         int done = 0;
         while (!done) {
-          done = (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[threadid],0,1));
+          done = (0 == atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid],0,1));
           if(!done) {
             threadid += blockDim.x * blockDim.y;
-            if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid = 0;
+            if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid = 0;
           }
         }
         base_thread_id = threadid;
@@ -840,7 +1184,7 @@ public:
     if ( m_scratch_size[1]>0 ) {
       __syncthreads();
       if (threadIdx.x==0 && threadIdx.y==0 )
-        kokkos_impl_cuda_lock_arrays.atomic[threadid]=0;
+        Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid]=0;
     }
   }
 
@@ -854,7 +1198,8 @@ public:
       ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value );
 
     // Iterate this block through the league
-    for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
+    const int int_league_size = (int)m_league_size;
+    for ( int league_rank = blockIdx.x ; league_rank < int_league_size ; league_rank += gridDim.x ) {
       this-> template exec_team< WorkTag >
         ( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
                                         , m_shmem_begin
@@ -894,7 +1239,8 @@ public:
     ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value);
 
     // Iterate this block through the league
-    for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
+    const int int_league_size = (int)m_league_size;
+    for ( int league_rank = blockIdx.x ; league_rank < int_league_size ; league_rank += gridDim.x ) {
       this-> template exec_team< WorkTag >
         ( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
                                         , m_shmem_begin
@@ -936,7 +1282,7 @@ public:
         const dim3 grid( block_count , 1 , 1 );
         const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size ;
 
-        CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem_size_total ); // copy to device and execute
+        CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem_size_total ); // copy to device and execute
 
         Cuda::fence();
 
@@ -975,12 +1321,6 @@ public:
   , m_shmem_begin( 0 )
   , m_shmem_size( 0 )
   , m_scratch_ptr{NULL,NULL}
-  , m_league_size( arg_policy.league_size() )
-  , m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
-      Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
-                                                               arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
-                                                               arg_policy.vector_length() )
-  , m_vector_size( arg_policy.vector_length() )
   , m_scratch_size{
     arg_policy.scratch_size(0,( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
         Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
@@ -991,6 +1331,12 @@ public:
                                                                  arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
                                                                  arg_policy.vector_length() )
         )}
+  , m_league_size( arg_policy.league_size() )
+  , m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
+      Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
+                                                               arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
+                                                               arg_policy.vector_length() )
+  , m_vector_size( arg_policy.vector_length() )
   {
     // Return Init value if the number of worksets is zero
     if( arg_policy.league_size() == 0) {
@@ -1150,6 +1496,7 @@ private:
   typedef typename reducer_type<>::pointer_type    pointer_type ;
   typedef typename reducer_type<>::reference_type  reference_type ;
   typedef typename reducer_type<>::value_type      value_type ;
+  typedef typename Policy::launch_bounds           LaunchBounds ;
 
   typedef Kokkos::Impl::FunctorAnalysis
     < Kokkos::Impl::FunctorPatternInterface::REDUCE
@@ -1273,7 +1620,7 @@ public:
         const int  shmem = m_shmem_team_begin + m_shmem_team_size ;
 
         // copy to device and execute
-        CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem );
+        CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem );
 
         Cuda::fence();
 
@@ -1373,7 +1720,7 @@ public:
 
     if ( CudaTraits::WarpSize < team_threads ) {
       // Need inter-warp team reduction (collectives) shared memory
-      // Speculate an upper bound for the value size 
+      // Speculate an upper bound for the value size
 
       m_shmem_team_begin =
         align_scratch( CudaTraits::warp_count(team_threads) * sizeof(double) );
@@ -1426,7 +1773,7 @@ public:
 
     // Reduce space has claim flag followed by value buffer
     const int global_reduce_value_size =
-      max_concurrent_block * 
+      max_concurrent_block *
       ( aligned_flag_size + align_scratch( value_size ) );
 
     // Scratch space has claim flag followed by scratch buffer
@@ -1469,6 +1816,7 @@ private:
   typedef typename Policy::member_type  Member ;
   typedef typename Policy::work_tag     WorkTag ;
   typedef typename Policy::WorkRange    WorkRange ;
+  typedef typename Policy::launch_bounds  LaunchBounds ;
 
   typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
   typedef Kokkos::Impl::FunctorValueInit<   FunctorType, WorkTag > ValueInit ;
@@ -1655,10 +2003,10 @@ public:
         const int shmem = ValueTraits::value_size( m_functor ) * ( block_size + 2 );
 
         m_final = false ;
-        CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute
+        CudaParallelLaunch< ParallelScan, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
 
         m_final = true ;
-        CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute
+        CudaParallelLaunch< ParallelScan, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
       }
     }
 
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
index 432c7895cc..709cbbd534 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
@@ -151,7 +151,7 @@ template< class ValueType , class JoinOp>
 __device__
 inline void cuda_intra_warp_reduction( ValueType& result,
                                        const JoinOp& join,
-                                       const int max_active_thread = blockDim.y) {
+                                       const uint32_t max_active_thread = blockDim.y) {
 
   unsigned int shift = 1;
 
@@ -268,29 +268,33 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
         if( id + 1 < int(gridDim.x) )
           join(value, tmp);
       }
+      int active = __ballot(1);
       if (int(blockDim.x*blockDim.y) > 2) {
         value_type tmp = Kokkos::shfl_down(value, 2,32);
         if( id + 2 < int(gridDim.x) )
           join(value, tmp);
       }
+      active += __ballot(1);
       if (int(blockDim.x*blockDim.y) > 4) {
         value_type tmp = Kokkos::shfl_down(value, 4,32);
         if( id + 4 < int(gridDim.x) )
           join(value, tmp);
       }
+      active += __ballot(1);
       if (int(blockDim.x*blockDim.y) > 8) {
         value_type tmp = Kokkos::shfl_down(value, 8,32);
         if( id + 8 < int(gridDim.x) )
           join(value, tmp);
       }
+      active += __ballot(1);
       if (int(blockDim.x*blockDim.y) > 16) {
         value_type tmp = Kokkos::shfl_down(value, 16,32);
         if( id + 16 < int(gridDim.x) )
           join(value, tmp);
       }
+      active += __ballot(1);
     }
   }
-
   //The last block has in its thread=0 the global reduction value through "value"
   return last_block;
 #else
@@ -302,7 +306,7 @@ template< class ReducerType >
 __device__ inline
 typename std::enable_if< Kokkos::is_reducer::value >::type
 cuda_intra_warp_reduction( const ReducerType& reducer,
-                           const int max_active_thread = blockDim.y) {
+                           const uint32_t max_active_thread = blockDim.y) {
 
   typedef typename ReducerType::value_type ValueType;
 
@@ -428,26 +432,31 @@ cuda_inter_block_reduction( const ReducerType& reducer,
         if( id + 1 < int(gridDim.x) )
           reducer.join(value, tmp);
       }
+      int active = __ballot(1);
       if (int(blockDim.x*blockDim.y) > 2) {
         value_type tmp = Kokkos::shfl_down(value, 2,32);
         if( id + 2 < int(gridDim.x) )
           reducer.join(value, tmp);
       }
+      active += __ballot(1);
       if (int(blockDim.x*blockDim.y) > 4) {
         value_type tmp = Kokkos::shfl_down(value, 4,32);
         if( id + 4 < int(gridDim.x) )
           reducer.join(value, tmp);
       }
+      active += __ballot(1);
       if (int(blockDim.x*blockDim.y) > 8) {
         value_type tmp = Kokkos::shfl_down(value, 8,32);
         if( id + 8 < int(gridDim.x) )
           reducer.join(value, tmp);
       }
+      active += __ballot(1);
       if (int(blockDim.x*blockDim.y) > 16) {
         value_type tmp = Kokkos::shfl_down(value, 16,32);
         if( id + 16 < int(gridDim.x) )
           reducer.join(value, tmp);
       }
+      active += __ballot(1);
     }
   }
 
@@ -594,7 +603,7 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType     & functor ,
   typedef FunctorValueOps<    FunctorType , ArgTag >  ValueOps ;
 
   typedef typename ValueTraits::pointer_type    pointer_type ;
-  typedef typename ValueTraits::reference_type  reference_type ;
+  //typedef typename ValueTraits::reference_type  reference_type ;
 
   // '__ffs' = position of the least significant bit set to 1.
   // 'blockDim.y' is guaranteed to be a power of two so this
@@ -637,7 +646,7 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType     & functor ,
 
     {
       void * const shared_ptr = shared_data + word_count.value * threadIdx.y ;
-      reference_type shared_value = ValueInit::init( functor , shared_ptr );
+      /* reference_type shared_value = */ ValueInit::init( functor , shared_ptr );
 
       for ( size_type i = b ; i < e ; ++i ) {
         ValueJoin::join( functor , shared_ptr , global_data + word_count.value * i );
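The hunks above interleave `__ballot(1)` calls between the guarded shuffle steps; the ballot result is accumulated but otherwise unused, which reads as a workaround to keep the warp converged between steps. For orientation, a stripped-down sketch of the shuffle-down ladder these functions implement (illustrative only; assumes a 32-lane warp and a float payload):

```cpp
// Each step pulls the partial value from the lane `delta` positions higher
// and folds it in; after log2(32) steps lane 0 holds the warp-wide sum.
__device__ float warp_sum( float value )
{
  for ( int delta = 1 ; delta < 32 ; delta <<= 1 ) {
    value += __shfl_down( value , delta , 32 );
  }
  return value ; // meaningful on lane 0
}
```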
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp
index 3c6f0a5dda..5f08800c40 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp
@@ -58,25 +58,56 @@ template class TaskQueue< Kokkos::Cuda > ;
 
 //----------------------------------------------------------------------------
 
+#if defined( KOKKOS_DEBUG )
+
+__device__
+void verify_warp_convergence( const char * const where )
+{
+  const unsigned b = __ballot(1);
+
+  if ( b != ~0u ) {
+
+printf(" verify_warp_convergence( %s ) (%d,%d,%d) (%d,%d,%d) failed %x\n"
+      , where
+      , blockIdx.x
+      , blockIdx.y
+      , blockIdx.z
+      , threadIdx.x
+      , threadIdx.y
+      , threadIdx.z
+      , b );
+
+  }
+}
+
+#endif // #if defined( KOKKOS_DEBUG )
+
+//----------------------------------------------------------------------------
+
 __device__
 void TaskQueueSpecialization< Kokkos::Cuda >::driver
-  ( TaskQueueSpecialization< Kokkos::Cuda >::queue_type * const queue )
+  ( TaskQueueSpecialization< Kokkos::Cuda >::queue_type * const queue 
+  , int32_t shmem_per_warp )
 {
   using Member = TaskExec< Kokkos::Cuda > ;
   using Queue  = TaskQueue< Kokkos::Cuda > ;
-  using task_root_type = TaskBase< Kokkos::Cuda , void , void > ;
+  using task_root_type = TaskBase< void , void , void > ;
+
+  extern __shared__ int32_t shmem_all[];
 
   task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
 
-  Member single_exec( 1 );
-  Member team_exec( blockDim.y );
+  int32_t * const warp_shmem =
+    shmem_all + ( threadIdx.z * shmem_per_warp ) / sizeof(int32_t);
+
+  task_root_type * const task_shmem = (task_root_type *) warp_shmem ;
 
   const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x ;
 
-  union {
-    task_root_type * ptr ;
-    int              raw[2] ;
-  } task ;
+  Member single_exec( warp_shmem , 1 );
+  Member team_exec( warp_shmem , blockDim.y );
+
+  task_root_type * task_ptr ;
 
   // Loop until all queues are empty and no tasks in flight
 
@@ -87,41 +118,86 @@ void TaskQueueSpecialization< Kokkos::Cuda >::driver
 
     if ( 0 == warp_lane ) {
 
-      task.ptr = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
+      task_ptr = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
 
       // Loop by priority and then type
-      for ( int i = 0 ; i < Queue::NumQueue && end == task.ptr ; ++i ) {
-        for ( int j = 0 ; j < 2 && end == task.ptr ; ++j ) {
-          task.ptr = Queue::pop_ready_task( & queue->m_ready[i][j] );
+      for ( int i = 0 ; i < Queue::NumQueue && end == task_ptr ; ++i ) {
+        for ( int j = 0 ; j < 2 && end == task_ptr ; ++j ) {
+          task_ptr = Queue::pop_ready_task( & queue->m_ready[i][j] );
         }
       }
 
 #if 0
 printf("TaskQueue::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
-      , uintptr_t(task.ptr));
+      , uintptr_t(task_ptr));
 #endif
 
     }
 
     // shuffle broadcast
 
-    task.raw[0] = __shfl( task.raw[0] , 0 );
-    task.raw[1] = __shfl( task.raw[1] , 0 );
+    ((int*) & task_ptr )[0] = __shfl( ((int*) & task_ptr )[0] , 0 );
+    ((int*) & task_ptr )[1] = __shfl( ((int*) & task_ptr )[1] , 0 );
 
-    if ( 0 == task.ptr ) break ; // 0 == queue->m_ready_count
+#if defined( KOKKOS_DEBUG )
+    verify_warp_convergence("task_ptr");
+#endif
 
-    if ( end != task.ptr ) {
-      if ( task_root_type::TaskTeam == task.ptr->m_task_type ) {
+    if ( 0 == task_ptr ) break ; // 0 == queue->m_ready_count
+
+    if ( end != task_ptr ) {
+
+      // The whole warp copies the task's closure to/from shared memory.
+      // Use all threads of warp for coalesced read/write.
+
+      int32_t const b = sizeof(task_root_type) / sizeof(int32_t);
+      int32_t const e = *((int32_t volatile *)( & task_ptr->m_alloc_size )) / sizeof(int32_t);
+
+      int32_t volatile * const task_mem = (int32_t volatile *) task_ptr ;
+
+      // copy global to shared memory:
+
+      for ( int32_t i = warp_lane ; i < e ; i += CudaTraits::WarpSize ) {
+        warp_shmem[i] = task_mem[i] ;
+      }
+
+      Kokkos::memory_fence();
+
+      // Copy done - use memory fence so that memory writes are visible.
+      // For reliable warp convergence on Pascal and Volta an explicit
+      // warp level synchronization will also be required.
+
+      if ( task_root_type::TaskTeam == task_shmem->m_task_type ) {
         // Thread Team Task
-        (*task.ptr->m_apply)( task.ptr , & team_exec );
+        (*task_shmem->m_apply)( task_shmem , & team_exec );
       }
       else if ( 0 == threadIdx.y ) {
         // Single Thread Task
-        (*task.ptr->m_apply)( task.ptr , & single_exec );
+        (*task_shmem->m_apply)( task_shmem , & single_exec );
       }
 
+      // copy shared to global memory:
+
+      for ( int32_t i = b + warp_lane ; i < e ; i += CudaTraits::WarpSize ) {
+        task_mem[i] = warp_shmem[i] ;
+      }
+
+      Kokkos::memory_fence();
+
+#if defined( KOKKOS_DEBUG )
+    verify_warp_convergence("apply");
+#endif
+
+      // If a respawn was requested, copy the respawn data back to main memory
+
       if ( 0 == warp_lane ) {
-        queue->complete( task.ptr );
+
+        if ( ((task_root_type *) task_root_type::LockTag) != task_shmem->m_next ) {
+          ( (volatile task_root_type *) task_ptr )->m_next = task_shmem->m_next ;
+          ( (volatile task_root_type *) task_ptr )->m_priority = task_shmem->m_priority ;
+        }
+
+        queue->complete( task_ptr );
       }
     }
   } while(1);
@@ -130,18 +206,20 @@ printf("TaskQueue::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
 namespace {
 
 __global__
-void cuda_task_queue_execute( TaskQueue< Kokkos::Cuda > * queue )
-{ TaskQueueSpecialization< Kokkos::Cuda >::driver( queue ); }
+void cuda_task_queue_execute( TaskQueue< Kokkos::Cuda > * queue 
+                            , int32_t shmem_size )
+{ TaskQueueSpecialization< Kokkos::Cuda >::driver( queue , shmem_size ); }
 
 }
 
 void TaskQueueSpecialization< Kokkos::Cuda >::execute
   ( TaskQueue< Kokkos::Cuda > * const queue )
 {
+  const int shared_per_warp = 2048 ;
   const int warps_per_block = 4 ;
   const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
   const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block );
-  const int shared = 0 ;
+  const int shared_total = shared_per_warp * warps_per_block ;
   const cudaStream_t stream = 0 ;
 
   CUDA_SAFE_CALL( cudaDeviceSynchronize() );
@@ -159,7 +237,7 @@ printf("cuda_task_queue_execute before\n");
   //
   // CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , stack_size ) );
 
-  cuda_task_queue_execute<<< grid , block , shared , stream >>>( queue );
+  cuda_task_queue_execute<<< grid , block , shared_total , stream >>>( queue , shared_per_warp );
 
   CUDA_SAFE_CALL( cudaGetLastError() );
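The driver now stages each task closure in a per-warp shared-memory buffer, so the launch reserves a fixed budget; the arithmetic implied by `execute()` above is simply:

```cpp
// Sketch of the shared-memory budget set up in execute():
const int shared_per_warp = 2048;                              // bytes of closure buffer per warp
const int warps_per_block = 4;                                 // blockDim.z in the launch
const int shared_total    = shared_per_warp * warps_per_block; // = 8192 bytes of dynamic shmem per block
```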
 
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
index 5d08219ea5..4a52985d29 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
@@ -57,7 +57,7 @@ namespace {
 template< typename TaskType >
 __global__
 void set_cuda_task_base_apply_function_pointer
-  ( TaskBase< Kokkos::Cuda , void , void >::function_type * ptr )
+  ( TaskBase< void , void , void >::function_type * ptr )
 { *ptr = TaskType::apply ; }
 
 }
@@ -78,7 +78,7 @@ public:
   void iff_single_thread_recursive_execute( queue_type * const ) {}
 
   __device__
-  static void driver( queue_type * const );
+  static void driver( queue_type * const , int32_t );
 
   static
   void execute( queue_type * const );
@@ -106,7 +106,14 @@ public:
 
 extern template class TaskQueue< Kokkos::Cuda > ;
 
+}} /* namespace Kokkos::Impl */
+
 //----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
 /**\brief  Impl::TaskExec is the TaskScheduler::member_type
  *         passed to tasks running in a Cuda space.
  *
@@ -134,11 +141,13 @@ private:
   friend class Kokkos::Impl::TaskQueue< Kokkos::Cuda > ;
   friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Cuda > ;
 
+  int32_t * m_team_shmem ;
   const int m_team_size ;
 
   __device__
-  TaskExec( int arg_team_size = blockDim.y )
-    : m_team_size( arg_team_size ) {}
+  TaskExec( int32_t * arg_team_shmem , int arg_team_size = blockDim.y )
+    : m_team_shmem( arg_team_shmem )
+    , m_team_size( arg_team_size ) {}
 
 public:
 
@@ -154,7 +163,13 @@ public:
 
 };
 
+}} /* namespace Kokkos::Impl */
+
 //----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
 
 template< typename iType >
 struct TeamThreadRangeBoundariesStruct< iType , TaskExec< Kokkos::Cuda > >
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp
index 084daa098b..3f3d85ecd1 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp
@@ -106,7 +106,7 @@ private:
   typedef Kokkos::Cuda                           execution_space ;
   typedef execution_space::scratch_memory_space  scratch_memory_space ;
 
-  void                * m_team_reduce ;
+  mutable void        * m_team_reduce ;
   scratch_memory_space  m_team_shared ;
   int                   m_team_reduce_size ;
   int                   m_league_rank ;
@@ -166,7 +166,7 @@ public:
       if ( 1 == blockDim.z ) { // team == block
         __syncthreads();
         // Wait for shared data write until all threads arrive here
-        if ( threadIdx.x == 0 && threadIdx.y == thread_id ) {
+        if ( threadIdx.x == 0u && threadIdx.y == (uint32_t)thread_id ) {
           *((ValueType*) m_team_reduce) = val ;
         }
         __syncthreads(); // Wait for shared data read until root thread writes
@@ -210,7 +210,7 @@ public:
       const int wx =
         ( threadIdx.x + blockDim.x * threadIdx.y ) & CudaTraits::WarpIndexMask ;
 
-      for ( int i = CudaTraits::WarpSize ; blockDim.x <= ( i >>= 1 ) ; ) {
+      for ( int i = CudaTraits::WarpSize ; (int)blockDim.x <= ( i >>= 1 ) ; ) {
 
         cuda_shfl_down( reducer.reference() , tmp , i , CudaTraits::WarpSize );
 
@@ -354,7 +354,7 @@ public:
 
       for ( int i = blockDim.x ; ( i >>= 1 ) ; ) {
         cuda_shfl_down( reducer.reference() , tmp , i , blockDim.x );
-        if ( threadIdx.x < i ) { reducer.join( tmp , reducer.reference() ); }
+        if ( (int)threadIdx.x < i ) { reducer.join( tmp , reducer.reference() ); }
       }
 
       // Broadcast from root lane to all other lanes.
@@ -410,7 +410,7 @@ public:
 
         value_type tmp( reducer.reference() );
 
-        for ( int i = CudaTraits::WarpSize ; blockDim.x <= ( i >>= 1 ) ; ) {
+        for ( int i = CudaTraits::WarpSize ; (int)blockDim.x <= ( i >>= 1 ) ; ) {
 
           cuda_shfl_down( reducer.reference(), tmp, i, CudaTraits::WarpSize );
 
@@ -479,7 +479,7 @@ public:
 
           __threadfence(); // Wait until global write is visible.
 
-          last_block = gridDim.x ==
+          last_block = (int)gridDim.x ==
                        1 + Kokkos::atomic_fetch_add(global_scratch_flags,1);
 
           // If last block then reset count
@@ -509,7 +509,7 @@ public:
         reducer.copy( ((pointer_type)shmem) + offset
                     , ((pointer_type)global_scratch_space) + offset );
 
-        for ( int i = nentry + tid ; i < gridDim.x ; i += nentry ) {
+        for ( int i = nentry + tid ; i < (int)gridDim.x ; i += nentry ) {
           reducer.join( ((pointer_type)shmem) + offset
                       , ((pointer_type)global_scratch_space)
                         + i * reducer.length() );
@@ -576,6 +576,14 @@ public:
     , m_league_size( arg_league_size )
     {}
 
+public:
+  // Declared to avoid unused-private-member warnings, which are triggered
+  // when SFINAE excludes the member function that uses these variables.
+  // Making another class a friend would also suppress these warnings.
+  bool impl_avoid_sfinae_warning() const noexcept
+  {
+    return m_team_reduce_size > 0 && m_team_reduce != nullptr;
+  }
 };
 
 } // namespace Impl
@@ -913,10 +921,10 @@ void parallel_scan
     //  [t] += [t-4] if t >= 4
     //  ...
 
-    for ( int j = 1 ; j < blockDim.x ; j <<= 1 ) {
+    for ( int j = 1 ; j < (int)blockDim.x ; j <<= 1 ) {
       value_type tmp = 0 ;
       Impl::cuda_shfl_up( tmp , sval , j , blockDim.x );
-      if ( j <= threadIdx.x ) { sval += tmp ; }
+      if ( j <= (int)threadIdx.x ) { sval += tmp ; }
     }
 
     // Include accumulation and remove value for exclusive scan:
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp
new file mode 100644
index 0000000000..e11ae4798f
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp
@@ -0,0 +1,133 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_UNIQUE_TOKEN_HPP
+#define KOKKOS_CUDA_UNIQUE_TOKEN_HPP
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_CUDA
+
+#include <Kokkos_CudaSpace.hpp>
+#include <Kokkos_UniqueToken.hpp>
+#include <impl/Kokkos_SharedAlloc.hpp>
+#include <impl/Kokkos_ConcurrentBitset.hpp>
+
+namespace Kokkos { namespace Experimental {
+
+// both global and instance Unique Tokens are implemented in the same way
+template<>
+class UniqueToken< Cuda, UniqueTokenScope::Global >
+{
+private:
+
+  uint32_t volatile * m_buffer ;
+  uint32_t            m_count ;
+
+public:
+
+  using execution_space = Cuda;
+
+  explicit
+  UniqueToken( execution_space const& );
+
+  KOKKOS_INLINE_FUNCTION
+  UniqueToken() : m_buffer(0), m_count(0) {}
+
+  KOKKOS_INLINE_FUNCTION
+  UniqueToken( const UniqueToken & ) = default;
+
+  KOKKOS_INLINE_FUNCTION
+  UniqueToken( UniqueToken && )      = default;
+
+  KOKKOS_INLINE_FUNCTION
+  UniqueToken & operator=( const UniqueToken & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  UniqueToken & operator=( UniqueToken && ) = default ;
+
+  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
+  KOKKOS_INLINE_FUNCTION
+  int32_t size() const noexcept { return m_count ; }
+
+  /// \brief acquire value such that 0 <= value < size()
+  KOKKOS_INLINE_FUNCTION
+  int32_t acquire() const
+  {
+    const Kokkos::pair<int,int> result =
+      Kokkos::Impl::concurrent_bitset::
+        acquire_bounded( m_buffer
+                       , m_count
+                       , Kokkos::Impl::clock_tic() % m_count
+                       );
+
+    if ( result.first < 0 ) {
+      Kokkos::abort("UniqueToken<Cuda> failure to acquire tokens, no tokens available" );
+    }
+
+    return result.first;
+  }
+
+  /// \brief release an acquired value
+  KOKKOS_INLINE_FUNCTION
+  void release( int32_t i ) const noexcept
+  {
+    Kokkos::Impl::concurrent_bitset::release( m_buffer, i );
+  }
+};
+
+template<>
+class UniqueToken< Cuda, UniqueTokenScope::Instance >
+  : public UniqueToken< Cuda, UniqueTokenScope::Global >
+{
+public:
+
+  explicit
+  UniqueToken( execution_space const& arg )
+    : UniqueToken< Cuda, UniqueTokenScope::Global >( arg ) {}
+};
+
+}} // namespace Kokkos::Experimental
+
+#endif // KOKKOS_ENABLE_CUDA
+#endif // KOKKOS_CUDA_UNIQUE_TOKEN_HPP
+
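A minimal usage sketch for the new token pool (the view, extents, and functor body are illustrative; `acquire()`/`release()` are the interface defined above):

```cpp
#include <Kokkos_Core.hpp>

// Sketch: give each concurrently executing iteration exclusive use of one
// scratch slot. 'scratch' must have extent >= token.size().
void use_tokens( int n, Kokkos::View<int*, Kokkos::CudaSpace> scratch )
{
  Kokkos::Experimental::UniqueToken< Kokkos::Cuda > token( Kokkos::Cuda() );
  Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Cuda >( 0, n ),
    KOKKOS_LAMBDA( const int i ) {
      const int32_t id = token.acquire();  // 0 <= id < token.size()
      scratch( id ) += i;                  // slot 'id' is exclusively ours
      token.release( id );                 // hand the slot back
    } );
}
```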
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp
index f5e2d87fb6..d641622bb6 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp
@@ -221,7 +221,6 @@ struct CudaLDGFetch {
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
-namespace Experimental {
 namespace Impl {
 
 /** \brief  Replace Default ViewDataHandle with Cuda texture fetch specialization
@@ -294,9 +293,8 @@ public:
     }
 };
 
-}
-}
-}
+} // namespace Impl
+} // namespace Kokkos
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
new file mode 100644
index 0000000000..99778c64b1
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
@@ -0,0 +1,119 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_WORKGRAPHPOLICY_HPP
+#define KOKKOS_CUDA_WORKGRAPHPOLICY_HPP
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class ... Traits >
+class ParallelFor< FunctorType ,
+                   Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
+                   Kokkos::Cuda
+                 >
+  : public Kokkos::Impl::Experimental::
+           WorkGraphExec< FunctorType,
+                          Kokkos::Cuda,
+                          Traits ...
+                        >
+{
+public:
+
+  typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... >   Policy ;
+  typedef Kokkos::Impl::Experimental::
+          WorkGraphExec< FunctorType, Kokkos::Cuda, Traits ... > Base ;
+  typedef ParallelFor< FunctorType,
+                       Kokkos::Experimental::WorkGraphPolicy< Traits ... >,
+                       Kokkos::Cuda > Self ;
+
+private:
+
+  template< class TagType >
+  __device__
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_one(const typename Policy::member_type& i) const {
+    Base::m_functor( i );
+  }
+
+  template< class TagType >
+  __device__
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_one(const typename Policy::member_type& i) const {
+    const TagType t{} ;
+    Base::m_functor( t , i );
+  }
+
+public:
+
+  __device__
+  inline
+  void operator()() const {
+    for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
+      exec_one< typename Policy::work_tag >( i );
+      Base::after_work(i);
+    }
+  }
+
+  inline
+  void execute()
+  {
+    const int warps_per_block = 4 ;
+    const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
+    const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block );
+    const int shared = 0 ;
+    const cudaStream_t stream = 0 ;
+
+    Kokkos::Impl::CudaParallelLaunch<Self>(*this, grid, block, shared, stream);
+  }
+
+  inline
+  ParallelFor( const FunctorType & arg_functor
+             , const Policy      & arg_policy )
+    : Base( arg_functor, arg_policy )
+  {
+  }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif /* #define KOKKOS_CUDA_WORKGRAPHPOLICY_HPP */
diff --git a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
index 4f68d9c2c0..6ef7443a14 100644
--- a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
+++ b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
@@ -52,6 +52,7 @@
 
 #if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
 #include<Cuda/KokkosExp_Cuda_IterateTile.hpp>
+#include <Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp>
 #endif
 
 namespace Kokkos { namespace Experimental {
@@ -120,28 +121,17 @@ struct MDRangePolicy
                                        , typename traits::index_type
                                        > ;
 
+  typedef MDRangePolicy execution_policy; // needed for is_execution_space interrogation
+
   static_assert( !std::is_same<typename traits::iteration_pattern,void>::value
                , "Kokkos Error: MD iteration pattern not defined" );
 
   using iteration_pattern   = typename traits::iteration_pattern;
   using work_tag            = typename traits::work_tag;
+  using launch_bounds       = typename traits::launch_bounds;
+  using member_type = typename range_policy::member_type;
 
-  static constexpr int rank = iteration_pattern::rank;
-
-  static constexpr int outer_direction = static_cast<int> (
-      (iteration_pattern::outer_direction != Iterate::Default)
-    ? iteration_pattern::outer_direction
-    : default_outer_direction< typename traits::execution_space>::value );
-
-  static constexpr int inner_direction = static_cast<int> (
-      iteration_pattern::inner_direction != Iterate::Default
-    ? iteration_pattern::inner_direction
-    : default_inner_direction< typename traits::execution_space>::value ) ;
-
-
-  // Ugly ugly workaround intel 14 not handling scoped enum correctly
-  static constexpr int Right = static_cast<int>( Iterate::Right );
-  static constexpr int Left  = static_cast<int>( Iterate::Left );
+  enum { rank = static_cast<int>(iteration_pattern::rank) };
 
   using index_type  = typename traits::index_type;
   using array_index_type = long;
@@ -155,11 +145,50 @@ struct MDRangePolicy
   // This would require the user to either pass a matching index_type parameter
   // as template parameter to the MDRangePolicy or static_cast the individual values
 
+  point_type m_lower;
+  point_type m_upper;
+  tile_type  m_tile;
+  point_type m_tile_end;
+  index_type m_num_tiles;
+  index_type m_prod_tile_dims;
+
+/*
+  // NDE enum impl definition alternative - replace static constexpr int ? 
+  enum { outer_direction = static_cast<int> (
+      (iteration_pattern::outer_direction != Iterate::Default)
+    ? iteration_pattern::outer_direction
+    : default_outer_direction< typename traits::execution_space>::value ) };
+
+  enum { inner_direction = static_cast<int> (
+      iteration_pattern::inner_direction != Iterate::Default
+    ? iteration_pattern::inner_direction
+    : default_inner_direction< typename traits::execution_space>::value ) };
+
+  enum { Right = static_cast<int>( Iterate::Right ) };
+  enum { Left  = static_cast<int>( Iterate::Left ) };
+*/
+  //static constexpr int rank = iteration_pattern::rank;
+
+  static constexpr int outer_direction = static_cast<int> (
+      (iteration_pattern::outer_direction != Iterate::Default)
+    ? iteration_pattern::outer_direction
+    : default_outer_direction< typename traits::execution_space>::value );
+
+  static constexpr int inner_direction = static_cast<int> (
+      iteration_pattern::inner_direction != Iterate::Default
+    ? iteration_pattern::inner_direction
+    : default_inner_direction< typename traits::execution_space>::value ) ;
+
+  // Ugly workaround for Intel 14 not handling scoped enums correctly
+  static constexpr int Right = static_cast<int>( Iterate::Right );
+  static constexpr int Left  = static_cast<int>( Iterate::Left );
+
   MDRangePolicy( point_type const& lower, point_type const& upper, tile_type const& tile = tile_type{} )
     : m_lower(lower)
     , m_upper(upper)
     , m_tile(tile)
     , m_num_tiles(1)
+    , m_prod_tile_dims(1)
   {
     // Host
     if ( true
@@ -172,8 +201,8 @@ struct MDRangePolicy
       for (int i=0; i<rank; ++i) {
         span = m_upper[i] - m_lower[i];
         if ( m_tile[i] <= 0 ) {
-          if (  (inner_direction == Right && (i < rank-1))
-              || (inner_direction == Left && (i > 0)) )
+          if (  ((int)inner_direction == (int)Right && (i < rank-1))
+              || ((int)inner_direction == (int)Left && (i > 0)) )
           {
             m_tile[i] = 2;
           }
@@ -183,6 +212,7 @@ struct MDRangePolicy
         }
         m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
         m_num_tiles *= m_tile_end[i];
+        m_prod_tile_dims *= m_tile[i];
       }
     }
     #if defined(KOKKOS_ENABLE_CUDA)
@@ -190,14 +220,18 @@ struct MDRangePolicy
     {
       index_type span;
       for (int i=0; i<rank; ++i) {
         span = m_upper[i] - m_lower[i];
         if ( m_tile[i] <= 0 ) {
-          if (  (inner_direction == Right && (i < rank-1))
-              || (inner_direction == Left && (i > 0)) )
+          if (  ((int)inner_direction == (int)Right && (i < rank-1))
+              || ((int)inner_direction == (int)Left && (i > 0)) )
           {
-            m_tile[i] = 2;
+            if ( m_prod_tile_dims < 512 ) {
+              m_tile[i] = 2;
+            } else {
+              m_tile[i] = 1;
+            }
           }
           else {
             m_tile[i] = 16;
@@ -205,12 +239,9 @@ struct MDRangePolicy
         }
         m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
         m_num_tiles *= m_tile_end[i];
+        m_prod_tile_dims *= m_tile[i];
       }
-      index_type total_tile_size_check = 1;
-      for (int i=0; i<rank; ++i) {
-        total_tile_size_check *= m_tile[i];
-      }
-      if ( total_tile_size_check >= 1024 ) { // improve this check - 1024,1024,64 max per dim (Kepler), but product num_threads < 1024; more restrictions pending register limit
+      if ( m_prod_tile_dims > 512 ) { // Match Cuda restriction for ParallelReduce; 1024,1024,64 max per dim (Kepler), but product num_threads < 1024
         printf(" Tile dimensions exceed Cuda limits\n");
         Kokkos::abort(" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
         //Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
@@ -223,19 +254,7 @@ struct MDRangePolicy
   template < typename LT , typename UT , typename TT = array_index_type >
   MDRangePolicy( std::initializer_list<LT> const& lower, std::initializer_list<UT> const& upper, std::initializer_list<TT> const& tile = {} )
   {
-#if 0
-    // This should work, less duplicated code but not yet extensively tested
-    point_type lower_tmp, upper_tmp;
-    tile_type tile_tmp;
-    for ( auto i = 0; i < rank; ++i ) {
-      lower_tmp[i] = static_cast<array_index_type>(lower.begin()[i]);
-      upper_tmp[i] = static_cast<array_index_type>(upper.begin()[i]);
-      tile_tmp[i]  = static_cast<array_index_type>(tile.begin()[i]);
-    }
 
-    MDRangePolicy( lower_tmp, upper_tmp, tile_tmp );
-
-#else
     if(static_cast<int>(m_lower.size()) != rank || static_cast<int>(m_upper.size()) != rank)
       Kokkos::abort("MDRangePolicy: Constructor initializer lists have wrong size");
 
@@ -249,7 +268,7 @@ struct MDRangePolicy
     }
 
     m_num_tiles = 1;
-
+    m_prod_tile_dims = 1;
 
     // Host
     if ( true
@@ -262,8 +281,8 @@ struct MDRangePolicy
       for (int i=0; i<rank; ++i) {
         span = m_upper[i] - m_lower[i];
         if ( m_tile[i] <= 0 ) {
-          if (  (inner_direction == Right && (i < rank-1))
-              || (inner_direction == Left && (i > 0)) )
+          if (  ((int)inner_direction == (int)Right && (i < rank-1))
+              || ((int)inner_direction == (int)Left && (i > 0)) )
           {
             m_tile[i] = 2;
           }
@@ -273,6 +292,7 @@ struct MDRangePolicy
         }
         m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
         m_num_tiles *= m_tile_end[i];
+        m_prod_tile_dims *= m_tile[i];
       }
     }
     #if defined(KOKKOS_ENABLE_CUDA)
@@ -284,10 +304,14 @@ struct MDRangePolicy
         if ( m_tile[i] <= 0 ) {
           // TODO: determine what is a good default tile size for cuda
           // may be rank dependent
-          if (  (inner_direction == Right && (i < rank-1))
-              || (inner_direction == Left && (i > 0)) )
+          if (  ((int)inner_direction == (int)Right && (i < rank-1))
+              || ((int)inner_direction == (int)Left && (i > 0)) )
           {
-            m_tile[i] = 2;
+            if ( m_prod_tile_dims < 512 ) {
+              m_tile[i] = 2;
+            } else {
+              m_tile[i] = 1;
+            }
           }
           else {
             m_tile[i] = 16;
@@ -295,32 +319,22 @@ struct MDRangePolicy
         }
         m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
         m_num_tiles *= m_tile_end[i];
+        m_prod_tile_dims *= m_tile[i];
       }
-      index_type total_tile_size_check = 1;
-      for (int i=0; i<rank; ++i) {
-        total_tile_size_check *= m_tile[i];
-      }
-      if ( total_tile_size_check >= 1024 ) { // improve this check - 1024,1024,64 max per dim (Kepler), but product num_threads < 1024; more restrictions pending register limit
+      if ( m_prod_tile_dims > 512 ) { // Match Cuda restriction for ParallelReduce; 1024,1024,64 max per dim (Kepler), but product num_threads < 1024
         printf(" Tile dimensions exceed Cuda limits\n");
         Kokkos::abort(" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
         //Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
       }
     }
     #endif
-#endif
   }
 
-
-  point_type m_lower;
-  point_type m_upper;
-  tile_type  m_tile;
-  point_type m_tile_end;
-  index_type m_num_tiles;
 };
 // ------------------------------------------------------------------ //
 
 // ------------------------------------------------------------------ //
-//md_parallel_for
+//md_parallel_for - deprecated, use parallel_for
 // ------------------------------------------------------------------ //
 template <typename MDRange, typename Functor, typename Enable = void>
 void md_parallel_for( MDRange const& range
@@ -335,7 +349,6 @@ void md_parallel_for( MDRange const& range
 {
   Impl::MDFunctor<MDRange, Functor, void> g(range, f);
 
-  //using range_policy = typename MDRange::range_policy;
   using range_policy = typename MDRange::impl_range_policy;
 
   Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
@@ -354,7 +367,6 @@ void md_parallel_for( const std::string& str
 {
   Impl::MDFunctor<MDRange, Functor, void> g(range, f);
 
-  //using range_policy = typename MDRange::range_policy;
   using range_policy = typename MDRange::impl_range_policy;
 
   Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
@@ -395,7 +407,7 @@ void md_parallel_for( MDRange const& range
 // ------------------------------------------------------------------ //
 
 // ------------------------------------------------------------------ //
-//md_parallel_reduce
+//md_parallel_reduce - deprecated, use parallel_reduce
 // ------------------------------------------------------------------ //
 template <typename MDRange, typename Functor, typename ValueType>
 void md_parallel_reduce( MDRange const& range
@@ -409,9 +421,8 @@ void md_parallel_reduce( MDRange const& range
                       ) >::type* = 0
                     )
 {
-  Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f, v);
+  Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f);
 
-  //using range_policy = typename MDRange::range_policy;
   using range_policy = typename MDRange::impl_range_policy;
   Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v );
 }
@@ -428,48 +439,14 @@ void md_parallel_reduce( const std::string& str
                       ) >::type* = 0
                     )
 {
-  Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f, v);
+  Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f);
 
-  //using range_policy = typename MDRange::range_policy;
   using range_policy = typename MDRange::impl_range_policy;
 
   Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v );
 }
 
-// Cuda - parallel_reduce not implemented yet
-/*
-template <typename MDRange, typename Functor, typename ValueType>
-void md_parallel_reduce( MDRange const& range
-                    , Functor const& f
-                    , ValueType & v
-                    , const std::string& str = ""
-                    , typename std::enable_if<( true
-                      #if defined( KOKKOS_ENABLE_CUDA)
-                      && std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
-                      #endif
-                      ) >::type* = 0
-                    )
-{
-  Impl::DeviceIterateTile closure(range, f, v);
-  closure.execute();
-}
-
-template <typename MDRange, typename Functor, typename ValueType>
-void md_parallel_reduce( const std::string& str
-                    , MDRange const& range
-                    , Functor const& f
-                    , ValueType & v
-                    , typename std::enable_if<( true
-                      #if defined( KOKKOS_ENABLE_CUDA)
-                      && std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
-                      #endif
-                      ) >::type* = 0
-                    )
-{
-  Impl::DeviceIterateTile closure(range, f, v);
-  closure.execute();
-}
-*/
+// Cuda - md_parallel_reduce not implemented - use parallel_reduce
 
 }} // namespace Kokkos::Experimental
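Given the deprecation notes above, the supported spelling is plain `parallel_for`/`parallel_reduce` over an `MDRangePolicy`. A migration sketch (the functor, extents, and function name are illustrative):

```cpp
#include <Kokkos_Core.hpp>

// Sketch: 'f' is any functor with operator()(int,int) const.
template< class Functor >
void run_2d( int N0, int N1, const Functor & f )
{
  Kokkos::Experimental::MDRangePolicy< Kokkos::Experimental::Rank<2> >
    policy( {0, 0}, {N0, N1}, {16, 16} );  // tile product 256 stays under the 512 Cuda limit

  // Deprecated: Kokkos::Experimental::md_parallel_for( policy, f );
  Kokkos::parallel_for( policy, f );       // preferred spelling
}
```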
 
diff --git a/lib/kokkos/core/src/Kokkos_Atomic.hpp b/lib/kokkos/core/src/Kokkos_Atomic.hpp
index 3ecae24da4..3c8673c66a 100644
--- a/lib/kokkos/core/src/Kokkos_Atomic.hpp
+++ b/lib/kokkos/core/src/Kokkos_Atomic.hpp
@@ -114,40 +114,9 @@
 #endif /* Not pre-selected atomic implementation */
 #endif
 
-//----------------------------------------------------------------------------
-
-// Forward decalaration of functions supporting arbitrary sized atomics
-// This is necessary since Kokkos_Atomic.hpp is internally included very early
-// through Kokkos_HostSpace.hpp as well as the allocation tracker.
 #ifdef KOKKOS_ENABLE_CUDA
-namespace Kokkos {
-namespace Impl {
-/// \brief Aquire a lock for the address
-///
-/// This function tries to aquire the lock for the hash value derived
-/// from the provided ptr. If the lock is successfully aquired the
-/// function returns true. Otherwise it returns false.
-#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-extern
+#include <Cuda/Kokkos_Cuda_Locks.hpp>
 #endif
-__device__ inline
-bool lock_address_cuda_space(void* ptr);
-
-/// \brief Release lock for the address
-///
-/// This function releases the lock for the hash value derived
-/// from the provided ptr. This function should only be called
-/// after previously successfully aquiring a lock with
-/// lock_address.
-#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-extern
-#endif
-__device__ inline
-void unlock_address_cuda_space(void* ptr);
-}
-}
-#endif
-
 
 namespace Kokkos {
 template 
diff --git a/lib/kokkos/core/src/Kokkos_Concepts.hpp b/lib/kokkos/core/src/Kokkos_Concepts.hpp
index 9a2b53e157..5480dbf40c 100644
--- a/lib/kokkos/core/src/Kokkos_Concepts.hpp
+++ b/lib/kokkos/core/src/Kokkos_Concepts.hpp
@@ -79,6 +79,21 @@ struct IndexType
   using type = T;
 };
 
+/**\brief Specify Launch Bounds for CUDA execution.
+ *
+ *  The "best" defaults may be architecture specific.
+ */
+template< unsigned int maxT = 1024 /* Max threads per block */
+        , unsigned int minB = 1    /* Min blocks per SM */
+        >
+struct LaunchBounds
+{
+  using launch_bounds = LaunchBounds;
+  using type = LaunchBounds;
+  static unsigned int constexpr maxTperB {maxT};
+  static unsigned int constexpr minBperSM {minB};
+};
+
 } // namespace Kokkos
 
 //----------------------------------------------------------------------------
@@ -119,6 +134,7 @@ using Kokkos::is_array_layout ;
 KOKKOS_IMPL_IS_CONCEPT( iteration_pattern )
 KOKKOS_IMPL_IS_CONCEPT( schedule_type )
 KOKKOS_IMPL_IS_CONCEPT( index_type )
+KOKKOS_IMPL_IS_CONCEPT( launch_bounds )
 
 }
 
diff --git a/lib/kokkos/core/src/Kokkos_Core.hpp b/lib/kokkos/core/src/Kokkos_Core.hpp
index 19de791c0f..ddb11d2894 100644
--- a/lib/kokkos/core/src/Kokkos_Core.hpp
+++ b/lib/kokkos/core/src/Kokkos_Core.hpp
@@ -96,11 +96,13 @@ struct InitArguments {
   int num_numa;
   int device_id;
 
-  InitArguments() {
-    num_threads = -1;
-    num_numa = -1;
-    device_id = -1;
-  }
+  InitArguments( int nt = -1
+               , int nn = -1
+               , int dv = -1)
+    : num_threads( nt )
+    , num_numa( nn )
+    , device_id( dv )
+  {}
 };
 
 void initialize(int& narg, char* arg[]);
@@ -168,6 +170,9 @@ void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size )
 
 } // namespace Kokkos
 
+#include 
+#include 
+
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
diff --git a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp
index 09081d2387..8c080f7a8f 100644
--- a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp
+++ b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp
@@ -51,6 +51,9 @@
 #include 
 #include 
 
+#include 
+#include 
+
 //----------------------------------------------------------------------------
 // Have assumed a 64bit build (8byte pointers) throughout the code base.
 
diff --git a/lib/kokkos/core/src/Kokkos_Crs.hpp b/lib/kokkos/core/src/Kokkos_Crs.hpp
new file mode 100644
index 0000000000..93b3fa5ca9
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Crs.hpp
@@ -0,0 +1,333 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CRS_HPP
+#define KOKKOS_CRS_HPP
+
+namespace Kokkos {
+namespace Experimental {
+
+/// \class Crs
+/// \brief Compressed row storage array.
+///
+/// \tparam DataType The type of stored entries.  If a Crs is
+///   used as the graph of a sparse matrix, then this is usually an
+///   integer type, the type of the column indices in the sparse
+///   matrix.
+///
+/// \tparam Arg1Type The second template parameter, corresponding
+///   either to the Device type (if there are no more template
+///   parameters) or to the Layout type (if there is at least one more
+///   template parameter).
+///
+/// \tparam Arg2Type The third template parameter, which if provided
+///   corresponds to the Device type.
+///
+/// \tparam SizeType The type of row offsets.  Usually the default
+///   parameter suffices.  However, setting a nondefault value is
+///   necessary in some cases, for example, if you want to have a
+///   sparse matrices with dimensions (and therefore column indices)
+///   that fit in \c int, but want to store more than INT_MAX
+///   entries in the sparse matrix.
+///
+/// A row has a range of entries:
+/// <ul>
+/// <li> <tt> row_map[i0] <= entry < row_map[i0+1] </tt>
+/// <li> <tt> 0 <= i1 < row_map[i0+1] - row_map[i0] </tt>
+/// <li> <tt> entries( entry , i2 , i3 , ... ); </tt>
+/// <li> <tt> entries( row_map[i0] + i1 , i2 , i3 , ... ); </tt>
+/// </ul>
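+///
+/// For example (hypothetical values, added for illustration): with
+/// row_map = { 0, 2, 5 } and entries = { 7, 8, 3, 4, 5 }, row 0 spans
+/// entries(0..1) = { 7, 8 } and row 1 spans entries(2..4) = { 3, 4, 5 }.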
+template< class DataType,
+          class Arg1Type,
+          class Arg2Type = void,
+          typename SizeType = typename ViewTraits<DataType*, Arg1Type, Arg2Type, void>::size_type>
+class Crs {
+protected:
+  typedef ViewTraits<DataType*, Arg1Type, Arg2Type, void> traits;
+
+public:
+  typedef DataType data_type;
+  typedef typename traits::array_layout array_layout;
+  typedef typename traits::execution_space execution_space;
+  typedef typename traits::memory_space memory_space;
+  typedef typename traits::device_type device_type;
+  typedef SizeType size_type;
+
+  typedef Crs< DataType , Arg1Type , Arg2Type , SizeType > staticcrsgraph_type;
+  typedef Crs< DataType , array_layout , typename traits::host_mirror_space , SizeType > HostMirror;
+  typedef View<size_type* , array_layout, device_type> row_map_type;
+  typedef View<DataType* , array_layout, device_type> entries_type;
+
+  entries_type entries;
+  row_map_type row_map;
+
+  //! Construct an empty view.
+  Crs () : entries(), row_map() {}
+
+  //! Copy constructor (shallow copy).
+  Crs (const Crs& rhs) : entries (rhs.entries), row_map (rhs.row_map)
+  {}
+
+  template<class EntriesType, class RowMapType>
+  Crs (const EntriesType& entries_,const RowMapType& row_map_) : entries (entries_), row_map (row_map_)
+  {}
+
+  /** \brief  Assign to a view of the rhs array.
+   *          If the old view is the last view
+   *          then allocated memory is deallocated.
+   */
+  Crs& operator= (const Crs& rhs) {
+    entries = rhs.entries;
+    row_map = rhs.row_map;
+    return *this;
+  }
+
+  /** \brief  Destroy this view of the array.
+   *          If the last view then allocated memory is deallocated.
+   */
+  ~Crs() {}
+
+  /** \brief  Return number of rows in the graph
+   */
+  KOKKOS_INLINE_FUNCTION
+  size_type numRows() const {
+    return (row_map.dimension_0 () != 0) ?
+      row_map.dimension_0 () - static_cast<size_type> (1) :
+      static_cast<size_type> (0);
+  }
+};
+
+/*--------------------------------------------------------------------------*/
+
+template< class OutCounts,
+          class DataType,
+          class Arg1Type,
+          class Arg2Type,
+          class SizeType>
+void get_crs_transpose_counts(
+    OutCounts& out,
+    Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in,
+    std::string const& name = "transpose_counts");
+
+template< class OutCounts,
+          class InCrs>
+void get_crs_row_map_from_counts(
+    OutCounts& out,
+    InCrs const& in,
+    std::string const& name = "row_map");
+
+template< class DataType,
+          class Arg1Type,
+          class Arg2Type,
+          class SizeType>
+void transpose_crs(
+    Crs<DataType, Arg1Type, Arg2Type, SizeType>& out,
+    Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in);
+
+}} // namespace Kokkos::Experimental
+
+/*--------------------------------------------------------------------------*/
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+namespace Experimental {
+
+template <class InCrs, class OutCounts>
+class GetCrsTransposeCounts {
+ public:
+  using execution_space = typename InCrs::execution_space;
+  using self_type = GetCrsTransposeCounts<InCrs, OutCounts>;
+  using index_type = typename InCrs::size_type;
+ private:
+  InCrs in;
+  OutCounts out;
+ public:
+  KOKKOS_INLINE_FUNCTION
+  void operator()(index_type i) const {
+    atomic_increment( &out[in.entries(i)] );
+  }
+  GetCrsTransposeCounts(InCrs const& arg_in, OutCounts const& arg_out):
+    in(arg_in),out(arg_out) {
+    using policy_type = RangePolicy<index_type, execution_space>;
+    using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
+    const closure_type closure(*this, policy_type(0, index_type(in.entries.size())));
+    closure.execute();
+    execution_space::fence();
+  }
+};
+
+template <class InCounts, class OutRowMap>
+class CrsRowMapFromCounts {
+ public:
+  using execution_space = typename InCounts::execution_space;
+  using value_type = typename OutRowMap::value_type;
+  using index_type = typename InCounts::size_type;
+ private:
+  InCounts in;
+  OutRowMap out;
+ public:
+  KOKKOS_INLINE_FUNCTION
+  void operator()(index_type i, value_type& update, bool final_pass) const {
+    update += in(i);
+    if (final_pass) {
+      out(i + 1) = update;
+      if (i == 0) {
+        out(0) = 0;
+      }
+    }
+  }
+  KOKKOS_INLINE_FUNCTION
+  void init(value_type& update) const { update = 0; }
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& update, const volatile value_type& input) const {
+    update += input;
+  }
+  using self_type = CrsRowMapFromCounts<InCounts, OutRowMap>;
+  CrsRowMapFromCounts(InCounts const& arg_in, OutRowMap const& arg_out):
+    in(arg_in),out(arg_out) {
+    using policy_type = RangePolicy<index_type, execution_space>;
+    using closure_type = Kokkos::Impl::ParallelScan<self_type, policy_type>;
+    closure_type closure(*this, policy_type(0, in.size()));
+    closure.execute();
+    execution_space::fence();
+  }
+};
+
+template <class InCrs, class OutCrs>
+class FillCrsTransposeEntries {
+ public:
+  using execution_space = typename InCrs::execution_space;
+  using memory_space = typename InCrs::memory_space;
+  using value_type = typename OutCrs::entries_type::value_type;
+  using index_type = typename InCrs::size_type;
+ private:
+  using counters_type = View<index_type*, memory_space>;
+  InCrs in;
+  OutCrs out;
+  counters_type counters;
+ public:
+  KOKKOS_INLINE_FUNCTION
+  void operator()(index_type i) const {
+    auto begin = in.row_map(i);
+    auto end = in.row_map(i + 1);
+    for (auto j = begin; j < end; ++j) {
+      auto ti = in.entries(j);
+      auto tbegin = out.row_map(ti);
+      auto tj = atomic_fetch_add( &counters(ti), 1 );
+      out.entries( tbegin + tj ) = i;
+    }
+  }
+  using self_type = FillCrsTransposeEntries<InCrs, OutCrs>;
+  FillCrsTransposeEntries(InCrs const& arg_in, OutCrs const& arg_out):
+    in(arg_in),out(arg_out),
+    counters("counters", arg_out.numRows()) {
+    using policy_type = RangePolicy<index_type, execution_space>;
+    using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
+    const closure_type closure(*this, policy_type(0, index_type(in.numRows())));
+    closure.execute();
+    execution_space::fence();
+  }
+};
+
+}}} // namespace Kokkos::Impl::Experimental
+
+/*--------------------------------------------------------------------------*/
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Experimental {
+
+template< class OutCounts,
+          class DataType,
+          class Arg1Type,
+          class Arg2Type,
+          class SizeType>
+void get_crs_transpose_counts(
+    OutCounts& out,
+    Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in,
+    std::string const& name) {
+  using InCrs = Crs<DataType, Arg1Type, Arg2Type, SizeType>;
+  out = OutCounts(name, in.numRows());
+  Kokkos::Impl::Experimental::
+    GetCrsTransposeCounts<InCrs, OutCounts> functor(in, out);
+}
+
+template< class OutRowMap,
+          class InCounts>
+void get_crs_row_map_from_counts(
+    OutRowMap& out,
+    InCounts const& in,
+    std::string const& name) {
+  out = OutRowMap(ViewAllocateWithoutInitializing(name), in.size() + 1);
+  Kokkos::Impl::Experimental::
+    CrsRowMapFromCounts<InCounts, OutRowMap> functor(in, out);
+}
+
+template< class DataType,
+          class Arg1Type,
+          class Arg2Type,
+          class SizeType>
+void transpose_crs(
+    Crs<DataType, Arg1Type, Arg2Type, SizeType>& out,
+    Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in)
+{
+  typedef Crs<DataType, Arg1Type, Arg2Type, SizeType> crs_type ;
+  typedef typename crs_type::memory_space memory_space ;
+  typedef View<SizeType*, memory_space> counts_type ;
+  {
+    counts_type counts;
+    Kokkos::Experimental::get_crs_transpose_counts(counts, in);
+    Kokkos::Experimental::get_crs_row_map_from_counts(out.row_map, counts,
+        "tranpose_row_map");
+  }
+  out.entries = decltype(out.entries)("transpose_entries", in.entries.size());
+  Kokkos::Impl::Experimental::
+    FillCrsTransposeEntries<crs_type, crs_type> entries_functor(in, out);
+}
+
+}} // namespace Kokkos::Experimental
+
+#endif /* #define KOKKOS_CRS_HPP */
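
A minimal usage sketch for the new Crs container and transpose_crs(), assuming a host execution space inside an initialized Kokkos scope; the graph values are illustrative, not from the patch:

    #include <Kokkos_Core.hpp>
    #include <Kokkos_Crs.hpp>

    void transpose_example() {
      // 2-row graph: row 0 -> {0, 1}, row 1 -> {0}
      using graph_type =
        Kokkos::Experimental::Crs<int, Kokkos::DefaultHostExecutionSpace, void, int>;
      graph_type::row_map_type row_map("row_map", 3);
      graph_type::entries_type entries("entries", 3);
      row_map(0) = 0; row_map(1) = 2; row_map(2) = 3;
      entries(0) = 0; entries(1) = 1; entries(2) = 0;
      graph_type g(entries, row_map);
      graph_type t;
      Kokkos::Experimental::transpose_crs(t, g);  // t holds the transposed graph
    }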
diff --git a/lib/kokkos/core/src/Kokkos_Cuda.hpp b/lib/kokkos/core/src/Kokkos_Cuda.hpp
index f0f0f87458..197831dee5 100644
--- a/lib/kokkos/core/src/Kokkos_Cuda.hpp
+++ b/lib/kokkos/core/src/Kokkos_Cuda.hpp
@@ -217,8 +217,8 @@ public:
 private:
 
-  cudaStream_t m_stream ;
   int          m_device ;
+  cudaStream_t m_stream ;
 };
 
 } // namespace Kokkos
@@ -295,6 +295,7 @@ struct VerifyExecutionCanAccessMemorySpace
 #include 
 #include 
 #include 
+#include 
 #include 
 
 //----------------------------------------------------------------------------
diff --git a/lib/kokkos/core/src/Kokkos_CudaSpace.hpp b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp
index 307ab193b1..fb5985e164 100644
--- a/lib/kokkos/core/src/Kokkos_CudaSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp
@@ -90,7 +90,7 @@ public:
            , const size_t arg_alloc_size ) const ;
 
   /**\brief Return Name of the MemorySpace */
-  static constexpr const char* name();
+  static constexpr const char* name() { return m_name; }
 
   /*--------------------------------*/
   /** \brief  Error reporting for HostSpace attempt to access CudaSpace */
@@ -186,7 +186,7 @@ public:
            , const size_t arg_alloc_size ) const ;
 
   /**\brief Return Name of the MemorySpace */
-  static constexpr const char* name();
+  static constexpr const char* name() { return m_name; }
 
   /*--------------------------------*/
 
@@ -234,7 +234,7 @@ public:
            , const size_t arg_alloc_size ) const ;
 
   /**\brief Return Name of the MemorySpace */
-  static constexpr const char* name();
+  static constexpr const char* name() { return m_name; }
 
 private:
diff --git a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp
index 375a2d3744..a8c4d77c62 100644
--- a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp
+++ b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp
@@ -384,6 +384,7 @@ Impl::PerThreadValue PerThread(const int& arg);
  *  WorkTag (none): Tag which is used as the first argument for the functor operator.
  *  Schedule (Schedule): Scheduling Policy (Dynamic, or Static).
  *  IndexType (IndexType: Integer Index type used to iterate over the Index space.
+ *  LaunchBounds (LaunchBounds<1024,1>: Launch Bounds for CUDA compilation.
  */
 template< class ... Properties>
 class TeamPolicy: public
@@ -561,6 +562,45 @@ KOKKOS_INLINE_FUNCTION
 Impl::ThreadVectorRangeBoundariesStruct
 ThreadVectorRange( const TeamMemberType&, const iType& count );
 
+#if defined(KOKKOS_ENABLE_PROFILING)
+namespace Impl {
+
+template<typename FunctorType, typename TagType,
+         bool HasTag = !std::is_same<TagType, void>::value >
+struct ParallelConstructName;
+
+template<typename FunctorType, typename TagType>
+struct ParallelConstructName<FunctorType, TagType, true> {
+  ParallelConstructName(std::string const& label):label_ref(label) {
+    if (label.empty()) {
+      default_name = std::string(typeid(FunctorType).name()) + "/" +
+        typeid(TagType).name();
+    }
+  }
+  std::string const& get() {
+    return (label_ref.empty()) ? default_name : label_ref;
+  }
+  std::string const& label_ref;
+  std::string default_name;
+};
+
+template<typename FunctorType, typename TagType>
+struct ParallelConstructName<FunctorType, TagType, false> {
+  ParallelConstructName(std::string const& label):label_ref(label) {
+    if (label.empty()) {
+      default_name = std::string(typeid(FunctorType).name());
+    }
+  }
+  std::string const& get() {
+    return (label_ref.empty()) ? default_name : label_ref;
+  }
+  std::string const& label_ref;
+  std::string default_name;
+};
+
+} // namespace Impl
+#endif /* defined KOKKOS_ENABLE_PROFILING */
+
 } // namespace Kokkos
 
 #endif /* #define KOKKOS_EXECPOLICY_HPP */
diff --git a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
index e224cd4e84..9c9af0dd8b 100644
--- a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
@@ -126,14 +126,6 @@ public:
   //! This memory space preferred device_type
   typedef Kokkos::Device< execution_space, memory_space > device_type;
 
-  /*--------------------------------*/
-  /* Functions unique to the HBWSpace */
-  static int in_parallel();
-
-  static void register_in_parallel( int (*)() );
-
-  /*--------------------------------*/
-
   /**\brief Default memory space instance */
   HBWSpace();
   HBWSpace( const HBWSpace & rhs ) = default;
diff --git a/lib/kokkos/core/src/Kokkos_HostSpace.hpp b/lib/kokkos/core/src/Kokkos_HostSpace.hpp
index d00cce8f60..431635047a 100644
--- a/lib/kokkos/core/src/Kokkos_HostSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_HostSpace.hpp
@@ -130,14 +130,6 @@ public:
   //! This memory space preferred device_type
   typedef Kokkos::Device< execution_space, memory_space > device_type;
 
-  /*--------------------------------*/
-  /* Functions unique to the HostSpace */
-  static int in_parallel();
-
-  static void register_in_parallel( int (*)() );
-
-  /*--------------------------------*/
-
   /**\brief Default memory space instance */
   HostSpace();
   HostSpace( HostSpace && rhs ) = default;
@@ -161,7 +153,7 @@ public:
            , const size_t arg_alloc_size ) const;
 
   /**\brief Return Name of the MemorySpace */
-  static constexpr const char* name();
+  static constexpr const char* name() { return m_name; }
 
 private:
 
   AllocationMechanism  m_alloc_mech;
diff --git a/lib/kokkos/core/src/Kokkos_Layout.hpp b/lib/kokkos/core/src/Kokkos_Layout.hpp
index f300a6d9f6..87c705153e 100644
--- a/lib/kokkos/core/src/Kokkos_Layout.hpp
+++ b/lib/kokkos/core/src/Kokkos_Layout.hpp
@@ -156,6 +156,8 @@ struct LayoutStride {
       for ( int r = 0 ; r < ARRAY_LAYOUT_MAX_RANK ; ++r ) {
         tmp.dimension[r] = 0 ;
         tmp.stride[r]    = 0 ;
+      }
+      for ( int r = 0 ; r < rank ; ++r ) {
         check_input &= ~int( 1 << order[r] );
       }
       if ( 0 == check_input ) {
diff --git a/lib/kokkos/core/src/Kokkos_Macros.hpp b/lib/kokkos/core/src/Kokkos_Macros.hpp
index 1439dbd3f8..250ef6630a 100644
--- a/lib/kokkos/core/src/Kokkos_Macros.hpp
+++ b/lib/kokkos/core/src/Kokkos_Macros.hpp
@@ -297,6 +297,10 @@
   #endif
 #endif
 
+  #if defined( KOKKOS_ARCH_AVX512MIC )
+    #define KOKKOS_ENABLE_RFO_PREFETCH 1
+  #endif
+
   #if defined( __MIC__ )
     // Compiling for Xeon Phi
   #endif
@@ -344,13 +348,18 @@
   //#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
   //#define KOKKOS_ENABLE_PRAGMA_SIMD 1
 
+  #if defined( KOKKOS_ARCH_AVX512MIC )
+    #define KOKKOS_ENABLE_RFO_PREFETCH 1
+  #endif
+
   #if !defined( KOKKOS_FORCEINLINE_FUNCTION )
     #define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
   #endif
 
   #if !defined( KOKKOS_ENABLE_ASM ) && !defined( __PGIC__ ) && \
       ( defined( __amd64 ) || defined( __amd64__ ) || \
-        defined( __x86_64 ) || defined( __x86_64__ ) )
+        defined( __x86_64 ) || defined( __x86_64__ ) || \
+        defined(__PPC64__) )
     #define KOKKOS_ENABLE_ASM 1
   #endif
 #endif
diff --git a/lib/kokkos/core/src/Kokkos_MasterLock.hpp b/lib/kokkos/core/src/Kokkos_MasterLock.hpp
new file mode 100644
index 0000000000..81564b8eac
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_MasterLock.hpp
@@ -0,0 +1,73 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_MASTER_LOCK_HPP
+#define KOKKOS_MASTER_LOCK_HPP
+
+#include 
+
+namespace Kokkos { namespace Experimental {
+
+// my be used to coordinate work between master instances
+// SHOULD NOT be used within a parallel algorithm
+//
+// This lock should be used with with a scoped lock guard
+// i.e. std::unique_lock, std::lock_guard
+//
+// cannot be copied or moved
+// has the following functions available
+//
+// Lock()
+// ~Lock()
+//
+// void lock()
+// void unlock()
+// bool try_lock()
+//
+template <typename ExecutionSpace>
+class MasterLock;
+
+}} // namespace Kokkos::Experimental
+
+#endif //KOKKOS_MASTER_LOCK_HPP
+
diff --git a/lib/kokkos/core/src/Kokkos_MemoryPool.hpp b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp
index dbf1ad8057..1da936067d 100644
--- a/lib/kokkos/core/src/Kokkos_MemoryPool.hpp
+++ b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp
@@ -66,11 +66,6 @@ private:
   enum : uint32_t { max_bit_count_lg2 = CB::max_bit_count_lg2 };
   enum : uint32_t { max_bit_count     = CB::max_bit_count };
 
-  /*  Defaults for min block, max block, and superblock sizes */
-  enum : uint32_t { MIN_BLOCK_SIZE_LG2  = 6  /*   64 bytes */ };
-  enum : uint32_t { MAX_BLOCK_SIZE_LG2  = 12 /*   4k bytes */ };
-  enum : uint32_t { SUPERBLOCK_SIZE_LG2 = 16 /*  64k bytes */ };
-
   enum : uint32_t { HINT_PER_BLOCK_SIZE = 2 };
 
   /*  Each superblock has a concurrent bitset state
@@ -85,6 +80,14 @@ private:
    *  is concurrently updated.
    */
 
+  /*  Mapping between block_size <-> block_state
+   *
+   *  block_state = ( m_sb_size_lg2 - block_size_lg2 ) << state_shift
+   *  block_size  = m_sb_size_lg2 - ( block_state >> state_shift )
+   *
+   *  Thus A_block_size < B_block_size  <=>  A_block_state > B_block_state
+   */
+
   typedef typename DeviceType::memory_space base_memory_space ;
 
   enum { accessible =
@@ -251,10 +254,10 @@ public:
    *  significant runtime performance improvements.
    */
   MemoryPool( const base_memory_space & memspace
-            , const size_t   min_total_alloc_size
-            , const uint32_t min_block_alloc_size // = 1 << MIN_BLOCK_SIZE_LG2
-            , const uint32_t max_block_alloc_size // = 1 << MAX_BLOCK_SIZE_LG2
-            , const uint32_t min_superblock_size  // = 1 << SUPERBLOCK_SIZE_LG2
+            , const size_t min_total_alloc_size
+            , size_t min_block_alloc_size = 0
+            , size_t max_block_alloc_size = 0
+            , size_t min_superblock_size  = 0
             )
     : m_tracker()
     , m_sb_state_array(0)
@@ -267,8 +270,43 @@ public:
     , m_data_offset(0)
     , m_unused_padding(0)
     {
-      const uint32_t int_align_lg2  = 3 ; /* align as int[8] */
-      const uint32_t int_align_mask = ( 1u << int_align_lg2 ) - 1 ;
+      const uint32_t int_align_lg2  = 3 ; /* align as int[8] */
+      const uint32_t int_align_mask = ( 1u << int_align_lg2 ) - 1 ;
+
+      // Constraints and defaults:
+      //   min_block_alloc_size <= max_block_alloc_size
+      //   max_block_alloc_size <= min_superblock_size
+      //   min_superblock_size  <= min_total_alloc_size
+
+      const uint32_t MIN_BLOCK_SIZE = 1u << 6  /*   64 bytes */ ;
+      const uint32_t MAX_BLOCK_SIZE = 1u << 12 /*   4k bytes */ ;
+
+      if ( 0 == min_block_alloc_size ) min_block_alloc_size = MIN_BLOCK_SIZE ;
+
+      if ( 0 == max_block_alloc_size ) {
+
+        max_block_alloc_size = MAX_BLOCK_SIZE ;
+
+        // Upper bound of total allocation size
+        max_block_alloc_size = std::min( size_t(max_block_alloc_size)
+                                       , min_total_alloc_size );
+
+        // Lower bound of minimum block size
+        max_block_alloc_size = std::max( max_block_alloc_size
+                                       , min_block_alloc_size );
+      }
+
+      if ( 0 == min_superblock_size ) {
+        min_superblock_size = max_block_alloc_size ;
+
+        // Upper bound of total allocation size
+        min_superblock_size = std::min( size_t(min_superblock_size)
+                                      , min_total_alloc_size );
+
+        // Lower bound of maximum block size
+        min_superblock_size = std::max( min_superblock_size
+                                      , max_block_alloc_size );
+      }
 
       // Block and superblock size is power of two:
@@ -435,6 +473,8 @@ public:
   void * allocate( size_t alloc_size
                  , int32_t attempt_limit = 1 ) const noexcept
     {
+      if ( 0 == alloc_size ) return (void*) 0 ;
+
       void * p = 0 ;
 
       const uint32_t block_size_lg2 = get_block_size_lg2( alloc_size );
@@ -444,10 +484,9 @@ public:
       // Allocation will fit within a superblock
       // that has block sizes ( 1 << block_size_lg2 )
 
-      const uint32_t block_count_lg2  = m_sb_size_lg2 - block_size_lg2 ;
-      const uint32_t block_state      = block_count_lg2 << state_shift ;
-      const uint32_t block_count      = 1u << block_count_lg2 ;
-      const uint32_t block_count_mask = block_count - 1 ;
+      const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2 ;
+      const uint32_t block_state     = block_count_lg2 << state_shift ;
+      const uint32_t block_count     = 1u << block_count_lg2 ;
 
       // Superblock hints for this block size:
       //   hint_sb_id_ptr[0] is the dynamically changing hint
@@ -465,7 +504,7 @@ public:
       // the guess for which block within a superblock should
       // be claimed.  If not available then a search occurs.
 
-      const uint32_t block_id_hint = block_count_mask &
+      const uint32_t block_id_hint =
         (uint32_t)( Kokkos::Impl::clock_tic()
 #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA )
         // Spread out potentially concurrent access
@@ -474,6 +513,9 @@ public:
 #endif
           );
 
+      // expected state of superblock for allocation
+      uint32_t sb_state = block_state ;
+
       int32_t sb_id = -1 ;
 
       volatile uint32_t * sb_state_array = 0 ;
@@ -484,6 +526,8 @@ public:
 
         if ( sb_id < 0 ) {
 
+          // No superblock specified, try the hint for this block size
+
           sb_id = hint_sb_id = int32_t( *hint_sb_id_ptr );
 
           sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
@@ -493,16 +537,20 @@ public:
         //   0 <= sb_id
         //   sb_state_array == m_sb_state_array + m_sb_state_size * sb_id
 
-        if ( block_state == ( state_header_mask & *sb_state_array ) ) {
+        if ( sb_state == ( state_header_mask & *sb_state_array ) ) {
 
-          // This superblock state is assigned to this block size.
-          // Try to claim a bit.
+          // This superblock state is as expected, for the moment.
+          // Attempt to claim a bit.  The attempt updates the state
+          // so have already made sure the state header is as expected.
+
+          const uint32_t count_lg2 = sb_state >> state_shift ;
+          const uint32_t mask      = ( 1u << count_lg2 ) - 1 ;
 
           const Kokkos::pair<int,int> result =
             CB::acquire_bounded_lg2( sb_state_array
-                                   , block_count_lg2
-                                   , block_id_hint
-                                   , block_state
+                                   , count_lg2
+                                   , block_id_hint & mask
+                                   , sb_state
                                    );
 
           // If result.first < 0 then failed to acquire
@@ -512,16 +560,18 @@ public:
 
           if ( 0 <= result.first ) { // acquired a bit
 
+            const uint32_t size_lg2 = m_sb_size_lg2 - count_lg2 ;
+
             // Set the allocated block pointer
 
             p = ((char*)( m_sb_state_array + m_data_offset ))
               + ( uint32_t(sb_id) << m_sb_size_lg2 ) // superblock memory
-              + ( result.first << block_size_lg2 );  // block memory
+              + ( result.first << size_lg2 );        // block memory
 
             break ; // Success
           }
 
-// printf("  acquire block_count_lg2(%d) block_state(0x%x) sb_id(%d) result(%d,%d)\n" , block_count_lg2 , block_state , sb_id , result.first , result.second );
+// printf("  acquire count_lg2(%d) sb_state(0x%x) sb_id(%d) result(%d,%d)\n" , count_lg2 , sb_state , sb_id , result.first , result.second );
 
         }
 
         //------------------------------------------------------------------
@@ -529,12 +579,18 @@ public:
         //  Must find a new superblock.
 
         //  Start searching at designated index for this block size.
-        //  Look for a partially full superblock of this block size.
-        //  Look for an empty superblock just in case cannot find partfull.
+        //  Look for superblock that, in preferential order,
+        //  1) part-full superblock of this block size
+        //  2) empty superblock to claim for this block size
+        //  3) part-full superblock of the next larger block size
 
+        sb_state = block_state ;  // Expect to find the desired state
         sb_id = -1 ;
+
+        bool update_hint = false ;
         int32_t sb_id_empty = -1 ;
+        int32_t sb_id_large = -1 ;
+        uint32_t sb_state_large = 0 ;
 
         sb_state_array = m_sb_state_array + sb_id_begin * m_sb_state_size ;
@@ -544,38 +600,54 @@ public:
           //  Note that the state may change at any moment
           //  as concurrent allocations and deallocations occur.
 
-          const uint32_t state = *sb_state_array ;
-          const uint32_t used  = state & state_used_mask ;
+          const uint32_t full_state = *sb_state_array ;
+          const uint32_t used       = full_state & state_used_mask ;
+          const uint32_t state      = full_state & state_header_mask ;
 
-          if ( block_state == ( state & state_header_mask ) ) {
+          if ( state == block_state ) {
 
             //  Superblock is assigned to this block size
 
-            if ( used < block_count ) {
+            if ( used < block_count ) { // There is room to allocate one block
 
               sb_id = id ;
 
-              if ( used + 1 < block_count ) {
+              //  Is there room to allocate more than one block?
 
-                //  There is room to allocate more than one block
-
-                Kokkos::atomic_compare_exchange
-                  ( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) );
-              }
+              update_hint = used + 1 < block_count ;
 
               break ;
             }
           }
-          else if ( ( used == 0 ) && ( sb_id_empty == -1 ) ) {
+          else if ( 0 == used ) {
 
-            //  Superblock is not assigned to this block size
-            //  and is the first empty superblock encountered.
-            //  Save this id to use if a partfull superblock is not found.
+            //  Superblock is empty
 
-            sb_id_empty = id ;
+            if ( -1 == sb_id_empty ) {
+
+              //  Superblock is not assigned to this block size
+              //  and is the first empty superblock encountered.
+              //  Save this id to use if a partfull superblock is not found.
+
+              sb_id_empty = id ;
+            }
           }
+          else if ( ( -1 == sb_id_empty /* have not found an empty */ ) &&
+                    ( -1 == sb_id_large /* have not found a larger */ ) &&
+                    ( state < block_state /* a larger block */ ) &&
+                    // is not full:
+                    ( used < ( 1u << ( state >> state_shift ) ) ) ) {
+            //  First superblock encountered that is
+            //  larger than this block size and
+            //  has room for an allocation.
+            //  Save this id to use of partfull or empty superblock not found
+            sb_id_large    = id ;
+            sb_state_large = state ;
+          }
+
+          // Iterate around the superblock array:
 
           if ( ++id < m_sb_count ) { sb_state_array += m_sb_state_size ; }
@@ -586,7 +658,7 @@ public:
           }
         }
 
-// printf("  search m_sb_count(%d) sb_id(%d) sb_id_empty(%d)\n" , m_sb_count , sb_id , sb_id_empty );
+// printf("  search m_sb_count(%d) sb_id(%d) sb_id_empty(%d) sb_id_large(%d)\n" , m_sb_count , sb_id , sb_id_empty , sb_id_large);
 
         if ( sb_id < 0 ) {
@@ -609,21 +681,31 @@ public:
 
             const uint32_t state_empty = state_header_mask & *sb_state_array ;
 
-            if ( state_empty ==
-                 Kokkos::atomic_compare_exchange
-                   (sb_state_array,state_empty,block_state) ) {
+            // If this thread claims the empty block then update the hint
+            update_hint =
+              state_empty ==
+              Kokkos::atomic_compare_exchange
+                (sb_state_array,state_empty,block_state);
+          }
+          else if ( 0 <= sb_id_large ) {
 
-              // If this thread claimed the block then update the hint
+            // Found a larger superblock with space available
 
-              Kokkos::atomic_compare_exchange
-                ( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) );
-            }
+            sb_id    = sb_id_large ;
+            sb_state = sb_state_large ;
+
+            sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
           }
           else {
             // Did not find a potentially usable superblock
             --attempt_limit ;
           }
         }
+
+        if ( update_hint ) {
+          Kokkos::atomic_compare_exchange
+            ( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) );
+        }
       } // end allocation attempt loop
 
       //--------------------------------------------------------------------
@@ -646,6 +728,8 @@ public:
   KOKKOS_INLINE_FUNCTION
   void deallocate( void * p , size_t /* alloc_size */ ) const noexcept
     {
+      if ( 0 == p ) return ;
+
       // Determine which superblock and block
       const ptrdiff_t d =
         ((char*)p) - ((char*)( m_sb_state_array + m_data_offset ));
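
A hedged sketch of the relaxed MemoryPool constructor (not from the patch; a host-space pool is assumed and sizes are illustrative). Only the total size is mandatory now; block and superblock sizes default per the constraints spelled out in the constructor:

    #include <Kokkos_Core.hpp>

    void pool_example() {
      using pool_type = Kokkos::MemoryPool< Kokkos::DefaultHostExecutionSpace >;

      // min/max block and superblock sizes left at 0 -> derived defaults
      pool_type pool( Kokkos::HostSpace(), 1u << 20 /* 1 MiB total */ );

      void * p = pool.allocate( 256 );
      // ... use p ...
      pool.deallocate( p, 256 );

      // zero-sized requests now short-circuit to a null pointer
      void * q = pool.allocate( 0 );  // q == nullptr
      (void) q;
    }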
diff --git a/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp b/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp
index 94b58b8aff..af9c8ea782 100644
--- a/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp
+++ b/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp
@@ -72,11 +72,11 @@ struct MemoryTraits {
   //! Tag this class as a kokkos memory traits:
   typedef MemoryTraits memory_traits ;
 
-  enum { Unmanaged    = T & unsigned(Kokkos::Unmanaged) };
-  enum { RandomAccess = T & unsigned(Kokkos::RandomAccess) };
-  enum { Atomic       = T & unsigned(Kokkos::Atomic) };
-  enum { Restrict     = T & unsigned(Kokkos::Restrict) };
-  enum { Aligned      = T & unsigned(Kokkos::Aligned) };
+  enum : bool { Unmanaged    = (unsigned(0) != (T & unsigned(Kokkos::Unmanaged))) };
+  enum : bool { RandomAccess = (unsigned(0) != (T & unsigned(Kokkos::RandomAccess))) };
+  enum : bool { Atomic       = (unsigned(0) != (T & unsigned(Kokkos::Atomic))) };
+  enum : bool { Restrict     = (unsigned(0) != (T & unsigned(Kokkos::Restrict))) };
+  enum : bool { Aligned      = (unsigned(0) != (T & unsigned(Kokkos::Aligned))) };
 
 };
 
@@ -109,7 +109,11 @@ enum { MEMORY_ALIGNMENT =
 #else
     ( 1 << Kokkos::Impl::integral_power_of_two( 128 ) )
 #endif
-    , MEMORY_ALIGNMENT_THRESHOLD = 4
+#if defined( KOKKOS_MEMORY_ALIGNMENT_THRESHOLD )
+    , MEMORY_ALIGNMENT_THRESHOLD = KOKKOS_MEMORY_ALIGNMENT_THRESHOLD
+#else
+    , MEMORY_ALIGNMENT_THRESHOLD = 4
+#endif
   };
diff --git a/lib/kokkos/core/src/Kokkos_OpenMP.hpp b/lib/kokkos/core/src/Kokkos_OpenMP.hpp
index 3e11621ce6..d5de01cf2f 100644
--- a/lib/kokkos/core/src/Kokkos_OpenMP.hpp
+++ b/lib/kokkos/core/src/Kokkos_OpenMP.hpp
@@ -47,10 +47,6 @@
 #include 
 
 #if defined( KOKKOS_ENABLE_OPENMP)
 
-#if !defined(_OPENMP)
-#error "You enabled Kokkos OpenMP support without enabling OpenMP in the compiler!"
-#endif
-
 #include 
 
 #include 
@@ -67,95 +63,144 @@
 #include 
 #include 
 
+#include 
+
 /*--------------------------------------------------------------------------*/
 
 namespace Kokkos {
 
+namespace Impl {
+class OpenMPExec;
+}
+
 /// \class OpenMP
 /// \brief Kokkos device for multicore processors in the host memory space.
 class OpenMP {
 public:
-  //------------------------------------
-  //! \name Type declarations that all Kokkos devices must provide.
-  //@{
-
   //! Tag this class as a kokkos execution space
   using execution_space = OpenMP;
+
+  using memory_space =
 #ifdef KOKKOS_ENABLE_HBWSPACE
-  using memory_space = Experimental::HBWSpace;
+    Experimental::HBWSpace;
 #else
-  using memory_space = HostSpace;
+    HostSpace;
 #endif
+
   //! This execution space preferred device_type
-  using device_type = Kokkos::Device;
-
-  using array_layout = LayoutRight;
-  using size_type = memory_space::size_type;
-
+  using device_type          = Kokkos::Device< execution_space, memory_space >;
+  using array_layout         = LayoutRight;
+  using size_type            = memory_space::size_type;
   using scratch_memory_space = ScratchMemorySpace< OpenMP >;
 
-  //@}
-  //------------------------------------
-  //! \name Functions that all Kokkos execution spaces must implement.
-  //@{
+  /// \brief Get a handle to the default execution space instance
+  inline
+  OpenMP() noexcept;
 
-  inline static bool in_parallel();
+  // Using omp_get_max_threads(); is problematic
+  // On Intel (essentially an initial call to the OpenMP runtime
+  // without a parallel region before will set a process mask for a single core
+  // The runtime will than bind threads for a parallel region to other cores on the
+  // entering the first parallel region and make the process mask the aggregate of
+  // the thread masks. The intend seems to be to make serial code run fast, if you
+  // compile with OpenMP enabled but don't actually use parallel regions or so
+  // static int omp_max_threads = omp_get_max_threads();
+  static int get_current_max_threads() noexcept;
 
-  /** \brief  Set the device in a "sleep" state. A noop for OpenMP. */
-  static bool sleep();
+  /// \brief Initialize the default execution space
+  ///
+  /// if ( thread_count == -1 )
+  ///   then use the number of threads that openmp defaults to
+  /// if ( thread_count == 0 && Kokkos::hwlow_available() )
+  ///   then use hwloc to choose the number of threads and change
+  ///   the default number of threads
+  /// if ( thread_count > 0 )
+  ///   then force openmp to use the given number of threads and change
+  ///   the default number of threads
+  static void initialize( int thread_count = -1 );
 
-  /** \brief Wake the device from the 'sleep' state. A noop for OpenMP. */
-  static bool wake();
-
-  /** \brief Wait until all dispatched functors complete. A noop for OpenMP. */
-  static void fence() {}
-
-  /// \brief Print configuration information to the given output stream.
-  static void print_configuration( std::ostream & , const bool detail = false );
-
-  /// \brief Free any resources being consumed by the device.
+  /// \brief Free any resources being consumed by the default execution space
   static void finalize();
 
-  /** \brief  Initialize the device.
-   *
-   *  1) If the hardware locality library is enabled and OpenMP has not
-   *     already bound threads then bind OpenMP threads to maximize
-   *     core utilization and group for memory hierarchy locality.
-   *
-   *  2) Allocate a HostThread for each OpenMP thread to hold its
-   *     topology and fan in/out data.
-   */
-  static void initialize( unsigned thread_count = 0 ,
-                          unsigned use_numa_count = 0 ,
-                          unsigned use_cores_per_numa = 0 );
+  /// \brief is the default execution space initialized for current 'master' thread
+  static bool is_initialized() noexcept;
 
-  static int is_initialized();
+  /// \brief Print configuration information to the given output stream.
+  static void print_configuration( std::ostream & , const bool verbose = false );
 
-  /** \brief  Return the maximum amount of concurrency.  */
-  static int concurrency();
+  /// \brief is the instance running a parallel algorithm
+  inline
+  static bool in_parallel( OpenMP const& = OpenMP() ) noexcept;
 
-  //@}
-  //------------------------------------
-  /** \brief  This execution space has a topological thread pool which can be queried.
-   *
-   *  All threads within a pool have a common memory space for which they are cache coherent.
-   *    depth = 0  gives the number of threads in the whole pool.
-   *    depth = 1  gives the number of threads in a NUMA region, typically sharing L3 cache.
-   *    depth = 2  gives the number of threads at the finest granularity, typically sharing L1 cache.
-   */
-  inline static int thread_pool_size( int depth = 0 );
+  /// \brief Wait until all dispatched functors complete on the given instance
+  ///
+  ///  This is a no-op on OpenMP
+  inline
+  static void fence( OpenMP const& = OpenMP() ) noexcept;
+
+  /// \brief Does the given instance return immediately after launching
+  /// a parallel algorithm
+  ///
+  /// This always returns false on OpenMP
+  inline
+  static bool is_asynchronous( OpenMP const& = OpenMP() ) noexcept;
+
+
+  /// \brief Partition the default instance into new instances without creating
+  ///  new masters
+  ///
+  /// This is a no-op on OpenMP since the default instance cannot be partitioned
+  /// without promoting other threads to 'master'
+  static std::vector<OpenMP> partition(...);
+
+  /// Non-default instances should be ref-counted so that when the last
+  /// is destroyed the instance resources are released
+  ///
+  /// This is a no-op on OpenMP since a non default instance cannot be created
+  static OpenMP create_instance(...);
+
+  /// \brief Partition the default instance and call 'f' on each new 'master' thread
+  ///
+  /// Func is a functor with the following signiture
+  ///   void( int partition_id, int num_partitions )
+  template <typename F>
+  static void partition_master( F const& f
+                              , int requested_num_partitions = 0
+                              , int requested_partition_size = 0
+                              );
+
+  inline
+  static int thread_pool_size() noexcept;
 
   /** \brief  The rank of the executing thread in this thread pool */
-  KOKKOS_INLINE_FUNCTION static int thread_pool_rank();
+  KOKKOS_INLINE_FUNCTION
+  static int thread_pool_rank() noexcept;
 
-  //------------------------------------
+#if !defined( KOKKOS_DISABLE_DEPRECATED )
+  /// \brief Initialize the default execution space
+  static void initialize( int thread_count,
+                          int use_numa_count,
+                          int use_cores_per_numa = 0);
 
-  inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
+  inline
+  static int thread_pool_size( int depth );
 
+  static void sleep() {};
+  static void wake() {};
 
-  KOKKOS_INLINE_FUNCTION static
-  unsigned hardware_thread_id() { return thread_pool_rank(); }
+  // use UniqueToken
+  static int concurrency();
 
-  static const char* name();
+  // use UniqueToken
+  inline
+  static int max_hardware_threads() noexcept;
+
+  // use UniqueToken
+  KOKKOS_INLINE_FUNCTION
+  static int hardware_thread_id() noexcept;
+#endif
+
+  static constexpr const char* name() noexcept { return "OpenMP"; }
 };
 
 } // namespace Kokkos
@@ -195,6 +240,7 @@ struct VerifyExecutionCanAccessMemorySpace
 /*--------------------------------------------------------------------------*/
 
 #include 
+#include 
 #include 
 #include 
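
A hedged sketch of the new partition_master hook; the functor signature follows the doc comment above, and the partition count is illustrative:

    #include <Kokkos_Core.hpp>
    #include <cstdio>

    void partition_example() {
      // split the default OpenMP instance into two partitions and run the
      // functor on each new 'master' thread
      Kokkos::OpenMP::partition_master(
        []( int partition_id, int num_partitions ) {
          std::printf( "partition %d of %d\n", partition_id, num_partitions );
        },
        2 /* requested_num_partitions */ );
    }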
diff --git a/lib/kokkos/core/src/Kokkos_Parallel.hpp b/lib/kokkos/core/src/Kokkos_Parallel.hpp
index e412e608b2..fc8d6bec81 100644
--- a/lib/kokkos/core/src/Kokkos_Parallel.hpp
+++ b/lib/kokkos/core/src/Kokkos_Parallel.hpp
@@ -177,22 +177,23 @@ void parallel_for( const ExecPolicy & policy
 {
 #if defined(KOKKOS_ENABLE_PROFILING)
-    uint64_t kpID = 0;
-    if(Kokkos::Profiling::profileLibraryLoaded()) {
-      Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
-    }
+  uint64_t kpID = 0;
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Impl::ParallelConstructName<FunctorType, typename ExecPolicy::work_tag> name(str);
+    Kokkos::Profiling::beginParallelFor(name.get(), 0, &kpID);
+  }
 #endif
 
-  Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
+  Kokkos::Impl::shared_allocation_tracking_disable();
   Impl::ParallelFor< FunctorType , ExecPolicy > closure( functor , policy );
-  Kokkos::Impl::shared_allocation_tracking_release_and_enable();
+  Kokkos::Impl::shared_allocation_tracking_enable();
 
   closure.execute();
 
 #if defined(KOKKOS_ENABLE_PROFILING)
-    if(Kokkos::Profiling::profileLibraryLoaded()) {
-      Kokkos::Profiling::endParallelFor(kpID);
-    }
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Profiling::endParallelFor(kpID);
+  }
 #endif
 }
@@ -210,14 +211,15 @@ void parallel_for( const size_t        work_count
 #if defined(KOKKOS_ENABLE_PROFILING)
   uint64_t kpID = 0;
-    if(Kokkos::Profiling::profileLibraryLoaded()) {
-      Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
-    }
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Impl::ParallelConstructName<FunctorType, void> name(str);
+    Kokkos::Profiling::beginParallelFor(name.get(), 0, &kpID);
+  }
 #endif
 
-  Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
+  Kokkos::Impl::shared_allocation_tracking_disable();
   Impl::ParallelFor< FunctorType , policy > closure( functor , policy(0,work_count) );
-  Kokkos::Impl::shared_allocation_tracking_release_and_enable();
+  Kokkos::Impl::shared_allocation_tracking_enable();
 
   closure.execute();
@@ -420,21 +422,22 @@ void parallel_scan( const ExecutionPolicy & policy
 {
 #if defined(KOKKOS_ENABLE_PROFILING)
   uint64_t kpID = 0;
-    if(Kokkos::Profiling::profileLibraryLoaded()) {
-      Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
-    }
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Impl::ParallelConstructName<FunctorType, typename ExecutionPolicy::work_tag> name(str);
+    Kokkos::Profiling::beginParallelScan(name.get(), 0, &kpID);
+  }
 #endif
 
-  Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
+  Kokkos::Impl::shared_allocation_tracking_disable();
   Impl::ParallelScan< FunctorType , ExecutionPolicy > closure( functor , policy );
-  Kokkos::Impl::shared_allocation_tracking_release_and_enable();
+  Kokkos::Impl::shared_allocation_tracking_enable();
 
   closure.execute();
 
 #if defined(KOKKOS_ENABLE_PROFILING)
-    if(Kokkos::Profiling::profileLibraryLoaded()) {
-      Kokkos::Profiling::endParallelScan(kpID);
-    }
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Profiling::endParallelScan(kpID);
+  }
 #endif
 }
@@ -453,21 +456,22 @@ void parallel_scan( const size_t        work_count
 #if defined(KOKKOS_ENABLE_PROFILING)
   uint64_t kpID = 0;
-    if(Kokkos::Profiling::profileLibraryLoaded()) {
-      Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
-    }
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Impl::ParallelConstructName<FunctorType, void> name(str);
+    Kokkos::Profiling::beginParallelScan(name.get(), 0, &kpID);
+  }
 #endif
 
-  Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
+  Kokkos::Impl::shared_allocation_tracking_disable();
   Impl::ParallelScan< FunctorType , policy > closure( functor , policy(0,work_count) );
-  Kokkos::Impl::shared_allocation_tracking_release_and_enable();
+  Kokkos::Impl::shared_allocation_tracking_enable();
 
   closure.execute();
 
 #if defined(KOKKOS_ENABLE_PROFILING)
-    if(Kokkos::Profiling::profileLibraryLoaded()) {
-      Kokkos::Profiling::endParallelScan(kpID);
-    }
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Profiling::endParallelScan(kpID);
+  }
 #endif
 }
diff --git a/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
index 8ea5183e35..9df6d4ba09 100644
--- a/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
+++ b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
@@ -872,13 +872,14 @@ namespace Impl {
                       const FunctorType& functor,
                       ReturnType& return_value) {
 #if defined(KOKKOS_ENABLE_PROFILING)
-    uint64_t kpID = 0;
-    if(Kokkos::Profiling::profileLibraryLoaded()) {
-      Kokkos::Profiling::beginParallelReduce("" == label ? typeid(FunctorType).name() : label, 0, &kpID);
-    }
+  uint64_t kpID = 0;
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Impl::ParallelConstructName<FunctorType, typename PolicyType::work_tag> name(label);
+    Kokkos::Profiling::beginParallelReduce(name.get(), 0, &kpID);
+  }
 #endif
 
-  Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
+  Kokkos::Impl::shared_allocation_tracking_disable();
 #ifdef KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER
   Impl::ParallelReduce closure(functor_adaptor::functor(functor),
@@ -890,13 +891,13 @@ namespace Impl {
                                policy,
                                return_value_adapter::return_value(return_value,functor));
 #endif
-  Kokkos::Impl::shared_allocation_tracking_release_and_enable();
+  Kokkos::Impl::shared_allocation_tracking_enable();
 
   closure.execute();
 
 #if defined(KOKKOS_ENABLE_PROFILING)
-    if(Kokkos::Profiling::profileLibraryLoaded()) {
-      Kokkos::Profiling::endParallelReduce(kpID);
-    }
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Profiling::endParallelReduce(kpID);
+  }
 #endif
 }
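
The profiling hooks above now derive a default construct name from the functor type and work tag when no label is given. A hedged sketch (Kernel and MyTag are hypothetical names for illustration):

    #include <Kokkos_Core.hpp>

    struct MyTag {};

    struct Kernel {
      // tagged operator: an unlabeled launch falls back to an
      // implementation-defined "Functor/Tag" spelling in tool output
      KOKKOS_INLINE_FUNCTION void operator()( MyTag , const int ) const {}
    };

    void label_example() {
      // unlabeled: profiling tools see typeid-derived "Kernel/MyTag"
      Kokkos::parallel_for( Kokkos::RangePolicy<MyTag>(0, 10), Kernel() );
      // labeled: the explicit string wins
      Kokkos::parallel_for( "my_kernel", Kokkos::RangePolicy<MyTag>(0, 10), Kernel() );
    }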
diff --git a/lib/kokkos/core/src/Kokkos_Serial.hpp b/lib/kokkos/core/src/Kokkos_Serial.hpp
index 73e8ae3030..539761a1f9 100644
--- a/lib/kokkos/core/src/Kokkos_Serial.hpp
+++ b/lib/kokkos/core/src/Kokkos_Serial.hpp
@@ -66,6 +66,7 @@
 #include 
 
+#include 
 
 namespace Kokkos {
@@ -526,6 +527,7 @@ public:
     }
 };
 
+
 /*--------------------------------------------------------------------------*/
 
 template< class FunctorType , class ... Traits >
@@ -604,6 +606,178 @@ public:
     {}
 };
 
+} // namespace Impl
+} // namespace Kokkos
+
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+/* Parallel patterns for Kokkos::Serial with MDRangePolicy */
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class ... Traits >
+class ParallelFor< FunctorType ,
+                   Kokkos::Experimental::MDRangePolicy< Traits ... > ,
+                   Kokkos::Serial
+                 >
+{
+private:
+
+  typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
+  typedef typename MDRangePolicy::impl_range_policy Policy ;
+
+  typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
+
+  const FunctorType   m_functor ;
+  const MDRangePolicy m_mdr_policy ;
+  const Policy        m_policy ;
+
+  void
+  exec() const
+    {
+      const typename Policy::member_type e = m_policy.end();
+      for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
+        iterate_type( m_mdr_policy, m_functor )( i );
+      }
+    }
+
+public:
+
+  inline
+  void execute() const
+    { this->exec(); }
+
+  inline
+  ParallelFor( const FunctorType   & arg_functor
+             , const MDRangePolicy & arg_policy )
+    : m_functor( arg_functor )
+    , m_mdr_policy( arg_policy )
+    , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
+    {}
+};
+
+
+template< class FunctorType , class ReducerType , class ... Traits >
+class ParallelReduce< FunctorType
+                    , Kokkos::Experimental::MDRangePolicy< Traits ... >
+                    , ReducerType
+                    , Kokkos::Serial
+                    >
+{
+private:
+
+  typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
+  typedef typename MDRangePolicy::impl_range_policy Policy ;
+
+  typedef typename MDRangePolicy::work_tag WorkTag ;
+
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+
+  typedef typename ReducerTypeFwd::value_type ValueType;
+
+  typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ;
+
+  typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
+
+  typedef typename Analysis::pointer_type    pointer_type ;
+  typedef typename Analysis::reference_type  reference_type ;
+
+
+  using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy
+                                                                           , FunctorType
+                                                                           , WorkTag
+                                                                           , ValueType
+                                                                           >;
+
+
+  const FunctorType   m_functor ;
+  const MDRangePolicy m_mdr_policy ;
+  const Policy        m_policy ;
+  const ReducerType   m_reducer ;
+  const pointer_type  m_result_ptr ;
+
+  inline
+  void
+  exec( reference_type update ) const
+    {
+      const typename Policy::member_type e = m_policy.end();
+      for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
+        iterate_type( m_mdr_policy, m_functor, update )( i );
+      }
+    }
+
+public:
+
+  inline
+  void execute() const
+    {
+      const size_t pool_reduce_size =
+        Analysis::value_size( ReducerConditional::select(m_functor , m_reducer) );
+      const size_t team_reduce_size  = 0 ; // Never shrinks
+      const size_t team_shared_size  = 0 ; // Never shrinks
+      const size_t thread_local_size = 0 ; // Never shrinks
+
+      serial_resize_thread_team_data( pool_reduce_size
+                                    , team_reduce_size
+                                    , team_shared_size
+                                    , thread_local_size );
+
+      HostThreadTeamData & data = *serial_get_thread_team_data();
+
+      pointer_type ptr =
+        m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local());
+
+      reference_type update =
+        ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
+
+      this-> exec( update );
+
+      Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::
+        final( ReducerConditional::select(m_functor , m_reducer) , ptr );
+    }
+
+  template< class HostViewType >
+  ParallelReduce( const FunctorType  & arg_functor ,
+                  const MDRangePolicy & arg_policy ,
+                  const HostViewType & arg_result_view ,
+                  typename std::enable_if<
+                               Kokkos::is_view< HostViewType >::value &&
+                              !Kokkos::is_reducer_type<ReducerType>::value
+                  ,void*>::type = NULL)
+    : m_functor( arg_functor )
+    , m_mdr_policy( arg_policy )
+    , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
+    , m_reducer( InvalidType() )
+    , m_result_ptr( arg_result_view.data() )
+    {
+      static_assert( Kokkos::is_view< HostViewType >::value
+        , "Kokkos::Serial reduce result must be a View" );
+
+      static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value
+        , "Kokkos::Serial reduce result must be a View in HostSpace" );
+    }
+
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+                , MDRangePolicy       arg_policy
+                , const ReducerType& reducer )
+    : m_functor( arg_functor )
+    , m_mdr_policy( arg_policy )
+    , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
+    , m_reducer( reducer )
+    , m_result_ptr( reducer.view().data() )
+    {
+      /*static_assert( std::is_same< typename ViewType::memory_space
+                                      , Kokkos::HostSpace >::value
+        , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
+    }
+};
+
+
+
 } // namespace Impl
 } // namespace Kokkos
@@ -819,6 +993,60 @@ public:
 
 /*--------------------------------------------------------------------------*/
 /*--------------------------------------------------------------------------*/
 
+namespace Kokkos { namespace Experimental {
+
+template<>
+class UniqueToken< Serial, UniqueTokenScope::Instance>
+{
+public:
+  using execution_space = Serial;
+  using size_type       = int;
+
+  /// \brief create object size for concurrency on the given instance
+  ///
+  /// This object should not be shared between instances
+  UniqueToken( execution_space const& = execution_space() ) noexcept {}
+
+  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
+  inline
+  int size() const noexcept { return 1; }
+
+  /// \brief acquire value such that 0 <= value < size()
+  inline
+  int acquire() const noexcept { return 0; }
+
+  /// \brief release a value acquired by generate
+  inline
+  void release( int ) const noexcept {}
+};
+
+template<>
+class UniqueToken< Serial, UniqueTokenScope::Global>
+{
+public:
+  using execution_space = Serial;
+  using size_type       = int;
+
+  /// \brief create object size for concurrency on the given instance
+  ///
+  /// This object should not be shared between instances
+  UniqueToken( execution_space const& = execution_space() ) noexcept {}
+
+  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
+  inline
+  int size() const noexcept { return 1; }
+
+  /// \brief acquire value such that 0 <= value < size()
+  inline
+  int acquire() const noexcept { return 0; }
+
+  /// \brief release a value acquired by generate
+  inline
+  void release( int ) const noexcept {}
+};
+
+}} // namespace Kokkos::Experimental
+
 #include 
 
 #endif // defined( KOKKOS_ENABLE_SERIAL )
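
A minimal sketch of the UniqueToken protocol using the Serial specializations above (the acquire/release pairing is the point; Serial's pool holds exactly one id):

    #include <Kokkos_Core.hpp>

    void token_example() {
      // instance-scope token (the default scope)
      Kokkos::Experimental::UniqueToken< Kokkos::Serial > token;

      const int id = token.acquire();   // 0 <= id < token.size() == 1
      // ... index per-"thread" scratch with 'id' ...
      token.release( id );
    }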
static_cast(arg_policy.m_dependence.m_task->m_queue) : (queue_type*) 0 ); if ( 0 == queue ) { @@ -530,8 +524,12 @@ public: future_type f ; // Allocate task from memory pool + + const size_t alloc_size = + queue->template spawn_allocation_size< FunctorType >(); + f.m_task = - reinterpret_cast< task_type * >(queue->allocate(sizeof(task_type))); + reinterpret_cast< task_type * >(queue->allocate(alloc_size) ); if ( f.m_task ) { @@ -539,15 +537,17 @@ public: // Reference count starts at two: // +1 for the matching decrement when task is complete // +1 for the future - new ( f.m_task ) - task_type( arg_function - , queue - , arg_policy.m_dependence.m_task /* dependence */ - , 2 /* reference count */ - , int(sizeof(task_type)) /* allocation size */ - , int(arg_policy.m_task_type) - , int(arg_policy.m_priority) - , std::move(arg_functor) ); + new ( f.m_task ) task_type( std::move(arg_functor) ); + + f.m_task->m_apply = arg_function ; + f.m_task->m_queue = queue ; + f.m_task->m_next = arg_policy.m_dependence.m_task ; + f.m_task->m_ref_count = 2 ; + f.m_task->m_alloc_size = alloc_size ; + f.m_task->m_task_type = arg_policy.m_task_type ; + f.m_task->m_priority = arg_policy.m_priority ; + + Kokkos::memory_fence(); // The dependence (if any) is processed immediately // within the schedule function, as such the dependence's @@ -586,6 +586,30 @@ public: // Postcondition: task is in Executing-Respawn state } + template< typename FunctorType > + KOKKOS_FUNCTION static + void + respawn( FunctorType * arg_self + , TaskScheduler const & + , TaskPriority const & arg_priority + ) + { + // Precondition: task is in Executing state + + using value_type = typename FunctorType::value_type ; + using task_type = Impl::TaskBase< execution_space + , value_type + , FunctorType > ; + + task_type * const task = static_cast< task_type * >( arg_self ); + + task->m_priority = static_cast(arg_priority); + + task->add_dependence( (task_base*) 0 ); + + // Postcondition: task is in Executing-Respawn state + } + //---------------------------------------- /**\brief Return a future that is complete * when all input futures are complete. @@ -596,7 +620,7 @@ public: when_all( Future< A1 , A2 > const arg[] , int narg ) { using future_type = Future< execution_space > ; - using task_base = Kokkos::Impl::TaskBase< execution_space , void , void > ; + using task_base = Kokkos::Impl::TaskBase< void , void , void > ; future_type f ; @@ -610,9 +634,9 @@ public: // Increment reference count to track subsequent assignment. 
Kokkos::atomic_increment( &(t->m_ref_count) ); if ( queue == 0 ) { - queue = t->m_queue ; + queue = static_cast< queue_type * >( t->m_queue ); } - else if ( queue != t->m_queue ) { + else if ( queue != static_cast< queue_type * >( t->m_queue ) ) { Kokkos::abort("Kokkos when_all Futures must be in the same scheduler" ); } } @@ -620,28 +644,34 @@ public: if ( queue != 0 ) { - size_t const size = sizeof(task_base) + narg * sizeof(task_base*); + size_t const alloc_size = queue->when_all_allocation_size( narg ); f.m_task = - reinterpret_cast< task_base * >( queue->allocate( size ) ); + reinterpret_cast< task_base * >( queue->allocate( alloc_size ) ); if ( f.m_task ) { // Reference count starts at two: // +1 to match decrement when task completes // +1 for the future - new( f.m_task ) task_base( queue - , 2 /* reference count */ - , size /* allocation size */ - , narg /* dependence count */ - ); + + new( f.m_task ) task_base(); + + f.m_task->m_queue = queue ; + f.m_task->m_ref_count = 2 ; + f.m_task->m_alloc_size = alloc_size ; + f.m_task->m_dep_count = narg ; + f.m_task->m_task_type = task_base::Aggregate ; // Assign dependences, reference counts were already incremented - task_base ** const dep = f.m_task->aggregate_dependences(); + task_base * volatile * const dep = + f.m_task->aggregate_dependences(); for ( int i = 0 ; i < narg ; ++i ) { dep[i] = arg[i].m_task ; } + Kokkos::memory_fence(); + queue->schedule_aggregate( f.m_task ); // this when_all may be processed at any moment } diff --git a/lib/kokkos/core/src/Kokkos_UniqueToken.hpp b/lib/kokkos/core/src/Kokkos_UniqueToken.hpp new file mode 100644 index 0000000000..1ffb07a6db --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_UniqueToken.hpp @@ -0,0 +1,88 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_UNIQUE_TOKEN_HPP +#define KOKKOS_UNIQUE_TOKEN_HPP + +#include <Kokkos_Macros.hpp> + +namespace Kokkos { namespace Experimental { + +enum class UniqueTokenScope : int +{ + Instance, + Global +}; + +/// \brief class to generate unique ids based on the required amount of concurrency +/// +/// This object should behave like a ref-counted object, so that when the last +/// instance is destroyed, resources are freed if needed +template <typename ExecutionSpace, UniqueTokenScope = UniqueTokenScope::Instance> +class UniqueToken +{ +public: + using execution_space = ExecutionSpace; + using size_type = typename execution_space::size_type; + + /// \brief create object size for concurrency on the given instance + /// + /// This object should not be shared between instances + UniqueToken( execution_space const& = execution_space() ); + + /// \brief upper bound for acquired values, i.e. 0 <= value < size() + KOKKOS_INLINE_FUNCTION + size_type size() const ; + + /// \brief acquire value such that 0 <= value < size() + KOKKOS_INLINE_FUNCTION + size_type acquire() const ; + + /// \brief release a value acquired by generate + KOKKOS_INLINE_FUNCTION + void release( size_type ) const ; +}; + +}} // namespace Kokkos::Experimental + +#endif //KOKKOS_UNIQUE_TOKEN_HPP diff --git a/lib/kokkos/core/src/Kokkos_View.hpp b/lib/kokkos/core/src/Kokkos_View.hpp index 3312aa6a96..1754e4a8fb 100644 --- a/lib/kokkos/core/src/Kokkos_View.hpp +++ b/lib/kokkos/core/src/Kokkos_View.hpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -54,11 +54,14 @@ #include <Kokkos_MemoryTraits.hpp> #include <Kokkos_ExecPolicy.hpp> +#if defined(KOKKOS_ENABLE_PROFILING) +#include <impl/Kokkos_Profiling_Interface.hpp> +#endif + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- namespace Kokkos { -namespace Experimental { namespace Impl { template< class DataType > @@ -73,16 +76,6 @@ struct ViewDataAnalysis ; template< class , class ... 
> class ViewMapping { public: enum { is_assignable = false }; }; -} /* namespace Impl */ -} /* namespace Experimental */ -} /* namespace Kokkos */ - -namespace Kokkos { -namespace Impl { - -using Kokkos::Experimental::Impl::ViewMapping ; -using Kokkos::Experimental::Impl::ViewDataAnalysis ; - } /* namespace Impl */ } /* namespace Kokkos */ @@ -1563,12 +1556,12 @@ namespace Kokkos { namespace Impl { inline -void shared_allocation_tracking_claim_and_disable() -{ Kokkos::Impl::SharedAllocationRecord<void,void>::tracking_claim_and_disable(); } +void shared_allocation_tracking_disable() +{ Kokkos::Impl::SharedAllocationRecord<void,void>::tracking_disable(); } inline -void shared_allocation_tracking_release_and_enable() -{ Kokkos::Impl::SharedAllocationRecord<void,void>::tracking_release_and_enable(); } +void shared_allocation_tracking_enable() +{ Kokkos::Impl::SharedAllocationRecord<void,void>::tracking_enable(); } } /* namespace Impl */ } /* namespace Kokkos */ @@ -1795,6 +1788,20 @@ void deep_copy if ( (void *) dst.data() != (void*) src.data() ) { +#if defined(KOKKOS_ENABLE_PROFILING) + if (Kokkos::Profiling::profileLibraryLoaded()) { + const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span(); + Kokkos::Profiling::beginDeepCopy( + Kokkos::Profiling::SpaceHandle(dst_memory_space::name()), + dst.label(), + dst.data(), + Kokkos::Profiling::SpaceHandle(src_memory_space::name()), + src.label(), + src.data(), + nbytes); + } +#endif + // Concern: If overlapping views then a parallel copy will be erroneous. // ... @@ -1882,7 +1889,14 @@ void deep_copy else { Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation"); } - } + +#if defined(KOKKOS_ENABLE_PROFILING) + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endDeepCopy(); + } +#endif + + } // ( (void *) dst.data() != (void*) src.data() ) } } /* namespace Kokkos */ @@ -2249,6 +2263,82 @@ resize( Kokkos::View<T,P...> & v , static_assert( Kokkos::ViewTraits<T,P...>::is_managed , "Can only resize managed views" ); + // Fix #904 by checking dimensions before actually resizing. + // + // Rank is known at compile time, so hopefully the compiler will + // remove branches that are compile-time false. The upcoming "if + // constexpr" language feature would make this certain.
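// [Editor's note: illustrative sketch, not part of the patch; the view name and extents are hypothetical.] With the early returns below, a shape-preserving resize becomes a no-op while a changed extent still reallocates: // Kokkos::View<double*> a( "a", 100 ); // auto p = a.data(); // Kokkos::resize( a, 100 ); // extents unchanged: returns early, p stays valid // Kokkos::resize( a, 200 ); // extents differ: reallocates and remaps the old data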
+ if (view_type::Rank == 1 && + n0 == static_cast<size_t> (v.extent(0))) { + return; + } + if (view_type::Rank == 2 && + n0 == static_cast<size_t> (v.extent(0)) && + n1 == static_cast<size_t> (v.extent(1))) { + return; + } + if (view_type::Rank == 3 && + n0 == static_cast<size_t> (v.extent(0)) && + n1 == static_cast<size_t> (v.extent(1)) && + n2 == static_cast<size_t> (v.extent(2))) { + return; + } + if (view_type::Rank == 4 && + n0 == static_cast<size_t> (v.extent(0)) && + n1 == static_cast<size_t> (v.extent(1)) && + n2 == static_cast<size_t> (v.extent(2)) && + n3 == static_cast<size_t> (v.extent(3))) { + return; + } + if (view_type::Rank == 5 && + n0 == static_cast<size_t> (v.extent(0)) && + n1 == static_cast<size_t> (v.extent(1)) && + n2 == static_cast<size_t> (v.extent(2)) && + n3 == static_cast<size_t> (v.extent(3)) && + n4 == static_cast<size_t> (v.extent(4))) { + return; + } + if (view_type::Rank == 6 && + n0 == static_cast<size_t> (v.extent(0)) && + n1 == static_cast<size_t> (v.extent(1)) && + n2 == static_cast<size_t> (v.extent(2)) && + n3 == static_cast<size_t> (v.extent(3)) && + n4 == static_cast<size_t> (v.extent(4)) && + n5 == static_cast<size_t> (v.extent(5))) { + return; + } + if (view_type::Rank == 7 && + n0 == static_cast<size_t> (v.extent(0)) && + n1 == static_cast<size_t> (v.extent(1)) && + n2 == static_cast<size_t> (v.extent(2)) && + n3 == static_cast<size_t> (v.extent(3)) && + n4 == static_cast<size_t> (v.extent(4)) && + n5 == static_cast<size_t> (v.extent(5)) && + n6 == static_cast<size_t> (v.extent(6))) { + return; + } + if (view_type::Rank == 8 && + n0 == static_cast<size_t> (v.extent(0)) && + n1 == static_cast<size_t> (v.extent(1)) && + n2 == static_cast<size_t> (v.extent(2)) && + n3 == static_cast<size_t> (v.extent(3)) && + n4 == static_cast<size_t> (v.extent(4)) && + n5 == static_cast<size_t> (v.extent(5)) && + n6 == static_cast<size_t> (v.extent(6)) && + n7 == static_cast<size_t> (v.extent(7))) { + return; + } + // If Kokkos ever supports Views of rank > 8, the above code won't + // be incorrect, because avoiding reallocation in resize() is just + // an optimization. + + // TODO (mfh 27 Jun 2017) If the old View has enough space but just + // different dimensions (e.g., if the product of the dimensions, + // including extra space for alignment, will not change), then + // consider just reusing storage. For now, Kokkos always + // reallocates if any of the dimensions change, even if the old View + // has enough space. + view_type v_resized( v.label(), n0, n1, n2, n3, n4, n5, n6, n7 ); Kokkos::Impl::ViewRemap< view_type , view_type >( v_resized , v ); @@ -2317,6 +2407,106 @@ void realloc( Kokkos::View<T,P...> & v , } } /* namespace Kokkos */ +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { namespace Impl { + +template < class Specialize, typename A, typename B > +struct CommonViewValueType; + +template < typename A, typename B > +struct CommonViewValueType< void, A, B > +{ + using value_type = typename std::common_type< A , B >::type; +}; + + +template < class Specialize, class ValueType > +struct CommonViewAllocProp; + +template < class ValueType > +struct CommonViewAllocProp< void, ValueType > +{ + using value_type = ValueType; + + template < class ... Views > + CommonViewAllocProp( const Views & ... ) {} +}; + + +template < class ... Views > +struct DeduceCommonViewAllocProp; + +// Base case must provide types for: +// 1. specialize 2. value_type 3. is_view 4. 
prop_type +template < class FirstView > +struct DeduceCommonViewAllocProp< FirstView > +{ + using specialize = typename FirstView::traits::specialize; + + using value_type = typename FirstView::traits::value_type; + + enum : bool { is_view = is_view< FirstView >::value }; + + using prop_type = CommonViewAllocProp< specialize, value_type >; +}; + + +template < class FirstView, class ... NextViews > +struct DeduceCommonViewAllocProp< FirstView, NextViews... > +{ + using NextTraits = DeduceCommonViewAllocProp< NextViews... >; + + using first_specialize = typename FirstView::traits::specialize; + using first_value_type = typename FirstView::traits::value_type; + + enum : bool { first_is_view = is_view< FirstView >::value }; + + using next_specialize = typename NextTraits::specialize; + using next_value_type = typename NextTraits::value_type; + + enum : bool { next_is_view = NextTraits::is_view }; + + // common types + + // determine specialize type + // if first and next specialize differ, but are not the same specialize, error out + static_assert( !(!std::is_same< first_specialize, next_specialize >::value && !std::is_same< first_specialize, void>::value && !std::is_same< void, next_specialize >::value) , "Kokkos DeduceCommonViewAllocProp ERROR: Only one non-void specialize trait allowed" ); + + // otherwise choose non-void specialize if either/both are non-void + using specialize = typename std::conditional< std::is_same< first_specialize, next_specialize >::value + , first_specialize + , typename std::conditional< ( std::is_same< first_specialize, void >::value + && !std::is_same< next_specialize, void >::value) + , next_specialize + , first_specialize + >::type + >::type; + + using value_type = typename CommonViewValueType< specialize, first_value_type, next_value_type >::value_type; + + enum : bool { is_view = (first_is_view && next_is_view) }; + + using prop_type = CommonViewAllocProp< specialize, value_type >; +}; + +} // end namespace Impl + +template < class ... Views > +using DeducedCommonPropsType = typename Impl::DeduceCommonViewAllocProp<Views...>::prop_type ; + +// User function +template < class ... Views > +DeducedCommonPropsType<Views...> +common_view_alloc_prop( Views const & ... views ) +{ + return DeducedCommonPropsType<Views...>( views... ); +} + +} // namespace Kokkos + + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- // For backward compatibility: @@ -2350,6 +2540,9 @@ using Kokkos::Impl::WithoutInitializing_t ; using Kokkos::Impl::AllowPadding_t ; using Kokkos::Impl::SharedAllocationRecord ; using Kokkos::Impl::SharedAllocationTracker ; +using Kokkos::Impl::ViewMapping ; +using Kokkos::Impl::ViewDataAnalysis ; + } /* namespace Impl */ } /* namespace Experimental */ diff --git a/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp new file mode 100644 index 0000000000..58b0f72f51 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp @@ -0,0 +1,265 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_WORKGRAPHPOLICY_HPP +#define KOKKOS_WORKGRAPHPOLICY_HPP + +namespace Kokkos { +namespace Impl { +namespace Experimental { + +template< class functor_type , class execution_space, class ... policy_args > +class WorkGraphExec; + +}}} // namespace Kokkos::Impl::Experimental + +namespace Kokkos { +namespace Experimental { + +template< class ... 
Properties > +class WorkGraphPolicy +{ +public: + + using self_type = WorkGraphPolicy<Properties ...>; + using traits = Kokkos::Impl::PolicyTraits<Properties ...>; + using index_type = typename traits::index_type; + using execution_space = typename traits::execution_space; + using work_tag = typename traits::work_tag; + using memory_space = typename execution_space::memory_space; + using graph_type = Kokkos::Experimental::Crs<index_type, execution_space, void, index_type>; + using member_type = index_type; + +private: + + graph_type m_graph; + + using ints_type = Kokkos::View<std::int32_t*, memory_space>; + using range_type = Kokkos::pair<std::int32_t, std::int32_t>; + using ranges_type = Kokkos::View<range_type*, memory_space>; + const std::int32_t m_total_work; + ints_type m_counts; + ints_type m_queue; + ranges_type m_ranges; + +public: + + struct TagZeroRanges {}; + KOKKOS_INLINE_FUNCTION + void operator()(TagZeroRanges, std::int32_t i) const { + m_ranges[i] = range_type(0, 0); + } + void zero_ranges() { + using policy_type = RangePolicy<std::int32_t, TagZeroRanges, execution_space>; + using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>; + const closure_type closure(*this, policy_type(0, 1)); + closure.execute(); + execution_space::fence(); + } + + struct TagFillQueue {}; + KOKKOS_INLINE_FUNCTION + void operator()(TagFillQueue, std::int32_t i) const { + if (*((volatile std::int32_t*)(&m_counts(i))) == 0) push_work(i); + } + void fill_queue() { + using policy_type = RangePolicy<std::int32_t, TagFillQueue, execution_space>; + using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>; + const closure_type closure(*this, policy_type(0, m_total_work)); + closure.execute(); + execution_space::fence(); + } + +private: + + inline + void setup() { + if (m_graph.numRows() > std::numeric_limits<std::int32_t>::max()) { + Kokkos::abort("WorkGraphPolicy work must be indexable using int32_t"); + } + get_crs_transpose_counts(m_counts, m_graph); + m_queue = ints_type(ViewAllocateWithoutInitializing("queue"), m_total_work); + deep_copy(m_queue, std::int32_t(-1)); + m_ranges = ranges_type("ranges", 1); + fill_queue(); + } + + KOKKOS_INLINE_FUNCTION + std::int32_t pop_work() const { + range_type w(-1,-1); + while (true) { + const range_type w_new( w.first + 1 , w.second ); + w = atomic_compare_exchange( &m_ranges(0) , w , w_new ); + if ( w.first < w.second ) { // there was work in the queue + if ( w_new.first == w.first + 1 && w_new.second == w.second ) { + // we got a work item + std::int32_t i; + // the push_work function may have incremented the end counter + // but not yet written the work index into the queue. + // wait until the entry is valid. 
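// [Editor's note: explanatory sketch, not part of the patch.] m_ranges(0) packs the queue state into a (begin, end) cursor pair: 'begin' is the next slot to pop, 'end' the next slot to push, and both only grow. pop_work attempts atomic_compare_exchange( &m_ranges(0), (begin,end), (begin+1,end) ), while push_work attempts the symmetric (begin,end) -> (begin,end+1) update and only then stores the work index into m_queue(end). A popper can therefore claim slot 'begin' before the pusher's store lands, observe the initial -1, and must spin until the entry becomes valid, which is exactly what the loop below does.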
+ while ( -1 == ( i = *((volatile std::int32_t*)(&m_queue( w.first ))) ) ); + return i; + } // we got a work item + } else { // there was no work in the queue +#ifdef KOKKOS_DEBUG + if ( w_new.first == w.first + 1 && w_new.second == w.second ) { + Kokkos::abort("bug in pop_work"); + } +#endif + if (w.first == m_total_work) { // all work is done + return -1; + } else { // need to wait for more work to be pushed + // take a guess that one work item will be pushed + // the key thing is we can't leave (w) alone, because + // otherwise the next compare_exchange may succeed in + // popping work from an empty queue + w.second++; + } + } // there was no work in the queue + } // while (true) + } + + KOKKOS_INLINE_FUNCTION + void push_work(std::int32_t i) const { + range_type w(-1,-1); + while (true) { + const range_type w_new( w.first , w.second + 1 ); + // try to increment the end counter + w = atomic_compare_exchange( &m_ranges(0) , w , w_new ); + // stop trying if the increment was successful + if ( w.first == w_new.first && w.second + 1 == w_new.second ) break; + } + // write the work index into the claimed spot in the queue + *((volatile std::int32_t*)(&m_queue( w.second ))) = i; + // push this write out into the memory system + memory_fence(); + } + + template< class functor_type , class execution_space, class ... policy_args > + friend class Kokkos::Impl::Experimental::WorkGraphExec; + +public: + + WorkGraphPolicy(graph_type arg_graph) + : m_graph(arg_graph) + , m_total_work( arg_graph.numRows() ) + { + setup(); + } + +}; + +}} // namespace Kokkos::Experimental + +/*--------------------------------------------------------------------------*/ + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { +namespace Experimental { + +template< class functor_type , class execution_space, class ... policy_args > +class WorkGraphExec +{ + public: + + using self_type = WorkGraphExec< functor_type, execution_space, policy_args ... >; + using policy_type = Kokkos::Experimental::WorkGraphPolicy< policy_args ... 
>; + using member_type = typename policy_type::member_type; + using memory_space = typename execution_space::memory_space; + + protected: + + const functor_type m_functor; + const policy_type m_policy; + + protected: + + KOKKOS_INLINE_FUNCTION + std::int32_t before_work() const { + return m_policy.pop_work(); + } + + KOKKOS_INLINE_FUNCTION + void after_work(std::int32_t i) const { + /* fence any writes that were done by the work item itself + (usually writing its result to global memory) */ + memory_fence(); + const std::int32_t begin = m_policy.m_graph.row_map( i ); + const std::int32_t end = m_policy.m_graph.row_map( i + 1 ); + for (std::int32_t j = begin; j < end; ++j) { + const std::int32_t next = m_policy.m_graph.entries( j ); + const std::int32_t old_count = atomic_fetch_add( &(m_policy.m_counts(next)), -1 ); + if ( old_count == 1 ) m_policy.push_work( next ); + } + } + + inline + WorkGraphExec( const functor_type & arg_functor + , const policy_type & arg_policy ) + : m_functor( arg_functor ) + , m_policy( arg_policy ) + { + } +}; + +}}} // namespace Kokkos::Impl::Experimental + +#ifdef KOKKOS_ENABLE_SERIAL +#include "impl/Kokkos_Serial_WorkGraphPolicy.hpp" +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +#include "OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp" +#endif + +#ifdef KOKKOS_ENABLE_CUDA +#include "Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp" +#endif + +#ifdef KOKKOS_ENABLE_THREADS +#include "Threads/Kokkos_Threads_WorkGraphPolicy.hpp" +#endif + +#endif /* #define KOKKOS_WORKGRAPHPOLICY_HPP */ diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp index 4e0ea93920..915fbe52c1 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp @@ -45,75 +45,100 @@ #if defined( KOKKOS_ENABLE_OPENMP ) #include +#include + #include #include #include + #include + #include -#include #include #include namespace Kokkos { namespace Impl { -namespace { -KOKKOS_INLINE_FUNCTION -int kokkos_omp_in_parallel(); +int g_openmp_hardware_max_threads = 1; -int kokkos_omp_in_critical_region = ( Kokkos::HostSpace::register_in_parallel( kokkos_omp_in_parallel ) , 0 ); +__thread int t_openmp_hardware_id = 0; +__thread Impl::OpenMPExec * t_openmp_instance = nullptr; -KOKKOS_INLINE_FUNCTION -int kokkos_omp_in_parallel() +void OpenMPExec::validate_partition( const int nthreads + , int & num_partitions + , int & partition_size + ) { -#ifndef __CUDA_ARCH__ - return omp_in_parallel() && ! 
kokkos_omp_in_critical_region ; -#else - return 0; -#endif + if (nthreads == 1) { + num_partitions = 1; + partition_size = 1; + } + else if( num_partitions < 1 && partition_size < 1) { + int idle = nthreads; + for (int np = 2; np <= nthreads ; ++np) { + for (int ps = 1; ps <= nthreads/np; ++ps) { + if (nthreads - np*ps < idle) { + idle = nthreads - np*ps; + num_partitions = np; + partition_size = ps; + } + if (idle == 0) { + break; + } + } + } + } + else if( num_partitions < 1 && partition_size > 0 ) { + if ( partition_size <= nthreads ) { + num_partitions = nthreads / partition_size; + } + else { + num_partitions = 1; + partition_size = nthreads; + } + } + else if( num_partitions > 0 && partition_size < 1 ) { + if ( num_partitions <= nthreads ) { + partition_size = nthreads / num_partitions; + } + else { + num_partitions = nthreads; + partition_size = 1; + } + } + else if ( num_partitions * partition_size > nthreads ) { + int idle = nthreads; + const int NP = num_partitions; + const int PS = partition_size; + for (int np = NP; np > 0; --np) { + for (int ps = PS; ps > 0; --ps) { + if ( (np*ps <= nthreads) + && (nthreads - np*ps < idle) ) { + idle = nthreads - np*ps; + num_partitions = np; + partition_size = ps; + } + if (idle == 0) { + break; + } + } + } + } + } -bool s_using_hwloc = false; - -} // namespace -} // namespace Impl -} // namespace Kokkos - - -namespace Kokkos { -namespace Impl { - -int OpenMPExec::m_map_rank[ OpenMPExec::MAX_THREAD_COUNT ] = { 0 }; - -int OpenMPExec::m_pool_topo[ 4 ] = { 0 }; - -HostThreadTeamData * OpenMPExec::m_pool[ OpenMPExec::MAX_THREAD_COUNT ] = { 0 }; - -void OpenMPExec::verify_is_process( const char * const label ) +void OpenMPExec::verify_is_master( const char * const label ) { - if ( omp_in_parallel() ) { + if ( !t_openmp_instance ) + { std::string msg( label ); - msg.append( " ERROR: in parallel" ); + msg.append( " ERROR: in parallel or not initialized" ); Kokkos::Impl::throw_runtime_exception( msg ); } } -void OpenMPExec::verify_initialized( const char * const label ) -{ - if ( 0 == m_pool[0] ) { - std::string msg( label ); - msg.append( " ERROR: not initialized" ); - Kokkos::Impl::throw_runtime_exception( msg ); - } - - if ( omp_get_max_threads() != Kokkos::OpenMP::thread_pool_size(0) ) { - std::string msg( label ); - msg.append( " ERROR: Initialized but threads modified inappropriately" ); - Kokkos::Impl::throw_runtime_exception( msg ); - } - -} } // namespace Impl } // namespace Kokkos @@ -133,11 +158,11 @@ void OpenMPExec::clear_thread_data() const int old_alloc_bytes = m_pool[0] ? 
( member_bytes + m_pool[0]->scratch_bytes() ) : 0 ; - Kokkos::HostSpace space ; + OpenMP::memory_space space ; -#pragma omp parallel + #pragma omp parallel num_threads( m_pool_size ) { - const int rank = m_map_rank[ omp_get_thread_num() ]; + const int rank = omp_get_thread_num(); if ( 0 != m_pool[rank] ) { @@ -189,13 +214,13 @@ void OpenMPExec::resize_thread_data( size_t pool_reduce_bytes , team_shared_bytes , thread_local_bytes ); - const int pool_size = omp_get_max_threads(); + OpenMP::memory_space space ; - Kokkos::HostSpace space ; + memory_fence(); -#pragma omp parallel + #pragma omp parallel num_threads(m_pool_size) { - const int rank = m_map_rank[ omp_get_thread_num() ]; + const int rank = omp_get_thread_num(); if ( 0 != m_pool[rank] ) { @@ -214,11 +239,14 @@ void OpenMPExec::resize_thread_data( size_t pool_reduce_bytes , pool_reduce_bytes , team_reduce_bytes , team_shared_bytes - , thread_local_bytes ); + , thread_local_bytes + ); + + memory_fence(); } /* END #pragma omp parallel */ - HostThreadTeamData::organize_pool( m_pool , pool_size ); + HostThreadTeamData::organize_pool( m_pool , m_pool_size ); } } @@ -232,16 +260,8 @@ namespace Kokkos { //---------------------------------------------------------------------------- -int OpenMP::is_initialized() -{ return 0 != Impl::OpenMPExec::m_pool[0]; } - -void OpenMP::initialize( unsigned thread_count , - unsigned use_numa_count , - unsigned use_cores_per_numa ) +int OpenMP::get_current_max_threads() noexcept { - // Before any other call to OMP query the maximum number of threads - // and save the value for re-initialization unit testing. - // Using omp_get_max_threads(); is problematic in conjunction with // Hwloc on Intel (essentially an initial call to the OpenMP runtime // without a parallel region before will set a process mask for a single core @@ -250,110 +270,99 @@ void OpenMP::initialize( unsigned thread_count , // the thread masks. The intend seems to be to make serial code run fast, if you // compile with OpenMP enabled but don't actually use parallel regions or so // static int omp_max_threads = omp_get_max_threads(); - int nthreads = 0; + + int count = 0; #pragma omp parallel { #pragma omp atomic - nthreads++; + ++count; } + return count; +} - static int omp_max_threads = nthreads; - - const bool is_initialized = 0 != Impl::OpenMPExec::m_pool[0] ; - - bool thread_spawn_failed = false ; - - if ( ! is_initialized ) { - - // Use hwloc thread pinning if concerned with locality. - // If spreading threads across multiple NUMA regions. - // If hyperthreading is enabled. - Impl::s_using_hwloc = hwloc::available() && ( - ( 1 < Kokkos::hwloc::get_available_numa_count() ) || - ( 1 < Kokkos::hwloc::get_available_threads_per_core() ) ); - - std::pair threads_coord[ Impl::OpenMPExec::MAX_THREAD_COUNT ]; - - // If hwloc available then use it's maximum value. - - if ( thread_count == 0 ) { - thread_count = Impl::s_using_hwloc - ? 
Kokkos::hwloc::get_available_numa_count() * - Kokkos::hwloc::get_available_cores_per_numa() * - Kokkos::hwloc::get_available_threads_per_core() - : omp_max_threads ; - } - - if(Impl::s_using_hwloc) - hwloc::thread_mapping( "Kokkos::OpenMP::initialize" , - false /* do not allow asynchronous */ , - thread_count , - use_numa_count , - use_cores_per_numa , - threads_coord ); - - // Spawn threads: - - omp_set_num_threads( thread_count ); - - // Verify OMP interaction: - if ( int(thread_count) != omp_get_max_threads() ) { - thread_spawn_failed = true ; - } - - // Verify spawning and bind threads: -#pragma omp parallel - { -#pragma omp critical - { - if ( int(thread_count) != omp_get_num_threads() ) { - thread_spawn_failed = true ; - } - - // Call to 'bind_this_thread' is not thread safe so place this whole block in a critical region. - // Call to 'new' may not be thread safe as well. - - const unsigned omp_rank = omp_get_thread_num(); - const unsigned thread_r = Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads() - ? Kokkos::hwloc::bind_this_thread( thread_count , threads_coord ) - : omp_rank ; - - Impl::OpenMPExec::m_map_rank[ omp_rank ] = thread_r ; - } -/* END #pragma omp critical */ - } -/* END #pragma omp parallel */ - - if ( ! thread_spawn_failed ) { - Impl::OpenMPExec::m_pool_topo[0] = thread_count ; - Impl::OpenMPExec::m_pool_topo[1] = Impl::s_using_hwloc ? thread_count / use_numa_count : thread_count; - Impl::OpenMPExec::m_pool_topo[2] = Impl::s_using_hwloc ? thread_count / ( use_numa_count * use_cores_per_numa ) : 1; - - // New, unified host thread team data: - { - size_t pool_reduce_bytes = 32 * thread_count ; - size_t team_reduce_bytes = 32 * thread_count ; - size_t team_shared_bytes = 1024 * thread_count ; - size_t thread_local_bytes = 1024 ; - - Impl::OpenMPExec::resize_thread_data( pool_reduce_bytes - , team_reduce_bytes - , team_shared_bytes - , thread_local_bytes - ); - } - } - } - - if ( is_initialized || thread_spawn_failed ) { - std::string msg("Kokkos::OpenMP::initialize ERROR"); - - if ( is_initialized ) { msg.append(" : already initialized"); } - if ( thread_spawn_failed ) { msg.append(" : failed spawning threads"); } +void OpenMP::initialize( int thread_count ) +{ + if ( omp_in_parallel() ) { + std::string msg("Kokkos::OpenMP::initialize ERROR : in parallel"); Kokkos::Impl::throw_runtime_exception(msg); } + if ( Impl::t_openmp_instance ) + { + finalize(); + } + + { + if (nullptr == std::getenv("OMP_PROC_BIND") ) { + printf("Kokkos::OpenMP::initialize WARNING: OMP_PROC_BIND environment variable not set\n"); + printf(" In general, for best performance with OpenMP 4.0 or better set OMP_PROC_BIND=spread and OMP_PLACES=threads\n"); + printf(" For best performance with OpenMP 3.1 set OMP_PROC_BIND=true\n"); + printf(" For unit testing set OMP_PROC_BIND=false\n"); + } + + OpenMP::memory_space space ; + + // Before any other call to OMP query the maximum number of threads + // and save the value for re-initialization unit testing. 
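// [Editor's note: usage sketch, not part of the patch; the thread count is illustrative.] This path expects the binding hints warned about above to be exported before launch (e.g. OMP_PROC_BIND=spread and OMP_PLACES=threads) and is driven from host code such as: // Kokkos::OpenMP::initialize( 16 ); // request 16 threads // ... run parallel kernels ... // Kokkos::OpenMP::finalize(); // Per the branches below, a negative thread_count keeps the queried hardware maximum, zero resets it to the hwloc-derived process count, and a positive value overrides it via omp_set_num_threads().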
+ + Impl::g_openmp_hardware_max_threads = get_current_max_threads(); + + int process_num_threads = Impl::g_openmp_hardware_max_threads; + + if ( Kokkos::hwloc::available() ) { + process_num_threads = Kokkos::hwloc::get_available_numa_count() + * Kokkos::hwloc::get_available_cores_per_numa() + * Kokkos::hwloc::get_available_threads_per_core(); + } + + // if thread_count < 0, use g_openmp_hardware_max_threads; + // if thread_count == 0, set g_openmp_hardware_max_threads to process_num_threads + // if thread_count > 0, set g_openmp_hardware_max_threads to thread_count + if (thread_count < 0 ) { + thread_count = Impl::g_openmp_hardware_max_threads; + } + else if( thread_count == 0 && Impl::g_openmp_hardware_max_threads != process_num_threads ) { + Impl::g_openmp_hardware_max_threads = process_num_threads; + omp_set_num_threads(Impl::g_openmp_hardware_max_threads); + } + else { + if( thread_count > process_num_threads ) { + printf( "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores.\n"); + printf( " process threads available : %3d, requested thread : %3d\n", process_num_threads, thread_count ); + } + Impl::g_openmp_hardware_max_threads = thread_count; + omp_set_num_threads(Impl::g_openmp_hardware_max_threads); + } + + // setup thread local + #pragma omp parallel num_threads(Impl::g_openmp_hardware_max_threads) + { + Impl::t_openmp_instance = nullptr; + Impl::t_openmp_hardware_id = omp_get_thread_num(); + Impl::SharedAllocationRecord< void, void >::tracking_enable(); + } + + void * const ptr = space.allocate( sizeof(Impl::OpenMPExec) ); + + Impl::t_openmp_instance = new (ptr) Impl::OpenMPExec( Impl::g_openmp_hardware_max_threads ); + + // New, unified host thread team data: + { + size_t pool_reduce_bytes = 32 * thread_count ; + size_t team_reduce_bytes = 32 * thread_count ; + size_t team_shared_bytes = 1024 * thread_count ; + size_t thread_local_bytes = 1024 ; + + Impl::t_openmp_instance->resize_thread_data( pool_reduce_bytes + , team_reduce_bytes + , team_shared_bytes + , thread_local_bytes + ); + } + } + + // Check for over-subscription //if( Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node() ) { // std::cout << "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl; @@ -373,20 +382,38 @@ void OpenMP::initialize( unsigned thread_count , void OpenMP::finalize() { - Impl::OpenMPExec::verify_initialized( "OpenMP::finalize" ); - Impl::OpenMPExec::verify_is_process( "OpenMP::finalize" ); + if ( omp_in_parallel() ) + { + std::string msg("Kokkos::OpenMP::finalize ERROR "); + if( !Impl::t_openmp_instance ) msg.append(": not initialized"); + if( omp_in_parallel() ) msg.append(": in parallel"); + Kokkos::Impl::throw_runtime_exception(msg); + } - // New, unified host thread team data: - Impl::OpenMPExec::clear_thread_data(); + if ( Impl::t_openmp_instance ) { - Impl::OpenMPExec::m_pool_topo[0] = 0 ; - Impl::OpenMPExec::m_pool_topo[1] = 0 ; - Impl::OpenMPExec::m_pool_topo[2] = 0 ; + const int nthreads = Impl::t_openmp_instance->m_pool_size <= Impl::g_openmp_hardware_max_threads + ? 
Impl::g_openmp_hardware_max_threads + : Impl::t_openmp_instance->m_pool_size; - omp_set_num_threads(1); + using Exec = Impl::OpenMPExec; + Exec * instance = Impl::t_openmp_instance; + instance->~Exec(); - if ( Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads() ) { - hwloc::unbind_this_thread(); + OpenMP::memory_space space; + space.deallocate( instance, sizeof(Exec) ); + + #pragma omp parallel num_threads(nthreads) + { + Impl::t_openmp_hardware_id = 0; + Impl::t_openmp_instance = nullptr; + Impl::SharedAllocationRecord< void, void >::tracking_disable(); + } + + // allow main thread to track + Impl::SharedAllocationRecord< void, void >::tracking_enable(); + + Impl::g_openmp_hardware_max_threads = 1; } #if defined(KOKKOS_ENABLE_PROFILING) @@ -396,70 +423,48 @@ void OpenMP::finalize() //---------------------------------------------------------------------------- -void OpenMP::print_configuration( std::ostream & s , const bool detail ) +void OpenMP::print_configuration( std::ostream & s , const bool verbose ) { - Impl::OpenMPExec::verify_is_process( "OpenMP::print_configuration" ); - s << "Kokkos::OpenMP" ; -#if defined( KOKKOS_ENABLE_OPENMP ) - s << " KOKKOS_ENABLE_OPENMP" ; -#endif -#if defined( KOKKOS_ENABLE_HWLOC ) - - const unsigned numa_count_ = Kokkos::hwloc::get_available_numa_count(); - const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); - const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core(); - - s << " hwloc[" << numa_count_ << "x" << cores_per_numa << "x" << threads_per_core << "]" - << " hwloc_binding_" << ( Impl::s_using_hwloc ? "enabled" : "disabled" ) - ; -#endif - - const bool is_initialized = 0 != Impl::OpenMPExec::m_pool[0] ; + const bool is_initialized = Impl::t_openmp_instance != nullptr; if ( is_initialized ) { - const int numa_count = Kokkos::Impl::OpenMPExec::m_pool_topo[0] / Kokkos::Impl::OpenMPExec::m_pool_topo[1] ; - const int core_per_numa = Kokkos::Impl::OpenMPExec::m_pool_topo[1] / Kokkos::Impl::OpenMPExec::m_pool_topo[2] ; - const int thread_per_core = Kokkos::Impl::OpenMPExec::m_pool_topo[2] ; + Impl::OpenMPExec::verify_is_master( "OpenMP::print_configuration" ); + + const int numa_count = 1; + const int core_per_numa = Impl::g_openmp_hardware_max_threads; + const int thread_per_core = 1; s << " thread_pool_topology[ " << numa_count << " x " << core_per_numa << " x " << thread_per_core << " ]" << std::endl ; - - if ( detail ) { - std::vector< std::pair<unsigned,unsigned> > coord( Kokkos::Impl::OpenMPExec::m_pool_topo[0] ); - -#pragma omp parallel - { -#pragma omp critical - { - coord[ omp_get_thread_num() ] = hwloc::get_this_thread_coordinate(); - } -/* END #pragma omp critical */ - } -/* END #pragma omp parallel */ - - for ( unsigned i = 0 ; i < coord.size() ; ++i ) { - s << " thread omp_rank[" << i << "]" - << " kokkos_rank[" << Impl::OpenMPExec::m_map_rank[ i ] << "]" - << " hwloc_coord[" << coord[i].first << "." << coord[i].second << "]" - << std::endl ; - } - } } else { s << " not initialized" << std::endl ; } } +std::vector<OpenMP> OpenMP::partition(...) +{ return std::vector<OpenMP>(1); } + +OpenMP OpenMP::create_instance(...) 
{ return OpenMP(); } + + +#if !defined( KOKKOS_DISABLE_DEPRECATED ) + int OpenMP::concurrency() { - return thread_pool_size(0); + return Impl::g_openmp_hardware_max_threads; } -const char* OpenMP::name() { return "OpenMP"; } +void OpenMP::initialize( int thread_count , int, int ) +{ + initialize(thread_count); +} + +#endif } // namespace Kokkos diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp index 75b7f5da4a..37d2ac8318 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp @@ -47,6 +47,10 @@ #include #if defined( KOKKOS_ENABLE_OPENMP ) +#if !defined(_OPENMP) +#error "You enabled Kokkos OpenMP support without enabling OpenMP in the compiler!" +#endif + #include #include @@ -54,6 +58,8 @@ #include +#include + #include #include #include @@ -63,8 +69,14 @@ //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -namespace Kokkos { -namespace Impl { +namespace Kokkos { namespace Impl { + +class OpenMPExec; + +extern int g_openmp_hardware_max_threads; + +extern __thread int t_openmp_hardware_id; +extern __thread OpenMPExec * t_openmp_instance; //---------------------------------------------------------------------------- /** \brief Data for OpenMP thread execution */ @@ -74,279 +86,279 @@ public: friend class Kokkos::OpenMP ; - enum { MAX_THREAD_COUNT = 4096 }; + enum { MAX_THREAD_COUNT = 512 }; + + void clear_thread_data(); + + static void validate_partition( const int nthreads + , int & num_partitions + , int & partition_size + ); private: + OpenMPExec( int arg_pool_size ) + : m_pool_size{ arg_pool_size } + , m_level{ omp_get_level() } + , m_pool() + {} - static int m_pool_topo[ 4 ]; - static int m_map_rank[ MAX_THREAD_COUNT ]; + ~OpenMPExec() + { + clear_thread_data(); + } - static HostThreadTeamData * m_pool[ MAX_THREAD_COUNT ]; + int m_pool_size; + int m_level; - static - void clear_thread_data(); + HostThreadTeamData * m_pool[ MAX_THREAD_COUNT ]; public: - // Topology of a cache coherent thread pool: - // TOTAL = NUMA x GRAIN - // pool_size( depth = 0 ) - // pool_size(0) = total number of threads - // pool_size(1) = number of threads per NUMA - // pool_size(2) = number of threads sharing finest grain memory hierarchy + static void verify_is_master( const char * const ); - inline static - int pool_size( int depth = 0 ) { return m_pool_topo[ depth ]; } - - static void finalize(); - - static void initialize( const unsigned team_count , - const unsigned threads_per_team , - const unsigned numa_count , - const unsigned cores_per_numa ); - - static void verify_is_process( const char * const ); - static void verify_initialized( const char * const ); - - - static void resize_thread_data( size_t pool_reduce_bytes , size_t team_reduce_bytes , size_t team_shared_bytes , size_t thread_local_bytes ); - inline static - HostThreadTeamData * get_thread_data() noexcept - { return m_pool[ m_map_rank[ omp_get_thread_num() ] ]; } + inline + HostThreadTeamData * get_thread_data() const noexcept + { return m_pool[ m_level == omp_get_level() ? 
0 : omp_get_thread_num() ]; } - inline static - HostThreadTeamData * get_thread_data( int i ) noexcept - { return m_pool[i]; } + inline + HostThreadTeamData * get_thread_data( int i ) const noexcept + { return m_pool[i]; } }; -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template< class ... Properties > -class TeamPolicyInternal< Kokkos::OpenMP, Properties ... >: public PolicyTraits -{ -public: - - //! Tag this class as a kokkos execution policy - typedef TeamPolicyInternal execution_policy ; - - typedef PolicyTraits traits; - - TeamPolicyInternal& operator = (const TeamPolicyInternal& p) { - m_league_size = p.m_league_size; - m_team_size = p.m_team_size; - m_team_alloc = p.m_team_alloc; - m_team_iter = p.m_team_iter; - m_team_scratch_size[0] = p.m_team_scratch_size[0]; - m_thread_scratch_size[0] = p.m_thread_scratch_size[0]; - m_team_scratch_size[1] = p.m_team_scratch_size[1]; - m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; - m_chunk_size = p.m_chunk_size; - return *this; - } - - //---------------------------------------- - - template< class FunctorType > - inline static - int team_size_max( const FunctorType & ) { - int pool_size = traits::execution_space::thread_pool_size(1); - int max_host_team_size = Impl::HostThreadTeamData::max_team_members; - return pool_size - inline static - int team_size_recommended( const FunctorType & ) - { return traits::execution_space::thread_pool_size(2); } - - template< class FunctorType > - inline static - int team_size_recommended( const FunctorType &, const int& ) - { return traits::execution_space::thread_pool_size(2); } - - //---------------------------------------- - -private: - - int m_league_size ; - int m_team_size ; - int m_team_alloc ; - int m_team_iter ; - - size_t m_team_scratch_size[2]; - size_t m_thread_scratch_size[2]; - - int m_chunk_size; - - inline void init( const int league_size_request - , const int team_size_request ) - { - const int pool_size = traits::execution_space::thread_pool_size(0); - const int max_host_team_size = Impl::HostThreadTeamData::max_team_members; - const int team_max = pool_size 0) { - if(!Impl::is_integral_power_of_two( m_chunk_size )) - Kokkos::abort("TeamPolicy blocking granularity must be power of two" ); - } - - int new_chunk_size = 1; - while(new_chunk_size*100*concurrency < m_league_size) - new_chunk_size *= 2; - if(new_chunk_size < 128) { - new_chunk_size = 1; - while( (new_chunk_size*40*concurrency < m_league_size ) && (new_chunk_size<128) ) - new_chunk_size*=2; - } - m_chunk_size = new_chunk_size; - } - -public: - typedef Impl::HostThreadTeamMember< Kokkos::OpenMP > member_type ; -}; -} // namespace Impl - -} // namespace Kokkos +}} // namespace Kokkos::Impl //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- namespace Kokkos { -inline -bool OpenMP::in_parallel() -{ return omp_in_parallel(); } +inline OpenMP::OpenMP() noexcept +{} inline -int OpenMP::thread_pool_size( int depth ) +bool OpenMP::is_initialized() noexcept +{ return Impl::t_openmp_instance != nullptr; } + +inline +bool OpenMP::in_parallel( OpenMP const& ) noexcept { - return Impl::OpenMPExec::pool_size(depth); + //t_openmp_instance is only non-null on a master thread + return !Impl::t_openmp_instance + || 
Impl::t_openmp_instance->m_level < omp_get_level() + ; +} + +inline +int OpenMP::thread_pool_size() noexcept +{ + return OpenMP::in_parallel() + ? omp_get_num_threads() + : Impl::t_openmp_instance->m_pool_size + ; } KOKKOS_INLINE_FUNCTION -int OpenMP::thread_pool_rank() +int OpenMP::thread_pool_rank() noexcept { #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) - return Impl::OpenMPExec::m_map_rank[ omp_get_thread_num() ]; + return Impl::t_openmp_instance ? 0 : omp_get_thread_num(); #else return -1 ; #endif } +inline +void OpenMP::fence( OpenMP const& instance ) noexcept {} + +inline +bool OpenMP::is_asynchronous( OpenMP const& instance ) noexcept +{ return false; } + +template <typename F> +void OpenMP::partition_master( F const& f + , int num_partitions + , int partition_size + ) +{ + if (omp_get_nested()) { + using Exec = Impl::OpenMPExec; + + Exec * prev_instance = Impl::t_openmp_instance; + + Exec::validate_partition( prev_instance->m_pool_size, num_partitions, partition_size ); + + OpenMP::memory_space space; + + #pragma omp parallel num_threads(num_partitions) + { + void * const ptr = space.allocate( sizeof(Exec) ); + + Impl::t_openmp_instance = new (ptr) Exec( partition_size ); + + size_t pool_reduce_bytes = 32 * partition_size ; + size_t team_reduce_bytes = 32 * partition_size ; + size_t team_shared_bytes = 1024 * partition_size ; + size_t thread_local_bytes = 1024 ; + + Impl::t_openmp_instance->resize_thread_data( pool_reduce_bytes + , team_reduce_bytes + , team_shared_bytes + , thread_local_bytes + ); + + f( omp_get_thread_num(), omp_get_num_threads() ); + + Impl::t_openmp_instance->~Exec(); + space.deallocate( Impl::t_openmp_instance, sizeof(Exec) ); + Impl::t_openmp_instance = nullptr; + } + + Impl::t_openmp_instance = prev_instance; + } + else { + // nested openmp not enabled + f(0,1); + } +} + + +namespace Experimental { + +template<> +class MasterLock<OpenMP> +{ +public: + void lock() { omp_set_lock( &m_lock ); } + void unlock() { omp_unset_lock( &m_lock ); } + bool try_lock() { return static_cast<bool>(omp_test_lock( &m_lock )); } + + MasterLock() { omp_init_lock( &m_lock ); } + ~MasterLock() { omp_destroy_lock( &m_lock ); } + + MasterLock( MasterLock const& ) = delete; + MasterLock( MasterLock && ) = delete; + MasterLock & operator=( MasterLock const& ) = delete; + MasterLock & operator=( MasterLock && ) = delete; + +private: + omp_lock_t m_lock; + +}; + +template<> +class UniqueToken< OpenMP, UniqueTokenScope::Instance> +{ +public: + using execution_space = OpenMP; + using size_type = int; + + /// \brief create object size for concurrency on the given instance + /// + /// This object should not be shared between instances + UniqueToken( execution_space const& = execution_space() ) noexcept {} + + /// \brief upper bound for acquired values, i.e. 
0 <= value < size() + KOKKOS_INLINE_FUNCTION + int size() const noexcept + { + #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + return Kokkos::OpenMP::thread_pool_size(); + #else + return 0 ; + #endif + } + + /// \brief acquire value such that 0 <= value < size() + KOKKOS_INLINE_FUNCTION + int acquire() const noexcept + { + #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + return Kokkos::OpenMP::thread_pool_rank(); + #else + return 0 ; + #endif + } + + /// \brief release a value acquired by generate + KOKKOS_INLINE_FUNCTION + void release( int ) const noexcept {} +}; + +template<> +class UniqueToken< OpenMP, UniqueTokenScope::Global> +{ +public: + using execution_space = OpenMP; + using size_type = int; + + /// \brief create object size for concurrency on the given instance + /// + /// This object should not be shared between instances + UniqueToken( execution_space const& = execution_space() ) noexcept {} + + /// \brief upper bound for acquired values, i.e. 0 <= value < size() + KOKKOS_INLINE_FUNCTION + int size() const noexcept + { + #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + return Kokkos::Impl::g_openmp_hardware_max_threads ; + #else + return 0 ; + #endif + } + + /// \brief acquire value such that 0 <= value < size() + KOKKOS_INLINE_FUNCTION + int acquire() const noexcept + { + #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + return Kokkos::Impl::t_openmp_hardware_id ; + #else + return 0 ; + #endif + } + + /// \brief release a value acquired by generate + KOKKOS_INLINE_FUNCTION + void release( int ) const noexcept {} +}; + +} // namespace Experimental + + +#if !defined( KOKKOS_DISABLE_DEPRECATED ) + +inline +int OpenMP::thread_pool_size( int depth ) +{ + return depth < 2 + ? thread_pool_size() + : 1; +} + +KOKKOS_INLINE_FUNCTION +int OpenMP::hardware_thread_id() noexcept +{ +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + return Impl::t_openmp_hardware_id; +#else + return -1 ; +#endif +} + +inline +int OpenMP::max_hardware_threads() noexcept +{ + return Impl::g_openmp_hardware_max_threads; +} + +#endif // KOKKOS_DISABLE_DEPRECATED + } // namespace Kokkos #endif diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp index c47e0fc654..b54abb0068 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp @@ -52,6 +52,8 @@ #include #include +#include + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -71,8 +73,9 @@ private: typedef typename Policy::WorkRange WorkRange ; typedef typename Policy::member_type Member ; - const FunctorType m_functor ; - const Policy m_policy ; + OpenMPExec * m_instance ; + const FunctorType m_functor ; + const Policy m_policy ; template< class TagType > inline static @@ -110,16 +113,120 @@ private: public: inline void execute() const + { + enum { is_dynamic = std::is_same< typename Policy::schedule_type::type + , Kokkos::Dynamic >::value + }; + + if ( OpenMP::in_parallel() ) { + exec_range< WorkTag >( m_functor + , m_policy.begin() + , m_policy.end() ); + } + else { + + OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_for"); + + const int pool_size = OpenMP::thread_pool_size(); + #pragma omp parallel num_threads(pool_size) + { + HostThreadTeamData & data = *(m_instance->get_thread_data()); + + data.set_work_partition( m_policy.end() - m_policy.begin() + , 
m_policy.chunk_size() ); + + if ( is_dynamic ) { + // Make sure work partition is set before stealing + if ( data.pool_rendezvous() ) data.pool_rendezvous_release(); + } + + std::pair<int64_t,int64_t> range(0,0); + + do { + + range = is_dynamic ? data.get_work_stealing_chunk() + : data.get_work_partition(); + + ParallelFor::template + exec_range< WorkTag >( m_functor + , range.first + m_policy.begin() + , range.second + m_policy.begin() ); + + } while ( is_dynamic && 0 <= range.first ); + } + } + } + + inline + ParallelFor( const FunctorType & arg_functor + , Policy arg_policy ) + : m_instance( t_openmp_instance ) + , m_functor( arg_functor ) + , m_policy( arg_policy ) + {} +}; + + +// MDRangePolicy impl +template< class FunctorType , class ... Traits > +class ParallelFor< FunctorType + , Kokkos::Experimental::MDRangePolicy< Traits ... > + , Kokkos::OpenMP + > +{ +private: + + typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ; + typedef typename MDRangePolicy::impl_range_policy Policy ; + typedef typename MDRangePolicy::work_tag WorkTag ; + + typedef typename Policy::WorkRange WorkRange ; + typedef typename Policy::member_type Member ; + + typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type; + + OpenMPExec * m_instance ; + const FunctorType m_functor ; + const MDRangePolicy m_mdr_policy ; + const Policy m_policy ; // construct as RangePolicy( 0, num_tiles ).set_chunk_size(1) in ctor + + inline static + void + exec_range( const MDRangePolicy & mdr_policy + , const FunctorType & functor + , const Member ibeg , const Member iend ) { + #ifdef KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION + #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP + #pragma ivdep + #endif + #endif + for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) { + iterate_type( mdr_policy, functor )( iwork ); + } + } + +public: + + inline void execute() const + { enum { is_dynamic = std::is_same< typename Policy::schedule_type::type , Kokkos::Dynamic >::value }; - OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_for"); - OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_for"); + if ( OpenMP::in_parallel() ) { + ParallelFor::exec_range ( m_mdr_policy + , m_functor + , m_policy.begin() + , m_policy.end() ); + } + else { -#pragma omp parallel + OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_for"); + + const int pool_size = OpenMP::thread_pool_size(); + #pragma omp parallel num_threads(pool_size) { - HostThreadTeamData & data = *OpenMPExec::get_thread_data(); + HostThreadTeamData & data = *(m_instance->get_thread_data()); data.set_work_partition( m_policy.end() - m_policy.begin() , m_policy.chunk_size() ); @@ -136,8 +243,8 @@ public: range = is_dynamic ? 
data.get_work_stealing_chunk() : data.get_work_partition(); - ParallelFor::template - exec_range< WorkTag >( m_functor + ParallelFor::exec_range( m_mdr_policy + , m_functor , range.first + m_policy.begin() , range.second + m_policy.begin() ); @@ -145,12 +252,15 @@ public: } // END #pragma omp parallel } + } inline ParallelFor( const FunctorType & arg_functor - , Policy arg_policy ) - : m_functor( arg_functor ) - , m_policy( arg_policy ) + , MDRangePolicy arg_policy ) + : m_instance( t_openmp_instance ) + , m_functor( arg_functor ) + , m_mdr_policy( arg_policy ) + , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) ) {} }; @@ -191,10 +301,11 @@ private: typedef typename Analysis::pointer_type pointer_type ; typedef typename Analysis::reference_type reference_type ; - const FunctorType m_functor ; - const Policy m_policy ; - const ReducerType m_reducer ; - const pointer_type m_result_ptr ; + OpenMPExec * m_instance; + const FunctorType m_functor; + const Policy m_policy; + const ReducerType m_reducer; + const pointer_type m_result_ptr; template< class TagType > inline static @@ -228,21 +339,21 @@ public: enum { is_dynamic = std::is_same< typename Policy::schedule_type::type , Kokkos::Dynamic >::value }; - OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_reduce"); - OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_reduce"); + OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_reduce"); const size_t pool_reduce_bytes = Analysis::value_size( ReducerConditional::select(m_functor, m_reducer)); - OpenMPExec::resize_thread_data( pool_reduce_bytes + m_instance->resize_thread_data( pool_reduce_bytes , 0 // team_reduce_bytes , 0 // team_shared_bytes , 0 // thread_local_bytes ); -#pragma omp parallel + const int pool_size = OpenMP::thread_pool_size(); + #pragma omp parallel num_threads(pool_size) { - HostThreadTeamData & data = *OpenMPExec::get_thread_data(); + HostThreadTeamData & data = *(m_instance->get_thread_data()); data.set_work_partition( m_policy.end() - m_policy.begin() , m_policy.chunk_size() ); @@ -271,16 +382,15 @@ public: } while ( is_dynamic && 0 <= range.first ); } -// END #pragma omp parallel // Reduction: - const pointer_type ptr = pointer_type( OpenMPExec::get_thread_data(0)->pool_reduce_local() ); + const pointer_type ptr = pointer_type( m_instance->get_thread_data(0)->pool_reduce_local() ); - for ( int i = 1 ; i < OpenMPExec::pool_size() ; ++i ) { + for ( int i = 1 ; i < pool_size ; ++i ) { ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr - , OpenMPExec::get_thread_data(i)->pool_reduce_local() ); + , m_instance->get_thread_data(i)->pool_reduce_local() ); } Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr ); @@ -303,7 +413,8 @@ public: Kokkos::is_view< ViewType >::value && !Kokkos::is_reducer_type<ReducerType>::value ,void*>::type = NULL) - : m_functor( arg_functor ) + : m_instance( t_openmp_instance ) + , m_functor( arg_functor ) , m_policy( arg_policy ) , m_reducer( InvalidType() ) , m_result_ptr( arg_view.data() ) @@ -317,7 +428,8 @@ public: ParallelReduce( const FunctorType & arg_functor , Policy arg_policy , const ReducerType& reducer ) - : m_functor( arg_functor ) + : m_instance( t_openmp_instance ) + , m_functor( arg_functor ) , m_policy( arg_policy ) , m_reducer( reducer ) , m_result_ptr( reducer.view().data() ) @@ -329,6 +441,173 @@ public: }; + +// MDRangePolicy impl +template< class FunctorType , class ReducerType, class ... 
Traits > +class ParallelReduce< FunctorType + , Kokkos::Experimental::MDRangePolicy< Traits ...> + , ReducerType + , Kokkos::OpenMP + > +{ +private: + + typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ; + typedef typename MDRangePolicy::impl_range_policy Policy ; + + typedef typename MDRangePolicy::work_tag WorkTag ; + typedef typename Policy::WorkRange WorkRange ; + typedef typename Policy::member_type Member ; + + typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ; + + typedef Kokkos::Impl::if_c< std::is_same::value, FunctorType, ReducerType> ReducerConditional; + typedef typename ReducerConditional::type ReducerTypeFwd; + + typedef typename ReducerTypeFwd::value_type ValueType; + + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ; + typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ; + + typedef typename Analysis::pointer_type pointer_type ; + typedef typename Analysis::reference_type reference_type ; + + using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy + , FunctorType + , WorkTag + , ValueType + >; + + OpenMPExec * m_instance ; + const FunctorType m_functor ; + const MDRangePolicy m_mdr_policy ; + const Policy m_policy ; // construct as RangePolicy( 0, num_tiles ).set_chunk_size(1) in ctor + const ReducerType m_reducer ; + const pointer_type m_result_ptr ; + + inline static + void + exec_range( const MDRangePolicy & mdr_policy + , const FunctorType & functor + , const Member ibeg , const Member iend + , reference_type update ) + { + for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) { + iterate_type( mdr_policy, functor, update )( iwork ); + } + } + +public: + + inline void execute() const + { + enum { is_dynamic = std::is_same< typename Policy::schedule_type::type + , Kokkos::Dynamic >::value }; + + OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_reduce"); + + const size_t pool_reduce_bytes = + Analysis::value_size( ReducerConditional::select(m_functor, m_reducer)); + + m_instance->resize_thread_data( pool_reduce_bytes + , 0 // team_reduce_bytes + , 0 // team_shared_bytes + , 0 // thread_local_bytes + ); + + const int pool_size = OpenMP::thread_pool_size(); + #pragma omp parallel num_threads(pool_size) + { + HostThreadTeamData & data = *(m_instance->get_thread_data()); + + data.set_work_partition( m_policy.end() - m_policy.begin() + , m_policy.chunk_size() ); + + if ( is_dynamic ) { + // Make sure work partition is set before stealing + if ( data.pool_rendezvous() ) data.pool_rendezvous_release(); + } + + reference_type update = + ValueInit::init( ReducerConditional::select(m_functor , m_reducer) + , data.pool_reduce_local() ); + + std::pair range(0,0); + + do { + + range = is_dynamic ? 
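[Note: throughout these MDRangePolicy specializations the multidimensional range is executed through a plain RangePolicy(0, m_num_tiles).set_chunk_size(1): the iteration space is cut into tiles, the tiles are enumerated by one integer, and HostIterateTile decodes each tile index back into multidimensional offsets. A sketch of that flattening for a 2-D range; the decode order mirrors the Left-layout loops later in this patch, while the helper names are illustrative:

#include <cstdio>

int main() {
  const int upper[2] = {10, 7};  // exclusive upper bounds
  const int tile[2]  = {4, 4};   // tile extents
  const int tend[2]  = {(upper[0] + tile[0] - 1) / tile[0],   // 3 tiles in dim 0
                        (upper[1] + tile[1] - 1) / tile[1]};  // 2 tiles in dim 1
  const int num_tiles = tend[0] * tend[1];   // the RangePolicy runs over [0, num_tiles)
  for (int t = 0; t < num_tiles; ++t) {
    int idx = t, off[2];
    for (int d = 0; d < 2; ++d) {   // "Left" decode: fastest-varying dimension first
      off[d] = (idx % tend[d]) * tile[d];
      idx /= tend[d];
    }
    std::printf("tile %d -> offset (%d,%d)\n", t, off[0], off[1]);
  }
}
]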
data.get_work_stealing_chunk() + : data.get_work_partition(); + + ParallelReduce::exec_range ( m_mdr_policy, m_functor + , range.first + m_policy.begin() + , range.second + m_policy.begin() + , update ); + + } while ( is_dynamic && 0 <= range.first ); + } +// END #pragma omp parallel + + // Reduction: + + const pointer_type ptr = pointer_type( m_instance->get_thread_data(0)->pool_reduce_local() ); + + for ( int i = 1 ; i < pool_size ; ++i ) { + ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) + , ptr + , m_instance->get_thread_data(i)->pool_reduce_local() ); + } + + Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr ); + + if ( m_result_ptr ) { + const int n = Analysis::value_count( ReducerConditional::select(m_functor , m_reducer) ); + + for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; } + } + } + + //---------------------------------------- + + template< class ViewType > + inline + ParallelReduce( const FunctorType & arg_functor + , MDRangePolicy arg_policy + , const ViewType & arg_view + , typename std::enable_if< + Kokkos::is_view< ViewType >::value && + !Kokkos::is_reducer_type::value + ,void*>::type = NULL) + : m_instance( t_openmp_instance ) + , m_functor( arg_functor ) + , m_mdr_policy( arg_policy ) + , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) ) + , m_reducer( InvalidType() ) + , m_result_ptr( arg_view.data() ) + { + /*static_assert( std::is_same< typename ViewType::memory_space + , Kokkos::HostSpace >::value + , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/ + } + + inline + ParallelReduce( const FunctorType & arg_functor + , MDRangePolicy arg_policy + , const ReducerType& reducer ) + : m_instance( t_openmp_instance ) + , m_functor( arg_functor ) + , m_mdr_policy( arg_policy ) + , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) ) + , m_reducer( reducer ) + , m_result_ptr( reducer.view().data() ) + { + /*static_assert( std::is_same< typename ViewType::memory_space + , Kokkos::HostSpace >::value + , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/ + } + +}; + } // namespace Impl } // namespace Kokkos @@ -361,8 +640,9 @@ private: typedef typename Analysis::pointer_type pointer_type ; typedef typename Analysis::reference_type reference_type ; - const FunctorType m_functor ; - const Policy m_policy ; + OpenMPExec * m_instance; + const FunctorType m_functor; + const Policy m_policy; template< class TagType > inline static @@ -394,23 +674,23 @@ public: inline void execute() const { - OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_scan"); - OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_scan"); + OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_scan"); const int value_count = Analysis::value_count( m_functor ); const size_t pool_reduce_bytes = 2 * Analysis::value_size( m_functor ); - OpenMPExec::resize_thread_data( pool_reduce_bytes + m_instance->resize_thread_data( pool_reduce_bytes , 0 // team_reduce_bytes , 0 // team_shared_bytes , 0 // thread_local_bytes ); -#pragma omp parallel + const int pool_size = OpenMP::thread_pool_size(); + #pragma omp parallel num_threads(pool_size) { - HostThreadTeamData & data = *OpenMPExec::get_thread_data(); + HostThreadTeamData & data = *(m_instance->get_thread_data()); - const WorkRange range( m_policy, data.pool_rank(), data.pool_size() ); + const WorkRange range( m_policy, omp_get_thread_num(), omp_get_num_threads() ); reference_type 
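[Note: with this specialization in place, parallel_reduce over an MDRangePolicy behaves on the OpenMP backend just like a 1-D RangePolicy reduction. A usage sketch against the public API of this Kokkos generation (MDRangePolicy still lived in Kokkos::Experimental at the time; extents are arbitrary):

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    using Policy = Kokkos::Experimental::MDRangePolicy<Kokkos::OpenMP, Kokkos::Rank<2>>;
    double sum = 0.0;
    Kokkos::parallel_reduce(
      Policy({0, 0}, {128, 64}),
      KOKKOS_LAMBDA(const int i, const int j, double& update) {
        update += i * 0.5 + j;   // any per-(i,j) contribution
      },
      sum);
    // sum now holds the reduction over the full 128 x 64 index space
  }
  Kokkos::finalize();
}
]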
update_sum = ValueInit::init( m_functor , data.pool_reduce_local() ); @@ -422,7 +702,7 @@ public: pointer_type ptr_prev = 0 ; - const int n = data.pool_size(); + const int n = omp_get_num_threads(); for ( int i = 0 ; i < n ; ++i ) { @@ -452,7 +732,6 @@ public: ParallelScan::template exec_range< WorkTag > ( m_functor , range.begin() , range.end() , update_base , true ); } -/* END #pragma omp parallel */ } @@ -461,7 +740,8 @@ public: inline ParallelScan( const FunctorType & arg_functor , const Policy & arg_policy ) - : m_functor( arg_functor ) + : m_instance( t_openmp_instance ) + , m_functor( arg_functor ) , m_policy( arg_policy ) {} @@ -492,9 +772,10 @@ private: typedef typename Policy::schedule_type::type SchedTag ; typedef typename Policy::member_type Member ; - const FunctorType m_functor ; - const Policy m_policy ; - const int m_shmem_size ; + OpenMPExec * m_instance; + const FunctorType m_functor; + const Policy m_policy; + const int m_shmem_size; template< class TagType > inline static @@ -548,22 +829,22 @@ public: { enum { is_dynamic = std::is_same< SchedTag , Kokkos::Dynamic >::value }; - OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_for"); - OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_for"); + OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_for"); const size_t pool_reduce_size = 0 ; // Never shrinks const size_t team_reduce_size = TEAM_REDUCE_SIZE * m_policy.team_size(); const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1); const size_t thread_local_size = 0 ; // Never shrinks - OpenMPExec::resize_thread_data( pool_reduce_size + m_instance->resize_thread_data( pool_reduce_size , team_reduce_size , team_shared_size , thread_local_size ); -#pragma omp parallel + const int pool_size = OpenMP::thread_pool_size(); + #pragma omp parallel num_threads(pool_size) { - HostThreadTeamData & data = *OpenMPExec::get_thread_data(); + HostThreadTeamData & data = *(m_instance->get_thread_data()); const int active = data.organize_team( m_policy.team_size() ); @@ -598,14 +879,14 @@ public: data.disband_team(); } -// END #pragma omp parallel } inline ParallelFor( const FunctorType & arg_functor , const Policy & arg_policy ) - : m_functor( arg_functor ) + : m_instance( t_openmp_instance ) + , m_functor( arg_functor ) , m_policy( arg_policy ) , m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + @@ -646,11 +927,12 @@ private: typedef typename Analysis::pointer_type pointer_type ; typedef typename Analysis::reference_type reference_type ; - const FunctorType m_functor ; - const Policy m_policy ; - const ReducerType m_reducer ; - const pointer_type m_result_ptr ; - const int m_shmem_size ; + OpenMPExec * m_instance; + const FunctorType m_functor; + const Policy m_policy; + const ReducerType m_reducer; + const pointer_type m_result_ptr; + const int m_shmem_size; template< class TagType > inline static @@ -706,8 +988,7 @@ public: { enum { is_dynamic = std::is_same< SchedTag , Kokkos::Dynamic >::value }; - OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_reduce"); - OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_reduce"); + OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_reduce"); const size_t pool_reduce_size = Analysis::value_size( ReducerConditional::select(m_functor, m_reducer)); @@ -716,14 +997,15 @@ public: const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1); const size_t thread_local_size = 0 ; // Never shrinks - OpenMPExec::resize_thread_data( pool_reduce_size + m_instance->resize_thread_data( 
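[Note on the parallel_scan hunk above: pool_reduce_bytes is doubled because each thread keeps two accumulators, and the algorithm is the classic two-pass scan: pass one accumulates a per-thread total over a static range split, one thread then exclusively scans the per-thread totals, and pass two rescans each range starting from the thread's base offset. A self-contained sketch of that shape (an inclusive scan of ones; the splitting arithmetic is simplified relative to WorkRange):

#include <omp.h>
#include <vector>
#include <cstdio>

int main() {
  const int N = 16;
  std::vector<long> x(N, 1), out(N);
  std::vector<long> partial(omp_get_max_threads() + 1, 0);  // per-thread totals
  #pragma omp parallel
  {
    const int r = omp_get_thread_num(), p = omp_get_num_threads();
    const int b = N * r / p, e = N * (r + 1) / p;  // static split by rank
    long s = 0;                                    // pass 1: local total only
    for (int i = b; i < e; ++i) s += x[i];
    partial[r + 1] = s;
    #pragma omp barrier
    #pragma omp single                             // exclusive scan of the totals
    for (int t = 0; t < p; ++t) partial[t + 1] += partial[t];
    long run = partial[r];                         // pass 2: rescan from the base offset
    for (int i = b; i < e; ++i) { out[i] = run + x[i]; run += x[i]; }
  }
  std::printf("out[N-1] = %ld\n", out[N - 1]);
}
]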
pool_reduce_size , team_reduce_size , team_shared_size , thread_local_size ); -#pragma omp parallel + const int pool_size = OpenMP::thread_pool_size(); + #pragma omp parallel num_threads(pool_size) { - HostThreadTeamData & data = *OpenMPExec::get_thread_data(); + HostThreadTeamData & data = *(m_instance->get_thread_data()); const int active = data.organize_team( m_policy.team_size() ); @@ -763,17 +1045,26 @@ public: } data.disband_team(); + + // This thread has updated 'pool_reduce_local()' with its + // contributions to the reduction. The parallel region is + // about to terminate and the master thread will load and + // reduce each 'pool_reduce_local()' contribution. + // Must 'memory_fence()' to guarantee that storing the update to + // 'pool_reduce_local()' will complete before this thread + // exits the parallel region. + + memory_fence(); } -// END #pragma omp parallel // Reduction: - const pointer_type ptr = pointer_type( OpenMPExec::get_thread_data(0)->pool_reduce_local() ); + const pointer_type ptr = pointer_type( m_instance->get_thread_data(0)->pool_reduce_local() ); - for ( int i = 1 ; i < OpenMPExec::pool_size() ; ++i ) { + for ( int i = 1 ; i < pool_size ; ++i ) { ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr - , OpenMPExec::get_thread_data(i)->pool_reduce_local() ); + , m_instance->get_thread_data(i)->pool_reduce_local() ); } Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr ); @@ -796,7 +1087,8 @@ public: Kokkos::is_view< ViewType >::value && !Kokkos::is_reducer_type::value ,void*>::type = NULL) - : m_functor( arg_functor ) + : m_instance( t_openmp_instance ) + , m_functor( arg_functor ) , m_policy( arg_policy ) , m_reducer( InvalidType() ) , m_result_ptr( arg_result.ptr_on_device() ) @@ -810,7 +1102,8 @@ public: ParallelReduce( const FunctorType & arg_functor , Policy arg_policy , const ReducerType& reducer ) - : m_functor( arg_functor ) + : m_instance( t_openmp_instance ) + , m_functor( arg_functor ) , m_policy( arg_policy ) , m_reducer( reducer ) , m_result_ptr( reducer.view().data() ) diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp index d4ade211f8..77363876b0 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp @@ -105,7 +105,7 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::execute { using execution_space = Kokkos::OpenMP ; using queue_type = TaskQueue< execution_space > ; - using task_root_type = TaskBase< execution_space , void , void > ; + using task_root_type = TaskBase< void , void , void > ; using Member = Impl::HostThreadTeamMember< execution_space > ; static task_root_type * const end = @@ -115,23 +115,19 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::execute HostThreadTeamData & team_data_single = HostThreadTeamDataSingleton::singleton(); - const int team_size = Impl::OpenMPExec::pool_size(2); // Threads per core - // const int team_size = Impl::OpenMPExec::pool_size(1); // Threads per NUMA + Impl::OpenMPExec * instance = t_openmp_instance; + const int pool_size = OpenMP::thread_pool_size(); -#if 0 -fprintf(stdout,"TaskQueue execute %d\n", team_size ); -fflush(stdout); -#endif + const int team_size = 1; // Threads per core + instance->resize_thread_data( 0 /* global reduce buffer */ + , 512 * team_size /* team reduce buffer */ + , 0 /* team shared buffer */ + , 0 /* thread local buffer */ + ); - OpenMPExec::resize_thread_data( 0 
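[Note: the new comment block above spells out why the added memory_fence() matters: the per-thread reduction buffers are read by the master after the parallel region ends, so each thread's store to pool_reduce_local() must become globally visible before the thread leaves. The same release/acquire shape in portable C++, as a sketch rather than the Kokkos fence implementation:

#include <atomic>
#include <thread>
#include <cstdio>

int main() {
  double slot = 0.0;              // stands in for a pool_reduce_local() buffer
  std::atomic<bool> done{false};
  std::thread worker([&] {
    slot = 42.0;                                          // the thread's contribution
    std::atomic_thread_fence(std::memory_order_release);  // like memory_fence(): publish first
    done.store(true, std::memory_order_relaxed);
  });
  while (!done.load(std::memory_order_relaxed)) { }       // master waits
  std::atomic_thread_fence(std::memory_order_acquire);    // acquire before reading the slot
  std::printf("joined value = %f\n", slot);
  worker.join();
}
]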
/* global reduce buffer */ - , 512 * team_size /* team reduce buffer */ - , 0 /* team shared buffer */ - , 0 /* thread local buffer */ - ); - -#pragma omp parallel + #pragma omp parallel num_threads(pool_size) { - Impl::HostThreadTeamData & self = *Impl::OpenMPExec::get_thread_data(); + Impl::HostThreadTeamData & self = *(instance->get_thread_data()); // Organizing threads into a team performs a barrier across the // entire pool to insure proper initialization of the team @@ -142,18 +138,6 @@ fflush(stdout); Member single_exec( team_data_single ); Member team_exec( self ); -#if 0 -fprintf(stdout,"TaskQueue pool(%d of %d) team(%d of %d) league(%d of %d) running\n" - , self.pool_rank() - , self.pool_size() - , team_exec.team_rank() - , team_exec.team_size() - , team_exec.league_rank() - , team_exec.league_size() - ); -fflush(stdout); -#endif - // Loop until all queues are empty and no tasks in flight task_root_type * task = 0 ; @@ -197,15 +181,6 @@ fflush(stdout); // if a single thread task then execute now -#if 0 -fprintf(stdout,"TaskQueue pool(%d of %d) executing single task 0x%lx\n" - , self.pool_rank() - , self.pool_size() - , int64_t(task) - ); -fflush(stdout); -#endif - (*task->m_apply)( task , & single_exec ); leader_loop = true ; @@ -220,57 +195,14 @@ fflush(stdout); if ( 0 != task ) { // Thread Team Task -#if 0 -fprintf(stdout,"TaskQueue pool(%d of %d) team((%d of %d) league(%d of %d) executing team task 0x%lx\n" - , self.pool_rank() - , self.pool_size() - , team_exec.team_rank() - , team_exec.team_size() - , team_exec.league_rank() - , team_exec.league_size() - , int64_t(task) - ); -fflush(stdout); -#endif - (*task->m_apply)( task , & team_exec ); // The m_apply function performs a barrier } } while( 0 != task ); - -#if 0 -fprintf(stdout,"TaskQueue pool(%d of %d) team(%d of %d) league(%d of %d) ending\n" - , self.pool_rank() - , self.pool_size() - , team_exec.team_rank() - , team_exec.team_size() - , team_exec.league_rank() - , team_exec.league_size() - ); -fflush(stdout); -#endif - } - self.disband_team(); - -#if 0 -fprintf(stdout,"TaskQueue pool(%d of %d) disbanded\n" - , self.pool_rank() - , self.pool_size() - ); -fflush(stdout); -#endif - } -// END #pragma omp parallel - -#if 0 -fprintf(stdout,"TaskQueue execute %d end\n", team_size ); -fflush(stdout); -#endif - } void TaskQueueSpecialization< Kokkos::OpenMP >:: @@ -279,10 +211,10 @@ void TaskQueueSpecialization< Kokkos::OpenMP >:: { using execution_space = Kokkos::OpenMP ; using queue_type = TaskQueue< execution_space > ; - using task_root_type = TaskBase< execution_space , void , void > ; + using task_root_type = TaskBase< void , void , void > ; using Member = Impl::HostThreadTeamMember< execution_space > ; - if ( 1 == omp_get_num_threads() ) { + if ( 1 == OpenMP::thread_pool_size() ) { task_root_type * const end = (task_root_type *) task_root_type::EndTag ; diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp index 82fbef255b..dfa1635e08 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp @@ -45,7 +45,7 @@ #define KOKKOS_IMPL_OPENMP_TASK_HPP #include -#if defined( KOKKOS_ENABLE_TASKDAG ) +#if defined( KOKKOS_ENABLE_OPENMP ) && defined( KOKKOS_ENABLE_TASKDAG ) //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -60,7 +60,7 @@ public: using execution_space = Kokkos::OpenMP ; using queue_type = 
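[Note on the task-queue hunk above (and the #if 0 printf blocks it deletes): the loop is a consume-until-empty protocol. Threads organize into a team; the leader pulls single-thread tasks and runs them immediately through (*task->m_apply), while team tasks are run by every member with m_apply providing the barrier. A heavily reduced sketch of just the consume-until-empty part, with a mutex-guarded queue in place of TaskQueue and no in-flight tasks; every name here is illustrative:

#include <cstdio>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

int main() {
  std::queue<std::function<void()>> queue;  // stands in for TaskQueue<OpenMP>
  std::mutex mtx;
  for (int i = 0; i < 8; ++i) queue.push([i] { std::printf("task %d\n", i); });
  auto worker = [&] {
    for (;;) {                              // loop until the queue is drained
      std::function<void()> task;
      { std::lock_guard<std::mutex> lock(mtx);
        if (queue.empty()) return;          // nothing in flight in this sketch
        task = std::move(queue.front()); queue.pop(); }
      task();                               // (*task->m_apply)(task, &exec) in the real code
    }
  };
  std::vector<std::thread> pool;
  for (int t = 0; t < 4; ++t) pool.emplace_back(worker);
  for (auto& th : pool) th.join();
}
]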
Kokkos::Impl::TaskQueue< execution_space > ; - using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ; + using task_base_type = Kokkos::Impl::TaskBase< void , void , void > ; using member_type = Kokkos::Impl::HostThreadTeamMember< execution_space > ; // Must specify memory space diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Team.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Team.hpp new file mode 100644 index 0000000000..743e6b6e62 --- /dev/null +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Team.hpp @@ -0,0 +1,245 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_OPENMP_TEAM_HPP +#define KOKKOS_OPENMP_TEAM_HPP + +#include +#if defined( KOKKOS_ENABLE_OPENMP ) + +#include + +namespace Kokkos { namespace Impl { + +template< class ... Properties > +class TeamPolicyInternal< Kokkos::OpenMP, Properties ... >: public PolicyTraits +{ +public: + + //! 
Tag this class as a kokkos execution policy + typedef TeamPolicyInternal execution_policy ; + + typedef PolicyTraits traits; + + TeamPolicyInternal& operator = (const TeamPolicyInternal& p) { + m_league_size = p.m_league_size; + m_team_size = p.m_team_size; + m_team_alloc = p.m_team_alloc; + m_team_iter = p.m_team_iter; + m_team_scratch_size[0] = p.m_team_scratch_size[0]; + m_thread_scratch_size[0] = p.m_thread_scratch_size[0]; + m_team_scratch_size[1] = p.m_team_scratch_size[1]; + m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; + m_chunk_size = p.m_chunk_size; + return *this; + } + + //---------------------------------------- + + template< class FunctorType > + inline static + int team_size_max( const FunctorType & ) { + int pool_size = traits::execution_space::thread_pool_size(1); + int max_host_team_size = Impl::HostThreadTeamData::max_team_members; + return pool_size + inline static + int team_size_recommended( const FunctorType & ) + { return traits::execution_space::thread_pool_size(2); } + + template< class FunctorType > + inline static + int team_size_recommended( const FunctorType &, const int& ) + { return traits::execution_space::thread_pool_size(2); } + + //---------------------------------------- + +private: + + int m_league_size ; + int m_team_size ; + int m_team_alloc ; + int m_team_iter ; + + size_t m_team_scratch_size[2]; + size_t m_thread_scratch_size[2]; + + int m_chunk_size; + + inline void init( const int league_size_request + , const int team_size_request ) + { + const int pool_size = traits::execution_space::thread_pool_size(0); + const int max_host_team_size = Impl::HostThreadTeamData::max_team_members; + const int team_max = pool_size 0) { + if(!Impl::is_integral_power_of_two( m_chunk_size )) + Kokkos::abort("TeamPolicy blocking granularity must be power of two" ); + } + + int new_chunk_size = 1; + while(new_chunk_size*100*concurrency < m_league_size) + new_chunk_size *= 2; + if(new_chunk_size < 128) { + new_chunk_size = 1; + while( (new_chunk_size*40*concurrency < m_league_size ) && (new_chunk_size<128) ) + new_chunk_size*=2; + } + m_chunk_size = new_chunk_size; + } + +public: + typedef Impl::HostThreadTeamMember< Kokkos::OpenMP > member_type ; +}; + +}} // namespace Kokkos::Impl + +#endif +#endif /* KOKKOS_OPENMP_TEAM_HPP */ + + diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp new file mode 100644 index 0000000000..289ad15451 --- /dev/null +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp @@ -0,0 +1,107 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
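[Note: the chunk-size heuristic visible above aims for roughly 100 league iterations per concurrent thread and, if that would still leave a chunk smaller than 128, relaxes to roughly 40 per thread capped at 128. Extracted into a hypothetical free function for illustration of the member logic shown in the hunk:

#include <cstdio>

// Same doubling search as TeamPolicyInternal's default chunk size:
// target ~100 league iterations per concurrent thread; if the chunk is
// still < 128, settle for ~40 per thread, never exceeding 128.
int default_chunk_size(int concurrency, int league_size) {
  int chunk = 1;
  while (chunk * 100 * concurrency < league_size) chunk *= 2;
  if (chunk < 128) {
    chunk = 1;
    while (chunk * 40 * concurrency < league_size && chunk < 128) chunk *= 2;
  }
  return chunk;
}

int main() {
  // e.g. 16 threads and ~1M teams gives a chunk of 1024
  std::printf("%d\n", default_chunk_size(16, 1 << 20));
}
]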
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP +#define KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP + +namespace Kokkos { +namespace Impl { + +template< class FunctorType , class ... Traits > +class ParallelFor< FunctorType , + Kokkos::Experimental::WorkGraphPolicy< Traits ... > , + Kokkos::OpenMP + > + : public Kokkos::Impl::Experimental:: + WorkGraphExec< FunctorType, + Kokkos::OpenMP, + Traits ... + > +{ +private: + + typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ; + typedef Kokkos::Impl::Experimental:: + WorkGraphExec Base ; + + template< class TagType > + typename std::enable_if< std::is_same< TagType , void >::value >::type + exec_one(const typename Policy::member_type& i) const { + Base::m_functor( i ); + } + + template< class TagType > + typename std::enable_if< ! std::is_same< TagType , void >::value >::type + exec_one(const typename Policy::member_type& i) const { + const TagType t{} ; + Base::m_functor( t , i ); + } + +public: + + inline + void execute() + { + const int pool_size = OpenMP::thread_pool_size(); + + #pragma omp parallel num_threads(pool_size) + { + for (std::int32_t i; (-1 != (i = Base::before_work())); ) { + exec_one< typename Policy::work_tag >( i ); + Base::after_work(i); + } + } + } + + inline + ParallelFor( const FunctorType & arg_functor + , const Policy & arg_policy ) + : Base( arg_functor, arg_policy ) + { + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif /* #define KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP */ diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp index bec7844ed6..258a9d2ff7 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. 
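[Note on the OpenMP WorkGraphPolicy driver above: execute() is the entire scheduler. Each thread repeatedly asks Base::before_work() for a vertex whose dependencies are satisfied, runs it, and Base::after_work() releases its dependents; -1 signals the graph is exhausted. A sketch of the dependence-counting idea behind that protocol, run serially here with atomic in-degree counters; the tiny DAG and all helper names are illustrative:

#include <atomic>
#include <cstdio>
#include <vector>

int main() {
  // Tiny DAG: 0 -> 1, 0 -> 2, {1,2} -> 3  (edges point to dependents)
  std::vector<std::vector<int>> dependents = {{1, 2}, {3}, {3}, {}};
  std::vector<std::atomic<int>> indegree(4);
  indegree[0] = 0; indegree[1] = 1; indegree[2] = 1; indegree[3] = 2;

  std::vector<int> ready = {0};                    // vertices with no unmet dependencies
  while (!ready.empty()) {                         // serial stand-in for the thread pool loop
    const int v = ready.back(); ready.pop_back();  // "before_work": claim a ready vertex
    std::printf("run vertex %d\n", v);             // the functor call
    for (int w : dependents[v])                    // "after_work": release dependents
      if (indegree[w].fetch_sub(1) == 1) ready.push_back(w);
  }
}
]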
Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -45,7 +45,7 @@ #define KOKKOS_OPENMPTARGETEXEC_HPP #include -#include +#include #include #include @@ -59,10 +59,10 @@ namespace Impl { class OpenMPTargetExec { -public: +public: enum { MAX_ACTIVE_THREADS = 256*8*56*4 }; enum { MAX_ACTIVE_TEAMS = MAX_ACTIVE_THREADS/32 }; - + private: static void* scratch_ptr; @@ -70,7 +70,7 @@ public: static void verify_is_process( const char * const ); static void verify_initialized( const char * const ); - static void* get_scratch_ptr(); + static void* get_scratch_ptr(); static void clear_scratch(); static void resize_scratch( int64_t reduce_bytes , int64_t team_reduce_bytes, int64_t team_shared_bytes, int64_t thread_local_bytes ); @@ -159,7 +159,7 @@ public: KOKKOS_INLINE_FUNCTION void team_barrier() const { - #pragma omp barrier + #pragma omp barrier } template @@ -191,13 +191,13 @@ public: typedef ValueType value_type; const JoinLambdaAdapter op(op_in); - + // Make sure there is enough scratch space: typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE , value_type , void >::type type ; const int n_values = TEAM_REDUCE_SIZE/sizeof(value_type); - type * team_scratch = (type*) ((char*)m_glb_scratch + TEAM_REDUCE_SIZE*omp_get_team_num()); + type * team_scratch = (type*) ((char*)m_glb_scratch + TEAM_REDUCE_SIZE*omp_get_team_num()); for(int i = m_team_rank; i < n_values; i+= m_team_size) { team_scratch[i] = value_type(); } @@ -209,7 +209,7 @@ public: team_scratch[m_team_rank%n_values]+=value; #pragma omp barrier } - + for(int d = 1; d #if defined( KOKKOS_ENABLE_QTHREADS ) -#include +#include //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp index 4c805310cc..35b2163ae5 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp +++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp @@ -45,14 +45,14 @@ #include #if defined( KOKKOS_ENABLE_THREADS ) -#include - #include #include #include #include #include + #include + #include #include #include @@ -80,9 +80,7 @@ const void * volatile s_current_function_arg = 0 ; struct Sentinel { Sentinel() - { - HostSpace::register_in_parallel( ThreadsExec::in_parallel ); - } + {} ~Sentinel() { @@ -122,6 +120,8 @@ void execute_function_noop( ThreadsExec & , const void * ) {} void ThreadsExec::driver(void) { + SharedAllocationRecord< void, void >::tracking_enable(); + ThreadsExec this_thread ; while ( ThreadsExec::Active == this_thread.m_pool_state ) { @@ -726,6 +726,8 @@ void ThreadsExec::initialize( unsigned thread_count , // Init the array for used for arbitrarily sized atomics Impl::init_lock_array_host_space(); + Impl::SharedAllocationRecord< void, void >::tracking_enable(); + #if defined(KOKKOS_ENABLE_PROFILING) Kokkos::Profiling::initialize(); #endif diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp index 74de3a2596..7557bad7d9 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp @@ -50,11 +50,12 @@ #include #include -#include +#include #include #include +#include //---------------------------------------------------------------------------- namespace Kokkos { @@ -275,6 +276,17 @@ public: if ( ! 
rev_rank ) { Final::final( f , reduce_memory() ); } + + // This thread has updated 'reduce_memory()' and upon returning + // from this function will set 'm_pool_state' to inactive. + // If this is a non-root thread then setting 'm_pool_state' + // to inactive triggers another thread to exit a spinwait + // and read the 'reduce_memory'. + // Must 'memory_fence()' to guarantee that storing the update to + // 'reduce_memory()' will complete before storing the the update to + // 'm_pool_state'. + + memory_fence(); } inline @@ -627,6 +639,62 @@ inline void Threads::fence() } /* namespace Kokkos */ +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { namespace Experimental { + +template<> +class UniqueToken< Threads, UniqueTokenScope::Instance> +{ +public: + using execution_space = Threads; + using size_type = int; + + /// \brief create object size for concurrency on the given instance + /// + /// This object should not be shared between instances + UniqueToken( execution_space const& = execution_space() ) noexcept {} + + /// \brief upper bound for acquired values, i.e. 0 <= value < size() + inline + int size() const noexcept { return Threads::thread_pool_size(); } + + /// \brief acquire value such that 0 <= value < size() + inline + int acquire() const noexcept { return Threads::thread_pool_rank(); } + + /// \brief release a value acquired by generate + inline + void release( int ) const noexcept {} +}; + +template<> +class UniqueToken< Threads, UniqueTokenScope::Global> +{ +public: + using execution_space = Threads; + using size_type = int; + + /// \brief create object size for concurrency on the given instance + /// + /// This object should not be shared between instances + UniqueToken( execution_space const& = execution_space() ) noexcept {} + + /// \brief upper bound for acquired values, i.e. 
0 <= value < size() + inline + int size() const noexcept { return Threads::thread_pool_size(); } + + /// \brief acquire value such that 0 <= value < size() + inline + int acquire() const noexcept { return Threads::thread_pool_rank(); } + + /// \brief release a value acquired by generate + inline + void release( int ) const noexcept {} +}; + +}} // namespace Kokkos::Experimental //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- #endif diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp index c12019413b..6060bf191f 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp @@ -50,7 +50,7 @@ #include #include -#include +#include #include #include @@ -482,6 +482,8 @@ public: void next_static() { if ( m_league_rank < m_league_end ) { + // Make sure all stores are complete before entering the barrier + memory_fence(); team_barrier(); set_team_shared(); } @@ -518,6 +520,8 @@ public: return; if ( m_league_rank < m_league_chunk_end ) { + // Make sure all stores are complete before entering the barrier + memory_fence(); team_barrier(); set_team_shared(); } diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp index 0ee0cd3280..18ac7d26ad 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp @@ -55,6 +55,8 @@ #include #include +#include + //---------------------------------------------------------------------------- namespace Kokkos { @@ -174,6 +176,108 @@ public: {} }; + +// MDRangePolicy impl +template< class FunctorType , class ... Traits > +class ParallelFor< FunctorType + , Kokkos::Experimental::MDRangePolicy< Traits ... > + , Kokkos::Threads + > +{ +private: + typedef Kokkos::Experimental::MDRangePolicy< Traits ... 
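[Note: for the Threads backend both UniqueToken specializations can simply hand out the pool rank, since the pool size is fixed and every running thread already owns a unique rank, so acquire()/release() cost nothing. Typical use is indexing per-thread scratch from inside a kernel, along these lines (a usage sketch against the Kokkos::Experimental API added here):

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::Experimental::UniqueToken<Kokkos::Threads> token;
    Kokkos::View<double*> scratch("scratch", token.size());  // one slot per concurrent thread
    Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Threads>(0, 1000),
      KOKKOS_LAMBDA(const int i) {
        const int id = token.acquire();  // on Threads this is just the pool rank
        scratch(id) += i;                // race-free: no two running threads share an id
        token.release(id);
      });
  }
  Kokkos::finalize();
}
]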
> MDRangePolicy ; + typedef typename MDRangePolicy::impl_range_policy Policy ; + + typedef typename MDRangePolicy::work_tag WorkTag ; + + typedef typename Policy::WorkRange WorkRange ; + typedef typename Policy::member_type Member ; + + typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type; + + const FunctorType m_functor ; + const MDRangePolicy m_mdr_policy ; + const Policy m_policy ; // construct as RangePolicy( 0, num_tiles ).set_chunk_size(1) in ctor + + inline static + void + exec_range( const MDRangePolicy & mdr_policy + , const FunctorType & functor + , const Member ibeg , const Member iend ) + { + #if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \ + defined( KOKKOS_ENABLE_PRAGMA_IVDEP ) + #pragma ivdep + #endif + for ( Member i = ibeg ; i < iend ; ++i ) { + iterate_type( mdr_policy, functor )( i ); + } + } + + static void exec( ThreadsExec & exec , const void * arg ) + { + exec_schedule(exec,arg); + } + + template + static + typename std::enable_if< std::is_same::value >::type + exec_schedule( ThreadsExec & exec , const void * arg ) + { + const ParallelFor & self = * ((const ParallelFor *) arg ); + + WorkRange range( self.m_policy , exec.pool_rank() , exec.pool_size() ); + + ParallelFor::exec_range + ( self.m_mdr_policy, self.m_functor , range.begin() , range.end() ); + + exec.fan_in(); + } + + template + static + typename std::enable_if< std::is_same::value >::type + exec_schedule( ThreadsExec & exec , const void * arg ) + { + const ParallelFor & self = * ((const ParallelFor *) arg ); + + WorkRange range( self.m_policy , exec.pool_rank() , exec.pool_size() ); + + exec.set_work_range(range.begin(),range.end(),self.m_policy.chunk_size()); + exec.reset_steal_target(); + exec.barrier(); + + long work_index = exec.get_work_index(); + + while(work_index != -1) { + const Member begin = static_cast(work_index) * self.m_policy.chunk_size(); + const Member end = begin + self.m_policy.chunk_size() < self.m_policy.end()?begin+self.m_policy.chunk_size():self.m_policy.end(); + + ParallelFor::exec_range + ( self.m_mdr_policy, self.m_functor , begin , end ); + work_index = exec.get_work_index(); + } + + exec.fan_in(); + } + +public: + + inline + void execute() const + { + ThreadsExec::start( & ParallelFor::exec , this ); + ThreadsExec::fence(); + } + + ParallelFor( const FunctorType & arg_functor + , const MDRangePolicy & arg_policy ) + : m_functor( arg_functor ) + , m_mdr_policy( arg_policy ) + , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) ) + {} +}; + //---------------------------------------------------------------------------- /* ParallelFor Kokkos::Threads with TeamPolicy */ @@ -440,6 +544,169 @@ public: }; + +// MDRangePolicy impl +template< class FunctorType , class ReducerType, class ... Traits > +class ParallelReduce< FunctorType + , Kokkos::Experimental::MDRangePolicy< Traits ... > + , ReducerType + , Kokkos::Threads + > +{ +private: + + typedef Kokkos::Experimental::MDRangePolicy< Traits ... 
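[Note on the Threads MDRange ParallelFor above: the Static path splits the flattened tile range evenly by pool rank through WorkRange, while the Dynamic path has every thread pull chunk indices via get_work_index() until it returns -1. The static split is block partitioning of [0, n) over pool_size ranks; a sketch of one common rounding (WorkRange's exact remainder handling may differ):

#include <cstdio>

// Block-partition [0, n) across `size` ranks: each rank gets a contiguous
// slice, with the remainder spread over the low ranks.
void work_range(int n, int rank, int size, int* begin, int* end) {
  const int base = n / size, rem = n % size;
  *begin = rank * base + (rank < rem ? rank : rem);
  *end   = *begin + base + (rank < rem ? 1 : 0);
}

int main() {
  int b, e;
  for (int r = 0; r < 4; ++r) {   // 10 tiles over 4 threads -> 3,3,2,2
    work_range(10, r, 4, &b, &e);
    std::printf("rank %d: [%d,%d)\n", r, b, e);
  }
}
]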
> MDRangePolicy ; + typedef typename MDRangePolicy::impl_range_policy Policy ; + + typedef typename MDRangePolicy::work_tag WorkTag ; + typedef typename Policy::WorkRange WorkRange ; + typedef typename Policy::member_type Member ; + + typedef Kokkos::Impl::if_c< std::is_same::value, FunctorType, ReducerType> ReducerConditional; + typedef typename ReducerConditional::type ReducerTypeFwd; + + typedef typename ReducerTypeFwd::value_type ValueType; + + typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ; + + typedef typename ValueTraits::pointer_type pointer_type ; + typedef typename ValueTraits::reference_type reference_type ; + + using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy + , FunctorType + , WorkTag + , ValueType + >; + + const FunctorType m_functor ; + const MDRangePolicy m_mdr_policy ; + const Policy m_policy ; // construct as RangePolicy( 0, num_tiles ).set_chunk_size(1) in ctor + const ReducerType m_reducer ; + const pointer_type m_result_ptr ; + + inline static + void + exec_range( const MDRangePolicy & mdr_policy + , const FunctorType & functor + , const Member & ibeg , const Member & iend + , reference_type update ) + { + #if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \ + defined( KOKKOS_ENABLE_PRAGMA_IVDEP ) + #pragma ivdep + #endif + for ( Member i = ibeg ; i < iend ; ++i ) { + iterate_type( mdr_policy, functor, update )( i ); + } + } + + static void + exec( ThreadsExec & exec , const void * arg ) { + exec_schedule(exec, arg); + } + + template + static + typename std::enable_if< std::is_same::value >::type + exec_schedule( ThreadsExec & exec , const void * arg ) + { + const ParallelReduce & self = * ((const ParallelReduce *) arg ); + const WorkRange range( self.m_policy, exec.pool_rank(), exec.pool_size() ); + + ParallelReduce::exec_range + ( self.m_mdr_policy, self.m_functor , range.begin() , range.end() + , ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ) ); + + exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) ); + } + + template + static + typename std::enable_if< std::is_same::value >::type + exec_schedule( ThreadsExec & exec , const void * arg ) + { + const ParallelReduce & self = * ((const ParallelReduce *) arg ); + const WorkRange range( self.m_policy, exec.pool_rank(), exec.pool_size() ); + + exec.set_work_range(range.begin(),range.end(),self.m_policy.chunk_size()); + exec.reset_steal_target(); + exec.barrier(); + + long work_index = exec.get_work_index(); + reference_type update = ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ); + while(work_index != -1) { + const Member begin = static_cast(work_index) * self.m_policy.chunk_size(); + const Member end = begin + self.m_policy.chunk_size() < self.m_policy.end()?begin+self.m_policy.chunk_size():self.m_policy.end(); + ParallelReduce::exec_range + ( self.m_mdr_policy, self.m_functor , begin , end + , update ); + work_index = exec.get_work_index(); + } + + exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) ); + } + +public: + + inline + void execute() const + { + ThreadsExec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 ); + + ThreadsExec::start( & ParallelReduce::exec , this ); + 
+ ThreadsExec::fence(); + + if ( m_result_ptr ) { + + const pointer_type data = + (pointer_type) ThreadsExec::root_reduce_scratch(); + + const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) ); + for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; } + } + } + + template< class HostViewType > + ParallelReduce( const FunctorType & arg_functor , + const MDRangePolicy & arg_policy , + const HostViewType & arg_result_view , + typename std::enable_if< + Kokkos::is_view< HostViewType >::value && + !Kokkos::is_reducer_type::value + ,void*>::type = NULL) + : m_functor( arg_functor ) + , m_mdr_policy( arg_policy ) + , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) ) + , m_reducer( InvalidType() ) + , m_result_ptr( arg_result_view.ptr_on_device() ) + { + static_assert( Kokkos::is_view< HostViewType >::value + , "Kokkos::Threads reduce result must be a View" ); + + static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value + , "Kokkos::Threads reduce result must be a View in HostSpace" ); + } + + inline + ParallelReduce( const FunctorType & arg_functor + , MDRangePolicy arg_policy + , const ReducerType& reducer ) + : m_functor( arg_functor ) + , m_mdr_policy( arg_policy ) + , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) ) + , m_reducer( reducer ) + , m_result_ptr( reducer.view().data() ) + { + /*static_assert( std::is_same< typename ViewType::memory_space + , Kokkos::HostSpace >::value + , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/ + } + +}; + + //---------------------------------------------------------------------------- /* ParallelReduce with Kokkos::Threads and TeamPolicy */ diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp new file mode 100644 index 0000000000..be904a1670 --- /dev/null +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp @@ -0,0 +1,115 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_THREADS_WORKGRAPHPOLICY_HPP +#define KOKKOS_THREADS_WORKGRAPHPOLICY_HPP + +namespace Kokkos { +namespace Impl { + +template< class FunctorType , class ... Traits > +class ParallelFor< FunctorType , + Kokkos::Experimental::WorkGraphPolicy< Traits ... > , + Kokkos::Threads + > + : public Kokkos::Impl::Experimental:: + WorkGraphExec< FunctorType, + Kokkos::Threads, + Traits ... + > +{ +private: + + typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ; + typedef Kokkos::Impl::Experimental:: + WorkGraphExec Base ; + typedef ParallelFor, + Kokkos::Threads> Self ; + + template< class TagType > + typename std::enable_if< std::is_same< TagType , void >::value >::type + exec_one(const typename Policy::member_type& i) const { + Base::m_functor( i ); + } + + template< class TagType > + typename std::enable_if< ! std::is_same< TagType , void >::value >::type + exec_one(const typename Policy::member_type& i) const { + const TagType t{} ; + Base::m_functor( t , i ); + } + + inline void exec_one_thread() const { + for (std::int32_t i; (-1 != (i = Base::before_work())); ) { + exec_one< typename Policy::work_tag >( i ); + Base::after_work(i); + } + } + + static inline void thread_main( ThreadsExec&, const void* arg ) { + const Self& self = *(static_cast(arg)); + self.exec_one_thread(); + } + +public: + + inline + void execute() + { + ThreadsExec::start( & Self::thread_main, this ); + ThreadsExec::fence(); + } + + inline + ParallelFor( const FunctorType & arg_functor + , const Policy & arg_policy ) + : Base( arg_functor, arg_policy ) + { + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif /* #define KOKKOS_THREADS_WORKGRAPHPOLICY_HPP */ diff --git a/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp b/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp index 77a1e8754d..0171b209e5 100644 --- a/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp +++ b/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp @@ -141,7 +141,6 @@ namespace Kokkos { namespace Experimental { namespace Impl { #define LOOP_ARGS_8 LOOP_ARGS_7, i7 + m_offset[7] - // New Loop Macros... // parallel_for, non-tagged #define APPLY( func, ... 
) \ @@ -1010,8 +1009,6 @@ namespace Kokkos { namespace Experimental { namespace Impl { // end tagged macros - - // Structs for calling loops template < int Rank, bool IsLeft, typename IType, typename Tagged, typename Enable = void > struct Tile_Loop_Type; @@ -1279,6 +1276,19 @@ struct Tile_Loop_Type<8, IsLeft, IType, Tagged, typename std::enable_if< !std::i template using is_void = std::is_same< T , void >; +template +struct is_type_array : std::false_type +{ + using value_type = T; +}; + +template +struct is_type_array< T[] > : std::true_type +{ + using value_type = T; +}; + + template < typename RP , typename Functor , typename Tag = void @@ -1761,18 +1771,17 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i RP const& m_rp; Functor const& m_func; typename std::conditional< std::is_same::value,int,Tag>::type m_tag; -// value_type & m_v; - }; -// ValueType: For reductions +// For ParallelReduce +// ValueType - scalar: For reductions template < typename RP , typename Functor , typename Tag , typename ValueType > -struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void::value >::type > +struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void::value && !is_type_array::value >::type > { using index_type = typename RP::index_type; using point_type = typename RP::point_type; @@ -2251,12 +2260,497 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i }; +// For ParallelReduce +// Extra specialization for array reductions +// ValueType[]: For array reductions +template < typename RP + , typename Functor + , typename Tag + , typename ValueType + > +struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void::value && is_type_array::value >::type > +{ + using index_type = typename RP::index_type; + using point_type = typename RP::point_type; + + using value_type = typename is_type_array::value_type; // strip away the 'array-ness' [], only underlying type remains + + inline + HostIterateTile( RP const& rp, Functor const& func, value_type *v ) // v should be an array; treat as pointer for compatibility since size is not known nor needed here + : m_rp(rp) //Cuda 7.0 does not like braces... + , m_func(func) + , m_v(v) // use with non-void ValueType struct + {} + + inline + bool check_iteration_bounds( point_type& partial_tile , point_type& offset ) const { + bool is_full_tile = true; + + for ( int i = 0; i < RP::rank; ++i ) { + if ((offset[i] + m_rp.m_tile[i]) <= m_rp.m_upper[i]) { + partial_tile[i] = m_rp.m_tile[i] ; + } + else { + is_full_tile = false ; + partial_tile[i] = (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1 + : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 ? 
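[Note: the new is_type_array trait above is what routes ValueType[] reductions to the array specialization of HostIterateTile: the unsized-array form T[] matches the true_type partial specialization, and value_type strips the array-ness to recover the element type. A self-contained demonstration of the same dispatch trick:

#include <cstdio>
#include <type_traits>

template <typename T> struct is_type_array : std::false_type { using value_type = T; };
template <typename T> struct is_type_array<T[]> : std::true_type { using value_type = T; };

// Two overloads selected the same way the HostIterateTile specializations are:
template <typename V>
typename std::enable_if<!is_type_array<V>::value>::type kind() { std::printf("scalar reduction\n"); }
template <typename V>
typename std::enable_if<is_type_array<V>::value>::type kind() { std::printf("array reduction\n"); }

int main() {
  static_assert(!is_type_array<double>::value, "scalar stays false_type");
  static_assert(is_type_array<double[]>::value, "T[] matches true_type");
  static_assert(std::is_same<is_type_array<double[]>::value_type, double>::value, "element type");
  kind<double>();    // prints: scalar reduction
  kind<double[]>();  // prints: array reduction
}
]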
(m_rp.m_upper[i] - offset[i]) + : (m_rp.m_upper[i] - m_rp.m_lower[i]) ; // when single tile encloses range + } + } + + return is_full_tile ; + } // end check bounds + + + template + struct RankTag + { + typedef RankTag type; + enum { value = (int)Rank }; + }; + + +#if KOKKOS_ENABLE_NEW_LOOP_MACROS + template + inline + void + operator()(IType tile_idx) const + { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == RP::Left) { + for (int i=0; i=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; + + Tile_Loop_Type< RP::rank, (RP::inner_direction == RP::Left), index_type, Tag >::apply( m_v, m_func, full_tile, m_offset, m_rp.m_tile, m_tiledims ); + + } + +#else + template + inline + void + operator()(IType tile_idx) const + { operator_impl( tile_idx , RankTag() ); } + // added due to compiler error when using sfinae to choose operator based on rank + + + template + inline + void operator_impl( IType tile_idx , const RankTag<2> ) const + { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == RP::Left) { + for (int i=0; i=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; + + if (RP::inner_direction == RP::Left) { + if ( full_tile ) { +// #pragma simd + LOOP_2L(index_type, m_tiledims) { + apply( LOOP_ARGS_2 ); + } + } else { +// #pragma simd + LOOP_2L(index_type, m_tiledims) { + apply( LOOP_ARGS_2 ); + } + } + } // end RP::Left + else { + if ( full_tile ) { +// #pragma simd + LOOP_2R(index_type, m_tiledims) { + apply( LOOP_ARGS_2 ); + } + } else { +// #pragma simd + LOOP_2R(index_type, m_tiledims) { + apply( LOOP_ARGS_2 ); + } + } + } // end RP::Right + + } //end op() rank == 2 + + + template + inline + void operator_impl( IType tile_idx , const RankTag<3> ) const + { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == RP::Left) { + for (int i=0; i=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; + + if (RP::inner_direction == RP::Left) { + if ( full_tile ) { +// #pragma simd + LOOP_3L(index_type, m_tiledims) { + apply( LOOP_ARGS_3 ); + } + } else { +// #pragma simd + LOOP_3L(index_type, m_tiledims) { + apply( LOOP_ARGS_3 ); + } + } + } // end RP::Left + else { + if ( full_tile ) { +// #pragma simd + LOOP_3R(index_type, m_tiledims) { + apply( LOOP_ARGS_3 ); + } + } else { +// #pragma simd + LOOP_3R(index_type, m_tiledims) { + apply( LOOP_ARGS_3 ); + } + } + } // end RP::Right + + } //end op() rank == 3 + + + template + inline + void operator_impl( IType tile_idx , const RankTag<4> ) const + { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == RP::Left) { + for (int i=0; i=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + //Check if offset+tiledim in 
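[Note: check_iteration_bounds(), completed just above, clips boundary tiles: a tile whose offset plus extent would overrun m_upper iterates only the remaining width, so no functor call ever sees an out-of-bounds index. Worked through for one dimension with upper = 10 and tile = 4, tiles start at offsets 0, 4, 8; the first two are full (width 4) and the last is partial (width 10 - 8 = 2):

#include <cstdio>

int main() {
  const int upper = 10, tile = 4;
  for (int off = 0; off < upper; off += tile) {
    const int width = (off + tile <= upper) ? tile : upper - off;  // clip the last tile
    std::printf("tile at %d iterates [%d,%d)\n", off, off, off + width);
  }
}
]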
bounds - if not, replace tile dims with the partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; + + if (RP::inner_direction == RP::Left) { + if ( full_tile ) { +// #pragma simd + LOOP_4L(index_type, m_tiledims) { + apply( LOOP_ARGS_4 ); + } + } else { +// #pragma simd + LOOP_4L(index_type, m_tiledims) { + apply( LOOP_ARGS_4 ); + } + } + } // end RP::Left + else { + if ( full_tile ) { +// #pragma simd + LOOP_4R(index_type, m_tiledims) { + apply( LOOP_ARGS_4 ); + } + } else { +// #pragma simd + LOOP_4R(index_type, m_tiledims) { + apply( LOOP_ARGS_4 ); + } + } + } // end RP::Right + + } //end op() rank == 4 + + + template + inline + void operator_impl( IType tile_idx , const RankTag<5> ) const + { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == RP::Left) { + for (int i=0; i=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; + + if (RP::inner_direction == RP::Left) { + if ( full_tile ) { +// #pragma simd + LOOP_5L(index_type, m_tiledims) { + apply( LOOP_ARGS_5 ); + } + } else { +// #pragma simd + LOOP_5L(index_type, m_tiledims) { + apply( LOOP_ARGS_5 ); + } + } + } // end RP::Left + else { + if ( full_tile ) { +// #pragma simd + LOOP_5R(index_type, m_tiledims) { + apply( LOOP_ARGS_5 ); + } + } else { +// #pragma simd + LOOP_5R(index_type, m_tiledims) { + apply( LOOP_ARGS_5 ); + } + } + } // end RP::Right + + } //end op() rank == 5 + + + template + inline + void operator_impl( IType tile_idx , const RankTag<6> ) const + { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == RP::Left) { + for (int i=0; i=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; + + if (RP::inner_direction == RP::Left) { + if ( full_tile ) { +// #pragma simd + LOOP_6L(index_type, m_tiledims) { + apply( LOOP_ARGS_6 ); + } + } else { +// #pragma simd + LOOP_6L(index_type, m_tiledims) { + apply( LOOP_ARGS_6 ); + } + } + } // end RP::Left + else { + if ( full_tile ) { +// #pragma simd + LOOP_6R(index_type, m_tiledims) { + apply( LOOP_ARGS_6 ); + } + } else { +// #pragma simd + LOOP_6R(index_type, m_tiledims) { + apply( LOOP_ARGS_6 ); + } + } + } // end RP::Right + + } //end op() rank == 6 + + + template + inline + void operator_impl( IType tile_idx , const RankTag<7> ) const + { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == RP::Left) { + for (int i=0; i=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; + + if (RP::inner_direction == RP::Left) { + if ( full_tile ) { +// #pragma simd + LOOP_7L(index_type, m_tiledims) { + apply( LOOP_ARGS_7 ); + } + } else { +// #pragma simd + LOOP_7L(index_type, m_tiledims) { + apply( LOOP_ARGS_7 ); + } + } + } // end RP::Left + else { + if ( full_tile ) { +// #pragma simd + LOOP_7R(index_type, m_tiledims) { + apply( 
+
+
+    template <typename IType>
+    inline
+    void operator_impl( IType tile_idx , const RankTag<8> ) const
+    {
+      point_type m_offset;
+      point_type m_tiledims;
+
+      if (RP::outer_direction == RP::Left) {
+        for (int i=0; i<RP::rank; ++i) {
+          m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+          tile_idx /= m_rp.m_tile_end[i];
+        }
+      }
+      else {
+        for (int i=RP::rank-1; i>=0; --i) {
+          m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+          tile_idx /= m_rp.m_tile_end[i];
+        }
+      }
+
+      //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+      const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+      if (RP::inner_direction == RP::Left) {
+        if ( full_tile ) {
+//          #pragma simd
+          LOOP_8L(index_type, m_tiledims) {
+            apply( LOOP_ARGS_8 );
+          }
+        } else {
+//          #pragma simd
+          LOOP_8L(index_type, m_tiledims) {
+            apply( LOOP_ARGS_8 );
+          }
+        }
+      } // end RP::Left
+      else {
+        if ( full_tile ) {
+//          #pragma simd
+          LOOP_8R(index_type, m_tiledims) {
+            apply( LOOP_ARGS_8 );
+          }
+        } else {
+//          #pragma simd
+          LOOP_8R(index_type, m_tiledims) {
+            apply( LOOP_ARGS_8 );
+          }
+        }
+      } // end RP::Right
+
+    } //end op() rank == 8
+#endif
+
+
+  template <typename... Args>
+  typename std::enable_if<( sizeof...(Args) == RP::rank && std::is_same<Tag,void>::value), void>::type
+  apply(Args &&... args) const
+  {
+    m_func(args... , m_v);
+  }
+
+  template <typename... Args>
+  typename std::enable_if<( sizeof...(Args) == RP::rank && !std::is_same<Tag,void>::value), void>::type
+  apply(Args &&... args) const
+  {
+    m_func( m_tag, args... , m_v);
+  }
+
+
+  RP const& m_rp;
+  Functor const& m_func;
+  value_type * m_v;
+  typename std::conditional< std::is_same<Tag,void>::value,int,Tag>::type m_tag;
+
+};
+
 
 // ------------------------------------------------------------------ //
 
 // MDFunctor - wraps the range_policy and functor to pass to IterateTile
-// Serial, Threads, OpenMP
+// Used for md_parallel_{for,reduce} with Serial, Threads, OpenMP
 // Cuda uses DeviceIterateTile directly within md_parallel_for
-// ParallelReduce
+// TODO Once md_parallel_{for,reduce} removed, this can be removed
+
+// ParallelReduce - scalar reductions
 template < typename MDRange, typename Functor, typename ValueType = void >
 struct MDFunctor
@@ -2273,7 +2767,7 @@ struct MDFunctor
 
   inline
-  MDFunctor( MDRange const& range, Functor const& f, ValueType & v )
+  MDFunctor( MDRange const& range, Functor const& f )
    : m_range( range )
    , m_func( f )
  {}
@@ -2290,7 +2784,6 @@ struct MDFunctor
   inline
   MDFunctor& operator=( MDFunctor && ) = default;
 
-//  KOKKOS_FORCEINLINE_FUNCTION //Caused cuda warning - __host__ warning
   inline
   void operator()(index_type t, value_type & v) const
   {
@@ -2301,6 +2794,56 @@ struct MDFunctor
   Functor m_func;
 };
 
+
+// ParallelReduce - array reductions
+template < typename MDRange, typename Functor, typename ValueType >
+struct MDFunctor< MDRange, Functor, ValueType[] >
+{
+  using range_policy = MDRange;
+  using functor_type = Functor;
+  using value_type   = ValueType[];
+  using work_tag     = typename range_policy::work_tag;
+  using index_type   = typename range_policy::index_type;
+  using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRange
+                                                                           , Functor
+                                                                           , work_tag
+                                                                           , value_type
+                                                                           >;
+
+
+  inline
+  MDFunctor( MDRange const& range, Functor const& f )
+    : m_range( range )
+    , m_func( f )
+    , value_count( f.value_count )
+  {}
+
+  inline
+  MDFunctor( MDFunctor const& ) = default;
+
+  inline
+  MDFunctor& operator=( MDFunctor const& ) = default;
+
+  inline
+  MDFunctor( MDFunctor && ) = default;
+
+  inline
+  MDFunctor& operator=( MDFunctor && ) = default;
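The ValueType[] specialization here exists because Kokkos array reductions carry a runtime length: the functor declares value_type as an array type and exposes a value_count member, which the constructor above forwards. A sketch of such a functor under the standard Kokkos reduction conventions of this era (the names ColumnSums and m, and the dimension_1() accessor, are illustrative assumptions, not from this diff):

#include <Kokkos_Core.hpp>

struct ColumnSums {
  typedef double value_type[];     // array value type => array reduction
  Kokkos::View<const double**> m;
  const size_t value_count;        // runtime length read by the dispatch layer

  ColumnSums( const Kokkos::View<const double**> & m_ )
    : m( m_ ), value_count( m_.dimension_1() ) {}

  KOKKOS_INLINE_FUNCTION
  void operator()( const int i, value_type sum ) const {
    for ( size_t j = 0; j < value_count; ++j ) sum[j] += m(i,j);
  }

  KOKKOS_INLINE_FUNCTION
  void join( volatile value_type dst, const volatile value_type src ) const {
    for ( size_t j = 0; j < value_count; ++j ) dst[j] += src[j];
  }

  KOKKOS_INLINE_FUNCTION
  void init( value_type sum ) const {
    for ( size_t j = 0; j < value_count; ++j ) sum[j] = 0.0;
  }
};

// Typical use: double sums[NCOL]; Kokkos::parallel_reduce(nrows, ColumnSums(m), sums);

The FIXME just below notes that init and join do not yet reach the functor through MDFunctor; the sketch shows what they would look like once they do.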
+
+  // FIXME Init and Join, as defined in m_func, are not working through the MDFunctor
+  // Best path forward is to eliminate need for MDFunctor, directly use MDRangePolicy within Parallel{For,Reduce} ??
+  inline
+  void operator()(index_type t, value_type v) const
+  {
+    iterate_type(m_range, m_func, v)(t);
+  }
+
+  MDRange m_range;
+  Functor m_func;
+  size_t value_count;
+};
+
+
 // ParallelFor
 template < typename MDRange, typename Functor >
 struct MDFunctor< MDRange, Functor, void >
@@ -2349,4 +2892,3 @@ struct MDFunctor< MDRange, Functor, void >
 
 } } } //end namespace Kokkos::Experimental::Impl
 
 #endif
-
diff --git a/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp b/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp
index c5685c5b62..3fb15c8d1e 100644
--- a/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp
@@ -55,16 +55,19 @@ template < typename ExecutionSpace   = void
          , typename Schedule         = void
          , typename WorkTag          = void
          , typename IndexType        = void
          , typename IterationPattern = void
+         , typename LaunchBounds     = void
          >
 struct PolicyTraitsBase
 {
-  using type = PolicyTraitsBase< ExecutionSpace, Schedule, WorkTag, IndexType, IterationPattern>;
+  using type = PolicyTraitsBase< ExecutionSpace, Schedule, WorkTag, IndexType,
+                                 IterationPattern, LaunchBounds>;
 
   using execution_space   = ExecutionSpace;
   using schedule_type     = Schedule;
   using work_tag          = WorkTag;
   using index_type        = IndexType;
   using iteration_pattern = IterationPattern;
+  using launch_bounds     = LaunchBounds;
 };
 
@@ -78,6 +81,7 @@ struct SetExecutionSpace
             , typename PolicyBase::work_tag
             , typename PolicyBase::index_type
             , typename PolicyBase::iteration_pattern
+            , typename PolicyBase::launch_bounds
             >;
 };
 
@@ -91,6 +95,7 @@ struct SetSchedule
             , typename PolicyBase::work_tag
             , typename PolicyBase::index_type
             , typename PolicyBase::iteration_pattern
+            , typename PolicyBase::launch_bounds
             >;
 };
 
@@ -104,6 +109,7 @@ struct SetWorkTag
             , WorkTag
             , typename PolicyBase::index_type
             , typename PolicyBase::iteration_pattern
+            , typename PolicyBase::launch_bounds
             >;
 };
 
@@ -117,6 +123,7 @@ struct SetIndexType
             , typename PolicyBase::work_tag
             , IndexType
             , typename PolicyBase::iteration_pattern
+            , typename PolicyBase::launch_bounds
             >;
 };
 
@@ -131,6 +138,22 @@ struct SetIterationPattern
             , typename PolicyBase::work_tag
             , typename PolicyBase::index_type
             , IterationPattern
+            , typename PolicyBase::launch_bounds
+            >;
+};
+
+
+template <typename PolicyBase, typename LaunchBounds>
+struct SetLaunchBounds
+{
+  static_assert( is_void< typename PolicyBase::launch_bounds >::value
+               , "Kokkos Error: More than one launch_bounds given" );
+  using type = PolicyTraitsBase< typename PolicyBase::execution_space
+                               , typename PolicyBase::schedule_type
+                               , typename PolicyBase::work_tag
+                               , typename PolicyBase::index_type
+                               , typename PolicyBase::iteration_pattern
+                               , LaunchBounds
             >;
 };
 
@@ -146,8 +169,9 @@ struct AnalyzePolicy : public
   , typename std::conditional< is_index_type<T>::value       , SetIndexType<Base,T>
   , typename std::conditional< std::is_integral<T>::value    , SetIndexType<Base, IndexType<T> >
   , typename std::conditional< is_iteration_pattern<T>::value, SetIterationPattern<Base,T>
+  , typename std::conditional< is_launch_bounds<T>::value    , SetLaunchBounds<Base,T>
   , SetWorkTag<Base,T>
-  >::type >::type >::type >::type>::type::type
+  >::type >::type >::type >::type >::type>::type::type
   , Traits...
   >
 {};
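With launch_bounds threaded through PolicyTraitsBase, an execution policy can carry a CUDA __launch_bounds__ hint alongside its schedule, tag, and index type. A sketch of the intended use, assuming the LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM> trait is accepted as a RangePolicy template argument (the kernel and names here are illustrative, not from this diff):

#include <Kokkos_Core.hpp>

void scale_by_two( Kokkos::View<double*> x )
{
  // Cap the generated CUDA kernel at 512 threads/block and ask for at
  // least 4 resident blocks per SM; on non-CUDA backends the trait is inert.
  typedef Kokkos::RangePolicy< Kokkos::Cuda, Kokkos::LaunchBounds<512,4> > policy;

  Kokkos::parallel_for( policy( 0, x.dimension_0() ),
    KOKKOS_LAMBDA( const int i ) { x(i) *= 2.0; } );
}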
@@ -178,11 +202,18 @@ struct AnalyzePolicy
     , void // TODO set default iteration pattern
     , typename Base::iteration_pattern
     >::type;
+
+  using launch_bounds = typename std::conditional< is_void< typename Base::launch_bounds >::value
+                                                 , LaunchBounds<>
+                                                 , typename Base::launch_bounds
+                                                 >::type;
+
   using type = PolicyTraitsBase< execution_space
                                , schedule_type
                                , work_tag
                                , index_type
                                , iteration_pattern
+                               , launch_bounds
                                >;
 };
 
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
index 010b15064e..5b894b037b 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
@@ -41,6 +41,10 @@
 //@HEADER
 */
 
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+#include <xmmintrin.h>
+#endif
+
 #include <Kokkos_Macros.hpp>
 #if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP )
 #define KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP
@@ -126,11 +130,21 @@ T atomic_compare_exchange( volatile T * const dest , const T & compare ,
 
 inline
 int atomic_compare_exchange( volatile int * const dest, const int compare, const int val)
-{ return __sync_val_compare_and_swap(dest,compare,val); }
+{
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+  return __sync_val_compare_and_swap(dest,compare,val);
+}
 
 inline
 long atomic_compare_exchange( volatile long * const dest, const long compare, const long val )
-{ return __sync_val_compare_and_swap(dest,compare,val); }
+{
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+  return __sync_val_compare_and_swap(dest,compare,val);
+}
 
 #if defined( KOKKOS_ENABLE_GNU_ATOMICS )
 
@@ -159,6 +173,10 @@ T atomic_compare_exchange( volatile T * const dest, const T & compare,
     KOKKOS_INLINE_FUNCTION U() {};
   } tmp ;
 
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+
   tmp.i = __sync_val_compare_and_swap( (int*) dest , *((int*)&compare) , *((int*)&val) );
   return tmp.t ;
 }
@@ -175,6 +193,10 @@ T atomic_compare_exchange( volatile T * const dest, const T & compare,
     KOKKOS_INLINE_FUNCTION U() {};
   } tmp ;
 
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+
   tmp.i = __sync_val_compare_and_swap( (long*) dest , *((long*)&compare) , *((long*)&val) );
   return tmp.t ;
 }
@@ -193,6 +215,10 @@ T atomic_compare_exchange( volatile T * const dest, const T & compare,
     KOKKOS_INLINE_FUNCTION U() {};
   } tmp ;
 
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+
   tmp.i = Impl::cas128( (Impl::cas128_t*) dest , *((Impl::cas128_t*)&compare) , *((Impl::cas128_t*)&val) );
   return tmp.t ;
 }
@@ -209,6 +235,10 @@ T atomic_compare_exchange( volatile T * const dest , const T compare ,
 #endif
     , const T >::type& val )
 {
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+
   while( !Impl::lock_address_host_space( (void*) dest ) );
   T return_val = *dest;
   if( return_val == compare ) {
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Decrement.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Decrement.hpp
index 127de528f5..2a13a4865c 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Decrement.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Decrement.hpp
@@ -41,6 +41,10 @@
 //@HEADER
 */
 
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+#include <xmmintrin.h>
+#endif
+
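Each of the atomic headers touched here gains the same guarded read-for-ownership prefetch: _MM_HINT_ET0 asks the core to pull the destination line into exclusive state, so the locked read-modify-write that follows does not first fetch a shared copy and then upgrade it. A condensed standalone sketch of the pattern, using the GCC/Clang __sync builtin (x86-only, and only when the build opts in; the function name is illustrative):

#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
#include <xmmintrin.h>   // _mm_prefetch, _MM_HINT_ET0
#endif

inline int fetch_add_with_rfo( volatile int * const dest , const int val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );  // prefetch with write intent
#endif
  return __sync_fetch_and_add( dest, val );          // locked read-modify-write
}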
#include #if defined( KOKKOS_ATOMIC_HPP) && ! defined( KOKKOS_ATOMIC_DECREMENT_HPP ) #define KOKKOS_ATOMIC_DECREMENT_HPP @@ -54,6 +58,10 @@ template<> KOKKOS_INLINE_FUNCTION void atomic_decrement(volatile char* a) { #if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__) +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) a, _MM_HINT_ET0 ); +#endif + __asm__ __volatile__( "lock decb %0" : /* no output registers */ @@ -69,6 +77,10 @@ template<> KOKKOS_INLINE_FUNCTION void atomic_decrement(volatile short* a) { #if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__) +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) a, _MM_HINT_ET0 ); +#endif + __asm__ __volatile__( "lock decw %0" : /* no output registers */ @@ -84,6 +96,10 @@ template<> KOKKOS_INLINE_FUNCTION void atomic_decrement(volatile int* a) { #if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__) +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) a, _MM_HINT_ET0 ); +#endif + __asm__ __volatile__( "lock decl %0" : /* no output registers */ @@ -99,6 +115,9 @@ template<> KOKKOS_INLINE_FUNCTION void atomic_decrement(volatile long long int* a) { #if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__) +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) a, _MM_HINT_ET0 ); +#endif __asm__ __volatile__( "lock decq %0" : /* no output registers */ diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp index a1ff47abce..9ba3cae9fc 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp @@ -41,6 +41,10 @@ //@HEADER */ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) +#include +#endif + #include #if defined( KOKKOS_ATOMIC_HPP ) && ! 
defined( KOKKOS_ATOMIC_EXCHANGE_HPP ) #define KOKKOS_ATOMIC_EXCHANGE_HPP @@ -81,6 +85,10 @@ T atomic_exchange( typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val ) { // int tmp = __ullAtomicExch( (int*) dest , *((int*)&val) ); +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + int tmp = atomicExch( ((int*)dest) , *((int*)&val) ); return *((T*)&tmp); } @@ -93,6 +101,11 @@ T atomic_exchange( sizeof(T) == sizeof(unsigned long long int) , const T & >::type val ) { typedef unsigned long long int type ; + +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + // type tmp = __ullAtomicExch( (type*) dest , *((type*)&val) ); type tmp = atomicExch( ((type*)dest) , *((type*)&val) ); return *((T*)&tmp); @@ -108,6 +121,10 @@ T atomic_exchange( volatile T * const dest , { T return_val; // This is a way to (hopefully) avoid dead lock in a warp +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + int done = 0; unsigned int active = __ballot(1); unsigned int done_active = 0; @@ -173,6 +190,9 @@ T atomic_exchange( volatile T * const dest , , const T & >::type val ) { typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type type ; +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif const type v = *((type*)&val); // Extract to be sure the value doesn't change @@ -201,6 +221,10 @@ T atomic_exchange( volatile T * const dest , typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t) , const T & >::type val ) { +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + union U { Impl::cas128_t i ; T t ; @@ -260,6 +284,10 @@ void atomic_assign( volatile T * const dest , { typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type type ; +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + const type v = *((type*)&val); // Extract to be sure the value doesn't change type assumed ; @@ -285,6 +313,10 @@ void atomic_assign( volatile T * const dest , typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t) , const T & >::type val ) { +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + union U { Impl::cas128_t i ; T t ; diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp index 860c8e0e43..084c55efed 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp @@ -41,6 +41,10 @@ //@HEADER */ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) +#include +#endif + #include #if defined( KOKKOS_ATOMIC_HPP ) && ! 
defined( KOKKOS_ATOMIC_FETCH_ADD_HPP ) #define KOKKOS_ATOMIC_FETCH_ADD_HPP @@ -161,36 +165,60 @@ T atomic_fetch_add( volatile T * const dest , inline int atomic_fetch_add( volatile int * dest , const int val ) { - int original = val; +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif - __asm__ __volatile__( - "lock xadd %1, %0" - : "+m" (*dest), "+r" (original) - : "m" (*dest), "r" (original) - : "memory" + int original = val; + + __asm__ __volatile__( + "lock xadd %1, %0" + : "+m" (*dest), "+r" (original) + : "m" (*dest), "r" (original) + : "memory" ); - return original; + return original; } #else inline int atomic_fetch_add( volatile int * const dest , const int val ) -{ return __sync_fetch_and_add(dest, val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_add(dest, val); +} #endif inline long int atomic_fetch_add( volatile long int * const dest , const long int val ) -{ return __sync_fetch_and_add(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_add(dest,val); +} #if defined( KOKKOS_ENABLE_GNU_ATOMICS ) inline unsigned int atomic_fetch_add( volatile unsigned int * const dest , const unsigned int val ) -{ return __sync_fetch_and_add(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_add(dest,val); +} inline unsigned long int atomic_fetch_add( volatile unsigned long int * const dest , const unsigned long int val ) -{ return __sync_fetch_and_add(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_add(dest,val); +} #endif @@ -205,6 +233,10 @@ T atomic_fetch_add( volatile T * const dest , inline U() {}; } assume , oldval , newval ; +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + oldval.t = *dest ; do { @@ -228,6 +260,10 @@ T atomic_fetch_add( volatile T * const dest , inline U() {}; } assume , oldval , newval ; +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + oldval.t = *dest ; do { @@ -253,6 +289,10 @@ T atomic_fetch_add( volatile T * const dest , inline U() {}; } assume , oldval , newval ; +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + oldval.t = *dest ; do { diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp index 83f5b2a5aa..6ecb65336c 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp @@ -41,6 +41,10 @@ //@HEADER */ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) +#include +#endif + #include #if defined( KOKKOS_ATOMIC_HPP ) && ! 
defined( KOKKOS_ATOMIC_FETCH_AND_HPP ) #define KOKKOS_ATOMIC_FETCH_AND_HPP @@ -76,21 +80,41 @@ unsigned long long int atomic_fetch_and( volatile unsigned long long int * const inline int atomic_fetch_and( volatile int * const dest , const int val ) -{ return __sync_fetch_and_and(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_and(dest,val); +} inline long int atomic_fetch_and( volatile long int * const dest , const long int val ) -{ return __sync_fetch_and_and(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_and(dest,val); +} #if defined( KOKKOS_ENABLE_GNU_ATOMICS ) inline unsigned int atomic_fetch_and( volatile unsigned int * const dest , const unsigned int val ) -{ return __sync_fetch_and_and(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_and(dest,val); +} inline unsigned long int atomic_fetch_and( volatile unsigned long int * const dest , const unsigned long int val ) -{ return __sync_fetch_and_and(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_and(dest,val); +} #endif diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp index 8c73b4c3ef..ed3b438f89 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp @@ -41,6 +41,10 @@ //@HEADER */ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) +#include +#endif + #include #if defined( KOKKOS_ATOMIC_HPP ) && ! 
defined( KOKKOS_ATOMIC_FETCH_OR_HPP ) #define KOKKOS_ATOMIC_FETCH_OR_HPP @@ -76,21 +80,41 @@ unsigned long long int atomic_fetch_or( volatile unsigned long long int * const inline int atomic_fetch_or( volatile int * const dest , const int val ) -{ return __sync_fetch_and_or(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_or(dest,val); +} inline long int atomic_fetch_or( volatile long int * const dest , const long int val ) -{ return __sync_fetch_and_or(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_or(dest,val); +} #if defined( KOKKOS_ENABLE_GNU_ATOMICS ) inline unsigned int atomic_fetch_or( volatile unsigned int * const dest , const unsigned int val ) -{ return __sync_fetch_and_or(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_or(dest,val); +} inline unsigned long int atomic_fetch_or( volatile unsigned long int * const dest , const unsigned long int val ) -{ return __sync_fetch_and_or(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_or(dest,val); +} #endif diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp index 504731d3a2..038cc13e9a 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp @@ -41,6 +41,10 @@ //@HEADER */ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) +#include +#endif + #include #if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_SUB_HPP ) #define KOKKOS_ATOMIC_FETCH_SUB_HPP @@ -136,21 +140,41 @@ T atomic_fetch_sub( volatile T * const dest , inline int atomic_fetch_sub( volatile int * const dest , const int val ) -{ return __sync_fetch_and_sub(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_sub(dest,val); +} inline long int atomic_fetch_sub( volatile long int * const dest , const long int val ) -{ return __sync_fetch_and_sub(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_sub(dest,val); +} #if defined( KOKKOS_ENABLE_GNU_ATOMICS ) inline unsigned int atomic_fetch_sub( volatile unsigned int * const dest , const unsigned int val ) -{ return __sync_fetch_and_sub(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_sub(dest,val); +} inline unsigned long int atomic_fetch_sub( volatile unsigned long int * const dest , const unsigned long int val ) -{ return __sync_fetch_and_sub(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_sub(dest,val); +} #endif @@ -161,6 +185,10 @@ T atomic_fetch_sub( volatile T * const dest , { union { int i ; T t ; } assume , oldval , newval ; +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + oldval.t = *dest ; do { @@ -178,6 +206,10 @@ T atomic_fetch_sub( volatile T * const dest , typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) && sizeof(T) == sizeof(long) , const T >::type val ) { 
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + union { long i ; T t ; } assume , oldval , newval ; oldval.t = *dest ; @@ -202,6 +234,10 @@ T atomic_fetch_sub( volatile T * const dest , && ( sizeof(T) != 8 ) , const T >::type& val ) { +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + while( !Impl::lock_address_host_space( (void*) dest ) ); T return_val = *dest; *dest = return_val - val; diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Increment.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Increment.hpp index 2985fad95e..e7626603fc 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Increment.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Increment.hpp @@ -41,6 +41,10 @@ //@HEADER */ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) +#include +#endif + #include #if defined( KOKKOS_ATOMIC_HPP) && ! defined( KOKKOS_ATOMIC_INCREMENT_HPP ) #define KOKKOS_ATOMIC_INCREMENT_HPP @@ -52,6 +56,9 @@ template<> KOKKOS_INLINE_FUNCTION void atomic_increment(volatile char* a) { #if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__) +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) a, _MM_HINT_ET0 ); +#endif __asm__ __volatile__( "lock incb %0" : /* no output registers */ @@ -67,6 +74,9 @@ template<> KOKKOS_INLINE_FUNCTION void atomic_increment(volatile short* a) { #if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__) +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) a, _MM_HINT_ET0 ); +#endif __asm__ __volatile__( "lock incw %0" : /* no output registers */ @@ -82,6 +92,9 @@ template<> KOKKOS_INLINE_FUNCTION void atomic_increment(volatile int* a) { #if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__) +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) a, _MM_HINT_ET0 ); +#endif __asm__ __volatile__( "lock incl %0" : /* no output registers */ @@ -97,6 +110,9 @@ template<> KOKKOS_INLINE_FUNCTION void atomic_increment(volatile long long int* a) { #if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! 
defined(__CUDA_ARCH__) +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) a, _MM_HINT_ET0 ); +#endif __asm__ __volatile__( "lock incq %0" : /* no output registers */ diff --git a/lib/kokkos/core/src/impl/Kokkos_Core.cpp b/lib/kokkos/core/src/impl/Kokkos_Core.cpp index f0ff6d78ec..f52cc469ac 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Core.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_Core.cpp @@ -87,17 +87,12 @@ setenv("MEMKIND_HBW_NODES", "1", 0); #if defined( KOKKOS_ENABLE_OPENMP ) if( std::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value || std::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ) { - if(num_threads>0) { - if(use_numa>0) { - Kokkos::OpenMP::initialize(num_threads,use_numa); - } - else { - Kokkos::OpenMP::initialize(num_threads); - } - } else { - Kokkos::OpenMP::initialize(); + if(use_numa>0) { + Kokkos::OpenMP::initialize(num_threads,use_numa); + } + else { + Kokkos::OpenMP::initialize(num_threads); } - //std::cout << "Kokkos::initialize() fyi: OpenMP enabled and initialized" << std::endl ; } else { //std::cout << "Kokkos::initialize() fyi: OpenMP enabled but not initialized" << std::endl ; @@ -437,10 +432,7 @@ void initialize(int& narg, char* arg[]) iarg++; } - InitArguments arguments; - arguments.num_threads = num_threads; - arguments.num_numa = numa; - arguments.device_id = device; + InitArguments arguments{num_threads, numa, device}; Impl::initialize_internal(arguments); } diff --git a/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp b/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp index dc75fb072f..fccd8e090f 100644 --- a/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp @@ -170,28 +170,31 @@ struct FunctorValueTraits< FunctorType , ArgTag , true /* == exists FunctorType: static_assert( 0 == ( sizeof(value_type) % sizeof(int) ) , "Reduction functor's declared value_type requires: 0 == sizeof(value_type) % sizeof(int)" ); + /* this cast to bool is needed for correctness by NVCC */ + enum : bool { IsArray = static_cast(Impl::is_array< typename FunctorType::value_type >::value) }; + // If not an array then what is the sizeof(value_type) - enum { StaticValueSize = Impl::is_array< typename FunctorType::value_type >::value ? 0 : sizeof(value_type) }; + enum { StaticValueSize = IsArray ? 0 : sizeof(value_type) }; typedef value_type * pointer_type ; // The reference_type for an array is 'value_type *' // The reference_type for a single value is 'value_type &' - typedef typename Impl::if_c< ! StaticValueSize , value_type * - , value_type & >::type reference_type ; + typedef typename Impl::if_c< IsArray , value_type * + , value_type & >::type reference_type ; // Number of values if single value template< class F > KOKKOS_FORCEINLINE_FUNCTION static - typename Impl::enable_if< std::is_same::value && StaticValueSize , unsigned >::type + typename Impl::enable_if< std::is_same::value && ! IsArray , unsigned >::type value_count( const F & ) { return 1 ; } // Number of values if an array, protect via templating because 'f.value_count' // will only exist when the functor declares the value_type to be an array. template< class F > KOKKOS_FORCEINLINE_FUNCTION static - typename Impl::enable_if< std::is_same::value && ! 
StaticValueSize , unsigned >::type + typename Impl::enable_if< std::is_same::value && IsArray , unsigned >::type value_count( const F & f ) { return f.value_count ; } // Total size of the value diff --git a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp index 8cb7430035..e11f8b6d34 100644 --- a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp @@ -70,62 +70,6 @@ #ifdef KOKKOS_ENABLE_HBWSPACE #define MEMKIND_TYPE MEMKIND_HBW //hbw_get_kind(HBW_PAGESIZE_4KB) -namespace Kokkos { -namespace Experimental { -namespace { - -static const int QUERY_SPACE_IN_PARALLEL_MAX = 16 ; - -typedef int (* QuerySpaceInParallelPtr )(); - -QuerySpaceInParallelPtr s_in_parallel_query[ QUERY_SPACE_IN_PARALLEL_MAX ] ; -int s_in_parallel_query_count = 0 ; - -} // namespace - -void HBWSpace::register_in_parallel( int (*device_in_parallel)() ) -{ - if ( 0 == device_in_parallel ) { - Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::HBWSpace::register_in_parallel ERROR : given NULL" ) ); - } - - int i = -1 ; - - if ( ! (device_in_parallel)() ) { - for ( i = 0 ; i < s_in_parallel_query_count && ! (*(s_in_parallel_query[i]))() ; ++i ); - } - - if ( i < s_in_parallel_query_count ) { - Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::HBWSpace::register_in_parallel_query ERROR : called in_parallel" ) ); - - } - - if ( QUERY_SPACE_IN_PARALLEL_MAX <= i ) { - Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::HBWSpace::register_in_parallel_query ERROR : exceeded maximum" ) ); - - } - - for ( i = 0 ; i < s_in_parallel_query_count && s_in_parallel_query[i] != device_in_parallel ; ++i ); - - if ( i == s_in_parallel_query_count ) { - s_in_parallel_query[s_in_parallel_query_count++] = device_in_parallel ; - } -} - -int HBWSpace::in_parallel() -{ - const int n = s_in_parallel_query_count ; - - int i = 0 ; - - while ( i < n && ! (*(s_in_parallel_query[i]))() ) { ++i ; } - - return i < n ; -} - -} // namespace Experiemtal -} // namespace Kokkos - /*--------------------------------------------------------------------------*/ namespace Kokkos { diff --git a/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp index 2a5c34c375..a5a73ddebb 100644 --- a/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp @@ -106,62 +106,6 @@ //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -namespace Kokkos { -namespace { - -static const int QUERY_SPACE_IN_PARALLEL_MAX = 16 ; - -typedef int (* QuerySpaceInParallelPtr )(); - -QuerySpaceInParallelPtr s_in_parallel_query[ QUERY_SPACE_IN_PARALLEL_MAX ] ; -int s_in_parallel_query_count = 0 ; - -} // namespace - -void HostSpace::register_in_parallel( int (*device_in_parallel)() ) -{ - if ( 0 == device_in_parallel ) { - Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel ERROR : given NULL" ) ); - } - - int i = -1 ; - - if ( ! (device_in_parallel)() ) { - for ( i = 0 ; i < s_in_parallel_query_count && ! 
(*(s_in_parallel_query[i]))() ; ++i ); - } - - if ( i < s_in_parallel_query_count ) { - Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel_query ERROR : called in_parallel" ) ); - - } - - if ( QUERY_SPACE_IN_PARALLEL_MAX <= i ) { - Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel_query ERROR : exceeded maximum" ) ); - - } - - for ( i = 0 ; i < s_in_parallel_query_count && s_in_parallel_query[i] != device_in_parallel ; ++i ); - - if ( i == s_in_parallel_query_count ) { - s_in_parallel_query[s_in_parallel_query_count++] = device_in_parallel ; - } -} - -int HostSpace::in_parallel() -{ - const int n = s_in_parallel_query_count ; - - int i = 0 ; - - while ( i < n && ! (*(s_in_parallel_query[i]))() ) { ++i ; } - - return i < n ; -} - -} // namespace Kokkos - -/*--------------------------------------------------------------------------*/ - namespace Kokkos { /* Default allocation mechanism */ @@ -340,9 +284,6 @@ void HostSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_ } } -constexpr const char* HostSpace::name() { - return m_name; -} } // namespace Kokkos //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp index ac200209c7..d2446bde09 100644 --- a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp @@ -45,7 +45,7 @@ #include #include #include -#include +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -58,9 +58,11 @@ void HostThreadTeamData::organize_pool { bool ok = true ; + memory_fence(); + // Verify not already a member of a pool: for ( int rank = 0 ; rank < size && ok ; ++rank ) { - ok = ( 0 != members[rank] ) && ( 0 == members[rank]->m_pool_scratch ); + ok = ( nullptr != members[rank] ) && ( 0 == members[rank]->m_pool_scratch ); } if ( ok ) { @@ -89,7 +91,6 @@ void HostThreadTeamData::organize_pool mem->m_team_alloc = 1 ; mem->m_league_rank = rank ; mem->m_league_size = size ; - mem->m_pool_rendezvous_step = 0 ; mem->m_team_rendezvous_step = 0 ; pool[ rank ] = mem ; } @@ -116,7 +117,6 @@ void HostThreadTeamData::disband_pool() m_team_alloc = 1 ; m_league_rank = 0 ; m_league_size = 1 ; - m_pool_rendezvous_step = 0 ; m_team_rendezvous_step = 0 ; } @@ -256,11 +256,6 @@ int HostThreadTeamData::rendezvous( int64_t * const buffer const int sync_offset = ( step & mask_mem_cycle ) + size_mem_cycle ; - union { - int64_t full ; - int8_t byte[8] ; - } value ; - if ( rank ) { const int group_begin = rank << shift_byte ; // == rank * size_byte @@ -275,13 +270,14 @@ int HostThreadTeamData::rendezvous( int64_t * const buffer const int end = group_begin + size_byte < size ? size_byte : size - group_begin ; - value.full = 0 ; - for ( int i = 0 ; i < end ; ++i ) value.byte[i] = int8_t( step ); + int64_t value = 0 ; - store_fence(); // This should not be needed but fixes #742 + for ( int i = 0 ; i < end ; ++i ) { + ((int8_t*) & value )[i] = int8_t( step ); + } spinwait_until_equal( buffer[ (rank << shift_mem_cycle) + sync_offset ] - , value.full ); + , value ); } { @@ -316,10 +312,12 @@ int HostThreadTeamData::rendezvous( int64_t * const buffer const int end = size_byte < size ? 
8 : size ; - value.full = 0 ; - for ( int i = 1 ; i < end ; ++i ) value.byte[i] = int8_t( step ); + int64_t value = 0 ; + for ( int i = 1 ; i < end ; ++i ) { + ((int8_t *) & value)[i] = int8_t( step ); + } - spinwait_until_equal( buffer[ sync_offset ], value.full ); + spinwait_until_equal( buffer[ sync_offset ], value ); } return rank ? 0 : 1 ; diff --git a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp index c050a16eae..7facc0a410 100644 --- a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp @@ -50,6 +50,7 @@ #include #include #include +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -67,14 +68,12 @@ public: // Assume upper bounds on number of threads: // pool size <= 1024 threads - // pool rendezvous <= ( 1024 / 8 ) * 4 + 4 = 2052 // team size <= 64 threads - // team rendezvous <= ( 64 / 8 ) * 4 + 4 = 36 enum : int { max_pool_members = 1024 }; enum : int { max_team_members = 64 }; - enum : int { max_pool_rendezvous = ( max_pool_members / 8 ) * 4 + 4 }; - enum : int { max_team_rendezvous = ( max_team_members / 8 ) * 4 + 4 }; + enum : int { max_pool_rendezvous = rendezvous_buffer_size( max_pool_members ) }; + enum : int { max_team_rendezvous = rendezvous_buffer_size( max_team_members ) }; private: @@ -114,7 +113,6 @@ private: int m_league_size ; int m_work_chunk ; int m_steal_rank ; // work stealing rank - int mutable m_pool_rendezvous_step ; int mutable m_team_rendezvous_step ; HostThreadTeamData * team_member( int r ) const noexcept @@ -147,6 +145,7 @@ public: int team_rendezvous( int const root ) const noexcept { return 1 == m_team_size ? 1 : + HostThreadTeamData:: rendezvous( m_team_scratch + m_team_rendezvous , m_team_rendezvous_step , m_team_size @@ -157,6 +156,7 @@ public: int team_rendezvous() const noexcept { return 1 == m_team_size ? 1 : + HostThreadTeamData:: rendezvous( m_team_scratch + m_team_rendezvous , m_team_rendezvous_step , m_team_size @@ -167,6 +167,7 @@ public: void team_rendezvous_release() const noexcept { if ( 1 < m_team_size ) { + HostThreadTeamData:: rendezvous_release( m_team_scratch + m_team_rendezvous , m_team_rendezvous_step ); } @@ -175,19 +176,30 @@ public: inline int pool_rendezvous() const noexcept { + static constexpr int yield_wait = + #if defined( KOKKOS_COMPILER_IBM ) + // If running on IBM POWER architecture the global + // level rendzvous should immediately yield when + // waiting for other threads in the pool to arrive. + 1 + #else + 0 + #endif + ; return 1 == m_pool_size ? 1 : + Kokkos::Impl:: rendezvous( m_pool_scratch + m_pool_rendezvous - , m_pool_rendezvous_step , m_pool_size - , m_pool_rank ); + , m_pool_rank + , yield_wait ); } inline void pool_rendezvous_release() const noexcept { if ( 1 < m_pool_size ) { - rendezvous_release( m_pool_scratch + m_pool_rendezvous - , m_pool_rendezvous_step ); + Kokkos::Impl:: + rendezvous_release( m_pool_scratch + m_pool_rendezvous ); } } @@ -213,7 +225,6 @@ public: , m_league_size(1) , m_work_chunk(0) , m_steal_rank(0) - , m_pool_rendezvous_step(0) , m_team_rendezvous_step(0) {} @@ -406,7 +417,7 @@ fflush(stdout); // Steal from next team, round robin // The next team is offset by m_team_alloc if it fits in the pool. - m_steal_rank = m_team_base + m_team_alloc + m_team_size <= m_pool_size ? + m_steal_rank = m_team_base + m_team_alloc + m_team_size <= m_pool_size ? 
m_team_base + m_team_alloc : 0 ; } diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp index 98482cfab6..608d514c79 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp @@ -50,51 +50,70 @@ namespace Kokkos { namespace Profiling { +static initFunction initProfileLibrary = nullptr; +static finalizeFunction finalizeProfileLibrary = nullptr; + +static beginFunction beginForCallee = nullptr; +static beginFunction beginScanCallee = nullptr; +static beginFunction beginReduceCallee = nullptr; +static endFunction endForCallee = nullptr; +static endFunction endScanCallee = nullptr; +static endFunction endReduceCallee = nullptr; + +static pushFunction pushRegionCallee = nullptr; +static popFunction popRegionCallee = nullptr; + +static allocateDataFunction allocateDataCallee = nullptr; +static deallocateDataFunction deallocateDataCallee = nullptr; + +static beginDeepCopyFunction beginDeepCopyCallee = nullptr; +static endDeepCopyFunction endDeepCopyCallee = nullptr; + SpaceHandle::SpaceHandle(const char* space_name) { strncpy(name,space_name,64); } bool profileLibraryLoaded() { - return (NULL != initProfileLibrary); + return (nullptr != initProfileLibrary); } void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) { - if(NULL != beginForCallee) { + if(nullptr != beginForCallee) { Kokkos::fence(); (*beginForCallee)(kernelPrefix.c_str(), devID, kernelID); } } void endParallelFor(const uint64_t kernelID) { - if(NULL != endForCallee) { + if(nullptr != endForCallee) { Kokkos::fence(); (*endForCallee)(kernelID); } } void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) { - if(NULL != beginScanCallee) { + if(nullptr != beginScanCallee) { Kokkos::fence(); (*beginScanCallee)(kernelPrefix.c_str(), devID, kernelID); } } void endParallelScan(const uint64_t kernelID) { - if(NULL != endScanCallee) { + if(nullptr != endScanCallee) { Kokkos::fence(); (*endScanCallee)(kernelID); } } void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) { - if(NULL != beginReduceCallee) { + if(nullptr != beginReduceCallee) { Kokkos::fence(); (*beginReduceCallee)(kernelPrefix.c_str(), devID, kernelID); } } void endParallelReduce(const uint64_t kernelID) { - if(NULL != endReduceCallee) { + if(nullptr != endReduceCallee) { Kokkos::fence(); (*endReduceCallee)(kernelID); } @@ -102,31 +121,47 @@ void endParallelReduce(const uint64_t kernelID) { void pushRegion(const std::string& kName) { - if( NULL != pushRegionCallee ) { + if( nullptr != pushRegionCallee ) { Kokkos::fence(); (*pushRegionCallee)(kName.c_str()); } } void popRegion() { - if( NULL != popRegionCallee ) { + if( nullptr != popRegionCallee ) { Kokkos::fence(); (*popRegionCallee)(); } } void allocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size) { - if(NULL != allocateDataCallee) { + if(nullptr != allocateDataCallee) { (*allocateDataCallee)(space,label.c_str(),ptr,size); } } void deallocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size) { - if(NULL != allocateDataCallee) { + if(nullptr != deallocateDataCallee) { (*deallocateDataCallee)(space,label.c_str(),ptr,size); } } +void beginDeepCopy(const SpaceHandle dst_space, const std::string dst_label, const void* dst_ptr, + const SpaceHandle src_space, const std::string 
src_label, const void* src_ptr, + const uint64_t size) { + if(nullptr != beginDeepCopyCallee) { + (*beginDeepCopyCallee)(dst_space, dst_label.c_str(), dst_ptr, + src_space, src_label.c_str(), src_ptr, + size); + } +} + +void endDeepCopy() { + if(nullptr != endDeepCopyCallee) { + (*endDeepCopyCallee)(); + } +} + void initialize() { // Make sure initialize calls happens only once @@ -140,7 +175,7 @@ void initialize() { // If we do not find a profiling library in the environment then exit // early. - if( NULL == envProfileLibrary ) { + if( nullptr == envProfileLibrary ) { return ; } @@ -149,10 +184,10 @@ void initialize() { char* profileLibraryName = strtok(envProfileCopy, ";"); - if( (NULL != profileLibraryName) && (strcmp(profileLibraryName, "") != 0) ) { + if( (nullptr != profileLibraryName) && (strcmp(profileLibraryName, "") != 0) ) { firstProfileLibrary = dlopen(profileLibraryName, RTLD_NOW | RTLD_GLOBAL); - if(NULL == firstProfileLibrary) { + if(nullptr == firstProfileLibrary) { std::cerr << "Error: Unable to load KokkosP library: " << profileLibraryName << std::endl; } else { @@ -191,14 +226,19 @@ void initialize() { auto p12 = dlsym(firstProfileLibrary, "kokkosp_deallocate_data"); deallocateDataCallee = *((deallocateDataFunction*) &p12); + auto p13 = dlsym(firstProfileLibrary, "kokkosp_begin_deep_copy"); + beginDeepCopyCallee = *((beginDeepCopyFunction*) &p13); + auto p14 = dlsym(firstProfileLibrary, "kokkosp_end_deep_copy"); + endDeepCopyCallee = *((endDeepCopyFunction*) &p14); + } } - if(NULL != initProfileLibrary) { + if(nullptr != initProfileLibrary) { (*initProfileLibrary)(0, (uint64_t) KOKKOSP_INTERFACE_VERSION, (uint32_t) 0, - NULL); + nullptr); } free(envProfileCopy); @@ -210,28 +250,30 @@ void finalize() { if(is_finalized) return; is_finalized = 1; - if(NULL != finalizeProfileLibrary) { + if(nullptr != finalizeProfileLibrary) { (*finalizeProfileLibrary)(); - // Set all profile hooks to NULL to prevent + // Set all profile hooks to nullptr to prevent // any additional calls. 
Once we are told to // finalize, we mean it - initProfileLibrary = NULL; - finalizeProfileLibrary = NULL; + initProfileLibrary = nullptr; + finalizeProfileLibrary = nullptr; - beginForCallee = NULL; - beginScanCallee = NULL; - beginReduceCallee = NULL; - endScanCallee = NULL; - endForCallee = NULL; - endReduceCallee = NULL; + beginForCallee = nullptr; + beginScanCallee = nullptr; + beginReduceCallee = nullptr; + endScanCallee = nullptr; + endForCallee = nullptr; + endReduceCallee = nullptr; - pushRegionCallee = NULL; - popRegionCallee = NULL; + pushRegionCallee = nullptr; + popRegionCallee = nullptr; - allocateDataCallee = NULL; - deallocateDataCallee = NULL; + allocateDataCallee = nullptr; + deallocateDataCallee = nullptr; + beginDeepCopyCallee = nullptr; + endDeepCopyCallee = nullptr; } } } diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp index f76e5dfa04..2c2e524d9d 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp @@ -81,23 +81,11 @@ typedef void (*popFunction)(); typedef void (*allocateDataFunction)(const SpaceHandle, const char*, const void*, const uint64_t); typedef void (*deallocateDataFunction)(const SpaceHandle, const char*, const void*, const uint64_t); - -static initFunction initProfileLibrary = NULL; -static finalizeFunction finalizeProfileLibrary = NULL; - -static beginFunction beginForCallee = NULL; -static beginFunction beginScanCallee = NULL; -static beginFunction beginReduceCallee = NULL; -static endFunction endForCallee = NULL; -static endFunction endScanCallee = NULL; -static endFunction endReduceCallee = NULL; - -static pushFunction pushRegionCallee = NULL; -static popFunction popRegionCallee = NULL; - -static allocateDataFunction allocateDataCallee = NULL; -static deallocateDataFunction deallocateDataCallee = NULL; - +typedef void (*beginDeepCopyFunction)( + SpaceHandle, const char*, const void*, + SpaceHandle, const char*, const void*, + uint64_t); +typedef void (*endDeepCopyFunction)(); bool profileLibraryLoaded(); @@ -114,35 +102,14 @@ void popRegion(); void allocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size); void deallocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size); +void beginDeepCopy(const SpaceHandle dst_space, const std::string dst_label, const void* dst_ptr, + const SpaceHandle src_space, const std::string src_label, const void* src_ptr, + const uint64_t size); +void endDeepCopy(); + void initialize(); void finalize(); -//Define finalize_fake inline to get rid of warnings for unused static variables -inline void finalize_fake() { - if(NULL != finalizeProfileLibrary) { - (*finalizeProfileLibrary)(); - - // Set all profile hooks to NULL to prevent - // any additional calls. 
Once we are told to - // finalize, we mean it - beginForCallee = NULL; - beginScanCallee = NULL; - beginReduceCallee = NULL; - endScanCallee = NULL; - endForCallee = NULL; - endReduceCallee = NULL; - - allocateDataCallee = NULL; - deallocateDataCallee = NULL; - - initProfileLibrary = NULL; - finalizeProfileLibrary = NULL; - pushRegionCallee = NULL; - popRegionCallee = NULL; - } -} - - } } diff --git a/lib/kokkos/core/src/impl/Kokkos_Rendezvous.cpp b/lib/kokkos/core/src/impl/Kokkos_Rendezvous.cpp new file mode 100644 index 0000000000..ac697fce4b --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Rendezvous.cpp @@ -0,0 +1,208 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include +#include +#include + +namespace Kokkos { namespace Impl { + +//---------------------------------------------------------------------------- +/* pattern for rendezvous + * + * if ( rendezvous() ) { + * ... all other threads are still in team_rendezvous() ... + * rendezvous_release(); + * ... all other threads are released from team_rendezvous() ... 
+ * } + */ + +int rendezvous( volatile int64_t * const buffer + , int const size + , int const rank + , int const slow + ) noexcept +{ + enum : int { shift_byte = 3 }; + enum : int { size_byte = ( 01 << shift_byte ) }; // == 8 + enum : int { mask_byte = size_byte - 1 }; + + enum : int { shift_mem_cycle = 2 }; + enum : int { size_mem_cycle = ( 01 << shift_mem_cycle ) }; // == 4 + enum : int { mask_mem_cycle = size_mem_cycle - 1 }; + + // Cycle step values: 1 <= step <= size_val_cycle + // An odd multiple of memory cycle so that when a memory location + // is reused it has a different value. + // Must be representable within a single byte: size_val_cycle < 16 + + enum : int { size_val_cycle = 3 * size_mem_cycle }; + + // Requires: + // Called by rank = [ 0 .. size ) + // buffer aligned to int64_t[4] + + // A sequence of rendezvous uses four cycled locations in memory + // and non-equal cycled synchronization values to + // 1) prevent rendezvous from overtaking one another and + // 2) give each spin wait location an int64_t[4] span + // so that it has its own cache line. + + const int64_t step = (buffer[0] % size_val_cycle ) + 1 ; + + // The leading int64_t[4] span is for thread 0 to write + // and all other threads to read spin-wait. + // sync_offset is the index into this array for this step. + + const int sync_offset = ( step & mask_mem_cycle ) + size_mem_cycle + size_mem_cycle ; + + if ( rank ) { + + const int group_begin = rank << shift_byte ; // == rank * size_byte + + if ( group_begin < size ) { + + // This thread waits for threads + // [ group_begin .. group_begin + 8 ) + // [ rank*8 .. rank*8 + 8 ) + // to write to their designated bytes. + + const int end = group_begin + size_byte < size + ? size_byte : size - group_begin ; + + int64_t value = 0; + for ( int i = 0 ; i < end ; ++i ) { + value |= step << (i * size_byte ); + } + + store_fence(); // This should not be needed but fixes #742 + + if ( slow ) { + yield_until_equal( buffer[ (rank << shift_mem_cycle) + sync_offset ] + , value ); + } + else { + spinwait_until_equal( buffer[ (rank << shift_mem_cycle) + sync_offset ] + , value ); + } + } + + { + // This thread sets its designated byte. + // ( rank % size_byte ) + + // ( ( rank / size_byte ) * size_byte * size_mem_cycle ) + + // ( sync_offset * size_byte ) + const int offset = ( rank & mask_byte ) + + ( ( rank & ~mask_byte ) << shift_mem_cycle ) + + ( sync_offset << shift_byte ); + + // All of this thread's previous memory stores must be complete before + // this thread stores the step value at this thread's designated byte + // in the shared synchronization array. + + Kokkos::memory_fence(); + + ((volatile int8_t*) buffer)[ offset ] = int8_t( step ); + + // Memory fence to push the previous store out + Kokkos::memory_fence(); + } + + // Wait for thread 0 to release all other threads + + if ( slow ) { + yield_until_equal( buffer[ (step & mask_mem_cycle) + size_mem_cycle ] , int64_t(step) ); + } + else { + spinwait_until_equal( buffer[ (step & mask_mem_cycle) + size_mem_cycle ] , int64_t(step) ); + } + } + else { + // Thread 0 waits for threads [1..7] + // to write to their designated bytes. + + const int end = size_byte < size ? 8 : size ; + + int64_t value = 0; + for ( int i = 1 ; i < end ; ++i ) { + value |= step << (i * size_byte ); + } + + if ( slow ) { + yield_until_equal( buffer[ sync_offset ], value ); + } + else { + spinwait_until_equal( buffer[ sync_offset ], value ); + } + } + + return rank ? 
0 : 1 ; +} + +void rendezvous_release( volatile int64_t * const buffer ) noexcept +{ + enum : int { shift_mem_cycle = 2 }; + enum : int { size_mem_cycle = ( 01 << shift_mem_cycle ) }; // == 4 + enum : int { mask_mem_cycle = size_mem_cycle - 1 }; + enum : int { size_val_cycle = 3 * size_mem_cycle }; + + // Requires: + // Called after team_rendezvous + // Called only by true == team_rendezvous(root) + + // update step + const int64_t step = (buffer[0] % size_val_cycle ) + 1; + buffer[0] = step; + + // Memory fence to be sure all previous writes are complete: + Kokkos::memory_fence(); + + buffer[ (step & mask_mem_cycle) + size_mem_cycle ] = step; + + // Memory fence to push the store out + Kokkos::memory_fence(); +} + +}} // namespace Kokkos::Impl + diff --git a/lib/kokkos/core/src/impl/Kokkos_Rendezvous.hpp b/lib/kokkos/core/src/impl/Kokkos_Rendezvous.hpp new file mode 100644 index 0000000000..57f8633bca --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Rendezvous.hpp @@ -0,0 +1,87 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_RENDEZVOUS_HPP +#define KOKKOS_IMPL_RENDEZVOUS_HPP + +#include + +namespace Kokkos { namespace Impl { + +inline +constexpr int rendezvous_buffer_size( int max_members ) noexcept +{ + return (((max_members + 7) / 8) * 4) + 4 + 4; +} + +/** \brief Thread pool rendezvous + * + * Rendezvous pattern: + * if ( rendezvous(root) ) { + * ... only root thread here while all others wait ... + * rendezvous_release(); + * } + * else { + * ... all other threads release here ... 
+ * } + * + * Requires: buffer[ rendezvous_buffer_size( max_threads ) ]; + * + * When slow != 0 the expectation is thread arrival will be + * slow so the threads that arrive early should quickly yield + * their core to the runtime thus possibly allowing the late + * arriving threads to have more resources + * (e.g., power and clock frequency). + */ +int rendezvous( volatile int64_t * const buffer + , int const size + , int const rank + , int const slow = 0 ) noexcept ; + +void rendezvous_release( volatile int64_t * const buffer ) noexcept ; + + +}} // namespace Kokkos::Impl + +#endif // KOKKOS_IMPL_RENDEZVOUS_HPP + diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial.cpp b/lib/kokkos/core/src/impl/Kokkos_Serial.cpp index 755271c07e..dfbeba461e 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Serial.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_Serial.cpp @@ -50,6 +50,7 @@ #include #include +#include /*--------------------------------------------------------------------------*/ @@ -123,7 +124,6 @@ void serial_resize_thread_team_data( size_t pool_reduce_bytes } } -// Get thread team data structure for omp_get_thread_num() HostThreadTeamData * serial_get_thread_team_data() { return & g_serial_thread_team_data ; @@ -151,6 +151,8 @@ void Serial::initialize( unsigned threads_count (void) use_cores_per_numa; (void) allow_asynchronous_threadpool; + Impl::SharedAllocationRecord< void, void >::tracking_enable(); + // Init the array of locks used for arbitrarily sized atomics Impl::init_lock_array_host_space(); #if defined(KOKKOS_ENABLE_PROFILING) diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial_Task.cpp b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.cpp index 76297161b1..0b6fbd9af0 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Serial_Task.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.cpp @@ -62,7 +62,7 @@ void TaskQueueSpecialization< Kokkos::Serial >::execute { using execution_space = Kokkos::Serial ; using queue_type = TaskQueue< execution_space > ; - using task_root_type = TaskBase< execution_space , void , void > ; + using task_root_type = TaskBase< void , void , void > ; using Member = Impl::HostThreadTeamMember< execution_space > ; task_root_type * const end = (task_root_type *) task_root_type::EndTag ; @@ -122,7 +122,7 @@ void TaskQueueSpecialization< Kokkos::Serial > :: { using execution_space = Kokkos::Serial ; using queue_type = TaskQueue< execution_space > ; - using task_root_type = TaskBase< execution_space , void , void > ; + using task_root_type = TaskBase< void , void , void > ; using Member = Impl::HostThreadTeamMember< execution_space > ; task_root_type * const end = (task_root_type *) task_root_type::EndTag ; diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp index 2eb2b5cf52..39deebbbf1 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp @@ -65,7 +65,7 @@ public: using execution_space = Kokkos::Serial ; using memory_space = Kokkos::HostSpace ; using queue_type = Kokkos::Impl::TaskQueue< execution_space > ; - using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ; + using task_base_type = Kokkos::Impl::TaskBase< void , void , void > ; using member_type = Kokkos::Impl::HostThreadTeamMember< execution_space > ; static diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial_WorkGraphPolicy.hpp b/lib/kokkos/core/src/impl/Kokkos_Serial_WorkGraphPolicy.hpp new file mode 100644 index 0000000000..dc30ffe9e0 --- /dev/null +++ 
b/lib/kokkos/core/src/impl/Kokkos_Serial_WorkGraphPolicy.hpp
@@ -0,0 +1,102 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+// Kokkos v. 2.0
+// Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_SERIAL_WORKGRAPHPOLICY_HPP
+#define KOKKOS_SERIAL_WORKGRAPHPOLICY_HPP
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class ... Traits >
+class ParallelFor< FunctorType ,
+                   Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
+                   Kokkos::Serial
+                 >
+  : public Kokkos::Impl::Experimental::
+           WorkGraphExec< FunctorType,
+                          Kokkos::Serial,
+                          Traits ...
+                        >
+{
+private:
+
+  typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
+  typedef Kokkos::Impl::Experimental::
+          WorkGraphExec< FunctorType, Kokkos::Serial, Traits ... > Base ;
+
+  template< class TagType >
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_one(const typename Policy::member_type& i) const {
+    Base::m_functor( i );
+  }
+
+  template< class TagType >
+  typename std::enable_if< !
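+  // (SFINAE: this overload is chosen only for a non-void work tag)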
std::is_same< TagType , void >::value >::type
+  exec_one(const typename Policy::member_type& i) const {
+    const TagType t{} ;
+    Base::m_functor( t , i );
+  }
+
+public:
+
+  inline
+  void execute()
+  {
+    for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
+      exec_one< typename Policy::work_tag >( i );
+      Base::after_work(i);
+    }
+  }
+
+  inline
+  ParallelFor( const FunctorType & arg_functor
+             , const Policy & arg_policy )
+    : Base( arg_functor, arg_policy )
+  {
+  }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif /* #define KOKKOS_SERIAL_WORKGRAPHPOLICY_HPP */
diff --git a/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp b/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp
index e28c1194a7..af79523e0c 100644
--- a/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp
@@ -46,23 +46,23 @@
 namespace Kokkos {
 namespace Impl {
 
-int SharedAllocationRecord< void , void >::s_tracking_enabled = 1 ;
+namespace {
 
-void SharedAllocationRecord< void , void >::tracking_claim_and_disable()
-{
-  // A host thread claim and disable tracking flag
+__thread int t_tracking_enabled = 1;
 
-  while ( ! Kokkos::atomic_compare_exchange_strong( & s_tracking_enabled, 1, 0 ) );
 }
 
-void SharedAllocationRecord< void , void >::tracking_release_and_enable()
-{
-  // The host thread that claimed and disabled the tracking flag
-  // now release and enable tracking.
+int SharedAllocationRecord< void , void >::tracking_enabled()
+{ return t_tracking_enabled; }
 
-  if ( ! Kokkos::atomic_compare_exchange_strong( & s_tracking_enabled, 0, 1 ) ){
-    Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::SharedAllocationRecord<>::tracking_release_and_enable FAILED, this host process thread did not hold the lock" );
-  }
+void SharedAllocationRecord< void , void >::tracking_disable()
+{
+  t_tracking_enabled = 0;
+}
+
+void SharedAllocationRecord< void , void >::tracking_enable()
+{
+  t_tracking_enabled = 1;
 }
 
 //----------------------------------------------------------------------------
diff --git a/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp b/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp
index 4dc61bb02e..2e3cc1a163 100644
--- a/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp
@@ -71,6 +71,9 @@ public:
   KOKKOS_INLINE_FUNCTION static
   const SharedAllocationHeader * get_header( void * alloc_ptr )
     { return reinterpret_cast< SharedAllocationHeader * >( reinterpret_cast< char * >(alloc_ptr) - sizeof(SharedAllocationHeader) ); }
+
+  KOKKOS_INLINE_FUNCTION
+  const char* label() const { return m_label; }
 };
 
 template<>
@@ -83,8 +86,6 @@ protected:
 
   typedef void (* function_type )( SharedAllocationRecord * );
 
-  static int s_tracking_enabled ;
-
   SharedAllocationHeader * const m_alloc_ptr ;
   size_t const                   m_alloc_size ;
   function_type const            m_dealloc ;
@@ -110,17 +111,17 @@ public:
   inline std::string get_label() const { return std::string("Unmanaged"); }
 
-  static int tracking_enabled() { return s_tracking_enabled ; }
+  static int tracking_enabled();
 
   /**\brief A host process thread claims and disables the
    *        shared allocation tracking flag.
    */
-  static void tracking_claim_and_disable();
+  static void tracking_disable();
 
   /**\brief A host process thread releases and enables the
    *        shared allocation tracking flag.
*/ - static void tracking_release_and_enable(); + static void tracking_enable(); ~SharedAllocationRecord() = default ; @@ -317,6 +318,11 @@ public: #endif } + KOKKOS_INLINE_FUNCTION + bool has_record() const { + return (m_record_bits & (~DO_NOT_DEREF_FLAG)) != 0; + } + KOKKOS_FORCEINLINE_FUNCTION ~SharedAllocationTracker() { KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT } diff --git a/lib/kokkos/core/src/impl/Kokkos_Spinwait.cpp b/lib/kokkos/core/src/impl/Kokkos_Spinwait.cpp new file mode 100644 index 0000000000..3d3f83ed85 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Spinwait.cpp @@ -0,0 +1,210 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+
+#include <impl/Kokkos_Spinwait.hpp>
+#include <impl/Kokkos_BitOps.hpp>
+#include <Kokkos_Atomic.hpp>
+
+#if defined( KOKKOS_ENABLE_STDTHREAD )
+  #include <thread>
+#elif !defined( _WIN32 )
+  #include <sched.h>
+  #include <time.h>
+#else
+  #include <process.h>
+  #include <winsock2.h>
+  #include <windows.h>
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+void host_thread_yield( const uint32_t i , const int force_yield )
+{
+  static constexpr uint32_t sleep_limit = 1 << 13 ;
+  static constexpr uint32_t yield_limit = 1 << 12 ;
+
+  const int c = Kokkos::Impl::bit_scan_reverse(i);
+
+  if ( sleep_limit < i ) {
+
+    // Attempt to put the thread to sleep for 'c' microseconds
+    // ( Sleep(c) on Windows sleeps for 'c' milliseconds )
+
+    #if defined( KOKKOS_ENABLE_STDTHREAD )
+      std::this_thread::sleep_for( std::chrono::nanoseconds( c * 1000 ) );
+    #elif !defined( _WIN32 )
+      timespec req ;
+      req.tv_sec  = 0 ;
+      req.tv_nsec = 1000 * c ;
+      nanosleep( &req, nullptr );
+    #else /* defined( _WIN32 ) IS Microsoft Windows */
+      Sleep(c);
+    #endif
+  }
+
+  else if ( force_yield || yield_limit < i ) {
+
+    // Attempt to yield thread resources to runtime
+
+    #if defined( KOKKOS_ENABLE_STDTHREAD )
+      std::this_thread::yield();
+    #elif !defined( _WIN32 )
+      sched_yield();
+    #else /* defined( _WIN32 ) IS Microsoft Windows */
+      YieldProcessor();
+    #endif
+  }
+
+  #if defined( KOKKOS_ENABLE_ASM )
+
+  else if ( (1u<<4) < i ) {
+
+    // Insert a few no-ops to quiet the thread:
+
+    for ( int k = 0 ; k < c ; ++k ) {
+      #if defined( __amd64 )  || defined( __amd64__ ) || \
+          defined( __x86_64 ) || defined( __x86_64__ )
+        #if !defined( _WIN32 ) /* IS NOT Microsoft Windows */
+          asm volatile( "nop\n" );
+        #else
+          __asm__ __volatile__( "nop\n" );
+        #endif
+      #elif defined(__PPC64__)
+        asm volatile( "nop\n" );
+      #endif
+    }
+  }
+
+  {
+    // Insert memory pause
+    #if defined( __amd64 )  || defined( __amd64__ ) || \
+        defined( __x86_64 ) || defined( __x86_64__ )
+      #if !defined( _WIN32 ) /* IS NOT Microsoft Windows */
+        asm volatile( "pause\n":::"memory" );
+      #else
+        __asm__ __volatile__( "pause\n":::"memory" );
+      #endif
+    #elif defined(__PPC64__)
+      asm volatile( "or 27, 27, 27" ::: "memory" );
+    #endif
+  }
+
+  #endif /* defined( KOKKOS_ENABLE_ASM ) */
+}
+
+}}} // namespace Kokkos::Impl::{anonymous}
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+void spinwait_while_equal( volatile int32_t & flag , const int32_t value )
+{
+  Kokkos::store_fence();
+  uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,0);
+  Kokkos::load_fence();
+}
+
+void spinwait_until_equal( volatile int32_t & flag , const int32_t value )
+{
+  Kokkos::store_fence();
+  uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,0);
+  Kokkos::load_fence();
+}
+
+void spinwait_while_equal( volatile int64_t & flag , const int64_t value )
+{
+  Kokkos::store_fence();
+  uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,0);
+  Kokkos::load_fence();
+}
+
+void spinwait_until_equal( volatile int64_t & flag , const int64_t value )
+{
+  Kokkos::store_fence();
+  uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,0);
+  Kokkos::load_fence();
+}
+
+void yield_while_equal( volatile int32_t & flag , const int32_t value )
+{
+  Kokkos::store_fence();
+  uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,1);
+  Kokkos::load_fence();
+}
+
+void yield_until_equal( volatile
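+// (the yield_* variants pass force_yield = 1 so a waiting thread
+//  always surrenders its core to the runtime rather than spinning)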
int32_t & flag , const int32_t value ) +{ + Kokkos::store_fence(); + uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,1); + Kokkos::load_fence(); +} + +void yield_while_equal( volatile int64_t & flag , const int64_t value ) +{ + Kokkos::store_fence(); + uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,1); + Kokkos::load_fence(); +} + +void yield_until_equal( volatile int64_t & flag , const int64_t value ) +{ + Kokkos::store_fence(); + uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,1); + Kokkos::load_fence(); +} + +} /* namespace Impl */ +} /* namespace Kokkos */ + +#else +void KOKKOS_CORE_SRC_IMPL_SPINWAIT_PREVENT_LINK_ERROR() {} +#endif + diff --git a/lib/kokkos/core/src/impl/Kokkos_spinwait.hpp b/lib/kokkos/core/src/impl/Kokkos_Spinwait.hpp similarity index 82% rename from lib/kokkos/core/src/impl/Kokkos_spinwait.hpp rename to lib/kokkos/core/src/impl/Kokkos_Spinwait.hpp index 6e34b8a943..b49e308566 100644 --- a/lib/kokkos/core/src/impl/Kokkos_spinwait.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Spinwait.hpp @@ -59,6 +59,13 @@ void spinwait_until_equal( volatile int32_t & flag , const int32_t value ); void spinwait_while_equal( volatile int64_t & flag , const int64_t value ); void spinwait_until_equal( volatile int64_t & flag , const int64_t value ); + +void yield_while_equal( volatile int32_t & flag , const int32_t value ); +void yield_until_equal( volatile int32_t & flag , const int32_t value ); + +void yield_while_equal( volatile int64_t & flag , const int64_t value ); +void yield_until_equal( volatile int64_t & flag , const int64_t value ); + #else KOKKOS_INLINE_FUNCTION @@ -71,6 +78,16 @@ void spinwait_while_equal( volatile int64_t & , const int64_t ) {} KOKKOS_INLINE_FUNCTION void spinwait_until_equal( volatile int64_t & , const int64_t ) {} +KOKKOS_INLINE_FUNCTION +void yield_while_equal( volatile int32_t & , const int32_t ) {} +KOKKOS_INLINE_FUNCTION +void yield_until_equal( volatile int32_t & , const int32_t ) {} + +KOKKOS_INLINE_FUNCTION +void yield_while_equal( volatile int64_t & , const int64_t ) {} +KOKKOS_INLINE_FUNCTION +void yield_until_equal( volatile int64_t & , const int64_t ) {} + #endif } /* namespace Impl */ diff --git a/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp b/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp index bee98e6745..5f8699302d 100644 --- a/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp @@ -59,24 +59,15 @@ namespace Kokkos { namespace Impl { -/*\brief Implementation data for task data management, access, and execution. - * - * Curiously recurring template pattern (CRTP) - * to allow static_cast from the - * task root type and a task's FunctorType. - * - * TaskBase< Space , ResultType , FunctorType > - * : TaskBase< Space , ResultType , void > - * , FunctorType - * { ... }; - * - * TaskBase< Space , ResultType , void > - * : TaskBase< Space , void , void > - * { ... }; - */ -template< typename Space , typename ResultType , typename FunctorType > +template< class Space , typename ResultType , class FunctorType > class TaskBase ; +template< typename Space > +class TaskQueue ; + +template< typename Space > +class TaskQueueSpecialization ; + } /* namespace Impl */ } /* namespace Kokkos */ @@ -86,8 +77,217 @@ class TaskBase ; namespace Kokkos { namespace Impl { -template< typename Space > -class TaskQueueSpecialization ; +/** \brief Base class for task management, access, and execution. 
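+ * (This fully-specialized root type gives queues, futures, and schedulers
+ *  one pointer type through which any concrete task can be manipulated.)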
+ *
+ * Inheritance structure to allow static_cast from the task root type
+ * and a task's FunctorType.
+ *
+ *   // Enable a functor to access the base class
+ *   // and provide memory for result value.
+ *   TaskBase< Space , ResultType , FunctorType >
+ *     : TaskBase< void , void , void >
+ *     , FunctorType
+ *   { ... };
+ *   Followed by memory allocated for result value.
+ *
+ *
+ *  States of a task:
+ *
+ *    Constructing State, NOT IN a linked list
+ *      m_wait == 0
+ *      m_next == 0
+ *
+ *    Scheduling transition : Constructing -> Waiting
+ *      before:
+ *        m_wait == 0
+ *        m_next == this task's initial dependence, 0 if none
+ *      after:
+ *        m_wait == EndTag
+ *        m_next == EndTag
+ *
+ *    Waiting State, IN a linked list
+ *      m_apply != 0
+ *      m_queue != 0
+ *      m_ref_count > 0
+ *      m_wait == head of linked list of tasks waiting on this task
+ *      m_next == next of linked list of tasks
+ *
+ *    transition : Waiting -> Executing
+ *      before:
+ *        m_next == EndTag
+ *      after:
+ *        m_next == LockTag
+ *
+ *    Executing State, NOT IN a linked list
+ *      m_apply != 0
+ *      m_queue != 0
+ *      m_ref_count > 0
+ *      m_wait == head of linked list of tasks waiting on this task
+ *      m_next == LockTag
+ *
+ *    Respawn transition : Executing -> Executing-Respawn
+ *      before:
+ *        m_next == LockTag
+ *      after:
+ *        m_next == this task's updated dependence, 0 if none
+ *
+ *    Executing-Respawn State, NOT IN a linked list
+ *      m_apply != 0
+ *      m_queue != 0
+ *      m_ref_count > 0
+ *      m_wait == head of linked list of tasks waiting on this task
+ *      m_next == this task's updated dependence, 0 if none
+ *
+ *    transition : Executing -> Complete
+ *      before:
+ *        m_wait == head of linked list
+ *      after:
+ *        m_wait == LockTag
+ *
+ *    Complete State, NOT IN a linked list
+ *      m_wait == LockTag: cannot add dependence (<=> complete)
+ *      m_next == LockTag: not a member of a wait queue
+ *
+ */
+template<>
+class TaskBase< void , void , void >
+{
+public:
+
+  enum : int16_t   { TaskTeam = 0 , TaskSingle = 1 , Aggregate = 2 };
+  enum : uintptr_t { LockTag = ~uintptr_t(0) , EndTag = ~uintptr_t(1) };
+
+  template< typename > friend class Kokkos::TaskScheduler ;
+
+  typedef TaskQueue< void > queue_type ;
+
+  typedef void (* function_type) ( TaskBase * , void * );
+
+  // sizeof(TaskBase) == 48
+
+  function_type  m_apply ;      ///< Apply function pointer
+  queue_type   * m_queue ;      ///< Pointer to queue
+  TaskBase     * m_wait ;       ///< Linked list of tasks waiting on this
+  TaskBase     * m_next ;       ///< Waiting linked-list next
+  int32_t        m_ref_count ;  ///< Reference count
+  int32_t        m_alloc_size ; ///< Allocation size
+  int32_t        m_dep_count ;  ///< Aggregate's number of dependences
+  int16_t        m_task_type ;  ///< Type of task
+  int16_t        m_priority ;   ///< Priority of runnable task
+
+  TaskBase( TaskBase && ) = delete ;
+  TaskBase( const TaskBase & ) = delete ;
+  TaskBase & operator = ( TaskBase && ) = delete ;
+  TaskBase & operator = ( const TaskBase & ) = delete ;
+
+  KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
+
+  KOKKOS_INLINE_FUNCTION constexpr
+  TaskBase()
+    : m_apply( 0 )
+    , m_queue( 0 )
+    , m_wait( 0 )
+    , m_next( 0 )
+    , m_ref_count( 0 )
+    , m_alloc_size( 0 )
+    , m_dep_count( 0 )
+    , m_task_type( 0 )
+    , m_priority( 0 )
+    {}
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  TaskBase * volatile * aggregate_dependences() volatile
+    { return reinterpret_cast< TaskBase * volatile * >( this + 1 ); }
+
+  KOKKOS_INLINE_FUNCTION
+  bool requested_respawn()
+  {
+    // This should only be called when a task has finished executing and is
+    // in the transition to either the complete or executing-respawn
state. + TaskBase * const lock = reinterpret_cast< TaskBase * >( LockTag ); + return lock != m_next; + } + + KOKKOS_INLINE_FUNCTION + void add_dependence( TaskBase* dep ) + { + // Precondition: lock == m_next + + TaskBase * const lock = (TaskBase *) LockTag ; + + // Assign dependence to m_next. It will be processed in the subsequent + // call to schedule. Error if the dependence is reset. + if ( lock != Kokkos::atomic_exchange( & m_next, dep ) ) { + Kokkos::abort("TaskScheduler ERROR: resetting task dependence"); + } + + if ( 0 != dep ) { + // The future may be destroyed upon returning from this call + // so increment reference count to track this assignment. + Kokkos::atomic_increment( &(dep->m_ref_count) ); + } + } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + int32_t reference_count() const + { return *((int32_t volatile *)( & m_ref_count )); } + +}; + +static_assert( sizeof(TaskBase) == 48 + , "Verifying expected sizeof(TaskBase)" ); + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +template< typename ResultType > +struct TaskResult { + + enum : int32_t { size = sizeof(ResultType) }; + + using reference_type = ResultType & ; + + KOKKOS_INLINE_FUNCTION static + ResultType * ptr( TaskBase * task ) + { + return reinterpret_cast< ResultType * > + ( reinterpret_cast< char * >(task) + task->m_alloc_size - sizeof(ResultType) ); + } + + KOKKOS_INLINE_FUNCTION static + reference_type get( TaskBase * task ) + { return *ptr( task ); } +}; + +template<> +struct TaskResult< void > { + + enum : int32_t { size = 0 }; + + using reference_type = void ; + + KOKKOS_INLINE_FUNCTION static + void * ptr( TaskBase * ) { return (void*) 0 ; } + + KOKKOS_INLINE_FUNCTION static + reference_type get( TaskBase * ) {} +}; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template<> +class TaskQueue< void > {}; /** \brief Manage task allocation, deallocation, and scheduling. * @@ -95,7 +295,7 @@ class TaskQueueSpecialization ; * All other aspects of task management have shared implementation. 
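 * A spawned task is stored as [ task object | result value ], with each
 * piece rounded up to a 16-byte boundary; see spawn_allocation_size().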
 */
 template< typename ExecSpace >
-class TaskQueue {
+class TaskQueue : public TaskQueue< void > {
 private:
 
   friend class TaskQueueSpecialization< ExecSpace > ;
@@ -106,7 +306,7 @@ private:
   using memory_space   = typename specialization::memory_space ;
   using device_type    = Kokkos::Device< execution_space , memory_space > ;
   using memory_pool    = Kokkos::MemoryPool< device_type > ;
-  using task_root_type = Kokkos::Impl::TaskBase< execution_space , void , void > ;
+  using task_root_type = Kokkos::Impl::TaskBase< void , void , void > ;
 
   struct Destroy {
     TaskQueue * m_queue ;
@@ -198,12 +398,10 @@ public:
     }
 
   // Assign task pointer with reference counting of assigned tasks
-  template< typename LV , typename RV >
   KOKKOS_FUNCTION static
-  void assign( TaskBase< execution_space,LV,void> ** const lhs
-             , TaskBase< execution_space,RV,void> *  const rhs )
+  void assign( task_root_type ** const lhs
+             , task_root_type *  const rhs )
     {
-      using task_lhs = TaskBase< execution_space,LV,void> ;
 #if 0
   {
     printf( "assign( 0x%lx { 0x%lx %d %d } , 0x%lx { 0x%lx %d %d } )\n"
@@ -225,7 +423,7 @@ public:
 
       // Force write of *lhs
 
-      *static_cast< task_lhs * volatile * >(lhs) = rhs ;
+      *static_cast< task_root_type * volatile * >(lhs) = rhs ;
 
       Kokkos::memory_fence();
     }
@@ -238,6 +436,38 @@ public:
 
   KOKKOS_FUNCTION
   void deallocate( void * p , size_t n ); ///< Deallocate to the memory pool
+
+
+  //----------------------------------------
+  /**\brief  Allocation size for a spawned task */
+
+  template< typename FunctorType >
+  KOKKOS_FUNCTION
+  size_t spawn_allocation_size() const
+    {
+      using value_type = typename FunctorType::value_type ;
+
+      using task_type = Impl::TaskBase< execution_space
+                                      , value_type
+                                      , FunctorType > ;
+
+      enum : size_t { align = ( 1 << 4 ) , align_mask = align - 1 };
+      enum : size_t { task_size   = sizeof(task_type) };
+      enum : size_t { result_size = Impl::TaskResult< value_type >::size };
+      enum : size_t { alloc_size =
+        ( ( task_size   + align_mask ) & ~align_mask ) +
+        ( ( result_size + align_mask ) & ~align_mask ) };
+
+      return m_memory.allocate_block_size( alloc_size );
+    }
+
+  /**\brief  Allocation size for a when_all aggregate */
+
+  KOKKOS_FUNCTION
+  size_t when_all_allocation_size( int narg ) const
+    {
+      return m_memory.allocate_block_size( sizeof(task_root_type) + narg * sizeof(task_root_type*) );
+    }
 };
 
 } /* namespace Impl */
@@ -249,261 +479,9 @@ public:
 namespace Kokkos {
 namespace Impl {
 
-template<>
-class TaskBase< void , void , void > {
-public:
-  enum : int16_t   { TaskTeam = 0 , TaskSingle = 1 , Aggregate = 2 };
-  enum : uintptr_t { LockTag = ~uintptr_t(0) , EndTag = ~uintptr_t(1) };
-};
-
-/** \brief Base class for task management, access, and execution.
- *
- * Inheritance structure to allow static_cast from the task root type
- * and a task's FunctorType.
- *
- *   // Enable a Future to access result data
- *   TaskBase< Space , ResultType , void >
- *     : TaskBase< void , void , void >
- *   { ... };
- *
- *   // Enable a functor to access the base class
- *   TaskBase< Space , ResultType , FunctorType >
- *     : TaskBase< Space , ResultType , void >
- *     , FunctorType
- *   { ...
}; - * - * - * States of a task: - * - * Constructing State, NOT IN a linked list - * m_wait == 0 - * m_next == 0 - * - * Scheduling transition : Constructing -> Waiting - * before: - * m_wait == 0 - * m_next == this task's initial dependence, 0 if none - * after: - * m_wait == EndTag - * m_next == EndTag - * - * Waiting State, IN a linked list - * m_apply != 0 - * m_queue != 0 - * m_ref_count > 0 - * m_wait == head of linked list of tasks waiting on this task - * m_next == next of linked list of tasks - * - * transition : Waiting -> Executing - * before: - * m_next == EndTag - * after:: - * m_next == LockTag - * - * Executing State, NOT IN a linked list - * m_apply != 0 - * m_queue != 0 - * m_ref_count > 0 - * m_wait == head of linked list of tasks waiting on this task - * m_next == LockTag - * - * Respawn transition : Executing -> Executing-Respawn - * before: - * m_next == LockTag - * after: - * m_next == this task's updated dependence, 0 if none - * - * Executing-Respawn State, NOT IN a linked list - * m_apply != 0 - * m_queue != 0 - * m_ref_count > 0 - * m_wait == head of linked list of tasks waiting on this task - * m_next == this task's updated dependence, 0 if none - * - * transition : Executing -> Complete - * before: - * m_wait == head of linked list - * after: - * m_wait == LockTag - * - * Complete State, NOT IN a linked list - * m_wait == LockTag: cannot add dependence - * m_next == LockTag: not a member of a wait queue - * - */ -template< typename ExecSpace > -class TaskBase< ExecSpace , void , void > -{ -public: - - enum : int16_t { TaskTeam = TaskBase::TaskTeam - , TaskSingle = TaskBase::TaskSingle - , Aggregate = TaskBase::Aggregate }; - - enum : uintptr_t { LockTag = TaskBase::LockTag - , EndTag = TaskBase::EndTag }; - - using execution_space = ExecSpace ; - using queue_type = TaskQueue< execution_space > ; - - template< typename > friend class Kokkos::TaskScheduler ; - - typedef void (* function_type) ( TaskBase * , void * ); - - // sizeof(TaskBase) == 48 - - function_type m_apply ; ///< Apply function pointer - queue_type * m_queue ; ///< Queue in which this task resides - TaskBase * m_wait ; ///< Linked list of tasks waiting on this - TaskBase * m_next ; ///< Waiting linked-list next - int32_t m_ref_count ; ///< Reference count - int32_t m_alloc_size ; ///< Allocation size - int32_t m_dep_count ; ///< Aggregate's number of dependences - int16_t m_task_type ; ///< Type of task - int16_t m_priority ; ///< Priority of runnable task - - TaskBase() = delete ; - TaskBase( TaskBase && ) = delete ; - TaskBase( const TaskBase & ) = delete ; - TaskBase & operator = ( TaskBase && ) = delete ; - TaskBase & operator = ( const TaskBase & ) = delete ; - - KOKKOS_INLINE_FUNCTION ~TaskBase() = default ; - - // Constructor for a runnable task - KOKKOS_INLINE_FUNCTION - constexpr TaskBase( function_type arg_apply - , queue_type * arg_queue - , TaskBase * arg_dependence - , int arg_ref_count - , int arg_alloc_size - , int arg_task_type - , int arg_priority - ) noexcept - : m_apply( arg_apply ) - , m_queue( arg_queue ) - , m_wait( 0 ) - , m_next( arg_dependence ) - , m_ref_count( arg_ref_count ) - , m_alloc_size( arg_alloc_size ) - , m_dep_count( 0 ) - , m_task_type( arg_task_type ) - , m_priority( arg_priority ) - {} - - // Constructor for an aggregate task - KOKKOS_INLINE_FUNCTION - constexpr TaskBase( queue_type * arg_queue - , int arg_ref_count - , int arg_alloc_size - , int arg_dep_count - ) noexcept - : m_apply( 0 ) - , m_queue( arg_queue ) - , m_wait( 0 ) - , m_next( 0 ) - , 
m_ref_count( arg_ref_count ) - , m_alloc_size( arg_alloc_size ) - , m_dep_count( arg_dep_count ) - , m_task_type( Aggregate ) - , m_priority( 0 ) - {} - - //---------------------------------------- - - KOKKOS_INLINE_FUNCTION - TaskBase ** aggregate_dependences() - { return reinterpret_cast( this + 1 ); } - - KOKKOS_INLINE_FUNCTION - bool requested_respawn() - { - // This should only be called when a task has finished executing and is - // in the transition to either the complete or executing-respawn state. - TaskBase * const lock = reinterpret_cast< TaskBase * >( LockTag ); - return lock != m_next; - } - - KOKKOS_INLINE_FUNCTION - void add_dependence( TaskBase* dep ) - { - // Precondition: lock == m_next - - TaskBase * const lock = (TaskBase *) LockTag ; - - // Assign dependence to m_next. It will be processed in the subsequent - // call to schedule. Error if the dependence is reset. - if ( lock != Kokkos::atomic_exchange( & m_next, dep ) ) { - Kokkos::abort("TaskScheduler ERROR: resetting task dependence"); - } - - if ( 0 != dep ) { - // The future may be destroyed upon returning from this call - // so increment reference count to track this assignment. - Kokkos::atomic_increment( &(dep->m_ref_count) ); - } - } - - using get_return_type = void ; - - KOKKOS_INLINE_FUNCTION - get_return_type get() const {} -}; - -template < typename ExecSpace , typename ResultType > -class TaskBase< ExecSpace , ResultType , void > - : public TaskBase< ExecSpace , void , void > -{ -private: - - using root_type = TaskBase ; - using function_type = typename root_type::function_type ; - using queue_type = typename root_type::queue_type ; - - static_assert( sizeof(root_type) == 48 , "" ); - - TaskBase() = delete ; - TaskBase( TaskBase && ) = delete ; - TaskBase( const TaskBase & ) = delete ; - TaskBase & operator = ( TaskBase && ) = delete ; - TaskBase & operator = ( const TaskBase & ) = delete ; - -public: - - ResultType m_result ; - - KOKKOS_INLINE_FUNCTION ~TaskBase() = default ; - - // Constructor for runnable task - KOKKOS_INLINE_FUNCTION - constexpr TaskBase( function_type arg_apply - , queue_type * arg_queue - , root_type * arg_dependence - , int arg_ref_count - , int arg_alloc_size - , int arg_task_type - , int arg_priority - ) - : root_type( arg_apply - , arg_queue - , arg_dependence - , arg_ref_count - , arg_alloc_size - , arg_task_type - , arg_priority - ) - , m_result() - {} - - using get_return_type = ResultType const & ; - - KOKKOS_INLINE_FUNCTION - get_return_type get() const { return m_result ; } -}; - -template< typename ExecSpace , typename ResultType , typename FunctorType > +template< class ExecSpace , typename ResultType , class FunctorType > class TaskBase - : public TaskBase< ExecSpace , ResultType , void > + : public TaskBase< void , void , void > , public FunctorType { private: @@ -516,50 +494,31 @@ private: public: - using root_type = TaskBase< ExecSpace , void , void > ; - using base_type = TaskBase< ExecSpace , ResultType , void > ; - using specialization = TaskQueueSpecialization< ExecSpace > ; - using function_type = typename root_type::function_type ; - using queue_type = typename root_type::queue_type ; - using member_type = typename specialization::member_type ; + using root_type = TaskBase< void , void , void > ; using functor_type = FunctorType ; using result_type = ResultType ; - template< typename Type > - KOKKOS_INLINE_FUNCTION static - void apply_functor - ( Type * const task - , typename std::enable_if - < std::is_same< typename Type::result_type , void >::value - , 
member_type * const - >::type member - ) - { - using fType = typename Type::functor_type ; - static_cast(task)->operator()( *member ); - } + using specialization = TaskQueueSpecialization< ExecSpace > ; + using member_type = typename specialization::member_type ; - template< typename Type > - KOKKOS_INLINE_FUNCTION static - void apply_functor - ( Type * const task - , typename std::enable_if - < ! std::is_same< typename Type::result_type , void >::value - , member_type * const - >::type member - ) - { - using fType = typename Type::functor_type ; - static_cast(task)->operator()( *member , task->m_result ); - } + KOKKOS_INLINE_FUNCTION + void apply_functor( member_type * const member , void * ) + { functor_type::operator()( *member ); } + + template< typename T > + KOKKOS_INLINE_FUNCTION + void apply_functor( member_type * const member + , T * const result ) + { functor_type::operator()( *member , *result ); } KOKKOS_FUNCTION static void apply( root_type * root , void * exec ) { TaskBase * const task = static_cast< TaskBase * >( root ); member_type * const member = reinterpret_cast< member_type * >( exec ); + result_type * const result = TaskResult< result_type >::ptr( task ); - TaskBase::template apply_functor( task , member ); + task->apply_functor( member , result ); // Task may be serial or team. // If team then must synchronize before querying if respawn was requested. @@ -576,26 +535,9 @@ public: } // Constructor for runnable task - KOKKOS_INLINE_FUNCTION - constexpr TaskBase( function_type arg_apply - , queue_type * arg_queue - , root_type * arg_dependence - , int arg_ref_count - , int arg_alloc_size - , int arg_task_type - , int arg_priority - , FunctorType && arg_functor - ) - : base_type( arg_apply - , arg_queue - , arg_dependence - , arg_ref_count - , arg_alloc_size - , arg_task_type - , arg_priority - ) - , functor_type( arg_functor ) - {} + KOKKOS_INLINE_FUNCTION constexpr + TaskBase( FunctorType && arg_functor ) + : root_type() , functor_type( std::move(arg_functor) ) {} KOKKOS_INLINE_FUNCTION ~TaskBase() {} diff --git a/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp b/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp index aee381afad..1974f7e1ca 100644 --- a/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp @@ -44,6 +44,8 @@ #include #if defined( KOKKOS_ENABLE_TASKDAG ) +#define KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING 0 + namespace Kokkos { namespace Impl { @@ -100,9 +102,11 @@ KOKKOS_FUNCTION void TaskQueue< ExecSpace >::decrement ( TaskQueue< ExecSpace >::task_root_type * task ) { - const int count = Kokkos::atomic_fetch_add(&(task->m_ref_count),-1); + task_root_type volatile & t = *task ; -#if 0 + const int count = Kokkos::atomic_fetch_add(&(t.m_ref_count),-1); + +#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING if ( 1 == count ) { printf( "decrement-destroy( 0x%lx { 0x%lx %d %d } )\n" , uintptr_t( task ) @@ -114,9 +118,13 @@ void TaskQueue< ExecSpace >::decrement #endif if ( ( 1 == count ) && - ( task->m_next == (task_root_type *) task_root_type::LockTag ) ) { + ( t.m_next == (task_root_type *) task_root_type::LockTag ) ) { // Reference count is zero and task is complete, deallocate. 
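+    // (reads and the final store go through the volatile reference so the
+    //  compiler re-reads task state after the atomic reference-count update)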
- task->m_queue->deallocate( task , task->m_alloc_size ); + + TaskQueue< ExecSpace > * const queue = + static_cast< TaskQueue< ExecSpace > * >( t.m_queue ); + + queue->deallocate( task , t.m_alloc_size ); } else if ( count <= 1 ) { Kokkos::abort("TaskScheduler task has negative reference count or is incomplete" ); @@ -171,7 +179,7 @@ bool TaskQueue< ExecSpace >::push_task // Fail the push attempt if the queue is locked; // otherwise retry until the push succeeds. -#if 0 +#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING printf( "push_task( 0x%lx { 0x%lx } 0x%lx { 0x%lx 0x%lx %d %d %d } )\n" , uintptr_t(queue) , uintptr_t(*queue) @@ -186,9 +194,9 @@ bool TaskQueue< ExecSpace >::push_task task_root_type * const zero = (task_root_type *) 0 ; task_root_type * const lock = (task_root_type *) task_root_type::LockTag ; - task_root_type * volatile * const next = & task->m_next ; + task_root_type * volatile & next = task->m_next ; - if ( zero != *next ) { + if ( zero != next ) { Kokkos::abort("TaskQueue::push_task ERROR: already a member of another queue" ); } @@ -196,9 +204,9 @@ bool TaskQueue< ExecSpace >::push_task while ( lock != y ) { - *next = y ; + next = y ; - // Do not proceed until '*next' has been stored. + // Do not proceed until 'next' has been stored. Kokkos::memory_fence(); task_root_type * const x = y ; @@ -211,9 +219,9 @@ bool TaskQueue< ExecSpace >::push_task // Failed, replace 'task->m_next' value since 'task' remains // not a member of a queue. - *next = zero ; + next = zero ; - // Do not proceed until '*next' has been stored. + // Do not proceed until 'next' has been stored. Kokkos::memory_fence(); return false ; @@ -270,11 +278,13 @@ TaskQueue< ExecSpace >::pop_ready_task // This thread has exclusive access to // the queue and the popped task's m_next. 
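+      // (storing the lock value in m_next marks the task as Executing,
+      //  i.e. not a member of any queue; see the TaskBase state diagram)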
- *queue = task->m_next ; task->m_next = lock ; + task_root_type * volatile & next = task->m_next ; + + *queue = next ; next = lock ; Kokkos::memory_fence(); -#if 0 +#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING printf( "pop_ready_task( 0x%lx 0x%lx { 0x%lx 0x%lx %d %d %d } )\n" , uintptr_t(queue) , uintptr_t(task) @@ -323,7 +333,7 @@ void TaskQueue< ExecSpace >::schedule_runnable // task->m_wait == head of linked list (queue) // task->m_next == member of linked list (queue) -#if 0 +#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING printf( "schedule_runnable( 0x%lx { 0x%lx 0x%lx %d %d %d }\n" , uintptr_t(task) , uintptr_t(task->m_wait) @@ -337,20 +347,22 @@ void TaskQueue< ExecSpace >::schedule_runnable task_root_type * const lock = (task_root_type *) task_root_type::LockTag ; task_root_type * const end = (task_root_type *) task_root_type::EndTag ; + task_root_type volatile & t = *task ; + bool respawn = false ; //---------------------------------------- - if ( zero == task->m_wait ) { + if ( zero == t.m_wait ) { // Task in Constructing state // - Transition to Waiting state // Preconditions: // - call occurs exclusively within a single thread - task->m_wait = end ; + t.m_wait = end ; // Task in Waiting state } - else if ( lock != task->m_wait ) { + else if ( lock != t.m_wait ) { // Task in Executing state with Respawn request // - Update dependence // - Transition to Waiting state @@ -373,7 +385,9 @@ void TaskQueue< ExecSpace >::schedule_runnable // Exclusive access so don't need an atomic exchange // task_root_type * dep = Kokkos::atomic_exchange( & task->m_next , zero ); - task_root_type * dep = task->m_next ; task->m_next = zero ; + task_root_type * dep = t.m_next ; t.m_next = zero ; + + Kokkos::memory_fence(); const bool is_ready = ( 0 == dep ) || ( ! push_task( & dep->m_wait , task ) ); @@ -398,7 +412,7 @@ void TaskQueue< ExecSpace >::schedule_runnable Kokkos::atomic_increment( & m_ready_count ); task_root_type * volatile * const ready_queue = - & m_ready[ task->m_priority ][ task->m_task_type ]; + & m_ready[ t.m_priority ][ t.m_task_type ]; // A push_task fails if the ready queue is locked. // A ready queue is only locked during a push or pop; @@ -441,7 +455,7 @@ void TaskQueue< ExecSpace >::schedule_aggregate // task->m_wait == head of linked list (queue) // task->m_next == member of linked list (queue) -#if 0 +#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING printf( "schedule_aggregate( 0x%lx { 0x%lx 0x%lx %d %d %d }\n" , uintptr_t(task) , uintptr_t(task->m_wait) @@ -455,18 +469,20 @@ void TaskQueue< ExecSpace >::schedule_aggregate task_root_type * const lock = (task_root_type *) task_root_type::LockTag ; task_root_type * const end = (task_root_type *) task_root_type::EndTag ; + task_root_type volatile & t = *task ; + //---------------------------------------- - if ( zero == task->m_wait ) { + if ( zero == t.m_wait ) { // Task in Constructing state // - Transition to Waiting state // Preconditions: // - call occurs exclusively within a single thread - task->m_wait = end ; + t.m_wait = end ; // Task in Waiting state } - else if ( lock == task->m_wait ) { + else if ( lock == t.m_wait ) { // Task in Complete state Kokkos::abort("TaskQueue::schedule_aggregate ERROR: task is complete"); } @@ -477,14 +493,14 @@ void TaskQueue< ExecSpace >::schedule_aggregate // (1) created or // (2) being removed from a completed task's wait list. 
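 // In either case every remaining dependence must be re-examined, and the
 // aggregate completes only when all of its dependences are complete.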
-  task_root_type ** const aggr = task->aggregate_dependences();
+  task_root_type * volatile * const aggr = t.aggregate_dependences();
 
   // Assume the 'when_all' is complete until a dependence is
   // found that is not complete.
 
   bool is_complete = true ;
 
-  for ( int i = task->m_dep_count ; 0 < i && is_complete ; ) {
+  for ( int i = t.m_dep_count ; 0 < i && is_complete ; ) {
 
     --i ;
 
@@ -523,7 +539,7 @@ void TaskQueue< ExecSpace >::schedule_aggregate
     // Complete the when_all 'task' to schedule other tasks
     // that are waiting for the when_all 'task' to complete.
 
-    task->m_next = lock ;
+    t.m_next = lock ;
 
     complete( task );
 
@@ -573,7 +589,7 @@ void TaskQueue< ExecSpace >::complete
   task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
   task_root_type * const end  = (task_root_type *) task_root_type::EndTag ;
 
-#if 0
+#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
   printf( "complete( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
         , uintptr_t(task)
        , uintptr_t(task->m_wait)
@@ -584,11 +600,13 @@
   fflush( stdout );
 #endif
 
-  const bool runnable = task_root_type::Aggregate != task->m_task_type ;
+  task_root_type volatile & t = *task ;
+
+  const bool runnable = task_root_type::Aggregate != t.m_task_type ;
 
   //----------------------------------------
 
-  if ( runnable && lock != task->m_next ) {
+  if ( runnable && lock != t.m_next ) {
     // A runnable task has finished executing and requested respawn.
     // Schedule the task for subsequent execution.
 
@@ -607,7 +625,7 @@
     // Stop other tasks from adding themselves to this task's wait queue
     // by locking the head of this task's wait queue.
 
-    task_root_type * x = Kokkos::atomic_exchange( & task->m_wait , lock );
+    task_root_type * x = Kokkos::atomic_exchange( & t.m_wait , lock );
 
     if ( x != (task_root_type *) lock ) {
 
@@ -627,9 +645,13 @@
         // Have exclusive access to 'x' until it is scheduled
         // Set x->m_next = zero <= no dependence, not a respawn
 
-        task_root_type * const next = x->m_next ; x->m_next = 0 ;
+        task_root_type volatile & vx = *x ;
 
-        if ( task_root_type::Aggregate != x->m_task_type ) {
+        task_root_type * const next = vx.m_next ; vx.m_next = 0 ;
+
+        Kokkos::memory_fence();
+
+        if ( task_root_type::Aggregate != vx.m_task_type ) {
           schedule_runnable( x );
         }
         else {
diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewArray.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewArray.hpp
index c55636b64e..ed1a71bea7 100644
--- a/lib/kokkos/core/src/impl/Kokkos_ViewArray.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_ViewArray.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 // Kokkos v. 2.0
 // Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -47,7 +47,6 @@
 #include 
 
 namespace Kokkos {
-namespace Experimental {
 namespace Impl {
 
 template< class DataType , class ArrayLayout , class V , size_t N , class P >
@@ -94,13 +93,12 @@ public:
   typedef typename ViewDataType< non_const_scalar_type , array_scalar_dimension >::type  non_const_scalar_array_type ;
 };
 
-}}} // namespace Kokkos::Experimental::Impl
+}} // namespace Kokkos::Impl
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
-namespace Experimental {
 namespace Impl {
 
 /** \brief  View mapping for non-specialized data type and standard layout */
@@ -597,7 +595,7 @@ public:
   }
 };
 
-}}} // namespace Kokkos::Experimental::Impl
+}} // namespace Kokkos::Impl
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp
index 6381aee468..f32c6bb2ee 100644
--- a/lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp
@@ -96,6 +96,27 @@ struct is_view_label< const char[N] > : public std::true_type {};
 template< typename ... P >
 struct ViewCtorProp ;
 
+// Forward declare
+template< typename Specialize , typename T >
+struct CommonViewAllocProp ;
+
+/* Common value_type stored as ViewCtorProp
+ */
+template< typename Specialize , typename T >
+struct ViewCtorProp< void , CommonViewAllocProp< Specialize , T > >
+{
+  ViewCtorProp() = default ;
+  ViewCtorProp( const ViewCtorProp & ) = default ;
+  ViewCtorProp & operator = ( const ViewCtorProp & ) = default ;
+
+  using type = CommonViewAllocProp< Specialize , T > ;
+
+  ViewCtorProp( const type & arg ) : value( arg ) {}
+  ViewCtorProp( type && arg ) : value( arg ) {}
+
+  type value ;
+};
+
 /* std::integral_constant are dummy arguments
  * that avoid duplicate base class errors
  */
diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
index 900bd88f1c..d346f9e639 100644
--- a/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
@@ -62,7 +62,6 @@
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
-namespace Experimental {
 namespace Impl {
 
 template< unsigned I , size_t ... Args >
@@ -250,7 +249,7 @@ struct ViewDimensionAssignable< ViewDimension< DstArgs ... >
 
 };
 
-}}} // namespace Kokkos::Experimental::Impl
+}} // namespace Kokkos::Impl
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -266,14 +265,11 @@ struct ALL_t {
 }} // namespace Kokkos::Impl
 
 namespace Kokkos {
-namespace Experimental {
 namespace Impl {
 
-using Kokkos::Impl::ALL_t ;
-
 template< class T >
 struct is_integral_extent_type
-{ enum { value = std::is_same< T , ALL_t >::value ? 1 : 0 }; };
+{ enum { value = std::is_same< T , Kokkos::Impl::ALL_t >::value ?
1 : 0 }; };
 
 template< class iType >
 struct is_integral_extent_type< std::pair< iType , iType > >
@@ -314,10 +310,10 @@ struct SubviewLegalArgsCompileTime;
 
 template< class LayoutDest , class LayoutSrc , int RankDest , int RankSrc , int CurrentArg , class Arg , class ... SubViewArgs >
 struct SubviewLegalArgsCompileTime< Kokkos::LayoutLeft , Kokkos::LayoutLeft , RankDest , RankSrc , CurrentArg , Arg , SubViewArgs... > {
-  enum { value =(((CurrentArg==RankDest-1) && (Kokkos::Experimental::Impl::is_integral_extent_type<Arg>::value)) ||
+  enum { value =(((CurrentArg==RankDest-1) && (Kokkos::Impl::is_integral_extent_type<Arg>::value)) ||
                  ((CurrentArg>=RankDest) && (std::is_integral<Arg>::value)) ||
                  ((CurrentArg<RankDest) && (std::is_same<Arg,Kokkos::Impl::ALL_t>::value)) ||
-                 ((CurrentArg==0) && (Kokkos::Experimental::Impl::is_integral_extent_type<Arg>::value))
+                 ((CurrentArg==0) && (Kokkos::Impl::is_integral_extent_type<Arg>::value))
                 ) && (SubviewLegalArgsCompileTime< LayoutDest , LayoutSrc , RankDest , RankSrc , CurrentArg+1 , SubViewArgs... >::value)};
 };
 
@@ -331,7 +327,7 @@ struct SubviewLegalArgsCompileTime
 struct SubviewLegalArgsCompileTime< Kokkos::LayoutRight , Kokkos::LayoutRight , RankDest , RankSrc , CurrentArg , Arg , SubViewArgs... > {
-  enum { value =(((CurrentArg==RankSrc-RankDest) && (Kokkos::Experimental::Impl::is_integral_extent_type<Arg>::value)) ||
+  enum { value =(((CurrentArg==RankSrc-RankDest) && (Kokkos::Impl::is_integral_extent_type<Arg>::value)) ||
                  ((CurrentArg<RankSrc-RankDest) && (std::is_integral<Arg>::value)) ||
                  ((CurrentArg>=RankSrc-RankDest) && (std::is_same<Arg,Kokkos::Impl::ALL_t>::value))
                 ) && (SubviewLegalArgsCompileTime< LayoutDest , LayoutSrc , RankDest , RankSrc , CurrentArg+1 , SubViewArgs... >::value)};
 };
@@ -403,7 +399,7 @@ private:
   bool set( unsigned domain_rank
           , unsigned range_rank
           , const ViewDimension< DimArgs ... > & dim
-          , const Kokkos::Experimental::Impl::ALL_t
+          , const Kokkos::Impl::ALL_t
           , Args ... args )
     {
       m_begin[ domain_rank ] = 0 ;
@@ -519,7 +515,7 @@ private:
           , unsigned domain_rank
           , unsigned range_rank
           , const ViewDimension< DimArgs ... > & dim
-          , const Kokkos::Experimental::Impl::ALL_t
+          , const Kokkos::Impl::ALL_t
          , Args ... args ) const
     {
       const int n = std::min( buf_len ,
@@ -670,13 +666,12 @@ public:
     { return unsigned(i) < InternalRangeRank ? m_index[i] : ~0u ; }
 };
 
-}}} // namespace Kokkos::Experimental::Impl
+}} // namespace Kokkos::Impl
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
-namespace Experimental {
 namespace Impl {
 
 /** \brief  Given a value type and dimension generate the View data type */
@@ -814,13 +809,12 @@ public:
   typedef non_const_type non_const_scalar_array_type ;
 };
 
-}}} // namespace Kokkos::Experimental::Impl
+}} // namespace Kokkos::Impl
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
-namespace Experimental {
 namespace Impl {
 
 template < class Dimension , class Layout , typename Enable = void >
@@ -1228,14 +1222,14 @@ private:
 
   // If memory alignment is a multiple of the trivial scalar size then attempt to align.
   enum { align = 0 != TrivialScalarSize && 0 == mod ? div : 0 };
-  enum { div_ok = div ? div : 1 }; // To valid modulo zero in constexpr
+  enum { div_ok = (div != 0) ? div : 1 }; // To avoid modulo zero in constexpr
 
   KOKKOS_INLINE_FUNCTION
   static constexpr size_t stride( size_t const N )
-  {
-    return ( align && ( Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align < N ) && ( N % div_ok ) )
-           ? N + align - ( N % div_ok ) : N ;
-  }
+  {
+    return ( (align != 0) && ((Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align) < N) && ((N % div_ok) != 0) )
+           ? N + align - ( N % div_ok ) : N ;
+  }
 };
 
 public:
@@ -1707,12 +1701,12 @@ private:
 
   // If memory alignment is a multiple of the trivial scalar size then attempt to align.
  enum { align = 0 != TrivialScalarSize && 0 == mod ? div : 0 };
-  enum { div_ok = div ? div : 1 }; // To valid modulo zero in constexpr
+  enum { div_ok = (div != 0) ?
div : 1 }; // To avoid modulo zero in constexpr
 
   KOKKOS_INLINE_FUNCTION
   static constexpr size_t stride( size_t const N )
   {
-    return ( align && ( Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align < N ) && ( N % div_ok ) )
+    return ( (align != 0) && ((Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align) < N) && ((N % div_ok) != 0) )
            ? N + align - ( N % div_ok ) : N ;
   }
 };
@@ -2225,13 +2219,12 @@ public:
     {}
 };
 
-}}} // namespace Kokkos::Experimental::Impl
+}} // namespace Kokkos::Impl
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
-namespace Experimental {
 namespace Impl {
 
 /** \brief  ViewDataHandle provides the type of the 'data handle' which the view
@@ -2422,13 +2415,12 @@ struct ViewDataHandle< Traits ,
     return handle_type( arg_data_ptr + offset );
   }
 };
 
-}}} // namespace Kokkos::Experimental::Impl
+}} // namespace Kokkos::Impl
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
-namespace Experimental {
 namespace Impl {
 
 //----------------------------------------------------------------------------
@@ -2451,8 +2443,9 @@ template< class ExecSpace , class ValueType >
 struct ViewValueFunctor< ExecSpace , ValueType , false /* is_scalar */ >
 {
   typedef Kokkos::RangePolicy< ExecSpace > PolicyType ;
+  typedef typename ExecSpace::execution_space Exec;
 
-  ExecSpace space ;
+  Exec      space ;
   ValueType * ptr ;
   size_t      n ;
   bool        destroy ;
@@ -2597,6 +2590,9 @@ private:
 
 public:
 
+  typedef void printable_label_typedef;
+  enum { is_managed = Traits::is_managed };
+
   //----------------------------------------
   // Domain dimensions
 
@@ -2944,7 +2940,7 @@ public:
         Kokkos::abort("View Assignment: trying to assign runtime dimension to non matching compile time dimension.");
       }
       dst.m_offset = dst_offset_type( src.m_offset );
-      dst.m_handle = Kokkos::Experimental::Impl::ViewDataHandle< DstTraits >::assign( src.m_handle , src_track );
+      dst.m_handle = Kokkos::Impl::ViewDataHandle< DstTraits >::assign( src.m_handle , src_track );
     }
 };
 
@@ -3102,7 +3098,7 @@ public:
 
 //----------------------------------------------------------------------------
 
-}}} // namespace Kokkos::Experimental::Impl
+}} // namespace Kokkos::Impl
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -3151,6 +3147,77 @@ void view_error_operator_bounds
   view_error_operator_bounds<0>(buf+n,len-n,map,args...);
 }
 
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+
+/* Check #3: is the View managed as determined by the MemoryTraits?
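+   (Unmanaged Views have no SharedAllocationHeader in front of their
+   allocation, so only the generic abort message is possible for them.)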
 */
+template< class MapType,
+          bool is_managed = (MapType::is_managed != 0) >
+struct OperatorBoundsErrorOnDevice;
+
+template< class MapType >
+struct OperatorBoundsErrorOnDevice< MapType, false > {
+KOKKOS_INLINE_FUNCTION
+static void run(MapType const&) {
+  Kokkos::abort("View bounds error");
+}
+};
+
+template< class MapType >
+struct OperatorBoundsErrorOnDevice< MapType, true > {
+KOKKOS_INLINE_FUNCTION
+static void run(MapType const& map) {
+  char const* const user_alloc_start = reinterpret_cast<char const*>(map.data());
+  char const* const header_start = user_alloc_start - sizeof(SharedAllocationHeader);
+  SharedAllocationHeader const* const header =
+    reinterpret_cast<SharedAllocationHeader const*>(header_start);
+  char const* const label = header->label();
+  enum { LEN = 128 };
+  char msg[LEN];
+  char const* const first_part = "View bounds error of view ";
+  char* p = msg;
+  char* const end = msg + LEN - 1;
+  for (char const* p2 = first_part; (*p2 != '\0') && (p < end); ++p, ++p2) {
+    *p = *p2;
+  }
+  for (char const* p2 = label; (*p2 != '\0') && (p < end); ++p, ++p2) {
+    *p = *p2;
+  }
+  *p = '\0';
+  Kokkos::abort(msg);
+}
+};
+
+/* Check #2: does the ViewMapping have the printable_label_typedef defined?
+   See above that only the non-specialized standard-layout ViewMapping has
+   this defined by default.
+   The existence of this typedef indicates the existence of MapType::is_managed */
+template< class T, class Enable = void >
+struct has_printable_label_typedef : public std::false_type {};
+
+template< class T >
+struct has_printable_label_typedef<
+  T, typename enable_if_type< typename T::printable_label_typedef >::type>
+  : public std::true_type
+{};
+
+template< class MapType >
+KOKKOS_INLINE_FUNCTION
+void operator_bounds_error_on_device(
+    MapType const&,
+    std::false_type) {
+  Kokkos::abort("View bounds error");
+}
+
+template< class MapType >
+KOKKOS_INLINE_FUNCTION
+void operator_bounds_error_on_device(
+    MapType const& map,
+    std::true_type) {
+  OperatorBoundsErrorOnDevice< MapType >::run(map);
+}
+
+#endif // ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+
 template< class MemorySpace , class MapType , class ... Args >
 KOKKOS_INLINE_FUNCTION
 void view_verify_operator_bounds
@@ -3166,7 +3233,17 @@ void view_verify_operator_bounds
     view_error_operator_bounds<0>( buffer + n , LEN - n , map , args ... );
     Kokkos::Impl::throw_runtime_exception(std::string(buffer));
 #else
-    Kokkos::abort("View bounds error");
+    /* Check #1: is there a SharedAllocationRecord?
+       (we won't use it, but if it's not there then there isn't
+       a corresponding SharedAllocationHeader containing a label).
+       This check should cover the case of Views that don't
+       have the Unmanaged trait but were initialized by pointer. */
+    if (tracker.has_record()) {
+      operator_bounds_error_on_device(
+          map, has_printable_label_typedef< MapType >());
+    } else {
+      Kokkos::abort("View bounds error");
+    }
 #endif
   }
 }
diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewTile.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewTile.hpp
index ecbcf72fe0..5a8600e0ae 100644
--- a/lib/kokkos/core/src/impl/Kokkos_ViewTile.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_ViewTile.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 // Kokkos v. 2.0
 // Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -48,7 +48,6 @@ //---------------------------------------------------------------------------- namespace Kokkos { -namespace Experimental { namespace Impl { // View mapping for rank two tiled array @@ -195,11 +194,9 @@ struct ViewMapping }; } /* namespace Impl */ -} /* namespace Experimental */ } /* namespace Kokkos */ namespace Kokkos { -namespace Experimental { template< typename T , unsigned N0 , unsigned N1 , class ... P > KOKKOS_INLINE_FUNCTION @@ -217,7 +214,6 @@ tile_subview( const Kokkos::View,P...> & ( src , SrcLayout() , i_tile0 , i_tile1 ); } -} /* namespace Experimental */ } /* namespace Kokkos */ //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp b/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp deleted file mode 100644 index 101b714fcd..0000000000 --- a/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp +++ /dev/null @@ -1,183 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 2.0 -// Copyright (2014) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#include -#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) - -#include - -#include -#include - -/*--------------------------------------------------------------------------*/ - -#if !defined( _WIN32 ) - #if defined( KOKKOS_ENABLE_ASM ) - #if defined( __arm__ ) || defined( __aarch64__ ) - /* No-operation instruction to idle the thread. */ - #define KOKKOS_INTERNAL_PAUSE - #else - /* Pause instruction to prevent excess processor bus usage */ - #define KOKKOS_INTERNAL_PAUSE asm volatile("pause\n":::"memory") - #endif - #define KOKKOS_INTERNAL_NOP2 asm volatile("nop\n" "nop\n") - #define KOKKOS_INTERNAL_NOP4 KOKKOS_INTERNAL_NOP2; KOKKOS_INTERNAL_NOP2 - #define KOKKOS_INTERNAL_NOP8 KOKKOS_INTERNAL_NOP4; KOKKOS_INTERNAL_NOP4; - #define KOKKOS_INTERNAL_NOP16 KOKKOS_INTERNAL_NOP8; KOKKOS_INTERNAL_NOP8; - #define KOKKOS_INTERNAL_NOP32 KOKKOS_INTERNAL_NOP16; KOKKOS_INTERNAL_NOP16; - namespace { - inline void kokkos_internal_yield( const unsigned i ) noexcept { - switch (Kokkos::Impl::bit_scan_reverse((i >> 2)+1u)) { - case 0u: KOKKOS_INTERNAL_NOP2; break; - case 1u: KOKKOS_INTERNAL_NOP4; break; - case 2u: KOKKOS_INTERNAL_NOP8; break; - case 3u: KOKKOS_INTERNAL_NOP16; break; - default: KOKKOS_INTERNAL_NOP32; - } - KOKKOS_INTERNAL_PAUSE; - } - } - #else - #include - namespace { - inline void kokkos_internal_yield( const unsigned ) noexcept { - sched_yield(); - } - } - #endif -#else // defined( _WIN32 ) - #if defined ( KOKKOS_ENABLE_WINTHREAD ) - #include - namespace { - inline void kokkos_internal_yield( const unsigned ) noexcept { - Sleep(0); - } - } - #elif defined( _MSC_VER ) - #define NOMINMAX - #include - #include - namespace { - inline void kokkos_internal_yield( const unsigned ) noexcept { - YieldProcessor(); - } - } - #else - #define KOKKOS_INTERNAL_PAUSE __asm__ __volatile__("pause\n":::"memory") - #define KOKKOS_INTERNAL_NOP2 __asm__ __volatile__("nop\n" "nop") - #define KOKKOS_INTERNAL_NOP4 KOKKOS_INTERNAL_NOP2; KOKKOS_INTERNAL_NOP2 - #define KOKKOS_INTERNAL_NOP8 KOKKOS_INTERNAL_NOP4; KOKKOS_INTERNAL_NOP4; - #define KOKKOS_INTERNAL_NOP16 KOKKOS_INTERNAL_NOP8; KOKKOS_INTERNAL_NOP8; - #define KOKKOS_INTERNAL_NOP32 KOKKOS_INTERNAL_NOP16; KOKKOS_INTERNAL_NOP16; - namespace { - inline void kokkos_internal_yield( const unsigned i ) noexcept { - switch (Kokkos::Impl::bit_scan_reverse((i >> 2)+1u)) { - case 0: KOKKOS_INTERNAL_NOP2; break; - case 1: KOKKOS_INTERNAL_NOP4; break; - case 2: KOKKOS_INTERNAL_NOP8; break; - case 3: KOKKOS_INTERNAL_NOP16; break; - default: KOKKOS_INTERNAL_NOP32; - } - KOKKOS_INTERNAL_PAUSE; - } - } - #endif -#endif - - -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { -namespace Impl { - -void spinwait_while_equal( volatile int32_t & flag , const int32_t value ) -{ - Kokkos::store_fence(); - unsigned i = 0; - while ( value == flag ) { - kokkos_internal_yield(i); - ++i; - } - Kokkos::load_fence(); -} - -void spinwait_until_equal( volatile int32_t & flag , const int32_t value ) -{ - Kokkos::store_fence(); - unsigned i = 0; - while ( value != flag ) { - kokkos_internal_yield(i); - ++i; - } - Kokkos::load_fence(); -} - -void spinwait_while_equal( volatile int64_t & flag , const int64_t value ) -{ - Kokkos::store_fence(); - unsigned i = 0; - while ( value == flag ) { - kokkos_internal_yield(i); - ++i; - } - Kokkos::load_fence(); -} - -void spinwait_until_equal( volatile int64_t & flag , 
const int64_t value ) -{ - Kokkos::store_fence(); - unsigned i = 0; - while ( value != flag ) { - kokkos_internal_yield(i); - ++i; - } - Kokkos::load_fence(); -} - -} /* namespace Impl */ -} /* namespace Kokkos */ - -#else -void KOKKOS_CORE_SRC_IMPL_SPINWAIT_PREVENT_LINK_ERROR() {} -#endif - diff --git a/lib/kokkos/core/unit_test/CMakeLists.txt b/lib/kokkos/core/unit_test/CMakeLists.txt index 5d6f25ac95..475b6bb48a 100644 --- a/lib/kokkos/core/unit_test/CMakeLists.txt +++ b/lib/kokkos/core/unit_test/CMakeLists.txt @@ -57,6 +57,7 @@ IF(Kokkos_ENABLE_Serial) serial/TestSerial_ViewMapping_b.cpp serial/TestSerial_ViewMapping_subview.cpp serial/TestSerial_ViewOfClass.cpp + serial/TestSerial_WorkGraph.cpp COMM serial mpi NUM_MPI_PROCS 1 FAIL_REGULAR_EXPRESSION " FAILED " @@ -102,6 +103,7 @@ IF(Kokkos_ENABLE_Pthread) threads/TestThreads_ViewMapping_b.cpp threads/TestThreads_ViewMapping_subview.cpp threads/TestThreads_ViewOfClass.cpp + threads/TestThreads_WorkGraph.cpp COMM serial mpi NUM_MPI_PROCS 1 FAIL_REGULAR_EXPRESSION " FAILED " @@ -147,6 +149,8 @@ IF(Kokkos_ENABLE_OpenMP) openmp/TestOpenMP_ViewMapping_b.cpp openmp/TestOpenMP_ViewMapping_subview.cpp openmp/TestOpenMP_ViewOfClass.cpp + openmp/TestOpenMP_WorkGraph.cpp + openmp/TestOpenMP_UniqueToken.cpp COMM serial mpi NUM_MPI_PROCS 1 FAIL_REGULAR_EXPRESSION " FAILED " @@ -237,6 +241,7 @@ IF(Kokkos_ENABLE_Cuda) cuda/TestCuda_ViewMapping_b.cpp cuda/TestCuda_ViewMapping_subview.cpp cuda/TestCuda_ViewOfClass.cpp + cuda/TestCuda_WorkGraph.cpp COMM serial mpi NUM_MPI_PROCS 1 FAIL_REGULAR_EXPRESSION " FAILED " @@ -253,6 +258,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST( default/TestDefaultDeviceType_b.cpp default/TestDefaultDeviceType_c.cpp default/TestDefaultDeviceType_d.cpp + default/TestDefaultDeviceTypeResize.cpp COMM serial mpi NUM_MPI_PROCS 1 FAIL_REGULAR_EXPRESSION " FAILED " diff --git a/lib/kokkos/core/unit_test/Makefile b/lib/kokkos/core/unit_test/Makefile index 41f192a486..c877aa7dd2 100644 --- a/lib/kokkos/core/unit_test/Makefile +++ b/lib/kokkos/core/unit_test/Makefile @@ -62,8 +62,9 @@ endif OBJ_CUDA += TestCuda_TeamReductionScan.o OBJ_CUDA += TestCuda_Other.o OBJ_CUDA += TestCuda_MDRange.o - OBJ_CUDA += TestCuda_Task.o + OBJ_CUDA += TestCuda_Task.o TestCuda_WorkGraph.o OBJ_CUDA += TestCuda_Spaces.o + OBJ_CUDA += TestCuda_UniqueToken.o TARGETS += KokkosCore_UnitTest_Cuda @@ -121,7 +122,8 @@ endif OBJ_OPENMP += TestOpenMP_TeamReductionScan.o OBJ_OPENMP += TestOpenMP_Other.o OBJ_OPENMP += TestOpenMP_MDRange.o - OBJ_OPENMP += TestOpenMP_Task.o + OBJ_OPENMP += TestOpenMP_Task.o TestOpenMP_WorkGraph.o + OBJ_OPENMP += TestOpenMP_UniqueToken.o TARGETS += KokkosCore_UnitTest_OpenMP @@ -208,7 +210,7 @@ endif OBJ_SERIAL += TestSerial_TeamReductionScan.o OBJ_SERIAL += TestSerial_Other.o OBJ_SERIAL += TestSerial_MDRange.o - OBJ_SERIAL += TestSerial_Task.o + OBJ_SERIAL += TestSerial_Task.o TestSerial_WorkGraph.o TARGETS += KokkosCore_UnitTest_Serial diff --git a/lib/kokkos/core/unit_test/TestAggregate.hpp b/lib/kokkos/core/unit_test/TestAggregate.hpp index 6896a27bfb..87440c36be 100644 --- a/lib/kokkos/core/unit_test/TestAggregate.hpp +++ b/lib/kokkos/core/unit_test/TestAggregate.hpp @@ -58,7 +58,7 @@ template< class DeviceType > void TestViewAggregate() { typedef Kokkos::Array< double, 32 > value_type; - typedef Kokkos::Experimental::Impl::ViewDataAnalysis< value_type *, Kokkos::LayoutLeft, value_type > analysis_1d; + typedef Kokkos::Impl::ViewDataAnalysis< value_type *, Kokkos::LayoutLeft, value_type > analysis_1d; static_assert( std::is_same< typename 
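
For reference, the spinwait file deleted above escalated its busy-wait geometrically: iteration i executes 2 << bit_scan_reverse((i >> 2) + 1) no-ops, capped at 32, before a pause, i.e. 2 nops for i < 4, 4 for i < 12, 8 for i < 28, 16 for i < 60, and 32 thereafter. A rough portable sketch of that escalation, using a plain compiler barrier in place of the hand-written nop/pause macros (GNU-style inline asm assumed):

    // Portable stand-in for Kokkos::Impl::bit_scan_reverse:
    // index of the highest set bit of a nonzero value.
    inline unsigned bsr( unsigned v ) {
      unsigned r = 0;
      while ( v >>= 1 ) ++r;
      return r;
    }

    // Escalating backoff mirroring the deleted kokkos_internal_yield logic.
    inline void backoff( unsigned i ) {
      unsigned nops = 2u << bsr( ( i >> 2 ) + 1u );
      if ( nops > 32u ) nops = 32u;
      for ( unsigned k = 0 ; k < nops ; ++k ) {
        __asm__ __volatile__( "" ::: "memory" );  // compiler barrier in place of a real nop/pause
      }
    }
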
analysis_1d::specialize, Kokkos::Array<> >::value, "" ); diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp index 401da58a58..68864c8d66 100644 --- a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp +++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp @@ -186,6 +186,21 @@ void check_correct_initialization( const Kokkos::InitArguments & argstruct ) { // Figure out the number of threads the HostSpace ExecutionSpace should have initialized to. int expected_nthreads = argstruct.num_threads; +#ifdef KOKKOS_ENABLE_OPENMP + if ( std::is_same< Kokkos::HostSpace::execution_space, Kokkos::OpenMP >::value ) { + // use openmp default num threads + if ( expected_nthreads < 0 || ( expected_nthreads == 0 && !Kokkos::hwloc::available() ) ) { + expected_nthreads = omp_get_max_threads(); + } + // use hwloc if available + else if ( expected_nthreads == 0 && Kokkos::hwloc::available() ) { + expected_nthreads = Kokkos::hwloc::get_available_numa_count() + * Kokkos::hwloc::get_available_cores_per_numa() + * Kokkos::hwloc::get_available_threads_per_core(); + } + } +#endif + if ( expected_nthreads < 1 ) { if ( Kokkos::hwloc::available() ) { expected_nthreads = Kokkos::hwloc::get_available_numa_count() @@ -193,12 +208,6 @@ void check_correct_initialization( const Kokkos::InitArguments & argstruct ) { * Kokkos::hwloc::get_available_threads_per_core(); } else { -#ifdef KOKKOS_ENABLE_OPENMP - if ( std::is_same< Kokkos::HostSpace::execution_space, Kokkos::OpenMP >::value ) { - expected_nthreads = omp_get_max_threads(); - } - else -#endif expected_nthreads = 1; } diff --git a/lib/kokkos/core/unit_test/TestMDRange.hpp b/lib/kokkos/core/unit_test/TestMDRange.hpp index 091591bcbf..f579ddf02c 100644 --- a/lib/kokkos/core/unit_test/TestMDRange.hpp +++ b/lib/kokkos/core/unit_test/TestMDRange.hpp @@ -51,6 +51,180 @@ namespace Test { namespace { +template +struct TestMDRange_ReduceArray_2D { + + using DataType = int; + using ViewType_2 = typename Kokkos::View< DataType**, ExecSpace >; + using HostViewType_2 = typename ViewType_2::HostMirror; + + ViewType_2 input_view; + + using scalar_type = double; + using value_type = scalar_type[]; + const unsigned value_count; + + TestMDRange_ReduceArray_2D( const int N0, const int N1, const unsigned array_size ) + : input_view( "input_view", N0, N1 ) + , value_count( array_size ) + {} + + KOKKOS_INLINE_FUNCTION + void init( scalar_type dst[] ) const + { + for ( unsigned i = 0; i < value_count; ++i ) { + dst[i] = 0.0; + } + } + + KOKKOS_INLINE_FUNCTION + void join( volatile scalar_type dst[], + const volatile scalar_type src[] ) const + { + for ( unsigned i = 0; i < value_count; ++i ) { + dst[i] += src[i]; + } + } + + + KOKKOS_INLINE_FUNCTION + void operator()( const int i, const int j ) const + { + input_view( i, j ) = 1; + } + + KOKKOS_INLINE_FUNCTION + void operator()( const int i, const int j, value_type lsum ) const + { + lsum[0] += input_view( i, j ) * 2; //+=6 each time if InitTag => N0*N1*6 + lsum[1] += input_view( i, j ) ; //+=3 each time if InitTag => N0*N1*3 + } + + // tagged operators + struct InitTag {}; + KOKKOS_INLINE_FUNCTION + void operator()( const InitTag &, const int i, const int j ) const + { + input_view( i, j ) = 3; + } + + static void test_arrayreduce2( const int N0, const int N1 ) + { + using namespace Kokkos::Experimental; + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType, InitTag > range_type_init; + typedef typename 
Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type_init range_init( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } ); + range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } ); + + const unsigned array_size = 2; + + TestMDRange_ReduceArray_2D functor( N0, N1, array_size ); + + parallel_for( range_init, functor ); // Init the view to 3's + + double sums[ array_size ]; + parallel_reduce( range, functor, sums ); + + // Check output + //printf("Array Reduce result. N0 = %d N1 = %d N0*N1 = %d sums[0] = %lf sums[1] = %lf \n", N0, N1, N0*N1, sums[0], sums[1]); + + ASSERT_EQ( sums[0], 6 * N0 * N1 ); + ASSERT_EQ( sums[1], 3 * N0 * N1 ); + } + } +}; + +template +struct TestMDRange_ReduceArray_3D { + + using DataType = int; + using ViewType_3 = typename Kokkos::View< DataType***, ExecSpace >; + using HostViewType_3 = typename ViewType_3::HostMirror; + + ViewType_3 input_view; + + using scalar_type = double; + using value_type = scalar_type[]; + const unsigned value_count; + + TestMDRange_ReduceArray_3D( const int N0, const int N1, const int N2, const unsigned array_size ) + : input_view( "input_view", N0, N1, N2 ) + , value_count( array_size ) + {} + + KOKKOS_INLINE_FUNCTION + void init( scalar_type dst[] ) const + { + for ( unsigned i = 0; i < value_count; ++i ) { + dst[i] = 0.0; + } + } + + KOKKOS_INLINE_FUNCTION + void join( volatile scalar_type dst[], + const volatile scalar_type src[] ) const + { + for ( unsigned i = 0; i < value_count; ++i ) { + dst[i] += src[i]; + } + } + + + KOKKOS_INLINE_FUNCTION + void operator()( const int i, const int j, const int k ) const + { + input_view( i, j, k ) = 1; + } + + KOKKOS_INLINE_FUNCTION + void operator()( const int i, const int j, const int k, value_type lsum ) const + { + lsum[0] += input_view( i, j, k ) * 2; //+=6 each time if InitTag => N0*N1*N2*6 + lsum[1] += input_view( i, j, k ) ; //+=3 each time if InitTag => N0*N1*N2*3 + } + + // tagged operators + struct InitTag {}; + KOKKOS_INLINE_FUNCTION + void operator()( const InitTag &, const int i, const int j, const int k ) const + { + input_view( i, j, k ) = 3; + } + + static void test_arrayreduce3( const int N0, const int N1, const int N2 ) + { + using namespace Kokkos::Experimental; + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3>, Kokkos::IndexType, InitTag > range_type_init; + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3>, Kokkos::IndexType > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type_init range_init( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } ); + range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } ); + + const unsigned array_size = 2; + + TestMDRange_ReduceArray_3D functor( N0, N1, N2, array_size ); + + parallel_for( range_init, functor ); // Init the view to 3's + + double sums[ array_size ]; + parallel_reduce( range, functor, sums ); + + ASSERT_EQ( sums[0], 6 * N0 * N1 * N2 ); + ASSERT_EQ( sums[1], 3 * N0 * N1 * N2 ); + } + } +}; + + template struct TestMDRange_2D { using DataType = int; @@ -58,6 +232,7 @@ struct TestMDRange_2D { using HostViewType = typename ViewType::HostMirror; ViewType input_view; + using value_type = double; TestMDRange_2D( 
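
The two ReduceArray functors above follow Kokkos' array-reduction protocol: value_type is declared as an unbounded array type, value_count fixes its runtime length, and init/join supply the identity and combine steps that parallel_reduce applies to thread-private copies. A minimal 1-D sketch of the same protocol (illustrative functor, not part of the test suite):

    #include <Kokkos_Core.hpp>

    struct ArraySum {
      typedef double value_type[];  // marks this as an array-valued reduction
      const unsigned value_count;   // runtime length, required by Kokkos

      ArraySum( unsigned n ) : value_count( n ) {}

      KOKKOS_INLINE_FUNCTION
      void init( double dst[] ) const {
        for ( unsigned k = 0 ; k < value_count ; ++k ) dst[k] = 0.0;
      }

      KOKKOS_INLINE_FUNCTION
      void join( volatile double dst[], const volatile double src[] ) const {
        for ( unsigned k = 0 ; k < value_count ; ++k ) dst[k] += src[k];
      }

      KOKKOS_INLINE_FUNCTION
      void operator()( const int i, double lsum[] ) const {
        lsum[0] += 1.0;       // element count
        lsum[1] += 2.0 * i;   // weighted contribution
      }
    };

    // Usage: sums[0] == N and sums[1] == N*(N-1) after the reduce.
    // double sums[2];
    // Kokkos::parallel_reduce( Kokkos::RangePolicy<>( 0, N ), ArraySum( 2 ), sums );
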
const DataType N0, const DataType N1 ) : input_view( "input_view", N0, N1 ) {} @@ -68,7 +243,7 @@ struct TestMDRange_2D { } KOKKOS_INLINE_FUNCTION - void operator()( const int i, const int j, double &lsum ) const + void operator()( const int i, const int j, value_type &lsum ) const { lsum += input_view( i, j ) * 2; } @@ -81,6 +256,13 @@ struct TestMDRange_2D { input_view( i, j ) = 3; } + // reduction tagged operators + KOKKOS_INLINE_FUNCTION + void operator()( const InitTag &, const int i, const int j, value_type &lsum ) const + { + lsum += input_view( i, j ) * 3; + } + static void test_reduce2( const int N0, const int N1 ) { using namespace Kokkos::Experimental; @@ -94,13 +276,85 @@ struct TestMDRange_2D { TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 ); } + // Test with reducers - scalar + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType > range_type; + range_type range( {{ 0, 0 }}, {{ N0, N1 }}, {{ 3, 3 }} ); + + TestMDRange_2D functor( N0, N1 ); + + parallel_for( range, functor ); + + value_type sum = 0.0; + Kokkos::Experimental::Sum< value_type > reducer_scalar( sum ); + + parallel_reduce( range, functor, reducer_scalar ); + + ASSERT_EQ( sum, 2 * N0 * N1 ); + } + // Test with reducers - scalar view + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType > range_type; + range_type range( {{ 0, 0 }}, {{ N0, N1 }}, {{ 3, 3 }} ); + + TestMDRange_2D functor( N0, N1 ); + + parallel_for( range, functor ); + + value_type sum = 0.0; + Kokkos::View< value_type, Kokkos::HostSpace > sum_view("sum_view"); + sum_view() = sum; + Kokkos::Experimental::Sum< value_type > reducer_view( sum_view ); + + parallel_reduce( range, functor, reducer_view); + sum = sum_view(); + + ASSERT_EQ( sum, 2 * N0 * N1 ); + } + + // Tagged operator test + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2, Iterate::Default, Iterate::Default >, Kokkos::IndexType, InitTag > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 2, 4 } } ); + + TestMDRange_2D functor( N0, N1 ); + + parallel_for( range, functor ); + + // check parallel_for results correct with InitTag + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view, functor.input_view ); + int counter = 0; + for ( int i = 0; i < N0; ++i ) + for ( int j = 0; j < N1; ++j ) + { + if ( h_view( i, j ) != 3 ) { + ++counter; + } + } + + if ( counter != 0 ) { + printf( "Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter ); + } + ASSERT_EQ( counter, 0 ); + + + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 9 * N0 * N1 ); + } + { typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2, Iterate::Default, Iterate::Default>, Kokkos::IndexType > range_type; typedef typename range_type::tile_type tile_type; @@ -110,9 +364,9 @@ struct TestMDRange_2D { TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 ); } @@ -126,9 +380,9 @@ struct 
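
The reducer-based tests above exercise the two result destinations Kokkos::Experimental::Sum accepts in this version: a scalar reference, updated in place, or a rank-0 View that is read back after the reduce. Condensed from the test bodies themselves:

    // Scalar result: the reducer holds a reference to a local variable.
    value_type sum = 0.0;
    Kokkos::Experimental::Sum< value_type > reducer_scalar( sum );
    parallel_reduce( range, functor, reducer_scalar );   // sum now holds the total

    // View result: the reducer holds a rank-0 View, read back afterwards.
    Kokkos::View< value_type, Kokkos::HostSpace > sum_view( "sum_view" );
    Kokkos::Experimental::Sum< value_type > reducer_view( sum_view );
    parallel_reduce( range, functor, reducer_view );
    sum = sum_view();
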
TestMDRange_2D { TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 ); } @@ -142,9 +396,9 @@ struct TestMDRange_2D { TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 ); } @@ -158,9 +412,9 @@ struct TestMDRange_2D { TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 ); } @@ -174,9 +428,9 @@ struct TestMDRange_2D { TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 ); } @@ -194,7 +448,7 @@ struct TestMDRange_2D { range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } ); TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -223,7 +477,7 @@ struct TestMDRange_2D { range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } ); TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -251,7 +505,7 @@ struct TestMDRange_2D { range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } } ); TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -280,7 +534,7 @@ struct TestMDRange_2D { range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } ); TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -309,7 +563,7 @@ struct TestMDRange_2D { range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 4, 4 } } ); TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -338,7 +592,7 @@ struct TestMDRange_2D { range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } ); TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -367,7 +621,7 @@ struct TestMDRange_2D { range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 7, 7 } } ); TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = 
Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -396,7 +650,7 @@ struct TestMDRange_2D { range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 16, 16 } } ); TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -425,7 +679,7 @@ struct TestMDRange_2D { range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 5, 16 } } ); TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -455,6 +709,7 @@ struct TestMDRange_3D { using HostViewType = typename ViewType::HostMirror; ViewType input_view; + using value_type = double; TestMDRange_3D( const DataType N0, const DataType N1, const DataType N2 ) : input_view( "input_view", N0, N1, N2 ) {} @@ -478,6 +733,13 @@ struct TestMDRange_3D { input_view( i, j, k ) = 3; } + // reduction tagged operators + KOKKOS_INLINE_FUNCTION + void operator()( const InitTag &, const int i, const int j, const int k, value_type &lsum ) const + { + lsum += input_view( i, j, k ) * 3; + } + static void test_reduce3( const int N0, const int N1, const int N2 ) { using namespace Kokkos::Experimental; @@ -491,13 +753,86 @@ struct TestMDRange_3D { TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 * N2 ); } + // Test with reducers - scalar + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3>, Kokkos::IndexType > range_type; + range_type range( {{ 0, 0, 0 }}, {{ N0, N1, N2 }}, {{ 3, 3, 3 }} ); + + TestMDRange_3D functor( N0, N1, N2 ); + + parallel_for( range, functor ); + + value_type sum = 0.0; + Kokkos::Experimental::Sum< value_type > reducer_scalar( sum ); + + parallel_reduce( range, functor, reducer_scalar ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 ); + } + // Test with reducers - scalar view + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3>, Kokkos::IndexType > range_type; + range_type range( {{ 0, 0, 0 }}, {{ N0, N1, N2 }}, {{ 3, 3, 3 }} ); + + TestMDRange_3D functor( N0, N1, N2 ); + + parallel_for( range, functor ); + + value_type sum = 0.0; + Kokkos::View< value_type, Kokkos::HostSpace > sum_view("sum_view"); + sum_view() = sum; + Kokkos::Experimental::Sum< value_type > reducer_view( sum_view ); + + parallel_reduce( range, functor, reducer_view); + sum = sum_view(); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 ); + } + + // Tagged operator test + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3, Iterate::Default, Iterate::Default >, Kokkos::IndexType, InitTag > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 2, 4, 6 } } ); + + TestMDRange_3D functor( N0, N1, N2 ); + + parallel_for( range, functor ); + + // check parallel_for results correct with InitTag + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view, functor.input_view ); + int counter = 0; + for ( int i = 0; i 
< N0; ++i ) + for ( int j = 0; j < N1; ++j ) + for ( int k = 0; k < N2; ++k ) + { + if ( h_view( i, j, k ) != 3 ) { + ++counter; + } + } + + if ( counter != 0 ) { + printf( "Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter ); + } + ASSERT_EQ( counter, 0 ); + + + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 9 * N0 * N1 * N2 ); + } + { typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3, Iterate::Default, Iterate::Default >, Kokkos::IndexType > range_type; typedef typename range_type::tile_type tile_type; @@ -507,9 +842,9 @@ struct TestMDRange_3D { TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 * N2 ); } @@ -523,9 +858,9 @@ struct TestMDRange_3D { TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 * N2 ); } @@ -539,9 +874,9 @@ struct TestMDRange_3D { TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 * N2 ); } @@ -555,9 +890,9 @@ struct TestMDRange_3D { TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 * N2 ); } @@ -571,9 +906,9 @@ struct TestMDRange_3D { TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 * N2 ); } @@ -590,7 +925,7 @@ struct TestMDRange_3D { range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } } ); TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -620,7 +955,7 @@ struct TestMDRange_3D { range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } ); TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -651,7 +986,7 @@ struct TestMDRange_3D { TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -681,7 +1016,7 @@ struct TestMDRange_3D { range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } ); TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -711,7 +1046,7 @@ struct TestMDRange_3D { range_type range( point_type{ { 0, 0, 0 } }, point_type{ { 
N0, N1, N2 } }, tile_type{ { 2, 4, 2 } } ); TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -741,7 +1076,7 @@ struct TestMDRange_3D { range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 5, 7 } } ); TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -771,7 +1106,7 @@ struct TestMDRange_3D { range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 8, 8, 8 } } ); TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -801,7 +1136,7 @@ struct TestMDRange_3D { range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 2, 4, 2 } } ); TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -832,6 +1167,7 @@ struct TestMDRange_4D { using HostViewType = typename ViewType::HostMirror; ViewType input_view; + using value_type = double; TestMDRange_4D( const DataType N0, const DataType N1, const DataType N2, const DataType N3 ) : input_view( "input_view", N0, N1, N2, N3 ) {} @@ -855,6 +1191,191 @@ struct TestMDRange_4D { input_view( i, j, k, l ) = 3; } + // reduction tagged operators + KOKKOS_INLINE_FUNCTION + void operator()( const InitTag &, const int i, const int j, const int k, const int l, value_type &lsum ) const + { + lsum += input_view( i, j, k, l ) * 3; + } + + static void test_reduce4( const int N0, const int N1, const int N2, const int N3 ) + { + using namespace Kokkos::Experimental; + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4>, Kokkos::IndexType > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 3, 3, 3 } } ); + + TestMDRange_4D functor( N0, N1, N2, N3 ); + + parallel_for( range, functor ); + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 ); + } + + // Test with reducers - scalar + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4>, Kokkos::IndexType > range_type; + range_type range( {{ 0, 0, 0, 0 }}, {{ N0, N1, N2, N3 }}, {{ 3, 3, 3, 3 }} ); + + TestMDRange_4D functor( N0, N1, N2, N3 ); + + parallel_for( range, functor ); + + value_type sum = 0.0; + Kokkos::Experimental::Sum< value_type > reducer_scalar( sum ); + + parallel_reduce( range, functor, reducer_scalar ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 ); + } + + // Test with reducers - scalar view + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4>, Kokkos::IndexType > range_type; + range_type range( {{ 0, 0, 0, 0 }}, {{ N0, N1, N2, N3 }}, {{ 3, 3, 3, 3 }} ); + + TestMDRange_4D functor( N0, N1, N2, N3 ); + + parallel_for( range, functor ); + + value_type sum = 0.0; + Kokkos::View< value_type, Kokkos::HostSpace > 
sum_view("sum_view"); + sum_view() = sum; + Kokkos::Experimental::Sum< value_type > reducer_view( sum_view ); + + parallel_reduce( range, functor, reducer_view); + sum = sum_view(); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 ); + } + + // Tagged operator test + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4, Iterate::Default, Iterate::Default >, Kokkos::IndexType, InitTag > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 2, 4, 6, 2 } } ); + + TestMDRange_4D functor( N0, N1, N2, N3 ); + + parallel_for( range, functor ); + + // check parallel_for results correct with InitTag + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view, functor.input_view ); + int counter = 0; + for ( int i = 0; i < N0; ++i ) + for ( int j = 0; j < N1; ++j ) + for ( int k = 0; k < N2; ++k ) + for ( int l = 0; l < N3; ++l ) + { + if ( h_view( i, j, k, l ) != 3 ) { + ++counter; + } + } + + if ( counter != 0 ) { + printf( "Defaults + InitTag op(): Errors in test_reduce4 parallel_for init; mismatches = %d\n\n", counter ); + } + ASSERT_EQ( counter, 0 ); + + + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 9 * N0 * N1 * N2 * N3 ); + } + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4, Iterate::Default, Iterate::Default >, Kokkos::IndexType > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 2, 4, 6, 2 } } ); + + TestMDRange_4D functor( N0, N1, N2, N3 ); + + parallel_for( range, functor ); + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 ); + } + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4, Iterate::Left, Iterate::Left>, Kokkos::IndexType > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 2, 4, 6, 2 } } ); + + TestMDRange_4D functor( N0, N1, N2, N3 ); + + parallel_for( range, functor ); + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 ); + } + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4, Iterate::Left, Iterate::Right>, Kokkos::IndexType > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 2, 4, 6, 2 } } ); + + TestMDRange_4D functor( N0, N1, N2, N3 ); + + parallel_for( range, functor ); + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 ); + } + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4, Iterate::Right, Iterate::Left>, Kokkos::IndexType > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 2, 4, 6, 2 } } ); + + TestMDRange_4D functor( N0, N1, N2, N3 ); + + parallel_for( range, functor ); + double sum = 
0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 ); + } + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4, Iterate::Right, Iterate::Right>, Kokkos::IndexType > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 2, 4, 6, 2 } } ); + + TestMDRange_4D functor( N0, N1, N2, N3 ); + + parallel_for( range, functor ); + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 ); + } + } // end test_reduce + + + static void test_for4( const int N0, const int N1, const int N2, const int N3 ) { using namespace Kokkos::Experimental; @@ -866,7 +1387,7 @@ struct TestMDRange_4D { range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } } ); TestMDRange_4D functor( N0, N1, N2, N3 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -897,7 +1418,7 @@ struct TestMDRange_4D { range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 11, 3, 3 } } ); TestMDRange_4D functor( N0, N1, N2, N3 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -929,7 +1450,7 @@ struct TestMDRange_4D { TestMDRange_4D functor( N0, N1, N2, N3 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -961,7 +1482,7 @@ struct TestMDRange_4D { TestMDRange_4D functor( N0, N1, N2, N3 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -993,7 +1514,7 @@ struct TestMDRange_4D { TestMDRange_4D functor( N0, N1, N2, N3 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1025,7 +1546,7 @@ struct TestMDRange_4D { TestMDRange_4D functor( N0, N1, N2, N3 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1057,7 +1578,7 @@ struct TestMDRange_4D { TestMDRange_4D functor( N0, N1, N2, N3 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1089,7 +1610,7 @@ struct TestMDRange_4D { TestMDRange_4D functor( N0, N1, N2, N3 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1121,6 +1642,7 @@ struct TestMDRange_5D { using HostViewType = typename ViewType::HostMirror; ViewType input_view; + using value_type = double; TestMDRange_5D( const DataType N0, const DataType N1, const DataType N2, const DataType N3, const DataType N4 ) : input_view( "input_view", N0, N1, N2, N3, N4 ) {} @@ 
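
The four Iterate combinations cycled through above fix the nesting order of the tiled loops: the first template argument of Rank selects the order in which tiles are visited, the second the traversal order within a tile, and Iterate::Default defers to the execution space's preferred layout (left-fastest on CUDA-like spaces, right-fastest on host spaces). Schematically, with ExecSpace as a generic placeholder:

    using Kokkos::Experimental::MDRangePolicy;
    using Kokkos::Experimental::Rank;
    using Kokkos::Experimental::Iterate;

    // Tiles visited left-fastest, entries within a tile visited right-fastest.
    typedef MDRangePolicy< ExecSpace, Rank< 2, Iterate::Left, Iterate::Right > > policy_lr;

    // Both orders chosen by the execution space.
    typedef MDRangePolicy< ExecSpace, Rank< 2 > > policy_default;
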
-1131,7 +1653,7 @@ struct TestMDRange_5D { } KOKKOS_INLINE_FUNCTION - void operator()( const int i, const int j, const int k, const int l, const int m, double &lsum ) const + void operator()( const int i, const int j, const int k, const int l, const int m, value_type &lsum ) const { lsum += input_view( i, j, k, l, m ) * 2; } @@ -1144,6 +1666,110 @@ struct TestMDRange_5D { input_view( i, j, k, l, m ) = 3; } + // reduction tagged operators + KOKKOS_INLINE_FUNCTION + void operator()( const InitTag &, const int i, const int j, const int k, const int l, const int m, value_type &lsum ) const + { + lsum += input_view( i, j, k, l, m ) * 3; + } + + static void test_reduce5( const int N0, const int N1, const int N2, const int N3, const int N4 ) + { + using namespace Kokkos::Experimental; + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<5>, Kokkos::IndexType > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 3 } } ); + + TestMDRange_5D functor( N0, N1, N2, N3, N4 ); + + parallel_for( range, functor ); + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 ); + } + + // Test with reducers - scalar + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<5>, Kokkos::IndexType > range_type; + range_type range( {{ 0, 0, 0, 0, 0 }}, {{ N0, N1, N2, N3, N4 }}, {{ 3, 3, 3, 3, 3 }} ); + + TestMDRange_5D functor( N0, N1, N2, N3, N4 ); + + parallel_for( range, functor ); + + value_type sum = 0.0; + Kokkos::Experimental::Sum< value_type > reducer_scalar( sum ); + + parallel_reduce( range, functor, reducer_scalar ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 ); + } + + // Test with reducers - scalar view + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<5>, Kokkos::IndexType > range_type; + range_type range( {{ 0, 0, 0, 0, 0 }}, {{ N0, N1, N2, N3, N4 }}, {{ 3, 3, 3, 3, 3 }} ); + + TestMDRange_5D functor( N0, N1, N2, N3, N4 ); + + parallel_for( range, functor ); + + value_type sum = 0.0; + Kokkos::View< value_type, Kokkos::HostSpace > sum_view("sum_view"); + sum_view() = sum; + Kokkos::Experimental::Sum< value_type > reducer_view( sum_view ); + + parallel_reduce( range, functor, reducer_view); + sum = sum_view(); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 ); + } + + // Tagged operator test + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<5, Iterate::Default, Iterate::Default >, Kokkos::IndexType, InitTag > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 2, 4, 6, 2, 2 } } ); + + TestMDRange_5D functor( N0, N1, N2, N3, N4 ); + + parallel_for( range, functor ); + + // check parallel_for results correct with InitTag + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view, functor.input_view ); + int counter = 0; + for ( int i = 0; i < N0; ++i ) + for ( int j = 0; j < N1; ++j ) + for ( int k = 0; k < N2; ++k ) + for ( int l = 0; l < N3; ++l ) + for ( int m = 0; m < N4; ++m ) + { + if ( h_view( i, j, k, l, m ) != 3 ) { + ++counter; + } + } + + if ( counter != 0 ) { + printf( "Defaults + InitTag op(): Errors in test_reduce5 parallel_for init; mismatches = 
%d\n\n", counter ); + } + ASSERT_EQ( counter, 0 ); + + + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 9 * N0 * N1 * N2 * N3 * N4 ); + } + } + static void test_for5( const int N0, const int N1, const int N2, const int N3, const int N4 ) { using namespace Kokkos::Experimental; @@ -1155,7 +1781,7 @@ struct TestMDRange_5D { range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } } ); TestMDRange_5D functor( N0, N1, N2, N3, N4 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1184,10 +1810,10 @@ struct TestMDRange_5D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 7 } } ); + range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 5 } } ); TestMDRange_5D functor( N0, N1, N2, N3, N4 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1220,7 +1846,7 @@ struct TestMDRange_5D { TestMDRange_5D functor( N0, N1, N2, N3, N4 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1253,7 +1879,7 @@ struct TestMDRange_5D { TestMDRange_5D functor( N0, N1, N2, N3, N4 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1286,7 +1912,7 @@ struct TestMDRange_5D { TestMDRange_5D functor( N0, N1, N2, N3, N4 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1319,7 +1945,7 @@ struct TestMDRange_5D { TestMDRange_5D functor( N0, N1, N2, N3, N4 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1352,7 +1978,7 @@ struct TestMDRange_5D { TestMDRange_5D functor( N0, N1, N2, N3, N4 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1385,7 +2011,7 @@ struct TestMDRange_5D { TestMDRange_5D functor( N0, N1, N2, N3, N4 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1418,6 +2044,7 @@ struct TestMDRange_6D { using HostViewType = typename ViewType::HostMirror; ViewType input_view; + using value_type = double; TestMDRange_6D( const DataType N0, const DataType N1, const DataType N2, const DataType N3, const DataType N4, const DataType N5 ) : input_view( "input_view", N0, N1, N2, N3, N4, N5 ) {} @@ -1428,7 +2055,7 @@ struct TestMDRange_6D { } KOKKOS_INLINE_FUNCTION - void operator()( const int i, const int j, const int k, const int l, const int m, const int n, double &lsum ) const + 
void operator()( const int i, const int j, const int k, const int l, const int m, const int n, value_type &lsum ) const { lsum += input_view( i, j, k, l, m, n ) * 2; } @@ -1441,6 +2068,111 @@ struct TestMDRange_6D { input_view( i, j, k, l, m, n ) = 3; } + // reduction tagged operators + KOKKOS_INLINE_FUNCTION + void operator()( const InitTag &, const int i, const int j, const int k, const int l, const int m, const int n, value_type &lsum ) const + { + lsum += input_view( i, j, k, l, m, n ) * 3; + } + + static void test_reduce6( const int N0, const int N1, const int N2, const int N3, const int N4, const int N5 ) + { + using namespace Kokkos::Experimental; + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<6>, Kokkos::IndexType > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 3, 2 } } ); + + TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); + + parallel_for( range, functor ); + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 * N5 ); + } + + // Test with reducers - scalar + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<6>, Kokkos::IndexType > range_type; + range_type range( {{ 0, 0, 0, 0, 0, 0 }}, {{ N0, N1, N2, N3, N4, N5 }}, {{ 3, 3, 3, 3, 3, 2 }} ); + + TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); + + parallel_for( range, functor ); + + value_type sum = 0.0; + Kokkos::Experimental::Sum< value_type > reducer_scalar( sum ); + + parallel_reduce( range, functor, reducer_scalar ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 * N5 ); + } + + // Test with reducers - scalar view + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<6>, Kokkos::IndexType > range_type; + range_type range( {{ 0, 0, 0, 0, 0, 0 }}, {{ N0, N1, N2, N3, N4, N5 }}, {{ 3, 3, 3, 3, 3, 2 }} ); + + TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); + + parallel_for( range, functor ); + + value_type sum = 0.0; + Kokkos::View< value_type, Kokkos::HostSpace > sum_view("sum_view"); + sum_view() = sum; + Kokkos::Experimental::Sum< value_type > reducer_view( sum_view ); + + parallel_reduce( range, functor, reducer_view); + sum = sum_view(); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 * N5 ); + } + + // Tagged operator test + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<6, Iterate::Default, Iterate::Default >, Kokkos::IndexType, InitTag > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 2, 4, 6, 2, 2, 2 } } ); + + TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); + + parallel_for( range, functor ); + + // check parallel_for results correct with InitTag + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view, functor.input_view ); + int counter = 0; + for ( int i = 0; i < N0; ++i ) + for ( int j = 0; j < N1; ++j ) + for ( int k = 0; k < N2; ++k ) + for ( int l = 0; l < N3; ++l ) + for ( int m = 0; m < N4; ++m ) + for ( int n = 0; n < N5; ++n ) + { + if ( h_view( i, j, k, l, m, n ) != 3 ) { + ++counter; + } + } + + if ( counter != 0 ) { + printf( "Defaults + InitTag op(): Errors in test_reduce6 parallel_for init; mismatches = %d\n\n", counter ); + } 
+ ASSERT_EQ( counter, 0 ); + + + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 9 * N0 * N1 * N2 * N3 * N4 * N5 ); + } + } + static void test_for6( const int N0, const int N1, const int N2, const int N3, const int N4, const int N5 ) { using namespace Kokkos::Experimental; @@ -1452,7 +2184,7 @@ struct TestMDRange_6D { range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } } ); TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1485,7 +2217,7 @@ struct TestMDRange_6D { range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 2, 3 } } ); //tile dims 3,3,3,3,3,3 more than cuda can handle with debugging TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1519,7 +2251,7 @@ struct TestMDRange_6D { TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1553,7 +2285,7 @@ struct TestMDRange_6D { TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1587,7 +2319,7 @@ struct TestMDRange_6D { TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1621,7 +2353,7 @@ struct TestMDRange_6D { TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1655,7 +2387,7 @@ struct TestMDRange_6D { TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1689,7 +2421,7 @@ struct TestMDRange_6D { TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1726,11 +2458,19 @@ TEST_F( TEST_CATEGORY , mdrange_for ) { TestMDRange_6D< TEST_EXECSPACE >::test_for6( 10, 10, 10, 10, 5, 5 ); } -#ifndef KOKKOS_ENABLE_CUDA TEST_F( TEST_CATEGORY , mdrange_reduce ) { TestMDRange_2D< TEST_EXECSPACE >::test_reduce2( 100, 100 ); TestMDRange_3D< TEST_EXECSPACE >::test_reduce3( 100, 10, 100 ); + TestMDRange_4D< TEST_EXECSPACE >::test_reduce4( 100, 10, 10, 10 ); + TestMDRange_5D< TEST_EXECSPACE >::test_reduce5( 100, 10, 10, 10, 5 ); + TestMDRange_6D< TEST_EXECSPACE >::test_reduce6( 100, 10, 10, 10, 5, 5 ); } -#endif + +//#ifndef KOKKOS_ENABLE_CUDA +TEST_F( TEST_CATEGORY , mdrange_array_reduce ) { + 
TestMDRange_ReduceArray_2D< TEST_EXECSPACE >::test_arrayreduce2( 4, 5 ); + TestMDRange_ReduceArray_3D< TEST_EXECSPACE >::test_arrayreduce3( 4, 5, 10 ); +} +//#endif } // namespace Test diff --git a/lib/kokkos/core/unit_test/TestMemoryPool.hpp b/lib/kokkos/core/unit_test/TestMemoryPool.hpp index 941cd6c26d..9f708390c2 100644 --- a/lib/kokkos/core/unit_test/TestMemoryPool.hpp +++ b/lib/kokkos/core/unit_test/TestMemoryPool.hpp @@ -54,6 +54,96 @@ namespace TestMemoryPool { +template< typename MemSpace = Kokkos::HostSpace > +void test_host_memory_pool_defaults() +{ + typedef typename MemSpace::execution_space Space ; + typedef typename Kokkos::MemoryPool< Space > MemPool ; + + { + const size_t MemoryCapacity = 32000 ; + const size_t MinBlockSize = 64 ; + const size_t MaxBlockSize = 1024 ; + const size_t SuperBlockSize = 4096 ; + + MemPool pool( MemSpace() + , MemoryCapacity + , MinBlockSize + , MaxBlockSize + , SuperBlockSize + ); + + typename MemPool::usage_statistics stats ; + + pool.get_usage_statistics( stats ); + + ASSERT_LE( MemoryCapacity , stats.capacity_bytes ); + ASSERT_LE( MinBlockSize , stats.min_block_bytes ); + ASSERT_LE( MaxBlockSize , stats.max_block_bytes ); + ASSERT_LE( SuperBlockSize , stats.superblock_bytes ); + } + + { + const size_t MemoryCapacity = 10000 ; + + MemPool pool( MemSpace() + , MemoryCapacity + ); + + typename MemPool::usage_statistics stats ; + + pool.get_usage_statistics( stats ); + + ASSERT_LE( MemoryCapacity , stats.capacity_bytes ); + ASSERT_LE( 64u /* default */ , stats.min_block_bytes ); + ASSERT_LE( stats.min_block_bytes , stats.max_block_bytes ); + ASSERT_LE( stats.max_block_bytes , stats.superblock_bytes ); + ASSERT_LE( stats.superblock_bytes , stats.capacity_bytes ); + } + + { + const size_t MemoryCapacity = 10000 ; + const size_t MinBlockSize = 32 ; // power of two is exact + + MemPool pool( MemSpace() + , MemoryCapacity + , MinBlockSize + ); + + typename MemPool::usage_statistics stats ; + + pool.get_usage_statistics( stats ); + + ASSERT_LE( MemoryCapacity , stats.capacity_bytes ); + ASSERT_EQ( MinBlockSize , stats.min_block_bytes ); + ASSERT_LE( stats.min_block_bytes , stats.max_block_bytes ); + ASSERT_LE( stats.max_block_bytes , stats.superblock_bytes ); + ASSERT_LE( stats.superblock_bytes , stats.capacity_bytes ); + } + + { + const size_t MemoryCapacity = 32000 ; + const size_t MinBlockSize = 32 ; // power of two is exact + const size_t MaxBlockSize = 1024 ; // power of two is exact + + MemPool pool( MemSpace() + , MemoryCapacity + , MinBlockSize + , MaxBlockSize + ); + + typename MemPool::usage_statistics stats ; + + pool.get_usage_statistics( stats ); + + ASSERT_LE( MemoryCapacity , stats.capacity_bytes ); + ASSERT_EQ( MinBlockSize , stats.min_block_bytes ); + ASSERT_EQ( MaxBlockSize , stats.max_block_bytes ); + ASSERT_LE( stats.max_block_bytes , stats.superblock_bytes ); + ASSERT_LE( stats.superblock_bytes , stats.capacity_bytes ); + } +} + template< typename MemSpace = Kokkos::HostSpace > void test_host_memory_pool_stats() { @@ -188,8 +278,8 @@ void print_memory_pool_stats << " bytes reserved = " << stats.reserved_bytes << std::endl << " bytes free = " << ( stats.capacity_bytes - ( stats.consumed_bytes + stats.reserved_bytes ) ) << std::endl - << " alloc used = " << stats.consumed_blocks << std::endl - << " alloc reserved = " << stats.reserved_blocks << std::endl + << " block used = " << stats.consumed_blocks << std::endl + << " block reserved = " << stats.reserved_blocks << std::endl << " super used = " << stats.consumed_superblocks << 
std::endl << " super reserved = " << ( stats.capacity_superblocks - stats.consumed_superblocks ) << std::endl @@ -302,15 +392,147 @@ void test_memory_pool_v2( const bool print_statistics //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -} // namespace TestMemoryPool { +template< class DeviceType > +struct TestMemoryPoolCorners { + + typedef Kokkos::View< uintptr_t * , DeviceType > ptrs_type ; + typedef Kokkos::MemoryPool< DeviceType > pool_type ; + + pool_type pool ; + ptrs_type ptrs ; + uint32_t size ; + uint32_t stride ; + + TestMemoryPoolCorners( const pool_type & arg_pool + , const ptrs_type & arg_ptrs + , const uint32_t arg_base + , const uint32_t arg_stride + ) + : pool( arg_pool ) + , ptrs( arg_ptrs ) + , size( arg_base ) + , stride( arg_stride ) + {} + + // Specify reduction argument value_type to + // avoid confusion with tag-dispatch. + + using value_type = long ; + + KOKKOS_INLINE_FUNCTION + void operator()( int i , long & err ) const noexcept + { + unsigned alloc_size = size << ( i % stride ); + if ( 0 == ptrs(i) ) { + ptrs(i) = (uintptr_t) pool.allocate( alloc_size ); + if ( ptrs(i) && ! alloc_size ) { ++err ; } + } + } + + struct TagDealloc {}; + + KOKKOS_INLINE_FUNCTION + void operator()( int i ) const noexcept + { + unsigned alloc_size = size << ( i % stride ); + if ( ptrs(i) ) { pool.deallocate( (void*) ptrs(i) , alloc_size ); } + ptrs(i) = 0 ; + } +}; + +template< class DeviceType > +void test_memory_pool_corners( const bool print_statistics + , const bool print_superblocks ) +{ + typedef typename DeviceType::memory_space memory_space ; + typedef typename DeviceType::execution_space execution_space ; + typedef Kokkos::MemoryPool< DeviceType > pool_type ; + typedef TestMemoryPoolCorners< DeviceType > functor_type ; + typedef typename functor_type::ptrs_type ptrs_type ; + + { + // superblock size 1 << 14 + const size_t min_superblock_size = 1u << 14 ; + + // four superblocks + const size_t total_alloc_size = min_superblock_size * 4 ; + + // block sizes { 64 , 128 , 256 , 512 } + // block counts { 256 , 128 , 64 , 32 } + const unsigned min_block_size = 64 ; + const unsigned max_block_size = 512 ; + const unsigned num_blocks = 480 ; + + pool_type pool( memory_space() + , total_alloc_size + , min_block_size + , max_block_size + , min_superblock_size ); + + // Allocate one block from each superblock to lock that + // superblock into the block size. 
+ + ptrs_type ptrs("ptrs",num_blocks); + + long err = 0 ; + + Kokkos::parallel_reduce + ( Kokkos::RangePolicy< execution_space >(0,4) + , functor_type( pool , ptrs , 64 , 4 ) + , err + ); + + if ( print_statistics || err ) { + + typename pool_type::usage_statistics stats ; + + pool.get_usage_statistics( stats ); + + print_memory_pool_stats< pool_type >( stats ); + } + + if ( print_superblocks || err ) { + pool.print_state( std::cout ); + } + + // Now fill remaining allocations with small size + + Kokkos::parallel_reduce + ( Kokkos::RangePolicy< execution_space >(0,num_blocks) + , functor_type( pool , ptrs , 64 , 1 ) + , err + ); + + if ( print_statistics || err ) { + + typename pool_type::usage_statistics stats ; + + pool.get_usage_statistics( stats ); + + print_memory_pool_stats< pool_type >( stats ); + } + + if ( print_superblocks || err ) { + pool.print_state( std::cout ); + } + } +} + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +} // namespace TestMemoryPool namespace Test { TEST_F( TEST_CATEGORY, memory_pool ) { + TestMemoryPool::test_host_memory_pool_defaults<>(); TestMemoryPool::test_host_memory_pool_stats<>(); TestMemoryPool::test_memory_pool_v2< TEST_EXECSPACE >(false,false); + TestMemoryPool::test_memory_pool_corners< TEST_EXECSPACE >(false,false); } + } #endif diff --git a/lib/kokkos/core/unit_test/TestRange.hpp b/lib/kokkos/core/unit_test/TestRange.hpp index f55574761b..3cea1ad4a0 100644 --- a/lib/kokkos/core/unit_test/TestRange.hpp +++ b/lib/kokkos/core/unit_test/TestRange.hpp @@ -72,8 +72,33 @@ struct TestRange { typename view_type::HostMirror host_flags = Kokkos::create_mirror_view( m_flags ); Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace, ScheduleType >( 0, N ), *this ); + +#if defined(KOKKOS_ENABLE_PROFILING) + { + typedef TestRange< ExecSpace, ScheduleType > ThisType; + std::string label("parallel_for"); + Kokkos::Impl::ParallelConstructName< ThisType, void> pcn(label); + ASSERT_EQ( pcn.get(), label ); + std::string empty_label(""); + Kokkos::Impl::ParallelConstructName< ThisType, void> empty_pcn(empty_label); + ASSERT_EQ( empty_pcn.get(), typeid(ThisType).name() ); + } +#endif + Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace, ScheduleType, VerifyInitTag >( 0, N ), *this ); +#if defined(KOKKOS_ENABLE_PROFILING) + { + typedef TestRange< ExecSpace, ScheduleType > ThisType; + std::string label("parallel_for"); + Kokkos::Impl::ParallelConstructName< ThisType, VerifyInitTag> pcn(label); + ASSERT_EQ( pcn.get(), label ); + std::string empty_label(""); + Kokkos::Impl::ParallelConstructName< ThisType, VerifyInitTag> empty_pcn(empty_label); + ASSERT_EQ( empty_pcn.get(), std::string(typeid(ThisType).name()) + "/" + typeid(VerifyInitTag).name() ); + } +#endif + Kokkos::deep_copy( host_flags, m_flags ); int error_count = 0; diff --git a/lib/kokkos/core/unit_test/TestResize.hpp b/lib/kokkos/core/unit_test/TestResize.hpp new file mode 100644 index 0000000000..aaf0422b19 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestResize.hpp @@ -0,0 +1,140 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef TESTVIEWSUBVIEW_HPP_ +#define TESTVIEWSUBVIEW_HPP_ + +#include <gtest/gtest.h> +#include <Kokkos_Core.hpp> + +namespace TestViewResize { + +template <class DeviceType> +void testResize () +{ + const int sizes[8] = {2, 3, 4, 5, 6, 7, 8, 9}; + + // Check #904 fix (no reallocation if dimensions didn't change).
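Each block below asserts the contract restored by the #904 fix: Kokkos::resize with unchanged extents must reuse the existing allocation, so view.data() stays stable. A one-view sketch of that contract, assuming a host-accessible memory space:

Kokkos::View< int*, Kokkos::HostSpace > v( "v", 10 );
const int * const before = v.data();

Kokkos::resize( v, 10 );  // same extent: allocation reused, v.data() == before
Kokkos::resize( v, 20 );  // larger extent: reallocates, preserving the first 10 entries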
+ { + typedef Kokkos::View<int*, DeviceType> view_type; + view_type view_1d ("view_1d", sizes[0]); + const int* oldPointer = view_1d.data (); + EXPECT_TRUE( oldPointer != NULL ); + Kokkos::resize (view_1d, sizes[0]); + const int* newPointer = view_1d.data (); + EXPECT_TRUE( oldPointer == newPointer ); + } + { + typedef Kokkos::View<int**, DeviceType> view_type; + view_type view_2d ("view_2d", sizes[0], sizes[1]); + const int* oldPointer = view_2d.data (); + EXPECT_TRUE( oldPointer != NULL ); + Kokkos::resize (view_2d, sizes[0], sizes[1]); + const int* newPointer = view_2d.data (); + EXPECT_TRUE( oldPointer == newPointer ); + } + { + typedef Kokkos::View<int***, DeviceType> view_type; + view_type view_3d ("view_3d", sizes[0], sizes[1], sizes[2]); + const int* oldPointer = view_3d.data (); + EXPECT_TRUE( oldPointer != NULL ); + Kokkos::resize (view_3d, sizes[0], sizes[1], sizes[2]); + const int* newPointer = view_3d.data (); + EXPECT_TRUE( oldPointer == newPointer ); + } + { + typedef Kokkos::View<int****, DeviceType> view_type; + view_type view_4d ("view_4d", sizes[0], sizes[1], sizes[2], sizes[3]); + const int* oldPointer = view_4d.data (); + EXPECT_TRUE( oldPointer != NULL ); + Kokkos::resize (view_4d, sizes[0], sizes[1], sizes[2], sizes[3]); + const int* newPointer = view_4d.data (); + EXPECT_TRUE( oldPointer == newPointer ); + } + { + typedef Kokkos::View<int*****, DeviceType> view_type; + view_type view_5d ("view_5d", sizes[0], sizes[1], sizes[2], sizes[3], + sizes[4]); + const int* oldPointer = view_5d.data (); + EXPECT_TRUE( oldPointer != NULL ); + Kokkos::resize (view_5d, sizes[0], sizes[1], sizes[2], sizes[3], sizes[4]); + const int* newPointer = view_5d.data (); + EXPECT_TRUE( oldPointer == newPointer ); + } + { + typedef Kokkos::View<int******, DeviceType> view_type; + view_type view_6d ("view_6d", sizes[0], sizes[1], sizes[2], sizes[3], + sizes[4], sizes[5]); + const int* oldPointer = view_6d.data (); + EXPECT_TRUE( oldPointer != NULL ); + Kokkos::resize (view_6d, sizes[0], sizes[1], sizes[2], sizes[3], sizes[4], + sizes[5]); + const int* newPointer = view_6d.data (); + EXPECT_TRUE( oldPointer == newPointer ); + } + { + typedef Kokkos::View<int*******, DeviceType> view_type; + view_type view_7d ("view_7d", sizes[0], sizes[1], sizes[2], sizes[3], + sizes[4], sizes[5], sizes[6]); + const int* oldPointer = view_7d.data (); + EXPECT_TRUE( oldPointer != NULL ); + Kokkos::resize (view_7d, sizes[0], sizes[1], sizes[2], sizes[3], sizes[4], + sizes[5], sizes[6]); + const int* newPointer = view_7d.data (); + EXPECT_TRUE( oldPointer == newPointer ); + } + { + typedef Kokkos::View<int********, DeviceType> view_type; + view_type view_8d ("view_8d", sizes[0], sizes[1], sizes[2], sizes[3], + sizes[4], sizes[5], sizes[6], sizes[7]); + const int* oldPointer = view_8d.data (); + EXPECT_TRUE( oldPointer != NULL ); + Kokkos::resize (view_8d, sizes[0], sizes[1], sizes[2], sizes[3], sizes[4], + sizes[5], sizes[6], sizes[7]); + const int* newPointer = view_8d.data (); + EXPECT_TRUE( oldPointer == newPointer ); + } +} + +} // namespace TestViewResize + +#endif // TESTVIEWSUBVIEW_HPP_ diff --git a/lib/kokkos/core/unit_test/TestTaskScheduler.hpp b/lib/kokkos/core/unit_test/TestTaskScheduler.hpp index 3a88475620..4e66543857 100644 --- a/lib/kokkos/core/unit_test/TestTaskScheduler.hpp +++ b/lib/kokkos/core/unit_test/TestTaskScheduler.hpp @@ -250,13 +250,21 @@ struct TestTaskDependence { const int n = CHUNK < m_count ?
CHUNK : m_count; if ( 1 < m_count ) { - future_type f[ CHUNK ]; + // Test use of memory pool for temporary allocation: + + // Raw allocation: + future_type * const f = + (future_type *) m_sched.memory()->allocate( sizeof(future_type) * n ); + + // In-place construction: + for ( int i = 0; i < n; ++i ) new(f+i) future_type(); const int inc = ( m_count + n - 1 ) / n; for ( int i = 0; i < n; ++i ) { long begin = i * inc; long count = begin + inc < m_count ? inc : m_count - begin; + f[i] = Kokkos::task_spawn( Kokkos::TaskSingle( m_sched ) , TestTaskDependence( count, m_sched, m_accum ) ); } @@ -264,6 +272,12 @@ struct TestTaskDependence { m_count = 0; Kokkos::respawn( this, Kokkos::when_all( f, n ) ); + + // In-place destruction to release future: + for ( int i = 0; i < n; ++i ) (f+i)->~future_type(); + + // Raw deallocation: + m_sched.memory()->deallocate( f , sizeof(future_type) * n ); } else if ( 1 == m_count ) { Kokkos::atomic_increment( & m_accum() ); @@ -641,19 +655,12 @@ namespace Test { TEST_F( TEST_CATEGORY, task_fib ) { - const int N = 24 ; // 25 triggers tbd bug on Cuda/Pascal + const int N = 27 ; for ( int i = 0; i < N; ++i ) { - TestTaskScheduler::TestFib< TEST_EXECSPACE >::run( i , ( i + 1 ) * ( i + 1 ) * 10000 ); + TestTaskScheduler::TestFib< TEST_EXECSPACE >::run( i , ( i + 1 ) * ( i + 1 ) * 2000 ); } } -#if defined(KOKKOS_ARCH_MAXWELL) || defined(KOKKOS_ARCH_PASCAL) - // TODO: Resolve bug in task DAG for Pascal - #define KOKKOS_IMPL_DISABLE_UNIT_TEST_TASK_DAG_PASCAL -#endif - -#ifndef KOKKOS_IMPL_DISABLE_UNIT_TEST_TASK_DAG_PASCAL - TEST_F( TEST_CATEGORY, task_depend ) { for ( int i = 0; i < 25; ++i ) { @@ -667,11 +674,8 @@ TEST_F( TEST_CATEGORY, task_team ) //TestTaskScheduler::TestTaskTeamValue< TEST_EXECSPACE >::run( 1000 ); // Put back after testing. 
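The TestTaskDependence hunk above trades the former fixed-size stack array of futures for storage drawn from the scheduler's memory pool. Since the pool hands back raw bytes, object lifetime is managed by hand: placement-new each future, use them, invoke the destructors explicitly (which releases each future's task reference), then return the bytes. The same discipline in a self-contained, Kokkos-free sketch, with std::string standing in for future_type:

#include <cstdlib>
#include <new>
#include <string>

int main()
{
  typedef std::string elem_t ;
  const int n = 4 ;

  // Raw allocation, analogous to m_sched.memory()->allocate( sizeof(future_type) * n ):
  elem_t * const f = (elem_t *) std::malloc( sizeof(elem_t) * n );

  for ( int i = 0 ; i < n ; ++i ) new( f + i ) elem_t( "pending" );  // construct in place

  // ... use f[0] .. f[n-1] ...

  for ( int i = 0 ; i < n ; ++i ) ( f + i )->~elem_t();  // destroy in place

  // Raw deallocation, analogous to m_sched.memory()->deallocate( f , ... ):
  std::free( f );
  return 0 ;
}

Note the ordering in the test itself: the futures are destroyed only after Kokkos::respawn( this, Kokkos::when_all( f, n ) ), which works because the when_all future holds its own references to the spawned tasks.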
} -#else //ndef KOKKOS_IMPL_DISABLE_UNIT_TEST_TASK_DAG_PASCAL -#undef KOKKOS_IMPL_DISABLE_UNIT_TEST_TASK_DAG_PASCAL -#endif //ndef KOKKOS_IMPL_DISABLE_UNIT_TEST_TASK_DAG_PASCAL - } + #endif // #if defined( KOKKOS_ENABLE_TASKDAG ) #endif // #ifndef KOKKOS_UNITTEST_TASKSCHEDULER_HPP diff --git a/lib/kokkos/core/unit_test/TestTeamVector.hpp b/lib/kokkos/core/unit_test/TestTeamVector.hpp index e9e2f7548a..7f4663d0f9 100644 --- a/lib/kokkos/core/unit_test/TestTeamVector.hpp +++ b/lib/kokkos/core/unit_test/TestTeamVector.hpp @@ -838,6 +838,18 @@ public: }, result ); const ScalarType solution = (ScalarType) nrows * (ScalarType) ncols; + + if ( int64_t(solution) != int64_t(result) ) { + printf( " TestTripleNestedReduce failed solution(%ld) != result(%ld), nrows(%d) ncols(%d) league_size(%d) team_size(%d)\n" + , int64_t(solution) + , int64_t(result) + , int32_t(nrows) + , int32_t(ncols) + , int32_t(nrows/chunk_size) + , int32_t(team_size) + ); + } + ASSERT_EQ( solution, result ); } }; diff --git a/lib/kokkos/core/unit_test/TestTile.hpp b/lib/kokkos/core/unit_test/TestTile.hpp index 8f57dfea75..f15667322f 100644 --- a/lib/kokkos/core/unit_test/TestTile.hpp +++ b/lib/kokkos/core/unit_test/TestTile.hpp @@ -94,7 +94,7 @@ struct ReduceTileErrors const size_t jtile = iwork / tile_dim0; if ( jtile < tile_dim1 ) { - tile_type tile = Kokkos::Experimental::tile_subview( m_array, itile, jtile ); + tile_type tile = Kokkos::tile_subview( m_array, itile, jtile ); if ( tile( 0, 0 ) != ptrdiff_t( ( itile + jtile * tile_dim0 ) * TileLayout::N0 * TileLayout::N1 ) ) { ++errors; diff --git a/lib/kokkos/core/unit_test/TestUniqueToken.hpp b/lib/kokkos/core/unit_test/TestUniqueToken.hpp new file mode 100644 index 0000000000..28add61a8a --- /dev/null +++ b/lib/kokkos/core/unit_test/TestUniqueToken.hpp @@ -0,0 +1,138 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +namespace Test { + +template< class Space > +class TestUniqueToken +{ +public: + typedef typename Space::execution_space execution_space; + typedef Kokkos::View< int * , execution_space > view_type ; + + Kokkos::Experimental::UniqueToken< execution_space , Kokkos::Experimental::UniqueTokenScope::Global > tokens ; + + view_type verify ; + view_type counts ; + view_type errors ; + + KOKKOS_INLINE_FUNCTION + void operator()( long ) const + { + const int32_t t = tokens.acquire(); + + bool ok = true ; + + ok = ok && 0 <= t ; + ok = ok && t < tokens.size(); + ok = ok && 0 == Kokkos::atomic_fetch_add( & verify(t) , 1 ); + + Kokkos::atomic_fetch_add( & counts(t) , 1 ); + + ok = ok && 1 == Kokkos::atomic_fetch_add( & verify(t) , -1 ); + + if ( ! ok ) { Kokkos::atomic_fetch_add( & errors(0) , 1 ) ; } + + tokens.release(t); + } + + TestUniqueToken() + : tokens( execution_space() ) + , verify( "TestUniqueTokenVerify" , tokens.size() ) + , counts( "TestUniqueTokenCounts" , tokens.size() ) + , errors( "TestUniqueTokenErrors" , 1 ) + {} + + static void run() + { + using policy = Kokkos::RangePolicy<execution_space> ; + + TestUniqueToken self ; + + { + const int duplicate = 100 ; + const long n = duplicate * self.tokens.size(); + + Kokkos::parallel_for( policy(0,n) , self ); + Kokkos::parallel_for( policy(0,n) , self ); + Kokkos::parallel_for( policy(0,n) , self ); + Kokkos::fence(); + } + + typename view_type::HostMirror host_counts = + Kokkos::create_mirror_view( self.counts ); + + Kokkos::deep_copy( host_counts , self.counts ); + + int32_t max = 0 ; + + { + const long n = host_counts.extent(0); + for ( long i = 0 ; i < n ; ++i ) { + if ( max < host_counts[i] ) max = host_counts[i] ; + } + } + + std::cout << "TestUniqueToken max reuse = " << max << std::endl ; + + typename view_type::HostMirror host_errors = + Kokkos::create_mirror_view( self.errors ); + + Kokkos::deep_copy( host_errors , self.errors ); + + ASSERT_EQ( host_errors(0) , 0 ); + } +}; + + +TEST_F( TEST_CATEGORY, unique_token ) +{ + TestUniqueToken< TEST_EXECSPACE >::run(); +} + +} // namespace Test + diff --git a/lib/kokkos/core/unit_test/TestViewCtorPropEmbeddedDim.hpp b/lib/kokkos/core/unit_test/TestViewCtorPropEmbeddedDim.hpp new file mode 100644 index 0000000000..305ddb2a1d --- /dev/null +++ b/lib/kokkos/core/unit_test/TestViewCtorPropEmbeddedDim.hpp @@ -0,0 +1,160 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1.
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#include <type_traits> +#include <typeinfo> + +namespace Test { + +namespace { + +template <typename ExecSpace> +struct TestViewCtorProp_EmbeddedDim { + + using ViewIntType = typename Kokkos::View< int**, ExecSpace >; + using ViewDoubleType = typename Kokkos::View< double*, ExecSpace >; + + // Cuda 7.0 has issues with using a lambda in parallel_for to initialize the view - replace with this functor + template < class ViewType > + struct Functor { + + ViewType v; + + Functor( const ViewType & v_ ) : v(v_) {} + + KOKKOS_INLINE_FUNCTION + void operator()( const int i ) const { + v(i) = i; + } + + }; + + + static void test_vcpt( const int N0, const int N1 ) + { + + // Create views to test + { + using VIT = typename TestViewCtorProp_EmbeddedDim::ViewIntType ; + using VDT = typename TestViewCtorProp_EmbeddedDim::ViewDoubleType ; + + VIT vi1("vi1", N0, N1); + VDT vd1("vd1", N0); + + // TEST: Test for common type between two views, one with type double, other with type int + // Deduce common value_type and construct a view with that type + { + // Two views + auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1, vd1); + typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType; + typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT; + typedef typename CVT::HostMirror HostCVT; + + // Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg + CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 ); + + Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1), + Functor<CVT>(cv1) + ); + + HostCVT hcv1 = Kokkos::create_mirror_view( cv1 ); + Kokkos::deep_copy( hcv1, cv1 ); + + ASSERT_EQ( (std::is_same< CommonViewValueType, double >::value) , true ) ; + #if 0 + // debug output + for ( int i = 0; i < N0*N1; ++i ) { + printf(" Output check: hcv1(%d) = %lf\n ", i, hcv1(i) ); + } + + printf( " Common value type view: %s \n", typeid( CVT() ).name() ); + printf( " Common value type: %s \n", typeid( CommonViewValueType() ).name() ); + if (
std::is_same< CommonViewValueType, double >::value == true ) { + printf("Proper common value_type\n"); + } + else { + printf("WRONG common value_type\n"); + } + // end debug output + #endif + } + + { + // Single view + auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1); + typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType; + typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT; + typedef typename CVT::HostMirror HostCVT; + + // Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg + CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 ); + + Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1), + Functor<CVT>(cv1) + ); + + HostCVT hcv1 = Kokkos::create_mirror_view( cv1 ); + Kokkos::deep_copy( hcv1, cv1 ); + + ASSERT_EQ( (std::is_same< CommonViewValueType, int>::value) , true ) ; + } + + } + + } // end test_vcpt + +}; // end struct + +} // namespace + +TEST_F( TEST_CATEGORY , viewctorprop_embedded_dim ) { + TestViewCtorProp_EmbeddedDim< TEST_EXECSPACE >::test_vcpt( 2, 3 ); +} + +} // namespace Test diff --git a/lib/kokkos/core/unit_test/TestViewMapping_a.hpp b/lib/kokkos/core/unit_test/TestViewMapping_a.hpp index 6830c2e049..810ae72e73 100644 --- a/lib/kokkos/core/unit_test/TestViewMapping_a.hpp +++ b/lib/kokkos/core/unit_test/TestViewMapping_a.hpp @@ -56,24 +56,24 @@ void test_view_mapping() { typedef typename Space::execution_space ExecSpace; - typedef Kokkos::Experimental::Impl::ViewDimension<> dim_0; - typedef Kokkos::Experimental::Impl::ViewDimension< 2 > dim_s2; - typedef Kokkos::Experimental::Impl::ViewDimension< 2, 3 > dim_s2_s3; - typedef Kokkos::Experimental::Impl::ViewDimension< 2, 3, 4 > dim_s2_s3_s4; + typedef Kokkos::Impl::ViewDimension<> dim_0; + typedef Kokkos::Impl::ViewDimension< 2 > dim_s2; + typedef Kokkos::Impl::ViewDimension< 2, 3 > dim_s2_s3; + typedef Kokkos::Impl::ViewDimension< 2, 3, 4 > dim_s2_s3_s4; - typedef Kokkos::Experimental::Impl::ViewDimension< 0 > dim_s0; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 3 > dim_s0_s3; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 3, 4 > dim_s0_s3_s4; + typedef Kokkos::Impl::ViewDimension< 0 > dim_s0; + typedef Kokkos::Impl::ViewDimension< 0, 3 > dim_s0_s3; + typedef Kokkos::Impl::ViewDimension< 0, 3, 4 > dim_s0_s3_s4; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0 > dim_s0_s0; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 4 > dim_s0_s0_s4; + typedef Kokkos::Impl::ViewDimension< 0, 0 > dim_s0_s0; + typedef Kokkos::Impl::ViewDimension< 0, 0, 4 > dim_s0_s0_s4; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0 > dim_s0_s0_s0; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0, 0 > dim_s0_s0_s0_s0; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0_s0; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0_s0_s0; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0, 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0_s0_s0_s0; + typedef Kokkos::Impl::ViewDimension< 0, 0, 0 > dim_s0_s0_s0; + typedef Kokkos::Impl::ViewDimension< 0, 0, 0, 0 > dim_s0_s0_s0_s0; + typedef Kokkos::Impl::ViewDimension< 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0; + typedef Kokkos::Impl::ViewDimension< 0, 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0_s0; + typedef Kokkos::Impl::ViewDimension< 0, 0, 0, 0, 0, 0, 0 >
dim_s0_s0_s0_s0_s0_s0_s0; + typedef Kokkos::Impl::ViewDimension< 0, 0, 0, 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0_s0_s0_s0; // Fully static dimensions should not be larger than an int. ASSERT_LE( sizeof( dim_0 ), sizeof( int ) ); @@ -186,12 +186,12 @@ void test_view_mapping() //---------------------------------------- - typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s0, Kokkos::LayoutStride > stride_s0_s0_s0; + typedef Kokkos::Impl::ViewOffset< dim_s0_s0_s0, Kokkos::LayoutStride > stride_s0_s0_s0; //---------------------------------------- // Static dimension. { - typedef Kokkos::Experimental::Impl::ViewOffset< dim_s2_s3_s4, Kokkos::LayoutLeft > left_s2_s3_s4; + typedef Kokkos::Impl::ViewOffset< dim_s2_s3_s4, Kokkos::LayoutLeft > left_s2_s3_s4; ASSERT_EQ( sizeof( left_s2_s3_s4 ), sizeof( dim_s2_s3_s4 ) ); @@ -223,7 +223,7 @@ void test_view_mapping() //---------------------------------------- // Small dimension is unpadded. { - typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutLeft > left_s0_s0_s4; + typedef Kokkos::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutLeft > left_s0_s0_s4; left_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >() , Kokkos::LayoutLeft( 2, 3, 0, 0, 0, 0, 0, 0 ) ); @@ -275,7 +275,7 @@ void test_view_mapping() constexpr int N0 = 2000; constexpr int N1 = 300; - typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutLeft > left_s0_s0_s4; + typedef Kokkos::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutLeft > left_s0_s0_s4; left_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >() , Kokkos::LayoutLeft( N0, N1, 0, 0, 0, 0, 0, 0 ) ); @@ -314,7 +314,7 @@ void test_view_mapping() //---------------------------------------- // Static dimension. { - typedef Kokkos::Experimental::Impl::ViewOffset< dim_s2_s3_s4, Kokkos::LayoutRight > right_s2_s3_s4; + typedef Kokkos::Impl::ViewOffset< dim_s2_s3_s4, Kokkos::LayoutRight > right_s2_s3_s4; ASSERT_EQ( sizeof( right_s2_s3_s4 ), sizeof( dim_s2_s3_s4 ) ); @@ -350,7 +350,7 @@ void test_view_mapping() //---------------------------------------- // Small dimension is unpadded. { - typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutRight > right_s0_s0_s4; + typedef Kokkos::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutRight > right_s0_s0_s4; right_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >() , Kokkos::LayoutRight( 2, 3, 0, 0, 0, 0, 0, 0 ) ); @@ -391,7 +391,7 @@ void test_view_mapping() constexpr int N0 = 2000; constexpr int N1 = 300; - typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutRight > right_s0_s0_s4; + typedef Kokkos::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutRight > right_s0_s0_s4; right_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >() , Kokkos::LayoutRight( N0, N1, 0, 0, 0, 0, 0, 0 ) ); @@ -431,18 +431,18 @@ void test_view_mapping() // Subview. 
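These hunks track the subview machinery (SubviewExtents, ViewOffset, ALL) as it is promoted out of Kokkos::Experimental. The rank-4-to-rank-3 mapping exercised below corresponds, at the user level, to an ordinary Kokkos::subview call; a sketch with made-up extents:

Kokkos::View< double****, Kokkos::HostSpace > a( "a", 10, 20, 30, 40 );

auto s = Kokkos::subview( a
                        , 5                                  // single index: rank dropped
                        , Kokkos::ALL                        // full extent: 20
                        , std::pair< int, int >( 7, 17 )     // half-open range: extent 10
                        , Kokkos::pair< int, int >( 10, 30 ) // extent 20
                        );
// s has rank 3 and extents { 20, 10, 20 }; std::pair and Kokkos::pair are interchangeable here.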
{ // Mapping rank 4 to rank 3 - typedef Kokkos::Experimental::Impl::SubviewExtents< 4, 3 > SubviewExtents; + typedef Kokkos::Impl::SubviewExtents< 4, 3 > SubviewExtents; constexpr int N0 = 1000; constexpr int N1 = 2000; constexpr int N2 = 3000; constexpr int N3 = 4000; - Kokkos::Experimental::Impl::ViewDimension< N0, N1, N2, N3 > dim; + Kokkos::Impl::ViewDimension< N0, N1, N2, N3 > dim; SubviewExtents tmp( dim , N0 / 2 - , Kokkos::Experimental::ALL + , Kokkos::ALL , std::pair< int, int >( N2 / 4, 10 + N2 / 4 ) , Kokkos::pair< int, int >( N3 / 4, 20 + N3 / 4 ) ); @@ -469,12 +469,12 @@ void test_view_mapping() constexpr int sub_N1 = 200; constexpr int sub_N2 = 4; - typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutLeft > left_s0_s0_s4; + typedef Kokkos::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutLeft > left_s0_s0_s4; left_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >() , Kokkos::LayoutLeft( N0, N1, 0, 0, 0, 0, 0, 0 ) ); - Kokkos::Experimental::Impl::SubviewExtents< 3, 3 > + Kokkos::Impl::SubviewExtents< 3, 3 > sub( dyn_off3.m_dim , Kokkos::pair< int, int >( 0, sub_N0 ) , Kokkos::pair< int, int >( 0, sub_N1 ) @@ -509,12 +509,12 @@ void test_view_mapping() constexpr int sub_N1 = 200; constexpr int sub_N2 = 4; - typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutRight > right_s0_s0_s4; + typedef Kokkos::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutRight > right_s0_s0_s4; right_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >() , Kokkos::LayoutRight( N0, N1, 0, 0, 0, 0, 0, 0 ) ); - Kokkos::Experimental::Impl::SubviewExtents< 3, 3 > + Kokkos::Impl::SubviewExtents< 3, 3 > sub( dyn_off3.m_dim , Kokkos::pair< int, int >( 0, sub_N0 ) , Kokkos::pair< int, int >( 0, sub_N1 ) @@ -544,7 +544,7 @@ void test_view_mapping() //---------------------------------------- // View data analysis. 
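The ViewArrayAnalysis and rank_dynamic checks that follow encode the View data-type convention: each leading '*' is a runtime extent, each trailing [N] a compile-time extent, and only the former count toward the dynamic rank. A sketch of the same bookkeeping at the View level (the rank constants are the ones these traits compute):

typedef Kokkos::View< int**[4][5][6], Kokkos::HostSpace > v_type ;

static_assert( v_type::rank == 5 , "two runtime plus three compile-time extents" );
static_assert( v_type::rank_dynamic == 2 , "only the '*' extents are set at run time" );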
{ - using namespace Kokkos::Experimental::Impl; + using namespace Kokkos::Impl; static_assert( rank_dynamic<>::value == 0, "" ); static_assert( rank_dynamic< 1 >::value == 0, "" ); @@ -554,7 +554,7 @@ void test_view_mapping() } { - using namespace Kokkos::Experimental::Impl; + using namespace Kokkos::Impl; typedef ViewArrayAnalysis< int[] > a_int_r1; typedef ViewArrayAnalysis< int**[4][5][6] > a_int_r5; @@ -598,7 +598,7 @@ void test_view_mapping() } { - using namespace Kokkos::Experimental::Impl; + using namespace Kokkos::Impl; typedef int t_i4[4]; @@ -616,12 +616,12 @@ void test_view_mapping() } { - using namespace Kokkos::Experimental::Impl; + using namespace Kokkos::Impl; typedef ViewDataAnalysis< const int[], void > a_const_int_r1; static_assert( std::is_same< typename a_const_int_r1::specialize, void >::value, "" ); - static_assert( std::is_same< typename a_const_int_r1::dimension, Kokkos::Experimental::Impl::ViewDimension<0> >::value, "" ); + static_assert( std::is_same< typename a_const_int_r1::dimension, Kokkos::Impl::ViewDimension<0> >::value, "" ); static_assert( std::is_same< typename a_const_int_r1::type, const int * >::value, "" ); static_assert( std::is_same< typename a_const_int_r1::value_type, const int >::value, "" ); @@ -637,7 +637,7 @@ void test_view_mapping() static_assert( std::is_same< typename a_const_int_r3::specialize, void >::value, "" ); - static_assert( std::is_same< typename a_const_int_r3::dimension, Kokkos::Experimental::Impl::ViewDimension<0, 0, 4> >::value, "" ); + static_assert( std::is_same< typename a_const_int_r3::dimension, Kokkos::Impl::ViewDimension<0, 0, 4> >::value, "" ); static_assert( std::is_same< typename a_const_int_r3::type, const int**[4] >::value, "" ); static_assert( std::is_same< typename a_const_int_r3::value_type, const int >::value, "" ); @@ -786,7 +786,7 @@ void test_view_mapping() // The execution space of the memory space must be available for view data initialization. 
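The remaining hunks in this file promote create_mirror, create_mirror_view, deep_copy, resize, and realloc from Kokkos::Experimental to the top-level namespace; the call sites are otherwise unchanged. The idiom they exercise is the standard mirror round trip, sketched here assuming a device-enabled build:

Kokkos::View< int*, Kokkos::DefaultExecutionSpace > d( "d", 100 );

// Host mirror; create_mirror_view aliases 'd' directly when the
// memory space is already host-accessible:
auto h = Kokkos::create_mirror_view( d );

for ( int i = 0 ; i < 100 ; ++i ) h( i ) = i ;  // fill on the host
Kokkos::deep_copy( d , h );                     // host -> device
Kokkos::deep_copy( h , d );                     // device -> host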
if ( std::is_same< ExecSpace, typename ExecSpace::memory_space::execution_space >::value ) { - using namespace Kokkos::Experimental; + using namespace Kokkos; typedef typename ExecSpace::memory_space memory_space; typedef View< int*, memory_space > V; @@ -811,8 +811,8 @@ void test_view_mapping() { typedef Kokkos::ViewTraits< int***, Kokkos::LayoutStride, ExecSpace > traits_t; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0 > dims_t; - typedef Kokkos::Experimental::Impl::ViewOffset< dims_t, Kokkos::LayoutStride > offset_t; + typedef Kokkos::Impl::ViewDimension< 0, 0, 0 > dims_t; + typedef Kokkos::Impl::ViewOffset< dims_t, Kokkos::LayoutStride > offset_t; Kokkos::LayoutStride stride; @@ -836,8 +836,8 @@ void test_view_mapping() ASSERT_EQ( offset.span(), 60 ); ASSERT_TRUE( offset.span_is_contiguous() ); - Kokkos::Experimental::Impl::ViewMapping< traits_t, void > - v( Kokkos::Experimental::Impl::ViewCtorProp< int* >( (int*) 0 ), stride ); + Kokkos::Impl::ViewMapping< traits_t, void > + v( Kokkos::Impl::ViewCtorProp< int* >( (int*) 0 ), stride ); } { @@ -849,8 +849,8 @@ void test_view_mapping() constexpr int N1 = 11; V a( "a", N0, N1 ); - M b = Kokkos::Experimental::create_mirror( a ); - M c = Kokkos::Experimental::create_mirror_view( a ); + M b = Kokkos::create_mirror( a ); + M c = Kokkos::create_mirror_view( a ); M d; for ( int i0 = 0; i0 < N0; ++i0 ) @@ -859,8 +859,8 @@ void test_view_mapping() b( i0, i1 ) = 1 + i0 + i1 * N0; } - Kokkos::Experimental::deep_copy( a, b ); - Kokkos::Experimental::deep_copy( c, a ); + Kokkos::deep_copy( a, b ); + Kokkos::deep_copy( c, a ); for ( int i0 = 0; i0 < N0; ++i0 ) for ( int i1 = 0; i1 < N1; ++i1 ) @@ -868,7 +868,7 @@ void test_view_mapping() ASSERT_EQ( b( i0, i1 ), c( i0, i1 ) ); } - Kokkos::Experimental::resize( b, 5, 6 ); + Kokkos::resize( b, 5, 6 ); for ( int i0 = 0; i0 < 5; ++i0 ) for ( int i1 = 0; i1 < 6; ++i1 ) @@ -878,8 +878,8 @@ void test_view_mapping() ASSERT_EQ( b( i0, i1 ), val ); } - Kokkos::Experimental::realloc( c, 5, 6 ); - Kokkos::Experimental::realloc( d, 5, 6 ); + Kokkos::realloc( c, 5, 6 ); + Kokkos::realloc( d, 5, 6 ); ASSERT_EQ( b.dimension_0(), 5 ); ASSERT_EQ( b.dimension_1(), 6 ); @@ -889,7 +889,7 @@ void test_view_mapping() ASSERT_EQ( d.dimension_1(), 6 ); layout_type layout( 7, 8 ); - Kokkos::Experimental::resize( b, layout ); + Kokkos::resize( b, layout ); for ( int i0 = 0; i0 < 7; ++i0 ) for ( int i1 = 6; i1 < 8; ++i1 ) { @@ -909,8 +909,8 @@ void test_view_mapping() ASSERT_EQ( b( i0, i1 ), val ); } - Kokkos::Experimental::realloc( c, layout ); - Kokkos::Experimental::realloc( d, layout ); + Kokkos::realloc( c, layout ); + Kokkos::realloc( d, layout ); ASSERT_EQ( b.dimension_0(), 7 ); ASSERT_EQ( b.dimension_1(), 8 ); @@ -932,8 +932,8 @@ void test_view_mapping() const int order[] = { 1, 0 }; V a( "a", Kokkos::LayoutStride::order_dimensions( 2, order, dimensions ) ); - M b = Kokkos::Experimental::create_mirror( a ); - M c = Kokkos::Experimental::create_mirror_view( a ); + M b = Kokkos::create_mirror( a ); + M c = Kokkos::create_mirror_view( a ); M d; for ( int i0 = 0; i0 < N0; ++i0 ) @@ -942,8 +942,8 @@ void test_view_mapping() b( i0, i1 ) = 1 + i0 + i1 * N0; } - Kokkos::Experimental::deep_copy( a, b ); - Kokkos::Experimental::deep_copy( c, a ); + Kokkos::deep_copy( a, b ); + Kokkos::deep_copy( c, a ); for ( int i0 = 0; i0 < N0; ++i0 ) for ( int i1 = 0; i1 < N1; ++i1 ) @@ -954,7 +954,7 @@ void test_view_mapping() const int dimensions2[] = { 7, 8 }; const int order2[] = { 1, 0 }; layout_type layout = 
layout_type::order_dimensions( 2, order2, dimensions2 ); - Kokkos::Experimental::resize( b, layout ); + Kokkos::resize( b, layout ); for ( int i0 = 0; i0 < 7; ++i0 ) for ( int i1 = 0; i1 < 8; ++i1 ) @@ -964,8 +964,8 @@ void test_view_mapping() ASSERT_EQ( b( i0, i1 ), val ); } - Kokkos::Experimental::realloc( c, layout ); - Kokkos::Experimental::realloc( d, layout ); + Kokkos::realloc( c, layout ); + Kokkos::realloc( d, layout ); ASSERT_EQ( b.dimension_0(), 7 ); ASSERT_EQ( b.dimension_1(), 8 ); diff --git a/lib/kokkos/core/unit_test/TestViewSubview.hpp b/lib/kokkos/core/unit_test/TestViewSubview.hpp index e3a12e684e..106323492a 100644 --- a/lib/kokkos/core/unit_test/TestViewSubview.hpp +++ b/lib/kokkos/core/unit_test/TestViewSubview.hpp @@ -915,134 +915,134 @@ void test_3d_subview_5d_impl_layout() { inline void test_subview_legal_args_right() { - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::pair, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::Impl::ALL_t, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::pair, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::pair, Kokkos::pair, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::pair, Kokkos::Impl::ALL_t, int, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair, int, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::pair, int, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::Impl::ALL_t, int, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::pair, int, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, 
Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::pair, Kokkos::pair, int, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::pair, Kokkos::Impl::ALL_t, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, int, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, int, Kokkos::pair, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, int, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, int, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, int, Kokkos::pair, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, int, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, 
( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair, int, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair, int, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::pair, int, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair, int, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair, int, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::pair, int, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( 
Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair, int, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair, int, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( 
Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, int, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, int, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair, int, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair, int, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, int, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, int, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair >::value ) ); - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::pair >::value ) ); - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( 
Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair, Kokkos::pair, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair, Kokkos::pair, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::pair >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair, Kokkos::pair, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair, Kokkos::pair, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair >::value ) ); - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair, Kokkos::pair, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair, Kokkos::pair, Kokkos::pair >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, 
Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair, Kokkos::pair, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair, Kokkos::pair, Kokkos::pair >::value ) ); } inline void test_subview_legal_args_left() { - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair, int, int >::value ) ); - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::pair, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::Impl::ALL_t, int, int >::value ) ); - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::pair, int, int >::value ) ); - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::pair, Kokkos::pair, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::pair, Kokkos::Impl::ALL_t, int, int >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair, int, int >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::pair, int, int >::value ) ); + ASSERT_EQ( 0, ( 
Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::Impl::ALL_t, int, int >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::pair, int, int >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::pair, Kokkos::pair, int, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::pair, Kokkos::Impl::ALL_t, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, int, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, int, Kokkos::pair, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, int, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, int, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, 
Kokkos::pair, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, int, Kokkos::pair, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, int, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair, int, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair, int, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::pair, int, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair, int, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair, int, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::pair, int, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( 
Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< 
Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair, int, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair, int, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, int, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, int, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair, int, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair, int, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, int, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, int, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair, 
Kokkos::Impl::ALL_t, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair, Kokkos::pair, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair, Kokkos::pair, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair, Kokkos::pair, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair, Kokkos::pair, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair >::value ) ); - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::pair >::value ) ); - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair, Kokkos::pair, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, 
Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair, Kokkos::pair, Kokkos::pair >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::pair >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair, Kokkos::pair, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair, Kokkos::pair, Kokkos::pair >::value ) ); } } // namespace Impl diff --git a/lib/kokkos/core/unit_test/TestWorkGraph.hpp b/lib/kokkos/core/unit_test/TestWorkGraph.hpp new file mode 100644 index 0000000000..70cf6b47c0 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestWorkGraph.hpp @@ -0,0 +1,172 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
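/* The assertions above track the promotion of SubviewLegalArgsCompileTime out of the Experimental namespace. The trait reports at compile time whether a subview taken with a given argument mix (scalar int, Kokkos::pair<int, int> range, Kokkos::ALL) can preserve the requested layout. A minimal standalone use, mirroring one of the LayoutLeft cases asserted legal above: */
static_assert( Kokkos::Impl::SubviewLegalArgsCompileTime<
                 Kokkos::LayoutLeft, Kokkos::LayoutLeft,   // destination and source layouts
                 3, 5, 0,                                  // destination rank, source rank, first argument position
                 Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, // leading extents kept whole
                 Kokkos::pair<int, int>,                   // followed by one range
                 int, int >::value,                        // trailing dimensions fixed, so rank drops to 3
               "contiguous-prefix subview can remain LayoutLeft" );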
+// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include + +#include + +namespace Test { + +namespace { + +/* This test is meant to be the WorkGraph equivalent of the Task DAG Scheduler test, + please see TestTaskScheduler.hpp for that test. + The algorithm computes the N-th fibonacci number as follows: + - Each "task" or "work item" computes the i-th fibonacci number + - If a task has (i < 2), it will record the known answer ahead of time. + - If a task has (i >= 2), it will "spawn" two more tasks to compute + the (i - 1) and (i - 2) fibonacci numbers. + We do NOT do any de-duplication of these tasks. + De-duplication would result in only (N - 2) tasks which must be run in serial. + We allow duplicates both to increase the number of tasks and to increase the + amount of available parallelism. + */ + +template< class ExecSpace > +struct TestWorkGraph { + + using MemorySpace = typename ExecSpace::memory_space; + using Policy = Kokkos::Experimental::WorkGraphPolicy<std::int32_t, ExecSpace>; + using Graph = typename Policy::graph_type; + using RowMap = typename Graph::row_map_type; + using Entries = typename Graph::entries_type; + using Values = Kokkos::View<long*, MemorySpace>; + + long m_input; + Graph m_graph; + Graph m_transpose; + Values m_values; + + TestWorkGraph(long arg_input):m_input(arg_input) { + form_graph(); + transpose_crs(m_transpose, m_graph); + } + + inline + long full_fibonacci( long n ) { + constexpr long mask = 0x03; + long fib[4] = { 0, 1, 1, 2 }; + for ( long i = 2; i <= n; ++i ) { + fib[ i & mask ] = fib[ ( i - 1 ) & mask ] + fib[ ( i - 2 ) & mask ]; + } + return fib[ n & mask ]; + } + + struct HostEntry { + long input; + std::int32_t parent; + }; + std::vector<HostEntry> form_host_graph() { + std::vector<HostEntry> g; + g.push_back({ m_input , -1 }); + for (std::int32_t i = 0; i < std::int32_t(g.size()); ++i) { + auto e = g.at(std::size_t(i)); + if (e.input < 2) continue; + /* This part of the host graph formation is the equivalent of task spawning + in the Task DAG system.
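(For example, m_input = 4 expands into work items whose inputs are 4, 3, 2, 2, 1, 1, 0, 1, 0: nine items rather than the five distinct values, and because the duplicates share no edges with one another they are free to run concurrently.)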
Notice how each task which is not a base case + spawns two more tasks, without any de-duplication */ + g.push_back({ e.input - 1, i }); + g.push_back({ e.input - 2, i }); + } + return g; + } + + void form_graph() { + auto hg = form_host_graph(); + m_graph.row_map = RowMap("row_map", hg.size() + 1); // row map always has one more + m_graph.entries = Entries("entries", hg.size() - 1); // all but the first have a parent + m_values = Values("values", hg.size()); + auto h_row_map = Kokkos::create_mirror_view(m_graph.row_map); + auto h_entries = Kokkos::create_mirror_view(m_graph.entries); + auto h_values = Kokkos::create_mirror_view(m_values); + h_row_map(0) = 0; + for (std::int32_t i = 0; i < std::int32_t(hg.size()); ++i) { + auto& e = hg.at(std::size_t(i)); + h_row_map(i + 1) = i; + if (e.input < 2) { + h_values(i) = e.input; + } + if (e.parent == -1) continue; + h_entries(i - 1) = e.parent; + } + Kokkos::deep_copy(m_graph.row_map, h_row_map); + Kokkos::deep_copy(m_graph.entries, h_entries); + Kokkos::deep_copy(m_values, h_values); + } + + KOKKOS_INLINE_FUNCTION + void operator()(std::int32_t i) const { + auto begin = m_transpose.row_map(i); + auto end = m_transpose.row_map(i + 1); + for (auto j = begin; j < end; ++j) { + auto k = m_transpose.entries(j); + m_values(i) += m_values( k ); + } + } + + void test_for() { + Kokkos::parallel_for(Policy(m_graph), *this); + auto h_values = Kokkos::create_mirror_view(m_values); + Kokkos::deep_copy(h_values, m_values); + ASSERT_EQ( h_values(0), full_fibonacci(m_input) ); + } + +}; + +} // anonymous namespace + +TEST_F( TEST_CATEGORY, DISABLED_workgraph_fib ) +{ + #ifdef KOKKOS_IMPL_CUDA_CLANG_WORKAROUND + int limit = 15; + #else + int limit = 27; + #endif + for ( int i = 0; i < limit; ++i) { + TestWorkGraph< TEST_EXECSPACE > f(i); + f.test_for(); + } +} + +} // namespace Test diff --git a/lib/kokkos/core/unit_test/UnitTestMain.cpp b/lib/kokkos/core/unit_test/UnitTestMain.cpp index 4f52fc9567..a7dc7c4973 100644 --- a/lib/kokkos/core/unit_test/UnitTestMain.cpp +++ b/lib/kokkos/core/unit_test/UnitTestMain.cpp @@ -42,6 +42,7 @@ */ #include +#include int main( int argc, char *argv[] ) { ::testing::InitGoogleTest( &argc, argv ); diff --git a/lib/kokkos/core/unit_test/UnitTestMainInit.cpp b/lib/kokkos/core/unit_test/UnitTestMainInit.cpp index 21f851274b..62a01e9033 100644 --- a/lib/kokkos/core/unit_test/UnitTestMainInit.cpp +++ b/lib/kokkos/core/unit_test/UnitTestMainInit.cpp @@ -42,6 +42,8 @@ */ #include +#include + #include int main( int argc, char *argv[] ) { diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_Other.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_Other.cpp index ba06b71192..fa6722615c 100644 --- a/lib/kokkos/core/unit_test/cuda/TestCuda_Other.cpp +++ b/lib/kokkos/core/unit_test/cuda/TestCuda_Other.cpp @@ -48,3 +48,5 @@ #include #include #include + +#include diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_UniqueToken.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_UniqueToken.cpp new file mode 100644 index 0000000000..8424ae10d6 --- /dev/null +++ b/lib/kokkos/core/unit_test/cuda/TestCuda_UniqueToken.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
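/* Worked example for form_graph() in TestWorkGraph.hpp above, derived directly from the code as written: with m_input = 2 the host graph holds items with inputs { 2, 1, 0 }, and the loop produces
     row_map = { 0, 0, 1, 2 }  // one more entry than there are items;
                               // item 0 has no entry, items 1 and 2 have one each
     entries = { 0, 0 }        // every item except the root records its parent
     values  = { 0, 1, 0 }     // base cases pre-filled; fib(2) left to compute
   transpose_crs() inverts the parent links, so operator()(0) sums the values of items 1 and 2 and test_for() observes h_values(0) == 1 == fib(2). */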
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include + diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_WorkGraph.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_WorkGraph.cpp new file mode 100644 index 0000000000..663ca1d560 --- /dev/null +++ b/lib/kokkos/core/unit_test/cuda/TestCuda_WorkGraph.cpp @@ -0,0 +1,45 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
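/* A standalone sketch of the per-thread-id pattern that the new UniqueToken tests exercise (the demo function and its names are illustrative, not part of the patch): */
#include <Kokkos_Core.hpp>

template <class ExecSpace>
void unique_token_demo(int n) {
  Kokkos::Experimental::UniqueToken<ExecSpace> token;
  Kokkos::View<int*, ExecSpace> hits("hits", token.size());
  Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, n),
    KOKKOS_LAMBDA(const int) {
      const int id = token.acquire();    // id is unique among ids currently held
      Kokkos::atomic_add(&hits(id), 1);  // safe per-id scratch indexing
      token.release(id);                 // hand the id back for reuse
    });
}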
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include diff --git a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeResize.cpp b/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeResize.cpp new file mode 100644 index 0000000000..c02905535b --- /dev/null +++ b/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeResize.cpp @@ -0,0 +1,57 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include "TestResize.hpp" + +namespace Test { + +TEST( kokkosresize, host_space_access ) +{ + // Test with the default device type. 
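// (A sketch of the semantics under test: Kokkos::resize reallocates a view
// while preserving the contents of the overlapping extents, e.g.
//   Kokkos::View<int*> v("v", 10);
//   Kokkos::resize(v, 20); // entries 0..9 keep their values
// TestResize.hpp is assumed to exercise this across the supported view ranks.)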
+ using TestViewResize::testResize; + typedef Kokkos::View<int*>::device_type device_type; + testResize<device_type> (); +} + +} // namespace Test diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP.hpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP.hpp index 2f8daf7ad7..c12574a65a 100644 --- a/lib/kokkos/core/unit_test/openmp/TestOpenMP.hpp +++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP.hpp @@ -86,25 +86,26 @@ class openmp : public ::testing::Test { protected: static void SetUpTestCase() { - const unsigned numa_count = Kokkos::hwloc::get_available_numa_count(); - const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); - const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core(); + int threads_count = 0; + #pragma omp parallel + { + #pragma omp atomic + ++threads_count; + } - const unsigned threads_count = std::max( 1u, numa_count ) * - std::max( 2u, ( cores_per_numa * threads_per_core ) / 2 ); + if (threads_count > 3) { + threads_count /= 2; + } Kokkos::OpenMP::initialize( threads_count ); Kokkos::print_configuration( std::cout, true ); + srand( 10231 ); } static void TearDownTestCase() { Kokkos::OpenMP::finalize(); - - omp_set_num_threads( 1 ); - - ASSERT_EQ( 1, omp_get_max_threads() ); } }; diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_Other.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_Other.cpp index 5e9535638d..33e7402ce6 100644 --- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_Other.cpp +++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_Other.cpp @@ -48,3 +48,93 @@ #include #include #include + +#include + +#include + +namespace Test { + +TEST_F( openmp, partition_master ) +{ + using Mutex = Kokkos::Experimental::MasterLock<Kokkos::OpenMP>; + + Mutex mtx; + int errors = 0; + + auto master = [&errors, &mtx](int partition_id, int num_partitions) { + + const int pool_size = Kokkos::OpenMP::thread_pool_size(); + + { + std::unique_lock<Mutex> lock(mtx); + if ( Kokkos::OpenMP::in_parallel() ) { + ++errors; + } + if ( Kokkos::OpenMP::thread_pool_rank() != 0 ) { + ++errors; + } + } + + { + int local_errors = 0; + Kokkos::parallel_reduce( Kokkos::RangePolicy<Kokkos::OpenMP>(0,1000) + , [pool_size]( const int , int & errs ) { + if ( Kokkos::OpenMP::thread_pool_size() != pool_size ) { + ++errs; + } + } + , local_errors + ); + Kokkos::atomic_add( &errors, local_errors ); + } + + Kokkos::Experimental::UniqueToken< Kokkos::OpenMP > token; + + Kokkos::View<int*, Kokkos::OpenMP> count( "", token.size() ); + + Kokkos::parallel_for( Kokkos::RangePolicy<Kokkos::OpenMP>(0,1000), + [=] ( const int ) { + int i = token.acquire(); + ++count[i]; + token.release(i); + }); + + Kokkos::View<int, Kokkos::OpenMP> sum (""); + Kokkos::parallel_for( Kokkos::RangePolicy<Kokkos::OpenMP>(0,token.size()), + [=] ( const int i ) { + Kokkos::atomic_add( sum.data(), count[i] ); + }); + + if (sum() != 1000) { + Kokkos::atomic_add( &errors, 1 ); + } + }; + + master(0,1); + + ASSERT_EQ( errors, 0 ); + + Kokkos::OpenMP::partition_master( master ); + ASSERT_EQ( errors, 0 ); + + Kokkos::OpenMP::partition_master( master, 4, 0 ); + ASSERT_EQ( errors, 0 ); + + Kokkos::OpenMP::partition_master( master, 0, 4 ); + ASSERT_EQ( errors, 0 ); + + Kokkos::OpenMP::partition_master( master, 2, 2 ); + ASSERT_EQ( errors, 0 ); + + Kokkos::OpenMP::partition_master( master, 8, 0 ); + ASSERT_EQ( errors, 0 ); + + Kokkos::OpenMP::partition_master( master, 0, 8 ); + ASSERT_EQ( errors, 0 ); + + Kokkos::OpenMP::partition_master( master, 8, 8 ); + ASSERT_EQ( errors, 0 ); +} + +} // namespace Test diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_UniqueToken.cpp
b/lib/kokkos/core/unit_test/openmp/TestOpenMP_UniqueToken.cpp new file mode 100644 index 0000000000..143a6d9910 --- /dev/null +++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_UniqueToken.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include + diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_WorkGraph.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_WorkGraph.cpp new file mode 100644 index 0000000000..ec6fa1653c --- /dev/null +++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_WorkGraph.cpp @@ -0,0 +1,45 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
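/* The hwloc-free thread-counting idiom that this patch adopts in SetUpTestCase above, and again in the example main()s below, shown standalone; a sketch with illustrative names: */
#include <omp.h>

static int count_openmp_threads() {
  int n = 0;
  #pragma omp parallel
  {
    #pragma omp atomic
    ++n;      // every thread of the parallel region increments once
  }
  return n;   // the team size actually launched, not just the
              // upper bound reported by omp_get_max_threads()
}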
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_Other.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_Other.cpp index a6a76a03bd..bc39b1e160 100644 --- a/lib/kokkos/core/unit_test/serial/TestSerial_Other.cpp +++ b/lib/kokkos/core/unit_test/serial/TestSerial_Other.cpp @@ -48,3 +48,5 @@ #include #include #include + +#include diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_WorkGraph.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_WorkGraph.cpp new file mode 100644 index 0000000000..de1638de5e --- /dev/null +++ b/lib/kokkos/core/unit_test/serial/TestSerial_WorkGraph.cpp @@ -0,0 +1,45 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_Other.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_Other.cpp index c11155c5c0..160b37a2c8 100644 --- a/lib/kokkos/core/unit_test/threads/TestThreads_Other.cpp +++ b/lib/kokkos/core/unit_test/threads/TestThreads_Other.cpp @@ -48,3 +48,5 @@ #include #include #include + +#include diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_WorkGraph.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_WorkGraph.cpp new file mode 100644 index 0000000000..6b7dbb26db --- /dev/null +++ b/lib/kokkos/core/unit_test/threads/TestThreads_WorkGraph.cpp @@ -0,0 +1,45 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include diff --git a/lib/kokkos/example/cmake_build/CMakeLists.txt b/lib/kokkos/example/cmake_build/CMakeLists.txt index 4e149726ee..f92c5c6513 100644 --- a/lib/kokkos/example/cmake_build/CMakeLists.txt +++ b/lib/kokkos/example/cmake_build/CMakeLists.txt @@ -40,5 +40,7 @@ list(APPEND CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} -O3) add_subdirectory(${Example_SOURCE_DIR}/../.. 
${Example_BINARY_DIR}/kokkos) +include_directories(${Kokkos_INCLUDE_DIRS_RET}) + add_executable(example cmake_example.cpp) target_link_libraries(example kokkos) diff --git a/lib/kokkos/example/feint/main.cpp b/lib/kokkos/example/feint/main.cpp index 616e584bf6..57a8f8fafb 100644 --- a/lib/kokkos/example/feint/main.cpp +++ b/lib/kokkos/example/feint/main.cpp @@ -69,12 +69,26 @@ int main() #if defined( KOKKOS_ENABLE_OPENMP ) { - // Use 4 cores per NUMA region, unless fewer available - const unsigned use_numa_count = Kokkos::hwloc::get_available_numa_count(); - const unsigned use_cores_per_numa = std::min( 4u , Kokkos::hwloc::get_available_cores_per_numa() ); + int num_threads = 0; + if ( Kokkos::hwloc::available() ) { + // Use 4 cores per NUMA region, unless fewer available + const unsigned use_numa_count = Kokkos::hwloc::get_available_numa_count(); + const unsigned use_cores_per_numa = std::min( 4u , Kokkos::hwloc::get_available_cores_per_numa() ); + num_threads = use_numa_count * use_cores_per_numa; - Kokkos::OpenMP::initialize( use_numa_count * use_cores_per_numa ); + } + else { + #pragma omp parallel + { + #pragma omp atomic + ++num_threads; + } + num_threads = std::max(4, num_threads/4); + } + + + Kokkos::OpenMP::initialize( num_threads ); std::cout << "feint< OpenMP , NotUsingAtomic >" << std::endl ; Kokkos::Example::feint< Kokkos::OpenMP , false >(); diff --git a/lib/kokkos/example/global_2_local_ids/G2L_Main.cpp b/lib/kokkos/example/global_2_local_ids/G2L_Main.cpp index fb33aef56e..b6b8b2f5e0 100644 --- a/lib/kokkos/example/global_2_local_ids/G2L_Main.cpp +++ b/lib/kokkos/example/global_2_local_ids/G2L_Main.cpp @@ -138,7 +138,16 @@ int main(int argc, char *argv[]) #endif #ifdef KOKKOS_ENABLE_OPENMP - Kokkos::OpenMP::initialize( threads_count ); + int num_threads = 0; + #pragma omp parallel + { + #pragma omp atomic + ++num_threads; + } + if( num_threads > 3 ) { + num_threads = std::max(4, num_threads/4); + } + Kokkos::OpenMP::initialize( num_threads ); num_errors += G2L::run_openmp(num_ids,num_find_iterations); Kokkos::OpenMP::finalize(); #endif diff --git a/lib/kokkos/example/grow_array/main.cpp b/lib/kokkos/example/grow_array/main.cpp index e7438a9bf4..3f1d534d93 100644 --- a/lib/kokkos/example/grow_array/main.cpp +++ b/lib/kokkos/example/grow_array/main.cpp @@ -88,7 +88,7 @@ int main( int argc , char ** argv ) #if defined( KOKKOS_ENABLE_OPENMP ) { std::cout << "Kokkos::OpenMP" << std::endl ; - Kokkos::OpenMP::initialize( num_threads , use_numa , use_core ); + Kokkos::OpenMP::initialize(); Example::grow_array< Kokkos::OpenMP >( length_array , span_values ); Kokkos::OpenMP::finalize(); } diff --git a/lib/kokkos/example/tutorial/03_simple_view/Makefile b/lib/kokkos/example/tutorial/03_simple_view/Makefile index e716b765e7..32483a2555 100644 --- a/lib/kokkos/example/tutorial/03_simple_view/Makefile +++ b/lib/kokkos/example/tutorial/03_simple_view/Makefile @@ -33,6 +33,7 @@ include $(KOKKOS_PATH)/Makefile.kokkos build: $(EXE) +#for unit testing only; for best performance use OpenMP 4.0 or better test: $(EXE) ./$(EXE) diff --git a/lib/kokkos/example/tutorial/Advanced_Views/Makefile b/lib/kokkos/example/tutorial/Advanced_Views/Makefile index bc4012f68c..12ac5652e5 100644 --- a/lib/kokkos/example/tutorial/Advanced_Views/Makefile +++ b/lib/kokkos/example/tutorial/Advanced_Views/Makefile @@ -22,100 +22,102 @@ endif build: mkdir -p 01_data_layouts cd ./01_data_layouts; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/01_data_layouts/Makefile ${KOKKOS_SETTINGS} + $(MAKE)
build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/01_data_layouts/Makefile ${KOKKOS_SETTINGS} mkdir -p 02_memory_traits cd ./02_memory_traits; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/02_memory_traits/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/02_memory_traits/Makefile ${KOKKOS_SETTINGS} mkdir -p 03_subviews cd ./03_subviews; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/03_subviews/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/03_subviews/Makefile ${KOKKOS_SETTINGS} mkdir -p 04_dualviews cd ./04_dualviews; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/04_dualviews/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/04_dualviews/Makefile ${KOKKOS_SETTINGS} mkdir -p 05_NVIDIA_UVM cd ./05_NVIDIA_UVM; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile ${KOKKOS_SETTINGS} #mkdir -p 06_AtomicViews #cd ./06_AtomicViews; \ - #make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/06_AtomicViews/Makefile ${KOKKOS_SETTINGS} + #$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/06_AtomicViews/Makefile ${KOKKOS_SETTINGS} #mkdir -p 07_Overlapping_DeepCopy #cd ./07_Overlapping_DeepCopy; \ - #make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile ${KOKKOS_SETTINGS} + #$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile ${KOKKOS_SETTINGS} build-insource: cd ./01_data_layouts; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./02_memory_traits; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./03_subviews; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./04_dualviews; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./05_NVIDIA_UVM; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} #cd ./06_AtomicViews; \ - #make build -j 4 ${KOKKOS_SETTINGS} + #$(MAKE) build ${KOKKOS_SETTINGS} #cd ./07_Overlapping_DeepCopy; \ - #make build -j 4 ${KOKKOS_SETTINGS} + #$(MAKE) build ${KOKKOS_SETTINGS} + test: cd ./01_data_layouts; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/01_data_layouts/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/01_data_layouts/Makefile ${KOKKOS_SETTINGS} cd ./02_memory_traits; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/02_memory_traits/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/02_memory_traits/Makefile ${KOKKOS_SETTINGS} cd ./03_subviews; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/03_subviews/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/03_subviews/Makefile ${KOKKOS_SETTINGS} cd ./04_dualviews; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/04_dualviews/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/04_dualviews/Makefile ${KOKKOS_SETTINGS} cd ./05_NVIDIA_UVM; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f 
${KOKKOS_PATH}/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile ${KOKKOS_SETTINGS} #cd ./06_AtomicViews; \ - #make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/06_AtomicViews/Makefile ${KOKKOS_SETTINGS} + #$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/06_AtomicViews/Makefile ${KOKKOS_SETTINGS} #cd ./07_Overlapping_DeepCopy; \ - #make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile ${KOKKOS_SETTINGS} + #$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile ${KOKKOS_SETTINGS} test-insource: cd ./01_data_layouts; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./02_memory_traits; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./03_subviews; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./04_dualviews; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./05_NVIDIA_UVM; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} #cd ./06_AtomicViews; \ - #make test -j 4 ${KOKKOS_SETTINGS} + #$(MAKE) test ${KOKKOS_SETTINGS} #cd ./07_Overlapping_DeepCopy; \ - #make test -j 4 ${KOKKOS_SETTINGS} + #$(MAKE) test ${KOKKOS_SETTINGS} + clean: cd ./01_data_layouts; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/01_data_layouts/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/01_data_layouts/Makefile ${KOKKOS_SETTINGS} cd ./02_memory_traits; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/02_memory_traits/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/02_memory_traits/Makefile ${KOKKOS_SETTINGS} cd ./03_subviews; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/03_subviews/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/03_subviews/Makefile ${KOKKOS_SETTINGS} cd ./04_dualviews; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/04_dualviews/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/04_dualviews/Makefile ${KOKKOS_SETTINGS} cd ./05_NVIDIA_UVM; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile ${KOKKOS_SETTINGS} #cd ./06_AtomicViews; \ - #make clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/06_AtomicViews/Makefile ${KOKKOS_SETTINGS} + #$(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/06_AtomicViews/Makefile ${KOKKOS_SETTINGS} #cd ./07_Overlapping_DeepCopy; \ - #make clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile ${KOKKOS_SETTINGS} + #$(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile ${KOKKOS_SETTINGS} clean-insource: cd ./01_data_layouts; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./02_memory_traits; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./03_subviews; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./04_dualviews; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./05_NVIDIA_UVM; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} #cd ./06_AtomicViews; \ - #make clean ${KOKKOS_SETTINGS} + #$(MAKE) clean ${KOKKOS_SETTINGS} #cd 
./07_Overlapping_DeepCopy; \ - #make clean ${KOKKOS_SETTINGS} + #$(MAKE) clean ${KOKKOS_SETTINGS} diff --git a/lib/kokkos/example/tutorial/Algorithms/Makefile b/lib/kokkos/example/tutorial/Algorithms/Makefile index ad0b76f9d6..4e70ba7d97 100644 --- a/lib/kokkos/example/tutorial/Algorithms/Makefile +++ b/lib/kokkos/example/tutorial/Algorithms/Makefile @@ -22,22 +22,22 @@ endif build: mkdir -p 01_random_numbers cd ./01_random_numbers; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Algorithms/01_random_numbers/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Algorithms/01_random_numbers/Makefile ${KOKKOS_SETTINGS} build-insource: cd ./01_random_numbers; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} test: cd ./01_random_numbers; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Algorithms/01_random_numbers/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Algorithms/01_random_numbers/Makefile ${KOKKOS_SETTINGS} test-insource: cd ./01_random_numbers; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} clean: cd ./01_random_numbers; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Algorithms/01_random_numbers/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Algorithms/01_random_numbers/Makefile ${KOKKOS_SETTINGS} clean-insource: cd ./01_random_numbers; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/Makefile b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/Makefile index 44fdf90f8a..4bf6d487ae 100644 --- a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/Makefile +++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/Makefile @@ -22,74 +22,74 @@ endif build: mkdir -p 01_thread_teams cd ./01_thread_teams; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile ${KOKKOS_SETTINGS} mkdir -p 01_thread_teams_lambda cd ./01_thread_teams_lambda; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile ${KOKKOS_SETTINGS} mkdir -p 02_nested_parallel_for cd ./02_nested_parallel_for; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile ${KOKKOS_SETTINGS} mkdir -p 03_vectorization cd ./03_vectorization; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile ${KOKKOS_SETTINGS} mkdir -p 04_team_scan cd ./04_team_scan; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile ${KOKKOS_SETTINGS} build-insource: cd ./01_thread_teams; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./01_thread_teams_lambda; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd 
./02_nested_parallel_for; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./03_vectorization; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./04_team_scan; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} test: cd ./01_thread_teams; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile ${KOKKOS_SETTINGS} cd ./01_thread_teams_lambda; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile ${KOKKOS_SETTINGS} cd ./02_nested_parallel_for; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile ${KOKKOS_SETTINGS} cd ./03_vectorization; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile ${KOKKOS_SETTINGS} cd ./04_team_scan; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile ${KOKKOS_SETTINGS} test-insource: cd ./01_thread_teams; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./01_thread_teams_lambda; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./02_nested_parallel_for; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./03_vectorization; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./04_team_scan; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} clean: cd ./01_thread_teams; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile ${KOKKOS_SETTINGS} cd ./01_thread_teams_lambda; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile ${KOKKOS_SETTINGS} cd ./02_nested_parallel_for; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile ${KOKKOS_SETTINGS} cd ./03_vectorization; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile ${KOKKOS_SETTINGS} cd ./04_team_scan; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile ${KOKKOS_SETTINGS} clean-insource: cd ./01_thread_teams; \ - make clean 
${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./01_thread_teams_lambda; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./02_nested_parallel_for; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./03_vectorization; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./04_team_scan; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} diff --git a/lib/kokkos/example/tutorial/Makefile b/lib/kokkos/example/tutorial/Makefile index 063ace8aab..7b2732eeed 100644 --- a/lib/kokkos/example/tutorial/Makefile +++ b/lib/kokkos/example/tutorial/Makefile @@ -23,152 +23,152 @@ endif build: mkdir -p 01_hello_world cd ./01_hello_world; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/01_hello_world/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/01_hello_world/Makefile ${KOKKOS_SETTINGS} mkdir -p 01_hello_world_lambda cd ./01_hello_world_lambda; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/01_hello_world_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/01_hello_world_lambda/Makefile ${KOKKOS_SETTINGS} mkdir -p 02_simple_reduce cd ./02_simple_reduce; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce/Makefile ${KOKKOS_SETTINGS} mkdir -p 02_simple_reduce_lambda cd ./02_simple_reduce_lambda; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce_lambda/Makefile ${KOKKOS_SETTINGS} mkdir -p 03_simple_view cd ./03_simple_view; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/03_simple_view/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/03_simple_view/Makefile ${KOKKOS_SETTINGS} mkdir -p 03_simple_view_lambda cd ./03_simple_view_lambda; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/03_simple_view_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/03_simple_view_lambda/Makefile ${KOKKOS_SETTINGS} mkdir -p 04_simple_memoryspaces cd ./04_simple_memoryspaces; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/04_simple_memoryspaces/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/04_simple_memoryspaces/Makefile ${KOKKOS_SETTINGS} mkdir -p 05_simple_atomics cd ./05_simple_atomics; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/05_simple_atomics/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/05_simple_atomics/Makefile ${KOKKOS_SETTINGS} mkdir -p Advanced_Views cd ./Advanced_Views; \ - make build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' mkdir -p Algorithms cd ./Algorithms; \ - make build -f ${KOKKOS_PATH}/example/tutorial/Algorithms/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Algorithms/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' mkdir -p Hierarchical_Parallelism cd ./Hierarchical_Parallelism; \ - make build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' 
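The recurring replacement of "make build -j 4" with "$(MAKE) build" in these tutorial Makefiles matters for recursive builds: GNU make treats a recipe line containing $(MAKE) as a sub-make and hands it the parent's jobserver and command-line flags, so the parallelism requested once at the top level (for example, make -j 8) governs the whole build tree, whereas a literal -j 4 hardcoded in each child overrides the caller's choice and can oversubscribe the machine when several children run at once. A minimal sketch of the idiom, assuming GNU make; the subdirectory name 99_example is hypothetical, while KOKKOS_PATH and KOKKOS_SETTINGS are the variables these Makefiles already use:

build:
	mkdir -p 99_example
	cd ./99_example; \
	$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/99_example/Makefile ${KOKKOS_SETTINGS}

Run as "make -j 8 build", the child make inherits the jobserver and keeps the total number of concurrent jobs at eight across parent and child.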
build-insource: cd ./01_hello_world; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./01_hello_world_lambda; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./02_simple_reduce; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./02_simple_reduce_lambda; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./03_simple_view; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./03_simple_view_lambda; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./04_simple_memoryspaces; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./05_simple_atomics; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./Advanced_Views; \ - make build KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) build KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' cd ./Algorithms; \ - make build KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) build KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' cd ./Hierarchical_Parallelism; \ - make build KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) build KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' test: cd ./01_hello_world; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/01_hello_world/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/01_hello_world/Makefile ${KOKKOS_SETTINGS} cd ./01_hello_world_lambda; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/01_hello_world_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/01_hello_world_lambda/Makefile ${KOKKOS_SETTINGS} cd ./02_simple_reduce; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce/Makefile ${KOKKOS_SETTINGS} cd ./02_simple_reduce_lambda; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce_lambda/Makefile ${KOKKOS_SETTINGS} cd ./03_simple_view; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/03_simple_view/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/03_simple_view/Makefile ${KOKKOS_SETTINGS} cd ./03_simple_view_lambda; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/03_simple_view_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/03_simple_view_lambda/Makefile ${KOKKOS_SETTINGS} cd ./04_simple_memoryspaces; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/04_simple_memoryspaces/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/04_simple_memoryspaces/Makefile ${KOKKOS_SETTINGS} cd ./05_simple_atomics; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/05_simple_atomics/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/05_simple_atomics/Makefile ${KOKKOS_SETTINGS} cd ./Advanced_Views; \ - make test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' cd ./Algorithms; \ - make test -f ${KOKKOS_PATH}/example/tutorial/Algorithms/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Algorithms/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' cd ./Hierarchical_Parallelism; \ - make test -f 
${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' test-insource: cd ./01_hello_world; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./01_hello_world_lambda; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./02_simple_reduce; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./02_simple_reduce_lambda; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./03_simple_view; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./03_simple_view_lambda; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./04_simple_memoryspaces; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./05_simple_atomics; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./Advanced_Views; \ - make test KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) test KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' cd ./Algorithms; \ - make test KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) test KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' cd ./Hierarchical_Parallelism; \ - make test KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) test KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' clean: cd ./01_hello_world; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/01_hello_world/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/01_hello_world/Makefile ${KOKKOS_SETTINGS} cd ./01_hello_world_lambda; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/01_hello_world_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/01_hello_world_lambda/Makefile ${KOKKOS_SETTINGS} cd ./02_simple_reduce; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce/Makefile ${KOKKOS_SETTINGS} cd ./02_simple_reduce_lambda; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce_lambda/Makefile ${KOKKOS_SETTINGS} cd ./03_simple_view; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/03_simple_view/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/03_simple_view/Makefile ${KOKKOS_SETTINGS} cd ./03_simple_view_lambda; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/03_simple_view_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/03_simple_view_lambda/Makefile ${KOKKOS_SETTINGS} cd ./04_simple_memoryspaces; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/04_simple_memoryspaces/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/04_simple_memoryspaces/Makefile ${KOKKOS_SETTINGS} cd ./05_simple_atomics; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/05_simple_atomics/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/05_simple_atomics/Makefile ${KOKKOS_SETTINGS} cd ./Advanced_Views; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' cd ./Algorithms; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Algorithms/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) 
clean -f ${KOKKOS_PATH}/example/tutorial/Algorithms/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' cd ./Hierarchical_Parallelism; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' clean-insource: cd ./01_hello_world; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./01_hello_world_lambda; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./02_simple_reduce; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./02_simple_reduce_lambda; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./03_simple_view; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./03_simple_view_lambda; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./04_simple_memoryspaces; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./05_simple_atomics; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./Advanced_Views; \ - make clean KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) clean KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' cd ./Algorithms; \ - make clean KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) clean KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' cd ./Hierarchical_Parallelism; \ - make clean KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) clean KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' diff --git a/lib/kokkos/example/tutorial/launch_bounds/CMakeLists.txt b/lib/kokkos/example/tutorial/launch_bounds/CMakeLists.txt new file mode 100644 index 0000000000..7c78db840f --- /dev/null +++ b/lib/kokkos/example/tutorial/launch_bounds/CMakeLists.txt @@ -0,0 +1,10 @@ + +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# This is a tutorial, not a test, so we don't ask CTest to run it. +TRIBITS_ADD_EXECUTABLE( + tutorial_02_simple_reduce + SOURCES simple_reduce.cpp + COMM serial mpi + ) diff --git a/lib/kokkos/example/tutorial/launch_bounds/Makefile b/lib/kokkos/example/tutorial/launch_bounds/Makefile new file mode 100644 index 0000000000..5b605a4119 --- /dev/null +++ b/lib/kokkos/example/tutorial/launch_bounds/Makefile @@ -0,0 +1,56 @@ +KOKKOS_PATH = ../../.. 
+KOKKOS_SRC_PATH = ${KOKKOS_PATH} +SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/launch_bounds/*.cpp) +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +CXXFLAGS = -O3 +LINK = ${CXX} +LINKFLAGS = +EXE = launch_bounds.cuda +KOKKOS_DEVICES = "Cuda,OpenMP" +KOKKOS_ARCH = "SNB,Kepler35" +else +CXX = g++ +CXXFLAGS = -O3 +LINK = ${CXX} +LINKFLAGS = +EXE = launch_bounds.host +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +endif + +# WAR for "undefined memcpy" w/ Ubuntu + CUDA 7.5 +CXXFLAGS += -D_FORCE_INLINES +# Additional compile-time information +CXXFLAGS += -Xptxas=-v + +DEPFLAGS = -M + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +temp: + echo $(KOKKOS_INTERNAL_USE_CUDA) $(CUDA_PATH) + +build: $(EXE) + +test: $(EXE) + ./$(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/lib/kokkos/example/tutorial/launch_bounds/launch_bounds_reduce.cpp b/lib/kokkos/example/tutorial/launch_bounds/launch_bounds_reduce.cpp new file mode 100644 index 0000000000..9a26eda507 --- /dev/null +++ b/lib/kokkos/example/tutorial/launch_bounds/launch_bounds_reduce.cpp @@ -0,0 +1,173 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <cstdio> + +// +// First reduction (parallel_reduce) example: +// 1. Start up Kokkos +// 2. Execute a parallel_reduce loop in the default execution space, +// using a functor to define the loop body +// 3. Shut down Kokkos +// +struct collision { +// Reduction functor +// For each i, we generate 10 hashes, look for and count collisions +// We use parallel_reduce to count the total collisions +// Note that we're just counting collisions within the 10 generated +// for one i. +// This function was chosen as one that very simply can increase the +// register count. + typedef int value_type; + + KOKKOS_INLINE_FUNCTION + int hash(int q) const { + // A simple hash by Justin Sobel + // Thanks to Arash Partow (partow.net) + char* fourchars = (char*)&q; + int hash = 1315423911; + for (int i=0; i<4; fourchars++, i++) { + hash ^= ((hash<<5) + *fourchars + (hash >> 2)); + } + return hash; + } + + KOKKOS_INLINE_FUNCTION + void operator () (const int i, int& lsum) const { + //This is a silly function which generates 10 hashes + // then checks for collisions + int a = hash(i)%64; + int b = hash(i*3)%64; + int c = hash(i*5)%64; + int d = hash(i*7)%64; + int e = hash(i*11)%64; + int f = hash(i*17)%64; + int g = hash(i*23)%64; + int h = hash(i*29)%64; + int j = hash(i*31)%64; + int k = hash(i*37)%64; + + + if (a==b) lsum++; + if (a==c) lsum++; + if (a==d) lsum++; + if (a==e) lsum++; + if (a==f) lsum++; + if (a==g) lsum++; + if (a==h) lsum++; + if (a==j) lsum++; + if (a==k) lsum++; + if (b==c) lsum++; + if (b==d) lsum++; + if (b==e) lsum++; + if (b==f) lsum++; + if (b==g) lsum++; + if (b==h) lsum++; + if (b==j) lsum++; + if (b==k) lsum++; + if (c==d) lsum++; + if (c==e) lsum++; + if (c==f) lsum++; + if (c==g) lsum++; + if (c==h) lsum++; + if (c==j) lsum++; + if (c==k) lsum++; + if (d==e) lsum++; + if (d==f) lsum++; + if (d==g) lsum++; + if (d==h) lsum++; + if (d==j) lsum++; + if (d==k) lsum++; + if (e==f) lsum++; + if (e==g) lsum++; + if (e==h) lsum++; + if (e==j) lsum++; + if (e==k) lsum++; + if (f==g) lsum++; + if (f==h) lsum++; + if (f==j) lsum++; + if (f==k) lsum++; + if (g==h) lsum++; + if (g==j) lsum++; + if (g==k) lsum++; + if (h==j) lsum++; + if (h==k) lsum++; + if (j==k) lsum++; + } + + + +}; + +int main (int argc, char* argv[]) { + Kokkos::initialize (argc, argv); + const int n = 10000; + + // Compute and count hash collisions in + // parallel, using Kokkos. + // This is not really a useful algorithm, but it demonstrates the + // LaunchBounds functionality + int sum1 = 0; + int sum2 = 0; + + //Without LaunchBounds, the kernel uses 56 registers + Kokkos::parallel_reduce (n, collision (), sum1); + + //With LaunchBounds, we can reduce the register usage to 32 + Kokkos::parallel_reduce (Kokkos::RangePolicy<Kokkos::LaunchBounds<512,4>>(0,n), collision (), sum2); + + printf ("Number of collisions, " + "computed in parallel, is %i\n", sum1); + + if (sum1 != sum2) { + printf( "Uh-oh!
Results do not match\n"); + return -1; + } + + Kokkos::finalize(); + + + return 0; +} + diff --git a/lib/kokkos/generate_makefile.bash b/lib/kokkos/generate_makefile.bash index 5f2442102d..6d636dc7e4 100755 --- a/lib/kokkos/generate_makefile.bash +++ b/lib/kokkos/generate_makefile.bash @@ -1,7 +1,6 @@ #!/bin/bash KOKKOS_DEVICES="" -MAKE_J_OPTION="32" KOKKOS_DO_EXAMPLES="1" @@ -70,7 +69,8 @@ do KOKKOS_DEBUG=yes ;; --make-j*) - MAKE_J_OPTION="${key#*=}" + echo "Warning: ${key} is deprecated" + echo "Call make with appropriate -j flag" ;; --no-examples) KOKKOS_DO_EXAMPLES="0" @@ -110,23 +110,34 @@ do echo "--with-devices: Explicitly add a set of backends." echo "" echo "--arch=[OPT]: Set target architectures. Options are:" + echo " [AMD]" + echo " AMDAVX = AMD CPU" + echo " [ARM]" echo " ARMv80 = ARMv8.0 Compatible CPU" echo " ARMv81 = ARMv8.1 Compatible CPU" echo " ARMv8-ThunderX = ARMv8 Cavium ThunderX CPU" + echo " [IBM]" + echo " Power8 = IBM POWER8 CPUs" + echo " Power9 = IBM POWER9 CPUs" + echo " [Intel]" + echo " WSM = Intel Westmere CPUs" echo " SNB = Intel Sandy/Ivy Bridge CPUs" echo " HSW = Intel Haswell CPUs" echo " BDW = Intel Broadwell Xeon E-class CPUs" echo " SKX = Intel Sky Lake Xeon E-class HPC CPUs (AVX512)" + echo " [Intel Xeon Phi]" echo " KNC = Intel Knights Corner Xeon Phi" echo " KNL = Intel Knights Landing Xeon Phi" + echo " [NVIDIA]" echo " Kepler30 = NVIDIA Kepler generation CC 3.0" + echo " Kepler32 = NVIDIA Kepler generation CC 3.2" echo " Kepler35 = NVIDIA Kepler generation CC 3.5" echo " Kepler37 = NVIDIA Kepler generation CC 3.7" + echo " Maxwell50 = NVIDIA Maxwell generation CC 5.0" + echo " Maxwell52 = NVIDIA Maxwell generation CC 5.2" + echo " Maxwell53 = NVIDIA Maxwell generation CC 5.3" echo " Pascal60 = NVIDIA Pascal generation CC 6.0" echo " Pascal61 = NVIDIA Pascal generation CC 6.1" - echo " Maxwell50 = NVIDIA Maxwell generation CC 5.0" - echo " Power8 = IBM POWER8 CPUs" - echo " Power9 = IBM POWER9 CPUs" echo "" echo "--compiler=/Path/To/Compiler Set the compiler." echo "--debug,-dbg: Enable Debugging." @@ -142,10 +153,14 @@ do echo " tests.)" echo "--with-hwloc=/Path/To/Hwloc: Set path to hwloc." echo "--with-options=[OPT]: Additional options to Kokkos:" + echo " compiler_warnings" echo " aggressive_vectorization = add ivdep on loops" + echo " disable_profiling = do not compile with profiling hooks" + echo " " echo "--with-cuda-options=[OPT]: Additional options to CUDA:" echo " force_uvm, use_ldg, enable_lambda, rdc" - echo "--make-j=[NUM]: Set -j flag used during build." 
+ echo "--make-j=[NUM]: DEPRECATED: call make with appropriate" + echo " -j flag" exit 0 ;; *) @@ -237,27 +252,27 @@ else KOKKOS_INSTALL_PATH=${KOKKOS_TEST_INSTALL_PATH} fi -mkdir install +mkdir -p install echo "#Makefile to satisfy existens of target kokkos-clean before installing the library" > install/Makefile.kokkos echo "kokkos-clean:" >> install/Makefile.kokkos echo "" >> install/Makefile.kokkos -mkdir core -mkdir core/unit_test -mkdir core/perf_test -mkdir containers -mkdir containers/unit_tests -mkdir containers/performance_tests -mkdir algorithms -mkdir algorithms/unit_tests -mkdir algorithms/performance_tests -mkdir example -mkdir example/fixture -mkdir example/feint -mkdir example/fenl -mkdir example/tutorial +mkdir -p core +mkdir -p core/unit_test +mkdir -p core/perf_test +mkdir -p containers +mkdir -p containers/unit_tests +mkdir -p containers/performance_tests +mkdir -p algorithms +mkdir -p algorithms/unit_tests +mkdir -p algorithms/performance_tests +mkdir -p example +mkdir -p example/fixture +mkdir -p example/feint +mkdir -p example/fenl +mkdir -p example/tutorial if [ ${#KOKKOS_ENABLE_EXAMPLE_ICHOL} -gt 0 ]; then - mkdir example/ichol + mkdir -p example/ichol fi KOKKOS_SETTINGS="${KOKKOS_SETTINGS_NO_KOKKOS_PATH} KOKKOS_PATH=${KOKKOS_PATH}" @@ -266,115 +281,115 @@ KOKKOS_SETTINGS="${KOKKOS_SETTINGS_NO_KOKKOS_PATH} KOKKOS_PATH=${KOKKOS_PATH}" echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > core/unit_test/Makefile echo "" >> core/unit_test/Makefile echo "all:" >> core/unit_test/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_SETTINGS}" >> core/unit_test/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_SETTINGS}" >> core/unit_test/Makefile echo "" >> core/unit_test/Makefile echo "test: all" >> core/unit_test/Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_SETTINGS} test" >> core/unit_test/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_SETTINGS} test" >> core/unit_test/Makefile echo "" >> core/unit_test/Makefile echo "clean:" >> core/unit_test/Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_SETTINGS} clean" >> core/unit_test/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_SETTINGS} clean" >> core/unit_test/Makefile echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > core/perf_test/Makefile echo "" >> core/perf_test/Makefile echo "all:" >> core/perf_test/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_SETTINGS}" >> core/perf_test/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_SETTINGS}" >> core/perf_test/Makefile echo "" >> core/perf_test/Makefile echo "test: all" >> core/perf_test/Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_SETTINGS} test" >> core/perf_test/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_SETTINGS} test" >> core/perf_test/Makefile echo "" >> core/perf_test/Makefile echo "clean:" >> core/perf_test/Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_SETTINGS} clean" >> core/perf_test/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_SETTINGS} clean" >> core/perf_test/Makefile echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > containers/unit_tests/Makefile echo "" >> containers/unit_tests/Makefile echo "all:" >> containers/unit_tests/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} 
-f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_SETTINGS}" >> containers/unit_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_SETTINGS}" >> containers/unit_tests/Makefile echo "" >> containers/unit_tests/Makefile echo "test: all" >> containers/unit_tests/Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_SETTINGS} test" >> containers/unit_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_SETTINGS} test" >> containers/unit_tests/Makefile echo "" >> containers/unit_tests/Makefile echo "clean:" >> containers/unit_tests/Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_SETTINGS} clean" >> containers/unit_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_SETTINGS} clean" >> containers/unit_tests/Makefile echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > containers/performance_tests/Makefile echo "" >> containers/performance_tests/Makefile echo "all:" >> containers/performance_tests/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_SETTINGS}" >> containers/performance_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_SETTINGS}" >> containers/performance_tests/Makefile echo "" >> containers/performance_tests/Makefile echo "test: all" >> containers/performance_tests/Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_SETTINGS} test" >> containers/performance_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_SETTINGS} test" >> containers/performance_tests/Makefile echo "" >> containers/performance_tests/Makefile echo "clean:" >> containers/performance_tests/Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_SETTINGS} clean" >> containers/performance_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_SETTINGS} clean" >> containers/performance_tests/Makefile echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > algorithms/unit_tests/Makefile echo "" >> algorithms/unit_tests/Makefile echo "all:" >> algorithms/unit_tests/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_SETTINGS}" >> algorithms/unit_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_SETTINGS}" >> algorithms/unit_tests/Makefile echo "" >> algorithms/unit_tests/Makefile echo "test: all" >> algorithms/unit_tests/Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_SETTINGS} test" >> algorithms/unit_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_SETTINGS} test" >> algorithms/unit_tests/Makefile echo "" >> algorithms/unit_tests/Makefile echo "clean:" >> algorithms/unit_tests/Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_SETTINGS} clean" >> algorithms/unit_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_SETTINGS} clean" >> algorithms/unit_tests/Makefile KOKKOS_SETTINGS="${KOKKOS_SETTINGS_NO_KOKKOS_PATH} KOKKOS_PATH=${KOKKOS_TEST_INSTALL_PATH}" echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > example/fixture/Makefile echo "" >> example/fixture/Makefile echo "all:" >> example/fixture/Makefile -echo -e "\tmake -j 
${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/fixture/Makefile ${KOKKOS_SETTINGS}" >> example/fixture/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fixture/Makefile ${KOKKOS_SETTINGS}" >> example/fixture/Makefile echo "" >> example/fixture/Makefile echo "test: all" >> example/fixture/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/fixture/Makefile ${KOKKOS_SETTINGS} test" >> example/fixture/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fixture/Makefile ${KOKKOS_SETTINGS} test" >> example/fixture/Makefile echo "" >> example/fixture/Makefile echo "clean:" >> example/fixture/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/fixture/Makefile ${KOKKOS_SETTINGS} clean" >> example/fixture/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fixture/Makefile ${KOKKOS_SETTINGS} clean" >> example/fixture/Makefile echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > example/feint/Makefile echo "" >> example/feint/Makefile echo "all:" >> example/feint/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/feint/Makefile ${KOKKOS_SETTINGS}" >> example/feint/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/feint/Makefile ${KOKKOS_SETTINGS}" >> example/feint/Makefile echo "" >> example/feint/Makefile echo "test: all" >> example/feint/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/feint/Makefile ${KOKKOS_SETTINGS} test" >> example/feint/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/feint/Makefile ${KOKKOS_SETTINGS} test" >> example/feint/Makefile echo "" >> example/feint/Makefile echo "clean:" >> example/feint/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/feint/Makefile ${KOKKOS_SETTINGS} clean" >> example/feint/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/feint/Makefile ${KOKKOS_SETTINGS} clean" >> example/feint/Makefile echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > example/fenl/Makefile echo "" >> example/fenl/Makefile echo "all:" >> example/fenl/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/fenl/Makefile ${KOKKOS_SETTINGS}" >> example/fenl/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fenl/Makefile ${KOKKOS_SETTINGS}" >> example/fenl/Makefile echo "" >> example/fenl/Makefile echo "test: all" >> example/fenl/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/fenl/Makefile ${KOKKOS_SETTINGS} test" >> example/fenl/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fenl/Makefile ${KOKKOS_SETTINGS} test" >> example/fenl/Makefile echo "" >> example/fenl/Makefile echo "clean:" >> example/fenl/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/fenl/Makefile ${KOKKOS_SETTINGS} clean" >> example/fenl/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fenl/Makefile ${KOKKOS_SETTINGS} clean" >> example/fenl/Makefile echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > example/tutorial/Makefile echo "" >> example/tutorial/Makefile echo "build:" >> example/tutorial/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/tutorial/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' KOKKOS_PATH=${KOKKOS_PATH} build">> example/tutorial/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/tutorial/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' KOKKOS_PATH=${KOKKOS_PATH} build">> example/tutorial/Makefile echo "" >> example/tutorial/Makefile echo "test: build" >> example/tutorial/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/tutorial/Makefile 
KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' KOKKOS_PATH=${KOKKOS_PATH} test" >> example/tutorial/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/tutorial/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' KOKKOS_PATH=${KOKKOS_PATH} test" >> example/tutorial/Makefile echo "" >> example/tutorial/Makefile echo "clean:" >> example/tutorial/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/tutorial/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' KOKKOS_PATH=${KOKKOS_PATH} clean" >> example/tutorial/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/tutorial/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' KOKKOS_PATH=${KOKKOS_PATH} clean" >> example/tutorial/Makefile if [ ${#KOKKOS_ENABLE_EXAMPLE_ICHOL} -gt 0 ]; then echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > example/ichol/Makefile echo "" >> example/ichol/Makefile echo "all:" >> example/ichol/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/ichol/Makefile ${KOKKOS_SETTINGS}" >> example/ichol/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/ichol/Makefile ${KOKKOS_SETTINGS}" >> example/ichol/Makefile echo "" >> example/ichol/Makefile echo "test: all" >> example/ichol/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/ichol/Makefile ${KOKKOS_SETTINGS} test" >> example/ichol/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/ichol/Makefile ${KOKKOS_SETTINGS} test" >> example/ichol/Makefile echo "" >> example/ichol/Makefile echo "clean:" >> example/ichol/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/ichol/Makefile ${KOKKOS_SETTINGS} clean" >> example/ichol/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/ichol/Makefile ${KOKKOS_SETTINGS} clean" >> example/ichol/Makefile fi KOKKOS_SETTINGS="${KOKKOS_SETTINGS_NO_KOKKOS_PATH} KOKKOS_PATH=${KOKKOS_PATH}" @@ -385,62 +400,64 @@ echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > Makefile echo "" >> Makefile echo "kokkoslib:" >> Makefile echo -e "\tcd core; \\" >> Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} PREFIX=${KOKKOS_INSTALL_PATH} build-lib" >> Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} PREFIX=${KOKKOS_INSTALL_PATH} build-lib" >> Makefile echo "" >> Makefile echo "install: kokkoslib" >> Makefile echo -e "\tcd core; \\" >> Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} PREFIX=${KOKKOS_INSTALL_PATH} install" >> Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} PREFIX=${KOKKOS_INSTALL_PATH} install" >> Makefile echo "" >> Makefile echo "kokkoslib-test:" >> Makefile echo -e "\tcd core; \\" >> Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} PREFIX=${KOKKOS_TEST_INSTALL_PATH} build-lib" >> Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} PREFIX=${KOKKOS_TEST_INSTALL_PATH} build-lib" >> Makefile echo "" >> Makefile echo "install-test: kokkoslib-test" >> Makefile echo -e "\tcd core; \\" >> Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} PREFIX=${KOKKOS_TEST_INSTALL_PATH} install" >> Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} PREFIX=${KOKKOS_TEST_INSTALL_PATH} install" >> Makefile echo "" >> Makefile echo "build-test: install-test" >> Makefile -echo -e "\tmake -C core/unit_test" >> Makefile -echo -e "\tmake -C core/perf_test" >> Makefile -echo 
-e "\tmake -C containers/unit_tests" >> Makefile -echo -e "\tmake -C containers/performance_tests" >> Makefile -echo -e "\tmake -C algorithms/unit_tests" >> Makefile +echo -e "\t\$(MAKE) -C core/unit_test" >> Makefile +echo -e "\t\$(MAKE) -C core/perf_test" >> Makefile +echo -e "\t\$(MAKE) -C containers/unit_tests" >> Makefile +echo -e "\t\$(MAKE) -C containers/performance_tests" >> Makefile +echo -e "\t\$(MAKE) -C algorithms/unit_tests" >> Makefile if [ ${KOKKOS_DO_EXAMPLES} -gt 0 ]; then -echo -e "\tmake -C example/fixture" >> Makefile -echo -e "\tmake -C example/feint" >> Makefile -echo -e "\tmake -C example/fenl" >> Makefile -echo -e "\tmake -C example/tutorial build" >> Makefile +echo -e "\t\$(MAKE) -C example/fixture" >> Makefile +echo -e "\t\$(MAKE) -C example/feint" >> Makefile +echo -e "\t\$(MAKE) -C example/fenl" >> Makefile +echo -e "\t\$(MAKE) -C example/tutorial build" >> Makefile fi echo "" >> Makefile echo "test: build-test" >> Makefile -echo -e "\tmake -C core/unit_test test" >> Makefile -echo -e "\tmake -C core/perf_test test" >> Makefile -echo -e "\tmake -C containers/unit_tests test" >> Makefile -echo -e "\tmake -C containers/performance_tests test" >> Makefile -echo -e "\tmake -C algorithms/unit_tests test" >> Makefile +echo -e "\t\$(MAKE) -C core/unit_test test" >> Makefile +echo -e "\t\$(MAKE) -C core/perf_test test" >> Makefile +echo -e "\t\$(MAKE) -C containers/unit_tests test" >> Makefile +echo -e "\t\$(MAKE) -C containers/performance_tests test" >> Makefile +echo -e "\t\$(MAKE) -C algorithms/unit_tests test" >> Makefile if [ ${KOKKOS_DO_EXAMPLES} -gt 0 ]; then -echo -e "\tmake -C example/fixture test" >> Makefile -echo -e "\tmake -C example/feint test" >> Makefile -echo -e "\tmake -C example/fenl test" >> Makefile -echo -e "\tmake -C example/tutorial test" >> Makefile +echo -e "\t\$(MAKE) -C example/fixture test" >> Makefile +echo -e "\t\$(MAKE) -C example/feint test" >> Makefile +echo -e "\t\$(MAKE) -C example/fenl test" >> Makefile +echo -e "\t\$(MAKE) -C example/tutorial test" >> Makefile fi echo "" >> Makefile echo "unit-tests-only:" >> Makefile -echo -e "\tmake -C core/unit_test test" >> Makefile -echo -e "\tmake -C containers/unit_tests test" >> Makefile -echo -e "\tmake -C algorithms/unit_tests test" >> Makefile +echo -e "\t\$(MAKE) -C core/unit_test test" >> Makefile +echo -e "\t\$(MAKE) -C containers/unit_tests test" >> Makefile +echo -e "\t\$(MAKE) -C algorithms/unit_tests test" >> Makefile echo "" >> Makefile + echo "clean:" >> Makefile -echo -e "\tmake -C core/unit_test clean" >> Makefile -echo -e "\tmake -C core/perf_test clean" >> Makefile -echo -e "\tmake -C containers/unit_tests clean" >> Makefile -echo -e "\tmake -C containers/performance_tests clean" >> Makefile -echo -e "\tmake -C algorithms/unit_tests clean" >> Makefile +echo -e "\t\$(MAKE) -C core/unit_test clean" >> Makefile +echo -e "\t\$(MAKE) -C core/perf_test clean" >> Makefile +echo -e "\t\$(MAKE) -C containers/unit_tests clean" >> Makefile +echo -e "\t\$(MAKE) -C containers/performance_tests clean" >> Makefile +echo -e "\t\$(MAKE) -C algorithms/unit_tests clean" >> Makefile if [ ${KOKKOS_DO_EXAMPLES} -gt 0 ]; then -echo -e "\tmake -C example/fixture clean" >> Makefile -echo -e "\tmake -C example/feint clean" >> Makefile -echo -e "\tmake -C example/fenl clean" >> Makefile -echo -e "\tmake -C example/tutorial clean" >> Makefile +echo -e "\t\$(MAKE) -C example/fixture clean" >> Makefile +echo -e "\t\$(MAKE) -C example/feint clean" >> Makefile +echo -e "\t\$(MAKE) -C example/fenl clean" >> 
Makefile +echo -e "\t\$(MAKE) -C example/tutorial clean" >> Makefile fi echo -e "\tcd core; \\" >> Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} clean" >> Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} clean" >> Makefile + diff --git a/lib/mscg/Install.py b/lib/mscg/Install.py index 154f5aa522..76c986ef6d 100644 --- a/lib/mscg/Install.py +++ b/lib/mscg/Install.py @@ -65,13 +65,27 @@ def which(program): return None def geturl(url,fname): + success = False + if which('curl') != None: cmd = 'curl -L -o "%s" %s' % (fname,url) - elif which('wget') != None: + try: + subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) + success = True + except subprocess.CalledProcessError as e: + print("Calling curl failed with: %s" % e.output.decode('UTF-8')) + + if not success and which('wget') != None: cmd = 'wget -O "%s" %s' % (fname,url) - else: error("cannot find 'wget' or 'curl' to download source code") - txt = subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) - return txt + try: + subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) + success = True + except subprocess.CalledProcessError as e: + print("Calling wget failed with: %s" % e.output.decode('UTF-8')) + + if not success: + error("Failed to download source code with 'curl' or 'wget'") + return # parse args diff --git a/lib/smd/Install.py b/lib/smd/Install.py index 00891339d0..9247cb449b 100644 --- a/lib/smd/Install.py +++ b/lib/smd/Install.py @@ -65,13 +65,27 @@ def which(program): return None def geturl(url,fname): + success = False + if which('curl') != None: cmd = 'curl -L -o "%s" %s' % (fname,url) - elif which('wget') != None: + try: + subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) + success = True + except subprocess.CalledProcessError as e: + print("Calling curl failed with: %s" % e.output.decode('UTF-8')) + + if not success and which('wget') != None: cmd = 'wget -O "%s" %s' % (fname,url) - else: error("cannot find 'wget' or 'curl' to download source code") - txt = subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) - return txt + try: + subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) + success = True + except subprocess.CalledProcessError as e: + print("Calling wget failed with: %s" % e.output.decode('UTF-8')) + + if not success: + error("Failed to download source code with 'curl' or 'wget'") + return # parse args diff --git a/lib/voronoi/Install.py b/lib/voronoi/Install.py index 4998358d27..f40eb53bc6 100644 --- a/lib/voronoi/Install.py +++ b/lib/voronoi/Install.py @@ -64,13 +64,27 @@ def which(program): return None def geturl(url,fname): + success = False + if which('curl') != None: cmd = 'curl -L -o "%s" %s' % (fname,url) - elif which('wget') != None: + try: + subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) + success = True + except subprocess.CalledProcessError as e: + print("Calling curl failed with: %s" % e.output.decode('UTF-8')) + + if not success and which('wget') != None: cmd = 'wget -O "%s" %s' % (fname,url) - else: error("cannot find 'wget' or 'curl' to download source code") - txt = subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) - return txt + try: + subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) + success = True + except subprocess.CalledProcessError as e: + print("Calling wget failed with: %s" % e.output.decode('UTF-8')) + + if not success: + error("Failed to download source code with 'curl' or 'wget'") + return # 
parse args diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp index 22ec8dde3b..87db73bd12 100644 --- a/src/GPU/fix_gpu.cpp +++ b/src/GPU/fix_gpu.cpp @@ -71,6 +71,22 @@ static const char cite_gpu_package[] = " year = 2013,\n" " volume = 184,\n" " pages = {2785--2793}\n" + "}\n\n" + "@Article{Trung15,\n" + " author = {T. D. Nguyen, S. J. Plimpton},\n" + " title = {Accelerating dissipative particle dynamics simulations for soft matter systems},\n" + " journal = {Comput.~Mater.~Sci.},\n" + " year = 2015,\n" + " volume = 100,\n" + " pages = {173--180}\n" + "}\n\n" + "@Article{Trung17,\n" + " author = {T. D. Nguyen},\n" + " title = {GPU-accelerated Tersoff potentials for massively parallel Molecular Dynamics simulations},\n" + " journal = {Comp.~Phys.~Comm.},\n" + " year = 2017,\n" + " volume = 212,\n" + " pages = {113--122}\n" "}\n\n"; /* ---------------------------------------------------------------------- */ diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi index ac8279949a..6a4c4c14be 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi +++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi @@ -14,7 +14,7 @@ SHFLAGS = -fPIC DEPFLAGS = -M LINK = mpiicpc -LINKFLAGS = -g -qopenmp $(OPTFLAGS) +LINKFLAGS = -qopenmp $(OPTFLAGS) LIB = -ltbbmalloc SIZE = size diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich b/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich index 389a578f72..d4cbdbdb03 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich +++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich @@ -7,7 +7,7 @@ SHELL = /bin/sh # specify flags and libraries needed for your compiler CC = mpicxx -cxx=icc -OPTFLAGS = -xAVX -O2 -fp-model fast=2 -no-prec-div -qoverride-limits +OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \ -fno-alias -ansi-alias -restrict $(OPTFLAGS) SHFLAGS = -fPIC diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi b/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi index b65905440d..50433ce4c6 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi +++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi @@ -8,7 +8,7 @@ SHELL = /bin/sh export OMPI_CXX = icc CC = mpicxx -OPTFLAGS = -xAVX -O2 -fp-model fast=2 -no-prec-div -qoverride-limits +OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \ -fno-alias -ansi-alias -restrict $(OPTFLAGS) SHFLAGS = -fPIC diff --git a/src/Makefile b/src/Makefile index 7dfc2c312a..3b67d2284f 100644 --- a/src/Makefile +++ b/src/Makefile @@ -339,17 +339,18 @@ no-%: fi; # download/build/install a package library +# update the timestamp on main.cpp to trigger a relink with "make machine" lib-%: @if [ -e ../lib/$(LIBDIR)/Install.py ]; then \ echo "Installing lib $(@:lib-%=%)"; \ - cd ../lib/$(LIBDIR); $(PYTHON) Install.py $(args); \ + ( cd ../lib/$(LIBDIR); $(PYTHON) Install.py $(args) ); \ elif [ -e ../lib/$(LIBUSERDIR)/Install.py ]; then \ echo "Installing lib $(@:lib-user-%=%)"; \ - cd ../lib/$(LIBUSERDIR); $(PYTHON) Install.py $(args); \ + ( cd ../lib/$(LIBUSERDIR); $(PYTHON) Install.py $(args) ); \ else \ echo "Install script for lib $(@:lib-%=%) does not exist"; \ - fi; + fi; touch main.cpp # status = list src files that differ from package files # update = replace src files with newer package files diff --git a/src/USER-COLVARS/colvarproxy_lammps_version.h b/src/USER-COLVARS/colvarproxy_lammps_version.h index 834bd1748a..0eb6f2d95a 100644 --- 
a/src/USER-COLVARS/colvarproxy_lammps_version.h +++ b/src/USER-COLVARS/colvarproxy_lammps_version.h @@ -1,5 +1,5 @@ #ifndef COLVARPROXY_VERSION -#define COLVARPROXY_VERSION "2017-07-15" +#define COLVARPROXY_VERSION "2017-07-19" // This file is part of the Collective Variables module (Colvars). // The original version of Colvars and its updates are located at: // https://github.com/colvars/colvars diff --git a/src/USER-INTEL/Install.sh b/src/USER-INTEL/Install.sh index 736059aa06..275b4839f5 100644 --- a/src/USER-INTEL/Install.sh +++ b/src/USER-INTEL/Install.sh @@ -46,6 +46,7 @@ action npair_intel.h action npair_intel.cpp action intel_simd.h pair_sw_intel.cpp action intel_intrinsics.h pair_tersoff_intel.cpp +action intel_intrinsics_airebo.h pair_airebo_intel.cpp action verlet_lrt_intel.h pppm.cpp action verlet_lrt_intel.cpp pppm.cpp diff --git a/src/USER-INTEL/README b/src/USER-INTEL/README index c02014d0ce..3b84446057 100644 --- a/src/USER-INTEL/README +++ b/src/USER-INTEL/README @@ -4,9 +4,9 @@ -------------------------------- W. Michael Brown (Intel) michael.w.brown at intel.com + Markus Hohnerbach (RWTH Aachen University) William McDoniel (RWTH Aachen University) Rodrigo Canales (RWTH Aachen University) - Markus Höhnerbach (RWTH Aachen University) Stan Moore (Sandia) Ahmed E. Ismail (RWTH Aachen University) Paolo Bientinesi (RWTH Aachen University) diff --git a/src/USER-INTEL/TEST/README b/src/USER-INTEL/TEST/README index 758c37bf56..434189dd26 100644 --- a/src/USER-INTEL/TEST/README +++ b/src/USER-INTEL/TEST/README @@ -8,6 +8,7 @@ # in.intel.sw - Silicon benchmark with Stillinger-Weber # in.intel.tersoff - Silicon benchmark with Tersoff # in.intel.water - Coarse-grain water benchmark using Stillinger-Weber +# in.intel.airebo - Polyethylene benchmark with AIREBO # ############################################################################# @@ -24,6 +25,7 @@ # in.intel.sw - 132.4 161.9 # in.intel.tersoff - 83.3 101.1 # in.intel.water - 53.4 90.3 +# in.intel.airebo - 7.3 11.8 # ############################################################################# diff --git a/src/USER-INTEL/TEST/in.intel.airebo b/src/USER-INTEL/TEST/in.intel.airebo new file mode 100644 index 0000000000..fcd8af4707 --- /dev/null +++ b/src/USER-INTEL/TEST/in.intel.airebo @@ -0,0 +1,47 @@ +# AIREBO polyethylene benchmark + +variable N index on # Newton Setting +variable w index 10 # Warmup Timesteps +variable t index 550 # Main Run Timesteps +variable m index 1 # Main Run Timestep Multiplier +variable n index 0 # Use NUMA Mapping for Multi-Node +variable p index 0 # Use Power Measurement +variable x index 4 +variable y index 2 +variable z index 2 + +variable xx equal 17*$x +variable yy equal 16*$y +variable zz equal 2*$z +variable rr equal floor($t*$m) +variable root getenv LMP_ROOT + +newton $N +if "$n > 0" then "processors * * * grid numa" + +variable root getenv LMP_ROOT + +units metal +atom_style atomic + +read_data ${root}/examples/airebo/data.airebo + +replicate ${xx} ${yy} ${zz} + +neighbor 0.5 bin +neigh_modify delay 5 every 1 + +pair_style airebo 3.0 1 1 +pair_coeff * * ${root}/potentials/CH.airebo C H + +velocity all create 300.0 761341 + +fix 1 all nve +timestep 0.0005 + +thermo 50 + +if "$p > 0" then "run_style verlet/power" + +if "$w > 0" then "run $w" +run ${rr} diff --git a/src/USER-INTEL/TEST/in.intel.eam b/src/USER-INTEL/TEST/in.intel.eam index 5a3b3064af..6486b22ee9 100644 --- a/src/USER-INTEL/TEST/in.intel.eam +++ b/src/USER-INTEL/TEST/in.intel.eam @@ -5,7 +5,6 @@ variable w index 10 # Warmup
Timesteps variable t index 3100 # Main Run Timesteps variable m index 1 # Main Run Timestep Multiplier variable n index 0 # Use NUMA Mapping for Multi-Node -variable b index 3 # Neighbor binsize variable p index 0 # Use Power Measurement variable x index 4 diff --git a/src/USER-INTEL/TEST/in.intel.rhodo b/src/USER-INTEL/TEST/in.intel.rhodo index 05145d79c0..7ce7eb4452 100644 --- a/src/USER-INTEL/TEST/in.intel.rhodo +++ b/src/USER-INTEL/TEST/in.intel.rhodo @@ -5,7 +5,6 @@ variable w index 10 # Warmup Timesteps variable t index 520 # Main Run Timesteps variable m index 1 # Main Run Timestep Multiplier variable n index 0 # Use NUMA Mapping for Multi-Node -variable b index 3 # Neighbor binsize variable p index 0 # Use Power Measurement variable c index 0 # 1 to use collectives for PPPM variable d index 1 # 1 to use 'diff ad' for PPPM diff --git a/src/USER-INTEL/intel_buffers.cpp b/src/USER-INTEL/intel_buffers.cpp index 3664bc248b..b4b664cb94 100644 --- a/src/USER-INTEL/intel_buffers.cpp +++ b/src/USER-INTEL/intel_buffers.cpp @@ -30,6 +30,9 @@ IntelBuffers::IntelBuffers(class LAMMPS *lmp_in) : _off_map_listlocal = 0; _ccachex = 0; _ncache_alloc = 0; + _ncachetag = 0; + _cutneighsq = 0; + _cutneighghostsq = 0; #ifdef _LMP_INTEL_OFFLOAD _separate_buffers = 0; _off_f = 0; @@ -447,12 +450,17 @@ void IntelBuffers::free_ncache() flt_t *ncachez = _ncachez; int *ncachej = _ncachej; int *ncachejtype = _ncachejtype; + int *ncachetag = _ncachetag; #ifdef _LMP_INTEL_OFFLOAD if (_off_ncache) { #pragma offload_transfer target(mic:_cop) \ nocopy(ncachex,ncachey,ncachez,ncachej:alloc_if(0) free_if(1)) \ nocopy(ncachejtype:alloc_if(0) free_if(1)) + if (ncachetag) { + #pragma offload_transfer target(mic:_cop) \ + nocopy(ncachetag:alloc_if(0) free_if(1)) + } } _off_ncache = 0; #endif @@ -462,8 +470,10 @@ void IntelBuffers::free_ncache() lmp->memory->destroy(ncachez); lmp->memory->destroy(ncachej); lmp->memory->destroy(ncachejtype); - + if (ncachetag) + lmp->memory->destroy(ncachetag); _ncache_alloc = 0; + _ncachetag = 0; } } @@ -480,7 +490,7 @@ void IntelBuffers::grow_ncache(const int off_flag, const int vsize = _ncache_stride * nt; if (_ncache_alloc) { - if (vsize > _ncache_alloc) + if (vsize > _ncache_alloc || (need_tag() && _ncachetag == 0)) free_ncache(); #ifdef _LMP_INTEL_OFFLOAD else if (off_flag && _off_ncache == 0) @@ -495,6 +505,8 @@ void IntelBuffers::grow_ncache(const int off_flag, lmp->memory->create(_ncachez, vsize, "_ncachez"); lmp->memory->create(_ncachej, vsize, "_ncachej"); lmp->memory->create(_ncachejtype, vsize, "_ncachejtype"); + if (need_tag()) + lmp->memory->create(_ncachetag, vsize, "_ncachetag"); _ncache_alloc = vsize; @@ -513,6 +525,14 @@ void IntelBuffers::grow_ncache(const int off_flag, nocopy(ncachez,ncachej:length(vsize) alloc_if(1) free_if(0)) \ nocopy(ncachejtype:length(vsize) alloc_if(1) free_if(0)) } + int tsize = vsize; + if (!need_tag()) { + tsize = 16; + lmp->memory->create(_ncachetag, tsize, "_ncachetag"); + } + int *ncachetag = _ncachetag; + #pragma offload_transfer target(mic:_cop) \ + nocopy(ncachetag:length(tsize) alloc_if(1) free_if(0)) _off_ncache = 1; } #endif @@ -548,7 +568,8 @@ void IntelBuffers::fdotr_reduce(const int nall, /* ---------------------------------------------------------------------- */ template -void IntelBuffers::set_ntypes(const int ntypes) +void IntelBuffers::set_ntypes(const int ntypes, + const int use_ghost_cut) { if (ntypes != _ntypes) { if (_ntypes > 0) { @@ -558,16 +579,34 @@ void IntelBuffers::set_ntypes(const int ntypes) #pragma 
offload_transfer target(mic:_cop) \ nocopy(cutneighsqo:alloc_if(0) free_if(1)) } + flt_t * cutneighghostsqo; + if (_cutneighghostsq && _off_threads > 0 && cutneighghostsqo != 0) { + cutneighghostsqo = _cutneighghostsq[0]; + #pragma offload_transfer target(mic:_cop) \ + nocopy(cutneighghostsqo:alloc_if(0) free_if(1)) + } #endif lmp->memory->destroy(_cutneighsq); + if (_cutneighghostsq != 0) lmp->memory->destroy(_cutneighghostsq); } if (ntypes > 0) { lmp->memory->create(_cutneighsq, ntypes, ntypes, "_cutneighsq"); + if (use_ghost_cut) + lmp->memory->create(_cutneighghostsq, ntypes, ntypes, + "_cutneighghostsq"); #ifdef _LMP_INTEL_OFFLOAD flt_t * cutneighsqo = _cutneighsq[0]; + const int ntypes2 = ntypes * ntypes; if (_off_threads > 0 && cutneighsqo != NULL) { #pragma offload_transfer target(mic:_cop) \ - nocopy(cutneighsqo:length(ntypes * ntypes) alloc_if(1) free_if(0)) + nocopy(cutneighsqo:length(ntypes2) alloc_if(1) free_if(0)) + } + if (use_ghost_cut) { + flt_t * cutneighghostsqo = _cutneighghostsq[0]; + if (_off_threads > 0 && cutneighghostsqo != NULL) { + #pragma offload_transfer target(mic:_cop) \ + nocopy(cutneighghostsqo:length(ntypes2) alloc_if(1) free_if(0)) + } } #endif } diff --git a/src/USER-INTEL/intel_buffers.h b/src/USER-INTEL/intel_buffers.h index 7a7640a203..8040715b2e 100644 --- a/src/USER-INTEL/intel_buffers.h +++ b/src/USER-INTEL/intel_buffers.h @@ -109,12 +109,14 @@ class IntelBuffers { void free_ncache(); void grow_ncache(const int off_flag, const int nthreads); + void grow_ncachetag(const int off_flag, const int nthreads); inline int ncache_stride() { return _ncache_stride; } inline flt_t * get_ncachex() { return _ncachex; } inline flt_t * get_ncachey() { return _ncachey; } inline flt_t * get_ncachez() { return _ncachez; } inline int * get_ncachej() { return _ncachej; } inline int * get_ncachejtype() { return _ncachejtype; } + inline int * get_ncachetag() { return _ncachetag; } inline int get_max_nbors() { int mn = lmp->neighbor->oneatom * sizeof(int) / @@ -131,7 +133,7 @@ class IntelBuffers { _grow_nbor_list(list, nlocal, nthreads, offload_end, pack_width); } - void set_ntypes(const int ntypes); + void set_ntypes(const int ntypes, const int use_ghost_cut = 0); inline int * firstneigh(const NeighList *list) { return _list_alloc; } inline int * cnumneigh(const NeighList *list) { return _cnumneigh; } @@ -162,6 +164,7 @@ class IntelBuffers { inline void zero_ev() { for (int i = 0; i < 8; i++) _ev_global[i] = _ev_global_host[i] = 0.0; } inline flt_t ** get_cutneighsq() { return _cutneighsq; } + inline flt_t ** get_cutneighghostsq() { return _cutneighghostsq; } inline int get_off_threads() { return _off_threads; } #ifdef _LMP_INTEL_OFFLOAD inline void set_off_params(const int n, const int cop, @@ -274,13 +277,10 @@ class IntelBuffers { used_ghost * sizeof(flt_t)); } } + #endif inline int need_tag() { return _need_tag; } inline void need_tag(const int nt) { _need_tag = nt; } - #else - inline int need_tag() { return 0; } - inline void need_tag(const int nt) { } - #endif double memory_usage(const int nthreads); @@ -298,7 +298,7 @@ class IntelBuffers { int _list_alloc_atoms; int *_list_alloc, *_cnumneigh, *_atombin, *_binpacked; - flt_t **_cutneighsq; + flt_t **_cutneighsq, **_cutneighghostsq; int _ntypes; int _ccache_stride; @@ -307,7 +307,10 @@ class IntelBuffers { int _ncache_stride, _ncache_alloc; flt_t *_ncachex, *_ncachey, *_ncachez; - int *_ncachej, *_ncachejtype; + int *_ncachej, *_ncachejtype, *_ncachetag; + + int _need_tag, _host_nmax; + #ifdef LMP_USE_AVXCD int 
_ccache_stride3; acc_t * _ccachef; @@ -324,7 +327,6 @@ class IntelBuffers { int *_off_map_special, *_off_map_nspecial, *_off_map_tag; int *_off_map_numneigh; bool _off_list_alloc; - int _need_tag, _host_nmax; #endif int _buf_size, _buf_local_size; diff --git a/src/USER-INTEL/intel_intrinsics_airebo.h b/src/USER-INTEL/intel_intrinsics_airebo.h new file mode 100644 index 0000000000..7b091a4ba1 --- /dev/null +++ b/src/USER-INTEL/intel_intrinsics_airebo.h @@ -0,0 +1,2279 @@ +#ifndef LMP_INTEL_AIREBO_SCALAR +# ifdef __INTEL_COMPILER +# if defined(__MIC__) || defined(__AVX512F__) +# define LMP_INTEL_AIREBO_512 +# elif defined(__AVX__) +# define LMP_INTEL_AIREBO_256 +# else +# define LMP_INTEL_AIREBO_SCALAR +# endif +# else +# define LMP_INTEL_AIREBO_SCALAR +# endif +#endif + +#ifdef LMP_INTEL_AIREBO_512 + +#include <cassert> +#include <immintrin.h> + +#define VEC_INLINE __attribute__((always_inline)) + + +#ifndef FVEC_FIRST_PASS +# define FVEC_LEN 8 +# define FVEC_SUFFIX(a) a##pd +# define FVEC_SUFFIX_MASK(a) a##pd_mask +# define FVEC_MASK_T __mmask8 +# define FVEC_VEC_T __m512d +# define FVEC_SCAL_T double +# define IVEC_NAME ivec8 +# define FVEC_NAME fvec8pd +# define BVEC_NAME bvec8 +# define AVEC_NAME avec8pd +#else +# undef FVEC_LEN +# undef FVEC_SUFFIX +# undef FVEC_SUFFIX_MASK +# undef FVEC_MASK_T +# undef FVEC_VEC_T +# undef FVEC_SCAL_T +# undef IVEC_NAME +# undef FVEC_NAME +# undef BVEC_NAME +# undef AVEC_NAME + +# define FVEC_LEN 16 +# define FVEC_SUFFIX(a) a##ps +# define FVEC_SUFFIX_MASK(a) a##ps_mask +# define FVEC_MASK_T __mmask16 +# define FVEC_VEC_T __m512 +# define FVEC_SCAL_T float +# define IVEC_NAME ivec16 +# define FVEC_NAME fvec16ps +# define BVEC_NAME bvec16 +# define AVEC_NAME avec16ps +#endif + +namespace mm512 { + +#ifndef __AVX512F__ + +#ifndef FVEC_FIRST_PASS +VEC_INLINE static inline __m512i _mm512_mask_expand_epi32(__m512i src, + __mmask16 k, + __m512i a) { + int buf[16] __attribute__((aligned(64))); + _mm512_store_epi32(buf, a); + return _mm512_mask_loadunpacklo_epi32(src, k, buf); +} +VEC_INLINE static inline __m512i _mm512_maskz_expand_epi32(__mmask16 k, + __m512i a) { + int buf[16] __attribute__((aligned(64))); + _mm512_store_epi32(buf, a); + return _mm512_mask_loadunpacklo_epi32(_mm512_setzero_epi32(), k, buf); +} +VEC_INLINE static inline __m512i _mm512_mask_compress_epi32(__m512i src, + __mmask16 k, + __m512i a) { + int buf[16] __attribute__((aligned(64))); + _mm512_store_epi32(buf, src); + _mm512_mask_packstorelo_epi32(buf, k, a); + return _mm512_load_epi32(buf); +} +VEC_INLINE static inline __m512i _mm512_maskz_compress_epi32(__mmask16 k, + __m512i a) { + int buf[16] __attribute__((aligned(64))) = {0}; + _mm512_mask_packstorelo_epi32(buf, k, a); + return _mm512_load_epi32(buf); +} + +VEC_INLINE static inline void _mm512_mask_compressstoreu_epi32(int * dest, + __mmask16 mask, + __m512i src) { + _mm512_mask_packstorelo_epi32(dest, mask, src); + _mm512_mask_packstorehi_epi32(dest + 16, mask, src); +} + +VEC_INLINE static inline __m512i _mm512_mask_loadu_epi32(__m512i src, + __mmask16 k, + const int * mem_addr) { + assert((k & (k + 1)) == 0); + __m512i ret = _mm512_mask_loadunpacklo_epi32(src, k, mem_addr); + ret = _mm512_mask_loadunpackhi_epi32(ret, k, mem_addr + 16); + return ret; +} +VEC_INLINE static inline __m512i _mm512_maskz_loadu_epi32(__mmask16 k, + const int * mem_addr) { + assert((k & (k + 1)) == 0); + __m512i ret = _mm512_mask_loadunpacklo_epi32(_mm512_setzero_epi32(), k, + mem_addr); + ret = _mm512_mask_loadunpackhi_epi32(ret, k, mem_addr + 16); + return ret; +}
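(The wrappers above backfill the AVX-512 mask_compress, mask_expand, and compressstore primitives on first-generation Xeon Phi, where only the packstorelo/loadunpacklo style instructions exist, by bouncing values through an aligned stack buffer. For reference, the lane semantics being emulated can be written as a short Python model; the helper names are illustrative, not from the patch:

    def mask_compress(src, mask, a):
        # pack the mask-selected lanes of a to the front; other lanes keep src
        out, k = list(src), 0
        for i, keep in enumerate(mask):
            if keep:
                out[k] = a[i]
                k += 1
        return out

    def mask_expand(src, mask, a):
        # scatter the leading lanes of a into the mask-selected slots of src
        out, k = list(src), 0
        for i, keep in enumerate(mask):
            if keep:
                out[i] = a[k]
                k += 1
        return out

    # e.g. mask_compress([0]*4, [0, 1, 0, 1], [10, 11, 12, 13]) -> [11, 13, 0, 0]

Compress and expand are inverses of one another under the same mask, which is why the emulation can implement both with the one packstore/loadunpack pair.)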
+VEC_INLINE static inline void _mm512_mask_storeu_epi32(int * dest, + __mmask16 mask, + __m512i src) { + assert((mask & (mask + 1)) == 0); + _mm512_mask_packstorelo_epi32(dest, mask, src); + _mm512_mask_packstorehi_epi32(dest + 16, mask, src); +} +#endif + +VEC_INLINE static inline FVEC_VEC_T FVEC_SUFFIX(_mm512_mask_expand_) + (FVEC_VEC_T src, __mmask16 k, FVEC_VEC_T a) { + FVEC_SCAL_T buf[FVEC_LEN] __attribute__((aligned(64))); + FVEC_SUFFIX(_mm512_store_)(buf, a); + return FVEC_SUFFIX(_mm512_mask_loadunpacklo_)(src, k, buf); +} +VEC_INLINE static inline FVEC_VEC_T FVEC_SUFFIX(_mm512_maskz_expand_) + (__mmask16 k, FVEC_VEC_T a) { + FVEC_SCAL_T buf[FVEC_LEN] __attribute__((aligned(64))); + FVEC_SUFFIX(_mm512_store_)(buf, a); + return FVEC_SUFFIX(_mm512_mask_loadunpacklo_)(FVEC_SUFFIX(_mm512_setzero_)(), + k, buf); +} +VEC_INLINE static inline FVEC_VEC_T FVEC_SUFFIX(_mm512_mask_compress_) + (FVEC_VEC_T src, __mmask16 k, FVEC_VEC_T a) { + FVEC_SCAL_T buf[FVEC_LEN] __attribute__((aligned(64))); + FVEC_SUFFIX(_mm512_store_)(buf, src); + FVEC_SUFFIX(_mm512_mask_packstorelo_)(buf, k, a); + return FVEC_SUFFIX(_mm512_load_)(buf); +} +VEC_INLINE static inline FVEC_VEC_T FVEC_SUFFIX(_mm512_maskz_compress_) + (__mmask16 k, FVEC_VEC_T a) { + FVEC_SCAL_T buf[FVEC_LEN] __attribute__((aligned(64))) = {0}; + FVEC_SUFFIX(_mm512_mask_packstorelo_)(buf, k, a); + return FVEC_SUFFIX(_mm512_load_)(buf); +} +VEC_INLINE static inline void FVEC_SUFFIX(_mm512_mask_storeu_) + (FVEC_SCAL_T * dest, FVEC_MASK_T mask, FVEC_VEC_T src) { + assert((mask & (mask + 1)) == 0); + FVEC_SUFFIX(_mm512_mask_packstorelo_)(dest, mask, src); + FVEC_SUFFIX(_mm512_mask_packstorehi_)(dest + FVEC_LEN, mask, src); +} +#endif + + +class FVEC_NAME; +class IVEC_NAME; +class AVEC_NAME; +class BVEC_NAME { + friend class FVEC_NAME; + friend class IVEC_NAME; + friend class AVEC_NAME; +# if FVEC_LEN==16 + friend class avec16pd; +# endif + FVEC_MASK_T val_; + VEC_INLINE BVEC_NAME(const FVEC_MASK_T &v) : val_(v) {} +public: + VEC_INLINE BVEC_NAME() {} + VEC_INLINE static BVEC_NAME kand(const BVEC_NAME &a, const BVEC_NAME &b) { + return _mm512_kand(a.val_, b.val_); + } + VEC_INLINE static BVEC_NAME kandn(const BVEC_NAME &a, const BVEC_NAME &b) { + return _mm512_kandn(a.val_, b.val_); + } + VEC_INLINE static BVEC_NAME knot(const BVEC_NAME &a) { + return _mm512_knot(a.val_); + } + VEC_INLINE static int kortestz(const BVEC_NAME &a, const BVEC_NAME &b) { + return _mm512_kortestz(a.val_, b.val_); + } + VEC_INLINE static BVEC_NAME masku_compress(const BVEC_NAME &mask, + const BVEC_NAME &a) { + const __m512i c_i1 = _mm512_set1_epi32(1); + __m512i a_int_vec = _mm512_mask_blend_epi32(a.val_, _mm512_setzero_epi32(), + c_i1); + __m512i compressed = _mm512_mask_compress_epi32(_mm512_undefined_epi32(), + mask.val_, a_int_vec); + return _mm512_cmpeq_epi32_mask(compressed, c_i1); + } + VEC_INLINE static BVEC_NAME mask_expand(const BVEC_NAME &src, + const BVEC_NAME &mask, + const BVEC_NAME &a) { + const __m512i c_i1 = _mm512_set1_epi32(1); + __m512i a_int_vec = _mm512_mask_blend_epi32(a.val_, _mm512_setzero_epi32(), + c_i1); + __m512i src_int_vec = _mm512_mask_blend_epi32(src.val_, + _mm512_setzero_epi32(), c_i1); + __m512i compressed = _mm512_mask_expand_epi32(src_int_vec, mask.val_, + a_int_vec); + return _mm512_cmpeq_epi32_mask(compressed, c_i1); + } + VEC_INLINE static BVEC_NAME full() { + return static_cast<FVEC_MASK_T>(0xFFFF); + } + VEC_INLINE static BVEC_NAME empty() { + return 0; + } + VEC_INLINE static BVEC_NAME only(int n) { + return full().val_ >> (FVEC_LEN - n); +
} + VEC_INLINE static BVEC_NAME after(int n) { + return full().val_ << n; + } + VEC_INLINE static BVEC_NAME onlyafter(int only, int after) { + return (full().val_ >> (FVEC_LEN - only)) << after; + } + VEC_INLINE static int popcnt(const BVEC_NAME &a) { + return _popcnt32(a.val_); + } + VEC_INLINE static bool test_all_unset(const BVEC_NAME &a) { + return _mm512_kortestz(a.val_, a.val_); + } + VEC_INLINE static bool test_any_set(const BVEC_NAME &a) { + return ! test_all_unset(a); + } + VEC_INLINE static bool test_at(const BVEC_NAME &a, int i) { + assert(i < FVEC_LEN); + return a.val_ & (1 << i); + } + VEC_INLINE BVEC_NAME operator &(const BVEC_NAME &b) const { + return _mm512_kand(val_, b.val_); + } + VEC_INLINE BVEC_NAME operator |(const BVEC_NAME &b) const { + return _mm512_kor(val_, b.val_); + } + VEC_INLINE BVEC_NAME operator ~() const { + return _mm512_knot(val_); + } +}; + +class IVEC_NAME { + friend class FVEC_NAME; + friend class AVEC_NAME; +# if FVEC_LEN==16 + friend class avec16pd; +# endif + __m512i val_; + VEC_INLINE IVEC_NAME(const __m512i &v) : val_(v) {} +public: + static const int VL = 16; + VEC_INLINE IVEC_NAME() {} + + #define IVEC_MASK_BINFN_B(the_name) \ + VEC_INLINE static BVEC_NAME the_name(const IVEC_NAME &a, \ + const IVEC_NAME &b) { \ + return _mm512_##the_name##_epi32_mask(a.val_, b.val_); \ + } \ + VEC_INLINE static BVEC_NAME mask_##the_name( \ + const BVEC_NAME &mask, \ + const IVEC_NAME &a, \ + const IVEC_NAME &b \ + ) { \ + return _mm512_mask_##the_name##_epi32_mask( \ + mask.val_, a.val_, b.val_); \ + } + IVEC_MASK_BINFN_B(cmpeq) + IVEC_MASK_BINFN_B(cmplt) + IVEC_MASK_BINFN_B(cmpneq) + IVEC_MASK_BINFN_B(cmpgt) + + #define IVEC_MASK_BINFN_I(the_name) \ + VEC_INLINE static IVEC_NAME mask_##the_name( \ + const IVEC_NAME &src, const BVEC_NAME &mask, \ + const IVEC_NAME &a, const IVEC_NAME &b \ + ) { \ + return _mm512_mask_##the_name##_epi32( \ + src.val_, mask.val_, a.val_, b.val_); \ + } + IVEC_MASK_BINFN_I(add) + VEC_INLINE static IVEC_NAME mask_blend( + const BVEC_NAME &mask, const IVEC_NAME &a, const IVEC_NAME &b + ) { + return _mm512_mask_blend_epi32(mask.val_, a.val_, b.val_); + } + + #define IVEC_BINFN_I(the_name) \ + VEC_INLINE static IVEC_NAME the_name(const IVEC_NAME &a, \ + const IVEC_NAME &b) { \ + return _mm512_##the_name##_epi32(a.val_, b.val_); \ + } + IVEC_BINFN_I(mullo) + IVEC_BINFN_I(srlv) + VEC_INLINE static IVEC_NAME the_and(const IVEC_NAME &a, const IVEC_NAME &b) { + return _mm512_and_epi32(a.val_, b.val_); + } + + VEC_INLINE static IVEC_NAME mask_expand( + const IVEC_NAME &src, const BVEC_NAME &a, const IVEC_NAME &b + ) { + return _mm512_mask_expand_epi32(src.val_, + a.val_, b.val_); + } + VEC_INLINE static IVEC_NAME masku_compress( + const BVEC_NAME &a, const IVEC_NAME &b + ) { + return _mm512_mask_compress_epi32(_mm512_undefined_epi32(), a.val_, b.val_); + } + + VEC_INLINE static int at(const IVEC_NAME &a, int b) { + int data[16] __attribute__((aligned(64))); + _mm512_store_epi32(data, a.val_); + return data[b]; + } + + VEC_INLINE static IVEC_NAME load(const int * src) { + return _mm512_load_epi32(src); + } + VEC_INLINE static IVEC_NAME mask_loadu(const BVEC_NAME &mask, + const int * src) { + assert((mask.val_ & (mask.val_ + 1)) == 0); + assert(mask.val_ <= BVEC_NAME::full().val_); + return _mm512_mask_loadu_epi32(_mm512_undefined_epi32(), mask.val_, src); + } + VEC_INLINE static IVEC_NAME maskz_loadu(const BVEC_NAME &mask, + const int * src) { + assert((mask.val_ & (mask.val_ + 1)) == 0); + assert(mask.val_ <= BVEC_NAME::full().val_); + 
return _mm512_maskz_loadu_epi32(mask.val_, src); + } + VEC_INLINE static void mask_storeu(const BVEC_NAME &mask, int * dest, + const IVEC_NAME &src) { + assert((mask.val_ & (mask.val_ + 1)) == 0); + assert(mask.val_ <= BVEC_NAME::full().val_); + _mm512_mask_storeu_epi32(dest, mask.val_, src.val_); + } + VEC_INLINE static void store(int * dest, const IVEC_NAME &src) { + _mm512_store_epi32(dest, src.val_); + } + + VEC_INLINE static IVEC_NAME mask_gather( + const IVEC_NAME &src, const BVEC_NAME &mask, const IVEC_NAME &idx, + const int * mem, const int scale + ) { + assert(mask.val_ <= BVEC_NAME::full().val_); + assert(scale == sizeof(int)); + return _mm512_mask_i32gather_epi32(src.val_, mask.val_, idx.val_, mem, + sizeof(int)); + } + VEC_INLINE static void mask_i32scatter( + int * mem, const BVEC_NAME &mask, const IVEC_NAME &idx, + const IVEC_NAME &a, const int scale + ) { + assert(mask.val_ <= BVEC_NAME::full().val_); + assert(scale == sizeof(int)); + _mm512_mask_i32scatter_epi32(mem, mask.val_, idx.val_, a.val_, sizeof(int)); + } + + VEC_INLINE static void mask_compressstore(const BVEC_NAME &mask, int * dest, + const IVEC_NAME &src) { + _mm512_mask_compressstoreu_epi32(dest, mask.val_, src.val_); + } + + VEC_INLINE static IVEC_NAME set1(int i) { + return _mm512_set1_epi32(i); + } + VEC_INLINE static IVEC_NAME setzero() { + return _mm512_setzero_epi32(); + } + VEC_INLINE static IVEC_NAME undefined() { + return _mm512_undefined_epi32(); + } + + VEC_INLINE IVEC_NAME operator +(const IVEC_NAME &b) const { + return _mm512_add_epi32(this->val_, b.val_); + } + VEC_INLINE static void print(const char * str, const IVEC_NAME &a) { + int data[8] __attribute__((aligned(32))); + store(data, a); + printf("%s:", str); + for (int i = 0; i < FVEC_LEN; i++) { + printf(" %d", data[i]); + } + printf("\n"); + } +}; + +class FVEC_NAME { + friend class AVEC_NAME; +#if FVEC_LEN==16 + friend class avec16pd; +#endif + FVEC_VEC_T val_; + VEC_INLINE FVEC_NAME(const FVEC_VEC_T &v) : val_(v) {} +public: + static const int VL = FVEC_LEN; + VEC_INLINE FVEC_NAME() {} + VEC_INLINE static FVEC_SCAL_T at(const FVEC_NAME &a, int i) { + assert(i < FVEC_LEN); + FVEC_SCAL_T data[FVEC_LEN] __attribute__((aligned(64))); + FVEC_SUFFIX(_mm512_store_)(data, a.val_); + return data[i]; + } + VEC_INLINE static bool fast_compress() { return true; } + + #define FVEC_MASK_BINFN_B(the_name) \ + VEC_INLINE static BVEC_NAME the_name(const FVEC_NAME &a, \ + const FVEC_NAME &b) { \ + return FVEC_SUFFIX_MASK(_mm512_##the_name##_)(a.val_, b.val_); \ + } \ + VEC_INLINE static BVEC_NAME mask_##the_name( \ + const BVEC_NAME &mask, \ + const FVEC_NAME &a, const FVEC_NAME &b \ + ) { \ + return FVEC_SUFFIX_MASK(_mm512_mask_##the_name##_)( \ + mask.val_, a.val_, b.val_); \ + } + FVEC_MASK_BINFN_B(cmple) + FVEC_MASK_BINFN_B(cmplt) + FVEC_MASK_BINFN_B(cmpneq) + FVEC_MASK_BINFN_B(cmpnle) + FVEC_MASK_BINFN_B(cmpnlt) + + #define FVEC_UNFN_F(the_name) \ + VEC_INLINE static FVEC_NAME the_name(const FVEC_NAME &a) { \ + return FVEC_SUFFIX(_mm512_##the_name##_)(a.val_); \ + } + FVEC_UNFN_F(abs) + FVEC_UNFN_F(exp) + FVEC_UNFN_F(invsqrt) + FVEC_UNFN_F(recip) + FVEC_UNFN_F(sqrt) + + #define FVEC_MASK_UNFN_F(the_name) \ + VEC_INLINE static FVEC_NAME mask_##the_name( \ + const FVEC_NAME &src, const BVEC_NAME &mask, \ + const FVEC_NAME &a \ + ) { \ + return FVEC_SUFFIX(_mm512_mask_##the_name##_)( \ + src.val_, mask.val_, a.val_); \ + } + FVEC_MASK_UNFN_F(cos) + FVEC_MASK_UNFN_F(recip) + FVEC_MASK_UNFN_F(sqrt) + + #define FVEC_BINFN_F(the_name) \ + VEC_INLINE static 
FVEC_NAME the_name(const FVEC_NAME &a, \ + const FVEC_NAME &b) { \ + return FVEC_SUFFIX(_mm512_##the_name##_)(a.val_, b.val_); \ + } + FVEC_BINFN_F(max) + FVEC_BINFN_F(min) + + #define FVEC_MASK_BINFN_F(the_name) \ + VEC_INLINE static FVEC_NAME mask_##the_name( \ + const FVEC_NAME &src, const BVEC_NAME &mask, \ + const FVEC_NAME &a, const FVEC_NAME &b \ + ) { \ + return FVEC_SUFFIX(_mm512_mask_##the_name##_)( \ + src.val_, mask.val_, a.val_, b.val_); \ + } + FVEC_MASK_BINFN_F(add) + FVEC_MASK_BINFN_F(div) + FVEC_MASK_BINFN_F(mul) + FVEC_MASK_BINFN_F(sub) + VEC_INLINE static FVEC_NAME mask_blend( + const BVEC_NAME &mask, const FVEC_NAME &a, const FVEC_NAME &b + ) { + return FVEC_SUFFIX(_mm512_mask_blend_)(mask.val_, a.val_, b.val_); + } + + VEC_INLINE static FVEC_NAME mask_expand( + const FVEC_NAME &src, const BVEC_NAME &a, const FVEC_NAME &b + ) { + return FVEC_SUFFIX(_mm512_mask_expand_)(src.val_, + a.val_, b.val_); + } + VEC_INLINE static FVEC_NAME masku_compress( + const BVEC_NAME &a, const FVEC_NAME &b + ) { + return FVEC_SUFFIX(_mm512_mask_compress_)(FVEC_SUFFIX(_mm512_undefined_)(), + a.val_, b.val_); + } + + VEC_INLINE static FVEC_NAME set1(const FVEC_SCAL_T &a) { + return FVEC_SUFFIX(_mm512_set1_)(a); + } + VEC_INLINE static FVEC_NAME setzero() { + return FVEC_SUFFIX(_mm512_setzero_)(); + } + VEC_INLINE static FVEC_NAME undefined() { + return FVEC_SUFFIX(_mm512_undefined_)(); + } + + VEC_INLINE static FVEC_NAME load(const FVEC_SCAL_T *mem) { + return FVEC_SUFFIX(_mm512_load_)(mem); + } + VEC_INLINE static void mask_storeu(const BVEC_NAME &mask, FVEC_SCAL_T * dest, + const FVEC_NAME &a) { + FVEC_SUFFIX(_mm512_mask_storeu_)(dest, mask.val_, a.val_); + } + VEC_INLINE static void store(FVEC_SCAL_T * dest, const FVEC_NAME &a) { + FVEC_SUFFIX(_mm512_store_)(dest, a.val_); + } + + VEC_INLINE static FVEC_NAME gather(const IVEC_NAME &idx, + const FVEC_SCAL_T * mem, + const int scale) { + assert(scale == sizeof(FVEC_SCAL_T)); +# if FVEC_LEN==8 + return FVEC_SUFFIX(_mm512_i32logather_)(idx.val_, mem, sizeof(FVEC_SCAL_T)); +# else + return FVEC_SUFFIX(_mm512_i32gather_)(idx.val_, mem, sizeof(FVEC_SCAL_T)); +# endif + } + VEC_INLINE static FVEC_NAME mask_gather( + const FVEC_NAME &src, const BVEC_NAME &mask, const IVEC_NAME &idx, + const FVEC_SCAL_T * mem, const int scale + ) { + assert(scale == sizeof(FVEC_SCAL_T)); +# if FVEC_LEN==8 + return FVEC_SUFFIX(_mm512_mask_i32logather_)(src.val_, mask.val_, idx.val_, + mem, sizeof(FVEC_SCAL_T)); +# else + return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_, idx.val_, + mem, sizeof(FVEC_SCAL_T)); +# endif + } + + VEC_INLINE static void gather_3_adjacent(const IVEC_NAME &idx, + const FVEC_SCAL_T * mem, + const int scale, + FVEC_NAME * out_0, + FVEC_NAME * out_1, + FVEC_NAME * out_2) { + assert(scale == sizeof(FVEC_SCAL_T)); + *out_0 = FVEC_NAME::gather(idx, mem + 0, scale); + *out_1 = FVEC_NAME::gather(idx, mem + 1, scale); + *out_2 = FVEC_NAME::gather(idx, mem + 2, scale); + } + VEC_INLINE static void gather_4_adjacent(const IVEC_NAME &idx, + const FVEC_SCAL_T * mem, + const int scale, FVEC_NAME * out_0, + FVEC_NAME * out_1, + FVEC_NAME * out_2, + FVEC_NAME * out_3) { + assert(scale == sizeof(FVEC_SCAL_T)); + *out_0 = FVEC_NAME::gather(idx, mem + 0, scale); + *out_1 = FVEC_NAME::gather(idx, mem + 1, scale); + *out_2 = FVEC_NAME::gather(idx, mem + 2, scale); + *out_3 = FVEC_NAME::gather(idx, mem + 3, scale); + } + + VEC_INLINE static FVEC_SCAL_T mask_reduce_add(const BVEC_NAME &mask, + const FVEC_NAME &a) { + return 
FVEC_SUFFIX(_mm512_mask_reduce_add_)(mask.val_, a.val_); + } + VEC_INLINE static FVEC_SCAL_T reduce_add(const FVEC_NAME &a) { + return FVEC_SUFFIX(_mm512_reduce_add_)(a.val_); + } + + VEC_INLINE static IVEC_NAME unpackloepi32(const FVEC_NAME &a) { +# if FVEC_LEN==8 + return _mm512_maskz_compress_epi32(0x5555, _mm512_castpd_si512(a.val_)); +# else + return _mm512_castps_si512(a.val_); +# endif + } + + VEC_INLINE static FVEC_NAME mask_sincos( + FVEC_NAME * cos, const FVEC_NAME &src_a, const FVEC_NAME &src_b, + const BVEC_NAME &mask, const FVEC_NAME &arg + ) { + return FVEC_SUFFIX(_mm512_mask_sincos_)(&cos->val_, src_a.val_, src_b.val_, + mask.val_, arg.val_); + } + + #define FVEC_BINOP(the_sym, the_name) \ + VEC_INLINE inline FVEC_NAME operator the_sym(const FVEC_NAME &b) const { \ + return FVEC_SUFFIX(_mm512_##the_name##_)(this->val_, b.val_); \ + } + FVEC_BINOP(+, add) + FVEC_BINOP(-, sub) + FVEC_BINOP(*, mul) + FVEC_BINOP(/, div) + + VEC_INLINE static void gather_prefetch0(const IVEC_NAME &a, void * mem) { + #ifdef __AVX512PF__ + _mm512_mask_prefetch_i32gather_ps(a.val_, BVEC_NAME::full().val_, mem, + sizeof(FVEC_SCAL_T), _MM_HINT_T0); + #endif + } +}; + +class AVEC_NAME { + FVEC_VEC_T val_; + VEC_INLINE AVEC_NAME(const FVEC_VEC_T &a) : val_(a) {} +public: + VEC_INLINE AVEC_NAME(const FVEC_NAME &a) : val_(a.val_) {} + VEC_INLINE static AVEC_NAME undefined() { + return FVEC_SUFFIX(_mm512_undefined_)(); + } + VEC_INLINE static AVEC_NAME mask_gather( + const AVEC_NAME &src, const BVEC_NAME &mask, const IVEC_NAME &idx, + const FVEC_SCAL_T * mem, const int scale + ) { + assert(scale == sizeof(FVEC_SCAL_T)); +# if FVEC_LEN==8 + return FVEC_SUFFIX(_mm512_mask_i32logather_)(src.val_, mask.val_, idx.val_, + mem, sizeof(FVEC_SCAL_T)); +# else + return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_, idx.val_, + mem, sizeof(FVEC_SCAL_T)); +# endif + } + VEC_INLINE static void mask_i32loscatter( + FVEC_SCAL_T * mem, const BVEC_NAME &mask, const IVEC_NAME &idx, + const AVEC_NAME &a, const int scale + ) { + assert(scale == sizeof(FVEC_SCAL_T)); +# if FVEC_LEN==8 + FVEC_SUFFIX(_mm512_mask_i32loscatter_)(mem, mask.val_, idx.val_, a.val_, + sizeof(FVEC_SCAL_T)); +# else + FVEC_SUFFIX(_mm512_mask_i32scatter_)(mem, mask.val_, idx.val_, a.val_, + sizeof(FVEC_SCAL_T)); +# endif + } + + #define AVEC_BINOP(the_sym, the_name) \ + VEC_INLINE inline AVEC_NAME operator the_sym(const AVEC_NAME &b) const { \ + return FVEC_SUFFIX(_mm512_##the_name##_)(this->val_, b.val_); \ + } + AVEC_BINOP(-, sub) + + VEC_INLINE static void gather_prefetch0(const IVEC_NAME &a, void * mem) { + _mm512_mask_prefetch_i32gather_ps(a.val_, BVEC_NAME::full().val_, mem, + sizeof(FVEC_SCAL_T), _MM_HINT_T0); + } +}; + +#if FVEC_LEN==16 +class avec16pd { + __m512d lo_, hi_; + VEC_INLINE avec16pd(const __m512d &lo, const __m512d &hi) : lo_(lo), hi_(hi) + {} + VEC_INLINE static __mmask8 get_bvec_hi(__mmask16 a) { + return a >> 8; + } + VEC_INLINE static __m512i get_ivec_hi(__m512i a) { + return _mm512_permute4f128_epi32(a, _MM_PERM_BADC); + } +public: + VEC_INLINE avec16pd(const FVEC_NAME &a) { + lo_ = _mm512_cvtpslo_pd(a.val_); + hi_ = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(a.val_, _MM_PERM_BADC)); + } + VEC_INLINE static avec16pd undefined() { + return avec16pd(_mm512_undefined_pd(), _mm512_undefined_pd()); + } + VEC_INLINE static avec16pd mask_gather( + const avec16pd &src, const BVEC_NAME &mask, const IVEC_NAME &idx, + const double * mem, const int scale + ) { + assert(scale == sizeof(double)); + __m512d lo = 
_mm512_mask_i32logather_pd(src.lo_, mask.val_, idx.val_, mem, + sizeof(double)); + __m512d hi = _mm512_mask_i32logather_pd(src.hi_, get_bvec_hi(mask.val_), + get_ivec_hi(idx.val_), mem, + sizeof(double)); + return avec16pd(lo, hi); + } + VEC_INLINE static void mask_i32loscatter( + double * mem, const BVEC_NAME &mask, const IVEC_NAME &idx, + const avec16pd &a, const int scale + ) { + assert(scale == sizeof(double)); + _mm512_mask_i32loscatter_pd(mem, mask.val_, idx.val_, a.lo_, + sizeof(double)); + _mm512_mask_i32loscatter_pd(mem, get_bvec_hi(mask.val_), + get_ivec_hi(idx.val_), a.hi_, sizeof(double)); + } + + #define AVEC2_BINOP(the_sym, the_name) \ + VEC_INLINE inline avec16pd operator the_sym(const avec16pd &b) const { \ + __m512d lo = _mm512_##the_name##_pd(this->lo_, b.lo_); \ + __m512d hi = _mm512_##the_name##_pd(this->hi_, b.hi_); \ + return avec16pd(lo, hi); \ + } + AVEC2_BINOP(-, sub) + + VEC_INLINE static void gather_prefetch0(const IVEC_NAME &a, void * mem) { + _mm512_mask_prefetch_i32gather_ps(a.val_, BVEC_NAME::full().val_, mem, + sizeof(double), _MM_HINT_T0); + } +}; +#endif + +} + + +#ifdef FVEC_FIRST_PASS + +template<typename flt_t, typename acc_t> +struct intr_types; + +template<> +struct intr_types<double,double> { + typedef mm512::fvec8pd fvec; + typedef mm512::ivec8 ivec; + typedef mm512::bvec8 bvec; + typedef mm512::avec8pd avec; +}; + +template<> +struct intr_types<float,float> { + typedef mm512::fvec16ps fvec; + typedef mm512::ivec16 ivec; + typedef mm512::bvec16 bvec; + typedef mm512::avec16ps avec; +}; + +template<> +struct intr_types<float,double> { + typedef mm512::fvec16ps fvec; + typedef mm512::ivec16 ivec; + typedef mm512::bvec16 bvec; + typedef mm512::avec16pd avec; +}; + +#endif + + +#ifndef FVEC_FIRST_PASS +# define FVEC_FIRST_PASS +# include "intel_intrinsics_airebo.h" +#endif + +#endif + +#ifdef LMP_INTEL_AIREBO_256 + +#include <cassert> +#include <immintrin.h> +#include <stdint.h> + +#define VEC_INLINE __attribute__((always_inline)) + + +#ifndef FVEC_FIRST_PASS +# define FVEC_LEN 4 +# define FVEC_SUFFIX(a) a##pd +# define FVEC_MASK_T __m256d +# define FVEC_VEC_T __m256d +# define FVEC_SCAL_T double +# define IVEC_NAME ivec4 +# define FVEC_NAME fvec4pd +# define BVEC_NAME bvec4 +# define AVEC_NAME avec4pd +#else +# undef FVEC_LEN +# undef FVEC_SUFFIX +# undef FVEC_SUFFIX_MASK +# undef FVEC_MASK_T +# undef FVEC_VEC_T +# undef FVEC_SCAL_T +# undef IVEC_NAME +# undef FVEC_NAME +# undef BVEC_NAME +# undef AVEC_NAME + +# define FVEC_LEN 8 +# define FVEC_SUFFIX(a) a##ps +# define FVEC_MASK_T __m256 +# define FVEC_VEC_T __m256 +# define FVEC_SCAL_T float +# define IVEC_NAME ivec8 +# define FVEC_NAME fvec8ps +# define BVEC_NAME bvec8 +# define AVEC_NAME avec8ps +#endif + + + +namespace mm256 { + +//#define __AVX2__ __AVX2__ + +#if !defined(__AVX2__) && !defined(FVEC_FIRST_PASS) + +#define IVEC_EM_BIN(op) \ + __m128i a_lo = _mm256_castsi256_si128(a); \ + __m128i b_lo = _mm256_castsi256_si128(b); \ + __m128i a_hi = _mm256_extractf128_si256(a, 1); \ + __m128i b_hi = _mm256_extractf128_si256(b, 1); \ + __m128i c_lo = op(a_lo, b_lo); \ + __m128i c_hi = op(a_hi, b_hi); \ + __m256i ret = _mm256_setr_m128i(c_lo, c_hi); \ + return ret; + +VEC_INLINE inline __m256i _cm256_add_epi32(const __m256i &a, const __m256i &b) { + IVEC_EM_BIN(_mm_add_epi32) +} + +VEC_INLINE inline __m256i _cm256_and_si256(const __m256i &a, const __m256i &b) { + IVEC_EM_BIN(_mm_and_si128) +} + +VEC_INLINE inline __m256i _cm256_andnot_si256(const __m256i &a, + const __m256i &b) { + IVEC_EM_BIN(_mm_andnot_si128) +} + +VEC_INLINE inline __m256i _cm256_cmpeq_epi32(const __m256i &a, + const __m256i &b) { +
IVEC_EM_BIN(_mm_cmpeq_epi32) +} + +VEC_INLINE inline __m256i _cm256_cmpgt_epi32(const __m256i &a, + const __m256i &b) { + IVEC_EM_BIN(_mm_cmpgt_epi32) +} + +VEC_INLINE inline __m256i _cm256_cvtepu8_epi32(const __m128i &a) { + __m128i a_hi = _mm_castps_si128(_mm_permute_ps(_mm_castsi128_ps(a), 1)); + __m128i c_lo = _mm_cvtepu8_epi32(a); + __m128i c_hi = _mm_cvtepu8_epi32(a_hi); + __m256i ret = _mm256_setr_m128i(c_lo, c_hi); + return ret; + +} + +#define IVEC_EM_SCAL(op) \ + int buf_a[8] __attribute__((aligned(32))); \ + int buf_b[8] __attribute__((aligned(32))); \ + int dest[8] __attribute__((aligned(32))); \ + _mm256_store_si256((__m256i*)buf_a, a); \ + _mm256_store_si256((__m256i*)buf_b, b); \ + for (int i = 0; i < 8; i++) { \ + dest[i] = op; \ + } \ + return _mm256_load_si256((__m256i*) dest); + +VEC_INLINE inline __m256i _cm256_permutevar8x32_epi32(const __m256i &a, + const __m256i &b) { + IVEC_EM_SCAL(buf_a[buf_b[i]]) +} + +VEC_INLINE inline __m256i _cm256_mullo_epi32(__m256i a, __m256i b) { + IVEC_EM_BIN(_mm_mullo_epi32) +} + +VEC_INLINE inline __m256i _cm256_srlv_epi32(__m256i a, __m256i b) { + IVEC_EM_SCAL(buf_a[i] >> buf_b[i]) +} + + +VEC_INLINE inline __m256 _cm256_permutevar8x32_ps(const __m256 &a, + const __m256i &b) { + return _mm256_castsi256_ps(_cm256_permutevar8x32_epi32(_mm256_castps_si256(a), + b)); +} + +VEC_INLINE inline __m128i _cm_maskload_epi32(int const * mem, __m128i mask) { + return _mm_castps_si128(_mm_maskload_ps((float const *) mem, mask)); +} + +VEC_INLINE inline __m256i _cm256_maskload_epi32(int const * mem, __m256i mask) { + __m128i a_lo = _mm256_castsi256_si128(mask); + __m128i a_hi = _mm256_extractf128_si256(mask, 1); + __m128i c_lo = _cm_maskload_epi32(mem, a_lo); + __m128i c_hi = _cm_maskload_epi32(mem + 4, a_hi); + __m256i ret = _mm256_setr_m128i(c_lo, c_hi); + return ret; +} + + +VEC_INLINE inline __m256i _cm256_mask_i32gather_epi32(__m256i src, + int const * base_addr, + __m256i index, + __m256i mask, + const int scale) { + assert(scale == sizeof(int)); + int buf_index[8] __attribute__((aligned(32))); + int buf_mask[8] __attribute__((aligned(32))); + int dest[8] __attribute__((aligned(32))); + _mm256_store_si256((__m256i*)dest, src); + _mm256_store_si256((__m256i*)buf_index, index); + _mm256_store_si256((__m256i*)buf_mask, mask); + for (int i = 0; i < 8; i++) { + if (buf_mask[i]) dest[i] = base_addr[buf_index[i]]; + } + return _mm256_load_si256((__m256i*) dest); +} + +VEC_INLINE inline __m256 _cm256_mask_i32gather_ps(__m256 src, + float const * base_addr, + __m256i index, __m256 mask, + const int scale) { + return _mm256_castsi256_ps(_cm256_mask_i32gather_epi32( + _mm256_castps_si256(src), (const int *) base_addr, index, + _mm256_castps_si256(mask), scale)); +} + +VEC_INLINE inline __m256d _cm256_mask_i32gather_pd(__m256d src, + double const * base_addr, + __m128i index, __m256d mask, + const int scale) { + assert(scale == sizeof(double)); + int buf_index[4] __attribute__((aligned(32))); + int buf_mask[8] __attribute__((aligned(32))); + double dest[4] __attribute__((aligned(32))); + _mm256_store_pd(dest, src); + _mm_store_si128((__m128i*)buf_index, index); + _mm256_store_si256((__m256i*)buf_mask, _mm256_castpd_si256(mask)); + for (int i = 0; i < 4; i++) { + if (buf_mask[2*i]) dest[i] = base_addr[buf_index[i]]; + } + return _mm256_load_pd(dest); +} + +VEC_INLINE inline __m256i _cm256_i32gather_epi32(int const * base_addr, + __m256i index, + const int scale) { + assert(scale == sizeof(int)); + int buf_index[8] __attribute__((aligned(32))); + int dest[8] 
__attribute__((aligned(32))); + _mm256_store_si256((__m256i*)buf_index, index); + for (int i = 0; i < 8; i++) { + dest[i] = base_addr[buf_index[i]]; + } + return _mm256_load_si256((__m256i*) dest); +} + +VEC_INLINE inline __m256 _cm256_i32gather_ps(float const * base_addr, + __m256i index, const int scale) { + return _mm256_castsi256_ps(_cm256_i32gather_epi32((const int *) base_addr, + index, scale)); +} + +VEC_INLINE inline __m256d _cm256_i32gather_pd(double const * base_addr, + __m128i index, const int scale) { + assert(scale == sizeof(double)); + int buf_index[4] __attribute__((aligned(32))); + double dest[4] __attribute__((aligned(32))); + _mm_store_si128((__m128i*)buf_index, index); + for (int i = 0; i < 4; i++) { + dest[i] = base_addr[buf_index[i]]; + } + return _mm256_load_pd(dest); +} + +VEC_INLINE inline uint64_t _cdep_u64(uint64_t tmp, uint64_t mask) { + uint64_t dst = 0; + uint64_t k = 0; + const uint64_t one = 1; + const uint64_t zero = 0; + for (uint64_t m = 0; m < 64; m++) { + if (mask & (one << m)) { + dst |= static_cast<uint64_t>((tmp & (one << k)) != zero) << m; + k += 1; + } + } + return dst; +} + +VEC_INLINE inline uint64_t _cext_u64(uint64_t tmp, uint64_t mask) { + uint64_t dst = 0; + uint64_t k = 0; + const uint64_t one = 1; + const uint64_t zero = 0; + for (uint64_t m = 0; m < 64; m++) { + if (mask & (one << m)) { + dst |= static_cast<uint64_t>((tmp & (one << m)) != zero) << k; + k += 1; + } + } + return dst; +} + +#define _mm256_add_epi32 _cm256_add_epi32 +#define _mm256_and_si256 _cm256_and_si256 +#define _mm256_andnot_si256 _cm256_andnot_si256 +#define _mm256_cmpeq_epi32 _cm256_cmpeq_epi32 +#define _mm256_cmpgt_epi32 _cm256_cmpgt_epi32 +#define _mm256_permutevar8x32_epi32 _cm256_permutevar8x32_epi32 +#define _mm256_permutevar8x32_ps _cm256_permutevar8x32_ps +#define _mm_maskload_epi32 _cm_maskload_epi32 +#define _mm256_maskload_epi32 _cm256_maskload_epi32 +#define _mm256_mullo_epi32 _cm256_mullo_epi32 +#define _mm256_srlv_epi32 _cm256_srlv_epi32 +#define _mm256_mask_i32gather_epi32 _cm256_mask_i32gather_epi32 +#define _mm256_mask_i32gather_pd _cm256_mask_i32gather_pd +#define _mm256_mask_i32gather_ps _cm256_mask_i32gather_ps +#define _mm256_i32gather_epi32 _cm256_i32gather_epi32 +#define _mm256_i32gather_pd _cm256_i32gather_pd +#define _mm256_i32gather_ps _cm256_i32gather_ps +#define _pdep_u64 _cdep_u64 +#define _pext_u64 _cext_u64 +#define _mm256_cvtepu8_epi32 _cm256_cvtepu8_epi32 + +#endif + +#ifndef FVEC_FIRST_PASS + +VEC_INLINE inline __m256 _mm256_compress_ps(__m256 mask, __m256 a) { +# ifdef __AVX2__ + uint64_t expanded_mask = _pdep_u64(_mm256_movemask_ps(mask), + 0x0101010101010101); + // unpack each bit to a byte + expanded_mask *= 0xFF; // mask |= mask<<1 | mask<<2 | ...
| mask<<7; + // the identity shuffle for vpermps, packed to one index per byte + const uint64_t identity_indices = 0x0706050403020100; + uint64_t wanted_indices = _pext_u64(identity_indices, expanded_mask); + + __m128i bytevec = _mm_cvtsi64_si128(wanted_indices); + __m256i shufmask = _mm256_cvtepu8_epi32(bytevec); + + return _mm256_permutevar8x32_ps(a, shufmask); +# else + int mask_buf[8] __attribute__((aligned(32))); + float a_buf[8] __attribute__((aligned(32))); + float dst_buf[8] __attribute__((aligned(32))); + _mm256_store_si256((__m256i*) mask_buf, _mm256_castps_si256(mask)); + _mm256_store_ps(a_buf, a); + int k = 0; + for (int i = 0; i < 8; i++) { + if (mask[i]) { + dst_buf[k++] = a_buf[i]; + } + } + return _mm256_load_ps(dst_buf); +# endif +} +VEC_INLINE inline __m256 _mm256_expand_ps(__m256 mask, __m256 a) { +# ifdef __AVX2__ + uint64_t expanded_mask = _pdep_u64(_mm256_movemask_ps(mask), + 0x0101010101010101); + expanded_mask *= 0xFF; + const uint64_t identity_indices = 0x0706050403020100; + uint64_t wanted_indices = _pdep_u64(identity_indices, expanded_mask); + __m128i bytevec = _mm_cvtsi64_si128(wanted_indices); + __m256i shufmask = _mm256_cvtepu8_epi32(bytevec); + return _mm256_permutevar8x32_ps(a, shufmask); +# else + int mask_buf[8] __attribute__((aligned(32))); + float a_buf[8] __attribute__((aligned(32))); + float dst_buf[8] __attribute__((aligned(32))) = {0}; + _mm256_store_si256((__m256i*) mask_buf, _mm256_castps_si256(mask)); + _mm256_store_ps(a_buf, a); + int k = 0; + for (int i = 0; i < 8; i++) { + if (mask[i]) { + dst_buf[i] = a_buf[k++]; + } + } + return _mm256_load_ps(dst_buf); +# endif +} + +VEC_INLINE inline __m256d _mm256_compress_pd(__m256d mask, __m256d a) { + return _mm256_castps_pd(_mm256_compress_ps(_mm256_castpd_ps(mask), + _mm256_castpd_ps(a))); +} +VEC_INLINE inline __m256d _mm256_expand_pd(__m256d mask, __m256d a) { + return _mm256_castps_pd(_mm256_expand_ps(_mm256_castpd_ps(mask), + _mm256_castpd_ps(a))); +} +#endif + + +class FVEC_NAME; +class IVEC_NAME; +class AVEC_NAME; +class BVEC_NAME { + friend class FVEC_NAME; + friend class IVEC_NAME; + friend class AVEC_NAME; +# if FVEC_LEN==8 + friend class avec8pd; +# endif + FVEC_MASK_T val_; + VEC_INLINE BVEC_NAME(const FVEC_MASK_T &v) : val_(v) {} + VEC_INLINE BVEC_NAME(const __m256i &v) : val_(FVEC_SUFFIX(_mm256_castsi256_) + (v)) {} +public: + VEC_INLINE BVEC_NAME() {} + VEC_INLINE static BVEC_NAME kand(const BVEC_NAME &a, const BVEC_NAME &b) { + return FVEC_SUFFIX(_mm256_and_)(a.val_, b.val_); + } + VEC_INLINE static BVEC_NAME kandn(const BVEC_NAME &a, const BVEC_NAME &b) { + return FVEC_SUFFIX(_mm256_andnot_)(a.val_, b.val_); + } + VEC_INLINE static BVEC_NAME masku_compress(const BVEC_NAME &mask, + const BVEC_NAME &a) { + return FVEC_SUFFIX(_mm256_compress_)(mask.val_, a.val_); + } + VEC_INLINE static BVEC_NAME mask_expand(const BVEC_NAME &src, + const BVEC_NAME &mask, + const BVEC_NAME &a) { + FVEC_MASK_T ret = FVEC_SUFFIX(_mm256_expand_)(mask.val_, a.val_); + ret = FVEC_SUFFIX(_mm256_and_)(mask.val_, ret); + ret = FVEC_SUFFIX(_mm256_or_)(ret, FVEC_SUFFIX(_mm256_andnot_) + (mask.val_, src.val_)); + return ret; + } + VEC_INLINE static BVEC_NAME full() { + __m256i a = _mm256_undefined_si256(); + return FVEC_SUFFIX(_mm256_castsi256_)(_mm256_cmpeq_epi32(a, a)); + } + VEC_INLINE static BVEC_NAME empty() { + return FVEC_SUFFIX(_mm256_setzero_)(); + } + VEC_INLINE static BVEC_NAME only(int n) { + static const unsigned int FULL_ps = (unsigned int) -1; + static const unsigned int LUT_ps[9][8] = { + {0, 0, 0, 0, 
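(The BMI2 branch of _mm256_compress_ps above turns the 8-bit lane mask from _mm256_movemask_ps into byte-granular shuffle indices for _mm256_permutevar8x32_ps, using _pdep_u64/_pext_u64, with _cdep_u64/_cext_u64 as the scalar fallback defined earlier. A small Python sketch of that index computation, under the same 8-lane layout; the function names are illustrative:

    def pdep(value, mask):
        # deposit the low bits of value into the set-bit positions of mask
        out, k = 0, 0
        for m in range(64):
            if mask >> m & 1:
                out |= (value >> k & 1) << m
                k += 1
        return out

    def pext(value, mask):
        # gather the bits of value at the set positions of mask, packed low
        out, k = 0, 0
        for m in range(64):
            if mask >> m & 1:
                out |= (value >> m & 1) << k
                k += 1
        return out

    def compress_indices(lane_mask):
        expanded = pdep(lane_mask, 0x0101010101010101) * 0xFF  # bit i -> byte i
        wanted = pext(0x0706050403020100, expanded)  # keep the selected indices
        return [wanted >> (8 * i) & 0xFF for i in range(8)]

    # e.g. compress_indices(0b1010) -> [1, 3, 0, 0, 0, 0, 0, 0]

Feeding those indices to the permute moves the selected lanes to the front of the vector, which is exactly the compress semantics the AVX-512 path gets from a single instruction.)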
0, 0, 0, 0}, + {FULL_ps, 0, 0, 0, 0, 0, 0, 0}, + {FULL_ps, FULL_ps, 0, 0, 0, 0, 0, 0}, + {FULL_ps, FULL_ps, FULL_ps, 0, 0, 0, 0, 0}, + {FULL_ps, FULL_ps, FULL_ps, FULL_ps, 0, 0, 0, 0}, + {FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, 0, 0, 0}, + {FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, 0, 0}, + {FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, 0}, + {FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps}, + }; + static const unsigned long long FULL_pd = (unsigned long long) -1; + static const unsigned long long LUT_pd[5][4] = { + {0, 0, 0, 0}, + {FULL_pd, 0, 0, 0}, + {FULL_pd, FULL_pd, 0, 0}, + {FULL_pd, FULL_pd, FULL_pd, 0}, + {FULL_pd, FULL_pd, FULL_pd, FULL_pd}, + }; + return FVEC_SUFFIX(_mm256_load_)((const FVEC_SCAL_T*) FVEC_SUFFIX(LUT_)[n]); + } + VEC_INLINE static BVEC_NAME after(int n) { + static const unsigned int FULL_ps = (unsigned int) -1; + static const unsigned int LUT_ps[9][8] = { + {FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps}, + {0, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps}, + {0, 0, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps}, + {0, 0, 0, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps}, + {0, 0, 0, 0, FULL_ps, FULL_ps, FULL_ps, FULL_ps}, + {0, 0, 0, 0, 0, FULL_ps, FULL_ps, FULL_ps}, + {0, 0, 0, 0, 0, 0, FULL_ps, FULL_ps}, + {0, 0, 0, 0, 0, 0, 0, FULL_ps}, + {0, 0, 0, 0, 0, 0, 0, 0}, + }; + static const unsigned long long FULL_pd = (unsigned long long) -1; + static const unsigned long long LUT_pd[5][4] = { + {FULL_pd, FULL_pd, FULL_pd, FULL_pd}, + {0, FULL_pd, FULL_pd, FULL_pd}, + {0, 0, FULL_pd, FULL_pd}, + {0, 0, 0, FULL_pd}, + {0, 0, 0, 0}, + }; + return FVEC_SUFFIX(_mm256_load_)((const FVEC_SCAL_T*) FVEC_SUFFIX(LUT_)[n]); + } + VEC_INLINE static BVEC_NAME onlyafter(int only_, int after_) { + return kand(after(after_), only(after_ + only_)); + } + VEC_INLINE static int popcnt(const BVEC_NAME &a) { + return _popcnt32(FVEC_SUFFIX(_mm256_movemask_)(a.val_)); + } + VEC_INLINE static bool test_all_unset(const BVEC_NAME &a) { + return FVEC_SUFFIX(_mm256_testz_)(a.val_, a.val_); + } + VEC_INLINE static bool test_any_set(const BVEC_NAME &a) { + return ! 
test_all_unset(a); + } + VEC_INLINE static bool test_at(const BVEC_NAME &a, int i) { + assert(i < FVEC_LEN); + return FVEC_SUFFIX(_mm256_movemask_)(a.val_) & (1 << i); + } + VEC_INLINE BVEC_NAME operator &(const BVEC_NAME &b) const { + return FVEC_SUFFIX(_mm256_and_)(val_, b.val_); + } + VEC_INLINE BVEC_NAME operator |(const BVEC_NAME &b) const { + return FVEC_SUFFIX(_mm256_or_)(val_, b.val_); + } + VEC_INLINE BVEC_NAME operator ~() const { + return FVEC_SUFFIX(_mm256_andnot_)(val_, full().val_); + } +}; + +class IVEC_NAME { + friend class FVEC_NAME; + friend class AVEC_NAME; +# if FVEC_LEN==8 + friend class avec8pd; +# endif + __m256i val_; + VEC_INLINE IVEC_NAME(const __m256i &v) : val_(v) {} + VEC_INLINE static __m256i to(const FVEC_VEC_T &a) { +# if FVEC_LEN==4 + return _mm256_castpd_si256(a); +# else + return _mm256_castps_si256(a); +# endif + } + VEC_INLINE static FVEC_VEC_T from(const __m256i &a) { + return FVEC_SUFFIX(_mm256_castsi256_)(a); + } +public: + static const int VL = 8; + VEC_INLINE IVEC_NAME() {} + + #define IVEC_MASK_BINFN_B(the_name) \ + VEC_INLINE static BVEC_NAME the_name(const IVEC_NAME &a, \ + const IVEC_NAME &b) { \ + return _mm256_##the_name##_epi32(a.val_, b.val_); \ + } \ + VEC_INLINE static BVEC_NAME mask_##the_name( \ + const BVEC_NAME &mask, \ + const IVEC_NAME &a, const IVEC_NAME &b \ + ) { \ + BVEC_NAME ret = _mm256_##the_name##_epi32( \ + a.val_, b.val_); \ + return mask & ret; \ + } + IVEC_MASK_BINFN_B(cmpeq) + IVEC_MASK_BINFN_B(cmpgt) + + VEC_INLINE static __m256i _mm256_cmplt_epi32(__m256i a, __m256i b) { + __m256i le = _mm256_cmpgt_epi32(b, a); + __m256i eq = _mm256_cmpeq_epi32(a, b); + return _mm256_andnot_si256(eq, le); + } + + VEC_INLINE static __m256i _mm256_cmpneq_epi32(__m256i a, __m256i b) { + __m256i eq = _mm256_cmpeq_epi32(a, b); + __m256i t = _mm256_undefined_si256(); + __m256i f = _mm256_cmpeq_epi32(t, t); + return _mm256_andnot_si256(eq, f); + } + + IVEC_MASK_BINFN_B(cmplt) + IVEC_MASK_BINFN_B(cmpneq) + #undef IVEC_MASK_BINFN_B + + VEC_INLINE static IVEC_NAME mask_blend( + const BVEC_NAME &mask, const IVEC_NAME &a, const IVEC_NAME &b + ) { + return to(FVEC_SUFFIX(_mm256_blendv_)(from(a.val_), from(b.val_), + mask.val_)); + } + #define IVEC_MASK_BINFN_I(the_name) \ + VEC_INLINE static IVEC_NAME mask_##the_name( \ + const IVEC_NAME &src, const BVEC_NAME &mask, \ + const IVEC_NAME &a, const IVEC_NAME &b \ + ) { \ + IVEC_NAME ret = _mm256_##the_name##_epi32( \ + a.val_, b.val_); \ + return mask_blend(mask, src, ret); \ + } + IVEC_MASK_BINFN_I(add) + #undef IVEC_MASK_BINFN_I + + #define IVEC_BINFN_I(the_name) \ + VEC_INLINE static IVEC_NAME the_name(const IVEC_NAME &a, \ + const IVEC_NAME &b) { \ + return _mm256_##the_name##_epi32(a.val_, b.val_); \ + } + IVEC_BINFN_I(mullo) + IVEC_BINFN_I(srlv) + #undef IVEC_BINFN_I + VEC_INLINE static IVEC_NAME the_and(const IVEC_NAME &a, const IVEC_NAME &b) { + return _mm256_and_si256(a.val_, b.val_); + } + + VEC_INLINE static IVEC_NAME masku_compress(const BVEC_NAME &mask, + const IVEC_NAME &b) { + return to(FVEC_SUFFIX(_mm256_compress_)(mask.val_, from(b.val_))); + } + VEC_INLINE static IVEC_NAME mask_expand( + const IVEC_NAME &src, const BVEC_NAME &mask, const IVEC_NAME &b + ) { + FVEC_VEC_T ret = FVEC_SUFFIX(_mm256_expand_)(mask.val_, from(b.val_)); + ret = FVEC_SUFFIX(_mm256_and_)(mask.val_, ret); + ret = FVEC_SUFFIX(_mm256_or_)(ret, FVEC_SUFFIX(_mm256_andnot_) + (mask.val_, from(src.val_))); + return to(ret); + } + + VEC_INLINE static void store(int * dest, const IVEC_NAME &src) { + 
_mm256_store_si256((__m256i*)dest, src.val_); +# if FVEC_LEN==4 + dest[1] = dest[2]; + dest[2] = dest[4]; + dest[3] = dest[6]; +# endif + } + + VEC_INLINE static int at(const IVEC_NAME &a, int b) { + int data[8] __attribute__((aligned(32))); + store(data, a); + return data[b]; + } + + VEC_INLINE static void print(const char * str, const IVEC_NAME &a) { + int data[8] __attribute__((aligned(32))); + store(data, a); + printf("%s:", str); + for (int i = 0; i < FVEC_LEN; i++) { + printf(" %d", data[i]); + } + printf("\n"); + } + + VEC_INLINE static IVEC_NAME maskz_loadu(const BVEC_NAME &mask, + const int * src) { + FVEC_VEC_T mask_val = mask.val_; +# if FVEC_LEN==4 +# ifdef __AVX2__ + static const unsigned int mask_shuffle[8] __attribute__((aligned(32))) = + {0, 2, 4, 6, 0, 0, 0, 0}; + __m256 m = _mm256_castpd_ps(mask_val); + m = _mm256_permutevar8x32_ps(m, _mm256_load_si256((__m256i*)mask_shuffle)); + __m128i ret = _mm_maskload_epi32(src, + _mm256_castsi256_si128(_mm256_castps_si256(m))); + static const unsigned int load_shuffle[8] __attribute__((aligned(32))) = + {0, 0, 1, 1, 2, 2, 3, 3}; + return _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(ret), + _mm256_load_si256((__m256i*)load_shuffle)); +# else + int dest[8] __attribute__((aligned(32))) = {0}; + int mask_buf[8] __attribute__((aligned(32))); + _mm256_store_pd((double*) mask_buf, mask.val_); + for (int i = 0; i < 4; i++) { + if (mask_buf[2*i]) { + int val = src[i]; + dest[2*i+0] = val; + dest[2*i+1] = val; + } + } + return _mm256_load_si256((__m256i*) dest); +# endif +# else + return _mm256_maskload_epi32(src, to(mask_val)); +# endif + } + + VEC_INLINE static IVEC_NAME mask_gather( + const IVEC_NAME &src, const BVEC_NAME &mask, const IVEC_NAME &idx, + const int * mem, const int scale + ) { + assert(scale == sizeof(int)); + return _mm256_mask_i32gather_epi32(src.val_, mem, idx.val_, to(mask.val_), + sizeof(int)); + } + + VEC_INLINE static void mask_compressstore(const BVEC_NAME &mask, int * dest, + const IVEC_NAME &src) { + int buf[8] __attribute__((aligned(64))); + const int stride = FVEC_LEN==4 ? 
2 : 1; + _mm256_store_si256((__m256i*)buf, src.val_); + int mask_val = FVEC_SUFFIX(_mm256_movemask_)(mask.val_); + int k = 0; + #pragma unroll + for (int i = 0; i < FVEC_LEN; i++) { + if (mask_val & (1 << i)) + dest[k++] = buf[stride*i]; + } + } + + VEC_INLINE static IVEC_NAME set1(int i) { + return _mm256_set1_epi32(i); + } + VEC_INLINE static IVEC_NAME setzero() { + return _mm256_setzero_si256(); + } + VEC_INLINE static IVEC_NAME undefined() { + return _mm256_undefined_si256(); + } + + VEC_INLINE IVEC_NAME operator +(const IVEC_NAME &b) const { + return _mm256_add_epi32(this->val_, b.val_); + } +}; + +class FVEC_NAME { + friend class AVEC_NAME; +#if FVEC_LEN==8 + friend class avec8pd; +#endif + FVEC_VEC_T val_; + VEC_INLINE FVEC_NAME(const FVEC_VEC_T &v) : val_(v) {} +public: + static const int VL = FVEC_LEN; +# if defined(__AVX2__) || defined(__MIC__) || defined(__AVX512F__) + VEC_INLINE static bool fast_compress() { return true; } +# else + VEC_INLINE static bool fast_compress() { return false; } +# endif + VEC_INLINE FVEC_NAME() {} + VEC_INLINE static FVEC_SCAL_T at(const FVEC_NAME &a, int i) { + assert(i < FVEC_LEN); + FVEC_SCAL_T data[FVEC_LEN] __attribute__((aligned(64))); + FVEC_SUFFIX(_mm256_store_)(data, a.val_); + return data[i]; + } + + #define FVEC_MASK_BINFN_B(the_name, the_imm) \ + VEC_INLINE static BVEC_NAME the_name(const FVEC_NAME &a, \ + const FVEC_NAME &b) { \ + return FVEC_SUFFIX(_mm256_cmp_)(a.val_, b.val_, the_imm); \ + } \ + VEC_INLINE static BVEC_NAME mask_##the_name( \ + const BVEC_NAME &mask, \ + const FVEC_NAME &a, const FVEC_NAME &b \ + ) { \ + BVEC_NAME ret = FVEC_SUFFIX(_mm256_cmp_)( \ + a.val_, b.val_, the_imm); \ + return mask & ret; \ + } + FVEC_MASK_BINFN_B(cmple, _CMP_LE_OS) + FVEC_MASK_BINFN_B(cmplt, _CMP_LT_OS) + FVEC_MASK_BINFN_B(cmpneq, _CMP_NEQ_UQ) + FVEC_MASK_BINFN_B(cmpnle, _CMP_NLE_US) + FVEC_MASK_BINFN_B(cmpnlt, _CMP_NLT_US) + #undef FVEC_MASK_BINFN_B + + VEC_INLINE static __m256d _mm256_recip_pd(__m256d a) { + __m256d c_1 = _mm256_set1_pd(1); + return _mm256_div_pd(c_1, a); + } + VEC_INLINE static __m256 _mm256_recip_ps(__m256 a) { + return _mm256_rcp_ps(a); + } + VEC_INLINE static __m256d _mm256_abs_pd(__m256d a) { + const unsigned long long abs_mask = 0x7FFFFFFFFFFFFFFF; + const unsigned long long abs_full[8] = + {abs_mask, abs_mask, abs_mask, abs_mask, abs_mask, abs_mask, abs_mask, + abs_mask}; + return _mm256_and_pd(_mm256_load_pd((double*)abs_full), a); + } + VEC_INLINE static __m256 _mm256_abs_ps(__m256 a) { + const unsigned long long abs_mask = 0x7FFFFFFF; + const unsigned long long abs_full[16] = + {abs_mask, abs_mask, abs_mask, abs_mask, abs_mask, abs_mask, abs_mask, + abs_mask, abs_mask, abs_mask, abs_mask, abs_mask, abs_mask, + abs_mask, abs_mask, abs_mask}; + return _mm256_and_ps(_mm256_load_ps((float*)abs_full), a); + } + + #define FVEC_UNFN_F(the_name) \ + VEC_INLINE static FVEC_NAME the_name(const FVEC_NAME &a) { \ + return FVEC_SUFFIX(_mm256_##the_name##_)(a.val_); \ + } + FVEC_UNFN_F(abs) + FVEC_UNFN_F(exp) + FVEC_UNFN_F(invsqrt) + FVEC_UNFN_F(recip) + FVEC_UNFN_F(sqrt) + #undef FVEC_UNFN_F + + VEC_INLINE static FVEC_NAME mask_blend( + const BVEC_NAME &mask, const FVEC_NAME &a, const FVEC_NAME &b + ) { + return FVEC_SUFFIX(_mm256_blendv_)(a.val_, b.val_, mask.val_); + } + #define FVEC_MASK_UNFN_F(the_name) \ + VEC_INLINE static FVEC_NAME mask_##the_name( \ + const FVEC_NAME &src, const BVEC_NAME &mask, \ + const FVEC_NAME &a \ + ) { \ + FVEC_NAME ret = FVEC_SUFFIX(_mm256_##the_name##_)( \ + a.val_); \ + return mask_blend(mask, 
src, ret); \ + } + FVEC_MASK_UNFN_F(cos) + FVEC_MASK_UNFN_F(recip) + FVEC_MASK_UNFN_F(sqrt) + #undef FVEC_MASK_UNFN_F + + #define FVEC_BINFN_F(the_name) \ + VEC_INLINE static FVEC_NAME the_name(const FVEC_NAME &a, \ + const FVEC_NAME &b) { \ + return FVEC_SUFFIX(_mm256_##the_name##_)(a.val_, b.val_); \ + } + FVEC_BINFN_F(max) + FVEC_BINFN_F(min) + #undef FVEC_BINFN_F + + #define FVEC_MASK_BINFN_F(the_name) \ + VEC_INLINE static FVEC_NAME mask_##the_name( \ + const FVEC_NAME &src, const BVEC_NAME &mask, \ + const FVEC_NAME &a, const FVEC_NAME &b \ + ) { \ + FVEC_NAME ret = FVEC_SUFFIX(_mm256_##the_name##_)( \ + a.val_, b.val_); \ + return mask_blend(mask, src, ret); \ + } + FVEC_MASK_BINFN_F(add) + FVEC_MASK_BINFN_F(div) + FVEC_MASK_BINFN_F(mul) + FVEC_MASK_BINFN_F(sub) + #undef FVEC_MASK_BINFN_F + + VEC_INLINE static FVEC_NAME mask_expand( + const FVEC_NAME &src, const BVEC_NAME &mask, const FVEC_NAME &b + ) { + FVEC_VEC_T ret = FVEC_SUFFIX(_mm256_expand_)(mask.val_, b.val_); + ret = FVEC_SUFFIX(_mm256_and_)(mask.val_, ret); + ret = FVEC_SUFFIX(_mm256_or_)(ret, FVEC_SUFFIX(_mm256_andnot_) + (mask.val_, src.val_)); + return ret; + } + VEC_INLINE static FVEC_NAME masku_compress( + const BVEC_NAME &mask, const FVEC_NAME &b + ) { + return FVEC_SUFFIX(_mm256_compress_)(mask.val_, b.val_); + } + + VEC_INLINE static FVEC_NAME set1(const FVEC_SCAL_T &a) { + return FVEC_SUFFIX(_mm256_set1_)(a); + } + VEC_INLINE static FVEC_NAME setzero() { + return FVEC_SUFFIX(_mm256_setzero_)(); + } + VEC_INLINE static FVEC_NAME undefined() { + return FVEC_SUFFIX(_mm256_undefined_)(); + } + + VEC_INLINE static FVEC_NAME load(const FVEC_SCAL_T *mem) { + return FVEC_SUFFIX(_mm256_load_)(mem); + } + VEC_INLINE static void store(FVEC_SCAL_T * dest, const FVEC_NAME &a) { + FVEC_SUFFIX(_mm256_store_)(dest, a.val_); + } + + + VEC_INLINE static FVEC_NAME gather(const IVEC_NAME &idx, + const FVEC_SCAL_T * mem, const int scale) { + assert(scale == sizeof(FVEC_SCAL_T)); +# if FVEC_LEN==4 +# ifdef __AVX2__ + static const unsigned int mask_shuffle[8] __attribute__((aligned(32))) = + {0, 2, 4, 6, 0, 0, 0, 0}; + __m256i m = _mm256_permutevar8x32_epi32(idx.val_, + _mm256_load_si256((__m256i*)mask_shuffle)); + __m128i idx_short = _mm256_castsi256_si128(m); + return FVEC_SUFFIX(_mm256_i32gather_)(mem, idx_short, sizeof(FVEC_SCAL_T)); +# else + int idx_buf[8] __attribute__((aligned(32))); + _mm256_store_si256((__m256i*) idx_buf, idx.val_); + double dest[4] __attribute__((aligned(32))); + for (int i = 0; i < 4; i++) { + dest[i] = mem[idx_buf[2*i]]; + } + return _mm256_load_pd(dest); +# endif +# else + return FVEC_SUFFIX(_mm256_i32gather_)(mem, idx.val_, sizeof(FVEC_SCAL_T)); +# endif + } + VEC_INLINE static FVEC_NAME mask_gather( + const FVEC_NAME &src, const BVEC_NAME &mask, const IVEC_NAME &idx, + const FVEC_SCAL_T * mem, const int scale + ) { + assert(scale == sizeof(FVEC_SCAL_T)); +# if FVEC_LEN==4 +# ifdef __AVX2__ + static const unsigned int mask_shuffle[8] __attribute__((aligned(32))) = + {0, 2, 4, 6, 0, 0, 0, 0}; + __m256i m = _mm256_permutevar8x32_epi32(idx.val_, + _mm256_load_si256((__m256i*)mask_shuffle)); + __m128i idx_short = _mm256_castsi256_si128(m); + return FVEC_SUFFIX(_mm256_mask_i32gather_)(src.val_, mem, idx_short, + mask.val_, sizeof(FVEC_SCAL_T)); +# else + int idx_buf[8] __attribute__((aligned(32))); + int mask_buf[8] __attribute__((aligned(32))); + _mm256_store_si256((__m256i*) idx_buf, idx.val_); + _mm256_store_pd((double*) mask_buf, mask.val_); + double dest[4] __attribute__((aligned(32))); + 
_mm256_store_pd((double*) dest, src.val_); + for (int i = 0; i < 4; i++) { + if (mask_buf[2*i]) + dest[i] = mem[idx_buf[2*i]]; + } + return _mm256_load_pd(dest); +# endif +# else + return FVEC_SUFFIX(_mm256_mask_i32gather_)(src.val_, mem, idx.val_, + mask.val_, sizeof(FVEC_SCAL_T)); +# endif + } + + VEC_INLINE static void gather_4_adjacent(const IVEC_NAME &idx, + const FVEC_SCAL_T * mem, const int scale, FVEC_NAME * out_0, + FVEC_NAME * out_1, FVEC_NAME * out_2, FVEC_NAME * out_3) { + assert(scale == sizeof(FVEC_SCAL_T)); + int idx_buf[8] __attribute__((aligned(32))); + _mm256_store_si256((__m256i*) idx_buf, idx.val_); +# if FVEC_LEN==4 + __m256d a0 = _mm256_load_pd(&mem[idx_buf[0]]); + __m256d a1 = _mm256_load_pd(&mem[idx_buf[2]]); + __m256d a2 = _mm256_load_pd(&mem[idx_buf[4]]); + __m256d a3 = _mm256_load_pd(&mem[idx_buf[6]]); + __m256d b0 = _mm256_unpacklo_pd(a0, a1); + __m256d b1 = _mm256_unpackhi_pd(a0, a1); + __m256d b2 = _mm256_unpacklo_pd(a2, a3); + __m256d b3 = _mm256_unpackhi_pd(a2, a3); + *out_0 = _mm256_permute2f128_pd(b0, b2, 0x20); + *out_1 = _mm256_permute2f128_pd(b1, b3, 0x20); + *out_2 = _mm256_permute2f128_pd(b0, b2, 0x31); + *out_3 = _mm256_permute2f128_pd(b1, b3, 0x31); +# else + const float *e0 = &mem[idx_buf[0]]; + const float *e1 = &mem[idx_buf[1]]; + const float *e2 = &mem[idx_buf[2]]; + const float *e3 = &mem[idx_buf[3]]; + const float *e4 = &mem[idx_buf[4]]; + const float *e5 = &mem[idx_buf[5]]; + const float *e6 = &mem[idx_buf[6]]; + const float *e7 = &mem[idx_buf[7]]; + __m256 a0 = _mm256_loadu2_m128(e4, e0); + __m256 a1 = _mm256_loadu2_m128(e5, e1); + __m256 b0 = _mm256_unpacklo_ps(a0, a1); + __m256 b1 = _mm256_unpackhi_ps(a0, a1); + __m256 a2 = _mm256_loadu2_m128(e6, e2); + __m256 a3 = _mm256_loadu2_m128(e7, e3); + __m256 b2 = _mm256_unpacklo_ps(a2, a3); + __m256 b3 = _mm256_unpackhi_ps(a2, a3); + *out_0 = _mm256_shuffle_ps(b0, b2, 0x44); + *out_1 = _mm256_shuffle_ps(b0, b2, 0xEE); + *out_2 = _mm256_shuffle_ps(b1, b3, 0x44); + *out_3 = _mm256_shuffle_ps(b1, b3, 0xEE); +# endif + } + VEC_INLINE static void gather_3_adjacent(const IVEC_NAME &idx, + const FVEC_SCAL_T * mem, + const int scale, + FVEC_NAME * out_0, + FVEC_NAME * out_1, + FVEC_NAME * out_2) { + assert(scale == sizeof(FVEC_SCAL_T)); + FVEC_NAME tmp_3; + gather_4_adjacent(idx, mem, scale, out_0, out_1, out_2, &tmp_3); + } + + VEC_INLINE static double _mm256_reduce_add_pd(__m256d a) { + __m256d t1 = _mm256_hadd_pd(a, a); + __m128d t2 = _mm256_extractf128_pd(t1, 1); + __m128d t3 = _mm256_castpd256_pd128(t1); + return _mm_cvtsd_f64(_mm_add_pd(t2, t3)); + } + + VEC_INLINE static float _mm256_reduce_add_ps(__m256 a) { + __m256 t1 = _mm256_hadd_ps(a, a); + __m128 t2 = _mm256_extractf128_ps(t1, 1); + __m128 t3 = _mm256_castps256_ps128(t1); + __m128 t4 = _mm_add_ps(t2, t3); + __m128 t5 = _mm_permute_ps(t4, 0x1B); // 0x1B = reverse + return _mm_cvtss_f32(_mm_add_ps(t4, t5)); + } + + VEC_INLINE static FVEC_SCAL_T reduce_add(const FVEC_NAME &a) { + return FVEC_SUFFIX(_mm256_reduce_add_)(a.val_); + } + VEC_INLINE static FVEC_SCAL_T mask_reduce_add(const BVEC_NAME &mask, + const FVEC_NAME &a) { + return reduce_add(FVEC_SUFFIX(_mm256_and_)(mask.val_, a.val_)); + } + + VEC_INLINE static IVEC_NAME unpackloepi32(const FVEC_NAME &a) { +# if FVEC_LEN==4 +# if __AVX2__ + static const unsigned int mask_shuffle[8] __attribute__((aligned(32))) = + {0, 0, 2, 2, 4, 4, 6, 6}; + __m256 m = _mm256_permutevar8x32_ps(_mm256_castpd_ps(a.val_), + _mm256_load_si256((__m256i*)mask_shuffle)); + return _mm256_castps_si256(m); +# else + 
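+ // Pre-AVX2 fallback for unpackloepi32: the 0xA0 immediate encodes the
+ // shuffle (0,0,2,2) per 128-bit half, duplicating the low 32-bit word of
+ // every 64-bit lane -- the same layout the _mm256_permutevar8x32_ps path
+ // above builds with its {0,0,2,2,4,4,6,6} index table.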
__m128i a_lo = _mm256_castsi256_si128(_mm256_castpd_si256(a.val_)); + __m128i a_hi = _mm256_extractf128_si256(_mm256_castpd_si256(a.val_), 1); + __m128i c_lo = _mm_shuffle_epi32(a_lo, 0xA0); /*1010 0000*/ + __m128i c_hi = _mm_shuffle_epi32(a_hi, 0xA0); + __m256i ret = _mm256_setr_m128i(c_lo, c_hi); + return ret; +# endif +# else + return _mm256_castps_si256(a.val_); +# endif + } + + VEC_INLINE static FVEC_NAME mask_sincos( + FVEC_NAME * cos, const FVEC_NAME &src_a, const FVEC_NAME &src_b, + const BVEC_NAME &mask, const FVEC_NAME &arg + ) { + FVEC_VEC_T c, s = FVEC_SUFFIX(_mm256_sincos_)(&c, arg.val_); + *cos = mask_blend(mask, src_b, c); + return mask_blend(mask, src_a, s); + } + + #define FVEC_BINOP(the_sym, the_name) \ + VEC_INLINE inline FVEC_NAME operator the_sym(const FVEC_NAME &b) const { \ + return FVEC_SUFFIX(_mm256_##the_name##_)(this->val_, b.val_); \ + } + FVEC_BINOP(+, add) + FVEC_BINOP(-, sub) + FVEC_BINOP(*, mul) + FVEC_BINOP(/, div) + #undef FVEC_BINOP + + VEC_INLINE static void gather_prefetch0(const IVEC_NAME &a, void * mem) { + /* NOP */ + } +}; + +class AVEC_NAME { + friend class avec8pd; + FVEC_VEC_T val_; + VEC_INLINE AVEC_NAME(const FVEC_VEC_T &a) : val_(a) {} +public: + VEC_INLINE AVEC_NAME(const FVEC_NAME &a) : val_(a.val_) {} + VEC_INLINE static AVEC_NAME undefined() { + return FVEC_SUFFIX(_mm256_undefined_)(); + } + VEC_INLINE static AVEC_NAME mask_gather( + const AVEC_NAME &src, const BVEC_NAME &mask, const IVEC_NAME &idx, + const FVEC_SCAL_T * mem, const int scale + ) { + assert(scale == sizeof(FVEC_SCAL_T)); + return FVEC_NAME::mask_gather(src.val_, mask, idx, mem, scale); + } + VEC_INLINE static void mask_i32loscatter( + FVEC_SCAL_T * mem, const BVEC_NAME &mask, const IVEC_NAME &idx, + const AVEC_NAME &a, const int scale + ) { + assert(scale == sizeof(FVEC_SCAL_T)); + for (int l = 0; l < FVEC_NAME::VL; l++) { + if (BVEC_NAME::test_at(mask, l)) + mem[IVEC_NAME::at(idx, l)] = FVEC_NAME::at(a.val_, l); + } + } + + #define AVEC_BINOP(the_sym, the_name) \ + VEC_INLINE inline AVEC_NAME operator the_sym(const AVEC_NAME &b) const { \ + return FVEC_SUFFIX(_mm256_##the_name##_)(this->val_, b.val_); \ + } + AVEC_BINOP(-, sub) + #undef AVEC_BINOP +}; + +#if FVEC_LEN==8 +class avec8pd { + __m256d lo_, hi_; + VEC_INLINE avec8pd(const __m256d &lo, const __m256d &hi) : lo_(lo), hi_(hi) {} + VEC_INLINE static __m128 get_ps_hi(__m256 a) { + return _mm256_extractf128_ps(a, 1); + } + VEC_INLINE static __m128 get_ps_lo(__m256 a) { + return _mm256_castps256_ps128(a); + } + VEC_INLINE static __m128i get_si_hi(__m256i a) { + return _mm_castps_si128(get_ps_hi(_mm256_castsi256_ps(a))); + } + VEC_INLINE static __m128i get_si_lo(__m256i a) { + return _mm_castps_si128(get_ps_lo(_mm256_castsi256_ps(a))); + } +public: + VEC_INLINE avec8pd(const FVEC_NAME &a) { + lo_ = _mm256_cvtps_pd(get_ps_lo(a.val_)); + hi_ = _mm256_cvtps_pd(get_ps_hi(a.val_)); + } + VEC_INLINE static avec8pd undefined() { + return avec8pd(_mm256_undefined_pd(), _mm256_undefined_pd()); + } + VEC_INLINE static avec8pd mask_gather( + const avec8pd &src, const BVEC_NAME &mask, const IVEC_NAME &idx, + const double * mem, const int scale + ) { +# ifndef __AVX2__ + assert(scale == sizeof(double)); + int idx_buf[8] __attribute__((aligned(32))); + _mm256_store_si256((__m256i*) idx_buf, idx.val_); + int mask_val = _mm256_movemask_ps(mask.val_); + double ret_buf[8] __attribute__((aligned(32))); + _mm256_store_pd(&ret_buf[0], src.lo_); + _mm256_store_pd(&ret_buf[4], src.hi_); + for (int i = 0; i < 8; i++) { + if (mask_val & (1 << 
i)) { + ret_buf[i] = mem[idx_buf[i]]; + } + } + __m256d lo = _mm256_load_pd(&ret_buf[0]); + __m256d hi = _mm256_load_pd(&ret_buf[4]); +# else + static const unsigned int lo_shuffle[8] __attribute__((aligned(32))) = + {0, 0, 1, 1, 2, 2, 3, 3}; + static const unsigned int hi_shuffle[8] __attribute__((aligned(32))) = + {4, 4, 5, 5, 6, 6, 7, 7}; + __m256d lo_mask = _mm256_castps_pd(_mm256_permutevar8x32_ps(mask.val_, + _mm256_load_si256((__m256i*) lo_shuffle))); + __m256d hi_mask = _mm256_castps_pd(_mm256_permutevar8x32_ps(mask.val_, + _mm256_load_si256((__m256i*) hi_shuffle))); + __m256d lo = _mm256_mask_i32gather_pd(src.lo_, mem, get_si_lo(idx.val_), + lo_mask, sizeof(double)); + __m256d hi = _mm256_mask_i32gather_pd(src.hi_, mem, get_si_hi(idx.val_), + hi_mask, sizeof(double)); +# endif + return avec8pd(lo, hi); + } + VEC_INLINE static void mask_i32loscatter( + double * mem, const BVEC_NAME &mask, const IVEC_NAME &idx, + const avec8pd &a, const int scale + ) { + assert(scale == sizeof(double)); + double a_buf[8] __attribute__((aligned(32))); + _mm256_store_pd(a_buf, a.lo_); + _mm256_store_pd(&a_buf[4], a.hi_); + int idx_buf[8] __attribute__((aligned(32))); + _mm256_store_si256((__m256i*)idx_buf, idx.val_); + int mask_val = _mm256_movemask_ps(mask.val_); + for (int i = 0; i < 8; i++) { + if (mask_val & (1 << i)) + mem[idx_buf[i]] = a_buf[i]; + } + } + + #define AVEC2_BINOP(the_sym, the_name) \ + VEC_INLINE inline avec8pd operator the_sym(const avec8pd &b) const { \ + __m256d lo = _mm256_##the_name##_pd(this->lo_, b.lo_); \ + __m256d hi = _mm256_##the_name##_pd(this->hi_, b.hi_); \ + return avec8pd(lo, hi); \ + } + AVEC2_BINOP(-, sub) +}; +#endif + +} + + +#ifdef FVEC_FIRST_PASS + +template<class flt_t, class acc_t> +struct intr_types; + +template<> +struct intr_types<double,double> { + typedef mm256::fvec4pd fvec; + typedef mm256::ivec4 ivec; + typedef mm256::bvec4 bvec; + typedef mm256::avec4pd avec; +}; + +template<> +struct intr_types<float,float> { + typedef mm256::fvec8ps fvec; + typedef mm256::ivec8 ivec; + typedef mm256::bvec8 bvec; + typedef mm256::avec8ps avec; +}; + +template<> +struct intr_types<float,double> { + typedef mm256::fvec8ps fvec; + typedef mm256::ivec8 ivec; + typedef mm256::bvec8 bvec; + typedef mm256::avec8pd avec; +}; + +#endif + +#ifndef FVEC_FIRST_PASS +# define FVEC_FIRST_PASS +# include "intel_intrinsics_airebo.h" +#endif + +#endif + +#ifdef LMP_INTEL_AIREBO_SCALAR + +#include <cassert> +#include <cmath> +#include <cstdlib> + +#define VEC_INLINE __attribute__((always_inline)) + +template<class flt_t, class acc_t> +struct intr_types { + +class fvec; +class ivec; +class avec; +class bvec { + friend class fvec; + friend class ivec; + friend class avec; + bool val_; + VEC_INLINE bvec(const bool &v) : val_(v) {} +public: + VEC_INLINE bvec() {} + VEC_INLINE static bvec kand(const bvec &a, const bvec &b) { + return a.val_ && b.val_; + } + VEC_INLINE static bvec kandn(const bvec &a, const bvec &b) { + return (! a.val_) && b.val_; + } + VEC_INLINE static bvec knot(const bvec &a) { + return ! a.val_; + } + VEC_INLINE static int kortestz(const bvec &a, const bvec &b) { + return (! a.val_) && (! b.val_) ? true : false; + } + VEC_INLINE static bvec masku_compress(const bvec &mask, const bvec &a) { + return mask.val_ ? a.val_ : false; + } + VEC_INLINE static bvec mask_expand(const bvec &src, const bvec &mask, + const bvec &a) { + return mask.val_ ? a.val_ : src.val_; + } + VEC_INLINE static bvec full() { + return true; + } + VEC_INLINE static bvec empty() { + return false; + } + VEC_INLINE static bvec only(int n) { + return n == 1 ?
true : false; + } + VEC_INLINE static bvec after(int n) { + return n == 0 ? true : false; + } + VEC_INLINE static bvec onlyafter(int only, int after) { + return after == 0 && only == 1 ? true : false; + } + VEC_INLINE static int popcnt(const bvec &a) { + return static_cast<int>(a.val_); + } + VEC_INLINE static bool test_all_unset(const bvec &a) { + return kortestz(a, a); + } + VEC_INLINE static bool test_any_set(const bvec &a) { + return ! test_all_unset(a); + } + VEC_INLINE static bool test_at(const bvec &a, int i) { + assert(i < 1); + return a.val_; + } + VEC_INLINE bvec operator &(const bvec &b) const { + return val_ && b.val_; + } + VEC_INLINE bvec operator |(const bvec &b) const { + return val_ || b.val_; + } + VEC_INLINE bvec operator ~() const { + return ! val_; + } +}; + +class ivec { + friend class fvec; + friend class avec; + int val_; + VEC_INLINE ivec(const int &v) : val_(v) {} +public: + static const int VL = 1; + VEC_INLINE ivec() {} + + #define IVEC_MASK_BINFN_B(the_name, the_op) \ + VEC_INLINE static bvec the_name(const ivec &a, const ivec &b) { \ + return a.val_ the_op b.val_; \ + } \ + VEC_INLINE static bvec mask_##the_name( \ + const bvec &mask, \ + const ivec &a, const ivec &b \ + ) { \ + return mask.val_ && (a.val_ the_op b.val_); \ + \ + } + IVEC_MASK_BINFN_B(cmpeq, ==) + IVEC_MASK_BINFN_B(cmplt, <) + IVEC_MASK_BINFN_B(cmpneq, !=) + IVEC_MASK_BINFN_B(cmpgt, >) + + #define IVEC_MASK_BINFN_I(the_name, the_op) \ + VEC_INLINE static ivec mask_##the_name( \ + const ivec &src, const bvec &mask, \ + const ivec &a, const ivec &b \ + ) { \ + return mask.val_ ? a.val_ the_op b.val_ : src.val_; \ + } + IVEC_MASK_BINFN_I(add, +) + VEC_INLINE static ivec mask_blend( + const bvec &mask, const ivec &a, const ivec &b + ) { + return mask.val_ ? b.val_ : a.val_; + } + + #define IVEC_BINFN_I(the_name, the_op) \ + VEC_INLINE static ivec the_name(const ivec &a, const ivec &b) { \ + return a.val_ the_op b.val_; \ + } + IVEC_BINFN_I(mullo, *) + IVEC_BINFN_I(srlv, >>) + VEC_INLINE static ivec the_and(const ivec &a, const ivec &b) { + return a.val_ & b.val_; + } + + VEC_INLINE static ivec mask_expand( + const ivec &src, const bvec &a, const ivec &b + ) { + return a.val_ ? b.val_ : src.val_; + } + VEC_INLINE static ivec masku_compress( + const bvec &a, const ivec &b + ) { + return a.val_ ? b.val_ : 0; + } + + VEC_INLINE static int at(const ivec &a, int b) { + assert(b == 0); + return a.val_; + } + + VEC_INLINE static ivec load(const int * src) { + return *src; + } + VEC_INLINE static ivec mask_loadu(const bvec &mask, const int * src) { + return mask.val_ ? *src : 0xDEAD; + } + VEC_INLINE static ivec maskz_loadu(const bvec &mask, const int * src) { + return mask.val_ ? *src : 0; + } + VEC_INLINE static void mask_storeu(const bvec &mask, int * dest, + const ivec &src) { + if (mask.val_) *dest = src.val_; + } + VEC_INLINE static void store(int * dest, const ivec &src) { + *dest = src.val_; + } + + VEC_INLINE static ivec mask_gather( + const ivec &src, const bvec &mask, const ivec &idx, const int * mem, + const int scale + ) { + return mask.val_ ?
*reinterpret_cast<const int *> + (reinterpret_cast<const char *>(mem) + scale * idx.val_) : src.val_; + } + VEC_INLINE static void mask_i32scatter( + int * mem, const bvec &mask, const ivec &idx, const ivec &a, + const int scale + ) { + if (mask.val_) *reinterpret_cast<int *>(reinterpret_cast<char *>(mem) + + scale * idx.val_) = a.val_; + } + + VEC_INLINE static void mask_compressstore(const bvec &mask, int * dest, + const ivec &src) { + if (mask.val_) *dest = src.val_; + } + + VEC_INLINE static ivec set( + int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8, + int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0 + ) { + return i0; + } + VEC_INLINE static ivec set1(int i) { + return i; + } + VEC_INLINE static ivec setzero() { + return 0; + } + VEC_INLINE static ivec undefined() { + return 0xDEAD; + } + + VEC_INLINE ivec operator +(const ivec &b) const { + return val_ + b.val_; + } +}; + +class fvec { + friend class avec; + flt_t val_; + VEC_INLINE fvec(const flt_t &v) : val_(v) {} +public: + static const int VL = 1; + VEC_INLINE fvec() {} + VEC_INLINE static flt_t at(const fvec &a, int i) { + assert(i < 1); + return a.val_; + } + VEC_INLINE static bool fast_compress() { return false; } + + #define FVEC_MASK_BINFN_B(the_name, the_op) \ + VEC_INLINE static bvec the_name(const fvec &a, const fvec &b) { \ + return a.val_ the_op b.val_; \ + } \ + VEC_INLINE static bvec mask_##the_name( \ + const bvec &mask, \ + const fvec &a, const fvec &b \ + ) { \ + return mask.val_ && (a.val_ the_op b.val_); \ + } + FVEC_MASK_BINFN_B(cmple, <=) + FVEC_MASK_BINFN_B(cmplt, <) + FVEC_MASK_BINFN_B(cmpneq, !=) + FVEC_MASK_BINFN_B(cmpnle, >) + FVEC_MASK_BINFN_B(cmpnlt, >=) + + #define FVEC_UNFN_F(the_name, the_fn) \ + VEC_INLINE static fvec the_name(const fvec &a) { \ + return the_fn(a.val_); \ + } + FVEC_UNFN_F(abs, fabs) + FVEC_UNFN_F(exp, ::exp) + FVEC_UNFN_F(invsqrt, 1/std::sqrt) + FVEC_UNFN_F(recip, 1/) + FVEC_UNFN_F(sqrt, std::sqrt) + + #define FVEC_MASK_UNFN_F(the_name, the_fn) \ + VEC_INLINE static fvec mask_##the_name( \ + const fvec &src, const bvec &mask, \ + const fvec &a \ + ) { \ + return mask.val_ ? the_fn(a.val_) : src.val_; \ + } + FVEC_MASK_UNFN_F(cos, std::cos) + FVEC_MASK_UNFN_F(recip, 1/) + FVEC_MASK_UNFN_F(sqrt, std::sqrt) + + #define FVEC_BINFN_F(the_name, the_fn) \ + VEC_INLINE static fvec the_name(const fvec &a, const fvec &b) { \ + return the_fn(a.val_, b.val_); \ + } + FVEC_BINFN_F(max, ::fmax) + FVEC_BINFN_F(min, ::fmin) + + #define FVEC_MASK_BINFN_F(the_name, the_op) \ + VEC_INLINE static fvec mask_##the_name( \ + const fvec &src, const bvec &mask, \ + const fvec &a, const fvec &b \ + ) { \ + return mask.val_ ? a.val_ the_op b.val_ : src.val_; \ + } + FVEC_MASK_BINFN_F(add, +) + FVEC_MASK_BINFN_F(div, /) + FVEC_MASK_BINFN_F(mul, *) + FVEC_MASK_BINFN_F(sub, -) + VEC_INLINE static fvec mask_blend( + const bvec &mask, const fvec &a, const fvec &b + ) { + return mask.val_ ? b.val_ : a.val_; + } + + VEC_INLINE static fvec mask_expand( + const fvec &src, const bvec &a, const fvec &b + ) { + return a.val_ ? b.val_ : src.val_; + } + VEC_INLINE static fvec masku_compress( + const bvec &a, const fvec &b + ) { + return a.val_ ?
b.val_ : 0; + } + + VEC_INLINE static fvec set1(const flt_t &a) { + return a; + } + VEC_INLINE static fvec setzero() { + return 0; + } + VEC_INLINE static fvec undefined() { + return 1337.1337; + } + + VEC_INLINE static fvec load(const flt_t *mem) { + return *mem; + } + VEC_INLINE static void mask_storeu(const bvec &mask, flt_t * dest, + const fvec &a) { + if (mask.val_) *dest = a.val_; + } + VEC_INLINE static void store(flt_t * dest, const fvec &a) { + *dest = a.val_; + } + + VEC_INLINE static fvec gather(const ivec &idx, const flt_t * mem, + const int scale) { + return *reinterpret_cast<const flt_t *>(reinterpret_cast<const char *>(mem) + + scale * idx.val_); + } + VEC_INLINE static fvec mask_gather( + const fvec &src, const bvec &mask, const ivec &idx, + const flt_t * mem, const int scale + ) { + return mask.val_ ? *reinterpret_cast<const flt_t *> + (reinterpret_cast<const char *>(mem) + scale * idx.val_) : src.val_; + } + + VEC_INLINE static void gather_3_adjacent(const ivec &idx, const flt_t * mem, + const int scale, fvec * out_0, + fvec * out_1, fvec * out_2) { + assert(scale == sizeof(flt_t)); + *out_0 = gather(idx, mem + 0, scale); + *out_1 = gather(idx, mem + 1, scale); + *out_2 = gather(idx, mem + 2, scale); + } + VEC_INLINE static void gather_4_adjacent(const ivec &idx, const flt_t * mem, + const int scale, fvec * out_0, + fvec * out_1, fvec * out_2, + fvec * out_3) { + assert(scale == sizeof(flt_t)); + *out_0 = gather(idx, mem + 0, scale); + *out_1 = gather(idx, mem + 1, scale); + *out_2 = gather(idx, mem + 2, scale); + *out_3 = gather(idx, mem + 3, scale); + } + + VEC_INLINE static flt_t mask_reduce_add(const bvec &mask, const fvec &a) { + return mask.val_ ? a.val_ : 0; + } + VEC_INLINE static flt_t reduce_add(const fvec &a) { + return a.val_; + } + + VEC_INLINE static ivec unpackloepi32(const fvec &a) { + return reinterpret_cast<const int *>(&a.val_)[0]; + } + + VEC_INLINE static fvec mask_sincos( + fvec * cos_out, const fvec &src_a, const fvec &src_b, + const bvec &mask, const fvec &arg + ) { + cos_out->val_ = mask.val_ ? ::cos(arg.val_) : src_b.val_; + return mask.val_ ? ::sin(arg.val_) : src_a.val_; + } + + #define FVEC_BINOP(the_sym, the_name) \ + VEC_INLINE inline fvec operator the_sym(const fvec &b) const { \ + return this->val_ the_sym b.val_; \ + } + FVEC_BINOP(+, add) + FVEC_BINOP(-, sub) + FVEC_BINOP(*, mul) + FVEC_BINOP(/, div) + + VEC_INLINE static void gather_prefetch0(const ivec &idx, const void * mem) {} +}; + +class avec { + acc_t val_; + VEC_INLINE avec(const acc_t &a) : val_(a) {} +public: + VEC_INLINE avec(const fvec &a) : val_(a.val_) {} + VEC_INLINE static avec undefined() { + return 1337.1337; + } + VEC_INLINE static avec mask_gather(const avec &src, const bvec &mask, + const ivec &idx, const acc_t * mem, + const int scale) { + return mask.val_ ?
*reinterpret_cast<const acc_t *> + (reinterpret_cast<const char *>(mem) + scale * idx.val_) : src.val_; + } + VEC_INLINE static void mask_i32loscatter(acc_t * mem, const bvec &mask, + const ivec &idx, const avec &a, + const int scale) { + if (mask.val_) *reinterpret_cast<acc_t *>(reinterpret_cast<char *>(mem) + + idx.val_ * scale) = a.val_; + } + + #define AVEC_BINOP(the_sym, the_name) \ + VEC_INLINE inline avec operator the_sym(const avec &b) const { \ + return this->val_ the_sym b.val_; \ + } + AVEC_BINOP(-, sub) +}; + +}; + +#endif diff --git a/src/USER-INTEL/nbin_intel.cpp b/src/USER-INTEL/nbin_intel.cpp index c5574a78c7..3a36ead499 100644 --- a/src/USER-INTEL/nbin_intel.cpp +++ b/src/USER-INTEL/nbin_intel.cpp @@ -211,6 +211,8 @@ void NBinIntel::bin_atoms(IntelBuffers<flt_t,acc_t> * buffers) { for (i = nall-1; i >= nlocal; i--) { if (mask[i] & bitmask) { ibin = coord2bin(atom->x[i]); + // Only necessary to store when neighboring ghost + atombin[i] = ibin; bins[i] = binhead[ibin]; binhead[ibin] = i; } @@ -222,14 +224,10 @@ void NBinIntel::bin_atoms(IntelBuffers<flt_t,acc_t> * buffers) { binhead[ibin] = i; } } else { - for (i = nall-1; i >= nlocal; i--) { + for (i = nall-1; i >= 0; i--) { ibin = coord2bin(atom->x[i]); - bins[i] = binhead[ibin]; - binhead[ibin] = i; - } - for (i = nlocal-1; i >= 0; i--) { - ibin = coord2bin(atom->x[i]); - atombin[i]=ibin; + // Only necessary to store for ghost when neighboring ghost + atombin[i] = ibin; bins[i] = binhead[ibin]; binhead[ibin] = i; } diff --git a/src/USER-INTEL/npair_full_bin_ghost_intel.cpp b/src/USER-INTEL/npair_full_bin_ghost_intel.cpp new file mode 100644 index 0000000000..12101712f1 --- /dev/null +++ b/src/USER-INTEL/npair_full_bin_ghost_intel.cpp @@ -0,0 +1,593 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: W.
Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#include "npair_full_bin_ghost_intel.h" +#include "neighbor.h" +#include "nstencil.h" +#include "neigh_list.h" +#include "atom.h" +#include "atom_vec.h" +#include "molecule.h" +#include "error.h" + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +NPairFullBinGhostIntel::NPairFullBinGhostIntel(LAMMPS *lmp) : NPairIntel(lmp) {} + +/* ---------------------------------------------------------------------- + binned neighbor list construction for all neighbors + include neighbors of ghost atoms, but no "special neighbors" for ghosts + every neighbor pair appears in list of both atoms i and j +------------------------------------------------------------------------- */ + +void NPairFullBinGhostIntel::build(NeighList *list) +{ + #ifdef _LMP_INTEL_OFFLOAD + if (_fix->offload_noghost()) + error->all(FLERR, + "The 'ghost no' option cannot be used with this USER-INTEL pair style."); + #endif + + if (nstencil > INTEL_MAX_STENCIL_CHECK) + error->all(FLERR, "Too many neighbor bins for USER-INTEL package."); + + #ifdef _LMP_INTEL_OFFLOAD + if (exclude) + error->all(FLERR, "Exclusion lists not yet supported for Intel offload"); + #endif + + if (_fix->precision() == FixIntel::PREC_MODE_MIXED) + fbi<float,double>(list, _fix->get_mixed_buffers()); + else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE) + fbi<double,double>(list, _fix->get_double_buffers()); + else + fbi<float,float>(list, _fix->get_single_buffers()); + + _fix->stop_watch(TIME_HOST_NEIGHBOR); +} + +/* ---------------------------------------------------------------------- */ + +template <class flt_t, class acc_t> +void NPairFullBinGhostIntel::fbi(NeighList * list, + IntelBuffers<flt_t,acc_t> * buffers) +{ + const int nlocal = atom->nlocal; + const int nall = atom->nlocal + atom->nghost; + list->inum = atom->nlocal; + list->gnum = atom->nghost; + + int host_start = _fix->host_start_neighbor(); + const int off_end = _fix->offload_end_neighbor(); + + #ifdef _LMP_INTEL_OFFLOAD + if (off_end) grow_stencil(); + if (_fix->full_host_list()) host_start = 0; + int offload_noghost = _fix->offload_noghost(); + #endif + + // only uses offload_end_neighbor to check whether we are doing offloading + // at all, no need to correct this later + buffers->grow_list(list, nall, comm->nthreads, off_end, + _fix->nbor_pack_width()); + + int need_ic = 0; + if (atom->molecular) + dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax, + neighbor->cutneighmax); + + if (need_ic) { + fbi<flt_t,acc_t,1>(1, list, buffers, 0, off_end); + fbi<flt_t,acc_t,1>(0, list, buffers, host_start, nlocal); + } else { + fbi<flt_t,acc_t,0>(1, list, buffers, 0, off_end); + fbi<flt_t,acc_t,0>(0, list, buffers, host_start, nlocal); + } +} + +/* ---------------------------------------------------------------------- */ + +template <class flt_t, class acc_t, int need_ic> +void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, + IntelBuffers<flt_t,acc_t> * buffers, + const int pstart, const int pend) { + if (pend-pstart == 0) return; + + const int nall = atom->nlocal + atom->nghost; + int pad = 1; + int nall_t = nall; + const int aend = nall; + + const int pack_width = _fix->nbor_pack_width(); + const ATOM_T * _noalias const x = buffers->get_x(); + int * _noalias const firstneigh = buffers->firstneigh(list); + const int e_nall = nall_t; + + const int molecular = atom->molecular; + int *ns = NULL; + tagint *s = NULL; + int tag_size = 0, special_size; + if (buffers->need_tag()) tag_size = e_nall; + if (molecular) { + s = atom->special[0]; + ns = atom->nspecial[0]; + special_size = aend;
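+ // For molecular systems the raw special-bond arrays are carried along so
+ // the build loop can later fold the 1-2/1-3/1-4 exclusion flags into the
+ // stored neighbor indices (the j ^ (which << SBBITS) encoding below).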
} else { + s = &buffers->_special_holder; + ns = &buffers->_nspecial_holder; + special_size = 0; + } + const tagint * _noalias const special = s; + const int * _noalias const nspecial = ns; + const int maxspecial = atom->maxspecial; + const tagint * _noalias const tag = atom->tag; + + int * _noalias const ilist = list->ilist; + int * _noalias numneigh = list->numneigh; + int * _noalias const cnumneigh = buffers->cnumneigh(list); + const int nstencil = this->nstencil; + const int * _noalias const stencil = this->stencil; + const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0]; + const flt_t * _noalias const cutneighghostsq = + buffers->get_cutneighghostsq()[0]; + const int ntypes = atom->ntypes + 1; + const int nlocal = atom->nlocal; + + #ifndef _LMP_INTEL_OFFLOAD + int * const mask = atom->mask; + tagint * const molecule = atom->molecule; + #endif + + int *molindex = atom->molindex; + int *molatom = atom->molatom; + Molecule **onemols = atom->avec->onemols; + int moltemplate; + if (molecular == 2) moltemplate = 1; + else moltemplate = 0; + if (moltemplate) + error->all(FLERR, + "Can't use moltemplate with npair style full/bin/ghost/intel."); + + int tnum; + int *overflow; + double *timer_compute; + #ifdef _LMP_INTEL_OFFLOAD + if (offload) { + timer_compute = _fix->off_watch_neighbor(); + tnum = buffers->get_off_threads(); + overflow = _fix->get_off_overflow_flag(); + _fix->stop_watch(TIME_HOST_NEIGHBOR); + _fix->start_watch(TIME_OFFLOAD_LATENCY); + } else + #endif + { + tnum = comm->nthreads; + overflow = _fix->get_overflow_flag(); + } + const int nthreads = tnum; + const int maxnbors = buffers->get_max_nbors(); + int * _noalias const atombin = buffers->get_atombin(); + const int * _noalias const binpacked = buffers->get_binpacked(); + + const int xperiodic = domain->xperiodic; + const int yperiodic = domain->yperiodic; + const int zperiodic = domain->zperiodic; + const flt_t xprd_half = domain->xprd_half; + const flt_t yprd_half = domain->yprd_half; + const flt_t zprd_half = domain->zprd_half; + + flt_t * _noalias const ncachex = buffers->get_ncachex(); + flt_t * _noalias const ncachey = buffers->get_ncachey(); + flt_t * _noalias const ncachez = buffers->get_ncachez(); + int * _noalias const ncachej = buffers->get_ncachej(); + int * _noalias const ncachejtype = buffers->get_ncachejtype(); + int * _noalias const ncachetag = buffers->get_ncachetag(); + const int ncache_stride = buffers->ncache_stride(); + + const int mbinx = this->mbinx; + const int mbiny = this->mbiny; + const int mbinz = this->mbinz; + const int * const stencilxyz = &this->stencilxyz[0][0]; + + #ifdef _LMP_INTEL_OFFLOAD + const int * _noalias const binhead = this->binhead; + const int * _noalias const bins = this->bins; + const int cop = _fix->coprocessor_number(); + const int separate_buffers = _fix->separate_buffers(); + #pragma offload target(mic:cop) if(offload) \ + in(x:length(e_nall+1) alloc_if(0) free_if(0)) \ + in(tag:length(tag_size) alloc_if(0) free_if(0)) \ + in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \ + in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \ + in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \ + in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \ + in(cutneighsq:length(0) alloc_if(0) free_if(0)) \ + in(cutneighghostsq:length(0) alloc_if(0) free_if(0)) \ + in(firstneigh:length(0) alloc_if(0) free_if(0)) \ + in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ + in(numneigh:length(0) alloc_if(0) free_if(0)) \ + in(ilist:length(0) alloc_if(0) 
free_if(0)) \ + in(atombin:length(aend) alloc_if(0) free_if(0)) \ + in(stencil:length(nstencil) alloc_if(0) free_if(0)) \ + in(ncachex,ncachey,ncachez,ncachej:length(0) alloc_if(0) free_if(0)) \ + in(ncachejtype,ncachetag:length(0) alloc_if(0) free_if(0)) \ + in(ncache_stride,maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \ + in(separate_buffers,aend,nlocal,molecular,ntypes,mbinx,mbiny) \ + in(mbinz,xperiodic,yperiodic,zperiodic,xprd_half,yprd_half,zprd_half) \ + in(stencilxyz:length(3*nstencil)) \ + out(overflow:length(5) alloc_if(0) free_if(0)) \ + out(timer_compute:length(1) alloc_if(0) free_if(0)) \ + signal(tag) + #endif + { + #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) + *timer_compute = MIC_Wtime(); + #endif + + #ifdef _LMP_INTEL_OFFLOAD + overflow[LMP_LOCAL_MIN] = 0; + overflow[LMP_LOCAL_MAX] = aend - 1; + overflow[LMP_GHOST_MIN] = e_nall; + overflow[LMP_GHOST_MAX] = -1; + #endif + + int nstencilp = 0; + int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL]; + for (int k = 0; k < nstencil; k++) { + binstart[nstencilp] = stencil[k]; + int end = stencil[k] + 1; + for (int kk = k + 1; kk < nstencil; kk++) { + if (stencil[kk-1]+1 == stencil[kk]) { + end++; + k++; + } else break; + } + binend[nstencilp] = end; + nstencilp++; + } + + const int mbinyx = mbiny * mbinx; + + #if defined(_OPENMP) + #pragma omp parallel + #endif + { + const int num = aend; + int tid, ifrom, ito; + + const double balance_factor = 2.0; + const double ibalance_factor = 1.0 / balance_factor; + const int gnum = num - nlocal; + const int wlocal = static_cast<int>(ceil(balance_factor * nlocal)); + const int snum = wlocal + gnum; + IP_PRE_omp_range_id(ifrom, ito, tid, snum, nthreads); + if (ifrom < wlocal) ifrom = static_cast<int>(ibalance_factor * ifrom); + else ifrom -= wlocal - nlocal; + if (ito < wlocal) ito = static_cast<int>(ibalance_factor * ito); + else ito -= wlocal - nlocal; + + int e_ito = ito; + const int list_size = (e_ito + tid * 2 + 2) * maxnbors; + + int which; + + int pack_offset = maxnbors; + int ct = (ifrom + tid * 2) * maxnbors; + int *neighptr = firstneigh + ct; + const int obound = pack_offset + maxnbors * 2; + + const int toffs = tid * ncache_stride; + flt_t * _noalias const tx = ncachex + toffs; + flt_t * _noalias const ty = ncachey + toffs; + flt_t * _noalias const tz = ncachez + toffs; + int * _noalias const tj = ncachej + toffs; + int * _noalias const tjtype = ncachejtype + toffs; + int * _noalias const ttag = ncachetag + toffs; + + // loop over all atoms in other bins in stencil, store every pair + int istart, icount, ncount, oldbin = -9999999, lane, max_chunk; + for (int i = ifrom; i < ito; i++) { + const flt_t xtmp = x[i].x; + const flt_t ytmp = x[i].y; + const flt_t ztmp = x[i].z; + const int itype = x[i].w; + const tagint itag = tag[i]; + const int ioffset = ntypes * itype; + + const int ibin = atombin[i]; + if (ibin != oldbin) { + oldbin = ibin; + ncount = 0; + if (i < nlocal) { + for (int k = 0; k < nstencilp; k++) { + const int bstart = binhead[ibin + binstart[k]]; + const int bend = binhead[ibin + binend[k]]; + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd + #endif + for (int jj = bstart; jj < bend; jj++) + tj[ncount++] = binpacked[jj]; + } + } else { + const int zbin = ibin / mbinyx; + const int zrem = ibin % mbinyx; + const int ybin = zrem / mbinx; + const int xbin = zrem % mbinx; + for (int k = 0; k < nstencil; k++) { + const int xbin2 = xbin + stencilxyz[3 * k + 0]; + const int ybin2 = ybin + stencilxyz[3 * k + 1]; + const int zbin2 = zbin +
stencilxyz[3 * k + 2]; + if (xbin2 < 0 || xbin2 >= mbinx || + ybin2 < 0 || ybin2 >= mbiny || + zbin2 < 0 || zbin2 >= mbinz) continue; + + const int bstart = binhead[ibin + stencil[k]]; + const int bend = binhead[ibin + stencil[k] + 1]; + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd + #endif + for (int jj = bstart; jj < bend; jj++) + tj[ncount++] = binpacked[jj]; + } + } // if i < nlocal + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd + #endif + for (int u = 0; u < ncount; u++) { + const int j = tj[u]; + tx[u] = x[j].x; + ty[u] = x[j].y; + tz[u] = x[j].z; + tjtype[u] = x[j].w; + ttag[u] = tag[j]; + } + } // if ibin != oldbin + + // ---------------------- Loop over other bins + + int n = maxnbors; + int n2 = n * 2; + int *neighptr2 = neighptr; + const flt_t * _noalias cutsq; + if (i < nlocal) cutsq = cutneighsq; + else cutsq = cutneighghostsq; + + const int icp = i; + + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma ivdep + #endif + for (int u = 0; u < ncount; u++) { + int addme = 1; + int j = tj[u]; + + if (i == j) addme = 0; + + // Cutoff Check + const flt_t delx = xtmp - tx[u]; + const flt_t dely = ytmp - ty[u]; + const flt_t delz = ztmp - tz[u]; + const int jtype = tjtype[u]; + const int jtag = ttag[u]; + const flt_t rsq = delx * delx + dely * dely + delz * delz; + if (rsq > cutsq[ioffset + jtype]) addme = 0; + + if (need_ic && icp < nlocal) { + int no_special; + ominimum_image_check(no_special, delx, dely, delz); + if (no_special) + j = -j - 1; + } + + int flist = 0; + if (itag > jtag) { + if (((itag+jtag) & 1) == 0) flist = 1; + } else if (itag < jtag) { + if (((itag+jtag) & 1) == 1) flist = 1; + } else { + if (tz[u] < ztmp) flist = 1; + else if (tz[u] == ztmp && ty[u] < ytmp) flist = 1; + else if (tz[u] == ztmp && ty[u] == ytmp && tx[u] < xtmp) + flist = 1; + } + if (addme) { + if (flist) + neighptr2[n2++] = j; + else + neighptr[n++] = j; + } + } // for u + + #ifndef _LMP_INTEL_OFFLOAD + if (exclude) { + int alln = n; + n = maxnbors; + for (int u = pack_offset; u < alln; u++) { + const int j = neighptr[u]; + int pj = j; + if (need_ic) + if (pj < 0) pj = -j - 1; + const int jtype = x[pj].w; + if (exclusion(i,pj,itype,jtype,mask,molecule)) continue; + neighptr[n++] = j; + } + alln = n2; + n2 = maxnbors * 2; + for (int u = n2; u < alln; u++) { + const int j = neighptr[u]; + int pj = j; + if (need_ic) + if (pj < 0) pj = -j - 1; + const int jtype = x[pj].w; + if (exclusion(i,pj,itype,jtype,mask,molecule)) continue; + neighptr[n2++] = j; + } + } + #endif + int ns = n - maxnbors; + int alln = n; + atombin[i] = ns; + n = 0; + for (int u = maxnbors; u < alln; u++) + neighptr[n++] = neighptr[u]; + ns += n2 - maxnbors * 2; + for (int u = maxnbors * 2; u < n2; u++) + neighptr[n++] = neighptr[u]; + if (ns > maxnbors) *overflow = 1; + + ilist[i] = i; + cnumneigh[i] = ct; + numneigh[i] = ns; + + ct += ns; + const int alignb = (INTEL_DATA_ALIGN / sizeof(int)); + const int edge = ct & (alignb - 1); + if (edge) ct += alignb - edge; + neighptr = firstneigh + ct; + if (ct + obound > list_size) { + if (i < ito - 1) { + *overflow = 1; + ct = (ifrom + tid * 2) * maxnbors; + } + } + } + + if (*overflow == 1) + for (int i = ifrom; i < ito; i++) + numneigh[i] = 0; + + #ifdef _LMP_INTEL_OFFLOAD + int ghost_offset = 0, nall_offset = e_nall; + if (separate_buffers) { + for (int i = ifrom; i < ito; ++i) { + int * _noalias jlist = firstneigh + cnumneigh[i]; + const int jnum = numneigh[i]; + #if __INTEL_COMPILER+0 > 1499 + #pragma vector aligned + 
#pragma simd + #endif + for (int jj = 0; jj < jnum; jj++) { + int j = jlist[jj]; + if (need_ic && j < 0) j = -j - 1; + } + } + + overflow[LMP_LOCAL_MIN] = 0; + overflow[LMP_LOCAL_MAX] = nlocal - 1; + overflow[LMP_GHOST_MIN] = nlocal; + overflow[LMP_GHOST_MAX] = e_nall - 1; + + int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN]; + if (nghost < 0) nghost = 0; + if (offload) { + ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1; + nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost; + } else { + ghost_offset = overflow[LMP_GHOST_MIN] - nlocal; + nall_offset = nlocal + nghost; + } + } // if separate_buffers + #endif + + if (molecular) { + int ito_m = ito; + if (ito >= nlocal) ito_m = nlocal; + for (int i = ifrom; i < ito_m; ++i) { + int * _noalias jlist = firstneigh + cnumneigh[i]; + const int jnum = numneigh[i]; + + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd + #endif + for (int jj = 0; jj < jnum; jj++) { + const int j = jlist[jj]; + if (need_ic && j < 0) { + which = 0; + jlist[jj] = -j - 1; + } else + ofind_special(which, special, nspecial, i, tag[j]); + #ifdef _LMP_INTEL_OFFLOAD + if (j >= nlocal) { + if (j == e_nall) + jlist[jj] = nall_offset; + else if (which) + jlist[jj] = (j-ghost_offset) ^ (which << SBBITS); + else jlist[jj]-=ghost_offset; + } else + #endif + if (which) jlist[jj] = j ^ (which << SBBITS); + } + } // for i + } // if molecular + #ifdef _LMP_INTEL_OFFLOAD + else if (separate_buffers) { + for (int i = ifrom; i < ito; ++i) { + int * _noalias jlist = firstneigh + cnumneigh[i]; + const int jnum = numneigh[i]; + int jj = 0; + #pragma vector aligned + #pragma simd + for (jj = 0; jj < jnum; jj++) { + if (jlist[jj] >= nlocal) { + if (jlist[jj] == e_nall) jlist[jj] = nall_offset; + else jlist[jj] -= ghost_offset; + } + } + } + } + #endif + } // end omp + #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) + *timer_compute = MIC_Wtime() - *timer_compute; + #endif + } // end offload + + #ifdef _LMP_INTEL_OFFLOAD + if (offload) { + _fix->stop_watch(TIME_OFFLOAD_LATENCY); + _fix->start_watch(TIME_HOST_NEIGHBOR); + for (int n = 0; n < aend; n++) { + ilist[n] = n; + numneigh[n] = 0; + } + } else { + for (int i = 0; i < aend; i++) + list->firstneigh[i] = firstneigh + cnumneigh[i]; + if (separate_buffers) { + _fix->start_watch(TIME_PACK); + _fix->set_neighbor_host_sizes(); + buffers->pack_sep_from_single(_fix->host_min_local(), + _fix->host_used_local(), + _fix->host_min_ghost(), + _fix->host_used_ghost()); + _fix->stop_watch(TIME_PACK); + } + } + #else + #pragma vector aligned + #pragma simd + for (int i = 0; i < aend; i++) + list->firstneigh[i] = firstneigh + cnumneigh[i]; + #endif +} diff --git a/src/USER-INTEL/npair_full_bin_ghost_intel.h b/src/USER-INTEL/npair_full_bin_ghost_intel.h new file mode 100644 index 0000000000..4449dfa1e1 --- /dev/null +++ b/src/USER-INTEL/npair_full_bin_ghost_intel.h @@ -0,0 +1,55 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifdef NPAIR_CLASS + +NPairStyle(full/bin/ghost/intel, + NPairFullBinGhostIntel, + NP_FULL | NP_BIN | NP_GHOST | NP_NEWTON | NP_NEWTOFF | + NP_ORTHO | NP_TRI | NP_INTEL) + +#else + +#ifndef LMP_NPAIR_FULL_BIN_GHOST_INTEL_H +#define LMP_NPAIR_FULL_BIN_GHOST_INTEL_H + +#include "npair_intel.h" + +namespace LAMMPS_NS { + +class NPairFullBinGhostIntel : public NPairIntel { + public: + NPairFullBinGhostIntel(class LAMMPS *); + ~NPairFullBinGhostIntel() {} + void build(class NeighList *); + private: + template <class flt_t, class acc_t> + void fbi(NeighList * list, IntelBuffers<flt_t,acc_t> * buffers); + template <class flt_t, class acc_t, int need_ic> + void fbi(const int offload, NeighList * list, + IntelBuffers<flt_t,acc_t> * buffers, + const int astart, const int aend); +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +*/ diff --git a/src/USER-INTEL/npair_intel.cpp b/src/USER-INTEL/npair_intel.cpp index b20b1dcd08..79dc75366e 100644 --- a/src/USER-INTEL/npair_intel.cpp +++ b/src/USER-INTEL/npair_intel.cpp @@ -143,6 +143,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, flt_t * _noalias const ncachez = buffers->get_ncachez(); int * _noalias const ncachej = buffers->get_ncachej(); int * _noalias const ncachejtype = buffers->get_ncachejtype(); + int * _noalias const ncachetag = buffers->get_ncachetag(); const int ncache_stride = buffers->ncache_stride(); #ifdef _LMP_INTEL_OFFLOAD @@ -165,7 +166,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, in(atombin:length(aend) alloc_if(0) free_if(0)) \ in(stencil:length(nstencil) alloc_if(0) free_if(0)) \ in(ncachex,ncachey,ncachez,ncachej:length(0) alloc_if(0) free_if(0)) \ - in(ncachejtype:length(0) alloc_if(0) free_if(0)) \ + in(ncachejtype,ncachetag:length(0) alloc_if(0) free_if(0)) \ in(ncache_stride,maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \ in(pad_width,offload_end,separate_buffers,astart,aend,nlocal,molecular) \ in(ntypes,xperiodic,yperiodic,zperiodic,xprd_half,yprd_half,zprd_half) \ @@ -222,7 +223,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, ito += astart; int e_ito = ito; if (THREE && ito == num) { - int imod = ito % pack_width; + int imod = ito & (pack_width - 1); if (imod) e_ito += pack_width - imod; } const int list_size = (e_ito + tid * 2 + 2) * maxnbors; @@ -241,6 +242,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, flt_t * _noalias const tz = ncachez + toffs; int * _noalias const tj = ncachej + toffs; int * _noalias const tjtype = ncachejtype + toffs; + int * _noalias const ttag = ncachetag + toffs; flt_t * _noalias itx; flt_t * _noalias ity; @@ -287,13 +289,14 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, ty[u] = x[j].y; tz[u] = x[j].z; tjtype[u] = x[j].w; + if (THREE) ttag[u] = tag[j]; } if (FULL == 0 || TRI == 1) { icount = 0; istart = ncount; const int alignb = INTEL_DATA_ALIGN / sizeof(int); - int nedge = istart % alignb; + int nedge = istart & (alignb - 1); if (nedge) istart += (alignb - nedge); itx = tx + istart; ity = ty + istart; @@ -343,7 +346,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, // i bin (half) check and offload ghost check if (j < nlocal) { - const int ijmod = (i + j) % 2; + const int ijmod = (i + j) & 1; if (i > j) { if (ijmod == 0) addme = 0; } else if (i < j) { @@ -424,8 +427,6 @@
void NPairIntel::bin_newton(const int offload, NeighList *list, } #endif - int pj; - if (THREE) pj = j; if (need_ic) { int no_special; ominimum_image_check(no_special, delx, dely, delz); @@ -434,12 +435,12 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, } if (THREE) { - const int jtag = tag[pj]; + const int jtag = ttag[u]; int flist = 0; if (itag > jtag) { - if ((itag+jtag) % 2 == 0) flist = 1; + if (((itag+jtag) & 1) == 0) flist = 1; } else if (itag < jtag) { - if ((itag+jtag) % 2 == 1) flist = 1; + if (((itag+jtag) & 1) == 1) flist = 1; } else { if (tz[u] < ztmp) flist = 1; else if (tz[u] == ztmp && ty[u] < ytmp) flist = 1; @@ -512,7 +513,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, cnumneigh[i] += lane; numneigh[i] = ns; } else { - int edge = (n % pad_width); + int edge = n & (pad_width - 1); if (edge) { const int pad_end = n + (pad_width - edge); #if defined(LMP_SIMD_COMPILER) @@ -532,7 +533,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, if (lane == pack_width) { ct += max_chunk * pack_width; const int alignb = (INTEL_DATA_ALIGN / sizeof(int)); - const int edge = (ct % alignb); + const int edge = ct & (alignb - 1); if (edge) ct += alignb - edge; neighptr = firstneigh + ct; max_chunk = 0; @@ -548,7 +549,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, } else { ct += n; const int alignb = (INTEL_DATA_ALIGN / sizeof(int)); - const int edge = (ct % alignb); + const int edge = ct & (alignb - 1); if (edge) ct += alignb - edge; neighptr = firstneigh + ct; if (ct + obound > list_size) { diff --git a/src/USER-INTEL/pair_airebo_intel.cpp b/src/USER-INTEL/pair_airebo_intel.cpp new file mode 100644 index 0000000000..ad3c97c9df --- /dev/null +++ b/src/USER-INTEL/pair_airebo_intel.cpp @@ -0,0 +1,4891 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Markus Hohnerbach (RWTH) +------------------------------------------------------------------------- */ + +#ifdef __INTEL_OFFLOAD +#pragma offload_attribute(push, target(mic)) +#endif +#include <cassert> +#include <cmath> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <cstdint> +#include <algorithm> +#include "lmptype.h" +#include "intel_preprocess.h" +#include "intel_intrinsics_airebo.h" +#ifdef __INTEL_OFFLOAD +#pragma offload_attribute(pop) +#endif + +#include <cmath> +#include <cstring> +#include "pair_airebo_intel.h" +#include "atom.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "force.h" +#include "comm.h" +#include "memory.h" +#include "error.h" +#include "group.h" +#include "kspace.h" +#include "modify.h" +#include "suffix.h" + +using namespace LAMMPS_NS; + +#ifdef __INTEL_OFFLOAD +#pragma offload_attribute(push, target(mic)) +#endif + +template <class flt_t, class acc_t> +struct LAMMPS_NS::PairAIREBOIntelParam { + flt_t cutlj, cutljrebosq, cut3rebo; + flt_t sigmin, sigcut; + flt_t cutljsq[2][2]; + flt_t lj1[2][2], lj2[2][2], lj3[2][2], lj4[2][2]; + + flt_t smin, Nmin, Nmax, NCmin, NCmax, thmin, thmax; + flt_t rcmin[2][2], rcmax[2][2], rcmaxsq[2][2], rcmaxp[2][2]; + flt_t Q[2][2], alpha[2][2], A[2][2], rho[2][2], BIJc[2][2][3], + Beta[2][2][3]; + flt_t rcLJmin[2][2], rcLJmax[2][2], rcLJmaxsq[2][2], bLJmin[2][2], + bLJmax[2][2]; + flt_t epsilon[2][2], sigma[2][2], epsilonT[2][2]; + + // spline coefficients + + flt_t gCdom[5], gC1[4][6], gC2[4][6], gHdom[4], gH[3][6]; + flt_t gDom[5+4]; + flt_t gVal[(4+4+3)*6]; + flt_t pCCdom[2][2], pCHdom[2][2], pCC[4][4][16], pCH[4][4][16]; + flt_t piCCdom[3][2], piCHdom[3][2], piHHdom[3][2]; + acc_t piCC[4][4][9][64], piCH[4][4][9][64], piHH[4][4][9][64]; + flt_t Tijdom[3][2]; + acc_t Tijc[4][4][9][64]; + + // spline knot values + + flt_t PCCf[5][5], PCCdfdx[5][5], PCCdfdy[5][5], PCHf[5][5]; + flt_t PCHdfdx[5][5], PCHdfdy[5][5]; + flt_t piCCf[5][5][11], piCCdfdx[5][5][11]; + flt_t piCCdfdy[5][5][11], piCCdfdz[5][5][11]; + flt_t piCHf[5][5][11], piCHdfdx[5][5][11]; + flt_t piCHdfdy[5][5][11], piCHdfdz[5][5][11]; + flt_t piHHf[5][5][11], piHHdfdx[5][5][11]; + flt_t piHHdfdy[5][5][11], piHHdfdz[5][5][11]; + flt_t Tf[5][5][10], Tdfdx[5][5][10], Tdfdy[5][5][10], Tdfdz[5][5][10]; +}; + +namespace { + +struct NeighListAIREBO { + int * num; /* num_all */ + int * num_half; /* num_all */ + int * offset; /* num_all */ + int * entries; /* num_all * num_neighs_per_atom */ +}; + +template <class flt_t> +struct AtomAIREBOT { + flt_t x, y, z; + int w; +}; + +template <class acc_t> +struct ResultForceT { + acc_t x, y, z, w; +}; + +template <class flt_t, class acc_t> +struct KernelArgsAIREBOT { + int num_local; + int num_all; + int num_neighs_per_atom; + int num_types; + int frebo_from_atom, frebo_to_atom; + int neigh_from_atom, neigh_to_atom; + int rebuild_flag; + flt_t skin; + struct NeighListAIREBO neigh_lmp; + struct NeighListAIREBO neigh_rebo; + PairAIREBOIntelParam<flt_t,acc_t> params; + struct AtomAIREBOT<flt_t> * x; /* num_all */ + int * tag; /* num_all */ + flt_t * nC, * nH; /* num_all */ + int * map; /* num_types+1 */ + struct ResultForceT<acc_t> * result_f; /* num_all */ + acc_t result_eng; +}; + +template<class flt_t, class acc_t> +void aut_lennard_jones(KernelArgsAIREBOT<flt_t,acc_t> * ka, int morseflag); +template<class flt_t, class acc_t> +void aut_rebo_neigh(KernelArgsAIREBOT<flt_t,acc_t> * ka); +template<class flt_t, class acc_t> +void aut_frebo(KernelArgsAIREBOT<flt_t,acc_t> * ka, int torsion_flag); + +} + +#ifdef __INTEL_OFFLOAD +#pragma offload_attribute(pop) +#endif + +/*
---------------------------------------------------------------------- */ + +PairAIREBOIntel::PairAIREBOIntel(LAMMPS *lmp) : PairAIREBO(lmp) +{ + suffix_flag |= Suffix::INTEL; + REBO_cnumneigh = NULL; + REBO_num_skin = NULL; + REBO_list_data = NULL; + fix = NULL; +} + +/* ---------------------------------------------------------------------- */ + +PairAIREBOIntel::~PairAIREBOIntel() +{ + memory->destroy(REBO_cnumneigh); + memory->destroy(REBO_num_skin); + memory->destroy(REBO_list_data); +} + +/* ---------------------------------------------------------------------- */ + +void PairAIREBOIntel::init_style() +{ + PairAIREBO::init_style(); + neighbor->requests[neighbor->nrequest-1]->intel = 1; + + int ifix = modify->find_fix("package_intel"); + if (ifix < 0) + error->all(FLERR, + "The 'package intel' command is required for /intel styles"); + fix = static_cast<FixIntel *>(modify->fix[ifix]); + + fix->pair_init_check(); + #ifdef _LMP_INTEL_OFFLOAD + _cop = fix->coprocessor_number(); + #endif + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + pack_force_const(fix->get_mixed_buffers()); + fix->get_mixed_buffers()->need_tag(1); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + pack_force_const(fix->get_double_buffers()); + fix->get_double_buffers()->need_tag(1); + } else { + pack_force_const(fix->get_single_buffers()); + fix->get_single_buffers()->need_tag(1); + } + + #ifdef _LMP_INTEL_OFFLOAD + if (fix->offload_noghost()) + error->all(FLERR,"The 'ghost no' option cannot be used with airebo/intel."); + #endif +} + +/* ---------------------------------------------------------------------- */ + +template<typename T> +T * calloc_it(size_t size) { + return static_cast<T*>(calloc(size, sizeof(T))); +} + +void PairAIREBOIntel::compute(int eflag, int vflag) +{ + if (fix->precision()==FixIntel::PREC_MODE_MIXED) + compute<float,double>(eflag, vflag, fix->get_mixed_buffers()); + else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE) + compute<double,double>(eflag, vflag, fix->get_double_buffers()); + else + compute<float,float>(eflag, vflag, fix->get_single_buffers()); + + fix->balance_stamp(); + vflag_fdotr = 0; +} + +/* ---------------------------------------------------------------------- */ + +template <class flt_t, class acc_t> +PairAIREBOIntelParam<flt_t,acc_t> PairAIREBOIntel::get_param() +{ + PairAIREBOIntelParam<flt_t,acc_t> fc; + +#define A(a) \ + for (int i = 0; i < sizeof(this->a)/sizeof(double); i++) { \ + reinterpret_cast<flt_t*>(&fc.a)[i] = \ + reinterpret_cast<double*>(&this->a)[i]; \ + }
#define A0(a) \ + for (int i = 0; i < sizeof(fc.a)/sizeof(flt_t); i++) { \ + reinterpret_cast<flt_t*>(&fc.a)[i] = \ + reinterpret_cast<double*>(this->a[0])[i]; \ + }
#define B(a) \ + for (int i = 0; i < sizeof(this->a)/sizeof(double); i++) { \ + reinterpret_cast<acc_t*>(&fc.a)[i] = \ + reinterpret_cast<double*>(&this->a)[i]; \ + } + + A(cutlj) A(cutljrebosq) A(cut3rebo) A(sigmin); + A(sigcut) A0(cutljsq) A0(lj1) A0(lj2) A0(lj3); + A0(lj4) A(smin) A(Nmin) A(Nmax) A(NCmin) A(NCmax) A(thmin) A(thmax); + A(rcmin) A(rcmax) A(rcmaxsq) A(rcmaxp) A(Q) A(alpha) A(A) A(rho) A(BIJc); + A(Beta) A(rcLJmin) A(rcLJmax) A(rcLJmaxsq) A(bLJmin) A(bLJmax) A(epsilon); + A(sigma) A(epsilonT) A(gCdom) A(gC1) A(gC2) A(gHdom) A(gH) A(pCCdom); + A(pCHdom) A(pCC) A(pCH) A(piCCdom) A(piCHdom) A(piHHdom) B(piCC); + B(piCH) B(piHH) A(Tijdom) B(Tijc) A(PCCf) A(PCCdfdx) A(PCCdfdy) A(PCHf); + A(PCHdfdx) A(PCHdfdy) A(piCCf) A(piCCdfdx) A(piCCdfdy) A(piCCdfdz); + A(piCHf) A(piCHdfdx) A(piCHdfdy) A(piCHdfdz) A(piHHf) A(piHHdfdx); + A(piHHdfdy) A(piHHdfdz) A(Tf) A(Tdfdx) A(Tdfdy) A(Tdfdz); + +#undef A +#undef A0 +#undef B + for (int i = 0; i < 5; i++) fc.gDom[i] = fc.gCdom[i]; + for (int
i = 0; i < 4; i++) fc.gDom[5+i] = fc.gHdom[i]; + for (int i = 0; i < 4; i++) for (int j = 0; j < 6; j++) + fc.gVal[6*i+j] = fc.gC1[i][j]; + for (int i = 0; i < 4; i++) for (int j = 0; j < 6; j++) + fc.gVal[4*6+6*i+j] = fc.gC2[i][j]; + for (int i = 0; i < 3; i++) for (int j = 0; j < 6; j++) + fc.gVal[8*6+6*i+j] = fc.gH[i][j]; + + return fc; +} + +/* ---------------------------------------------------------------------- */ + +template <class flt_t, class acc_t> +void PairAIREBOIntel::compute( + int eflag, int vflag, IntelBuffers<flt_t,acc_t> * buffers +) { + if (eflag || vflag) ev_setup(eflag,vflag); + else evflag = vflag_fdotr = vflag_atom = 0; + pvector[0] = pvector[1] = pvector[2] = 0.0; + + const int inum = list->inum; + const int nthreads = comm->nthreads; + const int host_start = fix->host_start_pair(); + const int offload_end = fix->offload_end_pair(); + const int ago = neighbor->ago; + + if (ago != 0 && fix->separate_buffers() == 0) { + fix->start_watch(TIME_PACK); + int packthreads; + if (nthreads > INTEL_HTHREADS) packthreads = nthreads; + else packthreads = 1; + #if defined(_OPENMP) + #pragma omp parallel if(packthreads > 1) + #endif + { + int ifrom, ito, tid; + IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, + packthreads, sizeof(ATOM_T)); + buffers->thr_pack(ifrom,ito,ago); + } + fix->stop_watch(TIME_PACK); + } + + if (atom->nmax > maxlocal) { + #ifdef _LMP_INTEL_OFFLOAD + if (maxlocal > 0 && _cop >= 0) { + int * const REBO_numneigh = this->REBO_numneigh; + int * const REBO_num_skin = this->REBO_num_skin; + int * const REBO_cnumneigh = this->REBO_cnumneigh; + int * const REBO_list_data = this->REBO_list_data; + double * const nC = this->nC; + double * const nH = this->nH; + #pragma offload_transfer target(mic:_cop) \ + nocopy(REBO_numneigh: alloc_if(0) free_if(1)) \ + nocopy(REBO_cnumneigh: alloc_if(0) free_if(1)) \ + nocopy(REBO_num_skin: alloc_if(0) free_if(1)) \ + nocopy(REBO_list_data: alloc_if(0) free_if(1)) \ + nocopy(nH: alloc_if(0) free_if(1)) \ + nocopy(nC: alloc_if(0) free_if(1)) + } + #endif + maxlocal = atom->nmax; + memory->destroy(REBO_numneigh); + memory->destroy(REBO_cnumneigh); + memory->destroy(REBO_list_data); + memory->sfree(REBO_firstneigh); + memory->destroy(nC); + memory->destroy(nH); + memory->create(REBO_numneigh,maxlocal,"AIREBO:numneigh"); + memory->create(REBO_cnumneigh,maxlocal,"AIREBO:cnumneigh"); + memory->create(REBO_num_skin,maxlocal,"AIREBO:cnumneigh"); + int max_nbors = buffers->get_max_nbors(); + memory->create(REBO_list_data,maxlocal * max_nbors,"AIREBO:list_data"); + REBO_firstneigh = (int **) memory->smalloc(maxlocal*sizeof(int *), + "AIREBO:firstneigh"); + memory->create(nC,maxlocal,"AIREBO:nC"); + memory->create(nH,maxlocal,"AIREBO:nH"); + #ifdef _LMP_INTEL_OFFLOAD + if (_cop >= 0) { + int * const REBO_numneigh = this->REBO_numneigh; + int * const REBO_num_skin = this->REBO_num_skin; + int * const REBO_cnumneigh = this->REBO_cnumneigh; + int * const REBO_list_data = this->REBO_list_data; + double * const nC = this->nC; + double * const nH = this->nH; + const int mnml = max_nbors * maxlocal; + #pragma offload_transfer target(mic:_cop) \ + nocopy(REBO_numneigh: length(maxlocal) alloc_if(1) free_if(0)) \ + nocopy(REBO_cnumneigh:length(maxlocal) alloc_if(1) free_if(0)) \ + nocopy(REBO_num_skin: length(maxlocal) alloc_if(1) free_if(0)) \ + nocopy(REBO_list_data:length(mnml) alloc_if(1) free_if(0)) \ + nocopy(nH: length(maxlocal) alloc_if(1) free_if(0)) \ + nocopy(nC: length(maxlocal) alloc_if(1) free_if(0)) + } + #endif + } + + if (evflag || vflag_fdotr) {
int ovflag = 0; + if (vflag_fdotr) ovflag = 2; + else if (vflag) ovflag = 1; + if (eflag) { + eval<1,1>(1, ovflag, buffers, 0, offload_end); + eval<1,1>(0, ovflag, buffers, host_start, inum); + } else { + eval<1,0>(1, ovflag, buffers, 0, offload_end); + eval<1,0>(0, ovflag, buffers, host_start, inum); + } + } else { + eval<0,0>(1, 0, buffers, 0, offload_end); + eval<0,0>(0, 0, buffers, host_start, inum); + } +} + +/* ---------------------------------------------------------------------- */ + +template <int EVFLAG, int EFLAG, class flt_t, class acc_t> +void PairAIREBOIntel::eval( + const int offload, const int vflag, + IntelBuffers<flt_t,acc_t> * buffers, + const int astart, const int aend +) { + const int inum = aend - astart; + if (inum == 0) { + return; + } + int nlocal, nall, minlocal; + fix->get_buffern(offload, nlocal, nall, minlocal); + + const int ago = neighbor->ago; + IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall); + + ATOM_T * _noalias const x = buffers->get_x(offload); + const int * _noalias const numneighhalf = buffers->get_atombin(); + const int * _noalias const numneigh = list->numneigh; + const int * _noalias const cnumneigh = buffers->cnumneigh(list); + const int * _noalias const firstneigh = buffers->firstneigh(list); + int * const tag = atom->tag; + + const int ntypes = atom->ntypes + 1; + const int eatom = this->eflag_atom; + + int x_size, q_size, f_stride, ev_size, separate_flag; + IP_PRE_get_transfern(ago, 1 /*NEWTON_PAIR*/, EFLAG, vflag, + buffers, offload, fix, separate_flag, + x_size, q_size, ev_size, f_stride); + + int tc; + FORCE_T * _noalias f_start; + acc_t * _noalias ev_global; + IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global); + + const int nthreads = tc; + const double skin = neighbor->skin; + const int max_nbor = buffers->get_max_nbors(); + const PairAIREBOIntelParam<flt_t,acc_t> param = get_param<flt_t,acc_t>(); + + // offload here + #ifdef _LMP_INTEL_OFFLOAD + int *overflow = fix->get_off_overflow_flag(); + double *timer_compute = fix->off_watch_pair(); + + int * const REBO_numneigh = this->REBO_numneigh; + int * const REBO_num_skin = this->REBO_num_skin; + int * const REBO_cnumneigh = this->REBO_cnumneigh; + int * const REBO_list_data = this->REBO_list_data; + double * const nC = this->nC; + double * const nH = this->nH; + const int torflag = this->torflag; + const int ljflag = this->ljflag; + const int morseflag = this->morseflag; + int * const map = this->map; + + if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY); + + #pragma offload target(mic:_cop) if(offload) \ + in(firstneigh:length(0) alloc_if(0) free_if(0)) \ + in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ + in(numneigh:length(0) alloc_if(0) free_if(0)) \ + in(numneighhalf:length(0) alloc_if(0) free_if(0)) \ + in(x:length(x_size) alloc_if(0) free_if(0)) \ + in(overflow:length(0) alloc_if(0) free_if(0)) \ + in(astart,nthreads,inum,nall,ntypes,vflag,eatom) \ + in(f_stride,nlocal,minlocal,separate_flag,offload) \ + out(f_start:length(f_stride) alloc_if(0) free_if(0)) \ + out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \ + out(timer_compute:length(1) alloc_if(0) free_if(0)) \ + in(param,skin,max_nbor) \ + in(tag: length(0) alloc_if(0) free_if(0)) \ + in(torflag, ljflag, morseflag, ago) \ + in(nC: length(0) alloc_if(0) free_if(0)) \ + in(nH: length(0) alloc_if(0) free_if(0)) \ + in(REBO_numneigh: length(0) alloc_if(0) free_if(0)) \ + in(REBO_cnumneigh: length(0) alloc_if(0) free_if(0)) \ + in(REBO_num_skin: length(0) alloc_if(0) free_if(0)) \ + in(REBO_list_data: length(0) alloc_if(0) free_if(0)) \ + in(map: length(0) alloc_if(0)
free_if(0)) \ + signal(f_start) + #endif + { + #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) + *timer_compute = MIC_Wtime(); + #endif + + IP_PRE_repack_for_offload(1 /*NEWTON_PAIR*/, separate_flag, nlocal, nall, + f_stride, x, 0/*q*/); + + acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; + if (EVFLAG) { + oevdwl = oecoul = (acc_t)0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; + } + + // loop over neighbors of my atoms + #if defined(_OPENMP) + #pragma omp parallel \ + shared(f_start,f_stride,nlocal,nall,minlocal) \ + reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5) + #endif + { + int iifrom, iito, tid; + IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads); + iifrom += astart; + iito += astart; + + int neigh_iifrom, neigh_iito; + IP_PRE_omp_range(neigh_iifrom, neigh_iito, tid, nall, nthreads); + + FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride); + memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); + + KernelArgsAIREBOT args; + args.num_local = nlocal; + args.num_all = nall; + args.num_neighs_per_atom = max_nbor; + args.num_types = ntypes; + args.frebo_from_atom = 0; + args.frebo_to_atom = args.num_local; + args.neigh_from_atom = 0; + args.neigh_to_atom = args.num_all; + args.rebuild_flag = ago == 0; + args.skin = skin; + args.neigh_lmp.num = const_cast(numneigh); + args.neigh_lmp.num_half = const_cast(numneighhalf); + args.neigh_lmp.offset = const_cast(cnumneigh); + args.neigh_lmp.entries = const_cast(firstneigh); + args.neigh_rebo.num = REBO_numneigh; + args.neigh_rebo.num_half = REBO_num_skin; + args.neigh_rebo.offset = REBO_cnumneigh; + args.neigh_rebo.entries = REBO_list_data; + args.params = param; + args.tag = tag; + args.nC = reinterpret_cast(nC); + args.nH = reinterpret_cast(nH); + args.map = map; + args.result_eng = 0; + args.x = (AtomAIREBOT*) x; + + args.result_f = (ResultForceT *) f; + args.neigh_from_atom = neigh_iifrom; + args.neigh_to_atom = neigh_iito; + args.frebo_from_atom = iifrom; + args.frebo_to_atom = iito; + + aut_rebo_neigh(&args); + #if defined(_OPENMP) + #pragma omp barrier + #endif + aut_frebo(&args, torflag); + if (ljflag) aut_lennard_jones(&args, morseflag); + + oevdwl += args.result_eng; + + IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start, f_stride, x, + offload, vflag, ov0, ov1, ov2, ov3, ov4, ov5); + } // end of omp parallel region + IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag, + ov0, ov1, ov2, ov3, ov4, ov5); + if (EVFLAG) { + if (EFLAG) { + ev_global[0] = oevdwl; + ev_global[1] = oecoul; + } + if (vflag) { + ev_global[2] = ov0; + ev_global[3] = ov1; + ev_global[4] = ov2; + ev_global[5] = ov3; + ev_global[6] = ov4; + ev_global[7] = ov5; + } + } + #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) + *timer_compute = MIC_Wtime() - *timer_compute; + #endif + } // end of offload region + + if (offload) + fix->stop_watch(TIME_OFFLOAD_LATENCY); + else + fix->stop_watch(TIME_HOST_PAIR); + + if (EVFLAG) + fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); + else + fix->add_result_array(f_start, 0, offload); +} + +/* ---------------------------------------------------------------------- */ + +template +void PairAIREBOIntel::pack_force_const(IntelBuffers * buffers) { + int tp1 = atom->ntypes + 1; + + buffers->set_ntypes(tp1,1); + flt_t **cutneighsq = buffers->get_cutneighsq(); + flt_t **cutneighghostsq = buffers->get_cutneighghostsq(); + + // Repeat cutsq calculation because done after call to init_style + double cut, cutneigh; + for (int i = 1; i <= atom->ntypes; i++) { + for 
(int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i, j); + cutneigh = cut + neighbor->skin; + cutsq[i][j] = cutsq[j][i] = cut*cut; + cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh; + cut = cutghost[i][j] + neighbor->skin; + cutneighghostsq[i][j] = cutneighghostsq[j][i] = cut*cut; + } + } + } + + #ifdef _LMP_INTEL_OFFLOAD + if (_cop < 0) return; + flt_t * ocutneighsq = cutneighsq[0]; + size_t VL = 512 / 8 / sizeof(flt_t); + int ntypes = tp1; + int tp1sq = tp1 * tp1; + // TODO the lifecycle of "map" is currently not 100% correct + // it might not be freed if this method is called more than once + int * map = this->map; + #pragma offload_transfer target(mic:_cop) \ + in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0)) \ + in(map: length(tp1) alloc_if(1) free_if(0)) + #endif + +} + +/* ---------------------------------------------------------------------- + Implementation + ---------------------------------------------------------------------- */ + +namespace { + +#ifdef __INTEL_OFFLOAD +#pragma offload_attribute(push, target(mic)) +#endif + +namespace overloaded { + double sqrt(double a) { return ::sqrt(a); } + float sqrt(float a) { return ::sqrtf(a); } + double sin(double a) { return ::sin(a); } + float sin(float a) { return ::sinf(a); } + double cos(double a) { return ::cos(a); } + float cos(float a) { return ::cosf(a); } + double exp(double a) { return ::exp(a); } + float exp(float a) { return ::expf(a); } + double pow(double a, double b) { return ::pow(a, b); } + float pow(float a, float b) { return ::powf(a, b); } +} + +/* ---------------------------------------------------------------------- + Scalar AIREBO implementation, standalone, with massive code reuse + compared to original code. + ---------------------------------------------------------------------- */ + +#define M_PI 3.14159265358979323846 /* pi */ + +#define CARBON 0 +#define HYDROGEN 1 +#define TOL 1.0e-9 + +template +inline T fmin_nonan(T a, T b) { + return a < b ? a : b; +} +template +inline T fmax_nonan(T a, T b) { + return a > b ? 
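+
+/* Reviewer note (commentary added in editing, not part of the original
+ * patch): unlike std::fmin/std::fmax, which are required to return the
+ * non-NaN operand, these plain ternaries carry no NaN special-casing and
+ * therefore typically compile down to a single vector min/max instruction
+ * each.  They are used throughout this file for clamping, e.g.
+ *
+ *   cosjik = fmin_nonan<flt_t>(1, fmax_nonan<flt_t>(-1, cosjik));
+ *
+ * which forces a cosine that drifted outside [-1,1] from roundoff back
+ * into the valid domain of the angular g spline.
+ */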
+
+template <typename flt_t>
+inline flt_t Sp(flt_t r, flt_t lo, flt_t hi, flt_t * del) {
+  flt_t t = (r - lo) / (hi - lo);
+  if (t <= 0) {
+    if (del) *del = 0;
+    return 1;
+  } else if (t >= 1) {
+    if (del) *del = 0;
+    return 0;
+  } else {
+    t *= static_cast<flt_t>(M_PI);
+    if (del) *del = static_cast<flt_t>(-0.5 * M_PI)
+        * overloaded::sin(t) / (hi - lo);
+    return static_cast<flt_t>(0.5) * (1 + overloaded::cos(t));
+  }
+}
+
+template <typename flt_t>
+inline flt_t Sp2(flt_t r, flt_t lo, flt_t hi, flt_t * del) {
+  flt_t t = (r - lo) / (hi - lo);
+  if (t <= 0) {
+    if (del) *del = 0;
+    return 1;
+  } else if (t >= 1) {
+    if (del) *del = 0;
+    return 0;
+  } else {
+    if (del) *del = 6 * (t * t - t) / (hi - lo);
+    return 1 - t * t * (3 - 2 * t);
+  }
+}
+
+template <typename flt_t>
+inline flt_t eval_poly_lin(int n, flt_t * coeffs, flt_t x, flt_t * deriv) {
+  flt_t result = coeffs[n - 1];
+  *deriv = coeffs[n - 1] * (n - 1);
+  for (int i = n - 2; i > 0; i--) {
+    result = coeffs[i] + x * result;
+    *deriv = coeffs[i] * i + x * (*deriv);
+  }
+  result = coeffs[0] + x * result;
+  return result;
+}
+
+template <typename flt_t, typename acc_t>
+inline flt_t gSpline(KernelArgsAIREBOT<flt_t,acc_t> * ka, int itype,
+    flt_t cos, flt_t N, flt_t * dgdc, flt_t * dgdN) {
+  flt_t NCmin = ka->params.NCmin;
+  flt_t NCmax = ka->params.NCmax;
+  int index = 0;
+  flt_t * gDom = NULL;
+  int nDom = 0;
+  int offs = 0;
+  if (itype == 0) {
+    nDom = 4;
+    gDom = &ka->params.gCdom[0];
+    if (N > NCmin) offs = 4 * 6;
+  } else {
+    nDom = 3;
+    gDom = &ka->params.gHdom[0];
+    offs = 8 * 6;
+  }
+  cos = fmax_nonan(gDom[0], fmin_nonan(gDom[nDom], cos));
+  int i;
+  for (i = 0; i < nDom; i++) {
+    if (cos >= gDom[i] && cos <= gDom[i + 1]) {
+      index = i;
+    }
+  }
+  flt_t g = eval_poly_lin(6, &ka->params.gVal[offs+index*6], cos, dgdc);
+  *dgdN = 0;
+  if (itype == 0 && N > NCmin && N < NCmax) {
+    flt_t dg1;
+    flt_t g1 = eval_poly_lin(6, &ka->params.gVal[index*6], cos, &dg1);
+    flt_t dS;
+    flt_t cut = Sp(N, NCmin, NCmax, &dS);
+    *dgdN = dS * (g1 - g);
+    g = g + cut * (g1 - g);
+    *dgdc = *dgdc + cut * (dg1 - *dgdc);
+  }
+  return g;
+}
+
+template <typename flt_t>
+inline flt_t eval_poly_bi(int n, flt_t * coeffs, flt_t x, flt_t y,
+                          flt_t * deriv) {
+  flt_t dy;
+  flt_t vy = eval_poly_lin(n, &coeffs[n * (n - 1)], y, &dy);
+  flt_t result = vy;
+  deriv[0] = vy * (n - 1);
+  deriv[1] = dy;
+  for (int i = n - 2; i > 0; i--) {
+    vy = eval_poly_lin(n, &coeffs[n * i], y, &dy);
+    result = vy + x * result;
+    deriv[0] = vy * i + x * deriv[0];
+    deriv[1] = dy + x * deriv[1];
+  }
+  result = eval_poly_lin(n, &coeffs[0], y, &dy) + x * result;
+  deriv[1] = dy + x * deriv[1];
+  return result;
+}
+
+template <typename flt_t>
+inline flt_t eval_poly_tri(int n, flt_t * coeffs, flt_t x, flt_t y, flt_t z,
+                           flt_t * deriv) {
+  flt_t dyz[2];
+  flt_t vyz = eval_poly_bi(n, &coeffs[n * n * (n - 1)], y, z, &dyz[0]);
+  flt_t result = vyz;
+  deriv[0] = vyz * (n - 1);
+  deriv[1] = dyz[0];
+  deriv[2] = dyz[1];
+  for (int i = n - 2; i > 0; i--) {
+    vyz = eval_poly_bi(n, &coeffs[n * n * i], y, z, &dyz[0]);
+    result = vyz + x * result;
+    deriv[0] = vyz * i + x * deriv[0];
+    deriv[1] = dyz[0] + x * deriv[1];
+    deriv[2] = dyz[1] + x * deriv[2];
+  }
+  result = eval_poly_bi(n, &coeffs[0], y, z, &dyz[0]) + x * result;
+  deriv[1] = dyz[0] + x * deriv[1];
+  deriv[2] = dyz[1] + x * deriv[2];
+  return result;
+}
+
+template <typename flt_t, typename acc_t>
+inline flt_t PijSpline(KernelArgsAIREBOT<flt_t,acc_t> * ka, int itype,
+    int jtype, flt_t NC, flt_t NH, flt_t * dN) {
+  dN[0] = 0.0;
+  dN[1] = 0.0;
+  if (itype == HYDROGEN) return 0;
+  flt_t *pCJdom = jtype == CARBON ?
&ka->params.pCCdom[0][0] : + &ka->params.pCHdom[0][0]; + NC = fmax_nonan(pCJdom[0], fmin_nonan(pCJdom[1], NC)); + NH = fmax_nonan(pCJdom[2], fmin_nonan(pCJdom[3], NH)); + int nC = floor(NC); + int nH = floor(NH); + #define PijSelect(a, b) (jtype == CARBON ? ka->params.a : ka->params.b) + if (fabs(NC - nC) < TOL && fabs(NH - nH) < TOL) { + dN[0] = PijSelect(PCCdfdx, PCHdfdx)[nC][nH]; + dN[1] = PijSelect(PCCdfdy, PCHdfdy)[nC][nH]; + return PijSelect(PCCf, PCHf)[nC][nH]; + } + if (NC == pCJdom[1]) nC -= 1; + if (NH == pCJdom[3]) nH -= 1; + return eval_poly_bi(4, &PijSelect(pCC, pCH)[nC][nH][0], NC, NH, dN); + #undef PijSelect +} + +template +inline flt_t TijSpline(KernelArgsAIREBOT * ka, flt_t Nij, + flt_t Nji, flt_t Nijconj, acc_t * dN3) { + flt_t * Tijdom = &ka->params.Tijdom[0][0]; + Nij = fmax_nonan(Tijdom[0], fmin_nonan(Tijdom[1], Nij)); + Nji = fmax_nonan(Tijdom[2], fmin_nonan(Tijdom[3], Nji)); + Nijconj = fmax_nonan(Tijdom[4], fmin_nonan(Tijdom[5], Nijconj)); + int nij = floor(Nij); + int nji = floor(Nji); + int nijconj = floor(Nijconj); + if (fabs(Nij - nij) < TOL && fabs(Nji - nji) < + TOL && fabs(Nijconj - nijconj) < TOL) { + dN3[0] = ka->params.Tdfdx[nij][nji][nijconj]; + dN3[1] = ka->params.Tdfdy[nij][nji][nijconj]; + dN3[2] = ka->params.Tdfdz[nij][nji][nijconj]; + return ka->params.Tf[nij][nji][nijconj]; + } + if (Nij == Tijdom[1]) nij -= 1; + if (Nji == Tijdom[3]) nji -= 1; + if (Nijconj == Tijdom[5]) nijconj -= 1; + return eval_poly_tri(4, &ka->params.Tijc[nij][nji][nijconj][0], Nij, + Nji, Nijconj, dN3); +} + +template +inline flt_t piRCSpline(KernelArgsAIREBOT * ka, int itype, + int jtype, flt_t Nij, flt_t Nji, flt_t Nijconj, acc_t * dN3) { + const int HH = 2; + const int CH = 1; + /* const int CC = 0; */ + int select = itype + jtype; + #define piRCSelect(a, b, c) (select == HH ? ka->params.a : select == CH ? \ + ka->params.b : ka->params.c) + flt_t * piIJdom = &piRCSelect(piHHdom, piCHdom, piCCdom)[0][0]; + if (select == HH) { + if (Nij < piIJdom[0] || Nij > piIJdom[1] || Nji < piIJdom[2] || + Nji > piIJdom[3] || Nijconj < piIJdom[4] || Nijconj > piIJdom[5]) { + Nij = 0; + Nji = 0; + Nijconj = 0; + } + } + Nij = fmax_nonan(piIJdom[0], fmin_nonan(piIJdom[1], Nij)); + Nji = fmax_nonan(piIJdom[2], fmin_nonan(piIJdom[3], Nji)); + Nijconj = fmax_nonan(piIJdom[4], fmin_nonan(piIJdom[5], Nijconj)); + int nij = floor(Nij); + int nji = floor(Nji); + int nijconj = floor(Nijconj); + if (fabs(Nij - nij) < TOL && fabs(Nji - nji) < + TOL && fabs(Nijconj - nijconj) < TOL) { + dN3[0] = piRCSelect(piHHdfdx, piCHdfdx, piCCdfdx)[nij][nji][nijconj]; + dN3[1] = piRCSelect(piHHdfdy, piCHdfdy, piCCdfdy)[nij][nji][nijconj]; + dN3[2] = piRCSelect(piHHdfdz, piCHdfdz, piCCdfdz)[nij][nji][nijconj]; + return piRCSelect(piHHf, piCHf, piCCf)[nij][nji][nijconj]; + } + if (Nij == piIJdom[1]) nij -= 1; + if (Nji == piIJdom[3]) nji -= 1; + if (Nijconj == piIJdom[5]) nijconj -= 1; + return eval_poly_tri(4, + &piRCSelect(piHH, piCH, piCC)[nij][nji][nijconj][0], Nij, Nji, Nijconj, + dN3); + #undef piRCSelect +} + +/* + * Implements the p_ij term in airebo, which occurs on 4 different occasions + * in the original lammps code. 
+ */ +template +inline flt_t frebo_pij(KernelArgsAIREBOT * ka, int i, int j, + flt_t rijx, flt_t rijy, flt_t rijz, flt_t rijmag, flt_t wij, flt_t VA, + flt_t * sum_N, acc_t fij[3]) { + ResultForceT * result_f = ka->result_f; + AtomAIREBOT * x = ka->x; + int * map = ka->map; + flt_t * nC = ka->nC; + flt_t * nH = ka->nH; + flt_t x_i = x[i].x; + flt_t y_i = x[i].y; + flt_t z_i = x[i].z; + int itype = map[x[i].w]; + int jtype = map[x[j].w]; + flt_t invrijm = 1 / rijmag; + flt_t invrijm2 = invrijm * invrijm; + flt_t rcminij = ka->params.rcmin[itype][jtype]; + flt_t rcmaxij = ka->params.rcmax[itype][jtype]; + flt_t Nmin = ka->params.Nmin; + flt_t Nmax = ka->params.Nmax; + flt_t Nij = nC[i] + nH[i] - wij; + flt_t NijC = nC[i] - wij * (1 - jtype); + flt_t NijH = nH[i] - wij * jtype; + flt_t sum_pij = 0; + flt_t sum_dpij_dN = 0; + flt_t dN2[2] = {0}; + flt_t pij = 0; + *sum_N = 0; + int * neighs = ka->neigh_rebo.entries + ka->neigh_rebo.offset[i]; + int pass; + for (pass = 0; pass < 2; pass++) { + int kk; + int knum = ka->neigh_rebo.num[i]; + for (kk = 0; kk < knum; kk++) { + int k = neighs[kk]; + if (k == j) continue; + flt_t rikx = x_i - x[k].x; + flt_t riky = y_i - x[k].y; + flt_t rikz = z_i - x[k].z; + int ktype = map[x[k].w]; + flt_t rikmag = overloaded::sqrt(rikx * rikx + riky * riky + rikz * rikz); + flt_t rho_k = ka->params.rho[ktype][1]; + flt_t rho_j = ka->params.rho[jtype][1]; + flt_t lamdajik = 4 * itype * ((rho_k - rikmag) - (rho_j - rijmag)); + flt_t ex_lam = exp(lamdajik); + flt_t rcminik = ka->params.rcmin[itype][ktype]; + flt_t rcmaxik = ka->params.rcmax[itype][ktype]; + flt_t dwik; + flt_t wik = Sp(rikmag, rcminik, rcmaxik, &dwik); + flt_t Nki = nC[k] + nH[k] - wik; + flt_t cosjik = (rijx * rikx + rijy * riky + rijz * rikz) / + (rijmag * rikmag); + cosjik = fmin_nonan(1, fmax_nonan(-1, cosjik)); + flt_t dgdc, dgdN; + flt_t g = gSpline(ka, itype, cosjik, Nij, &dgdc, &dgdN); + if (pass == 0) { + sum_pij += wik * g * ex_lam; + sum_dpij_dN += wik * dgdN * ex_lam; + flt_t cutN = Sp(Nki, Nmin, Nmax, NULL); + *sum_N += (1 - ktype) * wik * cutN; + } else { + flt_t tmp = -0.5 * pij * pij * pij; + flt_t invrikm = 1 / rikmag; + flt_t rjkx = rikx - rijx; + flt_t rjky = riky - rijy; + flt_t rjkz = rikz - rijz; + flt_t rjkmag = sqrt(rjkx * rjkx + rjky * rjky + rjkz * rjkz); + flt_t rijrik = 2 * rijmag * rikmag; + flt_t rr = rijmag * rijmag - rikmag * rikmag; + flt_t dctdjk = -2 / rijrik; + flt_t dctdik = (-rr + rjkmag * rjkmag) / (rijrik * rikmag * rikmag); + flt_t dctdij = (rr + rjkmag * rjkmag) / (rijrik * rijmag * rijmag); + + acc_t fi[3], fj[3], fk[3]; + flt_t pref = 0.5 * VA * tmp; + flt_t tmp20 = pref * wik * dgdc * ex_lam; + fj[0] = fj[1] = fj[2] = 0; + fi[0] = -tmp20 * dctdik * rikx; + fi[1] = -tmp20 * dctdik * riky; + fi[2] = -tmp20 * dctdik * rikz; + fk[0] = tmp20 * dctdik * rikx; + fk[1] = tmp20 * dctdik * riky; + fk[2] = tmp20 * dctdik * rikz; + + fij[0] += -tmp20 * dctdij * rijx; + fij[1] += -tmp20 * dctdij * rijy; + fij[2] += -tmp20 * dctdij * rijz; + + fi[0] += -tmp20 * dctdjk * rjkx; + fi[1] += -tmp20 * dctdjk * rjky; + fi[2] += -tmp20 * dctdjk * rjkz; + fk[0] += tmp20 * dctdjk * rjkx; + fk[1] += tmp20 * dctdjk * rjky; + fk[2] += tmp20 * dctdjk * rjkz; + fij[0] -= -tmp20 * dctdjk * rjkx; + fij[1] -= -tmp20 * dctdjk * rjky; + fij[2] -= -tmp20 * dctdjk * rjkz; + + flt_t tmp21 = pref * (wik * g * ex_lam * 4 * itype); + fij[0] -= 1 * tmp21 * rijx * invrijm; + fij[1] -= 1 * tmp21 * rijy * invrijm; + fij[2] -= 1 * tmp21 * rijz * invrijm; + fi[0] -= tmp21 * (-rikx * invrikm); + fi[1] 
-= tmp21 * (-riky * invrikm); + fi[2] -= tmp21 * (-rikz * invrikm); + fk[0] -= tmp21 * (rikx * invrikm); + fk[1] -= tmp21 * (riky * invrikm); + fk[2] -= tmp21 * (rikz * invrikm); + + // coordination forces + + // dwik forces + flt_t tmp22 = pref * dwik * g * ex_lam * invrikm; + fi[0] -= tmp22 * rikx; + fi[1] -= tmp22 * riky; + fi[2] -= tmp22 * rikz; + fk[0] += tmp22 * rikx; + fk[1] += tmp22 * riky; + fk[2] += tmp22 * rikz; + + // PIJ forces + flt_t tmp23 = pref * dN2[ktype] * dwik * invrikm; + fi[0] -= tmp23 * rikx; + fi[1] -= tmp23 * riky; + fi[2] -= tmp23 * rikz; + fk[0] += tmp23 * rikx; + fk[1] += tmp23 * riky; + fk[2] += tmp23 * rikz; + + // dgdN forces + flt_t tmp24 = pref * sum_dpij_dN * dwik * invrikm; + fi[0] -= tmp24 * rikx; + fi[1] -= tmp24 * riky; + fi[2] -= tmp24 * rikz; + fk[0] += tmp24 * rikx; + fk[1] += tmp24 * riky; + fk[2] += tmp24 * rikz; + + result_f[i].x += fi[0]; + result_f[i].y += fi[1]; + result_f[i].z += fi[2]; + result_f[j].x += fj[0]; + result_f[j].y += fj[1]; + result_f[j].z += fj[2]; + result_f[k].x += fk[0]; + result_f[k].y += fk[1]; + result_f[k].z += fk[2]; + } + } + if (pass == 0) { + flt_t PijS = PijSpline(ka, itype, jtype, NijC, NijH, dN2); + pij = 1 / overloaded::sqrt(1 + sum_pij + PijS); + } + } + return pij; +} + +template +inline flt_t frebo_pi_rc(KernelArgsAIREBOT * ka, int itype, + int jtype, flt_t Nij, flt_t Nji, flt_t Nijconj, flt_t * dN3) { + acc_t dN3tmp[3] = {0}; + flt_t ret = piRCSpline(ka, itype, jtype, Nij, Nji, Nijconj, dN3tmp); + dN3[0] = dN3tmp[0]; + dN3[1] = dN3tmp[1]; + dN3[2] = dN3tmp[2]; + return ret; +} + +template +inline flt_t frebo_Tij(KernelArgsAIREBOT * ka, int itype, + int jtype, flt_t Nij, flt_t Nji, flt_t Nijconj, flt_t * dN3) { + dN3[0] = 0; + dN3[1] = 0; + dN3[2] = 0; + if (itype == HYDROGEN || jtype == HYDROGEN) return 0; + acc_t dN3tmp[3] = {0}; + flt_t ret = TijSpline(ka, Nij, Nji, Nijconj, dN3tmp); + dN3[0] = dN3tmp[0]; + dN3[1] = dN3tmp[1]; + dN3[2] = dN3tmp[2]; + return ret; +} + +/* + * Implements a scalar version of the sum cos^1(omega) term used in pi^dh_ij. + * Occurs in both bondorder and bondorderLJ. 
+ */
+template <typename flt_t, typename acc_t>
+inline flt_t frebo_sum_omega(KernelArgsAIREBOT<flt_t,acc_t> * ka, int i,
+    int j, flt_t r23x, flt_t r23y, flt_t r23z, flt_t r23mag, flt_t VA,
+    acc_t fij[3]) {
+  ResultForceT<acc_t> * result_f = ka->result_f;
+  acc_t sum_omega = 0;
+  int a2 = i;
+  int a3 = j;
+  flt_t r32x = - r23x;
+  flt_t r32y = - r23y;
+  flt_t r32z = - r23z;
+  int * map = ka->map;
+  AtomAIREBOT<flt_t> * x = ka->x;
+  flt_t thmin = ka->params.thmin;
+  flt_t thmax = ka->params.thmax;
+  int itype = map[x[i].w];
+  int jtype = map[x[j].w];
+  int * neighs_i = ka->neigh_rebo.entries + ka->neigh_rebo.offset[i];
+  int * neighs_j = ka->neigh_rebo.entries + ka->neigh_rebo.offset[j];
+  int num_i = ka->neigh_rebo.num[i];
+  int num_j = ka->neigh_rebo.num[j];
+  int kk;
+  for (kk = 0; kk < num_i; kk++) {
+    int k = neighs_i[kk];
+    if (k == j) continue;
+    int a1 = k;
+    int ktype = map[x[k].w];
+    flt_t r21x = x[a2].x - x[a1].x;
+    flt_t r21y = x[a2].y - x[a1].y;
+    flt_t r21z = x[a2].z - x[a1].z;
+    flt_t r21mag = overloaded::sqrt(r21x * r21x + r21y * r21y + r21z * r21z);
+    flt_t cos321 = (r23x * r21x + r23y * r21y + r23z * r21z) /
+        (r23mag * r21mag);
+    cos321 = fmin_nonan<flt_t>(1, fmax_nonan<flt_t>(-1, cos321));
+    flt_t sin321 = overloaded::sqrt(1 - cos321 * cos321);
+    if (sin321 == 0) continue;
+    flt_t sink2i = 1 / (sin321 * sin321);
+    flt_t rik2i = 1 / (r21mag * r21mag);
+    flt_t rr = r23mag * r23mag - r21mag * r21mag;
+    flt_t r31x = r21x - r23x;
+    flt_t r31y = r21y - r23y;
+    flt_t r31z = r21z - r23z;
+    flt_t r31mag2 = r31x * r31x + r31y * r31y + r31z * r31z;
+    flt_t rijrik = 2 * r23mag * r21mag;
+    flt_t r21mag2 = r21mag * r21mag;
+    flt_t dctik = (-rr + r31mag2) / (rijrik * r21mag2);
+    flt_t dctij = (rr + r31mag2) / (rijrik * r23mag * r23mag);
+    flt_t dctjk = -2 / rijrik;
+    flt_t rcmin21 = ka->params.rcmin[itype][ktype];
+    flt_t rcmaxp21 = ka->params.rcmaxp[itype][ktype];
+    flt_t dw21;
+    flt_t w21 = Sp(r21mag, rcmin21, rcmaxp21, &dw21);
+    // Why does this additional cutoff on the cosine exist?  The original
+    // code by Stuart answers this: it avoids issues when bonds in the
+    // dihedral are linear by switching the dihedral off beforehand.
+    // This is the reason for both the sin == 0 checks and the
+    // tspjik = Sp2(..) calls.  Unfortunately, this is not stated
+    // explicitly in the original paper.  It might be similar in purpose
+    // to the H(sin - s^min) term that appears in that paper, but it
+    // cannot be found in the original REBO papers.
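+    // Added commentary (editor's sketch, not in the original patch):
+    // the quantity accumulated into sum_omega in the loop below is
+    //   (1 - cos^2(w_kijl)) * w21 * w34 * (1 - tspjik) * (1 - tspijl)
+    // with cos(w) = (r32 x r21).(r23 x r34) / (|r32 x r21| |r23 x r34|);
+    // the two Sp2() factors switch the dihedral contribution off smoothly
+    // as either bond angle approaches 180 degrees, where w is undefined.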
+    flt_t dtsjik;
+    flt_t tspjik = Sp2(cos321, thmin, thmax, &dtsjik);
+    dtsjik = -dtsjik;
+    int ll;
+    for (ll = 0; ll < num_j; ll++) {
+      int l = neighs_j[ll];
+      if (l == i || l == k) continue;
+      int ltype = map[x[l].w];
+      int a4 = l;
+      flt_t r34x = x[a3].x - x[a4].x;
+      flt_t r34y = x[a3].y - x[a4].y;
+      flt_t r34z = x[a3].z - x[a4].z;
+      flt_t r34mag = overloaded::sqrt(r34x * r34x + r34y * r34y +
+          r34z * r34z);
+      flt_t cos234 = (r32x * r34x + r32y * r34y + r32z * r34z) /
+          (r23mag * r34mag);
+      cos234 = fmin_nonan<flt_t>(1, fmax_nonan<flt_t>(-1, cos234));
+      flt_t sin234 = overloaded::sqrt(1 - cos234 * cos234);
+      if (sin234 == 0) continue;
+      flt_t sinl2i = 1 / (sin234 * sin234);
+      flt_t rjl2i = 1 / (r34mag * r34mag);
+
+      flt_t rcminjl = ka->params.rcmin[jtype][ltype];
+      flt_t rcmaxpjl = ka->params.rcmaxp[jtype][ltype];
+      flt_t dw34;
+      flt_t w34 = Sp(r34mag, rcminjl, rcmaxpjl, &dw34);
+      flt_t rr = (r23mag * r23mag) - (r34mag * r34mag);
+      flt_t r24x = r23x + r34x;
+      flt_t r24y = r23y + r34y;
+      flt_t r24z = r23z + r34z;
+      flt_t r242 = (r24x * r24x) + (r24y * r24y) + (r24z * r24z);
+      flt_t rijrjl = 2 * r23mag * r34mag;
+      flt_t rjl2 = r34mag * r34mag;
+      flt_t dctjl = (-rr + r242) / (rijrjl * rjl2);
+      flt_t dctji = (rr + r242) / (rijrjl * r23mag * r23mag);
+      flt_t dctil = -2 / rijrjl;
+      flt_t dtsijl;
+      flt_t tspijl = Sp2(cos234, thmin, thmax, &dtsijl);
+      dtsijl = -dtsijl; // need minus sign
+      flt_t prefactor = VA;
+
+      flt_t cross321x = (r32y * r21z) - (r32z * r21y);
+      flt_t cross321y = (r32z * r21x) - (r32x * r21z);
+      flt_t cross321z = (r32x * r21y) - (r32y * r21x);
+      flt_t cross234x = (r23y * r34z) - (r23z * r34y);
+      flt_t cross234y = (r23z * r34x) - (r23x * r34z);
+      flt_t cross234z = (r23x * r34y) - (r23y * r34x);
+
+      flt_t cwnum = (cross321x * cross234x) + (cross321y * cross234y) +
+          (cross321z * cross234z);
+      flt_t cwnom = r21mag * r34mag * r23mag * r23mag * sin321 * sin234;
+      flt_t om1234 = cwnum / cwnom;
+      flt_t cw = om1234;
+      sum_omega += ((1 - (om1234 * om1234)) * w21 * w34) *
+          (1 - tspjik) * (1 - tspijl);
+      if (VA == static_cast<flt_t>(0.0)) continue;
+
+      flt_t dt1dik = (rik2i) - (dctik * sink2i * cos321);
+      flt_t dt1djk = (-dctjk * sink2i * cos321);
+      flt_t dt1djl = (rjl2i) - (dctjl * sinl2i * cos234);
+      flt_t dt1dil = (-dctil * sinl2i * cos234);
+      flt_t dt1dij = (2 / (r23mag * r23mag)) -
+          (dctij * sink2i * cos321) - (dctji * sinl2i * cos234);
+
+      flt_t dt2dikx = (-r23z * cross234y) + (r23y * cross234z);
+      flt_t dt2diky = (-r23x * cross234z) + (r23z * cross234x);
+      flt_t dt2dikz = (-r23y * cross234x) + (r23x * cross234y);
+
+      flt_t dt2djlx = (-r23y * cross321z) + (r23z * cross321y);
+      flt_t dt2djly = (-r23z * cross321x) + (r23x * cross321z);
+      flt_t dt2djlz = (-r23x * cross321y) + (r23y * cross321x);
+
+      flt_t dt2dijx = (r21z * cross234y) - (r34z * cross321y) -
+          (r21y * cross234z) + (r34y * cross321z);
+      flt_t dt2dijy = (r21x * cross234z) - (r34x * cross321z) -
+          (r21z * cross234x) + (r34z * cross321x);
+      flt_t dt2dijz = (r21y * cross234x) - (r34y * cross321x) -
+          (r21x * cross234y) + (r34x * cross321y);
+
+      flt_t aa = (prefactor * 2 * cw / cwnom) * w21 * w34 *
+          (1 - tspjik) * (1 - tspijl);
+      flt_t aaa1 = -prefactor * (1 - (om1234 * om1234)) *
+          (1 - tspjik) * (1 - tspijl);
+      flt_t aaa2 = -prefactor * (1 - (om1234 * om1234)) * w21 * w34;
+      flt_t at2 = aa * cwnum;
+
+      flt_t fcijpc = (-dt1dij * at2) +
+          (aaa2 * dtsjik * dctij * (1 - tspijl)) +
+          (aaa2 * dtsijl * dctji * (1 - tspjik));
+      flt_t fcikpc = (-dt1dik * at2) +
+          (aaa2 * dtsjik * dctik * (1 - tspijl));
+      flt_t fcjlpc = (-dt1djl
* at2) + + (aaa2 * dtsijl * dctjl * (1 - tspjik)); + flt_t fcjkpc = (-dt1djk * at2) + + (aaa2 * dtsjik * dctjk * (1 - tspijl)); + flt_t fcilpc = (-dt1dil * at2) + + (aaa2 * dtsijl * dctil * (1 - tspjik)); + + flt_t F23x = (fcijpc * r23x) + (aa * dt2dijx); + flt_t F23y = (fcijpc * r23y) + (aa * dt2dijy); + flt_t F23z = (fcijpc * r23z) + (aa * dt2dijz); + + flt_t F12x = (fcikpc * r21x) + (aa * dt2dikx); + flt_t F12y = (fcikpc * r21y) + (aa * dt2diky); + flt_t F12z = (fcikpc * r21z) + (aa * dt2dikz); + + flt_t F34x = (fcjlpc * r34x) + (aa * dt2djlx); + flt_t F34y = (fcjlpc * r34y) + (aa * dt2djly); + flt_t F34z = (fcjlpc * r34z) + (aa * dt2djlz); + + flt_t F31x = (fcjkpc * r31x); + flt_t F31y = (fcjkpc * r31y); + flt_t F31z = (fcjkpc * r31z); + + flt_t F24x = (fcilpc * r24x); + flt_t F24y = (fcilpc * r24y); + flt_t F24z = (fcilpc * r24z); + + flt_t f1x = -F12x - F31x; + flt_t f1y = -F12y - F31y; + flt_t f1z = -F12z - F31z; + flt_t f2x = F12x + F31x; + flt_t f2y = F12y + F31y; + flt_t f2z = F12z + F31z; + flt_t f3x = F34x + F24x; + flt_t f3y = F34y + F24y; + flt_t f3z = F34z + F24z; + flt_t f4x = -F34x - F24x; + flt_t f4y = -F34y - F24y; + flt_t f4z = -F34z - F24z; + + fij[0] += F23x + F24x - F31x; + fij[1] += F23y + F24y - F31y; + fij[2] += F23z + F24z - F31z; + + // coordination forces + + flt_t tmp20 = VA * ((1 - (om1234 * om1234))) * + (1 - tspjik) * (1 - tspijl) * dw21 * w34 / r21mag; + f2x -= tmp20 * r21x; + f2y -= tmp20 * r21y; + f2z -= tmp20 * r21z; + f1x += tmp20 * r21x; + f1y += tmp20 * r21y; + f1z += tmp20 * r21z; + + flt_t tmp21 = VA * ((1 - (om1234 * om1234))) * + (1 - tspjik) * (1 - tspijl) * w21 * dw34 / r34mag; + f3x -= tmp21 * r34x; + f3y -= tmp21 * r34y; + f3z -= tmp21 * r34z; + f4x += tmp21 * r34x; + f4y += tmp21 * r34y; + f4z += tmp21 * r34z; + + result_f[a1].x += f1x; + result_f[a1].y += f1y; + result_f[a1].z += f1z; + result_f[a2].x += f2x; + result_f[a2].y += f2y; + result_f[a2].z += f2z; + result_f[a3].x += f3x; + result_f[a3].y += f3y; + result_f[a3].z += f3z; + result_f[a4].x += f4x; + result_f[a4].y += f4y; + result_f[a4].z += f4z; + } + } + return sum_omega; +} + +/* + * Implements a scalar implementation the force update due to splines. + * It is used for both pi^rc_ij and T_ij. + * Occurs four times in each bondorder and bondorderLJ. 
+ */ +template +inline void frebo_N_spline_force(KernelArgsAIREBOT * ka, int i, + int j, flt_t VA, flt_t dN, flt_t dNconj, flt_t Nconj) { + int * map = ka->map; + AtomAIREBOT * x = ka->x; + ResultForceT * result_f = ka->result_f; + flt_t * nC = ka->nC; + flt_t * nH = ka->nH; + flt_t Nmin = ka->params.Nmin; + flt_t Nmax = ka->params.Nmax; + int itype = map[x[i].w]; + int * neighs = ka->neigh_rebo.entries + ka->neigh_rebo.offset[i]; + int knum = ka->neigh_rebo.num[i]; + int kk; + for (kk = 0; kk < knum; kk++) { + int k = neighs[kk]; + if (k == j) continue; + flt_t rikx = x[i].x - x[k].x; + flt_t riky = x[i].y - x[k].y; + flt_t rikz = x[i].z - x[k].z; + flt_t rikmag = overloaded::sqrt(rikx * rikx + riky * riky + rikz * rikz); + int ktype = map[x[k].w]; + flt_t rcminik = ka->params.rcmin[itype][ktype]; + flt_t rcmaxik = ka->params.rcmax[itype][ktype]; + flt_t dwik; + flt_t wik = Sp(rikmag, rcminik, rcmaxik, &dwik); + flt_t Nki = nC[k] + nH[k] - wik; + flt_t dNki; + flt_t SpN = Sp(Nki, Nmin, Nmax, &dNki); + flt_t fdN = VA * dN * dwik / rikmag; + flt_t fdNconj = VA * dNconj * 2 * Nconj * dwik * SpN / rikmag; + flt_t ffactor = fdN; + if (ktype == 0) ffactor += fdNconj; + flt_t fkx = ffactor * rikx; + flt_t fky = ffactor * riky; + flt_t fkz = ffactor * rikz; + result_f[i].x -= fkx; + result_f[i].y -= fky; + result_f[i].z -= fkz; + result_f[k].x += fkx; + result_f[k].y += fky; + result_f[k].z += fkz; + if (ktype != 0 || fabs(dNki) <= TOL) continue; + int * neighs_k = ka->neigh_rebo.entries + ka->neigh_rebo.offset[k]; + int nnum = ka->neigh_rebo.num[k]; + int nn; + for (nn = 0; nn < nnum; nn++) { + int n = neighs_k[nn]; + if (n == i) continue; + flt_t rknx = x[k].x - x[n].x; + flt_t rkny = x[k].y - x[n].y; + flt_t rknz = x[k].z - x[n].z; + flt_t rknmag = overloaded::sqrt(rknx * rknx + rkny * rkny + rknz * rknz); + int ntype = map[x[n].w]; + flt_t rcminkn = ka->params.rcmin[ktype][ntype]; + flt_t rcmaxkn = ka->params.rcmax[ktype][ntype]; + flt_t dwkn; + Sp(rknmag, rcminkn, rcmaxkn, &dwkn); + flt_t ffactor = VA * dNconj * 2 * Nconj * wik * dNki * dwkn / rknmag; + result_f[k].x -= ffactor * rknx; + result_f[k].y -= ffactor * rkny; + result_f[k].z -= ffactor * rknz; + result_f[n].x += ffactor * rknx; + result_f[n].y += ffactor * rkny; + result_f[n].z += ffactor * rknz; + } + } +} + +/* + * This data-structure contains the result of a search through neighbor-lists. + * It is used to calculate C_ij and the corresponding force updates. + */ +template +struct LennardJonesPathAIREBOT { + AtomAIREBOT del[3]; + int num; + flt_t w[3]; + flt_t dw[3]; + flt_t r[3]; + int idx[4]; +}; + +/* + * Checks a candidate path stored in idxs whether it is better than *path + * and updates *path accordingly. 
+ */ +template +inline flt_t ref_lennard_jones_test_path_single( + KernelArgsAIREBOT * ka, flt_t best, int num, int * idxs, + LennardJonesPathAIREBOT * path) { + LennardJonesPathAIREBOT result; + AtomAIREBOT * x = ka->x; + int * map = ka->map; + result.num = num; + flt_t combined = 1; + for (int i = num - 2; i >= 0; i--) { + int a0 = idxs[i+0]; + int a1 = idxs[i+1]; + flt_t delx = x[a1].x - x[a0].x; + flt_t dely = x[a1].y - x[a0].y; + flt_t delz = x[a1].z - x[a0].z; + flt_t rsq = delx * delx + dely * dely + delz * delz; + int type0 = map[x[a0].w]; + int type1 = map[x[a1].w]; + if (rsq >= ka->params.rcmaxsq[type0][type1]) return best; + flt_t r = overloaded::sqrt(rsq); + flt_t dw, w = Sp(r, ka->params.rcmin[type0][type1], + ka->params.rcmax[type0][type1], &dw); + if (w == 0) return best; + combined *= w; + if (combined <= best) return best; + result.idx[i] = a0; + result.del[i].x = delx; + result.del[i].y = dely; + result.del[i].z = delz; + result.r[i] = r; + result.w[i] = w; + result.dw[i] = dw; + } + result.idx[num - 1] = idxs[num - 1]; + *path = result; + return combined; +} + +/* + * Test through all paths surrounding i and j to find the corresponding + * best path. Uses the same iteration ordering as FLJ() does. + * Note that an optimization would use the j neighlist instead in the inner + * loop. + */ +template +inline flt_t ref_lennard_jones_test_path(KernelArgsAIREBOT * ka, + int i, int j, flt_t rij, flt_t rcmax, + LennardJonesPathAIREBOT * path) { + int idxs[4]; + idxs[0] = i; + idxs[1] = j; + flt_t best = 0; + if (rij <= rcmax) { + best = ref_lennard_jones_test_path_single(ka, best, 2, idxs, path); + if (best == static_cast(1.0)) return 0; + } + for (int kk = 0; kk < ka->neigh_rebo.num[i]; kk++) { + int k = ka->neigh_rebo.entries[ka->neigh_rebo.offset[i] + kk]; + if (k == j) continue; + idxs[1] = k; + idxs[2] = j; + best = ref_lennard_jones_test_path_single(ka, best, 3, idxs, path); + if (best == static_cast(1.0)) return 0; + for (int mm = 0; mm < ka->neigh_rebo.num[k]; mm++) { + int m = ka->neigh_rebo.entries[ka->neigh_rebo.offset[k] + mm]; + if (m == i || m == j) continue; + idxs[2] = m; + idxs[3] = j; + best = ref_lennard_jones_test_path_single(ka, best, 4, idxs, path); + if (best == static_cast(1.0)) return 0; + } + } + return 1 - best; +} + +/* + * Conducts the force update due to C_ij, given the active path. + */ +template +inline void ref_lennard_jones_force_path(KernelArgsAIREBOT * ka, + flt_t dC, LennardJonesPathAIREBOT * path) { + AtomAIREBOT * x = ka->x; + ResultForceT * result_f = ka->result_f; + for (int i = 0; i < path->num - 1; i++) { + flt_t fpair = dC * path->dw[i] / path->r[i]; + for (int j = 0; j < path->num - 1; j++) { + if (i != j) fpair *= path->w[j]; + } + result_f[path->idx[i+0]].x -= fpair * path->del[i].x; + result_f[path->idx[i+0]].y -= fpair * path->del[i].y; + result_f[path->idx[i+0]].z -= fpair * path->del[i].z; + result_f[path->idx[i+1]].x += fpair * path->del[i].x; + result_f[path->idx[i+1]].y += fpair * path->del[i].y; + result_f[path->idx[i+1]].z += fpair * path->del[i].z; + } +} + +/* + * Calculate the bondorderLJ term. 
+ */ +template +inline flt_t ref_lennard_jones_bondorder(KernelArgsAIREBOT * ka, + int i, int j, flt_t VA, acc_t fij[3]) { + AtomAIREBOT * x = ka->x; + int * map = ka->map; + ResultForceT * result_f = ka->result_f; + + int itype = map[x[i].w]; + int jtype = map[x[j].w]; + + flt_t delx = x[i].x - x[j].x; + flt_t dely = x[i].y - x[j].y; + flt_t delz = x[i].z - x[j].z; + flt_t rsq = delx * delx + dely * dely + delz * delz; + flt_t rij = overloaded::sqrt(rsq); + + flt_t rcminij = ka->params.rcmin[itype][jtype]; + flt_t rcmaxij = ka->params.rcmax[itype][jtype]; + flt_t dwij; + flt_t wij = Sp(rij, rcminij, rcmaxij, &dwij); + + flt_t the_r = ka->params.rcmin[itype][jtype]; + flt_t scale = the_r / rij; + flt_t Nij = ka->nH[i] + ka->nC[i] - wij; + flt_t Nji = ka->nH[j] + ka->nC[j] - wij; + flt_t NconjtmpI; + acc_t fijc[3] = {0}, fjic[3] = {0}; + flt_t pij = frebo_pij(ka, i, j, delx * scale, dely * scale, + delz * scale, the_r, wij, 0.0, &NconjtmpI, fijc); + flt_t NconjtmpJ; + flt_t pji = frebo_pij(ka, j, i, -delx * scale, -dely * scale, + -delz * scale, the_r, wij, 0.0, &NconjtmpJ, fjic); + flt_t Nijconj = 1.0 + (NconjtmpI * NconjtmpI) + (NconjtmpJ * NconjtmpJ); + flt_t dN3_pi_rc[3]; + flt_t pi_rc = frebo_pi_rc(ka, itype, jtype, Nij, Nji, Nijconj, + dN3_pi_rc); + flt_t dN3_Tij[3]; + flt_t Tij = frebo_Tij(ka, itype, jtype, Nij, Nji, Nijconj, + dN3_Tij); + flt_t sum_omega = 0; + if (fabs(Tij) > TOL) { + sum_omega = frebo_sum_omega(ka, i, j, delx * scale, dely * + scale, delz * scale, the_r, 0.0, + fijc); + } + flt_t pi_dh = Tij * sum_omega; + flt_t bij = 0.5 * (pij + pji) + pi_rc + pi_dh; + flt_t dStb; + flt_t Stb = Sp2(bij, ka->params.bLJmin[itype][jtype], + ka->params.bLJmax[itype][jtype], &dStb); + if (dStb != 0) { + flt_t pij_reverse = frebo_pij(ka, i, j, delx * scale, + dely * scale, delz * scale, the_r, wij, VA * dStb, &NconjtmpI, fijc); + flt_t pji_reverse = frebo_pij(ka, j, i, -delx * scale, + -dely * scale, -delz * scale, the_r, wij, VA * dStb, &NconjtmpJ, fjic); + fijc[0] -= fjic[0]; + fijc[1] -= fjic[1]; + fijc[2] -= fjic[2]; + frebo_N_spline_force(ka, i, j, VA * dStb, dN3_pi_rc[0], + dN3_pi_rc[2], NconjtmpI); + frebo_N_spline_force(ka, j, i, VA * dStb, dN3_pi_rc[1], + dN3_pi_rc[2], NconjtmpJ); + if (fabs(Tij) > TOL) { + flt_t sum_omega_reverse = frebo_sum_omega(ka, i, j, + delx * scale, dely * scale, delz * scale, the_r, VA * dStb * Tij, fijc); + frebo_N_spline_force(ka, i, j, VA * dStb * sum_omega, dN3_Tij[0], + dN3_Tij[2], NconjtmpI); + frebo_N_spline_force(ka, j, i, VA * dStb * sum_omega, dN3_Tij[1], + dN3_Tij[2], NconjtmpJ); + } + assert(fij[0] == 0); + assert(fij[1] == 0); + assert(fij[2] == 0); + fij[0] = scale * (fijc[0] - (delx * delx * fijc[0] + dely * delx * + fijc[1] + delz * delx * fijc[2]) / rsq); + fij[1] = scale * (fijc[1] - (delx * dely * fijc[0] + dely * dely * + fijc[1] + delz * dely * fijc[2]) / rsq); + fij[2] = scale * (fijc[2] - (delx * delz * fijc[0] + dely * delz * + fijc[1] + delz * delz * fijc[2]) / rsq); + } + return Stb; +} + +/* + * Scalar reference implementation of neighbor routine. 
+ */ +template +void ref_rebo_neigh(KernelArgsAIREBOT * ka) { + int offset = ka->neigh_from_atom * ka->num_neighs_per_atom; + for (int i = ka->neigh_from_atom; i < ka->neigh_to_atom; i++) { + ka->neigh_rebo.offset[i] = offset; + int itype = ka->map[ka->x[i].w]; + int n = 0; + ka->nC[i] = 0; + ka->nH[i] = 0; + for (int j = 0; j < ka->neigh_lmp.num[i]; j++) { + int ji = ka->neigh_lmp.entries[ka->neigh_lmp.offset[i] + j]; + flt_t delx = ka->x[i].x - ka->x[ji].x; + flt_t dely = ka->x[i].y - ka->x[ji].y; + flt_t delz = ka->x[i].z - ka->x[ji].z; + flt_t rsq = delx * delx + dely * dely + delz * delz; + int jtype = ka->map[ka->x[ji].w]; + if (rsq < ka->params.rcmaxsq[itype][jtype]) { + ka->neigh_rebo.entries[offset + n++] = ji; + flt_t rcmin = ka->params.rcmin[itype][jtype]; + flt_t rcmax = ka->params.rcmax[itype][jtype]; + if (jtype == CARBON) + ka->nC[i] += Sp(overloaded::sqrt(rsq), rcmin, rcmax, NULL); + else + ka->nH[i] += Sp(overloaded::sqrt(rsq), rcmin, rcmax, NULL); + } + } + ka->neigh_rebo.num[i] = n; + offset += n; + } +} + +template +void ref_torsion_single_interaction(KernelArgsAIREBOT * ka, int i, + int j) { + AtomAIREBOT * x = ka->x; + int * map = ka->map; + ResultForceT * f = ka->result_f; + flt_t (*rcmin)[2] = ka->params.rcmin; + flt_t (*rcmax)[2] = ka->params.rcmax; + flt_t (*epsilonT)[2] = ka->params.epsilonT; + flt_t thmin = ka->params.thmin; + flt_t thmax = ka->params.thmax; + int itype = map[x[i].w]; + flt_t xtmp = x[i].x; + flt_t ytmp = x[i].y; + flt_t ztmp = x[i].z; + int * REBO_neighs_i = &ka->neigh_rebo.entries[ka->neigh_rebo.offset[i]]; + int jnum = ka->neigh_rebo.num[i]; + int jtype = map[x[j].w]; + + flt_t del32x = x[j].x-x[i].x; + flt_t del32y = x[j].y-x[i].y; + flt_t del32z = x[j].z-x[i].z; + flt_t rsq = del32x*del32x + del32y*del32y + del32z*del32z; + flt_t r32 = overloaded::sqrt(rsq); + flt_t del23x = -del32x; + flt_t del23y = -del32y; + flt_t del23z = -del32z; + flt_t r23 = r32; + flt_t dw23, w23 = Sp(r23,rcmin[itype][jtype],rcmax[itype][jtype], + &dw23); + + assert(itype == 0); + assert(jtype == 0); + + for (int kk = 0; kk < jnum; kk++) { + int k = REBO_neighs_i[kk]; + int ktype = map[x[k].w]; + if (k == j) continue; + flt_t del21x = x[i].x-x[k].x; + flt_t del21y = x[i].y-x[k].y; + flt_t del21z = x[i].z-x[k].z; + flt_t rsq = del21x*del21x + del21y*del21y + del21z*del21z; + flt_t r21 = overloaded::sqrt(rsq); + flt_t cos321 = - ((del21x*del32x) + (del21y*del32y) + + (del21z*del32z)) / (r21*r32); + cos321 = fmin(cos321,1); + cos321 = fmax(cos321,-1); + flt_t sin321 = overloaded::sqrt(1 - cos321*cos321); + if (sin321 < TOL) continue; + + flt_t deljkx = del21x-del23x; + flt_t deljky = del21y-del23y; + flt_t deljkz = del21z-del23z; + flt_t rjk2 = deljkx*deljkx + deljky*deljky + deljkz*deljkz; + flt_t rjk = overloaded::sqrt(rjk2); + flt_t rik2 = r21*r21; + flt_t dw21, w21 = Sp(r21,rcmin[itype][ktype],rcmax[itype][ktype], + &dw21); + + flt_t rij = r32; + flt_t rik = r21; + flt_t rij2 = r32*r32; + flt_t costmp = static_cast(0.5)*(rij2+rik2-rjk2)/rij/rik; + flt_t dtsjik, tspjik = Sp2(costmp,thmin,thmax,&dtsjik); + dtsjik = -dtsjik; + + int * REBO_neighs_j = &ka->neigh_rebo.entries[ka->neigh_rebo.offset[j]]; + int lnum = ka->neigh_rebo.num[j]; + for (int ll = 0; ll < lnum; ll++) { + int l = REBO_neighs_j[ll]; + int ltype = map[x[l].w]; + if (l == i || l == k) continue; + flt_t del34x = x[j].x-x[l].x; + flt_t del34y = x[j].y-x[l].y; + flt_t del34z = x[j].z-x[l].z; + flt_t rsq = del34x*del34x + del34y*del34y + del34z*del34z; + flt_t r34 = overloaded::sqrt(rsq); + flt_t cos234 = 
(del32x*del34x + del32y*del34y + + del32z*del34z) / (r32*r34); + cos234 = fmin(cos234,1); + cos234 = fmax(cos234,-1); + flt_t sin234 = overloaded::sqrt(1 - cos234*cos234); + if (sin234 < TOL) continue; + flt_t dw34, w34 = Sp(r34,rcmin[jtype][ltype],rcmax[jtype][ltype], + &dw34); + flt_t delilx = del23x + del34x; + flt_t delily = del23y + del34y; + flt_t delilz = del23z + del34z; + flt_t ril2 = delilx*delilx + delily*delily + delilz*delilz; + flt_t ril = overloaded::sqrt(ril2); + flt_t rjl2 = r34*r34; + + flt_t rjl = r34; + flt_t costmp = static_cast(0.5)*(rij2+rjl2-ril2)/rij/rjl; + flt_t dtsijl, tspijl = Sp2(costmp,thmin,thmax,&dtsijl); + dtsijl = -dtsijl; //need minus sign + flt_t cross321x = (del32y*del21z)-(del32z*del21y); + flt_t cross321y = (del32z*del21x)-(del32x*del21z); + flt_t cross321z = (del32x*del21y)-(del32y*del21x); + flt_t cross321mag = overloaded::sqrt(cross321x*cross321x+ + cross321y*cross321y + cross321z*cross321z); + flt_t cross234x = (del23y*del34z)-(del23z*del34y); + flt_t cross234y = (del23z*del34x)-(del23x*del34z); + flt_t cross234z = (del23x*del34y)-(del23y*del34x); + flt_t cross234mag = overloaded::sqrt(cross234x*cross234x+ + cross234y*cross234y + cross234z*cross234z); + flt_t cwnum = (cross321x*cross234x) + + (cross321y*cross234y)+(cross321z*cross234z); + flt_t cwnom = r21*r34*r32*r32*sin321*sin234; + flt_t cw = cwnum/cwnom; + + flt_t cw2 = (static_cast(.5)*(1-cw)); + flt_t ekijl = epsilonT[ktype][ltype]; + flt_t Ec = 256*ekijl/405; + flt_t Vtors = (Ec*(overloaded::pow(cw2,5)))-(ekijl/10); + + ka->result_eng += Vtors*w21*w23*w34*(1-tspjik)*(1-tspijl); + + flt_t dndijx = (cross234y*del21z)-(cross234z*del21y); + flt_t dndijy = (cross234z*del21x)-(cross234x*del21z); + flt_t dndijz = (cross234x*del21y)-(cross234y*del21x); + + flt_t tmpvecx = (del34y*cross321z)-(del34z*cross321y); + flt_t tmpvecy = (del34z*cross321x)-(del34x*cross321z); + flt_t tmpvecz = (del34x*cross321y)-(del34y*cross321x); + + dndijx = dndijx+tmpvecx; + dndijy = dndijy+tmpvecy; + dndijz = dndijz+tmpvecz; + + flt_t dndikx = (del23y*cross234z)-(del23z*cross234y); + flt_t dndiky = (del23z*cross234x)-(del23x*cross234z); + flt_t dndikz = (del23x*cross234y)-(del23y*cross234x); + + flt_t dndjlx = (cross321y*del23z)-(cross321z*del23y); + flt_t dndjly = (cross321z*del23x)-(cross321x*del23z); + flt_t dndjlz = (cross321x*del23y)-(cross321y*del23x); + + flt_t dcidij = ((r23*r23)-(r21*r21)+(rjk*rjk))/(2*r23*r23*r21); + flt_t dcidik = ((r21*r21)-(r23*r23)+(rjk*rjk))/(2*r23*r21*r21); + flt_t dcidjk = (-rjk)/(r23*r21); + flt_t dcjdji = ((r23*r23)-(r34*r34)+(ril*ril))/(2*r23*r23*r34); + flt_t dcjdjl = ((r34*r34)-(r23*r23)+(ril*ril))/(2*r23*r34*r34); + flt_t dcjdil = (-ril)/(r23*r34); + + flt_t dsidij = (-cos321/sin321)*dcidij; + flt_t dsidik = (-cos321/sin321)*dcidik; + flt_t dsidjk = (-cos321/sin321)*dcidjk; + + flt_t dsjdji = (-cos234/sin234)*dcjdji; + flt_t dsjdjl = (-cos234/sin234)*dcjdjl; + flt_t dsjdil = (-cos234/sin234)*dcjdil; + + flt_t dxidij = (r21*sin321)+(r23*r21*dsidij); + flt_t dxidik = (r23*sin321)+(r23*r21*dsidik); + flt_t dxidjk = (r23*r21*dsidjk); + + flt_t dxjdji = (r34*sin234)+(r23*r34*dsjdji); + flt_t dxjdjl = (r23*sin234)+(r23*r34*dsjdjl); + flt_t dxjdil = (r23*r34*dsjdil); + + flt_t ddndij = (dxidij*cross234mag)+(cross321mag*dxjdji); + flt_t ddndik = dxidik*cross234mag; + flt_t ddndjk = dxidjk*cross234mag; + flt_t ddndjl = cross321mag*dxjdjl; + flt_t ddndil = cross321mag*dxjdil; + flt_t dcwddn = -cwnum/(cwnom*cwnom); + flt_t dcwdn = 1/cwnom; + flt_t dvpdcw = 
(-1)*Ec*static_cast(-0.5)*5*overloaded::pow(cw2,4)* + w23*w21*w34*(1-tspjik)*(1-tspijl); + + flt_t Ftmpx = dvpdcw*((dcwdn*dndijx)+(dcwddn*ddndij*del23x/r23)); + flt_t Ftmpy = dvpdcw*((dcwdn*dndijy)+(dcwddn*ddndij*del23y/r23)); + flt_t Ftmpz = dvpdcw*((dcwdn*dndijz)+(dcwddn*ddndij*del23z/r23)); + flt_t fix = Ftmpx; + flt_t fiy = Ftmpy; + flt_t fiz = Ftmpz; + flt_t fjx = -Ftmpx; + flt_t fjy = -Ftmpy; + flt_t fjz = -Ftmpz; + + Ftmpx = dvpdcw*((dcwdn*dndikx)+(dcwddn*ddndik*del21x/r21)); + Ftmpy = dvpdcw*((dcwdn*dndiky)+(dcwddn*ddndik*del21y/r21)); + Ftmpz = dvpdcw*((dcwdn*dndikz)+(dcwddn*ddndik*del21z/r21)); + fix += Ftmpx; + fiy += Ftmpy; + fiz += Ftmpz; + flt_t fkx = -Ftmpx; + flt_t fky = -Ftmpy; + flt_t fkz = -Ftmpz; + + Ftmpx = (dvpdcw*dcwddn*ddndjk*deljkx)/rjk; + Ftmpy = (dvpdcw*dcwddn*ddndjk*deljky)/rjk; + Ftmpz = (dvpdcw*dcwddn*ddndjk*deljkz)/rjk; + fjx += Ftmpx; + fjy += Ftmpy; + fjz += Ftmpz; + fkx -= Ftmpx; + fky -= Ftmpy; + fkz -= Ftmpz; + + Ftmpx = dvpdcw*((dcwdn*dndjlx)+(dcwddn*ddndjl*del34x/r34)); + Ftmpy = dvpdcw*((dcwdn*dndjly)+(dcwddn*ddndjl*del34y/r34)); + Ftmpz = dvpdcw*((dcwdn*dndjlz)+(dcwddn*ddndjl*del34z/r34)); + fjx += Ftmpx; + fjy += Ftmpy; + fjz += Ftmpz; + flt_t flx = -Ftmpx; + flt_t fly = -Ftmpy; + flt_t flz = -Ftmpz; + + Ftmpx = (dvpdcw*dcwddn*ddndil*delilx)/ril; + Ftmpy = (dvpdcw*dcwddn*ddndil*delily)/ril; + Ftmpz = (dvpdcw*dcwddn*ddndil*delilz)/ril; + fix += Ftmpx; + fiy += Ftmpy; + fiz += Ftmpz; + flx -= Ftmpx; + fly -= Ftmpy; + flz -= Ftmpz; + + // coordination forces + + flt_t fpair = Vtors*dw21*w23*w34*(1-tspjik)*(1-tspijl) / r21; + fix -= del21x*fpair; + fiy -= del21y*fpair; + fiz -= del21z*fpair; + fkx += del21x*fpair; + fky += del21y*fpair; + fkz += del21z*fpair; + + fpair = Vtors*w21*dw23*w34*(1-tspjik)*(1-tspijl) / r23; + fix -= del23x*fpair; + fiy -= del23y*fpair; + fiz -= del23z*fpair; + fjx += del23x*fpair; + fjy += del23y*fpair; + fjz += del23z*fpair; + + fpair = Vtors*w21*w23*dw34*(1-tspjik)*(1-tspijl) / r34; + fjx -= del34x*fpair; + fjy -= del34y*fpair; + fjz -= del34z*fpair; + flx += del34x*fpair; + fly += del34y*fpair; + flz += del34z*fpair; + + // additional cut off function forces + + flt_t fcpc = -Vtors*w21*w23*w34*dtsjik*(1-tspijl); + fpair = fcpc*dcidij/rij; + fix += fpair*del23x; + fiy += fpair*del23y; + fiz += fpair*del23z; + fjx -= fpair*del23x; + fjy -= fpair*del23y; + fjz -= fpair*del23z; + + fpair = fcpc*dcidik/rik; + fix += fpair*del21x; + fiy += fpair*del21y; + fiz += fpair*del21z; + fkx -= fpair*del21x; + fky -= fpair*del21y; + fkz -= fpair*del21z; + + fpair = fcpc*dcidjk/rjk; + fjx += fpair*deljkx; + fjy += fpair*deljky; + fjz += fpair*deljkz; + fkx -= fpair*deljkx; + fky -= fpair*deljky; + fkz -= fpair*deljkz; + + fcpc = -Vtors*w21*w23*w34*(1-tspjik)*dtsijl; + fpair = fcpc*dcjdji/rij; + fix += fpair*del23x; + fiy += fpair*del23y; + fiz += fpair*del23z; + fjx -= fpair*del23x; + fjy -= fpair*del23y; + fjz -= fpair*del23z; + + fpair = fcpc*dcjdjl/rjl; + fjx += fpair*del34x; + fjy += fpair*del34y; + fjz += fpair*del34z; + flx -= fpair*del34x; + fly -= fpair*del34y; + flz -= fpair*del34z; + + fpair = fcpc*dcjdil/ril; + fix += fpair*delilx; + fiy += fpair*delily; + fiz += fpair*delilz; + flx -= fpair*delilx; + fly -= fpair*delily; + flz -= fpair*delilz; + + // sum per-atom forces into atom force array + + f[i].x += fix; f[i].y += fiy; f[i].z += fiz; + f[j].x += fjx; f[j].y += fjy; f[j].z += fjz; + f[k].x += fkx; f[k].y += fky; f[k].z += fkz; + f[l].x += flx; f[l].y += fly; f[l].z += flz; + } + } +} + +template +void 
ref_torsion(KernelArgsAIREBOT * ka) { + AtomAIREBOT * x = ka->x; + int * map = ka->map; + int * tag = ka->tag; + for (int ii = ka->frebo_from_atom; ii < ka->frebo_to_atom; ii++) { + int i = ii; + int itag = tag[i]; + int itype = map[x[i].w]; + if (itype != 0) continue; + flt_t xtmp = x[i].x; + flt_t ytmp = x[i].y; + flt_t ztmp = x[i].z; + int * REBO_neighs_i = &ka->neigh_rebo.entries[ka->neigh_rebo.offset[i]]; + int jnum = ka->neigh_rebo.num[i]; + for (int jj = 0; jj < jnum; jj++) { + int j = REBO_neighs_i[jj]; + int jtag = tag[j]; + + if (itag > jtag) { + if (((itag+jtag) & 1) == 0) continue; + } else if (itag < jtag) { + if (((itag+jtag) & 1) == 1) continue; + } else { + if (x[j].z < ztmp) continue; + if (x[j].z == ztmp && x[j].y < ytmp) continue; + if (x[j].z == ztmp && x[j].y == ytmp && x[j].x < xtmp) continue; + } + + int jtype = map[x[j].w]; + if (jtype != 0) continue; + ref_torsion_single_interaction(ka, i, j); + } + } +} + +/* + * Calculate single REBO interaction. + * Corresponds to FREBO method. Note that the bondorder() function is + * inlined. + */ +template +void ref_frebo_single_interaction(KernelArgsAIREBOT * ka, int i, + int j) { + AtomAIREBOT * x = ka->x; + int * map = ka->map; + ResultForceT * result_f = ka->result_f; + int jj; + int itype = map[x[i].w]; + flt_t x_i = x[i].x; + flt_t y_i = x[i].y; + flt_t z_i = x[i].z; + int jtype = map[x[j].w]; + flt_t delx = x[i].x - x[j].x; + flt_t dely = x[i].y - x[j].y; + flt_t delz = x[i].z - x[j].z; + flt_t rsq = delx * delx + dely * dely + delz * delz; + flt_t rij = overloaded::sqrt(rsq); + flt_t rcminij = ka->params.rcmin[itype][jtype]; + flt_t rcmaxij = ka->params.rcmax[itype][jtype]; + flt_t dwij; + flt_t wij = Sp(rij, rcminij, rcmaxij, &dwij); + if (wij <= TOL) return; + + flt_t Qij = ka->params.Q[itype][jtype]; + flt_t Aij = ka->params.A[itype][jtype]; + flt_t alphaij = ka->params.alpha[itype][jtype]; + + flt_t exp_alphar = exp(-alphaij * rij); + flt_t VR_by_wij = (1.0 + (Qij / rij)) * Aij * exp_alphar; + flt_t VR = wij * VR_by_wij; + flt_t pre = wij * Aij * exp_alphar; + flt_t dVRdi = pre * ((-alphaij) - (Qij / rsq) - (Qij * alphaij / rij)); + dVRdi += VR_by_wij * dwij; + + flt_t VA_by_wij = 0, dVA = 0; + for (int k = 0; k < 3; k++) { + flt_t BIJc = ka->params.BIJc[itype][jtype][k]; + flt_t Betaij = ka->params.Beta[itype][jtype][k]; + flt_t term = -BIJc * overloaded::exp(-Betaij * rij); + VA_by_wij += term; + dVA += -Betaij * wij * term; + } + dVA += VA_by_wij * dwij; + flt_t VA = VA_by_wij * wij; + + acc_t fij[3] = {0}; + flt_t Nij = ka->nH[i] + ka->nC[i] - wij; + flt_t Nji = ka->nH[j] + ka->nC[j] - wij; + flt_t NconjtmpI; + flt_t pij = frebo_pij(ka, i, j, delx, dely, delz, rij, wij, VA, &NconjtmpI, + fij); + flt_t NconjtmpJ; + acc_t fji[3] = {0}; + flt_t pji = frebo_pij(ka, j, i, -delx, -dely, -delz, rij, wij, VA, + &NconjtmpJ, fji); + fij[0] -= fji[0]; fij[1] -= fji[1]; fij[2] -= fji[2]; + flt_t Nijconj = 1.0 + (NconjtmpI * NconjtmpI) + (NconjtmpJ * NconjtmpJ); + flt_t dN3[3]; + flt_t pi_rc = frebo_pi_rc(ka, itype, jtype, Nij, Nji, Nijconj, dN3); + frebo_N_spline_force(ka, i, j, VA, dN3[0], dN3[2], NconjtmpI); + frebo_N_spline_force(ka, j, i, VA, dN3[1], dN3[2], NconjtmpJ); + flt_t Tij = frebo_Tij(ka, itype, jtype, Nij, Nji, Nijconj, dN3); + flt_t sum_omega = 0.0; + if (fabs(Tij) > TOL) { + sum_omega = frebo_sum_omega(ka, i, j, delx, dely, delz, rij, VA * Tij, fij); + frebo_N_spline_force(ka, i, j, VA * sum_omega, dN3[0], dN3[2], NconjtmpI); + frebo_N_spline_force(ka, j, i, VA * sum_omega, dN3[1], dN3[2], NconjtmpJ); + } 
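+
+  // Added commentary (editor's note, not in the original patch): at this
+  // point the REBO bond order is assembled as
+  //   bij = 0.5 * (pij + pji) + pi_rc + Tij * sum_omega
+  // and enters the pair energy in the Brenner form E = VR + bij * VA,
+  // which is why only the attractive contribution dVA is scaled by bij
+  // below, while fij carries the angular and dihedral derivatives.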
+ flt_t pi_dh = Tij * sum_omega; + flt_t bij = static_cast(0.5) * (pij + pji) + pi_rc + pi_dh; + flt_t dVAdi = bij * dVA; + flt_t fpair = -(dVRdi + dVAdi) / rij; + + result_f[i].x += fpair * delx + fij[0]; + result_f[i].y += fpair * dely + fij[1]; + result_f[i].z += fpair * delz + fij[2]; + result_f[j].x -= fpair * delx + fij[0]; + result_f[j].y -= fpair * dely + fij[1]; + result_f[j].z -= fpair * delz + fij[2]; + + flt_t evdwl = VR + bij * VA; + ka->result_eng += evdwl; + result_f[i].w += 0.5 * evdwl; + result_f[j].w += 0.5 * evdwl; +} + + +template +inline void ref_frebo_single_atom(KernelArgsAIREBOT * ka, int i) { + AtomAIREBOT * x = ka->x; + int * tag = ka->tag; + int jj; + int itag = tag[i]; + flt_t x_i = x[i].x; + flt_t y_i = x[i].y; + flt_t z_i = x[i].z; + int * neighs = ka->neigh_rebo.entries + ka->neigh_rebo.offset[i]; + int jnum = ka->neigh_rebo.num[i]; + for (jj = 0; jj < jnum; jj++) { + int j = neighs[jj]; + int jtag = tag[j]; + if (itag > jtag) { + if (((itag + jtag) & 1) == 0) + continue; + } else if (itag < jtag) { + if (((itag + jtag) & 1) == 1) + continue; + } else { + if (x[j].z < z_i) + continue; + if (x[j].z == z_i && x[j].y < y_i) + continue; + if (x[j].z == z_i && x[j].y == y_i && x[j].x < x_i) + continue; + } + ref_frebo_single_interaction(ka, i, j); + } +} + + +template +void ref_frebo(KernelArgsAIREBOT * ka, int torflag) { + for (int i = ka->frebo_from_atom; i < ka->frebo_to_atom; i++) { + ref_frebo_single_atom(ka, i); + } + if (torflag) ref_torsion(ka); +} + +template +void ref_lennard_jones_single_interaction(KernelArgsAIREBOT * ka, + int i, int j, int morseflag) { + AtomAIREBOT * x = ka->x; + int * map = ka->map; + ResultForceT * result_f = ka->result_f; + + int itype = map[x[i].w]; + int jtype = map[x[j].w]; + + flt_t delx = x[i].x - x[j].x; + flt_t dely = x[i].y - x[j].y; + flt_t delz = x[i].z - x[j].z; + flt_t rsq = delx * delx + dely * dely + delz * delz; + + if (rsq >= ka->params.cutljsq[itype][jtype]) { return; } + flt_t rij = overloaded::sqrt(rsq); + + LennardJonesPathAIREBOT testpath; + flt_t cij = 1.0; + if (rij < ka->params.cut3rebo) { + #pragma noinline + cij = ref_lennard_jones_test_path(ka, i, j, rij, + ka->params.rcmax[itype][jtype], &testpath); + } + if (cij == 0) { + return; + } + + flt_t sigcut = ka->params.sigcut; + flt_t sigmin = ka->params.sigmin; + flt_t sigma = ka->params.sigma[itype][jtype]; + flt_t rljmax = sigcut * sigma; + flt_t rljmin = sigmin * sigma; + + flt_t dslw, slw = Sp2(rij, rljmin, rljmax, &dslw); + + flt_t vdw, dvdw; + if (morseflag) { + const flt_t exr = exp(-rij * ka->params.lj4[itype][jtype]); + vdw = ka->params.lj1[itype][jtype] * exr * + (ka->params.lj2[itype][jtype]*exr - 2); + dvdw = ka->params.lj3[itype][jtype] * exr * + (1 - ka->params.lj2[itype][jtype]*exr); + } else { + flt_t r2inv = 1 / rsq; + flt_t r6inv = r2inv * r2inv * r2inv; + + vdw = r6inv * (ka->params.lj3[itype][jtype]*r6inv - + ka->params.lj4[itype][jtype]); + dvdw = -r6inv * (ka->params.lj1[itype][jtype]*r6inv - + ka->params.lj2[itype][jtype]) / rij; + } + + flt_t VLJ = vdw * slw; + flt_t dVLJ = dvdw * slw + vdw * dslw; + + flt_t dStr, Str = Sp2(rij, ka->params.rcLJmin[itype][jtype], + ka->params.rcLJmax[itype][jtype], &dStr); + flt_t VA = Str * cij * VLJ; + flt_t Stb = 0; + acc_t fij[3] = {0}; + if (Str > 0) { + #pragma noinline + Stb = ref_lennard_jones_bondorder(ka, i, j, VA, fij); + } + flt_t fpair = -(dStr * (Stb * cij * VLJ - cij * VLJ) + + dVLJ * (Str * Stb * cij + cij - Str * cij)) / rij; + flt_t evdwl = VA * Stb + (1 - Str) * cij * VLJ; + 
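+
+  // Added commentary (editor's note, not in the original patch): with
+  // VA = Str * cij * VLJ, the energy computed above is the AIREBO
+  // switching form
+  //   E_LJ = Str * Stb * cij * VLJ + (1 - Str) * cij * VLJ,
+  // so the bond-order factor Stb acts only inside the Str switching
+  // window; fpair is -dE_LJ/dr at fixed Stb and cij, whose own
+  // derivatives are applied through fij and the path force update.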
result_f[i].x += fpair * delx + fij[0]; + result_f[i].y += fpair * dely + fij[1]; + result_f[i].z += fpair * delz + fij[2]; + result_f[j].x -= fpair * delx + fij[0]; + result_f[j].y -= fpair * dely + fij[1]; + result_f[j].z -= fpair * delz + fij[2]; + ka->result_eng += evdwl; + + if (cij < 1) { + #pragma noinline + ref_lennard_jones_force_path(ka, Str * Stb * VLJ + (1 - Str) * VLJ, + &testpath); + } +} + +template +void ref_lennard_jones_single_atom(KernelArgsAIREBOT * ka, int i, + int morseflag) { + AtomAIREBOT * x = ka->x; + int * tag = ka->tag; + int jj; + int itag = tag[i]; + int * neighs = ka->neigh_lmp.entries + ka->neigh_lmp.offset[i]; + int jnum = ka->neigh_lmp.num_half[i]; + for (jj = 0; jj < jnum; jj++) { + int j = neighs[jj]; + ref_lennard_jones_single_interaction(ka, i, j, morseflag); + } +} + +template +void ref_lennard_jones(KernelArgsAIREBOT * ka, int morseflag) { + for (int i = ka->frebo_from_atom; i < ka->frebo_to_atom; i++) { + #pragma noinline + ref_lennard_jones_single_atom(ka, i, morseflag); + } +} + +/* ---------------------------------------------------------------------- + Vectorized AIREBO implementation, standalone, using caching to reduce + memory access. + ---------------------------------------------------------------------- */ + +template +struct aut_wrap { + +typedef typename intr_types::fvec fvec; +typedef typename intr_types::avec avec; +typedef typename intr_types::ivec ivec; +typedef typename intr_types::bvec bvec; + +VEC_INLINE inline +static void aut_loadatoms_vec( + AtomAIREBOT * atoms, ivec j_vec, + fvec *x, fvec * y, fvec * z, bvec * type_mask, int * map, ivec map_i, + ivec c_1 +) { + const ivec c_4 = ivec::set1(4); + ivec j_vec_4 = ivec::mullo(c_4, j_vec); + fvec w; + fvec::gather_4_adjacent(j_vec_4, &atoms[0].x, sizeof(flt_t), x, y, z, &w); + ivec jtype = fvec::unpackloepi32(w); + jtype = ivec::srlv(map_i, jtype); //_mm512_castpd_si512(w)); + jtype = ivec::the_and(c_1, jtype); + bvec jtype_mask = ivec::cmpneq(jtype, ivec::setzero()); + *type_mask = jtype_mask; +} + +VEC_INLINE inline +static void aut_loadatoms_vec_notype( + AtomAIREBOT * atoms, ivec j_vec, + fvec *x, fvec * y, fvec * z +) { + const ivec c_4 = ivec::set1(4); + ivec j_vec_4 = ivec::mullo(c_4, j_vec); + fvec::gather_3_adjacent(j_vec_4, &atoms[0].x, sizeof(flt_t), x, y, z); +} + +static fvec aut_Sp2_deriv(fvec r, fvec lo, fvec hi, fvec * d) { + fvec c_1 = fvec::set1(1); + fvec c_2 = fvec::set1(2); + fvec c_3 = fvec::set1(3); + fvec c_6 = fvec::set1(6); + bvec m_lo = fvec::cmple(r, lo); + bvec m_hi = fvec::cmpnlt(r, hi); // nlt == ge + bvec m_tr = bvec::kandn(m_lo, ~ m_hi); + fvec ret = c_1; + ret = fvec::mask_blend(m_hi, ret, fvec::setzero()); + fvec der = fvec::setzero(); + if (bvec::test_any_set(m_tr)) { + fvec diff = hi - lo; + fvec rcp = fvec::recip(diff); + fvec t = (r - lo) * rcp; + fvec v = c_1 - t * t * ( c_3 - c_2 * t); + ret = fvec::mask_blend(m_tr, ret, v); + fvec dv = c_6 * rcp * ( t * t - t); + der = fvec::mask_blend(m_tr, der, dv); + } + *d = der; + return ret; +} + +static fvec aut_Sp_deriv(fvec r, fvec lo, fvec hi, fvec * d) { + fvec c_1 = fvec::set1(1); + fvec c_0_5 = fvec::set1(0.5); + fvec c_m0_5 = fvec::set1(-0.5); + fvec c_PI = fvec::set1(M_PI); + bvec m_lo = fvec::cmple(r, lo); + bvec m_hi = fvec::cmpnlt(r, hi); // nlt == ge + bvec m_tr = bvec::kandn(m_lo, ~ m_hi); + fvec ret = c_1; + ret = fvec::mask_blend(m_hi, ret, fvec::setzero()); + fvec der = fvec::setzero(); + if (bvec::test_any_set(m_tr)) { + fvec diff = hi - lo; + fvec rcp = fvec::mask_recip(c_1, m_tr, 
diff); + fvec t = (r - lo) / diff; + fvec sinval, cosval; + sinval = fvec::mask_sincos(&cosval, fvec::setzero(), c_1, m_tr, c_PI * t); + fvec v = c_0_5 * ( c_1 + cosval); + ret = fvec::mask_blend(m_tr, ret, v); + fvec dv = c_PI * c_m0_5 * rcp * sinval; + der = fvec::mask_blend(m_tr, der, dv); + } + *d = der; + return ret; +} + +static fvec aut_mask_Sp(bvec mask, fvec r, fvec lo, fvec hi) { + fvec c_1 = fvec::set1(1); + fvec c_0_5 = fvec::set1(0.5); + fvec c_PI = fvec::set1(M_PI); + bvec m_lo = fvec::mask_cmple(mask, r, lo); + bvec m_hi = fvec::mask_cmpnlt(mask, r, hi); // nlt == ge + bvec m_tr = bvec::kandn(m_lo, bvec::kandn(m_hi, mask)); + fvec ret = c_1; + ret = fvec::mask_blend(m_hi, ret, fvec::setzero()); + if (bvec::test_any_set(m_tr)) { + fvec rcp = fvec::mask_recip(c_1, m_tr, hi - lo); + fvec t = (r - lo) * rcp; + fvec v = c_0_5 * ( c_1 + fvec::mask_cos(c_1, m_tr, c_PI * t)); + ret = fvec::mask_blend(m_tr, ret, v); + } + return ret; +} + +static void aut_rebo_neigh(KernelArgsAIREBOT * ka) { + int offset = ka->neigh_from_atom * ka->num_neighs_per_atom; + ivec c_CARBON = ivec::setzero(); + int map_i = 0; + int i; + for (i = 1; i < ka->num_types; i++) { + if (ka->map[i]) + map_i |= (1 << i); + } + ivec c_i1 = ivec::set1(1); + ivec c_im = ivec::set1(map_i); + AtomAIREBOT * _noalias x = ka->x; + + for (i = ka->neigh_from_atom; i < ka->neigh_to_atom; i++) { + + fvec x_i = fvec::set1(x[i].x); + fvec y_i = fvec::set1(x[i].y); + fvec z_i = fvec::set1(x[i].z); + int itype = ka->map[ka->x[i].w]; + + fvec rcmaxsq0 = fvec::set1(ka->params.rcmaxsq[itype][0]); + fvec rcmaxsq1 = fvec::set1(ka->params.rcmaxsq[itype][1]); + fvec rcmax0 = fvec::set1(ka->params.rcmax[itype][0]); + fvec rcmax1 = fvec::set1(ka->params.rcmax[itype][1]); + fvec rcmin0 = fvec::set1(ka->params.rcmin[itype][0]); + fvec rcmin1 = fvec::set1(ka->params.rcmin[itype][1]); + fvec rcmaxskinsq0 = fvec::set1( + (ka->params.rcmax[itype][0] + ka->skin) * (ka->params.rcmax[itype][0] + + ka->skin)); + fvec rcmaxskinsq1 = fvec::set1( + (ka->params.rcmax[itype][1] + ka->skin) * (ka->params.rcmax[itype][1] + + ka->skin)); + fvec nC = fvec::setzero(); + fvec nH = fvec::setzero(); + + ka->neigh_rebo.offset[i] = offset; + + int jnum = ka->rebuild_flag ? ka->neigh_lmp.num[i] : + ka->neigh_rebo.num_half[i]; + int * neighs = ka->rebuild_flag ? 
+ &ka->neigh_lmp.entries[ka->neigh_lmp.offset[i]] : + &ka->neigh_rebo.entries[ka->neigh_rebo.offset[i]+jnum]; + int * skin_target = &ka->neigh_rebo.entries[offset+ka->num_neighs_per_atom]; + int n = 0; + int n_skin = 0; + + int lowest_idx; + #pragma unroll(4) + for (lowest_idx = 0; lowest_idx < jnum; lowest_idx += fvec::VL) { + bvec j_mask = bvec::full(); + if (lowest_idx + fvec::VL > jnum) j_mask = bvec::only(jnum - lowest_idx); + + int * _noalias neighs_l = neighs + lowest_idx; + fvec x_j, y_j, z_j; + bvec jtype_mask; + ivec ji = ivec::maskz_loadu(j_mask, neighs_l); + aut_loadatoms_vec(x, ji, + &x_j, &y_j, &z_j, &jtype_mask, ka->map, c_im, c_i1); + fvec delx = x_i - x_j; + fvec dely = y_i - y_j; + fvec delz = z_i - z_j; + fvec rsq = delx * delx + dely * dely + delz * delz; + if (ka->rebuild_flag) { + fvec rcmaxskinsq = fvec::mask_blend(jtype_mask, rcmaxskinsq0, + rcmaxskinsq1); + bvec c_mask = fvec::mask_cmplt(j_mask, rsq, rcmaxskinsq); + ivec::mask_compressstore(c_mask, &skin_target[n_skin], ji); + n_skin += bvec::popcnt(c_mask); + } + fvec rcmaxsq = fvec::mask_blend(jtype_mask, rcmaxsq0, rcmaxsq1); + bvec c_mask = fvec::mask_cmplt(j_mask, rsq, rcmaxsq); + if (bvec::test_all_unset(c_mask)) continue; + ivec::mask_compressstore(c_mask, &ka->neigh_rebo.entries[offset + n], ji); + n += bvec::popcnt(c_mask); + fvec rcmax = fvec::mask_blend(jtype_mask, rcmax0, rcmax1); + fvec rcmin = fvec::mask_blend(jtype_mask, rcmin0, rcmin1); + fvec sp = aut_mask_Sp(c_mask, fvec::sqrt(rsq), rcmin, rcmax); + nC = fvec::mask_add(nC, bvec::kandn(jtype_mask, c_mask), nC, sp); + nH = fvec::mask_add(nH, bvec::kand (jtype_mask, c_mask), nH, sp); + } + ka->neigh_rebo.num[i] = n; + if (ka->rebuild_flag) { + for (int i = 0; i < n_skin; i++) { + ka->neigh_rebo.entries[offset+n_skin+i] = skin_target[i]; + } + } + if (ka->rebuild_flag) { + assert(n <= n_skin); + offset += 2 * n_skin; + ka->neigh_rebo.num_half[i] = n_skin; + } else { + assert(n <= jnum); + offset += 2 * jnum; + } + ka->nC[i] = fvec::reduce_add(nC); + ka->nH[i] = fvec::reduce_add(nH); + } +} + + +static fvec aut_eval_poly_lin_pd_2(int n, flt_t * vals, ivec idx, fvec x, + fvec * deriv) { + fvec c_1 = fvec::set1(1); + fvec x_i = c_1; + fvec x_im1 = fvec::setzero(); + fvec result = fvec::setzero(); + fvec i_v = fvec::setzero(); + *deriv = fvec::setzero(); + int i; + for (i = 0; i < n; i++) { + fvec coeff = fvec::gather(idx, vals + i, sizeof(flt_t)); + result = result + coeff * x_i; + *deriv = *deriv + coeff * x_im1 * i_v; + x_im1 = x_i; + x_i = x_i * x; + i_v = i_v + c_1; + } + return result; +} + +static fvec aut_mask_gSpline_pd_2(KernelArgsAIREBOT * ka, + bvec active_mask, int itype, fvec cosjik, + fvec Nij, fvec *dgdc, fvec *dgdN) { + int i; + flt_t * gDom = NULL; + int nDom = 0; + ivec offs = ivec::setzero(); + fvec NCmin = fvec::set1(ka->params.NCmin); + bvec Ngt = fvec::cmpnle(Nij, NCmin); //gt + if (itype == 0) { + nDom = 4; + gDom = &ka->params.gCdom[0]; + offs = ivec::mask_blend(Ngt, offs, ivec::set1(4*6)); + } else { + nDom = 3; + gDom = &ka->params.gHdom[0]; + offs = ivec::set1(8 * 6); + } + cosjik = fvec::max(fvec::set1(gDom[0]), fvec::min(fvec::set1(gDom[nDom]), + cosjik)); + ivec index6 = ivec::setzero(); + for (i = 0; i < nDom; i++) { + bvec cosge = fvec::cmpnlt(cosjik, fvec::set1(gDom[i])); //ge + bvec cosle = fvec::cmple(cosjik, fvec::set1(gDom[i+1])); + index6 = ivec::mask_blend(cosge & cosle, index6, ivec::set1(6*i)); + } + fvec g = aut_eval_poly_lin_pd_2(6, &ka->params.gVal[0], offs + index6, + cosjik, dgdc); + *dgdN = fvec::setzero(); + 
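+  /* For reference, a scalar sketch (illustrative only, assuming double
+   * precision) of what aut_eval_poly_lin_pd_2 above computes per SIMD
+   * lane: the value and the first derivative of sum_i c[i] * x^i in a
+   * single pass over the coefficients.
+   *
+   *   double poly_and_deriv(int n, const double *c, double x, double *d) {
+   *     double xi = 1.0, xim1 = 0.0, val = 0.0, der = 0.0;
+   *     for (int i = 0; i < n; i++) {
+   *       val += c[i] * xi;         // c[i] * x^i
+   *       der += c[i] * xim1 * i;   // i * c[i] * x^(i-1)
+   *       xim1 = xi;
+   *       xi *= x;
+   *     }
+   *     *d = der;
+   *     return val;
+   *   }
+   */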
if (itype == 0) { + fvec NCmax = fvec::set1(ka->params.NCmax); + bvec Nlt = fvec::cmplt(Nij, NCmax); //gt + bvec Nmask = Ngt & Nlt; + if (bvec::test_any_set(Nmask)) { + fvec dg1; + fvec g1 = aut_eval_poly_lin_pd_2(6, &ka->params.gVal[0], index6, cosjik, + &dg1); + fvec dS; + fvec cut = aut_Sp_deriv(Nij, NCmin, NCmax, &dS); + *dgdN = fvec::mask_mul(*dgdN, Nmask, dS, g1 - g); + g = fvec::mask_add(g, Nmask, g, cut * ( g1 - g)); + *dgdc = fvec::mask_add(*dgdc, Nmask, *dgdc, cut * ( dg1 - *dgdc)); + } + } + return g; +} + +static fvec aut_PijSpline(KernelArgsAIREBOT * ka, int itype, + int jtype, fvec NijC, fvec NijH, fvec *dN2) { + flt_t ret[fvec::VL] __attribute__((aligned(64))); + flt_t dN20[fvec::VL] __attribute__((aligned(64))); + flt_t dN21[fvec::VL] __attribute__((aligned(64))); + flt_t NijC_[fvec::VL] __attribute__((aligned(64))); + flt_t NijH_[fvec::VL] __attribute__((aligned(64))); + flt_t tmp_dN2[2]; + fvec::store(NijC_, NijC); + fvec::store(NijH_, NijH); + int i; + for (i = 0; i < fvec::VL; i++) { + ret[i] = PijSpline(ka, itype, jtype, NijC_[i], NijH_[i], tmp_dN2); + dN20[i] = tmp_dN2[0]; + dN21[i] = tmp_dN2[1]; + } + dN2[0] = fvec::load(dN20); + dN2[1] = fvec::load(dN21); + return fvec::load(ret); +} + +/* + * aut_frebo_data stores all the short-ranged coordinations + * and intermediate values that get reused frequently during + * bondorder calculations. + * BUF_CAP should rarely exceed 4, so 8 is a very conservative + * value. + */ +static const int BUF_CAP = 8; +struct aut_frebo_data { + fvec rikx_buf[BUF_CAP]; + fvec riky_buf[BUF_CAP]; + fvec rikz_buf[BUF_CAP]; + fvec rikmag_buf[BUF_CAP]; + fvec cosjik_buf[BUF_CAP]; + ivec k_buf[BUF_CAP]; + fvec g_buf[BUF_CAP]; + fvec dgdc_buf[BUF_CAP]; + fvec ex_lam_buf[BUF_CAP]; + fvec wik_buf[BUF_CAP]; + fvec dwik_buf[BUF_CAP]; + fvec cutN_buf[BUF_CAP]; + fvec dcutN_buf[BUF_CAP]; + bvec ktype_buf[BUF_CAP]; + bvec mask_buf[BUF_CAP]; + fvec force_k_x_buf[BUF_CAP]; + fvec force_k_y_buf[BUF_CAP]; + fvec force_k_z_buf[BUF_CAP]; + int buf_len; + fvec x_i; + fvec y_i; + fvec z_i; + fvec x_j; + fvec y_j; + fvec z_j; + fvec nCi; + fvec nHi; + fvec force_i_x; + fvec force_i_y; + fvec force_i_z; + fvec force_j_x; + fvec force_j_y; + fvec force_j_z; +}; + +/* + * Initialize values in aut_frebo_data and perform the calculations + * for p_ij. 
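+ *
+ * In REBO terms, the per-lane quantity computed here is
+ *
+ *   p_ij = [ 1 + sum_k w_ik * g(cos theta_jik) * exp(lambda_jik)
+ *              + P_ij(N_ij^C, N_ij^H) ]^(-1/2)
+ *
+ * which corresponds to the fvec::invsqrt(c_1 + sum_pij + PijS)
+ * expression in the body below.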
+ */ +static fvec aut_frebo_pij_pd_2( + KernelArgsAIREBOT * _noalias ka, + struct aut_frebo_data * _noalias data, + int itype, int jtype, + ivec vi, ivec vj, + fvec rijx, fvec rijy, fvec rijz, fvec rijmag, + fvec wij, fvec VA, fvec * sum_N, fvec fij[3] +) { + AtomAIREBOT * _noalias x = ka->x; + int * _noalias map = ka->map; + flt_t * _noalias nC = ka->nC; + flt_t * _noalias nH = ka->nH; + fvec x_i, y_i, z_i; + fvec x_j, y_j, z_j; + x_i = data->x_i; + y_i = data->y_i; + z_i = data->z_i; + x_j = data->x_j; + y_j = data->y_j; + z_j = data->z_j; + fvec invrijm = fvec::recip(rijmag); + fvec invrijm2 = invrijm * invrijm; + fvec rcminij = fvec::set1(ka->params.rcmin[itype][jtype]); + fvec rcmaxij = fvec::set1(ka->params.rcmax[itype][jtype]); + fvec Nmin = fvec::set1(ka->params.Nmin); + fvec Nmax = fvec::set1(ka->params.Nmax); + int map_i_scalar = 0; + { + int i; + for (i = 1; i < ka->num_types; i++) { + if (ka->map[i]) + map_i_scalar |= (1 << i); + } + } + ivec map_i = ivec::set1(map_i_scalar); + fvec nCi = data->nCi; + fvec nHi = data->nHi; + fvec Nij = nHi + nCi - wij; + fvec factor_jtype, factor_not_jtype; + if (jtype) { + factor_jtype = fvec::set1(1); + factor_not_jtype = fvec::set1(0); + } else { + factor_jtype = fvec::set1(0); + factor_not_jtype = fvec::set1(1); + } + fvec NijC = nCi - wij * factor_not_jtype; + fvec NijH = nHi - wij * factor_jtype; + fvec sum_pij = fvec::setzero(); + fvec sum_dpij_dN = fvec::setzero(); + fvec dN2[2]; + ivec offseti = ivec::mask_gather(ivec::setzero(), bvec::full(), vi, + ka->neigh_rebo.offset, sizeof(int)); + int buf_len = 0; + ivec knum = ivec::mask_gather(ivec::setzero(), bvec::full(), vi, + ka->neigh_rebo.num, sizeof(int)); + ivec kk = ivec::setzero(); + bvec active_mask = ivec::cmplt(kk, knum); + ivec c_i1 = ivec::set1(1); + fvec rho_j = fvec::set1(ka->params.rho[jtype][1]); + fvec rho_k0 = fvec::set1(ka->params.rho[0][1]); + fvec rho_k1 = fvec::set1(ka->params.rho[1][1]); + fvec c_4 = fvec::set1(4); + fvec c_2_0 = fvec::set1(2.0); + fvec c_m2_0 = fvec::set1(-2.0); + fvec c_4_0 = fvec::set1(4.0); + fvec c_0_5 = fvec::set1(0.5); + fvec c_m0_5 = fvec::set1(-0.5); + fvec c_1 = fvec::set1(1); + fvec c_m1 = fvec::set1(-1); + fvec factor_itype = itype ? 
c_1 : fvec::setzero(); + fvec rcmax0 = fvec::set1(ka->params.rcmax[itype][0]); + fvec rcmax1 = fvec::set1(ka->params.rcmax[itype][1]); + fvec rcmin0 = fvec::set1(ka->params.rcmin[itype][0]); + fvec rcmin1 = fvec::set1(ka->params.rcmin[itype][1]); + fvec result_f_i_x = fvec::setzero(); + fvec result_f_i_y = fvec::setzero(); + fvec result_f_i_z = fvec::setzero(); + fvec result_f_j_x = fvec::setzero(); + fvec result_f_j_y = fvec::setzero(); + fvec result_f_j_z = fvec::setzero(); + *sum_N = fvec::setzero(); + { + while (bvec::test_any_set(active_mask)) { + ivec k = ivec::mask_gather(ivec::setzero(), active_mask, kk + offseti, + ka->neigh_rebo.entries, sizeof(int)); + bvec excluded_mask = ivec::cmpeq(k, vj) & active_mask; + if (bvec::test_any_set(excluded_mask)) { + kk = ivec::mask_add(kk, excluded_mask, kk, c_i1); + active_mask = ivec::cmplt(kk, knum); + continue; + } + fvec x_k, y_k, z_k; + bvec ktype_mask; + aut_loadatoms_vec(x, k, &x_k, &y_k, &z_k, &ktype_mask, ka->map, map_i, + c_i1); + fvec rikx = x_i - x_k; + fvec riky = y_i - y_k; + fvec rikz = z_i - z_k; + fvec rikmag = fvec::sqrt(rikx * rikx + riky * riky + rikz * rikz); + fvec rho_k = fvec::mask_blend(ktype_mask, rho_k0, rho_k1); + fvec lamdajik = c_4 * factor_itype * ( rho_k - rikmag - ( rho_j - + rijmag)); + fvec ex_lam = fvec::exp(lamdajik); + fvec rcmax = fvec::mask_blend(ktype_mask, rcmax0, rcmax1); + fvec rcmin = fvec::mask_blend(ktype_mask, rcmin0, rcmin1); + fvec dwik; + fvec wik = aut_Sp_deriv(rikmag, rcmin, rcmax, &dwik); + fvec Nki = fvec::gather(k, nC, sizeof(flt_t)) + + fvec::gather(k, nH, sizeof(flt_t)) - wik; + fvec cosjik = (rijx * rikx + rijy * riky + rijz * rikz) / + ( rijmag * rikmag); + cosjik = fvec::min(c_1, fvec::max(c_m1, cosjik)); + fvec dgdc, dgdN; + fvec g = aut_mask_gSpline_pd_2(ka, active_mask, itype, cosjik, Nij, + &dgdc, &dgdN); + sum_pij = fvec::mask_add(sum_pij, active_mask, sum_pij, wik * g * ex_lam); + sum_dpij_dN = fvec::mask_add(sum_dpij_dN, active_mask, sum_dpij_dN, + wik * ex_lam * dgdN); + fvec dcutN; + fvec cutN = aut_Sp_deriv(Nki, Nmin, Nmax, &dcutN); + *sum_N = fvec::mask_add(*sum_N, active_mask, *sum_N, + fvec::mask_blend(ktype_mask, c_1, + fvec::setzero()) * wik * cutN); + if (buf_len == BUF_CAP) goto exceed_buffer; + data->rikx_buf[buf_len] = rikx; + data->riky_buf[buf_len] = riky; + data->rikz_buf[buf_len] = rikz; + data->rikmag_buf[buf_len] = rikmag; + data->cosjik_buf[buf_len] = cosjik; + data->ktype_buf[buf_len] = ktype_mask; + data->k_buf[buf_len] = k; + data->g_buf[buf_len] = g; + data->dgdc_buf[buf_len] = dgdc; + data->ex_lam_buf[buf_len] = ex_lam; + data->wik_buf[buf_len] = wik; + data->dwik_buf[buf_len] = dwik; + data->mask_buf[buf_len] = active_mask; + data->cutN_buf[buf_len] = cutN; + data->dcutN_buf[buf_len] = dcutN; + buf_len += 1; + kk = ivec::mask_add(kk, active_mask, kk, c_i1); + active_mask = ivec::cmplt(kk, knum); + } + data->buf_len = buf_len; + fvec PijS = aut_PijSpline(ka, itype, jtype, NijC, NijH, &dN2[0]); + fvec pij = fvec::invsqrt(c_1 + sum_pij + PijS); + fvec tmp = c_m0_5 * pij * pij * pij; + int buf_idx; + for (buf_idx = 0; buf_idx < buf_len; buf_idx++) { + fvec rikx = data->rikx_buf[buf_idx]; + fvec riky = data->riky_buf[buf_idx]; + fvec rikz = data->rikz_buf[buf_idx]; + fvec rikmag = data->rikmag_buf[buf_idx]; + fvec cosjik = data->cosjik_buf[buf_idx]; + bvec ktype_mask = data->ktype_buf[buf_idx]; + ivec k = data->k_buf[buf_idx]; + fvec g = data->g_buf[buf_idx]; + fvec dgdc = data->dgdc_buf[buf_idx]; + fvec ex_lam = data->ex_lam_buf[buf_idx]; + fvec wik = 
data->wik_buf[buf_idx]; + fvec dwik = data->dwik_buf[buf_idx]; + bvec mask = data->mask_buf[buf_idx]; + fvec invrikm = fvec::recip(rikmag); + fvec rjkx = rikx - rijx; + fvec rjky = riky - rijy; + fvec rjkz = rikz - rijz; + fvec rjkmag = fvec::sqrt( + rjkx * rjkx + rjky * rjky + rjkz * rjkz); + fvec rijrik = c_2_0 * rijmag * rikmag; + fvec rr = rijmag * rijmag - rikmag * rikmag; + fvec dctdjk = c_m2_0 / rijrik; + fvec dctdik = (rjkmag * rjkmag - rr) / ( rijrik * rikmag * rikmag); + fvec dctdij = (rjkmag * rjkmag + rr) / ( rijrik * rijmag * rijmag); + fvec fi[3], fj[3], fk[3]; + fvec pref = c_0_5 * VA * tmp; + fvec tmp20 = pref * wik * dgdc * ex_lam; + fj[0] = fj[1] = fj[2] = fvec::setzero(); + fvec tmpdik = tmp20 * dctdik; + fi[0] = fvec::setzero() - tmpdik * rikx; + fi[1] = fvec::setzero() - tmpdik * riky; + fi[2] = fvec::setzero() - tmpdik * rikz; + fk[0] = tmpdik * rikx; + fk[1] = tmpdik * riky; + fk[2] = tmpdik * rikz; + + fvec tmpdij = tmp20 * dctdij; + fij[0] = fvec::mask_sub(fij[0], mask, fij[0], tmpdij * rijx); + fij[1] = fvec::mask_sub(fij[1], mask, fij[1], tmpdij * rijy); + fij[2] = fvec::mask_sub(fij[2], mask, fij[2], tmpdij * rijz); + + fvec tmpdjk = tmp20 * dctdjk; + fi[0] = fi[0] - tmpdjk * rjkx; + fi[1] = fi[1] - tmpdjk * rjky; + fi[2] = fi[2] - tmpdjk * rjkz; + fk[0] = fk[0] + tmpdjk * rjkx; + fk[1] = fk[1] + tmpdjk * rjky; + fk[2] = fk[2] + tmpdjk * rjkz; + fij[0] = fvec::mask_add(fij[0], mask, fij[0], tmpdjk * rjkx); + fij[1] = fvec::mask_add(fij[1], mask, fij[1], tmpdjk * rjky); + fij[2] = fvec::mask_add(fij[2], mask, fij[2], tmpdjk * rjkz); + + if (itype) { + fvec tmp21 = pref * wik * g * ex_lam * c_4_0; + fvec tmp21ij = tmp21 * invrijm; + fij[0] = fvec::mask_sub(fij[0], mask, fij[0], tmp21ij * rijx); + fij[1] = fvec::mask_sub(fij[1], mask, fij[1], tmp21ij * rijy); + fij[2] = fvec::mask_sub(fij[2], mask, fij[2], tmp21ij * rijz); + fvec tmp21ik = tmp21 * invrikm; + fi[0] = fi[0] + tmp21ik * rikx; + fi[1] = fi[1] + tmp21ik * riky; + fi[2] = fi[2] + tmp21ik * rikz; + fk[0] = fk[0] - tmp21ik * rikx; + fk[1] = fk[1] - tmp21ik * riky; + fk[2] = fk[2] - tmp21ik * rikz; + } + + // coordination forces + + // dwik forces + fvec tmp22 = pref * dwik * g * ex_lam * invrikm; + fi[0] = fi[0] - tmp22 * rikx; + fi[1] = fi[1] - tmp22 * riky; + fi[2] = fi[2] - tmp22 * rikz; + fk[0] = fk[0] + tmp22 * rikx; + fk[1] = fk[1] + tmp22 * riky; + fk[2] = fk[2] + tmp22 * rikz; + + // PIJ forces + fvec dN2ktype = fvec::mask_blend(ktype_mask, dN2[0], dN2[1]); + fvec tmp23 = pref * dN2ktype * dwik * invrikm; + fi[0] = fi[0] - tmp23 * rikx; + fi[1] = fi[1] - tmp23 * riky; + fi[2] = fi[2] - tmp23 * rikz; + fk[0] = fk[0] + tmp23 * rikx; + fk[1] = fk[1] + tmp23 * riky; + fk[2] = fk[2] + tmp23 * rikz; + + // dgdN forces + fvec tmp24 = pref * sum_dpij_dN * dwik * invrikm; + fi[0] = fi[0] - tmp24 * rikx; + fi[1] = fi[1] - tmp24 * riky; + fi[2] = fi[2] - tmp24 * rikz; + fk[0] = fk[0] + tmp24 * rikx; + fk[1] = fk[1] + tmp24 * riky; + fk[2] = fk[2] + tmp24 * rikz; + + result_f_i_x = fvec::mask_add(result_f_i_x, mask, result_f_i_x, fi[0]); + result_f_i_y = fvec::mask_add(result_f_i_y, mask, result_f_i_y, fi[1]); + result_f_i_z = fvec::mask_add(result_f_i_z, mask, result_f_i_z, fi[2]); + result_f_j_x = fvec::mask_add(result_f_j_x, mask, result_f_j_x, fj[0]); + result_f_j_y = fvec::mask_add(result_f_j_y, mask, result_f_j_y, fj[1]); + result_f_j_z = fvec::mask_add(result_f_j_z, mask, result_f_j_z, fj[2]); + + data->force_k_x_buf[buf_idx] = fk[0]; + data->force_k_y_buf[buf_idx] = fk[1]; + 
data->force_k_z_buf[buf_idx] = fk[2];
+    }
+    data->force_i_x = result_f_i_x;
+    data->force_i_y = result_f_i_y;
+    data->force_i_z = result_f_i_z;
+    data->force_j_x = result_f_j_x;
+    data->force_j_y = result_f_j_y;
+    data->force_j_z = result_f_j_z;
+    return pij;
+  }
+ exceed_buffer:
+  data->buf_len = -1;
+  return fvec::setzero();
+}
+
+/*
+ * Apply the force values stored in aut_frebo_data to
+ * the respective neighbors.
+ */
+static void aut_frebo_data_writeback(
+    KernelArgsAIREBOT * _noalias ka,
+    struct aut_frebo_data * _noalias data) {
+  ResultForceT * _noalias result_f = ka->result_f;
+  flt_t fk_x_buf[fvec::VL] __attribute__((aligned(64)));
+  flt_t fk_y_buf[fvec::VL] __attribute__((aligned(64)));
+  flt_t fk_z_buf[fvec::VL] __attribute__((aligned(64)));
+  int fk_k_buf[ivec::VL] __attribute__((aligned(64)));
+  int buf_idx;
+  for (buf_idx = 0; buf_idx < data->buf_len; buf_idx++) {
+    ivec k = data->k_buf[buf_idx];
+    bvec active_mask = data->mask_buf[buf_idx];
+
+    fvec::store(fk_x_buf, data->force_k_x_buf[buf_idx]);
+    fvec::store(fk_y_buf, data->force_k_y_buf[buf_idx]);
+    fvec::store(fk_z_buf, data->force_k_z_buf[buf_idx]);
+    ivec::store(fk_k_buf, k);
+
+    int lane;
+    for (lane = 0; lane < fvec::VL; lane++) {
+      if (! bvec::test_at(active_mask, lane)) continue;
+      int kk = fk_k_buf[lane];
+      result_f[kk].x += fk_x_buf[lane];
+      result_f[kk].y += fk_y_buf[lane];
+      result_f[kk].z += fk_z_buf[lane];
+    }
+  }
+}
+
+static void aut_frebo_N_spline_force(
+    KernelArgsAIREBOT * _noalias ka,
+    struct aut_frebo_data * _noalias data, int itype, int jtype, ivec vi,
+    ivec vj, fvec VA, fvec dN, fvec dNconj, fvec Nconj) {
+  ivec c_i1 = ivec::set1(1);
+  fvec c_2 = fvec::set1(2);
+  fvec c_TOL = fvec::set1(TOL);
+  ResultForceT * _noalias result_f = ka->result_f;
+  AtomAIREBOT * _noalias x = ka->x;
+  int * _noalias map = ka->map;
+  flt_t * _noalias nC = ka->nC;
+  flt_t * _noalias nH = ka->nH;
+  fvec x_i, y_i, z_i;
+  x_i = data->x_i;
+  y_i = data->y_i;
+  z_i = data->z_i;
+  fvec Nmin = fvec::set1(ka->params.Nmin);
+  fvec Nmax = fvec::set1(ka->params.Nmax);
+  int map_i_scalar = 0;
+  {
+    int i;
+    for (i = 1; i < ka->num_types; i++) {
+      if (ka->map[i])
+        map_i_scalar |= (1 << i);
+    }
+  }
+  ivec map_i = ivec::set1(map_i_scalar);
+  fvec dN2[2];
+  ivec kk = ivec::setzero();
+  fvec rcmax0 = fvec::set1(ka->params.rcmax[itype][0]);
+  fvec rcmax1 = fvec::set1(ka->params.rcmax[itype][1]);
+  fvec rcmin0 = fvec::set1(ka->params.rcmin[itype][0]);
+  fvec rcmin1 = fvec::set1(ka->params.rcmin[itype][1]);
+  fvec result_f_i_x = fvec::setzero();
+  fvec result_f_i_y = fvec::setzero();
+  fvec result_f_i_z = fvec::setzero();
+  int buf_idx;
+  for (buf_idx = 0; buf_idx < data->buf_len; buf_idx++) {
+    ivec k = data->k_buf[buf_idx];
+    bvec active_mask = data->mask_buf[buf_idx];
+    fvec rikx = data->rikx_buf[buf_idx];
+    fvec riky = data->riky_buf[buf_idx];
+    fvec rikz = data->rikz_buf[buf_idx];
+    fvec rikmag = data->rikmag_buf[buf_idx];
+    bvec ktype_mask = data->ktype_buf[buf_idx];
+
+    fvec dwik = data->dwik_buf[buf_idx];
+    fvec wik = data->wik_buf[buf_idx];
+
+    fvec dNki = data->dcutN_buf[buf_idx];
+    fvec SpN = data->cutN_buf[buf_idx];
+
+    fvec invrikmag = fvec::recip(rikmag);
+    fvec pref = VA * dwik * invrikmag;
+    fvec fdN = dN * pref;
+    fvec fdNconj = pref * SpN * c_2 * dNconj * Nconj;
+    fvec ffactor = fdN;
+    bvec ktype_is_C = ~ ktype_mask;
+    ffactor = fvec::mask_add(ffactor, ktype_is_C, ffactor, fdNconj);
+
+    fvec fkx = ffactor * rikx;
+    fvec fky = ffactor * riky;
+    fvec fkz = ffactor * rikz;
+
+    data->force_k_x_buf[buf_idx] =
data->force_k_x_buf[buf_idx] + fkx; + data->force_k_y_buf[buf_idx] = data->force_k_y_buf[buf_idx] + fky; + data->force_k_z_buf[buf_idx] = data->force_k_z_buf[buf_idx] + fkz; + + result_f_i_x = fvec::mask_sub(result_f_i_x, active_mask, result_f_i_x, fkx); + result_f_i_y = fvec::mask_sub(result_f_i_y, active_mask, result_f_i_y, fky); + result_f_i_z = fvec::mask_sub(result_f_i_z, active_mask, result_f_i_z, fkz); + + bvec need_k_neighs = fvec::mask_cmpnle(active_mask, fvec::abs(dNki), c_TOL) + & ktype_is_C; + if (bvec::test_any_set(need_k_neighs)) { + int lane; + for (lane = 0; lane < fvec::VL; lane++) { + if (! bvec::test_at(need_k_neighs, lane)) continue; + int kk = ivec::at(k, lane); + int k = kk; + int ktype = map[x[k].w]; + int i = ivec::at(vi, lane); + fvec oldVA = VA; + double VA = fvec::at(oldVA, lane); + fvec oldwik = wik; + double wik = fvec::at(oldwik, lane); + fvec olddNconj = dNconj; + double dNconj = fvec::at(olddNconj, lane); + fvec oldNconj = Nconj; + double Nconj = fvec::at(oldNconj, lane); + fvec olddNki = dNki; + double dNki = fvec::at(olddNki, lane); + int * neighs_k = ka->neigh_rebo.entries + ka->neigh_rebo.offset[k]; + int nnum = ka->neigh_rebo.num[k]; + int nn; + for (nn = 0; nn < nnum; nn++) { + int n = neighs_k[nn]; + if (n == i) continue; + double rknx = x[k].x - x[n].x; + double rkny = x[k].y - x[n].y; + double rknz = x[k].z - x[n].z; + double rknmag = sqrt(rknx * rknx + rkny * rkny + rknz * rknz); + int ntype = map[x[n].w]; + double rcminkn = ka->params.rcmin[ktype][ntype]; + double rcmaxkn = ka->params.rcmax[ktype][ntype]; + double dwkn; + Sp(rknmag, rcminkn, rcmaxkn, &dwkn); + double ffactor = VA * dNconj * 2 * Nconj * wik * dNki * dwkn / rknmag; + result_f[k].x -= ffactor * rknx; + result_f[k].y -= ffactor * rkny; + result_f[k].z -= ffactor * rknz; + result_f[n].x += ffactor * rknx; + result_f[n].y += ffactor * rkny; + result_f[n].z += ffactor * rknz; + } + } + } + } + data->force_i_x = data->force_i_x + result_f_i_x; + data->force_i_y = data->force_i_y + result_f_i_y; + data->force_i_z = data->force_i_z + result_f_i_z; +} + +static fvec aut_frebo_pi_rc_pd(KernelArgsAIREBOT * ka, int itype, + int jtype, fvec Nij, fvec Nji, fvec Nijconj, + fvec * dN3) { + flt_t ret[fvec::VL] __attribute__((aligned(64))); + flt_t dN3ret[3][fvec::VL] __attribute__((aligned(64))); + int i; + for (i = 0; i < fvec::VL; i++) { + flt_t dN3tmp[3]; + ret[i] = frebo_pi_rc(ka, itype, jtype, fvec::at(Nij, i), fvec::at(Nji, i), + fvec::at(Nijconj, i), &dN3tmp[0]); + dN3ret[0][i] = dN3tmp[0]; + dN3ret[1][i] = dN3tmp[1]; + dN3ret[2][i] = dN3tmp[2]; + } + dN3[0] = fvec::load(&dN3ret[0][0]); + dN3[1] = fvec::load(&dN3ret[1][0]); + dN3[2] = fvec::load(&dN3ret[2][0]); + return fvec::load(&ret[0]); +} + +static fvec aut_frebo_Tij(KernelArgsAIREBOT * ka, int itype, + int jtype, fvec Nij, fvec Nji, fvec Nijconj, + fvec * dN3) { + flt_t ret[fvec::VL] __attribute__((aligned(64))); + flt_t dN3ret[3][fvec::VL] __attribute__((aligned(64))); + int i; + for (i = 0; i < fvec::VL; i++) { + flt_t dN3tmp[3]; + ret[i] = frebo_Tij(ka, itype, jtype, fvec::at(Nij, i), fvec::at(Nji, i), + fvec::at(Nijconj, i), &dN3tmp[0]); + dN3ret[0][i] = dN3tmp[0]; + dN3ret[1][i] = dN3tmp[1]; + dN3ret[2][i] = dN3tmp[2]; + } + dN3[0] = fvec::load(&dN3ret[0][0]); + dN3[1] = fvec::load(&dN3ret[1][0]); + dN3[2] = fvec::load(&dN3ret[2][0]); + return fvec::load(&ret[0]); +} + +static fvec aut_frebo_sum_omega( + KernelArgsAIREBOT * _noalias ka, + struct aut_frebo_data * _noalias i_data, + struct aut_frebo_data * _noalias j_data, + int 
itype, int jtype, + ivec vi, ivec vj, + fvec r23x, fvec r23y, fvec r23z, fvec r23mag, + fvec VA, fvec fij[3] +) { + fvec c_1 = fvec::set1(1); + fvec c_m1 = fvec::set1(-1); + fvec c_2 = fvec::set1(2); + fvec c_m2 = fvec::set1(-2); + fvec sum_omega = fvec::setzero(); + fvec thmin = fvec::set1(ka->params.thmin); + fvec thmax = fvec::set1(ka->params.thmax); + // 2 == i, 3 == j + fvec r32x = fvec::setzero() - r23x; + fvec r32y = fvec::setzero() - r23y; + fvec r32z = fvec::setzero() - r23z; + int buf_idx_i, buf_idx_j; + for (buf_idx_i = 0; buf_idx_i < i_data->buf_len; buf_idx_i++) { + // a1 == k == buf_idx_i + bvec mask_start = i_data->mask_buf[buf_idx_i]; + fvec r21x = i_data->rikx_buf[buf_idx_i]; // a2 - a1 -> i - k + fvec r21y = i_data->riky_buf[buf_idx_i]; // a2 - a1 -> i - k + fvec r21z = i_data->rikz_buf[buf_idx_i]; // a2 - a1 -> i - k + fvec r21mag = i_data->rikmag_buf[buf_idx_i]; + // TODO use buffered cosjik + fvec cos321 = ( + r23x * r21x + r23y * r21y + r23z * r21z) / ( r23mag * r21mag); + cos321 = fvec::min(c_1, fvec::max(c_m1, cos321)); + fvec sin321 = fvec::sqrt(c_1 - cos321 * cos321); + bvec mask_outer = fvec::cmpneq(fvec::setzero(), sin321) & mask_start; + // add "continue" + fvec sink2i = fvec::mask_recip(fvec::undefined(), mask_outer, + sin321 * sin321); + fvec rik2i = fvec::mask_recip(fvec::undefined(), mask_outer, + r21mag * r21mag); + fvec rr = r23mag * r23mag - r21mag * r21mag; + fvec r31x = r21x - r23x; + fvec r31y = r21y - r23y; + fvec r31z = r21z - r23z; + fvec r31mag2 = r31x * r31x + r31y * r31y + r31z * r31z; + fvec rijrik = c_2 * r23mag * r21mag; + fvec r21mag2 = r21mag * r21mag; + fvec dctik = fvec::mask_div(fvec::undefined(), mask_outer, r31mag2 - rr, + rijrik * r21mag2); + fvec dctij = fvec::mask_div(fvec::undefined(), mask_outer, r31mag2 + rr, + rijrik * r23mag * r23mag); + fvec dctjk = fvec::mask_div(fvec::undefined(), mask_outer, c_m2, rijrik); + fvec dw21 = i_data->dwik_buf[buf_idx_i]; + fvec w21 = i_data->wik_buf[buf_idx_i]; + fvec dtsjik; + fvec tspjik = aut_Sp2_deriv(cos321, thmin, thmax, &dtsjik); + dtsjik = fvec::setzero() - dtsjik; // todo replace by appropriate xor. + ivec k = i_data->k_buf[buf_idx_i]; + for (buf_idx_j = 0; buf_idx_j < j_data->buf_len; buf_idx_j++) { + // check l == k in second loop. 
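+      // Atom numbering in this double loop: 1 == k (neighbor of i),
+      // 2 == i, 3 == j, 4 == l (neighbor of j), i.e. each (k, l) pair
+      // forms a dihedral k-i-j-l. Lanes with l == k are masked out via
+      // mask_inner_0 below rather than skipped with a scalar "continue".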
+ // l == a4 == buf_idx_j + ivec l = j_data->k_buf[buf_idx_j]; + bvec mask_inner_0 = ivec::mask_cmpneq(mask_outer, k, l) & + j_data->mask_buf[buf_idx_j]; + // add "continue" + fvec r34x = j_data->rikx_buf[buf_idx_j]; + fvec r34y = j_data->riky_buf[buf_idx_j]; + fvec r34z = j_data->rikz_buf[buf_idx_j]; + fvec r34mag = j_data->rikmag_buf[buf_idx_j]; + fvec cos234 = fvec::mask_div(fvec::undefined(), mask_inner_0, + r32x * r34x + r32y * r34y + r32z * r34z, + r23mag * r34mag); + cos234 = fvec::min(c_1, fvec::max(c_m1, cos234)); + fvec sin234 = fvec::mask_sqrt(fvec::undefined(), mask_inner_0, + c_1 - cos234 * cos234); + bvec mask_inner_1 = fvec::mask_cmpneq(mask_inner_0, sin234, + fvec::setzero()); + // add "continue" + fvec sinl2i = fvec::mask_recip(fvec::undefined(), mask_inner_1, + sin234 * sin234); + fvec rjl2i = fvec::mask_recip(fvec::undefined(), mask_inner_1, + r34mag * r34mag); + fvec dw34 = j_data->dwik_buf[buf_idx_j]; + fvec w34 = j_data->wik_buf[buf_idx_j]; + fvec rr = r23mag * r23mag - r34mag * r34mag; + fvec r24x = r23x + r34x; + fvec r24y = r23y + r34y; + fvec r24z = r23z + r34z; + fvec r242 = r24x * r24x + r24y * r24y + r24z * r24z; + fvec rijrjl = c_2 * r23mag * r34mag; + fvec rjl2 = r34mag * r34mag; + fvec dctjl = fvec::mask_div(fvec::undefined(), mask_inner_1, r242 - rr, + rijrjl * rjl2); + fvec dctji = fvec::mask_div(fvec::undefined(), mask_inner_1, r242 + rr, + rijrjl * r23mag * r23mag); + fvec dctil = fvec::mask_div(fvec::undefined(), mask_inner_1, c_m2, + rijrjl); + fvec dtsijl; + fvec tspijl = aut_Sp2_deriv(cos234, thmin, thmax, &dtsijl); + dtsijl = fvec::setzero() - dtsijl; + fvec prefactor = VA; + + fvec cross321x = r32y * r21z - r32z * r21y; + fvec cross321y = r32z * r21x - r32x * r21z; + fvec cross321z = r32x * r21y - r32y * r21x; + fvec cross234x = r23y * r34z - r23z * r34y; + fvec cross234y = r23z * r34x - r23x * r34z; + fvec cross234z = r23x * r34y - r23y * r34x; + + fvec cwnum = cross321x * cross234x + cross321y * cross234y + cross321z * + cross234z; + fvec cwnom = r21mag * r34mag * r23mag * r23mag * sin321 * sin234; + fvec om1234 = fvec::mask_div(fvec::undefined(), mask_inner_1, cwnum, + cwnom); + fvec cw = om1234; + fvec sum_omega_contrib = (c_1 - om1234 * om1234) * w21 * w34 * + (c_1 - tspjik) * ( c_1 - tspijl); + sum_omega = fvec::mask_add(sum_omega, mask_inner_1, sum_omega, + sum_omega_contrib); + fvec dt1dik = rik2i - dctik * sink2i * cos321; + fvec dt1djk = fvec::setzero() - dctjk * sink2i * cos321; + fvec dt1djl = rjl2i - dctjl * sinl2i * cos234; + fvec dt1dil = fvec::setzero() - dctil * sinl2i * cos234; + fvec dt1dij = fvec::mask_div(fvec::undefined(), mask_inner_1, c_2, + r23mag * r23mag) - + dctij * sink2i * cos321 - dctji * sinl2i * cos234; + + fvec dt2dikx = r23y * cross234z - r23z * cross234y; + fvec dt2diky = r23z * cross234x - r23x * cross234z; + fvec dt2dikz = r23x * cross234y - r23y * cross234x; + + fvec dt2djlx = r23z * cross321y - r23y * cross321z; + fvec dt2djly = r23x * cross321z - r23z * cross321x; + fvec dt2djlz = r23y * cross321x - r23x * cross321y; + + fvec dt2dijx = r21z * cross234y + r34y * cross321z - + ( r34z * cross321y + r21y * cross234z); + fvec dt2dijy = r21x * cross234z + r34z * cross321x - + ( r34x * cross321z + r21z * cross234x); + fvec dt2dijz = r21y * cross234x + r34x * cross321y - + ( r34y * cross321x + r21x * cross234y); + + fvec aa = prefactor * c_2 * fvec::mask_div(fvec::undefined(), + mask_inner_1, cw, cwnom) * + w21 * w34 * (c_1 - tspjik) * ( c_1 - tspijl); + fvec aaa1 = (fvec::setzero() - prefactor) * (c_1 - om1234 * 
om1234) * + (c_1 - tspjik) * (c_1 - tspijl); + fvec aaa2 = (fvec::setzero() - prefactor) * (c_1 - om1234 * om1234) * + w21 * w34; + fvec at2 = aa * cwnum; + + fvec fcijpc = aaa2 * dtsjik * dctij * (c_1 - tspijl) + aaa2 * dtsijl * + dctji * (c_1 - tspjik) - dt1dij * at2; + fvec fcikpc = aaa2 * dtsjik * dctik * (c_1 - tspijl) - dt1dik * at2; + fvec fcjlpc = aaa2 * dtsijl * dctjl * (c_1 - tspjik) - dt1djl * at2; + fvec fcjkpc = aaa2 * dtsjik * dctjk * (c_1 - tspijl) - dt1djk * at2; + fvec fcilpc = aaa2 * dtsijl * dctil * (c_1 - tspjik) - dt1dil * at2; + + fvec F23x = fcijpc * r23x + aa * dt2dijx; + fvec F23y = fcijpc * r23y + aa * dt2dijy; + fvec F23z = fcijpc * r23z + aa * dt2dijz; + + fvec F12x = fcikpc * r21x + aa * dt2dikx; + fvec F12y = fcikpc * r21y + aa * dt2diky; + fvec F12z = fcikpc * r21z + aa * dt2dikz; + + fvec F34x = fcjlpc * r34x + aa * dt2djlx; + fvec F34y = fcjlpc * r34y + aa * dt2djly; + fvec F34z = fcjlpc * r34z + aa * dt2djlz; + + fvec F31x = fcjkpc * r31x; + fvec F31y = fcjkpc * r31y; + fvec F31z = fcjkpc * r31z; + + fvec F24x = fcilpc * r24x; + fvec F24y = fcilpc * r24y; + fvec F24z = fcilpc * r24z; + + fvec f1x = fvec::setzero() - ( F12x + F31x); + fvec f1y = fvec::setzero() - ( F12y + F31y); + fvec f1z = fvec::setzero() - ( F12z + F31z); + fvec f2x = F12x + F31x; + fvec f2y = F12y + F31y; + fvec f2z = F12z + F31z; + fvec f3x = F34x + F24x; + fvec f3y = F34y + F24y; + fvec f3z = F34z + F24z; + fvec f4x = fvec::setzero() - ( F34x + F24x); + fvec f4y = fvec::setzero() - ( F34y + F24y); + fvec f4z = fvec::setzero() - ( F34z + F24z); + + fij[0] = fvec::mask_add(fij[0], mask_inner_1, fij[0], + F23x + F24x - F31x); + fij[1] = fvec::mask_add(fij[1], mask_inner_1, fij[1], + F23y + F24y - F31y); + fij[2] = fvec::mask_add(fij[2], mask_inner_1, fij[2], + F23z + F24z - F31z); + + fvec tmp20 = VA * (c_1 - om1234 * om1234) * (c_1 - tspjik) * + (c_1 - tspijl) * dw21 * w34 * fvec::mask_recip(fvec::undefined(), + mask_inner_1, r21mag); + f2x = f2x - tmp20 * r21x; + f2y = f2y - tmp20 * r21y; + f2z = f2z - tmp20 * r21z; + f1x = f1x + tmp20 * r21x; + f1y = f1y + tmp20 * r21y; + f1z = f1z + tmp20 * r21z; + + fvec tmp21 = VA * (c_1 - om1234 * om1234) * (c_1 - tspjik) * + (c_1 - tspijl) * w21 * dw34 * fvec::mask_recip(fvec::undefined(), + mask_inner_1, r34mag); + f3x = f3x - tmp21 * r34x; + f3y = f3y - tmp21 * r34y; + f3z = f3z - tmp21 * r34z; + f4x = f4x + tmp21 * r34x; + f4y = f4y + tmp21 * r34y; + f4z = f4z + tmp21 * r34z; + + // 1 == buf_idx_i, 2 == i, 3 == j, 4 == buf_idx_j + i_data->force_k_x_buf[buf_idx_i] = + fvec::mask_add(i_data->force_k_x_buf[buf_idx_i], + mask_inner_1, i_data->force_k_x_buf[buf_idx_i], f1x); + i_data->force_k_y_buf[buf_idx_i] = + fvec::mask_add(i_data->force_k_y_buf[buf_idx_i], mask_inner_1, + i_data->force_k_y_buf[buf_idx_i], f1y); + i_data->force_k_z_buf[buf_idx_i] = + fvec::mask_add(i_data->force_k_z_buf[buf_idx_i], mask_inner_1, + i_data->force_k_z_buf[buf_idx_i], f1z); + i_data->force_i_x = + fvec::mask_add(i_data->force_i_x, mask_inner_1, i_data->force_i_x, f2x); + i_data->force_i_y = + fvec::mask_add(i_data->force_i_y, mask_inner_1, i_data->force_i_y, f2y); + i_data->force_i_z = + fvec::mask_add(i_data->force_i_z, mask_inner_1, i_data->force_i_z, f2z); + j_data->force_i_x = + fvec::mask_add(j_data->force_i_x, mask_inner_1, j_data->force_i_x, f3x); + j_data->force_i_y = + fvec::mask_add(j_data->force_i_y, mask_inner_1, j_data->force_i_y, f3y); + j_data->force_i_z = + fvec::mask_add(j_data->force_i_z, mask_inner_1, j_data->force_i_z, f3z); + 
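+      /* Note on the masked wrappers used for these buffer updates: per
+       * lane, fvec::mask_add(src, m, a, b) yields a + b where m is set
+       * and src otherwise (mirroring the masked-intrinsic convention),
+       * so the writes below only touch lanes active in mask_inner_1 and
+       * leave all other accumulator lanes unchanged. */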
j_data->force_k_x_buf[buf_idx_j] = + fvec::mask_add(j_data->force_k_x_buf[buf_idx_j], mask_inner_1, + j_data->force_k_x_buf[buf_idx_j], f4x); + j_data->force_k_y_buf[buf_idx_j] = + fvec::mask_add(j_data->force_k_y_buf[buf_idx_j], mask_inner_1, + j_data->force_k_y_buf[buf_idx_j], f4y); + j_data->force_k_z_buf[buf_idx_j] = + fvec::mask_add(j_data->force_k_z_buf[buf_idx_j], mask_inner_1, + j_data->force_k_z_buf[buf_idx_j], f4z); + } + } + return sum_omega; +} + +static fvec aut_frebo_pi_dh( + KernelArgsAIREBOT * _noalias ka, + struct aut_frebo_data * _noalias i_data, + struct aut_frebo_data * _noalias j_data, + int itype, int jtype, ivec vi, ivec vj, + fvec r23x, fvec r23y, fvec r23z, fvec r23mag, + fvec VA, + fvec Nij, fvec Nji, fvec Nijconj, fvec NconjtmpI, fvec NconjtmpJ, + fvec fij[3] +) { + fvec c_TOL = fvec::set1(TOL); + fvec dN3[3]; + fvec Tij = aut_frebo_Tij(ka, itype, jtype, Nij, Nji, Nijconj, &dN3[0]); + bvec TijgtTOLmask = fvec::cmpnle(fvec::abs(Tij), c_TOL); + fvec sum_omega = fvec::setzero(); + if (bvec::test_any_set(TijgtTOLmask)) { + sum_omega = aut_frebo_sum_omega( + ka, i_data, j_data, itype, jtype, vi, vj, + r23x, r23y, r23z, r23mag, VA * Tij, fij); + sum_omega = fvec::mask_blend(TijgtTOLmask, fvec::setzero(), sum_omega); + aut_frebo_N_spline_force(ka, i_data, itype, jtype, vi, vj, VA * sum_omega, + dN3[0], dN3[2], NconjtmpI); + aut_frebo_N_spline_force(ka, j_data, jtype, itype, vj, vi, VA * sum_omega, + dN3[1], dN3[2], NconjtmpJ); + } + return Tij * sum_omega; +} + +/* + We can reuse the aut_frebo_data buffers here to do this calculation very + cheaply. +*/ +static void aut_torsion_vec( + KernelArgsAIREBOT * ka, + struct aut_frebo_data * i_data, + struct aut_frebo_data * j_data, + ivec i, ivec j, fvec wij, fvec dwij +) { + AtomAIREBOT * x = ka->x; + int * map = ka->map; + flt_t (*epsilonT)[2] = ka->params.epsilonT; + fvec epsilonT00 = fvec::set1(epsilonT[0][0]); + fvec epsilonT01 = fvec::set1(epsilonT[0][1]); + fvec epsilonT10 = fvec::set1(epsilonT[1][0]); + fvec epsilonT11 = fvec::set1(epsilonT[1][1]); + fvec thmin = fvec::set1(ka->params.thmin); + fvec thmax = fvec::set1(ka->params.thmax); + + const fvec c_1_0 = fvec::set1(1.0); + const fvec c_0_5 = fvec::set1(0.5); + const fvec c_0_1 = fvec::set1(0.1); + const fvec c_2_0 = fvec::set1(2.0); + const fvec c_2_5 = fvec::set1(2.5); + const fvec c_256_405 = fvec::set1(256.0/405.0); + + fvec del32x = j_data->x_i - i_data->x_i; + fvec del32y = j_data->y_i - i_data->y_i; + fvec del32z = j_data->z_i - i_data->z_i; + fvec rsq = del32x * del32x + del32y * del32y + del32z * del32z; + fvec r32 = fvec::sqrt(rsq); + fvec del23x = fvec::setzero() - del32x; + fvec del23y = fvec::setzero() - del32y; + fvec del23z = fvec::setzero() - del32z; + fvec r23 = r32; + fvec w23 = wij; + fvec dw23 = dwij; + + for (int buf_idx_i = 0; buf_idx_i < i_data->buf_len; buf_idx_i++) { + bvec mask_start = i_data->mask_buf[buf_idx_i]; + fvec del21x = i_data->rikx_buf[buf_idx_i]; // a2 - a1 -> i - k + fvec del21y = i_data->riky_buf[buf_idx_i]; // a2 - a1 -> i - k + fvec del21z = i_data->rikz_buf[buf_idx_i]; // a2 - a1 -> i - k + fvec r21 = i_data->rikmag_buf[buf_idx_i]; + fvec cos321 = i_data->cosjik_buf[buf_idx_i]; + fvec sin321 = fvec::sqrt(c_1_0 - cos321 * cos321); + // strictly equivalent to sin321 < TOL + mask_start = fvec::mask_cmpneq(mask_start, fvec::setzero(), sin321); + if (! 
bvec::test_any_set(mask_start)) continue; + + fvec deljkx = del21x - del23x; + fvec deljky = del21y - del23y; + fvec deljkz = del21z - del23z; + fvec rjk2 = deljkx * deljkx + deljky * deljky + deljkz * deljkz; + fvec rjk = fvec::sqrt(rjk2); + fvec rik2 = r21 * r21; + fvec w21 = i_data->wik_buf[buf_idx_i]; + fvec dw21 = i_data->dwik_buf[buf_idx_i]; + + fvec rij = r32; + fvec rik = r21; + fvec rij2 = r32 * r32; + fvec dtsjik; + fvec tspjik = aut_Sp2_deriv(cos321, thmin, thmax, &dtsjik); + dtsjik = fvec::setzero() - dtsjik; + + bvec ktype_mask = i_data->ktype_buf[buf_idx_i]; + fvec epsilonT0 = fvec::mask_blend(ktype_mask, epsilonT00, epsilonT10); + fvec epsilonT1 = fvec::mask_blend(ktype_mask, epsilonT01, epsilonT11); + + ivec k = i_data->k_buf[buf_idx_i]; + for (int buf_idx_j = 0; buf_idx_j < j_data->buf_len; buf_idx_j++) { + ivec l = j_data->k_buf[buf_idx_j]; + bvec mask_inner_0 = ivec::mask_cmpneq(mask_start, k, l) & + j_data->mask_buf[buf_idx_j]; + if (! bvec::test_any_set(mask_inner_0)) continue; + fvec del34x = j_data->rikx_buf[buf_idx_j]; + fvec del34y = j_data->riky_buf[buf_idx_j]; + fvec del34z = j_data->rikz_buf[buf_idx_j]; + fvec r34 = j_data->rikmag_buf[buf_idx_j]; + bvec ltype_mask = j_data->ktype_buf[buf_idx_j]; + fvec cos234 = j_data->cosjik_buf[buf_idx_j]; + fvec sin234 = fvec::sqrt(c_1_0 - cos234 * cos234); + // strictly equivalent to sin234 < TOL + mask_inner_0 = fvec::mask_cmpneq(mask_inner_0, sin234, fvec::setzero()); + if (! bvec::test_any_set(mask_inner_0)) continue; + fvec dw34 = j_data->dwik_buf[buf_idx_j]; + fvec w34 = j_data->wik_buf[buf_idx_j]; + fvec delilx = del23x + del34x; + fvec delily = del23y + del34y; + fvec delilz = del23z + del34z; + fvec ril2 = delilx * delilx + delily * delily + delilz * delilz; + fvec ril = fvec::sqrt(ril2); + fvec rjl2 = r34 * r34; + + fvec rjl = r34; + fvec dtsijl; + fvec tspijl = aut_Sp2_deriv(cos234, thmin, thmax, &dtsijl); + dtsijl = fvec::setzero() - dtsijl; + fvec cross321x = del32y * del21z - del32z * del21y; + fvec cross321y = del32z * del21x - del32x * del21z; + fvec cross321z = del32x * del21y - del32y * del21x; + fvec cross321mag = fvec::sqrt(cross321x * cross321x + + cross321y * cross321y + + cross321z * cross321z); + fvec cross234x = del23y * del34z - del23z * del34y; + fvec cross234y = del23z * del34x - del23x * del34z; + fvec cross234z = del23x * del34y - del23y * del34x; + fvec cross234mag = fvec::sqrt(cross234x * cross234x + + cross234y * cross234y + + cross234z * cross234z); + fvec cwnum = cross321x * cross234x + cross321y * cross234y + + cross321z * cross234z; + fvec cwnom = r21 * r34 * r32 * r32 * sin321 * sin234; + fvec cw = cwnum / cwnom; + + fvec cw2 = c_0_5 * ( c_1_0 - cw); + fvec ekijl = fvec::mask_blend(ltype_mask, epsilonT0, epsilonT1); + fvec Ec = c_256_405 * ekijl; + fvec cw2_5 = cw2 * cw2 * cw2 * cw2 * cw2; + fvec Vtors = Ec * cw2_5 - ekijl * c_0_1; + + fvec evdwl = Vtors * w21 * w23 * w34 * (c_1_0-tspjik) * (c_1_0-tspijl); + ka->result_eng += fvec::mask_reduce_add(mask_inner_0, evdwl); + + fvec dndijx = cross234y * del21z - cross234z * del21y; + fvec dndijy = cross234z * del21x - cross234x * del21z; + fvec dndijz = cross234x * del21y - cross234y * del21x; + + fvec tmpvecx = del34y * cross321z - del34z * cross321y; + fvec tmpvecy = del34z * cross321x - del34x * cross321z; + fvec tmpvecz = del34x * cross321y - del34y * cross321x; + + dndijx = dndijx + tmpvecx; + dndijy = dndijy + tmpvecy; + dndijz = dndijz + tmpvecz; + + fvec dndikx = del23y * cross234z - del23z * cross234y; + fvec dndiky = del23z * 
cross234x - del23x * cross234z; + fvec dndikz = del23x * cross234y - del23y * cross234x; + + fvec dndjlx = cross321y * del23z - cross321z * del23y; + fvec dndjly = cross321z * del23x - cross321x * del23z; + fvec dndjlz = cross321x * del23y - cross321y * del23x; + + fvec r23sq = r23 * r23; + fvec r21sq = r21 * r21; + fvec r34sq = r34 * r34; + fvec rjksq = rjk * rjk; + fvec rilsq = ril * ril; + fvec dcidij = (r23sq - r21sq + rjksq) / ( c_2_0 * r23sq * r21); + fvec dcidik = (r21sq - r23sq + rjksq) / ( c_2_0 * r21sq * r23); + fvec dcidjk = fvec::setzero() - rjk / ( r23 * r21); + fvec dcjdji = (r23sq - r34sq + rilsq) / ( c_2_0 * r23sq * r34); + fvec dcjdjl = (r34sq - r23sq + rilsq) / ( c_2_0 * r34sq * r23); + fvec dcjdil = fvec::setzero() - ril / ( r23 * r34); + + fvec dsidij = fvec::setzero() - cos321 / sin321 * dcidij; + fvec dsidik = fvec::setzero() - cos321 / sin321 * dcidik; + fvec dsidjk = fvec::setzero() - cos321 / sin321 * dcidjk; + + fvec dsjdji = fvec::setzero() - cos234 / sin234 * dcjdji; + fvec dsjdjl = fvec::setzero() - cos234 / sin234 * dcjdjl; + fvec dsjdil = fvec::setzero() - cos234 / sin234 * dcjdil; + + fvec dxidij = r21 * sin321 + r23 * r21 * dsidij; + fvec dxidik = r23 * sin321 + r23 * r21 * dsidik; + fvec dxidjk = r23 * r21 * dsidjk; + + fvec dxjdji = r34 * sin234 + r23 * r34 * dsjdji; + fvec dxjdjl = r23 * sin234 + r23 * r34 * dsjdjl; + fvec dxjdil = r23 * r34 * dsjdil; + + fvec ddndij = dxidij * cross234mag + cross321mag * dxjdji; + fvec ddndik = dxidik * cross234mag; + fvec ddndjk = dxidjk * cross234mag; + fvec ddndjl = cross321mag * dxjdjl; + fvec ddndil = cross321mag * dxjdil; + fvec dcwddn = fvec::setzero() - cwnum / ( cwnom * cwnom); + fvec dcwdn = fvec::recip(cwnom); + fvec cw2_4 = cw2 * cw2 * cw2 * cw2; + fvec dvpdcw = c_2_5 * Ec * cw2_4 * w23 * w21 * w34 * (c_1_0 - tspjik) * + (c_1_0 - tspijl); + + fvec Ftmpx = dvpdcw * (dcwdn * dndijx + dcwddn * ddndij * del23x / r23); + fvec Ftmpy = dvpdcw * (dcwdn * dndijy + dcwddn * ddndij * del23y / r23); + fvec Ftmpz = dvpdcw * (dcwdn * dndijz + dcwddn * ddndij * del23z / r23); + fvec fix = Ftmpx; + fvec fiy = Ftmpy; + fvec fiz = Ftmpz; + fvec fjx = fvec::setzero() - Ftmpx; + fvec fjy = fvec::setzero() - Ftmpy; + fvec fjz = fvec::setzero() - Ftmpz; + + Ftmpx = dvpdcw * (dcwdn * dndikx + dcwddn * ddndik * del21x / r21); + Ftmpy = dvpdcw * (dcwdn * dndiky + dcwddn * ddndik * del21y / r21); + Ftmpz = dvpdcw * (dcwdn * dndikz + dcwddn * ddndik * del21z / r21); + fix = fix + Ftmpx; + fiy = fiy + Ftmpy; + fiz = fiz + Ftmpz; + fvec fkx = fvec::setzero() - Ftmpx; + fvec fky = fvec::setzero() - Ftmpy; + fvec fkz = fvec::setzero() - Ftmpz; + + Ftmpx = dvpdcw * dcwddn * ddndjk * deljkx / rjk; + Ftmpy = dvpdcw * dcwddn * ddndjk * deljky / rjk; + Ftmpz = dvpdcw * dcwddn * ddndjk * deljkz / rjk; + fjx = fjx + Ftmpx; + fjy = fjy + Ftmpy; + fjz = fjz + Ftmpz; + fkx = fkx - Ftmpx; + fky = fky - Ftmpy; + fkz = fkz - Ftmpz; + + Ftmpx = dvpdcw * (dcwdn * dndjlx + dcwddn * ddndjl * del34x / r34); + Ftmpy = dvpdcw * (dcwdn * dndjly + dcwddn * ddndjl * del34y / r34); + Ftmpz = dvpdcw * (dcwdn * dndjlz + dcwddn * ddndjl * del34z / r34); + fjx = fjx + Ftmpx; + fjy = fjy + Ftmpy; + fjz = fjz + Ftmpz; + fvec flx = fvec::setzero() - Ftmpx; + fvec fly = fvec::setzero() - Ftmpy; + fvec flz = fvec::setzero() - Ftmpz; + + Ftmpx = dvpdcw * dcwddn * ddndil * delilx / ril; + Ftmpy = dvpdcw * dcwddn * ddndil * delily / ril; + Ftmpz = dvpdcw * dcwddn * ddndil * delilz / ril; + fix = fix + Ftmpx; + fiy = fiy + Ftmpy; + fiz = fiz + Ftmpz; + flx = flx - Ftmpx; + 
fly = fly - Ftmpy; + flz = flz - Ftmpz; + + // coordination forces + + fvec fpair = Vtors * dw21 * w23 * w34 * (c_1_0 - tspjik) * + (c_1_0 - tspijl) / r21; + fix = fix - del21x * fpair; + fiy = fiy - del21y * fpair; + fiz = fiz - del21z * fpair; + fkx = fkx + del21x * fpair; + fky = fky + del21y * fpair; + fkz = fkz + del21z * fpair; + + fpair = Vtors * w21 * dw23 * w34 * (c_1_0 - tspjik) * (c_1_0 - tspijl) / + r23; + fix = fix - del23x * fpair; + fiy = fiy - del23y * fpair; + fiz = fiz - del23z * fpair; + fjx = fjx + del23x * fpair; + fjy = fjy + del23y * fpair; + fjz = fjz + del23z * fpair; + + fpair = Vtors * w21 * w23 * dw34 * (c_1_0 - tspjik) * (c_1_0 - tspijl) / + r34; + fjx = fjx - del34x * fpair; + fjy = fjy - del34y * fpair; + fjz = fjz - del34z * fpair; + flx = flx + del34x * fpair; + fly = fly + del34y * fpair; + flz = flz + del34z * fpair; + + // additional cut off function forces + + fvec fcpc = fvec::setzero() - Vtors * w21 * w23 * w34 * dtsjik * (c_1_0 - + tspijl); + fpair = fcpc * dcidij / rij; + fix = fix + fpair * del23x; + fiy = fiy + fpair * del23y; + fiz = fiz + fpair * del23z; + fjx = fjx - fpair * del23x; + fjy = fjy - fpair * del23y; + fjz = fjz - fpair * del23z; + + fpair = fcpc * dcidik / rik; + fix = fix + fpair * del21x; + fiy = fiy + fpair * del21y; + fiz = fiz + fpair * del21z; + fkx = fkx - fpair * del21x; + fky = fky - fpair * del21y; + fkz = fkz - fpair * del21z; + + fpair = fcpc * dcidjk / rjk; + fjx = fjx + fpair * deljkx; + fjy = fjy + fpair * deljky; + fjz = fjz + fpair * deljkz; + fkx = fkx - fpair * deljkx; + fky = fky - fpair * deljky; + fkz = fkz - fpair * deljkz; + + fcpc = fvec::setzero() - Vtors * w21 * w23 * w34 * (c_1_0 - tspjik) * + dtsijl; + fpair = fcpc * dcjdji / rij; + fix = fix + fpair * del23x; + fiy = fiy + fpair * del23y; + fiz = fiz + fpair * del23z; + fjx = fjx - fpair * del23x; + fjy = fjy - fpair * del23y; + fjz = fjz - fpair * del23z; + + fpair = fcpc * dcjdjl / rjl; + fjx = fjx + fpair * del34x; + fjy = fjy + fpair * del34y; + fjz = fjz + fpair * del34z; + flx = flx - fpair * del34x; + fly = fly - fpair * del34y; + flz = flz - fpair * del34z; + + fpair = fcpc * dcjdil / ril; + fix = fix + fpair * delilx; + fiy = fiy + fpair * delily; + fiz = fiz + fpair * delilz; + flx = flx - fpair * delilx; + fly = fly - fpair * delily; + flz = flz - fpair * delilz; + + // sum per-atom forces into atom force array + + i_data->force_i_x = fvec::mask_add(i_data->force_i_x, mask_inner_0, + i_data->force_i_x, fix); + i_data->force_i_y = fvec::mask_add(i_data->force_i_y, mask_inner_0, + i_data->force_i_y, fiy); + i_data->force_i_z = fvec::mask_add(i_data->force_i_z, mask_inner_0, + i_data->force_i_z, fiz); + i_data->force_j_x = fvec::mask_add(i_data->force_j_x, mask_inner_0, + i_data->force_j_x, fjx); + i_data->force_j_y = fvec::mask_add(i_data->force_j_y, mask_inner_0, + i_data->force_j_y, fjy); + i_data->force_j_z = fvec::mask_add(i_data->force_j_z, mask_inner_0, + i_data->force_j_z, fjz); + i_data->force_k_x_buf[buf_idx_i] = + fvec::mask_add(i_data->force_k_x_buf[buf_idx_i], mask_inner_0, + i_data->force_k_x_buf[buf_idx_i], fkx); + i_data->force_k_y_buf[buf_idx_i] = + fvec::mask_add(i_data->force_k_y_buf[buf_idx_i], mask_inner_0, + i_data->force_k_y_buf[buf_idx_i], fky); + i_data->force_k_z_buf[buf_idx_i] = + fvec::mask_add(i_data->force_k_z_buf[buf_idx_i], mask_inner_0, + i_data->force_k_z_buf[buf_idx_i], fkz); + j_data->force_k_x_buf[buf_idx_j] = + fvec::mask_add(j_data->force_k_x_buf[buf_idx_j], mask_inner_0, + 
j_data->force_k_x_buf[buf_idx_j], flx);
+      j_data->force_k_y_buf[buf_idx_j] =
+        fvec::mask_add(j_data->force_k_y_buf[buf_idx_j], mask_inner_0,
+                       j_data->force_k_y_buf[buf_idx_j], fly);
+      j_data->force_k_z_buf[buf_idx_j] =
+        fvec::mask_add(j_data->force_k_z_buf[buf_idx_j], mask_inner_0,
+                       j_data->force_k_z_buf[buf_idx_j], flz);
+    }
+  }
+}
+
+/*
+ * Processes VL elements of the same type itype/jtype for REBO and TORSION
+ * interactions. This allows us to reuse the aut_frebo_data buffers in the
+ * torsion calculation.
+ */
+static void aut_frebo_batch_of_kind(KernelArgsAIREBOT * ka,
+    int torflag, int itype, int jtype,
+    int * i_buf, int * j_buf) {
+  { // jump-scope for exceed_limits
+    AtomAIREBOT * x = ka->x;
+    int * tag = ka->tag;
+    int * map = ka->map;
+    ResultForceT * result_f = ka->result_f;
+    flt_t rcminij = ka->params.rcmin[itype][jtype];
+    flt_t rcmaxij = ka->params.rcmax[itype][jtype];
+    flt_t Qij = ka->params.Q[itype][jtype];
+    flt_t Aij = ka->params.A[itype][jtype];
+    flt_t alphaij = ka->params.alpha[itype][jtype];
+    fvec vrcminij = fvec::set1(ka->params.rcmin[itype][jtype]);
+    fvec vrcmaxij = fvec::set1(ka->params.rcmax[itype][jtype]);
+    fvec vQij = fvec::set1(ka->params.Q[itype][jtype]);
+    fvec vAij = fvec::set1(ka->params.A[itype][jtype]);
+    fvec malphaij = fvec::set1(-ka->params.alpha[itype][jtype]);
+    fvec c_1_0 = fvec::set1(1);
+    fvec c_0_5 = fvec::set1(0.5);
+    fvec c_TOL = fvec::set1(1e-9);
+    struct aut_frebo_data i_data, j_data;
+
+    fvec evdwl_vacc = fvec::setzero();
+    ivec vi = ivec::maskz_loadu(bvec::full(), i_buf);
+    int tmp;
+    ivec vj = ivec::maskz_loadu(bvec::full(), j_buf);
+    fvec x_i, y_i, z_i;
+    fvec x_j, y_j, z_j;
+    aut_loadatoms_vec_notype(x, vi, &x_i, &y_i, &z_i);
+    aut_loadatoms_vec_notype(x, vj, &x_j, &y_j, &z_j);
+    i_data.x_i = x_i;
+    i_data.y_i = y_i;
+    i_data.z_i = z_i;
+    i_data.x_j = x_j;
+    i_data.y_j = y_j;
+    i_data.z_j = z_j;
+    j_data.x_i = x_j;
+    j_data.y_i = y_j;
+    j_data.z_i = z_j;
+    j_data.x_j = x_i;
+    j_data.y_j = y_i;
+    j_data.z_j = z_i;
+    fvec delx = x_i - x_j;
+    fvec dely = y_i - y_j;
+    fvec delz = z_i - z_j;
+    fvec rsq = delx * delx + dely * dely + delz * delz;
+    fvec rij = fvec::sqrt(rsq);
+    fvec dwij;
+    fvec wij = aut_Sp_deriv(rij, vrcminij, vrcmaxij, &dwij);
+
+    fvec exp_alphar = fvec::exp(malphaij * rij);
+    fvec Qij_over_rij = vQij / rij;
+    fvec Qij_over_rsq = vQij / rsq;
+    fvec VR_by_wij = ( c_1_0 + Qij_over_rij) * vAij * exp_alphar;
+    fvec VR = wij * VR_by_wij;
+    fvec pre = wij * vAij * exp_alphar;
+    fvec dVRdi = pre * ( malphaij + malphaij * Qij_over_rij - Qij_over_rsq);
+    dVRdi = dVRdi + VR_by_wij * dwij;
+
+    fvec VA_by_wij = fvec::setzero();
+    fvec dVA = fvec::setzero();
+
+    int k;
+    for (k = 0; k < 3; k++) {
+      fvec mBIJc = fvec::set1(-ka->params.BIJc[itype][jtype][k]);
+      fvec mBetaij = fvec::set1(-ka->params.Beta[itype][jtype][k]);
+      fvec term = mBIJc * fvec::exp(mBetaij * rij);
+      VA_by_wij = VA_by_wij + term;
+      dVA = dVA + mBetaij * wij * term;
+    }
+
+    dVA = dVA + dwij * VA_by_wij;
+    fvec VA = wij * VA_by_wij;
+
+    bvec tol_check = fvec::cmplt(wij, c_TOL);
+    VA = fvec::mask_blend(tol_check, VA, fvec::setzero());
+    dVA = fvec::mask_blend(tol_check, dVA, fvec::setzero());
+    VR = fvec::mask_blend(tol_check, VR, fvec::setzero());
+    dVRdi = fvec::mask_blend(tol_check, dVRdi, fvec::setzero());
+
+    fvec nHi = fvec::gather(vi, ka->nH, sizeof(flt_t));
+    fvec nCi = fvec::gather(vi, ka->nC, sizeof(flt_t));
+    fvec nHj = fvec::gather(vj, ka->nH, sizeof(flt_t));
+    fvec nCj = fvec::gather(vj, ka->nC, sizeof(flt_t));
+    fvec Nij = (nHi + nCi) - wij;
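+    /* Coordination numbers with the i-j bond itself excluded:
+     *   N_ij = nC_i + nH_i - w_ij   (above)
+     *   N_ji = nC_j + nH_j - w_ij   (below)
+     * i.e. each atom's neighbor count omits the current partner. */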
fvec Nji = (nHj + nCj) - wij; + i_data.nHi = nHi; + i_data.nCi = nCi; + j_data.nHi = nHj; + j_data.nCi = nCj; + fvec fij[3], fji[3]; + fij[0] = fvec::setzero(); fij[1] = fvec::setzero(); + fij[2] = fvec::setzero(); + fji[0] = fvec::setzero(); fji[1] = fvec::setzero(); + fji[2] = fvec::setzero(); + + fvec NconjtmpI; + fvec pij = aut_frebo_pij_pd_2( + ka, &i_data, itype, jtype, vi, vj, + delx, dely, delz, rij, wij, VA, &NconjtmpI, fij); + + if (i_data.buf_len < 0) goto exceed_limits; + + fvec NconjtmpJ; + fvec rjix = fvec::setzero() - delx; + fvec rjiy = fvec::setzero() - dely; + fvec rjiz = fvec::setzero() - delz; + fvec pji = aut_frebo_pij_pd_2( + ka, &j_data, jtype, itype, vj, vi, + rjix, rjiy, rjiz, rij, wij, VA, &NconjtmpJ, fji); + fij[0] = fij[0] - fji[0]; + fij[1] = fij[1] - fji[1]; + fij[2] = fij[2] - fji[2]; + + if (j_data.buf_len < 0) goto exceed_limits; + + if (torflag && itype == 0 && jtype == 0) + aut_torsion_vec(ka, &i_data, &j_data, vi, vj, wij, dwij); + + fvec Nijconj = c_1_0 + NconjtmpI * NconjtmpI + NconjtmpJ * NconjtmpJ; + fvec dN3[3]; + fvec pi_rc = aut_frebo_pi_rc_pd(ka, itype, jtype, Nij, Nji, Nijconj, dN3); + aut_frebo_N_spline_force(ka, &i_data, itype, jtype, vi, vj, VA, dN3[0], + dN3[2], NconjtmpI); + aut_frebo_N_spline_force(ka, &j_data, jtype, itype, vj, vi, VA, dN3[1], + dN3[2], NconjtmpJ); + fvec pi_dh = aut_frebo_pi_dh(ka, &i_data, &j_data, itype, jtype, vi, vj, + delx, dely, delz, rij, VA, Nij, Nji, Nijconj, + NconjtmpI, NconjtmpJ, fij); + + fvec bij = c_0_5 * ( pij + pji) + pi_rc + pi_dh; + fvec dVAdi = bij * dVA; + fvec fpair = (dVAdi + dVRdi) * fvec::recip(rij); + fvec result_f_j_x = fpair * delx - fij[0]; + fvec result_f_j_y = fpair * dely - fij[1]; + fvec result_f_j_z = fpair * delz - fij[2]; + fvec result_f_i_x = fvec::setzero() - result_f_j_x; + fvec result_f_i_y = fvec::setzero() - result_f_j_y; + fvec result_f_i_z = fvec::setzero() - result_f_j_z; + fvec evdwl = VR + bij * VA; + evdwl_vacc = evdwl_vacc + evdwl; + + aut_frebo_data_writeback(ka, &i_data); + aut_frebo_data_writeback(ka, &j_data); + + flt_t fi_x_buf[fvec::VL] __attribute__((aligned(64))); + flt_t fi_y_buf[fvec::VL] __attribute__((aligned(64))); + flt_t fi_z_buf[fvec::VL] __attribute__((aligned(64))); + int fi_i_buf[ivec::VL] __attribute__((aligned(64))); + flt_t fj_x_buf[fvec::VL] __attribute__((aligned(64))); + flt_t fj_y_buf[fvec::VL] __attribute__((aligned(64))); + flt_t fj_z_buf[fvec::VL] __attribute__((aligned(64))); + int fj_j_buf[ivec::VL] __attribute__((aligned(64))); + flt_t evdwl_buf[fvec::VL] __attribute__((aligned(64))); + + result_f_i_x = i_data.force_i_x + result_f_i_x; + result_f_i_y = i_data.force_i_y + result_f_i_y; + result_f_i_z = i_data.force_i_z + result_f_i_z; + result_f_j_x = i_data.force_j_x + result_f_j_x; + result_f_j_y = i_data.force_j_y + result_f_j_y; + result_f_j_z = i_data.force_j_z + result_f_j_z; + + result_f_i_x = j_data.force_j_x + result_f_i_x; + result_f_i_y = j_data.force_j_y + result_f_i_y; + result_f_i_z = j_data.force_j_z + result_f_i_z; + result_f_j_x = j_data.force_i_x + result_f_j_x; + result_f_j_y = j_data.force_i_y + result_f_j_y; + result_f_j_z = j_data.force_i_z + result_f_j_z; + + fvec::store(fi_x_buf, result_f_i_x); + fvec::store(fi_y_buf, result_f_i_y); + fvec::store(fi_z_buf, result_f_i_z); + ivec::store(fi_i_buf, vi); + fvec::store(fj_x_buf, result_f_j_x); + fvec::store(fj_y_buf, result_f_j_y); + fvec::store(fj_z_buf, result_f_j_z); + ivec::store(fj_j_buf, vj); + fvec::store(evdwl_buf, evdwl); + + int lane; + for (lane = 0; lane < 
fvec::VL; lane++) { + int ii = fi_i_buf[lane]; + result_f[ii].x += fi_x_buf[lane]; + result_f[ii].y += fi_y_buf[lane]; + result_f[ii].z += fi_z_buf[lane]; + result_f[ii].w += 0.5 * evdwl_buf[lane]; + int jj = fj_j_buf[lane]; + result_f[jj].x += fj_x_buf[lane]; + result_f[jj].y += fj_y_buf[lane]; + result_f[jj].z += fj_z_buf[lane]; + result_f[jj].w += 0.5 * evdwl_buf[lane]; + } + ka->result_eng += fvec::reduce_add(evdwl_vacc); + return; + } +exceed_limits: + for (int l = 0; l < fvec::VL; l++) { + int i = i_buf[l]; + int j = j_buf[l]; + ref_frebo_single_interaction(ka, i, j); + if (torflag && itype == 0 && jtype == 0) + ref_torsion_single_interaction(ka, i, j); + } +} + +/* + Orders the interactions by itype and jtype and passes chunks to the above + method. +*/ +static void aut_frebo(KernelArgsAIREBOT * ka, int torflag) { + AtomAIREBOT * _noalias x = ka->x; + int * _noalias tag = ka->tag; + int * _noalias map = ka->map; + int i_buf[2][2][fvec::VL]; + int j_buf[2][2][fvec::VL]; + int n_buf[2][2] = {0}; + for (int i = ka->frebo_from_atom; i < ka->frebo_to_atom; i++) { + int itag = tag[i]; + int itype = map[x[i].w]; + flt_t x_i = x[i].x; + flt_t y_i = x[i].y; + flt_t z_i = x[i].z; + int * neighs = ka->neigh_rebo.entries + ka->neigh_rebo.offset[i]; + int jnum = ka->neigh_rebo.num[i]; + for (int jj = 0; jj < jnum; jj++) { + int j = neighs[jj]; + int jtag = tag[j]; + if (itag > jtag) { + if (((itag + jtag) & 1) == 0) + continue; + } else if (itag < jtag) { + if (((itag + jtag) & 1) == 1) + continue; + } else { + if (x[j].z < z_i) + continue; + if (x[j].z == z_i && x[j].y < y_i) + continue; + if (x[j].z == z_i && x[j].y == y_i && x[j].x < x_i) + continue; + } + int jtype = map[x[j].w]; + int ins = n_buf[itype][jtype]; + i_buf[itype][jtype][ins] = i; + j_buf[itype][jtype][ins] = j; + n_buf[itype][jtype] += 1; + if (n_buf[itype][jtype] == fvec::VL) { + aut_frebo_batch_of_kind(ka, torflag, itype, jtype, + i_buf[itype][jtype], j_buf[itype][jtype]); + n_buf[itype][jtype] = 0; + } + } + } + for (int itype = 0; itype < 2; itype++) { + for (int jtype = 0; jtype < 2; jtype++) { + for (int l = 0; l < n_buf[itype][jtype]; l++) { + int i = i_buf[itype][jtype][l]; + int j = j_buf[itype][jtype][l]; + ref_frebo_single_interaction(ka, i, j); + if (torflag && itype == 0 && jtype == 0) + ref_torsion_single_interaction(ka, i, j); + } + } + } +} + +/* + * Apply paths in scalar fashion, not crucial for performance. + */ +static void aut_airebo_lj_force_path(KernelArgsAIREBOT * ka, + bvec mask, fvec dC, LennardJonesPathAIREBOT path[fvec::VL]) { + for (int i = 0; i < fvec::VL; i++) { + if (bvec::test_at(mask, i)) { + ref_lennard_jones_force_path(ka, fvec::at(dC, i), &path[i]); + } + } +} + +/* + * Hash-Map for efficient calculation of C_ij. + * Can have up to ITEMS entries with associated paths, as well as + * 1024 entries. Open addressing, invalidation by using a different i. + * Only needs to be reset once per timestep. 
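+ * Concretely, there are OPT_TEST_PATH_SIZE (1024) hash slots, of which
+ * at most OPT_TEST_PATH_ITEMS (128) can carry an explicit test path.
+ * A scalar sketch of the probing scheme used below (the "%" form agrees
+ * with the "&(SIZE-1)" form in the vector variant because the table
+ * size is a power of two):
+ *
+ *   int slot = (j * 2654435761u + attempt) % OPT_TEST_PATH_SIZE;
+ *   while (map.i[slot] == i && map.j[slot] != j)   // occupied, wrong j
+ *     slot = hash(j, ++attempt);                   // probe next slot
+ *   // a slot with map.i[slot] != i counts as free: entries written for
+ *   // an earlier i are simply treated as stale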
+ */ +static const int OPT_TEST_PATH_SIZE = 1024; +static const int OPT_TEST_PATH_ITEMS = 128; +struct aut_airebo_lj_test_path_result_data { + LennardJonesPathAIREBOT testpath[OPT_TEST_PATH_ITEMS]; + int i[OPT_TEST_PATH_SIZE]; + int j[OPT_TEST_PATH_SIZE]; + flt_t cij[OPT_TEST_PATH_SIZE]; + int testpath_idx[OPT_TEST_PATH_SIZE]; +}; +static const unsigned int OPT_TEST_PATH_HASH = 2654435761; + +static int aut_lj_tap_hash_fn(int j, int attempt) { + uint32_t result = j; + result *= (uint32_t) OPT_TEST_PATH_HASH; + result += (uint32_t) attempt; + result %= (uint32_t) OPT_TEST_PATH_SIZE; + return result; +} + +static ivec aut_airebo_lj_tap_hash_fn_vec(ivec val, ivec attempt) { + const ivec golden = ivec::set1(OPT_TEST_PATH_HASH); + const ivec mask = ivec::set1(OPT_TEST_PATH_SIZE - 1); + ivec a = ivec::mullo(golden, val); + ivec b = a + attempt; + ivec c = ivec::the_and(b, mask); + return c; +} + +/* + * Enter all those (potential) neighbors of i (including 2nd and 3rd degree) + * into the hash-map. There is no good way to vectorize this, and it does not + * seem time-critical. + */ +static bool aut_airebo_lj_test_all_paths(KernelArgsAIREBOT * ka, + int i, struct aut_airebo_lj_test_path_result_data * result) { + AtomAIREBOT * x = ka->x; + int * map = ka->map; + flt_t (*rcmin)[2] = &ka->params.rcmin[0]; + flt_t (*rcmax)[2] = &ka->params.rcmax[0]; + flt_t rcminsq[2][2]; + rcminsq[0][0] = rcmin[0][0] * rcmin[0][0]; + rcminsq[0][1] = rcmin[0][1] * rcmin[0][1]; + rcminsq[1][0] = rcmin[1][0] * rcmin[1][0]; + rcminsq[1][1] = rcmin[1][1] * rcmin[1][1]; + int * neighs_i = &ka->neigh_rebo.entries[ka->neigh_rebo.offset[i]]; + int itype = map[x[i].w]; + int path_insert_pos = 0; + for (int jj = 0; jj < ka->neigh_rebo.num[i]; jj++) { + int j = neighs_i[jj]; + int jtype = map[x[j].w]; + flt_t dijx = x[j].x - x[i].x; + flt_t dijy = x[j].y - x[i].y; + flt_t dijz = x[j].z - x[i].z; + flt_t rijsq = dijx * dijx + dijy * dijy + dijz * dijz; + flt_t wj = 1, dwj = 0; + flt_t rij = 0; + if (rijsq >= rcminsq[itype][jtype]) { + rij = overloaded::sqrt(rijsq); + wj = Sp(rij, rcmin[itype][jtype], rcmax[itype][jtype], &dwj); + } + int attempt = 0; + int start_hash_slot = aut_lj_tap_hash_fn(j, attempt); + int hash_slot = start_hash_slot; + while (result->i[hash_slot] == i && result->j[hash_slot] != j && + attempt < OPT_TEST_PATH_SIZE) { + hash_slot = aut_lj_tap_hash_fn(j, ++attempt); + } + if (attempt >= OPT_TEST_PATH_SIZE) goto exceed_limits; + bool init_slot = result->i[hash_slot] != i; + if (init_slot || (1 - wj < result->cij[hash_slot])) { + result->i[hash_slot] = i; + result->j[hash_slot] = j; + result->cij[hash_slot] = 1 - wj; + if (wj != 1.0) { + if (path_insert_pos >= OPT_TEST_PATH_ITEMS) goto exceed_limits; + result->testpath_idx[hash_slot] = path_insert_pos; + LennardJonesPathAIREBOT *path = + &result->testpath[path_insert_pos++]; + path->num = 2; + path->del[0].x = dijx; + path->del[0].y = dijy; + path->del[0].z = dijz; + if (rij == 0) rij = sqrt(rijsq); + path->r[0] = rij; + path->w[0] = wj; + path->dw[0] = dwj; + path->idx[0] = i; + path->idx[1] = j; + } + } + int * neighs_j = &ka->neigh_rebo.entries[ka->neigh_rebo.offset[j]]; + for (int kk = 0; kk < ka->neigh_rebo.num[j]; kk++) { + int k = neighs_j[kk]; + if (k == i) continue; + int ktype = map[x[k].w]; + flt_t djkx = x[k].x - x[j].x; + flt_t djky = x[k].y - x[j].y; + flt_t djkz = x[k].z - x[j].z; + flt_t rjksq = djkx * djkx + djky * djky + djkz * djkz; + flt_t wk = 1, dwk = 0; + flt_t rjk = 0; + if (rjksq >= rcminsq[jtype][ktype]) { + rjk = 
overloaded::sqrt(rjksq); + wk = Sp(rjk, rcmin[jtype][ktype], rcmax[jtype][ktype], &dwk); + } + int attempt = 0; + int start_hash_slot = aut_lj_tap_hash_fn(k, attempt); + int hash_slot = start_hash_slot; + while (result->i[hash_slot] == i && result->j[hash_slot] != k && + attempt < OPT_TEST_PATH_SIZE) { + hash_slot = aut_lj_tap_hash_fn(k, ++attempt); + } + if (attempt >= OPT_TEST_PATH_SIZE) goto exceed_limits; + bool init_slot = result->i[hash_slot] != i; + if (init_slot || (1 - wj * wk < result->cij[hash_slot])) { + result->i[hash_slot] = i; + result->j[hash_slot] = k; + result->cij[hash_slot] = 1 - wj * wk; + if (wj * wk != 1.0) { + if (path_insert_pos >= OPT_TEST_PATH_ITEMS) goto exceed_limits; + result->testpath_idx[hash_slot] = path_insert_pos; + LennardJonesPathAIREBOT *path = + &result->testpath[path_insert_pos++]; + path->num = 3; + path->del[0].x = dijx; + path->del[0].y = dijy; + path->del[0].z = dijz; + if (rij == 0) rij = sqrt(rijsq); + path->r[0] = rij; + path->del[1].x = djkx; + path->del[1].y = djky; + path->del[1].z = djkz; + if (rjk == 0) rjk = sqrt(rjksq); + path->r[1] = rjk; + path->w[0] = wj; + path->dw[0] = dwj; + path->w[1] = wk; + path->dw[1] = dwk; + path->idx[0] = i; + path->idx[1] = j; + path->idx[2] = k; + } + } + int * neighs_k = &ka->neigh_rebo.entries[ka->neigh_rebo.offset[k]]; + for (int ll = 0; ll < ka->neigh_rebo.num[k]; ll++) { + int l = neighs_k[ll]; + if ((l == i) || (l == j)) continue; + int ltype = map[x[l].w]; + flt_t dklx = x[l].x - x[k].x; + flt_t dkly = x[l].y - x[k].y; + flt_t dklz = x[l].z - x[k].z; + flt_t rklsq = dklx * dklx + dkly * dkly + dklz * dklz; + flt_t wl = 1, dwl = 0; + flt_t rkl = 0; + if (rklsq >= rcminsq[ktype][ltype]) { + rkl = overloaded::sqrt(rklsq); + wl = Sp(rkl, rcmin[ktype][ltype], rcmax[ktype][ltype], &dwl); + } + int attempt = 0; + int start_hash_slot = aut_lj_tap_hash_fn(l, attempt); + int hash_slot = start_hash_slot; + while (result->i[hash_slot] == i && result->j[hash_slot] != l && + attempt < OPT_TEST_PATH_SIZE) { + hash_slot = aut_lj_tap_hash_fn(l, ++attempt); + } + if (attempt >= OPT_TEST_PATH_SIZE) goto exceed_limits; + bool init_slot = result->i[hash_slot] != i; + if (init_slot || (1 - wj * wk * wl < result->cij[hash_slot])) { + result->i[hash_slot] = i; + result->j[hash_slot] = l; + result->cij[hash_slot] = 1 - wj * wk * wl; + if (wj * wk * wl != 1.0) { + if (path_insert_pos >= OPT_TEST_PATH_ITEMS) goto exceed_limits; + result->testpath_idx[hash_slot] = path_insert_pos; + LennardJonesPathAIREBOT *path = + &result->testpath[path_insert_pos++]; + path->num = 4; + path->del[0].x = dijx; + path->del[0].y = dijy; + path->del[0].z = dijz; + if (rij == 0) rij = sqrt(rijsq); + path->r[0] = rij; + path->del[1].x = djkx; + path->del[1].y = djky; + path->del[1].z = djkz; + if (rjk == 0) rjk = sqrt(rjksq); + path->r[1] = rjk; + path->del[2].x = dklx; + path->del[2].y = dkly; + path->del[2].z = dklz; + if (rkl == 0) rkl = sqrt(rklsq); + path->r[2] = rkl; + path->w[0] = wj; + path->dw[0] = dwj; + path->w[1] = wk; + path->dw[1] = dwk; + path->w[2] = wl; + path->dw[2] = dwl; + path->idx[0] = i; + path->idx[1] = j; + path->idx[2] = k; + path->idx[3] = l; + } + } + } + } + } + return true; +exceed_limits: + return false; +} + +/* + * Attempt to look up an element in the hash-map. 
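+ * All lanes probe independently: a lane leaves the loop once its slot
+ * matches (i, j), or once its probe chain reaches a slot owned by a
+ * different i (not found); the remaining lanes rehash with an
+ * incremented attempt counter. Per lane this is equivalent to the
+ * scalar lookup_slot() sketch above. Lanes for which no entry is found
+ * keep the default cij = 1.0, i.e. the pair has no intervening REBO
+ * path and the full Lennard-Jones weight applies.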
+ */
+static fvec aut_airebo_lj_tap_test_path(KernelArgsAIREBOT * ka,
+    struct aut_airebo_lj_test_path_result_data * test_path_result,
+    bvec need_search, ivec i_bc, ivec j,
+    LennardJonesPathAIREBOT path[fvec::VL]
+) {
+  const ivec c_i1 = ivec::set1(1);
+  fvec cij = fvec::set1(1.0);
+  // first round: hash all j
+  // lookup i/j in hash list.
+  // if i matches and j matches: congrats
+  // if i matches and j does not: look up attempts
+  //   if attempts > current_attempts:
+  //     do another round of hashing
+  // for all those found:
+
+  // fill in the path
+  // -----------------------------------------------
+  // find all the correct hash slots, and a mask of where found.
+  ivec attempt = ivec::setzero();
+  ivec hash_slot = aut_airebo_lj_tap_hash_fn_vec(j, attempt);
+  ivec lookup_i = ivec::mask_gather(ivec::undefined(), need_search, hash_slot,
+                                    &test_path_result->i[0], sizeof(int));
+  bvec correct_i = ivec::mask_cmpeq(need_search, lookup_i, i_bc);
+  ivec lookup_j = ivec::mask_gather(ivec::undefined(), correct_i, hash_slot,
+                                    &test_path_result->j[0], sizeof(int));
+  bvec found_items = ivec::mask_cmpeq(correct_i, lookup_j, j);
+  bvec another_attempt = correct_i & ~ found_items;
+  while (bvec::test_any_set(another_attempt)) {
+    attempt = ivec::mask_add(attempt, another_attempt, attempt, c_i1);
+    hash_slot = aut_airebo_lj_tap_hash_fn_vec(j, attempt);
+    ivec lookup_i_2 = ivec::mask_gather(lookup_i, another_attempt, hash_slot,
+                                        &test_path_result->i[0], sizeof(int));
+    lookup_i = lookup_i_2;
+    correct_i = ivec::mask_cmpeq(need_search, lookup_i, i_bc);
+    lookup_j = ivec::mask_gather(lookup_j, another_attempt, hash_slot,
+                                 &test_path_result->j[0], sizeof(int));
+    found_items = ivec::mask_cmpeq(correct_i, lookup_j, j);
+    another_attempt = correct_i & ~ found_items;
+  }
+  cij = fvec::mask_gather(cij, found_items, hash_slot,
+                          &test_path_result->cij[0], sizeof(flt_t));
+  bvec need_testpath = fvec::mask_cmplt(found_items, fvec::setzero(), cij);
+  if (bvec::test_any_set(need_testpath)) {
+    for (int i = 0; i < fvec::VL; i++) {
+      if (bvec::test_at(need_testpath, i)) {
+        int testpath_idx =
+          test_path_result->testpath_idx[ivec::at(hash_slot, i)];
+        path[i] = test_path_result->testpath[testpath_idx];
+      }
+    }
+  }
+  return cij;
+}
+
+/*
+ * This function calculates the Lennard-Jones interaction for those
+ * elements that require a bond-order calculation.
+ * It is structured similarly to the aut_frebo_batch_of_kind function.
+ * The forces due to bond orders are calculated speculatively and later
+ * updated with the correct outer derivative.
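+ * In terms of the switching functions Str (distance) and Stb (bond
+ * order) and the path weight cij, the energy assembled below is
+ *   E = cij * VLJ * (Str * Stb + 1 - Str),
+ * which reduces to the bare Lennard-Jones term cij * VLJ where Str = 0
+ * and to the fully bond-order-modulated cij * Stb * VLJ where Str = 1.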
+ */ +template +static void aut_lj_with_bo( + KernelArgsAIREBOT * ka, + int itype, int jtype, + ivec i, ivec j, + fvec cij, LennardJonesPathAIREBOT testpath[fvec::VL] +) { + { // jump-scope for exceed_limits + AtomAIREBOT * _noalias x = ka->x; + ResultForceT * result_f = ka->result_f; + + ivec c_i4 = ivec::set1(4); + fvec c_1_0 = fvec::set1(1.0); + fvec c_2_0 = fvec::set1(2.0); + fvec c_0_5 = fvec::set1(0.5); + + fvec x_i, y_i, z_i; + aut_loadatoms_vec_notype(x, i, &x_i, &y_i, &z_i); + fvec x_j, y_j, z_j; + aut_loadatoms_vec_notype(x, j, &x_j, &y_j, &z_j); + fvec delx = x_i - x_j; + fvec dely = y_i - y_j; + fvec delz = z_i - z_j; + fvec rsq = delx * delx + dely * dely + delz * delz; + + fvec rij = fvec::sqrt(rsq); + bvec need_path_force = fvec::cmplt(cij, c_1_0); + flt_t sigcut = ka->params.sigcut; + flt_t sigmin = ka->params.sigmin; + flt_t sigma = ka->params.sigma[itype][jtype]; + flt_t rljmax = sigcut * sigma; + flt_t rljmin = sigmin * sigma; + fvec p_rljmin = fvec::set1(rljmin); + fvec p_rljmax = fvec::set1(rljmax); + + fvec dslw, slw = aut_Sp2_deriv(rij, p_rljmin, p_rljmax, &dslw); + + fvec p_lj1 = fvec::set1(ka->params.lj1[itype][jtype]); + fvec p_lj2 = fvec::set1(ka->params.lj2[itype][jtype]); + fvec p_lj3 = fvec::set1(ka->params.lj3[itype][jtype]); + fvec p_lj4 = fvec::set1(ka->params.lj4[itype][jtype]); + + fvec r2inv = fvec::recip(rsq); + + fvec vdw, dvdw; + if (MORSEFLAG) { + fvec exr = fvec::exp(fvec::setzero() - rij * p_lj4); + vdw = p_lj1 * exr * (p_lj2 * exr - c_2_0); + dvdw = p_lj3 * exr * (c_1_0 - p_lj2 * exr); + } else { + fvec r6inv = r2inv * r2inv * r2inv; + + vdw = r6inv * ( p_lj3 * r6inv - p_lj4); + fvec r7inv = r6inv * rij * r2inv; + dvdw = r7inv * ( p_lj2 - p_lj1 * r6inv); + } + + fvec VLJ = vdw * slw; + fvec dVLJ = dvdw * slw + vdw * dslw; + + fvec p_rcLJmin = fvec::set1(ka->params.rcLJmin[itype][jtype]); + fvec p_rcLJmax = fvec::set1(ka->params.rcLJmax[itype][jtype]); + fvec dStr, Str = aut_Sp2_deriv(rij, p_rcLJmin, p_rcLJmax, &dStr); + fvec VA = cij * VLJ * Str; + + fvec fij[3], fji[3]; + fij[0] = fvec::setzero(); fij[1] = fvec::setzero(); + fij[2] = fvec::setzero(); + fji[0] = fvec::setzero(); fji[1] = fvec::setzero(); + fji[2] = fvec::setzero(); + + ivec vi = i; + ivec vj = j; + + struct aut_frebo_data i_data, j_data; + i_data.x_i = x_i; + i_data.y_i = y_i; + i_data.z_i = z_i; + i_data.x_j = x_j; + i_data.y_j = y_j; + i_data.z_j = z_j; + j_data.x_i = x_j; + j_data.y_i = y_j; + j_data.z_i = z_j; + j_data.x_j = x_i; + j_data.y_j = y_i; + j_data.z_j = z_i; + + fvec p_rcmin = fvec::set1(ka->params.rcmin[itype][jtype]); + fvec p_rcmax = fvec::set1(ka->params.rcmax[itype][jtype]); + fvec dwij; + fvec wij = aut_Sp_deriv(rij, p_rcmin, p_rcmax, &dwij); + + fvec nHi = fvec::gather(vi, ka->nH, sizeof(flt_t)); + fvec nCi = fvec::gather(vi, ka->nC, sizeof(flt_t)); + fvec nHj = fvec::gather(vj, ka->nH, sizeof(flt_t)); + fvec nCj = fvec::gather(vj, ka->nC, sizeof(flt_t)); + fvec Nij = nHi + nCi - wij; + fvec Nji = nHj + nCj - wij; + i_data.nHi = nHi; + i_data.nCi = nCi; + j_data.nHi = nHj; + j_data.nCi = nCj; + + fvec the_r = fvec::set1(ka->params.rcmin[itype][jtype]); + fvec scale = the_r / rij; + + fvec NconjtmpI; + fvec pij = aut_frebo_pij_pd_2(ka, &i_data, itype, jtype, vi, vj, + delx * scale, dely * scale, delz * scale, + the_r, wij, VA, &NconjtmpI, fij); + + if (i_data.buf_len < 0) goto exceed_limits; + + fvec NconjtmpJ; + fvec rjix = fvec::setzero() - delx; + fvec rjiy = fvec::setzero() - dely; + fvec rjiz = fvec::setzero() - delz; + fvec pji = aut_frebo_pij_pd_2(ka, 
&j_data, jtype, itype, vj, vi, + rjix * scale, rjiy * scale, rjiz * scale, + the_r, wij, VA, &NconjtmpJ, fji); + fij[0] = fij[0] - fji[0]; + fij[1] = fij[1] - fji[1]; + fij[2] = fij[2] - fji[2]; + + if (j_data.buf_len < 0) goto exceed_limits; + + fvec Nijconj = c_1_0 + NconjtmpI * NconjtmpI + NconjtmpJ * NconjtmpJ; + fvec dN3[3]; + fvec pi_rc = aut_frebo_pi_rc_pd(ka, itype, jtype, Nij, Nji, Nijconj, dN3); + + fvec c_TOL = fvec::set1(TOL); + fvec dN3_dh[3]; + fvec Tij = aut_frebo_Tij(ka, itype, jtype, Nij, Nji, Nijconj, &dN3_dh[0]); + bvec TijgtTOLmask = fvec::cmpnle(fvec::abs(Tij), c_TOL); + fvec sum_omega = fvec::setzero(); + if (bvec::test_any_set(TijgtTOLmask)) { + sum_omega = aut_frebo_sum_omega( + ka, &i_data, &j_data, itype, jtype, vi, vj, + delx * scale, dely * scale, delz * scale, the_r, VA * Tij, fij); + sum_omega = fvec::mask_blend(TijgtTOLmask, fvec::setzero(), sum_omega); + } + fvec pi_dh = Tij * sum_omega; + + fvec bij = c_0_5 * ( pij + pji) + pi_rc + pi_dh; + + fvec p_bLJmin = fvec::set1(ka->params.bLJmin[itype][jtype]); + fvec p_bLJmax = fvec::set1(ka->params.bLJmax[itype][jtype]); + fvec dStb, Stb = aut_Sp2_deriv(bij, p_bLJmin, p_bLJmax, &dStb); + + bvec need_bo_deriv = fvec::cmpneq(dStb, fvec::setzero()); + // fix up j_data, i_data, fij: + // multiply each by dStb + if (bvec::test_any_set(need_bo_deriv)) { + i_data.force_i_x = dStb * i_data.force_i_x; + i_data.force_i_y = dStb * i_data.force_i_y; + i_data.force_i_z = dStb * i_data.force_i_z; + i_data.force_j_x = dStb * i_data.force_j_x; + i_data.force_j_y = dStb * i_data.force_j_y; + i_data.force_j_z = dStb * i_data.force_j_z; + j_data.force_i_x = dStb * j_data.force_i_x; + j_data.force_i_y = dStb * j_data.force_i_y; + j_data.force_i_z = dStb * j_data.force_i_z; + j_data.force_j_x = dStb * j_data.force_j_x; + j_data.force_j_y = dStb * j_data.force_j_y; + j_data.force_j_z = dStb * j_data.force_j_z; + for (int k = 0; k < i_data.buf_len; k++) { + i_data.force_k_x_buf[k] = dStb * i_data.force_k_x_buf[k]; + i_data.force_k_y_buf[k] = dStb * i_data.force_k_y_buf[k]; + i_data.force_k_z_buf[k] = dStb * i_data.force_k_z_buf[k]; + } + for (int k = 0; k < j_data.buf_len; k++) { + j_data.force_k_x_buf[k] = dStb * j_data.force_k_x_buf[k]; + j_data.force_k_y_buf[k] = dStb * j_data.force_k_y_buf[k]; + j_data.force_k_z_buf[k] = dStb * j_data.force_k_z_buf[k]; + } + fvec fijc[3]; + fijc[0] = dStb * fij[0]; + fijc[1] = dStb * fij[1]; + fijc[2] = dStb * fij[2]; + fij[0] = scale * (fijc[0] - (delx * delx * fijc[0] + dely * delx * + fijc[1] + delz * delx * fijc[2]) / rsq); + fij[1] = scale * (fijc[1] - (delx * dely * fijc[0] + dely * dely * + fijc[1] + delz * dely * fijc[2]) / rsq); + fij[2] = scale * (fijc[2] - (delx * delz * fijc[0] + dely * delz * + fijc[1] + delz * delz * fijc[2]) / rsq); + + aut_frebo_N_spline_force(ka, &i_data, itype, jtype, vi, vj, dStb * VA, + dN3[0], dN3[2], NconjtmpI); + aut_frebo_N_spline_force(ka, &j_data, jtype, itype, vj, vi, dStb * VA, + dN3[1], dN3[2], NconjtmpJ); + if (bvec::test_any_set(TijgtTOLmask)) { + aut_frebo_N_spline_force(ka, &i_data, itype, jtype, vi, vj, + dStb * VA * sum_omega, dN3_dh[0], dN3_dh[2], + NconjtmpI); + aut_frebo_N_spline_force(ka, &j_data, jtype, itype, vj, vi, + dStb * VA * sum_omega, dN3_dh[1], dN3_dh[2], + NconjtmpJ); + } + + aut_frebo_data_writeback(ka, &i_data); + aut_frebo_data_writeback(ka, &j_data); + } else { + fij[0] = fvec::setzero(); + fij[1] = fvec::setzero(); + fij[2] = fvec::setzero(); + } + + fvec fpdVLJ = cij * dVLJ * ( c_1_0 + Str * ( Stb - c_1_0)); + fvec fpdStr = 
dStr * cij * ( Stb * VLJ - VLJ);
+    fvec fpair = r2inv * rij * ( fvec::setzero() - ( fpdVLJ + fpdStr));
+    fvec evdwl = VA * Stb + cij * VLJ * ( c_1_0 - Str);
+
+    fvec result_f_i_x = fpair * delx + fij[0];
+    fvec result_f_i_y = fpair * dely + fij[1];
+    fvec result_f_i_z = fpair * delz + fij[2];
+    fvec result_f_j_x = fvec::setzero() - result_f_i_x;
+    fvec result_f_j_y = fvec::setzero() - result_f_i_y;
+    fvec result_f_j_z = fvec::setzero() - result_f_i_z;
+
+    flt_t fi_x_buf[fvec::VL] __attribute__((aligned(64)));
+    flt_t fi_y_buf[fvec::VL] __attribute__((aligned(64)));
+    flt_t fi_z_buf[fvec::VL] __attribute__((aligned(64)));
+    int fi_i_buf[ivec::VL] __attribute__((aligned(64)));
+    flt_t fj_x_buf[fvec::VL] __attribute__((aligned(64)));
+    flt_t fj_y_buf[fvec::VL] __attribute__((aligned(64)));
+    flt_t fj_z_buf[fvec::VL] __attribute__((aligned(64)));
+    int fj_j_buf[ivec::VL] __attribute__((aligned(64)));
+    flt_t evdwl_buf[fvec::VL] __attribute__((aligned(64)));
+
+    if (bvec::test_any_set(need_bo_deriv)) {
+      result_f_i_x = i_data.force_i_x + result_f_i_x;
+      result_f_i_y = i_data.force_i_y + result_f_i_y;
+      result_f_i_z = i_data.force_i_z + result_f_i_z;
+      result_f_j_x = i_data.force_j_x + result_f_j_x;
+      result_f_j_y = i_data.force_j_y + result_f_j_y;
+      result_f_j_z = i_data.force_j_z + result_f_j_z;
+
+      result_f_i_x = j_data.force_j_x + result_f_i_x;
+      result_f_i_y = j_data.force_j_y + result_f_i_y;
+      result_f_i_z = j_data.force_j_z + result_f_i_z;
+      result_f_j_x = j_data.force_i_x + result_f_j_x;
+      result_f_j_y = j_data.force_i_y + result_f_j_y;
+      result_f_j_z = j_data.force_i_z + result_f_j_z;
+    }
+
+    fvec::store(fi_x_buf, result_f_i_x);
+    fvec::store(fi_y_buf, result_f_i_y);
+    fvec::store(fi_z_buf, result_f_i_z);
+    ivec::store(fi_i_buf, vi);
+    fvec::store(fj_x_buf, result_f_j_x);
+    fvec::store(fj_y_buf, result_f_j_y);
+    fvec::store(fj_z_buf, result_f_j_z);
+    ivec::store(fj_j_buf, vj);
+    fvec::store(evdwl_buf, evdwl);
+
+    int lane;
+    for (lane = 0; lane < fvec::VL; lane++) {
+      int ii = fi_i_buf[lane];
+      result_f[ii].x += fi_x_buf[lane];
+      result_f[ii].y += fi_y_buf[lane];
+      result_f[ii].z += fi_z_buf[lane];
+      result_f[ii].w += 0.5 * evdwl_buf[lane];
+      int jj = fj_j_buf[lane];
+      result_f[jj].x += fj_x_buf[lane];
+      result_f[jj].y += fj_y_buf[lane];
+      result_f[jj].z += fj_z_buf[lane];
+      result_f[jj].w += 0.5 * evdwl_buf[lane];
+    }
+    ka->result_eng += fvec::reduce_add(evdwl);
+
+    if (bvec::test_any_set(need_path_force)) {
+      fvec dC = VLJ * ( Str * Stb + c_1_0 - Str);
+      aut_airebo_lj_force_path(ka, need_path_force, dC, testpath);
+    }
+    return;
+  }
+exceed_limits:
+  for (int l = 0; l < fvec::VL; l++) {
+    ref_lennard_jones_single_interaction(ka, ivec::at(i, l), ivec::at(j, l),
+                                         MORSEFLAG);
+  }
+  return;
+}
+
+/*
+ * Calculate the Lennard-Jones interaction.
+ * Uses the above hash-map, and outlines (moves out of line) the
+ * calculation if the bond order is needed.
+ * Aggressively compresses to get the most values calculated.
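+ * Candidate neighbors of i are screened against the cutoff; survivors
+ * are compressed into a staging vector (j_2 and friends) until all
+ * fvec::VL lanes are occupied, and only full vectors enter the path
+ * search and the outlined bond-order kernel. Schematically (scalar
+ * pseudo-code of the batching, for illustration only):
+ *
+ *   for each candidate j of atom i:
+ *     if (rsq < cutoff) staging[count++] = j;
+ *     if (count == fvec::VL) { process_full_vector(staging); count = 0; }
+ *   // leftovers are flushed through the scalar reference kernels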
+ */ +template +static void aut_lennard_jones(KernelArgsAIREBOT * ka) { + AtomAIREBOT * x = ka->x; + int * tag = ka->tag; + int * map = ka->map; + ResultForceT * result_f = ka->result_f; + ivec c_i1 = ivec::set1(1); + ivec c_i4 = ivec::set1(4); + fvec c_1_0 = fvec::set1(1.0); + fvec c_2_0 = fvec::set1(2.0); + fvec c_0_0 = fvec::set1(0.0); + int map_i_scalar = 0; + { + int i; + for (i = 1; i < ka->num_types; i++) { + if (ka->map[i]) + map_i_scalar |= (1 << i); + } + } + ivec map_i = ivec::set1(map_i_scalar); + fvec result_eng = fvec::setzero(); + + struct aut_airebo_lj_test_path_result_data test_path_result; + for (int i = 0; i < OPT_TEST_PATH_SIZE; i++) { + test_path_result.i[i] = -1; + } + + ivec i_bo[2][2]; + ivec j_bo[2][2]; + fvec cij_bo[2][2]; + LennardJonesPathAIREBOT testpath_bo[2][2][fvec::VL]; + int num_bo[2][2] = {0}; + + for (int i = ka->frebo_from_atom; i < ka->frebo_to_atom; i++) { + ivec itag_bc = ivec::set1(tag[i]); + int itype = map[x[i].w]; + fvec x_i = fvec::set1(x[i].x); + fvec y_i = fvec::set1(x[i].y); + fvec z_i = fvec::set1(x[i].z); + ivec i_bc = ivec::set1(i); + + fvec cutljsq0 = fvec::set1(ka->params.cutljsq[itype][0]); + fvec cutljsq1 = fvec::set1(ka->params.cutljsq[itype][1]); + fvec p_rcmax0 = fvec::set1(ka->params.rcmax[itype][0]); + fvec p_rcmax1 = fvec::set1(ka->params.rcmax[itype][1]); + flt_t sigcut = ka->params.sigcut; + flt_t sigmin = ka->params.sigmin; + flt_t sigma0 = ka->params.sigma[itype][0]; + flt_t rljmax0 = sigcut * sigma0; + flt_t rljmin0 = sigmin * sigma0; + flt_t sigma1 = ka->params.sigma[itype][1]; + flt_t rljmax1 = sigcut * sigma1; + flt_t rljmin1 = sigmin * sigma1; + fvec p_rljmax0 = fvec::set1(rljmax0); + fvec p_rljmax1 = fvec::set1(rljmax1); + fvec p_rljmin0 = fvec::set1(rljmin0); + fvec p_rljmin1 = fvec::set1(rljmin1); + fvec p_rcLJmax0 = fvec::set1(ka->params.rcLJmax[itype][0]); + fvec p_rcLJmax1 = fvec::set1(ka->params.rcLJmax[itype][1]); + fvec p_rcLJmin0 = fvec::set1(ka->params.rcLJmin[itype][0]); + fvec p_rcLJmin1 = fvec::set1(ka->params.rcLJmin[itype][1]); + fvec p_lj10 = fvec::set1(ka->params.lj1[itype][0]); + fvec p_lj11 = fvec::set1(ka->params.lj1[itype][1]); + fvec p_lj20 = fvec::set1(ka->params.lj2[itype][0]); + fvec p_lj21 = fvec::set1(ka->params.lj2[itype][1]); + fvec p_lj30 = fvec::set1(ka->params.lj3[itype][0]); + fvec p_lj31 = fvec::set1(ka->params.lj3[itype][1]); + fvec p_lj40 = fvec::set1(ka->params.lj4[itype][0]); + fvec p_lj41 = fvec::set1(ka->params.lj4[itype][1]); + + int * neighs = ka->neigh_lmp.entries + ka->neigh_lmp.offset[i]; + int jnum = ka->neigh_lmp.num_half[i]; + + bool tap_success = aut_airebo_lj_test_all_paths(ka, i, &test_path_result); + if (! 
tap_success) { + for (int jj = 0; jj < jnum; jj++) { + ref_lennard_jones_single_interaction(ka, i, neighs[jj], MORSEFLAG); + } + continue; + } + + ivec j_2; + fvec delx_2, dely_2, delz_2, rsq_2; + bvec jtype_mask_2; + int num_2 = 0; + + fvec result_f_i_x = fvec::setzero(); + fvec result_f_i_y = fvec::setzero(); + fvec result_f_i_z = fvec::setzero(); + + int jj = 0; + bool rest_j = jj < jnum; + bool rest_2 = fvec::fast_compress(); + #pragma forceinline recursive + while (rest_j || rest_2) { + fvec delx, dely, delz, rsq; + bvec jtype_mask, within_cutoff; + ivec j; + if (rest_j) { + bvec mask_0 = bvec::full(); + //0xFF >> (8 - (jnum - jj)); + if (jj + (fvec::VL - 1) >= jnum) mask_0 = bvec::only(jnum - jj); + j = ivec::maskz_loadu(mask_0, &neighs[jj]); + fvec x_j, y_j, z_j; + aut_loadatoms_vec(x, j, &x_j, &y_j, &z_j, &jtype_mask, map, map_i, + c_i1); + fvec::gather_prefetch0(ivec::mullo(c_i4, + ivec::maskz_loadu(bvec::full(), &neighs[jj + fvec::VL])), x); + _mm_prefetch((const char*)&neighs[jj + 2 * fvec::VL], _MM_HINT_T0); + delx = x_i - x_j; + dely = y_i - y_j; + delz = z_i - z_j; + rsq = delx * delx + dely * dely + delz * delz; + fvec cutoff_sq = fvec::mask_blend(jtype_mask, cutljsq0, cutljsq1); + within_cutoff = fvec::mask_cmplt(mask_0, rsq, cutoff_sq); + + if (fvec::fast_compress()) { + j = ivec::masku_compress(within_cutoff, j); + delx = fvec::masku_compress(within_cutoff, delx); + dely = fvec::masku_compress(within_cutoff, dely); + delz = fvec::masku_compress(within_cutoff, delz); + rsq = fvec::masku_compress(within_cutoff, rsq); + jtype_mask = bvec::masku_compress(within_cutoff, jtype_mask); + //within_cutoff = 0xFF >> (8 - _cc_popcnt(within_cutoff)); + + bvec mask_2 = bvec::after(num_2);//0xFF << num_2; + j_2 = ivec::mask_expand(j_2, mask_2, j); + delx_2 = fvec::mask_expand(delx_2, mask_2, delx); + dely_2 = fvec::mask_expand(dely_2, mask_2, dely); + delz_2 = fvec::mask_expand(delz_2, mask_2, delz); + rsq_2 = fvec::mask_expand(rsq_2, mask_2, rsq); + jtype_mask_2 = bvec::mask_expand(jtype_mask_2, mask_2, jtype_mask); + num_2 = num_2 + bvec::popcnt(within_cutoff); + if (num_2 < fvec::VL) { + jj += fvec::VL; + rest_j = jj < jnum; + continue; + } + + num_2 -= fvec::VL; + //(0xFF >> (8 - num_2)) << (_cc_popcnt(within_cutoff) - num_2); + mask_2 = bvec::onlyafter(num_2, bvec::popcnt(within_cutoff) - num_2); + { + ivec tmp_j = j_2; + j_2 = ivec::masku_compress(mask_2, j); + j = tmp_j; + fvec tmp_delx = delx_2; + delx_2 = fvec::masku_compress(mask_2, delx); + delx = tmp_delx; + fvec tmp_dely = dely_2; + dely_2 = fvec::masku_compress(mask_2, dely); + dely = tmp_dely; + fvec tmp_delz = delz_2; + delz_2 = fvec::masku_compress(mask_2, delz); + delz = tmp_delz; + fvec tmp_rsq = rsq_2; + rsq_2 = fvec::masku_compress(mask_2, rsq); + rsq = tmp_rsq; + bvec tmp_jtype_mask = jtype_mask_2; + jtype_mask_2 = bvec::masku_compress(mask_2, jtype_mask); + jtype_mask = tmp_jtype_mask; + within_cutoff = bvec::full(); + } + } + } else if (rest_2) { + rest_2 = false; + j = j_2; + delx = delx_2; + dely = dely_2; + delz = delz_2; + rsq = rsq_2; + jtype_mask = jtype_mask_2; + within_cutoff = bvec::only(num_2); + num_2 = 0; + } + + bvec current_mask = within_cutoff; + if (bvec::test_all_unset(current_mask)) { + jj += fvec::VL; + rest_j = jj < jnum; + continue; + } + + fvec rij = fvec::sqrt(rsq); + LennardJonesPathAIREBOT testpath[fvec::VL]; + fvec cij = c_1_0; + fvec p_cut3rebo = fvec::set1(ka->params.cut3rebo); + bvec need_search = fvec::mask_cmplt(current_mask, rij, p_cut3rebo); + if (bvec::test_any_set(need_search)) 
{ + fvec p_rcmax = fvec::mask_blend(jtype_mask, p_rcmax0, p_rcmax1); + #pragma noinline + cij = aut_airebo_lj_tap_test_path(ka, &test_path_result, need_search, + i_bc, j, testpath); + } + current_mask = fvec::mask_cmplt(current_mask, c_0_0, cij); + if (bvec::test_all_unset(current_mask)) { + jj += fvec::VL; + rest_j = jj < jnum; + continue; + } + bvec need_path_force = fvec::mask_cmplt(current_mask, cij, c_1_0); + + fvec p_rljmax = fvec::mask_blend(jtype_mask, p_rljmax0, p_rljmax1); + fvec p_rljmin = fvec::mask_blend(jtype_mask, p_rljmin0, p_rljmin1); + + fvec dslw, slw = aut_Sp2_deriv(rij, p_rljmin, p_rljmax, &dslw); + + fvec p_lj1 = fvec::mask_blend(jtype_mask, p_lj10, p_lj11); + fvec p_lj2 = fvec::mask_blend(jtype_mask, p_lj20, p_lj21); + fvec p_lj3 = fvec::mask_blend(jtype_mask, p_lj30, p_lj31); + fvec p_lj4 = fvec::mask_blend(jtype_mask, p_lj40, p_lj41); + + fvec vdw, dvdw; + + fvec r2inv = fvec::recip(rsq); + + if (MORSEFLAG) { + fvec exr = fvec::exp(fvec::setzero() - rij * p_lj4); + vdw = p_lj1 * exr * (p_lj2 * exr - c_2_0); + dvdw = p_lj3 * exr * (c_1_0 - p_lj2 * exr); + } else { + fvec r6inv = r2inv * r2inv * r2inv; + + vdw = r6inv * ( p_lj3 * r6inv - p_lj4); + fvec r7inv = r6inv * rij * r2inv; + dvdw = r7inv * ( p_lj2 - p_lj1 * r6inv); + } + + fvec VLJ = vdw * slw; + fvec dVLJ = dvdw * slw + vdw * dslw; + + fvec p_rcLJmin = fvec::mask_blend(jtype_mask, p_rcLJmin0, p_rcLJmin1); + fvec p_rcLJmax = fvec::mask_blend(jtype_mask, p_rcLJmax0, p_rcLJmax1); + fvec dStr, Str = aut_Sp2_deriv(rij, p_rcLJmin, p_rcLJmax, &dStr); + fvec VA = cij * VLJ * Str; + bvec need_bondorder = fvec::mask_cmplt(current_mask, c_0_0, Str); + fvec Stb = fvec::setzero(); + fvec fij[3]; + fij[0] = fvec::setzero(); + fij[1] = fvec::setzero(); + fij[2] = fvec::setzero(); + if (bvec::test_any_set(need_bondorder)) { + for (int jtype = 0; jtype < 2; jtype++) { + bvec need_bo_with_jtype = need_bondorder; + if (jtype) need_bo_with_jtype = need_bo_with_jtype & jtype_mask; + else need_bo_with_jtype = need_bo_with_jtype & ~ jtype_mask; + ivec jtmp = ivec::masku_compress(need_bo_with_jtype, j); + ivec itmp = ivec::masku_compress(need_bo_with_jtype, ivec::set1(i)); + fvec cijtmp = fvec::masku_compress(need_bo_with_jtype, cij); + bvec insert_mask = bvec::after(num_bo[itype][jtype]); + i_bo[itype][jtype] = ivec::mask_expand(i_bo[itype][jtype], + insert_mask, itmp); + j_bo[itype][jtype] = ivec::mask_expand(j_bo[itype][jtype], + insert_mask, jtmp); + cij_bo[itype][jtype] = fvec::mask_expand(cij_bo[itype][jtype], + insert_mask, cijtmp); + bvec need_path_force_with_jtype = need_bo_with_jtype & + need_path_force; + int testpath_end = fvec::VL; + if (bvec::test_any_set(need_path_force_with_jtype)) { + int pos = num_bo[itype][jtype]; + for (int l = 0; l < fvec::VL; l++) { + if (pos >= fvec::VL) { + testpath_end = l; + break; + } + if (bvec::test_at(need_path_force_with_jtype, l)) { + testpath_bo[itype][jtype][pos] = testpath[l]; + } + if (bvec::test_at(need_bo_with_jtype, l)) { + pos += 1; + } + } + } + num_bo[itype][jtype] = num_bo[itype][jtype] + + bvec::popcnt(need_bo_with_jtype); + if (num_bo[itype][jtype] >= fvec::VL) { + #pragma noinline + aut_lj_with_bo(ka, itype, jtype, i_bo[itype][jtype], + j_bo[itype][jtype], cij_bo[itype][jtype], + testpath_bo[itype][jtype]); + num_bo[itype][jtype] -= fvec::VL; + insert_mask = bvec::onlyafter(num_bo[itype][jtype], + bvec::popcnt(need_bo_with_jtype) - + num_bo[itype][jtype]); + i_bo[itype][jtype] = ivec::masku_compress(insert_mask, itmp); + j_bo[itype][jtype] = 
ivec::masku_compress(insert_mask, jtmp); + cij_bo[itype][jtype] = fvec::masku_compress(insert_mask, cijtmp); + if (bvec::test_any_set(need_path_force_with_jtype)) { + int pos = 0; + for (int l = testpath_end; l < fvec::VL; l++) { + if (bvec::test_at(need_path_force_with_jtype, l)) { + testpath_bo[itype][jtype][pos] = testpath[l]; + } + if (bvec::test_at(need_bo_with_jtype, l)) { + pos += 1; + } + } + } + } + } + current_mask = current_mask & ~ need_bondorder; + need_path_force = need_path_force & ~ need_bondorder; + } + + fvec fpdVLJ = cij * dVLJ * ( c_1_0 + Str * ( Stb - c_1_0)); + fvec fpdStr = dStr * cij * ( Stb * VLJ - VLJ); + fvec fpair = r2inv * rij * ( fvec::setzero() - ( fpdVLJ + fpdStr)); + fvec evdwl = VA * Stb + cij * VLJ * ( c_1_0 - Str); + + fvec fix = fpair * delx + fij[0]; + fvec fiy = fpair * dely + fij[1]; + fvec fiz = fpair * delz + fij[2]; + result_f_i_x = fvec::mask_add(result_f_i_x, current_mask, result_f_i_x, + fix); + result_f_i_y = fvec::mask_add(result_f_i_y, current_mask, result_f_i_y, + fiy); + result_f_i_z = fvec::mask_add(result_f_i_z, current_mask, result_f_i_z, + fiz); + result_eng = fvec::mask_add(result_eng, current_mask, result_eng, evdwl); + + ivec j_dbl_idx = ivec::mullo(j, c_i4); + avec fjx = avec::mask_gather(avec::undefined(), current_mask, j_dbl_idx, + &ka->result_f[0].x, sizeof(acc_t)); + avec fjy = avec::mask_gather(avec::undefined(), current_mask, j_dbl_idx, + &ka->result_f[0].y, sizeof(acc_t)); + avec fjz = avec::mask_gather(avec::undefined(), current_mask, j_dbl_idx, + &ka->result_f[0].z, sizeof(acc_t)); + + fjx = fjx - fix; + fjy = fjy - fiy; + fjz = fjz - fiz; + avec::mask_i32loscatter(&ka->result_f[0].x, current_mask, j_dbl_idx, fjx, + sizeof(acc_t)); + avec::mask_i32loscatter(&ka->result_f[0].y, current_mask, j_dbl_idx, fjy, + sizeof(acc_t)); + avec::mask_i32loscatter(&ka->result_f[0].z, current_mask, j_dbl_idx, fjz, + sizeof(acc_t)); + + if (bvec::test_any_set(need_path_force)) { + fvec dC = VLJ * ( Str * Stb + c_1_0 - Str); + #pragma noinline + aut_airebo_lj_force_path(ka, need_path_force, dC, testpath); + } + jj += fvec::VL; + rest_j = jj < jnum; + } + ka->result_f[i].x += fvec::reduce_add(result_f_i_x); + ka->result_f[i].y += fvec::reduce_add(result_f_i_y); + ka->result_f[i].z += fvec::reduce_add(result_f_i_z); + } + for (int itype = 0; itype < 2; itype++) { + for (int jtype = 0; jtype < 2; jtype++) { + for (int l = 0; l < num_bo[itype][jtype]; l++) { + ref_lennard_jones_single_interaction(ka,ivec::at(i_bo[itype][jtype],l), + ivec::at(j_bo[itype][jtype], l), + MORSEFLAG); + } + } + } + ka->result_eng += fvec::reduce_add(result_eng); +} + +}; + +template +void aut_lennard_jones(KernelArgsAIREBOT * ka, int morseflag) { +#ifdef LMP_INTEL_AIREBO_REF + ref_lennard_jones(ka, morseflag); +#else + if (morseflag) { + aut_wrap::template aut_lennard_jones<1>(ka); + } else { + aut_wrap::template aut_lennard_jones<0>(ka); + } +#endif +} + +template +void aut_rebo_neigh(KernelArgsAIREBOT * ka) { +#ifdef LMP_INTEL_AIREBO_REF + ref_rebo_neigh(ka); +#else + aut_wrap::aut_rebo_neigh(ka); +#endif +} + +template +void aut_frebo(KernelArgsAIREBOT * ka, int torsion_flag) { +#ifdef LMP_INTEL_AIREBO_REF + ref_frebo(ka, torsion_flag); +#else + aut_wrap::aut_frebo(ka, torsion_flag); +#endif +} + +#ifdef __INTEL_OFFLOAD +#pragma offload_attribute(pop) +#endif + +} + diff --git a/src/USER-INTEL/pair_airebo_intel.h b/src/USER-INTEL/pair_airebo_intel.h new file mode 100644 index 0000000000..d3179c09f1 --- /dev/null +++ b/src/USER-INTEL/pair_airebo_intel.h @@ -0,0 
+1,110 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Markus Hohnerbach (RWTH) +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(airebo/intel,PairAIREBOIntel) + +#else + +#ifndef LMP_PAIR_AIREBO_INTEL_H +#define LMP_PAIR_AIREBO_INTEL_H + +#include "pair.h" +#include "fix_intel.h" +#include "pair_airebo.h" +//#include "airebo_common.h" + +namespace LAMMPS_NS { + +template +struct PairAIREBOIntelParam; + +class PairAIREBOIntel : public PairAIREBO { + public: + PairAIREBOIntel(class LAMMPS *); + virtual ~PairAIREBOIntel(); + virtual void compute(int, int); + virtual void init_style(); + protected: + + template + void compute(int eflag, int vflag, IntelBuffers *buffers); + + template + void eval(const int offload, const int vflag, + IntelBuffers * buffers, + const int astart, const int aend); + + template + void pack_force_const(IntelBuffers * buffers); + + template + PairAIREBOIntelParam get_param(); + + FixIntel * fix; + int _cop; + + int * REBO_cnumneigh; + int * REBO_num_skin; + int * REBO_list_data; + +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: Illegal ... command + +Self-explanatory. Check the input script syntax and compare to the +documentation for the command. You can use -echo screen as a +command-line option when running LAMMPS to see the offending line. + +E: Incorrect args for pair coefficients + +Self-explanatory. Check the input script or data file. + +E: Pair style AIREBO requires atom IDs + +This is a requirement to use the AIREBO potential. + +E: Pair style AIREBO requires newton pair on + +See the newton command. This is a restriction to use the AIREBO +potential. + +E: All pair coeffs are not set + +All pair coefficients must be set in the data file or by the +pair_coeff command before running a simulation. + +E: Neighbor list overflow, boost neigh_modify one + +There are too many neighbors of a single atom. Use the neigh_modify +command to increase the max number of neighbors allowed for one atom. +You may also want to boost the page size. + +E: Cannot open AIREBO potential file %s + +The specified AIREBO potential file cannot be opened. Check that the +path and name are correct. + +*/ diff --git a/src/USER-INTEL/pair_airebo_morse_intel.cpp b/src/USER-INTEL/pair_airebo_morse_intel.cpp new file mode 100644 index 0000000000..9c0f3b8ed0 --- /dev/null +++ b/src/USER-INTEL/pair_airebo_morse_intel.cpp @@ -0,0 +1,37 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. 
This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Markus Hohnerbach (RWTH) +------------------------------------------------------------------------- */ + +#include "pair_airebo_morse_intel.h" +#include "error.h" + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +PairAIREBOMorseIntel::PairAIREBOMorseIntel(LAMMPS *lmp) + : PairAIREBOIntel(lmp) {} + +/* ---------------------------------------------------------------------- + global settings +------------------------------------------------------------------------- */ + +void PairAIREBOMorseIntel::settings(int narg, char **arg) +{ + PairAIREBOIntel::settings(narg,arg); + + morseflag = 1; +} diff --git a/src/USER-INTEL/pair_airebo_morse_intel.h b/src/USER-INTEL/pair_airebo_morse_intel.h new file mode 100644 index 0000000000..5210ea80ee --- /dev/null +++ b/src/USER-INTEL/pair_airebo_morse_intel.h @@ -0,0 +1,40 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Markus Hohnerbach (RWTH) +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(airebo/morse/intel,PairAIREBOMorseIntel) + +#else + +#ifndef LMP_PAIR_AIREBO_MORSE_INTEL_H +#define LMP_PAIR_AIREBO_MORSE_INTEL_H + +#include "pair_airebo_intel.h" + +namespace LAMMPS_NS { + +class PairAIREBOMorseIntel : public PairAIREBOIntel { + public: + PairAIREBOMorseIntel(class LAMMPS *); + virtual void settings(int, char **); +}; + +} + +#endif +#endif diff --git a/src/USER-INTEL/pair_eam_alloy_intel.cpp b/src/USER-INTEL/pair_eam_alloy_intel.cpp new file mode 100644 index 0000000000..4f47c7ee23 --- /dev/null +++ b/src/USER-INTEL/pair_eam_alloy_intel.cpp @@ -0,0 +1,326 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Stephen Foiles (SNL), Murray Daw (SNL) +------------------------------------------------------------------------- */ + +#include +#include +#include +#include "pair_eam_alloy_intel.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "memory.h" +#include "error.h" + +using namespace LAMMPS_NS; + +#define MAXLINE 1024 + +/* ---------------------------------------------------------------------- */ + +PairEAMAlloyIntel::PairEAMAlloyIntel(LAMMPS *lmp) : PairEAMIntel(lmp) +{ + one_coeff = 1; +} + +/* ---------------------------------------------------------------------- + set coeffs for one or more type pairs + read DYNAMO setfl file +------------------------------------------------------------------------- */ + +void PairEAMAlloyIntel::coeff(int narg, char **arg) +{ + int i,j; + + if (!allocated) allocate(); + + if (narg != 3 + atom->ntypes) + error->all(FLERR,"Incorrect args for pair coefficients"); + + // insure I,J args are * * + + if (strcmp(arg[0],"*") != 0 || strcmp(arg[1],"*") != 0) + error->all(FLERR,"Incorrect args for pair coefficients"); + + // read EAM setfl file + + if (setfl) { + for (i = 0; i < setfl->nelements; i++) delete [] setfl->elements[i]; + delete [] setfl->elements; + delete [] setfl->mass; + memory->destroy(setfl->frho); + memory->destroy(setfl->rhor); + memory->destroy(setfl->z2r); + delete setfl; + } + setfl = new Setfl(); + read_file(arg[2]); + + // read args that map atom types to elements in potential file + // map[i] = which element the Ith atom type is, -1 if NULL + + for (i = 3; i < narg; i++) { + if (strcmp(arg[i],"NULL") == 0) { + map[i-2] = -1; + continue; + } + for (j = 0; j < setfl->nelements; j++) + if (strcmp(arg[i],setfl->elements[j]) == 0) break; + if (j < setfl->nelements) map[i-2] = j; + else error->all(FLERR,"No matching element in EAM potential file"); + } + + // clear setflag since coeff() called once with I,J = * * + + int n = atom->ntypes; + for (i = 1; i <= n; i++) + for (j = i; j <= n; j++) + setflag[i][j] = 0; + + // set setflag i,j for type pairs where both are mapped to elements + // set mass of atom type if i = j + + int count = 0; + for (i = 1; i <= n; i++) { + for (j = i; j <= n; j++) { + if (map[i] >= 0 && map[j] >= 0) { + setflag[i][j] = 1; + if (i == j) atom->set_mass(FLERR,i,setfl->mass[map[i]]); + count++; + } + scale[i][j] = 1.0; + } + } + + if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients"); +} + +/* ---------------------------------------------------------------------- + read a multi-element DYNAMO setfl file +------------------------------------------------------------------------- */ + +void PairEAMAlloyIntel::read_file(char *filename) +{ + Setfl *file = setfl; + + // open potential file + + int me = comm->me; + FILE *fptr; + char line[MAXLINE]; + + if (me == 0) { + fptr = force->open_potential(filename); + if (fptr == NULL) { + char str[128]; + sprintf(str,"Cannot open EAM potential file %s",filename); + error->one(FLERR,str); + } + } + + // read and broadcast header + // extract element names from nelements line + + int n; + if (me == 0) { + fgets(line,MAXLINE,fptr); + fgets(line,MAXLINE,fptr); + fgets(line,MAXLINE,fptr); + fgets(line,MAXLINE,fptr); + n = strlen(line) + 1; + } + MPI_Bcast(&n,1,MPI_INT,0,world); + MPI_Bcast(line,n,MPI_CHAR,0,world); + + sscanf(line,"%d",&file->nelements); + int nwords = 
atom->count_words(line); + if (nwords != file->nelements + 1) + error->all(FLERR,"Incorrect element names in EAM potential file"); + + char **words = new char*[file->nelements+1]; + nwords = 0; + strtok(line," \t\n\r\f"); + while ((words[nwords++] = strtok(NULL," \t\n\r\f"))) continue; + + file->elements = new char*[file->nelements]; + for (int i = 0; i < file->nelements; i++) { + n = strlen(words[i]) + 1; + file->elements[i] = new char[n]; + strcpy(file->elements[i],words[i]); + } + delete [] words; + + if (me == 0) { + fgets(line,MAXLINE,fptr); + sscanf(line,"%d %lg %d %lg %lg", + &file->nrho,&file->drho,&file->nr,&file->dr,&file->cut); + } + + MPI_Bcast(&file->nrho,1,MPI_INT,0,world); + MPI_Bcast(&file->drho,1,MPI_DOUBLE,0,world); + MPI_Bcast(&file->nr,1,MPI_INT,0,world); + MPI_Bcast(&file->dr,1,MPI_DOUBLE,0,world); + MPI_Bcast(&file->cut,1,MPI_DOUBLE,0,world); + + file->mass = new double[file->nelements]; + memory->create(file->frho,file->nelements,file->nrho+1,"pair:frho"); + memory->create(file->rhor,file->nelements,file->nr+1,"pair:rhor"); + memory->create(file->z2r,file->nelements,file->nelements,file->nr+1, + "pair:z2r"); + + int i,j,tmp; + for (i = 0; i < file->nelements; i++) { + if (me == 0) { + fgets(line,MAXLINE,fptr); + sscanf(line,"%d %lg",&tmp,&file->mass[i]); + } + MPI_Bcast(&file->mass[i],1,MPI_DOUBLE,0,world); + + if (me == 0) grab(fptr,file->nrho,&file->frho[i][1]); + MPI_Bcast(&file->frho[i][1],file->nrho,MPI_DOUBLE,0,world); + if (me == 0) grab(fptr,file->nr,&file->rhor[i][1]); + MPI_Bcast(&file->rhor[i][1],file->nr,MPI_DOUBLE,0,world); + } + + for (i = 0; i < file->nelements; i++) + for (j = 0; j <= i; j++) { + if (me == 0) grab(fptr,file->nr,&file->z2r[i][j][1]); + MPI_Bcast(&file->z2r[i][j][1],file->nr,MPI_DOUBLE,0,world); + } + + // close the potential file + + if (me == 0) fclose(fptr); +} + +/* ---------------------------------------------------------------------- + copy read-in setfl potential to standard array format +------------------------------------------------------------------------- */ + +void PairEAMAlloyIntel::file2array() +{ + int i,j,m,n; + int ntypes = atom->ntypes; + + // set function params directly from setfl file + + nrho = setfl->nrho; + nr = setfl->nr; + drho = setfl->drho; + dr = setfl->dr; + rhomax = (nrho-1) * drho; + + // ------------------------------------------------------------------ + // setup frho arrays + // ------------------------------------------------------------------ + + // allocate frho arrays + // nfrho = # of setfl elements + 1 for zero array + + nfrho = setfl->nelements + 1; + memory->destroy(frho); + memory->create(frho,nfrho,nrho+1,"pair:frho"); + + // copy each element's frho to global frho + + for (i = 0; i < setfl->nelements; i++) + for (m = 1; m <= nrho; m++) frho[i][m] = setfl->frho[i][m]; + + // add extra frho of zeroes for non-EAM types to point to (pair hybrid) + // this is necessary b/c fp is still computed for non-EAM atoms + + for (m = 1; m <= nrho; m++) frho[nfrho-1][m] = 0.0; + + // type2frho[i] = which frho array (0 to nfrho-1) each atom type maps to + // if atom type doesn't point to element (non-EAM atom in pair hybrid) + // then map it to last frho array of zeroes + + for (i = 1; i <= ntypes; i++) + if (map[i] >= 0) type2frho[i] = map[i]; + else type2frho[i] = nfrho-1; + + // ------------------------------------------------------------------ + // setup rhor arrays + // ------------------------------------------------------------------ + + // allocate rhor arrays + // nrhor = # of setfl elements + + 
nrhor = setfl->nelements; + memory->destroy(rhor); + memory->create(rhor,nrhor,nr+1,"pair:rhor"); + + // copy each element's rhor to global rhor + + for (i = 0; i < setfl->nelements; i++) + for (m = 1; m <= nr; m++) rhor[i][m] = setfl->rhor[i][m]; + + // type2rhor[i][j] = which rhor array (0 to nrhor-1) each type pair maps to + // for setfl files, I,J mapping only depends on I + // OK if map = -1 (non-EAM atom in pair hybrid) b/c type2rhor not used + + for (i = 1; i <= ntypes; i++) + for (j = 1; j <= ntypes; j++) + type2rhor[i][j] = map[i]; + + // ------------------------------------------------------------------ + // setup z2r arrays + // ------------------------------------------------------------------ + + // allocate z2r arrays + // nz2r = N*(N+1)/2 where N = # of setfl elements + + nz2r = setfl->nelements * (setfl->nelements+1) / 2; + memory->destroy(z2r); + memory->create(z2r,nz2r,nr+1,"pair:z2r"); + + // copy each element pair z2r to global z2r, only for I >= J + + n = 0; + for (i = 0; i < setfl->nelements; i++) + for (j = 0; j <= i; j++) { + for (m = 1; m <= nr; m++) z2r[n][m] = setfl->z2r[i][j][m]; + n++; + } + + // type2z2r[i][j] = which z2r array (0 to nz2r-1) each type pair maps to + // set of z2r arrays only fill lower triangular Nelement matrix + // value = n = sum over rows of lower-triangular matrix until reach irow,icol + // swap indices when irow < icol to stay lower triangular + // if map = -1 (non-EAM atom in pair hybrid): + // type2z2r is not used by non-opt + // but set type2z2r to 0 since accessed by opt + + int irow,icol; + for (i = 1; i <= ntypes; i++) { + for (j = 1; j <= ntypes; j++) { + irow = map[i]; + icol = map[j]; + if (irow == -1 || icol == -1) { + type2z2r[i][j] = 0; + continue; + } + if (irow < icol) { + irow = map[j]; + icol = map[i]; + } + n = 0; + for (m = 0; m < irow; m++) n += m + 1; + n += icol; + type2z2r[i][j] = n; + } + } +} diff --git a/src/USER-INTEL/pair_eam_alloy_intel.h b/src/USER-INTEL/pair_eam_alloy_intel.h new file mode 100644 index 0000000000..4967c3709d --- /dev/null +++ b/src/USER-INTEL/pair_eam_alloy_intel.h @@ -0,0 +1,43 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(eam/alloy/intel,PairEAMAlloyIntel) + +#else + +#ifndef LMP_PAIR_EAM_ALLOY_INTEL_H +#define LMP_PAIR_EAM_ALLOY_INTEL_H + +#include "pair_eam_intel.h" + +namespace LAMMPS_NS { + +// need virtual public b/c of how eam/alloy/opt inherits from it + +class PairEAMAlloyIntel : virtual public PairEAMIntel { + public: + PairEAMAlloyIntel(class LAMMPS *); + virtual ~PairEAMAlloyIntel() {} + void coeff(int, char **); + + protected: + void read_file(char *); + void file2array(); +}; + +} + +#endif +#endif diff --git a/src/USER-INTEL/pair_eam_fs_intel.cpp b/src/USER-INTEL/pair_eam_fs_intel.cpp new file mode 100644 index 0000000000..cfcc8200cc --- /dev/null +++ b/src/USER-INTEL/pair_eam_fs_intel.cpp @@ -0,0 +1,335 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Tim Lau (MIT) +------------------------------------------------------------------------- */ + +#include +#include +#include +#include "pair_eam_fs_intel.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "memory.h" +#include "error.h" + +using namespace LAMMPS_NS; + +#define MAXLINE 1024 + +/* ---------------------------------------------------------------------- */ + +PairEAMFSIntel::PairEAMFSIntel(LAMMPS *lmp) : PairEAMIntel(lmp) +{ + one_coeff = 1; +} + +/* ---------------------------------------------------------------------- + set coeffs for one or more type pairs + read EAM Finnis-Sinclair file +------------------------------------------------------------------------- */ + +void PairEAMFSIntel::coeff(int narg, char **arg) +{ + int i,j; + + if (!allocated) allocate(); + + if (narg != 3 + atom->ntypes) + error->all(FLERR,"Incorrect args for pair coefficients"); + + // insure I,J args are * * + + if (strcmp(arg[0],"*") != 0 || strcmp(arg[1],"*") != 0) + error->all(FLERR,"Incorrect args for pair coefficients"); + + // read EAM Finnis-Sinclair file + + if (fs) { + for (i = 0; i < fs->nelements; i++) delete [] fs->elements[i]; + delete [] fs->elements; + delete [] fs->mass; + memory->destroy(fs->frho); + memory->destroy(fs->rhor); + memory->destroy(fs->z2r); + delete fs; + } + fs = new Fs(); + read_file(arg[2]); + + // read args that map atom types to elements in potential file + // map[i] = which element the Ith atom type is, -1 if NULL + + for (i = 3; i < narg; i++) { + if (strcmp(arg[i],"NULL") == 0) { + map[i-2] = -1; + continue; + } + for (j = 0; j < fs->nelements; j++) + if (strcmp(arg[i],fs->elements[j]) == 0) break; + if (j < fs->nelements) map[i-2] = j; + else error->all(FLERR,"No matching element in EAM potential file"); + } + + // clear setflag since coeff() called once with I,J = * * + + int n = atom->ntypes; + for (i = 1; i <= n; i++) + for (j = i; j <= n; j++) + setflag[i][j] = 0; + + // set setflag i,j for type pairs where both are mapped 
to elements + // set mass of atom type if i = j + + int count = 0; + for (i = 1; i <= n; i++) { + for (j = i; j <= n; j++) { + if (map[i] >= 0 && map[j] >= 0) { + setflag[i][j] = 1; + if (i == j) atom->set_mass(FLERR,i,fs->mass[map[i]]); + count++; + } + scale[i][j] = 1.0; + } + } + + if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients"); +} + +/* ---------------------------------------------------------------------- + read a multi-element DYNAMO setfl file +------------------------------------------------------------------------- */ + +void PairEAMFSIntel::read_file(char *filename) +{ + Fs *file = fs; + + // open potential file + + int me = comm->me; + FILE *fptr; + char line[MAXLINE]; + + if (me == 0) { + fptr = force->open_potential(filename); + if (fptr == NULL) { + char str[128]; + sprintf(str,"Cannot open EAM potential file %s",filename); + error->one(FLERR,str); + } + } + + // read and broadcast header + // extract element names from nelements line + + int n; + if (me == 0) { + fgets(line,MAXLINE,fptr); + fgets(line,MAXLINE,fptr); + fgets(line,MAXLINE,fptr); + fgets(line,MAXLINE,fptr); + n = strlen(line) + 1; + } + MPI_Bcast(&n,1,MPI_INT,0,world); + MPI_Bcast(line,n,MPI_CHAR,0,world); + + sscanf(line,"%d",&file->nelements); + int nwords = atom->count_words(line); + if (nwords != file->nelements + 1) + error->all(FLERR,"Incorrect element names in EAM potential file"); + + char **words = new char*[file->nelements+1]; + nwords = 0; + strtok(line," \t\n\r\f"); + while ((words[nwords++] = strtok(NULL," \t\n\r\f"))) continue; + + file->elements = new char*[file->nelements]; + for (int i = 0; i < file->nelements; i++) { + n = strlen(words[i]) + 1; + file->elements[i] = new char[n]; + strcpy(file->elements[i],words[i]); + } + delete [] words; + + if (me == 0) { + fgets(line,MAXLINE,fptr); + sscanf(line,"%d %lg %d %lg %lg", + &file->nrho,&file->drho,&file->nr,&file->dr,&file->cut); + } + + MPI_Bcast(&file->nrho,1,MPI_INT,0,world); + MPI_Bcast(&file->drho,1,MPI_DOUBLE,0,world); + MPI_Bcast(&file->nr,1,MPI_INT,0,world); + MPI_Bcast(&file->dr,1,MPI_DOUBLE,0,world); + MPI_Bcast(&file->cut,1,MPI_DOUBLE,0,world); + + file->mass = new double[file->nelements]; + memory->create(file->frho,file->nelements,file->nrho+1, + "pair:frho"); + memory->create(file->rhor,file->nelements,file->nelements, + file->nr+1,"pair:rhor"); + memory->create(file->z2r,file->nelements,file->nelements, + file->nr+1,"pair:z2r"); + + int i,j,tmp; + for (i = 0; i < file->nelements; i++) { + if (me == 0) { + fgets(line,MAXLINE,fptr); + sscanf(line,"%d %lg",&tmp,&file->mass[i]); + } + MPI_Bcast(&file->mass[i],1,MPI_DOUBLE,0,world); + + if (me == 0) grab(fptr,file->nrho,&file->frho[i][1]); + MPI_Bcast(&file->frho[i][1],file->nrho,MPI_DOUBLE,0,world); + + for (j = 0; j < file->nelements; j++) { + if (me == 0) grab(fptr,file->nr,&file->rhor[i][j][1]); + MPI_Bcast(&file->rhor[i][j][1],file->nr,MPI_DOUBLE,0,world); + } + } + + for (i = 0; i < file->nelements; i++) + for (j = 0; j <= i; j++) { + if (me == 0) grab(fptr,file->nr,&file->z2r[i][j][1]); + MPI_Bcast(&file->z2r[i][j][1],file->nr,MPI_DOUBLE,0,world); + } + + // close the potential file + + if (me == 0) fclose(fptr); +} + +/* ---------------------------------------------------------------------- + copy read-in setfl potential to standard array format +------------------------------------------------------------------------- */ + +void PairEAMFSIntel::file2array() +{ + int i,j,m,n; + int ntypes = atom->ntypes; + + // set function params directly from fs file 
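+  // (identical in structure to the setfl file2array above; fs files just
+  //  provide a full NxN set of rhor arrays per element pair)
+  // note that the z2r packing at the bottom of this function computes the
+  // lower-triangular index n = irow*(irow+1)/2 + icol for irow >= icol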
+ + nrho = fs->nrho; + nr = fs->nr; + drho = fs->drho; + dr = fs->dr; + rhomax = (nrho-1) * drho; + + // ------------------------------------------------------------------ + // setup frho arrays + // ------------------------------------------------------------------ + + // allocate frho arrays + // nfrho = # of fs elements + 1 for zero array + + nfrho = fs->nelements + 1; + memory->destroy(frho); + memory->create(frho,nfrho,nrho+1,"pair:frho"); + + // copy each element's frho to global frho + + for (i = 0; i < fs->nelements; i++) + for (m = 1; m <= nrho; m++) frho[i][m] = fs->frho[i][m]; + + // add extra frho of zeroes for non-EAM types to point to (pair hybrid) + // this is necessary b/c fp is still computed for non-EAM atoms + + for (m = 1; m <= nrho; m++) frho[nfrho-1][m] = 0.0; + + // type2frho[i] = which frho array (0 to nfrho-1) each atom type maps to + // if atom type doesn't point to element (non-EAM atom in pair hybrid) + // then map it to last frho array of zeroes + + for (i = 1; i <= ntypes; i++) + if (map[i] >= 0) type2frho[i] = map[i]; + else type2frho[i] = nfrho-1; + + // ------------------------------------------------------------------ + // setup rhor arrays + // ------------------------------------------------------------------ + + // allocate rhor arrays + // nrhor = square of # of fs elements + + nrhor = fs->nelements * fs->nelements; + memory->destroy(rhor); + memory->create(rhor,nrhor,nr+1,"pair:rhor"); + + // copy each element pair rhor to global rhor + + n = 0; + for (i = 0; i < fs->nelements; i++) + for (j = 0; j < fs->nelements; j++) { + for (m = 1; m <= nr; m++) rhor[n][m] = fs->rhor[i][j][m]; + n++; + } + + // type2rhor[i][j] = which rhor array (0 to nrhor-1) each type pair maps to + // for fs files, there is a full NxN set of rhor arrays + // OK if map = -1 (non-EAM atom in pair hybrid) b/c type2rhor not used + + for (i = 1; i <= ntypes; i++) + for (j = 1; j <= ntypes; j++) + type2rhor[i][j] = map[i] * fs->nelements + map[j]; + + // ------------------------------------------------------------------ + // setup z2r arrays + // ------------------------------------------------------------------ + + // allocate z2r arrays + // nz2r = N*(N+1)/2 where N = # of fs elements + + nz2r = fs->nelements * (fs->nelements+1) / 2; + memory->destroy(z2r); + memory->create(z2r,nz2r,nr+1,"pair:z2r"); + + // copy each element pair z2r to global z2r, only for I >= J + + n = 0; + for (i = 0; i < fs->nelements; i++) + for (j = 0; j <= i; j++) { + for (m = 1; m <= nr; m++) z2r[n][m] = fs->z2r[i][j][m]; + n++; + } + + // type2z2r[i][j] = which z2r array (0 to nz2r-1) each type pair maps to + // set of z2r arrays only fill lower triangular Nelement matrix + // value = n = sum over rows of lower-triangular matrix until reach irow,icol + // swap indices when irow < icol to stay lower triangular + // if map = -1 (non-EAM atom in pair hybrid): + // type2z2r is not used by non-opt + // but set type2z2r to 0 since accessed by opt + + int irow,icol; + for (i = 1; i <= ntypes; i++) { + for (j = 1; j <= ntypes; j++) { + irow = map[i]; + icol = map[j]; + if (irow == -1 || icol == -1) { + type2z2r[i][j] = 0; + continue; + } + if (irow < icol) { + irow = map[j]; + icol = map[i]; + } + n = 0; + for (m = 0; m < irow; m++) n += m + 1; + n += icol; + type2z2r[i][j] = n; + } + } +} diff --git a/src/USER-INTEL/pair_eam_fs_intel.h b/src/USER-INTEL/pair_eam_fs_intel.h new file mode 100644 index 0000000000..da2ab9d2d7 --- /dev/null +++ b/src/USER-INTEL/pair_eam_fs_intel.h @@ -0,0 +1,43 @@ +/* -*- c++ -*- 
---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(eam/fs/intel,PairEAMFSIntel) + +#else + +#ifndef LMP_PAIR_EAM_FS_INTEL_H +#define LMP_PAIR_EAM_FS_INTEL_H + +#include "pair_eam_intel.h" + +namespace LAMMPS_NS { + +// need virtual public b/c of how eam/fs/opt inherits from it + +class PairEAMFSIntel : virtual public PairEAMIntel { + public: + PairEAMFSIntel(class LAMMPS *); + virtual ~PairEAMFSIntel() {} + void coeff(int, char **); + + protected: + void read_file(char *); + void file2array(); +}; + +} + +#endif +#endif diff --git a/src/USER-INTEL/pair_gayberne_intel.cpp b/src/USER-INTEL/pair_gayberne_intel.cpp index ed7dd424af..3fbb58308b 100644 --- a/src/USER-INTEL/pair_gayberne_intel.cpp +++ b/src/USER-INTEL/pair_gayberne_intel.cpp @@ -428,7 +428,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, } else multiple_forms = true; } - const int edge = (packed_j % pad_width); + const int edge = packed_j & (pad_width - 1); if (edge) { const int packed_end = packed_j + (pad_width - edge); #if defined(LMP_SIMD_COMPILER) diff --git a/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp b/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp new file mode 100644 index 0000000000..0dc2c275e8 --- /dev/null +++ b/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp @@ -0,0 +1,595 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + This software is distributed under the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. 
Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include <math.h>
+#include "pair_lj_charmm_coul_charmm_intel.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "group.h"
+#include "memory.h"
+#include "modify.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "memory.h"
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define LJ_T typename IntelBuffers<flt_t,flt_t>::vec4_t
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCharmmCoulCharmmIntel::PairLJCharmmCoulCharmmIntel(LAMMPS *lmp) :
+  PairLJCharmmCoulCharmm(lmp)
+{
+  suffix_flag |= Suffix::INTEL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCharmmCoulCharmmIntel::~PairLJCharmmCoulCharmmIntel()
+{
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmIntel::compute(int eflag, int vflag)
+{
+  if (fix->precision()==FixIntel::PREC_MODE_MIXED)
+    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
+                          force_const_single);
+  else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE)
+    compute<double,double>(eflag, vflag, fix->get_double_buffers(),
+                           force_const_double);
+  else
+    compute<float,float>(eflag, vflag, fix->get_single_buffers(),
+                         force_const_single);
+
+  fix->balance_stamp();
+  vflag_fdotr = 0;
+}
+
+template <class flt_t, class acc_t>
+void PairLJCharmmCoulCharmmIntel::compute(int eflag, int vflag,
+                                          IntelBuffers<flt_t,acc_t> *buffers,
+                                          const ForceConst<flt_t> &fc)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int inum = list->inum;
+  const int nthreads = comm->nthreads;
+  const int host_start = fix->host_start_pair();
+  const int offload_end = fix->offload_end_pair();
+  const int ago = neighbor->ago;
+
+  if (ago != 0 && fix->separate_buffers() == 0) {
+    fix->start_watch(TIME_PACK);
+
+    int packthreads;
+    if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
+    else packthreads = 1;
+    #if defined(_OPENMP)
+    #pragma omp parallel if(packthreads > 1)
+    #endif
+    {
+      int ifrom, ito, tid;
+      IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal+atom->nghost,
+                                packthreads, sizeof(ATOM_T));
+      buffers->thr_pack(ifrom,ito,ago);
+    }
+    fix->stop_watch(TIME_PACK);
+  }
+
+  // -------------------- Regular version
+  int ovflag = 0;
+  if (vflag_fdotr) ovflag = 2;
+  else if (vflag) ovflag = 1;
+  if (eflag) {
+    if (force->newton_pair) {
+      eval<1,1>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<1,1>(0, ovflag, buffers, fc, host_start, inum);
+    } else {
+      eval<1,0>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<1,0>(0, ovflag, buffers, fc, host_start, inum);
+    }
+  } else {
+    if (force->newton_pair) {
+      eval<0,1>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<0,1>(0, ovflag, buffers, fc, host_start, inum);
+    } else {
+      eval<0,0>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<0,0>(0, ovflag, buffers, fc, host_start, inum);
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
+                                       IntelBuffers<flt_t,acc_t> *buffers,
+                                       const ForceConst<flt_t> &fc,
+                                       const int astart, const int aend)
+{
+  const int inum = aend - astart;
+  if (inum == 0) return;
+  int nlocal, nall, minlocal;
+  fix->get_buffern(offload, nlocal, nall, minlocal);
+
+  const int ago = neighbor->ago;
+  IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
+
+  ATOM_T * _noalias const x = buffers->get_x(offload);
+  flt_t * _noalias const q = buffers->get_q(offload);
+
+ const int * _noalias const numneigh = list->numneigh; + const int * _noalias const cnumneigh = buffers->cnumneigh(list); + const int * _noalias const firstneigh = buffers->firstneigh(list); + + const flt_t * _noalias const special_coul = fc.special_coul; + const flt_t * _noalias const special_lj = fc.special_lj; + const flt_t qqrd2e = force->qqrd2e; + const flt_t inv_denom_lj = (flt_t)1.0/denom_lj; + const flt_t inv_denom_coul = (flt_t)1.0/denom_coul; + + const flt_t * _noalias const cutsq = fc.cutsq[0]; + const LJ_T * _noalias const lj = fc.lj[0]; + const flt_t cut_ljsq = fc.cut_ljsq; + const flt_t cut_lj_innersq = fc.cut_lj_innersq; + const flt_t cut_coul_innersq = fc.cut_coul_innersq; + const flt_t cut_coulsq = fc.cut_coulsq; + + const int ntypes = atom->ntypes + 1; + const int eatom = this->eflag_atom; + + flt_t * _noalias const ccachex = buffers->get_ccachex(); + flt_t * _noalias const ccachey = buffers->get_ccachey(); + flt_t * _noalias const ccachez = buffers->get_ccachez(); + flt_t * _noalias const ccachew = buffers->get_ccachew(); + int * _noalias const ccachei = buffers->get_ccachei(); + int * _noalias const ccachej = buffers->get_ccachej(); + const int ccache_stride = _ccache_stride; + + // Determine how much data to transfer + int x_size, q_size, f_stride, ev_size, separate_flag; + IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, + buffers, offload, fix, separate_flag, + x_size, q_size, ev_size, f_stride); + + int tc; + FORCE_T * _noalias f_start; + acc_t * _noalias ev_global; + IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global); + + const int nthreads = tc; + #ifdef _LMP_INTEL_OFFLOAD + int *overflow = fix->get_off_overflow_flag(); + double *timer_compute = fix->off_watch_pair(); + + if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY); + #pragma offload target(mic:_cop) if(offload) \ + in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \ + in(cutsq,lj:length(0) alloc_if(0) free_if(0)) \ + in(firstneigh:length(0) alloc_if(0) free_if(0)) \ + in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ + in(numneigh:length(0) alloc_if(0) free_if(0)) \ + in(x:length(x_size) alloc_if(0) free_if(0)) \ + in(q:length(q_size) alloc_if(0) free_if(0)) \ + in(overflow:length(0) alloc_if(0) free_if(0)) \ + in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \ + in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \ + in(ccache_stride,nthreads,qqrd2e,inum,nall,ntypes,cut_coulsq) \ + in(vflag,eatom,f_stride,separate_flag,offload) \ + in(astart,cut_ljsq,cut_lj_innersq,nlocal,inv_denom_lj,minlocal) \ + in(inv_denom_coul,cut_coul_innersq) \ + out(f_start:length(f_stride) alloc_if(0) free_if(0)) \ + out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \ + out(timer_compute:length(1) alloc_if(0) free_if(0)) \ + signal(f_start) + #endif + { + #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) + *timer_compute = MIC_Wtime(); + #endif + + IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, + f_stride, x, q); + + acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; + if (EFLAG) oevdwl = oecoul = (acc_t)0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; + + // loop over neighbors of my atoms + #if defined(_OPENMP) + #pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5) + #endif + { + int iifrom, iip, iito, tid; + IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads); + iifrom += astart; + iito += astart; + + int foff; + if (NEWTON_PAIR) foff = tid * f_stride - minlocal; + else foff = -minlocal; + FORCE_T * 
_noalias const f = f_start + foff; + if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); + flt_t cutboth = cut_coulsq; + + const int toffs = tid * ccache_stride; + flt_t * _noalias const tdelx = ccachex + toffs; + flt_t * _noalias const tdely = ccachey + toffs; + flt_t * _noalias const tdelz = ccachez + toffs; + flt_t * _noalias const trsq = ccachew + toffs; + int * _noalias const tj = ccachei + toffs; + int * _noalias const tjtype = ccachej + toffs; + + for (int i = iifrom; i < iito; i += iip) { + // const int i = ilist[ii]; + const int itype = x[i].w; + + const int ptr_off = itype * ntypes; + const flt_t * _noalias const cutsqi = cutsq + ptr_off; + const LJ_T * _noalias const lji = lj + ptr_off; + + const int * _noalias const jlist = firstneigh + cnumneigh[i]; + const int jnum = numneigh[i]; + + acc_t fxtmp,fytmp,fztmp,fwtmp; + acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5; + + const flt_t xtmp = x[i].x; + const flt_t ytmp = x[i].y; + const flt_t ztmp = x[i].z; + const flt_t qtmp = q[i]; + fxtmp = fytmp = fztmp = (acc_t)0; + if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; + if (NEWTON_PAIR == 0) + if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; + + int ej = 0; + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma ivdep + #endif + for (int jj = 0; jj < jnum; jj++) { + const int j = jlist[jj] & NEIGHMASK; + const flt_t delx = xtmp - x[j].x; + const flt_t dely = ytmp - x[j].y; + const flt_t delz = ztmp - x[j].z; + const flt_t rsq = delx * delx + dely * dely + delz * delz; + + if (rsq < cut_coulsq) { + trsq[ej]=rsq; + tdelx[ej]=delx; + tdely[ej]=dely; + tdelz[ej]=delz; + tjtype[ej]=x[j].w; + tj[ej]=jlist[jj]; + ej++; + } + } + + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ + sv0, sv1, sv2, sv3, sv4, sv5) + #endif + for (int jj = 0; jj < ej; jj++) { + flt_t forcecoul, forcelj, evdwl; + forcecoul = forcelj = evdwl = (flt_t)0.0; + + const int j = tj[jj] & NEIGHMASK; + const int sbindex = tj[jj] >> SBBITS & 3; + const flt_t rsq = trsq[jj]; + const flt_t r2inv = (flt_t)1.0 / rsq; + const flt_t r_inv = (flt_t)1.0 / sqrt(rsq); + forcecoul = qqrd2e * qtmp * q[j] * r_inv; + if (rsq > cut_coul_innersq) { + const flt_t ccr = cut_coulsq - rsq; + const flt_t switch1 = ccr * ccr * inv_denom_coul * + (cut_coulsq + (flt_t)2.0 * rsq - (flt_t)3.0 * cut_coul_innersq); + forcecoul *= switch1; + } + + #ifdef INTEL_VMASK + if (rsq < cut_ljsq) { + #endif + const int jtype = tjtype[jj]; + flt_t r6inv = r2inv * r2inv * r2inv; + forcelj = r6inv * (lji[jtype].x * r6inv - lji[jtype].y); + if (EFLAG) evdwl = r6inv*(lji[jtype].z * r6inv - lji[jtype].w); + + #ifdef INTEL_VMASK + if (rsq > cut_lj_innersq) { + #endif + const flt_t drsq = cut_ljsq - rsq; + const flt_t cut2 = (rsq - cut_lj_innersq) * drsq; + const flt_t switch1 = drsq * (drsq * drsq + (flt_t)3.0 * cut2) * + inv_denom_lj; + const flt_t switch2 = (flt_t)12.0 * rsq * cut2 * inv_denom_lj; + if (EFLAG) { + #ifndef INTEL_VMASK + if (rsq > cut_lj_innersq) { + #endif + forcelj = forcelj * switch1 + evdwl * switch2; + evdwl *= switch1; + #ifndef INTEL_VMASK + } + #endif + } else { + const flt_t philj = r6inv * (lji[jtype].z*r6inv - + lji[jtype].w); + #ifndef INTEL_VMASK + if (rsq > cut_lj_innersq) + #endif + forcelj = forcelj * switch1 + philj * switch2; + } + #ifdef INTEL_VMASK + } + #endif + + #ifdef INTEL_VMASK + } + #else + if (rsq > cut_ljsq) { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; } + #endif + if (sbindex) { + const flt_t 
factor_coul = special_coul[sbindex];
+          forcecoul *= factor_coul;
+          const flt_t factor_lj = special_lj[sbindex];
+          forcelj *= factor_lj;
+          if (EFLAG) evdwl *= factor_lj;
+        }
+
+        const flt_t fpair = (forcecoul + forcelj) * r2inv;
+        const flt_t fpx = fpair * tdelx[jj];
+        fxtmp += fpx;
+        if (NEWTON_PAIR) f[j].x -= fpx;
+        const flt_t fpy = fpair * tdely[jj];
+        fytmp += fpy;
+        if (NEWTON_PAIR) f[j].y -= fpy;
+        const flt_t fpz = fpair * tdelz[jj];
+        fztmp += fpz;
+        if (NEWTON_PAIR) f[j].z -= fpz;
+
+        if (EFLAG) {
+          sevdwl += evdwl;
+          secoul += forcecoul;
+          if (eatom) {
+            fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * forcecoul;
+            if (NEWTON_PAIR)
+              f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * forcecoul;
+          }
+        }
+        if (NEWTON_PAIR == 0)
+          IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
+                                fpx, fpy, fpz);
+      } // for jj
+      if (NEWTON_PAIR) {
+        f[i].x += fxtmp;
+        f[i].y += fytmp;
+        f[i].z += fztmp;
+      } else {
+        f[i].x = fxtmp;
+        f[i].y = fytmp;
+        f[i].z = fztmp;
+      }
+      IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
+    } // for ii
+
+    IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
+                            f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
+                            ov4, ov5);
+  } // end of omp parallel region
+
+  IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
+                      ov0, ov1, ov2, ov3, ov4, ov5);
+
+  if (EFLAG) {
+    if (NEWTON_PAIR == 0) {
+      oevdwl *= (acc_t)0.5;
+      oecoul *= (acc_t)0.5;
+    }
+    ev_global[0] = oevdwl;
+    ev_global[1] = oecoul;
+  }
+  if (vflag) {
+    if (NEWTON_PAIR == 0) {
+      ov0 *= (acc_t)0.5;
+      ov1 *= (acc_t)0.5;
+      ov2 *= (acc_t)0.5;
+      ov3 *= (acc_t)0.5;
+      ov4 *= (acc_t)0.5;
+      ov5 *= (acc_t)0.5;
+    }
+    ev_global[2] = ov0;
+    ev_global[3] = ov1;
+    ev_global[4] = ov2;
+    ev_global[5] = ov3;
+    ev_global[6] = ov4;
+    ev_global[7] = ov5;
+  }
+  #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
+  *timer_compute = MIC_Wtime() - *timer_compute;
+  #endif
+  } // end of offload region
+
+  if (offload)
+    fix->stop_watch(TIME_OFFLOAD_LATENCY);
+  else
+    fix->stop_watch(TIME_HOST_PAIR);
+
+  if (EFLAG || vflag)
+    fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
+  else
+    fix->add_result_array(f_start, 0, offload);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmIntel::init_style()
+{
+  PairLJCharmmCoulCharmm::init_style();
+  if (force->newton_pair == 0) {
+    neighbor->requests[neighbor->nrequest-1]->half = 0;
+    neighbor->requests[neighbor->nrequest-1]->full = 1;
+  }
+  neighbor->requests[neighbor->nrequest-1]->intel = 1;
+
+  int ifix = modify->find_fix("package_intel");
+  if (ifix < 0)
+    error->all(FLERR,
+               "The 'package intel' command is required for /intel styles");
+  fix = static_cast<FixIntel *>(modify->fix[ifix]);
+
+  fix->pair_init_check();
+  #ifdef _LMP_INTEL_OFFLOAD
+  _cop = fix->coprocessor_number();
+  #endif
+
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED)
+    pack_force_const(force_const_single, fix->get_mixed_buffers());
+  else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    pack_force_const(force_const_double, fix->get_double_buffers());
+  else
+    pack_force_const(force_const_single, fix->get_single_buffers());
+}
+
+template <class flt_t, class acc_t>
+void PairLJCharmmCoulCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
+                                                   IntelBuffers<flt_t,acc_t> *buffers)
+{
+  int off_ccache = 0;
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_cop >= 0) off_ccache = 1;
+  #endif
+  buffers->grow_ccache(off_ccache, comm->nthreads, 1);
+  _ccache_stride = buffers->ccache_stride();
+
+  int tp1 = atom->ntypes + 1;
+
+  fc.set_ntypes(tp1, memory, _cop);
+
buffers->set_ntypes(tp1);
+  flt_t **cutneighsq = buffers->get_cutneighsq();
+
+  // Repeat cutsq calculation because done after call to init_style
+  double cut, cutneigh;
+  if (cut_lj > cut_coul)
+    error->all(FLERR,
+      "Intel variant of lj/charmm/coul/charmm expects lj cutoff<=coulombic");
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = i; j <= atom->ntypes; j++) {
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
+        cut = init_one(i, j);
+        cutneigh = cut + neighbor->skin;
+        cutsq[i][j] = cutsq[j][i] = cut*cut;
+        cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
+      }
+    }
+  }
+
+  cut_coul_innersq = cut_coul_inner * cut_coul_inner;
+  cut_lj_innersq = cut_lj_inner * cut_lj_inner;
+  cut_ljsq = cut_lj * cut_lj;
+  cut_coulsq = cut_coul * cut_coul;
+  cut_bothsq = MAX(cut_ljsq, cut_coulsq);
+
+  fc.cut_coulsq = cut_coulsq;
+  fc.cut_ljsq = cut_ljsq;
+  fc.cut_coul_innersq = cut_coul_innersq;
+  fc.cut_lj_innersq = cut_lj_innersq;
+
+  for (int i = 0; i < 4; i++) {
+    fc.special_lj[i] = force->special_lj[i];
+    fc.special_coul[i] = force->special_coul[i];
+    fc.special_coul[0] = 1.0;
+    fc.special_lj[0] = 1.0;
+  }
+
+  for (int i = 0; i < tp1; i++) {
+    for (int j = 0; j < tp1; j++) {
+      fc.lj[i][j].x = lj1[i][j];
+      fc.lj[i][j].y = lj2[i][j];
+      fc.lj[i][j].z = lj3[i][j];
+      fc.lj[i][j].w = lj4[i][j];
+      fc.cutsq[i][j] = cutsq[i][j];
+    }
+  }
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_cop < 0) return;
+  flt_t * special_lj = fc.special_lj;
+  flt_t * special_coul = fc.special_coul;
+  flt_t * cutsq = fc.cutsq[0];
+  LJ_T * lj = fc.lj[0];
+  flt_t * ocutneighsq = cutneighsq[0];
+  int tp1sq = tp1 * tp1;
+  #pragma offload_transfer target(mic:_cop) \
+    in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \
+    in(cutsq,lj: length(tp1sq) alloc_if(0) free_if(0)) \
+    in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t>
+void PairLJCharmmCoulCharmmIntel::ForceConst<flt_t>::set_ntypes(
+  const int ntypes, Memory *memory, const int cop) {
+  if (ntypes != _ntypes) {
+    if (_ntypes > 0) {
+      #ifdef _LMP_INTEL_OFFLOAD
+      flt_t * ospecial_lj = special_lj;
+      flt_t * ospecial_coul = special_coul;
+      flt_t * ocutsq = cutsq[0];
+      typename IntelBuffers<flt_t,flt_t>::vec4_t * olj = lj[0];
+      if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL &&
+          ospecial_coul != NULL && cop >= 0) {
+        #pragma offload_transfer target(mic:cop) \
+          nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
+          nocopy(ocutsq, olj: alloc_if(0) free_if(1))
+      }
+      #endif
+
+      _memory->destroy(cutsq);
+      _memory->destroy(lj);
+    }
+    if (ntypes > 0) {
+      _cop = cop;
+      memory->create(cutsq,ntypes,ntypes,"fc.cutsq");
+      memory->create(lj,ntypes,ntypes,"fc.lj");
+
+      #ifdef _LMP_INTEL_OFFLOAD
+      flt_t * ospecial_lj = special_lj;
+      flt_t * ospecial_coul = special_coul;
+      flt_t * ocutsq = cutsq[0];
+      typename IntelBuffers<flt_t,flt_t>::vec4_t * olj = lj[0];
+      int tp1sq = ntypes*ntypes;
+      if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL &&
+          ospecial_coul != NULL && cop >= 0) {
+        #pragma offload_transfer target(mic:cop) \
+          nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
+          nocopy(ospecial_coul: length(4) alloc_if(1) free_if(0)) \
+          nocopy(ocutsq,olj: length(tp1sq) alloc_if(1) free_if(0))
+      }
+      #endif
+    }
+  }
+  _ntypes=ntypes;
+  _memory=memory;
+}
diff --git a/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.h b/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.h
new file mode 100644
index 0000000000..64d6077477
--- /dev/null
+++ b/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.h
@@ -0,0 +1,100 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/charmm/coul/charmm/intel,PairLJCharmmCoulCharmmIntel)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CHARMM_COUL_CHARMM_INTEL_H
+#define LMP_PAIR_LJ_CHARMM_COUL_CHARMM_INTEL_H
+
+#include "pair_lj_charmm_coul_charmm.h"
+#include "fix_intel.h"
+
+namespace LAMMPS_NS {
+
+class PairLJCharmmCoulCharmmIntel : public PairLJCharmmCoulCharmm {
+
+ public:
+  PairLJCharmmCoulCharmmIntel(class LAMMPS *);
+  virtual ~PairLJCharmmCoulCharmmIntel();
+
+  virtual void compute(int, int);
+  void init_style();
+
+  typedef struct { float x,y,z; int w; } sng4_t;
+
+ private:
+  FixIntel *fix;
+  int _cop, _ccache_stride;
+
+  template <class flt_t> class ForceConst;
+  template <class flt_t, class acc_t>
+  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
+               const ForceConst<flt_t> &fc);
+  template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+  void eval(const int offload, const int vflag,
+            IntelBuffers<flt_t,acc_t> * buffers,
+            const ForceConst<flt_t> &fc, const int astart, const int aend);
+
+  template <class flt_t, class acc_t>
+  void pack_force_const(ForceConst<flt_t> &fc,
+                        IntelBuffers<flt_t,acc_t> *buffers);
+
+  // ----------------------------------------------------------------------
+  template <class flt_t>
+  class ForceConst {
+   public:
+    _alignvar(flt_t special_coul[4],64);
+    _alignvar(flt_t special_lj[4],64);
+    flt_t **cutsq;
+    flt_t cut_coulsq, cut_ljsq;
+    flt_t cut_coul_innersq, cut_lj_innersq;
+    typename IntelBuffers<flt_t,flt_t>::vec4_t **lj;
+
+    ForceConst() : _ntypes(0) {}
+    ~ForceConst() { set_ntypes(0,NULL,_cop); }
+
+    void set_ntypes(const int ntypes, Memory *memory, const int cop);
+
+   private:
+    int _ntypes, _cop;
+    Memory *_memory;
+  };
+  ForceConst<float> force_const_single;
+  ForceConst<double> force_const_double;
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: The 'package intel' command is required for /intel styles
+
+Self-explanatory.
+
+E: Intel variant of lj/charmm/coul/charmm expects lj cutoff<=coulombic
+
+The Intel-accelerated version of the CHARMM style requires that the
+Lennard-Jones cutoff is not greater than the coulombic cutoff.
+
+*/
diff --git a/src/USER-INTEL/pair_rebo_intel.cpp b/src/USER-INTEL/pair_rebo_intel.cpp
new file mode 100644
index 0000000000..006830a5fa
--- /dev/null
+++ b/src/USER-INTEL/pair_rebo_intel.cpp
@@ -0,0 +1,42 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+ + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Markus Hohnerbach (RWTH) +------------------------------------------------------------------------- */ + +#include "pair_rebo_intel.h" +#include "error.h" + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +PairREBOIntel::PairREBOIntel(LAMMPS *lmp) : PairAIREBOIntel(lmp) {} + +/* ---------------------------------------------------------------------- + global settings +------------------------------------------------------------------------- */ + +void PairREBOIntel::settings(int narg, char **arg) +{ + if (narg != 0) error->all(FLERR,"Illegal pair_style command"); + + cutlj = 0.0; + ljflag = torflag = 0; + // + // this one parameter for C-C interactions is different in REBO vs AIREBO + // see Favata, Micheletti, Ryu, Pugno, Comp Phys Comm (2016) + + PCCf_2_0 = 0.0; +} diff --git a/src/USER-INTEL/pair_rebo_intel.h b/src/USER-INTEL/pair_rebo_intel.h new file mode 100644 index 0000000000..e76279a248 --- /dev/null +++ b/src/USER-INTEL/pair_rebo_intel.h @@ -0,0 +1,40 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Markus Hohnerbach (RWTH) +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(rebo/intel,PairREBOIntel) + +#else + +#ifndef LMP_PAIR_REBO_INTEL_H +#define LMP_PAIR_REBO_INTEL_H + +#include "pair_airebo_intel.h" + +namespace LAMMPS_NS { + +class PairREBOIntel : public PairAIREBOIntel { + public: + PairREBOIntel(class LAMMPS *); + virtual void settings(int, char **); +}; + +} + +#endif +#endif diff --git a/src/USER-INTEL/pair_sw_intel.cpp b/src/USER-INTEL/pair_sw_intel.cpp index 7a6b7afd92..fff104f39b 100644 --- a/src/USER-INTEL/pair_sw_intel.cpp +++ b/src/USER-INTEL/pair_sw_intel.cpp @@ -345,16 +345,17 @@ void PairSWIntel::eval(const int offload, const int vflag, if (jj < jnumhalf) ejnumhalf++; } } - int ejnum_pad = ejnum; - while ( (ejnum_pad % pad_width) != 0) { - tdelx[ejnum_pad] = (flt_t)0.0; - tdely[ejnum_pad] = (flt_t)0.0; - tdelz[ejnum_pad] = (flt_t)0.0; - trsq[ejnum_pad] = p2[3].cutsq + (flt_t)1.0; - tj[ejnum_pad] = nall; - if (!ONETYPE) tjtype[ejnum_pad] = 0; - ejnum_pad++; + int ejrem = ejnum & (pad_width - 1); + if (ejrem) ejrem = pad_width - ejrem; + const int ejnum_pad = ejnum + ejrem; + for (int jj = ejnum; jj < ejnum_pad; jj++) { + tdelx[jj] = (flt_t)0.0; + tdely[jj] = (flt_t)0.0; + tdelz[jj] = (flt_t)0.0; + trsq[jj] = p2[3].cutsq + (flt_t)1.0; + tj[jj] = nall; + if (!ONETYPE) tjtype[jj] = 0; } #if defined(LMP_SIMD_COMPILER) diff --git a/src/force.cpp b/src/force.cpp index 33e6630406..060cae10eb 100644 --- a/src/force.cpp +++ b/src/force.cpp @@ -833,10 +833,6 @@ void Force::set_special(int narg, char **arg) else if (strcmp(arg[iarg+1],"yes") == 0) special_dihedral = 1; else error->all(FLERR,"Illegal special_bonds command"); iarg += 2; - } else if (strcmp(arg[iarg],"extra") == 0) { - if (iarg+2 > narg) error->all(FLERR,"Illegal special_bonds command"); - special_extra = atoi(arg[iarg+1]); - iarg += 2; } else error->all(FLERR,"Illegal special_bonds command"); } @@ -844,8 +840,6 @@ void Force::set_special(int narg, char **arg) if (special_lj[i] < 0.0 || special_lj[i] > 1.0 || special_coul[i] < 0.0 || special_coul[i] > 1.0) error->all(FLERR,"Illegal special_bonds command"); - - if (special_extra < 0) error->all(FLERR,"Illegal special_bonds command"); } /* ---------------------------------------------------------------------- diff --git a/src/info.cpp b/src/info.cpp index 9fcc24fde9..03eb1e10ed 100644 --- a/src/info.cpp +++ b/src/info.cpp @@ -45,7 +45,7 @@ #include #ifdef _WIN32 -#define PSAPI_VERSION=1 +#define PSAPI_VERSION 1 #include #include #include diff --git a/src/input.cpp b/src/input.cpp index 570560373a..7d11b8741b 100644 --- a/src/input.cpp +++ b/src/input.cpp @@ -1867,7 +1867,6 @@ void Input::special_bonds() double coul3 = force->special_coul[3]; int angle = force->special_angle; int dihedral = force->special_dihedral; - int extra = force->special_extra; force->set_special(narg,arg); @@ -1877,8 +1876,7 @@ void Input::special_bonds() if (lj2 != force->special_lj[2] || lj3 != force->special_lj[3] || coul2 != force->special_coul[2] || coul3 != force->special_coul[3] || angle != force->special_angle || - dihedral != force->special_dihedral || - extra != force->special_extra) { + dihedral != force->special_dihedral) { Special special(lmp); special.build(); } diff --git a/src/molecule.cpp b/src/molecule.cpp index e0e9ec8aaf..b0fec4bcbc 100644 
--- a/src/molecule.cpp +++ b/src/molecule.cpp @@ -427,47 +427,61 @@ void Molecule::read(int flag) // search line for header keywords and set corresponding variable - if (strstr(line,"atoms")) sscanf(line,"%d",&natoms); - else if (strstr(line,"bonds")) sscanf(line,"%d",&nbonds); - else if (strstr(line,"angles")) sscanf(line,"%d",&nangles); - else if (strstr(line,"dihedrals")) sscanf(line,"%d",&ndihedrals); - else if (strstr(line,"impropers")) sscanf(line,"%d",&nimpropers); - - else if (strstr(line,"mass")) { + int nmatch = 0; + int nwant = 0; + if (strstr(line,"atoms")) { + nmatch = sscanf(line,"%d",&natoms); + nwant = 1; + } else if (strstr(line,"bonds")) { + nmatch = sscanf(line,"%d",&nbonds); + nwant = 1; + } else if (strstr(line,"angles")) { + nmatch = sscanf(line,"%d",&nangles); + nwant = 1; + } else if (strstr(line,"dihedrals")) { + nmatch = sscanf(line,"%d",&ndihedrals); + nwant = 1; + } else if (strstr(line,"impropers")) { + nmatch = sscanf(line,"%d",&nimpropers); + nwant = 1; + } else if (strstr(line,"mass")) { massflag = 1; - sscanf(line,"%lg",&masstotal); + nmatch = sscanf(line,"%lg",&masstotal); + nwant = 1; masstotal *= sizescale*sizescale*sizescale; - } - else if (strstr(line,"com")) { + } else if (strstr(line,"com")) { comflag = 1; - sscanf(line,"%lg %lg %lg",&com[0],&com[1],&com[2]); + nmatch = sscanf(line,"%lg %lg %lg",&com[0],&com[1],&com[2]); + nwant = 3; com[0] *= sizescale; com[1] *= sizescale; com[2] *= sizescale; if (domain->dimension == 2 && com[2] != 0.0) error->all(FLERR,"Molecule file z center-of-mass must be 0.0 for 2d"); - } - else if (strstr(line,"inertia")) { + } else if (strstr(line,"inertia")) { inertiaflag = 1; - sscanf(line,"%lg %lg %lg %lg %lg %lg", - &itensor[0],&itensor[1],&itensor[2], - &itensor[3],&itensor[4],&itensor[5]); - itensor[0] *= sizescale*sizescale*sizescale*sizescale*sizescale; - itensor[1] *= sizescale*sizescale*sizescale*sizescale*sizescale; - itensor[2] *= sizescale*sizescale*sizescale*sizescale*sizescale; - itensor[3] *= sizescale*sizescale*sizescale*sizescale*sizescale; - itensor[4] *= sizescale*sizescale*sizescale*sizescale*sizescale; - itensor[5] *= sizescale*sizescale*sizescale*sizescale*sizescale; - } - else if (strstr(line,"body")) { + nmatch = sscanf(line,"%lg %lg %lg %lg %lg %lg", + &itensor[0],&itensor[1],&itensor[2], + &itensor[3],&itensor[4],&itensor[5]); + nwant = 6; + const double scale5 = sizescale*sizescale*sizescale*sizescale*sizescale; + itensor[0] *= scale5; + itensor[1] *= scale5; + itensor[2] *= scale5; + itensor[3] *= scale5; + itensor[4] *= scale5; + itensor[5] *= scale5; + } else if (strstr(line,"body")) { bodyflag = 1; avec_body = (AtomVecBody *) atom->style_match("body"); if (!avec_body) error->all(FLERR,"Molecule file requires atom style body"); - sscanf(line,"%d %d",&nibody,&ndbody); - } + nmatch = sscanf(line,"%d %d",&nibody,&ndbody); + nwant = 2; + } else break; - else break; + if (nmatch != nwant) + error->all(FLERR,"Invalid header in molecule file"); } // error checks @@ -493,7 +507,7 @@ void Molecule::read(int flag) // loop over sections of molecule file - while (strlen(keyword)) { + while (strlen(keyword) > 0) { if (strcmp(keyword,"Coords") == 0) { xflag = 1; if (flag) coords(line); @@ -517,22 +531,22 @@ void Molecule::read(int flag) } else if (strcmp(keyword,"Bonds") == 0) { if (nbonds == 0) - error->all(FLERR,"Molecule file has bonds but no nbonds setting"); + error->all(FLERR,"Molecule file has bonds but no nbonds setting"); bondflag = tag_require = 1; bonds(flag,line); } else if 
(strcmp(keyword,"Angles") == 0) { if (nangles == 0) - error->all(FLERR,"Molecule file has angles but no nangles setting"); + error->all(FLERR,"Molecule file has angles but no nangles setting"); angleflag = tag_require = 1; angles(flag,line); } else if (strcmp(keyword,"Dihedrals") == 0) { if (ndihedrals == 0) error->all(FLERR,"Molecule file has dihedrals " - "but no ndihedrals setting"); + "but no ndihedrals setting"); dihedralflag = tag_require = 1; dihedrals(flag,line); } else if (strcmp(keyword,"Impropers") == 0) { if (nimpropers == 0) error->all(FLERR,"Molecule file has impropers " - "but no nimpropers setting"); + "but no nimpropers setting"); improperflag = tag_require = 1; impropers(flag,line); @@ -552,26 +566,26 @@ void Molecule::read(int flag) shakeatomflag = tag_require = 1; if (shaketypeflag) shakeflag = 1; if (!shakeflagflag) - error->all(FLERR,"Molecule file shake flags not before shake atoms"); + error->all(FLERR,"Molecule file shake flags not before shake atoms"); if (flag) shakeatom_read(line); else skip_lines(natoms,line); } else if (strcmp(keyword,"Shake Bond Types") == 0) { shaketypeflag = 1; if (shakeatomflag) shakeflag = 1; if (!shakeflagflag) - error->all(FLERR,"Molecule file shake flags not before shake bonds"); + error->all(FLERR,"Molecule file shake flags not before shake bonds"); if (flag) shaketype_read(line); else skip_lines(natoms,line); } else if (strcmp(keyword,"Body Integers") == 0) { if (bodyflag == 0 || nibody == 0) - error->all(FLERR,"Molecule file has body params " + error->all(FLERR,"Molecule file has body params " "but no setting for them"); ibodyflag = 1; body(flag,0,line); } else if (strcmp(keyword,"Body Doubles") == 0) { if (bodyflag == 0 || ndbody == 0) - error->all(FLERR,"Molecule file has body params " + error->all(FLERR,"Molecule file has body params " "but no setting for them"); dbodyflag = 1; body(flag,1,line); @@ -618,7 +632,7 @@ void Molecule::read(int flag) // body particle must have natom = 1 // set radius by having body class compute its own radius - + if (bodyflag) { radiusflag = 1; if (natoms != 1) @@ -641,12 +655,9 @@ void Molecule::coords(char *line) int tmp; for (int i = 0; i < natoms; i++) { readline(line); - if (i == 0) { - int nwords = atom->count_words(line); - if (nwords != 4) - error->all(FLERR,"Invalid Coords section in molecule file"); - } - sscanf(line,"%d %lg %lg %lg",&tmp,&x[i][0],&x[i][1],&x[i][2]); + if (4 != sscanf(line,"%d %lg %lg %lg",&tmp,&x[i][0],&x[i][1],&x[i][2])) + error->all(FLERR,"Invalid Coords section in molecule file"); + x[i][0] *= sizescale; x[i][1] *= sizescale; x[i][2] *= sizescale; @@ -669,12 +680,8 @@ void Molecule::types(char *line) int tmp; for (int i = 0; i < natoms; i++) { readline(line); - if (i == 0) { - int nwords = atom->count_words(line); - if (nwords != 2) - error->all(FLERR,"Invalid Types section in molecule file"); - } - sscanf(line,"%d %d",&tmp,&type[i]); + if (2 != sscanf(line,"%d %d",&tmp,&type[i])) + error->all(FLERR,"Invalid Types section in molecule file"); type[i] += toffset; } @@ -695,12 +702,8 @@ void Molecule::charges(char *line) int tmp; for (int i = 0; i < natoms; i++) { readline(line); - if (i == 0) { - int nwords = atom->count_words(line); - if (nwords != 2) - error->all(FLERR,"Invalid Charges section in molecule file"); - } - sscanf(line,"%d %lg",&tmp,&q[i]); + if (2 != sscanf(line,"%d %lg",&tmp,&q[i])) + error->all(FLERR,"Invalid Charges section in molecule file"); } } @@ -714,12 +717,8 @@ void Molecule::diameters(char *line) maxradius = 0.0; for (int i = 0; i < natoms; i++) 
{ readline(line); - if (i == 0) { - int nwords = atom->count_words(line); - if (nwords != 2) - error->all(FLERR,"Invalid Diameters section in molecule file"); - } - sscanf(line,"%d %lg",&tmp,&radius[i]); + if (2 != sscanf(line,"%d %lg",&tmp,&radius[i])) + error->all(FLERR,"Invalid Diameters section in molecule file"); radius[i] *= sizescale; radius[i] *= 0.5; maxradius = MAX(maxradius,radius[i]); @@ -739,12 +738,8 @@ void Molecule::masses(char *line) int tmp; for (int i = 0; i < natoms; i++) { readline(line); - if (i == 0) { - int nwords = atom->count_words(line); - if (nwords != 2) - error->all(FLERR,"Invalid Masses section in molecule file"); - } - sscanf(line,"%d %lg",&tmp,&rmass[i]); + if (2 != sscanf(line,"%d %lg",&tmp,&rmass[i])) + error->all(FLERR,"Invalid Masses section in molecule file"); rmass[i] *= sizescale*sizescale*sizescale; } @@ -773,17 +768,13 @@ void Molecule::bonds(int flag, char *line) for (int i = 0; i < nbonds; i++) { readline(line); - if (i == 0) { - int nwords = atom->count_words(line); - if (nwords != 4) - error->all(FLERR,"Invalid Bonds section in molecule file"); - } - sscanf(line,"%d %d " TAGINT_FORMAT " " TAGINT_FORMAT, - &tmp,&itype,&atom1,&atom2); + if (4 != sscanf(line,"%d %d " TAGINT_FORMAT " " TAGINT_FORMAT, + &tmp,&itype,&atom1,&atom2)) + error->all(FLERR,"Invalid Bonds section in molecule file"); itype += boffset; if (atom1 <= 0 || atom1 > natoms || - atom2 <= 0 || atom2 > natoms) + atom2 <= 0 || atom2 > natoms) error->one(FLERR,"Invalid atom ID in Bonds section of molecule file"); if (itype <= 0) error->one(FLERR,"Invalid bond type in Bonds section of molecule file"); @@ -795,10 +786,10 @@ void Molecule::bonds(int flag, char *line) bond_atom[m][num_bond[m]] = atom2; num_bond[m]++; if (newton_bond == 0) { - m = atom2-1; - bond_type[m][num_bond[m]] = itype; - bond_atom[m][num_bond[m]] = atom1; - num_bond[m]++; + m = atom2-1; + bond_type[m][num_bond[m]] = itype; + bond_atom[m][num_bond[m]] = atom1; + num_bond[m]++; } } else { count[atom1-1]++; @@ -835,13 +826,9 @@ void Molecule::angles(int flag, char *line) for (int i = 0; i < nangles; i++) { readline(line); - if (i == 0) { - int nwords = atom->count_words(line); - if (nwords != 5) - error->all(FLERR,"Invalid Angles section in molecule file"); - } - sscanf(line,"%d %d " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT, - &tmp,&itype,&atom1,&atom2,&atom3); + if (5 != sscanf(line,"%d %d " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT, + &tmp,&itype,&atom1,&atom2,&atom3)) + error->all(FLERR,"Invalid Angles section in molecule file"); itype += aoffset; if (atom1 <= 0 || atom1 > natoms || @@ -860,24 +847,24 @@ void Molecule::angles(int flag, char *line) angle_atom3[m][num_angle[m]] = atom3; num_angle[m]++; if (newton_bond == 0) { - m = atom1-1; - angle_type[m][num_angle[m]] = itype; - angle_atom1[m][num_angle[m]] = atom1; - angle_atom2[m][num_angle[m]] = atom2; - angle_atom3[m][num_angle[m]] = atom3; - num_angle[m]++; - m = atom3-1; - angle_type[m][num_angle[m]] = itype; - angle_atom1[m][num_angle[m]] = atom1; - angle_atom2[m][num_angle[m]] = atom2; - angle_atom3[m][num_angle[m]] = atom3; - num_angle[m]++; + m = atom1-1; + angle_type[m][num_angle[m]] = itype; + angle_atom1[m][num_angle[m]] = atom1; + angle_atom2[m][num_angle[m]] = atom2; + angle_atom3[m][num_angle[m]] = atom3; + num_angle[m]++; + m = atom3-1; + angle_type[m][num_angle[m]] = itype; + angle_atom1[m][num_angle[m]] = atom1; + angle_atom2[m][num_angle[m]] = atom2; + angle_atom3[m][num_angle[m]] = atom3; + num_angle[m]++; } } else { 
count[atom2-1]++; if (newton_bond == 0) { - count[atom1-1]++; - count[atom3-1]++; + count[atom1-1]++; + count[atom3-1]++; } } } @@ -911,14 +898,10 @@ void Molecule::dihedrals(int flag, char *line) for (int i = 0; i < ndihedrals; i++) { readline(line); - if (i == 0) { - int nwords = atom->count_words(line); - if (nwords != 6) - error->all(FLERR,"Invalid Dihedrals section in molecule file"); - } - sscanf(line,"%d %d " TAGINT_FORMAT " " TAGINT_FORMAT " " + if (6 != sscanf(line,"%d %d " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT " ", - &tmp,&itype,&atom1,&atom2,&atom3,&atom4); + &tmp,&itype,&atom1,&atom2,&atom3,&atom4)) + error->all(FLERR,"Invalid Dihedrals section in molecule file"); itype += doffset; if (atom1 <= 0 || atom1 > natoms || @@ -926,10 +909,10 @@ void Molecule::dihedrals(int flag, char *line) atom3 <= 0 || atom3 > natoms || atom4 <= 0 || atom4 > natoms) error->one(FLERR, - "Invalid atom ID in dihedrals section of molecule file"); + "Invalid atom ID in dihedrals section of molecule file"); if (itype <= 0) error->one(FLERR, - "Invalid dihedral type in dihedrals section of molecule file"); + "Invalid dihedral type in dihedrals section of molecule file"); if (flag) { m = atom2-1; @@ -941,34 +924,34 @@ void Molecule::dihedrals(int flag, char *line) dihedral_atom4[m][num_dihedral[m]] = atom4; num_dihedral[m]++; if (newton_bond == 0) { - m = atom1-1; - dihedral_type[m][num_dihedral[m]] = itype; - dihedral_atom1[m][num_dihedral[m]] = atom1; - dihedral_atom2[m][num_dihedral[m]] = atom2; - dihedral_atom3[m][num_dihedral[m]] = atom3; - dihedral_atom4[m][num_dihedral[m]] = atom4; - num_dihedral[m]++; - m = atom3-1; - dihedral_type[m][num_dihedral[m]] = itype; - dihedral_atom1[m][num_dihedral[m]] = atom1; - dihedral_atom2[m][num_dihedral[m]] = atom2; - dihedral_atom3[m][num_dihedral[m]] = atom3; - dihedral_atom4[m][num_dihedral[m]] = atom4; - num_dihedral[m]++; - m = atom4-1; - dihedral_type[m][num_dihedral[m]] = itype; - dihedral_atom1[m][num_dihedral[m]] = atom1; - dihedral_atom2[m][num_dihedral[m]] = atom2; - dihedral_atom3[m][num_dihedral[m]] = atom3; - dihedral_atom4[m][num_dihedral[m]] = atom4; - num_dihedral[m]++; + m = atom1-1; + dihedral_type[m][num_dihedral[m]] = itype; + dihedral_atom1[m][num_dihedral[m]] = atom1; + dihedral_atom2[m][num_dihedral[m]] = atom2; + dihedral_atom3[m][num_dihedral[m]] = atom3; + dihedral_atom4[m][num_dihedral[m]] = atom4; + num_dihedral[m]++; + m = atom3-1; + dihedral_type[m][num_dihedral[m]] = itype; + dihedral_atom1[m][num_dihedral[m]] = atom1; + dihedral_atom2[m][num_dihedral[m]] = atom2; + dihedral_atom3[m][num_dihedral[m]] = atom3; + dihedral_atom4[m][num_dihedral[m]] = atom4; + num_dihedral[m]++; + m = atom4-1; + dihedral_type[m][num_dihedral[m]] = itype; + dihedral_atom1[m][num_dihedral[m]] = atom1; + dihedral_atom2[m][num_dihedral[m]] = atom2; + dihedral_atom3[m][num_dihedral[m]] = atom3; + dihedral_atom4[m][num_dihedral[m]] = atom4; + num_dihedral[m]++; } } else { count[atom2-1]++; if (newton_bond == 0) { - count[atom1-1]++; - count[atom3-1]++; - count[atom4-1]++; + count[atom1-1]++; + count[atom3-1]++; + count[atom4-1]++; } } } @@ -1002,14 +985,10 @@ void Molecule::impropers(int flag, char *line) for (int i = 0; i < nimpropers; i++) { readline(line); - if (i == 0) { - int nwords = atom->count_words(line); - if (nwords != 6) - error->all(FLERR,"Invalid Impropers section in molecule file"); - } - sscanf(line,"%d %d " TAGINT_FORMAT " " TAGINT_FORMAT " " + if (6 != sscanf(line,"%d %d " TAGINT_FORMAT " " TAGINT_FORMAT " " 
TAGINT_FORMAT " " TAGINT_FORMAT " ", - &tmp,&itype,&atom1,&atom2,&atom3,&atom4); + &tmp,&itype,&atom1,&atom2,&atom3,&atom4)) + error->all(FLERR,"Invalid Impropers section in molecule file"); itype += ioffset; if (atom1 <= 0 || atom1 > natoms || @@ -1017,10 +996,10 @@ void Molecule::impropers(int flag, char *line) atom3 <= 0 || atom3 > natoms || atom4 <= 0 || atom4 > natoms) error->one(FLERR, - "Invalid atom ID in impropers section of molecule file"); + "Invalid atom ID in impropers section of molecule file"); if (itype <= 0) error->one(FLERR, - "Invalid improper type in impropers section of molecule file"); + "Invalid improper type in impropers section of molecule file"); if (flag) { m = atom2-1; @@ -1032,34 +1011,34 @@ void Molecule::impropers(int flag, char *line) improper_atom4[m][num_improper[m]] = atom4; num_improper[m]++; if (newton_bond == 0) { - m = atom1-1; - improper_type[m][num_improper[m]] = itype; - improper_atom1[m][num_improper[m]] = atom1; - improper_atom2[m][num_improper[m]] = atom2; - improper_atom3[m][num_improper[m]] = atom3; - improper_atom4[m][num_improper[m]] = atom4; - num_improper[m]++; - m = atom3-1; - improper_type[m][num_improper[m]] = itype; - improper_atom1[m][num_improper[m]] = atom1; - improper_atom2[m][num_improper[m]] = atom2; - improper_atom3[m][num_improper[m]] = atom3; - improper_atom4[m][num_improper[m]] = atom4; - num_improper[m]++; - m = atom4-1; - improper_type[m][num_improper[m]] = itype; - improper_atom1[m][num_improper[m]] = atom1; - improper_atom2[m][num_improper[m]] = atom2; - improper_atom3[m][num_improper[m]] = atom3; - improper_atom4[m][num_improper[m]] = atom4; - num_improper[m]++; + m = atom1-1; + improper_type[m][num_improper[m]] = itype; + improper_atom1[m][num_improper[m]] = atom1; + improper_atom2[m][num_improper[m]] = atom2; + improper_atom3[m][num_improper[m]] = atom3; + improper_atom4[m][num_improper[m]] = atom4; + num_improper[m]++; + m = atom3-1; + improper_type[m][num_improper[m]] = itype; + improper_atom1[m][num_improper[m]] = atom1; + improper_atom2[m][num_improper[m]] = atom2; + improper_atom3[m][num_improper[m]] = atom3; + improper_atom4[m][num_improper[m]] = atom4; + num_improper[m]++; + m = atom4-1; + improper_type[m][num_improper[m]] = itype; + improper_atom1[m][num_improper[m]] = atom1; + improper_atom2[m][num_improper[m]] = atom2; + improper_atom3[m][num_improper[m]] = atom3; + improper_atom4[m][num_improper[m]] = atom4; + num_improper[m]++; } } else { count[atom2-1]++; if (newton_bond == 0) { - count[atom1-1]++; - count[atom3-1]++; - count[atom4-1]++; + count[atom1-1]++; + count[atom3-1]++; + count[atom4-1]++; } } } @@ -1087,13 +1066,9 @@ void Molecule::nspecial_read(int flag, char *line) for (int i = 0; i < natoms; i++) { readline(line); - if (i == 0) { - int nwords = atom->count_words(line); - if (nwords != 4) - error->all(FLERR,"Invalid Special Bond Counts section in " - "molecule file"); - } - sscanf(line,"%d %d %d %d",&tmp,&c1,&c2,&c3); + if (4 != sscanf(line,"%d %d %d %d",&tmp,&c1,&c2,&c3)) + error->all(FLERR,"Invalid Special Bond Counts section in " + "molecule file"); if (flag) { nspecial[i][0] = c1; @@ -1117,13 +1092,13 @@ void Molecule::special_read(char *line) nwords = parse(line,words,maxspecial+1); if (nwords != nspecial[i][2]+1) error->all(FLERR,"Molecule file special list " - "does not match special count"); + "does not match special count"); for (m = 1; m < nwords; m++) { special[i][m-1] = ATOTAGINT(words[m]); if (special[i][m-1] <= 0 || special[i][m-1] > natoms || - special[i][m-1] == i+1) - 
error->all(FLERR,"Invalid special atom index in molecule file"); + special[i][m-1] == i+1) + error->all(FLERR,"Invalid special atom index in molecule file"); } } @@ -1229,7 +1204,8 @@ void Molecule::shakeflag_read(char *line) int tmp; for (int i = 0; i < natoms; i++) { readline(line); - sscanf(line,"%d %d",&tmp,&shake_flag[i]); + if (2 != sscanf(line,"%d %d",&tmp,&shake_flag[i])) + error->all(FLERR,"Invalid Shake Flags section in molecule file"); } for (int i = 0; i < natoms; i++) @@ -1243,23 +1219,32 @@ void Molecule::shakeflag_read(char *line) void Molecule::shakeatom_read(char *line) { - int tmp; + int tmp, nmatch, nwant; for (int i = 0; i < natoms; i++) { readline(line); - if (shake_flag[i] == 1) - sscanf(line,"%d " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT, - &tmp,&shake_atom[i][0],&shake_atom[i][1],&shake_atom[i][2]); - else if (shake_flag[i] == 2) - sscanf(line,"%d " TAGINT_FORMAT " " TAGINT_FORMAT, - &tmp,&shake_atom[i][0],&shake_atom[i][1]); - else if (shake_flag[i] == 3) - sscanf(line,"%d " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT, - &tmp,&shake_atom[i][0],&shake_atom[i][1],&shake_atom[i][2]); - else if (shake_flag[i] == 4) - sscanf(line,"%d " TAGINT_FORMAT " " TAGINT_FORMAT " " - TAGINT_FORMAT " " TAGINT_FORMAT, - &tmp,&shake_atom[i][0],&shake_atom[i][1], - &shake_atom[i][2],&shake_atom[i][3]); + if (shake_flag[i] == 1) { + nmatch = sscanf(line,"%d " TAGINT_FORMAT " " TAGINT_FORMAT + " " TAGINT_FORMAT,&tmp,&shake_atom[i][0], + &shake_atom[i][1],&shake_atom[i][2]); + nwant = 4; + } else if (shake_flag[i] == 2) { + nmatch = sscanf(line,"%d " TAGINT_FORMAT " " TAGINT_FORMAT, + &tmp,&shake_atom[i][0],&shake_atom[i][1]); + nwant = 3; + } else if (shake_flag[i] == 3) { + nmatch = sscanf(line,"%d " TAGINT_FORMAT " " TAGINT_FORMAT + " " TAGINT_FORMAT,&tmp,&shake_atom[i][0], + &shake_atom[i][1],&shake_atom[i][2]); + nwant = 4; + } else if (shake_flag[i] == 4) { + nmatch = sscanf(line,"%d " TAGINT_FORMAT " " TAGINT_FORMAT " " + TAGINT_FORMAT " " TAGINT_FORMAT, + &tmp,&shake_atom[i][0],&shake_atom[i][1], + &shake_atom[i][2],&shake_atom[i][3]); + nwant = 5; + } + if (nmatch != nwant) + error->all(FLERR,"Invalid shake atom in molecule file"); } for (int i = 0; i < natoms; i++) { @@ -1277,19 +1262,27 @@ void Molecule::shakeatom_read(char *line) void Molecule::shaketype_read(char *line) { - int tmp; + int tmp,nmatch,nwant; for (int i = 0; i < natoms; i++) { readline(line); - if (shake_flag[i] == 1) - sscanf(line,"%d %d %d %d",&tmp, - &shake_type[i][0],&shake_type[i][1],&shake_type[i][2]); - else if (shake_flag[i] == 2) - sscanf(line,"%d %d",&tmp,&shake_type[i][0]); - else if (shake_flag[i] == 3) - sscanf(line,"%d %d %d",&tmp,&shake_type[i][0],&shake_type[i][1]); - else if (shake_flag[i] == 4) - sscanf(line,"%d %d %d %d",&tmp, - &shake_type[i][0],&shake_type[i][1],&shake_type[i][2]); + if (shake_flag[i] == 1) { + nmatch = sscanf(line,"%d %d %d %d",&tmp,&shake_type[i][0], + &shake_type[i][1],&shake_type[i][2]); + nwant = 4; + } else if (shake_flag[i] == 2) { + nmatch = sscanf(line,"%d %d",&tmp,&shake_type[i][0]); + nwant = 2; + } else if (shake_flag[i] == 3) { + nmatch = sscanf(line,"%d %d %d",&tmp,&shake_type[i][0], + &shake_type[i][1]); + nwant = 3; + } else if (shake_flag[i] == 4) { + nmatch = sscanf(line,"%d %d %d %d",&tmp,&shake_type[i][0], + &shake_type[i][1],&shake_type[i][2]); + nwant = 4; + } + if (nmatch != nwant) + error->all(FLERR,"Invalid shake type data in molecule file"); } for (int i = 0; i < natoms; i++) { @@ -1501,46 +1494,46 @@ void Molecule::allocate() if 
(bondflag) { memory->create(bond_type,natoms,bond_per_atom, - "molecule:bond_type"); + "molecule:bond_type"); memory->create(bond_atom,natoms,bond_per_atom, - "molecule:bond_atom"); + "molecule:bond_atom"); } if (angleflag) { memory->create(angle_type,natoms,angle_per_atom, - "molecule:angle_type"); + "molecule:angle_type"); memory->create(angle_atom1,natoms,angle_per_atom, - "molecule:angle_atom1"); + "molecule:angle_atom1"); memory->create(angle_atom2,natoms,angle_per_atom, - "molecule:angle_atom2"); + "molecule:angle_atom2"); memory->create(angle_atom3,natoms,angle_per_atom, - "molecule:angle_atom3"); + "molecule:angle_atom3"); } if (dihedralflag) { memory->create(dihedral_type,natoms,dihedral_per_atom, - "molecule:dihedral_type"); + "molecule:dihedral_type"); memory->create(dihedral_atom1,natoms,dihedral_per_atom, - "molecule:dihedral_atom1"); + "molecule:dihedral_atom1"); memory->create(dihedral_atom2,natoms,dihedral_per_atom, - "molecule:dihedral_atom2"); + "molecule:dihedral_atom2"); memory->create(dihedral_atom3,natoms,dihedral_per_atom, - "molecule:dihedral_atom3"); + "molecule:dihedral_atom3"); memory->create(dihedral_atom4,natoms,dihedral_per_atom, - "molecule:dihedral_atom4"); + "molecule:dihedral_atom4"); } if (improperflag) { memory->create(improper_type,natoms,improper_per_atom, - "molecule:improper_type"); + "molecule:improper_type"); memory->create(improper_atom1,natoms,improper_per_atom, - "molecule:improper_atom1"); + "molecule:improper_atom1"); memory->create(improper_atom2,natoms,improper_per_atom, - "molecule:improper_atom2"); + "molecule:improper_atom2"); memory->create(improper_atom3,natoms,improper_per_atom, - "molecule:improper_atom3"); + "molecule:improper_atom3"); memory->create(improper_atom4,natoms,improper_per_atom, - "molecule:improper_atom4"); + "molecule:improper_atom4"); } if (shakeflag) { @@ -1653,7 +1646,7 @@ void Molecule::parse_keyword(int flag, char *line, char *keyword) if (me == 0) { if (fgets(line,MAXLINE,fp) == NULL) eof = 1; while (eof == 0 && strspn(line," \t\n\r") == strlen(line)) { - if (fgets(line,MAXLINE,fp) == NULL) eof = 1; + if (fgets(line,MAXLINE,fp) == NULL) eof = 1; } if (fgets(keyword,MAXLINE,fp) == NULL) eof = 1; } diff --git a/src/pair.cpp b/src/pair.cpp index ce711c4f5d..05319e33f2 100644 --- a/src/pair.cpp +++ b/src/pair.cpp @@ -75,7 +75,7 @@ Pair::Pair(LAMMPS *lmp) : Pointers(lmp) ewaldflag = pppmflag = msmflag = dispersionflag = tip4pflag = dipoleflag = 0; reinitflag = 1; - // pair_modify settingsx + // pair_modify settings compute_flag = 1; manybody_flag = 0; diff --git a/src/pair_hybrid.cpp b/src/pair_hybrid.cpp index 48364a86c4..4a98cca614 100644 --- a/src/pair_hybrid.cpp +++ b/src/pair_hybrid.cpp @@ -36,7 +36,7 @@ PairHybrid::PairHybrid(LAMMPS *lmp) : Pair(lmp), map(NULL), special_lj(NULL), special_coul(NULL), compute_tally(NULL) { nstyles = 0; - + outerflag = 0; respaflag = 0; } @@ -487,7 +487,7 @@ void PairHybrid::init_style() if (((force->special_lj[i] == 0.0) || (force->special_lj[i] == 1.0)) && (force->special_lj[i] != special_lj[istyle][i])) error->all(FLERR,"Pair_modify special setting for pair hybrid " - "incompatible with global special_bonds setting"); + "incompatible with global special_bonds setting"); } } @@ -497,7 +497,7 @@ void PairHybrid::init_style() || (force->special_coul[i] == 1.0)) && (force->special_coul[i] != special_coul[istyle][i])) error->all(FLERR,"Pair_modify special setting for pair hybrid " - "incompatible with global special_bonds setting"); + "incompatible with global special_bonds setting"); 
} } } @@ -829,6 +829,12 @@ void PairHybrid::modify_params(int narg, char **arg) Pair::modify_params(narg,arg); for (int m = 0; m < nstyles; m++) styles[m]->modify_params(narg,arg); } + + // reset global compute_flag since there may have been changes + // to any of the substyles + compute_flag = 0; + for (int m = 0; m < nstyles; m++) + if (styles[m]->compute_flag) compute_flag = 1; } /* ---------------------------------------------------------------------- diff --git a/src/version.h b/src/version.h index 0a22a92328..ff33fa3b06 100644 --- a/src/version.h +++ b/src/version.h @@ -1 +1 @@ -#define LAMMPS_VERSION "10 Aug 2017" +#define LAMMPS_VERSION "17 Aug 2017" diff --git a/tools/msi2lmp/README b/tools/msi2lmp/README index db9b1aca5e..9ac7af5607 100644 --- a/tools/msi2lmp/README +++ b/tools/msi2lmp/README @@ -140,6 +140,8 @@ msi2lmp has the following known limitations: - there is no support for auto-equivalences to supplement fully parameterized interactions with heuristic ones - there is no support for bond increments +- there is no support for coordinates defined by symmetry operations, + i.e. the .mdf file has to be set up for space group P1 ------------------------------------------------------------------------
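For reference, the type2z2r setup in the eam/fs file2array() code at the top of this section flattens the lower triangle of the element matrix: the loop "for (m = 0; m < irow; m++) n += m + 1" counts the entries of all rows before irow, so the flat index is irow*(irow+1)/2 + icol. A minimal standalone sketch of that identity (illustrative only, not part of the patch):

#include <cassert>

// Flat lower-triangular index for an element pair with irow >= icol >= 0;
// matches the summation loop used to fill type2z2r.
int z2r_index(int irow, int icol) {
  return irow * (irow + 1) / 2 + icol;
}

int main() {
  // For 3 elements the z2r arrays are ordered
  // (0,0)=0, (1,0)=1, (1,1)=2, (2,0)=3, (2,1)=4, (2,2)=5.
  assert(z2r_index(2, 1) == 4);
  return 0;
}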
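The pair_gayberne_intel.cpp and pair_sw_intel.cpp hunks above compute the padding remainder with a bitwise AND instead of a modulo. The two are interchangeable only when pad_width is a power of two, which the padded widths used by these styles are assumed to be. A standalone sketch of the identity (illustrative only):

#include <cassert>

int main() {
  const int pad_width = 8;   // must be a power of two for the AND trick
  // For non-negative x, x % p equals x & (p - 1) exactly when p = 2^k.
  for (int x = 0; x < 1000; x++)
    assert(x % pad_width == (x & (pad_width - 1)));
  return 0;
}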
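The molecule.cpp changes above replace the per-section count_words() pre-checks with direct validation of the sscanf() return value, which reports the number of fields successfully converted; malformed or short lines are now caught on every line of a section rather than only the first. A minimal sketch of the pattern, using a hypothetical input line (not LAMMPS code):

#include <cstdio>

int main() {
  const char *line = "7 1.0 2.0";   // one field short of id/x/y/z
  int id;
  double x, y, z;
  // sscanf returns the number of successful conversions (3 here), so
  // comparing it against the expected count of 4 flags the bad line.
  if (4 != sscanf(line, "%d %lg %lg %lg", &id, &x, &y, &z))
    fprintf(stderr, "invalid Coords line: %s\n", line);
  return 0;
}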