Merge branch 'master' into USER-DPD_kokkos

This commit is contained in:
Stan Moore 2017-08-22 13:50:19 -06:00
commit 5c985946d5
276 changed files with 26793 additions and 4212 deletions

21
.github/CODEOWNERS vendored Normal file
View File

@ -0,0 +1,21 @@
# This file contains file patterns that triggers automatic
# code review requests from users that are owners of these files
# Order matters, the last match has the highest precedence
# library folders
lib/colvars/* @giacomofiorin
lib/compress/* @akohlmey
lib/kokkos/* @stanmoore1
lib/molfile/* @akohlmey
lib/qmmm/* @akohlmey
lib/vtk/* @rbberger
# packages
src/KOKKOS @stanmoore1
src/USER-CGSDK @akohlmey
src/USER-COLVARS @giacomofiorin
src/USER-OMP @akohlmey
src/USER-QMMM @akohlmey
# tools
tools/msi2lmp/* @akohlmey

8
.gitignore vendored
View File

@ -32,3 +32,11 @@ log.cite
.Trashes
ehthumbs.db
Thumbs.db
#cmake
/build*
/CMakeCache.txt
/CMakeFiles/
/Makefile
/cmake_install.cmake
/lmp

547
cmake/CMakeLists.txt Normal file
View File

@ -0,0 +1,547 @@
########################################
# CMake build system
# This file is part of LAMMPS
# Created by Christoph Junghans and Richard Berger
cmake_minimum_required(VERSION 3.1)
project(lammps)
set(SOVERSION 0)
set(LAMMPS_SOURCE_DIR ${CMAKE_SOURCE_DIR}/../src)
set(LAMMPS_LIB_SOURCE_DIR ${CMAKE_SOURCE_DIR}/../lib)
set(LAMMPS_LIB_BINARY_DIR ${CMAKE_BINARY_DIR}/lib)
#To not conflict with old Makefile build system, we build everything here
file(GLOB LIB_SOURCES ${LAMMPS_SOURCE_DIR}/*.cpp)
file(GLOB LMP_SOURCES ${LAMMPS_SOURCE_DIR}/main.cpp)
list(REMOVE_ITEM LIB_SOURCES ${LMP_SOURCES})
# Cmake modules/macros are in a subdirectory to keep this file cleaner
set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/Modules)
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CXX_FLAGS)
#release comes with -O3 by default
set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." FORCE)
endif(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CXX_FLAGS)
foreach(STYLE_FILE style_angle.h style_atom.h style_body.h style_bond.h style_command.h style_compute.h style_dihedral.h style_dump.h
style_fix.h style_improper.h style_integrate.h style_kspace.h style_minimize.h style_nbin.h style_npair.h style_nstencil.h
style_ntopo.h style_pair.h style_reader.h style_region.h)
if(EXISTS ${LAMMPS_SOURCE_DIR}/${STYLE_FILE})
message(FATAL_ERROR "There is a ${STYLE_FILE} in ${LAMMPS_SOURCE_DIR}, please clean up the source directory first")
endif()
endforeach()
enable_language(CXX)
######################################################################
# compiler tests
# these need ot be done early (before further tests).
#####################################################################
include(CheckCCompilerFlag)
########################################################################
# User input options #
########################################################################
option(BUILD_SHARED_LIBS "Build shared libs" OFF)
option(INSTALL_LIB "Install lammps library and header" ON)
include(GNUInstallDirs)
set(LAMMPS_LINK_LIBS)
option(ENABLE_MPI "Build MPI version" OFF)
if(ENABLE_MPI)
find_package(MPI REQUIRED)
include_directories(${MPI_C_INCLUDE_PATH})
list(APPEND LAMMPS_LINK_LIBS ${MPI_CXX_LIBRARIES})
option(LAMMPS_LONGLONG_TO_LONG "Workaround if your system or MPI version does not recognize 'long long' data types" OFF)
if(LAMMPS_LONGLONG_TO_LONG)
add_definitions(-DLAMMPS_LONGLONG_TO_LONG)
endif()
else()
file(GLOB MPI_SOURCES ${LAMMPS_SOURCE_DIR}/STUBS/mpi.c)
list(APPEND LIB_SOURCES ${MPI_SOURCES})
include_directories(${LAMMPS_SOURCE_DIR}/STUBS)
endif()
set(LAMMPS_SIZE_LIMIT "LAMMPS_SMALLBIG" CACHE STRING "Lammps size limit")
set_property(CACHE LAMMPS_SIZE_LIMIT PROPERTY STRINGS LAMMPS_SMALLBIG LAMMPS_BIGBIG LAMMPS_SMALLSMALL)
add_definitions(-D${LAMMPS_SIZE_LIMIT})
set(LAMMPS_MEMALIGN "64" CACHE STRING "enables the use of the posix_memalign() call instead of malloc() when large chunks or memory are allocated by LAMMPS")
add_definitions(-DLAMMPS_MEMALIGN=${LAMMPS_MEMALIGN})
option(LAMMPS_EXCEPTIONS "enable the use of C++ exceptions for error messages (useful for library interface)" OFF)
if(LAMMPS_EXCEPTIONS)
add_definitions(-DLAMMPS_EXCEPTIONS)
endif()
option(CMAKE_VERBOSE_MAKEFILE "Verbose makefile" OFF)
option(ENABLE_TESTING "Enable testing" OFF)
if(ENABLE_TESTING)
enable_testing()
endif(ENABLE_TESTING)
option(ENABLE_ALL "Build all default packages" OFF)
set(DEFAULT_PACKAGES ASPHERE BODY CLASS2 COLLOID COMPRESS CORESHELL DIPOLE GRANULAR
KSPACE MANYBODY MC MEAM MISC MOLECULE PERI QEQ
REAX REPLICA RIGID SHOCK SNAP SRD)
set(OTHER_PACKAGES KIM PYTHON MSCG MPIIO VORONOI POEMS
USER-ATC USER-AWPMD USER-CGDNA
USER-CGSDK USER-COLVARS USER-DIFFRACTION USER-DPD USER-DRUDE USER-EFF
USER-FEP USER-H5MD USER-LB USER-MANIFOLD USER-MEAMC USER-MGPT USER-MISC
USER-MOLFILE USER-NETCDF USER-PHONON USER-QTB USER-REAXC USER-SMD
USER-SMTBQ USER-SPH USER-TALLY USER-VTK USER-QUIP USER-QMMM)
set(ACCEL_PACKAGES USER-OMP KOKKOS OPT USER-INTEL GPU)
foreach(PKG ${DEFAULT_PACKAGES})
option(ENABLE_${PKG} "Build ${PKG} Package" ${ENABLE_ALL})
endforeach()
foreach(PKG ${ACCEL_PACKAGES} ${OTHER_PACKAGES})
option(ENABLE_${PKG} "Build ${PKG} Package" OFF)
endforeach()
macro(pkg_depends PKG1 PKG2)
if(ENABLE_${PKG1} AND NOT ENABLE_${PKG2})
message(FATAL_ERROR "${PKG1} package needs LAMMPS to be build with ${PKG2}")
endif()
endmacro()
pkg_depends(MPIIO MPI)
pkg_depends(QEQ MANYBODY)
pkg_depends(USER-ATC MANYBODY)
pkg_depends(USER-H5MD MPI)
pkg_depends(USER-LB MPI)
pkg_depends(USER-MISC MANYBODY)
pkg_depends(USER-PHONON KSPACE)
if(ENABLE_BODY AND ENABLE_POEMS)
message(FATAL_ERROR "BODY and POEMS cannot be enabled at the same time")
endif()
######################################################
# packages with special compiler needs or external libs
######################################################
if(ENABLE_REAX OR ENABLE_MEAM OR ENABLE_USER-QUIP OR ENABLE_USER-QMMM)
enable_language(Fortran)
endif()
if(ENABLE_KOKKOS OR ENABLE_MSCG)
# starting with CMake 3.1 this is all you have to do to enforce C++11
set(CMAKE_CXX_STANDARD 11) # C++11...
set(CMAKE_CXX_STANDARD_REQUIRED ON) #...is required...
set(CMAKE_CXX_EXTENSIONS OFF) #...without compiler extensions like gnu++11
endif()
if(ENABLE_USER-OMP OR ENABLE_KOKKOS OR ENABLE_USER-INTEL)
find_package(OpenMP REQUIRED)
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
endif()
if(ENABLE_KSPACE)
set(FFT "KISSFFT" CACHE STRING "FFT library for KSPACE package")
set_property(CACHE FFT PROPERTY STRINGS KISSFFT FFTW3 MKL FFTW2)
if(NOT FFT STREQUAL "KISSFFT")
find_package(${FFT} REQUIRED)
add_definitions(-DFFT_${FFT})
include_directories(${${FFT}_INCLUDE_DIRS})
list(APPEND LAMMPS_LINK_LIBS ${${FFT}_LIBRARIES})
endif()
set(PACK_OPTIMIZATION "PACK_ARRAY" CACHE STRING "Optimization for FFT")
set_property(CACHE PACK_OPTIMIZATION PROPERTY STRINGS PACK_ARRAY PACK_POINTER PACK_MEMCPY)
if(NOT PACK_OPTIMIZATION STREQUAL "PACK_ARRAY")
add_definitions(-D${PACK_OPTIMIZATION})
endif()
endif()
if(ENABLE_MISC)
option(LAMMPS_XDR "include XDR compatibility files for doing particle dumps in XTC format" OFF)
if(LAMMPS_XDR)
add_definitions(-DLAMMPS_XDR)
endif()
endif()
if(ENABLE_MSCG OR ENABLE_USER-ATC OR ENABLE_USER-AWPMD OR ENABLE_USER-QUIP)
find_package(LAPACK)
if(LAPACK_FOUND)
list(APPEND LAMMPS_LINK_LIBS ${LAPACK_LIBRARIES})
else()
enable_language(Fortran)
file(GLOB LAPACK_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/linalg/*.f)
list(APPEND LIB_SOURCES ${LAPACK_SOURCES})
endif()
endif()
if(ENABLE_PYTHON)
find_package(PythonInterp REQUIRED)
find_package(PythonLibs REQUIRED)
add_definitions(-DLMP_PYTHON)
include_directories(${PYTHON_INCLUDE_DIR})
list(APPEND LAMMPS_LINK_LIBS ${PYTHON_LIBRARY})
if(NOT PYTHON_INSTDIR)
execute_process(COMMAND ${PYTHON_EXECUTABLE}
-c "import distutils.sysconfig as cg; print(cg.get_python_lib(1,0,prefix='${CMAKE_INSTALL_PREFIX}'))"
OUTPUT_VARIABLE PYTHON_INSTDIR OUTPUT_STRIP_TRAILING_WHITESPACE)
endif()
install(FILES ${CMAKE_SOURCE_DIR}/../python/lammps.py DESTINATION ${PYTHON_INSTDIR})
if(NOT BUILD_SHARED_LIBS)
message(FATAL_ERROR "Python package need lammps to be build shared, use -DBUILD_SHARED_LIBS=ON")
endif()
endif()
find_package(JPEG)
if(JPEG_FOUND)
add_definitions(-DLAMMPS_JPEG)
include_directories(${JPEG_INCLUDE_DIR})
list(APPEND LAMMPS_LINK_LIBS ${JPEG_LIBRARIES})
endif()
find_package(PNG)
find_package(ZLIB)
if(PNG_FOUND AND ZLIB_FOUND)
include_directories(${PNG_INCLUDE_DIRS} ${ZLIB_INCLUDE_DIRS})
list(APPEND LAMMPS_LINK_LIBS ${PNG_LIBRARIES} ${ZLIB_LIBRARIES})
add_definitions(-DLAMMPS_PNG)
endif()
find_program(GZIP_EXECUTABLE gzip)
find_package_handle_standard_args(GZIP REQUIRED_VARS GZIP_EXECUTABLE)
if(GZIP_FOUND)
add_definitions(-DLAMMPS_GZIP)
endif()
find_program(FFMPEG_EXECUTABLE ffmpeg)
find_package_handle_standard_args(FFMPEG REQUIRED_VARS FFMPEG_EXECUTABLE)
if(FFMPEG_FOUND)
add_definitions(-DLAMMPS_FFMPEG)
endif()
if(ENABLE_VORONOI)
find_package(VORO REQUIRED) #some distros
include_directories(${VORO_INCLUDE_DIRS})
list(APPEND LAMMPS_LINK_LIBS ${VORO_LIBRARIES})
endif()
if(ENABLE_USER-MOLFILE)
list(APPEND LAMMPS_LINK_LIBS ${CMAKE_DL_LIBS})
endif()
if(ENABLE_USER-NETCDF)
find_package(NetCDF REQUIRED)
include_directories(NETCDF_INCLUDE_DIR)
list(APPEND LAMMPS_LINK_LIBS ${NETCDF_LIBRARY})
add_definitions(-DLMP_HAS_NETCDF -DNC_64BIT_DATA=0x0020)
endif()
if(ENABLE_USER-SMD)
find_package(Eigen3 REQUIRED)
include_directories(${EIGEN3_INCLUDE_DIR})
endif()
if(ENABLE_USER-QUIP)
find_package(QUIP REQUIRED)
list(APPEND LAMMPS_LINK_LIBS ${QUIP_LIBRARIES} ${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES})
endif()
if(ENABLE_USER-QMMM)
find_package(QE REQUIRED)
include_directories(${QE_INCLUDE_DIRS})
list(APPEND LAMMPS_LINK_LIBS ${QE_LIBRARIES} ${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES})
endif()
if(ENABLE_USER-AWPMD)
include_directories(${LAMMPS_LIB_SOURCE_DIR}/awpmd/systems/interact
${LAMMPS_LIB_SOURCE_DIR}/awpmd/ivutils/include)
endif()
if(ENABLE_USER-H5MD)
find_package(HDF5 REQUIRED)
list(APPEND LAMMPS_LINK_LIBS ${HDF5_LIBRARIES})
include_directories(${HDF5_INCLUDE_DIRS} ${LAMMPS_LIB_SOURCE_DIR}/h5md/include)
endif()
if(ENABLE_USER-VTK)
find_package(VTK REQUIRED NO_MODULE)
include(${VTK_USE_FILE})
add_definitions(-DLAMMPS_VTK)
list(APPEND LAMMPS_LINK_LIBS ${VTK_LIBRARIES})
endif()
if(ENABLE_KIM)
find_package(KIM REQUIRED)
list(APPEND LAMMPS_LINK_LIBS ${KIM_LIBRARIES})
include_directories(${KIM_INCLUDE_DIRS})
endif()
if(ENABLE_MSCG)
find_package(GSL REQUIRED)
set(LAMMPS_LIB_MSCG_BIN_DIR ${LAMMPS_LIB_BINARY_DIR}/mscg)
set(MSCG_TARBALL ${LAMMPS_LIB_MSCG_BIN_DIR}/MS-CG-master.zip)
set(LAMMPS_LIB_MSCG_BIN_DIR ${LAMMPS_LIB_MSCG_BIN_DIR}/MSCG-release-master/src)
if(NOT EXISTS ${LAMMPS_LIB_MSCG_BIN_DIR})
if(NOT EXISTS ${MSCG_TARBALL})
message(STATUS "Downloading ${MSCG_TARBALL}")
file(DOWNLOAD
https://github.com/uchicago-voth/MSCG-release/archive/master.zip
${MSCG_TARBALL} SHOW_PROGRESS) #EXPECTED_MD5 cannot be due due to master
endif()
message(STATUS "Unpacking ${MSCG_TARBALL}")
execute_process(COMMAND ${CMAKE_COMMAND} -E tar xvf ${MSCG_TARBALL}
WORKING_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/mscg)
endif()
file(GLOB MSCG_SOURCES ${LAMMPS_LIB_MSCG_BIN_DIR}/*.cpp)
list(APPEND LIB_SOURCES ${MSCG_SOURCES})
foreach(MSCG_SOURCE ${MSCG_SOURCES})
set_property(SOURCE ${MSCG_SOURCE} APPEND PROPERTY COMPILE_DEFINITIONS
DIMENSION=3 _exclude_gromacs=1)
endforeach()
include_directories(${LAMMPS_LIB_MSCG_BIN_DIR} ${GSL_INCLUDE_DIRS})
list(APPEND LAMMPS_LINK_LIBS ${GSL_LIBRARIES})
endif()
########################################################################
# Basic system tests (standard libraries, headers, functions, types) #
########################################################################
include(CheckIncludeFile)
foreach(HEADER math.h)
check_include_file(${HEADER} FOUND_${HEADER})
if(NOT FOUND_${HEADER})
message(FATAL_ERROR "Could not find needed header - ${HEADER}")
endif(NOT FOUND_${HEADER})
endforeach(HEADER)
set(MATH_LIBRARIES "m" CACHE STRING "math library")
mark_as_advanced( MATH_LIBRARIES )
include(CheckLibraryExists)
foreach(FUNC sin cos)
check_library_exists(${MATH_LIBRARIES} ${FUNC} "" FOUND_${FUNC}_${MATH_LIBRARIES})
if(NOT FOUND_${FUNC}_${MATH_LIBRARIES})
message(FATAL_ERROR "Could not find needed math function - ${FUNC}")
endif(NOT FOUND_${FUNC}_${MATH_LIBRARIES})
endforeach(FUNC)
list(APPEND LAMMPS_LINK_LIBS ${MATH_LIBRARIES})
######################################
# Generate Basic Style files
######################################
include(StyleHeaderUtils)
RegisterStyles(${LAMMPS_SOURCE_DIR})
##############################################
# add sources of enabled packages
############################################
foreach(PKG ${DEFAULT_PACKAGES} ${OTHER_PACKAGES})
if(ENABLE_${PKG})
set(${PKG}_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/${PKG})
# detects styles in package and adds them to global list
RegisterStyles(${${PKG}_SOURCES_DIR})
file(GLOB ${PKG}_SOURCES ${${PKG}_SOURCES_DIR}/*.cpp)
list(APPEND LIB_SOURCES ${${PKG}_SOURCES})
include_directories(${${PKG}_SOURCES_DIR})
endif()
endforeach()
##############################################
# add lib sources of (simple) enabled packages
############################################
foreach(SIMPLE_LIB REAX MEAM POEMS USER-ATC USER-AWPMD USER-COLVARS USER-H5MD
USER-MOLFILE USER-QMMM)
if(ENABLE_${SIMPLE_LIB})
string(REGEX REPLACE "^USER-" "" SIMPLE_LIB "${SIMPLE_LIB}")
string(TOLOWER "${SIMPLE_LIB}" INC_DIR)
file(GLOB_RECURSE ${SIMPLE_LIB}_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/${INC_DIR}/*.F
${LAMMPS_LIB_SOURCE_DIR}/${INC_DIR}/*.c ${LAMMPS_LIB_SOURCE_DIR}/${INC_DIR}/*.cpp)
list(APPEND LIB_SOURCES ${${SIMPLE_LIB}_SOURCES})
include_directories(${LAMMPS_LIB_SOURCE_DIR}/${INC_DIR})
endif()
endforeach()
######################################################################
# packages which selectively include variants based on enabled styles
# e.g. accelerator packages
######################################################################
if(ENABLE_USER-OMP)
set(USER-OMP_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/USER-OMP)
set(USER-OMP_SOURCES ${USER-OMP_SOURCES_DIR}/thr_data.cpp
${USER-OMP_SOURCES_DIR}/thr_omp.cpp
${USER-OMP_SOURCES_DIR}/fix_nh_omp.cpp
${USER-OMP_SOURCES_DIR}/fix_nh_sphere_omp.cpp)
set_property(GLOBAL PROPERTY "OMP_SOURCES" "${USER-OMP_SOURCES}")
# detects styles which have USER-OMP version
RegisterStylesExt(${USER-OMP_SOURCES_DIR} omp OMP_SOURCES)
get_property(USER-OMP_SOURCES GLOBAL PROPERTY OMP_SOURCES)
list(APPEND LIB_SOURCES ${USER-OMP_SOURCES})
include_directories(${USER-OMP_SOURCES_DIR})
endif()
if(ENABLE_KOKKOS)
set(LAMMPS_LIB_KOKKOS_SRC_DIR ${LAMMPS_LIB_SOURCE_DIR}/kokkos)
set(LAMMPS_LIB_KOKKOS_BIN_DIR ${LAMMPS_LIB_BINARY_DIR}/kokkos)
add_definitions(-DLMP_KOKKOS)
add_subdirectory(${LAMMPS_LIB_KOKKOS_SRC_DIR} ${LAMMPS_LIB_KOKKOS_BIN_DIR})
set(Kokkos_INCLUDE_DIRS ${LAMMPS_LIB_KOKKOS_SRC_DIR}/core/src
${LAMMPS_LIB_KOKKOS_SRC_DIR}/containers/src
${LAMMPS_LIB_KOKKOS_SRC_DIR}/algorithms/src
${LAMMPS_LIB_KOKKOS_BIN_DIR})
include_directories(${Kokkos_INCLUDE_DIRS})
list(APPEND LAMMPS_LINK_LIBS kokkos)
set(KOKKOS_PKG_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/KOKKOS)
set(KOKKOS_PKG_SOURCES ${KOKKOS_PKG_SOURCES_DIR}/kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/atom_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/atom_vec_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/comm_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/comm_tiled_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/neighbor_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/neigh_list_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/neigh_bond_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/fix_nh_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/domain_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/modify_kokkos.cpp)
set_property(GLOBAL PROPERTY "KOKKOS_PKG_SOURCES" "${KOKKOS_PKG_SOURCES}")
# detects styles which have KOKKOS version
RegisterStylesExt(${KOKKOS_PKG_SOURCES_DIR} kokkos KOKKOS_PKG_SOURCES)
get_property(KOKKOS_PKG_SOURCES GLOBAL PROPERTY KOKKOS_PKG_SOURCES)
list(APPEND LIB_SOURCES ${KOKKOS_PKG_SOURCES})
include_directories(${KOKKOS_PKG_SOURCES_DIR})
endif()
if(ENABLE_OPT)
set(OPT_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/OPT)
set(OPT_SOURCES)
set_property(GLOBAL PROPERTY "OPT_SOURCES" "${OPT_SOURCES}")
# detects styles which have OPT version
RegisterStylesExt(${OPT_SOURCES_DIR} opt OPT_SOURCES)
get_property(OPT_SOURCES GLOBAL PROPERTY OPT_SOURCES)
list(APPEND LIB_SOURCES ${OPT_SOURCES})
include_directories(${OPT_SOURCES_DIR})
endif()
if(ENABLE_USER-INTEL)
set(USER-INTEL_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/USER-INTEL)
set(USER-INTEL_SOURCES ${USER-INTEL_SOURCES_DIR}/intel_preprocess.h
${USER-INTEL_SOURCES_DIR}/intel_buffers.h
${USER-INTEL_SOURCES_DIR}/intel_buffers.cpp
${USER-INTEL_SOURCES_DIR}/math_extra_intel.h
${USER-INTEL_SOURCES_DIR}/nbin_intel.h
${USER-INTEL_SOURCES_DIR}/nbin_intel.cpp
${USER-INTEL_SOURCES_DIR}/npair_intel.h
${USER-INTEL_SOURCES_DIR}/npair_intel.cpp
${USER-INTEL_SOURCES_DIR}/intel_simd.h
${USER-INTEL_SOURCES_DIR}/intel_intrinsics.h)
set_property(GLOBAL PROPERTY "USER-INTEL_SOURCES" "${USER-INTEL_SOURCES}")
# detects styles which have USER-INTEL version
RegisterStylesExt(${USER-INTEL_SOURCES_DIR} opt USER-INTEL_SOURCES)
get_property(USER-INTEL_SOURCES GLOBAL PROPERTY USER-INTEL_SOURCES)
list(APPEND LIB_SOURCES ${USER-INTEL_SOURCES})
include_directories(${USER-INTEL_SOURCES_DIR})
endif()
if(ENABLE_GPU)
find_package(CUDA REQUIRED)
find_program(BIN2C bin2c)
if(NOT BIN2C)
message(FATAL_ERROR "Couldn't find bin2c, use -DBIN2C helping cmake to find it.")
endif()
include_directories(${CUDA_INCLUDE_DIRS})
list(APPEND LAMMPS_LINK_LIBS ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY})
set(GPU_PREC "SINGLE_DOUBLE" CACHE STRING "Lammps gpu precision size")
set_property(CACHE GPU_PREC PROPERTY STRINGS SINGLE_DOUBLE SINGLE_SINGLE DOUBLE_DOUBLE)
add_definitions(-D_${GPU_PREC})
add_definitions(-DNV_KERNEL -DUCL_CUDADR)
option(CUDPP_OPT "Enable CUDPP_OPT" ON)
set(GPU_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/GPU)
set(GPU_SOURCES ${GPU_SOURCES_DIR}/gpu_extra.h)
set_property(GLOBAL PROPERTY "GPU_SOURCES" "${GPU_SOURCES}")
# detects styles which have GPU version
RegisterStylesExt(${GPU_SOURCES_DIR} opt GPU_SOURCES)
get_property(GPU_SOURCES GLOBAL PROPERTY GPU_SOURCES)
file(GLOB GPU_LIB_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/gpu/*.cpp)
file(GLOB GPU_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/*.cu ${CMAKE_SOURCE_DIR}/gpu/*.cu)
file(GLOB_RECURSE GPU_NOT_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_pppm.cu)
list(REMOVE_ITEM GPU_LIB_CU ${GPU_NOT_LIB_CU})
include_directories(${GPU_SOURCES_DIR} ${LAMMPS_LIB_SOURCE_DIR}/gpu ${LAMMPS_LIB_BINARY_DIR}/gpu)
if(CUDPP_OPT)
include_directories(${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini)
add_definitions(-DCUDPP_OPT)
file(GLOB GPU_LIB_CUDPP_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini/*.cpp)
file(GLOB GPU_LIB_CUDPP_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini/*.cu)
endif()
cuda_compile(GPU_OBJS ${GPU_LIB_CU} ${GPU_LIB_CUDPP_CU} OPTIONS $<$<BOOL:${BUILD_SHARED_LIBS}>:-Xcompiler=-fPIC>)
file(MAKE_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/gpu)
foreach(CU_OBJ ${GPU_OBJS})
get_filename_component(CU_NAME ${CU_OBJ} NAME_WE)
string(REGEX REPLACE "^.*_lal_" "" CU_NAME "${CU_NAME}")
add_custom_command(OUTPUT ${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}_cubin.h
COMMAND ${BIN2C} -c -n ${CU_NAME} ${CU_OBJ} > ${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}_cubin.h
DEPENDS ${CU_OBJ}
COMMENT "Generating ${CU_NAME}_cubin.h")
list(APPEND LIB_SOURCES ${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}_cubin.h)
if(${CU_NAME} STREQUAL "pppm_d") #pppm_d doesn't get linked into the lib
set(CU_FORBIDDEN_OBJ "${CU_OBJ}")
endif()
endforeach()
list(REMOVE_ITEM GPU_OBJS "${CU_FORBIDDEN_OBJ}")
list(APPEND LIB_SOURCES ${GPU_SOURCES} ${GPU_LIB_SOURCES} ${GPU_LIB_CUDPP_SOURCES} ${GPU_OBJS})
set_directory_properties(PROPERTIES ADDITIONAL_MAKE_CLEAN_FILES "${LAMMPS_LIB_BINARY_DIR}/gpu/*_cubin.h")
endif()
######################################################
# Generate style headers based on global list of
# styles registered during package selection
######################################################
set(LAMMPS_STYLE_HEADERS_DIR ${CMAKE_CURRENT_BINARY_DIR}/styles)
GenerateStyleHeaders(${LAMMPS_STYLE_HEADERS_DIR})
include_directories(${LAMMPS_SOURCE_DIR})
include_directories(${LAMMPS_STYLE_HEADERS_DIR})
###########################################
# Actually add executable and lib to build
############################################
add_library(lammps ${LIB_SOURCES})
target_link_libraries(lammps ${LAMMPS_LINK_LIBS})
set_target_properties(lammps PROPERTIES SOVERSION ${SOVERSION})
if(INSTALL_LIB)
install(TARGETS lammps LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
install(FILES ${LAMMPS_SOURCE_DIR}/lammps.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
elseif(BUILD_SHARED_LIBS)
message(FATAL_ERROR "Shared library has to be installed, use -DINSTALL_LIB=ON to install lammps with a library")
endif()
add_executable(lmp ${LMP_SOURCES})
target_link_libraries(lmp lammps)
install(TARGETS lmp DESTINATION ${CMAKE_INSTALL_BINDIR})
if(ENABLE_TESTING)
add_test(ShowHelp ${CMAKE_CURRENT_BINARY_DIR}/lmp -help)
endif()
##################################
# Print package summary
##################################
foreach(PKG ${DEFAULT_PACKAGES} ${OTHER_PACKAGES} ${ACCEL_PACKAGES})
if(ENABLE_${PKG})
message(STATUS "Building package: ${PKG}")
endif()
endforeach()

View File

@ -0,0 +1,22 @@
# - Find fftw2
# Find the native FFTW2 headers and libraries.
#
# FFTW2_INCLUDE_DIRS - where to find fftw2.h, etc.
# FFTW2_LIBRARIES - List of libraries when using fftw2.
# FFTW2_FOUND - True if fftw2 found.
#
find_path(FFTW2_INCLUDE_DIR fftw.h)
find_library(FFTW2_LIBRARY NAMES fftw)
set(FFTW2_LIBRARIES ${FFTW2_LIBRARY})
set(FFTW2_INCLUDE_DIRS ${FFTW2_INCLUDE_DIR})
include(FindPackageHandleStandardArgs)
# handle the QUIETLY and REQUIRED arguments and set FFTW2_FOUND to TRUE
# if all listed variables are TRUE
find_package_handle_standard_args(FFTW2 DEFAULT_MSG FFTW2_LIBRARY FFTW2_INCLUDE_DIR)
mark_as_advanced(FFTW2_INCLUDE_DIR FFTW2_LIBRARY )

View File

@ -0,0 +1,25 @@
# - Find fftw3
# Find the native FFTW3 headers and libraries.
#
# FFTW3_INCLUDE_DIRS - where to find fftw3.h, etc.
# FFTW3_LIBRARIES - List of libraries when using fftw3.
# FFTW3_FOUND - True if fftw3 found.
#
find_package(PkgConfig)
pkg_check_modules(PC_FFTW3 fftw3)
find_path(FFTW3_INCLUDE_DIR fftw3.h HINTS ${PC_FFTW3_INCLUDE_DIRS})
find_library(FFTW3_LIBRARY NAMES fftw3 HINTS ${PC_FFTW3_LIBRARY_DIRS})
set(FFTW3_LIBRARIES ${FFTW3_LIBRARY})
set(FFTW3_INCLUDE_DIRS ${FFTW3_INCLUDE_DIR})
include(FindPackageHandleStandardArgs)
# handle the QUIETLY and REQUIRED arguments and set FFTW3_FOUND to TRUE
# if all listed variables are TRUE
find_package_handle_standard_args(FFTW3 DEFAULT_MSG FFTW3_LIBRARY FFTW3_INCLUDE_DIR)
mark_as_advanced(FFTW3_INCLUDE_DIR FFTW3_LIBRARY )

View File

@ -0,0 +1,22 @@
# - Find kim
# Find the native KIM headers and libraries.
#
# KIM_INCLUDE_DIRS - where to find kim.h, etc.
# KIM_LIBRARIES - List of libraries when using kim.
# KIM_FOUND - True if kim found.
#
find_path(KIM_INCLUDE_DIR KIM_API.h PATH_SUFFIXES kim-api-v1)
find_library(KIM_LIBRARY NAMES kim-api-v1)
set(KIM_LIBRARIES ${KIM_LIBRARY})
set(KIM_INCLUDE_DIRS ${KIM_INCLUDE_DIR})
include(FindPackageHandleStandardArgs)
# handle the QUIETLY and REQUIRED arguments and set KIM_FOUND to TRUE
# if all listed variables are TRUE
find_package_handle_standard_args(KIM DEFAULT_MSG KIM_LIBRARY KIM_INCLUDE_DIR)
mark_as_advanced(KIM_INCLUDE_DIR KIM_LIBRARY )

View File

@ -0,0 +1,22 @@
# - Find mkl
# Find the native MKL headers and libraries.
#
# MKL_INCLUDE_DIRS - where to find mkl.h, etc.
# MKL_LIBRARIES - List of libraries when using mkl.
# MKL_FOUND - True if mkl found.
#
find_path(MKL_INCLUDE_DIR mkl_dfti.h HINTS $ENV{MKLROOT}/include)
find_library(MKL_LIBRARY NAMES mkl_rt HINTS $ENV{MKLROOT}/lib $ENV{MKLROOT}/lib/intel64)
set(MKL_LIBRARIES ${MKL_LIBRARY})
set(MKL_INCLUDE_DIRS ${MKL_INCLUDE_DIR})
include(FindPackageHandleStandardArgs)
# handle the QUIETLY and REQUIRED arguments and set MKL_FOUND to TRUE
# if all listed variables are TRUE
find_package_handle_standard_args(MKL DEFAULT_MSG MKL_LIBRARY MKL_INCLUDE_DIR)
mark_as_advanced(MKL_INCLUDE_DIR MKL_LIBRARY )

View File

@ -0,0 +1,118 @@
# - Find NetCDF
# Find the native NetCDF includes and library
#
# NETCDF_INCLUDE_DIR - user modifiable choice of where netcdf headers are
# NETCDF_LIBRARY - user modifiable choice of where netcdf libraries are
#
# Your package can require certain interfaces to be FOUND by setting these
#
# NETCDF_CXX - require the C++ interface and link the C++ library
# NETCDF_F77 - require the F77 interface and link the fortran library
# NETCDF_F90 - require the F90 interface and link the fortran library
#
# Or equivalently by calling FindNetCDF with a COMPONENTS argument containing one or
# more of "CXX;F77;F90".
#
# When interfaces are requested the user has access to interface specific hints:
#
# NETCDF_${LANG}_INCLUDE_DIR - where to search for interface header files
# NETCDF_${LANG}_LIBRARY - where to search for interface libraries
#
# This module returns these variables for the rest of the project to use.
#
# NETCDF_FOUND - True if NetCDF found including required interfaces (see below)
# NETCDF_LIBRARIES - All netcdf related libraries.
# NETCDF_INCLUDE_DIRS - All directories to include.
# NETCDF_HAS_INTERFACES - Whether requested interfaces were found or not.
# NETCDF_${LANG}_INCLUDE_DIRS/NETCDF_${LANG}_LIBRARIES - C/C++/F70/F90 only interface
#
# Normal usage would be:
# set (NETCDF_F90 "YES")
# find_package (NetCDF REQUIRED)
# target_link_libraries (uses_everthing ${NETCDF_LIBRARIES})
# target_link_libraries (only_uses_f90 ${NETCDF_F90_LIBRARIES})
#search starting from user editable cache var
if (NETCDF_INCLUDE_DIR AND NETCDF_LIBRARY)
# Already in cache, be silent
set (NETCDF_FIND_QUIETLY TRUE)
endif ()
set(USE_DEFAULT_PATHS "NO_DEFAULT_PATH")
if(NETCDF_USE_DEFAULT_PATHS)
set(USE_DEFAULT_PATHS "")
endif()
find_path (NETCDF_INCLUDE_DIR netcdf.h
HINTS "${NETCDF_DIR}/include")
mark_as_advanced (NETCDF_INCLUDE_DIR)
set (NETCDF_C_INCLUDE_DIRS ${NETCDF_INCLUDE_DIR})
find_library (NETCDF_LIBRARY NAMES netcdf
HINTS "${NETCDF_DIR}/lib")
mark_as_advanced (NETCDF_LIBRARY)
set (NETCDF_C_LIBRARIES ${NETCDF_LIBRARY})
#start finding requested language components
set (NetCDF_libs "")
set (NetCDF_includes "${NETCDF_INCLUDE_DIR}")
get_filename_component (NetCDF_lib_dirs "${NETCDF_LIBRARY}" PATH)
set (NETCDF_HAS_INTERFACES "YES") # will be set to NO if we're missing any interfaces
macro (NetCDF_check_interface lang header libs)
if (NETCDF_${lang})
#search starting from user modifiable cache var
find_path (NETCDF_${lang}_INCLUDE_DIR NAMES ${header}
HINTS "${NETCDF_INCLUDE_DIR}"
HINTS "${NETCDF_${lang}_ROOT}/include"
${USE_DEFAULT_PATHS})
find_library (NETCDF_${lang}_LIBRARY NAMES ${libs}
HINTS "${NetCDF_lib_dirs}"
HINTS "${NETCDF_${lang}_ROOT}/lib"
${USE_DEFAULT_PATHS})
mark_as_advanced (NETCDF_${lang}_INCLUDE_DIR NETCDF_${lang}_LIBRARY)
#export to internal varS that rest of project can use directly
set (NETCDF_${lang}_LIBRARIES ${NETCDF_${lang}_LIBRARY})
set (NETCDF_${lang}_INCLUDE_DIRS ${NETCDF_${lang}_INCLUDE_DIR})
if (NETCDF_${lang}_INCLUDE_DIR AND NETCDF_${lang}_LIBRARY)
list (APPEND NetCDF_libs ${NETCDF_${lang}_LIBRARY})
list (APPEND NetCDF_includes ${NETCDF_${lang}_INCLUDE_DIR})
else ()
set (NETCDF_HAS_INTERFACES "NO")
message (STATUS "Failed to find NetCDF interface for ${lang}")
endif ()
endif ()
endmacro ()
list (FIND NetCDF_FIND_COMPONENTS "CXX" _nextcomp)
if (_nextcomp GREATER -1)
set (NETCDF_CXX 1)
endif ()
list (FIND NetCDF_FIND_COMPONENTS "F77" _nextcomp)
if (_nextcomp GREATER -1)
set (NETCDF_F77 1)
endif ()
list (FIND NetCDF_FIND_COMPONENTS "F90" _nextcomp)
if (_nextcomp GREATER -1)
set (NETCDF_F90 1)
endif ()
NetCDF_check_interface (CXX netcdfcpp.h netcdf_c++)
NetCDF_check_interface (F77 netcdf.inc netcdff)
NetCDF_check_interface (F90 netcdf.mod netcdff)
#export accumulated results to internal varS that rest of project can depend on
list (APPEND NetCDF_libs "${NETCDF_C_LIBRARIES}")
set (NETCDF_LIBRARIES ${NetCDF_libs})
set (NETCDF_INCLUDE_DIRS ${NetCDF_includes})
# handle the QUIETLY and REQUIRED arguments and set NETCDF_FOUND to TRUE if
# all listed variables are TRUE
include (FindPackageHandleStandardArgs)
find_package_handle_standard_args (NetCDF
DEFAULT_MSG NETCDF_LIBRARIES NETCDF_INCLUDE_DIRS NETCDF_HAS_INTERFACES)

View File

@ -0,0 +1,29 @@
# - Find quantum-espresso
# Find the native QE headers and libraries.
#
# QE_INCLUDE_DIRS - where to find quantum-espresso.h, etc.
# QE_LIBRARIES - List of libraries when using quantum-espresso.
# QE_FOUND - True if quantum-espresso found.
#
find_path(QE_INCLUDE_DIR libqecouple.h PATH_SUFFIXES COUPLE/include)
find_library(QECOUPLE_LIBRARY NAMES qecouple)
find_library(PW_LIBRARY NAMES pw)
find_library(QEMOD_LIBRARY NAMES qemod)
find_library(QEFFT_LIBRARY NAMES qefft)
find_library(QELA_LIBRARY NAMES qela)
find_library(CLIB_LIBRARY NAMES clib)
find_library(IOTK_LIBRARY NAMES iotk)
set(QE_LIBRARIES ${QECOUPLE_LIBRARY} ${PW_LIBRARY} ${QEMOD_LIBRARY} ${QEFFT_LIBRARY} ${QELA_LIBRARY} ${CLIB_LIBRARY} ${IOTK_LIBRARY})
set(QE_INCLUDE_DIRS ${QE_INCLUDE_DIR})
include(FindPackageHandleStandardArgs)
# handle the QUIETLY and REQUIRED arguments and set QE_FOUND to TRUE
# if all listed variables are TRUE
find_package_handle_standard_args(QE DEFAULT_MSG QECOUPLE_LIBRARY PW_LIBRARY QEMOD_LIBRARY QEFFT_LIBRARY QELA_LIBRARY CLIB_LIBRARY IOTK_LIBRARY QE_INCLUDE_DIR)
mark_as_advanced(QE_INCLUDE_DIR QECOUPLE_LIBRARY PW_LIBRARY QEMOD_LIBRARY QEFFT_LIBRARY QELA_LIBRARY CLIB_LIBRARY IOTK_LIBRARY)

View File

@ -0,0 +1,18 @@
# - Find quip
# Find the native QUIP libraries.
#
# QUIP_LIBRARIES - List of libraries when using fftw3.
# QUIP_FOUND - True if fftw3 found.
#
find_library(QUIP_LIBRARY NAMES quip)
set(QUIP_LIBRARIES ${QUIP_LIBRARY})
include(FindPackageHandleStandardArgs)
# handle the QUIETLY and REQUIRED arguments and set QUIP_FOUND to TRUE
# if all listed variables are TRUE
find_package_handle_standard_args(QUIP DEFAULT_MSG QUIP_LIBRARY)
mark_as_advanced(QUIP_LIBRARY)

View File

@ -0,0 +1,22 @@
# - Find voro++
# Find the native VORO headers and libraries.
#
# VORO_INCLUDE_DIRS - where to find voro++.hh, etc.
# VORO_LIBRARIES - List of libraries when using voro++.
# VORO_FOUND - True if voro++ found.
#
find_path(VORO_INCLUDE_DIR voro++.hh PATH_SUFFIXES voro++)
find_library(VORO_LIBRARY NAMES voro++)
set(VORO_LIBRARIES ${VORO_LIBRARY})
set(VORO_INCLUDE_DIRS ${VORO_INCLUDE_DIR})
include(FindPackageHandleStandardArgs)
# handle the QUIETLY and REQUIRED arguments and set VORO_FOUND to TRUE
# if all listed variables are TRUE
find_package_handle_standard_args(VORO DEFAULT_MSG VORO_LIBRARY VORO_INCLUDE_DIR)
mark_as_advanced(VORO_INCLUDE_DIR VORO_LIBRARY )

View File

@ -0,0 +1,132 @@
function(FindStyleHeaders path style_class file_pattern headers)
file(GLOB files "${path}/${file_pattern}*.h")
get_property(hlist GLOBAL PROPERTY ${headers})
foreach(file_name ${files})
file(STRINGS ${file_name} is_style LIMIT_COUNT 1 REGEX ${style_class})
if(is_style)
list(APPEND hlist ${file_name})
endif()
endforeach()
set_property(GLOBAL PROPERTY ${headers} "${hlist}")
endfunction(FindStyleHeaders)
function(FindStyleHeadersExt path style_class extension headers sources)
get_property(hlist GLOBAL PROPERTY ${headers})
get_property(slist GLOBAL PROPERTY ${sources})
set(ext_list)
get_filename_component(abs_path "${path}" ABSOLUTE)
foreach(file_name ${hlist})
get_filename_component(basename ${file_name} NAME_WE)
set(ext_file_name "${abs_path}/${basename}_${extension}.h")
if(EXISTS "${ext_file_name}")
file(STRINGS ${ext_file_name} is_style LIMIT_COUNT 1 REGEX ${style_class})
if(is_style)
list(APPEND ext_list ${ext_file_name})
set(source_file_name "${abs_path}/${basename}_${extension}.cpp")
if(EXISTS "${source_file_name}")
list(APPEND slist ${source_file_name})
endif()
endif()
endif()
endforeach()
list(APPEND hlist ${ext_list})
set_property(GLOBAL PROPERTY ${headers} "${hlist}")
set_property(GLOBAL PROPERTY ${sources} "${slist}")
endfunction(FindStyleHeadersExt)
function(CreateStyleHeader path filename)
math(EXPR N "${ARGC}-2")
set(temp "")
if(N GREATER 0)
math(EXPR ARG_END "${ARGC}-1")
foreach(IDX RANGE 2 ${ARG_END})
list(GET ARGV ${IDX} FNAME)
get_filename_component(FNAME ${FNAME} NAME)
set(temp "${temp}#include \"${FNAME}\"\n")
endforeach()
endif()
message(STATUS "Generating ${filename}...")
file(WRITE "${path}/${filename}.tmp" "${temp}" )
execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different "${path}/${filename}.tmp" "${path}/${filename}")
endfunction(CreateStyleHeader)
function(GenerateStyleHeader path property style)
get_property(files GLOBAL PROPERTY ${property})
#message("${property} = ${files}")
CreateStyleHeader("${path}" "style_${style}.h" ${files})
endfunction(GenerateStyleHeader)
function(RegisterStyles search_path)
FindStyleHeaders(${search_path} ANGLE_CLASS angle_ ANGLE ) # angle ) # force
FindStyleHeaders(${search_path} ATOM_CLASS atom_vec_ ATOM_VEC ) # atom ) # atom atom_vec_hybrid
FindStyleHeaders(${search_path} BODY_CLASS body_ BODY ) # body ) # atom_vec_body
FindStyleHeaders(${search_path} BOND_CLASS bond_ BOND ) # bond ) # force
FindStyleHeaders(${search_path} COMMAND_CLASS "" COMMAND ) # command ) # input
FindStyleHeaders(${search_path} COMPUTE_CLASS compute_ COMPUTE ) # compute ) # modify
FindStyleHeaders(${search_path} DIHEDRAL_CLASS dihedral_ DIHEDRAL ) # dihedral ) # force
FindStyleHeaders(${search_path} DUMP_CLASS dump_ DUMP ) # dump ) # output write_dump
FindStyleHeaders(${search_path} FIX_CLASS fix_ FIX ) # fix ) # modify
FindStyleHeaders(${search_path} IMPROPER_CLASS improper_ IMPROPER ) # improper ) # force
FindStyleHeaders(${search_path} INTEGRATE_CLASS "" INTEGRATE ) # integrate ) # update
FindStyleHeaders(${search_path} KSPACE_CLASS "" KSPACE ) # kspace ) # force
FindStyleHeaders(${search_path} MINIMIZE_CLASS min_ MINIMIZE ) # minimize ) # update
FindStyleHeaders(${search_path} NBIN_CLASS nbin_ NBIN ) # nbin ) # neighbor
FindStyleHeaders(${search_path} NPAIR_CLASS npair_ NPAIR ) # npair ) # neighbor
FindStyleHeaders(${search_path} NSTENCIL_CLASS nstencil_ NSTENCIL ) # nstencil ) # neighbor
FindStyleHeaders(${search_path} NTOPO_CLASS ntopo_ NTOPO ) # ntopo ) # neighbor
FindStyleHeaders(${search_path} PAIR_CLASS pair_ PAIR ) # pair ) # force
FindStyleHeaders(${search_path} READER_CLASS reader_ READER ) # reader ) # read_dump
FindStyleHeaders(${search_path} REGION_CLASS region_ REGION ) # region ) # domain
endfunction(RegisterStyles)
function(RegisterStylesExt search_path extension sources)
FindStyleHeadersExt(${search_path} ANGLE_CLASS ${extension} ANGLE ${sources})
FindStyleHeadersExt(${search_path} ATOM_CLASS ${extension} ATOM_VEC ${sources})
FindStyleHeadersExt(${search_path} BODY_CLASS ${extension} BODY ${sources})
FindStyleHeadersExt(${search_path} BOND_CLASS ${extension} BOND ${sources})
FindStyleHeadersExt(${search_path} COMMAND_CLASS ${extension} COMMAND ${sources})
FindStyleHeadersExt(${search_path} COMPUTE_CLASS ${extension} COMPUTE ${sources})
FindStyleHeadersExt(${search_path} DIHEDRAL_CLASS ${extension} DIHEDRAL ${sources})
FindStyleHeadersExt(${search_path} DUMP_CLASS ${extension} DUMP ${sources})
FindStyleHeadersExt(${search_path} FIX_CLASS ${extension} FIX ${sources})
FindStyleHeadersExt(${search_path} IMPROPER_CLASS ${extension} IMPROPER ${sources})
FindStyleHeadersExt(${search_path} INTEGRATE_CLASS ${extension} INTEGRATE ${sources})
FindStyleHeadersExt(${search_path} KSPACE_CLASS ${extension} KSPACE ${sources})
FindStyleHeadersExt(${search_path} MINIMIZE_CLASS ${extension} MINIMIZE ${sources})
FindStyleHeadersExt(${search_path} NBIN_CLASS ${extension} NBIN ${sources})
FindStyleHeadersExt(${search_path} NPAIR_CLASS ${extension} NPAIR ${sources})
FindStyleHeadersExt(${search_path} NSTENCIL_CLASS ${extension} NSTENCIL ${sources})
FindStyleHeadersExt(${search_path} NTOPO_CLASS ${extension} NTOPO ${sources})
FindStyleHeadersExt(${search_path} PAIR_CLASS ${extension} PAIR ${sources})
FindStyleHeadersExt(${search_path} READER_CLASS ${extension} READER ${sources})
FindStyleHeadersExt(${search_path} REGION_CLASS ${extension} REGION ${sources})
endfunction(RegisterStylesExt)
function(GenerateStyleHeaders output_path)
GenerateStyleHeader(${output_path} ANGLE angle ) # force
GenerateStyleHeader(${output_path} ATOM_VEC atom ) # atom atom_vec_hybrid
GenerateStyleHeader(${output_path} BODY body ) # atom_vec_body
GenerateStyleHeader(${output_path} BOND bond ) # force
GenerateStyleHeader(${output_path} COMMAND command ) # input
GenerateStyleHeader(${output_path} COMPUTE compute ) # modify
GenerateStyleHeader(${output_path} DIHEDRAL dihedral ) # force
GenerateStyleHeader(${output_path} DUMP dump ) # output write_dump
GenerateStyleHeader(${output_path} FIX fix ) # modify
GenerateStyleHeader(${output_path} IMPROPER improper ) # force
GenerateStyleHeader(${output_path} INTEGRATE integrate ) # update
GenerateStyleHeader(${output_path} KSPACE kspace ) # force
GenerateStyleHeader(${output_path} MINIMIZE minimize ) # update
GenerateStyleHeader(${output_path} NBIN nbin ) # neighbor
GenerateStyleHeader(${output_path} NPAIR npair ) # neighbor
GenerateStyleHeader(${output_path} NSTENCIL nstencil ) # neighbor
GenerateStyleHeader(${output_path} NTOPO ntopo ) # neighbor
GenerateStyleHeader(${output_path} PAIR pair ) # force
GenerateStyleHeader(${output_path} READER reader ) # read_dump
GenerateStyleHeader(${output_path} REGION region ) # domain
endfunction(GenerateStyleHeaders)

19
cmake/README Normal file
View File

@ -0,0 +1,19 @@
cmake-buildsystem
-----------------
To use the cmake build system instead of the make-driven one, do:
```
cmake /path/to/lammps/source/cmake
```
(please note the cmake directory as the very end)
To enable package, e.g. GPU do
```
cmake /path/to/lammps/source/cmake -DENABLE_GPU=ON
```
cmake has many many options, do get an overview use the curses-based cmake interface, ccmake:
```
ccmake /path/to/lammps/source/cmake
```
(Don't forget to press "g" for generate once you are done with configuring)

4
cmake/gpu/lal_pppm_d.cu Normal file
View File

@ -0,0 +1,4 @@
#define grdtyp double
#define grdtyp4 double4
#include "lal_pppm.cu"

4
cmake/gpu/lal_pppm_f.cu Normal file
View File

@ -0,0 +1,4 @@
#define grdtyp float
#define grdtyp4 float4
#include "lal_pppm.cu"

Binary file not shown.

Before

Width:  |  Height:  |  Size: 14 KiB

After

Width:  |  Height:  |  Size: 20 KiB

View File

@ -1,7 +1,7 @@
<!-- HTML_ONLY -->
<HEAD>
<TITLE>LAMMPS Users Manual</TITLE>
<META NAME="docnumber" CONTENT="10 Aug 2017 version">
<META NAME="docnumber" CONTENT="17 Aug 2017 version">
<META NAME="author" CONTENT="http://lammps.sandia.gov - Sandia National Laboratories">
<META NAME="copyright" CONTENT="Copyright (2003) Sandia Corporation. This software and manual is distributed under the GNU General Public License.">
</HEAD>
@ -21,7 +21,7 @@
<H1></H1>
LAMMPS Documentation :c,h3
10 Aug 2017 version :c,h4
17 Aug 2017 version :c,h4
Version info: :h4
@ -79,7 +79,7 @@ bug reports and feature requests are mainly coordinated through the
"LAMMPS project on GitHub."_https://github.com/lammps/lammps
The lammps.org domain, currently hosting "public continuous integration
testing"_https://ci.lammps.org/job/lammps/ and "precompiled Linux
RPM and Windows installer packages"_http://rpm.lammps.org is located
RPM and Windows installer packages"_http://packages.lammps.org is located
at Temple University and managed by Richard Berger,
richard.berger at temple.edu.

View File

@ -892,8 +892,8 @@ KOKKOS, o = USER-OMP, t = OPT.
"hybrid"_pair_hybrid.html,
"hybrid/overlay"_pair_hybrid.html,
"adp (o)"_pair_adp.html,
"airebo (o)"_pair_airebo.html,
"airebo/morse (o)"_pair_airebo.html,
"airebo (oi)"_pair_airebo.html,
"airebo/morse (oi)"_pair_airebo.html,
"beck (go)"_pair_beck.html,
"body"_pair_body.html,
"bop"_pair_bop.html,
@ -927,8 +927,8 @@ KOKKOS, o = USER-OMP, t = OPT.
"dpd/tstat (go)"_pair_dpd.html,
"dsmc"_pair_dsmc.html,
"eam (gkiot)"_pair_eam.html,
"eam/alloy (gkot)"_pair_eam.html,
"eam/fs (gkot)"_pair_eam.html,
"eam/alloy (gkiot)"_pair_eam.html,
"eam/fs (gkiot)"_pair_eam.html,
"eim (o)"_pair_eim.html,
"gauss (go)"_pair_gauss.html,
"gayberne (gio)"_pair_gayberne.html,
@ -942,9 +942,9 @@ KOKKOS, o = USER-OMP, t = OPT.
"kim"_pair_kim.html,
"lcbop"_pair_lcbop.html,
"line/lj"_pair_line_lj.html,
"lj/charmm/coul/charmm (ko)"_pair_charmm.html,
"lj/charmm/coul/charmm (kio)"_pair_charmm.html,
"lj/charmm/coul/charmm/implicit (ko)"_pair_charmm.html,
"lj/charmm/coul/long (giko)"_pair_charmm.html,
"lj/charmm/coul/long (gkio)"_pair_charmm.html,
"lj/charmm/coul/msm"_pair_charmm.html,
"lj/charmmfsw/coul/charmmfsh"_pair_charmm.html,
"lj/charmmfsw/coul/long"_pair_charmm.html,
@ -990,7 +990,7 @@ KOKKOS, o = USER-OMP, t = OPT.
"polymorphic"_pair_polymorphic.html,
"python"_pair_python.html,
"reax"_pair_reax.html,
"rebo (o)"_pair_airebo.html,
"rebo (oi)"_pair_airebo.html,
"resquared (go)"_pair_resquared.html,
"snap"_pair_snap.html,
"soft (go)"_pair_soft.html,

View File

@ -7886,8 +7886,8 @@ keyword to allow for additional bonds to be formed :dd
{New bond exceeded special list size in fix bond/create} :dt
See the "special_bonds extra" command
(or the "read_data extra/special/per/atom" command)
See the "read_data extra/special/per/atom" command
(or the "create_box extra/special/per/atom" command)
for info on how to leave space in the special bonds
list to allow for additional bonds to be formed. :dd
@ -9666,8 +9666,8 @@ you are running. :dd
{Special list size exceeded in fix bond/create} :dt
See the special_bonds extra command
(or the read_data extra/special/per/atom command)
See the "read_data extra/special/per/atom" command
(or the "create_box extra/special/per/atom" command)
for info on how to leave space in the special bonds
list to allow for additional bonds to be formed. :dd

View File

@ -662,27 +662,25 @@ your own build system. Due to differences between the Windows OS
and Windows system libraries to Unix-like environments like Linux
or MacOS, when compiling for Windows a few adjustments may be needed:
Do not set the -DLAMMPS_MEMALIGN define (see LMP_INC makefile variable)
Do [not] set the -DLAMMPS_MEMALIGN define (see LMP_INC makefile variable)
Add -lwsock32 -lpsapi to the linker flags (see LIB makefile variable)
Try adding -static-libgcc or -static or both to the linker flags when your
LAMMPS executable complains about missing .dll files :ul
Try adding -static-libgcc or -static or both to the linker flags when your LAMMPS executable complains about missing .dll files :ul
Since none of the current LAMMPS core developers
has significant experience building executables on Windows, we are
happy to distribute contributed instructions and modifications, but
we cannot provide support for those.
Since none of the current LAMMPS core developers has significant
experience building executables on Windows, we are happy to distribute
contributed instructions and modifications to improve the situation,
but we cannot provide support for those.
With the so-called "Anniversary Update" to Windows 10, there is a
Ubuntu Linux subsystem available for Windows, that can be installed
and then used to compile/install LAMMPS as if you are running on a
Ubuntu Linux system instead of Windows.
As an alternative, you can download "daily builds" (and some older
versions) of the installer packages from
"rpm.lammps.org/windows.html"_http://rpm.lammps.org/windows.html.
These executables are built with most optional packages and the
download includes documentation, potential files, some tools and
many examples, but no source code.
As an alternative, you can download pre-compiled installer packages from
"packages.lammps.org/windows.html"_http://packages.lammps.org/windows.html.
These executables are built with most optional packages included and the
download includes documentation, potential files, some tools and many
examples, but no source code.
:line
@ -1095,7 +1093,7 @@ LAMMPS to be built with one or more of its optional packages.
:line
On a Windows box, you can skip making LAMMPS and simply download an
installer package from "here"_http://rpm.lammps.org/windows.html
installer package from "here"_http://packages.lammps.org/windows.html
For running the non-MPI executable, follow these steps:
@ -1107,18 +1105,27 @@ the [in.lj] input from the bench folder. (e.g. by typing: cd "Documents"). :l
At the command prompt, type "lmp_serial -in in.lj", replacing [in.lj]
with the name of your LAMMPS input script. :l
The serial executable includes support for multi-threading
parallelization from the styles in the USER-OMP packages.
To run with, e.g. 4 threads, type "lmp_serial -in in.lj -pk omp 4 -sf omp"
:ule
For the MPI version, which allows you to run LAMMPS under Windows on
multiple processors, follow these steps:
For the MPI version, which allows you to run LAMMPS under Windows with
the more general message passing parallel library (LAMMPS has been
designed from ground up to use MPI efficiently), follow these steps:
Download and install
"MPICH2"_http://www.mcs.anl.gov/research/projects/mpich2/downloads/index.php?s=downloads
for Windows. :ulb,l
Download and install a compatible MPI library binary package:
for 32-bit Windows
"mpich2-1.4.1p1-win-ia32.msi"_download.lammps.org/thirdparty/mpich2-1.4.1p1-win-ia32.msi
and for 64-bit Windows
"mpich2-1.4.1p1-win-x86-64.msi"_download.lammps.org/thirdparty/mpich2-1.4.1p1-win-x86-64.msi
:ulb,l
The LAMMPS Windows installer packages will automatically adjust your
path for the default location of this MPI package. After the installation
of the MPICH software, it needs to be integrated into the system.
of the MPICH2 software, it needs to be integrated into the system.
For this you need to start a Command Prompt in {Administrator Mode}
(right click on the icon and select it). Change into the MPICH2
installation directory, then into the subdirectory [bin] and execute
@ -1137,7 +1144,7 @@ or
mpiexec -np 4 lmp_mpi -in in.lj :pre
replacing in.lj with the name of your LAMMPS input script. For the latter
replacing [in.lj] with the name of your LAMMPS input script. For the latter
case, you may be prompted to enter your password. :l
In this mode, output may not immediately show up on the screen, so if
@ -1149,6 +1156,11 @@ something like:
lmp_mpi -in in.lj :pre
And the parallel executable also includes OpenMP multi-threading, which
can be combined with MPI using something like:
mpiexec -localonly 2 lmp_mpi -in in.lj -pk omp 2 -sf omp :pre
:ule
:line

View File

@ -29,8 +29,10 @@ Bond Styles: fene, harmonic :l
Dihedral Styles: charmm, harmonic, opls :l
Fixes: nve, npt, nvt, nvt/sllod :l
Improper Styles: cvff, harmonic :l
Pair Styles: buck/coul/cut, buck/coul/long, buck, eam, gayberne,
charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, sw, tersoff :l
Pair Styles: airebo, airebo/morse, buck/coul/cut, buck/coul/long,
buck, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm,
lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, rebo,
sw, tersoff :l
K-Space Styles: pppm, pppm/disp :l
:ule

View File

@ -150,10 +150,9 @@ atoms. Note that adding a single bond always adds a new 1st neighbor
but may also induce *many* new 2nd and 3rd neighbors, depending on the
molecular topology of your system. The "extra special per atom"
parameter must typically be set to allow for the new maximum total
size (1st + 2nd + 3rd neighbors) of this per-atom list. There are 3
size (1st + 2nd + 3rd neighbors) of this per-atom list. There are 2
ways to do this. See the "read_data"_read_data.html or
"create_box"_create_box.html or "special_bonds extra" commands for
details.
"create_box"_create_box.html commands for details.
NOTE: Even if you do not use the {atype}, {dtype}, or {itype}
keywords, the list of topological neighbors is updated for atoms

View File

@ -7,10 +7,13 @@
:line
pair_style airebo command :h3
pair_style airebo/intel command :h3
pair_style airebo/omp command :h3
pair_style airebo/morse command :h3
pair_style airebo/morse/intel command :h3
pair_style airebo/morse/omp command :h3
pair_style rebo command :h3
pair_style rebo/intel command :h3
pair_style rebo/omp command :h3
[Syntax:]

View File

@ -7,6 +7,7 @@
:line
pair_style lj/charmm/coul/charmm command :h3
pair_style lj/charmm/coul/charmm/intel command :h3
pair_style lj/charmm/coul/charmm/omp command :h3
pair_style lj/charmm/coul/charmm/implicit command :h3
pair_style lj/charmm/coul/charmm/implicit/omp command :h3

View File

@ -14,6 +14,7 @@ pair_style eam/omp command :h3
pair_style eam/opt command :h3
pair_style eam/alloy command :h3
pair_style eam/alloy/gpu command :h3
pair_style eam/alloy/intel command :h3
pair_style eam/alloy/kk command :h3
pair_style eam/alloy/omp command :h3
pair_style eam/alloy/opt command :h3
@ -21,6 +22,7 @@ pair_style eam/cd command :h3
pair_style eam/cd/omp command :h3
pair_style eam/fs command :h3
pair_style eam/fs/gpu command :h3
pair_style eam/fs/intel command :h3
pair_style eam/fs/kk command :h3
pair_style eam/fs/omp command :h3
pair_style eam/fs/opt command :h3

View File

@ -25,9 +25,7 @@ keyword = {amber} or {charmm} or {dreiding} or {fene} or {lj/coul} or {lj} or {c
{coul} values = w1,w2,w3
w1,w2,w3 = weights (0.0 to 1.0) on pairwise Coulombic interactions
{angle} value = {yes} or {no}
{dihedral} value = {yes} or {no}
{extra} value = N
N = number of extra 1-2,1-3,1-4 interactions to save space for :pre
{dihedral} value = {yes} or {no} :pre
:ule
Examples:
@ -36,8 +34,7 @@ special_bonds amber
special_bonds charmm
special_bonds fene dihedral no
special_bonds lj/coul 0.0 0.0 0.5 angle yes dihedral yes
special_bonds lj 0.0 0.0 0.5 coul 0.0 0.0 0.0 dihedral yes
special_bonds lj/coul 0 1 1 extra 2 :pre
special_bonds lj 0.0 0.0 0.5 coul 0.0 0.0 0.0 dihedral yes :pre
[Description:]
@ -178,14 +175,6 @@ interaction between atoms 2 and 5 will be unaffected (full weighting
of 1.0). If the {dihedral} keyword is specified as {no} which is the
default, then the 2,5 interaction will also be weighted by 0.5.
The {extra} keyword can be used when additional bonds will be created
during a simulation run, e.g. by the "fix
bond/create"_fix_bond_create.html command. It can also be used if
molecules will be added to the system, e.g. via the "fix
deposit"_fix_deposit.html, or "fix pour"_fix_pour.html commands, which
will have atoms with more special neighbors than any atom in the
current system has.
:line
NOTE: LAMMPS stores and maintains a data structure with a list of the
@ -194,8 +183,9 @@ the system). If new bonds are created (or molecules added containing
atoms with more special neighbors), the size of this list needs to
grow. Note that adding a single bond always adds a new 1st neighbor
but may also induce *many* new 2nd and 3rd neighbors, depending on the
molecular topology of your system. Using the {extra} keyword leaves
empty space in the list for this N additional 1st, 2nd, or 3rd
molecular topology of your system. Using the {extra/special/per/atom}
keyword to either "read_data"_read_data.html or "create_box"_create_box.html
reserves empty space in the list for this N additional 1st, 2nd, or 3rd
neighbors to be added. If you do not do this, you may get an error
when bonds (or molecules) are added.
@ -203,8 +193,7 @@ when bonds (or molecules) are added.
NOTE: If you reuse this command in an input script, you should set all
the options you need each time. This command cannot be used a 2nd
time incrementally, e.g. to add some extra storage locations via the
{extra} keyword. E.g. these two commands:
time incrementally. E.g. these two commands:
special_bonds lj 0.0 1.0 1.0
special_bonds coul 0.0 0.0 1.0
@ -221,25 +210,6 @@ Coul: coul 0.0 0.0 1.0
because the LJ settings are reset to their default values
each time the command is issued.
Likewise
special_bonds amber
special_bonds extra 2 :pre
is not the same as this single command:
special_bonds amber extra 2 :pre
since in the former case, the 2nd command will reset all the LJ and
Coulombic weights to 0.0 (the default).
One exception to this rule is the {extra} option itself. It is not
reset to its default value of 0 each time the special_bonds command is
invoked. This is because it can also be set by the
"read_data"_read_data.html and "create_box"_create_box.html commands,
so this command will not override those settings unless you explicitly
use {extra} as an option.
[Restrictions:] none
[Related commands:]

0
doc/src/tutorial_bash_on_windows.txt Executable file → Normal file
View File

View File

@ -176,12 +176,13 @@ By recognizing the fix {drude}, LAMMPS will find and store matching
DC-DP pairs and will treat DP as equivalent to their DC in the
{special bonds} relations. It may be necessary to extend the space
for storing such special relations. In this case extra space should
be reserved by using the {extra} keyword of the {special_bonds}
be reserved by using the {extra/special/per/atom} keyword of either
the "read_data"_read_data.html or "create_box"_create_box.html
command. With our phenol, there is 1 more special neighbor for which
space is required. Otherwise LAMMPS crashes and gives the required
value.
special_bonds lj/coul 0.0 0.0 0.5 extra 1 :pre
read_data data-p.lmp extra/special/per/atom 1 :pre
Let us assume we want to run a simple NVT simulation at 300 K. Note
that Drude oscillators need to be thermalized at a low temperature in

0
doc/src/tutorials.txt Executable file → Normal file
View File

View File

@ -32,7 +32,7 @@ where Makefile.g++ uses the GNU C++ compiler and is a good template to start.
**Optional**: if you use the Install.py script provided in this folder, you
can give the machine name as the '-m' argument. This can be the suffix of one
of the files from either this folder, or from src/MAKE.
of the files from either this folder, or from src/MAKE/MACHINES.
*This is only supported by the Install.py within the lib/colvars folder*.
When you are done building this library, two files should
@ -53,10 +53,10 @@ settings in Makefile.common should work.
For the reference manual see:
http://colvars.github.io/colvars-refman-lammps
A copy of reference manual is also in:
A copy of the reference manual is also in:
doc/PDF/colvars-refman-lammps.pdf
Also included is a Doxygen-based developer documentation:
Also available is a Doxygen-based developer documentation:
http://colvars.github.io/doxygen/html/
The reference article is:

View File

@ -88,7 +88,12 @@ public:
static std::vector<feature *> cv_features;
/// \brief Implementation of the feature list accessor for colvar
std::vector<feature *> &features() {
virtual const std::vector<feature *> &features()
{
return cv_features;
}
virtual std::vector<feature *> &modify_features()
{
return cv_features;
}

View File

@ -206,7 +206,12 @@ public:
static std::vector<feature *> ag_features;
/// \brief Implementation of the feature list accessor for atom group
virtual std::vector<feature *> &features() {
virtual const std::vector<feature *> &features()
{
return ag_features;
}
virtual std::vector<feature *> &modify_features()
{
return ag_features;
}

View File

@ -384,6 +384,7 @@ std::ostream & colvarbias::write_traj(std::ostream &os)
os << " ";
if (b_output_energy)
os << " "
<< std::setprecision(cvm::en_prec) << std::setw(cvm::en_width)
<< bias_energy;
return os;
}

View File

@ -175,7 +175,11 @@ public:
static std::vector<feature *> cvb_features;
/// \brief Implementation of the feature list accessor for colvarbias
virtual std::vector<feature *> &features()
virtual const std::vector<feature *> &features()
{
return cvb_features;
}
virtual std::vector<feature *> &modify_features()
{
return cvb_features;
}

View File

@ -99,12 +99,9 @@ int colvarbias_restraint_centers::init(std::string const &conf)
if (null_centers) {
// try to initialize the restraint centers for the first time
colvar_centers.resize(num_variables());
colvar_centers_raw.resize(num_variables());
for (i = 0; i < num_variables(); i++) {
colvar_centers[i].type(variables(i)->value());
colvar_centers[i].reset();
colvar_centers_raw[i].type(variables(i)->value());
colvar_centers_raw[i].reset();
}
}
@ -113,7 +110,6 @@ int colvarbias_restraint_centers::init(std::string const &conf)
if (cvm::debug()) {
cvm::log("colvarbias_restraint: parsing initial centers, i = "+cvm::to_str(i)+".\n");
}
colvar_centers_raw[i] = colvar_centers[i];
colvar_centers[i].apply_constraints();
}
null_centers = false;
@ -141,8 +137,6 @@ int colvarbias_restraint_centers::change_configuration(std::string const &conf)
for (size_t i = 0; i < num_variables(); i++) {
colvar_centers[i].type(variables(i)->value());
colvar_centers[i].apply_constraints();
colvar_centers_raw[i].type(variables(i)->value());
colvar_centers_raw[i] = colvar_centers[i];
}
}
return COLVARS_OK;
@ -232,7 +226,6 @@ int colvarbias_restraint_moving::set_state_params(std::string const &conf)
{
if (b_chg_centers || b_chg_force_k) {
if (target_nstages) {
// cvm::log ("Reading current stage from the restart.\n");
if (!get_keyval(conf, "stage", stage))
cvm::error("Error: current stage is missing from the restart.\n");
}
@ -265,100 +258,127 @@ int colvarbias_restraint_centers_moving::init(std::string const &conf)
size_t i;
if (get_keyval(conf, "targetCenters", target_centers, colvar_centers)) {
if (colvar_centers.size() != num_variables()) {
if (target_centers.size() != num_variables()) {
cvm::error("Error: number of target centers does not match "
"that of collective variables.\n");
"that of collective variables.\n", INPUT_ERROR);
}
b_chg_centers = true;
for (i = 0; i < target_centers.size(); i++) {
target_centers[i].apply_constraints();
centers_incr.push_back(colvar_centers[i]);
centers_incr[i].reset();
}
}
if (b_chg_centers) {
// parse moving restraint options
// parse moving schedule options
colvarbias_restraint_moving::init(conf);
if (initial_centers.size() == 0) {
// One-time init
initial_centers = colvar_centers;
}
// Call to check that the definition is correct
for (i = 0; i < num_variables(); i++) {
colvarvalue const midpoint =
colvarvalue::interpolate(initial_centers[i],
target_centers[i],
0.5);
}
} else {
target_centers.clear();
return COLVARS_OK;
}
get_keyval(conf, "outputCenters", b_output_centers, b_output_centers);
get_keyval(conf, "outputAccumulatedWork", b_output_acc_work, b_output_acc_work);
get_keyval(conf, "outputAccumulatedWork", b_output_acc_work,
b_output_acc_work); // TODO this conflicts with stages
return COLVARS_OK;
}
int colvarbias_restraint_centers_moving::update_centers(cvm::real lambda)
{
if (cvm::debug()) {
cvm::log("Updating centers for the restraint bias \""+
this->name+"\": "+cvm::to_str(colvar_centers)+".\n");
}
size_t i;
for (i = 0; i < num_variables(); i++) {
colvarvalue const c_new = colvarvalue::interpolate(initial_centers[i],
target_centers[i],
lambda);
centers_incr[i] = (c_new).dist2_grad(colvar_centers[i]);
colvar_centers[i] = c_new;
variables(i)->wrap(colvar_centers[i]);
}
if (cvm::debug()) {
cvm::log("New centers for the restraint bias \""+
this->name+"\": "+cvm::to_str(colvar_centers)+".\n");
}
return cvm::get_error();
}
int colvarbias_restraint_centers_moving::update()
{
if (b_chg_centers) {
if (cvm::debug()) {
cvm::log("Updating centers for the restraint bias \""+
this->name+"\": "+cvm::to_str(colvar_centers)+".\n");
}
if (!centers_incr.size()) {
// if this is the first calculation, calculate the advancement
// at each simulation step (or stage, if applicable)
// (take current stage into account: it can be non-zero
// if we are restarting a staged calculation)
centers_incr.resize(num_variables());
for (size_t i = 0; i < num_variables(); i++) {
centers_incr[i].type(variables(i)->value());
centers_incr[i] = (target_centers[i] - colvar_centers_raw[i]) /
cvm::real( target_nstages ? (target_nstages - stage) :
(target_nsteps - cvm::step_absolute()));
}
if (cvm::debug()) {
cvm::log("Center increment for the restraint bias \""+
this->name+"\": "+cvm::to_str(centers_incr)+" at stage "+cvm::to_str(stage)+ ".\n");
}
}
if (target_nstages) {
if ((cvm::step_relative() > 0)
&& (cvm::step_absolute() % target_nsteps) == 0
&& stage < target_nstages) {
for (size_t i = 0; i < num_variables(); i++) {
colvar_centers_raw[i] += centers_incr[i];
colvar_centers[i] = colvar_centers_raw[i];
variables(i)->wrap(colvar_centers[i]);
colvar_centers[i].apply_constraints();
// Staged update
if (stage <= target_nstages) {
if ((cvm::step_relative() > 0) &&
((cvm::step_absolute() % target_nsteps) == 1)) {
cvm::real const lambda =
cvm::real(stage)/cvm::real(target_nstages);
update_centers(lambda);
stage++;
cvm::log("Moving restraint \"" + this->name +
"\" stage " + cvm::to_str(stage) +
" : setting centers to " + cvm::to_str(colvar_centers) +
" at step " + cvm::to_str(cvm::step_absolute()));
} else {
for (size_t i = 0; i < num_variables(); i++) {
centers_incr[i].reset();
}
}
stage++;
cvm::log("Moving restraint \"" + this->name +
"\" stage " + cvm::to_str(stage) +
" : setting centers to " + cvm::to_str(colvar_centers) +
" at step " + cvm::to_str(cvm::step_absolute()));
}
} else if ((cvm::step_relative() > 0) && (cvm::step_absolute() <= target_nsteps)) {
// move the restraint centers in the direction of the targets
// (slow growth)
} else {
// Continuous update
if (cvm::step_absolute() <= target_nsteps) {
cvm::real const lambda =
cvm::real(cvm::step_absolute())/cvm::real(target_nsteps);
update_centers(lambda);
} else {
for (size_t i = 0; i < num_variables(); i++) {
centers_incr[i].reset();
}
}
}
if (cvm::step_relative() == 0) {
for (size_t i = 0; i < num_variables(); i++) {
colvar_centers_raw[i] += centers_incr[i];
colvar_centers[i] = colvar_centers_raw[i];
variables(i)->wrap(colvar_centers[i]);
colvar_centers[i].apply_constraints();
// finite differences are undefined when restarting
centers_incr[i].reset();
}
}
if (cvm::debug()) {
cvm::log("New centers for the restraint bias \""+
this->name+"\": "+cvm::to_str(colvar_centers)+".\n");
cvm::log("Center increment for the restraint bias \""+
this->name+"\": "+cvm::to_str(centers_incr)+
" at stage "+cvm::to_str(stage)+ ".\n");
}
}
return COLVARS_OK;
return cvm::get_error();
}
int colvarbias_restraint_centers_moving::update_acc_work()
{
if (b_output_acc_work) {
if ((cvm::step_relative() > 0) || (cvm::step_absolute() == 0)) {
if ((cvm::step_relative() > 0) &&
(cvm::step_absolute() <= target_nsteps)) {
for (size_t i = 0; i < num_variables(); i++) {
// project forces on the calculated increments at this step
acc_work += colvar_forces[i] * centers_incr[i];
@ -383,13 +403,6 @@ std::string const colvarbias_restraint_centers_moving::get_state_params() const
<< colvar_centers[i];
}
os << "\n";
os << "centers_raw ";
for (i = 0; i < num_variables(); i++) {
os << " "
<< std::setprecision(cvm::cv_prec) << std::setw(cvm::cv_width)
<< colvar_centers_raw[i];
}
os << "\n";
if (b_output_acc_work) {
os << "accumulatedWork "
@ -398,7 +411,7 @@ std::string const colvarbias_restraint_centers_moving::get_state_params() const
}
}
return colvarbias_restraint_moving::get_state_params() + os.str();
return os.str();
}
@ -410,8 +423,6 @@ int colvarbias_restraint_centers_moving::set_state_params(std::string const &con
// cvm::log ("Reading the updated restraint centers from the restart.\n");
if (!get_keyval(conf, "centers", colvar_centers))
cvm::error("Error: restraint centers are missing from the restart.\n");
if (!get_keyval(conf, "centers_raw", colvar_centers_raw))
cvm::error("Error: \"raw\" restraint centers are missing from the restart.\n");
if (b_output_acc_work) {
if (!get_keyval(conf, "accumulatedWork", acc_work))
cvm::error("Error: accumulatedWork is missing from the restart.\n");
@ -609,7 +620,7 @@ std::string const colvarbias_restraint_k_moving::get_state_params() const
<< std::setprecision(cvm::en_prec)
<< std::setw(cvm::en_width) << force_k << "\n";
}
return colvarbias_restraint_moving::get_state_params() + os.str();
return os.str();
}
@ -770,6 +781,7 @@ cvm::real colvarbias_restraint_harmonic::d_restraint_potential_dk(size_t i) cons
std::string const colvarbias_restraint_harmonic::get_state_params() const
{
return colvarbias_restraint::get_state_params() +
colvarbias_restraint_moving::get_state_params() +
colvarbias_restraint_centers_moving::get_state_params() +
colvarbias_restraint_k_moving::get_state_params();
}
@ -779,6 +791,7 @@ int colvarbias_restraint_harmonic::set_state_params(std::string const &conf)
{
int error_code = COLVARS_OK;
error_code |= colvarbias_restraint::set_state_params(conf);
error_code |= colvarbias_restraint_moving::set_state_params(conf);
error_code |= colvarbias_restraint_centers_moving::set_state_params(conf);
error_code |= colvarbias_restraint_k_moving::set_state_params(conf);
return error_code;
@ -1037,6 +1050,7 @@ cvm::real colvarbias_restraint_harmonic_walls::d_restraint_potential_dk(size_t i
std::string const colvarbias_restraint_harmonic_walls::get_state_params() const
{
return colvarbias_restraint::get_state_params() +
colvarbias_restraint_moving::get_state_params() +
colvarbias_restraint_k_moving::get_state_params();
}
@ -1045,6 +1059,7 @@ int colvarbias_restraint_harmonic_walls::set_state_params(std::string const &con
{
int error_code = COLVARS_OK;
error_code |= colvarbias_restraint::set_state_params(conf);
error_code |= colvarbias_restraint_moving::set_state_params(conf);
error_code |= colvarbias_restraint_k_moving::set_state_params(conf);
return error_code;
}
@ -1164,6 +1179,7 @@ cvm::real colvarbias_restraint_linear::d_restraint_potential_dk(size_t i) const
std::string const colvarbias_restraint_linear::get_state_params() const
{
return colvarbias_restraint::get_state_params() +
colvarbias_restraint_moving::get_state_params() +
colvarbias_restraint_centers_moving::get_state_params() +
colvarbias_restraint_k_moving::get_state_params();
}
@ -1173,6 +1189,7 @@ int colvarbias_restraint_linear::set_state_params(std::string const &conf)
{
int error_code = COLVARS_OK;
error_code |= colvarbias_restraint::set_state_params(conf);
error_code |= colvarbias_restraint_moving::set_state_params(conf);
error_code |= colvarbias_restraint_centers_moving::set_state_params(conf);
error_code |= colvarbias_restraint_k_moving::set_state_params(conf);
return error_code;

View File

@ -74,9 +74,6 @@ protected:
/// \brief Restraint centers
std::vector<colvarvalue> colvar_centers;
/// \brief Restraint centers outside the domain of the colvars (no wrapping or constraints applied)
std::vector<colvarvalue> colvar_centers_raw;
};
@ -156,10 +153,16 @@ protected:
/// \brief New restraint centers
std::vector<colvarvalue> target_centers;
/// \brief Initial value of the restraint centers
std::vector<colvarvalue> initial_centers;
/// \brief Amplitude of the restraint centers' increment at each step
/// (or stage) towards the new values (calculated from target_nsteps)
/// towards the new values (calculated from target_nsteps)
std::vector<colvarvalue> centers_incr;
/// \brief Update the centers by interpolating between initial and target
virtual int update_centers(cvm::real lambda);
/// Whether to write the current restraint centers to the trajectory file
bool b_output_centers;

View File

@ -132,9 +132,15 @@ public:
static std::vector<feature *> cvc_features;
/// \brief Implementation of the feature list accessor for colvar
virtual std::vector<feature *> &features() {
virtual const std::vector<feature *> &features()
{
return cvc_features;
}
virtual std::vector<feature *> &modify_features()
{
return cvc_features;
}
/// \brief Obtain data needed for the calculation for the backend
virtual void read_data();

View File

@ -374,8 +374,8 @@ int colvardeps::decr_ref_count(int feature_id) {
}
void colvardeps::init_feature(int feature_id, const char *description, feature_type type) {
features()[feature_id]->description = description;
features()[feature_id]->type = type;
modify_features()[feature_id]->description = description;
modify_features()[feature_id]->type = type;
}
// Shorthand macros for describing dependencies
@ -401,7 +401,7 @@ void colvardeps::init_cvb_requires() {
int i;
if (features().size() == 0) {
for (i = 0; i < f_cvb_ntot; i++) {
features().push_back(new feature);
modify_features().push_back(new feature);
}
init_feature(f_cvb_active, "active", f_type_dynamic);
@ -438,7 +438,7 @@ void colvardeps::init_cv_requires() {
size_t i;
if (features().size() == 0) {
for (i = 0; i < f_cv_ntot; i++) {
features().push_back(new feature);
modify_features().push_back(new feature);
}
init_feature(f_cv_active, "active", f_type_dynamic);
@ -554,7 +554,7 @@ void colvardeps::init_cvc_requires() {
// Initialize static array once and for all
if (features().size() == 0) {
for (i = 0; i < colvardeps::f_cvc_ntot; i++) {
features().push_back(new feature);
modify_features().push_back(new feature);
}
init_feature(f_cvc_active, "active", f_type_dynamic);
@ -633,7 +633,7 @@ void colvardeps::init_ag_requires() {
// Initialize static array once and for all
if (features().size() == 0) {
for (i = 0; i < f_ag_ntot; i++) {
features().push_back(new feature);
modify_features().push_back(new feature);
}
init_feature(f_ag_active, "active", f_type_dynamic);

View File

@ -135,7 +135,8 @@ public:
// with a non-static array
// Intermediate classes (colvarbias and colvarcomp, which are also base classes)
// implement this as virtual to allow overriding
virtual std::vector<feature *>&features() = 0;
virtual const std::vector<feature *>&features() = 0;
virtual std::vector<feature *>&modify_features() = 0;
void add_child(colvardeps *child);

View File

@ -1,4 +1,5 @@
#define COLVARS_VERSION "2017-07-15"
#ifndef COLVARS_VERSION
#define COLVARS_VERSION "2017-08-06"
// This file is part of the Collective Variables module (Colvars).
// The original version of Colvars and its updates are located at:
// https://github.com/colvars/colvars
@ -6,3 +7,4 @@
// If you wish to distribute your changes, please submit them to the
// Colvars repository at GitHub.
#endif

View File

@ -472,7 +472,7 @@ int colvarscript::proc_features(colvardeps *obj,
}
if ((subcmd == "get") || (subcmd == "set")) {
std::vector<colvardeps::feature *> &features = obj->features();
std::vector<colvardeps::feature *> const &features = obj->features();
std::string const req_feature(obj_to_str(objv[3]));
colvardeps::feature *f = NULL;
int fid = 0;

View File

@ -19,6 +19,17 @@ bool colvarmodule::rotation::monitor_crossings = false;
cvm::real colvarmodule::rotation::crossing_threshold = 1.0E-02;
/// Numerical recipes diagonalization
static int jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot);
/// Eigenvector sort
static int eigsrt(cvm::real *d, cvm::real **v);
/// Transpose the matrix
static int transpose(cvm::real **v);
std::string cvm::rvector::to_simple_string() const
{
std::ostringstream os;
@ -286,7 +297,12 @@ void colvarmodule::rotation::diagonalize_matrix(cvm::matrix2d<cvm::real> &S,
// diagonalize
int jac_nrot = 0;
jacobi(S.c_array(), S_eigval.c_array(), S_eigvec.c_array(), &jac_nrot);
if (jacobi(S.c_array(), S_eigval.c_array(), S_eigvec.c_array(), &jac_nrot) !=
COLVARS_OK) {
cvm::error("Too many iterations in routine jacobi.\n"
"This is usually the result of an ill-defined set of atoms for "
"rotational alignment (RMSD, rotateReference, etc).\n");
}
eigsrt(S_eigval.c_array(), S_eigvec.c_array());
// jacobi saves eigenvectors by columns
transpose(S_eigvec.c_array());
@ -528,7 +544,7 @@ void colvarmodule::rotation::calc_optimal_rotation(std::vector<cvm::atom_pos> co
#define n 4
void jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot)
int jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot)
{
int j,iq,ip,i;
cvm::real tresh,theta,tau,t,sm,s,h,g,c;
@ -554,7 +570,7 @@ void jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot)
sm += std::fabs(a[ip][iq]);
}
if (sm == 0.0) {
return;
return COLVARS_OK;
}
if (i < 4)
tresh=0.2*sm/(n*n);
@ -606,10 +622,11 @@ void jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot)
z[ip]=0.0;
}
}
cvm::error("Too many iterations in routine jacobi.\n");
return COLVARS_ERROR;
}
void eigsrt(cvm::real *d, cvm::real **v)
int eigsrt(cvm::real *d, cvm::real **v)
{
int k,j,i;
cvm::real p;
@ -628,9 +645,11 @@ void eigsrt(cvm::real *d, cvm::real **v)
}
}
}
return COLVARS_OK;
}
void transpose(cvm::real **v)
int transpose(cvm::real **v)
{
cvm::real p;
int i,j;
@ -641,6 +660,7 @@ void transpose(cvm::real **v)
v[j][i]=p;
}
}
return COLVARS_OK;
}
#undef n

View File

@ -1020,16 +1020,6 @@ inline cvm::rvector operator * (cvm::rmatrix const &m,
}
/// Numerical recipes diagonalization
void jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot);
/// Eigenvector sort
void eigsrt(cvm::real *d, cvm::real **v);
/// Transpose the matrix
void transpose(cvm::real **v);
/// \brief 1-dimensional vector of real numbers with four components and

View File

@ -570,6 +570,50 @@ colvarvalue colvarvalue::dist2_grad(colvarvalue const &x2) const
}
/// Return the midpoint between x1 and x2, optionally weighted by lambda
/// (which must be between 0.0 and 1.0)
colvarvalue const colvarvalue::interpolate(colvarvalue const &x1,
colvarvalue const &x2,
cvm::real const lambda)
{
colvarvalue::check_types(x1, x2);
if ((lambda < 0.0) || (lambda > 1.0)) {
cvm::error("Error: trying to interpolate between two colvarvalues with a "
"lamdba outside [0:1].\n", BUG_ERROR);
}
colvarvalue interp = ((1.0-lambda)*x1 + lambda*x2);
cvm::real const d2 = x1.dist2(x2);
switch (x1.type()) {
case colvarvalue::type_scalar:
case colvarvalue::type_3vector:
case colvarvalue::type_vector:
case colvarvalue::type_unit3vectorderiv:
case colvarvalue::type_quaternionderiv:
return interp;
break;
case colvarvalue::type_unit3vector:
case colvarvalue::type_quaternion:
if (interp.norm()/std::sqrt(d2) < 1.0e-6) {
cvm::error("Error: interpolation between "+cvm::to_str(x1)+" and "+
cvm::to_str(x2)+" with lambda = "+cvm::to_str(lambda)+
" is undefined: result = "+cvm::to_str(interp)+"\n",
INPUT_ERROR);
}
interp.apply_constraints();
return interp;
break;
case colvarvalue::type_notset:
default:
x1.undef_op();
break;
}
return colvarvalue(colvarvalue::type_notset);
}
std::string colvarvalue::to_simple_string() const
{
switch (type()) {

View File

@ -193,6 +193,12 @@ public:
/// Derivative with respect to this \link colvarvalue \endlink of the square distance
colvarvalue dist2_grad(colvarvalue const &x2) const;
/// Return the midpoint between x1 and x2, optionally weighted by lambda
/// (which must be between 0.0 and 1.0)
static colvarvalue const interpolate(colvarvalue const &x1,
colvarvalue const &x2,
cvm::real const lambda = 0.5);
/// Assignment operator (type of x is checked)
colvarvalue & operator = (colvarvalue const &x);
@ -285,10 +291,10 @@ public:
cvm::real & operator [] (int const i);
/// Ensure that the two types are the same within a binary operator
int static check_types(colvarvalue const &x1, colvarvalue const &x2);
static int check_types(colvarvalue const &x1, colvarvalue const &x2);
/// Ensure that the two types are the same within an assignment, or that the left side is type_notset
int static check_types_assign(Type const &vt1, Type const &vt2);
static int check_types_assign(Type const &vt1, Type const &vt2);
/// Undefined operation
void undef_op() const;
@ -317,14 +323,14 @@ public:
/// \brief Optimized routine for the inner product of one collective
/// variable with an array
void static inner_opt(colvarvalue const &x,
static void inner_opt(colvarvalue const &x,
std::vector<colvarvalue>::iterator &xv,
std::vector<colvarvalue>::iterator const &xv_end,
std::vector<cvm::real>::iterator &result);
/// \brief Optimized routine for the inner product of one collective
/// variable with an array
void static inner_opt(colvarvalue const &x,
static void inner_opt(colvarvalue const &x,
std::list<colvarvalue>::iterator &xv,
std::list<colvarvalue>::iterator const &xv_end,
std::vector<cvm::real>::iterator &result);
@ -332,14 +338,14 @@ public:
/// \brief Optimized routine for the second order Legendre
/// polynomial, (3cos^2(w)-1)/2, of one collective variable with an
/// array
void static p2leg_opt(colvarvalue const &x,
static void p2leg_opt(colvarvalue const &x,
std::vector<colvarvalue>::iterator &xv,
std::vector<colvarvalue>::iterator const &xv_end,
std::vector<cvm::real>::iterator &result);
/// \brief Optimized routine for the second order Legendre
/// polynomial of one collective variable with an array
void static p2leg_opt(colvarvalue const &x,
static void p2leg_opt(colvarvalue const &x,
std::list<colvarvalue>::iterator &xv,
std::list<colvarvalue>::iterator const &xv_end,
std::vector<cvm::real>::iterator &result);

View File

@ -22,21 +22,21 @@
offset=tid & (t_per_atom-1); \
ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;
#define nbor_info(nbor_mem, packed_mem, nbor_stride, t_per_atom, ii, offset, \
i, numj, stride, nbor_end, nbor_begin) \
i=nbor_mem[ii]; \
nbor_begin=ii+nbor_stride; \
numj=nbor_mem[nbor_begin]; \
if (nbor_mem==packed_mem) { \
nbor_begin+=nbor_stride+fast_mul(ii,t_per_atom-1); \
stride=fast_mul(t_per_atom,nbor_stride); \
nbor_end=nbor_begin+fast_mul(numj/t_per_atom,stride)+(numj & (t_per_atom-1)); \
#define nbor_info(dev_nbor, dev_packed, nbor_pitch, t_per_atom, ii, offset, \
i, numj, n_stride, nbor_end, nbor_begin) \
i=dev_nbor[ii]; \
nbor_begin=ii+nbor_pitch; \
numj=dev_nbor[nbor_begin]; \
if (dev_nbor==dev_packed) { \
nbor_begin+=nbor_pitch+fast_mul(ii,t_per_atom-1); \
n_stride=fast_mul(t_per_atom,nbor_pitch); \
nbor_end=nbor_begin+fast_mul(numj/t_per_atom,n_stride)+(numj & (t_per_atom-1)); \
nbor_begin+=offset; \
} else { \
nbor_begin+=nbor_stride; \
nbor_begin=nbor_mem[nbor_begin]; \
nbor_begin+=nbor_pitch; \
nbor_begin=dev_nbor[nbor_begin]; \
nbor_end=nbor_begin+numj; \
stride=t_per_atom; \
n_stride=t_per_atom; \
nbor_begin+=offset; \
}

View File

@ -20,7 +20,7 @@ using namespace LAMMPS_AL;
extern Device<PRECISION,ACC_PRECISION> global_device;
template <class numtyp, class acctyp>
BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) {
BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) {
device=&global_device;
ans=new Answer<numtyp,acctyp>();
nbor=new Neighbor();
@ -53,8 +53,8 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size, const double gpu_split,
FILE *_screen, const void *pair_program,
const char *k_two, const char *k_three_center,
const char *k_three_end) {
const char *two, const char *three_center,
const char *three_end, const char *short_nbor) {
screen=_screen;
int gpu_nbor=0;
@ -70,10 +70,10 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
_gpu_host=1;
_threads_per_atom=device->threads_per_atom();
if (_threads_per_atom>1 && gpu_nbor==0) {
if (_threads_per_atom>1 && gpu_nbor==0) { // neigh no and tpa > 1
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
} else // neigh yes or tpa == 1
_nbor_data=&(nbor->dev_nbor);
if (_threads_per_atom*_threads_per_atom>device->warp_size())
return -10;
@ -97,7 +97,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
_block_pair=device->pair_block_size();
_block_size=device->block_ellipse();
compile_kernels(*ucl_device,pair_program,k_two,k_three_center,k_three_end);
compile_kernels(*ucl_device,pair_program,two,three_center,three_end,short_nbor);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
@ -113,6 +113,11 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
_max_an_bytes+=ans2->gpu_bytes();
#endif
int ef_nall=nall;
if (ef_nall==0)
ef_nall=2000;
dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE);
return 0;
}
@ -136,6 +141,7 @@ void BaseThreeT::clear_atomic() {
k_three_end.clear();
k_three_end_vatom.clear();
k_pair.clear();
k_short_nbor.clear();
delete pair_program;
_compiled=false;
}
@ -143,6 +149,7 @@ void BaseThreeT::clear_atomic() {
time_pair.clear();
hd_balancer.clear();
dev_short_nbor.clear();
nbor->clear();
ans->clear();
#ifdef THREE_CONCURRENT
@ -169,6 +176,8 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
if (!success)
return NULL;
_nall = nall;
// originally the requirement that nall == nlist was enforced
// to allow direct indexing neighbors of neighbors after re-arrangement
// nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size());
@ -203,6 +212,8 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
return 0;
atom->cast_copy_x(host_x,host_type);
_nall = nall;
int mn;
nbor->build_nbor_list(host_x, nall, host_inum, nall, *atom, sublo, subhi, tag,
nspecial, special, success, mn);
@ -247,12 +258,22 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
if (!success)
return;
_max_nbors = nbor->max_nbor_loop(nlist,numj,ilist);
}
atom->cast_x_data(host_x,host_type);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
// re-allocate dev_short_nbor if necessary
if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
dev_short_nbor.resize((2+_max_nbors)*_nmax);
}
// _ainum to be used in loop() for short neighbor list build
_ainum = nlist;
int evatom=0;
if (eatom || vatom)
evatom=1;
@ -300,7 +321,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
_max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
@ -313,6 +334,15 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
*ilist=nbor->host_ilist.begin();
*jnum=nbor->host_acc.begin();
// re-allocate dev_short_nbor if necessary
if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
dev_short_nbor.resize((2+_max_nbors)*_nmax);
}
// _ainum to be used in loop() for short neighbor list build
_ainum = nall;
int evatom=0;
if (eatom || vatom)
evatom=1;
@ -339,19 +369,20 @@ double BaseThreeT::host_memory_usage_atomic() const {
template <class numtyp, class acctyp>
void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str,
const char *ktwo, const char *kthree_center,
const char *kthree_end) {
const char *two, const char *three_center,
const char *three_end, const char* short_nbor) {
if (_compiled)
return;
std::string vatom_name=std::string(kthree_end)+"_vatom";
std::string vatom_name=std::string(three_end)+"_vatom";
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,device->compile_string().c_str());
k_three_center.set_function(*pair_program,kthree_center);
k_three_end.set_function(*pair_program,kthree_end);
k_three_center.set_function(*pair_program,three_center);
k_three_end.set_function(*pair_program,three_end);
k_three_end_vatom.set_function(*pair_program,vatom_name.c_str());
k_pair.set_function(*pair_program,ktwo);
k_pair.set_function(*pair_program,two);
k_short_nbor.set_function(*pair_program,short_nbor);
pos_tex.get_texture(*pair_program,"pos_tex");
#ifdef THREE_CONCURRENT

View File

@ -56,7 +56,8 @@ class BaseThree {
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const void *pair_program, const char *k_two,
const char *k_three_center, const char *k_three_end);
const char *k_three_center, const char *k_three_end,
const char *k_short_nbor=NULL);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
@ -73,18 +74,18 @@ class BaseThree {
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to copied to host
* \param current maximum number of neighbors
/** \param inum number of particles whose nbors must be stored on device
* \param max_nbors maximum number of neighbors
* \param success set to false if insufficient memory
* \note olist_size=total number of local particles **/
inline void resize_local(const int inum, const int max_nbors, bool &success) {
nbor->resize(inum,max_nbors,success);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
/** \param inum number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to copied to host
* \param current maximum number of neighbors
* \param max_nbors current maximum number of neighbors
* \note host_inum is 0 if the host is performing neighboring
* \note nlocal+host_inum=total number local particles
* \note olist_size=0 **/
@ -143,14 +144,6 @@ class BaseThree {
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success);
/// Pair loop with device neighboring
int * compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success);
/// Pair loop with device neighboring
int ** compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo,
@ -193,6 +186,9 @@ class BaseThree {
/// Neighbor data
Neighbor *nbor;
UCL_D_Vec<int> dev_short_nbor;
UCL_Kernel k_short_nbor;
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program;
UCL_Kernel k_pair, k_three_center, k_three_end, k_three_end_vatom;
@ -207,12 +203,13 @@ class BaseThree {
int _block_pair, _block_size, _threads_per_atom, _end_command_queue;
int _gpu_nbor;
double _max_bytes, _max_an_bytes;
int _max_nbors, _ainum, _nall;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const void *pair_string,
const char *k_two, const char *k_three_center,
const char *k_three_end);
const char *two, const char *three_center,
const char *three_end, const char* short_nbor);
virtual void loop(const bool _eflag, const bool _vflag,
const int evatom) = 0;

View File

@ -55,7 +55,7 @@ int SWT::init(const int ntypes, const int nlocal, const int nall, const int max_
int success;
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
_screen,sw,"k_sw","k_sw_three_center",
"k_sw_three_end");
"k_sw_three_end","k_sw_short_nbor");
if (success!=0)
return success;
@ -193,19 +193,30 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
// build the short neighbor list
int ainum=this->_ainum;
int nbor_pitch=this->nbor->nbor_pitch();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/this->_threads_per_atom)));
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &sw3, &map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor, &ainum,
&nbor_pitch, &this->_threads_per_atom);
// this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1
// this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
ainum=this->ans->inum();
nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
this->time_pair.start();
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
@ -217,6 +228,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_center.run(&this->atom->x, &sw1, &sw2, &sw3,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &evatom);
@ -231,7 +243,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
@ -240,7 +252,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);

View File

@ -130,6 +130,63 @@ texture<int4> sw3_tex;
#endif
__kernel void k_sw_short_nbor(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict sw3,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements,
const __global int * dev_nbor,
const __global int * dev_packed,
__global int * dev_short_nbor,
const int inum, const int nbor_pitch, const int t_per_atom) {
__local int n_stride;
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
int ncount = 0;
int m = nbor;
dev_short_nbor[m] = 0;
int nbor_short = nbor+n_stride;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int nj = j;
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<sw3[ijparam].y) { // sw_cutsq = sw3[ijparam].y
dev_short_nbor[nbor_short] = nj;
nbor_short += n_stride;
ncount++;
}
} // for nbor
// store the number of neighbors for each thread
dev_short_nbor[m] = ncount;
} // if ii
}
__kernel void k_sw(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict sw1,
@ -140,6 +197,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
const int nelements,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
@ -158,8 +216,8 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
int nbor, nbor_end, i, numj;
const int* nbor_mem = dev_packed;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
@ -167,9 +225,17 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor];
nbor += n_stride;
nbor_end = nbor+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int j=nbor_mem[nbor];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -337,6 +403,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
const int nelements,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -361,7 +428,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -371,9 +438,18 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -395,14 +471,23 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
sw_cut_ij=sw3_ijparam.x;
int nbor_k=nbor_j-offset_j+offset_k;
if (nbor_k<=nbor_j)
nbor_k+=n_stride;
int nbor_k,k_end;
if (dev_packed==dev_nbor) {
nbor_k=nborj_start-offset_j+offset_k;
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
} else {
nbor_k = nbor_j-offset_j+offset_k;
if (nbor_k<=nbor_j) nbor_k += n_stride;
k_end = nbor_end;
}
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (dev_packed==dev_nbor && k <= j) continue;
numtyp4 kx; fetch4(kx,k,pos_tex);
int ktype=kx.w;
ktype=map[ktype];
@ -460,6 +545,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -484,7 +570,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -494,8 +580,16 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -534,8 +628,15 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
nbor_k+=offset_k;
}
// recalculate numk and k_end for the use of short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -598,6 +699,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -622,7 +724,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -632,8 +734,16 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -672,8 +782,15 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
nbor_k+=offset_k;
}
// recalculate numk and k_end for the use of short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;

View File

@ -55,7 +55,8 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int
int success;
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
_screen,tersoff,"k_tersoff_repulsive",
"k_tersoff_three_center", "k_tersoff_three_end");
"k_tersoff_three_center", "k_tersoff_three_end",
"k_tersoff_short_nbor");
if (success!=0)
return success;
@ -157,11 +158,16 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int
UCL_H_Vec<numtyp> cutsq_view(nparams,*(this->ucl_device),
UCL_WRITE_ONLY);
for (int i=0; i<nparams; i++)
double cutsqmax = 0.0;
for (int i=0; i<nparams; i++) {
cutsq_view[i]=static_cast<numtyp>(host_cutsq[i]);
if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i];
}
cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
ucl_copy(cutsq,cutsq_view,false);
_cutshortsq = static_cast<numtyp>(cutsqmax);
UCL_H_Vec<int> dview_elem2param(nelements*nelements*nelements,
*(this->ucl_device), UCL_WRITE_ONLY);
@ -219,171 +225,6 @@ double TersoffT::host_memory_usage() const {
#define KTHREADS this->_threads_per_atom
#define JTHREADS this->_threads_per_atom
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void TersoffT::compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start,
const double cpu_time, bool &success) {
this->acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
this->resize_atom(0,nall,success);
this->zero_timers();
return;
}
int ago=this->hd_balancer.ago_first(f_ago);
int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
this->ans->inum(inum);
#ifdef THREE_CONCURRENT
this->ans2->inum(inum);
#endif
host_start=inum;
if (ago==0) {
this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
if (!success)
return;
_max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist);
}
this->atom->cast_x_data(host_x,host_type);
this->hd_balancer.start_timer();
this->atom->add_x_data(host_x,host_type);
// re-allocate zetaij if necessary
if (nall*_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(_max_nbors*_nmax);
}
int _eflag;
if (eflag)
_eflag=1;
else
_eflag=0;
int ainum=nlist;
int nbor_pitch=this->nbor->nbor_pitch();
int BX=this->block_pair();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
evatom=1;
#ifdef THREE_CONCURRENT
this->ucl_device->sync();
#endif
loop(eflag,vflag,evatom);
this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
this->device->add_ans_object(this->ans);
#ifdef THREE_CONCURRENT
this->ans2->copy_answers(eflag,vflag,eatom,vatom,ilist);
this->device->add_ans_object(this->ans2);
#endif
this->hd_balancer.stop_timer();
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int ** TersoffT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success) {
this->acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
this->resize_atom(0,nall,success);
this->zero_timers();
return NULL;
}
this->hd_balancer.balance(cpu_time);
int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
this->ans->inum(inum);
#ifdef THREE_CONCURRENT
this->ans2->inum(inum);
#endif
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
_max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
this->hd_balancer.start_timer();
} else {
this->atom->cast_x_data(host_x,host_type);
this->hd_balancer.start_timer();
this->atom->add_x_data(host_x,host_type);
}
*ilist=this->nbor->host_ilist.begin();
*jnum=this->nbor->host_acc.begin();
// re-allocate zetaij if necessary
if (nall*_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(_max_nbors*_nmax);
}
int _eflag;
if (eflag)
_eflag=1;
else
_eflag=0;
int ainum=nall;
int nbor_pitch=this->nbor->nbor_pitch();
int BX=this->block_pair();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
evatom=1;
#ifdef THREE_CONCURRENT
this->ucl_device->sync();
#endif
loop(eflag,vflag,evatom);
this->ans->copy_answers(eflag,vflag,eatom,vatom);
this->device->add_ans_object(this->ans);
#ifdef THREE_CONCURRENT
this->ans2->copy_answers(eflag,vflag,eatom,vatom);
this->device->add_ans_object(this->ans2);
#endif
this->hd_balancer.stop_timer();
return this->nbor->host_jlist.begin()-host_start;
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
@ -402,9 +243,40 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
else
vflag=0;
int ainum=this->ans->inum();
// build the short neighbor list
int ainum=this->_ainum;
int nbor_pitch=this->nbor->nbor_pitch();
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/this->_threads_per_atom)));
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
&elem2param, &_nelements, &_nparams,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor, &ainum,
&nbor_pitch, &this->_threads_per_atom);
// re-allocate zetaij if necessary
int nall = this->_nall;
if (nall*this->_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(this->_max_nbors*_nmax);
}
nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->_ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);
ainum=this->ans->inum();
nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
this->time_pair.start();
@ -412,6 +284,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_pair.run(&this->atom->x, &ts1, &ts2, &cutsq,
&map, &elem2param, &_nelements, &_nparams,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
@ -423,6 +296,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &evatom);
@ -437,7 +311,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
@ -446,7 +320,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
}

View File

@ -106,7 +106,7 @@ texture<int4> ts5_tex;
ans[ii]=old; \
}
#define store_zeta(z, tid, t_per_atom, offset) \
#define acc_zeta(z, tid, t_per_atom, offset) \
if (t_per_atom>1) { \
__local acctyp red_acc[BLOCK_PAIR]; \
red_acc[tid]=z; \
@ -155,7 +155,7 @@ texture<int4> ts5_tex;
ans[ii]=old; \
}
#define store_zeta(z, tid, t_per_atom, offset) \
#define acc_zeta(z, tid, t_per_atom, offset) \
if (t_per_atom>1) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
z += shfl_xor(z, s, t_per_atom); \
@ -164,6 +164,65 @@ texture<int4> ts5_tex;
#endif
__kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_,
const __global numtyp *restrict cutsq,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements, const int nparams,
const __global int * dev_nbor,
const __global int * dev_packed,
__global int * dev_short_nbor,
const int inum, const int nbor_pitch,
const int t_per_atom) {
__local int n_stride;
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
int ncount = 0;
int m = nbor;
dev_short_nbor[m] = 0;
int nbor_short = nbor+n_stride;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int nj = j;
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[ijparam]) {
dev_short_nbor[nbor_short] = nj;
nbor_short += n_stride;
ncount++;
}
} // for nbor
// store the number of neighbors for each thread
dev_short_nbor[m] = ncount;
} // if ii
}
// Tersoff is currently used for 3 elements at most: 3*3*3 = 27 entries
// while the block size should never be less than 32.
// SHARED_SIZE = 32 for now to reduce the pressure on the shared memory per block
@ -184,6 +243,7 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
__global acctyp4 * zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
const int eflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
__local int tpa_sq,n_stride;
@ -211,22 +271,29 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor_j, nbor_end;
int i, numj;
int nbor_j, nbor_end, i, numj;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
int offset_k=tid & (t_per_atom-1);
int nborj_start = nbor_j;
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -241,14 +308,20 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
delr1.z = jx.z-ix.z;
numtyp rsq1 = delr1.x*delr1.x+delr1.y*delr1.y+delr1.z*delr1.z;
if (rsq1 > cutsq[ijparam]) continue;
// if (rsq1 > cutsq[ijparam]) continue;
// compute zeta_ij
z = (acctyp)0;
int nbor_k = nborj_start-offset_j+offset_k;
for ( ; nbor_k < nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k_end = nbor_end;
if (dev_packed==dev_nbor) {
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k < k_end; nbor_k+=n_stride) {
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == j) continue;
@ -284,10 +357,12 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
//int idx = jj*n_stride + i*t_per_atom + offset_j;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
i, nbor_j, offset_j, idx);
store_zeta(z, tid, t_per_atom, offset_k);
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
int idx = nbor_j;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// i, nbor_j, offset_j, idx);
acc_zeta(z, tid, t_per_atom, offset_k);
numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex);
numtyp ijparam_lam2 = ts1_ijparam.y;
@ -330,6 +405,7 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
const int nelements, const int nparams,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -356,8 +432,8 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
int nbor, nbor_end, i, numj;
const int* nbor_mem=dev_packed;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
@ -365,9 +441,17 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor];
nbor += n_stride;
nbor_end = nbor+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int j=nbor_mem[nbor];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -382,32 +466,31 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[ijparam]) {
numtyp feng[2];
numtyp ijparam_lam1 = ts1[ijparam].x;
numtyp4 ts2_ijparam = ts2[ijparam];
numtyp ijparam_biga = ts2_ijparam.x;
numtyp ijparam_bigr = ts2_ijparam.z;
numtyp ijparam_bigd = ts2_ijparam.w;
// rsq<cutsq[ijparam]
numtyp feng[2];
numtyp ijparam_lam1 = ts1[ijparam].x;
numtyp4 ts2_ijparam = ts2[ijparam];
numtyp ijparam_biga = ts2_ijparam.x;
numtyp ijparam_bigr = ts2_ijparam.z;
numtyp ijparam_bigd = ts2_ijparam.w;
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
rsq, eflag, feng);
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
rsq, eflag, feng);
numtyp force = feng[0];
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
numtyp force = feng[0];
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0)
energy+=feng[1];
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
if (eflag>0)
energy+=feng[1];
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
} // for nbor
@ -428,6 +511,7 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -461,20 +545,28 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
int offset_k=tid & (t_per_atom-1);
int nborj_start = nbor_j;
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -489,7 +581,6 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp r1 = ucl_sqrt(rsq1);
numtyp r1inv = ucl_rsqrt(rsq1);
@ -497,9 +588,11 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
//int idx = jj*n_stride + i*t_per_atom + offset_j;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
i, nbor_j, offset_j, idx);
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
int idx = nbor_j;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// i, nbor_j, offset_j, idx);
acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
numtyp force = zeta_ij.x*tpainv;
numtyp prefactor = zeta_ij.y;
@ -520,9 +613,15 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
virial[5] += delr1[1]*delr1[2]*mforce;
}
int nbor_k=nborj_start-offset_j+offset_k;
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int nbor_k = nborj_start-offset_j+offset_k;
int k_end = nbor_end;
if (dev_packed==dev_nbor) {
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (j == k) continue;
@ -598,6 +697,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -632,7 +732,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -643,9 +743,18 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
itype=map[itype];
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -660,8 +769,6 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp mdelr1[3];
mdelr1[0] = -delr1[0];
mdelr1[1] = -delr1[1];
@ -683,13 +790,20 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
k_end=nbor_k+numk;
nbor_k+=offset_k;
}
// recalculate numk and k_end for the use of short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
int nbork_start = nbor_k;
// look up for zeta_ji: find i in the j's neighbor list
int m = tid / t_per_atom;
int ijnum = -1;
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) {
ijnum = nbor_k;
@ -711,9 +825,11 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, ijnum, offset_kf, idx);
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
int idx = ijnum;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, ijnum, offset_kf, idx);
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
numtyp force = zeta_ji.x*tpainv;
numtyp prefactor_ji = zeta_ji.y;
@ -736,7 +852,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
// attractive forces
for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -777,9 +893,11 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
//int idx = kk*n_stride + j*t_per_atom + offset_k;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, nbor_k, offset_k, idx);
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
int idx = nbor_k;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, nbor_k, offset_k, idx);
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
numtyp prefactor_jk = zeta_jk.y;
int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
@ -824,6 +942,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -858,7 +977,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -869,9 +988,18 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
itype=map[itype];
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -886,8 +1014,6 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp mdelr1[3];
mdelr1[0] = -delr1[0];
mdelr1[1] = -delr1[1];
@ -909,13 +1035,20 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
k_end=nbor_k+numk;
nbor_k+=offset_k;
}
// recalculate numk and k_end for the use of short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
int nbork_start = nbor_k;
// look up for zeta_ji
int m = tid / t_per_atom;
int ijnum = -1;
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) {
ijnum = nbor_k;
@ -937,9 +1070,11 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, ijnum, offset_kf, idx);
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
int idx = ijnum;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, ijnum, offset_kf, idx);
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
numtyp force = zeta_ji.x*tpainv;
numtyp prefactor_ji = zeta_ji.y;
@ -962,7 +1097,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
// attractive forces
for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -1010,9 +1145,11 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
//int idx = kk*n_stride + j*t_per_atom + offset_k;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, nbor_k, offset_k, idx);
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
int idx = nbor_k;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, nbor_k, offset_k, idx);
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
numtyp prefactor_jk = zeta_jk.y;
@ -1040,7 +1177,6 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
virial[3] += TWOTHIRD*(delr2[0]*fj[1] + mdelr1[0]*fk[1]);
virial[4] += TWOTHIRD*(delr2[0]*fj[2] + mdelr1[0]*fk[2]);
virial[5] += TWOTHIRD*(delr2[1]*fj[2] + mdelr1[1]*fk[2]);
}
} // for nbor

View File

@ -47,21 +47,6 @@ class Tersoff : public BaseThree<numtyp, acctyp> {
const double* h, const double* gamma, const double* beta,
const double* powern, const double* cutsq);
/// Pair loop with host neighboring
void compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success);
/// Pair loop with device neighboring
int ** compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
@ -104,8 +89,7 @@ class Tersoff : public BaseThree<numtyp, acctyp> {
UCL_Kernel k_zeta;
UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex;
int _max_nbors;
numtyp _cutshortsq;
private:
bool _allocated;

View File

@ -55,7 +55,8 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in
int success;
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
_screen,tersoff_mod,"k_tersoff_mod_repulsive",
"k_tersoff_mod_three_center", "k_tersoff_mod_three_end");
"k_tersoff_mod_three_center", "k_tersoff_mod_three_end",
"k_tersoff_mod_short_nbor");
if (success!=0)
return success;
@ -157,11 +158,16 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in
UCL_H_Vec<numtyp> cutsq_view(nparams,*(this->ucl_device),
UCL_WRITE_ONLY);
for (int i=0; i<nparams; i++)
double cutsqmax = 0.0;
for (int i=0; i<nparams; i++) {
cutsq_view[i]=static_cast<numtyp>(host_cutsq[i]);
if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i];
}
cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
ucl_copy(cutsq,cutsq_view,false);
_cutshortsq = static_cast<numtyp>(cutsqmax);
UCL_H_Vec<int> dview_elem2param(nelements*nelements*nelements,
*(this->ucl_device), UCL_WRITE_ONLY);
@ -219,171 +225,6 @@ double TersoffMT::host_memory_usage() const {
#define KTHREADS this->_threads_per_atom
#define JTHREADS this->_threads_per_atom
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void TersoffMT::compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start,
const double cpu_time, bool &success) {
this->acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
this->resize_atom(0,nall,success);
this->zero_timers();
return;
}
int ago=this->hd_balancer.ago_first(f_ago);
int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
this->ans->inum(inum);
#ifdef THREE_CONCURRENT
this->ans2->inum(inum);
#endif
host_start=inum;
if (ago==0) {
this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
if (!success)
return;
_max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist);
}
this->atom->cast_x_data(host_x,host_type);
this->hd_balancer.start_timer();
this->atom->add_x_data(host_x,host_type);
// re-allocate zetaij if necessary
if (nall*_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(_max_nbors*_nmax);
}
int _eflag;
if (eflag)
_eflag=1;
else
_eflag=0;
int ainum=nlist;
int nbor_pitch=this->nbor->nbor_pitch();
int BX=this->block_pair();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
evatom=1;
#ifdef THREE_CONCURRENT
this->ucl_device->sync();
#endif
loop(eflag,vflag,evatom);
this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
this->device->add_ans_object(this->ans);
#ifdef THREE_CONCURRENT
this->ans2->copy_answers(eflag,vflag,eatom,vatom,ilist);
this->device->add_ans_object(this->ans2);
#endif
this->hd_balancer.stop_timer();
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int ** TersoffMT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success) {
this->acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
this->resize_atom(0,nall,success);
this->zero_timers();
return NULL;
}
this->hd_balancer.balance(cpu_time);
int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
this->ans->inum(inum);
#ifdef THREE_CONCURRENT
this->ans2->inum(inum);
#endif
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
_max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
this->hd_balancer.start_timer();
} else {
this->atom->cast_x_data(host_x,host_type);
this->hd_balancer.start_timer();
this->atom->add_x_data(host_x,host_type);
}
*ilist=this->nbor->host_ilist.begin();
*jnum=this->nbor->host_acc.begin();
// re-allocate zetaij if necessary
if (nall*_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(_max_nbors*_nmax);
}
int _eflag;
if (eflag)
_eflag=1;
else
_eflag=0;
int ainum=nall;
int nbor_pitch=this->nbor->nbor_pitch();
int BX=this->block_pair();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
evatom=1;
#ifdef THREE_CONCURRENT
this->ucl_device->sync();
#endif
loop(eflag,vflag,evatom);
this->ans->copy_answers(eflag,vflag,eatom,vatom);
this->device->add_ans_object(this->ans);
#ifdef THREE_CONCURRENT
this->ans2->copy_answers(eflag,vflag,eatom,vatom);
this->device->add_ans_object(this->ans2);
#endif
this->hd_balancer.stop_timer();
return this->nbor->host_jlist.begin()-host_start;
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
@ -402,9 +243,40 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
else
vflag=0;
int ainum=this->ans->inum();
// build the short neighbor list
int ainum=this->_ainum;
int nbor_pitch=this->nbor->nbor_pitch();
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/this->_threads_per_atom)));
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
&elem2param, &_nelements, &_nparams,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor, &ainum,
&nbor_pitch, &this->_threads_per_atom);
// re-allocate zetaij if necessary
int nall = this->_nall;
if (nall*this->_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(this->_max_nbors*_nmax);
}
nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->_ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);
ainum=this->ans->inum();
nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
this->time_pair.start();
@ -412,6 +284,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_pair.run(&this->atom->x, &ts1, &ts2, &cutsq,
&map, &elem2param, &_nelements, &_nparams,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
@ -423,6 +296,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &evatom);
@ -437,7 +311,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
@ -446,7 +320,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
}

View File

@ -106,7 +106,7 @@ texture<int4> ts5_tex;
ans[ii]=old; \
}
#define store_zeta(z, tid, t_per_atom, offset) \
#define acc_zeta(z, tid, t_per_atom, offset) \
if (t_per_atom>1) { \
__local acctyp red_acc[BLOCK_PAIR]; \
red_acc[tid]=z; \
@ -155,7 +155,7 @@ texture<int4> ts5_tex;
ans[ii]=old; \
}
#define store_zeta(z, tid, t_per_atom, offset) \
#define acc_zeta(z, tid, t_per_atom, offset) \
if (t_per_atom>1) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
z += shfl_xor(z, s, t_per_atom); \
@ -164,6 +164,65 @@ texture<int4> ts5_tex;
#endif
__kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
const __global numtyp *restrict cutsq,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements, const int nparams,
const __global int * dev_nbor,
const __global int * dev_packed,
__global int * dev_short_nbor,
const int inum, const int nbor_pitch,
const int t_per_atom) {
__local int n_stride;
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
int ncount = 0;
int m = nbor;
dev_short_nbor[m] = 0;
int nbor_short = nbor+n_stride;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int nj = j;
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[ijparam]) {
dev_short_nbor[nbor_short] = nj;
nbor_short += n_stride;
ncount++;
}
} // for nbor
// store the number of neighbors for each thread
dev_short_nbor[m] = ncount;
} // if ii
}
// Tersoff is currently used for 3 elements at most: 3*3*3 = 27 entries
// while the block size should never be less than 32.
// SHARED_SIZE = 32 for now to reduce the pressure on the shared memory per block
@ -184,6 +243,7 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
__global acctyp4 * zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
const int eflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
__local int tpa_sq,n_stride;
@ -211,22 +271,29 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor_j, nbor_end;
int i, numj;
int nbor_j, nbor_end, i, numj;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
int offset_k=tid & (t_per_atom-1);
int nborj_start = nbor_j;
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -241,14 +308,18 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
delr1.z = jx.z-ix.z;
numtyp rsq1 = delr1.x*delr1.x+delr1.y*delr1.y+delr1.z*delr1.z;
if (rsq1 > cutsq[ijparam]) continue;
// compute zeta_ij
z = (numtyp)0;
z = (acctyp)0;
int nbor_k = nborj_start-offset_j+offset_k;
for ( ; nbor_k < nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k_end = nbor_end;
if (dev_packed==dev_nbor) {
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k < k_end; nbor_k+=n_stride) {
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == j) continue;
@ -287,10 +358,12 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
//int idx = jj*n_stride + i*t_per_atom + offset_j;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
i, nbor_j, offset_j, idx);
store_zeta(z, tid, t_per_atom, offset_k);
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
int idx = nbor_j;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// i, nbor_j, offset_j, idx);
acc_zeta(z, tid, t_per_atom, offset_k);
numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex);
numtyp ijparam_lam2 = ts1_ijparam.y;
@ -331,6 +404,7 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
const int nelements, const int nparams,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -357,8 +431,8 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
int nbor, nbor_end, i, numj;
const int* nbor_mem=dev_packed;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
@ -366,9 +440,17 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor];
nbor += n_stride;
nbor_end = nbor+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int j=nbor_mem[nbor];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -383,32 +465,31 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[ijparam]) {
numtyp feng[2];
numtyp ijparam_lam1 = ts1[ijparam].x;
numtyp4 ts2_ijparam = ts2[ijparam];
numtyp ijparam_biga = ts2_ijparam.x;
numtyp ijparam_bigr = ts2_ijparam.z;
numtyp ijparam_bigd = ts2_ijparam.w;
// rsq<cutsq[ijparam]
numtyp feng[2];
numtyp ijparam_lam1 = ts1[ijparam].x;
numtyp4 ts2_ijparam = ts2[ijparam];
numtyp ijparam_biga = ts2_ijparam.x;
numtyp ijparam_bigr = ts2_ijparam.z;
numtyp ijparam_bigd = ts2_ijparam.w;
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
rsq, eflag, feng);
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
rsq, eflag, feng);
numtyp force = feng[0];
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
numtyp force = feng[0];
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0)
energy+=feng[1];
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
if (eflag>0)
energy+=feng[1];
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
} // for nbor
@ -430,6 +511,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -465,20 +547,28 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
int offset_k=tid & (t_per_atom-1);
int nborj_start = nbor_j;
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -493,7 +583,6 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp r1 = ucl_sqrt(rsq1);
numtyp r1inv = ucl_rsqrt(rsq1);
@ -501,9 +590,11 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
//int idx = jj*n_stride + i*t_per_atom + offset_j;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
i, nbor_j, offset_j, idx);
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
int idx = nbor_j;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// i, nbor_j, offset_j, idx);
acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
numtyp force = zeta_ij.x*tpainv;
numtyp prefactor = zeta_ij.y;
@ -524,9 +615,15 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
virial[5] += delr1[1]*delr1[2]*mforce;
}
int nbor_k=nborj_start-offset_j+offset_k;
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int nbor_k = nborj_start-offset_j+offset_k;
int k_end = nbor_end;
if (dev_packed==dev_nbor) {
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (j == k) continue;
@ -606,6 +703,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -642,7 +740,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -653,9 +751,18 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
itype=map[itype];
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -670,8 +777,6 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp mdelr1[3];
mdelr1[0] = -delr1[0];
mdelr1[1] = -delr1[1];
@ -693,13 +798,20 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
k_end=nbor_k+numk;
nbor_k+=offset_k;
}
// recalculate numk and k_end for the use of short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
int nbork_start = nbor_k;
// look up for zeta_ji: find i in the j's neighbor list
int m = tid / t_per_atom;
int ijnum = -1;
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) {
ijnum = nbor_k;
@ -721,9 +833,11 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, ijnum, offset_kf, idx);
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
int idx = ijnum;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, ijnum, offset_kf, idx);
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
numtyp force = zeta_ji.x*tpainv;
numtyp prefactor_ji = zeta_ji.y;
@ -746,7 +860,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
// attractive forces
for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -790,9 +904,11 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
//int idx = kk*n_stride + j*t_per_atom + offset_k;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, nbor_k, offset_k, idx);
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
int idx = nbor_k;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, nbor_k, offset_k, idx);
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
numtyp prefactor_jk = zeta_jk.y;
int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
@ -841,6 +957,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -877,7 +994,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -888,9 +1005,18 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
itype=map[itype];
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -905,8 +1031,6 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp mdelr1[3];
mdelr1[0] = -delr1[0];
mdelr1[1] = -delr1[1];
@ -928,13 +1052,20 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
k_end=nbor_k+numk;
nbor_k+=offset_k;
}
// recalculate numk and k_end for the use of short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
int nbork_start = nbor_k;
// look up for zeta_ji
int m = tid / t_per_atom;
int ijnum = -1;
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) {
ijnum = nbor_k;
@ -956,9 +1087,11 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, ijnum, offset_kf, idx);
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
int idx = ijnum;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, ijnum, offset_kf, idx);
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
numtyp force = zeta_ji.x*tpainv;
numtyp prefactor_ji = zeta_ji.y;
@ -981,7 +1114,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
// attractive forces
for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -1032,9 +1165,11 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
//int idx = kk*n_stride + j*t_per_atom + offset_k;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, nbor_k, offset_k, idx);
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
int idx = nbor_k;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, nbor_k, offset_k, idx);
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
numtyp prefactor_jk = zeta_jk.y;

View File

@ -47,21 +47,6 @@ class TersoffMod : public BaseThree<numtyp, acctyp> {
const double* h, const double* beta, const double* powern,
const double* powern_del, const double* ca1, const double* cutsq);
/// Pair loop with host neighboring
void compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success);
/// Pair loop with device neighboring
int ** compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
@ -104,8 +89,7 @@ class TersoffMod : public BaseThree<numtyp, acctyp> {
UCL_Kernel k_zeta;
UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex;
int _max_nbors;
numtyp _cutshortsq;
private:
bool _allocated;

View File

@ -62,7 +62,8 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall,
int success;
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
_screen,tersoff_zbl,"k_tersoff_zbl_repulsive",
"k_tersoff_zbl_three_center", "k_tersoff_zbl_three_end");
"k_tersoff_zbl_three_center", "k_tersoff_zbl_three_end",
"k_tersoff_zbl_short_nbor");
if (success!=0)
return success;
@ -177,11 +178,16 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall,
UCL_H_Vec<numtyp> cutsq_view(nparams,*(this->ucl_device),
UCL_WRITE_ONLY);
for (int i=0; i<nparams; i++)
double cutsqmax = 0.0;
for (int i=0; i<nparams; i++) {
cutsq_view[i]=static_cast<numtyp>(host_cutsq[i]);
if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i];
}
cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
ucl_copy(cutsq,cutsq_view,false);
_cutshortsq = static_cast<numtyp>(cutsqmax);
UCL_H_Vec<int> dview_elem2param(nelements*nelements*nelements,
*(this->ucl_device), UCL_WRITE_ONLY);
@ -244,171 +250,6 @@ double TersoffZT::host_memory_usage() const {
#define KTHREADS this->_threads_per_atom
#define JTHREADS this->_threads_per_atom
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void TersoffZT::compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start,
const double cpu_time, bool &success) {
this->acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
this->resize_atom(0,nall,success);
this->zero_timers();
return;
}
int ago=this->hd_balancer.ago_first(f_ago);
int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
this->ans->inum(inum);
#ifdef THREE_CONCURRENT
this->ans2->inum(inum);
#endif
host_start=inum;
if (ago==0) {
this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
if (!success)
return;
_max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist);
}
this->atom->cast_x_data(host_x,host_type);
this->hd_balancer.start_timer();
this->atom->add_x_data(host_x,host_type);
// re-allocate zetaij if necessary
if (nall*_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(_max_nbors*_nmax);
}
int _eflag;
if (eflag)
_eflag=1;
else
_eflag=0;
int ainum=nlist;
int nbor_pitch=this->nbor->nbor_pitch();
int BX=this->block_pair();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
evatom=1;
#ifdef THREE_CONCURRENT
this->ucl_device->sync();
#endif
loop(eflag,vflag,evatom);
this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
this->device->add_ans_object(this->ans);
#ifdef THREE_CONCURRENT
this->ans2->copy_answers(eflag,vflag,eatom,vatom,ilist);
this->device->add_ans_object(this->ans2);
#endif
this->hd_balancer.stop_timer();
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int ** TersoffZT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success) {
this->acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
this->resize_atom(0,nall,success);
this->zero_timers();
return NULL;
}
this->hd_balancer.balance(cpu_time);
int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
this->ans->inum(inum);
#ifdef THREE_CONCURRENT
this->ans2->inum(inum);
#endif
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
_max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
this->hd_balancer.start_timer();
} else {
this->atom->cast_x_data(host_x,host_type);
this->hd_balancer.start_timer();
this->atom->add_x_data(host_x,host_type);
}
*ilist=this->nbor->host_ilist.begin();
*jnum=this->nbor->host_acc.begin();
// re-allocate zetaij if necessary
if (nall*_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(_max_nbors*_nmax);
}
int _eflag;
if (eflag)
_eflag=1;
else
_eflag=0;
int ainum=nall;
int nbor_pitch=this->nbor->nbor_pitch();
int BX=this->block_pair();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
evatom=1;
#ifdef THREE_CONCURRENT
this->ucl_device->sync();
#endif
loop(eflag,vflag,evatom);
this->ans->copy_answers(eflag,vflag,eatom,vatom);
this->device->add_ans_object(this->ans);
#ifdef THREE_CONCURRENT
this->ans2->copy_answers(eflag,vflag,eatom,vatom);
this->device->add_ans_object(this->ans2);
#endif
this->hd_balancer.stop_timer();
return this->nbor->host_jlist.begin()-host_start;
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
@ -427,9 +268,40 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
else
vflag=0;
int ainum=this->ans->inum();
// build the short neighbor list
int ainum=this->_ainum;
int nbor_pitch=this->nbor->nbor_pitch();
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/this->_threads_per_atom)));
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
&elem2param, &_nelements, &_nparams,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor, &ainum,
&nbor_pitch, &this->_threads_per_atom);
// re-allocate zetaij if necessary
int nall = this->_nall;
if (nall*this->_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(this->_max_nbors*_nmax);
}
nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->_ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);
ainum=this->ans->inum();
nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
this->time_pair.start();
@ -438,6 +310,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
&_global_e, &_global_a_0, &_global_epsilon_0, &cutsq,
&map, &elem2param, &_nelements, &_nparams,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
@ -449,6 +322,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &evatom);
@ -463,7 +337,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
@ -472,7 +346,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
}

View File

@ -109,7 +109,7 @@ texture<int4> ts6_tex;
ans[ii]=old; \
}
#define store_zeta(z, tid, t_per_atom, offset) \
#define acc_zeta(z, tid, t_per_atom, offset) \
if (t_per_atom>1) { \
__local acctyp red_acc[BLOCK_PAIR]; \
red_acc[tid]=z; \
@ -158,7 +158,7 @@ texture<int4> ts6_tex;
ans[ii]=old; \
}
#define store_zeta(z, tid, t_per_atom, offset) \
#define acc_zeta(z, tid, t_per_atom, offset) \
if (t_per_atom>1) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
z += shfl_xor(z, s, t_per_atom); \
@ -167,6 +167,65 @@ texture<int4> ts6_tex;
#endif
__kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
const __global numtyp *restrict cutsq,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements, const int nparams,
const __global int * dev_nbor,
const __global int * dev_packed,
__global int * dev_short_nbor,
const int inum, const int nbor_pitch,
const int t_per_atom) {
__local int n_stride;
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
int ncount = 0;
int m = nbor;
dev_short_nbor[m] = 0;
int nbor_short = nbor+n_stride;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int nj = j;
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[ijparam]) {
dev_short_nbor[nbor_short] = nj;
nbor_short += n_stride;
ncount++;
}
} // for nbor
// store the number of neighbors for each thread
dev_short_nbor[m] = ncount;
} // if ii
}
// Tersoff is currently used for 3 elements at most: 3*3*3 = 27 entries
// while the block size should never be less than 32.
// SHARED_SIZE = 32 for now to reduce the pressure on the shared memory per block
@ -188,6 +247,7 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
__global acctyp4 * zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
const int eflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
__local int tpa_sq,n_stride;
@ -217,22 +277,29 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor_j, nbor_end;
int i, numj;
int nbor_j, nbor_end, i, numj;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
int offset_k=tid & (t_per_atom-1);
int nborj_start = nbor_j;
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -247,14 +314,18 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
delr1.z = jx.z-ix.z;
numtyp rsq1 = delr1.x*delr1.x+delr1.y*delr1.y+delr1.z*delr1.z;
if (rsq1 > cutsq[ijparam]) continue;
// compute zeta_ij
z = (acctyp)0;
int nbor_k = nborj_start-offset_j+offset_k;
for ( ; nbor_k < nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k_end = nbor_end;
if (dev_packed==dev_nbor) {
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k < k_end; nbor_k+=n_stride) {
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == j) continue;
@ -290,10 +361,12 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
//int idx = jj*n_stride + i*t_per_atom + offset_j;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
i, nbor_j, offset_j, idx);
store_zeta(z, tid, t_per_atom, offset_k);
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
int idx = nbor_j;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// i, nbor_j, offset_j, idx);
acc_zeta(z, tid, t_per_atom, offset_k);
numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex);
numtyp ijparam_lam2 = ts1_ijparam.y;
@ -342,6 +415,7 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
const int nelements, const int nparams,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -370,8 +444,8 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
int nbor, nbor_end, i, numj;
const int* nbor_mem=dev_packed;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
@ -379,9 +453,17 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor];
nbor += n_stride;
nbor_end = nbor+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int j=nbor_mem[nbor];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -396,38 +478,37 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[ijparam]) {
numtyp feng[2];
numtyp ijparam_lam1 = ts1[ijparam].x;
numtyp4 ts2_ijparam = ts2[ijparam];
numtyp ijparam_biga = ts2_ijparam.x;
numtyp ijparam_bigr = ts2_ijparam.z;
numtyp ijparam_bigd = ts2_ijparam.w;
numtyp4 ts6_ijparam = ts6[ijparam];
numtyp ijparam_Z_i = ts6_ijparam.x;
numtyp ijparam_Z_j = ts6_ijparam.y;
numtyp ijparam_ZBLcut = ts6_ijparam.z;
numtyp ijparam_ZBLexpscale = ts6_ijparam.w;
// rsq<cutsq[ijparam]
numtyp feng[2];
numtyp ijparam_lam1 = ts1[ijparam].x;
numtyp4 ts2_ijparam = ts2[ijparam];
numtyp ijparam_biga = ts2_ijparam.x;
numtyp ijparam_bigr = ts2_ijparam.z;
numtyp ijparam_bigd = ts2_ijparam.w;
numtyp4 ts6_ijparam = ts6[ijparam];
numtyp ijparam_Z_i = ts6_ijparam.x;
numtyp ijparam_Z_j = ts6_ijparam.y;
numtyp ijparam_ZBLcut = ts6_ijparam.z;
numtyp ijparam_ZBLexpscale = ts6_ijparam.w;
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
ijparam_Z_i, ijparam_Z_j, ijparam_ZBLcut, ijparam_ZBLexpscale,
global_e, global_a_0, global_epsilon_0, rsq, eflag, feng);
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
ijparam_Z_i, ijparam_Z_j, ijparam_ZBLcut, ijparam_ZBLexpscale,
global_e, global_a_0, global_epsilon_0, rsq, eflag, feng);
numtyp force = feng[0];
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
numtyp force = feng[0];
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0)
energy+=feng[1];
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
if (eflag>0)
energy+=feng[1];
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
} // for nbor
@ -448,6 +529,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -481,20 +563,28 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
int offset_k=tid & (t_per_atom-1);
int nborj_start = nbor_j;
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -509,7 +599,6 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp r1 = ucl_sqrt(rsq1);
numtyp r1inv = ucl_rsqrt(rsq1);
@ -517,9 +606,11 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
//int idx = jj*n_stride + i*t_per_atom + offset_j;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
i, nbor_j, offset_j, idx);
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
int idx = nbor_j;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// i, nbor_j, offset_j, idx);
acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
numtyp force = zeta_ij.x*tpainv;
numtyp prefactor = zeta_ij.y;
@ -540,9 +631,15 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
virial[5] += delr1[1]*delr1[2]*mforce;
}
int nbor_k=nborj_start-offset_j+offset_k;
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int nbor_k = nborj_start-offset_j+offset_k;
int k_end = nbor_end;
if (dev_packed==dev_nbor) {
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (j == k) continue;
@ -618,6 +715,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -652,7 +750,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -663,9 +761,18 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
itype=map[itype];
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -680,8 +787,6 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp mdelr1[3];
mdelr1[0] = -delr1[0];
mdelr1[1] = -delr1[1];
@ -703,13 +808,20 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
k_end=nbor_k+numk;
nbor_k+=offset_k;
}
// recalculate numk and k_end for the use of short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
int nbork_start = nbor_k;
// look up for zeta_ji: find i in the j's neighbor list
int m = tid / t_per_atom;
int ijnum = -1;
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) {
ijnum = nbor_k;
@ -731,9 +843,11 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, ijnum, offset_kf, idx);
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
int idx = ijnum;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, ijnum, offset_kf, idx);
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
numtyp force = zeta_ji.x*tpainv;
numtyp prefactor_ji = zeta_ji.y;
@ -756,7 +870,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
// attractive forces
for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -797,9 +911,11 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
//int idx = kk*n_stride + j*t_per_atom + offset_k;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, nbor_k, offset_k, idx);
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
int idx = nbor_k;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, nbor_k, offset_k, idx);
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
numtyp prefactor_jk = zeta_jk.y;
int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
@ -844,6 +960,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -878,7 +995,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -889,9 +1006,18 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
itype=map[itype];
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -906,8 +1032,6 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp mdelr1[3];
mdelr1[0] = -delr1[0];
mdelr1[1] = -delr1[1];
@ -929,13 +1053,20 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
k_end=nbor_k+numk;
nbor_k+=offset_k;
}
// recalculate numk and k_end for the use of short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
int nbork_start = nbor_k;
// look up for zeta_ji
int m = tid / t_per_atom;
int ijnum = -1;
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) {
ijnum = nbor_k;
@ -957,9 +1088,11 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, ijnum, offset_kf, idx);
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
int idx = ijnum;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, ijnum, offset_kf, idx);
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
numtyp force = zeta_ji.x*tpainv;
numtyp prefactor_ji = zeta_ji.y;
@ -982,7 +1115,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
// attractive forces
for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -1030,9 +1163,11 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
//int idx = kk*n_stride + j*t_per_atom + offset_k;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, nbor_k, offset_k, idx);
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
int idx = nbor_k;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, nbor_k, offset_k, idx);
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
numtyp prefactor_jk = zeta_jk.y;

View File

@ -49,21 +49,6 @@ class TersoffZBL : public BaseThree<numtyp, acctyp> {
const double* ZBLcut, const double* ZBLexpscale, const double global_e,
const double global_a_0, const double global_epsilon_0, const double* cutsq);
/// Pair loop with host neighboring
void compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success);
/// Pair loop with device neighboring
int ** compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
@ -109,8 +94,8 @@ class TersoffZBL : public BaseThree<numtyp, acctyp> {
UCL_Kernel k_zeta;
UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex, ts6_tex;
int _max_nbors;
numtyp _global_e,_global_a_0,_global_epsilon_0;
numtyp _cutshortsq;
private:
bool _allocated;

View File

@ -59,7 +59,7 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i
int success;
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
_screen,vashishta,"k_vashishta","k_vashishta_three_center",
"k_vashishta_three_end");
"k_vashishta_three_end","k_vashishta_short_nbor");
if (success!=0)
return success;
@ -128,15 +128,18 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i
param4.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
double r0sqmax = 0;
for (int i=0; i<nparams; i++) {
double r0sq = r0[i]*r0[i]-1e-4; // TODO: should we have the 1e-4?
double r0sq = r0[i]*r0[i]; // TODO: should we have the 1e-4?
if (r0sqmax < r0sq) r0sqmax = r0sq;
dview[i].x=static_cast<numtyp>(r0sq);
dview[i].y=static_cast<numtyp>(gamma[i]);
dview[i].z=static_cast<numtyp>(cutsq[i]);
dview[i].w=static_cast<numtyp>(r0[i]);
}
_cutshortsq = static_cast<numtyp>(r0sqmax);
ucl_copy(param4,dview,false);
param4_tex.get_texture(*(this->pair_program),"param4_tex");
param4_tex.bind_float(param4,4);
@ -223,15 +226,28 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
// build the short neighbor list
int ainum=this->_ainum;
int nbor_pitch=this->nbor->nbor_pitch();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/this->_threads_per_atom)));
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &param4, &map,
&elem2param, &_nelements, &_nparams,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor, &ainum,
&nbor_pitch, &this->_threads_per_atom);
// this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1
// this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
ainum=this->ans->inum();
nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
this->time_pair.start();
// note that k_pair does not run with the short neighbor list
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
&map, &elem2param, &_nelements,
@ -248,6 +264,7 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_center.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &evatom);
Answer<numtyp,acctyp> *end_ans;
@ -257,21 +274,19 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) {
end_ans=this->ans;
#endif
if (evatom!=0) {
this->k_three_end_vatom.set_size(GX,BX);
this->k_three_end_vatom.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
} else {
this->k_three_end.set_size(GX,BX);
this->k_three_end.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
}

View File

@ -136,6 +136,64 @@ texture<int4> param5_tex;
#endif
__kernel void k_vashishta_short_nbor(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict param4,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements, const int nparams,
const __global int * dev_nbor,
const __global int * dev_packed,
__global int * dev_short_nbor,
const int inum, const int nbor_pitch,
const int t_per_atom) {
__local int n_stride;
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
int ncount = 0;
int m = nbor;
dev_short_nbor[m] = 0;
int nbor_short = nbor+n_stride;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int nj = j;
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<param4[ijparam].x) { //param4[ijparam].x = r0sq; //param4[ijparam].z=cutsq
dev_short_nbor[nbor_short] = nj;
nbor_short += n_stride;
ncount++;
}
} // for nbor
// store the number of neighbors for each thread
dev_short_nbor[m] = ncount;
} // if ii
}
__kernel void k_vashishta(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict param1,
@ -166,8 +224,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
int nbor, nbor_end, i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
@ -211,7 +268,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
numtyp param3_dvrc=param3_ijparam.z;
numtyp param3_c0 =param3_ijparam.w;
numtyp r=sqrt(rsq);
numtyp r=ucl_sqrt(rsq);
numtyp rinvsq=1.0/rsq;
numtyp r4inv = rinvsq*rinvsq;
numtyp r6inv = rinvsq*r4inv;
@ -219,8 +276,8 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
numtyp reta = pow(r,-param1_eta);
numtyp lam1r = r*param1_lam1inv;
numtyp lam4r = r*param1_lam4inv;
numtyp vc2 = param1_zizj * exp(-lam1r)/r;
numtyp vc3 = param2_mbigd * r4inv*exp(-lam4r);
numtyp vc2 = param1_zizj * ucl_exp(-lam1r)/r;
numtyp vc3 = param2_mbigd * r4inv*ucl_exp(-lam4r);
numtyp force = (param2_dvrc*r
- (4.0*vc3 + lam4r*vc3+param2_big6w*r6inv
@ -230,6 +287,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0)
energy += (param3_bigh*reta+vc2-vc3-param3_bigw*r6inv-r*param3_dvrc+param3_c0);
@ -255,31 +313,31 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
numtyp r1 = ucl_sqrt(rsq1); \
numtyp rinvsq1 = ucl_recip(rsq1); \
numtyp rainv1 = ucl_recip(r1 - param_r0_ij); \
numtyp gsrainv1 = param_gamma_ij * rainv1; \
numtyp gsrainv1 = param_gamma_ij * rainv1; \
numtyp gsrainvsq1 = gsrainv1*rainv1/r1; \
numtyp expgsrainv1 = ucl_exp(gsrainv1); \
\
numtyp r2 = ucl_sqrt(rsq2); \
numtyp rinvsq2 = ucl_recip(rsq2); \
numtyp rainv2 = ucl_recip(r2 - param_r0_ik); \
numtyp gsrainv2 = param_gamma_ik * rainv2; \
numtyp gsrainv2 = param_gamma_ik * rainv2; \
numtyp gsrainvsq2 = gsrainv2*rainv2/r2; \
numtyp expgsrainv2 = ucl_exp(gsrainv2); \
\
numtyp rinv12 = ucl_recip(r1*r2); \
numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12; \
numtyp delcs = cs - param_costheta_ijk; \
numtyp delcs = cs - param_costheta_ijk; \
numtyp delcssq = delcs*delcs; \
numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \
numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \
numtyp pcsinvsq = pcsinv*pcsinv; \
numtyp pcs = delcssq/pcsinv; \
\
numtyp facexp = expgsrainv1*expgsrainv2; \
\
numtyp facrad = param_bigb_ijk * facexp*pcs; \
numtyp facrad = param_bigb_ijk * facexp*pcs; \
numtyp frad1 = facrad*gsrainvsq1; \
numtyp frad2 = facrad*gsrainvsq2; \
numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \
numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \
numtyp facang12 = rinv12*facang; \
numtyp csfacang = cs*facang; \
numtyp csfac1 = rinvsq1*csfacang; \
@ -311,28 +369,28 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
numtyp r1 = ucl_sqrt(rsq1); \
numtyp rinvsq1 = ucl_recip(rsq1); \
numtyp rainv1 = ucl_recip(r1 - param_r0_ij); \
numtyp gsrainv1 = param_gamma_ij * rainv1; \
numtyp gsrainv1 = param_gamma_ij * rainv1; \
numtyp gsrainvsq1 = gsrainv1*rainv1/r1; \
numtyp expgsrainv1 = ucl_exp(gsrainv1); \
\
numtyp r2 = ucl_sqrt(rsq2); \
numtyp rainv2 = ucl_recip(r2 - param_r0_ik); \
numtyp gsrainv2 = param_gamma_ik * rainv2; \
numtyp gsrainv2 = param_gamma_ik * rainv2; \
numtyp expgsrainv2 = ucl_exp(gsrainv2); \
\
numtyp rinv12 = ucl_recip(r1*r2); \
numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12; \
numtyp delcs = cs - param_costheta_ijk; \
numtyp delcs = cs - param_costheta_ijk; \
numtyp delcssq = delcs*delcs; \
numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \
numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \
numtyp pcsinvsq = pcsinv*pcsinv; \
numtyp pcs = delcssq/pcsinv; \
\
numtyp facexp = expgsrainv1*expgsrainv2; \
\
numtyp facrad = param_bigb_ijk * facexp*pcs; \
numtyp facrad = param_bigb_ijk * facexp*pcs; \
numtyp frad1 = facrad*gsrainvsq1; \
numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \
numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \
numtyp facang12 = rinv12*facang; \
numtyp csfacang = cs*facang; \
numtyp csfac1 = rinvsq1*csfacang; \
@ -353,6 +411,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
const int nelements,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -377,7 +436,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -387,9 +446,18 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -406,18 +474,27 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
numtyp4 param4_ijparam; fetch4(param4_ijparam,ijparam,param4_tex);
param_r0sq_ij=param4_ijparam.x;
if (rsq1 > param_r0sq_ij) continue;
if (rsq1 > param_r0sq_ij) continue; // still keep this for neigh no and tpa > 1
param_gamma_ij=param4_ijparam.y;
param_r0_ij=param4_ijparam.w;
int nbor_k=nbor_j-offset_j+offset_k;
if (nbor_k<=nbor_j)
nbor_k+=n_stride;
int nbor_k,k_end;
if (dev_packed==dev_nbor) {
nbor_k=nborj_start-offset_j+offset_k;
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
} else {
nbor_k = nbor_j-offset_j+offset_k;
if (nbor_k<=nbor_j) nbor_k += n_stride;
k_end = nbor_end;
}
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (dev_packed==dev_nbor && k <= j) continue;
numtyp4 kx; fetch4(kx,k,pos_tex);
int ktype=kx.w;
ktype=map[ktype];
@ -478,6 +555,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -502,7 +580,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -512,8 +590,16 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -529,7 +615,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
numtyp4 param4_ijparam; fetch4(param4_ijparam,ijparam,param4_tex);
param_r0sq_ij = param4_ijparam.x;
if (rsq1 > param_r0sq_ij) continue;
if (rsq1 > param_r0sq_ij) continue; // still keep this for neigh no and tpa > 1
param_gamma_ij=param4_ijparam.y;
param_r0_ij = param4_ijparam.w;
@ -551,8 +637,15 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
nbor_k+=offset_k;
}
// recalculate numk and k_end for the use of short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -617,6 +710,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -641,7 +735,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -651,8 +745,16 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -668,7 +770,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
numtyp4 param4_ijparam; fetch4(param4_ijparam,ijparam,param4_tex);
param_r0sq_ij=param4_ijparam.x;
if (rsq1 > param_r0sq_ij) continue;
if (rsq1 > param_r0sq_ij) continue; // still keep this for neigh no and tpa > 1
param_gamma_ij=param4_ijparam.y;
param_r0_ij=param4_ijparam.w;
@ -690,8 +792,15 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
nbor_k+=offset_k;
}
// recalculate numk and k_end for the use of short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;

View File

@ -82,6 +82,7 @@ class Vashishta : public BaseThree<numtyp, acctyp> {
UCL_D_Vec<int> elem2param;
UCL_D_Vec<int> map;
int _nparams,_nelements;
numtyp _cutshortsq;
UCL_Texture param1_tex, param2_tex, param3_tex, param4_tex, param5_tex;

View File

@ -6,6 +6,8 @@
from __future__ import print_function
import sys,os,re,subprocess
# help message
help = """
Syntax from src dir: make lib-kim args="-b -v version -a kim-name"
or: make lib-kim args="-b -a everything"
@ -78,13 +80,27 @@ def which(program):
return None
def geturl(url,fname):
success = False
if which('curl') != None:
cmd = 'curl -L -o "%s" %s' % (fname,url)
elif which('wget') != None:
try:
subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
success = True
except subprocess.CalledProcessError as e:
print("Calling curl failed with: %s" % e.output.decode('UTF-8'))
if not success and which('wget') != None:
cmd = 'wget -O "%s" %s' % (fname,url)
else: error("cannot find 'wget' or 'curl' to download source code")
txt = subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
return txt
try:
subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
success = True
except subprocess.CalledProcessError as e:
print("Calling wget failed with: %s" % e.output.decode('UTF-8'))
if not success:
error("Failed to download source code with 'curl' or 'wget'")
return
# parse args

View File

@ -1,5 +1,46 @@
# Change Log
## [2.03.13](https://github.com/kokkos/kokkos/tree/2.03.13) (2017-07-27)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.05...2.03.13)
**Implemented enhancements:**
- Disallow enabling both OpenMP and Threads in the same executable [\#406](https://github.com/kokkos/kokkos/issues/406)
- Make Kokkos::OpenMP respect OMP environment even if hwloc is available [\#630](https://github.com/kokkos/kokkos/issues/630)
- Improve Atomics Performance on KNL/Broadwell where PREFETCHW/RFO is Available [\#898](https://github.com/kokkos/kokkos/issues/898)
- Kokkos::resize should test whether dimensions have changed before resizing [\#904](https://github.com/kokkos/kokkos/issues/904)
- Develop performance-regression/acceptance tests [\#737](https://github.com/kokkos/kokkos/issues/737)
- Make the deep\_copy Profiling hook a start/end system [\#890](https://github.com/kokkos/kokkos/issues/890)
- Add deep\_copy Profiling hook [\#843](https://github.com/kokkos/kokkos/issues/843)
- Append tag name to parallel construct name for Profiling [\#842](https://github.com/kokkos/kokkos/issues/842)
- Add view label to `View bounds error` message for CUDA backend [\#870](https://github.com/kokkos/kokkos/issues/870)
- Disable printing the loaded profiling library [\#824](https://github.com/kokkos/kokkos/issues/824)
- "Declared but never referenced" warnings [\#853](https://github.com/kokkos/kokkos/issues/853)
- Warnings about lock\_address\_cuda\_space [\#852](https://github.com/kokkos/kokkos/issues/852)
- WorkGraph execution policy [\#771](https://github.com/kokkos/kokkos/issues/771)
- Simplify makefiles by guarding compilation with appropriate KOKKOS\_ENABLE\_\#\#\# macros [\#716](https://github.com/kokkos/kokkos/issues/716)
- Cmake build: wrong include install directory [\#668](https://github.com/kokkos/kokkos/issues/668)
- Derived View type and allocation [\#566](https://github.com/kokkos/kokkos/issues/566)
- Fix Compiler warnings when compiling core unit tests for Cuda [\#214](https://github.com/kokkos/kokkos/issues/214)
**Fixed bugs:**
- Out-of-bounds read in Kokkos\_Layout.hpp [\#975](https://github.com/kokkos/kokkos/issues/975)
- CudaClang: Fix failing test with Clang 4.0 [\#941](https://github.com/kokkos/kokkos/issues/941)
- Respawn when memory pool allocation fails \(not available memory\) [\#940](https://github.com/kokkos/kokkos/issues/940)
- Memory pool aborts on zero allocation request, returns NULL for \< minimum [\#939](https://github.com/kokkos/kokkos/issues/939)
- Error with TaskScheduler query of underlying memory pool [\#917](https://github.com/kokkos/kokkos/issues/917)
- Profiling::\*Callee static variables declared in header [\#863](https://github.com/kokkos/kokkos/issues/863)
- calling \*Space::name\(\) causes compile error [\#862](https://github.com/kokkos/kokkos/issues/862)
- bug in Profiling::deallocateData [\#860](https://github.com/kokkos/kokkos/issues/860)
- task\_depend test failing, CUDA 8.0 + Pascal + RDC [\#829](https://github.com/kokkos/kokkos/issues/829)
- \[develop branch\] Standalone cmake issues [\#826](https://github.com/kokkos/kokkos/issues/826)
- Kokkos CUDA failes to compile with OMPI\_CXX and MPICH\_CXX wrappers [\#776](https://github.com/kokkos/kokkos/issues/776)
- Task Team reduction on Pascal [\#767](https://github.com/kokkos/kokkos/issues/767)
- CUDA stack overflow with TaskDAG test [\#758](https://github.com/kokkos/kokkos/issues/758)
- TeamVector test on Cuda [\#670](https://github.com/kokkos/kokkos/issues/670)
- Clang 4.0 Cuda Build broken again [\#560](https://github.com/kokkos/kokkos/issues/560)
## [2.03.05](https://github.com/kokkos/kokkos/tree/2.03.05) (2017-05-27)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.00...2.03.05)

View File

@ -33,6 +33,7 @@ KOKKOS_INTERNAL_USE_LIBRT := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "lib
KOKKOS_INTERNAL_USE_MEMKIND := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "experimental_memkind" | wc -l))
# Check for advanced settings.
KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "compiler_warnings" | wc -l))
KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "aggressive_vectorization" | wc -l))
KOKKOS_INTERNAL_DISABLE_PROFILING := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "disable_profiling" | wc -l))
KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "disable_dualview_modify_check" | wc -l))
@ -78,14 +79,14 @@ KOKKOS_INTERNAL_COMPILER_PGI := $(strip $(shell $(CXX) --version 2
KOKKOS_INTERNAL_COMPILER_XL := $(strip $(shell $(CXX) -qversion 2>&1 | grep XL | wc -l))
KOKKOS_INTERNAL_COMPILER_CRAY := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep "CC-" | wc -l))
KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(CXX) --version 2>&1 | grep nvcc | wc -l))
KOKKOS_INTERNAL_COMPILER_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep clang | wc -l))
KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep "apple-darwin" | wc -l))
ifneq ($(OMPI_CXX),)
KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(OMPI_CXX) --version 2>&1 | grep "nvcc" | wc -l))
KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(OMPI_CXX) --version 2>&1 | grep nvcc | wc -l))
endif
ifneq ($(MPICH_CXX),)
KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(MPICH_CXX) --version 2>&1 | grep "nvcc" | wc -l))
KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(MPICH_CXX) --version 2>&1 | grep nvcc | wc -l))
endif
KOKKOS_INTERNAL_COMPILER_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep clang | wc -l))
KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep "apple-darwin" | wc -l))
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 2)
KOKKOS_INTERNAL_COMPILER_CLANG = 1
@ -111,6 +112,36 @@ ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
endif
endif
# Set compiler warnings flags.
ifeq ($(KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS), 1)
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
# TODO check if PGI accepts GNU style warnings
KOKKOS_INTERNAL_COMPILER_WARNINGS =
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_APPLE_CLANG), 1)
KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
# TODO check if cray accepts GNU style warnings
KOKKOS_INTERNAL_COMPILER_WARNINGS =
else
#gcc
KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized
endif
endif
endif
endif
endif
else
KOKKOS_INTERNAL_COMPILER_WARNINGS =
endif
# Set OpenMP flags.
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
KOKKOS_INTERNAL_OPENMP_FLAG := -mp
@ -162,6 +193,7 @@ endif
# Intel based.
KOKKOS_INTERNAL_USE_ARCH_KNC := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNC | wc -l))
KOKKOS_INTERNAL_USE_ARCH_WSM := $(strip $(shell echo $(KOKKOS_ARCH) | grep WSM | wc -l))
KOKKOS_INTERNAL_USE_ARCH_SNB := $(strip $(shell echo $(KOKKOS_ARCH) | grep SNB | wc -l))
KOKKOS_INTERNAL_USE_ARCH_HSW := $(strip $(shell echo $(KOKKOS_ARCH) | grep HSW | wc -l))
KOKKOS_INTERNAL_USE_ARCH_BDW := $(strip $(shell echo $(KOKKOS_ARCH) | grep BDW | wc -l))
@ -229,13 +261,14 @@ KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_
KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(strip $(shell echo $(KOKKOS_ARCH) | grep AMDAVX | wc -l))
# Any AVX?
KOKKOS_INTERNAL_USE_ARCH_SSE42 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_WSM) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
# Decide what ISA level we are able to support.
KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_WSM)+$(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
KOKKOS_INTERNAL_USE_ISA_KNC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNC) | bc ))
KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc ))
@ -243,7 +276,7 @@ KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_
KOKKOS_INTERNAL_USE_TM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
# Incompatible flags?
KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1" | bc ))
KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1" | bc ))
KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1" | bc))
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1)
@ -257,12 +290,10 @@ endif
KOKKOS_CPPFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src
# No warnings:
KOKKOS_CXXFLAGS =
# INTEL and CLANG warnings:
#KOKKOS_CXXFLAGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
# GCC warnings:
#KOKKOS_CXXFLAGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized -Wignored-qualifiers -Wempty-body -Wclobbered
ifeq ($(KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS), 1)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_WARNINGS)
endif
KOKKOS_LIBS = -lkokkos -ldl
KOKKOS_LDFLAGS = -L$(shell pwd)
@ -486,6 +517,28 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1)
endif
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SSE42), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_SSE42 1" >> KokkosCore_config.tmp )
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
KOKKOS_CXXFLAGS += -xSSE4.2
KOKKOS_LDFLAGS += -xSSE4.2
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
KOKKOS_CXXFLAGS += -tp=nehalem
KOKKOS_LDFLAGS += -tp=nehalem
else
# Assume that this is a really a GNU compiler.
KOKKOS_CXXFLAGS += -msse4.2
KOKKOS_LDFLAGS += -msse4.2
endif
endif
endif
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_AVX 1" >> KokkosCore_config.tmp )
@ -689,7 +742,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
endif
endif
KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h)
KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h 2>&1)
ifeq ($(KOKKOS_INTERNAL_LS_CONFIG), KokkosCore_config.h)
KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep define | wc -l))
else

View File

@ -20,8 +20,10 @@ Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Ta
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
Kokkos_HostThreadTeam.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp
Kokkos_spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
Kokkos_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp
Kokkos_Rendezvous.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Rendezvous.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Rendezvous.cpp
Kokkos_Profiling_Interface.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp
Kokkos_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp
@ -36,6 +38,8 @@ Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cu
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
Kokkos_Cuda_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
endif
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)

View File

@ -61,14 +61,19 @@ protected:
{
std::cout << std::setprecision(5) << std::scientific;
unsigned threads_count = omp_get_max_threads();
int threads_count = 0;
#pragma omp parallel
{
#pragma omp atomic
++threads_count;
}
if ( Kokkos::hwloc::available() ) {
threads_count = Kokkos::hwloc::get_available_numa_count() *
Kokkos::hwloc::get_available_cores_per_numa();
if (threads_count > 3) {
threads_count /= 2;
}
Kokkos::OpenMP::initialize( threads_count );
Kokkos::OpenMP::print_configuration( std::cout );
}
static void TearDownTestCase()

View File

@ -283,12 +283,12 @@ struct test_random_scalar {
RandomGenerator& pool,
unsigned int num_draws)
{
using std::cerr;
using std::cout;
using std::endl;
using Kokkos::parallel_reduce;
{
cerr << " -- Testing randomness properties" << endl;
cout << " -- Testing randomness properties" << endl;
RandomProperties result;
typedef test_random_functor<RandomGenerator, Scalar> functor_type;
@ -307,7 +307,7 @@ struct test_random_scalar {
( 1.5*tolerance > variance_eps)) ? 1:0;
pass_covar = ((-2.0*tolerance < covariance_eps) &&
( 2.0*tolerance > covariance_eps)) ? 1:0;
cerr << "Pass: " << pass_mean
cout << "Pass: " << pass_mean
<< " " << pass_var
<< " " << mean_eps
<< " " << variance_eps
@ -315,7 +315,7 @@ struct test_random_scalar {
<< " || " << tolerance << endl;
}
{
cerr << " -- Testing 1-D histogram" << endl;
cout << " -- Testing 1-D histogram" << endl;
RandomProperties result;
typedef test_histogram1d_functor<typename RandomGenerator::device_type> functor_type;
@ -335,7 +335,7 @@ struct test_random_scalar {
pass_hist1d_covar = ((-0.06 < covariance_eps) &&
( 0.06 > covariance_eps)) ? 1:0;
cerr << "Density 1D: " << mean_eps
cout << "Density 1D: " << mean_eps
<< " " << variance_eps
<< " " << (result.covariance/HIST_DIM1D/HIST_DIM1D)
<< " || " << tolerance
@ -348,7 +348,7 @@ struct test_random_scalar {
<< endl;
}
{
cerr << " -- Testing 3-D histogram" << endl;
cout << " -- Testing 3-D histogram" << endl;
RandomProperties result;
typedef test_histogram3d_functor<typename RandomGenerator::device_type> functor_type;
@ -368,7 +368,7 @@ struct test_random_scalar {
pass_hist3d_covar = ((-tolerance < covariance_eps) &&
( tolerance > covariance_eps)) ? 1:0;
cerr << "Density 3D: " << mean_eps
cout << "Density 3D: " << mean_eps
<< " " << variance_eps
<< " " << result.covariance/HIST_DIM1D/HIST_DIM1D
<< " || " << tolerance
@ -381,18 +381,18 @@ struct test_random_scalar {
template <class RandomGenerator>
void test_random(unsigned int num_draws)
{
using std::cerr;
using std::cout;
using std::endl;
typename test_random_functor<RandomGenerator,int>::type_1d density_1d("D1d");
typename test_random_functor<RandomGenerator,int>::type_3d density_3d("D3d");
uint64_t ticks = std::chrono::high_resolution_clock::now().time_since_epoch().count();
cerr << "Test Seed:" << ticks << endl;
cout << "Test Seed:" << ticks << endl;
RandomGenerator pool(ticks);
cerr << "Test Scalar=int" << endl;
cout << "Test Scalar=int" << endl;
test_random_scalar<RandomGenerator,int> test_int(density_1d,density_3d,pool,num_draws);
ASSERT_EQ( test_int.pass_mean,1);
ASSERT_EQ( test_int.pass_var,1);
@ -406,7 +406,7 @@ void test_random(unsigned int num_draws)
deep_copy(density_1d,0);
deep_copy(density_3d,0);
cerr << "Test Scalar=unsigned int" << endl;
cout << "Test Scalar=unsigned int" << endl;
test_random_scalar<RandomGenerator,unsigned int> test_uint(density_1d,density_3d,pool,num_draws);
ASSERT_EQ( test_uint.pass_mean,1);
ASSERT_EQ( test_uint.pass_var,1);
@ -420,7 +420,7 @@ void test_random(unsigned int num_draws)
deep_copy(density_1d,0);
deep_copy(density_3d,0);
cerr << "Test Scalar=int64_t" << endl;
cout << "Test Scalar=int64_t" << endl;
test_random_scalar<RandomGenerator,int64_t> test_int64(density_1d,density_3d,pool,num_draws);
ASSERT_EQ( test_int64.pass_mean,1);
ASSERT_EQ( test_int64.pass_var,1);
@ -434,7 +434,7 @@ void test_random(unsigned int num_draws)
deep_copy(density_1d,0);
deep_copy(density_3d,0);
cerr << "Test Scalar=uint64_t" << endl;
cout << "Test Scalar=uint64_t" << endl;
test_random_scalar<RandomGenerator,uint64_t> test_uint64(density_1d,density_3d,pool,num_draws);
ASSERT_EQ( test_uint64.pass_mean,1);
ASSERT_EQ( test_uint64.pass_var,1);
@ -448,7 +448,7 @@ void test_random(unsigned int num_draws)
deep_copy(density_1d,0);
deep_copy(density_3d,0);
cerr << "Test Scalar=float" << endl;
cout << "Test Scalar=float" << endl;
test_random_scalar<RandomGenerator,float> test_float(density_1d,density_3d,pool,num_draws);
ASSERT_EQ( test_float.pass_mean,1);
ASSERT_EQ( test_float.pass_var,1);
@ -462,7 +462,7 @@ void test_random(unsigned int num_draws)
deep_copy(density_1d,0);
deep_copy(density_3d,0);
cerr << "Test Scalar=double" << endl;
cout << "Test Scalar=double" << endl;
test_random_scalar<RandomGenerator,double> test_double(density_1d,density_3d,pool,num_draws);
ASSERT_EQ( test_double.pass_mean,1);
ASSERT_EQ( test_double.pass_var,1);

View File

@ -44,6 +44,7 @@
#include<Kokkos_Core.hpp>
#include<impl/Kokkos_Timer.hpp>
#include<bench.hpp>
#include<cstdlib>
int main(int argc, char* argv[]) {
Kokkos::initialize();

View File

@ -44,11 +44,11 @@
#include<Kokkos_Core.hpp>
#include<impl/Kokkos_Timer.hpp>
#include<gather.hpp>
#include<cstdlib>
int main(int argc, char* argv[]) {
Kokkos::initialize(argc,argv);
if(argc<8) {
printf("Arguments: S N K D\n");
printf(" S: Scalar Type Size (1==float, 2==double, 4=complex<double>)\n");

View File

@ -0,0 +1,44 @@
KOKKOS_PATH = ../..
SRC = $(wildcard *.cpp)
default: build
echo "Start Build"
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
CXXFLAGS = -O3 -g
LINK = ${CXX}
LINKFLAGS =
EXE = policy_performance.cuda
KOKKOS_DEVICES = "Cuda,OpenMP"
KOKKOS_ARCH = "SNB,Kepler35"
KOKKOS_CUDA_OPTIONS+=enable_lambda
else
CXX = g++
CXXFLAGS = -O3 -g -Wall -Werror
LINK = ${CXX}
LINKFLAGS =
EXE = policy_performance.host
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "SNB"
endif
DEPFLAGS = -M
OBJ = $(SRC:.cpp=.o)
LIB =
include $(KOKKOS_PATH)/Makefile.kokkos
build: $(EXE)
$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
clean: kokkos-clean
rm -f *.o *.cuda *.host
# Compilation rules
%.o:%.cpp $(KOKKOS_CPP_DEPENDS) main.cpp policy_perf_test.hpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<

View File

@ -0,0 +1,170 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#include "policy_perf_test.hpp"
int main(int argc, char* argv[] ) {
Kokkos::initialize(argc,argv);
if(argc<10) {
printf(" Ten arguments are needed to run this program:\n");
printf(" (1)team_range, (2)thread_range, (3)vector_range, (4)outer_repeat, (5)thread_repeat, (6)vector_repeat, (7)team_size, (8)vector_size, (9)schedule, (10)test_type\n");
printf(" team_range: number of teams (league_size)\n");
printf(" thread_range: range for nested TeamThreadRange parallel_*\n");
printf(" vector_range: range for nested ThreadVectorRange parallel_*\n");
printf(" outer_repeat: number of repeats for outer parallel_* call\n");
printf(" thread_repeat: number of repeats for TeamThreadRange parallel_* call\n");
printf(" vector_repeat: number of repeats for ThreadVectorRange parallel_* call\n");
printf(" team_size: number of team members (team_size)\n");
printf(" vector_size: desired vectorization (if possible)\n");
printf(" schedule: 1 == Static 2 == Dynamic\n");
printf(" test_type: 3-digit code XYZ for testing (nested) parallel_*\n");
printf(" code key: XYZ X in {1,2,3,4,5}, Y in {0,1,2}, Z in {0,1,2}\n");
printf(" TeamPolicy:\n");
printf(" X: 0 = none (never used, makes no sense); 1 = parallel_for; 2 = parallel_reduce\n");
printf(" Y: 0 = none; 1 = parallel_for; 2 = parallel_reduce\n");
printf(" Z: 0 = none; 1 = parallel_for; 2 = parallel_reduce\n");
printf(" RangePolicy:\n");
printf(" X: 3 = parallel_for; 4 = parallel_reduce; 5 = parallel_scan\n");
printf(" Y: 0 = none\n");
printf(" Z: 0 = none\n");
printf(" Example Input:\n");
printf(" 100000 32 32 100 100 100 8 1 1 100\n");
Kokkos::finalize();
return 0;
}
int team_range = atoi(argv[1]);
int thread_range = atoi(argv[2]);
int vector_range = atoi(argv[3]);
int outer_repeat = atoi(argv[4]);
int thread_repeat = atoi(argv[5]);
int vector_repeat = atoi(argv[6]);
int team_size = atoi(argv[7]);
int vector_size = atoi(argv[8]);
int schedule = atoi(argv[9]);
int test_type = atoi(argv[10]);
int disable_verbose_output = 0;
if ( argc > 11 ) {
disable_verbose_output = atoi(argv[11]);
}
if ( schedule != 1 && schedule != 2 ) {
printf("schedule: %d\n", schedule);
printf("Options for schedule are: 1 == Static 2 == Dynamic\n");
Kokkos::finalize();
return -1;
}
if ( test_type != 100 && test_type != 110 && test_type != 111 && test_type != 112 && test_type != 120 && test_type != 121 && test_type != 122
&& test_type != 200 && test_type != 210 && test_type != 211 && test_type != 212 && test_type != 220 && test_type != 221 && test_type != 222
&& test_type != 300 && test_type != 400 && test_type != 500
)
{
printf("Incorrect test_type option\n");
Kokkos::finalize();
return -2;
}
double result = 0.0;
Kokkos::parallel_reduce( "parallel_reduce warmup", Kokkos::TeamPolicy<>(10,1),
KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type team, double& lval) {
lval += 1;
}, result);
typedef Kokkos::View<double*, Kokkos::LayoutRight> view_type_1d;
typedef Kokkos::View<double**, Kokkos::LayoutRight> view_type_2d;
typedef Kokkos::View<double***, Kokkos::LayoutRight> view_type_3d;
// Allocate view without initializing
// Call a 'warmup' test with 1 repeat - this will initialize the corresponding view appropriately for test and should obey first-touch etc
// Second call to test is the one we actually care about and time
view_type_1d v_1( Kokkos::ViewAllocateWithoutInitializing("v_1"), team_range*team_size);
view_type_2d v_2( Kokkos::ViewAllocateWithoutInitializing("v_2"), team_range*team_size, thread_range);
view_type_3d v_3( Kokkos::ViewAllocateWithoutInitializing("v_3"), team_range*team_size, thread_range, vector_range);
double result_computed = 0.0;
double result_expect = 0.0;
double time = 0.0;
if(schedule==1) {
if ( test_type != 500 ) {
// warmup - no repeat of loops
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
}
else {
// parallel_scan: initialize 1d view for parallel_scan
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,100,v_1,v_2,v_3,result_computed,result_expect,time);
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
}
}
if(schedule==2) {
if ( test_type != 500 ) {
// warmup - no repeat of loops
test_policy<Kokkos::Schedule<Kokkos::Dynamic>,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
test_policy<Kokkos::Schedule<Kokkos::Dynamic>,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
}
else {
// parallel_scan: initialize 1d view for parallel_scan
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,100,v_1,v_2,v_3,result_computed,result_expect,time);
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
}
}
if ( disable_verbose_output == 0 ) {
printf("%7i %4i %2i %9i %4i %4i %4i %2i %1i %3i %e %e %lf\n",team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,schedule,test_type,result_computed,result_expect,time);
}
else {
printf("%lf\n",time);
}
Kokkos::finalize();
return 0;
}

View File

@ -0,0 +1,354 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
template < class ViewType >
struct ParallelScanFunctor {
using value_type = double;
ViewType v;
ParallelScanFunctor( const ViewType & v_ )
: v(v_)
{}
KOKKOS_INLINE_FUNCTION
void operator()( const int idx, value_type& val, const bool& final ) const
{
// inclusive scan
val += v(idx);
if ( final ) {
v(idx) = val;
}
}
};
template<class ScheduleType,class IndexType,class ViewType1, class ViewType2, class ViewType3>
void test_policy(int team_range, int thread_range, int vector_range,
int outer_repeat, int thread_repeat, int inner_repeat,
int team_size, int vector_size, int test_type,
ViewType1 &v1, ViewType2 &v2, ViewType3 &v3,
double &result, double &result_expect, double &time) {
typedef Kokkos::TeamPolicy<ScheduleType,IndexType> t_policy;
typedef typename t_policy::member_type t_team;
Kokkos::Timer timer;
for(int orep = 0; orep<outer_repeat; orep++) {
if (test_type == 100) {
Kokkos::parallel_for("100 outer for", t_policy(team_range,team_size),
KOKKOS_LAMBDA (const t_team& team) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
v1(idx) = idx;
// prevent compiler optimizing loop away
});
}
if (test_type == 110) {
Kokkos::parallel_for("110 outer for", t_policy(team_range,team_size),
KOKKOS_LAMBDA (const t_team& team) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
for (int tr = 0; tr<thread_repeat; ++tr) {
// Each team launches a parallel_for; thread_range is partitioned among team members
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
v2( idx, t ) = t;
// prevent compiler optimizing loop away
});
}
});
}
if (test_type == 111) {
Kokkos::parallel_for("111 outer for", t_policy(team_range,team_size,vector_size),
KOKKOS_LAMBDA (const t_team& team) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
for (int tr = 0; tr<thread_repeat; ++tr) {
// Each team launches a parallel_for; thread_range is partitioned among team members
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
for (int vr = 0; vr<inner_repeat; ++vr)
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,vector_range), [&] (const int vi) {
v3( idx, t, vi ) = vi;
// prevent compiler optimizing loop away
});
});
}
});
}
if (test_type == 112) {
Kokkos::parallel_for("112 outer for", t_policy(team_range,team_size,vector_size),
KOKKOS_LAMBDA (const t_team& team) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
for (int tr = 0; tr<thread_repeat; ++tr) {
// Each team launches a parallel_for; thread_range is partitioned among team members
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
double vector_result = 0.0;
for (int vr = 0; vr<inner_repeat; ++vr) {
vector_result = 0.0;
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,vector_range), [&] (const int vi, double &vval) {
vval += 1;
}, vector_result);
}
v2( idx, t ) = vector_result;
// prevent compiler optimizing loop away
});
}
});
}
if (test_type == 120) {
Kokkos::parallel_for("120 outer for", t_policy(team_range,team_size),
KOKKOS_LAMBDA (const t_team& team) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
double team_result = 0.0;
for (int tr = 0; tr<thread_repeat; ++tr) {
team_result = 0.0;
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double &lval) {
lval += 1;
}, team_result);
}
v1(idx) = team_result;
// prevent compiler optimizing loop away
});
}
if (test_type == 121) {
Kokkos::parallel_for("121 outer for", t_policy(team_range,team_size,vector_size),
KOKKOS_LAMBDA (const t_team& team) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
double team_result = 0.0;
for (int tr = 0; tr<thread_repeat; ++tr) {
team_result = 0.0;
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double &lval) {
lval += 1;
for (int vr = 0; vr<inner_repeat; ++vr) {
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,vector_range), [&] (const int vi) {
v3( idx, t, vi ) = vi;
// prevent compiler optimizing loop away
});
}
}, team_result);
}
v3( idx, 0, 0 ) = team_result;
// prevent compiler optimizing loop away
});
}
if (test_type == 122) {
Kokkos::parallel_for("122 outer for", t_policy(team_range,team_size,vector_size),
KOKKOS_LAMBDA (const t_team& team) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
double team_result = 0.0;
for (int tr = 0; tr<thread_repeat; ++tr) {
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double &lval) {
double vector_result = 0.0;
for (int vr = 0; vr<inner_repeat; ++vr)
vector_result = 0.0;
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,vector_range), [&] (const int vi, double &vval) {
vval += 1;
}, vector_result);
lval += vector_result;
}, team_result);
}
v1(idx) = team_result;
// prevent compiler optimizing loop away
});
}
if (test_type == 200) {
Kokkos::parallel_reduce("200 outer reduce", t_policy(team_range,team_size),
KOKKOS_LAMBDA (const t_team& team, double& lval) {
lval+=team.team_size()*team.league_rank() + team.team_rank();
},result);
result_expect = 0.5* (team_range*team_size)*(team_range*team_size-1);
// sum ( seq( [0, team_range*team_size) )
}
if (test_type == 210) {
Kokkos::parallel_reduce("210 outer reduce", t_policy(team_range,team_size),
KOKKOS_LAMBDA (const t_team& team, double& lval) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
double thread_for = 1.0;
for(int tr = 0; tr<thread_repeat; tr++) {
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
v2(idx,t) = t;
// prevent compiler optimizing loop away
});
}
lval+=(team.team_size()*team.league_rank() + team.team_rank() + thread_for);
},result);
result_expect = 0.5* (team_range*team_size)*(team_range*team_size-1) + (team_range*team_size);
// sum ( seq( [0, team_range*team_size) + 1 per team_member (total of team_range*team_size) )
}
if (test_type == 211) {
Kokkos::parallel_reduce("211 outer reduce", t_policy(team_range,team_size,vector_size),
KOKKOS_LAMBDA (const t_team& team, double& lval) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
double thread_for = 1.0;
for(int tr = 0; tr<thread_repeat; tr++) {
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
for (int vr = 0; vr<inner_repeat; ++vr)
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_range), [&] (const int vi) {
v3(idx, t, vi) = vi;
// prevent compiler optimizing loop away
});
});
}
lval+=idx+thread_for;
},result);
result_expect = 0.5*(team_range*team_size)*(team_range*team_size-1) + (team_range*team_size);
// sum ( seq( [0, team_range*team_size) + 1 per team_member (total of team_range*team_size) )
}
if (test_type == 212) {
Kokkos::parallel_reduce("212 outer reduce", t_policy(team_range,team_size,vector_size),
KOKKOS_LAMBDA (const t_team& team, double& lval) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
double vector_result = 0.0;
for(int tr = 0; tr<thread_repeat; tr++) {
// This parallel_for is executed by each team; the thread_range is partitioned among the team members
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
v2(idx,t) = t;
// prevent compiler optimizing loop away
for (int vr = 0; vr<inner_repeat; ++vr) {
vector_result = 0.0;
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, vector_range), [&] (const int vi, double &vval) {
vval += vi;
}, vector_result );
}
});
}
lval+= idx + vector_result;
},result);
result_expect = 0.5*(team_range*team_size)*(team_range*team_size-1) + (0.5*vector_range*(vector_range-1)*team_range*team_size);
// sum ( seq( [0, team_range*team_size) + sum( seq( [0, vector_range) ) per team_member (total of team_range*team_size) )
}
if (test_type == 220) {
Kokkos::parallel_reduce("220 outer reduce", t_policy(team_range,team_size),
KOKKOS_LAMBDA (const t_team& team, double& lval) {
double team_result = 0.0;
for(int tr = 0; tr<thread_repeat; tr++) {
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double& tval) {
tval += t;
},team_result);
}
lval+=team_result*team.league_rank(); // constant * league_rank
},result);
result_expect = 0.5*(team_range)*(team_range-1) * team_size * 0.5*(thread_range)*(thread_range-1);
// sum ( seq( [0, team_range) * constant ); constant = sum( seq( [0, thread_range) )*team_size (1 per member, result for each team)
}
if (test_type == 221) {
Kokkos::parallel_reduce("221 outer reduce", t_policy(team_range,team_size,vector_size),
KOKKOS_LAMBDA (const t_team& team, double& lval) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
double team_result = 0;
for(int tr = 0; tr<thread_repeat; tr++) {
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double& tval) {
double vector_for = 1.0;
for (int vr = 0; vr<inner_repeat; ++vr) {
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_range), [&] (const int vi) {
v3(idx, t, vi) = vi;
// prevent compiler optimizing loop away
});
}
tval += t + vector_for;
},team_result);
}
lval+=team_result*team.league_rank();
},result);
result_expect = 0.5* (team_range)*(team_range-1) * team_size * (0.5*(thread_range) * (thread_range-1) + thread_range);
// sum ( seq( [0, team_range) * constant ) + 1 per member per team; constant = sum( seq( [0, thread_range) )*team_size (1 per member, result for each team)
}
if (test_type == 222) {
Kokkos::parallel_reduce("222 outer reduce", t_policy(team_range,team_size,vector_size),
KOKKOS_LAMBDA (const t_team& team, double& lval) {
double team_result = 0.0;
for(int tr = 0; tr<thread_repeat; tr++) {
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double& tval) {
double vector_result = 0.0;
for (int vr = 0; vr<inner_repeat; ++vr) {
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, vector_range), [&] (const int vi, double& vval) {
vval += vi;
}, vector_result);
}
tval += t + vector_result;
},team_result);
}
lval+=team_result*team.league_rank();
},result);
result_expect = 0.5* (team_range)*(team_range-1) * team_size * (0.5*(thread_range) * (thread_range-1) + thread_range*0.5*(vector_range)*(vector_range-1));
// sum ( seq( [0, team_range) * constant ) + 1 + sum( seq([0,vector_range) ) per member per team; constant = sum( seq( [0, thread_range) )*team_size (1 per member, result for each team)
}
// parallel_for RangePolicy: range = team_size*team_range
if (test_type == 300) {
Kokkos::parallel_for("300 outer for", team_size*team_range,
KOKKOS_LAMBDA (const int idx) {
v1(idx) = idx;
// prevent compiler from optimizing away the loop
});
}
// parallel_reduce RangePolicy: range = team_size*team_range
if (test_type == 400) {
Kokkos::parallel_reduce("400 outer reduce", team_size*team_range,
KOKKOS_LAMBDA (const int idx, double& val) {
val += idx;
}, result);
result_expect = 0.5*(team_size*team_range)*(team_size*team_range-1);
}
// parallel_scan RangePolicy: range = team_size*team_range
if (test_type == 500) {
Kokkos::parallel_scan("500 outer scan", team_size*team_range,
ParallelScanFunctor<ViewType1>(v1)
#if 0
// This does not compile with pre Cuda 8.0 - see Github Issue #913 for explanation
KOKKOS_LAMBDA (const int idx, double& val, const bool& final) {
// inclusive scan
val += v1(idx);
if ( final ) {
v1(idx) = val;
}
}
#endif
);
// result = v1( team_size*team_range - 1 ); // won't work with Cuda - need to copy result back to host to print
// result_expect = 0.5*(team_size*team_range)*(team_size*team_range-1);
}
} // end outer for loop
time = timer.seconds();
} //end test_policy

View File

@ -0,0 +1,53 @@
#!/bin/bash
# Script to check policy_perf_test code works with each possible combo of options
echo "Performance test results for parallel_reduce code computing sum of sequence [0,N) with various (nested) policies"
EXECUTABLE=policy_performance
TEAMRANGE=1000
THREADRANGE=4
VECTORRANGE=32
TEAMSIZE=4
VECTORSIZE=1
OREPEAT=1
MREPEAT=1
IREPEAT=1
SCHEDULE=1
SUFFIX=host
if [ -e $EXECUTABLE.$SUFFIX ]
then
SCHEDULE=1
echo "Host tests Static schedule"
for CODE in {100,110,111,112,120,121,122,200,210,211,212,220,221,222,300,400,500}
do
OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
done
SCHEDULE=2
echo "Host tests Dynamic schedule"
for CODE in {100,110,111,112,120,121,122,200,210,211,212,220,221,222,300,400,500}
do
OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
done
fi
SUFFIX=cuda
if [ -e $EXECUTABLE.$SUFFIX ]
then
SCHEDULE=1
echo "Cuda tests Static schedule"
for CODE in {100,110,111,112,120,121,122,200,210,211,212,220,221,222,300,400,500}
do
./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
done
SCHEDULE=2
echo "Cuda tests Dynamic schedule"
for CODE in {100,110,111,112,120,121,122,200,210,211,212,220,221,222,300,400,500}
do
./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
done
fi

View File

@ -0,0 +1,126 @@
#!/bin/bash
# Sample script for benchmarking policy performance
# Suggested enviroment variables to export prior to executing script:
# KNL:
# OMP_NUM_THREADS=256 KMP_AFFINITY=compact
# Power:
# OMP_NUM_THREADS=64 OMP_PROC_BIND=true
# Constants and Variables:
# Vary: TEAMSIZE, and THREADRANGE
# for TEAMSIZE in {1,2,4,5,8}; do
# for THREADRANGE in {32,41,1000}; do
# Fixed: TEAMRANGE, VECTORRANGE, VECTORSIZE
# System specific: Adjust REPEAT values to architecture tests are run on
# Tests
# Static SCHEDULE = 1
# Tier 1: parallel_for + RangePolicy 300
# Tier 2: parallel_reduce, parallel_scan + RangePolicy 400 500
# Tier 3: 'outer' parallel_for with TeamPolicy (nested parallelism) 1XY
# Tier 4: 'outer' parallel_reduce with TeamPolicy (nested parallelism) 2XY
# Dynamic SCHEDULE = 2
# Tier 5: parallel_for + RangePolicy 300
# Tier 6: parallel_reduce, parallel_scan + RangePolicy 400 500
# Tier 7: 'outer' parallel_for with TeamPolicy (nested parallelism) 1XY
# Tier 8: 'outer' parallel_reduce with TeamPolicy (nested parallelism) 2XY
# Results grouped by:
# 0) SCHEDULE 1) CODE (test) 2) TEAMRANGE 3) TEAMSIZE 4) THREADRANGE
EXECUTABLE=policy_performance
# Default defined values
TEAMRANGE=1000
THREADRANGE=1
VECTORRANGE=32
TEAMSIZE=1
VECTORSIZE=1
OREPEAT=1
MREPEAT=1
IREPEAT=1
SCHEDULE=1
# Host tests
SUFFIX=host
if [ -e $EXECUTABLE.$SUFFIX ]; then
echo "Host"
for SCHEDULE in {1,2}; do
# Tier 1 and 2, 5 and 6
for CODE in {300,400,500}; do
for TEAMSIZE in {1,2,4,5,8}; do
OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
done
done
# Tier 3, 7
for CODE in {100,110,111,112,120,121,122}; do
for TEAMSIZE in {1,2,4,5,8}; do
for THREADRANGE in {32,41,1000}; do
OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
done
done
done
# Tier 4, 8
for CODE in {200,210,211,212,220,221,222}; do
for TEAMSIZE in {1,2,4,5,8}; do
for THREADRANGE in {32,41,1000}; do
OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
done
done
done
done # end SCHEDULE
fi # end host
# Cuda tests
SUFFIX=cuda
# TEAMRANGE=10000, TEAMSIZE=8 too large
# TEAMRANGE=10000, TEAMSIZE=8, THREADRANGE=1000 too large
if [ -e $EXECUTABLE.$SUFFIX ]; then
echo "Cuda"
for SCHEDULE in {1,2}; do
# Reset defaults
TEAMRANGE=1000
THREADRANGE=1
VECTORRANGE=32
TEAMSIZE=1
VECTORSIZE=1
# Tier 1 and 2, 5 and 6
for CODE in {300,400,500}; do
for TEAMSIZE in {1,2,4,5,8}; do
./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
done
done
# Tier 3, 7
for CODE in {100,110,111,112,120,121,122}; do
for TEAMSIZE in {1,2,4,5,8}; do
for THREADRANGE in {32,41,1000}; do
./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
done
done
done
# Tier 4, 8
for CODE in {200,210,211,212,220,221,222}; do
for TEAMSIZE in {1,2,4,5,8}; do
for THREADRANGE in {32,41,1000}; do
./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
done
done
done
done # end SCHEDULE
fi #end cuda

454
lib/kokkos/bin/hpcbind Executable file
View File

@ -0,0 +1,454 @@
#!/usr/bin/env bash
################################################################################
# Check if hwloc commands exist
################################################################################
declare -i HPCBIND_HAS_HWLOC=1
type hwloc-bind >/dev/null 2>&1
HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?))
type hwloc-distrib >/dev/null 2>&1
HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?))
type hwloc-ls >/dev/null 2>&1
HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?))
type hwloc-calc >/dev/null 2>&1
HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?))
type hwloc-ps >/dev/null 2>&1
HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?))
if [[ ${HPCBIND_HAS_HWLOC} -eq 0 ]]; then
echo "hwloc not found, no process binding will occur"
fi
# Get parent cpuset
HPCBIND_HWLOC_PARENT_CPUSET=""
if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
MY_PID="$BASHPID"
HPCBIND_HWLOC_PARENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2)
fi
################################################################################
# Check if nvidia-smi exist
################################################################################
declare -i HPCBIND_HAS_NVIDIA=0
type nvidia-smi >/dev/null 2>&1
HPCBIND_HAS_NVIDIA=$((!$?))
################################################################################
# Get visible gpu
################################################################################
declare -i NUM_GPUS=0
HPCBIND_VISIBLE_GPUS=""
if [[ ${HPCBIND_HAS_NVIDIA} -eq 1 ]]; then
NUM_GPUS=$(nvidia-smi -L | wc -l);
GPU_LIST="$( seq 0 $((NUM_GPUS-1)) )"
HPCBIND_VISIBLE_GPUS=${CUDA_VISIBLE_DEVICES:-${GPU_LIST}}
fi
declare -i HPCBIND_ENABLE_GPU_MAPPING=$((NUM_GPUS > 0))
################################################################################
# Get queue id
# supports sbatch, bsub, aprun
################################################################################
HPCBIND_QUEUE_NAME=""
declare -i HPCBIND_QUEUE_INDEX=0
declare -i HPCBIND_QUEUE_GPU_MAPPING=0
if [[ ! -z "${SLURM_LOCAL_ID}" ]]; then
HPCBIND_QUEUE_GPU_MAPPING=1
HPCBIND_QUEUE_NAME="sbatch"
HPCBIND_QUEUE_INDEX=${SLURM_LOCAL_ID}
elif [[ ! -z "${LBS_JOBINDEX}" ]]; then
HPCBIND_QUEUE_GPU_MAPPING=1
HPCBIND_QUEUE_NAME="bsub"
HPCBIND_QUEUE_INDEX=${LBS_JOBINDEX}
elif [[ ! -z "${ALPS_APP_PE}" ]]; then
HPCBIND_QUEUE_GPU_MAPPING=1
HPCBIND_QUEUE_NAME="aprun"
HPCBIND_QUEUE_INDEX=${ALPS_APP_PE}
fi
################################################################################
# Show help
################################################################################
function show_help {
local cmd=$(basename "$0")
echo "Usage: ${cmd} <options> -- command ..."
echo " Set the process mask, OMP environment variables and CUDA environment"
echo " variables to sane values if possible. Uses hwloc and nvidia-smi if"
echo " available. Will preserve the current process binding, so it is safe"
echo " to use with a queuing system or mpiexec."
echo ""
echo "Options:"
echo " --no-hwloc-bind Disable binding"
echo " --proc-bind=<LOC> Set the initial process mask for the script"
echo " LOC can be any valid location argument for"
echo " hwloc-calc Default: all"
echo " --distribute=N Distribute the current cpuset into N partitions"
echo " --distribute-partition=I"
echo " Use the i'th partition (zero based)"
echo " --visible-gpus=<L> Comma separated list of gpu ids"
echo " Default: CUDA_VISIBLE_DEVICES or all gpus in"
echo " sequential order"
echo " --gpu-ignore-queue Ignore queue job id when choosing visible GPU"
echo " --no-gpu-mapping Do not set CUDA_VISIBLE_DEVICES"
echo " --openmp=M.m Set env variables for the given OpenMP version"
echo " Default: 4.0"
echo " --openmp-percent=N Integer percentage of cpuset to use for OpenMP"
echo " threads Default: 100"
echo " --openmp-places=<Op> Op=threads|cores|sockets. Default: threads"
echo " --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES"
echo " --force-openmp-num-threads=N"
echo " Override logic for selecting OMP_NUM_THREADS"
echo " --force-openmp-proc-bind=<OP>"
echo " Override logic for selecting OMP_PROC_BIND"
echo " --no-openmp-nested Set OMP_NESTED to false"
echo " --show-bindings Show the bindings"
echo " --lstopo Show bindings in lstopo without executing a command"
echo " -v|--verbose Show options and relevant environment variables"
echo " -h|--help Show this message"
echo ""
echo "Sample Usage:"
echo " Split the current process cpuset into 4 and use the 3rd partition"
echo " ${cmd} --distribute=4 --distribute-partition=2 -v -- command ..."
echo " Bing the process to all even cores"
echo " ${cmd} --proc-bind=core:even -v -- command ..."
echo " Bind to the first 64 cores and split the current process cpuset into 4"
echo " ${cmd} --proc-bind=core:0-63 --distribute=4 --distribute-partition=0 -- command ..."
echo " skip GPU 0 when mapping visible devices"
echo " ${cmd} --distribute=4 --distribute-partition=0 --visible-gpus=1,2 -v -- command ..."
echo " Display the current bindings"
echo " ${cmd} --proc-bind=numa:0 --show-bindings -- command"
echo " Display the current bindings using lstopo"
echo " ${cmd} --proc-bind=numa:0.core:odd --lstopo"
echo ""
}
################################################################################
# Parse command line arguments
################################################################################
# Show help if no command line arguments given
if [[ "$#" -eq 0 ]]; then
show_help
exit 0
fi
declare -a UNKNOWN_ARGS=()
declare -i HPCBIND_ENABLE_HWLOC_BIND=${HPCBIND_HAS_HWLOC}
declare -i HPCBIND_DISTRIBUTE=1
declare -i HPCBIND_PARTITION=0
HPCBIND_PROC_BIND="all"
HPCBIND_OPENMP_VERSION=4.0
declare -i HPCBIND_OPENMP_PERCENT=100
HPCBIND_OPENMP_PLACES=${OMP_PLACES:-threads}
declare -i HPCBIND_OPENMP_PROC_BIND=1
declare -i HPCBIND_OPENMP_FORCE_NUM_THREADS=-1
HPCBIND_OPENMP_FORCE_PROC_BIND=""
HPCBIND_OPENMP_NESTED=${OMP_NESTED:-true}
declare -i HPCBIND_VERBOSE=0
declare -i HPCBIND_SHOW_BINDINGS=0
declare -i HPCBIND_LSTOPO=0
for i in $@; do
case $i in
# number of partitions to create
--no-hwloc-bind)
HPCBIND_ENABLE_HWLOC_BIND=0
shift
;;
--proc-bind=*)
HPCBIND_PROC_BIND="${i#*=}"
shift
;;
--distribute=*)
HPCBIND_DISTRIBUTE="${i#*=}"
shift
;;
# which partition to use
--distribute-partition=*)
HPCBIND_PARTITION="${i#*=}"
shift
;;
--visible-gpus=*)
HPCBIND_VISIBLE_GPUS=$(echo "${i#*=}" | tr ',' ' ')
shift
;;
--gpu-ignore-queue)
HPCBIND_QUEUE_GPU_MAPPING=0
shift
;;
--no-gpu-mapping)
HPCBIND_ENABLE_GPU_MAPPING=0
shift
;;
--openmp=*)
HPCBIND_OPENMP_VERSION="${i#*=}"
shift
;;
--openmp-percent=*)
HPCBIND_OPENMP_PERCENT="${i#*=}"
shift
;;
--openmp-places=*)
HPCBIND_OPENMP_PLACES="${i#*=}"
shift
;;
--no-openmp-proc-bind)
HPCBIND_OPENMP_PROC_BIND=0
shift
;;
--force-openmp-proc-bind=*)
HPCBIND_OPENMP_FORCE_PROC_BIND="${i#*=}"
shift
;;
--force-openmp-num-threads=*)
HPCBIND_OPENMP_FORCE_NUM_THREADS="${i#*=}"
shift
;;
--no-openmp-nested)
HPCBIND_OPENMP_NESTED="false"
shift
;;
--show-bindings)
HPCBIND_VERBOSE=1
HPCBIND_SHOW_BINDINGS=1
shift
;;
--lstopo)
HPCBIND_VERBOSE=1
HPCBIND_SHOW_BINDINGS=0
HPCBIND_LSTOPO=1
shift
;;
-v|--verbose)
HPCBIND_VERBOSE=1
shift
;;
-h|--help)
show_help
exit 0
;;
# ignore remaining arguments
--)
shift
break
;;
# unknown option
*)
UNKNOWN_ARGS+=("$i")
shift
;;
esac
done
################################################################################
# Check unknown arguments
################################################################################
if [[ ${#UNKNOWN_ARGS[*]} > 0 ]]; then
echo "Uknown options: ${UNKNOWN_ARGS[*]}"
exit 1
fi
################################################################################
# Check that visible gpus are valid
################################################################################
HPCBIND_VISIBLE_GPUS=(${HPCBIND_VISIBLE_GPUS})
if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then
for ((i=0; i < ${#HPCBIND_VISIBLE_GPUS[*]}; i++)); do
if [[ ${HPCBIND_VISIBLE_GPUS[$i]} -ge ${NUM_GPUS} ||
${HPCBIND_VISIBLE_GPUS[$i]} -lt 0 ]]; then
echo "Invaild GPU ID ${HPCBIND_VISIBLE_GPUS[$i]}, setting to 0"
HPCBIND_VISIBLE_GPUS[$i]=0;
fi
done
NUM_GPUS=${#HPCBIND_VISIBLE_GPUS[@]}
fi
################################################################################
# Check OpenMP percent
################################################################################
if [[ ${HPCBIND_OPENMP_PERCENT} -lt 1 ]]; then
echo "OpenMP percent < 1, setting to 1"
HPCBIND_OPENMP_PERCENT=1
elif [[ ${HPCBIND_OPENMP_PERCENT} -gt 100 ]]; then
echo "OpenMP percent > 100, setting to 100"
HPCBIND_OPENMP_PERCENT=100
fi
################################################################################
# Check distribute
################################################################################
if [[ ${HPCBIND_DISTRIBUTE} -le 0 ]]; then
echo "Invalid input for distribute, changing distribute to 1"
HPCBIND_DISTRIBUTE=1
fi
if [[ ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then
echo "Invalid input for distribute-partition, changing to 0"
HPCBIND_PARTITION=0
fi
################################################################################
# Find cpuset and num threads
################################################################################
HPCBIND_HWLOC_CPUSET=""
declare -i HPCBIND_NUM_PUS=0
if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then
BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND})
else
BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND})
fi
CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE}))
HPCBIND_HWLOC_CPUSET=${CPUSETS[${HPCBIND_PARTITION}]}
HPCBIND_NUM_PUS=$(hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu | wc -l)
else
HPCBIND_NUM_PUS=$(cat /proc/cpuinfo | grep -c processor)
fi
declare -i HPCBIND_OPENMP_NUM_THREADS=$((HPCBIND_NUM_PUS * HPCBIND_OPENMP_PERCENT))
HPCBIND_OPENMP_NUM_THREADS=$((HPCBIND_OPENMP_NUM_THREADS / 100))
if [[ ${HPCBIND_OPENMP_NUM_THREADS} -lt 1 ]]; then
HPCBIND_OPENMP_NUM_THREADS=1
elif [[ ${HPCBIND_OPENMP_NUM_THREADS} -gt ${HPCBIND_NUM_PUS} ]]; then
HPCBIND_OPENMP_NUM_THREADS=${HPCBIND_NUM_PUS}
fi
if [[ ${HPCBIND_OPENMP_FORCE_NUM_THREADS} -gt 0 ]]; then
HPCBIND_OPENMP_NUM_THREADS=${HPCBIND_OPENMP_FORCE_NUM_THREADS}
fi
################################################################################
# Set OpenMP environment variables
################################################################################
# set OMP_NUM_THREADS
export OMP_NUM_THREADS=${HPCBIND_OPENMP_NUM_THREADS}
# set OMP_PROC_BIND and OMP_PLACES
if [[ ${HPCBIND_OPENMP_PROC_BIND} -eq 1 ]]; then
if [[ "${HPCBIND_OPENMP_FORCE_PROC_BIND}" == "" ]]; then
#default proc bind logic
if [[ "${HPCBIND_OPENMP_VERSION}" == "4.0" || "${HPCBIND_OPENMP_VERSION}" > "4.0" ]]; then
export OMP_PLACES="${HPCBIND_OPENMP_PLACES}"
export OMP_PROC_BIND="spread"
else
export OMP_PROC_BIND="true"
unset OMP_PLACES
fi
else
#force proc bind
export OMP_PLACES="${HPCBIND_OPENMP_PLACES}"
export OMP_PROC_BIND="${HPCBIND_OPENMP_FORCE_PROC_BIND}"
fi
else
# no openmp proc bind
unset OMP_PLACES
unset OMP_PROC_BIND
fi
# set OMP_NESTED
export OMP_NESTED=${HPCBIND_OPENMP_NESTED}
################################################################################
# Set CUDA environment variables
################################################################################
if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then
if [[ ${HPCBIND_QUEUE_GPU_MAPPING} -eq 0 ]]; then
declare -i GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS))
export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}
else
declare -i MY_TASK_ID=$((HPCBIND_QUEUE_INDEX * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION))
declare -i GPU_ID=$((MY_TASK_ID % NUM_GPUS))
export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}
fi
fi
################################################################################
# Set hpcbind environment variables
################################################################################
export HPCBIND_HAS_HWLOC=${HPCBIND_HAS_HWLOC}
export HPCBIND_HAS_NVIDIA=${HPCBIND_HAS_NVIDIA}
export HPCBIND_NUM_PUS=${HPCBIND_NUM_PUS}
export HPCBIND_HWLOC_CPUSET=${HPCBIND_HWLOC_CPUSET}
export HPCBIND_HWLOC_DISTRIBUTE=${HPCBIND_DISTRIBUTE}
export HPCBIND_HWLOC_DISTRIBUTE_PARTITION=${HPCBIND_PARTITION}
if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then
export HPCBIND_HWLOC_PARENT_CPUSET="all"
else
export HPCBIND_HWLOC_PARENT_CPUSET=${HPCBIND_HWLOC_PARENT_CPUSET}
fi
export HPCBIND_HWLOC_PROC_BIND=${HPCBIND_PROC_BIND}
export HPCBIND_NVIDIA_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING}
export HPCBIND_NVIDIA_VISIBLE_GPUS=$(echo "${HPCBIND_VISIBLE_GPUS[*]}" | tr ' ' ',')
export HPCBIND_OPENMP_VERSION=${HPCBIND_OPENMP_VERSION}
if [[ "${HPCBIND_QUEUE_NAME}" != "" ]]; then
export HPCBIND_QUEUE_INDEX=${HPCBIND_QUEUE_INDEX}
export HPCBIND_QUEUE_NAME=${HPCBIND_QUEUE_NAME}
export HPCBIND_QUEUE_GPU_MAPPING=${HPCBIND_QUEUE_GPU_MAPPING}
fi
################################################################################
# Print verbose
################################################################################
if [[ ${HPCBIND_VERBOSE} -eq 1 ]]; then
MY_ENV=$(env | sort)
echo "[HPCBIND]"
echo "${MY_ENV}" | grep -E "^HPCBIND_"
echo "[CUDA]"
echo "${MY_ENV}" | grep -E "^CUDA_"
echo "[OPENMP]"
echo "${MY_ENV}" | grep -E "^OMP_"
fi
if [[ ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then
echo "[BINDINGS]"
hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu
elif [[ ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then
echo "Unable to show bindings, hwloc not available."
fi
################################################################################
# Run command
################################################################################
if [[ ${HPCBIND_LSTOPO} -eq 0 ]]; then
if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- $@
else
eval $@
fi
else
if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! -z ${DISPLAY} ]]; then
echo "[BINDINGS]"
hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu
hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- lstopo --pid 0
else
hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET}
fi
else
echo "Unable to show bindings, hwloc not available."
fi
fi

221
lib/kokkos/bin/kokkos-bind Executable file
View File

@ -0,0 +1,221 @@
#!/usr/bin/env bash
# check if hwloc commands exist
declare -i HAS_HWLOC=0
type hwloc-bind >/dev/null 2>&1
HAS_HWLOC="${HAS_HWLOC} + $?"
type hwloc-distrib >/dev/null 2>&1
HAS_HWLOC="${HAS_HWLOC} + $?"
type hwloc-ls >/dev/null 2>&1
HAS_HWLOC="${HAS_HWLOC} + $?"
type hwloc-calc >/dev/null 2>&1
HAS_HWLOC="${HAS_HWLOC} + $?"
type hwloc-ps >/dev/null 2>&1
HAS_HWLOC="${HAS_HWLOC} + $?"
#parse args
declare -a UNKNOWN_ARGS=()
declare -i DISTRIBUTE=1
declare -i INDEX=0
PROC_BIND="all"
CURRENT_CPUSET=""
OPENMP_VERSION=4.0
OPENMP_PROC_BIND=True
OPENMP_NESTED=True
VERBOSE=False
#get the current process cpuset
if [[ ${HAS_HWLOC} -eq 0 ]]; then
MY_PID="$BASHPID"
CURRENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2)
echo "$CURRENT_CPUSET"
fi
function show_help {
local cmd=$(basename "$0")
echo "Usage: ${cmd} <options> -- command ..."
echo " Uses hwloc to divide the node into the given number of groups,"
echo " set the appropriate OMP_NUM_THREADS and execute the command on the"
echo " selected group."
echo ""
echo " NOTE: This command assumes it has exclusive use of the node"
echo ""
echo "Options:"
echo " --proc-bind=<LOC> Set the initial process mask for the script. "
echo " LOC can be any valid location argumnet for"
echo " hwloc-calc. Defaults to the entire machine"
echo " --distribute=N Distribute the current proc-bind into N groups"
echo " --index=I Use the i'th group (zero based)"
echo " --openmp=M.m Set env variables for the given OpenMP version"
echo " (default 4.0)"
echo " --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES"
echo " --no-openmp-nested Set OMP_NESTED to false"
echo " -v|--verbose"
echo " -h|--help"
echo ""
echo "Sample Usage:"
echo " ${cmd} --distribute=4 --index=2 -v -- command ..."
echo ""
}
if [[ "$#" -eq 0 ]]; then
show_help
exit 0
fi
for i in $@; do
case $i in
# number of partitions to create
--proc-bind=*)
PROC_BIND="${i#*=}"
shift
;;
--distribute=*)
DISTRIBUTE="${i#*=}"
shift
;;
# which group to use
--index=*)
INDEX="${i#*=}"
shift
;;
--openmp=*)
OPENMP_VERSION="${i#*=}"
shift
;;
--no-openmp-proc-bind)
OPENMP_PROC_BIND=False
shift
;;
--no-openmp-nested)
OPENMP_NESTED=False
shift
;;
-v|--verbose)
VERBOSE=True
shift
;;
-h|--help)
show_help
exit 0
;;
# ignore remaining arguments
--)
shift
break
;;
# unknown option
*)
UNKNOWN_ARGS+=("$i")
shift
;;
esac
done
if [[ ${#UNKNOWN_ARGS[*]} > 0 ]]; then
echo "Uknown options: ${UNKNOWN_ARGS[*]}"
exit 1
fi
if [[ ${DISTRIBUTE} -le 0 ]]; then
echo "Invalid input for distribute, changing distribute to 1"
DISTRIBUTE=1
fi
if [[ ${INDEX} -ge ${DISTRIBUTE} ]]; then
echo "Invalid input for index, changing index to 0"
INDEX=0
fi
if [[ ${HAS_HWLOC} -ne 0 ]]; then
echo "hwloc not found, no process binding will occur"
DISTRIBUTE=1
INDEX=0
fi
if [[ ${HAS_HWLOC} -eq 0 ]]; then
if [[ "${CURRENT_CPUSET}" == "" ]]; then
BINDING=$(hwloc-calc ${PROC_BIND})
else
BINDING=$(hwloc-calc --restrict ${CURRENT_CPUSET} ${PROC_BIND})
fi
CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${DISTRIBUTE}))
CPUSET=${CPUSETS[${INDEX}]}
NUM_THREADS=$(hwloc-ls --restrict ${CPUSET} --only pu | wc -l)
if [[ "${VERBOSE}" == "True" ]]; then
echo "hwloc: true"
echo " proc_bind: ${PROC_BIND}"
echo " distribute: ${DISTRIBUTE}"
echo " index: ${INDEX}"
echo " parent_cpuset: ${CURRENT_CPUSET}"
echo " cpuset: ${CPUSET}"
echo "omp_num_threads: ${NUM_THREADS}"
echo "omp_proc_bind: ${OPENMP_PROC_BIND}"
echo "omp_nested: ${OPENMP_NESTED}"
echo "OpenMP: ${OPENMP_VERSION}"
fi
# set OMP env
if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then
if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then
export OMP_PLACES="threads"
export OMP_PROC_BIND="spread"
else
export OMP_PROC_BIND="true"
unset OMP_PLACES
fi
else
unset OMP_PLACES
unset OMP_PROC_BIND
fi
if [[ "${OPENMP_NESTED}" == "True" ]]; then
export OMP_NESTED="true"
else
export OMP_NESTED="false"
fi
export OMP_NUM_THREADS="${NUM_THREADS}"
hwloc-bind ${CPUSET} -- $@
else
NUM_THREADS=$(cat /proc/cpuinfo | grep -c processor)
if [[ "${VERBOSE}" == "True" ]]; then
echo "hwloc: false"
echo "omp_num_threads: ${NUM_THREADS}"
echo "omp_proc_bind: ${OPENMP_PROC_BIND}"
echo "omp_nested: ${OPENMP_NESTED}"
echo "OpenMP: ${OPENMP_VERSION}"
fi
# set OMP env
if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then
if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then
export OMP_PLACES="threads"
export OMP_PROC_BIND="spread"
else
export OMP_PROC_BIND="true"
unset OMP_PLACES
fi
else
unset OMP_PLACES
unset OMP_PROC_BIND
fi
if [[ "${OPENMP_NESTED}" == "True" ]]; then
export OMP_NESTED="true"
else
export OMP_NESTED="false"
fi
export OMP_NUM_THREADS="${NUM_THREADS}"
eval $@
fi

165
lib/kokkos/bin/runtest Executable file
View File

@ -0,0 +1,165 @@
#!/usr/bin/env bash
function get_path() {
cd "$(dirname "$0")"
cd ..
echo "$(pwd -P)"
}
KOKKOS_PATH="$(get_path "$0")"
function show_help() {
local cmd=$(basename "$0")
echo "Usage: ${cmd} <options> "
echo " Build and run the tests"
echo ""
echo "Options:"
echo " -j=N|--make-j=N Build the tests in parallel"
echo " -c|--clean Clean build and regenerate make files"
echo " --clean-on-pass Clean build when runtest passes"
echo " --output-prefix=<pre> Prefix of log files Default: runtest"
echo " --build-only Only build the tests"
echo " -v|--verbose Tee STDOUT and STDERR to screen and files"
echo " -h|--help Show this message"
echo ""
${KOKKOS_PATH}/generate_makefile.bash --help
return 0
}
declare -a GENERATE_ARGS=()
declare -i VERBOSE=0
declare -i CLEAN=0
declare -i CLEAN_ON_PASS=0
declare -i BUILD_ONLY=0
OUTPUT="runtest"
declare -i MAKE_J=${HPCBIND_NUM_PUS:-1}
for i in $@; do
case $i in
-j=*|--make-j=*)
MAKE_J=${i#*=}
shift
;;
-c|--clean)
CLEAN=1
shift
;;
--clean-on-pass)
CLEAN_ON_PASS=1
shift
;;
--output-prefix=*)
OUTPUT=${i#*=}
shift
;;
--build-only)
BUILD_ONLY=1
shift
;;
-v|--verbose)
VERBOSE=1
shift
;;
-h|--help)
show_help
exit 0
;;
*)
GENERATE_ARGS+=("$i")
shift
;;
esac
done
if [[ "$(pwd -P)" == ${KOKKOS_PATH} ]]; then
echo "Cannot call $0 from root repository path ${KOKKOS_PATH}"
exit 1
fi
# Some makefile dependencies are incorrect, so clean needs to force
# a new call to generate_makefiles.bash
if [[ ${CLEAN} -eq 1 ]]; then
START=${SECONDS}
echo "Cleaning"
/bin/rm -rf algorithms containers core example install Makefile >/dev/null 2>&1
END=${SECONDS}
echo " $((END-START)) seconds"
if [[ ${VERBOSE} -eq 1 ]]; then
echo ""
echo ""
fi
fi
declare -i START=${SECONDS}
echo "Generating Makefile"
echo " ${KOKKOS_PATH}/generate_makefile.bash --kokkos-path=${KOKKOS_PATH} ${GENERATE_ARGS[@]}"
if [[ ${VERBOSE} -eq 0 ]]; then
"${KOKKOS_PATH}"/generate_makefile.bash --kokkos-path="${KOKKOS_PATH}" "${GENERATE_ARGS[@]}" > ${OUTPUT}.out 2> >(tee ${OUTPUT}.err >&2)
else
"${KOKKOS_PATH}"/generate_makefile.bash --kokkos-path="${KOKKOS_PATH}" "${GENERATE_ARGS[@]}" > >(tee ${OUTPUT}.out) 2> >(tee ${OUTPUT}.err >&2)
fi
declare -i RESULT=$?
declare -i END=${SECONDS}
if [[ ${RESULT} -eq 0 ]]; then
echo " PASS: $((END-START)) seconds"
if [[ ${VERBOSE} -eq 1 ]]; then
echo ""
echo ""
fi
else
cat ${OUTPUT}.out | grep "FAIL"
cat ${OUTPUT}.err | grep "FAIL"
echo " FAIL: $((END-START)) seconds"
exit 1
fi
START=${SECONDS}
echo "Building"
if [[ ${VERBOSE} -eq 0 ]]; then
make --keep-going -j ${MAKE_J} build-test >> ${OUTPUT}.out 2> >(tee -a ${OUTPUT}.err >&2)
else
make --keep-going -j ${MAKE_J} build-test > >(tee -a ${OUTPUT}.out) 2> >(tee -a ${OUTPUT}.err >&2)
fi
RESULT=$?
END=${SECONDS}
if [[ ${RESULT} -eq 0 ]]; then
echo " PASS: $((END-START)) seconds"
if [[ ${VERBOSE} -eq 1 ]]; then
echo ""
echo ""
fi
else
cat ${OUTPUT}.out | grep -E "[[:space:]]error:[[:space:]]"
cat ${OUTPUT}.err | grep -E "[[:space:]]error:[[:space:]]"
echo " FAIL: $((END-START)) seconds"
exit 1
fi
if [[ ${BUILD_ONLY} -eq 0 ]]; then
START=${SECONDS}
echo "Testing"
if [[ ${VERBOSE} -eq 0 ]]; then
make --keep-going test >> ${OUTPUT}.out 2> >(tee -a ${OUTPUT}.err >&2)
else
make --keep-going test > >(tee -a ${OUTPUT}.out) 2> >(tee -a ${OUTPUT}.err >&2)
fi
RESULT=$?
END=${SECONDS}
if [[ ${RESULT} -eq 0 ]]; then
echo " PASS: $((END-START)) seconds"
if [[ ${CLEAN_ON_PASS} -eq 1 ]]; then
make clean
fi
else
cat ${OUTPUT}.out | grep "FAIL"
cat ${OUTPUT}.err | grep "FAIL"
echo " FAIL: $((END-START)) seconds"
exit 1
fi
fi
exit ${RESULT}

View File

@ -999,8 +999,12 @@ SET (Kokkos_INCLUDE_DIRS
${Kokkos_SOURCE_DIR}/containers/src
${Kokkos_SOURCE_DIR}/algorithms/src
${Kokkos_BINARY_DIR} # to find KokkosCore_config.h
${KOKKOS_INCLUDE_DIRS}
)
# pass include dirs back to parent scope
SET(Kokkos_INCLUDE_DIRS_RET ${Kokkos_INCLUDE_DIRS} PARENT_SCOPE)
INCLUDE_DIRECTORIES(${Kokkos_INCLUDE_DIRS})
IF(KOKKOS_SEPARATE_LIBS)

View File

@ -7,3 +7,4 @@ tag: 2.02.07 date: 12:16:2016 master: 4b4cc4ba develop: 382c0966
tag: 2.02.15 date: 02:10:2017 master: 8c64cd93 develop: 28dea8b6
tag: 2.03.00 date: 04:25:2017 master: 120d9ce7 develop: 015ba641
tag: 2.03.05 date: 05:27:2017 master: 36b92f43 develop: 79073186
tag: 2.03.13 date: 07:27:2017 master: da314444 develop: 29ccb58a

View File

@ -0,0 +1,24 @@
#include <cstdio>
#include <cuda_runtime_api.h>
int main()
{
cudaDeviceProp prop;
const cudaError_t err_code = cudaGetDeviceProperties(&prop, 0);
if (cudaSuccess != err_code) {
fprintf(stderr,"cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(err_code));
return -1;
}
switch (prop.major) {
case 3:
printf("Kepler"); break;
case 5:
printf("Maxwell"); break;
case 6:
printf("Pascal"); break;
default:
fprintf(stderr, "Unspported Device %d%d\n", (int)prop.major, (int)prop.minor);
return -1;
}
printf("%d%d\n", (int)prop.major, (int)prop.minor);
return 0;
}

View File

@ -160,9 +160,14 @@ if [ "$MACHINE" = "sems" ]; then
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/16.0.3 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"clang/3.8.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
@ -280,13 +285,13 @@ elif [ "$MACHINE" = "apollo" ]; then
"gcc/5.1.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS"
"intel/16.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS"
"clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS"
"clang/head $CLANG_MODULE_LIST "Cuda_Pthread" clang++ $CUDA_WARNING_FLAGS"
"clang/4.0.0 $CLANG_MODULE_LIST "Cuda_Pthread" clang++ $CUDA_WARNING_FLAGS"
"cuda/8.0.44 $CUDA_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
)
else
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("cuda/8.0.44 $CUDA8_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
"clang/head $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS"
"clang/4.0.0 $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS"
"clang/3.9.0 $CLANG_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS"
"gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
@ -584,7 +589,7 @@ single_build_and_test() {
else
run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
local -i build_start_time=$(date +%s)
run_cmd make build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
run_cmd make -j 32 build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
local -i build_end_time=$(date +%s)
comment="build_time=$(($build_end_time-$build_start_time))"

View File

@ -28,14 +28,14 @@ export JENKINS_DO_PTHREAD=ON
export JENKINS_DO_SERIAL=OFF
export JENKINS_DO_COMPLEX=OFF
export ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
export ARCH_C_FLAG="-xCORE-AVX2 -mkl"
export JENKINS_ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
export JENKINS_ARCH_C_FLAG="-xCORE-AVX2 -mkl"
export BLAS_LIBRARIES="-mkl;${MKLROOT}/lib/intel64/libmkl_intel_lp64.a;${MKLROOT}/lib/intel64/libmkl_intel_thread.a;${MKLROOT}/lib/intel64/libmkl_core.a"
export LAPACK_LIBRARIES=${BLAS_LIBRARIES}
export JENKINS_DO_TESTS=ON
export JENKINS_DO_EXAMPLES=ON
export JENKINS_DO_SHARED=OFF
export JENKINS_DO_SHARED=ON
export QUEUE=haswell

View File

@ -28,14 +28,14 @@ export JENKINS_DO_PTHREAD=OFF
export JENKINS_DO_SERIAL=ON
export JENKINS_DO_COMPLEX=ON
export ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
export ARCH_C_FLAG="-xCORE-AVX2 -mkl"
export JENKINS_ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
export JENKINS_ARCH_C_FLAG="-xCORE-AVX2 -mkl"
export BLAS_LIBRARIES="-mkl;${MKLROOT}/lib/intel64/libmkl_intel_lp64.a;${MKLROOT}/lib/intel64/libmkl_intel_thread.a;${MKLROOT}/lib/intel64/libmkl_core.a"
export LAPACK_LIBRARIES=${BLAS_LIBRARIES}
export JENKINS_DO_TESTS=ON
export JENKINS_DO_EXAMPLES=ON
export JENKINS_DO_SHARED=OFF
export JENKINS_DO_SHARED=ON
export QUEUE=haswell

View File

@ -60,7 +60,6 @@ test-threads: KokkosContainers_PerformanceTest_Threads
test-openmp: KokkosContainers_PerformanceTest_OpenMP
./KokkosContainers_PerformanceTest_OpenMP
build_all: $(TARGETS)
test: $(TEST_TARGETS)

View File

@ -42,6 +42,9 @@
*/
#include <gtest/gtest.h>
#include <cstdlib>
#include <Kokkos_Macros.hpp>
int main(int argc, char *argv[]) {
::testing::InitGoogleTest(&argc,argv);

View File

@ -69,30 +69,13 @@ protected:
{
std::cout << std::setprecision(5) << std::scientific;
unsigned num_threads = 4;
if (Kokkos::hwloc::available()) {
num_threads = Kokkos::hwloc::get_available_numa_count()
* Kokkos::hwloc::get_available_cores_per_numa()
* Kokkos::hwloc::get_available_threads_per_core()
;
}
std::cout << "OpenMP: " << num_threads << std::endl;
Kokkos::OpenMP::initialize( num_threads );
std::cout << "available threads: " << omp_get_max_threads() << std::endl;
Kokkos::OpenMP::initialize();
Kokkos::OpenMP::print_configuration( std::cout );
}
static void TearDownTestCase()
{
Kokkos::OpenMP::finalize();
omp_set_num_threads(1);
ASSERT_EQ( 1 , omp_get_max_threads() );
}
};

View File

@ -564,7 +564,7 @@ namespace Impl {
template< class D, class A1, class A2, class A3, class ... Args >
struct DualViewSubview {
typedef typename Kokkos::Experimental::Impl::ViewMapping
typedef typename Kokkos::Impl::ViewMapping
< void
, Kokkos::ViewTraits< D, A1, A2, A3 >
, Args ...

View File

@ -46,19 +46,6 @@
///
/// This header file declares and defines Kokkos::Experimental::DynRankView and its
/// related nonmember functions.
/*
* Changes from View
* 1. The rank of the DynRankView is returned by the method rank()
* 2. Max rank of a DynRankView is 7
* 3. subview name is subdynrankview
* 4. Every subdynrankview is returned with LayoutStride
*
* NEW: Redesigned DynRankView
* 5. subview function name now available
* 6. Copy and Copy-Assign View to DynRankView
* 7. deep_copy between Views and DynRankViews
* 8. rank( view ); returns the rank of View or DynRankView
*/
#ifndef KOKKOS_DYNRANKVIEW_HPP
#define KOKKOS_DYNRANKVIEW_HPP
@ -117,6 +104,14 @@ struct DynRankDimTraits {
, layout.dimension[7] );
}
// Extra overload to match that for specialize types v2
template <typename Layout, typename ... P>
KOKKOS_INLINE_FUNCTION
static size_t computeRank( const Kokkos::Impl::ViewCtorProp<P...>& prop, const Layout& layout )
{
return computeRank(layout);
}
// Create the layout for the rank-7 view.
// Non-strided Layout
template <typename Layout>
@ -158,8 +153,17 @@ struct DynRankDimTraits {
);
}
// Extra overload to match that for specialize types
template <typename Traits, typename ... P>
KOKKOS_INLINE_FUNCTION
static typename std::enable_if< (std::is_same<typename Traits::array_layout , Kokkos::LayoutRight>::value || std::is_same<typename Traits::array_layout , Kokkos::LayoutLeft>::value || std::is_same<typename Traits::array_layout , Kokkos::LayoutStride>::value) , typename Traits::array_layout >::type createLayout( const ViewCtorProp<P...>& prop, const typename Traits::array_layout& layout )
{
return createLayout( layout );
}
// Create a view from the given dimension arguments.
// This is only necessary because the shmem constructor doesn't take a layout.
// NDE shmem View's are not compatible with the added view_alloc value_type / fad_dim deduction functionality
template <typename ViewType, typename ViewArg>
static ViewType createView( const ViewArg& arg
, const size_t N0
@ -186,7 +190,8 @@ struct DynRankDimTraits {
// Non-strided Layout
template <typename Layout , typename iType>
KOKKOS_INLINE_FUNCTION
static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutRight>::value || std::is_same<Layout , Kokkos::LayoutLeft>::value) && std::is_integral<iType>::value , Layout >::type reconstructLayout( const Layout& layout , iType dynrank )
static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutRight>::value || std::is_same<Layout , Kokkos::LayoutLeft>::value) && std::is_integral<iType>::value , Layout >::type
reconstructLayout( const Layout& layout , iType dynrank )
{
return Layout( dynrank > 0 ? layout.dimension[0] : ~size_t(0)
, dynrank > 1 ? layout.dimension[1] : ~size_t(0)
@ -202,7 +207,8 @@ struct DynRankDimTraits {
// LayoutStride
template <typename Layout , typename iType>
KOKKOS_INLINE_FUNCTION
static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutStride>::value) && std::is_integral<iType>::value , Layout >::type reconstructLayout( const Layout& layout , iType dynrank )
static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutStride>::value) && std::is_integral<iType>::value , Layout >::type
reconstructLayout( const Layout& layout , iType dynrank )
{
return Layout( dynrank > 0 ? layout.dimension[0] : ~size_t(0)
, dynrank > 0 ? layout.stride[0] : (0)
@ -311,6 +317,11 @@ void dyn_rank_view_verify_operator_bounds
/** \brief Assign compatible default mappings */
struct ViewToDynRankViewTag {};
} // namespace Impl
} // namespace Experimental
namespace Impl {
template< class DstTraits , class SrcTraits >
class ViewMapping< DstTraits , SrcTraits ,
typename std::enable_if<(
@ -337,7 +348,7 @@ class ViewMapping< DstTraits , SrcTraits ,
)
)
)
) , ViewToDynRankViewTag >::type >
) , Kokkos::Experimental::Impl::ViewToDynRankViewTag >::type >
{
private:
@ -376,7 +387,7 @@ public:
typedef typename DstType::offset_type dst_offset_type ;
dst.m_map.m_offset = dst_offset_type(std::integral_constant<unsigned,0>() , src.layout() ); //Check this for integer input1 for padding, etc
dst.m_map.m_handle = Kokkos::Experimental::Impl::ViewDataHandle< DstTraits >::assign( src.m_map.m_handle , src.m_track );
dst.m_map.m_handle = Kokkos::Impl::ViewDataHandle< DstTraits >::assign( src.m_map.m_handle , src.m_track );
dst.m_track.assign( src.m_track , DstTraits::is_managed );
dst.m_rank = src.Rank ;
}
@ -384,22 +395,20 @@ public:
} //end Impl
namespace Experimental {
/* \class DynRankView
* \brief Container that creates a Kokkos view with rank determined at runtime.
* Essentially this is a rank 7 view that wraps the access operators
* to yield the functionality of a view
* Essentially this is a rank 7 view
*
* Changes from View
* 1. The rank of the DynRankView is returned by the method rank()
* 2. Max rank of a DynRankView is 7
* 3. subview name is subdynrankview
* 4. Every subdynrankview is returned with LayoutStride
*
* NEW: Redesigned DynRankView
* 5. subview function name now available
* 6. Copy and Copy-Assign View to DynRankView
* 7. deep_copy between Views and DynRankViews
* 8. rank( view ); returns the rank of View or DynRankView
* 3. subview called with 'subview(...)' or 'subdynrankview(...)' (backward compatibility)
* 4. Every subview is returned with LayoutStride
* 5. Copy and Copy-Assign View to DynRankView
* 6. deep_copy between Views and DynRankViews
* 7. rank( view ); returns the rank of View or DynRankView
*
*/
@ -427,7 +436,7 @@ public:
private:
typedef Kokkos::Experimental::Impl::ViewMapping< traits , void > map_type ;
typedef Kokkos::Impl::ViewMapping< traits , void > map_type ;
typedef Kokkos::Experimental::Impl::SharedAllocationTracker track_type ;
track_type m_track ;
@ -556,7 +565,7 @@ public:
// Allow specializations to query their specialized map
KOKKOS_INLINE_FUNCTION
const Kokkos::Experimental::Impl::ViewMapping< traits , void > &
const Kokkos::Impl::ViewMapping< traits , void > &
implementation_map() const { return m_map ; }
//----------------------------------------
@ -803,7 +812,7 @@ public:
, m_rank(rhs.m_rank)
{
typedef typename DynRankView<RT,RP...> ::traits SrcTraits ;
typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
Mapping::assign( m_map , rhs.m_map , rhs.m_track );
}
@ -813,7 +822,7 @@ public:
DynRankView & operator = (const DynRankView<RT,RP...> & rhs )
{
typedef typename DynRankView<RT,RP...> ::traits SrcTraits ;
typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
Mapping::assign( m_map , rhs.m_map , rhs.m_track );
m_track.assign( rhs.m_track , traits::is_managed );
@ -831,7 +840,7 @@ public:
, m_rank( rhs.Rank )
{
typedef typename View<RT,RP...>::traits SrcTraits ;
typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag > Mapping ;
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag > Mapping ;
static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
Mapping::assign( *this , rhs );
}
@ -841,7 +850,7 @@ public:
DynRankView & operator = ( const View<RT,RP...> & rhs )
{
typedef typename View<RT,RP...>::traits SrcTraits ;
typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag > Mapping ;
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag > Mapping ;
static_assert( Mapping::is_assignable , "Incompatible View to DynRankView copy assignment" );
Mapping::assign( *this , rhs );
return *this ;
@ -870,7 +879,7 @@ public:
)
: m_track()
, m_map()
, m_rank( Impl::DynRankDimTraits<typename traits::specialize>::computeRank(arg_layout) )
, m_rank( Impl::DynRankDimTraits<typename traits::specialize>::template computeRank< typename traits::array_layout, P...>(arg_prop, arg_layout) )
{
// Append layout and spaces if not input
typedef Impl::ViewCtorProp< P ... > alloc_prop_input ;
@ -923,7 +932,7 @@ public:
//------------------------------------------------------------
Kokkos::Experimental::Impl::SharedAllocationRecord<> *
record = m_map.allocate_shared( prop , Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout) );
record = m_map.allocate_shared( prop , Impl::DynRankDimTraits<typename traits::specialize>::template createLayout<traits, P...>(arg_prop, arg_layout) );
//------------------------------------------------------------
#if defined( KOKKOS_ENABLE_CUDA )
@ -947,8 +956,8 @@ public:
>::type const & arg_layout
)
: m_track() // No memory tracking
, m_map( arg_prop , Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout) )
, m_rank( Impl::DynRankDimTraits<typename traits::specialize>::computeRank(arg_layout) )
, m_map( arg_prop , Impl::DynRankDimTraits<typename traits::specialize>::template createLayout<traits, P...>(arg_prop, arg_layout) )
, m_rank( Impl::DynRankDimTraits<typename traits::specialize>::template computeRank< typename traits::array_layout, P...>(arg_prop, arg_layout) )
{
static_assert(
std::is_same< pointer_type
@ -1034,6 +1043,7 @@ public:
{}
// For backward compatibility
// NDE This ctor does not take ViewCtorProp argument - should not use alternative createLayout call
explicit inline
DynRankView( const ViewAllocateWithoutInitializing & arg_prop
, const typename traits::array_layout & arg_layout
@ -1179,6 +1189,11 @@ namespace Impl {
struct DynRankSubviewTag {};
} // namespace Impl
} // namespace Experimental
namespace Impl {
template< class SrcTraits , class ... Args >
struct ViewMapping
< typename std::enable_if<(
@ -1192,7 +1207,7 @@ struct ViewMapping
std::is_same< typename SrcTraits::array_layout
, Kokkos::LayoutStride >::value
)
), DynRankSubviewTag >::type
), Kokkos::Experimental::Impl::DynRankSubviewTag >::type
, SrcTraits
, Args ... >
{
@ -1264,7 +1279,7 @@ public:
};
typedef DynRankView< value_type , array_layout , typename SrcTraits::device_type , typename SrcTraits::memory_traits > ret_type;
typedef Kokkos::Experimental::DynRankView< value_type , array_layout , typename SrcTraits::device_type , typename SrcTraits::memory_traits > ret_type;
template < typename T , class ... P >
KOKKOS_INLINE_FUNCTION
@ -1336,9 +1351,10 @@ public:
} // end Impl
namespace Experimental {
template< class V , class ... Args >
using Subdynrankview = typename Kokkos::Experimental::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , V , Args... >::ret_type ;
using Subdynrankview = typename Kokkos::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , V , Args... >::ret_type ;
template< class D , class ... P , class ...Args >
KOKKOS_INLINE_FUNCTION
@ -1348,7 +1364,7 @@ subdynrankview( const Kokkos::Experimental::DynRankView< D , P... > &src , Args.
if ( src.rank() > sizeof...(Args) ) //allow sizeof...(Args) >= src.rank(), ignore the remaining args
{ Kokkos::abort("subdynrankview: num of args must be >= rank of the source DynRankView"); }
typedef Kokkos::Experimental::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , Kokkos::ViewTraits< D*******, P... > , Args... > metafcn ;
typedef Kokkos::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , Kokkos::ViewTraits< D*******, P... > , Args... > metafcn ;
return metafcn::subview( src.rank() , src , args... );
}

View File

@ -557,7 +557,7 @@ void deep_copy( const View<T,DP...> & dst
if ( DstExecCanAccessSrc ) {
// Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type >( dst , src );
Kokkos::Impl::ViewRemap< dst_type , src_type >( dst , src );
}
else {
Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");
@ -581,7 +581,7 @@ void deep_copy( const DynamicView<T,DP...> & dst
if ( DstExecCanAccessSrc ) {
// Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type >( dst , src );
Kokkos::Impl::ViewRemap< dst_type , src_type >( dst , src );
}
else {
Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");

View File

@ -69,6 +69,8 @@
#include <Kokkos_ErrorReporter.hpp>
#include <TestErrorReporter.hpp>
#include <TestViewCtorPropEmbeddedDim.hpp>
//----------------------------------------------------------------------------
@ -94,6 +96,10 @@ TEST_F( cuda , dyn_view_api) {
TestDynViewAPI< double , Kokkos::Cuda >();
}
TEST_F( cuda, viewctorprop_embedded_dim ) {
TestViewCtorProp_EmbeddedDim< Kokkos::Cuda >::test_vcpt( 2, 3 );
}
TEST_F( cuda , staticcrsgraph )
{
TestStaticCrsGraph::run_test_graph< Kokkos::Cuda >();

View File

@ -66,6 +66,8 @@
#include <Kokkos_ErrorReporter.hpp>
#include <TestErrorReporter.hpp>
#include <TestViewCtorPropEmbeddedDim.hpp>
#include <iomanip>
namespace Test {
@ -76,14 +78,7 @@ protected:
{
std::cout << std::setprecision(5) << std::scientific;
unsigned threads_count = 4 ;
if ( Kokkos::hwloc::available() ) {
threads_count = Kokkos::hwloc::get_available_numa_count() *
Kokkos::hwloc::get_available_cores_per_numa();
}
Kokkos::OpenMP::initialize( threads_count );
Kokkos::OpenMP::initialize();
}
static void TearDownTestCase()
@ -96,6 +91,10 @@ TEST_F( openmp, dyn_view_api) {
TestDynViewAPI< double , Kokkos::OpenMP >();
}
TEST_F( openmp, viewctorprop_embedded_dim ) {
TestViewCtorProp_EmbeddedDim< Kokkos::OpenMP >::test_vcpt( 2, 3 );
}
TEST_F( openmp, bitset )
{
test_bitset<Kokkos::OpenMP>();

View File

@ -67,6 +67,8 @@
#include <Kokkos_ErrorReporter.hpp>
#include <TestErrorReporter.hpp>
#include <TestViewCtorPropEmbeddedDim.hpp>
namespace Test {
class serial : public ::testing::Test {
@ -85,6 +87,10 @@ TEST_F( serial, dyn_view_api) {
TestDynViewAPI< double , Kokkos::Serial >();
}
TEST_F( serial, viewctorprop_embedded_dim ) {
TestViewCtorProp_EmbeddedDim< Kokkos::Serial >::test_vcpt( 2, 3 );
}
TEST_F( serial , staticcrsgraph )
{
TestStaticCrsGraph::run_test_graph< Kokkos::Serial >();

View File

@ -70,6 +70,8 @@
#include <Kokkos_ErrorReporter.hpp>
#include <TestErrorReporter.hpp>
#include <TestViewCtorPropEmbeddedDim.hpp>
namespace Test {
class threads : public ::testing::Test {
@ -103,6 +105,10 @@ TEST_F( threads , dyn_view_api) {
TestDynViewAPI< double , Kokkos::Threads >();
}
TEST_F( threads, viewctorprop_embedded_dim ) {
TestViewCtorProp_EmbeddedDim< Kokkos::Threads >::test_vcpt( 2, 3 );
}
TEST_F( threads , staticcrsgraph )
{
TestStaticCrsGraph::run_test_graph< Kokkos::Threads >();

View File

@ -0,0 +1,213 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <cstdio>
#include <gtest/gtest.h>
#include <Kokkos_Core.hpp>
#include <Kokkos_DynRankView.hpp>
#include <type_traits>
#include <typeinfo>
namespace Test {
namespace {
template <typename ExecSpace >
struct TestViewCtorProp_EmbeddedDim {
using ViewIntType = typename Kokkos::View< int**, ExecSpace >;
using ViewDoubleType = typename Kokkos::View< double*, ExecSpace >;
using DynRankViewIntType = typename Kokkos::DynRankView< int, ExecSpace >;
using DynRankViewDoubleType = typename Kokkos::DynRankView< double, ExecSpace >;
// Cuda 7.0 has issues with using a lamda in parallel_for to initialize the view - replace with this functor
template < class ViewType >
struct Functor {
ViewType v;
Functor( const ViewType & v_ ) : v(v_) {}
KOKKOS_INLINE_FUNCTION
void operator()( const int i ) const {
v(i) = i;
}
};
static void test_vcpt( const int N0, const int N1 )
{
// Create two views to test
{
using VIT = typename TestViewCtorProp_EmbeddedDim::ViewIntType ;
using VDT = typename TestViewCtorProp_EmbeddedDim::ViewDoubleType ;
VIT vi1("vi1", N0, N1);
VDT vd1("vd1", N0);
// TEST: Test for common type between two views, one with type double, other with type int
// Deduce common value_type and construct a view with that type
{
// Two views
auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1, vd1);
typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType;
typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT;
typedef typename CVT::HostMirror HostCVT;
// Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg
CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1),
Functor<CVT>(cv1)
);
HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
Kokkos::deep_copy( hcv1, cv1 );
ASSERT_EQ( (std::is_same< CommonViewValueType, double >::value) , true ) ;
#if 0
// debug output
for ( int i = 0; i < N0*N1; ++i ) {
printf(" Output check: hcv1(%d) = %lf\n ", i, hcv1(i) );
}
printf( " Common value type view: %s \n", typeid( CVT() ).name() );
printf( " Common value type: %s \n", typeid( CommonViewValueType() ).name() );
if ( std::is_same< CommonViewValueType, double >::value == true ) {
printf("Proper common value_type\n");
}
else {
printf("WRONG common value_type\n");
}
// end debug output
#endif
}
{
// Single view
auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1);
typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType;
typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT;
typedef typename CVT::HostMirror HostCVT;
// Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg
CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1),
Functor<CVT>(cv1)
);
HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
Kokkos::deep_copy( hcv1, cv1 );
ASSERT_EQ( (std::is_same< CommonViewValueType, int>::value) , true ) ;
}
}
// Create two dynamic rank views to test
{
using VIT = typename TestViewCtorProp_EmbeddedDim::DynRankViewIntType ;
using VDT = typename TestViewCtorProp_EmbeddedDim::DynRankViewDoubleType ;
VIT vi1("vi1", N0, N1);
VDT vd1("vd1", N0);
// TEST: Test for common type between two views, one with type double, other with type int
// Deduce common value_type and construct a view with that type
{
// Two views
auto view_alloc_arg = Kokkos::common_view_alloc_prop( vi1, vd1 );
typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType;
typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT;
typedef typename CVT::HostMirror HostCVT;
// Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg
CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1),
Functor<CVT>(cv1)
);
HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
Kokkos::deep_copy( hcv1, cv1 );
ASSERT_EQ( (std::is_same< CommonViewValueType, double >::value) , true ) ;
}
{
// Single views
auto view_alloc_arg = Kokkos::common_view_alloc_prop( vi1 );
typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType;
typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT;
typedef typename CVT::HostMirror HostCVT;
// Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg
CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1),
Functor<CVT>(cv1)
);
HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
Kokkos::deep_copy( hcv1, cv1 );
ASSERT_EQ( (std::is_same< CommonViewValueType, int>::value) , true ) ;
}
}
} // end test_vcpt
}; // end struct
} // namespace
} // namespace Test

View File

@ -42,6 +42,8 @@
*/
#include <gtest/gtest.h>
#include <cstdlib>
#include <Kokkos_Macros.hpp>
int main(int argc, char *argv[]) {
::testing::InitGoogleTest(&argc,argv);

View File

@ -79,7 +79,6 @@ test-mempool: KokkosCore_PerformanceTest_Mempool
test-taskdag: KokkosCore_PerformanceTest_TaskDAG
./KokkosCore_PerformanceTest_TaskDAG
build_all: $(TARGETS)
test: $(TEST_TARGETS)

View File

@ -42,6 +42,8 @@
*/
#include <gtest/gtest.h>
#include <cstdlib>
#include <Kokkos_Core.hpp>
namespace Test {

File diff suppressed because it is too large Load Diff

Some files were not shown because too many files have changed in this diff Show More