forked from lijiext/lammps
Merge branch 'master' into USER-DPD_kokkos
This commit is contained in:
commit
5c985946d5
|
@ -0,0 +1,21 @@
|
|||
# This file contains file patterns that trigger automatic
|
||||
# code review requests from users that are owners of these files
|
||||
# Order matters, the last match has the highest precedence
|
||||
|
||||
# library folders
|
||||
lib/colvars/* @giacomofiorin
|
||||
lib/compress/* @akohlmey
|
||||
lib/kokkos/* @stanmoore1
|
||||
lib/molfile/* @akohlmey
|
||||
lib/qmmm/* @akohlmey
|
||||
lib/vtk/* @rbberger
|
||||
|
||||
# packages
|
||||
src/KOKKOS @stanmoore1
|
||||
src/USER-CGSDK @akohlmey
|
||||
src/USER-COLVARS @giacomofiorin
|
||||
src/USER-OMP @akohlmey
|
||||
src/USER-QMMM @akohlmey
|
||||
|
||||
# tools
|
||||
tools/msi2lmp/* @akohlmey
|
|
@ -32,3 +32,11 @@ log.cite
|
|||
.Trashes
|
||||
ehthumbs.db
|
||||
Thumbs.db
|
||||
|
||||
#cmake
|
||||
/build*
|
||||
/CMakeCache.txt
|
||||
/CMakeFiles/
|
||||
/Makefile
|
||||
/cmake_install.cmake
|
||||
/lmp
|
||||
|
|
|
@ -0,0 +1,547 @@
|
|||
########################################
|
||||
# CMake build system
|
||||
# This file is part of LAMMPS
|
||||
# Created by Christoph Junghans and Richard Berger
|
||||
cmake_minimum_required(VERSION 3.1)
|
||||
|
||||
project(lammps)
|
||||
set(SOVERSION 0)
|
||||
set(LAMMPS_SOURCE_DIR ${CMAKE_SOURCE_DIR}/../src)
|
||||
set(LAMMPS_LIB_SOURCE_DIR ${CMAKE_SOURCE_DIR}/../lib)
|
||||
set(LAMMPS_LIB_BINARY_DIR ${CMAKE_BINARY_DIR}/lib)
|
||||
|
||||
#To not conflict with old Makefile build system, we build everything here
|
||||
file(GLOB LIB_SOURCES ${LAMMPS_SOURCE_DIR}/*.cpp)
|
||||
file(GLOB LMP_SOURCES ${LAMMPS_SOURCE_DIR}/main.cpp)
|
||||
list(REMOVE_ITEM LIB_SOURCES ${LMP_SOURCES})
|
||||
|
||||
# CMake modules/macros are kept in a subdirectory to keep this file cleaner.
# Append rather than overwrite, so that a CMAKE_MODULE_PATH passed on the
# command line or set by an enclosing project is preserved, and anchor the
# path at this file's own directory instead of the top of the whole build.
list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/Modules)
|
||||
|
||||
# Default to an optimized build when the user asked for neither a build type
# nor custom C++ flags (Release comes with -O3 by default).
# Multi-config generators (e.g. Visual Studio, Xcode) populate
# CMAKE_CONFIGURATION_TYPES and ignore CMAKE_BUILD_TYPE, so they are left alone.
if(NOT CMAKE_CONFIGURATION_TYPES AND NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CXX_FLAGS)
  set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." FORCE)
endif()
|
||||
|
||||
foreach(STYLE_FILE style_angle.h style_atom.h style_body.h style_bond.h style_command.h style_compute.h style_dihedral.h style_dump.h
|
||||
style_fix.h style_improper.h style_integrate.h style_kspace.h style_minimize.h style_nbin.h style_npair.h style_nstencil.h
|
||||
style_ntopo.h style_pair.h style_reader.h style_region.h)
|
||||
if(EXISTS ${LAMMPS_SOURCE_DIR}/${STYLE_FILE})
|
||||
message(FATAL_ERROR "There is a ${STYLE_FILE} in ${LAMMPS_SOURCE_DIR}, please clean up the source directory first")
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
enable_language(CXX)
|
||||
|
||||
######################################################################
|
||||
# compiler tests
|
||||
# these need to be done early (before further tests).
|
||||
#####################################################################
|
||||
include(CheckCCompilerFlag)
|
||||
|
||||
########################################################################
|
||||
# User input options #
|
||||
########################################################################
|
||||
option(BUILD_SHARED_LIBS "Build shared libs" OFF)
|
||||
option(INSTALL_LIB "Install lammps library and header" ON)
|
||||
include(GNUInstallDirs)
|
||||
|
||||
set(LAMMPS_LINK_LIBS)
|
||||
# MPI support: with ENABLE_MPI=ON link against a real MPI library, otherwise
# compile the stub implementation from ${LAMMPS_SOURCE_DIR}/STUBS into the
# library and put its directory on the include path.
option(ENABLE_MPI "Build MPI version" OFF)
if(ENABLE_MPI)
  find_package(MPI REQUIRED)
  # The libraries linked below come from the C++ MPI wrapper, so the include
  # path must come from the same wrapper; the C path is kept as well for any
  # C sources that are compiled into the library.
  include_directories(${MPI_CXX_INCLUDE_PATH} ${MPI_C_INCLUDE_PATH})
  list(APPEND LAMMPS_LINK_LIBS ${MPI_CXX_LIBRARIES})
  option(LAMMPS_LONGLONG_TO_LONG "Workaround if your system or MPI version does not recognize 'long long' data types" OFF)
  if(LAMMPS_LONGLONG_TO_LONG)
    add_definitions(-DLAMMPS_LONGLONG_TO_LONG)
  endif()
else()
  file(GLOB MPI_SOURCES ${LAMMPS_SOURCE_DIR}/STUBS/mpi.c)
  list(APPEND LIB_SOURCES ${MPI_SOURCES})
  include_directories(${LAMMPS_SOURCE_DIR}/STUBS)
endif()
|
||||
|
||||
set(LAMMPS_SIZE_LIMIT "LAMMPS_SMALLBIG" CACHE STRING "Lammps size limit")
|
||||
set_property(CACHE LAMMPS_SIZE_LIMIT PROPERTY STRINGS LAMMPS_SMALLBIG LAMMPS_BIGBIG LAMMPS_SMALLSMALL)
|
||||
add_definitions(-D${LAMMPS_SIZE_LIMIT})
|
||||
|
||||
set(LAMMPS_MEMALIGN "64" CACHE STRING "enables the use of the posix_memalign() call instead of malloc() when large chunks or memory are allocated by LAMMPS")
|
||||
add_definitions(-DLAMMPS_MEMALIGN=${LAMMPS_MEMALIGN})
|
||||
|
||||
option(LAMMPS_EXCEPTIONS "enable the use of C++ exceptions for error messages (useful for library interface)" OFF)
|
||||
if(LAMMPS_EXCEPTIONS)
|
||||
add_definitions(-DLAMMPS_EXCEPTIONS)
|
||||
endif()
|
||||
|
||||
option(CMAKE_VERBOSE_MAKEFILE "Verbose makefile" OFF)
|
||||
|
||||
option(ENABLE_TESTING "Enable testing" OFF)
|
||||
if(ENABLE_TESTING)
|
||||
enable_testing()
|
||||
endif(ENABLE_TESTING)
|
||||
|
||||
option(ENABLE_ALL "Build all default packages" OFF)
|
||||
set(DEFAULT_PACKAGES ASPHERE BODY CLASS2 COLLOID COMPRESS CORESHELL DIPOLE GRANULAR
|
||||
KSPACE MANYBODY MC MEAM MISC MOLECULE PERI QEQ
|
||||
REAX REPLICA RIGID SHOCK SNAP SRD)
|
||||
set(OTHER_PACKAGES KIM PYTHON MSCG MPIIO VORONOI POEMS
|
||||
USER-ATC USER-AWPMD USER-CGDNA
|
||||
USER-CGSDK USER-COLVARS USER-DIFFRACTION USER-DPD USER-DRUDE USER-EFF
|
||||
USER-FEP USER-H5MD USER-LB USER-MANIFOLD USER-MEAMC USER-MGPT USER-MISC
|
||||
USER-MOLFILE USER-NETCDF USER-PHONON USER-QTB USER-REAXC USER-SMD
|
||||
USER-SMTBQ USER-SPH USER-TALLY USER-VTK USER-QUIP USER-QMMM)
|
||||
set(ACCEL_PACKAGES USER-OMP KOKKOS OPT USER-INTEL GPU)
|
||||
foreach(PKG ${DEFAULT_PACKAGES})
|
||||
option(ENABLE_${PKG} "Build ${PKG} Package" ${ENABLE_ALL})
|
||||
endforeach()
|
||||
foreach(PKG ${ACCEL_PACKAGES} ${OTHER_PACKAGES})
|
||||
option(ENABLE_${PKG} "Build ${PKG} Package" OFF)
|
||||
endforeach()
|
||||
|
||||
# pkg_depends(<PKG1> <PKG2>)
#
# Abort configuration with a fatal error when ENABLE_<PKG1> is true while
# ENABLE_<PKG2> is not. Does nothing in every other combination.
macro(pkg_depends PKG1 PKG2)
  if(ENABLE_${PKG1})
    if(NOT ENABLE_${PKG2})
      message(FATAL_ERROR "${PKG1} package needs LAMMPS to be build with ${PKG2}")
    endif()
  endif()
endmacro()
|
||||
|
||||
pkg_depends(MPIIO MPI)
|
||||
pkg_depends(QEQ MANYBODY)
|
||||
pkg_depends(USER-ATC MANYBODY)
|
||||
pkg_depends(USER-H5MD MPI)
|
||||
pkg_depends(USER-LB MPI)
|
||||
pkg_depends(USER-MISC MANYBODY)
|
||||
pkg_depends(USER-PHONON KSPACE)
|
||||
|
||||
if(ENABLE_BODY AND ENABLE_POEMS)
|
||||
message(FATAL_ERROR "BODY and POEMS cannot be enabled at the same time")
|
||||
endif()
|
||||
|
||||
######################################################
|
||||
# packages with special compiler needs or external libs
|
||||
######################################################
|
||||
if(ENABLE_REAX OR ENABLE_MEAM OR ENABLE_USER-QUIP OR ENABLE_USER-QMMM)
|
||||
enable_language(Fortran)
|
||||
endif()
|
||||
|
||||
if(ENABLE_KOKKOS OR ENABLE_MSCG)
|
||||
# starting with CMake 3.1 this is all you have to do to enforce C++11
|
||||
set(CMAKE_CXX_STANDARD 11) # C++11...
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON) #...is required...
|
||||
set(CMAKE_CXX_EXTENSIONS OFF) #...without compiler extensions like gnu++11
|
||||
endif()
|
||||
|
||||
if(ENABLE_USER-OMP OR ENABLE_KOKKOS OR ENABLE_USER-INTEL)
|
||||
find_package(OpenMP REQUIRED)
|
||||
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
|
||||
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
|
||||
endif()
|
||||
|
||||
if(ENABLE_KSPACE)
|
||||
set(FFT "KISSFFT" CACHE STRING "FFT library for KSPACE package")
|
||||
set_property(CACHE FFT PROPERTY STRINGS KISSFFT FFTW3 MKL FFTW2)
|
||||
if(NOT FFT STREQUAL "KISSFFT")
|
||||
find_package(${FFT} REQUIRED)
|
||||
add_definitions(-DFFT_${FFT})
|
||||
include_directories(${${FFT}_INCLUDE_DIRS})
|
||||
list(APPEND LAMMPS_LINK_LIBS ${${FFT}_LIBRARIES})
|
||||
endif()
|
||||
set(PACK_OPTIMIZATION "PACK_ARRAY" CACHE STRING "Optimization for FFT")
|
||||
set_property(CACHE PACK_OPTIMIZATION PROPERTY STRINGS PACK_ARRAY PACK_POINTER PACK_MEMCPY)
|
||||
if(NOT PACK_OPTIMIZATION STREQUAL "PACK_ARRAY")
|
||||
add_definitions(-D${PACK_OPTIMIZATION})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(ENABLE_MISC)
|
||||
option(LAMMPS_XDR "include XDR compatibility files for doing particle dumps in XTC format" OFF)
|
||||
if(LAMMPS_XDR)
|
||||
add_definitions(-DLAMMPS_XDR)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(ENABLE_MSCG OR ENABLE_USER-ATC OR ENABLE_USER-AWPMD OR ENABLE_USER-QUIP)
|
||||
find_package(LAPACK)
|
||||
if(LAPACK_FOUND)
|
||||
list(APPEND LAMMPS_LINK_LIBS ${LAPACK_LIBRARIES})
|
||||
else()
|
||||
enable_language(Fortran)
|
||||
file(GLOB LAPACK_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/linalg/*.f)
|
||||
list(APPEND LIB_SOURCES ${LAPACK_SOURCES})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# PYTHON package: embeds the Python interpreter and installs the lammps.py
# wrapper module into the Python site directory (override with
# -DPYTHON_INSTDIR=<path>).
if(ENABLE_PYTHON)
  # Check the shared-library requirement first so that a misconfiguration
  # fails before any detection work or install rules are set up.
  if(NOT BUILD_SHARED_LIBS)
    message(FATAL_ERROR "Python package need lammps to be build shared, use -DBUILD_SHARED_LIBS=ON")
  endif()
  find_package(PythonInterp REQUIRED)
  find_package(PythonLibs REQUIRED)
  add_definitions(-DLMP_PYTHON)
  # PYTHON_INCLUDE_DIRS / PYTHON_LIBRARIES are the documented result variables
  # of FindPythonLibs; the singular forms are user-editable cache inputs.
  include_directories(${PYTHON_INCLUDE_DIRS})
  list(APPEND LAMMPS_LINK_LIBS ${PYTHON_LIBRARIES})
  if(NOT PYTHON_INSTDIR)
    # Ask the interpreter where platform-specific modules go below the
    # configured install prefix.
    execute_process(COMMAND ${PYTHON_EXECUTABLE}
      -c "import distutils.sysconfig as cg; print(cg.get_python_lib(1,0,prefix='${CMAKE_INSTALL_PREFIX}'))"
      RESULT_VARIABLE PYTHON_INSTDIR_RESULT
      OUTPUT_VARIABLE PYTHON_INSTDIR OUTPUT_STRIP_TRAILING_WHITESPACE)
    # Without this check a failing interpreter call would leave PYTHON_INSTDIR
    # empty and install() below would be given an empty DESTINATION.
    if(NOT PYTHON_INSTDIR_RESULT EQUAL 0 OR NOT PYTHON_INSTDIR)
      message(FATAL_ERROR "Could not determine the Python module install directory, please set it with -DPYTHON_INSTDIR=<path>")
    endif()
  endif()
  install(FILES ${CMAKE_SOURCE_DIR}/../python/lammps.py DESTINATION ${PYTHON_INSTDIR})
endif()
|
||||
|
||||
find_package(JPEG)
|
||||
if(JPEG_FOUND)
|
||||
add_definitions(-DLAMMPS_JPEG)
|
||||
include_directories(${JPEG_INCLUDE_DIR})
|
||||
list(APPEND LAMMPS_LINK_LIBS ${JPEG_LIBRARIES})
|
||||
endif()
|
||||
|
||||
find_package(PNG)
|
||||
find_package(ZLIB)
|
||||
if(PNG_FOUND AND ZLIB_FOUND)
|
||||
include_directories(${PNG_INCLUDE_DIRS} ${ZLIB_INCLUDE_DIRS})
|
||||
list(APPEND LAMMPS_LINK_LIBS ${PNG_LIBRARIES} ${ZLIB_LIBRARIES})
|
||||
add_definitions(-DLAMMPS_PNG)
|
||||
endif()
|
||||
|
||||
# Optional gzip support: define LAMMPS_GZIP when a gzip executable is found.
# find_package_handle_standard_args() lives in FindPackageHandleStandardArgs;
# include it explicitly instead of relying on an earlier find_package() call
# having loaded it as a side effect.
include(FindPackageHandleStandardArgs)
find_program(GZIP_EXECUTABLE gzip)
find_package_handle_standard_args(GZIP REQUIRED_VARS GZIP_EXECUTABLE)
if(GZIP_FOUND)
  add_definitions(-DLAMMPS_GZIP)
endif()
|
||||
|
||||
# Optional ffmpeg support: define LAMMPS_FFMPEG when an ffmpeg executable is
# found. The helper module is included explicitly so this check does not
# depend on another find module having loaded it first.
include(FindPackageHandleStandardArgs)
find_program(FFMPEG_EXECUTABLE ffmpeg)
find_package_handle_standard_args(FFMPEG REQUIRED_VARS FFMPEG_EXECUTABLE)
if(FFMPEG_FOUND)
  add_definitions(-DLAMMPS_FFMPEG)
endif()
|
||||
|
||||
if(ENABLE_VORONOI)
|
||||
find_package(VORO REQUIRED) #some distros
|
||||
include_directories(${VORO_INCLUDE_DIRS})
|
||||
list(APPEND LAMMPS_LINK_LIBS ${VORO_LIBRARIES})
|
||||
endif()
|
||||
|
||||
if(ENABLE_USER-MOLFILE)
|
||||
list(APPEND LAMMPS_LINK_LIBS ${CMAKE_DL_LIBS})
|
||||
endif()
|
||||
|
||||
# USER-NETCDF package: requires the NetCDF C library and headers.
if(ENABLE_USER-NETCDF)
  find_package(NetCDF REQUIRED)
  # The variable must be expanded: the bare word NETCDF_INCLUDE_DIR would be
  # taken as a relative directory name and the real header location would
  # never reach the compiler.
  include_directories(${NETCDF_INCLUDE_DIR})
  list(APPEND LAMMPS_LINK_LIBS ${NETCDF_LIBRARY})
  add_definitions(-DLMP_HAS_NETCDF -DNC_64BIT_DATA=0x0020)
endif()
|
||||
|
||||
if(ENABLE_USER-SMD)
|
||||
find_package(Eigen3 REQUIRED)
|
||||
include_directories(${EIGEN3_INCLUDE_DIR})
|
||||
endif()
|
||||
|
||||
if(ENABLE_USER-QUIP)
|
||||
find_package(QUIP REQUIRED)
|
||||
list(APPEND LAMMPS_LINK_LIBS ${QUIP_LIBRARIES} ${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES})
|
||||
endif()
|
||||
|
||||
if(ENABLE_USER-QMMM)
|
||||
find_package(QE REQUIRED)
|
||||
include_directories(${QE_INCLUDE_DIRS})
|
||||
list(APPEND LAMMPS_LINK_LIBS ${QE_LIBRARIES} ${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES})
|
||||
endif()
|
||||
|
||||
if(ENABLE_USER-AWPMD)
|
||||
include_directories(${LAMMPS_LIB_SOURCE_DIR}/awpmd/systems/interact
|
||||
${LAMMPS_LIB_SOURCE_DIR}/awpmd/ivutils/include)
|
||||
endif()
|
||||
|
||||
if(ENABLE_USER-H5MD)
|
||||
find_package(HDF5 REQUIRED)
|
||||
list(APPEND LAMMPS_LINK_LIBS ${HDF5_LIBRARIES})
|
||||
include_directories(${HDF5_INCLUDE_DIRS} ${LAMMPS_LIB_SOURCE_DIR}/h5md/include)
|
||||
endif()
|
||||
|
||||
if(ENABLE_USER-VTK)
|
||||
find_package(VTK REQUIRED NO_MODULE)
|
||||
include(${VTK_USE_FILE})
|
||||
add_definitions(-DLAMMPS_VTK)
|
||||
list(APPEND LAMMPS_LINK_LIBS ${VTK_LIBRARIES})
|
||||
endif()
|
||||
|
||||
if(ENABLE_KIM)
|
||||
find_package(KIM REQUIRED)
|
||||
list(APPEND LAMMPS_LINK_LIBS ${KIM_LIBRARIES})
|
||||
include_directories(${KIM_INCLUDE_DIRS})
|
||||
endif()
|
||||
|
||||
# MSCG package: needs GSL and the MS-CG library sources, which are downloaded
# into the build tree, unpacked there and compiled into the LAMMPS library.
if(ENABLE_MSCG)
  find_package(GSL REQUIRED)
  set(LAMMPS_LIB_MSCG_BIN_DIR ${LAMMPS_LIB_BINARY_DIR}/mscg)
  set(MSCG_TARBALL ${LAMMPS_LIB_MSCG_BIN_DIR}/MS-CG-master.zip)
  # from here on the variable points at the unpacked source directory
  set(LAMMPS_LIB_MSCG_BIN_DIR ${LAMMPS_LIB_MSCG_BIN_DIR}/MSCG-release-master/src)
  if(NOT EXISTS ${LAMMPS_LIB_MSCG_BIN_DIR})
    if(NOT EXISTS ${MSCG_TARBALL})
      message(STATUS "Downloading ${MSCG_TARBALL}")
      # EXPECTED_MD5 cannot be used because the archive tracks the moving
      # master branch.
      file(DOWNLOAD
        https://github.com/uchicago-voth/MSCG-release/archive/master.zip
        ${MSCG_TARBALL} SHOW_PROGRESS STATUS MSCG_DOWNLOAD_STATUS)
      # STATUS is a two-element list: numeric code and message.
      list(GET MSCG_DOWNLOAD_STATUS 0 MSCG_DOWNLOAD_CODE)
      if(NOT MSCG_DOWNLOAD_CODE EQUAL 0)
        # Remove the partial file; otherwise the EXISTS test above would skip
        # the download on the next configure run.
        file(REMOVE ${MSCG_TARBALL})
        message(FATAL_ERROR "Downloading ${MSCG_TARBALL} failed: ${MSCG_DOWNLOAD_STATUS}")
      endif()
    endif()
    message(STATUS "Unpacking ${MSCG_TARBALL}")
    execute_process(COMMAND ${CMAKE_COMMAND} -E tar xvf ${MSCG_TARBALL}
      WORKING_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/mscg
      RESULT_VARIABLE MSCG_UNPACK_RESULT)
    if(NOT MSCG_UNPACK_RESULT EQUAL 0)
      message(FATAL_ERROR "Unpacking ${MSCG_TARBALL} failed")
    endif()
  endif()
  file(GLOB MSCG_SOURCES ${LAMMPS_LIB_MSCG_BIN_DIR}/*.cpp)
  list(APPEND LIB_SOURCES ${MSCG_SOURCES})
  # these definitions apply to the MS-CG sources only
  foreach(MSCG_SOURCE ${MSCG_SOURCES})
    set_property(SOURCE ${MSCG_SOURCE} APPEND PROPERTY COMPILE_DEFINITIONS
      DIMENSION=3 _exclude_gromacs=1)
  endforeach()
  include_directories(${LAMMPS_LIB_MSCG_BIN_DIR} ${GSL_INCLUDE_DIRS})
  list(APPEND LAMMPS_LINK_LIBS ${GSL_LIBRARIES})
endif()
|
||||
|
||||
########################################################################
|
||||
# Basic system tests (standard libraries, headers, functions, types) #
|
||||
########################################################################
|
||||
include(CheckIncludeFile)
|
||||
foreach(HEADER math.h)
|
||||
check_include_file(${HEADER} FOUND_${HEADER})
|
||||
if(NOT FOUND_${HEADER})
|
||||
message(FATAL_ERROR "Could not find needed header - ${HEADER}")
|
||||
endif(NOT FOUND_${HEADER})
|
||||
endforeach(HEADER)
|
||||
|
||||
set(MATH_LIBRARIES "m" CACHE STRING "math library")
|
||||
mark_as_advanced( MATH_LIBRARIES )
|
||||
include(CheckLibraryExists)
|
||||
foreach(FUNC sin cos)
|
||||
check_library_exists(${MATH_LIBRARIES} ${FUNC} "" FOUND_${FUNC}_${MATH_LIBRARIES})
|
||||
if(NOT FOUND_${FUNC}_${MATH_LIBRARIES})
|
||||
message(FATAL_ERROR "Could not find needed math function - ${FUNC}")
|
||||
endif(NOT FOUND_${FUNC}_${MATH_LIBRARIES})
|
||||
endforeach(FUNC)
|
||||
list(APPEND LAMMPS_LINK_LIBS ${MATH_LIBRARIES})
|
||||
|
||||
######################################
|
||||
# Generate Basic Style files
|
||||
######################################
|
||||
include(StyleHeaderUtils)
|
||||
RegisterStyles(${LAMMPS_SOURCE_DIR})
|
||||
|
||||
##############################################
|
||||
# add sources of enabled packages
|
||||
############################################
|
||||
foreach(PKG ${DEFAULT_PACKAGES} ${OTHER_PACKAGES})
|
||||
if(ENABLE_${PKG})
|
||||
set(${PKG}_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/${PKG})
|
||||
|
||||
# detects styles in package and adds them to global list
|
||||
RegisterStyles(${${PKG}_SOURCES_DIR})
|
||||
|
||||
file(GLOB ${PKG}_SOURCES ${${PKG}_SOURCES_DIR}/*.cpp)
|
||||
list(APPEND LIB_SOURCES ${${PKG}_SOURCES})
|
||||
include_directories(${${PKG}_SOURCES_DIR})
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
##############################################
|
||||
# add lib sources of (simple) enabled packages
|
||||
############################################
|
||||
foreach(SIMPLE_LIB REAX MEAM POEMS USER-ATC USER-AWPMD USER-COLVARS USER-H5MD
|
||||
USER-MOLFILE USER-QMMM)
|
||||
if(ENABLE_${SIMPLE_LIB})
|
||||
string(REGEX REPLACE "^USER-" "" SIMPLE_LIB "${SIMPLE_LIB}")
|
||||
string(TOLOWER "${SIMPLE_LIB}" INC_DIR)
|
||||
file(GLOB_RECURSE ${SIMPLE_LIB}_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/${INC_DIR}/*.F
|
||||
${LAMMPS_LIB_SOURCE_DIR}/${INC_DIR}/*.c ${LAMMPS_LIB_SOURCE_DIR}/${INC_DIR}/*.cpp)
|
||||
list(APPEND LIB_SOURCES ${${SIMPLE_LIB}_SOURCES})
|
||||
include_directories(${LAMMPS_LIB_SOURCE_DIR}/${INC_DIR})
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
######################################################################
|
||||
# packages which selectively include variants based on enabled styles
|
||||
# e.g. accelerator packages
|
||||
######################################################################
|
||||
if(ENABLE_USER-OMP)
|
||||
set(USER-OMP_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/USER-OMP)
|
||||
set(USER-OMP_SOURCES ${USER-OMP_SOURCES_DIR}/thr_data.cpp
|
||||
${USER-OMP_SOURCES_DIR}/thr_omp.cpp
|
||||
${USER-OMP_SOURCES_DIR}/fix_nh_omp.cpp
|
||||
${USER-OMP_SOURCES_DIR}/fix_nh_sphere_omp.cpp)
|
||||
set_property(GLOBAL PROPERTY "OMP_SOURCES" "${USER-OMP_SOURCES}")
|
||||
|
||||
# detects styles which have USER-OMP version
|
||||
RegisterStylesExt(${USER-OMP_SOURCES_DIR} omp OMP_SOURCES)
|
||||
|
||||
get_property(USER-OMP_SOURCES GLOBAL PROPERTY OMP_SOURCES)
|
||||
|
||||
list(APPEND LIB_SOURCES ${USER-OMP_SOURCES})
|
||||
include_directories(${USER-OMP_SOURCES_DIR})
|
||||
endif()
|
||||
|
||||
if(ENABLE_KOKKOS)
|
||||
set(LAMMPS_LIB_KOKKOS_SRC_DIR ${LAMMPS_LIB_SOURCE_DIR}/kokkos)
|
||||
set(LAMMPS_LIB_KOKKOS_BIN_DIR ${LAMMPS_LIB_BINARY_DIR}/kokkos)
|
||||
add_definitions(-DLMP_KOKKOS)
|
||||
add_subdirectory(${LAMMPS_LIB_KOKKOS_SRC_DIR} ${LAMMPS_LIB_KOKKOS_BIN_DIR})
|
||||
|
||||
set(Kokkos_INCLUDE_DIRS ${LAMMPS_LIB_KOKKOS_SRC_DIR}/core/src
|
||||
${LAMMPS_LIB_KOKKOS_SRC_DIR}/containers/src
|
||||
${LAMMPS_LIB_KOKKOS_SRC_DIR}/algorithms/src
|
||||
${LAMMPS_LIB_KOKKOS_BIN_DIR})
|
||||
include_directories(${Kokkos_INCLUDE_DIRS})
|
||||
list(APPEND LAMMPS_LINK_LIBS kokkos)
|
||||
|
||||
set(KOKKOS_PKG_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/KOKKOS)
|
||||
set(KOKKOS_PKG_SOURCES ${KOKKOS_PKG_SOURCES_DIR}/kokkos.cpp
|
||||
${KOKKOS_PKG_SOURCES_DIR}/atom_kokkos.cpp
|
||||
${KOKKOS_PKG_SOURCES_DIR}/atom_vec_kokkos.cpp
|
||||
${KOKKOS_PKG_SOURCES_DIR}/comm_kokkos.cpp
|
||||
${KOKKOS_PKG_SOURCES_DIR}/comm_tiled_kokkos.cpp
|
||||
${KOKKOS_PKG_SOURCES_DIR}/neighbor_kokkos.cpp
|
||||
${KOKKOS_PKG_SOURCES_DIR}/neigh_list_kokkos.cpp
|
||||
${KOKKOS_PKG_SOURCES_DIR}/neigh_bond_kokkos.cpp
|
||||
${KOKKOS_PKG_SOURCES_DIR}/fix_nh_kokkos.cpp
|
||||
${KOKKOS_PKG_SOURCES_DIR}/domain_kokkos.cpp
|
||||
${KOKKOS_PKG_SOURCES_DIR}/modify_kokkos.cpp)
|
||||
set_property(GLOBAL PROPERTY "KOKKOS_PKG_SOURCES" "${KOKKOS_PKG_SOURCES}")
|
||||
|
||||
# detects styles which have KOKKOS version
|
||||
RegisterStylesExt(${KOKKOS_PKG_SOURCES_DIR} kokkos KOKKOS_PKG_SOURCES)
|
||||
|
||||
get_property(KOKKOS_PKG_SOURCES GLOBAL PROPERTY KOKKOS_PKG_SOURCES)
|
||||
|
||||
list(APPEND LIB_SOURCES ${KOKKOS_PKG_SOURCES})
|
||||
include_directories(${KOKKOS_PKG_SOURCES_DIR})
|
||||
endif()
|
||||
|
||||
if(ENABLE_OPT)
|
||||
set(OPT_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/OPT)
|
||||
set(OPT_SOURCES)
|
||||
set_property(GLOBAL PROPERTY "OPT_SOURCES" "${OPT_SOURCES}")
|
||||
|
||||
# detects styles which have OPT version
|
||||
RegisterStylesExt(${OPT_SOURCES_DIR} opt OPT_SOURCES)
|
||||
|
||||
get_property(OPT_SOURCES GLOBAL PROPERTY OPT_SOURCES)
|
||||
|
||||
list(APPEND LIB_SOURCES ${OPT_SOURCES})
|
||||
include_directories(${OPT_SOURCES_DIR})
|
||||
endif()
|
||||
|
||||
# USER-INTEL package: a fixed set of support files is always added; style
# variants are picked up through RegisterStylesExt.
if(ENABLE_USER-INTEL)
  set(USER-INTEL_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/USER-INTEL)
  set(USER-INTEL_SOURCES ${USER-INTEL_SOURCES_DIR}/intel_preprocess.h
                         ${USER-INTEL_SOURCES_DIR}/intel_buffers.h
                         ${USER-INTEL_SOURCES_DIR}/intel_buffers.cpp
                         ${USER-INTEL_SOURCES_DIR}/math_extra_intel.h
                         ${USER-INTEL_SOURCES_DIR}/nbin_intel.h
                         ${USER-INTEL_SOURCES_DIR}/nbin_intel.cpp
                         ${USER-INTEL_SOURCES_DIR}/npair_intel.h
                         ${USER-INTEL_SOURCES_DIR}/npair_intel.cpp
                         ${USER-INTEL_SOURCES_DIR}/intel_simd.h
                         ${USER-INTEL_SOURCES_DIR}/intel_intrinsics.h)

  # seed the global property that RegisterStylesExt is given by name
  set_property(GLOBAL PROPERTY "USER-INTEL_SOURCES" "${USER-INTEL_SOURCES}")

  # detects styles which have USER-INTEL version
  # NOTE(review): RegisterStylesExt is defined in StyleHeaderUtils (not shown
  # here); its second argument appears to be the package's file-name suffix
  # (omp / kokkos / opt in the sibling blocks). This block previously passed
  # "opt", apparently copied from the OPT block - confirm "intel" is right.
  RegisterStylesExt(${USER-INTEL_SOURCES_DIR} intel USER-INTEL_SOURCES)

  get_property(USER-INTEL_SOURCES GLOBAL PROPERTY USER-INTEL_SOURCES)

  list(APPEND LIB_SOURCES ${USER-INTEL_SOURCES})
  include_directories(${USER-INTEL_SOURCES_DIR})
endif()
|
||||
|
||||
# GPU package: compiles the CUDA kernels of lib/gpu, converts every resulting
# object into a <name>_cubin.h header with bin2c and adds the host sources
# and objects to the LAMMPS library.
if(ENABLE_GPU)
  find_package(CUDA REQUIRED)
  find_program(BIN2C bin2c)
  if(NOT BIN2C)
    message(FATAL_ERROR "Couldn't find bin2c, use -DBIN2C helping cmake to find it.")
  endif()
  include_directories(${CUDA_INCLUDE_DIRS})
  list(APPEND LAMMPS_LINK_LIBS ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY})
  set(GPU_PREC "SINGLE_DOUBLE" CACHE STRING "Lammps gpu precision size")
  set_property(CACHE GPU_PREC PROPERTY STRINGS SINGLE_DOUBLE SINGLE_SINGLE DOUBLE_DOUBLE)
  add_definitions(-D_${GPU_PREC})
  add_definitions(-DNV_KERNEL -DUCL_CUDADR)
  option(CUDPP_OPT "Enable CUDPP_OPT" ON)

  set(GPU_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/GPU)
  set(GPU_SOURCES ${GPU_SOURCES_DIR}/gpu_extra.h)

  # seed the global property that RegisterStylesExt is given by name
  set_property(GLOBAL PROPERTY "GPU_SOURCES" "${GPU_SOURCES}")

  # detects styles which have GPU version
  # NOTE(review): RegisterStylesExt is defined in StyleHeaderUtils (not shown
  # here); its second argument appears to be the package's file-name suffix
  # (omp / kokkos / opt in the sibling blocks). This block previously passed
  # "opt", apparently copied from the OPT block - confirm "gpu" is right.
  RegisterStylesExt(${GPU_SOURCES_DIR} gpu GPU_SOURCES)

  get_property(GPU_SOURCES GLOBAL PROPERTY GPU_SOURCES)

  file(GLOB GPU_LIB_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/gpu/*.cpp)
  file(GLOB GPU_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/*.cu ${CMAKE_SOURCE_DIR}/gpu/*.cu)
  # lal_pppm.cu is excluded from the list of kernels compiled below
  file(GLOB_RECURSE GPU_NOT_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_pppm.cu)
  list(REMOVE_ITEM GPU_LIB_CU ${GPU_NOT_LIB_CU})
  include_directories(${GPU_SOURCES_DIR} ${LAMMPS_LIB_SOURCE_DIR}/gpu ${LAMMPS_LIB_BINARY_DIR}/gpu)
  if(CUDPP_OPT)
    include_directories(${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini)
    add_definitions(-DCUDPP_OPT)
    file(GLOB GPU_LIB_CUDPP_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini/*.cpp)
    file(GLOB GPU_LIB_CUDPP_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini/*.cu)
  endif()
  cuda_compile(GPU_OBJS ${GPU_LIB_CU} ${GPU_LIB_CUDPP_CU} OPTIONS $<$<BOOL:${BUILD_SHARED_LIBS}>:-Xcompiler=-fPIC>)
  file(MAKE_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/gpu)
  foreach(CU_OBJ ${GPU_OBJS})
    # derive the kernel name: strip directory/extension and everything up to
    # and including "_lal_"
    get_filename_component(CU_NAME ${CU_OBJ} NAME_WE)
    string(REGEX REPLACE "^.*_lal_" "" CU_NAME "${CU_NAME}")
    add_custom_command(OUTPUT ${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}_cubin.h
      COMMAND ${BIN2C} -c -n ${CU_NAME} ${CU_OBJ} > ${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}_cubin.h
      DEPENDS ${CU_OBJ}
      COMMENT "Generating ${CU_NAME}_cubin.h")
    list(APPEND LIB_SOURCES ${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}_cubin.h)
    # quoted so that an empty or variable-like name cannot break the test
    if("${CU_NAME}" STREQUAL "pppm_d") #pppm_d doesn't get linked into the lib
      set(CU_FORBIDDEN_OBJ "${CU_OBJ}")
    endif()
  endforeach()
  list(REMOVE_ITEM GPU_OBJS "${CU_FORBIDDEN_OBJ}")
  list(APPEND LIB_SOURCES ${GPU_SOURCES} ${GPU_LIB_SOURCES} ${GPU_LIB_CUDPP_SOURCES} ${GPU_OBJS})
  set_directory_properties(PROPERTIES ADDITIONAL_MAKE_CLEAN_FILES "${LAMMPS_LIB_BINARY_DIR}/gpu/*_cubin.h")
endif()
|
||||
|
||||
######################################################
|
||||
# Generate style headers based on global list of
|
||||
# styles registered during package selection
|
||||
######################################################
|
||||
set(LAMMPS_STYLE_HEADERS_DIR ${CMAKE_CURRENT_BINARY_DIR}/styles)
|
||||
|
||||
GenerateStyleHeaders(${LAMMPS_STYLE_HEADERS_DIR})
|
||||
|
||||
include_directories(${LAMMPS_SOURCE_DIR})
|
||||
include_directories(${LAMMPS_STYLE_HEADERS_DIR})
|
||||
|
||||
###########################################
|
||||
# Actually add executable and lib to build
|
||||
############################################
|
||||
add_library(lammps ${LIB_SOURCES})
|
||||
target_link_libraries(lammps ${LAMMPS_LINK_LIBS})
|
||||
set_target_properties(lammps PROPERTIES SOVERSION ${SOVERSION})
|
||||
if(INSTALL_LIB)
|
||||
install(TARGETS lammps LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
|
||||
install(FILES ${LAMMPS_SOURCE_DIR}/lammps.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
elseif(BUILD_SHARED_LIBS)
|
||||
message(FATAL_ERROR "Shared library has to be installed, use -DINSTALL_LIB=ON to install lammps with a library")
|
||||
endif()
|
||||
|
||||
add_executable(lmp ${LMP_SOURCES})
|
||||
target_link_libraries(lmp lammps)
|
||||
install(TARGETS lmp DESTINATION ${CMAKE_INSTALL_BINDIR})
|
||||
# Smoke test: the freshly built executable must be able to print its help.
if(ENABLE_TESTING)
  # With the NAME/COMMAND signature CMake replaces the target name "lmp" by
  # the real location of the built executable, which also works with
  # executable suffixes and per-configuration output directories.
  add_test(NAME ShowHelp COMMAND lmp -help)
endif()
|
||||
|
||||
##################################
|
||||
# Print package summary
|
||||
##################################
|
||||
foreach(PKG ${DEFAULT_PACKAGES} ${OTHER_PACKAGES} ${ACCEL_PACKAGES})
|
||||
if(ENABLE_${PKG})
|
||||
message(STATUS "Building package: ${PKG}")
|
||||
endif()
|
||||
endforeach()
|
|
@ -0,0 +1,22 @@
|
|||
# - Find fftw2
|
||||
# Find the native FFTW2 headers and libraries.
|
||||
#
|
||||
# FFTW2_INCLUDE_DIRS - where to find fftw2.h, etc.
|
||||
# FFTW2_LIBRARIES - List of libraries when using fftw2.
|
||||
# FFTW2_FOUND - True if fftw2 found.
|
||||
#
|
||||
|
||||
find_path(FFTW2_INCLUDE_DIR fftw.h)
|
||||
|
||||
find_library(FFTW2_LIBRARY NAMES fftw)
|
||||
|
||||
set(FFTW2_LIBRARIES ${FFTW2_LIBRARY})
|
||||
set(FFTW2_INCLUDE_DIRS ${FFTW2_INCLUDE_DIR})
|
||||
|
||||
include(FindPackageHandleStandardArgs)
|
||||
# handle the QUIETLY and REQUIRED arguments and set FFTW2_FOUND to TRUE
|
||||
# if all listed variables are TRUE
|
||||
|
||||
find_package_handle_standard_args(FFTW2 DEFAULT_MSG FFTW2_LIBRARY FFTW2_INCLUDE_DIR)
|
||||
|
||||
mark_as_advanced(FFTW2_INCLUDE_DIR FFTW2_LIBRARY )
|
|
@ -0,0 +1,25 @@
|
|||
# - Find fftw3
|
||||
# Find the native FFTW3 headers and libraries.
|
||||
#
|
||||
# FFTW3_INCLUDE_DIRS - where to find fftw3.h, etc.
|
||||
# FFTW3_LIBRARIES - List of libraries when using fftw3.
|
||||
# FFTW3_FOUND - True if fftw3 found.
|
||||
#
|
||||
|
||||
find_package(PkgConfig)
|
||||
|
||||
pkg_check_modules(PC_FFTW3 fftw3)
|
||||
find_path(FFTW3_INCLUDE_DIR fftw3.h HINTS ${PC_FFTW3_INCLUDE_DIRS})
|
||||
|
||||
find_library(FFTW3_LIBRARY NAMES fftw3 HINTS ${PC_FFTW3_LIBRARY_DIRS})
|
||||
|
||||
set(FFTW3_LIBRARIES ${FFTW3_LIBRARY})
|
||||
set(FFTW3_INCLUDE_DIRS ${FFTW3_INCLUDE_DIR})
|
||||
|
||||
include(FindPackageHandleStandardArgs)
|
||||
# handle the QUIETLY and REQUIRED arguments and set FFTW3_FOUND to TRUE
|
||||
# if all listed variables are TRUE
|
||||
|
||||
find_package_handle_standard_args(FFTW3 DEFAULT_MSG FFTW3_LIBRARY FFTW3_INCLUDE_DIR)
|
||||
|
||||
mark_as_advanced(FFTW3_INCLUDE_DIR FFTW3_LIBRARY )
|
|
@ -0,0 +1,22 @@
|
|||
# - Find kim
|
||||
# Find the native KIM headers and libraries.
|
||||
#
|
||||
# KIM_INCLUDE_DIRS - where to find kim.h, etc.
|
||||
# KIM_LIBRARIES - List of libraries when using kim.
|
||||
# KIM_FOUND - True if kim found.
|
||||
#
|
||||
|
||||
find_path(KIM_INCLUDE_DIR KIM_API.h PATH_SUFFIXES kim-api-v1)
|
||||
|
||||
find_library(KIM_LIBRARY NAMES kim-api-v1)
|
||||
|
||||
set(KIM_LIBRARIES ${KIM_LIBRARY})
|
||||
set(KIM_INCLUDE_DIRS ${KIM_INCLUDE_DIR})
|
||||
|
||||
include(FindPackageHandleStandardArgs)
|
||||
# handle the QUIETLY and REQUIRED arguments and set KIM_FOUND to TRUE
|
||||
# if all listed variables are TRUE
|
||||
|
||||
find_package_handle_standard_args(KIM DEFAULT_MSG KIM_LIBRARY KIM_INCLUDE_DIR)
|
||||
|
||||
mark_as_advanced(KIM_INCLUDE_DIR KIM_LIBRARY )
|
|
@ -0,0 +1,22 @@
|
|||
# - Find mkl
|
||||
# Find the native MKL headers and libraries.
|
||||
#
|
||||
# MKL_INCLUDE_DIRS - where to find mkl.h, etc.
|
||||
# MKL_LIBRARIES - List of libraries when using mkl.
|
||||
# MKL_FOUND - True if mkl found.
|
||||
#
|
||||
|
||||
find_path(MKL_INCLUDE_DIR mkl_dfti.h HINTS $ENV{MKLROOT}/include)
|
||||
|
||||
find_library(MKL_LIBRARY NAMES mkl_rt HINTS $ENV{MKLROOT}/lib $ENV{MKLROOT}/lib/intel64)
|
||||
|
||||
set(MKL_LIBRARIES ${MKL_LIBRARY})
|
||||
set(MKL_INCLUDE_DIRS ${MKL_INCLUDE_DIR})
|
||||
|
||||
include(FindPackageHandleStandardArgs)
|
||||
# handle the QUIETLY and REQUIRED arguments and set MKL_FOUND to TRUE
|
||||
# if all listed variables are TRUE
|
||||
|
||||
find_package_handle_standard_args(MKL DEFAULT_MSG MKL_LIBRARY MKL_INCLUDE_DIR)
|
||||
|
||||
mark_as_advanced(MKL_INCLUDE_DIR MKL_LIBRARY )
|
|
@ -0,0 +1,118 @@
|
|||
# - Find NetCDF
|
||||
# Find the native NetCDF includes and library
|
||||
#
|
||||
# NETCDF_INCLUDE_DIR - user modifiable choice of where netcdf headers are
|
||||
# NETCDF_LIBRARY - user modifiable choice of where netcdf libraries are
|
||||
#
|
||||
# Your package can require certain interfaces to be FOUND by setting these
|
||||
#
|
||||
# NETCDF_CXX - require the C++ interface and link the C++ library
|
||||
# NETCDF_F77 - require the F77 interface and link the fortran library
|
||||
# NETCDF_F90 - require the F90 interface and link the fortran library
|
||||
#
|
||||
# Or equivalently by calling FindNetCDF with a COMPONENTS argument containing one or
|
||||
# more of "CXX;F77;F90".
|
||||
#
|
||||
# When interfaces are requested the user has access to interface specific hints:
|
||||
#
|
||||
# NETCDF_${LANG}_INCLUDE_DIR - where to search for interface header files
|
||||
# NETCDF_${LANG}_LIBRARY - where to search for interface libraries
|
||||
#
|
||||
# This module returns these variables for the rest of the project to use.
|
||||
#
|
||||
# NETCDF_FOUND - True if NetCDF found including required interfaces (see below)
|
||||
# NETCDF_LIBRARIES - All netcdf related libraries.
|
||||
# NETCDF_INCLUDE_DIRS - All directories to include.
|
||||
# NETCDF_HAS_INTERFACES - Whether requested interfaces were found or not.
|
||||
# NETCDF_${LANG}_INCLUDE_DIRS/NETCDF_${LANG}_LIBRARIES - C/C++/F77/F90 only interface
|
||||
#
|
||||
# Normal usage would be:
|
||||
# set (NETCDF_F90 "YES")
|
||||
# find_package (NetCDF REQUIRED)
|
||||
# target_link_libraries (uses_everything ${NETCDF_LIBRARIES})
|
||||
# target_link_libraries (only_uses_f90 ${NETCDF_F90_LIBRARIES})
|
||||
|
||||
#search starting from user editable cache var
|
||||
if (NETCDF_INCLUDE_DIR AND NETCDF_LIBRARY)
|
||||
# Already in cache, be silent
|
||||
set (NETCDF_FIND_QUIETLY TRUE)
|
||||
endif ()
|
||||
|
||||
set(USE_DEFAULT_PATHS "NO_DEFAULT_PATH")
|
||||
if(NETCDF_USE_DEFAULT_PATHS)
|
||||
set(USE_DEFAULT_PATHS "")
|
||||
endif()
|
||||
|
||||
find_path (NETCDF_INCLUDE_DIR netcdf.h
|
||||
HINTS "${NETCDF_DIR}/include")
|
||||
mark_as_advanced (NETCDF_INCLUDE_DIR)
|
||||
set (NETCDF_C_INCLUDE_DIRS ${NETCDF_INCLUDE_DIR})
|
||||
|
||||
find_library (NETCDF_LIBRARY NAMES netcdf
|
||||
HINTS "${NETCDF_DIR}/lib")
|
||||
mark_as_advanced (NETCDF_LIBRARY)
|
||||
|
||||
set (NETCDF_C_LIBRARIES ${NETCDF_LIBRARY})
|
||||
|
||||
#start finding requested language components
|
||||
set (NetCDF_libs "")
|
||||
set (NetCDF_includes "${NETCDF_INCLUDE_DIR}")
|
||||
|
||||
get_filename_component (NetCDF_lib_dirs "${NETCDF_LIBRARY}" PATH)
|
||||
set (NETCDF_HAS_INTERFACES "YES") # will be set to NO if we're missing any interfaces
|
||||
|
||||
# NetCDF_check_interface(<lang> <header> <libs>)
#
# Does nothing unless NETCDF_<lang> is true. Otherwise it looks for <header>
# and for a library named <libs>, using the core NetCDF include/library
# directories and NETCDF_<lang>_ROOT as hints, and publishes
# NETCDF_<lang>_INCLUDE_DIRS / NETCDF_<lang>_LIBRARIES.
#
# On success the results are appended to NetCDF_includes / NetCDF_libs; on
# failure NETCDF_HAS_INTERFACES is set to "NO" and a status line is printed.
# This is a macro, so all of these variables are set in the caller's scope.
macro(NetCDF_check_interface lang header libs)
  if(NETCDF_${lang})
    # Both searches start from user-modifiable cache variables.
    find_path(NETCDF_${lang}_INCLUDE_DIR
      NAMES ${header}
      HINTS "${NETCDF_INCLUDE_DIR}" "${NETCDF_${lang}_ROOT}/include"
      ${USE_DEFAULT_PATHS})
    find_library(NETCDF_${lang}_LIBRARY
      NAMES ${libs}
      HINTS "${NetCDF_lib_dirs}" "${NETCDF_${lang}_ROOT}/lib"
      ${USE_DEFAULT_PATHS})
    mark_as_advanced(NETCDF_${lang}_INCLUDE_DIR NETCDF_${lang}_LIBRARY)

    # Plain (non-cache) variables the rest of the project can use directly.
    set(NETCDF_${lang}_INCLUDE_DIRS ${NETCDF_${lang}_INCLUDE_DIR})
    set(NETCDF_${lang}_LIBRARIES ${NETCDF_${lang}_LIBRARY})

    if(NOT NETCDF_${lang}_INCLUDE_DIR OR NOT NETCDF_${lang}_LIBRARY)
      set(NETCDF_HAS_INTERFACES "NO")
      message(STATUS "Failed to find NetCDF interface for ${lang}")
    else()
      list(APPEND NetCDF_libs ${NETCDF_${lang}_LIBRARY})
      list(APPEND NetCDF_includes ${NETCDF_${lang}_INCLUDE_DIR})
    endif()
  endif()
endmacro()
|
||||
|
||||
# Turn each language listed in find_package(NetCDF COMPONENTS ...) into the
# matching NETCDF_<lang> switch. A switch that was already set by the caller
# is left untouched when the component is not listed.
foreach(_netcdf_lang CXX F77 F90)
  list(FIND NetCDF_FIND_COMPONENTS "${_netcdf_lang}" _nextcomp)
  if(_nextcomp GREATER -1)
    set(NETCDF_${_netcdf_lang} 1)
  endif()
endforeach()
unset(_netcdf_lang)

# Locate every enabled language interface.
NetCDF_check_interface(CXX netcdfcpp.h netcdf_c++)
NetCDF_check_interface(F77 netcdf.inc netcdff)
NetCDF_check_interface(F90 netcdf.mod netcdff)

# Publish the accumulated results; the core C library goes last so that the
# language libraries precede it in the list.
list(APPEND NetCDF_libs "${NETCDF_C_LIBRARIES}")
set(NETCDF_INCLUDE_DIRS ${NetCDF_includes})
set(NETCDF_LIBRARIES ${NetCDF_libs})

# Handle the QUIETLY and REQUIRED arguments and set NETCDF_FOUND to TRUE only
# if all listed variables are TRUE.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NetCDF
  DEFAULT_MSG
  NETCDF_LIBRARIES
  NETCDF_INCLUDE_DIRS
  NETCDF_HAS_INTERFACES)
|
|
@ -0,0 +1,29 @@
|
|||
# - Find quantum-espresso
|
||||
# Find the native QE headers and libraries.
|
||||
#
|
||||
# QE_INCLUDE_DIRS - where to find libqecouple.h, etc.
|
||||
# QE_LIBRARIES - List of libraries when using quantum-espresso.
|
||||
# QE_FOUND - True if quantum-espresso found.
|
||||
#
|
||||
|
||||
# Header of the coupling library; also searched under a COUPLE/include suffix.
find_path(QE_INCLUDE_DIR
  NAMES libqecouple.h
  PATH_SUFFIXES COUPLE/include)

# Every library listed here must be found for QE to count as found.
find_library(QECOUPLE_LIBRARY NAMES qecouple)
find_library(PW_LIBRARY       NAMES pw)
find_library(QEMOD_LIBRARY    NAMES qemod)
find_library(QEFFT_LIBRARY    NAMES qefft)
find_library(QELA_LIBRARY     NAMES qela)
find_library(CLIB_LIBRARY     NAMES clib)
find_library(IOTK_LIBRARY     NAMES iotk)

mark_as_advanced(
  QE_INCLUDE_DIR
  QECOUPLE_LIBRARY
  PW_LIBRARY
  QEMOD_LIBRARY
  QEFFT_LIBRARY
  QELA_LIBRARY
  CLIB_LIBRARY
  IOTK_LIBRARY)

# Result variables for consumers, in the same order as the searches above.
set(QE_INCLUDE_DIRS ${QE_INCLUDE_DIR})
set(QE_LIBRARIES
  ${QECOUPLE_LIBRARY}
  ${PW_LIBRARY}
  ${QEMOD_LIBRARY}
  ${QEFFT_LIBRARY}
  ${QELA_LIBRARY}
  ${CLIB_LIBRARY}
  ${IOTK_LIBRARY})

# Handle the QUIETLY and REQUIRED arguments and set QE_FOUND to TRUE
# if all listed variables are TRUE.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(QE
  DEFAULT_MSG
  QECOUPLE_LIBRARY
  PW_LIBRARY
  QEMOD_LIBRARY
  QEFFT_LIBRARY
  QELA_LIBRARY
  CLIB_LIBRARY
  IOTK_LIBRARY
  QE_INCLUDE_DIR)
|
|
@ -0,0 +1,18 @@
|
|||
# - Find quip
|
||||
# Find the native QUIP libraries.
|
||||
#
|
||||
# QUIP_LIBRARIES - List of libraries when using quip.
|
||||
# QUIP_FOUND - True if quip found.
|
||||
#
|
||||
|
||||
# QUIP is located by its library alone; this module searches for no header.
find_library(QUIP_LIBRARY
  NAMES quip)
mark_as_advanced(QUIP_LIBRARY)

set(QUIP_LIBRARIES ${QUIP_LIBRARY})

# Handle the QUIETLY and REQUIRED arguments and set QUIP_FOUND to TRUE
# if all listed variables are TRUE.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(QUIP
  DEFAULT_MSG
  QUIP_LIBRARY)
|
|
@ -0,0 +1,22 @@
|
|||
# - Find voro++
|
||||
# Find the native VORO headers and libraries.
|
||||
#
|
||||
# VORO_INCLUDE_DIRS - where to find voro++.hh, etc.
|
||||
# VORO_LIBRARIES - List of libraries when using voro++.
|
||||
# VORO_FOUND - True if voro++ found.
|
||||
#
|
||||
|
||||
# Header and library of voro++; the header is also searched for in a
# "voro++" subdirectory of each include location.
find_path(VORO_INCLUDE_DIR
  NAMES voro++.hh
  PATH_SUFFIXES voro++)
find_library(VORO_LIBRARY
  NAMES voro++)
mark_as_advanced(VORO_INCLUDE_DIR VORO_LIBRARY)

# Result variables for consumers.
set(VORO_INCLUDE_DIRS ${VORO_INCLUDE_DIR})
set(VORO_LIBRARIES ${VORO_LIBRARY})

# Handle the QUIETLY and REQUIRED arguments and set VORO_FOUND to TRUE
# if all listed variables are TRUE.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(VORO
  DEFAULT_MSG
  VORO_LIBRARY
  VORO_INCLUDE_DIR)
|
|
@ -0,0 +1,132 @@
|
|||
# FindStyleHeaders(<path> <style_class> <file_pattern> <headers>)
#
# Collects style headers from one directory.
#   path         - directory to search
#   style_class  - regular expression a header must contain to be selected
#   file_pattern - file name prefix; candidates are "<path>/<file_pattern>*.h"
#   headers      - name of the GLOBAL property the matches are appended to
function(FindStyleHeaders path style_class file_pattern headers)
  get_property(known_headers GLOBAL PROPERTY ${headers})
  file(GLOB candidates "${path}/${file_pattern}*.h")

  foreach(candidate IN LISTS candidates)
    # One matching line is enough, so stop reading at the first hit.
    file(STRINGS "${candidate}" marker_line LIMIT_COUNT 1 REGEX ${style_class})
    if(marker_line)
      list(APPEND known_headers "${candidate}")
    endif()
  endforeach()

  set_property(GLOBAL PROPERTY ${headers} "${known_headers}")
endfunction()
|
||||
|
||||
# FindStyleHeadersExt(<path> <style_class> <extension> <headers> <sources>)
#
# For every header already stored in the GLOBAL property <headers>, checks
# whether "<path>/<basename>_<extension>.h" exists and contains a line
# matching the regular expression <style_class>. Each such variant header is
# appended to <headers>; if "<path>/<basename>_<extension>.cpp" exists as
# well, it is appended to the GLOBAL property <sources>.
function(FindStyleHeadersExt path style_class extension headers sources)
  get_property(known_headers GLOBAL PROPERTY ${headers})
  get_property(known_sources GLOBAL PROPERTY ${sources})
  get_filename_component(package_dir "${path}" ABSOLUTE)

  # Variants are gathered separately and merged after the loop, so the loop
  # only ever walks the headers that were registered on entry.
  set(variant_headers)

  foreach(base_header IN LISTS known_headers)
    get_filename_component(stem "${base_header}" NAME_WE)
    set(variant_stem "${package_dir}/${stem}_${extension}")

    if(EXISTS "${variant_stem}.h")
      file(STRINGS "${variant_stem}.h" marker_line LIMIT_COUNT 1 REGEX ${style_class})
      if(marker_line)
        list(APPEND variant_headers "${variant_stem}.h")
        if(EXISTS "${variant_stem}.cpp")
          list(APPEND known_sources "${variant_stem}.cpp")
        endif()
      endif()
    endif()
  endforeach()

  list(APPEND known_headers ${variant_headers})
  set_property(GLOBAL PROPERTY ${headers} "${known_headers}")
  set_property(GLOBAL PROPERTY ${sources} "${known_sources}")
endfunction()
|
||||
|
||||
# CreateStyleHeader(<path> <filename> [<header>...])
#
# Writes "<path>/<filename>" containing one `#include "<name>"` line per
# <header> argument, where <name> is the header's file name without its
# directory. With no <header> arguments an empty file is produced.
#
# The content is first written to "<path>/<filename>.tmp" and then copied
# with copy_if_different, so an unchanged header is not rewritten.
#
# Stops configuration with a FATAL_ERROR if the copy step fails, instead of
# silently leaving a stale or missing style header behind.
function(CreateStyleHeader path filename)
  set(content "")
  # ARGN holds exactly the arguments after <path> and <filename>.
  foreach(header_path IN LISTS ARGN)
    get_filename_component(header_name "${header_path}" NAME)
    set(content "${content}#include \"${header_name}\"\n")
  endforeach()

  message(STATUS "Generating ${filename}...")
  file(WRITE "${path}/${filename}.tmp" "${content}")
  execute_process(
    COMMAND ${CMAKE_COMMAND} -E copy_if_different
            "${path}/${filename}.tmp" "${path}/${filename}"
    RESULT_VARIABLE copy_status)
  # RESULT_VARIABLE is the exit code, or an error string if the command
  # could not be run at all; anything other than "0" is a failure.
  if(NOT "${copy_status}" STREQUAL "0")
    message(FATAL_ERROR
      "Failed to update ${path}/${filename} from ${path}/${filename}.tmp (result: ${copy_status})")
  endif()
endfunction()
|
||||
|
||||
# GenerateStyleHeader(<path> <property> <style>)
#
# Reads the header list stored in the GLOBAL property <property> and passes
# it to CreateStyleHeader to produce "<path>/style_<style>.h".
function(GenerateStyleHeader path property style)
  get_property(style_headers GLOBAL PROPERTY ${property})
  CreateStyleHeader("${path}" "style_${style}.h" ${style_headers})
endfunction()
|
||||
|
||||
# RegisterStyles(<search_path>)
#
# Runs FindStyleHeaders on <search_path> once per style category.
#
# Each table entry is "<marker regex>:<file name prefix>:<global property>";
# an empty prefix (two adjacent colons) makes every *.h file in the directory
# a candidate. The trailing notes are carried over from the original call
# list.
function(RegisterStyles search_path)
  set(style_table
    "ANGLE_CLASS:angle_:ANGLE"            # angle     - force
    "ATOM_CLASS:atom_vec_:ATOM_VEC"       # atom      - atom atom_vec_hybrid
    "BODY_CLASS:body_:BODY"               # body      - atom_vec_body
    "BOND_CLASS:bond_:BOND"               # bond      - force
    "COMMAND_CLASS::COMMAND"              # command   - input
    "COMPUTE_CLASS:compute_:COMPUTE"      # compute   - modify
    "DIHEDRAL_CLASS:dihedral_:DIHEDRAL"   # dihedral  - force
    "DUMP_CLASS:dump_:DUMP"               # dump      - output write_dump
    "FIX_CLASS:fix_:FIX"                  # fix       - modify
    "IMPROPER_CLASS:improper_:IMPROPER"   # improper  - force
    "INTEGRATE_CLASS::INTEGRATE"          # integrate - update
    "KSPACE_CLASS::KSPACE"                # kspace    - force
    "MINIMIZE_CLASS:min_:MINIMIZE"        # minimize  - update
    "NBIN_CLASS:nbin_:NBIN"               # nbin      - neighbor
    "NPAIR_CLASS:npair_:NPAIR"            # npair     - neighbor
    "NSTENCIL_CLASS:nstencil_:NSTENCIL"   # nstencil  - neighbor
    "NTOPO_CLASS:ntopo_:NTOPO"            # ntopo     - neighbor
    "PAIR_CLASS:pair_:PAIR"               # pair      - force
    "READER_CLASS:reader_:READER"         # reader    - read_dump
    "REGION_CLASS:region_:REGION"         # region    - domain
  )

  foreach(entry IN LISTS style_table)
    # Split "a:b:c" into a three-element list; an empty middle field is kept.
    string(REPLACE ":" ";" fields "${entry}")
    list(GET fields 0 marker)
    list(GET fields 1 prefix)
    list(GET fields 2 property)
    FindStyleHeaders(${search_path} ${marker} "${prefix}" ${property})
  endforeach()
endfunction()
|
||||
|
||||
# RegisterStylesExt(<search_path> <extension> <sources>)
#
# Runs FindStyleHeadersExt on <search_path> once per style category, passing
# <extension> and the <sources> property name through unchanged.
#
# The marker regex is "<property>_CLASS" for every category except ATOM_VEC,
# whose marker is ATOM_CLASS.
function(RegisterStylesExt search_path extension sources)
  foreach(style_property IN ITEMS
      ANGLE ATOM_VEC BODY BOND COMMAND
      COMPUTE DIHEDRAL DUMP FIX IMPROPER
      INTEGRATE KSPACE MINIMIZE NBIN NPAIR
      NSTENCIL NTOPO PAIR READER REGION)
    if("${style_property}" STREQUAL "ATOM_VEC")
      set(style_marker ATOM_CLASS)
    else()
      set(style_marker "${style_property}_CLASS")
    endif()
    FindStyleHeadersExt(${search_path} ${style_marker} ${extension} ${style_property} ${sources})
  endforeach()
endfunction()
|
||||
|
||||
# GenerateStyleHeaders(<output_path>)
#
# Calls GenerateStyleHeader once per style category so that
# "<output_path>/style_<name>.h" is produced for each of them.
#
# <name> is the lower-cased property name, except for ATOM_VEC, whose header
# is style_atom.h. The trailing notes are carried over from the original
# call list.
function(GenerateStyleHeaders output_path)
  foreach(style_property IN ITEMS
      ANGLE      # force
      ATOM_VEC   # atom atom_vec_hybrid
      BODY       # atom_vec_body
      BOND       # force
      COMMAND    # input
      COMPUTE    # modify
      DIHEDRAL   # force
      DUMP       # output write_dump
      FIX        # modify
      IMPROPER   # force
      INTEGRATE  # update
      KSPACE     # force
      MINIMIZE   # update
      NBIN       # neighbor
      NPAIR      # neighbor
      NSTENCIL   # neighbor
      NTOPO      # neighbor
      PAIR       # force
      READER     # read_dump
      REGION)    # domain
    if("${style_property}" STREQUAL "ATOM_VEC")
      set(style_name atom)
    else()
      string(TOLOWER "${style_property}" style_name)
    endif()
    GenerateStyleHeader(${output_path} ${style_property} ${style_name})
  endforeach()
endfunction()
|
|
@ -0,0 +1,19 @@
|
|||
cmake-buildsystem
|
||||
-----------------
|
||||
|
||||
To use the cmake build system instead of the make-driven one, do:
|
||||
```
|
||||
cmake /path/to/lammps/source/cmake
|
||||
```
|
||||
(please note the cmake directory at the very end)
|
||||
|
||||
To enable a package, e.g. GPU, do
|
||||
```
|
||||
cmake /path/to/lammps/source/cmake -DENABLE_GPU=ON
|
||||
```
|
||||
|
||||
cmake has many, many options; to get an overview, use the curses-based cmake interface, ccmake:
|
||||
```
|
||||
ccmake /path/to/lammps/source/cmake
|
||||
```
|
||||
(Don't forget to press "g" for generate once you are done with configuring)
|
|
@ -0,0 +1,4 @@
|
|||
#define grdtyp double
|
||||
#define grdtyp4 double4
|
||||
|
||||
#include "lal_pppm.cu"
|
|
@ -0,0 +1,4 @@
|
|||
#define grdtyp float
|
||||
#define grdtyp4 float4
|
||||
|
||||
#include "lal_pppm.cu"
|
Binary file not shown.
Before Width: | Height: | Size: 14 KiB After Width: | Height: | Size: 20 KiB |
|
@ -1,7 +1,7 @@
|
|||
<!-- HTML_ONLY -->
|
||||
<HEAD>
|
||||
<TITLE>LAMMPS Users Manual</TITLE>
|
||||
<META NAME="docnumber" CONTENT="10 Aug 2017 version">
|
||||
<META NAME="docnumber" CONTENT="17 Aug 2017 version">
|
||||
<META NAME="author" CONTENT="http://lammps.sandia.gov - Sandia National Laboratories">
|
||||
<META NAME="copyright" CONTENT="Copyright (2003) Sandia Corporation. This software and manual is distributed under the GNU General Public License.">
|
||||
</HEAD>
|
||||
|
@ -21,7 +21,7 @@
|
|||
<H1></H1>
|
||||
|
||||
LAMMPS Documentation :c,h3
|
||||
10 Aug 2017 version :c,h4
|
||||
17 Aug 2017 version :c,h4
|
||||
|
||||
Version info: :h4
|
||||
|
||||
|
@ -79,7 +79,7 @@ bug reports and feature requests are mainly coordinated through the
|
|||
"LAMMPS project on GitHub."_https://github.com/lammps/lammps
|
||||
The lammps.org domain, currently hosting "public continuous integration
|
||||
testing"_https://ci.lammps.org/job/lammps/ and "precompiled Linux
|
||||
RPM and Windows installer packages"_http://rpm.lammps.org is located
|
||||
RPM and Windows installer packages"_http://packages.lammps.org is located
|
||||
at Temple University and managed by Richard Berger,
|
||||
richard.berger at temple.edu.
|
||||
|
||||
|
|
Binary file not shown.
|
@ -892,8 +892,8 @@ KOKKOS, o = USER-OMP, t = OPT.
|
|||
"hybrid"_pair_hybrid.html,
|
||||
"hybrid/overlay"_pair_hybrid.html,
|
||||
"adp (o)"_pair_adp.html,
|
||||
"airebo (o)"_pair_airebo.html,
|
||||
"airebo/morse (o)"_pair_airebo.html,
|
||||
"airebo (oi)"_pair_airebo.html,
|
||||
"airebo/morse (oi)"_pair_airebo.html,
|
||||
"beck (go)"_pair_beck.html,
|
||||
"body"_pair_body.html,
|
||||
"bop"_pair_bop.html,
|
||||
|
@ -927,8 +927,8 @@ KOKKOS, o = USER-OMP, t = OPT.
|
|||
"dpd/tstat (go)"_pair_dpd.html,
|
||||
"dsmc"_pair_dsmc.html,
|
||||
"eam (gkiot)"_pair_eam.html,
|
||||
"eam/alloy (gkot)"_pair_eam.html,
|
||||
"eam/fs (gkot)"_pair_eam.html,
|
||||
"eam/alloy (gkiot)"_pair_eam.html,
|
||||
"eam/fs (gkiot)"_pair_eam.html,
|
||||
"eim (o)"_pair_eim.html,
|
||||
"gauss (go)"_pair_gauss.html,
|
||||
"gayberne (gio)"_pair_gayberne.html,
|
||||
|
@ -942,9 +942,9 @@ KOKKOS, o = USER-OMP, t = OPT.
|
|||
"kim"_pair_kim.html,
|
||||
"lcbop"_pair_lcbop.html,
|
||||
"line/lj"_pair_line_lj.html,
|
||||
"lj/charmm/coul/charmm (ko)"_pair_charmm.html,
|
||||
"lj/charmm/coul/charmm (kio)"_pair_charmm.html,
|
||||
"lj/charmm/coul/charmm/implicit (ko)"_pair_charmm.html,
|
||||
"lj/charmm/coul/long (giko)"_pair_charmm.html,
|
||||
"lj/charmm/coul/long (gkio)"_pair_charmm.html,
|
||||
"lj/charmm/coul/msm"_pair_charmm.html,
|
||||
"lj/charmmfsw/coul/charmmfsh"_pair_charmm.html,
|
||||
"lj/charmmfsw/coul/long"_pair_charmm.html,
|
||||
|
@ -990,7 +990,7 @@ KOKKOS, o = USER-OMP, t = OPT.
|
|||
"polymorphic"_pair_polymorphic.html,
|
||||
"python"_pair_python.html,
|
||||
"reax"_pair_reax.html,
|
||||
"rebo (o)"_pair_airebo.html,
|
||||
"rebo (oi)"_pair_airebo.html,
|
||||
"resquared (go)"_pair_resquared.html,
|
||||
"snap"_pair_snap.html,
|
||||
"soft (go)"_pair_soft.html,
|
||||
|
|
|
@ -7886,8 +7886,8 @@ keyword to allow for additional bonds to be formed :dd
|
|||
|
||||
{New bond exceeded special list size in fix bond/create} :dt
|
||||
|
||||
See the "special_bonds extra" command
|
||||
(or the "read_data extra/special/per/atom" command)
|
||||
See the "read_data extra/special/per/atom" command
|
||||
(or the "create_box extra/special/per/atom" command)
|
||||
for info on how to leave space in the special bonds
|
||||
list to allow for additional bonds to be formed. :dd
|
||||
|
||||
|
@ -9666,8 +9666,8 @@ you are running. :dd
|
|||
|
||||
{Special list size exceeded in fix bond/create} :dt
|
||||
|
||||
See the special_bonds extra command
|
||||
(or the read_data extra/special/per/atom command)
|
||||
See the "read_data extra/special/per/atom" command
|
||||
(or the "create_box extra/special/per/atom" command)
|
||||
for info on how to leave space in the special bonds
|
||||
list to allow for additional bonds to be formed. :dd
|
||||
|
||||
|
|
|
@ -662,27 +662,25 @@ your own build system. Due to differences between the Windows OS
|
|||
and Windows system libraries to Unix-like environments like Linux
|
||||
or MacOS, when compiling for Windows a few adjustments may be needed:
|
||||
|
||||
Do not set the -DLAMMPS_MEMALIGN define (see LMP_INC makefile variable)
|
||||
Do [not] set the -DLAMMPS_MEMALIGN define (see LMP_INC makefile variable)
|
||||
Add -lwsock32 -lpsapi to the linker flags (see LIB makefile variable)
|
||||
Try adding -static-libgcc or -static or both to the linker flags when your
|
||||
LAMMPS executable complains about missing .dll files :ul
|
||||
Try adding -static-libgcc or -static or both to the linker flags when your LAMMPS executable complains about missing .dll files :ul
|
||||
|
||||
Since none of the current LAMMPS core developers
|
||||
has significant experience building executables on Windows, we are
|
||||
happy to distribute contributed instructions and modifications, but
|
||||
we cannot provide support for those.
|
||||
Since none of the current LAMMPS core developers has significant
|
||||
experience building executables on Windows, we are happy to distribute
|
||||
contributed instructions and modifications to improve the situation,
|
||||
but we cannot provide support for those.
|
||||
|
||||
With the so-called "Anniversary Update" to Windows 10, there is a
|
||||
Ubuntu Linux subsystem available for Windows, that can be installed
|
||||
and then used to compile/install LAMMPS as if you are running on a
|
||||
Ubuntu Linux system instead of Windows.
|
||||
|
||||
As an alternative, you can download "daily builds" (and some older
|
||||
versions) of the installer packages from
|
||||
"rpm.lammps.org/windows.html"_http://rpm.lammps.org/windows.html.
|
||||
These executables are built with most optional packages and the
|
||||
download includes documentation, potential files, some tools and
|
||||
many examples, but no source code.
|
||||
As an alternative, you can download pre-compiled installer packages from
|
||||
"packages.lammps.org/windows.html"_http://packages.lammps.org/windows.html.
|
||||
These executables are built with most optional packages included and the
|
||||
download includes documentation, potential files, some tools and many
|
||||
examples, but no source code.
|
||||
|
||||
:line
|
||||
|
||||
|
@ -1095,7 +1093,7 @@ LAMMPS to be built with one or more of its optional packages.
|
|||
:line
|
||||
|
||||
On a Windows box, you can skip making LAMMPS and simply download an
|
||||
installer package from "here"_http://rpm.lammps.org/windows.html
|
||||
installer package from "here"_http://packages.lammps.org/windows.html
|
||||
|
||||
For running the non-MPI executable, follow these steps:
|
||||
|
||||
|
@ -1107,18 +1105,27 @@ the [in.lj] input from the bench folder. (e.g. by typing: cd "Documents"). :l
|
|||
|
||||
At the command prompt, type "lmp_serial -in in.lj", replacing [in.lj]
|
||||
with the name of your LAMMPS input script. :l
|
||||
|
||||
The serial executable includes support for multi-threading
|
||||
parallelization from the styles in the USER-OMP package.
|
||||
|
||||
To run with, e.g. 4 threads, type "lmp_serial -in in.lj -pk omp 4 -sf omp"
|
||||
:ule
|
||||
|
||||
For the MPI version, which allows you to run LAMMPS under Windows on
|
||||
multiple processors, follow these steps:
|
||||
For the MPI version, which allows you to run LAMMPS under Windows with
|
||||
the more general message passing parallel library (LAMMPS has been
|
||||
designed from the ground up to use MPI efficiently), follow these steps:
|
||||
|
||||
Download and install
|
||||
"MPICH2"_http://www.mcs.anl.gov/research/projects/mpich2/downloads/index.php?s=downloads
|
||||
for Windows. :ulb,l
|
||||
Download and install a compatible MPI library binary package:
|
||||
for 32-bit Windows
|
||||
"mpich2-1.4.1p1-win-ia32.msi"_download.lammps.org/thirdparty/mpich2-1.4.1p1-win-ia32.msi
|
||||
and for 64-bit Windows
|
||||
"mpich2-1.4.1p1-win-x86-64.msi"_download.lammps.org/thirdparty/mpich2-1.4.1p1-win-x86-64.msi
|
||||
:ulb,l
|
||||
|
||||
The LAMMPS Windows installer packages will automatically adjust your
|
||||
path for the default location of this MPI package. After the installation
|
||||
of the MPICH software, it needs to be integrated into the system.
|
||||
of the MPICH2 software, it needs to be integrated into the system.
|
||||
For this you need to start a Command Prompt in {Administrator Mode}
|
||||
(right click on the icon and select it). Change into the MPICH2
|
||||
installation directory, then into the subdirectory [bin] and execute
|
||||
|
@ -1137,7 +1144,7 @@ or
|
|||
|
||||
mpiexec -np 4 lmp_mpi -in in.lj :pre
|
||||
|
||||
replacing in.lj with the name of your LAMMPS input script. For the latter
|
||||
replacing [in.lj] with the name of your LAMMPS input script. For the latter
|
||||
case, you may be prompted to enter your password. :l
|
||||
|
||||
In this mode, output may not immediately show up on the screen, so if
|
||||
|
@ -1149,6 +1156,11 @@ something like:
|
|||
|
||||
lmp_mpi -in in.lj :pre
|
||||
|
||||
And the parallel executable also includes OpenMP multi-threading, which
|
||||
can be combined with MPI using something like:
|
||||
|
||||
mpiexec -localonly 2 lmp_mpi -in in.lj -pk omp 2 -sf omp :pre
|
||||
|
||||
:ule
|
||||
|
||||
:line
|
||||
|
|
|
@ -29,8 +29,10 @@ Bond Styles: fene, harmonic :l
|
|||
Dihedral Styles: charmm, harmonic, opls :l
|
||||
Fixes: nve, npt, nvt, nvt/sllod :l
|
||||
Improper Styles: cvff, harmonic :l
|
||||
Pair Styles: buck/coul/cut, buck/coul/long, buck, eam, gayberne,
|
||||
charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, sw, tersoff :l
|
||||
Pair Styles: airebo, airebo/morse, buck/coul/cut, buck/coul/long,
|
||||
buck, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm,
|
||||
lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, rebo,
|
||||
sw, tersoff :l
|
||||
K-Space Styles: pppm, pppm/disp :l
|
||||
:ule
|
||||
|
||||
|
|
|
@ -150,10 +150,9 @@ atoms. Note that adding a single bond always adds a new 1st neighbor
|
|||
but may also induce *many* new 2nd and 3rd neighbors, depending on the
|
||||
molecular topology of your system. The "extra special per atom"
|
||||
parameter must typically be set to allow for the new maximum total
|
||||
size (1st + 2nd + 3rd neighbors) of this per-atom list. There are 3
|
||||
size (1st + 2nd + 3rd neighbors) of this per-atom list. There are 2
|
||||
ways to do this. See the "read_data"_read_data.html or
|
||||
"create_box"_create_box.html or "special_bonds extra" commands for
|
||||
details.
|
||||
"create_box"_create_box.html commands for details.
|
||||
|
||||
NOTE: Even if you do not use the {atype}, {dtype}, or {itype}
|
||||
keywords, the list of topological neighbors is updated for atoms
|
||||
|
|
|
@ -7,10 +7,13 @@
|
|||
:line
|
||||
|
||||
pair_style airebo command :h3
|
||||
pair_style airebo/intel command :h3
|
||||
pair_style airebo/omp command :h3
|
||||
pair_style airebo/morse command :h3
|
||||
pair_style airebo/morse/intel command :h3
|
||||
pair_style airebo/morse/omp command :h3
|
||||
pair_style rebo command :h3
|
||||
pair_style rebo/intel command :h3
|
||||
pair_style rebo/omp command :h3
|
||||
|
||||
[Syntax:]
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
:line
|
||||
|
||||
pair_style lj/charmm/coul/charmm command :h3
|
||||
pair_style lj/charmm/coul/charmm/intel command :h3
|
||||
pair_style lj/charmm/coul/charmm/omp command :h3
|
||||
pair_style lj/charmm/coul/charmm/implicit command :h3
|
||||
pair_style lj/charmm/coul/charmm/implicit/omp command :h3
|
||||
|
|
|
@ -14,6 +14,7 @@ pair_style eam/omp command :h3
|
|||
pair_style eam/opt command :h3
|
||||
pair_style eam/alloy command :h3
|
||||
pair_style eam/alloy/gpu command :h3
|
||||
pair_style eam/alloy/intel command :h3
|
||||
pair_style eam/alloy/kk command :h3
|
||||
pair_style eam/alloy/omp command :h3
|
||||
pair_style eam/alloy/opt command :h3
|
||||
|
@ -21,6 +22,7 @@ pair_style eam/cd command :h3
|
|||
pair_style eam/cd/omp command :h3
|
||||
pair_style eam/fs command :h3
|
||||
pair_style eam/fs/gpu command :h3
|
||||
pair_style eam/fs/intel command :h3
|
||||
pair_style eam/fs/kk command :h3
|
||||
pair_style eam/fs/omp command :h3
|
||||
pair_style eam/fs/opt command :h3
|
||||
|
|
|
@ -25,9 +25,7 @@ keyword = {amber} or {charmm} or {dreiding} or {fene} or {lj/coul} or {lj} or {c
|
|||
{coul} values = w1,w2,w3
|
||||
w1,w2,w3 = weights (0.0 to 1.0) on pairwise Coulombic interactions
|
||||
{angle} value = {yes} or {no}
|
||||
{dihedral} value = {yes} or {no}
|
||||
{extra} value = N
|
||||
N = number of extra 1-2,1-3,1-4 interactions to save space for :pre
|
||||
{dihedral} value = {yes} or {no} :pre
|
||||
:ule
|
||||
|
||||
Examples:
|
||||
|
@ -36,8 +34,7 @@ special_bonds amber
|
|||
special_bonds charmm
|
||||
special_bonds fene dihedral no
|
||||
special_bonds lj/coul 0.0 0.0 0.5 angle yes dihedral yes
|
||||
special_bonds lj 0.0 0.0 0.5 coul 0.0 0.0 0.0 dihedral yes
|
||||
special_bonds lj/coul 0 1 1 extra 2 :pre
|
||||
special_bonds lj 0.0 0.0 0.5 coul 0.0 0.0 0.0 dihedral yes :pre
|
||||
|
||||
[Description:]
|
||||
|
||||
|
@ -178,14 +175,6 @@ interaction between atoms 2 and 5 will be unaffected (full weighting
|
|||
of 1.0). If the {dihedral} keyword is specified as {no} which is the
|
||||
default, then the 2,5 interaction will also be weighted by 0.5.
|
||||
|
||||
The {extra} keyword can be used when additional bonds will be created
|
||||
during a simulation run, e.g. by the "fix
|
||||
bond/create"_fix_bond_create.html command. It can also be used if
|
||||
molecules will be added to the system, e.g. via the "fix
|
||||
deposit"_fix_deposit.html, or "fix pour"_fix_pour.html commands, which
|
||||
will have atoms with more special neighbors than any atom in the
|
||||
current system has.
|
||||
|
||||
:line
|
||||
|
||||
NOTE: LAMMPS stores and maintains a data structure with a list of the
|
||||
|
@ -194,8 +183,9 @@ the system). If new bonds are created (or molecules added containing
|
|||
atoms with more special neighbors), the size of this list needs to
|
||||
grow. Note that adding a single bond always adds a new 1st neighbor
|
||||
but may also induce *many* new 2nd and 3rd neighbors, depending on the
|
||||
molecular topology of your system. Using the {extra} keyword leaves
|
||||
empty space in the list for this N additional 1st, 2nd, or 3rd
|
||||
molecular topology of your system. Using the {extra/special/per/atom}
|
||||
keyword to either "read_data"_read_data.html or "create_box"_create_box.html
|
||||
reserves empty space in the list for this N additional 1st, 2nd, or 3rd
|
||||
neighbors to be added. If you do not do this, you may get an error
|
||||
when bonds (or molecules) are added.
|
||||
|
||||
|
@ -203,8 +193,7 @@ when bonds (or molecules) are added.
|
|||
|
||||
NOTE: If you reuse this command in an input script, you should set all
|
||||
the options you need each time. This command cannot be used a 2nd
|
||||
time incrementally, e.g. to add some extra storage locations via the
|
||||
{extra} keyword. E.g. these two commands:
|
||||
time incrementally. E.g. these two commands:
|
||||
|
||||
special_bonds lj 0.0 1.0 1.0
|
||||
special_bonds coul 0.0 0.0 1.0
|
||||
|
@ -221,25 +210,6 @@ Coul: coul 0.0 0.0 1.0
|
|||
because the LJ settings are reset to their default values
|
||||
each time the command is issued.
|
||||
|
||||
Likewise
|
||||
|
||||
special_bonds amber
|
||||
special_bonds extra 2 :pre
|
||||
|
||||
is not the same as this single command:
|
||||
|
||||
special_bonds amber extra 2 :pre
|
||||
|
||||
since in the former case, the 2nd command will reset all the LJ and
|
||||
Coulombic weights to 0.0 (the default).
|
||||
|
||||
One exception to this rule is the {extra} option itself. It is not
|
||||
reset to its default value of 0 each time the special_bonds command is
|
||||
invoked. This is because it can also be set by the
|
||||
"read_data"_read_data.html and "create_box"_create_box.html commands,
|
||||
so this command will not override those settings unless you explicitly
|
||||
use {extra} as an option.
|
||||
|
||||
[Restrictions:] none
|
||||
|
||||
[Related commands:]
|
||||
|
|
|
@ -176,12 +176,13 @@ By recognizing the fix {drude}, LAMMPS will find and store matching
|
|||
DC-DP pairs and will treat DP as equivalent to their DC in the
|
||||
{special bonds} relations. It may be necessary to extend the space
|
||||
for storing such special relations. In this case extra space should
|
||||
be reserved by using the {extra} keyword of the {special_bonds}
|
||||
be reserved by using the {extra/special/per/atom} keyword of either
|
||||
the "read_data"_read_data.html or "create_box"_create_box.html
|
||||
command. With our phenol, there is 1 more special neighbor for which
|
||||
space is required. Otherwise LAMMPS crashes and gives the required
|
||||
value.
|
||||
|
||||
special_bonds lj/coul 0.0 0.0 0.5 extra 1 :pre
|
||||
read_data data-p.lmp extra/special/per/atom 1 :pre
|
||||
|
||||
Let us assume we want to run a simple NVT simulation at 300 K. Note
|
||||
that Drude oscillators need to be thermalized at a low temperature in
|
||||
|
|
|
@ -45,12 +45,12 @@ while iarg < nargs:
|
|||
if args[iarg] == "-m":
|
||||
if iarg+2 > len(args): error()
|
||||
machine = args[iarg+1]
|
||||
iarg += 2
|
||||
iarg += 2
|
||||
elif args[iarg] == "-e":
|
||||
if iarg+2 > len(args): error()
|
||||
extraflag = True
|
||||
suffix = args[iarg+1]
|
||||
iarg += 2
|
||||
iarg += 2
|
||||
else: error()
|
||||
|
||||
# set lib from working dir
|
||||
|
|
|
@ -32,7 +32,7 @@ where Makefile.g++ uses the GNU C++ compiler and is a good template to start.
|
|||
|
||||
**Optional**: if you use the Install.py script provided in this folder, you
|
||||
can give the machine name as the '-m' argument. This can be the suffix of one
|
||||
of the files from either this folder, or from src/MAKE.
|
||||
of the files from either this folder, or from src/MAKE/MACHINES.
|
||||
*This is only supported by the Install.py within the lib/colvars folder*.
|
||||
|
||||
When you are done building this library, two files should
|
||||
|
@ -53,10 +53,10 @@ settings in Makefile.common should work.
|
|||
For the reference manual see:
|
||||
http://colvars.github.io/colvars-refman-lammps
|
||||
|
||||
A copy of reference manual is also in:
|
||||
A copy of the reference manual is also in:
|
||||
doc/PDF/colvars-refman-lammps.pdf
|
||||
|
||||
Also included is a Doxygen-based developer documentation:
|
||||
Also available is a Doxygen-based developer documentation:
|
||||
http://colvars.github.io/doxygen/html/
|
||||
|
||||
The reference article is:
|
||||
|
|
|
@ -88,7 +88,12 @@ public:
|
|||
static std::vector<feature *> cv_features;
|
||||
|
||||
/// \brief Implementation of the feature list accessor for colvar
|
||||
std::vector<feature *> &features() {
|
||||
virtual const std::vector<feature *> &features()
|
||||
{
|
||||
return cv_features;
|
||||
}
|
||||
virtual std::vector<feature *> &modify_features()
|
||||
{
|
||||
return cv_features;
|
||||
}
|
||||
|
||||
|
|
|
@ -206,7 +206,12 @@ public:
|
|||
static std::vector<feature *> ag_features;
|
||||
|
||||
/// \brief Implementation of the feature list accessor for atom group
|
||||
virtual std::vector<feature *> &features() {
|
||||
virtual const std::vector<feature *> &features()
|
||||
{
|
||||
return ag_features;
|
||||
}
|
||||
virtual std::vector<feature *> &modify_features()
|
||||
{
|
||||
return ag_features;
|
||||
}
|
||||
|
||||
|
|
|
@ -384,6 +384,7 @@ std::ostream & colvarbias::write_traj(std::ostream &os)
|
|||
os << " ";
|
||||
if (b_output_energy)
|
||||
os << " "
|
||||
<< std::setprecision(cvm::en_prec) << std::setw(cvm::en_width)
|
||||
<< bias_energy;
|
||||
return os;
|
||||
}
|
||||
|
|
|
@ -175,7 +175,11 @@ public:
|
|||
static std::vector<feature *> cvb_features;
|
||||
|
||||
/// \brief Implementation of the feature list accessor for colvarbias
|
||||
virtual std::vector<feature *> &features()
|
||||
virtual const std::vector<feature *> &features()
|
||||
{
|
||||
return cvb_features;
|
||||
}
|
||||
virtual std::vector<feature *> &modify_features()
|
||||
{
|
||||
return cvb_features;
|
||||
}
|
||||
|
|
|
@ -99,12 +99,9 @@ int colvarbias_restraint_centers::init(std::string const &conf)
|
|||
if (null_centers) {
|
||||
// try to initialize the restraint centers for the first time
|
||||
colvar_centers.resize(num_variables());
|
||||
colvar_centers_raw.resize(num_variables());
|
||||
for (i = 0; i < num_variables(); i++) {
|
||||
colvar_centers[i].type(variables(i)->value());
|
||||
colvar_centers[i].reset();
|
||||
colvar_centers_raw[i].type(variables(i)->value());
|
||||
colvar_centers_raw[i].reset();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -113,7 +110,6 @@ int colvarbias_restraint_centers::init(std::string const &conf)
|
|||
if (cvm::debug()) {
|
||||
cvm::log("colvarbias_restraint: parsing initial centers, i = "+cvm::to_str(i)+".\n");
|
||||
}
|
||||
colvar_centers_raw[i] = colvar_centers[i];
|
||||
colvar_centers[i].apply_constraints();
|
||||
}
|
||||
null_centers = false;
|
||||
|
@ -141,8 +137,6 @@ int colvarbias_restraint_centers::change_configuration(std::string const &conf)
|
|||
for (size_t i = 0; i < num_variables(); i++) {
|
||||
colvar_centers[i].type(variables(i)->value());
|
||||
colvar_centers[i].apply_constraints();
|
||||
colvar_centers_raw[i].type(variables(i)->value());
|
||||
colvar_centers_raw[i] = colvar_centers[i];
|
||||
}
|
||||
}
|
||||
return COLVARS_OK;
|
||||
|
@ -232,7 +226,6 @@ int colvarbias_restraint_moving::set_state_params(std::string const &conf)
|
|||
{
|
||||
if (b_chg_centers || b_chg_force_k) {
|
||||
if (target_nstages) {
|
||||
// cvm::log ("Reading current stage from the restart.\n");
|
||||
if (!get_keyval(conf, "stage", stage))
|
||||
cvm::error("Error: current stage is missing from the restart.\n");
|
||||
}
|
||||
|
@ -265,100 +258,127 @@ int colvarbias_restraint_centers_moving::init(std::string const &conf)
|
|||
|
||||
size_t i;
|
||||
if (get_keyval(conf, "targetCenters", target_centers, colvar_centers)) {
|
||||
if (colvar_centers.size() != num_variables()) {
|
||||
if (target_centers.size() != num_variables()) {
|
||||
cvm::error("Error: number of target centers does not match "
|
||||
"that of collective variables.\n");
|
||||
"that of collective variables.\n", INPUT_ERROR);
|
||||
}
|
||||
b_chg_centers = true;
|
||||
for (i = 0; i < target_centers.size(); i++) {
|
||||
target_centers[i].apply_constraints();
|
||||
centers_incr.push_back(colvar_centers[i]);
|
||||
centers_incr[i].reset();
|
||||
}
|
||||
}
|
||||
|
||||
if (b_chg_centers) {
|
||||
// parse moving restraint options
|
||||
// parse moving schedule options
|
||||
colvarbias_restraint_moving::init(conf);
|
||||
if (initial_centers.size() == 0) {
|
||||
// One-time init
|
||||
initial_centers = colvar_centers;
|
||||
}
|
||||
// Call to check that the definition is correct
|
||||
for (i = 0; i < num_variables(); i++) {
|
||||
colvarvalue const midpoint =
|
||||
colvarvalue::interpolate(initial_centers[i],
|
||||
target_centers[i],
|
||||
0.5);
|
||||
}
|
||||
} else {
|
||||
target_centers.clear();
|
||||
return COLVARS_OK;
|
||||
}
|
||||
|
||||
get_keyval(conf, "outputCenters", b_output_centers, b_output_centers);
|
||||
get_keyval(conf, "outputAccumulatedWork", b_output_acc_work, b_output_acc_work);
|
||||
get_keyval(conf, "outputAccumulatedWork", b_output_acc_work,
|
||||
b_output_acc_work); // TODO this conflicts with stages
|
||||
|
||||
return COLVARS_OK;
|
||||
}
|
||||
|
||||
|
||||
int colvarbias_restraint_centers_moving::update_centers(cvm::real lambda)
|
||||
{
|
||||
if (cvm::debug()) {
|
||||
cvm::log("Updating centers for the restraint bias \""+
|
||||
this->name+"\": "+cvm::to_str(colvar_centers)+".\n");
|
||||
}
|
||||
size_t i;
|
||||
for (i = 0; i < num_variables(); i++) {
|
||||
colvarvalue const c_new = colvarvalue::interpolate(initial_centers[i],
|
||||
target_centers[i],
|
||||
lambda);
|
||||
centers_incr[i] = (c_new).dist2_grad(colvar_centers[i]);
|
||||
colvar_centers[i] = c_new;
|
||||
variables(i)->wrap(colvar_centers[i]);
|
||||
}
|
||||
if (cvm::debug()) {
|
||||
cvm::log("New centers for the restraint bias \""+
|
||||
this->name+"\": "+cvm::to_str(colvar_centers)+".\n");
|
||||
}
|
||||
return cvm::get_error();
|
||||
}
|
||||
|
||||
|
||||
int colvarbias_restraint_centers_moving::update()
|
||||
{
|
||||
if (b_chg_centers) {
|
||||
|
||||
if (cvm::debug()) {
|
||||
cvm::log("Updating centers for the restraint bias \""+
|
||||
this->name+"\": "+cvm::to_str(colvar_centers)+".\n");
|
||||
}
|
||||
|
||||
if (!centers_incr.size()) {
|
||||
// if this is the first calculation, calculate the advancement
|
||||
// at each simulation step (or stage, if applicable)
|
||||
// (take current stage into account: it can be non-zero
|
||||
// if we are restarting a staged calculation)
|
||||
centers_incr.resize(num_variables());
|
||||
for (size_t i = 0; i < num_variables(); i++) {
|
||||
centers_incr[i].type(variables(i)->value());
|
||||
centers_incr[i] = (target_centers[i] - colvar_centers_raw[i]) /
|
||||
cvm::real( target_nstages ? (target_nstages - stage) :
|
||||
(target_nsteps - cvm::step_absolute()));
|
||||
}
|
||||
if (cvm::debug()) {
|
||||
cvm::log("Center increment for the restraint bias \""+
|
||||
this->name+"\": "+cvm::to_str(centers_incr)+" at stage "+cvm::to_str(stage)+ ".\n");
|
||||
}
|
||||
}
|
||||
|
||||
if (target_nstages) {
|
||||
if ((cvm::step_relative() > 0)
|
||||
&& (cvm::step_absolute() % target_nsteps) == 0
|
||||
&& stage < target_nstages) {
|
||||
|
||||
for (size_t i = 0; i < num_variables(); i++) {
|
||||
colvar_centers_raw[i] += centers_incr[i];
|
||||
colvar_centers[i] = colvar_centers_raw[i];
|
||||
variables(i)->wrap(colvar_centers[i]);
|
||||
colvar_centers[i].apply_constraints();
|
||||
// Staged update
|
||||
if (stage <= target_nstages) {
|
||||
if ((cvm::step_relative() > 0) &&
|
||||
((cvm::step_absolute() % target_nsteps) == 1)) {
|
||||
cvm::real const lambda =
|
||||
cvm::real(stage)/cvm::real(target_nstages);
|
||||
update_centers(lambda);
|
||||
stage++;
|
||||
cvm::log("Moving restraint \"" + this->name +
|
||||
"\" stage " + cvm::to_str(stage) +
|
||||
" : setting centers to " + cvm::to_str(colvar_centers) +
|
||||
" at step " + cvm::to_str(cvm::step_absolute()));
|
||||
} else {
|
||||
for (size_t i = 0; i < num_variables(); i++) {
|
||||
centers_incr[i].reset();
|
||||
}
|
||||
}
|
||||
stage++;
|
||||
cvm::log("Moving restraint \"" + this->name +
|
||||
"\" stage " + cvm::to_str(stage) +
|
||||
" : setting centers to " + cvm::to_str(colvar_centers) +
|
||||
" at step " + cvm::to_str(cvm::step_absolute()));
|
||||
}
|
||||
} else if ((cvm::step_relative() > 0) && (cvm::step_absolute() <= target_nsteps)) {
|
||||
// move the restraint centers in the direction of the targets
|
||||
// (slow growth)
|
||||
} else {
|
||||
// Continuous update
|
||||
if (cvm::step_absolute() <= target_nsteps) {
|
||||
cvm::real const lambda =
|
||||
cvm::real(cvm::step_absolute())/cvm::real(target_nsteps);
|
||||
update_centers(lambda);
|
||||
} else {
|
||||
for (size_t i = 0; i < num_variables(); i++) {
|
||||
centers_incr[i].reset();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (cvm::step_relative() == 0) {
|
||||
for (size_t i = 0; i < num_variables(); i++) {
|
||||
colvar_centers_raw[i] += centers_incr[i];
|
||||
colvar_centers[i] = colvar_centers_raw[i];
|
||||
variables(i)->wrap(colvar_centers[i]);
|
||||
colvar_centers[i].apply_constraints();
|
||||
// finite differences are undefined when restarting
|
||||
centers_incr[i].reset();
|
||||
}
|
||||
}
|
||||
|
||||
if (cvm::debug()) {
|
||||
cvm::log("New centers for the restraint bias \""+
|
||||
this->name+"\": "+cvm::to_str(colvar_centers)+".\n");
|
||||
cvm::log("Center increment for the restraint bias \""+
|
||||
this->name+"\": "+cvm::to_str(centers_incr)+
|
||||
" at stage "+cvm::to_str(stage)+ ".\n");
|
||||
}
|
||||
}
|
||||
|
||||
return COLVARS_OK;
|
||||
return cvm::get_error();
|
||||
}
|
||||
|
||||
|
||||
int colvarbias_restraint_centers_moving::update_acc_work()
|
||||
{
|
||||
if (b_output_acc_work) {
|
||||
if ((cvm::step_relative() > 0) || (cvm::step_absolute() == 0)) {
|
||||
if ((cvm::step_relative() > 0) &&
|
||||
(cvm::step_absolute() <= target_nsteps)) {
|
||||
for (size_t i = 0; i < num_variables(); i++) {
|
||||
// project forces on the calculated increments at this step
|
||||
acc_work += colvar_forces[i] * centers_incr[i];
|
||||
|
@ -383,13 +403,6 @@ std::string const colvarbias_restraint_centers_moving::get_state_params() const
|
|||
<< colvar_centers[i];
|
||||
}
|
||||
os << "\n";
|
||||
os << "centers_raw ";
|
||||
for (i = 0; i < num_variables(); i++) {
|
||||
os << " "
|
||||
<< std::setprecision(cvm::cv_prec) << std::setw(cvm::cv_width)
|
||||
<< colvar_centers_raw[i];
|
||||
}
|
||||
os << "\n";
|
||||
|
||||
if (b_output_acc_work) {
|
||||
os << "accumulatedWork "
|
||||
|
@ -398,7 +411,7 @@ std::string const colvarbias_restraint_centers_moving::get_state_params() const
|
|||
}
|
||||
}
|
||||
|
||||
return colvarbias_restraint_moving::get_state_params() + os.str();
|
||||
return os.str();
|
||||
}
|
||||
|
||||
|
||||
|
@ -410,8 +423,6 @@ int colvarbias_restraint_centers_moving::set_state_params(std::string const &con
|
|||
// cvm::log ("Reading the updated restraint centers from the restart.\n");
|
||||
if (!get_keyval(conf, "centers", colvar_centers))
|
||||
cvm::error("Error: restraint centers are missing from the restart.\n");
|
||||
if (!get_keyval(conf, "centers_raw", colvar_centers_raw))
|
||||
cvm::error("Error: \"raw\" restraint centers are missing from the restart.\n");
|
||||
if (b_output_acc_work) {
|
||||
if (!get_keyval(conf, "accumulatedWork", acc_work))
|
||||
cvm::error("Error: accumulatedWork is missing from the restart.\n");
|
||||
|
@ -609,7 +620,7 @@ std::string const colvarbias_restraint_k_moving::get_state_params() const
|
|||
<< std::setprecision(cvm::en_prec)
|
||||
<< std::setw(cvm::en_width) << force_k << "\n";
|
||||
}
|
||||
return colvarbias_restraint_moving::get_state_params() + os.str();
|
||||
return os.str();
|
||||
}
|
||||
|
||||
|
||||
|
@ -770,6 +781,7 @@ cvm::real colvarbias_restraint_harmonic::d_restraint_potential_dk(size_t i) cons
|
|||
std::string const colvarbias_restraint_harmonic::get_state_params() const
|
||||
{
|
||||
return colvarbias_restraint::get_state_params() +
|
||||
colvarbias_restraint_moving::get_state_params() +
|
||||
colvarbias_restraint_centers_moving::get_state_params() +
|
||||
colvarbias_restraint_k_moving::get_state_params();
|
||||
}
|
||||
|
@ -779,6 +791,7 @@ int colvarbias_restraint_harmonic::set_state_params(std::string const &conf)
|
|||
{
|
||||
int error_code = COLVARS_OK;
|
||||
error_code |= colvarbias_restraint::set_state_params(conf);
|
||||
error_code |= colvarbias_restraint_moving::set_state_params(conf);
|
||||
error_code |= colvarbias_restraint_centers_moving::set_state_params(conf);
|
||||
error_code |= colvarbias_restraint_k_moving::set_state_params(conf);
|
||||
return error_code;
|
||||
|
@ -1037,6 +1050,7 @@ cvm::real colvarbias_restraint_harmonic_walls::d_restraint_potential_dk(size_t i
|
|||
std::string const colvarbias_restraint_harmonic_walls::get_state_params() const
|
||||
{
|
||||
return colvarbias_restraint::get_state_params() +
|
||||
colvarbias_restraint_moving::get_state_params() +
|
||||
colvarbias_restraint_k_moving::get_state_params();
|
||||
}
|
||||
|
||||
|
@ -1045,6 +1059,7 @@ int colvarbias_restraint_harmonic_walls::set_state_params(std::string const &con
|
|||
{
|
||||
int error_code = COLVARS_OK;
|
||||
error_code |= colvarbias_restraint::set_state_params(conf);
|
||||
error_code |= colvarbias_restraint_moving::set_state_params(conf);
|
||||
error_code |= colvarbias_restraint_k_moving::set_state_params(conf);
|
||||
return error_code;
|
||||
}
|
||||
|
@ -1164,6 +1179,7 @@ cvm::real colvarbias_restraint_linear::d_restraint_potential_dk(size_t i) const
|
|||
std::string const colvarbias_restraint_linear::get_state_params() const
|
||||
{
|
||||
return colvarbias_restraint::get_state_params() +
|
||||
colvarbias_restraint_moving::get_state_params() +
|
||||
colvarbias_restraint_centers_moving::get_state_params() +
|
||||
colvarbias_restraint_k_moving::get_state_params();
|
||||
}
|
||||
|
@ -1173,6 +1189,7 @@ int colvarbias_restraint_linear::set_state_params(std::string const &conf)
|
|||
{
|
||||
int error_code = COLVARS_OK;
|
||||
error_code |= colvarbias_restraint::set_state_params(conf);
|
||||
error_code |= colvarbias_restraint_moving::set_state_params(conf);
|
||||
error_code |= colvarbias_restraint_centers_moving::set_state_params(conf);
|
||||
error_code |= colvarbias_restraint_k_moving::set_state_params(conf);
|
||||
return error_code;
|
||||
|
|
|
@ -74,9 +74,6 @@ protected:
|
|||
|
||||
/// \brief Restraint centers
|
||||
std::vector<colvarvalue> colvar_centers;
|
||||
|
||||
/// \brief Restraint centers outside the domain of the colvars (no wrapping or constraints applied)
|
||||
std::vector<colvarvalue> colvar_centers_raw;
|
||||
};
|
||||
|
||||
|
||||
|
@ -156,10 +153,16 @@ protected:
|
|||
/// \brief New restraint centers
|
||||
std::vector<colvarvalue> target_centers;
|
||||
|
||||
/// \brief Initial value of the restraint centers
|
||||
std::vector<colvarvalue> initial_centers;
|
||||
|
||||
/// \brief Amplitude of the restraint centers' increment at each step
|
||||
/// (or stage) towards the new values (calculated from target_nsteps)
|
||||
/// towards the new values (calculated from target_nsteps)
|
||||
std::vector<colvarvalue> centers_incr;
|
||||
|
||||
/// \brief Update the centers by interpolating between initial and target
|
||||
virtual int update_centers(cvm::real lambda);
|
||||
|
||||
/// Whether to write the current restraint centers to the trajectory file
|
||||
bool b_output_centers;
|
||||
|
||||
|
|
|
@ -132,9 +132,15 @@ public:
|
|||
static std::vector<feature *> cvc_features;
|
||||
|
||||
/// \brief Implementation of the feature list accessor for colvar
|
||||
virtual std::vector<feature *> &features() {
|
||||
virtual const std::vector<feature *> &features()
|
||||
{
|
||||
return cvc_features;
|
||||
}
|
||||
virtual std::vector<feature *> &modify_features()
|
||||
{
|
||||
return cvc_features;
|
||||
}
|
||||
|
||||
|
||||
/// \brief Obtain data needed for the calculation for the backend
|
||||
virtual void read_data();
|
||||
|
|
|
@ -374,8 +374,8 @@ int colvardeps::decr_ref_count(int feature_id) {
|
|||
}
|
||||
|
||||
void colvardeps::init_feature(int feature_id, const char *description, feature_type type) {
|
||||
features()[feature_id]->description = description;
|
||||
features()[feature_id]->type = type;
|
||||
modify_features()[feature_id]->description = description;
|
||||
modify_features()[feature_id]->type = type;
|
||||
}
|
||||
|
||||
// Shorthand macros for describing dependencies
|
||||
|
@ -401,7 +401,7 @@ void colvardeps::init_cvb_requires() {
|
|||
int i;
|
||||
if (features().size() == 0) {
|
||||
for (i = 0; i < f_cvb_ntot; i++) {
|
||||
features().push_back(new feature);
|
||||
modify_features().push_back(new feature);
|
||||
}
|
||||
|
||||
init_feature(f_cvb_active, "active", f_type_dynamic);
|
||||
|
@ -438,7 +438,7 @@ void colvardeps::init_cv_requires() {
|
|||
size_t i;
|
||||
if (features().size() == 0) {
|
||||
for (i = 0; i < f_cv_ntot; i++) {
|
||||
features().push_back(new feature);
|
||||
modify_features().push_back(new feature);
|
||||
}
|
||||
|
||||
init_feature(f_cv_active, "active", f_type_dynamic);
|
||||
|
@ -554,7 +554,7 @@ void colvardeps::init_cvc_requires() {
|
|||
// Initialize static array once and for all
|
||||
if (features().size() == 0) {
|
||||
for (i = 0; i < colvardeps::f_cvc_ntot; i++) {
|
||||
features().push_back(new feature);
|
||||
modify_features().push_back(new feature);
|
||||
}
|
||||
|
||||
init_feature(f_cvc_active, "active", f_type_dynamic);
|
||||
|
@ -633,7 +633,7 @@ void colvardeps::init_ag_requires() {
|
|||
// Initialize static array once and for all
|
||||
if (features().size() == 0) {
|
||||
for (i = 0; i < f_ag_ntot; i++) {
|
||||
features().push_back(new feature);
|
||||
modify_features().push_back(new feature);
|
||||
}
|
||||
|
||||
init_feature(f_ag_active, "active", f_type_dynamic);
|
||||
|
|
|
@ -135,7 +135,8 @@ public:
|
|||
// with a non-static array
|
||||
// Intermediate classes (colvarbias and colvarcomp, which are also base classes)
|
||||
// implement this as virtual to allow overriding
|
||||
virtual std::vector<feature *>&features() = 0;
|
||||
virtual const std::vector<feature *>&features() = 0;
|
||||
virtual std::vector<feature *>&modify_features() = 0;
|
||||
|
||||
void add_child(colvardeps *child);
|
||||
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
#define COLVARS_VERSION "2017-07-15"
|
||||
#ifndef COLVARS_VERSION
|
||||
#define COLVARS_VERSION "2017-08-06"
|
||||
// This file is part of the Collective Variables module (Colvars).
|
||||
// The original version of Colvars and its updates are located at:
|
||||
// https://github.com/colvars/colvars
|
||||
|
@ -6,3 +7,4 @@
|
|||
// If you wish to distribute your changes, please submit them to the
|
||||
// Colvars repository at GitHub.
|
||||
|
||||
#endif
|
||||
|
|
|
@ -472,7 +472,7 @@ int colvarscript::proc_features(colvardeps *obj,
|
|||
}
|
||||
|
||||
if ((subcmd == "get") || (subcmd == "set")) {
|
||||
std::vector<colvardeps::feature *> &features = obj->features();
|
||||
std::vector<colvardeps::feature *> const &features = obj->features();
|
||||
std::string const req_feature(obj_to_str(objv[3]));
|
||||
colvardeps::feature *f = NULL;
|
||||
int fid = 0;
|
||||
|
|
|
@ -19,6 +19,17 @@ bool colvarmodule::rotation::monitor_crossings = false;
|
|||
cvm::real colvarmodule::rotation::crossing_threshold = 1.0E-02;
|
||||
|
||||
|
||||
/// Numerical recipes diagonalization
|
||||
static int jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot);
|
||||
|
||||
/// Eigenvector sort
|
||||
static int eigsrt(cvm::real *d, cvm::real **v);
|
||||
|
||||
/// Transpose the matrix
|
||||
static int transpose(cvm::real **v);
|
||||
|
||||
|
||||
|
||||
std::string cvm::rvector::to_simple_string() const
|
||||
{
|
||||
std::ostringstream os;
|
||||
|
@ -286,7 +297,12 @@ void colvarmodule::rotation::diagonalize_matrix(cvm::matrix2d<cvm::real> &S,
|
|||
|
||||
// diagonalize
|
||||
int jac_nrot = 0;
|
||||
jacobi(S.c_array(), S_eigval.c_array(), S_eigvec.c_array(), &jac_nrot);
|
||||
if (jacobi(S.c_array(), S_eigval.c_array(), S_eigvec.c_array(), &jac_nrot) !=
|
||||
COLVARS_OK) {
|
||||
cvm::error("Too many iterations in routine jacobi.\n"
|
||||
"This is usually the result of an ill-defined set of atoms for "
|
||||
"rotational alignment (RMSD, rotateReference, etc).\n");
|
||||
}
|
||||
eigsrt(S_eigval.c_array(), S_eigvec.c_array());
|
||||
// jacobi saves eigenvectors by columns
|
||||
transpose(S_eigvec.c_array());
|
||||
|
@ -528,7 +544,7 @@ void colvarmodule::rotation::calc_optimal_rotation(std::vector<cvm::atom_pos> co
|
|||
|
||||
#define n 4
|
||||
|
||||
void jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot)
|
||||
int jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot)
|
||||
{
|
||||
int j,iq,ip,i;
|
||||
cvm::real tresh,theta,tau,t,sm,s,h,g,c;
|
||||
|
@ -554,7 +570,7 @@ void jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot)
|
|||
sm += std::fabs(a[ip][iq]);
|
||||
}
|
||||
if (sm == 0.0) {
|
||||
return;
|
||||
return COLVARS_OK;
|
||||
}
|
||||
if (i < 4)
|
||||
tresh=0.2*sm/(n*n);
|
||||
|
@ -606,10 +622,11 @@ void jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot)
|
|||
z[ip]=0.0;
|
||||
}
|
||||
}
|
||||
cvm::error("Too many iterations in routine jacobi.\n");
|
||||
return COLVARS_ERROR;
|
||||
}
|
||||
|
||||
void eigsrt(cvm::real *d, cvm::real **v)
|
||||
|
||||
int eigsrt(cvm::real *d, cvm::real **v)
|
||||
{
|
||||
int k,j,i;
|
||||
cvm::real p;
|
||||
|
@ -628,9 +645,11 @@ void eigsrt(cvm::real *d, cvm::real **v)
|
|||
}
|
||||
}
|
||||
}
|
||||
return COLVARS_OK;
|
||||
}
|
||||
|
||||
void transpose(cvm::real **v)
|
||||
|
||||
int transpose(cvm::real **v)
|
||||
{
|
||||
cvm::real p;
|
||||
int i,j;
|
||||
|
@ -641,6 +660,7 @@ void transpose(cvm::real **v)
|
|||
v[j][i]=p;
|
||||
}
|
||||
}
|
||||
return COLVARS_OK;
|
||||
}
|
||||
|
||||
#undef n
|
||||
|
|
|
@ -1020,16 +1020,6 @@ inline cvm::rvector operator * (cvm::rmatrix const &m,
|
|||
}
|
||||
|
||||
|
||||
/// Numerical recipes diagonalization
|
||||
void jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot);
|
||||
|
||||
/// Eigenvector sort
|
||||
void eigsrt(cvm::real *d, cvm::real **v);
|
||||
|
||||
/// Transpose the matrix
|
||||
void transpose(cvm::real **v);
|
||||
|
||||
|
||||
|
||||
|
||||
/// \brief 1-dimensional vector of real numbers with four components and
|
||||
|
|
|
@ -570,6 +570,50 @@ colvarvalue colvarvalue::dist2_grad(colvarvalue const &x2) const
|
|||
}
|
||||
|
||||
|
||||
/// Return the midpoint between x1 and x2, optionally weighted by lambda
|
||||
/// (which must be between 0.0 and 1.0)
|
||||
colvarvalue const colvarvalue::interpolate(colvarvalue const &x1,
|
||||
colvarvalue const &x2,
|
||||
cvm::real const lambda)
|
||||
{
|
||||
colvarvalue::check_types(x1, x2);
|
||||
|
||||
if ((lambda < 0.0) || (lambda > 1.0)) {
|
||||
cvm::error("Error: trying to interpolate between two colvarvalues with a "
|
||||
"lamdba outside [0:1].\n", BUG_ERROR);
|
||||
}
|
||||
|
||||
colvarvalue interp = ((1.0-lambda)*x1 + lambda*x2);
|
||||
cvm::real const d2 = x1.dist2(x2);
|
||||
|
||||
switch (x1.type()) {
|
||||
case colvarvalue::type_scalar:
|
||||
case colvarvalue::type_3vector:
|
||||
case colvarvalue::type_vector:
|
||||
case colvarvalue::type_unit3vectorderiv:
|
||||
case colvarvalue::type_quaternionderiv:
|
||||
return interp;
|
||||
break;
|
||||
case colvarvalue::type_unit3vector:
|
||||
case colvarvalue::type_quaternion:
|
||||
if (interp.norm()/std::sqrt(d2) < 1.0e-6) {
|
||||
cvm::error("Error: interpolation between "+cvm::to_str(x1)+" and "+
|
||||
cvm::to_str(x2)+" with lambda = "+cvm::to_str(lambda)+
|
||||
" is undefined: result = "+cvm::to_str(interp)+"\n",
|
||||
INPUT_ERROR);
|
||||
}
|
||||
interp.apply_constraints();
|
||||
return interp;
|
||||
break;
|
||||
case colvarvalue::type_notset:
|
||||
default:
|
||||
x1.undef_op();
|
||||
break;
|
||||
}
|
||||
return colvarvalue(colvarvalue::type_notset);
|
||||
}
|
||||
|
||||
|
||||
std::string colvarvalue::to_simple_string() const
|
||||
{
|
||||
switch (type()) {
|
||||
|
|
|
@ -193,6 +193,12 @@ public:
|
|||
/// Derivative with respect to this \link colvarvalue \endlink of the square distance
|
||||
colvarvalue dist2_grad(colvarvalue const &x2) const;
|
||||
|
||||
/// Return the midpoint between x1 and x2, optionally weighted by lambda
|
||||
/// (which must be between 0.0 and 1.0)
|
||||
static colvarvalue const interpolate(colvarvalue const &x1,
|
||||
colvarvalue const &x2,
|
||||
cvm::real const lambda = 0.5);
|
||||
|
||||
/// Assignment operator (type of x is checked)
|
||||
colvarvalue & operator = (colvarvalue const &x);
|
||||
|
||||
|
@ -285,10 +291,10 @@ public:
|
|||
cvm::real & operator [] (int const i);
|
||||
|
||||
/// Ensure that the two types are the same within a binary operator
|
||||
int static check_types(colvarvalue const &x1, colvarvalue const &x2);
|
||||
static int check_types(colvarvalue const &x1, colvarvalue const &x2);
|
||||
|
||||
/// Ensure that the two types are the same within an assignment, or that the left side is type_notset
|
||||
int static check_types_assign(Type const &vt1, Type const &vt2);
|
||||
static int check_types_assign(Type const &vt1, Type const &vt2);
|
||||
|
||||
/// Undefined operation
|
||||
void undef_op() const;
|
||||
|
@ -317,14 +323,14 @@ public:
|
|||
|
||||
/// \brief Optimized routine for the inner product of one collective
|
||||
/// variable with an array
|
||||
void static inner_opt(colvarvalue const &x,
|
||||
static void inner_opt(colvarvalue const &x,
|
||||
std::vector<colvarvalue>::iterator &xv,
|
||||
std::vector<colvarvalue>::iterator const &xv_end,
|
||||
std::vector<cvm::real>::iterator &result);
|
||||
|
||||
/// \brief Optimized routine for the inner product of one collective
|
||||
/// variable with an array
|
||||
void static inner_opt(colvarvalue const &x,
|
||||
static void inner_opt(colvarvalue const &x,
|
||||
std::list<colvarvalue>::iterator &xv,
|
||||
std::list<colvarvalue>::iterator const &xv_end,
|
||||
std::vector<cvm::real>::iterator &result);
|
||||
|
@ -332,14 +338,14 @@ public:
|
|||
/// \brief Optimized routine for the second order Legendre
|
||||
/// polynomial, (3cos^2(w)-1)/2, of one collective variable with an
|
||||
/// array
|
||||
void static p2leg_opt(colvarvalue const &x,
|
||||
static void p2leg_opt(colvarvalue const &x,
|
||||
std::vector<colvarvalue>::iterator &xv,
|
||||
std::vector<colvarvalue>::iterator const &xv_end,
|
||||
std::vector<cvm::real>::iterator &result);
|
||||
|
||||
/// \brief Optimized routine for the second order Legendre
|
||||
/// polynomial of one collective variable with an array
|
||||
void static p2leg_opt(colvarvalue const &x,
|
||||
static void p2leg_opt(colvarvalue const &x,
|
||||
std::list<colvarvalue>::iterator &xv,
|
||||
std::list<colvarvalue>::iterator const &xv_end,
|
||||
std::vector<cvm::real>::iterator &result);
|
||||
|
|
|
@ -14,7 +14,7 @@ Syntax from lib dir: python Install.py -m machine -h hdir -a arch -p precision -
|
|||
|
||||
specify one or more options, order does not matter
|
||||
|
||||
copies an existing Makefile.machine in lib/gpu to Makefile.auto
|
||||
copies an existing Makefile.machine in lib/gpu to Makefile.auto
|
||||
optionally edits these variables in Makefile.auto:
|
||||
CUDA_HOME, CUDA_ARCH, CUDA_PRECISION, EXTRAMAKE
|
||||
optionally uses Makefile.auto to build the GPU library -> libgpu.a
|
||||
|
@ -26,7 +26,7 @@ optionally copies Makefile.auto to a new Makefile.osuffix
|
|||
-h = set CUDA_HOME variable in Makefile.auto to hdir
|
||||
hdir = path to NVIDIA Cuda software, e.g. /usr/local/cuda
|
||||
-a = set CUDA_ARCH variable in Makefile.auto to arch
|
||||
use arch = 20 for Tesla C2050/C2070 (Fermi) (deprecated as of CUDA 8.0)
|
||||
use arch = 20 for Tesla C2050/C2070 (Fermi) (deprecated as of CUDA 8.0)
|
||||
or GeForce GTX 580 or similar
|
||||
use arch = 30 for Tesla K10 (Kepler)
|
||||
use arch = 35 for Tesla K40 (Kepler) or GeForce GTX Titan or similar
|
||||
|
@ -108,10 +108,10 @@ if pflag:
|
|||
elif precision == "mixed": precstr = "-D_SINGLE_DOUBLE"
|
||||
elif precision == "single": precstr = "-D_SINGLE_SINGLE"
|
||||
else: error("Invalid precision setting")
|
||||
|
||||
|
||||
# create Makefile.auto
|
||||
# reset EXTRAMAKE, CUDA_HOME, CUDA_ARCH, CUDA_PRECISION if requested
|
||||
|
||||
|
||||
if not os.path.exists("Makefile.%s" % isuffix):
|
||||
error("lib/gpu/Makefile.%s does not exist" % isuffix)
|
||||
|
||||
|
|
|
@ -22,21 +22,21 @@
|
|||
offset=tid & (t_per_atom-1); \
|
||||
ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;
|
||||
|
||||
#define nbor_info(nbor_mem, packed_mem, nbor_stride, t_per_atom, ii, offset, \
|
||||
i, numj, stride, nbor_end, nbor_begin) \
|
||||
i=nbor_mem[ii]; \
|
||||
nbor_begin=ii+nbor_stride; \
|
||||
numj=nbor_mem[nbor_begin]; \
|
||||
if (nbor_mem==packed_mem) { \
|
||||
nbor_begin+=nbor_stride+fast_mul(ii,t_per_atom-1); \
|
||||
stride=fast_mul(t_per_atom,nbor_stride); \
|
||||
nbor_end=nbor_begin+fast_mul(numj/t_per_atom,stride)+(numj & (t_per_atom-1)); \
|
||||
#define nbor_info(dev_nbor, dev_packed, nbor_pitch, t_per_atom, ii, offset, \
|
||||
i, numj, n_stride, nbor_end, nbor_begin) \
|
||||
i=dev_nbor[ii]; \
|
||||
nbor_begin=ii+nbor_pitch; \
|
||||
numj=dev_nbor[nbor_begin]; \
|
||||
if (dev_nbor==dev_packed) { \
|
||||
nbor_begin+=nbor_pitch+fast_mul(ii,t_per_atom-1); \
|
||||
n_stride=fast_mul(t_per_atom,nbor_pitch); \
|
||||
nbor_end=nbor_begin+fast_mul(numj/t_per_atom,n_stride)+(numj & (t_per_atom-1)); \
|
||||
nbor_begin+=offset; \
|
||||
} else { \
|
||||
nbor_begin+=nbor_stride; \
|
||||
nbor_begin=nbor_mem[nbor_begin]; \
|
||||
nbor_begin+=nbor_pitch; \
|
||||
nbor_begin=dev_nbor[nbor_begin]; \
|
||||
nbor_end=nbor_begin+numj; \
|
||||
stride=t_per_atom; \
|
||||
n_stride=t_per_atom; \
|
||||
nbor_begin+=offset; \
|
||||
}
|
||||
|
||||
|
|
|
@ -20,7 +20,7 @@ using namespace LAMMPS_AL;
|
|||
extern Device<PRECISION,ACC_PRECISION> global_device;
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) {
|
||||
BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) {
|
||||
device=&global_device;
|
||||
ans=new Answer<numtyp,acctyp>();
|
||||
nbor=new Neighbor();
|
||||
|
@ -53,8 +53,8 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
|
|||
const int max_nbors, const int maxspecial,
|
||||
const double cell_size, const double gpu_split,
|
||||
FILE *_screen, const void *pair_program,
|
||||
const char *k_two, const char *k_three_center,
|
||||
const char *k_three_end) {
|
||||
const char *two, const char *three_center,
|
||||
const char *three_end, const char *short_nbor) {
|
||||
screen=_screen;
|
||||
|
||||
int gpu_nbor=0;
|
||||
|
@ -70,10 +70,10 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
|
|||
_gpu_host=1;
|
||||
|
||||
_threads_per_atom=device->threads_per_atom();
|
||||
if (_threads_per_atom>1 && gpu_nbor==0) {
|
||||
if (_threads_per_atom>1 && gpu_nbor==0) { // neigh no and tpa > 1
|
||||
nbor->packing(true);
|
||||
_nbor_data=&(nbor->dev_packed);
|
||||
} else
|
||||
} else // neigh yes or tpa == 1
|
||||
_nbor_data=&(nbor->dev_nbor);
|
||||
if (_threads_per_atom*_threads_per_atom>device->warp_size())
|
||||
return -10;
|
||||
|
@ -97,7 +97,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
|
|||
|
||||
_block_pair=device->pair_block_size();
|
||||
_block_size=device->block_ellipse();
|
||||
compile_kernels(*ucl_device,pair_program,k_two,k_three_center,k_three_end);
|
||||
compile_kernels(*ucl_device,pair_program,two,three_center,three_end,short_nbor);
|
||||
|
||||
// Initialize host-device load balancer
|
||||
hd_balancer.init(device,gpu_nbor,gpu_split);
|
||||
|
@ -113,6 +113,11 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
|
|||
_max_an_bytes+=ans2->gpu_bytes();
|
||||
#endif
|
||||
|
||||
int ef_nall=nall;
|
||||
if (ef_nall==0)
|
||||
ef_nall=2000;
|
||||
dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -136,6 +141,7 @@ void BaseThreeT::clear_atomic() {
|
|||
k_three_end.clear();
|
||||
k_three_end_vatom.clear();
|
||||
k_pair.clear();
|
||||
k_short_nbor.clear();
|
||||
delete pair_program;
|
||||
_compiled=false;
|
||||
}
|
||||
|
@ -143,6 +149,7 @@ void BaseThreeT::clear_atomic() {
|
|||
time_pair.clear();
|
||||
hd_balancer.clear();
|
||||
|
||||
dev_short_nbor.clear();
|
||||
nbor->clear();
|
||||
ans->clear();
|
||||
#ifdef THREE_CONCURRENT
|
||||
|
@ -169,6 +176,8 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
|
|||
if (!success)
|
||||
return NULL;
|
||||
|
||||
_nall = nall;
|
||||
|
||||
// originally the requirement that nall == nlist was enforced
|
||||
// to allow direct indexing neighbors of neighbors after re-arrangement
|
||||
// nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size());
|
||||
|
@ -203,6 +212,8 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
|
|||
return 0;
|
||||
atom->cast_copy_x(host_x,host_type);
|
||||
|
||||
_nall = nall;
|
||||
|
||||
int mn;
|
||||
nbor->build_nbor_list(host_x, nall, host_inum, nall, *atom, sublo, subhi, tag,
|
||||
nspecial, special, success, mn);
|
||||
|
@ -247,12 +258,22 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
|
|||
reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
|
||||
if (!success)
|
||||
return;
|
||||
_max_nbors = nbor->max_nbor_loop(nlist,numj,ilist);
|
||||
}
|
||||
|
||||
atom->cast_x_data(host_x,host_type);
|
||||
hd_balancer.start_timer();
|
||||
atom->add_x_data(host_x,host_type);
|
||||
|
||||
// re-allocate dev_short_nbor if necessary
|
||||
if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
|
||||
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
dev_short_nbor.resize((2+_max_nbors)*_nmax);
|
||||
}
|
||||
|
||||
// _ainum to be used in loop() for short neighbor list build
|
||||
_ainum = nlist;
|
||||
|
||||
int evatom=0;
|
||||
if (eatom || vatom)
|
||||
evatom=1;
|
||||
|
@ -300,7 +321,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
|
|||
|
||||
// Build neighbor list on GPU if necessary
|
||||
if (ago==0) {
|
||||
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
|
||||
_max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
|
||||
sublo, subhi, tag, nspecial, special, success);
|
||||
if (!success)
|
||||
return NULL;
|
||||
|
@ -313,6 +334,15 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
|
|||
*ilist=nbor->host_ilist.begin();
|
||||
*jnum=nbor->host_acc.begin();
|
||||
|
||||
// re-allocate dev_short_nbor if necessary
|
||||
if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
|
||||
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
dev_short_nbor.resize((2+_max_nbors)*_nmax);
|
||||
}
|
||||
|
||||
// _ainum to be used in loop() for short neighbor list build
|
||||
_ainum = nall;
|
||||
|
||||
int evatom=0;
|
||||
if (eatom || vatom)
|
||||
evatom=1;
|
||||
|
@ -339,19 +369,20 @@ double BaseThreeT::host_memory_usage_atomic() const {
|
|||
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str,
|
||||
const char *ktwo, const char *kthree_center,
|
||||
const char *kthree_end) {
|
||||
const char *two, const char *three_center,
|
||||
const char *three_end, const char* short_nbor) {
|
||||
if (_compiled)
|
||||
return;
|
||||
|
||||
std::string vatom_name=std::string(kthree_end)+"_vatom";
|
||||
std::string vatom_name=std::string(three_end)+"_vatom";
|
||||
|
||||
pair_program=new UCL_Program(dev);
|
||||
pair_program->load_string(pair_str,device->compile_string().c_str());
|
||||
k_three_center.set_function(*pair_program,kthree_center);
|
||||
k_three_end.set_function(*pair_program,kthree_end);
|
||||
k_three_center.set_function(*pair_program,three_center);
|
||||
k_three_end.set_function(*pair_program,three_end);
|
||||
k_three_end_vatom.set_function(*pair_program,vatom_name.c_str());
|
||||
k_pair.set_function(*pair_program,ktwo);
|
||||
k_pair.set_function(*pair_program,two);
|
||||
k_short_nbor.set_function(*pair_program,short_nbor);
|
||||
pos_tex.get_texture(*pair_program,"pos_tex");
|
||||
|
||||
#ifdef THREE_CONCURRENT
|
||||
|
|
|
@ -56,7 +56,8 @@ class BaseThree {
|
|||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *screen,
|
||||
const void *pair_program, const char *k_two,
|
||||
const char *k_three_center, const char *k_three_end);
|
||||
const char *k_three_center, const char *k_three_end,
|
||||
const char *k_short_nbor=NULL);
|
||||
|
||||
/// Estimate the overhead for GPU context changes and CPU driver
|
||||
void estimate_gpu_overhead();
|
||||
|
@ -73,18 +74,18 @@ class BaseThree {
|
|||
}
|
||||
|
||||
/// Check if there is enough storage for neighbors and realloc if not
|
||||
/** \param nlocal number of particles whose nbors must be stored on device
|
||||
* \param host_inum number of particles whose nbors need to copied to host
|
||||
* \param current maximum number of neighbors
|
||||
/** \param inum number of particles whose nbors must be stored on device
|
||||
* \param max_nbors maximum number of neighbors
|
||||
* \param success set to false if insufficient memory
|
||||
* \note olist_size=total number of local particles **/
|
||||
// Thin forwarder: passes inum and max_nbors straight to nbor->resize().
// success is handed through by reference, so whatever the callee writes to it
// is what the caller of resize_local observes.
inline void resize_local(const int inum, const int max_nbors, bool &success) {
|
||||
nbor->resize(inum,max_nbors,success);
|
||||
}
|
||||
|
||||
/// Check if there is enough storage for neighbors and realloc if not
|
||||
/** \param nlocal number of particles whose nbors must be stored on device
|
||||
/** \param inum number of particles whose nbors must be stored on device
|
||||
* \param host_inum number of particles whose nbors need to be copied to host
|
||||
* \param current maximum number of neighbors
|
||||
* \param max_nbors current maximum number of neighbors
|
||||
* \note host_inum is 0 if the host is performing neighboring
|
||||
* \note nlocal+host_inum=total number of local particles
|
||||
* \note olist_size=0 **/
|
||||
|
@ -143,14 +144,6 @@ class BaseThree {
|
|||
const bool vflag, const bool eatom, const bool vatom,
|
||||
int &host_start, const double cpu_time, bool &success);
|
||||
|
||||
/// Pair loop with device neighboring
|
||||
int * compute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
const double cpu_time, bool &success);
|
||||
|
||||
/// Pair loop with device neighboring
|
||||
int ** compute(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type, double *sublo,
|
||||
|
@ -193,6 +186,9 @@ class BaseThree {
|
|||
/// Neighbor data
|
||||
Neighbor *nbor;
|
||||
|
||||
UCL_D_Vec<int> dev_short_nbor;
|
||||
UCL_Kernel k_short_nbor;
|
||||
|
||||
// ------------------------- DEVICE KERNELS -------------------------
|
||||
UCL_Program *pair_program;
|
||||
UCL_Kernel k_pair, k_three_center, k_three_end, k_three_end_vatom;
|
||||
|
@ -207,12 +203,13 @@ class BaseThree {
|
|||
int _block_pair, _block_size, _threads_per_atom, _end_command_queue;
|
||||
int _gpu_nbor;
|
||||
double _max_bytes, _max_an_bytes;
|
||||
int _max_nbors, _ainum, _nall;
|
||||
double _gpu_overhead, _driver_overhead;
|
||||
UCL_D_Vec<int> *_nbor_data;
|
||||
|
||||
void compile_kernels(UCL_Device &dev, const void *pair_string,
|
||||
const char *k_two, const char *k_three_center,
|
||||
const char *k_three_end);
|
||||
const char *two, const char *three_center,
|
||||
const char *three_end, const char* short_nbor);
|
||||
|
||||
virtual void loop(const bool _eflag, const bool _vflag,
|
||||
const int evatom) = 0;
|
||||
|
|
|
@ -55,7 +55,7 @@ int SWT::init(const int ntypes, const int nlocal, const int nall, const int max_
|
|||
int success;
|
||||
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
|
||||
_screen,sw,"k_sw","k_sw_three_center",
|
||||
"k_sw_three_end");
|
||||
"k_sw_three_end","k_sw_short_nbor");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
|
@ -193,19 +193,30 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
|||
else
|
||||
vflag=0;
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
// build the short neighbor list
|
||||
int ainum=this->_ainum;
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
|
||||
(BX/this->_threads_per_atom)));
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &sw3, &map, &elem2param, &_nelements,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
// this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1
|
||||
// this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1
|
||||
int ainum=this->ans->inum();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
ainum=this->ans->inum();
|
||||
nbor_pitch=this->nbor->nbor_pitch();
|
||||
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
this->time_pair.start();
|
||||
|
||||
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3,
|
||||
&map, &elem2param, &_nelements,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->ans->force, &this->ans->engv,
|
||||
&eflag, &vflag, &ainum, &nbor_pitch,
|
||||
&this->_threads_per_atom);
|
||||
|
@ -217,6 +228,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
|||
this->k_three_center.run(&this->atom->x, &sw1, &sw2, &sw3,
|
||||
&map, &elem2param, &_nelements,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom, &evatom);
|
||||
|
||||
|
@ -231,7 +243,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
|||
this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3,
|
||||
&map, &elem2param, &_nelements,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&this->nbor->dev_acc, &this->dev_short_nbor,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
|
||||
|
@ -240,7 +252,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
|||
this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3,
|
||||
&map, &elem2param, &_nelements,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&this->nbor->dev_acc, &this->dev_short_nbor,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
|
||||
|
|
|
@ -130,6 +130,63 @@ texture<int4> sw3_tex;
|
|||
|
||||
#endif
|
||||
|
||||
// Build a cutoff-filtered ("short") neighbor list for the SW kernels.
//
// For each atom slot ii < inum the kernel walks the neighbor entries read from
// dev_packed and copies into dev_short_nbor only those neighbors j whose
// squared distance to atom i is below sw3[ijparam].y (the squared cutoff).
// Layout written to dev_short_nbor, using the same index space as the walk:
//   - at the starting index (m): the number of neighbors kept
//   - at m+n_stride, m+2*n_stride, ...: the kept neighbor entries
// Arguments:
//   x_              atom positions (type id carried in .w); only referenced via
//                   the fetch4 macro, which names pos_tex here
//   sw3             per-parameter-set data; .y is used as the squared cutoff
//   map             atom type -> element index
//   elem2param      (element,element,element) triplet -> parameter-set index
//   nelements       number of elements (dimension of elem2param)
//   dev_nbor, dev_packed, nbor_pitch, t_per_atom
//                   forwarded to nbor_info; neighbor ids are read from dev_packed
//   dev_short_nbor  output buffer described above
//   inum            number of atom slots to process
// NOTE(review): atom_info, nbor_info and fetch4 are macros defined outside this
// view; the descriptions of what they fill in are inferred from how the
// results are used below -- confirm against their definitions.
__kernel void k_sw_short_nbor(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict sw3,
|
||||
const __global int *restrict map,
|
||||
const __global int *restrict elem2param,
|
||||
const int nelements,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
__global int * dev_short_nbor,
|
||||
const int inum, const int nbor_pitch, const int t_per_atom) {
|
||||
// n_stride is passed to nbor_info and then used as the step between
// consecutive neighbor entries in the loop below
__local int n_stride;
|
||||
int tid, ii, offset;
|
||||
// presumably derives ii/tid/offset from the thread and block indices -- TODO confirm
atom_info(t_per_atom,ii,tid,offset);
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
// fills i, numj, n_stride, nbor_end and nbor (first entry index) for slot ii
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
int ncount = 0;
|
||||
// remember the first index: it becomes the slot holding the kept-neighbor count
int m = nbor;
|
||||
// provisional value; unconditionally overwritten with ncount after the loop
dev_short_nbor[m] = 0;
|
||||
// kept entries are written starting one stride after the count slot
int nbor_short = nbor+n_stride;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
// keep the unmasked value so the stored entry retains the bits outside NEIGHMASK
int nj = j;
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
jtype=map[jtype];
|
||||
// parameter set for the (i,j,j) element triplet
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (rsq<sw3[ijparam].y) { // sw_cutsq = sw3[ijparam].y
|
||||
dev_short_nbor[nbor_short] = nj;
|
||||
nbor_short += n_stride;
|
||||
ncount++;
|
||||
}
|
||||
} // for nbor
|
||||
|
||||
// store the number of neighbors for each thread
|
||||
dev_short_nbor[m] = ncount;
|
||||
|
||||
} // if ii
|
||||
|
||||
}
|
||||
|
||||
__kernel void k_sw(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict sw1,
|
||||
|
@ -140,6 +197,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
|
|||
const int nelements,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
|
@ -158,8 +216,8 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
|
|||
__syncthreads();
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
int nbor, nbor_end, i, numj;
|
||||
const int* nbor_mem = dev_packed;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
|
@ -167,9 +225,17 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
|
|||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor];
|
||||
nbor += n_stride;
|
||||
nbor_end = nbor+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
int j=nbor_mem[nbor];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
|
@ -337,6 +403,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
|
|||
const int nelements,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
|
@ -361,7 +428,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
|
|||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor_j, nbor_end;
|
||||
|
||||
const int* nbor_mem = dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
|
@ -371,9 +438,18 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
|
|||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
int nborj_start = nbor_j;
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
|
@ -395,14 +471,23 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
|
|||
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
|
||||
sw_cut_ij=sw3_ijparam.x;
|
||||
|
||||
int nbor_k=nbor_j-offset_j+offset_k;
|
||||
if (nbor_k<=nbor_j)
|
||||
nbor_k+=n_stride;
|
||||
int nbor_k,k_end;
|
||||
if (dev_packed==dev_nbor) {
|
||||
nbor_k=nborj_start-offset_j+offset_k;
|
||||
int numk = dev_short_nbor[nbor_k-n_stride];
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
} else {
|
||||
nbor_k = nbor_j-offset_j+offset_k;
|
||||
if (nbor_k<=nbor_j) nbor_k += n_stride;
|
||||
k_end = nbor_end;
|
||||
}
|
||||
|
||||
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (dev_packed==dev_nbor && k <= j) continue;
|
||||
|
||||
numtyp4 kx; fetch4(kx,k,pos_tex);
|
||||
int ktype=kx.w;
|
||||
ktype=map[ktype];
|
||||
|
@ -460,6 +545,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
|
|||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
|
@ -484,7 +570,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
|
|||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor_j, nbor_end, k_end;
|
||||
|
||||
const int* nbor_mem = dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
|
@ -494,8 +580,16 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
|
|||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
|
@ -534,8 +628,15 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
|
|||
nbor_k+=offset_k;
|
||||
}
|
||||
|
||||
// recalculate numk and k_end for the use of short neighbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numk = dev_short_nbor[nbor_k];
|
||||
nbor_k += n_stride;
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
}
|
||||
|
||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (k == i) continue;
|
||||
|
@ -598,6 +699,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
|
@ -622,7 +724,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor_j, nbor_end, k_end;
|
||||
|
||||
const int* nbor_mem = dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
|
@ -632,8 +734,16 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
|
@ -672,8 +782,15 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
nbor_k+=offset_k;
|
||||
}
|
||||
|
||||
// recalculate numk and k_end for the use of short neighbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numk = dev_short_nbor[nbor_k];
|
||||
nbor_k += n_stride;
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
}
|
||||
|
||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (k == i) continue;
|
||||
|
|
|
@ -55,7 +55,8 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int
|
|||
int success;
|
||||
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
|
||||
_screen,tersoff,"k_tersoff_repulsive",
|
||||
"k_tersoff_three_center", "k_tersoff_three_end");
|
||||
"k_tersoff_three_center", "k_tersoff_three_end",
|
||||
"k_tersoff_short_nbor");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
|
@ -157,11 +158,16 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int
|
|||
|
||||
UCL_H_Vec<numtyp> cutsq_view(nparams,*(this->ucl_device),
|
||||
UCL_WRITE_ONLY);
|
||||
for (int i=0; i<nparams; i++)
|
||||
double cutsqmax = 0.0;
|
||||
for (int i=0; i<nparams; i++) {
|
||||
cutsq_view[i]=static_cast<numtyp>(host_cutsq[i]);
|
||||
if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i];
|
||||
}
|
||||
cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
|
||||
ucl_copy(cutsq,cutsq_view,false);
|
||||
|
||||
_cutshortsq = static_cast<numtyp>(cutsqmax);
|
||||
|
||||
UCL_H_Vec<int> dview_elem2param(nelements*nelements*nelements,
|
||||
*(this->ucl_device), UCL_WRITE_ONLY);
|
||||
|
||||
|
@ -219,171 +225,6 @@ double TersoffT::host_memory_usage() const {
|
|||
|
||||
#define KTHREADS this->_threads_per_atom
|
||||
#define JTHREADS this->_threads_per_atom
|
||||
// ---------------------------------------------------------------------------
|
||||
// Copy nbor list from host if necessary and then calculate forces, virials,..
|
||||
// ---------------------------------------------------------------------------
|
||||
// Host-neighbor-list variant of compute(): takes ilist/numj/firstneigh from the
// caller, pushes them to the device when f_ago maps to ago==0, uploads atom
// positions, launches the zeta kernel and then the force kernels via loop().
//   f_ago       forwarded to hd_balancer.ago_first(); the result gates the
//               neighbor-list reset
//   inum_full   when 0 the function returns early after resize_atom/zero_timers
//   nall        used for the early-exit resize_atom call, reset_nbors and for
//               sizing _zetaij
//   nlist       number of list entries; also the launch extent of k_zeta
//   host_start  output: 0 on the early exit, otherwise the balancer's inum
//   success     output flag written by resize_atom/reset_nbors; the function
//               returns immediately if reset_nbors leaves it false
// NOTE(review): the hunk header of this diff (-219,171 +225,6) indicates this
// definition is removed by the commit in favour of the BaseThree version.
template <class numtyp, class acctyp>
|
||||
void TersoffT::compute(const int f_ago, const int inum_full, const int nall,
|
||||
const int nlist, double **host_x, int *host_type,
|
||||
int *ilist, int *numj, int **firstneigh,
|
||||
const bool eflag, const bool vflag, const bool eatom,
|
||||
const bool vatom, int &host_start,
|
||||
const double cpu_time, bool &success) {
|
||||
this->acc_timers();
|
||||
if (inum_full==0) {
|
||||
host_start=0;
|
||||
// Make sure textures are correct if realloc by a different hybrid style
|
||||
this->resize_atom(0,nall,success);
|
||||
this->zero_timers();
|
||||
return;
|
||||
}
|
||||
|
||||
int ago=this->hd_balancer.ago_first(f_ago);
|
||||
// inum from the balancer is recorded on the answer object(s) and reported
// back to the caller through host_start
int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
|
||||
this->ans->inum(inum);
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ans2->inum(inum);
|
||||
#endif
|
||||
host_start=inum;
|
||||
|
||||
if (ago==0) {
|
||||
this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
|
||||
if (!success)
|
||||
return;
|
||||
// refreshed only when the list is reset; later steps reuse the stored value
_max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist);
|
||||
}
|
||||
|
||||
this->atom->cast_x_data(host_x,host_type);
|
||||
this->hd_balancer.start_timer();
|
||||
this->atom->add_x_data(host_x,host_type);
|
||||
|
||||
// re-allocate zetaij if necessary
|
||||
if (nall*_max_nbors > _zetaij.cols()) {
|
||||
// grow with 10% headroom on the atom count
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
_zetaij.resize(_max_nbors*_nmax);
|
||||
}
|
||||
|
||||
// integer copy of eflag, passed by address to the kernel launch below
int _eflag;
|
||||
if (eflag)
|
||||
_eflag=1;
|
||||
else
|
||||
_eflag=0;
|
||||
|
||||
// the zeta kernel is launched over all nlist entries, not just inum
int ainum=nlist;
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
int BX=this->block_pair();
|
||||
// each block of BX threads covers BX/(JTHREADS*KTHREADS) entries
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
|
||||
(BX/(JTHREADS*KTHREADS))));
|
||||
|
||||
this->k_zeta.set_size(GX,BX);
|
||||
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
int evatom=0;
|
||||
if (eatom || vatom)
|
||||
evatom=1;
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ucl_device->sync();
|
||||
#endif
|
||||
loop(eflag,vflag,evatom);
|
||||
this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
|
||||
this->device->add_ans_object(this->ans);
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ans2->copy_answers(eflag,vflag,eatom,vatom,ilist);
|
||||
this->device->add_ans_object(this->ans2);
|
||||
#endif
|
||||
this->hd_balancer.stop_timer();
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Reneighbor on GPU if necessary and then compute forces, virials, energies
|
||||
// ---------------------------------------------------------------------------
|
||||
// Device-neighboring variant of compute(): when ago==0 the neighbor list is
// rebuilt through build_nbor_list(), otherwise only atom positions are
// re-uploaded. It then launches the zeta kernel and the force kernels (loop()).
//   inum_full   when 0 the function returns NULL after resize_atom/zero_timers
//   nall        used for the rebuild, for sizing _zetaij and as the launch
//               extent of k_zeta
//   host_start  output: 0 on the early exit, otherwise the balancer's inum
//   ilist,jnum  outputs: set to nbor->host_ilist.begin() / nbor->host_acc.begin()
//   success     output flag; NULL is returned if build_nbor_list leaves it false
// Returns nbor->host_jlist.begin() shifted back by host_start, or NULL on the
// early-exit and failure paths.
// NOTE(review): the hunk header of this diff (-219,171 +225,6) indicates this
// definition is removed by the commit in favour of the BaseThree version.
template <class numtyp, class acctyp>
|
||||
int ** TersoffT::compute(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special, const bool eflag,
|
||||
const bool vflag, const bool eatom,
|
||||
const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum,
|
||||
const double cpu_time, bool &success) {
|
||||
this->acc_timers();
|
||||
|
||||
if (inum_full==0) {
|
||||
host_start=0;
|
||||
// Make sure textures are correct if realloc by a different hybrid style
|
||||
this->resize_atom(0,nall,success);
|
||||
this->zero_timers();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
this->hd_balancer.balance(cpu_time);
|
||||
// inum from the balancer is recorded on the answer object(s) and reported
// back to the caller through host_start
int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
|
||||
this->ans->inum(inum);
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ans2->inum(inum);
|
||||
#endif
|
||||
host_start=inum;
|
||||
|
||||
// Build neighbor list on GPU if necessary
|
||||
if (ago==0) {
|
||||
// the return value of build_nbor_list is stored as the new _max_nbors
_max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
|
||||
sublo, subhi, tag, nspecial, special, success);
|
||||
if (!success)
|
||||
return NULL;
|
||||
this->hd_balancer.start_timer();
|
||||
} else {
|
||||
// no rebuild on this step: only refresh the atom positions
this->atom->cast_x_data(host_x,host_type);
|
||||
this->hd_balancer.start_timer();
|
||||
this->atom->add_x_data(host_x,host_type);
|
||||
}
|
||||
*ilist=this->nbor->host_ilist.begin();
|
||||
*jnum=this->nbor->host_acc.begin();
|
||||
|
||||
// re-allocate zetaij if necessary
|
||||
if (nall*_max_nbors > _zetaij.cols()) {
|
||||
// grow with 10% headroom on the atom count
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
_zetaij.resize(_max_nbors*_nmax);
|
||||
}
|
||||
|
||||
// integer copy of eflag, passed by address to the kernel launch below
int _eflag;
|
||||
if (eflag)
|
||||
_eflag=1;
|
||||
else
|
||||
_eflag=0;
|
||||
|
||||
// the zeta kernel is launched over all nall atoms, not just inum
int ainum=nall;
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
int BX=this->block_pair();
|
||||
// each block of BX threads covers BX/(JTHREADS*KTHREADS) entries
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
|
||||
(BX/(JTHREADS*KTHREADS))));
|
||||
|
||||
this->k_zeta.set_size(GX,BX);
|
||||
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
int evatom=0;
|
||||
if (eatom || vatom)
|
||||
evatom=1;
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ucl_device->sync();
|
||||
#endif
|
||||
loop(eflag,vflag,evatom);
|
||||
this->ans->copy_answers(eflag,vflag,eatom,vatom);
|
||||
this->device->add_ans_object(this->ans);
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ans2->copy_answers(eflag,vflag,eatom,vatom);
|
||||
this->device->add_ans_object(this->ans2);
|
||||
#endif
|
||||
this->hd_balancer.stop_timer();
|
||||
|
||||
// pointer is offset by host_start before being handed back to the caller
return this->nbor->host_jlist.begin()-host_start;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Calculate energies, forces, and torques
|
||||
// ---------------------------------------------------------------------------
|
||||
|
@ -402,9 +243,40 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
|||
else
|
||||
vflag=0;
|
||||
|
||||
int ainum=this->ans->inum();
|
||||
// build the short neighbor list
|
||||
int ainum=this->_ainum;
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
|
||||
&elem2param, &_nelements, &_nparams,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
// re-allocate zetaij if necessary
|
||||
int nall = this->_nall;
|
||||
if (nall*this->_max_nbors > _zetaij.cols()) {
|
||||
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
_zetaij.resize(this->_max_nbors*_nmax);
|
||||
}
|
||||
|
||||
nbor_pitch=this->nbor->nbor_pitch();
|
||||
GX=static_cast<int>(ceil(static_cast<double>(this->_ainum)/
|
||||
(BX/(JTHREADS*KTHREADS))));
|
||||
|
||||
this->k_zeta.set_size(GX,BX);
|
||||
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
ainum=this->ans->inum();
|
||||
nbor_pitch=this->nbor->nbor_pitch();
|
||||
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
this->time_pair.start();
|
||||
|
@ -412,6 +284,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
|||
this->k_pair.run(&this->atom->x, &ts1, &ts2, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->ans->force, &this->ans->engv,
|
||||
&eflag, &vflag, &ainum, &nbor_pitch,
|
||||
&this->_threads_per_atom);
|
||||
|
@ -423,6 +296,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
|||
this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom, &evatom);
|
||||
|
||||
|
@ -437,7 +311,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
|||
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&this->nbor->dev_acc, &this->dev_short_nbor,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
|
||||
|
@ -446,7 +320,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
|||
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&this->nbor->dev_acc, &this->dev_short_nbor,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
}
|
||||
|
|
|
@ -106,7 +106,7 @@ texture<int4> ts5_tex;
|
|||
ans[ii]=old; \
|
||||
}
|
||||
|
||||
#define store_zeta(z, tid, t_per_atom, offset) \
|
||||
#define acc_zeta(z, tid, t_per_atom, offset) \
|
||||
if (t_per_atom>1) { \
|
||||
__local acctyp red_acc[BLOCK_PAIR]; \
|
||||
red_acc[tid]=z; \
|
||||
|
@ -155,7 +155,7 @@ texture<int4> ts5_tex;
|
|||
ans[ii]=old; \
|
||||
}
|
||||
|
||||
#define store_zeta(z, tid, t_per_atom, offset) \
|
||||
#define acc_zeta(z, tid, t_per_atom, offset) \
|
||||
if (t_per_atom>1) { \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
z += shfl_xor(z, s, t_per_atom); \
|
||||
|
@ -164,6 +164,65 @@ texture<int4> ts5_tex;
|
|||
|
||||
#endif
|
||||
|
||||
// Build a distance-filtered ("short") neighbor list in dev_short_nbor.
// For the atom handled by this thread, every entry read from dev_packed is
// tested against cutsq[ijparam]; entries that pass are copied, with their
// original (unmasked) value, into dev_short_nbor. The first slot used by the
// thread (index m below) receives the number of entries that were kept.
//   x_             atom positions; NOTE(review): only referenced in the
//                  trailing "//x_[i]" comments here, the reads go through
//                  fetch4(..., pos_tex) -- confirm how fetch4 maps to x_
//   cutsq          squared cutoff, indexed by ijparam
//   map            lookup applied to the .w component of a position
//   elem2param     lookup indexed by (itype, jtype, jtype) combined with nelements
//   nelements      stride used to build the elem2param index
//   nparams        not used in this kernel body
//   dev_nbor, dev_packed, nbor_pitch
//                  handed to nbor_info(); neighbors are read from dev_packed
//   dev_short_nbor output list (count slot followed by the kept neighbors)
//   inum           number of atoms processed; threads with ii>=inum do nothing
//   t_per_atom     handed to atom_info() and nbor_info()
__kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict cutsq,
|
||||
const __global int *restrict map,
|
||||
const __global int *restrict elem2param,
|
||||
const int nelements, const int nparams,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
__global int * dev_short_nbor,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
__local int n_stride;
|
||||
int tid, ii, offset;
|
||||
// atom_info is defined outside this block; presumably it fills ii, tid and
// offset for the calling thread -- TODO confirm against its definition
atom_info(t_per_atom,ii,tid,offset);
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
// nbor_info is defined outside this block; presumably it sets i, numj,
// n_stride, nbor_end and nbor, since they are read below without any other
// assignment -- TODO confirm against its definition
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
// the .w component of the position is used as the type index into map
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
int ncount = 0;
|
||||
// m remembers the starting index: it is the slot that holds the count
int m = nbor;
|
||||
dev_short_nbor[m] = 0;
|
||||
// kept neighbors are written starting one stride after the count slot
int nbor_short = nbor+n_stride;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
// keep the raw entry so the bits outside NEIGHMASK are preserved in the copy
int nj = j;
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
jtype=map[jtype];
|
||||
// jtype appears twice: the index is built from (itype, jtype, jtype)
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
// strict comparison: an entry with rsq equal to the cutoff is not kept
if (rsq<cutsq[ijparam]) {
|
||||
dev_short_nbor[nbor_short] = nj;
|
||||
nbor_short += n_stride;
|
||||
ncount++;
|
||||
}
|
||||
} // for nbor
|
||||
|
||||
// store the number of neighbors for each thread
|
||||
dev_short_nbor[m] = ncount;
|
||||
|
||||
} // if ii
|
||||
}
|
||||
|
||||
// Tersoff is currently used for 3 elements at most: 3*3*3 = 27 entries
|
||||
// while the block size should never be less than 32.
|
||||
// SHARED_SIZE = 32 for now to reduce the pressure on the shared memory per block
|
||||
|
@ -184,6 +243,7 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
|
|||
__global acctyp4 * zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_short_nbor,
|
||||
const int eflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
__local int tpa_sq,n_stride;
|
||||
|
@ -211,22 +271,29 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
|
|||
__syncthreads();
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor_j, nbor_end;
|
||||
int i, numj;
|
||||
|
||||
int nbor_j, nbor_end, i, numj;
|
||||
const int* nbor_mem=dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
int offset_k=tid & (t_per_atom-1);
|
||||
int nborj_start = nbor_j;
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
int nborj_start = nbor_j;
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
|
@ -241,14 +308,20 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
|
|||
delr1.z = jx.z-ix.z;
|
||||
numtyp rsq1 = delr1.x*delr1.x+delr1.y*delr1.y+delr1.z*delr1.z;
|
||||
|
||||
if (rsq1 > cutsq[ijparam]) continue;
|
||||
// if (rsq1 > cutsq[ijparam]) continue;
|
||||
|
||||
// compute zeta_ij
|
||||
z = (acctyp)0;
|
||||
|
||||
int nbor_k = nborj_start-offset_j+offset_k;
|
||||
for ( ; nbor_k < nbor_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k_end = nbor_end;
|
||||
if (dev_packed==dev_nbor) {
|
||||
int numk = dev_short_nbor[nbor_k-n_stride];
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
}
|
||||
|
||||
for ( ; nbor_k < k_end; nbor_k+=n_stride) {
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (k == j) continue;
|
||||
|
@ -284,10 +357,12 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
|
|||
|
||||
//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
|
||||
//int idx = jj*n_stride + i*t_per_atom + offset_j;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
i, nbor_j, offset_j, idx);
|
||||
store_zeta(z, tid, t_per_atom, offset_k);
|
||||
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
|
||||
int idx = nbor_j;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// i, nbor_j, offset_j, idx);
|
||||
acc_zeta(z, tid, t_per_atom, offset_k);
|
||||
|
||||
numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex);
|
||||
numtyp ijparam_lam2 = ts1_ijparam.y;
|
||||
|
@ -330,6 +405,7 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
|
|||
const int nelements, const int nparams,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
|
@ -356,8 +432,8 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
|
|||
__syncthreads();
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
int nbor, nbor_end, i, numj;
|
||||
const int* nbor_mem=dev_packed;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
|
@ -365,9 +441,17 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
|
|||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor];
|
||||
nbor += n_stride;
|
||||
nbor_end = nbor+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
int j=nbor_mem[nbor];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
|
@ -382,32 +466,31 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
|
|||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (rsq<cutsq[ijparam]) {
|
||||
numtyp feng[2];
|
||||
numtyp ijparam_lam1 = ts1[ijparam].x;
|
||||
numtyp4 ts2_ijparam = ts2[ijparam];
|
||||
numtyp ijparam_biga = ts2_ijparam.x;
|
||||
numtyp ijparam_bigr = ts2_ijparam.z;
|
||||
numtyp ijparam_bigd = ts2_ijparam.w;
|
||||
// rsq<cutsq[ijparam]
|
||||
numtyp feng[2];
|
||||
numtyp ijparam_lam1 = ts1[ijparam].x;
|
||||
numtyp4 ts2_ijparam = ts2[ijparam];
|
||||
numtyp ijparam_biga = ts2_ijparam.x;
|
||||
numtyp ijparam_bigr = ts2_ijparam.z;
|
||||
numtyp ijparam_bigd = ts2_ijparam.w;
|
||||
|
||||
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
|
||||
rsq, eflag, feng);
|
||||
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
|
||||
rsq, eflag, feng);
|
||||
|
||||
numtyp force = feng[0];
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
numtyp force = feng[0];
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
|
||||
if (eflag>0)
|
||||
energy+=feng[1];
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
virial[1] += dely*dely*force;
|
||||
virial[2] += delz*delz*force;
|
||||
virial[3] += delx*dely*force;
|
||||
virial[4] += delx*delz*force;
|
||||
virial[5] += dely*delz*force;
|
||||
}
|
||||
if (eflag>0)
|
||||
energy+=feng[1];
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
virial[1] += dely*dely*force;
|
||||
virial[2] += delz*delz*force;
|
||||
virial[3] += delx*dely*force;
|
||||
virial[4] += delx*delz*force;
|
||||
virial[5] += dely*delz*force;
|
||||
}
|
||||
} // for nbor
|
||||
|
||||
|
@ -428,6 +511,7 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
|
|||
const __global acctyp4 *restrict zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
|
@ -461,20 +545,28 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
|
|||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor_j, nbor_end;
|
||||
|
||||
const int* nbor_mem=dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
int offset_k=tid & (t_per_atom-1);
|
||||
int nborj_start = nbor_j;
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
int nborj_start = nbor_j;
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
|
@ -489,7 +581,6 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
|
|||
delr1[2] = jx.z-ix.z;
|
||||
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
|
||||
|
||||
if (rsq1 > cutsq[ijparam]) continue;
|
||||
numtyp r1 = ucl_sqrt(rsq1);
|
||||
numtyp r1inv = ucl_rsqrt(rsq1);
|
||||
|
||||
|
@ -497,9 +588,11 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
|
|||
|
||||
//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
|
||||
//int idx = jj*n_stride + i*t_per_atom + offset_j;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
i, nbor_j, offset_j, idx);
|
||||
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
|
||||
int idx = nbor_j;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// i, nbor_j, offset_j, idx);
|
||||
acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
|
||||
numtyp force = zeta_ij.x*tpainv;
|
||||
numtyp prefactor = zeta_ij.y;
|
||||
|
@ -520,9 +613,15 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
|
|||
virial[5] += delr1[1]*delr1[2]*mforce;
|
||||
}
|
||||
|
||||
int nbor_k=nborj_start-offset_j+offset_k;
|
||||
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int nbor_k = nborj_start-offset_j+offset_k;
|
||||
int k_end = nbor_end;
|
||||
if (dev_packed==dev_nbor) {
|
||||
int numk = dev_short_nbor[nbor_k-n_stride];
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
}
|
||||
|
||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (j == k) continue;
|
||||
|
@ -598,6 +697,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
|||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
|
@ -632,7 +732,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
|||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor_j, nbor_end, k_end;
|
||||
|
||||
const int* nbor_mem=dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
|
@ -643,9 +743,18 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
|||
itype=map[itype];
|
||||
|
||||
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
|
@ -660,8 +769,6 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
|||
delr1[2] = jx.z-ix.z;
|
||||
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
|
||||
|
||||
if (rsq1 > cutsq[ijparam]) continue;
|
||||
|
||||
numtyp mdelr1[3];
|
||||
mdelr1[0] = -delr1[0];
|
||||
mdelr1[1] = -delr1[1];
|
||||
|
@ -683,13 +790,20 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
|||
k_end=nbor_k+numk;
|
||||
nbor_k+=offset_k;
|
||||
}
|
||||
|
||||
// recalculate numk and k_end for the use of short neighbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numk = dev_short_nbor[nbor_k];
|
||||
nbor_k += n_stride;
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
}
|
||||
int nbork_start = nbor_k;
|
||||
|
||||
// look up for zeta_ji: find i in the j's neighbor list
|
||||
int m = tid / t_per_atom;
|
||||
int ijnum = -1;
|
||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
if (k == i) {
|
||||
ijnum = nbor_k;
|
||||
|
@ -711,9 +825,11 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
|||
|
||||
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
|
||||
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
j, ijnum, offset_kf, idx);
|
||||
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
||||
int idx = ijnum;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// j, ijnum, offset_kf, idx);
|
||||
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
|
||||
numtyp force = zeta_ji.x*tpainv;
|
||||
numtyp prefactor_ji = zeta_ji.y;
|
||||
|
@ -736,7 +852,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
|||
|
||||
// attractive forces
|
||||
for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (k == i) continue;
|
||||
|
@ -777,9 +893,11 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
|||
|
||||
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
|
||||
//int idx = kk*n_stride + j*t_per_atom + offset_k;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
j, nbor_k, offset_k, idx);
|
||||
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
|
||||
int idx = nbor_k;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// j, nbor_k, offset_k, idx);
|
||||
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
|
||||
numtyp prefactor_jk = zeta_jk.y;
|
||||
int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
|
||||
|
@ -824,6 +942,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
|
@ -858,7 +977,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor_j, nbor_end, k_end;
|
||||
|
||||
const int* nbor_mem = dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
|
@ -869,9 +988,18 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
itype=map[itype];
|
||||
|
||||
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
|
@ -886,8 +1014,6 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
delr1[2] = jx.z-ix.z;
|
||||
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
|
||||
|
||||
if (rsq1 > cutsq[ijparam]) continue;
|
||||
|
||||
numtyp mdelr1[3];
|
||||
mdelr1[0] = -delr1[0];
|
||||
mdelr1[1] = -delr1[1];
|
||||
|
@ -909,13 +1035,20 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
k_end=nbor_k+numk;
|
||||
nbor_k+=offset_k;
|
||||
}
|
||||
|
||||
// recalculate numk and k_end for the use of short neighbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numk = dev_short_nbor[nbor_k];
|
||||
nbor_k += n_stride;
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
}
|
||||
int nbork_start = nbor_k;
|
||||
|
||||
// look up for zeta_ji
|
||||
int m = tid / t_per_atom;
|
||||
int ijnum = -1;
|
||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
if (k == i) {
|
||||
ijnum = nbor_k;
|
||||
|
@ -937,9 +1070,11 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
|
||||
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
|
||||
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
j, ijnum, offset_kf, idx);
|
||||
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
||||
int idx = ijnum;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// j, ijnum, offset_kf, idx);
|
||||
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
|
||||
numtyp force = zeta_ji.x*tpainv;
|
||||
numtyp prefactor_ji = zeta_ji.y;
|
||||
|
@ -962,7 +1097,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
|
||||
// attractive forces
|
||||
for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (k == i) continue;
|
||||
|
@ -1010,9 +1145,11 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
|
||||
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
|
||||
//int idx = kk*n_stride + j*t_per_atom + offset_k;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
j, nbor_k, offset_k, idx);
|
||||
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
|
||||
int idx = nbor_k;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// j, nbor_k, offset_k, idx);
|
||||
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
|
||||
numtyp prefactor_jk = zeta_jk.y;
|
||||
|
||||
|
@ -1040,7 +1177,6 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
virial[3] += TWOTHIRD*(delr2[0]*fj[1] + mdelr1[0]*fk[1]);
|
||||
virial[4] += TWOTHIRD*(delr2[0]*fj[2] + mdelr1[0]*fk[2]);
|
||||
virial[5] += TWOTHIRD*(delr2[1]*fj[2] + mdelr1[1]*fk[2]);
|
||||
|
||||
}
|
||||
} // for nbor
|
||||
|
||||
|
|
|
@ -47,21 +47,6 @@ class Tersoff : public BaseThree<numtyp, acctyp> {
|
|||
const double* h, const double* gamma, const double* beta,
|
||||
const double* powern, const double* cutsq);
|
||||
|
||||
/// Pair loop with host neighboring
|
||||
void compute(const int f_ago, const int inum_full, const int nall,
|
||||
const int nlist, double **host_x, int *host_type,
|
||||
int *ilist, int *numj, int **firstneigh, const bool eflag,
|
||||
const bool vflag, const bool eatom, const bool vatom,
|
||||
int &host_start, const double cpu_time, bool &success);
|
||||
|
||||
/// Pair loop with device neighboring
|
||||
int ** compute(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **numj, const double cpu_time, bool &success);
|
||||
|
||||
/// Clear all host and device data
|
||||
/** \note This is called at the beginning of the init() routine **/
|
||||
void clear();
|
||||
|
@ -104,8 +89,7 @@ class Tersoff : public BaseThree<numtyp, acctyp> {
|
|||
|
||||
UCL_Kernel k_zeta;
|
||||
UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex;
|
||||
|
||||
int _max_nbors;
|
||||
numtyp _cutshortsq;
|
||||
|
||||
private:
|
||||
bool _allocated;
|
||||
|
|
|
@ -55,7 +55,8 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in
|
|||
int success;
|
||||
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
|
||||
_screen,tersoff_mod,"k_tersoff_mod_repulsive",
|
||||
"k_tersoff_mod_three_center", "k_tersoff_mod_three_end");
|
||||
"k_tersoff_mod_three_center", "k_tersoff_mod_three_end",
|
||||
"k_tersoff_mod_short_nbor");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
|
@ -157,11 +158,16 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in
|
|||
|
||||
UCL_H_Vec<numtyp> cutsq_view(nparams,*(this->ucl_device),
|
||||
UCL_WRITE_ONLY);
|
||||
for (int i=0; i<nparams; i++)
|
||||
double cutsqmax = 0.0;
|
||||
for (int i=0; i<nparams; i++) {
|
||||
cutsq_view[i]=static_cast<numtyp>(host_cutsq[i]);
|
||||
if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i];
|
||||
}
|
||||
cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
|
||||
ucl_copy(cutsq,cutsq_view,false);
|
||||
|
||||
_cutshortsq = static_cast<numtyp>(cutsqmax);
|
||||
|
||||
UCL_H_Vec<int> dview_elem2param(nelements*nelements*nelements,
|
||||
*(this->ucl_device), UCL_WRITE_ONLY);
|
||||
|
||||
|
@ -219,171 +225,6 @@ double TersoffMT::host_memory_usage() const {
|
|||
|
||||
#define KTHREADS this->_threads_per_atom
|
||||
#define JTHREADS this->_threads_per_atom
|
||||
// ---------------------------------------------------------------------------
|
||||
// Copy nbor list from host if necessary and then calculate forces, virials,..
|
||||
// ---------------------------------------------------------------------------
|
||||
// Overload that receives the neighbor list from the caller (ilist, numj,
// firstneigh). It launches k_zeta, calls loop(), and hands the answer
// object(s) to the device. host_start is set to the inum value returned by
// the balancer (0 when inum_full is 0); success is passed by reference to
// resize_atom() and reset_nbors().
template <class numtyp, class acctyp>
|
||||
void TersoffMT::compute(const int f_ago, const int inum_full, const int nall,
|
||||
const int nlist, double **host_x, int *host_type,
|
||||
int *ilist, int *numj, int **firstneigh,
|
||||
const bool eflag, const bool vflag, const bool eatom,
|
||||
const bool vatom, int &host_start,
|
||||
const double cpu_time, bool &success) {
|
||||
this->acc_timers();
|
||||
// early exit when there are no atoms: no kernel is launched on this path
if (inum_full==0) {
|
||||
host_start=0;
|
||||
// Make sure textures are correct if realloc by a different hybrid style
|
||||
this->resize_atom(0,nall,success);
|
||||
this->zero_timers();
|
||||
return;
|
||||
}
|
||||
|
||||
int ago=this->hd_balancer.ago_first(f_ago);
|
||||
// inum comes from the balancer and is given to the answer object(s) below
int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
|
||||
this->ans->inum(inum);
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ans2->inum(inum);
|
||||
#endif
|
||||
host_start=inum;
|
||||
|
||||
// ago==0: pass the caller's neighbor list to reset_nbors() and refresh
// _max_nbors; returns early (before any kernel launch) if success is false
if (ago==0) {
|
||||
this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
|
||||
if (!success)
|
||||
return;
|
||||
_max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist);
|
||||
}
|
||||
|
||||
// the balancer timer is started between cast_x_data and add_x_data
this->atom->cast_x_data(host_x,host_type);
|
||||
this->hd_balancer.start_timer();
|
||||
this->atom->add_x_data(host_x,host_type);
|
||||
|
||||
// re-allocate zetaij if necessary
|
||||
// the new size uses nall padded by 10%, multiplied by _max_nbors
if (nall*_max_nbors > _zetaij.cols()) {
|
||||
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
_zetaij.resize(_max_nbors*_nmax);
|
||||
}
|
||||
|
||||
// integer copy of eflag; its address is passed to k_zeta.run below
int _eflag;
|
||||
if (eflag)
|
||||
_eflag=1;
|
||||
else
|
||||
_eflag=0;
|
||||
|
||||
// k_zeta is sized from nlist, not from inum
int ainum=nlist;
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
int BX=this->block_pair();
|
||||
// NOTE(review): BX/(JTHREADS*KTHREADS) looks like an integer division; if the
// product exceeds BX the divisor becomes 0 -- confirm block_pair() rules that out
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
|
||||
(BX/(JTHREADS*KTHREADS))));
|
||||
|
||||
this->k_zeta.set_size(GX,BX);
|
||||
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
// evatom is 1 when either per-atom energy or per-atom virial is requested
int evatom=0;
|
||||
if (eatom || vatom)
|
||||
evatom=1;
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ucl_device->sync();
|
||||
#endif
|
||||
loop(eflag,vflag,evatom);
|
||||
// this overload forwards ilist to copy_answers
this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
|
||||
this->device->add_ans_object(this->ans);
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ans2->copy_answers(eflag,vflag,eatom,vatom,ilist);
|
||||
this->device->add_ans_object(this->ans2);
|
||||
#endif
|
||||
this->hd_balancer.stop_timer();
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Reneighbor on GPU if necessary and then compute forces, virials, energies
|
||||
// ---------------------------------------------------------------------------
|
||||
// Overload that calls build_nbor_list() itself when ago==0 instead of taking
// a neighbor list from the caller. It launches k_zeta, calls loop(), and
// hands the answer object(s) to the device. *ilist and *jnum are set to
// nbor->host_ilist.begin() and nbor->host_acc.begin(). Returns NULL on the
// early-exit paths, otherwise nbor->host_jlist.begin()-host_start.
template <class numtyp, class acctyp>
|
||||
int ** TersoffMT::compute(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special, const bool eflag,
|
||||
const bool vflag, const bool eatom,
|
||||
const bool vatom, int &host_start,
|
||||
const int **ilist, int **jnum,
|
||||
const double cpu_time, bool &success) {
|
||||
this->acc_timers();
|
||||
|
||||
// early exit when there are no atoms: *ilist and *jnum are left untouched
if (inum_full==0) {
|
||||
host_start=0;
|
||||
// Make sure textures are correct if realloc by a different hybrid style
|
||||
this->resize_atom(0,nall,success);
|
||||
this->zero_timers();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
this->hd_balancer.balance(cpu_time);
|
||||
// inum comes from the balancer and is given to the answer object(s) below
int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
|
||||
this->ans->inum(inum);
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ans2->inum(inum);
|
||||
#endif
|
||||
host_start=inum;
|
||||
|
||||
// Build neighbor list on GPU if necessary
|
||||
// build_nbor_list's return value becomes _max_nbors; when ago!=0 only the
// positions are sent (cast_x_data / add_x_data)
if (ago==0) {
|
||||
_max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
|
||||
sublo, subhi, tag, nspecial, special, success);
|
||||
if (!success)
|
||||
return NULL;
|
||||
this->hd_balancer.start_timer();
|
||||
} else {
|
||||
this->atom->cast_x_data(host_x,host_type);
|
||||
this->hd_balancer.start_timer();
|
||||
this->atom->add_x_data(host_x,host_type);
|
||||
}
|
||||
// output pointers for the caller
*ilist=this->nbor->host_ilist.begin();
|
||||
*jnum=this->nbor->host_acc.begin();
|
||||
|
||||
// re-allocate zetaij if necessary
|
||||
// the new size uses nall padded by 10%, multiplied by _max_nbors
if (nall*_max_nbors > _zetaij.cols()) {
|
||||
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
_zetaij.resize(_max_nbors*_nmax);
|
||||
}
|
||||
|
||||
// integer copy of eflag; its address is passed to k_zeta.run below
int _eflag;
|
||||
if (eflag)
|
||||
_eflag=1;
|
||||
else
|
||||
_eflag=0;
|
||||
|
||||
// k_zeta is sized from nall in this overload
int ainum=nall;
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
int BX=this->block_pair();
|
||||
// NOTE(review): BX/(JTHREADS*KTHREADS) looks like an integer division; if the
// product exceeds BX the divisor becomes 0 -- confirm block_pair() rules that out
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
|
||||
(BX/(JTHREADS*KTHREADS))));
|
||||
|
||||
this->k_zeta.set_size(GX,BX);
|
||||
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
// evatom is 1 when either per-atom energy or per-atom virial is requested
int evatom=0;
|
||||
if (eatom || vatom)
|
||||
evatom=1;
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ucl_device->sync();
|
||||
#endif
|
||||
loop(eflag,vflag,evatom);
|
||||
// no ilist argument is passed to copy_answers in this overload
this->ans->copy_answers(eflag,vflag,eatom,vatom);
|
||||
this->device->add_ans_object(this->ans);
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ans2->copy_answers(eflag,vflag,eatom,vatom);
|
||||
this->device->add_ans_object(this->ans2);
|
||||
#endif
|
||||
this->hd_balancer.stop_timer();
|
||||
|
||||
// the returned pointer is offset backwards by host_start (== inum)
return this->nbor->host_jlist.begin()-host_start;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Calculate energies, forces, and torques
|
||||
// ---------------------------------------------------------------------------
|
||||
|
@ -402,9 +243,40 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
|||
else
|
||||
vflag=0;
|
||||
|
||||
int ainum=this->ans->inum();
|
||||
// build the short neighbor list
|
||||
int ainum=this->_ainum;
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
|
||||
&elem2param, &_nelements, &_nparams,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
// re-allocate zetaij if necessary
|
||||
int nall = this->_nall;
|
||||
if (nall*this->_max_nbors > _zetaij.cols()) {
|
||||
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
_zetaij.resize(this->_max_nbors*_nmax);
|
||||
}
|
||||
|
||||
nbor_pitch=this->nbor->nbor_pitch();
|
||||
GX=static_cast<int>(ceil(static_cast<double>(this->_ainum)/
|
||||
(BX/(JTHREADS*KTHREADS))));
|
||||
|
||||
this->k_zeta.set_size(GX,BX);
|
||||
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
ainum=this->ans->inum();
|
||||
nbor_pitch=this->nbor->nbor_pitch();
|
||||
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
this->time_pair.start();
|
||||
|
@ -412,6 +284,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
|||
this->k_pair.run(&this->atom->x, &ts1, &ts2, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->ans->force, &this->ans->engv,
|
||||
&eflag, &vflag, &ainum, &nbor_pitch,
|
||||
&this->_threads_per_atom);
|
||||
|
@ -423,6 +296,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
|||
this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom, &evatom);
|
||||
|
||||
|
@ -437,7 +311,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
|||
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&this->nbor->dev_acc, &this->dev_short_nbor,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
|
||||
|
@ -446,7 +320,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
|||
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&this->nbor->dev_acc, &this->dev_short_nbor,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
}
|
||||
|
|
|
@ -106,7 +106,7 @@ texture<int4> ts5_tex;
|
|||
ans[ii]=old; \
|
||||
}
|
||||
|
||||
#define store_zeta(z, tid, t_per_atom, offset) \
|
||||
#define acc_zeta(z, tid, t_per_atom, offset) \
|
||||
if (t_per_atom>1) { \
|
||||
__local acctyp red_acc[BLOCK_PAIR]; \
|
||||
red_acc[tid]=z; \
|
||||
|
@ -155,7 +155,7 @@ texture<int4> ts5_tex;
|
|||
ans[ii]=old; \
|
||||
}
|
||||
|
||||
#define store_zeta(z, tid, t_per_atom, offset) \
|
||||
#define acc_zeta(z, tid, t_per_atom, offset) \
|
||||
if (t_per_atom>1) { \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
z += shfl_xor(z, s, t_per_atom); \
|
||||
|
@ -164,6 +164,65 @@ texture<int4> ts5_tex;
|
|||
|
||||
#endif
|
||||
|
||||
__kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict cutsq,
|
||||
const __global int *restrict map,
|
||||
const __global int *restrict elem2param,
|
||||
const int nelements, const int nparams,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
__global int * dev_short_nbor,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
__local int n_stride;
|
||||
int tid, ii, offset;
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
int ncount = 0;
|
||||
int m = nbor;
|
||||
dev_short_nbor[m] = 0;
|
||||
int nbor_short = nbor+n_stride;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
int nj = j;
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
jtype=map[jtype];
|
||||
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (rsq<cutsq[ijparam]) {
|
||||
dev_short_nbor[nbor_short] = nj;
|
||||
nbor_short += n_stride;
|
||||
ncount++;
|
||||
}
|
||||
} // for nbor
|
||||
|
||||
// store the number of neighbors for each thread
|
||||
dev_short_nbor[m] = ncount;
|
||||
|
||||
} // if ii
|
||||
}
|
||||
|
||||
// Tersoff is currently used for 3 elements at most: 3*3*3 = 27 entries
|
||||
// while the block size should never be less than 32.
|
||||
// SHARED_SIZE = 32 for now to reduce the pressure on the shared memory per block
|
||||
|
@ -184,6 +243,7 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
|
|||
__global acctyp4 * zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_short_nbor,
|
||||
const int eflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
__local int tpa_sq,n_stride;
|
||||
|
@ -211,22 +271,29 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
|
|||
__syncthreads();
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor_j, nbor_end;
|
||||
int i, numj;
|
||||
|
||||
int nbor_j, nbor_end, i, numj;
|
||||
const int* nbor_mem=dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
int offset_k=tid & (t_per_atom-1);
|
||||
int nborj_start = nbor_j;
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
int nborj_start = nbor_j;
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
|
@ -241,14 +308,18 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
|
|||
delr1.z = jx.z-ix.z;
|
||||
numtyp rsq1 = delr1.x*delr1.x+delr1.y*delr1.y+delr1.z*delr1.z;
|
||||
|
||||
if (rsq1 > cutsq[ijparam]) continue;
|
||||
|
||||
// compute zeta_ij
|
||||
z = (numtyp)0;
|
||||
z = (acctyp)0;
|
||||
|
||||
int nbor_k = nborj_start-offset_j+offset_k;
|
||||
for ( ; nbor_k < nbor_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k_end = nbor_end;
|
||||
if (dev_packed==dev_nbor) {
|
||||
int numk = dev_short_nbor[nbor_k-n_stride];
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
}
|
||||
|
||||
for ( ; nbor_k < k_end; nbor_k+=n_stride) {
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (k == j) continue;
|
||||
|
@ -287,10 +358,12 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
|
|||
|
||||
//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
|
||||
//int idx = jj*n_stride + i*t_per_atom + offset_j;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
i, nbor_j, offset_j, idx);
|
||||
store_zeta(z, tid, t_per_atom, offset_k);
|
||||
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
|
||||
int idx = nbor_j;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// i, nbor_j, offset_j, idx);
|
||||
acc_zeta(z, tid, t_per_atom, offset_k);
|
||||
|
||||
numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex);
|
||||
numtyp ijparam_lam2 = ts1_ijparam.y;
|
||||
|
@ -331,6 +404,7 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
|
|||
const int nelements, const int nparams,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
|
@ -357,8 +431,8 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
|
|||
__syncthreads();
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
int nbor, nbor_end, i, numj;
|
||||
const int* nbor_mem=dev_packed;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
|
@ -366,9 +440,17 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
|
|||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor];
|
||||
nbor += n_stride;
|
||||
nbor_end = nbor+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
int j=nbor_mem[nbor];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
|
@ -383,32 +465,31 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
|
|||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (rsq<cutsq[ijparam]) {
|
||||
numtyp feng[2];
|
||||
numtyp ijparam_lam1 = ts1[ijparam].x;
|
||||
numtyp4 ts2_ijparam = ts2[ijparam];
|
||||
numtyp ijparam_biga = ts2_ijparam.x;
|
||||
numtyp ijparam_bigr = ts2_ijparam.z;
|
||||
numtyp ijparam_bigd = ts2_ijparam.w;
|
||||
// rsq<cutsq[ijparam]
|
||||
numtyp feng[2];
|
||||
numtyp ijparam_lam1 = ts1[ijparam].x;
|
||||
numtyp4 ts2_ijparam = ts2[ijparam];
|
||||
numtyp ijparam_biga = ts2_ijparam.x;
|
||||
numtyp ijparam_bigr = ts2_ijparam.z;
|
||||
numtyp ijparam_bigd = ts2_ijparam.w;
|
||||
|
||||
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
|
||||
rsq, eflag, feng);
|
||||
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
|
||||
rsq, eflag, feng);
|
||||
|
||||
numtyp force = feng[0];
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
numtyp force = feng[0];
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
|
||||
if (eflag>0)
|
||||
energy+=feng[1];
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
virial[1] += dely*dely*force;
|
||||
virial[2] += delz*delz*force;
|
||||
virial[3] += delx*dely*force;
|
||||
virial[4] += delx*delz*force;
|
||||
virial[5] += dely*delz*force;
|
||||
}
|
||||
if (eflag>0)
|
||||
energy+=feng[1];
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
virial[1] += dely*dely*force;
|
||||
virial[2] += delz*delz*force;
|
||||
virial[3] += delx*dely*force;
|
||||
virial[4] += delx*delz*force;
|
||||
virial[5] += dely*delz*force;
|
||||
}
|
||||
} // for nbor
|
||||
|
||||
|
@ -430,6 +511,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
|
|||
const __global acctyp4 *restrict zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
|
@ -465,20 +547,28 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
|
|||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor_j, nbor_end;
|
||||
|
||||
const int* nbor_mem=dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
int offset_k=tid & (t_per_atom-1);
|
||||
int nborj_start = nbor_j;
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
int nborj_start = nbor_j;
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
|
@ -493,7 +583,6 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
|
|||
delr1[2] = jx.z-ix.z;
|
||||
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
|
||||
|
||||
if (rsq1 > cutsq[ijparam]) continue;
|
||||
numtyp r1 = ucl_sqrt(rsq1);
|
||||
numtyp r1inv = ucl_rsqrt(rsq1);
|
||||
|
||||
|
@ -501,9 +590,11 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
|
|||
|
||||
//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
|
||||
//int idx = jj*n_stride + i*t_per_atom + offset_j;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
i, nbor_j, offset_j, idx);
|
||||
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
|
||||
int idx = nbor_j;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// i, nbor_j, offset_j, idx);
|
||||
acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
|
||||
numtyp force = zeta_ij.x*tpainv;
|
||||
numtyp prefactor = zeta_ij.y;
|
||||
|
@ -524,9 +615,15 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
|
|||
virial[5] += delr1[1]*delr1[2]*mforce;
|
||||
}
|
||||
|
||||
int nbor_k=nborj_start-offset_j+offset_k;
|
||||
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int nbor_k = nborj_start-offset_j+offset_k;
|
||||
int k_end = nbor_end;
|
||||
if (dev_packed==dev_nbor) {
|
||||
int numk = dev_short_nbor[nbor_k-n_stride];
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
}
|
||||
|
||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (j == k) continue;
|
||||
|
@ -606,6 +703,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
|||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
|
@ -642,7 +740,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
|||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor_j, nbor_end, k_end;
|
||||
|
||||
const int* nbor_mem=dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
|
@ -653,9 +751,18 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
|||
itype=map[itype];
|
||||
|
||||
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
|
@ -670,8 +777,6 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
|||
delr1[2] = jx.z-ix.z;
|
||||
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
|
||||
|
||||
if (rsq1 > cutsq[ijparam]) continue;
|
||||
|
||||
numtyp mdelr1[3];
|
||||
mdelr1[0] = -delr1[0];
|
||||
mdelr1[1] = -delr1[1];
|
||||
|
@ -693,13 +798,20 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
|||
k_end=nbor_k+numk;
|
||||
nbor_k+=offset_k;
|
||||
}
|
||||
|
||||
// recalculate numk and k_end for the use of short neighbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numk = dev_short_nbor[nbor_k];
|
||||
nbor_k += n_stride;
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
}
|
||||
int nbork_start = nbor_k;
|
||||
|
||||
// look up for zeta_ji: find i in the j's neighbor list
|
||||
int m = tid / t_per_atom;
|
||||
int ijnum = -1;
|
||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
if (k == i) {
|
||||
ijnum = nbor_k;
|
||||
|
@ -721,9 +833,11 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
|||
|
||||
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
|
||||
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
j, ijnum, offset_kf, idx);
|
||||
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
||||
int idx = ijnum;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// j, ijnum, offset_kf, idx);
|
||||
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
|
||||
numtyp force = zeta_ji.x*tpainv;
|
||||
numtyp prefactor_ji = zeta_ji.y;
|
||||
|
@ -746,7 +860,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
|||
|
||||
// attractive forces
|
||||
for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (k == i) continue;
|
||||
|
@ -790,9 +904,11 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
|||
|
||||
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
|
||||
//int idx = kk*n_stride + j*t_per_atom + offset_k;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
j, nbor_k, offset_k, idx);
|
||||
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
|
||||
int idx = nbor_k;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// j, nbor_k, offset_k, idx);
|
||||
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
|
||||
numtyp prefactor_jk = zeta_jk.y;
|
||||
int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
|
||||
|
@ -841,6 +957,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
|
@ -877,7 +994,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor_j, nbor_end, k_end;
|
||||
|
||||
const int* nbor_mem = dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
|
@ -888,9 +1005,18 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
itype=map[itype];
|
||||
|
||||
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
|
@ -905,8 +1031,6 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
delr1[2] = jx.z-ix.z;
|
||||
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
|
||||
|
||||
if (rsq1 > cutsq[ijparam]) continue;
|
||||
|
||||
numtyp mdelr1[3];
|
||||
mdelr1[0] = -delr1[0];
|
||||
mdelr1[1] = -delr1[1];
|
||||
|
@ -928,13 +1052,20 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
k_end=nbor_k+numk;
|
||||
nbor_k+=offset_k;
|
||||
}
|
||||
|
||||
// recalculate numk and k_end for the use of short neighbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numk = dev_short_nbor[nbor_k];
|
||||
nbor_k += n_stride;
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
}
|
||||
int nbork_start = nbor_k;
|
||||
|
||||
// look up for zeta_ji
|
||||
int m = tid / t_per_atom;
|
||||
int ijnum = -1;
|
||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
if (k == i) {
|
||||
ijnum = nbor_k;
|
||||
|
@ -956,9 +1087,11 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
|
||||
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
|
||||
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
j, ijnum, offset_kf, idx);
|
||||
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
||||
int idx = ijnum;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// j, ijnum, offset_kf, idx);
|
||||
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
|
||||
numtyp force = zeta_ji.x*tpainv;
|
||||
numtyp prefactor_ji = zeta_ji.y;
|
||||
|
@ -981,7 +1114,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
|
||||
// attractive forces
|
||||
for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (k == i) continue;
|
||||
|
@ -1032,9 +1165,11 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
|
||||
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
|
||||
//int idx = kk*n_stride + j*t_per_atom + offset_k;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
j, nbor_k, offset_k, idx);
|
||||
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
|
||||
int idx = nbor_k;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// j, nbor_k, offset_k, idx);
|
||||
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
|
||||
numtyp prefactor_jk = zeta_jk.y;
|
||||
|
||||
|
|
|
@ -47,21 +47,6 @@ class TersoffMod : public BaseThree<numtyp, acctyp> {
|
|||
const double* h, const double* beta, const double* powern,
|
||||
const double* powern_del, const double* ca1, const double* cutsq);
|
||||
|
||||
/// Pair loop with host neighboring
|
||||
void compute(const int f_ago, const int inum_full, const int nall,
|
||||
const int nlist, double **host_x, int *host_type,
|
||||
int *ilist, int *numj, int **firstneigh, const bool eflag,
|
||||
const bool vflag, const bool eatom, const bool vatom,
|
||||
int &host_start, const double cpu_time, bool &success);
|
||||
|
||||
/// Pair loop with device neighboring
|
||||
int ** compute(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **numj, const double cpu_time, bool &success);
|
||||
|
||||
/// Clear all host and device data
|
||||
/** \note This is called at the beginning of the init() routine **/
|
||||
void clear();
|
||||
|
@ -104,8 +89,7 @@ class TersoffMod : public BaseThree<numtyp, acctyp> {
|
|||
|
||||
UCL_Kernel k_zeta;
|
||||
UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex;
|
||||
|
||||
int _max_nbors;
|
||||
numtyp _cutshortsq;
|
||||
|
||||
private:
|
||||
bool _allocated;
|
||||
|
|
|
@ -62,7 +62,8 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall,
|
|||
int success;
|
||||
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
|
||||
_screen,tersoff_zbl,"k_tersoff_zbl_repulsive",
|
||||
"k_tersoff_zbl_three_center", "k_tersoff_zbl_three_end");
|
||||
"k_tersoff_zbl_three_center", "k_tersoff_zbl_three_end",
|
||||
"k_tersoff_zbl_short_nbor");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
|
@ -177,11 +178,16 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall,
|
|||
|
||||
UCL_H_Vec<numtyp> cutsq_view(nparams,*(this->ucl_device),
|
||||
UCL_WRITE_ONLY);
|
||||
for (int i=0; i<nparams; i++)
|
||||
double cutsqmax = 0.0;
|
||||
for (int i=0; i<nparams; i++) {
|
||||
cutsq_view[i]=static_cast<numtyp>(host_cutsq[i]);
|
||||
if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i];
|
||||
}
|
||||
cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
|
||||
ucl_copy(cutsq,cutsq_view,false);
|
||||
|
||||
_cutshortsq = static_cast<numtyp>(cutsqmax);
|
||||
|
||||
UCL_H_Vec<int> dview_elem2param(nelements*nelements*nelements,
|
||||
*(this->ucl_device), UCL_WRITE_ONLY);
|
||||
|
||||
|
@ -244,171 +250,6 @@ double TersoffZT::host_memory_usage() const {
|
|||
|
||||
#define KTHREADS this->_threads_per_atom
|
||||
#define JTHREADS this->_threads_per_atom
|
||||
// ---------------------------------------------------------------------------
|
||||
// Copy nbor list from host if necessary and then calculate forces, virials,..
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
void TersoffZT::compute(const int f_ago, const int inum_full, const int nall,
|
||||
const int nlist, double **host_x, int *host_type,
|
||||
int *ilist, int *numj, int **firstneigh,
|
||||
const bool eflag, const bool vflag, const bool eatom,
|
||||
const bool vatom, int &host_start,
|
||||
const double cpu_time, bool &success) {
|
||||
this->acc_timers();
|
||||
if (inum_full==0) {
|
||||
host_start=0;
|
||||
// Make sure textures are correct if realloc by a different hybrid style
|
||||
this->resize_atom(0,nall,success);
|
||||
this->zero_timers();
|
||||
return;
|
||||
}
|
||||
|
||||
int ago=this->hd_balancer.ago_first(f_ago);
|
||||
int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
|
||||
this->ans->inum(inum);
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ans2->inum(inum);
|
||||
#endif
|
||||
host_start=inum;
|
||||
|
||||
if (ago==0) {
|
||||
this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
|
||||
if (!success)
|
||||
return;
|
||||
_max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist);
|
||||
}
|
||||
|
||||
this->atom->cast_x_data(host_x,host_type);
|
||||
this->hd_balancer.start_timer();
|
||||
this->atom->add_x_data(host_x,host_type);
|
||||
|
||||
// re-allocate zetaij if necessary
|
||||
if (nall*_max_nbors > _zetaij.cols()) {
|
||||
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
_zetaij.resize(_max_nbors*_nmax);
|
||||
}
|
||||
|
||||
int _eflag;
|
||||
if (eflag)
|
||||
_eflag=1;
|
||||
else
|
||||
_eflag=0;
|
||||
|
||||
int ainum=nlist;
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
int BX=this->block_pair();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
|
||||
(BX/(JTHREADS*KTHREADS))));
|
||||
|
||||
this->k_zeta.set_size(GX,BX);
|
||||
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
int evatom=0;
|
||||
if (eatom || vatom)
|
||||
evatom=1;
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ucl_device->sync();
|
||||
#endif
|
||||
loop(eflag,vflag,evatom);
|
||||
this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
|
||||
this->device->add_ans_object(this->ans);
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ans2->copy_answers(eflag,vflag,eatom,vatom,ilist);
|
||||
this->device->add_ans_object(this->ans2);
|
||||
#endif
|
||||
this->hd_balancer.stop_timer();
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Reneighbor on GPU if necessary and then compute forces, virials, energies
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int ** TersoffZT::compute(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special, const bool eflag,
|
||||
const bool vflag, const bool eatom,
|
||||
const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum,
|
||||
const double cpu_time, bool &success) {
|
||||
this->acc_timers();
|
||||
|
||||
if (inum_full==0) {
|
||||
host_start=0;
|
||||
// Make sure textures are correct if realloc by a different hybrid style
|
||||
this->resize_atom(0,nall,success);
|
||||
this->zero_timers();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
this->hd_balancer.balance(cpu_time);
|
||||
int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
|
||||
this->ans->inum(inum);
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ans2->inum(inum);
|
||||
#endif
|
||||
host_start=inum;
|
||||
|
||||
// Build neighbor list on GPU if necessary
|
||||
if (ago==0) {
|
||||
_max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
|
||||
sublo, subhi, tag, nspecial, special, success);
|
||||
if (!success)
|
||||
return NULL;
|
||||
this->hd_balancer.start_timer();
|
||||
} else {
|
||||
this->atom->cast_x_data(host_x,host_type);
|
||||
this->hd_balancer.start_timer();
|
||||
this->atom->add_x_data(host_x,host_type);
|
||||
}
|
||||
*ilist=this->nbor->host_ilist.begin();
|
||||
*jnum=this->nbor->host_acc.begin();
|
||||
|
||||
// re-allocate zetaij if necessary
|
||||
if (nall*_max_nbors > _zetaij.cols()) {
|
||||
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
_zetaij.resize(_max_nbors*_nmax);
|
||||
}
|
||||
|
||||
int _eflag;
|
||||
if (eflag)
|
||||
_eflag=1;
|
||||
else
|
||||
_eflag=0;
|
||||
|
||||
int ainum=nall;
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
int BX=this->block_pair();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
|
||||
(BX/(JTHREADS*KTHREADS))));
|
||||
|
||||
this->k_zeta.set_size(GX,BX);
|
||||
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
int evatom=0;
|
||||
if (eatom || vatom)
|
||||
evatom=1;
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ucl_device->sync();
|
||||
#endif
|
||||
loop(eflag,vflag,evatom);
|
||||
this->ans->copy_answers(eflag,vflag,eatom,vatom);
|
||||
this->device->add_ans_object(this->ans);
|
||||
#ifdef THREE_CONCURRENT
|
||||
this->ans2->copy_answers(eflag,vflag,eatom,vatom);
|
||||
this->device->add_ans_object(this->ans2);
|
||||
#endif
|
||||
this->hd_balancer.stop_timer();
|
||||
|
||||
return this->nbor->host_jlist.begin()-host_start;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Calculate energies, forces, and torques
|
||||
// ---------------------------------------------------------------------------
|
||||
|
@ -427,9 +268,40 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
|||
else
|
||||
vflag=0;
|
||||
|
||||
int ainum=this->ans->inum();
|
||||
// build the short neighbor list
|
||||
int ainum=this->_ainum;
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
|
||||
&elem2param, &_nelements, &_nparams,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
// re-allocate zetaij if necessary
|
||||
int nall = this->_nall;
|
||||
if (nall*this->_max_nbors > _zetaij.cols()) {
|
||||
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
_zetaij.resize(this->_max_nbors*_nmax);
|
||||
}
|
||||
|
||||
nbor_pitch=this->nbor->nbor_pitch();
|
||||
GX=static_cast<int>(ceil(static_cast<double>(this->_ainum)/
|
||||
(BX/(JTHREADS*KTHREADS))));
|
||||
|
||||
this->k_zeta.set_size(GX,BX);
|
||||
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
ainum=this->ans->inum();
|
||||
nbor_pitch=this->nbor->nbor_pitch();
|
||||
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
this->time_pair.start();
|
||||
|
@ -438,6 +310,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
|||
&_global_e, &_global_a_0, &_global_epsilon_0, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->ans->force, &this->ans->engv,
|
||||
&eflag, &vflag, &ainum, &nbor_pitch,
|
||||
&this->_threads_per_atom);
|
||||
|
@ -449,6 +322,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
|||
this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom, &evatom);
|
||||
|
||||
|
@ -463,7 +337,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
|||
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&this->nbor->dev_acc, &this->dev_short_nbor,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
|
||||
|
@ -472,7 +346,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
|||
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&this->nbor->dev_acc, &this->dev_short_nbor,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
}
|
||||
|
|
|
@ -109,7 +109,7 @@ texture<int4> ts6_tex;
|
|||
ans[ii]=old; \
|
||||
}
|
||||
|
||||
#define store_zeta(z, tid, t_per_atom, offset) \
|
||||
#define acc_zeta(z, tid, t_per_atom, offset) \
|
||||
if (t_per_atom>1) { \
|
||||
__local acctyp red_acc[BLOCK_PAIR]; \
|
||||
red_acc[tid]=z; \
|
||||
|
@ -158,7 +158,7 @@ texture<int4> ts6_tex;
|
|||
ans[ii]=old; \
|
||||
}
|
||||
|
||||
#define store_zeta(z, tid, t_per_atom, offset) \
|
||||
#define acc_zeta(z, tid, t_per_atom, offset) \
|
||||
if (t_per_atom>1) { \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
z += shfl_xor(z, s, t_per_atom); \
|
||||
|
@ -167,6 +167,65 @@ texture<int4> ts6_tex;
|
|||
|
||||
#endif
|
||||
|
||||
// ---------------------------------------------------------------------------
// Build the short neighbor list for the Tersoff/ZBL kernels.
//
// For the atom handled by this thread, the neighbor entries read from
// dev_packed are filtered by distance: an entry is kept when the squared
// distance between atom i and neighbor j is below cutsq[ijparam], where
// ijparam = elem2param[itype,jtype,jtype].
//
// Layout written to dev_short_nbor (same indexing as the scanned list):
//   [start]               number of entries kept by this thread
//   [start + k*n_stride]  k-th kept entry (k = 1..count), stored with the
//                         bits above NEIGHMASK left untouched
// where `start` is the begin index returned by nbor_info.
//
// x_             atom positions; the .w component is used as the atom type
// cutsq          squared cutoff, indexed by ijparam
// map            atom type -> element index
// elem2param     (element i, j, k) -> parameter index
// nelements      number of elements (row length of elem2param)
// nparams        not referenced in this kernel
// dev_nbor       neighbor list header/data handed to nbor_info
// dev_packed     neighbor entries that are scanned
// dev_short_nbor output short neighbor list
// inum           number of atoms to process
// nbor_pitch     neighbor list pitch handed to nbor_info
// t_per_atom     threads per atom handed to atom_info / nbor_info
//
// NOTE(review): atom_info, nbor_info and fetch4 are macros defined outside
// this block; they are assumed to fill in tid/ii/offset, the neighbor range
// (i, numj, n_stride, nbor_end, nbor) and the position of an atom.
// ---------------------------------------------------------------------------
__kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
                                       const __global numtyp *restrict cutsq,
                                       const __global int *restrict map,
                                       const __global int *restrict elem2param,
                                       const int nelements, const int nparams,
                                       const __global int * dev_nbor,
                                       const __global int * dev_packed,
                                       __global int * dev_short_nbor,
                                       const int inum, const int nbor_pitch,
                                       const int t_per_atom) {
  __local int n_stride;
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

  // threads mapped past the last atom have nothing to filter
  if (ii>=inum) return;

  int nbor, nbor_end;
  int i, numj;
  nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
            n_stride,nbor_end,nbor);

  // position of the central atom; its .w component carries the type
  numtyp4 ix; fetch4(ix,i,pos_tex);
  const int itype=map[(int)ix.w];
  const int iparam_base=itype*nelements*nelements;

  // the slot where the scanned list begins receives the count,
  // the kept entries follow one stride further on
  const int count_slot=nbor;
  dev_short_nbor[count_slot]=0;
  int write_pos=count_slot+n_stride;
  int n_kept=0;

  while (nbor<nbor_end) {
    // keep the unmasked entry so that it can be copied through unchanged
    const int packed_j=dev_packed[nbor];
    int j=packed_j & NEIGHMASK;

    numtyp4 jx; fetch4(jx,j,pos_tex);
    const int jtype=map[(int)jx.w];
    const int ijparam=elem2param[iparam_base+jtype*nelements+jtype];

    // squared distance between i and j
    const numtyp delx=ix.x-jx.x;
    const numtyp dely=ix.y-jx.y;
    const numtyp delz=ix.z-jx.z;
    const numtyp rsq=delx*delx+dely*dely+delz*delz;

    if (rsq<cutsq[ijparam]) {
      dev_short_nbor[write_pos]=packed_j;
      write_pos+=n_stride;
      n_kept++;
    }

    nbor+=n_stride;
  }

  // publish how many neighbors this thread kept
  dev_short_nbor[count_slot]=n_kept;
}
|
||||
|
||||
// Tersoff is currently used for 3 elements at most: 3*3*3 = 27 entries
|
||||
// while the block size should never be less than 32.
|
||||
// SHARED_SIZE = 32 for now to reduce the pressure on the shared memory per block
|
||||
|
@ -188,6 +247,7 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
|
|||
__global acctyp4 * zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_short_nbor,
|
||||
const int eflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
__local int tpa_sq,n_stride;
|
||||
|
@ -217,22 +277,29 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
|
|||
__syncthreads();
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor_j, nbor_end;
|
||||
int i, numj;
|
||||
|
||||
int nbor_j, nbor_end, i, numj;
|
||||
const int* nbor_mem=dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
int offset_k=tid & (t_per_atom-1);
|
||||
int nborj_start = nbor_j;
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
int nborj_start = nbor_j;
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
|
@ -247,14 +314,18 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
|
|||
delr1.z = jx.z-ix.z;
|
||||
numtyp rsq1 = delr1.x*delr1.x+delr1.y*delr1.y+delr1.z*delr1.z;
|
||||
|
||||
if (rsq1 > cutsq[ijparam]) continue;
|
||||
|
||||
// compute zeta_ij
|
||||
z = (acctyp)0;
|
||||
|
||||
int nbor_k = nborj_start-offset_j+offset_k;
|
||||
for ( ; nbor_k < nbor_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k_end = nbor_end;
|
||||
if (dev_packed==dev_nbor) {
|
||||
int numk = dev_short_nbor[nbor_k-n_stride];
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
}
|
||||
|
||||
for ( ; nbor_k < k_end; nbor_k+=n_stride) {
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (k == j) continue;
|
||||
|
@ -290,10 +361,12 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
|
|||
|
||||
//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
|
||||
//int idx = jj*n_stride + i*t_per_atom + offset_j;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
i, nbor_j, offset_j, idx);
|
||||
store_zeta(z, tid, t_per_atom, offset_k);
|
||||
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
|
||||
int idx = nbor_j;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// i, nbor_j, offset_j, idx);
|
||||
acc_zeta(z, tid, t_per_atom, offset_k);
|
||||
|
||||
numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex);
|
||||
numtyp ijparam_lam2 = ts1_ijparam.y;
|
||||
|
@ -342,6 +415,7 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
|
|||
const int nelements, const int nparams,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
|
@ -370,8 +444,8 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
|
|||
__syncthreads();
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
int nbor, nbor_end, i, numj;
|
||||
const int* nbor_mem=dev_packed;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
|
@ -379,9 +453,17 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
|
|||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor];
|
||||
nbor += n_stride;
|
||||
nbor_end = nbor+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
int j=nbor_mem[nbor];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
|
@ -396,38 +478,37 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
|
|||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (rsq<cutsq[ijparam]) {
|
||||
numtyp feng[2];
|
||||
numtyp ijparam_lam1 = ts1[ijparam].x;
|
||||
numtyp4 ts2_ijparam = ts2[ijparam];
|
||||
numtyp ijparam_biga = ts2_ijparam.x;
|
||||
numtyp ijparam_bigr = ts2_ijparam.z;
|
||||
numtyp ijparam_bigd = ts2_ijparam.w;
|
||||
numtyp4 ts6_ijparam = ts6[ijparam];
|
||||
numtyp ijparam_Z_i = ts6_ijparam.x;
|
||||
numtyp ijparam_Z_j = ts6_ijparam.y;
|
||||
numtyp ijparam_ZBLcut = ts6_ijparam.z;
|
||||
numtyp ijparam_ZBLexpscale = ts6_ijparam.w;
|
||||
// rsq<cutsq[ijparam]
|
||||
numtyp feng[2];
|
||||
numtyp ijparam_lam1 = ts1[ijparam].x;
|
||||
numtyp4 ts2_ijparam = ts2[ijparam];
|
||||
numtyp ijparam_biga = ts2_ijparam.x;
|
||||
numtyp ijparam_bigr = ts2_ijparam.z;
|
||||
numtyp ijparam_bigd = ts2_ijparam.w;
|
||||
numtyp4 ts6_ijparam = ts6[ijparam];
|
||||
numtyp ijparam_Z_i = ts6_ijparam.x;
|
||||
numtyp ijparam_Z_j = ts6_ijparam.y;
|
||||
numtyp ijparam_ZBLcut = ts6_ijparam.z;
|
||||
numtyp ijparam_ZBLexpscale = ts6_ijparam.w;
|
||||
|
||||
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
|
||||
ijparam_Z_i, ijparam_Z_j, ijparam_ZBLcut, ijparam_ZBLexpscale,
|
||||
global_e, global_a_0, global_epsilon_0, rsq, eflag, feng);
|
||||
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
|
||||
ijparam_Z_i, ijparam_Z_j, ijparam_ZBLcut, ijparam_ZBLexpscale,
|
||||
global_e, global_a_0, global_epsilon_0, rsq, eflag, feng);
|
||||
|
||||
numtyp force = feng[0];
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
numtyp force = feng[0];
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
|
||||
if (eflag>0)
|
||||
energy+=feng[1];
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
virial[1] += dely*dely*force;
|
||||
virial[2] += delz*delz*force;
|
||||
virial[3] += delx*dely*force;
|
||||
virial[4] += delx*delz*force;
|
||||
virial[5] += dely*delz*force;
|
||||
}
|
||||
if (eflag>0)
|
||||
energy+=feng[1];
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
virial[1] += dely*dely*force;
|
||||
virial[2] += delz*delz*force;
|
||||
virial[3] += delx*dely*force;
|
||||
virial[4] += delx*delz*force;
|
||||
virial[5] += dely*delz*force;
|
||||
}
|
||||
} // for nbor
|
||||
|
||||
|
@ -448,6 +529,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
|
|||
const __global acctyp4 *restrict zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
|
@ -481,20 +563,28 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
|
|||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor_j, nbor_end;
|
||||
|
||||
const int* nbor_mem=dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
int offset_k=tid & (t_per_atom-1);
|
||||
int nborj_start = nbor_j;
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
int nborj_start = nbor_j;
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
|
@ -509,7 +599,6 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
|
|||
delr1[2] = jx.z-ix.z;
|
||||
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
|
||||
|
||||
if (rsq1 > cutsq[ijparam]) continue;
|
||||
numtyp r1 = ucl_sqrt(rsq1);
|
||||
numtyp r1inv = ucl_rsqrt(rsq1);
|
||||
|
||||
|
@ -517,9 +606,11 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
|
|||
|
||||
//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
|
||||
//int idx = jj*n_stride + i*t_per_atom + offset_j;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
i, nbor_j, offset_j, idx);
|
||||
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
|
||||
int idx = nbor_j;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// i, nbor_j, offset_j, idx);
|
||||
acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
|
||||
numtyp force = zeta_ij.x*tpainv;
|
||||
numtyp prefactor = zeta_ij.y;
|
||||
|
@ -540,9 +631,15 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
|
|||
virial[5] += delr1[1]*delr1[2]*mforce;
|
||||
}
|
||||
|
||||
int nbor_k=nborj_start-offset_j+offset_k;
|
||||
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int nbor_k = nborj_start-offset_j+offset_k;
|
||||
int k_end = nbor_end;
|
||||
if (dev_packed==dev_nbor) {
|
||||
int numk = dev_short_nbor[nbor_k-n_stride];
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
}
|
||||
|
||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (j == k) continue;
|
||||
|
@ -618,6 +715,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
|||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
|
@ -652,7 +750,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
|||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor_j, nbor_end, k_end;
|
||||
|
||||
const int* nbor_mem=dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
|
@ -663,9 +761,18 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
|||
itype=map[itype];
|
||||
|
||||
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
|
@ -680,8 +787,6 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
|||
delr1[2] = jx.z-ix.z;
|
||||
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
|
||||
|
||||
if (rsq1 > cutsq[ijparam]) continue;
|
||||
|
||||
numtyp mdelr1[3];
|
||||
mdelr1[0] = -delr1[0];
|
||||
mdelr1[1] = -delr1[1];
|
||||
|
@ -703,13 +808,20 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
|||
k_end=nbor_k+numk;
|
||||
nbor_k+=offset_k;
|
||||
}
|
||||
|
||||
// recalculate numk and k_end for the use of short neighbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numk = dev_short_nbor[nbor_k];
|
||||
nbor_k += n_stride;
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
}
|
||||
int nbork_start = nbor_k;
|
||||
|
||||
// look up for zeta_ji: find i in the j's neighbor list
|
||||
int m = tid / t_per_atom;
|
||||
int ijnum = -1;
|
||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
if (k == i) {
|
||||
ijnum = nbor_k;
|
||||
|
@ -731,9 +843,11 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
|||
|
||||
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
|
||||
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
j, ijnum, offset_kf, idx);
|
||||
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
||||
int idx = ijnum;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// j, ijnum, offset_kf, idx);
|
||||
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
|
||||
numtyp force = zeta_ji.x*tpainv;
|
||||
numtyp prefactor_ji = zeta_ji.y;
|
||||
|
@ -756,7 +870,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
|||
|
||||
// attractive forces
|
||||
for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (k == i) continue;
|
||||
|
@ -797,9 +911,11 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
|||
|
||||
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
|
||||
//int idx = kk*n_stride + j*t_per_atom + offset_k;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
j, nbor_k, offset_k, idx);
|
||||
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
|
||||
int idx = nbor_k;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// j, nbor_k, offset_k, idx);
|
||||
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
|
||||
numtyp prefactor_jk = zeta_jk.y;
|
||||
int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
|
||||
|
@ -844,6 +960,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
|
@ -878,7 +995,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor_j, nbor_end, k_end;
|
||||
|
||||
const int* nbor_mem = dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
|
@ -889,9 +1006,18 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
itype=map[itype];
|
||||
|
||||
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
|
@ -906,8 +1032,6 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
delr1[2] = jx.z-ix.z;
|
||||
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
|
||||
|
||||
if (rsq1 > cutsq[ijparam]) continue;
|
||||
|
||||
numtyp mdelr1[3];
|
||||
mdelr1[0] = -delr1[0];
|
||||
mdelr1[1] = -delr1[1];
|
||||
|
@ -929,13 +1053,20 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
k_end=nbor_k+numk;
|
||||
nbor_k+=offset_k;
|
||||
}
|
||||
|
||||
// recalculate numk and k_end for the use of short neighbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numk = dev_short_nbor[nbor_k];
|
||||
nbor_k += n_stride;
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
}
|
||||
int nbork_start = nbor_k;
|
||||
|
||||
// look up for zeta_ji
|
||||
int m = tid / t_per_atom;
|
||||
int ijnum = -1;
|
||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
if (k == i) {
|
||||
ijnum = nbor_k;
|
||||
|
@ -957,9 +1088,11 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
|
||||
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
|
||||
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
j, ijnum, offset_kf, idx);
|
||||
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
||||
int idx = ijnum;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// j, ijnum, offset_kf, idx);
|
||||
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
|
||||
numtyp force = zeta_ji.x*tpainv;
|
||||
numtyp prefactor_ji = zeta_ji.y;
|
||||
|
@ -982,7 +1115,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
|
||||
// attractive forces
|
||||
for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (k == i) continue;
|
||||
|
@ -1030,9 +1163,11 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
|
||||
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
|
||||
//int idx = kk*n_stride + j*t_per_atom + offset_k;
|
||||
int idx;
|
||||
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
j, nbor_k, offset_k, idx);
|
||||
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
|
||||
int idx = nbor_k;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||
// j, nbor_k, offset_k, idx);
|
||||
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
|
||||
numtyp prefactor_jk = zeta_jk.y;
|
||||
|
||||
|
|
|
@ -49,21 +49,6 @@ class TersoffZBL : public BaseThree<numtyp, acctyp> {
|
|||
const double* ZBLcut, const double* ZBLexpscale, const double global_e,
|
||||
const double global_a_0, const double global_epsilon_0, const double* cutsq);
|
||||
|
||||
/// Pair loop with host neighboring
|
||||
void compute(const int f_ago, const int inum_full, const int nall,
|
||||
const int nlist, double **host_x, int *host_type,
|
||||
int *ilist, int *numj, int **firstneigh, const bool eflag,
|
||||
const bool vflag, const bool eatom, const bool vatom,
|
||||
int &host_start, const double cpu_time, bool &success);
|
||||
|
||||
/// Pair loop with device neighboring
|
||||
int ** compute(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **numj, const double cpu_time, bool &success);
|
||||
|
||||
/// Clear all host and device data
|
||||
/** \note This is called at the beginning of the init() routine **/
|
||||
void clear();
|
||||
|
@ -109,8 +94,8 @@ class TersoffZBL : public BaseThree<numtyp, acctyp> {
|
|||
UCL_Kernel k_zeta;
|
||||
UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex, ts6_tex;
|
||||
|
||||
int _max_nbors;
|
||||
numtyp _global_e,_global_a_0,_global_epsilon_0;
|
||||
numtyp _cutshortsq;
|
||||
|
||||
private:
|
||||
bool _allocated;
|
||||
|
|
|
@ -59,7 +59,7 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i
|
|||
int success;
|
||||
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
|
||||
_screen,vashishta,"k_vashishta","k_vashishta_three_center",
|
||||
"k_vashishta_three_end");
|
||||
"k_vashishta_three_end","k_vashishta_short_nbor");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
|
@ -128,15 +128,18 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i
|
|||
|
||||
param4.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
|
||||
|
||||
double r0sqmax = 0;
|
||||
for (int i=0; i<nparams; i++) {
|
||||
double r0sq = r0[i]*r0[i]-1e-4; // TODO: should we have the 1e-4?
|
||||
|
||||
double r0sq = r0[i]*r0[i]; // TODO: should we have the 1e-4?
|
||||
if (r0sqmax < r0sq) r0sqmax = r0sq;
|
||||
dview[i].x=static_cast<numtyp>(r0sq);
|
||||
dview[i].y=static_cast<numtyp>(gamma[i]);
|
||||
dview[i].z=static_cast<numtyp>(cutsq[i]);
|
||||
dview[i].w=static_cast<numtyp>(r0[i]);
|
||||
}
|
||||
|
||||
_cutshortsq = static_cast<numtyp>(r0sqmax);
|
||||
|
||||
ucl_copy(param4,dview,false);
|
||||
param4_tex.get_texture(*(this->pair_program),"param4_tex");
|
||||
param4_tex.bind_float(param4,4);
|
||||
|
@ -223,15 +226,28 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
|||
else
|
||||
vflag=0;
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
// build the short neighbor list
|
||||
int ainum=this->_ainum;
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &param4, &map,
|
||||
&elem2param, &_nelements, &_nparams,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
// this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1
|
||||
// this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1
|
||||
int ainum=this->ans->inum();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
ainum=this->ans->inum();
|
||||
nbor_pitch=this->nbor->nbor_pitch();
|
||||
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
this->time_pair.start();
|
||||
|
||||
// note that k_pair does not run with the short neighbor list
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
|
||||
&map, &elem2param, &_nelements,
|
||||
|
@ -248,6 +264,7 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
|||
this->k_three_center.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
|
||||
&map, &elem2param, &_nelements,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom, &evatom);
|
||||
Answer<numtyp,acctyp> *end_ans;
|
||||
|
@ -257,21 +274,19 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
|||
end_ans=this->ans;
|
||||
#endif
|
||||
if (evatom!=0) {
|
||||
|
||||
this->k_three_end_vatom.set_size(GX,BX);
|
||||
this->k_three_end_vatom.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
|
||||
&map, &elem2param, &_nelements,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&this->nbor->dev_acc, &this->dev_short_nbor,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
} else {
|
||||
|
||||
this->k_three_end.set_size(GX,BX);
|
||||
this->k_three_end.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
|
||||
&map, &elem2param, &_nelements,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&this->nbor->dev_acc, &this->dev_short_nbor,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
}
|
||||
|
|
|
@ -136,6 +136,64 @@ texture<int4> param5_tex;
|
|||
|
||||
#endif
|
||||
|
||||
// ---------------------------------------------------------------------------
// Build the short neighbor list for the Vashishta three-body kernels.
//
// For the atom handled by this thread, the neighbor entries read from
// dev_packed are filtered by distance: an entry is kept when the squared
// distance between atom i and neighbor j is below param4[ijparam].x, where
// ijparam = elem2param[itype,jtype,jtype]. The .x component of param4 is
// r0sq (the .z component, cutsq, is not used here).
//
// Layout written to dev_short_nbor (same indexing as the scanned list):
//   [start]               number of entries kept by this thread
//   [start + k*n_stride]  k-th kept entry (k = 1..count), stored with the
//                         bits above NEIGHMASK left untouched
// where `start` is the begin index returned by nbor_info.
//
// x_             atom positions; the .w component is used as the atom type
// param4         per-parameter data; only .x is read
// map            atom type -> element index
// elem2param     (element i, j, k) -> parameter index
// nelements      number of elements (row length of elem2param)
// nparams        not referenced in this kernel
// dev_nbor       neighbor list header/data handed to nbor_info
// dev_packed     neighbor entries that are scanned
// dev_short_nbor output short neighbor list
// inum           number of atoms to process
// nbor_pitch     neighbor list pitch handed to nbor_info
// t_per_atom     threads per atom handed to atom_info / nbor_info
//
// NOTE(review): atom_info, nbor_info and fetch4 are macros defined outside
// this block; they are assumed to fill in tid/ii/offset, the neighbor range
// (i, numj, n_stride, nbor_end, nbor) and the position of an atom.
// ---------------------------------------------------------------------------
__kernel void k_vashishta_short_nbor(const __global numtyp4 *restrict x_,
                                     const __global numtyp4 *restrict param4,
                                     const __global int *restrict map,
                                     const __global int *restrict elem2param,
                                     const int nelements, const int nparams,
                                     const __global int * dev_nbor,
                                     const __global int * dev_packed,
                                     __global int * dev_short_nbor,
                                     const int inum, const int nbor_pitch,
                                     const int t_per_atom) {
  __local int n_stride;
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

  // threads mapped past the last atom have nothing to filter
  if (ii>=inum) return;

  int nbor, nbor_end;
  int i, numj;
  nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
            n_stride,nbor_end,nbor);

  // position of the central atom; its .w component carries the type
  numtyp4 ix; fetch4(ix,i,pos_tex);
  const int itype=map[(int)ix.w];
  const int iparam_base=itype*nelements*nelements;

  // the slot where the scanned list begins receives the count,
  // the kept entries follow one stride further on
  const int count_slot=nbor;
  dev_short_nbor[count_slot]=0;
  int write_pos=count_slot+n_stride;
  int n_kept=0;

  while (nbor<nbor_end) {
    // keep the unmasked entry so that it can be copied through unchanged
    const int packed_j=dev_packed[nbor];
    int j=packed_j & NEIGHMASK;

    numtyp4 jx; fetch4(jx,j,pos_tex);
    const int jtype=map[(int)jx.w];
    const int ijparam=elem2param[iparam_base+jtype*nelements+jtype];

    // squared distance between i and j
    const numtyp delx=ix.x-jx.x;
    const numtyp dely=ix.y-jx.y;
    const numtyp delz=ix.z-jx.z;
    const numtyp rsq=delx*delx+dely*dely+delz*delz;

    // short-range test against r0sq (param4.x), not the pair cutoff (param4.z)
    if (rsq<param4[ijparam].x) {
      dev_short_nbor[write_pos]=packed_j;
      write_pos+=n_stride;
      n_kept++;
    }

    nbor+=n_stride;
  }

  // publish how many neighbors this thread kept
  dev_short_nbor[count_slot]=n_kept;
}
|
||||
|
||||
__kernel void k_vashishta(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict param1,
|
||||
|
@ -166,8 +224,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
|
|||
__syncthreads();
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
int nbor, nbor_end, i, numj;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
|
@ -211,7 +268,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
|
|||
numtyp param3_dvrc=param3_ijparam.z;
|
||||
numtyp param3_c0 =param3_ijparam.w;
|
||||
|
||||
numtyp r=sqrt(rsq);
|
||||
numtyp r=ucl_sqrt(rsq);
|
||||
numtyp rinvsq=1.0/rsq;
|
||||
numtyp r4inv = rinvsq*rinvsq;
|
||||
numtyp r6inv = rinvsq*r4inv;
|
||||
|
@ -219,8 +276,8 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
|
|||
numtyp reta = pow(r,-param1_eta);
|
||||
numtyp lam1r = r*param1_lam1inv;
|
||||
numtyp lam4r = r*param1_lam4inv;
|
||||
numtyp vc2 = param1_zizj * exp(-lam1r)/r;
|
||||
numtyp vc3 = param2_mbigd * r4inv*exp(-lam4r);
|
||||
numtyp vc2 = param1_zizj * ucl_exp(-lam1r)/r;
|
||||
numtyp vc3 = param2_mbigd * r4inv*ucl_exp(-lam4r);
|
||||
|
||||
numtyp force = (param2_dvrc*r
|
||||
- (4.0*vc3 + lam4r*vc3+param2_big6w*r6inv
|
||||
|
@ -230,6 +287,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
|
|||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
|
||||
if (eflag>0)
|
||||
energy += (param3_bigh*reta+vc2-vc3-param3_bigw*r6inv-r*param3_dvrc+param3_c0);
|
||||
|
||||
|
@ -255,31 +313,31 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
|
|||
numtyp r1 = ucl_sqrt(rsq1); \
|
||||
numtyp rinvsq1 = ucl_recip(rsq1); \
|
||||
numtyp rainv1 = ucl_recip(r1 - param_r0_ij); \
|
||||
numtyp gsrainv1 = param_gamma_ij * rainv1; \
|
||||
numtyp gsrainv1 = param_gamma_ij * rainv1; \
|
||||
numtyp gsrainvsq1 = gsrainv1*rainv1/r1; \
|
||||
numtyp expgsrainv1 = ucl_exp(gsrainv1); \
|
||||
\
|
||||
numtyp r2 = ucl_sqrt(rsq2); \
|
||||
numtyp rinvsq2 = ucl_recip(rsq2); \
|
||||
numtyp rainv2 = ucl_recip(r2 - param_r0_ik); \
|
||||
numtyp gsrainv2 = param_gamma_ik * rainv2; \
|
||||
numtyp gsrainv2 = param_gamma_ik * rainv2; \
|
||||
numtyp gsrainvsq2 = gsrainv2*rainv2/r2; \
|
||||
numtyp expgsrainv2 = ucl_exp(gsrainv2); \
|
||||
\
|
||||
numtyp rinv12 = ucl_recip(r1*r2); \
|
||||
numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12; \
|
||||
numtyp delcs = cs - param_costheta_ijk; \
|
||||
numtyp delcs = cs - param_costheta_ijk; \
|
||||
numtyp delcssq = delcs*delcs; \
|
||||
numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \
|
||||
numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \
|
||||
numtyp pcsinvsq = pcsinv*pcsinv; \
|
||||
numtyp pcs = delcssq/pcsinv; \
|
||||
\
|
||||
numtyp facexp = expgsrainv1*expgsrainv2; \
|
||||
\
|
||||
numtyp facrad = param_bigb_ijk * facexp*pcs; \
|
||||
numtyp facrad = param_bigb_ijk * facexp*pcs; \
|
||||
numtyp frad1 = facrad*gsrainvsq1; \
|
||||
numtyp frad2 = facrad*gsrainvsq2; \
|
||||
numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \
|
||||
numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \
|
||||
numtyp facang12 = rinv12*facang; \
|
||||
numtyp csfacang = cs*facang; \
|
||||
numtyp csfac1 = rinvsq1*csfacang; \
|
||||
|
@ -311,28 +369,28 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
|
|||
numtyp r1 = ucl_sqrt(rsq1); \
|
||||
numtyp rinvsq1 = ucl_recip(rsq1); \
|
||||
numtyp rainv1 = ucl_recip(r1 - param_r0_ij); \
|
||||
numtyp gsrainv1 = param_gamma_ij * rainv1; \
|
||||
numtyp gsrainv1 = param_gamma_ij * rainv1; \
|
||||
numtyp gsrainvsq1 = gsrainv1*rainv1/r1; \
|
||||
numtyp expgsrainv1 = ucl_exp(gsrainv1); \
|
||||
\
|
||||
numtyp r2 = ucl_sqrt(rsq2); \
|
||||
numtyp rainv2 = ucl_recip(r2 - param_r0_ik); \
|
||||
numtyp gsrainv2 = param_gamma_ik * rainv2; \
|
||||
numtyp gsrainv2 = param_gamma_ik * rainv2; \
|
||||
numtyp expgsrainv2 = ucl_exp(gsrainv2); \
|
||||
\
|
||||
numtyp rinv12 = ucl_recip(r1*r2); \
|
||||
numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12; \
|
||||
numtyp delcs = cs - param_costheta_ijk; \
|
||||
numtyp delcs = cs - param_costheta_ijk; \
|
||||
numtyp delcssq = delcs*delcs; \
|
||||
numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \
|
||||
numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \
|
||||
numtyp pcsinvsq = pcsinv*pcsinv; \
|
||||
numtyp pcs = delcssq/pcsinv; \
|
||||
\
|
||||
numtyp facexp = expgsrainv1*expgsrainv2; \
|
||||
\
|
||||
numtyp facrad = param_bigb_ijk * facexp*pcs; \
|
||||
numtyp facrad = param_bigb_ijk * facexp*pcs; \
|
||||
numtyp frad1 = facrad*gsrainvsq1; \
|
||||
numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \
|
||||
numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \
|
||||
numtyp facang12 = rinv12*facang; \
|
||||
numtyp csfacang = cs*facang; \
|
||||
numtyp csfac1 = rinvsq1*csfacang; \
|
||||
|
@ -353,6 +411,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
|
|||
const int nelements,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
|
@ -377,7 +436,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
|
|||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor_j, nbor_end;
|
||||
|
||||
const int* nbor_mem = dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
|
@ -387,9 +446,18 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
|
|||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
int nborj_start = nbor_j;
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
|
@ -406,18 +474,27 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
|
|||
|
||||
numtyp4 param4_ijparam; fetch4(param4_ijparam,ijparam,param4_tex);
|
||||
param_r0sq_ij=param4_ijparam.x;
|
||||
if (rsq1 > param_r0sq_ij) continue;
|
||||
if (rsq1 > param_r0sq_ij) continue; // still keep this for neigh no and tpa > 1
|
||||
param_gamma_ij=param4_ijparam.y;
|
||||
param_r0_ij=param4_ijparam.w;
|
||||
|
||||
int nbor_k=nbor_j-offset_j+offset_k;
|
||||
if (nbor_k<=nbor_j)
|
||||
nbor_k+=n_stride;
|
||||
int nbor_k,k_end;
|
||||
if (dev_packed==dev_nbor) {
|
||||
nbor_k=nborj_start-offset_j+offset_k;
|
||||
int numk = dev_short_nbor[nbor_k-n_stride];
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
} else {
|
||||
nbor_k = nbor_j-offset_j+offset_k;
|
||||
if (nbor_k<=nbor_j) nbor_k += n_stride;
|
||||
k_end = nbor_end;
|
||||
}
|
||||
|
||||
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (dev_packed==dev_nbor && k <= j) continue;
|
||||
|
||||
numtyp4 kx; fetch4(kx,k,pos_tex);
|
||||
int ktype=kx.w;
|
||||
ktype=map[ktype];
|
||||
|
@ -478,6 +555,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
|
|||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
|
@ -502,7 +580,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
|
|||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor_j, nbor_end, k_end;
|
||||
|
||||
const int* nbor_mem = dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
|
@ -512,8 +590,16 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
|
|||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
|
@ -529,7 +615,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
|
|||
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
||||
numtyp4 param4_ijparam; fetch4(param4_ijparam,ijparam,param4_tex);
|
||||
param_r0sq_ij = param4_ijparam.x;
|
||||
if (rsq1 > param_r0sq_ij) continue;
|
||||
if (rsq1 > param_r0sq_ij) continue; // still keep this for neigh no and tpa > 1
|
||||
|
||||
param_gamma_ij=param4_ijparam.y;
|
||||
param_r0_ij = param4_ijparam.w;
|
||||
|
@ -551,8 +637,15 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
|
|||
nbor_k+=offset_k;
|
||||
}
|
||||
|
||||
// recalculate numk and k_end for the use of short neighbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numk = dev_short_nbor[nbor_k];
|
||||
nbor_k += n_stride;
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
}
|
||||
|
||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (k == i) continue;
|
||||
|
@ -617,6 +710,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
const __global int * dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
|
@ -641,7 +735,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor_j, nbor_end, k_end;
|
||||
|
||||
const int* nbor_mem = dev_packed;
|
||||
int offset_j=offset/t_per_atom;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||
n_stride,nbor_end,nbor_j);
|
||||
|
@ -651,8 +745,16 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor_j];
|
||||
nbor_j += n_stride;
|
||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
|
||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||
int j=dev_packed[nbor_j];
|
||||
int j=nbor_mem[nbor_j];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
|
@ -668,7 +770,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
||||
numtyp4 param4_ijparam; fetch4(param4_ijparam,ijparam,param4_tex);
|
||||
param_r0sq_ij=param4_ijparam.x;
|
||||
if (rsq1 > param_r0sq_ij) continue;
|
||||
if (rsq1 > param_r0sq_ij) continue; // still keep this for neigh no and tpa > 1
|
||||
|
||||
param_gamma_ij=param4_ijparam.y;
|
||||
param_r0_ij=param4_ijparam.w;
|
||||
|
@ -690,8 +792,15 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||
nbor_k+=offset_k;
|
||||
}
|
||||
|
||||
// recalculate numk and k_end for the use of short neighbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numk = dev_short_nbor[nbor_k];
|
||||
nbor_k += n_stride;
|
||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||
}
|
||||
|
||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||
int k=dev_packed[nbor_k];
|
||||
int k=nbor_mem[nbor_k];
|
||||
k &= NEIGHMASK;
|
||||
|
||||
if (k == i) continue;
|
||||
|
|
|
@ -82,6 +82,7 @@ class Vashishta : public BaseThree<numtyp, acctyp> {
|
|||
UCL_D_Vec<int> elem2param;
|
||||
UCL_D_Vec<int> map;
|
||||
int _nparams,_nelements;
|
||||
numtyp _cutshortsq;
|
||||
|
||||
UCL_Texture param1_tex, param2_tex, param3_tex, param4_tex, param5_tex;
|
||||
|
||||
|
|
|
@ -6,6 +6,8 @@
|
|||
from __future__ import print_function
|
||||
import sys,os,re,subprocess
|
||||
|
||||
# help message
|
||||
|
||||
help = """
|
||||
Syntax from src dir: make lib-kim args="-b -v version -a kim-name"
|
||||
or: make lib-kim args="-b -a everything"
|
||||
|
@ -23,7 +25,7 @@ specify one or more options, order does not matter
|
|||
-b = download and build base KIM API library with example Models
|
||||
this will delete any previous installation in the current folder
|
||||
-n = do NOT download and build base KIM API library.
|
||||
Use an existing installation
|
||||
Use an existing installation
|
||||
-p = specify location of KIM API installation (implies -n)
|
||||
-a = add single KIM model or model driver with kim-name
|
||||
to existing KIM API lib (see example below).
|
||||
|
@ -78,13 +80,27 @@ def which(program):
|
|||
return None
|
||||
|
||||
def geturl(url,fname):
|
||||
success = False
|
||||
|
||||
if which('curl') != None:
|
||||
cmd = 'curl -L -o "%s" %s' % (fname,url)
|
||||
elif which('wget') != None:
|
||||
try:
|
||||
subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
|
||||
success = True
|
||||
except subprocess.CalledProcessError as e:
|
||||
print("Calling curl failed with: %s" % e.output.decode('UTF-8'))
|
||||
|
||||
if not success and which('wget') != None:
|
||||
cmd = 'wget -O "%s" %s' % (fname,url)
|
||||
else: error("cannot find 'wget' or 'curl' to download source code")
|
||||
txt = subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
|
||||
return txt
|
||||
try:
|
||||
subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
|
||||
success = True
|
||||
except subprocess.CalledProcessError as e:
|
||||
print("Calling wget failed with: %s" % e.output.decode('UTF-8'))
|
||||
|
||||
if not success:
|
||||
error("Failed to download source code with 'curl' or 'wget'")
|
||||
return
|
||||
|
||||
# parse args
|
||||
|
||||
|
|
|
@ -1,5 +1,46 @@
|
|||
# Change Log
|
||||
|
||||
## [2.03.13](https://github.com/kokkos/kokkos/tree/2.03.13) (2017-07-27)
|
||||
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.05...2.03.13)
|
||||
|
||||
**Implemented enhancements:**
|
||||
|
||||
- Disallow enabling both OpenMP and Threads in the same executable [\#406](https://github.com/kokkos/kokkos/issues/406)
|
||||
- Make Kokkos::OpenMP respect OMP environment even if hwloc is available [\#630](https://github.com/kokkos/kokkos/issues/630)
|
||||
- Improve Atomics Performance on KNL/Broadwell where PREFETCHW/RFO is Available [\#898](https://github.com/kokkos/kokkos/issues/898)
|
||||
- Kokkos::resize should test whether dimensions have changed before resizing [\#904](https://github.com/kokkos/kokkos/issues/904)
|
||||
- Develop performance-regression/acceptance tests [\#737](https://github.com/kokkos/kokkos/issues/737)
|
||||
- Make the deep\_copy Profiling hook a start/end system [\#890](https://github.com/kokkos/kokkos/issues/890)
|
||||
- Add deep\_copy Profiling hook [\#843](https://github.com/kokkos/kokkos/issues/843)
|
||||
- Append tag name to parallel construct name for Profiling [\#842](https://github.com/kokkos/kokkos/issues/842)
|
||||
- Add view label to `View bounds error` message for CUDA backend [\#870](https://github.com/kokkos/kokkos/issues/870)
|
||||
- Disable printing the loaded profiling library [\#824](https://github.com/kokkos/kokkos/issues/824)
|
||||
- "Declared but never referenced" warnings [\#853](https://github.com/kokkos/kokkos/issues/853)
|
||||
- Warnings about lock\_address\_cuda\_space [\#852](https://github.com/kokkos/kokkos/issues/852)
|
||||
- WorkGraph execution policy [\#771](https://github.com/kokkos/kokkos/issues/771)
|
||||
- Simplify makefiles by guarding compilation with appropriate KOKKOS\_ENABLE\_\#\#\# macros [\#716](https://github.com/kokkos/kokkos/issues/716)
|
||||
- Cmake build: wrong include install directory [\#668](https://github.com/kokkos/kokkos/issues/668)
|
||||
- Derived View type and allocation [\#566](https://github.com/kokkos/kokkos/issues/566)
|
||||
- Fix Compiler warnings when compiling core unit tests for Cuda [\#214](https://github.com/kokkos/kokkos/issues/214)
|
||||
|
||||
**Fixed bugs:**
|
||||
|
||||
- Out-of-bounds read in Kokkos\_Layout.hpp [\#975](https://github.com/kokkos/kokkos/issues/975)
|
||||
- CudaClang: Fix failing test with Clang 4.0 [\#941](https://github.com/kokkos/kokkos/issues/941)
|
||||
- Respawn when memory pool allocation fails \(not available memory\) [\#940](https://github.com/kokkos/kokkos/issues/940)
|
||||
- Memory pool aborts on zero allocation request, returns NULL for \< minimum [\#939](https://github.com/kokkos/kokkos/issues/939)
|
||||
- Error with TaskScheduler query of underlying memory pool [\#917](https://github.com/kokkos/kokkos/issues/917)
|
||||
- Profiling::\*Callee static variables declared in header [\#863](https://github.com/kokkos/kokkos/issues/863)
|
||||
- calling \*Space::name\(\) causes compile error [\#862](https://github.com/kokkos/kokkos/issues/862)
|
||||
- bug in Profiling::deallocateData [\#860](https://github.com/kokkos/kokkos/issues/860)
|
||||
- task\_depend test failing, CUDA 8.0 + Pascal + RDC [\#829](https://github.com/kokkos/kokkos/issues/829)
|
||||
- \[develop branch\] Standalone cmake issues [\#826](https://github.com/kokkos/kokkos/issues/826)
|
||||
- Kokkos CUDA fails to compile with OMPI\_CXX and MPICH\_CXX wrappers [\#776](https://github.com/kokkos/kokkos/issues/776)
|
||||
- Task Team reduction on Pascal [\#767](https://github.com/kokkos/kokkos/issues/767)
|
||||
- CUDA stack overflow with TaskDAG test [\#758](https://github.com/kokkos/kokkos/issues/758)
|
||||
- TeamVector test on Cuda [\#670](https://github.com/kokkos/kokkos/issues/670)
|
||||
- Clang 4.0 Cuda Build broken again [\#560](https://github.com/kokkos/kokkos/issues/560)
|
||||
|
||||
|
||||
## [2.03.05](https://github.com/kokkos/kokkos/tree/2.03.05) (2017-05-27)
|
||||
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.00...2.03.05)
|
||||
|
|
|
@ -33,6 +33,7 @@ KOKKOS_INTERNAL_USE_LIBRT := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "lib
|
|||
KOKKOS_INTERNAL_USE_MEMKIND := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "experimental_memkind" | wc -l))
|
||||
|
||||
# Check for advanced settings.
|
||||
KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "compiler_warnings" | wc -l))
|
||||
KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "aggressive_vectorization" | wc -l))
|
||||
KOKKOS_INTERNAL_DISABLE_PROFILING := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "disable_profiling" | wc -l))
|
||||
KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "disable_dualview_modify_check" | wc -l))
|
||||
|
@ -78,14 +79,14 @@ KOKKOS_INTERNAL_COMPILER_PGI := $(strip $(shell $(CXX) --version 2
|
|||
KOKKOS_INTERNAL_COMPILER_XL := $(strip $(shell $(CXX) -qversion 2>&1 | grep XL | wc -l))
|
||||
KOKKOS_INTERNAL_COMPILER_CRAY := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep "CC-" | wc -l))
|
||||
KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(CXX) --version 2>&1 | grep nvcc | wc -l))
|
||||
KOKKOS_INTERNAL_COMPILER_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep clang | wc -l))
|
||||
KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep "apple-darwin" | wc -l))
|
||||
ifneq ($(OMPI_CXX),)
|
||||
KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(OMPI_CXX) --version 2>&1 | grep "nvcc" | wc -l))
|
||||
KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(OMPI_CXX) --version 2>&1 | grep nvcc | wc -l))
|
||||
endif
|
||||
ifneq ($(MPICH_CXX),)
|
||||
KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(MPICH_CXX) --version 2>&1 | grep "nvcc" | wc -l))
|
||||
KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(MPICH_CXX) --version 2>&1 | grep nvcc | wc -l))
|
||||
endif
|
||||
KOKKOS_INTERNAL_COMPILER_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep clang | wc -l))
|
||||
KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep "apple-darwin" | wc -l))
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 2)
|
||||
KOKKOS_INTERNAL_COMPILER_CLANG = 1
|
||||
|
@ -111,6 +112,36 @@ ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
|
|||
endif
|
||||
endif
|
||||
|
||||
# Set compiler warnings flags.
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS), 1)
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
||||
# TODO check if PGI accepts GNU style warnings
|
||||
KOKKOS_INTERNAL_COMPILER_WARNINGS =
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
|
||||
KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_APPLE_CLANG), 1)
|
||||
KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
|
||||
KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
|
||||
# TODO check if cray accepts GNU style warnings
|
||||
KOKKOS_INTERNAL_COMPILER_WARNINGS =
|
||||
else
|
||||
#gcc
|
||||
KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
else
|
||||
KOKKOS_INTERNAL_COMPILER_WARNINGS =
|
||||
endif
|
||||
|
||||
# Set OpenMP flags.
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
||||
KOKKOS_INTERNAL_OPENMP_FLAG := -mp
|
||||
|
@ -162,6 +193,7 @@ endif
|
|||
|
||||
# Intel based.
|
||||
KOKKOS_INTERNAL_USE_ARCH_KNC := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNC | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_WSM := $(strip $(shell echo $(KOKKOS_ARCH) | grep WSM | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_SNB := $(strip $(shell echo $(KOKKOS_ARCH) | grep SNB | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_HSW := $(strip $(shell echo $(KOKKOS_ARCH) | grep HSW | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_BDW := $(strip $(shell echo $(KOKKOS_ARCH) | grep BDW | wc -l))
|
||||
|
@ -229,13 +261,14 @@ KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_
|
|||
KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(strip $(shell echo $(KOKKOS_ARCH) | grep AMDAVX | wc -l))
|
||||
|
||||
# Any AVX?
|
||||
KOKKOS_INTERNAL_USE_ARCH_SSE42 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_WSM) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
|
||||
|
||||
# Decide what ISA level we are able to support.
|
||||
KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_WSM)+$(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ISA_KNC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNC) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc ))
|
||||
|
||||
|
@ -243,7 +276,7 @@ KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_
|
|||
KOKKOS_INTERNAL_USE_TM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
|
||||
|
||||
# Incompatible flags?
|
||||
KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1" | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1" | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1" | bc))
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1)
|
||||
|
@ -257,12 +290,10 @@ endif
|
|||
|
||||
KOKKOS_CPPFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src
|
||||
|
||||
# No warnings:
|
||||
KOKKOS_CXXFLAGS =
|
||||
# INTEL and CLANG warnings:
|
||||
#KOKKOS_CXXFLAGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
|
||||
# GCC warnings:
|
||||
#KOKKOS_CXXFLAGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized -Wignored-qualifiers -Wempty-body -Wclobbered
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS), 1)
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_WARNINGS)
|
||||
endif
|
||||
|
||||
KOKKOS_LIBS = -lkokkos -ldl
|
||||
KOKKOS_LDFLAGS = -L$(shell pwd)
|
||||
|
@ -486,6 +517,28 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1)
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SSE42), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_SSE42 1" >> KokkosCore_config.tmp )
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
|
||||
KOKKOS_CXXFLAGS += -xSSE4.2
|
||||
KOKKOS_LDFLAGS += -xSSE4.2
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
|
||||
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
||||
KOKKOS_CXXFLAGS += -tp=nehalem
|
||||
KOKKOS_LDFLAGS += -tp=nehalem
|
||||
else
|
||||
# Assume that this really is a GNU compiler.
|
||||
KOKKOS_CXXFLAGS += -msse4.2
|
||||
KOKKOS_LDFLAGS += -msse4.2
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_AVX 1" >> KokkosCore_config.tmp )
|
||||
|
||||
|
@ -689,7 +742,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
|||
endif
|
||||
endif
|
||||
|
||||
KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h)
|
||||
KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h 2>&1)
|
||||
ifeq ($(KOKKOS_INTERNAL_LS_CONFIG), KokkosCore_config.h)
|
||||
KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep define | wc -l))
|
||||
else
|
||||
|
|
|
@ -20,8 +20,10 @@ Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Ta
|
|||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
|
||||
Kokkos_HostThreadTeam.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp
|
||||
Kokkos_spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
|
||||
Kokkos_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp
|
||||
Kokkos_Rendezvous.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Rendezvous.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Rendezvous.cpp
|
||||
Kokkos_Profiling_Interface.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp
|
||||
Kokkos_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp
|
||||
|
@ -36,6 +38,8 @@ Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cu
|
|||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
|
||||
Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
|
||||
Kokkos_Cuda_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
|
||||
|
|
|
@ -61,14 +61,19 @@ protected:
|
|||
{
|
||||
std::cout << std::setprecision(5) << std::scientific;
|
||||
|
||||
unsigned threads_count = omp_get_max_threads();
|
||||
int threads_count = 0;
|
||||
#pragma omp parallel
|
||||
{
|
||||
#pragma omp atomic
|
||||
++threads_count;
|
||||
}
|
||||
|
||||
if ( Kokkos::hwloc::available() ) {
|
||||
threads_count = Kokkos::hwloc::get_available_numa_count() *
|
||||
Kokkos::hwloc::get_available_cores_per_numa();
|
||||
if (threads_count > 3) {
|
||||
threads_count /= 2;
|
||||
}
|
||||
|
||||
Kokkos::OpenMP::initialize( threads_count );
|
||||
Kokkos::OpenMP::print_configuration( std::cout );
|
||||
}
|
||||
|
||||
static void TearDownTestCase()
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
|
@ -35,7 +35,7 @@
|
|||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
|
||||
|
@ -283,12 +283,12 @@ struct test_random_scalar {
|
|||
RandomGenerator& pool,
|
||||
unsigned int num_draws)
|
||||
{
|
||||
using std::cerr;
|
||||
using std::cout;
|
||||
using std::endl;
|
||||
using Kokkos::parallel_reduce;
|
||||
|
||||
{
|
||||
cerr << " -- Testing randomness properties" << endl;
|
||||
cout << " -- Testing randomness properties" << endl;
|
||||
|
||||
RandomProperties result;
|
||||
typedef test_random_functor<RandomGenerator, Scalar> functor_type;
|
||||
|
@ -307,7 +307,7 @@ struct test_random_scalar {
|
|||
( 1.5*tolerance > variance_eps)) ? 1:0;
|
||||
pass_covar = ((-2.0*tolerance < covariance_eps) &&
|
||||
( 2.0*tolerance > covariance_eps)) ? 1:0;
|
||||
cerr << "Pass: " << pass_mean
|
||||
cout << "Pass: " << pass_mean
|
||||
<< " " << pass_var
|
||||
<< " " << mean_eps
|
||||
<< " " << variance_eps
|
||||
|
@ -315,7 +315,7 @@ struct test_random_scalar {
|
|||
<< " || " << tolerance << endl;
|
||||
}
|
||||
{
|
||||
cerr << " -- Testing 1-D histogram" << endl;
|
||||
cout << " -- Testing 1-D histogram" << endl;
|
||||
|
||||
RandomProperties result;
|
||||
typedef test_histogram1d_functor<typename RandomGenerator::device_type> functor_type;
|
||||
|
@ -335,7 +335,7 @@ struct test_random_scalar {
|
|||
pass_hist1d_covar = ((-0.06 < covariance_eps) &&
|
||||
( 0.06 > covariance_eps)) ? 1:0;
|
||||
|
||||
cerr << "Density 1D: " << mean_eps
|
||||
cout << "Density 1D: " << mean_eps
|
||||
<< " " << variance_eps
|
||||
<< " " << (result.covariance/HIST_DIM1D/HIST_DIM1D)
|
||||
<< " || " << tolerance
|
||||
|
@ -348,7 +348,7 @@ struct test_random_scalar {
|
|||
<< endl;
|
||||
}
|
||||
{
|
||||
cerr << " -- Testing 3-D histogram" << endl;
|
||||
cout << " -- Testing 3-D histogram" << endl;
|
||||
|
||||
RandomProperties result;
|
||||
typedef test_histogram3d_functor<typename RandomGenerator::device_type> functor_type;
|
||||
|
@ -368,7 +368,7 @@ struct test_random_scalar {
|
|||
pass_hist3d_covar = ((-tolerance < covariance_eps) &&
|
||||
( tolerance > covariance_eps)) ? 1:0;
|
||||
|
||||
cerr << "Density 3D: " << mean_eps
|
||||
cout << "Density 3D: " << mean_eps
|
||||
<< " " << variance_eps
|
||||
<< " " << result.covariance/HIST_DIM1D/HIST_DIM1D
|
||||
<< " || " << tolerance
|
||||
|
@ -381,18 +381,18 @@ struct test_random_scalar {
|
|||
template <class RandomGenerator>
|
||||
void test_random(unsigned int num_draws)
|
||||
{
|
||||
using std::cerr;
|
||||
using std::cout;
|
||||
using std::endl;
|
||||
typename test_random_functor<RandomGenerator,int>::type_1d density_1d("D1d");
|
||||
typename test_random_functor<RandomGenerator,int>::type_3d density_3d("D3d");
|
||||
|
||||
|
||||
uint64_t ticks = std::chrono::high_resolution_clock::now().time_since_epoch().count();
|
||||
cerr << "Test Seed:" << ticks << endl;
|
||||
cout << "Test Seed:" << ticks << endl;
|
||||
|
||||
RandomGenerator pool(ticks);
|
||||
|
||||
cerr << "Test Scalar=int" << endl;
|
||||
cout << "Test Scalar=int" << endl;
|
||||
test_random_scalar<RandomGenerator,int> test_int(density_1d,density_3d,pool,num_draws);
|
||||
ASSERT_EQ( test_int.pass_mean,1);
|
||||
ASSERT_EQ( test_int.pass_var,1);
|
||||
|
@ -406,7 +406,7 @@ void test_random(unsigned int num_draws)
|
|||
deep_copy(density_1d,0);
|
||||
deep_copy(density_3d,0);
|
||||
|
||||
cerr << "Test Scalar=unsigned int" << endl;
|
||||
cout << "Test Scalar=unsigned int" << endl;
|
||||
test_random_scalar<RandomGenerator,unsigned int> test_uint(density_1d,density_3d,pool,num_draws);
|
||||
ASSERT_EQ( test_uint.pass_mean,1);
|
||||
ASSERT_EQ( test_uint.pass_var,1);
|
||||
|
@ -420,7 +420,7 @@ void test_random(unsigned int num_draws)
|
|||
deep_copy(density_1d,0);
|
||||
deep_copy(density_3d,0);
|
||||
|
||||
cerr << "Test Scalar=int64_t" << endl;
|
||||
cout << "Test Scalar=int64_t" << endl;
|
||||
test_random_scalar<RandomGenerator,int64_t> test_int64(density_1d,density_3d,pool,num_draws);
|
||||
ASSERT_EQ( test_int64.pass_mean,1);
|
||||
ASSERT_EQ( test_int64.pass_var,1);
|
||||
|
@ -434,7 +434,7 @@ void test_random(unsigned int num_draws)
|
|||
deep_copy(density_1d,0);
|
||||
deep_copy(density_3d,0);
|
||||
|
||||
cerr << "Test Scalar=uint64_t" << endl;
|
||||
cout << "Test Scalar=uint64_t" << endl;
|
||||
test_random_scalar<RandomGenerator,uint64_t> test_uint64(density_1d,density_3d,pool,num_draws);
|
||||
ASSERT_EQ( test_uint64.pass_mean,1);
|
||||
ASSERT_EQ( test_uint64.pass_var,1);
|
||||
|
@ -448,7 +448,7 @@ void test_random(unsigned int num_draws)
|
|||
deep_copy(density_1d,0);
|
||||
deep_copy(density_3d,0);
|
||||
|
||||
cerr << "Test Scalar=float" << endl;
|
||||
cout << "Test Scalar=float" << endl;
|
||||
test_random_scalar<RandomGenerator,float> test_float(density_1d,density_3d,pool,num_draws);
|
||||
ASSERT_EQ( test_float.pass_mean,1);
|
||||
ASSERT_EQ( test_float.pass_var,1);
|
||||
|
@ -462,7 +462,7 @@ void test_random(unsigned int num_draws)
|
|||
deep_copy(density_1d,0);
|
||||
deep_copy(density_3d,0);
|
||||
|
||||
cerr << "Test Scalar=double" << endl;
|
||||
cout << "Test Scalar=double" << endl;
|
||||
test_random_scalar<RandomGenerator,double> test_double(density_1d,density_3d,pool,num_draws);
|
||||
ASSERT_EQ( test_double.pass_mean,1);
|
||||
ASSERT_EQ( test_double.pass_var,1);
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
|
@ -36,7 +36,7 @@
|
|||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
|
|
@ -44,12 +44,13 @@
|
|||
#include<Kokkos_Core.hpp>
|
||||
#include<impl/Kokkos_Timer.hpp>
|
||||
#include<bench.hpp>
|
||||
#include<cstdlib>
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
Kokkos::initialize();
|
||||
|
||||
|
||||
if(argc<10) {
|
||||
|
||||
if(argc<10) {
|
||||
printf("Arguments: N K R D U F T S\n");
|
||||
printf(" P: Precision (1==float, 2==double)\n");
|
||||
printf(" N,K: dimensions of the 2D array to allocate\n");
|
||||
|
@ -68,7 +69,7 @@ int main(int argc, char* argv[]) {
|
|||
Kokkos::finalize();
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
int P = atoi(argv[1]);
|
||||
int N = atoi(argv[2]);
|
||||
|
@ -80,7 +81,7 @@ int main(int argc, char* argv[]) {
|
|||
int T = atoi(argv[8]);
|
||||
int S = atoi(argv[9]);
|
||||
|
||||
if(U>8) {printf("U must be 1-8\n"); return 0;}
|
||||
if(U>8) {printf("U must be 1-8\n"); return 0;}
|
||||
if( (D!=1) && (D!=2) && (D!=4) && (D!=8) && (D!=16) && (D!=32)) {printf("D must be one of 1,2,4,8,16,32\n"); return 0;}
|
||||
if( (P!=1) && (P!=2) ) {printf("P must be one of 1,2\n"); return 0;}
|
||||
|
||||
|
|
|
@ -44,11 +44,11 @@
|
|||
#include<Kokkos_Core.hpp>
|
||||
#include<impl/Kokkos_Timer.hpp>
|
||||
#include<gather.hpp>
|
||||
#include<cstdlib>
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
Kokkos::initialize(argc,argv);
|
||||
|
||||
|
||||
if(argc<8) {
|
||||
printf("Arguments: S N K D\n");
|
||||
printf(" S: Scalar Type Size (1==float, 2==double, 4=complex<double>)\n");
|
||||
|
|
|
@ -0,0 +1,44 @@
|
|||
# Build script for the Kokkos policy_performance benchmark.
#
# Every *.cpp file in this directory is compiled and linked into one
# executable. The executable name and compiler depend on whether the
# KOKKOS_DEVICES value supplied by the caller contains "Cuda".

KOKKOS_PATH = ../..
SRC = $(wildcard *.cpp)

# First rule in the file, hence the default goal.
default: build
	echo "Start Build"

# These targets name actions, not files. Declaring them phony keeps a stray
# file called "default", "build" or "clean" from making the target look up
# to date. (A target whose name starts with '.' is never picked as the
# default goal, so this declaration does not change what plain `make` builds.)
.PHONY: default build clean

ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
# Cuda requested: compile and link through the nvcc wrapper shipped with Kokkos.
CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
CXXFLAGS = -O3 -g
LINK = ${CXX}
LINKFLAGS =
EXE = policy_performance.cuda
KOKKOS_DEVICES = "Cuda,OpenMP"
KOKKOS_ARCH = "SNB,Kepler35"
KOKKOS_CUDA_OPTIONS+=enable_lambda
else
# Host-only build with g++; warnings are treated as errors.
CXX = g++
CXXFLAGS = -O3 -g -Wall -Werror
LINK = ${CXX}
LINKFLAGS =
EXE = policy_performance.host
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "SNB"
endif

DEPFLAGS = -M

OBJ = $(SRC:.cpp=.o)
LIB =

# The KOKKOS_* variables and the kokkos-clean target used below are not
# defined in this file; presumably they come from this include.
include $(KOKKOS_PATH)/Makefile.kokkos

build: $(EXE)

$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)

clean: kokkos-clean
	rm -f *.o *.cuda *.host

# Compilation rules

# Each object is rebuilt when its own source, the Kokkos dependencies,
# main.cpp or policy_perf_test.hpp changes.
%.o:%.cpp $(KOKKOS_CPP_DEPENDS) main.cpp policy_perf_test.hpp
	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
|
|
@ -0,0 +1,170 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include "policy_perf_test.hpp"
|
||||
|
||||
int main(int argc, char* argv[] ) {
|
||||
Kokkos::initialize(argc,argv);
|
||||
|
||||
if(argc<10) {
|
||||
printf(" Ten arguments are needed to run this program:\n");
|
||||
printf(" (1)team_range, (2)thread_range, (3)vector_range, (4)outer_repeat, (5)thread_repeat, (6)vector_repeat, (7)team_size, (8)vector_size, (9)schedule, (10)test_type\n");
|
||||
printf(" team_range: number of teams (league_size)\n");
|
||||
printf(" thread_range: range for nested TeamThreadRange parallel_*\n");
|
||||
printf(" vector_range: range for nested ThreadVectorRange parallel_*\n");
|
||||
printf(" outer_repeat: number of repeats for outer parallel_* call\n");
|
||||
printf(" thread_repeat: number of repeats for TeamThreadRange parallel_* call\n");
|
||||
printf(" vector_repeat: number of repeats for ThreadVectorRange parallel_* call\n");
|
||||
printf(" team_size: number of team members (team_size)\n");
|
||||
printf(" vector_size: desired vectorization (if possible)\n");
|
||||
printf(" schedule: 1 == Static 2 == Dynamic\n");
|
||||
printf(" test_type: 3-digit code XYZ for testing (nested) parallel_*\n");
|
||||
printf(" code key: XYZ X in {1,2,3,4,5}, Y in {0,1,2}, Z in {0,1,2}\n");
|
||||
printf(" TeamPolicy:\n");
|
||||
printf(" X: 0 = none (never used, makes no sense); 1 = parallel_for; 2 = parallel_reduce\n");
|
||||
printf(" Y: 0 = none; 1 = parallel_for; 2 = parallel_reduce\n");
|
||||
printf(" Z: 0 = none; 1 = parallel_for; 2 = parallel_reduce\n");
|
||||
printf(" RangePolicy:\n");
|
||||
printf(" X: 3 = parallel_for; 4 = parallel_reduce; 5 = parallel_scan\n");
|
||||
printf(" Y: 0 = none\n");
|
||||
printf(" Z: 0 = none\n");
|
||||
printf(" Example Input:\n");
|
||||
printf(" 100000 32 32 100 100 100 8 1 1 100\n");
|
||||
Kokkos::finalize();
|
||||
return 0;
|
||||
}
|
||||
|
||||
int team_range = atoi(argv[1]);
|
||||
int thread_range = atoi(argv[2]);
|
||||
int vector_range = atoi(argv[3]);
|
||||
|
||||
int outer_repeat = atoi(argv[4]);
|
||||
int thread_repeat = atoi(argv[5]);
|
||||
int vector_repeat = atoi(argv[6]);
|
||||
|
||||
int team_size = atoi(argv[7]);
|
||||
int vector_size = atoi(argv[8]);
|
||||
int schedule = atoi(argv[9]);
|
||||
int test_type = atoi(argv[10]);
|
||||
|
||||
int disable_verbose_output = 0;
|
||||
if ( argc > 11 ) {
|
||||
disable_verbose_output = atoi(argv[11]);
|
||||
}
|
||||
|
||||
if ( schedule != 1 && schedule != 2 ) {
|
||||
printf("schedule: %d\n", schedule);
|
||||
printf("Options for schedule are: 1 == Static 2 == Dynamic\n");
|
||||
Kokkos::finalize();
|
||||
return -1;
|
||||
}
|
||||
|
||||
if ( test_type != 100 && test_type != 110 && test_type != 111 && test_type != 112 && test_type != 120 && test_type != 121 && test_type != 122
|
||||
&& test_type != 200 && test_type != 210 && test_type != 211 && test_type != 212 && test_type != 220 && test_type != 221 && test_type != 222
|
||||
&& test_type != 300 && test_type != 400 && test_type != 500
|
||||
)
|
||||
{
|
||||
printf("Incorrect test_type option\n");
|
||||
Kokkos::finalize();
|
||||
return -2;
|
||||
}
|
||||
|
||||
double result = 0.0;
|
||||
|
||||
Kokkos::parallel_reduce( "parallel_reduce warmup", Kokkos::TeamPolicy<>(10,1),
|
||||
KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type team, double& lval) {
|
||||
lval += 1;
|
||||
}, result);
|
||||
|
||||
typedef Kokkos::View<double*, Kokkos::LayoutRight> view_type_1d;
|
||||
typedef Kokkos::View<double**, Kokkos::LayoutRight> view_type_2d;
|
||||
typedef Kokkos::View<double***, Kokkos::LayoutRight> view_type_3d;
|
||||
|
||||
// Allocate view without initializing
|
||||
// Call a 'warmup' test with 1 repeat - this will initialize the corresponding view appropriately for test and should obey first-touch etc
|
||||
// Second call to test is the one we actually care about and time
|
||||
view_type_1d v_1( Kokkos::ViewAllocateWithoutInitializing("v_1"), team_range*team_size);
|
||||
view_type_2d v_2( Kokkos::ViewAllocateWithoutInitializing("v_2"), team_range*team_size, thread_range);
|
||||
view_type_3d v_3( Kokkos::ViewAllocateWithoutInitializing("v_3"), team_range*team_size, thread_range, vector_range);
|
||||
|
||||
double result_computed = 0.0;
|
||||
double result_expect = 0.0;
|
||||
double time = 0.0;
|
||||
|
||||
if(schedule==1) {
|
||||
if ( test_type != 500 ) {
|
||||
// warmup - no repeat of loops
|
||||
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
|
||||
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
|
||||
}
|
||||
else {
|
||||
// parallel_scan: initialize 1d view for parallel_scan
|
||||
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,100,v_1,v_2,v_3,result_computed,result_expect,time);
|
||||
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
|
||||
}
|
||||
}
|
||||
if(schedule==2) {
|
||||
if ( test_type != 500 ) {
|
||||
// warmup - no repeat of loops
|
||||
test_policy<Kokkos::Schedule<Kokkos::Dynamic>,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
|
||||
test_policy<Kokkos::Schedule<Kokkos::Dynamic>,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
|
||||
}
|
||||
else {
|
||||
// parallel_scan: initialize 1d view for parallel_scan
|
||||
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,100,v_1,v_2,v_3,result_computed,result_expect,time);
|
||||
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
|
||||
}
|
||||
}
|
||||
|
||||
if ( disable_verbose_output == 0 ) {
|
||||
printf("%7i %4i %2i %9i %4i %4i %4i %2i %1i %3i %e %e %lf\n",team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,schedule,test_type,result_computed,result_expect,time);
|
||||
}
|
||||
else {
|
||||
printf("%lf\n",time);
|
||||
}
|
||||
|
||||
Kokkos::finalize();
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,354 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
template < class ViewType >
|
||||
struct ParallelScanFunctor {
|
||||
using value_type = double;
|
||||
ViewType v;
|
||||
|
||||
ParallelScanFunctor( const ViewType & v_ )
|
||||
: v(v_)
|
||||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()( const int idx, value_type& val, const bool& final ) const
|
||||
{
|
||||
// inclusive scan
|
||||
val += v(idx);
|
||||
if ( final ) {
|
||||
v(idx) = val;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<class ScheduleType,class IndexType,class ViewType1, class ViewType2, class ViewType3>
|
||||
void test_policy(int team_range, int thread_range, int vector_range,
|
||||
int outer_repeat, int thread_repeat, int inner_repeat,
|
||||
int team_size, int vector_size, int test_type,
|
||||
ViewType1 &v1, ViewType2 &v2, ViewType3 &v3,
|
||||
double &result, double &result_expect, double &time) {
|
||||
|
||||
typedef Kokkos::TeamPolicy<ScheduleType,IndexType> t_policy;
|
||||
typedef typename t_policy::member_type t_team;
|
||||
Kokkos::Timer timer;
|
||||
|
||||
for(int orep = 0; orep<outer_repeat; orep++) {
|
||||
|
||||
if (test_type == 100) {
|
||||
Kokkos::parallel_for("100 outer for", t_policy(team_range,team_size),
|
||||
KOKKOS_LAMBDA (const t_team& team) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
v1(idx) = idx;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
}
|
||||
|
||||
if (test_type == 110) {
|
||||
Kokkos::parallel_for("110 outer for", t_policy(team_range,team_size),
|
||||
KOKKOS_LAMBDA (const t_team& team) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
for (int tr = 0; tr<thread_repeat; ++tr) {
|
||||
// Each team launches a parallel_for; thread_range is partitioned among team members
|
||||
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
|
||||
v2( idx, t ) = t;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
if (test_type == 111) {
|
||||
Kokkos::parallel_for("111 outer for", t_policy(team_range,team_size,vector_size),
|
||||
KOKKOS_LAMBDA (const t_team& team) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
for (int tr = 0; tr<thread_repeat; ++tr) {
|
||||
// Each team launches a parallel_for; thread_range is partitioned among team members
|
||||
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
|
||||
for (int vr = 0; vr<inner_repeat; ++vr)
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,vector_range), [&] (const int vi) {
|
||||
v3( idx, t, vi ) = vi;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
if (test_type == 112) {
|
||||
Kokkos::parallel_for("112 outer for", t_policy(team_range,team_size,vector_size),
|
||||
KOKKOS_LAMBDA (const t_team& team) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
for (int tr = 0; tr<thread_repeat; ++tr) {
|
||||
// Each team launches a parallel_for; thread_range is partitioned among team members
|
||||
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
|
||||
double vector_result = 0.0;
|
||||
for (int vr = 0; vr<inner_repeat; ++vr) {
|
||||
vector_result = 0.0;
|
||||
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,vector_range), [&] (const int vi, double &vval) {
|
||||
vval += 1;
|
||||
}, vector_result);
|
||||
}
|
||||
v2( idx, t ) = vector_result;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
if (test_type == 120) {
|
||||
Kokkos::parallel_for("120 outer for", t_policy(team_range,team_size),
|
||||
KOKKOS_LAMBDA (const t_team& team) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
double team_result = 0.0;
|
||||
for (int tr = 0; tr<thread_repeat; ++tr) {
|
||||
team_result = 0.0;
|
||||
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double &lval) {
|
||||
lval += 1;
|
||||
}, team_result);
|
||||
}
|
||||
v1(idx) = team_result;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
}
|
||||
if (test_type == 121) {
|
||||
Kokkos::parallel_for("121 outer for", t_policy(team_range,team_size,vector_size),
|
||||
KOKKOS_LAMBDA (const t_team& team) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
double team_result = 0.0;
|
||||
for (int tr = 0; tr<thread_repeat; ++tr) {
|
||||
team_result = 0.0;
|
||||
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double &lval) {
|
||||
lval += 1;
|
||||
for (int vr = 0; vr<inner_repeat; ++vr) {
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,vector_range), [&] (const int vi) {
|
||||
v3( idx, t, vi ) = vi;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
}
|
||||
}, team_result);
|
||||
}
|
||||
v3( idx, 0, 0 ) = team_result;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
}
|
||||
if (test_type == 122) {
|
||||
Kokkos::parallel_for("122 outer for", t_policy(team_range,team_size,vector_size),
|
||||
KOKKOS_LAMBDA (const t_team& team) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
double team_result = 0.0;
|
||||
for (int tr = 0; tr<thread_repeat; ++tr) {
|
||||
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double &lval) {
|
||||
double vector_result = 0.0;
|
||||
for (int vr = 0; vr<inner_repeat; ++vr)
|
||||
vector_result = 0.0;
|
||||
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,vector_range), [&] (const int vi, double &vval) {
|
||||
vval += 1;
|
||||
}, vector_result);
|
||||
lval += vector_result;
|
||||
}, team_result);
|
||||
}
|
||||
v1(idx) = team_result;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
}
|
||||
if (test_type == 200) {
|
||||
Kokkos::parallel_reduce("200 outer reduce", t_policy(team_range,team_size),
|
||||
KOKKOS_LAMBDA (const t_team& team, double& lval) {
|
||||
lval+=team.team_size()*team.league_rank() + team.team_rank();
|
||||
},result);
|
||||
result_expect = 0.5* (team_range*team_size)*(team_range*team_size-1);
|
||||
// sum ( seq( [0, team_range*team_size) )
|
||||
}
|
||||
if (test_type == 210) {
|
||||
Kokkos::parallel_reduce("210 outer reduce", t_policy(team_range,team_size),
|
||||
KOKKOS_LAMBDA (const t_team& team, double& lval) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
double thread_for = 1.0;
|
||||
for(int tr = 0; tr<thread_repeat; tr++) {
|
||||
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
|
||||
v2(idx,t) = t;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
}
|
||||
lval+=(team.team_size()*team.league_rank() + team.team_rank() + thread_for);
|
||||
},result);
|
||||
result_expect = 0.5* (team_range*team_size)*(team_range*team_size-1) + (team_range*team_size);
|
||||
// sum ( seq( [0, team_range*team_size) + 1 per team_member (total of team_range*team_size) )
|
||||
}
|
||||
if (test_type == 211) {
|
||||
Kokkos::parallel_reduce("211 outer reduce", t_policy(team_range,team_size,vector_size),
|
||||
KOKKOS_LAMBDA (const t_team& team, double& lval) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
double thread_for = 1.0;
|
||||
for(int tr = 0; tr<thread_repeat; tr++) {
|
||||
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
|
||||
for (int vr = 0; vr<inner_repeat; ++vr)
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_range), [&] (const int vi) {
|
||||
v3(idx, t, vi) = vi;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
});
|
||||
}
|
||||
lval+=idx+thread_for;
|
||||
},result);
|
||||
result_expect = 0.5*(team_range*team_size)*(team_range*team_size-1) + (team_range*team_size);
|
||||
// sum ( seq( [0, team_range*team_size) + 1 per team_member (total of team_range*team_size) )
|
||||
}
|
||||
if (test_type == 212) {
|
||||
Kokkos::parallel_reduce("212 outer reduce", t_policy(team_range,team_size,vector_size),
|
||||
KOKKOS_LAMBDA (const t_team& team, double& lval) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
double vector_result = 0.0;
|
||||
for(int tr = 0; tr<thread_repeat; tr++) {
|
||||
// This parallel_for is executed by each team; the thread_range is partitioned among the team members
|
||||
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
|
||||
v2(idx,t) = t;
|
||||
// prevent compiler optimizing loop away
|
||||
for (int vr = 0; vr<inner_repeat; ++vr) {
|
||||
vector_result = 0.0;
|
||||
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, vector_range), [&] (const int vi, double &vval) {
|
||||
vval += vi;
|
||||
}, vector_result );
|
||||
}
|
||||
});
|
||||
}
|
||||
lval+= idx + vector_result;
|
||||
},result);
|
||||
result_expect = 0.5*(team_range*team_size)*(team_range*team_size-1) + (0.5*vector_range*(vector_range-1)*team_range*team_size);
|
||||
// sum ( seq( [0, team_range*team_size) + sum( seq( [0, vector_range) ) per team_member (total of team_range*team_size) )
|
||||
}
|
||||
if (test_type == 220) {
|
||||
Kokkos::parallel_reduce("220 outer reduce", t_policy(team_range,team_size),
|
||||
KOKKOS_LAMBDA (const t_team& team, double& lval) {
|
||||
double team_result = 0.0;
|
||||
for(int tr = 0; tr<thread_repeat; tr++) {
|
||||
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double& tval) {
|
||||
tval += t;
|
||||
},team_result);
|
||||
}
|
||||
lval+=team_result*team.league_rank(); // constant * league_rank
|
||||
},result);
|
||||
result_expect = 0.5*(team_range)*(team_range-1) * team_size * 0.5*(thread_range)*(thread_range-1);
|
||||
// sum ( seq( [0, team_range) * constant ); constant = sum( seq( [0, thread_range) )*team_size (1 per member, result for each team)
|
||||
}
|
||||
if (test_type == 221) {
|
||||
Kokkos::parallel_reduce("221 outer reduce", t_policy(team_range,team_size,vector_size),
|
||||
KOKKOS_LAMBDA (const t_team& team, double& lval) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
double team_result = 0;
|
||||
for(int tr = 0; tr<thread_repeat; tr++) {
|
||||
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double& tval) {
|
||||
double vector_for = 1.0;
|
||||
for (int vr = 0; vr<inner_repeat; ++vr) {
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_range), [&] (const int vi) {
|
||||
v3(idx, t, vi) = vi;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
}
|
||||
tval += t + vector_for;
|
||||
},team_result);
|
||||
}
|
||||
lval+=team_result*team.league_rank();
|
||||
},result);
|
||||
result_expect = 0.5* (team_range)*(team_range-1) * team_size * (0.5*(thread_range) * (thread_range-1) + thread_range);
|
||||
// sum ( seq( [0, team_range) * constant ) + 1 per member per team; constant = sum( seq( [0, thread_range) )*team_size (1 per member, result for each team)
|
||||
}
|
||||
if (test_type == 222) {
|
||||
Kokkos::parallel_reduce("222 outer reduce", t_policy(team_range,team_size,vector_size),
|
||||
KOKKOS_LAMBDA (const t_team& team, double& lval) {
|
||||
double team_result = 0.0;
|
||||
for(int tr = 0; tr<thread_repeat; tr++) {
|
||||
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double& tval) {
|
||||
double vector_result = 0.0;
|
||||
for (int vr = 0; vr<inner_repeat; ++vr) {
|
||||
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, vector_range), [&] (const int vi, double& vval) {
|
||||
vval += vi;
|
||||
}, vector_result);
|
||||
}
|
||||
tval += t + vector_result;
|
||||
},team_result);
|
||||
}
|
||||
lval+=team_result*team.league_rank();
|
||||
},result);
|
||||
result_expect = 0.5* (team_range)*(team_range-1) * team_size * (0.5*(thread_range) * (thread_range-1) + thread_range*0.5*(vector_range)*(vector_range-1));
|
||||
// sum ( seq( [0, team_range) * constant ) + 1 + sum( seq([0,vector_range) ) per member per team; constant = sum( seq( [0, thread_range) )*team_size (1 per member, result for each team)
|
||||
}
|
||||
|
||||
// parallel_for RangePolicy: range = team_size*team_range
|
||||
if (test_type == 300) {
|
||||
Kokkos::parallel_for("300 outer for", team_size*team_range,
|
||||
KOKKOS_LAMBDA (const int idx) {
|
||||
v1(idx) = idx;
|
||||
// prevent compiler from optimizing away the loop
|
||||
});
|
||||
}
|
||||
// parallel_reduce RangePolicy: range = team_size*team_range
|
||||
if (test_type == 400) {
|
||||
Kokkos::parallel_reduce("400 outer reduce", team_size*team_range,
|
||||
KOKKOS_LAMBDA (const int idx, double& val) {
|
||||
val += idx;
|
||||
}, result);
|
||||
result_expect = 0.5*(team_size*team_range)*(team_size*team_range-1);
|
||||
}
|
||||
// parallel_scan RangePolicy: range = team_size*team_range
|
||||
if (test_type == 500) {
|
||||
Kokkos::parallel_scan("500 outer scan", team_size*team_range,
|
||||
ParallelScanFunctor<ViewType1>(v1)
|
||||
#if 0
|
||||
// This does not compile with pre Cuda 8.0 - see Github Issue #913 for explanation
|
||||
KOKKOS_LAMBDA (const int idx, double& val, const bool& final) {
|
||||
// inclusive scan
|
||||
val += v1(idx);
|
||||
if ( final ) {
|
||||
v1(idx) = val;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
);
|
||||
// result = v1( team_size*team_range - 1 ); // won't work with Cuda - need to copy result back to host to print
|
||||
// result_expect = 0.5*(team_size*team_range)*(team_size*team_range-1);
|
||||
}
|
||||
|
||||
} // end outer for loop
|
||||
|
||||
time = timer.seconds();
|
||||
} //end test_policy
|
|
@ -0,0 +1,53 @@
|
|||
#!/bin/bash

# Smoke test: run the policy_performance executable once for every test code
# and for both schedule kinds, first for the host build and then for the Cuda
# build, skipping a build whose executable is not present.

echo "Performance test results for parallel_reduce code computing sum of sequence [0,N) with various (nested) policies"

EXECUTABLE=policy_performance

TEAMRANGE=1000
THREADRANGE=4
VECTORRANGE=32
TEAMSIZE=4
VECTORSIZE=1
OREPEAT=1
MREPEAT=1
IREPEAT=1
SCHEDULE=1

SUFFIX=host
if [ -e $EXECUTABLE.$SUFFIX ]
then
  for SCHEDULE in 1 2
  do
    if [ $SCHEDULE -eq 1 ]
    then
      echo "Host tests Static schedule"
    else
      echo "Host tests Dynamic schedule"
    fi
    for CODE in 100 110 111 112 120 121 122 200 210 211 212 220 221 222 300 400 500
    do
      OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
    done
  done
fi

SUFFIX=cuda
if [ -e $EXECUTABLE.$SUFFIX ]
then
  for SCHEDULE in 1 2
  do
    if [ $SCHEDULE -eq 1 ]
    then
      echo "Cuda tests Static schedule"
    else
      echo "Cuda tests Dynamic schedule"
    fi
    for CODE in 100 110 111 112 120 121 122 200 210 211 212 220 221 222 300 400 500
    do
      ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
    done
  done
fi
|
|
@ -0,0 +1,126 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Sample script for benchmarking policy performance
|
||||
|
||||
# Suggested environment variables to export prior to executing script:
|
||||
# KNL:
|
||||
# OMP_NUM_THREADS=256 KMP_AFFINITY=compact
|
||||
# Power:
|
||||
# OMP_NUM_THREADS=64 OMP_PROC_BIND=true
|
||||
|
||||
# Constants and Variables:
|
||||
# Vary: TEAMSIZE, and THREADRANGE
|
||||
# for TEAMSIZE in {1,2,4,5,8}; do
|
||||
# for THREADRANGE in {32,41,1000}; do
|
||||
# Fixed: TEAMRANGE, VECTORRANGE, VECTORSIZE
|
||||
# System specific: Adjust REPEAT values to architecture tests are run on
|
||||
|
||||
# Tests
|
||||
# Static SCHEDULE = 1
|
||||
# Tier 1: parallel_for + RangePolicy 300
|
||||
# Tier 2: parallel_reduce, parallel_scan + RangePolicy 400 500
|
||||
# Tier 3: 'outer' parallel_for with TeamPolicy (nested parallelism) 1XY
|
||||
# Tier 4: 'outer' parallel_reduce with TeamPolicy (nested parallelism) 2XY
|
||||
# Dynamic SCHEDULE = 2
|
||||
# Tier 5: parallel_for + RangePolicy 300
|
||||
# Tier 6: parallel_reduce, parallel_scan + RangePolicy 400 500
|
||||
# Tier 7: 'outer' parallel_for with TeamPolicy (nested parallelism) 1XY
|
||||
# Tier 8: 'outer' parallel_reduce with TeamPolicy (nested parallelism) 2XY
|
||||
|
||||
# Results grouped by:
|
||||
# 0) SCHEDULE 1) CODE (test) 2) TEAMRANGE 3) TEAMSIZE 4) THREADRANGE
|
||||
|
||||
EXECUTABLE=policy_performance
|
||||
|
||||
# Default defined values
|
||||
TEAMRANGE=1000
|
||||
THREADRANGE=1
|
||||
VECTORRANGE=32
|
||||
TEAMSIZE=1
|
||||
VECTORSIZE=1
|
||||
OREPEAT=1
|
||||
MREPEAT=1
|
||||
IREPEAT=1
|
||||
SCHEDULE=1
|
||||
|
||||
# Host tests
|
||||
SUFFIX=host
|
||||
# Run every benchmark tier against the host executable, once per schedule
# (1 = static, 2 = dynamic). Skipped when the host executable is not present.
if [ -e $EXECUTABLE.$SUFFIX ]; then
  echo "Host"

  for SCHEDULE in {1,2}; do

    # Reset defaults: the sweeps below leave TEAMSIZE and THREADRANGE at the
    # last value they iterated over, which would otherwise be passed to the
    # Tier 1/2 runs of the next schedule. The Cuda section resets the same
    # values at the same point.
    TEAMRANGE=1000
    THREADRANGE=1
    VECTORRANGE=32
    TEAMSIZE=1
    VECTORSIZE=1

    # Tier 1 and 2, 5 and 6
    for CODE in {300,400,500}; do
      for TEAMSIZE in {1,2,4,5,8}; do
        OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
      done
    done

    # Tier 3, 7
    for CODE in {100,110,111,112,120,121,122}; do
      for TEAMSIZE in {1,2,4,5,8}; do
        for THREADRANGE in {32,41,1000}; do
          OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
        done
      done
    done

    # Tier 4, 8
    for CODE in {200,210,211,212,220,221,222}; do
      for TEAMSIZE in {1,2,4,5,8}; do
        for THREADRANGE in {32,41,1000}; do
          OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
        done
      done
    done

  done # end SCHEDULE

fi # end host
|
||||
|
||||
|
||||
# Cuda tests
|
||||
SUFFIX=cuda
|
||||
# TEAMRANGE=10000, TEAMSIZE=8 too large
|
||||
# TEAMRANGE=10000, TEAMSIZE=8, THREADRANGE=1000 too large
|
||||
if [ -e $EXECUTABLE.$SUFFIX ]; then
|
||||
echo "Cuda"
|
||||
|
||||
for SCHEDULE in {1,2}; do
|
||||
|
||||
# Reset defaults
|
||||
TEAMRANGE=1000
|
||||
THREADRANGE=1
|
||||
VECTORRANGE=32
|
||||
TEAMSIZE=1
|
||||
VECTORSIZE=1
|
||||
|
||||
# Tier 1 and 2, 5 and 6
|
||||
for CODE in {300,400,500}; do
|
||||
for TEAMSIZE in {1,2,4,5,8}; do
|
||||
./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
|
||||
done
|
||||
done
|
||||
|
||||
# Tier 3, 7
|
||||
for CODE in {100,110,111,112,120,121,122}; do
|
||||
for TEAMSIZE in {1,2,4,5,8}; do
|
||||
for THREADRANGE in {32,41,1000}; do
|
||||
./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
|
||||
done
|
||||
done
|
||||
done
|
||||
|
||||
# Tier 4, 8
|
||||
for CODE in {200,210,211,212,220,221,222}; do
|
||||
for TEAMSIZE in {1,2,4,5,8}; do
|
||||
for THREADRANGE in {32,41,1000}; do
|
||||
./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
|
||||
done
|
||||
done
|
||||
done
|
||||
|
||||
done # end SCHEDULE
|
||||
|
||||
fi #end cuda
|
|
@ -0,0 +1,454 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
################################################################################
|
||||
# Check if hwloc commands exist
|
||||
################################################################################
|
||||
# HPCBIND_HAS_HWLOC stays 1 only when every hwloc utility used by this script
# can be resolved by the shell; a single missing tool clears it.
declare -i HPCBIND_HAS_HWLOC=1
for hpcbind_tool in hwloc-bind hwloc-distrib hwloc-ls hwloc-calc hwloc-ps; do
  if ! type "${hpcbind_tool}" >/dev/null 2>&1; then
    HPCBIND_HAS_HWLOC=0
  fi
done
unset hpcbind_tool

if (( HPCBIND_HAS_HWLOC == 0 )); then
  echo "hwloc not found, no process binding will occur"
fi
|
||||
|
||||
# Get parent cpuset
|
||||
# Cpuset reported by hwloc-ps for this shell process. Stays empty when hwloc
# is unavailable or hwloc-ps does not list the process.
HPCBIND_HWLOC_PARENT_CPUSET=""
if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
  MY_PID="$BASHPID"
  # Compare the first column (the PID) exactly. A plain grep for the PID also
  # matches lines that merely contain those digits, e.g. PID 123 inside 1234
  # or inside a cpuset mask or process name.
  # NOTE(review): assumes hwloc-ps prints "<pid> <cpuset> <name>" with the
  # cpuset as the second whitespace-separated field - confirm against the
  # installed hwloc version.
  HPCBIND_HWLOC_PARENT_CPUSET=$(hwloc-ps --cpuset | awk -v pid="${MY_PID}" '$1 == pid { print $2; exit }')
fi
|
||||
|
||||
################################################################################
|
||||
# Check if nvidia-smi exist
|
||||
################################################################################
|
||||
declare -i HPCBIND_HAS_NVIDIA=0
|
||||
type nvidia-smi >/dev/null 2>&1
|
||||
HPCBIND_HAS_NVIDIA=$((!$?))
|
||||
|
||||
|
||||
################################################################################
|
||||
# Get visible gpu
|
||||
################################################################################
|
||||
declare -i NUM_GPUS=0
|
||||
HPCBIND_VISIBLE_GPUS=""
|
||||
if [[ ${HPCBIND_HAS_NVIDIA} -eq 1 ]]; then
|
||||
NUM_GPUS=$(nvidia-smi -L | wc -l);
|
||||
GPU_LIST="$( seq 0 $((NUM_GPUS-1)) )"
|
||||
HPCBIND_VISIBLE_GPUS=${CUDA_VISIBLE_DEVICES:-${GPU_LIST}}
|
||||
fi
|
||||
|
||||
declare -i HPCBIND_ENABLE_GPU_MAPPING=$((NUM_GPUS > 0))
|
||||
|
||||
|
||||
################################################################################
|
||||
# Get queue id
|
||||
# supports sbatch, bsub, aprun
|
||||
################################################################################
|
||||
# Detect the launcher from the environment variable it sets for each task and
# record that task's index:
#   sbatch -> SLURM_LOCAL_ID
#   bsub   -> LSB_JOBINDEX
#   aprun  -> ALPS_APP_PE
# When none is set the queue name stays empty and queue based GPU mapping
# stays disabled.
HPCBIND_QUEUE_NAME=""
declare -i HPCBIND_QUEUE_INDEX=0
declare -i HPCBIND_QUEUE_GPU_MAPPING=0

if [[ -n "${SLURM_LOCAL_ID}" ]]; then
  HPCBIND_QUEUE_GPU_MAPPING=1
  HPCBIND_QUEUE_NAME="sbatch"
  HPCBIND_QUEUE_INDEX=${SLURM_LOCAL_ID}
elif [[ -n "${LSB_JOBINDEX:-${LBS_JOBINDEX}}" ]]; then
  # LSF exports LSB_JOBINDEX. The misspelled LBS_JOBINDEX this script used to
  # test is still honoured as a fallback for callers that set it by hand.
  HPCBIND_QUEUE_GPU_MAPPING=1
  HPCBIND_QUEUE_NAME="bsub"
  HPCBIND_QUEUE_INDEX=${LSB_JOBINDEX:-${LBS_JOBINDEX}}
elif [[ -n "${ALPS_APP_PE}" ]]; then
  HPCBIND_QUEUE_GPU_MAPPING=1
  HPCBIND_QUEUE_NAME="aprun"
  HPCBIND_QUEUE_INDEX=${ALPS_APP_PE}
fi
|
||||
|
||||
|
||||
################################################################################
|
||||
# Show help
|
||||
################################################################################
|
||||
# Print the usage text: a summary, every supported option, and sample
# invocations. The script name shown is the basename of $0.
function show_help {
  local cmd=$(basename "$0")
  echo "Usage: ${cmd} <options> -- command ..."
  echo " Set the process mask, OMP environment variables and CUDA environment"
  echo " variables to sane values if possible. Uses hwloc and nvidia-smi if"
  echo " available. Will preserve the current process binding, so it is safe"
  echo " to use with a queuing system or mpiexec."
  echo ""
  echo "Options:"
  echo " --no-hwloc-bind Disable binding"
  echo " --proc-bind=<LOC> Set the initial process mask for the script"
  echo " LOC can be any valid location argument for"
  echo " hwloc-calc Default: all"
  echo " --distribute=N Distribute the current cpuset into N partitions"
  echo " --distribute-partition=I"
  echo " Use the i'th partition (zero based)"
  echo " --visible-gpus=<L> Comma separated list of gpu ids"
  echo " Default: CUDA_VISIBLE_DEVICES or all gpus in"
  echo " sequential order"
  echo " --gpu-ignore-queue Ignore queue job id when choosing visible GPU"
  echo " --no-gpu-mapping Do not set CUDA_VISIBLE_DEVICES"
  echo " --openmp=M.m Set env variables for the given OpenMP version"
  echo " Default: 4.0"
  echo " --openmp-percent=N Integer percentage of cpuset to use for OpenMP"
  echo " threads Default: 100"
  echo " --openmp-places=<Op> Op=threads|cores|sockets. Default: threads"
  echo " --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES"
  echo " --force-openmp-num-threads=N"
  echo " Override logic for selecting OMP_NUM_THREADS"
  echo " --force-openmp-proc-bind=<OP>"
  echo " Override logic for selecting OMP_PROC_BIND"
  echo " --no-openmp-nested Set OMP_NESTED to false"
  echo " --show-bindings Show the bindings"
  echo " --lstopo Show bindings in lstopo without executing a command"
  echo " -v|--verbose Show options and relevant environment variables"
  echo " -h|--help Show this message"
  echo ""
  echo "Sample Usage:"
  echo " Split the current process cpuset into 4 and use the 3rd partition"
  echo " ${cmd} --distribute=4 --distribute-partition=2 -v -- command ..."
  echo " Bind the process to all even cores"
  echo " ${cmd} --proc-bind=core:even -v -- command ..."
  echo " Bind to the first 64 cores and split the current process cpuset into 4"
  echo " ${cmd} --proc-bind=core:0-63 --distribute=4 --distribute-partition=0 -- command ..."
  echo " skip GPU 0 when mapping visible devices"
  echo " ${cmd} --distribute=4 --distribute-partition=0 --visible-gpus=1,2 -v -- command ..."
  echo " Display the current bindings"
  echo " ${cmd} --proc-bind=numa:0 --show-bindings -- command"
  echo " Display the current bindings using lstopo"
  echo " ${cmd} --proc-bind=numa:0.core:odd --lstopo"
  echo ""
}
|
||||
|
||||
|
||||
################################################################################
|
||||
# Parse command line arguments
|
||||
################################################################################
|
||||
# Show help if no command line arguments given
|
||||
if [[ "$#" -eq 0 ]]; then
|
||||
show_help
|
||||
exit 0
|
||||
fi
|
||||
|
||||
declare -a UNKNOWN_ARGS=()
|
||||
declare -i HPCBIND_ENABLE_HWLOC_BIND=${HPCBIND_HAS_HWLOC}
|
||||
declare -i HPCBIND_DISTRIBUTE=1
|
||||
declare -i HPCBIND_PARTITION=0
|
||||
HPCBIND_PROC_BIND="all"
|
||||
HPCBIND_OPENMP_VERSION=4.0
|
||||
declare -i HPCBIND_OPENMP_PERCENT=100
|
||||
HPCBIND_OPENMP_PLACES=${OMP_PLACES:-threads}
|
||||
declare -i HPCBIND_OPENMP_PROC_BIND=1
|
||||
declare -i HPCBIND_OPENMP_FORCE_NUM_THREADS=-1
|
||||
HPCBIND_OPENMP_FORCE_PROC_BIND=""
|
||||
HPCBIND_OPENMP_NESTED=${OMP_NESTED:-true}
|
||||
declare -i HPCBIND_VERBOSE=0
|
||||
|
||||
declare -i HPCBIND_SHOW_BINDINGS=0
|
||||
declare -i HPCBIND_LSTOPO=0
|
||||
|
||||
# Consume hpcbind's own options from the front of the argument list. Every
# handled word is shifted away, so that once "--" is reached "$@" holds only
# the command to run. The list is quoted so an argument containing whitespace
# stays one word; unquoted, it would be split into several loop iterations
# and the shifts would drop words belonging to the command.
for i in "$@"; do
  case "$i" in
    # disable hwloc based binding
    --no-hwloc-bind)
      HPCBIND_ENABLE_HWLOC_BIND=0
      shift
      ;;
    # initial process mask
    --proc-bind=*)
      HPCBIND_PROC_BIND="${i#*=}"
      shift
      ;;
    # number of partitions to create
    --distribute=*)
      HPCBIND_DISTRIBUTE="${i#*=}"
      shift
      ;;
    # which partition to use
    --distribute-partition=*)
      HPCBIND_PARTITION="${i#*=}"
      shift
      ;;
    # comma separated list, stored space separated
    --visible-gpus=*)
      HPCBIND_VISIBLE_GPUS=$(echo "${i#*=}" | tr ',' ' ')
      shift
      ;;
    --gpu-ignore-queue)
      HPCBIND_QUEUE_GPU_MAPPING=0
      shift
      ;;
    --no-gpu-mapping)
      HPCBIND_ENABLE_GPU_MAPPING=0
      shift
      ;;
    --openmp=*)
      HPCBIND_OPENMP_VERSION="${i#*=}"
      shift
      ;;
    --openmp-percent=*)
      HPCBIND_OPENMP_PERCENT="${i#*=}"
      shift
      ;;
    --openmp-places=*)
      HPCBIND_OPENMP_PLACES="${i#*=}"
      shift
      ;;
    --no-openmp-proc-bind)
      HPCBIND_OPENMP_PROC_BIND=0
      shift
      ;;
    --force-openmp-proc-bind=*)
      HPCBIND_OPENMP_FORCE_PROC_BIND="${i#*=}"
      shift
      ;;
    --force-openmp-num-threads=*)
      HPCBIND_OPENMP_FORCE_NUM_THREADS="${i#*=}"
      shift
      ;;
    --no-openmp-nested)
      HPCBIND_OPENMP_NESTED="false"
      shift
      ;;
    # showing bindings implies verbose output
    --show-bindings)
      HPCBIND_VERBOSE=1
      HPCBIND_SHOW_BINDINGS=1
      shift
      ;;
    # lstopo mode implies verbose output and replaces --show-bindings
    --lstopo)
      HPCBIND_VERBOSE=1
      HPCBIND_SHOW_BINDINGS=0
      HPCBIND_LSTOPO=1
      shift
      ;;
    -v|--verbose)
      HPCBIND_VERBOSE=1
      shift
      ;;
    -h|--help)
      show_help
      exit 0
      ;;
    # ignore remaining arguments
    --)
      shift
      break
      ;;
    # unknown option
    *)
      UNKNOWN_ARGS+=("$i")
      shift
      ;;
  esac
done
|
||||
|
||||
|
||||
################################################################################
|
||||
# Check unknown arguments
|
||||
################################################################################
|
||||
# Abort when any option was not recognised. -gt compares the element count
# numerically; ">" inside [[ ]] is a string comparison.
if [[ ${#UNKNOWN_ARGS[*]} -gt 0 ]]; then
  echo "Unknown options: ${UNKNOWN_ARGS[*]}"
  exit 1
fi
|
||||
|
||||
|
||||
################################################################################
|
||||
# Check that visible gpus are valid
|
||||
################################################################################
|
||||
# Turn the GPU list into an array. The list is space separated when it comes
# from the generated default sequence or from --visible-gpus, but comma
# separated when it was inherited from CUDA_VISIBLE_DEVICES, so commas are
# treated as separators as well.
HPCBIND_VISIBLE_GPUS=(${HPCBIND_VISIBLE_GPUS//,/ })
if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then
  # Replace any id outside [0, NUM_GPUS) with 0.
  for ((i=0; i < ${#HPCBIND_VISIBLE_GPUS[*]}; i++)); do
    if [[ ${HPCBIND_VISIBLE_GPUS[$i]} -ge ${NUM_GPUS} ||
          ${HPCBIND_VISIBLE_GPUS[$i]} -lt 0 ]]; then
      echo "Invalid GPU ID ${HPCBIND_VISIBLE_GPUS[$i]}, setting to 0"
      HPCBIND_VISIBLE_GPUS[$i]=0;
    fi
  done
  # From here on NUM_GPUS is the number of usable (visible) GPUs.
  NUM_GPUS=${#HPCBIND_VISIBLE_GPUS[@]}
fi
|
||||
|
||||
|
||||
################################################################################
|
||||
# Check OpenMP percent
|
||||
################################################################################
|
||||
# Clamp the requested OpenMP percentage into the inclusive range [1, 100] and
# report whenever the value had to be adjusted.
if (( HPCBIND_OPENMP_PERCENT > 100 )); then
  echo "OpenMP percent > 100, setting to 100"
  HPCBIND_OPENMP_PERCENT=100
elif (( HPCBIND_OPENMP_PERCENT < 1 )); then
  echo "OpenMP percent < 1, setting to 1"
  HPCBIND_OPENMP_PERCENT=1
fi
|
||||
|
||||
################################################################################
|
||||
# Check distribute
|
||||
################################################################################
|
||||
# The cpuset must be split into at least one partition.
if [[ ${HPCBIND_DISTRIBUTE} -le 0 ]]; then
  echo "Invalid input for distribute, changing distribute to 1"
  HPCBIND_DISTRIBUTE=1
fi

# The partition index is zero based and must name one of the partitions. A
# negative value is never valid: used as an array subscript, bash either
# rejects it or counts from the end of the array.
if [[ ${HPCBIND_PARTITION} -lt 0 || ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then
  echo "Invalid input for distribute-partition, changing to 0"
  HPCBIND_PARTITION=0
fi
|
||||
|
||||
|
||||
################################################################################
|
||||
# Find cpuset and num threads
|
||||
################################################################################
|
||||
HPCBIND_HWLOC_CPUSET=""
|
||||
declare -i HPCBIND_NUM_PUS=0
|
||||
|
||||
if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
|
||||
if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then
|
||||
BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND})
|
||||
else
|
||||
BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND})
|
||||
fi
|
||||
|
||||
CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE}))
|
||||
HPCBIND_HWLOC_CPUSET=${CPUSETS[${HPCBIND_PARTITION}]}
|
||||
HPCBIND_NUM_PUS=$(hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu | wc -l)
|
||||
else
|
||||
HPCBIND_NUM_PUS=$(cat /proc/cpuinfo | grep -c processor)
|
||||
fi
|
||||
|
||||
declare -i HPCBIND_OPENMP_NUM_THREADS=$((HPCBIND_NUM_PUS * HPCBIND_OPENMP_PERCENT))
|
||||
HPCBIND_OPENMP_NUM_THREADS=$((HPCBIND_OPENMP_NUM_THREADS / 100))
|
||||
|
||||
|
||||
if [[ ${HPCBIND_OPENMP_NUM_THREADS} -lt 1 ]]; then
|
||||
HPCBIND_OPENMP_NUM_THREADS=1
|
||||
elif [[ ${HPCBIND_OPENMP_NUM_THREADS} -gt ${HPCBIND_NUM_PUS} ]]; then
|
||||
HPCBIND_OPENMP_NUM_THREADS=${HPCBIND_NUM_PUS}
|
||||
fi
|
||||
|
||||
if [[ ${HPCBIND_OPENMP_FORCE_NUM_THREADS} -gt 0 ]]; then
|
||||
HPCBIND_OPENMP_NUM_THREADS=${HPCBIND_OPENMP_FORCE_NUM_THREADS}
|
||||
fi
|
||||
|
||||
################################################################################
|
||||
# Set OpenMP environment variables
|
||||
################################################################################
|
||||
|
||||
# set OMP_NUM_THREADS
|
||||
export OMP_NUM_THREADS=${HPCBIND_OPENMP_NUM_THREADS}
|
||||
|
||||
# set OMP_PROC_BIND and OMP_PLACES
|
||||
if [[ ${HPCBIND_OPENMP_PROC_BIND} -eq 1 ]]; then
|
||||
if [[ "${HPCBIND_OPENMP_FORCE_PROC_BIND}" == "" ]]; then
|
||||
#default proc bind logic
|
||||
if [[ "${HPCBIND_OPENMP_VERSION}" == "4.0" || "${HPCBIND_OPENMP_VERSION}" > "4.0" ]]; then
|
||||
export OMP_PLACES="${HPCBIND_OPENMP_PLACES}"
|
||||
export OMP_PROC_BIND="spread"
|
||||
else
|
||||
export OMP_PROC_BIND="true"
|
||||
unset OMP_PLACES
|
||||
fi
|
||||
else
|
||||
#force proc bind
|
||||
export OMP_PLACES="${HPCBIND_OPENMP_PLACES}"
|
||||
export OMP_PROC_BIND="${HPCBIND_OPENMP_FORCE_PROC_BIND}"
|
||||
fi
|
||||
else
|
||||
# no openmp proc bind
|
||||
unset OMP_PLACES
|
||||
unset OMP_PROC_BIND
|
||||
fi
|
||||
|
||||
# set OMP_NESTED
|
||||
export OMP_NESTED=${HPCBIND_OPENMP_NESTED}
|
||||
|
||||
|
||||
################################################################################
|
||||
# Set CUDA environment variables
|
||||
################################################################################
|
||||
|
||||
# Choose one entry of the visible GPU list round-robin and export it as
# CUDA_VISIBLE_DEVICES. Without queue mapping the partition index selects the
# entry; with queue mapping the index is offset by the queue index times the
# number of partitions.
if (( HPCBIND_ENABLE_GPU_MAPPING == 1 )); then
  declare -i GPU_ID
  if (( HPCBIND_QUEUE_GPU_MAPPING == 0 )); then
    GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS))
  else
    declare -i MY_TASK_ID=$((HPCBIND_QUEUE_INDEX * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION))
    GPU_ID=$((MY_TASK_ID % NUM_GPUS))
  fi
  export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}
fi
|
||||
|
||||
################################################################################
|
||||
# Set hpcbind environment variables
|
||||
################################################################################
|
||||
export HPCBIND_HAS_HWLOC=${HPCBIND_HAS_HWLOC}
|
||||
export HPCBIND_HAS_NVIDIA=${HPCBIND_HAS_NVIDIA}
|
||||
export HPCBIND_NUM_PUS=${HPCBIND_NUM_PUS}
|
||||
export HPCBIND_HWLOC_CPUSET=${HPCBIND_HWLOC_CPUSET}
|
||||
export HPCBIND_HWLOC_DISTRIBUTE=${HPCBIND_DISTRIBUTE}
|
||||
export HPCBIND_HWLOC_DISTRIBUTE_PARTITION=${HPCBIND_PARTITION}
|
||||
if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then
|
||||
export HPCBIND_HWLOC_PARENT_CPUSET="all"
|
||||
else
|
||||
export HPCBIND_HWLOC_PARENT_CPUSET=${HPCBIND_HWLOC_PARENT_CPUSET}
|
||||
fi
|
||||
export HPCBIND_HWLOC_PROC_BIND=${HPCBIND_PROC_BIND}
|
||||
export HPCBIND_NVIDIA_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING}
|
||||
export HPCBIND_NVIDIA_VISIBLE_GPUS=$(echo "${HPCBIND_VISIBLE_GPUS[*]}" | tr ' ' ',')
|
||||
export HPCBIND_OPENMP_VERSION=${HPCBIND_OPENMP_VERSION}
|
||||
if [[ "${HPCBIND_QUEUE_NAME}" != "" ]]; then
|
||||
export HPCBIND_QUEUE_INDEX=${HPCBIND_QUEUE_INDEX}
|
||||
export HPCBIND_QUEUE_NAME=${HPCBIND_QUEUE_NAME}
|
||||
export HPCBIND_QUEUE_GPU_MAPPING=${HPCBIND_QUEUE_GPU_MAPPING}
|
||||
fi
|
||||
|
||||
|
||||
################################################################################
|
||||
# Print verbose
|
||||
################################################################################
|
||||
|
||||
if [[ ${HPCBIND_VERBOSE} -eq 1 ]]; then
|
||||
MY_ENV=$(env | sort)
|
||||
echo "[HPCBIND]"
|
||||
echo "${MY_ENV}" | grep -E "^HPCBIND_"
|
||||
echo "[CUDA]"
|
||||
echo "${MY_ENV}" | grep -E "^CUDA_"
|
||||
echo "[OPENMP]"
|
||||
echo "${MY_ENV}" | grep -E "^OMP_"
|
||||
fi
|
||||
|
||||
if [[ ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then
|
||||
echo "[BINDINGS]"
|
||||
hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu
|
||||
elif [[ ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then
|
||||
echo "Unable to show bindings, hwloc not available."
|
||||
fi
|
||||
|
||||
################################################################################
|
||||
# Run command
|
||||
################################################################################
|
||||
|
||||
# Normal mode: run the user's command, bound to the selected cpuset when
# hwloc binding is enabled. lstopo mode: display the bindings instead of
# running a command.
if [[ ${HPCBIND_LSTOPO} -eq 0 ]]; then
  if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
    # "$@" is quoted so every argument of the command reaches hwloc-bind as
    # the same single word it was given as.
    hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- "$@"
  else
    # NOTE(review): eval re-parses the joined arguments as shell code, unlike
    # the hwloc-bind branch above; left unchanged for backward compatibility.
    eval $@
  fi
else
  if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
    # The lstopo call is only made when binding is enabled and DISPLAY is set.
    if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! -z ${DISPLAY} ]]; then
      echo "[BINDINGS]"
      hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu
      hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- lstopo --pid 0
    else
      hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET}
    fi
  else
    echo "Unable to show bindings, hwloc not available."
  fi
fi
|
|
@ -0,0 +1,221 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
# check if hwloc commands exist
|
||||
declare -i HAS_HWLOC=0
|
||||
type hwloc-bind >/dev/null 2>&1
|
||||
HAS_HWLOC="${HAS_HWLOC} + $?"
|
||||
|
||||
type hwloc-distrib >/dev/null 2>&1
|
||||
HAS_HWLOC="${HAS_HWLOC} + $?"
|
||||
|
||||
type hwloc-ls >/dev/null 2>&1
|
||||
HAS_HWLOC="${HAS_HWLOC} + $?"
|
||||
|
||||
type hwloc-calc >/dev/null 2>&1
|
||||
HAS_HWLOC="${HAS_HWLOC} + $?"
|
||||
|
||||
type hwloc-ps >/dev/null 2>&1
|
||||
HAS_HWLOC="${HAS_HWLOC} + $?"
|
||||
|
||||
|
||||
#parse args
|
||||
declare -a UNKNOWN_ARGS=()
|
||||
declare -i DISTRIBUTE=1
|
||||
declare -i INDEX=0
|
||||
PROC_BIND="all"
|
||||
CURRENT_CPUSET=""
|
||||
OPENMP_VERSION=4.0
|
||||
OPENMP_PROC_BIND=True
|
||||
OPENMP_NESTED=True
|
||||
VERBOSE=False
|
||||
|
||||
#get the current process cpuset
|
||||
if [[ ${HAS_HWLOC} -eq 0 ]]; then
|
||||
MY_PID="$BASHPID"
|
||||
CURRENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2)
|
||||
echo "$CURRENT_CPUSET"
|
||||
fi
|
||||
|
||||
# Print the usage text: a summary, the supported options and a sample
# invocation. The script name shown is the basename of $0.
function show_help {
  local cmd=$(basename "$0")
  echo "Usage: ${cmd} <options> -- command ..."
  echo " Uses hwloc to divide the node into the given number of groups,"
  echo " set the appropriate OMP_NUM_THREADS and execute the command on the"
  echo " selected group."
  echo ""
  echo " NOTE: This command assumes it has exclusive use of the node"
  echo ""
  echo "Options:"
  echo " --proc-bind=<LOC> Set the initial process mask for the script. "
  echo " LOC can be any valid location argument for"
  echo " hwloc-calc. Defaults to the entire machine"
  echo " --distribute=N Distribute the current proc-bind into N groups"
  echo " --index=I Use the i'th group (zero based)"
  echo " --openmp=M.m Set env variables for the given OpenMP version"
  echo " (default 4.0)"
  echo " --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES"
  echo " --no-openmp-nested Set OMP_NESTED to false"
  echo " -v|--verbose"
  echo " -h|--help"
  echo ""
  echo "Sample Usage:"
  echo " ${cmd} --distribute=4 --index=2 -v -- command ..."
  echo ""
}
|
||||
|
||||
if [[ "$#" -eq 0 ]]; then
|
||||
show_help
|
||||
exit 0
|
||||
fi
|
||||
|
||||
|
||||
# Consume this script's own options from the front of the argument list. Every
# handled word is shifted away, so that once "--" is reached "$@" holds only
# the command to run. The list is quoted so an argument containing whitespace
# stays one word and the shifts stay in step with the positional parameters.
for i in "$@"; do
  case "$i" in
    # initial process mask
    --proc-bind=*)
      PROC_BIND="${i#*=}"
      shift
      ;;
    # number of partitions to create
    --distribute=*)
      DISTRIBUTE="${i#*=}"
      shift
      ;;
    # which group to use
    --index=*)
      INDEX="${i#*=}"
      shift
      ;;
    --openmp=*)
      OPENMP_VERSION="${i#*=}"
      shift
      ;;
    --no-openmp-proc-bind)
      OPENMP_PROC_BIND=False
      shift
      ;;
    --no-openmp-nested)
      OPENMP_NESTED=False
      shift
      ;;
    -v|--verbose)
      VERBOSE=True
      shift
      ;;
    -h|--help)
      show_help
      exit 0
      ;;
    # ignore remaining arguments
    --)
      shift
      break
      ;;
    # unknown option
    *)
      UNKNOWN_ARGS+=("$i")
      shift
      ;;
  esac
done
|
||||
|
||||
# Abort when any option was not recognised. -gt compares the element count
# numerically; ">" inside [[ ]] is a string comparison.
if [[ ${#UNKNOWN_ARGS[*]} -gt 0 ]]; then
  echo "Unknown options: ${UNKNOWN_ARGS[*]}"
  exit 1
fi
|
||||
|
||||
if [[ ${DISTRIBUTE} -le 0 ]]; then
|
||||
echo "Invalid input for distribute, changing distribute to 1"
|
||||
DISTRIBUTE=1
|
||||
fi
|
||||
|
||||
if [[ ${INDEX} -ge ${DISTRIBUTE} ]]; then
|
||||
echo "Invalid input for index, changing index to 0"
|
||||
INDEX=0
|
||||
fi
|
||||
|
||||
if [[ ${HAS_HWLOC} -ne 0 ]]; then
|
||||
echo "hwloc not found, no process binding will occur"
|
||||
DISTRIBUTE=1
|
||||
INDEX=0
|
||||
fi
|
||||
|
||||
# Work out the cpuset and thread count, set the OpenMP environment and run the
# command. HAS_HWLOC is the sum of the exit statuses of the hwloc tool
# lookups, so 0 means every hwloc tool was found.
if [[ ${HAS_HWLOC} -eq 0 ]]; then
  # Resolve the requested location, restricted to the current cpuset if known.
  if [[ "${CURRENT_CPUSET}" == "" ]]; then
    BINDING=$(hwloc-calc ${PROC_BIND})
  else
    BINDING=$(hwloc-calc --restrict ${CURRENT_CPUSET} ${PROC_BIND})
  fi

  # Split the binding into DISTRIBUTE groups and pick group INDEX; the thread
  # count is the number of processing units hwloc-ls lists for that group.
  CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${DISTRIBUTE}))
  CPUSET=${CPUSETS[${INDEX}]}
  NUM_THREADS=$(hwloc-ls --restrict ${CPUSET} --only pu | wc -l)
else
  # Without hwloc, count the matching lines of /proc/cpuinfo.
  NUM_THREADS=$(cat /proc/cpuinfo | grep -c processor)
fi

if [[ "${VERBOSE}" == "True" ]]; then
  if [[ ${HAS_HWLOC} -eq 0 ]]; then
    echo "hwloc: true"
    echo " proc_bind: ${PROC_BIND}"
    echo " distribute: ${DISTRIBUTE}"
    echo " index: ${INDEX}"
    echo " parent_cpuset: ${CURRENT_CPUSET}"
    echo " cpuset: ${CPUSET}"
  else
    echo "hwloc: false"
  fi
  echo "omp_num_threads: ${NUM_THREADS}"
  echo "omp_proc_bind: ${OPENMP_PROC_BIND}"
  echo "omp_nested: ${OPENMP_NESTED}"
  echo "OpenMP: ${OPENMP_VERSION}"
fi

# set OMP env
if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then
  # NOTE(review): the version test is a string comparison, so a two-digit
  # major version would sort before "4.0".
  if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then
    export OMP_PLACES="threads"
    export OMP_PROC_BIND="spread"
  else
    export OMP_PROC_BIND="true"
    unset OMP_PLACES
  fi
else
  unset OMP_PLACES
  unset OMP_PROC_BIND
fi
if [[ "${OPENMP_NESTED}" == "True" ]]; then
  export OMP_NESTED="true"
else
  export OMP_NESTED="false"
fi
export OMP_NUM_THREADS="${NUM_THREADS}"

if [[ ${HAS_HWLOC} -eq 0 ]]; then
  # "$@" is quoted so every argument of the command reaches hwloc-bind as the
  # same single word it was given as.
  hwloc-bind ${CPUSET} -- "$@"
else
  # NOTE(review): eval re-parses the joined arguments as shell code, unlike
  # the hwloc-bind branch; left unchanged for backward compatibility.
  eval $@
fi
|
||||
|
|
@ -0,0 +1,165 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
# Print the physical (symlink-resolved) path of the directory one level above
# the directory that contains this script.  Intended to be called inside a
# command substitution so the directory changes stay in the subshell.
get_path() {
  cd "$(dirname "$0")"
  cd ..
  pwd -P
}

# Root of the Kokkos checkout, derived from this script's location.
KOKKOS_PATH="$(get_path "$0")"
|
||||
|
||||
# Print the usage text for this script, followed by the help output of
# generate_makefile.bash (its options are accepted here and forwarded).
# Always returns 0.
function show_help() {
  # Declare and assign separately so the assignment does not hide the exit
  # status of the command substitution.
  local cmd
  cmd=$(basename "$0")
  echo "Usage: ${cmd} <options> "
  echo " Build and run the tests"
  echo ""
  echo "Options:"
  echo " -j=N|--make-j=N Build the tests in parallel"
  echo " -c|--clean Clean build and regenerate make files"
  echo " --clean-on-pass Clean build when runtest passes"
  echo " --output-prefix=<pre> Prefix of log files Default: runtest"
  echo " --build-only Only build the tests"
  echo " -v|--verbose Tee STDOUT and STDERR to screen and files"
  echo " -h|--help Show this message"
  echo ""
  # Quoted so a checkout path containing whitespace still resolves.
  "${KOKKOS_PATH}/generate_makefile.bash" --help
  return 0
}
|
||||
|
||||
|
||||
# Option state.  Arguments not recognized below are collected in
# GENERATE_ARGS and later passed to generate_makefile.bash.
declare -a GENERATE_ARGS=()
declare -i VERBOSE=0
declare -i CLEAN=0
declare -i CLEAN_ON_PASS=0
declare -i BUILD_ONLY=0
OUTPUT="runtest"

# Default build parallelism: HPCBIND_NUM_PUS when set, otherwise 1.
declare -i MAKE_J=${HPCBIND_NUM_PUS:-1}

# Iterate over "$@" quoted so an argument that contains whitespace stays a
# single word.  No shifting is needed: the loop walks a copy of the list and
# the positional parameters are not read again after this point.
for i in "$@"; do
  case "$i" in
    -j=*|--make-j=*)
      MAKE_J=${i#*=}
      ;;
    -c|--clean)
      CLEAN=1
      ;;
    --clean-on-pass)
      CLEAN_ON_PASS=1
      ;;
    --output-prefix=*)
      OUTPUT=${i#*=}
      ;;
    --build-only)
      BUILD_ONLY=1
      ;;
    -v|--verbose)
      VERBOSE=1
      ;;
    -h|--help)
      show_help
      exit 0
      ;;
    *)
      GENERATE_ARGS+=("$i")
      ;;
  esac
done
|
||||
|
||||
# Main driver: optionally clean, generate the Makefile, build the tests and
# (unless --build-only) run them.  Each stage appends to ${OUTPUT}.out and
# ${OUTPUT}.err, prints PASS/FAIL with its elapsed time, and aborts the
# script with exit status 1 on failure.

# Refuse to run from the repository root.  The right-hand side is quoted so
# it is compared literally instead of being used as a glob pattern.
if [[ "$(pwd -P)" == "${KOKKOS_PATH}" ]]; then
  echo "Cannot call $0 from root repository path ${KOKKOS_PATH}"
  exit 1
fi

# Some makefile dependencies are incorrect, so clean needs to force
# a new call to generate_makefile.bash
if [[ ${CLEAN} -eq 1 ]]; then
  START=${SECONDS}
  echo "Cleaning"
  /bin/rm -rf algorithms containers core example install Makefile >/dev/null 2>&1
  END=${SECONDS}
  echo " $((END-START)) seconds"
  if [[ ${VERBOSE} -eq 1 ]]; then
    echo ""
    echo ""
  fi
fi

# ---- Stage 1: generate the Makefile --------------------------------------
declare -i START=${SECONDS}
echo "Generating Makefile"
echo " ${KOKKOS_PATH}/generate_makefile.bash --kokkos-path=${KOKKOS_PATH} ${GENERATE_ARGS[*]}"

# Quiet mode: stdout goes to the log only; stderr is always shown and logged.
# Verbose mode: both streams are shown and logged.
if [[ ${VERBOSE} -eq 0 ]]; then
  "${KOKKOS_PATH}"/generate_makefile.bash --kokkos-path="${KOKKOS_PATH}" "${GENERATE_ARGS[@]}" > "${OUTPUT}.out" 2> >(tee "${OUTPUT}.err" >&2)
else
  "${KOKKOS_PATH}"/generate_makefile.bash --kokkos-path="${KOKKOS_PATH}" "${GENERATE_ARGS[@]}" > >(tee "${OUTPUT}.out") 2> >(tee "${OUTPUT}.err" >&2)
fi
# $? here is the status of the generate command run in the branch above.
declare -i RESULT=$?
declare -i END=${SECONDS}
if [[ ${RESULT} -eq 0 ]]; then
  echo " PASS: $((END-START)) seconds"
  if [[ ${VERBOSE} -eq 1 ]]; then
    echo ""
    echo ""
  fi
else
  grep "FAIL" "${OUTPUT}.out"
  grep "FAIL" "${OUTPUT}.err"
  echo " FAIL: $((END-START)) seconds"
  exit 1
fi

# ---- Stage 2: build the tests --------------------------------------------
START=${SECONDS}
echo "Building"
if [[ ${VERBOSE} -eq 0 ]]; then
  make --keep-going -j "${MAKE_J}" build-test >> "${OUTPUT}.out" 2> >(tee -a "${OUTPUT}.err" >&2)
else
  make --keep-going -j "${MAKE_J}" build-test > >(tee -a "${OUTPUT}.out") 2> >(tee -a "${OUTPUT}.err" >&2)
fi
RESULT=$?
END=${SECONDS}
if [[ ${RESULT} -eq 0 ]]; then
  echo " PASS: $((END-START)) seconds"
  if [[ ${VERBOSE} -eq 1 ]]; then
    echo ""
    echo ""
  fi
else
  # Surface only the compiler error lines from the logs.
  grep -E "[[:space:]]error:[[:space:]]" "${OUTPUT}.out"
  grep -E "[[:space:]]error:[[:space:]]" "${OUTPUT}.err"
  echo " FAIL: $((END-START)) seconds"
  exit 1
fi

# ---- Stage 3: run the tests (skipped with --build-only) ------------------
if [[ ${BUILD_ONLY} -eq 0 ]]; then
  START=${SECONDS}
  echo "Testing"
  if [[ ${VERBOSE} -eq 0 ]]; then
    make --keep-going test >> "${OUTPUT}.out" 2> >(tee -a "${OUTPUT}.err" >&2)
  else
    make --keep-going test > >(tee -a "${OUTPUT}.out") 2> >(tee -a "${OUTPUT}.err" >&2)
  fi
  RESULT=$?
  END=${SECONDS}
  if [[ ${RESULT} -eq 0 ]]; then
    echo " PASS: $((END-START)) seconds"
    if [[ ${CLEAN_ON_PASS} -eq 1 ]]; then
      make clean
    fi
  else
    grep "FAIL" "${OUTPUT}.out"
    grep "FAIL" "${OUTPUT}.err"
    echo " FAIL: $((END-START)) seconds"
    exit 1
  fi
fi

exit ${RESULT}
|
||||
|
|
@ -999,8 +999,12 @@ SET (Kokkos_INCLUDE_DIRS
|
|||
${Kokkos_SOURCE_DIR}/containers/src
|
||||
${Kokkos_SOURCE_DIR}/algorithms/src
|
||||
${Kokkos_BINARY_DIR} # to find KokkosCore_config.h
|
||||
${KOKKOS_INCLUDE_DIRS}
|
||||
)
|
||||
|
||||
# pass include dirs back to parent scope
|
||||
SET(Kokkos_INCLUDE_DIRS_RET ${Kokkos_INCLUDE_DIRS} PARENT_SCOPE)
|
||||
|
||||
INCLUDE_DIRECTORIES(${Kokkos_INCLUDE_DIRS})
|
||||
|
||||
IF(KOKKOS_SEPARATE_LIBS)
|
||||
|
|
|
@ -7,3 +7,4 @@ tag: 2.02.07 date: 12:16:2016 master: 4b4cc4ba develop: 382c0966
|
|||
tag: 2.02.15 date: 02:10:2017 master: 8c64cd93 develop: 28dea8b6
|
||||
tag: 2.03.00 date: 04:25:2017 master: 120d9ce7 develop: 015ba641
|
||||
tag: 2.03.05 date: 05:27:2017 master: 36b92f43 develop: 79073186
|
||||
tag: 2.03.13 date: 07:27:2017 master: da314444 develop: 29ccb58a
|
||||
|
|
|
@ -0,0 +1,24 @@
|
|||
#include <cstdio>
|
||||
#include <cuda_runtime_api.h>
|
||||
int main()
|
||||
{
|
||||
cudaDeviceProp prop;
|
||||
const cudaError_t err_code = cudaGetDeviceProperties(&prop, 0);
|
||||
if (cudaSuccess != err_code) {
|
||||
fprintf(stderr,"cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(err_code));
|
||||
return -1;
|
||||
}
|
||||
switch (prop.major) {
|
||||
case 3:
|
||||
printf("Kepler"); break;
|
||||
case 5:
|
||||
printf("Maxwell"); break;
|
||||
case 6:
|
||||
printf("Pascal"); break;
|
||||
default:
|
||||
fprintf(stderr, "Unspported Device %d%d\n", (int)prop.major, (int)prop.minor);
|
||||
return -1;
|
||||
}
|
||||
printf("%d%d\n", (int)prop.major, (int)prop.minor);
|
||||
return 0;
|
||||
}
|
|
@ -160,9 +160,14 @@ if [ "$MACHINE" = "sems" ]; then
|
|||
# Format: (compiler module-list build-list exe-name warning-flag)
|
||||
COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"intel/16.0.3 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
|
||||
"clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
|
||||
"clang/3.8.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
|
||||
|
@ -280,13 +285,13 @@ elif [ "$MACHINE" = "apollo" ]; then
|
|||
"gcc/5.1.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS"
|
||||
"intel/16.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS"
|
||||
"clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS"
|
||||
"clang/head $CLANG_MODULE_LIST "Cuda_Pthread" clang++ $CUDA_WARNING_FLAGS"
|
||||
"clang/4.0.0 $CLANG_MODULE_LIST "Cuda_Pthread" clang++ $CUDA_WARNING_FLAGS"
|
||||
"cuda/8.0.44 $CUDA_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
)
|
||||
else
|
||||
# Format: (compiler module-list build-list exe-name warning-flag)
|
||||
COMPILERS=("cuda/8.0.44 $CUDA8_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
"clang/head $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS"
|
||||
"clang/4.0.0 $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS"
|
||||
"clang/3.9.0 $CLANG_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS"
|
||||
"gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
|
@ -584,7 +589,7 @@ single_build_and_test() {
|
|||
else
|
||||
run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
|
||||
local -i build_start_time=$(date +%s)
|
||||
run_cmd make build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
|
||||
run_cmd make -j 32 build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
|
||||
local -i build_end_time=$(date +%s)
|
||||
comment="build_time=$(($build_end_time-$build_start_time))"
|
||||
|
||||
|
|
|
@ -28,14 +28,14 @@ export JENKINS_DO_PTHREAD=ON
|
|||
export JENKINS_DO_SERIAL=OFF
|
||||
export JENKINS_DO_COMPLEX=OFF
|
||||
|
||||
export ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
|
||||
export ARCH_C_FLAG="-xCORE-AVX2 -mkl"
|
||||
export JENKINS_ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
|
||||
export JENKINS_ARCH_C_FLAG="-xCORE-AVX2 -mkl"
|
||||
export BLAS_LIBRARIES="-mkl;${MKLROOT}/lib/intel64/libmkl_intel_lp64.a;${MKLROOT}/lib/intel64/libmkl_intel_thread.a;${MKLROOT}/lib/intel64/libmkl_core.a"
|
||||
export LAPACK_LIBRARIES=${BLAS_LIBRARIES}
|
||||
|
||||
export JENKINS_DO_TESTS=ON
|
||||
export JENKINS_DO_EXAMPLES=ON
|
||||
export JENKINS_DO_SHARED=OFF
|
||||
export JENKINS_DO_SHARED=ON
|
||||
|
||||
export QUEUE=haswell
|
||||
|
||||
|
|
|
@ -28,14 +28,14 @@ export JENKINS_DO_PTHREAD=OFF
|
|||
export JENKINS_DO_SERIAL=ON
|
||||
export JENKINS_DO_COMPLEX=ON
|
||||
|
||||
export ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
|
||||
export ARCH_C_FLAG="-xCORE-AVX2 -mkl"
|
||||
export JENKINS_ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
|
||||
export JENKINS_ARCH_C_FLAG="-xCORE-AVX2 -mkl"
|
||||
export BLAS_LIBRARIES="-mkl;${MKLROOT}/lib/intel64/libmkl_intel_lp64.a;${MKLROOT}/lib/intel64/libmkl_intel_thread.a;${MKLROOT}/lib/intel64/libmkl_core.a"
|
||||
export LAPACK_LIBRARIES=${BLAS_LIBRARIES}
|
||||
|
||||
export JENKINS_DO_TESTS=ON
|
||||
export JENKINS_DO_EXAMPLES=ON
|
||||
export JENKINS_DO_SHARED=OFF
|
||||
export JENKINS_DO_SHARED=ON
|
||||
|
||||
export QUEUE=haswell
|
||||
|
||||
|
|
|
@ -60,7 +60,6 @@ test-threads: KokkosContainers_PerformanceTest_Threads
|
|||
test-openmp: KokkosContainers_PerformanceTest_OpenMP
|
||||
./KokkosContainers_PerformanceTest_OpenMP
|
||||
|
||||
|
||||
build_all: $(TARGETS)
|
||||
|
||||
test: $(TEST_TARGETS)
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
|
@ -36,12 +36,15 @@
|
|||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <cstdlib>
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
::testing::InitGoogleTest(&argc,argv);
|
||||
|
|
|
@ -69,30 +69,13 @@ protected:
|
|||
{
|
||||
std::cout << std::setprecision(5) << std::scientific;
|
||||
|
||||
unsigned num_threads = 4;
|
||||
|
||||
if (Kokkos::hwloc::available()) {
|
||||
num_threads = Kokkos::hwloc::get_available_numa_count()
|
||||
* Kokkos::hwloc::get_available_cores_per_numa()
|
||||
* Kokkos::hwloc::get_available_threads_per_core()
|
||||
;
|
||||
|
||||
}
|
||||
|
||||
std::cout << "OpenMP: " << num_threads << std::endl;
|
||||
|
||||
Kokkos::OpenMP::initialize( num_threads );
|
||||
|
||||
std::cout << "available threads: " << omp_get_max_threads() << std::endl;
|
||||
Kokkos::OpenMP::initialize();
|
||||
Kokkos::OpenMP::print_configuration( std::cout );
|
||||
}
|
||||
|
||||
static void TearDownTestCase()
|
||||
{
|
||||
Kokkos::OpenMP::finalize();
|
||||
|
||||
omp_set_num_threads(1);
|
||||
|
||||
ASSERT_EQ( 1 , omp_get_max_threads() );
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -564,7 +564,7 @@ namespace Impl {
|
|||
template< class D, class A1, class A2, class A3, class ... Args >
|
||||
struct DualViewSubview {
|
||||
|
||||
typedef typename Kokkos::Experimental::Impl::ViewMapping
|
||||
typedef typename Kokkos::Impl::ViewMapping
|
||||
< void
|
||||
, Kokkos::ViewTraits< D, A1, A2, A3 >
|
||||
, Args ...
|
||||
|
|
|
@ -46,19 +46,6 @@
|
|||
///
|
||||
/// This header file declares and defines Kokkos::Experimental::DynRankView and its
|
||||
/// related nonmember functions.
|
||||
/*
|
||||
* Changes from View
|
||||
* 1. The rank of the DynRankView is returned by the method rank()
|
||||
* 2. Max rank of a DynRankView is 7
|
||||
* 3. subview name is subdynrankview
|
||||
* 4. Every subdynrankview is returned with LayoutStride
|
||||
*
|
||||
* NEW: Redesigned DynRankView
|
||||
* 5. subview function name now available
|
||||
* 6. Copy and Copy-Assign View to DynRankView
|
||||
* 7. deep_copy between Views and DynRankViews
|
||||
* 8. rank( view ); returns the rank of View or DynRankView
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_DYNRANKVIEW_HPP
|
||||
#define KOKKOS_DYNRANKVIEW_HPP
|
||||
|
@ -117,6 +104,14 @@ struct DynRankDimTraits {
|
|||
, layout.dimension[7] );
|
||||
}
|
||||
|
||||
// Extra overload to match that for specialize types v2
|
||||
template <typename Layout, typename ... P>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static size_t computeRank( const Kokkos::Impl::ViewCtorProp<P...>& prop, const Layout& layout )
|
||||
{
|
||||
return computeRank(layout);
|
||||
}
|
||||
|
||||
// Create the layout for the rank-7 view.
|
||||
// Non-strided Layout
|
||||
template <typename Layout>
|
||||
|
@ -158,8 +153,17 @@ struct DynRankDimTraits {
|
|||
);
|
||||
}
|
||||
|
||||
// Extra overload to match that for specialize types
|
||||
template <typename Traits, typename ... P>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static typename std::enable_if< (std::is_same<typename Traits::array_layout , Kokkos::LayoutRight>::value || std::is_same<typename Traits::array_layout , Kokkos::LayoutLeft>::value || std::is_same<typename Traits::array_layout , Kokkos::LayoutStride>::value) , typename Traits::array_layout >::type createLayout( const ViewCtorProp<P...>& prop, const typename Traits::array_layout& layout )
|
||||
{
|
||||
return createLayout( layout );
|
||||
}
|
||||
|
||||
// Create a view from the given dimension arguments.
|
||||
// This is only necessary because the shmem constructor doesn't take a layout.
|
||||
// NDE shmem View's are not compatible with the added view_alloc value_type / fad_dim deduction functionality
|
||||
template <typename ViewType, typename ViewArg>
|
||||
static ViewType createView( const ViewArg& arg
|
||||
, const size_t N0
|
||||
|
@ -186,7 +190,8 @@ struct DynRankDimTraits {
|
|||
// Non-strided Layout
|
||||
template <typename Layout , typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutRight>::value || std::is_same<Layout , Kokkos::LayoutLeft>::value) && std::is_integral<iType>::value , Layout >::type reconstructLayout( const Layout& layout , iType dynrank )
|
||||
static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutRight>::value || std::is_same<Layout , Kokkos::LayoutLeft>::value) && std::is_integral<iType>::value , Layout >::type
|
||||
reconstructLayout( const Layout& layout , iType dynrank )
|
||||
{
|
||||
return Layout( dynrank > 0 ? layout.dimension[0] : ~size_t(0)
|
||||
, dynrank > 1 ? layout.dimension[1] : ~size_t(0)
|
||||
|
@ -202,7 +207,8 @@ struct DynRankDimTraits {
|
|||
// LayoutStride
|
||||
template <typename Layout , typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutStride>::value) && std::is_integral<iType>::value , Layout >::type reconstructLayout( const Layout& layout , iType dynrank )
|
||||
static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutStride>::value) && std::is_integral<iType>::value , Layout >::type
|
||||
reconstructLayout( const Layout& layout , iType dynrank )
|
||||
{
|
||||
return Layout( dynrank > 0 ? layout.dimension[0] : ~size_t(0)
|
||||
, dynrank > 0 ? layout.stride[0] : (0)
|
||||
|
@ -311,6 +317,11 @@ void dyn_rank_view_verify_operator_bounds
|
|||
/** \brief Assign compatible default mappings */
|
||||
struct ViewToDynRankViewTag {};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Experimental
|
||||
|
||||
namespace Impl {
|
||||
|
||||
template< class DstTraits , class SrcTraits >
|
||||
class ViewMapping< DstTraits , SrcTraits ,
|
||||
typename std::enable_if<(
|
||||
|
@ -337,7 +348,7 @@ class ViewMapping< DstTraits , SrcTraits ,
|
|||
)
|
||||
)
|
||||
)
|
||||
) , ViewToDynRankViewTag >::type >
|
||||
) , Kokkos::Experimental::Impl::ViewToDynRankViewTag >::type >
|
||||
{
|
||||
private:
|
||||
|
||||
|
@ -376,7 +387,7 @@ public:
|
|||
|
||||
typedef typename DstType::offset_type dst_offset_type ;
|
||||
dst.m_map.m_offset = dst_offset_type(std::integral_constant<unsigned,0>() , src.layout() ); //Check this for integer input1 for padding, etc
|
||||
dst.m_map.m_handle = Kokkos::Experimental::Impl::ViewDataHandle< DstTraits >::assign( src.m_map.m_handle , src.m_track );
|
||||
dst.m_map.m_handle = Kokkos::Impl::ViewDataHandle< DstTraits >::assign( src.m_map.m_handle , src.m_track );
|
||||
dst.m_track.assign( src.m_track , DstTraits::is_managed );
|
||||
dst.m_rank = src.Rank ;
|
||||
}
|
||||
|
@ -384,22 +395,20 @@ public:
|
|||
|
||||
} //end Impl
|
||||
|
||||
namespace Experimental {
|
||||
|
||||
/* \class DynRankView
|
||||
* \brief Container that creates a Kokkos view with rank determined at runtime.
|
||||
* Essentially this is a rank 7 view that wraps the access operators
|
||||
* to yield the functionality of a view
|
||||
* Essentially this is a rank 7 view
|
||||
*
|
||||
* Changes from View
|
||||
* 1. The rank of the DynRankView is returned by the method rank()
|
||||
* 2. Max rank of a DynRankView is 7
|
||||
* 3. subview name is subdynrankview
|
||||
* 4. Every subdynrankview is returned with LayoutStride
|
||||
*
|
||||
* NEW: Redesigned DynRankView
|
||||
* 5. subview function name now available
|
||||
* 6. Copy and Copy-Assign View to DynRankView
|
||||
* 7. deep_copy between Views and DynRankViews
|
||||
* 8. rank( view ); returns the rank of View or DynRankView
|
||||
* 3. subview called with 'subview(...)' or 'subdynrankview(...)' (backward compatibility)
|
||||
* 4. Every subview is returned with LayoutStride
|
||||
* 5. Copy and Copy-Assign View to DynRankView
|
||||
* 6. deep_copy between Views and DynRankViews
|
||||
* 7. rank( view ); returns the rank of View or DynRankView
|
||||
*
|
||||
*/
|
||||
|
||||
|
@ -427,7 +436,7 @@ public:
|
|||
|
||||
|
||||
private:
|
||||
typedef Kokkos::Experimental::Impl::ViewMapping< traits , void > map_type ;
|
||||
typedef Kokkos::Impl::ViewMapping< traits , void > map_type ;
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationTracker track_type ;
|
||||
|
||||
track_type m_track ;
|
||||
|
@ -556,7 +565,7 @@ public:
|
|||
// Allow specializations to query their specialized map
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const Kokkos::Experimental::Impl::ViewMapping< traits , void > &
|
||||
const Kokkos::Impl::ViewMapping< traits , void > &
|
||||
implementation_map() const { return m_map ; }
|
||||
|
||||
//----------------------------------------
|
||||
|
@ -803,7 +812,7 @@ public:
|
|||
, m_rank(rhs.m_rank)
|
||||
{
|
||||
typedef typename DynRankView<RT,RP...> ::traits SrcTraits ;
|
||||
typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
|
||||
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
|
||||
static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
|
||||
Mapping::assign( m_map , rhs.m_map , rhs.m_track );
|
||||
}
|
||||
|
@ -813,7 +822,7 @@ public:
|
|||
DynRankView & operator = (const DynRankView<RT,RP...> & rhs )
|
||||
{
|
||||
typedef typename DynRankView<RT,RP...> ::traits SrcTraits ;
|
||||
typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
|
||||
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
|
||||
static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
|
||||
Mapping::assign( m_map , rhs.m_map , rhs.m_track );
|
||||
m_track.assign( rhs.m_track , traits::is_managed );
|
||||
|
@ -831,7 +840,7 @@ public:
|
|||
, m_rank( rhs.Rank )
|
||||
{
|
||||
typedef typename View<RT,RP...>::traits SrcTraits ;
|
||||
typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag > Mapping ;
|
||||
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag > Mapping ;
|
||||
static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
|
||||
Mapping::assign( *this , rhs );
|
||||
}
|
||||
|
@ -841,7 +850,7 @@ public:
|
|||
DynRankView & operator = ( const View<RT,RP...> & rhs )
|
||||
{
|
||||
typedef typename View<RT,RP...>::traits SrcTraits ;
|
||||
typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag > Mapping ;
|
||||
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag > Mapping ;
|
||||
static_assert( Mapping::is_assignable , "Incompatible View to DynRankView copy assignment" );
|
||||
Mapping::assign( *this , rhs );
|
||||
return *this ;
|
||||
|
@ -870,7 +879,7 @@ public:
|
|||
)
|
||||
: m_track()
|
||||
, m_map()
|
||||
, m_rank( Impl::DynRankDimTraits<typename traits::specialize>::computeRank(arg_layout) )
|
||||
, m_rank( Impl::DynRankDimTraits<typename traits::specialize>::template computeRank< typename traits::array_layout, P...>(arg_prop, arg_layout) )
|
||||
{
|
||||
// Append layout and spaces if not input
|
||||
typedef Impl::ViewCtorProp< P ... > alloc_prop_input ;
|
||||
|
@ -923,7 +932,7 @@ public:
|
|||
//------------------------------------------------------------
|
||||
|
||||
Kokkos::Experimental::Impl::SharedAllocationRecord<> *
|
||||
record = m_map.allocate_shared( prop , Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout) );
|
||||
record = m_map.allocate_shared( prop , Impl::DynRankDimTraits<typename traits::specialize>::template createLayout<traits, P...>(arg_prop, arg_layout) );
|
||||
|
||||
//------------------------------------------------------------
|
||||
#if defined( KOKKOS_ENABLE_CUDA )
|
||||
|
@ -947,8 +956,8 @@ public:
|
|||
>::type const & arg_layout
|
||||
)
|
||||
: m_track() // No memory tracking
|
||||
, m_map( arg_prop , Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout) )
|
||||
, m_rank( Impl::DynRankDimTraits<typename traits::specialize>::computeRank(arg_layout) )
|
||||
, m_map( arg_prop , Impl::DynRankDimTraits<typename traits::specialize>::template createLayout<traits, P...>(arg_prop, arg_layout) )
|
||||
, m_rank( Impl::DynRankDimTraits<typename traits::specialize>::template computeRank< typename traits::array_layout, P...>(arg_prop, arg_layout) )
|
||||
{
|
||||
static_assert(
|
||||
std::is_same< pointer_type
|
||||
|
@ -1034,6 +1043,7 @@ public:
|
|||
{}
|
||||
|
||||
// For backward compatibility
|
||||
// NDE This ctor does not take ViewCtorProp argument - should not use alternative createLayout call
|
||||
explicit inline
|
||||
DynRankView( const ViewAllocateWithoutInitializing & arg_prop
|
||||
, const typename traits::array_layout & arg_layout
|
||||
|
@ -1179,6 +1189,11 @@ namespace Impl {
|
|||
|
||||
struct DynRankSubviewTag {};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Experimental
|
||||
|
||||
namespace Impl {
|
||||
|
||||
template< class SrcTraits , class ... Args >
|
||||
struct ViewMapping
|
||||
< typename std::enable_if<(
|
||||
|
@ -1192,7 +1207,7 @@ struct ViewMapping
|
|||
std::is_same< typename SrcTraits::array_layout
|
||||
, Kokkos::LayoutStride >::value
|
||||
)
|
||||
), DynRankSubviewTag >::type
|
||||
), Kokkos::Experimental::Impl::DynRankSubviewTag >::type
|
||||
, SrcTraits
|
||||
, Args ... >
|
||||
{
|
||||
|
@ -1264,7 +1279,7 @@ public:
|
|||
};
|
||||
|
||||
|
||||
typedef DynRankView< value_type , array_layout , typename SrcTraits::device_type , typename SrcTraits::memory_traits > ret_type;
|
||||
typedef Kokkos::Experimental::DynRankView< value_type , array_layout , typename SrcTraits::device_type , typename SrcTraits::memory_traits > ret_type;
|
||||
|
||||
template < typename T , class ... P >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
|
@ -1336,9 +1351,10 @@ public:
|
|||
|
||||
} // end Impl
|
||||
|
||||
namespace Experimental {
|
||||
|
||||
template< class V , class ... Args >
|
||||
using Subdynrankview = typename Kokkos::Experimental::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , V , Args... >::ret_type ;
|
||||
using Subdynrankview = typename Kokkos::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , V , Args... >::ret_type ;
|
||||
|
||||
template< class D , class ... P , class ...Args >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
|
@ -1348,7 +1364,7 @@ subdynrankview( const Kokkos::Experimental::DynRankView< D , P... > &src , Args.
|
|||
if ( src.rank() > sizeof...(Args) ) //allow sizeof...(Args) >= src.rank(), ignore the remaining args
|
||||
{ Kokkos::abort("subdynrankview: num of args must be >= rank of the source DynRankView"); }
|
||||
|
||||
typedef Kokkos::Experimental::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , Kokkos::ViewTraits< D*******, P... > , Args... > metafcn ;
|
||||
typedef Kokkos::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , Kokkos::ViewTraits< D*******, P... > , Args... > metafcn ;
|
||||
|
||||
return metafcn::subview( src.rank() , src , args... );
|
||||
}
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
|
@ -36,7 +36,7 @@
|
|||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
@ -57,7 +57,7 @@ namespace Experimental {
|
|||
*/
|
||||
template< typename DataType , typename ... P >
|
||||
class DynamicView : public Kokkos::ViewTraits< DataType , P ... >
|
||||
{
|
||||
{
|
||||
public:
|
||||
|
||||
typedef Kokkos::ViewTraits< DataType , P ... > traits ;
|
||||
|
@ -68,7 +68,7 @@ private:
|
|||
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationTracker track_type ;
|
||||
|
||||
static_assert( traits::rank == 1 && traits::rank_dynamic == 1
|
||||
static_assert( traits::rank == 1 && traits::rank_dynamic == 1
|
||||
, "DynamicView must be rank-one" );
|
||||
|
||||
static_assert( std::is_trivial< typename traits::value_type >::value &&
|
||||
|
@ -216,14 +216,14 @@ public:
|
|||
// Verify that allocation of the requested chunk is in progress.
|
||||
|
||||
// The allocated chunk counter is m_chunks[ m_chunk_max ]
|
||||
const uintptr_t n =
|
||||
const uintptr_t n =
|
||||
*reinterpret_cast<uintptr_t volatile *>( m_chunks + m_chunk_max );
|
||||
|
||||
if ( n <= ic ) {
|
||||
Kokkos::abort("Kokkos::DynamicView array bounds error");
|
||||
}
|
||||
|
||||
// Allocation of this chunk is in progress
|
||||
// Allocation of this chunk is in progress
|
||||
// so wait for allocation to complete.
|
||||
while ( 0 == *ch );
|
||||
}
|
||||
|
@ -267,7 +267,7 @@ public:
|
|||
const uintptr_t jc_try = jc ;
|
||||
|
||||
// Jump iteration to the chunk counter.
|
||||
|
||||
|
||||
jc = atomic_compare_exchange( pc , jc_try , jc_try + 1 );
|
||||
|
||||
if ( jc_try == jc ) {
|
||||
|
@ -316,7 +316,7 @@ public:
|
|||
}
|
||||
else {
|
||||
while ( NC + 1 <= *pc ) {
|
||||
--*pc ;
|
||||
--*pc ;
|
||||
m_pool.deallocate( m_chunks[*pc]
|
||||
, sizeof(value_type) << m_chunk_shift );
|
||||
m_chunks[*pc] = 0 ;
|
||||
|
@ -331,7 +331,7 @@ public:
|
|||
typename traits::value_type ** m_chunks ;
|
||||
uintptr_t * m_pc ;
|
||||
uintptr_t m_nc ;
|
||||
unsigned m_chunk_shift ;
|
||||
unsigned m_chunk_shift ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()( int ) const
|
||||
|
@ -348,7 +348,7 @@ public:
|
|||
}
|
||||
else {
|
||||
while ( m_nc + 1 <= *m_pc ) {
|
||||
--*m_pc ;
|
||||
--*m_pc ;
|
||||
m_pool.deallocate( m_chunks[*m_pc]
|
||||
, sizeof(value_type) << m_chunk_shift );
|
||||
m_chunks[*m_pc] = 0 ;
|
||||
|
@ -482,7 +482,7 @@ public:
|
|||
};
|
||||
|
||||
|
||||
/**\brief Allocation constructor
|
||||
/**\brief Allocation constructor
|
||||
*
|
||||
* Memory is allocated in chunks from the memory pool.
|
||||
* The chunk size conforms to the memory pool's chunk size.
|
||||
|
@ -557,7 +557,7 @@ void deep_copy( const View<T,DP...> & dst
|
|||
|
||||
if ( DstExecCanAccessSrc ) {
|
||||
// Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
|
||||
Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type >( dst , src );
|
||||
Kokkos::Impl::ViewRemap< dst_type , src_type >( dst , src );
|
||||
}
|
||||
else {
|
||||
Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");
|
||||
|
@ -581,7 +581,7 @@ void deep_copy( const DynamicView<T,DP...> & dst
|
|||
|
||||
if ( DstExecCanAccessSrc ) {
|
||||
// Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
|
||||
Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type >( dst , src );
|
||||
Kokkos::Impl::ViewRemap< dst_type , src_type >( dst , src );
|
||||
}
|
||||
else {
|
||||
Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");
|
||||
|
|
|
@ -69,6 +69,8 @@
|
|||
#include <Kokkos_ErrorReporter.hpp>
|
||||
#include <TestErrorReporter.hpp>
|
||||
|
||||
#include <TestViewCtorPropEmbeddedDim.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
@ -94,6 +96,10 @@ TEST_F( cuda , dyn_view_api) {
|
|||
TestDynViewAPI< double , Kokkos::Cuda >();
|
||||
}
|
||||
|
||||
// Invokes TestViewCtorProp_EmbeddedDim::test_vcpt with N0 = 2 and N1 = 3,
// using Kokkos::Cuda as the ExecSpace template argument.
TEST_F( cuda, viewctorprop_embedded_dim ) {
|
||||
TestViewCtorProp_EmbeddedDim< Kokkos::Cuda >::test_vcpt( 2, 3 );
|
||||
}
|
||||
|
||||
TEST_F( cuda , staticcrsgraph )
|
||||
{
|
||||
TestStaticCrsGraph::run_test_graph< Kokkos::Cuda >();
|
||||
|
|
|
@ -66,6 +66,8 @@
|
|||
#include <Kokkos_ErrorReporter.hpp>
|
||||
#include <TestErrorReporter.hpp>
|
||||
|
||||
#include <TestViewCtorPropEmbeddedDim.hpp>
|
||||
|
||||
#include <iomanip>
|
||||
|
||||
namespace Test {
|
||||
|
@ -76,14 +78,7 @@ protected:
|
|||
{
|
||||
std::cout << std::setprecision(5) << std::scientific;
|
||||
|
||||
unsigned threads_count = 4 ;
|
||||
|
||||
if ( Kokkos::hwloc::available() ) {
|
||||
threads_count = Kokkos::hwloc::get_available_numa_count() *
|
||||
Kokkos::hwloc::get_available_cores_per_numa();
|
||||
}
|
||||
|
||||
Kokkos::OpenMP::initialize( threads_count );
|
||||
Kokkos::OpenMP::initialize();
|
||||
}
|
||||
|
||||
static void TearDownTestCase()
|
||||
|
@ -96,6 +91,10 @@ TEST_F( openmp, dyn_view_api) {
|
|||
TestDynViewAPI< double , Kokkos::OpenMP >();
|
||||
}
|
||||
|
||||
// Invokes TestViewCtorProp_EmbeddedDim::test_vcpt with N0 = 2 and N1 = 3,
// using Kokkos::OpenMP as the ExecSpace template argument.
TEST_F( openmp, viewctorprop_embedded_dim ) {
|
||||
TestViewCtorProp_EmbeddedDim< Kokkos::OpenMP >::test_vcpt( 2, 3 );
|
||||
}
|
||||
|
||||
TEST_F( openmp, bitset )
|
||||
{
|
||||
test_bitset<Kokkos::OpenMP>();
|
||||
|
|
|
@ -67,6 +67,8 @@
|
|||
#include <Kokkos_ErrorReporter.hpp>
|
||||
#include <TestErrorReporter.hpp>
|
||||
|
||||
#include <TestViewCtorPropEmbeddedDim.hpp>
|
||||
|
||||
namespace Test {
|
||||
|
||||
class serial : public ::testing::Test {
|
||||
|
@ -85,6 +87,10 @@ TEST_F( serial, dyn_view_api) {
|
|||
TestDynViewAPI< double , Kokkos::Serial >();
|
||||
}
|
||||
|
||||
// Invokes TestViewCtorProp_EmbeddedDim::test_vcpt with N0 = 2 and N1 = 3,
// using Kokkos::Serial as the ExecSpace template argument.
TEST_F( serial, viewctorprop_embedded_dim ) {
|
||||
TestViewCtorProp_EmbeddedDim< Kokkos::Serial >::test_vcpt( 2, 3 );
|
||||
}
|
||||
|
||||
TEST_F( serial , staticcrsgraph )
|
||||
{
|
||||
TestStaticCrsGraph::run_test_graph< Kokkos::Serial >();
|
||||
|
|
|
@ -70,6 +70,8 @@
|
|||
#include <Kokkos_ErrorReporter.hpp>
|
||||
#include <TestErrorReporter.hpp>
|
||||
|
||||
#include <TestViewCtorPropEmbeddedDim.hpp>
|
||||
|
||||
namespace Test {
|
||||
|
||||
class threads : public ::testing::Test {
|
||||
|
@ -103,6 +105,10 @@ TEST_F( threads , dyn_view_api) {
|
|||
TestDynViewAPI< double , Kokkos::Threads >();
|
||||
}
|
||||
|
||||
// Invokes TestViewCtorProp_EmbeddedDim::test_vcpt with N0 = 2 and N1 = 3,
// using Kokkos::Threads as the ExecSpace template argument.
TEST_F( threads, viewctorprop_embedded_dim ) {
|
||||
TestViewCtorProp_EmbeddedDim< Kokkos::Threads >::test_vcpt( 2, 3 );
|
||||
}
|
||||
|
||||
TEST_F( threads , staticcrsgraph )
|
||||
{
|
||||
TestStaticCrsGraph::run_test_graph< Kokkos::Threads >();
|
||||
|
|
|
@ -0,0 +1,213 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <cstdio>
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <Kokkos_DynRankView.hpp>
|
||||
|
||||
#include <type_traits>
|
||||
#include <typeinfo>
|
||||
|
||||
namespace Test {
|
||||
|
||||
namespace {
|
||||
|
||||
template <typename ExecSpace >
|
||||
struct TestViewCtorProp_EmbeddedDim {
|
||||
|
||||
using ViewIntType = typename Kokkos::View< int**, ExecSpace >;
|
||||
using ViewDoubleType = typename Kokkos::View< double*, ExecSpace >;
|
||||
|
||||
using DynRankViewIntType = typename Kokkos::DynRankView< int, ExecSpace >;
|
||||
using DynRankViewDoubleType = typename Kokkos::DynRankView< double, ExecSpace >;
|
||||
|
||||
// Cuda 7.0 has issues with using a lamda in parallel_for to initialize the view - replace with this functor
|
||||
template < class ViewType >
|
||||
struct Functor {
|
||||
|
||||
ViewType v;
|
||||
|
||||
Functor( const ViewType & v_ ) : v(v_) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()( const int i ) const {
|
||||
v(i) = i;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
||||
static void test_vcpt( const int N0, const int N1 )
|
||||
{
|
||||
|
||||
// Create two views to test
|
||||
{
|
||||
using VIT = typename TestViewCtorProp_EmbeddedDim::ViewIntType ;
|
||||
using VDT = typename TestViewCtorProp_EmbeddedDim::ViewDoubleType ;
|
||||
|
||||
VIT vi1("vi1", N0, N1);
|
||||
VDT vd1("vd1", N0);
|
||||
|
||||
// TEST: Test for common type between two views, one with type double, other with type int
|
||||
// Deduce common value_type and construct a view with that type
|
||||
{
|
||||
// Two views
|
||||
auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1, vd1);
|
||||
typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType;
|
||||
typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT;
|
||||
typedef typename CVT::HostMirror HostCVT;
|
||||
|
||||
// Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg
|
||||
CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
|
||||
|
||||
Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1),
|
||||
Functor<CVT>(cv1)
|
||||
);
|
||||
|
||||
HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
|
||||
Kokkos::deep_copy( hcv1, cv1 );
|
||||
|
||||
ASSERT_EQ( (std::is_same< CommonViewValueType, double >::value) , true ) ;
|
||||
#if 0
|
||||
// debug output
|
||||
for ( int i = 0; i < N0*N1; ++i ) {
|
||||
printf(" Output check: hcv1(%d) = %lf\n ", i, hcv1(i) );
|
||||
}
|
||||
|
||||
printf( " Common value type view: %s \n", typeid( CVT() ).name() );
|
||||
printf( " Common value type: %s \n", typeid( CommonViewValueType() ).name() );
|
||||
if ( std::is_same< CommonViewValueType, double >::value == true ) {
|
||||
printf("Proper common value_type\n");
|
||||
}
|
||||
else {
|
||||
printf("WRONG common value_type\n");
|
||||
}
|
||||
// end debug output
|
||||
#endif
|
||||
}
|
||||
|
||||
{
|
||||
// Single view
|
||||
auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1);
|
||||
typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType;
|
||||
typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT;
|
||||
typedef typename CVT::HostMirror HostCVT;
|
||||
|
||||
// Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg
|
||||
CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
|
||||
|
||||
Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1),
|
||||
Functor<CVT>(cv1)
|
||||
);
|
||||
|
||||
HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
|
||||
Kokkos::deep_copy( hcv1, cv1 );
|
||||
|
||||
ASSERT_EQ( (std::is_same< CommonViewValueType, int>::value) , true ) ;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Create two dynamic rank views to test
|
||||
{
|
||||
using VIT = typename TestViewCtorProp_EmbeddedDim::DynRankViewIntType ;
|
||||
using VDT = typename TestViewCtorProp_EmbeddedDim::DynRankViewDoubleType ;
|
||||
|
||||
VIT vi1("vi1", N0, N1);
|
||||
VDT vd1("vd1", N0);
|
||||
|
||||
// TEST: Test for common type between two views, one with type double, other with type int
|
||||
// Deduce common value_type and construct a view with that type
|
||||
{
|
||||
// Two views
|
||||
auto view_alloc_arg = Kokkos::common_view_alloc_prop( vi1, vd1 );
|
||||
typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType;
|
||||
typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT;
|
||||
typedef typename CVT::HostMirror HostCVT;
|
||||
|
||||
// Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg
|
||||
CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
|
||||
|
||||
|
||||
Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1),
|
||||
Functor<CVT>(cv1)
|
||||
);
|
||||
|
||||
HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
|
||||
Kokkos::deep_copy( hcv1, cv1 );
|
||||
|
||||
ASSERT_EQ( (std::is_same< CommonViewValueType, double >::value) , true ) ;
|
||||
}
|
||||
|
||||
{
|
||||
// Single views
|
||||
auto view_alloc_arg = Kokkos::common_view_alloc_prop( vi1 );
|
||||
typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType;
|
||||
typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT;
|
||||
typedef typename CVT::HostMirror HostCVT;
|
||||
|
||||
// Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg
|
||||
CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
|
||||
|
||||
Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1),
|
||||
Functor<CVT>(cv1)
|
||||
);
|
||||
|
||||
HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
|
||||
Kokkos::deep_copy( hcv1, cv1 );
|
||||
|
||||
ASSERT_EQ( (std::is_same< CommonViewValueType, int>::value) , true ) ;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} // end test_vcpt
|
||||
|
||||
}; // end struct
|
||||
|
||||
} // namespace
|
||||
|
||||
} // namespace Test
|
|
@ -1,13 +1,13 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
|
@ -36,12 +36,14 @@
|
|||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <cstdlib>
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
::testing::InitGoogleTest(&argc,argv);
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue