Merge branch 'master' into simplify_build_choices

# Conflicts:
#	cmake/Modules/Packages/GPU.cmake
This commit is contained in:
Axel Kohlmeyer 2020-04-11 21:31:07 -04:00
commit 5542ad31dc
No known key found for this signature in database
GPG Key ID: D9B44E93BF0C375A
105 changed files with 2800 additions and 538 deletions

View File

@ -704,6 +704,14 @@ if(PKG_GPU)
message(STATUS "GPU architecture: ${GPU_ARCH}")
elseif(GPU_API STREQUAL "OPENCL")
message(STATUS "OpenCL tuning: ${OCL_TUNE}")
elseif(GPU_API STREQUAL "HIP")
message(STATUS "HIP platform: ${HIP_PLATFORM}")
message(STATUS "HIP architecture: ${HIP_ARCH}")
if(HIP_USE_DEVICE_SORT)
message(STATUS "HIP GPU sorting: on")
else()
message(STATUS "HIP GPU sorting: off")
endif()
endif()
message(STATUS "GPU precision: ${GPU_PREC}")
endif()

View File

@ -0,0 +1,16 @@
# - Find CUB
# Find the CUB header-only library (https://github.com/NVlabs/cub)
#
# Output variables:
#  CUB_INCLUDE_DIR  - cache variable with the detected include directory
#  CUB_INCLUDE_DIRS - where to find cub/cub.cuh (set from CUB_INCLUDE_DIR)
#  CUB_FOUND        - True if CUB found.
#
find_path(CUB_INCLUDE_DIR cub.cuh PATH_SUFFIXES cub)
include(FindPackageHandleStandardArgs)
# handle the QUIETLY and REQUIRED arguments and set CUB_FOUND to TRUE
# if all listed variables are TRUE
find_package_handle_standard_args(CUB DEFAULT_MSG CUB_INCLUDE_DIR)
# export the plural-form variable advertised above; previously it was
# documented but never set, so consumers using CUB_INCLUDE_DIRS got nothing
if(CUB_FOUND)
  set(CUB_INCLUDE_DIRS ${CUB_INCLUDE_DIR})
endif()
mark_as_advanced(CUB_INCLUDE_DIR)

View File

@ -0,0 +1,3 @@
# utility script to call GenerateBinaryHeader function
# Invoked in CMake script mode (cmake -P) from an add_custom_command; the
# caller must pass -D SOURCE_DIR=..., -D VARNAME=..., -D HEADER_FILE=...
# and -D SOURCE_FILES=... on the command line before -P.
include(${SOURCE_DIR}/Modules/LAMMPSUtils.cmake)
GenerateBinaryHeader(${VARNAME} ${HEADER_FILE} ${SOURCE_FILES})

View File

@ -69,3 +69,19 @@ macro(pkg_depends PKG1 PKG2)
message(FATAL_ERROR "${PKG1} package needs LAMMPS to be build with ${PKG2}")
endif()
endmacro()
# CMake-only replacement for bin2c and xxd
# Writes a C header declaring `const unsigned char <varname>[]` holding the
# raw bytes of the input file(s), plus `<varname>_size`.
#
# varname - base name for the generated C symbols
# outfile - path of the header file to (over)write
# files   - first input file; any further files are picked up via ARGV
#
# NOTE(review): each iteration emits a symbol named ${varname}, so passing
# more than one input file would produce duplicate definitions — callers
# currently pass exactly one file per invocation.
function(GenerateBinaryHeader varname outfile files)
  message("Creating ${outfile}...")
  file(WRITE ${outfile} "// CMake generated file\n")
  # iterate over all trailing arguments (indices 2..ARGC-1)
  math(EXPR ARG_END "${ARGC}-1")
  foreach(IDX RANGE 2 ${ARG_END})
    list(GET ARGV ${IDX} filename)
    # read the file as a hex string, then turn each byte into "0xNN,"
    file(READ ${filename} content HEX)
    string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," content "${content}")
    # strip the trailing comma so the initializer list is well-formed
    string(REGEX REPLACE ",$" "" content "${content}")
    file(APPEND ${outfile} "const unsigned char ${varname}[] = { ${content} };\n")
    file(APPEND ${outfile} "const unsigned int ${varname}_size = sizeof(${varname});\n")
  endforeach()
endfunction()

View File

@ -1,209 +1,363 @@
set(GPU_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/GPU)
set(GPU_SOURCES ${GPU_SOURCES_DIR}/gpu_extra.h
${GPU_SOURCES_DIR}/fix_gpu.h
${GPU_SOURCES_DIR}/fix_gpu.cpp)
set(GPU_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/GPU)
set(GPU_SOURCES ${GPU_SOURCES_DIR}/gpu_extra.h
${GPU_SOURCES_DIR}/fix_gpu.h
${GPU_SOURCES_DIR}/fix_gpu.cpp)
set(GPU_API "opencl" CACHE STRING "API used by GPU package")
set(GPU_API_VALUES opencl cuda)
set_property(CACHE GPU_API PROPERTY STRINGS ${GPU_API_VALUES})
validate_option(GPU_API GPU_API_VALUES)
string(TOUPPER ${GPU_API} GPU_API)
set(GPU_API "opencl" CACHE STRING "API used by GPU package")
set(GPU_API_VALUES opencl cuda hip)
set_property(CACHE GPU_API PROPERTY STRINGS ${GPU_API_VALUES})
validate_option(GPU_API GPU_API_VALUES)
string(TOUPPER ${GPU_API} GPU_API)
set(GPU_PREC "mixed" CACHE STRING "LAMMPS GPU precision")
set(GPU_PREC_VALUES double mixed single)
set_property(CACHE GPU_PREC PROPERTY STRINGS ${GPU_PREC_VALUES})
validate_option(GPU_PREC GPU_PREC_VALUES)
string(TOUPPER ${GPU_PREC} GPU_PREC)
set(GPU_PREC "mixed" CACHE STRING "LAMMPS GPU precision")
set(GPU_PREC_VALUES double mixed single)
set_property(CACHE GPU_PREC PROPERTY STRINGS ${GPU_PREC_VALUES})
validate_option(GPU_PREC GPU_PREC_VALUES)
string(TOUPPER ${GPU_PREC} GPU_PREC)
if(GPU_PREC STREQUAL "DOUBLE")
set(GPU_PREC_SETTING "DOUBLE_DOUBLE")
elseif(GPU_PREC STREQUAL "MIXED")
set(GPU_PREC_SETTING "SINGLE_DOUBLE")
elseif(GPU_PREC STREQUAL "SINGLE")
set(GPU_PREC_SETTING "SINGLE_SINGLE")
if(GPU_PREC STREQUAL "DOUBLE")
set(GPU_PREC_SETTING "DOUBLE_DOUBLE")
elseif(GPU_PREC STREQUAL "MIXED")
set(GPU_PREC_SETTING "SINGLE_DOUBLE")
elseif(GPU_PREC STREQUAL "SINGLE")
set(GPU_PREC_SETTING "SINGLE_SINGLE")
endif()
file(GLOB GPU_LIB_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/gpu/[^.]*.cpp)
file(MAKE_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/gpu)
if(GPU_API STREQUAL "CUDA")
find_package(CUDA REQUIRED)
find_program(BIN2C bin2c)
if(NOT BIN2C)
message(FATAL_ERROR "Could not find bin2c, use -DBIN2C=/path/to/bin2c to help cmake finding it.")
endif()
option(CUDPP_OPT "Enable CUDPP_OPT" ON)
option(CUDA_MPS_SUPPORT "Enable tweaks to support CUDA Multi-process service (MPS)" OFF)
if(CUDA_MPS_SUPPORT)
set(GPU_CUDA_MPS_FLAGS "-DCUDA_PROXY")
endif()
file(GLOB GPU_LIB_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/gpu/[^.]*.cpp)
file(MAKE_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/gpu)
set(GPU_ARCH "sm_50" CACHE STRING "LAMMPS GPU CUDA SM primary architecture (e.g. sm_60)")
if(GPU_API STREQUAL "CUDA")
file(GLOB GPU_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/[^.]*.cu ${CMAKE_CURRENT_SOURCE_DIR}/gpu/[^.]*.cu)
list(REMOVE_ITEM GPU_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_pppm.cu)
cuda_include_directories(${LAMMPS_LIB_SOURCE_DIR}/gpu ${LAMMPS_LIB_BINARY_DIR}/gpu)
if(CUDPP_OPT)
cuda_include_directories(${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini)
file(GLOB GPU_LIB_CUDPP_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini/[^.]*.cpp)
file(GLOB GPU_LIB_CUDPP_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini/[^.]*.cu)
endif()
# build arch/gencode commands for nvcc based on CUDA toolkit version and use choice
# --arch translates directly instead of JIT, so this should be for the preferred or most common architecture
set(GPU_CUDA_GENCODE "-arch=${GPU_ARCH} ")
# Fermi (GPU Arch 2.x) is supported by CUDA 3.2 to CUDA 8.0
if((CUDA_VERSION VERSION_GREATER "3.1") AND (CUDA_VERSION VERSION_LESS "9.0"))
string(APPEND GPU_CUDA_GENCODE "-gencode arch=compute_20,code=[sm_20,compute_20] ")
endif()
# Kepler (GPU Arch 3.x) is supported by CUDA 5 and later
if(CUDA_VERSION VERSION_GREATER "4.9")
string(APPEND GPU_CUDA_GENCODE "-gencode arch=compute_30,code=[sm_30,compute_30] -gencode arch=compute_35,code=[sm_35,compute_35] ")
endif()
# Maxwell (GPU Arch 5.x) is supported by CUDA 6 and later
if(CUDA_VERSION VERSION_GREATER "5.9")
string(APPEND GPU_CUDA_GENCODE "-gencode arch=compute_50,code=[sm_50,compute_50] -gencode arch=compute_52,code=[sm_52,compute_52] ")
endif()
# Pascal (GPU Arch 6.x) is supported by CUDA 8 and later
if(CUDA_VERSION VERSION_GREATER "7.9")
string(APPEND GPU_CUDA_GENCODE "-gencode arch=compute_60,code=[sm_60,compute_60] -gencode arch=compute_61,code=[sm_61,compute_61] ")
endif()
# Volta (GPU Arch 7.0) is supported by CUDA 9 and later
if(CUDA_VERSION VERSION_GREATER "8.9")
string(APPEND GPU_CUDA_GENCODE "-gencode arch=compute_70,code=[sm_70,compute_70] ")
endif()
# Turing (GPU Arch 7.5) is supported by CUDA 10 and later
if(CUDA_VERSION VERSION_GREATER "9.9")
string(APPEND GPU_CUDA_GENCODE "-gencode arch=compute_75,code=[sm_75,compute_75] ")
endif()
cuda_compile_fatbin(GPU_GEN_OBJS ${GPU_LIB_CU} OPTIONS
-DUNIX -O3 --use_fast_math -Wno-deprecated-gpu-targets -DNV_KERNEL -DUCL_CUDADR ${GPU_CUDA_GENCODE} -D_${GPU_PREC_SETTING})
cuda_compile(GPU_OBJS ${GPU_LIB_CUDPP_CU} OPTIONS ${CUDA_REQUEST_PIC}
-DUNIX -O3 --use_fast_math -Wno-deprecated-gpu-targets -DUCL_CUDADR ${GPU_CUDA_GENCODE} -D_${GPU_PREC_SETTING})
foreach(CU_OBJ ${GPU_GEN_OBJS})
get_filename_component(CU_NAME ${CU_OBJ} NAME_WE)
string(REGEX REPLACE "^.*_lal_" "" CU_NAME "${CU_NAME}")
add_custom_command(OUTPUT ${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}_cubin.h
COMMAND ${BIN2C} -c -n ${CU_NAME} ${CU_OBJ} > ${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}_cubin.h
DEPENDS ${CU_OBJ}
COMMENT "Generating ${CU_NAME}_cubin.h")
list(APPEND GPU_LIB_SOURCES ${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}_cubin.h)
endforeach()
set_directory_properties(PROPERTIES ADDITIONAL_MAKE_CLEAN_FILES "${LAMMPS_LIB_BINARY_DIR}/gpu/*_cubin.h")
add_library(gpu STATIC ${GPU_LIB_SOURCES} ${GPU_LIB_CUDPP_SOURCES} ${GPU_OBJS})
target_link_libraries(gpu PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY})
target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu ${CUDA_INCLUDE_DIRS})
target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT ${GPU_CUDA_MPS_FLAGS})
if(CUDPP_OPT)
target_include_directories(gpu PRIVATE ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini)
target_compile_definitions(gpu PRIVATE -DUSE_CUDPP)
endif()
target_link_libraries(lammps PRIVATE gpu)
add_executable(nvc_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp)
target_compile_definitions(nvc_get_devices PRIVATE -DUCL_CUDADR)
target_link_libraries(nvc_get_devices PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY})
target_include_directories(nvc_get_devices PRIVATE ${CUDA_INCLUDE_DIRS})
elseif(GPU_API STREQUAL "OPENCL")
if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
# download and unpack support binaries for compilation of windows binaries.
set(LAMMPS_THIRDPARTY_URL "http://download.lammps.org/thirdparty")
file(DOWNLOAD "${LAMMPS_THIRDPARTY_URL}/opencl-win-devel.tar.gz" "${CMAKE_CURRENT_BINARY_DIR}/opencl-win-devel.tar.gz"
EXPECTED_MD5 2c00364888d5671195598b44c2e0d44d)
execute_process(COMMAND ${CMAKE_COMMAND} -E tar xzf opencl-win-devel.tar.gz WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
add_library(OpenCL::OpenCL UNKNOWN IMPORTED)
if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86")
set_target_properties(OpenCL::OpenCL PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/OpenCL/lib_win32/libOpenCL.dll")
elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64")
set_target_properties(OpenCL::OpenCL PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/OpenCL/lib_win64/libOpenCL.dll")
endif()
set_target_properties(OpenCL::OpenCL PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${CMAKE_CURRENT_BINARY_DIR}/OpenCL/include")
else()
find_package(OpenCL REQUIRED)
endif()
set(OCL_TUNE "generic" CACHE STRING "OpenCL Device Tuning")
set(OCL_TUNE_VALUES intel fermi kepler cypress generic)
set_property(CACHE OCL_TUNE PROPERTY STRINGS ${OCL_TUNE_VALUES})
validate_option(OCL_TUNE OCL_TUNE_VALUES)
string(TOUPPER ${OCL_TUNE} OCL_TUNE)
include(OpenCLUtils)
set(OCL_COMMON_HEADERS ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_preprocessor.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_aux_fun1.h)
file(GLOB GPU_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/[^.]*.cu)
list(REMOVE_ITEM GPU_LIB_CU
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_gayberne.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_gayberne_lj.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_re_squared.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_re_squared_lj.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod.cu
)
foreach(GPU_KERNEL ${GPU_LIB_CU})
get_filename_component(basename ${GPU_KERNEL} NAME_WE)
string(SUBSTRING ${basename} 4 -1 KERNEL_NAME)
GenerateOpenCLHeader(${KERNEL_NAME} ${CMAKE_CURRENT_BINARY_DIR}/gpu/${KERNEL_NAME}_cl.h ${OCL_COMMON_HEADERS} ${GPU_KERNEL})
list(APPEND GPU_LIB_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/gpu/${KERNEL_NAME}_cl.h)
endforeach()
GenerateOpenCLHeader(gayberne ${CMAKE_CURRENT_BINARY_DIR}/gpu/gayberne_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_ellipsoid_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_gayberne.cu)
GenerateOpenCLHeader(gayberne_lj ${CMAKE_CURRENT_BINARY_DIR}/gpu/gayberne_lj_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_ellipsoid_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_gayberne_lj.cu)
GenerateOpenCLHeader(re_squared ${CMAKE_CURRENT_BINARY_DIR}/gpu/re_squared_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_ellipsoid_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_re_squared.cu)
GenerateOpenCLHeader(re_squared_lj ${CMAKE_CURRENT_BINARY_DIR}/gpu/re_squared_lj_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_ellipsoid_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_re_squared_lj.cu)
GenerateOpenCLHeader(tersoff ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff.cu)
GenerateOpenCLHeader(tersoff_zbl ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_zbl_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl.cu)
GenerateOpenCLHeader(tersoff_mod ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_mod_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod.cu)
list(APPEND GPU_LIB_SOURCES
${CMAKE_CURRENT_BINARY_DIR}/gpu/gayberne_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/gayberne_lj_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/re_squared_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/re_squared_lj_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_zbl_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_mod_cl.h
)
add_library(gpu STATIC ${GPU_LIB_SOURCES})
target_link_libraries(gpu PRIVATE OpenCL::OpenCL)
target_include_directories(gpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/gpu)
target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -D${OCL_TUNE}_OCL -DMPI_GERYON -DUCL_NO_EXIT)
target_compile_definitions(gpu PRIVATE -DUSE_OPENCL)
target_link_libraries(lammps PRIVATE gpu)
add_executable(ocl_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp)
target_compile_definitions(ocl_get_devices PRIVATE -DUCL_OPENCL)
target_link_libraries(ocl_get_devices PRIVATE OpenCL::OpenCL)
elseif(GPU_API STREQUAL "HIP")
if(NOT DEFINED HIP_PATH)
if(NOT DEFINED ENV{HIP_PATH})
set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed")
else()
set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed")
endif()
endif()
set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
find_package(HIP REQUIRED)
option(HIP_USE_DEVICE_SORT "Use GPU sorting" ON)
if(NOT DEFINED HIP_PLATFORM)
if(NOT DEFINED ENV{HIP_PLATFORM})
set(HIP_PLATFORM "hcc" CACHE PATH "HIP Platform to be used during compilation")
else()
set(HIP_PLATFORM $ENV{HIP_PLATFORM} CACHE PATH "HIP Platform used during compilation")
endif()
endif()
set(ENV{HIP_PLATFORM} ${HIP_PLATFORM})
if(HIP_PLATFORM STREQUAL "hcc")
set(HIP_ARCH "gfx906" CACHE STRING "HIP target architecture")
elseif(HIP_PLATFORM STREQUAL "nvcc")
find_package(CUDA REQUIRED)
find_program(BIN2C bin2c)
if(NOT BIN2C)
message(FATAL_ERROR "Could not find bin2c, use -DBIN2C=/path/to/bin2c to help cmake finding it.")
endif()
option(CUDPP_OPT "Enable CUDPP_OPT" ON)
option(CUDA_MPS_SUPPORT "Enable tweaks to support CUDA Multi-process service (MPS)" OFF)
if(CUDA_MPS_SUPPORT)
set(GPU_CUDA_MPS_FLAGS "-DCUDA_PROXY")
endif()
set(GPU_ARCH "sm_30" CACHE STRING "LAMMPS GPU CUDA SM primary architecture (e.g. sm_60)")
file(GLOB GPU_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/[^.]*.cu ${CMAKE_CURRENT_SOURCE_DIR}/gpu/[^.]*.cu)
list(REMOVE_ITEM GPU_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_pppm.cu)
cuda_include_directories(${LAMMPS_LIB_SOURCE_DIR}/gpu ${LAMMPS_LIB_BINARY_DIR}/gpu)
if(CUDPP_OPT)
cuda_include_directories(${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini)
file(GLOB GPU_LIB_CUDPP_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini/[^.]*.cpp)
file(GLOB GPU_LIB_CUDPP_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini/[^.]*.cu)
endif()
set(HIP_ARCH "sm_50" CACHE STRING "HIP primary CUDA architecture (e.g. sm_60)")
# build arch/gencode commands for nvcc based on CUDA toolkit version and use choice
# --arch translates directly instead of JIT, so this should be for the preferred or most common architecture
set(GPU_CUDA_GENCODE "-arch=${GPU_ARCH} ")
set(HIP_CUDA_GENCODE "-arch=${HIP_ARCH} ")
# Fermi (GPU Arch 2.x) is supported by CUDA 3.2 to CUDA 8.0
if((CUDA_VERSION VERSION_GREATER "3.1") AND (CUDA_VERSION VERSION_LESS "9.0"))
string(APPEND GPU_CUDA_GENCODE "-gencode arch=compute_20,code=[sm_20,compute_20] ")
string(APPEND HIP_CUDA_GENCODE "-gencode arch=compute_20,code=[sm_20,compute_20] ")
endif()
# Kepler (GPU Arch 3.x) is supported by CUDA 5 and later
if(CUDA_VERSION VERSION_GREATER "4.9")
string(APPEND GPU_CUDA_GENCODE "-gencode arch=compute_30,code=[sm_30,compute_30] -gencode arch=compute_35,code=[sm_35,compute_35] ")
string(APPEND HIP_CUDA_GENCODE "-gencode arch=compute_30,code=[sm_30,compute_30] -gencode arch=compute_35,code=[sm_35,compute_35] ")
endif()
# Maxwell (GPU Arch 5.x) is supported by CUDA 6 and later
if(CUDA_VERSION VERSION_GREATER "5.9")
string(APPEND GPU_CUDA_GENCODE "-gencode arch=compute_50,code=[sm_50,compute_50] -gencode arch=compute_52,code=[sm_52,compute_52] ")
string(APPEND HIP_CUDA_GENCODE "-gencode arch=compute_50,code=[sm_50,compute_50] -gencode arch=compute_52,code=[sm_52,compute_52] ")
endif()
# Pascal (GPU Arch 6.x) is supported by CUDA 8 and later
if(CUDA_VERSION VERSION_GREATER "7.9")
string(APPEND GPU_CUDA_GENCODE "-gencode arch=compute_60,code=[sm_60,compute_60] -gencode arch=compute_61,code=[sm_61,compute_61] ")
string(APPEND HIP_CUDA_GENCODE "-gencode arch=compute_60,code=[sm_60,compute_60] -gencode arch=compute_61,code=[sm_61,compute_61] ")
endif()
# Volta (GPU Arch 7.0) is supported by CUDA 9 and later
if(CUDA_VERSION VERSION_GREATER "8.9")
string(APPEND GPU_CUDA_GENCODE "-gencode arch=compute_70,code=[sm_70,compute_70] ")
string(APPEND HIP_CUDA_GENCODE "-gencode arch=compute_70,code=[sm_70,compute_70] ")
endif()
# Turing (GPU Arch 7.5) is supported by CUDA 10 and later
if(CUDA_VERSION VERSION_GREATER "9.9")
string(APPEND GPU_CUDA_GENCODE "-gencode arch=compute_75,code=[sm_75,compute_75] ")
string(APPEND HIP_CUDA_GENCODE "-gencode arch=compute_75,code=[sm_75,compute_75] ")
endif()
endif()
file(GLOB GPU_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/[^.]*.cu ${CMAKE_CURRENT_SOURCE_DIR}/gpu/[^.]*.cu)
list(REMOVE_ITEM GPU_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_pppm.cu)
set(GPU_LIB_CU_HIP "")
foreach(CU_FILE ${GPU_LIB_CU})
get_filename_component(CU_NAME ${CU_FILE} NAME_WE)
string(REGEX REPLACE "^.*lal_" "" CU_NAME "${CU_NAME}")
set(CU_CPP_FILE "${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}.cu.cpp")
set(CUBIN_FILE "${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}.cubin")
set(CUBIN_H_FILE "${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}_cubin.h")
if(HIP_PLATFORM STREQUAL "hcc")
configure_file(${CU_FILE} ${CU_CPP_FILE} COPYONLY)
add_custom_command(OUTPUT ${CUBIN_FILE}
VERBATIM COMMAND ${HIP_HIPCC_EXECUTABLE} --genco -t="${HIP_ARCH}" -f=\"-O3 -ffast-math -DUSE_HIP -D_${GPU_PREC_SETTING} -I${LAMMPS_LIB_SOURCE_DIR}/gpu\" -o ${CUBIN_FILE} ${CU_CPP_FILE}
DEPENDS ${CU_CPP_FILE}
COMMENT "Generating ${CU_NAME}.cubin")
elseif(HIP_PLATFORM STREQUAL "nvcc")
add_custom_command(OUTPUT ${CUBIN_FILE}
VERBATIM COMMAND ${HIP_HIPCC_EXECUTABLE} --fatbin --use_fast_math -DUSE_HIP -D_${GPU_PREC_SETTING} ${HIP_CUDA_GENCODE} -I${LAMMPS_LIB_SOURCE_DIR}/gpu -o ${CUBIN_FILE} ${CU_FILE}
DEPENDS ${CU_FILE}
COMMENT "Generating ${CU_NAME}.cubin")
endif()
cuda_compile_fatbin(GPU_GEN_OBJS ${GPU_LIB_CU} OPTIONS
-DUNIX -O3 --use_fast_math -Wno-deprecated-gpu-targets -DNV_KERNEL -DUCL_CUDADR ${GPU_CUDA_GENCODE} -D_${GPU_PREC_SETTING})
add_custom_command(OUTPUT ${CUBIN_H_FILE}
COMMAND ${CMAKE_COMMAND} -D SOURCE_DIR=${CMAKE_CURRENT_SOURCE_DIR} -D VARNAME=${CU_NAME} -D HEADER_FILE=${CUBIN_H_FILE} -D SOURCE_FILES=${CUBIN_FILE} -P ${CMAKE_CURRENT_SOURCE_DIR}/Modules/GenerateBinaryHeader.cmake
DEPENDS ${CUBIN_FILE}
COMMENT "Generating ${CU_NAME}_cubin.h")
cuda_compile(GPU_OBJS ${GPU_LIB_CUDPP_CU} OPTIONS ${CUDA_REQUEST_PIC}
-DUNIX -O3 --use_fast_math -Wno-deprecated-gpu-targets -DUCL_CUDADR ${GPU_CUDA_GENCODE} -D_${GPU_PREC_SETTING})
list(APPEND GPU_LIB_SOURCES ${CUBIN_H_FILE})
endforeach()
foreach(CU_OBJ ${GPU_GEN_OBJS})
get_filename_component(CU_NAME ${CU_OBJ} NAME_WE)
string(REGEX REPLACE "^.*_lal_" "" CU_NAME "${CU_NAME}")
add_custom_command(OUTPUT ${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}_cubin.h
COMMAND ${BIN2C} -c -n ${CU_NAME} ${CU_OBJ} > ${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}_cubin.h
DEPENDS ${CU_OBJ}
COMMENT "Generating ${CU_NAME}_cubin.h")
list(APPEND GPU_LIB_SOURCES ${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}_cubin.h)
endforeach()
set_directory_properties(PROPERTIES ADDITIONAL_MAKE_CLEAN_FILES "${LAMMPS_LIB_BINARY_DIR}/gpu/*_cubin.h")
set_directory_properties(PROPERTIES ADDITIONAL_MAKE_CLEAN_FILES "${LAMMPS_LIB_BINARY_DIR}/gpu/*_cubin.h ${LAMMPS_LIB_BINARY_DIR}/gpu/*.cu.cpp")
hip_add_library(gpu STATIC ${GPU_LIB_SOURCES})
target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu)
target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT)
target_compile_definitions(gpu PRIVATE -DUSE_HIP)
add_library(gpu STATIC ${GPU_LIB_SOURCES} ${GPU_LIB_CUDPP_SOURCES} ${GPU_OBJS})
target_link_libraries(gpu PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY})
target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu ${CUDA_INCLUDE_DIRS})
target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT ${GPU_CUDA_MPS_FLAGS})
if(CUDPP_OPT)
target_include_directories(gpu PRIVATE ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini)
target_compile_definitions(gpu PRIVATE -DUSE_CUDPP)
endif()
if(HIP_USE_DEVICE_SORT)
# add hipCUB
target_include_directories(gpu PRIVATE ${HIP_ROOT_DIR}/../include)
target_compile_definitions(gpu PRIVATE -DUSE_HIP_DEVICE_SORT)
target_link_libraries(lammps PRIVATE gpu)
if(HIP_PLATFORM STREQUAL "nvcc")
find_package(CUB)
add_executable(nvc_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp)
target_compile_definitions(nvc_get_devices PRIVATE -DUCL_CUDADR)
target_link_libraries(nvc_get_devices PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY})
target_include_directories(nvc_get_devices PRIVATE ${CUDA_INCLUDE_DIRS})
elseif(GPU_API STREQUAL "OPENCL")
if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
# download and unpack support binaries for compilation of windows binaries.
set(LAMMPS_THIRDPARTY_URL "http://download.lammps.org/thirdparty")
file(DOWNLOAD "${LAMMPS_THIRDPARTY_URL}/opencl-win-devel.tar.gz" "${CMAKE_CURRENT_BINARY_DIR}/opencl-win-devel.tar.gz"
EXPECTED_MD5 2c00364888d5671195598b44c2e0d44d)
execute_process(COMMAND ${CMAKE_COMMAND} -E tar xzf opencl-win-devel.tar.gz WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
add_library(OpenCL::OpenCL UNKNOWN IMPORTED)
if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86")
set_target_properties(OpenCL::OpenCL PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/OpenCL/lib_win32/libOpenCL.dll")
elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64")
set_target_properties(OpenCL::OpenCL PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/OpenCL/lib_win64/libOpenCL.dll")
if(CUB_FOUND)
set(DOWNLOAD_CUB_DEFAULT OFF)
else()
set(DOWNLOAD_CUB_DEFAULT ON)
endif()
set_target_properties(OpenCL::OpenCL PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${CMAKE_CURRENT_BINARY_DIR}/OpenCL/include")
else()
find_package(OpenCL REQUIRED)
option(DOWNLOAD_CUB "Download and compile the CUB library instead of using an already installed one" ${DOWNLOAD_CUB_DEFAULT})
if(DOWNLOAD_CUB)
message(STATUS "CUB download requested")
include(ExternalProject)
ExternalProject_Add(CUB
GIT_REPOSITORY https://github.com/NVlabs/cub
TIMEOUT 5
PREFIX "${CMAKE_CURRENT_BINARY_DIR}"
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
UPDATE_COMMAND ""
)
ExternalProject_get_property(CUB SOURCE_DIR)
set(CUB_INCLUDE_DIR ${SOURCE_DIR})
else()
find_package(CUB)
if(NOT CUB_FOUND)
  # error message previously referenced DOWNLOAD_VORO (copy-paste from the
  # VORONOI package); the option controlling the CUB download is DOWNLOAD_CUB
  message(FATAL_ERROR "CUB library not found. Help CMake to find it by setting CUB_INCLUDE_DIR, or set DOWNLOAD_CUB=ON to download it")
endif()
endif()
target_include_directories(gpu PRIVATE ${CUB_INCLUDE_DIR})
endif()
set(OCL_TUNE "generic" CACHE STRING "OpenCL Device Tuning")
set(OCL_TUNE_VALUES intel fermi kepler cypress generic)
set_property(CACHE OCL_TUNE PROPERTY STRINGS ${OCL_TUNE_VALUES})
validate_option(OCL_TUNE OCL_TUNE_VALUES)
string(TOUPPER ${OCL_TUNE} OCL_TUNE)
include(OpenCLUtils)
set(OCL_COMMON_HEADERS ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_preprocessor.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_aux_fun1.h)
file(GLOB GPU_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/[^.]*.cu)
list(REMOVE_ITEM GPU_LIB_CU
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_gayberne.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_gayberne_lj.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_re_squared.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_re_squared_lj.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod.cu
)
foreach(GPU_KERNEL ${GPU_LIB_CU})
get_filename_component(basename ${GPU_KERNEL} NAME_WE)
string(SUBSTRING ${basename} 4 -1 KERNEL_NAME)
GenerateOpenCLHeader(${KERNEL_NAME} ${CMAKE_CURRENT_BINARY_DIR}/gpu/${KERNEL_NAME}_cl.h ${OCL_COMMON_HEADERS} ${GPU_KERNEL})
list(APPEND GPU_LIB_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/gpu/${KERNEL_NAME}_cl.h)
endforeach()
GenerateOpenCLHeader(gayberne ${CMAKE_CURRENT_BINARY_DIR}/gpu/gayberne_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_ellipsoid_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_gayberne.cu)
GenerateOpenCLHeader(gayberne_lj ${CMAKE_CURRENT_BINARY_DIR}/gpu/gayberne_lj_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_ellipsoid_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_gayberne_lj.cu)
GenerateOpenCLHeader(re_squared ${CMAKE_CURRENT_BINARY_DIR}/gpu/re_squared_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_ellipsoid_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_re_squared.cu)
GenerateOpenCLHeader(re_squared_lj ${CMAKE_CURRENT_BINARY_DIR}/gpu/re_squared_lj_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_ellipsoid_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_re_squared_lj.cu)
GenerateOpenCLHeader(tersoff ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff.cu)
GenerateOpenCLHeader(tersoff_zbl ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_zbl_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl.cu)
GenerateOpenCLHeader(tersoff_mod ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_mod_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod.cu)
list(APPEND GPU_LIB_SOURCES
${CMAKE_CURRENT_BINARY_DIR}/gpu/gayberne_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/gayberne_lj_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/re_squared_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/re_squared_lj_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_zbl_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_mod_cl.h
)
add_library(gpu STATIC ${GPU_LIB_SOURCES})
target_link_libraries(gpu PRIVATE OpenCL::OpenCL)
target_include_directories(gpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/gpu)
target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -D${OCL_TUNE}_OCL -DMPI_GERYON -DUCL_NO_EXIT)
target_compile_definitions(gpu PRIVATE -DUSE_OPENCL)
target_link_libraries(lammps PRIVATE gpu)
add_executable(ocl_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp)
target_compile_definitions(ocl_get_devices PRIVATE -DUCL_OPENCL)
target_link_libraries(ocl_get_devices PRIVATE OpenCL::OpenCL)
endif()
# GPU package
FindStyleHeaders(${GPU_SOURCES_DIR} FIX_CLASS fix_ FIX)
hip_add_executable(hip_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp)
target_compile_definitions(hip_get_devices PRIVATE -DUCL_HIP)
set_property(GLOBAL PROPERTY "GPU_SOURCES" "${GPU_SOURCES}")
if(HIP_PLATFORM STREQUAL "nvcc")
target_compile_definitions(gpu PRIVATE -D__HIP_PLATFORM_NVCC__)
target_include_directories(gpu PRIVATE ${HIP_ROOT_DIR}/include)
target_include_directories(gpu PRIVATE ${CUDA_INCLUDE_DIRS})
target_link_libraries(gpu PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY})
# detects styles which have GPU version
RegisterStylesExt(${GPU_SOURCES_DIR} gpu GPU_SOURCES)
get_property(GPU_SOURCES GLOBAL PROPERTY GPU_SOURCES)
target_link_libraries(gpu PRIVATE MPI::MPI_CXX)
if(NOT BUILD_SHARED_LIBS)
install(TARGETS gpu EXPORT LAMMPS_Targets LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
target_compile_definitions(hip_get_devices PRIVATE -D__HIP_PLATFORM_NVCC__)
target_include_directories(hip_get_devices PRIVATE ${HIP_ROOT_DIR}/include)
target_include_directories(hip_get_devices PRIVATE ${CUDA_INCLUDE_DIRS})
target_link_libraries(hip_get_devices PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY})
endif()
target_compile_definitions(gpu PRIVATE -DLAMMPS_${LAMMPS_SIZES})
set_target_properties(gpu PROPERTIES OUTPUT_NAME lammps_gpu${LAMMPS_MACHINE})
target_sources(lammps PRIVATE ${GPU_SOURCES})
target_include_directories(lammps PRIVATE ${GPU_SOURCES_DIR})
target_link_libraries(lammps PRIVATE gpu)
endif()
# GPU package
FindStyleHeaders(${GPU_SOURCES_DIR} FIX_CLASS fix_ FIX)
set_property(GLOBAL PROPERTY "GPU_SOURCES" "${GPU_SOURCES}")
# detects styles which have GPU version
RegisterStylesExt(${GPU_SOURCES_DIR} gpu GPU_SOURCES)
get_property(GPU_SOURCES GLOBAL PROPERTY GPU_SOURCES)
target_link_libraries(gpu PRIVATE MPI::MPI_CXX)
if(NOT BUILD_SHARED_LIBS)
install(TARGETS gpu EXPORT LAMMPS_Targets LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
endif()
target_compile_definitions(gpu PRIVATE -DLAMMPS_${LAMMPS_SIZES})
set_target_properties(gpu PROPERTIES OUTPUT_NAME lammps_gpu${LAMMPS_MACHINE})
target_sources(lammps PRIVATE ${GPU_SOURCES})
target_include_directories(lammps PRIVATE ${GPU_SOURCES_DIR})

View File

@ -87,27 +87,33 @@ GPU package
---------------------
To build with this package, you must choose options for precision and
which GPU hardware to build for.
which GPU hardware to build for. The GPU package currently supports
three different types of backends: OpenCL, CUDA and HIP.
**CMake build**\ :
.. code-block:: bash
-D GPU_API=value # value = opencl (default) or cuda
-D GPU_PREC=value # precision setting
# value = double or mixed (default) or single
-D OCL_TUNE=value # hardware choice for GPU_API=opencl
# generic (default) or intel (Intel CPU) or fermi, kepler, cypress (NVIDIA)
-D GPU_ARCH=value # primary GPU hardware choice for GPU_API=cuda
# value = sm_XX, see below
# default is sm_30
-D CUDPP_OPT=value # optimization setting for GPU_API=cuda
# enables CUDA Performance Primitives Optimizations
# value = yes (default) or no
-D CUDA_MPS_SUPPORT=value # enables some tweaks required to run with active nvidia-cuda-mps daemon
# value = yes or no (default)
-D GPU_API=value # value = opencl (default) or cuda or hip
-D GPU_PREC=value # precision setting
# value = double or mixed (default) or single
-D OCL_TUNE=value # hardware choice for GPU_API=opencl
# generic (default) or intel (Intel CPU) or fermi, kepler, cypress (NVIDIA)
-D GPU_ARCH=value # primary GPU hardware choice for GPU_API=cuda
# value = sm_XX, see below
# default is sm_30
-D HIP_ARCH=value # primary GPU hardware choice for GPU_API=hip
# value depends on selected HIP_PLATFORM
# default is 'gfx906' for HIP_PLATFORM=hcc and 'sm_30' for HIP_PLATFORM=nvcc
-D HIP_USE_DEVICE_SORT=value # enables GPU sorting
# value = yes (default) or no
-D CUDPP_OPT=value # optimization setting for GPU_API=cuda
# enables CUDA Performance Primitives Optimizations
# value = yes (default) or no
-D CUDA_MPS_SUPPORT=value # enables some tweaks required to run with active nvidia-cuda-mps daemon
# value = yes or no (default)
GPU_ARCH settings for different GPU hardware is as follows:
:code:`GPU_ARCH` settings for different GPU hardware is as follows:
* sm_12 or sm_13 for GT200 (supported by CUDA 3.2 until CUDA 6.5)
* sm_20 or sm_21 for Fermi (supported by CUDA 3.2 until CUDA 7.5)
@ -126,6 +132,28 @@ Thus the GPU_ARCH setting is merely an optimization, to have code for
the preferred GPU architecture directly included rather than having to wait
for the JIT compiler of the CUDA driver to translate it.
If you are compiling with HIP, note that before running CMake you will have to
set appropriate environment variables. Some variables such as
:code:`HCC_AMDGPU_TARGET` or :code:`CUDA_PATH` are necessary for :code:`hipcc`
and the linker to work correctly.
.. code:: bash
# AMDGPU target
export HIP_PLATFORM=hcc
export HCC_AMDGPU_TARGET=gfx906
cmake -D PKG_GPU=on -D GPU_API=HIP -D HIP_ARCH=gfx906 -D CMAKE_CXX_COMPILER=hipcc ..
make -j 4
.. code:: bash
# CUDA target
# !!! DO NOT set CMAKE_CXX_COMPILER !!!
export HIP_PLATFORM=nvcc
export CUDA_PATH=/usr/local/cuda
cmake -D PKG_GPU=on -D GPU_API=HIP -D HIP_ARCH=sm_70 ..
make -j 4
**Traditional make**\ :
Before building LAMMPS, you must build the GPU library in ``lib/gpu``\ .

View File

@ -14,7 +14,7 @@ Syntax
* adapt = style name of this fix command
* N = adapt simulation settings every this many timesteps
* one or more attribute/arg pairs may be appended
* attribute = *pair* or *kspace* or *atom*
* attribute = *pair* or *bond* or *kspace* or *atom*
.. parsed-literal::
@ -86,8 +86,8 @@ the end of a simulation. Even if *reset* is specified as *yes*\ , a
restart file written during a simulation will contain the modified
settings.
If the *scale* keyword is set to *no*\ , then the value of the altered
parameter will be whatever the variable generates. If the *scale*
keyword is set to *yes*\ , then the value of the altered parameter will
be the initial value of that parameter multiplied by whatever the
variable generates. I.e. the variable is now a "scale factor" applied
@ -319,26 +319,23 @@ The *atom* keyword enables various atom properties to be changed. The
current list of atom parameters that can be varied by this fix:
* charge = charge on particle
* diameter or diameter/disc = diameter of particle
The *v_name* argument of the *atom* keyword is the name of an
:doc:`equal-style variable <variable>` which will be evaluated each time
this fix is invoked to set, or scale, the parameter to a new value.
It should be specified as v_name, where name is the variable name. See the
discussion above describing the formulas associated with equal-style
variables. The new value is assigned to the corresponding attribute
for all atoms in the fix group.
.. note::
The *atom* keyword works this way whether the *scale* keyword is
set to *no* or *yes*\ . I.e. the use of scale yes is not yet supported
by the *atom* keyword.
If the atom parameter is *diameter* and per-atom density and per-atom
mass are defined for particles (e.g. :doc:`atom_style granular <atom_style>`), then the mass of each particle is also
changed when the diameter changes. The mass is set from the particle volume
for 3d systems (density is assumed to stay constant). For 2d, the default is
for LAMMPS to model particles with a radius attribute as spheres.
However, if the atom parameter is *diameter/disc*, then the mass is
set from the particle area (the density is assumed to be in mass/distance^2 units).
For example, these commands would shrink the diameter of all granular
particles in the "center" group from 1.0 to 0.1 in a linear fashion

View File

@ -170,6 +170,7 @@ ba
Babadi
Babaei
backcolor
backends
Baczewski
Bagi
Bagnold

1
lib/gpu/.gitignore vendored
View File

@ -2,6 +2,7 @@
/obj_ocl
/ocl_get_devices
/nvc_get_devices
/hip_get_devices
/*.cubin
/*_cubin.h
/*_cl.h

148
lib/gpu/Makefile.hip Normal file
View File

@ -0,0 +1,148 @@
# /* ----------------------------------------------------------------------
# Generic Linux Makefile for HIP
# - export HIP_PLATFORM=hcc (or nvcc) before execution
# - change HIP_ARCH for your GPU
# ------------------------------------------------------------------------- */

# this setting should match LAMMPS Makefile
# one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL
LMP_INC = -DLAMMPS_SMALLBIG

# precision for GPU calculations
# -D_SINGLE_SINGLE # Single precision for all calculations
# -D_DOUBLE_DOUBLE # Double precision for all calculations
# -D_SINGLE_DOUBLE # Accumulation of forces, etc. in double
HIP_PRECISION = -D_SINGLE_DOUBLE

HIP_OPTS = -O3
HIP_HOST_OPTS = -Wno-deprecated-declarations
HIP_HOST_INCLUDE =

# use device sort
# requires linking with hipcc and hipCUB + (rocPRIM or CUB for AMD or Nvidia respectively)
HIP_HOST_OPTS += -DUSE_HIP_DEVICE_SORT
# path to cub
HIP_HOST_INCLUDE += -I./
# path to hipcub
HIP_HOST_INCLUDE += -I$(HIP_PATH)/../include

# use mpi
HIP_HOST_OPTS += -DMPI_GERYON -DUCL_NO_EXIT
# these settings should match the LAMMPS Makefile
MPI_COMP_OPTS = $(shell mpicxx --showme:compile)
MPI_LINK_OPTS = $(shell mpicxx --showme:link)
#MPI_COMP_OPTS += -I/usr/include/mpi -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1

# locate the HIP installation and ask hipconfig which compiler backend it uses
HIP_PATH ?= $(wildcard /opt/rocm/hip)
HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --compiler)

# backend-specific fast-math and GPU architecture flags
ifeq (hcc,$(HIP_PLATFORM))
HIP_OPTS += -ffast-math
# possible values: gfx803,gfx900,gfx906
HIP_ARCH = gfx906
else ifeq (nvcc,$(HIP_PLATFORM))
HIP_OPTS += --use_fast_math
HIP_ARCH = -gencode arch=compute_30,code=[sm_30,compute_30] -gencode arch=compute_32,code=[sm_32,compute_32] -gencode arch=compute_35,code=[sm_35,compute_35] \
           -gencode arch=compute_50,code=[sm_50,compute_50] -gencode arch=compute_52,code=[sm_52,compute_52] -gencode arch=compute_53,code=[sm_53,compute_53]\
           -gencode arch=compute_60,code=[sm_60,compute_60] -gencode arch=compute_61,code=[sm_61,compute_61] -gencode arch=compute_62,code=[sm_62,compute_62]\
           -gencode arch=compute_70,code=[sm_70,compute_70] -gencode arch=compute_72,code=[sm_72,compute_72] -gencode arch=compute_75,code=[sm_75,compute_75]
else
$(error Specify HIP platform using 'export HIP_PLATFORM=(hcc,nvcc)')
endif

BIN_DIR = .
OBJ_DIR = ./obj
LIB_DIR = .
AR = ar
BSH = /bin/sh

# /* ----------------------------------------------------------------------
# don't change section below without need
# ------------------------------------------------------------------------- */

HIP_OPTS += -DUSE_HIP $(HIP_PRECISION)
HIP_GPU_OPTS += $(HIP_OPTS) -I./

# per-backend kernel compiler invocation and kernel source suffix
ifeq (hcc,$(HIP_PLATFORM))
HIP_HOST_OPTS += -fPIC
HIP_GPU_CC = $(HIP_PATH)/bin/hipcc --genco
HIP_GPU_OPTS_S = -t="$(HIP_ARCH)" -f=\"
HIP_GPU_OPTS_E = \"
HIP_KERNEL_SUFFIX = .cpp
HIP_LIBS_TARGET = export HCC_AMDGPU_TARGET := $(HIP_ARCH)
export HCC_AMDGPU_TARGET := $(HIP_ARCH)
else ifeq (nvcc,$(HIP_PLATFORM))
HIP_GPU_CC = $(HIP_PATH)/bin/hipcc --fatbin
HIP_GPU_OPTS += $(HIP_ARCH)
HIP_GPU_SORT_ARCH = $(HIP_ARCH)
# fix nvcc can't handle -pthread flag
MPI_COMP_OPTS := $(subst -pthread,-Xcompiler -pthread,$(MPI_COMP_OPTS))
MPI_LINK_OPTS := $(subst -pthread,-Xcompiler -pthread,$(MPI_LINK_OPTS))
endif

# hipcc is essential for device sort, because of hipcub is header only library and ROCm gpu code generation is deferred to the linking stage
HIP_HOST_CC = $(HIP_PATH)/bin/hipcc
HIP_HOST_OPTS += $(HIP_OPTS) $(MPI_COMP_OPTS) $(LMP_INC)
HIP_HOST_CC_CMD = $(HIP_HOST_CC) $(HIP_HOST_OPTS) $(HIP_HOST_INCLUDE)

# sources
ALL_H = $(wildcard ./geryon/ucl*.h) $(wildcard ./geryon/hip*.h) $(wildcard ./lal_*.h)
SRCS := $(wildcard ./lal_*.cpp)
OBJS := $(subst ./,$(OBJ_DIR)/,$(SRCS:%.cpp=%.o))
CUS := $(wildcard lal_*.cu)
# lal_pppm.cu is compiled twice (float and double grid types), so the generic
# pppm_cubin.h is filtered out and replaced by pppm_f_cubin.h / pppm_d_cubin.h
CUHS := $(filter-out pppm_cubin.h, $(CUS:lal_%.cu=%_cubin.h)) pppm_f_cubin.h pppm_d_cubin.h
CUHS := $(addprefix $(OBJ_DIR)/, $(CUHS))

all: $(OBJ_DIR) $(CUHS) $(LIB_DIR)/libgpu.a $(BIN_DIR)/hip_get_devices

$(OBJ_DIR):
	mkdir -p $@

# GPU kernels compilation
$(OBJ_DIR)/pppm_f_cubin.h: lal_pppm.cu $(ALL_H)
	@cp $< $(OBJ_DIR)/temp_pppm_f.cu$(HIP_KERNEL_SUFFIX)
	$(HIP_GPU_CC) $(HIP_GPU_OPTS_S) $(HIP_GPU_OPTS) -Dgrdtyp=float -Dgrdtyp4=float4 $(HIP_GPU_OPTS_E) -o $(OBJ_DIR)/pppm_f.cubin $(OBJ_DIR)/temp_pppm_f.cu$(HIP_KERNEL_SUFFIX)
	@xxd -i $(OBJ_DIR)/pppm_f.cubin $@
	@sed -i "s/[a-zA-Z0-9_]*pppm_f_cubin/pppm_f/g" $@
	@rm $(OBJ_DIR)/temp_pppm_f.cu$(HIP_KERNEL_SUFFIX) $(OBJ_DIR)/pppm_f.cubin

$(OBJ_DIR)/pppm_d_cubin.h: lal_pppm.cu $(ALL_H)
	@cp $< $(OBJ_DIR)/temp_pppm_d.cu$(HIP_KERNEL_SUFFIX)
	$(HIP_GPU_CC) $(HIP_GPU_OPTS_S) $(HIP_GPU_OPTS) -Dgrdtyp=double -Dgrdtyp4=double4 $(HIP_GPU_OPTS_E) -o $(OBJ_DIR)/pppm_d.cubin $(OBJ_DIR)/temp_pppm_d.cu$(HIP_KERNEL_SUFFIX)
	@xxd -i $(OBJ_DIR)/pppm_d.cubin $@
	@sed -i "s/[a-zA-Z0-9_]*pppm_d_cubin/pppm_d/g" $@
	@rm $(OBJ_DIR)/temp_pppm_d.cu$(HIP_KERNEL_SUFFIX) $(OBJ_DIR)/pppm_d.cubin

$(OBJ_DIR)/%_cubin.h: lal_%.cu $(ALL_H)
	@cp $< $(OBJ_DIR)/temp_$*.cu$(HIP_KERNEL_SUFFIX)
	$(HIP_GPU_CC) $(HIP_GPU_OPTS_S) $(HIP_GPU_OPTS) $(HIP_GPU_OPTS_E) -o $(OBJ_DIR)/$*.cubin $(OBJ_DIR)/temp_$*.cu$(HIP_KERNEL_SUFFIX)
	@xxd -i $(OBJ_DIR)/$*.cubin $@
	@sed -i "s/[a-zA-Z0-9_]*$*_cubin/$*/g" $@
	@rm $(OBJ_DIR)/temp_$*.cu$(HIP_KERNEL_SUFFIX) $(OBJ_DIR)/$*.cubin

# host sources compilation
# lal_atom.cpp additionally receives the device-sort architecture flags
$(OBJ_DIR)/lal_atom.o: lal_atom.cpp $(CUHS) $(ALL_H)
	$(HIP_HOST_CC_CMD) -o $@ -c $< -I$(OBJ_DIR) $(HIP_GPU_SORT_ARCH)

$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(ALL_H)
	$(HIP_HOST_CC_CMD) -o $@ -c $< -I$(OBJ_DIR)

# libgpu building
# Makefile.lammps records the platform settings for the LAMMPS build
$(LIB_DIR)/libgpu.a: $(OBJS)
	$(AR) -crs $@ $(OBJS)
	printf "export HIP_PLATFORM := %s\n%s\n" "$(HIP_PLATFORM)" "$(HIP_LIBS_TARGET)" > Makefile.lammps

# test app building
$(BIN_DIR)/hip_get_devices: ./geryon/ucl_get_devices.cpp $(ALL_H)
	$(HIP_HOST_CC_CMD) -o $@ $< -DUCL_HIP $(MPI_LINK_OPTS)

clean:
	-rm -f $(BIN_DIR)/hip_get_devices $(LIB_DIR)/libgpu.a $(OBJS) $(OBJ_DIR)/temp_* $(CUHS)

View File

@ -67,8 +67,8 @@ library requires installing the CUDA GPU driver and CUDA toolkit for
your operating system. Installation of the CUDA SDK is not necessary.
In addition to the LAMMPS library, the binary nvc_get_devices will also
be built. This can be used to query the names and properties of GPU
devices on your system. A Makefile for OpenCL and ROCm HIP compilation
is provided, but support for it is not currently provided by the developers.
Details of the implementation are provided in:
----
@ -169,6 +169,25 @@ NOTE: The system-specific setting LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG,
src/MAKE/Makefile.foo) should be consistent with that specified
when building libgpu.a (i.e. by LMP_INC in the lib/gpu/Makefile.bar).
BUILDING FOR HIP FRAMEWORK
--------------------------------
1. Install the latest ROCm framework (https://github.com/RadeonOpenCompute/ROCm).
2. GPU sorting requires installing hipcub
(https://github.com/ROCmSoftwarePlatform/hipCUB). The HIP CUDA-backend
additionally requires cub (https://nvlabs.github.io/cub). Download and
extract the cub directory to lammps/lib/gpu/ or specify an appropriate
path in lammps/lib/gpu/Makefile.hip.
3. In Makefile.hip it is possible to specify the target platform via
export HIP_PLATFORM=hcc or HIP_PLATFORM=nvcc as well as the target
architecture (gfx803, gfx900, gfx906 etc.)
4. If your MPI implementation does not support `mpicxx --showme` command,
it is required to specify the corresponding MPI compiler and linker flags
in lammps/lib/gpu/Makefile.hip and in lammps/src/MAKE/OPTIONS/Makefile.hip.
5. Building the GPU library (libgpu.a):
cd lammps/lib/gpu; make -f Makefile.hip -j
6. Building the LAMMPS executable (lmp_hip):
cd ../../src; make hip -j
EXAMPLE CONVENTIONAL BUILD PROCESS
--------------------------------

519
lib/gpu/geryon/hip_device.h Normal file
View File

@ -0,0 +1,519 @@
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef HIP_DEVICE
#define HIP_DEVICE
#include <hip/hip_runtime.h>
#include <unordered_map>
#include <string>
#include <vector>
#include <iostream>
#include "hip_macros.h"
#include "ucl_types.h"
namespace ucl_hip {
// --------------------------------------------------------------------------
// - COMMAND QUEUE STUFF
// --------------------------------------------------------------------------
// Geryon's generic "command queue" maps directly onto a HIP stream.
typedef hipStream_t command_queue;

// Block the host until all work queued on the given stream has completed.
inline void ucl_sync(hipStream_t &stream) {
  CU_SAFE_CALL(hipStreamSynchronize(stream));
}
// Cached per-device properties gathered once by UCL_Device's constructor.
// (The name NVDProperties presumably mirrors the CUDA version of this
// header -- confirm against geryon/nvd_device.h.)
struct NVDProperties {
  int device_id;                 // runtime device ordinal
  std::string name;              // human-readable device name
  int major;                     // compute capability, major version
  int minor;                     // compute capability, minor version
  CUDA_INT_TYPE totalGlobalMem;  // total global memory in bytes
  int multiProcessorCount;
  int maxThreadsPerBlock;
  int maxThreadsDim[3];          // max block dimensions (x, y, z)
  int maxGridSize[3];            // max grid dimensions (x, y, z)
  int sharedMemPerBlock;
  int totalConstantMemory;
  int SIMDWidth;                 // warp/wavefront size
  int memPitch;                  // NOTE(review): never filled in by UCL_Device() -- confirm
  int regsPerBlock;
  int clockRate;                 // in kHz (clock_rate() multiplies by 1e-6 for GHz)
  int textureAlign;              // NOTE(review): never filled in by UCL_Device() -- confirm
  int kernelExecTimeoutEnabled;  // NOTE(review): never filled in by UCL_Device() -- confirm
  int integrated;
  int canMapHostMemory;          // NOTE(review): never filled in by UCL_Device() -- confirm
  int concurrentKernels;
  int ECCEnabled;                // NOTE(review): never filled in by UCL_Device() -- confirm
  int computeMode;
};
/// Class for looking at device properties
/** \note Calls to change the device outside of the class results in incorrect
  *       behavior
  * \note There is no error checking for indexing past the number of devices **/
class UCL_Device {
 public:
  /// Collect properties for every GPU on the node
  /** \note You must set the active GPU with set() before using the device **/
  inline UCL_Device();

  inline ~UCL_Device();

  /// Returns 1 (For compatibility with OpenCL)
  inline int num_platforms() { return 1; }

  /// Return a string with name and info of the current platform
  inline std::string platform_name()
    { return "HIP platform"; }

  /// Delete any contexts/data and set the platform number to be used
  inline int set_platform(const int pid);

  /// Return the number of devices that support CUDA
  inline int num_devices() { return _properties.size(); }

  /// Set the CUDA device to the specified device number
  /** A context and default command queue will be created for the device
    * Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not
    * be allocated for use. clear() is called to delete any contexts and
    * associated data from previous calls to set(). **/
  inline int set(int num);

  /// Delete any context and associated data stored from a call to set()
  inline void clear();

  /// Get the current device number
  inline int device_num() { return _device; }

  /// Returns the default stream for the current device
  inline command_queue & cq() { return cq(0); }

  /// Returns the stream indexed by i
  inline command_queue & cq(const int i) { return _cq[i]; }

  /// Block until all commands in the default stream have completed
  inline void sync() { sync(0); }

  /// Block until all commands in the specified stream have completed
  inline void sync(const int i) { ucl_sync(cq(i)); }

  /// Get the number of command queues currently available on device
  inline int num_queues()
    { return _cq.size(); }

  /// Add a stream for device computations
  inline void push_command_queue() {
    _cq.push_back(hipStream_t());
    CU_SAFE_CALL(hipStreamCreateWithFlags(&_cq.back(),0));
  }

  /// Remove a stream for device computations
  /** \note You cannot delete the default stream **/
  inline void pop_command_queue() {
    if (_cq.size()<2) return;
    CU_SAFE_CALL_NS(hipStreamDestroy(_cq.back()));
    _cq.pop_back();
  }

  /// Set the default command queue (by default this is the null stream)
  /** \param i index of the command queue (as added by push_command_queue())
      If i is 0, the default command queue is set to the null stream **/
  inline void set_command_queue(const int i) {
    if (i==0) _cq[0]=0;
    else _cq[0]=_cq[i];
  }

  /// Get the current CUDA device name
  inline std::string name() { return name(_device); }
  /// Get the CUDA device name
  inline std::string name(const int i)
    { return std::string(_properties[i].name); }

  /// Get a string telling the type of the current device
  inline std::string device_type_name() { return device_type_name(_device); }
  /// Get a string telling the type of the device
  inline std::string device_type_name(const int i) { return "GPU"; }

  /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
  inline int device_type() { return device_type(_device); }
  /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
  inline int device_type(const int i) { return UCL_GPU; }

  /// Returns true if host memory is efficiently addressable from device
  inline bool shared_memory() { return shared_memory(_device); }
  /// Returns true if host memory is efficiently addressable from device
  inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }

  /// Returns true if double precision is support for the current device
  inline bool double_precision() { return double_precision(_device); }
  /// Returns true if double precision is support for the device
  inline bool double_precision(const int i) {return arch(i)>=1.3;}

  /// Get the number of compute units on the current device
  inline unsigned cus() { return cus(_device); }
  /// Get the number of compute units
  inline unsigned cus(const int i)
    { return _properties[i].multiProcessorCount; }

  /// Get the number of cores in the current device
  inline unsigned cores() { return cores(_device); }
  /// Get the number of cores
  // NOTE(review): the cores-per-multiprocessor table follows CUDA compute
  // capability generations -- confirm it is meaningful for AMD targets.
  inline unsigned cores(const int i)
    { if (arch(i)<2.0) return _properties[i].multiProcessorCount*8;
      else if (arch(i)<2.1) return _properties[i].multiProcessorCount*32;
      else if (arch(i)<3.0) return _properties[i].multiProcessorCount*48;
      else return _properties[i].multiProcessorCount*192; }

  /// Get the gigabytes of global memory in the current device
  inline double gigabytes() { return gigabytes(_device); }
  /// Get the gigabytes of global memory
  inline double gigabytes(const int i)
    { return static_cast<double>(_properties[i].totalGlobalMem)/1073741824; }

  /// Get the bytes of global memory in the current device
  inline size_t bytes() { return bytes(_device); }
  /// Get the bytes of global memory
  inline size_t bytes(const int i) { return _properties[i].totalGlobalMem; }

  // Get the gigabytes of free memory in the current device
  inline double free_gigabytes() { return free_gigabytes(_device); }
  // Get the gigabytes of free memory
  inline double free_gigabytes(const int i)
    { return static_cast<double>(free_bytes(i))/1073741824; }

  // Get the bytes of free memory in the current device
  inline size_t free_bytes() { return free_bytes(_device); }
  // Get the bytes of free memory
  // NOTE(review): hipMemGetInfo queries the currently active device; the
  // index i is unused here -- confirm this is intended.
  inline size_t free_bytes(const int i) {
    CUDA_INT_TYPE dfree, dtotal;
    CU_SAFE_CALL_NS(hipMemGetInfo(&dfree, &dtotal));
    return static_cast<size_t>(dfree);
  }

  /// Return the GPGPU compute capability for current device
  inline double arch() { return arch(_device); }
  /// Return the GPGPU compute capability
  inline double arch(const int i)
    { return static_cast<double>(_properties[i].minor)/10+_properties[i].major;}

  /// Clock rate in GHz for current device
  inline double clock_rate() { return clock_rate(_device); }
  /// Clock rate in GHz
  inline double clock_rate(const int i)
    { return _properties[i].clockRate*1e-6;}

  /// Get the maximum number of threads per block
  inline size_t group_size() { return group_size(_device); }
  /// Get the maximum number of threads per block
  inline size_t group_size(const int i)
    { return _properties[i].maxThreadsPerBlock; }

  /// Return the maximum memory pitch in bytes for current device
  inline size_t max_pitch() { return max_pitch(_device); }
  /// Return the maximum memory pitch in bytes
  inline size_t max_pitch(const int i) { return _properties[i].memPitch; }

  /// Returns false if accelerator cannot be shared by multiple processes
  /** If it cannot be determined, true is returned **/
  inline bool sharing_supported() { return sharing_supported(_device); }
  /// Returns false if accelerator cannot be shared by multiple processes
  /** If it cannot be determined, true is returned **/
  inline bool sharing_supported(const int i)
    { return (_properties[i].computeMode == hipComputeModeDefault); }

  /// True if splitting device into equal subdevices supported
  inline bool fission_equal()
    { return fission_equal(_device); }
  /// True if splitting device into equal subdevices supported
  inline bool fission_equal(const int i)
    { return false; }
  /// True if splitting device into subdevices by specified counts supported
  inline bool fission_by_counts()
    { return fission_by_counts(_device); }
  /// True if splitting device into subdevices by specified counts supported
  inline bool fission_by_counts(const int i)
    { return false; }
  /// True if splitting device into subdevices by affinity domains supported
  inline bool fission_by_affinity()
    { return fission_by_affinity(_device); }
  /// True if splitting device into subdevices by affinity domains supported
  inline bool fission_by_affinity(const int i)
    { return false; }
  /// Maximum number of subdevices allowed from device fission
  inline int max_sub_devices()
    { return max_sub_devices(_device); }
  /// Maximum number of subdevices allowed from device fission
  inline int max_sub_devices(const int i)
    { return 0; }

  /// List all devices along with all properties
  inline void print_all(std::ostream &out);

  /// Select the platform that has accelerators (for compatibility with OpenCL)
  inline int set_platform_accelerator(int pid=-1) { return UCL_SUCCESS; }

  /// Load a compiled module, reusing a cached copy when available
  /** Modules are cached per program pointer, so the same binary image is
    * only loaded once. On failure the JIT log (if requested) is returned
    * through *log and UCL_COMPILE_ERROR is returned. **/
  inline int load_module(const void* program, hipModule_t& module, std::string *log=NULL){
    auto it = _loaded_modules.emplace(program, hipModule_t());
    if(!it.second){
      // already loaded -- reuse the cached module
      module = it.first->second;
      return UCL_SUCCESS;
    }
    const unsigned int num_opts=2;
    hipJitOption options[num_opts];
    void *values[num_opts];

    // set up size of compilation log buffer
    options[0] = hipJitOptionInfoLogBufferSizeBytes;
    values[0] = (void *)(int)10240;

    // set up pointer to the compilation log buffer
    options[1] = hipJitOptionInfoLogBuffer;
    char clog[10240] = { 0 };
    values[1] = clog;

    hipError_t err=hipModuleLoadDataEx(&module,program,num_opts, options,(void **)values);
    if (log!=NULL)
      *log=std::string(clog);
    if (err != hipSuccess) {
      #ifndef UCL_NO_EXIT
      std::cerr << std::endl
                << "----------------------------------------------------------\n"
                << " UCL Error: Error compiling PTX Program...\n"
                << "----------------------------------------------------------\n";
      // BUGFIX: print the log text itself; the previous code streamed the
      // std::string* pointer 'log' (possibly NULL), printing an address
      // instead of the JIT compilation log.
      std::cerr << clog << std::endl;
      #endif
      // drop the cache entry so a later retry can attempt the load again
      _loaded_modules.erase(it.first);
      return UCL_COMPILE_ERROR;
    }
    it.first->second = module;
    return UCL_SUCCESS;
  }

 private:
  std::unordered_map<const void*, hipModule_t> _loaded_modules;  // module cache keyed by program image
  int _device, _num_devices;
  std::vector<NVDProperties> _properties;  // one entry per usable device
  std::vector<hipStream_t> _cq;            // _cq[0] is the default (null) stream
  hipDevice_t _cu_device;
};
// Grabs the properties for all devices
// Initializes the HIP runtime once and caches the properties of every
// device; devices reporting a compute capability major version of 9999
// (presumably not usable GPUs -- confirm) are skipped.
UCL_Device::UCL_Device() {
  CU_SAFE_CALL_NS(hipInit(0));
  CU_SAFE_CALL_NS(hipGetDeviceCount(&_num_devices));
  for (int i=0; i<_num_devices; ++i) {
    hipDevice_t dev;
    CU_SAFE_CALL_NS(hipDeviceGet(&dev,i));

    int major, minor;
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, dev));
    if (major==9999)
      continue;

    // cache every property the accessors of this class rely on
    NVDProperties prop;
    prop.device_id = i;
    prop.major=major;
    prop.minor=minor;

    char namecstr[1024];
    CU_SAFE_CALL_NS(hipDeviceGetName(namecstr,1024,dev));
    prop.name=namecstr;

    CU_SAFE_CALL_NS(hipDeviceTotalMem(&prop.totalGlobalMem,dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.multiProcessorCount, hipDeviceAttributeMultiprocessorCount, dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxThreadsPerBlock, hipDeviceAttributeMaxThreadsPerBlock, dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxThreadsDim[0], hipDeviceAttributeMaxBlockDimX, dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxThreadsDim[1], hipDeviceAttributeMaxBlockDimY, dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxThreadsDim[2], hipDeviceAttributeMaxBlockDimZ, dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxGridSize[0], hipDeviceAttributeMaxGridDimX, dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxGridSize[1], hipDeviceAttributeMaxGridDimY, dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxGridSize[2], hipDeviceAttributeMaxGridDimZ, dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.sharedMemPerBlock, hipDeviceAttributeMaxSharedMemoryPerBlock, dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.totalConstantMemory, hipDeviceAttributeTotalConstantMemory, dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.SIMDWidth, hipDeviceAttributeWarpSize, dev));
    //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.regsPerBlock, hipDeviceAttributeMaxRegistersPerBlock, dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.clockRate, hipDeviceAttributeClockRate, dev));
    //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev));

    //#if CUDA_VERSION >= 2020
    //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.integrated, hipDeviceAttributeIntegrated, dev));
    //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.computeMode, hipDeviceAttributeComputeMode,dev));
    //#endif
    //#if CUDA_VERSION >= 3010
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.concurrentKernels, hipDeviceAttributeConcurrentKernels, dev));
    //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
    //#endif

    _properties.push_back(prop);
  }
  _device=-1;
  // queue 0 is the default (null) stream; it is never explicitly created
  _cq.push_back(hipStream_t());
  _cq.back()=0;
}
// Releases all non-default streams and resets the device via clear().
UCL_Device::~UCL_Device() {
  clear();
}
// There is only one platform under HIP; this clears any existing device
// state and (in debug builds) validates the platform id.
int UCL_Device::set_platform(const int pid) {
  clear();
  #ifdef UCL_DEBUG
  assert(pid<num_platforms());
  #endif
  return UCL_SUCCESS;
}
// Set the CUDA device to the specified device number
// 'num' indexes this class's property list; it is translated to the
// runtime device id before activating the device with hipSetDevice().
int UCL_Device::set(int num) {
  clear();
  _device=_properties[num].device_id;
  hipError_t err=hipDeviceGet(&_cu_device,_device);
  if (err!=hipSuccess) {
    #ifndef UCL_NO_EXIT
    std::cerr << "UCL Error: Could not access accelerator number " << num
              << " for use.\n";
    UCL_GERYON_EXIT;
    #endif
    return UCL_ERROR;
  }
  //hipError_t err=hipCtxCreate(&_context,0,_cu_device); deprecated and unnecessary
  err=hipSetDevice(_device);
  if (err!=hipSuccess) {
    #ifndef UCL_NO_EXIT
    std::cerr << "UCL Error: Could not set accelerator number " << num
              << " for use.\n";
    UCL_GERYON_EXIT;
    #endif
    return UCL_ERROR;
  }
  return UCL_SUCCESS;
}
// Destroy all user-created streams and reset the active device.
void UCL_Device::clear() {
  if (_device>-1) {
    // queue 0 is the default stream and is never destroyed
    for (int i=1; i<num_queues(); i++) pop_command_queue();
    CU_SAFE_CALL_NS(hipDeviceReset());
  }
  _device=-1;
}
// List all devices along with all properties
// Writes a human-readable report for every detected device to 'out'.
// Several properties are left commented out here because they are never
// filled in by the constructor (see UCL_Device()).
void UCL_Device::print_all(std::ostream &out) {
  //#if CUDA_VERSION >= 2020
  int driver_version;
  hipDriverGetVersion(&driver_version);
  out << "Driver Version: "
      << driver_version/1000 << "." << driver_version%100
      << std::endl;
  //#endif

  if (num_devices() == 0)
    out << "There is no device supporting HIP\n";
  for (int i=0; i<num_devices(); ++i) {
    out << "\nDevice " << i << ": \"" << name(i) << "\"\n";
    out << " Type of device: "
        << device_type_name(i).c_str() << std::endl;
    out << " Compute capability: "
        << arch(i) << std::endl;
    out << " Double precision support: ";
    if (double_precision(i))
      out << "Yes\n";
    else
      out << "No\n";
    out << " Total amount of global memory: "
        << gigabytes(i) << " GB\n";
    //#if CUDA_VERSION >= 2000
    out << " Number of compute units/multiprocessors: "
        << _properties[i].multiProcessorCount << std::endl;
    out << " Number of cores: "
        << cores(i) << std::endl;
    //#endif
    out << " Total amount of constant memory: "
        << _properties[i].totalConstantMemory << " bytes\n";
    out << " Total amount of local/shared memory per block: "
        << _properties[i].sharedMemPerBlock << " bytes\n";
    out << " Total number of registers available per block: "
        << _properties[i].regsPerBlock << std::endl;
    out << " Warp size: "
        << _properties[i].SIMDWidth << std::endl;
    out << " Maximum number of threads per block: "
        << _properties[i].maxThreadsPerBlock << std::endl;
    out << " Maximum group size (# of threads per block) "
        << _properties[i].maxThreadsDim[0] << " x "
        << _properties[i].maxThreadsDim[1] << " x "
        << _properties[i].maxThreadsDim[2] << std::endl;
    out << " Maximum item sizes (# threads for each dim) "
        << _properties[i].maxGridSize[0] << " x "
        << _properties[i].maxGridSize[1] << " x "
        << _properties[i].maxGridSize[2] << std::endl;
    //out << " Maximum memory pitch: "
    //    << max_pitch(i) << " bytes\n";
    //out << " Texture alignment: "
    //    << _properties[i].textureAlign << " bytes\n";
    out << " Clock rate: "
        << clock_rate(i) << " GHz\n";
    //#if CUDA_VERSION >= 2020
    //out << " Run time limit on kernels: ";
    //if (_properties[i].kernelExecTimeoutEnabled)
    //  out << "Yes\n";
    //else
    //  out << "No\n";
    out << " Integrated: ";
    if (_properties[i].integrated)
      out << "Yes\n";
    else
      out << "No\n";
    //out << " Support host page-locked memory mapping: ";
    //if (_properties[i].canMapHostMemory)
    //  out << "Yes\n";
    //else
    //  out << "No\n";
    out << " Compute mode: ";
    if (_properties[i].computeMode == hipComputeModeDefault)
      out << "Default\n"; // multiple threads can use device
    //#if CUDA_VERSION >= 8000
    // else if (_properties[i].computeMode == hipComputeModeExclusiveProcess)
    //#else
    else if (_properties[i].computeMode == hipComputeModeExclusive)
    //#endif
      out << "Exclusive\n"; // only thread can use device
    else if (_properties[i].computeMode == hipComputeModeProhibited)
      out << "Prohibited\n"; // no thread can use device
    //#if CUDART_VERSION >= 4000
    else if (_properties[i].computeMode == hipComputeModeExclusiveProcess)
      out << "Exclusive Process\n"; // multiple threads 1 process
    //#endif
    else
      out << "Unknown\n";
    //#endif
    //#if CUDA_VERSION >= 3010
    out << " Concurrent kernel execution: ";
    if (_properties[i].concurrentKernels)
      out << "Yes\n";
    else
      out << "No\n";
    //out << " Device has ECC support enabled: ";
    //if (_properties[i].ECCEnabled)
    //  out << "Yes\n";
    //else
    //  out << "No\n";
    //#endif
  }
}
}
#endif

298
lib/gpu/geryon/hip_kernel.h Normal file
View File

@ -0,0 +1,298 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef HIP_KERNEL
#define HIP_KERNEL
#include <hip/hip_runtime.h>
#include "hip_device.h"
#include <fstream>
#include <string>
#include <iostream>
namespace ucl_hip {
class UCL_Texture;
template <class numtyp> class UCL_D_Vec;
template <class numtyp> class UCL_D_Mat;
template <class hosttype, class devtype> class UCL_Vector;
template <class hosttype, class devtype> class UCL_Matrix;
#define UCL_MAX_KERNEL_ARGS 256
/// Class storing 1 or more kernel functions from a single string or file
class UCL_Program {
  UCL_Device* _device_ptr;  // device whose module loader/cache is used

 public:
  /// Create a program container bound to the device's default queue
  inline UCL_Program(UCL_Device &device) { _device_ptr = &device; _cq=device.cq(); }
  /// Create a program container and immediately load/compile the program
  inline UCL_Program(UCL_Device &device, const void *program,
                     const char *flags="", std::string *log=NULL) {
    _device_ptr = &device; _cq=device.cq();
    init(device);
    load_string(program,flags,log);
  }
  inline ~UCL_Program() {}

  /// Initialize the program with a device
  inline void init(UCL_Device &device) { _device_ptr = &device; _cq=device.cq(); }

  /// Clear any data associated with program
  /** \note Must call init() after each clear **/
  inline void clear() { }

  /// Load a program from a file and compile with flags
  // Reads the whole file into memory and delegates to load_string();
  // returns UCL_FILE_NOT_FOUND if the file cannot be opened.
  inline int load(const char *filename, const char *flags="", std::string *log=NULL) {
    std::ifstream in(filename);
    if (!in || in.is_open()==false) {
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Could not open kernel file: "
                << filename << std::endl;
      UCL_GERYON_EXIT;
      #endif
      return UCL_FILE_NOT_FOUND;
    }

    std::string program((std::istreambuf_iterator<char>(in)),
                        std::istreambuf_iterator<char>());
    in.close();
    return load_string(program.c_str(),flags,log);
  }

  /// Load a program from a string and compile with flags
  // NOTE(review): 'flags' is unused here; loading is delegated to the
  // device, which caches modules by program pointer -- confirm intended.
  inline int load_string(const void *program, const char *flags="", std::string *log=NULL) {
    return _device_ptr->load_module(program, _module, log);
  }

  friend class UCL_Kernel;
 private:
  hipModule_t _module;  // loaded module holding the kernels
  hipStream_t _cq;      // command queue kernels created from this program use
  friend class UCL_Texture;
};
/// Class for dealing with HIP module kernels (wraps a hipFunction_t)
class UCL_Kernel {
 public:
  UCL_Kernel() : _dimensions(1), _num_args(0) {
    _num_blocks[0]=0;
  }
  UCL_Kernel(UCL_Program &program, const char *function) :
    _dimensions(1), _num_args(0) {
    _num_blocks[0]=0;
    set_function(program,function);
    _cq=program._cq;
  }
  ~UCL_Kernel() {}
  /// Clear any function associated with the kernel
  inline void clear() { }
  /// Get the kernel function from a program
  /** \ret UCL_ERROR_FLAG (UCL_SUCCESS, UCL_FILE_NOT_FOUND, UCL_ERROR) **/
  inline int set_function(UCL_Program &program, const char *function) {
    hipError_t err=hipModuleGetFunction(&_kernel,program._module,function);
    if (err!=hipSuccess) {
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Could not find function: " << function
                << " in program.\n";
      UCL_GERYON_EXIT;
      #endif
      return UCL_FUNCTION_NOT_FOUND;
    }
    // The kernel inherits the program's default command queue.
    _cq=program._cq;
    return UCL_SUCCESS;
  }
  /// Set the kernel argument.
  /** If not a device pointer, this must be repeated each time the argument
    * changes
    * \note To set kernel parameter i (i>0), parameter i-1 must be set
    * \note On this HIP backend only appending (index==_num_args) is
    *       implemented; re-setting an already-added argument asserts. **/
  template <class dtype>
  inline void set_arg(const unsigned index, const dtype * const arg) {
    if (index==_num_args)
      add_arg(arg);
    else if (index<_num_args){
      assert(0==1); // not implemented
    }
    else
      assert(0==1); // Must add kernel parameters in sequential order
  }
  /// Set a geryon container as a kernel argument.
  template <class numtyp>
  inline void set_arg(const UCL_D_Vec<numtyp> * const arg)
    { set_arg(&arg->begin()); }
  /// Set a geryon container as a kernel argument.
  template <class numtyp>
  inline void set_arg(const UCL_D_Mat<numtyp> * const arg)
    { set_arg(&arg->begin()); }
  /// Set a geryon container as a kernel argument.
  template <class hosttype, class devtype>
  inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg)
    { set_arg(&arg->device.begin()); }
  /// Set a geryon container as a kernel argument.
  template <class hosttype, class devtype>
  inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg)
    { set_arg(&arg->device.begin()); }
  /// Add a kernel argument.
  inline void add_arg(const hipDeviceptr_t* const arg) {
    add_arg<void*>((void**)arg);
  }
  /// Add a kernel argument.
  /** Arguments are copied by value into a single byte buffer that is later
      handed to the launch via HIP_LAUNCH_PARAM_BUFFER_POINTER. Each value
      is written at the next offset rounded up to alignof(dtype), so the
      buffer layout follows the natural alignment of each parameter. **/
  template <class dtype>
  inline void add_arg(const dtype* const arg) {
    const auto old_size = _hip_kernel_args.size();
    // round old_size up to the next multiple of alignof(dtype)
    const auto aligned_size = (old_size+alignof(dtype)-1) & ~(alignof(dtype)-1);
    const auto arg_size = sizeof(dtype);
    _hip_kernel_args.resize(aligned_size + arg_size);
    *((dtype*)(&_hip_kernel_args[aligned_size])) = *arg;
    _num_args++;
    if (_num_args>UCL_MAX_KERNEL_ARGS) assert(0==1);
  }
  /// Add a geryon container as a kernel argument.
  template <class numtyp>
  inline void add_arg(const UCL_D_Vec<numtyp> * const arg)
    { add_arg(&arg->begin()); }
  /// Add a geryon container as a kernel argument.
  template <class numtyp>
  inline void add_arg(const UCL_D_Mat<numtyp> * const arg)
    { add_arg(&arg->begin()); }
  /// Add a geryon container as a kernel argument.
  template <class hosttype, class devtype>
  inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg)
    { add_arg(&arg->device.begin()); }
  /// Add a geryon container as a kernel argument.
  template <class hosttype, class devtype>
  inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg)
    { add_arg(&arg->device.begin()); }
  /// Set the number of thread blocks and the number of threads in each block
  /** \note This should be called before any arguments have been added
      \note The default command queue is used for the kernel execution **/
  inline void set_size(const size_t num_blocks, const size_t block_size) {
    _dimensions=1;
    _num_blocks[0]=num_blocks;
    _num_blocks[1]=1;
    _num_blocks[2]=1;
    _block_size[0]=block_size;
    _block_size[1]=1;
    _block_size[2]=1;
  }
  /// Set the number of thread blocks and the number of threads in each block
  /** \note This should be called before any arguments have been added
      \note The default command queue for the kernel is changed to cq **/
  inline void set_size(const size_t num_blocks, const size_t block_size,
                       command_queue &cq)
    { _cq=cq; set_size(num_blocks,block_size); }
  /// Set the number of thread blocks and the number of threads in each block
  /** \note This should be called before any arguments have been added
      \note The default command queue is used for the kernel execution **/
  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
                       const size_t block_size_x, const size_t block_size_y) {
    _dimensions=2;
    _num_blocks[0]=num_blocks_x;
    _num_blocks[1]=num_blocks_y;
    _num_blocks[2]=1;
    _block_size[0]=block_size_x;
    _block_size[1]=block_size_y;
    _block_size[2]=1;
  }
  /// Set the number of thread blocks and the number of threads in each block
  /** \note This should be called before any arguments have been added
      \note The default command queue for the kernel is changed to cq **/
  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
                       const size_t block_size_x, const size_t block_size_y,
                       command_queue &cq)
    {_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);}
  /// Set the number of thread blocks and the number of threads in each block
  /** \note This should be called before any arguments have been added
      \note The default command queue is used for the kernel execution
      \note NOTE(review): this overload sets a 3-D block size yet leaves
            _dimensions at 2 and _num_blocks[2] at 1. run() does not read
            _dimensions, so this looks harmless, but confirm intent. **/
  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
                       const size_t block_size_x,
                       const size_t block_size_y, const size_t block_size_z) {
    _dimensions=2;
    _num_blocks[0]=num_blocks_x;
    _num_blocks[1]=num_blocks_y;
    _num_blocks[2]=1;
    _block_size[0]=block_size_x;
    _block_size[1]=block_size_y;
    _block_size[2]=block_size_z;
  }
  /// Set the number of thread blocks and the number of threads in each block
  /** \note This should be called before any arguments have been added
      \note The default command queue for the kernel is changed to cq **/
  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
                       const size_t block_size_x, const size_t block_size_y,
                       const size_t block_size_z, command_queue &cq) {
    _cq=cq;
    set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y,
             block_size_z);
  }
  /// Run the kernel in the default command queue
  /** The packed argument buffer is passed through the extra-config
      mechanism (HIP_LAUNCH_PARAM_BUFFER_POINTER / _BUFFER_SIZE / _END)
      instead of a kernelParams array, which is why add_arg() packs values
      into one contiguous, aligned byte buffer. **/
  inline void run() {
    size_t args_size = _hip_kernel_args.size();
    void *config[] = {
      HIP_LAUNCH_PARAM_BUFFER_POINTER, (void*)_hip_kernel_args.data(),
      HIP_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
      HIP_LAUNCH_PARAM_END
    };
    const auto res = hipModuleLaunchKernel(_kernel,_num_blocks[0],_num_blocks[1],
                                  _num_blocks[2],_block_size[0],_block_size[1],
                                  _block_size[2],0,_cq, NULL, config);
    CU_SAFE_CALL(res);
  }
  /// Clear any arguments associated with the kernel
  inline void clear_args() {
    _num_args=0;
    _hip_kernel_args.clear();
  }
  /// Return the default command queue/stream associated with this data
  inline command_queue & cq() { return _cq; }
  /// Change the default command queue associated with matrix
  inline void cq(command_queue &cq_in) { _cq=cq_in; }
  #include "ucl_arg_kludge.h"
 private:
  hipFunction_t _kernel;              // resolved module function handle
  hipStream_t _cq;                    // default stream used by run()
  unsigned _dimensions;               // launch dimensionality (not read by run())
  unsigned _num_blocks[3];            // grid size (x,y,z)
  unsigned _num_args;                 // number of packed arguments
  friend class UCL_Texture;
  unsigned _block_size[3];            // block size (x,y,z)
  std::vector<char> _hip_kernel_args; // packed by-value argument buffer
};
} // namespace
#endif

View File

@ -0,0 +1,83 @@
#ifndef HIP_MACROS_H
#define HIP_MACROS_H
// Error-handling and safe-call macros for the HIP Geryon backend.
// The CU_/NVD_ prefixes are kept from the CUDA backend so shared Geryon
// sources compile unchanged; the calls below are HIP API calls.
#include <cstdio>
#include <cassert>
#include <hip/hip_runtime.h>
//#if CUDA_VERSION >= 3020
#define CUDA_INT_TYPE size_t
//#else
//#define CUDA_INT_TYPE unsigned
//#endif
#ifdef MPI_GERYON
#include "mpi.h"
// Abort path under MPI: take down the whole job unless MPI has already
// been finalized.
#define NVD_GERYON_EXIT do { \
  int is_final; \
  MPI_Finalized(&is_final); \
  if (!is_final) \
    MPI_Abort(MPI_COMM_WORLD,-1); \
  } while(0)
#else
#define NVD_GERYON_EXIT assert(0==1)
#endif
#ifndef UCL_GERYON_EXIT
#define UCL_GERYON_EXIT NVD_GERYON_EXIT
#endif
// UCL_DEBUG turns on per-call synchronization and destructor checking.
#ifdef UCL_DEBUG
#define UCL_SYNC_DEBUG
#define UCL_DESTRUCT_CHECK
#endif
#ifndef UCL_NO_API_CHECK
// Check a HIP call's return code; report file/line and abort on failure.
#define CU_SAFE_CALL_NS( call ) do { \
    hipError_t err = call; \
    if( hipSuccess != err) { \
      fprintf(stderr, "HIP runtime error %d in call at file '%s' in line %i.\n", \
              err, __FILE__, __LINE__ ); \
      NVD_GERYON_EXIT; \
    } } while (0)
#ifdef UCL_SYNC_DEBUG
// Debug variant: also synchronize after the call so errors surface at the
// call that caused them rather than at a later API call.
#define CU_SAFE_CALL( call ) do { \
    CU_SAFE_CALL_NS( call ); \
    hipError_t err=hipCtxSynchronize(); \
    if( hipSuccess != err) { \
      fprintf(stderr, "HIP runtime error %d in file '%s' in line %i.\n", \
              err, __FILE__, __LINE__ ); \
      NVD_GERYON_EXIT; \
    } } while (0)
#else
#define CU_SAFE_CALL( call ) CU_SAFE_CALL_NS( call )
#endif
#else // not DEBUG
// void macros for performance reasons
#define CU_SAFE_CALL_NS( call ) call
#define CU_SAFE_CALL( call) call
#endif
// Destructor-time variants: only checked when UCL_DESTRUCT_CHECK is
// defined; otherwise the return value is deliberately ignored.
#ifdef UCL_DESTRUCT_CHECK
#define CU_DESTRUCT_CALL( call) CU_SAFE_CALL( call)
#define CU_DESTRUCT_CALL_NS( call) CU_SAFE_CALL_NS( call)
#else
#define CU_DESTRUCT_CALL( call) call
#define CU_DESTRUCT_CALL_NS( call) call
#endif
#endif

43
lib/gpu/geryon/hip_mat.h Normal file
View File

@ -0,0 +1,43 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
/*! \file */
#ifndef HIP_MAT_H
#define HIP_MAT_H
#include <hip/hip_runtime.h>
#include "hip_memory.h"
/// Namespace for HIP routines
namespace ucl_hip {
// The _UCL_*/UCL_* macros below gate which sections of the shared generic
// container headers (ucl_*.h) are compiled into this namespace.
#define _UCL_MAT_ALLOW
#define _UCL_DEVICE_PTR_MAT
#include "ucl_basemat.h"
#include "ucl_h_vec.h"
#include "ucl_h_mat.h"
#include "ucl_d_vec.h"
#include "ucl_d_mat.h"
#include "ucl_s_obj_help.h"
#include "ucl_vector.h"
#include "ucl_matrix.h"
#undef _UCL_DEVICE_PTR_MAT
#undef _UCL_MAT_ALLOW
#define UCL_COPY_ALLOW
#include "ucl_copy.h"
#undef UCL_COPY_ALLOW
#define UCL_PRINT_ALLOW
#include "ucl_print.h"
#undef UCL_PRINT_ALLOW
} // namespace ucl_hip
#endif

279
lib/gpu/geryon/hip_memory.h Normal file
View File

@ -0,0 +1,279 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef HIP_MEMORY_H
#define HIP_MEMORY_H
#include <hip/hip_runtime.h>
#include <iostream>
#include <cassert>
#include <cstring>
#include "hip_macros.h"
#include "hip_device.h"
#include "ucl_types.h"
namespace ucl_hip {
// --------------------------------------------------------------------------
// - API Specific Types
// --------------------------------------------------------------------------
//typedef dim3 ucl_kernel_dim;
#ifdef __HIP_PLATFORM_NVCC__
// When HIP targets the NVCC platform, provide the array-format codes used
// by the texture shims (see hip_texture.h); the numeric values mirror the
// CUDA driver's CUarray_format constants.
typedef enum hipArray_Format {
    HIP_AD_FORMAT_UNSIGNED_INT8 = 0x01,
    HIP_AD_FORMAT_UNSIGNED_INT16 = 0x02,
    HIP_AD_FORMAT_UNSIGNED_INT32 = 0x03,
    HIP_AD_FORMAT_SIGNED_INT8 = 0x08,
    HIP_AD_FORMAT_SIGNED_INT16 = 0x09,
    HIP_AD_FORMAT_SIGNED_INT32 = 0x0a,
    HIP_AD_FORMAT_HALF = 0x10,
    HIP_AD_FORMAT_FLOAT = 0x20
}hipArray_Format;
#endif
// --------------------------------------------------------------------------
// - API SPECIFIC DEVICE POINTERS
// --------------------------------------------------------------------------
// Geryon's generic device-pointer alias for this backend.
typedef hipDeviceptr_t device_ptr;
// --------------------------------------------------------------------------
// - HOST MEMORY ALLOCATION ROUTINES
// --------------------------------------------------------------------------
template <class mat_type, class copy_type>
inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
hipError_t err=hipSuccess;
if (kind==UCL_NOT_PINNED)
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
else if (kind==UCL_WRITE_ONLY)
err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocWriteCombined);
else
err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocDefault);
if (err!=hipSuccess || *(mat.host_ptr())==NULL)
return UCL_MEMORY_ERROR;
mat.cq()=cm.cq();
return UCL_SUCCESS;
}
// Allocate n bytes of host storage for mat and inherit the device's default
// command queue. See the copy_type overload for the meaning of kind/kind2.
template <class mat_type>
inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
                       const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
  hipError_t err=hipSuccess;
  if (kind==UCL_NOT_PINNED) {
    *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
  } else {
    const unsigned int alloc_flag = (kind==UCL_WRITE_ONLY) ?
      hipHostMallocWriteCombined : hipHostMallocDefault;
    err=hipHostMalloc((void **)mat.host_ptr(),n,alloc_flag);
  }
  if (err!=hipSuccess || *(mat.host_ptr())==NULL)
    return UCL_MEMORY_ERROR;
  mat.cq()=dev.cq();
  return UCL_SUCCESS;
}
template <class mat_type>
inline void _host_free(mat_type &mat) {
if (mat.kind()==UCL_VIEW)
return;
else if (mat.kind()!=UCL_NOT_PINNED)
CU_DESTRUCT_CALL(hipHostFree(mat.begin()));
else
free(mat.begin());
}
// Resize mat's host storage to n bytes by freeing and reallocating with the
// same pinning mode the container already carries. Contents are NOT kept.
template <class mat_type>
inline int _host_resize(mat_type &mat, const size_t n) {
  _host_free(mat);
  hipError_t err=hipSuccess;
  switch (mat.kind()) {
  case UCL_NOT_PINNED:
    *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
    break;
  case UCL_WRITE_ONLY:
    err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocWriteCombined);
    break;
  default:
    err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocDefault);
  }
  return (err!=hipSuccess || *(mat.host_ptr())==NULL) ?
    UCL_MEMORY_ERROR : UCL_SUCCESS;
}
// --------------------------------------------------------------------------
// - DEVICE MEMORY ALLOCATION ROUTINES
// --------------------------------------------------------------------------
template <class mat_type, class copy_type>
inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n,
const enum UCL_MEMOPT kind) {
hipError_t err=hipMalloc((void**)&mat.cbegin(),n);
if (err!=hipSuccess)
return UCL_MEMORY_ERROR;
mat.cq()=cm.cq();
return UCL_SUCCESS;
}
template <class mat_type>
inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
const enum UCL_MEMOPT kind) {
hipError_t err=hipMalloc((void**)&mat.cbegin(),n);
if (err!=hipSuccess)
return UCL_MEMORY_ERROR;
mat.cq()=dev.cq();
return UCL_SUCCESS;
}
template <class mat_type, class copy_type>
inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t rows,
const size_t cols, size_t &pitch,
const enum UCL_MEMOPT kind) {
hipError_t err;
size_t upitch;
err=hipMallocPitch((void**)&mat.cbegin(),&upitch,
cols*sizeof(typename mat_type::data_type),rows);
pitch=static_cast<size_t>(upitch);
if (err!=hipSuccess)
return UCL_MEMORY_ERROR;
mat.cq()=cm.cq();
return UCL_SUCCESS;
}
// Allocate a pitched 2D device buffer using the device's default command
// queue; the runtime-chosen row pitch is returned in pitch (bytes).
// NOTE: the original declared an extra, non-deducible copy_type template
// parameter, which made this overload unselectable by argument deduction
// (UCL_Device callers silently fell through to the generic copy_type
// overload). Dropping the dead parameter makes this overload reachable and,
// by partial ordering, preferred for UCL_Device; behavior is identical.
template <class mat_type>
inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
                         const size_t cols, size_t &pitch,
                         const enum UCL_MEMOPT kind) {
  hipError_t err;
  size_t upitch;
  err=hipMallocPitch((void**)&mat.cbegin(),&upitch,
                     cols*sizeof(typename mat_type::data_type),rows);
  pitch=static_cast<size_t>(upitch);
  if (err!=hipSuccess)
    return UCL_MEMORY_ERROR;
  mat.cq()=d.cq();
  return UCL_SUCCESS;
}
// Release device memory owned by mat; views own nothing and are skipped.
template <class mat_type>
inline void _device_free(mat_type &mat) {
  if (mat.kind()==UCL_VIEW)
    return;
  CU_DESTRUCT_CALL(hipFree((void*)mat.cbegin()));
}
template <class mat_type>
inline int _device_resize(mat_type &mat, const size_t n) {
_device_free(mat);
hipError_t err=hipMalloc((void**)&mat.cbegin(),n);
if (err!=hipSuccess)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
template <class mat_type>
inline int _device_resize(mat_type &mat, const size_t rows,
const size_t cols, size_t &pitch) {
_device_free(mat);
hipError_t err;
size_t upitch;
err=hipMallocPitch((void**)&mat.cbegin(),&upitch,
cols*sizeof(typename mat_type::data_type),rows);
pitch=static_cast<size_t>(upitch);
if (err!=hipSuccess)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
// View helpers: compute the raw device pointer for a view of an existing
// allocation. The offset-taking overloads interpret offset in elements and
// numsize as the element size in bytes.
inline void _device_view(hipDeviceptr_t *ptr, hipDeviceptr_t &in) {
  *ptr=in;
}
// Raw host-pointer overloads zero the device pointer -- presumably device
// views of plain host pointers are unsupported on this backend; confirm
// against the generic ucl_d_* containers before relying on this.
template <class numtyp>
inline void _device_view(hipDeviceptr_t *ptr, numtyp *in) {
  *ptr=0;
}
inline void _device_view(hipDeviceptr_t *ptr, hipDeviceptr_t &in,
                         const size_t offset, const size_t numsize) {
  *ptr=(hipDeviceptr_t)(((char*)in)+offset*numsize);
}
template <class numtyp>
inline void _device_view(hipDeviceptr_t *ptr, numtyp *in,
                         const size_t offset, const size_t numsize) {
  *ptr=0;
}
// --------------------------------------------------------------------------
// - DEVICE IMAGE ALLOCATION ROUTINES
// --------------------------------------------------------------------------
// Image/texture-array allocations are not implemented for the HIP backend;
// any call aborts via assert.
template <class mat_type, class copy_type>
inline void _device_image_alloc(mat_type &mat, copy_type &cm, const size_t rows,
                                const size_t cols) {
  assert(0==1);
}
template <class mat_type, class copy_type>
inline void _device_image_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
                                const size_t cols) {
  assert(0==1);
}
template <class mat_type>
inline void _device_image_free(mat_type &mat) {
  assert(0==1);
}
// --------------------------------------------------------------------------
// - ZERO ROUTINES
// --------------------------------------------------------------------------
// Zero n bytes of host memory.
inline void _host_zero(void *ptr, const size_t n) {
  memset(ptr,0,n);
}
// Zero n bytes of device memory, asynchronously on the given stream.
template <class mat_type>
inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) {
  CU_SAFE_CALL(hipMemsetAsync((void*)mat.cbegin(),0,n,cq));
}
// --------------------------------------------------------------------------
// - MEMCPY ROUTINES
// --------------------------------------------------------------------------
// Derive the hipMemcpyKind for a copy src -> dst from the containers'
// MEM_TYPE constants (asserted to be 0 or 1; presumably 1 == host-side
// storage -- confirm against the container definitions). With that
// convention, (1-src)*2+(1-dst) maps onto the hipMemcpyKind enum:
// HostToHost(0), HostToDevice(1), DeviceToHost(2), DeviceToDevice(3).
template<class mat1, class mat2>
hipMemcpyKind _memcpy_kind(mat1 &dst, const mat2 &src){
  assert(mat1::MEM_TYPE < 2 && mat2::MEM_TYPE < 2);
  return (hipMemcpyKind)((1 - mat2::MEM_TYPE)*2 + (1 - mat1::MEM_TYPE));
}
// Copy n bytes from src to dst (direction chosen by _memcpy_kind).
// Blocking variant.
template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n) {
  CU_SAFE_CALL(hipMemcpy((void*)dst.begin(), (void*)src.begin(), n, _memcpy_kind(dst, src)));
}
// Asynchronous variant on stream cq.
template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n, hipStream_t &cq) {
  CU_SAFE_CALL(hipMemcpyAsync((void*)dst.begin(), (void*)src.begin(), n, _memcpy_kind(dst, src), cq));
}
// Pitched 2D copy; pitches and cols follow the hipMemcpy2D convention
// (byte widths). Blocking variant.
template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
                       const size_t spitch, const size_t cols,
                       const size_t rows) {
  CU_SAFE_CALL(hipMemcpy2D((void*)dst.begin(), dpitch, (void*)src.begin(), spitch, cols, rows, _memcpy_kind(dst, src)));
}
// Pitched 2D copy, asynchronous on stream cq.
template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
                       const size_t spitch, const size_t cols,
                       const size_t rows,hipStream_t &cq) {
  CU_SAFE_CALL(hipMemcpy2DAsync((void*)dst.begin(), dpitch, (void*)src.begin(), spitch, cols, rows, _memcpy_kind(dst, src), cq));
}
} // namespace ucl_hip
#endif

View File

@ -0,0 +1,113 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef HIP_TEXTURE
#define HIP_TEXTURE
#include <hip/hip_runtime.h>
#include "hip_kernel.h"
#include "hip_mat.h"
namespace ucl_hip {
#ifdef __HIP_PLATFORM_NVCC__
// HIP-on-NVCC shims: forward the texture-reference operations to the CUDA
// driver API and convert CUresult codes to hipError_t.
inline hipError_t hipModuleGetTexRef(CUtexref* texRef, hipModule_t hmod, const char* name){
  return hipCUResultTohipError(cuModuleGetTexRef(texRef, hmod, name));
}
inline hipError_t hipTexRefSetFormat(CUtexref tex, hipArray_Format fmt, int NumPackedComponents) {
  return hipCUResultTohipError(cuTexRefSetFormat(tex, (CUarray_format)fmt, NumPackedComponents ));
}
inline hipError_t hipTexRefSetAddress(size_t* offset, CUtexref tex, hipDeviceptr_t devPtr, size_t size) {
  return hipCUResultTohipError(cuTexRefSetAddress(offset, tex, devPtr, size));
}
#endif
/// Class storing a texture reference
/** On the NVCC platform this wraps a module texture reference. On other
    HIP platforms textures are emulated: get_texture() resolves a
    module-level global variable of the given name, and _bind_float()
    writes the buffer's device pointer into that global so kernels read
    through the pointer instead of a texture fetch. **/
class UCL_Texture {
 public:
  UCL_Texture() {}
  ~UCL_Texture() {}
  /// Construct with a specified texture reference
  inline UCL_Texture(UCL_Program &prog, const char *texture_name)
    { get_texture(prog,texture_name); }
  /// Set the texture reference for this object
  inline void get_texture(UCL_Program &prog, const char *texture_name)
  {
    #ifdef __HIP_PLATFORM_NVCC__
    CU_SAFE_CALL(hipModuleGetTexRef(&_tex, prog._module, texture_name));
    #else
    size_t _global_var_size;
    CU_SAFE_CALL(hipModuleGetGlobal(&_device_ptr_to_global_var, &_global_var_size, prog._module, texture_name));
    #endif
  }
  /// Bind a float array where each fetch grabs a vector of length numel
  template<class numtyp>
  inline void bind_float(UCL_D_Vec<numtyp> &vec, const unsigned numel)
    { _bind_float(vec,numel); }
  /// Bind a float array where each fetch grabs a vector of length numel
  template<class numtyp>
  inline void bind_float(UCL_D_Mat<numtyp> &vec, const unsigned numel)
    { _bind_float(vec,numel); }
  /// Bind a float array where each fetch grabs a vector of length numel
  template<class numtyp, class devtyp>
  inline void bind_float(UCL_Vector<numtyp, devtyp> &vec, const unsigned numel)
    { _bind_float(vec.device,numel); }
  /// Bind a float array where each fetch grabs a vector of length numel
  template<class numtyp, class devtyp>
  inline void bind_float(UCL_Matrix<numtyp, devtyp> &vec, const unsigned numel)
    { _bind_float(vec.device,numel); }
  /// Unbind the texture reference from the memory allocation
  inline void unbind() { }
  /// Make a texture reference available to kernel
  inline void allow(UCL_Kernel &kernel) {
    // no-op on HIP; kept for interface compatibility with other backends
    //#if CUDA_VERSION < 4000
    //CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex));
    //#endif
  }
 private:
  #ifdef __HIP_PLATFORM_NVCC__
  CUtexref _tex;                      // CUDA texture reference (NVCC path)
  #else
  void* _device_ptr_to_global_var;    // module global used to emulate the texture
  #endif
  friend class UCL_Kernel;
  template<class mat_typ>
  inline void _bind_float(mat_typ &vec, const unsigned numel) {
    #ifdef UCL_DEBUG
    assert(numel!=0 && numel<5);
    #endif
    #ifdef __HIP_PLATFORM_NVCC__
    // float data is fetched natively; wider element types (e.g. double) are
    // fetched as pairs of signed 32-bit ints (numel*2 when numel<=2).
    if (vec.element_size()==sizeof(float))
      CU_SAFE_CALL(hipTexRefSetFormat(_tex, HIP_AD_FORMAT_FLOAT, numel));
    else {
      if (numel>2)
        CU_SAFE_CALL(hipTexRefSetFormat(_tex, HIP_AD_FORMAT_SIGNED_INT32, numel));
      else
        CU_SAFE_CALL(hipTexRefSetFormat(_tex,HIP_AD_FORMAT_SIGNED_INT32,numel*2));
    }
    CU_SAFE_CALL(hipTexRefSetAddress(NULL, _tex, vec.cbegin(), vec.numel()*vec.element_size()));
    #else
    // Emulation path: publish the buffer's device address through the
    // module global resolved in get_texture().
    void* data_ptr = (void*)vec.cbegin();
    CU_SAFE_CALL(hipMemcpyHtoD(hipDeviceptr_t(_device_ptr_to_global_var), &data_ptr, sizeof(void*)));
    #endif
  }
};
} // namespace
#endif

107
lib/gpu/geryon/hip_timer.h Normal file
View File

@ -0,0 +1,107 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef HIP_TIMER_H
#define HIP_TIMER_H
#include <hip/hip_runtime.h>
#include "hip_macros.h"
#include "hip_device.h"
namespace ucl_hip {
/// Event-based timer for HIP streams
/** Wraps a start/stop hipEvent_t pair recorded on a command queue and
    accumulates elapsed intervals (in ms) into a running total. **/
class UCL_Timer {
 public:
  inline UCL_Timer() : _total_time(0.0f), _initialized(false) { }
  inline UCL_Timer(UCL_Device &dev) : _total_time(0.0f), _initialized(false)
    { init(dev); }
  inline ~UCL_Timer() { clear(); }
  /// Clear any data associated with timer
  /** \note init() must be called to reuse timer after a clear() **/
  inline void clear() {
    if (!_initialized)
      return;
    CU_DESTRUCT_CALL(hipEventDestroy(start_event));
    CU_DESTRUCT_CALL(hipEventDestroy(stop_event));
    _initialized=false;
    _total_time=0.0;
  }
  /// Initialize default command queue for timing
  inline void init(UCL_Device &dev) { init(dev, dev.cq()); }
  /// Initialize command queue for timing
  inline void init(UCL_Device &dev, command_queue &cq) {
    clear();                       // release any previous event pair first
    _cq=cq;
    _initialized=true;
    CU_SAFE_CALL( hipEventCreateWithFlags(&start_event,0) );
    CU_SAFE_CALL( hipEventCreateWithFlags(&stop_event,0) );
  }
  /// Start timing on command queue
  inline void start() { CU_SAFE_CALL(hipEventRecord(start_event,_cq)); }
  /// Stop timing on command queue
  inline void stop() { CU_SAFE_CALL(hipEventRecord(stop_event,_cq)); }
  /// Block until the start event has been reached on device
  inline void sync_start()
    { CU_SAFE_CALL(hipEventSynchronize(start_event)); }
  /// Block until the stop event has been reached on device
  inline void sync_stop()
    { CU_SAFE_CALL(hipEventSynchronize(stop_event)); }
  /// Set the time elapsed to zero (not the total_time)
  /** Records both events back-to-back so the next time() reads ~0 ms. **/
  inline void zero() {
    CU_SAFE_CALL(hipEventRecord(start_event,_cq));
    CU_SAFE_CALL(hipEventRecord(stop_event,_cq));
  }
  /// Set the total time to zero
  inline void zero_total() { _total_time=0.0; }
  /// Add time from previous start and stop to total
  /** Forces synchronization; returns the added interval in seconds. **/
  inline double add_to_total() {
    const double interval=time();
    _total_time+=interval;
    return interval/1000.0;
  }
  /// Add a user specified time to the total (ms)
  inline void add_time_to_total(const double t) { _total_time+=t; }
  /// Return the time (ms) of last start to stop - Forces synchronization
  inline double time() {
    float ms;
    CU_SAFE_CALL(hipEventSynchronize(stop_event));
    CU_SAFE_CALL( hipEventElapsedTime(&ms,start_event,stop_event) );
    return ms;
  }
  /// Return the time (s) of last start to stop - Forces synchronization
  inline double seconds() { return time()/1000.0; }
  /// Return the total time in ms
  inline double total_time() { return _total_time; }
  /// Return the total time in seconds
  inline double total_seconds() { return _total_time/1000.0; }
 private:
  hipEvent_t start_event, stop_event;  // interval endpoints
  hipStream_t _cq;                     // stream the events are recorded on
  double _total_time;                  // accumulated interval total (ms)
  bool _initialized;                   // true once the event pair exists
};
} // namespace
#endif

View File

@ -36,6 +36,11 @@ using namespace ucl_cudadr;
using namespace ucl_cudart;
#endif
#ifdef UCL_HIP
#include "hip_device.h"
using namespace ucl_hip;
#endif
int main(int argc, char** argv) {
UCL_Device cop;
std::cout << "Found " << cop.num_platforms() << " platform(s).\n";

View File

@ -179,13 +179,15 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
if (_eflag) {
for (int i=0; i<_inum; i++)
evdwl+=engv[i];
if (_ef_atom)
if (_ilist==NULL)
if (_ef_atom) {
if (_ilist==NULL) {
for (int i=0; i<_inum; i++)
eatom[i]+=engv[i];
else
} else {
for (int i=0; i<_inum; i++)
eatom[_ilist[i]]+=engv[i];
}
}
vstart=_inum;
}
if (_vflag) {
@ -193,7 +195,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
for (int j=0; j<6; j++) {
for (int i=vstart; i<iend; i++)
virial[j]+=engv[i];
if (_vf_atom)
if (_vf_atom){
if (_ilist==NULL) {
int ii=0;
for (int i=vstart; i<iend; i++)
@ -203,6 +205,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
for (int i=vstart; i<iend; i++)
vatom[_ilist[ii++]][j]+=engv[i];
}
}
vstart+=_inum;
iend+=_inum;
}
@ -228,7 +231,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
evdwl+=engv[i];
for (int i=_inum; i<iend; i++)
ecoul+=engv[i];
if (_ef_atom)
if (_ef_atom) {
if (_ilist==NULL) {
for (int i=0; i<_inum; i++)
eatom[i]+=engv[i];
@ -240,6 +243,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
for (int i=_inum, ii=0; i<iend; i++)
eatom[_ilist[ii++]]+=engv[i];
}
}
vstart=iend;
iend+=_inum;
}
@ -247,7 +251,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
for (int j=0; j<6; j++) {
for (int i=vstart; i<iend; i++)
virial[j]+=engv[i];
if (_vf_atom)
if (_vf_atom) {
if (_ilist==NULL) {
for (int i=vstart, ii=0; i<iend; i++)
vatom[ii++][j]+=engv[i];
@ -255,6 +259,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
for (int i=vstart, ii=0; i<iend; i++)
vatom[_ilist[ii++]][j]+=engv[i];
}
}
vstart+=_inum;
iend+=_inum;
}

View File

@ -27,6 +27,10 @@ using namespace ucl_opencl;
#include "geryon/nvc_timer.h"
#include "geryon/nvc_mat.h"
using namespace ucl_cudart;
#elif defined(USE_HIP)
#include "geryon/hip_timer.h"
#include "geryon/hip_mat.h"
using namespace ucl_hip;
#else
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"

View File

@ -15,6 +15,11 @@
#include "lal_atom.h"
#ifdef USE_HIP_DEVICE_SORT
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#endif
namespace LAMMPS_AL {
#define AtomT Atom<numtyp,acctyp>
@ -70,6 +75,26 @@ bool AtomT::alloc(const int nall) {
}
#endif
#ifdef USE_HIP_DEVICE_SORT
if (_gpu_nbor==1) {
size_t temp_storage_bytes = 0;
if(hipSuccess != hipcub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, sort_out_keys, sort_out_keys, sort_out_values, sort_out_values, _max_atoms))
return false;
if(sort_out_size < _max_atoms){
if (sort_out_keys ) hipFree(sort_out_keys);
if (sort_out_values) hipFree(sort_out_values);
hipMalloc(&sort_out_keys , _max_atoms * sizeof(unsigned));
hipMalloc(&sort_out_values, _max_atoms * sizeof(int ));
sort_out_size = _max_atoms;
}
if(temp_storage_bytes > sort_temp_storage_size){
if(sort_temp_storage) hipFree(sort_temp_storage);
hipMalloc(&sort_temp_storage, temp_storage_bytes);
sort_temp_storage_size = temp_storage_bytes;
}
}
#endif
// --------------------------- Device allocations
int gpu_bytes=0;
success=success && (x.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
@ -184,6 +209,27 @@ bool AtomT::add_fields(const bool charge, const bool rot,
return false;
}
#endif
#ifdef USE_HIP_DEVICE_SORT
if (_gpu_nbor==1) {
size_t temp_storage_bytes = 0;
if(hipSuccess != hipcub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, sort_out_keys, sort_out_keys, sort_out_values, sort_out_values, _max_atoms))
return false;
if(sort_out_size < _max_atoms){
if (sort_out_keys ) hipFree(sort_out_keys);
if (sort_out_values) hipFree(sort_out_values);
hipMalloc(&sort_out_keys , _max_atoms * sizeof(unsigned));
hipMalloc(&sort_out_values, _max_atoms * sizeof(int ));
sort_out_size = _max_atoms;
}
if(temp_storage_bytes > sort_temp_storage_size){
if(sort_temp_storage) hipFree(sort_temp_storage);
hipMalloc(&sort_temp_storage, temp_storage_bytes);
sort_temp_storage_size = temp_storage_bytes;
}
}
#endif
success=success && (dev_particle_id.alloc(_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=dev_particle_id.row_bytes();
@ -275,6 +321,19 @@ void AtomT::clear_resize() {
if (_gpu_nbor==1) cudppDestroyPlan(sort_plan);
#endif
#ifdef USE_HIP_DEVICE_SORT
if (_gpu_nbor==1) {
if(sort_out_keys) hipFree(sort_out_keys);
if(sort_out_values) hipFree(sort_out_values);
if(sort_temp_storage) hipFree(sort_temp_storage);
sort_out_keys = nullptr;
sort_out_values = nullptr;
sort_temp_storage = nullptr;
sort_temp_storage_size = 0;
sort_out_size = 0;
}
#endif
if (_gpu_nbor==2) {
host_particle_id.clear();
host_cell_id.clear();
@ -326,6 +385,22 @@ void AtomT::sort_neighbor(const int num_atoms) {
UCL_GERYON_EXIT;
}
#endif
#ifdef USE_HIP_DEVICE_SORT
if(sort_out_size < num_atoms){
printf("AtomT::sort_neighbor: invalid temp buffer size\n");
UCL_GERYON_EXIT;
}
if(hipSuccess != hipcub::DeviceRadixSort::SortPairs(sort_temp_storage, sort_temp_storage_size, (unsigned *)dev_cell_id.begin(), sort_out_keys, (int *)dev_particle_id.begin(), sort_out_values, num_atoms)){
printf("AtomT::sort_neighbor: DeviceRadixSort error\n");
UCL_GERYON_EXIT;
}
if(hipSuccess != hipMemcpy((unsigned *)dev_cell_id.begin(), sort_out_keys , num_atoms*sizeof(unsigned), hipMemcpyDeviceToDevice) ||
hipSuccess != hipMemcpy((int *) dev_particle_id.begin(), sort_out_values, num_atoms*sizeof(int ), hipMemcpyDeviceToDevice)){
printf("AtomT::sort_neighbor: copy output error\n");
UCL_GERYON_EXIT;
}
#endif
}
#ifdef GPU_CAST

View File

@ -13,7 +13,7 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_preprocessor.h"
#endif

View File

@ -29,6 +29,11 @@ using namespace ucl_opencl;
#include "geryon/nvc_mat.h"
#include "geryon/nvc_kernel.h"
using namespace ucl_cudart;
#elif defined(USE_HIP)
#include "geryon/hip_timer.h"
#include "geryon/hip_mat.h"
#include "geryon/hip_kernel.h"
using namespace ucl_hip;
#else
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
@ -477,6 +482,14 @@ class Atom {
CUDPPConfiguration sort_config;
CUDPPHandle sort_plan;
#endif
#ifdef USE_HIP_DEVICE_SORT
unsigned* sort_out_keys = nullptr;
int* sort_out_values = nullptr;
void* sort_temp_storage = nullptr;
size_t sort_temp_storage_size = 0;
size_t sort_out_size = 0;
#endif
};
}

View File

@ -13,7 +13,7 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_preprocessor.h"
#endif

View File

@ -24,6 +24,8 @@
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif

View File

@ -25,6 +25,8 @@
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif

View File

@ -23,6 +23,8 @@
#ifdef USE_OPENCL
#include "geryon/ocl_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif

View File

@ -23,6 +23,8 @@
#ifdef USE_OPENCL
#include "geryon/ocl_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif

View File

@ -24,6 +24,8 @@
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif

View File

@ -24,6 +24,8 @@
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif

View File

@ -13,12 +13,12 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,12 +13,12 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,15 +13,15 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,15 +13,16 @@
// email : ndactrung@gmail.com
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,15 +13,15 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,15 +13,15 @@
// email : ndactrung@gmail.com
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,12 +13,12 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,15 +13,15 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,15 +13,15 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,15 +13,15 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,12 +13,12 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,15 +13,15 @@
// email : ndtrung@umich.edu
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,15 +13,15 @@
// email : ndtrung@umich.edu
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,15 +13,15 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,15 +13,15 @@
// email : a.kohlmeyer@temple.edu
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,15 +13,15 @@
// email : ndactrung@gmail.com
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -268,7 +268,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
gpu_nbor=1;
else if (_gpu_mode==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
gpu_nbor=2;
#ifndef USE_CUDPP
#if !defined(USE_CUDPP) && !defined(USE_HIP_DEVICE_SORT)
if (gpu_nbor==1)
gpu_nbor=2;
#endif
@ -341,7 +341,7 @@ int DeviceT::init_nbor(Neighbor *nbor, const int nlocal,
gpu_nbor=1;
else if (_gpu_mode==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
gpu_nbor=2;
#ifndef USE_CUDPP
#if !defined(USE_CUDPP) && !defined(USE_HIP_DEVICE_SORT)
if (gpu_nbor==1)
gpu_nbor=2;
#endif
@ -712,7 +712,7 @@ int DeviceT::compile_kernels() {
gpu_lib_data.update_host(false);
_ptx_arch=static_cast<double>(gpu_lib_data[0])/100.0;
#ifndef USE_OPENCL
#if !(defined(USE_OPENCL) || defined(USE_HIP))
if (_ptx_arch>gpu->arch() || floor(_ptx_arch)<floor(gpu->arch()))
return -4;
#endif

View File

@ -13,7 +13,7 @@
// email : brownw@ornl.gov
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_preprocessor.h"
#endif

View File

@ -13,16 +13,16 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
texture<float4> mu_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
_texture( mu_tex,float4);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
texture<int4,1> mu_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
_texture_2d( mu_tex,int4);
#endif
#else

View File

@ -13,17 +13,17 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
texture<float4> mu_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
_texture( mu_tex,float4);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
texture<int4,1> mu_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
_texture_2d( mu_tex,int4);
#endif
#else

View File

@ -13,16 +13,16 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
texture<float4> mu_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
_texture( mu_tex,float4);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
texture<int4,1> mu_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
_texture_2d( mu_tex,int4);
#endif
#else

View File

@ -13,14 +13,14 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float4> vel_tex;
_texture( pos_tex,float4);
_texture( vel_tex,float4);
#else
texture<int4,1> pos_tex;
texture<int4,1> vel_tex;
_texture_2d( pos_tex,int4);
_texture_2d( vel_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,27 +13,27 @@
// email : brownw@ornl.gov nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> fp_tex;
texture<float4> rhor_sp1_tex;
texture<float4> rhor_sp2_tex;
texture<float4> frho_sp1_tex;
texture<float4> frho_sp2_tex;
texture<float4> z2r_sp1_tex;
texture<float4> z2r_sp2_tex;
_texture( pos_tex,float4);
_texture( fp_tex,float);
_texture( rhor_sp1_tex,float4);
_texture( rhor_sp2_tex,float4);
_texture( frho_sp1_tex,float4);
_texture( frho_sp2_tex,float4);
_texture( z2r_sp1_tex,float4);
_texture( z2r_sp2_tex,float4);
#else
texture<int4> pos_tex;
texture<int2> fp_tex;
texture<int4> rhor_sp1_tex;
texture<int4> rhor_sp2_tex;
texture<int4> frho_sp1_tex;
texture<int4> frho_sp2_tex;
texture<int4> z2r_sp1_tex;
texture<int4> z2r_sp2_tex;
_texture( pos_tex,int4);
_texture( fp_tex,int2);
_texture( rhor_sp1_tex,int4);
_texture( rhor_sp2_tex,int4);
_texture( frho_sp1_tex,int4);
_texture( frho_sp2_tex,int4);
_texture( z2r_sp1_tex,int4);
_texture( z2r_sp2_tex,int4);
#endif
#else

View File

@ -18,12 +18,14 @@
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex, quat_tex;
_texture( pos_tex, float4);
_texture( quat_tex,float4);
#else
texture<int4,1> pos_tex, quat_tex;
_texture_2d( pos_tex,int4);
_texture_2d( quat_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,12 +13,12 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_preprocessor.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,12 +13,12 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,7 +13,7 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_ellipsoid_extra.h"
#endif

View File

@ -13,7 +13,7 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_ellipsoid_extra.h"
#endif

View File

@ -13,12 +13,12 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,12 +13,12 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,15 +13,15 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,15 +13,15 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,15 +13,15 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,15 +13,15 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,19 +13,19 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
texture<float> gcons_tex;
texture<float> dgcons_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
_texture( gcons_tex,float);
_texture( dgcons_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
texture<int2> gcons_tex;
texture<int2> dgcons_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
_texture( gcons_tex,int2);
_texture( dgcons_tex,int2);
#endif
#else

View File

@ -13,12 +13,12 @@
// email : ndactrung@gmail.com
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,15 +13,15 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,13 +13,13 @@
// email : ibains@nvidia.com
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else

View File

@ -13,15 +13,15 @@
// email : ndactrung@gmail.com
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,13 +13,13 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else

View File

@ -13,12 +13,12 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,15 +13,15 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -23,7 +23,7 @@ const char *lj_tip4p=0;
#include "lal_lj_tip4p_long.h"
#include <cassert>
using namespace LAMMPS_AL;
namespace LAMMPS_AL {
#define LJTIP4PLongT LJ_TIP4PLong<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
@ -381,6 +381,5 @@ int** LJTIP4PLongT::compute(const int ago, const int inum_full,
}
template class LJ_TIP4PLong<PRECISION,ACC_PRECISION>;
}

View File

@ -13,7 +13,7 @@
// email : thevsevak@gmail.com
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifdef LAMMPS_SMALLBIG
@ -27,11 +27,11 @@
#define tagint int
#endif
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,12 +13,12 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,13 +13,13 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else

View File

@ -13,7 +13,7 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_preprocessor.h"
#endif

View File

@ -14,7 +14,7 @@
// email : penwang@nvidia.com, brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_preprocessor.h"
#ifdef LAMMPS_SMALLBIG
#define tagint int
@ -27,9 +27,9 @@
#define tagint int
#endif
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
__kernel void calc_cell_id(const numtyp4 *restrict pos,

View File

@ -24,6 +24,10 @@ using namespace ucl_opencl;
#include "geryon/nvc_kernel.h"
#include "geryon/nvc_texture.h"
using namespace ucl_cudart;
#elif defined(USE_HIP)
#include "geryon/hip_kernel.h"
#include "geryon/hip_texture.h"
using namespace ucl_hip;
#else
#include "geryon/nvd_kernel.h"
#include "geryon/nvd_texture.h"

View File

@ -13,15 +13,15 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_preprocessor.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
// Allow PPPM to compile without atomics for NVIDIA 1.0 cards, error

View File

@ -23,6 +23,8 @@
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif

View File

@ -24,9 +24,11 @@ struct _lgpu_int2 {
int x; int y;
};
#ifndef USE_HIP
#ifndef int2
#define int2 _lgpu_int2
#endif
#endif
struct _lgpu_float2 {
float x; float y;

View File

@ -1,4 +1,4 @@
// **************************************************************************
// **************************************************************************
// preprocessor.cu
// -------------------
// W. Michael Brown (ORNL)
@ -60,6 +60,150 @@
//
//*************************************************************************/
#define _texture(name, type) texture<type> name
#define _texture_2d(name, type) texture<type,1> name
// -------------------------------------------------------------------------
// HIP DEFINITIONS
// -------------------------------------------------------------------------
#ifdef USE_HIP
#include <hip/hip_runtime.h>
#ifdef __HIP_PLATFORM_HCC__
#define mul24(x, y) __mul24(x, y)
#undef _texture
#undef _texture_2d
#define _texture(name, type) __device__ type* name
#define _texture_2d(name, type) __device__ type* name
#endif
#define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x)
#define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y)
#define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x);
#define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y);
#define THREAD_ID_X threadIdx.x
#define THREAD_ID_Y threadIdx.y
#define BLOCK_ID_X blockIdx.x
#define BLOCK_ID_Y blockIdx.y
#define BLOCK_SIZE_X blockDim.x
#define BLOCK_SIZE_Y blockDim.y
#define __kernel extern "C" __global__
#ifdef __local
#undef __local
#endif
#define __local __shared__
#define __global
#define restrict __restrict__
#define atom_add atomicAdd
#define ucl_inline static __inline__ __device__
#define THREADS_PER_ATOM 4
#define THREADS_PER_CHARGE 8
#define BLOCK_NBOR_BUILD 128
#define BLOCK_PAIR 256
#define BLOCK_BIO_PAIR 256
#define BLOCK_ELLIPSE 128
#define MAX_SHARED_TYPES 11
#ifdef _SINGLE_SINGLE
ucl_inline double shfl_xor(double var, int laneMask, int width) {
#ifdef __HIP_PLATFORM_HCC__
return __shfl_xor(var, laneMask, width);
#else
return __shfl_xor_sync(0xffffffff, var, laneMask, width);
#endif
}
#else
ucl_inline double shfl_xor(double var, int laneMask, int width) {
int2 tmp;
tmp.x = __double2hiint(var);
tmp.y = __double2loint(var);
#ifdef __HIP_PLATFORM_HCC__
tmp.x = __shfl_xor(tmp.x,laneMask,width);
tmp.y = __shfl_xor(tmp.y,laneMask,width);
#else
tmp.x = __shfl_xor_sync(0xffffffff, tmp.x,laneMask,width);
tmp.y = __shfl_xor_sync(0xffffffff, tmp.y,laneMask,width);
#endif
return __hiloint2double(tmp.x,tmp.y);
}
#endif
#ifdef __HIP_PLATFORM_HCC__
#define ARCH 600
#define WARP_SIZE 64
#endif
#ifdef __HIP_PLATFORM_NVCC__
#define ARCH __CUDA_ARCH__
#define WARP_SIZE 32
#endif
#define fast_mul(X,Y) (X)*(Y)
#define MEM_THREADS WARP_SIZE
#define PPPM_BLOCK_1D 64
#define BLOCK_CELL_2D 8
#define BLOCK_CELL_ID 128
#define MAX_BIO_SHARED_TYPES 128
#ifdef __HIP_PLATFORM_NVCC__
#ifdef _DOUBLE_DOUBLE
#define fetch4(ans,i,pos_tex) { \
int4 xy = tex1Dfetch(pos_tex,i*2); \
int4 zt = tex1Dfetch(pos_tex,i*2+1); \
ans.x=__hiloint2double(xy.y, xy.x); \
ans.y=__hiloint2double(xy.w, xy.z); \
ans.z=__hiloint2double(zt.y, zt.x); \
ans.w=__hiloint2double(zt.w, zt.z); \
}
#define fetch(ans,i,q_tex) { \
int2 qt = tex1Dfetch(q_tex,i); \
ans=__hiloint2double(qt.y, qt.x); \
}
#else
#define fetch4(ans,i,pos_tex) ans=tex1Dfetch(pos_tex, i);
#define fetch(ans,i,q_tex) ans=tex1Dfetch(q_tex,i);
#endif
#else
#ifdef _DOUBLE_DOUBLE
#define fetch4(ans,i,pos_tex) (ans=*(((double4*)pos_tex) + i))
#define fetch(ans,i,q_tex) (ans=*(((double *) q_tex) + i))
#else
#define fetch4(ans,i,pos_tex) (ans=*(((float4*)pos_tex) + i))
#define fetch(ans,i,q_tex) (ans=*(((float *) q_tex) + i))
#endif
#endif
#ifdef _DOUBLE_DOUBLE
#define ucl_exp exp
#define ucl_powr pow
#define ucl_atan atan
#define ucl_cbrt cbrt
#define ucl_ceil ceil
#define ucl_abs fabs
#define ucl_rsqrt rsqrt
#define ucl_sqrt sqrt
#define ucl_recip(x) ((numtyp)1.0/(x))
#else
#define ucl_atan atanf
#define ucl_cbrt cbrtf
#define ucl_ceil ceilf
#define ucl_abs fabsf
#define ucl_recip(x) ((numtyp)1.0/(x))
#define ucl_rsqrt rsqrtf
#define ucl_sqrt sqrtf
#ifdef NO_HARDWARE_TRANSCENDENTALS
#define ucl_exp expf
#define ucl_powr powf
#else
#define ucl_exp __expf
#define ucl_powr __powf
#endif
#endif
#endif
// -------------------------------------------------------------------------
// CUDA DEFINITIONS
// -------------------------------------------------------------------------

View File

@ -13,7 +13,7 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_ellipsoid_extra.h"
#endif

View File

@ -13,7 +13,7 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_ellipsoid_extra.h"
#endif

View File

@ -13,12 +13,12 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,19 +13,19 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float4> sw1_tex;
texture<float4> sw2_tex;
texture<float4> sw3_tex;
_texture( pos_tex,float4);
_texture( sw1_tex,float4);
_texture( sw2_tex,float4);
_texture( sw3_tex,float4);
#else
texture<int4,1> pos_tex;
texture<int4> sw1_tex;
texture<int4> sw2_tex;
texture<int4> sw3_tex;
_texture_2d( pos_tex,int4);
_texture( sw1_tex,int4);
_texture( sw2_tex,int4);
_texture( sw3_tex,int4);
#endif
#else

View File

@ -13,12 +13,12 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,23 +13,23 @@
// email : ndactrung@gmail.com
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_tersoff_extra.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float4> ts1_tex;
texture<float4> ts2_tex;
texture<float4> ts3_tex;
texture<float4> ts4_tex;
texture<float4> ts5_tex;
_texture( pos_tex,float4);
_texture( ts1_tex,float4);
_texture( ts2_tex,float4);
_texture( ts3_tex,float4);
_texture( ts4_tex,float4);
_texture( ts5_tex,float4);
#else
texture<int4,1> pos_tex;
texture<int4> ts1_tex;
texture<int4> ts2_tex;
texture<int4> ts3_tex;
texture<int4> ts4_tex;
texture<int4> ts5_tex;
_texture_2d( pos_tex,int4);
_texture( ts1_tex,int4);
_texture( ts2_tex,int4);
_texture( ts3_tex,int4);
_texture( ts4_tex,int4);
_texture( ts5_tex,int4);
#endif
#else

View File

@ -16,7 +16,7 @@
#ifndef LAL_TERSOFF_EXTRA_H
#define LAL_TERSOFF_EXTRA_H
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#else
#endif

View File

@ -13,23 +13,23 @@
// email : ndactrung@gmail.com
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_tersoff_mod_extra.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float4> ts1_tex;
texture<float4> ts2_tex;
texture<float4> ts3_tex;
texture<float4> ts4_tex;
texture<float4> ts5_tex;
_texture( pos_tex,float4);
_texture( ts1_tex,float4);
_texture( ts2_tex,float4);
_texture( ts3_tex,float4);
_texture( ts4_tex,float4);
_texture( ts5_tex,float4);
#else
texture<int4,1> pos_tex;
texture<int4> ts1_tex;
texture<int4> ts2_tex;
texture<int4> ts3_tex;
texture<int4> ts4_tex;
texture<int4> ts5_tex;
_texture_2d( pos_tex,int4);
_texture( ts1_tex,int4);
_texture( ts2_tex,int4);
_texture( ts3_tex,int4);
_texture( ts4_tex,int4);
_texture( ts5_tex,int4);
#endif
#else

View File

@ -16,7 +16,7 @@
#ifndef LAL_TERSOFF_MOD_EXTRA_H
#define LAL_TERSOFF_MOD_EXTRA_H
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#else
#endif

View File

@ -13,25 +13,25 @@
// email : ndactrung@gmail.com
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_tersoff_zbl_extra.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float4> ts1_tex;
texture<float4> ts2_tex;
texture<float4> ts3_tex;
texture<float4> ts4_tex;
texture<float4> ts5_tex;
texture<float4> ts6_tex;
_texture( pos_tex,float4);
_texture( ts1_tex,float4);
_texture( ts2_tex,float4);
_texture( ts3_tex,float4);
_texture( ts4_tex,float4);
_texture( ts5_tex,float4);
_texture( ts6_tex,float4);
#else
texture<int4,1> pos_tex;
texture<int4> ts1_tex;
texture<int4> ts2_tex;
texture<int4> ts3_tex;
texture<int4> ts4_tex;
texture<int4> ts5_tex;
texture<int4> ts6_tex;
_texture_2d( pos_tex,int4);
_texture( ts1_tex,int4);
_texture( ts2_tex,int4);
_texture( ts3_tex,int4);
_texture( ts4_tex,int4);
_texture( ts5_tex,int4);
_texture( ts6_tex,int4);
#endif
#else

View File

@ -16,7 +16,7 @@
#ifndef LAL_TERSOFF_ZBL_EXTRA_H
#define LAL_TERSOFF_ZBL_EXTRA_H
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#else
#endif

View File

@ -15,12 +15,12 @@
dekoning@ifi.unicamp.br
***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,23 +13,23 @@
// email : andershaf@gmail.com
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float4> param1_tex;
texture<float4> param2_tex;
texture<float4> param3_tex;
texture<float4> param4_tex;
texture<float4> param5_tex;
_texture( pos_tex,float4);
_texture( param1_tex,float4);
_texture( param2_tex,float4);
_texture( param3_tex,float4);
_texture( param4_tex,float4);
_texture( param5_tex,float4);
#else
texture<int4,1> pos_tex;
texture<int4> param1_tex;
texture<int4> param2_tex;
texture<int4> param3_tex;
texture<int4> param4_tex;
texture<int4> param5_tex;
_texture_2d( pos_tex,int4);
_texture( param1_tex,int4);
_texture( param2_tex,int4);
_texture( param3_tex,int4);
_texture( param4_tex,int4);
_texture( param5_tex,int4);
#endif
#else

View File

@ -13,12 +13,12 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,15 +13,15 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> rad_tex;
_texture( pos_tex,float4);
_texture( rad_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> rad_tex;
_texture_2d( pos_tex,int4);
_texture( rad_tex,int2);
#endif
#else

View File

@ -13,12 +13,12 @@
// email : ndactrung@gmail.com
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

Some files were not shown because too many files have changed in this diff Show More