Merge branch 'master' into simplify_build_choices

# Conflicts:
#	cmake/Modules/Packages/GPU.cmake
This commit is contained in:
Axel Kohlmeyer 2020-04-11 21:31:07 -04:00
commit 5542ad31dc
No known key found for this signature in database
GPG Key ID: D9B44E93BF0C375A
105 changed files with 2800 additions and 538 deletions

View File

@ -704,6 +704,14 @@ if(PKG_GPU)
message(STATUS "GPU architecture: ${GPU_ARCH}")
elseif(GPU_API STREQUAL "OPENCL")
message(STATUS "OpenCL tuning: ${OCL_TUNE}")
elseif(GPU_API STREQUAL "HIP")
message(STATUS "HIP platform: ${HIP_PLATFORM}")
message(STATUS "HIP architecture: ${HIP_ARCH}")
if(HIP_USE_DEVICE_SORT)
message(STATUS "HIP GPU sorting: on")
else()
message(STATUS "HIP GPU sorting: off")
endif()
endif()
message(STATUS "GPU precision: ${GPU_PREC}")
endif()

View File

@ -0,0 +1,16 @@
# - Find CUB
# Find the CUB header-only library (https://github.com/NVlabs/cub)
#
# Output variables:
#  CUB_INCLUDE_DIR  - cache variable with the detected include directory
#  CUB_INCLUDE_DIRS - where to find cub/cub.cuh (set from CUB_INCLUDE_DIR)
#  CUB_FOUND        - True if CUB found.
#
find_path(CUB_INCLUDE_DIR cub.cuh PATH_SUFFIXES cub)
include(FindPackageHandleStandardArgs)
# handle the QUIETLY and REQUIRED arguments and set CUB_FOUND to TRUE
# if all listed variables are TRUE
find_package_handle_standard_args(CUB DEFAULT_MSG CUB_INCLUDE_DIR)
# export the plural-form variable advertised above; previously it was
# documented but never set, so consumers using CUB_INCLUDE_DIRS got nothing
if(CUB_FOUND)
  set(CUB_INCLUDE_DIRS ${CUB_INCLUDE_DIR})
endif()
mark_as_advanced(CUB_INCLUDE_DIR)

View File

@ -0,0 +1,3 @@
# utility script to call GenerateBinaryHeader function
# Invoked in CMake script mode (cmake -P) from an add_custom_command; the
# caller must pass -D SOURCE_DIR=..., -D VARNAME=..., -D HEADER_FILE=...
# and -D SOURCE_FILES=... on the command line before -P.
include(${SOURCE_DIR}/Modules/LAMMPSUtils.cmake)
GenerateBinaryHeader(${VARNAME} ${HEADER_FILE} ${SOURCE_FILES})

View File

@ -69,3 +69,19 @@ macro(pkg_depends PKG1 PKG2)
message(FATAL_ERROR "${PKG1} package needs LAMMPS to be build with ${PKG2}")
endif()
endmacro()
# CMake-only replacement for bin2c and xxd
# Writes a C header declaring `const unsigned char <varname>[]` holding the
# raw bytes of the input file(s), plus `<varname>_size`.
#
# varname - base name for the generated C symbols
# outfile - path of the header file to (over)write
# files   - first input file; any further files are picked up via ARGV
#
# NOTE(review): each iteration emits a symbol named ${varname}, so passing
# more than one input file would produce duplicate definitions — callers
# currently pass exactly one file per invocation.
function(GenerateBinaryHeader varname outfile files)
  message("Creating ${outfile}...")
  file(WRITE ${outfile} "// CMake generated file\n")
  # iterate over all trailing arguments (indices 2..ARGC-1)
  math(EXPR ARG_END "${ARGC}-1")
  foreach(IDX RANGE 2 ${ARG_END})
    list(GET ARGV ${IDX} filename)
    # read the file as a hex string, then turn each byte into "0xNN,"
    file(READ ${filename} content HEX)
    string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," content "${content}")
    # strip the trailing comma so the initializer list is well-formed
    string(REGEX REPLACE ",$" "" content "${content}")
    file(APPEND ${outfile} "const unsigned char ${varname}[] = { ${content} };\n")
    file(APPEND ${outfile} "const unsigned int ${varname}_size = sizeof(${varname});\n")
  endforeach()
endfunction()

View File

@ -1,209 +1,363 @@
set(GPU_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/GPU)
set(GPU_SOURCES ${GPU_SOURCES_DIR}/gpu_extra.h
${GPU_SOURCES_DIR}/fix_gpu.h
${GPU_SOURCES_DIR}/fix_gpu.cpp)
set(GPU_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/GPU)
set(GPU_SOURCES ${GPU_SOURCES_DIR}/gpu_extra.h
${GPU_SOURCES_DIR}/fix_gpu.h
${GPU_SOURCES_DIR}/fix_gpu.cpp)
set(GPU_API "opencl" CACHE STRING "API used by GPU package")
set(GPU_API_VALUES opencl cuda)
set_property(CACHE GPU_API PROPERTY STRINGS ${GPU_API_VALUES})
validate_option(GPU_API GPU_API_VALUES)
string(TOUPPER ${GPU_API} GPU_API)
set(GPU_API "opencl" CACHE STRING "API used by GPU package")
set(GPU_API_VALUES opencl cuda hip)
set_property(CACHE GPU_API PROPERTY STRINGS ${GPU_API_VALUES})
validate_option(GPU_API GPU_API_VALUES)
string(TOUPPER ${GPU_API} GPU_API)
set(GPU_PREC "mixed" CACHE STRING "LAMMPS GPU precision")
set(GPU_PREC_VALUES double mixed single)
set_property(CACHE GPU_PREC PROPERTY STRINGS ${GPU_PREC_VALUES})
validate_option(GPU_PREC GPU_PREC_VALUES)
string(TOUPPER ${GPU_PREC} GPU_PREC)
set(GPU_PREC "mixed" CACHE STRING "LAMMPS GPU precision")
set(GPU_PREC_VALUES double mixed single)
set_property(CACHE GPU_PREC PROPERTY STRINGS ${GPU_PREC_VALUES})
validate_option(GPU_PREC GPU_PREC_VALUES)
string(TOUPPER ${GPU_PREC} GPU_PREC)
if(GPU_PREC STREQUAL "DOUBLE")
set(GPU_PREC_SETTING "DOUBLE_DOUBLE")
elseif(GPU_PREC STREQUAL "MIXED")
set(GPU_PREC_SETTING "SINGLE_DOUBLE")
elseif(GPU_PREC STREQUAL "SINGLE")
set(GPU_PREC_SETTING "SINGLE_SINGLE")
if(GPU_PREC STREQUAL "DOUBLE")
set(GPU_PREC_SETTING "DOUBLE_DOUBLE")
elseif(GPU_PREC STREQUAL "MIXED")
set(GPU_PREC_SETTING "SINGLE_DOUBLE")
elseif(GPU_PREC STREQUAL "SINGLE")
set(GPU_PREC_SETTING "SINGLE_SINGLE")
endif()
file(GLOB GPU_LIB_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/gpu/[^.]*.cpp)
file(MAKE_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/gpu)
if(GPU_API STREQUAL "CUDA")
find_package(CUDA REQUIRED)
find_program(BIN2C bin2c)
if(NOT BIN2C)
message(FATAL_ERROR "Could not find bin2c, use -DBIN2C=/path/to/bin2c to help cmake finding it.")
endif()
option(CUDPP_OPT "Enable CUDPP_OPT" ON)
option(CUDA_MPS_SUPPORT "Enable tweaks to support CUDA Multi-process service (MPS)" OFF)
if(CUDA_MPS_SUPPORT)
set(GPU_CUDA_MPS_FLAGS "-DCUDA_PROXY")
endif()
file(GLOB GPU_LIB_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/gpu/[^.]*.cpp)
file(MAKE_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/gpu)
set(GPU_ARCH "sm_50" CACHE STRING "LAMMPS GPU CUDA SM primary architecture (e.g. sm_60)")
if(GPU_API STREQUAL "CUDA")
file(GLOB GPU_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/[^.]*.cu ${CMAKE_CURRENT_SOURCE_DIR}/gpu/[^.]*.cu)
list(REMOVE_ITEM GPU_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_pppm.cu)
cuda_include_directories(${LAMMPS_LIB_SOURCE_DIR}/gpu ${LAMMPS_LIB_BINARY_DIR}/gpu)
if(CUDPP_OPT)
cuda_include_directories(${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini)
file(GLOB GPU_LIB_CUDPP_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini/[^.]*.cpp)
file(GLOB GPU_LIB_CUDPP_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini/[^.]*.cu)
endif()
# build arch/gencode commands for nvcc based on CUDA toolkit version and use choice
# --arch translates directly instead of JIT, so this should be for the preferred or most common architecture
set(GPU_CUDA_GENCODE "-arch=${GPU_ARCH} ")
# Fermi (GPU Arch 2.x) is supported by CUDA 3.2 to CUDA 8.0
if((CUDA_VERSION VERSION_GREATER "3.1") AND (CUDA_VERSION VERSION_LESS "9.0"))
string(APPEND GPU_CUDA_GENCODE "-gencode arch=compute_20,code=[sm_20,compute_20] ")
endif()
# Kepler (GPU Arch 3.x) is supported by CUDA 5 and later
if(CUDA_VERSION VERSION_GREATER "4.9")
string(APPEND GPU_CUDA_GENCODE "-gencode arch=compute_30,code=[sm_30,compute_30] -gencode arch=compute_35,code=[sm_35,compute_35] ")
endif()
# Maxwell (GPU Arch 5.x) is supported by CUDA 6 and later
if(CUDA_VERSION VERSION_GREATER "5.9")
string(APPEND GPU_CUDA_GENCODE "-gencode arch=compute_50,code=[sm_50,compute_50] -gencode arch=compute_52,code=[sm_52,compute_52] ")
endif()
# Pascal (GPU Arch 6.x) is supported by CUDA 8 and later
if(CUDA_VERSION VERSION_GREATER "7.9")
string(APPEND GPU_CUDA_GENCODE "-gencode arch=compute_60,code=[sm_60,compute_60] -gencode arch=compute_61,code=[sm_61,compute_61] ")
endif()
# Volta (GPU Arch 7.0) is supported by CUDA 9 and later
if(CUDA_VERSION VERSION_GREATER "8.9")
string(APPEND GPU_CUDA_GENCODE "-gencode arch=compute_70,code=[sm_70,compute_70] ")
endif()
# Turing (GPU Arch 7.5) is supported by CUDA 10 and later
if(CUDA_VERSION VERSION_GREATER "9.9")
string(APPEND GPU_CUDA_GENCODE "-gencode arch=compute_75,code=[sm_75,compute_75] ")
endif()
cuda_compile_fatbin(GPU_GEN_OBJS ${GPU_LIB_CU} OPTIONS
-DUNIX -O3 --use_fast_math -Wno-deprecated-gpu-targets -DNV_KERNEL -DUCL_CUDADR ${GPU_CUDA_GENCODE} -D_${GPU_PREC_SETTING})
cuda_compile(GPU_OBJS ${GPU_LIB_CUDPP_CU} OPTIONS ${CUDA_REQUEST_PIC}
-DUNIX -O3 --use_fast_math -Wno-deprecated-gpu-targets -DUCL_CUDADR ${GPU_CUDA_GENCODE} -D_${GPU_PREC_SETTING})
foreach(CU_OBJ ${GPU_GEN_OBJS})
get_filename_component(CU_NAME ${CU_OBJ} NAME_WE)
string(REGEX REPLACE "^.*_lal_" "" CU_NAME "${CU_NAME}")
add_custom_command(OUTPUT ${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}_cubin.h
COMMAND ${BIN2C} -c -n ${CU_NAME} ${CU_OBJ} > ${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}_cubin.h
DEPENDS ${CU_OBJ}
COMMENT "Generating ${CU_NAME}_cubin.h")
list(APPEND GPU_LIB_SOURCES ${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}_cubin.h)
endforeach()
set_directory_properties(PROPERTIES ADDITIONAL_MAKE_CLEAN_FILES "${LAMMPS_LIB_BINARY_DIR}/gpu/*_cubin.h")
add_library(gpu STATIC ${GPU_LIB_SOURCES} ${GPU_LIB_CUDPP_SOURCES} ${GPU_OBJS})
target_link_libraries(gpu PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY})
target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu ${CUDA_INCLUDE_DIRS})
target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT ${GPU_CUDA_MPS_FLAGS})
if(CUDPP_OPT)
target_include_directories(gpu PRIVATE ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini)
target_compile_definitions(gpu PRIVATE -DUSE_CUDPP)
endif()
target_link_libraries(lammps PRIVATE gpu)
add_executable(nvc_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp)
target_compile_definitions(nvc_get_devices PRIVATE -DUCL_CUDADR)
target_link_libraries(nvc_get_devices PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY})
target_include_directories(nvc_get_devices PRIVATE ${CUDA_INCLUDE_DIRS})
elseif(GPU_API STREQUAL "OPENCL")
if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
# download and unpack support binaries for compilation of windows binaries.
set(LAMMPS_THIRDPARTY_URL "http://download.lammps.org/thirdparty")
file(DOWNLOAD "${LAMMPS_THIRDPARTY_URL}/opencl-win-devel.tar.gz" "${CMAKE_CURRENT_BINARY_DIR}/opencl-win-devel.tar.gz"
EXPECTED_MD5 2c00364888d5671195598b44c2e0d44d)
execute_process(COMMAND ${CMAKE_COMMAND} -E tar xzf opencl-win-devel.tar.gz WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
add_library(OpenCL::OpenCL UNKNOWN IMPORTED)
if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86")
set_target_properties(OpenCL::OpenCL PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/OpenCL/lib_win32/libOpenCL.dll")
elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64")
set_target_properties(OpenCL::OpenCL PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/OpenCL/lib_win64/libOpenCL.dll")
endif()
set_target_properties(OpenCL::OpenCL PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${CMAKE_CURRENT_BINARY_DIR}/OpenCL/include")
else()
find_package(OpenCL REQUIRED)
endif()
set(OCL_TUNE "generic" CACHE STRING "OpenCL Device Tuning")
set(OCL_TUNE_VALUES intel fermi kepler cypress generic)
set_property(CACHE OCL_TUNE PROPERTY STRINGS ${OCL_TUNE_VALUES})
validate_option(OCL_TUNE OCL_TUNE_VALUES)
string(TOUPPER ${OCL_TUNE} OCL_TUNE)
include(OpenCLUtils)
set(OCL_COMMON_HEADERS ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_preprocessor.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_aux_fun1.h)
file(GLOB GPU_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/[^.]*.cu)
list(REMOVE_ITEM GPU_LIB_CU
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_gayberne.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_gayberne_lj.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_re_squared.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_re_squared_lj.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod.cu
)
foreach(GPU_KERNEL ${GPU_LIB_CU})
get_filename_component(basename ${GPU_KERNEL} NAME_WE)
string(SUBSTRING ${basename} 4 -1 KERNEL_NAME)
GenerateOpenCLHeader(${KERNEL_NAME} ${CMAKE_CURRENT_BINARY_DIR}/gpu/${KERNEL_NAME}_cl.h ${OCL_COMMON_HEADERS} ${GPU_KERNEL})
list(APPEND GPU_LIB_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/gpu/${KERNEL_NAME}_cl.h)
endforeach()
GenerateOpenCLHeader(gayberne ${CMAKE_CURRENT_BINARY_DIR}/gpu/gayberne_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_ellipsoid_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_gayberne.cu)
GenerateOpenCLHeader(gayberne_lj ${CMAKE_CURRENT_BINARY_DIR}/gpu/gayberne_lj_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_ellipsoid_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_gayberne_lj.cu)
GenerateOpenCLHeader(re_squared ${CMAKE_CURRENT_BINARY_DIR}/gpu/re_squared_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_ellipsoid_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_re_squared.cu)
GenerateOpenCLHeader(re_squared_lj ${CMAKE_CURRENT_BINARY_DIR}/gpu/re_squared_lj_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_ellipsoid_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_re_squared_lj.cu)
GenerateOpenCLHeader(tersoff ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff.cu)
GenerateOpenCLHeader(tersoff_zbl ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_zbl_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl.cu)
GenerateOpenCLHeader(tersoff_mod ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_mod_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod.cu)
list(APPEND GPU_LIB_SOURCES
${CMAKE_CURRENT_BINARY_DIR}/gpu/gayberne_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/gayberne_lj_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/re_squared_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/re_squared_lj_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_zbl_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_mod_cl.h
)
add_library(gpu STATIC ${GPU_LIB_SOURCES})
target_link_libraries(gpu PRIVATE OpenCL::OpenCL)
target_include_directories(gpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/gpu)
target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -D${OCL_TUNE}_OCL -DMPI_GERYON -DUCL_NO_EXIT)
target_compile_definitions(gpu PRIVATE -DUSE_OPENCL)
target_link_libraries(lammps PRIVATE gpu)
add_executable(ocl_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp)
target_compile_definitions(ocl_get_devices PRIVATE -DUCL_OPENCL)
target_link_libraries(ocl_get_devices PRIVATE OpenCL::OpenCL)
elseif(GPU_API STREQUAL "HIP")
if(NOT DEFINED HIP_PATH)
if(NOT DEFINED ENV{HIP_PATH})
set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed")
else()
set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed")
endif()
endif()
set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
find_package(HIP REQUIRED)
option(HIP_USE_DEVICE_SORT "Use GPU sorting" ON)
if(NOT DEFINED HIP_PLATFORM)
if(NOT DEFINED ENV{HIP_PLATFORM})
set(HIP_PLATFORM "hcc" CACHE PATH "HIP Platform to be used during compilation")
else()
set(HIP_PLATFORM $ENV{HIP_PLATFORM} CACHE PATH "HIP Platform used during compilation")
endif()
endif()
set(ENV{HIP_PLATFORM} ${HIP_PLATFORM})
if(HIP_PLATFORM STREQUAL "hcc")
set(HIP_ARCH "gfx906" CACHE STRING "HIP target architecture")
elseif(HIP_PLATFORM STREQUAL "nvcc")
find_package(CUDA REQUIRED)
find_program(BIN2C bin2c)
if(NOT BIN2C)
message(FATAL_ERROR "Could not find bin2c, use -DBIN2C=/path/to/bin2c to help cmake finding it.")
endif()
option(CUDPP_OPT "Enable CUDPP_OPT" ON)
option(CUDA_MPS_SUPPORT "Enable tweaks to support CUDA Multi-process service (MPS)" OFF)
if(CUDA_MPS_SUPPORT)
set(GPU_CUDA_MPS_FLAGS "-DCUDA_PROXY")
endif()
set(GPU_ARCH "sm_30" CACHE STRING "LAMMPS GPU CUDA SM primary architecture (e.g. sm_60)")
file(GLOB GPU_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/[^.]*.cu ${CMAKE_CURRENT_SOURCE_DIR}/gpu/[^.]*.cu)
list(REMOVE_ITEM GPU_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_pppm.cu)
cuda_include_directories(${LAMMPS_LIB_SOURCE_DIR}/gpu ${LAMMPS_LIB_BINARY_DIR}/gpu)
if(CUDPP_OPT)
cuda_include_directories(${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini)
file(GLOB GPU_LIB_CUDPP_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini/[^.]*.cpp)
file(GLOB GPU_LIB_CUDPP_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini/[^.]*.cu)
endif()
set(HIP_ARCH "sm_50" CACHE STRING "HIP primary CUDA architecture (e.g. sm_60)")
# build arch/gencode commands for nvcc based on CUDA toolkit version and use choice
# --arch translates directly instead of JIT, so this should be for the preferred or most common architecture
set(GPU_CUDA_GENCODE "-arch=${GPU_ARCH} ")
set(HIP_CUDA_GENCODE "-arch=${HIP_ARCH} ")
# Fermi (GPU Arch 2.x) is supported by CUDA 3.2 to CUDA 8.0
if((CUDA_VERSION VERSION_GREATER "3.1") AND (CUDA_VERSION VERSION_LESS "9.0"))
string(APPEND GPU_CUDA_GENCODE "-gencode arch=compute_20,code=[sm_20,compute_20] ")
string(APPEND HIP_CUDA_GENCODE "-gencode arch=compute_20,code=[sm_20,compute_20] ")
endif()
# Kepler (GPU Arch 3.x) is supported by CUDA 5 and later
if(CUDA_VERSION VERSION_GREATER "4.9")
string(APPEND GPU_CUDA_GENCODE "-gencode arch=compute_30,code=[sm_30,compute_30] -gencode arch=compute_35,code=[sm_35,compute_35] ")
string(APPEND HIP_CUDA_GENCODE "-gencode arch=compute_30,code=[sm_30,compute_30] -gencode arch=compute_35,code=[sm_35,compute_35] ")
endif()
# Maxwell (GPU Arch 5.x) is supported by CUDA 6 and later
if(CUDA_VERSION VERSION_GREATER "5.9")
string(APPEND GPU_CUDA_GENCODE "-gencode arch=compute_50,code=[sm_50,compute_50] -gencode arch=compute_52,code=[sm_52,compute_52] ")
string(APPEND HIP_CUDA_GENCODE "-gencode arch=compute_50,code=[sm_50,compute_50] -gencode arch=compute_52,code=[sm_52,compute_52] ")
endif()
# Pascal (GPU Arch 6.x) is supported by CUDA 8 and later
if(CUDA_VERSION VERSION_GREATER "7.9")
string(APPEND GPU_CUDA_GENCODE "-gencode arch=compute_60,code=[sm_60,compute_60] -gencode arch=compute_61,code=[sm_61,compute_61] ")
string(APPEND HIP_CUDA_GENCODE "-gencode arch=compute_60,code=[sm_60,compute_60] -gencode arch=compute_61,code=[sm_61,compute_61] ")
endif()
# Volta (GPU Arch 7.0) is supported by CUDA 9 and later
if(CUDA_VERSION VERSION_GREATER "8.9")
string(APPEND GPU_CUDA_GENCODE "-gencode arch=compute_70,code=[sm_70,compute_70] ")
string(APPEND HIP_CUDA_GENCODE "-gencode arch=compute_70,code=[sm_70,compute_70] ")
endif()
# Turing (GPU Arch 7.5) is supported by CUDA 10 and later
if(CUDA_VERSION VERSION_GREATER "9.9")
string(APPEND GPU_CUDA_GENCODE "-gencode arch=compute_75,code=[sm_75,compute_75] ")
string(APPEND HIP_CUDA_GENCODE "-gencode arch=compute_75,code=[sm_75,compute_75] ")
endif()
endif()
file(GLOB GPU_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/[^.]*.cu ${CMAKE_CURRENT_SOURCE_DIR}/gpu/[^.]*.cu)
list(REMOVE_ITEM GPU_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_pppm.cu)
set(GPU_LIB_CU_HIP "")
foreach(CU_FILE ${GPU_LIB_CU})
get_filename_component(CU_NAME ${CU_FILE} NAME_WE)
string(REGEX REPLACE "^.*lal_" "" CU_NAME "${CU_NAME}")
set(CU_CPP_FILE "${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}.cu.cpp")
set(CUBIN_FILE "${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}.cubin")
set(CUBIN_H_FILE "${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}_cubin.h")
if(HIP_PLATFORM STREQUAL "hcc")
configure_file(${CU_FILE} ${CU_CPP_FILE} COPYONLY)
add_custom_command(OUTPUT ${CUBIN_FILE}
VERBATIM COMMAND ${HIP_HIPCC_EXECUTABLE} --genco -t="${HIP_ARCH}" -f=\"-O3 -ffast-math -DUSE_HIP -D_${GPU_PREC_SETTING} -I${LAMMPS_LIB_SOURCE_DIR}/gpu\" -o ${CUBIN_FILE} ${CU_CPP_FILE}
DEPENDS ${CU_CPP_FILE}
COMMENT "Generating ${CU_NAME}.cubin")
elseif(HIP_PLATFORM STREQUAL "nvcc")
add_custom_command(OUTPUT ${CUBIN_FILE}
VERBATIM COMMAND ${HIP_HIPCC_EXECUTABLE} --fatbin --use_fast_math -DUSE_HIP -D_${GPU_PREC_SETTING} ${HIP_CUDA_GENCODE} -I${LAMMPS_LIB_SOURCE_DIR}/gpu -o ${CUBIN_FILE} ${CU_FILE}
DEPENDS ${CU_FILE}
COMMENT "Generating ${CU_NAME}.cubin")
endif()
cuda_compile_fatbin(GPU_GEN_OBJS ${GPU_LIB_CU} OPTIONS
-DUNIX -O3 --use_fast_math -Wno-deprecated-gpu-targets -DNV_KERNEL -DUCL_CUDADR ${GPU_CUDA_GENCODE} -D_${GPU_PREC_SETTING})
add_custom_command(OUTPUT ${CUBIN_H_FILE}
COMMAND ${CMAKE_COMMAND} -D SOURCE_DIR=${CMAKE_CURRENT_SOURCE_DIR} -D VARNAME=${CU_NAME} -D HEADER_FILE=${CUBIN_H_FILE} -D SOURCE_FILES=${CUBIN_FILE} -P ${CMAKE_CURRENT_SOURCE_DIR}/Modules/GenerateBinaryHeader.cmake
DEPENDS ${CUBIN_FILE}
COMMENT "Generating ${CU_NAME}_cubin.h")
cuda_compile(GPU_OBJS ${GPU_LIB_CUDPP_CU} OPTIONS ${CUDA_REQUEST_PIC}
-DUNIX -O3 --use_fast_math -Wno-deprecated-gpu-targets -DUCL_CUDADR ${GPU_CUDA_GENCODE} -D_${GPU_PREC_SETTING})
list(APPEND GPU_LIB_SOURCES ${CUBIN_H_FILE})
endforeach()
foreach(CU_OBJ ${GPU_GEN_OBJS})
get_filename_component(CU_NAME ${CU_OBJ} NAME_WE)
string(REGEX REPLACE "^.*_lal_" "" CU_NAME "${CU_NAME}")
add_custom_command(OUTPUT ${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}_cubin.h
COMMAND ${BIN2C} -c -n ${CU_NAME} ${CU_OBJ} > ${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}_cubin.h
DEPENDS ${CU_OBJ}
COMMENT "Generating ${CU_NAME}_cubin.h")
list(APPEND GPU_LIB_SOURCES ${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}_cubin.h)
endforeach()
set_directory_properties(PROPERTIES ADDITIONAL_MAKE_CLEAN_FILES "${LAMMPS_LIB_BINARY_DIR}/gpu/*_cubin.h")
set_directory_properties(PROPERTIES ADDITIONAL_MAKE_CLEAN_FILES "${LAMMPS_LIB_BINARY_DIR}/gpu/*_cubin.h ${LAMMPS_LIB_BINARY_DIR}/gpu/*.cu.cpp")
hip_add_library(gpu STATIC ${GPU_LIB_SOURCES})
target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu)
target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT)
target_compile_definitions(gpu PRIVATE -DUSE_HIP)
add_library(gpu STATIC ${GPU_LIB_SOURCES} ${GPU_LIB_CUDPP_SOURCES} ${GPU_OBJS})
target_link_libraries(gpu PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY})
target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu ${CUDA_INCLUDE_DIRS})
target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT ${GPU_CUDA_MPS_FLAGS})
if(CUDPP_OPT)
target_include_directories(gpu PRIVATE ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini)
target_compile_definitions(gpu PRIVATE -DUSE_CUDPP)
endif()
if(HIP_USE_DEVICE_SORT)
# add hipCUB
target_include_directories(gpu PRIVATE ${HIP_ROOT_DIR}/../include)
target_compile_definitions(gpu PRIVATE -DUSE_HIP_DEVICE_SORT)
target_link_libraries(lammps PRIVATE gpu)
if(HIP_PLATFORM STREQUAL "nvcc")
find_package(CUB)
add_executable(nvc_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp)
target_compile_definitions(nvc_get_devices PRIVATE -DUCL_CUDADR)
target_link_libraries(nvc_get_devices PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY})
target_include_directories(nvc_get_devices PRIVATE ${CUDA_INCLUDE_DIRS})
elseif(GPU_API STREQUAL "OPENCL")
if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
# download and unpack support binaries for compilation of windows binaries.
set(LAMMPS_THIRDPARTY_URL "http://download.lammps.org/thirdparty")
file(DOWNLOAD "${LAMMPS_THIRDPARTY_URL}/opencl-win-devel.tar.gz" "${CMAKE_CURRENT_BINARY_DIR}/opencl-win-devel.tar.gz"
EXPECTED_MD5 2c00364888d5671195598b44c2e0d44d)
execute_process(COMMAND ${CMAKE_COMMAND} -E tar xzf opencl-win-devel.tar.gz WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
add_library(OpenCL::OpenCL UNKNOWN IMPORTED)
if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86")
set_target_properties(OpenCL::OpenCL PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/OpenCL/lib_win32/libOpenCL.dll")
elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64")
set_target_properties(OpenCL::OpenCL PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/OpenCL/lib_win64/libOpenCL.dll")
if(CUB_FOUND)
set(DOWNLOAD_CUB_DEFAULT OFF)
else()
set(DOWNLOAD_CUB_DEFAULT ON)
endif()
set_target_properties(OpenCL::OpenCL PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${CMAKE_CURRENT_BINARY_DIR}/OpenCL/include")
else()
find_package(OpenCL REQUIRED)
option(DOWNLOAD_CUB "Download and compile the CUB library instead of using an already installed one" ${DOWNLOAD_CUB_DEFAULT})
if(DOWNLOAD_CUB)
message(STATUS "CUB download requested")
include(ExternalProject)
ExternalProject_Add(CUB
GIT_REPOSITORY https://github.com/NVlabs/cub
TIMEOUT 5
PREFIX "${CMAKE_CURRENT_BINARY_DIR}"
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
UPDATE_COMMAND ""
)
ExternalProject_get_property(CUB SOURCE_DIR)
set(CUB_INCLUDE_DIR ${SOURCE_DIR})
else()
find_package(CUB)
if(NOT CUB_FOUND)
  # error message previously referenced DOWNLOAD_VORO (copy-paste from the
  # VORONOI package); the option controlling the CUB download is DOWNLOAD_CUB
  message(FATAL_ERROR "CUB library not found. Help CMake to find it by setting CUB_INCLUDE_DIR, or set DOWNLOAD_CUB=ON to download it")
endif()
endif()
target_include_directories(gpu PRIVATE ${CUB_INCLUDE_DIR})
endif()
set(OCL_TUNE "generic" CACHE STRING "OpenCL Device Tuning")
set(OCL_TUNE_VALUES intel fermi kepler cypress generic)
set_property(CACHE OCL_TUNE PROPERTY STRINGS ${OCL_TUNE_VALUES})
validate_option(OCL_TUNE OCL_TUNE_VALUES)
string(TOUPPER ${OCL_TUNE} OCL_TUNE)
include(OpenCLUtils)
set(OCL_COMMON_HEADERS ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_preprocessor.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_aux_fun1.h)
file(GLOB GPU_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/[^.]*.cu)
list(REMOVE_ITEM GPU_LIB_CU
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_gayberne.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_gayberne_lj.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_re_squared.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_re_squared_lj.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod.cu
)
foreach(GPU_KERNEL ${GPU_LIB_CU})
get_filename_component(basename ${GPU_KERNEL} NAME_WE)
string(SUBSTRING ${basename} 4 -1 KERNEL_NAME)
GenerateOpenCLHeader(${KERNEL_NAME} ${CMAKE_CURRENT_BINARY_DIR}/gpu/${KERNEL_NAME}_cl.h ${OCL_COMMON_HEADERS} ${GPU_KERNEL})
list(APPEND GPU_LIB_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/gpu/${KERNEL_NAME}_cl.h)
endforeach()
GenerateOpenCLHeader(gayberne ${CMAKE_CURRENT_BINARY_DIR}/gpu/gayberne_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_ellipsoid_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_gayberne.cu)
GenerateOpenCLHeader(gayberne_lj ${CMAKE_CURRENT_BINARY_DIR}/gpu/gayberne_lj_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_ellipsoid_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_gayberne_lj.cu)
GenerateOpenCLHeader(re_squared ${CMAKE_CURRENT_BINARY_DIR}/gpu/re_squared_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_ellipsoid_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_re_squared.cu)
GenerateOpenCLHeader(re_squared_lj ${CMAKE_CURRENT_BINARY_DIR}/gpu/re_squared_lj_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_ellipsoid_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_re_squared_lj.cu)
GenerateOpenCLHeader(tersoff ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff.cu)
GenerateOpenCLHeader(tersoff_zbl ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_zbl_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl.cu)
GenerateOpenCLHeader(tersoff_mod ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_mod_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod.cu)
list(APPEND GPU_LIB_SOURCES
${CMAKE_CURRENT_BINARY_DIR}/gpu/gayberne_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/gayberne_lj_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/re_squared_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/re_squared_lj_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_zbl_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_mod_cl.h
)
add_library(gpu STATIC ${GPU_LIB_SOURCES})
target_link_libraries(gpu PRIVATE OpenCL::OpenCL)
target_include_directories(gpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/gpu)
target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -D${OCL_TUNE}_OCL -DMPI_GERYON -DUCL_NO_EXIT)
target_compile_definitions(gpu PRIVATE -DUSE_OPENCL)
target_link_libraries(lammps PRIVATE gpu)
add_executable(ocl_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp)
target_compile_definitions(ocl_get_devices PRIVATE -DUCL_OPENCL)
target_link_libraries(ocl_get_devices PRIVATE OpenCL::OpenCL)
endif()
# GPU package
FindStyleHeaders(${GPU_SOURCES_DIR} FIX_CLASS fix_ FIX)
hip_add_executable(hip_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp)
target_compile_definitions(hip_get_devices PRIVATE -DUCL_HIP)
set_property(GLOBAL PROPERTY "GPU_SOURCES" "${GPU_SOURCES}")
if(HIP_PLATFORM STREQUAL "nvcc")
target_compile_definitions(gpu PRIVATE -D__HIP_PLATFORM_NVCC__)
target_include_directories(gpu PRIVATE ${HIP_ROOT_DIR}/include)
target_include_directories(gpu PRIVATE ${CUDA_INCLUDE_DIRS})
target_link_libraries(gpu PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY})
# detects styles which have GPU version
RegisterStylesExt(${GPU_SOURCES_DIR} gpu GPU_SOURCES)
get_property(GPU_SOURCES GLOBAL PROPERTY GPU_SOURCES)
target_link_libraries(gpu PRIVATE MPI::MPI_CXX)
if(NOT BUILD_SHARED_LIBS)
install(TARGETS gpu EXPORT LAMMPS_Targets LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
target_compile_definitions(hip_get_devices PRIVATE -D__HIP_PLATFORM_NVCC__)
target_include_directories(hip_get_devices PRIVATE ${HIP_ROOT_DIR}/include)
target_include_directories(hip_get_devices PRIVATE ${CUDA_INCLUDE_DIRS})
target_link_libraries(hip_get_devices PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY})
endif()
target_compile_definitions(gpu PRIVATE -DLAMMPS_${LAMMPS_SIZES})
set_target_properties(gpu PROPERTIES OUTPUT_NAME lammps_gpu${LAMMPS_MACHINE})
target_sources(lammps PRIVATE ${GPU_SOURCES})
target_include_directories(lammps PRIVATE ${GPU_SOURCES_DIR})
target_link_libraries(lammps PRIVATE gpu)
endif()
# GPU package
FindStyleHeaders(${GPU_SOURCES_DIR} FIX_CLASS fix_ FIX)
set_property(GLOBAL PROPERTY "GPU_SOURCES" "${GPU_SOURCES}")
# detects styles which have GPU version
RegisterStylesExt(${GPU_SOURCES_DIR} gpu GPU_SOURCES)
get_property(GPU_SOURCES GLOBAL PROPERTY GPU_SOURCES)
target_link_libraries(gpu PRIVATE MPI::MPI_CXX)
if(NOT BUILD_SHARED_LIBS)
install(TARGETS gpu EXPORT LAMMPS_Targets LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
endif()
target_compile_definitions(gpu PRIVATE -DLAMMPS_${LAMMPS_SIZES})
set_target_properties(gpu PROPERTIES OUTPUT_NAME lammps_gpu${LAMMPS_MACHINE})
target_sources(lammps PRIVATE ${GPU_SOURCES})
target_include_directories(lammps PRIVATE ${GPU_SOURCES_DIR})

View File

@ -87,27 +87,33 @@ GPU package
---------------------
To build with this package, you must choose options for precision and
which GPU hardware to build for.
which GPU hardware to build for. The GPU package currently supports
three different types of backends: OpenCL, CUDA and HIP.
**CMake build**\ :
.. code-block:: bash
-D GPU_API=value # value = opencl (default) or cuda
-D GPU_PREC=value # precision setting
# value = double or mixed (default) or single
-D OCL_TUNE=value # hardware choice for GPU_API=opencl
# generic (default) or intel (Intel CPU) or fermi, kepler, cypress (NVIDIA)
-D GPU_ARCH=value # primary GPU hardware choice for GPU_API=cuda
# value = sm_XX, see below
# default is sm_30
-D CUDPP_OPT=value # optimization setting for GPU_API=cuda
# enables CUDA Performance Primitives Optimizations
# value = yes (default) or no
-D CUDA_MPS_SUPPORT=value # enables some tweaks required to run with active nvidia-cuda-mps daemon
# value = yes or no (default)
-D GPU_API=value # value = opencl (default) or cuda or hip
-D GPU_PREC=value # precision setting
# value = double or mixed (default) or single
-D OCL_TUNE=value # hardware choice for GPU_API=opencl
# generic (default) or intel (Intel CPU) or fermi, kepler, cypress (NVIDIA)
-D GPU_ARCH=value # primary GPU hardware choice for GPU_API=cuda
# value = sm_XX, see below
# default is sm_30
-D HIP_ARCH=value # primary GPU hardware choice for GPU_API=hip
# value depends on selected HIP_PLATFORM
# default is 'gfx906' for HIP_PLATFORM=hcc and 'sm_30' for HIP_PLATFORM=nvcc
-D HIP_USE_DEVICE_SORT=value # enables GPU sorting
# value = yes (default) or no
-D CUDPP_OPT=value # optimization setting for GPU_API=cuda
# enables CUDA Performance Primitives Optimizations
# value = yes (default) or no
-D CUDA_MPS_SUPPORT=value # enables some tweaks required to run with active nvidia-cuda-mps daemon
# value = yes or no (default)
GPU_ARCH settings for different GPU hardware is as follows:
:code:`GPU_ARCH` settings for different GPU hardware is as follows:
* sm_12 or sm_13 for GT200 (supported by CUDA 3.2 until CUDA 6.5)
* sm_20 or sm_21 for Fermi (supported by CUDA 3.2 until CUDA 7.5)
@ -126,6 +132,28 @@ Thus the GPU_ARCH setting is merely an optimization, to have code for
the preferred GPU architecture directly included rather than having to wait
for the JIT compiler of the CUDA driver to translate it.
If you are compiling with HIP, note that before running CMake you will have to
set appropriate environment variables. Some variables such as
:code:`HCC_AMDGPU_TARGET` or :code:`CUDA_PATH` are necessary for :code:`hipcc`
and the linker to work correctly.
.. code:: bash
# AMDGPU target
export HIP_PLATFORM=hcc
export HCC_AMDGPU_TARGET=gfx906
cmake -D PKG_GPU=on -D GPU_API=HIP -D HIP_ARCH=gfx906 -D CMAKE_CXX_COMPILER=hipcc ..
make -j 4
.. code:: bash
# CUDA target
# !!! DO NOT set CMAKE_CXX_COMPILER !!!
export HIP_PLATFORM=nvcc
export CUDA_PATH=/usr/local/cuda
cmake -D PKG_GPU=on -D GPU_API=HIP -D HIP_ARCH=sm_70 ..
make -j 4
**Traditional make**\ :
Before building LAMMPS, you must build the GPU library in ``lib/gpu``\ .

View File

@ -14,7 +14,7 @@ Syntax
* adapt = style name of this fix command
* N = adapt simulation settings every this many timesteps
* one or more attribute/arg pairs may be appended
* attribute = *pair* or *kspace* or *atom*
* attribute = *pair* or *bond* or *kspace* or *atom*
.. parsed-literal::
@ -86,8 +86,8 @@ the end of a simulation. Even if *reset* is specified as *yes*\ , a
restart file written during a simulation will contain the modified
settings.
If the *scale* keyword is set to *no*\ , then the value of the altered
parameter will be whatever the variable generates. If the *scale*
keyword is set to *yes*\ , then the value of the altered parameter will
be the initial value of that parameter multiplied by whatever the
variable generates. I.e. the variable is now a "scale factor" applied
@ -319,26 +319,23 @@ The *atom* keyword enables various atom properties to be changed. The
current list of atom parameters that can be varied by this fix:
* charge = charge on particle
* diameter or diameter/disc = diameter of particle
The *v_name* argument of the *atom* keyword is the name of an
:doc:`equal-style variable <variable>` which will be evaluated each time
this fix is invoked to set, or scale, the parameter to a new value.
It should be specified as v_name, where name is the variable name. See the
discussion above describing the formulas associated with equal-style
variables. The new value is assigned to the corresponding attribute
for all atoms in the fix group.
.. note::
The *atom* keyword works this way whether the *scale* keyword is
set to *no* or *yes*\ . I.e. the use of scale yes is not yet supported
by the *atom* keyword.
If the atom parameter is *diameter* and per-atom density and per-atom
mass are defined for particles (e.g. :doc:`atom_style granular <atom_style>`), then the mass of each particle is also
changed when the diameter changes. The mass is set from the particle volume
for 3d systems (density is assumed to stay constant). For 2d, the default is
for LAMMPS to model particles with a radius attribute as spheres.
However, if the atom parameter is *diameter/disc*, then the mass is
set from the particle area (the density is assumed to be in mass/distance^2 units).
For example, these commands would shrink the diameter of all granular
particles in the "center" group from 1.0 to 0.1 in a linear fashion

View File

@ -170,6 +170,7 @@ ba
Babadi
Babaei
backcolor
backends
Baczewski
Bagi
Bagnold

1
lib/gpu/.gitignore vendored
View File

@ -2,6 +2,7 @@
/obj_ocl
/ocl_get_devices
/nvc_get_devices
/hip_get_devices
/*.cubin
/*_cubin.h
/*_cl.h

148
lib/gpu/Makefile.hip Normal file
View File

@ -0,0 +1,148 @@
# /* ----------------------------------------------------------------------
# Generic Linux Makefile for HIP
# - export HIP_PLATFORM=hcc (or nvcc) before execution
# - change HIP_ARCH for your GPU
# ------------------------------------------------------------------------- */

# this setting should match LAMMPS Makefile
# one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL
LMP_INC = -DLAMMPS_SMALLBIG

# precision for GPU calculations
# -D_SINGLE_SINGLE # Single precision for all calculations
# -D_DOUBLE_DOUBLE # Double precision for all calculations
# -D_SINGLE_DOUBLE # Accumulation of forces, etc. in double
HIP_PRECISION = -D_SINGLE_DOUBLE

HIP_OPTS = -O3
HIP_HOST_OPTS = -Wno-deprecated-declarations
HIP_HOST_INCLUDE =

# use device sort
# requires linking with hipcc and hipCUB + (rocPRIM or CUB for AMD or Nvidia respectively)
HIP_HOST_OPTS += -DUSE_HIP_DEVICE_SORT
# path to cub
HIP_HOST_INCLUDE += -I./
# path to hipcub
HIP_HOST_INCLUDE += -I$(HIP_PATH)/../include

# use mpi
HIP_HOST_OPTS += -DMPI_GERYON -DUCL_NO_EXIT
# these settings should match the LAMMPS Makefile
MPI_COMP_OPTS = $(shell mpicxx --showme:compile)
MPI_LINK_OPTS = $(shell mpicxx --showme:link)
#MPI_COMP_OPTS += -I/usr/include/mpi -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1

# locate the HIP installation and ask hipconfig which compiler backend it uses
HIP_PATH ?= $(wildcard /opt/rocm/hip)
HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --compiler)

# backend-specific fast-math and GPU architecture flags
ifeq (hcc,$(HIP_PLATFORM))
HIP_OPTS += -ffast-math
# possible values: gfx803,gfx900,gfx906
HIP_ARCH = gfx906
else ifeq (nvcc,$(HIP_PLATFORM))
HIP_OPTS += --use_fast_math
HIP_ARCH = -gencode arch=compute_30,code=[sm_30,compute_30] -gencode arch=compute_32,code=[sm_32,compute_32] -gencode arch=compute_35,code=[sm_35,compute_35] \
           -gencode arch=compute_50,code=[sm_50,compute_50] -gencode arch=compute_52,code=[sm_52,compute_52] -gencode arch=compute_53,code=[sm_53,compute_53]\
           -gencode arch=compute_60,code=[sm_60,compute_60] -gencode arch=compute_61,code=[sm_61,compute_61] -gencode arch=compute_62,code=[sm_62,compute_62]\
           -gencode arch=compute_70,code=[sm_70,compute_70] -gencode arch=compute_72,code=[sm_72,compute_72] -gencode arch=compute_75,code=[sm_75,compute_75]
else
$(error Specify HIP platform using 'export HIP_PLATFORM=(hcc,nvcc)')
endif

BIN_DIR = .
OBJ_DIR = ./obj
LIB_DIR = .
AR = ar
BSH = /bin/sh

# /* ----------------------------------------------------------------------
# don't change section below without need
# ------------------------------------------------------------------------- */

HIP_OPTS += -DUSE_HIP $(HIP_PRECISION)
HIP_GPU_OPTS += $(HIP_OPTS) -I./

# per-backend kernel compiler invocation and kernel source suffix
ifeq (hcc,$(HIP_PLATFORM))
HIP_HOST_OPTS += -fPIC
HIP_GPU_CC = $(HIP_PATH)/bin/hipcc --genco
HIP_GPU_OPTS_S = -t="$(HIP_ARCH)" -f=\"
HIP_GPU_OPTS_E = \"
HIP_KERNEL_SUFFIX = .cpp
HIP_LIBS_TARGET = export HCC_AMDGPU_TARGET := $(HIP_ARCH)
export HCC_AMDGPU_TARGET := $(HIP_ARCH)
else ifeq (nvcc,$(HIP_PLATFORM))
HIP_GPU_CC = $(HIP_PATH)/bin/hipcc --fatbin
HIP_GPU_OPTS += $(HIP_ARCH)
HIP_GPU_SORT_ARCH = $(HIP_ARCH)
# fix nvcc can't handle -pthread flag
MPI_COMP_OPTS := $(subst -pthread,-Xcompiler -pthread,$(MPI_COMP_OPTS))
MPI_LINK_OPTS := $(subst -pthread,-Xcompiler -pthread,$(MPI_LINK_OPTS))
endif

# hipcc is essential for device sort, because of hipcub is header only library and ROCm gpu code generation is deferred to the linking stage
HIP_HOST_CC = $(HIP_PATH)/bin/hipcc
HIP_HOST_OPTS += $(HIP_OPTS) $(MPI_COMP_OPTS) $(LMP_INC)
HIP_HOST_CC_CMD = $(HIP_HOST_CC) $(HIP_HOST_OPTS) $(HIP_HOST_INCLUDE)

# sources
ALL_H = $(wildcard ./geryon/ucl*.h) $(wildcard ./geryon/hip*.h) $(wildcard ./lal_*.h)
SRCS := $(wildcard ./lal_*.cpp)
OBJS := $(subst ./,$(OBJ_DIR)/,$(SRCS:%.cpp=%.o))
CUS := $(wildcard lal_*.cu)
# lal_pppm.cu is compiled twice (float and double grid types), so the generic
# pppm_cubin.h is filtered out and replaced by pppm_f_cubin.h / pppm_d_cubin.h
CUHS := $(filter-out pppm_cubin.h, $(CUS:lal_%.cu=%_cubin.h)) pppm_f_cubin.h pppm_d_cubin.h
CUHS := $(addprefix $(OBJ_DIR)/, $(CUHS))

all: $(OBJ_DIR) $(CUHS) $(LIB_DIR)/libgpu.a $(BIN_DIR)/hip_get_devices

$(OBJ_DIR):
	mkdir -p $@

# GPU kernels compilation
$(OBJ_DIR)/pppm_f_cubin.h: lal_pppm.cu $(ALL_H)
	@cp $< $(OBJ_DIR)/temp_pppm_f.cu$(HIP_KERNEL_SUFFIX)
	$(HIP_GPU_CC) $(HIP_GPU_OPTS_S) $(HIP_GPU_OPTS) -Dgrdtyp=float -Dgrdtyp4=float4 $(HIP_GPU_OPTS_E) -o $(OBJ_DIR)/pppm_f.cubin $(OBJ_DIR)/temp_pppm_f.cu$(HIP_KERNEL_SUFFIX)
	@xxd -i $(OBJ_DIR)/pppm_f.cubin $@
	@sed -i "s/[a-zA-Z0-9_]*pppm_f_cubin/pppm_f/g" $@
	@rm $(OBJ_DIR)/temp_pppm_f.cu$(HIP_KERNEL_SUFFIX) $(OBJ_DIR)/pppm_f.cubin

$(OBJ_DIR)/pppm_d_cubin.h: lal_pppm.cu $(ALL_H)
	@cp $< $(OBJ_DIR)/temp_pppm_d.cu$(HIP_KERNEL_SUFFIX)
	$(HIP_GPU_CC) $(HIP_GPU_OPTS_S) $(HIP_GPU_OPTS) -Dgrdtyp=double -Dgrdtyp4=double4 $(HIP_GPU_OPTS_E) -o $(OBJ_DIR)/pppm_d.cubin $(OBJ_DIR)/temp_pppm_d.cu$(HIP_KERNEL_SUFFIX)
	@xxd -i $(OBJ_DIR)/pppm_d.cubin $@
	@sed -i "s/[a-zA-Z0-9_]*pppm_d_cubin/pppm_d/g" $@
	@rm $(OBJ_DIR)/temp_pppm_d.cu$(HIP_KERNEL_SUFFIX) $(OBJ_DIR)/pppm_d.cubin

$(OBJ_DIR)/%_cubin.h: lal_%.cu $(ALL_H)
	@cp $< $(OBJ_DIR)/temp_$*.cu$(HIP_KERNEL_SUFFIX)
	$(HIP_GPU_CC) $(HIP_GPU_OPTS_S) $(HIP_GPU_OPTS) $(HIP_GPU_OPTS_E) -o $(OBJ_DIR)/$*.cubin $(OBJ_DIR)/temp_$*.cu$(HIP_KERNEL_SUFFIX)
	@xxd -i $(OBJ_DIR)/$*.cubin $@
	@sed -i "s/[a-zA-Z0-9_]*$*_cubin/$*/g" $@
	@rm $(OBJ_DIR)/temp_$*.cu$(HIP_KERNEL_SUFFIX) $(OBJ_DIR)/$*.cubin

# host sources compilation
# lal_atom.cpp additionally receives the device-sort architecture flags
$(OBJ_DIR)/lal_atom.o: lal_atom.cpp $(CUHS) $(ALL_H)
	$(HIP_HOST_CC_CMD) -o $@ -c $< -I$(OBJ_DIR) $(HIP_GPU_SORT_ARCH)

$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(ALL_H)
	$(HIP_HOST_CC_CMD) -o $@ -c $< -I$(OBJ_DIR)

# libgpu building
# Makefile.lammps records the platform settings for the LAMMPS build
$(LIB_DIR)/libgpu.a: $(OBJS)
	$(AR) -crs $@ $(OBJS)
	printf "export HIP_PLATFORM := %s\n%s\n" "$(HIP_PLATFORM)" "$(HIP_LIBS_TARGET)" > Makefile.lammps

# test app building
$(BIN_DIR)/hip_get_devices: ./geryon/ucl_get_devices.cpp $(ALL_H)
	$(HIP_HOST_CC_CMD) -o $@ $< -DUCL_HIP $(MPI_LINK_OPTS)

clean:
	-rm -f $(BIN_DIR)/hip_get_devices $(LIB_DIR)/libgpu.a $(OBJS) $(OBJ_DIR)/temp_* $(CUHS)

View File

@ -67,8 +67,8 @@ library requires installing the CUDA GPU driver and CUDA toolkit for
your operating system. Installation of the CUDA SDK is not necessary.
In addition to the LAMMPS library, the binary nvc_get_devices will also
be built. This can be used to query the names and properties of GPU
devices on your system. A Makefile for OpenCL and ROCm HIP compilation
is provided, but support for it is not currently provided by the developers.
Details of the implementation are provided in:
----
@ -169,6 +169,25 @@ NOTE: The system-specific setting LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG,
src/MAKE/Makefile.foo) should be consistent with that specified
when building libgpu.a (i.e. by LMP_INC in the lib/gpu/Makefile.bar).
BUILDING FOR HIP FRAMEWORK
--------------------------------
1. Install the latest ROCm framework (https://github.com/RadeonOpenCompute/ROCm).
2. GPU sorting requires installing hipcub
(https://github.com/ROCmSoftwarePlatform/hipCUB). The HIP CUDA-backend
additionally requires cub (https://nvlabs.github.io/cub). Download and
extract the cub directory to lammps/lib/gpu/ or specify an appropriate
path in lammps/lib/gpu/Makefile.hip.
3. In Makefile.hip it is possible to specify the target platform via
export HIP_PLATFORM=hcc or HIP_PLATFORM=nvcc as well as the target
architecture (gfx803, gfx900, gfx906 etc.)
4. If your MPI implementation does not support `mpicxx --showme` command,
it is required to specify the corresponding MPI compiler and linker flags
in lammps/lib/gpu/Makefile.hip and in lammps/src/MAKE/OPTIONS/Makefile.hip.
5. Building the GPU library (libgpu.a):
cd lammps/lib/gpu; make -f Makefile.hip -j
6. Building the LAMMPS executable (lmp_hip):
cd ../../src; make hip -j
EXAMPLE CONVENTIONAL BUILD PROCESS
--------------------------------

519
lib/gpu/geryon/hip_device.h Normal file
View File

@ -0,0 +1,519 @@
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef HIP_DEVICE
#define HIP_DEVICE
#include <hip/hip_runtime.h>
#include <unordered_map>
#include <string>
#include <vector>
#include <iostream>
#include "hip_macros.h"
#include "ucl_types.h"
namespace ucl_hip {
// --------------------------------------------------------------------------
// - COMMAND QUEUE STUFF
// --------------------------------------------------------------------------
// Geryon's generic "command queue" maps directly onto a HIP stream.
typedef hipStream_t command_queue;

// Block the host until all work queued on the given stream has completed.
inline void ucl_sync(hipStream_t &stream) {
  CU_SAFE_CALL(hipStreamSynchronize(stream));
}
// Cached per-device properties gathered once by UCL_Device's constructor.
// (The name NVDProperties presumably mirrors the CUDA version of this
// header -- confirm against geryon/nvd_device.h.)
struct NVDProperties {
  int device_id;                 // runtime device ordinal
  std::string name;              // human-readable device name
  int major;                     // compute capability, major version
  int minor;                     // compute capability, minor version
  CUDA_INT_TYPE totalGlobalMem;  // total global memory in bytes
  int multiProcessorCount;
  int maxThreadsPerBlock;
  int maxThreadsDim[3];          // max block dimensions (x, y, z)
  int maxGridSize[3];            // max grid dimensions (x, y, z)
  int sharedMemPerBlock;
  int totalConstantMemory;
  int SIMDWidth;                 // warp/wavefront size
  int memPitch;                  // NOTE(review): never filled in by UCL_Device() -- confirm
  int regsPerBlock;
  int clockRate;                 // in kHz (clock_rate() multiplies by 1e-6 for GHz)
  int textureAlign;              // NOTE(review): never filled in by UCL_Device() -- confirm
  int kernelExecTimeoutEnabled;  // NOTE(review): never filled in by UCL_Device() -- confirm
  int integrated;
  int canMapHostMemory;          // NOTE(review): never filled in by UCL_Device() -- confirm
  int concurrentKernels;
  int ECCEnabled;                // NOTE(review): never filled in by UCL_Device() -- confirm
  int computeMode;
};
/// Class for looking at device properties
/** \note Calls to change the device outside of the class results in incorrect
  *       behavior
  * \note There is no error checking for indexing past the number of devices **/
class UCL_Device {
 public:
  /// Collect properties for every GPU on the node
  /** \note You must set the active GPU with set() before using the device **/
  inline UCL_Device();

  inline ~UCL_Device();

  /// Returns 1 (For compatibility with OpenCL)
  inline int num_platforms() { return 1; }

  /// Return a string with name and info of the current platform
  inline std::string platform_name()
    { return "HIP platform"; }

  /// Delete any contexts/data and set the platform number to be used
  inline int set_platform(const int pid);

  /// Return the number of devices that support CUDA
  inline int num_devices() { return _properties.size(); }

  /// Set the CUDA device to the specified device number
  /** A context and default command queue will be created for the device
    * Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not
    * be allocated for use. clear() is called to delete any contexts and
    * associated data from previous calls to set(). **/
  inline int set(int num);

  /// Delete any context and associated data stored from a call to set()
  inline void clear();

  /// Get the current device number
  inline int device_num() { return _device; }

  /// Returns the default stream for the current device
  inline command_queue & cq() { return cq(0); }

  /// Returns the stream indexed by i
  inline command_queue & cq(const int i) { return _cq[i]; }

  /// Block until all commands in the default stream have completed
  inline void sync() { sync(0); }

  /// Block until all commands in the specified stream have completed
  inline void sync(const int i) { ucl_sync(cq(i)); }

  /// Get the number of command queues currently available on device
  inline int num_queues()
    { return _cq.size(); }

  /// Add a stream for device computations
  inline void push_command_queue() {
    _cq.push_back(hipStream_t());
    CU_SAFE_CALL(hipStreamCreateWithFlags(&_cq.back(),0));
  }

  /// Remove a stream for device computations
  /** \note You cannot delete the default stream **/
  inline void pop_command_queue() {
    if (_cq.size()<2) return;
    CU_SAFE_CALL_NS(hipStreamDestroy(_cq.back()));
    _cq.pop_back();
  }

  /// Set the default command queue (by default this is the null stream)
  /** \param i index of the command queue (as added by push_command_queue())
      If i is 0, the default command queue is set to the null stream **/
  inline void set_command_queue(const int i) {
    if (i==0) _cq[0]=0;
    else _cq[0]=_cq[i];
  }

  /// Get the current CUDA device name
  inline std::string name() { return name(_device); }
  /// Get the CUDA device name
  inline std::string name(const int i)
    { return std::string(_properties[i].name); }

  /// Get a string telling the type of the current device
  inline std::string device_type_name() { return device_type_name(_device); }
  /// Get a string telling the type of the device
  inline std::string device_type_name(const int i) { return "GPU"; }

  /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
  inline int device_type() { return device_type(_device); }
  /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
  inline int device_type(const int i) { return UCL_GPU; }

  /// Returns true if host memory is efficiently addressable from device
  inline bool shared_memory() { return shared_memory(_device); }
  /// Returns true if host memory is efficiently addressable from device
  inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }

  /// Returns true if double precision is support for the current device
  inline bool double_precision() { return double_precision(_device); }
  /// Returns true if double precision is support for the device
  inline bool double_precision(const int i) {return arch(i)>=1.3;}

  /// Get the number of compute units on the current device
  inline unsigned cus() { return cus(_device); }
  /// Get the number of compute units
  inline unsigned cus(const int i)
    { return _properties[i].multiProcessorCount; }

  /// Get the number of cores in the current device
  inline unsigned cores() { return cores(_device); }
  /// Get the number of cores
  // NOTE(review): the cores-per-multiprocessor table follows CUDA compute
  // capability generations -- confirm it is meaningful for AMD targets.
  inline unsigned cores(const int i)
    { if (arch(i)<2.0) return _properties[i].multiProcessorCount*8;
      else if (arch(i)<2.1) return _properties[i].multiProcessorCount*32;
      else if (arch(i)<3.0) return _properties[i].multiProcessorCount*48;
      else return _properties[i].multiProcessorCount*192; }

  /// Get the gigabytes of global memory in the current device
  inline double gigabytes() { return gigabytes(_device); }
  /// Get the gigabytes of global memory
  inline double gigabytes(const int i)
    { return static_cast<double>(_properties[i].totalGlobalMem)/1073741824; }

  /// Get the bytes of global memory in the current device
  inline size_t bytes() { return bytes(_device); }
  /// Get the bytes of global memory
  inline size_t bytes(const int i) { return _properties[i].totalGlobalMem; }

  // Get the gigabytes of free memory in the current device
  inline double free_gigabytes() { return free_gigabytes(_device); }
  // Get the gigabytes of free memory
  inline double free_gigabytes(const int i)
    { return static_cast<double>(free_bytes(i))/1073741824; }

  // Get the bytes of free memory in the current device
  inline size_t free_bytes() { return free_bytes(_device); }
  // Get the bytes of free memory
  // NOTE(review): hipMemGetInfo queries the currently active device; the
  // index i is unused here -- confirm this is intended.
  inline size_t free_bytes(const int i) {
    CUDA_INT_TYPE dfree, dtotal;
    CU_SAFE_CALL_NS(hipMemGetInfo(&dfree, &dtotal));
    return static_cast<size_t>(dfree);
  }

  /// Return the GPGPU compute capability for current device
  inline double arch() { return arch(_device); }
  /// Return the GPGPU compute capability
  inline double arch(const int i)
    { return static_cast<double>(_properties[i].minor)/10+_properties[i].major;}

  /// Clock rate in GHz for current device
  inline double clock_rate() { return clock_rate(_device); }
  /// Clock rate in GHz
  inline double clock_rate(const int i)
    { return _properties[i].clockRate*1e-6;}

  /// Get the maximum number of threads per block
  inline size_t group_size() { return group_size(_device); }
  /// Get the maximum number of threads per block
  inline size_t group_size(const int i)
    { return _properties[i].maxThreadsPerBlock; }

  /// Return the maximum memory pitch in bytes for current device
  inline size_t max_pitch() { return max_pitch(_device); }
  /// Return the maximum memory pitch in bytes
  inline size_t max_pitch(const int i) { return _properties[i].memPitch; }

  /// Returns false if accelerator cannot be shared by multiple processes
  /** If it cannot be determined, true is returned **/
  inline bool sharing_supported() { return sharing_supported(_device); }
  /// Returns false if accelerator cannot be shared by multiple processes
  /** If it cannot be determined, true is returned **/
  inline bool sharing_supported(const int i)
    { return (_properties[i].computeMode == hipComputeModeDefault); }

  /// True if splitting device into equal subdevices supported
  inline bool fission_equal()
    { return fission_equal(_device); }
  /// True if splitting device into equal subdevices supported
  inline bool fission_equal(const int i)
    { return false; }
  /// True if splitting device into subdevices by specified counts supported
  inline bool fission_by_counts()
    { return fission_by_counts(_device); }
  /// True if splitting device into subdevices by specified counts supported
  inline bool fission_by_counts(const int i)
    { return false; }
  /// True if splitting device into subdevices by affinity domains supported
  inline bool fission_by_affinity()
    { return fission_by_affinity(_device); }
  /// True if splitting device into subdevices by affinity domains supported
  inline bool fission_by_affinity(const int i)
    { return false; }
  /// Maximum number of subdevices allowed from device fission
  inline int max_sub_devices()
    { return max_sub_devices(_device); }
  /// Maximum number of subdevices allowed from device fission
  inline int max_sub_devices(const int i)
    { return 0; }

  /// List all devices along with all properties
  inline void print_all(std::ostream &out);

  /// Select the platform that has accelerators (for compatibility with OpenCL)
  inline int set_platform_accelerator(int pid=-1) { return UCL_SUCCESS; }

  /// Load a compiled module, reusing a cached copy when available
  /** Modules are cached per program pointer, so the same binary image is
    * only loaded once. On failure the JIT log (if requested) is returned
    * through *log and UCL_COMPILE_ERROR is returned. **/
  inline int load_module(const void* program, hipModule_t& module, std::string *log=NULL){
    auto it = _loaded_modules.emplace(program, hipModule_t());
    if(!it.second){
      // already loaded -- reuse the cached module
      module = it.first->second;
      return UCL_SUCCESS;
    }
    const unsigned int num_opts=2;
    hipJitOption options[num_opts];
    void *values[num_opts];

    // set up size of compilation log buffer
    options[0] = hipJitOptionInfoLogBufferSizeBytes;
    values[0] = (void *)(int)10240;

    // set up pointer to the compilation log buffer
    options[1] = hipJitOptionInfoLogBuffer;
    char clog[10240] = { 0 };
    values[1] = clog;

    hipError_t err=hipModuleLoadDataEx(&module,program,num_opts, options,(void **)values);
    if (log!=NULL)
      *log=std::string(clog);
    if (err != hipSuccess) {
      #ifndef UCL_NO_EXIT
      std::cerr << std::endl
                << "----------------------------------------------------------\n"
                << " UCL Error: Error compiling PTX Program...\n"
                << "----------------------------------------------------------\n";
      // BUGFIX: print the log text itself; the previous code streamed the
      // std::string* pointer 'log' (possibly NULL), printing an address
      // instead of the JIT compilation log.
      std::cerr << clog << std::endl;
      #endif
      // drop the cache entry so a later retry can attempt the load again
      _loaded_modules.erase(it.first);
      return UCL_COMPILE_ERROR;
    }
    it.first->second = module;
    return UCL_SUCCESS;
  }

 private:
  std::unordered_map<const void*, hipModule_t> _loaded_modules;  // module cache keyed by program image
  int _device, _num_devices;
  std::vector<NVDProperties> _properties;  // one entry per usable device
  std::vector<hipStream_t> _cq;            // _cq[0] is the default (null) stream
  hipDevice_t _cu_device;
};
// Grabs the properties for all devices
// Initializes the HIP runtime once and caches the properties of every
// device; devices reporting a compute capability major version of 9999
// (presumably not usable GPUs -- confirm) are skipped.
UCL_Device::UCL_Device() {
  CU_SAFE_CALL_NS(hipInit(0));
  CU_SAFE_CALL_NS(hipGetDeviceCount(&_num_devices));
  for (int i=0; i<_num_devices; ++i) {
    hipDevice_t dev;
    CU_SAFE_CALL_NS(hipDeviceGet(&dev,i));

    int major, minor;
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, dev));
    if (major==9999)
      continue;

    // cache every property the accessors of this class rely on
    NVDProperties prop;
    prop.device_id = i;
    prop.major=major;
    prop.minor=minor;

    char namecstr[1024];
    CU_SAFE_CALL_NS(hipDeviceGetName(namecstr,1024,dev));
    prop.name=namecstr;

    CU_SAFE_CALL_NS(hipDeviceTotalMem(&prop.totalGlobalMem,dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.multiProcessorCount, hipDeviceAttributeMultiprocessorCount, dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxThreadsPerBlock, hipDeviceAttributeMaxThreadsPerBlock, dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxThreadsDim[0], hipDeviceAttributeMaxBlockDimX, dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxThreadsDim[1], hipDeviceAttributeMaxBlockDimY, dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxThreadsDim[2], hipDeviceAttributeMaxBlockDimZ, dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxGridSize[0], hipDeviceAttributeMaxGridDimX, dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxGridSize[1], hipDeviceAttributeMaxGridDimY, dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxGridSize[2], hipDeviceAttributeMaxGridDimZ, dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.sharedMemPerBlock, hipDeviceAttributeMaxSharedMemoryPerBlock, dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.totalConstantMemory, hipDeviceAttributeTotalConstantMemory, dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.SIMDWidth, hipDeviceAttributeWarpSize, dev));
    //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.regsPerBlock, hipDeviceAttributeMaxRegistersPerBlock, dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.clockRate, hipDeviceAttributeClockRate, dev));
    //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev));

    //#if CUDA_VERSION >= 2020
    //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.integrated, hipDeviceAttributeIntegrated, dev));
    //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.computeMode, hipDeviceAttributeComputeMode,dev));
    //#endif
    //#if CUDA_VERSION >= 3010
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.concurrentKernels, hipDeviceAttributeConcurrentKernels, dev));
    //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
    //#endif

    _properties.push_back(prop);
  }
  _device=-1;
  // queue 0 is the default (null) stream; it is never explicitly created
  _cq.push_back(hipStream_t());
  _cq.back()=0;
}
// Releases all non-default streams and resets the device via clear().
UCL_Device::~UCL_Device() {
  clear();
}
// There is only one platform under HIP; this clears any existing device
// state and (in debug builds) validates the platform id.
int UCL_Device::set_platform(const int pid) {
  clear();
  #ifdef UCL_DEBUG
  assert(pid<num_platforms());
  #endif
  return UCL_SUCCESS;
}
// Set the CUDA device to the specified device number
// 'num' indexes this class's property list; it is translated to the
// runtime device id before activating the device with hipSetDevice().
int UCL_Device::set(int num) {
  clear();
  _device=_properties[num].device_id;
  hipError_t err=hipDeviceGet(&_cu_device,_device);
  if (err!=hipSuccess) {
    #ifndef UCL_NO_EXIT
    std::cerr << "UCL Error: Could not access accelerator number " << num
              << " for use.\n";
    UCL_GERYON_EXIT;
    #endif
    return UCL_ERROR;
  }
  //hipError_t err=hipCtxCreate(&_context,0,_cu_device); deprecated and unnecessary
  err=hipSetDevice(_device);
  if (err!=hipSuccess) {
    #ifndef UCL_NO_EXIT
    std::cerr << "UCL Error: Could not set accelerator number " << num
              << " for use.\n";
    UCL_GERYON_EXIT;
    #endif
    return UCL_ERROR;
  }
  return UCL_SUCCESS;
}
// Destroy all user-created streams and reset the active device.
void UCL_Device::clear() {
  if (_device>-1) {
    // queue 0 is the default stream and is never destroyed
    for (int i=1; i<num_queues(); i++) pop_command_queue();
    CU_SAFE_CALL_NS(hipDeviceReset());
  }
  _device=-1;
}
// List all devices along with all properties
// Writes a human-readable report for every detected device to 'out'.
// Several properties are left commented out here because they are never
// filled in by the constructor (see UCL_Device()).
void UCL_Device::print_all(std::ostream &out) {
  //#if CUDA_VERSION >= 2020
  int driver_version;
  hipDriverGetVersion(&driver_version);
  out << "Driver Version: "
      << driver_version/1000 << "." << driver_version%100
      << std::endl;
  //#endif

  if (num_devices() == 0)
    out << "There is no device supporting HIP\n";
  for (int i=0; i<num_devices(); ++i) {
    out << "\nDevice " << i << ": \"" << name(i) << "\"\n";
    out << " Type of device: "
        << device_type_name(i).c_str() << std::endl;
    out << " Compute capability: "
        << arch(i) << std::endl;
    out << " Double precision support: ";
    if (double_precision(i))
      out << "Yes\n";
    else
      out << "No\n";
    out << " Total amount of global memory: "
        << gigabytes(i) << " GB\n";
    //#if CUDA_VERSION >= 2000
    out << " Number of compute units/multiprocessors: "
        << _properties[i].multiProcessorCount << std::endl;
    out << " Number of cores: "
        << cores(i) << std::endl;
    //#endif
    out << " Total amount of constant memory: "
        << _properties[i].totalConstantMemory << " bytes\n";
    out << " Total amount of local/shared memory per block: "
        << _properties[i].sharedMemPerBlock << " bytes\n";
    out << " Total number of registers available per block: "
        << _properties[i].regsPerBlock << std::endl;
    out << " Warp size: "
        << _properties[i].SIMDWidth << std::endl;
    out << " Maximum number of threads per block: "
        << _properties[i].maxThreadsPerBlock << std::endl;
    out << " Maximum group size (# of threads per block) "
        << _properties[i].maxThreadsDim[0] << " x "
        << _properties[i].maxThreadsDim[1] << " x "
        << _properties[i].maxThreadsDim[2] << std::endl;
    out << " Maximum item sizes (# threads for each dim) "
        << _properties[i].maxGridSize[0] << " x "
        << _properties[i].maxGridSize[1] << " x "
        << _properties[i].maxGridSize[2] << std::endl;
    //out << " Maximum memory pitch: "
    //    << max_pitch(i) << " bytes\n";
    //out << " Texture alignment: "
    //    << _properties[i].textureAlign << " bytes\n";
    out << " Clock rate: "
        << clock_rate(i) << " GHz\n";
    //#if CUDA_VERSION >= 2020
    //out << " Run time limit on kernels: ";
    //if (_properties[i].kernelExecTimeoutEnabled)
    //  out << "Yes\n";
    //else
    //  out << "No\n";
    out << " Integrated: ";
    if (_properties[i].integrated)
      out << "Yes\n";
    else
      out << "No\n";
    //out << " Support host page-locked memory mapping: ";
    //if (_properties[i].canMapHostMemory)
    //  out << "Yes\n";
    //else
    //  out << "No\n";
    out << " Compute mode: ";
    if (_properties[i].computeMode == hipComputeModeDefault)
      out << "Default\n"; // multiple threads can use device
    //#if CUDA_VERSION >= 8000
    // else if (_properties[i].computeMode == hipComputeModeExclusiveProcess)
    //#else
    else if (_properties[i].computeMode == hipComputeModeExclusive)
    //#endif
      out << "Exclusive\n"; // only thread can use device
    else if (_properties[i].computeMode == hipComputeModeProhibited)
      out << "Prohibited\n"; // no thread can use device
    //#if CUDART_VERSION >= 4000
    else if (_properties[i].computeMode == hipComputeModeExclusiveProcess)
      out << "Exclusive Process\n"; // multiple threads 1 process
    //#endif
    else
      out << "Unknown\n";
    //#endif
    //#if CUDA_VERSION >= 3010
    out << " Concurrent kernel execution: ";
    if (_properties[i].concurrentKernels)
      out << "Yes\n";
    else
      out << "No\n";
    //out << " Device has ECC support enabled: ";
    //if (_properties[i].ECCEnabled)
    //  out << "Yes\n";
    //else
    //  out << "No\n";
    //#endif
  }
}
}
#endif

298
lib/gpu/geryon/hip_kernel.h Normal file
View File

@ -0,0 +1,298 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef HIP_KERNEL
#define HIP_KERNEL
#include <hip/hip_runtime.h>
#include "hip_device.h"
#include <fstream>
#include <string>
#include <iostream>
namespace ucl_hip {
class UCL_Texture;
template <class numtyp> class UCL_D_Vec;
template <class numtyp> class UCL_D_Mat;
template <class hosttype, class devtype> class UCL_Vector;
template <class hosttype, class devtype> class UCL_Matrix;
#define UCL_MAX_KERNEL_ARGS 256
/// Class storing 1 or more kernel functions from a single string or file
class UCL_Program {
  UCL_Device* _device_ptr;  // device whose module loader/cache is used

 public:
  /// Create a program container bound to the device's default queue
  inline UCL_Program(UCL_Device &device) { _device_ptr = &device; _cq=device.cq(); }
  /// Create a program container and immediately load/compile the program
  inline UCL_Program(UCL_Device &device, const void *program,
                     const char *flags="", std::string *log=NULL) {
    _device_ptr = &device; _cq=device.cq();
    init(device);
    load_string(program,flags,log);
  }
  inline ~UCL_Program() {}

  /// Initialize the program with a device
  inline void init(UCL_Device &device) { _device_ptr = &device; _cq=device.cq(); }

  /// Clear any data associated with program
  /** \note Must call init() after each clear **/
  inline void clear() { }

  /// Load a program from a file and compile with flags
  // Reads the whole file into memory and delegates to load_string();
  // returns UCL_FILE_NOT_FOUND if the file cannot be opened.
  inline int load(const char *filename, const char *flags="", std::string *log=NULL) {
    std::ifstream in(filename);
    if (!in || in.is_open()==false) {
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Could not open kernel file: "
                << filename << std::endl;
      UCL_GERYON_EXIT;
      #endif
      return UCL_FILE_NOT_FOUND;
    }

    std::string program((std::istreambuf_iterator<char>(in)),
                        std::istreambuf_iterator<char>());
    in.close();
    return load_string(program.c_str(),flags,log);
  }

  /// Load a program from a string and compile with flags
  // NOTE(review): 'flags' is unused here; loading is delegated to the
  // device, which caches modules by program pointer -- confirm intended.
  inline int load_string(const void *program, const char *flags="", std::string *log=NULL) {
    return _device_ptr->load_module(program, _module, log);
  }

  friend class UCL_Kernel;
 private:
  hipModule_t _module;  // loaded module holding the kernels
  hipStream_t _cq;      // command queue kernels created from this program use
  friend class UCL_Texture;
};
/// Class for dealing with HIP module kernels (wraps a hipFunction_t)
class UCL_Kernel {
 public:
  UCL_Kernel() : _dimensions(1), _num_args(0) {
    _num_blocks[0]=0;
  }
  UCL_Kernel(UCL_Program &program, const char *function) :
    _dimensions(1), _num_args(0) {
    _num_blocks[0]=0;
    set_function(program,function);
    _cq=program._cq;
  }
  ~UCL_Kernel() {}
  /// Clear any function associated with the kernel
  inline void clear() { }
  /// Get the kernel function from a program
  /** \ret UCL_ERROR_FLAG (UCL_SUCCESS, UCL_FILE_NOT_FOUND, UCL_ERROR) **/
  inline int set_function(UCL_Program &program, const char *function) {
    hipError_t err=hipModuleGetFunction(&_kernel,program._module,function);
    if (err!=hipSuccess) {
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Could not find function: " << function
                << " in program.\n";
      UCL_GERYON_EXIT;
      #endif
      return UCL_FUNCTION_NOT_FOUND;
    }
    // The kernel inherits the program's default command queue.
    _cq=program._cq;
    return UCL_SUCCESS;
  }
  /// Set the kernel argument.
  /** If not a device pointer, this must be repeated each time the argument
    * changes
    * \note To set kernel parameter i (i>0), parameter i-1 must be set
    * \note On this HIP backend only appending (index==_num_args) is
    *       implemented; re-setting an already-added argument asserts. **/
  template <class dtype>
  inline void set_arg(const unsigned index, const dtype * const arg) {
    if (index==_num_args)
      add_arg(arg);
    else if (index<_num_args){
      assert(0==1); // not implemented
    }
    else
      assert(0==1); // Must add kernel parameters in sequential order
  }
  /// Set a geryon container as a kernel argument.
  template <class numtyp>
  inline void set_arg(const UCL_D_Vec<numtyp> * const arg)
    { set_arg(&arg->begin()); }
  /// Set a geryon container as a kernel argument.
  template <class numtyp>
  inline void set_arg(const UCL_D_Mat<numtyp> * const arg)
    { set_arg(&arg->begin()); }
  /// Set a geryon container as a kernel argument.
  template <class hosttype, class devtype>
  inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg)
    { set_arg(&arg->device.begin()); }
  /// Set a geryon container as a kernel argument.
  template <class hosttype, class devtype>
  inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg)
    { set_arg(&arg->device.begin()); }
  /// Add a kernel argument.
  inline void add_arg(const hipDeviceptr_t* const arg) {
    add_arg<void*>((void**)arg);
  }
  /// Add a kernel argument.
  /** Arguments are copied by value into a single byte buffer that is later
      handed to the launch via HIP_LAUNCH_PARAM_BUFFER_POINTER. Each value
      is written at the next offset rounded up to alignof(dtype), so the
      buffer layout follows the natural alignment of each parameter. **/
  template <class dtype>
  inline void add_arg(const dtype* const arg) {
    const auto old_size = _hip_kernel_args.size();
    // round old_size up to the next multiple of alignof(dtype)
    const auto aligned_size = (old_size+alignof(dtype)-1) & ~(alignof(dtype)-1);
    const auto arg_size = sizeof(dtype);
    _hip_kernel_args.resize(aligned_size + arg_size);
    *((dtype*)(&_hip_kernel_args[aligned_size])) = *arg;
    _num_args++;
    if (_num_args>UCL_MAX_KERNEL_ARGS) assert(0==1);
  }
  /// Add a geryon container as a kernel argument.
  template <class numtyp>
  inline void add_arg(const UCL_D_Vec<numtyp> * const arg)
    { add_arg(&arg->begin()); }
  /// Add a geryon container as a kernel argument.
  template <class numtyp>
  inline void add_arg(const UCL_D_Mat<numtyp> * const arg)
    { add_arg(&arg->begin()); }
  /// Add a geryon container as a kernel argument.
  template <class hosttype, class devtype>
  inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg)
    { add_arg(&arg->device.begin()); }
  /// Add a geryon container as a kernel argument.
  template <class hosttype, class devtype>
  inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg)
    { add_arg(&arg->device.begin()); }
  /// Set the number of thread blocks and the number of threads in each block
  /** \note This should be called before any arguments have been added
      \note The default command queue is used for the kernel execution **/
  inline void set_size(const size_t num_blocks, const size_t block_size) {
    _dimensions=1;
    _num_blocks[0]=num_blocks;
    _num_blocks[1]=1;
    _num_blocks[2]=1;
    _block_size[0]=block_size;
    _block_size[1]=1;
    _block_size[2]=1;
  }
  /// Set the number of thread blocks and the number of threads in each block
  /** \note This should be called before any arguments have been added
      \note The default command queue for the kernel is changed to cq **/
  inline void set_size(const size_t num_blocks, const size_t block_size,
                       command_queue &cq)
    { _cq=cq; set_size(num_blocks,block_size); }
  /// Set the number of thread blocks and the number of threads in each block
  /** \note This should be called before any arguments have been added
      \note The default command queue is used for the kernel execution **/
  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
                       const size_t block_size_x, const size_t block_size_y) {
    _dimensions=2;
    _num_blocks[0]=num_blocks_x;
    _num_blocks[1]=num_blocks_y;
    _num_blocks[2]=1;
    _block_size[0]=block_size_x;
    _block_size[1]=block_size_y;
    _block_size[2]=1;
  }
  /// Set the number of thread blocks and the number of threads in each block
  /** \note This should be called before any arguments have been added
      \note The default command queue for the kernel is changed to cq **/
  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
                       const size_t block_size_x, const size_t block_size_y,
                       command_queue &cq)
    {_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);}
  /// Set the number of thread blocks and the number of threads in each block
  /** \note This should be called before any arguments have been added
      \note The default command queue is used for the kernel execution
      \note NOTE(review): this overload sets a 3-D block size yet leaves
            _dimensions at 2 and _num_blocks[2] at 1. run() does not read
            _dimensions, so this looks harmless, but confirm intent. **/
  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
                       const size_t block_size_x,
                       const size_t block_size_y, const size_t block_size_z) {
    _dimensions=2;
    _num_blocks[0]=num_blocks_x;
    _num_blocks[1]=num_blocks_y;
    _num_blocks[2]=1;
    _block_size[0]=block_size_x;
    _block_size[1]=block_size_y;
    _block_size[2]=block_size_z;
  }
  /// Set the number of thread blocks and the number of threads in each block
  /** \note This should be called before any arguments have been added
      \note The default command queue for the kernel is changed to cq **/
  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
                       const size_t block_size_x, const size_t block_size_y,
                       const size_t block_size_z, command_queue &cq) {
    _cq=cq;
    set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y,
             block_size_z);
  }
  /// Run the kernel in the default command queue
  /** The packed argument buffer is passed through the extra-config
      mechanism (HIP_LAUNCH_PARAM_BUFFER_POINTER / _BUFFER_SIZE / _END)
      instead of a kernelParams array, which is why add_arg() packs values
      into one contiguous, aligned byte buffer. **/
  inline void run() {
    size_t args_size = _hip_kernel_args.size();
    void *config[] = {
      HIP_LAUNCH_PARAM_BUFFER_POINTER, (void*)_hip_kernel_args.data(),
      HIP_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
      HIP_LAUNCH_PARAM_END
    };
    const auto res = hipModuleLaunchKernel(_kernel,_num_blocks[0],_num_blocks[1],
                                  _num_blocks[2],_block_size[0],_block_size[1],
                                  _block_size[2],0,_cq, NULL, config);
    CU_SAFE_CALL(res);
  }
  /// Clear any arguments associated with the kernel
  inline void clear_args() {
    _num_args=0;
    _hip_kernel_args.clear();
  }
  /// Return the default command queue/stream associated with this data
  inline command_queue & cq() { return _cq; }
  /// Change the default command queue associated with matrix
  inline void cq(command_queue &cq_in) { _cq=cq_in; }
  #include "ucl_arg_kludge.h"
 private:
  hipFunction_t _kernel;              // resolved module function handle
  hipStream_t _cq;                    // default stream used by run()
  unsigned _dimensions;               // launch dimensionality (not read by run())
  unsigned _num_blocks[3];            // grid size (x,y,z)
  unsigned _num_args;                 // number of packed arguments
  friend class UCL_Texture;
  unsigned _block_size[3];            // block size (x,y,z)
  std::vector<char> _hip_kernel_args; // packed by-value argument buffer
};
} // namespace
#endif

View File

@ -0,0 +1,83 @@
#ifndef HIP_MACROS_H
#define HIP_MACROS_H
// Error-handling and safe-call macros for the HIP Geryon backend.
// The CU_/NVD_ prefixes are kept from the CUDA backend so shared Geryon
// sources compile unchanged; the calls below are HIP API calls.
#include <cstdio>
#include <cassert>
#include <hip/hip_runtime.h>
//#if CUDA_VERSION >= 3020
#define CUDA_INT_TYPE size_t
//#else
//#define CUDA_INT_TYPE unsigned
//#endif
#ifdef MPI_GERYON
#include "mpi.h"
// Abort path under MPI: take down the whole job unless MPI has already
// been finalized.
#define NVD_GERYON_EXIT do { \
  int is_final; \
  MPI_Finalized(&is_final); \
  if (!is_final) \
    MPI_Abort(MPI_COMM_WORLD,-1); \
  } while(0)
#else
#define NVD_GERYON_EXIT assert(0==1)
#endif
#ifndef UCL_GERYON_EXIT
#define UCL_GERYON_EXIT NVD_GERYON_EXIT
#endif
// UCL_DEBUG turns on per-call synchronization and destructor checking.
#ifdef UCL_DEBUG
#define UCL_SYNC_DEBUG
#define UCL_DESTRUCT_CHECK
#endif
#ifndef UCL_NO_API_CHECK
// Check a HIP call's return code; report file/line and abort on failure.
#define CU_SAFE_CALL_NS( call ) do { \
    hipError_t err = call; \
    if( hipSuccess != err) { \
      fprintf(stderr, "HIP runtime error %d in call at file '%s' in line %i.\n", \
              err, __FILE__, __LINE__ ); \
      NVD_GERYON_EXIT; \
    } } while (0)
#ifdef UCL_SYNC_DEBUG
// Debug variant: also synchronize after the call so errors surface at the
// call that caused them rather than at a later API call.
#define CU_SAFE_CALL( call ) do { \
    CU_SAFE_CALL_NS( call ); \
    hipError_t err=hipCtxSynchronize(); \
    if( hipSuccess != err) { \
      fprintf(stderr, "HIP runtime error %d in file '%s' in line %i.\n", \
              err, __FILE__, __LINE__ ); \
      NVD_GERYON_EXIT; \
    } } while (0)
#else
#define CU_SAFE_CALL( call ) CU_SAFE_CALL_NS( call )
#endif
#else // not DEBUG
// void macros for performance reasons
#define CU_SAFE_CALL_NS( call ) call
#define CU_SAFE_CALL( call) call
#endif
// Destructor-time variants: only checked when UCL_DESTRUCT_CHECK is
// defined; otherwise the return value is deliberately ignored.
#ifdef UCL_DESTRUCT_CHECK
#define CU_DESTRUCT_CALL( call) CU_SAFE_CALL( call)
#define CU_DESTRUCT_CALL_NS( call) CU_SAFE_CALL_NS( call)
#else
#define CU_DESTRUCT_CALL( call) call
#define CU_DESTRUCT_CALL_NS( call) call
#endif
#endif

43
lib/gpu/geryon/hip_mat.h Normal file
View File

@ -0,0 +1,43 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
/*! \file */
#ifndef HIP_MAT_H
#define HIP_MAT_H
#include <hip/hip_runtime.h>
#include "hip_memory.h"
/// Namespace for HIP routines
namespace ucl_hip {
// The _UCL_*/UCL_* macros below gate which sections of the shared generic
// container headers (ucl_*.h) are compiled into this namespace.
#define _UCL_MAT_ALLOW
#define _UCL_DEVICE_PTR_MAT
#include "ucl_basemat.h"
#include "ucl_h_vec.h"
#include "ucl_h_mat.h"
#include "ucl_d_vec.h"
#include "ucl_d_mat.h"
#include "ucl_s_obj_help.h"
#include "ucl_vector.h"
#include "ucl_matrix.h"
#undef _UCL_DEVICE_PTR_MAT
#undef _UCL_MAT_ALLOW
#define UCL_COPY_ALLOW
#include "ucl_copy.h"
#undef UCL_COPY_ALLOW
#define UCL_PRINT_ALLOW
#include "ucl_print.h"
#undef UCL_PRINT_ALLOW
} // namespace ucl_hip
#endif

279
lib/gpu/geryon/hip_memory.h Normal file
View File

@ -0,0 +1,279 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef HIP_MEMORY_H
#define HIP_MEMORY_H
#include <hip/hip_runtime.h>
#include <iostream>
#include <cassert>
#include <cstring>
#include "hip_macros.h"
#include "hip_device.h"
#include "ucl_types.h"
namespace ucl_hip {
// --------------------------------------------------------------------------
// - API Specific Types
// --------------------------------------------------------------------------
//typedef dim3 ucl_kernel_dim;
#ifdef __HIP_PLATFORM_NVCC__
// When HIP targets the NVCC platform, provide the array-format codes used
// by the texture shims (see hip_texture.h); the numeric values mirror the
// CUDA driver's CUarray_format constants.
typedef enum hipArray_Format {
    HIP_AD_FORMAT_UNSIGNED_INT8 = 0x01,
    HIP_AD_FORMAT_UNSIGNED_INT16 = 0x02,
    HIP_AD_FORMAT_UNSIGNED_INT32 = 0x03,
    HIP_AD_FORMAT_SIGNED_INT8 = 0x08,
    HIP_AD_FORMAT_SIGNED_INT16 = 0x09,
    HIP_AD_FORMAT_SIGNED_INT32 = 0x0a,
    HIP_AD_FORMAT_HALF = 0x10,
    HIP_AD_FORMAT_FLOAT = 0x20
}hipArray_Format;
#endif
// --------------------------------------------------------------------------
// - API SPECIFIC DEVICE POINTERS
// --------------------------------------------------------------------------
// Geryon's generic device-pointer alias for this backend.
typedef hipDeviceptr_t device_ptr;
// --------------------------------------------------------------------------
// - HOST MEMORY ALLOCATION ROUTINES
// --------------------------------------------------------------------------
template <class mat_type, class copy_type>
inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
hipError_t err=hipSuccess;
if (kind==UCL_NOT_PINNED)
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
else if (kind==UCL_WRITE_ONLY)
err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocWriteCombined);
else
err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocDefault);
if (err!=hipSuccess || *(mat.host_ptr())==NULL)
return UCL_MEMORY_ERROR;
mat.cq()=cm.cq();
return UCL_SUCCESS;
}
// Allocate n bytes of host storage for mat and inherit the device's default
// command queue. See the copy_type overload for the meaning of kind/kind2.
template <class mat_type>
inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
                       const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
  hipError_t err=hipSuccess;
  if (kind==UCL_NOT_PINNED) {
    *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
  } else {
    const unsigned int alloc_flag = (kind==UCL_WRITE_ONLY) ?
      hipHostMallocWriteCombined : hipHostMallocDefault;
    err=hipHostMalloc((void **)mat.host_ptr(),n,alloc_flag);
  }
  if (err!=hipSuccess || *(mat.host_ptr())==NULL)
    return UCL_MEMORY_ERROR;
  mat.cq()=dev.cq();
  return UCL_SUCCESS;
}
template <class mat_type>
inline void _host_free(mat_type &mat) {
if (mat.kind()==UCL_VIEW)
return;
else if (mat.kind()!=UCL_NOT_PINNED)
CU_DESTRUCT_CALL(hipHostFree(mat.begin()));
else
free(mat.begin());
}
// Resize mat's host storage to n bytes by freeing and reallocating with the
// same pinning mode the container already carries. Contents are NOT kept.
template <class mat_type>
inline int _host_resize(mat_type &mat, const size_t n) {
  _host_free(mat);
  hipError_t err=hipSuccess;
  switch (mat.kind()) {
  case UCL_NOT_PINNED:
    *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
    break;
  case UCL_WRITE_ONLY:
    err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocWriteCombined);
    break;
  default:
    err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocDefault);
  }
  return (err!=hipSuccess || *(mat.host_ptr())==NULL) ?
    UCL_MEMORY_ERROR : UCL_SUCCESS;
}
// --------------------------------------------------------------------------
// - DEVICE MEMORY ALLOCATION ROUTINES
// --------------------------------------------------------------------------
template <class mat_type, class copy_type>
inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n,
const enum UCL_MEMOPT kind) {
hipError_t err=hipMalloc((void**)&mat.cbegin(),n);
if (err!=hipSuccess)
return UCL_MEMORY_ERROR;
mat.cq()=cm.cq();
return UCL_SUCCESS;
}
template <class mat_type>
inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
const enum UCL_MEMOPT kind) {
hipError_t err=hipMalloc((void**)&mat.cbegin(),n);
if (err!=hipSuccess)
return UCL_MEMORY_ERROR;
mat.cq()=dev.cq();
return UCL_SUCCESS;
}
template <class mat_type, class copy_type>
inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t rows,
const size_t cols, size_t &pitch,
const enum UCL_MEMOPT kind) {
hipError_t err;
size_t upitch;
err=hipMallocPitch((void**)&mat.cbegin(),&upitch,
cols*sizeof(typename mat_type::data_type),rows);
pitch=static_cast<size_t>(upitch);
if (err!=hipSuccess)
return UCL_MEMORY_ERROR;
mat.cq()=cm.cq();
return UCL_SUCCESS;
}
// Allocate a pitched 2D device buffer using the device's default command
// queue; the runtime-chosen row pitch is returned in pitch (bytes).
// NOTE: the original declared an extra, non-deducible copy_type template
// parameter, which made this overload unselectable by argument deduction
// (UCL_Device callers silently fell through to the generic copy_type
// overload). Dropping the dead parameter makes this overload reachable and,
// by partial ordering, preferred for UCL_Device; behavior is identical.
template <class mat_type>
inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
                         const size_t cols, size_t &pitch,
                         const enum UCL_MEMOPT kind) {
  hipError_t err;
  size_t upitch;
  err=hipMallocPitch((void**)&mat.cbegin(),&upitch,
                     cols*sizeof(typename mat_type::data_type),rows);
  pitch=static_cast<size_t>(upitch);
  if (err!=hipSuccess)
    return UCL_MEMORY_ERROR;
  mat.cq()=d.cq();
  return UCL_SUCCESS;
}
// Release device memory owned by mat; views own nothing and are skipped.
template <class mat_type>
inline void _device_free(mat_type &mat) {
  if (mat.kind()==UCL_VIEW)
    return;
  CU_DESTRUCT_CALL(hipFree((void*)mat.cbegin()));
}
template <class mat_type>
inline int _device_resize(mat_type &mat, const size_t n) {
_device_free(mat);
hipError_t err=hipMalloc((void**)&mat.cbegin(),n);
if (err!=hipSuccess)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
template <class mat_type>
inline int _device_resize(mat_type &mat, const size_t rows,
const size_t cols, size_t &pitch) {
_device_free(mat);
hipError_t err;
size_t upitch;
err=hipMallocPitch((void**)&mat.cbegin(),&upitch,
cols*sizeof(typename mat_type::data_type),rows);
pitch=static_cast<size_t>(upitch);
if (err!=hipSuccess)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
// View helpers: compute the raw device pointer for a view of an existing
// allocation. The offset-taking overloads interpret offset in elements and
// numsize as the element size in bytes.
inline void _device_view(hipDeviceptr_t *ptr, hipDeviceptr_t &in) {
  *ptr=in;
}
// Raw host-pointer overloads zero the device pointer -- presumably device
// views of plain host pointers are unsupported on this backend; confirm
// against the generic ucl_d_* containers before relying on this.
template <class numtyp>
inline void _device_view(hipDeviceptr_t *ptr, numtyp *in) {
  *ptr=0;
}
inline void _device_view(hipDeviceptr_t *ptr, hipDeviceptr_t &in,
                         const size_t offset, const size_t numsize) {
  *ptr=(hipDeviceptr_t)(((char*)in)+offset*numsize);
}
template <class numtyp>
inline void _device_view(hipDeviceptr_t *ptr, numtyp *in,
                         const size_t offset, const size_t numsize) {
  *ptr=0;
}
// --------------------------------------------------------------------------
// - DEVICE IMAGE ALLOCATION ROUTINES
// --------------------------------------------------------------------------
// Image/texture-array allocations are not implemented for the HIP backend;
// any call aborts via assert.
template <class mat_type, class copy_type>
inline void _device_image_alloc(mat_type &mat, copy_type &cm, const size_t rows,
                                const size_t cols) {
  assert(0==1);
}
template <class mat_type, class copy_type>
inline void _device_image_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
                                const size_t cols) {
  assert(0==1);
}
template <class mat_type>
inline void _device_image_free(mat_type &mat) {
  assert(0==1);
}
// --------------------------------------------------------------------------
// - ZERO ROUTINES
// --------------------------------------------------------------------------
// Zero n bytes of host memory.
inline void _host_zero(void *ptr, const size_t n) {
  memset(ptr,0,n);
}
// Zero n bytes of device memory, asynchronously on the given stream.
template <class mat_type>
inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) {
  CU_SAFE_CALL(hipMemsetAsync((void*)mat.cbegin(),0,n,cq));
}
// --------------------------------------------------------------------------
// - MEMCPY ROUTINES
// --------------------------------------------------------------------------
// Derive the hipMemcpyKind for a copy src -> dst from the containers'
// MEM_TYPE constants (asserted to be 0 or 1; presumably 1 == host-side
// storage -- confirm against the container definitions). With that
// convention, (1-src)*2+(1-dst) maps onto the hipMemcpyKind enum:
// HostToHost(0), HostToDevice(1), DeviceToHost(2), DeviceToDevice(3).
template<class mat1, class mat2>
hipMemcpyKind _memcpy_kind(mat1 &dst, const mat2 &src){
  assert(mat1::MEM_TYPE < 2 && mat2::MEM_TYPE < 2);
  return (hipMemcpyKind)((1 - mat2::MEM_TYPE)*2 + (1 - mat1::MEM_TYPE));
}
// Copy n bytes from src to dst (direction chosen by _memcpy_kind).
// Blocking variant.
template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n) {
  CU_SAFE_CALL(hipMemcpy((void*)dst.begin(), (void*)src.begin(), n, _memcpy_kind(dst, src)));
}
// Asynchronous variant on stream cq.
template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n, hipStream_t &cq) {
  CU_SAFE_CALL(hipMemcpyAsync((void*)dst.begin(), (void*)src.begin(), n, _memcpy_kind(dst, src), cq));
}
// Pitched 2D copy; pitches and cols follow the hipMemcpy2D convention
// (byte widths). Blocking variant.
template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
                       const size_t spitch, const size_t cols,
                       const size_t rows) {
  CU_SAFE_CALL(hipMemcpy2D((void*)dst.begin(), dpitch, (void*)src.begin(), spitch, cols, rows, _memcpy_kind(dst, src)));
}
// Pitched 2D copy, asynchronous on stream cq.
template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
                       const size_t spitch, const size_t cols,
                       const size_t rows,hipStream_t &cq) {
  CU_SAFE_CALL(hipMemcpy2DAsync((void*)dst.begin(), dpitch, (void*)src.begin(), spitch, cols, rows, _memcpy_kind(dst, src), cq));
}
} // namespace ucl_hip
#endif

View File

@ -0,0 +1,113 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef HIP_TEXTURE
#define HIP_TEXTURE
#include <hip/hip_runtime.h>
#include "hip_kernel.h"
#include "hip_mat.h"
namespace ucl_hip {
#ifdef __HIP_PLATFORM_NVCC__
// HIP-on-NVCC shims: forward the texture-reference operations to the CUDA
// driver API and convert CUresult codes to hipError_t.
inline hipError_t hipModuleGetTexRef(CUtexref* texRef, hipModule_t hmod, const char* name){
  return hipCUResultTohipError(cuModuleGetTexRef(texRef, hmod, name));
}
inline hipError_t hipTexRefSetFormat(CUtexref tex, hipArray_Format fmt, int NumPackedComponents) {
  return hipCUResultTohipError(cuTexRefSetFormat(tex, (CUarray_format)fmt, NumPackedComponents ));
}
inline hipError_t hipTexRefSetAddress(size_t* offset, CUtexref tex, hipDeviceptr_t devPtr, size_t size) {
  return hipCUResultTohipError(cuTexRefSetAddress(offset, tex, devPtr, size));
}
#endif
/// Class storing a texture reference
/** On the NVCC platform this wraps a module texture reference. On other
    HIP platforms textures are emulated: get_texture() resolves a
    module-level global variable of the given name, and _bind_float()
    writes the buffer's device pointer into that global so kernels read
    through the pointer instead of a texture fetch. **/
class UCL_Texture {
 public:
  UCL_Texture() {}
  ~UCL_Texture() {}
  /// Construct with a specified texture reference
  inline UCL_Texture(UCL_Program &prog, const char *texture_name)
    { get_texture(prog,texture_name); }
  /// Set the texture reference for this object
  inline void get_texture(UCL_Program &prog, const char *texture_name)
  {
    #ifdef __HIP_PLATFORM_NVCC__
    CU_SAFE_CALL(hipModuleGetTexRef(&_tex, prog._module, texture_name));
    #else
    size_t _global_var_size;
    CU_SAFE_CALL(hipModuleGetGlobal(&_device_ptr_to_global_var, &_global_var_size, prog._module, texture_name));
    #endif
  }
  /// Bind a float array where each fetch grabs a vector of length numel
  template<class numtyp>
  inline void bind_float(UCL_D_Vec<numtyp> &vec, const unsigned numel)
    { _bind_float(vec,numel); }
  /// Bind a float array where each fetch grabs a vector of length numel
  template<class numtyp>
  inline void bind_float(UCL_D_Mat<numtyp> &vec, const unsigned numel)
    { _bind_float(vec,numel); }
  /// Bind a float array where each fetch grabs a vector of length numel
  template<class numtyp, class devtyp>
  inline void bind_float(UCL_Vector<numtyp, devtyp> &vec, const unsigned numel)
    { _bind_float(vec.device,numel); }
  /// Bind a float array where each fetch grabs a vector of length numel
  template<class numtyp, class devtyp>
  inline void bind_float(UCL_Matrix<numtyp, devtyp> &vec, const unsigned numel)
    { _bind_float(vec.device,numel); }
  /// Unbind the texture reference from the memory allocation
  inline void unbind() { }
  /// Make a texture reference available to kernel
  inline void allow(UCL_Kernel &kernel) {
    // no-op on HIP; kept for interface compatibility with other backends
    //#if CUDA_VERSION < 4000
    //CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex));
    //#endif
  }
 private:
  #ifdef __HIP_PLATFORM_NVCC__
  CUtexref _tex;                      // CUDA texture reference (NVCC path)
  #else
  void* _device_ptr_to_global_var;    // module global used to emulate the texture
  #endif
  friend class UCL_Kernel;
  template<class mat_typ>
  inline void _bind_float(mat_typ &vec, const unsigned numel) {
    #ifdef UCL_DEBUG
    assert(numel!=0 && numel<5);
    #endif
    #ifdef __HIP_PLATFORM_NVCC__
    // float data is fetched natively; wider element types (e.g. double) are
    // fetched as pairs of signed 32-bit ints (numel*2 when numel<=2).
    if (vec.element_size()==sizeof(float))
      CU_SAFE_CALL(hipTexRefSetFormat(_tex, HIP_AD_FORMAT_FLOAT, numel));
    else {
      if (numel>2)
        CU_SAFE_CALL(hipTexRefSetFormat(_tex, HIP_AD_FORMAT_SIGNED_INT32, numel));
      else
        CU_SAFE_CALL(hipTexRefSetFormat(_tex,HIP_AD_FORMAT_SIGNED_INT32,numel*2));
    }
    CU_SAFE_CALL(hipTexRefSetAddress(NULL, _tex, vec.cbegin(), vec.numel()*vec.element_size()));
    #else
    // Emulation path: publish the buffer's device address through the
    // module global resolved in get_texture().
    void* data_ptr = (void*)vec.cbegin();
    CU_SAFE_CALL(hipMemcpyHtoD(hipDeviceptr_t(_device_ptr_to_global_var), &data_ptr, sizeof(void*)));
    #endif
  }
};
} // namespace
#endif

107
lib/gpu/geryon/hip_timer.h Normal file
View File

@ -0,0 +1,107 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef HIP_TIMER_H
#define HIP_TIMER_H
#include <hip/hip_runtime.h>
#include "hip_macros.h"
#include "hip_device.h"
namespace ucl_hip {
/// Event-based timer for HIP streams
/** Wraps a start/stop hipEvent_t pair recorded on a command queue and
    accumulates elapsed intervals (in ms) into a running total. **/
class UCL_Timer {
 public:
  inline UCL_Timer() : _total_time(0.0f), _initialized(false) { }
  inline UCL_Timer(UCL_Device &dev) : _total_time(0.0f), _initialized(false)
    { init(dev); }
  inline ~UCL_Timer() { clear(); }
  /// Clear any data associated with timer
  /** \note init() must be called to reuse timer after a clear() **/
  inline void clear() {
    if (!_initialized)
      return;
    CU_DESTRUCT_CALL(hipEventDestroy(start_event));
    CU_DESTRUCT_CALL(hipEventDestroy(stop_event));
    _initialized=false;
    _total_time=0.0;
  }
  /// Initialize default command queue for timing
  inline void init(UCL_Device &dev) { init(dev, dev.cq()); }
  /// Initialize command queue for timing
  inline void init(UCL_Device &dev, command_queue &cq) {
    clear();                       // release any previous event pair first
    _cq=cq;
    _initialized=true;
    CU_SAFE_CALL( hipEventCreateWithFlags(&start_event,0) );
    CU_SAFE_CALL( hipEventCreateWithFlags(&stop_event,0) );
  }
  /// Start timing on command queue
  inline void start() { CU_SAFE_CALL(hipEventRecord(start_event,_cq)); }
  /// Stop timing on command queue
  inline void stop() { CU_SAFE_CALL(hipEventRecord(stop_event,_cq)); }
  /// Block until the start event has been reached on device
  inline void sync_start()
    { CU_SAFE_CALL(hipEventSynchronize(start_event)); }
  /// Block until the stop event has been reached on device
  inline void sync_stop()
    { CU_SAFE_CALL(hipEventSynchronize(stop_event)); }
  /// Set the time elapsed to zero (not the total_time)
  /** Records both events back-to-back so the next time() reads ~0 ms. **/
  inline void zero() {
    CU_SAFE_CALL(hipEventRecord(start_event,_cq));
    CU_SAFE_CALL(hipEventRecord(stop_event,_cq));
  }
  /// Set the total time to zero
  inline void zero_total() { _total_time=0.0; }
  /// Add time from previous start and stop to total
  /** Forces synchronization; returns the added interval in seconds. **/
  inline double add_to_total() {
    const double interval=time();
    _total_time+=interval;
    return interval/1000.0;
  }
  /// Add a user specified time to the total (ms)
  inline void add_time_to_total(const double t) { _total_time+=t; }
  /// Return the time (ms) of last start to stop - Forces synchronization
  inline double time() {
    float ms;
    CU_SAFE_CALL(hipEventSynchronize(stop_event));
    CU_SAFE_CALL( hipEventElapsedTime(&ms,start_event,stop_event) );
    return ms;
  }
  /// Return the time (s) of last start to stop - Forces synchronization
  inline double seconds() { return time()/1000.0; }
  /// Return the total time in ms
  inline double total_time() { return _total_time; }
  /// Return the total time in seconds
  inline double total_seconds() { return _total_time/1000.0; }
 private:
  hipEvent_t start_event, stop_event;  // interval endpoints
  hipStream_t _cq;                     // stream the events are recorded on
  double _total_time;                  // accumulated interval total (ms)
  bool _initialized;                   // true once the event pair exists
};
} // namespace
#endif

View File

@ -36,6 +36,11 @@ using namespace ucl_cudadr;
using namespace ucl_cudart;
#endif
#ifdef UCL_HIP
#include "hip_device.h"
using namespace ucl_hip;
#endif
int main(int argc, char** argv) {
UCL_Device cop;
std::cout << "Found " << cop.num_platforms() << " platform(s).\n";

View File

@ -179,13 +179,15 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
if (_eflag) {
for (int i=0; i<_inum; i++)
evdwl+=engv[i];
if (_ef_atom)
if (_ilist==NULL)
if (_ef_atom) {
if (_ilist==NULL) {
for (int i=0; i<_inum; i++)
eatom[i]+=engv[i];
else
} else {
for (int i=0; i<_inum; i++)
eatom[_ilist[i]]+=engv[i];
}
}
vstart=_inum;
}
if (_vflag) {
@ -193,7 +195,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
for (int j=0; j<6; j++) {
for (int i=vstart; i<iend; i++)
virial[j]+=engv[i];
if (_vf_atom)
if (_vf_atom){
if (_ilist==NULL) {
int ii=0;
for (int i=vstart; i<iend; i++)
@ -203,6 +205,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
for (int i=vstart; i<iend; i++)
vatom[_ilist[ii++]][j]+=engv[i];
}
}
vstart+=_inum;
iend+=_inum;
}
@ -228,7 +231,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
evdwl+=engv[i];
for (int i=_inum; i<iend; i++)
ecoul+=engv[i];
if (_ef_atom)
if (_ef_atom) {
if (_ilist==NULL) {
for (int i=0; i<_inum; i++)
eatom[i]+=engv[i];
@ -240,6 +243,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
for (int i=_inum, ii=0; i<iend; i++)
eatom[_ilist[ii++]]+=engv[i];
}
}
vstart=iend;
iend+=_inum;
}
@ -247,7 +251,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
for (int j=0; j<6; j++) {
for (int i=vstart; i<iend; i++)
virial[j]+=engv[i];
if (_vf_atom)
if (_vf_atom) {
if (_ilist==NULL) {
for (int i=vstart, ii=0; i<iend; i++)
vatom[ii++][j]+=engv[i];
@ -255,6 +259,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
for (int i=vstart, ii=0; i<iend; i++)
vatom[_ilist[ii++]][j]+=engv[i];
}
}
vstart+=_inum;
iend+=_inum;
}

View File

@ -27,6 +27,10 @@ using namespace ucl_opencl;
#include "geryon/nvc_timer.h"
#include "geryon/nvc_mat.h"
using namespace ucl_cudart;
#elif defined(USE_HIP)
#include "geryon/hip_timer.h"
#include "geryon/hip_mat.h"
using namespace ucl_hip;
#else
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"

View File

@ -15,6 +15,11 @@
#include "lal_atom.h"
#ifdef USE_HIP_DEVICE_SORT
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#endif
namespace LAMMPS_AL {
#define AtomT Atom<numtyp,acctyp>
@ -70,6 +75,26 @@ bool AtomT::alloc(const int nall) {
}
#endif
#ifdef USE_HIP_DEVICE_SORT
if (_gpu_nbor==1) {
size_t temp_storage_bytes = 0;
if(hipSuccess != hipcub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, sort_out_keys, sort_out_keys, sort_out_values, sort_out_values, _max_atoms))
return false;
if(sort_out_size < _max_atoms){
if (sort_out_keys ) hipFree(sort_out_keys);
if (sort_out_values) hipFree(sort_out_values);
hipMalloc(&sort_out_keys , _max_atoms * sizeof(unsigned));
hipMalloc(&sort_out_values, _max_atoms * sizeof(int ));
sort_out_size = _max_atoms;
}
if(temp_storage_bytes > sort_temp_storage_size){
if(sort_temp_storage) hipFree(sort_temp_storage);
hipMalloc(&sort_temp_storage, temp_storage_bytes);
sort_temp_storage_size = temp_storage_bytes;
}
}
#endif
// --------------------------- Device allocations
int gpu_bytes=0;
success=success && (x.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
@ -184,6 +209,27 @@ bool AtomT::add_fields(const bool charge, const bool rot,
return false;
}
#endif
#ifdef USE_HIP_DEVICE_SORT
if (_gpu_nbor==1) {
size_t temp_storage_bytes = 0;
if(hipSuccess != hipcub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, sort_out_keys, sort_out_keys, sort_out_values, sort_out_values, _max_atoms))
return false;
if(sort_out_size < _max_atoms){
if (sort_out_keys ) hipFree(sort_out_keys);
if (sort_out_values) hipFree(sort_out_values);
hipMalloc(&sort_out_keys , _max_atoms * sizeof(unsigned));
hipMalloc(&sort_out_values, _max_atoms * sizeof(int ));
sort_out_size = _max_atoms;
}
if(temp_storage_bytes > sort_temp_storage_size){
if(sort_temp_storage) hipFree(sort_temp_storage);
hipMalloc(&sort_temp_storage, temp_storage_bytes);
sort_temp_storage_size = temp_storage_bytes;
}
}
#endif
success=success && (dev_particle_id.alloc(_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=dev_particle_id.row_bytes();
@ -275,6 +321,19 @@ void AtomT::clear_resize() {
if (_gpu_nbor==1) cudppDestroyPlan(sort_plan);
#endif
#ifdef USE_HIP_DEVICE_SORT
if (_gpu_nbor==1) {
if(sort_out_keys) hipFree(sort_out_keys);
if(sort_out_values) hipFree(sort_out_values);
if(sort_temp_storage) hipFree(sort_temp_storage);
sort_out_keys = nullptr;
sort_out_values = nullptr;
sort_temp_storage = nullptr;
sort_temp_storage_size = 0;
sort_out_size = 0;
}
#endif
if (_gpu_nbor==2) {
host_particle_id.clear();
host_cell_id.clear();
@ -326,6 +385,22 @@ void AtomT::sort_neighbor(const int num_atoms) {
UCL_GERYON_EXIT;
}
#endif
#ifdef USE_HIP_DEVICE_SORT
if(sort_out_size < num_atoms){
printf("AtomT::sort_neighbor: invalid temp buffer size\n");
UCL_GERYON_EXIT;
}
if(hipSuccess != hipcub::DeviceRadixSort::SortPairs(sort_temp_storage, sort_temp_storage_size, (unsigned *)dev_cell_id.begin(), sort_out_keys, (int *)dev_particle_id.begin(), sort_out_values, num_atoms)){
printf("AtomT::sort_neighbor: DeviceRadixSort error\n");
UCL_GERYON_EXIT;
}
if(hipSuccess != hipMemcpy((unsigned *)dev_cell_id.begin(), sort_out_keys , num_atoms*sizeof(unsigned), hipMemcpyDeviceToDevice) ||
hipSuccess != hipMemcpy((int *) dev_particle_id.begin(), sort_out_values, num_atoms*sizeof(int ), hipMemcpyDeviceToDevice)){
printf("AtomT::sort_neighbor: copy output error\n");
UCL_GERYON_EXIT;
}
#endif
}
#ifdef GPU_CAST

View File

@ -13,7 +13,7 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_preprocessor.h"
#endif

View File

@ -29,6 +29,11 @@ using namespace ucl_opencl;
#include "geryon/nvc_mat.h"
#include "geryon/nvc_kernel.h"
using namespace ucl_cudart;
#elif defined(USE_HIP)
#include "geryon/hip_timer.h"
#include "geryon/hip_mat.h"
#include "geryon/hip_kernel.h"
using namespace ucl_hip;
#else
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
@ -477,6 +482,14 @@ class Atom {
CUDPPConfiguration sort_config;
CUDPPHandle sort_plan;
#endif
#ifdef USE_HIP_DEVICE_SORT
unsigned* sort_out_keys = nullptr;
int* sort_out_values = nullptr;
void* sort_temp_storage = nullptr;
size_t sort_temp_storage_size = 0;
size_t sort_out_size = 0;
#endif
};
}

View File

@ -13,7 +13,7 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_preprocessor.h"
#endif

View File

@ -24,6 +24,8 @@
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif

View File

@ -25,6 +25,8 @@
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif

View File

@ -23,6 +23,8 @@
#ifdef USE_OPENCL
#include "geryon/ocl_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif

View File

@ -23,6 +23,8 @@
#ifdef USE_OPENCL
#include "geryon/ocl_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif

View File

@ -24,6 +24,8 @@
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif

View File

@ -24,6 +24,8 @@
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif

View File

@ -13,12 +13,12 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,12 +13,12 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,15 +13,15 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,15 +13,16 @@
// email : ndactrung@gmail.com
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,15 +13,15 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,15 +13,15 @@
// email : ndactrung@gmail.com
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,12 +13,12 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,15 +13,15 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,15 +13,15 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,15 +13,15 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,12 +13,12 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,15 +13,15 @@
// email : ndtrung@umich.edu
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,15 +13,15 @@
// email : ndtrung@umich.edu
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,15 +13,15 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,15 +13,15 @@
// email : a.kohlmeyer@temple.edu
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,15 +13,15 @@
// email : ndactrung@gmail.com
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -268,7 +268,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
gpu_nbor=1;
else if (_gpu_mode==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
gpu_nbor=2;
#ifndef USE_CUDPP
#if !defined(USE_CUDPP) && !defined(USE_HIP_DEVICE_SORT)
if (gpu_nbor==1)
gpu_nbor=2;
#endif
@ -341,7 +341,7 @@ int DeviceT::init_nbor(Neighbor *nbor, const int nlocal,
gpu_nbor=1;
else if (_gpu_mode==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
gpu_nbor=2;
#ifndef USE_CUDPP
#if !defined(USE_CUDPP) && !defined(USE_HIP_DEVICE_SORT)
if (gpu_nbor==1)
gpu_nbor=2;
#endif
@ -712,7 +712,7 @@ int DeviceT::compile_kernels() {
gpu_lib_data.update_host(false);
_ptx_arch=static_cast<double>(gpu_lib_data[0])/100.0;
#ifndef USE_OPENCL
#if !(defined(USE_OPENCL) || defined(USE_HIP))
if (_ptx_arch>gpu->arch() || floor(_ptx_arch)<floor(gpu->arch()))
return -4;
#endif

View File

@ -13,7 +13,7 @@
// email : brownw@ornl.gov
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_preprocessor.h"
#endif

View File

@ -13,16 +13,16 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
texture<float4> mu_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
_texture( mu_tex,float4);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
texture<int4,1> mu_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
_texture_2d( mu_tex,int4);
#endif
#else

View File

@ -13,17 +13,17 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
texture<float4> mu_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
_texture( mu_tex,float4);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
texture<int4,1> mu_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
_texture_2d( mu_tex,int4);
#endif
#else

View File

@ -13,16 +13,16 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
texture<float4> mu_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
_texture( mu_tex,float4);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
texture<int4,1> mu_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
_texture_2d( mu_tex,int4);
#endif
#else

View File

@ -13,14 +13,14 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float4> vel_tex;
_texture( pos_tex,float4);
_texture( vel_tex,float4);
#else
texture<int4,1> pos_tex;
texture<int4,1> vel_tex;
_texture_2d( pos_tex,int4);
_texture_2d( vel_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,27 +13,27 @@
// email : brownw@ornl.gov nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> fp_tex;
texture<float4> rhor_sp1_tex;
texture<float4> rhor_sp2_tex;
texture<float4> frho_sp1_tex;
texture<float4> frho_sp2_tex;
texture<float4> z2r_sp1_tex;
texture<float4> z2r_sp2_tex;
_texture( pos_tex,float4);
_texture( fp_tex,float);
_texture( rhor_sp1_tex,float4);
_texture( rhor_sp2_tex,float4);
_texture( frho_sp1_tex,float4);
_texture( frho_sp2_tex,float4);
_texture( z2r_sp1_tex,float4);
_texture( z2r_sp2_tex,float4);
#else
texture<int4> pos_tex;
texture<int2> fp_tex;
texture<int4> rhor_sp1_tex;
texture<int4> rhor_sp2_tex;
texture<int4> frho_sp1_tex;
texture<int4> frho_sp2_tex;
texture<int4> z2r_sp1_tex;
texture<int4> z2r_sp2_tex;
_texture( pos_tex,int4);
_texture( fp_tex,int2);
_texture( rhor_sp1_tex,int4);
_texture( rhor_sp2_tex,int4);
_texture( frho_sp1_tex,int4);
_texture( frho_sp2_tex,int4);
_texture( z2r_sp1_tex,int4);
_texture( z2r_sp2_tex,int4);
#endif
#else

View File

@ -18,12 +18,14 @@
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex, quat_tex;
_texture( pos_tex, float4);
_texture( quat_tex,float4);
#else
texture<int4,1> pos_tex, quat_tex;
_texture_2d( pos_tex,int4);
_texture_2d( quat_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,12 +13,12 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_preprocessor.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,12 +13,12 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,7 +13,7 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_ellipsoid_extra.h"
#endif

View File

@ -13,7 +13,7 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_ellipsoid_extra.h"
#endif

View File

@ -13,12 +13,12 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,12 +13,12 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,15 +13,15 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,15 +13,15 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,15 +13,15 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,15 +13,15 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,19 +13,19 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
texture<float> gcons_tex;
texture<float> dgcons_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
_texture( gcons_tex,float);
_texture( dgcons_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
texture<int2> gcons_tex;
texture<int2> dgcons_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
_texture( gcons_tex,int2);
_texture( dgcons_tex,int2);
#endif
#else

View File

@ -13,12 +13,12 @@
// email : ndactrung@gmail.com
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,15 +13,15 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,13 +13,13 @@
// email : ibains@nvidia.com
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else

View File

@ -13,15 +13,15 @@
// email : ndactrung@gmail.com
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,13 +13,13 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else

View File

@ -13,12 +13,12 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,15 +13,15 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -23,7 +23,7 @@ const char *lj_tip4p=0;
#include "lal_lj_tip4p_long.h"
#include <cassert>
using namespace LAMMPS_AL;
namespace LAMMPS_AL {
#define LJTIP4PLongT LJ_TIP4PLong<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
@ -381,6 +381,5 @@ int** LJTIP4PLongT::compute(const int ago, const int inum_full,
}
template class LJ_TIP4PLong<PRECISION,ACC_PRECISION>;
}

View File

@ -13,7 +13,7 @@
// email : thevsevak@gmail.com
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifdef LAMMPS_SMALLBIG
@ -27,11 +27,11 @@
#define tagint int
#endif
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -13,12 +13,12 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,13 +13,13 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else

View File

@ -13,7 +13,7 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_preprocessor.h"
#endif

View File

@ -14,7 +14,7 @@
// email : penwang@nvidia.com, brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_preprocessor.h"
#ifdef LAMMPS_SMALLBIG
#define tagint int
@ -27,9 +27,9 @@
#define tagint int
#endif
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
__kernel void calc_cell_id(const numtyp4 *restrict pos,

View File

@ -24,6 +24,10 @@ using namespace ucl_opencl;
#include "geryon/nvc_kernel.h"
#include "geryon/nvc_texture.h"
using namespace ucl_cudart;
#elif defined(USE_HIP)
#include "geryon/hip_kernel.h"
#include "geryon/hip_texture.h"
using namespace ucl_hip;
#else
#include "geryon/nvd_kernel.h"
#include "geryon/nvd_texture.h"

View File

@ -13,15 +13,15 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_preprocessor.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
// Allow PPPM to compile without atomics for NVIDIA 1.0 cards, error

View File

@ -23,6 +23,8 @@
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif

View File

@ -24,9 +24,11 @@ struct _lgpu_int2 {
int x; int y;
};
#ifndef USE_HIP
#ifndef int2
#define int2 _lgpu_int2
#endif
#endif
struct _lgpu_float2 {
float x; float y;

View File

@ -1,4 +1,4 @@
// **************************************************************************
// **************************************************************************
// preprocessor.cu
// -------------------
// W. Michael Brown (ORNL)
@ -60,6 +60,150 @@
//
//*************************************************************************/
#define _texture(name, type) texture<type> name
#define _texture_2d(name, type) texture<type,1> name
// -------------------------------------------------------------------------
// HIP DEFINITIONS
// -------------------------------------------------------------------------
#ifdef USE_HIP
#include <hip/hip_runtime.h>
#ifdef __HIP_PLATFORM_HCC__
#define mul24(x, y) __mul24(x, y)
#undef _texture
#undef _texture_2d
#define _texture(name, type) __device__ type* name
#define _texture_2d(name, type) __device__ type* name
#endif
#define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x)
#define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y)
#define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x);
#define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y);
#define THREAD_ID_X threadIdx.x
#define THREAD_ID_Y threadIdx.y
#define BLOCK_ID_X blockIdx.x
#define BLOCK_ID_Y blockIdx.y
#define BLOCK_SIZE_X blockDim.x
#define BLOCK_SIZE_Y blockDim.y
#define __kernel extern "C" __global__
#ifdef __local
#undef __local
#endif
#define __local __shared__
#define __global
#define restrict __restrict__
#define atom_add atomicAdd
#define ucl_inline static __inline__ __device__
#define THREADS_PER_ATOM 4
#define THREADS_PER_CHARGE 8
#define BLOCK_NBOR_BUILD 128
#define BLOCK_PAIR 256
#define BLOCK_BIO_PAIR 256
#define BLOCK_ELLIPSE 128
#define MAX_SHARED_TYPES 11
#ifdef _SINGLE_SINGLE
ucl_inline double shfl_xor(double var, int laneMask, int width) {
#ifdef __HIP_PLATFORM_HCC__
return __shfl_xor(var, laneMask, width);
#else
return __shfl_xor_sync(0xffffffff, var, laneMask, width);
#endif
}
#else
ucl_inline double shfl_xor(double var, int laneMask, int width) {
int2 tmp;
tmp.x = __double2hiint(var);
tmp.y = __double2loint(var);
#ifdef __HIP_PLATFORM_HCC__
tmp.x = __shfl_xor(tmp.x,laneMask,width);
tmp.y = __shfl_xor(tmp.y,laneMask,width);
#else
tmp.x = __shfl_xor_sync(0xffffffff, tmp.x,laneMask,width);
tmp.y = __shfl_xor_sync(0xffffffff, tmp.y,laneMask,width);
#endif
return __hiloint2double(tmp.x,tmp.y);
}
#endif
#ifdef __HIP_PLATFORM_HCC__
#define ARCH 600
#define WARP_SIZE 64
#endif
#ifdef __HIP_PLATFORM_NVCC__
#define ARCH __CUDA_ARCH__
#define WARP_SIZE 32
#endif
#define fast_mul(X,Y) (X)*(Y)
#define MEM_THREADS WARP_SIZE
#define PPPM_BLOCK_1D 64
#define BLOCK_CELL_2D 8
#define BLOCK_CELL_ID 128
#define MAX_BIO_SHARED_TYPES 128
#ifdef __HIP_PLATFORM_NVCC__
#ifdef _DOUBLE_DOUBLE
#define fetch4(ans,i,pos_tex) { \
int4 xy = tex1Dfetch(pos_tex,i*2); \
int4 zt = tex1Dfetch(pos_tex,i*2+1); \
ans.x=__hiloint2double(xy.y, xy.x); \
ans.y=__hiloint2double(xy.w, xy.z); \
ans.z=__hiloint2double(zt.y, zt.x); \
ans.w=__hiloint2double(zt.w, zt.z); \
}
#define fetch(ans,i,q_tex) { \
int2 qt = tex1Dfetch(q_tex,i); \
ans=__hiloint2double(qt.y, qt.x); \
}
#else
#define fetch4(ans,i,pos_tex) ans=tex1Dfetch(pos_tex, i);
#define fetch(ans,i,q_tex) ans=tex1Dfetch(q_tex,i);
#endif
#else
#ifdef _DOUBLE_DOUBLE
#define fetch4(ans,i,pos_tex) (ans=*(((double4*)pos_tex) + i))
#define fetch(ans,i,q_tex) (ans=*(((double *) q_tex) + i))
#else
#define fetch4(ans,i,pos_tex) (ans=*(((float4*)pos_tex) + i))
#define fetch(ans,i,q_tex) (ans=*(((float *) q_tex) + i))
#endif
#endif
#ifdef _DOUBLE_DOUBLE
#define ucl_exp exp
#define ucl_powr pow
#define ucl_atan atan
#define ucl_cbrt cbrt
#define ucl_ceil ceil
#define ucl_abs fabs
#define ucl_rsqrt rsqrt
#define ucl_sqrt sqrt
#define ucl_recip(x) ((numtyp)1.0/(x))
#else
#define ucl_atan atanf
#define ucl_cbrt cbrtf
#define ucl_ceil ceilf
#define ucl_abs fabsf
#define ucl_recip(x) ((numtyp)1.0/(x))
#define ucl_rsqrt rsqrtf
#define ucl_sqrt sqrtf
#ifdef NO_HARDWARE_TRANSCENDENTALS
#define ucl_exp expf
#define ucl_powr powf
#else
#define ucl_exp __expf
#define ucl_powr __powf
#endif
#endif
#endif
// -------------------------------------------------------------------------
// CUDA DEFINITIONS
// -------------------------------------------------------------------------

View File

@ -13,7 +13,7 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_ellipsoid_extra.h"
#endif

View File

@ -13,7 +13,7 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_ellipsoid_extra.h"
#endif

View File

@ -13,12 +13,12 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,19 +13,19 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float4> sw1_tex;
texture<float4> sw2_tex;
texture<float4> sw3_tex;
_texture( pos_tex,float4);
_texture( sw1_tex,float4);
_texture( sw2_tex,float4);
_texture( sw3_tex,float4);
#else
texture<int4,1> pos_tex;
texture<int4> sw1_tex;
texture<int4> sw2_tex;
texture<int4> sw3_tex;
_texture_2d( pos_tex,int4);
_texture( sw1_tex,int4);
_texture( sw2_tex,int4);
_texture( sw3_tex,int4);
#endif
#else

View File

@ -13,12 +13,12 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,23 +13,23 @@
// email : ndactrung@gmail.com
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_tersoff_extra.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float4> ts1_tex;
texture<float4> ts2_tex;
texture<float4> ts3_tex;
texture<float4> ts4_tex;
texture<float4> ts5_tex;
_texture( pos_tex,float4);
_texture( ts1_tex,float4);
_texture( ts2_tex,float4);
_texture( ts3_tex,float4);
_texture( ts4_tex,float4);
_texture( ts5_tex,float4);
#else
texture<int4,1> pos_tex;
texture<int4> ts1_tex;
texture<int4> ts2_tex;
texture<int4> ts3_tex;
texture<int4> ts4_tex;
texture<int4> ts5_tex;
_texture_2d( pos_tex,int4);
_texture( ts1_tex,int4);
_texture( ts2_tex,int4);
_texture( ts3_tex,int4);
_texture( ts4_tex,int4);
_texture( ts5_tex,int4);
#endif
#else

View File

@ -16,7 +16,7 @@
#ifndef LAL_TERSOFF_EXTRA_H
#define LAL_TERSOFF_EXTRA_H
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#else
#endif

View File

@ -13,23 +13,23 @@
// email : ndactrung@gmail.com
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_tersoff_mod_extra.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float4> ts1_tex;
texture<float4> ts2_tex;
texture<float4> ts3_tex;
texture<float4> ts4_tex;
texture<float4> ts5_tex;
_texture( pos_tex,float4);
_texture( ts1_tex,float4);
_texture( ts2_tex,float4);
_texture( ts3_tex,float4);
_texture( ts4_tex,float4);
_texture( ts5_tex,float4);
#else
texture<int4,1> pos_tex;
texture<int4> ts1_tex;
texture<int4> ts2_tex;
texture<int4> ts3_tex;
texture<int4> ts4_tex;
texture<int4> ts5_tex;
_texture_2d( pos_tex,int4);
_texture( ts1_tex,int4);
_texture( ts2_tex,int4);
_texture( ts3_tex,int4);
_texture( ts4_tex,int4);
_texture( ts5_tex,int4);
#endif
#else

View File

@ -16,7 +16,7 @@
#ifndef LAL_TERSOFF_MOD_EXTRA_H
#define LAL_TERSOFF_MOD_EXTRA_H
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#else
#endif

View File

@ -13,25 +13,25 @@
// email : ndactrung@gmail.com
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_tersoff_zbl_extra.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float4> ts1_tex;
texture<float4> ts2_tex;
texture<float4> ts3_tex;
texture<float4> ts4_tex;
texture<float4> ts5_tex;
texture<float4> ts6_tex;
_texture( pos_tex,float4);
_texture( ts1_tex,float4);
_texture( ts2_tex,float4);
_texture( ts3_tex,float4);
_texture( ts4_tex,float4);
_texture( ts5_tex,float4);
_texture( ts6_tex,float4);
#else
texture<int4,1> pos_tex;
texture<int4> ts1_tex;
texture<int4> ts2_tex;
texture<int4> ts3_tex;
texture<int4> ts4_tex;
texture<int4> ts5_tex;
texture<int4> ts6_tex;
_texture_2d( pos_tex,int4);
_texture( ts1_tex,int4);
_texture( ts2_tex,int4);
_texture( ts3_tex,int4);
_texture( ts4_tex,int4);
_texture( ts5_tex,int4);
_texture( ts6_tex,int4);
#endif
#else

View File

@ -16,7 +16,7 @@
#ifndef LAL_TERSOFF_ZBL_EXTRA_H
#define LAL_TERSOFF_ZBL_EXTRA_H
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#else
#endif

View File

@ -15,12 +15,12 @@
dekoning@ifi.unicamp.br
***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,23 +13,23 @@
// email : andershaf@gmail.com
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float4> param1_tex;
texture<float4> param2_tex;
texture<float4> param3_tex;
texture<float4> param4_tex;
texture<float4> param5_tex;
_texture( pos_tex,float4);
_texture( param1_tex,float4);
_texture( param2_tex,float4);
_texture( param3_tex,float4);
_texture( param4_tex,float4);
_texture( param5_tex,float4);
#else
texture<int4,1> pos_tex;
texture<int4> param1_tex;
texture<int4> param2_tex;
texture<int4> param3_tex;
texture<int4> param4_tex;
texture<int4> param5_tex;
_texture_2d( pos_tex,int4);
_texture( param1_tex,int4);
_texture( param2_tex,int4);
_texture( param3_tex,int4);
_texture( param4_tex,int4);
_texture( param5_tex,int4);
#endif
#else

View File

@ -13,12 +13,12 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -13,15 +13,15 @@
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> rad_tex;
_texture( pos_tex,float4);
_texture( rad_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> rad_tex;
_texture_2d( pos_tex,int4);
_texture( rad_tex,int2);
#endif
#else

View File

@ -13,12 +13,12 @@
// email : ndactrung@gmail.com
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

Some files were not shown because too many files have changed in this diff Show More