Merge 'gpu_hip_port' into master

2020-01-28 20:09:40 +03:00 · 2020-01-28 20:09:40 +03:00 · 66c5fa2abd
parent 5eef3b1828
commit 66c5fa2abd
91 changed files with 2290 additions and 312 deletions
--- a/lib/gpu/Makefile.hip
+++ b/lib/gpu/Makefile.hip
@ -0,0 +1,148 @@
+# /* ----------------------------------------------------------------------   
+#  Generic Linux Makefile for HIP
+#     - export HIP_PLATFORM=hcc (or nvcc) before execution
+#     - change HIP_ARCH for your GPU
+# ------------------------------------------------------------------------- */
+
+# this setting should match LAMMPS Makefile
+# one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL
+
+LMP_INC = -DLAMMPS_SMALLBIG
+
+# precision for GPU calculations
+# -D_SINGLE_SINGLE  # Single precision for all calculations
+# -D_DOUBLE_DOUBLE  # Double precision for all calculations
+# -D_SINGLE_DOUBLE  # Accumulation of forces, etc. in double
+
+HIP_PRECISION = -D_SINGLE_DOUBLE
+
+HIP_OPTS = -O3 
+HIP_HOST_OPTS = -Wno-deprecated-declarations
+HIP_HOST_INCLUDE = 
+
+# use device sort 
+# requires linking with hipcc and hipCUB + (rocPRIM or CUB for AMD or Nvidia respectively)
+HIP_HOST_OPTS += -DUSE_HIP_DEVICE_SORT 
+# path to cub
+HIP_HOST_INCLUDE += -I./
+# path to hipcub
+HIP_HOST_INCLUDE += -I$(HIP_PATH)/../include
+
+# use mpi
+HIP_HOST_OPTS += -DMPI_GERYON -DUCL_NO_EXIT
+# this settings should match LAMMPS Makefile
+MPI_COMP_OPTS = $(shell mpicxx --showme:compile)
+MPI_LINK_OPTS = $(shell mpicxx --showme:link)
+#MPI_COMP_OPTS += -I/usr/include/mpi  -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1
+
+HIP_PATH ?= $(wildcard /opt/rocm/hip)
+HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --compiler)
+
+ifeq (hcc,$(HIP_PLATFORM))
+	HIP_OPTS  += -ffast-math
+	# possible values: gfx803,gfx900,gfx906
+	HIP_ARCH = gfx906
+else ifeq (nvcc,$(HIP_PLATFORM))
+	HIP_OPTS  += --use_fast_math
+	HIP_ARCH = -gencode arch=compute_30,code=[sm_30,compute_30] -gencode arch=compute_32,code=[sm_32,compute_32] -gencode arch=compute_35,code=[sm_35,compute_35] \
+		    -gencode arch=compute_50,code=[sm_50,compute_50] -gencode arch=compute_52,code=[sm_52,compute_52] -gencode arch=compute_53,code=[sm_53,compute_53]\
+			-gencode arch=compute_60,code=[sm_60,compute_60] -gencode arch=compute_61,code=[sm_61,compute_61] -gencode arch=compute_62,code=[sm_62,compute_62]\
+			-gencode arch=compute_70,code=[sm_70,compute_70] -gencode arch=compute_72,code=[sm_72,compute_72] -gencode arch=compute_75,code=[sm_75,compute_75] 
+else
+	$(error Specify HIP platform using 'export HIP_PLATFORM=(hcc,nvcc)')
+endif
+
+BIN_DIR = .
+OBJ_DIR = ./obj
+LIB_DIR = .
+AR = ar
+BSH = /bin/sh
+
+
+# /* ----------------------------------------------------------------------   
+#  				don't change section below without need			
+# ------------------------------------------------------------------------- */
+
+HIP_OPTS += -DUSE_HIP $(HIP_PRECISION)
+HIP_GPU_OPTS += $(HIP_OPTS) -I./
+
+ifeq (hcc,$(HIP_PLATFORM))
+	HIP_HOST_OPTS += -fPIC
+	HIP_GPU_CC  = $(HIP_PATH)/bin/hipcc --genco
+	HIP_GPU_OPTS_S = -t="$(HIP_ARCH)" -f=\" 
+	HIP_GPU_OPTS_E = \"
+	HIP_KERNEL_SUFFIX = .cpp
+	HIP_LIBS_TARGET = export HCC_AMDGPU_TARGET := $(HIP_ARCH)
+	export HCC_AMDGPU_TARGET := $(HIP_ARCH)
+else ifeq (nvcc,$(HIP_PLATFORM))
+	HIP_GPU_CC  = $(HIP_PATH)/bin/hipcc --fatbin 
+	HIP_GPU_OPTS += $(HIP_ARCH)
+	HIP_GPU_SORT_ARCH = $(HIP_ARCH)
+	# fix nvcc can't handle -pthread flag
+	MPI_COMP_OPTS := $(subst -pthread,-Xcompiler -pthread,$(MPI_COMP_OPTS))
+	MPI_LINK_OPTS := $(subst -pthread,-Xcompiler -pthread,$(MPI_LINK_OPTS))
+endif
+
+# hipcc is essential for device sort, because of hipcub is header only library and ROCm gpu code generation is deferred to the linking stage
+HIP_HOST_CC = $(HIP_PATH)/bin/hipcc
+HIP_HOST_OPTS += $(HIP_OPTS) $(MPI_COMP_OPTS) $(LMP_INC)
+HIP_HOST_CC_CMD  = $(HIP_HOST_CC) $(HIP_HOST_OPTS) $(HIP_HOST_INCLUDE)
+
+# sources
+
+ALL_H  =  $(wildcard ./geryon/ucl*.h) $(wildcard ./geryon/hip*.h) $(wildcard ./lal_*.h)
+SRCS := $(wildcard ./lal_*.cpp)
+OBJS := $(subst ./,$(OBJ_DIR)/,$(SRCS:%.cpp=%.o))
+CUS  := $(wildcard lal_*.cu)
+CUHS := $(filter-out pppm_cubin.h, $(CUS:lal_%.cu=%_cubin.h)) pppm_f_cubin.h pppm_d_cubin.h
+CUHS := $(addprefix $(OBJ_DIR)/, $(CUHS))
+
+all: $(OBJ_DIR) $(CUHS) $(LIB_DIR)/libgpu.a $(BIN_DIR)/hip_get_devices
+
+$(OBJ_DIR):
+	mkdir -p $@    
+
+# GPU kernels compilation 
+
+$(OBJ_DIR)/pppm_f_cubin.h: lal_pppm.cu  $(ALL_H)
+	@cp $< $(OBJ_DIR)/temp_pppm_f.cu$(HIP_KERNEL_SUFFIX)
+	$(HIP_GPU_CC) $(HIP_GPU_OPTS_S) $(HIP_GPU_OPTS) -Dgrdtyp=float  -Dgrdtyp4=float4 $(HIP_GPU_OPTS_E)  -o $(OBJ_DIR)/pppm_f.cubin $(OBJ_DIR)/temp_pppm_f.cu$(HIP_KERNEL_SUFFIX)
+	@xxd -i $(OBJ_DIR)/pppm_f.cubin $@
+	@sed -i "s/[a-zA-Z0-9_]*pppm_f_cubin/pppm_f/g" $@
+	@rm $(OBJ_DIR)/temp_pppm_f.cu$(HIP_KERNEL_SUFFIX) $(OBJ_DIR)/pppm_f.cubin
+
+$(OBJ_DIR)/pppm_d_cubin.h: lal_pppm.cu  $(ALL_H)
+	@cp $< $(OBJ_DIR)/temp_pppm_d.cu$(HIP_KERNEL_SUFFIX)
+	$(HIP_GPU_CC) $(HIP_GPU_OPTS_S) $(HIP_GPU_OPTS) -Dgrdtyp=double -Dgrdtyp4=double4 $(HIP_GPU_OPTS_E)  -o $(OBJ_DIR)/pppm_d.cubin $(OBJ_DIR)/temp_pppm_d.cu$(HIP_KERNEL_SUFFIX)
+	@xxd -i $(OBJ_DIR)/pppm_d.cubin $@
+	@sed -i "s/[a-zA-Z0-9_]*pppm_d_cubin/pppm_d/g" $@
+	@rm $(OBJ_DIR)/temp_pppm_d.cu$(HIP_KERNEL_SUFFIX) $(OBJ_DIR)/pppm_d.cubin
+
+$(OBJ_DIR)/%_cubin.h: lal_%.cu  $(ALL_H)
+	@cp $< $(OBJ_DIR)/temp_$*.cu$(HIP_KERNEL_SUFFIX)
+	$(HIP_GPU_CC) $(HIP_GPU_OPTS_S) $(HIP_GPU_OPTS) $(HIP_GPU_OPTS_E)  -o $(OBJ_DIR)/$*.cubin $(OBJ_DIR)/temp_$*.cu$(HIP_KERNEL_SUFFIX)
+	@xxd -i $(OBJ_DIR)/$*.cubin $@
+	@sed -i "s/[a-zA-Z0-9_]*$*_cubin/$*/g" $@
+	@rm $(OBJ_DIR)/temp_$*.cu$(HIP_KERNEL_SUFFIX) $(OBJ_DIR)/$*.cubin
+
+# host sources compilation
+
+$(OBJ_DIR)/lal_atom.o: lal_atom.cpp $(CUHS) $(ALL_H)
+	$(HIP_HOST_CC_CMD) -o $@ -c $< -I$(OBJ_DIR) $(HIP_GPU_SORT_ARCH) 
+
+$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(ALL_H)
+	$(HIP_HOST_CC_CMD) -o $@ -c $< -I$(OBJ_DIR)
+
+# libgpu building
+
+$(LIB_DIR)/libgpu.a: $(OBJS)
+	$(AR) -crs $@ $(OBJS)
+	echo "export HIP_PLATFORM := $(HIP_PLATFORM)\n$(HIP_LIBS_TARGET)" > 'Makefile.lammps'   
+
+# test app building
+
+$(BIN_DIR)/hip_get_devices: ./geryon/ucl_get_devices.cpp $(ALL_H)
+	$(HIP_HOST_CC_CMD) -o $@ $< -DUCL_HIP $(MPI_LINK_OPTS)
+
+clean:
+	-rm -f $(BIN_DIR)/hip_get_devices $(LIB_DIR)/libgpu.a $(OBJS) $(OBJ_DIR)/temp_* $(CUHS)
--- a/lib/gpu/geryon/hip_device.h
+++ b/lib/gpu/geryon/hip_device.h
@ -0,0 +1,519 @@
+/* -----------------------------------------------------------------------
+   Copyright (2009) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the Simplified BSD License.
+   ----------------------------------------------------------------------- */
+
+#ifndef HIP_DEVICE
+#define HIP_DEVICE
+
+
+#include <hip/hip_runtime.h>
+#include <unordered_map>
+#include <string>
+#include <vector>
+#include <iostream>
+#include "hip_macros.h"
+#include "ucl_types.h"
+
+namespace ucl_hip {
+
+// --------------------------------------------------------------------------
+// - COMMAND QUEUE STUFF
+// --------------------------------------------------------------------------
+typedef hipStream_t command_queue;
+
+inline void ucl_sync(hipStream_t &stream) {
+  CU_SAFE_CALL(hipStreamSynchronize(stream));
+}
+
+struct NVDProperties {
+  int device_id;
+  std::string name;
+  int major;
+  int minor;
+  CUDA_INT_TYPE totalGlobalMem;
+  int multiProcessorCount;
+
+  int maxThreadsPerBlock;
+  int maxThreadsDim[3];
+  int maxGridSize[3];
+  int sharedMemPerBlock;
+  int totalConstantMemory;
+  int SIMDWidth;
+  int memPitch;
+  int regsPerBlock;
+  int clockRate;
+  int textureAlign;
+
+  int kernelExecTimeoutEnabled;
+  int integrated;
+  int canMapHostMemory;
+  int concurrentKernels;
+  int ECCEnabled;
+  int computeMode;
+};
+
+/// Class for looking at device properties
+/** \note Calls to change the device outside of the class results in incorrect
+  *       behavior
+  * \note There is no error checking for indexing past the number of devices **/
+class UCL_Device {
+ public:
+  /// Collect properties for every GPU on the node
+  /** \note You must set the active GPU with set() before using the device **/
+  inline UCL_Device();
+
+  inline ~UCL_Device();
+
+  /// Returns 1 (For compatibility with OpenCL)
+  inline int num_platforms() { return 1; }
+
+  /// Return a string with name and info of the current platform
+  inline std::string platform_name()
+    { return "HIP platform"; }
+
+  /// Delete any contexts/data and set the platform number to be used
+  inline int set_platform(const int pid);
+
+  /// Return the number of devices that support CUDA
+  inline int num_devices() { return _properties.size(); }
+
+  /// Set the CUDA device to the specified device number
+  /** A context and default command queue will be created for the device
+    * Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not
+    * be allocated for use. clear() is called to delete any contexts and
+    * associated data from previous calls to set(). **/
+  inline int set(int num);
+
+  /// Delete any context and associated data stored from a call to set()
+  inline void clear();
+
+  /// Get the current device number
+  inline int device_num() { return _device; }
+
+  /// Returns the default stream for the current device
+  inline command_queue & cq() { return cq(0); }
+
+  /// Returns the stream indexed by i
+  inline command_queue & cq(const int i) { return _cq[i]; }
+
+  /// Block until all commands in the default stream have completed
+  inline void sync() { sync(0); }
+
+  /// Block until all commands in the specified stream have completed
+  inline void sync(const int i) { ucl_sync(cq(i)); }
+
+  /// Get the number of command queues currently available on device
+  inline int num_queues()
+    { return _cq.size(); }
+
+  /// Add a stream for device computations
+  inline void push_command_queue() {
+    _cq.push_back(hipStream_t());
+    CU_SAFE_CALL(hipStreamCreateWithFlags(&_cq.back(),0));
+  }
+
+  /// Remove a stream for device computations
+  /** \note You cannot delete the default stream **/
+  inline void pop_command_queue() {
+    if (_cq.size()<2) return;
+    CU_SAFE_CALL_NS(hipStreamDestroy(_cq.back()));
+    _cq.pop_back();
+  }
+
+  /// Set the default command queue (by default this is the null stream)
+  /** \param i index of the command queue (as added by push_command_queue())
+      If i is 0, the default command queue is set to the null stream **/
+  inline void set_command_queue(const int i) {
+    if (i==0) _cq[0]=0;
+    else _cq[0]=_cq[i];
+  }
+
+  /// Get the current CUDA device name
+  inline std::string name() { return name(_device); }
+  /// Get the CUDA device name
+  inline std::string name(const int i)
+    { return std::string(_properties[i].name); }
+
+  /// Get a string telling the type of the current device
+  inline std::string device_type_name() { return device_type_name(_device); }
+  /// Get a string telling the type of the device
+  inline std::string device_type_name(const int i) { return "GPU"; }
+
+  /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
+  inline int device_type() { return device_type(_device); }
+  /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
+  inline int device_type(const int i) { return UCL_GPU; }
+
+  /// Returns true if host memory is efficiently addressable from device
+  inline bool shared_memory() { return shared_memory(_device); }
+  /// Returns true if host memory is efficiently addressable from device
+  inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }
+
+  /// Returns true if double precision is support for the current device
+  inline bool double_precision() { return double_precision(_device); }
+  /// Returns true if double precision is support for the device
+  inline bool double_precision(const int i) {return arch(i)>=1.3;}
+
+  /// Get the number of compute units on the current device
+  inline unsigned cus() { return cus(_device); }
+  /// Get the number of compute units
+  inline unsigned cus(const int i)
+    { return _properties[i].multiProcessorCount; }
+
+  /// Get the number of cores in the current device
+  inline unsigned cores() { return cores(_device); }
+  /// Get the number of cores
+  inline unsigned cores(const int i)
+    { if (arch(i)<2.0) return _properties[i].multiProcessorCount*8;
+      else if (arch(i)<2.1) return _properties[i].multiProcessorCount*32;
+      else if (arch(i)<3.0) return _properties[i].multiProcessorCount*48;
+      else return _properties[i].multiProcessorCount*192; }
+
+  /// Get the gigabytes of global memory in the current device
+  inline double gigabytes() { return gigabytes(_device); }
+  /// Get the gigabytes of global memory
+  inline double gigabytes(const int i)
+    { return static_cast<double>(_properties[i].totalGlobalMem)/1073741824; }
+
+  /// Get the bytes of global memory in the current device
+  inline size_t bytes() { return bytes(_device); }
+  /// Get the bytes of global memory
+  inline size_t bytes(const int i) { return _properties[i].totalGlobalMem; }
+
+  // Get the gigabytes of free memory in the current device
+  inline double free_gigabytes() { return free_gigabytes(_device); }
+  // Get the gigabytes of free memory
+  inline double free_gigabytes(const int i)
+    { return static_cast<double>(free_bytes(i))/1073741824; }
+
+  // Get the bytes of free memory in the current device
+  inline size_t free_bytes() { return free_bytes(_device); }
+  // Get the bytes of free memory
+  inline size_t free_bytes(const int i) {
+    CUDA_INT_TYPE dfree, dtotal;
+    CU_SAFE_CALL_NS(hipMemGetInfo(&dfree, &dtotal));
+    return static_cast<size_t>(dfree);
+  }
+
+  /// Return the GPGPU compute capability for current device
+  inline double arch() { return arch(_device); }
+  /// Return the GPGPU compute capability
+  inline double arch(const int i)
+    { return static_cast<double>(_properties[i].minor)/10+_properties[i].major;}
+
+  /// Clock rate in GHz for current device
+  inline double clock_rate() { return clock_rate(_device); }
+  /// Clock rate in GHz
+  inline double clock_rate(const int i)
+    { return _properties[i].clockRate*1e-6;}
+
+  /// Get the maximum number of threads per block
+  inline size_t group_size() { return group_size(_device); }
+  /// Get the maximum number of threads per block
+  inline size_t group_size(const int i)
+    { return _properties[i].maxThreadsPerBlock; }
+
+  /// Return the maximum memory pitch in bytes for current device
+  inline size_t max_pitch() { return max_pitch(_device); }
+  /// Return the maximum memory pitch in bytes
+  inline size_t max_pitch(const int i) { return _properties[i].memPitch; }
+
+  /// Returns false if accelerator cannot be shared by multiple processes
+  /** If it cannot be determined, true is returned **/
+  inline bool sharing_supported() { return sharing_supported(_device); }
+  /// Returns false if accelerator cannot be shared by multiple processes
+  /** If it cannot be determined, true is returned **/
+  inline bool sharing_supported(const int i)
+    { return (_properties[i].computeMode == hipComputeModeDefault); }
+
+  /// True if splitting device into equal subdevices supported
+  inline bool fission_equal()
+    { return fission_equal(_device); }
+  /// True if splitting device into equal subdevices supported
+  inline bool fission_equal(const int i)
+    { return false; }
+  /// True if splitting device into subdevices by specified counts supported
+  inline bool fission_by_counts()
+    { return fission_by_counts(_device); }
+  /// True if splitting device into subdevices by specified counts supported
+  inline bool fission_by_counts(const int i)
+    { return false; }
+  /// True if splitting device into subdevices by affinity domains supported
+  inline bool fission_by_affinity()
+    { return fission_by_affinity(_device); }
+  /// True if splitting device into subdevices by affinity domains supported
+  inline bool fission_by_affinity(const int i)
+    { return false; }
+
+  /// Maximum number of subdevices allowed from device fission
+  inline int max_sub_devices()
+    { return max_sub_devices(_device); }
+  /// Maximum number of subdevices allowed from device fission
+  inline int max_sub_devices(const int i)
+    { return 0; }
+
+  /// List all devices along with all properties
+  inline void print_all(std::ostream &out);
+
+  /// Select the platform that has accelerators (for compatibility with OpenCL)
+  inline int set_platform_accelerator(int pid=-1) { return UCL_SUCCESS; }
+
+  inline int load_module(const void* program, hipModule_t& module, std::string *log=NULL){
+    auto it = _loaded_modules.emplace(program, hipModule_t());
+    if(!it.second){
+      module = it.first->second;
+      return UCL_SUCCESS;
+    }
+    const unsigned int num_opts=2;
+    hipJitOption options[num_opts];
+    void *values[num_opts];
+
+    // set up size of compilation log buffer
+    options[0] = hipJitOptionInfoLogBufferSizeBytes;
+    values[0] = (void *)(int)10240;
+    // set up pointer to the compilation log buffer
+    options[1] = hipJitOptionInfoLogBuffer;
+    char clog[10240] = { 0 };
+    values[1] = clog;
+
+    hipError_t err=hipModuleLoadDataEx(&module,program,num_opts, options,(void **)values);
+
+    if (log!=NULL)
+      *log=std::string(clog);
+
+    if (err != hipSuccess) {
+      #ifndef UCL_NO_EXIT
+      std::cerr << std::endl
+                << "----------------------------------------------------------\n"
+                << " UCL Error: Error compiling PTX Program...\n"
+                << "----------------------------------------------------------\n";
+      std::cerr << log << std::endl;
+      #endif
+      _loaded_modules.erase(it.first);
+      return UCL_COMPILE_ERROR;
+    }
+    it.first->second = module;
+    return UCL_SUCCESS;
+  }
+ private:
+  std::unordered_map<const void*, hipModule_t> _loaded_modules;
+  int _device, _num_devices;
+  std::vector<NVDProperties> _properties;
+  std::vector<hipStream_t> _cq;
+  hipDevice_t _cu_device;
+};
+
+// Grabs the properties for all devices
+UCL_Device::UCL_Device() {
+  CU_SAFE_CALL_NS(hipInit(0));
+  CU_SAFE_CALL_NS(hipGetDeviceCount(&_num_devices));
+  for (int i=0; i<_num_devices; ++i) {
+    hipDevice_t dev;
+    CU_SAFE_CALL_NS(hipDeviceGet(&dev,i));
+    int major, minor;
+    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, dev));
+    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, dev));
+    if (major==9999)
+      continue;
+
+    NVDProperties prop;
+    prop.device_id = i;
+    prop.major=major;
+    prop.minor=minor;
+
+    char namecstr[1024];
+    CU_SAFE_CALL_NS(hipDeviceGetName(namecstr,1024,dev));
+    prop.name=namecstr;
+
+    CU_SAFE_CALL_NS(hipDeviceTotalMem(&prop.totalGlobalMem,dev));
+    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.multiProcessorCount, hipDeviceAttributeMultiprocessorCount, dev));
+
+    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxThreadsPerBlock, hipDeviceAttributeMaxThreadsPerBlock, dev));
+    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxThreadsDim[0], hipDeviceAttributeMaxBlockDimX, dev));
+    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxThreadsDim[1], hipDeviceAttributeMaxBlockDimY, dev));
+    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxThreadsDim[2], hipDeviceAttributeMaxBlockDimZ, dev));
+    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxGridSize[0], hipDeviceAttributeMaxGridDimX, dev));
+    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxGridSize[1], hipDeviceAttributeMaxGridDimY, dev));
+    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxGridSize[2], hipDeviceAttributeMaxGridDimZ, dev));
+    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.sharedMemPerBlock, hipDeviceAttributeMaxSharedMemoryPerBlock, dev));
+    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.totalConstantMemory, hipDeviceAttributeTotalConstantMemory, dev));
+    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.SIMDWidth, hipDeviceAttributeWarpSize, dev));
+    //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev));
+    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.regsPerBlock, hipDeviceAttributeMaxRegistersPerBlock, dev));
+    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.clockRate, hipDeviceAttributeClockRate, dev));
+    //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev));
+
+    //#if CUDA_VERSION >= 2020
+    //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,dev));
+    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.integrated, hipDeviceAttributeIntegrated, dev));
+    //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev));
+    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.computeMode, hipDeviceAttributeComputeMode,dev));
+    //#endif
+    //#if CUDA_VERSION >= 3010
+    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.concurrentKernels, hipDeviceAttributeConcurrentKernels, dev));
+    //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
+    //#endif
+
+    _properties.push_back(prop);
+  }
+  _device=-1;
+  _cq.push_back(hipStream_t());
+  _cq.back()=0;
+}
+
+UCL_Device::~UCL_Device() {
+  clear();
+}
+
+int UCL_Device::set_platform(const int pid) {
+  clear();
+  #ifdef UCL_DEBUG
+  assert(pid<num_platforms());
+  #endif
+  return UCL_SUCCESS;
+}
+
+// Set the CUDA device to the specified device number
+int UCL_Device::set(int num) {
+  clear();
+  _device=_properties[num].device_id;
+  hipError_t err=hipDeviceGet(&_cu_device,_device);
+  if (err!=hipSuccess) {
+    #ifndef UCL_NO_EXIT
+    std::cerr << "UCL Error: Could not access accelerator number " << num
+              << " for use.\n";
+    UCL_GERYON_EXIT;
+    #endif
+    return UCL_ERROR;
+  }
+  //hipError_t err=hipCtxCreate(&_context,0,_cu_device); deprecated and unnecessary
+  err=hipSetDevice(_device);
+  if (err!=hipSuccess) {
+    #ifndef UCL_NO_EXIT
+    std::cerr << "UCL Error: Could not set accelerator number " << num
+              << " for use.\n";
+    UCL_GERYON_EXIT;
+    #endif
+    return UCL_ERROR;
+  }
+  return UCL_SUCCESS;
+}
+
+void UCL_Device::clear() {
+  if (_device>-1) {
+    for (int i=1; i<num_queues(); i++) pop_command_queue();
+    CU_SAFE_CALL_NS(hipDeviceReset());
+  }
+  _device=-1;
+}
+
+// List all devices along with all properties
+void UCL_Device::print_all(std::ostream &out) {
+  //#if CUDA_VERSION >= 2020
+  int driver_version;
+  hipDriverGetVersion(&driver_version);
+  out << "Driver Version:                           "
+      << driver_version/1000 << "." << driver_version%100
+                  << std::endl;
+  //#endif
+
+  if (num_devices() == 0)
+    out << "There is no device supporting HIP\n";
+  for (int i=0; i<num_devices(); ++i) {
+    out << "\nDevice " << i << ": \"" << name(i) << "\"\n";
+    out << "  Type of device:                                "
+        << device_type_name(i).c_str() << std::endl;
+    out << "  Compute capability:                            "
+        << arch(i) << std::endl;
+    out << "  Double precision support:                      ";
+    if (double_precision(i))
+      out << "Yes\n";
+    else
+      out << "No\n";
+    out << "  Total amount of global memory:                 "
+        << gigabytes(i) << " GB\n";
+    //#if CUDA_VERSION >= 2000
+    out << "  Number of compute units/multiprocessors:       "
+        << _properties[i].multiProcessorCount << std::endl;
+    out << "  Number of cores:                               "
+        << cores(i) << std::endl;
+    //#endif
+    out << "  Total amount of constant memory:               "
+        << _properties[i].totalConstantMemory << " bytes\n";
+    out << "  Total amount of local/shared memory per block: "
+        << _properties[i].sharedMemPerBlock << " bytes\n";
+    out << "  Total number of registers available per block: "
+        << _properties[i].regsPerBlock << std::endl;
+    out << "  Warp size:                                     "
+        << _properties[i].SIMDWidth << std::endl;
+    out << "  Maximum number of threads per block:           "
+        << _properties[i].maxThreadsPerBlock << std::endl;
+    out << "  Maximum group size (# of threads per block)    "
+        << _properties[i].maxThreadsDim[0] << " x "
+        << _properties[i].maxThreadsDim[1] << " x "
+        << _properties[i].maxThreadsDim[2] << std::endl;
+    out << "  Maximum item sizes (# threads for each dim)    "
+        << _properties[i].maxGridSize[0] << " x "
+        << _properties[i].maxGridSize[1] << " x "
+        << _properties[i].maxGridSize[2] << std::endl;
+    //out << "  Maximum memory pitch:                          "
+    //    << max_pitch(i) << " bytes\n";
+    //out << "  Texture alignment:                             "
+    //    << _properties[i].textureAlign << " bytes\n";
+    out << "  Clock rate:                                    "
+        << clock_rate(i) << " GHz\n";
+    //#if CUDA_VERSION >= 2020
+    //out << "  Run time limit on kernels:                     ";
+    //if (_properties[i].kernelExecTimeoutEnabled)
+    //  out << "Yes\n";
+    //else
+    //  out << "No\n";
+    out << "  Integrated:                                    ";
+    if (_properties[i].integrated)
+      out << "Yes\n";
+    else
+      out << "No\n";
+    //out << "  Support host page-locked memory mapping:       ";
+    //if (_properties[i].canMapHostMemory)
+    //  out << "Yes\n";
+    //else
+    //  out << "No\n";
+    out << "  Compute mode:                                  ";
+    if (_properties[i].computeMode == hipComputeModeDefault)
+      out << "Default\n"; // multiple threads can use device
+//#if CUDA_VERSION >= 8000
+//    else if (_properties[i].computeMode == hipComputeModeExclusiveProcess)
+//#else
+    else if (_properties[i].computeMode == hipComputeModeExclusive)
+//#endif
+      out << "Exclusive\n"; // only thread can use device
+    else if (_properties[i].computeMode == hipComputeModeProhibited)
+      out << "Prohibited\n"; // no thread can use device
+    //#if CUDART_VERSION >= 4000
+    else if (_properties[i].computeMode == hipComputeModeExclusiveProcess)
+      out << "Exclusive Process\n"; // multiple threads 1 process
+    //#endif
+    else
+      out << "Unknown\n";
+    //#endif
+    //#if CUDA_VERSION >= 3010
+    out << "  Concurrent kernel execution:                   ";
+    if (_properties[i].concurrentKernels)
+      out << "Yes\n";
+    else
+      out << "No\n";
+    //out << "  Device has ECC support enabled:                ";
+    //if (_properties[i].ECCEnabled)
+    //  out << "Yes\n";
+    //else
+    //  out << "No\n";
+    //#endif
+  }
+}
+
+}
+
+#endif
--- a/lib/gpu/geryon/hip_kernel.h
+++ b/lib/gpu/geryon/hip_kernel.h
@ -0,0 +1,298 @@
+/* -----------------------------------------------------------------------
+   Copyright (2010) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the Simplified BSD License.
+   ----------------------------------------------------------------------- */
+
+#ifndef HIP_KERNEL
+#define HIP_KERNEL
+
+
+#include <hip/hip_runtime.h>
+#include "hip_device.h"
+#include <fstream>
+#include <string>
+#include <iostream>
+
+namespace ucl_hip {
+
+class UCL_Texture;
+template <class numtyp> class UCL_D_Vec;
+template <class numtyp> class UCL_D_Mat;
+template <class hosttype, class devtype> class UCL_Vector;
+template <class hosttype, class devtype> class UCL_Matrix;
+#define UCL_MAX_KERNEL_ARGS 256
+
+/// Class storing 1 or more kernel functions from a single string or file
+class UCL_Program {
+  UCL_Device* _device_ptr;
+ public:
+  inline UCL_Program(UCL_Device &device) { _device_ptr = &device; _cq=device.cq(); }
+  inline UCL_Program(UCL_Device &device, const void *program,
+                     const char *flags="", std::string *log=NULL) {
+    _device_ptr = &device; _cq=device.cq();
+    init(device);
+    load_string(program,flags,log);
+  }
+
+  inline ~UCL_Program() {}
+
+  /// Initialize the program with a device
+  inline void init(UCL_Device &device) { _device_ptr = &device; _cq=device.cq(); }
+
+  /// Clear any data associated with program
+  /** \note Must call init() after each clear **/
+  inline void clear() { }
+
+  /// Load a program from a file and compile with flags
+  inline int load(const char *filename, const char *flags="", std::string *log=NULL) {
+    std::ifstream in(filename);
+    if (!in || in.is_open()==false) {
+      #ifndef UCL_NO_EXIT
+      std::cerr << "UCL Error: Could not open kernel file: "
+                << filename << std::endl;
+      UCL_GERYON_EXIT;
+      #endif
+      return UCL_FILE_NOT_FOUND;
+    }
+
+    std::string program((std::istreambuf_iterator<char>(in)),
+                        std::istreambuf_iterator<char>());
+    in.close();
+    return load_string(program.c_str(),flags,log);
+  }
+
+  /// Load a program from a string and compile with flags
+  inline int load_string(const void *program, const char *flags="", std::string *log=NULL) {
+    return _device_ptr->load_module(program, _module, log);
+  }
+
+  friend class UCL_Kernel;
+ private:
+  hipModule_t _module;
+  hipStream_t _cq;
+  friend class UCL_Texture;
+};
+
+/// Class for dealing with CUDA Driver kernels
+class UCL_Kernel {
+ public:
+  UCL_Kernel() : _dimensions(1), _num_args(0) {
+    _num_blocks[0]=0;
+  }
+
+  UCL_Kernel(UCL_Program &program, const char *function) :
+    _dimensions(1), _num_args(0) {
+    _num_blocks[0]=0;
+    set_function(program,function);
+    _cq=program._cq;
+  }
+
+  ~UCL_Kernel() {}
+
+  /// Clear any function associated with the kernel
+  inline void clear() { }
+
+  /// Get the kernel function from a program
+  /** \ret UCL_ERROR_FLAG (UCL_SUCCESS, UCL_FILE_NOT_FOUND, UCL_ERROR) **/
+  inline int set_function(UCL_Program &program, const char *function) {
+    hipError_t err=hipModuleGetFunction(&_kernel,program._module,function);
+    if (err!=hipSuccess) {
+      #ifndef UCL_NO_EXIT
+      std::cerr << "UCL Error: Could not find function: " << function
+                << " in program.\n";
+      UCL_GERYON_EXIT;
+      #endif
+      return UCL_FUNCTION_NOT_FOUND;
+    }
+    _cq=program._cq;
+    return UCL_SUCCESS;
+  }
+
+  /// Set the kernel argument.
+  /** If not a device pointer, this must be repeated each time the argument
+    * changes
+    * \note To set kernel parameter i (i>0), parameter i-1 must be set **/
+  template <class dtype>
+  inline void set_arg(const unsigned index, const dtype * const arg) {
+    if (index==_num_args)
+      add_arg(arg);
+    else if (index<_num_args){
+      assert(0==1); // not implemented
+    }
+    else
+      assert(0==1); // Must add kernel parameters in sequential order
+  }
+
+  /// Set a geryon container as a kernel argument.
+  template <class numtyp>
+  inline void set_arg(const UCL_D_Vec<numtyp> * const arg)
+    { set_arg(&arg->begin()); }
+
+  /// Set a geryon container as a kernel argument.
+  template <class numtyp>
+  inline void set_arg(const UCL_D_Mat<numtyp> * const arg)
+    { set_arg(&arg->begin()); }
+
+  /// Set a geryon container as a kernel argument.
+  template <class hosttype, class devtype>
+  inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg)
+    { set_arg(&arg->device.begin()); }
+
+  /// Set a geryon container as a kernel argument.
+  template <class hosttype, class devtype>
+  inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg)
+    { set_arg(&arg->device.begin()); }
+
+  /// Add a kernel argument.
+  inline void add_arg(const hipDeviceptr_t* const arg) {
+    add_arg<void*>((void**)arg);
+  }
+
+  /// Add a kernel argument.
+  template <class dtype>
+  inline void add_arg(const dtype* const arg) {
+    const auto old_size = _hip_kernel_args.size();
+    const auto aligned_size = (old_size+alignof(dtype)-1) & ~(alignof(dtype)-1);
+    const auto arg_size = sizeof(dtype);
+    _hip_kernel_args.resize(aligned_size + arg_size);
+    *((dtype*)(&_hip_kernel_args[aligned_size])) = *arg; 
+    _num_args++;
+    if (_num_args>UCL_MAX_KERNEL_ARGS) assert(0==1);
+  }
+
+  /// Add a geryon container as a kernel argument.
+  template <class numtyp>
+  inline void add_arg(const UCL_D_Vec<numtyp> * const arg)
+    { add_arg(&arg->begin()); }
+
+  /// Add a geryon container as a kernel argument.
+  template <class numtyp>
+  inline void add_arg(const UCL_D_Mat<numtyp> * const arg)
+    { add_arg(&arg->begin()); }
+
+  /// Add a geryon container as a kernel argument.
+  template <class hosttype, class devtype>
+  inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg)
+    { add_arg(&arg->device.begin()); }
+
+  /// Add a geryon container as a kernel argument.
+  template <class hosttype, class devtype>
+  inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg)
+    { add_arg(&arg->device.begin()); }
+
+  /// Set the number of thread blocks and the number of threads in each block
+  /** \note This should be called before any arguments have been added
+      \note The default command queue is used for the kernel execution **/
+  inline void set_size(const size_t num_blocks, const size_t block_size) {
+    _dimensions=1;
+    _num_blocks[0]=num_blocks;
+    _num_blocks[1]=1;
+    _num_blocks[2]=1;
+    
+    _block_size[0]=block_size;
+    _block_size[1]=1;
+    _block_size[2]=1;
+  }
+
+  /// Set the number of thread blocks and the number of threads in each block
+  /** \note This should be called before any arguments have been added
+      \note The default command queue for the kernel is changed to cq **/
+  inline void set_size(const size_t num_blocks, const size_t block_size,
+                       command_queue &cq)
+    { _cq=cq; set_size(num_blocks,block_size); }
+
+  /// Set the number of thread blocks and the number of threads in each block
+  /** \note This should be called before any arguments have been added
+      \note The default command queue is used for the kernel execution **/
+  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
+                       const size_t block_size_x, const size_t block_size_y) {
+    _dimensions=2;
+    _num_blocks[0]=num_blocks_x;
+    _num_blocks[1]=num_blocks_y;
+    _num_blocks[2]=1;
+
+    _block_size[0]=block_size_x;
+    _block_size[1]=block_size_y;
+    _block_size[2]=1;
+  }
+
+  /// Set the number of thread blocks and the number of threads in each block
+  /** \note This should be called before any arguments have been added
+      \note The default command queue for the kernel is changed to cq **/
+  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
+                       const size_t block_size_x, const size_t block_size_y,
+                       command_queue &cq)
+    {_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);}
+
+  /// Set the number of thread blocks and the number of threads in each block
+  /** \note This should be called before any arguments have been added
+      \note The default command queue is used for the kernel execution **/
+  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
+                       const size_t block_size_x,
+                       const size_t block_size_y, const size_t block_size_z) {
+    _dimensions=2;
+    _num_blocks[0]=num_blocks_x;
+    _num_blocks[1]=num_blocks_y;
+    _num_blocks[2]=1;
+
+    _block_size[0]=block_size_x;
+    _block_size[1]=block_size_y;
+    _block_size[2]=block_size_z;
+  }
+
+  /// Set the number of thread blocks and the number of threads in each block
+  /** \note This should be called before any arguments have been added
+      \note The default command queue is used for the kernel execution **/
+  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
+                       const size_t block_size_x, const size_t block_size_y,
+                       const size_t block_size_z, command_queue &cq) {
+    _cq=cq;
+    set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y,
+             block_size_z);
+  }
+
+  /// Run the kernel in the default command queue
+  inline void run() {
+    size_t args_size = _hip_kernel_args.size();
+    void *config[] = {
+            HIP_LAUNCH_PARAM_BUFFER_POINTER, (void*)_hip_kernel_args.data(),
+            HIP_LAUNCH_PARAM_BUFFER_SIZE,    &args_size,
+            HIP_LAUNCH_PARAM_END
+    };
+    const auto res = hipModuleLaunchKernel(_kernel,_num_blocks[0],_num_blocks[1],
+                                _num_blocks[2],_block_size[0],_block_size[1],
+                                _block_size[2],0,_cq, NULL, config);
+    CU_SAFE_CALL(res);
+//#endif
+  }
+
+  /// Clear any arguments associated with the kernel
+  inline void clear_args() {
+    _num_args=0;
+    _hip_kernel_args.clear();
+  }
+
+  /// Return the default command queue/stream associated with this data
+  inline command_queue & cq() { return _cq; }
+  /// Change the default command queue associated with matrix
+  inline void cq(command_queue &cq_in) { _cq=cq_in; }
+  #include "ucl_arg_kludge.h"
+
+ private:
+  hipFunction_t _kernel;
+  hipStream_t _cq;
+  unsigned _dimensions;
+  unsigned _num_blocks[3];
+  unsigned _num_args;
+  friend class UCL_Texture;
+
+  unsigned _block_size[3];
+  std::vector<char> _hip_kernel_args;
+};
+
+} // namespace
+
+#endif
+
--- a/lib/gpu/geryon/hip_macros.h
+++ b/lib/gpu/geryon/hip_macros.h
@ -0,0 +1,83 @@
+#ifndef HIP_MACROS_H
+#define HIP_MACROS_H
+
+#include <cstdio>
+#include <cassert>
+#include <hip/hip_runtime.h>
+
+//#if CUDA_VERSION >= 3020
+#define CUDA_INT_TYPE size_t
+//#else
+//#define CUDA_INT_TYPE unsigned
+//#endif
+
+#ifdef MPI_GERYON
+#include "mpi.h"
+#define NVD_GERYON_EXIT do {                                               \
+  int is_final;                                                            \
+  MPI_Finalized(&is_final);                                                \
+  if (!is_final)                                                           \
+    MPI_Abort(MPI_COMM_WORLD,-1);                                          \
+  } while(0)
+#else
+#define NVD_GERYON_EXIT assert(0==1)
+#endif
+
+#ifndef UCL_GERYON_EXIT
+#define UCL_GERYON_EXIT NVD_GERYON_EXIT
+#endif
+
+#ifdef UCL_DEBUG
+#define UCL_SYNC_DEBUG
+#define UCL_DESTRUCT_CHECK
+#endif
+
+#ifndef UCL_NO_API_CHECK
+
+#define CU_SAFE_CALL_NS( call ) do {                                         \
+    hipError_t err = call;                                                     \
+    if( hipSuccess != err) {                                               \
+        fprintf(stderr, "HIP runtime error %d in call at file '%s' in line %i.\n",   \
+                err, __FILE__, __LINE__ );                                   \
+        NVD_GERYON_EXIT;                                                     \
+    } } while (0)
+
+#ifdef UCL_SYNC_DEBUG
+
+#define CU_SAFE_CALL( call ) do {                                            \
+    CU_SAFE_CALL_NS( call );                                                 \
+    hipError_t err=hipCtxSynchronize();                                                  \
+    if( hipSuccess != err) {                                               \
+        fprintf(stderr, "HIP runtime error %d in file '%s' in line %i.\n",   \
+                err, __FILE__, __LINE__ );                                   \
+        NVD_GERYON_EXIT;                                                     \
+    } } while (0)
+
+#else
+
+#define CU_SAFE_CALL( call ) CU_SAFE_CALL_NS( call )
+
+#endif
+
+#else  // not DEBUG
+
+// void macros for performance reasons
+#define CU_SAFE_CALL_NS( call ) call
+#define CU_SAFE_CALL( call) call
+
+#endif
+
+#ifdef UCL_DESTRUCT_CHECK
+
+#define CU_DESTRUCT_CALL( call) CU_SAFE_CALL( call)
+#define CU_DESTRUCT_CALL_NS( call) CU_SAFE_CALL_NS( call)
+
+#else
+
+#define CU_DESTRUCT_CALL( call) call
+#define CU_DESTRUCT_CALL_NS( call) call
+
+#endif
+
+#endif
+
--- a/lib/gpu/geryon/hip_mat.h
+++ b/lib/gpu/geryon/hip_mat.h
@ -0,0 +1,43 @@
+/* -----------------------------------------------------------------------
+   Copyright (2010) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the Simplified BSD License.
+   ----------------------------------------------------------------------- */
+
+/*! \file */
+
+#ifndef HIP_MAT_H
+#define HIP_MAT_H
+
+
+#include <hip/hip_runtime.h>
+#include "hip_memory.h"
+
+/// Namespace for CUDA Driver routines
+namespace ucl_hip {
+
+#define _UCL_MAT_ALLOW
+#define _UCL_DEVICE_PTR_MAT
+#include "ucl_basemat.h"
+#include "ucl_h_vec.h"
+#include "ucl_h_mat.h"
+#include "ucl_d_vec.h"
+#include "ucl_d_mat.h"
+#include "ucl_s_obj_help.h"
+#include "ucl_vector.h"
+#include "ucl_matrix.h"
+#undef _UCL_DEVICE_PTR_MAT
+#undef _UCL_MAT_ALLOW
+
+#define UCL_COPY_ALLOW
+#include "ucl_copy.h"
+#undef UCL_COPY_ALLOW
+
+#define UCL_PRINT_ALLOW
+#include "ucl_print.h"
+#undef UCL_PRINT_ALLOW
+
+} // namespace ucl_cudadr
+
+#endif
--- a/lib/gpu/geryon/hip_memory.h
+++ b/lib/gpu/geryon/hip_memory.h
@ -0,0 +1,279 @@
+/* -----------------------------------------------------------------------
+   Copyright (2010) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the Simplified BSD License.
+   ----------------------------------------------------------------------- */
+
+#ifndef HIP_MEMORY_H
+#define HIP_MEMORY_H
+
+
+#include <hip/hip_runtime.h>
+#include <iostream>
+#include <cassert>
+#include <cstring>
+#include "hip_macros.h"
+#include "hip_device.h"
+#include "ucl_types.h"
+
+namespace ucl_hip {
+
+// --------------------------------------------------------------------------
+// - API Specific Types
+// --------------------------------------------------------------------------
+//typedef dim3 ucl_kernel_dim;
+
+#ifdef __HIP_PLATFORM_NVCC__
+typedef enum hipArray_Format {
+    HIP_AD_FORMAT_UNSIGNED_INT8 = 0x01,
+    HIP_AD_FORMAT_UNSIGNED_INT16 = 0x02,
+    HIP_AD_FORMAT_UNSIGNED_INT32 = 0x03,
+    HIP_AD_FORMAT_SIGNED_INT8 = 0x08,
+    HIP_AD_FORMAT_SIGNED_INT16 = 0x09,
+    HIP_AD_FORMAT_SIGNED_INT32 = 0x0a,
+    HIP_AD_FORMAT_HALF = 0x10,
+    HIP_AD_FORMAT_FLOAT = 0x20
+}hipArray_Format;
+#endif
+
+// --------------------------------------------------------------------------
+// - API SPECIFIC DEVICE POINTERS
+// --------------------------------------------------------------------------
+typedef hipDeviceptr_t device_ptr;
+
+// --------------------------------------------------------------------------
+// - HOST MEMORY ALLOCATION ROUTINES
+// --------------------------------------------------------------------------
+template <class mat_type, class copy_type>
+inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
+                       const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
+  hipError_t err=hipSuccess;
+  if (kind==UCL_NOT_PINNED)
+    *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
+  else if (kind==UCL_WRITE_ONLY)
+    err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocWriteCombined);
+  else
+    err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocDefault);
+  if (err!=hipSuccess || *(mat.host_ptr())==NULL)
+    return UCL_MEMORY_ERROR;
+  mat.cq()=cm.cq();
+  return UCL_SUCCESS;
+}
+
+template <class mat_type>
+inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
+                       const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
+  hipError_t err=hipSuccess;
+  if (kind==UCL_NOT_PINNED)
+    *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
+  else if (kind==UCL_WRITE_ONLY)
+    err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocWriteCombined);
+  else
+    err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocDefault);
+  if (err!=hipSuccess || *(mat.host_ptr())==NULL)
+    return UCL_MEMORY_ERROR;
+  mat.cq()=dev.cq();
+  return UCL_SUCCESS;
+}
+
+template <class mat_type>
+inline void _host_free(mat_type &mat) {
+  if (mat.kind()==UCL_VIEW)
+    return;
+  else if (mat.kind()!=UCL_NOT_PINNED)
+    CU_DESTRUCT_CALL(hipHostFree(mat.begin()));
+  else
+    free(mat.begin());
+}
+
+template <class mat_type>
+inline int _host_resize(mat_type &mat, const size_t n) {
+  _host_free(mat);
+  hipError_t err=hipSuccess;
+  if (mat.kind()==UCL_NOT_PINNED)
+    *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
+  else if (mat.kind()==UCL_WRITE_ONLY)
+    err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocWriteCombined);
+  else
+    err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocDefault);
+  if (err!=hipSuccess || *(mat.host_ptr())==NULL)
+    return UCL_MEMORY_ERROR;
+  return UCL_SUCCESS;
+}
+
+// --------------------------------------------------------------------------
+// - DEVICE MEMORY ALLOCATION ROUTINES
+// --------------------------------------------------------------------------
+template <class mat_type, class copy_type>
+inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n,
+                         const enum UCL_MEMOPT kind) {
+  hipError_t err=hipMalloc((void**)&mat.cbegin(),n);
+  if (err!=hipSuccess)
+    return UCL_MEMORY_ERROR;
+  mat.cq()=cm.cq();
+  return UCL_SUCCESS;
+}
+
+template <class mat_type>
+inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
+                         const enum UCL_MEMOPT kind) {
+  hipError_t err=hipMalloc((void**)&mat.cbegin(),n);
+  if (err!=hipSuccess)
+    return UCL_MEMORY_ERROR;
+  mat.cq()=dev.cq();
+  return UCL_SUCCESS;
+}
+
+template <class mat_type, class copy_type>
+inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t rows,
+                         const size_t cols, size_t &pitch,
+                         const enum UCL_MEMOPT kind) {
+  hipError_t err;
+  size_t upitch;
+  err=hipMallocPitch((void**)&mat.cbegin(),&upitch,
+                      cols*sizeof(typename mat_type::data_type),rows);
+  pitch=static_cast<size_t>(upitch);
+  if (err!=hipSuccess)
+    return UCL_MEMORY_ERROR;
+  mat.cq()=cm.cq();
+  return UCL_SUCCESS;
+}
+
+template <class mat_type, class copy_type>
+inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
+                         const size_t cols, size_t &pitch,
+                         const enum UCL_MEMOPT kind) {
+  hipError_t err;
+  size_t upitch;
+  err=hipMallocPitch((void**)&mat.cbegin(),&upitch,
+                      cols*sizeof(typename mat_type::data_type),rows);
+  pitch=static_cast<size_t>(upitch);
+  if (err!=hipSuccess)
+    return UCL_MEMORY_ERROR;
+  mat.cq()=d.cq();
+  return UCL_SUCCESS;
+}
+
+template <class mat_type>
+inline void _device_free(mat_type &mat) {
+  if (mat.kind()!=UCL_VIEW){
+    CU_DESTRUCT_CALL(hipFree((void*)mat.cbegin()));
+  }
+}
+
+template <class mat_type>
+inline int _device_resize(mat_type &mat, const size_t n) {
+  _device_free(mat);
+  hipError_t err=hipMalloc((void**)&mat.cbegin(),n);
+  if (err!=hipSuccess)
+    return UCL_MEMORY_ERROR;
+  return UCL_SUCCESS;
+}
+
+template <class mat_type>
+inline int _device_resize(mat_type &mat, const size_t rows,
+                          const size_t cols, size_t &pitch) {
+  _device_free(mat);
+  hipError_t err;
+  size_t upitch;
+  err=hipMallocPitch((void**)&mat.cbegin(),&upitch,
+                      cols*sizeof(typename mat_type::data_type),rows);
+  pitch=static_cast<size_t>(upitch);
+  if (err!=hipSuccess)
+    return UCL_MEMORY_ERROR;
+  return UCL_SUCCESS;
+}
+
+inline void _device_view(hipDeviceptr_t *ptr, hipDeviceptr_t &in) {
+  *ptr=in;
+}
+
+template <class numtyp>
+inline void _device_view(hipDeviceptr_t *ptr, numtyp *in) {
+  *ptr=0;
+}
+
+inline void _device_view(hipDeviceptr_t *ptr, hipDeviceptr_t &in,
+                         const size_t offset, const size_t numsize) {
+  *ptr=(hipDeviceptr_t)(((char*)in)+offset*numsize);
+}
+
+template <class numtyp>
+inline void _device_view(hipDeviceptr_t *ptr, numtyp *in,
+                         const size_t offset, const size_t numsize) {
+  *ptr=0;
+}
+
+// --------------------------------------------------------------------------
+// - DEVICE IMAGE ALLOCATION ROUTINES
+// --------------------------------------------------------------------------
+template <class mat_type, class copy_type>
+inline void _device_image_alloc(mat_type &mat, copy_type &cm, const size_t rows,
+                                const size_t cols) {
+  assert(0==1);
+}
+
+template <class mat_type, class copy_type>
+inline void _device_image_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
+                                const size_t cols) {
+  assert(0==1);
+}
+
+template <class mat_type>
+inline void _device_image_free(mat_type &mat) {
+  assert(0==1);
+}
+
+// --------------------------------------------------------------------------
+// - ZERO ROUTINES
+// --------------------------------------------------------------------------
+inline void _host_zero(void *ptr, const size_t n) {
+  memset(ptr,0,n);
+}
+
+template <class mat_type>
+inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) {
+    CU_SAFE_CALL(hipMemsetAsync((void*)mat.cbegin(),0,n,cq));
+}
+
+
+// --------------------------------------------------------------------------
+// - MEMCPY ROUTINES
+// --------------------------------------------------------------------------
+
+
+template<class mat1, class mat2>
+hipMemcpyKind _memcpy_kind(mat1 &dst, const mat2 &src){
+  assert(mat1::MEM_TYPE < 2 && mat2::MEM_TYPE < 2);
+  return (hipMemcpyKind)((1 - mat2::MEM_TYPE)*2 + (1 - mat1::MEM_TYPE));
+}
+
+template<class mat1, class mat2>
+inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n) {
+  CU_SAFE_CALL(hipMemcpy((void*)dst.begin(), (void*)src.begin(), n, _memcpy_kind(dst, src)));
+}
+
+template<class mat1, class mat2>
+inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n, hipStream_t &cq) {
+  CU_SAFE_CALL(hipMemcpyAsync((void*)dst.begin(), (void*)src.begin(), n, _memcpy_kind(dst, src), cq));
+}
+
+template<class mat1, class mat2>
+inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
+                       const size_t spitch, const size_t cols,
+                       const size_t rows) {
+  CU_SAFE_CALL(hipMemcpy2D((void*)dst.begin(), dpitch, (void*)src.begin(), spitch, cols, rows, _memcpy_kind(dst, src)));
+}
+
+template<class mat1, class mat2>
+inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
+                       const size_t spitch, const size_t cols,
+                       const size_t rows,hipStream_t &cq) {
+  CU_SAFE_CALL(hipMemcpy2DAsync((void*)dst.begin(), dpitch, (void*)src.begin(), spitch, cols, rows, _memcpy_kind(dst, src), cq));
+}
+
+} // namespace ucl_cudart
+
+#endif
+
--- a/lib/gpu/geryon/hip_texture.h
+++ b/lib/gpu/geryon/hip_texture.h
@ -0,0 +1,113 @@
+/* -----------------------------------------------------------------------
+   Copyright (2010) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the Simplified BSD License.
+   ----------------------------------------------------------------------- */
+
+#ifndef HIP_TEXTURE
+#define HIP_TEXTURE
+
+
+#include <hip/hip_runtime.h>
+#include "hip_kernel.h"
+#include "hip_mat.h"
+
+namespace ucl_hip {
+
+#ifdef __HIP_PLATFORM_NVCC__
+inline hipError_t hipModuleGetTexRef(CUtexref* texRef, hipModule_t hmod, const char* name){
+  return hipCUResultTohipError(cuModuleGetTexRef(texRef, hmod, name)); 
+}
+inline hipError_t hipTexRefSetFormat(CUtexref tex, hipArray_Format fmt, int NumPackedComponents) {
+    return hipCUResultTohipError(cuTexRefSetFormat(tex, (CUarray_format)fmt, NumPackedComponents ));
+}
+inline hipError_t hipTexRefSetAddress(size_t* offset, CUtexref tex, hipDeviceptr_t devPtr, size_t size) {
+    return hipCUResultTohipError(cuTexRefSetAddress(offset, tex, devPtr, size));
+}
+#endif
+
+/// Class storing a texture reference
+class UCL_Texture {
+ public:
+  UCL_Texture() {}
+  ~UCL_Texture() {}
+  /// Construct with a specified texture reference
+  inline UCL_Texture(UCL_Program &prog, const char *texture_name)
+    { get_texture(prog,texture_name); }
+  /// Set the texture reference for this object
+  inline void get_texture(UCL_Program &prog, const char *texture_name)
+    { 
+  #ifdef __HIP_PLATFORM_NVCC__
+      CU_SAFE_CALL(hipModuleGetTexRef(&_tex, prog._module, texture_name)); 
+  #else
+      size_t _global_var_size;
+      CU_SAFE_CALL(hipModuleGetGlobal(&_device_ptr_to_global_var, &_global_var_size, prog._module, texture_name));
+  #endif
+    }
+
+  /// Bind a float array where each fetch grabs a vector of length numel
+  template<class numtyp>
+  inline void bind_float(UCL_D_Vec<numtyp> &vec, const unsigned numel)
+    { _bind_float(vec,numel); }
+
+  /// Bind a float array where each fetch grabs a vector of length numel
+  template<class numtyp>
+  inline void bind_float(UCL_D_Mat<numtyp> &vec, const unsigned numel)
+    { _bind_float(vec,numel); }
+
+  /// Bind a float array where each fetch grabs a vector of length numel
+  template<class numtyp, class devtyp>
+  inline void bind_float(UCL_Vector<numtyp, devtyp> &vec, const unsigned numel)
+    { _bind_float(vec.device,numel); }
+
+  /// Bind a float array where each fetch grabs a vector of length numel
+  template<class numtyp, class devtyp>
+  inline void bind_float(UCL_Matrix<numtyp, devtyp> &vec, const unsigned numel)
+    { _bind_float(vec.device,numel); }
+
+  /// Unbind the texture reference from the memory allocation
+  inline void unbind() { }
+
+  /// Make a texture reference available to kernel
+  inline void allow(UCL_Kernel &kernel) {
+    //#if CUDA_VERSION < 4000
+    //CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex));
+    //#endif
+  }
+
+ private:
+#ifdef __HIP_PLATFORM_NVCC__
+  CUtexref _tex;
+#else
+  void* _device_ptr_to_global_var;
+#endif
+  friend class UCL_Kernel;
+
+  template<class mat_typ>
+  inline void _bind_float(mat_typ &vec, const unsigned numel) {
+    #ifdef UCL_DEBUG
+    assert(numel!=0 && numel<5);
+    #endif
+
+#ifdef __HIP_PLATFORM_NVCC__
+    if (vec.element_size()==sizeof(float))
+      CU_SAFE_CALL(hipTexRefSetFormat(_tex, HIP_AD_FORMAT_FLOAT, numel));
+    else {
+      if (numel>2)
+        CU_SAFE_CALL(hipTexRefSetFormat(_tex, HIP_AD_FORMAT_SIGNED_INT32, numel));
+      else
+        CU_SAFE_CALL(hipTexRefSetFormat(_tex,HIP_AD_FORMAT_SIGNED_INT32,numel*2));
+    }
+    CU_SAFE_CALL(hipTexRefSetAddress(NULL, _tex, vec.cbegin(), vec.numel()*vec.element_size()));
+#else
+    void* data_ptr = (void*)vec.cbegin();
+    CU_SAFE_CALL(hipMemcpyHtoD(hipDeviceptr_t(_device_ptr_to_global_var), &data_ptr, sizeof(void*)));
+#endif
+  }
+};
+
+} // namespace
+
+#endif
+
--- a/lib/gpu/geryon/hip_timer.h
+++ b/lib/gpu/geryon/hip_timer.h
@ -0,0 +1,107 @@
+/* -----------------------------------------------------------------------
+   Copyright (2010) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the Simplified BSD License.
+   ----------------------------------------------------------------------- */
+
+#ifndef HIP_TIMER_H
+#define HIP_TIMER_H
+
+
+#include <hip/hip_runtime.h>
+#include "hip_macros.h"
+#include "hip_device.h"
+
+namespace ucl_hip {
+
+/// Class for timing CUDA Driver events
+class UCL_Timer {
+ public:
+  inline UCL_Timer() : _total_time(0.0f), _initialized(false) { }
+  inline UCL_Timer(UCL_Device &dev) : _total_time(0.0f), _initialized(false)
+    { init(dev); }
+
+  inline ~UCL_Timer() { clear(); }
+
+  /// Clear any data associated with timer
+  /** \note init() must be called to reuse timer after a clear() **/
+  inline void clear() {
+    if (_initialized) {
+      CU_DESTRUCT_CALL(hipEventDestroy(start_event));
+      CU_DESTRUCT_CALL(hipEventDestroy(stop_event));
+      _initialized=false;
+      _total_time=0.0;
+    }
+  }
+
+  /// Initialize default command queue for timing
+  inline void init(UCL_Device &dev) { init(dev, dev.cq()); }
+
+  /// Initialize command queue for timing
+  inline void init(UCL_Device &dev, command_queue &cq) {
+    clear();
+    _cq=cq;
+    _initialized=true;
+    CU_SAFE_CALL( hipEventCreateWithFlags(&start_event,0) );
+    CU_SAFE_CALL( hipEventCreateWithFlags(&stop_event,0) );
+  }
+
+  /// Start timing on command queue
+  inline void start() { CU_SAFE_CALL(hipEventRecord(start_event,_cq)); }
+
+  /// Stop timing on command queue
+  inline void stop() { CU_SAFE_CALL(hipEventRecord(stop_event,_cq)); }
+
+  /// Block until the start event has been reached on device
+  inline void sync_start()
+    { CU_SAFE_CALL(hipEventSynchronize(start_event)); }
+
+  /// Block until the stop event has been reached on device
+  inline void sync_stop()
+    { CU_SAFE_CALL(hipEventSynchronize(stop_event)); }
+
+  /// Set the time elapsed to zero (not the total_time)
+  inline void zero() {
+    CU_SAFE_CALL(hipEventRecord(start_event,_cq));
+    CU_SAFE_CALL(hipEventRecord(stop_event,_cq));
+  }
+
+  /// Set the total time to zero
+  inline void zero_total() { _total_time=0.0; }
+
+  /// Add time from previous start and stop to total
+  /** Forces synchronization **/
+  inline double add_to_total()
+    { double t=time(); _total_time+=t; return t/1000.0; }
+
+  /// Add a user specified time to the total (ms)
+  inline void add_time_to_total(const double t) { _total_time+=t; }
+
+  /// Return the time (ms) of last start to stop - Forces synchronization
+  inline double time() {
+    float timer;
+    CU_SAFE_CALL(hipEventSynchronize(stop_event));
+    CU_SAFE_CALL( hipEventElapsedTime(&timer,start_event,stop_event) );
+    return timer;
+  }
+
+  /// Return the time (s) of last start to stop - Forces synchronization
+  inline double seconds() { return time()/1000.0; }
+
+  /// Return the total time in ms
+  inline double total_time() { return _total_time; }
+
+  /// Return the total time in seconds
+  inline double total_seconds() { return _total_time/1000.0; }
+
+ private:
+  hipEvent_t start_event, stop_event;
+  hipStream_t _cq;
+  double _total_time;
+  bool _initialized;
+};
+
+} // namespace
+
+#endif
--- a/lib/gpu/geryon/ucl_get_devices.cpp
+++ b/lib/gpu/geryon/ucl_get_devices.cpp
@ -36,6 +36,11 @@ using namespace ucl_cudadr;
 using namespace ucl_cudart;
 #endif

+#ifdef UCL_HIP
+#include "hip_device.h"
+using namespace ucl_hip;
+#endif
+
 int main(int argc, char** argv) {
  UCL_Device cop;
  std::cout << "Found " << cop.num_platforms() << " platform(s).\n";
--- a/lib/gpu/lal_answer.cpp
+++ b/lib/gpu/lal_answer.cpp
@ -179,13 +179,15 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
  if (_eflag) {
    for (int i=0; i<_inum; i++)
      evdwl+=engv[i];
-    if (_ef_atom)
-      if (_ilist==NULL)
+    if (_ef_atom) {
+      if (_ilist==NULL) {
        for (int i=0; i<_inum; i++)
          eatom[i]+=engv[i];
-      else
+      } else {
        for (int i=0; i<_inum; i++)
          eatom[_ilist[i]]+=engv[i];
+      }
+    }
    vstart=_inum;
  }
  if (_vflag) {
@ -193,7 +195,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
    for (int j=0; j<6; j++) {
      for (int i=vstart; i<iend; i++)
        virial[j]+=engv[i];
-      if (_vf_atom)
+      if (_vf_atom){
        if (_ilist==NULL) {
          int ii=0;
          for (int i=vstart; i<iend; i++)
@ -203,6 +205,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
          for (int i=vstart; i<iend; i++)
            vatom[_ilist[ii++]][j]+=engv[i];
        }
+      }
      vstart+=_inum;
      iend+=_inum;
    }
@ -228,7 +231,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
      evdwl+=engv[i];
    for (int i=_inum; i<iend; i++)
      ecoul+=engv[i];
-    if (_ef_atom)
+    if (_ef_atom) {
      if (_ilist==NULL) {
        for (int i=0; i<_inum; i++)
          eatom[i]+=engv[i];
@ -240,6 +243,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
        for (int i=_inum, ii=0; i<iend; i++)
          eatom[_ilist[ii++]]+=engv[i];
      }
+    }
    vstart=iend;
    iend+=_inum;
  }
@ -247,7 +251,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
    for (int j=0; j<6; j++) {
      for (int i=vstart; i<iend; i++)
        virial[j]+=engv[i];
-      if (_vf_atom)
+      if (_vf_atom) {
        if (_ilist==NULL) {
          for (int i=vstart, ii=0; i<iend; i++)
            vatom[ii++][j]+=engv[i];
@ -255,6 +259,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
          for (int i=vstart, ii=0; i<iend; i++)
            vatom[_ilist[ii++]][j]+=engv[i];
        }
+      }
      vstart+=_inum;
      iend+=_inum;
    }
--- a/lib/gpu/lal_answer.h
+++ b/lib/gpu/lal_answer.h
@ -27,6 +27,10 @@ using namespace ucl_opencl;
 #include "geryon/nvc_timer.h"
 #include "geryon/nvc_mat.h"
 using namespace ucl_cudart;
+#elif defined(USE_HIP)
+#include "geryon/hip_timer.h"
+#include "geryon/hip_mat.h"
+using namespace ucl_hip;
 #else
 #include "geryon/nvd_timer.h"
 #include "geryon/nvd_mat.h"
--- a/lib/gpu/lal_atom.cpp
+++ b/lib/gpu/lal_atom.cpp
@ -15,6 +15,11 @@

 #include "lal_atom.h"

+#ifdef USE_HIP_DEVICE_SORT
+#include <hip/hip_runtime.h>
+#include <hipcub/hipcub.hpp>
+#endif
+
 namespace LAMMPS_AL {
 #define AtomT Atom<numtyp,acctyp>

@ -70,6 +75,26 @@ bool AtomT::alloc(const int nall) {
  }
  #endif

+  #ifdef USE_HIP_DEVICE_SORT
+  if (_gpu_nbor==1) {
+    size_t   temp_storage_bytes = 0;
+    if(hipSuccess != hipcub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, sort_out_keys, sort_out_keys, sort_out_values, sort_out_values, _max_atoms))
+      return false;
+    if(sort_out_size < _max_atoms){
+      if (sort_out_keys  ) hipFree(sort_out_keys);
+      if (sort_out_values) hipFree(sort_out_values);
+      hipMalloc(&sort_out_keys  , _max_atoms * sizeof(unsigned));
+      hipMalloc(&sort_out_values, _max_atoms * sizeof(int     ));
+      sort_out_size = _max_atoms;
+    }
+    if(temp_storage_bytes > sort_temp_storage_size){
+      if(sort_temp_storage) hipFree(sort_temp_storage);
+      hipMalloc(&sort_temp_storage, temp_storage_bytes);
+      sort_temp_storage_size = temp_storage_bytes;
+    }
+  }
+  #endif
+
  // ---------------------------  Device allocations
  int gpu_bytes=0;
  success=success && (x.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
@ -184,6 +209,27 @@ bool AtomT::add_fields(const bool charge, const bool rot,
        return false;
    }
    #endif
+
+    #ifdef USE_HIP_DEVICE_SORT
+    if (_gpu_nbor==1) {
+      size_t   temp_storage_bytes = 0;
+      if(hipSuccess != hipcub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, sort_out_keys, sort_out_keys, sort_out_values, sort_out_values, _max_atoms))
+        return false;
+      if(sort_out_size < _max_atoms){
+        if (sort_out_keys  ) hipFree(sort_out_keys);
+        if (sort_out_values) hipFree(sort_out_values);
+        hipMalloc(&sort_out_keys  , _max_atoms * sizeof(unsigned));
+        hipMalloc(&sort_out_values, _max_atoms * sizeof(int     ));
+        sort_out_size = _max_atoms;
+      }
+      if(temp_storage_bytes > sort_temp_storage_size){
+        if(sort_temp_storage) hipFree(sort_temp_storage);
+        hipMalloc(&sort_temp_storage, temp_storage_bytes);
+        sort_temp_storage_size = temp_storage_bytes;
+      }
+    }
+    #endif
+
    success=success && (dev_particle_id.alloc(_max_atoms,*dev,
                                              UCL_READ_ONLY)==UCL_SUCCESS);
    gpu_bytes+=dev_particle_id.row_bytes();
@ -275,6 +321,19 @@ void AtomT::clear_resize() {
  if (_gpu_nbor==1) cudppDestroyPlan(sort_plan);
  #endif

+  #ifdef USE_HIP_DEVICE_SORT
+  if (_gpu_nbor==1) {
+    if(sort_out_keys)     hipFree(sort_out_keys);
+    if(sort_out_values)   hipFree(sort_out_values);
+    if(sort_temp_storage) hipFree(sort_temp_storage);
+    sort_out_keys = nullptr;
+    sort_out_values = nullptr;
+    sort_temp_storage = nullptr;
+    sort_temp_storage_size = 0;
+    sort_out_size = 0;
+  }
+  #endif
+
  if (_gpu_nbor==2) {
    host_particle_id.clear();
    host_cell_id.clear();
@ -326,6 +385,22 @@ void AtomT::sort_neighbor(const int num_atoms) {
    UCL_GERYON_EXIT;
  }
  #endif
+
+  #ifdef USE_HIP_DEVICE_SORT
+    if(sort_out_size < num_atoms){
+      printf("AtomT::sort_neighbor: invalid temp buffer size\n");
+      UCL_GERYON_EXIT;
+    }
+    if(hipSuccess != hipcub::DeviceRadixSort::SortPairs(sort_temp_storage, sort_temp_storage_size, (unsigned *)dev_cell_id.begin(), sort_out_keys, (int *)dev_particle_id.begin(), sort_out_values, num_atoms)){
+      printf("AtomT::sort_neighbor: DeviceRadixSort error\n");
+      UCL_GERYON_EXIT;
+    }
+    if(hipSuccess != hipMemcpy((unsigned *)dev_cell_id.begin(), sort_out_keys  , num_atoms*sizeof(unsigned), hipMemcpyDeviceToDevice) ||
+       hipSuccess != hipMemcpy((int *) dev_particle_id.begin(), sort_out_values, num_atoms*sizeof(int     ), hipMemcpyDeviceToDevice)){
+      printf("AtomT::sort_neighbor: copy output error\n");
+      UCL_GERYON_EXIT;
+    }
+  #endif
 }

 #ifdef GPU_CAST
--- a/lib/gpu/lal_atom.cu
+++ b/lib/gpu/lal_atom.cu
@ -13,7 +13,7 @@
 //    email                : brownw@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_preprocessor.h"
 #endif

--- a/lib/gpu/lal_atom.h
+++ b/lib/gpu/lal_atom.h
@ -29,6 +29,11 @@ using namespace ucl_opencl;
 #include "geryon/nvc_mat.h"
 #include "geryon/nvc_kernel.h"
 using namespace ucl_cudart;
+#elif defined(USE_HIP)
+#include "geryon/hip_timer.h"
+#include "geryon/hip_mat.h"
+#include "geryon/hip_kernel.h"
+using namespace ucl_hip;
 #else
 #include "geryon/nvd_timer.h"
 #include "geryon/nvd_mat.h"
@ -477,6 +482,14 @@ class Atom {
  CUDPPConfiguration sort_config;
  CUDPPHandle sort_plan;
  #endif
+
+  #ifdef USE_HIP_DEVICE_SORT
+  unsigned* sort_out_keys = nullptr;
+  int* sort_out_values = nullptr;
+  void* sort_temp_storage = nullptr;
+  size_t sort_temp_storage_size = 0;
+  size_t sort_out_size = 0;
+  #endif
 };

 }
--- a/lib/gpu/lal_aux_fun1.h
+++ b/lib/gpu/lal_aux_fun1.h
@ -13,7 +13,7 @@
 //    email                : brownw@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_preprocessor.h"
 #endif

--- a/lib/gpu/lal_base_atomic.h
+++ b/lib/gpu/lal_base_atomic.h
@ -24,6 +24,8 @@
 #include "geryon/ocl_texture.h"
 #elif defined(USE_CUDART)
 #include "geryon/nvc_texture.h"
+#elif defined(USE_HIP)
+#include "geryon/hip_texture.h"
 #else
 #include "geryon/nvd_texture.h"
 #endif
--- a/lib/gpu/lal_base_charge.h
+++ b/lib/gpu/lal_base_charge.h
@ -25,6 +25,8 @@
 #include "geryon/ocl_texture.h"
 #elif defined(USE_CUDART)
 #include "geryon/nvc_texture.h"
+#elif defined(USE_HIP)
+#include "geryon/hip_texture.h"
 #else
 #include "geryon/nvd_texture.h"
 #endif
--- a/lib/gpu/lal_base_dipole.h
+++ b/lib/gpu/lal_base_dipole.h
@ -23,6 +23,8 @@

 #ifdef USE_OPENCL
 #include "geryon/ocl_texture.h"
+#elif defined(USE_HIP)
+#include "geryon/hip_texture.h"
 #else
 #include "geryon/nvd_texture.h"
 #endif
--- a/lib/gpu/lal_base_dpd.h
+++ b/lib/gpu/lal_base_dpd.h
@ -23,6 +23,8 @@

 #ifdef USE_OPENCL
 #include "geryon/ocl_texture.h"
+#elif defined(USE_HIP)
+#include "geryon/hip_texture.h"
 #else
 #include "geryon/nvd_texture.h"
 #endif
--- a/lib/gpu/lal_base_ellipsoid.h
+++ b/lib/gpu/lal_base_ellipsoid.h
@ -24,6 +24,8 @@
 #include "geryon/ocl_texture.h"
 #elif defined(USE_CUDART)
 #include "geryon/nvc_texture.h"
+#elif defined(USE_HIP)
+#include "geryon/hip_texture.h"
 #else
 #include "geryon/nvd_texture.h"
 #endif
--- a/lib/gpu/lal_base_three.h
+++ b/lib/gpu/lal_base_three.h
@ -24,6 +24,8 @@
 #include "geryon/ocl_texture.h"
 #elif defined(USE_CUDART)
 #include "geryon/nvc_texture.h"
+#elif defined(USE_HIP)
+#include "geryon/hip_texture.h"
 #else
 #include "geryon/nvd_texture.h"
 #endif
--- a/lib/gpu/lal_beck.cu
+++ b/lib/gpu/lal_beck.cu
@ -13,12 +13,12 @@
 //    email                : nguyentd@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
+_texture( pos_tex,float4);
 #else
-texture<int4,1> pos_tex;
+_texture_2d( pos_tex,int4);
 #endif
 #else
 #define pos_tex x_
--- a/lib/gpu/lal_born.cu
+++ b/lib/gpu/lal_born.cu
@ -13,12 +13,12 @@
 //    email                : nguyentd@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
+_texture( pos_tex,float4);
 #else
-texture<int4,1> pos_tex;
+_texture_2d( pos_tex,int4);
 #endif
 #else
 #define pos_tex x_
--- a/lib/gpu/lal_born_coul_long.cu
+++ b/lib/gpu/lal_born_coul_long.cu
@ -13,15 +13,15 @@
 //    email                : nguyentd@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)

 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float> q_tex;
+_texture( pos_tex,float4);
+_texture( q_tex,float);
 #else
-texture<int4,1> pos_tex;
-texture<int2> q_tex;
+_texture_2d( pos_tex,int4);
+_texture( q_tex,int2);
 #endif

 #else
--- a/lib/gpu/lal_born_coul_long_cs.cu
+++ b/lib/gpu/lal_born_coul_long_cs.cu
@ -13,15 +13,16 @@
 //    email                : ndactrung@gmail.com
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)

 #include "lal_aux_fun1.h"
+
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float> q_tex;
+_texture( pos_tex,float4);
+_texture( q_tex,float);
 #else
-texture<int4,1> pos_tex;
-texture<int2> q_tex;
+_texture_2d( pos_tex,int4);
+_texture( q_tex,int2);
 #endif

 #else
--- a/lib/gpu/lal_born_coul_wolf.cu
+++ b/lib/gpu/lal_born_coul_wolf.cu
@ -13,15 +13,15 @@
 //    email                : nguyentd@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)

 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float> q_tex;
+_texture( pos_tex,float4);
+_texture( q_tex,float);
 #else
-texture<int4,1> pos_tex;
-texture<int2> q_tex;
+_texture_2d( pos_tex,int4);
+_texture( q_tex,int2);
 #endif

 #else
--- a/lib/gpu/lal_born_coul_wolf_cs.cu
+++ b/lib/gpu/lal_born_coul_wolf_cs.cu
@ -13,15 +13,15 @@
 //    email                : ndactrung@gmail.com
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)

 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float> q_tex;
+_texture( pos_tex,float4);
+_texture( q_tex,float);
 #else
-texture<int4,1> pos_tex;
-texture<int2> q_tex;
+_texture_2d( pos_tex,int4);
+_texture( q_tex,int2);
 #endif

 #else
--- a/lib/gpu/lal_buck.cu
+++ b/lib/gpu/lal_buck.cu
@ -13,12 +13,12 @@
 //    email                : nguyentd@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
+_texture( pos_tex,float4);
 #else
-texture<int4,1> pos_tex;
+_texture_2d( pos_tex,int4);
 #endif
 #else
 #define pos_tex x_
--- a/lib/gpu/lal_buck_coul.cu
+++ b/lib/gpu/lal_buck_coul.cu
@ -13,15 +13,15 @@
 //    email                : nguyentd@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)

 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float> q_tex;
+_texture( pos_tex,float4);
+_texture( q_tex,float);
 #else
-texture<int4,1> pos_tex;
-texture<int2> q_tex;
+_texture_2d( pos_tex,int4);
+_texture( q_tex,int2);
 #endif

 #else
--- a/lib/gpu/lal_buck_coul_long.cu
+++ b/lib/gpu/lal_buck_coul_long.cu
@ -13,15 +13,15 @@
 //    email                : nguyentd@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)

 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float> q_tex;
+_texture( pos_tex,float4);
+_texture( q_tex,float);
 #else
-texture<int4,1> pos_tex;
-texture<int2> q_tex;
+_texture_2d( pos_tex,int4);
+_texture( q_tex,int2);
 #endif

 #else
--- a/lib/gpu/lal_charmm_long.cu
+++ b/lib/gpu/lal_charmm_long.cu
@ -13,15 +13,15 @@
 //    email                : brownw@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)

 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float> q_tex;
+_texture( pos_tex,float4);
+_texture( q_tex,float);
 #else
-texture<int4,1> pos_tex;
-texture<int2> q_tex;
+_texture_2d( pos_tex,int4);
+_texture( q_tex,int2);
 #endif

 #else
--- a/lib/gpu/lal_colloid.cu
+++ b/lib/gpu/lal_colloid.cu
@ -13,12 +13,12 @@
 //    email                : nguyentd@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
+_texture( pos_tex,float4);
 #else
-texture<int4,1> pos_tex;
+_texture_2d( pos_tex,int4);
 #endif
 #else
 #define pos_tex x_
--- a/lib/gpu/lal_coul.cu
+++ b/lib/gpu/lal_coul.cu
@ -13,15 +13,15 @@
 //    email                : ndtrung@umich.edu
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)

 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float> q_tex;
+_texture( pos_tex,float4);
+_texture( q_tex,float);
 #else
-texture<int4,1> pos_tex;
-texture<int2> q_tex;
+_texture_2d( pos_tex,int4);
+_texture( q_tex,int2);
 #endif

 #else
--- a/lib/gpu/lal_coul_debye.cu
+++ b/lib/gpu/lal_coul_debye.cu
@ -13,15 +13,15 @@
 //    email                : ndtrung@umich.edu
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)

 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float> q_tex;
+_texture( pos_tex,float4);
+_texture( q_tex,float);
 #else
-texture<int4,1> pos_tex;
-texture<int2> q_tex;
+_texture_2d( pos_tex,int4);
+_texture( q_tex,int2);
 #endif

 #else
--- a/lib/gpu/lal_coul_dsf.cu
+++ b/lib/gpu/lal_coul_dsf.cu
@ -13,15 +13,15 @@
 //    email                : nguyentd@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)

 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float> q_tex;
+_texture( pos_tex,float4);
+_texture( q_tex,float);
 #else
-texture<int4,1> pos_tex;
-texture<int2> q_tex;
+_texture_2d( pos_tex,int4);
+_texture( q_tex,int2);
 #endif

 #else
--- a/lib/gpu/lal_coul_long.cu
+++ b/lib/gpu/lal_coul_long.cu
@ -13,15 +13,15 @@
 //    email                : a.kohlmeyer@temple.edu
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)

 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float> q_tex;
+_texture( pos_tex,float4);
+_texture( q_tex,float);
 #else
-texture<int4,1> pos_tex;
-texture<int2> q_tex;
+_texture_2d( pos_tex,int4);
+_texture( q_tex,int2);
 #endif

 #else
--- a/lib/gpu/lal_coul_long_cs.cu
+++ b/lib/gpu/lal_coul_long_cs.cu
@ -13,15 +13,15 @@
 //    email                : ndactrung@gmail.com
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)

 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float> q_tex;
+_texture( pos_tex,float4);
+_texture( q_tex,float);
 #else
-texture<int4,1> pos_tex;
-texture<int2> q_tex;
+_texture_2d( pos_tex,int4);
+_texture( q_tex,int2);
 #endif

 #else
--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@ -268,7 +268,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
    gpu_nbor=1;
  else if (_gpu_mode==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
    gpu_nbor=2;
-  #ifndef USE_CUDPP
+  #if !defined(USE_CUDPP) && !defined(USE_HIP_DEVICE_SORT)
  if (gpu_nbor==1)
    gpu_nbor=2;
  #endif
@ -341,7 +341,7 @@ int DeviceT::init_nbor(Neighbor *nbor, const int nlocal,
    gpu_nbor=1;
  else if (_gpu_mode==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
    gpu_nbor=2;
-  #ifndef USE_CUDPP
+  #if !defined(USE_CUDPP) && !defined(USE_HIP_DEVICE_SORT)
  if (gpu_nbor==1)
    gpu_nbor=2;
  #endif
@ -712,7 +712,7 @@ int DeviceT::compile_kernels() {
  gpu_lib_data.update_host(false);

  _ptx_arch=static_cast<double>(gpu_lib_data[0])/100.0;
-  #ifndef USE_OPENCL
+  #if !(defined(USE_OPENCL) || defined(USE_HIP))
  if (_ptx_arch>gpu->arch() || floor(_ptx_arch)<floor(gpu->arch()))
    return -4;
  #endif
--- a/lib/gpu/lal_device.cu
+++ b/lib/gpu/lal_device.cu
@ -13,7 +13,7 @@
 //    email                : brownw@ornl.gov
 // ***************************************************************************

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_preprocessor.h"
 #endif

--- a/lib/gpu/lal_dipole_lj.cu
+++ b/lib/gpu/lal_dipole_lj.cu
@ -13,16 +13,16 @@
 //    email                : nguyentd@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float> q_tex;
-texture<float4> mu_tex;
+_texture( pos_tex,float4);
+_texture( q_tex,float);
+_texture( mu_tex,float4);
 #else
-texture<int4,1> pos_tex;
-texture<int2> q_tex;
-texture<int4,1> mu_tex;
+_texture_2d( pos_tex,int4);
+_texture( q_tex,int2);
+_texture_2d( mu_tex,int4);
 #endif

 #else
--- a/lib/gpu/lal_dipole_lj_sf.cu
+++ b/lib/gpu/lal_dipole_lj_sf.cu
@ -13,17 +13,17 @@
 //    email                : nguyentd@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"

 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float> q_tex;
-texture<float4> mu_tex;
+_texture( pos_tex,float4);
+_texture( q_tex,float);
+_texture( mu_tex,float4);
 #else
-texture<int4,1> pos_tex;
-texture<int2> q_tex;
-texture<int4,1> mu_tex;
+_texture_2d( pos_tex,int4);
+_texture( q_tex,int2);
+_texture_2d( mu_tex,int4);
 #endif

 #else
--- a/lib/gpu/lal_dipole_long_lj.cu
+++ b/lib/gpu/lal_dipole_long_lj.cu
@ -13,16 +13,16 @@
 //    email                : nguyentd@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float> q_tex;
-texture<float4> mu_tex;
+_texture( pos_tex,float4);
+_texture( q_tex,float);
+_texture( mu_tex,float4);
 #else
-texture<int4,1> pos_tex;
-texture<int2> q_tex;
-texture<int4,1> mu_tex;
+_texture_2d( pos_tex,int4);
+_texture( q_tex,int2);
+_texture_2d( mu_tex,int4);
 #endif

 #else
--- a/lib/gpu/lal_dpd.cu
+++ b/lib/gpu/lal_dpd.cu
@ -13,14 +13,14 @@
 //    email                : nguyentd@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float4> vel_tex;
+_texture( pos_tex,float4);
+_texture( vel_tex,float4);
 #else
-texture<int4,1> pos_tex;
-texture<int4,1> vel_tex;
+_texture_2d( pos_tex,int4);
+_texture_2d( vel_tex,int4);
 #endif
 #else
 #define pos_tex x_
--- a/lib/gpu/lal_eam.cu
+++ b/lib/gpu/lal_eam.cu
@ -13,27 +13,27 @@
 //    email                : brownw@ornl.gov nguyentd@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"

 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float> fp_tex;
-texture<float4> rhor_sp1_tex;
-texture<float4> rhor_sp2_tex;
-texture<float4> frho_sp1_tex;
-texture<float4> frho_sp2_tex;
-texture<float4> z2r_sp1_tex;
-texture<float4> z2r_sp2_tex;
+_texture( pos_tex,float4);
+_texture( fp_tex,float);
+_texture( rhor_sp1_tex,float4);
+_texture( rhor_sp2_tex,float4);
+_texture( frho_sp1_tex,float4);
+_texture( frho_sp2_tex,float4);
+_texture( z2r_sp1_tex,float4);
+_texture( z2r_sp2_tex,float4);
 #else
-texture<int4> pos_tex;
-texture<int2> fp_tex;
-texture<int4> rhor_sp1_tex;
-texture<int4> rhor_sp2_tex;
-texture<int4> frho_sp1_tex;
-texture<int4> frho_sp2_tex;
-texture<int4> z2r_sp1_tex;
-texture<int4> z2r_sp2_tex;
+_texture( pos_tex,int4);
+_texture( fp_tex,int2);
+_texture( rhor_sp1_tex,int4);
+_texture( rhor_sp2_tex,int4);
+_texture( frho_sp1_tex,int4);
+_texture( frho_sp2_tex,int4);
+_texture( z2r_sp1_tex,int4);
+_texture( z2r_sp2_tex,int4);
 #endif

 #else
--- a/lib/gpu/lal_ellipsoid_extra.h
+++ b/lib/gpu/lal_ellipsoid_extra.h
@ -18,12 +18,14 @@

 enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex, quat_tex;
+_texture( pos_tex, float4);
+_texture( quat_tex,float4);
 #else
-texture<int4,1> pos_tex, quat_tex;
+_texture_2d( pos_tex,int4);
+_texture_2d( quat_tex,int4);
 #endif
 #else
 #define pos_tex x_
--- a/lib/gpu/lal_ellipsoid_nbor.cu
+++ b/lib/gpu/lal_ellipsoid_nbor.cu
@ -13,12 +13,12 @@
 //    email                : brownw@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_preprocessor.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
+_texture( pos_tex,float4);
 #else
-texture<int4,1> pos_tex;
+_texture_2d( pos_tex,int4);
 #endif
 #else
 #define pos_tex x_
--- a/lib/gpu/lal_gauss.cu
+++ b/lib/gpu/lal_gauss.cu
@ -13,12 +13,12 @@
 //    email                : nguyentd@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
+_texture( pos_tex,float4);
 #else
-texture<int4,1> pos_tex;
+_texture_2d( pos_tex,int4);
 #endif
 #else
 #define pos_tex x_
--- a/lib/gpu/lal_gayberne.cu
+++ b/lib/gpu/lal_gayberne.cu
@ -13,7 +13,7 @@
 //    email                : brownw@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_ellipsoid_extra.h"
 #endif

--- a/lib/gpu/lal_gayberne_lj.cu
+++ b/lib/gpu/lal_gayberne_lj.cu
@ -13,7 +13,7 @@
 //    email                : brownw@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_ellipsoid_extra.h"
 #endif

--- a/lib/gpu/lal_lj.cu
+++ b/lib/gpu/lal_lj.cu
@ -13,12 +13,12 @@
 //    email                : brownw@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
+_texture( pos_tex,float4);
 #else
-texture<int4,1> pos_tex;
+_texture_2d( pos_tex,int4);
 #endif
 #else
 #define pos_tex x_
--- a/lib/gpu/lal_lj96.cu
+++ b/lib/gpu/lal_lj96.cu
@ -13,12 +13,12 @@
 //    email                : brownw@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
+_texture( pos_tex,float4);
 #else
-texture<int4,1> pos_tex;
+_texture_2d( pos_tex,int4);
 #endif
 #else
 #define pos_tex x_
--- a/lib/gpu/lal_lj_class2_long.cu
+++ b/lib/gpu/lal_lj_class2_long.cu
@ -13,15 +13,15 @@
 //    email                : brownw@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)

 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float> q_tex;
+_texture( pos_tex,float4);
+_texture( q_tex,float);
 #else
-texture<int4,1> pos_tex;
-texture<int2> q_tex;
+_texture_2d( pos_tex,int4);
+_texture( q_tex,int2);
 #endif

 #else
--- a/lib/gpu/lal_lj_coul.cu
+++ b/lib/gpu/lal_lj_coul.cu
@ -13,15 +13,15 @@
 //    email                : brownw@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)

 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float> q_tex;
+_texture( pos_tex,float4);
+_texture( q_tex,float);
 #else
-texture<int4,1> pos_tex;
-texture<int2> q_tex;
+_texture_2d( pos_tex,int4);
+_texture( q_tex,int2);
 #endif

 #else
--- a/lib/gpu/lal_lj_coul_debye.cu
+++ b/lib/gpu/lal_lj_coul_debye.cu
@ -13,15 +13,15 @@
 //    email                : nguyentd@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)

 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float> q_tex;
+_texture( pos_tex,float4);
+_texture( q_tex,float);
 #else
-texture<int4,1> pos_tex;
-texture<int2> q_tex;
+_texture_2d( pos_tex,int4);
+_texture( q_tex,int2);
 #endif

 #else
--- a/lib/gpu/lal_lj_coul_long.cu
+++ b/lib/gpu/lal_lj_coul_long.cu
@ -13,15 +13,15 @@
 //    email                : brownw@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)

 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float> q_tex;
+_texture( pos_tex,float4);
+_texture( q_tex,float);
 #else
-texture<int4,1> pos_tex;
-texture<int2> q_tex;
+_texture_2d( pos_tex,int4);
+_texture( q_tex,int2);
 #endif

 #else
--- a/lib/gpu/lal_lj_coul_msm.cu
+++ b/lib/gpu/lal_lj_coul_msm.cu
@ -13,19 +13,19 @@
 //    email                : nguyentd@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)

 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float> q_tex;
-texture<float> gcons_tex;
-texture<float> dgcons_tex;
+_texture( pos_tex,float4);
+_texture( q_tex,float);
+_texture( gcons_tex,float);
+_texture( dgcons_tex,float);
 #else
-texture<int4,1> pos_tex;
-texture<int2> q_tex;
-texture<int2> gcons_tex;
-texture<int2> dgcons_tex;
+_texture_2d( pos_tex,int4);
+_texture( q_tex,int2);
+_texture( gcons_tex,int2);
+_texture( dgcons_tex,int2);
 #endif

 #else
--- a/lib/gpu/lal_lj_cubic.cu
+++ b/lib/gpu/lal_lj_cubic.cu
@ -13,12 +13,12 @@
 //    email                : ndactrung@gmail.com
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
+_texture( pos_tex,float4);
 #else
-texture<int4,1> pos_tex;
+_texture_2d( pos_tex,int4);
 #endif
 #else
 #define pos_tex x_
--- a/lib/gpu/lal_lj_dsf.cu
+++ b/lib/gpu/lal_lj_dsf.cu
@ -13,15 +13,15 @@
 //    email                : brownw@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)

 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float> q_tex;
+_texture( pos_tex,float4);
+_texture( q_tex,float);
 #else
-texture<int4,1> pos_tex;
-texture<int2> q_tex;
+_texture_2d( pos_tex,int4);
+_texture( q_tex,int2);
 #endif

 #else
--- a/lib/gpu/lal_lj_expand.cu
+++ b/lib/gpu/lal_lj_expand.cu
@ -13,13 +13,13 @@
 //    email                : ibains@nvidia.com
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)

 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
+_texture( pos_tex,float4);
 #else
-texture<int4,1> pos_tex;
+_texture_2d( pos_tex,int4);
 #endif

 #else
--- a/lib/gpu/lal_lj_expand_coul_long.cu
+++ b/lib/gpu/lal_lj_expand_coul_long.cu
@ -13,15 +13,15 @@
 //    email                : ndactrung@gmail.com
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)

 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float> q_tex;
+_texture( pos_tex,float4);
+_texture( q_tex,float);
 #else
-texture<int4,1> pos_tex;
-texture<int2> q_tex;
+_texture_2d( pos_tex,int4);
+_texture( q_tex,int2);
 #endif

 #else
--- a/lib/gpu/lal_lj_gromacs.cu
+++ b/lib/gpu/lal_lj_gromacs.cu
@ -13,13 +13,13 @@
 //    email                : nguyentd@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)

 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
+_texture( pos_tex,float4);
 #else
-texture<int4,1> pos_tex;
+_texture_2d( pos_tex,int4);
 #endif

 #else
--- a/lib/gpu/lal_lj_sdk.cu
+++ b/lib/gpu/lal_lj_sdk.cu
@ -13,12 +13,12 @@
 //    email                : brownw@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
+_texture( pos_tex,float4);
 #else
-texture<int4,1> pos_tex;
+_texture_2d( pos_tex,int4);
 #endif
 #else
 #define pos_tex x_
--- a/lib/gpu/lal_lj_sdk_long.cu
+++ b/lib/gpu/lal_lj_sdk_long.cu
@ -13,15 +13,15 @@
 //    email                : brownw@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)

 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float> q_tex;
+_texture( pos_tex,float4);
+_texture( q_tex,float);
 #else
-texture<int4,1> pos_tex;
-texture<int2> q_tex;
+_texture_2d( pos_tex,int4);
+_texture( q_tex,int2);
 #endif

 #else
--- a/lib/gpu/lal_lj_tip4p_long.cpp
+++ b/lib/gpu/lal_lj_tip4p_long.cpp
@ -23,7 +23,7 @@ const char *lj_tip4p=0;

 #include "lal_lj_tip4p_long.h"
 #include <cassert>
-using namespace LAMMPS_AL;
+namespace LAMMPS_AL {
 #define LJTIP4PLongT LJ_TIP4PLong<numtyp, acctyp>

 extern Device<PRECISION,ACC_PRECISION> device;
@ -370,6 +370,5 @@ int** LJTIP4PLongT::compute(const int ago, const int inum_full,
 }


-
-
 template class LJ_TIP4PLong<PRECISION,ACC_PRECISION>;
+}
--- a/lib/gpu/lal_lj_tip4p_long.cu
+++ b/lib/gpu/lal_lj_tip4p_long.cu
@ -13,7 +13,7 @@
 //    email                : thevsevak@gmail.com
 // ***************************************************************************

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)

 #include "lal_aux_fun1.h"
 #ifdef LAMMPS_SMALLBIG
@ -27,11 +27,11 @@
 #define tagint int
 #endif
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float> q_tex;
+_texture( pos_tex,float4);
+_texture( q_tex,float);
 #else
-texture<int4,1> pos_tex;
-texture<int2> q_tex;
+_texture_2d( pos_tex,int4);
+_texture( q_tex,int2);
 #endif

 #else
--- a/lib/gpu/lal_mie.cu
+++ b/lib/gpu/lal_mie.cu
@ -13,12 +13,12 @@
 //    email                : nguyentd@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
+_texture( pos_tex,float4);
 #else
-texture<int4,1> pos_tex;
+_texture_2d( pos_tex,int4);
 #endif
 #else
 #define pos_tex x_
--- a/lib/gpu/lal_morse.cu
+++ b/lib/gpu/lal_morse.cu
@ -13,13 +13,13 @@
 //    email                : brownw@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)

 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
+_texture( pos_tex,float4);
 #else
-texture<int4,1> pos_tex;
+_texture_2d( pos_tex,int4);
 #endif

 #else
--- a/lib/gpu/lal_neighbor_cpu.cu
+++ b/lib/gpu/lal_neighbor_cpu.cu
@ -13,7 +13,7 @@
 //    email                : brownw@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_preprocessor.h"
 #endif

--- a/lib/gpu/lal_neighbor_gpu.cu
+++ b/lib/gpu/lal_neighbor_gpu.cu
@ -14,7 +14,7 @@
 //    email                : penwang@nvidia.com, brownw@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_preprocessor.h"
 #ifdef LAMMPS_SMALLBIG
 #define tagint int
@ -27,9 +27,9 @@
 #define tagint int
 #endif
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
+_texture( pos_tex,float4);
 #else
-texture<int4,1> pos_tex;
+_texture_2d( pos_tex,int4);
 #endif

 __kernel void calc_cell_id(const numtyp4 *restrict pos,
--- a/lib/gpu/lal_neighbor_shared.h
+++ b/lib/gpu/lal_neighbor_shared.h
@ -24,6 +24,10 @@ using namespace ucl_opencl;
 #include "geryon/nvc_kernel.h"
 #include "geryon/nvc_texture.h"
 using namespace ucl_cudart;
+#elif defined(USE_HIP)
+#include "geryon/hip_kernel.h"
+#include "geryon/hip_texture.h"
+using namespace ucl_hip;
 #else
 #include "geryon/nvd_kernel.h"
 #include "geryon/nvd_texture.h"
--- a/lib/gpu/lal_pppm.cu
+++ b/lib/gpu/lal_pppm.cu
@ -13,15 +13,15 @@
 //    email                : brownw@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)

 #include "lal_preprocessor.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float> q_tex;
+_texture( pos_tex,float4);
+_texture( q_tex,float);
 #else
-texture<int4,1> pos_tex;
-texture<int2> q_tex;
+_texture_2d( pos_tex,int4);
+_texture( q_tex,int2);
 #endif

 // Allow PPPM to compile without atomics for NVIDIA 1.0 cards, error
--- a/lib/gpu/lal_pppm.h
+++ b/lib/gpu/lal_pppm.h
@ -23,6 +23,8 @@
 #include "geryon/ocl_texture.h"
 #elif defined(USE_CUDART)
 #include "geryon/nvc_texture.h"
+#elif defined(USE_HIP)
+#include "geryon/hip_texture.h"
 #else
 #include "geryon/nvd_texture.h"
 #endif
--- a/lib/gpu/lal_precision.h
+++ b/lib/gpu/lal_precision.h
@ -24,9 +24,11 @@ struct _lgpu_int2 {
  int x; int y;
 };

+#ifndef USE_HIP
 #ifndef int2
 #define int2 _lgpu_int2
 #endif
+#endif

 struct _lgpu_float2 {
  float x; float y;
--- a/lib/gpu/lal_preprocessor.h
+++ b/lib/gpu/lal_preprocessor.h
@ -1,4 +1,4 @@
-// **************************************************************************
+// **************************************************************************
 //                              preprocessor.cu
 //                             -------------------
 //                           W. Michael Brown (ORNL)
@ -60,6 +60,150 @@
 //
 //*************************************************************************/

+#define _texture(name, type)  texture<type> name
+#define _texture_2d(name, type) texture<type,1> name
+
+// -------------------------------------------------------------------------
+//                            HIP DEFINITIONS
+// -------------------------------------------------------------------------
+
+#ifdef USE_HIP
+  #include <hip/hip_runtime.h>
+  #ifdef __HIP_PLATFORM_HCC__
+    #define mul24(x, y) __mul24(x, y)
+    #undef _texture
+    #undef _texture_2d
+    #define _texture(name, type)  __device__ type* name
+    #define _texture_2d(name, type)  __device__ type* name
+  #endif
+  #define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x)
+  #define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y)
+  #define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x);
+  #define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y);
+  #define THREAD_ID_X threadIdx.x
+  #define THREAD_ID_Y threadIdx.y
+  #define BLOCK_ID_X blockIdx.x
+  #define BLOCK_ID_Y blockIdx.y
+  #define BLOCK_SIZE_X blockDim.x
+  #define BLOCK_SIZE_Y blockDim.y
+  #define __kernel extern "C" __global__
+  #ifdef __local
+    #undef __local
+  #endif
+  #define __local __shared__
+  #define __global
+  #define restrict __restrict__
+  #define atom_add atomicAdd
+  #define ucl_inline static __inline__ __device__
+
+  #define THREADS_PER_ATOM 4
+  #define THREADS_PER_CHARGE 8
+  #define BLOCK_NBOR_BUILD 128
+  #define BLOCK_PAIR 256
+  #define BLOCK_BIO_PAIR 256
+  #define BLOCK_ELLIPSE 128
+  #define MAX_SHARED_TYPES 11
+
+  #ifdef _SINGLE_SINGLE
+    ucl_inline double shfl_xor(double var, int laneMask, int width) {
+  #ifdef __HIP_PLATFORM_HCC__
+      return __shfl_xor(var, laneMask, width);
+  #else
+      return __shfl_xor_sync(0xffffffff, var, laneMask, width);
+  #endif
+    }
+  #else
+    ucl_inline double shfl_xor(double var, int laneMask, int width) {
+      int2 tmp;
+      tmp.x = __double2hiint(var);
+      tmp.y = __double2loint(var);
+  #ifdef __HIP_PLATFORM_HCC__
+      tmp.x = __shfl_xor(tmp.x,laneMask,width);
+      tmp.y = __shfl_xor(tmp.y,laneMask,width);
+  #else
+      tmp.x = __shfl_xor_sync(0xffffffff, tmp.x,laneMask,width);
+      tmp.y = __shfl_xor_sync(0xffffffff, tmp.y,laneMask,width);
+  #endif
+      return __hiloint2double(tmp.x,tmp.y);
+    }
+  #endif
+
+  #ifdef __HIP_PLATFORM_HCC__
+    #define ARCH 600
+    #define WARP_SIZE 64
+  #endif
+
+  #ifdef __HIP_PLATFORM_NVCC__
+    #define ARCH __CUDA_ARCH__
+    #define WARP_SIZE 32
+  #endif
+
+  #define fast_mul(X,Y) (X)*(Y)
+
+  #define MEM_THREADS WARP_SIZE
+  #define PPPM_BLOCK_1D 64
+  #define BLOCK_CELL_2D 8
+  #define BLOCK_CELL_ID 128
+  #define MAX_BIO_SHARED_TYPES 128
+
+  #ifdef __HIP_PLATFORM_NVCC__
+    #ifdef _DOUBLE_DOUBLE
+      #define fetch4(ans,i,pos_tex) {                        \
+        int4 xy = tex1Dfetch(pos_tex,i*2);                   \
+        int4 zt = tex1Dfetch(pos_tex,i*2+1);                 \
+        ans.x=__hiloint2double(xy.y, xy.x);                  \
+        ans.y=__hiloint2double(xy.w, xy.z);                  \
+        ans.z=__hiloint2double(zt.y, zt.x);                  \
+        ans.w=__hiloint2double(zt.w, zt.z);                  \
+      }
+      #define fetch(ans,i,q_tex) {                           \
+        int2 qt = tex1Dfetch(q_tex,i);                       \
+        ans=__hiloint2double(qt.y, qt.x);                    \
+      }
+    #else
+      #define fetch4(ans,i,pos_tex) ans=tex1Dfetch(pos_tex, i);
+      #define fetch(ans,i,q_tex) ans=tex1Dfetch(q_tex,i);
+    #endif
+  #else
+    #ifdef _DOUBLE_DOUBLE
+      #define fetch4(ans,i,pos_tex) (ans=*(((double4*)pos_tex) + i))
+      #define fetch(ans,i,q_tex)    (ans=*(((double *)  q_tex) + i))
+    #else
+      #define fetch4(ans,i,pos_tex) (ans=*(((float4*)pos_tex) + i))
+      #define fetch(ans,i,q_tex)    (ans=*(((float *)  q_tex) + i))
+    #endif
+  #endif
+
+  #ifdef _DOUBLE_DOUBLE
+    #define ucl_exp exp
+    #define ucl_powr pow
+    #define ucl_atan atan
+    #define ucl_cbrt cbrt
+    #define ucl_ceil ceil
+    #define ucl_abs fabs
+    #define ucl_rsqrt rsqrt
+    #define ucl_sqrt sqrt
+    #define ucl_recip(x) ((numtyp)1.0/(x))
+
+  #else
+    #define ucl_atan atanf
+    #define ucl_cbrt cbrtf
+    #define ucl_ceil ceilf
+    #define ucl_abs fabsf
+    #define ucl_recip(x) ((numtyp)1.0/(x))
+    #define ucl_rsqrt rsqrtf
+    #define ucl_sqrt sqrtf
+
+    #ifdef NO_HARDWARE_TRANSCENDENTALS
+      #define ucl_exp expf
+      #define ucl_powr powf
+    #else
+      #define ucl_exp __expf
+      #define ucl_powr __powf
+    #endif
+  #endif
+#endif
+    
 // -------------------------------------------------------------------------
 //                            CUDA DEFINITIONS
 // -------------------------------------------------------------------------
--- a/lib/gpu/lal_re_squared.cu
+++ b/lib/gpu/lal_re_squared.cu
@ -13,7 +13,7 @@
 //    email                : brownw@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_ellipsoid_extra.h"
 #endif

--- a/lib/gpu/lal_re_squared_lj.cu
+++ b/lib/gpu/lal_re_squared_lj.cu
@ -13,7 +13,7 @@
 //    email                : brownw@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_ellipsoid_extra.h"
 #endif

--- a/lib/gpu/lal_soft.cu
+++ b/lib/gpu/lal_soft.cu
@ -13,12 +13,12 @@
 //    email                : nguyentd@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
+_texture( pos_tex,float4);
 #else
-texture<int4,1> pos_tex;
+_texture_2d( pos_tex,int4);
 #endif
 #else
 #define pos_tex x_
--- a/lib/gpu/lal_sw.cu
+++ b/lib/gpu/lal_sw.cu
@ -13,19 +13,19 @@
 //    email                : brownw@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"

 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float4> sw1_tex;
-texture<float4> sw2_tex;
-texture<float4> sw3_tex;
+_texture( pos_tex,float4);
+_texture( sw1_tex,float4);
+_texture( sw2_tex,float4);
+_texture( sw3_tex,float4);
 #else
-texture<int4,1> pos_tex;
-texture<int4> sw1_tex;
-texture<int4> sw2_tex;
-texture<int4> sw3_tex;
+_texture_2d( pos_tex,int4);
+_texture( sw1_tex,int4);
+_texture( sw2_tex,int4);
+_texture( sw3_tex,int4);
 #endif

 #else
--- a/lib/gpu/lal_table.cu
+++ b/lib/gpu/lal_table.cu
@ -13,12 +13,12 @@
 //    email                : nguyentd@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
+_texture( pos_tex,float4);
 #else
-texture<int4,1> pos_tex;
+_texture_2d( pos_tex,int4);
 #endif
 #else
 #define pos_tex x_
--- a/lib/gpu/lal_tersoff.cu
+++ b/lib/gpu/lal_tersoff.cu
@ -13,23 +13,23 @@
 //       email                : ndactrung@gmail.com
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_tersoff_extra.h"

 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float4> ts1_tex;
-texture<float4> ts2_tex;
-texture<float4> ts3_tex;
-texture<float4> ts4_tex;
-texture<float4> ts5_tex;
+_texture( pos_tex,float4);
+_texture( ts1_tex,float4);
+_texture( ts2_tex,float4);
+_texture( ts3_tex,float4);
+_texture( ts4_tex,float4);
+_texture( ts5_tex,float4);
 #else
-texture<int4,1> pos_tex;
-texture<int4> ts1_tex;
-texture<int4> ts2_tex;
-texture<int4> ts3_tex;
-texture<int4> ts4_tex;
-texture<int4> ts5_tex;
+_texture_2d( pos_tex,int4);
+_texture( ts1_tex,int4);
+_texture( ts2_tex,int4);
+_texture( ts3_tex,int4);
+_texture( ts4_tex,int4);
+_texture( ts5_tex,int4);
 #endif

 #else
--- a/lib/gpu/lal_tersoff_extra.h
+++ b/lib/gpu/lal_tersoff_extra.h
@ -16,7 +16,7 @@
 #ifndef LAL_TERSOFF_EXTRA_H
 #define LAL_TERSOFF_EXTRA_H

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
 #else
 #endif
--- a/lib/gpu/lal_tersoff_mod.cu
+++ b/lib/gpu/lal_tersoff_mod.cu
@ -13,23 +13,23 @@
 //       email                : ndactrung@gmail.com
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_tersoff_mod_extra.h"

 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float4> ts1_tex;
-texture<float4> ts2_tex;
-texture<float4> ts3_tex;
-texture<float4> ts4_tex;
-texture<float4> ts5_tex;
+_texture( pos_tex,float4);
+_texture( ts1_tex,float4);
+_texture( ts2_tex,float4);
+_texture( ts3_tex,float4);
+_texture( ts4_tex,float4);
+_texture( ts5_tex,float4);
 #else
-texture<int4,1> pos_tex;
-texture<int4> ts1_tex;
-texture<int4> ts2_tex;
-texture<int4> ts3_tex;
-texture<int4> ts4_tex;
-texture<int4> ts5_tex;
+_texture_2d( pos_tex,int4);
+_texture( ts1_tex,int4);
+_texture( ts2_tex,int4);
+_texture( ts3_tex,int4);
+_texture( ts4_tex,int4);
+_texture( ts5_tex,int4);
 #endif

 #else
--- a/lib/gpu/lal_tersoff_mod_extra.h
+++ b/lib/gpu/lal_tersoff_mod_extra.h
@ -16,7 +16,7 @@
 #ifndef LAL_TERSOFF_MOD_EXTRA_H
 #define LAL_TERSOFF_MOD_EXTRA_H

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
 #else
 #endif
--- a/lib/gpu/lal_tersoff_zbl.cu
+++ b/lib/gpu/lal_tersoff_zbl.cu
@ -13,25 +13,25 @@
 //       email                : ndactrung@gmail.com
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_tersoff_zbl_extra.h"

 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float4> ts1_tex;
-texture<float4> ts2_tex;
-texture<float4> ts3_tex;
-texture<float4> ts4_tex;
-texture<float4> ts5_tex;
-texture<float4> ts6_tex;
+_texture( pos_tex,float4);
+_texture( ts1_tex,float4);
+_texture( ts2_tex,float4);
+_texture( ts3_tex,float4);
+_texture( ts4_tex,float4);
+_texture( ts5_tex,float4);
+_texture( ts6_tex,float4);
 #else
-texture<int4,1> pos_tex;
-texture<int4> ts1_tex;
-texture<int4> ts2_tex;
-texture<int4> ts3_tex;
-texture<int4> ts4_tex;
-texture<int4> ts5_tex;
-texture<int4> ts6_tex;
+_texture_2d( pos_tex,int4);
+_texture( ts1_tex,int4);
+_texture( ts2_tex,int4);
+_texture( ts3_tex,int4);
+_texture( ts4_tex,int4);
+_texture( ts5_tex,int4);
+_texture( ts6_tex,int4);
 #endif

 #else
--- a/lib/gpu/lal_tersoff_zbl_extra.h
+++ b/lib/gpu/lal_tersoff_zbl_extra.h
@ -16,7 +16,7 @@
 #ifndef LAL_TERSOFF_ZBL_EXTRA_H
 #define LAL_TERSOFF_ZBL_EXTRA_H

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
 #else
 #endif
--- a/lib/gpu/lal_ufm.cu
+++ b/lib/gpu/lal_ufm.cu
@ -15,12 +15,12 @@
                           dekoning@ifi.unicamp.br
 ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
+_texture( pos_tex,float4);
 #else
-texture<int4,1> pos_tex;
+_texture_2d( pos_tex,int4);
 #endif
 #else
 #define pos_tex x_
--- a/lib/gpu/lal_vashishta.cu
+++ b/lib/gpu/lal_vashishta.cu
@ -13,23 +13,23 @@
 //    email                : andershaf@gmail.com
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"

 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float4> param1_tex;
-texture<float4> param2_tex;
-texture<float4> param3_tex;
-texture<float4> param4_tex;
-texture<float4> param5_tex;
+_texture( pos_tex,float4);
+_texture( param1_tex,float4);
+_texture( param2_tex,float4);
+_texture( param3_tex,float4);
+_texture( param4_tex,float4);
+_texture( param5_tex,float4);
 #else
-texture<int4,1> pos_tex;
-texture<int4> param1_tex;
-texture<int4> param2_tex;
-texture<int4> param3_tex;
-texture<int4> param4_tex;
-texture<int4> param5_tex;
+_texture_2d( pos_tex,int4);
+_texture( param1_tex,int4);
+_texture( param2_tex,int4);
+_texture( param3_tex,int4);
+_texture( param4_tex,int4);
+_texture( param5_tex,int4);
 #endif

 #else
--- a/lib/gpu/lal_yukawa.cu
+++ b/lib/gpu/lal_yukawa.cu
@ -13,12 +13,12 @@
 //    email                : nguyentd@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
+_texture( pos_tex,float4);
 #else
-texture<int4,1> pos_tex;
+_texture_2d( pos_tex,int4);
 #endif
 #else
 #define pos_tex x_
--- a/lib/gpu/lal_yukawa_colloid.cu
+++ b/lib/gpu/lal_yukawa_colloid.cu
@ -13,15 +13,15 @@
 //    email                : nguyentd@ornl.gov
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)

 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float> rad_tex;
+_texture( pos_tex,float4);
+_texture( rad_tex,float);
 #else
-texture<int4,1> pos_tex;
-texture<int2> rad_tex;
+_texture_2d( pos_tex,int4);
+_texture( rad_tex,int2);
 #endif

 #else
--- a/lib/gpu/lal_zbl.cu
+++ b/lib/gpu/lal_zbl.cu
@ -13,12 +13,12 @@
 //    email                : ndactrung@gmail.com
 // ***************************************************************************/

-#ifdef NV_KERNEL
+#if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
+_texture( pos_tex,float4);
 #else
-texture<int4,1> pos_tex;
+_texture_2d( pos_tex,int4);
 #endif
 #else
 #define pos_tex x_
--- a/src/MAKE/OPTIONS/Makefile.hip
+++ b/src/MAKE/OPTIONS/Makefile.hip
@ -0,0 +1,120 @@
+# hip = MPI with HIP(clang)
+
+SHELL = /bin/sh
+
+# ---------------------------------------------------------------------
+# compiler/linker settings
+# specify flags and libraries needed for your compiler
+
+CC =		mpicxx
+CCFLAGS =	-g -O3 
+SHFLAGS =	-fPIC
+DEPFLAGS =	-M
+
+HIP_PATH ?= $(wildcard /opt/rocm/hip)
+LINK =		$(HIP_PATH)/bin/hipcc
+LINKFLAGS =	-g -O3 $(shell mpicxx --showme:link)
+LIB = 
+SIZE =		size
+
+ARCHIVE =	ar
+ARFLAGS =	-rc
+SHLIBFLAGS =	-shared
+
+# ---------------------------------------------------------------------
+# LAMMPS-specific settings, all OPTIONAL
+# specify settings for LAMMPS features you will use
+# if you change any -D setting, do full re-compile after "make clean"
+
+# LAMMPS ifdef settings
+# see possible settings in Section 2.2 (step 4) of manual
+
+LMP_INC =	-DLAMMPS_GZIP -DLAMMPS_MEMALIGN=64
+
+# MPI library
+# see discussion in Section 2.2 (step 5) of manual
+# MPI wrapper compiler/linker can provide this info
+# can point to dummy MPI library in src/STUBS as in Makefile.serial
+# use -D MPICH and OMPI settings in INC to avoid C++ lib conflicts
+# INC = path for mpi.h, MPI compiler settings
+# PATH = path for MPI library
+# LIB = name of MPI library
+
+MPI_INC =       -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1
+MPI_PATH = 
+MPI_LIB =	
+
+# FFT library
+# see discussion in Section 2.2 (step 6) of manual
+# can be left blank to use provided KISS FFT library
+# INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings
+# PATH = path for FFT library
+# LIB = name of FFT library
+
+FFT_INC =    	
+FFT_PATH = 
+FFT_LIB =	
+
+# JPEG and/or PNG library
+# see discussion in Section 2.2 (step 7) of manual
+# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC
+# INC = path(s) for jpeglib.h and/or png.h
+# PATH = path(s) for JPEG library and/or PNG library
+# LIB = name(s) of JPEG library and/or PNG library
+
+JPG_INC =       
+JPG_PATH = 	
+JPG_LIB =	
+
+# ---------------------------------------------------------------------
+# build rules and dependencies
+# do not edit this section
+
+include	Makefile.package.settings
+include	Makefile.package
+
+ifeq (nvcc,${HIP_PLATFORM})
+	# fix nvcc can't handle -pthread flag
+	LINKFLAGS := $(subst -pthread,-Xcompiler -pthread,$(LINKFLAGS))
+endif
+
+EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC)
+EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH)
+EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB)
+EXTRA_CPP_DEPENDS = $(PKG_CPP_DEPENDS)
+EXTRA_LINK_DEPENDS = $(PKG_LINK_DEPENDS)
+
+# Path to src files
+
+vpath %.cpp ..
+vpath %.h ..
+
+# Link target
+
+$(EXE):	$(OBJ) $(EXTRA_LINK_DEPENDS)
+	$(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE)
+	$(SIZE) $(EXE)
+
+# Library targets
+
+lib:	$(OBJ) $(EXTRA_LINK_DEPENDS)
+	$(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ)
+
+shlib:	$(OBJ) $(EXTRA_LINK_DEPENDS)
+	$(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \
+        $(OBJ) $(EXTRA_LIB) $(LIB)
+
+# Compilation rules
+
+%.o:%.cpp
+	$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
+
+# Individual dependencies
+
+depend : fastdep.exe $(SRC)
+	@./fastdep.exe $(EXTRA_INC) -- $^ > .depend || exit 1
+
+fastdep.exe: ../DEPEND/fastdep.c
+	cc -O -o $@ $<
+
+sinclude .depend