diff --git a/lib/gpu/Makefile.hip b/lib/gpu/Makefile.hip new file mode 100644 index 0000000000..5c9f251004 --- /dev/null +++ b/lib/gpu/Makefile.hip @@ -0,0 +1,148 @@ +# /* ---------------------------------------------------------------------- +# Generic Linux Makefile for HIP +# - export HIP_PLATFORM=hcc (or nvcc) before execution +# - change HIP_ARCH for your GPU +# ------------------------------------------------------------------------- */ + +# this setting should match LAMMPS Makefile +# one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL + +LMP_INC = -DLAMMPS_SMALLBIG + +# precision for GPU calculations +# -D_SINGLE_SINGLE # Single precision for all calculations +# -D_DOUBLE_DOUBLE # Double precision for all calculations +# -D_SINGLE_DOUBLE # Accumulation of forces, etc. in double + +HIP_PRECISION = -D_SINGLE_DOUBLE + +HIP_OPTS = -O3 +HIP_HOST_OPTS = -Wno-deprecated-declarations +HIP_HOST_INCLUDE = + +# use device sort +# requires linking with hipcc and hipCUB + (rocPRIM or CUB for AMD or Nvidia respectively) +HIP_HOST_OPTS += -DUSE_HIP_DEVICE_SORT +# path to cub +HIP_HOST_INCLUDE += -I./ +# path to hipcub +HIP_HOST_INCLUDE += -I$(HIP_PATH)/../include + +# use mpi +HIP_HOST_OPTS += -DMPI_GERYON -DUCL_NO_EXIT +# this settings should match LAMMPS Makefile +MPI_COMP_OPTS = $(shell mpicxx --showme:compile) +MPI_LINK_OPTS = $(shell mpicxx --showme:link) +#MPI_COMP_OPTS += -I/usr/include/mpi -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1 + +HIP_PATH ?= $(wildcard /opt/rocm/hip) +HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --compiler) + +ifeq (hcc,$(HIP_PLATFORM)) + HIP_OPTS += -ffast-math + # possible values: gfx803,gfx900,gfx906 + HIP_ARCH = gfx906 +else ifeq (nvcc,$(HIP_PLATFORM)) + HIP_OPTS += --use_fast_math + HIP_ARCH = -gencode arch=compute_30,code=[sm_30,compute_30] -gencode arch=compute_32,code=[sm_32,compute_32] -gencode arch=compute_35,code=[sm_35,compute_35] \ + -gencode arch=compute_50,code=[sm_50,compute_50] -gencode arch=compute_52,code=[sm_52,compute_52] -gencode arch=compute_53,code=[sm_53,compute_53]\ + -gencode arch=compute_60,code=[sm_60,compute_60] -gencode arch=compute_61,code=[sm_61,compute_61] -gencode arch=compute_62,code=[sm_62,compute_62]\ + -gencode arch=compute_70,code=[sm_70,compute_70] -gencode arch=compute_72,code=[sm_72,compute_72] -gencode arch=compute_75,code=[sm_75,compute_75] +else + $(error Specify HIP platform using 'export HIP_PLATFORM=(hcc,nvcc)') +endif + +BIN_DIR = . +OBJ_DIR = ./obj +LIB_DIR = . +AR = ar +BSH = /bin/sh + + +# /* ---------------------------------------------------------------------- +# don't change section below without need +# ------------------------------------------------------------------------- */ + +HIP_OPTS += -DUSE_HIP $(HIP_PRECISION) +HIP_GPU_OPTS += $(HIP_OPTS) -I./ + +ifeq (hcc,$(HIP_PLATFORM)) + HIP_HOST_OPTS += -fPIC + HIP_GPU_CC = $(HIP_PATH)/bin/hipcc --genco + HIP_GPU_OPTS_S = -t="$(HIP_ARCH)" -f=\" + HIP_GPU_OPTS_E = \" + HIP_KERNEL_SUFFIX = .cpp + HIP_LIBS_TARGET = export HCC_AMDGPU_TARGET := $(HIP_ARCH) + export HCC_AMDGPU_TARGET := $(HIP_ARCH) +else ifeq (nvcc,$(HIP_PLATFORM)) + HIP_GPU_CC = $(HIP_PATH)/bin/hipcc --fatbin + HIP_GPU_OPTS += $(HIP_ARCH) + HIP_GPU_SORT_ARCH = $(HIP_ARCH) + # fix nvcc can't handle -pthread flag + MPI_COMP_OPTS := $(subst -pthread,-Xcompiler -pthread,$(MPI_COMP_OPTS)) + MPI_LINK_OPTS := $(subst -pthread,-Xcompiler -pthread,$(MPI_LINK_OPTS)) +endif + +# hipcc is essential for device sort, because of hipcub is header only library and ROCm gpu code generation is deferred to the linking stage +HIP_HOST_CC = $(HIP_PATH)/bin/hipcc +HIP_HOST_OPTS += $(HIP_OPTS) $(MPI_COMP_OPTS) $(LMP_INC) +HIP_HOST_CC_CMD = $(HIP_HOST_CC) $(HIP_HOST_OPTS) $(HIP_HOST_INCLUDE) + +# sources + +ALL_H = $(wildcard ./geryon/ucl*.h) $(wildcard ./geryon/hip*.h) $(wildcard ./lal_*.h) +SRCS := $(wildcard ./lal_*.cpp) +OBJS := $(subst ./,$(OBJ_DIR)/,$(SRCS:%.cpp=%.o)) +CUS := $(wildcard lal_*.cu) +CUHS := $(filter-out pppm_cubin.h, $(CUS:lal_%.cu=%_cubin.h)) pppm_f_cubin.h pppm_d_cubin.h +CUHS := $(addprefix $(OBJ_DIR)/, $(CUHS)) + +all: $(OBJ_DIR) $(CUHS) $(LIB_DIR)/libgpu.a $(BIN_DIR)/hip_get_devices + +$(OBJ_DIR): + mkdir -p $@ + +# GPU kernels compilation + +$(OBJ_DIR)/pppm_f_cubin.h: lal_pppm.cu $(ALL_H) + @cp $< $(OBJ_DIR)/temp_pppm_f.cu$(HIP_KERNEL_SUFFIX) + $(HIP_GPU_CC) $(HIP_GPU_OPTS_S) $(HIP_GPU_OPTS) -Dgrdtyp=float -Dgrdtyp4=float4 $(HIP_GPU_OPTS_E) -o $(OBJ_DIR)/pppm_f.cubin $(OBJ_DIR)/temp_pppm_f.cu$(HIP_KERNEL_SUFFIX) + @xxd -i $(OBJ_DIR)/pppm_f.cubin $@ + @sed -i "s/[a-zA-Z0-9_]*pppm_f_cubin/pppm_f/g" $@ + @rm $(OBJ_DIR)/temp_pppm_f.cu$(HIP_KERNEL_SUFFIX) $(OBJ_DIR)/pppm_f.cubin + +$(OBJ_DIR)/pppm_d_cubin.h: lal_pppm.cu $(ALL_H) + @cp $< $(OBJ_DIR)/temp_pppm_d.cu$(HIP_KERNEL_SUFFIX) + $(HIP_GPU_CC) $(HIP_GPU_OPTS_S) $(HIP_GPU_OPTS) -Dgrdtyp=double -Dgrdtyp4=double4 $(HIP_GPU_OPTS_E) -o $(OBJ_DIR)/pppm_d.cubin $(OBJ_DIR)/temp_pppm_d.cu$(HIP_KERNEL_SUFFIX) + @xxd -i $(OBJ_DIR)/pppm_d.cubin $@ + @sed -i "s/[a-zA-Z0-9_]*pppm_d_cubin/pppm_d/g" $@ + @rm $(OBJ_DIR)/temp_pppm_d.cu$(HIP_KERNEL_SUFFIX) $(OBJ_DIR)/pppm_d.cubin + +$(OBJ_DIR)/%_cubin.h: lal_%.cu $(ALL_H) + @cp $< $(OBJ_DIR)/temp_$*.cu$(HIP_KERNEL_SUFFIX) + $(HIP_GPU_CC) $(HIP_GPU_OPTS_S) $(HIP_GPU_OPTS) $(HIP_GPU_OPTS_E) -o $(OBJ_DIR)/$*.cubin $(OBJ_DIR)/temp_$*.cu$(HIP_KERNEL_SUFFIX) + @xxd -i $(OBJ_DIR)/$*.cubin $@ + @sed -i "s/[a-zA-Z0-9_]*$*_cubin/$*/g" $@ + @rm $(OBJ_DIR)/temp_$*.cu$(HIP_KERNEL_SUFFIX) $(OBJ_DIR)/$*.cubin + +# host sources compilation + +$(OBJ_DIR)/lal_atom.o: lal_atom.cpp $(CUHS) $(ALL_H) + $(HIP_HOST_CC_CMD) -o $@ -c $< -I$(OBJ_DIR) $(HIP_GPU_SORT_ARCH) + +$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(ALL_H) + $(HIP_HOST_CC_CMD) -o $@ -c $< -I$(OBJ_DIR) + +# libgpu building + +$(LIB_DIR)/libgpu.a: $(OBJS) + $(AR) -crs $@ $(OBJS) + echo "export HIP_PLATFORM := $(HIP_PLATFORM)\n$(HIP_LIBS_TARGET)" > 'Makefile.lammps' + +# test app building + +$(BIN_DIR)/hip_get_devices: ./geryon/ucl_get_devices.cpp $(ALL_H) + $(HIP_HOST_CC_CMD) -o $@ $< -DUCL_HIP $(MPI_LINK_OPTS) + +clean: + -rm -f $(BIN_DIR)/hip_get_devices $(LIB_DIR)/libgpu.a $(OBJS) $(OBJ_DIR)/temp_* $(CUHS) diff --git a/lib/gpu/geryon/hip_device.h b/lib/gpu/geryon/hip_device.h new file mode 100644 index 0000000000..93f38d28bb --- /dev/null +++ b/lib/gpu/geryon/hip_device.h @@ -0,0 +1,519 @@ +/* ----------------------------------------------------------------------- + Copyright (2009) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the Simplified BSD License. + ----------------------------------------------------------------------- */ + +#ifndef HIP_DEVICE +#define HIP_DEVICE + + +#include +#include +#include +#include +#include +#include "hip_macros.h" +#include "ucl_types.h" + +namespace ucl_hip { + +// -------------------------------------------------------------------------- +// - COMMAND QUEUE STUFF +// -------------------------------------------------------------------------- +typedef hipStream_t command_queue; + +inline void ucl_sync(hipStream_t &stream) { + CU_SAFE_CALL(hipStreamSynchronize(stream)); +} + +struct NVDProperties { + int device_id; + std::string name; + int major; + int minor; + CUDA_INT_TYPE totalGlobalMem; + int multiProcessorCount; + + int maxThreadsPerBlock; + int maxThreadsDim[3]; + int maxGridSize[3]; + int sharedMemPerBlock; + int totalConstantMemory; + int SIMDWidth; + int memPitch; + int regsPerBlock; + int clockRate; + int textureAlign; + + int kernelExecTimeoutEnabled; + int integrated; + int canMapHostMemory; + int concurrentKernels; + int ECCEnabled; + int computeMode; +}; + +/// Class for looking at device properties +/** \note Calls to change the device outside of the class results in incorrect + * behavior + * \note There is no error checking for indexing past the number of devices **/ +class UCL_Device { + public: + /// Collect properties for every GPU on the node + /** \note You must set the active GPU with set() before using the device **/ + inline UCL_Device(); + + inline ~UCL_Device(); + + /// Returns 1 (For compatibility with OpenCL) + inline int num_platforms() { return 1; } + + /// Return a string with name and info of the current platform + inline std::string platform_name() + { return "HIP platform"; } + + /// Delete any contexts/data and set the platform number to be used + inline int set_platform(const int pid); + + /// Return the number of devices that support CUDA + inline int num_devices() { return _properties.size(); } + + /// Set the CUDA device to the specified device number + /** A context and default command queue will be created for the device + * Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not + * be allocated for use. clear() is called to delete any contexts and + * associated data from previous calls to set(). **/ + inline int set(int num); + + /// Delete any context and associated data stored from a call to set() + inline void clear(); + + /// Get the current device number + inline int device_num() { return _device; } + + /// Returns the default stream for the current device + inline command_queue & cq() { return cq(0); } + + /// Returns the stream indexed by i + inline command_queue & cq(const int i) { return _cq[i]; } + + /// Block until all commands in the default stream have completed + inline void sync() { sync(0); } + + /// Block until all commands in the specified stream have completed + inline void sync(const int i) { ucl_sync(cq(i)); } + + /// Get the number of command queues currently available on device + inline int num_queues() + { return _cq.size(); } + + /// Add a stream for device computations + inline void push_command_queue() { + _cq.push_back(hipStream_t()); + CU_SAFE_CALL(hipStreamCreateWithFlags(&_cq.back(),0)); + } + + /// Remove a stream for device computations + /** \note You cannot delete the default stream **/ + inline void pop_command_queue() { + if (_cq.size()<2) return; + CU_SAFE_CALL_NS(hipStreamDestroy(_cq.back())); + _cq.pop_back(); + } + + /// Set the default command queue (by default this is the null stream) + /** \param i index of the command queue (as added by push_command_queue()) + If i is 0, the default command queue is set to the null stream **/ + inline void set_command_queue(const int i) { + if (i==0) _cq[0]=0; + else _cq[0]=_cq[i]; + } + + /// Get the current CUDA device name + inline std::string name() { return name(_device); } + /// Get the CUDA device name + inline std::string name(const int i) + { return std::string(_properties[i].name); } + + /// Get a string telling the type of the current device + inline std::string device_type_name() { return device_type_name(_device); } + /// Get a string telling the type of the device + inline std::string device_type_name(const int i) { return "GPU"; } + + /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) + inline int device_type() { return device_type(_device); } + /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) + inline int device_type(const int i) { return UCL_GPU; } + + /// Returns true if host memory is efficiently addressable from device + inline bool shared_memory() { return shared_memory(_device); } + /// Returns true if host memory is efficiently addressable from device + inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; } + + /// Returns true if double precision is support for the current device + inline bool double_precision() { return double_precision(_device); } + /// Returns true if double precision is support for the device + inline bool double_precision(const int i) {return arch(i)>=1.3;} + + /// Get the number of compute units on the current device + inline unsigned cus() { return cus(_device); } + /// Get the number of compute units + inline unsigned cus(const int i) + { return _properties[i].multiProcessorCount; } + + /// Get the number of cores in the current device + inline unsigned cores() { return cores(_device); } + /// Get the number of cores + inline unsigned cores(const int i) + { if (arch(i)<2.0) return _properties[i].multiProcessorCount*8; + else if (arch(i)<2.1) return _properties[i].multiProcessorCount*32; + else if (arch(i)<3.0) return _properties[i].multiProcessorCount*48; + else return _properties[i].multiProcessorCount*192; } + + /// Get the gigabytes of global memory in the current device + inline double gigabytes() { return gigabytes(_device); } + /// Get the gigabytes of global memory + inline double gigabytes(const int i) + { return static_cast(_properties[i].totalGlobalMem)/1073741824; } + + /// Get the bytes of global memory in the current device + inline size_t bytes() { return bytes(_device); } + /// Get the bytes of global memory + inline size_t bytes(const int i) { return _properties[i].totalGlobalMem; } + + // Get the gigabytes of free memory in the current device + inline double free_gigabytes() { return free_gigabytes(_device); } + // Get the gigabytes of free memory + inline double free_gigabytes(const int i) + { return static_cast(free_bytes(i))/1073741824; } + + // Get the bytes of free memory in the current device + inline size_t free_bytes() { return free_bytes(_device); } + // Get the bytes of free memory + inline size_t free_bytes(const int i) { + CUDA_INT_TYPE dfree, dtotal; + CU_SAFE_CALL_NS(hipMemGetInfo(&dfree, &dtotal)); + return static_cast(dfree); + } + + /// Return the GPGPU compute capability for current device + inline double arch() { return arch(_device); } + /// Return the GPGPU compute capability + inline double arch(const int i) + { return static_cast(_properties[i].minor)/10+_properties[i].major;} + + /// Clock rate in GHz for current device + inline double clock_rate() { return clock_rate(_device); } + /// Clock rate in GHz + inline double clock_rate(const int i) + { return _properties[i].clockRate*1e-6;} + + /// Get the maximum number of threads per block + inline size_t group_size() { return group_size(_device); } + /// Get the maximum number of threads per block + inline size_t group_size(const int i) + { return _properties[i].maxThreadsPerBlock; } + + /// Return the maximum memory pitch in bytes for current device + inline size_t max_pitch() { return max_pitch(_device); } + /// Return the maximum memory pitch in bytes + inline size_t max_pitch(const int i) { return _properties[i].memPitch; } + + /// Returns false if accelerator cannot be shared by multiple processes + /** If it cannot be determined, true is returned **/ + inline bool sharing_supported() { return sharing_supported(_device); } + /// Returns false if accelerator cannot be shared by multiple processes + /** If it cannot be determined, true is returned **/ + inline bool sharing_supported(const int i) + { return (_properties[i].computeMode == hipComputeModeDefault); } + + /// True if splitting device into equal subdevices supported + inline bool fission_equal() + { return fission_equal(_device); } + /// True if splitting device into equal subdevices supported + inline bool fission_equal(const int i) + { return false; } + /// True if splitting device into subdevices by specified counts supported + inline bool fission_by_counts() + { return fission_by_counts(_device); } + /// True if splitting device into subdevices by specified counts supported + inline bool fission_by_counts(const int i) + { return false; } + /// True if splitting device into subdevices by affinity domains supported + inline bool fission_by_affinity() + { return fission_by_affinity(_device); } + /// True if splitting device into subdevices by affinity domains supported + inline bool fission_by_affinity(const int i) + { return false; } + + /// Maximum number of subdevices allowed from device fission + inline int max_sub_devices() + { return max_sub_devices(_device); } + /// Maximum number of subdevices allowed from device fission + inline int max_sub_devices(const int i) + { return 0; } + + /// List all devices along with all properties + inline void print_all(std::ostream &out); + + /// Select the platform that has accelerators (for compatibility with OpenCL) + inline int set_platform_accelerator(int pid=-1) { return UCL_SUCCESS; } + + inline int load_module(const void* program, hipModule_t& module, std::string *log=NULL){ + auto it = _loaded_modules.emplace(program, hipModule_t()); + if(!it.second){ + module = it.first->second; + return UCL_SUCCESS; + } + const unsigned int num_opts=2; + hipJitOption options[num_opts]; + void *values[num_opts]; + + // set up size of compilation log buffer + options[0] = hipJitOptionInfoLogBufferSizeBytes; + values[0] = (void *)(int)10240; + // set up pointer to the compilation log buffer + options[1] = hipJitOptionInfoLogBuffer; + char clog[10240] = { 0 }; + values[1] = clog; + + hipError_t err=hipModuleLoadDataEx(&module,program,num_opts, options,(void **)values); + + if (log!=NULL) + *log=std::string(clog); + + if (err != hipSuccess) { + #ifndef UCL_NO_EXIT + std::cerr << std::endl + << "----------------------------------------------------------\n" + << " UCL Error: Error compiling PTX Program...\n" + << "----------------------------------------------------------\n"; + std::cerr << log << std::endl; + #endif + _loaded_modules.erase(it.first); + return UCL_COMPILE_ERROR; + } + it.first->second = module; + return UCL_SUCCESS; + } + private: + std::unordered_map _loaded_modules; + int _device, _num_devices; + std::vector _properties; + std::vector _cq; + hipDevice_t _cu_device; +}; + +// Grabs the properties for all devices +UCL_Device::UCL_Device() { + CU_SAFE_CALL_NS(hipInit(0)); + CU_SAFE_CALL_NS(hipGetDeviceCount(&_num_devices)); + for (int i=0; i<_num_devices; ++i) { + hipDevice_t dev; + CU_SAFE_CALL_NS(hipDeviceGet(&dev,i)); + int major, minor; + CU_SAFE_CALL_NS(hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, dev)); + CU_SAFE_CALL_NS(hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, dev)); + if (major==9999) + continue; + + NVDProperties prop; + prop.device_id = i; + prop.major=major; + prop.minor=minor; + + char namecstr[1024]; + CU_SAFE_CALL_NS(hipDeviceGetName(namecstr,1024,dev)); + prop.name=namecstr; + + CU_SAFE_CALL_NS(hipDeviceTotalMem(&prop.totalGlobalMem,dev)); + CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.multiProcessorCount, hipDeviceAttributeMultiprocessorCount, dev)); + + CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxThreadsPerBlock, hipDeviceAttributeMaxThreadsPerBlock, dev)); + CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxThreadsDim[0], hipDeviceAttributeMaxBlockDimX, dev)); + CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxThreadsDim[1], hipDeviceAttributeMaxBlockDimY, dev)); + CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxThreadsDim[2], hipDeviceAttributeMaxBlockDimZ, dev)); + CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxGridSize[0], hipDeviceAttributeMaxGridDimX, dev)); + CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxGridSize[1], hipDeviceAttributeMaxGridDimY, dev)); + CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxGridSize[2], hipDeviceAttributeMaxGridDimZ, dev)); + CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.sharedMemPerBlock, hipDeviceAttributeMaxSharedMemoryPerBlock, dev)); + CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.totalConstantMemory, hipDeviceAttributeTotalConstantMemory, dev)); + CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.SIMDWidth, hipDeviceAttributeWarpSize, dev)); + //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev)); + CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.regsPerBlock, hipDeviceAttributeMaxRegistersPerBlock, dev)); + CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.clockRate, hipDeviceAttributeClockRate, dev)); + //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev)); + + //#if CUDA_VERSION >= 2020 + //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,dev)); + CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.integrated, hipDeviceAttributeIntegrated, dev)); + //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev)); + CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.computeMode, hipDeviceAttributeComputeMode,dev)); + //#endif + //#if CUDA_VERSION >= 3010 + CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.concurrentKernels, hipDeviceAttributeConcurrentKernels, dev)); + //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev)); + //#endif + + _properties.push_back(prop); + } + _device=-1; + _cq.push_back(hipStream_t()); + _cq.back()=0; +} + +UCL_Device::~UCL_Device() { + clear(); +} + +int UCL_Device::set_platform(const int pid) { + clear(); + #ifdef UCL_DEBUG + assert(pid-1) { + for (int i=1; i= 2020 + int driver_version; + hipDriverGetVersion(&driver_version); + out << "Driver Version: " + << driver_version/1000 << "." << driver_version%100 + << std::endl; + //#endif + + if (num_devices() == 0) + out << "There is no device supporting HIP\n"; + for (int i=0; i= 2000 + out << " Number of compute units/multiprocessors: " + << _properties[i].multiProcessorCount << std::endl; + out << " Number of cores: " + << cores(i) << std::endl; + //#endif + out << " Total amount of constant memory: " + << _properties[i].totalConstantMemory << " bytes\n"; + out << " Total amount of local/shared memory per block: " + << _properties[i].sharedMemPerBlock << " bytes\n"; + out << " Total number of registers available per block: " + << _properties[i].regsPerBlock << std::endl; + out << " Warp size: " + << _properties[i].SIMDWidth << std::endl; + out << " Maximum number of threads per block: " + << _properties[i].maxThreadsPerBlock << std::endl; + out << " Maximum group size (# of threads per block) " + << _properties[i].maxThreadsDim[0] << " x " + << _properties[i].maxThreadsDim[1] << " x " + << _properties[i].maxThreadsDim[2] << std::endl; + out << " Maximum item sizes (# threads for each dim) " + << _properties[i].maxGridSize[0] << " x " + << _properties[i].maxGridSize[1] << " x " + << _properties[i].maxGridSize[2] << std::endl; + //out << " Maximum memory pitch: " + // << max_pitch(i) << " bytes\n"; + //out << " Texture alignment: " + // << _properties[i].textureAlign << " bytes\n"; + out << " Clock rate: " + << clock_rate(i) << " GHz\n"; + //#if CUDA_VERSION >= 2020 + //out << " Run time limit on kernels: "; + //if (_properties[i].kernelExecTimeoutEnabled) + // out << "Yes\n"; + //else + // out << "No\n"; + out << " Integrated: "; + if (_properties[i].integrated) + out << "Yes\n"; + else + out << "No\n"; + //out << " Support host page-locked memory mapping: "; + //if (_properties[i].canMapHostMemory) + // out << "Yes\n"; + //else + // out << "No\n"; + out << " Compute mode: "; + if (_properties[i].computeMode == hipComputeModeDefault) + out << "Default\n"; // multiple threads can use device +//#if CUDA_VERSION >= 8000 +// else if (_properties[i].computeMode == hipComputeModeExclusiveProcess) +//#else + else if (_properties[i].computeMode == hipComputeModeExclusive) +//#endif + out << "Exclusive\n"; // only thread can use device + else if (_properties[i].computeMode == hipComputeModeProhibited) + out << "Prohibited\n"; // no thread can use device + //#if CUDART_VERSION >= 4000 + else if (_properties[i].computeMode == hipComputeModeExclusiveProcess) + out << "Exclusive Process\n"; // multiple threads 1 process + //#endif + else + out << "Unknown\n"; + //#endif + //#if CUDA_VERSION >= 3010 + out << " Concurrent kernel execution: "; + if (_properties[i].concurrentKernels) + out << "Yes\n"; + else + out << "No\n"; + //out << " Device has ECC support enabled: "; + //if (_properties[i].ECCEnabled) + // out << "Yes\n"; + //else + // out << "No\n"; + //#endif + } +} + +} + +#endif diff --git a/lib/gpu/geryon/hip_kernel.h b/lib/gpu/geryon/hip_kernel.h new file mode 100644 index 0000000000..654eb44772 --- /dev/null +++ b/lib/gpu/geryon/hip_kernel.h @@ -0,0 +1,298 @@ +/* ----------------------------------------------------------------------- + Copyright (2010) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the Simplified BSD License. + ----------------------------------------------------------------------- */ + +#ifndef HIP_KERNEL +#define HIP_KERNEL + + +#include +#include "hip_device.h" +#include +#include +#include + +namespace ucl_hip { + +class UCL_Texture; +template class UCL_D_Vec; +template class UCL_D_Mat; +template class UCL_Vector; +template class UCL_Matrix; +#define UCL_MAX_KERNEL_ARGS 256 + +/// Class storing 1 or more kernel functions from a single string or file +class UCL_Program { + UCL_Device* _device_ptr; + public: + inline UCL_Program(UCL_Device &device) { _device_ptr = &device; _cq=device.cq(); } + inline UCL_Program(UCL_Device &device, const void *program, + const char *flags="", std::string *log=NULL) { + _device_ptr = &device; _cq=device.cq(); + init(device); + load_string(program,flags,log); + } + + inline ~UCL_Program() {} + + /// Initialize the program with a device + inline void init(UCL_Device &device) { _device_ptr = &device; _cq=device.cq(); } + + /// Clear any data associated with program + /** \note Must call init() after each clear **/ + inline void clear() { } + + /// Load a program from a file and compile with flags + inline int load(const char *filename, const char *flags="", std::string *log=NULL) { + std::ifstream in(filename); + if (!in || in.is_open()==false) { + #ifndef UCL_NO_EXIT + std::cerr << "UCL Error: Could not open kernel file: " + << filename << std::endl; + UCL_GERYON_EXIT; + #endif + return UCL_FILE_NOT_FOUND; + } + + std::string program((std::istreambuf_iterator(in)), + std::istreambuf_iterator()); + in.close(); + return load_string(program.c_str(),flags,log); + } + + /// Load a program from a string and compile with flags + inline int load_string(const void *program, const char *flags="", std::string *log=NULL) { + return _device_ptr->load_module(program, _module, log); + } + + friend class UCL_Kernel; + private: + hipModule_t _module; + hipStream_t _cq; + friend class UCL_Texture; +}; + +/// Class for dealing with CUDA Driver kernels +class UCL_Kernel { + public: + UCL_Kernel() : _dimensions(1), _num_args(0) { + _num_blocks[0]=0; + } + + UCL_Kernel(UCL_Program &program, const char *function) : + _dimensions(1), _num_args(0) { + _num_blocks[0]=0; + set_function(program,function); + _cq=program._cq; + } + + ~UCL_Kernel() {} + + /// Clear any function associated with the kernel + inline void clear() { } + + /// Get the kernel function from a program + /** \ret UCL_ERROR_FLAG (UCL_SUCCESS, UCL_FILE_NOT_FOUND, UCL_ERROR) **/ + inline int set_function(UCL_Program &program, const char *function) { + hipError_t err=hipModuleGetFunction(&_kernel,program._module,function); + if (err!=hipSuccess) { + #ifndef UCL_NO_EXIT + std::cerr << "UCL Error: Could not find function: " << function + << " in program.\n"; + UCL_GERYON_EXIT; + #endif + return UCL_FUNCTION_NOT_FOUND; + } + _cq=program._cq; + return UCL_SUCCESS; + } + + /// Set the kernel argument. + /** If not a device pointer, this must be repeated each time the argument + * changes + * \note To set kernel parameter i (i>0), parameter i-1 must be set **/ + template + inline void set_arg(const unsigned index, const dtype * const arg) { + if (index==_num_args) + add_arg(arg); + else if (index<_num_args){ + assert(0==1); // not implemented + } + else + assert(0==1); // Must add kernel parameters in sequential order + } + + /// Set a geryon container as a kernel argument. + template + inline void set_arg(const UCL_D_Vec * const arg) + { set_arg(&arg->begin()); } + + /// Set a geryon container as a kernel argument. + template + inline void set_arg(const UCL_D_Mat * const arg) + { set_arg(&arg->begin()); } + + /// Set a geryon container as a kernel argument. + template + inline void set_arg(const UCL_Vector * const arg) + { set_arg(&arg->device.begin()); } + + /// Set a geryon container as a kernel argument. + template + inline void set_arg(const UCL_Matrix * const arg) + { set_arg(&arg->device.begin()); } + + /// Add a kernel argument. + inline void add_arg(const hipDeviceptr_t* const arg) { + add_arg((void**)arg); + } + + /// Add a kernel argument. + template + inline void add_arg(const dtype* const arg) { + const auto old_size = _hip_kernel_args.size(); + const auto aligned_size = (old_size+alignof(dtype)-1) & ~(alignof(dtype)-1); + const auto arg_size = sizeof(dtype); + _hip_kernel_args.resize(aligned_size + arg_size); + *((dtype*)(&_hip_kernel_args[aligned_size])) = *arg; + _num_args++; + if (_num_args>UCL_MAX_KERNEL_ARGS) assert(0==1); + } + + /// Add a geryon container as a kernel argument. + template + inline void add_arg(const UCL_D_Vec * const arg) + { add_arg(&arg->begin()); } + + /// Add a geryon container as a kernel argument. + template + inline void add_arg(const UCL_D_Mat * const arg) + { add_arg(&arg->begin()); } + + /// Add a geryon container as a kernel argument. + template + inline void add_arg(const UCL_Vector * const arg) + { add_arg(&arg->device.begin()); } + + /// Add a geryon container as a kernel argument. + template + inline void add_arg(const UCL_Matrix * const arg) + { add_arg(&arg->device.begin()); } + + /// Set the number of thread blocks and the number of threads in each block + /** \note This should be called before any arguments have been added + \note The default command queue is used for the kernel execution **/ + inline void set_size(const size_t num_blocks, const size_t block_size) { + _dimensions=1; + _num_blocks[0]=num_blocks; + _num_blocks[1]=1; + _num_blocks[2]=1; + + _block_size[0]=block_size; + _block_size[1]=1; + _block_size[2]=1; + } + + /// Set the number of thread blocks and the number of threads in each block + /** \note This should be called before any arguments have been added + \note The default command queue for the kernel is changed to cq **/ + inline void set_size(const size_t num_blocks, const size_t block_size, + command_queue &cq) + { _cq=cq; set_size(num_blocks,block_size); } + + /// Set the number of thread blocks and the number of threads in each block + /** \note This should be called before any arguments have been added + \note The default command queue is used for the kernel execution **/ + inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y, + const size_t block_size_x, const size_t block_size_y) { + _dimensions=2; + _num_blocks[0]=num_blocks_x; + _num_blocks[1]=num_blocks_y; + _num_blocks[2]=1; + + _block_size[0]=block_size_x; + _block_size[1]=block_size_y; + _block_size[2]=1; + } + + /// Set the number of thread blocks and the number of threads in each block + /** \note This should be called before any arguments have been added + \note The default command queue for the kernel is changed to cq **/ + inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y, + const size_t block_size_x, const size_t block_size_y, + command_queue &cq) + {_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);} + + /// Set the number of thread blocks and the number of threads in each block + /** \note This should be called before any arguments have been added + \note The default command queue is used for the kernel execution **/ + inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y, + const size_t block_size_x, + const size_t block_size_y, const size_t block_size_z) { + _dimensions=2; + _num_blocks[0]=num_blocks_x; + _num_blocks[1]=num_blocks_y; + _num_blocks[2]=1; + + _block_size[0]=block_size_x; + _block_size[1]=block_size_y; + _block_size[2]=block_size_z; + } + + /// Set the number of thread blocks and the number of threads in each block + /** \note This should be called before any arguments have been added + \note The default command queue is used for the kernel execution **/ + inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y, + const size_t block_size_x, const size_t block_size_y, + const size_t block_size_z, command_queue &cq) { + _cq=cq; + set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y, + block_size_z); + } + + /// Run the kernel in the default command queue + inline void run() { + size_t args_size = _hip_kernel_args.size(); + void *config[] = { + HIP_LAUNCH_PARAM_BUFFER_POINTER, (void*)_hip_kernel_args.data(), + HIP_LAUNCH_PARAM_BUFFER_SIZE, &args_size, + HIP_LAUNCH_PARAM_END + }; + const auto res = hipModuleLaunchKernel(_kernel,_num_blocks[0],_num_blocks[1], + _num_blocks[2],_block_size[0],_block_size[1], + _block_size[2],0,_cq, NULL, config); + CU_SAFE_CALL(res); +//#endif + } + + /// Clear any arguments associated with the kernel + inline void clear_args() { + _num_args=0; + _hip_kernel_args.clear(); + } + + /// Return the default command queue/stream associated with this data + inline command_queue & cq() { return _cq; } + /// Change the default command queue associated with matrix + inline void cq(command_queue &cq_in) { _cq=cq_in; } + #include "ucl_arg_kludge.h" + + private: + hipFunction_t _kernel; + hipStream_t _cq; + unsigned _dimensions; + unsigned _num_blocks[3]; + unsigned _num_args; + friend class UCL_Texture; + + unsigned _block_size[3]; + std::vector _hip_kernel_args; +}; + +} // namespace + +#endif + diff --git a/lib/gpu/geryon/hip_macros.h b/lib/gpu/geryon/hip_macros.h new file mode 100644 index 0000000000..9c9971b896 --- /dev/null +++ b/lib/gpu/geryon/hip_macros.h @@ -0,0 +1,83 @@ +#ifndef HIP_MACROS_H +#define HIP_MACROS_H + +#include +#include +#include + +//#if CUDA_VERSION >= 3020 +#define CUDA_INT_TYPE size_t +//#else +//#define CUDA_INT_TYPE unsigned +//#endif + +#ifdef MPI_GERYON +#include "mpi.h" +#define NVD_GERYON_EXIT do { \ + int is_final; \ + MPI_Finalized(&is_final); \ + if (!is_final) \ + MPI_Abort(MPI_COMM_WORLD,-1); \ + } while(0) +#else +#define NVD_GERYON_EXIT assert(0==1) +#endif + +#ifndef UCL_GERYON_EXIT +#define UCL_GERYON_EXIT NVD_GERYON_EXIT +#endif + +#ifdef UCL_DEBUG +#define UCL_SYNC_DEBUG +#define UCL_DESTRUCT_CHECK +#endif + +#ifndef UCL_NO_API_CHECK + +#define CU_SAFE_CALL_NS( call ) do { \ + hipError_t err = call; \ + if( hipSuccess != err) { \ + fprintf(stderr, "HIP runtime error %d in call at file '%s' in line %i.\n", \ + err, __FILE__, __LINE__ ); \ + NVD_GERYON_EXIT; \ + } } while (0) + +#ifdef UCL_SYNC_DEBUG + +#define CU_SAFE_CALL( call ) do { \ + CU_SAFE_CALL_NS( call ); \ + hipError_t err=hipCtxSynchronize(); \ + if( hipSuccess != err) { \ + fprintf(stderr, "HIP runtime error %d in file '%s' in line %i.\n", \ + err, __FILE__, __LINE__ ); \ + NVD_GERYON_EXIT; \ + } } while (0) + +#else + +#define CU_SAFE_CALL( call ) CU_SAFE_CALL_NS( call ) + +#endif + +#else // not DEBUG + +// void macros for performance reasons +#define CU_SAFE_CALL_NS( call ) call +#define CU_SAFE_CALL( call) call + +#endif + +#ifdef UCL_DESTRUCT_CHECK + +#define CU_DESTRUCT_CALL( call) CU_SAFE_CALL( call) +#define CU_DESTRUCT_CALL_NS( call) CU_SAFE_CALL_NS( call) + +#else + +#define CU_DESTRUCT_CALL( call) call +#define CU_DESTRUCT_CALL_NS( call) call + +#endif + +#endif + diff --git a/lib/gpu/geryon/hip_mat.h b/lib/gpu/geryon/hip_mat.h new file mode 100644 index 0000000000..d9bbb4e521 --- /dev/null +++ b/lib/gpu/geryon/hip_mat.h @@ -0,0 +1,43 @@ +/* ----------------------------------------------------------------------- + Copyright (2010) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the Simplified BSD License. + ----------------------------------------------------------------------- */ + +/*! \file */ + +#ifndef HIP_MAT_H +#define HIP_MAT_H + + +#include +#include "hip_memory.h" + +/// Namespace for CUDA Driver routines +namespace ucl_hip { + +#define _UCL_MAT_ALLOW +#define _UCL_DEVICE_PTR_MAT +#include "ucl_basemat.h" +#include "ucl_h_vec.h" +#include "ucl_h_mat.h" +#include "ucl_d_vec.h" +#include "ucl_d_mat.h" +#include "ucl_s_obj_help.h" +#include "ucl_vector.h" +#include "ucl_matrix.h" +#undef _UCL_DEVICE_PTR_MAT +#undef _UCL_MAT_ALLOW + +#define UCL_COPY_ALLOW +#include "ucl_copy.h" +#undef UCL_COPY_ALLOW + +#define UCL_PRINT_ALLOW +#include "ucl_print.h" +#undef UCL_PRINT_ALLOW + +} // namespace ucl_cudadr + +#endif diff --git a/lib/gpu/geryon/hip_memory.h b/lib/gpu/geryon/hip_memory.h new file mode 100644 index 0000000000..13f60ad939 --- /dev/null +++ b/lib/gpu/geryon/hip_memory.h @@ -0,0 +1,279 @@ +/* ----------------------------------------------------------------------- + Copyright (2010) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the Simplified BSD License. + ----------------------------------------------------------------------- */ + +#ifndef HIP_MEMORY_H +#define HIP_MEMORY_H + + +#include +#include +#include +#include +#include "hip_macros.h" +#include "hip_device.h" +#include "ucl_types.h" + +namespace ucl_hip { + +// -------------------------------------------------------------------------- +// - API Specific Types +// -------------------------------------------------------------------------- +//typedef dim3 ucl_kernel_dim; + +#ifdef __HIP_PLATFORM_NVCC__ +typedef enum hipArray_Format { + HIP_AD_FORMAT_UNSIGNED_INT8 = 0x01, + HIP_AD_FORMAT_UNSIGNED_INT16 = 0x02, + HIP_AD_FORMAT_UNSIGNED_INT32 = 0x03, + HIP_AD_FORMAT_SIGNED_INT8 = 0x08, + HIP_AD_FORMAT_SIGNED_INT16 = 0x09, + HIP_AD_FORMAT_SIGNED_INT32 = 0x0a, + HIP_AD_FORMAT_HALF = 0x10, + HIP_AD_FORMAT_FLOAT = 0x20 +}hipArray_Format; +#endif + +// -------------------------------------------------------------------------- +// - API SPECIFIC DEVICE POINTERS +// -------------------------------------------------------------------------- +typedef hipDeviceptr_t device_ptr; + +// -------------------------------------------------------------------------- +// - HOST MEMORY ALLOCATION ROUTINES +// -------------------------------------------------------------------------- +template +inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n, + const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){ + hipError_t err=hipSuccess; + if (kind==UCL_NOT_PINNED) + *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n); + else if (kind==UCL_WRITE_ONLY) + err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocWriteCombined); + else + err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocDefault); + if (err!=hipSuccess || *(mat.host_ptr())==NULL) + return UCL_MEMORY_ERROR; + mat.cq()=cm.cq(); + return UCL_SUCCESS; +} + +template +inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n, + const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){ + hipError_t err=hipSuccess; + if (kind==UCL_NOT_PINNED) + *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n); + else if (kind==UCL_WRITE_ONLY) + err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocWriteCombined); + else + err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocDefault); + if (err!=hipSuccess || *(mat.host_ptr())==NULL) + return UCL_MEMORY_ERROR; + mat.cq()=dev.cq(); + return UCL_SUCCESS; +} + +template +inline void _host_free(mat_type &mat) { + if (mat.kind()==UCL_VIEW) + return; + else if (mat.kind()!=UCL_NOT_PINNED) + CU_DESTRUCT_CALL(hipHostFree(mat.begin())); + else + free(mat.begin()); +} + +template +inline int _host_resize(mat_type &mat, const size_t n) { + _host_free(mat); + hipError_t err=hipSuccess; + if (mat.kind()==UCL_NOT_PINNED) + *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n); + else if (mat.kind()==UCL_WRITE_ONLY) + err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocWriteCombined); + else + err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocDefault); + if (err!=hipSuccess || *(mat.host_ptr())==NULL) + return UCL_MEMORY_ERROR; + return UCL_SUCCESS; +} + +// -------------------------------------------------------------------------- +// - DEVICE MEMORY ALLOCATION ROUTINES +// -------------------------------------------------------------------------- +template +inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n, + const enum UCL_MEMOPT kind) { + hipError_t err=hipMalloc((void**)&mat.cbegin(),n); + if (err!=hipSuccess) + return UCL_MEMORY_ERROR; + mat.cq()=cm.cq(); + return UCL_SUCCESS; +} + +template +inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n, + const enum UCL_MEMOPT kind) { + hipError_t err=hipMalloc((void**)&mat.cbegin(),n); + if (err!=hipSuccess) + return UCL_MEMORY_ERROR; + mat.cq()=dev.cq(); + return UCL_SUCCESS; +} + +template +inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t rows, + const size_t cols, size_t &pitch, + const enum UCL_MEMOPT kind) { + hipError_t err; + size_t upitch; + err=hipMallocPitch((void**)&mat.cbegin(),&upitch, + cols*sizeof(typename mat_type::data_type),rows); + pitch=static_cast(upitch); + if (err!=hipSuccess) + return UCL_MEMORY_ERROR; + mat.cq()=cm.cq(); + return UCL_SUCCESS; +} + +template +inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows, + const size_t cols, size_t &pitch, + const enum UCL_MEMOPT kind) { + hipError_t err; + size_t upitch; + err=hipMallocPitch((void**)&mat.cbegin(),&upitch, + cols*sizeof(typename mat_type::data_type),rows); + pitch=static_cast(upitch); + if (err!=hipSuccess) + return UCL_MEMORY_ERROR; + mat.cq()=d.cq(); + return UCL_SUCCESS; +} + +template +inline void _device_free(mat_type &mat) { + if (mat.kind()!=UCL_VIEW){ + CU_DESTRUCT_CALL(hipFree((void*)mat.cbegin())); + } +} + +template +inline int _device_resize(mat_type &mat, const size_t n) { + _device_free(mat); + hipError_t err=hipMalloc((void**)&mat.cbegin(),n); + if (err!=hipSuccess) + return UCL_MEMORY_ERROR; + return UCL_SUCCESS; +} + +template +inline int _device_resize(mat_type &mat, const size_t rows, + const size_t cols, size_t &pitch) { + _device_free(mat); + hipError_t err; + size_t upitch; + err=hipMallocPitch((void**)&mat.cbegin(),&upitch, + cols*sizeof(typename mat_type::data_type),rows); + pitch=static_cast(upitch); + if (err!=hipSuccess) + return UCL_MEMORY_ERROR; + return UCL_SUCCESS; +} + +inline void _device_view(hipDeviceptr_t *ptr, hipDeviceptr_t &in) { + *ptr=in; +} + +template +inline void _device_view(hipDeviceptr_t *ptr, numtyp *in) { + *ptr=0; +} + +inline void _device_view(hipDeviceptr_t *ptr, hipDeviceptr_t &in, + const size_t offset, const size_t numsize) { + *ptr=(hipDeviceptr_t)(((char*)in)+offset*numsize); +} + +template +inline void _device_view(hipDeviceptr_t *ptr, numtyp *in, + const size_t offset, const size_t numsize) { + *ptr=0; +} + +// -------------------------------------------------------------------------- +// - DEVICE IMAGE ALLOCATION ROUTINES +// -------------------------------------------------------------------------- +template +inline void _device_image_alloc(mat_type &mat, copy_type &cm, const size_t rows, + const size_t cols) { + assert(0==1); +} + +template +inline void _device_image_alloc(mat_type &mat, UCL_Device &d, const size_t rows, + const size_t cols) { + assert(0==1); +} + +template +inline void _device_image_free(mat_type &mat) { + assert(0==1); +} + +// -------------------------------------------------------------------------- +// - ZERO ROUTINES +// -------------------------------------------------------------------------- +inline void _host_zero(void *ptr, const size_t n) { + memset(ptr,0,n); +} + +template +inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) { + CU_SAFE_CALL(hipMemsetAsync((void*)mat.cbegin(),0,n,cq)); +} + + +// -------------------------------------------------------------------------- +// - MEMCPY ROUTINES +// -------------------------------------------------------------------------- + + +template +hipMemcpyKind _memcpy_kind(mat1 &dst, const mat2 &src){ + assert(mat1::MEM_TYPE < 2 && mat2::MEM_TYPE < 2); + return (hipMemcpyKind)((1 - mat2::MEM_TYPE)*2 + (1 - mat1::MEM_TYPE)); +} + +template +inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n) { + CU_SAFE_CALL(hipMemcpy((void*)dst.begin(), (void*)src.begin(), n, _memcpy_kind(dst, src))); +} + +template +inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n, hipStream_t &cq) { + CU_SAFE_CALL(hipMemcpyAsync((void*)dst.begin(), (void*)src.begin(), n, _memcpy_kind(dst, src), cq)); +} + +template +inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src, + const size_t spitch, const size_t cols, + const size_t rows) { + CU_SAFE_CALL(hipMemcpy2D((void*)dst.begin(), dpitch, (void*)src.begin(), spitch, cols, rows, _memcpy_kind(dst, src))); +} + +template +inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src, + const size_t spitch, const size_t cols, + const size_t rows,hipStream_t &cq) { + CU_SAFE_CALL(hipMemcpy2DAsync((void*)dst.begin(), dpitch, (void*)src.begin(), spitch, cols, rows, _memcpy_kind(dst, src), cq)); +} + +} // namespace ucl_cudart + +#endif + diff --git a/lib/gpu/geryon/hip_texture.h b/lib/gpu/geryon/hip_texture.h new file mode 100644 index 0000000000..e7aa4e1461 --- /dev/null +++ b/lib/gpu/geryon/hip_texture.h @@ -0,0 +1,113 @@ +/* ----------------------------------------------------------------------- + Copyright (2010) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the Simplified BSD License. + ----------------------------------------------------------------------- */ + +#ifndef HIP_TEXTURE +#define HIP_TEXTURE + + +#include +#include "hip_kernel.h" +#include "hip_mat.h" + +namespace ucl_hip { + +#ifdef __HIP_PLATFORM_NVCC__ +inline hipError_t hipModuleGetTexRef(CUtexref* texRef, hipModule_t hmod, const char* name){ + return hipCUResultTohipError(cuModuleGetTexRef(texRef, hmod, name)); +} +inline hipError_t hipTexRefSetFormat(CUtexref tex, hipArray_Format fmt, int NumPackedComponents) { + return hipCUResultTohipError(cuTexRefSetFormat(tex, (CUarray_format)fmt, NumPackedComponents )); +} +inline hipError_t hipTexRefSetAddress(size_t* offset, CUtexref tex, hipDeviceptr_t devPtr, size_t size) { + return hipCUResultTohipError(cuTexRefSetAddress(offset, tex, devPtr, size)); +} +#endif + +/// Class storing a texture reference +class UCL_Texture { + public: + UCL_Texture() {} + ~UCL_Texture() {} + /// Construct with a specified texture reference + inline UCL_Texture(UCL_Program &prog, const char *texture_name) + { get_texture(prog,texture_name); } + /// Set the texture reference for this object + inline void get_texture(UCL_Program &prog, const char *texture_name) + { + #ifdef __HIP_PLATFORM_NVCC__ + CU_SAFE_CALL(hipModuleGetTexRef(&_tex, prog._module, texture_name)); + #else + size_t _global_var_size; + CU_SAFE_CALL(hipModuleGetGlobal(&_device_ptr_to_global_var, &_global_var_size, prog._module, texture_name)); + #endif + } + + /// Bind a float array where each fetch grabs a vector of length numel + template + inline void bind_float(UCL_D_Vec &vec, const unsigned numel) + { _bind_float(vec,numel); } + + /// Bind a float array where each fetch grabs a vector of length numel + template + inline void bind_float(UCL_D_Mat &vec, const unsigned numel) + { _bind_float(vec,numel); } + + /// Bind a float array where each fetch grabs a vector of length numel + template + inline void bind_float(UCL_Vector &vec, const unsigned numel) + { _bind_float(vec.device,numel); } + + /// Bind a float array where each fetch grabs a vector of length numel + template + inline void bind_float(UCL_Matrix &vec, const unsigned numel) + { _bind_float(vec.device,numel); } + + /// Unbind the texture reference from the memory allocation + inline void unbind() { } + + /// Make a texture reference available to kernel + inline void allow(UCL_Kernel &kernel) { + //#if CUDA_VERSION < 4000 + //CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex)); + //#endif + } + + private: +#ifdef __HIP_PLATFORM_NVCC__ + CUtexref _tex; +#else + void* _device_ptr_to_global_var; +#endif + friend class UCL_Kernel; + + template + inline void _bind_float(mat_typ &vec, const unsigned numel) { + #ifdef UCL_DEBUG + assert(numel!=0 && numel<5); + #endif + +#ifdef __HIP_PLATFORM_NVCC__ + if (vec.element_size()==sizeof(float)) + CU_SAFE_CALL(hipTexRefSetFormat(_tex, HIP_AD_FORMAT_FLOAT, numel)); + else { + if (numel>2) + CU_SAFE_CALL(hipTexRefSetFormat(_tex, HIP_AD_FORMAT_SIGNED_INT32, numel)); + else + CU_SAFE_CALL(hipTexRefSetFormat(_tex,HIP_AD_FORMAT_SIGNED_INT32,numel*2)); + } + CU_SAFE_CALL(hipTexRefSetAddress(NULL, _tex, vec.cbegin(), vec.numel()*vec.element_size())); +#else + void* data_ptr = (void*)vec.cbegin(); + CU_SAFE_CALL(hipMemcpyHtoD(hipDeviceptr_t(_device_ptr_to_global_var), &data_ptr, sizeof(void*))); +#endif + } +}; + +} // namespace + +#endif + diff --git a/lib/gpu/geryon/hip_timer.h b/lib/gpu/geryon/hip_timer.h new file mode 100644 index 0000000000..3be0b8cfd6 --- /dev/null +++ b/lib/gpu/geryon/hip_timer.h @@ -0,0 +1,107 @@ +/* ----------------------------------------------------------------------- + Copyright (2010) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the Simplified BSD License. + ----------------------------------------------------------------------- */ + +#ifndef HIP_TIMER_H +#define HIP_TIMER_H + + +#include +#include "hip_macros.h" +#include "hip_device.h" + +namespace ucl_hip { + +/// Class for timing CUDA Driver events +class UCL_Timer { + public: + inline UCL_Timer() : _total_time(0.0f), _initialized(false) { } + inline UCL_Timer(UCL_Device &dev) : _total_time(0.0f), _initialized(false) + { init(dev); } + + inline ~UCL_Timer() { clear(); } + + /// Clear any data associated with timer + /** \note init() must be called to reuse timer after a clear() **/ + inline void clear() { + if (_initialized) { + CU_DESTRUCT_CALL(hipEventDestroy(start_event)); + CU_DESTRUCT_CALL(hipEventDestroy(stop_event)); + _initialized=false; + _total_time=0.0; + } + } + + /// Initialize default command queue for timing + inline void init(UCL_Device &dev) { init(dev, dev.cq()); } + + /// Initialize command queue for timing + inline void init(UCL_Device &dev, command_queue &cq) { + clear(); + _cq=cq; + _initialized=true; + CU_SAFE_CALL( hipEventCreateWithFlags(&start_event,0) ); + CU_SAFE_CALL( hipEventCreateWithFlags(&stop_event,0) ); + } + + /// Start timing on command queue + inline void start() { CU_SAFE_CALL(hipEventRecord(start_event,_cq)); } + + /// Stop timing on command queue + inline void stop() { CU_SAFE_CALL(hipEventRecord(stop_event,_cq)); } + + /// Block until the start event has been reached on device + inline void sync_start() + { CU_SAFE_CALL(hipEventSynchronize(start_event)); } + + /// Block until the stop event has been reached on device + inline void sync_stop() + { CU_SAFE_CALL(hipEventSynchronize(stop_event)); } + + /// Set the time elapsed to zero (not the total_time) + inline void zero() { + CU_SAFE_CALL(hipEventRecord(start_event,_cq)); + CU_SAFE_CALL(hipEventRecord(stop_event,_cq)); + } + + /// Set the total time to zero + inline void zero_total() { _total_time=0.0; } + + /// Add time from previous start and stop to total + /** Forces synchronization **/ + inline double add_to_total() + { double t=time(); _total_time+=t; return t/1000.0; } + + /// Add a user specified time to the total (ms) + inline void add_time_to_total(const double t) { _total_time+=t; } + + /// Return the time (ms) of last start to stop - Forces synchronization + inline double time() { + float timer; + CU_SAFE_CALL(hipEventSynchronize(stop_event)); + CU_SAFE_CALL( hipEventElapsedTime(&timer,start_event,stop_event) ); + return timer; + } + + /// Return the time (s) of last start to stop - Forces synchronization + inline double seconds() { return time()/1000.0; } + + /// Return the total time in ms + inline double total_time() { return _total_time; } + + /// Return the total time in seconds + inline double total_seconds() { return _total_time/1000.0; } + + private: + hipEvent_t start_event, stop_event; + hipStream_t _cq; + double _total_time; + bool _initialized; +}; + +} // namespace + +#endif diff --git a/lib/gpu/geryon/ucl_get_devices.cpp b/lib/gpu/geryon/ucl_get_devices.cpp index 1fa758fb46..b8dfc6f7b1 100644 --- a/lib/gpu/geryon/ucl_get_devices.cpp +++ b/lib/gpu/geryon/ucl_get_devices.cpp @@ -36,6 +36,11 @@ using namespace ucl_cudadr; using namespace ucl_cudart; #endif +#ifdef UCL_HIP +#include "hip_device.h" +using namespace ucl_hip; +#endif + int main(int argc, char** argv) { UCL_Device cop; std::cout << "Found " << cop.num_platforms() << " platform(s).\n"; diff --git a/lib/gpu/lal_answer.cpp b/lib/gpu/lal_answer.cpp index aa6d33d334..95d40c0d0a 100644 --- a/lib/gpu/lal_answer.cpp +++ b/lib/gpu/lal_answer.cpp @@ -179,13 +179,15 @@ double AnswerT::energy_virial(double *eatom, double **vatom, if (_eflag) { for (int i=0; i<_inum; i++) evdwl+=engv[i]; - if (_ef_atom) - if (_ilist==NULL) + if (_ef_atom) { + if (_ilist==NULL) { for (int i=0; i<_inum; i++) eatom[i]+=engv[i]; - else + } else { for (int i=0; i<_inum; i++) eatom[_ilist[i]]+=engv[i]; + } + } vstart=_inum; } if (_vflag) { @@ -193,7 +195,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom, for (int j=0; j<6; j++) { for (int i=vstart; i +#include +#endif + namespace LAMMPS_AL { #define AtomT Atom @@ -70,6 +75,26 @@ bool AtomT::alloc(const int nall) { } #endif + #ifdef USE_HIP_DEVICE_SORT + if (_gpu_nbor==1) { + size_t temp_storage_bytes = 0; + if(hipSuccess != hipcub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, sort_out_keys, sort_out_keys, sort_out_values, sort_out_values, _max_atoms)) + return false; + if(sort_out_size < _max_atoms){ + if (sort_out_keys ) hipFree(sort_out_keys); + if (sort_out_values) hipFree(sort_out_values); + hipMalloc(&sort_out_keys , _max_atoms * sizeof(unsigned)); + hipMalloc(&sort_out_values, _max_atoms * sizeof(int )); + sort_out_size = _max_atoms; + } + if(temp_storage_bytes > sort_temp_storage_size){ + if(sort_temp_storage) hipFree(sort_temp_storage); + hipMalloc(&sort_temp_storage, temp_storage_bytes); + sort_temp_storage_size = temp_storage_bytes; + } + } + #endif + // --------------------------- Device allocations int gpu_bytes=0; success=success && (x.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY, @@ -184,6 +209,27 @@ bool AtomT::add_fields(const bool charge, const bool rot, return false; } #endif + + #ifdef USE_HIP_DEVICE_SORT + if (_gpu_nbor==1) { + size_t temp_storage_bytes = 0; + if(hipSuccess != hipcub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, sort_out_keys, sort_out_keys, sort_out_values, sort_out_values, _max_atoms)) + return false; + if(sort_out_size < _max_atoms){ + if (sort_out_keys ) hipFree(sort_out_keys); + if (sort_out_values) hipFree(sort_out_values); + hipMalloc(&sort_out_keys , _max_atoms * sizeof(unsigned)); + hipMalloc(&sort_out_values, _max_atoms * sizeof(int )); + sort_out_size = _max_atoms; + } + if(temp_storage_bytes > sort_temp_storage_size){ + if(sort_temp_storage) hipFree(sort_temp_storage); + hipMalloc(&sort_temp_storage, temp_storage_bytes); + sort_temp_storage_size = temp_storage_bytes; + } + } + #endif + success=success && (dev_particle_id.alloc(_max_atoms,*dev, UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=dev_particle_id.row_bytes(); @@ -275,6 +321,19 @@ void AtomT::clear_resize() { if (_gpu_nbor==1) cudppDestroyPlan(sort_plan); #endif + #ifdef USE_HIP_DEVICE_SORT + if (_gpu_nbor==1) { + if(sort_out_keys) hipFree(sort_out_keys); + if(sort_out_values) hipFree(sort_out_values); + if(sort_temp_storage) hipFree(sort_temp_storage); + sort_out_keys = nullptr; + sort_out_values = nullptr; + sort_temp_storage = nullptr; + sort_temp_storage_size = 0; + sort_out_size = 0; + } + #endif + if (_gpu_nbor==2) { host_particle_id.clear(); host_cell_id.clear(); @@ -326,6 +385,22 @@ void AtomT::sort_neighbor(const int num_atoms) { UCL_GERYON_EXIT; } #endif + + #ifdef USE_HIP_DEVICE_SORT + if(sort_out_size < num_atoms){ + printf("AtomT::sort_neighbor: invalid temp buffer size\n"); + UCL_GERYON_EXIT; + } + if(hipSuccess != hipcub::DeviceRadixSort::SortPairs(sort_temp_storage, sort_temp_storage_size, (unsigned *)dev_cell_id.begin(), sort_out_keys, (int *)dev_particle_id.begin(), sort_out_values, num_atoms)){ + printf("AtomT::sort_neighbor: DeviceRadixSort error\n"); + UCL_GERYON_EXIT; + } + if(hipSuccess != hipMemcpy((unsigned *)dev_cell_id.begin(), sort_out_keys , num_atoms*sizeof(unsigned), hipMemcpyDeviceToDevice) || + hipSuccess != hipMemcpy((int *) dev_particle_id.begin(), sort_out_values, num_atoms*sizeof(int ), hipMemcpyDeviceToDevice)){ + printf("AtomT::sort_neighbor: copy output error\n"); + UCL_GERYON_EXIT; + } + #endif } #ifdef GPU_CAST diff --git a/lib/gpu/lal_atom.cu b/lib/gpu/lal_atom.cu index 28ff31c566..99c76ba625 100644 --- a/lib/gpu/lal_atom.cu +++ b/lib/gpu/lal_atom.cu @@ -13,7 +13,7 @@ // email : brownw@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_preprocessor.h" #endif diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h index 57880d7ca9..e39740d6c8 100644 --- a/lib/gpu/lal_atom.h +++ b/lib/gpu/lal_atom.h @@ -29,6 +29,11 @@ using namespace ucl_opencl; #include "geryon/nvc_mat.h" #include "geryon/nvc_kernel.h" using namespace ucl_cudart; +#elif defined(USE_HIP) +#include "geryon/hip_timer.h" +#include "geryon/hip_mat.h" +#include "geryon/hip_kernel.h" +using namespace ucl_hip; #else #include "geryon/nvd_timer.h" #include "geryon/nvd_mat.h" @@ -477,6 +482,14 @@ class Atom { CUDPPConfiguration sort_config; CUDPPHandle sort_plan; #endif + + #ifdef USE_HIP_DEVICE_SORT + unsigned* sort_out_keys = nullptr; + int* sort_out_values = nullptr; + void* sort_temp_storage = nullptr; + size_t sort_temp_storage_size = 0; + size_t sort_out_size = 0; + #endif }; } diff --git a/lib/gpu/lal_aux_fun1.h b/lib/gpu/lal_aux_fun1.h index 47a216ff6f..5b7150d950 100644 --- a/lib/gpu/lal_aux_fun1.h +++ b/lib/gpu/lal_aux_fun1.h @@ -13,7 +13,7 @@ // email : brownw@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_preprocessor.h" #endif diff --git a/lib/gpu/lal_base_atomic.h b/lib/gpu/lal_base_atomic.h index e3e9829abc..fef810b17b 100644 --- a/lib/gpu/lal_base_atomic.h +++ b/lib/gpu/lal_base_atomic.h @@ -24,6 +24,8 @@ #include "geryon/ocl_texture.h" #elif defined(USE_CUDART) #include "geryon/nvc_texture.h" +#elif defined(USE_HIP) +#include "geryon/hip_texture.h" #else #include "geryon/nvd_texture.h" #endif diff --git a/lib/gpu/lal_base_charge.h b/lib/gpu/lal_base_charge.h index 64c19554b9..ea81dcdc4e 100644 --- a/lib/gpu/lal_base_charge.h +++ b/lib/gpu/lal_base_charge.h @@ -25,6 +25,8 @@ #include "geryon/ocl_texture.h" #elif defined(USE_CUDART) #include "geryon/nvc_texture.h" +#elif defined(USE_HIP) +#include "geryon/hip_texture.h" #else #include "geryon/nvd_texture.h" #endif diff --git a/lib/gpu/lal_base_dipole.h b/lib/gpu/lal_base_dipole.h index b51c4303cf..31a2a2d5f7 100644 --- a/lib/gpu/lal_base_dipole.h +++ b/lib/gpu/lal_base_dipole.h @@ -23,6 +23,8 @@ #ifdef USE_OPENCL #include "geryon/ocl_texture.h" +#elif defined(USE_HIP) +#include "geryon/hip_texture.h" #else #include "geryon/nvd_texture.h" #endif diff --git a/lib/gpu/lal_base_dpd.h b/lib/gpu/lal_base_dpd.h index 7a75282d0a..1e6f2ab1f2 100644 --- a/lib/gpu/lal_base_dpd.h +++ b/lib/gpu/lal_base_dpd.h @@ -23,6 +23,8 @@ #ifdef USE_OPENCL #include "geryon/ocl_texture.h" +#elif defined(USE_HIP) +#include "geryon/hip_texture.h" #else #include "geryon/nvd_texture.h" #endif diff --git a/lib/gpu/lal_base_ellipsoid.h b/lib/gpu/lal_base_ellipsoid.h index 7deeccbf44..061baac5b6 100644 --- a/lib/gpu/lal_base_ellipsoid.h +++ b/lib/gpu/lal_base_ellipsoid.h @@ -24,6 +24,8 @@ #include "geryon/ocl_texture.h" #elif defined(USE_CUDART) #include "geryon/nvc_texture.h" +#elif defined(USE_HIP) +#include "geryon/hip_texture.h" #else #include "geryon/nvd_texture.h" #endif diff --git a/lib/gpu/lal_base_three.h b/lib/gpu/lal_base_three.h index f5f36863c4..75589f705d 100644 --- a/lib/gpu/lal_base_three.h +++ b/lib/gpu/lal_base_three.h @@ -24,6 +24,8 @@ #include "geryon/ocl_texture.h" #elif defined(USE_CUDART) #include "geryon/nvc_texture.h" +#elif defined(USE_HIP) +#include "geryon/hip_texture.h" #else #include "geryon/nvd_texture.h" #endif diff --git a/lib/gpu/lal_beck.cu b/lib/gpu/lal_beck.cu index 7d72128b5f..bdfa57a0ce 100644 --- a/lib/gpu/lal_beck.cu +++ b/lib/gpu/lal_beck.cu @@ -13,12 +13,12 @@ // email : nguyentd@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; +_texture( pos_tex,float4); #else -texture pos_tex; +_texture_2d( pos_tex,int4); #endif #else #define pos_tex x_ diff --git a/lib/gpu/lal_born.cu b/lib/gpu/lal_born.cu index 0ca7fea5fe..6e1d7d95a0 100644 --- a/lib/gpu/lal_born.cu +++ b/lib/gpu/lal_born.cu @@ -13,12 +13,12 @@ // email : nguyentd@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; +_texture( pos_tex,float4); #else -texture pos_tex; +_texture_2d( pos_tex,int4); #endif #else #define pos_tex x_ diff --git a/lib/gpu/lal_born_coul_long.cu b/lib/gpu/lal_born_coul_long.cu index 71e5e0ae50..441ce4beb5 100644 --- a/lib/gpu/lal_born_coul_long.cu +++ b/lib/gpu/lal_born_coul_long.cu @@ -13,15 +13,15 @@ // email : nguyentd@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture q_tex; +_texture( pos_tex,float4); +_texture( q_tex,float); #else -texture pos_tex; -texture q_tex; +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); #endif #else diff --git a/lib/gpu/lal_born_coul_long_cs.cu b/lib/gpu/lal_born_coul_long_cs.cu index b3e79d9ec8..f4b6da2d0d 100644 --- a/lib/gpu/lal_born_coul_long_cs.cu +++ b/lib/gpu/lal_born_coul_long_cs.cu @@ -13,15 +13,16 @@ // email : ndactrung@gmail.com // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" + #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture q_tex; +_texture( pos_tex,float4); +_texture( q_tex,float); #else -texture pos_tex; -texture q_tex; +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); #endif #else diff --git a/lib/gpu/lal_born_coul_wolf.cu b/lib/gpu/lal_born_coul_wolf.cu index 2c2249feeb..e34367e18b 100644 --- a/lib/gpu/lal_born_coul_wolf.cu +++ b/lib/gpu/lal_born_coul_wolf.cu @@ -13,15 +13,15 @@ // email : nguyentd@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture q_tex; +_texture( pos_tex,float4); +_texture( q_tex,float); #else -texture pos_tex; -texture q_tex; +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); #endif #else diff --git a/lib/gpu/lal_born_coul_wolf_cs.cu b/lib/gpu/lal_born_coul_wolf_cs.cu index 847387bfe8..1a02420736 100644 --- a/lib/gpu/lal_born_coul_wolf_cs.cu +++ b/lib/gpu/lal_born_coul_wolf_cs.cu @@ -13,15 +13,15 @@ // email : ndactrung@gmail.com // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture q_tex; +_texture( pos_tex,float4); +_texture( q_tex,float); #else -texture pos_tex; -texture q_tex; +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); #endif #else diff --git a/lib/gpu/lal_buck.cu b/lib/gpu/lal_buck.cu index c1e1c7d7e2..c23186f2d8 100644 --- a/lib/gpu/lal_buck.cu +++ b/lib/gpu/lal_buck.cu @@ -13,12 +13,12 @@ // email : nguyentd@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; +_texture( pos_tex,float4); #else -texture pos_tex; +_texture_2d( pos_tex,int4); #endif #else #define pos_tex x_ diff --git a/lib/gpu/lal_buck_coul.cu b/lib/gpu/lal_buck_coul.cu index 6f0d414825..2282532f4a 100644 --- a/lib/gpu/lal_buck_coul.cu +++ b/lib/gpu/lal_buck_coul.cu @@ -13,15 +13,15 @@ // email : nguyentd@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture q_tex; +_texture( pos_tex,float4); +_texture( q_tex,float); #else -texture pos_tex; -texture q_tex; +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); #endif #else diff --git a/lib/gpu/lal_buck_coul_long.cu b/lib/gpu/lal_buck_coul_long.cu index da3237a31f..469c235571 100644 --- a/lib/gpu/lal_buck_coul_long.cu +++ b/lib/gpu/lal_buck_coul_long.cu @@ -13,15 +13,15 @@ // email : nguyentd@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture q_tex; +_texture( pos_tex,float4); +_texture( q_tex,float); #else -texture pos_tex; -texture q_tex; +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); #endif #else diff --git a/lib/gpu/lal_charmm_long.cu b/lib/gpu/lal_charmm_long.cu index 244131f833..a797707057 100644 --- a/lib/gpu/lal_charmm_long.cu +++ b/lib/gpu/lal_charmm_long.cu @@ -13,15 +13,15 @@ // email : brownw@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture q_tex; +_texture( pos_tex,float4); +_texture( q_tex,float); #else -texture pos_tex; -texture q_tex; +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); #endif #else diff --git a/lib/gpu/lal_colloid.cu b/lib/gpu/lal_colloid.cu index 28a9809b19..437faff25b 100644 --- a/lib/gpu/lal_colloid.cu +++ b/lib/gpu/lal_colloid.cu @@ -13,12 +13,12 @@ // email : nguyentd@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; +_texture( pos_tex,float4); #else -texture pos_tex; +_texture_2d( pos_tex,int4); #endif #else #define pos_tex x_ diff --git a/lib/gpu/lal_coul.cu b/lib/gpu/lal_coul.cu index 503e674c81..21d849bb6f 100644 --- a/lib/gpu/lal_coul.cu +++ b/lib/gpu/lal_coul.cu @@ -13,15 +13,15 @@ // email : ndtrung@umich.edu // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture q_tex; +_texture( pos_tex,float4); +_texture( q_tex,float); #else -texture pos_tex; -texture q_tex; +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); #endif #else diff --git a/lib/gpu/lal_coul_debye.cu b/lib/gpu/lal_coul_debye.cu index 464a1b18de..ab8bc5b961 100644 --- a/lib/gpu/lal_coul_debye.cu +++ b/lib/gpu/lal_coul_debye.cu @@ -13,15 +13,15 @@ // email : ndtrung@umich.edu // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture q_tex; +_texture( pos_tex,float4); +_texture( q_tex,float); #else -texture pos_tex; -texture q_tex; +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); #endif #else diff --git a/lib/gpu/lal_coul_dsf.cu b/lib/gpu/lal_coul_dsf.cu index 82c44cd382..147ac68552 100644 --- a/lib/gpu/lal_coul_dsf.cu +++ b/lib/gpu/lal_coul_dsf.cu @@ -13,15 +13,15 @@ // email : nguyentd@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture q_tex; +_texture( pos_tex,float4); +_texture( q_tex,float); #else -texture pos_tex; -texture q_tex; +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); #endif #else diff --git a/lib/gpu/lal_coul_long.cu b/lib/gpu/lal_coul_long.cu index 365195e00c..f97a039629 100644 --- a/lib/gpu/lal_coul_long.cu +++ b/lib/gpu/lal_coul_long.cu @@ -13,15 +13,15 @@ // email : a.kohlmeyer@temple.edu // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture q_tex; +_texture( pos_tex,float4); +_texture( q_tex,float); #else -texture pos_tex; -texture q_tex; +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); #endif #else diff --git a/lib/gpu/lal_coul_long_cs.cu b/lib/gpu/lal_coul_long_cs.cu index 3c34666131..1479157944 100644 --- a/lib/gpu/lal_coul_long_cs.cu +++ b/lib/gpu/lal_coul_long_cs.cu @@ -13,15 +13,15 @@ // email : ndactrung@gmail.com // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture q_tex; +_texture( pos_tex,float4); +_texture( q_tex,float); #else -texture pos_tex; -texture q_tex; +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); #endif #else diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index 5bd306ea5b..aaf74ed28c 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -268,7 +268,7 @@ int DeviceT::init(Answer &ans, const bool charge, gpu_nbor=1; else if (_gpu_mode==Device::GPU_HYB_NEIGH) gpu_nbor=2; - #ifndef USE_CUDPP + #if !defined(USE_CUDPP) && !defined(USE_HIP_DEVICE_SORT) if (gpu_nbor==1) gpu_nbor=2; #endif @@ -341,7 +341,7 @@ int DeviceT::init_nbor(Neighbor *nbor, const int nlocal, gpu_nbor=1; else if (_gpu_mode==Device::GPU_HYB_NEIGH) gpu_nbor=2; - #ifndef USE_CUDPP + #if !defined(USE_CUDPP) && !defined(USE_HIP_DEVICE_SORT) if (gpu_nbor==1) gpu_nbor=2; #endif @@ -712,7 +712,7 @@ int DeviceT::compile_kernels() { gpu_lib_data.update_host(false); _ptx_arch=static_cast(gpu_lib_data[0])/100.0; - #ifndef USE_OPENCL + #if !(defined(USE_OPENCL) || defined(USE_HIP)) if (_ptx_arch>gpu->arch() || floor(_ptx_arch)arch())) return -4; #endif diff --git a/lib/gpu/lal_device.cu b/lib/gpu/lal_device.cu index 37d0758845..afc7a0b988 100644 --- a/lib/gpu/lal_device.cu +++ b/lib/gpu/lal_device.cu @@ -13,7 +13,7 @@ // email : brownw@ornl.gov // *************************************************************************** -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_preprocessor.h" #endif diff --git a/lib/gpu/lal_dipole_lj.cu b/lib/gpu/lal_dipole_lj.cu index 745bdb7f27..8ea49e7f60 100644 --- a/lib/gpu/lal_dipole_lj.cu +++ b/lib/gpu/lal_dipole_lj.cu @@ -13,16 +13,16 @@ // email : nguyentd@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture q_tex; -texture mu_tex; +_texture( pos_tex,float4); +_texture( q_tex,float); +_texture( mu_tex,float4); #else -texture pos_tex; -texture q_tex; -texture mu_tex; +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); +_texture_2d( mu_tex,int4); #endif #else diff --git a/lib/gpu/lal_dipole_lj_sf.cu b/lib/gpu/lal_dipole_lj_sf.cu index 9847e84823..9d753d9b63 100644 --- a/lib/gpu/lal_dipole_lj_sf.cu +++ b/lib/gpu/lal_dipole_lj_sf.cu @@ -13,17 +13,17 @@ // email : nguyentd@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture q_tex; -texture mu_tex; +_texture( pos_tex,float4); +_texture( q_tex,float); +_texture( mu_tex,float4); #else -texture pos_tex; -texture q_tex; -texture mu_tex; +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); +_texture_2d( mu_tex,int4); #endif #else diff --git a/lib/gpu/lal_dipole_long_lj.cu b/lib/gpu/lal_dipole_long_lj.cu index f888dece9b..95c3b7a3db 100644 --- a/lib/gpu/lal_dipole_long_lj.cu +++ b/lib/gpu/lal_dipole_long_lj.cu @@ -13,16 +13,16 @@ // email : nguyentd@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture q_tex; -texture mu_tex; +_texture( pos_tex,float4); +_texture( q_tex,float); +_texture( mu_tex,float4); #else -texture pos_tex; -texture q_tex; -texture mu_tex; +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); +_texture_2d( mu_tex,int4); #endif #else diff --git a/lib/gpu/lal_dpd.cu b/lib/gpu/lal_dpd.cu index 462401ad70..d97f430f77 100644 --- a/lib/gpu/lal_dpd.cu +++ b/lib/gpu/lal_dpd.cu @@ -13,14 +13,14 @@ // email : nguyentd@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture vel_tex; +_texture( pos_tex,float4); +_texture( vel_tex,float4); #else -texture pos_tex; -texture vel_tex; +_texture_2d( pos_tex,int4); +_texture_2d( vel_tex,int4); #endif #else #define pos_tex x_ diff --git a/lib/gpu/lal_eam.cu b/lib/gpu/lal_eam.cu index 13440b7d45..9427b1832f 100644 --- a/lib/gpu/lal_eam.cu +++ b/lib/gpu/lal_eam.cu @@ -13,27 +13,27 @@ // email : brownw@ornl.gov nguyentd@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture fp_tex; -texture rhor_sp1_tex; -texture rhor_sp2_tex; -texture frho_sp1_tex; -texture frho_sp2_tex; -texture z2r_sp1_tex; -texture z2r_sp2_tex; +_texture( pos_tex,float4); +_texture( fp_tex,float); +_texture( rhor_sp1_tex,float4); +_texture( rhor_sp2_tex,float4); +_texture( frho_sp1_tex,float4); +_texture( frho_sp2_tex,float4); +_texture( z2r_sp1_tex,float4); +_texture( z2r_sp2_tex,float4); #else -texture pos_tex; -texture fp_tex; -texture rhor_sp1_tex; -texture rhor_sp2_tex; -texture frho_sp1_tex; -texture frho_sp2_tex; -texture z2r_sp1_tex; -texture z2r_sp2_tex; +_texture( pos_tex,int4); +_texture( fp_tex,int2); +_texture( rhor_sp1_tex,int4); +_texture( rhor_sp2_tex,int4); +_texture( frho_sp1_tex,int4); +_texture( frho_sp2_tex,int4); +_texture( z2r_sp1_tex,int4); +_texture( z2r_sp2_tex,int4); #endif #else diff --git a/lib/gpu/lal_ellipsoid_extra.h b/lib/gpu/lal_ellipsoid_extra.h index 71668f5e02..e6122c7404 100644 --- a/lib/gpu/lal_ellipsoid_extra.h +++ b/lib/gpu/lal_ellipsoid_extra.h @@ -18,12 +18,14 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex, quat_tex; +_texture( pos_tex, float4); +_texture( quat_tex,float4); #else -texture pos_tex, quat_tex; +_texture_2d( pos_tex,int4); +_texture_2d( quat_tex,int4); #endif #else #define pos_tex x_ diff --git a/lib/gpu/lal_ellipsoid_nbor.cu b/lib/gpu/lal_ellipsoid_nbor.cu index cac77f5dd3..e6eedc7159 100644 --- a/lib/gpu/lal_ellipsoid_nbor.cu +++ b/lib/gpu/lal_ellipsoid_nbor.cu @@ -13,12 +13,12 @@ // email : brownw@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_preprocessor.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; +_texture( pos_tex,float4); #else -texture pos_tex; +_texture_2d( pos_tex,int4); #endif #else #define pos_tex x_ diff --git a/lib/gpu/lal_gauss.cu b/lib/gpu/lal_gauss.cu index 98e71ea413..f9d3741537 100644 --- a/lib/gpu/lal_gauss.cu +++ b/lib/gpu/lal_gauss.cu @@ -13,12 +13,12 @@ // email : nguyentd@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; +_texture( pos_tex,float4); #else -texture pos_tex; +_texture_2d( pos_tex,int4); #endif #else #define pos_tex x_ diff --git a/lib/gpu/lal_gayberne.cu b/lib/gpu/lal_gayberne.cu index cd1ee59fc6..5c035da004 100644 --- a/lib/gpu/lal_gayberne.cu +++ b/lib/gpu/lal_gayberne.cu @@ -13,7 +13,7 @@ // email : brownw@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_ellipsoid_extra.h" #endif diff --git a/lib/gpu/lal_gayberne_lj.cu b/lib/gpu/lal_gayberne_lj.cu index 7925b72784..eb9c797dc7 100644 --- a/lib/gpu/lal_gayberne_lj.cu +++ b/lib/gpu/lal_gayberne_lj.cu @@ -13,7 +13,7 @@ // email : brownw@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_ellipsoid_extra.h" #endif diff --git a/lib/gpu/lal_lj.cu b/lib/gpu/lal_lj.cu index 5838ac95cf..716346a83d 100644 --- a/lib/gpu/lal_lj.cu +++ b/lib/gpu/lal_lj.cu @@ -13,12 +13,12 @@ // email : brownw@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; +_texture( pos_tex,float4); #else -texture pos_tex; +_texture_2d( pos_tex,int4); #endif #else #define pos_tex x_ diff --git a/lib/gpu/lal_lj96.cu b/lib/gpu/lal_lj96.cu index 8dd63ef920..aa06caa4ae 100644 --- a/lib/gpu/lal_lj96.cu +++ b/lib/gpu/lal_lj96.cu @@ -13,12 +13,12 @@ // email : brownw@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; +_texture( pos_tex,float4); #else -texture pos_tex; +_texture_2d( pos_tex,int4); #endif #else #define pos_tex x_ diff --git a/lib/gpu/lal_lj_class2_long.cu b/lib/gpu/lal_lj_class2_long.cu index 41ceca35d7..4e1bf9c1f7 100644 --- a/lib/gpu/lal_lj_class2_long.cu +++ b/lib/gpu/lal_lj_class2_long.cu @@ -13,15 +13,15 @@ // email : brownw@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture q_tex; +_texture( pos_tex,float4); +_texture( q_tex,float); #else -texture pos_tex; -texture q_tex; +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); #endif #else diff --git a/lib/gpu/lal_lj_coul.cu b/lib/gpu/lal_lj_coul.cu index 5c7f0da46f..cd72f72d97 100644 --- a/lib/gpu/lal_lj_coul.cu +++ b/lib/gpu/lal_lj_coul.cu @@ -13,15 +13,15 @@ // email : brownw@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture q_tex; +_texture( pos_tex,float4); +_texture( q_tex,float); #else -texture pos_tex; -texture q_tex; +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); #endif #else diff --git a/lib/gpu/lal_lj_coul_debye.cu b/lib/gpu/lal_lj_coul_debye.cu index 91b105b3da..40b7046623 100644 --- a/lib/gpu/lal_lj_coul_debye.cu +++ b/lib/gpu/lal_lj_coul_debye.cu @@ -13,15 +13,15 @@ // email : nguyentd@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture q_tex; +_texture( pos_tex,float4); +_texture( q_tex,float); #else -texture pos_tex; -texture q_tex; +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); #endif #else diff --git a/lib/gpu/lal_lj_coul_long.cu b/lib/gpu/lal_lj_coul_long.cu index 0e25bb2dbc..6a09cc4b75 100644 --- a/lib/gpu/lal_lj_coul_long.cu +++ b/lib/gpu/lal_lj_coul_long.cu @@ -13,15 +13,15 @@ // email : brownw@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture q_tex; +_texture( pos_tex,float4); +_texture( q_tex,float); #else -texture pos_tex; -texture q_tex; +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); #endif #else diff --git a/lib/gpu/lal_lj_coul_msm.cu b/lib/gpu/lal_lj_coul_msm.cu index 3f73c6f47d..c8eaa47b3d 100644 --- a/lib/gpu/lal_lj_coul_msm.cu +++ b/lib/gpu/lal_lj_coul_msm.cu @@ -13,19 +13,19 @@ // email : nguyentd@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture q_tex; -texture gcons_tex; -texture dgcons_tex; +_texture( pos_tex,float4); +_texture( q_tex,float); +_texture( gcons_tex,float); +_texture( dgcons_tex,float); #else -texture pos_tex; -texture q_tex; -texture gcons_tex; -texture dgcons_tex; +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); +_texture( gcons_tex,int2); +_texture( dgcons_tex,int2); #endif #else diff --git a/lib/gpu/lal_lj_cubic.cu b/lib/gpu/lal_lj_cubic.cu index 683c6b2aac..b6a0768a36 100644 --- a/lib/gpu/lal_lj_cubic.cu +++ b/lib/gpu/lal_lj_cubic.cu @@ -13,12 +13,12 @@ // email : ndactrung@gmail.com // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; +_texture( pos_tex,float4); #else -texture pos_tex; +_texture_2d( pos_tex,int4); #endif #else #define pos_tex x_ diff --git a/lib/gpu/lal_lj_dsf.cu b/lib/gpu/lal_lj_dsf.cu index 323576fe77..2475743ccc 100644 --- a/lib/gpu/lal_lj_dsf.cu +++ b/lib/gpu/lal_lj_dsf.cu @@ -13,15 +13,15 @@ // email : brownw@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture q_tex; +_texture( pos_tex,float4); +_texture( q_tex,float); #else -texture pos_tex; -texture q_tex; +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); #endif #else diff --git a/lib/gpu/lal_lj_expand.cu b/lib/gpu/lal_lj_expand.cu index 9281ad27bd..4496835588 100644 --- a/lib/gpu/lal_lj_expand.cu +++ b/lib/gpu/lal_lj_expand.cu @@ -13,13 +13,13 @@ // email : ibains@nvidia.com // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; +_texture( pos_tex,float4); #else -texture pos_tex; +_texture_2d( pos_tex,int4); #endif #else diff --git a/lib/gpu/lal_lj_expand_coul_long.cu b/lib/gpu/lal_lj_expand_coul_long.cu index aa8f02be8c..e9de9bab27 100644 --- a/lib/gpu/lal_lj_expand_coul_long.cu +++ b/lib/gpu/lal_lj_expand_coul_long.cu @@ -13,15 +13,15 @@ // email : ndactrung@gmail.com // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture q_tex; +_texture( pos_tex,float4); +_texture( q_tex,float); #else -texture pos_tex; -texture q_tex; +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); #endif #else diff --git a/lib/gpu/lal_lj_gromacs.cu b/lib/gpu/lal_lj_gromacs.cu index 93dc3d9456..dcef79dc90 100644 --- a/lib/gpu/lal_lj_gromacs.cu +++ b/lib/gpu/lal_lj_gromacs.cu @@ -13,13 +13,13 @@ // email : nguyentd@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; +_texture( pos_tex,float4); #else -texture pos_tex; +_texture_2d( pos_tex,int4); #endif #else diff --git a/lib/gpu/lal_lj_sdk.cu b/lib/gpu/lal_lj_sdk.cu index 01b2cdd18d..a11b1c7887 100644 --- a/lib/gpu/lal_lj_sdk.cu +++ b/lib/gpu/lal_lj_sdk.cu @@ -13,12 +13,12 @@ // email : brownw@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; +_texture( pos_tex,float4); #else -texture pos_tex; +_texture_2d( pos_tex,int4); #endif #else #define pos_tex x_ diff --git a/lib/gpu/lal_lj_sdk_long.cu b/lib/gpu/lal_lj_sdk_long.cu index 5ff64b2254..e28fa19db4 100644 --- a/lib/gpu/lal_lj_sdk_long.cu +++ b/lib/gpu/lal_lj_sdk_long.cu @@ -13,15 +13,15 @@ // email : brownw@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture q_tex; +_texture( pos_tex,float4); +_texture( q_tex,float); #else -texture pos_tex; -texture q_tex; +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); #endif #else diff --git a/lib/gpu/lal_lj_tip4p_long.cpp b/lib/gpu/lal_lj_tip4p_long.cpp index d44edc8cbd..0b781300c7 100644 --- a/lib/gpu/lal_lj_tip4p_long.cpp +++ b/lib/gpu/lal_lj_tip4p_long.cpp @@ -23,7 +23,7 @@ const char *lj_tip4p=0; #include "lal_lj_tip4p_long.h" #include -using namespace LAMMPS_AL; +namespace LAMMPS_AL { #define LJTIP4PLongT LJ_TIP4PLong extern Device device; @@ -370,6 +370,5 @@ int** LJTIP4PLongT::compute(const int ago, const int inum_full, } - - template class LJ_TIP4PLong; +} diff --git a/lib/gpu/lal_lj_tip4p_long.cu b/lib/gpu/lal_lj_tip4p_long.cu index 147c460795..092513da4d 100644 --- a/lib/gpu/lal_lj_tip4p_long.cu +++ b/lib/gpu/lal_lj_tip4p_long.cu @@ -13,7 +13,7 @@ // email : thevsevak@gmail.com // *************************************************************************** -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifdef LAMMPS_SMALLBIG @@ -27,11 +27,11 @@ #define tagint int #endif #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture q_tex; +_texture( pos_tex,float4); +_texture( q_tex,float); #else -texture pos_tex; -texture q_tex; +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); #endif #else diff --git a/lib/gpu/lal_mie.cu b/lib/gpu/lal_mie.cu index 33018566eb..e2ede4d3a1 100644 --- a/lib/gpu/lal_mie.cu +++ b/lib/gpu/lal_mie.cu @@ -13,12 +13,12 @@ // email : nguyentd@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; +_texture( pos_tex,float4); #else -texture pos_tex; +_texture_2d( pos_tex,int4); #endif #else #define pos_tex x_ diff --git a/lib/gpu/lal_morse.cu b/lib/gpu/lal_morse.cu index 0a14071d19..7e4e0e54fa 100644 --- a/lib/gpu/lal_morse.cu +++ b/lib/gpu/lal_morse.cu @@ -13,13 +13,13 @@ // email : brownw@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; +_texture( pos_tex,float4); #else -texture pos_tex; +_texture_2d( pos_tex,int4); #endif #else diff --git a/lib/gpu/lal_neighbor_cpu.cu b/lib/gpu/lal_neighbor_cpu.cu index d005eb9f97..29141a8b90 100644 --- a/lib/gpu/lal_neighbor_cpu.cu +++ b/lib/gpu/lal_neighbor_cpu.cu @@ -13,7 +13,7 @@ // email : brownw@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_preprocessor.h" #endif diff --git a/lib/gpu/lal_neighbor_gpu.cu b/lib/gpu/lal_neighbor_gpu.cu index 83692a24e4..8a2b603217 100644 --- a/lib/gpu/lal_neighbor_gpu.cu +++ b/lib/gpu/lal_neighbor_gpu.cu @@ -14,7 +14,7 @@ // email : penwang@nvidia.com, brownw@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_preprocessor.h" #ifdef LAMMPS_SMALLBIG #define tagint int @@ -27,9 +27,9 @@ #define tagint int #endif #ifndef _DOUBLE_DOUBLE -texture pos_tex; +_texture( pos_tex,float4); #else -texture pos_tex; +_texture_2d( pos_tex,int4); #endif __kernel void calc_cell_id(const numtyp4 *restrict pos, diff --git a/lib/gpu/lal_neighbor_shared.h b/lib/gpu/lal_neighbor_shared.h index 834ee8406d..5cfc4e4767 100644 --- a/lib/gpu/lal_neighbor_shared.h +++ b/lib/gpu/lal_neighbor_shared.h @@ -24,6 +24,10 @@ using namespace ucl_opencl; #include "geryon/nvc_kernel.h" #include "geryon/nvc_texture.h" using namespace ucl_cudart; +#elif defined(USE_HIP) +#include "geryon/hip_kernel.h" +#include "geryon/hip_texture.h" +using namespace ucl_hip; #else #include "geryon/nvd_kernel.h" #include "geryon/nvd_texture.h" diff --git a/lib/gpu/lal_pppm.cu b/lib/gpu/lal_pppm.cu index 24636b9a93..6a7408c720 100644 --- a/lib/gpu/lal_pppm.cu +++ b/lib/gpu/lal_pppm.cu @@ -13,15 +13,15 @@ // email : brownw@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_preprocessor.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture q_tex; +_texture( pos_tex,float4); +_texture( q_tex,float); #else -texture pos_tex; -texture q_tex; +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); #endif // Allow PPPM to compile without atomics for NVIDIA 1.0 cards, error diff --git a/lib/gpu/lal_pppm.h b/lib/gpu/lal_pppm.h index 045423e079..bc5f216076 100644 --- a/lib/gpu/lal_pppm.h +++ b/lib/gpu/lal_pppm.h @@ -23,6 +23,8 @@ #include "geryon/ocl_texture.h" #elif defined(USE_CUDART) #include "geryon/nvc_texture.h" +#elif defined(USE_HIP) +#include "geryon/hip_texture.h" #else #include "geryon/nvd_texture.h" #endif diff --git a/lib/gpu/lal_precision.h b/lib/gpu/lal_precision.h index d5b1b9b6c0..7f82ba18aa 100644 --- a/lib/gpu/lal_precision.h +++ b/lib/gpu/lal_precision.h @@ -24,9 +24,11 @@ struct _lgpu_int2 { int x; int y; }; +#ifndef USE_HIP #ifndef int2 #define int2 _lgpu_int2 #endif +#endif struct _lgpu_float2 { float x; float y; diff --git a/lib/gpu/lal_preprocessor.h b/lib/gpu/lal_preprocessor.h index 566a451c21..cd95355ee4 100644 --- a/lib/gpu/lal_preprocessor.h +++ b/lib/gpu/lal_preprocessor.h @@ -1,4 +1,4 @@ -// ************************************************************************** +// ************************************************************************** // preprocessor.cu // ------------------- // W. Michael Brown (ORNL) @@ -60,6 +60,150 @@ // //*************************************************************************/ +#define _texture(name, type) texture name +#define _texture_2d(name, type) texture name + +// ------------------------------------------------------------------------- +// HIP DEFINITIONS +// ------------------------------------------------------------------------- + +#ifdef USE_HIP + #include + #ifdef __HIP_PLATFORM_HCC__ + #define mul24(x, y) __mul24(x, y) + #undef _texture + #undef _texture_2d + #define _texture(name, type) __device__ type* name + #define _texture_2d(name, type) __device__ type* name + #endif + #define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x) + #define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y) + #define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x); + #define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y); + #define THREAD_ID_X threadIdx.x + #define THREAD_ID_Y threadIdx.y + #define BLOCK_ID_X blockIdx.x + #define BLOCK_ID_Y blockIdx.y + #define BLOCK_SIZE_X blockDim.x + #define BLOCK_SIZE_Y blockDim.y + #define __kernel extern "C" __global__ + #ifdef __local + #undef __local + #endif + #define __local __shared__ + #define __global + #define restrict __restrict__ + #define atom_add atomicAdd + #define ucl_inline static __inline__ __device__ + + #define THREADS_PER_ATOM 4 + #define THREADS_PER_CHARGE 8 + #define BLOCK_NBOR_BUILD 128 + #define BLOCK_PAIR 256 + #define BLOCK_BIO_PAIR 256 + #define BLOCK_ELLIPSE 128 + #define MAX_SHARED_TYPES 11 + + #ifdef _SINGLE_SINGLE + ucl_inline double shfl_xor(double var, int laneMask, int width) { + #ifdef __HIP_PLATFORM_HCC__ + return __shfl_xor(var, laneMask, width); + #else + return __shfl_xor_sync(0xffffffff, var, laneMask, width); + #endif + } + #else + ucl_inline double shfl_xor(double var, int laneMask, int width) { + int2 tmp; + tmp.x = __double2hiint(var); + tmp.y = __double2loint(var); + #ifdef __HIP_PLATFORM_HCC__ + tmp.x = __shfl_xor(tmp.x,laneMask,width); + tmp.y = __shfl_xor(tmp.y,laneMask,width); + #else + tmp.x = __shfl_xor_sync(0xffffffff, tmp.x,laneMask,width); + tmp.y = __shfl_xor_sync(0xffffffff, tmp.y,laneMask,width); + #endif + return __hiloint2double(tmp.x,tmp.y); + } + #endif + + #ifdef __HIP_PLATFORM_HCC__ + #define ARCH 600 + #define WARP_SIZE 64 + #endif + + #ifdef __HIP_PLATFORM_NVCC__ + #define ARCH __CUDA_ARCH__ + #define WARP_SIZE 32 + #endif + + #define fast_mul(X,Y) (X)*(Y) + + #define MEM_THREADS WARP_SIZE + #define PPPM_BLOCK_1D 64 + #define BLOCK_CELL_2D 8 + #define BLOCK_CELL_ID 128 + #define MAX_BIO_SHARED_TYPES 128 + + #ifdef __HIP_PLATFORM_NVCC__ + #ifdef _DOUBLE_DOUBLE + #define fetch4(ans,i,pos_tex) { \ + int4 xy = tex1Dfetch(pos_tex,i*2); \ + int4 zt = tex1Dfetch(pos_tex,i*2+1); \ + ans.x=__hiloint2double(xy.y, xy.x); \ + ans.y=__hiloint2double(xy.w, xy.z); \ + ans.z=__hiloint2double(zt.y, zt.x); \ + ans.w=__hiloint2double(zt.w, zt.z); \ + } + #define fetch(ans,i,q_tex) { \ + int2 qt = tex1Dfetch(q_tex,i); \ + ans=__hiloint2double(qt.y, qt.x); \ + } + #else + #define fetch4(ans,i,pos_tex) ans=tex1Dfetch(pos_tex, i); + #define fetch(ans,i,q_tex) ans=tex1Dfetch(q_tex,i); + #endif + #else + #ifdef _DOUBLE_DOUBLE + #define fetch4(ans,i,pos_tex) (ans=*(((double4*)pos_tex) + i)) + #define fetch(ans,i,q_tex) (ans=*(((double *) q_tex) + i)) + #else + #define fetch4(ans,i,pos_tex) (ans=*(((float4*)pos_tex) + i)) + #define fetch(ans,i,q_tex) (ans=*(((float *) q_tex) + i)) + #endif + #endif + + #ifdef _DOUBLE_DOUBLE + #define ucl_exp exp + #define ucl_powr pow + #define ucl_atan atan + #define ucl_cbrt cbrt + #define ucl_ceil ceil + #define ucl_abs fabs + #define ucl_rsqrt rsqrt + #define ucl_sqrt sqrt + #define ucl_recip(x) ((numtyp)1.0/(x)) + + #else + #define ucl_atan atanf + #define ucl_cbrt cbrtf + #define ucl_ceil ceilf + #define ucl_abs fabsf + #define ucl_recip(x) ((numtyp)1.0/(x)) + #define ucl_rsqrt rsqrtf + #define ucl_sqrt sqrtf + + #ifdef NO_HARDWARE_TRANSCENDENTALS + #define ucl_exp expf + #define ucl_powr powf + #else + #define ucl_exp __expf + #define ucl_powr __powf + #endif + #endif +#endif + // ------------------------------------------------------------------------- // CUDA DEFINITIONS // ------------------------------------------------------------------------- diff --git a/lib/gpu/lal_re_squared.cu b/lib/gpu/lal_re_squared.cu index e238734074..cd525a1ade 100644 --- a/lib/gpu/lal_re_squared.cu +++ b/lib/gpu/lal_re_squared.cu @@ -13,7 +13,7 @@ // email : brownw@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_ellipsoid_extra.h" #endif diff --git a/lib/gpu/lal_re_squared_lj.cu b/lib/gpu/lal_re_squared_lj.cu index d69dae2461..b3c44febe7 100644 --- a/lib/gpu/lal_re_squared_lj.cu +++ b/lib/gpu/lal_re_squared_lj.cu @@ -13,7 +13,7 @@ // email : brownw@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_ellipsoid_extra.h" #endif diff --git a/lib/gpu/lal_soft.cu b/lib/gpu/lal_soft.cu index 831b986725..bccfa85ed5 100644 --- a/lib/gpu/lal_soft.cu +++ b/lib/gpu/lal_soft.cu @@ -13,12 +13,12 @@ // email : nguyentd@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; +_texture( pos_tex,float4); #else -texture pos_tex; +_texture_2d( pos_tex,int4); #endif #else #define pos_tex x_ diff --git a/lib/gpu/lal_sw.cu b/lib/gpu/lal_sw.cu index 3b6de5a683..de36d29efb 100644 --- a/lib/gpu/lal_sw.cu +++ b/lib/gpu/lal_sw.cu @@ -13,19 +13,19 @@ // email : brownw@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture sw1_tex; -texture sw2_tex; -texture sw3_tex; +_texture( pos_tex,float4); +_texture( sw1_tex,float4); +_texture( sw2_tex,float4); +_texture( sw3_tex,float4); #else -texture pos_tex; -texture sw1_tex; -texture sw2_tex; -texture sw3_tex; +_texture_2d( pos_tex,int4); +_texture( sw1_tex,int4); +_texture( sw2_tex,int4); +_texture( sw3_tex,int4); #endif #else diff --git a/lib/gpu/lal_table.cu b/lib/gpu/lal_table.cu index 971b56d96e..8c0b52e05f 100644 --- a/lib/gpu/lal_table.cu +++ b/lib/gpu/lal_table.cu @@ -13,12 +13,12 @@ // email : nguyentd@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; +_texture( pos_tex,float4); #else -texture pos_tex; +_texture_2d( pos_tex,int4); #endif #else #define pos_tex x_ diff --git a/lib/gpu/lal_tersoff.cu b/lib/gpu/lal_tersoff.cu index 2e29ca721b..d57efaf15c 100644 --- a/lib/gpu/lal_tersoff.cu +++ b/lib/gpu/lal_tersoff.cu @@ -13,23 +13,23 @@ // email : ndactrung@gmail.com // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_tersoff_extra.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture ts1_tex; -texture ts2_tex; -texture ts3_tex; -texture ts4_tex; -texture ts5_tex; +_texture( pos_tex,float4); +_texture( ts1_tex,float4); +_texture( ts2_tex,float4); +_texture( ts3_tex,float4); +_texture( ts4_tex,float4); +_texture( ts5_tex,float4); #else -texture pos_tex; -texture ts1_tex; -texture ts2_tex; -texture ts3_tex; -texture ts4_tex; -texture ts5_tex; +_texture_2d( pos_tex,int4); +_texture( ts1_tex,int4); +_texture( ts2_tex,int4); +_texture( ts3_tex,int4); +_texture( ts4_tex,int4); +_texture( ts5_tex,int4); #endif #else diff --git a/lib/gpu/lal_tersoff_extra.h b/lib/gpu/lal_tersoff_extra.h index 47d16678f0..7ee29751b7 100644 --- a/lib/gpu/lal_tersoff_extra.h +++ b/lib/gpu/lal_tersoff_extra.h @@ -16,7 +16,7 @@ #ifndef LAL_TERSOFF_EXTRA_H #define LAL_TERSOFF_EXTRA_H -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #else #endif diff --git a/lib/gpu/lal_tersoff_mod.cu b/lib/gpu/lal_tersoff_mod.cu index c85f5e08ca..da284f39ee 100644 --- a/lib/gpu/lal_tersoff_mod.cu +++ b/lib/gpu/lal_tersoff_mod.cu @@ -13,23 +13,23 @@ // email : ndactrung@gmail.com // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_tersoff_mod_extra.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture ts1_tex; -texture ts2_tex; -texture ts3_tex; -texture ts4_tex; -texture ts5_tex; +_texture( pos_tex,float4); +_texture( ts1_tex,float4); +_texture( ts2_tex,float4); +_texture( ts3_tex,float4); +_texture( ts4_tex,float4); +_texture( ts5_tex,float4); #else -texture pos_tex; -texture ts1_tex; -texture ts2_tex; -texture ts3_tex; -texture ts4_tex; -texture ts5_tex; +_texture_2d( pos_tex,int4); +_texture( ts1_tex,int4); +_texture( ts2_tex,int4); +_texture( ts3_tex,int4); +_texture( ts4_tex,int4); +_texture( ts5_tex,int4); #endif #else diff --git a/lib/gpu/lal_tersoff_mod_extra.h b/lib/gpu/lal_tersoff_mod_extra.h index a130d98488..fb658cb0da 100644 --- a/lib/gpu/lal_tersoff_mod_extra.h +++ b/lib/gpu/lal_tersoff_mod_extra.h @@ -16,7 +16,7 @@ #ifndef LAL_TERSOFF_MOD_EXTRA_H #define LAL_TERSOFF_MOD_EXTRA_H -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #else #endif diff --git a/lib/gpu/lal_tersoff_zbl.cu b/lib/gpu/lal_tersoff_zbl.cu index b574a529c0..a170715f57 100644 --- a/lib/gpu/lal_tersoff_zbl.cu +++ b/lib/gpu/lal_tersoff_zbl.cu @@ -13,25 +13,25 @@ // email : ndactrung@gmail.com // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_tersoff_zbl_extra.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture ts1_tex; -texture ts2_tex; -texture ts3_tex; -texture ts4_tex; -texture ts5_tex; -texture ts6_tex; +_texture( pos_tex,float4); +_texture( ts1_tex,float4); +_texture( ts2_tex,float4); +_texture( ts3_tex,float4); +_texture( ts4_tex,float4); +_texture( ts5_tex,float4); +_texture( ts6_tex,float4); #else -texture pos_tex; -texture ts1_tex; -texture ts2_tex; -texture ts3_tex; -texture ts4_tex; -texture ts5_tex; -texture ts6_tex; +_texture_2d( pos_tex,int4); +_texture( ts1_tex,int4); +_texture( ts2_tex,int4); +_texture( ts3_tex,int4); +_texture( ts4_tex,int4); +_texture( ts5_tex,int4); +_texture( ts6_tex,int4); #endif #else diff --git a/lib/gpu/lal_tersoff_zbl_extra.h b/lib/gpu/lal_tersoff_zbl_extra.h index 32c05a3716..9e5bcb10b4 100644 --- a/lib/gpu/lal_tersoff_zbl_extra.h +++ b/lib/gpu/lal_tersoff_zbl_extra.h @@ -16,7 +16,7 @@ #ifndef LAL_TERSOFF_ZBL_EXTRA_H #define LAL_TERSOFF_ZBL_EXTRA_H -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #else #endif diff --git a/lib/gpu/lal_ufm.cu b/lib/gpu/lal_ufm.cu index 51c4df3b5b..33d0f3c956 100644 --- a/lib/gpu/lal_ufm.cu +++ b/lib/gpu/lal_ufm.cu @@ -15,12 +15,12 @@ dekoning@ifi.unicamp.br ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; +_texture( pos_tex,float4); #else -texture pos_tex; +_texture_2d( pos_tex,int4); #endif #else #define pos_tex x_ diff --git a/lib/gpu/lal_vashishta.cu b/lib/gpu/lal_vashishta.cu index 0da46c3b53..d13bc659e7 100644 --- a/lib/gpu/lal_vashishta.cu +++ b/lib/gpu/lal_vashishta.cu @@ -13,23 +13,23 @@ // email : andershaf@gmail.com // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture param1_tex; -texture param2_tex; -texture param3_tex; -texture param4_tex; -texture param5_tex; +_texture( pos_tex,float4); +_texture( param1_tex,float4); +_texture( param2_tex,float4); +_texture( param3_tex,float4); +_texture( param4_tex,float4); +_texture( param5_tex,float4); #else -texture pos_tex; -texture param1_tex; -texture param2_tex; -texture param3_tex; -texture param4_tex; -texture param5_tex; +_texture_2d( pos_tex,int4); +_texture( param1_tex,int4); +_texture( param2_tex,int4); +_texture( param3_tex,int4); +_texture( param4_tex,int4); +_texture( param5_tex,int4); #endif #else diff --git a/lib/gpu/lal_yukawa.cu b/lib/gpu/lal_yukawa.cu index a8d637ec97..5237549b0a 100644 --- a/lib/gpu/lal_yukawa.cu +++ b/lib/gpu/lal_yukawa.cu @@ -13,12 +13,12 @@ // email : nguyentd@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; +_texture( pos_tex,float4); #else -texture pos_tex; +_texture_2d( pos_tex,int4); #endif #else #define pos_tex x_ diff --git a/lib/gpu/lal_yukawa_colloid.cu b/lib/gpu/lal_yukawa_colloid.cu index a3cbbbc11c..8c006a09be 100644 --- a/lib/gpu/lal_yukawa_colloid.cu +++ b/lib/gpu/lal_yukawa_colloid.cu @@ -13,15 +13,15 @@ // email : nguyentd@ornl.gov // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture rad_tex; +_texture( pos_tex,float4); +_texture( rad_tex,float); #else -texture pos_tex; -texture rad_tex; +_texture_2d( pos_tex,int4); +_texture( rad_tex,int2); #endif #else diff --git a/lib/gpu/lal_zbl.cu b/lib/gpu/lal_zbl.cu index 33c850e134..fbedfe2de2 100644 --- a/lib/gpu/lal_zbl.cu +++ b/lib/gpu/lal_zbl.cu @@ -13,12 +13,12 @@ // email : ndactrung@gmail.com // ***************************************************************************/ -#ifdef NV_KERNEL +#if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE -texture pos_tex; +_texture( pos_tex,float4); #else -texture pos_tex; +_texture_2d( pos_tex,int4); #endif #else #define pos_tex x_ diff --git a/src/MAKE/OPTIONS/Makefile.hip b/src/MAKE/OPTIONS/Makefile.hip new file mode 100644 index 0000000000..12158a32cb --- /dev/null +++ b/src/MAKE/OPTIONS/Makefile.hip @@ -0,0 +1,120 @@ +# hip = MPI with HIP(clang) + +SHELL = /bin/sh + +# --------------------------------------------------------------------- +# compiler/linker settings +# specify flags and libraries needed for your compiler + +CC = mpicxx +CCFLAGS = -g -O3 +SHFLAGS = -fPIC +DEPFLAGS = -M + +HIP_PATH ?= $(wildcard /opt/rocm/hip) +LINK = $(HIP_PATH)/bin/hipcc +LINKFLAGS = -g -O3 $(shell mpicxx --showme:link) +LIB = +SIZE = size + +ARCHIVE = ar +ARFLAGS = -rc +SHLIBFLAGS = -shared + +# --------------------------------------------------------------------- +# LAMMPS-specific settings, all OPTIONAL +# specify settings for LAMMPS features you will use +# if you change any -D setting, do full re-compile after "make clean" + +# LAMMPS ifdef settings +# see possible settings in Section 2.2 (step 4) of manual + +LMP_INC = -DLAMMPS_GZIP -DLAMMPS_MEMALIGN=64 + +# MPI library +# see discussion in Section 2.2 (step 5) of manual +# MPI wrapper compiler/linker can provide this info +# can point to dummy MPI library in src/STUBS as in Makefile.serial +# use -D MPICH and OMPI settings in INC to avoid C++ lib conflicts +# INC = path for mpi.h, MPI compiler settings +# PATH = path for MPI library +# LIB = name of MPI library + +MPI_INC = -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1 +MPI_PATH = +MPI_LIB = + +# FFT library +# see discussion in Section 2.2 (step 6) of manual +# can be left blank to use provided KISS FFT library +# INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings +# PATH = path for FFT library +# LIB = name of FFT library + +FFT_INC = +FFT_PATH = +FFT_LIB = + +# JPEG and/or PNG library +# see discussion in Section 2.2 (step 7) of manual +# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC +# INC = path(s) for jpeglib.h and/or png.h +# PATH = path(s) for JPEG library and/or PNG library +# LIB = name(s) of JPEG library and/or PNG library + +JPG_INC = +JPG_PATH = +JPG_LIB = + +# --------------------------------------------------------------------- +# build rules and dependencies +# do not edit this section + +include Makefile.package.settings +include Makefile.package + +ifeq (nvcc,${HIP_PLATFORM}) + # fix nvcc can't handle -pthread flag + LINKFLAGS := $(subst -pthread,-Xcompiler -pthread,$(LINKFLAGS)) +endif + +EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC) +EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH) +EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB) +EXTRA_CPP_DEPENDS = $(PKG_CPP_DEPENDS) +EXTRA_LINK_DEPENDS = $(PKG_LINK_DEPENDS) + +# Path to src files + +vpath %.cpp .. +vpath %.h .. + +# Link target + +$(EXE): $(OBJ) $(EXTRA_LINK_DEPENDS) + $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) + $(SIZE) $(EXE) + +# Library targets + +lib: $(OBJ) $(EXTRA_LINK_DEPENDS) + $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) + +shlib: $(OBJ) $(EXTRA_LINK_DEPENDS) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + +# Compilation rules + +%.o:%.cpp + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< + +# Individual dependencies + +depend : fastdep.exe $(SRC) + @./fastdep.exe $(EXTRA_INC) -- $^ > .depend || exit 1 + +fastdep.exe: ../DEPEND/fastdep.c + cc -O -o $@ $< + +sinclude .depend