llvm-project/polly/tools/GPURuntime/GPUJIT.c

439 lines
15 KiB
C

/******************** GPUJIT.c - GPUJIT Execution Engine **********************/
/* */
/* The LLVM Compiler Infrastructure */
/* */
/* This file is dual licensed under the MIT and the University of Illinois */
/* Open Source License. See LICENSE.TXT for details. */
/* */
/******************************************************************************/
/* */
/* This file implements GPUJIT, a ptx string execution engine for GPU. */
/* */
/******************************************************************************/
#include "GPUJIT.h"
#include <cuda.h>
#include <cuda_runtime.h>
#include <dlfcn.h>
#include <stdio.h>
#include <stdlib.h>
/* Define Polly's GPGPU data types.
 *
 * Each struct wraps exactly one CUDA handle in a member named Cuda.
 */
/* Wraps a CUDA driver context handle (CUcontext). */
struct PollyGPUContextT {
CUcontext Cuda;
};
/* Wraps a CUDA driver module handle (CUmodule). */
struct PollyGPUModuleT {
CUmodule Cuda;
};
/* Wraps a CUDA driver kernel function handle (CUfunction). */
struct PollyGPUFunctionT {
CUfunction Cuda;
};
/* Wraps a CUDA driver device handle (CUdevice). */
struct PollyGPUDeviceT {
CUdevice Cuda;
};
/* Wraps a CUDA driver device memory pointer (CUdeviceptr). */
struct PollyGPUDevicePtrT {
CUdeviceptr Cuda;
};
/* Wraps a CUDA runtime event handle (cudaEvent_t). */
struct PollyGPUEventT {
cudaEvent_t Cuda;
};
/* Dynamic library handles for the CUDA and CUDA runtime library.
 *
 * Both are assigned from dlopen() in initialDeviceAPILibraries() and passed
 * to dlclose() in polly_cleanupGPGPUResources().
 */
static void *HandleCuda;
static void *HandleCudaRT;
/* Type-defines of function pointer to CUDA driver APIs.
 *
 * For every entry point used by this file there is a function type
 * (<Name>FcnTy) and a file-local pointer of that type (<Name>FcnPtr). The
 * pointers are assigned in initialDeviceAPIs() from symbols looked up with
 * dlsym().
 */
typedef CUresult CUDAAPI CuMemAllocFcnTy(CUdeviceptr *, size_t);
static CuMemAllocFcnTy *CuMemAllocFcnPtr;
typedef CUresult CUDAAPI CuFuncSetBlockShapeFcnTy(CUfunction, int, int, int);
static CuFuncSetBlockShapeFcnTy *CuFuncSetBlockShapeFcnPtr;
typedef CUresult CUDAAPI CuParamSetvFcnTy(CUfunction, int, void *,
unsigned int);
static CuParamSetvFcnTy *CuParamSetvFcnPtr;
typedef CUresult CUDAAPI CuParamSetSizeFcnTy(CUfunction, unsigned int);
static CuParamSetSizeFcnTy *CuParamSetSizeFcnPtr;
typedef CUresult CUDAAPI CuLaunchGridFcnTy(CUfunction, int, int);
static CuLaunchGridFcnTy *CuLaunchGridFcnPtr;
typedef CUresult CUDAAPI CuMemcpyDtoHFcnTy(void *, CUdeviceptr, size_t);
static CuMemcpyDtoHFcnTy *CuMemcpyDtoHFcnPtr;
typedef CUresult CUDAAPI CuMemcpyHtoDFcnTy(CUdeviceptr, const void *, size_t);
static CuMemcpyHtoDFcnTy *CuMemcpyHtoDFcnPtr;
typedef CUresult CUDAAPI CuMemFreeFcnTy(CUdeviceptr);
static CuMemFreeFcnTy *CuMemFreeFcnPtr;
typedef CUresult CUDAAPI CuModuleUnloadFcnTy(CUmodule);
static CuModuleUnloadFcnTy *CuModuleUnloadFcnPtr;
typedef CUresult CUDAAPI CuCtxDestroyFcnTy(CUcontext);
static CuCtxDestroyFcnTy *CuCtxDestroyFcnPtr;
typedef CUresult CUDAAPI CuInitFcnTy(unsigned int);
static CuInitFcnTy *CuInitFcnPtr;
typedef CUresult CUDAAPI CuDeviceGetCountFcnTy(int *);
static CuDeviceGetCountFcnTy *CuDeviceGetCountFcnPtr;
typedef CUresult CUDAAPI CuCtxCreateFcnTy(CUcontext *, unsigned int, CUdevice);
static CuCtxCreateFcnTy *CuCtxCreateFcnPtr;
typedef CUresult CUDAAPI CuDeviceGetFcnTy(CUdevice *, int);
static CuDeviceGetFcnTy *CuDeviceGetFcnPtr;
typedef CUresult CUDAAPI CuModuleLoadDataExFcnTy(CUmodule *, const void *,
unsigned int, CUjit_option *,
void **);
static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr;
typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule,
const char *);
static CuModuleGetFunctionFcnTy *CuModuleGetFunctionFcnPtr;
typedef CUresult CUDAAPI CuDeviceComputeCapabilityFcnTy(int *, int *, CUdevice);
static CuDeviceComputeCapabilityFcnTy *CuDeviceComputeCapabilityFcnPtr;
typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice);
static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr;
/* Type-defines of function pointer to CUDA runtime APIs. */
typedef cudaError_t CUDARTAPI CudaEventCreateFcnTy(cudaEvent_t *);
static CudaEventCreateFcnTy *CudaEventCreateFcnPtr;
typedef cudaError_t CUDARTAPI CudaEventRecordFcnTy(cudaEvent_t,
cudaStream_t);
static CudaEventRecordFcnTy *CudaEventRecordFcnPtr;
typedef cudaError_t CUDARTAPI CudaEventSynchronizeFcnTy(cudaEvent_t);
static CudaEventSynchronizeFcnTy *CudaEventSynchronizeFcnPtr;
typedef cudaError_t CUDARTAPI CudaEventElapsedTimeFcnTy(float *, cudaEvent_t,
cudaEvent_t);
static CudaEventElapsedTimeFcnTy *CudaEventElapsedTimeFcnPtr;
typedef cudaError_t CUDARTAPI CudaEventDestroyFcnTy(cudaEvent_t);
static CudaEventDestroyFcnTy *CudaEventDestroyFcnPtr;
typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void);
static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr;
/* Look up the symbol FuncName in the dynamic library identified by Handle.
 *
 * Returns the address reported by dlsym(). If dlerror() reports an error for
 * the lookup, the error text is printed to stdout and 0 is returned instead.
 */
static void *getAPIHandle(void *Handle, const char *FuncName) {
  void *Symbol;
  char *Message;

  /* Discard any stale error so the check below only sees errors raised by
   * this particular lookup. */
  dlerror();

  Symbol = dlsym(Handle, FuncName);
  Message = dlerror();
  if (Message == 0)
    return Symbol;

  fprintf(stdout, "Load CUDA driver API failed: %s. \n", Message);
  return 0;
}
/* Open the CUDA driver library and the CUDA runtime library.
 *
 * On success HandleCuda and HandleCudaRT both hold open library handles and 1
 * is returned. On failure the dlerror() text is printed, no library is left
 * open, both handles are null and 0 is returned.
 */
static int initialDeviceAPILibraries() {
  HandleCuda = dlopen("libcuda.so", RTLD_LAZY);
  if (!HandleCuda) {
    printf("Cannot open library: %s. \n", dlerror());
    return 0;
  }

  HandleCudaRT = dlopen("libcudart.so", RTLD_LAZY);
  if (!HandleCudaRT) {
    /* Report the error before dlclose(), which may overwrite it. */
    printf("Cannot open library: %s. \n", dlerror());

    /* Do not leave the driver library open (and its handle dangling) when
     * initialization as a whole has failed. */
    dlclose(HandleCuda);
    HandleCuda = 0;
    return 0;
  }

  return 1;
}
static int initialDeviceAPIs() {
if (initialDeviceAPILibraries() == 0)
return 0;
/* Get function pointer to CUDA Driver APIs.
*
* Note that compilers conforming to the ISO C standard are required to
* generate a warning if a conversion from a void * pointer to a function
* pointer is attempted as in the following statements. The warning
* of this kind of cast may not be emitted by clang and new versions of gcc
* as it is valid on POSIX 2008.
*/
CuFuncSetBlockShapeFcnPtr =
(CuFuncSetBlockShapeFcnTy *) getAPIHandle(HandleCuda,
"cuFuncSetBlockShape");
CuParamSetvFcnPtr = (CuParamSetvFcnTy *) getAPIHandle(HandleCuda,
"cuParamSetv");
CuParamSetSizeFcnPtr = (CuParamSetSizeFcnTy *) getAPIHandle(HandleCuda,
"cuParamSetSize");
CuLaunchGridFcnPtr = (CuLaunchGridFcnTy *) getAPIHandle(HandleCuda,
"cuLaunchGrid");
CuMemAllocFcnPtr = (CuMemAllocFcnTy *) getAPIHandle(HandleCuda,
"cuMemAlloc_v2");
CuMemFreeFcnPtr = (CuMemFreeFcnTy *) getAPIHandle(HandleCuda, "cuMemFree_v2");
CuMemcpyDtoHFcnPtr = (CuMemcpyDtoHFcnTy *) getAPIHandle(HandleCuda,
"cuMemcpyDtoH_v2");
CuMemcpyHtoDFcnPtr = (CuMemcpyHtoDFcnTy *) getAPIHandle(HandleCuda,
"cuMemcpyHtoD_v2");
CuModuleUnloadFcnPtr = (CuModuleUnloadFcnTy *) getAPIHandle(HandleCuda,
"cuModuleUnload");
CuCtxDestroyFcnPtr = (CuCtxDestroyFcnTy *) getAPIHandle(HandleCuda,
"cuCtxDestroy");
CuInitFcnPtr = (CuInitFcnTy *) getAPIHandle(HandleCuda, "cuInit");
CuDeviceGetCountFcnPtr = (CuDeviceGetCountFcnTy *) getAPIHandle(HandleCuda,
"cuDeviceGetCount");
CuDeviceGetFcnPtr = (CuDeviceGetFcnTy *) getAPIHandle(HandleCuda,
"cuDeviceGet");
CuCtxCreateFcnPtr = (CuCtxCreateFcnTy *) getAPIHandle(HandleCuda,
"cuCtxCreate_v2");
CuModuleLoadDataExFcnPtr =
(CuModuleLoadDataExFcnTy *) getAPIHandle(HandleCuda, "cuModuleLoadDataEx");
CuModuleGetFunctionFcnPtr =
(CuModuleGetFunctionFcnTy *)getAPIHandle(HandleCuda, "cuModuleGetFunction");
CuDeviceComputeCapabilityFcnPtr =
(CuDeviceComputeCapabilityFcnTy *)getAPIHandle(HandleCuda,
"cuDeviceComputeCapability");
CuDeviceGetNameFcnPtr =
(CuDeviceGetNameFcnTy *) getAPIHandle(HandleCuda, "cuDeviceGetName");
/* Get function pointer to CUDA Runtime APIs. */
CudaEventCreateFcnPtr =
(CudaEventCreateFcnTy *) getAPIHandle(HandleCudaRT, "cudaEventCreate");
CudaEventRecordFcnPtr =
(CudaEventRecordFcnTy *) getAPIHandle(HandleCudaRT, "cudaEventRecord");
CudaEventSynchronizeFcnPtr =
(CudaEventSynchronizeFcnTy *) getAPIHandle(HandleCudaRT,
"cudaEventSynchronize");
CudaEventElapsedTimeFcnPtr =
(CudaEventElapsedTimeFcnTy *) getAPIHandle(HandleCudaRT,
"cudaEventElapsedTime");
CudaEventDestroyFcnPtr =
(CudaEventDestroyFcnTy *) getAPIHandle(HandleCudaRT, "cudaEventDestroy");
CudaThreadSynchronizeFcnPtr =
(CudaThreadSynchronizeFcnTy *) getAPIHandle(HandleCudaRT,
"cudaThreadSynchronize");
return 1;
}
/* Initialize the CUDA driver, select a device and create a context on it.
 *
 * Context  Receives a newly allocated context wrapper.
 * Device   Receives a newly allocated device wrapper for device 0.
 *
 * Any failure (loading the API, initializing the driver, no device present,
 * allocation failure, device lookup or context creation failing) prints a
 * message to stdout and terminates the process with exit(-1). The name of the
 * selected device is reported on stderr.
 */
void polly_initDevice(PollyGPUContext **Context, PollyGPUDevice **Device) {
  int Major = 0, Minor = 0, DeviceID = 0;
  char DeviceName[256];
  int DeviceCount = 0;

  /* Get API handles. */
  if (initialDeviceAPIs() == 0) {
    fprintf(stdout, "Getting the \"handle\" for the CUDA driver API failed.\n");
    exit(-1);
  }

  if (CuInitFcnPtr(0) != CUDA_SUCCESS) {
    fprintf(stdout, "Initializing the CUDA driver API failed.\n");
    exit(-1);
  }

  /* Get number of devices that supports CUDA. A failing query is treated the
   * same way as an empty device list. */
  if (CuDeviceGetCountFcnPtr(&DeviceCount) != CUDA_SUCCESS ||
      DeviceCount <= 0) {
    fprintf(stdout, "There is no device supporting CUDA.\n");
    exit(-1);
  }

  /* We select the 1st device as default. */
  *Device = malloc(sizeof(PollyGPUDevice));
  if (*Device == 0) {
    fprintf(stdout, "Allocate memory for Polly GPU device failed.\n");
    exit(-1);
  }
  if (CuDeviceGetFcnPtr(&((*Device)->Cuda), DeviceID) != CUDA_SUCCESS) {
    fprintf(stdout, "Getting CUDA device %d failed.\n", DeviceID);
    exit(-1);
  }

  /* Get compute capabilities and the device name. Both are informational
   * only, so a failing query is not fatal; Major and Minor then simply keep
   * their initial value. */
  (void)CuDeviceComputeCapabilityFcnPtr(&Major, &Minor, (*Device)->Cuda);
  if (CuDeviceGetNameFcnPtr(DeviceName, (int)sizeof(DeviceName),
                            (*Device)->Cuda) != CUDA_SUCCESS) {
    /* Never print the buffer uninitialized. */
    snprintf(DeviceName, sizeof(DeviceName), "%s", "unknown");
  }
  fprintf(stderr, "> Running on GPU device %d : %s.\n", DeviceID, DeviceName);

  /* Create context on the device. */
  *Context = malloc(sizeof(PollyGPUContext));
  if (*Context == 0) {
    fprintf(stdout, "Allocate memory for Polly GPU context failed.\n");
    exit(-1);
  }
  if (CuCtxCreateFcnPtr(&((*Context)->Cuda), 0, (*Device)->Cuda) !=
      CUDA_SUCCESS) {
    fprintf(stdout, "Creating the CUDA context failed.\n");
    exit(-1);
  }
}
/* Load the PTX image in PTXBuffer into a newly allocated module wrapper.
 *
 * PTXBuffer  Image handed unchanged to the driver's cuModuleLoadDataEx entry
 *            point, with no JIT options.
 * Module     Receives the newly allocated module wrapper.
 *
 * Prints a message to stdout and calls exit(-1) if the wrapper cannot be
 * allocated or the driver rejects the image.
 */
void polly_getPTXModule(void *PTXBuffer, PollyGPUModule **Module) {
  PollyGPUModule *Loaded = malloc(sizeof(PollyGPUModule));
  CUresult Status;

  *Module = Loaded;
  if (!Loaded) {
    fprintf(stdout, "Allocate memory for Polly GPU module failed.\n");
    exit(-1);
  }

  Status = CuModuleLoadDataExFcnPtr(&Loaded->Cuda, PTXBuffer, 0, 0, 0);
  if (Status == CUDA_SUCCESS)
    return;

  fprintf(stdout, "Loading ptx assembly text failed.\n");
  exit(-1);
}
/* Look up the kernel named KernelName in an already loaded module.
 *
 * KernelName  Name passed to the driver's cuModuleGetFunction entry point.
 * Module      Module wrapper to search.
 * Kernel      Receives a newly allocated kernel wrapper.
 *
 * Prints a message to stdout and calls exit(-1) if the wrapper cannot be
 * allocated or the lookup fails.
 */
void polly_getPTXKernelEntry(const char *KernelName, PollyGPUModule *Module,
                             PollyGPUFunction **Kernel) {
  PollyGPUFunction *Entry = malloc(sizeof(PollyGPUFunction));
  CUresult Status;

  *Kernel = Entry;
  if (!Entry) {
    fprintf(stdout, "Allocate memory for Polly GPU kernel failed.\n");
    exit(-1);
  }

  /* Resolve the kernel entry point inside the module. */
  Status = CuModuleGetFunctionFcnPtr(&Entry->Cuda, Module->Cuda, KernelName);
  if (Status == CUDA_SUCCESS)
    return;

  fprintf(stdout, "Loading kernel function failed.\n");
  exit(-1);
}
/* Allocate a start and a stop event and record the start event.
 *
 * Start  Receives the newly allocated wrapper of the start event.
 * Stop   Receives the newly allocated wrapper of the stop event.
 *
 * Both wrappers are released again by polly_stopTimerByCudaEvent(). An
 * allocation failure prints a message to stdout and calls exit(-1).
 */
void polly_startTimerByCudaEvent(PollyGPUEvent **Start, PollyGPUEvent **Stop) {
  PollyGPUEvent *Begin;
  PollyGPUEvent *End;

  Begin = malloc(sizeof(PollyGPUEvent));
  *Start = Begin;
  if (!Begin) {
    fprintf(stdout, "Allocate memory for Polly GPU start timer failed.\n");
    exit(-1);
  }
  CudaEventCreateFcnPtr(&Begin->Cuda);

  End = malloc(sizeof(PollyGPUEvent));
  *Stop = End;
  if (!End) {
    fprintf(stdout, "Allocate memory for Polly GPU stop timer failed.\n");
    exit(-1);
  }
  CudaEventCreateFcnPtr(&End->Cuda);

  /* Timing starts here: record the start event (stream argument 0). */
  CudaEventRecordFcnPtr((*Start)->Cuda, 0);
}
/* Record the stop event, report the elapsed time and release both events.
 *
 * Start         Start event wrapper; freed by this function.
 * Stop          Stop event wrapper; freed by this function.
 * ElapsedTimes  Receives the value produced by the runtime's
 *               cudaEventElapsedTime entry point, which is also printed to
 *               stderr labelled as milliseconds.
 *
 * Neither Start nor Stop may be used by the caller afterwards.
 */
void polly_stopTimerByCudaEvent(PollyGPUEvent *Start, PollyGPUEvent *Stop,
                                float *ElapsedTimes) {
  cudaEvent_t Begin = Start->Cuda;
  cudaEvent_t End = Stop->Cuda;

  /* Timing ends here: record the stop event (stream argument 0). */
  CudaEventRecordFcnPtr(End, 0);

  /* Wait for both events before asking for the time between them. */
  CudaEventSynchronizeFcnPtr(Begin);
  CudaEventSynchronizeFcnPtr(End);
  CudaEventElapsedTimeFcnPtr(ElapsedTimes, Begin, End);

  CudaEventDestroyFcnPtr(Begin);
  CudaEventDestroyFcnPtr(End);

  fprintf(stderr, "Processing time: %f (ms).\n", *ElapsedTimes);

  free(Start);
  free(Stop);
}
/* Allocate MemSize bytes on the host and MemSize bytes on the device.
 *
 * HostData  Receives the host buffer obtained from malloc().
 * DevData   Receives a newly allocated wrapper holding the device pointer.
 * MemSize   Number of bytes to allocate; must be positive.
 *
 * Any failure (non-positive size, host allocation, wrapper allocation or
 * device allocation) prints a message to stdout and calls exit(-1), so on
 * return both buffers are usable.
 */
void polly_allocateMemoryForHostAndDevice(void **HostData,
                                          PollyGPUDevicePtr **DevData,
                                          int MemSize) {
  /* A negative int would silently turn into a huge size_t below. */
  if (MemSize <= 0) {
    fprintf(stdout, "Requested memory size must be positive.\n");
    exit(-1);
  }

  *HostData = malloc((size_t)MemSize);
  if (*HostData == 0) {
    fprintf(stdout, "Could not allocate host memory.\n");
    exit(-1);
  }

  *DevData = malloc(sizeof(PollyGPUDevicePtr));
  if (*DevData == 0) {
    fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n");
    exit(-1);
  }

  /* Without this check a failed device allocation would only show up later
   * as a failing copy or kernel launch on an invalid device pointer. */
  if (CuMemAllocFcnPtr(&((*DevData)->Cuda), (size_t)MemSize) != CUDA_SUCCESS) {
    fprintf(stdout, "Could not allocate device memory.\n");
    exit(-1);
  }
}
/* Copy MemSize bytes from the host buffer HostData to the device memory
 * referenced by DevData.
 *
 * Like polly_copyFromDeviceToHost(), a failing copy prints a message to
 * stdout and calls exit(-1) instead of being silently ignored.
 */
void polly_copyFromHostToDevice(PollyGPUDevicePtr *DevData, void *HostData,
                                int MemSize) {
  CUdeviceptr CuDevData = DevData->Cuda;

  if (CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize) != CUDA_SUCCESS) {
    fprintf(stdout, "Copying data from host memory to device failed.\n");
    exit(-1);
  }
}
/* Copy MemSize bytes from the device memory referenced by DevData into the
 * host buffer HostData.
 *
 * A failing copy prints a message to stdout and calls exit(-1).
 */
void polly_copyFromDeviceToHost(void *HostData, PollyGPUDevicePtr *DevData,
                                int MemSize) {
  CUresult Status = CuMemcpyDtoHFcnPtr(HostData, DevData->Cuda, MemSize);

  if (Status == CUDA_SUCCESS)
    return;

  fprintf(stdout, "Copying results from device to host memory failed.\n");
  exit(-1);
}
/* Configure the block shape and the single kernel argument of Kernel.
 *
 * Kernel       Kernel wrapper to configure.
 * BlockWidth   First block dimension.
 * BlockHeight  Second block dimension; the third dimension is fixed to 1.
 * DevData      Device pointer wrapper; its Cuda member is the only kernel
 *              parameter and is placed at offset 0.
 *
 * The results of the driver calls are not checked.
 */
void polly_setKernelParameters(PollyGPUFunction *Kernel, int BlockWidth,
                               int BlockHeight, PollyGPUDevicePtr *DevData) {
  CUfunction Fn = Kernel->Cuda;
  const int ArgOffset = 0;
  const int ArgBytes = sizeof(DevData->Cuda);

  CuFuncSetBlockShapeFcnPtr(Fn, BlockWidth, BlockHeight, 1);

  /* One argument only, so the total parameter size equals the end of the
   * first argument. */
  CuParamSetvFcnPtr(Fn, ArgOffset, &(DevData->Cuda), sizeof(DevData->Cuda));
  CuParamSetSizeFcnPtr(Fn, ArgOffset + ArgBytes);
}
/* Launch Kernel on a GridWidth x GridHeight grid and synchronize afterwards.
 *
 * A failing launch prints a message to stdout and calls exit(-1). After a
 * successful launch the runtime's cudaThreadSynchronize entry point is called
 * (its result is not checked) and a confirmation is printed to stdout.
 */
void polly_launchKernel(PollyGPUFunction *Kernel, int GridWidth,
                        int GridHeight) {
  CUresult Status = CuLaunchGridFcnPtr(Kernel->Cuda, GridWidth, GridHeight);

  if (Status == CUDA_SUCCESS) {
    CudaThreadSynchronizeFcnPtr();
    fprintf(stdout, "CUDA kernel launched.\n");
    return;
  }

  fprintf(stdout, "Launching CUDA kernel failed.\n");
  exit(-1);
}
/* Release everything obtained through the other polly_* functions.
 *
 * HostData  Host buffer; freed (may be null).
 * DevData   Device pointer wrapper; its device memory is freed if set, and
 *           the wrapper itself is always freed (may be null).
 * Module    Module wrapper; the module is unloaded if set, and the wrapper
 *           itself is always freed (may be null).
 * Context   Context wrapper; the context is destroyed if set, and the
 *           wrapper itself is always freed (may be null).
 * Kernel    Kernel wrapper; freed (may be null).
 *
 * Finally both CUDA libraries are closed. The library handles are reset so
 * that a second call does not close them twice. None of the pointers passed
 * in may be used by the caller afterwards.
 */
void polly_cleanupGPGPUResources(void *HostData, PollyGPUDevicePtr *DevData,
                                 PollyGPUModule *Module,
                                 PollyGPUContext *Context,
                                 PollyGPUFunction *Kernel) {
  free(HostData);

  /* The wrapper structs are heap allocated independently of the CUDA handle
   * they hold, so they are freed even when that handle was never set. */
  if (DevData) {
    if (DevData->Cuda)
      CuMemFreeFcnPtr(DevData->Cuda);
    free(DevData);
  }

  if (Module) {
    if (Module->Cuda)
      CuModuleUnloadFcnPtr(Module->Cuda);
    free(Module);
  }

  if (Context) {
    if (Context->Cuda)
      CuCtxDestroyFcnPtr(Context->Cuda);
    free(Context);
  }

  free(Kernel);

  if (HandleCuda) {
    dlclose(HandleCuda);
    HandleCuda = 0;
  }
  if (HandleCudaRT) {
    dlclose(HandleCudaRT);
    HandleCudaRT = 0;
  }
}