forked from OSchip/llvm-project
Replace CUDA data types with Polly's GPGPU data types.
Contributed by: Yabin Hu <yabin.hwu@gmail.com> llvm-svn: 159725
This commit is contained in:
parent
eb7b9f8248
commit
5c0f6f3350
|
@ -144,9 +144,6 @@ macro(add_polly_library name)
|
|||
if (SCOPLIB_FOUND)
|
||||
target_link_libraries( ${name} ${SCOPLIB_LIBRARY})
|
||||
endif(SCOPLIB_FOUND)
|
||||
if (CUDALIB_FOUND)
|
||||
target_link_libraries( ${name} ${CUDALIB_LIBRARY})
|
||||
endif(CUDALIB_FOUND)
|
||||
|
||||
if( LLVM_LINK_COMPONENTS )
|
||||
llvm_config(${name} ${LLVM_LINK_COMPONENTS})
|
||||
|
|
|
@ -8,13 +8,13 @@ FIND_PATH(CUDALIB_INCLUDE_DIR
|
|||
|
||||
FIND_LIBRARY(CUDALIB_LIBRARY NAMES cuda)
|
||||
|
||||
IF (CUDALIB_INCLUDE_DIR AND CUDALIB_LIBRARY)
|
||||
IF (CUDALIB_INCLUDE_DIR)
|
||||
SET(CUDALIB_FOUND TRUE)
|
||||
ENDIF (CUDALIB_INCLUDE_DIR AND CUDALIB_LIBRARY)
|
||||
ENDIF (CUDALIB_INCLUDE_DIR)
|
||||
|
||||
IF (CUDALIB_FOUND)
|
||||
IF (NOT CUDA_FIND_QUIETLY)
|
||||
MESSAGE(STATUS "Found CUDA: ${CUDALIB_LIBRARY}")
|
||||
MESSAGE(STATUS "Found CUDA: ${CUDALIB_INCLUDE_DIR}")
|
||||
ENDIF (NOT CUDA_FIND_QUIETLY)
|
||||
ELSE (CUDALIB_FOUND)
|
||||
IF (CUDA_FIND_REQUIRED)
|
||||
|
|
|
@ -12,9 +12,36 @@
|
|||
/******************************************************************************/
|
||||
|
||||
#include "GPUJIT.h"
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <dlfcn.h>
|
||||
#include <stdio.h>
|
||||
|
||||
/* Define Polly's GPGPU data types. */
|
||||
struct PollyGPUContextT {
|
||||
CUcontext Cuda;
|
||||
};
|
||||
|
||||
struct PollyGPUModuleT {
|
||||
CUmodule Cuda;
|
||||
};
|
||||
|
||||
struct PollyGPUFunctionT {
|
||||
CUfunction Cuda;
|
||||
};
|
||||
|
||||
struct PollyGPUDeviceT {
|
||||
CUdevice Cuda;
|
||||
};
|
||||
|
||||
struct PollyGPUDevicePtrT {
|
||||
CUdeviceptr Cuda;
|
||||
};
|
||||
|
||||
struct PollyGPUEventT {
|
||||
cudaEvent_t Cuda;
|
||||
};
|
||||
|
||||
/* Dynamic library handles for the CUDA and CUDA runtime libraries. */
|
||||
static void *HandleCuda;
|
||||
static void *HandleCudaRT;
|
||||
|
@ -218,7 +245,7 @@ static int initialDeviceAPIs() {
|
|||
return 1;
|
||||
}
|
||||
|
||||
void polly_initDevice(CUcontext *Context, CUdevice *Device) {
|
||||
void polly_initDevice(PollyGPUContext **Context, PollyGPUDevice **Device) {
|
||||
int Major = 0, Minor = 0, DeviceID = 0;
|
||||
char DeviceName[256];
|
||||
int DeviceCount = 0;
|
||||
|
@ -242,85 +269,135 @@ void polly_initDevice(CUcontext *Context, CUdevice *Device) {
|
|||
}
|
||||
|
||||
/* We select the 1st device as default. */
|
||||
CuDeviceGetFcnPtr(Device, 0);
|
||||
*Device = malloc(sizeof(PollyGPUDevice));
|
||||
if (*Device == 0) {
|
||||
fprintf(stdout, "Allocate memory for Polly GPU device failed.\n");
|
||||
exit(-1);
|
||||
}
|
||||
CuDeviceGetFcnPtr(&((*Device)->Cuda), 0);
|
||||
|
||||
/* Get compute capabilities and the device name. */
|
||||
CuDeviceComputeCapabilityFcnPtr(&Major, &Minor, *Device);
|
||||
CuDeviceGetNameFcnPtr(DeviceName, 256, *Device);
|
||||
CuDeviceComputeCapabilityFcnPtr(&Major, &Minor, (*Device)->Cuda);
|
||||
CuDeviceGetNameFcnPtr(DeviceName, 256, (*Device)->Cuda);
|
||||
fprintf(stderr, "> Running on GPU device %d : %s.\n", DeviceID, DeviceName);
|
||||
|
||||
/* Create context on the device. */
|
||||
CuCtxCreateFcnPtr(Context, 0, *Device);
|
||||
*Context = malloc(sizeof(PollyGPUContext));
|
||||
if (*Context == 0) {
|
||||
fprintf(stdout, "Allocate memory for Polly GPU context failed.\n");
|
||||
exit(-1);
|
||||
}
|
||||
CuCtxCreateFcnPtr(&((*Context)->Cuda), 0, (*Device)->Cuda);
|
||||
}
|
||||
|
||||
void polly_getPTXModule(void *PTXBuffer, CUmodule *Module) {
|
||||
if(CuModuleLoadDataExFcnPtr(Module, PTXBuffer, 0, 0, 0) != CUDA_SUCCESS) {
|
||||
void polly_getPTXModule(void *PTXBuffer, PollyGPUModule **Module) {
|
||||
*Module = malloc(sizeof(PollyGPUModule));
|
||||
if (*Module == 0) {
|
||||
fprintf(stdout, "Allocate memory for Polly GPU module failed.\n");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
if (CuModuleLoadDataExFcnPtr(&((*Module)->Cuda), PTXBuffer, 0, 0, 0)
|
||||
!= CUDA_SUCCESS) {
|
||||
fprintf(stdout, "Loading ptx assembly text failed.\n");
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
void polly_getPTXKernelEntry(const char *KernelName, CUmodule *Module,
|
||||
CUfunction *Kernel) {
|
||||
void polly_getPTXKernelEntry(const char *KernelName, PollyGPUModule *Module,
|
||||
PollyGPUFunction **Kernel) {
|
||||
*Kernel = malloc(sizeof(PollyGPUFunction));
|
||||
if (*Kernel == 0) {
|
||||
fprintf(stdout, "Allocate memory for Polly GPU kernel failed.\n");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
/* Locate the kernel entry point. */
|
||||
if(CuModuleGetFunctionFcnPtr(Kernel, *Module, KernelName)
|
||||
if(CuModuleGetFunctionFcnPtr(&((*Kernel)->Cuda), Module->Cuda, KernelName)
|
||||
!= CUDA_SUCCESS) {
|
||||
fprintf(stdout, "Loading kernel function failed.\n");
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
void polly_startTimerByCudaEvent(cudaEvent_t *StartTimer,
|
||||
cudaEvent_t *StopTimer) {
|
||||
CudaEventCreateFcnPtr(StartTimer);
|
||||
CudaEventCreateFcnPtr(StopTimer);
|
||||
CudaEventRecordFcnPtr(*StartTimer, 0);
|
||||
void polly_startTimerByCudaEvent(PollyGPUEvent **Start, PollyGPUEvent **Stop) {
|
||||
*Start = malloc(sizeof(PollyGPUEvent));
|
||||
if (*Start == 0) {
|
||||
fprintf(stdout, "Allocate memory for Polly GPU start timer failed.\n");
|
||||
exit(-1);
|
||||
}
|
||||
CudaEventCreateFcnPtr(&((*Start)->Cuda));
|
||||
|
||||
*Stop = malloc(sizeof(PollyGPUEvent));
|
||||
if (*Stop == 0) {
|
||||
fprintf(stdout, "Allocate memory for Polly GPU stop timer failed.\n");
|
||||
exit(-1);
|
||||
}
|
||||
CudaEventCreateFcnPtr(&((*Stop)->Cuda));
|
||||
|
||||
/* Record the start time. */
|
||||
CudaEventRecordFcnPtr((*Start)->Cuda, 0);
|
||||
}
|
||||
|
||||
void polly_stopTimerByCudaEvent(cudaEvent_t *StartTimer,
|
||||
cudaEvent_t *StopTimer, float *ElapsedTimes) {
|
||||
CudaEventRecordFcnPtr(*StopTimer, 0);
|
||||
CudaEventSynchronizeFcnPtr(*StopTimer);
|
||||
CudaEventElapsedTimeFcnPtr(ElapsedTimes, *StartTimer, *StopTimer );
|
||||
CudaEventDestroyFcnPtr(*StartTimer);
|
||||
CudaEventDestroyFcnPtr(*StopTimer);
|
||||
void polly_stopTimerByCudaEvent(PollyGPUEvent *Start, PollyGPUEvent *Stop,
|
||||
float *ElapsedTimes) {
|
||||
/* Record the end time. */
|
||||
CudaEventRecordFcnPtr(Stop->Cuda, 0);
|
||||
CudaEventSynchronizeFcnPtr(Start->Cuda);
|
||||
CudaEventSynchronizeFcnPtr(Stop->Cuda);
|
||||
CudaEventElapsedTimeFcnPtr(ElapsedTimes, Start->Cuda, Stop->Cuda);
|
||||
CudaEventDestroyFcnPtr(Start->Cuda);
|
||||
CudaEventDestroyFcnPtr(Stop->Cuda);
|
||||
fprintf(stderr, "Processing time: %f (ms).\n", *ElapsedTimes);
|
||||
|
||||
free(Start);
|
||||
free(Stop);
|
||||
}
|
||||
|
||||
void polly_allocateMemoryForHostAndDevice(void **PtrHostData,
|
||||
CUdeviceptr *PtrDevData,
|
||||
void polly_allocateMemoryForHostAndDevice(void **HostData,
|
||||
PollyGPUDevicePtr **DevData,
|
||||
int MemSize) {
|
||||
if ((*PtrHostData = (int *)malloc(MemSize)) == 0) {
|
||||
if ((*HostData = (int *)malloc(MemSize)) == 0) {
|
||||
fprintf(stdout, "Could not allocate host memory.\n");
|
||||
exit(-1);
|
||||
}
|
||||
CuMemAllocFcnPtr(PtrDevData, MemSize);
|
||||
|
||||
*DevData = malloc(sizeof(PollyGPUDevicePtr));
|
||||
if (*DevData == 0) {
|
||||
fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n");
|
||||
exit(-1);
|
||||
}
|
||||
CuMemAllocFcnPtr(&((*DevData)->Cuda), MemSize);
|
||||
}
|
||||
|
||||
void polly_copyFromHostToDevice(CUdeviceptr DevData, void *HostData,
|
||||
void polly_copyFromHostToDevice(PollyGPUDevicePtr *DevData, void *HostData,
|
||||
int MemSize) {
|
||||
CuMemcpyHtoDFcnPtr(DevData, HostData, MemSize);
|
||||
CUdeviceptr CuDevData = DevData->Cuda;
|
||||
CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize);
|
||||
}
|
||||
|
||||
void polly_copyFromDeviceToHost(void *HostData, CUdeviceptr DevData,
|
||||
void polly_copyFromDeviceToHost(void *HostData, PollyGPUDevicePtr *DevData,
|
||||
int MemSize) {
|
||||
if(CuMemcpyDtoHFcnPtr(HostData, DevData, MemSize) != CUDA_SUCCESS) {
|
||||
if(CuMemcpyDtoHFcnPtr(HostData, DevData->Cuda, MemSize) != CUDA_SUCCESS) {
|
||||
fprintf(stdout, "Copying results from device to host memory failed.\n");
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
void polly_setKernelParameters(CUfunction *Kernel, int BlockWidth,
|
||||
int BlockHeight, CUdeviceptr DevData) {
|
||||
void polly_setKernelParameters(PollyGPUFunction *Kernel, int BlockWidth,
|
||||
int BlockHeight, PollyGPUDevicePtr *DevData) {
|
||||
int ParamOffset = 0;
|
||||
CuFuncSetBlockShapeFcnPtr(*Kernel, BlockWidth, BlockHeight, 1);
|
||||
CuParamSetvFcnPtr(*Kernel, ParamOffset, &DevData, sizeof(DevData));
|
||||
ParamOffset += sizeof(DevData);
|
||||
CuParamSetSizeFcnPtr(*Kernel, ParamOffset);
|
||||
|
||||
CuFuncSetBlockShapeFcnPtr(Kernel->Cuda, BlockWidth, BlockHeight, 1);
|
||||
CuParamSetvFcnPtr(Kernel->Cuda, ParamOffset, &(DevData->Cuda),
|
||||
sizeof(DevData->Cuda));
|
||||
ParamOffset += sizeof(DevData->Cuda);
|
||||
CuParamSetSizeFcnPtr(Kernel->Cuda, ParamOffset);
|
||||
}
|
||||
|
||||
void polly_launchKernel(CUfunction *Kernel, int GridWidth, int GridHeight) {
|
||||
if (CuLaunchGridFcnPtr(*Kernel, GridWidth, GridHeight) != CUDA_SUCCESS) {
|
||||
void polly_launchKernel(PollyGPUFunction *Kernel, int GridWidth,
|
||||
int GridHeight) {
|
||||
if (CuLaunchGridFcnPtr(Kernel->Cuda, GridWidth, GridHeight) != CUDA_SUCCESS) {
|
||||
fprintf(stdout, "Launching CUDA kernel failed.\n");
|
||||
exit(-1);
|
||||
}
|
||||
|
@ -328,26 +405,32 @@ void polly_launchKernel(CUfunction *Kernel, int GridWidth, int GridHeight) {
|
|||
fprintf(stdout, "CUDA kernel launched.\n");
|
||||
}
|
||||
|
||||
void polly_cleanupGPGPUResources(void *HostData, CUdeviceptr DevData,
|
||||
CUmodule *Module, CUcontext *Context) {
|
||||
void polly_cleanupGPGPUResources(void *HostData, PollyGPUDevicePtr *DevData,
|
||||
PollyGPUModule *Module,
|
||||
PollyGPUContext *Context,
|
||||
PollyGPUFunction *Kernel) {
|
||||
if (HostData) {
|
||||
free(HostData);
|
||||
HostData = 0;
|
||||
}
|
||||
|
||||
if (DevData) {
|
||||
CuMemFreeFcnPtr(DevData);
|
||||
DevData = 0;
|
||||
if (DevData->Cuda) {
|
||||
CuMemFreeFcnPtr(DevData->Cuda);
|
||||
free(DevData);
|
||||
}
|
||||
|
||||
if (*Module) {
|
||||
CuModuleUnloadFcnPtr(*Module);
|
||||
*Module = 0;
|
||||
if (Module->Cuda) {
|
||||
CuModuleUnloadFcnPtr(Module->Cuda);
|
||||
free(Module);
|
||||
}
|
||||
|
||||
if (*Context) {
|
||||
CuCtxDestroyFcnPtr(*Context);
|
||||
*Context = 0;
|
||||
if (Context->Cuda) {
|
||||
CuCtxDestroyFcnPtr(Context->Cuda);
|
||||
free(Context);
|
||||
}
|
||||
|
||||
if (Kernel) {
|
||||
free(Kernel);
|
||||
}
|
||||
|
||||
dlclose(HandleCuda);
|
||||
|
|
|
@ -14,28 +14,93 @@
|
|||
#ifndef GPUJIT_H_
|
||||
#define GPUJIT_H_
|
||||
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
/*
|
||||
* The following demonstrates how we can use the GPURuntime library to
|
||||
* execute a GPU kernel.
|
||||
*
|
||||
* char KernelString[] = "\n\
|
||||
* .version 1.4\n\
|
||||
* .target sm_10, map_f64_to_f32\n\
|
||||
* .entry _Z8myKernelPi (\n\
|
||||
* .param .u64 __cudaparm__Z8myKernelPi_data)\n\
|
||||
* {\n\
|
||||
* .reg .u16 %rh<4>;\n\
|
||||
* .reg .u32 %r<5>;\n\
|
||||
* .reg .u64 %rd<6>;\n\
|
||||
* cvt.u32.u16 %r1, %tid.x;\n\
|
||||
* mov.u16 %rh1, %ctaid.x;\n\
|
||||
* mov.u16 %rh2, %ntid.x;\n\
|
||||
* mul.wide.u16 %r2, %rh1, %rh2;\n\
|
||||
* add.u32 %r3, %r1, %r2;\n\
|
||||
* ld.param.u64 %rd1, [__cudaparm__Z8myKernelPi_data];\n\
|
||||
* cvt.s64.s32 %rd2, %r3;\n\
|
||||
* mul.wide.s32 %rd3, %r3, 4;\n\
|
||||
* add.u64 %rd4, %rd1, %rd3;\n\
|
||||
* st.global.s32 [%rd4+0], %r3;\n\
|
||||
* exit;\n\
|
||||
* }\n\
|
||||
* ";
|
||||
*
|
||||
* const char *Entry = "_Z8myKernelPi";
|
||||
*
|
||||
* int main() {
|
||||
* PollyGPUContext *Context;
|
||||
* PollyGPUModule *Module;
|
||||
* PollyGPUFunction *Kernel;
|
||||
* PollyGPUDevice *Device;
|
||||
* PollyGPUDevicePtr *DevData;
|
||||
* int *HostData;
|
||||
* PollyGPUEvent *Start;
|
||||
* PollyGPUEvent *Stop;
|
||||
* float *ElapsedTime;
|
||||
* int MemSize;
|
||||
* int BlockWidth = 16;
|
||||
* int BlockHeight = 16;
|
||||
* int GridWidth = 8;
|
||||
* int GridHeight = 8;
|
||||
*
|
||||
* MemSize = 256*64*sizeof(int);
|
||||
* polly_initDevice(&Context, &Device);
|
||||
* polly_getPTXModule(KernelString, &Module);
|
||||
* polly_getPTXKernelEntry(Entry, Module, &Kernel);
|
||||
* polly_allocateMemoryForHostAndDevice(&HostData, &DevData, MemSize);
|
||||
* polly_setKernelParameters(Kernel, BlockWidth, BlockHeight, DevData);
|
||||
* polly_startTimerByCudaEvent(&Start, &Stop);
|
||||
* polly_launchKernel(Kernel, GridWidth, GridHeight);
|
||||
* polly_copyFromDeviceToHost(HostData, DevData, MemSize);
|
||||
* polly_stopTimerByCudaEvent(Start, Stop, ElapsedTime);
|
||||
* polly_cleanupGPGPUResources(HostData, DevData, Module, Context, Kernel);
|
||||
* }
|
||||
*
|
||||
*/
|
||||
|
||||
void polly_initDevice(CUcontext *Context, CUdevice *Device);
|
||||
void polly_getPTXModule(void *PTXBuffer, CUmodule *Module);
|
||||
void polly_getPTXKernelEntry(const char *KernelName,
|
||||
CUmodule *Module,
|
||||
CUfunction *Kernel);
|
||||
void polly_startTimerByCudaEvent(cudaEvent_t *StartTimer,
|
||||
cudaEvent_t *StopTimer);
|
||||
void polly_stopTimerByCudaEvent(cudaEvent_t *StartTimer, cudaEvent_t *StopTimer,
|
||||
typedef struct PollyGPUContextT PollyGPUContext;
|
||||
typedef struct PollyGPUModuleT PollyGPUModule;
|
||||
typedef struct PollyGPUFunctionT PollyGPUFunction;
|
||||
typedef struct PollyGPUDeviceT PollyGPUDevice;
|
||||
typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr;
|
||||
typedef struct PollyGPUEventT PollyGPUEvent;
|
||||
|
||||
void polly_initDevice(PollyGPUContext **Context, PollyGPUDevice **Device);
|
||||
void polly_getPTXModule(void *PTXBuffer, PollyGPUModule **Module);
|
||||
void polly_getPTXKernelEntry(const char *KernelName, PollyGPUModule *Module,
|
||||
PollyGPUFunction **Kernel);
|
||||
void polly_startTimerByCudaEvent(PollyGPUEvent **Start, PollyGPUEvent **Stop);
|
||||
void polly_stopTimerByCudaEvent(PollyGPUEvent *Start, PollyGPUEvent *Stop,
|
||||
float *ElapsedTimes);
|
||||
void polly_copyFromHostToDevice(CUdeviceptr DevData, void *HostData,
|
||||
void polly_copyFromHostToDevice(PollyGPUDevicePtr *DevData, void *HostData,
|
||||
int MemSize);
|
||||
void polly_copyFromDeviceToHost(void *HostData, CUdeviceptr DevData,
|
||||
void polly_copyFromDeviceToHost(void *HostData, PollyGPUDevicePtr *DevData,
|
||||
int MemSize);
|
||||
void polly_allocateMemoryForHostAndDevice(void **PtrHostData,
|
||||
CUdeviceptr *PtrDevData,
|
||||
void polly_allocateMemoryForHostAndDevice(void **HostData,
|
||||
PollyGPUDevicePtr **DevData,
|
||||
int MemSize);
|
||||
void polly_setKernelParameters(CUfunction *Kernel, int BlockWidth,
|
||||
int BlockHeight, CUdeviceptr DevData);
|
||||
void polly_launchKernel(CUfunction *Kernel, int GridWidth, int GridHeight);
|
||||
void polly_cleanupGPGPUResources(void *HostData, CUdeviceptr DevData,
|
||||
CUmodule *Module, CUcontext *Context);
|
||||
void polly_setKernelParameters(PollyGPUFunction *Kernel, int BlockWidth,
|
||||
int BlockHeight, PollyGPUDevicePtr *DevData);
|
||||
void polly_launchKernel(PollyGPUFunction *Kernel, int GridWidth,
|
||||
int GridHeight);
|
||||
void polly_cleanupGPGPUResources(void *HostData, PollyGPUDevicePtr *DevData,
|
||||
PollyGPUModule *Module,
|
||||
PollyGPUContext *Context,
|
||||
PollyGPUFunction *Kernel);
|
||||
#endif /* GPUJIT_H_ */
|
||||
|
|
Loading…
Reference in New Issue