forked from OSchip/llvm-project
107 lines
4.6 KiB
C
107 lines
4.6 KiB
C
/******************************************************************************/
|
|
/* */
|
|
/* The LLVM Compiler Infrastructure */
|
|
/* */
|
|
/* This file is dual licensed under the MIT and the University of Illinois */
|
|
/* Open Source License. See LICENSE.TXT for details. */
|
|
/* */
|
|
/******************************************************************************/
|
|
/* */
|
|
/* This file defines GPUJIT. */
|
|
/* */
|
|
/******************************************************************************/
|
|
|
|
#ifndef GPUJIT_H_
|
|
#define GPUJIT_H_
|
|
|
|
/*
|
|
* The following demonstrates how we can use the GPURuntime library to
|
|
* execute a GPU kernel.
|
|
*
|
|
* char KernelString[] = "\n\
|
|
* .version 1.4\n\
|
|
* .target sm_10, map_f64_to_f32\n\
|
|
* .entry _Z8myKernelPi (\n\
|
|
* .param .u64 __cudaparm__Z8myKernelPi_data)\n\
|
|
* {\n\
|
|
* .reg .u16 %rh<4>;\n\
|
|
* .reg .u32 %r<5>;\n\
|
|
* .reg .u64 %rd<6>;\n\
|
|
* cvt.u32.u16 %r1, %tid.x;\n\
|
|
* mov.u16 %rh1, %ctaid.x;\n\
|
|
* mov.u16 %rh2, %ntid.x;\n\
|
|
* mul.wide.u16 %r2, %rh1, %rh2;\n\
|
|
* add.u32 %r3, %r1, %r2;\n\
|
|
* ld.param.u64 %rd1, [__cudaparm__Z8myKernelPi_data];\n\
|
|
* cvt.s64.s32 %rd2, %r3;\n\
|
|
* mul.wide.s32 %rd3, %r3, 4;\n\
|
|
* add.u64 %rd4, %rd1, %rd3;\n\
|
|
* st.global.s32 [%rd4+0], %r3;\n\
|
|
* exit;\n\
|
|
* }\n\
|
|
* ";
|
|
*
|
|
* const char *Entry = "_Z8myKernelPi";
|
|
*
|
|
* int main() {
|
|
* PollyGPUContext *Context;
|
|
* PollyGPUModule *Module;
|
|
* PollyGPUFunction *Kernel;
|
|
* PollyGPUDevice *Device;
|
|
* PollyGPUDevicePtr *DevData;
|
|
* int *HostData;
|
|
* PollyGPUEvent *Start;
|
|
* PollyGPUEvent *Stop;
|
|
* float *ElapsedTime;
|
|
* int MemSize;
|
|
* int BlockWidth = 16;
|
|
* int BlockHeight = 16;
|
|
* int GridWidth = 8;
|
|
* int GridHeight = 8;
|
|
*
|
|
* MemSize = 256*64*sizeof(int);
|
|
* polly_initDevice(&Context, &Device);
|
|
* polly_getPTXModule(KernelString, &Module);
|
|
* polly_getPTXKernelEntry(Entry, Module, &Kernel);
|
|
* polly_allocateMemoryForHostAndDevice(&HostData, &DevData, MemSize);
|
|
* polly_setKernelParameters(Kernel, BlockWidth, BlockHeight, DevData);
|
|
* polly_startTimerByCudaEvent(&Start, &Stop);
|
|
* polly_launchKernel(Kernel, GridWidth, GridHeight);
|
|
* polly_copyFromDeviceToHost(HostData, DevData, MemSize);
|
|
* polly_stopTimerByCudaEvent(Start, Stop, ElapsedTime);
|
|
* polly_cleanupGPGPUResources(HostData, DevData, Module, Context, Kernel);
|
|
* }
|
|
*
|
|
*/
|
|
|
|
/*
 * Opaque handle types used by the polly_* interface below.
 *
 * Each typedef names a struct that is only forward-declared in this header;
 * the struct definitions are not visible here, so callers can only hold and
 * pass pointers to these types.
 */
typedef struct PollyGPUContextT PollyGPUContext;
|
|
typedef struct PollyGPUModuleT PollyGPUModule;
|
|
typedef struct PollyGPUFunctionT PollyGPUFunction;
|
|
typedef struct PollyGPUDeviceT PollyGPUDevice;
|
|
typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr;
|
|
typedef struct PollyGPUEventT PollyGPUEvent;
|
|
|
|
/*
 * First call in the usage example at the top of this header; judging by the
 * name it sets up the GPU device and a context for the later polly_* calls.
 *
 * Context  Out-parameter: address of a context handle pointer.
 * Device   Out-parameter: address of a device handle pointer.
 *
 * NOTE(review): the function returns void, so how a missing or unusable
 * device is reported is not visible from this header -- confirm in the
 * implementation.
 */
void polly_initDevice(PollyGPUContext **Context, PollyGPUDevice **Device);
|
|
/*
 * Obtain a module handle for a buffer holding PTX code.
 *
 * PTXBuffer  The PTX code; the usage example in this header passes a
 *            NUL-terminated string containing PTX assembly text.
 * Module     Out-parameter: address of a module handle pointer. The example
 *            later passes the resulting handle to polly_getPTXKernelEntry()
 *            and polly_cleanupGPGPUResources().
 *
 * NOTE(review): whether PTXBuffer must stay alive after this call returns is
 * not visible from this header -- confirm in the implementation.
 */
void polly_getPTXModule(void *PTXBuffer, PollyGPUModule **Module);
|
|
/*
 * Look up a kernel entry point by name inside a module.
 *
 * KernelName  Name of the entry; the usage example passes the name that
 *             appears after ".entry" in its PTX string ("_Z8myKernelPi").
 * Module      Module handle, as produced through polly_getPTXModule() in the
 *             usage example.
 * Kernel      Out-parameter: address of a kernel function handle pointer.
 *
 * NOTE(review): behavior for a name that is not present in the module is not
 * visible from this header -- confirm in the implementation.
 */
void polly_getPTXKernelEntry(const char *KernelName, PollyGPUModule *Module,
|
|
PollyGPUFunction **Kernel);
|
|
/*
 * Begin a timing measurement using a pair of events.
 *
 * Start  Out-parameter: address of the event handle pointer for the start.
 * Stop   Out-parameter: address of the event handle pointer for the stop.
 *
 * In the usage example this is called right before polly_launchKernel(), and
 * the two handles are later passed to polly_stopTimerByCudaEvent().
 * NOTE(review): presumably both events are created here and only Start is
 * recorded -- confirm in the implementation.
 */
void polly_startTimerByCudaEvent(PollyGPUEvent **Start, PollyGPUEvent **Stop);
|
|
/*
 * Finish a timing measurement started with polly_startTimerByCudaEvent().
 *
 * Start         Event handle obtained from polly_startTimerByCudaEvent().
 * Stop          Event handle obtained from polly_startTimerByCudaEvent().
 * ElapsedTimes  Pointer to float; presumably receives the time elapsed
 *               between the two events -- TODO confirm, including the unit.
 *
 * NOTE(review): whether the events are destroyed by this call is not visible
 * from this header; note that polly_cleanupGPGPUResources() takes no event
 * arguments.
 */
void polly_stopTimerByCudaEvent(PollyGPUEvent *Start, PollyGPUEvent *Stop,
|
|
float *ElapsedTimes);
|
|
/*
 * Copy data from a host buffer to device memory (direction per the name;
 * note the destination is the first parameter here).
 *
 * DevData   Device memory handle (destination).
 * HostData  Host buffer (source).
 * MemSize   Size of the transfer; presumably in bytes, matching how the
 *           usage example computes MemSize with sizeof(int) -- TODO confirm.
 */
void polly_copyFromHostToDevice(PollyGPUDevicePtr *DevData, void *HostData,
|
|
int MemSize);
|
|
/*
 * Copy data from device memory to a host buffer (direction per the name).
 * The usage example calls this after polly_launchKernel() to fetch results.
 *
 * HostData  Host buffer (destination).
 * DevData   Device memory handle (source).
 * MemSize   Size of the transfer; the usage example passes a byte count
 *           (256*64*sizeof(int)).
 */
void polly_copyFromDeviceToHost(void *HostData, PollyGPUDevicePtr *DevData,
|
|
int MemSize);
|
|
/*
 * Allocate a host buffer and a matching device buffer in one call.
 *
 * HostData  Out-parameter: address of the host buffer pointer.
 * DevData   Out-parameter: address of the device memory handle pointer.
 * MemSize   Requested size; the usage example passes a byte count
 *           (256*64*sizeof(int)).
 *
 * In the usage example both results are later handed to
 * polly_cleanupGPGPUResources().
 * NOTE(review): how an allocation failure is reported is not visible from
 * this header (the function returns void) -- confirm in the implementation.
 */
void polly_allocateMemoryForHostAndDevice(void **HostData,
|
|
PollyGPUDevicePtr **DevData,
|
|
int MemSize);
|
|
/*
 * Configure a kernel before it is launched.
 *
 * Kernel       Kernel function handle, as obtained through
 *              polly_getPTXKernelEntry() in the usage example.
 * BlockWidth   Block extent in the first dimension (16 in the usage example).
 * BlockHeight  Block extent in the second dimension (16 in the usage example).
 * DevData      Device memory handle; presumably passed to the kernel as its
 *              single pointer argument, which would match the one .u64
 *              parameter of the example kernel -- TODO confirm.
 */
void polly_setKernelParameters(PollyGPUFunction *Kernel, int BlockWidth,
|
|
int BlockHeight, PollyGPUDevicePtr *DevData);
|
|
/*
 * Launch a kernel on a two-dimensional grid.
 *
 * Kernel      Kernel function handle; the usage example configures it with
 *             polly_setKernelParameters() before this call.
 * GridWidth   Grid extent in the first dimension (8 in the usage example).
 * GridHeight  Grid extent in the second dimension (8 in the usage example).
 *
 * NOTE(review): whether this call waits for the kernel to finish is not
 * visible from this header -- confirm in the implementation.
 */
void polly_launchKernel(PollyGPUFunction *Kernel, int GridWidth,
|
|
int GridHeight);
|
|
/*
 * Final call in the usage example; judging by the name it releases the
 * resources acquired through the other polly_* calls.
 *
 * HostData  Host buffer from polly_allocateMemoryForHostAndDevice().
 * DevData   Device memory handle from polly_allocateMemoryForHostAndDevice().
 * Module    Module handle from polly_getPTXModule().
 * Context   Context handle from polly_initDevice().
 * Kernel    Kernel function handle from polly_getPTXKernelEntry().
 *
 * NOTE(review): the device handle and the timer events are not parameters
 * here; where those are released is not visible from this header. Whether
 * NULL arguments are tolerated is likewise not shown -- confirm in the
 * implementation.
 */
void polly_cleanupGPGPUResources(void *HostData, PollyGPUDevicePtr *DevData,
|
|
PollyGPUModule *Module,
|
|
PollyGPUContext *Context,
|
|
PollyGPUFunction *Kernel);
|
|
#endif /* GPUJIT_H_ */
|