forked from OSchip/llvm-project
GPGPU: Add basic support for kernel launches
llvm-svn: 276863
This commit is contained in:
parent
499375ceaa
commit
79a947c233
|
@ -204,6 +204,29 @@ private:
|
|||
/// @returns A set of values referenced by the kernel.
|
||||
SetVector<Value *> getReferencesInKernel(ppcg_kernel *Kernel);
|
||||
|
||||
/// Compute the sizes of the execution grid for a given kernel.
|
||||
///
|
||||
/// @param Kernel The kernel to compute grid sizes for.
|
||||
///
|
||||
/// @returns A tuple with grid sizes for the X and Y dimensions.
|
||||
std::tuple<Value *, Value *> getGridSizes(ppcg_kernel *Kernel);
|
||||
|
||||
/// Compute the sizes of the thread blocks for a given kernel.
|
||||
///
|
||||
/// @param Kernel The kernel to compute thread block sizes for.
|
||||
///
|
||||
/// @returns A tuple with thread block sizes for X, Y, and Z dimensions.
|
||||
std::tuple<Value *, Value *, Value *> getBlockSizes(ppcg_kernel *Kernel);
|
||||
|
||||
/// Create kernel launch parameters.
|
||||
///
|
||||
/// @param Kernel The kernel to create parameters for.
|
||||
/// @param F The kernel function that has been created.
|
||||
///
|
||||
/// @returns A stack allocated array with pointers to the parameter
|
||||
/// values that are passed to the kernel.
|
||||
Value *createLaunchParameters(ppcg_kernel *Kernel, Function *F);
|
||||
|
||||
/// Create GPU kernel.
|
||||
///
|
||||
/// Code generate the kernel described by @p KernelStmt.
|
||||
|
@ -296,6 +319,13 @@ private:
|
|||
/// @returns A pointer to the newly initialized context.
|
||||
Value *createCallInitContext();
|
||||
|
||||
/// Create a call to get the device pointer for a kernel allocation.
|
||||
///
|
||||
/// @param Allocation The Polly GPU allocation
|
||||
///
|
||||
/// @returns The device parameter corresponding to this allocation.
|
||||
Value *createCallGetDevicePtr(Value *Allocation);
|
||||
|
||||
/// Create a call to free the GPU context.
|
||||
///
|
||||
/// @param Context A pointer to an initialized GPU context.
|
||||
|
@ -339,6 +369,21 @@ private:
|
|||
///
|
||||
/// @param GPUKernel The kernel to free.
|
||||
void createCallFreeKernel(Value *GPUKernel);
|
||||
|
||||
/// Create a call to launch a GPU kernel.
|
||||
///
|
||||
/// @param GPUKernel The kernel to launch.
|
||||
/// @param GridDimX The size of the first grid dimension.
|
||||
/// @param GridDimY The size of the second grid dimension.
|
||||
/// @param BlockDimX The size of the first block dimension.
|
||||
/// @param BlockDimY The size of the second block dimension.
|
||||
/// @param BlockDimZ The size of the third block dimension.
|
||||
/// @param Parameters A pointer to an array that itself contains pointers to
|
||||
/// the parameter values passed for each kernel argument.
|
||||
void createCallLaunchKernel(Value *GPUKernel, Value *GridDimX,
|
||||
Value *GridDimY, Value *BlockDimX,
|
||||
Value *BlockDimY, Value *BlockDimZ,
|
||||
Value *Parameters);
|
||||
};
|
||||
|
||||
void GPUNodeBuilder::initializeAfterRTH() {
|
||||
|
@ -393,6 +438,50 @@ Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Entry) {
|
|||
return Builder.CreateCall(F, {Buffer, Entry});
|
||||
}
|
||||
|
||||
/// Emit a call to the runtime function "polly_getDevicePtr".
///
/// The callee is looked up in the module that owns the current insert block.
/// If no function of that name exists yet, it is declared with external
/// linkage and the signature i8* (i8*).
///
/// @param Allocation The value passed as the single argument of the call.
///
/// @returns The value produced by the emitted call.
Value *GPUNodeBuilder::createCallGetDevicePtr(Value *Allocation) {
  const char *Name = "polly_getDevicePtr";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();

  // Reuse an existing declaration when the module already has one.
  if (Function *Existing = M->getFunction(Name))
    return Builder.CreateCall(Existing, {Allocation});

  // Otherwise declare: i8* polly_getDevicePtr(i8*).
  Type *Int8PtrTy = Builder.getInt8PtrTy();
  std::vector<Type *> ParamTypes(1, Int8PtrTy);
  FunctionType *FnTy = FunctionType::get(Int8PtrTy, ParamTypes, false);
  Function *Declared =
      Function::Create(FnTy, Function::ExternalLinkage, Name, M);

  return Builder.CreateCall(Declared, {Allocation});
}
|
||||
|
||||
/// Emit a call to the runtime function "polly_launchKernel".
///
/// The callee is looked up in the module that owns the current insert block
/// and, if absent, declared with external linkage and the signature
/// void (i8*, i32, i32, i32, i32, i32, i8*).
///
/// @param GPUKernel  First call argument (declared as i8*).
/// @param GridDimX   Second call argument (declared as i32).
/// @param GridDimY   Third call argument (declared as i32).
/// @param BlockDimX  Fourth call argument (declared as i32).
/// @param BlockDimY  Fifth call argument (declared as i32).
/// @param BlockDimZ  Sixth call argument (declared as i32).
/// @param Parameters Seventh call argument (declared as i8*).
void GPUNodeBuilder::createCallLaunchKernel(Value *GPUKernel, Value *GridDimX,
                                            Value *GridDimY, Value *BlockDimX,
                                            Value *BlockDimY, Value *BlockDimZ,
                                            Value *Parameters) {
  const char *Name = "polly_launchKernel";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *Callee = M->getFunction(Name);

  // Declare the runtime entry point on first use.
  if (!Callee) {
    Type *Int8PtrTy = Builder.getInt8PtrTy();
    Type *Int32Ty = Builder.getInt32Ty();

    // Kernel handle, two grid sizes, three block sizes, parameter array.
    std::vector<Type *> ParamTypes = {Int8PtrTy, Int32Ty, Int32Ty, Int32Ty,
                                      Int32Ty,   Int32Ty, Int8PtrTy};

    FunctionType *FnTy =
        FunctionType::get(Builder.getVoidTy(), ParamTypes, false);
    Callee = Function::Create(FnTy, Function::ExternalLinkage, Name, M);
  }

  Builder.CreateCall(Callee, {GPUKernel, GridDimX, GridDimY, BlockDimX,
                              BlockDimY, BlockDimZ, Parameters});
}
|
||||
|
||||
void GPUNodeBuilder::createCallFreeKernel(Value *GPUKernel) {
|
||||
const char *Name = "polly_freeKernel";
|
||||
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
|
||||
|
@ -755,6 +844,77 @@ void GPUNodeBuilder::clearLoops(Function *F) {
|
|||
}
|
||||
}
|
||||
|
||||
/// Build the grid size values for @p Kernel.
///
/// For each of the Kernel->n_grid dimensions, the corresponding entry of
/// Kernel->grid_size is turned into an isl AST expression, code generated via
/// ExprBuilder and truncated to i32. Dimensions beyond n_grid (up to three
/// entries in total) are filled with the i32 constant 1.
///
/// @param Kernel The kernel whose grid_size / n_grid fields are read.
///
/// @returns The first two computed sizes.
std::tuple<Value *, Value *> GPUNodeBuilder::getGridSizes(ppcg_kernel *Kernel) {
  std::vector<Value *> GridDims;
  isl_ast_build *Build = isl_ast_build_from_context(S.getContext());

  for (long Dim = 0; Dim < Kernel->n_grid; ++Dim) {
    isl_pw_aff *DimSize = isl_multi_pw_aff_get_pw_aff(Kernel->grid_size, Dim);
    isl_ast_expr *SizeExpr = isl_ast_build_expr_from_pw_aff(Build, DimSize);
    Value *Generated = ExprBuilder.create(SizeExpr);
    GridDims.push_back(Builder.CreateTrunc(Generated, Builder.getInt32Ty()));
  }

  isl_ast_build_free(Build);

  // Pad the unused dimensions with a size of one.
  IntegerType *Int32Ty = Builder.getInt32Ty();
  for (long Dim = Kernel->n_grid; Dim < 3; ++Dim)
    GridDims.push_back(ConstantInt::get(Int32Ty, 1));

  return std::make_tuple(GridDims[0], GridDims[1]);
}
|
||||
|
||||
std::tuple<Value *, Value *, Value *>
|
||||
GPUNodeBuilder::getBlockSizes(ppcg_kernel *Kernel) {
|
||||
std::vector<Value *> Sizes;
|
||||
|
||||
for (long i = 0; i < Kernel->n_block; i++) {
|
||||
Value *Res = ConstantInt::get(Builder.getInt32Ty(), Kernel->block_dim[i]);
|
||||
Sizes.push_back(Res);
|
||||
}
|
||||
|
||||
for (long i = Kernel->n_block; i < 3; i++)
|
||||
Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1));
|
||||
|
||||
return std::make_tuple(Sizes[0], Sizes[1], Sizes[2]);
|
||||
}
|
||||
|
||||
/// Create the parameter array that is handed to the kernel launch call.
///
/// An array of i8* with one slot per formal argument of @p F is allocated in
/// the entry block of the current function. For every array of the program
/// for which ppcg_kernel_requires_array_argument() holds, the device pointer
/// of its allocation is obtained through createCallGetDevicePtr(), stored in
/// its own entry-block alloca, and the address of that alloca (cast to i8*)
/// is written to the next free slot of the parameter array.
///
/// @param Kernel The kernel to create parameters for.
/// @param F      The kernel function that has been created.
///
/// @returns The parameter array, bitcast to i8*. The bitcast is inserted
///          before the terminator of the entry block.
Value *GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel,
                                              Function *F) {
  // Size the array by the number of formal kernel arguments.
  // Function::getNumOperands() counts the function's User operands, not its
  // arguments, and produced a zero-length array here.
  Type *ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), F->arg_size());

  BasicBlock *EntryBlock =
      &Builder.GetInsertBlock()->getParent()->getEntryBlock();
  std::string Launch = "polly_launch_" + std::to_string(Kernel->id);
  Instruction *Parameters =
      new AllocaInst(ArrayTy, Launch + "_params", EntryBlock->getTerminator());

  // Index counts the parameters actually emitted; it differs from the program
  // array index i whenever an array is skipped.
  int Index = 0;
  for (long i = 0; i < Prog->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id);

    Value *DevArray = DeviceAllocations[const_cast<ScopArrayInfo *>(SAI)];
    DevArray = createCallGetDevicePtr(DevArray);

    // The launch interface takes pointers to the argument values, so each
    // device pointer gets its own stack slot.
    Instruction *Param = new AllocaInst(
        Builder.getInt8PtrTy(), Launch + "_param_" + std::to_string(Index),
        EntryBlock->getTerminator());
    Builder.CreateStore(DevArray, Param);

    // Slots are filled densely, so address them by Index rather than by the
    // program array index.
    Value *Slot = Builder.CreateGEP(
        Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
    Value *ParamTyped =
        Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
    Builder.CreateStore(ParamTyped, Slot);
    Index++;
  }

  auto Location = EntryBlock->getTerminator();
  return new BitCastInst(Parameters, Builder.getInt8PtrTy(),
                         Launch + "_params_i8ptr", Location);
}
|
||||
|
||||
void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
|
||||
isl_id *Id = isl_ast_node_get_annotation(KernelStmt);
|
||||
ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id);
|
||||
|
@ -805,11 +965,22 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
|
|||
S.invalidateScopArrayInfo(BasePtr, ScopArrayInfo::MK_Array);
|
||||
LocalArrays.clear();
|
||||
|
||||
Value *Parameters = createLaunchParameters(Kernel, F);
|
||||
|
||||
std::string ASMString = finalizeKernelFunction();
|
||||
std::string Name = "kernel_" + std::to_string(Kernel->id);
|
||||
Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name);
|
||||
Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name");
|
||||
Value *GPUKernel = createCallGetKernel(KernelString, NameString);
|
||||
|
||||
Value *GridDimX, *GridDimY;
|
||||
std::tie(GridDimX, GridDimY) = getGridSizes(Kernel);
|
||||
|
||||
Value *BlockDimX, *BlockDimY, *BlockDimZ;
|
||||
std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel);
|
||||
|
||||
createCallLaunchKernel(GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
|
||||
BlockDimZ, Parameters);
|
||||
createCallFreeKernel(GPUKernel);
|
||||
}
|
||||
|
||||
|
|
|
@ -96,7 +96,13 @@
|
|||
; IR-NEXT: %p_dev_array_MemRef_A = call i8* @polly_allocateMemoryForDevice(i64 4194304)
|
||||
; IR-NEXT: [[HostPtr:%.*]] = bitcast [1024 x float]* %A to i8*
|
||||
; IR-NEXT: call void @polly_copyFromHostToDevice(i8* [[HostPtr]], i8* %p_dev_array_MemRef_A, i64 4194304)
|
||||
; IR-NEXT: [[DevPtr:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A)
|
||||
; IR-NEXT: store i8* [[DevPtr]], i8** %polly_launch_0_param_0
|
||||
; IR-NEXT: [[ParamSlot:%.*]] = getelementptr [0 x i8*], [0 x i8*]* %polly_launch_0_params, i64 0, i64 0
|
||||
; IR-NEXT: [[ParamTyped:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8*
|
||||
; IR-NEXT: store i8* [[ParamTyped]], i8** [[ParamSlot]]
|
||||
; IR-NEXT: call i8* @polly_getKernel
|
||||
; IR-NEXT: call void @polly_launchKernel(i8* %5, i32 32, i32 32, i32 32, i32 16, i32 1, i8* %polly_launch_0_params_i8ptr)
|
||||
; IR-NEXT: call void @polly_freeKernel
|
||||
; IR-NEXT: [[HostPtr2:%.*]] = bitcast [1024 x float]* %A to i8*
|
||||
; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_A, i8* [[HostPtr2]], i64 4194304)
|
||||
|
|
|
@ -30,8 +30,10 @@
|
|||
|
||||
; IR-LABEL: polly.loop_header: ; preds = %polly.loop_header, %polly.loop_preheader
|
||||
; IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ]
|
||||
; IR-NEXT: call i8* @polly_getKernel
|
||||
; IR-NEXT: call void @polly_freeKernel
|
||||
; ...
|
||||
; IR: call i8* @polly_getKernel
|
||||
; ...
|
||||
; IR: call void @polly_freeKernel
|
||||
; IR-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1
|
||||
; IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar, 98
|
||||
; IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit
|
||||
|
|
|
@ -54,18 +54,12 @@ static void *HandleCudaRT;
|
|||
typedef CUresult CUDAAPI CuMemAllocFcnTy(CUdeviceptr *, size_t);
|
||||
static CuMemAllocFcnTy *CuMemAllocFcnPtr;
|
||||
|
||||
typedef CUresult CUDAAPI CuFuncSetBlockShapeFcnTy(CUfunction, int, int, int);
|
||||
static CuFuncSetBlockShapeFcnTy *CuFuncSetBlockShapeFcnPtr;
|
||||
|
||||
typedef CUresult CUDAAPI CuParamSetvFcnTy(CUfunction, int, void *,
|
||||
unsigned int);
|
||||
static CuParamSetvFcnTy *CuParamSetvFcnPtr;
|
||||
|
||||
typedef CUresult CUDAAPI CuParamSetSizeFcnTy(CUfunction, unsigned int);
|
||||
static CuParamSetSizeFcnTy *CuParamSetSizeFcnPtr;
|
||||
|
||||
typedef CUresult CUDAAPI CuLaunchGridFcnTy(CUfunction, int, int);
|
||||
static CuLaunchGridFcnTy *CuLaunchGridFcnPtr;
|
||||
typedef CUresult CUDAAPI CuLaunchKernelFcnTy(
|
||||
CUfunction f, unsigned int gridDimX, unsigned int gridDimY,
|
||||
unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY,
|
||||
unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream,
|
||||
void **kernelParams, void **extra);
|
||||
static CuLaunchKernelFcnTy *CuLaunchKernelFcnPtr;
|
||||
|
||||
typedef CUresult CUDAAPI CuMemcpyDtoHFcnTy(void *, CUdeviceptr, size_t);
|
||||
static CuMemcpyDtoHFcnTy *CuMemcpyDtoHFcnPtr;
|
||||
|
@ -178,17 +172,8 @@ static int initialDeviceAPIs() {
|
|||
* of this kind of cast may not be emitted by clang and new versions of gcc
|
||||
* as it is valid on POSIX 2008.
|
||||
*/
|
||||
CuFuncSetBlockShapeFcnPtr = (CuFuncSetBlockShapeFcnTy *)getAPIHandle(
|
||||
HandleCuda, "cuFuncSetBlockShape");
|
||||
|
||||
CuParamSetvFcnPtr =
|
||||
(CuParamSetvFcnTy *)getAPIHandle(HandleCuda, "cuParamSetv");
|
||||
|
||||
CuParamSetSizeFcnPtr =
|
||||
(CuParamSetSizeFcnTy *)getAPIHandle(HandleCuda, "cuParamSetSize");
|
||||
|
||||
CuLaunchGridFcnPtr =
|
||||
(CuLaunchGridFcnTy *)getAPIHandle(HandleCuda, "cuLaunchGrid");
|
||||
CuLaunchKernelFcnPtr =
|
||||
(CuLaunchKernelFcnTy *)getAPIHandle(HandleCuda, "cuLaunchKernel");
|
||||
|
||||
CuMemAllocFcnPtr =
|
||||
(CuMemAllocFcnTy *)getAPIHandle(HandleCuda, "cuMemAlloc_v2");
|
||||
|
@ -407,29 +392,25 @@ void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
|
|||
}
|
||||
}
|
||||
|
||||
void polly_setKernelParameters(PollyGPUFunction *Kernel, int BlockWidth,
|
||||
int BlockHeight, PollyGPUDevicePtr *DevData) {
|
||||
void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
|
||||
unsigned int GridDimY, unsigned int BlockDimX,
|
||||
unsigned int BlockDimY, unsigned int BlockDimZ,
|
||||
void **Parameters) {
|
||||
dump_function();
|
||||
|
||||
int ParamOffset = 0;
|
||||
unsigned GridDimZ = 1;
|
||||
unsigned int SharedMemBytes = CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE;
|
||||
CUstream Stream = 0;
|
||||
void **Extra = 0;
|
||||
|
||||
CuFuncSetBlockShapeFcnPtr(Kernel->Cuda, BlockWidth, BlockHeight, 1);
|
||||
CuParamSetvFcnPtr(Kernel->Cuda, ParamOffset, &(DevData->Cuda),
|
||||
sizeof(DevData->Cuda));
|
||||
ParamOffset += sizeof(DevData->Cuda);
|
||||
CuParamSetSizeFcnPtr(Kernel->Cuda, ParamOffset);
|
||||
}
|
||||
|
||||
void polly_launchKernel(PollyGPUFunction *Kernel, int GridWidth,
|
||||
int GridHeight) {
|
||||
dump_function();
|
||||
|
||||
if (CuLaunchGridFcnPtr(Kernel->Cuda, GridWidth, GridHeight) != CUDA_SUCCESS) {
|
||||
CUresult Res;
|
||||
Res = CuLaunchKernelFcnPtr(Kernel->Cuda, GridDimX, GridDimY, GridDimZ,
|
||||
BlockDimX, BlockDimY, BlockDimZ, SharedMemBytes,
|
||||
Stream, Parameters, Extra);
|
||||
if (Res != CUDA_SUCCESS) {
|
||||
fprintf(stdout, "Launching CUDA kernel failed.\n");
|
||||
exit(-1);
|
||||
}
|
||||
CudaThreadSynchronizeFcnPtr();
|
||||
debug_print("CUDA kernel launched.\n");
|
||||
}
|
||||
|
||||
void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) {
|
||||
|
@ -458,6 +439,12 @@ PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) {
|
|||
return DevData;
|
||||
}
|
||||
|
||||
/* Return the Cuda member of Allocation, cast to void *. */
void *polly_getDevicePtr(PollyGPUDevicePtr *Allocation) {
  void *DevicePtr;

  dump_function();

  DevicePtr = (void *)Allocation->Cuda;
  return DevicePtr;
}
|
||||
|
||||
void polly_freeContext(PollyGPUContext *Context) {
|
||||
dump_function();
|
||||
|
||||
|
|
|
@ -49,17 +49,25 @@
|
|||
* PollyGPUDevicePtr *DevArray;
|
||||
* int *HostData;
|
||||
* int MemSize;
|
||||
* int BlockWidth = 16;
|
||||
* int BlockHeight = 16;
|
||||
* int GridWidth = 8;
|
||||
* int GridHeight = 8;
|
||||
*
|
||||
* int GridX = 8;
|
||||
* int GridY = 8;
|
||||
*
|
||||
* int BlockX = 16;
|
||||
* int BlockY = 16;
|
||||
* int BlockZ = 1;
|
||||
*
|
||||
* MemSize = 256*64*sizeof(int);
|
||||
* Context = polly_initContext();
|
||||
* DevArray = polly_allocateMemoryForDevice(MemSize);
|
||||
* Kernel = polly_getKernel(KernelString, KernelName);
|
||||
* polly_setKernelParameters(Kernel, BlockWidth, BlockHeight, DevData);
|
||||
* polly_launchKernel(Kernel, GridWidth, GridHeight);
|
||||
*
|
||||
* void *Params[1];
|
||||
* void *DevPtr = polly_getDevicePtr(DevArray);
|
||||
* Params[0] = &DevPtr;
|
||||
*
|
||||
* polly_launchKernel(Kernel, GridX, GridY, BlockX, BlockY, BlockZ, Params);
|
||||
*
|
||||
* polly_copyFromDeviceToHost(DevArray, HostData, MemSize);
|
||||
* polly_freeKernel(Kernel);
|
||||
* polly_freeDeviceMemory(DevArray);
|
||||
|
@ -80,10 +88,10 @@ void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
|
|||
long MemSize);
|
||||
void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
|
||||
long MemSize);
|
||||
void polly_setKernelParameters(PollyGPUFunction *Kernel, int BlockWidth,
|
||||
int BlockHeight, PollyGPUDevicePtr *DevData);
|
||||
void polly_launchKernel(PollyGPUFunction *Kernel, int GridWidth,
|
||||
int GridHeight);
|
||||
void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
|
||||
unsigned int GridDimY, unsigned int BlockSizeX,
|
||||
unsigned int BlockSizeY, unsigned int BlockSizeZ,
|
||||
void **Parameters);
|
||||
void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation);
|
||||
void polly_freeContext(PollyGPUContext *Context);
|
||||
#endif /* GPUJIT_H_ */
|
||||
|
|
Loading…
Reference in New Issue