GPGPU: Add basic support for kernel launches

llvm-svn: 276863
2016-07-27 13:20:16 +00:00 · 2016-07-27 13:20:16 +00:00 · 79a947c233
parent 499375ceaa
commit 79a947c233
5 changed files with 226 additions and 52 deletions
--- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp
+++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
@ -204,6 +204,29 @@ private:
  /// @returns A set of values referenced by the kernel.
  SetVector<Value *> getReferencesInKernel(ppcg_kernel *Kernel);

+  /// Compute the sizes of the execution grid for a given kernel.
+  ///
+  /// @param Kernel The kernel to compute grid sizes for.
+  ///
+  /// @returns A tuple with grid sizes for the X and Y dimensions.
+  std::tuple<Value *, Value *> getGridSizes(ppcg_kernel *Kernel);
+
+  /// Compute the sizes of the thread blocks for a given kernel.
+  ///
+  /// @param Kernel The kernel to compute thread block sizes for.
+  ///
+  /// @returns A tuple with thread block sizes for X, Y, and Z dimensions.
+  std::tuple<Value *, Value *, Value *> getBlockSizes(ppcg_kernel *Kernel);
+
+  /// Create kernel launch parameters.
+  ///
+  /// @param Kernel The kernel to create parameters for.
+  /// @param F      The kernel function that has been created.
+  ///
+  /// @returns A stack allocated array with pointers to the parameter
+  ///          values that are passed to the kernel.
+  Value *createLaunchParameters(ppcg_kernel *Kernel, Function *F);
+
  /// Create GPU kernel.
  ///
  /// Code generate the kernel described by @p KernelStmt.
@ -296,6 +319,13 @@ private:
  /// @returns A pointer to the newly initialized context.
  Value *createCallInitContext();

+  /// Create a call to get the device pointer for a kernel allocation.
+  ///
+  /// @param Allocation The Polly GPU allocation
+  ///
+  /// @returns The device parameter corresponding to this allocation.
+  Value *createCallGetDevicePtr(Value *Allocation);
+
  /// Create a call to free the GPU context.
  ///
  /// @param Context A pointer to an initialized GPU context.
@ -339,6 +369,21 @@ private:
  ///
  /// @param GPUKernel THe kernel to free.
  void createCallFreeKernel(Value *GPUKernel);
+
+  /// Create a call to launch a GPU kernel.
+  ///
+  /// @param GPUKernel  The kernel to launch.
+  /// @param GridDimX   The size of the first grid dimension.
+  /// @param GridDimY   The size of the second grid dimension.
+  /// @param BlockDimX  The size of the first block dimension.
+  /// @param BlockDimY  The size of the second block dimension.
+  /// @param BlockDimZ  The size of the third block dimension.
+  /// @param Parameters A pointer to an array that itself contains pointers to
+  ///                   the parameter values passed for each kernel argument.
+  void createCallLaunchKernel(Value *GPUKernel, Value *GridDimX,
+                              Value *GridDimY, Value *BlockDimX,
+                              Value *BlockDimY, Value *BlockDimZ,
+                              Value *Parameters);
 };

 void GPUNodeBuilder::initializeAfterRTH() {
@ -393,6 +438,50 @@ Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Entry) {
  return Builder.CreateCall(F, {Buffer, Entry});
 }

+/// Emit a call to the runtime function polly_getDevicePtr.
+///
+/// The callee is looked up in the module that contains the current insert
+/// point; if it is missing, it is declared there as `i8* (i8*)` with external
+/// linkage.
+///
+/// @param Allocation The value passed as the single call argument.
+///
+/// @returns The value produced by the emitted call.
+Value *GPUNodeBuilder::createCallGetDevicePtr(Value *Allocation) {
+  const char *Name = "polly_getDevicePtr";
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  Function *Callee = M->getFunction(Name);
+
+  if (!Callee) {
+    // First use in this module: add the external declaration.
+    Type *BytePtrTy = Builder.getInt8PtrTy();
+    std::vector<Type *> ParamTys(1, BytePtrTy);
+    FunctionType *FnTy = FunctionType::get(BytePtrTy, ParamTys, false);
+    Callee = Function::Create(FnTy, Function::ExternalLinkage, Name, M);
+  }
+
+  return Builder.CreateCall(Callee, {Allocation});
+}
+
+/// Emit a call to the runtime function polly_launchKernel.
+///
+/// The callee is looked up in the module that contains the current insert
+/// point; if it is missing, it is declared there as
+/// `void (i8*, i32, i32, i32, i32, i32, i8*)` with external linkage.
+///
+/// The arguments are forwarded to the call in the order they are listed in
+/// the signature.
+void GPUNodeBuilder::createCallLaunchKernel(Value *GPUKernel, Value *GridDimX,
+                                            Value *GridDimY, Value *BlockDimX,
+                                            Value *BlockDimY, Value *BlockDimZ,
+                                            Value *Parameters) {
+  const char *Name = "polly_launchKernel";
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  Function *Callee = M->getFunction(Name);
+
+  if (!Callee) {
+    // First use in this module: add the external declaration.
+    Type *BytePtrTy = Builder.getInt8PtrTy();
+    Type *Int32Ty = Builder.getInt32Ty();
+
+    // One i8* for the kernel, five i32 launch dimensions (two grid, three
+    // block), and one i8* for the parameter array.
+    std::vector<Type *> ParamTys;
+    ParamTys.push_back(BytePtrTy);
+    ParamTys.insert(ParamTys.end(), 5, Int32Ty);
+    ParamTys.push_back(BytePtrTy);
+
+    FunctionType *FnTy =
+        FunctionType::get(Builder.getVoidTy(), ParamTys, false);
+    Callee = Function::Create(FnTy, Function::ExternalLinkage, Name, M);
+  }
+
+  Builder.CreateCall(Callee, {GPUKernel, GridDimX, GridDimY, BlockDimX,
+                              BlockDimY, BlockDimZ, Parameters});
+}
+
 void GPUNodeBuilder::createCallFreeKernel(Value *GPUKernel) {
  const char *Name = "polly_freeKernel";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
@ -755,6 +844,77 @@ void GPUNodeBuilder::clearLoops(Function *F) {
  }
 }

+/// Materialize the grid sizes of @p Kernel as i32 values.
+///
+/// Each of the kernel's n_grid size expressions is turned into an AST
+/// expression, code generated through ExprBuilder and truncated to i32.
+/// Missing dimensions (up to three entries in total) are filled with the
+/// constant 1; only the first two entries are returned.
+std::tuple<Value *, Value *> GPUNodeBuilder::getGridSizes(ppcg_kernel *Kernel) {
+  std::vector<Value *> Dims;
+  isl_ast_build *Build = isl_ast_build_from_context(S.getContext());
+
+  for (long Dim = 0; Dim < Kernel->n_grid; ++Dim) {
+    isl_pw_aff *PwAff = isl_multi_pw_aff_get_pw_aff(Kernel->grid_size, Dim);
+    isl_ast_expr *Expr = isl_ast_build_expr_from_pw_aff(Build, PwAff);
+    Value *Wide = ExprBuilder.create(Expr);
+    Dims.push_back(Builder.CreateTrunc(Wide, Builder.getInt32Ty()));
+  }
+  isl_ast_build_free(Build);
+
+  // Pad the unused dimensions with a size of one.
+  for (long Dim = Kernel->n_grid; Dim < 3; ++Dim) {
+    Value *One = ConstantInt::get(Builder.getInt32Ty(), 1);
+    Dims.push_back(One);
+  }
+
+  Value *GridX = Dims[0];
+  Value *GridY = Dims[1];
+  return std::make_tuple(GridX, GridY);
+}
+
+/// Materialize the thread block sizes of @p Kernel as i32 constants.
+///
+/// The first n_block entries come from Kernel->block_dim; the remaining
+/// entries (up to three in total) are the constant 1.
+std::tuple<Value *, Value *, Value *>
+GPUNodeBuilder::getBlockSizes(ppcg_kernel *Kernel) {
+  IntegerType *Int32Ty = Builder.getInt32Ty();
+  std::vector<Value *> Dims;
+
+  for (long Dim = 0; Dim < Kernel->n_block; ++Dim)
+    Dims.push_back(ConstantInt::get(Int32Ty, Kernel->block_dim[Dim]));
+
+  // Pad the unused dimensions with a size of one.
+  for (long Dim = Kernel->n_block; Dim < 3; ++Dim)
+    Dims.push_back(ConstantInt::get(Int32Ty, 1));
+
+  Value *BlockX = Dims[0];
+  Value *BlockY = Dims[1];
+  Value *BlockZ = Dims[2];
+  return std::make_tuple(BlockX, BlockY, BlockZ);
+}
+
+/// Build the argument array that is handed to the kernel launch call.
+///
+/// For every program array that @p Kernel requires as an argument, the device
+/// pointer of its allocation is obtained via createCallGetDevicePtr and stored
+/// in a dedicated i8* stack slot. The address of that slot is then written to
+/// the next free entry of the parameter array.
+///
+/// @param Kernel The kernel to create parameters for.
+/// @param F      The kernel function that has been created.
+///
+/// @returns The stack allocated parameter array, cast to i8*.
+Value *GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel,
+                                              Function *F) {
+  // NOTE(review): F->getNumOperands() does not look like the kernel's argument
+  // count: the checks in test/GPGPU/double-parallel-loop.ll show a `[0 x i8*]`
+  // array here even though slot 0 is written. F->arg_size() is presumably the
+  // intended size -- confirm, and update the test expectations together with
+  // that change.
+  Type *ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), F->getNumOperands());
+
+  // The allocas and the final cast are inserted into the entry block of the
+  // function currently being built, in front of its terminator.
+  BasicBlock *EntryBlock =
+      &Builder.GetInsertBlock()->getParent()->getEntryBlock();
+  std::string Launch = "polly_launch_" + std::to_string(Kernel->id);
+  Instruction *Parameters =
+      new AllocaInst(ArrayTy, Launch + "_params", EntryBlock->getTerminator());
+
+  // Index counts the arrays actually passed to the kernel. It differs from the
+  // program array number i as soon as one array is skipped.
+  int Index = 0;
+  for (long i = 0; i < Prog->n_array; i++) {
+    if (!ppcg_kernel_requires_array_argument(Kernel, i))
+      continue;
+
+    isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
+    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id);
+
+    Value *DevArray = DeviceAllocations[const_cast<ScopArrayInfo *>(SAI)];
+    DevArray = createCallGetDevicePtr(DevArray);
+    Instruction *Param = new AllocaInst(
+        Builder.getInt8PtrTy(), Launch + "_param_" + std::to_string(Index),
+        EntryBlock->getTerminator());
+    Builder.CreateStore(DevArray, Param);
+
+    // Address the slot by Index rather than by i, so that the arguments stay
+    // densely packed in the order in which they are emitted.
+    Value *Slot = Builder.CreateGEP(
+        Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
+    Value *ParamTyped =
+        Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
+    Builder.CreateStore(ParamTyped, Slot);
+    Index++;
+  }
+
+  auto Location = EntryBlock->getTerminator();
+  return new BitCastInst(Parameters, Builder.getInt8PtrTy(),
+                         Launch + "_params_i8ptr", Location);
+}
+
 void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
  isl_id *Id = isl_ast_node_get_annotation(KernelStmt);
  ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id);
@ -805,11 +965,22 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
    S.invalidateScopArrayInfo(BasePtr, ScopArrayInfo::MK_Array);
  LocalArrays.clear();

+  Value *Parameters = createLaunchParameters(Kernel, F);
+
  std::string ASMString = finalizeKernelFunction();
  std::string Name = "kernel_" + std::to_string(Kernel->id);
  Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name);
  Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name");
  Value *GPUKernel = createCallGetKernel(KernelString, NameString);
+
+  Value *GridDimX, *GridDimY;
+  std::tie(GridDimX, GridDimY) = getGridSizes(Kernel);
+
+  Value *BlockDimX, *BlockDimY, *BlockDimZ;
+  std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel);
+
+  createCallLaunchKernel(GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
+                         BlockDimZ, Parameters);
  createCallFreeKernel(GPUKernel);
 }

--- a/polly/test/GPGPU/double-parallel-loop.ll
+++ b/polly/test/GPGPU/double-parallel-loop.ll
@ -96,7 +96,13 @@
 ; IR-NEXT:    %p_dev_array_MemRef_A = call i8* @polly_allocateMemoryForDevice(i64 4194304)
 ; IR-NEXT:    [[HostPtr:%.*]] = bitcast [1024 x float]* %A to i8*
 ; IR-NEXT:    call void @polly_copyFromHostToDevice(i8* [[HostPtr]], i8* %p_dev_array_MemRef_A, i64 4194304)
+; IR-NEXT:    [[DevPtr:%.*]]  = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A)
+; IR-NEXT:    store i8* [[DevPtr]], i8** %polly_launch_0_param_0
+; IR-NEXT:    [[ParamSlot:%.*]] = getelementptr [0 x i8*], [0 x i8*]* %polly_launch_0_params, i64 0, i64 0
+; IR-NEXT:    [[ParamTyped:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8*
+; IR-NEXT:    store i8* [[ParamTyped]], i8** [[ParamSlot]]
 ; IR-NEXT:    call i8* @polly_getKernel
+; IR-NEXT:    call void @polly_launchKernel(i8* %5, i32 32, i32 32, i32 32, i32 16, i32 1, i8* %polly_launch_0_params_i8ptr)
 ; IR-NEXT:    call void @polly_freeKernel
 ; IR-NEXT:    [[HostPtr2:%.*]] = bitcast [1024 x float]* %A to i8*
 ; IR-NEXT:    call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_A, i8* [[HostPtr2]], i64 4194304)
--- a/polly/test/GPGPU/host-control-flow.ll
+++ b/polly/test/GPGPU/host-control-flow.ll
@ -30,8 +30,10 @@

 ; IR-LABEL: polly.loop_header:                                ; preds = %polly.loop_header, %polly.loop_preheader
 ; IR-NEXT:   %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ]
-; IR-NEXT: call i8* @polly_getKernel
-; IR-NEXT: call void @polly_freeKernel
+; ...
+; IR: call i8* @polly_getKernel
+; ...
+; IR: call void @polly_freeKernel
 ; IR-NEXT:   %polly.indvar_next = add nsw i64 %polly.indvar, 1
 ; IR-NEXT:   %polly.loop_cond = icmp sle i64 %polly.indvar, 98
 ; IR-NEXT:   br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit
--- a/polly/tools/GPURuntime/GPUJIT.c
+++ b/polly/tools/GPURuntime/GPUJIT.c
@ -54,18 +54,12 @@ static void *HandleCudaRT;
 typedef CUresult CUDAAPI CuMemAllocFcnTy(CUdeviceptr *, size_t);
 static CuMemAllocFcnTy *CuMemAllocFcnPtr;

-typedef CUresult CUDAAPI CuFuncSetBlockShapeFcnTy(CUfunction, int, int, int);
-static CuFuncSetBlockShapeFcnTy *CuFuncSetBlockShapeFcnPtr;
-
-typedef CUresult CUDAAPI CuParamSetvFcnTy(CUfunction, int, void *,
-                                          unsigned int);
-static CuParamSetvFcnTy *CuParamSetvFcnPtr;
-
-typedef CUresult CUDAAPI CuParamSetSizeFcnTy(CUfunction, unsigned int);
-static CuParamSetSizeFcnTy *CuParamSetSizeFcnPtr;
-
-typedef CUresult CUDAAPI CuLaunchGridFcnTy(CUfunction, int, int);
-static CuLaunchGridFcnTy *CuLaunchGridFcnPtr;
+typedef CUresult CUDAAPI CuLaunchKernelFcnTy(
+    CUfunction f, unsigned int gridDimX, unsigned int gridDimY,
+    unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY,
+    unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream,
+    void **kernelParams, void **extra);
+static CuLaunchKernelFcnTy *CuLaunchKernelFcnPtr;

 typedef CUresult CUDAAPI CuMemcpyDtoHFcnTy(void *, CUdeviceptr, size_t);
 static CuMemcpyDtoHFcnTy *CuMemcpyDtoHFcnPtr;
@ -178,17 +172,8 @@ static int initialDeviceAPIs() {
   * of this kind of cast may not be emitted by clang and new versions of gcc
   * as it is valid on POSIX 2008.
   */
-  CuFuncSetBlockShapeFcnPtr = (CuFuncSetBlockShapeFcnTy *)getAPIHandle(
-      HandleCuda, "cuFuncSetBlockShape");
-
-  CuParamSetvFcnPtr =
-      (CuParamSetvFcnTy *)getAPIHandle(HandleCuda, "cuParamSetv");
-
-  CuParamSetSizeFcnPtr =
-      (CuParamSetSizeFcnTy *)getAPIHandle(HandleCuda, "cuParamSetSize");
-
-  CuLaunchGridFcnPtr =
-      (CuLaunchGridFcnTy *)getAPIHandle(HandleCuda, "cuLaunchGrid");
+  CuLaunchKernelFcnPtr =
+      (CuLaunchKernelFcnTy *)getAPIHandle(HandleCuda, "cuLaunchKernel");

  CuMemAllocFcnPtr =
      (CuMemAllocFcnTy *)getAPIHandle(HandleCuda, "cuMemAlloc_v2");
@ -407,29 +392,25 @@ void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
  }
 }

-void polly_setKernelParameters(PollyGPUFunction *Kernel, int BlockWidth,
-                               int BlockHeight, PollyGPUDevicePtr *DevData) {
+/* Launch Kernel through the dynamically loaded cuLaunchKernel entry point.
+ *
+ * The grid is GridDimX x GridDimY x 1 thread blocks and each block has
+ * BlockDimX x BlockDimY x BlockDimZ threads. Parameters is forwarded
+ * unchanged as the kernelParams argument. The launch is issued on stream 0
+ * without an "extra" option list.
+ *
+ * If the driver reports anything other than CUDA_SUCCESS, a message is
+ * printed and the process exits with status -1.
+ */
+void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
+                        unsigned int GridDimY, unsigned int BlockDimX,
+                        unsigned int BlockDimY, unsigned int BlockDimZ,
+                        void **Parameters) {
  dump_function();

-  int ParamOffset = 0;
+  unsigned GridDimZ = 1;
+  /* sharedMemBytes is a byte count, not a CUsharedconfig value: request no
+   * dynamic shared memory explicitly instead of borrowing an unrelated
+   * enumerator that merely happens to be zero. */
+  unsigned int SharedMemBytes = 0;
+  CUstream Stream = 0;
+  void **Extra = 0;

-  CuFuncSetBlockShapeFcnPtr(Kernel->Cuda, BlockWidth, BlockHeight, 1);
-  CuParamSetvFcnPtr(Kernel->Cuda, ParamOffset, &(DevData->Cuda),
-                    sizeof(DevData->Cuda));
-  ParamOffset += sizeof(DevData->Cuda);
-  CuParamSetSizeFcnPtr(Kernel->Cuda, ParamOffset);
-}
-
-void polly_launchKernel(PollyGPUFunction *Kernel, int GridWidth,
-                        int GridHeight) {
-  dump_function();
-
-  if (CuLaunchGridFcnPtr(Kernel->Cuda, GridWidth, GridHeight) != CUDA_SUCCESS) {
+  CUresult Res;
+  Res = CuLaunchKernelFcnPtr(Kernel->Cuda, GridDimX, GridDimY, GridDimZ,
+                             BlockDimX, BlockDimY, BlockDimZ, SharedMemBytes,
+                             Stream, Parameters, Extra);
+  if (Res != CUDA_SUCCESS) {
    fprintf(stdout, "Launching CUDA kernel failed.\n");
    exit(-1);
  }
-  CudaThreadSynchronizeFcnPtr();
-  debug_print("CUDA kernel launched.\n");
 }

 void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) {
@ -458,6 +439,12 @@ PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) {
  return DevData;
 }

+/* Return the Cuda member of Allocation, converted to a void pointer. */
+void *polly_getDevicePtr(PollyGPUDevicePtr *Allocation) {
+  void *DevicePtr;
+
+  dump_function();
+
+  DevicePtr = (void *)Allocation->Cuda;
+  return DevicePtr;
+}
+
 void polly_freeContext(PollyGPUContext *Context) {
  dump_function();

--- a/polly/tools/GPURuntime/GPUJIT.h
+++ b/polly/tools/GPURuntime/GPUJIT.h
@ -49,17 +49,25 @@
 *   PollyGPUDevicePtr *DevArray;
 *   int *HostData;
 *   int MemSize;
- *   int BlockWidth = 16;
- *   int BlockHeight = 16;
- *   int GridWidth = 8;
- *   int GridHeight = 8;
+ *
+ *   int GridX = 8;
+ *   int GridY = 8;
+ *
+ *   int BlockX = 16;
+ *   int BlockY = 16;
+ *   int BlockZ = 1;
 *
 *   MemSize = 256*64*sizeof(int);
 *   Context = polly_initContext();
 *   DevArray = polly_allocateMemoryForDevice(MemSize);
 *   Kernel = polly_getKernel(KernelString, KernelName);
- *   polly_setKernelParameters(Kernel, BlockWidth, BlockHeight, DevData);
- *   polly_launchKernel(Kernel, GridWidth, GridHeight);
+ *
+ *   void *Params[1];
+ *   void *DevPtr = polly_getDevicePtr(DevArray);
+ *   Params[0] = &DevPtr;
+ *
+ *   polly_launchKernel(Kernel, GridX, GridY, BlockX, BlockY, BlockZ, Params);
+ *
 *   polly_copyFromDeviceToHost(HostData, DevData, MemSize);
 *   polly_freeKernel(Kernel);
 *   polly_freeDeviceMemory(DevArray);
@ -80,10 +88,10 @@ void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
                                long MemSize);
 void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
                                long MemSize);
-void polly_setKernelParameters(PollyGPUFunction *Kernel, int BlockWidth,
-                               int BlockHeight, PollyGPUDevicePtr *DevData);
-void polly_launchKernel(PollyGPUFunction *Kernel, int GridWidth,
-                        int GridHeight);
+/* Launch Kernel on a GridDimX x GridDimY grid of thread blocks, each block
+ * having BlockDimX x BlockDimY x BlockDimZ threads. Parameters points to an
+ * array holding one pointer per kernel argument, each pointing to the value
+ * passed for that argument. */
+void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
+                        unsigned int GridDimY, unsigned int BlockDimX,
+                        unsigned int BlockDimY, unsigned int BlockDimZ,
+                        void **Parameters);
 void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation);
 void polly_freeContext(PollyGPUContext *Context);
 #endif /* GPUJIT_H_ */