[Polly] Added OpenCL Runtime to GPURuntime Library for GPGPU CodeGen

Summary: When compiling for a GPU, one can now choose between the OpenCL and CUDA runtimes with the -polly-gpu-runtime flag (libopencl / libcudart). The GPURuntime library (GPUJIT) has been extended with an OpenCL runtime for that purpose; the generated code selects the library calls that match the runtime chosen at compile time by emitting a runtime-specific initialization call (polly_initContextCL or polly_initContextCUDA). Additionally, a specific GPU target architecture can now be chosen with -polly-gpu-arch (only nvptx64 is implemented thus far).

Reviewers: grosser, bollu, Meinersbur, etherzhhb, singam-sanjay
Reviewed By: grosser, Meinersbur
Subscribers: singam-sanjay, llvm-commits, pollydev, nemanjai, mgorny, yaxunl, Anastasia
Tags: #polly
Differential Revision: https://reviews.llvm.org/D32431
llvm-svn: 302379
2017-05-07 21:03:46 +00:00 · 2017-05-07 21:03:46 +00:00 · 17f01968f1
parent 2d1c6d6e8d
commit 17f01968f1
10 changed files with 1405 additions and 133 deletions
--- a/polly/CMakeLists.txt
+++ b/polly/CMakeLists.txt
@ -152,9 +152,10 @@ SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)

 option(POLLY_ENABLE_GPGPU_CODEGEN "Enable GPGPU code generation feature" OFF)
 if (POLLY_ENABLE_GPGPU_CODEGEN)
-  # Do not require CUDA, as GPU code generation test cases can be run without
-  # a cuda library.
+  # Do not require CUDA/OpenCL, as GPU code generation test cases can be run
+  # without a CUDA/OpenCL library.
  FIND_PACKAGE(CUDA)
+  FIND_PACKAGE(OpenCL)
  set(GPU_CODEGEN TRUE)
 else(POLLY_ENABLE_GPGPU_CODEGEN)
  set(GPU_CODEGEN FALSE)
@ -163,8 +164,13 @@ endif(POLLY_ENABLE_GPGPU_CODEGEN)

 # Support GPGPU code generation if the library is available.
 if (CUDALIB_FOUND)
+  add_definitions(-DHAS_LIBCUDART)
  INCLUDE_DIRECTORIES( ${CUDALIB_INCLUDE_DIR} )
 endif(CUDALIB_FOUND)
+if (OpenCL_FOUND)
+  add_definitions(-DHAS_LIBOPENCL)
+  INCLUDE_DIRECTORIES( ${OpenCL_INCLUDE_DIR} )
+endif(OpenCL_FOUND)

 option(POLLY_BUNDLED_ISL "Use the bundled version of libisl included in Polly" ON)
 if (NOT POLLY_BUNDLED_ISL)
--- a/polly/include/polly/CodeGen/PPCGCodeGeneration.h
+++ b/polly/include/polly/CodeGen/PPCGCodeGeneration.h
@ -0,0 +1,24 @@
+//===--- polly/PPCGCodeGeneration.h - Polly Accelerator Code Generation. --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Take a scop created by ScopInfo and map it to GPU code using the ppcg
+// GPU mapping strategy.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POLLY_PPCGCODEGENERATION_H
+#define POLLY_PPCGCODEGENERATION_H
+
+/// The GPU Architecture to target.
+enum GPUArch { NVPTX64 };
+
+/// The GPU Runtime implementation to use.
+enum GPURuntime { CUDA, OpenCL };
+
+#endif // POLLY_PPCGCODEGENERATION_H
--- a/polly/include/polly/LinkAllPasses.h
+++ b/polly/include/polly/LinkAllPasses.h
@ -15,6 +15,7 @@
 #ifndef POLLY_LINKALLPASSES_H
 #define POLLY_LINKALLPASSES_H

+#include "polly/CodeGen/PPCGCodeGeneration.h"
 #include "polly/Config/config.h"
 #include "polly/PruneUnprofitable.h"
 #include "polly/Simplify.h"
@ -48,7 +49,8 @@ llvm::Pass *createScopInfoWrapperPassPass();
 llvm::Pass *createIslAstInfoPass();
 llvm::Pass *createCodeGenerationPass();
 #ifdef GPU_CODEGEN
-llvm::Pass *createPPCGCodeGenerationPass();
+llvm::Pass *createPPCGCodeGenerationPass(GPUArch Arch = GPUArch::NVPTX64,
+                                         GPURuntime Runtime = GPURuntime::CUDA);
 #endif
 llvm::Pass *createIslScheduleOptimizerPass();
 llvm::Pass *createFlattenSchedulePass();
--- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp
+++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
@ -12,6 +12,7 @@
 //
 //===----------------------------------------------------------------------===//

+#include "polly/CodeGen/PPCGCodeGeneration.h"
 #include "polly/CodeGen/IslAst.h"
 #include "polly/CodeGen/IslNodeBuilder.h"
 #include "polly/CodeGen/Utils.h"
@ -153,9 +154,9 @@ public:
  GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator,
                 const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE,
                 DominatorTree &DT, Scop &S, BasicBlock *StartBlock,
-                 gpu_prog *Prog)
+                 gpu_prog *Prog, GPURuntime Runtime, GPUArch Arch)
      : IslNodeBuilder(Builder, Annotator, DL, LI, SE, DT, S, StartBlock),
-        Prog(Prog) {
+        Prog(Prog), Runtime(Runtime), Arch(Arch) {
    getExprBuilder().setIDToSAI(&IDToSAI);
  }

@ -201,6 +202,12 @@ private:
  /// The GPU program we generate code for.
  gpu_prog *Prog;

+  /// The GPU Runtime implementation to use (OpenCL or CUDA).
+  GPURuntime Runtime;
+
+  /// The GPU Architecture to target.
+  GPUArch Arch;
+
  /// Class to free isl_ids.
  class IslIdDeleter {
  public:
@ -752,7 +759,17 @@ void GPUNodeBuilder::createCallSynchronizeDevice() {
 }

 Value *GPUNodeBuilder::createCallInitContext() {
-  const char *Name = "polly_initContext";
+  const char *Name;
+
+  switch (Runtime) {
+  case GPURuntime::CUDA:
+    Name = "polly_initContextCUDA";
+    break;
+  case GPURuntime::OpenCL:
+    Name = "polly_initContextCL";
+    break;
+  }
+
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

@ -1028,7 +1045,15 @@ void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr,

 void GPUNodeBuilder::createKernelSync() {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
-  auto *Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
+
+  Function *Sync;
+
+  switch (Arch) {
+  case GPUArch::NVPTX64:
+    Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
+    break;
+  }
+
  Builder.CreateCall(Sync, {});
 }

@ -1434,7 +1459,12 @@ GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
  auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
  auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
                              GPUModule.get());
-  FN->setCallingConv(CallingConv::PTX_Kernel);
+
+  switch (Arch) {
+  case GPUArch::NVPTX64:
+    FN->setCallingConv(CallingConv::PTX_Kernel);
+    break;
+  }

  auto Arg = FN->arg_begin();
  for (long i = 0; i < Kernel->n_array; i++) {
@ -1495,12 +1525,19 @@ GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
 }

 void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) {
-  Intrinsic::ID IntrinsicsBID[] = {Intrinsic::nvvm_read_ptx_sreg_ctaid_x,
-                                   Intrinsic::nvvm_read_ptx_sreg_ctaid_y};
+  Intrinsic::ID IntrinsicsBID[2];
+  Intrinsic::ID IntrinsicsTID[3];

-  Intrinsic::ID IntrinsicsTID[] = {Intrinsic::nvvm_read_ptx_sreg_tid_x,
-                                   Intrinsic::nvvm_read_ptx_sreg_tid_y,
-                                   Intrinsic::nvvm_read_ptx_sreg_tid_z};
+  switch (Arch) {
+  case GPUArch::NVPTX64:
+    IntrinsicsBID[0] = Intrinsic::nvvm_read_ptx_sreg_ctaid_x;
+    IntrinsicsBID[1] = Intrinsic::nvvm_read_ptx_sreg_ctaid_y;
+
+    IntrinsicsTID[0] = Intrinsic::nvvm_read_ptx_sreg_tid_x;
+    IntrinsicsTID[1] = Intrinsic::nvvm_read_ptx_sreg_tid_y;
+    IntrinsicsTID[2] = Intrinsic::nvvm_read_ptx_sreg_tid_z;
+    break;
+  }

  auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable {
    std::string Name = isl_id_get_name(Id);
@ -1649,11 +1686,18 @@ void GPUNodeBuilder::createKernelVariables(ppcg_kernel *Kernel, Function *FN) {

 void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel,
                                          SetVector<Value *> &SubtreeValues) {
-
  std::string Identifier = "kernel_" + std::to_string(Kernel->id);
  GPUModule.reset(new Module(Identifier, Builder.getContext()));
-  GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
-  GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
+
+  switch (Arch) {
+  case GPUArch::NVPTX64:
+    if (Runtime == GPURuntime::CUDA)
+      GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
+    else if (Runtime == GPURuntime::OpenCL)
+      GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl"));
+    GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
+    break;
+  }

  Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues);

@ -1674,7 +1718,21 @@ void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel,
 }

 std::string GPUNodeBuilder::createKernelASM() {
-  llvm::Triple GPUTriple(Triple::normalize("nvptx64-nvidia-cuda"));
+  llvm::Triple GPUTriple;
+
+  switch (Arch) {
+  case GPUArch::NVPTX64:
+    switch (Runtime) {
+    case GPURuntime::CUDA:
+      GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-cuda"));
+      break;
+    case GPURuntime::OpenCL:
+      GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-nvcl"));
+      break;
+    }
+    break;
+  }
+
  std::string ErrMsg;
  auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg);

@ -1685,9 +1743,17 @@ std::string GPUNodeBuilder::createKernelASM() {

  TargetOptions Options;
  Options.UnsafeFPMath = FastMath;
-  std::unique_ptr<TargetMachine> TargetM(
-      GPUTarget->createTargetMachine(GPUTriple.getTriple(), CudaVersion, "",
-                                     Options, Optional<Reloc::Model>()));
+
+  std::string subtarget;
+
+  switch (Arch) {
+  case GPUArch::NVPTX64:
+    subtarget = CudaVersion;
+    break;
+  }
+
+  std::unique_ptr<TargetMachine> TargetM(GPUTarget->createTargetMachine(
+      GPUTriple.getTriple(), subtarget, "", Options, Optional<Reloc::Model>()));

  SmallString<0> ASMString;
  raw_svector_ostream ASMStream(ASMString);
@ -1739,6 +1805,10 @@ class PPCGCodeGeneration : public ScopPass {
 public:
  static char ID;

+  GPURuntime Runtime = GPURuntime::CUDA;
+
+  GPUArch Architecture = GPUArch::NVPTX64;
+
  /// The scop that is currently processed.
  Scop *S;

@ -2522,7 +2592,7 @@ public:
        executeScopConditionally(*S, Builder.getTrue(), *DT, *RI, *LI);

    GPUNodeBuilder NodeBuilder(Builder, Annotator, *DL, *LI, *SE, *DT, *S,
-                               StartBlock, Prog);
+                               StartBlock, Prog, Runtime, Architecture);

    // TODO: Handle LICM
    auto SplitBlock = StartBlock->getSinglePredecessor();
@ -2610,7 +2680,12 @@ public:

 char PPCGCodeGeneration::ID = 1;

-Pass *polly::createPPCGCodeGenerationPass() { return new PPCGCodeGeneration(); }
+Pass *polly::createPPCGCodeGenerationPass(GPUArch Arch, GPURuntime Runtime) {
+  PPCGCodeGeneration *generator = new PPCGCodeGeneration();
+  generator->Runtime = Runtime;
+  generator->Architecture = Arch;
+  return generator;
+}

 INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg",
                      "Polly - Apply PPCG translation to SCOP", false, false)
--- a/polly/lib/Support/RegisterPasses.cpp
+++ b/polly/lib/Support/RegisterPasses.cpp
@ -23,6 +23,7 @@
 #include "polly/Canonicalization.h"
 #include "polly/CodeGen/CodeGeneration.h"
 #include "polly/CodeGen/CodegenCleanup.h"
+#include "polly/CodeGen/PPCGCodeGeneration.h"
 #include "polly/DeLICM.h"
 #include "polly/DependenceInfo.h"
 #include "polly/FlattenSchedule.h"
@ -101,6 +102,23 @@ static cl::opt<TargetChoice>
                          ),
           cl::init(TARGET_CPU), cl::ZeroOrMore, cl::cat(PollyCategory));

+#ifdef GPU_CODEGEN
+static cl::opt<GPURuntime> GPURuntimeChoice(
+    "polly-gpu-runtime", cl::desc("The GPU Runtime API to target"),
+    cl::values(clEnumValN(GPURuntime::CUDA, "libcudart",
+                          "use the CUDA Runtime API"),
+               clEnumValN(GPURuntime::OpenCL, "libopencl",
+                          "use the OpenCL Runtime API")),
+    cl::init(GPURuntime::CUDA), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+static cl::opt<GPUArch>
+    GPUArchChoice("polly-gpu-arch", cl::desc("The GPU Architecture to target"),
+                  cl::values(clEnumValN(GPUArch::NVPTX64, "nvptx64",
+                                        "target NVIDIA 64-bit architecture")),
+                  cl::init(GPUArch::NVPTX64), cl::ZeroOrMore,
+                  cl::cat(PollyCategory));
+#endif
+
 VectorizerChoice polly::PollyVectorizerChoice;
 static cl::opt<polly::VectorizerChoice, true> Vectorizer(
    "polly-vectorizer", cl::desc("Select the vectorization strategy"),
@ -309,7 +327,8 @@ void registerPollyPasses(llvm::legacy::PassManagerBase &PM) {

  if (Target == TARGET_GPU) {
 #ifdef GPU_CODEGEN
-    PM.add(polly::createPPCGCodeGenerationPass());
+    PM.add(
+        polly::createPPCGCodeGenerationPass(GPUArchChoice, GPURuntimeChoice));
 #endif
  } else {
    switch (CodeGeneration) {
--- a/polly/test/GPGPU/cuda-managed-memory-simple.ll
+++ b/polly/test/GPGPU/cuda-managed-memory-simple.ll
@ -35,7 +35,7 @@
 ; CHECK-NOT: polly_freeDeviceMemory
 ; CHECK-NOT: polly_allocateMemoryForDevice

-; CHECK:       %13 = call i8* @polly_initContext()
+; CHECK:       %13 = call i8* @polly_initContextCUDA()
 ; CHECK-NEXT:  %14 = bitcast i32* %A to i8*
 ; CHECK-NEXT:  %15 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
 ; CHECK-NEXT:  store i8* %14, i8** %polly_launch_0_param_0
@ -46,7 +46,7 @@
 ; CHECK-NEXT:  store i8* %17, i8** %polly_launch_0_param_1
 ; CHECK-NEXT:  %19 = bitcast i8** %polly_launch_0_param_1 to i8*
 ; CHECK-NEXT:  store i8* %19, i8** %18
-; CHECK-NEXT: %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([750 x i8], [750 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0))
+; CHECK-NEXT:  %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([750 x i8], [750 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0))
 ; CHECK-NEXT:  call void @polly_launchKernel(i8* %20, i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
 ; CHECK-NEXT:  call void @polly_freeKernel(i8* %20)
 ; CHECK-NEXT:  call void @polly_synchronizeDevice()
--- a/polly/test/GPGPU/size-cast.ll
+++ b/polly/test/GPGPU/size-cast.ll
@ -29,7 +29,7 @@
 ; CODE-NEXT:   if (arg >= 32 * b0 + t0 + 1048576 * c0 + 1)
 ; CODE-NEXT:     Stmt_bb6(0, 32 * b0 + t0 + 1048576 * c0);

-; IR:        call i8* @polly_initContext()
+; IR:        call i8* @polly_initContextCUDA()
 ; IR-NEXT:   sext i32 %arg to i64
 ; IR-NEXT:   mul i64
 ; IR-NEXT:   @polly_allocateMemoryForDevice
--- a/polly/tools/CMakeLists.txt
+++ b/polly/tools/CMakeLists.txt
@ -1,5 +1,5 @@
-if (CUDALIB_FOUND)
+if (CUDALIB_FOUND OR OpenCL_FOUND)
  add_subdirectory(GPURuntime)
-endif (CUDALIB_FOUND)
+endif (CUDALIB_FOUND OR OpenCL_FOUND)

 set(LLVM_COMMON_DEPENDS ${LLVM_COMMON_DEPENDS} PARENT_SCOPE)
--- a/polly/tools/GPURuntime/GPUJIT.c
+++ b/polly/tools/GPURuntime/GPUJIT.c
--- a/polly/tools/GPURuntime/GPUJIT.h
+++ b/polly/tools/GPURuntime/GPUJIT.h
@ -76,12 +76,27 @@
 *
 */

+typedef enum PollyGPURuntimeT {
+  RUNTIME_NONE,
+  RUNTIME_CUDA,
+  RUNTIME_CL
+} PollyGPURuntime;
+
 typedef struct PollyGPUContextT PollyGPUContext;
 typedef struct PollyGPUFunctionT PollyGPUFunction;
 typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr;

-PollyGPUContext *polly_initContext();
-PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
+typedef struct OpenCLContextT OpenCLContext;
+typedef struct OpenCLKernelT OpenCLKernel;
+typedef struct OpenCLDevicePtrT OpenCLDevicePtr;
+
+typedef struct CUDAContextT CUDAContext;
+typedef struct CUDAKernelT CUDAKernel;
+typedef struct CUDADevicePtrT CUDADevicePtr;
+
+PollyGPUContext *polly_initContextCUDA();
+PollyGPUContext *polly_initContextCL();
+PollyGPUFunction *polly_getKernel(const char *BinaryBuffer,
                                  const char *KernelName);
 void polly_freeKernel(PollyGPUFunction *Kernel);
 void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,