Revert "[Polly] Added OpenCL Runtime to GPURuntime Library for GPGPU CodeGen"
This reverts commit 17a84e414adb51ee375d14836d4c2a817b191933.

Patches should have been submitted in the order of:
1. D32852
2. D32854
3. D32431

I mistakenly pushed D32431 (3) first. Reverting to push in the correct order.

llvm-svn: 302217
parent 2b0fae877e
commit c1267b9baa

@@ -152,10 +152,9 @@ SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
 option(POLLY_ENABLE_GPGPU_CODEGEN "Enable GPGPU code generation feature" OFF)
 if (POLLY_ENABLE_GPGPU_CODEGEN)
-  # Do not require CUDA/OpenCL, as GPU code generation test cases can be run
-  # without a CUDA/OpenCL library.
+  # Do not require CUDA, as GPU code generation test cases can be run without
+  # a cuda library.
   FIND_PACKAGE(CUDA)
-  FIND_PACKAGE(OpenCL)
   set(GPU_CODEGEN TRUE)
 else(POLLY_ENABLE_GPGPU_CODEGEN)
   set(GPU_CODEGEN FALSE)
@@ -164,13 +163,8 @@ endif(POLLY_ENABLE_GPGPU_CODEGEN)
 # Support GPGPU code generation if the library is available.
 if (CUDALIB_FOUND)
   add_definitions(-DHAS_LIBCUDART)
   INCLUDE_DIRECTORIES( ${CUDALIB_INCLUDE_DIR} )
 endif(CUDALIB_FOUND)
-if (OpenCL_FOUND)
-  add_definitions(-DHAS_LIBOPENCL)
-  INCLUDE_DIRECTORIES( ${OpenCL_INCLUDE_DIR} )
-endif(OpenCL_FOUND)
-
 
 option(POLLY_BUNDLED_ISL "Use the bundled version of libisl included in Polly" ON)
 if (NOT POLLY_BUNDLED_ISL)
@@ -1,24 +0,0 @@
-//===--- polly/PPCGCodeGeneration.h - Polly Accelerator Code Generation. --===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Take a scop created by ScopInfo and map it to GPU code using the ppcg
-// GPU mapping strategy.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef POLLY_PPCGCODEGENERATION_H
-#define POLLY_PPCGCODEGENERATION_H
-
-/// The GPU Architecture to target.
-enum GPUArch { NVPTX64 };
-
-/// The GPU Runtime implementation to use.
-enum GPURuntime { CUDA, OpenCL };
-
-#endif // POLLY_PPCGCODEGENERATION_H
@@ -15,7 +15,6 @@
 #ifndef POLLY_LINKALLPASSES_H
 #define POLLY_LINKALLPASSES_H
 
-#include "polly/CodeGen/PPCGCodeGeneration.h"
 #include "polly/Config/config.h"
 #include "polly/PruneUnprofitable.h"
 #include "polly/Simplify.h"
@@ -49,8 +48,7 @@ llvm::Pass *createScopInfoWrapperPassPass();
 llvm::Pass *createIslAstInfoPass();
 llvm::Pass *createCodeGenerationPass();
 #ifdef GPU_CODEGEN
-llvm::Pass *createPPCGCodeGenerationPass(GPUArch Arch = GPUArch::NVPTX64,
-                                         GPURuntime Runtime = GPURuntime::CUDA);
+llvm::Pass *createPPCGCodeGenerationPass();
 #endif
 llvm::Pass *createIslScheduleOptimizerPass();
 llvm::Pass *createFlattenSchedulePass();
@@ -12,7 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "polly/CodeGen/PPCGCodeGeneration.h"
 #include "polly/CodeGen/IslAst.h"
 #include "polly/CodeGen/IslNodeBuilder.h"
 #include "polly/CodeGen/Utils.h"
@@ -154,9 +153,9 @@ public:
   GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator,
                  const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE,
                  DominatorTree &DT, Scop &S, BasicBlock *StartBlock,
-                 gpu_prog *Prog, GPURuntime Runtime, GPUArch Arch)
+                 gpu_prog *Prog)
       : IslNodeBuilder(Builder, Annotator, DL, LI, SE, DT, S, StartBlock),
-        Prog(Prog), Runtime(Runtime), Arch(Arch) {
+        Prog(Prog) {
     getExprBuilder().setIDToSAI(&IDToSAI);
   }
 
@@ -202,12 +201,6 @@ private:
   /// The GPU program we generate code for.
   gpu_prog *Prog;
 
-  /// The GPU Runtime implementation to use (OpenCL or CUDA).
-  GPURuntime Runtime;
-
-  /// The GPU Architecture to target.
-  GPUArch Arch;
-
   /// Class to free isl_ids.
   class IslIdDeleter {
   public:
@@ -759,17 +752,7 @@ void GPUNodeBuilder::createCallSynchronizeDevice() {
 }
 
 Value *GPUNodeBuilder::createCallInitContext() {
-  const char *Name;
-
-  switch (Runtime) {
-  case GPURuntime::CUDA:
-    Name = "polly_initContextCUDA";
-    break;
-  case GPURuntime::OpenCL:
-    Name = "polly_initContextCL";
-    break;
-  }
-
+  const char *Name = "polly_initContext";
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   Function *F = M->getFunction(Name);
 
@@ -1045,15 +1028,7 @@ void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr,
 
 void GPUNodeBuilder::createKernelSync() {
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
-
-  Function *Sync;
-
-  switch (Arch) {
-  case GPUArch::NVPTX64:
-    Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
-    break;
-  }
-
+  auto *Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
   Builder.CreateCall(Sync, {});
 }
 
@@ -1459,12 +1434,7 @@ GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
   auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
   auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
                               GPUModule.get());
-
-  switch (Arch) {
-  case GPUArch::NVPTX64:
-    FN->setCallingConv(CallingConv::PTX_Kernel);
-    break;
-  }
+  FN->setCallingConv(CallingConv::PTX_Kernel);
 
   auto Arg = FN->arg_begin();
   for (long i = 0; i < Kernel->n_array; i++) {
@@ -1525,19 +1495,12 @@ GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
 }
 
 void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) {
-  Intrinsic::ID IntrinsicsBID[2];
-  Intrinsic::ID IntrinsicsTID[3];
+  Intrinsic::ID IntrinsicsBID[] = {Intrinsic::nvvm_read_ptx_sreg_ctaid_x,
+                                   Intrinsic::nvvm_read_ptx_sreg_ctaid_y};
 
-  switch (Arch) {
-  case GPUArch::NVPTX64:
-    IntrinsicsBID[0] = Intrinsic::nvvm_read_ptx_sreg_ctaid_x;
-    IntrinsicsBID[1] = Intrinsic::nvvm_read_ptx_sreg_ctaid_y;
-
-    IntrinsicsTID[0] = Intrinsic::nvvm_read_ptx_sreg_tid_x;
-    IntrinsicsTID[1] = Intrinsic::nvvm_read_ptx_sreg_tid_y;
-    IntrinsicsTID[2] = Intrinsic::nvvm_read_ptx_sreg_tid_z;
-    break;
-  }
+  Intrinsic::ID IntrinsicsTID[] = {Intrinsic::nvvm_read_ptx_sreg_tid_x,
+                                   Intrinsic::nvvm_read_ptx_sreg_tid_y,
+                                   Intrinsic::nvvm_read_ptx_sreg_tid_z};
 
   auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable {
     std::string Name = isl_id_get_name(Id);
@@ -1686,18 +1649,11 @@ void GPUNodeBuilder::createKernelVariables(ppcg_kernel *Kernel, Function *FN) {
 
 void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel,
                                           SetVector<Value *> &SubtreeValues) {
 
   std::string Identifier = "kernel_" + std::to_string(Kernel->id);
   GPUModule.reset(new Module(Identifier, Builder.getContext()));
-
-  switch (Arch) {
-  case GPUArch::NVPTX64:
-    if (Runtime == GPURuntime::CUDA)
-      GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
-    else if (Runtime == GPURuntime::OpenCL)
-      GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl"));
-    GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
-    break;
-  }
+  GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
+  GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
 
   Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues);
 
@@ -1718,21 +1674,7 @@ void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel,
 }
 
 std::string GPUNodeBuilder::createKernelASM() {
-  llvm::Triple GPUTriple;
-
-  switch (Arch) {
-  case GPUArch::NVPTX64:
-    switch (Runtime) {
-    case GPURuntime::CUDA:
-      GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-cuda"));
-      break;
-    case GPURuntime::OpenCL:
-      GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-nvcl"));
-      break;
-    }
-    break;
-  }
-
+  llvm::Triple GPUTriple(Triple::normalize("nvptx64-nvidia-cuda"));
   std::string ErrMsg;
   auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg);
 
@@ -1743,17 +1685,9 @@ std::string GPUNodeBuilder::createKernelASM() {
 
   TargetOptions Options;
   Options.UnsafeFPMath = FastMath;
-
-  std::string subtarget;
-
-  switch (Arch) {
-  case GPUArch::NVPTX64:
-    subtarget = CudaVersion;
-    break;
-  }
-
-  std::unique_ptr<TargetMachine> TargetM(GPUTarget->createTargetMachine(
-      GPUTriple.getTriple(), subtarget, "", Options, Optional<Reloc::Model>()));
+  std::unique_ptr<TargetMachine> TargetM(
+      GPUTarget->createTargetMachine(GPUTriple.getTriple(), CudaVersion, "",
+                                     Options, Optional<Reloc::Model>()));
 
   SmallString<0> ASMString;
   raw_svector_ostream ASMStream(ASMString);
@@ -1805,10 +1739,6 @@ class PPCGCodeGeneration : public ScopPass {
 public:
   static char ID;
 
-  GPURuntime Runtime = GPURuntime::CUDA;
-
-  GPUArch Architecture = GPUArch::NVPTX64;
-
   /// The scop that is currently processed.
   Scop *S;
 
@@ -2592,7 +2522,7 @@ public:
     executeScopConditionally(*S, Builder.getTrue(), *DT, *RI, *LI);
 
     GPUNodeBuilder NodeBuilder(Builder, Annotator, *DL, *LI, *SE, *DT, *S,
-                               StartBlock, Prog, Runtime, Architecture);
+                               StartBlock, Prog);
 
     // TODO: Handle LICM
     auto SplitBlock = StartBlock->getSinglePredecessor();
@@ -2680,12 +2610,7 @@ public:
 
 char PPCGCodeGeneration::ID = 1;
 
-Pass *polly::createPPCGCodeGenerationPass(GPUArch Arch, GPURuntime Runtime) {
-  PPCGCodeGeneration *generator = new PPCGCodeGeneration();
-  generator->Runtime = Runtime;
-  generator->Architecture = Arch;
-  return generator;
-}
+Pass *polly::createPPCGCodeGenerationPass() { return new PPCGCodeGeneration(); }
 
 INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg",
                       "Polly - Apply PPCG translation to SCOP", false, false)
@@ -23,7 +23,6 @@
 #include "polly/Canonicalization.h"
 #include "polly/CodeGen/CodeGeneration.h"
 #include "polly/CodeGen/CodegenCleanup.h"
-#include "polly/CodeGen/PPCGCodeGeneration.h"
 #include "polly/DeLICM.h"
 #include "polly/DependenceInfo.h"
 #include "polly/FlattenSchedule.h"
@@ -102,23 +101,6 @@ static cl::opt<TargetChoice>
                           ),
            cl::init(TARGET_CPU), cl::ZeroOrMore, cl::cat(PollyCategory));
 
-#ifdef GPU_CODEGEN
-static cl::opt<GPURuntime> GPURuntimeChoice(
-    "polly-gpu-runtime", cl::desc("The GPU Runtime API to target"),
-    cl::values(clEnumValN(GPURuntime::CUDA, "libcudart",
-                          "use the CUDA Runtime API"),
-               clEnumValN(GPURuntime::OpenCL, "libopencl",
-                          "use the OpenCL Runtime API")),
-    cl::init(GPURuntime::CUDA), cl::ZeroOrMore, cl::cat(PollyCategory));
-
-static cl::opt<GPUArch>
-    GPUArchChoice("polly-gpu-arch", cl::desc("The GPU Architecture to target"),
-                  cl::values(clEnumValN(GPUArch::NVPTX64, "nvptx64",
-                                        "target NVIDIA 64-bit architecture")),
-                  cl::init(GPUArch::NVPTX64), cl::ZeroOrMore,
-                  cl::cat(PollyCategory));
-#endif
-
 VectorizerChoice polly::PollyVectorizerChoice;
 static cl::opt<polly::VectorizerChoice, true> Vectorizer(
     "polly-vectorizer", cl::desc("Select the vectorization strategy"),
@@ -327,8 +309,7 @@ void registerPollyPasses(llvm::legacy::PassManagerBase &PM) {
 
   if (Target == TARGET_GPU) {
 #ifdef GPU_CODEGEN
-    PM.add(
-        polly::createPPCGCodeGenerationPass(GPUArchChoice, GPURuntimeChoice));
+    PM.add(polly::createPPCGCodeGenerationPass());
 #endif
   } else {
     switch (CodeGeneration) {
@@ -35,7 +35,7 @@
 ; CHECK-NOT: polly_freeDeviceMemory
 ; CHECK-NOT: polly_allocateMemoryForDevice
 
-; CHECK: %13 = call i8* @polly_initContextCUDA()
+; CHECK: %13 = call i8* @polly_initContext()
 ; CHECK-NEXT: %14 = bitcast i32* %A to i8*
 ; CHECK-NEXT: %15 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
 ; CHECK-NEXT: store i8* %14, i8** %polly_launch_0_param_0
@@ -46,7 +46,7 @@
 ; CHECK-NEXT: store i8* %17, i8** %polly_launch_0_param_1
 ; CHECK-NEXT: %19 = bitcast i8** %polly_launch_0_param_1 to i8*
 ; CHECK-NEXT: store i8* %19, i8** %18
-; CHECK-NEXT: %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([750 x i8], [750 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0))
+; CHECK-NEXT: %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([750 x i8], [750 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0))
 ; CHECK-NEXT: call void @polly_launchKernel(i8* %20, i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
 ; CHECK-NEXT: call void @polly_freeKernel(i8* %20)
 ; CHECK-NEXT: call void @polly_synchronizeDevice()
@@ -29,7 +29,7 @@
 ; CODE-NEXT: if (arg >= 32 * b0 + t0 + 1048576 * c0 + 1)
 ; CODE-NEXT: Stmt_bb6(0, 32 * b0 + t0 + 1048576 * c0);
 
-; IR: call i8* @polly_initContextCUDA()
+; IR: call i8* @polly_initContext()
 ; IR-NEXT: sext i32 %arg to i64
 ; IR-NEXT: mul i64
 ; IR-NEXT: @polly_allocateMemoryForDevice
@@ -1,5 +1,5 @@
-if (CUDALIB_FOUND OR OpenCL_FOUND)
+if (CUDALIB_FOUND)
   add_subdirectory(GPURuntime)
-endif (CUDALIB_FOUND OR OpenCL_FOUND)
+endif (CUDALIB_FOUND)
 
 set(LLVM_COMMON_DEPENDS ${LLVM_COMMON_DEPENDS} PARENT_SCOPE)
(File diff suppressed because it is too large.)
@@ -76,27 +76,12 @@
  *
  */
 
-typedef enum PollyGPURuntimeT {
-  RUNTIME_NONE,
-  RUNTIME_CUDA,
-  RUNTIME_CL
-} PollyGPURuntime;
-
 typedef struct PollyGPUContextT PollyGPUContext;
 typedef struct PollyGPUFunctionT PollyGPUFunction;
 typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr;
 
-typedef struct OpenCLContextT OpenCLContext;
-typedef struct OpenCLKernelT OpenCLKernel;
-typedef struct OpenCLDevicePtrT OpenCLDevicePtr;
-
-typedef struct CUDAContextT CUDAContext;
-typedef struct CUDAKernelT CUDAKernel;
-typedef struct CUDADevicePtrT CUDADevicePtr;
-
-PollyGPUContext *polly_initContextCUDA();
-PollyGPUContext *polly_initContextCL();
-PollyGPUFunction *polly_getKernel(const char *BinaryBuffer,
+PollyGPUContext *polly_initContext();
+PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
                                   const char *KernelName);
 void polly_freeKernel(PollyGPUFunction *Kernel);
 void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,