[Polly][GPGPU] Added SPIR Code Generation and Corresponding Runtime Support for Intel
Summary:
Added SPIR code generation to the PPCG code generator. It can be selected with the polly-gpu-arch flag values 'spir32' or 'spir64' for 32-bit and 64-bit code, respectively. In addition, runtime support has been added to execute the generated SPIR code on Intel GPUs on systems equipped with Intel's open source driver Beignet (development version). This requires the CMake flag 'USE_INTEL_OCL' to be turned on and the polly-gpu-runtime flag value 'libopencl'. The transformation of LLVM IR to SPIR is currently quite a hack, consisting in part of regex string transformations. Tested (working) with Polybench 3.2 on an Intel i7-5500U (integrated graphics chip).

Reviewers: bollu, grosser, Meinersbur, singam-sanjay

Reviewed By: grosser, singam-sanjay

Subscribers: pollydev, nemanjai, mgorny, Anastasia, kbarton

Tags: #polly

Differential Revision: https://reviews.llvm.org/D35185

llvm-svn: 308751
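For reference, a minimal sketch of how the new options might be exercised. The build layout and the input file names are assumptions; the opt invocation mirrors the RUN line of the test added in this commit.

# Configure Polly with Intel OpenCL (Beignet) runtime support enabled (source/build paths assumed).
cmake -DUSE_INTEL_OCL=ON <path-to-llvm-source>

# Dump the generated 32-bit SPIR kernel IR for an input module, as in the new test below.
opt -O3 -polly -polly-target=gpu -polly-gpu-arch=spir32 \
    -polly-acc-dump-kernel-ir -polly-process-unprofitable -disable-output input.ll

# To execute on an Intel GPU through Beignet, additionally select the OpenCL runtime:
#   -polly-gpu-arch=spir64 (or spir32) -polly-gpu-runtime=libopencl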
@@ -16,7 +16,7 @@
#define POLLY_PPCGCODEGENERATION_H

/// The GPU Architecture to target.
enum GPUArch { NVPTX64 };
enum GPUArch { NVPTX64, SPIR32, SPIR64 };

/// The GPU Runtime implementation to use.
enum GPURuntime { CUDA, OpenCL };

@@ -545,6 +545,11 @@ private:
  /// @param The kernel to generate the intrinsic functions for.
  void insertKernelIntrinsics(ppcg_kernel *Kernel);

  /// Insert function calls to retrieve the SPIR group/local ids.
  ///
  /// @param The kernel to generate the function calls for.
  void insertKernelCallsSPIR(ppcg_kernel *Kernel);

  /// Setup the creation of functions referenced by the GPU kernel.
  ///
  /// 1. Create new function declarations in GPUModule which are the same as
@@ -1254,10 +1259,24 @@ void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr,

void GPUNodeBuilder::createKernelSync() {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  const char *SpirName = "__gen_ocl_barrier_global";

  Function *Sync;

  switch (Arch) {
  case GPUArch::SPIR64:
  case GPUArch::SPIR32:
    Sync = M->getFunction(SpirName);

    // If Sync is not available, declare it.
    if (!Sync) {
      GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
      std::vector<Type *> Args;
      FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
      Sync = Function::Create(Ty, Linkage, SpirName, M);
      Sync->setCallingConv(CallingConv::SPIR_FUNC);
    }
    break;
  case GPUArch::NVPTX64:
    Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
    break;
@@ -1668,7 +1687,8 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {

  finalizeKernelArguments(Kernel);
  Function *F = Builder.GetInsertBlock()->getParent();
  addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ);
  if (Arch == GPUArch::NVPTX64)
    addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ);
  clearDominators(F);
  clearScalarEvolution(F);
  clearLoops(F);
@@ -1725,12 +1745,35 @@ static std::string computeNVPTXDataLayout(bool is64Bit) {
  return Ret;
}

/// Compute the DataLayout string for a SPIR kernel.
///
/// @param is64Bit Are we looking for a 64 bit architecture?
static std::string computeSPIRDataLayout(bool is64Bit) {
  std::string Ret = "";

  if (!is64Bit) {
    Ret += "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
           "64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:"
           "32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:"
           "256:256-v256:256:256-v512:512:512-v1024:1024:1024";
  } else {
    Ret += "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
           "64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:"
           "32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:"
           "256:256-v256:256:256-v512:512:512-v1024:1024:1024";
  }

  return Ret;
}

Function *
GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
                                         SetVector<Value *> &SubtreeValues) {
  std::vector<Type *> Args;
  std::string Identifier = getKernelFuncName(Kernel->id);

  std::vector<Metadata *> MemoryType;

  for (long i = 0; i < Prog->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;
@@ -1739,16 +1782,23 @@ GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
      isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
      const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id);
      Args.push_back(SAI->getElementType());
      MemoryType.push_back(
          ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
    } else {
      static const int UseGlobalMemory = 1;
      Args.push_back(Builder.getInt8PtrTy(UseGlobalMemory));
      MemoryType.push_back(
          ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 1)));
    }
  }

  int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);

  for (long i = 0; i < NumHostIters; i++)
  for (long i = 0; i < NumHostIters; i++) {
    Args.push_back(Builder.getInt64Ty());
    MemoryType.push_back(
        ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
  }

  int NumVars = isl_space_dim(Kernel->space, isl_dim_param);

@@ -1757,19 +1807,49 @@ GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
    Value *Val = IDToValue[Id];
    isl_id_free(Id);
    Args.push_back(Val->getType());
    MemoryType.push_back(
        ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
  }

  for (auto *V : SubtreeValues)
  for (auto *V : SubtreeValues) {
    Args.push_back(V->getType());
    MemoryType.push_back(
        ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
  }

  auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
  auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
                              GPUModule.get());

  std::vector<Metadata *> EmptyStrings;

  for (unsigned int i = 0; i < MemoryType.size(); i++) {
    EmptyStrings.push_back(MDString::get(FN->getContext(), ""));
  }

  if (Arch == GPUArch::SPIR32 || Arch == GPUArch::SPIR64) {
    FN->setMetadata("kernel_arg_addr_space",
                    MDNode::get(FN->getContext(), MemoryType));
    FN->setMetadata("kernel_arg_name",
                    MDNode::get(FN->getContext(), EmptyStrings));
    FN->setMetadata("kernel_arg_access_qual",
                    MDNode::get(FN->getContext(), EmptyStrings));
    FN->setMetadata("kernel_arg_type",
                    MDNode::get(FN->getContext(), EmptyStrings));
    FN->setMetadata("kernel_arg_type_qual",
                    MDNode::get(FN->getContext(), EmptyStrings));
    FN->setMetadata("kernel_arg_base_type",
                    MDNode::get(FN->getContext(), EmptyStrings));
  }

  switch (Arch) {
  case GPUArch::NVPTX64:
    FN->setCallingConv(CallingConv::PTX_Kernel);
    break;
  case GPUArch::SPIR32:
  case GPUArch::SPIR64:
    FN->setCallingConv(CallingConv::SPIR_KERNEL);
    break;
  }

  auto Arg = FN->arg_begin();

@@ -1835,6 +1915,9 @@ void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) {
  Intrinsic::ID IntrinsicsTID[3];

  switch (Arch) {
  case GPUArch::SPIR64:
  case GPUArch::SPIR32:
    llvm_unreachable("Cannot generate NVVM intrinsics for SPIR");
  case GPUArch::NVPTX64:
    IntrinsicsBID[0] = Intrinsic::nvvm_read_ptx_sreg_ctaid_x;
    IntrinsicsBID[1] = Intrinsic::nvvm_read_ptx_sreg_ctaid_y;
@@ -1866,6 +1949,41 @@ void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) {
  }
}

void GPUNodeBuilder::insertKernelCallsSPIR(ppcg_kernel *Kernel) {
  const char *GroupName[3] = {"__gen_ocl_get_group_id0",
                              "__gen_ocl_get_group_id1",
                              "__gen_ocl_get_group_id2"};

  const char *LocalName[3] = {"__gen_ocl_get_local_id0",
                              "__gen_ocl_get_local_id1",
                              "__gen_ocl_get_local_id2"};

  auto createFunc = [this](const char *Name, __isl_take isl_id *Id) mutable {
    Module *M = Builder.GetInsertBlock()->getParent()->getParent();
    Function *FN = M->getFunction(Name);

    // If FN is not available, declare it.
    if (!FN) {
      GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
      std::vector<Type *> Args;
      FunctionType *Ty = FunctionType::get(Builder.getInt32Ty(), Args, false);
      FN = Function::Create(Ty, Linkage, Name, M);
      FN->setCallingConv(CallingConv::SPIR_FUNC);
    }

    Value *Val = Builder.CreateCall(FN, {});
    Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name);
    IDToValue[Id] = Val;
    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
  };

  for (int i = 0; i < Kernel->n_grid; ++i)
    createFunc(GroupName[i], isl_id_list_get_id(Kernel->block_ids, i));

  for (int i = 0; i < Kernel->n_block; ++i)
    createFunc(LocalName[i], isl_id_list_get_id(Kernel->thread_ids, i));
}

void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) {
  auto Arg = FN->arg_begin();
  for (long i = 0; i < Kernel->n_array; i++) {
@@ -2004,6 +2122,14 @@ void GPUNodeBuilder::createKernelFunction(
    GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl"));
    GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
    break;
  case GPUArch::SPIR32:
    GPUModule->setTargetTriple(Triple::normalize("spir-unknown-unknown"));
    GPUModule->setDataLayout(computeSPIRDataLayout(false /* is64Bit */));
    break;
  case GPUArch::SPIR64:
    GPUModule->setTargetTriple(Triple::normalize("spir64-unknown-unknown"));
    GPUModule->setDataLayout(computeSPIRDataLayout(true /* is64Bit */));
    break;
  }

  Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues);
@@ -2021,7 +2147,16 @@ void GPUNodeBuilder::createKernelFunction(

  prepareKernelArguments(Kernel, FN);
  createKernelVariables(Kernel, FN);
  insertKernelIntrinsics(Kernel);

  switch (Arch) {
  case GPUArch::NVPTX64:
    insertKernelIntrinsics(Kernel);
    break;
  case GPUArch::SPIR32:
  case GPUArch::SPIR64:
    insertKernelCallsSPIR(Kernel);
    break;
  }
}

std::string GPUNodeBuilder::createKernelASM() {
@@ -2038,6 +2173,13 @@ std::string GPUNodeBuilder::createKernelASM() {
      break;
    }
    break;
  case GPUArch::SPIR64:
  case GPUArch::SPIR32:
    std::string SPIRAssembly;
    raw_string_ostream IROstream(SPIRAssembly);
    IROstream << *GPUModule;
    IROstream.flush();
    return SPIRAssembly;
  }

  std::string ErrMsg;
@@ -2057,6 +2199,9 @@ std::string GPUNodeBuilder::createKernelASM() {
  case GPUArch::NVPTX64:
    subtarget = CudaVersion;
    break;
  case GPUArch::SPIR32:
  case GPUArch::SPIR64:
    llvm_unreachable("No subtarget for SPIR architecture");
  }

  std::unique_ptr<TargetMachine> TargetM(GPUTarget->createTargetMachine(
@@ -2097,13 +2242,15 @@ std::string GPUNodeBuilder::finalizeKernelFunction() {
  if (DumpKernelIR)
    outs() << *GPUModule << "\n";

  // Optimize module.
  llvm::legacy::PassManager OptPasses;
  PassManagerBuilder PassBuilder;
  PassBuilder.OptLevel = 3;
  PassBuilder.SizeLevel = 0;
  PassBuilder.populateModulePassManager(OptPasses);
  OptPasses.run(*GPUModule);
  if (Arch != GPUArch::SPIR32 && Arch != GPUArch::SPIR64) {
    // Optimize module.
    llvm::legacy::PassManager OptPasses;
    PassManagerBuilder PassBuilder;
    PassBuilder.OptLevel = 3;
    PassBuilder.SizeLevel = 0;
    PassBuilder.populateModulePassManager(OptPasses);
    OptPasses.run(*GPUModule);
  }

  std::string Assembly = createKernelASM();

@@ -117,7 +117,11 @@ static cl::opt<GPURuntime> GPURuntimeChoice(
static cl::opt<GPUArch>
    GPUArchChoice("polly-gpu-arch", cl::desc("The GPU Architecture to target"),
                  cl::values(clEnumValN(GPUArch::NVPTX64, "nvptx64",
                                        "target NVIDIA 64-bit architecture")),
                                        "target NVIDIA 64-bit architecture"),
                             clEnumValN(GPUArch::SPIR32, "spir32",
                                        "target SPIR 32-bit architecture"),
                             clEnumValN(GPUArch::SPIR64, "spir64",
                                        "target SPIR 64-bit architecture")),
                  cl::init(GPUArch::NVPTX64), cl::ZeroOrMore,
                  cl::cat(PollyCategory));
#endif

@@ -0,0 +1,118 @@
; RUN: opt -O3 -polly -polly-target=gpu \
; RUN: -polly-gpu-arch=spir32 \
; RUN: -polly-acc-dump-kernel-ir -polly-process-unprofitable -disable-output < %s | \
; RUN: FileCheck %s

; REQUIRES: pollyacc

; CHECK: target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
; CHECK-NEXT: target triple = "spir-unknown-unknown"

; CHECK-LABEL: define spir_kernel void @FUNC_double_parallel_loop_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef0) #0 !kernel_arg_addr_space !0 !kernel_arg_name !1 !kernel_arg_access_qual !1 !kernel_arg_type !1 !kernel_arg_type_qual !1 !kernel_arg_base_type !1 {
; CHECK-NEXT: entry:
; CHECK-NEXT: %0 = call i32 @__gen_ocl_get_group_id0()
; CHECK-NEXT: %__gen_ocl_get_group_id0 = zext i32 %0 to i64
; CHECK-NEXT: %1 = call i32 @__gen_ocl_get_group_id1()
; CHECK-NEXT: %__gen_ocl_get_group_id1 = zext i32 %1 to i64
; CHECK-NEXT: %2 = call i32 @__gen_ocl_get_local_id0()
; CHECK-NEXT: %__gen_ocl_get_local_id0 = zext i32 %2 to i64
; CHECK-NEXT: %3 = call i32 @__gen_ocl_get_local_id1()
; CHECK-NEXT: %__gen_ocl_get_local_id1 = zext i32 %3 to i64
; CHECK-NEXT: br label %polly.loop_preheader

; CHECK-LABEL: polly.loop_exit: ; preds = %polly.stmt.bb5
; CHECK-NEXT: ret void

; CHECK-LABEL: polly.loop_header: ; preds = %polly.stmt.bb5, %polly.loop_preheader
; CHECK-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.stmt.bb5 ]
; CHECK-NEXT: %4 = mul nsw i64 32, %__gen_ocl_get_group_id0
; CHECK-NEXT: %5 = add nsw i64 %4, %__gen_ocl_get_local_id0
; CHECK-NEXT: %6 = mul nsw i64 32, %__gen_ocl_get_group_id1
; CHECK-NEXT: %7 = add nsw i64 %6, %__gen_ocl_get_local_id1
; CHECK-NEXT: %8 = mul nsw i64 16, %polly.indvar
; CHECK-NEXT: %9 = add nsw i64 %7, %8
; CHECK-NEXT: br label %polly.stmt.bb5

; CHECK-LABEL: polly.stmt.bb5: ; preds = %polly.loop_header
; CHECK-NEXT: %10 = mul i64 %5, %9
; CHECK-NEXT: %p_tmp6 = sitofp i64 %10 to float
; CHECK-NEXT: %polly.access.cast.MemRef0 = bitcast i8 addrspace(1)* %MemRef0 to float addrspace(1)*
; CHECK-NEXT: %11 = mul nsw i64 32, %__gen_ocl_get_group_id0
; CHECK-NEXT: %12 = add nsw i64 %11, %__gen_ocl_get_local_id0
; CHECK-NEXT: %polly.access.mul.MemRef0 = mul nsw i64 %12, 1024
; CHECK-NEXT: %13 = mul nsw i64 32, %__gen_ocl_get_group_id1
; CHECK-NEXT: %14 = add nsw i64 %13, %__gen_ocl_get_local_id1
; CHECK-NEXT: %15 = mul nsw i64 16, %polly.indvar
; CHECK-NEXT: %16 = add nsw i64 %14, %15
; CHECK-NEXT: %polly.access.add.MemRef0 = add nsw i64 %polly.access.mul.MemRef0, %16
; CHECK-NEXT: %polly.access.MemRef0 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef0, i64 %polly.access.add.MemRef0
; CHECK-NEXT: %tmp8_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef0, align 4
; CHECK-NEXT: %p_tmp9 = fadd float %tmp8_p_scalar_, %p_tmp6
; CHECK-NEXT: %polly.access.cast.MemRef01 = bitcast i8 addrspace(1)* %MemRef0 to float addrspace(1)*
; CHECK-NEXT: %17 = mul nsw i64 32, %__gen_ocl_get_group_id0
; CHECK-NEXT: %18 = add nsw i64 %17, %__gen_ocl_get_local_id0
; CHECK-NEXT: %polly.access.mul.MemRef02 = mul nsw i64 %18, 1024
; CHECK-NEXT: %19 = mul nsw i64 32, %__gen_ocl_get_group_id1
; CHECK-NEXT: %20 = add nsw i64 %19, %__gen_ocl_get_local_id1
; CHECK-NEXT: %21 = mul nsw i64 16, %polly.indvar
; CHECK-NEXT: %22 = add nsw i64 %20, %21
; CHECK-NEXT: %polly.access.add.MemRef03 = add nsw i64 %polly.access.mul.MemRef02, %22
; CHECK-NEXT: %polly.access.MemRef04 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef01, i64 %polly.access.add.MemRef03
; CHECK-NEXT: store float %p_tmp9, float addrspace(1)* %polly.access.MemRef04, align 4
; CHECK-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1
; CHECK-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar_next, 1
; CHECK-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit

; CHECK-LABEL: polly.loop_preheader: ; preds = %entry
; CHECK-NEXT: br label %polly.loop_header

; CHECK: attributes #0 = { "polly.skip.fn" }

; void double_parallel_loop(float A[][1024]) {
;   for (long i = 0; i < 1024; i++)
;     for (long j = 0; j < 1024; j++)
;       A[i][j] += i * j;
; }
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

define void @double_parallel_loop([1024 x float]* %A) {
bb:
  br label %bb2

bb2: ; preds = %bb13, %bb
  %i.0 = phi i64 [ 0, %bb ], [ %tmp14, %bb13 ]
  %exitcond1 = icmp ne i64 %i.0, 1024
  br i1 %exitcond1, label %bb3, label %bb15

bb3: ; preds = %bb2
  br label %bb4

bb4: ; preds = %bb10, %bb3
  %j.0 = phi i64 [ 0, %bb3 ], [ %tmp11, %bb10 ]
  %exitcond = icmp ne i64 %j.0, 1024
  br i1 %exitcond, label %bb5, label %bb12

bb5: ; preds = %bb4
  %tmp = mul nuw nsw i64 %i.0, %j.0
  %tmp6 = sitofp i64 %tmp to float
  %tmp7 = getelementptr inbounds [1024 x float], [1024 x float]* %A, i64 %i.0, i64 %j.0
  %tmp8 = load float, float* %tmp7, align 4
  %tmp9 = fadd float %tmp8, %tmp6
  store float %tmp9, float* %tmp7, align 4
  br label %bb10

bb10: ; preds = %bb5
  %tmp11 = add nuw nsw i64 %j.0, 1
  br label %bb4

bb12: ; preds = %bb4
  br label %bb13

bb13: ; preds = %bb12
  %tmp14 = add nuw nsw i64 %i.0, 1
  br label %bb2

bb15: ; preds = %bb2
  ret void
}

@@ -23,13 +23,14 @@
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#endif /* __APPLE__ */
#endif /* HAS_LIBOPENCL */

#include <dlfcn.h>
#include <stdarg.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int DebugMode;
static int CacheMode;
@@ -89,6 +90,7 @@ struct OpenCLDevicePtrT {

/* Dynamic library handles for the OpenCL runtime library. */
static void *HandleOpenCL;
static void *HandleOpenCLBeignet;

/* Type-defines of function pointer to OpenCL Runtime API. */
typedef cl_int clGetPlatformIDsFcnTy(cl_uint NumEntries,
@@ -139,6 +141,12 @@ clEnqueueWriteBufferFcnTy(cl_command_queue CommandQueue, cl_mem Buffer,
                          const cl_event *EventWaitList, cl_event *Event);
static clEnqueueWriteBufferFcnTy *clEnqueueWriteBufferFcnPtr;

typedef cl_program
clCreateProgramWithLLVMIntelFcnTy(cl_context Context, cl_uint NumDevices,
                                  const cl_device_id *DeviceList,
                                  const char *Filename, cl_int *ErrcodeRet);
static clCreateProgramWithLLVMIntelFcnTy *clCreateProgramWithLLVMIntelFcnPtr;

typedef cl_program clCreateProgramWithBinaryFcnTy(
    cl_context Context, cl_uint NumDevices, const cl_device_id *DeviceList,
    const size_t *Lengths, const unsigned char **Binaries, cl_int *BinaryStatus,
@@ -210,6 +218,7 @@ static void *getAPIHandleCL(void *Handle, const char *FuncName) {
}

static int initialDeviceAPILibrariesCL() {
  HandleOpenCLBeignet = dlopen("/usr/local/lib/beignet/libcl.so", RTLD_LAZY);
  HandleOpenCL = dlopen("libOpenCL.so", RTLD_LAZY);
  if (!HandleOpenCL) {
    fprintf(stderr, "Cannot open library: %s. \n", dlerror());
@@ -237,67 +246,79 @@ static int initialDeviceAPIsCL() {
  if (initialDeviceAPILibrariesCL() == 0)
    return 0;

  // FIXME: We are now always selecting the Intel Beignet driver if it is
  // available on the system, instead of a possible NVIDIA or AMD OpenCL
  // API. This selection should occur based on the target architecture
  // chosen when compiling.
  void *Handle =
      (HandleOpenCLBeignet != NULL ? HandleOpenCLBeignet : HandleOpenCL);

  clGetPlatformIDsFcnPtr =
      (clGetPlatformIDsFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetPlatformIDs");
      (clGetPlatformIDsFcnTy *)getAPIHandleCL(Handle, "clGetPlatformIDs");

  clGetDeviceIDsFcnPtr =
      (clGetDeviceIDsFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetDeviceIDs");
      (clGetDeviceIDsFcnTy *)getAPIHandleCL(Handle, "clGetDeviceIDs");

  clGetDeviceInfoFcnPtr =
      (clGetDeviceInfoFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetDeviceInfo");
      (clGetDeviceInfoFcnTy *)getAPIHandleCL(Handle, "clGetDeviceInfo");

  clGetKernelInfoFcnPtr =
      (clGetKernelInfoFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetKernelInfo");
      (clGetKernelInfoFcnTy *)getAPIHandleCL(Handle, "clGetKernelInfo");

  clCreateContextFcnPtr =
      (clCreateContextFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateContext");
      (clCreateContextFcnTy *)getAPIHandleCL(Handle, "clCreateContext");

  clCreateCommandQueueFcnPtr = (clCreateCommandQueueFcnTy *)getAPIHandleCL(
      HandleOpenCL, "clCreateCommandQueue");
      Handle, "clCreateCommandQueue");

  clCreateBufferFcnPtr =
      (clCreateBufferFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateBuffer");
      (clCreateBufferFcnTy *)getAPIHandleCL(Handle, "clCreateBuffer");

  clEnqueueWriteBufferFcnPtr = (clEnqueueWriteBufferFcnTy *)getAPIHandleCL(
      HandleOpenCL, "clEnqueueWriteBuffer");
      Handle, "clEnqueueWriteBuffer");

  if (HandleOpenCLBeignet)
    clCreateProgramWithLLVMIntelFcnPtr =
        (clCreateProgramWithLLVMIntelFcnTy *)getAPIHandleCL(
            Handle, "clCreateProgramWithLLVMIntel");

  clCreateProgramWithBinaryFcnPtr =
      (clCreateProgramWithBinaryFcnTy *)getAPIHandleCL(
          HandleOpenCL, "clCreateProgramWithBinary");
          Handle, "clCreateProgramWithBinary");

  clBuildProgramFcnPtr =
      (clBuildProgramFcnTy *)getAPIHandleCL(HandleOpenCL, "clBuildProgram");
      (clBuildProgramFcnTy *)getAPIHandleCL(Handle, "clBuildProgram");

  clCreateKernelFcnPtr =
      (clCreateKernelFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateKernel");
      (clCreateKernelFcnTy *)getAPIHandleCL(Handle, "clCreateKernel");

  clSetKernelArgFcnPtr =
      (clSetKernelArgFcnTy *)getAPIHandleCL(HandleOpenCL, "clSetKernelArg");
      (clSetKernelArgFcnTy *)getAPIHandleCL(Handle, "clSetKernelArg");

  clEnqueueNDRangeKernelFcnPtr = (clEnqueueNDRangeKernelFcnTy *)getAPIHandleCL(
      HandleOpenCL, "clEnqueueNDRangeKernel");
      Handle, "clEnqueueNDRangeKernel");

  clEnqueueReadBufferFcnPtr = (clEnqueueReadBufferFcnTy *)getAPIHandleCL(
      HandleOpenCL, "clEnqueueReadBuffer");
  clEnqueueReadBufferFcnPtr =
      (clEnqueueReadBufferFcnTy *)getAPIHandleCL(Handle, "clEnqueueReadBuffer");

  clFlushFcnPtr = (clFlushFcnTy *)getAPIHandleCL(HandleOpenCL, "clFlush");
  clFlushFcnPtr = (clFlushFcnTy *)getAPIHandleCL(Handle, "clFlush");

  clFinishFcnPtr = (clFinishFcnTy *)getAPIHandleCL(HandleOpenCL, "clFinish");
  clFinishFcnPtr = (clFinishFcnTy *)getAPIHandleCL(Handle, "clFinish");

  clReleaseKernelFcnPtr =
      (clReleaseKernelFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseKernel");
      (clReleaseKernelFcnTy *)getAPIHandleCL(Handle, "clReleaseKernel");

  clReleaseProgramFcnPtr =
      (clReleaseProgramFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseProgram");
      (clReleaseProgramFcnTy *)getAPIHandleCL(Handle, "clReleaseProgram");

  clReleaseMemObjectFcnPtr = (clReleaseMemObjectFcnTy *)getAPIHandleCL(
      HandleOpenCL, "clReleaseMemObject");
  clReleaseMemObjectFcnPtr =
      (clReleaseMemObjectFcnTy *)getAPIHandleCL(Handle, "clReleaseMemObject");

  clReleaseCommandQueueFcnPtr = (clReleaseCommandQueueFcnTy *)getAPIHandleCL(
      HandleOpenCL, "clReleaseCommandQueue");
      Handle, "clReleaseCommandQueue");

  clReleaseContextFcnPtr =
      (clReleaseContextFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseContext");
      (clReleaseContextFcnTy *)getAPIHandleCL(Handle, "clReleaseContext");

  return 1;
}

@@ -481,12 +502,32 @@ static PollyGPUFunction *getKernelCL(const char *BinaryBuffer,
  }

  cl_int Ret;
  size_t BinarySize = strlen(BinaryBuffer);
  ((OpenCLKernel *)Function->Kernel)->Program = clCreateProgramWithBinaryFcnPtr(
      ((OpenCLContext *)GlobalContext->Context)->Context, 1, &GlobalDeviceID,
      (const size_t *)&BinarySize, (const unsigned char **)&BinaryBuffer, NULL,
      &Ret);
  checkOpenCLError(Ret, "Failed to create program from binary.\n");

  if (HandleOpenCLBeignet) {
    // TODO: This is a workaround, since clCreateProgramWithLLVMIntel only
    // accepts a filename to a valid llvm-ir file as an argument, instead
    // of accepting the BinaryBuffer directly.
    FILE *fp = fopen("kernel.ll", "wb");
    if (fp != NULL) {
      fputs(BinaryBuffer, fp);
      fclose(fp);
    }

    ((OpenCLKernel *)Function->Kernel)->Program =
        clCreateProgramWithLLVMIntelFcnPtr(
            ((OpenCLContext *)GlobalContext->Context)->Context, 1,
            &GlobalDeviceID, "kernel.ll", &Ret);
    checkOpenCLError(Ret, "Failed to create program from llvm.\n");
    unlink("kernel.ll");
  } else {
    size_t BinarySize = strlen(BinaryBuffer);
    ((OpenCLKernel *)Function->Kernel)->Program =
        clCreateProgramWithBinaryFcnPtr(
            ((OpenCLContext *)GlobalContext->Context)->Context, 1,
            &GlobalDeviceID, (const size_t *)&BinarySize,
            (const unsigned char **)&BinaryBuffer, NULL, &Ret);
    checkOpenCLError(Ret, "Failed to create program from binary.\n");
  }

  Ret = clBuildProgramFcnPtr(((OpenCLKernel *)Function->Kernel)->Program, 1,
                             &GlobalDeviceID, NULL, NULL, NULL);