[Polly] Added OpenCL Runtime to GPURuntime Library for GPGPU CodeGen

Summary:
When compiling for GPU, one can now choose to compile for OpenCL or CUDA
with the corresponding -polly-gpu-runtime flag (libopencl / libcudart). The
GPURuntime library (GPUJIT) has been extended with the OpenCL Runtime library
for that purpose; it selects the library calls that match the runtime chosen
when compiling (via different initialization calls).

Additionally, a specific GPU target architecture can now be chosen with -polly-gpu-arch (only nvptx64 is implemented thus far).

Reviewers: grosser, bollu, Meinersbur, etherzhhb, singam-sanjay

Reviewed By: grosser, Meinersbur

Subscribers: singam-sanjay, llvm-commits, pollydev, nemanjai, mgorny, yaxunl, Anastasia

Tags: #polly

Differential Revision: https://reviews.llvm.org/D32431

llvm-svn: 302379
This commit is contained in:
Siddharth Bhat 2017-05-07 21:03:46 +00:00
parent 2d1c6d6e8d
commit 17f01968f1
10 changed files with 1405 additions and 133 deletions

View File

@ -152,9 +152,10 @@ SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
option(POLLY_ENABLE_GPGPU_CODEGEN "Enable GPGPU code generation feature" OFF)
if (POLLY_ENABLE_GPGPU_CODEGEN)
# Do not require CUDA, as GPU code generation test cases can be run without
# a cuda library.
# Do not require CUDA/OpenCL, as GPU code generation test cases can be run
# without a CUDA/OpenCL library.
FIND_PACKAGE(CUDA)
FIND_PACKAGE(OpenCL)
set(GPU_CODEGEN TRUE)
else(POLLY_ENABLE_GPGPU_CODEGEN)
set(GPU_CODEGEN FALSE)
@ -163,8 +164,13 @@ endif(POLLY_ENABLE_GPGPU_CODEGEN)
# Support GPGPU code generation if the library is available.
if (CUDALIB_FOUND)
add_definitions(-DHAS_LIBCUDART)
INCLUDE_DIRECTORIES( ${CUDALIB_INCLUDE_DIR} )
endif(CUDALIB_FOUND)
if (OpenCL_FOUND)
add_definitions(-DHAS_LIBOPENCL)
INCLUDE_DIRECTORIES( ${OpenCL_INCLUDE_DIR} )
endif(OpenCL_FOUND)
option(POLLY_BUNDLED_ISL "Use the bundled version of libisl included in Polly" ON)
if (NOT POLLY_BUNDLED_ISL)

View File

@ -0,0 +1,24 @@
//===--- polly/PPCGCodeGeneration.h - Polly Accelerator Code Generation. --===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Take a scop created by ScopInfo and map it to GPU code using the ppcg
// GPU mapping strategy.
//
//===----------------------------------------------------------------------===//
#ifndef POLLY_PPCGCODEGENERATION_H
#define POLLY_PPCGCODEGENERATION_H
/// The GPU architecture that device code is generated for.
enum GPUArch {
  /// NVIDIA 64-bit architecture (chosen with -polly-gpu-arch=nvptx64).
  NVPTX64
};
/// The GPU runtime implementation that the generated host code targets.
enum GPURuntime {
  /// The CUDA Runtime API (chosen with -polly-gpu-runtime=libcudart).
  CUDA,
  /// The OpenCL Runtime API (chosen with -polly-gpu-runtime=libopencl).
  OpenCL
};
#endif // POLLY_PPCGCODEGENERATION_H

View File

@ -15,6 +15,7 @@
#ifndef POLLY_LINKALLPASSES_H
#define POLLY_LINKALLPASSES_H
#include "polly/CodeGen/PPCGCodeGeneration.h"
#include "polly/Config/config.h"
#include "polly/PruneUnprofitable.h"
#include "polly/Simplify.h"
@ -48,7 +49,8 @@ llvm::Pass *createScopInfoWrapperPassPass();
llvm::Pass *createIslAstInfoPass();
llvm::Pass *createCodeGenerationPass();
#ifdef GPU_CODEGEN
llvm::Pass *createPPCGCodeGenerationPass();
llvm::Pass *createPPCGCodeGenerationPass(GPUArch Arch = GPUArch::NVPTX64,
GPURuntime Runtime = GPURuntime::CUDA);
#endif
llvm::Pass *createIslScheduleOptimizerPass();
llvm::Pass *createFlattenSchedulePass();

View File

@ -12,6 +12,7 @@
//
//===----------------------------------------------------------------------===//
#include "polly/CodeGen/PPCGCodeGeneration.h"
#include "polly/CodeGen/IslAst.h"
#include "polly/CodeGen/IslNodeBuilder.h"
#include "polly/CodeGen/Utils.h"
@ -153,9 +154,9 @@ public:
GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator,
const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE,
DominatorTree &DT, Scop &S, BasicBlock *StartBlock,
gpu_prog *Prog)
gpu_prog *Prog, GPURuntime Runtime, GPUArch Arch)
: IslNodeBuilder(Builder, Annotator, DL, LI, SE, DT, S, StartBlock),
Prog(Prog) {
Prog(Prog), Runtime(Runtime), Arch(Arch) {
getExprBuilder().setIDToSAI(&IDToSAI);
}
@ -201,6 +202,12 @@ private:
/// The GPU program we generate code for.
gpu_prog *Prog;
/// The GPU Runtime implementation to use (OpenCL or CUDA).
GPURuntime Runtime;
/// The GPU Architecture to target.
GPUArch Arch;
/// Class to free isl_ids.
class IslIdDeleter {
public:
@ -752,7 +759,17 @@ void GPUNodeBuilder::createCallSynchronizeDevice() {
}
Value *GPUNodeBuilder::createCallInitContext() {
const char *Name = "polly_initContext";
const char *Name;
switch (Runtime) {
case GPURuntime::CUDA:
Name = "polly_initContextCUDA";
break;
case GPURuntime::OpenCL:
Name = "polly_initContextCL";
break;
}
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Function *F = M->getFunction(Name);
@ -1028,7 +1045,15 @@ void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr,
void GPUNodeBuilder::createKernelSync() {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
auto *Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
Function *Sync;
switch (Arch) {
case GPUArch::NVPTX64:
Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
break;
}
Builder.CreateCall(Sync, {});
}
@ -1434,7 +1459,12 @@ GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
GPUModule.get());
FN->setCallingConv(CallingConv::PTX_Kernel);
switch (Arch) {
case GPUArch::NVPTX64:
FN->setCallingConv(CallingConv::PTX_Kernel);
break;
}
auto Arg = FN->arg_begin();
for (long i = 0; i < Kernel->n_array; i++) {
@ -1495,12 +1525,19 @@ GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
}
void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) {
Intrinsic::ID IntrinsicsBID[] = {Intrinsic::nvvm_read_ptx_sreg_ctaid_x,
Intrinsic::nvvm_read_ptx_sreg_ctaid_y};
Intrinsic::ID IntrinsicsBID[2];
Intrinsic::ID IntrinsicsTID[3];
Intrinsic::ID IntrinsicsTID[] = {Intrinsic::nvvm_read_ptx_sreg_tid_x,
Intrinsic::nvvm_read_ptx_sreg_tid_y,
Intrinsic::nvvm_read_ptx_sreg_tid_z};
switch (Arch) {
case GPUArch::NVPTX64:
IntrinsicsBID[0] = Intrinsic::nvvm_read_ptx_sreg_ctaid_x;
IntrinsicsBID[1] = Intrinsic::nvvm_read_ptx_sreg_ctaid_y;
IntrinsicsTID[0] = Intrinsic::nvvm_read_ptx_sreg_tid_x;
IntrinsicsTID[1] = Intrinsic::nvvm_read_ptx_sreg_tid_y;
IntrinsicsTID[2] = Intrinsic::nvvm_read_ptx_sreg_tid_z;
break;
}
auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable {
std::string Name = isl_id_get_name(Id);
@ -1649,11 +1686,18 @@ void GPUNodeBuilder::createKernelVariables(ppcg_kernel *Kernel, Function *FN) {
void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel,
SetVector<Value *> &SubtreeValues) {
std::string Identifier = "kernel_" + std::to_string(Kernel->id);
GPUModule.reset(new Module(Identifier, Builder.getContext()));
GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
switch (Arch) {
case GPUArch::NVPTX64:
if (Runtime == GPURuntime::CUDA)
GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
else if (Runtime == GPURuntime::OpenCL)
GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl"));
GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
break;
}
Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues);
@ -1674,7 +1718,21 @@ void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel,
}
std::string GPUNodeBuilder::createKernelASM() {
llvm::Triple GPUTriple(Triple::normalize("nvptx64-nvidia-cuda"));
llvm::Triple GPUTriple;
switch (Arch) {
case GPUArch::NVPTX64:
switch (Runtime) {
case GPURuntime::CUDA:
GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-cuda"));
break;
case GPURuntime::OpenCL:
GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-nvcl"));
break;
}
break;
}
std::string ErrMsg;
auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg);
@ -1685,9 +1743,17 @@ std::string GPUNodeBuilder::createKernelASM() {
TargetOptions Options;
Options.UnsafeFPMath = FastMath;
std::unique_ptr<TargetMachine> TargetM(
GPUTarget->createTargetMachine(GPUTriple.getTriple(), CudaVersion, "",
Options, Optional<Reloc::Model>()));
std::string subtarget;
switch (Arch) {
case GPUArch::NVPTX64:
subtarget = CudaVersion;
break;
}
std::unique_ptr<TargetMachine> TargetM(GPUTarget->createTargetMachine(
GPUTriple.getTriple(), subtarget, "", Options, Optional<Reloc::Model>()));
SmallString<0> ASMString;
raw_svector_ostream ASMStream(ASMString);
@ -1739,6 +1805,10 @@ class PPCGCodeGeneration : public ScopPass {
public:
static char ID;
GPURuntime Runtime = GPURuntime::CUDA;
GPUArch Architecture = GPUArch::NVPTX64;
/// The scop that is currently processed.
Scop *S;
@ -2522,7 +2592,7 @@ public:
executeScopConditionally(*S, Builder.getTrue(), *DT, *RI, *LI);
GPUNodeBuilder NodeBuilder(Builder, Annotator, *DL, *LI, *SE, *DT, *S,
StartBlock, Prog);
StartBlock, Prog, Runtime, Architecture);
// TODO: Handle LICM
auto SplitBlock = StartBlock->getSinglePredecessor();
@ -2610,7 +2680,12 @@ public:
char PPCGCodeGeneration::ID = 1;
Pass *polly::createPPCGCodeGenerationPass() { return new PPCGCodeGeneration(); }
/// Create a PPCGCodeGeneration pass configured for the given GPU
/// architecture and GPU runtime.
///
/// @param Arch    The GPU architecture to generate device code for.
/// @param Runtime The GPU runtime whose API the host code should use.
///
/// @returns The newly allocated pass; ownership passes to the caller.
Pass *polly::createPPCGCodeGenerationPass(GPUArch Arch, GPURuntime Runtime) {
  auto *CodeGenPass = new PPCGCodeGeneration();
  CodeGenPass->Architecture = Arch;
  CodeGenPass->Runtime = Runtime;
  return CodeGenPass;
}
INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg",
"Polly - Apply PPCG translation to SCOP", false, false)

View File

@ -23,6 +23,7 @@
#include "polly/Canonicalization.h"
#include "polly/CodeGen/CodeGeneration.h"
#include "polly/CodeGen/CodegenCleanup.h"
#include "polly/CodeGen/PPCGCodeGeneration.h"
#include "polly/DeLICM.h"
#include "polly/DependenceInfo.h"
#include "polly/FlattenSchedule.h"
@ -101,6 +102,23 @@ static cl::opt<TargetChoice>
),
cl::init(TARGET_CPU), cl::ZeroOrMore, cl::cat(PollyCategory));
#ifdef GPU_CODEGEN
/// Command-line option -polly-gpu-runtime: selects the GPU runtime API
/// ("libcudart" for CUDA, "libopencl" for OpenCL). Defaults to CUDA. The
/// value is forwarded to createPPCGCodeGenerationPass when the GPU target is
/// selected.
static cl::opt<GPURuntime> GPURuntimeChoice(
"polly-gpu-runtime", cl::desc("The GPU Runtime API to target"),
cl::values(clEnumValN(GPURuntime::CUDA, "libcudart",
"use the CUDA Runtime API"),
clEnumValN(GPURuntime::OpenCL, "libopencl",
"use the OpenCL Runtime API")),
cl::init(GPURuntime::CUDA), cl::ZeroOrMore, cl::cat(PollyCategory));
/// Command-line option -polly-gpu-arch: selects the GPU architecture to
/// generate code for. "nvptx64" is the only accepted value and the default.
/// The value is forwarded to createPPCGCodeGenerationPass when the GPU target
/// is selected.
static cl::opt<GPUArch>
GPUArchChoice("polly-gpu-arch", cl::desc("The GPU Architecture to target"),
cl::values(clEnumValN(GPUArch::NVPTX64, "nvptx64",
"target NVIDIA 64-bit architecture")),
cl::init(GPUArch::NVPTX64), cl::ZeroOrMore,
cl::cat(PollyCategory));
#endif
VectorizerChoice polly::PollyVectorizerChoice;
static cl::opt<polly::VectorizerChoice, true> Vectorizer(
"polly-vectorizer", cl::desc("Select the vectorization strategy"),
@ -309,7 +327,8 @@ void registerPollyPasses(llvm::legacy::PassManagerBase &PM) {
if (Target == TARGET_GPU) {
#ifdef GPU_CODEGEN
PM.add(polly::createPPCGCodeGenerationPass());
PM.add(
polly::createPPCGCodeGenerationPass(GPUArchChoice, GPURuntimeChoice));
#endif
} else {
switch (CodeGeneration) {

View File

@ -35,7 +35,7 @@
; CHECK-NOT: polly_freeDeviceMemory
; CHECK-NOT: polly_allocateMemoryForDevice
; CHECK: %13 = call i8* @polly_initContext()
; CHECK: %13 = call i8* @polly_initContextCUDA()
; CHECK-NEXT: %14 = bitcast i32* %A to i8*
; CHECK-NEXT: %15 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
; CHECK-NEXT: store i8* %14, i8** %polly_launch_0_param_0
@ -46,7 +46,7 @@
; CHECK-NEXT: store i8* %17, i8** %polly_launch_0_param_1
; CHECK-NEXT: %19 = bitcast i8** %polly_launch_0_param_1 to i8*
; CHECK-NEXT: store i8* %19, i8** %18
; CHECK-NEXT: %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([750 x i8], [750 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0))
; CHECK-NEXT: %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([750 x i8], [750 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0))
; CHECK-NEXT: call void @polly_launchKernel(i8* %20, i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
; CHECK-NEXT: call void @polly_freeKernel(i8* %20)
; CHECK-NEXT: call void @polly_synchronizeDevice()

View File

@ -29,7 +29,7 @@
; CODE-NEXT: if (arg >= 32 * b0 + t0 + 1048576 * c0 + 1)
; CODE-NEXT: Stmt_bb6(0, 32 * b0 + t0 + 1048576 * c0);
; IR: call i8* @polly_initContext()
; IR: call i8* @polly_initContextCUDA()
; IR-NEXT: sext i32 %arg to i64
; IR-NEXT: mul i64
; IR-NEXT: @polly_allocateMemoryForDevice

View File

@ -1,5 +1,5 @@
if (CUDALIB_FOUND)
if (CUDALIB_FOUND OR OpenCL_FOUND)
add_subdirectory(GPURuntime)
endif (CUDALIB_FOUND)
endif (CUDALIB_FOUND OR OpenCL_FOUND)
set(LLVM_COMMON_DEPENDS ${LLVM_COMMON_DEPENDS} PARENT_SCOPE)

File diff suppressed because it is too large Load Diff

View File

@ -76,12 +76,27 @@
*
*/
/* Identifies a GPU runtime backend (CUDA or OpenCL).
 * NOTE(review): presumably RUNTIME_NONE marks the state before either
 * polly_initContextCUDA() or polly_initContextCL() has been called; the
 * implementation is not visible here, so confirm against GPUJIT.c. */
typedef enum PollyGPURuntimeT {
RUNTIME_NONE,
RUNTIME_CUDA,
RUNTIME_CL
} PollyGPURuntime;
typedef struct PollyGPUContextT PollyGPUContext;
typedef struct PollyGPUFunctionT PollyGPUFunction;
typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr;
PollyGPUContext *polly_initContext();
PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
typedef struct OpenCLContextT OpenCLContext;
typedef struct OpenCLKernelT OpenCLKernel;
typedef struct OpenCLDevicePtrT OpenCLDevicePtr;
typedef struct CUDAContextT CUDAContext;
typedef struct CUDAKernelT CUDAKernel;
typedef struct CUDADevicePtrT CUDADevicePtr;
PollyGPUContext *polly_initContextCUDA();
PollyGPUContext *polly_initContextCL();
PollyGPUFunction *polly_getKernel(const char *BinaryBuffer,
const char *KernelName);
void polly_freeKernel(PollyGPUFunction *Kernel);
void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,