Revert "[Polly] Added OpenCL Runtime to GPURuntime Library for GPGPU CodeGen"
This reverts commit 17a84e414adb51ee375d14836d4c2a817b191933.

Patches should have been submitted in the order of:
1. D32852
2. D32854
3. D32431

I mistakenly pushed D32431 (3) first. Reverting to push in the correct order.

llvm-svn: 302217
parent 2b0fae877e
commit c1267b9baa

@@ -152,10 +152,9 @@ SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
 option(POLLY_ENABLE_GPGPU_CODEGEN "Enable GPGPU code generation feature" OFF)
 if (POLLY_ENABLE_GPGPU_CODEGEN)
-  # Do not require CUDA/OpenCL, as GPU code generation test cases can be run
-  # without a CUDA/OpenCL library.
+  # Do not require CUDA, as GPU code generation test cases can be run without
+  # a cuda library.
   FIND_PACKAGE(CUDA)
-  FIND_PACKAGE(OpenCL)
   set(GPU_CODEGEN TRUE)
 else(POLLY_ENABLE_GPGPU_CODEGEN)
   set(GPU_CODEGEN FALSE)
@@ -164,13 +163,8 @@ endif(POLLY_ENABLE_GPGPU_CODEGEN)
 # Support GPGPU code generation if the library is available.
 if (CUDALIB_FOUND)
   add_definitions(-DHAS_LIBCUDART)
   INCLUDE_DIRECTORIES( ${CUDALIB_INCLUDE_DIR} )
 endif(CUDALIB_FOUND)
-if (OpenCL_FOUND)
-  add_definitions(-DHAS_LIBOPENCL)
-  INCLUDE_DIRECTORIES( ${OpenCL_INCLUDE_DIR} )
-endif(OpenCL_FOUND)
-
 
 option(POLLY_BUNDLED_ISL "Use the bundled version of libisl included in Polly" ON)
 if (NOT POLLY_BUNDLED_ISL)
@@ -1,24 +0,0 @@
-//===--- polly/PPCGCodeGeneration.h - Polly Accelerator Code Generation. --===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Take a scop created by ScopInfo and map it to GPU code using the ppcg
-// GPU mapping strategy.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef POLLY_PPCGCODEGENERATION_H
-#define POLLY_PPCGCODEGENERATION_H
-
-/// The GPU Architecture to target.
-enum GPUArch { NVPTX64 };
-
-/// The GPU Runtime implementation to use.
-enum GPURuntime { CUDA, OpenCL };
-
-#endif // POLLY_PPCGCODEGENERATION_H
@@ -15,7 +15,6 @@
 #ifndef POLLY_LINKALLPASSES_H
 #define POLLY_LINKALLPASSES_H
 
-#include "polly/CodeGen/PPCGCodeGeneration.h"
 #include "polly/Config/config.h"
 #include "polly/PruneUnprofitable.h"
 #include "polly/Simplify.h"
@@ -49,8 +48,7 @@ llvm::Pass *createScopInfoWrapperPassPass();
 llvm::Pass *createIslAstInfoPass();
 llvm::Pass *createCodeGenerationPass();
 #ifdef GPU_CODEGEN
-llvm::Pass *createPPCGCodeGenerationPass(GPUArch Arch = GPUArch::NVPTX64,
-                                         GPURuntime Runtime = GPURuntime::CUDA);
+llvm::Pass *createPPCGCodeGenerationPass();
 #endif
 llvm::Pass *createIslScheduleOptimizerPass();
 llvm::Pass *createFlattenSchedulePass();
@@ -12,7 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "polly/CodeGen/PPCGCodeGeneration.h"
 #include "polly/CodeGen/IslAst.h"
 #include "polly/CodeGen/IslNodeBuilder.h"
 #include "polly/CodeGen/Utils.h"
@@ -154,9 +153,9 @@ public:
   GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator,
                  const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE,
                  DominatorTree &DT, Scop &S, BasicBlock *StartBlock,
-                 gpu_prog *Prog, GPURuntime Runtime, GPUArch Arch)
+                 gpu_prog *Prog)
       : IslNodeBuilder(Builder, Annotator, DL, LI, SE, DT, S, StartBlock),
-        Prog(Prog), Runtime(Runtime), Arch(Arch) {
+        Prog(Prog) {
     getExprBuilder().setIDToSAI(&IDToSAI);
   }
 
@@ -202,12 +201,6 @@ private:
   /// The GPU program we generate code for.
   gpu_prog *Prog;
 
-  /// The GPU Runtime implementation to use (OpenCL or CUDA).
-  GPURuntime Runtime;
-
-  /// The GPU Architecture to target.
-  GPUArch Arch;
-
   /// Class to free isl_ids.
   class IslIdDeleter {
   public:
@@ -759,17 +752,7 @@ void GPUNodeBuilder::createCallSynchronizeDevice() {
 }
 
 Value *GPUNodeBuilder::createCallInitContext() {
-  const char *Name;
-
-  switch (Runtime) {
-  case GPURuntime::CUDA:
-    Name = "polly_initContextCUDA";
-    break;
-  case GPURuntime::OpenCL:
-    Name = "polly_initContextCL";
-    break;
-  }
-
+  const char *Name = "polly_initContext";
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   Function *F = M->getFunction(Name);
 
@@ -1045,15 +1028,7 @@ void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr,
 
 void GPUNodeBuilder::createKernelSync() {
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
-
-  Function *Sync;
-
-  switch (Arch) {
-  case GPUArch::NVPTX64:
-    Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
-    break;
-  }
-
+  auto *Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
   Builder.CreateCall(Sync, {});
 }
 
@@ -1459,12 +1434,7 @@ GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
   auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
   auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
                               GPUModule.get());
-
-  switch (Arch) {
-  case GPUArch::NVPTX64:
-    FN->setCallingConv(CallingConv::PTX_Kernel);
-    break;
-  }
+  FN->setCallingConv(CallingConv::PTX_Kernel);
 
   auto Arg = FN->arg_begin();
   for (long i = 0; i < Kernel->n_array; i++) {
@@ -1525,19 +1495,12 @@ GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
 }
 
 void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) {
-  Intrinsic::ID IntrinsicsBID[2];
-  Intrinsic::ID IntrinsicsTID[3];
+  Intrinsic::ID IntrinsicsBID[] = {Intrinsic::nvvm_read_ptx_sreg_ctaid_x,
+                                   Intrinsic::nvvm_read_ptx_sreg_ctaid_y};
 
-  switch (Arch) {
-  case GPUArch::NVPTX64:
-    IntrinsicsBID[0] = Intrinsic::nvvm_read_ptx_sreg_ctaid_x;
-    IntrinsicsBID[1] = Intrinsic::nvvm_read_ptx_sreg_ctaid_y;
-
-    IntrinsicsTID[0] = Intrinsic::nvvm_read_ptx_sreg_tid_x;
-    IntrinsicsTID[1] = Intrinsic::nvvm_read_ptx_sreg_tid_y;
-    IntrinsicsTID[2] = Intrinsic::nvvm_read_ptx_sreg_tid_z;
-    break;
-  }
+  Intrinsic::ID IntrinsicsTID[] = {Intrinsic::nvvm_read_ptx_sreg_tid_x,
+                                   Intrinsic::nvvm_read_ptx_sreg_tid_y,
+                                   Intrinsic::nvvm_read_ptx_sreg_tid_z};
 
   auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable {
     std::string Name = isl_id_get_name(Id);
@@ -1686,18 +1649,11 @@ void GPUNodeBuilder::createKernelVariables(ppcg_kernel *Kernel, Function *FN) {
 
 void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel,
                                           SetVector<Value *> &SubtreeValues) {
 
   std::string Identifier = "kernel_" + std::to_string(Kernel->id);
   GPUModule.reset(new Module(Identifier, Builder.getContext()));
-
-  switch (Arch) {
-  case GPUArch::NVPTX64:
-    if (Runtime == GPURuntime::CUDA)
-      GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
-    else if (Runtime == GPURuntime::OpenCL)
-      GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl"));
-    GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
-    break;
-  }
+  GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
+  GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
 
   Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues);
 
@@ -1718,21 +1674,7 @@ void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel,
 }
 
 std::string GPUNodeBuilder::createKernelASM() {
-  llvm::Triple GPUTriple;
-
-  switch (Arch) {
-  case GPUArch::NVPTX64:
-    switch (Runtime) {
-    case GPURuntime::CUDA:
-      GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-cuda"));
-      break;
-    case GPURuntime::OpenCL:
-      GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-nvcl"));
-      break;
-    }
-    break;
-  }
-
+  llvm::Triple GPUTriple(Triple::normalize("nvptx64-nvidia-cuda"));
   std::string ErrMsg;
   auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg);
 
@@ -1743,17 +1685,9 @@ std::string GPUNodeBuilder::createKernelASM() {
 
   TargetOptions Options;
   Options.UnsafeFPMath = FastMath;
-
-  std::string subtarget;
-
-  switch (Arch) {
-  case GPUArch::NVPTX64:
-    subtarget = CudaVersion;
-    break;
-  }
-
-  std::unique_ptr<TargetMachine> TargetM(GPUTarget->createTargetMachine(
-      GPUTriple.getTriple(), subtarget, "", Options, Optional<Reloc::Model>()));
+  std::unique_ptr<TargetMachine> TargetM(
+      GPUTarget->createTargetMachine(GPUTriple.getTriple(), CudaVersion, "",
+                                     Options, Optional<Reloc::Model>()));
 
   SmallString<0> ASMString;
   raw_svector_ostream ASMStream(ASMString);
@@ -1805,10 +1739,6 @@ class PPCGCodeGeneration : public ScopPass {
 public:
   static char ID;
 
-  GPURuntime Runtime = GPURuntime::CUDA;
-
-  GPUArch Architecture = GPUArch::NVPTX64;
-
   /// The scop that is currently processed.
   Scop *S;
 
@@ -2592,7 +2522,7 @@ public:
     executeScopConditionally(*S, Builder.getTrue(), *DT, *RI, *LI);
 
     GPUNodeBuilder NodeBuilder(Builder, Annotator, *DL, *LI, *SE, *DT, *S,
-                               StartBlock, Prog, Runtime, Architecture);
+                               StartBlock, Prog);
 
     // TODO: Handle LICM
     auto SplitBlock = StartBlock->getSinglePredecessor();
@@ -2680,12 +2610,7 @@ public:
 
 char PPCGCodeGeneration::ID = 1;
 
-Pass *polly::createPPCGCodeGenerationPass(GPUArch Arch, GPURuntime Runtime) {
-  PPCGCodeGeneration *generator = new PPCGCodeGeneration();
-  generator->Runtime = Runtime;
-  generator->Architecture = Arch;
-  return generator;
-}
+Pass *polly::createPPCGCodeGenerationPass() { return new PPCGCodeGeneration(); }
 
 INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg",
                       "Polly - Apply PPCG translation to SCOP", false, false)
@@ -23,7 +23,6 @@
 #include "polly/Canonicalization.h"
 #include "polly/CodeGen/CodeGeneration.h"
 #include "polly/CodeGen/CodegenCleanup.h"
-#include "polly/CodeGen/PPCGCodeGeneration.h"
 #include "polly/DeLICM.h"
 #include "polly/DependenceInfo.h"
 #include "polly/FlattenSchedule.h"
@@ -102,23 +101,6 @@ static cl::opt<TargetChoice>
                           ),
            cl::init(TARGET_CPU), cl::ZeroOrMore, cl::cat(PollyCategory));
 
-#ifdef GPU_CODEGEN
-static cl::opt<GPURuntime> GPURuntimeChoice(
-    "polly-gpu-runtime", cl::desc("The GPU Runtime API to target"),
-    cl::values(clEnumValN(GPURuntime::CUDA, "libcudart",
-                          "use the CUDA Runtime API"),
-               clEnumValN(GPURuntime::OpenCL, "libopencl",
-                          "use the OpenCL Runtime API")),
-    cl::init(GPURuntime::CUDA), cl::ZeroOrMore, cl::cat(PollyCategory));
-
-static cl::opt<GPUArch>
-    GPUArchChoice("polly-gpu-arch", cl::desc("The GPU Architecture to target"),
-                  cl::values(clEnumValN(GPUArch::NVPTX64, "nvptx64",
-                                        "target NVIDIA 64-bit architecture")),
-                  cl::init(GPUArch::NVPTX64), cl::ZeroOrMore,
-                  cl::cat(PollyCategory));
-#endif
-
 VectorizerChoice polly::PollyVectorizerChoice;
 static cl::opt<polly::VectorizerChoice, true> Vectorizer(
     "polly-vectorizer", cl::desc("Select the vectorization strategy"),
@@ -327,8 +309,7 @@ void registerPollyPasses(llvm::legacy::PassManagerBase &PM) {
 
   if (Target == TARGET_GPU) {
 #ifdef GPU_CODEGEN
-    PM.add(
-        polly::createPPCGCodeGenerationPass(GPUArchChoice, GPURuntimeChoice));
+    PM.add(polly::createPPCGCodeGenerationPass());
 #endif
   } else {
     switch (CodeGeneration) {
@@ -35,7 +35,7 @@
 ; CHECK-NOT: polly_freeDeviceMemory
 ; CHECK-NOT: polly_allocateMemoryForDevice
 
-; CHECK: %13 = call i8* @polly_initContextCUDA()
+; CHECK: %13 = call i8* @polly_initContext()
 ; CHECK-NEXT: %14 = bitcast i32* %A to i8*
 ; CHECK-NEXT: %15 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
 ; CHECK-NEXT: store i8* %14, i8** %polly_launch_0_param_0
@@ -46,7 +46,7 @@
 ; CHECK-NEXT: store i8* %17, i8** %polly_launch_0_param_1
 ; CHECK-NEXT: %19 = bitcast i8** %polly_launch_0_param_1 to i8*
 ; CHECK-NEXT: store i8* %19, i8** %18
-; CHECK-NEXT: %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([750 x i8], [750 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0))
+; CHECK-NEXT: %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([750 x i8], [750 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0))
 ; CHECK-NEXT: call void @polly_launchKernel(i8* %20, i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
 ; CHECK-NEXT: call void @polly_freeKernel(i8* %20)
 ; CHECK-NEXT: call void @polly_synchronizeDevice()
@@ -29,7 +29,7 @@
 ; CODE-NEXT: if (arg >= 32 * b0 + t0 + 1048576 * c0 + 1)
 ; CODE-NEXT: Stmt_bb6(0, 32 * b0 + t0 + 1048576 * c0);
 
-; IR: call i8* @polly_initContextCUDA()
+; IR: call i8* @polly_initContext()
 ; IR-NEXT: sext i32 %arg to i64
 ; IR-NEXT: mul i64
 ; IR-NEXT: @polly_allocateMemoryForDevice
@@ -1,5 +1,5 @@
-if (CUDALIB_FOUND OR OpenCL_FOUND)
+if (CUDALIB_FOUND)
   add_subdirectory(GPURuntime)
-endif (CUDALIB_FOUND OR OpenCL_FOUND)
+endif (CUDALIB_FOUND)
 
 set(LLVM_COMMON_DEPENDS ${LLVM_COMMON_DEPENDS} PARENT_SCOPE)
(File diff suppressed because it is too large.)
@@ -76,27 +76,12 @@
  *
  */
 
-typedef enum PollyGPURuntimeT {
-  RUNTIME_NONE,
-  RUNTIME_CUDA,
-  RUNTIME_CL
-} PollyGPURuntime;
-
 typedef struct PollyGPUContextT PollyGPUContext;
 typedef struct PollyGPUFunctionT PollyGPUFunction;
 typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr;
 
-typedef struct OpenCLContextT OpenCLContext;
-typedef struct OpenCLKernelT OpenCLKernel;
-typedef struct OpenCLDevicePtrT OpenCLDevicePtr;
-
-typedef struct CUDAContextT CUDAContext;
-typedef struct CUDAKernelT CUDAKernel;
-typedef struct CUDADevicePtrT CUDADevicePtr;
-
-PollyGPUContext *polly_initContextCUDA();
-PollyGPUContext *polly_initContextCL();
-PollyGPUFunction *polly_getKernel(const char *BinaryBuffer,
+PollyGPUContext *polly_initContext();
+PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
                                   const char *KernelName);
 void polly_freeKernel(PollyGPUFunction *Kernel);
 void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,