diff --git a/polly/CMakeLists.txt b/polly/CMakeLists.txt index 8a2724ee32f2..c3e066024b90 100644 --- a/polly/CMakeLists.txt +++ b/polly/CMakeLists.txt @@ -152,10 +152,9 @@ SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) option(POLLY_ENABLE_GPGPU_CODEGEN "Enable GPGPU code generation feature" OFF) if (POLLY_ENABLE_GPGPU_CODEGEN) - # Do not require CUDA/OpenCL, as GPU code generation test cases can be run - # without a CUDA/OpenCL library. + # Do not require CUDA, as GPU code generation test cases can be run without + # a cuda library. FIND_PACKAGE(CUDA) - FIND_PACKAGE(OpenCL) set(GPU_CODEGEN TRUE) else(POLLY_ENABLE_GPGPU_CODEGEN) set(GPU_CODEGEN FALSE) @@ -164,13 +163,8 @@ endif(POLLY_ENABLE_GPGPU_CODEGEN) # Support GPGPU code generation if the library is available. if (CUDALIB_FOUND) - add_definitions(-DHAS_LIBCUDART) INCLUDE_DIRECTORIES( ${CUDALIB_INCLUDE_DIR} ) endif(CUDALIB_FOUND) -if (OpenCL_FOUND) - add_definitions(-DHAS_LIBOPENCL) - INCLUDE_DIRECTORIES( ${OpenCL_INCLUDE_DIR} ) -endif(OpenCL_FOUND) option(POLLY_BUNDLED_ISL "Use the bundled version of libisl included in Polly" ON) if (NOT POLLY_BUNDLED_ISL) diff --git a/polly/include/polly/CodeGen/PPCGCodeGeneration.h b/polly/include/polly/CodeGen/PPCGCodeGeneration.h deleted file mode 100644 index b498326dedf4..000000000000 --- a/polly/include/polly/CodeGen/PPCGCodeGeneration.h +++ /dev/null @@ -1,24 +0,0 @@ -//===--- polly/PPCGCodeGeneration.h - Polly Accelerator Code Generation. --===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Take a scop created by ScopInfo and map it to GPU code using the ppcg -// GPU mapping strategy. -// -//===----------------------------------------------------------------------===// - -#ifndef POLLY_PPCGCODEGENERATION_H -#define POLLY_PPCGCODEGENERATION_H - -/// The GPU Architecture to target. -enum GPUArch { NVPTX64 }; - -/// The GPU Runtime implementation to use. -enum GPURuntime { CUDA, OpenCL }; - -#endif // POLLY_PPCGCODEGENERATION_H diff --git a/polly/include/polly/LinkAllPasses.h b/polly/include/polly/LinkAllPasses.h index 8b6e188af9e3..9d42e4c84edb 100644 --- a/polly/include/polly/LinkAllPasses.h +++ b/polly/include/polly/LinkAllPasses.h @@ -15,7 +15,6 @@ #ifndef POLLY_LINKALLPASSES_H #define POLLY_LINKALLPASSES_H -#include "polly/CodeGen/PPCGCodeGeneration.h" #include "polly/Config/config.h" #include "polly/PruneUnprofitable.h" #include "polly/Simplify.h" @@ -49,8 +48,7 @@ llvm::Pass *createScopInfoWrapperPassPass(); llvm::Pass *createIslAstInfoPass(); llvm::Pass *createCodeGenerationPass(); #ifdef GPU_CODEGEN -llvm::Pass *createPPCGCodeGenerationPass(GPUArch Arch = GPUArch::NVPTX64, - GPURuntime Runtime = GPURuntime::CUDA); +llvm::Pass *createPPCGCodeGenerationPass(); #endif llvm::Pass *createIslScheduleOptimizerPass(); llvm::Pass *createFlattenSchedulePass(); diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp index 45e570c90b5f..7d3d42ab2a83 100644 --- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp +++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#include "polly/CodeGen/PPCGCodeGeneration.h" #include "polly/CodeGen/IslAst.h" #include "polly/CodeGen/IslNodeBuilder.h" #include "polly/CodeGen/Utils.h" @@ -154,9 +153,9 @@ public: GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator, const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE, DominatorTree &DT, Scop &S, BasicBlock *StartBlock, - gpu_prog *Prog, GPURuntime Runtime, GPUArch Arch) + gpu_prog *Prog) : IslNodeBuilder(Builder, Annotator, DL, LI, SE, DT, S, StartBlock), - Prog(Prog), Runtime(Runtime), Arch(Arch) { + Prog(Prog) { getExprBuilder().setIDToSAI(&IDToSAI); } @@ -202,12 +201,6 @@ private: /// The GPU program we generate code for. gpu_prog *Prog; - /// The GPU Runtime implementation to use (OpenCL or CUDA). - GPURuntime Runtime; - - /// The GPU Architecture to target. - GPUArch Arch; - /// Class to free isl_ids. class IslIdDeleter { public: @@ -759,17 +752,7 @@ void GPUNodeBuilder::createCallSynchronizeDevice() { } Value *GPUNodeBuilder::createCallInitContext() { - const char *Name; - - switch (Runtime) { - case GPURuntime::CUDA: - Name = "polly_initContextCUDA"; - break; - case GPURuntime::OpenCL: - Name = "polly_initContextCL"; - break; - } - + const char *Name = "polly_initContext"; Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Function *F = M->getFunction(Name); @@ -1045,15 +1028,7 @@ void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr, void GPUNodeBuilder::createKernelSync() { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - - Function *Sync; - - switch (Arch) { - case GPUArch::NVPTX64: - Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0); - break; - } - + auto *Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0); Builder.CreateCall(Sync, {}); } @@ -1459,12 +1434,7 @@ GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel, auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false); auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier, GPUModule.get()); - - switch (Arch) { - case GPUArch::NVPTX64: - FN->setCallingConv(CallingConv::PTX_Kernel); - break; - } + FN->setCallingConv(CallingConv::PTX_Kernel); auto Arg = FN->arg_begin(); for (long i = 0; i < Kernel->n_array; i++) { @@ -1525,19 +1495,12 @@ GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel, } void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) { - Intrinsic::ID IntrinsicsBID[2]; - Intrinsic::ID IntrinsicsTID[3]; + Intrinsic::ID IntrinsicsBID[] = {Intrinsic::nvvm_read_ptx_sreg_ctaid_x, + Intrinsic::nvvm_read_ptx_sreg_ctaid_y}; - switch (Arch) { - case GPUArch::NVPTX64: - IntrinsicsBID[0] = Intrinsic::nvvm_read_ptx_sreg_ctaid_x; - IntrinsicsBID[1] = Intrinsic::nvvm_read_ptx_sreg_ctaid_y; - - IntrinsicsTID[0] = Intrinsic::nvvm_read_ptx_sreg_tid_x; - IntrinsicsTID[1] = Intrinsic::nvvm_read_ptx_sreg_tid_y; - IntrinsicsTID[2] = Intrinsic::nvvm_read_ptx_sreg_tid_z; - break; - } + Intrinsic::ID IntrinsicsTID[] = {Intrinsic::nvvm_read_ptx_sreg_tid_x, + Intrinsic::nvvm_read_ptx_sreg_tid_y, + Intrinsic::nvvm_read_ptx_sreg_tid_z}; auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable { std::string Name = isl_id_get_name(Id); @@ -1686,18 +1649,11 @@ void GPUNodeBuilder::createKernelVariables(ppcg_kernel *Kernel, Function *FN) { void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel, SetVector &SubtreeValues) { + std::string Identifier = "kernel_" + std::to_string(Kernel->id); GPUModule.reset(new Module(Identifier, Builder.getContext())); - - switch (Arch) { - case GPUArch::NVPTX64: - if (Runtime == GPURuntime::CUDA) - GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda")); - else if (Runtime == GPURuntime::OpenCL) - GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl")); - GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */)); - break; - } + GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda")); + GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */)); Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues); @@ -1718,21 +1674,7 @@ void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel, } std::string GPUNodeBuilder::createKernelASM() { - llvm::Triple GPUTriple; - - switch (Arch) { - case GPUArch::NVPTX64: - switch (Runtime) { - case GPURuntime::CUDA: - GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-cuda")); - break; - case GPURuntime::OpenCL: - GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-nvcl")); - break; - } - break; - } - + llvm::Triple GPUTriple(Triple::normalize("nvptx64-nvidia-cuda")); std::string ErrMsg; auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg); @@ -1743,17 +1685,9 @@ std::string GPUNodeBuilder::createKernelASM() { TargetOptions Options; Options.UnsafeFPMath = FastMath; - - std::string subtarget; - - switch (Arch) { - case GPUArch::NVPTX64: - subtarget = CudaVersion; - break; - } - - std::unique_ptr TargetM(GPUTarget->createTargetMachine( - GPUTriple.getTriple(), subtarget, "", Options, Optional())); + std::unique_ptr TargetM( + GPUTarget->createTargetMachine(GPUTriple.getTriple(), CudaVersion, "", + Options, Optional())); SmallString<0> ASMString; raw_svector_ostream ASMStream(ASMString); @@ -1805,10 +1739,6 @@ class PPCGCodeGeneration : public ScopPass { public: static char ID; - GPURuntime Runtime = GPURuntime::CUDA; - - GPUArch Architecture = GPUArch::NVPTX64; - /// The scop that is currently processed. Scop *S; @@ -2592,7 +2522,7 @@ public: executeScopConditionally(*S, Builder.getTrue(), *DT, *RI, *LI); GPUNodeBuilder NodeBuilder(Builder, Annotator, *DL, *LI, *SE, *DT, *S, - StartBlock, Prog, Runtime, Architecture); + StartBlock, Prog); // TODO: Handle LICM auto SplitBlock = StartBlock->getSinglePredecessor(); @@ -2680,12 +2610,7 @@ public: char PPCGCodeGeneration::ID = 1; -Pass *polly::createPPCGCodeGenerationPass(GPUArch Arch, GPURuntime Runtime) { - PPCGCodeGeneration *generator = new PPCGCodeGeneration(); - generator->Runtime = Runtime; - generator->Architecture = Arch; - return generator; -} +Pass *polly::createPPCGCodeGenerationPass() { return new PPCGCodeGeneration(); } INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg", "Polly - Apply PPCG translation to SCOP", false, false) diff --git a/polly/lib/Support/RegisterPasses.cpp b/polly/lib/Support/RegisterPasses.cpp index 6188b8cdb604..9c8eac03f71a 100644 --- a/polly/lib/Support/RegisterPasses.cpp +++ b/polly/lib/Support/RegisterPasses.cpp @@ -23,7 +23,6 @@ #include "polly/Canonicalization.h" #include "polly/CodeGen/CodeGeneration.h" #include "polly/CodeGen/CodegenCleanup.h" -#include "polly/CodeGen/PPCGCodeGeneration.h" #include "polly/DeLICM.h" #include "polly/DependenceInfo.h" #include "polly/FlattenSchedule.h" @@ -102,23 +101,6 @@ static cl::opt ), cl::init(TARGET_CPU), cl::ZeroOrMore, cl::cat(PollyCategory)); -#ifdef GPU_CODEGEN -static cl::opt GPURuntimeChoice( - "polly-gpu-runtime", cl::desc("The GPU Runtime API to target"), - cl::values(clEnumValN(GPURuntime::CUDA, "libcudart", - "use the CUDA Runtime API"), - clEnumValN(GPURuntime::OpenCL, "libopencl", - "use the OpenCL Runtime API")), - cl::init(GPURuntime::CUDA), cl::ZeroOrMore, cl::cat(PollyCategory)); - -static cl::opt - GPUArchChoice("polly-gpu-arch", cl::desc("The GPU Architecture to target"), - cl::values(clEnumValN(GPUArch::NVPTX64, "nvptx64", - "target NVIDIA 64-bit architecture")), - cl::init(GPUArch::NVPTX64), cl::ZeroOrMore, - cl::cat(PollyCategory)); -#endif - VectorizerChoice polly::PollyVectorizerChoice; static cl::opt Vectorizer( "polly-vectorizer", cl::desc("Select the vectorization strategy"), @@ -327,8 +309,7 @@ void registerPollyPasses(llvm::legacy::PassManagerBase &PM) { if (Target == TARGET_GPU) { #ifdef GPU_CODEGEN - PM.add( - polly::createPPCGCodeGenerationPass(GPUArchChoice, GPURuntimeChoice)); + PM.add(polly::createPPCGCodeGenerationPass()); #endif } else { switch (CodeGeneration) { diff --git a/polly/test/GPGPU/cuda-managed-memory-simple.ll b/polly/test/GPGPU/cuda-managed-memory-simple.ll index 4a97ec56ad5d..0f5ece145253 100644 --- a/polly/test/GPGPU/cuda-managed-memory-simple.ll +++ b/polly/test/GPGPU/cuda-managed-memory-simple.ll @@ -35,7 +35,7 @@ ; CHECK-NOT: polly_freeDeviceMemory ; CHECK-NOT: polly_allocateMemoryForDevice -; CHECK: %13 = call i8* @polly_initContextCUDA() +; CHECK: %13 = call i8* @polly_initContext() ; CHECK-NEXT: %14 = bitcast i32* %A to i8* ; CHECK-NEXT: %15 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0 ; CHECK-NEXT: store i8* %14, i8** %polly_launch_0_param_0 @@ -46,7 +46,7 @@ ; CHECK-NEXT: store i8* %17, i8** %polly_launch_0_param_1 ; CHECK-NEXT: %19 = bitcast i8** %polly_launch_0_param_1 to i8* ; CHECK-NEXT: store i8* %19, i8** %18 -; CHECK-NEXT: %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([750 x i8], [750 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0)) +; CHECK-NEXT: %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([750 x i8], [750 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0)) ; CHECK-NEXT: call void @polly_launchKernel(i8* %20, i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr) ; CHECK-NEXT: call void @polly_freeKernel(i8* %20) ; CHECK-NEXT: call void @polly_synchronizeDevice() diff --git a/polly/test/GPGPU/size-cast.ll b/polly/test/GPGPU/size-cast.ll index 59caf1260ba1..9cb5df46d239 100644 --- a/polly/test/GPGPU/size-cast.ll +++ b/polly/test/GPGPU/size-cast.ll @@ -29,7 +29,7 @@ ; CODE-NEXT: if (arg >= 32 * b0 + t0 + 1048576 * c0 + 1) ; CODE-NEXT: Stmt_bb6(0, 32 * b0 + t0 + 1048576 * c0); -; IR: call i8* @polly_initContextCUDA() +; IR: call i8* @polly_initContext() ; IR-NEXT: sext i32 %arg to i64 ; IR-NEXT: mul i64 ; IR-NEXT: @polly_allocateMemoryForDevice diff --git a/polly/tools/CMakeLists.txt b/polly/tools/CMakeLists.txt index 8e5ce59cb90e..4ce60e1a3e81 100644 --- a/polly/tools/CMakeLists.txt +++ b/polly/tools/CMakeLists.txt @@ -1,5 +1,5 @@ -if (CUDALIB_FOUND OR OpenCL_FOUND) +if (CUDALIB_FOUND) add_subdirectory(GPURuntime) -endif (CUDALIB_FOUND OR OpenCL_FOUND) +endif (CUDALIB_FOUND) set(LLVM_COMMON_DEPENDS ${LLVM_COMMON_DEPENDS} PARENT_SCOPE) diff --git a/polly/tools/GPURuntime/GPUJIT.c b/polly/tools/GPURuntime/GPUJIT.c index 80f4c430903e..457a7477d62a 100644 --- a/polly/tools/GPURuntime/GPUJIT.c +++ b/polly/tools/GPURuntime/GPUJIT.c @@ -12,20 +12,8 @@ /******************************************************************************/ #include "GPUJIT.h" - -#ifdef HAS_LIBCUDART #include #include -#endif /* HAS_LIBCUDART */ - -#ifdef HAS_LIBOPENCL -#ifdef __APPLE__ -#include -#else -#include -#endif -#endif /* HAS_LIBOPENCL */ - #include #include #include @@ -34,8 +22,6 @@ static int DebugMode; static int CacheMode; -static PollyGPURuntime Runtime = RUNTIME_NONE; - static void debug_print(const char *format, ...) { if (!DebugMode) return; @@ -47,853 +33,18 @@ static void debug_print(const char *format, ...) { } #define dump_function() debug_print("-> %s\n", __func__) -#define KERNEL_CACHE_SIZE 10 - -static void err_runtime() { - fprintf(stderr, "Runtime not correctly initialized.\n"); - exit(-1); -} - +/* Define Polly's GPGPU data types. */ struct PollyGPUContextT { - void *Context; -}; - -struct PollyGPUFunctionT { - void *Kernel; -}; - -struct PollyGPUDevicePtrT { - void *DevicePtr; -}; - -/******************************************************************************/ -/* OpenCL */ -/******************************************************************************/ -#ifdef HAS_LIBOPENCL - -struct OpenCLContextT { - cl_context Context; - cl_command_queue CommandQueue; -}; - -struct OpenCLKernelT { - cl_kernel Kernel; - cl_program Program; - const char *BinaryString; -}; - -struct OpenCLDevicePtrT { - cl_mem MemObj; -}; - -/* Dynamic library handles for the OpenCL runtime library. */ -static void *HandleOpenCL; - -/* Type-defines of function pointer to OpenCL Runtime API. */ -typedef cl_int clGetPlatformIDsFcnTy(cl_uint NumEntries, - cl_platform_id *Platforms, - cl_uint *NumPlatforms); -static clGetPlatformIDsFcnTy *clGetPlatformIDsFcnPtr; - -typedef cl_int clGetDeviceIDsFcnTy(cl_platform_id Platform, - cl_device_type DeviceType, - cl_uint NumEntries, cl_device_id *Devices, - cl_uint *NumDevices); -static clGetDeviceIDsFcnTy *clGetDeviceIDsFcnPtr; - -typedef cl_int clGetDeviceInfoFcnTy(cl_device_id Device, - cl_device_info ParamName, - size_t ParamValueSize, void *ParamValue, - size_t *ParamValueSizeRet); -static clGetDeviceInfoFcnTy *clGetDeviceInfoFcnPtr; - -typedef cl_int clGetKernelInfoFcnTy(cl_kernel Kernel, cl_kernel_info ParamName, - size_t ParamValueSize, void *ParamValue, - size_t *ParamValueSizeRet); -static clGetKernelInfoFcnTy *clGetKernelInfoFcnPtr; - -typedef cl_context clCreateContextFcnTy( - const cl_context_properties *Properties, cl_uint NumDevices, - const cl_device_id *Devices, - void CL_CALLBACK *pfn_notify(const char *Errinfo, const void *PrivateInfo, - size_t CB, void *UserData), - void *UserData, cl_int *ErrcodeRet); -static clCreateContextFcnTy *clCreateContextFcnPtr; - -typedef cl_command_queue -clCreateCommandQueueFcnTy(cl_context Context, cl_device_id Device, - cl_command_queue_properties Properties, - cl_int *ErrcodeRet); -static clCreateCommandQueueFcnTy *clCreateCommandQueueFcnPtr; - -typedef cl_mem clCreateBufferFcnTy(cl_context Context, cl_mem_flags Flags, - size_t Size, void *HostPtr, - cl_int *ErrcodeRet); -static clCreateBufferFcnTy *clCreateBufferFcnPtr; - -typedef cl_int -clEnqueueWriteBufferFcnTy(cl_command_queue CommandQueue, cl_mem Buffer, - cl_bool BlockingWrite, size_t Offset, size_t Size, - const void *Ptr, cl_uint NumEventsInWaitList, - const cl_event *EventWaitList, cl_event *Event); -static clEnqueueWriteBufferFcnTy *clEnqueueWriteBufferFcnPtr; - -typedef cl_program clCreateProgramWithBinaryFcnTy( - cl_context Context, cl_uint NumDevices, const cl_device_id *DeviceList, - const size_t *Lengths, const unsigned char **Binaries, cl_int *BinaryStatus, - cl_int *ErrcodeRet); -static clCreateProgramWithBinaryFcnTy *clCreateProgramWithBinaryFcnPtr; - -typedef cl_int clBuildProgramFcnTy( - cl_program Program, cl_uint NumDevices, const cl_device_id *DeviceList, - const char *Options, - void(CL_CALLBACK *pfn_notify)(cl_program Program, void *UserData), - void *UserData); -static clBuildProgramFcnTy *clBuildProgramFcnPtr; - -typedef cl_kernel clCreateKernelFcnTy(cl_program Program, - const char *KernelName, - cl_int *ErrcodeRet); -static clCreateKernelFcnTy *clCreateKernelFcnPtr; - -typedef cl_int clSetKernelArgFcnTy(cl_kernel Kernel, cl_uint ArgIndex, - size_t ArgSize, const void *ArgValue); -static clSetKernelArgFcnTy *clSetKernelArgFcnPtr; - -typedef cl_int clEnqueueNDRangeKernelFcnTy( - cl_command_queue CommandQueue, cl_kernel Kernel, cl_uint WorkDim, - const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, - const size_t *LocalWorkSize, cl_uint NumEventsInWaitList, - const cl_event *EventWaitList, cl_event *Event); -static clEnqueueNDRangeKernelFcnTy *clEnqueueNDRangeKernelFcnPtr; - -typedef cl_int clEnqueueReadBufferFcnTy(cl_command_queue CommandQueue, - cl_mem Buffer, cl_bool BlockingRead, - size_t Offset, size_t Size, void *Ptr, - cl_uint NumEventsInWaitList, - const cl_event *EventWaitList, - cl_event *Event); -static clEnqueueReadBufferFcnTy *clEnqueueReadBufferFcnPtr; - -typedef cl_int clFlushFcnTy(cl_command_queue CommandQueue); -static clFlushFcnTy *clFlushFcnPtr; - -typedef cl_int clFinishFcnTy(cl_command_queue CommandQueue); -static clFinishFcnTy *clFinishFcnPtr; - -typedef cl_int clReleaseKernelFcnTy(cl_kernel Kernel); -static clReleaseKernelFcnTy *clReleaseKernelFcnPtr; - -typedef cl_int clReleaseProgramFcnTy(cl_program Program); -static clReleaseProgramFcnTy *clReleaseProgramFcnPtr; - -typedef cl_int clReleaseMemObjectFcnTy(cl_mem Memobject); -static clReleaseMemObjectFcnTy *clReleaseMemObjectFcnPtr; - -typedef cl_int clReleaseCommandQueueFcnTy(cl_command_queue CommandQueue); -static clReleaseCommandQueueFcnTy *clReleaseCommandQueueFcnPtr; - -typedef cl_int clReleaseContextFcnTy(cl_context Context); -static clReleaseContextFcnTy *clReleaseContextFcnPtr; - -static void *getAPIHandleCL(void *Handle, const char *FuncName) { - char *Err; - void *FuncPtr; - dlerror(); - FuncPtr = dlsym(Handle, FuncName); - if ((Err = dlerror()) != 0) { - fprintf(stderr, "Load OpenCL Runtime API failed: %s. \n", Err); - return 0; - } - return FuncPtr; -} - -static int initialDeviceAPILibrariesCL() { - HandleOpenCL = dlopen("libOpenCL.so", RTLD_LAZY); - if (!HandleOpenCL) { - fprintf(stderr, "Cannot open library: %s. \n", dlerror()); - return 0; - } - return 1; -} - -static int initialDeviceAPIsCL() { - if (initialDeviceAPILibrariesCL() == 0) - return 0; - - /* Get function pointer to OpenCL Runtime API. - * - * Note that compilers conforming to the ISO C standard are required to - * generate a warning if a conversion from a void * pointer to a function - * pointer is attempted as in the following statements. The warning - * of this kind of cast may not be emitted by clang and new versions of gcc - * as it is valid on POSIX 2008. - */ - clGetPlatformIDsFcnPtr = - (clGetPlatformIDsFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetPlatformIDs"); - - clGetDeviceIDsFcnPtr = - (clGetDeviceIDsFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetDeviceIDs"); - - clGetDeviceInfoFcnPtr = - (clGetDeviceInfoFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetDeviceInfo"); - - clGetKernelInfoFcnPtr = - (clGetKernelInfoFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetKernelInfo"); - - clCreateContextFcnPtr = - (clCreateContextFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateContext"); - - clCreateCommandQueueFcnPtr = (clCreateCommandQueueFcnTy *)getAPIHandleCL( - HandleOpenCL, "clCreateCommandQueue"); - - clCreateBufferFcnPtr = - (clCreateBufferFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateBuffer"); - - clEnqueueWriteBufferFcnPtr = (clEnqueueWriteBufferFcnTy *)getAPIHandleCL( - HandleOpenCL, "clEnqueueWriteBuffer"); - - clCreateProgramWithBinaryFcnPtr = - (clCreateProgramWithBinaryFcnTy *)getAPIHandleCL( - HandleOpenCL, "clCreateProgramWithBinary"); - - clBuildProgramFcnPtr = - (clBuildProgramFcnTy *)getAPIHandleCL(HandleOpenCL, "clBuildProgram"); - - clCreateKernelFcnPtr = - (clCreateKernelFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateKernel"); - - clSetKernelArgFcnPtr = - (clSetKernelArgFcnTy *)getAPIHandleCL(HandleOpenCL, "clSetKernelArg"); - - clEnqueueNDRangeKernelFcnPtr = (clEnqueueNDRangeKernelFcnTy *)getAPIHandleCL( - HandleOpenCL, "clEnqueueNDRangeKernel"); - - clEnqueueReadBufferFcnPtr = (clEnqueueReadBufferFcnTy *)getAPIHandleCL( - HandleOpenCL, "clEnqueueReadBuffer"); - - clFlushFcnPtr = (clFlushFcnTy *)getAPIHandleCL(HandleOpenCL, "clFlush"); - - clFinishFcnPtr = (clFinishFcnTy *)getAPIHandleCL(HandleOpenCL, "clFinish"); - - clReleaseKernelFcnPtr = - (clReleaseKernelFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseKernel"); - - clReleaseProgramFcnPtr = - (clReleaseProgramFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseProgram"); - - clReleaseMemObjectFcnPtr = (clReleaseMemObjectFcnTy *)getAPIHandleCL( - HandleOpenCL, "clReleaseMemObject"); - - clReleaseCommandQueueFcnPtr = (clReleaseCommandQueueFcnTy *)getAPIHandleCL( - HandleOpenCL, "clReleaseCommandQueue"); - - clReleaseContextFcnPtr = - (clReleaseContextFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseContext"); - - return 1; -} - -/* Context and Device. */ -static PollyGPUContext *GlobalContext = NULL; -static cl_device_id GlobalDeviceID = NULL; - -/* Fd-Decl: Print out OpenCL Error codes to human readable strings. */ -static void printOpenCLError(int Error); - -static void checkOpenCLError(int Ret, const char *format, ...) { - if (Ret == CL_SUCCESS) - return; - - printOpenCLError(Ret); - va_list args; - va_start(args, format); - vfprintf(stderr, format, args); - va_end(args); - exit(-1); -} - -static PollyGPUContext *initContextCL() { - dump_function(); - - PollyGPUContext *Context; - - cl_platform_id PlatformID = NULL; - cl_device_id DeviceID = NULL; - cl_uint NumDevicesRet; - cl_int Ret; - - char DeviceRevision[256]; - char DeviceName[256]; - size_t DeviceRevisionRetSize, DeviceNameRetSize; - - static __thread PollyGPUContext *CurrentContext = NULL; - - if (CurrentContext) - return CurrentContext; - - /* Get API handles. */ - if (initialDeviceAPIsCL() == 0) { - fprintf(stderr, "Getting the \"handle\" for the OpenCL Runtime failed.\n"); - exit(-1); - } - - /* Get number of devices that support OpenCL. */ - static const int NumberOfPlatforms = 1; - Ret = clGetPlatformIDsFcnPtr(NumberOfPlatforms, &PlatformID, NULL); - checkOpenCLError(Ret, "Failed to get platform IDs.\n"); - // TODO: Extend to CL_DEVICE_TYPE_ALL? - static const int NumberOfDevices = 1; - Ret = clGetDeviceIDsFcnPtr(PlatformID, CL_DEVICE_TYPE_GPU, NumberOfDevices, - &DeviceID, &NumDevicesRet); - checkOpenCLError(Ret, "Failed to get device IDs.\n"); - - GlobalDeviceID = DeviceID; - if (NumDevicesRet == 0) { - fprintf(stderr, "There is no device supporting OpenCL.\n"); - exit(-1); - } - - /* Get device revision. */ - Ret = - clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_VERSION, sizeof(DeviceRevision), - DeviceRevision, &DeviceRevisionRetSize); - checkOpenCLError(Ret, "Failed to fetch device revision.\n"); - - /* Get device name. */ - Ret = clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_NAME, sizeof(DeviceName), - DeviceName, &DeviceNameRetSize); - checkOpenCLError(Ret, "Failed to fetch device name.\n"); - - debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName); - - /* Create context on the device. */ - Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext)); - if (Context == 0) { - fprintf(stderr, "Allocate memory for Polly GPU context failed.\n"); - exit(-1); - } - Context->Context = (OpenCLContext *)malloc(sizeof(OpenCLContext)); - if (Context->Context == 0) { - fprintf(stderr, "Allocate memory for Polly OpenCL context failed.\n"); - exit(-1); - } - ((OpenCLContext *)Context->Context)->Context = - clCreateContextFcnPtr(NULL, NumDevicesRet, &DeviceID, NULL, NULL, &Ret); - checkOpenCLError(Ret, "Failed to create context.\n"); - - static const int ExtraProperties = 0; - ((OpenCLContext *)Context->Context)->CommandQueue = - clCreateCommandQueueFcnPtr(((OpenCLContext *)Context->Context)->Context, - DeviceID, ExtraProperties, &Ret); - checkOpenCLError(Ret, "Failed to create command queue.\n"); - - if (CacheMode) - CurrentContext = Context; - - GlobalContext = Context; - return Context; -} - -static void freeKernelCL(PollyGPUFunction *Kernel) { - dump_function(); - - if (CacheMode) - return; - - if (!GlobalContext) { - fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); - exit(-1); - } - - cl_int Ret; - Ret = clFlushFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue); - checkOpenCLError(Ret, "Failed to flush command queue.\n"); - Ret = clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue); - checkOpenCLError(Ret, "Failed to finish command queue.\n"); - - if (((OpenCLKernel *)Kernel->Kernel)->Kernel) { - cl_int Ret = - clReleaseKernelFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Kernel); - checkOpenCLError(Ret, "Failed to release kernel.\n"); - } - - if (((OpenCLKernel *)Kernel->Kernel)->Program) { - cl_int Ret = - clReleaseProgramFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Program); - checkOpenCLError(Ret, "Failed to release program.\n"); - } - - if (Kernel->Kernel) - free((OpenCLKernel *)Kernel->Kernel); - - if (Kernel) - free(Kernel); -} - -static PollyGPUFunction *getKernelCL(const char *BinaryBuffer, - const char *KernelName) { - dump_function(); - - if (!GlobalContext) { - fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); - exit(-1); - } - - static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE]; - static __thread int NextCacheItem = 0; - - for (long i = 0; i < KERNEL_CACHE_SIZE; i++) { - // We exploit here the property that all Polly-ACC kernels are allocated - // as global constants, hence a pointer comparision is sufficient to - // determin equality. - if (KernelCache[i] && - ((OpenCLKernel *)KernelCache[i]->Kernel)->BinaryString == - BinaryBuffer) { - debug_print(" -> using cached kernel\n"); - return KernelCache[i]; - } - } - - PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction)); - if (Function == 0) { - fprintf(stderr, "Allocate memory for Polly GPU function failed.\n"); - exit(-1); - } - Function->Kernel = (OpenCLKernel *)malloc(sizeof(OpenCLKernel)); - if (Function->Kernel == 0) { - fprintf(stderr, "Allocate memory for Polly OpenCL kernel failed.\n"); - exit(-1); - } - - if (!GlobalDeviceID) { - fprintf(stderr, "GPGPU-code generation not initialized correctly.\n"); - exit(-1); - } - - cl_int Ret; - size_t BinarySize = strlen(BinaryBuffer); - ((OpenCLKernel *)Function->Kernel)->Program = clCreateProgramWithBinaryFcnPtr( - ((OpenCLContext *)GlobalContext->Context)->Context, 1, &GlobalDeviceID, - (const size_t *)&BinarySize, (const unsigned char **)&BinaryBuffer, NULL, - &Ret); - checkOpenCLError(Ret, "Failed to create program from binary.\n"); - - Ret = clBuildProgramFcnPtr(((OpenCLKernel *)Function->Kernel)->Program, 1, - &GlobalDeviceID, NULL, NULL, NULL); - checkOpenCLError(Ret, "Failed to build program.\n"); - - ((OpenCLKernel *)Function->Kernel)->Kernel = clCreateKernelFcnPtr( - ((OpenCLKernel *)Function->Kernel)->Program, KernelName, &Ret); - checkOpenCLError(Ret, "Failed to create kernel.\n"); - - ((OpenCLKernel *)Function->Kernel)->BinaryString = BinaryBuffer; - - if (CacheMode) { - if (KernelCache[NextCacheItem]) - freeKernelCL(KernelCache[NextCacheItem]); - - KernelCache[NextCacheItem] = Function; - - NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE; - } - - return Function; -} - -static void copyFromHostToDeviceCL(void *HostData, PollyGPUDevicePtr *DevData, - long MemSize) { - dump_function(); - - if (!GlobalContext) { - fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); - exit(-1); - } - - cl_int Ret; - Ret = clEnqueueWriteBufferFcnPtr( - ((OpenCLContext *)GlobalContext->Context)->CommandQueue, - ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize, - HostData, 0, NULL, NULL); - checkOpenCLError(Ret, "Copying data from host memory to device failed.\n"); -} - -static void copyFromDeviceToHostCL(PollyGPUDevicePtr *DevData, void *HostData, - long MemSize) { - dump_function(); - - if (!GlobalContext) { - fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); - exit(-1); - } - - cl_int Ret; - Ret = clEnqueueReadBufferFcnPtr( - ((OpenCLContext *)GlobalContext->Context)->CommandQueue, - ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize, - HostData, 0, NULL, NULL); - checkOpenCLError(Ret, "Copying results from device to host memory failed.\n"); -} - -static void launchKernelCL(PollyGPUFunction *Kernel, unsigned int GridDimX, - unsigned int GridDimY, unsigned int BlockDimX, - unsigned int BlockDimY, unsigned int BlockDimZ, - void **Parameters) { - dump_function(); - - cl_int Ret; - cl_uint NumArgs; - - if (!GlobalContext) { - fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); - exit(-1); - } - - OpenCLKernel *CLKernel = (OpenCLKernel *)Kernel->Kernel; - Ret = clGetKernelInfoFcnPtr(CLKernel->Kernel, CL_KERNEL_NUM_ARGS, - sizeof(cl_uint), &NumArgs, NULL); - checkOpenCLError(Ret, "Failed to get number of kernel arguments.\n"); - - // TODO: Pass the size of the kernel arguments in to launchKernelCL, along - // with the arguments themselves. This is a dirty workaround that can be - // broken. - for (cl_uint i = 0; i < NumArgs; i++) { - Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 8, (void *)Parameters[i]); - if (Ret == CL_INVALID_ARG_SIZE) { - Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 4, (void *)Parameters[i]); - if (Ret == CL_INVALID_ARG_SIZE) { - Ret = - clSetKernelArgFcnPtr(CLKernel->Kernel, i, 2, (void *)Parameters[i]); - if (Ret == CL_INVALID_ARG_SIZE) { - Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 1, - (void *)Parameters[i]); - checkOpenCLError(Ret, "Failed to set Kernel argument %d.\n", i); - } - } - } - if (Ret != CL_SUCCESS && Ret != CL_INVALID_ARG_SIZE) { - fprintf(stderr, "Failed to set Kernel argument.\n"); - printOpenCLError(Ret); - exit(-1); - } - } - - unsigned int GridDimZ = 1; - size_t GlobalWorkSize[3] = {BlockDimX * GridDimX, BlockDimY * GridDimY, - BlockDimZ * GridDimZ}; - size_t LocalWorkSize[3] = {BlockDimX, BlockDimY, BlockDimZ}; - - static const int WorkDim = 3; - OpenCLContext *CLContext = (OpenCLContext *)GlobalContext->Context; - Ret = clEnqueueNDRangeKernelFcnPtr(CLContext->CommandQueue, CLKernel->Kernel, - WorkDim, NULL, GlobalWorkSize, - LocalWorkSize, 0, NULL, NULL); - checkOpenCLError(Ret, "Launching OpenCL kernel failed.\n"); -} - -static void freeDeviceMemoryCL(PollyGPUDevicePtr *Allocation) { - dump_function(); - - OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr; - cl_int Ret = clReleaseMemObjectFcnPtr((cl_mem)DevPtr->MemObj); - checkOpenCLError(Ret, "Failed to free device memory.\n"); - - free(DevPtr); - free(Allocation); -} - -static PollyGPUDevicePtr *allocateMemoryForDeviceCL(long MemSize) { - dump_function(); - - if (!GlobalContext) { - fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); - exit(-1); - } - - PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr)); - if (DevData == 0) { - fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n"); - exit(-1); - } - DevData->DevicePtr = (OpenCLDevicePtr *)malloc(sizeof(OpenCLDevicePtr)); - if (DevData->DevicePtr == 0) { - fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n"); - exit(-1); - } - - cl_int Ret; - ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj = - clCreateBufferFcnPtr(((OpenCLContext *)GlobalContext->Context)->Context, - CL_MEM_READ_WRITE, MemSize, NULL, &Ret); - checkOpenCLError(Ret, - "Allocate memory for GPU device memory pointer failed.\n"); - - return DevData; -} - -static void *getDevicePtrCL(PollyGPUDevicePtr *Allocation) { - dump_function(); - - OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr; - return (void *)DevPtr->MemObj; -} - -static void synchronizeDeviceCL() { - dump_function(); - - if (!GlobalContext) { - fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); - exit(-1); - } - - if (clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue) != - CL_SUCCESS) { - fprintf(stderr, "Synchronizing device and host memory failed.\n"); - exit(-1); - } -} - -static void freeContextCL(PollyGPUContext *Context) { - dump_function(); - - cl_int Ret; - - GlobalContext = NULL; - - OpenCLContext *Ctx = (OpenCLContext *)Context->Context; - if (Ctx->CommandQueue) { - Ret = clReleaseCommandQueueFcnPtr(Ctx->CommandQueue); - checkOpenCLError(Ret, "Could not release command queue.\n"); - } - - if (Ctx->Context) { - Ret = clReleaseContextFcnPtr(Ctx->Context); - checkOpenCLError(Ret, "Could not release context.\n"); - } - - free(Ctx); - free(Context); -} - -static void printOpenCLError(int Error) { - - switch (Error) { - case CL_SUCCESS: - // Success, don't print an error. - break; - - // JIT/Runtime errors. - case CL_DEVICE_NOT_FOUND: - fprintf(stderr, "Device not found.\n"); - break; - case CL_DEVICE_NOT_AVAILABLE: - fprintf(stderr, "Device not available.\n"); - break; - case CL_COMPILER_NOT_AVAILABLE: - fprintf(stderr, "Compiler not available.\n"); - break; - case CL_MEM_OBJECT_ALLOCATION_FAILURE: - fprintf(stderr, "Mem object allocation failure.\n"); - break; - case CL_OUT_OF_RESOURCES: - fprintf(stderr, "Out of resources.\n"); - break; - case CL_OUT_OF_HOST_MEMORY: - fprintf(stderr, "Out of host memory.\n"); - break; - case CL_PROFILING_INFO_NOT_AVAILABLE: - fprintf(stderr, "Profiling info not available.\n"); - break; - case CL_MEM_COPY_OVERLAP: - fprintf(stderr, "Mem copy overlap.\n"); - break; - case CL_IMAGE_FORMAT_MISMATCH: - fprintf(stderr, "Image format mismatch.\n"); - break; - case CL_IMAGE_FORMAT_NOT_SUPPORTED: - fprintf(stderr, "Image format not supported.\n"); - break; - case CL_BUILD_PROGRAM_FAILURE: - fprintf(stderr, "Build program failure.\n"); - break; - case CL_MAP_FAILURE: - fprintf(stderr, "Map failure.\n"); - break; - case CL_MISALIGNED_SUB_BUFFER_OFFSET: - fprintf(stderr, "Misaligned sub buffer offset.\n"); - break; - case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: - fprintf(stderr, "Exec status error for events in wait list.\n"); - break; - case CL_COMPILE_PROGRAM_FAILURE: - fprintf(stderr, "Compile program failure.\n"); - break; - case CL_LINKER_NOT_AVAILABLE: - fprintf(stderr, "Linker not available.\n"); - break; - case CL_LINK_PROGRAM_FAILURE: - fprintf(stderr, "Link program failure.\n"); - break; - case CL_DEVICE_PARTITION_FAILED: - fprintf(stderr, "Device partition failed.\n"); - break; - case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: - fprintf(stderr, "Kernel arg info not available.\n"); - break; - - // Compiler errors. - case CL_INVALID_VALUE: - fprintf(stderr, "Invalid value.\n"); - break; - case CL_INVALID_DEVICE_TYPE: - fprintf(stderr, "Invalid device type.\n"); - break; - case CL_INVALID_PLATFORM: - fprintf(stderr, "Invalid platform.\n"); - break; - case CL_INVALID_DEVICE: - fprintf(stderr, "Invalid device.\n"); - break; - case CL_INVALID_CONTEXT: - fprintf(stderr, "Invalid context.\n"); - break; - case CL_INVALID_QUEUE_PROPERTIES: - fprintf(stderr, "Invalid queue properties.\n"); - break; - case CL_INVALID_COMMAND_QUEUE: - fprintf(stderr, "Invalid command queue.\n"); - break; - case CL_INVALID_HOST_PTR: - fprintf(stderr, "Invalid host pointer.\n"); - break; - case CL_INVALID_MEM_OBJECT: - fprintf(stderr, "Invalid memory object.\n"); - break; - case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: - fprintf(stderr, "Invalid image format descriptor.\n"); - break; - case CL_INVALID_IMAGE_SIZE: - fprintf(stderr, "Invalid image size.\n"); - break; - case CL_INVALID_SAMPLER: - fprintf(stderr, "Invalid sampler.\n"); - break; - case CL_INVALID_BINARY: - fprintf(stderr, "Invalid binary.\n"); - break; - case CL_INVALID_BUILD_OPTIONS: - fprintf(stderr, "Invalid build options.\n"); - break; - case CL_INVALID_PROGRAM: - fprintf(stderr, "Invalid program.\n"); - break; - case CL_INVALID_PROGRAM_EXECUTABLE: - fprintf(stderr, "Invalid program executable.\n"); - break; - case CL_INVALID_KERNEL_NAME: - fprintf(stderr, "Invalid kernel name.\n"); - break; - case CL_INVALID_KERNEL_DEFINITION: - fprintf(stderr, "Invalid kernel definition.\n"); - break; - case CL_INVALID_KERNEL: - fprintf(stderr, "Invalid kernel.\n"); - break; - case CL_INVALID_ARG_INDEX: - fprintf(stderr, "Invalid arg index.\n"); - break; - case CL_INVALID_ARG_VALUE: - fprintf(stderr, "Invalid arg value.\n"); - break; - case CL_INVALID_ARG_SIZE: - fprintf(stderr, "Invalid arg size.\n"); - break; - case CL_INVALID_KERNEL_ARGS: - fprintf(stderr, "Invalid kernel args.\n"); - break; - case CL_INVALID_WORK_DIMENSION: - fprintf(stderr, "Invalid work dimension.\n"); - break; - case CL_INVALID_WORK_GROUP_SIZE: - fprintf(stderr, "Invalid work group size.\n"); - break; - case CL_INVALID_WORK_ITEM_SIZE: - fprintf(stderr, "Invalid work item size.\n"); - break; - case CL_INVALID_GLOBAL_OFFSET: - fprintf(stderr, "Invalid global offset.\n"); - break; - case CL_INVALID_EVENT_WAIT_LIST: - fprintf(stderr, "Invalid event wait list.\n"); - break; - case CL_INVALID_EVENT: - fprintf(stderr, "Invalid event.\n"); - break; - case CL_INVALID_OPERATION: - fprintf(stderr, "Invalid operation.\n"); - break; - case CL_INVALID_GL_OBJECT: - fprintf(stderr, "Invalid GL object.\n"); - break; - case CL_INVALID_BUFFER_SIZE: - fprintf(stderr, "Invalid buffer size.\n"); - break; - case CL_INVALID_MIP_LEVEL: - fprintf(stderr, "Invalid mip level.\n"); - break; - case CL_INVALID_GLOBAL_WORK_SIZE: - fprintf(stderr, "Invalid global work size.\n"); - break; - case CL_INVALID_PROPERTY: - fprintf(stderr, "Invalid property.\n"); - break; - case CL_INVALID_IMAGE_DESCRIPTOR: - fprintf(stderr, "Invalid image descriptor.\n"); - break; - case CL_INVALID_COMPILER_OPTIONS: - fprintf(stderr, "Invalid compiler options.\n"); - break; - case CL_INVALID_LINKER_OPTIONS: - fprintf(stderr, "Invalid linker options.\n"); - break; - case CL_INVALID_DEVICE_PARTITION_COUNT: - fprintf(stderr, "Invalid device partition count.\n"); - break; - case CL_INVALID_PIPE_SIZE: - fprintf(stderr, "Invalid pipe size.\n"); - break; - case CL_INVALID_DEVICE_QUEUE: - fprintf(stderr, "Invalid device queue.\n"); - break; - - // NVIDIA specific error. - case -9999: - fprintf(stderr, "NVIDIA invalid read or write buffer.\n"); - break; - - default: - fprintf(stderr, "Unknown error code!\n"); - break; - } -} - -#endif /* HAS_LIBOPENCL */ -/******************************************************************************/ -/* CUDA */ -/******************************************************************************/ -#ifdef HAS_LIBCUDART - -struct CUDAContextT { CUcontext Cuda; }; -struct CUDAKernelT { +struct PollyGPUFunctionT { CUfunction Cuda; CUmodule CudaModule; - const char *BinaryString; + const char *PTXString; }; -struct CUDADevicePtrT { +struct PollyGPUDevicePtrT { CUdeviceptr Cuda; }; @@ -906,10 +57,10 @@ typedef CUresult CUDAAPI CuMemAllocFcnTy(CUdeviceptr *, size_t); static CuMemAllocFcnTy *CuMemAllocFcnPtr; typedef CUresult CUDAAPI CuLaunchKernelFcnTy( - CUfunction F, unsigned int GridDimX, unsigned int GridDimY, - unsigned int gridDimZ, unsigned int blockDimX, unsigned int BlockDimY, - unsigned int BlockDimZ, unsigned int SharedMemBytes, CUstream HStream, - void **KernelParams, void **Extra); + CUfunction f, unsigned int gridDimX, unsigned int gridDimY, + unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, + unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, + void **kernelParams, void **extra); static CuLaunchKernelFcnTy *CuLaunchKernelFcnPtr; typedef CUresult CUDAAPI CuMemcpyDtoHFcnTy(void *, CUdeviceptr, size_t); @@ -944,8 +95,8 @@ typedef CUresult CUDAAPI CuModuleLoadDataExFcnTy(CUmodule *, const void *, void **); static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr; -typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *Module, - const void *Image); +typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *module, + const void *image); static CuModuleLoadDataFcnTy *CuModuleLoadDataFcnPtr; typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule, @@ -958,25 +109,25 @@ static CuDeviceComputeCapabilityFcnTy *CuDeviceComputeCapabilityFcnPtr; typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice); static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr; -typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState State, - CUjitInputType Type, void *Data, - size_t Size, const char *Name, - unsigned int NumOptions, - CUjit_option *Options, - void **OptionValues); +typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState state, + CUjitInputType type, void *data, + size_t size, const char *name, + unsigned int numOptions, + CUjit_option *options, + void **optionValues); static CuLinkAddDataFcnTy *CuLinkAddDataFcnPtr; -typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int NumOptions, - CUjit_option *Options, - void **OptionValues, - CUlinkState *StateOut); +typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int numOptions, + CUjit_option *options, + void **optionValues, + CUlinkState *stateOut); static CuLinkCreateFcnTy *CuLinkCreateFcnPtr; -typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState State, void **CubinOut, - size_t *SizeOut); +typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState state, void **cubinOut, + size_t *sizeOut); static CuLinkCompleteFcnTy *CuLinkCompleteFcnPtr; -typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState State); +typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState state); static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr; typedef CUresult CUDAAPI CuCtxSynchronizeFcnTy(); @@ -986,36 +137,36 @@ static CuCtxSynchronizeFcnTy *CuCtxSynchronizeFcnPtr; typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void); static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr; -static void *getAPIHandleCUDA(void *Handle, const char *FuncName) { +static void *getAPIHandle(void *Handle, const char *FuncName) { char *Err; void *FuncPtr; dlerror(); FuncPtr = dlsym(Handle, FuncName); if ((Err = dlerror()) != 0) { - fprintf(stderr, "Load CUDA driver API failed: %s. \n", Err); + fprintf(stdout, "Load CUDA driver API failed: %s. \n", Err); return 0; } return FuncPtr; } -static int initialDeviceAPILibrariesCUDA() { +static int initialDeviceAPILibraries() { HandleCuda = dlopen("libcuda.so", RTLD_LAZY); if (!HandleCuda) { - fprintf(stderr, "Cannot open library: %s. \n", dlerror()); + printf("Cannot open library: %s. \n", dlerror()); return 0; } HandleCudaRT = dlopen("libcudart.so", RTLD_LAZY); if (!HandleCudaRT) { - fprintf(stderr, "Cannot open library: %s. \n", dlerror()); + printf("Cannot open library: %s. \n", dlerror()); return 0; } return 1; } -static int initialDeviceAPIsCUDA() { - if (initialDeviceAPILibrariesCUDA() == 0) +static int initialDeviceAPIs() { + if (initialDeviceAPILibraries() == 0) return 0; /* Get function pointer to CUDA Driver APIs. @@ -1027,76 +178,77 @@ static int initialDeviceAPIsCUDA() { * as it is valid on POSIX 2008. */ CuLaunchKernelFcnPtr = - (CuLaunchKernelFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLaunchKernel"); + (CuLaunchKernelFcnTy *)getAPIHandle(HandleCuda, "cuLaunchKernel"); CuMemAllocFcnPtr = - (CuMemAllocFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemAlloc_v2"); + (CuMemAllocFcnTy *)getAPIHandle(HandleCuda, "cuMemAlloc_v2"); - CuMemFreeFcnPtr = - (CuMemFreeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemFree_v2"); + CuMemFreeFcnPtr = (CuMemFreeFcnTy *)getAPIHandle(HandleCuda, "cuMemFree_v2"); CuMemcpyDtoHFcnPtr = - (CuMemcpyDtoHFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyDtoH_v2"); + (CuMemcpyDtoHFcnTy *)getAPIHandle(HandleCuda, "cuMemcpyDtoH_v2"); CuMemcpyHtoDFcnPtr = - (CuMemcpyHtoDFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyHtoD_v2"); + (CuMemcpyHtoDFcnTy *)getAPIHandle(HandleCuda, "cuMemcpyHtoD_v2"); CuModuleUnloadFcnPtr = - (CuModuleUnloadFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleUnload"); + (CuModuleUnloadFcnTy *)getAPIHandle(HandleCuda, "cuModuleUnload"); CuCtxDestroyFcnPtr = - (CuCtxDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxDestroy"); + (CuCtxDestroyFcnTy *)getAPIHandle(HandleCuda, "cuCtxDestroy"); - CuInitFcnPtr = (CuInitFcnTy *)getAPIHandleCUDA(HandleCuda, "cuInit"); + CuInitFcnPtr = (CuInitFcnTy *)getAPIHandle(HandleCuda, "cuInit"); CuDeviceGetCountFcnPtr = - (CuDeviceGetCountFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetCount"); + (CuDeviceGetCountFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetCount"); CuDeviceGetFcnPtr = - (CuDeviceGetFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGet"); + (CuDeviceGetFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGet"); CuCtxCreateFcnPtr = - (CuCtxCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxCreate_v2"); + (CuCtxCreateFcnTy *)getAPIHandle(HandleCuda, "cuCtxCreate_v2"); - CuModuleLoadDataExFcnPtr = (CuModuleLoadDataExFcnTy *)getAPIHandleCUDA( - HandleCuda, "cuModuleLoadDataEx"); + CuModuleLoadDataExFcnPtr = + (CuModuleLoadDataExFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadDataEx"); CuModuleLoadDataFcnPtr = - (CuModuleLoadDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleLoadData"); + (CuModuleLoadDataFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadData"); - CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandleCUDA( + CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandle( HandleCuda, "cuModuleGetFunction"); CuDeviceComputeCapabilityFcnPtr = - (CuDeviceComputeCapabilityFcnTy *)getAPIHandleCUDA( + (CuDeviceComputeCapabilityFcnTy *)getAPIHandle( HandleCuda, "cuDeviceComputeCapability"); CuDeviceGetNameFcnPtr = - (CuDeviceGetNameFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetName"); + (CuDeviceGetNameFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetName"); CuLinkAddDataFcnPtr = - (CuLinkAddDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkAddData"); + (CuLinkAddDataFcnTy *)getAPIHandle(HandleCuda, "cuLinkAddData"); CuLinkCreateFcnPtr = - (CuLinkCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkCreate"); + (CuLinkCreateFcnTy *)getAPIHandle(HandleCuda, "cuLinkCreate"); CuLinkCompleteFcnPtr = - (CuLinkCompleteFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkComplete"); + (CuLinkCompleteFcnTy *)getAPIHandle(HandleCuda, "cuLinkComplete"); CuLinkDestroyFcnPtr = - (CuLinkDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkDestroy"); + (CuLinkDestroyFcnTy *)getAPIHandle(HandleCuda, "cuLinkDestroy"); CuCtxSynchronizeFcnPtr = - (CuCtxSynchronizeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxSynchronize"); + (CuCtxSynchronizeFcnTy *)getAPIHandle(HandleCuda, "cuCtxSynchronize"); /* Get function pointer to CUDA Runtime APIs. */ - CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandleCUDA( + CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandle( HandleCudaRT, "cudaThreadSynchronize"); return 1; } -static PollyGPUContext *initContextCUDA() { +PollyGPUContext *polly_initContext() { + DebugMode = getenv("POLLY_DEBUG") != 0; + dump_function(); PollyGPUContext *Context; CUdevice Device; @@ -1111,20 +263,20 @@ static PollyGPUContext *initContextCUDA() { return CurrentContext; /* Get API handles. */ - if (initialDeviceAPIsCUDA() == 0) { - fprintf(stderr, "Getting the \"handle\" for the CUDA driver API failed.\n"); + if (initialDeviceAPIs() == 0) { + fprintf(stdout, "Getting the \"handle\" for the CUDA driver API failed.\n"); exit(-1); } if (CuInitFcnPtr(0) != CUDA_SUCCESS) { - fprintf(stderr, "Initializing the CUDA driver API failed.\n"); + fprintf(stdout, "Initializing the CUDA driver API failed.\n"); exit(-1); } /* Get number of devices that supports CUDA. */ CuDeviceGetCountFcnPtr(&DeviceCount); if (DeviceCount == 0) { - fprintf(stderr, "There is no device supporting CUDA.\n"); + fprintf(stdout, "There is no device supporting CUDA.\n"); exit(-1); } @@ -1138,15 +290,12 @@ static PollyGPUContext *initContextCUDA() { /* Create context on the device. */ Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext)); if (Context == 0) { - fprintf(stderr, "Allocate memory for Polly GPU context failed.\n"); + fprintf(stdout, "Allocate memory for Polly GPU context failed.\n"); exit(-1); } - Context->Context = malloc(sizeof(CUDAContext)); - if (Context->Context == 0) { - fprintf(stderr, "Allocate memory for Polly CUDA context failed.\n"); - exit(-1); - } - CuCtxCreateFcnPtr(&(((CUDAContext *)Context->Context)->Cuda), 0, Device); + CuCtxCreateFcnPtr(&(Context->Cuda), 0, Device); + + CacheMode = getenv("POLLY_NOCACHE") == 0; if (CacheMode) CurrentContext = Context; @@ -1154,24 +303,18 @@ static PollyGPUContext *initContextCUDA() { return Context; } -static void freeKernelCUDA(PollyGPUFunction *Kernel) { - dump_function(); - - if (CacheMode) - return; - - if (((CUDAKernel *)Kernel->Kernel)->CudaModule) - CuModuleUnloadFcnPtr(((CUDAKernel *)Kernel->Kernel)->CudaModule); - - if (Kernel->Kernel) - free((CUDAKernel *)Kernel->Kernel); +static void freeKernel(PollyGPUFunction *Kernel) { + if (Kernel->CudaModule) + CuModuleUnloadFcnPtr(Kernel->CudaModule); if (Kernel) free(Kernel); } -static PollyGPUFunction *getKernelCUDA(const char *BinaryBuffer, - const char *KernelName) { +#define KERNEL_CACHE_SIZE 10 + +PollyGPUFunction *polly_getKernel(const char *PTXBuffer, + const char *KernelName) { dump_function(); static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE]; @@ -1181,21 +324,16 @@ static PollyGPUFunction *getKernelCUDA(const char *BinaryBuffer, // We exploit here the property that all Polly-ACC kernels are allocated // as global constants, hence a pointer comparision is sufficient to // determin equality. - if (KernelCache[i] && - ((CUDAKernel *)KernelCache[i]->Kernel)->BinaryString == BinaryBuffer) { + if (KernelCache[i] && KernelCache[i]->PTXString == PTXBuffer) { debug_print(" -> using cached kernel\n"); return KernelCache[i]; } } PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction)); + if (Function == 0) { - fprintf(stderr, "Allocate memory for Polly GPU function failed.\n"); - exit(-1); - } - Function->Kernel = (CUDAKernel *)malloc(sizeof(CUDAKernel)); - if (Function->Kernel == 0) { - fprintf(stderr, "Allocate memory for Polly CUDA function failed.\n"); + fprintf(stdout, "Allocate memory for Polly GPU function failed.\n"); exit(-1); } @@ -1232,45 +370,43 @@ static PollyGPUFunction *getKernelCUDA(const char *BinaryBuffer, memset(ErrorLog, 0, sizeof(ErrorLog)); CuLinkCreateFcnPtr(6, Options, OptionVals, &LState); - Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)BinaryBuffer, - strlen(BinaryBuffer) + 1, 0, 0, 0, 0); + Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)PTXBuffer, + strlen(PTXBuffer) + 1, 0, 0, 0, 0); if (Res != CUDA_SUCCESS) { - fprintf(stderr, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog); + fprintf(stdout, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog); exit(-1); } Res = CuLinkCompleteFcnPtr(LState, &CuOut, &OutSize); if (Res != CUDA_SUCCESS) { - fprintf(stderr, "Complete ptx linker step failed.\n"); - fprintf(stderr, "\n%s\n", ErrorLog); + fprintf(stdout, "Complete ptx linker step failed.\n"); + fprintf(stdout, "\n%s\n", ErrorLog); exit(-1); } debug_print("CUDA Link Completed in %fms. Linker Output:\n%s\n", Walltime, InfoLog); - Res = CuModuleLoadDataFcnPtr(&(((CUDAKernel *)Function->Kernel)->CudaModule), - CuOut); + Res = CuModuleLoadDataFcnPtr(&(Function->CudaModule), CuOut); if (Res != CUDA_SUCCESS) { - fprintf(stderr, "Loading ptx assembly text failed.\n"); + fprintf(stdout, "Loading ptx assembly text failed.\n"); exit(-1); } - Res = CuModuleGetFunctionFcnPtr(&(((CUDAKernel *)Function->Kernel)->Cuda), - ((CUDAKernel *)Function->Kernel)->CudaModule, + Res = CuModuleGetFunctionFcnPtr(&(Function->Cuda), Function->CudaModule, KernelName); if (Res != CUDA_SUCCESS) { - fprintf(stderr, "Loading kernel function failed.\n"); + fprintf(stdout, "Loading kernel function failed.\n"); exit(-1); } CuLinkDestroyFcnPtr(LState); - ((CUDAKernel *)Function->Kernel)->BinaryString = BinaryBuffer; + Function->PTXString = PTXBuffer; if (CacheMode) { if (KernelCache[NextCacheItem]) - freeKernelCUDA(KernelCache[NextCacheItem]); + freeKernel(KernelCache[NextCacheItem]); KernelCache[NextCacheItem] = Function; @@ -1280,220 +416,37 @@ static PollyGPUFunction *getKernelCUDA(const char *BinaryBuffer, return Function; } -static void synchronizeDeviceCUDA() { - dump_function(); - if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS) { - fprintf(stderr, "Synchronizing device and host memory failed.\n"); - exit(-1); - } -} - -static void copyFromHostToDeviceCUDA(void *HostData, PollyGPUDevicePtr *DevData, - long MemSize) { - dump_function(); - - CUdeviceptr CuDevData = ((CUDADevicePtr *)DevData->DevicePtr)->Cuda; - CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize); -} - -static void copyFromDeviceToHostCUDA(PollyGPUDevicePtr *DevData, void *HostData, - long MemSize) { - dump_function(); - - if (CuMemcpyDtoHFcnPtr(HostData, ((CUDADevicePtr *)DevData->DevicePtr)->Cuda, - MemSize) != CUDA_SUCCESS) { - fprintf(stderr, "Copying results from device to host memory failed.\n"); - exit(-1); - } -} - -static void launchKernelCUDA(PollyGPUFunction *Kernel, unsigned int GridDimX, - unsigned int GridDimY, unsigned int BlockDimX, - unsigned int BlockDimY, unsigned int BlockDimZ, - void **Parameters) { - dump_function(); - - unsigned GridDimZ = 1; - unsigned int SharedMemBytes = CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE; - CUstream Stream = 0; - void **Extra = 0; - - CUresult Res; - Res = - CuLaunchKernelFcnPtr(((CUDAKernel *)Kernel->Kernel)->Cuda, GridDimX, - GridDimY, GridDimZ, BlockDimX, BlockDimY, BlockDimZ, - SharedMemBytes, Stream, Parameters, Extra); - if (Res != CUDA_SUCCESS) { - fprintf(stderr, "Launching CUDA kernel failed.\n"); - exit(-1); - } -} - -static void freeDeviceMemoryCUDA(PollyGPUDevicePtr *Allocation) { - dump_function(); - CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr; - CuMemFreeFcnPtr((CUdeviceptr)DevPtr->Cuda); - free(DevPtr); - free(Allocation); -} - -static PollyGPUDevicePtr *allocateMemoryForDeviceCUDA(long MemSize) { - dump_function(); - - PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr)); - if (DevData == 0) { - fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n"); - exit(-1); - } - DevData->DevicePtr = (CUDADevicePtr *)malloc(sizeof(CUDADevicePtr)); - if (DevData->DevicePtr == 0) { - fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n"); - exit(-1); - } - - CUresult Res = - CuMemAllocFcnPtr(&(((CUDADevicePtr *)DevData->DevicePtr)->Cuda), MemSize); - - if (Res != CUDA_SUCCESS) { - fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n"); - exit(-1); - } - - return DevData; -} - -static void *getDevicePtrCUDA(PollyGPUDevicePtr *Allocation) { - dump_function(); - - CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr; - return (void *)DevPtr->Cuda; -} - -static void freeContextCUDA(PollyGPUContext *Context) { - dump_function(); - - CUDAContext *Ctx = (CUDAContext *)Context->Context; - if (Ctx->Cuda) { - CuCtxDestroyFcnPtr(Ctx->Cuda); - free(Ctx); - free(Context); - } - - dlclose(HandleCuda); - dlclose(HandleCudaRT); -} - -#endif /* HAS_LIBCUDART */ -/******************************************************************************/ -/* API */ -/******************************************************************************/ - -PollyGPUContext *polly_initContext() { - DebugMode = getenv("POLLY_DEBUG") != 0; - CacheMode = getenv("POLLY_NOCACHE") == 0; - - dump_function(); - - PollyGPUContext *Context; - - switch (Runtime) { -#ifdef HAS_LIBCUDART - case RUNTIME_CUDA: - Context = initContextCUDA(); - break; -#endif /* HAS_LIBCUDART */ -#ifdef HAS_LIBOPENCL - case RUNTIME_CL: - Context = initContextCL(); - break; -#endif /* HAS_LIBOPENCL */ - default: - err_runtime(); - } - - return Context; -} - void polly_freeKernel(PollyGPUFunction *Kernel) { dump_function(); - switch (Runtime) { -#ifdef HAS_LIBCUDART - case RUNTIME_CUDA: - freeKernelCUDA(Kernel); - break; -#endif /* HAS_LIBCUDART */ -#ifdef HAS_LIBOPENCL - case RUNTIME_CL: - freeKernelCL(Kernel); - break; -#endif /* HAS_LIBOPENCL */ - default: - err_runtime(); - } -} + if (CacheMode) + return; -PollyGPUFunction *polly_getKernel(const char *BinaryBuffer, - const char *KernelName) { - dump_function(); - - PollyGPUFunction *Function; - - switch (Runtime) { -#ifdef HAS_LIBCUDART - case RUNTIME_CUDA: - Function = getKernelCUDA(BinaryBuffer, KernelName); - break; -#endif /* HAS_LIBCUDART */ -#ifdef HAS_LIBOPENCL - case RUNTIME_CL: - Function = getKernelCL(BinaryBuffer, KernelName); - break; -#endif /* HAS_LIBOPENCL */ - default: - err_runtime(); - } - - return Function; + freeKernel(Kernel); } void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData, long MemSize) { dump_function(); - switch (Runtime) { -#ifdef HAS_LIBCUDART - case RUNTIME_CUDA: - copyFromHostToDeviceCUDA(HostData, DevData, MemSize); - break; -#endif /* HAS_LIBCUDART */ -#ifdef HAS_LIBOPENCL - case RUNTIME_CL: - copyFromHostToDeviceCL(HostData, DevData, MemSize); - break; -#endif /* HAS_LIBOPENCL */ - default: - err_runtime(); - } + CUdeviceptr CuDevData = DevData->Cuda; + CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize); } void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData, long MemSize) { dump_function(); - switch (Runtime) { -#ifdef HAS_LIBCUDART - case RUNTIME_CUDA: - copyFromDeviceToHostCUDA(DevData, HostData, MemSize); - break; -#endif /* HAS_LIBCUDART */ -#ifdef HAS_LIBOPENCL - case RUNTIME_CL: - copyFromDeviceToHostCL(DevData, HostData, MemSize); - break; -#endif /* HAS_LIBOPENCL */ - default: - err_runtime(); + if (CuMemcpyDtoHFcnPtr(HostData, DevData->Cuda, MemSize) != CUDA_SUCCESS) { + fprintf(stdout, "Copying results from device to host memory failed.\n"); + exit(-1); + } +} +void polly_synchronizeDevice() { + dump_function(); + if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS) { + fprintf(stdout, "Synchronizing device and host memory failed.\n"); + exit(-1); } } @@ -1503,61 +456,42 @@ void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX, void **Parameters) { dump_function(); - switch (Runtime) { -#ifdef HAS_LIBCUDART - case RUNTIME_CUDA: - launchKernelCUDA(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY, - BlockDimZ, Parameters); - break; -#endif /* HAS_LIBCUDART */ -#ifdef HAS_LIBOPENCL - case RUNTIME_CL: - launchKernelCL(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY, BlockDimZ, - Parameters); - break; -#endif /* HAS_LIBOPENCL */ - default: - err_runtime(); + unsigned GridDimZ = 1; + unsigned int SharedMemBytes = CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE; + CUstream Stream = 0; + void **Extra = 0; + + CUresult Res; + Res = CuLaunchKernelFcnPtr(Kernel->Cuda, GridDimX, GridDimY, GridDimZ, + BlockDimX, BlockDimY, BlockDimZ, SharedMemBytes, + Stream, Parameters, Extra); + if (Res != CUDA_SUCCESS) { + fprintf(stdout, "Launching CUDA kernel failed.\n"); + exit(-1); } } void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) { dump_function(); - - switch (Runtime) { -#ifdef HAS_LIBCUDART - case RUNTIME_CUDA: - freeDeviceMemoryCUDA(Allocation); - break; -#endif /* HAS_LIBCUDART */ -#ifdef HAS_LIBOPENCL - case RUNTIME_CL: - freeDeviceMemoryCL(Allocation); - break; -#endif /* HAS_LIBOPENCL */ - default: - err_runtime(); - } + CuMemFreeFcnPtr((CUdeviceptr)Allocation->Cuda); + free(Allocation); } PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) { dump_function(); - PollyGPUDevicePtr *DevData; + PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr)); - switch (Runtime) { -#ifdef HAS_LIBCUDART - case RUNTIME_CUDA: - DevData = allocateMemoryForDeviceCUDA(MemSize); - break; -#endif /* HAS_LIBCUDART */ -#ifdef HAS_LIBOPENCL - case RUNTIME_CL: - DevData = allocateMemoryForDeviceCL(MemSize); - break; -#endif /* HAS_LIBOPENCL */ - default: - err_runtime(); + if (DevData == 0) { + fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n"); + exit(-1); + } + + CUresult Res = CuMemAllocFcnPtr(&(DevData->Cuda), MemSize); + + if (Res != CUDA_SUCCESS) { + fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n"); + exit(-1); } return DevData; @@ -1566,43 +500,7 @@ PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) { void *polly_getDevicePtr(PollyGPUDevicePtr *Allocation) { dump_function(); - void *DevPtr; - - switch (Runtime) { -#ifdef HAS_LIBCUDART - case RUNTIME_CUDA: - DevPtr = getDevicePtrCUDA(Allocation); - break; -#endif /* HAS_LIBCUDART */ -#ifdef HAS_LIBOPENCL - case RUNTIME_CL: - DevPtr = getDevicePtrCL(Allocation); - break; -#endif /* HAS_LIBOPENCL */ - default: - err_runtime(); - } - - return DevPtr; -} - -void polly_synchronizeDevice() { - dump_function(); - - switch (Runtime) { -#ifdef HAS_LIBCUDART - case RUNTIME_CUDA: - synchronizeDeviceCUDA(); - break; -#endif /* HAS_LIBCUDART */ -#ifdef HAS_LIBOPENCL - case RUNTIME_CL: - synchronizeDeviceCL(); - break; -#endif /* HAS_LIBOPENCL */ - default: - err_runtime(); - } + return (void *)Allocation->Cuda; } void polly_freeContext(PollyGPUContext *Context) { @@ -1611,40 +509,11 @@ void polly_freeContext(PollyGPUContext *Context) { if (CacheMode) return; - switch (Runtime) { -#ifdef HAS_LIBCUDART - case RUNTIME_CUDA: - freeContextCUDA(Context); - break; -#endif /* HAS_LIBCUDART */ -#ifdef HAS_LIBOPENCL - case RUNTIME_CL: - freeContextCL(Context); - break; -#endif /* HAS_LIBOPENCL */ - default: - err_runtime(); + if (Context->Cuda) { + CuCtxDestroyFcnPtr(Context->Cuda); + free(Context); } -} -/* Initialize GPUJIT with CUDA as runtime library. */ -PollyGPUContext *polly_initContextCUDA() { -#ifdef HAS_LIBCUDART - Runtime = RUNTIME_CUDA; - return polly_initContext(); -#else - fprintf(stderr, "GPU Runtime was built without CUDA support.\n"); - exit(-1); -#endif /* HAS_LIBCUDART */ -} - -/* Initialize GPUJIT with OpenCL as runtime library. */ -PollyGPUContext *polly_initContextCL() { -#ifdef HAS_LIBOPENCL - Runtime = RUNTIME_CL; - return polly_initContext(); -#else - fprintf(stderr, "GPU Runtime was built without OpenCL support.\n"); - exit(-1); -#endif /* HAS_LIBOPENCL */ + dlclose(HandleCuda); + dlclose(HandleCudaRT); } diff --git a/polly/tools/GPURuntime/GPUJIT.h b/polly/tools/GPURuntime/GPUJIT.h index f6de70b92f40..1f886ec9a9f0 100644 --- a/polly/tools/GPURuntime/GPUJIT.h +++ b/polly/tools/GPURuntime/GPUJIT.h @@ -76,27 +76,12 @@ * */ -typedef enum PollyGPURuntimeT { - RUNTIME_NONE, - RUNTIME_CUDA, - RUNTIME_CL -} PollyGPURuntime; - typedef struct PollyGPUContextT PollyGPUContext; typedef struct PollyGPUFunctionT PollyGPUFunction; typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr; -typedef struct OpenCLContextT OpenCLContext; -typedef struct OpenCLKernelT OpenCLKernel; -typedef struct OpenCLDevicePtrT OpenCLDevicePtr; - -typedef struct CUDAContextT CUDAContext; -typedef struct CUDAKernelT CUDAKernel; -typedef struct CUDADevicePtrT CUDADevicePtr; - -PollyGPUContext *polly_initContextCUDA(); -PollyGPUContext *polly_initContextCL(); -PollyGPUFunction *polly_getKernel(const char *BinaryBuffer, +PollyGPUContext *polly_initContext(); +PollyGPUFunction *polly_getKernel(const char *PTXBuffer, const char *KernelName); void polly_freeKernel(PollyGPUFunction *Kernel); void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,