2015-04-27 20:02:36 +08:00
|
|
|
/******************** GPUJIT.c - GPUJIT Execution Engine **********************/
|
2012-06-11 17:25:01 +08:00
|
|
|
/* */
|
|
|
|
/* The LLVM Compiler Infrastructure */
|
|
|
|
/* */
|
2012-07-06 18:40:15 +08:00
|
|
|
/* This file is dual licensed under the MIT and the University of Illinois */
|
2015-04-27 20:02:36 +08:00
|
|
|
/* Open Source License. See LICENSE.TXT for details. */
|
2012-06-11 17:25:01 +08:00
|
|
|
/* */
|
|
|
|
/******************************************************************************/
|
|
|
|
/* */
|
|
|
|
/* This file implements GPUJIT, a ptx string execution engine for GPU. */
|
|
|
|
/* */
|
|
|
|
/******************************************************************************/
|
|
|
|
|
|
|
|
#include "GPUJIT.h"
|
2012-07-05 05:45:03 +08:00
|
|
|
#include <cuda.h>
|
|
|
|
#include <cuda_runtime.h>
|
2012-06-11 17:25:01 +08:00
|
|
|
#include <dlfcn.h>
|
2016-07-06 11:04:47 +08:00
|
|
|
#include <stdarg.h>
|
2012-06-11 17:25:01 +08:00
|
|
|
#include <stdio.h>
|
2016-07-26 00:31:21 +08:00
|
|
|
#include <string.h>
|
2012-06-11 17:25:01 +08:00
|
|
|
|
2016-07-06 11:04:47 +08:00
|
|
|
/* Non-zero while debug tracing is enabled; polly_initContext() derives it from
 * the POLLY_DEBUG environment variable. */
static int DebugMode;

/* Non-zero while caching is enabled; polly_initContext() clears it when the
 * POLLY_NOCACHE environment variable is set. */
static int CacheMode;

/* Print a printf-style message to stderr.
 *
 * Nothing is written unless DebugMode is non-zero.
 */
static void debug_print(const char *format, ...) {
  if (DebugMode) {
    va_list ArgList;

    va_start(ArgList, format);
    vfprintf(stderr, format, ArgList);
    va_end(ArgList);
  }
}

/* Trace entry into the enclosing function through debug_print(). */
#define dump_function() debug_print("-> %s\n", __func__)
|
|
|
|
|
2017-05-05 17:02:08 +08:00
|
|
|
/* Define Polly's GPGPU data types. */
|
2012-07-05 05:45:03 +08:00
|
|
|
struct PollyGPUContextT {
|
[Polly] Added OpenCL Runtime to GPURuntime Library for GPGPU CodeGen
Summary:
When compiling for GPU, one can now choose to compile for OpenCL or CUDA,
with the corresponding polly-gpu-runtime flag (libopencl / libcudart). The
GPURuntime library (GPUJIT) has been extended with the OpenCL Runtime library
for that purpose, correctly choosing the corresponding library calls to the
option chosen when compiling (via different initialization calls).
Additionally, a specific GPU Target architecture can now be chosen with -polly-gpu-arch (only nvptx64 implemented thus far).
Reviewers: grosser, bollu, Meinersbur, etherzhhb, singam-sanjay
Reviewed By: grosser, Meinersbur
Subscribers: singam-sanjay, llvm-commits, pollydev, nemanjai, mgorny, yaxunl, Anastasia
Tags: #polly
Differential Revision: https://reviews.llvm.org/D32431
llvm-svn: 302215
2017-05-05 15:54:49 +08:00
|
|
|
CUcontext Cuda;
|
|
|
|
};
|
|
|
|
|
2017-05-05 17:02:08 +08:00
|
|
|
struct PollyGPUFunctionT {
|
2012-07-05 05:45:03 +08:00
|
|
|
CUfunction Cuda;
|
2016-07-26 00:31:21 +08:00
|
|
|
CUmodule CudaModule;
|
2017-05-05 17:02:08 +08:00
|
|
|
const char *PTXString;
|
2012-07-05 05:45:03 +08:00
|
|
|
};
|
|
|
|
|
2017-05-05 17:02:08 +08:00
|
|
|
struct PollyGPUDevicePtrT {
|
2012-07-05 05:45:03 +08:00
|
|
|
CUdeviceptr Cuda;
|
|
|
|
};
|
|
|
|
|
2012-06-11 17:25:01 +08:00
|
|
|
/* Dynamic library handles for the CUDA and CUDA runtime library. */
|
|
|
|
static void *HandleCuda;
|
|
|
|
static void *HandleCudaRT;
|
|
|
|
|
|
|
|
/* Type-defines of function pointer to CUDA driver APIs. */
|
|
|
|
typedef CUresult CUDAAPI CuMemAllocFcnTy(CUdeviceptr *, size_t);
|
|
|
|
static CuMemAllocFcnTy *CuMemAllocFcnPtr;
|
|
|
|
|
2016-07-27 21:20:16 +08:00
|
|
|
typedef CUresult CUDAAPI CuLaunchKernelFcnTy(
|
2017-05-05 17:02:08 +08:00
|
|
|
CUfunction f, unsigned int gridDimX, unsigned int gridDimY,
|
|
|
|
unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY,
|
|
|
|
unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream,
|
|
|
|
void **kernelParams, void **extra);
|
2016-07-27 21:20:16 +08:00
|
|
|
static CuLaunchKernelFcnTy *CuLaunchKernelFcnPtr;
|
2012-06-11 17:25:01 +08:00
|
|
|
|
|
|
|
typedef CUresult CUDAAPI CuMemcpyDtoHFcnTy(void *, CUdeviceptr, size_t);
|
|
|
|
static CuMemcpyDtoHFcnTy *CuMemcpyDtoHFcnPtr;
|
|
|
|
|
|
|
|
typedef CUresult CUDAAPI CuMemcpyHtoDFcnTy(CUdeviceptr, const void *, size_t);
|
|
|
|
static CuMemcpyHtoDFcnTy *CuMemcpyHtoDFcnPtr;
|
|
|
|
|
|
|
|
typedef CUresult CUDAAPI CuMemFreeFcnTy(CUdeviceptr);
|
|
|
|
static CuMemFreeFcnTy *CuMemFreeFcnPtr;
|
|
|
|
|
|
|
|
typedef CUresult CUDAAPI CuModuleUnloadFcnTy(CUmodule);
|
|
|
|
static CuModuleUnloadFcnTy *CuModuleUnloadFcnPtr;
|
|
|
|
|
|
|
|
typedef CUresult CUDAAPI CuCtxDestroyFcnTy(CUcontext);
|
|
|
|
static CuCtxDestroyFcnTy *CuCtxDestroyFcnPtr;
|
|
|
|
|
|
|
|
typedef CUresult CUDAAPI CuInitFcnTy(unsigned int);
|
|
|
|
static CuInitFcnTy *CuInitFcnPtr;
|
|
|
|
|
|
|
|
typedef CUresult CUDAAPI CuDeviceGetCountFcnTy(int *);
|
|
|
|
static CuDeviceGetCountFcnTy *CuDeviceGetCountFcnPtr;
|
|
|
|
|
|
|
|
typedef CUresult CUDAAPI CuCtxCreateFcnTy(CUcontext *, unsigned int, CUdevice);
|
|
|
|
static CuCtxCreateFcnTy *CuCtxCreateFcnPtr;
|
|
|
|
|
|
|
|
typedef CUresult CUDAAPI CuDeviceGetFcnTy(CUdevice *, int);
|
|
|
|
static CuDeviceGetFcnTy *CuDeviceGetFcnPtr;
|
|
|
|
|
2016-03-08 15:34:58 +08:00
|
|
|
typedef CUresult CUDAAPI CuModuleLoadDataExFcnTy(CUmodule *, const void *,
|
|
|
|
unsigned int, CUjit_option *,
|
|
|
|
void **);
|
2012-06-11 17:25:01 +08:00
|
|
|
static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr;
|
|
|
|
|
2017-05-05 17:02:08 +08:00
|
|
|
typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *module,
|
|
|
|
const void *image);
|
2016-07-26 00:31:21 +08:00
|
|
|
static CuModuleLoadDataFcnTy *CuModuleLoadDataFcnPtr;
|
|
|
|
|
2016-03-08 15:34:58 +08:00
|
|
|
typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule,
|
|
|
|
const char *);
|
2012-06-11 17:25:01 +08:00
|
|
|
static CuModuleGetFunctionFcnTy *CuModuleGetFunctionFcnPtr;
|
|
|
|
|
|
|
|
typedef CUresult CUDAAPI CuDeviceComputeCapabilityFcnTy(int *, int *, CUdevice);
|
|
|
|
static CuDeviceComputeCapabilityFcnTy *CuDeviceComputeCapabilityFcnPtr;
|
|
|
|
|
|
|
|
typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice);
|
|
|
|
static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr;
|
|
|
|
|
2017-05-05 17:02:08 +08:00
|
|
|
typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState state,
|
|
|
|
CUjitInputType type, void *data,
|
|
|
|
size_t size, const char *name,
|
|
|
|
unsigned int numOptions,
|
|
|
|
CUjit_option *options,
|
|
|
|
void **optionValues);
|
2016-07-26 00:31:21 +08:00
|
|
|
static CuLinkAddDataFcnTy *CuLinkAddDataFcnPtr;
|
|
|
|
|
2017-05-05 17:02:08 +08:00
|
|
|
typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int numOptions,
|
|
|
|
CUjit_option *options,
|
|
|
|
void **optionValues,
|
|
|
|
CUlinkState *stateOut);
|
2016-07-26 00:31:21 +08:00
|
|
|
static CuLinkCreateFcnTy *CuLinkCreateFcnPtr;
|
|
|
|
|
2017-05-05 17:02:08 +08:00
|
|
|
typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState state, void **cubinOut,
|
|
|
|
size_t *sizeOut);
|
2016-07-26 00:31:21 +08:00
|
|
|
static CuLinkCompleteFcnTy *CuLinkCompleteFcnPtr;
|
|
|
|
|
2017-05-05 17:02:08 +08:00
|
|
|
typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState state);
|
2016-07-26 00:31:21 +08:00
|
|
|
static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr;
|
|
|
|
|
2017-04-28 19:16:30 +08:00
|
|
|
typedef CUresult CUDAAPI CuCtxSynchronizeFcnTy();
|
|
|
|
static CuCtxSynchronizeFcnTy *CuCtxSynchronizeFcnPtr;
|
|
|
|
|
2012-06-11 17:25:01 +08:00
|
|
|
/* Type-defines of function pointer ot CUDA runtime APIs. */
|
|
|
|
typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void);
|
|
|
|
static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr;
|
|
|
|
|
2017-05-05 17:02:08 +08:00
|
|
|
/* Resolve the symbol FuncName in the shared library referred to by Handle.
 *
 * Returns the address reported by dlsym(), or 0 after printing a diagnostic
 * to stderr when dlerror() reports a failure for the lookup.
 */
static void *getAPIHandle(void *Handle, const char *FuncName) {
  void *Symbol;
  char *Message;

  /* Discard any pending error so the dlerror() call below can only describe
   * this lookup. */
  dlerror();
  Symbol = dlsym(Handle, FuncName);
  Message = dlerror();

  if (Message == 0)
    return Symbol;

  fprintf(stderr, "Load CUDA driver API failed: %s. \n", Message);
  return 0;
}
|
|
|
|
|
2017-05-05 17:02:08 +08:00
|
|
|
static int initialDeviceAPILibraries() {
|
2012-06-11 17:25:01 +08:00
|
|
|
HandleCuda = dlopen("libcuda.so", RTLD_LAZY);
|
|
|
|
if (!HandleCuda) {
|
2017-05-05 17:02:08 +08:00
|
|
|
printf("Cannot open library: %s. \n", dlerror());
|
2012-06-11 17:25:01 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
HandleCudaRT = dlopen("libcudart.so", RTLD_LAZY);
|
|
|
|
if (!HandleCudaRT) {
|
2017-05-05 17:02:08 +08:00
|
|
|
printf("Cannot open library: %s. \n", dlerror());
|
2012-06-11 17:25:01 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2017-05-05 17:02:08 +08:00
|
|
|
static int initialDeviceAPIs() {
|
|
|
|
if (initialDeviceAPILibraries() == 0)
|
2012-06-11 17:25:01 +08:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
/* Get function pointer to CUDA Driver APIs.
|
|
|
|
*
|
|
|
|
* Note that compilers conforming to the ISO C standard are required to
|
|
|
|
* generate a warning if a conversion from a void * pointer to a function
|
|
|
|
* pointer is attempted as in the following statements. The warning
|
|
|
|
* of this kind of cast may not be emitted by clang and new versions of gcc
|
|
|
|
* as it is valid on POSIX 2008.
|
|
|
|
*/
|
2016-07-27 21:20:16 +08:00
|
|
|
CuLaunchKernelFcnPtr =
|
2017-05-05 17:02:08 +08:00
|
|
|
(CuLaunchKernelFcnTy *)getAPIHandle(HandleCuda, "cuLaunchKernel");
|
2012-06-11 17:25:01 +08:00
|
|
|
|
2013-03-23 09:05:07 +08:00
|
|
|
CuMemAllocFcnPtr =
|
2017-05-05 17:02:08 +08:00
|
|
|
(CuMemAllocFcnTy *)getAPIHandle(HandleCuda, "cuMemAlloc_v2");
|
2012-06-11 17:25:01 +08:00
|
|
|
|
2017-05-05 17:02:08 +08:00
|
|
|
CuMemFreeFcnPtr = (CuMemFreeFcnTy *)getAPIHandle(HandleCuda, "cuMemFree_v2");
|
2012-06-11 17:25:01 +08:00
|
|
|
|
2013-03-23 09:05:07 +08:00
|
|
|
CuMemcpyDtoHFcnPtr =
|
2017-05-05 17:02:08 +08:00
|
|
|
(CuMemcpyDtoHFcnTy *)getAPIHandle(HandleCuda, "cuMemcpyDtoH_v2");
|
2012-06-11 17:25:01 +08:00
|
|
|
|
2013-03-23 09:05:07 +08:00
|
|
|
CuMemcpyHtoDFcnPtr =
|
2017-05-05 17:02:08 +08:00
|
|
|
(CuMemcpyHtoDFcnTy *)getAPIHandle(HandleCuda, "cuMemcpyHtoD_v2");
|
2012-06-11 17:25:01 +08:00
|
|
|
|
2013-03-23 09:05:07 +08:00
|
|
|
CuModuleUnloadFcnPtr =
|
2017-05-05 17:02:08 +08:00
|
|
|
(CuModuleUnloadFcnTy *)getAPIHandle(HandleCuda, "cuModuleUnload");
|
2012-06-11 17:25:01 +08:00
|
|
|
|
2013-03-23 09:05:07 +08:00
|
|
|
CuCtxDestroyFcnPtr =
|
2017-05-05 17:02:08 +08:00
|
|
|
(CuCtxDestroyFcnTy *)getAPIHandle(HandleCuda, "cuCtxDestroy");
|
2012-06-11 17:25:01 +08:00
|
|
|
|
2017-05-05 17:02:08 +08:00
|
|
|
CuInitFcnPtr = (CuInitFcnTy *)getAPIHandle(HandleCuda, "cuInit");
|
2012-06-11 17:25:01 +08:00
|
|
|
|
2013-03-23 09:05:07 +08:00
|
|
|
CuDeviceGetCountFcnPtr =
|
2017-05-05 17:02:08 +08:00
|
|
|
(CuDeviceGetCountFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetCount");
|
2012-06-11 17:25:01 +08:00
|
|
|
|
2013-03-23 09:05:07 +08:00
|
|
|
CuDeviceGetFcnPtr =
|
2017-05-05 17:02:08 +08:00
|
|
|
(CuDeviceGetFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGet");
|
2012-06-11 17:25:01 +08:00
|
|
|
|
2013-03-23 09:05:07 +08:00
|
|
|
CuCtxCreateFcnPtr =
|
2017-05-05 17:02:08 +08:00
|
|
|
(CuCtxCreateFcnTy *)getAPIHandle(HandleCuda, "cuCtxCreate_v2");
|
2012-06-11 17:25:01 +08:00
|
|
|
|
2017-05-05 17:02:08 +08:00
|
|
|
CuModuleLoadDataExFcnPtr =
|
|
|
|
(CuModuleLoadDataExFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadDataEx");
|
2012-06-11 17:25:01 +08:00
|
|
|
|
2016-07-26 00:31:21 +08:00
|
|
|
CuModuleLoadDataFcnPtr =
|
2017-05-05 17:02:08 +08:00
|
|
|
(CuModuleLoadDataFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadData");
|
2016-07-26 00:31:21 +08:00
|
|
|
|
2017-05-05 17:02:08 +08:00
|
|
|
CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandle(
|
2013-03-23 09:05:07 +08:00
|
|
|
HandleCuda, "cuModuleGetFunction");
|
2012-06-11 17:25:01 +08:00
|
|
|
|
|
|
|
CuDeviceComputeCapabilityFcnPtr =
|
2017-05-05 17:02:08 +08:00
|
|
|
(CuDeviceComputeCapabilityFcnTy *)getAPIHandle(
|
2013-03-23 09:05:07 +08:00
|
|
|
HandleCuda, "cuDeviceComputeCapability");
|
2012-06-11 17:25:01 +08:00
|
|
|
|
|
|
|
CuDeviceGetNameFcnPtr =
|
2017-05-05 17:02:08 +08:00
|
|
|
(CuDeviceGetNameFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetName");
|
2012-06-11 17:25:01 +08:00
|
|
|
|
2016-07-26 00:31:21 +08:00
|
|
|
CuLinkAddDataFcnPtr =
|
2017-05-05 17:02:08 +08:00
|
|
|
(CuLinkAddDataFcnTy *)getAPIHandle(HandleCuda, "cuLinkAddData");
|
2016-07-26 00:31:21 +08:00
|
|
|
|
|
|
|
CuLinkCreateFcnPtr =
|
2017-05-05 17:02:08 +08:00
|
|
|
(CuLinkCreateFcnTy *)getAPIHandle(HandleCuda, "cuLinkCreate");
|
2016-07-26 00:31:21 +08:00
|
|
|
|
|
|
|
CuLinkCompleteFcnPtr =
|
2017-05-05 17:02:08 +08:00
|
|
|
(CuLinkCompleteFcnTy *)getAPIHandle(HandleCuda, "cuLinkComplete");
|
2016-07-26 00:31:21 +08:00
|
|
|
|
|
|
|
CuLinkDestroyFcnPtr =
|
2017-05-05 17:02:08 +08:00
|
|
|
(CuLinkDestroyFcnTy *)getAPIHandle(HandleCuda, "cuLinkDestroy");
|
2016-07-26 00:31:21 +08:00
|
|
|
|
2017-04-28 19:16:30 +08:00
|
|
|
CuCtxSynchronizeFcnPtr =
|
2017-05-05 17:02:08 +08:00
|
|
|
(CuCtxSynchronizeFcnTy *)getAPIHandle(HandleCuda, "cuCtxSynchronize");
|
2017-04-28 19:16:30 +08:00
|
|
|
|
2012-06-11 17:25:01 +08:00
|
|
|
/* Get function pointer to CUDA Runtime APIs. */
|
2017-05-05 17:02:08 +08:00
|
|
|
CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandle(
|
2013-03-23 09:05:07 +08:00
|
|
|
HandleCudaRT, "cudaThreadSynchronize");
|
2012-06-11 17:25:01 +08:00
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2017-05-05 17:02:08 +08:00
|
|
|
PollyGPUContext *polly_initContext() {
|
|
|
|
DebugMode = getenv("POLLY_DEBUG") != 0;
|
|
|
|
|
2016-07-06 11:04:47 +08:00
|
|
|
dump_function();
|
2016-07-25 17:16:01 +08:00
|
|
|
PollyGPUContext *Context;
|
|
|
|
CUdevice Device;
|
2016-07-06 11:04:47 +08:00
|
|
|
|
2012-06-11 17:25:01 +08:00
|
|
|
int Major = 0, Minor = 0, DeviceID = 0;
|
|
|
|
char DeviceName[256];
|
|
|
|
int DeviceCount = 0;
|
|
|
|
|
2016-08-04 17:15:58 +08:00
|
|
|
static __thread PollyGPUContext *CurrentContext = NULL;
|
|
|
|
|
|
|
|
if (CurrentContext)
|
|
|
|
return CurrentContext;
|
|
|
|
|
2012-06-11 17:25:01 +08:00
|
|
|
/* Get API handles. */
|
2017-05-05 17:02:08 +08:00
|
|
|
if (initialDeviceAPIs() == 0) {
|
2017-05-08 02:31:25 +08:00
|
|
|
fprintf(stderr, "Getting the \"handle\" for the CUDA driver API failed.\n");
|
2012-06-11 17:25:01 +08:00
|
|
|
exit(-1);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (CuInitFcnPtr(0) != CUDA_SUCCESS) {
|
2017-05-08 02:31:25 +08:00
|
|
|
fprintf(stderr, "Initializing the CUDA driver API failed.\n");
|
2012-06-11 17:25:01 +08:00
|
|
|
exit(-1);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Get number of devices that supports CUDA. */
|
|
|
|
CuDeviceGetCountFcnPtr(&DeviceCount);
|
|
|
|
if (DeviceCount == 0) {
|
2017-05-08 02:31:25 +08:00
|
|
|
fprintf(stderr, "There is no device supporting CUDA.\n");
|
2012-06-11 17:25:01 +08:00
|
|
|
exit(-1);
|
|
|
|
}
|
|
|
|
|
2016-07-25 17:16:01 +08:00
|
|
|
CuDeviceGetFcnPtr(&Device, 0);
|
2012-06-11 17:25:01 +08:00
|
|
|
|
|
|
|
/* Get compute capabilities and the device name. */
|
2016-07-25 17:16:01 +08:00
|
|
|
CuDeviceComputeCapabilityFcnPtr(&Major, &Minor, Device);
|
|
|
|
CuDeviceGetNameFcnPtr(DeviceName, 256, Device);
|
2016-07-06 11:04:53 +08:00
|
|
|
debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName);
|
2012-06-11 17:25:01 +08:00
|
|
|
|
|
|
|
/* Create context on the device. */
|
2016-07-25 17:16:01 +08:00
|
|
|
Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext));
|
|
|
|
if (Context == 0) {
|
2017-05-08 02:31:25 +08:00
|
|
|
fprintf(stderr, "Allocate memory for Polly GPU context failed.\n");
|
2012-07-05 05:45:03 +08:00
|
|
|
exit(-1);
|
|
|
|
}
|
2017-05-05 17:02:08 +08:00
|
|
|
CuCtxCreateFcnPtr(&(Context->Cuda), 0, Device);
|
|
|
|
|
|
|
|
CacheMode = getenv("POLLY_NOCACHE") == 0;
|
2016-08-04 17:15:58 +08:00
|
|
|
|
|
|
|
if (CacheMode)
|
|
|
|
CurrentContext = Context;
|
|
|
|
|
2016-07-25 17:16:01 +08:00
|
|
|
return Context;
|
2012-06-11 17:25:01 +08:00
|
|
|
}
|
|
|
|
|
2017-05-05 17:02:08 +08:00
|
|
|
/* Unload the CUDA module of Kernel (if it has one) and release Kernel.
 *
 * A null Kernel is accepted and ignored. The null check has to come before
 * the first member access; checking it only before free() would be too late.
 */
static void freeKernel(PollyGPUFunction *Kernel) {
  if (!Kernel)
    return;

  if (Kernel->CudaModule)
    CuModuleUnloadFcnPtr(Kernel->CudaModule);

  free(Kernel);
}
|
|
|
|
|
2017-05-05 17:02:08 +08:00
|
|
|
#define KERNEL_CACHE_SIZE 10
|
|
|
|
|
|
|
|
PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
|
|
|
|
const char *KernelName) {
|
2016-07-06 11:04:47 +08:00
|
|
|
dump_function();
|
|
|
|
|
2016-08-04 17:15:58 +08:00
|
|
|
static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE];
|
|
|
|
static __thread int NextCacheItem = 0;
|
|
|
|
|
|
|
|
for (long i = 0; i < KERNEL_CACHE_SIZE; i++) {
|
|
|
|
// We exploit here the property that all Polly-ACC kernels are allocated
|
|
|
|
// as global constants, hence a pointer comparision is sufficient to
|
|
|
|
// determin equality.
|
2017-05-05 17:02:08 +08:00
|
|
|
if (KernelCache[i] && KernelCache[i]->PTXString == PTXBuffer) {
|
2016-08-04 17:15:58 +08:00
|
|
|
debug_print(" -> using cached kernel\n");
|
|
|
|
return KernelCache[i];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-07-26 00:31:21 +08:00
|
|
|
PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
|
2017-05-05 17:02:08 +08:00
|
|
|
|
2016-07-26 00:31:21 +08:00
|
|
|
if (Function == 0) {
|
2017-05-08 02:31:25 +08:00
|
|
|
fprintf(stderr, "Allocate memory for Polly GPU function failed.\n");
|
2012-07-05 05:45:03 +08:00
|
|
|
exit(-1);
|
|
|
|
}
|
|
|
|
|
2016-07-26 00:31:21 +08:00
|
|
|
CUresult Res;
|
|
|
|
CUlinkState LState;
|
|
|
|
CUjit_option Options[6];
|
|
|
|
void *OptionVals[6];
|
|
|
|
float Walltime = 0;
|
|
|
|
unsigned long LogSize = 8192;
|
|
|
|
char ErrorLog[8192], InfoLog[8192];
|
|
|
|
void *CuOut;
|
|
|
|
size_t OutSize;
|
|
|
|
|
|
|
|
// Setup linker options
|
|
|
|
// Return walltime from JIT compilation
|
|
|
|
Options[0] = CU_JIT_WALL_TIME;
|
|
|
|
OptionVals[0] = (void *)&Walltime;
|
|
|
|
// Pass a buffer for info messages
|
|
|
|
Options[1] = CU_JIT_INFO_LOG_BUFFER;
|
|
|
|
OptionVals[1] = (void *)InfoLog;
|
|
|
|
// Pass the size of the info buffer
|
|
|
|
Options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
|
|
|
|
OptionVals[2] = (void *)LogSize;
|
|
|
|
// Pass a buffer for error message
|
|
|
|
Options[3] = CU_JIT_ERROR_LOG_BUFFER;
|
|
|
|
OptionVals[3] = (void *)ErrorLog;
|
|
|
|
// Pass the size of the error buffer
|
|
|
|
Options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
|
|
|
|
OptionVals[4] = (void *)LogSize;
|
|
|
|
// Make the linker verbose
|
|
|
|
Options[5] = CU_JIT_LOG_VERBOSE;
|
|
|
|
OptionVals[5] = (void *)1;
|
|
|
|
|
|
|
|
memset(ErrorLog, 0, sizeof(ErrorLog));
|
|
|
|
|
|
|
|
CuLinkCreateFcnPtr(6, Options, OptionVals, &LState);
|
2017-05-05 17:02:08 +08:00
|
|
|
Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)PTXBuffer,
|
|
|
|
strlen(PTXBuffer) + 1, 0, 0, 0, 0);
|
2016-07-26 00:31:21 +08:00
|
|
|
if (Res != CUDA_SUCCESS) {
|
2017-05-08 02:31:25 +08:00
|
|
|
fprintf(stderr, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog);
|
2012-06-11 17:25:01 +08:00
|
|
|
exit(-1);
|
|
|
|
}
|
|
|
|
|
2016-07-26 00:31:21 +08:00
|
|
|
Res = CuLinkCompleteFcnPtr(LState, &CuOut, &OutSize);
|
|
|
|
if (Res != CUDA_SUCCESS) {
|
2017-05-08 02:31:25 +08:00
|
|
|
fprintf(stderr, "Complete ptx linker step failed.\n");
|
|
|
|
fprintf(stderr, "\n%s\n", ErrorLog);
|
2016-07-26 00:31:21 +08:00
|
|
|
exit(-1);
|
|
|
|
}
|
2016-07-06 11:04:47 +08:00
|
|
|
|
2016-07-26 00:31:21 +08:00
|
|
|
debug_print("CUDA Link Completed in %fms. Linker Output:\n%s\n", Walltime,
|
|
|
|
InfoLog);
|
|
|
|
|
2017-05-05 17:02:08 +08:00
|
|
|
Res = CuModuleLoadDataFcnPtr(&(Function->CudaModule), CuOut);
|
2016-07-26 00:31:21 +08:00
|
|
|
if (Res != CUDA_SUCCESS) {
|
2017-05-08 02:31:25 +08:00
|
|
|
fprintf(stderr, "Loading ptx assembly text failed.\n");
|
2012-07-05 05:45:03 +08:00
|
|
|
exit(-1);
|
|
|
|
}
|
|
|
|
|
2017-05-05 17:02:08 +08:00
|
|
|
Res = CuModuleGetFunctionFcnPtr(&(Function->Cuda), Function->CudaModule,
|
2016-07-26 00:31:21 +08:00
|
|
|
KernelName);
|
|
|
|
if (Res != CUDA_SUCCESS) {
|
2017-05-08 02:31:25 +08:00
|
|
|
fprintf(stderr, "Loading kernel function failed.\n");
|
2012-06-11 17:25:01 +08:00
|
|
|
exit(-1);
|
|
|
|
}
|
2016-07-26 00:31:21 +08:00
|
|
|
|
|
|
|
CuLinkDestroyFcnPtr(LState);
|
|
|
|
|
2017-05-05 17:02:08 +08:00
|
|
|
Function->PTXString = PTXBuffer;
|
2016-08-04 17:15:58 +08:00
|
|
|
|
|
|
|
if (CacheMode) {
|
|
|
|
if (KernelCache[NextCacheItem])
|
2017-05-05 17:02:08 +08:00
|
|
|
freeKernel(KernelCache[NextCacheItem]);
|
2016-08-04 17:15:58 +08:00
|
|
|
|
|
|
|
KernelCache[NextCacheItem] = Function;
|
|
|
|
|
|
|
|
NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE;
|
|
|
|
}
|
|
|
|
|
2016-07-26 00:31:21 +08:00
|
|
|
return Function;
|
|
|
|
}
|
|
|
|
|
2017-05-05 17:02:08 +08:00
|
|
|
/* Release a kernel obtained from polly_getKernel().
 *
 * Does nothing while CacheMode is enabled; otherwise the kernel is handed to
 * freeKernel().
 */
void polly_freeKernel(PollyGPUFunction *Kernel) {
  dump_function();

  if (!CacheMode)
    freeKernel(Kernel);
}
|
|
|
|
|
2017-05-05 17:02:08 +08:00
|
|
|
/* Copy MemSize bytes from the host buffer HostData to the device allocation
 * DevData.
 *
 * A failed copy prints a message to stderr and terminates the process with
 * exit(-1), mirroring polly_copyFromDeviceToHost().
 */
void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
                                long MemSize) {
  dump_function();

  CUdeviceptr CuDevData = DevData->Cuda;
  if (CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize) != CUDA_SUCCESS) {
    fprintf(stderr, "Copying data from host memory to device failed.\n");
    exit(-1);
  }
}
|
|
|
|
|
2017-05-05 17:02:08 +08:00
|
|
|
/* Copy MemSize bytes from the device allocation DevData to the host buffer
 * HostData.
 *
 * A failed copy prints a message to stderr and terminates the process with
 * exit(-1).
 */
void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
                                long MemSize) {
  dump_function();

  CUresult Status = CuMemcpyDtoHFcnPtr(HostData, DevData->Cuda, MemSize);
  if (Status == CUDA_SUCCESS)
    return;

  fprintf(stderr, "Copying results from device to host memory failed.\n");
  exit(-1);
}
|
|
|
|
void polly_synchronizeDevice() {
|
|
|
|
dump_function();
|
|
|
|
if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS) {
|
2017-05-08 02:31:25 +08:00
|
|
|
fprintf(stderr, "Synchronizing device and host memory failed.\n");
|
2017-04-28 19:16:30 +08:00
|
|
|
exit(-1);
|
|
|
|
}
|
|
|
|
}
|
2012-06-11 17:25:01 +08:00
|
|
|
|
2017-05-05 17:02:08 +08:00
|
|
|
/* Launch Kernel with the given grid and block dimensions.
 *
 * The grid's Z dimension is fixed to 1, no dynamic shared memory is
 * requested, and the launch goes to stream 0. Parameters is passed to
 * cuLaunchKernel as its kernelParams argument.
 *
 * A failed launch prints a message to stderr and terminates the process with
 * exit(-1).
 */
void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
                        unsigned int GridDimY, unsigned int BlockDimX,
                        unsigned int BlockDimY, unsigned int BlockDimZ,
                        void **Parameters) {
  dump_function();

  unsigned GridDimZ = 1;
  /* cuLaunchKernel expects a byte count here. The previous initializer,
   * CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE, is a bank-size configuration
   * enumerator that only happened to have the value 0. */
  unsigned int SharedMemBytes = 0;
  CUstream Stream = 0;
  void **Extra = 0;

  CUresult Res;
  Res = CuLaunchKernelFcnPtr(Kernel->Cuda, GridDimX, GridDimY, GridDimZ,
                             BlockDimX, BlockDimY, BlockDimZ, SharedMemBytes,
                             Stream, Parameters, Extra);
  if (Res != CUDA_SUCCESS) {
    fprintf(stderr, "Launching CUDA kernel failed.\n");
    exit(-1);
  }
}
|
|
|
|
|
2017-05-05 17:02:08 +08:00
|
|
|
/* Free the device memory referenced by Allocation and then the Allocation
 * object itself. The result of the device-side free is not inspected. */
void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) {
  dump_function();

  CUdeviceptr DevicePtr = (CUdeviceptr)Allocation->Cuda;
  CuMemFreeFcnPtr(DevicePtr);

  free(Allocation);
}
|
|
|
|
|
2017-05-05 17:02:08 +08:00
|
|
|
PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) {
|
2016-07-25 20:47:33 +08:00
|
|
|
dump_function();
|
|
|
|
|
|
|
|
PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr));
|
2017-05-05 17:02:08 +08:00
|
|
|
|
2016-07-25 20:47:33 +08:00
|
|
|
if (DevData == 0) {
|
2017-05-08 02:31:25 +08:00
|
|
|
fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
|
2016-07-25 20:47:33 +08:00
|
|
|
exit(-1);
|
|
|
|
}
|
|
|
|
|
2017-05-05 17:02:08 +08:00
|
|
|
CUresult Res = CuMemAllocFcnPtr(&(DevData->Cuda), MemSize);
|
2016-07-25 20:47:33 +08:00
|
|
|
|
|
|
|
if (Res != CUDA_SUCCESS) {
|
2017-05-08 02:31:25 +08:00
|
|
|
fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
|
2016-07-25 20:47:33 +08:00
|
|
|
exit(-1);
|
|
|
|
}
|
|
|
|
|
|
|
|
return DevData;
|
|
|
|
}
|
|
|
|
|
2016-07-27 21:20:16 +08:00
|
|
|
/* Return the CUDA device pointer stored in Allocation, converted to void *. */
void *polly_getDevicePtr(PollyGPUDevicePtr *Allocation) {
  dump_function();

  CUdeviceptr DevicePtr = Allocation->Cuda;
  return (void *)DevicePtr;
}
|
|
|
|
|
2016-07-25 17:16:01 +08:00
|
|
|
/* Destroy a context created by polly_initContext() and close the CUDA
 * libraries.
 *
 * Does nothing while CacheMode is enabled. Otherwise the CUDA context is
 * destroyed when present, the Context object is always released (it used to
 * leak when its Cuda member was null), and both library handles are closed.
 */
void polly_freeContext(PollyGPUContext *Context) {
  dump_function();

  if (CacheMode)
    return;

  if (Context) {
    if (Context->Cuda)
      CuCtxDestroyFcnPtr(Context->Cuda);
    free(Context);
  }

  dlclose(HandleCuda);
  dlclose(HandleCudaRT);
}
|