forked from OSchip/llvm-project
1857 lines
57 KiB
C
1857 lines
57 KiB
C
/******************** GPUJIT.c - GPUJIT Execution Engine **********************/
|
|
/* */
|
|
/* Part of the LLVM Project, under the Apache License v2.0 with LLVM */
|
|
/* Exceptions. */
|
|
/* See https://llvm.org/LICENSE.txt for license information. */
|
|
/* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */
|
|
/* */
|
|
/******************************************************************************/
|
|
/* */
|
|
/* This file implements GPUJIT, a ptx string execution engine for GPU. */
|
|
/* */
|
|
/******************************************************************************/
|
|
|
|
#include "GPUJIT.h"
|
|
|
|
#ifdef HAS_LIBCUDART
|
|
#include <cuda.h>
|
|
#include <cuda_runtime.h>
|
|
#endif /* HAS_LIBCUDART */
|
|
|
|
#ifdef HAS_LIBOPENCL
|
|
#ifdef __APPLE__
|
|
#include <OpenCL/opencl.h>
|
|
#else
|
|
#include <CL/cl.h>
|
|
#endif /* __APPLE__ */
|
|
#endif /* HAS_LIBOPENCL */
|
|
|
|
#include <assert.h>
|
|
#include <dlfcn.h>
|
|
#include <stdarg.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <unistd.h>
|
|
|
|
static int DebugMode;
|
|
static int CacheMode;
|
|
#define max(x, y) ((x) > (y) ? (x) : (y))
|
|
|
|
static PollyGPURuntime Runtime = RUNTIME_NONE;
|
|
|
|
static void debug_print(const char *format, ...) {
|
|
if (!DebugMode)
|
|
return;
|
|
|
|
va_list args;
|
|
va_start(args, format);
|
|
vfprintf(stderr, format, args);
|
|
va_end(args);
|
|
}
|
|
#define dump_function() debug_print("-> %s\n", __func__)
|
|
|
|
#define KERNEL_CACHE_SIZE 10
|
|
|
|
static void err_runtime() __attribute__((noreturn));
|
|
static void err_runtime() {
|
|
fprintf(stderr, "Runtime not correctly initialized.\n");
|
|
exit(-1);
|
|
}
|
|
|
|
struct PollyGPUContextT {
|
|
void *Context;
|
|
};
|
|
|
|
struct PollyGPUFunctionT {
|
|
void *Kernel;
|
|
};
|
|
|
|
struct PollyGPUDevicePtrT {
|
|
void *DevicePtr;
|
|
};
|
|
|
|
/******************************************************************************/
|
|
/* OpenCL */
|
|
/******************************************************************************/
|
|
#ifdef HAS_LIBOPENCL
|
|
|
|
struct OpenCLContextT {
|
|
cl_context Context;
|
|
cl_command_queue CommandQueue;
|
|
};
|
|
|
|
struct OpenCLKernelT {
|
|
cl_kernel Kernel;
|
|
cl_program Program;
|
|
const char *BinaryString;
|
|
};
|
|
|
|
struct OpenCLDevicePtrT {
|
|
cl_mem MemObj;
|
|
};
|
|
|
|
/* Dynamic library handles for the OpenCL runtime library. */
|
|
static void *HandleOpenCL;
|
|
static void *HandleOpenCLBeignet;
|
|
|
|
/* Type-defines of function pointer to OpenCL Runtime API. */
|
|
typedef cl_int clGetPlatformIDsFcnTy(cl_uint NumEntries,
|
|
cl_platform_id *Platforms,
|
|
cl_uint *NumPlatforms);
|
|
static clGetPlatformIDsFcnTy *clGetPlatformIDsFcnPtr;
|
|
|
|
typedef cl_int clGetDeviceIDsFcnTy(cl_platform_id Platform,
|
|
cl_device_type DeviceType,
|
|
cl_uint NumEntries, cl_device_id *Devices,
|
|
cl_uint *NumDevices);
|
|
static clGetDeviceIDsFcnTy *clGetDeviceIDsFcnPtr;
|
|
|
|
typedef cl_int clGetDeviceInfoFcnTy(cl_device_id Device,
|
|
cl_device_info ParamName,
|
|
size_t ParamValueSize, void *ParamValue,
|
|
size_t *ParamValueSizeRet);
|
|
static clGetDeviceInfoFcnTy *clGetDeviceInfoFcnPtr;
|
|
|
|
typedef cl_int clGetKernelInfoFcnTy(cl_kernel Kernel, cl_kernel_info ParamName,
|
|
size_t ParamValueSize, void *ParamValue,
|
|
size_t *ParamValueSizeRet);
|
|
static clGetKernelInfoFcnTy *clGetKernelInfoFcnPtr;
|
|
|
|
typedef cl_context clCreateContextFcnTy(
|
|
const cl_context_properties *Properties, cl_uint NumDevices,
|
|
const cl_device_id *Devices,
|
|
void CL_CALLBACK *pfn_notify(const char *Errinfo, const void *PrivateInfo,
|
|
size_t CB, void *UserData),
|
|
void *UserData, cl_int *ErrcodeRet);
|
|
static clCreateContextFcnTy *clCreateContextFcnPtr;
|
|
|
|
typedef cl_command_queue
|
|
clCreateCommandQueueFcnTy(cl_context Context, cl_device_id Device,
|
|
cl_command_queue_properties Properties,
|
|
cl_int *ErrcodeRet);
|
|
static clCreateCommandQueueFcnTy *clCreateCommandQueueFcnPtr;
|
|
|
|
typedef cl_mem clCreateBufferFcnTy(cl_context Context, cl_mem_flags Flags,
|
|
size_t Size, void *HostPtr,
|
|
cl_int *ErrcodeRet);
|
|
static clCreateBufferFcnTy *clCreateBufferFcnPtr;
|
|
|
|
typedef cl_int
|
|
clEnqueueWriteBufferFcnTy(cl_command_queue CommandQueue, cl_mem Buffer,
|
|
cl_bool BlockingWrite, size_t Offset, size_t Size,
|
|
const void *Ptr, cl_uint NumEventsInWaitList,
|
|
const cl_event *EventWaitList, cl_event *Event);
|
|
static clEnqueueWriteBufferFcnTy *clEnqueueWriteBufferFcnPtr;
|
|
|
|
typedef cl_program
|
|
clCreateProgramWithLLVMIntelFcnTy(cl_context Context, cl_uint NumDevices,
|
|
const cl_device_id *DeviceList,
|
|
const char *Filename, cl_int *ErrcodeRet);
|
|
static clCreateProgramWithLLVMIntelFcnTy *clCreateProgramWithLLVMIntelFcnPtr;
|
|
|
|
typedef cl_program clCreateProgramWithBinaryFcnTy(
|
|
cl_context Context, cl_uint NumDevices, const cl_device_id *DeviceList,
|
|
const size_t *Lengths, const unsigned char **Binaries, cl_int *BinaryStatus,
|
|
cl_int *ErrcodeRet);
|
|
static clCreateProgramWithBinaryFcnTy *clCreateProgramWithBinaryFcnPtr;
|
|
|
|
typedef cl_int clBuildProgramFcnTy(
|
|
cl_program Program, cl_uint NumDevices, const cl_device_id *DeviceList,
|
|
const char *Options,
|
|
void(CL_CALLBACK *pfn_notify)(cl_program Program, void *UserData),
|
|
void *UserData);
|
|
static clBuildProgramFcnTy *clBuildProgramFcnPtr;
|
|
|
|
typedef cl_kernel clCreateKernelFcnTy(cl_program Program,
|
|
const char *KernelName,
|
|
cl_int *ErrcodeRet);
|
|
static clCreateKernelFcnTy *clCreateKernelFcnPtr;
|
|
|
|
typedef cl_int clSetKernelArgFcnTy(cl_kernel Kernel, cl_uint ArgIndex,
|
|
size_t ArgSize, const void *ArgValue);
|
|
static clSetKernelArgFcnTy *clSetKernelArgFcnPtr;
|
|
|
|
typedef cl_int clEnqueueNDRangeKernelFcnTy(
|
|
cl_command_queue CommandQueue, cl_kernel Kernel, cl_uint WorkDim,
|
|
const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
|
|
const size_t *LocalWorkSize, cl_uint NumEventsInWaitList,
|
|
const cl_event *EventWaitList, cl_event *Event);
|
|
static clEnqueueNDRangeKernelFcnTy *clEnqueueNDRangeKernelFcnPtr;
|
|
|
|
typedef cl_int clEnqueueReadBufferFcnTy(cl_command_queue CommandQueue,
|
|
cl_mem Buffer, cl_bool BlockingRead,
|
|
size_t Offset, size_t Size, void *Ptr,
|
|
cl_uint NumEventsInWaitList,
|
|
const cl_event *EventWaitList,
|
|
cl_event *Event);
|
|
static clEnqueueReadBufferFcnTy *clEnqueueReadBufferFcnPtr;
|
|
|
|
typedef cl_int clFlushFcnTy(cl_command_queue CommandQueue);
|
|
static clFlushFcnTy *clFlushFcnPtr;
|
|
|
|
typedef cl_int clFinishFcnTy(cl_command_queue CommandQueue);
|
|
static clFinishFcnTy *clFinishFcnPtr;
|
|
|
|
typedef cl_int clReleaseKernelFcnTy(cl_kernel Kernel);
|
|
static clReleaseKernelFcnTy *clReleaseKernelFcnPtr;
|
|
|
|
typedef cl_int clReleaseProgramFcnTy(cl_program Program);
|
|
static clReleaseProgramFcnTy *clReleaseProgramFcnPtr;
|
|
|
|
typedef cl_int clReleaseMemObjectFcnTy(cl_mem Memobject);
|
|
static clReleaseMemObjectFcnTy *clReleaseMemObjectFcnPtr;
|
|
|
|
typedef cl_int clReleaseCommandQueueFcnTy(cl_command_queue CommandQueue);
|
|
static clReleaseCommandQueueFcnTy *clReleaseCommandQueueFcnPtr;
|
|
|
|
typedef cl_int clReleaseContextFcnTy(cl_context Context);
|
|
static clReleaseContextFcnTy *clReleaseContextFcnPtr;
|
|
|
|
static void *getAPIHandleCL(void *Handle, const char *FuncName) {
|
|
char *Err;
|
|
void *FuncPtr;
|
|
dlerror();
|
|
FuncPtr = dlsym(Handle, FuncName);
|
|
if ((Err = dlerror()) != 0) {
|
|
fprintf(stderr, "Load OpenCL Runtime API failed: %s. \n", Err);
|
|
return 0;
|
|
}
|
|
return FuncPtr;
|
|
}
|
|
|
|
static int initialDeviceAPILibrariesCL() {
|
|
HandleOpenCLBeignet = dlopen("/usr/local/lib/beignet/libcl.so", RTLD_LAZY);
|
|
HandleOpenCL = dlopen("libOpenCL.so", RTLD_LAZY);
|
|
if (!HandleOpenCL) {
|
|
fprintf(stderr, "Cannot open library: %s. \n", dlerror());
|
|
return 0;
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
/* Get function pointer to OpenCL Runtime API.
|
|
*
|
|
* Note that compilers conforming to the ISO C standard are required to
|
|
* generate a warning if a conversion from a void * pointer to a function
|
|
* pointer is attempted as in the following statements. The warning
|
|
* of this kind of cast may not be emitted by clang and new versions of gcc
|
|
* as it is valid on POSIX 2008. For compilers required to generate a warning,
|
|
* we temporarily disable -Wpedantic, to avoid bloating the output with
|
|
* unnecessary warnings.
|
|
*
|
|
* Reference:
|
|
* http://pubs.opengroup.org/onlinepubs/9699919799/functions/dlsym.html
|
|
*/
|
|
#pragma GCC diagnostic push
|
|
#pragma GCC diagnostic ignored "-Wpedantic"
|
|
static int initialDeviceAPIsCL() {
|
|
if (initialDeviceAPILibrariesCL() == 0)
|
|
return 0;
|
|
|
|
// FIXME: We are now always selecting the Intel Beignet driver if it is
|
|
// available on the system, instead of a possible NVIDIA or AMD OpenCL
|
|
// API. This selection should occurr based on the target architecture
|
|
// chosen when compiling.
|
|
void *Handle =
|
|
(HandleOpenCLBeignet != NULL ? HandleOpenCLBeignet : HandleOpenCL);
|
|
|
|
clGetPlatformIDsFcnPtr =
|
|
(clGetPlatformIDsFcnTy *)getAPIHandleCL(Handle, "clGetPlatformIDs");
|
|
|
|
clGetDeviceIDsFcnPtr =
|
|
(clGetDeviceIDsFcnTy *)getAPIHandleCL(Handle, "clGetDeviceIDs");
|
|
|
|
clGetDeviceInfoFcnPtr =
|
|
(clGetDeviceInfoFcnTy *)getAPIHandleCL(Handle, "clGetDeviceInfo");
|
|
|
|
clGetKernelInfoFcnPtr =
|
|
(clGetKernelInfoFcnTy *)getAPIHandleCL(Handle, "clGetKernelInfo");
|
|
|
|
clCreateContextFcnPtr =
|
|
(clCreateContextFcnTy *)getAPIHandleCL(Handle, "clCreateContext");
|
|
|
|
clCreateCommandQueueFcnPtr = (clCreateCommandQueueFcnTy *)getAPIHandleCL(
|
|
Handle, "clCreateCommandQueue");
|
|
|
|
clCreateBufferFcnPtr =
|
|
(clCreateBufferFcnTy *)getAPIHandleCL(Handle, "clCreateBuffer");
|
|
|
|
clEnqueueWriteBufferFcnPtr = (clEnqueueWriteBufferFcnTy *)getAPIHandleCL(
|
|
Handle, "clEnqueueWriteBuffer");
|
|
|
|
if (HandleOpenCLBeignet)
|
|
clCreateProgramWithLLVMIntelFcnPtr =
|
|
(clCreateProgramWithLLVMIntelFcnTy *)getAPIHandleCL(
|
|
Handle, "clCreateProgramWithLLVMIntel");
|
|
|
|
clCreateProgramWithBinaryFcnPtr =
|
|
(clCreateProgramWithBinaryFcnTy *)getAPIHandleCL(
|
|
Handle, "clCreateProgramWithBinary");
|
|
|
|
clBuildProgramFcnPtr =
|
|
(clBuildProgramFcnTy *)getAPIHandleCL(Handle, "clBuildProgram");
|
|
|
|
clCreateKernelFcnPtr =
|
|
(clCreateKernelFcnTy *)getAPIHandleCL(Handle, "clCreateKernel");
|
|
|
|
clSetKernelArgFcnPtr =
|
|
(clSetKernelArgFcnTy *)getAPIHandleCL(Handle, "clSetKernelArg");
|
|
|
|
clEnqueueNDRangeKernelFcnPtr = (clEnqueueNDRangeKernelFcnTy *)getAPIHandleCL(
|
|
Handle, "clEnqueueNDRangeKernel");
|
|
|
|
clEnqueueReadBufferFcnPtr =
|
|
(clEnqueueReadBufferFcnTy *)getAPIHandleCL(Handle, "clEnqueueReadBuffer");
|
|
|
|
clFlushFcnPtr = (clFlushFcnTy *)getAPIHandleCL(Handle, "clFlush");
|
|
|
|
clFinishFcnPtr = (clFinishFcnTy *)getAPIHandleCL(Handle, "clFinish");
|
|
|
|
clReleaseKernelFcnPtr =
|
|
(clReleaseKernelFcnTy *)getAPIHandleCL(Handle, "clReleaseKernel");
|
|
|
|
clReleaseProgramFcnPtr =
|
|
(clReleaseProgramFcnTy *)getAPIHandleCL(Handle, "clReleaseProgram");
|
|
|
|
clReleaseMemObjectFcnPtr =
|
|
(clReleaseMemObjectFcnTy *)getAPIHandleCL(Handle, "clReleaseMemObject");
|
|
|
|
clReleaseCommandQueueFcnPtr = (clReleaseCommandQueueFcnTy *)getAPIHandleCL(
|
|
Handle, "clReleaseCommandQueue");
|
|
|
|
clReleaseContextFcnPtr =
|
|
(clReleaseContextFcnTy *)getAPIHandleCL(Handle, "clReleaseContext");
|
|
|
|
return 1;
|
|
}
|
|
#pragma GCC diagnostic pop
|
|
|
|
/* Context and Device. */
|
|
static PollyGPUContext *GlobalContext = NULL;
|
|
static cl_device_id GlobalDeviceID = NULL;
|
|
|
|
/* Fd-Decl: Print out OpenCL Error codes to human readable strings. */
|
|
static void printOpenCLError(int Error);
|
|
|
|
static void checkOpenCLError(int Ret, const char *format, ...) {
|
|
if (Ret == CL_SUCCESS)
|
|
return;
|
|
|
|
printOpenCLError(Ret);
|
|
va_list args;
|
|
va_start(args, format);
|
|
vfprintf(stderr, format, args);
|
|
va_end(args);
|
|
exit(-1);
|
|
}
|
|
|
|
static PollyGPUContext *initContextCL() {
|
|
dump_function();
|
|
|
|
PollyGPUContext *Context;
|
|
|
|
cl_platform_id PlatformID = NULL;
|
|
cl_device_id DeviceID = NULL;
|
|
cl_uint NumDevicesRet;
|
|
cl_int Ret;
|
|
|
|
char DeviceRevision[256];
|
|
char DeviceName[256];
|
|
size_t DeviceRevisionRetSize, DeviceNameRetSize;
|
|
|
|
static __thread PollyGPUContext *CurrentContext = NULL;
|
|
|
|
if (CurrentContext)
|
|
return CurrentContext;
|
|
|
|
/* Get API handles. */
|
|
if (initialDeviceAPIsCL() == 0) {
|
|
fprintf(stderr, "Getting the \"handle\" for the OpenCL Runtime failed.\n");
|
|
exit(-1);
|
|
}
|
|
|
|
/* Get number of devices that support OpenCL. */
|
|
static const int NumberOfPlatforms = 1;
|
|
Ret = clGetPlatformIDsFcnPtr(NumberOfPlatforms, &PlatformID, NULL);
|
|
checkOpenCLError(Ret, "Failed to get platform IDs.\n");
|
|
// TODO: Extend to CL_DEVICE_TYPE_ALL?
|
|
static const int NumberOfDevices = 1;
|
|
Ret = clGetDeviceIDsFcnPtr(PlatformID, CL_DEVICE_TYPE_GPU, NumberOfDevices,
|
|
&DeviceID, &NumDevicesRet);
|
|
checkOpenCLError(Ret, "Failed to get device IDs.\n");
|
|
|
|
GlobalDeviceID = DeviceID;
|
|
if (NumDevicesRet == 0) {
|
|
fprintf(stderr, "There is no device supporting OpenCL.\n");
|
|
exit(-1);
|
|
}
|
|
|
|
/* Get device revision. */
|
|
Ret =
|
|
clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_VERSION, sizeof(DeviceRevision),
|
|
DeviceRevision, &DeviceRevisionRetSize);
|
|
checkOpenCLError(Ret, "Failed to fetch device revision.\n");
|
|
|
|
/* Get device name. */
|
|
Ret = clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_NAME, sizeof(DeviceName),
|
|
DeviceName, &DeviceNameRetSize);
|
|
checkOpenCLError(Ret, "Failed to fetch device name.\n");
|
|
|
|
debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName);
|
|
|
|
/* Create context on the device. */
|
|
Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext));
|
|
if (Context == 0) {
|
|
fprintf(stderr, "Allocate memory for Polly GPU context failed.\n");
|
|
exit(-1);
|
|
}
|
|
Context->Context = (OpenCLContext *)malloc(sizeof(OpenCLContext));
|
|
if (Context->Context == 0) {
|
|
fprintf(stderr, "Allocate memory for Polly OpenCL context failed.\n");
|
|
exit(-1);
|
|
}
|
|
((OpenCLContext *)Context->Context)->Context =
|
|
clCreateContextFcnPtr(NULL, NumDevicesRet, &DeviceID, NULL, NULL, &Ret);
|
|
checkOpenCLError(Ret, "Failed to create context.\n");
|
|
|
|
static const int ExtraProperties = 0;
|
|
((OpenCLContext *)Context->Context)->CommandQueue =
|
|
clCreateCommandQueueFcnPtr(((OpenCLContext *)Context->Context)->Context,
|
|
DeviceID, ExtraProperties, &Ret);
|
|
checkOpenCLError(Ret, "Failed to create command queue.\n");
|
|
|
|
if (CacheMode)
|
|
CurrentContext = Context;
|
|
|
|
GlobalContext = Context;
|
|
return Context;
|
|
}
|
|
|
|
static void freeKernelCL(PollyGPUFunction *Kernel) {
|
|
dump_function();
|
|
|
|
if (CacheMode)
|
|
return;
|
|
|
|
if (!GlobalContext) {
|
|
fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
|
|
exit(-1);
|
|
}
|
|
|
|
cl_int Ret;
|
|
Ret = clFlushFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue);
|
|
checkOpenCLError(Ret, "Failed to flush command queue.\n");
|
|
Ret = clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue);
|
|
checkOpenCLError(Ret, "Failed to finish command queue.\n");
|
|
|
|
if (((OpenCLKernel *)Kernel->Kernel)->Kernel) {
|
|
cl_int Ret =
|
|
clReleaseKernelFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Kernel);
|
|
checkOpenCLError(Ret, "Failed to release kernel.\n");
|
|
}
|
|
|
|
if (((OpenCLKernel *)Kernel->Kernel)->Program) {
|
|
cl_int Ret =
|
|
clReleaseProgramFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Program);
|
|
checkOpenCLError(Ret, "Failed to release program.\n");
|
|
}
|
|
|
|
if (Kernel->Kernel)
|
|
free((OpenCLKernel *)Kernel->Kernel);
|
|
|
|
if (Kernel)
|
|
free(Kernel);
|
|
}
|
|
|
|
static PollyGPUFunction *getKernelCL(const char *BinaryBuffer,
|
|
const char *KernelName) {
|
|
dump_function();
|
|
|
|
if (!GlobalContext) {
|
|
fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
|
|
exit(-1);
|
|
}
|
|
|
|
static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE];
|
|
static __thread int NextCacheItem = 0;
|
|
|
|
for (long i = 0; i < KERNEL_CACHE_SIZE; i++) {
|
|
// We exploit here the property that all Polly-ACC kernels are allocated
|
|
// as global constants, hence a pointer comparision is sufficient to
|
|
// determin equality.
|
|
if (KernelCache[i] &&
|
|
((OpenCLKernel *)KernelCache[i]->Kernel)->BinaryString ==
|
|
BinaryBuffer) {
|
|
debug_print(" -> using cached kernel\n");
|
|
return KernelCache[i];
|
|
}
|
|
}
|
|
|
|
PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
|
|
if (Function == 0) {
|
|
fprintf(stderr, "Allocate memory for Polly GPU function failed.\n");
|
|
exit(-1);
|
|
}
|
|
Function->Kernel = (OpenCLKernel *)malloc(sizeof(OpenCLKernel));
|
|
if (Function->Kernel == 0) {
|
|
fprintf(stderr, "Allocate memory for Polly OpenCL kernel failed.\n");
|
|
exit(-1);
|
|
}
|
|
|
|
if (!GlobalDeviceID) {
|
|
fprintf(stderr, "GPGPU-code generation not initialized correctly.\n");
|
|
exit(-1);
|
|
}
|
|
|
|
cl_int Ret;
|
|
|
|
if (HandleOpenCLBeignet) {
|
|
// This is a workaround, since clCreateProgramWithLLVMIntel only
|
|
// accepts a filename to a valid llvm-ir file as an argument, instead
|
|
// of accepting the BinaryBuffer directly.
|
|
char FileName[] = "/tmp/polly_kernelXXXXXX";
|
|
int File = mkstemp(FileName);
|
|
write(File, BinaryBuffer, strlen(BinaryBuffer));
|
|
|
|
((OpenCLKernel *)Function->Kernel)->Program =
|
|
clCreateProgramWithLLVMIntelFcnPtr(
|
|
((OpenCLContext *)GlobalContext->Context)->Context, 1,
|
|
&GlobalDeviceID, FileName, &Ret);
|
|
checkOpenCLError(Ret, "Failed to create program from llvm.\n");
|
|
close(File);
|
|
unlink(FileName);
|
|
} else {
|
|
size_t BinarySize = strlen(BinaryBuffer);
|
|
((OpenCLKernel *)Function->Kernel)->Program =
|
|
clCreateProgramWithBinaryFcnPtr(
|
|
((OpenCLContext *)GlobalContext->Context)->Context, 1,
|
|
&GlobalDeviceID, (const size_t *)&BinarySize,
|
|
(const unsigned char **)&BinaryBuffer, NULL, &Ret);
|
|
checkOpenCLError(Ret, "Failed to create program from binary.\n");
|
|
}
|
|
|
|
Ret = clBuildProgramFcnPtr(((OpenCLKernel *)Function->Kernel)->Program, 1,
|
|
&GlobalDeviceID, NULL, NULL, NULL);
|
|
checkOpenCLError(Ret, "Failed to build program.\n");
|
|
|
|
((OpenCLKernel *)Function->Kernel)->Kernel = clCreateKernelFcnPtr(
|
|
((OpenCLKernel *)Function->Kernel)->Program, KernelName, &Ret);
|
|
checkOpenCLError(Ret, "Failed to create kernel.\n");
|
|
|
|
((OpenCLKernel *)Function->Kernel)->BinaryString = BinaryBuffer;
|
|
|
|
if (CacheMode) {
|
|
if (KernelCache[NextCacheItem])
|
|
freeKernelCL(KernelCache[NextCacheItem]);
|
|
|
|
KernelCache[NextCacheItem] = Function;
|
|
|
|
NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE;
|
|
}
|
|
|
|
return Function;
|
|
}
|
|
|
|
static void copyFromHostToDeviceCL(void *HostData, PollyGPUDevicePtr *DevData,
|
|
long MemSize) {
|
|
dump_function();
|
|
|
|
if (!GlobalContext) {
|
|
fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
|
|
exit(-1);
|
|
}
|
|
|
|
cl_int Ret;
|
|
Ret = clEnqueueWriteBufferFcnPtr(
|
|
((OpenCLContext *)GlobalContext->Context)->CommandQueue,
|
|
((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize,
|
|
HostData, 0, NULL, NULL);
|
|
checkOpenCLError(Ret, "Copying data from host memory to device failed.\n");
|
|
}
|
|
|
|
static void copyFromDeviceToHostCL(PollyGPUDevicePtr *DevData, void *HostData,
|
|
long MemSize) {
|
|
dump_function();
|
|
|
|
if (!GlobalContext) {
|
|
fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
|
|
exit(-1);
|
|
}
|
|
|
|
cl_int Ret;
|
|
Ret = clEnqueueReadBufferFcnPtr(
|
|
((OpenCLContext *)GlobalContext->Context)->CommandQueue,
|
|
((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize,
|
|
HostData, 0, NULL, NULL);
|
|
checkOpenCLError(Ret, "Copying results from device to host memory failed.\n");
|
|
}
|
|
|
|
static void launchKernelCL(PollyGPUFunction *Kernel, unsigned int GridDimX,
|
|
unsigned int GridDimY, unsigned int BlockDimX,
|
|
unsigned int BlockDimY, unsigned int BlockDimZ,
|
|
void **Parameters) {
|
|
dump_function();
|
|
|
|
cl_int Ret;
|
|
cl_uint NumArgs;
|
|
|
|
if (!GlobalContext) {
|
|
fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
|
|
exit(-1);
|
|
}
|
|
|
|
OpenCLKernel *CLKernel = (OpenCLKernel *)Kernel->Kernel;
|
|
Ret = clGetKernelInfoFcnPtr(CLKernel->Kernel, CL_KERNEL_NUM_ARGS,
|
|
sizeof(cl_uint), &NumArgs, NULL);
|
|
checkOpenCLError(Ret, "Failed to get number of kernel arguments.\n");
|
|
|
|
/* Argument sizes are stored at the end of the Parameters array. */
|
|
for (cl_uint i = 0; i < NumArgs; i++) {
|
|
Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i,
|
|
*((int *)Parameters[NumArgs + i]),
|
|
(void *)Parameters[i]);
|
|
checkOpenCLError(Ret, "Failed to set Kernel argument %d.\n", i);
|
|
}
|
|
|
|
unsigned int GridDimZ = 1;
|
|
size_t GlobalWorkSize[3] = {BlockDimX * GridDimX, BlockDimY * GridDimY,
|
|
BlockDimZ * GridDimZ};
|
|
size_t LocalWorkSize[3] = {BlockDimX, BlockDimY, BlockDimZ};
|
|
|
|
static const int WorkDim = 3;
|
|
OpenCLContext *CLContext = (OpenCLContext *)GlobalContext->Context;
|
|
Ret = clEnqueueNDRangeKernelFcnPtr(CLContext->CommandQueue, CLKernel->Kernel,
|
|
WorkDim, NULL, GlobalWorkSize,
|
|
LocalWorkSize, 0, NULL, NULL);
|
|
checkOpenCLError(Ret, "Launching OpenCL kernel failed.\n");
|
|
}
|
|
|
|
static void freeDeviceMemoryCL(PollyGPUDevicePtr *Allocation) {
|
|
dump_function();
|
|
|
|
OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr;
|
|
cl_int Ret = clReleaseMemObjectFcnPtr((cl_mem)DevPtr->MemObj);
|
|
checkOpenCLError(Ret, "Failed to free device memory.\n");
|
|
|
|
free(DevPtr);
|
|
free(Allocation);
|
|
}
|
|
|
|
static PollyGPUDevicePtr *allocateMemoryForDeviceCL(long MemSize) {
|
|
dump_function();
|
|
|
|
if (!GlobalContext) {
|
|
fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
|
|
exit(-1);
|
|
}
|
|
|
|
PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr));
|
|
if (DevData == 0) {
|
|
fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
|
|
exit(-1);
|
|
}
|
|
DevData->DevicePtr = (OpenCLDevicePtr *)malloc(sizeof(OpenCLDevicePtr));
|
|
if (DevData->DevicePtr == 0) {
|
|
fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
|
|
exit(-1);
|
|
}
|
|
|
|
cl_int Ret;
|
|
((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj =
|
|
clCreateBufferFcnPtr(((OpenCLContext *)GlobalContext->Context)->Context,
|
|
CL_MEM_READ_WRITE, MemSize, NULL, &Ret);
|
|
checkOpenCLError(Ret,
|
|
"Allocate memory for GPU device memory pointer failed.\n");
|
|
|
|
return DevData;
|
|
}
|
|
|
|
static void *getDevicePtrCL(PollyGPUDevicePtr *Allocation) {
|
|
dump_function();
|
|
|
|
OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr;
|
|
return (void *)DevPtr->MemObj;
|
|
}
|
|
|
|
static void synchronizeDeviceCL() {
|
|
dump_function();
|
|
|
|
if (!GlobalContext) {
|
|
fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
|
|
exit(-1);
|
|
}
|
|
|
|
if (clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue) !=
|
|
CL_SUCCESS) {
|
|
fprintf(stderr, "Synchronizing device and host memory failed.\n");
|
|
exit(-1);
|
|
}
|
|
}
|
|
|
|
static void freeContextCL(PollyGPUContext *Context) {
|
|
dump_function();
|
|
|
|
cl_int Ret;
|
|
|
|
GlobalContext = NULL;
|
|
|
|
OpenCLContext *Ctx = (OpenCLContext *)Context->Context;
|
|
if (Ctx->CommandQueue) {
|
|
Ret = clReleaseCommandQueueFcnPtr(Ctx->CommandQueue);
|
|
checkOpenCLError(Ret, "Could not release command queue.\n");
|
|
}
|
|
|
|
if (Ctx->Context) {
|
|
Ret = clReleaseContextFcnPtr(Ctx->Context);
|
|
checkOpenCLError(Ret, "Could not release context.\n");
|
|
}
|
|
|
|
free(Ctx);
|
|
free(Context);
|
|
}
|
|
|
|
static void printOpenCLError(int Error) {
|
|
|
|
switch (Error) {
|
|
case CL_SUCCESS:
|
|
// Success, don't print an error.
|
|
break;
|
|
|
|
// JIT/Runtime errors.
|
|
case CL_DEVICE_NOT_FOUND:
|
|
fprintf(stderr, "Device not found.\n");
|
|
break;
|
|
case CL_DEVICE_NOT_AVAILABLE:
|
|
fprintf(stderr, "Device not available.\n");
|
|
break;
|
|
case CL_COMPILER_NOT_AVAILABLE:
|
|
fprintf(stderr, "Compiler not available.\n");
|
|
break;
|
|
case CL_MEM_OBJECT_ALLOCATION_FAILURE:
|
|
fprintf(stderr, "Mem object allocation failure.\n");
|
|
break;
|
|
case CL_OUT_OF_RESOURCES:
|
|
fprintf(stderr, "Out of resources.\n");
|
|
break;
|
|
case CL_OUT_OF_HOST_MEMORY:
|
|
fprintf(stderr, "Out of host memory.\n");
|
|
break;
|
|
case CL_PROFILING_INFO_NOT_AVAILABLE:
|
|
fprintf(stderr, "Profiling info not available.\n");
|
|
break;
|
|
case CL_MEM_COPY_OVERLAP:
|
|
fprintf(stderr, "Mem copy overlap.\n");
|
|
break;
|
|
case CL_IMAGE_FORMAT_MISMATCH:
|
|
fprintf(stderr, "Image format mismatch.\n");
|
|
break;
|
|
case CL_IMAGE_FORMAT_NOT_SUPPORTED:
|
|
fprintf(stderr, "Image format not supported.\n");
|
|
break;
|
|
case CL_BUILD_PROGRAM_FAILURE:
|
|
fprintf(stderr, "Build program failure.\n");
|
|
break;
|
|
case CL_MAP_FAILURE:
|
|
fprintf(stderr, "Map failure.\n");
|
|
break;
|
|
case CL_MISALIGNED_SUB_BUFFER_OFFSET:
|
|
fprintf(stderr, "Misaligned sub buffer offset.\n");
|
|
break;
|
|
case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
|
|
fprintf(stderr, "Exec status error for events in wait list.\n");
|
|
break;
|
|
case CL_COMPILE_PROGRAM_FAILURE:
|
|
fprintf(stderr, "Compile program failure.\n");
|
|
break;
|
|
case CL_LINKER_NOT_AVAILABLE:
|
|
fprintf(stderr, "Linker not available.\n");
|
|
break;
|
|
case CL_LINK_PROGRAM_FAILURE:
|
|
fprintf(stderr, "Link program failure.\n");
|
|
break;
|
|
case CL_DEVICE_PARTITION_FAILED:
|
|
fprintf(stderr, "Device partition failed.\n");
|
|
break;
|
|
case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:
|
|
fprintf(stderr, "Kernel arg info not available.\n");
|
|
break;
|
|
|
|
// Compiler errors.
|
|
case CL_INVALID_VALUE:
|
|
fprintf(stderr, "Invalid value.\n");
|
|
break;
|
|
case CL_INVALID_DEVICE_TYPE:
|
|
fprintf(stderr, "Invalid device type.\n");
|
|
break;
|
|
case CL_INVALID_PLATFORM:
|
|
fprintf(stderr, "Invalid platform.\n");
|
|
break;
|
|
case CL_INVALID_DEVICE:
|
|
fprintf(stderr, "Invalid device.\n");
|
|
break;
|
|
case CL_INVALID_CONTEXT:
|
|
fprintf(stderr, "Invalid context.\n");
|
|
break;
|
|
case CL_INVALID_QUEUE_PROPERTIES:
|
|
fprintf(stderr, "Invalid queue properties.\n");
|
|
break;
|
|
case CL_INVALID_COMMAND_QUEUE:
|
|
fprintf(stderr, "Invalid command queue.\n");
|
|
break;
|
|
case CL_INVALID_HOST_PTR:
|
|
fprintf(stderr, "Invalid host pointer.\n");
|
|
break;
|
|
case CL_INVALID_MEM_OBJECT:
|
|
fprintf(stderr, "Invalid memory object.\n");
|
|
break;
|
|
case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
|
|
fprintf(stderr, "Invalid image format descriptor.\n");
|
|
break;
|
|
case CL_INVALID_IMAGE_SIZE:
|
|
fprintf(stderr, "Invalid image size.\n");
|
|
break;
|
|
case CL_INVALID_SAMPLER:
|
|
fprintf(stderr, "Invalid sampler.\n");
|
|
break;
|
|
case CL_INVALID_BINARY:
|
|
fprintf(stderr, "Invalid binary.\n");
|
|
break;
|
|
case CL_INVALID_BUILD_OPTIONS:
|
|
fprintf(stderr, "Invalid build options.\n");
|
|
break;
|
|
case CL_INVALID_PROGRAM:
|
|
fprintf(stderr, "Invalid program.\n");
|
|
break;
|
|
case CL_INVALID_PROGRAM_EXECUTABLE:
|
|
fprintf(stderr, "Invalid program executable.\n");
|
|
break;
|
|
case CL_INVALID_KERNEL_NAME:
|
|
fprintf(stderr, "Invalid kernel name.\n");
|
|
break;
|
|
case CL_INVALID_KERNEL_DEFINITION:
|
|
fprintf(stderr, "Invalid kernel definition.\n");
|
|
break;
|
|
case CL_INVALID_KERNEL:
|
|
fprintf(stderr, "Invalid kernel.\n");
|
|
break;
|
|
case CL_INVALID_ARG_INDEX:
|
|
fprintf(stderr, "Invalid arg index.\n");
|
|
break;
|
|
case CL_INVALID_ARG_VALUE:
|
|
fprintf(stderr, "Invalid arg value.\n");
|
|
break;
|
|
case CL_INVALID_ARG_SIZE:
|
|
fprintf(stderr, "Invalid arg size.\n");
|
|
break;
|
|
case CL_INVALID_KERNEL_ARGS:
|
|
fprintf(stderr, "Invalid kernel args.\n");
|
|
break;
|
|
case CL_INVALID_WORK_DIMENSION:
|
|
fprintf(stderr, "Invalid work dimension.\n");
|
|
break;
|
|
case CL_INVALID_WORK_GROUP_SIZE:
|
|
fprintf(stderr, "Invalid work group size.\n");
|
|
break;
|
|
case CL_INVALID_WORK_ITEM_SIZE:
|
|
fprintf(stderr, "Invalid work item size.\n");
|
|
break;
|
|
case CL_INVALID_GLOBAL_OFFSET:
|
|
fprintf(stderr, "Invalid global offset.\n");
|
|
break;
|
|
case CL_INVALID_EVENT_WAIT_LIST:
|
|
fprintf(stderr, "Invalid event wait list.\n");
|
|
break;
|
|
case CL_INVALID_EVENT:
|
|
fprintf(stderr, "Invalid event.\n");
|
|
break;
|
|
case CL_INVALID_OPERATION:
|
|
fprintf(stderr, "Invalid operation.\n");
|
|
break;
|
|
case CL_INVALID_GL_OBJECT:
|
|
fprintf(stderr, "Invalid GL object.\n");
|
|
break;
|
|
case CL_INVALID_BUFFER_SIZE:
|
|
fprintf(stderr, "Invalid buffer size.\n");
|
|
break;
|
|
case CL_INVALID_MIP_LEVEL:
|
|
fprintf(stderr, "Invalid mip level.\n");
|
|
break;
|
|
case CL_INVALID_GLOBAL_WORK_SIZE:
|
|
fprintf(stderr, "Invalid global work size.\n");
|
|
break;
|
|
case CL_INVALID_PROPERTY:
|
|
fprintf(stderr, "Invalid property.\n");
|
|
break;
|
|
case CL_INVALID_IMAGE_DESCRIPTOR:
|
|
fprintf(stderr, "Invalid image descriptor.\n");
|
|
break;
|
|
case CL_INVALID_COMPILER_OPTIONS:
|
|
fprintf(stderr, "Invalid compiler options.\n");
|
|
break;
|
|
case CL_INVALID_LINKER_OPTIONS:
|
|
fprintf(stderr, "Invalid linker options.\n");
|
|
break;
|
|
case CL_INVALID_DEVICE_PARTITION_COUNT:
|
|
fprintf(stderr, "Invalid device partition count.\n");
|
|
break;
|
|
case -69: // OpenCL 2.0 Code for CL_INVALID_PIPE_SIZE
|
|
fprintf(stderr, "Invalid pipe size.\n");
|
|
break;
|
|
case -70: // OpenCL 2.0 Code for CL_INVALID_DEVICE_QUEUE
|
|
fprintf(stderr, "Invalid device queue.\n");
|
|
break;
|
|
|
|
// NVIDIA specific error.
|
|
case -9999:
|
|
fprintf(stderr, "NVIDIA invalid read or write buffer.\n");
|
|
break;
|
|
|
|
default:
|
|
fprintf(stderr, "Unknown error code!\n");
|
|
break;
|
|
}
|
|
}
|
|
|
|
#endif /* HAS_LIBOPENCL */
|
|
/******************************************************************************/
|
|
/* CUDA */
|
|
/******************************************************************************/
|
|
#ifdef HAS_LIBCUDART
|
|
|
|
struct CUDAContextT {
|
|
CUcontext Cuda;
|
|
};
|
|
|
|
struct CUDAKernelT {
|
|
CUfunction Cuda;
|
|
CUmodule CudaModule;
|
|
const char *BinaryString;
|
|
};
|
|
|
|
struct CUDADevicePtrT {
|
|
CUdeviceptr Cuda;
|
|
};
|
|
|
|
/* Dynamic library handles for the CUDA and CUDA runtime library. */
|
|
static void *HandleCuda;
|
|
static void *HandleCudaRT;
|
|
|
|
/* Type-defines of function pointer to CUDA driver APIs. */
|
|
typedef CUresult CUDAAPI CuMemAllocFcnTy(CUdeviceptr *, size_t);
|
|
static CuMemAllocFcnTy *CuMemAllocFcnPtr;
|
|
|
|
typedef CUresult CUDAAPI CuMemAllocManagedFcnTy(CUdeviceptr *, size_t,
|
|
unsigned int);
|
|
static CuMemAllocManagedFcnTy *CuMemAllocManagedFcnPtr;
|
|
|
|
typedef CUresult CUDAAPI CuLaunchKernelFcnTy(
|
|
CUfunction F, unsigned int GridDimX, unsigned int GridDimY,
|
|
unsigned int gridDimZ, unsigned int blockDimX, unsigned int BlockDimY,
|
|
unsigned int BlockDimZ, unsigned int SharedMemBytes, CUstream HStream,
|
|
void **KernelParams, void **Extra);
|
|
static CuLaunchKernelFcnTy *CuLaunchKernelFcnPtr;
|
|
|
|
typedef CUresult CUDAAPI CuMemcpyDtoHFcnTy(void *, CUdeviceptr, size_t);
|
|
static CuMemcpyDtoHFcnTy *CuMemcpyDtoHFcnPtr;
|
|
|
|
typedef CUresult CUDAAPI CuMemcpyHtoDFcnTy(CUdeviceptr, const void *, size_t);
|
|
static CuMemcpyHtoDFcnTy *CuMemcpyHtoDFcnPtr;
|
|
|
|
typedef CUresult CUDAAPI CuMemFreeFcnTy(CUdeviceptr);
|
|
static CuMemFreeFcnTy *CuMemFreeFcnPtr;
|
|
|
|
typedef CUresult CUDAAPI CuModuleUnloadFcnTy(CUmodule);
|
|
static CuModuleUnloadFcnTy *CuModuleUnloadFcnPtr;
|
|
|
|
typedef CUresult CUDAAPI CuProfilerStopFcnTy();
|
|
static CuProfilerStopFcnTy *CuProfilerStopFcnPtr;
|
|
|
|
typedef CUresult CUDAAPI CuCtxDestroyFcnTy(CUcontext);
|
|
static CuCtxDestroyFcnTy *CuCtxDestroyFcnPtr;
|
|
|
|
typedef CUresult CUDAAPI CuInitFcnTy(unsigned int);
|
|
static CuInitFcnTy *CuInitFcnPtr;
|
|
|
|
typedef CUresult CUDAAPI CuDeviceGetCountFcnTy(int *);
|
|
static CuDeviceGetCountFcnTy *CuDeviceGetCountFcnPtr;
|
|
|
|
typedef CUresult CUDAAPI CuCtxCreateFcnTy(CUcontext *, unsigned int, CUdevice);
|
|
static CuCtxCreateFcnTy *CuCtxCreateFcnPtr;
|
|
|
|
typedef CUresult CUDAAPI CuCtxGetCurrentFcnTy(CUcontext *);
|
|
static CuCtxGetCurrentFcnTy *CuCtxGetCurrentFcnPtr;
|
|
|
|
typedef CUresult CUDAAPI CuDeviceGetFcnTy(CUdevice *, int);
|
|
static CuDeviceGetFcnTy *CuDeviceGetFcnPtr;
|
|
|
|
typedef CUresult CUDAAPI CuModuleLoadDataExFcnTy(CUmodule *, const void *,
|
|
unsigned int, CUjit_option *,
|
|
void **);
|
|
static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr;
|
|
|
|
typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *Module,
|
|
const void *Image);
|
|
static CuModuleLoadDataFcnTy *CuModuleLoadDataFcnPtr;
|
|
|
|
typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule,
|
|
const char *);
|
|
static CuModuleGetFunctionFcnTy *CuModuleGetFunctionFcnPtr;
|
|
|
|
typedef CUresult CUDAAPI CuDeviceComputeCapabilityFcnTy(int *, int *, CUdevice);
|
|
static CuDeviceComputeCapabilityFcnTy *CuDeviceComputeCapabilityFcnPtr;
|
|
|
|
typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice);
|
|
static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr;
|
|
|
|
typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState State,
|
|
CUjitInputType Type, void *Data,
|
|
size_t Size, const char *Name,
|
|
unsigned int NumOptions,
|
|
CUjit_option *Options,
|
|
void **OptionValues);
|
|
static CuLinkAddDataFcnTy *CuLinkAddDataFcnPtr;
|
|
|
|
typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int NumOptions,
|
|
CUjit_option *Options,
|
|
void **OptionValues,
|
|
CUlinkState *StateOut);
|
|
static CuLinkCreateFcnTy *CuLinkCreateFcnPtr;
|
|
|
|
typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState State, void **CubinOut,
|
|
size_t *SizeOut);
|
|
static CuLinkCompleteFcnTy *CuLinkCompleteFcnPtr;
|
|
|
|
typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState State);
|
|
static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr;
|
|
|
|
typedef CUresult CUDAAPI CuCtxSynchronizeFcnTy();
|
|
static CuCtxSynchronizeFcnTy *CuCtxSynchronizeFcnPtr;
|
|
|
|
/* Type-defines of function pointer ot CUDA runtime APIs. */
|
|
typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void);
|
|
static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr;
|
|
|
|
static void *getAPIHandleCUDA(void *Handle, const char *FuncName) {
|
|
char *Err;
|
|
void *FuncPtr;
|
|
dlerror();
|
|
FuncPtr = dlsym(Handle, FuncName);
|
|
if ((Err = dlerror()) != 0) {
|
|
fprintf(stderr, "Load CUDA driver API failed: %s. \n", Err);
|
|
return 0;
|
|
}
|
|
return FuncPtr;
|
|
}
|
|
|
|
static int initialDeviceAPILibrariesCUDA() {
|
|
HandleCuda = dlopen("libcuda.so", RTLD_LAZY);
|
|
if (!HandleCuda) {
|
|
fprintf(stderr, "Cannot open library: %s. \n", dlerror());
|
|
return 0;
|
|
}
|
|
|
|
HandleCudaRT = dlopen("libcudart.so", RTLD_LAZY);
|
|
if (!HandleCudaRT) {
|
|
fprintf(stderr, "Cannot open library: %s. \n", dlerror());
|
|
return 0;
|
|
}
|
|
|
|
return 1;
|
|
}
|
|
|
|
/* Get function pointer to CUDA Driver APIs.
|
|
*
|
|
* Note that compilers conforming to the ISO C standard are required to
|
|
* generate a warning if a conversion from a void * pointer to a function
|
|
* pointer is attempted as in the following statements. The warning
|
|
* of this kind of cast may not be emitted by clang and new versions of gcc
|
|
* as it is valid on POSIX 2008. For compilers required to generate a warning,
|
|
* we temporarily disable -Wpedantic, to avoid bloating the output with
|
|
* unnecessary warnings.
|
|
*
|
|
* Reference:
|
|
* http://pubs.opengroup.org/onlinepubs/9699919799/functions/dlsym.html
|
|
*/
|
|
#pragma GCC diagnostic push
|
|
#pragma GCC diagnostic ignored "-Wpedantic"
|
|
static int initialDeviceAPIsCUDA() {
|
|
if (initialDeviceAPILibrariesCUDA() == 0)
|
|
return 0;
|
|
|
|
CuLaunchKernelFcnPtr =
|
|
(CuLaunchKernelFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLaunchKernel");
|
|
|
|
CuMemAllocFcnPtr =
|
|
(CuMemAllocFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemAlloc_v2");
|
|
|
|
CuMemAllocManagedFcnPtr = (CuMemAllocManagedFcnTy *)getAPIHandleCUDA(
|
|
HandleCuda, "cuMemAllocManaged");
|
|
|
|
CuMemFreeFcnPtr =
|
|
(CuMemFreeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemFree_v2");
|
|
|
|
CuMemcpyDtoHFcnPtr =
|
|
(CuMemcpyDtoHFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyDtoH_v2");
|
|
|
|
CuMemcpyHtoDFcnPtr =
|
|
(CuMemcpyHtoDFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyHtoD_v2");
|
|
|
|
CuModuleUnloadFcnPtr =
|
|
(CuModuleUnloadFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleUnload");
|
|
|
|
CuProfilerStopFcnPtr =
|
|
(CuProfilerStopFcnTy *)getAPIHandleCUDA(HandleCuda, "cuProfilerStop");
|
|
|
|
CuCtxDestroyFcnPtr =
|
|
(CuCtxDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxDestroy");
|
|
|
|
CuInitFcnPtr = (CuInitFcnTy *)getAPIHandleCUDA(HandleCuda, "cuInit");
|
|
|
|
CuDeviceGetCountFcnPtr =
|
|
(CuDeviceGetCountFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetCount");
|
|
|
|
CuDeviceGetFcnPtr =
|
|
(CuDeviceGetFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGet");
|
|
|
|
CuCtxCreateFcnPtr =
|
|
(CuCtxCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxCreate_v2");
|
|
|
|
CuCtxGetCurrentFcnPtr =
|
|
(CuCtxGetCurrentFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxGetCurrent");
|
|
|
|
CuModuleLoadDataExFcnPtr = (CuModuleLoadDataExFcnTy *)getAPIHandleCUDA(
|
|
HandleCuda, "cuModuleLoadDataEx");
|
|
|
|
CuModuleLoadDataFcnPtr =
|
|
(CuModuleLoadDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleLoadData");
|
|
|
|
CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandleCUDA(
|
|
HandleCuda, "cuModuleGetFunction");
|
|
|
|
CuDeviceComputeCapabilityFcnPtr =
|
|
(CuDeviceComputeCapabilityFcnTy *)getAPIHandleCUDA(
|
|
HandleCuda, "cuDeviceComputeCapability");
|
|
|
|
CuDeviceGetNameFcnPtr =
|
|
(CuDeviceGetNameFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetName");
|
|
|
|
CuLinkAddDataFcnPtr =
|
|
(CuLinkAddDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkAddData");
|
|
|
|
CuLinkCreateFcnPtr =
|
|
(CuLinkCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkCreate");
|
|
|
|
CuLinkCompleteFcnPtr =
|
|
(CuLinkCompleteFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkComplete");
|
|
|
|
CuLinkDestroyFcnPtr =
|
|
(CuLinkDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkDestroy");
|
|
|
|
CuCtxSynchronizeFcnPtr =
|
|
(CuCtxSynchronizeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxSynchronize");
|
|
|
|
/* Get function pointer to CUDA Runtime APIs. */
|
|
CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandleCUDA(
|
|
HandleCudaRT, "cudaThreadSynchronize");
|
|
|
|
return 1;
|
|
}
|
|
#pragma GCC diagnostic pop
|
|
|
|
static PollyGPUContext *initContextCUDA() {
|
|
dump_function();
|
|
PollyGPUContext *Context;
|
|
CUdevice Device;
|
|
|
|
int Major = 0, Minor = 0, DeviceID = 0;
|
|
char DeviceName[256];
|
|
int DeviceCount = 0;
|
|
|
|
static __thread PollyGPUContext *CurrentContext = NULL;
|
|
|
|
if (CurrentContext)
|
|
return CurrentContext;
|
|
|
|
/* Get API handles. */
|
|
if (initialDeviceAPIsCUDA() == 0) {
|
|
fprintf(stderr, "Getting the \"handle\" for the CUDA driver API failed.\n");
|
|
exit(-1);
|
|
}
|
|
|
|
if (CuInitFcnPtr(0) != CUDA_SUCCESS) {
|
|
fprintf(stderr, "Initializing the CUDA driver API failed.\n");
|
|
exit(-1);
|
|
}
|
|
|
|
/* Get number of devices that supports CUDA. */
|
|
CuDeviceGetCountFcnPtr(&DeviceCount);
|
|
if (DeviceCount == 0) {
|
|
fprintf(stderr, "There is no device supporting CUDA.\n");
|
|
exit(-1);
|
|
}
|
|
|
|
CuDeviceGetFcnPtr(&Device, 0);
|
|
|
|
/* Get compute capabilities and the device name. */
|
|
CuDeviceComputeCapabilityFcnPtr(&Major, &Minor, Device);
|
|
CuDeviceGetNameFcnPtr(DeviceName, 256, Device);
|
|
debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName);
|
|
|
|
/* Create context on the device. */
|
|
Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext));
|
|
if (Context == 0) {
|
|
fprintf(stderr, "Allocate memory for Polly GPU context failed.\n");
|
|
exit(-1);
|
|
}
|
|
Context->Context = malloc(sizeof(CUDAContext));
|
|
if (Context->Context == 0) {
|
|
fprintf(stderr, "Allocate memory for Polly CUDA context failed.\n");
|
|
exit(-1);
|
|
}
|
|
|
|
// In cases where managed memory is used, it is quite likely that
|
|
// `cudaMallocManaged` / `polly_mallocManaged` was called before
|
|
// `polly_initContext` was called.
|
|
//
|
|
// If `polly_initContext` calls `CuCtxCreate` when there already was a
|
|
// pre-existing context created by the runtime API, this causes code running
|
|
// on P100 to hang. So, we query for a pre-existing context to try and use.
|
|
// If there is no pre-existing context, we create a new context
|
|
|
|
// The possible pre-existing context from previous runtime API calls.
|
|
CUcontext MaybeRuntimeAPIContext;
|
|
if (CuCtxGetCurrentFcnPtr(&MaybeRuntimeAPIContext) != CUDA_SUCCESS) {
|
|
fprintf(stderr, "cuCtxGetCurrent failed.\n");
|
|
exit(-1);
|
|
}
|
|
|
|
// There was no previous context, initialise it.
|
|
if (MaybeRuntimeAPIContext == NULL) {
|
|
if (CuCtxCreateFcnPtr(&(((CUDAContext *)Context->Context)->Cuda), 0,
|
|
Device) != CUDA_SUCCESS) {
|
|
fprintf(stderr, "cuCtxCreateFcnPtr failed.\n");
|
|
exit(-1);
|
|
}
|
|
} else {
|
|
((CUDAContext *)Context->Context)->Cuda = MaybeRuntimeAPIContext;
|
|
}
|
|
|
|
if (CacheMode)
|
|
CurrentContext = Context;
|
|
|
|
return Context;
|
|
}
|
|
|
|
static void freeKernelCUDA(PollyGPUFunction *Kernel) {
|
|
dump_function();
|
|
|
|
if (CacheMode)
|
|
return;
|
|
|
|
if (((CUDAKernel *)Kernel->Kernel)->CudaModule)
|
|
CuModuleUnloadFcnPtr(((CUDAKernel *)Kernel->Kernel)->CudaModule);
|
|
|
|
if (Kernel->Kernel)
|
|
free((CUDAKernel *)Kernel->Kernel);
|
|
|
|
if (Kernel)
|
|
free(Kernel);
|
|
}
|
|
|
|
static PollyGPUFunction *getKernelCUDA(const char *BinaryBuffer,
|
|
const char *KernelName) {
|
|
dump_function();
|
|
|
|
static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE];
|
|
static __thread int NextCacheItem = 0;
|
|
|
|
for (long i = 0; i < KERNEL_CACHE_SIZE; i++) {
|
|
// We exploit here the property that all Polly-ACC kernels are allocated
|
|
// as global constants, hence a pointer comparision is sufficient to
|
|
// determin equality.
|
|
if (KernelCache[i] &&
|
|
((CUDAKernel *)KernelCache[i]->Kernel)->BinaryString == BinaryBuffer) {
|
|
debug_print(" -> using cached kernel\n");
|
|
return KernelCache[i];
|
|
}
|
|
}
|
|
|
|
PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
|
|
if (Function == 0) {
|
|
fprintf(stderr, "Allocate memory for Polly GPU function failed.\n");
|
|
exit(-1);
|
|
}
|
|
Function->Kernel = (CUDAKernel *)malloc(sizeof(CUDAKernel));
|
|
if (Function->Kernel == 0) {
|
|
fprintf(stderr, "Allocate memory for Polly CUDA function failed.\n");
|
|
exit(-1);
|
|
}
|
|
|
|
CUresult Res;
|
|
CUlinkState LState;
|
|
CUjit_option Options[6];
|
|
void *OptionVals[6];
|
|
float Walltime = 0;
|
|
unsigned long LogSize = 8192;
|
|
char ErrorLog[8192], InfoLog[8192];
|
|
void *CuOut;
|
|
size_t OutSize;
|
|
|
|
// Setup linker options
|
|
// Return walltime from JIT compilation
|
|
Options[0] = CU_JIT_WALL_TIME;
|
|
OptionVals[0] = (void *)&Walltime;
|
|
// Pass a buffer for info messages
|
|
Options[1] = CU_JIT_INFO_LOG_BUFFER;
|
|
OptionVals[1] = (void *)InfoLog;
|
|
// Pass the size of the info buffer
|
|
Options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
|
|
OptionVals[2] = (void *)LogSize;
|
|
// Pass a buffer for error message
|
|
Options[3] = CU_JIT_ERROR_LOG_BUFFER;
|
|
OptionVals[3] = (void *)ErrorLog;
|
|
// Pass the size of the error buffer
|
|
Options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
|
|
OptionVals[4] = (void *)LogSize;
|
|
// Make the linker verbose
|
|
Options[5] = CU_JIT_LOG_VERBOSE;
|
|
OptionVals[5] = (void *)1;
|
|
|
|
memset(ErrorLog, 0, sizeof(ErrorLog));
|
|
|
|
CuLinkCreateFcnPtr(6, Options, OptionVals, &LState);
|
|
Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)BinaryBuffer,
|
|
strlen(BinaryBuffer) + 1, 0, 0, 0, 0);
|
|
if (Res != CUDA_SUCCESS) {
|
|
fprintf(stderr, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog);
|
|
exit(-1);
|
|
}
|
|
|
|
Res = CuLinkCompleteFcnPtr(LState, &CuOut, &OutSize);
|
|
if (Res != CUDA_SUCCESS) {
|
|
fprintf(stderr, "Complete ptx linker step failed.\n");
|
|
fprintf(stderr, "\n%s\n", ErrorLog);
|
|
exit(-1);
|
|
}
|
|
|
|
debug_print("CUDA Link Completed in %fms. Linker Output:\n%s\n", Walltime,
|
|
InfoLog);
|
|
|
|
Res = CuModuleLoadDataFcnPtr(&(((CUDAKernel *)Function->Kernel)->CudaModule),
|
|
CuOut);
|
|
if (Res != CUDA_SUCCESS) {
|
|
fprintf(stderr, "Loading ptx assembly text failed.\n");
|
|
exit(-1);
|
|
}
|
|
|
|
Res = CuModuleGetFunctionFcnPtr(&(((CUDAKernel *)Function->Kernel)->Cuda),
|
|
((CUDAKernel *)Function->Kernel)->CudaModule,
|
|
KernelName);
|
|
if (Res != CUDA_SUCCESS) {
|
|
fprintf(stderr, "Loading kernel function failed.\n");
|
|
exit(-1);
|
|
}
|
|
|
|
CuLinkDestroyFcnPtr(LState);
|
|
|
|
((CUDAKernel *)Function->Kernel)->BinaryString = BinaryBuffer;
|
|
|
|
if (CacheMode) {
|
|
if (KernelCache[NextCacheItem])
|
|
freeKernelCUDA(KernelCache[NextCacheItem]);
|
|
|
|
KernelCache[NextCacheItem] = Function;
|
|
|
|
NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE;
|
|
}
|
|
|
|
return Function;
|
|
}
|
|
|
|
static void synchronizeDeviceCUDA() {
|
|
dump_function();
|
|
if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS) {
|
|
fprintf(stderr, "Synchronizing device and host memory failed.\n");
|
|
exit(-1);
|
|
}
|
|
}
|
|
|
|
static void copyFromHostToDeviceCUDA(void *HostData, PollyGPUDevicePtr *DevData,
|
|
long MemSize) {
|
|
dump_function();
|
|
|
|
CUdeviceptr CuDevData = ((CUDADevicePtr *)DevData->DevicePtr)->Cuda;
|
|
CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize);
|
|
}
|
|
|
|
static void copyFromDeviceToHostCUDA(PollyGPUDevicePtr *DevData, void *HostData,
|
|
long MemSize) {
|
|
dump_function();
|
|
|
|
if (CuMemcpyDtoHFcnPtr(HostData, ((CUDADevicePtr *)DevData->DevicePtr)->Cuda,
|
|
MemSize) != CUDA_SUCCESS) {
|
|
fprintf(stderr, "Copying results from device to host memory failed.\n");
|
|
exit(-1);
|
|
}
|
|
}
|
|
|
|
static void launchKernelCUDA(PollyGPUFunction *Kernel, unsigned int GridDimX,
|
|
unsigned int GridDimY, unsigned int BlockDimX,
|
|
unsigned int BlockDimY, unsigned int BlockDimZ,
|
|
void **Parameters) {
|
|
dump_function();
|
|
|
|
unsigned GridDimZ = 1;
|
|
unsigned int SharedMemBytes = CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE;
|
|
CUstream Stream = 0;
|
|
void **Extra = 0;
|
|
|
|
CUresult Res;
|
|
Res =
|
|
CuLaunchKernelFcnPtr(((CUDAKernel *)Kernel->Kernel)->Cuda, GridDimX,
|
|
GridDimY, GridDimZ, BlockDimX, BlockDimY, BlockDimZ,
|
|
SharedMemBytes, Stream, Parameters, Extra);
|
|
if (Res != CUDA_SUCCESS) {
|
|
fprintf(stderr, "Launching CUDA kernel failed.\n");
|
|
exit(-1);
|
|
}
|
|
}
|
|
|
|
// Maximum number of managed memory pointers.
|
|
#define DEFAULT_MAX_POINTERS 4000
|
|
// For the rationale behing a list of free pointers, see `polly_freeManaged`.
|
|
void **g_managedptrs;
|
|
unsigned long long g_nmanagedptrs = 0;
|
|
unsigned long long g_maxmanagedptrs = 0;
|
|
|
|
__attribute__((constructor)) static void initManagedPtrsBuffer() {
|
|
g_maxmanagedptrs = DEFAULT_MAX_POINTERS;
|
|
const char *maxManagedPointersString = getenv("POLLY_MAX_MANAGED_POINTERS");
|
|
if (maxManagedPointersString)
|
|
g_maxmanagedptrs = atoll(maxManagedPointersString);
|
|
|
|
g_managedptrs = (void **)malloc(sizeof(void *) * g_maxmanagedptrs);
|
|
}
|
|
|
|
// Add a pointer as being allocated by cuMallocManaged
|
|
void addManagedPtr(void *mem) {
|
|
assert(g_maxmanagedptrs > 0 && "g_maxmanagedptrs was set to 0!");
|
|
assert(g_nmanagedptrs < g_maxmanagedptrs &&
|
|
"We have hit the maximum number of "
|
|
"managed pointers allowed. Set the "
|
|
"POLLY_MAX_MANAGED_POINTERS environment variable. ");
|
|
g_managedptrs[g_nmanagedptrs++] = mem;
|
|
}
|
|
|
|
int isManagedPtr(void *mem) {
|
|
for (unsigned long long i = 0; i < g_nmanagedptrs; i++) {
|
|
if (g_managedptrs[i] == mem)
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
void freeManagedCUDA(void *mem) {
|
|
dump_function();
|
|
|
|
// In a real-world program this was used (COSMO), there were more `free`
|
|
// calls in the original source than `malloc` calls. Hence, replacing all
|
|
// `free`s with `cudaFree` does not work, since we would try to free
|
|
// 'illegal' memory.
|
|
// As a quick fix, we keep a free list and check if `mem` is a managed memory
|
|
// pointer. If it is, we call `cudaFree`.
|
|
// If not, we pass it along to the underlying allocator.
|
|
// This is a hack, and can be removed if the underlying issue is fixed.
|
|
if (isManagedPtr(mem)) {
|
|
if (CuMemFreeFcnPtr((size_t)mem) != CUDA_SUCCESS) {
|
|
fprintf(stderr, "cudaFree failed.\n");
|
|
exit(-1);
|
|
}
|
|
return;
|
|
} else {
|
|
free(mem);
|
|
}
|
|
}
|
|
|
|
void *mallocManagedCUDA(size_t size) {
|
|
// Note: [Size 0 allocations]
|
|
// Sometimes, some runtime computation of size could create a size of 0
|
|
// for an allocation. In these cases, we do not wish to fail.
|
|
// The CUDA API fails on size 0 allocations.
|
|
// So, we allocate size a minimum of size 1.
|
|
if (!size && DebugMode)
|
|
fprintf(stderr, "cudaMallocManaged called with size 0. "
|
|
"Promoting to size 1");
|
|
size = max(size, 1);
|
|
PollyGPUContext *_ = polly_initContextCUDA();
|
|
assert(_ && "polly_initContextCUDA failed");
|
|
|
|
void *newMemPtr;
|
|
const CUresult Res = CuMemAllocManagedFcnPtr((CUdeviceptr *)&newMemPtr, size,
|
|
CU_MEM_ATTACH_GLOBAL);
|
|
if (Res != CUDA_SUCCESS) {
|
|
fprintf(stderr, "cudaMallocManaged failed for size: %zu\n", size);
|
|
exit(-1);
|
|
}
|
|
addManagedPtr(newMemPtr);
|
|
return newMemPtr;
|
|
}
|
|
|
|
static void freeDeviceMemoryCUDA(PollyGPUDevicePtr *Allocation) {
|
|
dump_function();
|
|
CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr;
|
|
CuMemFreeFcnPtr((CUdeviceptr)DevPtr->Cuda);
|
|
free(DevPtr);
|
|
free(Allocation);
|
|
}
|
|
|
|
static PollyGPUDevicePtr *allocateMemoryForDeviceCUDA(long MemSize) {
|
|
if (!MemSize && DebugMode)
|
|
fprintf(stderr, "allocateMemoryForDeviceCUDA called with size 0. "
|
|
"Promoting to size 1");
|
|
// see: [Size 0 allocations]
|
|
MemSize = max(MemSize, 1);
|
|
dump_function();
|
|
|
|
PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr));
|
|
if (DevData == 0) {
|
|
fprintf(stderr,
|
|
"Allocate memory for GPU device memory pointer failed."
|
|
" Line: %d | Size: %ld\n",
|
|
__LINE__, MemSize);
|
|
exit(-1);
|
|
}
|
|
DevData->DevicePtr = (CUDADevicePtr *)malloc(sizeof(CUDADevicePtr));
|
|
if (DevData->DevicePtr == 0) {
|
|
fprintf(stderr,
|
|
"Allocate memory for GPU device memory pointer failed."
|
|
" Line: %d | Size: %ld\n",
|
|
__LINE__, MemSize);
|
|
exit(-1);
|
|
}
|
|
|
|
CUresult Res =
|
|
CuMemAllocFcnPtr(&(((CUDADevicePtr *)DevData->DevicePtr)->Cuda), MemSize);
|
|
|
|
if (Res != CUDA_SUCCESS) {
|
|
fprintf(stderr,
|
|
"Allocate memory for GPU device memory pointer failed."
|
|
" Line: %d | Size: %ld\n",
|
|
__LINE__, MemSize);
|
|
exit(-1);
|
|
}
|
|
|
|
return DevData;
|
|
}
|
|
|
|
static void *getDevicePtrCUDA(PollyGPUDevicePtr *Allocation) {
|
|
dump_function();
|
|
|
|
CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr;
|
|
return (void *)DevPtr->Cuda;
|
|
}
|
|
|
|
static void freeContextCUDA(PollyGPUContext *Context) {
|
|
dump_function();
|
|
|
|
CUDAContext *Ctx = (CUDAContext *)Context->Context;
|
|
if (Ctx->Cuda) {
|
|
CuProfilerStopFcnPtr();
|
|
CuCtxDestroyFcnPtr(Ctx->Cuda);
|
|
free(Ctx);
|
|
free(Context);
|
|
}
|
|
|
|
dlclose(HandleCuda);
|
|
dlclose(HandleCudaRT);
|
|
}
|
|
|
|
#endif /* HAS_LIBCUDART */
|
|
/******************************************************************************/
|
|
/* API */
|
|
/******************************************************************************/
|
|
|
|
PollyGPUContext *polly_initContext() {
|
|
DebugMode = getenv("POLLY_DEBUG") != 0;
|
|
CacheMode = getenv("POLLY_NOCACHE") == 0;
|
|
|
|
dump_function();
|
|
|
|
PollyGPUContext *Context;
|
|
|
|
switch (Runtime) {
|
|
#ifdef HAS_LIBCUDART
|
|
case RUNTIME_CUDA:
|
|
Context = initContextCUDA();
|
|
break;
|
|
#endif /* HAS_LIBCUDART */
|
|
#ifdef HAS_LIBOPENCL
|
|
case RUNTIME_CL:
|
|
Context = initContextCL();
|
|
break;
|
|
#endif /* HAS_LIBOPENCL */
|
|
default:
|
|
err_runtime();
|
|
}
|
|
|
|
return Context;
|
|
}
|
|
|
|
void polly_freeKernel(PollyGPUFunction *Kernel) {
|
|
dump_function();
|
|
|
|
switch (Runtime) {
|
|
#ifdef HAS_LIBCUDART
|
|
case RUNTIME_CUDA:
|
|
freeKernelCUDA(Kernel);
|
|
break;
|
|
#endif /* HAS_LIBCUDART */
|
|
#ifdef HAS_LIBOPENCL
|
|
case RUNTIME_CL:
|
|
freeKernelCL(Kernel);
|
|
break;
|
|
#endif /* HAS_LIBOPENCL */
|
|
default:
|
|
err_runtime();
|
|
}
|
|
}
|
|
|
|
PollyGPUFunction *polly_getKernel(const char *BinaryBuffer,
|
|
const char *KernelName) {
|
|
dump_function();
|
|
|
|
PollyGPUFunction *Function;
|
|
|
|
switch (Runtime) {
|
|
#ifdef HAS_LIBCUDART
|
|
case RUNTIME_CUDA:
|
|
Function = getKernelCUDA(BinaryBuffer, KernelName);
|
|
break;
|
|
#endif /* HAS_LIBCUDART */
|
|
#ifdef HAS_LIBOPENCL
|
|
case RUNTIME_CL:
|
|
Function = getKernelCL(BinaryBuffer, KernelName);
|
|
break;
|
|
#endif /* HAS_LIBOPENCL */
|
|
default:
|
|
err_runtime();
|
|
}
|
|
|
|
return Function;
|
|
}
|
|
|
|
void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
|
|
long MemSize) {
|
|
dump_function();
|
|
|
|
switch (Runtime) {
|
|
#ifdef HAS_LIBCUDART
|
|
case RUNTIME_CUDA:
|
|
copyFromHostToDeviceCUDA(HostData, DevData, MemSize);
|
|
break;
|
|
#endif /* HAS_LIBCUDART */
|
|
#ifdef HAS_LIBOPENCL
|
|
case RUNTIME_CL:
|
|
copyFromHostToDeviceCL(HostData, DevData, MemSize);
|
|
break;
|
|
#endif /* HAS_LIBOPENCL */
|
|
default:
|
|
err_runtime();
|
|
}
|
|
}
|
|
|
|
void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
|
|
long MemSize) {
|
|
dump_function();
|
|
|
|
switch (Runtime) {
|
|
#ifdef HAS_LIBCUDART
|
|
case RUNTIME_CUDA:
|
|
copyFromDeviceToHostCUDA(DevData, HostData, MemSize);
|
|
break;
|
|
#endif /* HAS_LIBCUDART */
|
|
#ifdef HAS_LIBOPENCL
|
|
case RUNTIME_CL:
|
|
copyFromDeviceToHostCL(DevData, HostData, MemSize);
|
|
break;
|
|
#endif /* HAS_LIBOPENCL */
|
|
default:
|
|
err_runtime();
|
|
}
|
|
}
|
|
|
|
void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
|
|
unsigned int GridDimY, unsigned int BlockDimX,
|
|
unsigned int BlockDimY, unsigned int BlockDimZ,
|
|
void **Parameters) {
|
|
dump_function();
|
|
|
|
switch (Runtime) {
|
|
#ifdef HAS_LIBCUDART
|
|
case RUNTIME_CUDA:
|
|
launchKernelCUDA(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
|
|
BlockDimZ, Parameters);
|
|
break;
|
|
#endif /* HAS_LIBCUDART */
|
|
#ifdef HAS_LIBOPENCL
|
|
case RUNTIME_CL:
|
|
launchKernelCL(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY, BlockDimZ,
|
|
Parameters);
|
|
break;
|
|
#endif /* HAS_LIBOPENCL */
|
|
default:
|
|
err_runtime();
|
|
}
|
|
}
|
|
|
|
void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) {
|
|
dump_function();
|
|
|
|
switch (Runtime) {
|
|
#ifdef HAS_LIBCUDART
|
|
case RUNTIME_CUDA:
|
|
freeDeviceMemoryCUDA(Allocation);
|
|
break;
|
|
#endif /* HAS_LIBCUDART */
|
|
#ifdef HAS_LIBOPENCL
|
|
case RUNTIME_CL:
|
|
freeDeviceMemoryCL(Allocation);
|
|
break;
|
|
#endif /* HAS_LIBOPENCL */
|
|
default:
|
|
err_runtime();
|
|
}
|
|
}
|
|
|
|
PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) {
|
|
dump_function();
|
|
|
|
PollyGPUDevicePtr *DevData;
|
|
|
|
switch (Runtime) {
|
|
#ifdef HAS_LIBCUDART
|
|
case RUNTIME_CUDA:
|
|
DevData = allocateMemoryForDeviceCUDA(MemSize);
|
|
break;
|
|
#endif /* HAS_LIBCUDART */
|
|
#ifdef HAS_LIBOPENCL
|
|
case RUNTIME_CL:
|
|
DevData = allocateMemoryForDeviceCL(MemSize);
|
|
break;
|
|
#endif /* HAS_LIBOPENCL */
|
|
default:
|
|
err_runtime();
|
|
}
|
|
|
|
return DevData;
|
|
}
|
|
|
|
void *polly_getDevicePtr(PollyGPUDevicePtr *Allocation) {
|
|
dump_function();
|
|
|
|
void *DevPtr;
|
|
|
|
switch (Runtime) {
|
|
#ifdef HAS_LIBCUDART
|
|
case RUNTIME_CUDA:
|
|
DevPtr = getDevicePtrCUDA(Allocation);
|
|
break;
|
|
#endif /* HAS_LIBCUDART */
|
|
#ifdef HAS_LIBOPENCL
|
|
case RUNTIME_CL:
|
|
DevPtr = getDevicePtrCL(Allocation);
|
|
break;
|
|
#endif /* HAS_LIBOPENCL */
|
|
default:
|
|
err_runtime();
|
|
}
|
|
|
|
return DevPtr;
|
|
}
|
|
|
|
void polly_synchronizeDevice() {
|
|
dump_function();
|
|
|
|
switch (Runtime) {
|
|
#ifdef HAS_LIBCUDART
|
|
case RUNTIME_CUDA:
|
|
synchronizeDeviceCUDA();
|
|
break;
|
|
#endif /* HAS_LIBCUDART */
|
|
#ifdef HAS_LIBOPENCL
|
|
case RUNTIME_CL:
|
|
synchronizeDeviceCL();
|
|
break;
|
|
#endif /* HAS_LIBOPENCL */
|
|
default:
|
|
err_runtime();
|
|
}
|
|
}
|
|
|
|
void polly_freeContext(PollyGPUContext *Context) {
|
|
dump_function();
|
|
|
|
if (CacheMode)
|
|
return;
|
|
|
|
switch (Runtime) {
|
|
#ifdef HAS_LIBCUDART
|
|
case RUNTIME_CUDA:
|
|
freeContextCUDA(Context);
|
|
break;
|
|
#endif /* HAS_LIBCUDART */
|
|
#ifdef HAS_LIBOPENCL
|
|
case RUNTIME_CL:
|
|
freeContextCL(Context);
|
|
break;
|
|
#endif /* HAS_LIBOPENCL */
|
|
default:
|
|
err_runtime();
|
|
}
|
|
}
|
|
|
|
void polly_freeManaged(void *mem) {
|
|
dump_function();
|
|
|
|
#ifdef HAS_LIBCUDART
|
|
freeManagedCUDA(mem);
|
|
#else
|
|
fprintf(stderr, "No CUDA Runtime. Managed memory only supported by CUDA\n");
|
|
exit(-1);
|
|
#endif
|
|
}
|
|
|
|
void *polly_mallocManaged(size_t size) {
|
|
dump_function();
|
|
|
|
#ifdef HAS_LIBCUDART
|
|
return mallocManagedCUDA(size);
|
|
#else
|
|
fprintf(stderr, "No CUDA Runtime. Managed memory only supported by CUDA\n");
|
|
exit(-1);
|
|
#endif
|
|
}
|
|
|
|
/* Initialize GPUJIT with CUDA as runtime library. */
|
|
PollyGPUContext *polly_initContextCUDA() {
|
|
#ifdef HAS_LIBCUDART
|
|
Runtime = RUNTIME_CUDA;
|
|
return polly_initContext();
|
|
#else
|
|
fprintf(stderr, "GPU Runtime was built without CUDA support.\n");
|
|
exit(-1);
|
|
#endif /* HAS_LIBCUDART */
|
|
}
|
|
|
|
/* Initialize GPUJIT with OpenCL as runtime library. */
|
|
PollyGPUContext *polly_initContextCL() {
|
|
#ifdef HAS_LIBOPENCL
|
|
Runtime = RUNTIME_CL;
|
|
return polly_initContext();
|
|
#else
|
|
fprintf(stderr, "GPU Runtime was built without OpenCL support.\n");
|
|
exit(-1);
|
|
#endif /* HAS_LIBOPENCL */
|
|
}
|