forked from OSchip/llvm-project
511 lines
18 KiB
C++
511 lines
18 KiB
C++
//===--- cuda_acxxel.cpp - CUDA implementation of the Acxxel API ----------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
///
|
|
/// This file defines the standard CUDA implementation of the Acxxel API.
|
|
///
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "acxxel.h"
|
|
|
|
#include "cuda.h"
|
|
#include "cuda_runtime.h"
|
|
|
|
#include <array>
|
|
#include <cassert>
|
|
#include <sstream>
|
|
#include <vector>
|
|
|
|
namespace acxxel {
|
|
|
|
namespace {
|
|
|
|
static std::string getCUErrorMessage(CUresult Result) {
|
|
if (!Result)
|
|
return "success";
|
|
const char *ErrorName = "UNKNOWN_ERROR_NAME";
|
|
const char *ErrorDescription = "UNKNOWN_ERROR_DESCRIPTION";
|
|
cuGetErrorName(Result, &ErrorName);
|
|
cuGetErrorString(Result, &ErrorDescription);
|
|
std::ostringstream OutStream;
|
|
OutStream << "CUDA driver error: code = " << Result
|
|
<< ", name = " << ErrorName
|
|
<< ", description = " << ErrorDescription;
|
|
return OutStream.str();
|
|
}
|
|
|
|
static Status getCUError(CUresult Result, const std::string &Message) {
|
|
if (!Result)
|
|
return Status();
|
|
std::ostringstream OutStream;
|
|
OutStream << getCUErrorMessage(Result) << ", message = " << Message;
|
|
return Status(OutStream.str());
|
|
}
|
|
|
|
static std::string getCUDAErrorMessage(cudaError_t E) {
|
|
if (!E)
|
|
return "success";
|
|
std::ostringstream OutStream;
|
|
OutStream << "CUDA runtime error: code = " << E
|
|
<< ", name = " << cudaGetErrorName(E)
|
|
<< ", description = " << cudaGetErrorString(E);
|
|
return OutStream.str();
|
|
}
|
|
|
|
static Status getCUDAError(cudaError_t E, const std::string &Message) {
|
|
if (!E)
|
|
return Status();
|
|
std::ostringstream OutStream;
|
|
OutStream << getCUDAErrorMessage(E) << ", message = " << Message;
|
|
return Status(OutStream.str());
|
|
}
|
|
|
|
static void logCUWarning(CUresult Result, const std::string &Message) {
|
|
if (Result) {
|
|
std::ostringstream OutStream;
|
|
OutStream << Message << ": " << getCUErrorMessage(Result);
|
|
logWarning(OutStream.str());
|
|
}
|
|
}
|
|
|
|
/// A CUDA Platform implementation.
|
|
class CUDAPlatform : public Platform {
|
|
public:
|
|
~CUDAPlatform() override = default;
|
|
|
|
static Expected<CUDAPlatform> create();
|
|
|
|
Expected<int> getDeviceCount() override;
|
|
|
|
Expected<Stream> createStream(int DeviceIndex) override;
|
|
|
|
Status streamSync(void *Stream) override;
|
|
|
|
Status streamWaitOnEvent(void *Stream, void *Event) override;
|
|
|
|
Expected<Event> createEvent(int DeviceIndex) override;
|
|
|
|
protected:
|
|
Expected<void *> rawMallocD(ptrdiff_t ByteCount, int DeviceIndex) override;
|
|
HandleDestructor getDeviceMemoryHandleDestructor() override;
|
|
void *getDeviceMemorySpanHandle(void *BaseHandle, size_t ByteSize,
|
|
size_t ByteOffset) override;
|
|
virtual void rawDestroyDeviceMemorySpanHandle(void *Handle) override;
|
|
|
|
Expected<void *> rawGetDeviceSymbolAddress(const void *Symbol,
|
|
int DeviceIndex) override;
|
|
Expected<ptrdiff_t> rawGetDeviceSymbolSize(const void *Symbol,
|
|
int DeviceIndex) override;
|
|
|
|
Status rawRegisterHostMem(const void *Memory, ptrdiff_t ByteCount) override;
|
|
HandleDestructor getUnregisterHostMemoryHandleDestructor() override;
|
|
|
|
Expected<void *> rawMallocRegisteredH(ptrdiff_t ByteCount) override;
|
|
HandleDestructor getFreeHostMemoryHandleDestructor() override;
|
|
|
|
Status asyncCopyDToD(const void *DeviceSrc, ptrdiff_t DeviceSrcByteOffset,
|
|
void *DeviceDst, ptrdiff_t DeviceDstByteOffset,
|
|
ptrdiff_t ByteCount, void *Stream) override;
|
|
Status asyncCopyDToH(const void *DeviceSrc, ptrdiff_t DeviceSrcByteOffset,
|
|
void *HostDst, ptrdiff_t ByteCount,
|
|
void *Stream) override;
|
|
Status asyncCopyHToD(const void *HostSrc, void *DeviceDst,
|
|
ptrdiff_t DeviceDstByteOffset, ptrdiff_t ByteCount,
|
|
void *Stream) override;
|
|
|
|
Status asyncMemsetD(void *DeviceDst, ptrdiff_t ByteOffset,
|
|
ptrdiff_t ByteCount, char ByteValue,
|
|
void *Stream) override;
|
|
|
|
Status addStreamCallback(Stream &Stream, StreamCallback Callback) override;
|
|
|
|
Expected<Program> createProgramFromSource(Span<const char> Source,
|
|
int DeviceIndex) override;
|
|
|
|
Status enqueueEvent(void *Event, void *Stream) override;
|
|
bool eventIsDone(void *Event) override;
|
|
Status eventSync(void *Event) override;
|
|
Expected<float> getSecondsBetweenEvents(void *StartEvent,
|
|
void *EndEvent) override;
|
|
|
|
Expected<void *> rawCreateKernel(void *Program,
|
|
const std::string &Name) override;
|
|
HandleDestructor getKernelHandleDestructor() override;
|
|
|
|
Status rawEnqueueKernelLaunch(void *Stream, void *Kernel,
|
|
KernelLaunchDimensions LaunchDimensions,
|
|
Span<void *> Arguments,
|
|
Span<size_t> ArgumentSizes,
|
|
size_t SharedMemoryBytes) override;
|
|
|
|
private:
|
|
explicit CUDAPlatform(const std::vector<CUcontext> &Contexts)
|
|
: TheContexts(Contexts) {}
|
|
|
|
Status setContext(int DeviceIndex) {
|
|
if (DeviceIndex < 0 ||
|
|
static_cast<size_t>(DeviceIndex) >= TheContexts.size())
|
|
return Status("invalid deivce index " + std::to_string(DeviceIndex));
|
|
return getCUError(cuCtxSetCurrent(TheContexts[DeviceIndex]),
|
|
"cuCtxSetCurrent");
|
|
}
|
|
|
|
// Vector of contexts for each device.
|
|
std::vector<CUcontext> TheContexts;
|
|
};
|
|
|
|
Expected<CUDAPlatform> CUDAPlatform::create() {
|
|
std::vector<CUcontext> Contexts;
|
|
if (CUresult Result = cuInit(0))
|
|
return getCUError(Result, "cuInit");
|
|
|
|
int DeviceCount = 0;
|
|
if (CUresult Result = cuDeviceGetCount(&DeviceCount))
|
|
return getCUError(Result, "cuDeviceGetCount");
|
|
|
|
for (int I = 0; I < DeviceCount; ++I) {
|
|
CUdevice Device;
|
|
if (CUresult Result = cuDeviceGet(&Device, I))
|
|
return getCUError(Result, "cuDeviceGet");
|
|
CUcontext Context;
|
|
if (CUresult Result = cuDevicePrimaryCtxRetain(&Context, Device))
|
|
return getCUError(Result, "cuDevicePrimaryCtxRetain");
|
|
if (CUresult Result = cuCtxSetCurrent(Context))
|
|
return getCUError(Result, "cuCtxSetCurrent");
|
|
Contexts.emplace_back(Context);
|
|
}
|
|
|
|
return CUDAPlatform(Contexts);
|
|
}
|
|
|
|
Expected<int> CUDAPlatform::getDeviceCount() {
|
|
int Count = 0;
|
|
if (CUresult Result = cuDeviceGetCount(&Count))
|
|
return getCUError(Result, "cuDeviceGetCount");
|
|
return Count;
|
|
}
|
|
|
|
static void cudaDestroyStream(void *H) {
|
|
logCUWarning(cuStreamDestroy(static_cast<CUstream_st *>(H)),
|
|
"cuStreamDestroy");
|
|
}
|
|
|
|
Expected<Stream> CUDAPlatform::createStream(int DeviceIndex) {
|
|
Status S = setContext(DeviceIndex);
|
|
if (S.isError())
|
|
return S;
|
|
unsigned int Flags = CU_STREAM_DEFAULT;
|
|
CUstream Handle;
|
|
if (CUresult Result = cuStreamCreate(&Handle, Flags))
|
|
return getCUError(Result, "cuStreamCreate");
|
|
return constructStream(this, DeviceIndex, Handle, cudaDestroyStream);
|
|
}
|
|
|
|
Status CUDAPlatform::streamSync(void *Stream) {
|
|
return getCUError(cuStreamSynchronize(static_cast<CUstream_st *>(Stream)),
|
|
"cuStreamSynchronize");
|
|
}
|
|
|
|
Status CUDAPlatform::streamWaitOnEvent(void *Stream, void *Event) {
|
|
// CUDA docs says flags must be 0.
|
|
unsigned int Flags = 0u;
|
|
return getCUError(cuStreamWaitEvent(static_cast<CUstream_st *>(Stream),
|
|
static_cast<CUevent_st *>(Event), Flags),
|
|
"cuStreamWaitEvent");
|
|
}
|
|
|
|
static void cudaDestroyEvent(void *H) {
|
|
logCUWarning(cuEventDestroy(static_cast<CUevent_st *>(H)), "cuEventDestroy");
|
|
}
|
|
|
|
Expected<Event> CUDAPlatform::createEvent(int DeviceIndex) {
|
|
Status S = setContext(DeviceIndex);
|
|
if (S.isError())
|
|
return S;
|
|
unsigned int Flags = CU_EVENT_DEFAULT;
|
|
CUevent Handle;
|
|
if (CUresult Result = cuEventCreate(&Handle, Flags))
|
|
return getCUError(Result, "cuEventCreate");
|
|
return constructEvent(this, DeviceIndex, Handle, cudaDestroyEvent);
|
|
}
|
|
|
|
Status CUDAPlatform::enqueueEvent(void *Event, void *Stream) {
|
|
return getCUError(cuEventRecord(static_cast<CUevent_st *>(Event),
|
|
static_cast<CUstream_st *>(Stream)),
|
|
"cuEventRecord");
|
|
}
|
|
|
|
bool CUDAPlatform::eventIsDone(void *Event) {
|
|
return cuEventQuery(static_cast<CUevent_st *>(Event)) != CUDA_ERROR_NOT_READY;
|
|
}
|
|
|
|
Status CUDAPlatform::eventSync(void *Event) {
|
|
return getCUError(cuEventSynchronize(static_cast<CUevent_st *>(Event)),
|
|
"cuEventSynchronize");
|
|
}
|
|
|
|
Expected<float> CUDAPlatform::getSecondsBetweenEvents(void *StartEvent,
|
|
void *EndEvent) {
|
|
float Milliseconds;
|
|
if (CUresult Result = cuEventElapsedTime(
|
|
&Milliseconds, static_cast<CUevent_st *>(StartEvent),
|
|
static_cast<CUevent_st *>(EndEvent)))
|
|
return getCUError(Result, "cuEventElapsedTime");
|
|
return Milliseconds * 1e-6;
|
|
}
|
|
|
|
Expected<void *> CUDAPlatform::rawMallocD(ptrdiff_t ByteCount,
|
|
int DeviceIndex) {
|
|
Status S = setContext(DeviceIndex);
|
|
if (S.isError())
|
|
return S;
|
|
if (!ByteCount)
|
|
return nullptr;
|
|
CUdeviceptr Pointer;
|
|
if (CUresult Result = cuMemAlloc(&Pointer, ByteCount))
|
|
return getCUError(Result, "cuMemAlloc");
|
|
return reinterpret_cast<void *>(Pointer);
|
|
}
|
|
|
|
static void cudaDestroyDeviceMemory(void *H) {
|
|
logCUWarning(cuMemFree(reinterpret_cast<CUdeviceptr>(H)), "cuMemFree");
|
|
}
|
|
|
|
HandleDestructor CUDAPlatform::getDeviceMemoryHandleDestructor() {
|
|
return cudaDestroyDeviceMemory;
|
|
}
|
|
|
|
void *CUDAPlatform::getDeviceMemorySpanHandle(void *BaseHandle, size_t,
|
|
size_t ByteOffset) {
|
|
return static_cast<char *>(BaseHandle) + ByteOffset;
|
|
}
|
|
|
|
void CUDAPlatform::rawDestroyDeviceMemorySpanHandle(void *) {
|
|
// Do nothing for this platform.
|
|
}
|
|
|
|
Expected<void *> CUDAPlatform::rawGetDeviceSymbolAddress(const void *Symbol,
|
|
int DeviceIndex) {
|
|
Status S = setContext(DeviceIndex);
|
|
if (S.isError())
|
|
return S;
|
|
void *Address;
|
|
if (cudaError_t Status = cudaGetSymbolAddress(&Address, Symbol))
|
|
return getCUDAError(Status, "cudaGetSymbolAddress");
|
|
return Address;
|
|
}
|
|
|
|
Expected<ptrdiff_t> CUDAPlatform::rawGetDeviceSymbolSize(const void *Symbol,
|
|
int DeviceIndex) {
|
|
Status S = setContext(DeviceIndex);
|
|
if (S.isError())
|
|
return S;
|
|
size_t Size;
|
|
if (cudaError_t Status = cudaGetSymbolSize(&Size, Symbol))
|
|
return getCUDAError(Status, "cudaGetSymbolSize");
|
|
return Size;
|
|
}
|
|
|
|
static const void *offsetVoidPtr(const void *Ptr, ptrdiff_t ByteOffset) {
|
|
return static_cast<const void *>(static_cast<const char *>(Ptr) + ByteOffset);
|
|
}
|
|
|
|
static void *offsetVoidPtr(void *Ptr, ptrdiff_t ByteOffset) {
|
|
return static_cast<void *>(static_cast<char *>(Ptr) + ByteOffset);
|
|
}
|
|
|
|
Status CUDAPlatform::rawRegisterHostMem(const void *Memory,
|
|
ptrdiff_t ByteCount) {
|
|
unsigned int Flags = 0;
|
|
return getCUError(
|
|
cuMemHostRegister(const_cast<void *>(Memory), ByteCount, Flags),
|
|
"cuMemHostRegiser");
|
|
}
|
|
|
|
static void cudaUnregisterHostMemoryHandleDestructor(void *H) {
|
|
logCUWarning(cuMemHostUnregister(H), "cuMemHostUnregister");
|
|
}
|
|
|
|
HandleDestructor CUDAPlatform::getUnregisterHostMemoryHandleDestructor() {
|
|
return cudaUnregisterHostMemoryHandleDestructor;
|
|
}
|
|
|
|
Expected<void *> CUDAPlatform::rawMallocRegisteredH(ptrdiff_t ByteCount) {
|
|
unsigned int Flags = 0;
|
|
void *Memory;
|
|
if (CUresult Result = cuMemHostAlloc(&Memory, ByteCount, Flags))
|
|
return getCUError(Result, "cuMemHostAlloc");
|
|
return Memory;
|
|
}
|
|
|
|
static void cudaFreeHostMemoryHandleDestructor(void *H) {
|
|
logCUWarning(cuMemFreeHost(H), "cuMemFreeHost");
|
|
}
|
|
|
|
HandleDestructor CUDAPlatform::getFreeHostMemoryHandleDestructor() {
|
|
return cudaFreeHostMemoryHandleDestructor;
|
|
}
|
|
|
|
Status CUDAPlatform::asyncCopyDToD(const void *DeviceSrc,
|
|
ptrdiff_t DeviceSrcByteOffset,
|
|
void *DeviceDst,
|
|
ptrdiff_t DeviceDstByteOffset,
|
|
ptrdiff_t ByteCount, void *Stream) {
|
|
return getCUError(
|
|
cuMemcpyDtoDAsync(reinterpret_cast<CUdeviceptr>(
|
|
offsetVoidPtr(DeviceDst, DeviceDstByteOffset)),
|
|
reinterpret_cast<CUdeviceptr>(
|
|
offsetVoidPtr(DeviceSrc, DeviceSrcByteOffset)),
|
|
ByteCount, static_cast<CUstream_st *>(Stream)),
|
|
"cuMemcpyDtoDAsync");
|
|
}
|
|
|
|
Status CUDAPlatform::asyncCopyDToH(const void *DeviceSrc,
|
|
ptrdiff_t DeviceSrcByteOffset, void *HostDst,
|
|
ptrdiff_t ByteCount, void *Stream) {
|
|
return getCUError(
|
|
cuMemcpyDtoHAsync(HostDst, reinterpret_cast<CUdeviceptr>(offsetVoidPtr(
|
|
DeviceSrc, DeviceSrcByteOffset)),
|
|
ByteCount, static_cast<CUstream_st *>(Stream)),
|
|
"cuMemcpyDtoHAsync");
|
|
}
|
|
|
|
Status CUDAPlatform::asyncCopyHToD(const void *HostSrc, void *DeviceDst,
|
|
ptrdiff_t DeviceDstByteOffset,
|
|
ptrdiff_t ByteCount, void *Stream) {
|
|
return getCUError(
|
|
cuMemcpyHtoDAsync(reinterpret_cast<CUdeviceptr>(
|
|
offsetVoidPtr(DeviceDst, DeviceDstByteOffset)),
|
|
HostSrc, ByteCount, static_cast<CUstream_st *>(Stream)),
|
|
"cuMemcpyHtoDAsync");
|
|
}
|
|
|
|
Status CUDAPlatform::asyncMemsetD(void *DeviceDst, ptrdiff_t ByteOffset,
|
|
ptrdiff_t ByteCount, char ByteValue,
|
|
void *Stream) {
|
|
return getCUError(
|
|
cuMemsetD8Async(
|
|
reinterpret_cast<CUdeviceptr>(offsetVoidPtr(DeviceDst, ByteOffset)),
|
|
ByteValue, ByteCount, static_cast<CUstream_st *>(Stream)),
|
|
"cuMemsetD8Async");
|
|
}
|
|
|
|
struct StreamCallbackUserData {
|
|
StreamCallbackUserData(Stream &Stream, StreamCallback Function)
|
|
: TheStream(Stream), TheFunction(std::move(Function)) {}
|
|
|
|
Stream &TheStream;
|
|
StreamCallback TheFunction;
|
|
};
|
|
|
|
static void CUDA_CB cuStreamCallbackShim(CUstream HStream, CUresult Status,
|
|
void *UserData) {
|
|
std::unique_ptr<StreamCallbackUserData> Data(
|
|
static_cast<StreamCallbackUserData *>(UserData));
|
|
Stream &TheStream = Data->TheStream;
|
|
assert(static_cast<CUstream_st *>(TheStream) == HStream);
|
|
Data->TheFunction(TheStream,
|
|
getCUError(Status, "stream callback error state"));
|
|
}
|
|
|
|
Status CUDAPlatform::addStreamCallback(Stream &Stream,
|
|
StreamCallback Callback) {
|
|
// CUDA docs say flags must always be 0 here.
|
|
unsigned int Flags = 0u;
|
|
std::unique_ptr<StreamCallbackUserData> UserData(
|
|
new StreamCallbackUserData(Stream, std::move(Callback)));
|
|
return getCUError(cuStreamAddCallback(Stream, cuStreamCallbackShim,
|
|
UserData.release(), Flags),
|
|
"cuStreamAddCallback");
|
|
}
|
|
|
|
static void cudaDestroyProgram(void *H) {
|
|
logCUWarning(cuModuleUnload(static_cast<CUmod_st *>(H)), "cuModuleUnload");
|
|
}
|
|
|
|
Expected<Program> CUDAPlatform::createProgramFromSource(Span<const char> Source,
|
|
int DeviceIndex) {
|
|
Status S = setContext(DeviceIndex);
|
|
if (S.isError())
|
|
return S;
|
|
CUmodule Module;
|
|
constexpr int LogBufferSizeBytes = 1024;
|
|
char InfoLogBuffer[LogBufferSizeBytes];
|
|
char ErrorLogBuffer[LogBufferSizeBytes];
|
|
constexpr size_t OptionsCount = 4;
|
|
std::array<CUjit_option, OptionsCount> OptionNames = {
|
|
{CU_JIT_INFO_LOG_BUFFER, CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
|
|
CU_JIT_ERROR_LOG_BUFFER, CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES}};
|
|
std::array<void *, OptionsCount> OptionValues = {
|
|
{InfoLogBuffer, const_cast<int *>(&LogBufferSizeBytes), ErrorLogBuffer,
|
|
const_cast<int *>(&LogBufferSizeBytes)}};
|
|
if (CUresult Result =
|
|
cuModuleLoadDataEx(&Module, Source.data(), OptionsCount,
|
|
OptionNames.data(), OptionValues.data())) {
|
|
InfoLogBuffer[LogBufferSizeBytes - 1] = '\0';
|
|
ErrorLogBuffer[LogBufferSizeBytes - 1] = '\0';
|
|
std::ostringstream OutStream;
|
|
OutStream << "Error creating program from source: "
|
|
<< getCUErrorMessage(Result)
|
|
<< "\nINFO MESSAGES\n================\n"
|
|
<< InfoLogBuffer << "\nERROR MESSAGES\n==================\n"
|
|
<< ErrorLogBuffer;
|
|
return Status(OutStream.str());
|
|
}
|
|
return constructProgram(this, Module, cudaDestroyProgram);
|
|
}
|
|
|
|
Expected<void *> CUDAPlatform::rawCreateKernel(void *Program,
|
|
const std::string &Name) {
|
|
CUmodule Module = static_cast<CUmodule>(Program);
|
|
CUfunction Kernel;
|
|
if (CUresult Result = cuModuleGetFunction(&Kernel, Module, Name.c_str()))
|
|
return getCUError(Result, "cuModuleGetFunction");
|
|
return Kernel;
|
|
}
|
|
|
|
static void cudaDestroyKernel(void *) {
|
|
// Do nothing.
|
|
}
|
|
|
|
HandleDestructor CUDAPlatform::getKernelHandleDestructor() {
|
|
return cudaDestroyKernel;
|
|
}
|
|
|
|
Status CUDAPlatform::rawEnqueueKernelLaunch(
|
|
void *Stream, void *Kernel, KernelLaunchDimensions LaunchDimensions,
|
|
Span<void *> Arguments, Span<size_t>, size_t SharedMemoryBytes) {
|
|
return getCUError(
|
|
cuLaunchKernel(static_cast<CUfunction>(Kernel), LaunchDimensions.GridX,
|
|
LaunchDimensions.GridY, LaunchDimensions.GridZ,
|
|
LaunchDimensions.BlockX, LaunchDimensions.BlockY,
|
|
LaunchDimensions.BlockZ, SharedMemoryBytes,
|
|
static_cast<CUstream>(Stream), Arguments.data(), nullptr),
|
|
"cuLaunchKernel");
|
|
}
|
|
|
|
} // namespace
|
|
|
|
namespace cuda {
|
|
|
|
/// Gets the CUDAPlatform instance and returns it as an unowned pointer to a
|
|
/// Platform.
|
|
Expected<Platform *> getPlatform() {
|
|
static auto MaybePlatform = []() -> Expected<CUDAPlatform *> {
|
|
Expected<CUDAPlatform> CreationResult = CUDAPlatform::create();
|
|
if (CreationResult.isError())
|
|
return CreationResult.getError();
|
|
else
|
|
return new CUDAPlatform(CreationResult.takeValue());
|
|
}();
|
|
return MaybePlatform;
|
|
}
|
|
|
|
} // namespace cuda
|
|
|
|
} // namespace acxxel
|