llvm-project/parallel-libs/acxxel/cuda_acxxel.cpp

511 lines
18 KiB
C++

//===--- cuda_acxxel.cpp - CUDA implementation of the Acxxel API ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// This file defines the standard CUDA implementation of the Acxxel API.
///
//===----------------------------------------------------------------------===//
#include "acxxel.h"
#include "cuda.h"
#include "cuda_runtime.h"
#include <array>
#include <cassert>
#include <sstream>
#include <vector>
namespace acxxel {
namespace {
static std::string getCUErrorMessage(CUresult Result) {
if (!Result)
return "success";
const char *ErrorName = "UNKNOWN_ERROR_NAME";
const char *ErrorDescription = "UNKNOWN_ERROR_DESCRIPTION";
cuGetErrorName(Result, &ErrorName);
cuGetErrorString(Result, &ErrorDescription);
std::ostringstream OutStream;
OutStream << "CUDA driver error: code = " << Result
<< ", name = " << ErrorName
<< ", description = " << ErrorDescription;
return OutStream.str();
}
static Status getCUError(CUresult Result, const std::string &Message) {
if (!Result)
return Status();
std::ostringstream OutStream;
OutStream << getCUErrorMessage(Result) << ", message = " << Message;
return Status(OutStream.str());
}
static std::string getCUDAErrorMessage(cudaError_t E) {
if (!E)
return "success";
std::ostringstream OutStream;
OutStream << "CUDA runtime error: code = " << E
<< ", name = " << cudaGetErrorName(E)
<< ", description = " << cudaGetErrorString(E);
return OutStream.str();
}
static Status getCUDAError(cudaError_t E, const std::string &Message) {
if (!E)
return Status();
std::ostringstream OutStream;
OutStream << getCUDAErrorMessage(E) << ", message = " << Message;
return Status(OutStream.str());
}
static void logCUWarning(CUresult Result, const std::string &Message) {
if (Result) {
std::ostringstream OutStream;
OutStream << Message << ": " << getCUErrorMessage(Result);
logWarning(OutStream.str());
}
}
/// A CUDA Platform implementation.
class CUDAPlatform : public Platform {
public:
~CUDAPlatform() override = default;
static Expected<CUDAPlatform> create();
Expected<int> getDeviceCount() override;
Expected<Stream> createStream(int DeviceIndex) override;
Status streamSync(void *Stream) override;
Status streamWaitOnEvent(void *Stream, void *Event) override;
Expected<Event> createEvent(int DeviceIndex) override;
protected:
Expected<void *> rawMallocD(ptrdiff_t ByteCount, int DeviceIndex) override;
HandleDestructor getDeviceMemoryHandleDestructor() override;
void *getDeviceMemorySpanHandle(void *BaseHandle, size_t ByteSize,
size_t ByteOffset) override;
virtual void rawDestroyDeviceMemorySpanHandle(void *Handle) override;
Expected<void *> rawGetDeviceSymbolAddress(const void *Symbol,
int DeviceIndex) override;
Expected<ptrdiff_t> rawGetDeviceSymbolSize(const void *Symbol,
int DeviceIndex) override;
Status rawRegisterHostMem(const void *Memory, ptrdiff_t ByteCount) override;
HandleDestructor getUnregisterHostMemoryHandleDestructor() override;
Expected<void *> rawMallocRegisteredH(ptrdiff_t ByteCount) override;
HandleDestructor getFreeHostMemoryHandleDestructor() override;
Status asyncCopyDToD(const void *DeviceSrc, ptrdiff_t DeviceSrcByteOffset,
void *DeviceDst, ptrdiff_t DeviceDstByteOffset,
ptrdiff_t ByteCount, void *Stream) override;
Status asyncCopyDToH(const void *DeviceSrc, ptrdiff_t DeviceSrcByteOffset,
void *HostDst, ptrdiff_t ByteCount,
void *Stream) override;
Status asyncCopyHToD(const void *HostSrc, void *DeviceDst,
ptrdiff_t DeviceDstByteOffset, ptrdiff_t ByteCount,
void *Stream) override;
Status asyncMemsetD(void *DeviceDst, ptrdiff_t ByteOffset,
ptrdiff_t ByteCount, char ByteValue,
void *Stream) override;
Status addStreamCallback(Stream &Stream, StreamCallback Callback) override;
Expected<Program> createProgramFromSource(Span<const char> Source,
int DeviceIndex) override;
Status enqueueEvent(void *Event, void *Stream) override;
bool eventIsDone(void *Event) override;
Status eventSync(void *Event) override;
Expected<float> getSecondsBetweenEvents(void *StartEvent,
void *EndEvent) override;
Expected<void *> rawCreateKernel(void *Program,
const std::string &Name) override;
HandleDestructor getKernelHandleDestructor() override;
Status rawEnqueueKernelLaunch(void *Stream, void *Kernel,
KernelLaunchDimensions LaunchDimensions,
Span<void *> Arguments,
Span<size_t> ArgumentSizes,
size_t SharedMemoryBytes) override;
private:
explicit CUDAPlatform(const std::vector<CUcontext> &Contexts)
: TheContexts(Contexts) {}
Status setContext(int DeviceIndex) {
if (DeviceIndex < 0 ||
static_cast<size_t>(DeviceIndex) >= TheContexts.size())
return Status("invalid deivce index " + std::to_string(DeviceIndex));
return getCUError(cuCtxSetCurrent(TheContexts[DeviceIndex]),
"cuCtxSetCurrent");
}
// Vector of contexts for each device.
std::vector<CUcontext> TheContexts;
};
Expected<CUDAPlatform> CUDAPlatform::create() {
std::vector<CUcontext> Contexts;
if (CUresult Result = cuInit(0))
return getCUError(Result, "cuInit");
int DeviceCount = 0;
if (CUresult Result = cuDeviceGetCount(&DeviceCount))
return getCUError(Result, "cuDeviceGetCount");
for (int I = 0; I < DeviceCount; ++I) {
CUdevice Device;
if (CUresult Result = cuDeviceGet(&Device, I))
return getCUError(Result, "cuDeviceGet");
CUcontext Context;
if (CUresult Result = cuDevicePrimaryCtxRetain(&Context, Device))
return getCUError(Result, "cuDevicePrimaryCtxRetain");
if (CUresult Result = cuCtxSetCurrent(Context))
return getCUError(Result, "cuCtxSetCurrent");
Contexts.emplace_back(Context);
}
return CUDAPlatform(Contexts);
}
Expected<int> CUDAPlatform::getDeviceCount() {
int Count = 0;
if (CUresult Result = cuDeviceGetCount(&Count))
return getCUError(Result, "cuDeviceGetCount");
return Count;
}
static void cudaDestroyStream(void *H) {
logCUWarning(cuStreamDestroy(static_cast<CUstream_st *>(H)),
"cuStreamDestroy");
}
Expected<Stream> CUDAPlatform::createStream(int DeviceIndex) {
Status S = setContext(DeviceIndex);
if (S.isError())
return S;
unsigned int Flags = CU_STREAM_DEFAULT;
CUstream Handle;
if (CUresult Result = cuStreamCreate(&Handle, Flags))
return getCUError(Result, "cuStreamCreate");
return constructStream(this, DeviceIndex, Handle, cudaDestroyStream);
}
Status CUDAPlatform::streamSync(void *Stream) {
return getCUError(cuStreamSynchronize(static_cast<CUstream_st *>(Stream)),
"cuStreamSynchronize");
}
Status CUDAPlatform::streamWaitOnEvent(void *Stream, void *Event) {
// CUDA docs says flags must be 0.
unsigned int Flags = 0u;
return getCUError(cuStreamWaitEvent(static_cast<CUstream_st *>(Stream),
static_cast<CUevent_st *>(Event), Flags),
"cuStreamWaitEvent");
}
static void cudaDestroyEvent(void *H) {
logCUWarning(cuEventDestroy(static_cast<CUevent_st *>(H)), "cuEventDestroy");
}
Expected<Event> CUDAPlatform::createEvent(int DeviceIndex) {
Status S = setContext(DeviceIndex);
if (S.isError())
return S;
unsigned int Flags = CU_EVENT_DEFAULT;
CUevent Handle;
if (CUresult Result = cuEventCreate(&Handle, Flags))
return getCUError(Result, "cuEventCreate");
return constructEvent(this, DeviceIndex, Handle, cudaDestroyEvent);
}
Status CUDAPlatform::enqueueEvent(void *Event, void *Stream) {
return getCUError(cuEventRecord(static_cast<CUevent_st *>(Event),
static_cast<CUstream_st *>(Stream)),
"cuEventRecord");
}
bool CUDAPlatform::eventIsDone(void *Event) {
return cuEventQuery(static_cast<CUevent_st *>(Event)) != CUDA_ERROR_NOT_READY;
}
Status CUDAPlatform::eventSync(void *Event) {
return getCUError(cuEventSynchronize(static_cast<CUevent_st *>(Event)),
"cuEventSynchronize");
}
Expected<float> CUDAPlatform::getSecondsBetweenEvents(void *StartEvent,
void *EndEvent) {
float Milliseconds;
if (CUresult Result = cuEventElapsedTime(
&Milliseconds, static_cast<CUevent_st *>(StartEvent),
static_cast<CUevent_st *>(EndEvent)))
return getCUError(Result, "cuEventElapsedTime");
return Milliseconds * 1e-6;
}
Expected<void *> CUDAPlatform::rawMallocD(ptrdiff_t ByteCount,
int DeviceIndex) {
Status S = setContext(DeviceIndex);
if (S.isError())
return S;
if (!ByteCount)
return nullptr;
CUdeviceptr Pointer;
if (CUresult Result = cuMemAlloc(&Pointer, ByteCount))
return getCUError(Result, "cuMemAlloc");
return reinterpret_cast<void *>(Pointer);
}
static void cudaDestroyDeviceMemory(void *H) {
logCUWarning(cuMemFree(reinterpret_cast<CUdeviceptr>(H)), "cuMemFree");
}
HandleDestructor CUDAPlatform::getDeviceMemoryHandleDestructor() {
return cudaDestroyDeviceMemory;
}
void *CUDAPlatform::getDeviceMemorySpanHandle(void *BaseHandle, size_t,
size_t ByteOffset) {
return static_cast<char *>(BaseHandle) + ByteOffset;
}
void CUDAPlatform::rawDestroyDeviceMemorySpanHandle(void *) {
// Do nothing for this platform.
}
Expected<void *> CUDAPlatform::rawGetDeviceSymbolAddress(const void *Symbol,
int DeviceIndex) {
Status S = setContext(DeviceIndex);
if (S.isError())
return S;
void *Address;
if (cudaError_t Status = cudaGetSymbolAddress(&Address, Symbol))
return getCUDAError(Status, "cudaGetSymbolAddress");
return Address;
}
Expected<ptrdiff_t> CUDAPlatform::rawGetDeviceSymbolSize(const void *Symbol,
int DeviceIndex) {
Status S = setContext(DeviceIndex);
if (S.isError())
return S;
size_t Size;
if (cudaError_t Status = cudaGetSymbolSize(&Size, Symbol))
return getCUDAError(Status, "cudaGetSymbolSize");
return Size;
}
static const void *offsetVoidPtr(const void *Ptr, ptrdiff_t ByteOffset) {
return static_cast<const void *>(static_cast<const char *>(Ptr) + ByteOffset);
}
static void *offsetVoidPtr(void *Ptr, ptrdiff_t ByteOffset) {
return static_cast<void *>(static_cast<char *>(Ptr) + ByteOffset);
}
Status CUDAPlatform::rawRegisterHostMem(const void *Memory,
ptrdiff_t ByteCount) {
unsigned int Flags = 0;
return getCUError(
cuMemHostRegister(const_cast<void *>(Memory), ByteCount, Flags),
"cuMemHostRegiser");
}
static void cudaUnregisterHostMemoryHandleDestructor(void *H) {
logCUWarning(cuMemHostUnregister(H), "cuMemHostUnregister");
}
HandleDestructor CUDAPlatform::getUnregisterHostMemoryHandleDestructor() {
return cudaUnregisterHostMemoryHandleDestructor;
}
Expected<void *> CUDAPlatform::rawMallocRegisteredH(ptrdiff_t ByteCount) {
unsigned int Flags = 0;
void *Memory;
if (CUresult Result = cuMemHostAlloc(&Memory, ByteCount, Flags))
return getCUError(Result, "cuMemHostAlloc");
return Memory;
}
static void cudaFreeHostMemoryHandleDestructor(void *H) {
logCUWarning(cuMemFreeHost(H), "cuMemFreeHost");
}
HandleDestructor CUDAPlatform::getFreeHostMemoryHandleDestructor() {
return cudaFreeHostMemoryHandleDestructor;
}
Status CUDAPlatform::asyncCopyDToD(const void *DeviceSrc,
ptrdiff_t DeviceSrcByteOffset,
void *DeviceDst,
ptrdiff_t DeviceDstByteOffset,
ptrdiff_t ByteCount, void *Stream) {
return getCUError(
cuMemcpyDtoDAsync(reinterpret_cast<CUdeviceptr>(
offsetVoidPtr(DeviceDst, DeviceDstByteOffset)),
reinterpret_cast<CUdeviceptr>(
offsetVoidPtr(DeviceSrc, DeviceSrcByteOffset)),
ByteCount, static_cast<CUstream_st *>(Stream)),
"cuMemcpyDtoDAsync");
}
Status CUDAPlatform::asyncCopyDToH(const void *DeviceSrc,
ptrdiff_t DeviceSrcByteOffset, void *HostDst,
ptrdiff_t ByteCount, void *Stream) {
return getCUError(
cuMemcpyDtoHAsync(HostDst, reinterpret_cast<CUdeviceptr>(offsetVoidPtr(
DeviceSrc, DeviceSrcByteOffset)),
ByteCount, static_cast<CUstream_st *>(Stream)),
"cuMemcpyDtoHAsync");
}
Status CUDAPlatform::asyncCopyHToD(const void *HostSrc, void *DeviceDst,
ptrdiff_t DeviceDstByteOffset,
ptrdiff_t ByteCount, void *Stream) {
return getCUError(
cuMemcpyHtoDAsync(reinterpret_cast<CUdeviceptr>(
offsetVoidPtr(DeviceDst, DeviceDstByteOffset)),
HostSrc, ByteCount, static_cast<CUstream_st *>(Stream)),
"cuMemcpyHtoDAsync");
}
Status CUDAPlatform::asyncMemsetD(void *DeviceDst, ptrdiff_t ByteOffset,
ptrdiff_t ByteCount, char ByteValue,
void *Stream) {
return getCUError(
cuMemsetD8Async(
reinterpret_cast<CUdeviceptr>(offsetVoidPtr(DeviceDst, ByteOffset)),
ByteValue, ByteCount, static_cast<CUstream_st *>(Stream)),
"cuMemsetD8Async");
}
struct StreamCallbackUserData {
StreamCallbackUserData(Stream &Stream, StreamCallback Function)
: TheStream(Stream), TheFunction(std::move(Function)) {}
Stream &TheStream;
StreamCallback TheFunction;
};
static void CUDA_CB cuStreamCallbackShim(CUstream HStream, CUresult Status,
void *UserData) {
std::unique_ptr<StreamCallbackUserData> Data(
static_cast<StreamCallbackUserData *>(UserData));
Stream &TheStream = Data->TheStream;
assert(static_cast<CUstream_st *>(TheStream) == HStream);
Data->TheFunction(TheStream,
getCUError(Status, "stream callback error state"));
}
Status CUDAPlatform::addStreamCallback(Stream &Stream,
StreamCallback Callback) {
// CUDA docs say flags must always be 0 here.
unsigned int Flags = 0u;
std::unique_ptr<StreamCallbackUserData> UserData(
new StreamCallbackUserData(Stream, std::move(Callback)));
return getCUError(cuStreamAddCallback(Stream, cuStreamCallbackShim,
UserData.release(), Flags),
"cuStreamAddCallback");
}
static void cudaDestroyProgram(void *H) {
logCUWarning(cuModuleUnload(static_cast<CUmod_st *>(H)), "cuModuleUnload");
}
Expected<Program> CUDAPlatform::createProgramFromSource(Span<const char> Source,
int DeviceIndex) {
Status S = setContext(DeviceIndex);
if (S.isError())
return S;
CUmodule Module;
constexpr int LogBufferSizeBytes = 1024;
char InfoLogBuffer[LogBufferSizeBytes];
char ErrorLogBuffer[LogBufferSizeBytes];
constexpr size_t OptionsCount = 4;
std::array<CUjit_option, OptionsCount> OptionNames = {
{CU_JIT_INFO_LOG_BUFFER, CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
CU_JIT_ERROR_LOG_BUFFER, CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES}};
std::array<void *, OptionsCount> OptionValues = {
{InfoLogBuffer, const_cast<int *>(&LogBufferSizeBytes), ErrorLogBuffer,
const_cast<int *>(&LogBufferSizeBytes)}};
if (CUresult Result =
cuModuleLoadDataEx(&Module, Source.data(), OptionsCount,
OptionNames.data(), OptionValues.data())) {
InfoLogBuffer[LogBufferSizeBytes - 1] = '\0';
ErrorLogBuffer[LogBufferSizeBytes - 1] = '\0';
std::ostringstream OutStream;
OutStream << "Error creating program from source: "
<< getCUErrorMessage(Result)
<< "\nINFO MESSAGES\n================\n"
<< InfoLogBuffer << "\nERROR MESSAGES\n==================\n"
<< ErrorLogBuffer;
return Status(OutStream.str());
}
return constructProgram(this, Module, cudaDestroyProgram);
}
Expected<void *> CUDAPlatform::rawCreateKernel(void *Program,
const std::string &Name) {
CUmodule Module = static_cast<CUmodule>(Program);
CUfunction Kernel;
if (CUresult Result = cuModuleGetFunction(&Kernel, Module, Name.c_str()))
return getCUError(Result, "cuModuleGetFunction");
return Kernel;
}
static void cudaDestroyKernel(void *) {
// Do nothing.
}
HandleDestructor CUDAPlatform::getKernelHandleDestructor() {
return cudaDestroyKernel;
}
Status CUDAPlatform::rawEnqueueKernelLaunch(
void *Stream, void *Kernel, KernelLaunchDimensions LaunchDimensions,
Span<void *> Arguments, Span<size_t>, size_t SharedMemoryBytes) {
return getCUError(
cuLaunchKernel(static_cast<CUfunction>(Kernel), LaunchDimensions.GridX,
LaunchDimensions.GridY, LaunchDimensions.GridZ,
LaunchDimensions.BlockX, LaunchDimensions.BlockY,
LaunchDimensions.BlockZ, SharedMemoryBytes,
static_cast<CUstream>(Stream), Arguments.data(), nullptr),
"cuLaunchKernel");
}
} // namespace
namespace cuda {
/// Gets the CUDAPlatform instance and returns it as an unowned pointer to a
/// Platform.
Expected<Platform *> getPlatform() {
static auto MaybePlatform = []() -> Expected<CUDAPlatform *> {
Expected<CUDAPlatform> CreationResult = CUDAPlatform::create();
if (CreationResult.isError())
return CreationResult.getError();
else
return new CUDAPlatform(CreationResult.takeValue());
}();
return MaybePlatform;
}
} // namespace cuda
} // namespace acxxel