forked from OSchip/llvm-project
[SE] Add CUDA platform
Summary: Basic CUDA platform implementation and cmake infrastructure to control whether it's used. A few important TODOs will be handled in later patches: * Log some error messages that can't easily be returned as Errors. * Cache modules and kernels to prevent reloading them if someone tries to reload a kernel that's already loaded. * Tolerate shared memory arguments for kernel launches. Reviewers: jlebar Subscribers: beanz, mgorny, jprice, jlebar, parallel_libs-commits Differential Revision: https://reviews.llvm.org/D24538 llvm-svn: 281524
This commit is contained in:
parent
d56a27e242
commit
6bfc863d74
|
@ -3,9 +3,14 @@ cmake_minimum_required(VERSION 3.1)
|
|||
# User-facing build switches. Each one is a boolean cache entry.
option(
  STREAM_EXECUTOR_UNIT_TESTS
  "enable unit tests"
  ON)
option(
  STREAM_EXECUTOR_ENABLE_DOXYGEN
  "enable StreamExecutor doxygen"
  ON)
option(
  STREAM_EXECUTOR_ENABLE_CONFIG_TOOL
  "enable building streamexecutor-config tool"
  ON)
option(
  STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM
  "enable building the CUDA StreamExecutor platform"
  OFF)

# Generate PlatformOptions.h from its template so that sources can test which
# platforms were enabled at configuration time.
configure_file(
  "include/streamexecutor/PlatformOptions.h.in"
  "include/streamexecutor/PlatformOptions.h")

# Headers that live in the streamexecutor source tree.
include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/include)
# Headers generated into the build tree (e.g. by configure_file above).
include_directories(BEFORE ${CMAKE_CURRENT_BINARY_DIR}/include)
|
||||
|
||||
# If we are not building as part of LLVM, build StreamExecutor as a standalone
|
||||
# project using LLVM as an external library:
|
||||
|
|
|
@ -37,25 +37,29 @@ public:
|
|||
|
||||
virtual std::string getName() const = 0;
|
||||
|
||||
virtual std::string getPlatformName() const = 0;
|
||||
|
||||
/// Creates a platform-specific kernel.
|
||||
virtual Expected<const void *>
|
||||
createKernel(const MultiKernelLoaderSpec &Spec) {
|
||||
return make_error("createKernel not implemented for platform " + getName());
|
||||
return make_error("createKernel not implemented for platform " +
|
||||
getPlatformName());
|
||||
}
|
||||
|
||||
virtual Error destroyKernel(const void *Handle) {
|
||||
return make_error("destroyKernel not implemented for platform " +
|
||||
getName());
|
||||
getPlatformName());
|
||||
}
|
||||
|
||||
/// Creates a platform-specific stream.
|
||||
virtual Expected<const void *> createStream() {
|
||||
return make_error("createStream not implemented for platform " + getName());
|
||||
return make_error("createStream not implemented for platform " +
|
||||
getPlatformName());
|
||||
}
|
||||
|
||||
virtual Error destroyStream(const void *Handle) {
|
||||
return make_error("destroyStream not implemented for platform " +
|
||||
getName());
|
||||
getPlatformName());
|
||||
}
|
||||
|
||||
/// Launches a kernel on the given stream.
|
||||
|
@ -63,7 +67,8 @@ public:
|
|||
BlockDimensions BlockSize, GridDimensions GridSize,
|
||||
const void *PKernelHandle,
|
||||
const PackedKernelArgumentArrayBase &ArgumentArray) {
|
||||
return make_error("launch not implemented for platform " + getName());
|
||||
return make_error("launch not implemented for platform " +
|
||||
getPlatformName());
|
||||
}
|
||||
|
||||
/// Copies data from the device to the host.
|
||||
|
@ -72,7 +77,8 @@ public:
|
|||
virtual Error copyD2H(const void *PlatformStreamHandle,
|
||||
const void *DeviceSrcHandle, size_t SrcByteOffset,
|
||||
void *HostDst, size_t DstByteOffset, size_t ByteCount) {
|
||||
return make_error("copyD2H not implemented for platform " + getName());
|
||||
return make_error("copyD2H not implemented for platform " +
|
||||
getPlatformName());
|
||||
}
|
||||
|
||||
/// Copies data from the host to the device.
|
||||
|
@ -81,7 +87,8 @@ public:
|
|||
virtual Error copyH2D(const void *PlatformStreamHandle, const void *HostSrc,
|
||||
size_t SrcByteOffset, const void *DeviceDstHandle,
|
||||
size_t DstByteOffset, size_t ByteCount) {
|
||||
return make_error("copyH2D not implemented for platform " + getName());
|
||||
return make_error("copyH2D not implemented for platform " +
|
||||
getPlatformName());
|
||||
}
|
||||
|
||||
/// Copies data from one device location to another.
|
||||
|
@ -89,39 +96,40 @@ public:
|
|||
const void *DeviceSrcHandle, size_t SrcByteOffset,
|
||||
const void *DeviceDstHandle, size_t DstByteOffset,
|
||||
size_t ByteCount) {
|
||||
return make_error("copyD2D not implemented for platform " + getName());
|
||||
return make_error("copyD2D not implemented for platform " +
|
||||
getPlatformName());
|
||||
}
|
||||
|
||||
/// Blocks the host until the given stream completes all the work enqueued up
|
||||
/// to the point this function is called.
|
||||
virtual Error blockHostUntilDone(const void *PlatformStreamHandle) {
|
||||
return make_error("blockHostUntilDone not implemented for platform " +
|
||||
getName());
|
||||
getPlatformName());
|
||||
}
|
||||
|
||||
/// Allocates untyped device memory of a given size in bytes.
|
||||
virtual Expected<void *> allocateDeviceMemory(size_t ByteCount) {
|
||||
return make_error("allocateDeviceMemory not implemented for platform " +
|
||||
getName());
|
||||
getPlatformName());
|
||||
}
|
||||
|
||||
/// Frees device memory previously allocated by allocateDeviceMemory.
|
||||
virtual Error freeDeviceMemory(const void *Handle) {
|
||||
return make_error("freeDeviceMemory not implemented for platform " +
|
||||
getName());
|
||||
getPlatformName());
|
||||
}
|
||||
|
||||
/// Registers previously allocated host memory so it can be used with copyH2D
|
||||
/// and copyD2H.
|
||||
virtual Error registerHostMemory(void *Memory, size_t ByteCount) {
|
||||
return make_error("registerHostMemory not implemented for platform " +
|
||||
getName());
|
||||
getPlatformName());
|
||||
}
|
||||
|
||||
/// Unregisters host memory previously registered with registerHostMemory.
|
||||
virtual Error unregisterHostMemory(const void *Memory) {
|
||||
return make_error("unregisterHostMemory not implemented for platform " +
|
||||
getName());
|
||||
getPlatformName());
|
||||
}
|
||||
|
||||
/// Copies the given number of bytes from device memory to host memory.
|
||||
|
@ -133,7 +141,7 @@ public:
|
|||
size_t SrcByteOffset, void *HostDst,
|
||||
size_t DstByteOffset, size_t ByteCount) {
|
||||
return make_error("synchronousCopyD2H not implemented for platform " +
|
||||
getName());
|
||||
getPlatformName());
|
||||
}
|
||||
|
||||
/// Similar to synchronousCopyD2H(const void *, size_t, void
|
||||
|
@ -143,7 +151,7 @@ public:
|
|||
const void *DeviceDstHandle,
|
||||
size_t DstByteOffset, size_t ByteCount) {
|
||||
return make_error("synchronousCopyH2D not implemented for platform " +
|
||||
getName());
|
||||
getPlatformName());
|
||||
}
|
||||
|
||||
/// Similar to synchronousCopyD2H(const void *, size_t, void
|
||||
|
@ -154,7 +162,7 @@ public:
|
|||
const void *DeviceDstHandle,
|
||||
size_t DstByteOffset, size_t ByteCount) {
|
||||
return make_error("synchronousCopyD2D not implemented for platform " +
|
||||
getName());
|
||||
getPlatformName());
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -0,0 +1,23 @@
|
|||
//===-- PlatformOptions.h - Platform option macros --------------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// The contents of this file are filled in at configuration time. This file
|
||||
/// defines macros that represent the platform configuration state of the build,
|
||||
/// e.g. which platforms are enabled.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
|
||||
#ifndef STREAMEXECUTOR_PLATFORMOPTIONS_H
|
||||
#define STREAMEXECUTOR_PLATFORMOPTIONS_H
|
||||
|
||||
#cmakedefine STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM
|
||||
|
||||
#endif // STREAMEXECUTOR_PLATFORMOPTIONS_H
|
|
@ -0,0 +1,42 @@
|
|||
//===-- CUDAPlatform.h - CUDA platform subclass -----------------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// Declaration of the CUDAPlatform class.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORM_H
|
||||
#define STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORM_H
|
||||
|
||||
#include "streamexecutor/Platform.h"
|
||||
#include "streamexecutor/platforms/cuda/CUDAPlatformDevice.h"
|
||||
|
||||
#include "llvm/Support/Mutex.h"
|
||||
|
||||
#include <map>
|
||||
|
||||
namespace streamexecutor {
|
||||
namespace cuda {
|
||||
|
||||
class CUDAPlatform : public Platform {
|
||||
public:
|
||||
size_t getDeviceCount() const override;
|
||||
|
||||
Expected<Device> getDevice(size_t DeviceIndex) override;
|
||||
|
||||
private:
|
||||
llvm::sys::Mutex Mutex;
|
||||
std::map<size_t, CUDAPlatformDevice> PlatformDevices;
|
||||
};
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace streamexecutor
|
||||
|
||||
#endif // STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORM_H
|
|
@ -0,0 +1,93 @@
|
|||
//===-- CUDAPlatformDevice.h - CUDAPlatformDevice class ---------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// Declaration of the CUDAPlatformDevice class.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORMDEVICE_H
|
||||
#define STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORMDEVICE_H
|
||||
|
||||
#include "streamexecutor/PlatformDevice.h"
|
||||
|
||||
namespace streamexecutor {
|
||||
namespace cuda {
|
||||
|
||||
Error CUresultToError(int CUResult, const llvm::Twine &Message);
|
||||
|
||||
class CUDAPlatformDevice : public PlatformDevice {
|
||||
public:
|
||||
static Expected<CUDAPlatformDevice> create(size_t DeviceIndex);
|
||||
|
||||
CUDAPlatformDevice(const CUDAPlatformDevice &) = delete;
|
||||
CUDAPlatformDevice &operator=(const CUDAPlatformDevice &) = delete;
|
||||
|
||||
CUDAPlatformDevice(CUDAPlatformDevice &&) noexcept;
|
||||
CUDAPlatformDevice &operator=(CUDAPlatformDevice &&) noexcept;
|
||||
|
||||
~CUDAPlatformDevice() override;
|
||||
|
||||
std::string getName() const override;
|
||||
|
||||
std::string getPlatformName() const override { return "CUDA"; }
|
||||
|
||||
Expected<const void *>
|
||||
createKernel(const MultiKernelLoaderSpec &Spec) override;
|
||||
Error destroyKernel(const void *Handle) override;
|
||||
|
||||
Expected<const void *> createStream() override;
|
||||
Error destroyStream(const void *Handle) override;
|
||||
|
||||
Error launch(const void *PlatformStreamHandle, BlockDimensions BlockSize,
|
||||
GridDimensions GridSize, const void *PKernelHandle,
|
||||
const PackedKernelArgumentArrayBase &ArgumentArray) override;
|
||||
|
||||
Error copyD2H(const void *PlatformStreamHandle, const void *DeviceSrcHandle,
|
||||
size_t SrcByteOffset, void *HostDst, size_t DstByteOffset,
|
||||
size_t ByteCount) override;
|
||||
|
||||
Error copyH2D(const void *PlatformStreamHandle, const void *HostSrc,
|
||||
size_t SrcByteOffset, const void *DeviceDstHandle,
|
||||
size_t DstByteOffset, size_t ByteCount) override;
|
||||
|
||||
Error copyD2D(const void *PlatformStreamHandle, const void *DeviceSrcHandle,
|
||||
size_t SrcByteOffset, const void *DeviceDstHandle,
|
||||
size_t DstByteOffset, size_t ByteCount) override;
|
||||
|
||||
Error blockHostUntilDone(const void *PlatformStreamHandle) override;
|
||||
|
||||
Expected<void *> allocateDeviceMemory(size_t ByteCount) override;
|
||||
Error freeDeviceMemory(const void *Handle) override;
|
||||
|
||||
Error registerHostMemory(void *Memory, size_t ByteCount) override;
|
||||
Error unregisterHostMemory(const void *Memory) override;
|
||||
|
||||
Error synchronousCopyD2H(const void *DeviceSrcHandle, size_t SrcByteOffset,
|
||||
void *HostDst, size_t DstByteOffset,
|
||||
size_t ByteCount) override;
|
||||
|
||||
Error synchronousCopyH2D(const void *HostSrc, size_t SrcByteOffset,
|
||||
const void *DeviceDstHandle, size_t DstByteOffset,
|
||||
size_t ByteCount) override;
|
||||
|
||||
Error synchronousCopyD2D(const void *DeviceDstHandle, size_t DstByteOffset,
|
||||
const void *DeviceSrcHandle, size_t SrcByteOffset,
|
||||
size_t ByteCount) override;
|
||||
|
||||
private:
|
||||
CUDAPlatformDevice(size_t DeviceIndex) : DeviceIndex(DeviceIndex) {}
|
||||
|
||||
int DeviceIndex;
|
||||
};
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace streamexecutor
|
||||
|
||||
#endif // STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORMDEVICE_H
|
|
@ -29,6 +29,8 @@ class HostPlatformDevice : public PlatformDevice {
|
|||
public:
|
||||
std::string getName() const override { return "host"; }
|
||||
|
||||
std::string getPlatformName() const override { return "host"; }
|
||||
|
||||
Expected<const void *>
|
||||
createKernel(const MultiKernelLoaderSpec &Spec) override {
|
||||
if (!Spec.hasHostFunction()) {
|
||||
|
|
|
@ -3,6 +3,26 @@ macro(add_se_library name)
|
|||
set_target_properties(${name} PROPERTIES FOLDER "streamexecutor libraries")
|
||||
endmacro(add_se_library)
|
||||
|
||||
# When the CUDA platform is requested, locate libcuda and record the extra
# object files and link libraries that the streamexecutor library must absorb.
# Both variables stay unset otherwise.
if(STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM)
  # Make the bundled FindLibcuda.cmake module visible to find_package().
  list(
    APPEND
    CMAKE_MODULE_PATH
    "${CMAKE_CURRENT_SOURCE_DIR}/platforms/cuda/cmake/modules/")

  find_package(Libcuda REQUIRED)
  include_directories(${LIBCUDA_INCLUDE_DIRS})

  # Objects compiled by the streamexecutor_cuda_platform object library.
  set(STREAM_EXECUTOR_CUDA_PLATFORM_TARGET_OBJECT
      $<TARGET_OBJECTS:streamexecutor_cuda_platform>)

  # Driver library that the final streamexecutor library links against.
  set(STREAM_EXECUTOR_LIBCUDA_LIBRARIES ${LIBCUDA_LIBRARIES})
endif()
|
||||
|
||||
add_subdirectory(platforms)
|
||||
|
||||
add_se_library(
|
||||
streamexecutor
|
||||
Device.cpp
|
||||
|
@ -16,6 +36,8 @@ add_se_library(
|
|||
PlatformDevice.cpp
|
||||
PlatformManager.cpp
|
||||
Stream.cpp
|
||||
)
|
||||
${STREAM_EXECUTOR_CUDA_PLATFORM_TARGET_OBJECT}
|
||||
LINK_LIBS
|
||||
${STREAM_EXECUTOR_LIBCUDA_LIBRARIES})
|
||||
|
||||
install(TARGETS streamexecutor DESTINATION lib)
|
||||
|
|
|
@ -13,8 +13,14 @@
|
|||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "streamexecutor/PlatformManager.h"
|
||||
|
||||
#include "streamexecutor/PlatformOptions.h"
|
||||
#include "streamexecutor/platforms/host/HostPlatform.h"
|
||||
|
||||
#ifdef STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM
|
||||
#include "streamexecutor/platforms/cuda/CUDAPlatform.h"
|
||||
#endif
|
||||
|
||||
namespace streamexecutor {
|
||||
|
||||
PlatformManager::PlatformManager() {
|
||||
|
@ -26,6 +32,10 @@ PlatformManager::PlatformManager() {
|
|||
// themselves when they are loaded.
|
||||
|
||||
PlatformsByName.emplace("host", llvm::make_unique<host::HostPlatform>());
|
||||
|
||||
#ifdef STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM
|
||||
PlatformsByName.emplace("cuda", llvm::make_unique<cuda::CUDAPlatform>());
|
||||
#endif
|
||||
}
|
||||
|
||||
Expected<Platform *> PlatformManager::getPlatformByName(llvm::StringRef Name) {
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
# The CUDA platform sources are only configured when the option is enabled.
if(STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM)
  add_subdirectory(cuda)
endif()
|
|
@ -0,0 +1,5 @@
|
|||
# Compile the CUDA platform sources as an OBJECT library (object files only,
# no archive is produced by this target).
add_library(streamexecutor_cuda_platform OBJECT
  CUDAPlatform.cpp
  CUDAPlatformDevice.cpp)
|
|
@ -0,0 +1,65 @@
|
|||
//===-- CUDAPlatform.cpp - CUDA platform implementation -------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// Implementation of CUDA platform internals.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "streamexecutor/platforms/cuda/CUDAPlatform.h"
|
||||
#include "streamexecutor/Device.h"
|
||||
#include "streamexecutor/Platform.h"
|
||||
#include "streamexecutor/platforms/cuda/CUDAPlatformDevice.h"
|
||||
|
||||
#include "llvm/Support/Mutex.h"
|
||||
|
||||
#include "cuda.h"
|
||||
|
||||
#include <map>
|
||||
|
||||
namespace streamexecutor {
|
||||
namespace cuda {
|
||||
|
||||
/// Calls cuInit(0) the first time this function is reached and returns that
/// same cached result on every call.
static CUresult ensureCUDAInitialized() {
  // A function-local static is initialized exactly once, so cuInit is not
  // re-run on later calls.
  static CUresult CachedInitResult = cuInit(0);
  return CachedInitResult;
}
|
||||
|
||||
size_t CUDAPlatform::getDeviceCount() const {
|
||||
if (ensureCUDAInitialized())
|
||||
// TODO(jhen): Log an error.
|
||||
return 0;
|
||||
|
||||
int DeviceCount = 0;
|
||||
CUresult Result = cuDeviceGetCount(&DeviceCount);
|
||||
(void)Result;
|
||||
// TODO(jhen): Log an error.
|
||||
|
||||
return DeviceCount;
|
||||
}
|
||||
|
||||
/// Returns a Device wrapping the platform device for \p DeviceIndex.
///
/// The CUDAPlatformDevice is created on first request and stored in
/// PlatformDevices; later requests for the same index reuse the stored entry.
/// Returns an error if driver initialization or device creation failed.
Expected<Device> CUDAPlatform::getDevice(size_t DeviceIndex) {
  if (CUresult InitResult = ensureCUDAInitialized())
    return CUresultToError(InitResult, "cached cuInit return value");

  // Lookup and insertion happen under the platform mutex.
  llvm::sys::ScopedLock Lock(Mutex);

  auto Found = PlatformDevices.find(DeviceIndex);
  if (Found != PlatformDevices.end())
    return Device(&Found->second);

  auto MaybePDevice = CUDAPlatformDevice::create(DeviceIndex);
  if (!MaybePDevice)
    return MaybePDevice.takeError();

  auto Inserted =
      PlatformDevices.emplace(DeviceIndex, std::move(*MaybePDevice)).first;
  return Device(&Inserted->second);
}
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace streamexecutor
|
|
@ -0,0 +1,280 @@
|
|||
//===-- CUDAPlatformDevice.cpp - CUDAPlatformDevice implementation --------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// Implementation of CUDAPlatformDevice.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "streamexecutor/platforms/cuda/CUDAPlatformDevice.h"
|
||||
#include "streamexecutor/PlatformDevice.h"
|
||||
|
||||
#include "cuda.h"
|
||||
|
||||
namespace streamexecutor {
|
||||
namespace cuda {
|
||||
|
||||
/// Returns a non-const pointer that is \p Offset bytes past \p Base.
static void *offset(const void *Base, size_t Offset) {
  // Step through char so the arithmetic is done in bytes.
  const char *Bytes = static_cast<const char *>(Base);
  return const_cast<char *>(Bytes + Offset);
}
|
||||
|
||||
/// Converts a CUDA driver result code into an Error.
///
/// A zero code yields Error::success(). Any other code yields an error whose
/// text contains \p Message, the numeric code, and the name and description
/// reported by the driver (or placeholder text if the driver lookup fails).
Error CUresultToError(int CUResult, const llvm::Twine &Message) {
  const CUresult Code = static_cast<CUresult>(CUResult);
  if (!Code)
    return Error::success();

  const char *Name;
  if (cuGetErrorName(Code, &Name))
    Name = "UNKNOWN ERROR NAME";

  const char *Description;
  if (cuGetErrorString(Code, &Description))
    Description = "UNKNOWN ERROR DESCRIPTION";

  return make_error("CUDA driver error: '" + Message + "', error code = " +
                    llvm::Twine(static_cast<int>(Code)) + ", name = " +
                    Name + ", description = '" + Description + "'");
}
|
||||
|
||||
std::string CUDAPlatformDevice::getName() const {
|
||||
static std::string CachedName = [](int DeviceIndex) {
|
||||
static constexpr size_t MAX_DRIVER_NAME_BYTES = 1024;
|
||||
std::string Name = "CUDA device " + std::to_string(DeviceIndex);
|
||||
char NameFromDriver[MAX_DRIVER_NAME_BYTES];
|
||||
if (!cuDeviceGetName(NameFromDriver, MAX_DRIVER_NAME_BYTES - 1,
|
||||
DeviceIndex)) {
|
||||
NameFromDriver[MAX_DRIVER_NAME_BYTES - 1] = '\0';
|
||||
Name.append(": ").append(NameFromDriver);
|
||||
}
|
||||
return Name;
|
||||
}(DeviceIndex);
|
||||
return CachedName;
|
||||
}
|
||||
|
||||
/// Creates the platform device for \p DeviceIndex.
///
/// Retains the device's primary context and makes it current on the calling
/// thread. Returns an error if any driver call fails; in that case no
/// reference to the primary context is left retained.
Expected<CUDAPlatformDevice> CUDAPlatformDevice::create(size_t DeviceIndex) {
  CUdevice DeviceHandle;
  if (CUresult Result = cuDeviceGet(&DeviceHandle, DeviceIndex))
    return CUresultToError(Result, "cuDeviceGet");

  CUcontext ContextHandle;
  if (CUresult Result = cuDevicePrimaryCtxRetain(&ContextHandle, DeviceHandle))
    return CUresultToError(Result, "cuDevicePrimaryCtxRetain");

  if (CUresult Result = cuCtxSetCurrent(ContextHandle)) {
    // No CUDAPlatformDevice will exist to release the context retained above,
    // so drop that reference here before reporting the failure.
    // TODO(jhen): Log error if the release itself fails.
    (void)cuDevicePrimaryCtxRelease(DeviceHandle);
    return CUresultToError(Result, "cuCtxSetCurrent");
  }

  return CUDAPlatformDevice(DeviceIndex);
}
|
||||
|
||||
/// Move constructor: takes over the device index held by \p Other and marks
/// \p Other as moved-from by setting its index to -1.
CUDAPlatformDevice::CUDAPlatformDevice(CUDAPlatformDevice &&Other) noexcept
    : DeviceIndex(-1) {
  DeviceIndex = Other.DeviceIndex;
  Other.DeviceIndex = -1;
}
|
||||
|
||||
/// Move assignment: releases the primary-context reference this object holds
/// (if any), then takes over the device index held by \p Other and marks
/// \p Other as moved-from by setting its index to -1.
///
/// Self-assignment is a no-op; without the check the object would overwrite
/// its own index with -1.
CUDAPlatformDevice &CUDAPlatformDevice::
operator=(CUDAPlatformDevice &&Other) noexcept {
  if (this != &Other) {
    // A negative index marks a moved-from object with nothing to release.
    if (DeviceIndex >= 0) {
      CUresult Result = cuDevicePrimaryCtxRelease(DeviceIndex);
      (void)Result;
      // TODO(jhen): Log error.
    }
    DeviceIndex = Other.DeviceIndex;
    Other.DeviceIndex = -1;
  }
  return *this;
}
|
||||
|
||||
/// Releases this object's reference to the device's primary context.
///
/// Moved-from objects carry a DeviceIndex of -1 and hold no reference, so the
/// driver is not called for them.
CUDAPlatformDevice::~CUDAPlatformDevice() {
  if (DeviceIndex < 0)
    return;

  CUresult Result = cuDevicePrimaryCtxRelease(DeviceIndex);
  (void)Result;
  // TODO(jhen): Log error.
}
|
||||
|
||||
/// Loads the PTX that matches this device's compute capability from \p Spec
/// and returns the resulting CUfunction as an opaque kernel handle.
///
/// Returns an error if \p Spec carries no in-memory CUDA PTX, if no PTX is
/// available for the device's compute capability, or if a driver call fails.
Expected<const void *>
CUDAPlatformDevice::createKernel(const MultiKernelLoaderSpec &Spec) {
  // TODO(jhen): Maybe first check loaded modules?
  if (!Spec.hasCUDAPTXInMemory())
    return make_error("no CUDA code available to create kernel");

  CUdevice Device = static_cast<int>(DeviceIndex);
  int ComputeCapabilityMajor = 0;
  int ComputeCapabilityMinor = 0;
  if (CUresult Result = cuDeviceGetAttribute(
          &ComputeCapabilityMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
          Device))
    return CUresultToError(
        Result,
        "cuDeviceGetAttribute CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR");
  // The minor version has its own attribute; querying MAJOR a second time
  // would make the PTX lookup below use e.g. 3.3 for a 3.5 device.
  if (CUresult Result = cuDeviceGetAttribute(
          &ComputeCapabilityMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
          Device))
    return CUresultToError(
        Result,
        "cuDeviceGetAttribute CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR");
  const char *Code = Spec.getCUDAPTXInMemory().getCode(ComputeCapabilityMajor,
                                                       ComputeCapabilityMinor);

  if (!Code)
    return make_error("no suitable CUDA source found for compute capability " +
                      llvm::Twine(ComputeCapabilityMajor) + "." +
                      llvm::Twine(ComputeCapabilityMinor));

  CUmodule Module;
  if (CUresult Result = cuModuleLoadData(&Module, Code))
    return CUresultToError(Result, "cuModuleLoadData");

  CUfunction Function;
  if (CUresult Result =
          cuModuleGetFunction(&Function, Module, Spec.getKernelName().c_str()))
    return CUresultToError(Result, "cuModuleGetFunction");

  // TODO(jhen): Should I save this function pointer in case someone asks for
  // it again?

  // TODO(jhen): Should I save the module pointer so I can unload it when I
  // destroy this device?

  return static_cast<const void *>(Function);
}
|
||||
|
||||
/// Currently a no-op that always reports success; \p Handle is not used.
Error CUDAPlatformDevice::destroyKernel(const void *Handle) {
  // TODO(jhen): Maybe keep track of kernels for each module and unload the
  // module after they are all destroyed.
  (void)Handle;
  return Error::success();
}
|
||||
|
||||
/// Creates a CUDA stream with CU_STREAM_DEFAULT flags and returns it as an
/// opaque handle, or an error if the driver call fails.
Expected<const void *> CUDAPlatformDevice::createStream() {
  CUstream NewStream;
  CUresult Status = cuStreamCreate(&NewStream, CU_STREAM_DEFAULT);
  if (Status)
    return CUresultToError(Status, "cuStreamCreate");
  return NewStream;
}
|
||||
|
||||
/// Destroys the CUDA stream identified by \p Handle.
Error CUDAPlatformDevice::destroyStream(const void *Handle) {
  CUstream Stream = static_cast<CUstream>(const_cast<void *>(Handle));
  return CUresultToError(cuStreamDestroy(Stream), "cuStreamDestroy");
}
|
||||
|
||||
/// Enqueues the kernel \p PKernelHandle on the stream \p PlatformStreamHandle
/// with the given grid and block dimensions and the argument addresses taken
/// from \p ArgumentArray. No dynamic shared memory is requested.
Error CUDAPlatformDevice::launch(
    const void *PlatformStreamHandle, BlockDimensions BlockSize,
    GridDimensions GridSize, const void *PKernelHandle,
    const PackedKernelArgumentArrayBase &ArgumentArray) {
  CUfunction Kernel =
      reinterpret_cast<CUfunction>(const_cast<void *>(PKernelHandle));
  CUstream LaunchStream =
      reinterpret_cast<CUstream>(const_cast<void *>(PlatformStreamHandle));
  // TODO(jhen): Deal with shared memory arguments.
  unsigned SharedMemoryBytes = 0;
  void **KernelArguments = const_cast<void **>(ArgumentArray.getAddresses());

  CUresult Status = cuLaunchKernel(Kernel, GridSize.X, GridSize.Y, GridSize.Z,
                                   BlockSize.X, BlockSize.Y, BlockSize.Z,
                                   SharedMemoryBytes, LaunchStream,
                                   KernelArguments, nullptr);
  return CUresultToError(Status, "cuLaunchKernel");
}
|
||||
|
||||
/// Enqueues on \p PlatformStreamHandle a copy of \p ByteCount bytes from
/// device memory (\p DeviceSrcHandle + \p SrcByteOffset) to host memory
/// (\p HostDst + \p DstByteOffset).
Error CUDAPlatformDevice::copyD2H(const void *PlatformStreamHandle,
                                  const void *DeviceSrcHandle,
                                  size_t SrcByteOffset, void *HostDst,
                                  size_t DstByteOffset, size_t ByteCount) {
  void *Destination = offset(HostDst, DstByteOffset);
  CUdeviceptr Source =
      reinterpret_cast<CUdeviceptr>(offset(DeviceSrcHandle, SrcByteOffset));
  CUstream Stream =
      static_cast<CUstream>(const_cast<void *>(PlatformStreamHandle));
  return CUresultToError(
      cuMemcpyDtoHAsync(Destination, Source, ByteCount, Stream),
      "cuMemcpyDtoHAsync");
}
|
||||
|
||||
/// Enqueues on \p PlatformStreamHandle a copy of \p ByteCount bytes from host
/// memory (\p HostSrc + \p SrcByteOffset) to device memory
/// (\p DeviceDstHandle + \p DstByteOffset).
Error CUDAPlatformDevice::copyH2D(const void *PlatformStreamHandle,
                                  const void *HostSrc, size_t SrcByteOffset,
                                  const void *DeviceDstHandle,
                                  size_t DstByteOffset, size_t ByteCount) {
  CUdeviceptr Destination =
      reinterpret_cast<CUdeviceptr>(offset(DeviceDstHandle, DstByteOffset));
  void *Source = offset(HostSrc, SrcByteOffset);
  CUstream Stream =
      static_cast<CUstream>(const_cast<void *>(PlatformStreamHandle));
  return CUresultToError(
      cuMemcpyHtoDAsync(Destination, Source, ByteCount, Stream),
      "cuMemcpyHtoDAsync");
}
|
||||
|
||||
/// Enqueues on \p PlatformStreamHandle a copy of \p ByteCount bytes from
/// device memory (\p DeviceSrcHandle + \p SrcByteOffset) to device memory
/// (\p DeviceDstHandle + \p DstByteOffset).
Error CUDAPlatformDevice::copyD2D(const void *PlatformStreamHandle,
                                  const void *DeviceSrcHandle,
                                  size_t SrcByteOffset,
                                  const void *DeviceDstHandle,
                                  size_t DstByteOffset, size_t ByteCount) {
  CUdeviceptr Destination =
      reinterpret_cast<CUdeviceptr>(offset(DeviceDstHandle, DstByteOffset));
  CUdeviceptr Source =
      reinterpret_cast<CUdeviceptr>(offset(DeviceSrcHandle, SrcByteOffset));
  CUstream Stream =
      static_cast<CUstream>(const_cast<void *>(PlatformStreamHandle));
  return CUresultToError(
      cuMemcpyDtoDAsync(Destination, Source, ByteCount, Stream),
      "cuMemcpyDtoDAsync");
}
|
||||
|
||||
/// Calls cuStreamSynchronize on the stream \p PlatformStreamHandle and
/// converts the driver result into an Error.
Error CUDAPlatformDevice::blockHostUntilDone(const void *PlatformStreamHandle) {
  CUstream Stream =
      static_cast<CUstream>(const_cast<void *>(PlatformStreamHandle));
  return CUresultToError(cuStreamSynchronize(Stream), "cuStreamSynchronize");
}
|
||||
|
||||
/// Allocates \p ByteCount bytes with cuMemAlloc and returns the device
/// pointer as an opaque handle, or an error if the driver call fails.
Expected<void *> CUDAPlatformDevice::allocateDeviceMemory(size_t ByteCount) {
  CUdeviceptr DevicePointer;
  CUresult Status = cuMemAlloc(&DevicePointer, ByteCount);
  if (Status)
    return CUresultToError(Status, "cuMemAlloc");
  return reinterpret_cast<void *>(DevicePointer);
}
|
||||
|
||||
/// Frees the device memory identified by \p Handle with cuMemFree.
Error CUDAPlatformDevice::freeDeviceMemory(const void *Handle) {
  CUdeviceptr DevicePointer = reinterpret_cast<CUdeviceptr>(Handle);
  return CUresultToError(cuMemFree(DevicePointer), "cuMemFree");
}
|
||||
|
||||
/// Registers \p ByteCount bytes of host memory starting at \p Memory with the
/// driver, passing no registration flags.
Error CUDAPlatformDevice::registerHostMemory(void *Memory, size_t ByteCount) {
  CUresult Status = cuMemHostRegister(Memory, ByteCount, 0u);
  return CUresultToError(Status, "cuMemHostRegister");
}
|
||||
|
||||
/// Unregisters the host memory at \p Memory with cuMemHostUnregister.
Error CUDAPlatformDevice::unregisterHostMemory(const void *Memory) {
  void *HostPointer = const_cast<void *>(Memory);
  return CUresultToError(cuMemHostUnregister(HostPointer),
                         "cuMemHostUnregister");
}
|
||||
|
||||
/// Copies \p ByteCount bytes from device memory
/// (\p DeviceSrcHandle + \p SrcByteOffset) to host memory
/// (\p HostDst + \p DstByteOffset) using cuMemcpyDtoH.
Error CUDAPlatformDevice::synchronousCopyD2H(const void *DeviceSrcHandle,
                                             size_t SrcByteOffset,
                                             void *HostDst,
                                             size_t DstByteOffset,
                                             size_t ByteCount) {
  void *Destination = offset(HostDst, DstByteOffset);
  CUdeviceptr Source =
      reinterpret_cast<CUdeviceptr>(offset(DeviceSrcHandle, SrcByteOffset));
  return CUresultToError(cuMemcpyDtoH(Destination, Source, ByteCount),
                         "cuMemcpyDtoH");
}
|
||||
|
||||
/// Copies \p ByteCount bytes from host memory (\p HostSrc + \p SrcByteOffset)
/// to device memory (\p DeviceDstHandle + \p DstByteOffset) using
/// cuMemcpyHtoD.
Error CUDAPlatformDevice::synchronousCopyH2D(const void *HostSrc,
                                             size_t SrcByteOffset,
                                             const void *DeviceDstHandle,
                                             size_t DstByteOffset,
                                             size_t ByteCount) {
  CUdeviceptr Destination =
      reinterpret_cast<CUdeviceptr>(offset(DeviceDstHandle, DstByteOffset));
  void *Source = offset(HostSrc, SrcByteOffset);
  return CUresultToError(cuMemcpyHtoD(Destination, Source, ByteCount),
                         "cuMemcpyHtoD");
}
|
||||
|
||||
/// Copies \p ByteCount bytes from device memory
/// (\p DeviceSrcHandle + \p SrcByteOffset) to device memory
/// (\p DeviceDstHandle + \p DstByteOffset) using cuMemcpyDtoD.
///
/// Note that the destination parameters come before the source parameters.
Error CUDAPlatformDevice::synchronousCopyD2D(const void *DeviceDstHandle,
                                             size_t DstByteOffset,
                                             const void *DeviceSrcHandle,
                                             size_t SrcByteOffset,
                                             size_t ByteCount) {
  CUdeviceptr Destination =
      reinterpret_cast<CUdeviceptr>(offset(DeviceDstHandle, DstByteOffset));
  CUdeviceptr Source =
      reinterpret_cast<CUdeviceptr>(offset(DeviceSrcHandle, SrcByteOffset));
  return CUresultToError(cuMemcpyDtoD(Destination, Source, ByteCount),
                         "cuMemcpyDtoD");
}
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace streamexecutor
|
|
@ -0,0 +1,21 @@
|
|||
# - Try to find the libcuda library
# Once done this will define
#  LIBCUDA_FOUND - System has libcuda
#  LIBCUDA_INCLUDE_DIRS - The libcuda include directories
#  LIBCUDA_LIBRARIES - The libraries needed to use libcuda

# TODO(jhen): Allow users to specify a search path.
find_path(LIBCUDA_INCLUDE_DIR cuda.h /usr/local/cuda/include)
# TODO(jhen): Use the library that goes with the headers.
find_library(LIBCUDA_LIBRARY cuda)

include(FindPackageHandleStandardArgs)
# Handle the QUIETLY and REQUIRED arguments and set the FOUND result to TRUE
# if all listed variables are TRUE.
#
# The first argument must be spelled exactly like the name given to
# find_package(Libcuda ...): the REQUIRED and QUIET requests are looked up as
# Libcuda_FIND_REQUIRED and Libcuda_FIND_QUIETLY. With a different spelling a
# missing libcuda would only print a status message instead of stopping the
# configuration. The uppercase LIBCUDA_FOUND result is still defined for
# backward compatibility.
find_package_handle_standard_args(
  Libcuda DEFAULT_MSG LIBCUDA_INCLUDE_DIR LIBCUDA_LIBRARY)

mark_as_advanced(LIBCUDA_INCLUDE_DIR LIBCUDA_LIBRARY)

set(LIBCUDA_LIBRARIES ${LIBCUDA_LIBRARY})
set(LIBCUDA_INCLUDE_DIRS ${LIBCUDA_INCLUDE_DIR})
|
Loading…
Reference in New Issue