Initial check-in of Acxxel (StreamExecutor renamed)
Summary:
Acxxel is basically a simplified redesign of StreamExecutor.
Here are the major points where Acxxel differs from the current
StreamExecutor design:
* Acxxel doesn't support the kernel and kernel loader types designed for
emission by the compiler to support type-safe kernel launches. For
CUDA, kernels in Acxxel can be seamlessly launched using the standard
CUDA triple-chevron kernel launch syntax that is available with clang
and nvcc. For CUDA and OpenCL, kernel arguments can be passed in the
old-fashioned way, as one array of pointers to arguments and another
array of argument sizes. Although OpenCL doesn't get a type-safe
kernel launch method, it does still get the benefit of all the memory
management wrappers. In the future, clang may add support for
triple-chevron OpenCL kernel launches, or some other type-safe OpenCL
kernel launch method.
* Acxxel does not depend on any other code in LLVM, so it builds
completely independently from LLVM.
The goal will be to check in Acxxel and remove StreamExecutor, or
perhaps to remove the old StreamExecutor and rename Acxxel to
StreamExecutor, so I think Acxxel should be thought of as a new version
of StreamExecutor, not as a separate project.
Reviewers: jlebar, jprice
Subscribers: beanz, mgorny, modocache, parallel_libs-commits
Differential Revision: https://reviews.llvm.org/D25701
llvm-svn: 285111
2016-10-26 04:18:56 +08:00
|
|
|
//===--- cuda_acxxel.cpp - CUDA implementation of the Acxxel API ----------===//
|
|
|
|
//
|
2019-01-19 16:50:56 +08:00
|
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
Initial check-in of Acxxel (StreamExecutor renamed)
Summary:
Acxxel is basically a simplified redesign of StreamExecutor.
Here are the major points where Acxxel differs from the current
StreamExecutor design:
* Acxxel doesn't support the kernel and kernel loader types designed for
emission by the compiler to support type-safe kernel launches. For
CUDA, kernels in Acxxel can be seamlessly launched using the standard
CUDA triple-chevron kernel launch syntax that is available with clang
and nvcc. For CUDA and OpenCL, kernel arguments can be passed in the
old-fashioned way, as one array of pointers to arguments and another
array of argument sizes. Although OpenCL doesn't get a type-safe
kernel launch method, it does still get the benefit of all the memory
management wrappers. In the future, clang may add support for
triple-chevron OpenCL kernel launches, or some other type-safe OpenCL
kernel launch method.
* Acxxel does not depend on any other code in LLVM, so it builds
completely independently from LLVM.
The goal will be to check in Acxxel and remove StreamExecutor, or
perhaps to remove the old StreamExecutor and rename Acxxel to
StreamExecutor, so I think Acxxel should be thought of as a new version
of StreamExecutor, not as a separate project.
Reviewers: jlebar, jprice
Subscribers: beanz, mgorny, modocache, parallel_libs-commits
Differential Revision: https://reviews.llvm.org/D25701
llvm-svn: 285111
2016-10-26 04:18:56 +08:00
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
///
|
|
|
|
/// This file defines the standard CUDA implementation of the Acxxel API.
|
|
|
|
///
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
#include "acxxel.h"
|
|
|
|
|
|
|
|
#include "cuda.h"
|
|
|
|
#include "cuda_runtime.h"
|
|
|
|
|
|
|
|
#include <array>
|
|
|
|
#include <cassert>
|
|
|
|
#include <sstream>
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
namespace acxxel {
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
/// Builds a human-readable description of a CUDA driver API result code.
///
/// \param Result the result code returned by a CUDA driver API call.
/// \returns "success" when \p Result is zero; otherwise a message containing
/// the numeric code, the error name, and the error description. Placeholder
/// text is used for the name or description when the driver cannot supply it.
static std::string getCUErrorMessage(CUresult Result) {
  if (!Result)
    return "success";
  const char *ErrorName = nullptr;
  const char *ErrorDescription = nullptr;
  // For an unrecognized code the driver reports failure and stores a null
  // pointer through the out-parameter. Streaming a null `const char *` is
  // undefined behavior, so fall back to the placeholders in that case.
  if (cuGetErrorName(Result, &ErrorName) || !ErrorName)
    ErrorName = "UNKNOWN_ERROR_NAME";
  if (cuGetErrorString(Result, &ErrorDescription) || !ErrorDescription)
    ErrorDescription = "UNKNOWN_ERROR_DESCRIPTION";
  std::ostringstream OutStream;
  OutStream << "CUDA driver error: code = " << Result
            << ", name = " << ErrorName
            << ", description = " << ErrorDescription;
  return OutStream.str();
}
|
|
|
|
|
|
|
|
/// Converts a CUDA driver result code into a Status.
///
/// A zero \p Result yields a default-constructed Status. Any other value
/// yields a Status whose text is the driver error message followed by
/// \p Message.
static Status getCUError(CUresult Result, const std::string &Message) {
  if (Result) {
    std::ostringstream Text;
    Text << getCUErrorMessage(Result);
    Text << ", message = " << Message;
    return Status(Text.str());
  }
  return Status();
}
|
|
|
|
|
|
|
|
/// Builds a human-readable description of a CUDA runtime API error code.
///
/// \returns "success" when \p E is zero; otherwise a message containing the
/// numeric code plus the name and description reported by the CUDA runtime.
static std::string getCUDAErrorMessage(cudaError_t E) {
  if (E) {
    std::ostringstream Text;
    Text << "CUDA runtime error: code = " << E;
    Text << ", name = " << cudaGetErrorName(E);
    Text << ", description = " << cudaGetErrorString(E);
    return Text.str();
  }
  return "success";
}
|
|
|
|
|
|
|
|
/// Converts a CUDA runtime error code into a Status.
///
/// A zero \p E yields a default-constructed Status. Any other value yields a
/// Status whose text is the runtime error message followed by \p Message.
static Status getCUDAError(cudaError_t E, const std::string &Message) {
  if (E) {
    std::ostringstream Text;
    Text << getCUDAErrorMessage(E);
    Text << ", message = " << Message;
    return Status(Text.str());
  }
  return Status();
}
|
|
|
|
|
|
|
|
/// Logs a warning when a CUDA driver call did not succeed.
///
/// Does nothing when \p Result is zero. Otherwise passes
/// "<Message>: <driver error message>" to logWarning.
static void logCUWarning(CUresult Result, const std::string &Message) {
  if (!Result)
    return;
  std::ostringstream Text;
  Text << Message << ": ";
  Text << getCUErrorMessage(Result);
  logWarning(Text.str());
}
|
|
|
|
|
|
|
|
/// A CUDA Platform implementation.
|
|
|
|
class CUDAPlatform : public Platform {
|
|
|
|
public:
|
|
|
|
~CUDAPlatform() override = default;
|
|
|
|
|
|
|
|
static Expected<CUDAPlatform> create();
|
|
|
|
|
|
|
|
Expected<int> getDeviceCount() override;
|
|
|
|
|
2016-10-28 08:54:02 +08:00
|
|
|
Expected<Stream> createStream(int DeviceIndex) override;
|
Initial check-in of Acxxel (StreamExecutor renamed)
Summary:
Acxxel is basically a simplified redesign of StreamExecutor.
Here are the major points where Acxxel differs from the current
StreamExecutor design:
* Acxxel doesn't support the kernel and kernel loader types designed for
emission by the compiler to support type-safe kernel launches. For
CUDA, kernels in Acxxel can be seamlessly launched using the standard
CUDA triple-chevron kernel launch syntax that is available with clang
and nvcc. For CUDA and OpenCL, kernel arguments can be passed in the
old-fashioned way, as one array of pointers to arguments and another
array of argument sizes. Although OpenCL doesn't get a type-safe
kernel launch method, it does still get the benefit of all the memory
management wrappers. In the future, clang may add support for
triple-chevron OpenCL kernel launchs, or some other type-safe OpenCL
kernel launch method.
* Acxxel does not depend on any other code in LLVM, so it builds
completely independently from LLVM.
The goal will be to check in Acxxel and remove StreamExecutor, or
perhaps to remove the old StreamExecutor and rename Acxxel to
StreamExecutor, so I think Acxxel should be thought of as a new version
of StreamExecutor, not as a separate project.
Reviewers: jlebar, jprice
Subscribers: beanz, mgorny, modocache, parallel_libs-commits
Differential Revision: https://reviews.llvm.org/D25701
llvm-svn: 285111
2016-10-26 04:18:56 +08:00
|
|
|
|
|
|
|
Status streamSync(void *Stream) override;
|
|
|
|
|
|
|
|
Status streamWaitOnEvent(void *Stream, void *Event) override;
|
|
|
|
|
2016-10-28 08:54:02 +08:00
|
|
|
Expected<Event> createEvent(int DeviceIndex) override;
|
Initial check-in of Acxxel (StreamExecutor renamed)
Summary:
Acxxel is basically a simplified redesign of StreamExecutor.
Here are the major points where Acxxel differs from the current
StreamExecutor design:
* Acxxel doesn't support the kernel and kernel loader types designed for
emission by the compiler to support type-safe kernel launches. For
CUDA, kernels in Acxxel can be seamlessly launched using the standard
CUDA triple-chevron kernel launch syntax that is available with clang
and nvcc. For CUDA and OpenCL, kernel arguments can be passed in the
old-fashioned way, as one array of pointers to arguments and another
array of argument sizes. Although OpenCL doesn't get a type-safe
kernel launch method, it does still get the benefit of all the memory
management wrappers. In the future, clang may add support for
triple-chevron OpenCL kernel launchs, or some other type-safe OpenCL
kernel launch method.
* Acxxel does not depend on any other code in LLVM, so it builds
completely independently from LLVM.
The goal will be to check in Acxxel and remove StreamExecutor, or
perhaps to remove the old StreamExecutor and rename Acxxel to
StreamExecutor, so I think Acxxel should be thought of as a new version
of StreamExecutor, not as a separate project.
Reviewers: jlebar, jprice
Subscribers: beanz, mgorny, modocache, parallel_libs-commits
Differential Revision: https://reviews.llvm.org/D25701
llvm-svn: 285111
2016-10-26 04:18:56 +08:00
|
|
|
|
|
|
|
protected:
|
2016-10-28 08:54:02 +08:00
|
|
|
Expected<void *> rawMallocD(ptrdiff_t ByteCount, int DeviceIndex) override;
|
Initial check-in of Acxxel (StreamExecutor renamed)
Summary:
Acxxel is basically a simplified redesign of StreamExecutor.
Here are the major points where Acxxel differs from the current
StreamExecutor design:
* Acxxel doesn't support the kernel and kernel loader types designed for
emission by the compiler to support type-safe kernel launches. For
CUDA, kernels in Acxxel can be seamlessly launched using the standard
CUDA triple-chevron kernel launch syntax that is available with clang
and nvcc. For CUDA and OpenCL, kernel arguments can be passed in the
old-fashioned way, as one array of pointers to arguments and another
array of argument sizes. Although OpenCL doesn't get a type-safe
kernel launch method, it does still get the benefit of all the memory
management wrappers. In the future, clang may add support for
triple-chevron OpenCL kernel launchs, or some other type-safe OpenCL
kernel launch method.
* Acxxel does not depend on any other code in LLVM, so it builds
completely independently from LLVM.
The goal will be to check in Acxxel and remove StreamExecutor, or
perhaps to remove the old StreamExecutor and rename Acxxel to
StreamExecutor, so I think Acxxel should be thought of as a new version
of StreamExecutor, not as a separate project.
Reviewers: jlebar, jprice
Subscribers: beanz, mgorny, modocache, parallel_libs-commits
Differential Revision: https://reviews.llvm.org/D25701
llvm-svn: 285111
2016-10-26 04:18:56 +08:00
|
|
|
HandleDestructor getDeviceMemoryHandleDestructor() override;
|
|
|
|
void *getDeviceMemorySpanHandle(void *BaseHandle, size_t ByteSize,
|
|
|
|
size_t ByteOffset) override;
|
|
|
|
virtual void rawDestroyDeviceMemorySpanHandle(void *Handle) override;
|
|
|
|
|
2016-10-28 08:54:02 +08:00
|
|
|
Expected<void *> rawGetDeviceSymbolAddress(const void *Symbol,
|
|
|
|
int DeviceIndex) override;
|
|
|
|
Expected<ptrdiff_t> rawGetDeviceSymbolSize(const void *Symbol,
|
|
|
|
int DeviceIndex) override;
|
Initial check-in of Acxxel (StreamExecutor renamed)
Summary:
Acxxel is basically a simplified redesign of StreamExecutor.
Here are the major points where Acxxel differs from the current
StreamExecutor design:
* Acxxel doesn't support the kernel and kernel loader types designed for
emission by the compiler to support type-safe kernel launches. For
CUDA, kernels in Acxxel can be seamlessly launched using the standard
CUDA triple-chevron kernel launch syntax that is available with clang
and nvcc. For CUDA and OpenCL, kernel arguments can be passed in the
old-fashioned way, as one array of pointers to arguments and another
array of argument sizes. Although OpenCL doesn't get a type-safe
kernel launch method, it does still get the benefit of all the memory
management wrappers. In the future, clang may add support for
triple-chevron OpenCL kernel launchs, or some other type-safe OpenCL
kernel launch method.
* Acxxel does not depend on any other code in LLVM, so it builds
completely independently from LLVM.
The goal will be to check in Acxxel and remove StreamExecutor, or
perhaps to remove the old StreamExecutor and rename Acxxel to
StreamExecutor, so I think Acxxel should be thought of as a new version
of StreamExecutor, not as a separate project.
Reviewers: jlebar, jprice
Subscribers: beanz, mgorny, modocache, parallel_libs-commits
Differential Revision: https://reviews.llvm.org/D25701
llvm-svn: 285111
2016-10-26 04:18:56 +08:00
|
|
|
|
|
|
|
Status rawRegisterHostMem(const void *Memory, ptrdiff_t ByteCount) override;
|
|
|
|
HandleDestructor getUnregisterHostMemoryHandleDestructor() override;
|
|
|
|
|
|
|
|
Expected<void *> rawMallocRegisteredH(ptrdiff_t ByteCount) override;
|
|
|
|
HandleDestructor getFreeHostMemoryHandleDestructor() override;
|
|
|
|
|
|
|
|
Status asyncCopyDToD(const void *DeviceSrc, ptrdiff_t DeviceSrcByteOffset,
|
|
|
|
void *DeviceDst, ptrdiff_t DeviceDstByteOffset,
|
|
|
|
ptrdiff_t ByteCount, void *Stream) override;
|
|
|
|
Status asyncCopyDToH(const void *DeviceSrc, ptrdiff_t DeviceSrcByteOffset,
|
|
|
|
void *HostDst, ptrdiff_t ByteCount,
|
|
|
|
void *Stream) override;
|
|
|
|
Status asyncCopyHToD(const void *HostSrc, void *DeviceDst,
|
|
|
|
ptrdiff_t DeviceDstByteOffset, ptrdiff_t ByteCount,
|
|
|
|
void *Stream) override;
|
|
|
|
|
|
|
|
Status asyncMemsetD(void *DeviceDst, ptrdiff_t ByteOffset,
|
|
|
|
ptrdiff_t ByteCount, char ByteValue,
|
|
|
|
void *Stream) override;
|
|
|
|
|
|
|
|
Status addStreamCallback(Stream &Stream, StreamCallback Callback) override;
|
|
|
|
|
2016-10-28 08:54:02 +08:00
|
|
|
Expected<Program> createProgramFromSource(Span<const char> Source,
|
|
|
|
int DeviceIndex) override;
|
Initial check-in of Acxxel (StreamExecutor renamed)
Summary:
Acxxel is basically a simplified redesign of StreamExecutor.
Here are the major points where Acxxel differs from the current
StreamExecutor design:
* Acxxel doesn't support the kernel and kernel loader types designed for
emission by the compiler to support type-safe kernel launches. For
CUDA, kernels in Acxxel can be seamlessly launched using the standard
CUDA triple-chevron kernel launch syntax that is available with clang
and nvcc. For CUDA and OpenCL, kernel arguments can be passed in the
old-fashioned way, as one array of pointers to arguments and another
array of argument sizes. Although OpenCL doesn't get a type-safe
kernel launch method, it does still get the benefit of all the memory
management wrappers. In the future, clang may add support for
triple-chevron OpenCL kernel launchs, or some other type-safe OpenCL
kernel launch method.
* Acxxel does not depend on any other code in LLVM, so it builds
completely independently from LLVM.
The goal will be to check in Acxxel and remove StreamExecutor, or
perhaps to remove the old StreamExecutor and rename Acxxel to
StreamExecutor, so I think Acxxel should be thought of as a new version
of StreamExecutor, not as a separate project.
Reviewers: jlebar, jprice
Subscribers: beanz, mgorny, modocache, parallel_libs-commits
Differential Revision: https://reviews.llvm.org/D25701
llvm-svn: 285111
2016-10-26 04:18:56 +08:00
|
|
|
|
|
|
|
Status enqueueEvent(void *Event, void *Stream) override;
|
|
|
|
bool eventIsDone(void *Event) override;
|
|
|
|
Status eventSync(void *Event) override;
|
|
|
|
Expected<float> getSecondsBetweenEvents(void *StartEvent,
|
|
|
|
void *EndEvent) override;
|
|
|
|
|
|
|
|
Expected<void *> rawCreateKernel(void *Program,
|
|
|
|
const std::string &Name) override;
|
|
|
|
HandleDestructor getKernelHandleDestructor() override;
|
|
|
|
|
|
|
|
Status rawEnqueueKernelLaunch(void *Stream, void *Kernel,
|
|
|
|
KernelLaunchDimensions LaunchDimensions,
|
|
|
|
Span<void *> Arguments,
|
|
|
|
Span<size_t> ArgumentSizes,
|
|
|
|
size_t SharedMemoryBytes) override;
|
|
|
|
|
|
|
|
private:
|
|
|
|
explicit CUDAPlatform(const std::vector<CUcontext> &Contexts)
|
|
|
|
: TheContexts(Contexts) {}
|
|
|
|
|
2016-10-28 08:54:02 +08:00
|
|
|
Status setContext(int DeviceIndex) {
|
|
|
|
if (DeviceIndex < 0 ||
|
|
|
|
static_cast<size_t>(DeviceIndex) >= TheContexts.size())
|
|
|
|
return Status("invalid deivce index " + std::to_string(DeviceIndex));
|
|
|
|
return getCUError(cuCtxSetCurrent(TheContexts[DeviceIndex]),
|
|
|
|
"cuCtxSetCurrent");
|
|
|
|
}
|
|
|
|
|
Initial check-in of Acxxel (StreamExecutor renamed)
Summary:
Acxxel is basically a simplified redesign of StreamExecutor.
Here are the major points where Acxxel differs from the current
StreamExecutor design:
* Acxxel doesn't support the kernel and kernel loader types designed for
emission by the compiler to support type-safe kernel launches. For
CUDA, kernels in Acxxel can be seamlessly launched using the standard
CUDA triple-chevron kernel launch syntax that is available with clang
and nvcc. For CUDA and OpenCL, kernel arguments can be passed in the
old-fashioned way, as one array of pointers to arguments and another
array of argument sizes. Although OpenCL doesn't get a type-safe
kernel launch method, it does still get the benefit of all the memory
management wrappers. In the future, clang may add support for
triple-chevron OpenCL kernel launchs, or some other type-safe OpenCL
kernel launch method.
* Acxxel does not depend on any other code in LLVM, so it builds
completely independently from LLVM.
The goal will be to check in Acxxel and remove StreamExecutor, or
perhaps to remove the old StreamExecutor and rename Acxxel to
StreamExecutor, so I think Acxxel should be thought of as a new version
of StreamExecutor, not as a separate project.
Reviewers: jlebar, jprice
Subscribers: beanz, mgorny, modocache, parallel_libs-commits
Differential Revision: https://reviews.llvm.org/D25701
llvm-svn: 285111
2016-10-26 04:18:56 +08:00
|
|
|
// Vector of contexts for each device.
|
|
|
|
std::vector<CUcontext> TheContexts;
|
|
|
|
};
|
|
|
|
|
|
|
|
/// Initializes the CUDA driver and builds a platform holding one primary
/// context per device.
///
/// For each device ordinal reported by cuDeviceGetCount, the device's primary
/// context is retained, made current, and recorded. The first failing driver
/// call aborts the process and its error is returned.
///
/// NOTE(review): primary contexts retained in earlier iterations do not
/// appear to be released on the error paths -- confirm whether that matters.
Expected<CUDAPlatform> CUDAPlatform::create() {
  if (CUresult InitResult = cuInit(0))
    return getCUError(InitResult, "cuInit");

  int NumDevices = 0;
  if (CUresult CountResult = cuDeviceGetCount(&NumDevices))
    return getCUError(CountResult, "cuDeviceGetCount");

  std::vector<CUcontext> PrimaryContexts;
  for (int Ordinal = 0; Ordinal < NumDevices; ++Ordinal) {
    CUdevice Dev;
    if (CUresult GetResult = cuDeviceGet(&Dev, Ordinal))
      return getCUError(GetResult, "cuDeviceGet");

    CUcontext Ctx;
    if (CUresult RetainResult = cuDevicePrimaryCtxRetain(&Ctx, Dev))
      return getCUError(RetainResult, "cuDevicePrimaryCtxRetain");

    if (CUresult SetResult = cuCtxSetCurrent(Ctx))
      return getCUError(SetResult, "cuCtxSetCurrent");

    PrimaryContexts.push_back(Ctx);
  }

  return CUDAPlatform(PrimaryContexts);
}
|
|
|
|
|
|
|
|
/// Returns the device count reported by cuDeviceGetCount, or the driver
/// error if that call fails.
Expected<int> CUDAPlatform::getDeviceCount() {
  int NumDevices = 0;
  CUresult Result = cuDeviceGetCount(&NumDevices);
  if (Result)
    return getCUError(Result, "cuDeviceGetCount");
  return NumDevices;
}
|
|
|
|
|
|
|
|
static void cudaDestroyStream(void *H) {
|
|
|
|
logCUWarning(cuStreamDestroy(static_cast<CUstream_st *>(H)),
|
|
|
|
"cuStreamDestroy");
|
|
|
|
}
|
|
|
|
|
2016-10-28 08:54:02 +08:00
|
|
|
/// Creates a stream on the device identified by \p DeviceIndex.
///
/// The device's context is made current first; the stream is then created
/// with CU_STREAM_DEFAULT flags and wrapped with cudaDestroyStream as its
/// destructor. Any failure is returned as an error.
Expected<Stream> CUDAPlatform::createStream(int DeviceIndex) {
  Status ContextStatus = setContext(DeviceIndex);
  if (ContextStatus.isError())
    return ContextStatus;

  CUstream NewStream;
  const unsigned int CreateFlags = CU_STREAM_DEFAULT;
  CUresult Result = cuStreamCreate(&NewStream, CreateFlags);
  if (Result)
    return getCUError(Result, "cuStreamCreate");

  return constructStream(this, DeviceIndex, NewStream, cudaDestroyStream);
}
|
|
|
|
|
|
|
|
/// Calls cuStreamSynchronize on the CUDA stream behind \p Stream and returns
/// the outcome as a Status.
Status CUDAPlatform::streamSync(void *Stream) {
  CUstream_st *StreamHandle = static_cast<CUstream_st *>(Stream);
  CUresult Result = cuStreamSynchronize(StreamHandle);
  return getCUError(Result, "cuStreamSynchronize");
}
|
|
|
|
|
|
|
|
/// Calls cuStreamWaitEvent for the given stream and event handles and returns
/// the outcome as a Status.
Status CUDAPlatform::streamWaitOnEvent(void *Stream, void *Event) {
  CUstream_st *StreamHandle = static_cast<CUstream_st *>(Stream);
  CUevent_st *EventHandle = static_cast<CUevent_st *>(Event);
  // The CUDA docs say the flags argument must be 0.
  const unsigned int WaitFlags = 0u;
  CUresult Result = cuStreamWaitEvent(StreamHandle, EventHandle, WaitFlags);
  return getCUError(Result, "cuStreamWaitEvent");
}
|
|
|
|
|
|
|
|
static void cudaDestroyEvent(void *H) {
|
|
|
|
logCUWarning(cuEventDestroy(static_cast<CUevent_st *>(H)), "cuEventDestroy");
|
|
|
|
}
|
|
|
|
|
2016-10-28 08:54:02 +08:00
|
|
|
/// Creates an event on the device identified by \p DeviceIndex.
///
/// The device's context is made current first; the event is then created with
/// CU_EVENT_DEFAULT flags and wrapped with cudaDestroyEvent as its
/// destructor. Any failure is returned as an error.
Expected<Event> CUDAPlatform::createEvent(int DeviceIndex) {
  Status ContextStatus = setContext(DeviceIndex);
  if (ContextStatus.isError())
    return ContextStatus;

  CUevent NewEvent;
  const unsigned int CreateFlags = CU_EVENT_DEFAULT;
  CUresult Result = cuEventCreate(&NewEvent, CreateFlags);
  if (Result)
    return getCUError(Result, "cuEventCreate");

  return constructEvent(this, DeviceIndex, NewEvent, cudaDestroyEvent);
}
|
|
|
|
|
|
|
|
/// Calls cuEventRecord for \p Event on \p Stream and returns the outcome as a
/// Status.
Status CUDAPlatform::enqueueEvent(void *Event, void *Stream) {
  CUevent_st *EventHandle = static_cast<CUevent_st *>(Event);
  CUstream_st *StreamHandle = static_cast<CUstream_st *>(Stream);
  CUresult Result = cuEventRecord(EventHandle, StreamHandle);
  return getCUError(Result, "cuEventRecord");
}
|
|
|
|
|
|
|
|
bool CUDAPlatform::eventIsDone(void *Event) {
|
|
|
|
return cuEventQuery(static_cast<CUevent_st *>(Event)) != CUDA_ERROR_NOT_READY;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Calls cuEventSynchronize on the CUDA event behind \p Event and returns the
/// outcome as a Status.
Status CUDAPlatform::eventSync(void *Event) {
  CUevent_st *EventHandle = static_cast<CUevent_st *>(Event);
  CUresult Result = cuEventSynchronize(EventHandle);
  return getCUError(Result, "cuEventSynchronize");
}
|
|
|
|
|
|
|
|
/// Returns the elapsed time, in seconds, between two events.
///
/// \param StartEvent handle of the event marking the start of the interval.
/// \param EndEvent handle of the event marking the end of the interval.
/// \returns the elapsed time in seconds, or the driver error if
/// cuEventElapsedTime fails.
Expected<float> CUDAPlatform::getSecondsBetweenEvents(void *StartEvent,
                                                      void *EndEvent) {
  float Milliseconds = 0.0f;
  if (CUresult Result = cuEventElapsedTime(
          &Milliseconds, static_cast<CUevent_st *>(StartEvent),
          static_cast<CUevent_st *>(EndEvent)))
    return getCUError(Result, "cuEventElapsedTime");
  // cuEventElapsedTime reports milliseconds, and one millisecond is 1e-3
  // seconds. The previous 1e-6 factor made results 1000 times too small.
  return Milliseconds * 1e-3f;
}
|
|
|
|
|
2016-10-28 08:54:02 +08:00
|
|
|
/// Allocates \p ByteCount bytes of device memory on the device identified by
/// \p DeviceIndex.
///
/// The device's context is made current first. A zero \p ByteCount returns a
/// null pointer without calling cuMemAlloc. Any failure is returned as an
/// error.
Expected<void *> CUDAPlatform::rawMallocD(ptrdiff_t ByteCount,
                                          int DeviceIndex) {
  Status ContextStatus = setContext(DeviceIndex);
  if (ContextStatus.isError())
    return ContextStatus;

  if (ByteCount == 0)
    return nullptr;

  CUdeviceptr DevicePointer;
  CUresult Result = cuMemAlloc(&DevicePointer, ByteCount);
  if (Result)
    return getCUError(Result, "cuMemAlloc");
  return reinterpret_cast<void *>(DevicePointer);
}
|
|
|
|
|
|
|
|
static void cudaDestroyDeviceMemory(void *H) {
|
|
|
|
logCUWarning(cuMemFree(reinterpret_cast<CUdeviceptr>(H)), "cuMemFree");
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Returns the function used to release device memory handles on this
/// platform (cudaDestroyDeviceMemory).
HandleDestructor CUDAPlatform::getDeviceMemoryHandleDestructor() {
  HandleDestructor Destructor = cudaDestroyDeviceMemory;
  return Destructor;
}
|
|
|
|
|
|
|
|
/// Returns the handle for a span inside a device allocation.
///
/// The result is \p BaseHandle advanced by \p ByteOffset bytes; the byte-size
/// argument is unused.
void *CUDAPlatform::getDeviceMemorySpanHandle(void *BaseHandle, size_t,
                                              size_t ByteOffset) {
  char *BaseBytes = static_cast<char *>(BaseHandle);
  return BaseBytes + ByteOffset;
}
|
|
|
|
|
|
|
|
/// Releases a device memory span handle. This is a no-op on this platform;
/// the argument is ignored.
void CUDAPlatform::rawDestroyDeviceMemorySpanHandle(void *) {
  // Intentionally empty: nothing to do for this platform.
}
|
|
|
|
|
2016-10-28 08:54:02 +08:00
|
|
|
/// Looks up the device address of \p Symbol on the device identified by
/// \p DeviceIndex.
///
/// The device's context is made current first; the address is then obtained
/// with cudaGetSymbolAddress. Any failure is returned as an error.
Expected<void *> CUDAPlatform::rawGetDeviceSymbolAddress(const void *Symbol,
                                                         int DeviceIndex) {
  Status ContextStatus = setContext(DeviceIndex);
  if (ContextStatus.isError())
    return ContextStatus;

  void *SymbolAddress;
  cudaError_t Err = cudaGetSymbolAddress(&SymbolAddress, Symbol);
  if (Err)
    return getCUDAError(Err, "cudaGetSymbolAddress");
  return SymbolAddress;
}
|
|
|
|
|
2016-10-28 08:54:02 +08:00
|
|
|
/// Looks up the size of \p Symbol via cudaGetSymbolSize.
///
/// \param Symbol the symbol to query.
/// \param DeviceIndex index handed to setContext before the query is made.
/// \returns the size on success; otherwise the error reported by setContext
/// or by cudaGetSymbolSize.
Expected<ptrdiff_t> CUDAPlatform::rawGetDeviceSymbolSize(const void *Symbol,
                                                         int DeviceIndex) {
  // Bail out early if the context for DeviceIndex cannot be set.
  Status ContextStatus = setContext(DeviceIndex);
  if (ContextStatus.isError())
    return ContextStatus;

  size_t Size;
  const cudaError_t Error = cudaGetSymbolSize(&Size, Symbol);
  if (Error)
    return getCUDAError(Error, "cudaGetSymbolSize");
  return Size;
}
|
|
|
|
|
|
|
|
/// Returns \p Ptr advanced by \p ByteOffset bytes (read-only overload).
///
/// void pointers do not support arithmetic, so the offset is applied through
/// a const char pointer.
static const void *offsetVoidPtr(const void *Ptr, ptrdiff_t ByteOffset) {
  const char *const Bytes = static_cast<const char *>(Ptr);
  return Bytes + ByteOffset;
}
|
|
|
|
|
|
|
|
/// Returns \p Ptr advanced by \p ByteOffset bytes (mutable overload).
///
/// void pointers do not support arithmetic, so the offset is applied through
/// a char pointer.
static void *offsetVoidPtr(void *Ptr, ptrdiff_t ByteOffset) {
  char *const Bytes = static_cast<char *>(Ptr);
  return Bytes + ByteOffset;
}
|
|
|
|
|
|
|
|
/// Registers an existing host memory range with cuMemHostRegister.
///
/// \param Memory start of the host range; constness is cast away only because
/// the driver entry point takes a non-const pointer.
/// \param ByteCount number of bytes to register.
/// \returns the status built by getCUError from the driver result.
Status CUDAPlatform::rawRegisterHostMem(const void *Memory,
                                        ptrdiff_t ByteCount) {
  constexpr unsigned int NoFlags = 0;
  const CUresult Result =
      cuMemHostRegister(const_cast<void *>(Memory), ByteCount, NoFlags);
  // The label names the failing API in the error text; it previously read
  // "cuMemHostRegiser".
  return getCUError(Result, "cuMemHostRegister");
}
|
|
|
|
|
|
|
|
static void cudaUnregisterHostMemoryHandleDestructor(void *H) {
|
|
|
|
logCUWarning(cuMemHostUnregister(H), "cuMemHostUnregister");
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Returns the destructor used for handles to registered host memory, namely
/// cudaUnregisterHostMemoryHandleDestructor.
HandleDestructor CUDAPlatform::getUnregisterHostMemoryHandleDestructor() {
  HandleDestructor Destructor = cudaUnregisterHostMemoryHandleDestructor;
  return Destructor;
}
|
|
|
|
|
|
|
|
/// Allocates \p ByteCount bytes of host memory with cuMemHostAlloc.
///
/// \returns the allocated pointer on success; otherwise the error produced by
/// getCUError for the failing driver call.
Expected<void *> CUDAPlatform::rawMallocRegisteredH(ptrdiff_t ByteCount) {
  constexpr unsigned int NoFlags = 0;
  void *Memory = nullptr;
  const CUresult Result = cuMemHostAlloc(&Memory, ByteCount, NoFlags);
  if (Result)
    return getCUError(Result, "cuMemHostAlloc");
  return Memory;
}
|
|
|
|
|
|
|
|
static void cudaFreeHostMemoryHandleDestructor(void *H) {
|
|
|
|
logCUWarning(cuMemFreeHost(H), "cuMemFreeHost");
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Returns the destructor used for handles to allocated host memory, namely
/// cudaFreeHostMemoryHandleDestructor.
HandleDestructor CUDAPlatform::getFreeHostMemoryHandleDestructor() {
  HandleDestructor Destructor = cudaFreeHostMemoryHandleDestructor;
  return Destructor;
}
|
|
|
|
|
|
|
|
/// Enqueues a device-to-device copy on \p Stream via cuMemcpyDtoDAsync.
///
/// \param DeviceSrc base handle of the source.
/// \param DeviceSrcByteOffset byte offset applied to \p DeviceSrc.
/// \param DeviceDst base handle of the destination.
/// \param DeviceDstByteOffset byte offset applied to \p DeviceDst.
/// \param ByteCount number of bytes to copy.
/// \param Stream stream handle, cast to CUstream_st * for the driver call.
/// \returns the status built by getCUError from the driver result.
Status CUDAPlatform::asyncCopyDToD(const void *DeviceSrc,
                                   ptrdiff_t DeviceSrcByteOffset,
                                   void *DeviceDst,
                                   ptrdiff_t DeviceDstByteOffset,
                                   ptrdiff_t ByteCount, void *Stream) {
  // Resolve both endpoints first so the driver call itself stays readable.
  const CUdeviceptr Destination = reinterpret_cast<CUdeviceptr>(
      offsetVoidPtr(DeviceDst, DeviceDstByteOffset));
  const CUdeviceptr Source = reinterpret_cast<CUdeviceptr>(
      offsetVoidPtr(DeviceSrc, DeviceSrcByteOffset));
  const CUresult Result = cuMemcpyDtoDAsync(
      Destination, Source, ByteCount, static_cast<CUstream_st *>(Stream));
  return getCUError(Result, "cuMemcpyDtoDAsync");
}
|
|
|
|
|
|
|
|
/// Enqueues a device-to-host copy on \p Stream via cuMemcpyDtoHAsync.
///
/// \param DeviceSrc base handle of the source.
/// \param DeviceSrcByteOffset byte offset applied to \p DeviceSrc.
/// \param HostDst destination pointer, passed to the driver unchanged.
/// \param ByteCount number of bytes to copy.
/// \param Stream stream handle, cast to CUstream_st * for the driver call.
/// \returns the status built by getCUError from the driver result.
Status CUDAPlatform::asyncCopyDToH(const void *DeviceSrc,
                                   ptrdiff_t DeviceSrcByteOffset, void *HostDst,
                                   ptrdiff_t ByteCount, void *Stream) {
  const CUdeviceptr Source = reinterpret_cast<CUdeviceptr>(
      offsetVoidPtr(DeviceSrc, DeviceSrcByteOffset));
  const CUresult Result = cuMemcpyDtoHAsync(
      HostDst, Source, ByteCount, static_cast<CUstream_st *>(Stream));
  return getCUError(Result, "cuMemcpyDtoHAsync");
}
|
|
|
|
|
|
|
|
/// Enqueues a host-to-device copy on \p Stream via cuMemcpyHtoDAsync.
///
/// \param HostSrc source pointer, passed to the driver unchanged.
/// \param DeviceDst base handle of the destination.
/// \param DeviceDstByteOffset byte offset applied to \p DeviceDst.
/// \param ByteCount number of bytes to copy.
/// \param Stream stream handle, cast to CUstream_st * for the driver call.
/// \returns the status built by getCUError from the driver result.
Status CUDAPlatform::asyncCopyHToD(const void *HostSrc, void *DeviceDst,
                                   ptrdiff_t DeviceDstByteOffset,
                                   ptrdiff_t ByteCount, void *Stream) {
  const CUdeviceptr Destination = reinterpret_cast<CUdeviceptr>(
      offsetVoidPtr(DeviceDst, DeviceDstByteOffset));
  const CUresult Result = cuMemcpyHtoDAsync(
      Destination, HostSrc, ByteCount, static_cast<CUstream_st *>(Stream));
  return getCUError(Result, "cuMemcpyHtoDAsync");
}
|
|
|
|
|
|
|
|
/// Enqueues a byte-wise fill on \p Stream via cuMemsetD8Async.
///
/// \param DeviceDst base handle of the destination.
/// \param ByteOffset byte offset applied to \p DeviceDst.
/// \param ByteCount number of bytes to set.
/// \param ByteValue value written to each byte.
/// \param Stream stream handle, cast to CUstream_st * for the driver call.
/// \returns the status built by getCUError from the driver result.
Status CUDAPlatform::asyncMemsetD(void *DeviceDst, ptrdiff_t ByteOffset,
                                  ptrdiff_t ByteCount, char ByteValue,
                                  void *Stream) {
  const CUdeviceptr Destination =
      reinterpret_cast<CUdeviceptr>(offsetVoidPtr(DeviceDst, ByteOffset));
  const CUresult Result = cuMemsetD8Async(
      Destination, ByteValue, ByteCount, static_cast<CUstream_st *>(Stream));
  return getCUError(Result, "cuMemsetD8Async");
}
|
|
|
|
|
|
|
|
struct StreamCallbackUserData {
|
|
|
|
StreamCallbackUserData(Stream &Stream, StreamCallback Function)
|
|
|
|
: TheStream(Stream), TheFunction(std::move(Function)) {}
|
|
|
|
|
|
|
|
Stream &TheStream;
|
|
|
|
StreamCallback TheFunction;
|
|
|
|
};
|
|
|
|
|
|
|
|
/// Trampoline with the driver's callback signature that forwards to the user
/// callback stored in a StreamCallbackUserData.
///
/// \param HStream stream reported by the driver; only checked by the assert.
/// \param Status driver status, converted with getCUError before forwarding.
/// \param UserData pointer to a heap-allocated StreamCallbackUserData; this
/// function takes ownership and deletes it on return.
static void CUDA_CB cuStreamCallbackShim(CUstream HStream, CUresult Status,
                                         void *UserData) {
  // Adopt the payload so it is freed when the shim returns.
  auto *const Payload = static_cast<StreamCallbackUserData *>(UserData);
  const std::unique_ptr<StreamCallbackUserData> Owner(Payload);

  Stream &CallbackStream = Owner->TheStream;
  assert(static_cast<CUstream_st *>(CallbackStream) == HStream);
  Owner->TheFunction(CallbackStream,
                     getCUError(Status, "stream callback error state"));
}
|
|
|
|
|
|
|
|
/// Registers \p Callback to run on \p Stream via cuStreamAddCallback.
///
/// The callback and a reference to the stream are packed into a
/// heap-allocated StreamCallbackUserData that cuStreamCallbackShim deletes
/// once it has run.
///
/// \param Stream stream to attach the callback to.
/// \param Callback function to invoke; moved into the payload.
/// \returns the status built by getCUError from the driver result.
Status CUDAPlatform::addStreamCallback(Stream &Stream,
                                       StreamCallback Callback) {
  // CUDA docs say flags must always be 0 here.
  unsigned int Flags = 0u;
  std::unique_ptr<StreamCallbackUserData> UserData(
      new StreamCallbackUserData(Stream, std::move(Callback)));

  // Hand the driver a borrowed pointer and keep ownership until registration
  // is known to have succeeded. Releasing up front would leak the payload on
  // failure, because the shim that deletes it would never be called.
  const CUresult Result = cuStreamAddCallback(Stream, cuStreamCallbackShim,
                                              UserData.get(), Flags);
  if (!Result) {
    // Success: the shim now owns the payload. release() only forgets the
    // pointer and never dereferences it, so this is safe even if the
    // callback has already run and freed the payload.
    static_cast<void>(UserData.release());
  }
  return getCUError(Result, "cuStreamAddCallback");
}
|
|
|
|
|
|
|
|
static void cudaDestroyProgram(void *H) {
|
|
|
|
logCUWarning(cuModuleUnload(static_cast<CUmod_st *>(H)), "cuModuleUnload");
|
|
|
|
}
|
|
|
|
|
2016-10-28 08:54:02 +08:00
|
|
|
/// Loads \p Source as a module with cuModuleLoadDataEx and wraps it in a
/// Program.
///
/// \param Source module image handed to the driver.
/// \param DeviceIndex index handed to setContext before loading.
/// \returns the Program on success. On failure, returns either the
/// setContext error or a Status whose message contains the driver error text
/// followed by the JIT info and error logs.
Expected<Program> CUDAPlatform::createProgramFromSource(Span<const char> Source,
                                                        int DeviceIndex) {
  Status S = setContext(DeviceIndex);
  if (S.isError())
    return S;

  CUmodule Module;
  constexpr size_t LogBufferSizeBytes = 1024;
  // Zero-initialize the logs so that a log the driver never writes to prints
  // as empty text instead of whatever happened to be on the stack.
  char InfoLogBuffer[LogBufferSizeBytes] = {};
  char ErrorLogBuffer[LogBufferSizeBytes] = {};

  constexpr size_t OptionsCount = 4;
  std::array<CUjit_option, OptionsCount> OptionNames = {
      {CU_JIT_INFO_LOG_BUFFER, CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
       CU_JIT_ERROR_LOG_BUFFER, CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES}};
  // The *_SIZE_BYTES options take the size by value, stored directly in the
  // void * slot, not a pointer to the size. Passing an address here would
  // make the driver treat that address as the buffer capacity and allow it
  // to write past the end of the log buffers.
  void *const LogBufferSizeValue =
      reinterpret_cast<void *>(LogBufferSizeBytes);
  std::array<void *, OptionsCount> OptionValues = {
      {InfoLogBuffer, LogBufferSizeValue, ErrorLogBuffer, LogBufferSizeValue}};

  // NOTE(review): Source.data() is passed without a length, so the image is
  // presumably expected to be self-delimiting (e.g. NUL-terminated PTX) --
  // confirm against callers.
  if (CUresult Result =
          cuModuleLoadDataEx(&Module, Source.data(), OptionsCount,
                             OptionNames.data(), OptionValues.data())) {
    // Guarantee termination before streaming the logs as C strings.
    InfoLogBuffer[LogBufferSizeBytes - 1] = '\0';
    ErrorLogBuffer[LogBufferSizeBytes - 1] = '\0';
    std::ostringstream OutStream;
    OutStream << "Error creating program from source: "
              << getCUErrorMessage(Result)
              << "\nINFO MESSAGES\n================\n"
              << InfoLogBuffer << "\nERROR MESSAGES\n==================\n"
              << ErrorLogBuffer;
    return Status(OutStream.str());
  }
  return constructProgram(this, Module, cudaDestroyProgram);
}
|
|
|
|
|
|
|
|
/// Looks up the function called \p Name in \p Program with
/// cuModuleGetFunction.
///
/// \param Program module handle, cast to CUmodule.
/// \param Name name of the function to look up.
/// \returns the function handle on success; otherwise the error produced by
/// getCUError for the failing driver call.
Expected<void *> CUDAPlatform::rawCreateKernel(void *Program,
                                               const std::string &Name) {
  const CUmodule Module = static_cast<CUmodule>(Program);
  CUfunction Kernel;
  const CUresult Result = cuModuleGetFunction(&Kernel, Module, Name.c_str());
  if (Result)
    return getCUError(Result, "cuModuleGetFunction");
  return Kernel;
}
|
|
|
|
|
|
|
|
/// Handle destructor for kernels; deliberately a no-op that ignores its
/// argument.
static void cudaDestroyKernel(void * /*Kernel*/) {
  // Intentionally empty: do nothing.
}
|
|
|
|
|
|
|
|
/// Returns the destructor used for kernel handles, namely cudaDestroyKernel.
HandleDestructor CUDAPlatform::getKernelHandleDestructor() {
  HandleDestructor Destructor = cudaDestroyKernel;
  return Destructor;
}
|
|
|
|
|
|
|
|
/// Enqueues a kernel launch on \p Stream via cuLaunchKernel.
///
/// \param Stream stream handle, cast to CUstream.
/// \param Kernel function handle, cast to CUfunction.
/// \param LaunchDimensions grid and block extents forwarded to the driver.
/// \param Arguments array of argument pointers forwarded to the driver; the
/// unnamed Span<size_t> parameter is not used here.
/// \param SharedMemoryBytes shared-memory size forwarded to the driver.
/// \returns the status built by getCUError from the driver result.
Status CUDAPlatform::rawEnqueueKernelLaunch(
    void *Stream, void *Kernel, KernelLaunchDimensions LaunchDimensions,
    Span<void *> Arguments, Span<size_t>, size_t SharedMemoryBytes) {
  const CUfunction Function = static_cast<CUfunction>(Kernel);
  const CUstream LaunchStream = static_cast<CUstream>(Stream);
  // The final nullptr is the driver's "extra" parameter, which is unused.
  const CUresult Result = cuLaunchKernel(
      Function, LaunchDimensions.GridX, LaunchDimensions.GridY,
      LaunchDimensions.GridZ, LaunchDimensions.BlockX, LaunchDimensions.BlockY,
      LaunchDimensions.BlockZ, SharedMemoryBytes, LaunchStream,
      Arguments.data(), nullptr);
  return getCUError(Result, "cuLaunchKernel");
}
|
|
|
|
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
namespace cuda {
|
|
|
|
|
|
|
|
/// Gets the CUDAPlatform instance and returns it as an unowned pointer to a
|
|
|
|
/// Platform.
|
|
|
|
Expected<Platform *> getPlatform() {
|
|
|
|
static auto MaybePlatform = []() -> Expected<CUDAPlatform *> {
|
|
|
|
Expected<CUDAPlatform> CreationResult = CUDAPlatform::create();
|
|
|
|
if (CreationResult.isError())
|
|
|
|
return CreationResult.getError();
|
|
|
|
else
|
|
|
|
return new CUDAPlatform(CreationResult.takeValue());
|
|
|
|
}();
|
|
|
|
return MaybePlatform;
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace cuda
|
|
|
|
|
|
|
|
} // namespace acxxel
|