2011-10-07 02:29:37 +08:00
|
|
|
//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
|
|
|
|
//
|
2019-01-19 16:50:56 +08:00
|
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
2011-10-07 02:29:37 +08:00
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
|
|
|
|
// runtime library.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
#include "CGCUDARuntime.h"
|
2011-10-07 02:51:56 +08:00
|
|
|
#include "CodeGenFunction.h"
|
|
|
|
#include "CodeGenModule.h"
|
|
|
|
#include "clang/AST/Decl.h"
|
2019-02-01 05:34:03 +08:00
|
|
|
#include "clang/Basic/Cuda.h"
|
|
|
|
#include "clang/CodeGen/CodeGenABITypes.h"
|
2018-04-20 21:04:45 +08:00
|
|
|
#include "clang/CodeGen/ConstantInitBuilder.h"
|
2013-01-02 19:45:17 +08:00
|
|
|
#include "llvm/IR/BasicBlock.h"
|
|
|
|
#include "llvm/IR/Constants.h"
|
|
|
|
#include "llvm/IR/DerivedTypes.h"
|
2018-04-20 21:04:45 +08:00
|
|
|
#include "llvm/Support/Format.h"
|
2011-10-07 02:29:37 +08:00
|
|
|
|
|
|
|
using namespace clang;
|
|
|
|
using namespace CodeGen;
|
|
|
|
|
|
|
|
namespace {
|
2018-05-18 23:07:56 +08:00
|
|
|
constexpr unsigned CudaFatMagic = 0x466243b1;
|
|
|
|
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
|
2011-10-07 02:29:37 +08:00
|
|
|
|
|
|
|
class CGNVCUDARuntime : public CGCUDARuntime {
|
2011-10-07 02:51:56 +08:00
|
|
|
|
|
|
|
private:
|
2016-11-19 16:17:24 +08:00
|
|
|
llvm::IntegerType *IntTy, *SizeTy;
|
|
|
|
llvm::Type *VoidTy;
|
2015-05-08 03:34:16 +08:00
|
|
|
llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;
|
|
|
|
|
|
|
|
/// Convenience reference to LLVM Context
|
|
|
|
llvm::LLVMContext &Context;
|
|
|
|
/// Convenience reference to the current module
|
|
|
|
llvm::Module &TheModule;
|
|
|
|
/// Keeps track of kernel launch stubs emitted in this module
|
2019-02-14 10:00:09 +08:00
|
|
|
struct KernelInfo {
|
|
|
|
llvm::Function *Kernel;
|
|
|
|
const Decl *D;
|
|
|
|
};
|
|
|
|
llvm::SmallVector<KernelInfo, 16> EmittedKernels;
|
|
|
|
struct VarInfo {
|
|
|
|
llvm::GlobalVariable *Var;
|
|
|
|
const VarDecl *D;
|
|
|
|
unsigned Flag;
|
|
|
|
};
|
|
|
|
llvm::SmallVector<VarInfo, 16> DeviceVars;
|
2018-03-01 01:53:46 +08:00
|
|
|
/// Keeps track of variable containing handle of GPU binary. Populated by
|
2015-05-08 03:34:16 +08:00
|
|
|
/// ModuleCtorFunction() and used to create corresponding cleanup calls in
|
|
|
|
/// ModuleDtorFunction()
|
2018-03-01 01:53:46 +08:00
|
|
|
llvm::GlobalVariable *GpuBinaryHandle = nullptr;
|
2018-04-20 21:04:45 +08:00
|
|
|
/// Whether we generate relocatable device code.
|
|
|
|
bool RelocatableDeviceCode;
|
2019-02-14 10:00:09 +08:00
|
|
|
/// Mangle context for device.
|
|
|
|
std::unique_ptr<MangleContext> DeviceMC;
|
2011-10-07 02:51:56 +08:00
|
|
|
|
2019-02-06 00:42:33 +08:00
|
|
|
llvm::FunctionCallee getSetupArgumentFn() const;
|
|
|
|
llvm::FunctionCallee getLaunchFn() const;
|
2011-10-07 02:51:56 +08:00
|
|
|
|
2018-04-20 21:04:45 +08:00
|
|
|
llvm::FunctionType *getRegisterGlobalsFnTy() const;
|
|
|
|
llvm::FunctionType *getCallbackFnTy() const;
|
|
|
|
llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
|
2018-04-25 09:10:37 +08:00
|
|
|
std::string addPrefixToName(StringRef FuncName) const;
|
|
|
|
std::string addUnderscoredPrefixToName(StringRef FuncName) const;
|
2018-04-20 21:04:45 +08:00
|
|
|
|
2015-05-08 03:34:16 +08:00
|
|
|
/// Creates a function to register all kernel stubs generated in this module.
|
2016-03-03 02:28:50 +08:00
|
|
|
llvm::Function *makeRegisterGlobalsFn();
|
2015-05-08 03:34:16 +08:00
|
|
|
|
|
|
|
/// Helper function that generates a constant string and returns a pointer to
|
|
|
|
/// the start of the string. The result of this function can be used anywhere
|
|
|
|
/// where the C code specifies const char*.
|
|
|
|
llvm::Constant *makeConstantString(const std::string &Str,
|
|
|
|
const std::string &Name = "",
|
2016-08-13 02:44:01 +08:00
|
|
|
const std::string &SectionName = "",
|
2015-05-08 03:34:16 +08:00
|
|
|
unsigned Alignment = 0) {
|
|
|
|
llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
|
|
|
|
llvm::ConstantInt::get(SizeTy, 0)};
|
Compute and preserve alignment more faithfully in IR-generation.
Introduce an Address type to bundle a pointer value with an
alignment. Introduce APIs on CGBuilderTy to work with Address
values. Change core APIs on CGF/CGM to traffic in Address where
appropriate. Require alignments to be non-zero. Update a ton
of code to compute and propagate alignment information.
As part of this, I've promoted CGBuiltin's EmitPointerWithAlignment
helper function to CGF and made use of it in a number of places in
the expression emitter.
The end result is that we should now be significantly more correct
when performing operations on objects that are locally known to
be under-aligned. Since alignment is not reliably tracked in the
type system, there are inherent limits to this, but at least we
are no longer confused by standard operations like derived-to-base
conversions and array-to-pointer decay. I've also fixed a large
number of bugs where we were applying the complete-object alignment
to a pointer instead of the non-virtual alignment, although most of
these were hidden by the very conservative approach we took with
member alignment.
Also, because IRGen now reliably asserts on zero alignments, we
should no longer be subject to an absurd but frustrating recurring
bug where an incomplete type would report a zero alignment and then
we'd naively do a alignmentAtOffset on it and emit code using an
alignment equal to the largest power-of-two factor of the offset.
We should also now be emitting much more aggressive alignment
attributes in the presence of over-alignment. In particular,
field access now uses alignmentAtOffset instead of min.
Several times in this patch, I had to change the existing
code-generation pattern in order to more effectively use
the Address APIs. For the most part, this seems to be a strict
improvement, like doing pointer arithmetic with GEPs instead of
ptrtoint. That said, I've tried very hard to not change semantics,
but it is likely that I've failed in a few places, for which I
apologize.
ABIArgInfo now always carries the assumed alignment of indirect and
indirect byval arguments. In order to cut down on what was already
a dauntingly large patch, I changed the code to never set align
attributes in the IR on non-byval indirect arguments. That is,
we still generate code which assumes that indirect arguments have
the given alignment, but we don't express this information to the
backend except where it's semantically required (i.e. on byvals).
This is likely a minor regression for those targets that did provide
this information, but it'll be trivial to add it back in a later
patch.
I partially punted on applying this work to CGBuiltin. Please
do not add more uses of the CreateDefaultAligned{Load,Store}
APIs; they will be going away eventually.
llvm-svn: 246985
2015-09-08 16:05:57 +08:00
|
|
|
auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
|
2016-08-13 02:44:01 +08:00
|
|
|
llvm::GlobalVariable *GV =
|
|
|
|
cast<llvm::GlobalVariable>(ConstStr.getPointer());
|
2018-06-08 19:17:08 +08:00
|
|
|
if (!SectionName.empty()) {
|
2016-08-13 02:44:01 +08:00
|
|
|
GV->setSection(SectionName);
|
2018-06-08 19:17:08 +08:00
|
|
|
// Mark the address as used which make sure that this section isn't
|
|
|
|
// merged and we will really have it in the object file.
|
|
|
|
GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
|
|
|
|
}
|
2016-08-13 02:44:01 +08:00
|
|
|
if (Alignment)
|
|
|
|
GV->setAlignment(Alignment);
|
|
|
|
|
Compute and preserve alignment more faithfully in IR-generation.
Introduce an Address type to bundle a pointer value with an
alignment. Introduce APIs on CGBuilderTy to work with Address
values. Change core APIs on CGF/CGM to traffic in Address where
appropriate. Require alignments to be non-zero. Update a ton
of code to compute and propagate alignment information.
As part of this, I've promoted CGBuiltin's EmitPointerWithAlignment
helper function to CGF and made use of it in a number of places in
the expression emitter.
The end result is that we should now be significantly more correct
when performing operations on objects that are locally known to
be under-aligned. Since alignment is not reliably tracked in the
type system, there are inherent limits to this, but at least we
are no longer confused by standard operations like derived-to-base
conversions and array-to-pointer decay. I've also fixed a large
number of bugs where we were applying the complete-object alignment
to a pointer instead of the non-virtual alignment, although most of
these were hidden by the very conservative approach we took with
member alignment.
Also, because IRGen now reliably asserts on zero alignments, we
should no longer be subject to an absurd but frustrating recurring
bug where an incomplete type would report a zero alignment and then
we'd naively do a alignmentAtOffset on it and emit code using an
alignment equal to the largest power-of-two factor of the offset.
We should also now be emitting much more aggressive alignment
attributes in the presence of over-alignment. In particular,
field access now uses alignmentAtOffset instead of min.
Several times in this patch, I had to change the existing
code-generation pattern in order to more effectively use
the Address APIs. For the most part, this seems to be a strict
improvement, like doing pointer arithmetic with GEPs instead of
ptrtoint. That said, I've tried very hard to not change semantics,
but it is likely that I've failed in a few places, for which I
apologize.
ABIArgInfo now always carries the assumed alignment of indirect and
indirect byval arguments. In order to cut down on what was already
a dauntingly large patch, I changed the code to never set align
attributes in the IR on non-byval indirect arguments. That is,
we still generate code which assumes that indirect arguments have
the given alignment, but we don't express this information to the
backend except where it's semantically required (i.e. on byvals).
This is likely a minor regression for those targets that did provide
this information, but it'll be trivial to add it back in a later
patch.
I partially punted on applying this work to CGBuiltin. Please
do not add more uses of the CreateDefaultAligned{Load,Store}
APIs; they will be going away eventually.
llvm-svn: 246985
2015-09-08 16:05:57 +08:00
|
|
|
return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
|
|
|
|
ConstStr.getPointer(), Zeros);
|
2018-04-20 21:04:45 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Helper function that generates an empty dummy function returning void.
|
|
|
|
llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
|
|
|
|
assert(FnTy->getReturnType()->isVoidTy() &&
|
|
|
|
"Can only generate dummy functions returning void!");
|
|
|
|
llvm::Function *DummyFunc = llvm::Function::Create(
|
|
|
|
FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);
|
|
|
|
|
|
|
|
llvm::BasicBlock *DummyBlock =
|
|
|
|
llvm::BasicBlock::Create(Context, "", DummyFunc);
|
|
|
|
CGBuilderTy FuncBuilder(CGM, Context);
|
|
|
|
FuncBuilder.SetInsertPoint(DummyBlock);
|
|
|
|
FuncBuilder.CreateRetVoid();
|
|
|
|
|
|
|
|
return DummyFunc;
|
|
|
|
}
|
2015-05-08 03:34:16 +08:00
|
|
|
|
2019-02-01 05:34:03 +08:00
|
|
|
void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
|
|
|
|
void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
|
2019-02-14 10:00:09 +08:00
|
|
|
std::string getDeviceSideName(const Decl *ND);
|
2015-05-08 03:34:16 +08:00
|
|
|
|
2011-10-07 02:29:37 +08:00
|
|
|
public:
|
|
|
|
CGNVCUDARuntime(CodeGenModule &CGM);
|
2011-10-07 02:51:56 +08:00
|
|
|
|
2015-05-08 03:34:16 +08:00
|
|
|
void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
|
2019-02-14 10:00:09 +08:00
|
|
|
void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
|
|
|
|
unsigned Flags) override {
|
|
|
|
DeviceVars.push_back({&Var, VD, Flags});
|
2016-03-03 02:28:50 +08:00
|
|
|
}
|
|
|
|
|
2015-05-08 03:34:16 +08:00
|
|
|
/// Creates module constructor function
|
|
|
|
llvm::Function *makeModuleCtorFunction() override;
|
|
|
|
/// Creates module destructor function
|
|
|
|
llvm::Function *makeModuleDtorFunction() override;
|
2011-10-07 02:29:37 +08:00
|
|
|
};
|
|
|
|
|
2015-06-23 07:07:51 +08:00
|
|
|
}
|
2011-10-07 02:29:37 +08:00
|
|
|
|
2018-04-25 09:10:37 +08:00
|
|
|
std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
|
|
|
|
if (CGM.getLangOpts().HIP)
|
|
|
|
return ((Twine("hip") + Twine(FuncName)).str());
|
|
|
|
return ((Twine("cuda") + Twine(FuncName)).str());
|
|
|
|
}
|
|
|
|
std::string
|
|
|
|
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
|
|
|
|
if (CGM.getLangOpts().HIP)
|
|
|
|
return ((Twine("__hip") + Twine(FuncName)).str());
|
|
|
|
return ((Twine("__cuda") + Twine(FuncName)).str());
|
|
|
|
}
|
|
|
|
|
2015-05-08 03:34:16 +08:00
|
|
|
CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
|
|
|
|
: CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
|
2018-04-20 21:04:45 +08:00
|
|
|
TheModule(CGM.getModule()),
|
2019-02-14 10:00:09 +08:00
|
|
|
RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
|
|
|
|
DeviceMC(CGM.getContext().createMangleContext(
|
|
|
|
CGM.getContext().getAuxTargetInfo())) {
|
2011-10-07 02:51:56 +08:00
|
|
|
CodeGen::CodeGenTypes &Types = CGM.getTypes();
|
|
|
|
ASTContext &Ctx = CGM.getContext();
|
|
|
|
|
2016-11-19 16:17:24 +08:00
|
|
|
IntTy = CGM.IntTy;
|
|
|
|
SizeTy = CGM.SizeTy;
|
|
|
|
VoidTy = CGM.VoidTy;
|
2011-10-07 02:51:56 +08:00
|
|
|
|
|
|
|
CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
|
|
|
|
VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
|
2015-05-08 03:34:16 +08:00
|
|
|
VoidPtrPtrTy = VoidPtrTy->getPointerTo();
|
2011-10-07 02:51:56 +08:00
|
|
|
}
|
|
|
|
|
2019-02-06 00:42:33 +08:00
|
|
|
llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
|
2011-10-07 02:51:56 +08:00
|
|
|
// cudaError_t cudaSetupArgument(void *, size_t, size_t)
|
2016-07-02 19:41:41 +08:00
|
|
|
llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
|
2018-04-25 09:10:37 +08:00
|
|
|
return CGM.CreateRuntimeFunction(
|
|
|
|
llvm::FunctionType::get(IntTy, Params, false),
|
|
|
|
addPrefixToName("SetupArgument"));
|
2011-10-07 02:51:56 +08:00
|
|
|
}
|
|
|
|
|
2019-02-06 00:42:33 +08:00
|
|
|
llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
|
2018-04-25 09:10:37 +08:00
|
|
|
if (CGM.getLangOpts().HIP) {
|
|
|
|
// hipError_t hipLaunchByPtr(char *);
|
|
|
|
return CGM.CreateRuntimeFunction(
|
|
|
|
llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr");
|
|
|
|
} else {
|
|
|
|
// cudaError_t cudaLaunch(char *);
|
|
|
|
return CGM.CreateRuntimeFunction(
|
|
|
|
llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
|
|
|
|
}
|
2015-05-08 03:34:16 +08:00
|
|
|
}
|
|
|
|
|
2018-04-20 21:04:45 +08:00
|
|
|
llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
|
|
|
|
return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
|
|
|
|
}
|
|
|
|
|
|
|
|
llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
|
|
|
|
return llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
|
|
|
|
}
|
|
|
|
|
|
|
|
llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
|
|
|
|
auto CallbackFnTy = getCallbackFnTy();
|
|
|
|
auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
|
|
|
|
llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy,
|
|
|
|
VoidPtrTy, CallbackFnTy->getPointerTo()};
|
|
|
|
return llvm::FunctionType::get(VoidTy, Params, false);
|
|
|
|
}
|
|
|
|
|
2019-02-14 10:00:09 +08:00
|
|
|
std::string CGNVCUDARuntime::getDeviceSideName(const Decl *D) {
|
|
|
|
auto *ND = cast<const NamedDecl>(D);
|
|
|
|
std::string DeviceSideName;
|
|
|
|
if (DeviceMC->shouldMangleDeclName(ND)) {
|
|
|
|
SmallString<256> Buffer;
|
|
|
|
llvm::raw_svector_ostream Out(Buffer);
|
|
|
|
DeviceMC->mangleName(ND, Out);
|
|
|
|
DeviceSideName = Out.str();
|
|
|
|
} else
|
|
|
|
DeviceSideName = ND->getIdentifier()->getName();
|
|
|
|
return DeviceSideName;
|
|
|
|
}
|
|
|
|
|
2015-05-08 03:34:16 +08:00
|
|
|
void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
|
|
|
|
FunctionArgList &Args) {
|
2019-02-14 10:00:09 +08:00
|
|
|
assert(getDeviceSideName(CGF.CurFuncDecl) == CGF.CurFn->getName() ||
|
2019-02-27 10:02:52 +08:00
|
|
|
getDeviceSideName(CGF.CurFuncDecl) + ".stub" == CGF.CurFn->getName() ||
|
2019-02-14 10:00:09 +08:00
|
|
|
CGF.CGM.getContext().getTargetInfo().getCXXABI() !=
|
|
|
|
CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI());
|
|
|
|
|
|
|
|
EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
|
2019-02-01 05:34:03 +08:00
|
|
|
if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
|
|
|
|
CudaFeature::CUDA_USES_NEW_LAUNCH))
|
|
|
|
emitDeviceStubBodyNew(CGF, Args);
|
|
|
|
else
|
|
|
|
emitDeviceStubBodyLegacy(CGF, Args);
|
2011-10-07 02:51:56 +08:00
|
|
|
}
|
|
|
|
|
2019-02-01 05:34:03 +08:00
|
|
|
// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
|
|
|
|
// array and kernels are launched using cudaLaunchKernel().
|
|
|
|
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
|
|
|
|
FunctionArgList &Args) {
|
|
|
|
// Build the shadow stack entry at the very start of the function.
|
|
|
|
|
|
|
|
// Calculate amount of space we will need for all arguments. If we have no
|
|
|
|
// args, allocate a single pointer so we still have a valid pointer to the
|
|
|
|
// argument array that we can pass to runtime, even if it will be unused.
|
|
|
|
Address KernelArgs = CGF.CreateTempAlloca(
|
|
|
|
VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
|
|
|
|
llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
|
|
|
|
// Store pointers to the arguments in a locally allocated launch_args.
|
|
|
|
for (unsigned i = 0; i < Args.size(); ++i) {
|
|
|
|
llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
|
|
|
|
llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
|
|
|
|
CGF.Builder.CreateDefaultAlignedStore(
|
|
|
|
VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i));
|
|
|
|
}
|
|
|
|
|
|
|
|
llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
|
|
|
|
|
|
|
|
// Lookup cudaLaunchKernel function.
|
|
|
|
// cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
|
|
|
|
// void **args, size_t sharedMem,
|
|
|
|
// cudaStream_t stream);
|
|
|
|
TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
|
|
|
|
DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
|
|
|
|
IdentifierInfo &cudaLaunchKernelII =
|
|
|
|
CGM.getContext().Idents.get("cudaLaunchKernel");
|
|
|
|
FunctionDecl *cudaLaunchKernelFD = nullptr;
|
|
|
|
for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) {
|
|
|
|
if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
|
|
|
|
cudaLaunchKernelFD = FD;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (cudaLaunchKernelFD == nullptr) {
|
|
|
|
CGM.Error(CGF.CurFuncDecl->getLocation(),
|
|
|
|
"Can't find declaration for cudaLaunchKernel()");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
// Create temporary dim3 grid_dim, block_dim.
|
|
|
|
ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
|
|
|
|
QualType Dim3Ty = GridDimParam->getType();
|
|
|
|
Address GridDim =
|
|
|
|
CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
|
|
|
|
Address BlockDim =
|
|
|
|
CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
|
|
|
|
Address ShmemSize =
|
|
|
|
CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
|
|
|
|
Address Stream =
|
|
|
|
CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
|
2019-02-06 00:42:33 +08:00
|
|
|
llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
|
2019-02-01 05:34:03 +08:00
|
|
|
llvm::FunctionType::get(IntTy,
|
|
|
|
{/*gridDim=*/GridDim.getType(),
|
|
|
|
/*blockDim=*/BlockDim.getType(),
|
|
|
|
/*ShmemSize=*/ShmemSize.getType(),
|
|
|
|
/*Stream=*/Stream.getType()},
|
|
|
|
/*isVarArg=*/false),
|
|
|
|
"__cudaPopCallConfiguration");
|
|
|
|
|
|
|
|
CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
|
|
|
|
{GridDim.getPointer(), BlockDim.getPointer(),
|
|
|
|
ShmemSize.getPointer(), Stream.getPointer()});
|
|
|
|
|
|
|
|
// Emit the call to cudaLaunch
|
|
|
|
llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy);
|
|
|
|
CallArgList LaunchKernelArgs;
|
|
|
|
LaunchKernelArgs.add(RValue::get(Kernel),
|
|
|
|
cudaLaunchKernelFD->getParamDecl(0)->getType());
|
|
|
|
LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
|
|
|
|
LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
|
|
|
|
LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
|
|
|
|
cudaLaunchKernelFD->getParamDecl(3)->getType());
|
|
|
|
LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
|
|
|
|
cudaLaunchKernelFD->getParamDecl(4)->getType());
|
|
|
|
LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
|
|
|
|
cudaLaunchKernelFD->getParamDecl(5)->getType());
|
|
|
|
|
|
|
|
QualType QT = cudaLaunchKernelFD->getType();
|
|
|
|
QualType CQT = QT.getCanonicalType();
|
2019-02-02 09:48:23 +08:00
|
|
|
llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
|
2019-02-01 05:34:03 +08:00
|
|
|
llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty);
|
|
|
|
|
|
|
|
const CGFunctionInfo &FI =
|
|
|
|
CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
|
2019-02-06 00:42:33 +08:00
|
|
|
llvm::FunctionCallee cudaLaunchKernelFn =
|
2019-02-01 05:34:03 +08:00
|
|
|
CGM.CreateRuntimeFunction(FTy, "cudaLaunchKernel");
|
|
|
|
CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
|
|
|
|
LaunchKernelArgs);
|
|
|
|
CGF.EmitBranch(EndBlock);
|
|
|
|
|
|
|
|
CGF.EmitBlock(EndBlock);
|
|
|
|
}
|
|
|
|
|
|
|
|
void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
|
|
|
|
FunctionArgList &Args) {
|
2016-07-28 06:36:21 +08:00
|
|
|
// Emit a call to cudaSetupArgument for each arg in Args.
|
2019-02-06 00:42:33 +08:00
|
|
|
llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
|
2016-07-28 06:36:21 +08:00
|
|
|
llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
|
|
|
|
CharUnits Offset = CharUnits::Zero();
|
|
|
|
for (const VarDecl *A : Args) {
|
|
|
|
CharUnits TyWidth, TyAlign;
|
|
|
|
std::tie(TyWidth, TyAlign) =
|
|
|
|
CGM.getContext().getTypeInfoInChars(A->getType());
|
|
|
|
Offset = Offset.alignTo(TyAlign);
|
|
|
|
llvm::Value *Args[] = {
|
|
|
|
CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
|
|
|
|
VoidPtrTy),
|
|
|
|
llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()),
|
|
|
|
llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
|
|
|
|
};
|
2019-01-30 10:54:28 +08:00
|
|
|
llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
|
2011-10-07 02:51:56 +08:00
|
|
|
llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
|
2019-01-30 10:54:28 +08:00
|
|
|
llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
|
2016-07-28 06:36:21 +08:00
|
|
|
llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
|
2019-01-30 10:54:28 +08:00
|
|
|
CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
|
2011-10-07 02:51:56 +08:00
|
|
|
CGF.EmitBlock(NextBlock);
|
2016-07-28 06:36:21 +08:00
|
|
|
Offset += TyWidth;
|
2011-10-07 02:51:56 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Emit the call to cudaLaunch
|
2019-02-06 00:42:33 +08:00
|
|
|
llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
|
2011-10-07 02:51:56 +08:00
|
|
|
llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy);
|
2013-03-01 03:01:20 +08:00
|
|
|
CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
|
2011-10-07 02:51:56 +08:00
|
|
|
CGF.EmitBranch(EndBlock);
|
|
|
|
|
|
|
|
CGF.EmitBlock(EndBlock);
|
2011-10-07 02:29:37 +08:00
|
|
|
}
|
|
|
|
|
2016-03-03 02:28:50 +08:00
|
|
|
/// Creates a function that sets up state on the host side for CUDA objects that
|
|
|
|
/// have a presence on both the host and device sides. Specifically, registers
|
|
|
|
/// the host side of kernel functions and device global variables with the CUDA
|
|
|
|
/// runtime.
|
2015-05-08 03:34:16 +08:00
|
|
|
/// \code
|
2016-03-03 02:28:50 +08:00
|
|
|
/// void __cuda_register_globals(void** GpuBinaryHandle) {
|
2015-05-08 03:34:16 +08:00
|
|
|
/// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
|
|
|
|
/// ...
|
|
|
|
/// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
|
2016-03-03 02:28:50 +08:00
|
|
|
/// __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
|
|
|
|
/// ...
|
|
|
|
/// __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
|
2015-05-08 03:34:16 +08:00
|
|
|
/// }
|
|
|
|
/// \endcode
|
2016-03-03 02:28:50 +08:00
|
|
|
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
|
2016-03-03 02:28:53 +08:00
|
|
|
// No need to register anything
|
|
|
|
if (EmittedKernels.empty() && DeviceVars.empty())
|
|
|
|
return nullptr;
|
|
|
|
|
2015-05-08 03:34:16 +08:00
|
|
|
llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
|
2018-04-20 21:04:45 +08:00
|
|
|
getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
|
2018-04-25 09:10:37 +08:00
|
|
|
addUnderscoredPrefixToName("_register_globals"), &TheModule);
|
2015-05-08 03:34:16 +08:00
|
|
|
llvm::BasicBlock *EntryBB =
|
|
|
|
llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
|
Compute and preserve alignment more faithfully in IR-generation.
Introduce an Address type to bundle a pointer value with an
alignment. Introduce APIs on CGBuilderTy to work with Address
values. Change core APIs on CGF/CGM to traffic in Address where
appropriate. Require alignments to be non-zero. Update a ton
of code to compute and propagate alignment information.
As part of this, I've promoted CGBuiltin's EmitPointerWithAlignment
helper function to CGF and made use of it in a number of places in
the expression emitter.
The end result is that we should now be significantly more correct
when performing operations on objects that are locally known to
be under-aligned. Since alignment is not reliably tracked in the
type system, there are inherent limits to this, but at least we
are no longer confused by standard operations like derived-to-base
conversions and array-to-pointer decay. I've also fixed a large
number of bugs where we were applying the complete-object alignment
to a pointer instead of the non-virtual alignment, although most of
these were hidden by the very conservative approach we took with
member alignment.
Also, because IRGen now reliably asserts on zero alignments, we
should no longer be subject to an absurd but frustrating recurring
bug where an incomplete type would report a zero alignment and then
we'd naively do a alignmentAtOffset on it and emit code using an
alignment equal to the largest power-of-two factor of the offset.
We should also now be emitting much more aggressive alignment
attributes in the presence of over-alignment. In particular,
field access now uses alignmentAtOffset instead of min.
Several times in this patch, I had to change the existing
code-generation pattern in order to more effectively use
the Address APIs. For the most part, this seems to be a strict
improvement, like doing pointer arithmetic with GEPs instead of
ptrtoint. That said, I've tried very hard to not change semantics,
but it is likely that I've failed in a few places, for which I
apologize.
ABIArgInfo now always carries the assumed alignment of indirect and
indirect byval arguments. In order to cut down on what was already
a dauntingly large patch, I changed the code to never set align
attributes in the IR on non-byval indirect arguments. That is,
we still generate code which assumes that indirect arguments have
the given alignment, but we don't express this information to the
backend except where it's semantically required (i.e. on byvals).
This is likely a minor regression for those targets that did provide
this information, but it'll be trivial to add it back in a later
patch.
I partially punted on applying this work to CGBuiltin. Please
do not add more uses of the CreateDefaultAligned{Load,Store}
APIs; they will be going away eventually.
llvm-svn: 246985
2015-09-08 16:05:57 +08:00
|
|
|
CGBuilderTy Builder(CGM, Context);
|
2015-05-08 03:34:16 +08:00
|
|
|
Builder.SetInsertPoint(EntryBB);
|
|
|
|
|
|
|
|
// void __cudaRegisterFunction(void **, const char *, char *, const char *,
|
|
|
|
// int, uint3*, uint3*, dim3*, dim3*, int*)
|
2016-07-02 20:03:57 +08:00
|
|
|
llvm::Type *RegisterFuncParams[] = {
|
2015-05-08 03:34:16 +08:00
|
|
|
VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
|
|
|
|
VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
|
2019-02-06 00:42:33 +08:00
|
|
|
llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
|
2015-05-08 03:34:16 +08:00
|
|
|
llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
|
2018-04-25 09:10:37 +08:00
|
|
|
addUnderscoredPrefixToName("RegisterFunction"));
|
2015-05-08 03:34:16 +08:00
|
|
|
|
|
|
|
// Extract GpuBinaryHandle passed as the first argument passed to
|
2016-03-03 02:28:50 +08:00
|
|
|
// __cuda_register_globals() and generate __cudaRegisterFunction() call for
|
2015-05-08 03:34:16 +08:00
|
|
|
// each emitted kernel.
|
|
|
|
llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
|
2019-02-14 10:00:09 +08:00
|
|
|
for (auto &&I : EmittedKernels) {
|
|
|
|
llvm::Constant *KernelName = makeConstantString(getDeviceSideName(I.D));
|
2015-05-08 03:34:16 +08:00
|
|
|
llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
|
2016-03-03 02:28:50 +08:00
|
|
|
llvm::Value *Args[] = {
|
2019-02-14 10:00:09 +08:00
|
|
|
&GpuBinaryHandlePtr,
|
|
|
|
Builder.CreateBitCast(I.Kernel, VoidPtrTy),
|
|
|
|
KernelName,
|
|
|
|
KernelName,
|
|
|
|
llvm::ConstantInt::get(IntTy, -1),
|
|
|
|
NullPtr,
|
|
|
|
NullPtr,
|
|
|
|
NullPtr,
|
|
|
|
NullPtr,
|
2015-05-08 03:34:16 +08:00
|
|
|
llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
|
2016-03-03 02:28:50 +08:00
|
|
|
Builder.CreateCall(RegisterFunc, Args);
|
|
|
|
}
|
|
|
|
|
|
|
|
// void __cudaRegisterVar(void **, char *, char *, const char *,
|
|
|
|
// int, int, int, int)
|
2016-07-02 20:03:57 +08:00
|
|
|
llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
|
|
|
|
CharPtrTy, IntTy, IntTy,
|
|
|
|
IntTy, IntTy};
|
2019-02-06 00:42:33 +08:00
|
|
|
llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
|
2016-03-03 02:28:50 +08:00
|
|
|
llvm::FunctionType::get(IntTy, RegisterVarParams, false),
|
2018-04-25 09:10:37 +08:00
|
|
|
addUnderscoredPrefixToName("RegisterVar"));
|
2019-02-14 10:00:09 +08:00
|
|
|
for (auto &&Info : DeviceVars) {
|
|
|
|
llvm::GlobalVariable *Var = Info.Var;
|
|
|
|
unsigned Flags = Info.Flag;
|
|
|
|
llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
|
2016-03-03 02:28:50 +08:00
|
|
|
uint64_t VarSize =
|
|
|
|
CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
|
|
|
|
llvm::Value *Args[] = {
|
|
|
|
&GpuBinaryHandlePtr,
|
|
|
|
Builder.CreateBitCast(Var, VoidPtrTy),
|
|
|
|
VarName,
|
|
|
|
VarName,
|
|
|
|
llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0),
|
|
|
|
llvm::ConstantInt::get(IntTy, VarSize),
|
|
|
|
llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0),
|
|
|
|
llvm::ConstantInt::get(IntTy, 0)};
|
|
|
|
Builder.CreateCall(RegisterVar, Args);
|
2015-05-08 03:34:16 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
Builder.CreateRetVoid();
|
|
|
|
return RegisterKernelsFunc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Creates a global constructor function for the module:
|
2018-07-21 06:45:24 +08:00
|
|
|
///
|
|
|
|
/// For CUDA:
|
2015-05-08 03:34:16 +08:00
|
|
|
/// \code
|
|
|
|
/// void __cuda_module_ctor(void*) {
|
2018-03-01 01:53:46 +08:00
|
|
|
/// Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
|
|
|
|
/// __cuda_register_globals(Handle);
|
2015-05-08 03:34:16 +08:00
|
|
|
/// }
|
|
|
|
/// \endcode
|
2018-07-21 06:45:24 +08:00
|
|
|
///
|
|
|
|
/// For HIP:
|
|
|
|
/// \code
|
|
|
|
/// void __hip_module_ctor(void*) {
|
|
|
|
/// if (__hip_gpubin_handle == 0) {
|
|
|
|
/// __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
|
|
|
|
/// __hip_register_globals(__hip_gpubin_handle);
|
|
|
|
/// }
|
|
|
|
/// }
|
|
|
|
/// \endcode
|
2015-05-08 03:34:16 +08:00
|
|
|
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
|
2018-05-18 23:07:56 +08:00
|
|
|
bool IsHIP = CGM.getLangOpts().HIP;
|
2019-04-03 04:49:41 +08:00
|
|
|
bool IsCUDA = CGM.getLangOpts().CUDA;
|
2018-03-01 01:53:46 +08:00
|
|
|
// No need to generate ctors/dtors if there is no GPU binary.
|
2018-05-18 23:07:56 +08:00
|
|
|
StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
|
|
|
|
if (CudaGpuBinaryFileName.empty() && !IsHIP)
|
2016-03-03 02:28:53 +08:00
|
|
|
return nullptr;
|
2019-04-03 05:54:41 +08:00
|
|
|
if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
|
|
|
|
DeviceVars.empty())
|
2019-04-03 04:10:18 +08:00
|
|
|
return nullptr;
|
2016-03-03 02:28:53 +08:00
|
|
|
|
2018-05-18 23:07:56 +08:00
|
|
|
// void __{cuda|hip}_register_globals(void* handle);
|
2016-03-03 02:28:50 +08:00
|
|
|
llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
|
2018-04-20 21:04:45 +08:00
|
|
|
// We always need a function to pass in as callback. Create a dummy
|
|
|
|
// implementation if we don't need to register anything.
|
|
|
|
if (RelocatableDeviceCode && !RegisterGlobalsFunc)
|
|
|
|
RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());
|
|
|
|
|
2018-05-18 23:07:56 +08:00
|
|
|
// void ** __{cuda|hip}RegisterFatBinary(void *);
|
2019-02-06 00:42:33 +08:00
|
|
|
llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
|
2015-05-08 03:34:16 +08:00
|
|
|
llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
|
2018-04-25 09:10:37 +08:00
|
|
|
addUnderscoredPrefixToName("RegisterFatBinary"));
|
2015-05-08 03:34:16 +08:00
|
|
|
// struct { int magic, int version, void * gpu_binary, void * dont_care };
|
|
|
|
llvm::StructType *FatbinWrapperTy =
|
2017-05-10 03:31:30 +08:00
|
|
|
llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);
|
2015-05-08 03:34:16 +08:00
|
|
|
|
2018-03-01 01:53:46 +08:00
|
|
|
// Register GPU binary with the CUDA runtime, store returned handle in a
|
|
|
|
// global variable and save a reference in GpuBinaryHandle to be cleaned up
|
|
|
|
// in destructor on exit. Then associate all known kernels with the GPU binary
|
|
|
|
// handle so CUDA runtime can figure out what to call on the GPU side.
|
2018-10-03 01:48:54 +08:00
|
|
|
std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
|
|
|
|
if (!CudaGpuBinaryFileName.empty()) {
|
2018-05-18 23:07:56 +08:00
|
|
|
llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
|
|
|
|
llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
|
|
|
|
if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
|
|
|
|
CGM.getDiags().Report(diag::err_cannot_open_file)
|
|
|
|
<< CudaGpuBinaryFileName << EC.message();
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
|
2018-03-01 01:53:46 +08:00
|
|
|
}
|
|
|
|
|
2015-05-08 03:34:16 +08:00
|
|
|
llvm::Function *ModuleCtorFunc = llvm::Function::Create(
|
|
|
|
llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
|
2018-04-25 09:10:37 +08:00
|
|
|
llvm::GlobalValue::InternalLinkage,
|
|
|
|
addUnderscoredPrefixToName("_module_ctor"), &TheModule);
|
2015-05-08 03:34:16 +08:00
|
|
|
llvm::BasicBlock *CtorEntryBB =
|
|
|
|
llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
|
Compute and preserve alignment more faithfully in IR-generation.
Introduce an Address type to bundle a pointer value with an
alignment. Introduce APIs on CGBuilderTy to work with Address
values. Change core APIs on CGF/CGM to traffic in Address where
appropriate. Require alignments to be non-zero. Update a ton
of code to compute and propagate alignment information.
As part of this, I've promoted CGBuiltin's EmitPointerWithAlignment
helper function to CGF and made use of it in a number of places in
the expression emitter.
The end result is that we should now be significantly more correct
when performing operations on objects that are locally known to
be under-aligned. Since alignment is not reliably tracked in the
type system, there are inherent limits to this, but at least we
are no longer confused by standard operations like derived-to-base
conversions and array-to-pointer decay. I've also fixed a large
number of bugs where we were applying the complete-object alignment
to a pointer instead of the non-virtual alignment, although most of
these were hidden by the very conservative approach we took with
member alignment.
Also, because IRGen now reliably asserts on zero alignments, we
should no longer be subject to an absurd but frustrating recurring
bug where an incomplete type would report a zero alignment and then
we'd naively do a alignmentAtOffset on it and emit code using an
alignment equal to the largest power-of-two factor of the offset.
We should also now be emitting much more aggressive alignment
attributes in the presence of over-alignment. In particular,
field access now uses alignmentAtOffset instead of min.
Several times in this patch, I had to change the existing
code-generation pattern in order to more effectively use
the Address APIs. For the most part, this seems to be a strict
improvement, like doing pointer arithmetic with GEPs instead of
ptrtoint. That said, I've tried very hard to not change semantics,
but it is likely that I've failed in a few places, for which I
apologize.
ABIArgInfo now always carries the assumed alignment of indirect and
indirect byval arguments. In order to cut down on what was already
a dauntingly large patch, I changed the code to never set align
attributes in the IR on non-byval indirect arguments. That is,
we still generate code which assumes that indirect arguments have
the given alignment, but we don't express this information to the
backend except where it's semantically required (i.e. on byvals).
This is likely a minor regression for those targets that did provide
this information, but it'll be trivial to add it back in a later
patch.
I partially punted on applying this work to CGBuiltin. Please
do not add more uses of the CreateDefaultAligned{Load,Store}
APIs; they will be going away eventually.
llvm-svn: 246985
2015-09-08 16:05:57 +08:00
|
|
|
CGBuilderTy CtorBuilder(CGM, Context);
|
2015-05-08 03:34:16 +08:00
|
|
|
|
|
|
|
CtorBuilder.SetInsertPoint(CtorEntryBB);
|
|
|
|
|
2018-04-20 21:04:45 +08:00
|
|
|
const char *FatbinConstantName;
|
2018-05-18 23:07:56 +08:00
|
|
|
const char *FatbinSectionName;
|
|
|
|
const char *ModuleIDSectionName;
|
|
|
|
StringRef ModuleIDPrefix;
|
|
|
|
llvm::Constant *FatBinStr;
|
|
|
|
unsigned FatMagic;
|
|
|
|
if (IsHIP) {
|
|
|
|
FatbinConstantName = ".hip_fatbin";
|
|
|
|
FatbinSectionName = ".hipFatBinSegment";
|
|
|
|
|
|
|
|
ModuleIDSectionName = "__hip_module_id";
|
|
|
|
ModuleIDPrefix = "__hip_";
|
|
|
|
|
2018-10-03 01:48:54 +08:00
|
|
|
if (CudaGpuBinary) {
|
|
|
|
// If fatbin is available from early finalization, create a string
|
|
|
|
// literal containing the fat binary loaded from the given file.
|
|
|
|
FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "",
|
|
|
|
FatbinConstantName, 8);
|
|
|
|
} else {
|
|
|
|
// If fatbin is not available, create an external symbol
|
|
|
|
// __hip_fatbin in section .hip_fatbin. The external symbol is supposed
|
|
|
|
// to contain the fat binary but will be populated somewhere else,
|
|
|
|
// e.g. by lld through link script.
|
|
|
|
FatBinStr = new llvm::GlobalVariable(
|
2018-05-18 23:07:56 +08:00
|
|
|
CGM.getModule(), CGM.Int8Ty,
|
|
|
|
/*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
|
|
|
|
"__hip_fatbin", nullptr,
|
|
|
|
llvm::GlobalVariable::NotThreadLocal);
|
2018-10-03 01:48:54 +08:00
|
|
|
cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
|
|
|
|
}
|
2018-05-18 23:07:56 +08:00
|
|
|
|
|
|
|
FatMagic = HIPFatMagic;
|
|
|
|
} else {
|
|
|
|
if (RelocatableDeviceCode)
|
2018-06-29 01:15:52 +08:00
|
|
|
FatbinConstantName = CGM.getTriple().isMacOSX()
|
|
|
|
? "__NV_CUDA,__nv_relfatbin"
|
|
|
|
: "__nv_relfatbin";
|
2018-05-18 23:07:56 +08:00
|
|
|
else
|
|
|
|
FatbinConstantName =
|
|
|
|
CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
|
|
|
|
// NVIDIA's cuobjdump looks for fatbins in this section.
|
|
|
|
FatbinSectionName =
|
|
|
|
CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
|
|
|
|
|
2018-06-29 01:15:52 +08:00
|
|
|
ModuleIDSectionName = CGM.getTriple().isMacOSX()
|
|
|
|
? "__NV_CUDA,__nv_module_id"
|
|
|
|
: "__nv_module_id";
|
2018-05-18 23:07:56 +08:00
|
|
|
ModuleIDPrefix = "__nv_";
|
|
|
|
|
|
|
|
// For CUDA, create a string literal containing the fat binary loaded from
|
|
|
|
// the given file.
|
|
|
|
FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "",
|
|
|
|
FatbinConstantName, 8);
|
|
|
|
FatMagic = CudaFatMagic;
|
|
|
|
}
|
2018-03-01 01:53:46 +08:00
|
|
|
|
|
|
|
// Create initialized wrapper structure that points to the loaded GPU binary
|
|
|
|
ConstantInitBuilder Builder(CGM);
|
|
|
|
auto Values = Builder.beginStruct(FatbinWrapperTy);
|
|
|
|
// Fatbin wrapper magic.
|
2018-05-18 23:07:56 +08:00
|
|
|
Values.addInt(IntTy, FatMagic);
|
2018-03-01 01:53:46 +08:00
|
|
|
// Fatbin version.
|
|
|
|
Values.addInt(IntTy, 1);
|
|
|
|
// Data.
|
2018-05-18 23:07:56 +08:00
|
|
|
Values.add(FatBinStr);
|
2018-03-01 01:53:46 +08:00
|
|
|
// Unused in fatbin v1.
|
|
|
|
Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
|
|
|
|
llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
|
2018-04-25 09:10:37 +08:00
|
|
|
addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
|
2018-03-01 01:53:46 +08:00
|
|
|
/*constant*/ true);
|
|
|
|
FatbinWrapper->setSection(FatbinSectionName);
|
|
|
|
|
2018-07-21 06:45:24 +08:00
|
|
|
// There is only one HIP fat binary per linked module, however there are
|
|
|
|
// multiple constructor functions. Make sure the fat binary is registered
|
|
|
|
// only once. The constructor functions are executed by the dynamic loader
|
|
|
|
// before the program gains control. The dynamic loader cannot execute the
|
|
|
|
// constructor functions concurrently since doing that would not guarantee
|
|
|
|
// thread safety of the loaded program. Therefore we can assume sequential
|
|
|
|
// execution of constructor functions here.
|
|
|
|
if (IsHIP) {
|
2018-10-03 01:48:54 +08:00
|
|
|
auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage :
|
|
|
|
llvm::GlobalValue::LinkOnceAnyLinkage;
|
2018-07-21 06:45:24 +08:00
|
|
|
llvm::BasicBlock *IfBlock =
|
|
|
|
llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
|
|
|
|
llvm::BasicBlock *ExitBlock =
|
|
|
|
llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
|
|
|
|
// The name, size, and initialization pattern of this variable is part
|
|
|
|
// of HIP ABI.
|
|
|
|
GpuBinaryHandle = new llvm::GlobalVariable(
|
|
|
|
TheModule, VoidPtrPtrTy, /*isConstant=*/false,
|
2018-10-03 01:48:54 +08:00
|
|
|
Linkage,
|
2018-07-21 06:45:24 +08:00
|
|
|
/*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
|
|
|
|
"__hip_gpubin_handle");
|
|
|
|
GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity());
|
2018-08-18 01:47:31 +08:00
|
|
|
// Prevent the weak symbol in different shared libraries being merged.
|
2018-10-03 01:48:54 +08:00
|
|
|
if (Linkage != llvm::GlobalValue::InternalLinkage)
|
|
|
|
GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
|
2018-07-21 06:45:24 +08:00
|
|
|
Address GpuBinaryAddr(
|
|
|
|
GpuBinaryHandle,
|
|
|
|
CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
|
|
|
|
{
|
|
|
|
auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
|
|
|
|
llvm::Constant *Zero =
|
|
|
|
llvm::Constant::getNullValue(HandleValue->getType());
|
|
|
|
llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
|
|
|
|
CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
|
|
|
|
}
|
|
|
|
{
|
|
|
|
CtorBuilder.SetInsertPoint(IfBlock);
|
|
|
|
// GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
|
|
|
|
llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
|
|
|
|
RegisterFatbinFunc,
|
|
|
|
CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
|
|
|
|
CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
|
|
|
|
CtorBuilder.CreateBr(ExitBlock);
|
|
|
|
}
|
|
|
|
{
|
|
|
|
CtorBuilder.SetInsertPoint(ExitBlock);
|
|
|
|
// Call __hip_register_globals(GpuBinaryHandle);
|
|
|
|
if (RegisterGlobalsFunc) {
|
|
|
|
auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
|
|
|
|
CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if (!RelocatableDeviceCode) {
|
|
|
|
// Register binary with CUDA runtime. This is substantially different in
|
|
|
|
// default mode vs. separate compilation!
|
|
|
|
// GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
|
2018-04-20 21:04:45 +08:00
|
|
|
llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
|
|
|
|
RegisterFatbinFunc,
|
|
|
|
CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
|
|
|
|
GpuBinaryHandle = new llvm::GlobalVariable(
|
|
|
|
TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
|
2018-07-21 06:45:24 +08:00
|
|
|
llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
|
|
|
|
GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity());
|
2018-04-20 21:04:45 +08:00
|
|
|
CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
|
|
|
|
CGM.getPointerAlign());
|
|
|
|
|
2018-07-21 06:45:24 +08:00
|
|
|
// Call __cuda_register_globals(GpuBinaryHandle);
|
2018-04-20 21:04:45 +08:00
|
|
|
if (RegisterGlobalsFunc)
|
|
|
|
CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
|
2019-02-06 06:38:58 +08:00
|
|
|
|
|
|
|
// Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
|
|
|
|
if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
|
|
|
|
CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
|
|
|
|
// void __cudaRegisterFatBinaryEnd(void **);
|
|
|
|
llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
|
|
|
|
llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
|
|
|
|
"__cudaRegisterFatBinaryEnd");
|
|
|
|
CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
|
|
|
|
}
|
2018-04-20 21:04:45 +08:00
|
|
|
} else {
|
|
|
|
// Generate a unique module ID.
|
2018-05-18 23:07:56 +08:00
|
|
|
SmallString<64> ModuleID;
|
|
|
|
llvm::raw_svector_ostream OS(ModuleID);
|
2018-10-06 02:39:58 +08:00
|
|
|
OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
|
2018-05-18 23:07:56 +08:00
|
|
|
llvm::Constant *ModuleIDConstant =
|
|
|
|
makeConstantString(ModuleID.str(), "", ModuleIDSectionName, 32);
|
|
|
|
|
2018-07-21 06:45:24 +08:00
|
|
|
// Create an alias for the FatbinWrapper that nvcc will look for.
|
2018-04-20 21:04:45 +08:00
|
|
|
llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
|
2018-05-18 23:07:56 +08:00
|
|
|
Twine("__fatbinwrap") + ModuleID, FatbinWrapper);
|
2018-04-20 21:04:45 +08:00
|
|
|
|
2018-07-21 06:45:24 +08:00
|
|
|
// void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
|
2018-04-20 21:04:45 +08:00
|
|
|
// void *, void (*)(void **))
|
2018-07-21 06:45:24 +08:00
|
|
|
SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
|
2018-05-18 23:07:56 +08:00
|
|
|
RegisterLinkedBinaryName += ModuleID;
|
2019-02-06 00:42:33 +08:00
|
|
|
llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
|
2018-04-20 21:04:45 +08:00
|
|
|
getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
|
|
|
|
|
|
|
|
assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
|
|
|
|
llvm::Value *Args[] = {RegisterGlobalsFunc,
|
|
|
|
CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
|
2018-05-18 23:07:56 +08:00
|
|
|
ModuleIDConstant,
|
2018-04-20 21:04:45 +08:00
|
|
|
makeDummyFunction(getCallbackFnTy())};
|
|
|
|
CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
|
|
|
|
}
|
2015-05-08 03:34:16 +08:00
|
|
|
|
2018-06-28 02:32:51 +08:00
|
|
|
// Create destructor and register it with atexit() the way NVCC does it. Doing
|
|
|
|
// it during regular destructor phase worked in CUDA before 9.2 but results in
|
|
|
|
// double-free in 9.2.
|
|
|
|
if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
|
|
|
|
// extern "C" int atexit(void (*f)(void));
|
|
|
|
llvm::FunctionType *AtExitTy =
|
|
|
|
llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
|
2019-02-06 00:42:33 +08:00
|
|
|
llvm::FunctionCallee AtExitFunc =
|
2018-06-28 02:32:51 +08:00
|
|
|
CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
|
|
|
|
/*Local=*/true);
|
|
|
|
CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
|
|
|
|
}
|
|
|
|
|
2015-05-08 03:34:16 +08:00
|
|
|
CtorBuilder.CreateRetVoid();
|
|
|
|
return ModuleCtorFunc;
|
|
|
|
}
|
|
|
|
|
2018-03-01 01:53:46 +08:00
|
|
|
/// Creates a global destructor function that unregisters the GPU code blob
|
2015-05-08 03:34:16 +08:00
|
|
|
/// registered by constructor.
|
2018-07-21 06:45:24 +08:00
|
|
|
///
|
|
|
|
/// For CUDA:
|
2015-05-08 03:34:16 +08:00
|
|
|
/// \code
|
|
|
|
/// void __cuda_module_dtor(void*) {
|
2018-03-01 01:53:46 +08:00
|
|
|
/// __cudaUnregisterFatBinary(Handle);
|
2015-05-08 03:34:16 +08:00
|
|
|
/// }
|
|
|
|
/// \endcode
|
2018-07-21 06:45:24 +08:00
|
|
|
///
|
|
|
|
/// For HIP:
|
|
|
|
/// \code
|
|
|
|
/// void __hip_module_dtor(void*) {
|
|
|
|
/// if (__hip_gpubin_handle) {
|
|
|
|
/// __hipUnregisterFatBinary(__hip_gpubin_handle);
|
|
|
|
/// __hip_gpubin_handle = 0;
|
|
|
|
/// }
|
|
|
|
/// }
|
|
|
|
/// \endcode
|
2015-05-08 03:34:16 +08:00
|
|
|
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
|
2018-03-01 01:53:46 +08:00
|
|
|
// No need for destructor if we don't have a handle to unregister.
|
|
|
|
if (!GpuBinaryHandle)
|
2016-03-03 02:28:53 +08:00
|
|
|
return nullptr;
|
|
|
|
|
2015-05-08 03:34:16 +08:00
|
|
|
// void __cudaUnregisterFatBinary(void ** handle);
|
2019-02-06 00:42:33 +08:00
|
|
|
llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
|
2015-05-08 03:34:16 +08:00
|
|
|
llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
|
2018-04-25 09:10:37 +08:00
|
|
|
addUnderscoredPrefixToName("UnregisterFatBinary"));
|
2015-05-08 03:34:16 +08:00
|
|
|
|
|
|
|
llvm::Function *ModuleDtorFunc = llvm::Function::Create(
|
|
|
|
llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
|
2018-04-25 09:10:37 +08:00
|
|
|
llvm::GlobalValue::InternalLinkage,
|
|
|
|
addUnderscoredPrefixToName("_module_dtor"), &TheModule);
|
|
|
|
|
2015-05-08 03:34:16 +08:00
|
|
|
llvm::BasicBlock *DtorEntryBB =
|
|
|
|
llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
|
Compute and preserve alignment more faithfully in IR-generation.
Introduce an Address type to bundle a pointer value with an
alignment. Introduce APIs on CGBuilderTy to work with Address
values. Change core APIs on CGF/CGM to traffic in Address where
appropriate. Require alignments to be non-zero. Update a ton
of code to compute and propagate alignment information.
As part of this, I've promoted CGBuiltin's EmitPointerWithAlignment
helper function to CGF and made use of it in a number of places in
the expression emitter.
The end result is that we should now be significantly more correct
when performing operations on objects that are locally known to
be under-aligned. Since alignment is not reliably tracked in the
type system, there are inherent limits to this, but at least we
are no longer confused by standard operations like derived-to-base
conversions and array-to-pointer decay. I've also fixed a large
number of bugs where we were applying the complete-object alignment
to a pointer instead of the non-virtual alignment, although most of
these were hidden by the very conservative approach we took with
member alignment.
Also, because IRGen now reliably asserts on zero alignments, we
should no longer be subject to an absurd but frustrating recurring
bug where an incomplete type would report a zero alignment and then
we'd naively do a alignmentAtOffset on it and emit code using an
alignment equal to the largest power-of-two factor of the offset.
We should also now be emitting much more aggressive alignment
attributes in the presence of over-alignment. In particular,
field access now uses alignmentAtOffset instead of min.
Several times in this patch, I had to change the existing
code-generation pattern in order to more effectively use
the Address APIs. For the most part, this seems to be a strict
improvement, like doing pointer arithmetic with GEPs instead of
ptrtoint. That said, I've tried very hard to not change semantics,
but it is likely that I've failed in a few places, for which I
apologize.
ABIArgInfo now always carries the assumed alignment of indirect and
indirect byval arguments. In order to cut down on what was already
a dauntingly large patch, I changed the code to never set align
attributes in the IR on non-byval indirect arguments. That is,
we still generate code which assumes that indirect arguments have
the given alignment, but we don't express this information to the
backend except where it's semantically required (i.e. on byvals).
This is likely a minor regression for those targets that did provide
this information, but it'll be trivial to add it back in a later
patch.
I partially punted on applying this work to CGBuiltin. Please
do not add more uses of the CreateDefaultAligned{Load,Store}
APIs; they will be going away eventually.
llvm-svn: 246985
2015-09-08 16:05:57 +08:00
|
|
|
CGBuilderTy DtorBuilder(CGM, Context);
|
2015-05-08 03:34:16 +08:00
|
|
|
DtorBuilder.SetInsertPoint(DtorEntryBB);
|
|
|
|
|
2018-07-21 06:45:24 +08:00
|
|
|
Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity(
|
|
|
|
GpuBinaryHandle->getAlignment()));
|
|
|
|
auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
|
|
|
|
// There is only one HIP fat binary per linked module, however there are
|
|
|
|
// multiple destructor functions. Make sure the fat binary is unregistered
|
|
|
|
// only once.
|
|
|
|
if (CGM.getLangOpts().HIP) {
|
|
|
|
llvm::BasicBlock *IfBlock =
|
|
|
|
llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
|
|
|
|
llvm::BasicBlock *ExitBlock =
|
|
|
|
llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
|
|
|
|
llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
|
|
|
|
llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
|
|
|
|
DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);
|
|
|
|
|
|
|
|
DtorBuilder.SetInsertPoint(IfBlock);
|
|
|
|
DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
|
|
|
|
DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
|
|
|
|
DtorBuilder.CreateBr(ExitBlock);
|
|
|
|
|
|
|
|
DtorBuilder.SetInsertPoint(ExitBlock);
|
|
|
|
} else {
|
|
|
|
DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
|
|
|
|
}
|
2015-05-08 03:34:16 +08:00
|
|
|
DtorBuilder.CreateRetVoid();
|
|
|
|
return ModuleDtorFunc;
|
|
|
|
}
|
|
|
|
|
2011-10-07 02:29:37 +08:00
|
|
|
CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
|
|
|
|
return new CGNVCUDARuntime(CGM);
|
|
|
|
}
|