2011-10-07 02:29:37 +08:00
|
|
|
//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
|
|
|
|
//
|
2019-01-19 16:50:56 +08:00
|
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
2011-10-07 02:29:37 +08:00
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
|
|
|
|
// runtime library.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
#include "CGCUDARuntime.h"
|
2021-03-25 05:28:56 +08:00
|
|
|
#include "CGCXXABI.h"
|
2011-10-07 02:51:56 +08:00
|
|
|
#include "CodeGenFunction.h"
|
|
|
|
#include "CodeGenModule.h"
|
|
|
|
#include "clang/AST/Decl.h"
|
2019-02-01 05:34:03 +08:00
|
|
|
#include "clang/Basic/Cuda.h"
|
|
|
|
#include "clang/CodeGen/CodeGenABITypes.h"
|
2018-04-20 21:04:45 +08:00
|
|
|
#include "clang/CodeGen/ConstantInitBuilder.h"
|
2013-01-02 19:45:17 +08:00
|
|
|
#include "llvm/IR/BasicBlock.h"
|
|
|
|
#include "llvm/IR/Constants.h"
|
|
|
|
#include "llvm/IR/DerivedTypes.h"
|
2020-10-15 20:38:46 +08:00
|
|
|
#include "llvm/IR/ReplaceConstant.h"
|
2018-04-20 21:04:45 +08:00
|
|
|
#include "llvm/Support/Format.h"
|
2011-10-07 02:29:37 +08:00
|
|
|
|
|
|
|
using namespace clang;
|
|
|
|
using namespace CodeGen;
|
|
|
|
|
|
|
|
namespace {
|
2018-05-18 23:07:56 +08:00
|
|
|
constexpr unsigned CudaFatMagic = 0x466243b1;
|
|
|
|
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
|
2011-10-07 02:29:37 +08:00
|
|
|
|
|
|
|
class CGNVCUDARuntime : public CGCUDARuntime {
|
2011-10-07 02:51:56 +08:00
|
|
|
|
|
|
|
private:
|
2016-11-19 16:17:24 +08:00
|
|
|
llvm::IntegerType *IntTy, *SizeTy;
|
|
|
|
llvm::Type *VoidTy;
|
2015-05-08 03:34:16 +08:00
|
|
|
llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;
|
|
|
|
|
|
|
|
/// Convenience reference to LLVM Context
|
|
|
|
llvm::LLVMContext &Context;
|
|
|
|
/// Convenience reference to the current module
|
|
|
|
llvm::Module &TheModule;
|
2021-02-09 01:29:29 +08:00
|
|
|
/// Keeps track of kernel launch stubs and handles emitted in this module
|
2019-02-14 10:00:09 +08:00
|
|
|
struct KernelInfo {
|
2021-02-09 01:29:29 +08:00
|
|
|
llvm::Function *Kernel; // stub function to help launch kernel
|
2019-02-14 10:00:09 +08:00
|
|
|
const Decl *D;
|
|
|
|
};
|
|
|
|
llvm::SmallVector<KernelInfo, 16> EmittedKernels;
|
2021-02-09 01:29:29 +08:00
|
|
|
// Map a device stub function to a symbol for identifying kernel in host code.
|
|
|
|
// For CUDA, the symbol for identifying the kernel is the same as the device
|
|
|
|
// stub function. For HIP, they are different.
|
|
|
|
llvm::DenseMap<llvm::Function *, llvm::GlobalValue *> KernelHandles;
|
|
|
|
// Map a kernel handle to the kernel stub.
|
|
|
|
llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
|
2019-02-14 10:00:09 +08:00
|
|
|
struct VarInfo {
|
|
|
|
llvm::GlobalVariable *Var;
|
|
|
|
const VarDecl *D;
|
2020-03-28 03:47:12 +08:00
|
|
|
DeviceVarFlags Flags;
|
2019-02-14 10:00:09 +08:00
|
|
|
};
|
|
|
|
llvm::SmallVector<VarInfo, 16> DeviceVars;
|
2018-03-01 01:53:46 +08:00
|
|
|
/// Keeps track of variable containing handle of GPU binary. Populated by
|
2015-05-08 03:34:16 +08:00
|
|
|
/// ModuleCtorFunction() and used to create corresponding cleanup calls in
|
|
|
|
/// ModuleDtorFunction()
|
2018-03-01 01:53:46 +08:00
|
|
|
llvm::GlobalVariable *GpuBinaryHandle = nullptr;
|
2018-04-20 21:04:45 +08:00
|
|
|
/// Whether we generate relocatable device code.
|
|
|
|
bool RelocatableDeviceCode;
|
2019-02-14 10:00:09 +08:00
|
|
|
/// Mangle context for device.
|
|
|
|
std::unique_ptr<MangleContext> DeviceMC;
|
2011-10-07 02:51:56 +08:00
|
|
|
|
2019-02-06 00:42:33 +08:00
|
|
|
llvm::FunctionCallee getSetupArgumentFn() const;
|
|
|
|
llvm::FunctionCallee getLaunchFn() const;
|
2011-10-07 02:51:56 +08:00
|
|
|
|
2018-04-20 21:04:45 +08:00
|
|
|
llvm::FunctionType *getRegisterGlobalsFnTy() const;
|
|
|
|
llvm::FunctionType *getCallbackFnTy() const;
|
|
|
|
llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
|
2018-04-25 09:10:37 +08:00
|
|
|
std::string addPrefixToName(StringRef FuncName) const;
|
|
|
|
std::string addUnderscoredPrefixToName(StringRef FuncName) const;
|
2018-04-20 21:04:45 +08:00
|
|
|
|
2015-05-08 03:34:16 +08:00
|
|
|
/// Creates a function to register all kernel stubs generated in this module.
|
2016-03-03 02:28:50 +08:00
|
|
|
llvm::Function *makeRegisterGlobalsFn();
|
2015-05-08 03:34:16 +08:00
|
|
|
|
|
|
|
/// Helper function that generates a constant string and returns a pointer to
|
|
|
|
/// the start of the string. The result of this function can be used anywhere
|
|
|
|
/// where the C code specifies const char*.
|
|
|
|
llvm::Constant *makeConstantString(const std::string &Str,
|
|
|
|
const std::string &Name = "",
|
2016-08-13 02:44:01 +08:00
|
|
|
const std::string &SectionName = "",
|
2015-05-08 03:34:16 +08:00
|
|
|
unsigned Alignment = 0) {
|
|
|
|
llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
|
|
|
|
llvm::ConstantInt::get(SizeTy, 0)};
|
Compute and preserve alignment more faithfully in IR-generation.
Introduce an Address type to bundle a pointer value with an
alignment. Introduce APIs on CGBuilderTy to work with Address
values. Change core APIs on CGF/CGM to traffic in Address where
appropriate. Require alignments to be non-zero. Update a ton
of code to compute and propagate alignment information.
As part of this, I've promoted CGBuiltin's EmitPointerWithAlignment
helper function to CGF and made use of it in a number of places in
the expression emitter.
The end result is that we should now be significantly more correct
when performing operations on objects that are locally known to
be under-aligned. Since alignment is not reliably tracked in the
type system, there are inherent limits to this, but at least we
are no longer confused by standard operations like derived-to-base
conversions and array-to-pointer decay. I've also fixed a large
number of bugs where we were applying the complete-object alignment
to a pointer instead of the non-virtual alignment, although most of
these were hidden by the very conservative approach we took with
member alignment.
Also, because IRGen now reliably asserts on zero alignments, we
should no longer be subject to an absurd but frustrating recurring
bug where an incomplete type would report a zero alignment and then
we'd naively do a alignmentAtOffset on it and emit code using an
alignment equal to the largest power-of-two factor of the offset.
We should also now be emitting much more aggressive alignment
attributes in the presence of over-alignment. In particular,
field access now uses alignmentAtOffset instead of min.
Several times in this patch, I had to change the existing
code-generation pattern in order to more effectively use
the Address APIs. For the most part, this seems to be a strict
improvement, like doing pointer arithmetic with GEPs instead of
ptrtoint. That said, I've tried very hard to not change semantics,
but it is likely that I've failed in a few places, for which I
apologize.
ABIArgInfo now always carries the assumed alignment of indirect and
indirect byval arguments. In order to cut down on what was already
a dauntingly large patch, I changed the code to never set align
attributes in the IR on non-byval indirect arguments. That is,
we still generate code which assumes that indirect arguments have
the given alignment, but we don't express this information to the
backend except where it's semantically required (i.e. on byvals).
This is likely a minor regression for those targets that did provide
this information, but it'll be trivial to add it back in a later
patch.
I partially punted on applying this work to CGBuiltin. Please
do not add more uses of the CreateDefaultAligned{Load,Store}
APIs; they will be going away eventually.
llvm-svn: 246985
2015-09-08 16:05:57 +08:00
|
|
|
auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
|
2016-08-13 02:44:01 +08:00
|
|
|
llvm::GlobalVariable *GV =
|
|
|
|
cast<llvm::GlobalVariable>(ConstStr.getPointer());
|
2018-06-08 19:17:08 +08:00
|
|
|
if (!SectionName.empty()) {
|
2016-08-13 02:44:01 +08:00
|
|
|
GV->setSection(SectionName);
|
2018-06-08 19:17:08 +08:00
|
|
|
// Mark the address as used which make sure that this section isn't
|
|
|
|
// merged and we will really have it in the object file.
|
|
|
|
GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
|
|
|
|
}
|
2016-08-13 02:44:01 +08:00
|
|
|
if (Alignment)
|
2019-10-03 21:00:29 +08:00
|
|
|
GV->setAlignment(llvm::Align(Alignment));
|
2016-08-13 02:44:01 +08:00
|
|
|
|
Compute and preserve alignment more faithfully in IR-generation.
Introduce an Address type to bundle a pointer value with an
alignment. Introduce APIs on CGBuilderTy to work with Address
values. Change core APIs on CGF/CGM to traffic in Address where
appropriate. Require alignments to be non-zero. Update a ton
of code to compute and propagate alignment information.
As part of this, I've promoted CGBuiltin's EmitPointerWithAlignment
helper function to CGF and made use of it in a number of places in
the expression emitter.
The end result is that we should now be significantly more correct
when performing operations on objects that are locally known to
be under-aligned. Since alignment is not reliably tracked in the
type system, there are inherent limits to this, but at least we
are no longer confused by standard operations like derived-to-base
conversions and array-to-pointer decay. I've also fixed a large
number of bugs where we were applying the complete-object alignment
to a pointer instead of the non-virtual alignment, although most of
these were hidden by the very conservative approach we took with
member alignment.
Also, because IRGen now reliably asserts on zero alignments, we
should no longer be subject to an absurd but frustrating recurring
bug where an incomplete type would report a zero alignment and then
we'd naively do a alignmentAtOffset on it and emit code using an
alignment equal to the largest power-of-two factor of the offset.
We should also now be emitting much more aggressive alignment
attributes in the presence of over-alignment. In particular,
field access now uses alignmentAtOffset instead of min.
Several times in this patch, I had to change the existing
code-generation pattern in order to more effectively use
the Address APIs. For the most part, this seems to be a strict
improvement, like doing pointer arithmetic with GEPs instead of
ptrtoint. That said, I've tried very hard to not change semantics,
but it is likely that I've failed in a few places, for which I
apologize.
ABIArgInfo now always carries the assumed alignment of indirect and
indirect byval arguments. In order to cut down on what was already
a dauntingly large patch, I changed the code to never set align
attributes in the IR on non-byval indirect arguments. That is,
we still generate code which assumes that indirect arguments have
the given alignment, but we don't express this information to the
backend except where it's semantically required (i.e. on byvals).
This is likely a minor regression for those targets that did provide
this information, but it'll be trivial to add it back in a later
patch.
I partially punted on applying this work to CGBuiltin. Please
do not add more uses of the CreateDefaultAligned{Load,Store}
APIs; they will be going away eventually.
llvm-svn: 246985
2015-09-08 16:05:57 +08:00
|
|
|
return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
|
|
|
|
ConstStr.getPointer(), Zeros);
|
2018-04-20 21:04:45 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Helper function that generates an empty dummy function returning void.
|
|
|
|
llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
|
|
|
|
assert(FnTy->getReturnType()->isVoidTy() &&
|
|
|
|
"Can only generate dummy functions returning void!");
|
|
|
|
llvm::Function *DummyFunc = llvm::Function::Create(
|
|
|
|
FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);
|
|
|
|
|
|
|
|
llvm::BasicBlock *DummyBlock =
|
|
|
|
llvm::BasicBlock::Create(Context, "", DummyFunc);
|
|
|
|
CGBuilderTy FuncBuilder(CGM, Context);
|
|
|
|
FuncBuilder.SetInsertPoint(DummyBlock);
|
|
|
|
FuncBuilder.CreateRetVoid();
|
|
|
|
|
|
|
|
return DummyFunc;
|
|
|
|
}
|
2015-05-08 03:34:16 +08:00
|
|
|
|
2019-02-01 05:34:03 +08:00
|
|
|
void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
|
|
|
|
void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
|
2020-03-06 01:59:33 +08:00
|
|
|
std::string getDeviceSideName(const NamedDecl *ND) override;
|
2015-05-08 03:34:16 +08:00
|
|
|
|
2019-02-14 10:00:09 +08:00
|
|
|
void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
|
2021-01-27 06:11:01 +08:00
|
|
|
bool Extern, bool Constant) {
|
2020-03-28 03:47:12 +08:00
|
|
|
DeviceVars.push_back({&Var,
|
|
|
|
VD,
|
|
|
|
{DeviceVarFlags::Variable, Extern, Constant,
|
2020-10-15 20:38:46 +08:00
|
|
|
VD->hasAttr<HIPManagedAttr>(),
|
|
|
|
/*Normalized*/ false, 0}});
|
2020-03-28 03:47:12 +08:00
|
|
|
}
|
|
|
|
void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
|
2021-01-27 06:11:01 +08:00
|
|
|
bool Extern, int Type) {
|
2020-03-28 03:47:12 +08:00
|
|
|
DeviceVars.push_back({&Var,
|
|
|
|
VD,
|
|
|
|
{DeviceVarFlags::Surface, Extern, /*Constant*/ false,
|
2020-10-15 20:38:46 +08:00
|
|
|
/*Managed*/ false,
|
2020-03-28 03:47:12 +08:00
|
|
|
/*Normalized*/ false, Type}});
|
|
|
|
}
|
|
|
|
void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
|
2021-01-27 06:11:01 +08:00
|
|
|
bool Extern, int Type, bool Normalized) {
|
2020-03-28 03:47:12 +08:00
|
|
|
DeviceVars.push_back({&Var,
|
|
|
|
VD,
|
|
|
|
{DeviceVarFlags::Texture, Extern, /*Constant*/ false,
|
2020-10-15 20:38:46 +08:00
|
|
|
/*Managed*/ false, Normalized, Type}});
|
2016-03-03 02:28:50 +08:00
|
|
|
}
|
|
|
|
|
2021-02-06 03:40:16 +08:00
|
|
|
/// Creates module constructor function
|
|
|
|
llvm::Function *makeModuleCtorFunction();
|
|
|
|
/// Creates module destructor function
|
|
|
|
llvm::Function *makeModuleDtorFunction();
|
|
|
|
/// Transform managed variables for device compilation.
|
|
|
|
void transformManagedVars();
|
|
|
|
|
2021-01-27 06:11:01 +08:00
|
|
|
public:
|
|
|
|
CGNVCUDARuntime(CodeGenModule &CGM);
|
|
|
|
|
2021-02-09 01:29:29 +08:00
|
|
|
llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override;
|
|
|
|
llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override {
|
|
|
|
auto Loc = KernelStubs.find(Handle);
|
|
|
|
assert(Loc != KernelStubs.end());
|
|
|
|
return Loc->second;
|
|
|
|
}
|
2021-01-27 06:11:01 +08:00
|
|
|
void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
|
|
|
|
void handleVarRegistration(const VarDecl *VD,
|
|
|
|
llvm::GlobalVariable &Var) override;
|
|
|
|
void
|
|
|
|
internalizeDeviceSideVar(const VarDecl *D,
|
|
|
|
llvm::GlobalValue::LinkageTypes &Linkage) override;
|
2021-02-06 03:40:16 +08:00
|
|
|
|
|
|
|
llvm::Function *finalizeModule() override;
|
2011-10-07 02:29:37 +08:00
|
|
|
};
|
|
|
|
|
2015-06-23 07:07:51 +08:00
|
|
|
}
|
2011-10-07 02:29:37 +08:00
|
|
|
|
2018-04-25 09:10:37 +08:00
|
|
|
std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
|
|
|
|
if (CGM.getLangOpts().HIP)
|
|
|
|
return ((Twine("hip") + Twine(FuncName)).str());
|
|
|
|
return ((Twine("cuda") + Twine(FuncName)).str());
|
|
|
|
}
|
|
|
|
std::string
|
|
|
|
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
|
|
|
|
if (CGM.getLangOpts().HIP)
|
|
|
|
return ((Twine("__hip") + Twine(FuncName)).str());
|
|
|
|
return ((Twine("__cuda") + Twine(FuncName)).str());
|
|
|
|
}
|
|
|
|
|
2021-04-23 23:22:35 +08:00
|
|
|
static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
|
|
|
|
// If the host and device have different C++ ABIs, mark it as the device
|
|
|
|
// mangle context so that the mangling needs to retrieve the additional
|
|
|
|
// device lambda mangling number instead of the regular host one.
|
|
|
|
if (CGM.getContext().getAuxTargetInfo() &&
|
|
|
|
CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
|
|
|
|
CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
|
|
|
|
return std::unique_ptr<MangleContext>(
|
|
|
|
CGM.getContext().createDeviceMangleContext(
|
|
|
|
*CGM.getContext().getAuxTargetInfo()));
|
|
|
|
}
|
|
|
|
|
|
|
|
return std::unique_ptr<MangleContext>(CGM.getContext().createMangleContext(
|
|
|
|
CGM.getContext().getAuxTargetInfo()));
|
|
|
|
}
|
|
|
|
|
2015-05-08 03:34:16 +08:00
|
|
|
CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
|
|
|
|
: CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
|
2018-04-20 21:04:45 +08:00
|
|
|
TheModule(CGM.getModule()),
|
2019-02-14 10:00:09 +08:00
|
|
|
RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
|
2021-04-23 23:22:35 +08:00
|
|
|
DeviceMC(InitDeviceMC(CGM)) {
|
2011-10-07 02:51:56 +08:00
|
|
|
CodeGen::CodeGenTypes &Types = CGM.getTypes();
|
|
|
|
ASTContext &Ctx = CGM.getContext();
|
|
|
|
|
2016-11-19 16:17:24 +08:00
|
|
|
IntTy = CGM.IntTy;
|
|
|
|
SizeTy = CGM.SizeTy;
|
|
|
|
VoidTy = CGM.VoidTy;
|
2011-10-07 02:51:56 +08:00
|
|
|
|
|
|
|
CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
|
|
|
|
VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
|
2015-05-08 03:34:16 +08:00
|
|
|
VoidPtrPtrTy = VoidPtrTy->getPointerTo();
|
2011-10-07 02:51:56 +08:00
|
|
|
}
|
|
|
|
|
2019-02-06 00:42:33 +08:00
|
|
|
llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
|
2011-10-07 02:51:56 +08:00
|
|
|
// cudaError_t cudaSetupArgument(void *, size_t, size_t)
|
2016-07-02 19:41:41 +08:00
|
|
|
llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
|
2018-04-25 09:10:37 +08:00
|
|
|
return CGM.CreateRuntimeFunction(
|
|
|
|
llvm::FunctionType::get(IntTy, Params, false),
|
|
|
|
addPrefixToName("SetupArgument"));
|
2011-10-07 02:51:56 +08:00
|
|
|
}
|
|
|
|
|
2019-02-06 00:42:33 +08:00
|
|
|
llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
|
2018-04-25 09:10:37 +08:00
|
|
|
if (CGM.getLangOpts().HIP) {
|
|
|
|
// hipError_t hipLaunchByPtr(char *);
|
|
|
|
return CGM.CreateRuntimeFunction(
|
|
|
|
llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr");
|
|
|
|
} else {
|
|
|
|
// cudaError_t cudaLaunch(char *);
|
|
|
|
return CGM.CreateRuntimeFunction(
|
|
|
|
llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
|
|
|
|
}
|
2015-05-08 03:34:16 +08:00
|
|
|
}
|
|
|
|
|
2018-04-20 21:04:45 +08:00
|
|
|
llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
|
|
|
|
return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
|
|
|
|
}
|
|
|
|
|
|
|
|
llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
|
|
|
|
return llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
|
|
|
|
}
|
|
|
|
|
|
|
|
llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
|
|
|
|
auto CallbackFnTy = getCallbackFnTy();
|
|
|
|
auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
|
|
|
|
llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy,
|
|
|
|
VoidPtrTy, CallbackFnTy->getPointerTo()};
|
|
|
|
return llvm::FunctionType::get(VoidTy, Params, false);
|
|
|
|
}
|
|
|
|
|
2020-03-06 01:59:33 +08:00
|
|
|
std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
|
|
|
|
GlobalDecl GD;
|
|
|
|
// D could be either a kernel or a variable.
|
|
|
|
if (auto *FD = dyn_cast<FunctionDecl>(ND))
|
|
|
|
GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
|
|
|
|
else
|
|
|
|
GD = GlobalDecl(ND);
|
2019-02-14 10:00:09 +08:00
|
|
|
std::string DeviceSideName;
|
2021-03-25 05:28:56 +08:00
|
|
|
MangleContext *MC;
|
|
|
|
if (CGM.getLangOpts().CUDAIsDevice)
|
|
|
|
MC = &CGM.getCXXABI().getMangleContext();
|
|
|
|
else
|
|
|
|
MC = DeviceMC.get();
|
|
|
|
if (MC->shouldMangleDeclName(ND)) {
|
2019-02-14 10:00:09 +08:00
|
|
|
SmallString<256> Buffer;
|
|
|
|
llvm::raw_svector_ostream Out(Buffer);
|
2021-03-25 05:28:56 +08:00
|
|
|
MC->mangleName(GD, Out);
|
2020-01-29 03:23:46 +08:00
|
|
|
DeviceSideName = std::string(Out.str());
|
2019-02-14 10:00:09 +08:00
|
|
|
} else
|
2020-01-29 03:23:46 +08:00
|
|
|
DeviceSideName = std::string(ND->getIdentifier()->getName());
|
2021-01-20 06:36:58 +08:00
|
|
|
|
|
|
|
// Make unique name for device side static file-scope variable for HIP.
|
|
|
|
if (CGM.getContext().shouldExternalizeStaticVar(ND) &&
|
|
|
|
CGM.getLangOpts().GPURelocatableDeviceCode &&
|
|
|
|
!CGM.getLangOpts().CUID.empty()) {
|
|
|
|
SmallString<256> Buffer;
|
|
|
|
llvm::raw_svector_ostream Out(Buffer);
|
|
|
|
Out << DeviceSideName;
|
|
|
|
CGM.printPostfixForExternalizedStaticVar(Out);
|
|
|
|
DeviceSideName = std::string(Out.str());
|
|
|
|
}
|
2019-02-14 10:00:09 +08:00
|
|
|
return DeviceSideName;
|
|
|
|
}
|
|
|
|
|
2015-05-08 03:34:16 +08:00
|
|
|
void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
|
|
|
|
FunctionArgList &Args) {
|
2019-02-14 10:00:09 +08:00
|
|
|
EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
|
2021-02-09 01:29:29 +08:00
|
|
|
if (auto *GV = dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn])) {
|
|
|
|
GV->setLinkage(CGF.CurFn->getLinkage());
|
|
|
|
GV->setInitializer(CGF.CurFn);
|
|
|
|
}
|
2019-02-01 05:34:03 +08:00
|
|
|
if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
|
2019-09-25 03:16:40 +08:00
|
|
|
CudaFeature::CUDA_USES_NEW_LAUNCH) ||
|
2020-07-01 12:00:04 +08:00
|
|
|
(CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
|
2019-02-01 05:34:03 +08:00
|
|
|
emitDeviceStubBodyNew(CGF, Args);
|
|
|
|
else
|
|
|
|
emitDeviceStubBodyLegacy(CGF, Args);
|
2011-10-07 02:51:56 +08:00
|
|
|
}
|
|
|
|
|
2019-02-01 05:34:03 +08:00
|
|
|
// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
|
|
|
|
// array and kernels are launched using cudaLaunchKernel().
|
|
|
|
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
|
|
|
|
FunctionArgList &Args) {
|
|
|
|
// Build the shadow stack entry at the very start of the function.
|
|
|
|
|
|
|
|
// Calculate amount of space we will need for all arguments. If we have no
|
|
|
|
// args, allocate a single pointer so we still have a valid pointer to the
|
|
|
|
// argument array that we can pass to runtime, even if it will be unused.
|
|
|
|
Address KernelArgs = CGF.CreateTempAlloca(
|
|
|
|
VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
|
|
|
|
llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
|
|
|
|
// Store pointers to the arguments in a locally allocated launch_args.
|
|
|
|
for (unsigned i = 0; i < Args.size(); ++i) {
|
|
|
|
llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
|
|
|
|
llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
|
|
|
|
CGF.Builder.CreateDefaultAlignedStore(
|
2021-07-18 00:25:31 +08:00
|
|
|
VoidVarPtr,
|
|
|
|
CGF.Builder.CreateConstGEP1_32(VoidPtrTy, KernelArgs.getPointer(), i));
|
2019-02-01 05:34:03 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
|
|
|
|
|
2019-09-25 03:16:40 +08:00
|
|
|
// Lookup cudaLaunchKernel/hipLaunchKernel function.
|
2019-02-01 05:34:03 +08:00
|
|
|
// cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
|
|
|
|
// void **args, size_t sharedMem,
|
|
|
|
// cudaStream_t stream);
|
2019-09-25 03:16:40 +08:00
|
|
|
// hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
|
|
|
|
// void **args, size_t sharedMem,
|
|
|
|
// hipStream_t stream);
|
2019-02-01 05:34:03 +08:00
|
|
|
TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
|
|
|
|
DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
|
2019-09-25 03:16:40 +08:00
|
|
|
auto LaunchKernelName = addPrefixToName("LaunchKernel");
|
2019-02-01 05:34:03 +08:00
|
|
|
IdentifierInfo &cudaLaunchKernelII =
|
2019-09-25 03:16:40 +08:00
|
|
|
CGM.getContext().Idents.get(LaunchKernelName);
|
2019-02-01 05:34:03 +08:00
|
|
|
FunctionDecl *cudaLaunchKernelFD = nullptr;
|
2021-04-20 00:31:31 +08:00
|
|
|
for (auto *Result : DC->lookup(&cudaLaunchKernelII)) {
|
2019-02-01 05:34:03 +08:00
|
|
|
if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
|
|
|
|
cudaLaunchKernelFD = FD;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (cudaLaunchKernelFD == nullptr) {
|
|
|
|
CGM.Error(CGF.CurFuncDecl->getLocation(),
|
2019-09-25 03:16:40 +08:00
|
|
|
"Can't find declaration for " + LaunchKernelName);
|
2019-02-01 05:34:03 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
// Create temporary dim3 grid_dim, block_dim.
|
|
|
|
ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
|
|
|
|
QualType Dim3Ty = GridDimParam->getType();
|
|
|
|
Address GridDim =
|
|
|
|
CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
|
|
|
|
Address BlockDim =
|
|
|
|
CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
|
|
|
|
Address ShmemSize =
|
|
|
|
CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
|
|
|
|
Address Stream =
|
|
|
|
CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
|
2019-02-06 00:42:33 +08:00
|
|
|
llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
|
2019-02-01 05:34:03 +08:00
|
|
|
llvm::FunctionType::get(IntTy,
|
|
|
|
{/*gridDim=*/GridDim.getType(),
|
|
|
|
/*blockDim=*/BlockDim.getType(),
|
|
|
|
/*ShmemSize=*/ShmemSize.getType(),
|
|
|
|
/*Stream=*/Stream.getType()},
|
|
|
|
/*isVarArg=*/false),
|
2019-09-25 03:16:40 +08:00
|
|
|
addUnderscoredPrefixToName("PopCallConfiguration"));
|
2019-02-01 05:34:03 +08:00
|
|
|
|
|
|
|
CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
|
|
|
|
{GridDim.getPointer(), BlockDim.getPointer(),
|
|
|
|
ShmemSize.getPointer(), Stream.getPointer()});
|
|
|
|
|
|
|
|
// Emit the call to cudaLaunch
|
2021-02-09 01:29:29 +08:00
|
|
|
llvm::Value *Kernel =
|
|
|
|
CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], VoidPtrTy);
|
2019-02-01 05:34:03 +08:00
|
|
|
CallArgList LaunchKernelArgs;
|
|
|
|
LaunchKernelArgs.add(RValue::get(Kernel),
|
|
|
|
cudaLaunchKernelFD->getParamDecl(0)->getType());
|
|
|
|
LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
|
|
|
|
LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
|
|
|
|
LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
|
|
|
|
cudaLaunchKernelFD->getParamDecl(3)->getType());
|
|
|
|
LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
|
|
|
|
cudaLaunchKernelFD->getParamDecl(4)->getType());
|
|
|
|
LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
|
|
|
|
cudaLaunchKernelFD->getParamDecl(5)->getType());
|
|
|
|
|
|
|
|
QualType QT = cudaLaunchKernelFD->getType();
|
|
|
|
QualType CQT = QT.getCanonicalType();
|
2019-02-02 09:48:23 +08:00
|
|
|
llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
|
2019-02-01 05:34:03 +08:00
|
|
|
llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty);
|
|
|
|
|
|
|
|
const CGFunctionInfo &FI =
|
|
|
|
CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
|
2019-02-06 00:42:33 +08:00
|
|
|
llvm::FunctionCallee cudaLaunchKernelFn =
|
2019-09-25 03:16:40 +08:00
|
|
|
CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
|
2019-02-01 05:34:03 +08:00
|
|
|
CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
|
|
|
|
LaunchKernelArgs);
|
|
|
|
CGF.EmitBranch(EndBlock);
|
|
|
|
|
|
|
|
CGF.EmitBlock(EndBlock);
|
|
|
|
}
|
|
|
|
|
|
|
|
void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
|
|
|
|
FunctionArgList &Args) {
|
2016-07-28 06:36:21 +08:00
|
|
|
// Emit a call to cudaSetupArgument for each arg in Args.
|
2019-02-06 00:42:33 +08:00
|
|
|
llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
|
2016-07-28 06:36:21 +08:00
|
|
|
llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
|
|
|
|
CharUnits Offset = CharUnits::Zero();
|
|
|
|
for (const VarDecl *A : Args) {
|
2020-08-24 16:19:29 +08:00
|
|
|
auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
|
|
|
|
Offset = Offset.alignTo(TInfo.Align);
|
2016-07-28 06:36:21 +08:00
|
|
|
llvm::Value *Args[] = {
|
|
|
|
CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
|
|
|
|
VoidPtrTy),
|
2020-08-24 16:19:29 +08:00
|
|
|
llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
|
2016-07-28 06:36:21 +08:00
|
|
|
llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
|
|
|
|
};
|
2019-01-30 10:54:28 +08:00
|
|
|
llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
|
2011-10-07 02:51:56 +08:00
|
|
|
llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
|
2019-01-30 10:54:28 +08:00
|
|
|
llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
|
2016-07-28 06:36:21 +08:00
|
|
|
llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
|
2019-01-30 10:54:28 +08:00
|
|
|
CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
|
2011-10-07 02:51:56 +08:00
|
|
|
CGF.EmitBlock(NextBlock);
|
2020-08-24 16:19:29 +08:00
|
|
|
Offset += TInfo.Width;
|
2011-10-07 02:51:56 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Emit the call to cudaLaunch
|
2019-02-06 00:42:33 +08:00
|
|
|
llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
|
2021-02-09 01:29:29 +08:00
|
|
|
llvm::Value *Arg =
|
|
|
|
CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], CharPtrTy);
|
2013-03-01 03:01:20 +08:00
|
|
|
CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
|
2011-10-07 02:51:56 +08:00
|
|
|
CGF.EmitBranch(EndBlock);
|
|
|
|
|
|
|
|
CGF.EmitBlock(EndBlock);
|
2011-10-07 02:29:37 +08:00
|
|
|
}
|
|
|
|
|
2020-10-15 20:38:46 +08:00
|
|
|
// Replace the original variable Var with the address loaded from variable
|
|
|
|
// ManagedVar populated by HIP runtime.
|
|
|
|
static void replaceManagedVar(llvm::GlobalVariable *Var,
|
|
|
|
llvm::GlobalVariable *ManagedVar) {
|
|
|
|
SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
|
|
|
|
for (auto &&VarUse : Var->uses()) {
|
|
|
|
WorkList.push_back({VarUse.getUser()});
|
|
|
|
}
|
|
|
|
while (!WorkList.empty()) {
|
|
|
|
auto &&WorkItem = WorkList.pop_back_val();
|
|
|
|
auto *U = WorkItem.back();
|
|
|
|
if (isa<llvm::ConstantExpr>(U)) {
|
|
|
|
for (auto &&UU : U->uses()) {
|
|
|
|
WorkItem.push_back(UU.getUser());
|
|
|
|
WorkList.push_back(WorkItem);
|
|
|
|
WorkItem.pop_back();
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (auto *I = dyn_cast<llvm::Instruction>(U)) {
|
|
|
|
llvm::Value *OldV = Var;
|
|
|
|
llvm::Instruction *NewV =
|
|
|
|
new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
|
|
|
|
llvm::Align(Var->getAlignment()), I);
|
|
|
|
WorkItem.pop_back();
|
|
|
|
// Replace constant expressions directly or indirectly using the managed
|
|
|
|
// variable with instructions.
|
|
|
|
for (auto &&Op : WorkItem) {
|
|
|
|
auto *CE = cast<llvm::ConstantExpr>(Op);
|
|
|
|
auto *NewInst = llvm::createReplacementInstr(CE, I);
|
|
|
|
NewInst->replaceUsesOfWith(OldV, NewV);
|
|
|
|
OldV = CE;
|
|
|
|
NewV = NewInst;
|
|
|
|
}
|
|
|
|
I->replaceUsesOfWith(OldV, NewV);
|
|
|
|
} else {
|
|
|
|
llvm_unreachable("Invalid use of managed variable");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-03-03 02:28:50 +08:00
|
|
|
/// Creates a function that sets up state on the host side for CUDA objects that
|
|
|
|
/// have a presence on both the host and device sides. Specifically, registers
|
|
|
|
/// the host side of kernel functions and device global variables with the CUDA
|
|
|
|
/// runtime.
|
2015-05-08 03:34:16 +08:00
|
|
|
/// \code
|
2016-03-03 02:28:50 +08:00
|
|
|
/// void __cuda_register_globals(void** GpuBinaryHandle) {
|
2015-05-08 03:34:16 +08:00
|
|
|
/// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
|
|
|
|
/// ...
|
|
|
|
/// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
|
2016-03-03 02:28:50 +08:00
|
|
|
/// __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
|
|
|
|
/// ...
|
|
|
|
/// __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
|
2015-05-08 03:34:16 +08:00
|
|
|
/// }
|
|
|
|
/// \endcode
|
2016-03-03 02:28:50 +08:00
|
|
|
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
|
2016-03-03 02:28:53 +08:00
|
|
|
// No need to register anything
|
|
|
|
if (EmittedKernels.empty() && DeviceVars.empty())
|
|
|
|
return nullptr;
|
|
|
|
|
2015-05-08 03:34:16 +08:00
|
|
|
llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
|
2018-04-20 21:04:45 +08:00
|
|
|
getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
|
2018-04-25 09:10:37 +08:00
|
|
|
addUnderscoredPrefixToName("_register_globals"), &TheModule);
|
2015-05-08 03:34:16 +08:00
|
|
|
llvm::BasicBlock *EntryBB =
|
|
|
|
llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
|
Compute and preserve alignment more faithfully in IR-generation.
Introduce an Address type to bundle a pointer value with an
alignment. Introduce APIs on CGBuilderTy to work with Address
values. Change core APIs on CGF/CGM to traffic in Address where
appropriate. Require alignments to be non-zero. Update a ton
of code to compute and propagate alignment information.
As part of this, I've promoted CGBuiltin's EmitPointerWithAlignment
helper function to CGF and made use of it in a number of places in
the expression emitter.
The end result is that we should now be significantly more correct
when performing operations on objects that are locally known to
be under-aligned. Since alignment is not reliably tracked in the
type system, there are inherent limits to this, but at least we
are no longer confused by standard operations like derived-to-base
conversions and array-to-pointer decay. I've also fixed a large
number of bugs where we were applying the complete-object alignment
to a pointer instead of the non-virtual alignment, although most of
these were hidden by the very conservative approach we took with
member alignment.
Also, because IRGen now reliably asserts on zero alignments, we
should no longer be subject to an absurd but frustrating recurring
bug where an incomplete type would report a zero alignment and then
we'd naively do a alignmentAtOffset on it and emit code using an
alignment equal to the largest power-of-two factor of the offset.
We should also now be emitting much more aggressive alignment
attributes in the presence of over-alignment. In particular,
field access now uses alignmentAtOffset instead of min.
Several times in this patch, I had to change the existing
code-generation pattern in order to more effectively use
the Address APIs. For the most part, this seems to be a strict
improvement, like doing pointer arithmetic with GEPs instead of
ptrtoint. That said, I've tried very hard to not change semantics,
but it is likely that I've failed in a few places, for which I
apologize.
ABIArgInfo now always carries the assumed alignment of indirect and
indirect byval arguments. In order to cut down on what was already
a dauntingly large patch, I changed the code to never set align
attributes in the IR on non-byval indirect arguments. That is,
we still generate code which assumes that indirect arguments have
the given alignment, but we don't express this information to the
backend except where it's semantically required (i.e. on byvals).
This is likely a minor regression for those targets that did provide
this information, but it'll be trivial to add it back in a later
patch.
I partially punted on applying this work to CGBuiltin. Please
do not add more uses of the CreateDefaultAligned{Load,Store}
APIs; they will be going away eventually.
llvm-svn: 246985
2015-09-08 16:05:57 +08:00
|
|
|
CGBuilderTy Builder(CGM, Context);
|
2015-05-08 03:34:16 +08:00
|
|
|
Builder.SetInsertPoint(EntryBB);
|
|
|
|
|
|
|
|
// void __cudaRegisterFunction(void **, const char *, char *, const char *,
|
|
|
|
// int, uint3*, uint3*, dim3*, dim3*, int*)
|
2016-07-02 20:03:57 +08:00
|
|
|
llvm::Type *RegisterFuncParams[] = {
|
2015-05-08 03:34:16 +08:00
|
|
|
VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
|
|
|
|
VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
|
2019-02-06 00:42:33 +08:00
|
|
|
llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
|
2015-05-08 03:34:16 +08:00
|
|
|
llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
|
2018-04-25 09:10:37 +08:00
|
|
|
addUnderscoredPrefixToName("RegisterFunction"));
|
2015-05-08 03:34:16 +08:00
|
|
|
|
|
|
|
// Extract GpuBinaryHandle passed as the first argument passed to
|
2016-03-03 02:28:50 +08:00
|
|
|
// __cuda_register_globals() and generate __cudaRegisterFunction() call for
|
2015-05-08 03:34:16 +08:00
|
|
|
// each emitted kernel.
|
|
|
|
llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
|
2019-02-14 10:00:09 +08:00
|
|
|
for (auto &&I : EmittedKernels) {
|
2020-03-06 01:59:33 +08:00
|
|
|
llvm::Constant *KernelName =
|
|
|
|
makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
|
2015-05-08 03:34:16 +08:00
|
|
|
llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
|
2016-03-03 02:28:50 +08:00
|
|
|
llvm::Value *Args[] = {
|
2019-02-14 10:00:09 +08:00
|
|
|
&GpuBinaryHandlePtr,
|
2021-02-09 01:29:29 +08:00
|
|
|
Builder.CreateBitCast(KernelHandles[I.Kernel], VoidPtrTy),
|
2019-02-14 10:00:09 +08:00
|
|
|
KernelName,
|
|
|
|
KernelName,
|
|
|
|
llvm::ConstantInt::get(IntTy, -1),
|
|
|
|
NullPtr,
|
|
|
|
NullPtr,
|
|
|
|
NullPtr,
|
|
|
|
NullPtr,
|
2015-05-08 03:34:16 +08:00
|
|
|
llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
|
2016-03-03 02:28:50 +08:00
|
|
|
Builder.CreateCall(RegisterFunc, Args);
|
|
|
|
}
|
|
|
|
|
2020-04-03 22:17:06 +08:00
|
|
|
llvm::Type *VarSizeTy = IntTy;
|
|
|
|
// For HIP or CUDA 9.0+, device variable size is type of `size_t`.
|
|
|
|
if (CGM.getLangOpts().HIP ||
|
|
|
|
ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
|
|
|
|
VarSizeTy = SizeTy;
|
|
|
|
|
2016-03-03 02:28:50 +08:00
|
|
|
// void __cudaRegisterVar(void **, char *, char *, const char *,
|
|
|
|
// int, int, int, int)
|
2016-07-02 20:03:57 +08:00
|
|
|
llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
|
2020-04-03 22:17:06 +08:00
|
|
|
CharPtrTy, IntTy, VarSizeTy,
|
2016-07-02 20:03:57 +08:00
|
|
|
IntTy, IntTy};
|
2019-02-06 00:42:33 +08:00
|
|
|
llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
|
2020-04-03 22:17:06 +08:00
|
|
|
llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
|
2018-04-25 09:10:37 +08:00
|
|
|
addUnderscoredPrefixToName("RegisterVar"));
|
2020-10-15 20:38:46 +08:00
|
|
|
// void __hipRegisterManagedVar(void **, char *, char *, const char *,
|
|
|
|
// size_t, unsigned)
|
|
|
|
llvm::Type *RegisterManagedVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
|
|
|
|
CharPtrTy, VarSizeTy, IntTy};
|
|
|
|
llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
|
|
|
|
llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
|
|
|
|
addUnderscoredPrefixToName("RegisterManagedVar"));
|
2020-03-28 03:47:12 +08:00
|
|
|
// void __cudaRegisterSurface(void **, const struct surfaceReference *,
|
|
|
|
// const void **, const char *, int, int);
|
|
|
|
llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
|
|
|
|
llvm::FunctionType::get(
|
|
|
|
VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy},
|
|
|
|
false),
|
|
|
|
addUnderscoredPrefixToName("RegisterSurface"));
|
|
|
|
// void __cudaRegisterTexture(void **, const struct textureReference *,
|
|
|
|
// const void **, const char *, int, int, int)
|
|
|
|
llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
|
|
|
|
llvm::FunctionType::get(
|
|
|
|
VoidTy,
|
|
|
|
{VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy},
|
|
|
|
false),
|
|
|
|
addUnderscoredPrefixToName("RegisterTexture"));
|
2019-02-14 10:00:09 +08:00
|
|
|
for (auto &&Info : DeviceVars) {
|
|
|
|
llvm::GlobalVariable *Var = Info.Var;
|
2021-02-06 03:40:16 +08:00
|
|
|
assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
|
|
|
|
"External variables should not show up here, except HIP managed "
|
|
|
|
"variables");
|
2019-02-14 10:00:09 +08:00
|
|
|
llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
|
2020-03-28 22:12:58 +08:00
|
|
|
switch (Info.Flags.getKind()) {
|
2020-03-28 03:47:12 +08:00
|
|
|
case DeviceVarFlags::Variable: {
|
|
|
|
uint64_t VarSize =
|
|
|
|
CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
|
2020-10-15 20:38:46 +08:00
|
|
|
if (Info.Flags.isManaged()) {
|
|
|
|
auto ManagedVar = new llvm::GlobalVariable(
|
|
|
|
CGM.getModule(), Var->getType(),
|
|
|
|
/*isConstant=*/false, Var->getLinkage(),
|
2021-02-06 03:40:16 +08:00
|
|
|
/*Init=*/Var->isDeclaration()
|
|
|
|
? nullptr
|
|
|
|
: llvm::ConstantPointerNull::get(Var->getType()),
|
|
|
|
/*Name=*/"", /*InsertBefore=*/nullptr,
|
2020-10-15 20:38:46 +08:00
|
|
|
llvm::GlobalVariable::NotThreadLocal);
|
2021-02-03 07:06:33 +08:00
|
|
|
ManagedVar->setDSOLocal(Var->isDSOLocal());
|
|
|
|
ManagedVar->setVisibility(Var->getVisibility());
|
2021-02-06 03:40:16 +08:00
|
|
|
ManagedVar->setExternallyInitialized(true);
|
|
|
|
ManagedVar->takeName(Var);
|
|
|
|
Var->setName(Twine(ManagedVar->getName() + ".managed"));
|
2020-10-15 20:38:46 +08:00
|
|
|
replaceManagedVar(Var, ManagedVar);
|
|
|
|
llvm::Value *Args[] = {
|
|
|
|
&GpuBinaryHandlePtr,
|
|
|
|
Builder.CreateBitCast(ManagedVar, VoidPtrTy),
|
|
|
|
Builder.CreateBitCast(Var, VoidPtrTy),
|
|
|
|
VarName,
|
|
|
|
llvm::ConstantInt::get(VarSizeTy, VarSize),
|
|
|
|
llvm::ConstantInt::get(IntTy, Var->getAlignment())};
|
2021-02-06 03:40:16 +08:00
|
|
|
if (!Var->isDeclaration())
|
|
|
|
Builder.CreateCall(RegisterManagedVar, Args);
|
2020-10-15 20:38:46 +08:00
|
|
|
} else {
|
|
|
|
llvm::Value *Args[] = {
|
|
|
|
&GpuBinaryHandlePtr,
|
|
|
|
Builder.CreateBitCast(Var, VoidPtrTy),
|
|
|
|
VarName,
|
|
|
|
VarName,
|
|
|
|
llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
|
|
|
|
llvm::ConstantInt::get(VarSizeTy, VarSize),
|
|
|
|
llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
|
|
|
|
llvm::ConstantInt::get(IntTy, 0)};
|
|
|
|
Builder.CreateCall(RegisterVar, Args);
|
|
|
|
}
|
2020-03-28 03:47:12 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
case DeviceVarFlags::Surface:
|
|
|
|
Builder.CreateCall(
|
|
|
|
RegisterSurf,
|
|
|
|
{&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
|
2020-03-28 22:12:58 +08:00
|
|
|
VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
|
|
|
|
llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
|
2020-03-28 03:47:12 +08:00
|
|
|
break;
|
|
|
|
case DeviceVarFlags::Texture:
|
|
|
|
Builder.CreateCall(
|
|
|
|
RegisterTex,
|
|
|
|
{&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
|
2020-03-28 22:12:58 +08:00
|
|
|
VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
|
|
|
|
llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
|
|
|
|
llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
|
2020-03-28 03:47:12 +08:00
|
|
|
break;
|
|
|
|
}
|
2015-05-08 03:34:16 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
Builder.CreateRetVoid();
|
|
|
|
return RegisterKernelsFunc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Creates a global constructor function for the module:
|
2018-07-21 06:45:24 +08:00
|
|
|
///
|
|
|
|
/// For CUDA:
|
2015-05-08 03:34:16 +08:00
|
|
|
/// \code
|
|
|
|
/// void __cuda_module_ctor(void*) {
|
2018-03-01 01:53:46 +08:00
|
|
|
/// Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
|
|
|
|
/// __cuda_register_globals(Handle);
|
2015-05-08 03:34:16 +08:00
|
|
|
/// }
|
|
|
|
/// \endcode
|
2018-07-21 06:45:24 +08:00
|
|
|
///
|
|
|
|
/// For HIP:
|
|
|
|
/// \code
|
|
|
|
/// void __hip_module_ctor(void*) {
|
|
|
|
/// if (__hip_gpubin_handle == 0) {
|
|
|
|
/// __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
|
|
|
|
/// __hip_register_globals(__hip_gpubin_handle);
|
|
|
|
/// }
|
|
|
|
/// }
|
|
|
|
/// \endcode
|
2015-05-08 03:34:16 +08:00
|
|
|
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
|
2018-05-18 23:07:56 +08:00
|
|
|
bool IsHIP = CGM.getLangOpts().HIP;
|
2019-04-03 04:49:41 +08:00
|
|
|
bool IsCUDA = CGM.getLangOpts().CUDA;
|
2018-03-01 01:53:46 +08:00
|
|
|
// No need to generate ctors/dtors if there is no GPU binary.
|
2018-05-18 23:07:56 +08:00
|
|
|
StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
|
|
|
|
if (CudaGpuBinaryFileName.empty() && !IsHIP)
|
2016-03-03 02:28:53 +08:00
|
|
|
return nullptr;
|
2019-04-03 05:54:41 +08:00
|
|
|
if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
|
|
|
|
DeviceVars.empty())
|
2019-04-03 04:10:18 +08:00
|
|
|
return nullptr;
|
2016-03-03 02:28:53 +08:00
|
|
|
|
2018-05-18 23:07:56 +08:00
|
|
|
// void __{cuda|hip}_register_globals(void* handle);
|
2016-03-03 02:28:50 +08:00
|
|
|
llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
|
2018-04-20 21:04:45 +08:00
|
|
|
// We always need a function to pass in as callback. Create a dummy
|
|
|
|
// implementation if we don't need to register anything.
|
|
|
|
if (RelocatableDeviceCode && !RegisterGlobalsFunc)
|
|
|
|
RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());
|
|
|
|
|
2018-05-18 23:07:56 +08:00
|
|
|
// void ** __{cuda|hip}RegisterFatBinary(void *);
|
2019-02-06 00:42:33 +08:00
|
|
|
llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
|
2015-05-08 03:34:16 +08:00
|
|
|
llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
|
2018-04-25 09:10:37 +08:00
|
|
|
addUnderscoredPrefixToName("RegisterFatBinary"));
|
2015-05-08 03:34:16 +08:00
|
|
|
// struct { int magic, int version, void * gpu_binary, void * dont_care };
|
|
|
|
llvm::StructType *FatbinWrapperTy =
|
2017-05-10 03:31:30 +08:00
|
|
|
llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);
|
2015-05-08 03:34:16 +08:00
|
|
|
|
2018-03-01 01:53:46 +08:00
|
|
|
// Register GPU binary with the CUDA runtime, store returned handle in a
|
|
|
|
// global variable and save a reference in GpuBinaryHandle to be cleaned up
|
|
|
|
// in destructor on exit. Then associate all known kernels with the GPU binary
|
|
|
|
// handle so CUDA runtime can figure out what to call on the GPU side.
|
2018-10-03 01:48:54 +08:00
|
|
|
std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
|
|
|
|
if (!CudaGpuBinaryFileName.empty()) {
|
2018-05-18 23:07:56 +08:00
|
|
|
llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
|
|
|
|
llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
|
|
|
|
if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
|
|
|
|
CGM.getDiags().Report(diag::err_cannot_open_file)
|
|
|
|
<< CudaGpuBinaryFileName << EC.message();
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
|
2018-03-01 01:53:46 +08:00
|
|
|
}
|
|
|
|
|
2015-05-08 03:34:16 +08:00
|
|
|
llvm::Function *ModuleCtorFunc = llvm::Function::Create(
|
|
|
|
llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
|
2018-04-25 09:10:37 +08:00
|
|
|
llvm::GlobalValue::InternalLinkage,
|
|
|
|
addUnderscoredPrefixToName("_module_ctor"), &TheModule);
|
2015-05-08 03:34:16 +08:00
|
|
|
llvm::BasicBlock *CtorEntryBB =
|
|
|
|
llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
|
Compute and preserve alignment more faithfully in IR-generation.
Introduce an Address type to bundle a pointer value with an
alignment. Introduce APIs on CGBuilderTy to work with Address
values. Change core APIs on CGF/CGM to traffic in Address where
appropriate. Require alignments to be non-zero. Update a ton
of code to compute and propagate alignment information.
As part of this, I've promoted CGBuiltin's EmitPointerWithAlignment
helper function to CGF and made use of it in a number of places in
the expression emitter.
The end result is that we should now be significantly more correct
when performing operations on objects that are locally known to
be under-aligned. Since alignment is not reliably tracked in the
type system, there are inherent limits to this, but at least we
are no longer confused by standard operations like derived-to-base
conversions and array-to-pointer decay. I've also fixed a large
number of bugs where we were applying the complete-object alignment
to a pointer instead of the non-virtual alignment, although most of
these were hidden by the very conservative approach we took with
member alignment.
Also, because IRGen now reliably asserts on zero alignments, we
should no longer be subject to an absurd but frustrating recurring
bug where an incomplete type would report a zero alignment and then
we'd naively do a alignmentAtOffset on it and emit code using an
alignment equal to the largest power-of-two factor of the offset.
We should also now be emitting much more aggressive alignment
attributes in the presence of over-alignment. In particular,
field access now uses alignmentAtOffset instead of min.
Several times in this patch, I had to change the existing
code-generation pattern in order to more effectively use
the Address APIs. For the most part, this seems to be a strict
improvement, like doing pointer arithmetic with GEPs instead of
ptrtoint. That said, I've tried very hard to not change semantics,
but it is likely that I've failed in a few places, for which I
apologize.
ABIArgInfo now always carries the assumed alignment of indirect and
indirect byval arguments. In order to cut down on what was already
a dauntingly large patch, I changed the code to never set align
attributes in the IR on non-byval indirect arguments. That is,
we still generate code which assumes that indirect arguments have
the given alignment, but we don't express this information to the
backend except where it's semantically required (i.e. on byvals).
This is likely a minor regression for those targets that did provide
this information, but it'll be trivial to add it back in a later
patch.
I partially punted on applying this work to CGBuiltin. Please
do not add more uses of the CreateDefaultAligned{Load,Store}
APIs; they will be going away eventually.
llvm-svn: 246985
2015-09-08 16:05:57 +08:00
|
|
|
CGBuilderTy CtorBuilder(CGM, Context);
|
2015-05-08 03:34:16 +08:00
|
|
|
|
|
|
|
CtorBuilder.SetInsertPoint(CtorEntryBB);
|
|
|
|
|
2018-04-20 21:04:45 +08:00
|
|
|
const char *FatbinConstantName;
|
2018-05-18 23:07:56 +08:00
|
|
|
const char *FatbinSectionName;
|
|
|
|
const char *ModuleIDSectionName;
|
|
|
|
StringRef ModuleIDPrefix;
|
|
|
|
llvm::Constant *FatBinStr;
|
|
|
|
unsigned FatMagic;
|
|
|
|
if (IsHIP) {
|
|
|
|
FatbinConstantName = ".hip_fatbin";
|
|
|
|
FatbinSectionName = ".hipFatBinSegment";
|
|
|
|
|
|
|
|
ModuleIDSectionName = "__hip_module_id";
|
|
|
|
ModuleIDPrefix = "__hip_";
|
|
|
|
|
2018-10-03 01:48:54 +08:00
|
|
|
if (CudaGpuBinary) {
|
|
|
|
// If fatbin is available from early finalization, create a string
|
|
|
|
// literal containing the fat binary loaded from the given file.
|
2020-09-26 00:34:38 +08:00
|
|
|
const unsigned HIPCodeObjectAlign = 4096;
|
|
|
|
FatBinStr =
|
|
|
|
makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
|
|
|
|
FatbinConstantName, HIPCodeObjectAlign);
|
2018-10-03 01:48:54 +08:00
|
|
|
} else {
|
|
|
|
// If fatbin is not available, create an external symbol
|
|
|
|
// __hip_fatbin in section .hip_fatbin. The external symbol is supposed
|
|
|
|
// to contain the fat binary but will be populated somewhere else,
|
|
|
|
// e.g. by lld through link script.
|
|
|
|
FatBinStr = new llvm::GlobalVariable(
|
2018-05-18 23:07:56 +08:00
|
|
|
CGM.getModule(), CGM.Int8Ty,
|
|
|
|
/*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
|
|
|
|
"__hip_fatbin", nullptr,
|
|
|
|
llvm::GlobalVariable::NotThreadLocal);
|
2018-10-03 01:48:54 +08:00
|
|
|
cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
|
|
|
|
}
|
2018-05-18 23:07:56 +08:00
|
|
|
|
|
|
|
FatMagic = HIPFatMagic;
|
|
|
|
} else {
|
|
|
|
if (RelocatableDeviceCode)
|
2018-06-29 01:15:52 +08:00
|
|
|
FatbinConstantName = CGM.getTriple().isMacOSX()
|
|
|
|
? "__NV_CUDA,__nv_relfatbin"
|
|
|
|
: "__nv_relfatbin";
|
2018-05-18 23:07:56 +08:00
|
|
|
else
|
|
|
|
FatbinConstantName =
|
|
|
|
CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
|
|
|
|
// NVIDIA's cuobjdump looks for fatbins in this section.
|
|
|
|
FatbinSectionName =
|
|
|
|
CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
|
|
|
|
|
2018-06-29 01:15:52 +08:00
|
|
|
ModuleIDSectionName = CGM.getTriple().isMacOSX()
|
|
|
|
? "__NV_CUDA,__nv_module_id"
|
|
|
|
: "__nv_module_id";
|
2018-05-18 23:07:56 +08:00
|
|
|
ModuleIDPrefix = "__nv_";
|
|
|
|
|
|
|
|
// For CUDA, create a string literal containing the fat binary loaded from
|
|
|
|
// the given file.
|
2020-01-29 03:23:46 +08:00
|
|
|
FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
|
2018-05-18 23:07:56 +08:00
|
|
|
FatbinConstantName, 8);
|
|
|
|
FatMagic = CudaFatMagic;
|
|
|
|
}
|
2018-03-01 01:53:46 +08:00
|
|
|
|
|
|
|
// Create initialized wrapper structure that points to the loaded GPU binary
|
|
|
|
ConstantInitBuilder Builder(CGM);
|
|
|
|
auto Values = Builder.beginStruct(FatbinWrapperTy);
|
|
|
|
// Fatbin wrapper magic.
|
2018-05-18 23:07:56 +08:00
|
|
|
Values.addInt(IntTy, FatMagic);
|
2018-03-01 01:53:46 +08:00
|
|
|
// Fatbin version.
|
|
|
|
Values.addInt(IntTy, 1);
|
|
|
|
// Data.
|
2018-05-18 23:07:56 +08:00
|
|
|
Values.add(FatBinStr);
|
2018-03-01 01:53:46 +08:00
|
|
|
// Unused in fatbin v1.
|
|
|
|
Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
|
|
|
|
llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
|
2018-04-25 09:10:37 +08:00
|
|
|
addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
|
2018-03-01 01:53:46 +08:00
|
|
|
/*constant*/ true);
|
|
|
|
FatbinWrapper->setSection(FatbinSectionName);
|
|
|
|
|
2018-07-21 06:45:24 +08:00
|
|
|
// There is only one HIP fat binary per linked module, however there are
|
|
|
|
// multiple constructor functions. Make sure the fat binary is registered
|
|
|
|
// only once. The constructor functions are executed by the dynamic loader
|
|
|
|
// before the program gains control. The dynamic loader cannot execute the
|
|
|
|
// constructor functions concurrently since doing that would not guarantee
|
|
|
|
// thread safety of the loaded program. Therefore we can assume sequential
|
|
|
|
// execution of constructor functions here.
|
|
|
|
if (IsHIP) {
|
2018-10-03 01:48:54 +08:00
|
|
|
auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage :
|
|
|
|
llvm::GlobalValue::LinkOnceAnyLinkage;
|
2018-07-21 06:45:24 +08:00
|
|
|
llvm::BasicBlock *IfBlock =
|
|
|
|
llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
|
|
|
|
llvm::BasicBlock *ExitBlock =
|
|
|
|
llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
|
|
|
|
// The name, size, and initialization pattern of this variable is part
|
|
|
|
// of HIP ABI.
|
|
|
|
GpuBinaryHandle = new llvm::GlobalVariable(
|
|
|
|
TheModule, VoidPtrPtrTy, /*isConstant=*/false,
|
2018-10-03 01:48:54 +08:00
|
|
|
Linkage,
|
2018-07-21 06:45:24 +08:00
|
|
|
/*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
|
|
|
|
"__hip_gpubin_handle");
|
2019-10-03 21:00:29 +08:00
|
|
|
GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
|
2018-08-18 01:47:31 +08:00
|
|
|
// Prevent the weak symbol in different shared libraries being merged.
|
2018-10-03 01:48:54 +08:00
|
|
|
if (Linkage != llvm::GlobalValue::InternalLinkage)
|
|
|
|
GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
|
2018-07-21 06:45:24 +08:00
|
|
|
Address GpuBinaryAddr(
|
|
|
|
GpuBinaryHandle,
|
|
|
|
CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
|
|
|
|
{
|
|
|
|
auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
|
|
|
|
llvm::Constant *Zero =
|
|
|
|
llvm::Constant::getNullValue(HandleValue->getType());
|
|
|
|
llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
|
|
|
|
CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
|
|
|
|
}
|
|
|
|
{
|
|
|
|
CtorBuilder.SetInsertPoint(IfBlock);
|
|
|
|
// GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
|
|
|
|
llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
|
|
|
|
RegisterFatbinFunc,
|
|
|
|
CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
|
|
|
|
CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
|
|
|
|
CtorBuilder.CreateBr(ExitBlock);
|
|
|
|
}
|
|
|
|
{
|
|
|
|
CtorBuilder.SetInsertPoint(ExitBlock);
|
|
|
|
// Call __hip_register_globals(GpuBinaryHandle);
|
|
|
|
if (RegisterGlobalsFunc) {
|
|
|
|
auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
|
|
|
|
CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if (!RelocatableDeviceCode) {
|
|
|
|
// Register binary with CUDA runtime. This is substantially different in
|
|
|
|
// default mode vs. separate compilation!
|
|
|
|
// GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
|
2018-04-20 21:04:45 +08:00
|
|
|
llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
|
|
|
|
RegisterFatbinFunc,
|
|
|
|
CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
|
|
|
|
GpuBinaryHandle = new llvm::GlobalVariable(
|
|
|
|
TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
|
2018-07-21 06:45:24 +08:00
|
|
|
llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
|
2019-10-03 21:00:29 +08:00
|
|
|
GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
|
2018-04-20 21:04:45 +08:00
|
|
|
CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
|
|
|
|
CGM.getPointerAlign());
|
|
|
|
|
2018-07-21 06:45:24 +08:00
|
|
|
// Call __cuda_register_globals(GpuBinaryHandle);
|
2018-04-20 21:04:45 +08:00
|
|
|
if (RegisterGlobalsFunc)
|
|
|
|
CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
|
2019-02-06 06:38:58 +08:00
|
|
|
|
|
|
|
// Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
|
|
|
|
if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
|
|
|
|
CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
|
|
|
|
// void __cudaRegisterFatBinaryEnd(void **);
|
|
|
|
llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
|
|
|
|
llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
|
|
|
|
"__cudaRegisterFatBinaryEnd");
|
|
|
|
CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
|
|
|
|
}
|
2018-04-20 21:04:45 +08:00
|
|
|
} else {
|
|
|
|
// Generate a unique module ID.
|
2018-05-18 23:07:56 +08:00
|
|
|
SmallString<64> ModuleID;
|
|
|
|
llvm::raw_svector_ostream OS(ModuleID);
|
2018-10-06 02:39:58 +08:00
|
|
|
OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
|
2020-01-29 03:23:46 +08:00
|
|
|
llvm::Constant *ModuleIDConstant = makeConstantString(
|
|
|
|
std::string(ModuleID.str()), "", ModuleIDSectionName, 32);
|
2018-05-18 23:07:56 +08:00
|
|
|
|
2018-07-21 06:45:24 +08:00
|
|
|
// Create an alias for the FatbinWrapper that nvcc will look for.
|
2018-04-20 21:04:45 +08:00
|
|
|
llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
|
2018-05-18 23:07:56 +08:00
|
|
|
Twine("__fatbinwrap") + ModuleID, FatbinWrapper);
|
2018-04-20 21:04:45 +08:00
|
|
|
|
2018-07-21 06:45:24 +08:00
|
|
|
// void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
|
2018-04-20 21:04:45 +08:00
|
|
|
// void *, void (*)(void **))
|
2018-07-21 06:45:24 +08:00
|
|
|
SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
|
2018-05-18 23:07:56 +08:00
|
|
|
RegisterLinkedBinaryName += ModuleID;
|
2019-02-06 00:42:33 +08:00
|
|
|
llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
|
2018-04-20 21:04:45 +08:00
|
|
|
getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
|
|
|
|
|
|
|
|
assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
|
|
|
|
llvm::Value *Args[] = {RegisterGlobalsFunc,
|
|
|
|
CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
|
2018-05-18 23:07:56 +08:00
|
|
|
ModuleIDConstant,
|
2018-04-20 21:04:45 +08:00
|
|
|
makeDummyFunction(getCallbackFnTy())};
|
|
|
|
CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
|
|
|
|
}
|
2015-05-08 03:34:16 +08:00
|
|
|
|
2018-06-28 02:32:51 +08:00
|
|
|
// Create destructor and register it with atexit() the way NVCC does it. Doing
|
|
|
|
// it during regular destructor phase worked in CUDA before 9.2 but results in
|
|
|
|
// double-free in 9.2.
|
|
|
|
if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
|
|
|
|
// extern "C" int atexit(void (*f)(void));
|
|
|
|
llvm::FunctionType *AtExitTy =
|
|
|
|
llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
|
2019-02-06 00:42:33 +08:00
|
|
|
llvm::FunctionCallee AtExitFunc =
|
2018-06-28 02:32:51 +08:00
|
|
|
CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
|
|
|
|
/*Local=*/true);
|
|
|
|
CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
|
|
|
|
}
|
|
|
|
|
2015-05-08 03:34:16 +08:00
|
|
|
CtorBuilder.CreateRetVoid();
|
|
|
|
return ModuleCtorFunc;
|
|
|
|
}
|
|
|
|
|
2018-03-01 01:53:46 +08:00
|
|
|
/// Creates a global destructor function that unregisters the GPU code blob
|
2015-05-08 03:34:16 +08:00
|
|
|
/// registered by constructor.
|
2018-07-21 06:45:24 +08:00
|
|
|
///
|
|
|
|
/// For CUDA:
|
2015-05-08 03:34:16 +08:00
|
|
|
/// \code
|
|
|
|
/// void __cuda_module_dtor(void*) {
|
2018-03-01 01:53:46 +08:00
|
|
|
/// __cudaUnregisterFatBinary(Handle);
|
2015-05-08 03:34:16 +08:00
|
|
|
/// }
|
|
|
|
/// \endcode
|
2018-07-21 06:45:24 +08:00
|
|
|
///
|
|
|
|
/// For HIP:
|
|
|
|
/// \code
|
|
|
|
/// void __hip_module_dtor(void*) {
|
|
|
|
/// if (__hip_gpubin_handle) {
|
|
|
|
/// __hipUnregisterFatBinary(__hip_gpubin_handle);
|
|
|
|
/// __hip_gpubin_handle = 0;
|
|
|
|
/// }
|
|
|
|
/// }
|
|
|
|
/// \endcode
|
2015-05-08 03:34:16 +08:00
|
|
|
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
|
2018-03-01 01:53:46 +08:00
|
|
|
// No need for destructor if we don't have a handle to unregister.
|
|
|
|
if (!GpuBinaryHandle)
|
2016-03-03 02:28:53 +08:00
|
|
|
return nullptr;
|
|
|
|
|
2015-05-08 03:34:16 +08:00
|
|
|
// void __cudaUnregisterFatBinary(void ** handle);
|
2019-02-06 00:42:33 +08:00
|
|
|
llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
|
2015-05-08 03:34:16 +08:00
|
|
|
llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
|
2018-04-25 09:10:37 +08:00
|
|
|
addUnderscoredPrefixToName("UnregisterFatBinary"));
|
2015-05-08 03:34:16 +08:00
|
|
|
|
|
|
|
llvm::Function *ModuleDtorFunc = llvm::Function::Create(
|
|
|
|
llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
|
2018-04-25 09:10:37 +08:00
|
|
|
llvm::GlobalValue::InternalLinkage,
|
|
|
|
addUnderscoredPrefixToName("_module_dtor"), &TheModule);
|
|
|
|
|
2015-05-08 03:34:16 +08:00
|
|
|
llvm::BasicBlock *DtorEntryBB =
|
|
|
|
llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
|
Compute and preserve alignment more faithfully in IR-generation.
Introduce an Address type to bundle a pointer value with an
alignment. Introduce APIs on CGBuilderTy to work with Address
values. Change core APIs on CGF/CGM to traffic in Address where
appropriate. Require alignments to be non-zero. Update a ton
of code to compute and propagate alignment information.
As part of this, I've promoted CGBuiltin's EmitPointerWithAlignment
helper function to CGF and made use of it in a number of places in
the expression emitter.
The end result is that we should now be significantly more correct
when performing operations on objects that are locally known to
be under-aligned. Since alignment is not reliably tracked in the
type system, there are inherent limits to this, but at least we
are no longer confused by standard operations like derived-to-base
conversions and array-to-pointer decay. I've also fixed a large
number of bugs where we were applying the complete-object alignment
to a pointer instead of the non-virtual alignment, although most of
these were hidden by the very conservative approach we took with
member alignment.
Also, because IRGen now reliably asserts on zero alignments, we
should no longer be subject to an absurd but frustrating recurring
bug where an incomplete type would report a zero alignment and then
we'd naively do a alignmentAtOffset on it and emit code using an
alignment equal to the largest power-of-two factor of the offset.
We should also now be emitting much more aggressive alignment
attributes in the presence of over-alignment. In particular,
field access now uses alignmentAtOffset instead of min.
Several times in this patch, I had to change the existing
code-generation pattern in order to more effectively use
the Address APIs. For the most part, this seems to be a strict
improvement, like doing pointer arithmetic with GEPs instead of
ptrtoint. That said, I've tried very hard to not change semantics,
but it is likely that I've failed in a few places, for which I
apologize.
ABIArgInfo now always carries the assumed alignment of indirect and
indirect byval arguments. In order to cut down on what was already
a dauntingly large patch, I changed the code to never set align
attributes in the IR on non-byval indirect arguments. That is,
we still generate code which assumes that indirect arguments have
the given alignment, but we don't express this information to the
backend except where it's semantically required (i.e. on byvals).
This is likely a minor regression for those targets that did provide
this information, but it'll be trivial to add it back in a later
patch.
I partially punted on applying this work to CGBuiltin. Please
do not add more uses of the CreateDefaultAligned{Load,Store}
APIs; they will be going away eventually.
llvm-svn: 246985
2015-09-08 16:05:57 +08:00
|
|
|
CGBuilderTy DtorBuilder(CGM, Context);
|
2015-05-08 03:34:16 +08:00
|
|
|
DtorBuilder.SetInsertPoint(DtorEntryBB);
|
|
|
|
|
2018-07-21 06:45:24 +08:00
|
|
|
Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity(
|
|
|
|
GpuBinaryHandle->getAlignment()));
|
|
|
|
auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
|
|
|
|
// There is only one HIP fat binary per linked module, however there are
|
|
|
|
// multiple destructor functions. Make sure the fat binary is unregistered
|
|
|
|
// only once.
|
|
|
|
if (CGM.getLangOpts().HIP) {
|
|
|
|
llvm::BasicBlock *IfBlock =
|
|
|
|
llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
|
|
|
|
llvm::BasicBlock *ExitBlock =
|
|
|
|
llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
|
|
|
|
llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
|
|
|
|
llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
|
|
|
|
DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);
|
|
|
|
|
|
|
|
DtorBuilder.SetInsertPoint(IfBlock);
|
|
|
|
DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
|
|
|
|
DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
|
|
|
|
DtorBuilder.CreateBr(ExitBlock);
|
|
|
|
|
|
|
|
DtorBuilder.SetInsertPoint(ExitBlock);
|
|
|
|
} else {
|
|
|
|
DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
|
|
|
|
}
|
2015-05-08 03:34:16 +08:00
|
|
|
DtorBuilder.CreateRetVoid();
|
|
|
|
return ModuleDtorFunc;
|
|
|
|
}
|
|
|
|
|
2011-10-07 02:29:37 +08:00
|
|
|
CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
|
|
|
|
return new CGNVCUDARuntime(CGM);
|
|
|
|
}
|
2021-01-27 06:11:01 +08:00
|
|
|
|
|
|
|
void CGNVCUDARuntime::internalizeDeviceSideVar(
|
|
|
|
const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
|
2021-02-03 07:06:33 +08:00
|
|
|
// For -fno-gpu-rdc, host-side shadows of external declarations of device-side
|
|
|
|
// global variables become internal definitions. These have to be internal in
|
|
|
|
// order to prevent name conflicts with global host variables with the same
|
|
|
|
// name in a different TUs.
|
2021-01-27 06:11:01 +08:00
|
|
|
//
|
2021-02-03 07:06:33 +08:00
|
|
|
// For -fgpu-rdc, the shadow variables should not be internalized because
|
|
|
|
// they may be accessed by different TU.
|
|
|
|
if (CGM.getLangOpts().GPURelocatableDeviceCode)
|
|
|
|
return;
|
|
|
|
|
2021-01-27 06:11:01 +08:00
|
|
|
// __shared__ variables are odd. Shadows do get created, but
|
|
|
|
// they are not registered with the CUDA runtime, so they
|
|
|
|
// can't really be used to access their device-side
|
|
|
|
// counterparts. It's not clear yet whether it's nvcc's bug or
|
|
|
|
// a feature, but we've got to do the same for compatibility.
|
|
|
|
if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
|
|
|
|
D->hasAttr<CUDASharedAttr>() ||
|
|
|
|
D->getType()->isCUDADeviceBuiltinSurfaceType() ||
|
|
|
|
D->getType()->isCUDADeviceBuiltinTextureType()) {
|
|
|
|
Linkage = llvm::GlobalValue::InternalLinkage;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
|
|
|
|
llvm::GlobalVariable &GV) {
|
|
|
|
if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
|
|
|
|
// Shadow variables and their properties must be registered with CUDA
|
|
|
|
// runtime. Skip Extern global variables, which will be registered in
|
|
|
|
// the TU where they are defined.
|
|
|
|
//
|
|
|
|
// Don't register a C++17 inline variable. The local symbol can be
|
|
|
|
// discarded and referencing a discarded local symbol from outside the
|
|
|
|
// comdat (__cuda_register_globals) is disallowed by the ELF spec.
|
2021-05-11 22:09:38 +08:00
|
|
|
//
|
2021-02-06 03:40:16 +08:00
|
|
|
// HIP managed variables need to be always recorded in device and host
|
|
|
|
// compilations for transformation.
|
2021-05-11 22:09:38 +08:00
|
|
|
//
|
|
|
|
// HIP managed variables and variables in CUDADeviceVarODRUsedByHost are
|
|
|
|
// added to llvm.compiler-used, therefore they are safe to be registered.
|
2021-02-06 03:40:16 +08:00
|
|
|
if ((!D->hasExternalStorage() && !D->isInline()) ||
|
2021-05-11 22:09:38 +08:00
|
|
|
CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) ||
|
2021-02-06 03:40:16 +08:00
|
|
|
D->hasAttr<HIPManagedAttr>()) {
|
2021-01-27 06:11:01 +08:00
|
|
|
registerDeviceVar(D, GV, !D->hasDefinition(),
|
|
|
|
D->hasAttr<CUDAConstantAttr>());
|
2021-02-06 03:40:16 +08:00
|
|
|
}
|
2021-01-27 06:11:01 +08:00
|
|
|
} else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
|
|
|
|
D->getType()->isCUDADeviceBuiltinTextureType()) {
|
|
|
|
// Builtin surfaces and textures and their template arguments are
|
|
|
|
// also registered with CUDA runtime.
|
2021-04-06 19:04:01 +08:00
|
|
|
const auto *TD = cast<ClassTemplateSpecializationDecl>(
|
|
|
|
D->getType()->castAs<RecordType>()->getDecl());
|
2021-01-27 06:11:01 +08:00
|
|
|
const TemplateArgumentList &Args = TD->getTemplateArgs();
|
|
|
|
if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
|
|
|
|
assert(Args.size() == 2 &&
|
|
|
|
"Unexpected number of template arguments of CUDA device "
|
|
|
|
"builtin surface type.");
|
|
|
|
auto SurfType = Args[1].getAsIntegral();
|
|
|
|
if (!D->hasExternalStorage())
|
|
|
|
registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue());
|
|
|
|
} else {
|
|
|
|
assert(Args.size() == 3 &&
|
|
|
|
"Unexpected number of template arguments of CUDA device "
|
|
|
|
"builtin texture type.");
|
|
|
|
auto TexType = Args[1].getAsIntegral();
|
|
|
|
auto Normalized = Args[2].getAsIntegral();
|
|
|
|
if (!D->hasExternalStorage())
|
|
|
|
registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(),
|
|
|
|
Normalized.getZExtValue());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2021-02-06 03:40:16 +08:00
|
|
|
|
|
|
|
// Transform managed variables to pointers to managed variables in device code.
|
|
|
|
// Each use of the original managed variable is replaced by a load from the
|
|
|
|
// transformed managed variable. The transformed managed variable contains
|
|
|
|
// the address of managed memory which will be allocated by the runtime.
|
|
|
|
void CGNVCUDARuntime::transformManagedVars() {
|
|
|
|
for (auto &&Info : DeviceVars) {
|
|
|
|
llvm::GlobalVariable *Var = Info.Var;
|
|
|
|
if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
|
|
|
|
Info.Flags.isManaged()) {
|
|
|
|
auto ManagedVar = new llvm::GlobalVariable(
|
|
|
|
CGM.getModule(), Var->getType(),
|
|
|
|
/*isConstant=*/false, Var->getLinkage(),
|
|
|
|
/*Init=*/Var->isDeclaration()
|
|
|
|
? nullptr
|
|
|
|
: llvm::ConstantPointerNull::get(Var->getType()),
|
|
|
|
/*Name=*/"", /*InsertBefore=*/nullptr,
|
|
|
|
llvm::GlobalVariable::NotThreadLocal,
|
|
|
|
CGM.getContext().getTargetAddressSpace(LangAS::cuda_device));
|
|
|
|
ManagedVar->setDSOLocal(Var->isDSOLocal());
|
|
|
|
ManagedVar->setVisibility(Var->getVisibility());
|
|
|
|
ManagedVar->setExternallyInitialized(true);
|
|
|
|
replaceManagedVar(Var, ManagedVar);
|
|
|
|
ManagedVar->takeName(Var);
|
|
|
|
Var->setName(Twine(ManagedVar->getName()) + ".managed");
|
|
|
|
// Keep managed variables even if they are not used in device code since
|
|
|
|
// they need to be allocated by the runtime.
|
|
|
|
if (!Var->isDeclaration()) {
|
|
|
|
assert(!ManagedVar->isDeclaration());
|
|
|
|
CGM.addCompilerUsedGlobal(Var);
|
|
|
|
CGM.addCompilerUsedGlobal(ManagedVar);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns module constructor to be added.
|
|
|
|
llvm::Function *CGNVCUDARuntime::finalizeModule() {
|
|
|
|
if (CGM.getLangOpts().CUDAIsDevice) {
|
|
|
|
transformManagedVars();
|
2021-03-18 04:14:03 +08:00
|
|
|
|
|
|
|
// Mark ODR-used device variables as compiler used to prevent it from being
|
|
|
|
// eliminated by optimization. This is necessary for device variables
|
|
|
|
// ODR-used by host functions. Sema correctly marks them as ODR-used no
|
|
|
|
// matter whether they are ODR-used by device or host functions.
|
|
|
|
//
|
|
|
|
// We do not need to do this if the variable has used attribute since it
|
|
|
|
// has already been added.
|
|
|
|
//
|
|
|
|
// Static device variables have been externalized at this point, therefore
|
|
|
|
// variables with LLVM private or internal linkage need not be added.
|
|
|
|
for (auto &&Info : DeviceVars) {
|
|
|
|
auto Kind = Info.Flags.getKind();
|
|
|
|
if (!Info.Var->isDeclaration() &&
|
|
|
|
!llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
|
|
|
|
(Kind == DeviceVarFlags::Variable ||
|
|
|
|
Kind == DeviceVarFlags::Surface ||
|
|
|
|
Kind == DeviceVarFlags::Texture) &&
|
|
|
|
Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
|
|
|
|
CGM.addCompilerUsedGlobal(Info.Var);
|
|
|
|
}
|
|
|
|
}
|
2021-02-06 03:40:16 +08:00
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
return makeModuleCtorFunction();
|
|
|
|
}
|
2021-02-09 01:29:29 +08:00
|
|
|
|
|
|
|
llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
|
|
|
|
GlobalDecl GD) {
|
|
|
|
auto Loc = KernelHandles.find(F);
|
|
|
|
if (Loc != KernelHandles.end())
|
|
|
|
return Loc->second;
|
|
|
|
|
|
|
|
if (!CGM.getLangOpts().HIP) {
|
|
|
|
KernelHandles[F] = F;
|
|
|
|
KernelStubs[F] = F;
|
|
|
|
return F;
|
|
|
|
}
|
|
|
|
|
|
|
|
auto *Var = new llvm::GlobalVariable(
|
|
|
|
TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(),
|
|
|
|
/*Initializer=*/nullptr,
|
|
|
|
CGM.getMangledName(
|
|
|
|
GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel)));
|
|
|
|
Var->setAlignment(CGM.getPointerAlign().getAsAlign());
|
|
|
|
Var->setDSOLocal(F->isDSOLocal());
|
|
|
|
Var->setVisibility(F->getVisibility());
|
|
|
|
KernelHandles[F] = Var;
|
|
|
|
KernelStubs[Var] = F;
|
|
|
|
return Var;
|
|
|
|
}
|