2011-10-07 02:29:37 +08:00
|
|
|
//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
|
|
|
|
//
|
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
// License. See LICENSE.TXT for details.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
|
|
|
|
// runtime library.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
#include "CGCUDARuntime.h"
|
2011-10-07 02:51:56 +08:00
|
|
|
#include "CodeGenFunction.h"
|
|
|
|
#include "CodeGenModule.h"
|
|
|
|
#include "clang/AST/Decl.h"
|
2013-01-02 19:45:17 +08:00
|
|
|
#include "llvm/IR/BasicBlock.h"
|
2014-03-04 19:02:08 +08:00
|
|
|
#include "llvm/IR/CallSite.h"
|
2013-01-02 19:45:17 +08:00
|
|
|
#include "llvm/IR/Constants.h"
|
|
|
|
#include "llvm/IR/DerivedTypes.h"
|
2011-10-07 02:29:37 +08:00
|
|
|
|
|
|
|
using namespace clang;
|
|
|
|
using namespace CodeGen;
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
class CGNVCUDARuntime : public CGCUDARuntime {
|
2011-10-07 02:51:56 +08:00
|
|
|
|
|
|
|
private:
|
2015-05-08 03:34:16 +08:00
|
|
|
llvm::Type *IntTy, *SizeTy, *VoidTy;
|
|
|
|
llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;
|
|
|
|
|
|
|
|
/// Convenience reference to LLVM Context
|
|
|
|
llvm::LLVMContext &Context;
|
|
|
|
/// Convenience reference to the current module
|
|
|
|
llvm::Module &TheModule;
|
|
|
|
/// Keeps track of kernel launch stubs emitted in this module
|
|
|
|
llvm::SmallVector<llvm::Function *, 16> EmittedKernels;
|
2016-03-03 02:28:50 +08:00
|
|
|
llvm::SmallVector<std::pair<llvm::GlobalVariable *, unsigned>, 16> DeviceVars;
|
2015-05-08 03:34:16 +08:00
|
|
|
/// Keeps track of variables containing handles of GPU binaries. Populated by
|
|
|
|
/// ModuleCtorFunction() and used to create corresponding cleanup calls in
|
|
|
|
/// ModuleDtorFunction()
|
|
|
|
llvm::SmallVector<llvm::GlobalVariable *, 16> GpuBinaryHandles;
|
2011-10-07 02:51:56 +08:00
|
|
|
|
|
|
|
llvm::Constant *getSetupArgumentFn() const;
|
|
|
|
llvm::Constant *getLaunchFn() const;
|
|
|
|
|
2015-05-08 03:34:16 +08:00
|
|
|
/// Creates a function to register all kernel stubs generated in this module.
|
2016-03-03 02:28:50 +08:00
|
|
|
llvm::Function *makeRegisterGlobalsFn();
|
2015-05-08 03:34:16 +08:00
|
|
|
|
|
|
|
/// Helper function that generates a constant string and returns a pointer to
|
|
|
|
/// the start of the string. The result of this function can be used anywhere
|
|
|
|
/// where the C code specifies const char*.
|
|
|
|
llvm::Constant *makeConstantString(const std::string &Str,
|
|
|
|
const std::string &Name = "",
|
2016-08-13 02:44:01 +08:00
|
|
|
const std::string &SectionName = "",
|
2015-05-08 03:34:16 +08:00
|
|
|
unsigned Alignment = 0) {
|
|
|
|
llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
|
|
|
|
llvm::ConstantInt::get(SizeTy, 0)};
|
Compute and preserve alignment more faithfully in IR-generation.
Introduce an Address type to bundle a pointer value with an
alignment. Introduce APIs on CGBuilderTy to work with Address
values. Change core APIs on CGF/CGM to traffic in Address where
appropriate. Require alignments to be non-zero. Update a ton
of code to compute and propagate alignment information.
As part of this, I've promoted CGBuiltin's EmitPointerWithAlignment
helper function to CGF and made use of it in a number of places in
the expression emitter.
The end result is that we should now be significantly more correct
when performing operations on objects that are locally known to
be under-aligned. Since alignment is not reliably tracked in the
type system, there are inherent limits to this, but at least we
are no longer confused by standard operations like derived-to-base
conversions and array-to-pointer decay. I've also fixed a large
number of bugs where we were applying the complete-object alignment
to a pointer instead of the non-virtual alignment, although most of
these were hidden by the very conservative approach we took with
member alignment.
Also, because IRGen now reliably asserts on zero alignments, we
should no longer be subject to an absurd but frustrating recurring
bug where an incomplete type would report a zero alignment and then
we'd naively do a alignmentAtOffset on it and emit code using an
alignment equal to the largest power-of-two factor of the offset.
We should also now be emitting much more aggressive alignment
attributes in the presence of over-alignment. In particular,
field access now uses alignmentAtOffset instead of min.
Several times in this patch, I had to change the existing
code-generation pattern in order to more effectively use
the Address APIs. For the most part, this seems to be a strict
improvement, like doing pointer arithmetic with GEPs instead of
ptrtoint. That said, I've tried very hard to not change semantics,
but it is likely that I've failed in a few places, for which I
apologize.
ABIArgInfo now always carries the assumed alignment of indirect and
indirect byval arguments. In order to cut down on what was already
a dauntingly large patch, I changed the code to never set align
attributes in the IR on non-byval indirect arguments. That is,
we still generate code which assumes that indirect arguments have
the given alignment, but we don't express this information to the
backend except where it's semantically required (i.e. on byvals).
This is likely a minor regression for those targets that did provide
this information, but it'll be trivial to add it back in a later
patch.
I partially punted on applying this work to CGBuiltin. Please
do not add more uses of the CreateDefaultAligned{Load,Store}
APIs; they will be going away eventually.
llvm-svn: 246985
2015-09-08 16:05:57 +08:00
|
|
|
auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
|
2016-08-13 02:44:01 +08:00
|
|
|
llvm::GlobalVariable *GV =
|
|
|
|
cast<llvm::GlobalVariable>(ConstStr.getPointer());
|
|
|
|
if (!SectionName.empty())
|
|
|
|
GV->setSection(SectionName);
|
|
|
|
if (Alignment)
|
|
|
|
GV->setAlignment(Alignment);
|
|
|
|
|
Compute and preserve alignment more faithfully in IR-generation.
Introduce an Address type to bundle a pointer value with an
alignment. Introduce APIs on CGBuilderTy to work with Address
values. Change core APIs on CGF/CGM to traffic in Address where
appropriate. Require alignments to be non-zero. Update a ton
of code to compute and propagate alignment information.
As part of this, I've promoted CGBuiltin's EmitPointerWithAlignment
helper function to CGF and made use of it in a number of places in
the expression emitter.
The end result is that we should now be significantly more correct
when performing operations on objects that are locally known to
be under-aligned. Since alignment is not reliably tracked in the
type system, there are inherent limits to this, but at least we
are no longer confused by standard operations like derived-to-base
conversions and array-to-pointer decay. I've also fixed a large
number of bugs where we were applying the complete-object alignment
to a pointer instead of the non-virtual alignment, although most of
these were hidden by the very conservative approach we took with
member alignment.
Also, because IRGen now reliably asserts on zero alignments, we
should no longer be subject to an absurd but frustrating recurring
bug where an incomplete type would report a zero alignment and then
we'd naively do a alignmentAtOffset on it and emit code using an
alignment equal to the largest power-of-two factor of the offset.
We should also now be emitting much more aggressive alignment
attributes in the presence of over-alignment. In particular,
field access now uses alignmentAtOffset instead of min.
Several times in this patch, I had to change the existing
code-generation pattern in order to more effectively use
the Address APIs. For the most part, this seems to be a strict
improvement, like doing pointer arithmetic with GEPs instead of
ptrtoint. That said, I've tried very hard to not change semantics,
but it is likely that I've failed in a few places, for which I
apologize.
ABIArgInfo now always carries the assumed alignment of indirect and
indirect byval arguments. In order to cut down on what was already
a dauntingly large patch, I changed the code to never set align
attributes in the IR on non-byval indirect arguments. That is,
we still generate code which assumes that indirect arguments have
the given alignment, but we don't express this information to the
backend except where it's semantically required (i.e. on byvals).
This is likely a minor regression for those targets that did provide
this information, but it'll be trivial to add it back in a later
patch.
I partially punted on applying this work to CGBuiltin. Please
do not add more uses of the CreateDefaultAligned{Load,Store}
APIs; they will be going away eventually.
llvm-svn: 246985
2015-09-08 16:05:57 +08:00
|
|
|
return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
|
|
|
|
ConstStr.getPointer(), Zeros);
|
2015-05-08 03:34:16 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
void emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args);
|
|
|
|
|
2011-10-07 02:29:37 +08:00
|
|
|
public:
|
|
|
|
CGNVCUDARuntime(CodeGenModule &CGM);
|
2011-10-07 02:51:56 +08:00
|
|
|
|
2015-05-08 03:34:16 +08:00
|
|
|
void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
|
2016-03-03 02:28:50 +08:00
|
|
|
void registerDeviceVar(llvm::GlobalVariable &Var, unsigned Flags) override {
|
|
|
|
DeviceVars.push_back(std::make_pair(&Var, Flags));
|
|
|
|
}
|
|
|
|
|
2015-05-08 03:34:16 +08:00
|
|
|
/// Creates module constructor function
|
|
|
|
llvm::Function *makeModuleCtorFunction() override;
|
|
|
|
/// Creates module destructor function
|
|
|
|
llvm::Function *makeModuleDtorFunction() override;
|
2011-10-07 02:29:37 +08:00
|
|
|
};
|
|
|
|
|
2015-06-23 07:07:51 +08:00
|
|
|
}
|
2011-10-07 02:29:37 +08:00
|
|
|
|
2015-05-08 03:34:16 +08:00
|
|
|
/// Binds the runtime to \p CGM and caches the LLVM types that the rest of
/// this class uses when declaring and calling CUDA runtime functions.
CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
      TheModule(CGM.getModule()) {
  CodeGen::CodeGenTypes &TypeConverter = CGM.getTypes();
  ASTContext &ASTCtx = CGM.getContext();

  // Scalar types: C 'int', 'size_t' and 'void'.
  IntTy = TypeConverter.ConvertType(ASTCtx.IntTy);
  SizeTy = TypeConverter.ConvertType(ASTCtx.getSizeType());
  VoidTy = llvm::Type::getVoidTy(Context);

  // Pointer types: 'char *', 'void *' and 'void **'.
  llvm::Type *CharTy = TypeConverter.ConvertType(ASTCtx.CharTy);
  CharPtrTy = llvm::PointerType::getUnqual(CharTy);
  VoidPtrTy =
      cast<llvm::PointerType>(TypeConverter.ConvertType(ASTCtx.VoidPtrTy));
  VoidPtrPtrTy = VoidPtrTy->getPointerTo();
}
|
|
|
|
|
|
|
|
/// Returns the runtime function declaration for "cudaSetupArgument", typed
/// as int(void *, size_t, size_t).
llvm::Constant *CGNVCUDARuntime::getSetupArgumentFn() const {
  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
  llvm::Type *ParamTys[] = {VoidPtrTy, SizeTy, SizeTy};
  llvm::FunctionType *SetupArgFnTy =
      llvm::FunctionType::get(IntTy, ParamTys, false);
  return CGM.CreateRuntimeFunction(SetupArgFnTy, "cudaSetupArgument");
}
|
|
|
|
|
|
|
|
/// Returns the runtime function declaration for "cudaLaunch", typed as
/// int(char *).
llvm::Constant *CGNVCUDARuntime::getLaunchFn() const {
  // cudaError_t cudaLaunch(char *)
  llvm::FunctionType *LaunchFnTy =
      llvm::FunctionType::get(IntTy, CharPtrTy, false);
  return CGM.CreateRuntimeFunction(LaunchFnTy, "cudaLaunch");
}
|
|
|
|
|
|
|
|
/// Emits the host-side stub for the function currently being generated by
/// \p CGF.
///
/// The function is first recorded in EmittedKernels so that it is included
/// when kernels are registered, then its body is generated from \p Args.
void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                     FunctionArgList &Args) {
  // Remember the stub for later registration.
  EmittedKernels.push_back(CGF.CurFn);

  // Generate the argument-setup and launch sequence.
  emitDeviceStubBody(CGF, Args);
}
|
|
|
|
|
2015-05-08 03:34:16 +08:00
|
|
|
void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF,
|
2011-10-07 02:51:56 +08:00
|
|
|
FunctionArgList &Args) {
|
2016-07-28 06:36:21 +08:00
|
|
|
// Emit a call to cudaSetupArgument for each arg in Args.
|
2011-10-07 02:51:56 +08:00
|
|
|
llvm::Constant *cudaSetupArgFn = getSetupArgumentFn();
|
2016-07-28 06:36:21 +08:00
|
|
|
llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
|
|
|
|
CharUnits Offset = CharUnits::Zero();
|
|
|
|
for (const VarDecl *A : Args) {
|
|
|
|
CharUnits TyWidth, TyAlign;
|
|
|
|
std::tie(TyWidth, TyAlign) =
|
|
|
|
CGM.getContext().getTypeInfoInChars(A->getType());
|
|
|
|
Offset = Offset.alignTo(TyAlign);
|
|
|
|
llvm::Value *Args[] = {
|
|
|
|
CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
|
|
|
|
VoidPtrTy),
|
|
|
|
llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()),
|
|
|
|
llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
|
|
|
|
};
|
2013-03-01 03:01:20 +08:00
|
|
|
llvm::CallSite CS = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
|
2011-10-07 02:51:56 +08:00
|
|
|
llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
|
|
|
|
llvm::Value *CSZero = CGF.Builder.CreateICmpEQ(CS.getInstruction(), Zero);
|
2016-07-28 06:36:21 +08:00
|
|
|
llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
|
2011-10-07 02:51:56 +08:00
|
|
|
CGF.Builder.CreateCondBr(CSZero, NextBlock, EndBlock);
|
|
|
|
CGF.EmitBlock(NextBlock);
|
2016-07-28 06:36:21 +08:00
|
|
|
Offset += TyWidth;
|
2011-10-07 02:51:56 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Emit the call to cudaLaunch
|
|
|
|
llvm::Constant *cudaLaunchFn = getLaunchFn();
|
|
|
|
llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy);
|
2013-03-01 03:01:20 +08:00
|
|
|
CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
|
2011-10-07 02:51:56 +08:00
|
|
|
CGF.EmitBranch(EndBlock);
|
|
|
|
|
|
|
|
CGF.EmitBlock(EndBlock);
|
2011-10-07 02:29:37 +08:00
|
|
|
}
|
|
|
|
|
2016-03-03 02:28:50 +08:00
|
|
|
/// Creates a function that sets up state on the host side for CUDA objects that
|
|
|
|
/// have a presence on both the host and device sides. Specifically, registers
|
|
|
|
/// the host side of kernel functions and device global variables with the CUDA
|
|
|
|
/// runtime.
|
2015-05-08 03:34:16 +08:00
|
|
|
/// \code
|
2016-03-03 02:28:50 +08:00
|
|
|
/// void __cuda_register_globals(void** GpuBinaryHandle) {
|
2015-05-08 03:34:16 +08:00
|
|
|
/// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
|
|
|
|
/// ...
|
|
|
|
/// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
|
2016-03-03 02:28:50 +08:00
|
|
|
/// __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
|
|
|
|
/// ...
|
|
|
|
/// __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
|
2015-05-08 03:34:16 +08:00
|
|
|
/// }
|
|
|
|
/// \endcode
|
2016-03-03 02:28:50 +08:00
|
|
|
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
|
2016-03-03 02:28:53 +08:00
|
|
|
// No need to register anything
|
|
|
|
if (EmittedKernels.empty() && DeviceVars.empty())
|
|
|
|
return nullptr;
|
|
|
|
|
2015-05-08 03:34:16 +08:00
|
|
|
llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
|
|
|
|
llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
|
2016-03-03 02:28:50 +08:00
|
|
|
llvm::GlobalValue::InternalLinkage, "__cuda_register_globals", &TheModule);
|
2015-05-08 03:34:16 +08:00
|
|
|
llvm::BasicBlock *EntryBB =
|
|
|
|
llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
|
Compute and preserve alignment more faithfully in IR-generation.
Introduce an Address type to bundle a pointer value with an
alignment. Introduce APIs on CGBuilderTy to work with Address
values. Change core APIs on CGF/CGM to traffic in Address where
appropriate. Require alignments to be non-zero. Update a ton
of code to compute and propagate alignment information.
As part of this, I've promoted CGBuiltin's EmitPointerWithAlignment
helper function to CGF and made use of it in a number of places in
the expression emitter.
The end result is that we should now be significantly more correct
when performing operations on objects that are locally known to
be under-aligned. Since alignment is not reliably tracked in the
type system, there are inherent limits to this, but at least we
are no longer confused by standard operations like derived-to-base
conversions and array-to-pointer decay. I've also fixed a large
number of bugs where we were applying the complete-object alignment
to a pointer instead of the non-virtual alignment, although most of
these were hidden by the very conservative approach we took with
member alignment.
Also, because IRGen now reliably asserts on zero alignments, we
should no longer be subject to an absurd but frustrating recurring
bug where an incomplete type would report a zero alignment and then
we'd naively do a alignmentAtOffset on it and emit code using an
alignment equal to the largest power-of-two factor of the offset.
We should also now be emitting much more aggressive alignment
attributes in the presence of over-alignment. In particular,
field access now uses alignmentAtOffset instead of min.
Several times in this patch, I had to change the existing
code-generation pattern in order to more effectively use
the Address APIs. For the most part, this seems to be a strict
improvement, like doing pointer arithmetic with GEPs instead of
ptrtoint. That said, I've tried very hard to not change semantics,
but it is likely that I've failed in a few places, for which I
apologize.
ABIArgInfo now always carries the assumed alignment of indirect and
indirect byval arguments. In order to cut down on what was already
a dauntingly large patch, I changed the code to never set align
attributes in the IR on non-byval indirect arguments. That is,
we still generate code which assumes that indirect arguments have
the given alignment, but we don't express this information to the
backend except where it's semantically required (i.e. on byvals).
This is likely a minor regression for those targets that did provide
this information, but it'll be trivial to add it back in a later
patch.
I partially punted on applying this work to CGBuiltin. Please
do not add more uses of the CreateDefaultAligned{Load,Store}
APIs; they will be going away eventually.
llvm-svn: 246985
2015-09-08 16:05:57 +08:00
|
|
|
CGBuilderTy Builder(CGM, Context);
|
2015-05-08 03:34:16 +08:00
|
|
|
Builder.SetInsertPoint(EntryBB);
|
|
|
|
|
|
|
|
// void __cudaRegisterFunction(void **, const char *, char *, const char *,
|
|
|
|
// int, uint3*, uint3*, dim3*, dim3*, int*)
|
2016-07-02 20:03:57 +08:00
|
|
|
llvm::Type *RegisterFuncParams[] = {
|
2015-05-08 03:34:16 +08:00
|
|
|
VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
|
|
|
|
VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
|
|
|
|
llvm::Constant *RegisterFunc = CGM.CreateRuntimeFunction(
|
|
|
|
llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
|
|
|
|
"__cudaRegisterFunction");
|
|
|
|
|
|
|
|
// Extract GpuBinaryHandle passed as the first argument passed to
|
2016-03-03 02:28:50 +08:00
|
|
|
// __cuda_register_globals() and generate __cudaRegisterFunction() call for
|
2015-05-08 03:34:16 +08:00
|
|
|
// each emitted kernel.
|
|
|
|
llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
|
|
|
|
for (llvm::Function *Kernel : EmittedKernels) {
|
|
|
|
llvm::Constant *KernelName = makeConstantString(Kernel->getName());
|
|
|
|
llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
|
2016-03-03 02:28:50 +08:00
|
|
|
llvm::Value *Args[] = {
|
2015-05-08 03:34:16 +08:00
|
|
|
&GpuBinaryHandlePtr, Builder.CreateBitCast(Kernel, VoidPtrTy),
|
|
|
|
KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), NullPtr,
|
|
|
|
NullPtr, NullPtr, NullPtr,
|
|
|
|
llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
|
2016-03-03 02:28:50 +08:00
|
|
|
Builder.CreateCall(RegisterFunc, Args);
|
|
|
|
}
|
|
|
|
|
|
|
|
// void __cudaRegisterVar(void **, char *, char *, const char *,
|
|
|
|
// int, int, int, int)
|
2016-07-02 20:03:57 +08:00
|
|
|
llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
|
|
|
|
CharPtrTy, IntTy, IntTy,
|
|
|
|
IntTy, IntTy};
|
2016-03-03 02:28:50 +08:00
|
|
|
llvm::Constant *RegisterVar = CGM.CreateRuntimeFunction(
|
|
|
|
llvm::FunctionType::get(IntTy, RegisterVarParams, false),
|
|
|
|
"__cudaRegisterVar");
|
|
|
|
for (auto &Pair : DeviceVars) {
|
|
|
|
llvm::GlobalVariable *Var = Pair.first;
|
|
|
|
unsigned Flags = Pair.second;
|
|
|
|
llvm::Constant *VarName = makeConstantString(Var->getName());
|
|
|
|
uint64_t VarSize =
|
|
|
|
CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
|
|
|
|
llvm::Value *Args[] = {
|
|
|
|
&GpuBinaryHandlePtr,
|
|
|
|
Builder.CreateBitCast(Var, VoidPtrTy),
|
|
|
|
VarName,
|
|
|
|
VarName,
|
|
|
|
llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0),
|
|
|
|
llvm::ConstantInt::get(IntTy, VarSize),
|
|
|
|
llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0),
|
|
|
|
llvm::ConstantInt::get(IntTy, 0)};
|
|
|
|
Builder.CreateCall(RegisterVar, Args);
|
2015-05-08 03:34:16 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
Builder.CreateRetVoid();
|
|
|
|
return RegisterKernelsFunc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Creates a global constructor function for the module:
|
|
|
|
/// \code
|
|
|
|
/// void __cuda_module_ctor(void*) {
|
|
|
|
/// Handle0 = __cudaRegisterFatBinary(GpuBinaryBlob0);
|
2016-03-03 02:28:50 +08:00
|
|
|
/// __cuda_register_globals(Handle0);
|
2015-05-08 03:34:16 +08:00
|
|
|
/// ...
|
|
|
|
/// HandleN = __cudaRegisterFatBinary(GpuBinaryBlobN);
|
2016-03-03 02:28:50 +08:00
|
|
|
/// __cuda_register_globals(HandleN);
|
2015-05-08 03:34:16 +08:00
|
|
|
/// }
|
|
|
|
/// \endcode
|
|
|
|
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
|
2016-03-03 02:28:53 +08:00
|
|
|
// No need to generate ctors/dtors if there are no GPU binaries.
|
|
|
|
if (CGM.getCodeGenOpts().CudaGpuBinaryFileNames.empty())
|
|
|
|
return nullptr;
|
|
|
|
|
2016-03-03 02:28:50 +08:00
|
|
|
// void __cuda_register_globals(void* handle);
|
|
|
|
llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
|
2015-05-08 03:34:16 +08:00
|
|
|
// void ** __cudaRegisterFatBinary(void *);
|
|
|
|
llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction(
|
|
|
|
llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
|
|
|
|
"__cudaRegisterFatBinary");
|
|
|
|
// struct { int magic, int version, void * gpu_binary, void * dont_care };
|
|
|
|
llvm::StructType *FatbinWrapperTy =
|
|
|
|
llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy, nullptr);
|
|
|
|
|
|
|
|
llvm::Function *ModuleCtorFunc = llvm::Function::Create(
|
|
|
|
llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
|
|
|
|
llvm::GlobalValue::InternalLinkage, "__cuda_module_ctor", &TheModule);
|
|
|
|
llvm::BasicBlock *CtorEntryBB =
|
|
|
|
llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
|
Compute and preserve alignment more faithfully in IR-generation.
Introduce an Address type to bundle a pointer value with an
alignment. Introduce APIs on CGBuilderTy to work with Address
values. Change core APIs on CGF/CGM to traffic in Address where
appropriate. Require alignments to be non-zero. Update a ton
of code to compute and propagate alignment information.
As part of this, I've promoted CGBuiltin's EmitPointerWithAlignment
helper function to CGF and made use of it in a number of places in
the expression emitter.
The end result is that we should now be significantly more correct
when performing operations on objects that are locally known to
be under-aligned. Since alignment is not reliably tracked in the
type system, there are inherent limits to this, but at least we
are no longer confused by standard operations like derived-to-base
conversions and array-to-pointer decay. I've also fixed a large
number of bugs where we were applying the complete-object alignment
to a pointer instead of the non-virtual alignment, although most of
these were hidden by the very conservative approach we took with
member alignment.
Also, because IRGen now reliably asserts on zero alignments, we
should no longer be subject to an absurd but frustrating recurring
bug where an incomplete type would report a zero alignment and then
we'd naively do a alignmentAtOffset on it and emit code using an
alignment equal to the largest power-of-two factor of the offset.
We should also now be emitting much more aggressive alignment
attributes in the presence of over-alignment. In particular,
field access now uses alignmentAtOffset instead of min.
Several times in this patch, I had to change the existing
code-generation pattern in order to more effectively use
the Address APIs. For the most part, this seems to be a strict
improvement, like doing pointer arithmetic with GEPs instead of
ptrtoint. That said, I've tried very hard to not change semantics,
but it is likely that I've failed in a few places, for which I
apologize.
ABIArgInfo now always carries the assumed alignment of indirect and
indirect byval arguments. In order to cut down on what was already
a dauntingly large patch, I changed the code to never set align
attributes in the IR on non-byval indirect arguments. That is,
we still generate code which assumes that indirect arguments have
the given alignment, but we don't express this information to the
backend except where it's semantically required (i.e. on byvals).
This is likely a minor regression for those targets that did provide
this information, but it'll be trivial to add it back in a later
patch.
I partially punted on applying this work to CGBuiltin. Please
do not add more uses of the CreateDefaultAligned{Load,Store}
APIs; they will be going away eventually.
llvm-svn: 246985
2015-09-08 16:05:57 +08:00
|
|
|
CGBuilderTy CtorBuilder(CGM, Context);
|
2015-05-08 03:34:16 +08:00
|
|
|
|
|
|
|
CtorBuilder.SetInsertPoint(CtorEntryBB);
|
|
|
|
|
|
|
|
// For each GPU binary, register it with the CUDA runtime and store returned
|
|
|
|
// handle in a global variable and save the handle in GpuBinaryHandles vector
|
|
|
|
// to be cleaned up in destructor on exit. Then associate all known kernels
|
|
|
|
// with the GPU binary handle so CUDA runtime can figure out what to call on
|
|
|
|
// the GPU side.
|
|
|
|
for (const std::string &GpuBinaryFileName :
|
|
|
|
CGM.getCodeGenOpts().CudaGpuBinaryFileNames) {
|
|
|
|
llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr =
|
|
|
|
llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName);
|
|
|
|
if (std::error_code EC = GpuBinaryOrErr.getError()) {
|
|
|
|
CGM.getDiags().Report(diag::err_cannot_open_file) << GpuBinaryFileName
|
|
|
|
<< EC.message();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create initialized wrapper structure that points to the loaded GPU binary
|
|
|
|
llvm::Constant *Values[] = {
|
|
|
|
llvm::ConstantInt::get(IntTy, 0x466243b1), // Fatbin wrapper magic.
|
|
|
|
llvm::ConstantInt::get(IntTy, 1), // Fatbin version.
|
2016-08-13 02:44:01 +08:00
|
|
|
makeConstantString(GpuBinaryOrErr.get()->getBuffer(), // Data.
|
|
|
|
"", ".nv_fatbin", 8), //
|
2015-05-08 03:34:16 +08:00
|
|
|
llvm::ConstantPointerNull::get(VoidPtrTy)}; // Unused in fatbin v1.
|
|
|
|
llvm::GlobalVariable *FatbinWrapper = new llvm::GlobalVariable(
|
|
|
|
TheModule, FatbinWrapperTy, true, llvm::GlobalValue::InternalLinkage,
|
|
|
|
llvm::ConstantStruct::get(FatbinWrapperTy, Values),
|
|
|
|
"__cuda_fatbin_wrapper");
|
2016-01-15 05:41:27 +08:00
|
|
|
// NVIDIA's cuobjdump looks for fatbins in this section.
|
|
|
|
FatbinWrapper->setSection(".nvFatBinSegment");
|
2015-05-08 03:34:16 +08:00
|
|
|
|
|
|
|
// GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
|
|
|
|
llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
|
|
|
|
RegisterFatbinFunc,
|
|
|
|
CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
|
|
|
|
llvm::GlobalVariable *GpuBinaryHandle = new llvm::GlobalVariable(
|
|
|
|
TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
|
|
|
|
llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
|
Compute and preserve alignment more faithfully in IR-generation.
Introduce an Address type to bundle a pointer value with an
alignment. Introduce APIs on CGBuilderTy to work with Address
values. Change core APIs on CGF/CGM to traffic in Address where
appropriate. Require alignments to be non-zero. Update a ton
of code to compute and propagate alignment information.
As part of this, I've promoted CGBuiltin's EmitPointerWithAlignment
helper function to CGF and made use of it in a number of places in
the expression emitter.
The end result is that we should now be significantly more correct
when performing operations on objects that are locally known to
be under-aligned. Since alignment is not reliably tracked in the
type system, there are inherent limits to this, but at least we
are no longer confused by standard operations like derived-to-base
conversions and array-to-pointer decay. I've also fixed a large
number of bugs where we were applying the complete-object alignment
to a pointer instead of the non-virtual alignment, although most of
these were hidden by the very conservative approach we took with
member alignment.
Also, because IRGen now reliably asserts on zero alignments, we
should no longer be subject to an absurd but frustrating recurring
bug where an incomplete type would report a zero alignment and then
we'd naively do a alignmentAtOffset on it and emit code using an
alignment equal to the largest power-of-two factor of the offset.
We should also now be emitting much more aggressive alignment
attributes in the presence of over-alignment. In particular,
field access now uses alignmentAtOffset instead of min.
Several times in this patch, I had to change the existing
code-generation pattern in order to more effectively use
the Address APIs. For the most part, this seems to be a strict
improvement, like doing pointer arithmetic with GEPs instead of
ptrtoint. That said, I've tried very hard to not change semantics,
but it is likely that I've failed in a few places, for which I
apologize.
ABIArgInfo now always carries the assumed alignment of indirect and
indirect byval arguments. In order to cut down on what was already
a dauntingly large patch, I changed the code to never set align
attributes in the IR on non-byval indirect arguments. That is,
we still generate code which assumes that indirect arguments have
the given alignment, but we don't express this information to the
backend except where it's semantically required (i.e. on byvals).
This is likely a minor regression for those targets that did provide
this information, but it'll be trivial to add it back in a later
patch.
I partially punted on applying this work to CGBuiltin. Please
do not add more uses of the CreateDefaultAligned{Load,Store}
APIs; they will be going away eventually.
llvm-svn: 246985
2015-09-08 16:05:57 +08:00
|
|
|
CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
|
|
|
|
CGM.getPointerAlign());
|
2015-05-08 03:34:16 +08:00
|
|
|
|
2016-03-03 02:28:50 +08:00
|
|
|
// Call __cuda_register_globals(GpuBinaryHandle);
|
2016-03-03 02:28:53 +08:00
|
|
|
if (RegisterGlobalsFunc)
|
|
|
|
CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
|
2015-05-08 03:34:16 +08:00
|
|
|
|
|
|
|
// Save GpuBinaryHandle so we can unregister it in destructor.
|
|
|
|
GpuBinaryHandles.push_back(GpuBinaryHandle);
|
|
|
|
}
|
|
|
|
|
|
|
|
CtorBuilder.CreateRetVoid();
|
|
|
|
return ModuleCtorFunc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Creates a global destructor function that unregisters all GPU code blobs
|
|
|
|
/// registered by constructor.
|
|
|
|
/// \code
|
|
|
|
/// void __cuda_module_dtor(void*) {
|
|
|
|
/// __cudaUnregisterFatBinary(Handle0);
|
|
|
|
/// ...
|
|
|
|
/// __cudaUnregisterFatBinary(HandleN);
|
|
|
|
/// }
|
|
|
|
/// \endcode
|
|
|
|
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
|
2016-03-03 02:28:53 +08:00
|
|
|
// No need for destructor if we don't have handles to unregister.
|
|
|
|
if (GpuBinaryHandles.empty())
|
|
|
|
return nullptr;
|
|
|
|
|
2015-05-08 03:34:16 +08:00
|
|
|
// void __cudaUnregisterFatBinary(void ** handle);
|
|
|
|
llvm::Constant *UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
|
|
|
|
llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
|
|
|
|
"__cudaUnregisterFatBinary");
|
|
|
|
|
|
|
|
llvm::Function *ModuleDtorFunc = llvm::Function::Create(
|
|
|
|
llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
|
|
|
|
llvm::GlobalValue::InternalLinkage, "__cuda_module_dtor", &TheModule);
|
|
|
|
llvm::BasicBlock *DtorEntryBB =
|
|
|
|
llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
|
Compute and preserve alignment more faithfully in IR-generation.
Introduce an Address type to bundle a pointer value with an
alignment. Introduce APIs on CGBuilderTy to work with Address
values. Change core APIs on CGF/CGM to traffic in Address where
appropriate. Require alignments to be non-zero. Update a ton
of code to compute and propagate alignment information.
As part of this, I've promoted CGBuiltin's EmitPointerWithAlignment
helper function to CGF and made use of it in a number of places in
the expression emitter.
The end result is that we should now be significantly more correct
when performing operations on objects that are locally known to
be under-aligned. Since alignment is not reliably tracked in the
type system, there are inherent limits to this, but at least we
are no longer confused by standard operations like derived-to-base
conversions and array-to-pointer decay. I've also fixed a large
number of bugs where we were applying the complete-object alignment
to a pointer instead of the non-virtual alignment, although most of
these were hidden by the very conservative approach we took with
member alignment.
Also, because IRGen now reliably asserts on zero alignments, we
should no longer be subject to an absurd but frustrating recurring
bug where an incomplete type would report a zero alignment and then
we'd naively do a alignmentAtOffset on it and emit code using an
alignment equal to the largest power-of-two factor of the offset.
We should also now be emitting much more aggressive alignment
attributes in the presence of over-alignment. In particular,
field access now uses alignmentAtOffset instead of min.
Several times in this patch, I had to change the existing
code-generation pattern in order to more effectively use
the Address APIs. For the most part, this seems to be a strict
improvement, like doing pointer arithmetic with GEPs instead of
ptrtoint. That said, I've tried very hard to not change semantics,
but it is likely that I've failed in a few places, for which I
apologize.
ABIArgInfo now always carries the assumed alignment of indirect and
indirect byval arguments. In order to cut down on what was already
a dauntingly large patch, I changed the code to never set align
attributes in the IR on non-byval indirect arguments. That is,
we still generate code which assumes that indirect arguments have
the given alignment, but we don't express this information to the
backend except where it's semantically required (i.e. on byvals).
This is likely a minor regression for those targets that did provide
this information, but it'll be trivial to add it back in a later
patch.
I partially punted on applying this work to CGBuiltin. Please
do not add more uses of the CreateDefaultAligned{Load,Store}
APIs; they will be going away eventually.
llvm-svn: 246985
2015-09-08 16:05:57 +08:00
|
|
|
CGBuilderTy DtorBuilder(CGM, Context);
|
2015-05-08 03:34:16 +08:00
|
|
|
DtorBuilder.SetInsertPoint(DtorEntryBB);
|
|
|
|
|
|
|
|
for (llvm::GlobalVariable *GpuBinaryHandle : GpuBinaryHandles) {
|
Compute and preserve alignment more faithfully in IR-generation.
Introduce an Address type to bundle a pointer value with an
alignment. Introduce APIs on CGBuilderTy to work with Address
values. Change core APIs on CGF/CGM to traffic in Address where
appropriate. Require alignments to be non-zero. Update a ton
of code to compute and propagate alignment information.
As part of this, I've promoted CGBuiltin's EmitPointerWithAlignment
helper function to CGF and made use of it in a number of places in
the expression emitter.
The end result is that we should now be significantly more correct
when performing operations on objects that are locally known to
be under-aligned. Since alignment is not reliably tracked in the
type system, there are inherent limits to this, but at least we
are no longer confused by standard operations like derived-to-base
conversions and array-to-pointer decay. I've also fixed a large
number of bugs where we were applying the complete-object alignment
to a pointer instead of the non-virtual alignment, although most of
these were hidden by the very conservative approach we took with
member alignment.
Also, because IRGen now reliably asserts on zero alignments, we
should no longer be subject to an absurd but frustrating recurring
bug where an incomplete type would report a zero alignment and then
we'd naively do a alignmentAtOffset on it and emit code using an
alignment equal to the largest power-of-two factor of the offset.
We should also now be emitting much more aggressive alignment
attributes in the presence of over-alignment. In particular,
field access now uses alignmentAtOffset instead of min.
Several times in this patch, I had to change the existing
code-generation pattern in order to more effectively use
the Address APIs. For the most part, this seems to be a strict
improvement, like doing pointer arithmetic with GEPs instead of
ptrtoint. That said, I've tried very hard to not change semantics,
but it is likely that I've failed in a few places, for which I
apologize.
ABIArgInfo now always carries the assumed alignment of indirect and
indirect byval arguments. In order to cut down on what was already
a dauntingly large patch, I changed the code to never set align
attributes in the IR on non-byval indirect arguments. That is,
we still generate code which assumes that indirect arguments have
the given alignment, but we don't express this information to the
backend except where it's semantically required (i.e. on byvals).
This is likely a minor regression for those targets that did provide
this information, but it'll be trivial to add it back in a later
patch.
I partially punted on applying this work to CGBuiltin. Please
do not add more uses of the CreateDefaultAligned{Load,Store}
APIs; they will be going away eventually.
llvm-svn: 246985
2015-09-08 16:05:57 +08:00
|
|
|
auto HandleValue =
|
|
|
|
DtorBuilder.CreateAlignedLoad(GpuBinaryHandle, CGM.getPointerAlign());
|
|
|
|
DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
|
2015-05-08 03:34:16 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
DtorBuilder.CreateRetVoid();
|
|
|
|
return ModuleDtorFunc;
|
|
|
|
}
|
|
|
|
|
2011-10-07 02:29:37 +08:00
|
|
|
/// Factory for the NVIDIA flavour of the CUDA runtime interface.
/// Constructs a CGNVCUDARuntime bound to \p CGM and hands it back through the
/// CGCUDARuntime base pointer. The object is allocated with `new` and is not
/// released by this function.
CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
  CGCUDARuntime *Runtime = new CGNVCUDARuntime(CGM);
  return Runtime;
}
|