forked from OSchip/llvm-project
[CUDA] Emit host-side 'shadows' for device-side global variables
... and register them with CUDA runtime. This is needed for commonly used cudaMemcpy*() APIs that use address of host-side shadow to access their counterparts on device side. Fixes PR26340 Differential Revision: http://reviews.llvm.org/D17779 llvm-svn: 262498
This commit is contained in:
parent
cdf3a2a5be
commit
42e1949b46
|
@ -38,6 +38,7 @@ private:
|
||||||
llvm::Module &TheModule;
|
llvm::Module &TheModule;
|
||||||
/// Keeps track of kernel launch stubs emitted in this module
|
/// Keeps track of kernel launch stubs emitted in this module
|
||||||
llvm::SmallVector<llvm::Function *, 16> EmittedKernels;
|
llvm::SmallVector<llvm::Function *, 16> EmittedKernels;
|
||||||
|
llvm::SmallVector<std::pair<llvm::GlobalVariable *, unsigned>, 16> DeviceVars;
|
||||||
/// Keeps track of variables containing handles of GPU binaries. Populated by
|
/// Keeps track of variables containing handles of GPU binaries. Populated by
|
||||||
/// ModuleCtorFunction() and used to create corresponding cleanup calls in
|
/// ModuleCtorFunction() and used to create corresponding cleanup calls in
|
||||||
/// ModuleDtorFunction()
|
/// ModuleDtorFunction()
|
||||||
|
@ -47,7 +48,7 @@ private:
|
||||||
llvm::Constant *getLaunchFn() const;
|
llvm::Constant *getLaunchFn() const;
|
||||||
|
|
||||||
/// Creates a function to register all kernel stubs generated in this module.
|
/// Creates a function to register all kernel stubs generated in this module.
|
||||||
llvm::Function *makeRegisterKernelsFn();
|
llvm::Function *makeRegisterGlobalsFn();
|
||||||
|
|
||||||
/// Helper function that generates a constant string and returns a pointer to
|
/// Helper function that generates a constant string and returns a pointer to
|
||||||
/// the start of the string. The result of this function can be used anywhere
|
/// the start of the string. The result of this function can be used anywhere
|
||||||
|
@ -68,6 +69,10 @@ public:
|
||||||
CGNVCUDARuntime(CodeGenModule &CGM);
|
CGNVCUDARuntime(CodeGenModule &CGM);
|
||||||
|
|
||||||
void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
|
void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
|
||||||
|
void registerDeviceVar(llvm::GlobalVariable &Var, unsigned Flags) override {
|
||||||
|
DeviceVars.push_back(std::make_pair(&Var, Flags));
|
||||||
|
}
|
||||||
|
|
||||||
/// Creates module constructor function
|
/// Creates module constructor function
|
||||||
llvm::Function *makeModuleCtorFunction() override;
|
llvm::Function *makeModuleCtorFunction() override;
|
||||||
/// Creates module destructor function
|
/// Creates module destructor function
|
||||||
|
@ -158,19 +163,24 @@ void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF,
|
||||||
CGF.EmitBlock(EndBlock);
|
CGF.EmitBlock(EndBlock);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Creates internal function to register all kernel stubs generated in this
|
/// Creates a function that sets up state on the host side for CUDA objects that
|
||||||
/// module with the CUDA runtime.
|
/// have a presence on both the host and device sides. Specifically, registers
|
||||||
|
/// the host side of kernel functions and device global variables with the CUDA
|
||||||
|
/// runtime.
|
||||||
/// \code
|
/// \code
|
||||||
/// void __cuda_register_kernels(void** GpuBinaryHandle) {
|
/// void __cuda_register_globals(void** GpuBinaryHandle) {
|
||||||
/// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
|
/// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
|
||||||
/// ...
|
/// ...
|
||||||
/// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
|
/// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
|
||||||
|
/// __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
|
||||||
|
/// ...
|
||||||
|
/// __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
|
||||||
/// }
|
/// }
|
||||||
/// \endcode
|
/// \endcode
|
||||||
llvm::Function *CGNVCUDARuntime::makeRegisterKernelsFn() {
|
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
|
||||||
llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
|
llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
|
||||||
llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
|
llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
|
||||||
llvm::GlobalValue::InternalLinkage, "__cuda_register_kernels", &TheModule);
|
llvm::GlobalValue::InternalLinkage, "__cuda_register_globals", &TheModule);
|
||||||
llvm::BasicBlock *EntryBB =
|
llvm::BasicBlock *EntryBB =
|
||||||
llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
|
llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
|
||||||
CGBuilderTy Builder(CGM, Context);
|
CGBuilderTy Builder(CGM, Context);
|
||||||
|
@ -186,18 +196,44 @@ llvm::Function *CGNVCUDARuntime::makeRegisterKernelsFn() {
|
||||||
"__cudaRegisterFunction");
|
"__cudaRegisterFunction");
|
||||||
|
|
||||||
// Extract GpuBinaryHandle passed as the first argument passed to
|
// Extract GpuBinaryHandle passed as the first argument passed to
|
||||||
// __cuda_register_kernels() and generate __cudaRegisterFunction() call for
|
// __cuda_register_globals() and generate __cudaRegisterFunction() call for
|
||||||
// each emitted kernel.
|
// each emitted kernel.
|
||||||
llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
|
llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
|
||||||
for (llvm::Function *Kernel : EmittedKernels) {
|
for (llvm::Function *Kernel : EmittedKernels) {
|
||||||
llvm::Constant *KernelName = makeConstantString(Kernel->getName());
|
llvm::Constant *KernelName = makeConstantString(Kernel->getName());
|
||||||
llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
|
llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
|
||||||
llvm::Value *args[] = {
|
llvm::Value *Args[] = {
|
||||||
&GpuBinaryHandlePtr, Builder.CreateBitCast(Kernel, VoidPtrTy),
|
&GpuBinaryHandlePtr, Builder.CreateBitCast(Kernel, VoidPtrTy),
|
||||||
KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), NullPtr,
|
KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), NullPtr,
|
||||||
NullPtr, NullPtr, NullPtr,
|
NullPtr, NullPtr, NullPtr,
|
||||||
llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
|
llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
|
||||||
Builder.CreateCall(RegisterFunc, args);
|
Builder.CreateCall(RegisterFunc, Args);
|
||||||
|
}
|
||||||
|
|
||||||
|
// void __cudaRegisterVar(void **, char *, char *, const char *,
|
||||||
|
// int, int, int, int)
|
||||||
|
std::vector<llvm::Type *> RegisterVarParams = {
|
||||||
|
VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy,
|
||||||
|
IntTy, IntTy, IntTy, IntTy};
|
||||||
|
llvm::Constant *RegisterVar = CGM.CreateRuntimeFunction(
|
||||||
|
llvm::FunctionType::get(IntTy, RegisterVarParams, false),
|
||||||
|
"__cudaRegisterVar");
|
||||||
|
for (auto &Pair : DeviceVars) {
|
||||||
|
llvm::GlobalVariable *Var = Pair.first;
|
||||||
|
unsigned Flags = Pair.second;
|
||||||
|
llvm::Constant *VarName = makeConstantString(Var->getName());
|
||||||
|
uint64_t VarSize =
|
||||||
|
CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
|
||||||
|
llvm::Value *Args[] = {
|
||||||
|
&GpuBinaryHandlePtr,
|
||||||
|
Builder.CreateBitCast(Var, VoidPtrTy),
|
||||||
|
VarName,
|
||||||
|
VarName,
|
||||||
|
llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0),
|
||||||
|
llvm::ConstantInt::get(IntTy, VarSize),
|
||||||
|
llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0),
|
||||||
|
llvm::ConstantInt::get(IntTy, 0)};
|
||||||
|
Builder.CreateCall(RegisterVar, Args);
|
||||||
}
|
}
|
||||||
|
|
||||||
Builder.CreateRetVoid();
|
Builder.CreateRetVoid();
|
||||||
|
@ -208,15 +244,15 @@ llvm::Function *CGNVCUDARuntime::makeRegisterKernelsFn() {
|
||||||
/// \code
|
/// \code
|
||||||
/// void __cuda_module_ctor(void*) {
|
/// void __cuda_module_ctor(void*) {
|
||||||
/// Handle0 = __cudaRegisterFatBinary(GpuBinaryBlob0);
|
/// Handle0 = __cudaRegisterFatBinary(GpuBinaryBlob0);
|
||||||
/// __cuda_register_kernels(Handle0);
|
/// __cuda_register_globals(Handle0);
|
||||||
/// ...
|
/// ...
|
||||||
/// HandleN = __cudaRegisterFatBinary(GpuBinaryBlobN);
|
/// HandleN = __cudaRegisterFatBinary(GpuBinaryBlobN);
|
||||||
/// __cuda_register_kernels(HandleN);
|
/// __cuda_register_globals(HandleN);
|
||||||
/// }
|
/// }
|
||||||
/// \endcode
|
/// \endcode
|
||||||
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
|
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
|
||||||
// void __cuda_register_kernels(void* handle);
|
// void __cuda_register_globals(void* handle);
|
||||||
llvm::Function *RegisterKernelsFunc = makeRegisterKernelsFn();
|
llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
|
||||||
// void ** __cudaRegisterFatBinary(void *);
|
// void ** __cudaRegisterFatBinary(void *);
|
||||||
llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction(
|
llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction(
|
||||||
llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
|
llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
|
||||||
|
@ -272,8 +308,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
|
||||||
CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
|
CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
|
||||||
CGM.getPointerAlign());
|
CGM.getPointerAlign());
|
||||||
|
|
||||||
// Call __cuda_register_kernels(GpuBinaryHandle);
|
// Call __cuda_register_globals(GpuBinaryHandle);
|
||||||
CtorBuilder.CreateCall(RegisterKernelsFunc, RegisterFatbinCall);
|
CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
|
||||||
|
|
||||||
// Save GpuBinaryHandle so we can unregister it in destructor.
|
// Save GpuBinaryHandle so we can unregister it in destructor.
|
||||||
GpuBinaryHandles.push_back(GpuBinaryHandle);
|
GpuBinaryHandles.push_back(GpuBinaryHandle);
|
||||||
|
|
|
@ -18,6 +18,7 @@
|
||||||
|
|
||||||
namespace llvm {
|
namespace llvm {
|
||||||
class Function;
|
class Function;
|
||||||
|
class GlobalVariable;
|
||||||
}
|
}
|
||||||
|
|
||||||
namespace clang {
|
namespace clang {
|
||||||
|
@ -37,6 +38,12 @@ protected:
|
||||||
CodeGenModule &CGM;
|
CodeGenModule &CGM;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
// Global variable properties that must be passed to CUDA runtime.
|
||||||
|
enum DeviceVarFlags {
|
||||||
|
ExternDeviceVar = 0x01, // extern
|
||||||
|
ConstantDeviceVar = 0x02, // __constant__
|
||||||
|
};
|
||||||
|
|
||||||
CGCUDARuntime(CodeGenModule &CGM) : CGM(CGM) {}
|
CGCUDARuntime(CodeGenModule &CGM) : CGM(CGM) {}
|
||||||
virtual ~CGCUDARuntime();
|
virtual ~CGCUDARuntime();
|
||||||
|
|
||||||
|
@ -46,6 +53,7 @@ public:
|
||||||
|
|
||||||
/// Emits a kernel launch stub.
|
/// Emits a kernel launch stub.
|
||||||
virtual void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) = 0;
|
virtual void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) = 0;
|
||||||
|
virtual void registerDeviceVar(llvm::GlobalVariable &Var, unsigned Flags) = 0;
|
||||||
|
|
||||||
/// Constructs and returns a module initialization function or nullptr if it's
|
/// Constructs and returns a module initialization function or nullptr if it's
|
||||||
/// not needed. Must be called after all kernels have been emitted.
|
/// not needed. Must be called after all kernels have been emitted.
|
||||||
|
|
|
@ -1528,11 +1528,18 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) {
|
||||||
!Global->hasAttr<CUDASharedAttr>())
|
!Global->hasAttr<CUDASharedAttr>())
|
||||||
return;
|
return;
|
||||||
} else {
|
} else {
|
||||||
if (!Global->hasAttr<CUDAHostAttr>() && (
|
// We need to emit host-side 'shadows' for all global
|
||||||
Global->hasAttr<CUDADeviceAttr>() ||
|
// device-side variables because the CUDA runtime needs their
|
||||||
Global->hasAttr<CUDAConstantAttr>() ||
|
// size and host-side address in order to provide access to
|
||||||
Global->hasAttr<CUDASharedAttr>()))
|
// their device-side incarnations.
|
||||||
|
|
||||||
|
// So device-only functions are the only things we skip.
|
||||||
|
if (isa<FunctionDecl>(Global) && !Global->hasAttr<CUDAHostAttr>() &&
|
||||||
|
Global->hasAttr<CUDADeviceAttr>())
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
assert((isa<FunctionDecl>(Global) || isa<VarDecl>(Global)) &&
|
||||||
|
"Expected Variable or Function");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1561,8 +1568,15 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) {
|
||||||
} else {
|
} else {
|
||||||
const auto *VD = cast<VarDecl>(Global);
|
const auto *VD = cast<VarDecl>(Global);
|
||||||
assert(VD->isFileVarDecl() && "Cannot emit local var decl as global.");
|
assert(VD->isFileVarDecl() && "Cannot emit local var decl as global.");
|
||||||
|
// We need to emit device-side global CUDA variables even if a
|
||||||
if (VD->isThisDeclarationADefinition() != VarDecl::Definition &&
|
// variable does not have a definition -- we still need to define
|
||||||
|
// host-side shadow for it.
|
||||||
|
bool MustEmitForCuda = LangOpts.CUDA && !LangOpts.CUDAIsDevice &&
|
||||||
|
!VD->hasDefinition() &&
|
||||||
|
(VD->hasAttr<CUDAConstantAttr>() ||
|
||||||
|
VD->hasAttr<CUDADeviceAttr>());
|
||||||
|
if (!MustEmitForCuda &&
|
||||||
|
VD->isThisDeclarationADefinition() != VarDecl::Definition &&
|
||||||
!Context.isMSStaticDataMemberInlineDefinition(VD))
|
!Context.isMSStaticDataMemberInlineDefinition(VD))
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -2444,6 +2458,10 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D,
|
||||||
if (D->hasAttr<AnnotateAttr>())
|
if (D->hasAttr<AnnotateAttr>())
|
||||||
AddGlobalAnnotations(D, GV);
|
AddGlobalAnnotations(D, GV);
|
||||||
|
|
||||||
|
// Set the llvm linkage type as appropriate.
|
||||||
|
llvm::GlobalValue::LinkageTypes Linkage =
|
||||||
|
getLLVMLinkageVarDefinition(D, GV->isConstant());
|
||||||
|
|
||||||
// CUDA B.2.1 "The __device__ qualifier declares a variable that resides on
|
// CUDA B.2.1 "The __device__ qualifier declares a variable that resides on
|
||||||
// the device. [...]"
|
// the device. [...]"
|
||||||
// CUDA B.2.2 "The __constant__ qualifier, optionally used together with
|
// CUDA B.2.2 "The __constant__ qualifier, optionally used together with
|
||||||
|
@ -2451,9 +2469,34 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D,
|
||||||
// Is accessible from all the threads within the grid and from the host
|
// Is accessible from all the threads within the grid and from the host
|
||||||
// through the runtime library (cudaGetSymbolAddress() / cudaGetSymbolSize()
|
// through the runtime library (cudaGetSymbolAddress() / cudaGetSymbolSize()
|
||||||
// / cudaMemcpyToSymbol() / cudaMemcpyFromSymbol())."
|
// / cudaMemcpyToSymbol() / cudaMemcpyFromSymbol())."
|
||||||
if (GV && LangOpts.CUDA && LangOpts.CUDAIsDevice &&
|
if (GV && LangOpts.CUDA) {
|
||||||
(D->hasAttr<CUDAConstantAttr>() || D->hasAttr<CUDADeviceAttr>())) {
|
if (LangOpts.CUDAIsDevice) {
|
||||||
|
if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>())
|
||||||
GV->setExternallyInitialized(true);
|
GV->setExternallyInitialized(true);
|
||||||
|
} else {
|
||||||
|
// Host-side shadows of external declarations of device-side
|
||||||
|
// global variables become internal definitions. These have to
|
||||||
|
// be internal in order to prevent name conflicts with global
|
||||||
|
// host variables with the same name in a different TUs.
|
||||||
|
if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
|
||||||
|
Linkage = llvm::GlobalValue::InternalLinkage;
|
||||||
|
|
||||||
|
// Shadow variables and their properties must be registered
|
||||||
|
// with CUDA runtime.
|
||||||
|
unsigned Flags = 0;
|
||||||
|
if (!D->hasDefinition())
|
||||||
|
Flags |= CGCUDARuntime::ExternDeviceVar;
|
||||||
|
if (D->hasAttr<CUDAConstantAttr>())
|
||||||
|
Flags |= CGCUDARuntime::ConstantDeviceVar;
|
||||||
|
getCUDARuntime().registerDeviceVar(*GV, Flags);
|
||||||
|
} else if (D->hasAttr<CUDASharedAttr>())
|
||||||
|
// __shared__ variables are odd. Shadows do get created, but
|
||||||
|
// they are not registered with the CUDA runtime, so they
|
||||||
|
// can't really be used to access their device-side
|
||||||
|
// counterparts. It's not clear yet whether it's nvcc's bug or
|
||||||
|
// a feature, but we've got to do the same for compatibility.
|
||||||
|
Linkage = llvm::GlobalValue::InternalLinkage;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
GV->setInitializer(Init);
|
GV->setInitializer(Init);
|
||||||
|
|
||||||
|
@ -2470,9 +2513,6 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D,
|
||||||
|
|
||||||
GV->setAlignment(getContext().getDeclAlign(D).getQuantity());
|
GV->setAlignment(getContext().getDeclAlign(D).getQuantity());
|
||||||
|
|
||||||
// Set the llvm linkage type as appropriate.
|
|
||||||
llvm::GlobalValue::LinkageTypes Linkage =
|
|
||||||
getLLVMLinkageVarDefinition(D, GV->isConstant());
|
|
||||||
|
|
||||||
// On Darwin, if the normal linkage of a C++ thread_local variable is
|
// On Darwin, if the normal linkage of a C++ thread_local variable is
|
||||||
// LinkOnce or Weak, we keep the normal linkage to prevent multiple
|
// LinkOnce or Weak, we keep the normal linkage to prevent multiple
|
||||||
|
|
|
@ -2,6 +2,40 @@
|
||||||
|
|
||||||
#include "Inputs/cuda.h"
|
#include "Inputs/cuda.h"
|
||||||
|
|
||||||
|
// CHECK-DAG: @device_var = internal global i32
|
||||||
|
__device__ int device_var;
|
||||||
|
|
||||||
|
// CHECK-DAG: @constant_var = internal global i32
|
||||||
|
__constant__ int constant_var;
|
||||||
|
|
||||||
|
// CHECK-DAG: @shared_var = internal global i32
|
||||||
|
__shared__ int shared_var;
|
||||||
|
|
||||||
|
// Make sure host globals don't get internalized...
|
||||||
|
// CHECK-DAG: @host_var = global i32
|
||||||
|
int host_var;
|
||||||
|
// ... and that extern vars remain external.
|
||||||
|
// CHECK-DAG: @ext_host_var = external global i32
|
||||||
|
extern int ext_host_var;
|
||||||
|
|
||||||
|
// Shadows for external device-side variables are *definitions* of
|
||||||
|
// those variables.
|
||||||
|
// CHECK-DAG: @ext_device_var = internal global i32
|
||||||
|
extern __device__ int ext_device_var;
|
||||||
|
// CHECK-DAG: @ext_device_var = internal global i32
|
||||||
|
extern __constant__ int ext_constant_var;
|
||||||
|
|
||||||
|
void use_pointers() {
|
||||||
|
int *p;
|
||||||
|
p = &device_var;
|
||||||
|
p = &constant_var;
|
||||||
|
p = &shared_var;
|
||||||
|
p = &host_var;
|
||||||
|
p = &ext_device_var;
|
||||||
|
p = &ext_constant_var;
|
||||||
|
p = &ext_host_var;
|
||||||
|
}
|
||||||
|
|
||||||
// Make sure that all parts of GPU code init/cleanup are there:
|
// Make sure that all parts of GPU code init/cleanup are there:
|
||||||
// * constant unnamed string with the kernel name
|
// * constant unnamed string with the kernel name
|
||||||
// CHECK: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00"
|
// CHECK: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00"
|
||||||
|
@ -32,9 +66,14 @@ __global__ void kernelfunc(int i, int j, int k) {}
|
||||||
// CHECK: call{{.*}}kernelfunc
|
// CHECK: call{{.*}}kernelfunc
|
||||||
void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
|
void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
|
||||||
|
|
||||||
// Test that we've built a function to register kernels
|
// Test that we've built a function to register kernels and global vars.
|
||||||
// CHECK: define internal void @__cuda_register_kernels
|
// CHECK: define internal void @__cuda_register_globals
|
||||||
// CHECK: call{{.*}}cudaRegisterFunction(i8** %0, {{.*}}kernelfunc
|
// CHECK: call{{.*}}cudaRegisterFunction(i8** %0, {{.*}}kernelfunc
|
||||||
|
// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}device_var{{.*}}i32 0, i32 4, i32 0, i32 0
|
||||||
|
// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}constant_var{{.*}}i32 0, i32 4, i32 1, i32 0
|
||||||
|
// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}ext_device_var{{.*}}i32 1, i32 4, i32 0, i32 0
|
||||||
|
// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}ext_constant_var{{.*}}i32 1, i32 4, i32 1, i32 0
|
||||||
|
// CHECK: ret void
|
||||||
|
|
||||||
// Test that we've built contructor..
|
// Test that we've built contructor..
|
||||||
// CHECK: define internal void @__cuda_module_ctor
|
// CHECK: define internal void @__cuda_module_ctor
|
||||||
|
@ -42,8 +81,8 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
|
||||||
// CHECK: call{{.*}}cudaRegisterFatBinary{{.*}}__cuda_fatbin_wrapper
|
// CHECK: call{{.*}}cudaRegisterFatBinary{{.*}}__cuda_fatbin_wrapper
|
||||||
// .. stores return value in __cuda_gpubin_handle
|
// .. stores return value in __cuda_gpubin_handle
|
||||||
// CHECK-NEXT: store{{.*}}__cuda_gpubin_handle
|
// CHECK-NEXT: store{{.*}}__cuda_gpubin_handle
|
||||||
// .. and then calls __cuda_register_kernels
|
// .. and then calls __cuda_register_globals
|
||||||
// CHECK-NEXT: call void @__cuda_register_kernels
|
// CHECK-NEXT: call void @__cuda_register_globals
|
||||||
|
|
||||||
// Test that we've created destructor.
|
// Test that we've created destructor.
|
||||||
// CHECK: define internal void @__cuda_module_dtor
|
// CHECK: define internal void @__cuda_module_dtor
|
||||||
|
|
|
@ -9,15 +9,15 @@
|
||||||
// CHECK-DEVICE-NOT: module asm "file scope asm is host only"
|
// CHECK-DEVICE-NOT: module asm "file scope asm is host only"
|
||||||
__asm__("file scope asm is host only");
|
__asm__("file scope asm is host only");
|
||||||
|
|
||||||
// CHECK-HOST-NOT: constantdata = externally_initialized global
|
// CHECK-HOST: constantdata = internal global
|
||||||
// CHECK-DEVICE: constantdata = externally_initialized global
|
// CHECK-DEVICE: constantdata = externally_initialized global
|
||||||
__constant__ char constantdata[256];
|
__constant__ char constantdata[256];
|
||||||
|
|
||||||
// CHECK-HOST-NOT: devicedata = externally_initialized global
|
// CHECK-HOST: devicedata = internal global
|
||||||
// CHECK-DEVICE: devicedata = externally_initialized global
|
// CHECK-DEVICE: devicedata = externally_initialized global
|
||||||
__device__ char devicedata[256];
|
__device__ char devicedata[256];
|
||||||
|
|
||||||
// CHECK-HOST-NOT: shareddata = global
|
// CHECK-HOST: shareddata = internal global
|
||||||
// CHECK-DEVICE: shareddata = global
|
// CHECK-DEVICE: shareddata = global
|
||||||
__shared__ char shareddata[256];
|
__shared__ char shareddata[256];
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue