[CUDA] Emit host-side 'shadows' for device-side global variables

... and register them with CUDA runtime. This is needed for commonly used cudaMemcpy*() APIs that use address of host-side shadow to access their counterparts on device side. Fixes PR26340 Differential Revision: http://reviews.llvm.org/D17779 llvm-svn: 262498
2016-03-02 18:28:50 +00:00 · 2016-03-02 18:28:50 +00:00 · 42e1949b46
parent cdf3a2a5be
commit 42e1949b46
5 changed files with 157 additions and 34 deletions
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@ -38,6 +38,7 @@ private:
  llvm::Module &TheModule;
  /// Keeps track of kernel launch stubs emitted in this module
  llvm::SmallVector<llvm::Function *, 16> EmittedKernels;
  llvm::SmallVector<std::pair<llvm::GlobalVariable *, unsigned>, 16> DeviceVars;
  /// Keeps track of variables containing handles of GPU binaries. Populated by
  /// ModuleCtorFunction() and used to create corresponding cleanup calls in
  /// ModuleDtorFunction()
@ -47,7 +48,7 @@ private:
  llvm::Constant *getLaunchFn() const;
  /// Creates a function to register all kernel stubs generated in this module.
-  llvm::Function *makeRegisterKernelsFn();
+  llvm::Function *makeRegisterGlobalsFn();
  /// Helper function that generates a constant string and returns a pointer to
  /// the start of the string.  The result of this function can be used anywhere
@ -68,6 +69,10 @@ public:
  CGNVCUDARuntime(CodeGenModule &CGM);
  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
  void registerDeviceVar(llvm::GlobalVariable &Var, unsigned Flags) override {
    DeviceVars.push_back(std::make_pair(&Var, Flags));
  }
  /// Creates module constructor function
  llvm::Function *makeModuleCtorFunction() override;
  /// Creates module destructor function
@ -158,19 +163,24 @@ void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF,
  CGF.EmitBlock(EndBlock);
 }
-/// Creates internal function to register all kernel stubs generated in this
+/// Creates a function that sets up state on the host side for CUDA objects that
-/// module with the CUDA runtime.
+/// have a presence on both the host and device sides. Specifically, registers
 /// the host side of kernel functions and device global variables with the CUDA
 /// runtime.
 /// \code
-/// void __cuda_register_kernels(void** GpuBinaryHandle) {
+/// void __cuda_register_globals(void** GpuBinaryHandle) {
 ///    __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
 ///    ...
 ///    __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
 ///    __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
 ///    ...
 ///    __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
 /// }
 /// \endcode
-llvm::Function *CGNVCUDARuntime::makeRegisterKernelsFn() {
+llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
-      llvm::GlobalValue::InternalLinkage, "__cuda_register_kernels", &TheModule);
+      llvm::GlobalValue::InternalLinkage, "__cuda_register_globals", &TheModule);
  llvm::BasicBlock *EntryBB =
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
  CGBuilderTy Builder(CGM, Context);
@ -186,18 +196,44 @@ llvm::Function *CGNVCUDARuntime::makeRegisterKernelsFn() {
      "__cudaRegisterFunction");
  // Extract GpuBinaryHandle passed as the first argument passed to
-  // __cuda_register_kernels() and generate __cudaRegisterFunction() call for
+  // __cuda_register_globals() and generate __cudaRegisterFunction() call for
  // each emitted kernel.
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
  for (llvm::Function *Kernel : EmittedKernels) {
    llvm::Constant *KernelName = makeConstantString(Kernel->getName());
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
-    llvm::Value *args[] = {
+    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr, Builder.CreateBitCast(Kernel, VoidPtrTy),
        KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), NullPtr,
        NullPtr, NullPtr, NullPtr,
        llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
-    Builder.CreateCall(RegisterFunc, args);
+    Builder.CreateCall(RegisterFunc, Args);
  }
  // void __cudaRegisterVar(void **, char *, char *, const char *,
  //                        int, int, int, int)
  std::vector<llvm::Type *> RegisterVarParams = {
      VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy,
      IntTy,        IntTy,     IntTy,     IntTy};
  llvm::Constant *RegisterVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterVarParams, false),
      "__cudaRegisterVar");
  for (auto &Pair : DeviceVars) {
    llvm::GlobalVariable *Var = Pair.first;
    unsigned Flags = Pair.second;
    llvm::Constant *VarName = makeConstantString(Var->getName());
    uint64_t VarSize =
        CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
        Builder.CreateBitCast(Var, VoidPtrTy),
        VarName,
        VarName,
        llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0),
        llvm::ConstantInt::get(IntTy, VarSize),
        llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0),
        llvm::ConstantInt::get(IntTy, 0)};
    Builder.CreateCall(RegisterVar, Args);
  }
  Builder.CreateRetVoid();
@ -208,15 +244,15 @@ llvm::Function *CGNVCUDARuntime::makeRegisterKernelsFn() {
 /// \code
 /// void __cuda_module_ctor(void*) {
 ///     Handle0 = __cudaRegisterFatBinary(GpuBinaryBlob0);
-///     __cuda_register_kernels(Handle0);
+///     __cuda_register_globals(Handle0);
 ///     ...
 ///     HandleN = __cudaRegisterFatBinary(GpuBinaryBlobN);
-///     __cuda_register_kernels(HandleN);
+///     __cuda_register_globals(HandleN);
 /// }
 /// \endcode
 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
-  // void __cuda_register_kernels(void* handle);
+  // void __cuda_register_globals(void* handle);
-  llvm::Function *RegisterKernelsFunc = makeRegisterKernelsFn();
+  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
  // void ** __cudaRegisterFatBinary(void *);
  llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
@ -272,8 +308,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                   CGM.getPointerAlign());
-    // Call __cuda_register_kernels(GpuBinaryHandle);
+    // Call __cuda_register_globals(GpuBinaryHandle);
-    CtorBuilder.CreateCall(RegisterKernelsFunc, RegisterFatbinCall);
+    CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
    // Save GpuBinaryHandle so we can unregister it in destructor.
    GpuBinaryHandles.push_back(GpuBinaryHandle);
--- a/clang/lib/CodeGen/CGCUDARuntime.h
+++ b/clang/lib/CodeGen/CGCUDARuntime.h
@ -18,6 +18,7 @@
 namespace llvm {
 class Function;
 class GlobalVariable;
 }
 namespace clang {
@ -37,6 +38,12 @@ protected:
  CodeGenModule &CGM;
 public:
  // Global variable properties that must be passed to CUDA runtime.
  enum DeviceVarFlags {
    ExternDeviceVar = 0x01,   // extern
    ConstantDeviceVar = 0x02, // __constant__
  };
  CGCUDARuntime(CodeGenModule &CGM) : CGM(CGM) {}
  virtual ~CGCUDARuntime();
@ -46,6 +53,7 @@ public:
  /// Emits a kernel launch stub.
  virtual void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) = 0;
  virtual void registerDeviceVar(llvm::GlobalVariable &Var, unsigned Flags) = 0;
  /// Constructs and returns a module initialization function or nullptr if it's
  /// not needed. Must be called after all kernels have been emitted.
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@ -1528,11 +1528,18 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) {
          !Global->hasAttr<CUDASharedAttr>())
        return;
    } else {
-      if (!Global->hasAttr<CUDAHostAttr>() && (
+      // We need to emit host-side 'shadows' for all global
-            Global->hasAttr<CUDADeviceAttr>() ||
+      // device-side variables because the CUDA runtime needs their
-            Global->hasAttr<CUDAConstantAttr>() ||
+      // size and host-side address in order to provide access to
-            Global->hasAttr<CUDASharedAttr>()))
+      // their device-side incarnations.
      // So device-only functions are the only things we skip.
      if (isa<FunctionDecl>(Global) && !Global->hasAttr<CUDAHostAttr>() &&
          Global->hasAttr<CUDADeviceAttr>())
        return;
      assert((isa<FunctionDecl>(Global) || isa<VarDecl>(Global)) &&
             "Expected Variable or Function");
    }
  }
@ -1561,8 +1568,15 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) {
  } else {
    const auto *VD = cast<VarDecl>(Global);
    assert(VD->isFileVarDecl() && "Cannot emit local var decl as global.");
-
+    // We need to emit device-side global CUDA variables even if a
-    if (VD->isThisDeclarationADefinition() != VarDecl::Definition &&
+    // variable does not have a definition -- we still need to define
    // host-side shadow for it.
    bool MustEmitForCuda = LangOpts.CUDA && !LangOpts.CUDAIsDevice &&
                           !VD->hasDefinition() &&
                           (VD->hasAttr<CUDAConstantAttr>() ||
                            VD->hasAttr<CUDADeviceAttr>());
    if (!MustEmitForCuda &&
        VD->isThisDeclarationADefinition() != VarDecl::Definition &&
        !Context.isMSStaticDataMemberInlineDefinition(VD))
      return;
  }
@ -2444,6 +2458,10 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D,
  if (D->hasAttr<AnnotateAttr>())
    AddGlobalAnnotations(D, GV);
  // Set the llvm linkage type as appropriate.
  llvm::GlobalValue::LinkageTypes Linkage =
      getLLVMLinkageVarDefinition(D, GV->isConstant());
  // CUDA B.2.1 "The __device__ qualifier declares a variable that resides on
  // the device. [...]"
  // CUDA B.2.2 "The __constant__ qualifier, optionally used together with
@ -2451,9 +2469,34 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D,
  // Is accessible from all the threads within the grid and from the host
  // through the runtime library (cudaGetSymbolAddress() / cudaGetSymbolSize()
  // / cudaMemcpyToSymbol() / cudaMemcpyFromSymbol())."
-  if (GV && LangOpts.CUDA && LangOpts.CUDAIsDevice &&
+  if (GV && LangOpts.CUDA) {
-      (D->hasAttr<CUDAConstantAttr>() || D->hasAttr<CUDADeviceAttr>())) {
+    if (LangOpts.CUDAIsDevice) {
      if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>())
        GV->setExternallyInitialized(true);
    } else {
      // Host-side shadows of external declarations of device-side
      // global variables become internal definitions. These have to
      // be internal in order to prevent name conflicts with global
      // host variables with the same name in a different TUs.
      if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
        Linkage = llvm::GlobalValue::InternalLinkage;
        // Shadow variables and their properties must be registered
        // with CUDA runtime.
        unsigned Flags = 0;
        if (!D->hasDefinition())
          Flags |= CGCUDARuntime::ExternDeviceVar;
        if (D->hasAttr<CUDAConstantAttr>())
          Flags |= CGCUDARuntime::ConstantDeviceVar;
        getCUDARuntime().registerDeviceVar(*GV, Flags);
      } else if (D->hasAttr<CUDASharedAttr>())
        // __shared__ variables are odd. Shadows do get created, but
        // they are not registered with the CUDA runtime, so they
        // can't really be used to access their device-side
        // counterparts. It's not clear yet whether it's nvcc's bug or
        // a feature, but we've got to do the same for compatibility.
        Linkage = llvm::GlobalValue::InternalLinkage;
    }
  }
  GV->setInitializer(Init);
@ -2470,9 +2513,6 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D,
  GV->setAlignment(getContext().getDeclAlign(D).getQuantity());
  // Set the llvm linkage type as appropriate.
  llvm::GlobalValue::LinkageTypes Linkage =
      getLLVMLinkageVarDefinition(D, GV->isConstant());
  // On Darwin, if the normal linkage of a C++ thread_local variable is
  // LinkOnce or Weak, we keep the normal linkage to prevent multiple
--- a/clang/test/CodeGenCUDA/device-stub.cu
+++ b/clang/test/CodeGenCUDA/device-stub.cu
@ -2,6 +2,40 @@
 #include "Inputs/cuda.h"
 // CHECK-DAG: @device_var = internal global i32
 __device__ int device_var;
 // CHECK-DAG: @constant_var = internal global i32
 __constant__ int constant_var;
 // CHECK-DAG: @shared_var = internal global i32
 __shared__ int shared_var;
 // Make sure host globals don't get internalized...
 // CHECK-DAG: @host_var = global i32
 int host_var;
 // ... and that extern vars remain external.
 // CHECK-DAG: @ext_host_var = external global i32
 extern int ext_host_var;
 // Shadows for external device-side variables are *definitions* of
 // those variables.
 // CHECK-DAG: @ext_device_var = internal global i32
 extern __device__ int ext_device_var;
 // CHECK-DAG: @ext_device_var = internal global i32
 extern __constant__ int ext_constant_var;
 void use_pointers() {
  int *p;
  p = &device_var;
  p = &constant_var;
  p = &shared_var;
  p = &host_var;
  p = &ext_device_var;
  p = &ext_constant_var;
  p = &ext_host_var;
 }
 // Make sure that all parts of GPU code init/cleanup are there:
 // * constant unnamed string with the kernel name
 // CHECK: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00"
@ -32,9 +66,14 @@ __global__ void kernelfunc(int i, int j, int k) {}
 // CHECK: call{{.*}}kernelfunc
 void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
-// Test that we've built a function to register kernels
+// Test that we've built a function to register kernels and global vars.
-// CHECK: define internal void @__cuda_register_kernels
+// CHECK: define internal void @__cuda_register_globals
 // CHECK: call{{.*}}cudaRegisterFunction(i8** %0, {{.*}}kernelfunc
 // CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}device_var{{.*}}i32 0, i32 4, i32 0, i32 0
 // CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}constant_var{{.*}}i32 0, i32 4, i32 1, i32 0
 // CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}ext_device_var{{.*}}i32 1, i32 4, i32 0, i32 0
 // CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}ext_constant_var{{.*}}i32 1, i32 4, i32 1, i32 0
 // CHECK: ret void
 // Test that we've built contructor..
 // CHECK: define internal void @__cuda_module_ctor
@ -42,8 +81,8 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
 // CHECK: call{{.*}}cudaRegisterFatBinary{{.*}}__cuda_fatbin_wrapper
 //   .. stores return value in __cuda_gpubin_handle
 // CHECK-NEXT: store{{.*}}__cuda_gpubin_handle
-//   .. and then calls __cuda_register_kernels
+//   .. and then calls __cuda_register_globals
-// CHECK-NEXT: call void @__cuda_register_kernels
+// CHECK-NEXT: call void @__cuda_register_globals
 // Test that we've created destructor.
 // CHECK: define internal void @__cuda_module_dtor
--- a/clang/test/CodeGenCUDA/filter-decl.cu
+++ b/clang/test/CodeGenCUDA/filter-decl.cu
@ -9,15 +9,15 @@
 // CHECK-DEVICE-NOT: module asm "file scope asm is host only"
 __asm__("file scope asm is host only");
-// CHECK-HOST-NOT: constantdata = externally_initialized global
+// CHECK-HOST: constantdata = internal global
 // CHECK-DEVICE: constantdata = externally_initialized global
 __constant__ char constantdata[256];
-// CHECK-HOST-NOT: devicedata = externally_initialized global
+// CHECK-HOST: devicedata = internal global
 // CHECK-DEVICE: devicedata = externally_initialized global
 __device__ char devicedata[256];
-// CHECK-HOST-NOT: shareddata = global
+// CHECK-HOST: shareddata = internal global
 // CHECK-DEVICE: shareddata = global
 __shared__ char shareddata[256];