[HIP] Emit kernel symbol

Currently clang uses stub function to launch kernel. This is inconvenient to interop with C++ programs since the stub function has different name as kernel, which is required by ROCm debugger. This patch emits a variable symbol which has the same name as the kernel and uses it to register and launch the kernel. This allows C++ program to launch a kernel by using the original kernel name. Reviewed by: Artem Belevich Differential Revision: https://reviews.llvm.org/D86376
2021-02-08 12:29:29 -05:00 · 2021-02-08 12:29:29 -05:00 · 5cf2a37f12
parent 154c47dc06
commit 5cf2a37f12
9 changed files with 208 additions and 23 deletions
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@ -42,12 +42,18 @@ private:
  llvm::LLVMContext &Context;
  /// Convenience reference to the current module
  llvm::Module &TheModule;
-  /// Keeps track of kernel launch stubs emitted in this module
+  /// Keeps track of kernel launch stubs and handles emitted in this module
  struct KernelInfo {
-    llvm::Function *Kernel;
+    llvm::Function *Kernel; // stub function to help launch kernel
    const Decl *D;
  };
  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
+  // Map a device stub function to a symbol for identifying kernel in host code.
+  // For CUDA, the symbol for identifying the kernel is the same as the device
+  // stub function. For HIP, they are different.
+  llvm::DenseMap<llvm::Function *, llvm::GlobalValue *> KernelHandles;
+  // Map a kernel handle to the kernel stub.
+  llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
  struct VarInfo {
    llvm::GlobalVariable *Var;
    const VarDecl *D;
@ -154,6 +160,12 @@ private:
 public:
  CGNVCUDARuntime(CodeGenModule &CGM);

+  llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override;
+  llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override {
+    auto Loc = KernelStubs.find(Handle);
+    assert(Loc != KernelStubs.end());
+    return Loc->second;
+  }
  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
  void handleVarRegistration(const VarDecl *VD,
                             llvm::GlobalVariable &Var) override;
@ -272,6 +284,10 @@ std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                     FunctionArgList &Args) {
  EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
+  if (auto *GV = dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn])) {
+    GV->setLinkage(CGF.CurFn->getLinkage());
+    GV->setInitializer(CGF.CurFn);
+  }
  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
      (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
@ -350,7 +366,8 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                               ShmemSize.getPointer(), Stream.getPointer()});

  // Emit the call to cudaLaunch
-  llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy);
+  llvm::Value *Kernel =
+      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], VoidPtrTy);
  CallArgList LaunchKernelArgs;
  LaunchKernelArgs.add(RValue::get(Kernel),
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
@ -405,7 +422,8 @@ void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,

  // Emit the call to cudaLaunch
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
-  llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy);
+  llvm::Value *Arg =
+      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], CharPtrTy);
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
  CGF.EmitBranch(EndBlock);

@ -499,7 +517,7 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
-        Builder.CreateBitCast(I.Kernel, VoidPtrTy),
+        Builder.CreateBitCast(KernelHandles[I.Kernel], VoidPtrTy),
        KernelName,
        KernelName,
        llvm::ConstantInt::get(IntTy, -1),
@ -1070,3 +1088,28 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
  }
  return makeModuleCtorFunction();
 }
+
+llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
+                                                    GlobalDecl GD) {
+  auto Loc = KernelHandles.find(F);
+  if (Loc != KernelHandles.end())
+    return Loc->second;
+
+  if (!CGM.getLangOpts().HIP) {
+    KernelHandles[F] = F;
+    KernelStubs[F] = F;
+    return F;
+  }
+
+  auto *Var = new llvm::GlobalVariable(
+      TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(),
+      /*Initializer=*/nullptr,
+      CGM.getMangledName(
+          GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel)));
+  Var->setAlignment(CGM.getPointerAlign().getAsAlign());
+  Var->setDSOLocal(F->isDSOLocal());
+  Var->setVisibility(F->getVisibility());
+  KernelHandles[F] = Var;
+  KernelStubs[Var] = F;
+  return Var;
+}
--- a/clang/lib/CodeGen/CGCUDARuntime.h
+++ b/clang/lib/CodeGen/CGCUDARuntime.h
@ -15,6 +15,7 @@
 #ifndef LLVM_CLANG_LIB_CODEGEN_CGCUDARUNTIME_H
 #define LLVM_CLANG_LIB_CODEGEN_CGCUDARUNTIME_H

+#include "clang/AST/GlobalDecl.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/GlobalValue.h"

@ -94,6 +95,13 @@ public:
  /// compilation is for host.
  virtual std::string getDeviceSideName(const NamedDecl *ND) = 0;

+  /// Get kernel handle by stub function.
+  virtual llvm::GlobalValue *getKernelHandle(llvm::Function *Stub,
+                                             GlobalDecl GD) = 0;
+
+  /// Get kernel stub by kernel handle.
+  virtual llvm::Function *getKernelStub(llvm::GlobalValue *Handle) = 0;
+
  /// Adjust linkage of shadow variables in host compilation.
  virtual void
  internalizeDeviceSideVar(const VarDecl *D,
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@ -10,6 +10,7 @@
 //
 //===----------------------------------------------------------------------===//

+#include "CGCUDARuntime.h"
 #include "CGCXXABI.h"
 #include "CGCall.h"
 #include "CGCleanup.h"
@ -4871,8 +4872,12 @@ static CGCallee EmitDirectCallee(CodeGenFunction &CGF, GlobalDecl GD) {
      return CGCallee::forBuiltin(builtinID, FD);
  }

-  llvm::Constant *calleePtr = EmitFunctionDeclPointer(CGF.CGM, GD);
-  return CGCallee::forDirect(calleePtr, GD);
+  llvm::Constant *CalleePtr = EmitFunctionDeclPointer(CGF.CGM, GD);
+  if (CGF.CGM.getLangOpts().CUDA && !CGF.CGM.getLangOpts().CUDAIsDevice &&
+      FD->hasAttr<CUDAGlobalAttr>())
+    CalleePtr = CGF.CGM.getCUDARuntime().getKernelStub(
+        cast<llvm::GlobalValue>(CalleePtr->stripPointerCasts()));
+  return CGCallee::forDirect(CalleePtr, GD);
 }

 CGCallee CodeGenFunction::EmitCallee(const Expr *E) {
@ -5266,6 +5271,19 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType, const CGCallee &OrigCallee
    Callee.setFunctionPointer(CalleePtr);
  }

+  // HIP function pointer contains kernel handle when it is used in triple
+  // chevron. The kernel stub needs to be loaded from kernel handle and used
+  // as callee.
+  if (CGM.getLangOpts().HIP && !CGM.getLangOpts().CUDAIsDevice &&
+      isa<CUDAKernelCallExpr>(E) &&
+      (!TargetDecl || !isa<FunctionDecl>(TargetDecl))) {
+    llvm::Value *Handle = Callee.getFunctionPointer();
+    Handle->dump();
+    auto *Cast =
+        Builder.CreateBitCast(Handle, Handle->getType()->getPointerTo());
+    auto *Stub = Builder.CreateLoad(Address(Cast, CGM.getPointerAlign()));
+    Callee.setFunctionPointer(Stub);
+  }
  llvm::CallBase *CallOrInvoke = nullptr;
  RValue Call = EmitCall(FnInfo, Callee, ReturnValue, Args, &CallOrInvoke,
                         E->getExprLoc());
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@ -3571,9 +3571,19 @@ llvm::Constant *CodeGenModule::GetAddrOfFunction(GlobalDecl GD,
  }

  StringRef MangledName = getMangledName(GD);
-  return GetOrCreateLLVMFunction(MangledName, Ty, GD, ForVTable, DontDefer,
-                                 /*IsThunk=*/false, llvm::AttributeList(),
-                                 IsForDefinition);
+  auto *F = GetOrCreateLLVMFunction(MangledName, Ty, GD, ForVTable, DontDefer,
+                                    /*IsThunk=*/false, llvm::AttributeList(),
+                                    IsForDefinition);
+  // Returns kernel handle for HIP kernel stub function.
+  if (LangOpts.CUDA && !LangOpts.CUDAIsDevice &&
+      cast<FunctionDecl>(GD.getDecl())->hasAttr<CUDAGlobalAttr>()) {
+    auto *Handle = getCUDARuntime().getKernelHandle(
+        cast<llvm::Function>(F->stripPointerCasts()), GD);
+    if (IsForDefinition)
+      return F;
+    return llvm::ConstantExpr::getBitCast(Handle, Ty->getPointerTo());
+  }
+  return F;
 }

 static const FunctionDecl *
--- a/clang/test/CodeGenCUDA/Inputs/cuda.h
+++ b/clang/test/CodeGenCUDA/Inputs/cuda.h
@ -2,6 +2,7 @@

 #include <stddef.h>

+#if __HIP__ || __CUDA__
 #define __constant__ __attribute__((constant))
 #define __device__ __attribute__((device))
 #define __global__ __attribute__((global))
@ -11,13 +12,22 @@
 #define __managed__ __attribute__((managed))
 #endif
 #define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
+#else
+#define __constant__
+#define __device__
+#define __global__
+#define __host__
+#define __shared__
+#define __managed__
+#define __launch_bounds__(...)
+#endif

 struct dim3 {
  unsigned x, y, z;
  __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {}
 };

-#ifdef __HIP__
+#if __HIP__ || HIP_PLATFORM
 typedef struct hipStream *hipStream_t;
 typedef enum hipError {} hipError_t;
 int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
--- a/clang/test/CodeGenCUDA/cxx-call-kernel.cpp
+++ b/clang/test/CodeGenCUDA/cxx-call-kernel.cpp
@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -x hip -emit-llvm-bc %s -o %t.hip.bc
+// RUN: %clang_cc1 -mlink-bitcode-file %t.hip.bc -DHIP_PLATFORM -emit-llvm \
+// RUN:   %s -o - | FileCheck %s
+
+#include "Inputs/cuda.h"
+
+// CHECK: @_Z2g1i = constant void (i32)* @_Z17__device_stub__g1i, align 8
+#if __HIP__
+__global__ void g1(int x) {}
+#else
+extern void g1(int x);
+
+// CHECK: call i32 @hipLaunchKernel{{.*}}@_Z2g1i
+void test() {
+  hipLaunchKernel((void*)g1, 1, 1, nullptr, 0, 0);
+}
+
+// CHECK: __hipRegisterFunction{{.*}}@_Z2g1i
+#endif
--- a/clang/test/CodeGenCUDA/kernel-dbg-info.cu
+++ b/clang/test/CodeGenCUDA/kernel-dbg-info.cu
@ -30,6 +30,9 @@ extern "C" __global__ void ckernel(int *a) {
  *a = 1;
 }

+// Kernel symbol for launching kernel.
+// CHECK: @[[SYM:ckernel]] = constant void (i32*)* @__device_stub__ckernel, align 8
+
 // Device side kernel names
 // CHECK: @[[CKERN:[0-9]*]] = {{.*}} c"ckernel\00"

@ -40,7 +43,7 @@ extern "C" __global__ void ckernel(int *a) {
 // Make sure there is no !dbg between function attributes and '{'
 // CHECK: define{{.*}} void @[[CSTUB:__device_stub__ckernel]]{{.*}} #{{[0-9]+}} {
 // CHECK-NOT: call {{.*}}@hipLaunchByPtr{{.*}}!dbg
-// CHECK: call {{.*}}@hipLaunchByPtr{{.*}}@[[CSTUB]]
+// CHECK: call {{.*}}@hipLaunchByPtr{{.*}}@[[SYM]]
 // CHECK-NOT: ret {{.*}}!dbg

 // CHECK-LABEL: define {{.*}}@_Z8hostfuncPi{{.*}}!dbg
--- a/clang/test/CodeGenCUDA/kernel-stub-name.cu
+++ b/clang/test/CodeGenCUDA/kernel-stub-name.cu
@ -2,10 +2,17 @@

 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
 // RUN:     -fcuda-include-gpubinary %t -o - -x hip\
-// RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=CHECK
+// RUN:   | FileCheck %s

 #include "Inputs/cuda.h"

+// Kernel handles
+
+// CHECK: @[[HCKERN:ckernel]] = constant void ()* @__device_stub__ckernel, align 8
+// CHECK: @[[HNSKERN:_ZN2ns8nskernelEv]] = constant void ()* @_ZN2ns23__device_stub__nskernelEv, align 8
+// CHECK: @[[HTKERN:_Z10kernelfuncIiEvv]] = linkonce_odr constant void ()* @_Z25__device_stub__kernelfuncIiEvv, align 8
+// CHECK: @[[HDKERN:_Z11kernel_declv]] = external constant void ()*, align 8
+
 extern "C" __global__ void ckernel() {}

 namespace ns {
@ -17,6 +24,11 @@ __global__ void kernelfunc() {}

 __global__ void kernel_decl();

+void (*kernel_ptr)();
+void *void_ptr;
+
+void launch(void *kern);
+
 // Device side kernel names

 // CHECK: @[[CKERN:[0-9]*]] = {{.*}} c"ckernel\00"
@ -26,16 +38,20 @@ __global__ void kernel_decl();
 // Non-template kernel stub functions

 // CHECK: define{{.*}}@[[CSTUB:__device_stub__ckernel]]
-// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[CSTUB]]
+// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[HCKERN]]
 // CHECK: define{{.*}}@[[NSSTUB:_ZN2ns23__device_stub__nskernelEv]]
-// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[NSSTUB]]
+// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[HNSKERN]]

-// CHECK-LABEL: define{{.*}}@_Z8hostfuncv()
+
+// Check kernel stub is used for triple chevron
+
+// CHECK-LABEL: define{{.*}}@_Z4fun1v()
 // CHECK: call void @[[CSTUB]]()
 // CHECK: call void @[[NSSTUB]]()
 // CHECK: call void @[[TSTUB:_Z25__device_stub__kernelfuncIiEvv]]()
 // CHECK: call void @[[DSTUB:_Z26__device_stub__kernel_declv]]()
-void hostfunc(void) {
+
+void fun1(void) {
  ckernel<<<1, 1>>>();
  ns::nskernel<<<1, 1>>>();
  kernelfunc<int><<<1, 1>>>();
@ -45,11 +61,69 @@ void hostfunc(void) {
 // Template kernel stub functions

 // CHECK: define{{.*}}@[[TSTUB]]
-// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[TSTUB]]
+// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[HTKERN]]
+
+// Check declaration of stub function for external kernel.

 // CHECK: declare{{.*}}@[[DSTUB]]

+// Check kernel handle is used for passing the kernel as a function pointer
+
+// CHECK-LABEL: define{{.*}}@_Z4fun2v()
+// CHECK: call void @_Z6launchPv({{.*}}[[HCKERN]]
+// CHECK: call void @_Z6launchPv({{.*}}[[HNSKERN]]
+// CHECK: call void @_Z6launchPv({{.*}}[[HTKERN]]
+// CHECK: call void @_Z6launchPv({{.*}}[[HDKERN]]
+void fun2() {
+  launch((void *)ckernel);
+  launch((void *)ns::nskernel);
+  launch((void *)kernelfunc<int>);
+  launch((void *)kernel_decl);
+}
+
+// Check kernel handle is used for assigning a kernel to a function pointer
+
+// CHECK-LABEL: define{{.*}}@_Z4fun3v()
+// CHECK:  store void ()* bitcast (void ()** @[[HCKERN]] to void ()*), void ()** @kernel_ptr, align 8
+// CHECK:  store void ()* bitcast (void ()** @[[HCKERN]] to void ()*), void ()** @kernel_ptr, align 8
+// CHECK:  store i8* bitcast (void ()** @[[HCKERN]] to i8*), i8** @void_ptr, align 8
+// CHECK:  store i8* bitcast (void ()** @[[HCKERN]] to i8*), i8** @void_ptr, align 8
+void fun3() {
+  kernel_ptr = ckernel;
+  kernel_ptr = &ckernel;
+  void_ptr = (void *)ckernel;
+  void_ptr = (void *)&ckernel;
+}
+
+// Check kernel stub is loaded from kernel handle when function pointer is
+// used with triple chevron
+
+// CHECK-LABEL: define{{.*}}@_Z4fun4v()
+// CHECK:  store void ()* bitcast (void ()** @[[HCKERN]] to void ()*), void ()** @kernel_ptr
+// CHECK:  call i32 @_Z16hipConfigureCall4dim3S_mP9hipStream
+// CHECK:  %[[HANDLE:.*]] = load void ()*, void ()** @kernel_ptr, align 8
+// CHECK:  %[[CAST:.*]] = bitcast void ()* %[[HANDLE]] to void ()**
+// CHECK:  %[[STUB:.*]] = load void ()*, void ()** %[[CAST]], align 8
+// CHECK:  call void %[[STUB]]()
+void fun4() {
+  kernel_ptr = ckernel;
+  kernel_ptr<<<1,1>>>();
+}
+
+// Check kernel handle is passed to a function
+
+// CHECK-LABEL: define{{.*}}@_Z4fun5v()
+// CHECK:  store void ()* bitcast (void ()** @[[HCKERN]] to void ()*), void ()** @kernel_ptr
+// CHECK:  %[[HANDLE:.*]] = load void ()*, void ()** @kernel_ptr, align 8
+// CHECK:  %[[CAST:.*]] = bitcast void ()* %[[HANDLE]] to i8*
+// CHECK:  call void @_Z6launchPv(i8* %[[CAST]])
+void fun5() {
+  kernel_ptr = ckernel;
+  launch((void *)kernel_ptr);
+}
+
 // CHECK-LABEL: define{{.*}}@__hip_register_globals
-// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[CSTUB]]{{.*}}@[[CKERN]]
-// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[NSSTUB]]{{.*}}@[[NSKERN]]
-// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[TSTUB]]{{.*}}@[[TKERN]]
+// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[HCKERN]]{{.*}}@[[CKERN]]
+// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[HNSKERN]]{{.*}}@[[NSKERN]]
+// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[HTKERN]]{{.*}}@[[TKERN]]
+// CHECK-NOT: call{{.*}}@__hipRegisterFunction{{.*}}@[[HDKERN]]{{.*}}@[[DKERN]]
--- a/clang/test/CodeGenCUDA/unnamed-types.cu
+++ b/clang/test/CodeGenCUDA/unnamed-types.cu
@ -54,7 +54,7 @@ void f1(float *p) {
              [] __device__ (float x) { return x + 5.f; });
 }
 // HOST: @__hip_register_globals
-// HOST: __hipRegisterFunction{{.*}}@_Z17__device_stub__k0IZZ2f1PfENKUlS0_E_clES0_EUlfE_EvS0_T_{{.*}}@0
-// HOST: __hipRegisterFunction{{.*}}@_Z17__device_stub__k1IZ2f1PfEUlfE_Z2f1S0_EUlffE_Z2f1S0_EUlfE0_EvS0_T_T0_T1_{{.*}}@1
+// HOST: __hipRegisterFunction{{.*}}@_Z2k0IZZ2f1PfENKUlS0_E_clES0_EUlfE_EvS0_T_{{.*}}@0
+// HOST: __hipRegisterFunction{{.*}}@_Z2k1IZ2f1PfEUlfE_Z2f1S0_EUlffE_Z2f1S0_EUlfE0_EvS0_T_T0_T1_{{.*}}@1
 // MSVC: __hipRegisterFunction{{.*}}@"??$k0@V<lambda_1>@?0???R1?0??f1@@YAXPEAM@Z@QEBA@0@Z@@@YAXPEAMV<lambda_1>@?0???R0?0??f1@@YAX0@Z@QEBA@0@Z@@Z{{.*}}@0
 // MSVC: __hipRegisterFunction{{.*}}@"??$k1@V<lambda_2>@?0??f1@@YAXPEAM@Z@V<lambda_3>@?0??2@YAX0@Z@V<lambda_4>@?0??2@YAX0@Z@@@YAXPEAMV<lambda_2>@?0??f1@@YAX0@Z@V<lambda_3>@?0??1@YAX0@Z@V<lambda_4>@?0??1@YAX0@Z@@Z{{.*}}@1