[HIP] Emit kernel symbol

Currently clang uses stub function to launch kernel. This is inconvenient
to interop with C++ programs since the stub function has different name
as kernel, which is required by ROCm debugger.

This patch emits a variable symbol which has the same name as the kernel
and uses it to register and launch the kernel. This allows C++ program to
launch a kernel by using the original kernel name.

Reviewed by: Artem Belevich

Differential Revision: https://reviews.llvm.org/D86376
This commit is contained in:
Yaxun (Sam) Liu 2021-02-08 12:29:29 -05:00
parent 154c47dc06
commit 5cf2a37f12
9 changed files with 208 additions and 23 deletions

View File

@ -42,12 +42,18 @@ private:
llvm::LLVMContext &Context;
/// Convenience reference to the current module
llvm::Module &TheModule;
/// Keeps track of kernel launch stubs emitted in this module
/// Keeps track of kernel launch stubs and handles emitted in this module
struct KernelInfo {
llvm::Function *Kernel;
llvm::Function *Kernel; // stub function to help launch kernel
const Decl *D;
};
llvm::SmallVector<KernelInfo, 16> EmittedKernels;
// Map a device stub function to a symbol for identifying kernel in host code.
// For CUDA, the symbol for identifying the kernel is the same as the device
// stub function. For HIP, they are different.
llvm::DenseMap<llvm::Function *, llvm::GlobalValue *> KernelHandles;
// Map a kernel handle to the kernel stub.
llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
struct VarInfo {
llvm::GlobalVariable *Var;
const VarDecl *D;
@ -154,6 +160,12 @@ private:
public:
CGNVCUDARuntime(CodeGenModule &CGM);
llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override;
llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override {
auto Loc = KernelStubs.find(Handle);
assert(Loc != KernelStubs.end());
return Loc->second;
}
void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
void handleVarRegistration(const VarDecl *VD,
llvm::GlobalVariable &Var) override;
@ -272,6 +284,10 @@ std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
FunctionArgList &Args) {
EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
if (auto *GV = dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn])) {
GV->setLinkage(CGF.CurFn->getLinkage());
GV->setInitializer(CGF.CurFn);
}
if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
CudaFeature::CUDA_USES_NEW_LAUNCH) ||
(CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
@ -350,7 +366,8 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
ShmemSize.getPointer(), Stream.getPointer()});
// Emit the call to cudaLaunch
llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy);
llvm::Value *Kernel =
CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], VoidPtrTy);
CallArgList LaunchKernelArgs;
LaunchKernelArgs.add(RValue::get(Kernel),
cudaLaunchKernelFD->getParamDecl(0)->getType());
@ -405,7 +422,8 @@ void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
// Emit the call to cudaLaunch
llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy);
llvm::Value *Arg =
CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], CharPtrTy);
CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
CGF.EmitBranch(EndBlock);
@ -499,7 +517,7 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
llvm::Value *Args[] = {
&GpuBinaryHandlePtr,
Builder.CreateBitCast(I.Kernel, VoidPtrTy),
Builder.CreateBitCast(KernelHandles[I.Kernel], VoidPtrTy),
KernelName,
KernelName,
llvm::ConstantInt::get(IntTy, -1),
@ -1070,3 +1088,28 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
}
return makeModuleCtorFunction();
}
llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
GlobalDecl GD) {
auto Loc = KernelHandles.find(F);
if (Loc != KernelHandles.end())
return Loc->second;
if (!CGM.getLangOpts().HIP) {
KernelHandles[F] = F;
KernelStubs[F] = F;
return F;
}
auto *Var = new llvm::GlobalVariable(
TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(),
/*Initializer=*/nullptr,
CGM.getMangledName(
GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel)));
Var->setAlignment(CGM.getPointerAlign().getAsAlign());
Var->setDSOLocal(F->isDSOLocal());
Var->setVisibility(F->getVisibility());
KernelHandles[F] = Var;
KernelStubs[Var] = F;
return Var;
}

View File

@ -15,6 +15,7 @@
#ifndef LLVM_CLANG_LIB_CODEGEN_CGCUDARUNTIME_H
#define LLVM_CLANG_LIB_CODEGEN_CGCUDARUNTIME_H
#include "clang/AST/GlobalDecl.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/GlobalValue.h"
@ -94,6 +95,13 @@ public:
/// compilation is for host.
virtual std::string getDeviceSideName(const NamedDecl *ND) = 0;
/// Get kernel handle by stub function.
virtual llvm::GlobalValue *getKernelHandle(llvm::Function *Stub,
GlobalDecl GD) = 0;
/// Get kernel stub by kernel handle.
virtual llvm::Function *getKernelStub(llvm::GlobalValue *Handle) = 0;
/// Adjust linkage of shadow variables in host compilation.
virtual void
internalizeDeviceSideVar(const VarDecl *D,

View File

@ -10,6 +10,7 @@
//
//===----------------------------------------------------------------------===//
#include "CGCUDARuntime.h"
#include "CGCXXABI.h"
#include "CGCall.h"
#include "CGCleanup.h"
@ -4871,8 +4872,12 @@ static CGCallee EmitDirectCallee(CodeGenFunction &CGF, GlobalDecl GD) {
return CGCallee::forBuiltin(builtinID, FD);
}
llvm::Constant *calleePtr = EmitFunctionDeclPointer(CGF.CGM, GD);
return CGCallee::forDirect(calleePtr, GD);
llvm::Constant *CalleePtr = EmitFunctionDeclPointer(CGF.CGM, GD);
if (CGF.CGM.getLangOpts().CUDA && !CGF.CGM.getLangOpts().CUDAIsDevice &&
FD->hasAttr<CUDAGlobalAttr>())
CalleePtr = CGF.CGM.getCUDARuntime().getKernelStub(
cast<llvm::GlobalValue>(CalleePtr->stripPointerCasts()));
return CGCallee::forDirect(CalleePtr, GD);
}
CGCallee CodeGenFunction::EmitCallee(const Expr *E) {
@ -5266,6 +5271,19 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType, const CGCallee &OrigCallee
Callee.setFunctionPointer(CalleePtr);
}
// HIP function pointer contains kernel handle when it is used in triple
// chevron. The kernel stub needs to be loaded from kernel handle and used
// as callee.
if (CGM.getLangOpts().HIP && !CGM.getLangOpts().CUDAIsDevice &&
isa<CUDAKernelCallExpr>(E) &&
(!TargetDecl || !isa<FunctionDecl>(TargetDecl))) {
llvm::Value *Handle = Callee.getFunctionPointer();
Handle->dump();
auto *Cast =
Builder.CreateBitCast(Handle, Handle->getType()->getPointerTo());
auto *Stub = Builder.CreateLoad(Address(Cast, CGM.getPointerAlign()));
Callee.setFunctionPointer(Stub);
}
llvm::CallBase *CallOrInvoke = nullptr;
RValue Call = EmitCall(FnInfo, Callee, ReturnValue, Args, &CallOrInvoke,
E->getExprLoc());

View File

@ -3571,9 +3571,19 @@ llvm::Constant *CodeGenModule::GetAddrOfFunction(GlobalDecl GD,
}
StringRef MangledName = getMangledName(GD);
return GetOrCreateLLVMFunction(MangledName, Ty, GD, ForVTable, DontDefer,
/*IsThunk=*/false, llvm::AttributeList(),
IsForDefinition);
auto *F = GetOrCreateLLVMFunction(MangledName, Ty, GD, ForVTable, DontDefer,
/*IsThunk=*/false, llvm::AttributeList(),
IsForDefinition);
// Returns kernel handle for HIP kernel stub function.
if (LangOpts.CUDA && !LangOpts.CUDAIsDevice &&
cast<FunctionDecl>(GD.getDecl())->hasAttr<CUDAGlobalAttr>()) {
auto *Handle = getCUDARuntime().getKernelHandle(
cast<llvm::Function>(F->stripPointerCasts()), GD);
if (IsForDefinition)
return F;
return llvm::ConstantExpr::getBitCast(Handle, Ty->getPointerTo());
}
return F;
}
static const FunctionDecl *

View File

@ -2,6 +2,7 @@
#include <stddef.h>
#if __HIP__ || __CUDA__
#define __constant__ __attribute__((constant))
#define __device__ __attribute__((device))
#define __global__ __attribute__((global))
@ -11,13 +12,22 @@
#define __managed__ __attribute__((managed))
#endif
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
#else
#define __constant__
#define __device__
#define __global__
#define __host__
#define __shared__
#define __managed__
#define __launch_bounds__(...)
#endif
struct dim3 {
unsigned x, y, z;
__host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {}
};
#ifdef __HIP__
#if __HIP__ || HIP_PLATFORM
typedef struct hipStream *hipStream_t;
typedef enum hipError {} hipError_t;
int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,

View File

@ -0,0 +1,19 @@
// RUN: %clang_cc1 -x hip -emit-llvm-bc %s -o %t.hip.bc
// RUN: %clang_cc1 -mlink-bitcode-file %t.hip.bc -DHIP_PLATFORM -emit-llvm \
// RUN: %s -o - | FileCheck %s
#include "Inputs/cuda.h"
// CHECK: @_Z2g1i = constant void (i32)* @_Z17__device_stub__g1i, align 8
#if __HIP__
__global__ void g1(int x) {}
#else
extern void g1(int x);
// CHECK: call i32 @hipLaunchKernel{{.*}}@_Z2g1i
void test() {
hipLaunchKernel((void*)g1, 1, 1, nullptr, 0, 0);
}
// CHECK: __hipRegisterFunction{{.*}}@_Z2g1i
#endif

View File

@ -30,6 +30,9 @@ extern "C" __global__ void ckernel(int *a) {
*a = 1;
}
// Kernel symbol for launching kernel.
// CHECK: @[[SYM:ckernel]] = constant void (i32*)* @__device_stub__ckernel, align 8
// Device side kernel names
// CHECK: @[[CKERN:[0-9]*]] = {{.*}} c"ckernel\00"
@ -40,7 +43,7 @@ extern "C" __global__ void ckernel(int *a) {
// Make sure there is no !dbg between function attributes and '{'
// CHECK: define{{.*}} void @[[CSTUB:__device_stub__ckernel]]{{.*}} #{{[0-9]+}} {
// CHECK-NOT: call {{.*}}@hipLaunchByPtr{{.*}}!dbg
// CHECK: call {{.*}}@hipLaunchByPtr{{.*}}@[[CSTUB]]
// CHECK: call {{.*}}@hipLaunchByPtr{{.*}}@[[SYM]]
// CHECK-NOT: ret {{.*}}!dbg
// CHECK-LABEL: define {{.*}}@_Z8hostfuncPi{{.*}}!dbg

View File

@ -2,10 +2,17 @@
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
// RUN: -fcuda-include-gpubinary %t -o - -x hip\
// RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=CHECK
// RUN: | FileCheck %s
#include "Inputs/cuda.h"
// Kernel handles
// CHECK: @[[HCKERN:ckernel]] = constant void ()* @__device_stub__ckernel, align 8
// CHECK: @[[HNSKERN:_ZN2ns8nskernelEv]] = constant void ()* @_ZN2ns23__device_stub__nskernelEv, align 8
// CHECK: @[[HTKERN:_Z10kernelfuncIiEvv]] = linkonce_odr constant void ()* @_Z25__device_stub__kernelfuncIiEvv, align 8
// CHECK: @[[HDKERN:_Z11kernel_declv]] = external constant void ()*, align 8
extern "C" __global__ void ckernel() {}
namespace ns {
@ -17,6 +24,11 @@ __global__ void kernelfunc() {}
__global__ void kernel_decl();
void (*kernel_ptr)();
void *void_ptr;
void launch(void *kern);
// Device side kernel names
// CHECK: @[[CKERN:[0-9]*]] = {{.*}} c"ckernel\00"
@ -26,16 +38,20 @@ __global__ void kernel_decl();
// Non-template kernel stub functions
// CHECK: define{{.*}}@[[CSTUB:__device_stub__ckernel]]
// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[CSTUB]]
// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[HCKERN]]
// CHECK: define{{.*}}@[[NSSTUB:_ZN2ns23__device_stub__nskernelEv]]
// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[NSSTUB]]
// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[HNSKERN]]
// CHECK-LABEL: define{{.*}}@_Z8hostfuncv()
// Check kernel stub is used for triple chevron
// CHECK-LABEL: define{{.*}}@_Z4fun1v()
// CHECK: call void @[[CSTUB]]()
// CHECK: call void @[[NSSTUB]]()
// CHECK: call void @[[TSTUB:_Z25__device_stub__kernelfuncIiEvv]]()
// CHECK: call void @[[DSTUB:_Z26__device_stub__kernel_declv]]()
void hostfunc(void) {
void fun1(void) {
ckernel<<<1, 1>>>();
ns::nskernel<<<1, 1>>>();
kernelfunc<int><<<1, 1>>>();
@ -45,11 +61,69 @@ void hostfunc(void) {
// Template kernel stub functions
// CHECK: define{{.*}}@[[TSTUB]]
// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[TSTUB]]
// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[HTKERN]]
// Check declaration of stub function for external kernel.
// CHECK: declare{{.*}}@[[DSTUB]]
// Check kernel handle is used for passing the kernel as a function pointer
// CHECK-LABEL: define{{.*}}@_Z4fun2v()
// CHECK: call void @_Z6launchPv({{.*}}[[HCKERN]]
// CHECK: call void @_Z6launchPv({{.*}}[[HNSKERN]]
// CHECK: call void @_Z6launchPv({{.*}}[[HTKERN]]
// CHECK: call void @_Z6launchPv({{.*}}[[HDKERN]]
void fun2() {
launch((void *)ckernel);
launch((void *)ns::nskernel);
launch((void *)kernelfunc<int>);
launch((void *)kernel_decl);
}
// Check kernel handle is used for assigning a kernel to a function pointer
// CHECK-LABEL: define{{.*}}@_Z4fun3v()
// CHECK: store void ()* bitcast (void ()** @[[HCKERN]] to void ()*), void ()** @kernel_ptr, align 8
// CHECK: store void ()* bitcast (void ()** @[[HCKERN]] to void ()*), void ()** @kernel_ptr, align 8
// CHECK: store i8* bitcast (void ()** @[[HCKERN]] to i8*), i8** @void_ptr, align 8
// CHECK: store i8* bitcast (void ()** @[[HCKERN]] to i8*), i8** @void_ptr, align 8
void fun3() {
kernel_ptr = ckernel;
kernel_ptr = &ckernel;
void_ptr = (void *)ckernel;
void_ptr = (void *)&ckernel;
}
// Check kernel stub is loaded from kernel handle when function pointer is
// used with triple chevron
// CHECK-LABEL: define{{.*}}@_Z4fun4v()
// CHECK: store void ()* bitcast (void ()** @[[HCKERN]] to void ()*), void ()** @kernel_ptr
// CHECK: call i32 @_Z16hipConfigureCall4dim3S_mP9hipStream
// CHECK: %[[HANDLE:.*]] = load void ()*, void ()** @kernel_ptr, align 8
// CHECK: %[[CAST:.*]] = bitcast void ()* %[[HANDLE]] to void ()**
// CHECK: %[[STUB:.*]] = load void ()*, void ()** %[[CAST]], align 8
// CHECK: call void %[[STUB]]()
void fun4() {
kernel_ptr = ckernel;
kernel_ptr<<<1,1>>>();
}
// Check kernel handle is passed to a function
// CHECK-LABEL: define{{.*}}@_Z4fun5v()
// CHECK: store void ()* bitcast (void ()** @[[HCKERN]] to void ()*), void ()** @kernel_ptr
// CHECK: %[[HANDLE:.*]] = load void ()*, void ()** @kernel_ptr, align 8
// CHECK: %[[CAST:.*]] = bitcast void ()* %[[HANDLE]] to i8*
// CHECK: call void @_Z6launchPv(i8* %[[CAST]])
void fun5() {
kernel_ptr = ckernel;
launch((void *)kernel_ptr);
}
// CHECK-LABEL: define{{.*}}@__hip_register_globals
// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[CSTUB]]{{.*}}@[[CKERN]]
// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[NSSTUB]]{{.*}}@[[NSKERN]]
// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[TSTUB]]{{.*}}@[[TKERN]]
// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[HCKERN]]{{.*}}@[[CKERN]]
// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[HNSKERN]]{{.*}}@[[NSKERN]]
// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[HTKERN]]{{.*}}@[[TKERN]]
// CHECK-NOT: call{{.*}}@__hipRegisterFunction{{.*}}@[[HDKERN]]{{.*}}@[[DKERN]]

View File

@ -54,7 +54,7 @@ void f1(float *p) {
[] __device__ (float x) { return x + 5.f; });
}
// HOST: @__hip_register_globals
// HOST: __hipRegisterFunction{{.*}}@_Z17__device_stub__k0IZZ2f1PfENKUlS0_E_clES0_EUlfE_EvS0_T_{{.*}}@0
// HOST: __hipRegisterFunction{{.*}}@_Z17__device_stub__k1IZ2f1PfEUlfE_Z2f1S0_EUlffE_Z2f1S0_EUlfE0_EvS0_T_T0_T1_{{.*}}@1
// HOST: __hipRegisterFunction{{.*}}@_Z2k0IZZ2f1PfENKUlS0_E_clES0_EUlfE_EvS0_T_{{.*}}@0
// HOST: __hipRegisterFunction{{.*}}@_Z2k1IZ2f1PfEUlfE_Z2f1S0_EUlffE_Z2f1S0_EUlfE0_EvS0_T_T0_T1_{{.*}}@1
// MSVC: __hipRegisterFunction{{.*}}@"??$k0@V<lambda_1>@?0???R1?0??f1@@YAXPEAM@Z@QEBA@0@Z@@@YAXPEAMV<lambda_1>@?0???R0?0??f1@@YAX0@Z@QEBA@0@Z@@Z{{.*}}@0
// MSVC: __hipRegisterFunction{{.*}}@"??$k1@V<lambda_2>@?0??f1@@YAXPEAM@Z@V<lambda_3>@?0??2@YAX0@Z@V<lambda_4>@?0??2@YAX0@Z@@@YAXPEAMV<lambda_2>@?0??f1@@YAX0@Z@V<lambda_3>@?0??1@YAX0@Z@V<lambda_4>@?0??1@YAX0@Z@@Z{{.*}}@1