forked from OSchip/llvm-project
[HIP] Register/unregister device fat binary only once
HIP generates one fat binary for all devices after linking. However, for each compilation unit a ctor function is emitted which register the same fat binary. Measures need to be taken to make sure the fat binary is only registered once. Currently each ctor function calls __hipRegisterFatBinary and stores the returned value to __hip_gpubin_handle. This patch changes the linkage of __hip_gpubin_handle to be linkonce so that they are shared between LLVM modules. Then this patch adds check of value of __hip_gpubin_handle to make sure __hipRegisterFatBinary is only called once. The code is equivalent to void *_gpubin_handle; void ctor() { if (__hip_gpubin_handle == 0) { __hip_gpubin_handle = __hipRegisterFatBinary(...); } // register kernels and variables. } The patch also does similar change to dtors so that __hipUnregisterFatBinary is called once. Differential Revision: https://reviews.llvm.org/D49083 llvm-svn: 337631
This commit is contained in:
parent
32f7fb5713
commit
f99752b66b
|
@ -309,12 +309,24 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
|
|||
}
|
||||
|
||||
/// Creates a global constructor function for the module:
|
||||
///
|
||||
/// For CUDA:
|
||||
/// \code
|
||||
/// void __cuda_module_ctor(void*) {
|
||||
/// Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
|
||||
/// __cuda_register_globals(Handle);
|
||||
/// }
|
||||
/// \endcode
|
||||
///
|
||||
/// For HIP:
|
||||
/// \code
|
||||
/// void __hip_module_ctor(void*) {
|
||||
/// if (__hip_gpubin_handle == 0) {
|
||||
/// __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
|
||||
/// __hip_register_globals(__hip_gpubin_handle);
|
||||
/// }
|
||||
/// }
|
||||
/// \endcode
|
||||
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
|
||||
bool IsHIP = CGM.getLangOpts().HIP;
|
||||
// No need to generate ctors/dtors if there is no GPU binary.
|
||||
|
@ -427,22 +439,68 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
|
|||
/*constant*/ true);
|
||||
FatbinWrapper->setSection(FatbinSectionName);
|
||||
|
||||
// Register binary with CUDA/HIP runtime. This is substantially different in
|
||||
// default mode vs. separate compilation!
|
||||
if (!RelocatableDeviceCode) {
|
||||
// GpuBinaryHandle = __{cuda|hip}RegisterFatBinary(&FatbinWrapper);
|
||||
// There is only one HIP fat binary per linked module, however there are
|
||||
// multiple constructor functions. Make sure the fat binary is registered
|
||||
// only once. The constructor functions are executed by the dynamic loader
|
||||
// before the program gains control. The dynamic loader cannot execute the
|
||||
// constructor functions concurrently since doing that would not guarantee
|
||||
// thread safety of the loaded program. Therefore we can assume sequential
|
||||
// execution of constructor functions here.
|
||||
if (IsHIP) {
|
||||
llvm::BasicBlock *IfBlock =
|
||||
llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
|
||||
llvm::BasicBlock *ExitBlock =
|
||||
llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
|
||||
// The name, size, and initialization pattern of this variable is part
|
||||
// of HIP ABI.
|
||||
GpuBinaryHandle = new llvm::GlobalVariable(
|
||||
TheModule, VoidPtrPtrTy, /*isConstant=*/false,
|
||||
llvm::GlobalValue::LinkOnceAnyLinkage,
|
||||
/*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
|
||||
"__hip_gpubin_handle");
|
||||
GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity());
|
||||
Address GpuBinaryAddr(
|
||||
GpuBinaryHandle,
|
||||
CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
|
||||
{
|
||||
auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
|
||||
llvm::Constant *Zero =
|
||||
llvm::Constant::getNullValue(HandleValue->getType());
|
||||
llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
|
||||
CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
|
||||
}
|
||||
{
|
||||
CtorBuilder.SetInsertPoint(IfBlock);
|
||||
// GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
|
||||
llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
|
||||
RegisterFatbinFunc,
|
||||
CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
|
||||
CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
|
||||
CtorBuilder.CreateBr(ExitBlock);
|
||||
}
|
||||
{
|
||||
CtorBuilder.SetInsertPoint(ExitBlock);
|
||||
// Call __hip_register_globals(GpuBinaryHandle);
|
||||
if (RegisterGlobalsFunc) {
|
||||
auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
|
||||
CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
|
||||
}
|
||||
}
|
||||
} else if (!RelocatableDeviceCode) {
|
||||
// Register binary with CUDA runtime. This is substantially different in
|
||||
// default mode vs. separate compilation!
|
||||
// GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
|
||||
llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
|
||||
RegisterFatbinFunc,
|
||||
CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
|
||||
GpuBinaryHandle = new llvm::GlobalVariable(
|
||||
TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
|
||||
llvm::ConstantPointerNull::get(VoidPtrPtrTy),
|
||||
addUnderscoredPrefixToName("_gpubin_handle"));
|
||||
|
||||
llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
|
||||
GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity());
|
||||
CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
|
||||
CGM.getPointerAlign());
|
||||
|
||||
// Call __{cuda|hip}_register_globals(GpuBinaryHandle);
|
||||
// Call __cuda_register_globals(GpuBinaryHandle);
|
||||
if (RegisterGlobalsFunc)
|
||||
CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
|
||||
} else {
|
||||
|
@ -453,15 +511,13 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
|
|||
llvm::Constant *ModuleIDConstant =
|
||||
makeConstantString(ModuleID.str(), "", ModuleIDSectionName, 32);
|
||||
|
||||
// Create an alias for the FatbinWrapper that nvcc or hip backend will
|
||||
// look for.
|
||||
// Create an alias for the FatbinWrapper that nvcc will look for.
|
||||
llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
|
||||
Twine("__fatbinwrap") + ModuleID, FatbinWrapper);
|
||||
|
||||
// void __{cuda|hip}RegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
|
||||
// void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
|
||||
// void *, void (*)(void **))
|
||||
SmallString<128> RegisterLinkedBinaryName(
|
||||
addUnderscoredPrefixToName("RegisterLinkedBinary"));
|
||||
SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
|
||||
RegisterLinkedBinaryName += ModuleID;
|
||||
llvm::Constant *RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
|
||||
getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
|
||||
|
@ -493,11 +549,23 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
|
|||
|
||||
/// Creates a global destructor function that unregisters the GPU code blob
|
||||
/// registered by constructor.
|
||||
///
|
||||
/// For CUDA:
|
||||
/// \code
|
||||
/// void __cuda_module_dtor(void*) {
|
||||
/// __cudaUnregisterFatBinary(Handle);
|
||||
/// }
|
||||
/// \endcode
|
||||
///
|
||||
/// For HIP:
|
||||
/// \code
|
||||
/// void __hip_module_dtor(void*) {
|
||||
/// if (__hip_gpubin_handle) {
|
||||
/// __hipUnregisterFatBinary(__hip_gpubin_handle);
|
||||
/// __hip_gpubin_handle = 0;
|
||||
/// }
|
||||
/// }
|
||||
/// \endcode
|
||||
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
|
||||
// No need for destructor if we don't have a handle to unregister.
|
||||
if (!GpuBinaryHandle)
|
||||
|
@ -518,10 +586,30 @@ llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
|
|||
CGBuilderTy DtorBuilder(CGM, Context);
|
||||
DtorBuilder.SetInsertPoint(DtorEntryBB);
|
||||
|
||||
auto HandleValue =
|
||||
DtorBuilder.CreateAlignedLoad(GpuBinaryHandle, CGM.getPointerAlign());
|
||||
DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
|
||||
Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity(
|
||||
GpuBinaryHandle->getAlignment()));
|
||||
auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
|
||||
// There is only one HIP fat binary per linked module, however there are
|
||||
// multiple destructor functions. Make sure the fat binary is unregistered
|
||||
// only once.
|
||||
if (CGM.getLangOpts().HIP) {
|
||||
llvm::BasicBlock *IfBlock =
|
||||
llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
|
||||
llvm::BasicBlock *ExitBlock =
|
||||
llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
|
||||
llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
|
||||
llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
|
||||
DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);
|
||||
|
||||
DtorBuilder.SetInsertPoint(IfBlock);
|
||||
DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
|
||||
DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
|
||||
DtorBuilder.CreateBr(ExitBlock);
|
||||
|
||||
DtorBuilder.SetInsertPoint(ExitBlock);
|
||||
} else {
|
||||
DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
|
||||
}
|
||||
DtorBuilder.CreateRetVoid();
|
||||
return ModuleDtorFunc;
|
||||
}
|
||||
|
|
|
@ -19,7 +19,7 @@
|
|||
// RUN: | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=NOGLOBALS,HIPNOGLOBALS
|
||||
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
|
||||
// RUN: -fcuda-rdc -fcuda-include-gpubinary %t -o - -x hip \
|
||||
// RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,RDC,HIP,HIPRDC
|
||||
// RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,NORDC,HIP
|
||||
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\
|
||||
// RUN: | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN
|
||||
|
||||
|
@ -79,11 +79,11 @@ void use_pointers() {
|
|||
// CUDA-SAME: section ".nvFatBinSegment"
|
||||
// HIP-SAME: section ".hipFatBinSegment"
|
||||
// * variable to save GPU binary handle after initialization
|
||||
// NORDC: @__[[PREFIX]]_gpubin_handle = internal global i8** null
|
||||
// CUDANORDC: @__[[PREFIX]]_gpubin_handle = internal global i8** null
|
||||
// HIP: @__[[PREFIX]]_gpubin_handle = linkonce global i8** null
|
||||
// * constant unnamed string with NVModuleID
|
||||
// RDC: [[MODULE_ID_GLOBAL:@.*]] = private constant
|
||||
// CUDARDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32
|
||||
// HIPRDC-SAME: c"[[MODULE_ID:.+]]\00", section "__hip_module_id", align 32
|
||||
// * Make sure our constructor was added to global ctor list.
|
||||
// ALL: @llvm.global_ctors = appending global {{.*}}@__[[PREFIX]]_module_ctor
|
||||
// * Alias to global symbol containing the NVModuleID.
|
||||
|
@ -120,10 +120,18 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
|
|||
// ALL: define internal void @__[[PREFIX]]_module_ctor
|
||||
|
||||
// In separate mode it calls __[[PREFIX]]RegisterFatBinary(&__[[PREFIX]]_fatbin_wrapper)
|
||||
// HIP only register fat binary once.
|
||||
// HIP: load i8**, i8*** @__hip_gpubin_handle
|
||||
// HIP-NEXT: icmp eq i8** {{.*}}, null
|
||||
// HIP-NEXT: br i1 {{.*}}, label %if, label %exit
|
||||
// HIP: if:
|
||||
// NORDC: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper
|
||||
// .. stores return value in __[[PREFIX]]_gpubin_handle
|
||||
// NORDC-NEXT: store{{.*}}__[[PREFIX]]_gpubin_handle
|
||||
// .. and then calls __[[PREFIX]]_register_globals
|
||||
// HIP-NEXT: br label %exit
|
||||
// HIP: exit:
|
||||
// HIP-NEXT: load i8**, i8*** @__hip_gpubin_handle
|
||||
// NORDC-NEXT: call void @__[[PREFIX]]_register_globals
|
||||
// * In separate mode we also register a destructor.
|
||||
// NORDC-NEXT: call i32 @atexit(void (i8*)* @__[[PREFIX]]_module_dtor)
|
||||
|
@ -136,7 +144,14 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
|
|||
// Test that we've created destructor.
|
||||
// NORDC: define internal void @__[[PREFIX]]_module_dtor
|
||||
// NORDC: load{{.*}}__[[PREFIX]]_gpubin_handle
|
||||
// NORDC-NEXT: call void @__[[PREFIX]]UnregisterFatBinary
|
||||
// CUDANORDC-NEXT: call void @__[[PREFIX]]UnregisterFatBinary
|
||||
// HIP-NEXT: icmp ne i8** {{.*}}, null
|
||||
// HIP-NEXT: br i1 {{.*}}, label %if, label %exit
|
||||
// HIP: if:
|
||||
// HIP-NEXT: call void @__[[PREFIX]]UnregisterFatBinary
|
||||
// HIP-NEXT: store i8** null, i8*** @__hip_gpubin_handle
|
||||
// HIP-NEXT: br label %exit
|
||||
// HIP: exit:
|
||||
|
||||
// There should be no __[[PREFIX]]_register_globals if we have no
|
||||
// device-side globals, but we still need to register GPU binary.
|
||||
|
|
Loading…
Reference in New Issue