[HIP] Support offloading by linker script

To support linking device code in different source files, it is necessary to
embed the fat binary at the host linking stage.

This patch emits an external symbol for the fat binary in host codegen, then
embeds the fat binary with lld through a linker script.

Differential Revision: https://reviews.llvm.org/D46472

llvm-svn: 332724
This commit is contained in:
Yaxun Liu 2018-05-18 15:07:56 +00:00
parent 655ef1875b
commit 29155b01c1
6 changed files with 236 additions and 59 deletions

View File

@ -586,6 +586,8 @@ def fno_cuda_rdc : Flag<["-"], "fno-cuda-rdc">;
def fcuda_short_ptr : Flag<["-"], "fcuda-short-ptr">, Flags<[CC1Option]>,
HelpText<"Use 32-bit pointers for accessing const/local/shared address spaces.">;
def fno_cuda_short_ptr : Flag<["-"], "fno-cuda-short-ptr">;
def fhip_dump_offload_linker_script : Flag<["-"], "fhip-dump-offload-linker-script">,
Group<f_Group>, Flags<[NoArgumentUnused, HelpHidden]>;
def dA : Flag<["-"], "dA">, Group<d_Group>;
def dD : Flag<["-"], "dD">, Group<d_Group>, Flags<[CC1Option]>,
HelpText<"Print macro definitions in -E mode in addition to normal output">;

View File

@ -27,6 +27,8 @@ using namespace clang;
using namespace CodeGen;
namespace {
constexpr unsigned CudaFatMagic = 0x466243b1;
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
class CGNVCUDARuntime : public CGCUDARuntime {
@ -310,19 +312,20 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
bool IsHIP = CGM.getLangOpts().HIP;
// No need to generate ctors/dtors if there is no GPU binary.
std::string GpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
if (GpuBinaryFileName.empty())
StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
if (CudaGpuBinaryFileName.empty() && !IsHIP)
return nullptr;
// void __cuda_register_globals(void* handle);
// void __{cuda|hip}_register_globals(void* handle);
llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
// We always need a function to pass in as callback. Create a dummy
// implementation if we don't need to register anything.
if (RelocatableDeviceCode && !RegisterGlobalsFunc)
RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());
// void ** __cudaRegisterFatBinary(void *);
// void ** __{cuda|hip}RegisterFatBinary(void *);
llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction(
llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
addUnderscoredPrefixToName("RegisterFatBinary"));
@ -334,12 +337,16 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
// global variable and save a reference in GpuBinaryHandle to be cleaned up
// in destructor on exit. Then associate all known kernels with the GPU binary
// handle so CUDA runtime can figure out what to call on the GPU side.
llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr =
llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName);
if (std::error_code EC = GpuBinaryOrErr.getError()) {
CGM.getDiags().Report(diag::err_cannot_open_file)
<< GpuBinaryFileName << EC.message();
return nullptr;
std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary;
if (!IsHIP) {
llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
CGM.getDiags().Report(diag::err_cannot_open_file)
<< CudaGpuBinaryFileName << EC.message();
return nullptr;
}
CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
}
llvm::Function *ModuleCtorFunc = llvm::Function::Create(
@ -353,28 +360,60 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
CtorBuilder.SetInsertPoint(CtorEntryBB);
const char *FatbinConstantName;
if (RelocatableDeviceCode)
const char *FatbinSectionName;
const char *ModuleIDSectionName;
StringRef ModuleIDPrefix;
llvm::Constant *FatBinStr;
unsigned FatMagic;
if (IsHIP) {
FatbinConstantName = ".hip_fatbin";
FatbinSectionName = ".hipFatBinSegment";
ModuleIDSectionName = "__hip_module_id";
ModuleIDPrefix = "__hip_";
// For HIP, create an external symbol __hip_fatbin in section .hip_fatbin.
// The external symbol is supposed to contain the fat binary but will be
// populated somewhere else, e.g. by lld through link script.
FatBinStr = new llvm::GlobalVariable(
CGM.getModule(), CGM.Int8Ty,
/*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
"__hip_fatbin", nullptr,
llvm::GlobalVariable::NotThreadLocal);
cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
FatMagic = HIPFatMagic;
} else {
if (RelocatableDeviceCode)
// TODO: Figure out how this is called on mac OS!
FatbinConstantName = "__nv_relfatbin";
else
FatbinConstantName =
CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
// NVIDIA's cuobjdump looks for fatbins in this section.
FatbinSectionName =
CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
// TODO: Figure out how this is called on mac OS!
FatbinConstantName = "__nv_relfatbin";
else
FatbinConstantName =
CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
// NVIDIA's cuobjdump looks for fatbins in this section.
const char *FatbinSectionName =
CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
// TODO: Figure out how this is called on mac OS!
const char *NVModuleIDSectionName = "__nv_module_id";
ModuleIDSectionName = "__nv_module_id";
ModuleIDPrefix = "__nv_";
// For CUDA, create a string literal containing the fat binary loaded from
// the given file.
FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "",
FatbinConstantName, 8);
FatMagic = CudaFatMagic;
}
// Create initialized wrapper structure that points to the loaded GPU binary
ConstantInitBuilder Builder(CGM);
auto Values = Builder.beginStruct(FatbinWrapperTy);
// Fatbin wrapper magic.
Values.addInt(IntTy, 0x466243b1);
Values.addInt(IntTy, FatMagic);
// Fatbin version.
Values.addInt(IntTy, 1);
// Data.
Values.add(makeConstantString(GpuBinaryOrErr.get()->getBuffer(), "",
FatbinConstantName, 8));
Values.add(FatBinStr);
// Unused in fatbin v1.
Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
@ -382,10 +421,10 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
/*constant*/ true);
FatbinWrapper->setSection(FatbinSectionName);
// Register binary with CUDA runtime. This is substantially different in
// Register binary with CUDA/HIP runtime. This is substantially different in
// default mode vs. separate compilation!
if (!RelocatableDeviceCode) {
// GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
// GpuBinaryHandle = __{cuda|hip}RegisterFatBinary(&FatbinWrapper);
llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
RegisterFatbinFunc,
CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
@ -397,34 +436,34 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
CGM.getPointerAlign());
// Call __cuda_register_globals(GpuBinaryHandle);
// Call __{cuda|hip}_register_globals(GpuBinaryHandle);
if (RegisterGlobalsFunc)
CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
} else {
// Generate a unique module ID.
SmallString<64> NVModuleID;
llvm::raw_svector_ostream OS(NVModuleID);
OS << "__nv_" << llvm::format("%x", FatbinWrapper->getGUID());
llvm::Constant *NVModuleIDConstant =
makeConstantString(NVModuleID.str(), "", NVModuleIDSectionName, 32);
SmallString<64> ModuleID;
llvm::raw_svector_ostream OS(ModuleID);
OS << ModuleIDPrefix << llvm::format("%x", FatbinWrapper->getGUID());
llvm::Constant *ModuleIDConstant =
makeConstantString(ModuleID.str(), "", ModuleIDSectionName, 32);
// Create an alias for the FatbinWrapper that nvcc will look for.
// Create an alias for the FatbinWrapper that nvcc or hip backend will
// look for.
llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
Twine("__fatbinwrap") + NVModuleID,
FatbinWrapper);
Twine("__fatbinwrap") + ModuleID, FatbinWrapper);
// void __cudaRegisterLinkedBinary%NVModuleID%(void (*)(void *), void *,
// void __{cuda|hip}RegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
// void *, void (*)(void **))
SmallString<128> RegisterLinkedBinaryName(
addUnderscoredPrefixToName("RegisterLinkedBinary"));
RegisterLinkedBinaryName += NVModuleID;
RegisterLinkedBinaryName += ModuleID;
llvm::Constant *RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
llvm::Value *Args[] = {RegisterGlobalsFunc,
CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
NVModuleIDConstant,
ModuleIDConstant,
makeDummyFunction(getCallbackFnTy())};
CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
}

View File

@ -146,12 +146,14 @@ void tools::AddLinkerInputs(const ToolChain &TC, const InputInfoList &Inputs,
Args.AddAllArgValues(CmdArgs, options::OPT_Zlinker_input);
for (const auto &II : Inputs) {
// If the current tool chain refers to an OpenMP offloading host, we should
// ignore inputs that refer to OpenMP offloading devices - they will be
// embedded according to a proper linker script.
// If the current tool chain refers to an OpenMP or HIP offloading host, we
// should ignore inputs that refer to OpenMP or HIP offloading devices -
// they will be embedded according to a proper linker script.
if (auto *IA = II.getAction())
if (JA.isHostOffloading(Action::OFK_OpenMP) &&
IA->isDeviceOffloading(Action::OFK_OpenMP))
if ((JA.isHostOffloading(Action::OFK_OpenMP) &&
IA->isDeviceOffloading(Action::OFK_OpenMP)) ||
(JA.isHostOffloading(Action::OFK_HIP) &&
IA->isDeviceOffloading(Action::OFK_HIP)))
continue;
if (!TC.HasNativeLLVMSupport() && types::isLLVMIR(II.getType()))
@ -1288,6 +1290,124 @@ void tools::AddOpenMPLinkerScript(const ToolChain &TC, Compilation &C,
Lksf << LksBuffer;
}
/// Add HIP linker script arguments at the end of the argument list so that
/// the fat binary is built by embedding the device images into the host. The
/// linker script also defines a symbol required by the code generation so that
/// the image can be retrieved at runtime. This should be used only in tool
/// chains that support linker scripts.
///
/// \param TC      Host tool chain (not referenced by this function).
/// \param C       Compilation that receives the bundler command and owns the
///                temporary files created here.
/// \param Output  Link output; its file name seeds the linker script name.
/// \param Inputs  Link inputs; HIP device link results among them are bundled.
/// \param Args    Argument list used to allocate argument strings.
/// \param CmdArgs Linker command line; "-T <script>" is appended to it.
/// \param JA      Job action; nothing is done unless it is a HIP host
///                offloading action.
/// \param T       Tool recorded as the creator of the bundler command.
void tools::AddHIPLinkerScript(const ToolChain &TC, Compilation &C,
                               const InputInfo &Output,
                               const InputInfoList &Inputs, const ArgList &Args,
                               ArgStringList &CmdArgs, const JobAction &JA,
                               const Tool &T) {

  // If this is not a HIP host toolchain, we don't need to do anything.
  if (!JA.isHostOffloading(Action::OFK_HIP))
    return;

  // Create temporary linker script. Keep it if save-temps is enabled.
  const char *LKS;
  SmallString<256> Name = llvm::sys::path::filename(Output.getFilename());
  if (C.getDriver().isSaveTempsEnabled()) {
    llvm::sys::path::replace_extension(Name, "lk");
    LKS = C.getArgs().MakeArgString(Name.c_str());
  } else {
    llvm::sys::path::replace_extension(Name, "");
    Name = C.getDriver().GetTemporaryPath(Name, "lk");
    LKS = C.addTempFile(C.getArgs().MakeArgString(Name.c_str()));
  }

  // Add linker script option to the command.
  CmdArgs.push_back("-T");
  CmdArgs.push_back(LKS);

  // Create a buffer to write the contents of the linker script.
  std::string LksBuffer;
  llvm::raw_string_ostream LksStream(LksBuffer);

  // Get the HIP offload tool chain.
  auto *HIPTC = static_cast<const toolchains::CudaToolChain *>(
      C.getSingleOffloadToolChain<Action::OFK_HIP>());
  assert(HIPTC->getTriple().getArch() == llvm::Triple::amdgcn &&
         "Wrong platform");
  // HIPTC is only read by the assert above; silence the unused-variable
  // warning in builds where asserts are compiled out.
  (void)HIPTC;

  // Construct clang-offload-bundler command to bundle object files for
  // different GPU archs.
  ArgStringList BundlerArgs;
  BundlerArgs.push_back(Args.MakeArgString("-type=o"));

  // ToDo: Remove the dummy host binary entry which is required by
  // clang-offload-bundler.
  std::string BundlerTargetArg = "-targets=host-x86_64-unknown-linux";
  std::string BundlerInputArg = "-inputs=/dev/null";

  for (const auto &II : Inputs) {
    const Action *A = II.getAction();
    // Is this a device linking action?
    if (A && isa<LinkJobAction>(A) && A->isDeviceOffloading(Action::OFK_HIP)) {
      BundlerTargetArg = BundlerTargetArg + ",hip-amdgcn-amd-amdhsa-" +
                         StringRef(A->getOffloadingArch()).str();
      BundlerInputArg = BundlerInputArg + "," + II.getFilename();
    }
  }
  BundlerArgs.push_back(Args.MakeArgString(BundlerTargetArg));
  BundlerArgs.push_back(Args.MakeArgString(BundlerInputArg));

  // The bundle is a temporary object file that the linker script pulls in.
  std::string BundleFileName = C.getDriver().GetTemporaryPath("BUNDLE", "o");
  const char *BundleFile =
      C.addTempFile(C.getArgs().MakeArgString(BundleFileName.c_str()));
  auto BundlerOutputArg =
      Args.MakeArgString(std::string("-outputs=").append(BundleFile));
  BundlerArgs.push_back(BundlerOutputArg);

  // The bundler executable is looked up in the driver's directory.
  SmallString<128> BundlerPath(C.getDriver().Dir);
  llvm::sys::path::append(BundlerPath, "clang-offload-bundler");
  const char *Bundler = Args.MakeArgString(BundlerPath);
  C.addCommand(llvm::make_unique<Command>(JA, T, Bundler, BundlerArgs, Inputs));

  // Add commands to embed target binaries. We ensure that each section and
  // image is 16-byte aligned. This is not mandatory, but increases the
  // likelihood of data to be aligned with a cache block in several main host
  // machines.
  LksStream << "/*\n";
  LksStream << " HIP Offload Linker Script\n";
  LksStream << " *** Automatically generated by Clang ***\n";
  LksStream << "*/\n";
  LksStream << "TARGET(binary)\n";
  LksStream << "INPUT(" << BundleFileName << ")\n";
  LksStream << "SECTIONS\n";
  LksStream << "{\n";
  LksStream << " .hip_fatbin :\n";
  LksStream << " ALIGN(0x10)\n";
  LksStream << " {\n";
  LksStream << " PROVIDE_HIDDEN(__hip_fatbin = .);\n";
  LksStream << " " << BundleFileName << "\n";
  LksStream << " }\n";
  LksStream << "}\n";
  LksStream << "INSERT BEFORE .data\n";
  LksStream.flush();

  // Dump the contents of the linker script if the user requested that. We
  // support this option to enable testing of behavior with -###.
  if (C.getArgs().hasArg(options::OPT_fhip_dump_offload_linker_script))
    llvm::errs() << LksBuffer;

  // If this is a dry run, do not create the linker script file.
  if (C.getArgs().hasArg(options::OPT__HASH_HASH_HASH))
    return;

  // Open script file and write the contents.
  std::error_code EC;
  llvm::raw_fd_ostream Lksf(LKS, EC, llvm::sys::fs::F_None);

  if (EC) {
    C.getDriver().Diag(clang::diag::err_unable_to_make_temp) << EC.message();
    return;
  }

  Lksf << LksBuffer;
}
SmallString<128> tools::getStatsFileName(const llvm::opt::ArgList &Args,
const InputInfo &Output,
const InputInfo &Input,

View File

@ -52,6 +52,12 @@ void AddOpenMPLinkerScript(const ToolChain &TC, Compilation &C,
llvm::opt::ArgStringList &CmdArgs,
const JobAction &JA);
/// Appends "-T <script>" to \p CmdArgs and schedules a clang-offload-bundler
/// command for the HIP device link results found in \p Inputs, then generates
/// a linker script that places the bundle in the .hip_fatbin section and
/// defines the __hip_fatbin symbol. Does nothing unless \p JA is a HIP host
/// offloading action. \p T is recorded as the creator of the bundler command.
void AddHIPLinkerScript(const ToolChain &TC, Compilation &C,
const InputInfo &Output, const InputInfoList &Inputs,
const llvm::opt::ArgList &Args,
llvm::opt::ArgStringList &CmdArgs, const JobAction &JA,
const Tool &T);
const char *SplitDebugName(const llvm::opt::ArgList &Args,
const InputInfo &Input);

View File

@ -535,6 +535,10 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA,
// Add OpenMP offloading linker script args if required.
AddOpenMPLinkerScript(getToolChain(), C, Output, Inputs, Args, CmdArgs, JA);
// Add HIP offloading linker script args if required.
AddHIPLinkerScript(getToolChain(), C, Output, Inputs, Args, CmdArgs, JA,
*this);
C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
}

View File

@ -1,13 +1,13 @@
// RUN: echo "GPU binary would be here" > %t
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
// RUN: -fcuda-include-gpubinary %t -o - \
// RUN: | FileCheck %s --check-prefixes=ALL,NORDC,CUDA
// RUN: | FileCheck %s --check-prefixes=ALL,NORDC,CUDA,CUDANORDC
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
// RUN: -fcuda-include-gpubinary %t -o - -DNOGLOBALS \
// RUN: | FileCheck %s -check-prefix=NOGLOBALS
// RUN: | FileCheck %s -check-prefixes=NOGLOBALS,CUDANOGLOBALS
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
// RUN: -fcuda-rdc -fcuda-include-gpubinary %t -o - \
// RUN: | FileCheck %s --check-prefixes=ALL,RDC,CUDA
// RUN: | FileCheck %s --check-prefixes=ALL,RDC,CUDA,CUDARDC
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - \
// RUN: | FileCheck %s -check-prefix=NOGPUBIN
@ -16,10 +16,10 @@
// RUN: | FileCheck %s --check-prefixes=ALL,NORDC,HIP
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
// RUN: -fcuda-include-gpubinary %t -o - -DNOGLOBALS -x hip \
// RUN: | FileCheck %s -check-prefix=NOGLOBALS
// RUN: | FileCheck %s -check-prefixes=NOGLOBALS,HIPNOGLOBALS
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
// RUN: -fcuda-rdc -fcuda-include-gpubinary %t -o - -x hip \
// RUN: | FileCheck %s --check-prefixes=ALL,RDC,HIP
// RUN: | FileCheck %s --check-prefixes=ALL,RDC,HIP,HIPRDC
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\
// RUN: | FileCheck %s -check-prefix=NOGPUBIN
@ -64,21 +64,26 @@ void use_pointers() {
// * constant unnamed string with the kernel name
// ALL: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00"
// * constant unnamed string with GPU binary
// ALL: private unnamed_addr constant{{.*GPU binary would be here.*}}\00"
// NORDC-SAME: section ".nv_fatbin", align 8
// RDC-SAME: section "__nv_relfatbin", align 8
// HIP: @[[FATBIN:__hip_fatbin]] = external constant i8, section ".hip_fatbin"
// CUDA: @[[FATBIN:.*]] = private unnamed_addr constant{{.*GPU binary would be here.*}}\00",
// CUDANORDC-SAME: section ".nv_fatbin", align 8
// CUDARDC-SAME: section "__nv_relfatbin", align 8
// * constant struct that wraps GPU binary
// CUDA: @__[[PREFIX:cuda]]_fatbin_wrapper = internal constant
// CUDA-SAME: { i32, i32, i8*, i8* }
// HIP: @__[[PREFIX:hip]]_fatbin_wrapper = internal constant
// HIP-SAME: { i32, i32, i8*, i8* }
// ALL-SAME: { i32 1180844977, i32 1, {{.*}}, i8* null }
// ALL-SAME: section ".nvFatBinSegment"
// ALL: @__[[PREFIX:cuda|hip]]_fatbin_wrapper = internal constant
// ALL-SAME: { i32, i32, i8*, i8* }
// CUDA-SAME: { i32 1180844977, i32 1,
// HIP-SAME: { i32 1212764230, i32 1,
// CUDA-SAME: i8* getelementptr inbounds ({{.*}}@[[FATBIN]], i64 0, i64 0),
// HIP-SAME: i8* @[[FATBIN]],
// ALL-SAME: i8* null }
// CUDA-SAME: section ".nvFatBinSegment"
// HIP-SAME: section ".hipFatBinSegment"
// * variable to save GPU binary handle after initialization
// NORDC: @__[[PREFIX]]_gpubin_handle = internal global i8** null
// * constant unnamed string with NVModuleID
// RDC: [[MODULE_ID_GLOBAL:@.*]] = private unnamed_addr constant
// RDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32
// CUDARDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32
// HIPRDC-SAME: c"[[MODULE_ID:.+]]\00", section "__hip_module_id", align 32
// * Make sure our constructor was added to global ctor list.
// ALL: @llvm.global_ctors = appending global {{.*}}@__[[PREFIX]]_module_ctor
// * In separate mode we also register a destructor.
@ -136,9 +141,10 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
// There should be no __[[PREFIX]]_register_globals if we have no
// device-side globals, but we still need to register GPU binary.
// Skip GPU binary string first.
// NOGLOBALS: @0 = private unnamed_addr constant{{.*}}
// CUDANOGLOBALS: @{{.*}} = private unnamed_addr constant{{.*}}
// HIPNOGLOBALS: @{{.*}} = external constant{{.*}}
// NOGLOBALS-NOT: define internal void @__{{.*}}_register_globals
// NOGLOBALS: define internal void @__[[PREFIX:.*]]_module_ctor
// NOGLOBALS: define internal void @__[[PREFIX:cuda|hip]]_module_ctor
// NOGLOBALS: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper
// NOGLOBALS-NOT: call void @__[[PREFIX]]_register_globals
// NOGLOBALS: define internal void @__[[PREFIX]]_module_dtor