[OpenMP] Initial Implementation of LTO and bitcode linking in linker wrapper

This patch implements the fist support for handling LTO in the
offloading pipeline. The flag `-foffload-lto` is used to control if
bitcode is embedded into the device. If bitcode is found in the device,
the extracted files will be sent to the LTO pipeline to be linked and
sent to the backend. This implementation does not separately link the
device bitcode libraries yet.

Depends on D116675

Differential Revision: https://reviews.llvm.org/D116975
This commit is contained in:
Joseph Huber 2022-01-07 17:12:51 -05:00
parent 0e82c7553b
commit c732c3df74
4 changed files with 360 additions and 6 deletions

View File

@ -4083,7 +4083,7 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
auto TC = ToolChains.begin();
for (Action *&A : DeviceActions) {
A = ConstructPhaseAction(C, Args, Phase, A);
A = ConstructPhaseAction(C, Args, Phase, A, Action::OFK_OpenMP);
if (isa<CompileJobAction>(A)) {
HostAction->setCannotBeCollapsedWithNextDependentAction();
@ -4203,6 +4203,12 @@ Action *Driver::ConstructPhaseAction(
Args.hasArg(options::OPT_S) ? types::TY_LTO_IR : types::TY_LTO_BC;
return C.MakeAction<BackendJobAction>(Input, Output);
}
if (isUsingLTO(/* IsOffload */ true) &&
TargetDeviceOffloadKind == Action::OFK_OpenMP) {
types::ID Output =
Args.hasArg(options::OPT_S) ? types::TY_LTO_IR : types::TY_LTO_BC;
return C.MakeAction<BackendJobAction>(Input, Output);
}
if (Args.hasArg(options::OPT_emit_llvm) ||
(TargetDeviceOffloadKind == Action::OFK_HIP &&
Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,

View File

@ -4619,7 +4619,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
if (JA.getType() == types::TY_LLVM_BC)
CmdArgs.push_back("-emit-llvm-uselists");
if (IsUsingLTO) {
if (IsUsingLTO && !Args.hasArg(options::OPT_fopenmp_new_driver)) {
// Only AMDGPU supports device-side LTO.
if (IsDeviceOffloadAction && !Triple.isAMDGPU()) {
D.Diag(diag::err_drv_unsupported_opt_for_target)
@ -8150,6 +8150,39 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
const char *LinkingOutput) const {
ArgStringList CmdArgs;
if (getToolChain().getDriver().isUsingLTO(/* IsOffload */ true)) {
// Pass in target features for each toolchain.
auto OpenMPTCRange = C.getOffloadToolChains<Action::OFK_OpenMP>();
for (auto TI = OpenMPTCRange.first, TE = OpenMPTCRange.second; TI != TE;
++TI) {
const ToolChain *TC = TI->second;
const ArgList &TCArgs = C.getArgsForToolChain(TC, "", Action::OFK_OpenMP);
ArgStringList FeatureArgs;
TC->addClangTargetOptions(TCArgs, FeatureArgs, Action::OFK_OpenMP);
auto FeatureIt = llvm::find(FeatureArgs, "-target-feature");
CmdArgs.push_back(Args.MakeArgString(
"-target-feature=" + TC->getTripleString() + "=" + *(FeatureIt + 1)));
}
// Pass in the optimization level to use for LTO.
if (const Arg *A = Args.getLastArg(options::OPT_O_Group)) {
StringRef OOpt;
if (A->getOption().matches(options::OPT_O4) ||
A->getOption().matches(options::OPT_Ofast))
OOpt = "3";
else if (A->getOption().matches(options::OPT_O)) {
OOpt = A->getValue();
if (OOpt == "g")
OOpt = "1";
else if (OOpt == "s" || OOpt == "z")
OOpt = "2";
} else if (A->getOption().matches(options::OPT_O0))
OOpt = "0";
if (!OOpt.empty())
CmdArgs.push_back(Args.MakeArgString(Twine("-opt-level=O") + OOpt));
}
}
// Construct the link job so we can wrap around it.
Linker->ConstructJob(C, JA, Output, Inputs, Args, LinkingOutput);
const auto &LinkCommand = C.getJobs().getJobs().back();

View File

@ -1,4 +1,15 @@
set(LLVM_LINK_COMPONENTS BitWriter Core BinaryFormat IRReader Object Support)
set(LLVM_LINK_COMPONENTS
${LLVM_TARGETS_TO_BUILD}
BitWriter
Core
BinaryFormat
MC
Passes
IRReader
Object
Support
CodeGen
LTO)
if(NOT CLANG_BUILT_STANDALONE)
set(tablegen_deps intrinsics_gen)

View File

@ -17,9 +17,12 @@
#include "clang/Basic/Version.h"
#include "llvm/BinaryFormat/Magic.h"
#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/CodeGen/CommandFlags.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/Module.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/LTO/LTO.h"
#include "llvm/Object/Archive.h"
#include "llvm/Object/ArchiveWriter.h"
#include "llvm/Object/Binary.h"
@ -36,6 +39,7 @@
#include "llvm/Support/Signals.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/StringSaver.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/WithColor.h"
#include "llvm/Support/raw_ostream.h"
@ -58,6 +62,15 @@ static cl::opt<std::string> LinkerUserPath("linker-path",
cl::desc("Path of linker binary"),
cl::cat(ClangLinkerWrapperCategory));
static cl::opt<std::string>
TargetFeatures("target-feature", cl::desc("Target features for triple"),
cl::cat(ClangLinkerWrapperCategory));
static cl::opt<std::string> OptLevel("opt-level",
cl::desc("Optimization level for LTO"),
cl::init("O0"),
cl::cat(ClangLinkerWrapperCategory));
// Do not parse linker options.
static cl::list<std::string>
HostLinkerArgs(cl::Sink, cl::desc("<options to be passed to linker>..."));
@ -68,6 +81,9 @@ static std::string LinkerExecutable;
/// Temporary files created by the linker wrapper.
static SmallVector<std::string, 16> TempFiles;
/// Codegen flags for LTO backend.
static codegen::RegisterCodeGenFlags CodeGenFlags;
/// Magic section string that marks the existence of offloading data. The
/// section string will be formatted as `.llvm.offloading.<triple>.<arch>`.
#define OFFLOAD_SECTION_MAGIC_STR ".llvm.offloading."
@ -191,6 +207,28 @@ extractFromBinary(const ObjectFile &Obj,
if (ToBeStripped.empty())
return None;
// If the object file to strip doesn't exist we need to write it so we can
// pass it to llvm-strip.
SmallString<128> StripFile = Obj.getFileName();
if (!sys::fs::exists(StripFile)) {
SmallString<128> TempFile;
if (std::error_code EC = sys::fs::createTemporaryFile(
sys::path::stem(StripFile), "o", TempFile))
return createFileError(TempFile, EC);
TempFiles.push_back(static_cast<std::string>(TempFile));
auto Contents = Obj.getMemoryBufferRef().getBuffer();
Expected<std::unique_ptr<FileOutputBuffer>> OutputOrErr =
FileOutputBuffer::create(TempFile, Contents.size());
if (!OutputOrErr)
return OutputOrErr.takeError();
std::unique_ptr<FileOutputBuffer> Output = std::move(*OutputOrErr);
std::copy(Contents.begin(), Contents.end(), Output->getBufferStart());
if (Error E = Output->commit())
return E;
StripFile = TempFile;
}
// We will use llvm-strip to remove the now unneeded section containing the
// offloading code.
ErrorOr<std::string> StripPath = sys::findProgramByName("llvm-strip");
@ -207,7 +245,7 @@ extractFromBinary(const ObjectFile &Obj,
SmallVector<StringRef, 8> StripArgs;
StripArgs.push_back(*StripPath);
StripArgs.push_back("--no-strip-all");
StripArgs.push_back(Obj.getFileName());
StripArgs.push_back(StripFile);
for (auto &Section : ToBeStripped) {
StripArgs.push_back("--remove-section");
StripArgs.push_back(Section);
@ -408,6 +446,44 @@ extractFromBuffer(std::unique_ptr<MemoryBuffer> Buffer,
// TODO: Move these to a separate file.
namespace nvptx {
Expected<std::string> assemble(StringRef InputFile, Triple TheTriple,
StringRef Arch) {
// NVPTX uses the nvlink binary to link device object files.
ErrorOr<std::string> PtxasPath =
sys::findProgramByName("ptxas", sys::path::parent_path(LinkerExecutable));
if (!PtxasPath)
PtxasPath = sys::findProgramByName("ptxas");
if (!PtxasPath)
return createStringError(PtxasPath.getError(),
"Unable to find 'ptxas' in path");
// Create a new file to write the linked device image to.
SmallString<128> TempFile;
if (std::error_code EC = sys::fs::createTemporaryFile(
TheTriple.getArchName() + "-" + Arch, "cubin", TempFile))
return createFileError(TempFile, EC);
TempFiles.push_back(static_cast<std::string>(TempFile));
// TODO: Pass in arguments like `-g` and `-v` from the driver.
SmallVector<StringRef, 16> CmdArgs;
std::string Opt = "-" + OptLevel;
CmdArgs.push_back(*PtxasPath);
CmdArgs.push_back(TheTriple.isArch64Bit() ? "-m64" : "-m32");
CmdArgs.push_back("-o");
CmdArgs.push_back(TempFile);
CmdArgs.push_back(Opt);
CmdArgs.push_back("--gpu-name");
CmdArgs.push_back(Arch);
CmdArgs.push_back("-c");
CmdArgs.push_back(InputFile);
if (sys::ExecuteAndWait(*PtxasPath, CmdArgs))
return createStringError(inconvertibleErrorCode(), "'ptxas' failed");
return static_cast<std::string>(TempFile);
}
Expected<std::string> link(ArrayRef<StringRef> InputFiles,
ArrayRef<std::string> LinkerArgs, Triple TheTriple,
StringRef Arch) {
@ -468,6 +544,221 @@ Expected<std::string> linkDevice(ArrayRef<StringRef> InputFiles,
}
}
void diagnosticHandler(const DiagnosticInfo &DI) {
std::string ErrStorage;
raw_string_ostream OS(ErrStorage);
DiagnosticPrinterRawOStream DP(OS);
DI.print(DP);
switch (DI.getSeverity()) {
case DS_Error:
WithColor::error(errs(), LinkerExecutable) << ErrStorage;
break;
case DS_Warning:
WithColor::warning(errs(), LinkerExecutable) << ErrStorage;
break;
case DS_Note:
WithColor::note(errs(), LinkerExecutable) << ErrStorage;
break;
case DS_Remark:
WithColor::remark(errs(), LinkerExecutable) << ErrStorage;
break;
}
}
// Get the target features passed in from the driver as <triple>=<features>.
std::vector<std::string> getTargetFeatures(const Triple &TheTriple) {
std::vector<std::string> Features;
auto TargetAndFeatures = StringRef(TargetFeatures).split('=');
if (TargetAndFeatures.first != TheTriple.getTriple())
return Features;
for (auto Feature : llvm::split(TargetAndFeatures.second, ','))
Features.push_back(Feature.str());
return Features;
}
CodeGenOpt::Level getCGOptLevel(unsigned OptLevel) {
switch (OptLevel) {
case 0:
return CodeGenOpt::None;
case 1:
return CodeGenOpt::Less;
case 2:
return CodeGenOpt::Default;
case 3:
return CodeGenOpt::Aggressive;
}
llvm_unreachable("Invalid optimization level");
}
std::unique_ptr<lto::LTO> createLTO(const Triple &TheTriple, StringRef Arch,
bool WholeProgram) {
lto::Config Conf;
lto::ThinBackend Backend;
// TODO: Handle index-only thin-LTO
Backend = lto::createInProcessThinBackend(
llvm::heavyweight_hardware_concurrency(1));
Conf.CPU = Arch.str();
Conf.Options = codegen::InitTargetOptionsFromCodeGenFlags(TheTriple);
Conf.MAttrs = getTargetFeatures(TheTriple);
Conf.CGOptLevel = getCGOptLevel(OptLevel[1] - '0');
Conf.OptLevel = OptLevel[1] - '0';
Conf.DefaultTriple = TheTriple.getTriple();
Conf.DiagHandler = diagnosticHandler;
Conf.PTO.LoopVectorization = Conf.OptLevel > 1;
Conf.PTO.SLPVectorization = Conf.OptLevel > 1;
// TODO: Handle outputting bitcode using a module hook.
if (TheTriple.isNVPTX())
Conf.CGFileType = CGFT_AssemblyFile;
else
Conf.CGFileType = CGFT_ObjectFile;
// TODO: Handle remark files
Conf.HasWholeProgramVisibility = WholeProgram;
return std::make_unique<lto::LTO>(std::move(Conf), Backend);
}
// Returns true if \p S is valid as a C language identifier and will be given
// `__start_` and `__stop_` symbols.
bool isValidCIdentifier(StringRef S) {
return !S.empty() && (isAlpha(S[0]) || S[0] == '_') &&
std::all_of(S.begin() + 1, S.end(),
[](char C) { return C == '_' || isAlnum(C); });
}
Expected<Optional<std::string>> linkBitcodeFiles(ArrayRef<StringRef> InputFiles,
const Triple &TheTriple,
StringRef Arch) {
SmallVector<std::unique_ptr<MemoryBuffer>, 4> SavedBuffers;
SmallVector<std::unique_ptr<lto::InputFile>, 4> BitcodeFiles;
StringMap<bool> UsedInRegularObj;
// Search for bitcode files in the input and create an LTO input file. If it
// is not a bitcode file, scan its symbol table for symbols we need to
// save.
for (StringRef File : InputFiles) {
ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr =
MemoryBuffer::getFileOrSTDIN(File);
if (std::error_code EC = BufferOrErr.getError())
return createFileError(File, EC);
file_magic Type = identify_magic((*BufferOrErr)->getBuffer());
if (Type != file_magic::bitcode) {
Expected<std::unique_ptr<ObjectFile>> ObjFile =
ObjectFile::createObjectFile(**BufferOrErr, Type);
if (!ObjFile)
return ObjFile.takeError();
for (auto &Sym : (*ObjFile)->symbols()) {
Expected<StringRef> Name = Sym.getName();
if (!Name)
return Name.takeError();
UsedInRegularObj[*Name] = true;
}
} else {
Expected<std::unique_ptr<lto::InputFile>> InputFileOrErr =
llvm::lto::InputFile::create(**BufferOrErr);
if (!InputFileOrErr)
return InputFileOrErr.takeError();
BitcodeFiles.push_back(std::move(*InputFileOrErr));
SavedBuffers.push_back(std::move(*BufferOrErr));
}
}
if (BitcodeFiles.empty())
return None;
// We have visibility of the whole program if every input is bitcode, all
// inputs are statically linked so there should be no external references.
bool WholeProgram = BitcodeFiles.size() == InputFiles.size();
StringMap<bool> PrevailingSymbols;
// TODO: Run more tests to verify that this is correct.
// Create the LTO instance with the necessary config and add the bitcode files
// to it after resolving symbols. We make a few assumptions about symbol
// resolution.
// 1. The target is going to be a stand-alone executable file.
// 2. We do not support relocatable object files.
// 3. All inputs are relocatable object files extracted from host binaries, so
// there is no resolution to a dynamic library.
auto LTOBackend = createLTO(TheTriple, Arch, WholeProgram);
for (auto &BitcodeFile : BitcodeFiles) {
const auto Symbols = BitcodeFile->symbols();
SmallVector<lto::SymbolResolution, 16> Resolutions(Symbols.size());
size_t Idx = 0;
for (auto &Sym : Symbols) {
lto::SymbolResolution &Res = Resolutions[Idx++];
// We will use this as the prevailing symbol definition in LTO unless
// it is undefined in the module or another symbol has already been used.
Res.Prevailing = !Sym.isUndefined() && !PrevailingSymbols[Sym.getName()];
// We need LTO to preserve symbols referenced in other object files, or
// are needed by the rest of the toolchain.
Res.VisibleToRegularObj =
UsedInRegularObj[Sym.getName()] ||
isValidCIdentifier(Sym.getSectionName()) ||
(Res.Prevailing && Sym.getName().startswith("__omp"));
// We do not currently support shared libraries, so no symbols will be
// referenced externally by shared libraries.
Res.ExportDynamic = false;
// The result will currently always be an executable, so the only time the
// definition will not reside in this link unit is if it's undefined.
Res.FinalDefinitionInLinkageUnit = !Sym.isUndefined();
// We do not support linker redefined symbols (e.g. --wrap) for device
// image linking, so the symbols will not be changed after LTO.
Res.LinkerRedefined = false;
// Mark this symbol as the prevailing one.
PrevailingSymbols[Sym.getName()] |= Res.Prevailing;
}
// Add the bitcode file with its resolved symbols to the LTO job.
if (Error Err = LTOBackend->add(std::move(BitcodeFile), Resolutions))
return Err;
}
// Run the LTO job to compile the bitcode.
size_t MaxTasks = LTOBackend->getMaxTasks();
std::vector<SmallString<128>> Files(MaxTasks);
auto AddStream = [&](size_t Task) -> std::unique_ptr<CachedFileStream> {
int FD = -1;
auto &TempFile = Files[Task];
StringRef Extension = (TheTriple.isNVPTX()) ? "s" : "o";
if (std::error_code EC = sys::fs::createTemporaryFile(
"lto-" + TheTriple.getTriple(), Extension, FD, TempFile))
return nullptr;
TempFiles.push_back(static_cast<std::string>(TempFile));
return std::make_unique<CachedFileStream>(
std::make_unique<llvm::raw_fd_ostream>(FD, true));
};
if (Error Err = LTOBackend->run(AddStream))
return Err;
for (auto &File : Files) {
if (!TheTriple.isNVPTX())
continue;
auto FileOrErr = nvptx::assemble(File, TheTriple, Arch);
if (!FileOrErr)
return FileOrErr.takeError();
File = *FileOrErr;
}
return static_cast<std::string>(Files.front());
}
/// Runs the appropriate linking action on all the device files specified in \p
/// DeviceFiles. The linked device images are returned in \p LinkedImages.
Error linkDeviceFiles(ArrayRef<DeviceFile> DeviceFiles,
@ -485,6 +776,12 @@ Error linkDeviceFiles(ArrayRef<DeviceFile> DeviceFiles,
StringRef Arch(TargetFeatures.second);
// TODO: Run LTO or bitcode linking before the final link job.
auto ObjectOrErr =
linkBitcodeFiles(LinkerInput.getValue(), TheTriple, Arch);
if (!ObjectOrErr)
return ObjectOrErr.takeError();
if ((*ObjectOrErr).hasValue())
LinkerInput.getValue() = {**ObjectOrErr};
auto ImageOrErr =
linkDevice(LinkerInput.getValue(), LinkerArgs, TheTriple, Arch);
@ -509,7 +806,7 @@ Expected<std::string> wrapDeviceImage(StringRef ImageFile) {
// Create a new file to write the wrapped bitcode file to.
SmallString<128> BitcodeFile;
if (std::error_code EC =
sys::fs::createTemporaryFile("offload", "bc", BitcodeFile))
sys::fs::createTemporaryFile("wrapper", "bc", BitcodeFile))
return createFileError(BitcodeFile, EC);
TempFiles.push_back(static_cast<std::string>(BitcodeFile));
@ -535,7 +832,7 @@ Expected<std::string> wrapDeviceImage(StringRef ImageFile) {
// Create a new file to write the wrapped bitcode file to.
SmallString<128> ObjectFile;
if (std::error_code EC =
sys::fs::createTemporaryFile("offload", "o", ObjectFile))
sys::fs::createTemporaryFile("image", "o", ObjectFile))
return createFileError(BitcodeFile, EC);
TempFiles.push_back(static_cast<std::string>(ObjectFile));
@ -573,6 +870,8 @@ Optional<std::string> findFromSearchPaths(StringRef Name,
Optional<std::string> searchLibraryBaseName(StringRef Name,
ArrayRef<StringRef> SearchPaths) {
for (StringRef Dir : SearchPaths) {
if (Optional<std::string> File = findFile(Dir, "lib" + Name + ".so"))
return None;
if (Optional<std::string> File = findFile(Dir, "lib" + Name + ".a"))
return File;
}
@ -595,6 +894,11 @@ Optional<std::string> searchLibrary(StringRef Input,
int main(int argc, const char **argv) {
InitLLVM X(argc, argv);
InitializeAllTargetInfos();
InitializeAllTargets();
InitializeAllTargetMCs();
InitializeAllAsmParsers();
InitializeAllAsmPrinters();
LinkerExecutable = argv[0];
sys::PrintStackTraceOnErrorSignal(argv[0]);