forked from OSchip/llvm-project
[OpenMP] Link the bitcode library late for device LTO
Summary: This patch adds support for linking the OpenMP device bitcode library late when doing LTO. This simply passes it in as an additional device file when doing the final device linking phase with LTO. This has the advantage that we don't link it multiple times, and the device references do not get inlined and prevent us from doing needed OpenMP optimizations when we have visibility of the whole module. Fix some failings where the implicit conversion of an Error to an Expected triggered the deleted copy constructor. Depends on D116675 Differential revision: https://reviews.llvm.org/D117048
This commit is contained in:
parent
c732c3df74
commit
3762111aa9
|
@ -285,6 +285,10 @@ void AMDGPUOpenMPToolChain::addClangTargetOptions(
|
|||
if (DriverArgs.hasArg(options::OPT_nogpulib))
|
||||
return;
|
||||
|
||||
// Link the bitcode library late if we're using device LTO.
|
||||
if (getDriver().isUsingLTO(/* IsOffload */ true))
|
||||
return;
|
||||
|
||||
std::string BitcodeSuffix;
|
||||
if (DriverArgs.hasFlag(options::OPT_fopenmp_target_new_runtime,
|
||||
options::OPT_fno_openmp_target_new_runtime, true))
|
||||
|
|
|
@ -8164,6 +8164,34 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
|
|||
"-target-feature=" + TC->getTripleString() + "=" + *(FeatureIt + 1)));
|
||||
}
|
||||
|
||||
// Pass in the bitcode library to be linked during LTO.
|
||||
for (auto TI = OpenMPTCRange.first, TE = OpenMPTCRange.second; TI != TE;
|
||||
++TI) {
|
||||
const ToolChain *TC = TI->second;
|
||||
const Driver &D = TC->getDriver();
|
||||
const ArgList &TCArgs = C.getArgsForToolChain(TC, "", Action::OFK_OpenMP);
|
||||
StringRef Arch = TCArgs.getLastArgValue(options::OPT_march_EQ);
|
||||
|
||||
std::string BitcodeSuffix;
|
||||
if (TCArgs.hasFlag(options::OPT_fopenmp_target_new_runtime,
|
||||
options::OPT_fno_openmp_target_new_runtime, true))
|
||||
BitcodeSuffix += "new-";
|
||||
if (TC->getTriple().isNVPTX())
|
||||
BitcodeSuffix += "nvptx-";
|
||||
else if (TC->getTriple().isAMDGPU())
|
||||
BitcodeSuffix += "amdgpu-";
|
||||
BitcodeSuffix += Arch;
|
||||
|
||||
ArgStringList BitcodeLibrary;
|
||||
addOpenMPDeviceRTL(D, TCArgs, BitcodeLibrary, BitcodeSuffix,
|
||||
TC->getTriple());
|
||||
|
||||
if (!BitcodeLibrary.empty())
|
||||
CmdArgs.push_back(
|
||||
Args.MakeArgString("-target-library=" + TC->getTripleString() +
|
||||
"-" + Arch + "=" + BitcodeLibrary.back()));
|
||||
}
|
||||
|
||||
// Pass in the optimization level to use for LTO.
|
||||
if (const Arg *A = Args.getLastArg(options::OPT_O_Group)) {
|
||||
StringRef OOpt;
|
||||
|
|
|
@ -744,6 +744,10 @@ void CudaToolChain::addClangTargetOptions(
|
|||
return;
|
||||
}
|
||||
|
||||
// Link the bitcode library late if we're using device LTO.
|
||||
if (getDriver().isUsingLTO(/* IsOffload */ true))
|
||||
return;
|
||||
|
||||
std::string BitcodeSuffix;
|
||||
if (DriverArgs.hasFlag(options::OPT_fopenmp_target_new_runtime,
|
||||
options::OPT_fno_openmp_target_new_runtime, true))
|
||||
|
|
|
@ -68,9 +68,14 @@ static cl::opt<std::string>
|
|||
|
||||
static cl::opt<std::string> OptLevel("opt-level",
|
||||
cl::desc("Optimization level for LTO"),
|
||||
cl::init("O0"),
|
||||
cl::init("O2"),
|
||||
cl::cat(ClangLinkerWrapperCategory));
|
||||
|
||||
// Device bitcode library to add to the device files before device linking.
// main() parses the value as "<triple>-<arch>=<path>": it splits on the first
// '=' and then on the last '-' of the left-hand side.
static cl::opt<std::string>
|
||||
BitcodeLibrary("target-library",
|
||||
cl::desc("Path for the target bitcode library"),
|
||||
cl::cat(ClangLinkerWrapperCategory));
|
||||
|
||||
// Do not parse linker options.
|
||||
static cl::list<std::string>
|
||||
HostLinkerArgs(cl::Sink, cl::desc("<options to be passed to linker>..."));
|
||||
|
@ -197,7 +202,7 @@ extractFromBinary(const ObjectFile &Obj,
|
|||
std::unique_ptr<FileOutputBuffer> Output = std::move(*OutputOrErr);
|
||||
std::copy(Contents->begin(), Contents->end(), Output->getBufferStart());
|
||||
if (Error E = Output->commit())
|
||||
return E;
|
||||
return std::move(E);
|
||||
|
||||
DeviceFiles.emplace_back(DeviceTriple, Arch, TempFile);
|
||||
ToBeStripped.push_back(*Name);
|
||||
|
@ -225,7 +230,7 @@ extractFromBinary(const ObjectFile &Obj,
|
|||
std::unique_ptr<FileOutputBuffer> Output = std::move(*OutputOrErr);
|
||||
std::copy(Contents.begin(), Contents.end(), Output->getBufferStart());
|
||||
if (Error E = Output->commit())
|
||||
return E;
|
||||
return std::move(E);
|
||||
StripFile = TempFile;
|
||||
}
|
||||
|
||||
|
@ -307,7 +312,7 @@ extractFromBitcode(std::unique_ptr<MemoryBuffer> Buffer,
|
|||
std::unique_ptr<FileOutputBuffer> Output = std::move(*OutputOrErr);
|
||||
std::copy(Contents.begin(), Contents.end(), Output->getBufferStart());
|
||||
if (Error E = Output->commit())
|
||||
return E;
|
||||
return std::move(E);
|
||||
|
||||
DeviceFiles.emplace_back(DeviceTriple, Arch, TempFile);
|
||||
ToBeDeleted.push_back(&GV);
|
||||
|
@ -318,7 +323,7 @@ extractFromBitcode(std::unique_ptr<MemoryBuffer> Buffer,
|
|||
|
||||
// We need to materialize the lazy module before we make any changes.
|
||||
if (Error Err = M->materializeAll())
|
||||
return Err;
|
||||
return std::move(Err);
|
||||
|
||||
// Remove the global from the module and write it to a new file.
|
||||
for (GlobalVariable *GV : ToBeDeleted) {
|
||||
|
@ -392,7 +397,7 @@ extractFromArchive(const Archive &Library,
|
|||
}
|
||||
|
||||
if (Err)
|
||||
return Err;
|
||||
return std::move(Err);
|
||||
|
||||
if (!NewMembers)
|
||||
return None;
|
||||
|
@ -406,9 +411,9 @@ extractFromArchive(const Archive &Library,
|
|||
|
||||
std::unique_ptr<MemoryBuffer> Buffer =
|
||||
MemoryBuffer::getMemBuffer(Library.getMemoryBufferRef(), false);
|
||||
if (Error WriteErr = writeArchive(TempFile, Members, true, Library.kind(),
|
||||
if (Error Err = writeArchive(TempFile, Members, true, Library.kind(),
|
||||
true, Library.isThin(), std::move(Buffer)))
|
||||
return WriteErr;
|
||||
return std::move(Err);
|
||||
|
||||
return static_cast<std::string>(TempFile);
|
||||
}
|
||||
|
@ -726,7 +731,7 @@ Expected<Optional<std::string>> linkBitcodeFiles(ArrayRef<StringRef> InputFiles,
|
|||
|
||||
// Add the bitcode file with its resolved symbols to the LTO job.
|
||||
if (Error Err = LTOBackend->add(std::move(BitcodeFile), Resolutions))
|
||||
return Err;
|
||||
return std::move(Err);
|
||||
}
|
||||
|
||||
// Run the LTO job to compile the bitcode.
|
||||
|
@ -744,7 +749,7 @@ Expected<Optional<std::string>> linkBitcodeFiles(ArrayRef<StringRef> InputFiles,
|
|||
std::make_unique<llvm::raw_fd_ostream>(FD, true));
|
||||
};
|
||||
if (Error Err = LTOBackend->run(AddStream))
|
||||
return Err;
|
||||
return std::move(Err);
|
||||
|
||||
for (auto &File : Files) {
|
||||
if (!TheTriple.isNVPTX())
|
||||
|
@ -957,6 +962,17 @@ int main(int argc, const char **argv) {
|
|||
}
|
||||
}
|
||||
|
||||
// Add the device bitcode library to the device files if it was passed in.
|
||||
if (!BitcodeLibrary.empty()) {
|
||||
// FIXME: Hacky workaround to avoid a backend crash at O0.
|
||||
if (OptLevel[1] - '0' == 0)
|
||||
OptLevel[1] = '1';
|
||||
auto DeviceAndPath = StringRef(BitcodeLibrary).split('=');
|
||||
auto TripleAndArch = DeviceAndPath.first.rsplit('-');
|
||||
DeviceFiles.emplace_back(TripleAndArch.first, TripleAndArch.second,
|
||||
DeviceAndPath.second);
|
||||
}
|
||||
|
||||
// Link the device images extracted from the linker input.
|
||||
SmallVector<std::string, 16> LinkedImages;
|
||||
if (Error Err = linkDeviceFiles(DeviceFiles, LinkerArgs, LinkedImages))
|
||||
|
|
Loading…
Reference in New Issue