[ELF] Parallelize writes of different OutputSections

We currently process one OutputSection at a time and for each OutputSection
write contained input sections in parallel. This strategy does not leverage
multi-threading well. Instead, parallelize writes of different OutputSections.

The default TaskSize computed by parallelFor often leads to inferior sharding, so
we prepare the tasks in the caller instead (a usage sketch follows the list below).

* Move llvm::parallel::detail::TaskGroup to llvm::parallel::TaskGroup.
* Add llvm::parallel::TaskGroup::execute.
* Change writeSections to declare TaskGroup and pass it to writeTo.
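
A minimal sketch of the resulting caller-side pattern, assuming the post-patch
llvm::parallel::TaskGroup API; the 4 MiB limit mirrors the lld change, while
writeAllChunks and chunkSizes are illustrative stand-ins:

#include "llvm/Support/Parallel.h"
#include <cstddef>
#include <vector>

// Batch consecutive chunks until ~4 MiB of data accumulates, then hand the
// whole batch to the TaskGroup as one task.
void writeAllChunks(const std::vector<size_t> &chunkSizes) {
  if (chunkSizes.empty())
    return;
  llvm::parallel::TaskGroup tg;
  const size_t taskSizeLimit = 4 << 20;
  for (size_t begin = 0, i = 0, taskSize = 0;;) {
    taskSize += chunkSizes[i];
    bool done = ++i == chunkSizes.size();
    if (done || taskSize >= taskSizeLimit) {
      // execute() runs the task inline when one thread is requested;
      // otherwise it spawns the task on the default executor.
      tg.execute([=] { /* write chunks [begin, i) */ });
      if (done)
        break;
      begin = i;
      taskSize = 0;
    }
  }
}  // ~TaskGroup() blocks until every spawned task has finished.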

Speed-up with --threads=8:

* clang -DCMAKE_BUILD_TYPE=Release: 1.11x as fast
* clang -DCMAKE_BUILD_TYPE=Debug: 1.10x as fast
* chrome -DCMAKE_BUILD_TYPE=Release: 1.04x as fast
* scylladb build/release: 1.09x as fast

On M1, most benchmarks are only a small fraction of a percent faster. Mozilla showed the largest difference, with the patch about 1.03x as fast.

Differential Revision: https://reviews.llvm.org/D131247
Fangrui Song, 2022-08-24 09:40:03 -07:00
commit 3b4d800911, parent e854c17b02
8 changed files with 132 additions and 59 deletions

lld/ELF/OutputSections.cpp

@@ -332,7 +332,10 @@ template <class ELFT> void OutputSection::maybeCompress() {
   // Write uncompressed data to a temporary zero-initialized buffer.
   auto buf = std::make_unique<uint8_t[]>(size);
-  writeTo<ELFT>(buf.get());
+  {
+    parallel::TaskGroup tg;
+    writeTo<ELFT>(buf.get(), tg);
+  }
   // We chose 1 (Z_BEST_SPEED) as the default compression level because it is
   // the fastest. If -O2 is given, we use level 6 to compress debug info more by
   // ~15%. We found that level 7 to 9 doesn't make much difference (~1% more
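
Note the new braces: TaskGroup's destructor syncs on outstanding tasks, so the
temporary buffer is guaranteed to be fully written before the compression code
below reads it.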
@@ -386,7 +389,8 @@ static void writeInt(uint8_t *buf, uint64_t data, uint64_t size) {
     llvm_unreachable("unsupported Size argument");
 }
 
-template <class ELFT> void OutputSection::writeTo(uint8_t *buf) {
+template <class ELFT>
+void OutputSection::writeTo(uint8_t *buf, parallel::TaskGroup &tg) {
   llvm::TimeTraceScope timeScope("Write sections", name);
   if (type == SHT_NOBITS)
     return;
@@ -419,41 +423,68 @@ template <class ELFT> void OutputSection::writeTo(uint8_t *buf) {
   }
 
   // Write leading padding.
-  SmallVector<InputSection *, 0> storage;
   ArrayRef<InputSection *> sections = getInputSections(*this, storage);
   std::array<uint8_t, 4> filler = getFiller();
   bool nonZeroFiller = read32(filler.data()) != 0;
   if (nonZeroFiller)
     fill(buf, sections.empty() ? size : sections[0]->outSecOff, filler);
-  parallelFor(0, sections.size(), [&](size_t i) {
-    InputSection *isec = sections[i];
-    if (auto *s = dyn_cast<SyntheticSection>(isec))
-      s->writeTo(buf + isec->outSecOff);
-    else
-      isec->writeTo<ELFT>(buf + isec->outSecOff);
-
-    // Fill gaps between sections.
-    if (nonZeroFiller) {
-      uint8_t *start = buf + isec->outSecOff + isec->getSize();
-      uint8_t *end;
-      if (i + 1 == sections.size())
-        end = buf + size;
-      else
-        end = buf + sections[i + 1]->outSecOff;
-      if (isec->nopFiller) {
-        assert(target->nopInstrs);
-        nopInstrFill(start, end - start);
-      } else
-        fill(start, end - start, filler);
-    }
-  });
 
-  // Linker scripts may have BYTE()-family commands with which you
-  // can write arbitrary bytes to the output. Process them if any.
+  auto fn = [=](size_t begin, size_t end) {
+    size_t numSections = sections.size();
+    for (size_t i = begin; i != end; ++i) {
+      InputSection *isec = sections[i];
+      if (auto *s = dyn_cast<SyntheticSection>(isec))
+        s->writeTo(buf + isec->outSecOff);
+      else
+        isec->writeTo<ELFT>(buf + isec->outSecOff);
+
+      // Fill gaps between sections.
+      if (nonZeroFiller) {
+        uint8_t *start = buf + isec->outSecOff + isec->getSize();
+        uint8_t *end;
+        if (i + 1 == numSections)
+          end = buf + size;
+        else
+          end = buf + sections[i + 1]->outSecOff;
+        if (isec->nopFiller) {
+          assert(target->nopInstrs);
+          nopInstrFill(start, end - start);
+        } else
+          fill(start, end - start, filler);
+      }
+    }
+  };
+
+  // If there is any BYTE()-family command (rare), write the section content
+  // first then process BYTE to overwrite the filler content. The write is
+  // serial due to the limitation of llvm/Support/Parallel.h.
+  bool written = false;
+  size_t numSections = sections.size();
   for (SectionCommand *cmd : commands)
-    if (auto *data = dyn_cast<ByteCommand>(cmd))
+    if (auto *data = dyn_cast<ByteCommand>(cmd)) {
+      if (!std::exchange(written, true))
+        fn(0, numSections);
       writeInt(buf + data->offset, data->expression().getValue(), data->size);
+    }
+  if (written || !numSections)
+    return;
+
+  // There is no data command. Write content asynchronously to overlap the write
+  // time with other output sections. Note, if a linker script specifies
+  // overlapping output sections (needs --noinhibit-exec or --no-check-sections
+  // to suppress the error), the output may be non-deterministic.
+  const size_t taskSizeLimit = 4 << 20;
+  for (size_t begin = 0, i = 0, taskSize = 0;;) {
+    taskSize += sections[i]->getSize();
+    bool done = ++i == numSections;
+    if (done || taskSize >= taskSizeLimit) {
+      tg.execute([=] { fn(begin, i); });
+      if (done)
+        break;
+      begin = i;
+      taskSize = 0;
+    }
+  }
 }
 
 static void finalizeShtGroup(OutputSection *os, InputSection *section) {
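
A note on granularity: the 4 MiB taskSizeLimit only groups consecutive input
sections into one task; a single input section is never split, so one very
large input section still forms a single task. Sections driven by
BYTE()-family commands fall back to the serial fn(0, numSections) path, as the
comment in the hunk explains.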
@@ -673,10 +704,14 @@ template void OutputSection::writeHeaderTo<ELF32BE>(ELF32BE::Shdr *Shdr);
 template void OutputSection::writeHeaderTo<ELF64LE>(ELF64LE::Shdr *Shdr);
 template void OutputSection::writeHeaderTo<ELF64BE>(ELF64BE::Shdr *Shdr);
 
-template void OutputSection::writeTo<ELF32LE>(uint8_t *Buf);
-template void OutputSection::writeTo<ELF32BE>(uint8_t *Buf);
-template void OutputSection::writeTo<ELF64LE>(uint8_t *Buf);
-template void OutputSection::writeTo<ELF64BE>(uint8_t *Buf);
+template void OutputSection::writeTo<ELF32LE>(uint8_t *,
+                                              llvm::parallel::TaskGroup &);
+template void OutputSection::writeTo<ELF32BE>(uint8_t *,
+                                              llvm::parallel::TaskGroup &);
+template void OutputSection::writeTo<ELF64LE>(uint8_t *,
+                                              llvm::parallel::TaskGroup &);
+template void OutputSection::writeTo<ELF64BE>(uint8_t *,
+                                              llvm::parallel::TaskGroup &);
 
 template void OutputSection::maybeCompress<ELF32LE>();
 template void OutputSection::maybeCompress<ELF32BE>();

lld/ELF/OutputSections.h

@@ -12,6 +12,7 @@
 #include "InputSection.h"
 #include "LinkerScript.h"
 #include "lld/Common/LLVM.h"
+#include "llvm/Support/Parallel.h"
 
 #include <array>
@@ -104,7 +105,8 @@ public:
   bool relro = false;
 
   void finalize();
-  template <class ELFT> void writeTo(uint8_t *buf);
+  template <class ELFT>
+  void writeTo(uint8_t *buf, llvm::parallel::TaskGroup &tg);
   // Check that the addends for dynamic relocations were written correctly.
   void checkDynRelAddends(const uint8_t *bufStart);
   template <class ELFT> void maybeCompress();
@@ -114,6 +116,8 @@ public:
   void sortCtorsDtors();
 
 private:
+  SmallVector<InputSection *, 0> storage;
+
   // Used for implementation of --compress-debug-sections option.
   CompressedData compressed;
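
Note why storage moves from a local in writeTo to a class member: writeTo now
returns while tasks referencing the sections array (which is backed by
storage) may still be pending in the caller's TaskGroup, so a function-local
vector would be destroyed before those tasks run.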

lld/ELF/Writer.cpp

@@ -2839,9 +2839,10 @@ template <class ELFT> void Writer<ELFT>::openFile() {
 }
 
 template <class ELFT> void Writer<ELFT>::writeSectionsBinary() {
+  parallel::TaskGroup tg;
   for (OutputSection *sec : outputSections)
     if (sec->flags & SHF_ALLOC)
-      sec->writeTo<ELFT>(Out::bufferStart + sec->offset);
+      sec->writeTo<ELFT>(Out::bufferStart + sec->offset, tg);
 }
 
 static void fillTrap(uint8_t *i, uint8_t *end) {
@@ -2884,16 +2885,21 @@ template <class ELFT> void Writer<ELFT>::writeTrapInstr() {
 template <class ELFT> void Writer<ELFT>::writeSections() {
   llvm::TimeTraceScope timeScope("Write sections");
 
-  // In -r or --emit-relocs mode, write the relocation sections first as in
-  // ELf_Rel targets we might find out that we need to modify the relocated
-  // section while doing it.
-  for (OutputSection *sec : outputSections)
-    if (sec->type == SHT_REL || sec->type == SHT_RELA)
-      sec->writeTo<ELFT>(Out::bufferStart + sec->offset);
-
-  for (OutputSection *sec : outputSections)
-    if (sec->type != SHT_REL && sec->type != SHT_RELA)
-      sec->writeTo<ELFT>(Out::bufferStart + sec->offset);
+  {
+    // In -r or --emit-relocs mode, write the relocation sections first as in
+    // ELf_Rel targets we might find out that we need to modify the relocated
+    // section while doing it.
+    parallel::TaskGroup tg;
+    for (OutputSection *sec : outputSections)
+      if (sec->type == SHT_REL || sec->type == SHT_RELA)
+        sec->writeTo<ELFT>(Out::bufferStart + sec->offset, tg);
+  }
+  {
+    parallel::TaskGroup tg;
+    for (OutputSection *sec : outputSections)
+      if (sec->type != SHT_REL && sec->type != SHT_RELA)
+        sec->writeTo<ELFT>(Out::bufferStart + sec->offset, tg);
+  }
 
   // Finally, check that all dynamic relocation addends were written correctly.
   if (config->checkDynamicRelocs && config->writeAddends) {
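
Because each brace-scoped TaskGroup syncs in its destructor, every
SHT_REL/SHT_RELA section is completely written before the second loop starts:
writes run in parallel within each phase while the cross-phase ordering that
-r/--emit-relocs requires is preserved.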

lld/test/ELF/… (ARM test emitting warnings)

@@ -1,6 +1,7 @@
 // REQUIRES: arm
 // RUN: llvm-mc -g --triple=armv7a-linux-gnueabihf -arm-add-build-attributes -filetype=obj -o %t.o %s
-// RUN: ld.lld %t.o -o %t 2>&1 | FileCheck %s --check-prefix=WARN
+/// Use --threads=1 to keep emitted warnings across sections sequential.
+// RUN: ld.lld %t.o -o %t --threads=1 2>&1 | FileCheck %s --check-prefix=WARN
 // RUN: llvm-objdump --no-show-raw-insn -d %t | FileCheck %s
 
 .syntax unified

lld/test/ELF/… (Hexagon out-of-range jump test)

@@ -1,6 +1,7 @@
 # REQUIRES: hexagon
 # RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf %s -o %t.o
-# RUN: not ld.lld %t.o -o /dev/null 2>&1 | FileCheck --implicit-check-not "out of range" %s
+## Use --threads=1 to keep emitted warnings across sections sequential.
+# RUN: not ld.lld %t.o -o /dev/null --threads=1 2>&1 | FileCheck --implicit-check-not "out of range" %s
 
 .globl _start
 .type _start, @function

lld/test/ELF/linkerscript/… (overlapping output sections test)

@@ -88,8 +88,8 @@
 # BROKEN-OUTPUT-FILE-NEXT: 8010 01010101 01010101 01010101 01010101
 # BROKEN-OUTPUT-FILE-NEXT: 8020 01010101 01010101 01010101 01010101
 # BROKEN-OUTPUT-FILE-NEXT: 8030 01010101 01010101 01010101 01010101
-# Starting here the contents of .sec2 overwrites .sec1:
-# BROKEN-OUTPUT-FILE-NEXT: 8040 02020202 02020202 02020202 02020202
+## Starting here the content may be from either .sec1 or .sec2, depending on the write order.
+# BROKEN-OUTPUT-FILE-NEXT: 8040
 
 # RUN: llvm-readelf --sections -l %t.so | FileCheck %s -check-prefix BAD-BOTH
 # BAD-BOTH-LABEL: Section Headers:

llvm/include/llvm/Support/Parallel.h

@@ -30,9 +30,6 @@ namespace parallel {
 extern ThreadPoolStrategy strategy;
 
 namespace detail {
-
-#if LLVM_ENABLE_THREADS
-
 class Latch {
   uint32_t Count;
   mutable std::mutex Mutex;
@@ -61,20 +58,42 @@ public:
     Cond.wait(lock, [&] { return Count == 0; });
   }
 };
+} // namespace detail
 
 class TaskGroup {
-  Latch L;
+  detail::Latch L;
   bool Parallel;
 
 public:
   TaskGroup();
   ~TaskGroup();
 
+  // Spawn a task, but does not wait for it to finish.
   void spawn(std::function<void()> f);
 
+  // Similar to spawn, but execute the task immediately when ThreadsRequested ==
+  // 1. The difference is to give the following pattern a more intuitive order
+  // when single threading is requested.
+  //
+  // for (size_t begin = 0, i = 0, taskSize = 0;;) {
+  //   taskSize += ...
+  //   bool done = ++i == end;
+  //   if (done || taskSize >= taskSizeLimit) {
+  //     tg.execute([=] { fn(begin, i); });
+  //     if (done)
+  //       break;
+  //     begin = i;
+  //     taskSize = 0;
+  //   }
+  // }
+  void execute(std::function<void()> f);
+
   void sync() const { L.sync(); }
 };
 
+namespace detail {
+
+#if LLVM_ENABLE_THREADS
+
 const ptrdiff_t MinParallelSize = 1024;
 
 /// Inclusive median.
llvm/lib/Support/Parallel.cpp

@@ -19,10 +19,9 @@
 llvm::ThreadPoolStrategy llvm::parallel::strategy;
 
-#if LLVM_ENABLE_THREADS
 namespace llvm {
 namespace parallel {
+#if LLVM_ENABLE_THREADS
 namespace detail {
 
 namespace {
@@ -143,6 +142,8 @@ Executor *Executor::getDefaultExecutor() {
   return Exec.get();
 }
 } // namespace
+} // namespace detail
+#endif
 
 static std::atomic<int> TaskGroupInstances;
@@ -159,21 +160,27 @@ TaskGroup::~TaskGroup() {
 }
 
 void TaskGroup::spawn(std::function<void()> F) {
+#if LLVM_ENABLE_THREADS
   if (Parallel) {
     L.inc();
-    Executor::getDefaultExecutor()->add([&, F = std::move(F)] {
+    detail::Executor::getDefaultExecutor()->add([&, F = std::move(F)] {
       F();
       L.dec();
     });
-  } else {
-    F();
+    return;
   }
+#endif
+  F();
 }
 
-} // namespace detail
+void TaskGroup::execute(std::function<void()> F) {
+  if (parallel::strategy.ThreadsRequested == 1)
+    F();
+  else
+    spawn(F);
+}
 } // namespace parallel
 } // namespace llvm
-#endif // LLVM_ENABLE_THREADS
 
 void llvm::parallelFor(size_t Begin, size_t End,
                        llvm::function_ref<void(size_t)> Fn) {
@@ -190,7 +197,7 @@ void llvm::parallelFor(size_t Begin, size_t End,
   if (TaskSize == 0)
     TaskSize = 1;
 
-  parallel::detail::TaskGroup TG;
+  parallel::TaskGroup TG;
   for (; Begin + TaskSize < End; Begin += TaskSize) {
     TG.spawn([=, &Fn] {
       for (size_t I = Begin, E = Begin + TaskSize; I != E; ++I)
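
TaskGroup::execute is deliberately not a drop-in for spawn: when
parallel::strategy.ThreadsRequested == 1 it runs the callback inline, keeping
task execution, and any diagnostics a task emits, in submission order. That
determinism is what the --threads=1 additions in the tests above rely on.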