[ELF] Parallelize writes of different OutputSections
We currently process one OutputSection at a time and for each OutputSection write contained input sections in parallel. This strategy does not leverage multi-threading well. Instead, parallelize writes of different OutputSections. The default TaskSize for parallelFor often leads to inferior sharding, so we prepare the tasks in the caller instead.

* Move llvm::parallel::detail::TaskGroup to llvm::parallel::TaskGroup.
* Add llvm::parallel::TaskGroup::execute.
* Change writeSections to declare TaskGroup and pass it to writeTo.

Speed-up with --threads=8:

* clang -DCMAKE_BUILD_TYPE=Release: 1.11x as fast
* clang -DCMAKE_BUILD_TYPE=Debug: 1.10x as fast
* chrome -DCMAKE_BUILD_TYPE=Release: 1.04x as fast
* scylladb build/release: 1.09x as fast

On M1, many benchmarks are a small fraction of a percent faster. Mozilla showed the largest difference, with the patch being about 1.03x as fast.

Differential Revision: https://reviews.llvm.org/D131247
parent e854c17b02
commit 3b4d800911
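For orientation before the diff: the heart of the change is a caller-side sharding loop that accumulates adjacent pieces of work until a size threshold (4 MiB in the patch) is reached and submits each batch through the new llvm::parallel::TaskGroup::execute. The sketch below illustrates that pattern in isolation; it assumes the post-patch Parallel.h API, and writeChunk, writeAll, and the byte-vector payload are hypothetical placeholders rather than lld code.

// Minimal sketch of the caller-prepared sharding pattern described above,
// assuming the post-patch llvm::parallel::TaskGroup API. The payload type,
// writeChunk() and writeAll() are hypothetical placeholders, not lld code.
#include "llvm/Support/Parallel.h"

#include <cstddef>
#include <cstdint>
#include <vector>

// Write items[begin, end) sequentially within one task.
static void writeChunk(const std::vector<std::vector<uint8_t>> &items,
                       size_t begin, size_t end) {
  for (size_t i = begin; i != end; ++i) {
    // ... copy items[i] to its place in the output buffer ...
  }
}

void writeAll(const std::vector<std::vector<uint8_t>> &items) {
  size_t numItems = items.size();
  if (numItems == 0)
    return;
  llvm::parallel::TaskGroup tg;
  // Batch adjacent items until ~4 MiB is accumulated, then submit the batch.
  const size_t taskSizeLimit = 4 << 20;
  for (size_t begin = 0, i = 0, taskSize = 0;;) {
    taskSize += items[i].size();
    bool done = ++i == numItems;
    if (done || taskSize >= taskSizeLimit) {
      // execute() runs the closure inline when only one thread is requested,
      // keeping single-threaded output ordering intuitive; otherwise it spawns.
      tg.execute([=, &items] { writeChunk(items, begin, i); });
      if (done)
        break;
      begin = i;
      taskSize = 0;
    }
  }
  // tg's destructor waits for all spawned tasks before returning.
}

In the patch itself, OutputSection::writeTo shards its input sections this way and submits the batches to a TaskGroup owned by the caller (writeSections), so different OutputSections can be written concurrently.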
lld/ELF/OutputSections.cpp:

@@ -332,7 +332,10 @@ template <class ELFT> void OutputSection::maybeCompress() {
 
   // Write uncompressed data to a temporary zero-initialized buffer.
   auto buf = std::make_unique<uint8_t[]>(size);
-  writeTo<ELFT>(buf.get());
+  {
+    parallel::TaskGroup tg;
+    writeTo<ELFT>(buf.get(), tg);
+  }
   // We chose 1 (Z_BEST_SPEED) as the default compression level because it is
   // the fastest. If -O2 is given, we use level 6 to compress debug info more by
   // ~15%. We found that level 7 to 9 doesn't make much difference (~1% more
@@ -386,7 +389,8 @@ static void writeInt(uint8_t *buf, uint64_t data, uint64_t size) {
     llvm_unreachable("unsupported Size argument");
 }
 
-template <class ELFT> void OutputSection::writeTo(uint8_t *buf) {
+template <class ELFT>
+void OutputSection::writeTo(uint8_t *buf, parallel::TaskGroup &tg) {
   llvm::TimeTraceScope timeScope("Write sections", name);
   if (type == SHT_NOBITS)
     return;
@@ -419,41 +423,68 @@ template <class ELFT> void OutputSection::writeTo(uint8_t *buf) {
   }
 
   // Write leading padding.
-  SmallVector<InputSection *, 0> storage;
   ArrayRef<InputSection *> sections = getInputSections(*this, storage);
   std::array<uint8_t, 4> filler = getFiller();
   bool nonZeroFiller = read32(filler.data()) != 0;
   if (nonZeroFiller)
     fill(buf, sections.empty() ? size : sections[0]->outSecOff, filler);
 
-  parallelFor(0, sections.size(), [&](size_t i) {
-    InputSection *isec = sections[i];
-    if (auto *s = dyn_cast<SyntheticSection>(isec))
-      s->writeTo(buf + isec->outSecOff);
-    else
-      isec->writeTo<ELFT>(buf + isec->outSecOff);
-
-    // Fill gaps between sections.
-    if (nonZeroFiller) {
-      uint8_t *start = buf + isec->outSecOff + isec->getSize();
-      uint8_t *end;
-      if (i + 1 == sections.size())
-        end = buf + size;
-      else
-        end = buf + sections[i + 1]->outSecOff;
-      if (isec->nopFiller) {
-        assert(target->nopInstrs);
-        nopInstrFill(start, end - start);
-      } else
-        fill(start, end - start, filler);
-    }
-  });
-
-  // Linker scripts may have BYTE()-family commands with which you
-  // can write arbitrary bytes to the output. Process them if any.
+  auto fn = [=](size_t begin, size_t end) {
+    size_t numSections = sections.size();
+    for (size_t i = begin; i != end; ++i) {
+      InputSection *isec = sections[i];
+      if (auto *s = dyn_cast<SyntheticSection>(isec))
+        s->writeTo(buf + isec->outSecOff);
+      else
+        isec->writeTo<ELFT>(buf + isec->outSecOff);
+
+      // Fill gaps between sections.
+      if (nonZeroFiller) {
+        uint8_t *start = buf + isec->outSecOff + isec->getSize();
+        uint8_t *end;
+        if (i + 1 == numSections)
+          end = buf + size;
+        else
+          end = buf + sections[i + 1]->outSecOff;
+        if (isec->nopFiller) {
+          assert(target->nopInstrs);
+          nopInstrFill(start, end - start);
+        } else
+          fill(start, end - start, filler);
+      }
+    }
+  };
+
+  // If there is any BYTE()-family command (rare), write the section content
+  // first then process BYTE to overwrite the filler content. The write is
+  // serial due to the limitation of llvm/Support/Parallel.h.
+  bool written = false;
+  size_t numSections = sections.size();
   for (SectionCommand *cmd : commands)
-    if (auto *data = dyn_cast<ByteCommand>(cmd))
+    if (auto *data = dyn_cast<ByteCommand>(cmd)) {
+      if (!std::exchange(written, true))
+        fn(0, numSections);
       writeInt(buf + data->offset, data->expression().getValue(), data->size);
+    }
+  if (written || !numSections)
+    return;
+
+  // There is no data command. Write content asynchronously to overlap the write
+  // time with other output sections. Note, if a linker script specifies
+  // overlapping output sections (needs --noinhibit-exec or --no-check-sections
+  // to supress the error), the output may be non-deterministic.
+  const size_t taskSizeLimit = 4 << 20;
+  for (size_t begin = 0, i = 0, taskSize = 0;;) {
+    taskSize += sections[i]->getSize();
+    bool done = ++i == numSections;
+    if (done || taskSize >= taskSizeLimit) {
+      tg.execute([=] { fn(begin, i); });
+      if (done)
+        break;
+      begin = i;
+      taskSize = 0;
+    }
+  }
 }
 
 static void finalizeShtGroup(OutputSection *os, InputSection *section) {
@@ -673,10 +704,14 @@ template void OutputSection::writeHeaderTo<ELF32BE>(ELF32BE::Shdr *Shdr);
 template void OutputSection::writeHeaderTo<ELF64LE>(ELF64LE::Shdr *Shdr);
 template void OutputSection::writeHeaderTo<ELF64BE>(ELF64BE::Shdr *Shdr);
 
-template void OutputSection::writeTo<ELF32LE>(uint8_t *Buf);
-template void OutputSection::writeTo<ELF32BE>(uint8_t *Buf);
-template void OutputSection::writeTo<ELF64LE>(uint8_t *Buf);
-template void OutputSection::writeTo<ELF64BE>(uint8_t *Buf);
+template void OutputSection::writeTo<ELF32LE>(uint8_t *,
+                                               llvm::parallel::TaskGroup &);
+template void OutputSection::writeTo<ELF32BE>(uint8_t *,
+                                               llvm::parallel::TaskGroup &);
+template void OutputSection::writeTo<ELF64LE>(uint8_t *,
+                                               llvm::parallel::TaskGroup &);
+template void OutputSection::writeTo<ELF64BE>(uint8_t *,
+                                               llvm::parallel::TaskGroup &);
 
 template void OutputSection::maybeCompress<ELF32LE>();
 template void OutputSection::maybeCompress<ELF32BE>();
lld/ELF/OutputSections.h:

@@ -12,6 +12,7 @@
 #include "InputSection.h"
 #include "LinkerScript.h"
 #include "lld/Common/LLVM.h"
+#include "llvm/Support/Parallel.h"
 
 #include <array>
 
@@ -104,7 +105,8 @@ public:
   bool relro = false;
 
   void finalize();
-  template <class ELFT> void writeTo(uint8_t *buf);
+  template <class ELFT>
+  void writeTo(uint8_t *buf, llvm::parallel::TaskGroup &tg);
   // Check that the addends for dynamic relocations were written correctly.
   void checkDynRelAddends(const uint8_t *bufStart);
   template <class ELFT> void maybeCompress();
@@ -114,6 +116,8 @@ public:
   void sortCtorsDtors();
 
 private:
+  SmallVector<InputSection *, 0> storage;
+
   // Used for implementation of --compress-debug-sections option.
   CompressedData compressed;
 
lld/ELF/Writer.cpp:

@@ -2839,9 +2839,10 @@ template <class ELFT> void Writer<ELFT>::openFile() {
 }
 
 template <class ELFT> void Writer<ELFT>::writeSectionsBinary() {
+  parallel::TaskGroup tg;
   for (OutputSection *sec : outputSections)
     if (sec->flags & SHF_ALLOC)
-      sec->writeTo<ELFT>(Out::bufferStart + sec->offset);
+      sec->writeTo<ELFT>(Out::bufferStart + sec->offset, tg);
 }
 
 static void fillTrap(uint8_t *i, uint8_t *end) {
@@ -2884,16 +2885,21 @@ template <class ELFT> void Writer<ELFT>::writeTrapInstr() {
 template <class ELFT> void Writer<ELFT>::writeSections() {
   llvm::TimeTraceScope timeScope("Write sections");
 
-  // In -r or --emit-relocs mode, write the relocation sections first as in
-  // ELf_Rel targets we might find out that we need to modify the relocated
-  // section while doing it.
-  for (OutputSection *sec : outputSections)
-    if (sec->type == SHT_REL || sec->type == SHT_RELA)
-      sec->writeTo<ELFT>(Out::bufferStart + sec->offset);
-
-  for (OutputSection *sec : outputSections)
-    if (sec->type != SHT_REL && sec->type != SHT_RELA)
-      sec->writeTo<ELFT>(Out::bufferStart + sec->offset);
+  {
+    // In -r or --emit-relocs mode, write the relocation sections first as in
+    // ELf_Rel targets we might find out that we need to modify the relocated
+    // section while doing it.
+    parallel::TaskGroup tg;
+    for (OutputSection *sec : outputSections)
+      if (sec->type == SHT_REL || sec->type == SHT_RELA)
+        sec->writeTo<ELFT>(Out::bufferStart + sec->offset, tg);
+  }
+  {
+    parallel::TaskGroup tg;
+    for (OutputSection *sec : outputSections)
+      if (sec->type != SHT_REL && sec->type != SHT_RELA)
+        sec->writeTo<ELFT>(Out::bufferStart + sec->offset, tg);
+  }
 
   // Finally, check that all dynamic relocation addends were written correctly.
   if (config->checkDynamicRelocs && config->writeAddends) {
An ARM test under lld/test/ELF:

@@ -1,6 +1,7 @@
 // REQUIRES: arm
 // RUN: llvm-mc -g --triple=armv7a-linux-gnueabihf -arm-add-build-attributes -filetype=obj -o %t.o %s
-// RUN: ld.lld %t.o -o %t 2>&1 | FileCheck %s --check-prefix=WARN
+/// Use --threads=1 to keep emitted warnings across sections sequential.
+// RUN: ld.lld %t.o -o %t --threads=1 2>&1 | FileCheck %s --check-prefix=WARN
 // RUN: llvm-objdump --no-show-raw-insn -d %t | FileCheck %s
 
 .syntax unified
A Hexagon test under lld/test/ELF:

@@ -1,6 +1,7 @@
 # REQUIRES: hexagon
 # RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf %s -o %t.o
-# RUN: not ld.lld %t.o -o /dev/null 2>&1 | FileCheck --implicit-check-not "out of range" %s
+## Use --threads=1 to keep emitted warnings across sections sequential.
+# RUN: not ld.lld %t.o -o /dev/null --threads=1 2>&1 | FileCheck --implicit-check-not "out of range" %s
 
 .globl _start
 .type _start, @function
A linker-script test under lld/test/ELF with intentionally overlapping output sections:

@@ -88,8 +88,8 @@
 # BROKEN-OUTPUT-FILE-NEXT: 8010 01010101 01010101 01010101 01010101
 # BROKEN-OUTPUT-FILE-NEXT: 8020 01010101 01010101 01010101 01010101
 # BROKEN-OUTPUT-FILE-NEXT: 8030 01010101 01010101 01010101 01010101
-# Starting here the contents of .sec2 overwrites .sec1:
-# BROKEN-OUTPUT-FILE-NEXT: 8040 02020202 02020202 02020202 02020202
+## Starting here the content may be from either .sec1 or .sec2, depending on the write order.
+# BROKEN-OUTPUT-FILE-NEXT: 8040
 
 # RUN: llvm-readelf --sections -l %t.so | FileCheck %s -check-prefix BAD-BOTH
 # BAD-BOTH-LABEL: Section Headers:
llvm/include/llvm/Support/Parallel.h:

@@ -30,9 +30,6 @@ namespace parallel {
 extern ThreadPoolStrategy strategy;
 
 namespace detail {
-
-#if LLVM_ENABLE_THREADS
-
 class Latch {
   uint32_t Count;
   mutable std::mutex Mutex;
@@ -61,20 +58,42 @@ public:
     Cond.wait(lock, [&] { return Count == 0; });
   }
 };
+} // namespace detail
 
 class TaskGroup {
-  Latch L;
+  detail::Latch L;
   bool Parallel;
 
 public:
   TaskGroup();
   ~TaskGroup();
 
+  // Spawn a task, but does not wait for it to finish.
   void spawn(std::function<void()> f);
 
+  // Similar to spawn, but execute the task immediately when ThreadsRequested ==
+  // 1. The difference is to give the following pattern a more intuitive order
+  // when single threading is requested.
+  //
+  // for (size_t begin = 0, i = 0, taskSize = 0;;) {
+  //   taskSize += ...
+  //   bool done = ++i == end;
+  //   if (done || taskSize >= taskSizeLimit) {
+  //     tg.execute([=] { fn(begin, i); });
+  //     if (done)
+  //       break;
+  //     begin = i;
+  //     taskSize = 0;
+  //   }
+  // }
+  void execute(std::function<void()> f);
+
   void sync() const { L.sync(); }
 };
 
+namespace detail {
+
+#if LLVM_ENABLE_THREADS
 const ptrdiff_t MinParallelSize = 1024;
 
 /// Inclusive median.
llvm/lib/Support/Parallel.cpp:

@@ -19,10 +19,9 @@
 
 llvm::ThreadPoolStrategy llvm::parallel::strategy;
 
-#if LLVM_ENABLE_THREADS
-
 namespace llvm {
 namespace parallel {
+#if LLVM_ENABLE_THREADS
 namespace detail {
 
 namespace {
@@ -143,6 +142,8 @@ Executor *Executor::getDefaultExecutor() {
   return Exec.get();
 }
 } // namespace
+} // namespace detail
+#endif
 
 static std::atomic<int> TaskGroupInstances;
 
@@ -159,21 +160,27 @@ TaskGroup::~TaskGroup() {
 }
 
 void TaskGroup::spawn(std::function<void()> F) {
+#if LLVM_ENABLE_THREADS
   if (Parallel) {
     L.inc();
-    Executor::getDefaultExecutor()->add([&, F = std::move(F)] {
+    detail::Executor::getDefaultExecutor()->add([&, F = std::move(F)] {
       F();
       L.dec();
     });
-  } else {
-    F();
+    return;
   }
+#endif
+  F();
 }
 
-} // namespace detail
+void TaskGroup::execute(std::function<void()> F) {
+  if (parallel::strategy.ThreadsRequested == 1)
+    F();
+  else
+    spawn(F);
+}
 } // namespace parallel
 } // namespace llvm
-#endif // LLVM_ENABLE_THREADS
 
 void llvm::parallelFor(size_t Begin, size_t End,
                        llvm::function_ref<void(size_t)> Fn) {
@@ -190,7 +197,7 @@ void llvm::parallelFor(size_t Begin, size_t End,
   if (TaskSize == 0)
     TaskSize = 1;
 
-  parallel::detail::TaskGroup TG;
+  parallel::TaskGroup TG;
   for (; Begin + TaskSize < End; Begin += TaskSize) {
     TG.spawn([=, &Fn] {
       for (size_t I = Begin, E = Begin + TaskSize; I != E; ++I)