[ELF] Parallelize writes of different OutputSections
We currently process one OutputSection at a time and for each OutputSection
write contained input sections in parallel. This strategy does not leverage
multi-threading well. Instead, parallelize writes of different OutputSections.

The default TaskSize for parallelFor often leads to inferior sharding. We
prepare the tasks in the caller instead.

* Move llvm::parallel::detail::TaskGroup to llvm::parallel::TaskGroup
* Add llvm::parallel::TaskGroup::execute.
* Change writeSections to declare TaskGroup and pass it to writeTo.

Speed-up with --threads=8:

* clang -DCMAKE_BUILD_TYPE=Release: 1.11x as fast
* clang -DCMAKE_BUILD_TYPE=Debug: 1.10x as fast
* chrome -DCMAKE_BUILD_TYPE=Release: 1.04x as fast
* scylladb build/release: 1.09x as fast

On M1, many benchmarks are a small fraction of a percentage faster. Mozilla
showed the largest difference, with the patch being about 1.03x as fast.

Differential Revision: https://reviews.llvm.org/D131247
Commit: 3b4d800911 (parent: e854c17b02)
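Before the diff, here is a rough sketch of the caller-side sharding pattern described above. It is an illustration only, not code from the patch: Chunk, writeChunk, and writeAll are hypothetical stand-ins for lld's InputSection, its writeTo, and OutputSection::writeTo, and the sketch assumes an LLVM tree that already contains the llvm::parallel::TaskGroup::execute added below.

#include "llvm/Support/Parallel.h"

#include <cstddef>
#include <vector>

// Hypothetical unit of work; stands in for an input section.
struct Chunk {
  size_t size; // number of bytes this chunk writes
};

// Hypothetical payload; stands in for InputSection::writeTo.
static void writeChunk(const Chunk &c) { (void)c; }

// Batch chunks into tasks of roughly taskSizeLimit bytes and submit each batch
// to the shared TaskGroup. execute() runs a batch inline when one thread is
// requested and otherwise spawns it on the default executor.
static void writeAll(const std::vector<Chunk> &chunks,
                     llvm::parallel::TaskGroup &tg) {
  const size_t taskSizeLimit = 4 << 20; // the same 4 MiB limit the patch picks
  size_t numChunks = chunks.size();
  if (numChunks == 0)
    return;
  // chunks must stay alive until tg is destroyed; the TaskGroup destructor
  // waits for all spawned tasks to finish.
  auto fn = [&chunks](size_t begin, size_t end) {
    for (size_t i = begin; i != end; ++i)
      writeChunk(chunks[i]);
  };
  for (size_t begin = 0, i = 0, taskSize = 0;;) {
    taskSize += chunks[i].size;
    bool done = ++i == numChunks;
    if (done || taskSize >= taskSizeLimit) {
      tg.execute([=] { fn(begin, i); });
      if (done)
        break;
      begin = i;
      taskSize = 0;
    }
  }
}

A caller in the spirit of Writer<ELFT>::writeSections declares a single llvm::parallel::TaskGroup in a scope and calls writeAll once per output section, so writes of different sections overlap; leaving the scope syncs the TaskGroup and guarantees all bytes have been written.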
@@ -332,7 +332,10 @@ template <class ELFT> void OutputSection::maybeCompress() {
   // Write uncompressed data to a temporary zero-initialized buffer.
   auto buf = std::make_unique<uint8_t[]>(size);
-  writeTo<ELFT>(buf.get());
+  {
+    parallel::TaskGroup tg;
+    writeTo<ELFT>(buf.get(), tg);
+  }
   // We chose 1 (Z_BEST_SPEED) as the default compression level because it is
   // the fastest. If -O2 is given, we use level 6 to compress debug info more by
   // ~15%. We found that level 7 to 9 doesn't make much difference (~1% more

@@ -386,7 +389,8 @@ static void writeInt(uint8_t *buf, uint64_t data, uint64_t size) {
     llvm_unreachable("unsupported Size argument");
 }
 
-template <class ELFT> void OutputSection::writeTo(uint8_t *buf) {
+template <class ELFT>
+void OutputSection::writeTo(uint8_t *buf, parallel::TaskGroup &tg) {
   llvm::TimeTraceScope timeScope("Write sections", name);
   if (type == SHT_NOBITS)
     return;

@@ -419,41 +423,68 @@ template <class ELFT> void OutputSection::writeTo(uint8_t *buf) {
   }
 
   // Write leading padding.
-  SmallVector<InputSection *, 0> storage;
   ArrayRef<InputSection *> sections = getInputSections(*this, storage);
   std::array<uint8_t, 4> filler = getFiller();
   bool nonZeroFiller = read32(filler.data()) != 0;
   if (nonZeroFiller)
     fill(buf, sections.empty() ? size : sections[0]->outSecOff, filler);
 
-  parallelFor(0, sections.size(), [&](size_t i) {
-    InputSection *isec = sections[i];
-    if (auto *s = dyn_cast<SyntheticSection>(isec))
-      s->writeTo(buf + isec->outSecOff);
-    else
-      isec->writeTo<ELFT>(buf + isec->outSecOff);
-
-    // Fill gaps between sections.
-    if (nonZeroFiller) {
-      uint8_t *start = buf + isec->outSecOff + isec->getSize();
-      uint8_t *end;
-      if (i + 1 == sections.size())
-        end = buf + size;
-      else
-        end = buf + sections[i + 1]->outSecOff;
-      if (isec->nopFiller) {
-        assert(target->nopInstrs);
-        nopInstrFill(start, end - start);
-      } else
-        fill(start, end - start, filler);
-    }
-  });
+  auto fn = [=](size_t begin, size_t end) {
+    size_t numSections = sections.size();
+    for (size_t i = begin; i != end; ++i) {
+      InputSection *isec = sections[i];
+      if (auto *s = dyn_cast<SyntheticSection>(isec))
+        s->writeTo(buf + isec->outSecOff);
+      else
+        isec->writeTo<ELFT>(buf + isec->outSecOff);
+
+      // Fill gaps between sections.
+      if (nonZeroFiller) {
+        uint8_t *start = buf + isec->outSecOff + isec->getSize();
+        uint8_t *end;
+        if (i + 1 == numSections)
+          end = buf + size;
+        else
+          end = buf + sections[i + 1]->outSecOff;
+        if (isec->nopFiller) {
+          assert(target->nopInstrs);
+          nopInstrFill(start, end - start);
+        } else
+          fill(start, end - start, filler);
+      }
+    }
+  };
 
-  // Linker scripts may have BYTE()-family commands with which you
-  // can write arbitrary bytes to the output. Process them if any.
+  // If there is any BYTE()-family command (rare), write the section content
+  // first then process BYTE to overwrite the filler content. The write is
+  // serial due to the limitation of llvm/Support/Parallel.h.
+  bool written = false;
+  size_t numSections = sections.size();
   for (SectionCommand *cmd : commands)
-    if (auto *data = dyn_cast<ByteCommand>(cmd))
+    if (auto *data = dyn_cast<ByteCommand>(cmd)) {
+      if (!std::exchange(written, true))
+        fn(0, numSections);
       writeInt(buf + data->offset, data->expression().getValue(), data->size);
+    }
+  if (written || !numSections)
+    return;
+
+  // There is no data command. Write content asynchronously to overlap the write
+  // time with other output sections. Note, if a linker script specifies
+  // overlapping output sections (needs --noinhibit-exec or --no-check-sections
+  // to suppress the error), the output may be non-deterministic.
+  const size_t taskSizeLimit = 4 << 20;
+  for (size_t begin = 0, i = 0, taskSize = 0;;) {
+    taskSize += sections[i]->getSize();
+    bool done = ++i == numSections;
+    if (done || taskSize >= taskSizeLimit) {
+      tg.execute([=] { fn(begin, i); });
+      if (done)
+        break;
+      begin = i;
+      taskSize = 0;
+    }
+  }
 }
 
 static void finalizeShtGroup(OutputSection *os, InputSection *section) {

@@ -673,10 +704,14 @@ template void OutputSection::writeHeaderTo<ELF32BE>(ELF32BE::Shdr *Shdr);
 template void OutputSection::writeHeaderTo<ELF64LE>(ELF64LE::Shdr *Shdr);
 template void OutputSection::writeHeaderTo<ELF64BE>(ELF64BE::Shdr *Shdr);
 
-template void OutputSection::writeTo<ELF32LE>(uint8_t *Buf);
-template void OutputSection::writeTo<ELF32BE>(uint8_t *Buf);
-template void OutputSection::writeTo<ELF64LE>(uint8_t *Buf);
-template void OutputSection::writeTo<ELF64BE>(uint8_t *Buf);
+template void OutputSection::writeTo<ELF32LE>(uint8_t *,
+                                              llvm::parallel::TaskGroup &);
+template void OutputSection::writeTo<ELF32BE>(uint8_t *,
+                                              llvm::parallel::TaskGroup &);
+template void OutputSection::writeTo<ELF64LE>(uint8_t *,
+                                              llvm::parallel::TaskGroup &);
+template void OutputSection::writeTo<ELF64BE>(uint8_t *,
+                                              llvm::parallel::TaskGroup &);
 
 template void OutputSection::maybeCompress<ELF32LE>();
 template void OutputSection::maybeCompress<ELF32BE>();

@@ -12,6 +12,7 @@
 #include "InputSection.h"
 #include "LinkerScript.h"
 #include "lld/Common/LLVM.h"
+#include "llvm/Support/Parallel.h"
 
 #include <array>

@@ -104,7 +105,8 @@ public:
   bool relro = false;
 
   void finalize();
-  template <class ELFT> void writeTo(uint8_t *buf);
+  template <class ELFT>
+  void writeTo(uint8_t *buf, llvm::parallel::TaskGroup &tg);
   // Check that the addends for dynamic relocations were written correctly.
   void checkDynRelAddends(const uint8_t *bufStart);
   template <class ELFT> void maybeCompress();

@@ -114,6 +116,8 @@ public:
   void sortCtorsDtors();
 
 private:
+  SmallVector<InputSection *, 0> storage;
+
   // Used for implementation of --compress-debug-sections option.
   CompressedData compressed;

@@ -2839,9 +2839,10 @@ template <class ELFT> void Writer<ELFT>::openFile() {
 }
 
 template <class ELFT> void Writer<ELFT>::writeSectionsBinary() {
+  parallel::TaskGroup tg;
   for (OutputSection *sec : outputSections)
     if (sec->flags & SHF_ALLOC)
-      sec->writeTo<ELFT>(Out::bufferStart + sec->offset);
+      sec->writeTo<ELFT>(Out::bufferStart + sec->offset, tg);
 }
 
 static void fillTrap(uint8_t *i, uint8_t *end) {

@@ -2884,16 +2885,21 @@ template <class ELFT> void Writer<ELFT>::writeTrapInstr() {
 template <class ELFT> void Writer<ELFT>::writeSections() {
   llvm::TimeTraceScope timeScope("Write sections");
 
-  // In -r or --emit-relocs mode, write the relocation sections first as in
-  // ELf_Rel targets we might find out that we need to modify the relocated
-  // section while doing it.
-  for (OutputSection *sec : outputSections)
-    if (sec->type == SHT_REL || sec->type == SHT_RELA)
-      sec->writeTo<ELFT>(Out::bufferStart + sec->offset);
-
-  for (OutputSection *sec : outputSections)
-    if (sec->type != SHT_REL && sec->type != SHT_RELA)
-      sec->writeTo<ELFT>(Out::bufferStart + sec->offset);
+  {
+    // In -r or --emit-relocs mode, write the relocation sections first as in
+    // ELf_Rel targets we might find out that we need to modify the relocated
+    // section while doing it.
+    parallel::TaskGroup tg;
+    for (OutputSection *sec : outputSections)
+      if (sec->type == SHT_REL || sec->type == SHT_RELA)
+        sec->writeTo<ELFT>(Out::bufferStart + sec->offset, tg);
+  }
+  {
+    parallel::TaskGroup tg;
+    for (OutputSection *sec : outputSections)
+      if (sec->type != SHT_REL && sec->type != SHT_RELA)
+        sec->writeTo<ELFT>(Out::bufferStart + sec->offset, tg);
+  }
 
   // Finally, check that all dynamic relocation addends were written correctly.
   if (config->checkDynamicRelocs && config->writeAddends) {

@@ -1,6 +1,7 @@
 // REQUIRES: arm
 // RUN: llvm-mc -g --triple=armv7a-linux-gnueabihf -arm-add-build-attributes -filetype=obj -o %t.o %s
-// RUN: ld.lld %t.o -o %t 2>&1 | FileCheck %s --check-prefix=WARN
+/// Use --threads=1 to keep emitted warnings across sections sequential.
+// RUN: ld.lld %t.o -o %t --threads=1 2>&1 | FileCheck %s --check-prefix=WARN
 // RUN: llvm-objdump --no-show-raw-insn -d %t | FileCheck %s
 
 .syntax unified

@@ -1,6 +1,7 @@
 # REQUIRES: hexagon
 # RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf %s -o %t.o
-# RUN: not ld.lld %t.o -o /dev/null 2>&1 | FileCheck --implicit-check-not "out of range" %s
+## Use --threads=1 to keep emitted warnings across sections sequential.
+# RUN: not ld.lld %t.o -o /dev/null --threads=1 2>&1 | FileCheck --implicit-check-not "out of range" %s
 
 .globl _start
 .type _start, @function

@@ -88,8 +88,8 @@
 # BROKEN-OUTPUT-FILE-NEXT: 8010 01010101 01010101 01010101 01010101
 # BROKEN-OUTPUT-FILE-NEXT: 8020 01010101 01010101 01010101 01010101
 # BROKEN-OUTPUT-FILE-NEXT: 8030 01010101 01010101 01010101 01010101
-# Starting here the contents of .sec2 overwrites .sec1:
-# BROKEN-OUTPUT-FILE-NEXT: 8040 02020202 02020202 02020202 02020202
+## Starting here the content may be from either .sec1 or .sec2, depending on the write order.
+# BROKEN-OUTPUT-FILE-NEXT: 8040
 
 # RUN: llvm-readelf --sections -l %t.so | FileCheck %s -check-prefix BAD-BOTH
 # BAD-BOTH-LABEL: Section Headers:

@@ -30,9 +30,6 @@ namespace parallel {
 extern ThreadPoolStrategy strategy;
 
 namespace detail {
-
-#if LLVM_ENABLE_THREADS
-
 class Latch {
   uint32_t Count;
   mutable std::mutex Mutex;

@@ -61,20 +58,42 @@ public:
     Cond.wait(lock, [&] { return Count == 0; });
   }
 };
+} // namespace detail
 
 class TaskGroup {
-  Latch L;
+  detail::Latch L;
   bool Parallel;
 
 public:
   TaskGroup();
   ~TaskGroup();
 
   // Spawn a task, but does not wait for it to finish.
   void spawn(std::function<void()> f);
 
+  // Similar to spawn, but execute the task immediately when ThreadsRequested ==
+  // 1. The difference is to give the following pattern a more intuitive order
+  // when single threading is requested.
+  //
+  // for (size_t begin = 0, i = 0, taskSize = 0;;) {
+  //   taskSize += ...
+  //   bool done = ++i == end;
+  //   if (done || taskSize >= taskSizeLimit) {
+  //     tg.execute([=] { fn(begin, i); });
+  //     if (done)
+  //       break;
+  //     begin = i;
+  //     taskSize = 0;
+  //   }
+  // }
+  void execute(std::function<void()> f);
+
   void sync() const { L.sync(); }
 };
 
+namespace detail {
+
+#if LLVM_ENABLE_THREADS
 const ptrdiff_t MinParallelSize = 1024;
 
 /// Inclusive median.

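A small, hypothetical usage sketch of the relocated API (TaskGroup, spawn, execute, sync, and parallel::strategy come from the header above; the toy program around them is invented, and llvm::hardware_concurrency is the existing helper from llvm/Support/Threading.h):

#include "llvm/Support/Parallel.h"
#include "llvm/Support/Threading.h"

#include <cstdio>

int main() {
  // Mimic --threads=1: with ThreadsRequested == 1, execute() runs each task
  // immediately in submission order instead of deferring to spawn().
  llvm::parallel::strategy = llvm::hardware_concurrency(1);

  llvm::parallel::TaskGroup tg;
  for (int i = 0; i < 4; ++i)
    tg.execute([i] { std::printf("task %d\n", i); });
  // When more threads are requested, execute() falls back to spawn() and the
  // tasks may interleave; either way the TaskGroup destructor syncs before
  // main returns.
}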
@@ -19,10 +19,9 @@
 llvm::ThreadPoolStrategy llvm::parallel::strategy;
 
-#if LLVM_ENABLE_THREADS
-
 namespace llvm {
 namespace parallel {
+#if LLVM_ENABLE_THREADS
 namespace detail {
 
 namespace {

@@ -143,6 +142,8 @@ Executor *Executor::getDefaultExecutor() {
   return Exec.get();
 }
 } // namespace
+} // namespace detail
+#endif
 
 static std::atomic<int> TaskGroupInstances;

@@ -159,21 +160,27 @@ TaskGroup::~TaskGroup() {
 }
 
 void TaskGroup::spawn(std::function<void()> F) {
+#if LLVM_ENABLE_THREADS
   if (Parallel) {
     L.inc();
-    Executor::getDefaultExecutor()->add([&, F = std::move(F)] {
+    detail::Executor::getDefaultExecutor()->add([&, F = std::move(F)] {
       F();
       L.dec();
     });
-  } else {
-    F();
+    return;
   }
+#endif
+  F();
 }
 
-} // namespace detail
+void TaskGroup::execute(std::function<void()> F) {
+  if (parallel::strategy.ThreadsRequested == 1)
+    F();
+  else
+    spawn(F);
+}
 } // namespace parallel
 } // namespace llvm
-#endif // LLVM_ENABLE_THREADS
 
 void llvm::parallelFor(size_t Begin, size_t End,
                        llvm::function_ref<void(size_t)> Fn) {

@@ -190,7 +197,7 @@ void llvm::parallelFor(size_t Begin, size_t End,
   if (TaskSize == 0)
     TaskSize = 1;
 
-  parallel::detail::TaskGroup TG;
+  parallel::TaskGroup TG;
   for (; Begin + TaskSize < End; Begin += TaskSize) {
     TG.spawn([=, &Fn] {
       for (size_t I = Begin, E = Begin + TaskSize; I != E; ++I)