Re-land [ThinLTO] Re-order modules for optimal multi-threaded processing

This reverts 9b5b305023 and fixes the unwanted re-ordering when generating ThinLTO indexes.

The goal of this patch is to better balance thread utilization during ThinLTO in-process linking (in llvm-lto2 or in LLD). Before this patch, large modules would often be scheduled late in the link and take a long time to complete, leaving most of the thread pool idle towards the end of the ThinLTO phase.

We now sort modules in descending order of bitcode size, so that larger modules are processed first. The smaller modules that remain then have a better chance of keeping the thread pool busy, avoiding idle threads as the bitcode compilation nears completion.
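Concretely, the new ordering amounts to sorting module indices by descending bitcode buffer size. A minimal sketch is below; the helper name is illustrative, and the real implementation is the lto::generateModulesOrdering helper added in this patch:

#include <numeric>
#include <vector>
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Bitcode/BitcodeReader.h"

// Return indices into Mods, largest bitcode buffer first, so that the most
// expensive modules are handed to the thread pool as early as possible.
static std::vector<int> orderBySize(llvm::ArrayRef<llvm::BitcodeModule *> Mods) {
  std::vector<int> Order(Mods.size());
  std::iota(Order.begin(), Order.end(), 0);
  llvm::sort(Order, [&](int L, int R) {
    return Mods[L]->getBuffer().size() > Mods[R]->getBuffer().size();
  });
  return Order;
}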

In our case (on dual Intel Xeon Gold 6140, Windows 10 version 2004, two-stage build), this saves 15 sec when linking `clang.exe` with LLD & -flto=thin, /opt:lldltojobs=all, no ThinLTO cache, -DLLVM_INTEGRATED_CRT_ALLOC=d:\git\rpmalloc.

Before patch: 100 sec
After patch: 85 sec

Inspired by the work done by David Callahan in D60495.

Differential Revision: https://reviews.llvm.org/D87966
Author: Alexandre Ganea
Date:   2020-10-13 21:54:00 -04:00
Parent: 8f8b9f2cca
Commit: 617d64f6c5

4 changed files with 89 additions and 23 deletions

lld/test/COFF/thinlto-module-order.ll (new file)

@@ -0,0 +1,26 @@
; REQUIRES: x86
; RUN: opt -thinlto-bc %s -o %t1.obj
; RUN: opt -thinlto-bc %p/Inputs/thinlto.ll -o %t2.obj
; Ensure the module re-ordering done in LTO::runThinLTO does not affect the
; order of entries in the index file emitted by -thinlto-index-only.
; RUN: lld-link -thinlto-index-only:%t3 /entry:main %t1.obj %t2.obj
; RUN: cat %t3 | FileCheck %s --check-prefix=NORMAL
; NORMAL: thinlto-module-order.ll.tmp1.o
; NORMAL: thinlto-module-order.ll.tmp2.o
; RUN: lld-link -thinlto-index-only:%t3 /entry:main %t2.obj %t1.obj
; RUN: cat %t3 | FileCheck %s --check-prefix=REVERSED
; REVERSED: thinlto-module-order.ll.tmp2.o
; REVERSED: thinlto-module-order.ll.tmp1.o
target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-windows-msvc19.0.24215"
declare void @g(...)
define void @main() {
call void (...) @g()
ret void
}

llvm/include/llvm/LTO/LTO.h

@@ -91,6 +91,10 @@ setupLLVMOptimizationRemarks(LLVMContext &Context, StringRef RemarksFilename,
Expected<std::unique_ptr<ToolOutputFile>>
setupStatsFile(StringRef StatsFilename);
/// Produces a container ordering for optimal multi-threaded processing. Returns
/// ordered indices to elements in the input array.
std::vector<int> generateModulesOrdering(ArrayRef<BitcodeModule *> R);
class LTO;
struct SymbolResolution;
class ThinBackendProc;

llvm/lib/LTO/LTO.cpp

@@ -1107,6 +1107,7 @@ public:
const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
MapVector<StringRef, BitcodeModule> &ModuleMap) = 0;
virtual Error wait() = 0;
virtual unsigned getThreadCount() = 0;
};
namespace {
@@ -1221,6 +1222,10 @@ public:
else
return Error::success();
}
unsigned getThreadCount() override {
return BackendThreadPool.getThreadCount();
}
};
} // end anonymous namespace
@@ -1309,6 +1314,10 @@ public:
}
Error wait() override { return Error::success(); }
// WriteIndexesThinBackend should always return 1 to prevent module
// re-ordering and avoid non-determinism in the final link.
unsigned getThreadCount() override { return 1; }
};
} // end anonymous namespace
@@ -1443,17 +1452,37 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
auto &ModuleMap =
ThinLTO.ModulesToCompile ? *ThinLTO.ModulesToCompile : ThinLTO.ModuleMap;
// Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for combined
// module and parallel code generation partitions.
unsigned Task = RegularLTO.ParallelCodeGenParallelismLevel;
for (auto &Mod : ModuleMap) {
if (Error E = BackendProc->start(Task, Mod.second, ImportLists[Mod.first],
ExportLists[Mod.first],
ResolvedODR[Mod.first], ThinLTO.ModuleMap))
return E;
++Task;
}
auto ProcessOneModule = [&](int I) -> Error {
auto &Mod = *(ModuleMap.begin() + I);
// Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for
// combined module and parallel code generation partitions.
return BackendProc->start(RegularLTO.ParallelCodeGenParallelismLevel + I,
Mod.second, ImportLists[Mod.first],
ExportLists[Mod.first], ResolvedODR[Mod.first],
ThinLTO.ModuleMap);
};
if (BackendProc->getThreadCount() == 1) {
// Process the modules in the order they were provided on the command-line.
// It is important for this codepath to be used for WriteIndexesThinBackend,
// to ensure the emitted LinkedObjectsFile lists ThinLTO objects in the same
// order as the inputs, which otherwise would affect the final link order.
for (int I = 0, E = ModuleMap.size(); I != E; ++I)
if (Error E = ProcessOneModule(I))
return E;
} else {
// When executing in parallel, process the largest bitcode modules first to
// improve parallelism and avoid starving the thread pool near the end.
// This saves about 15 sec on a 36-core machine when linking `clang.exe`
// (out of 100 sec).
std::vector<BitcodeModule *> ModulesVec;
ModulesVec.reserve(ModuleMap.size());
for (auto &Mod : ModuleMap)
ModulesVec.push_back(&Mod.second);
for (int I : generateModulesOrdering(ModulesVec))
if (Error E = ProcessOneModule(I))
return E;
}
return BackendProc->wait();
}
@@ -1495,3 +1524,18 @@ lto::setupStatsFile(StringRef StatsFilename) {
StatsFile->keep();
return std::move(StatsFile);
}
// Compute the ordering in which we will process the inputs: the rough
// heuristic here is to sort them by size so that the largest modules get
// scheduled as soon as possible. This is purely a compile-time optimization.
std::vector<int> lto::generateModulesOrdering(ArrayRef<BitcodeModule *> R) {
std::vector<int> ModulesOrdering;
ModulesOrdering.resize(R.size());
std::iota(ModulesOrdering.begin(), ModulesOrdering.end(), 0);
llvm::sort(ModulesOrdering, [&](int LeftIndex, int RightIndex) {
auto LSize = R[LeftIndex]->getBuffer().size();
auto RSize = R[RightIndex]->getBuffer().size();
return LSize > RSize;
});
return ModulesOrdering;
}

llvm/lib/LTO/ThinLTOCodeGenerator.cpp

@@ -1054,19 +1054,11 @@ void ThinLTOCodeGenerator::run() {
ModuleToDefinedGVSummaries[ModuleIdentifier];
}
// Compute the ordering in which we will process the inputs: the rough
// heuristic here is to sort them by size so that the largest modules get
// scheduled as soon as possible. This is purely a compile-time optimization.
std::vector<int> ModulesOrdering;
ModulesOrdering.resize(Modules.size());
std::iota(ModulesOrdering.begin(), ModulesOrdering.end(), 0);
llvm::sort(ModulesOrdering, [&](int LeftIndex, int RightIndex) {
auto LSize =
Modules[LeftIndex]->getSingleBitcodeModule().getBuffer().size();
auto RSize =
Modules[RightIndex]->getSingleBitcodeModule().getBuffer().size();
return LSize > RSize;
});
std::vector<BitcodeModule *> ModulesVec;
ModulesVec.reserve(Modules.size());
for (auto &Mod : Modules)
ModulesVec.push_back(&Mod->getSingleBitcodeModule());
std::vector<int> ModulesOrdering = lto::generateModulesOrdering(ModulesVec);
// Parallel optimizer + codegen
{