[BOLT] Custom function alignment

Summary: A new 'compact' function aligner that takes function sizes into consideration. The approach is based on the following assumptions: -- It is not desirable to introduce a large offset when aligning short functions, as it leads to a lot of "wasted" address space. -- For longer functions, the offset can be larger than the default 32 bytes; however, using 64 bytes for the offset still worsens performance, as again a lot of address space is wasted. -- Cold parts of functions can still use the default max-32 offset. The algorithm is switched on/off by the flag 'use-compact-aligner' and is controlled by the parameters align-functions-max-bytes and align-cold-functions-max-bytes described above. In my tests the best performance is produced with '-use-compact-aligner=true -align-functions-max-bytes=48 -align-cold-functions-max-bytes=32'. (cherry picked from FBD6194092)
2017-10-27 15:05:31 -07:00 · 2017-10-27 15:05:31 -07:00 · a0c041f72a
parent dd6ecdd782
commit a0c041f72a
6 changed files with 175 additions and 19 deletions
--- a/bolt/BinaryFunction.h
+++ b/bolt/BinaryFunction.h
@ -241,7 +241,13 @@ private:
  uint64_t MaxSize{std::numeric_limits<uint64_t>::max()};

  /// Alignment requirements for the function.
-  uint64_t Alignment{2};
+  uint16_t Alignment{2};
+
+  /// Maximum number of bytes used for alignment of hot part of the function.
+  uint16_t MaxAlignmentBytes{0};
+
+  /// Maximum number of bytes used for alignment of cold part of the function.
+  uint16_t MaxColdAlignmentBytes{0};

  const MCSymbol *PersonalityFunction{nullptr};
  uint8_t PersonalityEncoding{dwarf::DW_EH_PE_sdata4 | dwarf::DW_EH_PE_pcrel};
@ -1580,15 +1586,33 @@ public:
    return *this;
  }

-  BinaryFunction &setAlignment(uint64_t Align) {
+  BinaryFunction &setAlignment(uint16_t Align) {
    Alignment = Align;
    return *this;
  }

-  uint64_t getAlignment() const {
+  uint16_t getAlignment() const {
    return Alignment;
  }

+  BinaryFunction &setMaxAlignmentBytes(uint16_t MaxAlignBytes) {
+    MaxAlignmentBytes = MaxAlignBytes; // Alignment byte cap for the hot part.
+    return *this; // Returned by reference so setter calls can be chained.
+  }
+
+  uint16_t getMaxAlignmentBytes() const {
+    return MaxAlignmentBytes; // Hot-part cap; 0 is the member's default.
+  }
+
+  BinaryFunction &setMaxColdAlignmentBytes(uint16_t MaxAlignBytes) {
+    MaxColdAlignmentBytes = MaxAlignBytes; // Alignment byte cap, cold part.
+    return *this; // Returned by reference so setter calls can be chained.
+  }
+
+  uint16_t getMaxColdAlignmentBytes() const {
+    return MaxColdAlignmentBytes; // Cold-part cap; 0 is the member's default.
+  }
+
  BinaryFunction &setImageAddress(uint64_t Address) {
    ImageAddress = Address;
    return *this;
--- a/bolt/BinaryPassManager.cpp
+++ b/bolt/BinaryPassManager.cpp
@ -10,6 +10,7 @@
 //===----------------------------------------------------------------------===//

 #include "BinaryPassManager.h"
+#include "Passes/Aligner.h"
 #include "Passes/AllocCombiner.h"
 #include "Passes/FrameOptimizer.h"
 #include "Passes/IndirectCallPromotion.h"
@ -393,6 +394,8 @@ void BinaryFunctionPassManager::runAllPasses(
    llvm::make_unique<SimplifyConditionalTailCalls>(PrintSCTC),
    opts::SimplifyConditionalTailCalls);

+  Manager.registerPass(llvm::make_unique<AlignerPass>());
+
  // This pass should always run last.*
  Manager.registerPass(llvm::make_unique<FinalizeFunctions>(PrintFinalized));

--- a/bolt/Passes/Aligner.cpp
+++ b/bolt/Passes/Aligner.cpp
@ -0,0 +1,101 @@
+//===--- Aligner.cpp ------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "Aligner.h"
+
+using namespace llvm;
+
+namespace opts {
+extern cl::OptionCategory BoltOptCategory;
+
+extern cl::opt<bool> Relocs;
+
+cl::opt<bool>
+UseCompactAligner("use-compact-aligner",
+  cl::desc("Use compact approach for aligning functions"),
+  cl::init(false),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+cl::opt<unsigned>
+AlignFunctions("align-functions",
+  cl::desc("align functions at a given value (relocation mode)"),
+  cl::init(64),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+cl::opt<unsigned>
+AlignFunctionsMaxBytes("align-functions-max-bytes",
+  cl::desc("maximum number of bytes to use to align functions"),
+  cl::init(32),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+} // end namespace opts
+
+namespace llvm {
+namespace bolt {
+
+namespace {
+
+// Align the function to the -align-functions boundary (64 by default), using
+// at most -align-functions-max-bytes for each of its hot and cold parts.
+void alignMaxBytes(BinaryFunction &Function) {
+  Function.setAlignment(opts::AlignFunctions);
+  Function.setMaxAlignmentBytes(opts::AlignFunctionsMaxBytes);
+  Function.setMaxColdAlignmentBytes(opts::AlignFunctionsMaxBytes);
+}
+
+// Align the function to the -align-functions boundary (64 by default),
+// offsetting each of its hot and cold parts by not more than the minimum over
+// -- the code size of that part, summed over its basic blocks
+// -- the number of bytes given by -align-functions-max-bytes
+void alignCompact(BinaryContext &BC, BinaryFunction &Function) {
+  size_t HotSize = 0;
+  size_t ColdSize = 0;
+  for (const auto *BB : Function.layout()) {
+    if (BB->isCold())
+      ColdSize += BC.computeCodeSize(BB->begin(), BB->end());
+    else
+      HotSize += BC.computeCodeSize(BB->begin(), BB->end());
+  }
+
+  Function.setAlignment(opts::AlignFunctions);
+  if (HotSize > 0) // With no hot code the current hot cap is left untouched.
+    Function.setMaxAlignmentBytes(
+      std::min(size_t(opts::AlignFunctionsMaxBytes), HotSize));
+
+  // Using the same option, align-functions-max-bytes, for both the cold and hot
+  // parts, as aligning cold functions typically does not affect performance.
+  if (ColdSize > 0) // With no cold code the current cold cap is left untouched.
+    Function.setMaxColdAlignmentBytes(
+      std::min(size_t(opts::AlignFunctionsMaxBytes), ColdSize));
+}
+
+} // end anonymous namespace
+
+void AlignerPass::runOnFunctions(BinaryContext &BC,
+                                 std::map<uint64_t, BinaryFunction> &BFs,
+                                 std::set<uint64_t> &LargeFunctions) {
+  if (!opts::Relocs) // The pass changes nothing outside relocation mode.
+    return;
+
+  for (auto &It : BFs) { // Every function in the map gets alignment settings.
+    auto &Function = It.second;
+    if (opts::UseCompactAligner) // -use-compact-aligner: size-aware caps.
+      alignCompact(BC, Function);
+    else // Default: the same option-driven cap for every function.
+      alignMaxBytes(Function);
+  }
+}
+
+} // end namespace bolt
+} // end namespace llvm
--- a/bolt/Passes/Aligner.h
+++ b/bolt/Passes/Aligner.h
@ -0,0 +1,38 @@
+//===--------- Passes/Aligner.h -------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_ALIGNER_H
+#define LLVM_TOOLS_LLVM_BOLT_PASSES_ALIGNER_H
+
+#include "BinaryPasses.h"
+
+namespace llvm {
+namespace bolt {
+
+class AlignerPass : public BinaryFunctionPass {
+ public:
+  explicit AlignerPass() : BinaryFunctionPass(false) {}
+
+  const char *getName() const override {
+    return "aligner";
+  }
+
+  /// Pass entry point: sets alignment parameters on the functions in \p BFs.
+  void runOnFunctions(BinaryContext &BC,
+                      std::map<uint64_t, BinaryFunction> &BFs,
+                      std::set<uint64_t> &LargeFunctions) override;
+};
+
+} // namespace bolt
+} // namespace llvm
+
+
+#endif
--- a/bolt/Passes/CMakeLists.txt
+++ b/bolt/Passes/CMakeLists.txt
@ -1,4 +1,5 @@
 add_llvm_library(LLVMBOLTPasses
+  Aligner.cpp
  AllocCombiner.cpp
  BinaryPasses.cpp
  BinaryFunctionCallGraph.cpp
--- a/bolt/RewriteInstance.cpp
+++ b/bolt/RewriteInstance.cpp
@ -90,20 +90,6 @@ OutputFilename("o",
  cl::Required,
  cl::cat(BoltOutputCategory));

-cl::opt<unsigned>
-AlignFunctions("align-functions",
-  cl::desc("align functions at a given value (relocation mode)"),
-  cl::init(64),
-  cl::ZeroOrMore,
-  cl::cat(BoltOptCategory));
-
-cl::opt<unsigned>
-AlignFunctionsMaxBytes("align-functions-max-bytes",
-  cl::desc("maximum number of bytes to use to align functions"),
-  cl::init(32),
-  cl::ZeroOrMore,
-  cl::cat(BoltOptCategory));
-
 cl::opt<bool>
 AllowStripped("allow-stripped",
  cl::desc("allow processing of stripped binaries"),
@ -2190,8 +2176,11 @@ void RewriteInstance::emitFunction(MCStreamer &Streamer, BinaryFunction &Functio

  if (opts::Relocs) {
    Streamer.EmitCodeAlignment(BinaryFunction::MinAlign);
-    Streamer.EmitCodeAlignment(opts::AlignFunctions,
-                               opts::AlignFunctionsMaxBytes);
+    auto MaxAlignBytes = EmitColdPart
+      ? Function.getMaxColdAlignmentBytes()
+      : Function.getMaxAlignmentBytes();
+    if (MaxAlignBytes > 0)
+      Streamer.EmitCodeAlignment(Function.getAlignment(), MaxAlignBytes);
  } else {
    Streamer.EmitCodeAlignment(Function.getAlignment());
    Streamer.setCodeSkew(EmitColdPart ? 0 : Function.getAddress());