[X86] Reduce the number of emitted fragments due to branch align

Summary: Currently, a BoundaryAlign fragment may be inserted after the branch that needs to be aligned to truncate the current fragment, this fragment is unused at most of time. To avoid that, we can insert a new empty Data fragment instead. Non-relaxable instruction is usually emitted into Data fragment, so the inserted empty Data fragment will be reused at a high possibility. Reviewers: annita.zhang, reames, MaskRay, craig.topper, LuoYuanke, jyknight Reviewed By: reames, LuoYuanke Subscribers: llvm-commits, dexonsmith, hiraditya Tags: #llvm Differential Revision: https://reviews.llvm.org/D75438
2020-03-03 16:54:23 +08:00 · 2020-03-03 16:54:23 +08:00 · 3a503ce663
parent 629cf3c1c5
commit 3a503ce663
6 changed files with 71 additions and 110 deletions
--- a/llvm/include/llvm/MC/MCFragment.h
+++ b/llvm/include/llvm/MC/MCFragment.h
@ -523,20 +523,16 @@ public:
 class MCBoundaryAlignFragment : public MCFragment {
  /// The alignment requirement of the branch to be aligned.
  Align AlignBoundary;
-  /// Flag to indicate whether the branch is fused.  Use in determining the
-  /// region of fragments being aligned.
-  bool Fused : 1;
-  /// Flag to indicate whether NOPs should be emitted.
-  bool EmitNops : 1;
+  /// The last fragment in the set of fragments to be aligned.
+  const MCFragment *LastFragment = nullptr;
  /// The size of the fragment.  The size is lazily set during relaxation, and
  /// is not meaningful before that.
  uint64_t Size = 0;

 public:
-  MCBoundaryAlignFragment(Align AlignBoundary = Align(1), bool Fused = false,
-                          bool EmitNops = false, MCSection *Sec = nullptr)
-      : MCFragment(FT_BoundaryAlign, false, Sec), AlignBoundary(AlignBoundary),
-        Fused(Fused), EmitNops(EmitNops) {}
+  MCBoundaryAlignFragment(Align AlignBoundary, MCSection *Sec = nullptr)
+      : MCFragment(FT_BoundaryAlign, false, Sec), AlignBoundary(AlignBoundary) {
+  }

  uint64_t getSize() const { return Size; }
  void setSize(uint64_t Value) { Size = Value; }
@ -544,11 +540,11 @@ public:
  Align getAlignment() const { return AlignBoundary; }
  void setAlignment(Align Value) { AlignBoundary = Value; }

-  bool isFused() const { return Fused; }
-  void setFused(bool Value) { Fused = Value; }
-
-  bool canEmitNops() const { return EmitNops; }
-  void setEmitNops(bool Value) { EmitNops = Value; }
+  const MCFragment *getLastFragment() const { return LastFragment; }
+  void setLastFragment(const MCFragment *F) {
+    assert(!F || getParent() == F->getParent());
+    LastFragment = F;
+  }

  static bool classof(const MCFragment *F) {
    return F->getKind() == MCFragment::FT_BoundaryAlign;
--- a/llvm/lib/MC/MCAssembler.cpp
+++ b/llvm/lib/MC/MCAssembler.cpp
@ -996,27 +996,22 @@ static bool needPadding(uint64_t StartAddr, uint64_t Size,

 bool MCAssembler::relaxBoundaryAlign(MCAsmLayout &Layout,
                                     MCBoundaryAlignFragment &BF) {
-  // The MCBoundaryAlignFragment that doesn't emit NOP should not be relaxed.
-  if (!BF.canEmitNops())
+  // BoundaryAlignFragment that doesn't need to align any fragment should not be
+  // relaxed.
+  if (!BF.getLastFragment())
    return false;

-  uint64_t AlignedOffset = Layout.getFragmentOffset(BF.getNextNode());
+  uint64_t AlignedOffset = Layout.getFragmentOffset(&BF);
  uint64_t AlignedSize = 0;
-  const MCFragment *F = BF.getNextNode();
-  // If the branch is unfused, it is emitted into one fragment, otherwise it is
-  // emitted into two fragments at most, the next MCBoundaryAlignFragment(if
-  // exists) also marks the end of the branch.
-  for (auto i = 0, N = BF.isFused() ? 2 : 1;
-       i != N && !isa<MCBoundaryAlignFragment>(F); ++i, F = F->getNextNode()) {
+  for (const MCFragment *F = BF.getLastFragment(); F != &BF;
+       F = F->getPrevNode())
    AlignedSize += computeFragmentSize(Layout, *F);
-  }
-  uint64_t OldSize = BF.getSize();
-  AlignedOffset -= OldSize;
+
  Align BoundaryAlignment = BF.getAlignment();
  uint64_t NewSize = needPadding(AlignedOffset, AlignedSize, BoundaryAlignment)
                         ? offsetToAlignment(AlignedOffset, BoundaryAlignment)
                         : 0U;
-  if (NewSize == OldSize)
+  if (NewSize == BF.getSize())
    return false;
  BF.setSize(NewSize);
  Layout.invalidateFragmentsFrom(&BF);
--- a/llvm/lib/MC/MCFragment.cpp
+++ b/llvm/lib/MC/MCFragment.cpp
@ -424,14 +424,9 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
  }
  case MCFragment::FT_BoundaryAlign: {
    const auto *BF = cast<MCBoundaryAlignFragment>(this);
-    if (BF->canEmitNops())
-      OS << " (can emit nops to align";
-    if (BF->isFused())
-      OS << " fused branch)";
-    else
-      OS << " unfused branch)";
    OS << "\n       ";
    OS << " BoundarySize:" << BF->getAlignment().value()
+       << " LastFragment:" << BF->getLastFragment()
       << " Size:" << BF->getSize();
    break;
  }
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@ -215,15 +215,6 @@ MCObjectStreamer::getOrCreateDataFragment(const MCSubtargetInfo *STI) {
  return F;
 }

-MCBoundaryAlignFragment *MCObjectStreamer::getOrCreateBoundaryAlignFragment() {
-  auto *F = dyn_cast_or_null<MCBoundaryAlignFragment>(getCurrentFragment());
-  if (!F || F->canEmitNops()) {
-    F = new MCBoundaryAlignFragment();
-    insert(F);
-  }
-  return F;
-}
-
 void MCObjectStreamer::visitUsedSymbol(const MCSymbol &Sym) {
  Assembler->registerSymbol(Sym);
 }
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@ -133,6 +133,7 @@ class X86AsmBackend : public MCAsmBackend {
  bool needAlign(MCObjectStreamer &OS) const;
  bool needAlignInst(const MCInst &Inst) const;
  MCInst PrevInst;
+  MCBoundaryAlignFragment *PendingBoundaryAlign = nullptr;

 public:
  X86AsmBackend(const Target &T, const MCSubtargetInfo &STI)
@ -493,27 +494,30 @@ bool X86AsmBackend::needAlignInst(const MCInst &Inst) const {
          (AlignBranchType & X86::AlignBranchIndirect));
 }

-/// Insert MCBoundaryAlignFragment before instructions to align branches.
+/// Insert BoundaryAlignFragment before instructions to align branches.
 void X86AsmBackend::alignBranchesBegin(MCObjectStreamer &OS,
                                       const MCInst &Inst) {
  if (!needAlign(OS))
    return;

-  MCFragment *CF = OS.getCurrentFragment();
-  bool NeedAlignFused = AlignBranchType & X86::AlignBranchFused;
-  if (hasInterruptDelaySlot(PrevInst)) {
+  if (hasInterruptDelaySlot(PrevInst))
    // If this instruction follows an interrupt enabling instruction with a one
    // instruction delay, inserting a nop would change behavior.
-  } else if (NeedAlignFused && isMacroFused(PrevInst, Inst) && CF) {
+    return;
+
+  if (!isMacroFused(PrevInst, Inst))
+    // Macro fusion doesn't happen indeed, clear the pending.
+    PendingBoundaryAlign = nullptr;
+
+  if (PendingBoundaryAlign &&
+      OS.getCurrentFragment()->getPrevNode() == PendingBoundaryAlign) {
    // Macro fusion actually happens and there is no other fragment inserted
-    // after the previous instruction. NOP can be emitted in PF to align fused
-    // jcc.
-    if (auto *PF =
-            dyn_cast_or_null<MCBoundaryAlignFragment>(CF->getPrevNode())) {
-      const_cast<MCBoundaryAlignFragment *>(PF)->setEmitNops(true);
-      const_cast<MCBoundaryAlignFragment *>(PF)->setFused(true);
-    }
-  } else if (needAlignInst(Inst)) {
+    // after the previous instruction.
+    //
+    // Do nothing here since we already inserted a BoudaryAlign fragment when
+    // we met the first instruction in the fused pair and we'll tie them
+    // together in alignBranchesEnd.
+    //
    // Note: When there is at least one fragment, such as MCAlignFragment,
    // inserted after the previous instruction, e.g.
    //
@ -525,36 +529,38 @@ void X86AsmBackend::alignBranchesBegin(MCObjectStreamer &OS,
    //
    // We will treat the JCC as a unfused branch although it may be fused
    // with the CMP.
-    auto *F = OS.getOrCreateBoundaryAlignFragment();
-    F->setAlignment(AlignBoundary);
-    F->setEmitNops(true);
-    F->setFused(false);
-  } else if (NeedAlignFused && isFirstMacroFusibleInst(Inst, *MCII)) {
-    // We don't know if macro fusion happens until the reaching the next
-    // instruction, so a place holder is put here if necessary.
-    auto *F = OS.getOrCreateBoundaryAlignFragment();
-    F->setAlignment(AlignBoundary);
+    return;
  }

-  PrevInst = Inst;
+  if (needAlignInst(Inst) || ((AlignBranchType & X86::AlignBranchFused) &&
+                              isFirstMacroFusibleInst(Inst, *MCII))) {
+    // If we meet a unfused branch or the first instuction in a fusiable pair,
+    // insert a BoundaryAlign fragment.
+    OS.insert(PendingBoundaryAlign =
+                  new MCBoundaryAlignFragment(AlignBoundary));
+  }
 }

-/// Insert a MCBoundaryAlignFragment to mark the end of the branch to be aligned
-/// if necessary.
+/// Set the last fragment to be aligned for the BoundaryAlignFragment.
 void X86AsmBackend::alignBranchesEnd(MCObjectStreamer &OS, const MCInst &Inst) {
  if (!needAlign(OS))
    return;
-  // If the branch is emitted into a MCRelaxableFragment, we can determine the
-  // size of the branch easily in MCAssembler::relaxBoundaryAlign. When the
-  // branch is fused, the fused branch(macro fusion pair) must be emitted into
-  // two fragments. Or when the branch is unfused, the branch must be emitted
-  // into one fragment. The MCRelaxableFragment naturally marks the end of the
-  // fused or unfused branch.
-  // Otherwise, we need to insert a MCBoundaryAlignFragment to mark the end of
-  // the branch. This MCBoundaryAlignFragment may be reused to emit NOP to align
-  // other branch.
-  if (needAlignInst(Inst) && !isa<MCRelaxableFragment>(OS.getCurrentFragment()))
-    OS.insert(new MCBoundaryAlignFragment(AlignBoundary));
+
+  PrevInst = Inst;
+
+  if (!needAlignInst(Inst) || !PendingBoundaryAlign)
+    return;
+
+  // Tie the aligned instructions into a a pending BoundaryAlign.
+  PendingBoundaryAlign->setLastFragment(OS.getCurrentFragment());
+  PendingBoundaryAlign = nullptr;
+
+  // We need to ensure that further data isn't added to the current
+  // DataFragment, so that we can get the size of instructions later in
+  // MCAssembler::relaxBoundaryAlign. The easiest way is to insert a new empty
+  // DataFragment.
+  if (isa_and_nonnull<MCDataFragment>(OS.getCurrentFragment()))
+    OS.insert(new MCDataFragment());

  // Update the maximum alignment on the current section if necessary.
  MCSection *Sec = OS.getCurrentSectionOnly();
@ -862,16 +868,12 @@ void X86AsmBackend::finishLayout(MCAssembler const &Asm,
      // If we're looking at a boundary align, make sure we don't try to pad
      // its target instructions for some following directive.  Doing so would
      // break the alignment of the current boundary align.
-      if (F.getKind() == MCFragment::FT_BoundaryAlign) {
-        auto &BF = cast<MCBoundaryAlignFragment>(F);
-        const MCFragment *F = BF.getNextNode();
-        // If the branch is unfused, it is emitted into one fragment, otherwise
-        // it is emitted into two fragments at most, the next
-        // MCBoundaryAlignFragment(if exists) also marks the end of the branch.
-        for (int i = 0, N = BF.isFused() ? 2 : 1;
-             i != N && !isa<MCBoundaryAlignFragment>(F);
-             ++i, F = F->getNextNode(), I++) {
-        }
+      if (auto *BF = dyn_cast<MCBoundaryAlignFragment>(&F)) {
+        const MCFragment *LastFragment = BF->getLastFragment();
+        if (!LastFragment)
+          continue;
+        while (&*I != LastFragment)
+          ++I;
      }
    }
  }
--- a/llvm/test/MC/X86/align-branch-64-negative.s
+++ b/llvm/test/MC/X86/align-branch-64-negative.s
@ -24,31 +24,13 @@ labeled_call_test1:
 label_before:
  callq bar

-  # In the second test, we have a label which is expected to be bound to the
-  # end of the call.  For instance, we want the to associate some data with
-  # the return address of the call.
-  # CHECK-LABEL: <labeled_call_test2>:
-  # CHECK: 5a: callq
-  # CHECK: 5f: nop
-  # CHECK: 60 <label_after>:
-  # CHECK: 60: jmp
-  .globl  labeled_call_test2
-  .p2align  5
-labeled_call_test2:
-  .rept 26
-  int3
-  .endr
-  callq bar
-label_after:
-  jmp bar
-
-  # Our third test is like the first w/a labeled fault, but specifically to
+  # Our second test is like the first w/a labeled fault, but specifically to
  # a fused memory comparison.  This is the form produced by implicit null
  # checks for instance.
  # CHECK-LABEL: <implicit_null_check>:
-  # CHECK: 9f <fault_addr>:
-  # CHECK: 9f: nop
-  # CHECK: a0: cmpq
+  # CHECK: 5f <fault_addr>:
+  # CHECK: 5f: nop
+  # CHECK: 60: cmpq
  .globl  implicit_null_check
  .p2align  5
 implicit_null_check: