[CodeGen] Emit alignment "Max Skip" operand

The current AsmPrinter has support for emitting the "Max Skip" operand
(the third operand of .p2align), but no way for that operand to actually be
specified. Adding MaxBytesForAlignment to MachineBasicBlock provides this
capability on a per-block basis. Leaving the value at its default (0) causes
no observable difference in behaviour.

Differential Revision: https://reviews.llvm.org/D114590
Author: Nicholas Guy
Date:   2021-12-01 10:51:31 +00:00
Parent: 5109737c92
Commit: 73d92faa2f
7 changed files with 170 additions and 13 deletions
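
For context, a minimal usage sketch (not part of the patch; the helper name is hypothetical): a pass or target that wants capped alignment attaches the maximum padding to the block, and the AsmPrinter forwards it as the third operand of .p2align.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/Support/Alignment.h"

using namespace llvm;

// Hypothetical helper: request a 32-byte alignment for a block, but allow at
// most 8 bytes of padding to reach it.
static void requestCappedAlignment(MachineBasicBlock &MBB) {
  MBB.setAlignment(Align(32), /*MaxBytes=*/8);
  // In the emitted AArch64 assembly this shows up as:
  //   .p2align 5, 0x0, 8
  // i.e. align to 2^5 bytes, but skip the alignment entirely if more than
  // 8 bytes of padding would be required.
}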

llvm/include/llvm/CodeGen/AsmPrinter.h

@@ -431,7 +431,8 @@ public:
   /// global value is specified, and if that global has an explicit alignment
   /// requested, it will override the alignment request if required for
   /// correctness.
-  void emitAlignment(Align Alignment, const GlobalObject *GV = nullptr) const;
+  void emitAlignment(Align Alignment, const GlobalObject *GV = nullptr,
+                     unsigned MaxBytesToEmit = 0) const;

   /// Lower the specified LLVM Constant to an MCExpr.
   virtual const MCExpr *lowerConstant(const Constant *CV);

llvm/include/llvm/CodeGen/MachineBasicBlock.h

@@ -136,6 +136,10 @@ private:
   /// Alignment of the basic block. One if the basic block does not need to be
   /// aligned.
   Align Alignment;
+  /// Maximum amount of bytes that can be added to align the basic block. If
+  /// the alignment cannot be reached in this many bytes, no bytes are emitted.
+  /// Zero to represent no maximum.
+  unsigned MaxBytesForAlignment = 0;

   /// Indicate that this basic block is entered via an exception handler.
   bool IsEHPad = false;

@@ -521,6 +525,19 @@ public:
   /// Set alignment of the basic block.
   void setAlignment(Align A) { Alignment = A; }

+  void setAlignment(Align A, unsigned MaxBytes) {
+    setAlignment(A);
+    setMaxBytesForAlignment(MaxBytes);
+  }
+
+  /// Return the maximum amount of padding allowed for aligning the basic block.
+  unsigned getMaxBytesForAlignment() const { return MaxBytesForAlignment; }
+
+  /// Set the maximum amount of padding allowed for aligning the basic block.
+  void setMaxBytesForAlignment(unsigned MaxBytes) {
+    MaxBytesForAlignment = MaxBytes;
+  }
+
   /// Returns true if the block is a landing pad. That is this basic block is
   /// entered via an exception handler.
   bool isEHPad() const { return IsEHPad; }
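
A minimal sketch of the rule the new comment describes (my reading of the max-skip semantics, not code from this patch): padding is only emitted if the block can be aligned within the given number of bytes, with zero meaning no limit.

#include <cstdint>

static uint64_t paddingToEmit(uint64_t Offset, uint64_t Alignment,
                              unsigned MaxBytes) {
  // Bytes needed to reach the next Alignment boundary from Offset.
  uint64_t Padding = (Alignment - Offset % Alignment) % Alignment;
  if (MaxBytes != 0 && Padding > MaxBytes)
    return 0; // Alignment not reachable within the cap: emit nothing.
  return Padding;
}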

llvm/include/llvm/CodeGen/TargetLowering.h

@@ -1801,11 +1801,14 @@ public:
   /// Return the preferred loop alignment.
   virtual Align getPrefLoopAlignment(MachineLoop *ML = nullptr) const;

+  /// Return the maximum amount of bytes allowed to be emitted when padding for
+  /// alignment
+  virtual unsigned
+  getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const;
+
   /// Should loops be aligned even when the function is marked OptSize (but not
   /// MinSize).
-  virtual bool alignLoopsWithOptSize() const {
-    return false;
-  }
+  virtual bool alignLoopsWithOptSize() const { return false; }

   /// If the target has a standard location for the stack protector guard,
   /// returns the address of that location. Otherwise, returns nullptr.

@@ -2340,6 +2343,9 @@ protected:
   /// means the target does not care about loop alignment. The target may also
   /// override getPrefLoopAlignment to provide per-loop values.
   void setPrefLoopAlignment(Align Alignment) { PrefLoopAlignment = Alignment; }
+  void setMaxBytesForAlignment(unsigned MaxBytes) {
+    MaxBytesForAlignment = MaxBytes;
+  }

   /// Set the minimum stack alignment of an argument.
   void setMinStackArgumentAlignment(Align Alignment) {

@@ -3029,6 +3035,8 @@ private:
   /// The preferred loop alignment (in log2 bot in bytes).
   Align PrefLoopAlignment;
+  /// The maximum amount of bytes permitted to be emitted for alignment.
+  unsigned MaxBytesForAlignment;

   /// Size in bits of the maximum atomics size the backend supports.
   /// Accesses larger than this will be expanded by AtomicExpandPass.
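
A hypothetical example (not part of this patch) of how a backend could feed these hooks: the blanket value is set through the protected setter, and getMaxPermittedBytesForAlignment can additionally be overridden for per-block decisions.

#include "llvm/CodeGen/TargetLowering.h"

using namespace llvm;

// Hypothetical target lowering: prefer 32-byte aligned loop headers, but never
// spend more than 8 bytes of padding to get there.
class MyTargetLowering : public TargetLowering {
public:
  explicit MyTargetLowering(const TargetMachine &TM) : TargetLowering(TM) {
    setPrefLoopAlignment(Align(32));
    setMaxBytesForAlignment(8);
  }
};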

llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp

@@ -2477,7 +2477,8 @@ void AsmPrinter::emitLabelPlusOffset(const MCSymbol *Label, uint64_t Offset,
 // two boundary. If a global value is specified, and if that global has
 // an explicit alignment requested, it will override the alignment request
 // if required for correctness.
-void AsmPrinter::emitAlignment(Align Alignment, const GlobalObject *GV) const {
+void AsmPrinter::emitAlignment(Align Alignment, const GlobalObject *GV,
+                               unsigned MaxBytesToEmit) const {
   if (GV)
     Alignment = getGVAlignment(GV, GV->getParent()->getDataLayout(), Alignment);

@@ -2490,9 +2491,9 @@ void AsmPrinter::emitAlignment(Align Alignment, const GlobalObject *GV) const {
       STI = &getSubtargetInfo();
     else
       STI = TM.getMCSubtargetInfo();
-    OutStreamer->emitCodeAlignment(Alignment.value(), STI);
+    OutStreamer->emitCodeAlignment(Alignment.value(), STI, MaxBytesToEmit);
   } else
-    OutStreamer->emitValueToAlignment(Alignment.value());
+    OutStreamer->emitValueToAlignment(Alignment.value(), 0, 1, MaxBytesToEmit);
 }

 //===----------------------------------------------------------------------===//

@@ -3286,7 +3287,7 @@ void AsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
   // Emit an alignment directive for this block, if needed.
   const Align Alignment = MBB.getAlignment();
   if (Alignment != Align(1))
-    emitAlignment(Alignment);
+    emitAlignment(Alignment, nullptr, MBB.getMaxBytesForAlignment());

   // Switch to a new section if this basic block must begin a section. The
   // entry block is always placed in the function section and is handled
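
A hypothetical call site (not from this patch) showing the extended AsmPrinter::emitAlignment interface directly: align to 16 bytes, but give up if more than 4 bytes of padding would be required.

#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/Support/Alignment.h"

using namespace llvm;

static void emitCappedAlignment(AsmPrinter &AP) {
  // GV stays null, so no global-derived alignment override applies.
  AP.emitAlignment(Align(16), /*GV=*/nullptr, /*MaxBytesToEmit=*/4);
}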

llvm/lib/CodeGen/MachineBlockPlacement.cpp

@@ -96,6 +96,12 @@ static cl::opt<unsigned> AlignAllNonFallThruBlocks(
              "format (e.g 4 means align on 16B boundaries)."),
     cl::init(0), cl::Hidden);

+static cl::opt<unsigned> MaxBytesForAlignmentOverride(
+    "max-bytes-for-alignment",
+    cl::desc("Forces the maximum bytes allowed to be emitted when padding for "
+             "alignment"),
+    cl::init(0), cl::Hidden);
+
 // FIXME: Find a good default for this flag and remove the flag.
 static cl::opt<unsigned> ExitBlockBias(
     "block-placement-exit-block-bias",

@@ -2929,10 +2935,21 @@ void MachineBlockPlacement::alignBlocks() {
     MachineBasicBlock *LayoutPred =
         &*std::prev(MachineFunction::iterator(ChainBB));

+    auto DetermineMaxAlignmentPadding = [&]() {
+      // Set the maximum bytes allowed to be emitted for alignment.
+      unsigned MaxBytes;
+      if (MaxBytesForAlignmentOverride.getNumOccurrences() > 0)
+        MaxBytes = MaxBytesForAlignmentOverride;
+      else
+        MaxBytes = TLI->getMaxPermittedBytesForAlignment(ChainBB);
+      ChainBB->setMaxBytesForAlignment(MaxBytes);
+    };
+
     // Force alignment if all the predecessors are jumps. We already checked
     // that the block isn't cold above.
     if (!LayoutPred->isSuccessor(ChainBB)) {
       ChainBB->setAlignment(Align);
+      DetermineMaxAlignmentPadding();
       continue;
     }

@@ -2943,8 +2960,10 @@ void MachineBlockPlacement::alignBlocks() {
     BranchProbability LayoutProb =
         MBPI->getEdgeProbability(LayoutPred, ChainBB);
     BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb;
-    if (LayoutEdgeFreq <= (Freq * ColdProb))
+    if (LayoutEdgeFreq <= (Freq * ColdProb)) {
       ChainBB->setAlignment(Align);
+      DetermineMaxAlignmentPadding();
+    }
   }
 }

@@ -3418,17 +3437,30 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
   ComputedEdges.clear();
   ChainAllocator.DestroyAll();

+  bool HasMaxBytesOverride =
+      MaxBytesForAlignmentOverride.getNumOccurrences() > 0;
+
   if (AlignAllBlock)
     // Align all of the blocks in the function to a specific alignment.
-    for (MachineBasicBlock &MBB : MF)
-      MBB.setAlignment(Align(1ULL << AlignAllBlock));
+    for (MachineBasicBlock &MBB : MF) {
+      if (HasMaxBytesOverride)
+        MBB.setAlignment(Align(1ULL << AlignAllBlock),
+                         MaxBytesForAlignmentOverride);
+      else
+        MBB.setAlignment(Align(1ULL << AlignAllBlock));
+    }
   else if (AlignAllNonFallThruBlocks) {
     // Align all of the blocks that have no fall-through predecessors to a
     // specific alignment.
     for (auto MBI = std::next(MF.begin()), MBE = MF.end(); MBI != MBE; ++MBI) {
       auto LayoutPred = std::prev(MBI);
-      if (!LayoutPred->isSuccessor(&*MBI))
-        MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks));
+      if (!LayoutPred->isSuccessor(&*MBI)) {
+        if (HasMaxBytesOverride)
+          MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks),
+                            MaxBytesForAlignmentOverride);
+        else
+          MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks));
+      }
     }
   }

   if (ViewBlockLayoutWithBFI != GVDT_None &&

llvm/lib/CodeGen/TargetLoweringBase.cpp

@@ -2040,6 +2040,11 @@ Align TargetLoweringBase::getPrefLoopAlignment(MachineLoop *ML) const {
   return PrefLoopAlignment;
 }

+unsigned TargetLoweringBase::getMaxPermittedBytesForAlignment(
+    MachineBasicBlock *MBB) const {
+  return MaxBytesForAlignment;
+}
+
 //===----------------------------------------------------------------------===//
 //  Reciprocal Estimates
 //===----------------------------------------------------------------------===//

llvm/test/CodeGen/AArch64 (new test file)

@@ -0,0 +1,93 @@
; RUN: llc -mtriple=aarch64-none-linux-gnu -max-bytes-for-alignment=8 --align-loops=32 < %s -o -| FileCheck %s --check-prefixes=CHECK,CHECK-EXPLICIT
; RUN: llc -mtriple=aarch64-none-linux-gnu --align-loops=32 < %s -o -| FileCheck %s --check-prefixes=CHECK,CHECK-IMPLICIT
; RUN: llc -mtriple=aarch64-none-linux-gnu --align-loops=32 < %s -o - --filetype=obj | llvm-objdump --arch=aarch64 -d -| FileCheck %s --check-prefixes=CHECK-OBJ,CHECK-OBJ-IMPLICIT
; RUN: llc -mtriple=aarch64-none-linux-gnu -max-bytes-for-alignment=8 --align-loops=32 < %s -o - --filetype=obj | llvm-objdump --arch=aarch64 -d -| FileCheck %s --check-prefixes=CHECK-OBJ,CHECK-OBJ-EXPLICIT
; This test checks that the correct operands to .p2align are emitted, and that the resulting object
; is padded as expected. The key interest in the CHECK-OBJ-* checks is the size of the padding region (the nops),
; and not the exact instructions on either side of it (though the last instruction of the EXPLICIT and IMPLICIT
; checks should be the same, just at different locations).
define i32 @a(i32 %x, i32* nocapture readonly %y, i32* nocapture readonly %z) {
; CHECK-LABEL: a:
; CHECK-EXPLICIT: .p2align 5, 0x0, 8
; CHECK-IMPLICIT: .p2align 5
; CHECK-NEXT: .LBB0_5: // %vector.body
; CHECK-EXPLICIT: .p2align 5, 0x0, 8
; CHECK-IMPLICIT: .p2align 5
; CHECK-NEXT: .LBB0_8: // %for.body
; CHECK-OBJ: Disassembly of section .text:
; CHECK-OBJ: 88: 2a 00 0a 8b add
; CHECK-OBJ-IMPLICIT-NEXT: 8c: 1f 20 03 d5 nop
; CHECK-OBJ-IMPLICIT-NEXT: 90: 1f 20 03 d5 nop
; CHECK-OBJ-IMPLICIT-NEXT: 94: 1f 20 03 d5 nop
; CHECK-OBJ-IMPLICIT-NEXT: 98: 1f 20 03 d5 nop
; CHECK-OBJ-IMPLICIT-NEXT: 9c: 1f 20 03 d5 nop
; CHECK-OBJ-IMPLICIT-NEXT: a0: 4b 45 40 b8 ldr
; CHECK-OBJ-EXPLICIT-NEXT: 8c: 4b 45 40 b8 ldr
entry:
%cmp10 = icmp sgt i32 %x, 0
br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%wide.trip.count = zext i32 %x to i64
%min.iters.check = icmp ult i32 %x, 8
br i1 %min.iters.check, label %for.body.preheader17, label %vector.ph
vector.ph: ; preds = %for.body.preheader
%n.vec = and i64 %wide.trip.count, 4294967288
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %10, %vector.body ]
%vec.phi13 = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %11, %vector.body ]
%0 = getelementptr inbounds i32, i32* %y, i64 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
%2 = getelementptr inbounds i32, i32* %0, i64 4
%3 = bitcast i32* %2 to <4 x i32>*
%wide.load14 = load <4 x i32>, <4 x i32>* %3, align 4
%4 = getelementptr inbounds i32, i32* %z, i64 %index
%5 = bitcast i32* %4 to <4 x i32>*
%wide.load15 = load <4 x i32>, <4 x i32>* %5, align 4
%6 = getelementptr inbounds i32, i32* %4, i64 4
%7 = bitcast i32* %6 to <4 x i32>*
%wide.load16 = load <4 x i32>, <4 x i32>* %7, align 4
%8 = add <4 x i32> %wide.load, %vec.phi
%9 = add <4 x i32> %wide.load14, %vec.phi13
%10 = add <4 x i32> %8, %wide.load15
%11 = add <4 x i32> %9, %wide.load16
%index.next = add nuw i64 %index, 8
%12 = icmp eq i64 %index.next, %n.vec
br i1 %12, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%bin.rdx = add <4 x i32> %11, %10
%13 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)
%cmp.n = icmp eq i64 %n.vec, %wide.trip.count
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader17
for.body.preheader17: ; preds = %for.body.preheader, %middle.block
%indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%b.011.ph = phi i32 [ 0, %for.body.preheader ], [ %13, %middle.block ]
br label %for.body
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%b.0.lcssa = phi i32 [ 0, %entry ], [ %13, %middle.block ], [ %add3, %for.body ]
ret i32 %b.0.lcssa
for.body: ; preds = %for.body.preheader17, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader17 ]
%b.011 = phi i32 [ %add3, %for.body ], [ %b.011.ph, %for.body.preheader17 ]
%arrayidx = getelementptr inbounds i32, i32* %y, i64 %indvars.iv
%14 = load i32, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i32, i32* %z, i64 %indvars.iv
%15 = load i32, i32* %arrayidx2, align 4
%add = add i32 %14, %b.011
%add3 = add i32 %add, %15
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
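
Worked padding arithmetic behind the CHECK-OBJ expectations above (derived only from the addresses in the checks; a sketch, not extra test content): the loop block being aligned starts right after the add at 0x88.

constexpr unsigned BlockStart = 0x8c; // first byte after the add at 0x88
constexpr unsigned Boundary = 0xa0;   // next 32-byte boundary
constexpr unsigned Padding = Boundary - BlockStart; // 20 bytes = five 4-byte nops
static_assert(Padding > 8, "exceeds -max-bytes-for-alignment=8");
// Implicit run: five nops are emitted and the ldr lands on the boundary at 0xa0.
// Explicit run: 20 > 8, so no padding is emitted and the ldr stays at 0x8c.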