[PowerPC/BlockPlacement] Allow target to provide a per-loop alignment preference

The existing code provided for specifying a global loop alignment preference. However, the preferred loop alignment might depend on the loop itself. For recent POWER cores, loops between 5 and 8 instructions should have 32-byte alignment (while the others are better with 16-byte alignment) so that the entire loop will fit in one i-cache line. To support this, getPrefLoopAlignment has been made virtual, and can be provided with an optional MachineLoop* so the target can inspect the loop before answering the query. The default behavior, as before, is to return the value set with setPrefLoopAlignment. MachineBlockPlacement now queries the target for each loop instead of only once per function. There should be no functional change for other targets. llvm-svn: 225117
2015-01-03 17:58:24 +00:00 · 2015-01-03 17:58:24 +00:00 · 5772566ed6
parent 409af50858
commit 5772566ed6
5 changed files with 92 additions and 13 deletions
--- a/llvm/include/llvm/Target/TargetLowering.h
+++ b/llvm/include/llvm/Target/TargetLowering.h
@ -51,6 +51,7 @@ namespace llvm {
  class MachineFunction;
  class MachineInstr;
  class MachineJumpTableInfo;
+  class MachineLoop;
  class Mangler;
  class MCContext;
  class MCExpr;
@ -929,7 +930,7 @@ public:
  }

  /// Return the preferred loop alignment.
-  unsigned getPrefLoopAlignment() const {
+  virtual unsigned getPrefLoopAlignment(MachineLoop *ML = nullptr) const {
    return PrefLoopAlignment;
  }

@ -1336,7 +1337,8 @@ protected:

  /// Set the target's preferred loop alignment. Default alignment is zero, it
  /// means the target does not care about loop alignment.  The alignment is
-  /// specified in log2(bytes).
+  /// specified in log2(bytes). The target may also override
+  /// getPrefLoopAlignment to provide per-loop values.
  void setPrefLoopAlignment(unsigned Align) {
    PrefLoopAlignment = Align;
  }
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@ -1046,9 +1046,6 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) {
  if (F.getFunction()->getAttributes().
        hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize))
    return;
-  unsigned Align = TLI->getPrefLoopAlignment();
-  if (!Align)
-    return;  // Don't care about loop alignment.
  if (FunctionChain.begin() == FunctionChain.end())
    return;  // Empty chain.

@ -1066,6 +1063,10 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) {
    if (!L)
      continue;

+    unsigned Align = TLI->getPrefLoopAlignment(L);
+    if (!Align)
+      continue;  // Don't care about loop alignment.
+
    // If the block is cold relative to the function entry don't waste space
    // aligning it.
    BlockFrequency Freq = MBFI->getBlockFreq(*BI);
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@ -24,6 +24,7 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
@ -9049,6 +9050,40 @@ void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
  }
 }

+unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
+  switch (Subtarget.getDarwinDirective()) {
+  default: break;
+  case PPC::DIR_970:
+  case PPC::DIR_PWR4:
+  case PPC::DIR_PWR5:
+  case PPC::DIR_PWR5X:
+  case PPC::DIR_PWR6:
+  case PPC::DIR_PWR6X:
+  case PPC::DIR_PWR7:
+  case PPC::DIR_PWR8: {
+    if (!ML)
+      break;
+
+    const PPCInstrInfo *TII =
+      static_cast<const PPCInstrInfo *>(getTargetMachine().getSubtargetImpl()->
+                                          getInstrInfo());
+
+    // For small loops (between 5 and 8 instructions), align to a 32-byte
+    // boundary so that the entire loop fits in one instruction-cache line.
+    uint64_t LoopSize = 0;
+    for (auto I = ML->block_begin(), IE = ML->block_end(); I != IE; ++I)
+      for (auto J = (*I)->begin(), JE = (*I)->end(); J != JE; ++J)
+        LoopSize += TII->GetInstSizeInBytes(J);
+
+    if (LoopSize > 16 && LoopSize <= 32)
+      return 5;
+
+    break;
+  }
+  }
+
+  return TargetLowering::getPrefLoopAlignment(ML);
+}

 /// getConstraintType - Given a constraint, return the type of
 /// constraint it is for this target.
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@ -449,6 +449,8 @@ namespace llvm {
                                       const SelectionDAG &DAG,
                                       unsigned Depth = 0) const override;

+    unsigned getPrefLoopAlignment(MachineLoop *ML) const override;
+
    Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
                                  bool IsStore, bool IsLoad) const override;
    Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
--- a/llvm/test/CodeGen/PowerPC/code-align.ll
+++ b/llvm/test/CodeGen/PowerPC/code-align.ll
@ -1,15 +1,15 @@
 ; RUN: llc -mcpu=ppc64 < %s | FileCheck %s -check-prefix=GENERIC
-; RUN: llc -mcpu=970 < %s | FileCheck %s -check-prefix=BASIC
+; RUN: llc -mcpu=970 < %s | FileCheck %s -check-prefix=PWR
 ; RUN: llc -mcpu=a2 < %s | FileCheck %s -check-prefix=BASIC
 ; RUN: llc -mcpu=e500mc < %s | FileCheck %s -check-prefix=BASIC
 ; RUN: llc -mcpu=e5500 < %s | FileCheck %s -check-prefix=BASIC
-; RUN: llc -mcpu=pwr4 < %s | FileCheck %s -check-prefix=BASIC
-; RUN: llc -mcpu=pwr5 < %s | FileCheck %s -check-prefix=BASIC
-; RUN: llc -mcpu=pwr5x < %s | FileCheck %s -check-prefix=BASIC
-; RUN: llc -mcpu=pwr6 < %s | FileCheck %s -check-prefix=BASIC
-; RUN: llc -mcpu=pwr6x < %s | FileCheck %s -check-prefix=BASIC
-; RUN: llc -mcpu=pwr7 < %s | FileCheck %s -check-prefix=BASIC
-; RUN: llc -mcpu=pwr8 < %s | FileCheck %s -check-prefix=BASIC
+; RUN: llc -mcpu=pwr4 < %s | FileCheck %s -check-prefix=PWR
+; RUN: llc -mcpu=pwr5 < %s | FileCheck %s -check-prefix=PWR
+; RUN: llc -mcpu=pwr5x < %s | FileCheck %s -check-prefix=PWR
+; RUN: llc -mcpu=pwr6 < %s | FileCheck %s -check-prefix=PWR
+; RUN: llc -mcpu=pwr6x < %s | FileCheck %s -check-prefix=PWR
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s -check-prefix=PWR
+; RUN: llc -mcpu=pwr8 < %s | FileCheck %s -check-prefix=PWR
 target datalayout = "E-m:e-i64:64-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"

@ -21,10 +21,13 @@ entry:

 ; GENERIC-LABEL: .globl  foo
 ; BASIC-LABEL: .globl  foo
+; PWR-LABEL: .globl  foo
 ; GENERIC: .align  2
 ; BASIC: .align  4
+; PWR: .align  4
 ; GENERIC: @foo
 ; BASIC: @foo
+; PWR: @foo
 }

 ; Function Attrs: nounwind
@ -34,12 +37,16 @@ entry:

 ; GENERIC-LABEL: @loop
 ; BASIC-LABEL: @loop
+; PWR-LABEL: @loop
 ; GENERIC: mtctr
 ; BASIC: mtctr
+; PWR: mtctr
 ; GENERIC-NOT: .align
 ; BASIC: .align  4
+; PWR: .align  4
 ; GENERIC: bdnz
 ; BASIC: bdnz
+; PWR: bdnz

 vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
@ -60,6 +67,38 @@ for.end:                                          ; preds = %vector.body
  ret void
 }

+; Function Attrs: nounwind
+define void @sloop(i32 signext %x, i32* nocapture %a) #1 {
+entry:
+  br label %for.body
+
+; GENERIC-LABEL: @sloop
+; BASIC-LABEL: @sloop
+; PWR-LABEL: @sloop
+; GENERIC: mtctr
+; BASIC: mtctr
+; PWR: mtctr
+; GENERIC-NOT: .align
+; BASIC: .align  4
+; PWR: .align  5
+; GENERIC: bdnz
+; BASIC: bdnz
+; PWR: bdnz
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %add = add nsw i32 %0, 4
+  store i32 %add, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 2048
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind }