diff --git a/llvm/include/llvm/Target/TargetLowering.h b/llvm/include/llvm/Target/TargetLowering.h index 9a04d3c6c679..7a62d5d19e33 100644 --- a/llvm/include/llvm/Target/TargetLowering.h +++ b/llvm/include/llvm/Target/TargetLowering.h @@ -51,6 +51,7 @@ namespace llvm { class MachineFunction; class MachineInstr; class MachineJumpTableInfo; + class MachineLoop; class Mangler; class MCContext; class MCExpr; @@ -929,7 +930,7 @@ public: } /// Return the preferred loop alignment. - unsigned getPrefLoopAlignment() const { + virtual unsigned getPrefLoopAlignment(MachineLoop *ML = nullptr) const { return PrefLoopAlignment; } @@ -1336,7 +1337,8 @@ protected: /// Set the target's preferred loop alignment. Default alignment is zero, it /// means the target does not care about loop alignment. The alignment is - /// specified in log2(bytes). + /// specified in log2(bytes). The target may also override + /// getPrefLoopAlignment to provide per-loop values. void setPrefLoopAlignment(unsigned Align) { PrefLoopAlignment = Align; } diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp index 08fd20036f04..aaa7d9156976 100644 --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -1046,9 +1046,6 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { if (F.getFunction()->getAttributes(). hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize)) return; - unsigned Align = TLI->getPrefLoopAlignment(); - if (!Align) - return; // Don't care about loop alignment. if (FunctionChain.begin() == FunctionChain.end()) return; // Empty chain. @@ -1066,6 +1063,10 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { if (!L) continue; + unsigned Align = TLI->getPrefLoopAlignment(L); + if (!Align) + continue; // Don't care about loop alignment. + // If the block is cold relative to the function entry don't waste space // aligning it. 
BlockFrequency Freq = MBFI->getBlockFreq(*BI); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 8d8c32264dbb..203a610a6bc6 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" @@ -9049,6 +9050,40 @@ void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, } } +unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { + switch (Subtarget.getDarwinDirective()) { + default: break; + case PPC::DIR_970: + case PPC::DIR_PWR4: + case PPC::DIR_PWR5: + case PPC::DIR_PWR5X: + case PPC::DIR_PWR6: + case PPC::DIR_PWR6X: + case PPC::DIR_PWR7: + case PPC::DIR_PWR8: { + if (!ML) + break; + + const PPCInstrInfo *TII = + static_cast<const PPCInstrInfo *>(getTargetMachine().getSubtargetImpl()-> + getInstrInfo()); + + // For small loops (between 5 and 8 instructions), align to a 32-byte + // boundary so that the entire loop fits in one instruction-cache line. + uint64_t LoopSize = 0; + for (auto I = ML->block_begin(), IE = ML->block_end(); I != IE; ++I) + for (auto J = (*I)->begin(), JE = (*I)->end(); J != JE; ++J) + LoopSize += TII->GetInstSizeInBytes(J); + + if (LoopSize > 16 && LoopSize <= 32) + return 5; + + break; + } + } + + return TargetLowering::getPrefLoopAlignment(ML); +} /// getConstraintType - Given a constraint, return the type of /// constraint it is for this target. 
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index b4b11d846e37..d9142c7e7d03 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -449,6 +449,8 @@ namespace llvm { const SelectionDAG &DAG, unsigned Depth = 0) const override; + unsigned getPrefLoopAlignment(MachineLoop *ML) const override; + Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord, bool IsStore, bool IsLoad) const override; Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord, diff --git a/llvm/test/CodeGen/PowerPC/code-align.ll b/llvm/test/CodeGen/PowerPC/code-align.ll index 5550547d010d..306230be5005 100644 --- a/llvm/test/CodeGen/PowerPC/code-align.ll +++ b/llvm/test/CodeGen/PowerPC/code-align.ll @@ -1,15 +1,15 @@ ; RUN: llc -mcpu=ppc64 < %s | FileCheck %s -check-prefix=GENERIC -; RUN: llc -mcpu=970 < %s | FileCheck %s -check-prefix=BASIC +; RUN: llc -mcpu=970 < %s | FileCheck %s -check-prefix=PWR ; RUN: llc -mcpu=a2 < %s | FileCheck %s -check-prefix=BASIC ; RUN: llc -mcpu=e500mc < %s | FileCheck %s -check-prefix=BASIC ; RUN: llc -mcpu=e5500 < %s | FileCheck %s -check-prefix=BASIC -; RUN: llc -mcpu=pwr4 < %s | FileCheck %s -check-prefix=BASIC -; RUN: llc -mcpu=pwr5 < %s | FileCheck %s -check-prefix=BASIC -; RUN: llc -mcpu=pwr5x < %s | FileCheck %s -check-prefix=BASIC -; RUN: llc -mcpu=pwr6 < %s | FileCheck %s -check-prefix=BASIC -; RUN: llc -mcpu=pwr6x < %s | FileCheck %s -check-prefix=BASIC -; RUN: llc -mcpu=pwr7 < %s | FileCheck %s -check-prefix=BASIC -; RUN: llc -mcpu=pwr8 < %s | FileCheck %s -check-prefix=BASIC +; RUN: llc -mcpu=pwr4 < %s | FileCheck %s -check-prefix=PWR +; RUN: llc -mcpu=pwr5 < %s | FileCheck %s -check-prefix=PWR +; RUN: llc -mcpu=pwr5x < %s | FileCheck %s -check-prefix=PWR +; RUN: llc -mcpu=pwr6 < %s | FileCheck %s -check-prefix=PWR +; RUN: llc -mcpu=pwr6x < %s | FileCheck %s -check-prefix=PWR +; RUN: llc -mcpu=pwr7 < %s | FileCheck %s 
-check-prefix=PWR +; RUN: llc -mcpu=pwr8 < %s | FileCheck %s -check-prefix=PWR target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64-unknown-linux-gnu" @@ -21,10 +21,13 @@ entry: ; GENERIC-LABEL: .globl foo ; BASIC-LABEL: .globl foo +; PWR-LABEL: .globl foo ; GENERIC: .align 2 ; BASIC: .align 4 +; PWR: .align 4 ; GENERIC: @foo ; BASIC: @foo +; PWR: @foo } ; Function Attrs: nounwind @@ -34,12 +37,16 @@ entry: ; GENERIC-LABEL: @loop ; BASIC-LABEL: @loop +; PWR-LABEL: @loop ; GENERIC: mtctr ; BASIC: mtctr +; PWR: mtctr ; GENERIC-NOT: .align ; BASIC: .align 4 +; PWR: .align 4 ; GENERIC: bdnz ; BASIC: bdnz +; PWR: bdnz vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] @@ -60,6 +67,38 @@ for.end: ; preds = %vector.body ret void } +; Function Attrs: nounwind +define void @sloop(i32 signext %x, i32* nocapture %a) #1 { +entry: + br label %for.body + +; GENERIC-LABEL: @sloop +; BASIC-LABEL: @sloop +; PWR-LABEL: @sloop +; GENERIC: mtctr +; BASIC: mtctr +; PWR: mtctr +; GENERIC-NOT: .align +; BASIC: .align 4 +; PWR: .align 5 +; GENERIC: bdnz +; BASIC: bdnz +; PWR: bdnz + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4 + %add = add nsw i32 %0, 4 + store i32 %add, i32* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 2048 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + attributes #0 = { nounwind readnone } attributes #1 = { nounwind }