llvm-project/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp

392 lines
14 KiB
C++
Raw Normal View History

//===-- PPCBranchSelector.cpp - Emit long conditional branches ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that scans a machine function to determine which
// conditional branches need more than 16 bits of displacement to reach their
// target basic block. It does this in two passes; a calculation of basic block
2010-08-24 04:30:51 +08:00
// positions pass, and a branch pseudo op to machine branch opcode pass. This
// pass should be run last, just before the assembly printer.
//
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/PPCPredicates.h"
#include "PPC.h"
#include "PPCInstrBuilder.h"
#include "PPCInstrInfo.h"
#include "PPCSubtarget.h"
2006-11-17 02:13:49 +08:00
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
using namespace llvm;
#define DEBUG_TYPE "ppc-branch-select"
STATISTIC(NumExpanded, "Number of branches expanded to long format");
2006-11-17 02:13:49 +08:00
namespace {
struct PPCBSel : public MachineFunctionPass {
2007-05-03 09:11:54 +08:00
static char ID;
PPCBSel() : MachineFunctionPass(ID) {
initializePPCBSelPass(*PassRegistry::getPassRegistry());
}
// The sizes of the basic blocks in the function (the first
// element of the pair); the second element of the pair is the amount of the
// size that is due to potential padding.
std::vector<std::pair<unsigned, unsigned>> BlockSizes;
// The first block number which has imprecise instruction address.
int FirstImpreciseBlock = -1;
unsigned GetAlignmentAdjustment(MachineBasicBlock &MBB, unsigned Offset);
unsigned ComputeBlockSizes(MachineFunction &Fn);
void modifyAdjustment(MachineFunction &Fn);
int computeBranchSize(MachineFunction &Fn,
const MachineBasicBlock *Src,
const MachineBasicBlock *Dest,
unsigned BrOffset);
bool runOnMachineFunction(MachineFunction &Fn) override;
MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().set(
MachineFunctionProperties::Property::NoVRegs);
}
StringRef getPassName() const override { return "PowerPC Branch Selector"; }
};
2007-05-03 09:11:54 +08:00
char PPCBSel::ID = 0;
}
INITIALIZE_PASS(PPCBSel, "ppc-branch-select", "PowerPC Branch Selector",
false, false)
/// createPPCBranchSelectionPass - returns an instance of the Branch Selection
/// Pass
///
FunctionPass *llvm::createPPCBranchSelectionPass() {
return new PPCBSel();
}
/// In order to make MBB aligned, we need to add an adjustment value to the
/// original Offset.
unsigned PPCBSel::GetAlignmentAdjustment(MachineBasicBlock &MBB,
unsigned Offset) {
unsigned Align = MBB.getAlignment();
if (!Align)
return 0;
unsigned AlignAmt = 1 << Align;
unsigned ParentAlign = MBB.getParent()->getAlignment();
if (Align <= ParentAlign)
return OffsetToAlignment(Offset, AlignAmt);
// The alignment of this MBB is larger than the function's alignment, so we
// can't tell whether or not it will insert nops. Assume that it will.
if (FirstImpreciseBlock < 0)
FirstImpreciseBlock = MBB.getNumber();
return AlignAmt + OffsetToAlignment(Offset, AlignAmt);
}
/// We need to be careful about the offset of the first block in the function
/// because it might not have the function's alignment. This happens because,
/// under the ELFv2 ABI, for functions which require a TOC pointer, we add a
/// two-instruction sequence to the start of the function.
/// Note: This needs to be synchronized with the check in
/// PPCLinuxAsmPrinter::EmitFunctionBodyStart.
static inline unsigned GetInitialOffset(MachineFunction &Fn) {
unsigned InitialOffset = 0;
if (Fn.getSubtarget<PPCSubtarget>().isELFv2ABI() &&
!Fn.getRegInfo().use_empty(PPC::X2))
InitialOffset = 8;
return InitialOffset;
}
/// Measure each MBB and compute a size for the entire function.
unsigned PPCBSel::ComputeBlockSizes(MachineFunction &Fn) {
const PPCInstrInfo *TII =
static_cast<const PPCInstrInfo *>(Fn.getSubtarget().getInstrInfo());
unsigned FuncSize = GetInitialOffset(Fn);
for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
++MFI) {
MachineBasicBlock *MBB = &*MFI;
// The end of the previous block may have extra nops if this block has an
// alignment requirement.
if (MBB->getNumber() > 0) {
unsigned AlignExtra = GetAlignmentAdjustment(*MBB, FuncSize);
auto &BS = BlockSizes[MBB->getNumber()-1];
BS.first += AlignExtra;
BS.second = AlignExtra;
FuncSize += AlignExtra;
}
unsigned BlockSize = 0;
for (MachineInstr &MI : *MBB) {
BlockSize += TII->getInstSizeInBytes(MI);
if (MI.isInlineAsm() && (FirstImpreciseBlock < 0))
FirstImpreciseBlock = MBB->getNumber();
}
BlockSizes[MBB->getNumber()].first = BlockSize;
FuncSize += BlockSize;
}
return FuncSize;
}
/// Modify the basic block align adjustment.
void PPCBSel::modifyAdjustment(MachineFunction &Fn) {
unsigned Offset = GetInitialOffset(Fn);
for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
++MFI) {
MachineBasicBlock *MBB = &*MFI;
if (MBB->getNumber() > 0) {
auto &BS = BlockSizes[MBB->getNumber()-1];
BS.first -= BS.second;
Offset -= BS.second;
unsigned AlignExtra = GetAlignmentAdjustment(*MBB, Offset);
BS.first += AlignExtra;
BS.second = AlignExtra;
Offset += AlignExtra;
}
Offset += BlockSizes[MBB->getNumber()].first;
}
}
/// Determine the offset from the branch in Src block to the Dest block.
/// BrOffset is the offset of the branch instruction inside Src block.
int PPCBSel::computeBranchSize(MachineFunction &Fn,
const MachineBasicBlock *Src,
const MachineBasicBlock *Dest,
unsigned BrOffset) {
int BranchSize;
unsigned MaxAlign = 2;
bool NeedExtraAdjustment = false;
if (Dest->getNumber() <= Src->getNumber()) {
// If this is a backwards branch, the delta is the offset from the
// start of this block to this branch, plus the sizes of all blocks
// from this block to the dest.
BranchSize = BrOffset;
MaxAlign = std::max(MaxAlign, Src->getAlignment());
int DestBlock = Dest->getNumber();
BranchSize += BlockSizes[DestBlock].first;
for (unsigned i = DestBlock+1, e = Src->getNumber(); i < e; ++i) {
BranchSize += BlockSizes[i].first;
MaxAlign = std::max(MaxAlign,
Fn.getBlockNumbered(i)->getAlignment());
}
NeedExtraAdjustment = (FirstImpreciseBlock >= 0) &&
(DestBlock >= FirstImpreciseBlock);
} else {
// Otherwise, add the size of the blocks between this block and the
// dest to the number of bytes left in this block.
unsigned StartBlock = Src->getNumber();
BranchSize = BlockSizes[StartBlock].first - BrOffset;
MaxAlign = std::max(MaxAlign, Dest->getAlignment());
for (unsigned i = StartBlock+1, e = Dest->getNumber(); i != e; ++i) {
BranchSize += BlockSizes[i].first;
MaxAlign = std::max(MaxAlign,
Fn.getBlockNumbered(i)->getAlignment());
}
NeedExtraAdjustment = (FirstImpreciseBlock >= 0) &&
(Src->getNumber() >= FirstImpreciseBlock);
}
// We tend to over estimate code size due to large alignment and
// inline assembly. Usually it causes larger computed branch offset.
// But sometimes it may also causes smaller computed branch offset
// than actual branch offset. If the offset is close to the limit of
// encoding, it may cause problem at run time.
// Following is a simplified example.
//
// actual estimated
// address address
// ...
// bne Far 100 10c
// .p2align 4
// Near: 110 110
// ...
// Far: 8108 8108
//
// Actual offset: 0x8108 - 0x100 = 0x8008
// Computed offset: 0x8108 - 0x10c = 0x7ffc
//
// This example also shows when we can get the largest gap between
// estimated offset and actual offset. If there is an aligned block
// ABB between branch and target, assume its alignment is <align>
// bits. Now consider the accumulated function size FSIZE till the end
// of previous block PBB. If the estimated FSIZE is multiple of
// 2^<align>, we don't need any padding for the estimated address of
// ABB. If actual FSIZE at the end of PBB is 4 bytes more than
// multiple of 2^<align>, then we need (2^<align> - 4) bytes of
// padding. It also means the actual branch offset is (2^<align> - 4)
// larger than computed offset. Other actual FSIZE needs less padding
// bytes, so causes smaller gap between actual and computed offset.
//
// On the other hand, if the inline asm or large alignment occurs
// between the branch block and destination block, the estimated address
// can be <delta> larger than actual address. If padding bytes are
// needed for a later aligned block, the actual number of padding bytes
// is at most <delta> more than estimated padding bytes. So the actual
// aligned block address is less than or equal to the estimated aligned
// block address. So the actual branch offset is less than or equal to
// computed branch offset.
//
// The computed offset is at most ((1 << alignment) - 4) bytes smaller
// than actual offset. So we add this number to the offset for safety.
if (NeedExtraAdjustment)
BranchSize += (1 << MaxAlign) - 4;
return BranchSize;
}
bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
const PPCInstrInfo *TII =
static_cast<const PPCInstrInfo *>(Fn.getSubtarget().getInstrInfo());
// Give the blocks of the function a dense, in-order, numbering.
Fn.RenumberBlocks();
BlockSizes.resize(Fn.getNumBlockIDs());
FirstImpreciseBlock = -1;
// Measure each MBB and compute a size for the entire function.
unsigned FuncSize = ComputeBlockSizes(Fn);
// If the entire function is smaller than the displacement of a branch field,
// we know we don't need to shrink any branches in this function. This is a
// common case.
if (FuncSize < (1 << 15)) {
BlockSizes.clear();
return false;
}
// For each conditional branch, if the offset to its destination is larger
// than the offset field allows, transform it into a long branch sequence
// like this:
// short branch:
// bCC MBB
// long branch:
// b!CC $PC+8
// b MBB
//
bool MadeChange = true;
bool EverMadeChange = false;
while (MadeChange) {
// Iteratively expand branches until we reach a fixed point.
MadeChange = false;
for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
++MFI) {
MachineBasicBlock &MBB = *MFI;
unsigned MBBStartOffset = 0;
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
I != E; ++I) {
MachineBasicBlock *Dest = nullptr;
if (I->getOpcode() == PPC::BCC && !I->getOperand(2).isImm())
Dest = I->getOperand(2).getMBB();
Add CR-bit tracking to the PowerPC backend for i1 values This change enables tracking i1 values in the PowerPC backend using the condition register bits. These bits can be treated on PowerPC as separate registers; individual bit operations (and, or, xor, etc.) are supported. Tracking booleans in CR bits has several advantages: - Reduction in register pressure (because we no longer need GPRs to store boolean values). - Logical operations on booleans can be handled more efficiently; we used to have to move all results from comparisons into GPRs, perform promoted logical operations in GPRs, and then move the result back into condition register bits to be used by conditional branches. This can be very inefficient, because the throughput of these CR <-> GPR moves have high latency and low throughput (especially when other associated instructions are accounted for). - On the POWER7 and similar cores, we can increase total throughput by using the CR bits. CR bit operations have a dedicated functional unit. Most of this is more-or-less mechanical: Adjustments were needed in the calling-convention code, support was added for spilling/restoring individual condition-register bits, and conditional branch instruction definitions taking specific CR bits were added (plus patterns and code for generating bit-level operations). This is enabled by default when running at -O2 and higher. For -O0 and -O1, where the ability to debug is more important, this feature is disabled by default. Individual CR bits do not have assigned DWARF register numbers, and storing values in CR bits makes them invisible to the debugger. It is critical, however, that we don't move i1 values that have been promoted to larger values (such as those passed as function arguments) into bit registers only to quickly turn around and move the values back into GPRs (such as happens when values are returned by functions). A pair of target-specific DAG combines are added to remove the trunc/extends in: trunc(binary-ops(binary-ops(zext(x), zext(y)), ...) and: zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...) In short, we only want to use CR bits where some of the i1 values come from comparisons or are used by conditional branches or selects. To put it another way, if we can do the entire i1 computation in GPRs, then we probably should (on the POWER7, the GPR-operation throughput is higher, and for all cores, the CR <-> GPR moves are expensive). POWER7 test-suite performance results (from 10 runs in each configuration): SingleSource/Benchmarks/Misc/mandel-2: 35% speedup MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown MultiSource/Applications/lemon/lemon: 8% slowdown llvm-svn: 202451
2014-02-28 08:27:01 +08:00
else if ((I->getOpcode() == PPC::BC || I->getOpcode() == PPC::BCn) &&
!I->getOperand(1).isImm())
Dest = I->getOperand(1).getMBB();
else if ((I->getOpcode() == PPC::BDNZ8 || I->getOpcode() == PPC::BDNZ ||
I->getOpcode() == PPC::BDZ8 || I->getOpcode() == PPC::BDZ) &&
!I->getOperand(0).isImm())
Dest = I->getOperand(0).getMBB();
if (!Dest) {
MBBStartOffset += TII->getInstSizeInBytes(*I);
continue;
}
// Determine the offset from the current branch to the destination
// block.
int BranchSize = computeBranchSize(Fn, &MBB, Dest, MBBStartOffset);
// If this branch is in range, ignore it.
if (isInt<16>(BranchSize)) {
MBBStartOffset += 4;
continue;
}
// Otherwise, we have to expand it to a long branch.
MachineInstr &OldBranch = *I;
DebugLoc dl = OldBranch.getDebugLoc();
if (I->getOpcode() == PPC::BCC) {
// The BCC operands are:
// 0. PPC branch predicate
// 1. CR register
// 2. Target MBB
PPC::Predicate Pred = (PPC::Predicate)I->getOperand(0).getImm();
unsigned CRReg = I->getOperand(1).getReg();
// Jump over the uncond branch inst (i.e. $PC+8) on opposite condition.
BuildMI(MBB, I, dl, TII->get(PPC::BCC))
.addImm(PPC::InvertPredicate(Pred)).addReg(CRReg).addImm(2);
Add CR-bit tracking to the PowerPC backend for i1 values This change enables tracking i1 values in the PowerPC backend using the condition register bits. These bits can be treated on PowerPC as separate registers; individual bit operations (and, or, xor, etc.) are supported. Tracking booleans in CR bits has several advantages: - Reduction in register pressure (because we no longer need GPRs to store boolean values). - Logical operations on booleans can be handled more efficiently; we used to have to move all results from comparisons into GPRs, perform promoted logical operations in GPRs, and then move the result back into condition register bits to be used by conditional branches. This can be very inefficient, because the throughput of these CR <-> GPR moves have high latency and low throughput (especially when other associated instructions are accounted for). - On the POWER7 and similar cores, we can increase total throughput by using the CR bits. CR bit operations have a dedicated functional unit. Most of this is more-or-less mechanical: Adjustments were needed in the calling-convention code, support was added for spilling/restoring individual condition-register bits, and conditional branch instruction definitions taking specific CR bits were added (plus patterns and code for generating bit-level operations). This is enabled by default when running at -O2 and higher. For -O0 and -O1, where the ability to debug is more important, this feature is disabled by default. Individual CR bits do not have assigned DWARF register numbers, and storing values in CR bits makes them invisible to the debugger. It is critical, however, that we don't move i1 values that have been promoted to larger values (such as those passed as function arguments) into bit registers only to quickly turn around and move the values back into GPRs (such as happens when values are returned by functions). A pair of target-specific DAG combines are added to remove the trunc/extends in: trunc(binary-ops(binary-ops(zext(x), zext(y)), ...) and: zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...) In short, we only want to use CR bits where some of the i1 values come from comparisons or are used by conditional branches or selects. To put it another way, if we can do the entire i1 computation in GPRs, then we probably should (on the POWER7, the GPR-operation throughput is higher, and for all cores, the CR <-> GPR moves are expensive). POWER7 test-suite performance results (from 10 runs in each configuration): SingleSource/Benchmarks/Misc/mandel-2: 35% speedup MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown MultiSource/Applications/lemon/lemon: 8% slowdown llvm-svn: 202451
2014-02-28 08:27:01 +08:00
} else if (I->getOpcode() == PPC::BC) {
unsigned CRBit = I->getOperand(0).getReg();
BuildMI(MBB, I, dl, TII->get(PPC::BCn)).addReg(CRBit).addImm(2);
} else if (I->getOpcode() == PPC::BCn) {
unsigned CRBit = I->getOperand(0).getReg();
BuildMI(MBB, I, dl, TII->get(PPC::BC)).addReg(CRBit).addImm(2);
} else if (I->getOpcode() == PPC::BDNZ) {
BuildMI(MBB, I, dl, TII->get(PPC::BDZ)).addImm(2);
} else if (I->getOpcode() == PPC::BDNZ8) {
BuildMI(MBB, I, dl, TII->get(PPC::BDZ8)).addImm(2);
} else if (I->getOpcode() == PPC::BDZ) {
BuildMI(MBB, I, dl, TII->get(PPC::BDNZ)).addImm(2);
} else if (I->getOpcode() == PPC::BDZ8) {
BuildMI(MBB, I, dl, TII->get(PPC::BDNZ8)).addImm(2);
} else {
llvm_unreachable("Unhandled branch type!");
}
// Uncond branch to the real destination.
I = BuildMI(MBB, I, dl, TII->get(PPC::B)).addMBB(Dest);
// Remove the old branch from the function.
OldBranch.eraseFromParent();
// Remember that this instruction is 8-bytes, increase the size of the
// block by 4, remember to iterate.
BlockSizes[MBB.getNumber()].first += 4;
MBBStartOffset += 8;
++NumExpanded;
MadeChange = true;
}
}
if (MadeChange) {
// If we're going to iterate again, make sure we've updated our
// padding-based contributions to the block sizes.
modifyAdjustment(Fn);
}
EverMadeChange |= MadeChange;
}
BlockSizes.clear();
return true;
}