[RegAllocGreedy] Introduce a late pass to repair broken hints.

A broken hint is a copy where both ends are assigned different colors. When a
variable gets evicted in the neighborhood of such copies, it is likely we can
reconcile some of them.


** Context **

Copies are inserted during the register allocation via splitting. These split
points are required to relax the constraints on the allocation problem. When
such a point is inserted, both ends of the copy would not share the same color
with respect to the current allocation problem. When variables get evicted,
the allocation problem becomes different and some split point may not be
required anymore. However, the related variables may already have been colored.

This usually shows up in the assembly with pattern like this:
def A
...
save A to B
def A
use A
restore A from B
...
use B

Whereas we could simply have done:
def B
...
def A
use A
...
use B


** Proposed Solution **

A variable having a broken hint is marked for late recoloring if and only if
selecting a register for it evict another variable. Indeed, if no eviction
happens this is pointless to look for recoloring opportunities as it means the
situation was the same as the initial allocation problem where we had to break
the hint.

Finally, when everything has been allocated, we look for recoloring
opportunities for all the identified candidates.
The recoloring is performed very late to rely on accurate copy cost (all
involved variables are allocated).
The recoloring is simple unlike the last change recoloring. It propagates the
color of the broken hint to all its copy-related variables. If the color is
available for them, the recoloring uses it, otherwise it gives up on that hint
even if a more complex coloring would have worked.

The recoloring happens only if it is profitable. The profitability is evaluated
using the expected frequency of the copies of the currently recolored variable
with a) its current color and b) with the target color. If a) is greater or
equal than b), then it is profitable and the recoloring happen.


** Example **

Consider the following example:
BB1:
  a =
  b =
BB2:
  ...
   = b
   = a
Let us assume b gets split:
BB1:
  a =
  b =
BB2:
  c = b
  ...
  d = c
  = d
  = a
Because of how the allocation work, b, c, and d may be assigned different
colors. Now, if a gets evicted to make room for c, assuming b and d were
assigned to something different than a.
We end up with:
BB1:
  a =
  st a, SpillSlot
  b =
BB2:
  c = b
  ...
  d = c
  = d
  e = ld SpillSlot
  = e
This is likely that we can assign the same register for b, c, and d,
getting rid of 2 copies.


** Performances **

Both ARM64 and x86_64 show performance improvements of up to 3% for the
llvm-testsuite + externals with Os and O3. There are a few regressions too that
comes from the (in)accuracy of the block frequency estimate.

<rdar://problem/18312047>

llvm-svn: 225422
This commit is contained in:
Quentin Colombet 2015-01-08 01:16:39 +00:00
parent 2b6917b020
commit a799e2e014
4 changed files with 357 additions and 2 deletions

View File

@ -90,6 +90,7 @@ void RegAllocBase::allocatePhysRegs() {
// Unused registers can appear when the spiller coalesces snippets.
if (MRI->reg_nodbg_empty(VirtReg->reg)) {
DEBUG(dbgs() << "Dropping unused " << *VirtReg << '\n');
aboutToRemoveInterval(*VirtReg);
LIS->removeInterval(VirtReg->reg);
continue;
}
@ -139,6 +140,7 @@ void RegAllocBase::allocatePhysRegs() {
assert(!VRM->hasPhys(SplitVirtReg->reg) && "Register already assigned");
if (MRI->reg_nodbg_empty(SplitVirtReg->reg)) {
DEBUG(dbgs() << "not queueing unused " << *SplitVirtReg << '\n');
aboutToRemoveInterval(*SplitVirtReg);
LIS->removeInterval(SplitVirtReg->reg);
continue;
}

View File

@ -96,6 +96,9 @@ protected:
// Use this group name for NamedRegionTimer.
static const char TimerGroupName[];
/// Method called when the allocator is about to remove a LiveInterval.
virtual void aboutToRemoveInterval(LiveInterval &LI) {}
public:
/// VerifyEnabled - True when -verify-regalloc is given.
static bool VerifyEnabled;

View File

@ -296,6 +296,9 @@ class RAGreedy : public MachineFunctionPass,
/// obtained from the TargetSubtargetInfo.
bool EnableLocalReassign;
/// Set of broken hints that may be reconciled later because of eviction.
SmallSetVector<LiveInterval *, 8> SetOfBrokenHints;
public:
RAGreedy();
@ -311,6 +314,7 @@ public:
void enqueue(LiveInterval *LI) override;
LiveInterval *dequeue() override;
unsigned selectOrSplit(LiveInterval&, SmallVectorImpl<unsigned>&) override;
void aboutToRemoveInterval(LiveInterval &) override;
/// Perform register allocation.
bool runOnMachineFunction(MachineFunction &mf) override;
@ -378,6 +382,24 @@ private:
SmallVirtRegSet &, unsigned);
bool tryRecoloringCandidates(PQueue &, SmallVectorImpl<unsigned> &,
SmallVirtRegSet &, unsigned);
void tryHintRecoloring(LiveInterval &);
void tryHintsRecoloring();
/// Model the information carried by one end of a copy.
struct HintInfo {
/// The frequency of the copy.
BlockFrequency Freq;
/// The virtual register or physical register.
unsigned Reg;
/// Its currently assigned register.
/// In case of a physical register Reg == PhysReg.
unsigned PhysReg;
HintInfo(BlockFrequency Freq, unsigned Reg, unsigned PhysReg)
: Freq(Freq), Reg(Reg), PhysReg(PhysReg) {}
};
typedef SmallVector<HintInfo, 4> HintsInfo;
BlockFrequency getBrokenHintFreq(const HintsInfo &, unsigned);
void collectHintInfo(unsigned, HintsInfo &);
};
} // end anonymous namespace
@ -453,7 +475,9 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const {
bool RAGreedy::LRE_CanEraseVirtReg(unsigned VirtReg) {
if (VRM->hasPhys(VirtReg)) {
Matrix->unassign(LIS->getInterval(VirtReg));
LiveInterval &LI = LIS->getInterval(VirtReg);
Matrix->unassign(LI);
aboutToRemoveInterval(LI);
return true;
}
// Unassigned virtreg is probably in the priority queue.
@ -2213,6 +2237,11 @@ unsigned RAGreedy::tryAssignCSRFirstTime(LiveInterval &VirtReg,
return PhysReg;
}
void RAGreedy::aboutToRemoveInterval(LiveInterval &LI) {
// Do not keep invalid information around.
SetOfBrokenHints.remove(&LI);
}
void RAGreedy::initializeCSRCost() {
// We use the larger one out of the command-line option and the value report
// by TRI.
@ -2238,6 +2267,170 @@ void RAGreedy::initializeCSRCost() {
CSRCost = CSRCost.getFrequency() * (ActualEntry / FixedEntry);
}
/// \brief Collect the hint info for \p Reg.
/// The results are stored into \p Out.
/// \p Out is not cleared before being populated.
void RAGreedy::collectHintInfo(unsigned Reg, HintsInfo &Out) {
for (const MachineInstr &Instr : MRI->reg_nodbg_instructions(Reg)) {
if (!Instr.isFullCopy())
continue;
// Look for the other end of the copy.
unsigned OtherReg = Instr.getOperand(0).getReg();
if (OtherReg == Reg) {
OtherReg = Instr.getOperand(1).getReg();
if (OtherReg == Reg)
continue;
}
// Get the current assignment.
unsigned OtherPhysReg = TargetRegisterInfo::isPhysicalRegister(OtherReg)
? OtherReg
: VRM->getPhys(OtherReg);
// Push the collected information.
Out.push_back(HintInfo(MBFI->getBlockFreq(Instr.getParent()), OtherReg,
OtherPhysReg));
}
}
/// \brief Using the given \p List, compute the cost of the broken hints if
/// \p PhysReg was used.
/// \return The cost of \p List for \p PhysReg.
BlockFrequency RAGreedy::getBrokenHintFreq(const HintsInfo &List,
unsigned PhysReg) {
BlockFrequency Cost = 0;
for (const HintInfo &Info : List) {
if (Info.PhysReg != PhysReg)
Cost += Info.Freq;
}
return Cost;
}
/// \brief Using the register assigned to \p VirtReg, try to recolor
/// all the live ranges that are copy-related with \p VirtReg.
/// The recoloring is then propagated to all the live-ranges that have
/// been recolored and so on, until no more copies can be coalesced or
/// it is not profitable.
/// For a given live range, profitability is determined by the sum of the
/// frequencies of the non-identity copies it would introduce with the old
/// and new register.
void RAGreedy::tryHintRecoloring(LiveInterval &VirtReg) {
// We have a broken hint, check if it is possible to fix it by
// reusing PhysReg for the copy-related live-ranges. Indeed, we evicted
// some register and PhysReg may be available for the other live-ranges.
SmallSet<unsigned, 4> Visited;
SmallVector<unsigned, 2> RecoloringCandidates;
HintsInfo Info;
unsigned Reg = VirtReg.reg;
unsigned PhysReg = VRM->getPhys(Reg);
// Start the recoloring algorithm from the input live-interval, then
// it will propagate to the ones that are copy-related with it.
Visited.insert(Reg);
RecoloringCandidates.push_back(Reg);
DEBUG(dbgs() << "Trying to reconcile hints for: " << PrintReg(Reg, TRI) << '('
<< PrintReg(PhysReg, TRI) << ")\n");
do {
Reg = RecoloringCandidates.pop_back_val();
// We cannot recolor physcal register.
if (TargetRegisterInfo::isPhysicalRegister(Reg))
continue;
assert(VRM->hasPhys(Reg) && "We have unallocated variable!!");
// Get the live interval mapped with this virtual register to be able
// to check for the interference with the new color.
LiveInterval &LI = LIS->getInterval(Reg);
unsigned CurrPhys = VRM->getPhys(Reg);
// Check that the new color matches the register class constraints and
// that it is free for this live range.
if (CurrPhys != PhysReg && (!MRI->getRegClass(Reg)->contains(PhysReg) ||
Matrix->checkInterference(LI, PhysReg)))
continue;
DEBUG(dbgs() << PrintReg(Reg, TRI) << '(' << PrintReg(CurrPhys, TRI)
<< ") is recolorable.\n");
// Gather the hint info.
Info.clear();
collectHintInfo(Reg, Info);
// Check if recoloring the live-range will increase the cost of the
// non-identity copies.
if (CurrPhys != PhysReg) {
DEBUG(dbgs() << "Checking profitability:\n");
BlockFrequency OldCopiesCost = getBrokenHintFreq(Info, CurrPhys);
BlockFrequency NewCopiesCost = getBrokenHintFreq(Info, PhysReg);
DEBUG(dbgs() << "Old Cost: " << OldCopiesCost.getFrequency()
<< "\nNew Cost: " << NewCopiesCost.getFrequency() << '\n');
if (OldCopiesCost < NewCopiesCost) {
DEBUG(dbgs() << "=> Not profitable.\n");
continue;
}
// At this point, the cost is either cheaper or equal. If it is
// equal, we consider this is profitable because it may expose
// more recoloring opportunities.
DEBUG(dbgs() << "=> Profitable.\n");
// Recolor the live-range.
Matrix->unassign(LI);
Matrix->assign(LI, PhysReg);
}
// Push all copy-related live-ranges to keep reconciling the broken
// hints.
for (const HintInfo &HI : Info) {
if (Visited.insert(HI.Reg).second)
RecoloringCandidates.push_back(HI.Reg);
}
} while (!RecoloringCandidates.empty());
}
/// \brief Try to recolor broken hints.
/// Broken hints may be repaired by recoloring when an evicted variable
/// freed up a register for a larger live-range.
/// Consider the following example:
/// BB1:
/// a =
/// b =
/// BB2:
/// ...
/// = b
/// = a
/// Let us assume b gets split:
/// BB1:
/// a =
/// b =
/// BB2:
/// c = b
/// ...
/// d = c
/// = d
/// = a
/// Because of how the allocation work, b, c, and d may be assigned different
/// colors. Now, if a gets evicted later:
/// BB1:
/// a =
/// st a, SpillSlot
/// b =
/// BB2:
/// c = b
/// ...
/// d = c
/// = d
/// e = ld SpillSlot
/// = e
/// This is likely that we can assign the same register for b, c, and d,
/// getting rid of 2 copies.
void RAGreedy::tryHintsRecoloring() {
for (LiveInterval *LI : SetOfBrokenHints) {
assert(TargetRegisterInfo::isVirtualRegister(LI->reg) &&
"Recoloring is possible only for virtual registers");
// Some dead defs may be around (e.g., because of debug uses).
// Ignore those.
if (!VRM->hasPhys(LI->reg))
continue;
tryHintRecoloring(*LI);
}
}
unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg,
SmallVectorImpl<unsigned> &NewVRegs,
SmallVirtRegSet &FixedRegisters,
@ -2274,8 +2467,18 @@ unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg,
// queue. The RS_Split ranges already failed to do this, and they should not
// get a second chance until they have been split.
if (Stage != RS_Split)
if (unsigned PhysReg = tryEvict(VirtReg, Order, NewVRegs, CostPerUseLimit))
if (unsigned PhysReg =
tryEvict(VirtReg, Order, NewVRegs, CostPerUseLimit)) {
unsigned Hint = MRI->getSimpleHint(VirtReg.reg);
// If VirtReg has a hint and that hint is broken record this
// virtual register as a recoloring candidate for broken hint.
// Indeed, since we evicted a variable in its neighborhood it is
// likely we can at least partially recolor some of the
// copy-related live-ranges.
if (Hint && Hint != PhysReg)
SetOfBrokenHints.insert(&VirtReg);
return PhysReg;
}
assert(NewVRegs.empty() && "Cannot append to existing NewVRegs");
@ -2355,8 +2558,10 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
NextCascade = 1;
IntfCache.init(MF, Matrix->getLiveUnions(), Indexes, LIS, TRI);
GlobalCand.resize(32); // This will grow as needed.
SetOfBrokenHints.clear();
allocatePhysRegs();
tryHintsRecoloring();
releaseMemory();
return true;
}

View File

@ -0,0 +1,145 @@
; RUN: llc < %s -o - -mtriple=x86_64-apple-macosx | FileCheck %s
; Test case for the recoloring of broken hints.
; This is tricky to have something reasonably small to kick this optimization since
; it requires that spliting and spilling occur.
; The bottom line is that this test case is fragile.
; This was reduced from the make_list function from the llvm-testsuite:
; SingleSource/Benchmarks/McGill/chomp.c
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.9.0"
%struct._list = type { i32*, %struct._list* }
@ncol = external global i32, align 4
@nrow = external global i32, align 4
declare noalias i32* @copy_data()
declare noalias i8* @malloc(i64)
declare i32 @get_value()
declare i32 @in_wanted(i32* nocapture readonly)
declare noalias i32* @make_data()
; CHECK-LABEL: make_list:
; Function prologue.
; CHECK: pushq
; CHECK: subq ${{[0-9]+}}, %rsp
; Move the first argument (%data) into a temporary register.
; It will not survive the call to malloc otherwise.
; CHECK: movq %rdi, [[ARG1:%r[0-9a-z]+]]
; CHECK: callq _malloc
; Compute %data - 1 as used for load in land.rhs.i (via the variable %indvars.iv.next.i).
; CHECK: addq $-4, [[ARG1]]
; We use to produce a useless copy here and move %data in another temporary register.
; CHECK-NOT: movq [[ARG1]]
; End of the first basic block.
; CHECK: .align
; Now check that %data is used in an address computation.
; CHECK: leaq ([[ARG1]]
define %struct._list* @make_list(i32* nocapture readonly %data, i32* nocapture %value, i32* nocapture %all) {
entry:
%call = tail call i8* @malloc(i64 16)
%next = getelementptr inbounds i8* %call, i64 8
%tmp = bitcast i8* %next to %struct._list**
%tmp2 = bitcast i8* %call to %struct._list*
%.pre78 = load i32* @ncol, align 4
br label %for.cond1.preheader
for.cond1.preheader: ; preds = %for.inc32, %entry
%tmp4 = phi i32 [ %.pre78, %entry ], [ 0, %for.inc32 ]
%current.077 = phi %struct._list* [ %tmp2, %entry ], [ %current.1.lcssa, %for.inc32 ]
%cmp270 = icmp eq i32 %tmp4, 0
br i1 %cmp270, label %for.inc32, label %for.body3
for.body3: ; preds = %if.end31, %for.cond1.preheader
%current.173 = phi %struct._list* [ %current.2, %if.end31 ], [ %current.077, %for.cond1.preheader ]
%row.172 = phi i32 [ %row.3, %if.end31 ], [ 0, %for.cond1.preheader ]
%col.071 = phi i32 [ %inc, %if.end31 ], [ 0, %for.cond1.preheader ]
%call4 = tail call i32* @make_data()
%tmp5 = load i32* @ncol, align 4
%tobool14.i = icmp eq i32 %tmp5, 0
br i1 %tobool14.i, label %while.cond.i, label %while.body.lr.ph.i
while.body.lr.ph.i: ; preds = %for.body3
%tmp6 = sext i32 %tmp5 to i64
br label %while.body.i
while.body.i: ; preds = %while.body.i, %while.body.lr.ph.i
%indvars.iv.i = phi i64 [ %tmp6, %while.body.lr.ph.i ], [ %indvars.iv.next.i, %while.body.i ]
%indvars.iv.next.i = add nsw i64 %indvars.iv.i, -1
%tmp9 = trunc i64 %indvars.iv.next.i to i32
%tobool.i = icmp eq i32 %tmp9, 0
br i1 %tobool.i, label %while.cond.i, label %while.body.i
while.cond.i: ; preds = %land.rhs.i, %while.body.i, %for.body3
%indvars.iv.i64 = phi i64 [ %indvars.iv.next.i65, %land.rhs.i ], [ 0, %for.body3 ], [ %tmp6, %while.body.i ]
%indvars.iv.next.i65 = add nsw i64 %indvars.iv.i64, -1
%tmp10 = trunc i64 %indvars.iv.i64 to i32
%tobool.i66 = icmp eq i32 %tmp10, 0
br i1 %tobool.i66, label %if.else, label %land.rhs.i
land.rhs.i: ; preds = %while.cond.i
%arrayidx.i67 = getelementptr inbounds i32* %call4, i64 %indvars.iv.next.i65
%tmp11 = load i32* %arrayidx.i67, align 4
%arrayidx2.i68 = getelementptr inbounds i32* %data, i64 %indvars.iv.next.i65
%tmp12 = load i32* %arrayidx2.i68, align 4
%cmp.i69 = icmp eq i32 %tmp11, %tmp12
br i1 %cmp.i69, label %while.cond.i, label %equal_data.exit
equal_data.exit: ; preds = %land.rhs.i
%cmp3.i = icmp slt i32 %tmp10, 1
br i1 %cmp3.i, label %if.else, label %if.then
if.then: ; preds = %equal_data.exit
%next7 = getelementptr inbounds %struct._list* %current.173, i64 0, i32 1
%tmp14 = load %struct._list** %next7, align 8
%next12 = getelementptr inbounds %struct._list* %tmp14, i64 0, i32 1
store %struct._list* null, %struct._list** %next12, align 8
%tmp15 = load %struct._list** %next7, align 8
%tmp16 = load i32* %value, align 4
%cmp14 = icmp eq i32 %tmp16, 1
%.tmp16 = select i1 %cmp14, i32 0, i32 %tmp16
%tmp18 = load i32* %all, align 4
%tmp19 = or i32 %tmp18, %.tmp16
%tmp20 = icmp eq i32 %tmp19, 0
br i1 %tmp20, label %if.then19, label %if.end31
if.then19: ; preds = %if.then
%call21 = tail call i32 @in_wanted(i32* %call4)
br label %if.end31
if.else: ; preds = %equal_data.exit, %while.cond.i
%cmp26 = icmp eq i32 %col.071, 0
%.row.172 = select i1 %cmp26, i32 0, i32 %row.172
%sub30 = add nsw i32 %tmp5, -1
br label %if.end31
if.end31: ; preds = %if.else, %if.then19, %if.then
%col.1 = phi i32 [ %sub30, %if.else ], [ 0, %if.then ], [ 0, %if.then19 ]
%row.3 = phi i32 [ %.row.172, %if.else ], [ %row.172, %if.then ], [ 0, %if.then19 ]
%current.2 = phi %struct._list* [ %current.173, %if.else ], [ %tmp15, %if.then ], [ %tmp15, %if.then19 ]
%inc = add nsw i32 %col.1, 1
%tmp25 = load i32* @ncol, align 4
%cmp2 = icmp eq i32 %inc, %tmp25
br i1 %cmp2, label %for.cond1.for.inc32_crit_edge, label %for.body3
for.cond1.for.inc32_crit_edge: ; preds = %if.end31
%.pre79 = load i32* @nrow, align 4
br label %for.inc32
for.inc32: ; preds = %for.cond1.for.inc32_crit_edge, %for.cond1.preheader
%tmp26 = phi i32 [ %.pre79, %for.cond1.for.inc32_crit_edge ], [ 0, %for.cond1.preheader ]
%current.1.lcssa = phi %struct._list* [ %current.2, %for.cond1.for.inc32_crit_edge ], [ %current.077, %for.cond1.preheader ]
%row.1.lcssa = phi i32 [ %row.3, %for.cond1.for.inc32_crit_edge ], [ 0, %for.cond1.preheader ]
%inc33 = add nsw i32 %row.1.lcssa, 1
%cmp = icmp eq i32 %inc33, %tmp26
br i1 %cmp, label %for.end34, label %for.cond1.preheader
for.end34: ; preds = %for.inc32
%.pre = load %struct._list** %tmp, align 8
ret %struct._list* %.pre
}