llvm-project/llvm/lib/CodeGen/RegAllocGreedy.cpp

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

2551 lines
96 KiB
C++
Raw Normal View History

//===- RegAllocGreedy.cpp - greedy register allocator ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the RAGreedy function pass for register allocation in
// optimized builds.
//
//===----------------------------------------------------------------------===//
#include "RegAllocGreedy.h"
#include "AllocationOrder.h"
#include "InterferenceCache.h"
#include "LiveDebugVariables.h"
#include "RegAllocBase.h"
#include "RegAllocEvictionAdvisor.h"
#include "SpillPlacement.h"
#include "SplitKit.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/IndexedMap.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/CodeGen/CalcSpillWeights.h"
#include "llvm/CodeGen/EdgeBundles.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervalUnion.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRangeEdit.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/LiveStacks.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/Spiller.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>
using namespace llvm;
#define DEBUG_TYPE "regalloc"
STATISTIC(NumGlobalSplits, "Number of split global live ranges");
STATISTIC(NumLocalSplits, "Number of split local live ranges");
STATISTIC(NumEvicted, "Number of interferences evicted");
Recommit r265547, and r265610,r265639,r265657 on top of it, plus two fixes with one about error verify-regalloc reported, and another about live range update of phi after rematerialization. r265547: Replace analyzeSiblingValues with new algorithm to fix its compile time issue. The patch is to solve PR17409 and its duplicates. analyzeSiblingValues is a N x N complexity algorithm where N is the number of siblings generated by reg splitting. Although it causes siginificant compile time issue when N is large, it is also important for performance since it removes redundent spills and enables rematerialization. To solve the compile time issue, the patch removes analyzeSiblingValues and replaces it with lower cost alternatives containing two parts. The first part creates a new spill hoisting method in postOptimization of register allocation. It does spill hoisting at once after all the spills are generated instead of inside every instance of selectOrSplit. The second part queries the define expr of the original register for rematerializaiton and keep it always available during register allocation even if it is already dead. It deletes those dead instructions only in postOptimization. With the two parts in the patch, it can remove analyzeSiblingValues without sacrificing performance. Patches on top of r265547: r265610 "Fix the compare-clang diff error introduced by r265547." r265639 "Fix the sanitizer bootstrap error in r265547." r265657 "InlineSpiller.cpp: Escap \@ in r265547. [-Wdocumentation]" Differential Revision: http://reviews.llvm.org/D15302 Differential Revision: http://reviews.llvm.org/D18934 Differential Revision: http://reviews.llvm.org/D18935 Differential Revision: http://reviews.llvm.org/D18936 llvm-svn: 266162
2016-04-13 11:08:27 +08:00
static cl::opt<SplitEditor::ComplementSpillMode> SplitSpillMode(
"split-spill-mode", cl::Hidden,
cl::desc("Spill mode for splitting live ranges"),
cl::values(clEnumValN(SplitEditor::SM_Partition, "default", "Default"),
clEnumValN(SplitEditor::SM_Size, "size", "Optimize for size"),
clEnumValN(SplitEditor::SM_Speed, "speed", "Optimize for speed")),
Recommit r265547, and r265610,r265639,r265657 on top of it, plus two fixes with one about error verify-regalloc reported, and another about live range update of phi after rematerialization. r265547: Replace analyzeSiblingValues with new algorithm to fix its compile time issue. The patch is to solve PR17409 and its duplicates. analyzeSiblingValues is a N x N complexity algorithm where N is the number of siblings generated by reg splitting. Although it causes siginificant compile time issue when N is large, it is also important for performance since it removes redundent spills and enables rematerialization. To solve the compile time issue, the patch removes analyzeSiblingValues and replaces it with lower cost alternatives containing two parts. The first part creates a new spill hoisting method in postOptimization of register allocation. It does spill hoisting at once after all the spills are generated instead of inside every instance of selectOrSplit. The second part queries the define expr of the original register for rematerializaiton and keep it always available during register allocation even if it is already dead. It deletes those dead instructions only in postOptimization. With the two parts in the patch, it can remove analyzeSiblingValues without sacrificing performance. Patches on top of r265547: r265610 "Fix the compare-clang diff error introduced by r265547." r265639 "Fix the sanitizer bootstrap error in r265547." r265657 "InlineSpiller.cpp: Escap \@ in r265547. [-Wdocumentation]" Differential Revision: http://reviews.llvm.org/D15302 Differential Revision: http://reviews.llvm.org/D18934 Differential Revision: http://reviews.llvm.org/D18935 Differential Revision: http://reviews.llvm.org/D18936 llvm-svn: 266162
2016-04-13 11:08:27 +08:00
cl::init(SplitEditor::SM_Speed));
static cl::opt<unsigned>
LastChanceRecoloringMaxDepth("lcr-max-depth", cl::Hidden,
cl::desc("Last chance recoloring max depth"),
cl::init(5));
static cl::opt<unsigned> LastChanceRecoloringMaxInterference(
"lcr-max-interf", cl::Hidden,
cl::desc("Last chance recoloring maximum number of considered"
" interference at a time"),
cl::init(8));
static cl::opt<bool> ExhaustiveSearch(
"exhaustive-register-search", cl::NotHidden,
cl::desc("Exhaustive Search for registers bypassing the depth "
"and interference cutoffs of last chance recoloring"),
cl::Hidden);
static cl::opt<bool> EnableDeferredSpilling(
"enable-deferred-spilling", cl::Hidden,
cl::desc("Instead of spilling a variable right away, defer the actual "
"code insertion to the end of the allocation. That way the "
"allocator might still find a suitable coloring for this "
"variable because of other evicted variables."),
cl::init(false));
// FIXME: Find a good default for this flag and remove the flag.
static cl::opt<unsigned>
CSRFirstTimeCost("regalloc-csr-first-time-cost",
cl::desc("Cost for first time use of callee-saved register."),
cl::init(0), cl::Hidden);
static cl::opt<unsigned long> GrowRegionComplexityBudget(
"grow-region-complexity-budget",
cl::desc("growRegion() does not scale with the number of BB edges, so "
"limit its budget and bail out once we reach the limit."),
cl::init(10000), cl::Hidden);
static cl::opt<bool> GreedyRegClassPriorityTrumpsGlobalness(
"greedy-regclass-priority-trumps-globalness",
cl::desc("Change the greedy register allocator's live range priority "
"calculation to make the AllocationPriority of the register class "
"more important then whether the range is global"),
cl::Hidden);
static RegisterRegAlloc greedyRegAlloc("greedy", "greedy register allocator",
createGreedyRegisterAllocator);
char RAGreedy::ID = 0;
char &llvm::RAGreedyID = RAGreedy::ID;
INITIALIZE_PASS_BEGIN(RAGreedy, "greedy",
"Greedy Register Allocator", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveDebugVariables)
INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(RegisterCoalescer)
INITIALIZE_PASS_DEPENDENCY(MachineScheduler)
INITIALIZE_PASS_DEPENDENCY(LiveStacks)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
INITIALIZE_PASS_DEPENDENCY(EdgeBundles)
INITIALIZE_PASS_DEPENDENCY(SpillPlacement)
INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass)
INITIALIZE_PASS_DEPENDENCY(RegAllocEvictionAdvisorAnalysis)
INITIALIZE_PASS_END(RAGreedy, "greedy",
"Greedy Register Allocator", false, false)
#ifndef NDEBUG
const char *const RAGreedy::StageName[] = {
"RS_New",
"RS_Assign",
"RS_Split",
"RS_Split2",
"RS_Spill",
"RS_Memory",
"RS_Done"
};
#endif
// Hysteresis to use when comparing floats.
// This helps stabilize decisions based on float comparisons.
const float Hysteresis = (2007 / 2048.0f); // 0.97998046875
FunctionPass* llvm::createGreedyRegisterAllocator() {
return new RAGreedy();
}
RegAlloc: Allow targets to split register allocation AMDGPU normally spills SGPRs to VGPRs. Previously, since all register classes are handled at the same time, this was problematic. We don't know ahead of time how many registers will be needed to be reserved to handle the spilling. If no VGPRs were left for spilling, we would have to try to spill to memory. If the spilled SGPRs were required for exec mask manipulation, it is highly problematic because the lanes active at the point of spill are not necessarily the same as at the restore point. Avoid this problem by fully allocating SGPRs in a separate regalloc run from VGPRs. This way we know the exact number of VGPRs needed, and can reserve them for a second run. This fixes the most serious issues, but it is still possible using inline asm to make all VGPRs unavailable. Start erroring in the case where we ever would require memory for an SGPR spill. This is implemented by giving each regalloc pass a callback which reports if a register class should be handled or not. A few passes need some small changes to deal with leftover virtual registers. In the AMDGPU implementation, a new pass is introduced to take the place of PrologEpilogInserter for SGPR spills emitted during the first run. One disadvantage of this is currently StackSlotColoring is no longer used for SGPR spills. It would need to be run again, which will require more work. Error if the standard -regalloc option is used. Introduce new separate -sgpr-regalloc and -vgpr-regalloc flags, so the two runs can be controlled individually. PBQB is not currently supported, so this also prevents using the unhandled allocator.
2018-09-27 07:36:28 +08:00
namespace llvm {
FunctionPass* createGreedyRegisterAllocator(
std::function<bool(const TargetRegisterInfo &TRI,
const TargetRegisterClass &RC)> Ftor);
}
FunctionPass* llvm::createGreedyRegisterAllocator(
std::function<bool(const TargetRegisterInfo &TRI,
const TargetRegisterClass &RC)> Ftor) {
return new RAGreedy(Ftor);
}
RAGreedy::RAGreedy(RegClassFilterFunc F):
MachineFunctionPass(ID),
RegAllocBase(F) {
}
void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
AU.addRequired<MachineBlockFrequencyInfo>();
AU.addPreserved<MachineBlockFrequencyInfo>();
[PM/AA] Rebuild LLVM's alias analysis infrastructure in a way compatible with the new pass manager, and no longer relying on analysis groups. This builds essentially a ground-up new AA infrastructure stack for LLVM. The core ideas are the same that are used throughout the new pass manager: type erased polymorphism and direct composition. The design is as follows: - FunctionAAResults is a type-erasing alias analysis results aggregation interface to walk a single query across a range of results from different alias analyses. Currently this is function-specific as we always assume that aliasing queries are *within* a function. - AAResultBase is a CRTP utility providing stub implementations of various parts of the alias analysis result concept, notably in several cases in terms of other more general parts of the interface. This can be used to implement only a narrow part of the interface rather than the entire interface. This isn't really ideal, this logic should be hoisted into FunctionAAResults as currently it will cause a significant amount of redundant work, but it faithfully models the behavior of the prior infrastructure. - All the alias analysis passes are ported to be wrapper passes for the legacy PM and new-style analysis passes for the new PM with a shared result object. In some cases (most notably CFL), this is an extremely naive approach that we should revisit when we can specialize for the new pass manager. - BasicAA has been restructured to reflect that it is much more fundamentally a function analysis because it uses dominator trees and loop info that need to be constructed for each function. All of the references to getting alias analysis results have been updated to use the new aggregation interface. All the preservation and other pass management code has been updated accordingly. The way the FunctionAAResultsWrapperPass works is to detect the available alias analyses when run, and add them to the results object. This means that we should be able to continue to respect when various passes are added to the pipeline, for example adding CFL or adding TBAA passes should just cause their results to be available and to get folded into this. The exception to this rule is BasicAA which really needs to be a function pass due to using dominator trees and loop info. As a consequence, the FunctionAAResultsWrapperPass directly depends on BasicAA and always includes it in the aggregation. This has significant implications for preserving analyses. Generally, most passes shouldn't bother preserving FunctionAAResultsWrapperPass because rebuilding the results just updates the set of known AA passes. The exception to this rule are LoopPass instances which need to preserve all the function analyses that the loop pass manager will end up needing. This means preserving both BasicAAWrapperPass and the aggregating FunctionAAResultsWrapperPass. Now, when preserving an alias analysis, you do so by directly preserving that analysis. This is only necessary for non-immutable-pass-provided alias analyses though, and there are only three of interest: BasicAA, GlobalsAA (formerly GlobalsModRef), and SCEVAA. Usually BasicAA is preserved when needed because it (like DominatorTree and LoopInfo) is marked as a CFG-only pass. I've expanded GlobalsAA into the preserved set everywhere we previously were preserving all of AliasAnalysis, and I've added SCEVAA in the intersection of that with where we preserve SCEV itself. One significant challenge to all of this is that the CGSCC passes were actually using the alias analysis implementations by taking advantage of a pretty amazing set of loop holes in the old pass manager's analysis management code which allowed analysis groups to slide through in many cases. Moving away from analysis groups makes this problem much more obvious. To fix it, I've leveraged the flexibility the design of the new PM components provides to just directly construct the relevant alias analyses for the relevant functions in the IPO passes that need them. This is a bit hacky, but should go away with the new pass manager, and is already in many ways cleaner than the prior state. Another significant challenge is that various facilities of the old alias analysis infrastructure just don't fit any more. The most significant of these is the alias analysis 'counter' pass. That pass relied on the ability to snoop on AA queries at different points in the analysis group chain. Instead, I'm planning to build printing functionality directly into the aggregation layer. I've not included that in this patch merely to keep it smaller. Note that all of this needs a nearly complete rewrite of the AA documentation. I'm planning to do that, but I'd like to make sure the new design settles, and to flesh out a bit more of what it looks like in the new pass manager first. Differential Revision: http://reviews.llvm.org/D12080 llvm-svn: 247167
2015-09-10 01:55:00 +08:00
AU.addRequired<AAResultsWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
AU.addRequired<LiveIntervals>();
AU.addPreserved<LiveIntervals>();
AU.addRequired<SlotIndexes>();
AU.addPreserved<SlotIndexes>();
AU.addRequired<LiveDebugVariables>();
AU.addPreserved<LiveDebugVariables>();
AU.addRequired<LiveStacks>();
AU.addPreserved<LiveStacks>();
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
AU.addRequired<MachineLoopInfo>();
AU.addPreserved<MachineLoopInfo>();
AU.addRequired<VirtRegMap>();
AU.addPreserved<VirtRegMap>();
AU.addRequired<LiveRegMatrix>();
AU.addPreserved<LiveRegMatrix>();
AU.addRequired<EdgeBundles>();
AU.addRequired<SpillPlacement>();
AU.addRequired<MachineOptimizationRemarkEmitterPass>();
AU.addRequired<RegAllocEvictionAdvisorAnalysis>();
MachineFunctionPass::getAnalysisUsage(AU);
}
//===----------------------------------------------------------------------===//
// LiveRangeEdit delegate methods
//===----------------------------------------------------------------------===//
bool RAGreedy::LRE_CanEraseVirtReg(Register VirtReg) {
LiveInterval &LI = LIS->getInterval(VirtReg);
if (VRM->hasPhys(VirtReg)) {
[RegAllocGreedy] Introduce a late pass to repair broken hints. A broken hint is a copy where both ends are assigned different colors. When a variable gets evicted in the neighborhood of such copies, it is likely we can reconcile some of them. ** Context ** Copies are inserted during the register allocation via splitting. These split points are required to relax the constraints on the allocation problem. When such a point is inserted, both ends of the copy would not share the same color with respect to the current allocation problem. When variables get evicted, the allocation problem becomes different and some split point may not be required anymore. However, the related variables may already have been colored. This usually shows up in the assembly with pattern like this: def A ... save A to B def A use A restore A from B ... use B Whereas we could simply have done: def B ... def A use A ... use B ** Proposed Solution ** A variable having a broken hint is marked for late recoloring if and only if selecting a register for it evict another variable. Indeed, if no eviction happens this is pointless to look for recoloring opportunities as it means the situation was the same as the initial allocation problem where we had to break the hint. Finally, when everything has been allocated, we look for recoloring opportunities for all the identified candidates. The recoloring is performed very late to rely on accurate copy cost (all involved variables are allocated). The recoloring is simple unlike the last change recoloring. It propagates the color of the broken hint to all its copy-related variables. If the color is available for them, the recoloring uses it, otherwise it gives up on that hint even if a more complex coloring would have worked. The recoloring happens only if it is profitable. The profitability is evaluated using the expected frequency of the copies of the currently recolored variable with a) its current color and b) with the target color. If a) is greater or equal than b), then it is profitable and the recoloring happen. ** Example ** Consider the following example: BB1: a = b = BB2: ... = b = a Let us assume b gets split: BB1: a = b = BB2: c = b ... d = c = d = a Because of how the allocation work, b, c, and d may be assigned different colors. Now, if a gets evicted to make room for c, assuming b and d were assigned to something different than a. We end up with: BB1: a = st a, SpillSlot b = BB2: c = b ... d = c = d e = ld SpillSlot = e This is likely that we can assign the same register for b, c, and d, getting rid of 2 copies. ** Performances ** Both ARM64 and x86_64 show performance improvements of up to 3% for the llvm-testsuite + externals with Os and O3. There are a few regressions too that comes from the (in)accuracy of the block frequency estimate. <rdar://problem/18312047> llvm-svn: 225422
2015-01-08 09:16:39 +08:00
Matrix->unassign(LI);
aboutToRemoveInterval(LI);
return true;
}
// Unassigned virtreg is probably in the priority queue.
// RegAllocBase will erase it after dequeueing.
// Nonetheless, clear the live-range so that the debug
// dump will show the right state for that VirtReg.
LI.clear();
return false;
}
void RAGreedy::LRE_WillShrinkVirtReg(Register VirtReg) {
if (!VRM->hasPhys(VirtReg))
return;
// Register is assigned, put it back on the queue for reassignment.
LiveInterval &LI = LIS->getInterval(VirtReg);
Matrix->unassign(LI);
RegAlloc: Allow targets to split register allocation AMDGPU normally spills SGPRs to VGPRs. Previously, since all register classes are handled at the same time, this was problematic. We don't know ahead of time how many registers will be needed to be reserved to handle the spilling. If no VGPRs were left for spilling, we would have to try to spill to memory. If the spilled SGPRs were required for exec mask manipulation, it is highly problematic because the lanes active at the point of spill are not necessarily the same as at the restore point. Avoid this problem by fully allocating SGPRs in a separate regalloc run from VGPRs. This way we know the exact number of VGPRs needed, and can reserve them for a second run. This fixes the most serious issues, but it is still possible using inline asm to make all VGPRs unavailable. Start erroring in the case where we ever would require memory for an SGPR spill. This is implemented by giving each regalloc pass a callback which reports if a register class should be handled or not. A few passes need some small changes to deal with leftover virtual registers. In the AMDGPU implementation, a new pass is introduced to take the place of PrologEpilogInserter for SGPR spills emitted during the first run. One disadvantage of this is currently StackSlotColoring is no longer used for SGPR spills. It would need to be run again, which will require more work. Error if the standard -regalloc option is used. Introduce new separate -sgpr-regalloc and -vgpr-regalloc flags, so the two runs can be controlled individually. PBQB is not currently supported, so this also prevents using the unhandled allocator.
2018-09-27 07:36:28 +08:00
RegAllocBase::enqueue(&LI);
}
void RAGreedy::LRE_DidCloneVirtReg(Register New, Register Old) {
ExtraInfo->LRE_DidCloneVirtReg(New, Old);
}
void RAGreedy::ExtraRegInfo::LRE_DidCloneVirtReg(Register New, Register Old) {
// Cloning a register we haven't even heard about yet? Just ignore it.
if (!Info.inBounds(Old))
return;
// LRE may clone a virtual register because dead code elimination causes it to
// be split into connected components. The new components are much smaller
// than the original, so they should get a new chance at being assigned.
// same stage as the parent.
Info[Old].Stage = RS_Assign;
Info.grow(New.id());
Info[New] = Info[Old];
}
void RAGreedy::releaseMemory() {
SpillerInstance.reset();
GlobalCand.clear();
}
void RAGreedy::enqueueImpl(const LiveInterval *LI) { enqueue(Queue, LI); }
void RAGreedy::enqueue(PQueue &CurQueue, const LiveInterval *LI) {
// Prioritize live ranges by size, assigning larger ranges first.
// The queue holds (size, reg) pairs.
const unsigned Size = LI->getSize();
const Register Reg = LI->reg();
assert(Reg.isVirtual() && "Can only enqueue virtual registers");
unsigned Prio;
auto Stage = ExtraInfo->getOrInitStage(Reg);
if (Stage == RS_New) {
Stage = RS_Assign;
ExtraInfo->setStage(Reg, Stage);
}
if (Stage == RS_Split) {
// Unsplit ranges that couldn't be allocated immediately are deferred until
// everything else has been allocated.
Prio = Size;
} else if (Stage == RS_Memory) {
// Memory operand should be considered last.
// Change the priority such that Memory operand are assigned in
// the reverse order that they came in.
// TODO: Make this a member variable and probably do something about hints.
static unsigned MemOp = 0;
Prio = MemOp++;
} else {
// Giant live ranges fall back to the global assignment heuristic, which
// prevents excessive spilling in pathological cases.
bool ReverseLocal = TRI->reverseLocalAssignment();
const TargetRegisterClass &RC = *MRI->getRegClass(Reg);
bool ForceGlobal =
!ReverseLocal && (Size / SlotIndex::InstrDist) >
(2 * RegClassInfo.getNumAllocatableRegs(&RC));
unsigned GlobalBit = 0;
if (Stage == RS_Assign && !ForceGlobal && !LI->empty() &&
Allocate local registers in order for optimal coloring. Also avoid locals evicting locals just because they want a cheaper register. Problem: MI Sched knows exactly how many registers we have and assumes they can be colored. In cases where we have large blocks, usually from unrolled loops, greedy coloring fails. This is a source of "regressions" from the MI Scheduler on x86. I noticed this issue on x86 where we have long chains of two-address defs in the same live range. It's easy to see this in matrix multiplication benchmarks like IRSmk and even the unit test misched-matmul.ll. A fundamental difference between the LLVM register allocator and conventional graph coloring is that in our model a live range can't discover its neighbors, it can only verify its neighbors. That's why we initially went for greedy coloring and added eviction to deal with the hard cases. However, for singly defined and two-address live ranges, we can optimally color without visiting neighbors simply by processing the live ranges in instruction order. Other beneficial side effects: It is much easier to understand and debug regalloc for large blocks when the live ranges are allocated in order. Yes, global allocation is still very confusing, but it's nice to be able to comprehend what happened locally. Heuristics could be added to bias register assignment based on instruction locality (think late register pairing, banks...). Intuituvely this will make some test cases that are on the threshold of register pressure more stable. llvm-svn: 187139
2013-07-26 02:35:14 +08:00
LIS->intervalIsInOneMBB(*LI)) {
// Allocate original local ranges in linear instruction order. Since they
// are singly defined, this produces optimal coloring in the absence of
// global interference and other constraints.
if (!ReverseLocal)
Prio = LI->beginIndex().getInstrDistance(Indexes->getLastIndex());
else {
// Allocating bottom up may allow many short LRGs to be assigned first
// to one of the cheap registers. This could be much faster for very
// large blocks on targets with many physical registers.
Prio = Indexes->getZeroIndex().getInstrDistance(LI->endIndex());
}
} else {
Allocate local registers in order for optimal coloring. Also avoid locals evicting locals just because they want a cheaper register. Problem: MI Sched knows exactly how many registers we have and assumes they can be colored. In cases where we have large blocks, usually from unrolled loops, greedy coloring fails. This is a source of "regressions" from the MI Scheduler on x86. I noticed this issue on x86 where we have long chains of two-address defs in the same live range. It's easy to see this in matrix multiplication benchmarks like IRSmk and even the unit test misched-matmul.ll. A fundamental difference between the LLVM register allocator and conventional graph coloring is that in our model a live range can't discover its neighbors, it can only verify its neighbors. That's why we initially went for greedy coloring and added eviction to deal with the hard cases. However, for singly defined and two-address live ranges, we can optimally color without visiting neighbors simply by processing the live ranges in instruction order. Other beneficial side effects: It is much easier to understand and debug regalloc for large blocks when the live ranges are allocated in order. Yes, global allocation is still very confusing, but it's nice to be able to comprehend what happened locally. Heuristics could be added to bias register assignment based on instruction locality (think late register pairing, banks...). Intuituvely this will make some test cases that are on the threshold of register pressure more stable. llvm-svn: 187139
2013-07-26 02:35:14 +08:00
// Allocate global and split ranges in long->short order. Long ranges that
// don't fit should be spilled (or split) ASAP so they don't create
// interference. Mark a bit to prioritize global above local ranges.
Prio = Size;
GlobalBit = 1;
Allocate local registers in order for optimal coloring. Also avoid locals evicting locals just because they want a cheaper register. Problem: MI Sched knows exactly how many registers we have and assumes they can be colored. In cases where we have large blocks, usually from unrolled loops, greedy coloring fails. This is a source of "regressions" from the MI Scheduler on x86. I noticed this issue on x86 where we have long chains of two-address defs in the same live range. It's easy to see this in matrix multiplication benchmarks like IRSmk and even the unit test misched-matmul.ll. A fundamental difference between the LLVM register allocator and conventional graph coloring is that in our model a live range can't discover its neighbors, it can only verify its neighbors. That's why we initially went for greedy coloring and added eviction to deal with the hard cases. However, for singly defined and two-address live ranges, we can optimally color without visiting neighbors simply by processing the live ranges in instruction order. Other beneficial side effects: It is much easier to understand and debug regalloc for large blocks when the live ranges are allocated in order. Yes, global allocation is still very confusing, but it's nice to be able to comprehend what happened locally. Heuristics could be added to bias register assignment based on instruction locality (think late register pairing, banks...). Intuituvely this will make some test cases that are on the threshold of register pressure more stable. llvm-svn: 187139
2013-07-26 02:35:14 +08:00
}
if (RegClassPriorityTrumpsGlobalness)
Prio |= RC.AllocationPriority << 25 | GlobalBit << 24;
else
Prio |= GlobalBit << 29 | RC.AllocationPriority << 24;
Allocate local registers in order for optimal coloring. Also avoid locals evicting locals just because they want a cheaper register. Problem: MI Sched knows exactly how many registers we have and assumes they can be colored. In cases where we have large blocks, usually from unrolled loops, greedy coloring fails. This is a source of "regressions" from the MI Scheduler on x86. I noticed this issue on x86 where we have long chains of two-address defs in the same live range. It's easy to see this in matrix multiplication benchmarks like IRSmk and even the unit test misched-matmul.ll. A fundamental difference between the LLVM register allocator and conventional graph coloring is that in our model a live range can't discover its neighbors, it can only verify its neighbors. That's why we initially went for greedy coloring and added eviction to deal with the hard cases. However, for singly defined and two-address live ranges, we can optimally color without visiting neighbors simply by processing the live ranges in instruction order. Other beneficial side effects: It is much easier to understand and debug regalloc for large blocks when the live ranges are allocated in order. Yes, global allocation is still very confusing, but it's nice to be able to comprehend what happened locally. Heuristics could be added to bias register assignment based on instruction locality (think late register pairing, banks...). Intuituvely this will make some test cases that are on the threshold of register pressure more stable. llvm-svn: 187139
2013-07-26 02:35:14 +08:00
// Mark a higher bit to prioritize global and local above RS_Split.
Prio |= (1u << 31);
// Boost ranges that have a physical register hint.
if (VRM->hasKnownPreference(Reg))
Prio |= (1u << 30);
}
// The virtual register number is a tie breaker for same-sized ranges.
// Give lower vreg numbers higher priority to assign them first.
CurQueue.push(std::make_pair(Prio, ~Reg));
}
const LiveInterval *RAGreedy::dequeue() { return dequeue(Queue); }
const LiveInterval *RAGreedy::dequeue(PQueue &CurQueue) {
if (CurQueue.empty())
return nullptr;
LiveInterval *LI = &LIS->getInterval(~CurQueue.top().second);
CurQueue.pop();
return LI;
}
//===----------------------------------------------------------------------===//
// Direct Assignment
//===----------------------------------------------------------------------===//
/// tryAssign - Try to assign VirtReg to an available register.
MCRegister RAGreedy::tryAssign(const LiveInterval &VirtReg,
AllocationOrder &Order,
SmallVectorImpl<Register> &NewVRegs,
const SmallVirtRegSet &FixedRegisters) {
MCRegister PhysReg;
for (auto I = Order.begin(), E = Order.end(); I != E && !PhysReg; ++I) {
assert(*I);
if (!Matrix->checkInterference(VirtReg, *I)) {
if (I.isHint())
return *I;
else
PhysReg = *I;
}
}
if (!PhysReg.isValid())
return PhysReg;
// PhysReg is available, but there may be a better choice.
// If we missed a simple hint, try to cheaply evict interference from the
// preferred register.
if (Register Hint = MRI->getSimpleHint(VirtReg.reg()))
if (Order.isHint(Hint)) {
MCRegister PhysHint = Hint.asMCReg();
LLVM_DEBUG(dbgs() << "missed hint " << printReg(PhysHint, TRI) << '\n');
if (EvictAdvisor->canEvictHintInterference(VirtReg, PhysHint,
FixedRegisters)) {
evictInterference(VirtReg, PhysHint, NewVRegs);
return PhysHint;
}
// Record the missed hint, we may be able to recover
// at the end if the surrounding allocation changed.
SetOfBrokenHints.insert(&VirtReg);
}
// Try to evict interference from a cheaper alternative.
uint8_t Cost = RegCosts[PhysReg];
// Most registers have 0 additional cost.
if (!Cost)
return PhysReg;
LLVM_DEBUG(dbgs() << printReg(PhysReg, TRI) << " is available at cost "
<< (unsigned)Cost << '\n');
MCRegister CheapReg = tryEvict(VirtReg, Order, NewVRegs, Cost, FixedRegisters);
return CheapReg ? CheapReg : PhysReg;
}
//===----------------------------------------------------------------------===//
// Interference eviction
//===----------------------------------------------------------------------===//
Register RegAllocEvictionAdvisor::canReassign(const LiveInterval &VirtReg,
Register PrevReg) const {
auto Order =
AllocationOrder::create(VirtReg.reg(), *VRM, RegClassInfo, Matrix);
MCRegister PhysReg;
for (auto I = Order.begin(), E = Order.end(); I != E && !PhysReg; ++I) {
if ((*I).id() == PrevReg.id())
continue;
MCRegUnitIterator Units(*I, TRI);
for (; Units.isValid(); ++Units) {
// Instantiate a "subquery", not to be confused with the Queries array.
LiveIntervalUnion::Query subQ(VirtReg, Matrix->getLiveUnions()[*Units]);
if (subQ.checkInterference())
break;
}
// If no units have interference, break out with the current PhysReg.
if (!Units.isValid())
PhysReg = *I;
}
if (PhysReg)
LLVM_DEBUG(dbgs() << "can reassign: " << VirtReg << " from "
<< printReg(PrevReg, TRI) << " to "
<< printReg(PhysReg, TRI) << '\n');
return PhysReg;
}
/// evictInterference - Evict any interferring registers that prevent VirtReg
/// from being assigned to Physreg. This assumes that canEvictInterference
/// returned true.
void RAGreedy::evictInterference(const LiveInterval &VirtReg,
MCRegister PhysReg,
2020-06-30 23:57:24 +08:00
SmallVectorImpl<Register> &NewVRegs) {
// Make sure that VirtReg has a cascade number, and assign that cascade
// number to every evicted register. These live ranges than then only be
// evicted by a newer cascade, preventing infinite loops.
unsigned Cascade = ExtraInfo->getOrAssignNewCascade(VirtReg.reg());
LLVM_DEBUG(dbgs() << "evicting " << printReg(PhysReg, TRI)
<< " interference: Cascade " << Cascade << '\n');
// Collect all interfering virtregs first.
SmallVector<const LiveInterval *, 8> Intfs;
for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units);
// We usually have the interfering VRegs cached so collectInterferingVRegs()
// should be fast, we may need to recalculate if when different physregs
// overlap the same register unit so we had different SubRanges queried
// against it.
ArrayRef<const LiveInterval *> IVR = Q.interferingVRegs();
Intfs.append(IVR.begin(), IVR.end());
}
// Evict them second. This will invalidate the queries.
for (const LiveInterval *Intf : Intfs) {
// The same VirtReg may be present in multiple RegUnits. Skip duplicates.
if (!VRM->hasPhys(Intf->reg()))
continue;
Add logic to greedy reg alloc to avoid bad eviction chains This fixes bugzilla 26810 https://bugs.llvm.org/show_bug.cgi?id=26810 This is intended to prevent sequences like: movl %ebp, 8(%esp) # 4-byte Spill movl %ecx, %ebp movl %ebx, %ecx movl %edi, %ebx movl %edx, %edi cltd idivl %esi movl %edi, %edx movl %ebx, %edi movl %ecx, %ebx movl %ebp, %ecx movl 16(%esp), %ebp # 4 - byte Reload Such sequences are created in 2 scenarios: Scenario #1: vreg0 is evicted from physreg0 by vreg1 Evictee vreg0 is intended for region splitting with split candidate physreg0 (the reg vreg0 was evicted from) Region splitting creates a local interval because of interference with the evictor vreg1 (normally region spliiting creates 2 interval, the "by reg" and "by stack" intervals. Local interval created when interference occurs.) one of the split intervals ends up evicting vreg2 from physreg1 Evictee vreg2 is intended for region splitting with split candidate physreg1 one of the split intervals ends up evicting vreg3 from physreg2 etc.. until someone spills Scenario #2 vreg0 is evicted from physreg0 by vreg1 vreg2 is evicted from physreg2 by vreg3 etc Evictee vreg0 is intended for region splitting with split candidate physreg1 Region splitting creates a local interval because of interference with the evictor vreg1 one of the split intervals ends up evicting back original evictor vreg1 from physreg0 (the reg vreg0 was evicted from) Another evictee vreg2 is intended for region splitting with split candidate physreg1 one of the split intervals ends up evicting vreg3 from physreg2 etc.. until someone spills As compile time was a concern, I've added a flag to control weather we do cost calculations for local intervals we expect to be created (it's on by default for X86 target, off for the rest). Differential Revision: https://reviews.llvm.org/D35816 Change-Id: Id9411ff7bbb845463d289ba2ae97737a1ee7cc39 llvm-svn: 316295
2017-10-23 01:59:38 +08:00
Matrix->unassign(*Intf);
assert((ExtraInfo->getCascade(Intf->reg()) < Cascade ||
VirtReg.isSpillable() < Intf->isSpillable()) &&
"Cannot decrease cascade number, illegal eviction");
ExtraInfo->setCascade(Intf->reg(), Cascade);
++NumEvicted;
NewVRegs.push_back(Intf->reg());
}
}
/// Returns true if the given \p PhysReg is a callee saved register and has not
/// been used for allocation yet.
bool RegAllocEvictionAdvisor::isUnusedCalleeSavedReg(MCRegister PhysReg) const {
2020-06-30 23:57:24 +08:00
MCRegister CSR = RegClassInfo.getLastCalleeSavedAlias(PhysReg);
if (!CSR)
return false;
return !Matrix->isPhysRegUsed(PhysReg);
}
Optional<unsigned>
RegAllocEvictionAdvisor::getOrderLimit(const LiveInterval &VirtReg,
const AllocationOrder &Order,
unsigned CostPerUseLimit) const {
unsigned OrderLimit = Order.getOrder().size();
if (CostPerUseLimit < uint8_t(~0u)) {
// Check of any registers in RC are below CostPerUseLimit.
const TargetRegisterClass *RC = MRI->getRegClass(VirtReg.reg());
uint8_t MinCost = RegClassInfo.getMinCost(RC);
if (MinCost >= CostPerUseLimit) {
LLVM_DEBUG(dbgs() << TRI->getRegClassName(RC) << " minimum cost = "
<< MinCost << ", no cheaper registers to be found.\n");
return None;
}
// It is normal for register classes to have a long tail of registers with
// the same cost. We don't need to look at them if they're too expensive.
if (RegCosts[Order.getOrder().back()] >= CostPerUseLimit) {
OrderLimit = RegClassInfo.getLastCostChange(RC);
LLVM_DEBUG(dbgs() << "Only trying the first " << OrderLimit
<< " regs.\n");
}
}
return OrderLimit;
}
bool RegAllocEvictionAdvisor::canAllocatePhysReg(unsigned CostPerUseLimit,
MCRegister PhysReg) const {
if (RegCosts[PhysReg] >= CostPerUseLimit)
return false;
// The first use of a callee-saved register in a function has cost 1.
// Don't start using a CSR when the CostPerUseLimit is low.
if (CostPerUseLimit == 1 && isUnusedCalleeSavedReg(PhysReg)) {
LLVM_DEBUG(
dbgs() << printReg(PhysReg, TRI) << " would clobber CSR "
<< printReg(RegClassInfo.getLastCalleeSavedAlias(PhysReg), TRI)
<< '\n');
return false;
}
return true;
}
/// tryEvict - Try to evict all interferences for a physreg.
/// @param VirtReg Currently unassigned virtual register.
/// @param Order Physregs to try.
/// @return Physreg to assign VirtReg, or 0.
MCRegister RAGreedy::tryEvict(const LiveInterval &VirtReg,
AllocationOrder &Order,
SmallVectorImpl<Register> &NewVRegs,
uint8_t CostPerUseLimit,
const SmallVirtRegSet &FixedRegisters) {
NamedRegionTimer T("evict", "Evict", TimerGroupName, TimerGroupDescription,
TimePassesIsEnabled);
MCRegister BestPhys = EvictAdvisor->tryFindEvictionCandidate(
VirtReg, Order, CostPerUseLimit, FixedRegisters);
if (BestPhys.isValid())
evictInterference(VirtReg, BestPhys, NewVRegs);
return BestPhys;
}
//===----------------------------------------------------------------------===//
// Region Splitting
//===----------------------------------------------------------------------===//
/// addSplitConstraints - Fill out the SplitConstraints vector based on the
/// interference pattern in Physreg and its aliases. Add the constraints to
/// SpillPlacement and return the static cost of this split in Cost, assuming
/// that all preferences in SplitConstraints are met.
/// Return false if there are no bundles with positive bias.
bool RAGreedy::addSplitConstraints(InterferenceCache::Cursor Intf,
BlockFrequency &Cost) {
ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
// Reset interference dependent info.
SplitConstraints.resize(UseBlocks.size());
BlockFrequency StaticCost = 0;
for (unsigned I = 0; I != UseBlocks.size(); ++I) {
const SplitAnalysis::BlockInfo &BI = UseBlocks[I];
SpillPlacement::BlockConstraint &BC = SplitConstraints[I];
BC.Number = BI.MBB->getNumber();
Intf.moveToBlock(BC.Number);
BC.Entry = BI.LiveIn ? SpillPlacement::PrefReg : SpillPlacement::DontCare;
BC.Exit = (BI.LiveOut &&
!LIS->getInstructionFromIndex(BI.LastInstr)->isImplicitDef())
? SpillPlacement::PrefReg
: SpillPlacement::DontCare;
BC.ChangesValue = BI.FirstDef.isValid();
if (!Intf.hasInterference())
continue;
// Number of spill code instructions to insert.
unsigned Ins = 0;
// Interference for the live-in value.
if (BI.LiveIn) {
if (Intf.first() <= Indexes->getMBBStartIdx(BC.Number)) {
BC.Entry = SpillPlacement::MustSpill;
++Ins;
} else if (Intf.first() < BI.FirstInstr) {
BC.Entry = SpillPlacement::PrefSpill;
++Ins;
} else if (Intf.first() < BI.LastInstr) {
++Ins;
}
// Abort if the spill cannot be inserted at the MBB' start
if (((BC.Entry == SpillPlacement::MustSpill) ||
(BC.Entry == SpillPlacement::PrefSpill)) &&
SlotIndex::isEarlierInstr(BI.FirstInstr,
SA->getFirstSplitPoint(BC.Number)))
return false;
}
// Interference for the live-out value.
if (BI.LiveOut) {
if (Intf.last() >= SA->getLastSplitPoint(BC.Number)) {
BC.Exit = SpillPlacement::MustSpill;
++Ins;
} else if (Intf.last() > BI.LastInstr) {
BC.Exit = SpillPlacement::PrefSpill;
++Ins;
} else if (Intf.last() > BI.FirstInstr) {
++Ins;
}
}
// Accumulate the total frequency of inserted spill code.
while (Ins--)
StaticCost += SpillPlacer->getBlockFrequency(BC.Number);
}
Cost = StaticCost;
// Add constraints for use-blocks. Note that these are the only constraints
// that may add a positive bias, it is downhill from here.
SpillPlacer->addConstraints(SplitConstraints);
return SpillPlacer->scanActiveBundles();
}
/// addThroughConstraints - Add constraints and links to SpillPlacer from the
/// live-through blocks in Blocks.
bool RAGreedy::addThroughConstraints(InterferenceCache::Cursor Intf,
ArrayRef<unsigned> Blocks) {
const unsigned GroupSize = 8;
SpillPlacement::BlockConstraint BCS[GroupSize];
unsigned TBS[GroupSize];
unsigned B = 0, T = 0;
for (unsigned Number : Blocks) {
Intf.moveToBlock(Number);
if (!Intf.hasInterference()) {
assert(T < GroupSize && "Array overflow");
TBS[T] = Number;
if (++T == GroupSize) {
SpillPlacer->addLinks(makeArrayRef(TBS, T));
T = 0;
}
continue;
}
assert(B < GroupSize && "Array overflow");
BCS[B].Number = Number;
// Abort if the spill cannot be inserted at the MBB' start
MachineBasicBlock *MBB = MF->getBlockNumbered(Number);
auto FirstNonDebugInstr = MBB->getFirstNonDebugInstr();
if (FirstNonDebugInstr != MBB->end() &&
SlotIndex::isEarlierInstr(LIS->getInstructionIndex(*FirstNonDebugInstr),
SA->getFirstSplitPoint(Number)))
return false;
// Interference for the live-in value.
if (Intf.first() <= Indexes->getMBBStartIdx(Number))
BCS[B].Entry = SpillPlacement::MustSpill;
else
BCS[B].Entry = SpillPlacement::PrefSpill;
// Interference for the live-out value.
if (Intf.last() >= SA->getLastSplitPoint(Number))
BCS[B].Exit = SpillPlacement::MustSpill;
else
BCS[B].Exit = SpillPlacement::PrefSpill;
if (++B == GroupSize) {
SpillPlacer->addConstraints(makeArrayRef(BCS, B));
B = 0;
}
}
SpillPlacer->addConstraints(makeArrayRef(BCS, B));
SpillPlacer->addLinks(makeArrayRef(TBS, T));
return true;
}
bool RAGreedy::growRegion(GlobalSplitCandidate &Cand) {
// Keep track of through blocks that have not been added to SpillPlacer.
BitVector Todo = SA->getThroughBlocks();
SmallVectorImpl<unsigned> &ActiveBlocks = Cand.ActiveBlocks;
unsigned AddedTo = 0;
#ifndef NDEBUG
unsigned Visited = 0;
#endif
unsigned long Budget = GrowRegionComplexityBudget;
while (true) {
ArrayRef<unsigned> NewBundles = SpillPlacer->getRecentPositive();
// Find new through blocks in the periphery of PrefRegBundles.
for (unsigned Bundle : NewBundles) {
// Look at all blocks connected to Bundle in the full graph.
ArrayRef<unsigned> Blocks = Bundles->getBlocks(Bundle);
// Limit compilation time by bailing out after we use all our budget.
if (Blocks.size() >= Budget)
return false;
Budget -= Blocks.size();
for (unsigned Block : Blocks) {
if (!Todo.test(Block))
continue;
Todo.reset(Block);
// This is a new through block. Add it to SpillPlacer later.
ActiveBlocks.push_back(Block);
#ifndef NDEBUG
++Visited;
#endif
}
}
// Any new blocks to add?
if (ActiveBlocks.size() == AddedTo)
break;
// Compute through constraints from the interference, or assume that all
// through blocks prefer spilling when forming compact regions.
auto NewBlocks = makeArrayRef(ActiveBlocks).slice(AddedTo);
if (Cand.PhysReg) {
if (!addThroughConstraints(Cand.Intf, NewBlocks))
return false;
} else
// Provide a strong negative bias on through blocks to prevent unwanted
// liveness on loop backedges.
SpillPlacer->addPrefSpill(NewBlocks, /* Strong= */ true);
AddedTo = ActiveBlocks.size();
// Perhaps iterating can enable more bundles?
SpillPlacer->iterate();
}
LLVM_DEBUG(dbgs() << ", v=" << Visited);
return true;
}
/// calcCompactRegion - Compute the set of edge bundles that should be live
/// when splitting the current live range into compact regions. Compact
/// regions can be computed without looking at interference. They are the
/// regions formed by removing all the live-through blocks from the live range.
///
/// Returns false if the current live range is already compact, or if the
/// compact regions would form single block regions anyway.
bool RAGreedy::calcCompactRegion(GlobalSplitCandidate &Cand) {
// Without any through blocks, the live range is already compact.
if (!SA->getNumThroughBlocks())
return false;
// Compact regions don't correspond to any physreg.
Cand.reset(IntfCache, MCRegister::NoRegister);
LLVM_DEBUG(dbgs() << "Compact region bundles");
// Use the spill placer to determine the live bundles. GrowRegion pretends
// that all the through blocks have interference when PhysReg is unset.
SpillPlacer->prepare(Cand.LiveBundles);
// The static split cost will be zero since Cand.Intf reports no interference.
BlockFrequency Cost;
if (!addSplitConstraints(Cand.Intf, Cost)) {
LLVM_DEBUG(dbgs() << ", none.\n");
return false;
}
if (!growRegion(Cand)) {
LLVM_DEBUG(dbgs() << ", cannot spill all interferences.\n");
return false;
}
SpillPlacer->finish();
if (!Cand.LiveBundles.any()) {
LLVM_DEBUG(dbgs() << ", none.\n");
return false;
}
LLVM_DEBUG({
for (int I : Cand.LiveBundles.set_bits())
dbgs() << " EB#" << I;
dbgs() << ".\n";
});
return true;
}
/// calcSpillCost - Compute how expensive it would be to split the live range in
/// SA around all use blocks instead of forming bundle regions.
BlockFrequency RAGreedy::calcSpillCost() {
BlockFrequency Cost = 0;
ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
for (const SplitAnalysis::BlockInfo &BI : UseBlocks) {
unsigned Number = BI.MBB->getNumber();
// We normally only need one spill instruction - a load or a store.
Cost += SpillPlacer->getBlockFrequency(Number);
// Unless the value is redefined in the block.
if (BI.LiveIn && BI.LiveOut && BI.FirstDef)
Cost += SpillPlacer->getBlockFrequency(Number);
}
return Cost;
}
/// calcGlobalSplitCost - Return the global split cost of following the split
/// pattern in LiveBundles. This cost should be added to the local cost of the
/// interference pattern in SplitConstraints.
///
Add logic to greedy reg alloc to avoid bad eviction chains This fixes bugzilla 26810 https://bugs.llvm.org/show_bug.cgi?id=26810 This is intended to prevent sequences like: movl %ebp, 8(%esp) # 4-byte Spill movl %ecx, %ebp movl %ebx, %ecx movl %edi, %ebx movl %edx, %edi cltd idivl %esi movl %edi, %edx movl %ebx, %edi movl %ecx, %ebx movl %ebp, %ecx movl 16(%esp), %ebp # 4 - byte Reload Such sequences are created in 2 scenarios: Scenario #1: vreg0 is evicted from physreg0 by vreg1 Evictee vreg0 is intended for region splitting with split candidate physreg0 (the reg vreg0 was evicted from) Region splitting creates a local interval because of interference with the evictor vreg1 (normally region spliiting creates 2 interval, the "by reg" and "by stack" intervals. Local interval created when interference occurs.) one of the split intervals ends up evicting vreg2 from physreg1 Evictee vreg2 is intended for region splitting with split candidate physreg1 one of the split intervals ends up evicting vreg3 from physreg2 etc.. until someone spills Scenario #2 vreg0 is evicted from physreg0 by vreg1 vreg2 is evicted from physreg2 by vreg3 etc Evictee vreg0 is intended for region splitting with split candidate physreg1 Region splitting creates a local interval because of interference with the evictor vreg1 one of the split intervals ends up evicting back original evictor vreg1 from physreg0 (the reg vreg0 was evicted from) Another evictee vreg2 is intended for region splitting with split candidate physreg1 one of the split intervals ends up evicting vreg3 from physreg2 etc.. until someone spills As compile time was a concern, I've added a flag to control weather we do cost calculations for local intervals we expect to be created (it's on by default for X86 target, off for the rest). Differential Revision: https://reviews.llvm.org/D35816 Change-Id: Id9411ff7bbb845463d289ba2ae97737a1ee7cc39 llvm-svn: 316295
2017-10-23 01:59:38 +08:00
BlockFrequency RAGreedy::calcGlobalSplitCost(GlobalSplitCandidate &Cand,
const AllocationOrder &Order) {
BlockFrequency GlobalCost = 0;
const BitVector &LiveBundles = Cand.LiveBundles;
ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
for (unsigned I = 0; I != UseBlocks.size(); ++I) {
const SplitAnalysis::BlockInfo &BI = UseBlocks[I];
SpillPlacement::BlockConstraint &BC = SplitConstraints[I];
bool RegIn = LiveBundles[Bundles->getBundle(BC.Number, false)];
bool RegOut = LiveBundles[Bundles->getBundle(BC.Number, true)];
unsigned Ins = 0;
Add logic to greedy reg alloc to avoid bad eviction chains This fixes bugzilla 26810 https://bugs.llvm.org/show_bug.cgi?id=26810 This is intended to prevent sequences like: movl %ebp, 8(%esp) # 4-byte Spill movl %ecx, %ebp movl %ebx, %ecx movl %edi, %ebx movl %edx, %edi cltd idivl %esi movl %edi, %edx movl %ebx, %edi movl %ecx, %ebx movl %ebp, %ecx movl 16(%esp), %ebp # 4 - byte Reload Such sequences are created in 2 scenarios: Scenario #1: vreg0 is evicted from physreg0 by vreg1 Evictee vreg0 is intended for region splitting with split candidate physreg0 (the reg vreg0 was evicted from) Region splitting creates a local interval because of interference with the evictor vreg1 (normally region spliiting creates 2 interval, the "by reg" and "by stack" intervals. Local interval created when interference occurs.) one of the split intervals ends up evicting vreg2 from physreg1 Evictee vreg2 is intended for region splitting with split candidate physreg1 one of the split intervals ends up evicting vreg3 from physreg2 etc.. until someone spills Scenario #2 vreg0 is evicted from physreg0 by vreg1 vreg2 is evicted from physreg2 by vreg3 etc Evictee vreg0 is intended for region splitting with split candidate physreg1 Region splitting creates a local interval because of interference with the evictor vreg1 one of the split intervals ends up evicting back original evictor vreg1 from physreg0 (the reg vreg0 was evicted from) Another evictee vreg2 is intended for region splitting with split candidate physreg1 one of the split intervals ends up evicting vreg3 from physreg2 etc.. until someone spills As compile time was a concern, I've added a flag to control weather we do cost calculations for local intervals we expect to be created (it's on by default for X86 target, off for the rest). Differential Revision: https://reviews.llvm.org/D35816 Change-Id: Id9411ff7bbb845463d289ba2ae97737a1ee7cc39 llvm-svn: 316295
2017-10-23 01:59:38 +08:00
Cand.Intf.moveToBlock(BC.Number);
if (BI.LiveIn)
Ins += RegIn != (BC.Entry == SpillPlacement::PrefReg);
if (BI.LiveOut)
Ins += RegOut != (BC.Exit == SpillPlacement::PrefReg);
while (Ins--)
GlobalCost += SpillPlacer->getBlockFrequency(BC.Number);
}
for (unsigned Number : Cand.ActiveBlocks) {
bool RegIn = LiveBundles[Bundles->getBundle(Number, false)];
bool RegOut = LiveBundles[Bundles->getBundle(Number, true)];
if (!RegIn && !RegOut)
continue;
if (RegIn && RegOut) {
// We need double spill code if this block has interference.
Cand.Intf.moveToBlock(Number);
if (Cand.Intf.hasInterference()) {
GlobalCost += SpillPlacer->getBlockFrequency(Number);
GlobalCost += SpillPlacer->getBlockFrequency(Number);
}
continue;
}
// live-in / stack-out or stack-in live-out.
GlobalCost += SpillPlacer->getBlockFrequency(Number);
}
return GlobalCost;
}
/// splitAroundRegion - Split the current live range around the regions
/// determined by BundleCand and GlobalCand.
///
/// Before calling this function, GlobalCand and BundleCand must be initialized
/// so each bundle is assigned to a valid candidate, or NoCand for the
/// stack-bound bundles. The shared SA/SE SplitAnalysis and SplitEditor
/// objects must be initialized for the current live range, and intervals
/// created for the used candidates.
///
/// @param LREdit The LiveRangeEdit object handling the current split.
/// @param UsedCands List of used GlobalCand entries. Every BundleCand value
/// must appear in this list.
void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit,
ArrayRef<unsigned> UsedCands) {
// These are the intervals created for new global ranges. We may create more
// intervals for local ranges.
const unsigned NumGlobalIntvs = LREdit.size();
LLVM_DEBUG(dbgs() << "splitAroundRegion with " << NumGlobalIntvs
<< " globals.\n");
assert(NumGlobalIntvs && "No global intervals configured");
// Isolate even single instructions when dealing with a proper sub-class.
// That guarantees register class inflation for the stack interval because it
// is all copies.
Register Reg = SA->getParent().reg();
bool SingleInstrs = RegClassInfo.isProperSubClass(MRI->getRegClass(Reg));
Reapply r134047 now that the world is ready for it. This patch will sometimes choose live range split points next to interference instead of always splitting next to a register point. That means spill code can now appear almost anywhere, and it was necessary to fix code that didn't expect that. The difficult places were: - Between a CALL returning a value on the x87 stack and the corresponding FpPOP_RETVAL (was FpGET_ST0). Probably also near x87 inline assembly, but that didn't actually show up in testing. - Between a CALL popping arguments off the stack and the corresponding ADJCALLSTACKUP. Both are fixed now. The only place spill code can't appear is after terminators, see SplitAnalysis::getLastSplitPoint. Original commit message: Rewrite RAGreedy::splitAroundRegion, now with cool ASCII art. This function has to deal with a lot of special cases, and the old version got it wrong sometimes. In particular, it would sometimes leave multiple uses in the stack interval in a single block. That causes bad code with multiple reloads in the same basic block. The new version handles block entry and exit in a single pass. It first eliminates all the easy cases, and then goes on to create a local interval for the blocks with difficult interference. Previously, we would only create the local interval for completely isolated blocks. It can happen that the stack interval becomes completely empty because we could allocate a register in all edge bundles, and the new local intervals deal with the interference. The empty stack interval is harmless, but we need to remove a SplitKit assertion that checks for empty intervals. llvm-svn: 134125
2011-06-30 09:30:39 +08:00
// First handle all the blocks with uses.
ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
for (const SplitAnalysis::BlockInfo &BI : UseBlocks) {
unsigned Number = BI.MBB->getNumber();
unsigned IntvIn = 0, IntvOut = 0;
SlotIndex IntfIn, IntfOut;
if (BI.LiveIn) {
unsigned CandIn = BundleCand[Bundles->getBundle(Number, false)];
if (CandIn != NoCand) {
GlobalSplitCandidate &Cand = GlobalCand[CandIn];
IntvIn = Cand.IntvIdx;
Cand.Intf.moveToBlock(Number);
IntfIn = Cand.Intf.first();
}
}
if (BI.LiveOut) {
unsigned CandOut = BundleCand[Bundles->getBundle(Number, true)];
if (CandOut != NoCand) {
GlobalSplitCandidate &Cand = GlobalCand[CandOut];
IntvOut = Cand.IntvIdx;
Cand.Intf.moveToBlock(Number);
IntfOut = Cand.Intf.last();
}
}
// Create separate intervals for isolated blocks with multiple uses.
if (!IntvIn && !IntvOut) {
LLVM_DEBUG(dbgs() << printMBBReference(*BI.MBB) << " isolated.\n");
if (SA->shouldSplitSingleBlock(BI, SingleInstrs))
Reapply r134047 now that the world is ready for it. This patch will sometimes choose live range split points next to interference instead of always splitting next to a register point. That means spill code can now appear almost anywhere, and it was necessary to fix code that didn't expect that. The difficult places were: - Between a CALL returning a value on the x87 stack and the corresponding FpPOP_RETVAL (was FpGET_ST0). Probably also near x87 inline assembly, but that didn't actually show up in testing. - Between a CALL popping arguments off the stack and the corresponding ADJCALLSTACKUP. Both are fixed now. The only place spill code can't appear is after terminators, see SplitAnalysis::getLastSplitPoint. Original commit message: Rewrite RAGreedy::splitAroundRegion, now with cool ASCII art. This function has to deal with a lot of special cases, and the old version got it wrong sometimes. In particular, it would sometimes leave multiple uses in the stack interval in a single block. That causes bad code with multiple reloads in the same basic block. The new version handles block entry and exit in a single pass. It first eliminates all the easy cases, and then goes on to create a local interval for the blocks with difficult interference. Previously, we would only create the local interval for completely isolated blocks. It can happen that the stack interval becomes completely empty because we could allocate a register in all edge bundles, and the new local intervals deal with the interference. The empty stack interval is harmless, but we need to remove a SplitKit assertion that checks for empty intervals. llvm-svn: 134125
2011-06-30 09:30:39 +08:00
SE->splitSingleBlock(BI);
continue;
}
if (IntvIn && IntvOut)
SE->splitLiveThroughBlock(Number, IntvIn, IntfIn, IntvOut, IntfOut);
else if (IntvIn)
SE->splitRegInBlock(BI, IntvIn, IntfIn);
else
SE->splitRegOutBlock(BI, IntvOut, IntfOut);
}
// Handle live-through blocks. The relevant live-through blocks are stored in
// the ActiveBlocks list with each candidate. We need to filter out
// duplicates.
BitVector Todo = SA->getThroughBlocks();
for (unsigned UsedCand : UsedCands) {
ArrayRef<unsigned> Blocks = GlobalCand[UsedCand].ActiveBlocks;
for (unsigned Number : Blocks) {
if (!Todo.test(Number))
continue;
Todo.reset(Number);
unsigned IntvIn = 0, IntvOut = 0;
SlotIndex IntfIn, IntfOut;
unsigned CandIn = BundleCand[Bundles->getBundle(Number, false)];
if (CandIn != NoCand) {
GlobalSplitCandidate &Cand = GlobalCand[CandIn];
IntvIn = Cand.IntvIdx;
Cand.Intf.moveToBlock(Number);
IntfIn = Cand.Intf.first();
}
unsigned CandOut = BundleCand[Bundles->getBundle(Number, true)];
if (CandOut != NoCand) {
GlobalSplitCandidate &Cand = GlobalCand[CandOut];
IntvOut = Cand.IntvIdx;
Cand.Intf.moveToBlock(Number);
IntfOut = Cand.Intf.last();
}
if (!IntvIn && !IntvOut)
continue;
SE->splitLiveThroughBlock(Number, IntvIn, IntfIn, IntvOut, IntfOut);
}
}
++NumGlobalSplits;
SmallVector<unsigned, 8> IntvMap;
SE->finish(&IntvMap);
DebugVars->splitRegister(Reg, LREdit.regs(), *LIS);
unsigned OrigBlocks = SA->getNumLiveBlocks();
// Sort out the new intervals created by splitting. We get four kinds:
// - Remainder intervals should not be split again.
// - Candidate intervals can be assigned to Cand.PhysReg.
// - Block-local splits are candidates for local splitting.
// - DCE leftovers should go back on the queue.
for (unsigned I = 0, E = LREdit.size(); I != E; ++I) {
const LiveInterval &Reg = LIS->getInterval(LREdit.get(I));
// Ignore old intervals from DCE.
if (ExtraInfo->getOrInitStage(Reg.reg()) != RS_New)
continue;
// Remainder interval. Don't try splitting again, spill if it doesn't
// allocate.
if (IntvMap[I] == 0) {
ExtraInfo->setStage(Reg, RS_Spill);
continue;
}
// Global intervals. Allow repeated splitting as long as the number of live
// blocks is strictly decreasing.
if (IntvMap[I] < NumGlobalIntvs) {
if (SA->countLiveBlocks(&Reg) >= OrigBlocks) {
LLVM_DEBUG(dbgs() << "Main interval covers the same " << OrigBlocks
<< " blocks as original.\n");
// Don't allow repeated splitting as a safe guard against looping.
ExtraInfo->setStage(Reg, RS_Split2);
}
continue;
}
// Other intervals are treated as new. This includes local intervals created
// for blocks with multiple uses, and anything created by DCE.
}
if (VerifyEnabled)
MF->verify(this, "After splitting live range around region");
}
MCRegister RAGreedy::tryRegionSplit(const LiveInterval &VirtReg,
AllocationOrder &Order,
SmallVectorImpl<Register> &NewVRegs) {
if (!TRI->shouldRegionSplitForVirtReg(*MF, VirtReg))
return MCRegister::NoRegister;
unsigned NumCands = 0;
Add logic to greedy reg alloc to avoid bad eviction chains This fixes bugzilla 26810 https://bugs.llvm.org/show_bug.cgi?id=26810 This is intended to prevent sequences like: movl %ebp, 8(%esp) # 4-byte Spill movl %ecx, %ebp movl %ebx, %ecx movl %edi, %ebx movl %edx, %edi cltd idivl %esi movl %edi, %edx movl %ebx, %edi movl %ecx, %ebx movl %ebp, %ecx movl 16(%esp), %ebp # 4 - byte Reload Such sequences are created in 2 scenarios: Scenario #1: vreg0 is evicted from physreg0 by vreg1 Evictee vreg0 is intended for region splitting with split candidate physreg0 (the reg vreg0 was evicted from) Region splitting creates a local interval because of interference with the evictor vreg1 (normally region spliiting creates 2 interval, the "by reg" and "by stack" intervals. Local interval created when interference occurs.) one of the split intervals ends up evicting vreg2 from physreg1 Evictee vreg2 is intended for region splitting with split candidate physreg1 one of the split intervals ends up evicting vreg3 from physreg2 etc.. until someone spills Scenario #2 vreg0 is evicted from physreg0 by vreg1 vreg2 is evicted from physreg2 by vreg3 etc Evictee vreg0 is intended for region splitting with split candidate physreg1 Region splitting creates a local interval because of interference with the evictor vreg1 one of the split intervals ends up evicting back original evictor vreg1 from physreg0 (the reg vreg0 was evicted from) Another evictee vreg2 is intended for region splitting with split candidate physreg1 one of the split intervals ends up evicting vreg3 from physreg2 etc.. until someone spills As compile time was a concern, I've added a flag to control weather we do cost calculations for local intervals we expect to be created (it's on by default for X86 target, off for the rest). Differential Revision: https://reviews.llvm.org/D35816 Change-Id: Id9411ff7bbb845463d289ba2ae97737a1ee7cc39 llvm-svn: 316295
2017-10-23 01:59:38 +08:00
BlockFrequency SpillCost = calcSpillCost();
BlockFrequency BestCost;
// Check if we can split this live range around a compact region.
bool HasCompact = calcCompactRegion(GlobalCand.front());
if (HasCompact) {
// Yes, keep GlobalCand[0] as the compact region candidate.
NumCands = 1;
BestCost = BlockFrequency::getMaxFrequency();
} else {
// No benefit from the compact region, our fallback will be per-block
// splitting. Make sure we find a solution that is cheaper than spilling.
Add logic to greedy reg alloc to avoid bad eviction chains This fixes bugzilla 26810 https://bugs.llvm.org/show_bug.cgi?id=26810 This is intended to prevent sequences like: movl %ebp, 8(%esp) # 4-byte Spill movl %ecx, %ebp movl %ebx, %ecx movl %edi, %ebx movl %edx, %edi cltd idivl %esi movl %edi, %edx movl %ebx, %edi movl %ecx, %ebx movl %ebp, %ecx movl 16(%esp), %ebp # 4 - byte Reload Such sequences are created in 2 scenarios: Scenario #1: vreg0 is evicted from physreg0 by vreg1 Evictee vreg0 is intended for region splitting with split candidate physreg0 (the reg vreg0 was evicted from) Region splitting creates a local interval because of interference with the evictor vreg1 (normally region spliiting creates 2 interval, the "by reg" and "by stack" intervals. Local interval created when interference occurs.) one of the split intervals ends up evicting vreg2 from physreg1 Evictee vreg2 is intended for region splitting with split candidate physreg1 one of the split intervals ends up evicting vreg3 from physreg2 etc.. until someone spills Scenario #2 vreg0 is evicted from physreg0 by vreg1 vreg2 is evicted from physreg2 by vreg3 etc Evictee vreg0 is intended for region splitting with split candidate physreg1 Region splitting creates a local interval because of interference with the evictor vreg1 one of the split intervals ends up evicting back original evictor vreg1 from physreg0 (the reg vreg0 was evicted from) Another evictee vreg2 is intended for region splitting with split candidate physreg1 one of the split intervals ends up evicting vreg3 from physreg2 etc.. until someone spills As compile time was a concern, I've added a flag to control weather we do cost calculations for local intervals we expect to be created (it's on by default for X86 target, off for the rest). Differential Revision: https://reviews.llvm.org/D35816 Change-Id: Id9411ff7bbb845463d289ba2ae97737a1ee7cc39 llvm-svn: 316295
2017-10-23 01:59:38 +08:00
BestCost = SpillCost;
LLVM_DEBUG(dbgs() << "Cost of isolating all blocks = ";
MBFI->printBlockFreq(dbgs(), BestCost) << '\n');
}
unsigned BestCand = calculateRegionSplitCost(VirtReg, Order, BestCost,
NumCands, false /*IgnoreCSR*/);
// No solutions found, fall back to single block splitting.
if (!HasCompact && BestCand == NoCand)
return MCRegister::NoRegister;
return doRegionSplit(VirtReg, BestCand, HasCompact, NewVRegs);
}
unsigned RAGreedy::calculateRegionSplitCost(const LiveInterval &VirtReg,
AllocationOrder &Order,
BlockFrequency &BestCost,
unsigned &NumCands,
bool IgnoreCSR) {
unsigned BestCand = NoCand;
for (MCPhysReg PhysReg : Order) {
assert(PhysReg);
if (IgnoreCSR && EvictAdvisor->isUnusedCalleeSavedReg(PhysReg))
continue;
// Discard bad candidates before we run out of interference cache cursors.
// This will only affect register classes with a lot of registers (>32).
if (NumCands == IntfCache.getMaxCursors()) {
unsigned WorstCount = ~0u;
unsigned Worst = 0;
for (unsigned CandIndex = 0; CandIndex != NumCands; ++CandIndex) {
if (CandIndex == BestCand || !GlobalCand[CandIndex].PhysReg)
continue;
unsigned Count = GlobalCand[CandIndex].LiveBundles.count();
if (Count < WorstCount) {
Worst = CandIndex;
WorstCount = Count;
}
}
--NumCands;
GlobalCand[Worst] = GlobalCand[NumCands];
if (BestCand == NumCands)
BestCand = Worst;
}
if (GlobalCand.size() <= NumCands)
GlobalCand.resize(NumCands+1);
GlobalSplitCandidate &Cand = GlobalCand[NumCands];
Cand.reset(IntfCache, PhysReg);
SpillPlacer->prepare(Cand.LiveBundles);
BlockFrequency Cost;
if (!addSplitConstraints(Cand.Intf, Cost)) {
LLVM_DEBUG(dbgs() << printReg(PhysReg, TRI) << "\tno positive bundles\n");
continue;
}
LLVM_DEBUG(dbgs() << printReg(PhysReg, TRI) << "\tstatic = ";
MBFI->printBlockFreq(dbgs(), Cost));
if (Cost >= BestCost) {
LLVM_DEBUG({
if (BestCand == NoCand)
dbgs() << " worse than no bundles\n";
else
dbgs() << " worse than "
<< printReg(GlobalCand[BestCand].PhysReg, TRI) << '\n';
});
continue;
}
if (!growRegion(Cand)) {
LLVM_DEBUG(dbgs() << ", cannot spill all interferences.\n");
continue;
}
SpillPlacer->finish();
// No live bundles, defer to splitSingleBlocks().
if (!Cand.LiveBundles.any()) {
LLVM_DEBUG(dbgs() << " no bundles.\n");
continue;
}
Cost += calcGlobalSplitCost(Cand, Order);
LLVM_DEBUG({
dbgs() << ", total = ";
MBFI->printBlockFreq(dbgs(), Cost) << " with bundles";
for (int I : Cand.LiveBundles.set_bits())
dbgs() << " EB#" << I;
dbgs() << ".\n";
});
if (Cost < BestCost) {
BestCand = NumCands;
BestCost = Cost;
}
++NumCands;
}
Add logic to greedy reg alloc to avoid bad eviction chains This fixes bugzilla 26810 https://bugs.llvm.org/show_bug.cgi?id=26810 This is intended to prevent sequences like: movl %ebp, 8(%esp) # 4-byte Spill movl %ecx, %ebp movl %ebx, %ecx movl %edi, %ebx movl %edx, %edi cltd idivl %esi movl %edi, %edx movl %ebx, %edi movl %ecx, %ebx movl %ebp, %ecx movl 16(%esp), %ebp # 4 - byte Reload Such sequences are created in 2 scenarios: Scenario #1: vreg0 is evicted from physreg0 by vreg1 Evictee vreg0 is intended for region splitting with split candidate physreg0 (the reg vreg0 was evicted from) Region splitting creates a local interval because of interference with the evictor vreg1 (normally region spliiting creates 2 interval, the "by reg" and "by stack" intervals. Local interval created when interference occurs.) one of the split intervals ends up evicting vreg2 from physreg1 Evictee vreg2 is intended for region splitting with split candidate physreg1 one of the split intervals ends up evicting vreg3 from physreg2 etc.. until someone spills Scenario #2 vreg0 is evicted from physreg0 by vreg1 vreg2 is evicted from physreg2 by vreg3 etc Evictee vreg0 is intended for region splitting with split candidate physreg1 Region splitting creates a local interval because of interference with the evictor vreg1 one of the split intervals ends up evicting back original evictor vreg1 from physreg0 (the reg vreg0 was evicted from) Another evictee vreg2 is intended for region splitting with split candidate physreg1 one of the split intervals ends up evicting vreg3 from physreg2 etc.. until someone spills As compile time was a concern, I've added a flag to control weather we do cost calculations for local intervals we expect to be created (it's on by default for X86 target, off for the rest). Differential Revision: https://reviews.llvm.org/D35816 Change-Id: Id9411ff7bbb845463d289ba2ae97737a1ee7cc39 llvm-svn: 316295
2017-10-23 01:59:38 +08:00
return BestCand;
}
unsigned RAGreedy::doRegionSplit(const LiveInterval &VirtReg, unsigned BestCand,
bool HasCompact,
2020-06-30 23:57:24 +08:00
SmallVectorImpl<Register> &NewVRegs) {
SmallVector<unsigned, 8> UsedCands;
// Prepare split editor.
Recommit r265547, and r265610,r265639,r265657 on top of it, plus two fixes with one about error verify-regalloc reported, and another about live range update of phi after rematerialization. r265547: Replace analyzeSiblingValues with new algorithm to fix its compile time issue. The patch is to solve PR17409 and its duplicates. analyzeSiblingValues is a N x N complexity algorithm where N is the number of siblings generated by reg splitting. Although it causes siginificant compile time issue when N is large, it is also important for performance since it removes redundent spills and enables rematerialization. To solve the compile time issue, the patch removes analyzeSiblingValues and replaces it with lower cost alternatives containing two parts. The first part creates a new spill hoisting method in postOptimization of register allocation. It does spill hoisting at once after all the spills are generated instead of inside every instance of selectOrSplit. The second part queries the define expr of the original register for rematerializaiton and keep it always available during register allocation even if it is already dead. It deletes those dead instructions only in postOptimization. With the two parts in the patch, it can remove analyzeSiblingValues without sacrificing performance. Patches on top of r265547: r265610 "Fix the compare-clang diff error introduced by r265547." r265639 "Fix the sanitizer bootstrap error in r265547." r265657 "InlineSpiller.cpp: Escap \@ in r265547. [-Wdocumentation]" Differential Revision: http://reviews.llvm.org/D15302 Differential Revision: http://reviews.llvm.org/D18934 Differential Revision: http://reviews.llvm.org/D18935 Differential Revision: http://reviews.llvm.org/D18936 llvm-svn: 266162
2016-04-13 11:08:27 +08:00
LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
SE->reset(LREdit, SplitSpillMode);
// Assign all edge bundles to the preferred candidate, or NoCand.
BundleCand.assign(Bundles->getNumBundles(), NoCand);
// Assign bundles for the best candidate region.
if (BestCand != NoCand) {
GlobalSplitCandidate &Cand = GlobalCand[BestCand];
if (unsigned B = Cand.getBundles(BundleCand, BestCand)) {
UsedCands.push_back(BestCand);
Cand.IntvIdx = SE->openIntv();
LLVM_DEBUG(dbgs() << "Split for " << printReg(Cand.PhysReg, TRI) << " in "
<< B << " bundles, intv " << Cand.IntvIdx << ".\n");
(void)B;
}
}
// Assign bundles for the compact region.
if (HasCompact) {
GlobalSplitCandidate &Cand = GlobalCand.front();
assert(!Cand.PhysReg && "Compact region has no physreg");
if (unsigned B = Cand.getBundles(BundleCand, 0)) {
UsedCands.push_back(0);
Cand.IntvIdx = SE->openIntv();
LLVM_DEBUG(dbgs() << "Split for compact region in " << B
<< " bundles, intv " << Cand.IntvIdx << ".\n");
(void)B;
}
}
splitAroundRegion(LREdit, UsedCands);
return 0;
}
//===----------------------------------------------------------------------===//
// Per-Block Splitting
//===----------------------------------------------------------------------===//
/// tryBlockSplit - Split a global live range around every block with uses. This
/// creates a lot of local live ranges, that will be split by tryLocalSplit if
/// they don't allocate.
unsigned RAGreedy::tryBlockSplit(const LiveInterval &VirtReg,
AllocationOrder &Order,
2020-06-30 23:57:24 +08:00
SmallVectorImpl<Register> &NewVRegs) {
assert(&SA->getParent() == &VirtReg && "Live range wasn't analyzed");
Register Reg = VirtReg.reg();
bool SingleInstrs = RegClassInfo.isProperSubClass(MRI->getRegClass(Reg));
Recommit r265547, and r265610,r265639,r265657 on top of it, plus two fixes with one about error verify-regalloc reported, and another about live range update of phi after rematerialization. r265547: Replace analyzeSiblingValues with new algorithm to fix its compile time issue. The patch is to solve PR17409 and its duplicates. analyzeSiblingValues is a N x N complexity algorithm where N is the number of siblings generated by reg splitting. Although it causes siginificant compile time issue when N is large, it is also important for performance since it removes redundent spills and enables rematerialization. To solve the compile time issue, the patch removes analyzeSiblingValues and replaces it with lower cost alternatives containing two parts. The first part creates a new spill hoisting method in postOptimization of register allocation. It does spill hoisting at once after all the spills are generated instead of inside every instance of selectOrSplit. The second part queries the define expr of the original register for rematerializaiton and keep it always available during register allocation even if it is already dead. It deletes those dead instructions only in postOptimization. With the two parts in the patch, it can remove analyzeSiblingValues without sacrificing performance. Patches on top of r265547: r265610 "Fix the compare-clang diff error introduced by r265547." r265639 "Fix the sanitizer bootstrap error in r265547." r265657 "InlineSpiller.cpp: Escap \@ in r265547. [-Wdocumentation]" Differential Revision: http://reviews.llvm.org/D15302 Differential Revision: http://reviews.llvm.org/D18934 Differential Revision: http://reviews.llvm.org/D18935 Differential Revision: http://reviews.llvm.org/D18936 llvm-svn: 266162
2016-04-13 11:08:27 +08:00
LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
SE->reset(LREdit, SplitSpillMode);
ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
for (const SplitAnalysis::BlockInfo &BI : UseBlocks) {
if (SA->shouldSplitSingleBlock(BI, SingleInstrs))
SE->splitSingleBlock(BI);
}
// No blocks were split.
if (LREdit.empty())
return 0;
// We did split for some blocks.
SmallVector<unsigned, 8> IntvMap;
SE->finish(&IntvMap);
// Tell LiveDebugVariables about the new ranges.
DebugVars->splitRegister(Reg, LREdit.regs(), *LIS);
// Sort out the new intervals created by splitting. The remainder interval
// goes straight to spilling, the new local ranges get to stay RS_New.
for (unsigned I = 0, E = LREdit.size(); I != E; ++I) {
const LiveInterval &LI = LIS->getInterval(LREdit.get(I));
if (ExtraInfo->getOrInitStage(LI.reg()) == RS_New && IntvMap[I] == 0)
ExtraInfo->setStage(LI, RS_Spill);
}
if (VerifyEnabled)
MF->verify(this, "After splitting live range around basic blocks");
return 0;
}
//===----------------------------------------------------------------------===//
// Per-Instruction Splitting
//===----------------------------------------------------------------------===//
/// Get the number of allocatable registers that match the constraints of \p Reg
/// on \p MI and that are also in \p SuperRC.
static unsigned getNumAllocatableRegsForConstraints(
const MachineInstr *MI, Register Reg, const TargetRegisterClass *SuperRC,
const TargetInstrInfo *TII, const TargetRegisterInfo *TRI,
const RegisterClassInfo &RCI) {
assert(SuperRC && "Invalid register class");
const TargetRegisterClass *ConstrainedRC =
MI->getRegClassConstraintEffectForVReg(Reg, SuperRC, TII, TRI,
/* ExploreBundle */ true);
if (!ConstrainedRC)
return 0;
return RCI.getNumAllocatableRegs(ConstrainedRC);
}
/// tryInstructionSplit - Split a live range around individual instructions.
/// This is normally not worthwhile since the spiller is doing essentially the
/// same thing. However, when the live range is in a constrained register
/// class, it may help to insert copies such that parts of the live range can
/// be moved to a larger register class.
///
/// This is similar to spilling to a larger register class.
unsigned RAGreedy::tryInstructionSplit(const LiveInterval &VirtReg,
AllocationOrder &Order,
SmallVectorImpl<Register> &NewVRegs) {
const TargetRegisterClass *CurRC = MRI->getRegClass(VirtReg.reg());
// There is no point to this if there are no larger sub-classes.
if (!RegClassInfo.isProperSubClass(CurRC))
return 0;
// Always enable split spill mode, since we're effectively spilling to a
// register.
Recommit r265547, and r265610,r265639,r265657 on top of it, plus two fixes with one about error verify-regalloc reported, and another about live range update of phi after rematerialization. r265547: Replace analyzeSiblingValues with new algorithm to fix its compile time issue. The patch is to solve PR17409 and its duplicates. analyzeSiblingValues is a N x N complexity algorithm where N is the number of siblings generated by reg splitting. Although it causes siginificant compile time issue when N is large, it is also important for performance since it removes redundent spills and enables rematerialization. To solve the compile time issue, the patch removes analyzeSiblingValues and replaces it with lower cost alternatives containing two parts. The first part creates a new spill hoisting method in postOptimization of register allocation. It does spill hoisting at once after all the spills are generated instead of inside every instance of selectOrSplit. The second part queries the define expr of the original register for rematerializaiton and keep it always available during register allocation even if it is already dead. It deletes those dead instructions only in postOptimization. With the two parts in the patch, it can remove analyzeSiblingValues without sacrificing performance. Patches on top of r265547: r265610 "Fix the compare-clang diff error introduced by r265547." r265639 "Fix the sanitizer bootstrap error in r265547." r265657 "InlineSpiller.cpp: Escap \@ in r265547. [-Wdocumentation]" Differential Revision: http://reviews.llvm.org/D15302 Differential Revision: http://reviews.llvm.org/D18934 Differential Revision: http://reviews.llvm.org/D18935 Differential Revision: http://reviews.llvm.org/D18936 llvm-svn: 266162
2016-04-13 11:08:27 +08:00
LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
SE->reset(LREdit, SplitEditor::SM_Size);
ArrayRef<SlotIndex> Uses = SA->getUseSlots();
if (Uses.size() <= 1)
return 0;
LLVM_DEBUG(dbgs() << "Split around " << Uses.size()
<< " individual instrs.\n");
const TargetRegisterClass *SuperRC =
TRI->getLargestLegalSuperClass(CurRC, *MF);
unsigned SuperRCNumAllocatableRegs =
RegClassInfo.getNumAllocatableRegs(SuperRC);
// Split around every non-copy instruction if this split will relax
// the constraints on the virtual register.
// Otherwise, splitting just inserts uncoalescable copies that do not help
// the allocation.
for (const SlotIndex Use : Uses) {
if (const MachineInstr *MI = Indexes->getInstructionFromIndex(Use))
if (MI->isFullCopy() ||
SuperRCNumAllocatableRegs ==
getNumAllocatableRegsForConstraints(MI, VirtReg.reg(), SuperRC,
TII, TRI, RegClassInfo)) {
LLVM_DEBUG(dbgs() << " skip:\t" << Use << '\t' << *MI);
continue;
}
SE->openIntv();
SlotIndex SegStart = SE->enterIntvBefore(Use);
SlotIndex SegStop = SE->leaveIntvAfter(Use);
SE->useIntv(SegStart, SegStop);
}
if (LREdit.empty()) {
LLVM_DEBUG(dbgs() << "All uses were copies.\n");
return 0;
}
SmallVector<unsigned, 8> IntvMap;
SE->finish(&IntvMap);
DebugVars->splitRegister(VirtReg.reg(), LREdit.regs(), *LIS);
// Assign all new registers to RS_Spill. This was the last chance.
ExtraInfo->setStage(LREdit.begin(), LREdit.end(), RS_Spill);
return 0;
}
//===----------------------------------------------------------------------===//
// Local Splitting
//===----------------------------------------------------------------------===//
/// calcGapWeights - Compute the maximum spill weight that needs to be evicted
/// in order to use PhysReg between two entries in SA->UseSlots.
///
/// GapWeight[I] represents the gap between UseSlots[I] and UseSlots[I + 1].
///
void RAGreedy::calcGapWeights(MCRegister PhysReg,
SmallVectorImpl<float> &GapWeight) {
assert(SA->getUseBlocks().size() == 1 && "Not a local interval");
const SplitAnalysis::BlockInfo &BI = SA->getUseBlocks().front();
ArrayRef<SlotIndex> Uses = SA->getUseSlots();
const unsigned NumGaps = Uses.size()-1;
// Start and end points for the interference check.
SlotIndex StartIdx =
BI.LiveIn ? BI.FirstInstr.getBaseIndex() : BI.FirstInstr;
SlotIndex StopIdx =
BI.LiveOut ? BI.LastInstr.getBoundaryIndex() : BI.LastInstr;
GapWeight.assign(NumGaps, 0.0f);
// Add interference from each overlapping register.
for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
if (!Matrix->query(const_cast<LiveInterval&>(SA->getParent()), *Units)
.checkInterference())
continue;
// We know that VirtReg is a continuous interval from FirstInstr to
// LastInstr, so we don't need InterferenceQuery.
//
// Interference that overlaps an instruction is counted in both gaps
// surrounding the instruction. The exception is interference before
// StartIdx and after StopIdx.
//
LiveIntervalUnion::SegmentIter IntI =
Matrix->getLiveUnions()[*Units] .find(StartIdx);
for (unsigned Gap = 0; IntI.valid() && IntI.start() < StopIdx; ++IntI) {
// Skip the gaps before IntI.
while (Uses[Gap+1].getBoundaryIndex() < IntI.start())
if (++Gap == NumGaps)
break;
if (Gap == NumGaps)
break;
// Update the gaps covered by IntI.
const float weight = IntI.value()->weight();
for (; Gap != NumGaps; ++Gap) {
GapWeight[Gap] = std::max(GapWeight[Gap], weight);
if (Uses[Gap+1].getBaseIndex() >= IntI.stop())
break;
}
if (Gap == NumGaps)
break;
}
}
// Add fixed interference.
for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
const LiveRange &LR = LIS->getRegUnit(*Units);
LiveRange::const_iterator I = LR.find(StartIdx);
LiveRange::const_iterator E = LR.end();
// Same loop as above. Mark any overlapped gaps as HUGE_VALF.
for (unsigned Gap = 0; I != E && I->start < StopIdx; ++I) {
while (Uses[Gap+1].getBoundaryIndex() < I->start)
if (++Gap == NumGaps)
break;
if (Gap == NumGaps)
break;
for (; Gap != NumGaps; ++Gap) {
GapWeight[Gap] = huge_valf;
if (Uses[Gap+1].getBaseIndex() >= I->end)
break;
}
if (Gap == NumGaps)
break;
}
}
}
/// tryLocalSplit - Try to split VirtReg into smaller intervals inside its only
/// basic block.
///
unsigned RAGreedy::tryLocalSplit(const LiveInterval &VirtReg,
AllocationOrder &Order,
2020-06-30 23:57:24 +08:00
SmallVectorImpl<Register> &NewVRegs) {
// TODO: the function currently only handles a single UseBlock; it should be
// possible to generalize.
if (SA->getUseBlocks().size() != 1)
return 0;
const SplitAnalysis::BlockInfo &BI = SA->getUseBlocks().front();
// Note that it is possible to have an interval that is live-in or live-out
// while only covering a single block - A phi-def can use undef values from
// predecessors, and the block could be a single-block loop.
// We don't bother doing anything clever about such a case, we simply assume
// that the interval is continuous from FirstInstr to LastInstr. We should
// make sure that we don't do anything illegal to such an interval, though.
ArrayRef<SlotIndex> Uses = SA->getUseSlots();
if (Uses.size() <= 2)
return 0;
const unsigned NumGaps = Uses.size()-1;
LLVM_DEBUG({
dbgs() << "tryLocalSplit: ";
for (const auto &Use : Uses)
dbgs() << ' ' << Use;
dbgs() << '\n';
});
// If VirtReg is live across any register mask operands, compute a list of
// gaps with register masks.
SmallVector<unsigned, 8> RegMaskGaps;
if (Matrix->checkRegMaskInterference(VirtReg)) {
// Get regmask slots for the whole block.
ArrayRef<SlotIndex> RMS = LIS->getRegMaskSlotsInBlock(BI.MBB->getNumber());
LLVM_DEBUG(dbgs() << RMS.size() << " regmasks in block:");
// Constrain to VirtReg's live range.
unsigned RI =
llvm::lower_bound(RMS, Uses.front().getRegSlot()) - RMS.begin();
unsigned RE = RMS.size();
for (unsigned I = 0; I != NumGaps && RI != RE; ++I) {
// Look for Uses[I] <= RMS <= Uses[I + 1].
assert(!SlotIndex::isEarlierInstr(RMS[RI], Uses[I]));
if (SlotIndex::isEarlierInstr(Uses[I + 1], RMS[RI]))
continue;
// Skip a regmask on the same instruction as the last use. It doesn't
// overlap the live range.
if (SlotIndex::isSameInstr(Uses[I + 1], RMS[RI]) && I + 1 == NumGaps)
break;
LLVM_DEBUG(dbgs() << ' ' << RMS[RI] << ':' << Uses[I] << '-'
<< Uses[I + 1]);
RegMaskGaps.push_back(I);
// Advance ri to the next gap. A regmask on one of the uses counts in
// both gaps.
while (RI != RE && SlotIndex::isEarlierInstr(RMS[RI], Uses[I + 1]))
++RI;
}
LLVM_DEBUG(dbgs() << '\n');
}
// Since we allow local split results to be split again, there is a risk of
// creating infinite loops. It is tempting to require that the new live
// ranges have less instructions than the original. That would guarantee
// convergence, but it is too strict. A live range with 3 instructions can be
// split 2+3 (including the COPY), and we want to allow that.
//
// Instead we use these rules:
//
// 1. Allow any split for ranges with getStage() < RS_Split2. (Except for the
// noop split, of course).
// 2. Require progress be made for ranges with getStage() == RS_Split2. All
// the new ranges must have fewer instructions than before the split.
// 3. New ranges with the same number of instructions are marked RS_Split2,
// smaller ranges are marked RS_New.
//
// These rules allow a 3 -> 2+3 split once, which we need. They also prevent
// excessive splitting and infinite loops.
//
bool ProgressRequired = ExtraInfo->getStage(VirtReg) >= RS_Split2;
// Best split candidate.
unsigned BestBefore = NumGaps;
unsigned BestAfter = 0;
float BestDiff = 0;
const float blockFreq =
SpillPlacer->getBlockFrequency(BI.MBB->getNumber()).getFrequency() *
(1.0f / MBFI->getEntryFreq());
SmallVector<float, 8> GapWeight;
for (MCPhysReg PhysReg : Order) {
assert(PhysReg);
// Keep track of the largest spill weight that would need to be evicted in
// order to make use of PhysReg between UseSlots[I] and UseSlots[I + 1].
calcGapWeights(PhysReg, GapWeight);
// Remove any gaps with regmask clobbers.
if (Matrix->checkRegMaskInterference(VirtReg, PhysReg))
for (unsigned I = 0, E = RegMaskGaps.size(); I != E; ++I)
GapWeight[RegMaskGaps[I]] = huge_valf;
// Try to find the best sequence of gaps to close.
// The new spill weight must be larger than any gap interference.
// We will split before Uses[SplitBefore] and after Uses[SplitAfter].
unsigned SplitBefore = 0, SplitAfter = 1;
// MaxGap should always be max(GapWeight[SplitBefore..SplitAfter-1]).
// It is the spill weight that needs to be evicted.
float MaxGap = GapWeight[0];
while (true) {
// Live before/after split?
const bool LiveBefore = SplitBefore != 0 || BI.LiveIn;
const bool LiveAfter = SplitAfter != NumGaps || BI.LiveOut;
LLVM_DEBUG(dbgs() << printReg(PhysReg, TRI) << ' ' << Uses[SplitBefore]
<< '-' << Uses[SplitAfter] << " I=" << MaxGap);
// Stop before the interval gets so big we wouldn't be making progress.
if (!LiveBefore && !LiveAfter) {
LLVM_DEBUG(dbgs() << " all\n");
break;
}
// Should the interval be extended or shrunk?
bool Shrink = true;
// How many gaps would the new range have?
unsigned NewGaps = LiveBefore + SplitAfter - SplitBefore + LiveAfter;
// Legally, without causing looping?
bool Legal = !ProgressRequired || NewGaps < NumGaps;
if (Legal && MaxGap < huge_valf) {
// Estimate the new spill weight. Each instruction reads or writes the
// register. Conservatively assume there are no read-modify-write
// instructions.
//
// Try to guess the size of the new interval.
2014-11-05 04:51:24 +08:00
const float EstWeight = normalizeSpillWeight(
blockFreq * (NewGaps + 1),
Uses[SplitBefore].distance(Uses[SplitAfter]) +
(LiveBefore + LiveAfter) * SlotIndex::InstrDist,
1);
// Would this split be possible to allocate?
// Never allocate all gaps, we wouldn't be making progress.
LLVM_DEBUG(dbgs() << " w=" << EstWeight);
if (EstWeight * Hysteresis >= MaxGap) {
Shrink = false;
float Diff = EstWeight - MaxGap;
if (Diff > BestDiff) {
LLVM_DEBUG(dbgs() << " (best)");
BestDiff = Hysteresis * Diff;
BestBefore = SplitBefore;
BestAfter = SplitAfter;
}
}
}
// Try to shrink.
if (Shrink) {
if (++SplitBefore < SplitAfter) {
LLVM_DEBUG(dbgs() << " shrink\n");
// Recompute the max when necessary.
if (GapWeight[SplitBefore - 1] >= MaxGap) {
MaxGap = GapWeight[SplitBefore];
for (unsigned I = SplitBefore + 1; I != SplitAfter; ++I)
MaxGap = std::max(MaxGap, GapWeight[I]);
}
continue;
}
MaxGap = 0;
}
// Try to extend the interval.
if (SplitAfter >= NumGaps) {
LLVM_DEBUG(dbgs() << " end\n");
break;
}
LLVM_DEBUG(dbgs() << " extend\n");
MaxGap = std::max(MaxGap, GapWeight[SplitAfter++]);
}
}
// Didn't find any candidates?
if (BestBefore == NumGaps)
return 0;
LLVM_DEBUG(dbgs() << "Best local split range: " << Uses[BestBefore] << '-'
<< Uses[BestAfter] << ", " << BestDiff << ", "
<< (BestAfter - BestBefore + 1) << " instrs\n");
Recommit r265547, and r265610,r265639,r265657 on top of it, plus two fixes with one about error verify-regalloc reported, and another about live range update of phi after rematerialization. r265547: Replace analyzeSiblingValues with new algorithm to fix its compile time issue. The patch is to solve PR17409 and its duplicates. analyzeSiblingValues is a N x N complexity algorithm where N is the number of siblings generated by reg splitting. Although it causes siginificant compile time issue when N is large, it is also important for performance since it removes redundent spills and enables rematerialization. To solve the compile time issue, the patch removes analyzeSiblingValues and replaces it with lower cost alternatives containing two parts. The first part creates a new spill hoisting method in postOptimization of register allocation. It does spill hoisting at once after all the spills are generated instead of inside every instance of selectOrSplit. The second part queries the define expr of the original register for rematerializaiton and keep it always available during register allocation even if it is already dead. It deletes those dead instructions only in postOptimization. With the two parts in the patch, it can remove analyzeSiblingValues without sacrificing performance. Patches on top of r265547: r265610 "Fix the compare-clang diff error introduced by r265547." r265639 "Fix the sanitizer bootstrap error in r265547." r265657 "InlineSpiller.cpp: Escap \@ in r265547. [-Wdocumentation]" Differential Revision: http://reviews.llvm.org/D15302 Differential Revision: http://reviews.llvm.org/D18934 Differential Revision: http://reviews.llvm.org/D18935 Differential Revision: http://reviews.llvm.org/D18936 llvm-svn: 266162
2016-04-13 11:08:27 +08:00
LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
SE->reset(LREdit);
SE->openIntv();
SlotIndex SegStart = SE->enterIntvBefore(Uses[BestBefore]);
SlotIndex SegStop = SE->leaveIntvAfter(Uses[BestAfter]);
SE->useIntv(SegStart, SegStop);
SmallVector<unsigned, 8> IntvMap;
SE->finish(&IntvMap);
DebugVars->splitRegister(VirtReg.reg(), LREdit.regs(), *LIS);
// If the new range has the same number of instructions as before, mark it as
// RS_Split2 so the next split will be forced to make progress. Otherwise,
// leave the new intervals as RS_New so they can compete.
bool LiveBefore = BestBefore != 0 || BI.LiveIn;
bool LiveAfter = BestAfter != NumGaps || BI.LiveOut;
unsigned NewGaps = LiveBefore + BestAfter - BestBefore + LiveAfter;
if (NewGaps >= NumGaps) {
LLVM_DEBUG(dbgs() << "Tagging non-progress ranges:");
assert(!ProgressRequired && "Didn't make progress when it was required.");
for (unsigned I = 0, E = IntvMap.size(); I != E; ++I)
if (IntvMap[I] == 1) {
ExtraInfo->setStage(LIS->getInterval(LREdit.get(I)), RS_Split2);
LLVM_DEBUG(dbgs() << ' ' << printReg(LREdit.get(I)));
}
LLVM_DEBUG(dbgs() << '\n');
}
++NumLocalSplits;
return 0;
}
//===----------------------------------------------------------------------===//
// Live Range Splitting
//===----------------------------------------------------------------------===//
/// trySplit - Try to split VirtReg or one of its interferences, making it
/// assignable.
/// @return Physreg when VirtReg may be assigned and/or new NewVRegs.
unsigned RAGreedy::trySplit(const LiveInterval &VirtReg, AllocationOrder &Order,
2020-06-30 23:57:24 +08:00
SmallVectorImpl<Register> &NewVRegs,
const SmallVirtRegSet &FixedRegisters) {
// Ranges must be Split2 or less.
if (ExtraInfo->getStage(VirtReg) >= RS_Spill)
return 0;
// Local intervals are handled separately.
if (LIS->intervalIsInOneMBB(VirtReg)) {
NamedRegionTimer T("local_split", "Local Splitting", TimerGroupName,
TimerGroupDescription, TimePassesIsEnabled);
SA->analyze(&VirtReg);
2020-06-30 23:57:24 +08:00
Register PhysReg = tryLocalSplit(VirtReg, Order, NewVRegs);
if (PhysReg || !NewVRegs.empty())
return PhysReg;
return tryInstructionSplit(VirtReg, Order, NewVRegs);
}
NamedRegionTimer T("global_split", "Global Splitting", TimerGroupName,
TimerGroupDescription, TimePassesIsEnabled);
SA->analyze(&VirtReg);
// First try to split around a region spanning multiple blocks. RS_Split2
// ranges already made dubious progress with region splitting, so they go
// straight to single block splitting.
if (ExtraInfo->getStage(VirtReg) < RS_Split2) {
MCRegister PhysReg = tryRegionSplit(VirtReg, Order, NewVRegs);
if (PhysReg || !NewVRegs.empty())
return PhysReg;
}
// Then isolate blocks.
return tryBlockSplit(VirtReg, Order, NewVRegs);
}
//===----------------------------------------------------------------------===//
// Last Chance Recoloring
//===----------------------------------------------------------------------===//
/// Return true if \p reg has any tied def operand.
static bool hasTiedDef(MachineRegisterInfo *MRI, unsigned reg) {
for (const MachineOperand &MO : MRI->def_operands(reg))
if (MO.isTied())
return true;
return false;
}
/// Return true if the existing assignment of \p Intf overlaps, but is not the
/// same, as \p PhysReg.
static bool assignedRegPartiallyOverlaps(const TargetRegisterInfo &TRI,
const VirtRegMap &VRM,
MCRegister PhysReg,
const LiveInterval &Intf) {
MCRegister AssignedReg = VRM.getPhys(Intf.reg());
if (PhysReg == AssignedReg)
return false;
return TRI.regsOverlap(PhysReg, AssignedReg);
}
/// mayRecolorAllInterferences - Check if the virtual registers that
/// interfere with \p VirtReg on \p PhysReg (or one of its aliases) may be
/// recolored to free \p PhysReg.
/// When true is returned, \p RecoloringCandidates has been augmented with all
/// the live intervals that need to be recolored in order to free \p PhysReg
/// for \p VirtReg.
/// \p FixedRegisters contains all the virtual registers that cannot be
/// recolored.
bool RAGreedy::mayRecolorAllInterferences(
MCRegister PhysReg, const LiveInterval &VirtReg,
SmallLISet &RecoloringCandidates, const SmallVirtRegSet &FixedRegisters) {
const TargetRegisterClass *CurRC = MRI->getRegClass(VirtReg.reg());
for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units);
// If there is LastChanceRecoloringMaxInterference or more interferences,
// chances are one would not be recolorable.
if (Q.interferingVRegs(LastChanceRecoloringMaxInterference).size() >=
LastChanceRecoloringMaxInterference &&
!ExhaustiveSearch) {
LLVM_DEBUG(dbgs() << "Early abort: too many interferences.\n");
CutOffInfo |= CO_Interf;
return false;
}
for (const LiveInterval *Intf : reverse(Q.interferingVRegs())) {
// If Intf is done and sits on the same register class as VirtReg, it
// would not be recolorable as it is in the same state as
// VirtReg. However there are at least two exceptions.
//
// If VirtReg has tied defs and Intf doesn't, then
// there is still a point in examining if it can be recolorable.
//
// Additionally, if the register class has overlapping tuple members, it
// may still be recolorable using a different tuple. This is more likely
// if the existing assignment aliases with the candidate.
//
if (((ExtraInfo->getStage(*Intf) == RS_Done &&
MRI->getRegClass(Intf->reg()) == CurRC &&
!assignedRegPartiallyOverlaps(*TRI, *VRM, PhysReg, *Intf)) &&
!(hasTiedDef(MRI, VirtReg.reg()) &&
!hasTiedDef(MRI, Intf->reg()))) ||
FixedRegisters.count(Intf->reg())) {
LLVM_DEBUG(
dbgs() << "Early abort: the interference is not recolorable.\n");
return false;
}
RecoloringCandidates.insert(Intf);
}
}
return true;
}
/// tryLastChanceRecoloring - Try to assign a color to \p VirtReg by recoloring
/// its interferences.
/// Last chance recoloring chooses a color for \p VirtReg and recolors every
/// virtual register that was using it. The recoloring process may recursively
/// use the last chance recoloring. Therefore, when a virtual register has been
/// assigned a color by this mechanism, it is marked as Fixed, i.e., it cannot
/// be last-chance-recolored again during this recoloring "session".
/// E.g.,
/// Let
/// vA can use {R1, R2 }
/// vB can use { R2, R3}
/// vC can use {R1 }
/// Where vA, vB, and vC cannot be split anymore (they are reloads for
/// instance) and they all interfere.
///
/// vA is assigned R1
/// vB is assigned R2
/// vC tries to evict vA but vA is already done.
/// Regular register allocation fails.
///
/// Last chance recoloring kicks in:
/// vC does as if vA was evicted => vC uses R1.
/// vC is marked as fixed.
/// vA needs to find a color.
/// None are available.
/// vA cannot evict vC: vC is a fixed virtual register now.
/// vA does as if vB was evicted => vA uses R2.
/// vB needs to find a color.
/// R3 is available.
/// Recoloring => vC = R1, vA = R2, vB = R3
///
2014-02-25 12:21:15 +08:00
/// \p Order defines the preferred allocation order for \p VirtReg.
/// \p NewRegs will contain any new virtual register that have been created
/// (split, spill) during the process and that must be assigned.
/// \p FixedRegisters contains all the virtual registers that cannot be
/// recolored.
///
/// \p RecolorStack tracks the original assignments of successfully recolored
/// registers.
///
/// \p Depth gives the current depth of the last chance recoloring.
/// \return a physical register that can be used for VirtReg or ~0u if none
/// exists.
unsigned RAGreedy::tryLastChanceRecoloring(const LiveInterval &VirtReg,
AllocationOrder &Order,
2020-06-30 23:57:24 +08:00
SmallVectorImpl<Register> &NewVRegs,
SmallVirtRegSet &FixedRegisters,
RecoloringStack &RecolorStack,
unsigned Depth) {
if (!TRI->shouldUseLastChanceRecoloringForVirtReg(*MF, VirtReg))
return ~0u;
LLVM_DEBUG(dbgs() << "Try last chance recoloring for " << VirtReg << '\n');
const ssize_t EntryStackSize = RecolorStack.size();
// Ranges must be Done.
assert((ExtraInfo->getStage(VirtReg) >= RS_Done || !VirtReg.isSpillable()) &&
"Last chance recoloring should really be last chance");
// Set the max depth to LastChanceRecoloringMaxDepth.
// We may want to reconsider that if we end up with a too large search space
// for target with hundreds of registers.
// Indeed, in that case we may want to cut the search space earlier.
if (Depth >= LastChanceRecoloringMaxDepth && !ExhaustiveSearch) {
LLVM_DEBUG(dbgs() << "Abort because max depth has been reached.\n");
CutOffInfo |= CO_Depth;
return ~0u;
}
// Set of Live intervals that will need to be recolored.
SmallLISet RecoloringCandidates;
// Mark VirtReg as fixed, i.e., it will not be recolored pass this point in
// this recoloring "session".
assert(!FixedRegisters.count(VirtReg.reg()));
FixedRegisters.insert(VirtReg.reg());
2020-06-30 23:57:24 +08:00
SmallVector<Register, 4> CurrentNewVRegs;
for (MCRegister PhysReg : Order) {
assert(PhysReg.isValid());
LLVM_DEBUG(dbgs() << "Try to assign: " << VirtReg << " to "
<< printReg(PhysReg, TRI) << '\n');
RecoloringCandidates.clear();
CurrentNewVRegs.clear();
// It is only possible to recolor virtual register interference.
if (Matrix->checkInterference(VirtReg, PhysReg) >
LiveRegMatrix::IK_VirtReg) {
LLVM_DEBUG(
dbgs() << "Some interferences are not with virtual registers.\n");
continue;
}
// Early give up on this PhysReg if it is obvious we cannot recolor all
// the interferences.
if (!mayRecolorAllInterferences(PhysReg, VirtReg, RecoloringCandidates,
FixedRegisters)) {
LLVM_DEBUG(dbgs() << "Some interferences cannot be recolored.\n");
continue;
}
2022-03-01 07:26:41 +08:00
// RecoloringCandidates contains all the virtual registers that interfere
// with VirtReg on PhysReg (or one of its aliases). Enqueue them for
// recoloring and perform the actual recoloring.
PQueue RecoloringQueue;
for (const LiveInterval *RC : RecoloringCandidates) {
Register ItVirtReg = RC->reg();
enqueue(RecoloringQueue, RC);
assert(VRM->hasPhys(ItVirtReg) &&
"Interferences are supposed to be with allocated variables");
// Record the current allocation.
RecolorStack.push_back(std::make_pair(RC, VRM->getPhys(ItVirtReg)));
// unset the related struct.
Matrix->unassign(*RC);
}
// Do as if VirtReg was assigned to PhysReg so that the underlying
// recoloring has the right information about the interferes and
// available colors.
Matrix->assign(VirtReg, PhysReg);
// Save the current recoloring state.
// If we cannot recolor all the interferences, we will have to start again
// at this point for the next physical register.
SmallVirtRegSet SaveFixedRegisters(FixedRegisters);
if (tryRecoloringCandidates(RecoloringQueue, CurrentNewVRegs,
FixedRegisters, RecolorStack, Depth)) {
// Push the queued vregs into the main queue.
2020-06-30 23:57:24 +08:00
for (Register NewVReg : CurrentNewVRegs)
NewVRegs.push_back(NewVReg);
// Do not mess up with the global assignment process.
// I.e., VirtReg must be unassigned.
Matrix->unassign(VirtReg);
return PhysReg;
}
LLVM_DEBUG(dbgs() << "Fail to assign: " << VirtReg << " to "
<< printReg(PhysReg, TRI) << '\n');
// The recoloring attempt failed, undo the changes.
FixedRegisters = SaveFixedRegisters;
Matrix->unassign(VirtReg);
// For a newly created vreg which is also in RecoloringCandidates,
// don't add it to NewVRegs because its physical register will be restored
// below. Other vregs in CurrentNewVRegs are created by calling
// selectOrSplit and should be added into NewVRegs.
for (Register &R : CurrentNewVRegs) {
if (RecoloringCandidates.count(&LIS->getInterval(R)))
continue;
NewVRegs.push_back(R);
}
// Roll back our unsuccessful recoloring. Also roll back any successful
// recolorings in any recursive recoloring attempts, since it's possible
// they would have introduced conflicts with assignments we will be
// restoring further up the stack. Perform all unassignments prior to
// reassigning, since sub-recolorings may have conflicted with the registers
// we are going to restore to their original assignments.
for (ssize_t I = RecolorStack.size() - 1; I >= EntryStackSize; --I) {
const LiveInterval *LI;
MCRegister PhysReg;
std::tie(LI, PhysReg) = RecolorStack[I];
if (VRM->hasPhys(LI->reg()))
Matrix->unassign(*LI);
}
for (size_t I = EntryStackSize; I != RecolorStack.size(); ++I) {
const LiveInterval *LI;
MCRegister PhysReg;
std::tie(LI, PhysReg) = RecolorStack[I];
if (!LI->empty() && !MRI->reg_nodbg_empty(LI->reg()))
Matrix->assign(*LI, PhysReg);
}
// Pop the stack of recoloring attempts.
RecolorStack.resize(EntryStackSize);
}
// Last chance recoloring did not worked either, give up.
return ~0u;
}
/// tryRecoloringCandidates - Try to assign a new color to every register
/// in \RecoloringQueue.
/// \p NewRegs will contain any new virtual register created during the
/// recoloring process.
/// \p FixedRegisters[in/out] contains all the registers that have been
/// recolored.
/// \return true if all virtual registers in RecoloringQueue were successfully
/// recolored, false otherwise.
bool RAGreedy::tryRecoloringCandidates(PQueue &RecoloringQueue,
2020-06-30 23:57:24 +08:00
SmallVectorImpl<Register> &NewVRegs,
SmallVirtRegSet &FixedRegisters,
RecoloringStack &RecolorStack,
unsigned Depth) {
while (!RecoloringQueue.empty()) {
const LiveInterval *LI = dequeue(RecoloringQueue);
LLVM_DEBUG(dbgs() << "Try to recolor: " << *LI << '\n');
MCRegister PhysReg = selectOrSplitImpl(*LI, NewVRegs, FixedRegisters,
RecolorStack, Depth + 1);
// When splitting happens, the live-range may actually be empty.
// In that case, this is okay to continue the recoloring even
// if we did not find an alternative color for it. Indeed,
// there will not be anything to color for LI in the end.
if (PhysReg == ~0u || (!PhysReg && !LI->empty()))
return false;
if (!PhysReg) {
assert(LI->empty() && "Only empty live-range do not require a register");
LLVM_DEBUG(dbgs() << "Recoloring of " << *LI
<< " succeeded. Empty LI.\n");
continue;
}
LLVM_DEBUG(dbgs() << "Recoloring of " << *LI
<< " succeeded with: " << printReg(PhysReg, TRI) << '\n');
Matrix->assign(*LI, PhysReg);
FixedRegisters.insert(LI->reg());
}
return true;
}
//===----------------------------------------------------------------------===//
// Main Entry Point
//===----------------------------------------------------------------------===//
MCRegister RAGreedy::selectOrSplit(const LiveInterval &VirtReg,
SmallVectorImpl<Register> &NewVRegs) {
CutOffInfo = CO_None;
LLVMContext &Ctx = MF->getFunction().getContext();
SmallVirtRegSet FixedRegisters;
RecoloringStack RecolorStack;
MCRegister Reg =
selectOrSplitImpl(VirtReg, NewVRegs, FixedRegisters, RecolorStack);
if (Reg == ~0U && (CutOffInfo != CO_None)) {
uint8_t CutOffEncountered = CutOffInfo & (CO_Depth | CO_Interf);
if (CutOffEncountered == CO_Depth)
Ctx.emitError("register allocation failed: maximum depth for recoloring "
"reached. Use -fexhaustive-register-search to skip "
"cutoffs");
else if (CutOffEncountered == CO_Interf)
Ctx.emitError("register allocation failed: maximum interference for "
"recoloring reached. Use -fexhaustive-register-search "
"to skip cutoffs");
else if (CutOffEncountered == (CO_Depth | CO_Interf))
Ctx.emitError("register allocation failed: maximum interference and "
"depth for recoloring reached. Use "
"-fexhaustive-register-search to skip cutoffs");
}
return Reg;
}
/// Using a CSR for the first time has a cost because it causes push|pop
/// to be added to prologue|epilogue. Splitting a cold section of the live
/// range can have lower cost than using the CSR for the first time;
/// Spilling a live range in the cold path can have lower cost than using
/// the CSR for the first time. Returns the physical register if we decide
/// to use the CSR; otherwise return 0.
MCRegister RAGreedy::tryAssignCSRFirstTime(
const LiveInterval &VirtReg, AllocationOrder &Order, MCRegister PhysReg,
uint8_t &CostPerUseLimit, SmallVectorImpl<Register> &NewVRegs) {
if (ExtraInfo->getStage(VirtReg) == RS_Spill && VirtReg.isSpillable()) {
// We choose spill over using the CSR for the first time if the spill cost
// is lower than CSRCost.
SA->analyze(&VirtReg);
if (calcSpillCost() >= CSRCost)
return PhysReg;
// We are going to spill, set CostPerUseLimit to 1 to make sure that
// we will not use a callee-saved register in tryEvict.
CostPerUseLimit = 1;
return 0;
}
if (ExtraInfo->getStage(VirtReg) < RS_Split) {
// We choose pre-splitting over using the CSR for the first time if
// the cost of splitting is lower than CSRCost.
SA->analyze(&VirtReg);
unsigned NumCands = 0;
BlockFrequency BestCost = CSRCost; // Don't modify CSRCost.
unsigned BestCand = calculateRegionSplitCost(VirtReg, Order, BestCost,
NumCands, true /*IgnoreCSR*/);
if (BestCand == NoCand)
// Use the CSR if we can't find a region split below CSRCost.
return PhysReg;
// Perform the actual pre-splitting.
doRegionSplit(VirtReg, BestCand, false/*HasCompact*/, NewVRegs);
return 0;
}
return PhysReg;
}
void RAGreedy::aboutToRemoveInterval(const LiveInterval &LI) {
[RegAllocGreedy] Introduce a late pass to repair broken hints. A broken hint is a copy where both ends are assigned different colors. When a variable gets evicted in the neighborhood of such copies, it is likely we can reconcile some of them. ** Context ** Copies are inserted during the register allocation via splitting. These split points are required to relax the constraints on the allocation problem. When such a point is inserted, both ends of the copy would not share the same color with respect to the current allocation problem. When variables get evicted, the allocation problem becomes different and some split point may not be required anymore. However, the related variables may already have been colored. This usually shows up in the assembly with pattern like this: def A ... save A to B def A use A restore A from B ... use B Whereas we could simply have done: def B ... def A use A ... use B ** Proposed Solution ** A variable having a broken hint is marked for late recoloring if and only if selecting a register for it evict another variable. Indeed, if no eviction happens this is pointless to look for recoloring opportunities as it means the situation was the same as the initial allocation problem where we had to break the hint. Finally, when everything has been allocated, we look for recoloring opportunities for all the identified candidates. The recoloring is performed very late to rely on accurate copy cost (all involved variables are allocated). The recoloring is simple unlike the last change recoloring. It propagates the color of the broken hint to all its copy-related variables. If the color is available for them, the recoloring uses it, otherwise it gives up on that hint even if a more complex coloring would have worked. The recoloring happens only if it is profitable. The profitability is evaluated using the expected frequency of the copies of the currently recolored variable with a) its current color and b) with the target color. If a) is greater or equal than b), then it is profitable and the recoloring happen. ** Example ** Consider the following example: BB1: a = b = BB2: ... = b = a Let us assume b gets split: BB1: a = b = BB2: c = b ... d = c = d = a Because of how the allocation work, b, c, and d may be assigned different colors. Now, if a gets evicted to make room for c, assuming b and d were assigned to something different than a. We end up with: BB1: a = st a, SpillSlot b = BB2: c = b ... d = c = d e = ld SpillSlot = e This is likely that we can assign the same register for b, c, and d, getting rid of 2 copies. ** Performances ** Both ARM64 and x86_64 show performance improvements of up to 3% for the llvm-testsuite + externals with Os and O3. There are a few regressions too that comes from the (in)accuracy of the block frequency estimate. <rdar://problem/18312047> llvm-svn: 225422
2015-01-08 09:16:39 +08:00
// Do not keep invalid information around.
SetOfBrokenHints.remove(&LI);
}
void RAGreedy::initializeCSRCost() {
// We use the larger one out of the command-line option and the value report
// by TRI.
CSRCost = BlockFrequency(
std::max((unsigned)CSRFirstTimeCost, TRI->getCSRFirstUseCost()));
if (!CSRCost.getFrequency())
return;
// Raw cost is relative to Entry == 2^14; scale it appropriately.
uint64_t ActualEntry = MBFI->getEntryFreq();
if (!ActualEntry) {
CSRCost = 0;
return;
}
uint64_t FixedEntry = 1 << 14;
if (ActualEntry < FixedEntry)
CSRCost *= BranchProbability(ActualEntry, FixedEntry);
else if (ActualEntry <= UINT32_MAX)
// Invert the fraction and divide.
CSRCost /= BranchProbability(FixedEntry, ActualEntry);
else
// Can't use BranchProbability in general, since it takes 32-bit numbers.
CSRCost = CSRCost.getFrequency() * (ActualEntry / FixedEntry);
}
/// Collect the hint info for \p Reg.
[RegAllocGreedy] Introduce a late pass to repair broken hints. A broken hint is a copy where both ends are assigned different colors. When a variable gets evicted in the neighborhood of such copies, it is likely we can reconcile some of them. ** Context ** Copies are inserted during the register allocation via splitting. These split points are required to relax the constraints on the allocation problem. When such a point is inserted, both ends of the copy would not share the same color with respect to the current allocation problem. When variables get evicted, the allocation problem becomes different and some split point may not be required anymore. However, the related variables may already have been colored. This usually shows up in the assembly with pattern like this: def A ... save A to B def A use A restore A from B ... use B Whereas we could simply have done: def B ... def A use A ... use B ** Proposed Solution ** A variable having a broken hint is marked for late recoloring if and only if selecting a register for it evict another variable. Indeed, if no eviction happens this is pointless to look for recoloring opportunities as it means the situation was the same as the initial allocation problem where we had to break the hint. Finally, when everything has been allocated, we look for recoloring opportunities for all the identified candidates. The recoloring is performed very late to rely on accurate copy cost (all involved variables are allocated). The recoloring is simple unlike the last change recoloring. It propagates the color of the broken hint to all its copy-related variables. If the color is available for them, the recoloring uses it, otherwise it gives up on that hint even if a more complex coloring would have worked. The recoloring happens only if it is profitable. The profitability is evaluated using the expected frequency of the copies of the currently recolored variable with a) its current color and b) with the target color. If a) is greater or equal than b), then it is profitable and the recoloring happen. ** Example ** Consider the following example: BB1: a = b = BB2: ... = b = a Let us assume b gets split: BB1: a = b = BB2: c = b ... d = c = d = a Because of how the allocation work, b, c, and d may be assigned different colors. Now, if a gets evicted to make room for c, assuming b and d were assigned to something different than a. We end up with: BB1: a = st a, SpillSlot b = BB2: c = b ... d = c = d e = ld SpillSlot = e This is likely that we can assign the same register for b, c, and d, getting rid of 2 copies. ** Performances ** Both ARM64 and x86_64 show performance improvements of up to 3% for the llvm-testsuite + externals with Os and O3. There are a few regressions too that comes from the (in)accuracy of the block frequency estimate. <rdar://problem/18312047> llvm-svn: 225422
2015-01-08 09:16:39 +08:00
/// The results are stored into \p Out.
/// \p Out is not cleared before being populated.
void RAGreedy::collectHintInfo(Register Reg, HintsInfo &Out) {
[RegAllocGreedy] Introduce a late pass to repair broken hints. A broken hint is a copy where both ends are assigned different colors. When a variable gets evicted in the neighborhood of such copies, it is likely we can reconcile some of them. ** Context ** Copies are inserted during the register allocation via splitting. These split points are required to relax the constraints on the allocation problem. When such a point is inserted, both ends of the copy would not share the same color with respect to the current allocation problem. When variables get evicted, the allocation problem becomes different and some split point may not be required anymore. However, the related variables may already have been colored. This usually shows up in the assembly with pattern like this: def A ... save A to B def A use A restore A from B ... use B Whereas we could simply have done: def B ... def A use A ... use B ** Proposed Solution ** A variable having a broken hint is marked for late recoloring if and only if selecting a register for it evict another variable. Indeed, if no eviction happens this is pointless to look for recoloring opportunities as it means the situation was the same as the initial allocation problem where we had to break the hint. Finally, when everything has been allocated, we look for recoloring opportunities for all the identified candidates. The recoloring is performed very late to rely on accurate copy cost (all involved variables are allocated). The recoloring is simple unlike the last change recoloring. It propagates the color of the broken hint to all its copy-related variables. If the color is available for them, the recoloring uses it, otherwise it gives up on that hint even if a more complex coloring would have worked. The recoloring happens only if it is profitable. The profitability is evaluated using the expected frequency of the copies of the currently recolored variable with a) its current color and b) with the target color. If a) is greater or equal than b), then it is profitable and the recoloring happen. ** Example ** Consider the following example: BB1: a = b = BB2: ... = b = a Let us assume b gets split: BB1: a = b = BB2: c = b ... d = c = d = a Because of how the allocation work, b, c, and d may be assigned different colors. Now, if a gets evicted to make room for c, assuming b and d were assigned to something different than a. We end up with: BB1: a = st a, SpillSlot b = BB2: c = b ... d = c = d e = ld SpillSlot = e This is likely that we can assign the same register for b, c, and d, getting rid of 2 copies. ** Performances ** Both ARM64 and x86_64 show performance improvements of up to 3% for the llvm-testsuite + externals with Os and O3. There are a few regressions too that comes from the (in)accuracy of the block frequency estimate. <rdar://problem/18312047> llvm-svn: 225422
2015-01-08 09:16:39 +08:00
for (const MachineInstr &Instr : MRI->reg_nodbg_instructions(Reg)) {
if (!Instr.isFullCopy())
continue;
// Look for the other end of the copy.
Register OtherReg = Instr.getOperand(0).getReg();
[RegAllocGreedy] Introduce a late pass to repair broken hints. A broken hint is a copy where both ends are assigned different colors. When a variable gets evicted in the neighborhood of such copies, it is likely we can reconcile some of them. ** Context ** Copies are inserted during the register allocation via splitting. These split points are required to relax the constraints on the allocation problem. When such a point is inserted, both ends of the copy would not share the same color with respect to the current allocation problem. When variables get evicted, the allocation problem becomes different and some split point may not be required anymore. However, the related variables may already have been colored. This usually shows up in the assembly with pattern like this: def A ... save A to B def A use A restore A from B ... use B Whereas we could simply have done: def B ... def A use A ... use B ** Proposed Solution ** A variable having a broken hint is marked for late recoloring if and only if selecting a register for it evict another variable. Indeed, if no eviction happens this is pointless to look for recoloring opportunities as it means the situation was the same as the initial allocation problem where we had to break the hint. Finally, when everything has been allocated, we look for recoloring opportunities for all the identified candidates. The recoloring is performed very late to rely on accurate copy cost (all involved variables are allocated). The recoloring is simple unlike the last change recoloring. It propagates the color of the broken hint to all its copy-related variables. If the color is available for them, the recoloring uses it, otherwise it gives up on that hint even if a more complex coloring would have worked. The recoloring happens only if it is profitable. The profitability is evaluated using the expected frequency of the copies of the currently recolored variable with a) its current color and b) with the target color. If a) is greater or equal than b), then it is profitable and the recoloring happen. ** Example ** Consider the following example: BB1: a = b = BB2: ... = b = a Let us assume b gets split: BB1: a = b = BB2: c = b ... d = c = d = a Because of how the allocation work, b, c, and d may be assigned different colors. Now, if a gets evicted to make room for c, assuming b and d were assigned to something different than a. We end up with: BB1: a = st a, SpillSlot b = BB2: c = b ... d = c = d e = ld SpillSlot = e This is likely that we can assign the same register for b, c, and d, getting rid of 2 copies. ** Performances ** Both ARM64 and x86_64 show performance improvements of up to 3% for the llvm-testsuite + externals with Os and O3. There are a few regressions too that comes from the (in)accuracy of the block frequency estimate. <rdar://problem/18312047> llvm-svn: 225422
2015-01-08 09:16:39 +08:00
if (OtherReg == Reg) {
OtherReg = Instr.getOperand(1).getReg();
if (OtherReg == Reg)
continue;
}
// Get the current assignment.
MCRegister OtherPhysReg =
OtherReg.isPhysical() ? OtherReg.asMCReg() : VRM->getPhys(OtherReg);
[RegAllocGreedy] Introduce a late pass to repair broken hints. A broken hint is a copy where both ends are assigned different colors. When a variable gets evicted in the neighborhood of such copies, it is likely we can reconcile some of them. ** Context ** Copies are inserted during the register allocation via splitting. These split points are required to relax the constraints on the allocation problem. When such a point is inserted, both ends of the copy would not share the same color with respect to the current allocation problem. When variables get evicted, the allocation problem becomes different and some split point may not be required anymore. However, the related variables may already have been colored. This usually shows up in the assembly with pattern like this: def A ... save A to B def A use A restore A from B ... use B Whereas we could simply have done: def B ... def A use A ... use B ** Proposed Solution ** A variable having a broken hint is marked for late recoloring if and only if selecting a register for it evict another variable. Indeed, if no eviction happens this is pointless to look for recoloring opportunities as it means the situation was the same as the initial allocation problem where we had to break the hint. Finally, when everything has been allocated, we look for recoloring opportunities for all the identified candidates. The recoloring is performed very late to rely on accurate copy cost (all involved variables are allocated). The recoloring is simple unlike the last change recoloring. It propagates the color of the broken hint to all its copy-related variables. If the color is available for them, the recoloring uses it, otherwise it gives up on that hint even if a more complex coloring would have worked. The recoloring happens only if it is profitable. The profitability is evaluated using the expected frequency of the copies of the currently recolored variable with a) its current color and b) with the target color. If a) is greater or equal than b), then it is profitable and the recoloring happen. ** Example ** Consider the following example: BB1: a = b = BB2: ... = b = a Let us assume b gets split: BB1: a = b = BB2: c = b ... d = c = d = a Because of how the allocation work, b, c, and d may be assigned different colors. Now, if a gets evicted to make room for c, assuming b and d were assigned to something different than a. We end up with: BB1: a = st a, SpillSlot b = BB2: c = b ... d = c = d e = ld SpillSlot = e This is likely that we can assign the same register for b, c, and d, getting rid of 2 copies. ** Performances ** Both ARM64 and x86_64 show performance improvements of up to 3% for the llvm-testsuite + externals with Os and O3. There are a few regressions too that comes from the (in)accuracy of the block frequency estimate. <rdar://problem/18312047> llvm-svn: 225422
2015-01-08 09:16:39 +08:00
// Push the collected information.
Out.push_back(HintInfo(MBFI->getBlockFreq(Instr.getParent()), OtherReg,
OtherPhysReg));
}
}
/// Using the given \p List, compute the cost of the broken hints if
[RegAllocGreedy] Introduce a late pass to repair broken hints. A broken hint is a copy where both ends are assigned different colors. When a variable gets evicted in the neighborhood of such copies, it is likely we can reconcile some of them. ** Context ** Copies are inserted during the register allocation via splitting. These split points are required to relax the constraints on the allocation problem. When such a point is inserted, both ends of the copy would not share the same color with respect to the current allocation problem. When variables get evicted, the allocation problem becomes different and some split point may not be required anymore. However, the related variables may already have been colored. This usually shows up in the assembly with pattern like this: def A ... save A to B def A use A restore A from B ... use B Whereas we could simply have done: def B ... def A use A ... use B ** Proposed Solution ** A variable having a broken hint is marked for late recoloring if and only if selecting a register for it evict another variable. Indeed, if no eviction happens this is pointless to look for recoloring opportunities as it means the situation was the same as the initial allocation problem where we had to break the hint. Finally, when everything has been allocated, we look for recoloring opportunities for all the identified candidates. The recoloring is performed very late to rely on accurate copy cost (all involved variables are allocated). The recoloring is simple unlike the last change recoloring. It propagates the color of the broken hint to all its copy-related variables. If the color is available for them, the recoloring uses it, otherwise it gives up on that hint even if a more complex coloring would have worked. The recoloring happens only if it is profitable. The profitability is evaluated using the expected frequency of the copies of the currently recolored variable with a) its current color and b) with the target color. If a) is greater or equal than b), then it is profitable and the recoloring happen. ** Example ** Consider the following example: BB1: a = b = BB2: ... = b = a Let us assume b gets split: BB1: a = b = BB2: c = b ... d = c = d = a Because of how the allocation work, b, c, and d may be assigned different colors. Now, if a gets evicted to make room for c, assuming b and d were assigned to something different than a. We end up with: BB1: a = st a, SpillSlot b = BB2: c = b ... d = c = d e = ld SpillSlot = e This is likely that we can assign the same register for b, c, and d, getting rid of 2 copies. ** Performances ** Both ARM64 and x86_64 show performance improvements of up to 3% for the llvm-testsuite + externals with Os and O3. There are a few regressions too that comes from the (in)accuracy of the block frequency estimate. <rdar://problem/18312047> llvm-svn: 225422
2015-01-08 09:16:39 +08:00
/// \p PhysReg was used.
/// \return The cost of \p List for \p PhysReg.
BlockFrequency RAGreedy::getBrokenHintFreq(const HintsInfo &List,
MCRegister PhysReg) {
[RegAllocGreedy] Introduce a late pass to repair broken hints. A broken hint is a copy where both ends are assigned different colors. When a variable gets evicted in the neighborhood of such copies, it is likely we can reconcile some of them. ** Context ** Copies are inserted during the register allocation via splitting. These split points are required to relax the constraints on the allocation problem. When such a point is inserted, both ends of the copy would not share the same color with respect to the current allocation problem. When variables get evicted, the allocation problem becomes different and some split point may not be required anymore. However, the related variables may already have been colored. This usually shows up in the assembly with pattern like this: def A ... save A to B def A use A restore A from B ... use B Whereas we could simply have done: def B ... def A use A ... use B ** Proposed Solution ** A variable having a broken hint is marked for late recoloring if and only if selecting a register for it evict another variable. Indeed, if no eviction happens this is pointless to look for recoloring opportunities as it means the situation was the same as the initial allocation problem where we had to break the hint. Finally, when everything has been allocated, we look for recoloring opportunities for all the identified candidates. The recoloring is performed very late to rely on accurate copy cost (all involved variables are allocated). The recoloring is simple unlike the last change recoloring. It propagates the color of the broken hint to all its copy-related variables. If the color is available for them, the recoloring uses it, otherwise it gives up on that hint even if a more complex coloring would have worked. The recoloring happens only if it is profitable. The profitability is evaluated using the expected frequency of the copies of the currently recolored variable with a) its current color and b) with the target color. If a) is greater or equal than b), then it is profitable and the recoloring happen. ** Example ** Consider the following example: BB1: a = b = BB2: ... = b = a Let us assume b gets split: BB1: a = b = BB2: c = b ... d = c = d = a Because of how the allocation work, b, c, and d may be assigned different colors. Now, if a gets evicted to make room for c, assuming b and d were assigned to something different than a. We end up with: BB1: a = st a, SpillSlot b = BB2: c = b ... d = c = d e = ld SpillSlot = e This is likely that we can assign the same register for b, c, and d, getting rid of 2 copies. ** Performances ** Both ARM64 and x86_64 show performance improvements of up to 3% for the llvm-testsuite + externals with Os and O3. There are a few regressions too that comes from the (in)accuracy of the block frequency estimate. <rdar://problem/18312047> llvm-svn: 225422
2015-01-08 09:16:39 +08:00
BlockFrequency Cost = 0;
for (const HintInfo &Info : List) {
if (Info.PhysReg != PhysReg)
Cost += Info.Freq;
}
return Cost;
}
/// Using the register assigned to \p VirtReg, try to recolor
[RegAllocGreedy] Introduce a late pass to repair broken hints. A broken hint is a copy where both ends are assigned different colors. When a variable gets evicted in the neighborhood of such copies, it is likely we can reconcile some of them. ** Context ** Copies are inserted during the register allocation via splitting. These split points are required to relax the constraints on the allocation problem. When such a point is inserted, both ends of the copy would not share the same color with respect to the current allocation problem. When variables get evicted, the allocation problem becomes different and some split point may not be required anymore. However, the related variables may already have been colored. This usually shows up in the assembly with pattern like this: def A ... save A to B def A use A restore A from B ... use B Whereas we could simply have done: def B ... def A use A ... use B ** Proposed Solution ** A variable having a broken hint is marked for late recoloring if and only if selecting a register for it evict another variable. Indeed, if no eviction happens this is pointless to look for recoloring opportunities as it means the situation was the same as the initial allocation problem where we had to break the hint. Finally, when everything has been allocated, we look for recoloring opportunities for all the identified candidates. The recoloring is performed very late to rely on accurate copy cost (all involved variables are allocated). The recoloring is simple unlike the last change recoloring. It propagates the color of the broken hint to all its copy-related variables. If the color is available for them, the recoloring uses it, otherwise it gives up on that hint even if a more complex coloring would have worked. The recoloring happens only if it is profitable. The profitability is evaluated using the expected frequency of the copies of the currently recolored variable with a) its current color and b) with the target color. If a) is greater or equal than b), then it is profitable and the recoloring happen. ** Example ** Consider the following example: BB1: a = b = BB2: ... = b = a Let us assume b gets split: BB1: a = b = BB2: c = b ... d = c = d = a Because of how the allocation work, b, c, and d may be assigned different colors. Now, if a gets evicted to make room for c, assuming b and d were assigned to something different than a. We end up with: BB1: a = st a, SpillSlot b = BB2: c = b ... d = c = d e = ld SpillSlot = e This is likely that we can assign the same register for b, c, and d, getting rid of 2 copies. ** Performances ** Both ARM64 and x86_64 show performance improvements of up to 3% for the llvm-testsuite + externals with Os and O3. There are a few regressions too that comes from the (in)accuracy of the block frequency estimate. <rdar://problem/18312047> llvm-svn: 225422
2015-01-08 09:16:39 +08:00
/// all the live ranges that are copy-related with \p VirtReg.
/// The recoloring is then propagated to all the live-ranges that have
/// been recolored and so on, until no more copies can be coalesced or
/// it is not profitable.
/// For a given live range, profitability is determined by the sum of the
/// frequencies of the non-identity copies it would introduce with the old
/// and new register.
void RAGreedy::tryHintRecoloring(const LiveInterval &VirtReg) {
[RegAllocGreedy] Introduce a late pass to repair broken hints. A broken hint is a copy where both ends are assigned different colors. When a variable gets evicted in the neighborhood of such copies, it is likely we can reconcile some of them. ** Context ** Copies are inserted during the register allocation via splitting. These split points are required to relax the constraints on the allocation problem. When such a point is inserted, both ends of the copy would not share the same color with respect to the current allocation problem. When variables get evicted, the allocation problem becomes different and some split point may not be required anymore. However, the related variables may already have been colored. This usually shows up in the assembly with pattern like this: def A ... save A to B def A use A restore A from B ... use B Whereas we could simply have done: def B ... def A use A ... use B ** Proposed Solution ** A variable having a broken hint is marked for late recoloring if and only if selecting a register for it evict another variable. Indeed, if no eviction happens this is pointless to look for recoloring opportunities as it means the situation was the same as the initial allocation problem where we had to break the hint. Finally, when everything has been allocated, we look for recoloring opportunities for all the identified candidates. The recoloring is performed very late to rely on accurate copy cost (all involved variables are allocated). The recoloring is simple unlike the last change recoloring. It propagates the color of the broken hint to all its copy-related variables. If the color is available for them, the recoloring uses it, otherwise it gives up on that hint even if a more complex coloring would have worked. The recoloring happens only if it is profitable. The profitability is evaluated using the expected frequency of the copies of the currently recolored variable with a) its current color and b) with the target color. If a) is greater or equal than b), then it is profitable and the recoloring happen. ** Example ** Consider the following example: BB1: a = b = BB2: ... = b = a Let us assume b gets split: BB1: a = b = BB2: c = b ... d = c = d = a Because of how the allocation work, b, c, and d may be assigned different colors. Now, if a gets evicted to make room for c, assuming b and d were assigned to something different than a. We end up with: BB1: a = st a, SpillSlot b = BB2: c = b ... d = c = d e = ld SpillSlot = e This is likely that we can assign the same register for b, c, and d, getting rid of 2 copies. ** Performances ** Both ARM64 and x86_64 show performance improvements of up to 3% for the llvm-testsuite + externals with Os and O3. There are a few regressions too that comes from the (in)accuracy of the block frequency estimate. <rdar://problem/18312047> llvm-svn: 225422
2015-01-08 09:16:39 +08:00
// We have a broken hint, check if it is possible to fix it by
// reusing PhysReg for the copy-related live-ranges. Indeed, we evicted
// some register and PhysReg may be available for the other live-ranges.
SmallSet<Register, 4> Visited;
[RegAllocGreedy] Introduce a late pass to repair broken hints. A broken hint is a copy where both ends are assigned different colors. When a variable gets evicted in the neighborhood of such copies, it is likely we can reconcile some of them. ** Context ** Copies are inserted during the register allocation via splitting. These split points are required to relax the constraints on the allocation problem. When such a point is inserted, both ends of the copy would not share the same color with respect to the current allocation problem. When variables get evicted, the allocation problem becomes different and some split point may not be required anymore. However, the related variables may already have been colored. This usually shows up in the assembly with pattern like this: def A ... save A to B def A use A restore A from B ... use B Whereas we could simply have done: def B ... def A use A ... use B ** Proposed Solution ** A variable having a broken hint is marked for late recoloring if and only if selecting a register for it evict another variable. Indeed, if no eviction happens this is pointless to look for recoloring opportunities as it means the situation was the same as the initial allocation problem where we had to break the hint. Finally, when everything has been allocated, we look for recoloring opportunities for all the identified candidates. The recoloring is performed very late to rely on accurate copy cost (all involved variables are allocated). The recoloring is simple unlike the last change recoloring. It propagates the color of the broken hint to all its copy-related variables. If the color is available for them, the recoloring uses it, otherwise it gives up on that hint even if a more complex coloring would have worked. The recoloring happens only if it is profitable. The profitability is evaluated using the expected frequency of the copies of the currently recolored variable with a) its current color and b) with the target color. If a) is greater or equal than b), then it is profitable and the recoloring happen. ** Example ** Consider the following example: BB1: a = b = BB2: ... = b = a Let us assume b gets split: BB1: a = b = BB2: c = b ... d = c = d = a Because of how the allocation work, b, c, and d may be assigned different colors. Now, if a gets evicted to make room for c, assuming b and d were assigned to something different than a. We end up with: BB1: a = st a, SpillSlot b = BB2: c = b ... d = c = d e = ld SpillSlot = e This is likely that we can assign the same register for b, c, and d, getting rid of 2 copies. ** Performances ** Both ARM64 and x86_64 show performance improvements of up to 3% for the llvm-testsuite + externals with Os and O3. There are a few regressions too that comes from the (in)accuracy of the block frequency estimate. <rdar://problem/18312047> llvm-svn: 225422
2015-01-08 09:16:39 +08:00
SmallVector<unsigned, 2> RecoloringCandidates;
HintsInfo Info;
Register Reg = VirtReg.reg();
MCRegister PhysReg = VRM->getPhys(Reg);
[RegAllocGreedy] Introduce a late pass to repair broken hints. A broken hint is a copy where both ends are assigned different colors. When a variable gets evicted in the neighborhood of such copies, it is likely we can reconcile some of them. ** Context ** Copies are inserted during the register allocation via splitting. These split points are required to relax the constraints on the allocation problem. When such a point is inserted, both ends of the copy would not share the same color with respect to the current allocation problem. When variables get evicted, the allocation problem becomes different and some split point may not be required anymore. However, the related variables may already have been colored. This usually shows up in the assembly with pattern like this: def A ... save A to B def A use A restore A from B ... use B Whereas we could simply have done: def B ... def A use A ... use B ** Proposed Solution ** A variable having a broken hint is marked for late recoloring if and only if selecting a register for it evict another variable. Indeed, if no eviction happens this is pointless to look for recoloring opportunities as it means the situation was the same as the initial allocation problem where we had to break the hint. Finally, when everything has been allocated, we look for recoloring opportunities for all the identified candidates. The recoloring is performed very late to rely on accurate copy cost (all involved variables are allocated). The recoloring is simple unlike the last change recoloring. It propagates the color of the broken hint to all its copy-related variables. If the color is available for them, the recoloring uses it, otherwise it gives up on that hint even if a more complex coloring would have worked. The recoloring happens only if it is profitable. The profitability is evaluated using the expected frequency of the copies of the currently recolored variable with a) its current color and b) with the target color. If a) is greater or equal than b), then it is profitable and the recoloring happen. ** Example ** Consider the following example: BB1: a = b = BB2: ... = b = a Let us assume b gets split: BB1: a = b = BB2: c = b ... d = c = d = a Because of how the allocation work, b, c, and d may be assigned different colors. Now, if a gets evicted to make room for c, assuming b and d were assigned to something different than a. We end up with: BB1: a = st a, SpillSlot b = BB2: c = b ... d = c = d e = ld SpillSlot = e This is likely that we can assign the same register for b, c, and d, getting rid of 2 copies. ** Performances ** Both ARM64 and x86_64 show performance improvements of up to 3% for the llvm-testsuite + externals with Os and O3. There are a few regressions too that comes from the (in)accuracy of the block frequency estimate. <rdar://problem/18312047> llvm-svn: 225422
2015-01-08 09:16:39 +08:00
// Start the recoloring algorithm from the input live-interval, then
// it will propagate to the ones that are copy-related with it.
Visited.insert(Reg);
RecoloringCandidates.push_back(Reg);
LLVM_DEBUG(dbgs() << "Trying to reconcile hints for: " << printReg(Reg, TRI)
<< '(' << printReg(PhysReg, TRI) << ")\n");
[RegAllocGreedy] Introduce a late pass to repair broken hints. A broken hint is a copy where both ends are assigned different colors. When a variable gets evicted in the neighborhood of such copies, it is likely we can reconcile some of them. ** Context ** Copies are inserted during the register allocation via splitting. These split points are required to relax the constraints on the allocation problem. When such a point is inserted, both ends of the copy would not share the same color with respect to the current allocation problem. When variables get evicted, the allocation problem becomes different and some split point may not be required anymore. However, the related variables may already have been colored. This usually shows up in the assembly with pattern like this: def A ... save A to B def A use A restore A from B ... use B Whereas we could simply have done: def B ... def A use A ... use B ** Proposed Solution ** A variable having a broken hint is marked for late recoloring if and only if selecting a register for it evict another variable. Indeed, if no eviction happens this is pointless to look for recoloring opportunities as it means the situation was the same as the initial allocation problem where we had to break the hint. Finally, when everything has been allocated, we look for recoloring opportunities for all the identified candidates. The recoloring is performed very late to rely on accurate copy cost (all involved variables are allocated). The recoloring is simple unlike the last change recoloring. It propagates the color of the broken hint to all its copy-related variables. If the color is available for them, the recoloring uses it, otherwise it gives up on that hint even if a more complex coloring would have worked. The recoloring happens only if it is profitable. The profitability is evaluated using the expected frequency of the copies of the currently recolored variable with a) its current color and b) with the target color. If a) is greater or equal than b), then it is profitable and the recoloring happen. ** Example ** Consider the following example: BB1: a = b = BB2: ... = b = a Let us assume b gets split: BB1: a = b = BB2: c = b ... d = c = d = a Because of how the allocation work, b, c, and d may be assigned different colors. Now, if a gets evicted to make room for c, assuming b and d were assigned to something different than a. We end up with: BB1: a = st a, SpillSlot b = BB2: c = b ... d = c = d e = ld SpillSlot = e This is likely that we can assign the same register for b, c, and d, getting rid of 2 copies. ** Performances ** Both ARM64 and x86_64 show performance improvements of up to 3% for the llvm-testsuite + externals with Os and O3. There are a few regressions too that comes from the (in)accuracy of the block frequency estimate. <rdar://problem/18312047> llvm-svn: 225422
2015-01-08 09:16:39 +08:00
do {
Reg = RecoloringCandidates.pop_back_val();
// We cannot recolor physical register.
if (Register::isPhysicalRegister(Reg))
[RegAllocGreedy] Introduce a late pass to repair broken hints. A broken hint is a copy where both ends are assigned different colors. When a variable gets evicted in the neighborhood of such copies, it is likely we can reconcile some of them. ** Context ** Copies are inserted during the register allocation via splitting. These split points are required to relax the constraints on the allocation problem. When such a point is inserted, both ends of the copy would not share the same color with respect to the current allocation problem. When variables get evicted, the allocation problem becomes different and some split point may not be required anymore. However, the related variables may already have been colored. This usually shows up in the assembly with pattern like this: def A ... save A to B def A use A restore A from B ... use B Whereas we could simply have done: def B ... def A use A ... use B ** Proposed Solution ** A variable having a broken hint is marked for late recoloring if and only if selecting a register for it evict another variable. Indeed, if no eviction happens this is pointless to look for recoloring opportunities as it means the situation was the same as the initial allocation problem where we had to break the hint. Finally, when everything has been allocated, we look for recoloring opportunities for all the identified candidates. The recoloring is performed very late to rely on accurate copy cost (all involved variables are allocated). The recoloring is simple unlike the last change recoloring. It propagates the color of the broken hint to all its copy-related variables. If the color is available for them, the recoloring uses it, otherwise it gives up on that hint even if a more complex coloring would have worked. The recoloring happens only if it is profitable. The profitability is evaluated using the expected frequency of the copies of the currently recolored variable with a) its current color and b) with the target color. If a) is greater or equal than b), then it is profitable and the recoloring happen. ** Example ** Consider the following example: BB1: a = b = BB2: ... = b = a Let us assume b gets split: BB1: a = b = BB2: c = b ... d = c = d = a Because of how the allocation work, b, c, and d may be assigned different colors. Now, if a gets evicted to make room for c, assuming b and d were assigned to something different than a. We end up with: BB1: a = st a, SpillSlot b = BB2: c = b ... d = c = d e = ld SpillSlot = e This is likely that we can assign the same register for b, c, and d, getting rid of 2 copies. ** Performances ** Both ARM64 and x86_64 show performance improvements of up to 3% for the llvm-testsuite + externals with Os and O3. There are a few regressions too that comes from the (in)accuracy of the block frequency estimate. <rdar://problem/18312047> llvm-svn: 225422
2015-01-08 09:16:39 +08:00
continue;
RegAlloc: Allow targets to split register allocation AMDGPU normally spills SGPRs to VGPRs. Previously, since all register classes are handled at the same time, this was problematic. We don't know ahead of time how many registers will be needed to be reserved to handle the spilling. If no VGPRs were left for spilling, we would have to try to spill to memory. If the spilled SGPRs were required for exec mask manipulation, it is highly problematic because the lanes active at the point of spill are not necessarily the same as at the restore point. Avoid this problem by fully allocating SGPRs in a separate regalloc run from VGPRs. This way we know the exact number of VGPRs needed, and can reserve them for a second run. This fixes the most serious issues, but it is still possible using inline asm to make all VGPRs unavailable. Start erroring in the case where we ever would require memory for an SGPR spill. This is implemented by giving each regalloc pass a callback which reports if a register class should be handled or not. A few passes need some small changes to deal with leftover virtual registers. In the AMDGPU implementation, a new pass is introduced to take the place of PrologEpilogInserter for SGPR spills emitted during the first run. One disadvantage of this is currently StackSlotColoring is no longer used for SGPR spills. It would need to be run again, which will require more work. Error if the standard -regalloc option is used. Introduce new separate -sgpr-regalloc and -vgpr-regalloc flags, so the two runs can be controlled individually. PBQB is not currently supported, so this also prevents using the unhandled allocator.
2018-09-27 07:36:28 +08:00
// This may be a skipped class
if (!VRM->hasPhys(Reg)) {
assert(!ShouldAllocateClass(*TRI, *MRI->getRegClass(Reg)) &&
"We have an unallocated variable which should have been handled");
continue;
}
[RegAllocGreedy] Introduce a late pass to repair broken hints. A broken hint is a copy where both ends are assigned different colors. When a variable gets evicted in the neighborhood of such copies, it is likely we can reconcile some of them. ** Context ** Copies are inserted during the register allocation via splitting. These split points are required to relax the constraints on the allocation problem. When such a point is inserted, both ends of the copy would not share the same color with respect to the current allocation problem. When variables get evicted, the allocation problem becomes different and some split point may not be required anymore. However, the related variables may already have been colored. This usually shows up in the assembly with pattern like this: def A ... save A to B def A use A restore A from B ... use B Whereas we could simply have done: def B ... def A use A ... use B ** Proposed Solution ** A variable having a broken hint is marked for late recoloring if and only if selecting a register for it evict another variable. Indeed, if no eviction happens this is pointless to look for recoloring opportunities as it means the situation was the same as the initial allocation problem where we had to break the hint. Finally, when everything has been allocated, we look for recoloring opportunities for all the identified candidates. The recoloring is performed very late to rely on accurate copy cost (all involved variables are allocated). The recoloring is simple unlike the last change recoloring. It propagates the color of the broken hint to all its copy-related variables. If the color is available for them, the recoloring uses it, otherwise it gives up on that hint even if a more complex coloring would have worked. The recoloring happens only if it is profitable. The profitability is evaluated using the expected frequency of the copies of the currently recolored variable with a) its current color and b) with the target color. If a) is greater or equal than b), then it is profitable and the recoloring happen. ** Example ** Consider the following example: BB1: a = b = BB2: ... = b = a Let us assume b gets split: BB1: a = b = BB2: c = b ... d = c = d = a Because of how the allocation work, b, c, and d may be assigned different colors. Now, if a gets evicted to make room for c, assuming b and d were assigned to something different than a. We end up with: BB1: a = st a, SpillSlot b = BB2: c = b ... d = c = d e = ld SpillSlot = e This is likely that we can assign the same register for b, c, and d, getting rid of 2 copies. ** Performances ** Both ARM64 and x86_64 show performance improvements of up to 3% for the llvm-testsuite + externals with Os and O3. There are a few regressions too that comes from the (in)accuracy of the block frequency estimate. <rdar://problem/18312047> llvm-svn: 225422
2015-01-08 09:16:39 +08:00
// Get the live interval mapped with this virtual register to be able
// to check for the interference with the new color.
LiveInterval &LI = LIS->getInterval(Reg);
MCRegister CurrPhys = VRM->getPhys(Reg);
[RegAllocGreedy] Introduce a late pass to repair broken hints. A broken hint is a copy where both ends are assigned different colors. When a variable gets evicted in the neighborhood of such copies, it is likely we can reconcile some of them. ** Context ** Copies are inserted during the register allocation via splitting. These split points are required to relax the constraints on the allocation problem. When such a point is inserted, both ends of the copy would not share the same color with respect to the current allocation problem. When variables get evicted, the allocation problem becomes different and some split point may not be required anymore. However, the related variables may already have been colored. This usually shows up in the assembly with pattern like this: def A ... save A to B def A use A restore A from B ... use B Whereas we could simply have done: def B ... def A use A ... use B ** Proposed Solution ** A variable having a broken hint is marked for late recoloring if and only if selecting a register for it evict another variable. Indeed, if no eviction happens this is pointless to look for recoloring opportunities as it means the situation was the same as the initial allocation problem where we had to break the hint. Finally, when everything has been allocated, we look for recoloring opportunities for all the identified candidates. The recoloring is performed very late to rely on accurate copy cost (all involved variables are allocated). The recoloring is simple unlike the last change recoloring. It propagates the color of the broken hint to all its copy-related variables. If the color is available for them, the recoloring uses it, otherwise it gives up on that hint even if a more complex coloring would have worked. The recoloring happens only if it is profitable. The profitability is evaluated using the expected frequency of the copies of the currently recolored variable with a) its current color and b) with the target color. If a) is greater or equal than b), then it is profitable and the recoloring happen. ** Example ** Consider the following example: BB1: a = b = BB2: ... = b = a Let us assume b gets split: BB1: a = b = BB2: c = b ... d = c = d = a Because of how the allocation work, b, c, and d may be assigned different colors. Now, if a gets evicted to make room for c, assuming b and d were assigned to something different than a. We end up with: BB1: a = st a, SpillSlot b = BB2: c = b ... d = c = d e = ld SpillSlot = e This is likely that we can assign the same register for b, c, and d, getting rid of 2 copies. ** Performances ** Both ARM64 and x86_64 show performance improvements of up to 3% for the llvm-testsuite + externals with Os and O3. There are a few regressions too that comes from the (in)accuracy of the block frequency estimate. <rdar://problem/18312047> llvm-svn: 225422
2015-01-08 09:16:39 +08:00
// Check that the new color matches the register class constraints and
// that it is free for this live range.
if (CurrPhys != PhysReg && (!MRI->getRegClass(Reg)->contains(PhysReg) ||
Matrix->checkInterference(LI, PhysReg)))
continue;
LLVM_DEBUG(dbgs() << printReg(Reg, TRI) << '(' << printReg(CurrPhys, TRI)
<< ") is recolorable.\n");
[RegAllocGreedy] Introduce a late pass to repair broken hints. A broken hint is a copy where both ends are assigned different colors. When a variable gets evicted in the neighborhood of such copies, it is likely we can reconcile some of them. ** Context ** Copies are inserted during the register allocation via splitting. These split points are required to relax the constraints on the allocation problem. When such a point is inserted, both ends of the copy would not share the same color with respect to the current allocation problem. When variables get evicted, the allocation problem becomes different and some split point may not be required anymore. However, the related variables may already have been colored. This usually shows up in the assembly with pattern like this: def A ... save A to B def A use A restore A from B ... use B Whereas we could simply have done: def B ... def A use A ... use B ** Proposed Solution ** A variable having a broken hint is marked for late recoloring if and only if selecting a register for it evict another variable. Indeed, if no eviction happens this is pointless to look for recoloring opportunities as it means the situation was the same as the initial allocation problem where we had to break the hint. Finally, when everything has been allocated, we look for recoloring opportunities for all the identified candidates. The recoloring is performed very late to rely on accurate copy cost (all involved variables are allocated). The recoloring is simple unlike the last change recoloring. It propagates the color of the broken hint to all its copy-related variables. If the color is available for them, the recoloring uses it, otherwise it gives up on that hint even if a more complex coloring would have worked. The recoloring happens only if it is profitable. The profitability is evaluated using the expected frequency of the copies of the currently recolored variable with a) its current color and b) with the target color. If a) is greater or equal than b), then it is profitable and the recoloring happen. ** Example ** Consider the following example: BB1: a = b = BB2: ... = b = a Let us assume b gets split: BB1: a = b = BB2: c = b ... d = c = d = a Because of how the allocation work, b, c, and d may be assigned different colors. Now, if a gets evicted to make room for c, assuming b and d were assigned to something different than a. We end up with: BB1: a = st a, SpillSlot b = BB2: c = b ... d = c = d e = ld SpillSlot = e This is likely that we can assign the same register for b, c, and d, getting rid of 2 copies. ** Performances ** Both ARM64 and x86_64 show performance improvements of up to 3% for the llvm-testsuite + externals with Os and O3. There are a few regressions too that comes from the (in)accuracy of the block frequency estimate. <rdar://problem/18312047> llvm-svn: 225422
2015-01-08 09:16:39 +08:00
// Gather the hint info.
Info.clear();
collectHintInfo(Reg, Info);
// Check if recoloring the live-range will increase the cost of the
// non-identity copies.
if (CurrPhys != PhysReg) {
LLVM_DEBUG(dbgs() << "Checking profitability:\n");
[RegAllocGreedy] Introduce a late pass to repair broken hints. A broken hint is a copy where both ends are assigned different colors. When a variable gets evicted in the neighborhood of such copies, it is likely we can reconcile some of them. ** Context ** Copies are inserted during the register allocation via splitting. These split points are required to relax the constraints on the allocation problem. When such a point is inserted, both ends of the copy would not share the same color with respect to the current allocation problem. When variables get evicted, the allocation problem becomes different and some split point may not be required anymore. However, the related variables may already have been colored. This usually shows up in the assembly with pattern like this: def A ... save A to B def A use A restore A from B ... use B Whereas we could simply have done: def B ... def A use A ... use B ** Proposed Solution ** A variable having a broken hint is marked for late recoloring if and only if selecting a register for it evict another variable. Indeed, if no eviction happens this is pointless to look for recoloring opportunities as it means the situation was the same as the initial allocation problem where we had to break the hint. Finally, when everything has been allocated, we look for recoloring opportunities for all the identified candidates. The recoloring is performed very late to rely on accurate copy cost (all involved variables are allocated). The recoloring is simple unlike the last change recoloring. It propagates the color of the broken hint to all its copy-related variables. If the color is available for them, the recoloring uses it, otherwise it gives up on that hint even if a more complex coloring would have worked. The recoloring happens only if it is profitable. The profitability is evaluated using the expected frequency of the copies of the currently recolored variable with a) its current color and b) with the target color. If a) is greater or equal than b), then it is profitable and the recoloring happen. ** Example ** Consider the following example: BB1: a = b = BB2: ... = b = a Let us assume b gets split: BB1: a = b = BB2: c = b ... d = c = d = a Because of how the allocation work, b, c, and d may be assigned different colors. Now, if a gets evicted to make room for c, assuming b and d were assigned to something different than a. We end up with: BB1: a = st a, SpillSlot b = BB2: c = b ... d = c = d e = ld SpillSlot = e This is likely that we can assign the same register for b, c, and d, getting rid of 2 copies. ** Performances ** Both ARM64 and x86_64 show performance improvements of up to 3% for the llvm-testsuite + externals with Os and O3. There are a few regressions too that comes from the (in)accuracy of the block frequency estimate. <rdar://problem/18312047> llvm-svn: 225422
2015-01-08 09:16:39 +08:00
BlockFrequency OldCopiesCost = getBrokenHintFreq(Info, CurrPhys);
BlockFrequency NewCopiesCost = getBrokenHintFreq(Info, PhysReg);
LLVM_DEBUG(dbgs() << "Old Cost: " << OldCopiesCost.getFrequency()
<< "\nNew Cost: " << NewCopiesCost.getFrequency()
<< '\n');
[RegAllocGreedy] Introduce a late pass to repair broken hints. A broken hint is a copy where both ends are assigned different colors. When a variable gets evicted in the neighborhood of such copies, it is likely we can reconcile some of them. ** Context ** Copies are inserted during the register allocation via splitting. These split points are required to relax the constraints on the allocation problem. When such a point is inserted, both ends of the copy would not share the same color with respect to the current allocation problem. When variables get evicted, the allocation problem becomes different and some split point may not be required anymore. However, the related variables may already have been colored. This usually shows up in the assembly with pattern like this: def A ... save A to B def A use A restore A from B ... use B Whereas we could simply have done: def B ... def A use A ... use B ** Proposed Solution ** A variable having a broken hint is marked for late recoloring if and only if selecting a register for it evict another variable. Indeed, if no eviction happens this is pointless to look for recoloring opportunities as it means the situation was the same as the initial allocation problem where we had to break the hint. Finally, when everything has been allocated, we look for recoloring opportunities for all the identified candidates. The recoloring is performed very late to rely on accurate copy cost (all involved variables are allocated). The recoloring is simple unlike the last change recoloring. It propagates the color of the broken hint to all its copy-related variables. If the color is available for them, the recoloring uses it, otherwise it gives up on that hint even if a more complex coloring would have worked. The recoloring happens only if it is profitable. The profitability is evaluated using the expected frequency of the copies of the currently recolored variable with a) its current color and b) with the target color. If a) is greater or equal than b), then it is profitable and the recoloring happen. ** Example ** Consider the following example: BB1: a = b = BB2: ... = b = a Let us assume b gets split: BB1: a = b = BB2: c = b ... d = c = d = a Because of how the allocation work, b, c, and d may be assigned different colors. Now, if a gets evicted to make room for c, assuming b and d were assigned to something different than a. We end up with: BB1: a = st a, SpillSlot b = BB2: c = b ... d = c = d e = ld SpillSlot = e This is likely that we can assign the same register for b, c, and d, getting rid of 2 copies. ** Performances ** Both ARM64 and x86_64 show performance improvements of up to 3% for the llvm-testsuite + externals with Os and O3. There are a few regressions too that comes from the (in)accuracy of the block frequency estimate. <rdar://problem/18312047> llvm-svn: 225422
2015-01-08 09:16:39 +08:00
if (OldCopiesCost < NewCopiesCost) {
LLVM_DEBUG(dbgs() << "=> Not profitable.\n");
[RegAllocGreedy] Introduce a late pass to repair broken hints. A broken hint is a copy where both ends are assigned different colors. When a variable gets evicted in the neighborhood of such copies, it is likely we can reconcile some of them. ** Context ** Copies are inserted during the register allocation via splitting. These split points are required to relax the constraints on the allocation problem. When such a point is inserted, both ends of the copy would not share the same color with respect to the current allocation problem. When variables get evicted, the allocation problem becomes different and some split point may not be required anymore. However, the related variables may already have been colored. This usually shows up in the assembly with pattern like this: def A ... save A to B def A use A restore A from B ... use B Whereas we could simply have done: def B ... def A use A ... use B ** Proposed Solution ** A variable having a broken hint is marked for late recoloring if and only if selecting a register for it evict another variable. Indeed, if no eviction happens this is pointless to look for recoloring opportunities as it means the situation was the same as the initial allocation problem where we had to break the hint. Finally, when everything has been allocated, we look for recoloring opportunities for all the identified candidates. The recoloring is performed very late to rely on accurate copy cost (all involved variables are allocated). The recoloring is simple unlike the last change recoloring. It propagates the color of the broken hint to all its copy-related variables. If the color is available for them, the recoloring uses it, otherwise it gives up on that hint even if a more complex coloring would have worked. The recoloring happens only if it is profitable. The profitability is evaluated using the expected frequency of the copies of the currently recolored variable with a) its current color and b) with the target color. If a) is greater or equal than b), then it is profitable and the recoloring happen. ** Example ** Consider the following example: BB1: a = b = BB2: ... = b = a Let us assume b gets split: BB1: a = b = BB2: c = b ... d = c = d = a Because of how the allocation work, b, c, and d may be assigned different colors. Now, if a gets evicted to make room for c, assuming b and d were assigned to something different than a. We end up with: BB1: a = st a, SpillSlot b = BB2: c = b ... d = c = d e = ld SpillSlot = e This is likely that we can assign the same register for b, c, and d, getting rid of 2 copies. ** Performances ** Both ARM64 and x86_64 show performance improvements of up to 3% for the llvm-testsuite + externals with Os and O3. There are a few regressions too that comes from the (in)accuracy of the block frequency estimate. <rdar://problem/18312047> llvm-svn: 225422
2015-01-08 09:16:39 +08:00
continue;
}
// At this point, the cost is either cheaper or equal. If it is
// equal, we consider this is profitable because it may expose
// more recoloring opportunities.
LLVM_DEBUG(dbgs() << "=> Profitable.\n");
[RegAllocGreedy] Introduce a late pass to repair broken hints. A broken hint is a copy where both ends are assigned different colors. When a variable gets evicted in the neighborhood of such copies, it is likely we can reconcile some of them. ** Context ** Copies are inserted during the register allocation via splitting. These split points are required to relax the constraints on the allocation problem. When such a point is inserted, both ends of the copy would not share the same color with respect to the current allocation problem. When variables get evicted, the allocation problem becomes different and some split point may not be required anymore. However, the related variables may already have been colored. This usually shows up in the assembly with pattern like this: def A ... save A to B def A use A restore A from B ... use B Whereas we could simply have done: def B ... def A use A ... use B ** Proposed Solution ** A variable having a broken hint is marked for late recoloring if and only if selecting a register for it evict another variable. Indeed, if no eviction happens this is pointless to look for recoloring opportunities as it means the situation was the same as the initial allocation problem where we had to break the hint. Finally, when everything has been allocated, we look for recoloring opportunities for all the identified candidates. The recoloring is performed very late to rely on accurate copy cost (all involved variables are allocated). The recoloring is simple unlike the last change recoloring. It propagates the color of the broken hint to all its copy-related variables. If the color is available for them, the recoloring uses it, otherwise it gives up on that hint even if a more complex coloring would have worked. The recoloring happens only if it is profitable. The profitability is evaluated using the expected frequency of the copies of the currently recolored variable with a) its current color and b) with the target color. If a) is greater or equal than b), then it is profitable and the recoloring happen. ** Example ** Consider the following example: BB1: a = b = BB2: ... = b = a Let us assume b gets split: BB1: a = b = BB2: c = b ... d = c = d = a Because of how the allocation work, b, c, and d may be assigned different colors. Now, if a gets evicted to make room for c, assuming b and d were assigned to something different than a. We end up with: BB1: a = st a, SpillSlot b = BB2: c = b ... d = c = d e = ld SpillSlot = e This is likely that we can assign the same register for b, c, and d, getting rid of 2 copies. ** Performances ** Both ARM64 and x86_64 show performance improvements of up to 3% for the llvm-testsuite + externals with Os and O3. There are a few regressions too that comes from the (in)accuracy of the block frequency estimate. <rdar://problem/18312047> llvm-svn: 225422
2015-01-08 09:16:39 +08:00
// Recolor the live-range.
Matrix->unassign(LI);
Matrix->assign(LI, PhysReg);
}
// Push all copy-related live-ranges to keep reconciling the broken
// hints.
for (const HintInfo &HI : Info) {
if (Visited.insert(HI.Reg).second)
RecoloringCandidates.push_back(HI.Reg);
}
} while (!RecoloringCandidates.empty());
}
/// Try to recolor broken hints.
[RegAllocGreedy] Introduce a late pass to repair broken hints. A broken hint is a copy where both ends are assigned different colors. When a variable gets evicted in the neighborhood of such copies, it is likely we can reconcile some of them. ** Context ** Copies are inserted during the register allocation via splitting. These split points are required to relax the constraints on the allocation problem. When such a point is inserted, both ends of the copy would not share the same color with respect to the current allocation problem. When variables get evicted, the allocation problem becomes different and some split point may not be required anymore. However, the related variables may already have been colored. This usually shows up in the assembly with pattern like this: def A ... save A to B def A use A restore A from B ... use B Whereas we could simply have done: def B ... def A use A ... use B ** Proposed Solution ** A variable having a broken hint is marked for late recoloring if and only if selecting a register for it evict another variable. Indeed, if no eviction happens this is pointless to look for recoloring opportunities as it means the situation was the same as the initial allocation problem where we had to break the hint. Finally, when everything has been allocated, we look for recoloring opportunities for all the identified candidates. The recoloring is performed very late to rely on accurate copy cost (all involved variables are allocated). The recoloring is simple unlike the last change recoloring. It propagates the color of the broken hint to all its copy-related variables. If the color is available for them, the recoloring uses it, otherwise it gives up on that hint even if a more complex coloring would have worked. The recoloring happens only if it is profitable. The profitability is evaluated using the expected frequency of the copies of the currently recolored variable with a) its current color and b) with the target color. If a) is greater or equal than b), then it is profitable and the recoloring happen. ** Example ** Consider the following example: BB1: a = b = BB2: ... = b = a Let us assume b gets split: BB1: a = b = BB2: c = b ... d = c = d = a Because of how the allocation work, b, c, and d may be assigned different colors. Now, if a gets evicted to make room for c, assuming b and d were assigned to something different than a. We end up with: BB1: a = st a, SpillSlot b = BB2: c = b ... d = c = d e = ld SpillSlot = e This is likely that we can assign the same register for b, c, and d, getting rid of 2 copies. ** Performances ** Both ARM64 and x86_64 show performance improvements of up to 3% for the llvm-testsuite + externals with Os and O3. There are a few regressions too that comes from the (in)accuracy of the block frequency estimate. <rdar://problem/18312047> llvm-svn: 225422
2015-01-08 09:16:39 +08:00
/// Broken hints may be repaired by recoloring when an evicted variable
/// freed up a register for a larger live-range.
/// Consider the following example:
/// BB1:
/// a =
/// b =
/// BB2:
/// ...
/// = b
/// = a
/// Let us assume b gets split:
/// BB1:
/// a =
/// b =
/// BB2:
/// c = b
/// ...
/// d = c
/// = d
/// = a
/// Because of how the allocation work, b, c, and d may be assigned different
/// colors. Now, if a gets evicted later:
/// BB1:
/// a =
/// st a, SpillSlot
/// b =
/// BB2:
/// c = b
/// ...
/// d = c
/// = d
/// e = ld SpillSlot
/// = e
/// This is likely that we can assign the same register for b, c, and d,
/// getting rid of 2 copies.
void RAGreedy::tryHintsRecoloring() {
for (const LiveInterval *LI : SetOfBrokenHints) {
assert(Register::isVirtualRegister(LI->reg()) &&
[RegAllocGreedy] Introduce a late pass to repair broken hints. A broken hint is a copy where both ends are assigned different colors. When a variable gets evicted in the neighborhood of such copies, it is likely we can reconcile some of them. ** Context ** Copies are inserted during the register allocation via splitting. These split points are required to relax the constraints on the allocation problem. When such a point is inserted, both ends of the copy would not share the same color with respect to the current allocation problem. When variables get evicted, the allocation problem becomes different and some split point may not be required anymore. However, the related variables may already have been colored. This usually shows up in the assembly with pattern like this: def A ... save A to B def A use A restore A from B ... use B Whereas we could simply have done: def B ... def A use A ... use B ** Proposed Solution ** A variable having a broken hint is marked for late recoloring if and only if selecting a register for it evict another variable. Indeed, if no eviction happens this is pointless to look for recoloring opportunities as it means the situation was the same as the initial allocation problem where we had to break the hint. Finally, when everything has been allocated, we look for recoloring opportunities for all the identified candidates. The recoloring is performed very late to rely on accurate copy cost (all involved variables are allocated). The recoloring is simple unlike the last change recoloring. It propagates the color of the broken hint to all its copy-related variables. If the color is available for them, the recoloring uses it, otherwise it gives up on that hint even if a more complex coloring would have worked. The recoloring happens only if it is profitable. The profitability is evaluated using the expected frequency of the copies of the currently recolored variable with a) its current color and b) with the target color. If a) is greater or equal than b), then it is profitable and the recoloring happen. ** Example ** Consider the following example: BB1: a = b = BB2: ... = b = a Let us assume b gets split: BB1: a = b = BB2: c = b ... d = c = d = a Because of how the allocation work, b, c, and d may be assigned different colors. Now, if a gets evicted to make room for c, assuming b and d were assigned to something different than a. We end up with: BB1: a = st a, SpillSlot b = BB2: c = b ... d = c = d e = ld SpillSlot = e This is likely that we can assign the same register for b, c, and d, getting rid of 2 copies. ** Performances ** Both ARM64 and x86_64 show performance improvements of up to 3% for the llvm-testsuite + externals with Os and O3. There are a few regressions too that comes from the (in)accuracy of the block frequency estimate. <rdar://problem/18312047> llvm-svn: 225422
2015-01-08 09:16:39 +08:00
"Recoloring is possible only for virtual registers");
// Some dead defs may be around (e.g., because of debug uses).
// Ignore those.
if (!VRM->hasPhys(LI->reg()))
[RegAllocGreedy] Introduce a late pass to repair broken hints. A broken hint is a copy where both ends are assigned different colors. When a variable gets evicted in the neighborhood of such copies, it is likely we can reconcile some of them. ** Context ** Copies are inserted during the register allocation via splitting. These split points are required to relax the constraints on the allocation problem. When such a point is inserted, both ends of the copy would not share the same color with respect to the current allocation problem. When variables get evicted, the allocation problem becomes different and some split point may not be required anymore. However, the related variables may already have been colored. This usually shows up in the assembly with pattern like this: def A ... save A to B def A use A restore A from B ... use B Whereas we could simply have done: def B ... def A use A ... use B ** Proposed Solution ** A variable having a broken hint is marked for late recoloring if and only if selecting a register for it evict another variable. Indeed, if no eviction happens this is pointless to look for recoloring opportunities as it means the situation was the same as the initial allocation problem where we had to break the hint. Finally, when everything has been allocated, we look for recoloring opportunities for all the identified candidates. The recoloring is performed very late to rely on accurate copy cost (all involved variables are allocated). The recoloring is simple unlike the last change recoloring. It propagates the color of the broken hint to all its copy-related variables. If the color is available for them, the recoloring uses it, otherwise it gives up on that hint even if a more complex coloring would have worked. The recoloring happens only if it is profitable. The profitability is evaluated using the expected frequency of the copies of the currently recolored variable with a) its current color and b) with the target color. If a) is greater or equal than b), then it is profitable and the recoloring happen. ** Example ** Consider the following example: BB1: a = b = BB2: ... = b = a Let us assume b gets split: BB1: a = b = BB2: c = b ... d = c = d = a Because of how the allocation work, b, c, and d may be assigned different colors. Now, if a gets evicted to make room for c, assuming b and d were assigned to something different than a. We end up with: BB1: a = st a, SpillSlot b = BB2: c = b ... d = c = d e = ld SpillSlot = e This is likely that we can assign the same register for b, c, and d, getting rid of 2 copies. ** Performances ** Both ARM64 and x86_64 show performance improvements of up to 3% for the llvm-testsuite + externals with Os and O3. There are a few regressions too that comes from the (in)accuracy of the block frequency estimate. <rdar://problem/18312047> llvm-svn: 225422
2015-01-08 09:16:39 +08:00
continue;
tryHintRecoloring(*LI);
}
}
MCRegister RAGreedy::selectOrSplitImpl(const LiveInterval &VirtReg,
SmallVectorImpl<Register> &NewVRegs,
SmallVirtRegSet &FixedRegisters,
RecoloringStack &RecolorStack,
unsigned Depth) {
uint8_t CostPerUseLimit = uint8_t(~0u);
// First try assigning a free register.
auto Order =
AllocationOrder::create(VirtReg.reg(), *VRM, RegClassInfo, Matrix);
if (MCRegister PhysReg =
tryAssign(VirtReg, Order, NewVRegs, FixedRegisters)) {
// When NewVRegs is not empty, we may have made decisions such as evicting
// a virtual register, go with the earlier decisions and use the physical
// register.
if (CSRCost.getFrequency() &&
EvictAdvisor->isUnusedCalleeSavedReg(PhysReg) && NewVRegs.empty()) {
MCRegister CSRReg = tryAssignCSRFirstTime(VirtReg, Order, PhysReg,
CostPerUseLimit, NewVRegs);
if (CSRReg || !NewVRegs.empty())
// Return now if we decide to use a CSR or create new vregs due to
// pre-splitting.
return CSRReg;
} else
return PhysReg;
}
LiveRangeStage Stage = ExtraInfo->getStage(VirtReg);
LLVM_DEBUG(dbgs() << StageName[Stage] << " Cascade "
<< ExtraInfo->getCascade(VirtReg.reg()) << '\n');
// Try to evict a less worthy live range, but only for ranges from the primary
// queue. The RS_Split ranges already failed to do this, and they should not
// get a second chance until they have been split.
if (Stage != RS_Split)
2020-06-30 23:57:24 +08:00
if (Register PhysReg =
tryEvict(VirtReg, Order, NewVRegs, CostPerUseLimit,
FixedRegisters)) {
Register Hint = MRI->getSimpleHint(VirtReg.reg());
[RegAllocGreedy] Introduce a late pass to repair broken hints. A broken hint is a copy where both ends are assigned different colors. When a variable gets evicted in the neighborhood of such copies, it is likely we can reconcile some of them. ** Context ** Copies are inserted during the register allocation via splitting. These split points are required to relax the constraints on the allocation problem. When such a point is inserted, both ends of the copy would not share the same color with respect to the current allocation problem. When variables get evicted, the allocation problem becomes different and some split point may not be required anymore. However, the related variables may already have been colored. This usually shows up in the assembly with pattern like this: def A ... save A to B def A use A restore A from B ... use B Whereas we could simply have done: def B ... def A use A ... use B ** Proposed Solution ** A variable having a broken hint is marked for late recoloring if and only if selecting a register for it evict another variable. Indeed, if no eviction happens this is pointless to look for recoloring opportunities as it means the situation was the same as the initial allocation problem where we had to break the hint. Finally, when everything has been allocated, we look for recoloring opportunities for all the identified candidates. The recoloring is performed very late to rely on accurate copy cost (all involved variables are allocated). The recoloring is simple unlike the last change recoloring. It propagates the color of the broken hint to all its copy-related variables. If the color is available for them, the recoloring uses it, otherwise it gives up on that hint even if a more complex coloring would have worked. The recoloring happens only if it is profitable. The profitability is evaluated using the expected frequency of the copies of the currently recolored variable with a) its current color and b) with the target color. If a) is greater or equal than b), then it is profitable and the recoloring happen. ** Example ** Consider the following example: BB1: a = b = BB2: ... = b = a Let us assume b gets split: BB1: a = b = BB2: c = b ... d = c = d = a Because of how the allocation work, b, c, and d may be assigned different colors. Now, if a gets evicted to make room for c, assuming b and d were assigned to something different than a. We end up with: BB1: a = st a, SpillSlot b = BB2: c = b ... d = c = d e = ld SpillSlot = e This is likely that we can assign the same register for b, c, and d, getting rid of 2 copies. ** Performances ** Both ARM64 and x86_64 show performance improvements of up to 3% for the llvm-testsuite + externals with Os and O3. There are a few regressions too that comes from the (in)accuracy of the block frequency estimate. <rdar://problem/18312047> llvm-svn: 225422
2015-01-08 09:16:39 +08:00
// If VirtReg has a hint and that hint is broken record this
// virtual register as a recoloring candidate for broken hint.
// Indeed, since we evicted a variable in its neighborhood it is
// likely we can at least partially recolor some of the
// copy-related live-ranges.
if (Hint && Hint != PhysReg)
SetOfBrokenHints.insert(&VirtReg);
return PhysReg;
[RegAllocGreedy] Introduce a late pass to repair broken hints. A broken hint is a copy where both ends are assigned different colors. When a variable gets evicted in the neighborhood of such copies, it is likely we can reconcile some of them. ** Context ** Copies are inserted during the register allocation via splitting. These split points are required to relax the constraints on the allocation problem. When such a point is inserted, both ends of the copy would not share the same color with respect to the current allocation problem. When variables get evicted, the allocation problem becomes different and some split point may not be required anymore. However, the related variables may already have been colored. This usually shows up in the assembly with pattern like this: def A ... save A to B def A use A restore A from B ... use B Whereas we could simply have done: def B ... def A use A ... use B ** Proposed Solution ** A variable having a broken hint is marked for late recoloring if and only if selecting a register for it evict another variable. Indeed, if no eviction happens this is pointless to look for recoloring opportunities as it means the situation was the same as the initial allocation problem where we had to break the hint. Finally, when everything has been allocated, we look for recoloring opportunities for all the identified candidates. The recoloring is performed very late to rely on accurate copy cost (all involved variables are allocated). The recoloring is simple unlike the last change recoloring. It propagates the color of the broken hint to all its copy-related variables. If the color is available for them, the recoloring uses it, otherwise it gives up on that hint even if a more complex coloring would have worked. The recoloring happens only if it is profitable. The profitability is evaluated using the expected frequency of the copies of the currently recolored variable with a) its current color and b) with the target color. If a) is greater or equal than b), then it is profitable and the recoloring happen. ** Example ** Consider the following example: BB1: a = b = BB2: ... = b = a Let us assume b gets split: BB1: a = b = BB2: c = b ... d = c = d = a Because of how the allocation work, b, c, and d may be assigned different colors. Now, if a gets evicted to make room for c, assuming b and d were assigned to something different than a. We end up with: BB1: a = st a, SpillSlot b = BB2: c = b ... d = c = d e = ld SpillSlot = e This is likely that we can assign the same register for b, c, and d, getting rid of 2 copies. ** Performances ** Both ARM64 and x86_64 show performance improvements of up to 3% for the llvm-testsuite + externals with Os and O3. There are a few regressions too that comes from the (in)accuracy of the block frequency estimate. <rdar://problem/18312047> llvm-svn: 225422
2015-01-08 09:16:39 +08:00
}
assert((NewVRegs.empty() || Depth) && "Cannot append to existing NewVRegs");
// The first time we see a live range, don't try to split or spill.
// Wait until the second time, when all smaller ranges have been allocated.
// This gives a better picture of the interference to split around.
if (Stage < RS_Split) {
ExtraInfo->setStage(VirtReg, RS_Split);
LLVM_DEBUG(dbgs() << "wait for second round\n");
NewVRegs.push_back(VirtReg.reg());
return 0;
}
if (Stage < RS_Spill) {
// Try splitting VirtReg or interferences.
unsigned NewVRegSizeBefore = NewVRegs.size();
2020-06-30 23:57:24 +08:00
Register PhysReg = trySplit(VirtReg, Order, NewVRegs, FixedRegisters);
if (PhysReg || (NewVRegs.size() - NewVRegSizeBefore))
return PhysReg;
}
// If we couldn't allocate a register from spilling, there is probably some
2017-06-30 02:03:28 +08:00
// invalid inline assembly. The base class will report it.
if (Stage >= RS_Done || !VirtReg.isSpillable()) {
return tryLastChanceRecoloring(VirtReg, Order, NewVRegs, FixedRegisters,
RecolorStack, Depth);
}
// Finally spill VirtReg itself.
if ((EnableDeferredSpilling ||
TRI->shouldUseDeferredSpillingForVirtReg(*MF, VirtReg)) &&
ExtraInfo->getStage(VirtReg) < RS_Memory) {
// TODO: This is experimental and in particular, we do not model
// the live range splitting done by spilling correctly.
// We would need a deep integration with the spiller to do the
// right thing here. Anyway, that is still good for early testing.
ExtraInfo->setStage(VirtReg, RS_Memory);
LLVM_DEBUG(dbgs() << "Do as if this register is in memory\n");
NewVRegs.push_back(VirtReg.reg());
} else {
NamedRegionTimer T("spill", "Spiller", TimerGroupName,
TimerGroupDescription, TimePassesIsEnabled);
Recommit r265547, and r265610,r265639,r265657 on top of it, plus two fixes with one about error verify-regalloc reported, and another about live range update of phi after rematerialization. r265547: Replace analyzeSiblingValues with new algorithm to fix its compile time issue. The patch is to solve PR17409 and its duplicates. analyzeSiblingValues is a N x N complexity algorithm where N is the number of siblings generated by reg splitting. Although it causes siginificant compile time issue when N is large, it is also important for performance since it removes redundent spills and enables rematerialization. To solve the compile time issue, the patch removes analyzeSiblingValues and replaces it with lower cost alternatives containing two parts. The first part creates a new spill hoisting method in postOptimization of register allocation. It does spill hoisting at once after all the spills are generated instead of inside every instance of selectOrSplit. The second part queries the define expr of the original register for rematerializaiton and keep it always available during register allocation even if it is already dead. It deletes those dead instructions only in postOptimization. With the two parts in the patch, it can remove analyzeSiblingValues without sacrificing performance. Patches on top of r265547: r265610 "Fix the compare-clang diff error introduced by r265547." r265639 "Fix the sanitizer bootstrap error in r265547." r265657 "InlineSpiller.cpp: Escap \@ in r265547. [-Wdocumentation]" Differential Revision: http://reviews.llvm.org/D15302 Differential Revision: http://reviews.llvm.org/D18934 Differential Revision: http://reviews.llvm.org/D18935 Differential Revision: http://reviews.llvm.org/D18936 llvm-svn: 266162
2016-04-13 11:08:27 +08:00
LiveRangeEdit LRE(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
spiller().spill(LRE);
ExtraInfo->setStage(NewVRegs.begin(), NewVRegs.end(), RS_Done);
2019-10-26 01:03:18 +08:00
// Tell LiveDebugVariables about the new ranges. Ranges not being covered by
// the new regs are kept in LDV (still mapping to the old register), until
// we rewrite spilled locations in LDV at a later stage.
DebugVars->splitRegister(VirtReg.reg(), LRE.regs(), *LIS);
2019-10-26 01:03:18 +08:00
if (VerifyEnabled)
MF->verify(this, "After spilling");
}
// The live virtual register requesting allocation was spilled, so tell
// the caller not to allocate anything during this round.
return 0;
}
void RAGreedy::RAGreedyStats::report(MachineOptimizationRemarkMissed &R) {
using namespace ore;
if (Spills) {
R << NV("NumSpills", Spills) << " spills ";
R << NV("TotalSpillsCost", SpillsCost) << " total spills cost ";
}
if (FoldedSpills) {
R << NV("NumFoldedSpills", FoldedSpills) << " folded spills ";
R << NV("TotalFoldedSpillsCost", FoldedSpillsCost)
<< " total folded spills cost ";
}
if (Reloads) {
R << NV("NumReloads", Reloads) << " reloads ";
R << NV("TotalReloadsCost", ReloadsCost) << " total reloads cost ";
}
if (FoldedReloads) {
R << NV("NumFoldedReloads", FoldedReloads) << " folded reloads ";
R << NV("TotalFoldedReloadsCost", FoldedReloadsCost)
<< " total folded reloads cost ";
}
if (ZeroCostFoldedReloads)
R << NV("NumZeroCostFoldedReloads", ZeroCostFoldedReloads)
<< " zero cost folded reloads ";
if (Copies) {
R << NV("NumVRCopies", Copies) << " virtual registers copies ";
R << NV("TotalCopiesCost", CopiesCost) << " total copies cost ";
}
}
RAGreedy::RAGreedyStats RAGreedy::computeStats(MachineBasicBlock &MBB) {
RAGreedyStats Stats;
const MachineFrameInfo &MFI = MF->getFrameInfo();
int FI;
auto isSpillSlotAccess = [&MFI](const MachineMemOperand *A) {
return MFI.isSpillSlotObjectIndex(cast<FixedStackPseudoSourceValue>(
A->getPseudoValue())->getFrameIndex());
};
auto isPatchpointInstr = [](const MachineInstr &MI) {
return MI.getOpcode() == TargetOpcode::PATCHPOINT ||
MI.getOpcode() == TargetOpcode::STACKMAP ||
MI.getOpcode() == TargetOpcode::STATEPOINT;
};
for (MachineInstr &MI : MBB) {
if (MI.isCopy()) {
MachineOperand &Dest = MI.getOperand(0);
MachineOperand &Src = MI.getOperand(1);
if (Dest.isReg() && Src.isReg() && Dest.getReg().isVirtual() &&
Src.getReg().isVirtual())
++Stats.Copies;
continue;
}
SmallVector<const MachineMemOperand *, 2> Accesses;
if (TII->isLoadFromStackSlot(MI, FI) && MFI.isSpillSlotObjectIndex(FI)) {
++Stats.Reloads;
continue;
}
if (TII->isStoreToStackSlot(MI, FI) && MFI.isSpillSlotObjectIndex(FI)) {
++Stats.Spills;
continue;
}
if (TII->hasLoadFromStackSlot(MI, Accesses) &&
llvm::any_of(Accesses, isSpillSlotAccess)) {
if (!isPatchpointInstr(MI)) {
Stats.FoldedReloads += Accesses.size();
continue;
}
// For statepoint there may be folded and zero cost folded stack reloads.
std::pair<unsigned, unsigned> NonZeroCostRange =
TII->getPatchpointUnfoldableRange(MI);
SmallSet<unsigned, 16> FoldedReloads;
SmallSet<unsigned, 16> ZeroCostFoldedReloads;
for (unsigned Idx = 0, E = MI.getNumOperands(); Idx < E; ++Idx) {
MachineOperand &MO = MI.getOperand(Idx);
if (!MO.isFI() || !MFI.isSpillSlotObjectIndex(MO.getIndex()))
continue;
if (Idx >= NonZeroCostRange.first && Idx < NonZeroCostRange.second)
FoldedReloads.insert(MO.getIndex());
else
ZeroCostFoldedReloads.insert(MO.getIndex());
}
// If stack slot is used in folded reload it is not zero cost then.
for (unsigned Slot : FoldedReloads)
ZeroCostFoldedReloads.erase(Slot);
Stats.FoldedReloads += FoldedReloads.size();
Stats.ZeroCostFoldedReloads += ZeroCostFoldedReloads.size();
continue;
}
Accesses.clear();
if (TII->hasStoreToStackSlot(MI, Accesses) &&
llvm::any_of(Accesses, isSpillSlotAccess)) {
Stats.FoldedSpills += Accesses.size();
}
}
// Set cost of collected statistic by multiplication to relative frequency of
// this basic block.
float RelFreq = MBFI->getBlockFreqRelativeToEntryBlock(&MBB);
Stats.ReloadsCost = RelFreq * Stats.Reloads;
Stats.FoldedReloadsCost = RelFreq * Stats.FoldedReloads;
Stats.SpillsCost = RelFreq * Stats.Spills;
Stats.FoldedSpillsCost = RelFreq * Stats.FoldedSpills;
Stats.CopiesCost = RelFreq * Stats.Copies;
return Stats;
}
RAGreedy::RAGreedyStats RAGreedy::reportStats(MachineLoop *L) {
RAGreedyStats Stats;
// Sum up the spill and reloads in subloops.
for (MachineLoop *SubLoop : *L)
Stats.add(reportStats(SubLoop));
for (MachineBasicBlock *MBB : L->getBlocks())
// Handle blocks that were not included in subloops.
if (Loops->getLoopFor(MBB) == L)
Stats.add(computeStats(*MBB));
if (!Stats.isEmpty()) {
using namespace ore;
ORE->emit([&]() {
MachineOptimizationRemarkMissed R(DEBUG_TYPE, "LoopSpillReloadCopies",
L->getStartLoc(), L->getHeader());
Stats.report(R);
R << "generated in loop";
return R;
});
}
return Stats;
}
void RAGreedy::reportStats() {
if (!ORE->allowExtraAnalysis(DEBUG_TYPE))
return;
RAGreedyStats Stats;
for (MachineLoop *L : *Loops)
Stats.add(reportStats(L));
// Process non-loop blocks.
for (MachineBasicBlock &MBB : *MF)
if (!Loops->getLoopFor(&MBB))
Stats.add(computeStats(MBB));
if (!Stats.isEmpty()) {
using namespace ore;
ORE->emit([&]() {
DebugLoc Loc;
if (auto *SP = MF->getFunction().getSubprogram())
Loc = DILocation::get(SP->getContext(), SP->getLine(), 1, SP);
MachineOptimizationRemarkMissed R(DEBUG_TYPE, "SpillReloadCopies", Loc,
&MF->front());
Stats.report(R);
R << "generated in function";
return R;
});
}
}
bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
LLVM_DEBUG(dbgs() << "********** GREEDY REGISTER ALLOCATION **********\n"
<< "********** Function: " << mf.getName() << '\n');
MF = &mf;
TII = MF->getSubtarget().getInstrInfo();
if (VerifyEnabled)
MF->verify(this, "Before greedy register allocator");
RegAllocBase::init(getAnalysis<VirtRegMap>(),
getAnalysis<LiveIntervals>(),
getAnalysis<LiveRegMatrix>());
Indexes = &getAnalysis<SlotIndexes>();
MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
DomTree = &getAnalysis<MachineDominatorTree>();
ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
Loops = &getAnalysis<MachineLoopInfo>();
Bundles = &getAnalysis<EdgeBundles>();
SpillPlacer = &getAnalysis<SpillPlacement>();
DebugVars = &getAnalysis<LiveDebugVariables>();
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
initializeCSRCost();
RegCosts = TRI->getRegisterCosts(*MF);
RegClassPriorityTrumpsGlobalness =
GreedyRegClassPriorityTrumpsGlobalness.getNumOccurrences()
? GreedyRegClassPriorityTrumpsGlobalness
: TRI->regClassPriorityTrumpsGlobalness(*MF);
ExtraInfo.emplace();
EvictAdvisor =
getAnalysis<RegAllocEvictionAdvisorAnalysis>().getAdvisor(*MF, *this);
VRAI = std::make_unique<VirtRegAuxInfo>(*MF, *LIS, *VRM, *Loops, *MBFI);
SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM, *VRAI));
VRAI->calculateSpillWeightsAndHints();
LLVM_DEBUG(LIS->dump());
SA.reset(new SplitAnalysis(*VRM, *LIS, *Loops));
SE.reset(new SplitEditor(*SA, *AA, *LIS, *VRM, *DomTree, *MBFI, *VRAI));
IntfCache.init(MF, Matrix->getLiveUnions(), Indexes, LIS, TRI);
GlobalCand.resize(32); // This will grow as needed.
[RegAllocGreedy] Introduce a late pass to repair broken hints. A broken hint is a copy where both ends are assigned different colors. When a variable gets evicted in the neighborhood of such copies, it is likely we can reconcile some of them. ** Context ** Copies are inserted during the register allocation via splitting. These split points are required to relax the constraints on the allocation problem. When such a point is inserted, both ends of the copy would not share the same color with respect to the current allocation problem. When variables get evicted, the allocation problem becomes different and some split point may not be required anymore. However, the related variables may already have been colored. This usually shows up in the assembly with pattern like this: def A ... save A to B def A use A restore A from B ... use B Whereas we could simply have done: def B ... def A use A ... use B ** Proposed Solution ** A variable having a broken hint is marked for late recoloring if and only if selecting a register for it evict another variable. Indeed, if no eviction happens this is pointless to look for recoloring opportunities as it means the situation was the same as the initial allocation problem where we had to break the hint. Finally, when everything has been allocated, we look for recoloring opportunities for all the identified candidates. The recoloring is performed very late to rely on accurate copy cost (all involved variables are allocated). The recoloring is simple unlike the last change recoloring. It propagates the color of the broken hint to all its copy-related variables. If the color is available for them, the recoloring uses it, otherwise it gives up on that hint even if a more complex coloring would have worked. The recoloring happens only if it is profitable. The profitability is evaluated using the expected frequency of the copies of the currently recolored variable with a) its current color and b) with the target color. If a) is greater or equal than b), then it is profitable and the recoloring happen. ** Example ** Consider the following example: BB1: a = b = BB2: ... = b = a Let us assume b gets split: BB1: a = b = BB2: c = b ... d = c = d = a Because of how the allocation work, b, c, and d may be assigned different colors. Now, if a gets evicted to make room for c, assuming b and d were assigned to something different than a. We end up with: BB1: a = st a, SpillSlot b = BB2: c = b ... d = c = d e = ld SpillSlot = e This is likely that we can assign the same register for b, c, and d, getting rid of 2 copies. ** Performances ** Both ARM64 and x86_64 show performance improvements of up to 3% for the llvm-testsuite + externals with Os and O3. There are a few regressions too that comes from the (in)accuracy of the block frequency estimate. <rdar://problem/18312047> llvm-svn: 225422
2015-01-08 09:16:39 +08:00
SetOfBrokenHints.clear();
allocatePhysRegs();
[RegAllocGreedy] Introduce a late pass to repair broken hints. A broken hint is a copy where both ends are assigned different colors. When a variable gets evicted in the neighborhood of such copies, it is likely we can reconcile some of them. ** Context ** Copies are inserted during the register allocation via splitting. These split points are required to relax the constraints on the allocation problem. When such a point is inserted, both ends of the copy would not share the same color with respect to the current allocation problem. When variables get evicted, the allocation problem becomes different and some split point may not be required anymore. However, the related variables may already have been colored. This usually shows up in the assembly with pattern like this: def A ... save A to B def A use A restore A from B ... use B Whereas we could simply have done: def B ... def A use A ... use B ** Proposed Solution ** A variable having a broken hint is marked for late recoloring if and only if selecting a register for it evict another variable. Indeed, if no eviction happens this is pointless to look for recoloring opportunities as it means the situation was the same as the initial allocation problem where we had to break the hint. Finally, when everything has been allocated, we look for recoloring opportunities for all the identified candidates. The recoloring is performed very late to rely on accurate copy cost (all involved variables are allocated). The recoloring is simple unlike the last change recoloring. It propagates the color of the broken hint to all its copy-related variables. If the color is available for them, the recoloring uses it, otherwise it gives up on that hint even if a more complex coloring would have worked. The recoloring happens only if it is profitable. The profitability is evaluated using the expected frequency of the copies of the currently recolored variable with a) its current color and b) with the target color. If a) is greater or equal than b), then it is profitable and the recoloring happen. ** Example ** Consider the following example: BB1: a = b = BB2: ... = b = a Let us assume b gets split: BB1: a = b = BB2: c = b ... d = c = d = a Because of how the allocation work, b, c, and d may be assigned different colors. Now, if a gets evicted to make room for c, assuming b and d were assigned to something different than a. We end up with: BB1: a = st a, SpillSlot b = BB2: c = b ... d = c = d e = ld SpillSlot = e This is likely that we can assign the same register for b, c, and d, getting rid of 2 copies. ** Performances ** Both ARM64 and x86_64 show performance improvements of up to 3% for the llvm-testsuite + externals with Os and O3. There are a few regressions too that comes from the (in)accuracy of the block frequency estimate. <rdar://problem/18312047> llvm-svn: 225422
2015-01-08 09:16:39 +08:00
tryHintsRecoloring();
if (VerifyEnabled)
MF->verify(this, "Before post optimization");
Recommit r265547, and r265610,r265639,r265657 on top of it, plus two fixes with one about error verify-regalloc reported, and another about live range update of phi after rematerialization. r265547: Replace analyzeSiblingValues with new algorithm to fix its compile time issue. The patch is to solve PR17409 and its duplicates. analyzeSiblingValues is a N x N complexity algorithm where N is the number of siblings generated by reg splitting. Although it causes siginificant compile time issue when N is large, it is also important for performance since it removes redundent spills and enables rematerialization. To solve the compile time issue, the patch removes analyzeSiblingValues and replaces it with lower cost alternatives containing two parts. The first part creates a new spill hoisting method in postOptimization of register allocation. It does spill hoisting at once after all the spills are generated instead of inside every instance of selectOrSplit. The second part queries the define expr of the original register for rematerializaiton and keep it always available during register allocation even if it is already dead. It deletes those dead instructions only in postOptimization. With the two parts in the patch, it can remove analyzeSiblingValues without sacrificing performance. Patches on top of r265547: r265610 "Fix the compare-clang diff error introduced by r265547." r265639 "Fix the sanitizer bootstrap error in r265547." r265657 "InlineSpiller.cpp: Escap \@ in r265547. [-Wdocumentation]" Differential Revision: http://reviews.llvm.org/D15302 Differential Revision: http://reviews.llvm.org/D18934 Differential Revision: http://reviews.llvm.org/D18935 Differential Revision: http://reviews.llvm.org/D18936 llvm-svn: 266162
2016-04-13 11:08:27 +08:00
postOptimization();
reportStats();
Recommit r265547, and r265610,r265639,r265657 on top of it, plus two fixes with one about error verify-regalloc reported, and another about live range update of phi after rematerialization. r265547: Replace analyzeSiblingValues with new algorithm to fix its compile time issue. The patch is to solve PR17409 and its duplicates. analyzeSiblingValues is a N x N complexity algorithm where N is the number of siblings generated by reg splitting. Although it causes siginificant compile time issue when N is large, it is also important for performance since it removes redundent spills and enables rematerialization. To solve the compile time issue, the patch removes analyzeSiblingValues and replaces it with lower cost alternatives containing two parts. The first part creates a new spill hoisting method in postOptimization of register allocation. It does spill hoisting at once after all the spills are generated instead of inside every instance of selectOrSplit. The second part queries the define expr of the original register for rematerializaiton and keep it always available during register allocation even if it is already dead. It deletes those dead instructions only in postOptimization. With the two parts in the patch, it can remove analyzeSiblingValues without sacrificing performance. Patches on top of r265547: r265610 "Fix the compare-clang diff error introduced by r265547." r265639 "Fix the sanitizer bootstrap error in r265547." r265657 "InlineSpiller.cpp: Escap \@ in r265547. [-Wdocumentation]" Differential Revision: http://reviews.llvm.org/D15302 Differential Revision: http://reviews.llvm.org/D18934 Differential Revision: http://reviews.llvm.org/D18935 Differential Revision: http://reviews.llvm.org/D18936 llvm-svn: 266162
2016-04-13 11:08:27 +08:00
releaseMemory();
return true;
}