//===- ExecutionDepsFix.cpp - Fix execution dependency issues ---*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains the execution dependency fix pass.
//
// Some X86 SSE instructions like mov, and, or, xor are available in different
// variants for different operand types. These variant instructions are
// equivalent, but on Nehalem and newer CPUs there is extra latency
// transferring data between the integer and floating point domains. ARM cores
// have similar issues when they are configured with both VFP and NEON
// pipelines.
//
// This pass changes the variant instructions to minimize domain crossings.
//
//===----------------------------------------------------------------------===//
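
// For illustration (X86 SSE; an example, not part of the pass logic): the
// same bitwise operation exists in both execution domains, e.g.
//
//   andps %xmm1, %xmm0   ; floating point domain
//   pand  %xmm1, %xmm0   ; integer domain
//
// Both compute the same bits, but forwarding a result across the domain
// boundary costs extra bypass latency on the affected cores, so this pass
// tries to keep chains of such instructions in a single domain.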

#include "llvm/CodeGen/Passes.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <limits>

using namespace llvm;

#define DEBUG_TYPE "execution-fix"

/// A DomainValue is a bit like LiveIntervals' ValNo, but it also keeps track
/// of execution domains.
///
/// An open DomainValue represents a set of instructions that can still switch
/// execution domain. Multiple registers may refer to the same open
/// DomainValue - they will eventually be collapsed to the same execution
/// domain.
///
/// A collapsed DomainValue represents a single register that has been forced
/// into one or more execution domains. There is a separate collapsed
/// DomainValue for each register, but it may contain multiple execution
/// domains. A register value is initially created in a single execution
/// domain, but if we were forced to pay the penalty of a domain crossing, we
/// keep track of the fact that the register is now available in multiple
/// domains.
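///
/// Illustrative lifecycle sketch (hypothetical domain numbers): an
/// instruction that can execute in either domain 1 or domain 2 opens a
/// DomainValue with both domains available; if a later user of the same
/// register is only legal in domain 2, the DomainValue is collapsed to
/// domain 2 and every instruction it collected is rewritten to that domain's
/// encoding via TargetInstrInfo::setExecutionDomain().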
namespace {
struct DomainValue {
  // Basic reference counting.
  unsigned Refs;

  // Bitmask of available domains. For an open DomainValue, it is the still
  // possible domains for collapsing. For a collapsed DomainValue it is the
  // domains where the register is available for free.
  unsigned AvailableDomains;
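
  // For example, AvailableDomains == 0b0110 means domains 1 and 2 are
  // available; getFirstDomain() below would return 1, the lowest set bit.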

  // Pointer to the next DomainValue in a chain. When two DomainValues are
  // merged, Victim.Next is set to point to Victor, so old DomainValue
  // references can be updated by following the chain.
  DomainValue *Next;
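
  // For instance, after ExeDepsFix::merge(A, B) sets B->Next = A, a stale
  // reference to B is forwarded to A by ExeDepsFix::resolve(), which walks
  // the chain until it reaches a DomainValue with no Next pointer.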

  // Twiddleable instructions using or defining these registers.
  SmallVector<MachineInstr*, 8> Instrs;

  // A collapsed DomainValue has no instructions to twiddle - it simply keeps
  // track of the domains where the registers are already available.
  bool isCollapsed() const { return Instrs.empty(); }

  // Is domain available?
  bool hasDomain(unsigned domain) const {
    assert(domain <
               static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
           "undefined behavior");
    return AvailableDomains & (1u << domain);
  }

  // Mark domain as available.
  void addDomain(unsigned domain) {
    AvailableDomains |= 1u << domain;
  }

  // Restrict to a single domain available.
  void setSingleDomain(unsigned domain) {
    AvailableDomains = 1u << domain;
  }

  // Return bitmask of domains that are available and in mask.
  unsigned getCommonDomains(unsigned mask) const {
    return AvailableDomains & mask;
  }

  // First domain available.
  unsigned getFirstDomain() const {
    return countTrailingZeros(AvailableDomains);
  }

  DomainValue() : Refs(0) { clear(); }

  // Clear this DomainValue and point to next which has all its data.
  void clear() {
    AvailableDomains = 0;
    Next = nullptr;
    Instrs.clear();
  }
};
}

namespace {
/// Information about a live register.
struct LiveReg {
  /// Value currently in this register, or NULL when no value is being tracked.
  /// This counts as a DomainValue reference.
  DomainValue *Value;

  /// Instruction that defined this register, relative to the beginning of the
  /// current basic block. When a LiveReg is used to represent a live-out
  /// register, this value is relative to the end of the basic block, so it
  /// will be a negative number.
  int Def;
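
  // For example, with instructions numbered from 0, a register defined by
  // instruction 3 has Def == 3, and at instruction 10 its "clearance" is
  // 10 - 3 = 7 instructions.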
};
} // anonymous namespace

namespace {
class ExeDepsFix : public MachineFunctionPass {
  static char ID;
  SpecificBumpPtrAllocator<DomainValue> Allocator;
  SmallVector<DomainValue*, 16> Avail;

  const TargetRegisterClass *const RC;
  MachineFunction *MF;
  const TargetInstrInfo *TII;
  const TargetRegisterInfo *TRI;
  RegisterClassInfo RegClassInfo;
  std::vector<SmallVector<int, 1>> AliasMap;
  const unsigned NumRegs;
  LiveReg *LiveRegs;

  struct MBBInfo {
    // Keeps clearance and domain information for all registers. Note that
    // this is different from the usual notion of liveness: the CPU doesn't
    // care whether or not we consider a register killed.
    LiveReg *OutRegs;

    // Whether we have gotten to this block in primary processing yet.
    bool PrimaryCompleted;

    // The number of predecessors for which primary processing has completed.
    unsigned IncomingProcessed;

    // The value of `IncomingProcessed` at the start of primary processing.
    unsigned PrimaryIncoming;

    // The number of predecessors for which all processing steps are done.
    unsigned IncomingCompleted;
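
    // Taken together (summarizing the rationale of the change that introduced
    // these counters): a block is "done" once all of its predecessors have
    // completed primary processing and IncomingCompleted has caught up with
    // PrimaryIncoming. Because blocks are visited in reverse postorder, a
    // loop body gets reprocessed once the clearance values carried around
    // the backedge are known.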

    MBBInfo()
        : OutRegs(nullptr), PrimaryCompleted(false), IncomingProcessed(0),
          PrimaryIncoming(0), IncomingCompleted(0) {}
  };
  typedef DenseMap<MachineBasicBlock *, MBBInfo> MBBInfoMap;
  MBBInfoMap MBBInfos;

  /// List of undefined register reads in this block in forward order.
  std::vector<std::pair<MachineInstr*, unsigned> > UndefReads;

  /// Storage for register unit liveness.
  LivePhysRegs LiveRegSet;

  /// Current instruction number.
  /// The first instruction in each basic block is 0.
  int CurInstr;

public:
  ExeDepsFix(const TargetRegisterClass *rc)
      : MachineFunctionPass(ID), RC(rc), NumRegs(RC->getNumRegs()) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::NoVRegs);
  }

  StringRef getPassName() const override { return "Execution dependency fix"; }

private:
  iterator_range<SmallVectorImpl<int>::const_iterator>
  regIndices(unsigned Reg) const;

  // DomainValue allocation.
  DomainValue *alloc(int domain = -1);
  DomainValue *retain(DomainValue *DV) {
    if (DV) ++DV->Refs;
    return DV;
  }
  void release(DomainValue*);
  DomainValue *resolve(DomainValue*&);

  // LiveRegs manipulations.
  void setLiveReg(int rx, DomainValue *DV);
  void kill(int rx);
  void force(int rx, unsigned domain);
  void collapse(DomainValue *dv, unsigned domain);
  bool merge(DomainValue *A, DomainValue *B);

  void enterBasicBlock(MachineBasicBlock*);
  void leaveBasicBlock(MachineBasicBlock*);
  bool isBlockDone(MachineBasicBlock *);
  void processBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass);
  void updateSuccessors(MachineBasicBlock *MBB, bool PrimaryPass);
  bool visitInstr(MachineInstr *);
  void processDefs(MachineInstr *, bool breakDependency, bool Kill);
  void visitSoftInstr(MachineInstr*, unsigned mask);
  void visitHardInstr(MachineInstr*, unsigned domain);
  void pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
                                unsigned Pref);
  bool shouldBreakDependence(MachineInstr*, unsigned OpIdx, unsigned Pref);
  void processUndefReads(MachineBasicBlock*);
};
}

char ExeDepsFix::ID = 0;

/// Translate TRI register number to a list of indices into our smaller tables
/// of interesting registers.
iterator_range<SmallVectorImpl<int>::const_iterator>
ExeDepsFix::regIndices(unsigned Reg) const {
  assert(Reg < AliasMap.size() && "Invalid register");
  const auto &Entry = AliasMap[Reg];
  return make_range(Entry.begin(), Entry.end());
}

DomainValue *ExeDepsFix::alloc(int domain) {
  DomainValue *dv = Avail.empty() ?
                      new(Allocator.Allocate()) DomainValue :
                      Avail.pop_back_val();
  if (domain >= 0)
    dv->addDomain(domain);
  assert(dv->Refs == 0 && "Reference count wasn't cleared");
  assert(!dv->Next && "Chained DomainValue shouldn't have been recycled");
  return dv;
}

/// Release a reference to DV. When the last reference is released,
/// collapse if needed.
void ExeDepsFix::release(DomainValue *DV) {
  while (DV) {
    assert(DV->Refs && "Bad DomainValue");
    if (--DV->Refs)
      return;

    // There are no more DV references. Collapse any contained instructions.
    if (DV->AvailableDomains && !DV->isCollapsed())
      collapse(DV, DV->getFirstDomain());

    DomainValue *Next = DV->Next;
    DV->clear();
    Avail.push_back(DV);
    // Also release the next DomainValue in the chain.
    DV = Next;
  }
}

/// Follow the chain of dead DomainValues until a live DomainValue is reached.
/// Update the referenced pointer when necessary.
DomainValue *ExeDepsFix::resolve(DomainValue *&DVRef) {
  DomainValue *DV = DVRef;
  if (!DV || !DV->Next)
    return DV;

  // DV has a chain. Find the end.
  do DV = DV->Next;
  while (DV->Next);

  // Update DVRef to point to DV.
  retain(DV);
  release(DVRef);
  DVRef = DV;
  return DV;
}

/// Set LiveRegs[rx] = dv, updating reference counts.
void ExeDepsFix::setLiveReg(int rx, DomainValue *dv) {
  assert(unsigned(rx) < NumRegs && "Invalid index");
  assert(LiveRegs && "Must enter basic block first.");

  if (LiveRegs[rx].Value == dv)
    return;
  if (LiveRegs[rx].Value)
    release(LiveRegs[rx].Value);
  LiveRegs[rx].Value = retain(dv);
}

// Kill register rx, recycle or collapse any DomainValue.
void ExeDepsFix::kill(int rx) {
  assert(unsigned(rx) < NumRegs && "Invalid index");
  assert(LiveRegs && "Must enter basic block first.");
  if (!LiveRegs[rx].Value)
    return;

  release(LiveRegs[rx].Value);
  LiveRegs[rx].Value = nullptr;
}

/// Force register rx into domain.
void ExeDepsFix::force(int rx, unsigned domain) {
  assert(unsigned(rx) < NumRegs && "Invalid index");
  assert(LiveRegs && "Must enter basic block first.");
  if (DomainValue *dv = LiveRegs[rx].Value) {
    if (dv->isCollapsed())
      dv->addDomain(domain);
    else if (dv->hasDomain(domain))
      collapse(dv, domain);
    else {
      // This is an incompatible open DomainValue. Collapse it to whatever and
      // force the new value into domain. This costs a domain crossing.
      collapse(dv, dv->getFirstDomain());
      assert(LiveRegs[rx].Value && "Not live after collapse?");
      LiveRegs[rx].Value->addDomain(domain);
    }
  } else {
    // Set up basic collapsed DomainValue.
    setLiveReg(rx, alloc(domain));
  }
}

/// Collapse open DomainValue into given domain. If there are multiple
/// registers using dv, they each get a unique collapsed DomainValue.
void ExeDepsFix::collapse(DomainValue *dv, unsigned domain) {
  assert(dv->hasDomain(domain) && "Cannot collapse");

  // Collapse all the instructions.
  while (!dv->Instrs.empty())
    TII->setExecutionDomain(*dv->Instrs.pop_back_val(), domain);
  dv->setSingleDomain(domain);

  // If there are multiple users, give them new, unique DomainValues.
  if (LiveRegs && dv->Refs > 1)
    for (unsigned rx = 0; rx != NumRegs; ++rx)
      if (LiveRegs[rx].Value == dv)
        setLiveReg(rx, alloc(domain));
}

/// All instructions and registers in B are moved to A, and B is released.
bool ExeDepsFix::merge(DomainValue *A, DomainValue *B) {
  assert(!A->isCollapsed() && "Cannot merge into collapsed");
  assert(!B->isCollapsed() && "Cannot merge from collapsed");
  if (A == B)
    return true;
  // Restrict to the domains that A and B have in common.
  unsigned common = A->getCommonDomains(B->AvailableDomains);
  if (!common)
    return false;
  A->AvailableDomains = common;
  A->Instrs.append(B->Instrs.begin(), B->Instrs.end());

  // Clear the old DomainValue so we won't try to swizzle instructions twice.
  B->clear();
  // All uses of B are redirected to A by following the Next chain.
  B->Next = retain(A);

  for (unsigned rx = 0; rx != NumRegs; ++rx) {
    assert(LiveRegs && "no space allocated for live registers");
    if (LiveRegs[rx].Value == B)
      setLiveReg(rx, A);
  }
  return true;
}

/// Set up LiveRegs by merging predecessor live-out values.
void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) {
  // Reset instruction counter in each basic block.
  CurInstr = 0;

  // Set up UndefReads to track undefined register reads.
  UndefReads.clear();
  LiveRegSet.clear();

  // Set up LiveRegs to represent registers entering MBB.
  if (!LiveRegs)
    LiveRegs = new LiveReg[NumRegs];

  // Default values are 'nothing happened a long time ago'.
  for (unsigned rx = 0; rx != NumRegs; ++rx) {
    LiveRegs[rx].Value = nullptr;
    LiveRegs[rx].Def = -(1 << 20);
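    // Note: -(1 << 20) above stands in for "defined about a million
    // instructions ago", so CurInstr - Def yields a huge clearance for
    // registers with no known recent definition.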
  }

  // This is the entry block.
  if (MBB->pred_empty()) {
    for (const auto &LI : MBB->liveins()) {
      for (int rx : regIndices(LI.PhysReg)) {
        // Treat function live-ins as if they were defined just before the
        // first instruction. Usually, function arguments are set up
        // immediately before the call.
        LiveRegs[rx].Def = -1;
      }
    }
    DEBUG(dbgs() << "BB#" << MBB->getNumber() << ": entry\n");
    return;
  }

  // Try to coalesce live-out registers from predecessors.
  for (MachineBasicBlock::const_pred_iterator pi = MBB->pred_begin(),
       pe = MBB->pred_end(); pi != pe; ++pi) {
    auto fi = MBBInfos.find(*pi);
    assert(fi != MBBInfos.end() &&
           "Should have pre-allocated MBBInfos for all MBBs");
    LiveReg *Incoming = fi->second.OutRegs;
    // Incoming is null if this is a backedge from a BB
    // we haven't processed yet.
    if (Incoming == nullptr) {
      continue;
    }

    for (unsigned rx = 0; rx != NumRegs; ++rx) {
      // Use the most recent predecessor def for each register.
      LiveRegs[rx].Def = std::max(LiveRegs[rx].Def, Incoming[rx].Def);

      DomainValue *pdv = resolve(Incoming[rx].Value);
      if (!pdv)
        continue;
      if (!LiveRegs[rx].Value) {
        setLiveReg(rx, pdv);
        continue;
      }

      // We have a live DomainValue from more than one predecessor.
      if (LiveRegs[rx].Value->isCollapsed()) {
        // We are already collapsed, but predecessor is not. Force it.
        unsigned Domain = LiveRegs[rx].Value->getFirstDomain();
        if (!pdv->isCollapsed() && pdv->hasDomain(Domain))
          collapse(pdv, Domain);
        continue;
      }

      // Currently open, merge in predecessor.
      if (!pdv->isCollapsed())
        merge(LiveRegs[rx].Value, pdv);
      else
        force(rx, pdv->getFirstDomain());
    }
  }
  DEBUG(
      dbgs() << "BB#" << MBB->getNumber()
             << (!isBlockDone(MBB) ? ": incomplete\n" : ": all preds known\n"));
}

void ExeDepsFix::leaveBasicBlock(MachineBasicBlock *MBB) {
  assert(LiveRegs && "Must enter basic block first.");
  LiveReg *OldOutRegs = MBBInfos[MBB].OutRegs;
  // Save register clearances at end of MBB - used by enterBasicBlock().
  MBBInfos[MBB].OutRegs = LiveRegs;

  // While processing the basic block, we kept `Def` relative to the start of
  // the basic block for convenience. However, future use of this information
  // only cares about the clearance from the end of the block, so adjust
  // everything to be relative to the end of the basic block.
  for (unsigned i = 0, e = NumRegs; i != e; ++i)
    LiveRegs[i].Def -= CurInstr;
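  // E.g. a register with Def == 3 in a block of 10 instructions (CurInstr ==
  // 10 at the end) ends up with Def == 3 - 10 == -7: seven instructions of
  // clearance when viewed from the block's exit.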
  if (OldOutRegs) {
    // This must be the second pass.
    // Release all the DomainValues instead of keeping them.
    for (unsigned i = 0, e = NumRegs; i != e; ++i)
      release(OldOutRegs[i].Value);
    delete[] OldOutRegs;
  }
  LiveRegs = nullptr;
}

bool ExeDepsFix::visitInstr(MachineInstr *MI) {
  // Update instructions with explicit execution domains.
  std::pair<uint16_t, uint16_t> DomP = TII->getExecutionDomain(*MI);
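  // DomP.first is the instruction's current execution domain (0 if the
  // target tracks no domain for it), and DomP.second is the bitmask of
  // domains the instruction could execute in. A nonzero mask marks a "soft"
  // instruction this pass may move between domains; a zero mask with a
  // nonzero domain marks a "hard" instruction whose domain is fixed.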
  if (DomP.first) {
    if (DomP.second)
      visitSoftInstr(MI, DomP.second);
    else
      visitHardInstr(MI, DomP.first);
  }

  return !DomP.first;
}

/// \brief Helps avoid false dependencies on undef registers by updating the
/// machine instructions' undef operand to use a register that the instruction
/// is truly dependent on, or use a register with clearance higher than Pref.
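///
/// For example (X86): `vcvtsi2sd` leaves the upper bits of its XMM
/// destination unchanged, so the destination also acts as an input that is
/// often marked undef. Picking an input register that has not been written
/// recently (high clearance) keeps the instruction from stalling on an
/// unrelated older computation.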
void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
                                          unsigned Pref) {
  MachineOperand &MO = MI->getOperand(OpIdx);
  assert(MO.isUndef() && "Expected undef machine operand");

  unsigned OriginalReg = MO.getReg();

  // Update only undef operands that are mapped to one register.
  if (AliasMap[OriginalReg].size() != 1)
    return;

  // Get the undef operand's register class.
  const TargetRegisterClass *OpRC =
      TII->getRegClass(MI->getDesc(), OpIdx, TRI, *MF);

  // If the instruction has a true dependency, we can hide the false
  // dependency behind it.
  for (MachineOperand &CurrMO : MI->operands()) {
    if (!CurrMO.isReg() || CurrMO.isDef() || CurrMO.isUndef() ||
        !OpRC->contains(CurrMO.getReg()))
      continue;
    // We found a true dependency - replace the undef register with the true
    // dependency.
    MO.setReg(CurrMO.getReg());
    return;
  }

  // Go over all registers in the register class and find the register with
  // max clearance or clearance higher than Pref.
  unsigned MaxClearance = 0;
  unsigned MaxClearanceReg = OriginalReg;
  ArrayRef<MCPhysReg> Order = RegClassInfo.getOrder(OpRC);
  for (auto Reg : Order) {
    assert(AliasMap[Reg].size() == 1 &&
           "Reg is expected to be mapped to a single index");
    int RCrx = *regIndices(Reg).begin();
    unsigned Clearance = CurInstr - LiveRegs[RCrx].Def;
    if (Clearance <= MaxClearance)
      continue;
    MaxClearance = Clearance;
    MaxClearanceReg = Reg;

    if (MaxClearance > Pref)
      break;
  }

  // Update the operand if we found a register with better clearance.
  if (MaxClearanceReg != OriginalReg)
    MO.setReg(MaxClearanceReg);
}

/// \brief Return true if it makes sense to break dependence on a partial def
/// or undef use.
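///
/// Pref is the clearance, in instructions, that the target would like to see
/// before this read. If fewer than Pref instructions have passed since the
/// register was last written (Pref > Clearance below), breaking the false
/// dependency is considered worthwhile.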
|
|
|
|
bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx,
|
|
|
|
unsigned Pref) {
|
2014-12-18 03:13:47 +08:00
|
|
|
unsigned reg = MI->getOperand(OpIdx).getReg();
|
2015-03-07 02:56:20 +08:00
|
|
|
for (int rx : regIndices(reg)) {
|
2014-12-18 03:13:47 +08:00
|
|
|
unsigned Clearance = CurInstr - LiveRegs[rx].Def;
|
|
|
|
DEBUG(dbgs() << "Clearance: " << Clearance << ", want " << Pref);
|
2013-10-15 06:19:03 +08:00
|
|
|
|
2014-12-18 03:13:47 +08:00
|
|
|
if (Pref > Clearance) {
|
|
|
|
DEBUG(dbgs() << ": Break dependency.\n");
|
|
|
|
continue;
|
|
|
|
}
|
[ExecutionDepsFix] Improve clearance calculation for loops
Summary:
In revision rL278321, ExecutionDepsFix learned how to pick a better
register for undef register reads, e.g. for instructions such as
`vcvtsi2sdq`. While this revision improved performance on a good number
of our benchmarks, it unfortunately also caused significant regressions
(up to 3x) on others. This regression turned out to be caused by loops
such as:
PH -> A -> B (xmm<Undef> -> xmm<Def>) -> C -> D -> EXIT
^ |
+----------------------------------+
In the previous version of the clearance calculation, we would visit
the blocks in order, remembering for each whether there were any
incoming backedges from blocks that we hadn't processed yet and if
so queuing up the block to be re-processed. However, for loop structures
such as the above, this is clearly insufficient, since the block B
does not have any unknown backedges, so we do not see the false
dependency from the previous interation's Def of xmm registers in B.
To fix this, we need to consider all blocks that are part of the loop
and reprocess them one the correct clearance values are known. As
an optimization, we also want to avoid reprocessing any later blocks
that are not part of the loop.
In summary, the iteration order is as follows:
Before: PH A B C D A'
Corrected (Naive): PH A B C D A' B' C' D'
Corrected (w/ optimization): PH A B C A' B' C' D
To facilitate this optimization we introduce two new counters for each
basic block. The first counts how many of it's predecssors have
completed primary processing. The second counts how many of its
predecessors have completed all processing (we will call such a block
*done*. Now, the criteria to reprocess a block is as follows:
- All Predecessors have completed primary processing
- For x the number of predecessors that have completed primary
processing *at the time of primary processing of this block*,
the number of predecessors that are done has reached x.
The intuition behind this criterion is as follows:
We need to perform primary processing on all predecessors in order to
find out any direct defs in those predecessors. When predecessors are
done, we also know that we have information about indirect defs (e.g.
in block B though that were inherited through B->C->A->B). However,
we can't wait for all predecessors to be done, since that would
cause cyclic dependencies. However, it is guaranteed that all those
predecessors that are prior to us in reverse postorder will be done
before us. Since we iterate of the basic blocks in reverse postorder,
the number x above, is precisely the count of the number of predecessors
prior to us in reverse postorder.
Reviewers: myatsina
Differential Revision: https://reviews.llvm.org/D28759
llvm-svn: 293571
2017-01-31 07:37:03 +08:00
|
|
|
DEBUG(dbgs() << ": OK .\n");
|
2013-10-15 06:19:03 +08:00
|
|
|
return false;
|
|
|
|
}
|
2014-12-18 03:13:47 +08:00
|
|
|
return true;
|
2013-10-15 06:19:03 +08:00
|
|
|
}
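
// Worked example (illustrative numbers): if the register was last defined at
// CurInstr == 10 and we are now at CurInstr == 17, its clearance is
// 17 - 10 == 7 instructions. With a target preference of Pref == 16 we have
// 16 > 7, so the dependency is reported worth breaking and this returns true;
// had the last def been 20 or more instructions back, it would return false.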

// Update def-ages for registers defined by MI.
// If Kill is set, also kill off DomainValues clobbered by the defs.
//
// Also break dependencies on partial defs and undef uses.
void ExeDepsFix::processDefs(MachineInstr *MI, bool breakDependency,
                             bool Kill) {
  assert(!MI->isDebugValue() && "Won't process debug values");

  // Break dependence on undef uses. Do this before updating LiveRegs below.
  unsigned OpNum;
  if (breakDependency) {
    unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI);
    if (Pref) {
      pickBestRegisterForUndef(MI, OpNum, Pref);
      if (shouldBreakDependence(MI, OpNum, Pref))
        UndefReads.push_back(std::make_pair(MI, OpNum));
    }
  }
  const MCInstrDesc &MCID = MI->getDesc();
  for (unsigned i = 0,
       e = MI->isVariadic() ? MI->getNumOperands() : MCID.getNumDefs();
       i != e; ++i) {
    MachineOperand &MO = MI->getOperand(i);
    if (!MO.isReg())
      continue;
    if (MO.isUse())
      continue;
    for (int rx : regIndices(MO.getReg())) {
      // This instruction explicitly defines rx.
      DEBUG(dbgs() << TRI->getName(RC->getRegister(rx)) << ":\t" << CurInstr
                   << '\t' << *MI);

      if (breakDependency) {
        // Check clearance before partial register updates.
        // Call breakDependence before setting LiveRegs[rx].Def.
        unsigned Pref = TII->getPartialRegUpdateClearance(*MI, i, TRI);
        if (Pref && shouldBreakDependence(MI, i, Pref))
          TII->breakPartialRegDependency(*MI, i, TRI);
      }

      // How many instructions since rx was last written?
      LiveRegs[rx].Def = CurInstr;

      // Kill off domains redefined by generic instructions.
      if (Kill)
        kill(rx);
    }
  }
  ++CurInstr;
}
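
// Illustrative partial-write example (assumed X86 details): "cvtsi2sd %eax,
// %xmm0" writes only the low half of %xmm0 and so carries a false dependency
// on the previous value of %xmm0. When the clearance is below the target's
// preference, breakPartialRegDependency typically materializes a zeroing
// idiom such as "xorps %xmm0, %xmm0" in front of the instruction.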

/// \brief Break false dependencies on undefined register reads.
///
/// Walk the block backward computing precise liveness. This is expensive, so we
/// only do it on demand. Note that the occurrence of undefined register reads
/// that should be broken is very rare, but when they occur we may have many in
/// a single block.
void ExeDepsFix::processUndefReads(MachineBasicBlock *MBB) {
  if (UndefReads.empty())
    return;

  // Collect this block's live out register units.
  LiveRegSet.init(*TRI);
  // We do not need to care about pristine registers as they are just preserved
  // but not actually used in the function.
  LiveRegSet.addLiveOutsNoPristines(*MBB);

  MachineInstr *UndefMI = UndefReads.back().first;
  unsigned OpIdx = UndefReads.back().second;

  for (MachineInstr &I : make_range(MBB->rbegin(), MBB->rend())) {
    // Update liveness, including the current instruction's defs.
    LiveRegSet.stepBackward(I);

    if (UndefMI == &I) {
      if (!LiveRegSet.contains(UndefMI->getOperand(OpIdx).getReg()))
        TII->breakPartialRegDependency(*UndefMI, OpIdx, TRI);

      UndefReads.pop_back();
      if (UndefReads.empty())
        return;

      UndefMI = UndefReads.back().first;
      OpIdx = UndefReads.back().second;
    }
  }
}
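
// Illustrative scenario: suppose an instruction reads %xmm1 as undef but
// %xmm1 is still live out of the block. Breaking the dependency would insert
// a dependency-breaking instruction (on X86, typically an xorps of the
// register with itself) that clobbers the live value, so the backward scan
// above only breaks the dependency when the register is not live just before
// the reading instruction.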

// A hard instruction only works in one domain. All input registers will be
// forced into that domain.
void ExeDepsFix::visitHardInstr(MachineInstr *mi, unsigned domain) {
  // Collapse all uses.
  for (unsigned i = mi->getDesc().getNumDefs(),
                e = mi->getDesc().getNumOperands(); i != e; ++i) {
    MachineOperand &mo = mi->getOperand(i);
    if (!mo.isReg()) continue;
    for (int rx : regIndices(mo.getReg())) {
      force(rx, domain);
    }
  }

  // Kill all defs and force them.
  for (unsigned i = 0, e = mi->getDesc().getNumDefs(); i != e; ++i) {
    MachineOperand &mo = mi->getOperand(i);
    if (!mo.isReg()) continue;
    for (int rx : regIndices(mo.getReg())) {
      kill(rx);
      force(rx, domain);
    }
  }
}

// A soft instruction can be changed to work in other domains given by mask.
void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
  // Bitmask of available domains for this instruction after taking collapsed
  // operands into account.
  unsigned available = mask;

  // Scan the explicit use operands for incoming domains.
  SmallVector<int, 4> used;
  if (LiveRegs)
    for (unsigned i = mi->getDesc().getNumDefs(),
                  e = mi->getDesc().getNumOperands(); i != e; ++i) {
      MachineOperand &mo = mi->getOperand(i);
      if (!mo.isReg()) continue;
      for (int rx : regIndices(mo.getReg())) {
        DomainValue *dv = LiveRegs[rx].Value;
        if (dv == nullptr)
          continue;
        // Bitmask of domains that dv and available have in common.
        unsigned common = dv->getCommonDomains(available);
        // Is it possible to use this collapsed register for free?
        if (dv->isCollapsed()) {
          // Restrict available domains to the ones in common with the operand.
          // If there are no common domains, we must pay the cross-domain
          // penalty for this operand.
          if (common) available = common;
        } else if (common)
          // Open DomainValue is compatible, save it for merging.
          used.push_back(rx);
        else
          // Open DomainValue is not compatible with instruction. It is useless
          // now.
          kill(rx);
      }
    }

  // If the collapsed operands force a single domain, propagate the collapse.
  if (isPowerOf2_32(available)) {
    unsigned domain = countTrailingZeros(available);
    TII->setExecutionDomain(*mi, domain);
    visitHardInstr(mi, domain);
    return;
  }

  // Kill off any remaining uses that don't match available, and build a list of
  // incoming DomainValues that we want to merge.
  SmallVector<const LiveReg *, 4> Regs;
  for (int rx : used) {
    assert(LiveRegs && "no space allocated for live registers");
    const LiveReg &LR = LiveRegs[rx];
    // This useless DomainValue could have been missed above.
    if (!LR.Value->getCommonDomains(available)) {
      kill(rx);
      continue;
    }
    // Sorted insertion.
    auto I = std::upper_bound(Regs.begin(), Regs.end(), &LR,
                              [](const LiveReg *LHS, const LiveReg *RHS) {
                                return LHS->Def < RHS->Def;
                              });
    Regs.insert(I, &LR);
  }

  // Regs is now sorted in order of appearance. Try to merge them all, giving
  // priority to the latest ones.
  DomainValue *dv = nullptr;
  while (!Regs.empty()) {
    if (!dv) {
      dv = Regs.pop_back_val()->Value;
      // Force the first dv to match the current instruction.
      dv->AvailableDomains = dv->getCommonDomains(available);
      assert(dv->AvailableDomains && "Domain should have been filtered");
      continue;
    }

    DomainValue *Latest = Regs.pop_back_val()->Value;
    // Skip already merged values.
    if (Latest == dv || Latest->Next)
      continue;
    if (merge(dv, Latest))
      continue;

    // If Latest didn't merge, it is useless now. Kill all registers using it.
    for (int i : used) {
      assert(LiveRegs && "no space allocated for live registers");
      if (LiveRegs[i].Value == Latest)
        kill(i);
    }
  }

  // dv is the DomainValue we are going to use for this instruction.
  if (!dv) {
    dv = alloc();
    dv->AvailableDomains = available;
  }
  dv->Instrs.push_back(mi);

  // Finally set all defs and non-collapsed uses to dv. We must iterate through
  // all the operands, including imp-def ones.
  for (MachineInstr::mop_iterator ii = mi->operands_begin(),
                                  ee = mi->operands_end();
       ii != ee; ++ii) {
    MachineOperand &mo = *ii;
    if (!mo.isReg()) continue;
    for (int rx : regIndices(mo.getReg())) {
      if (!LiveRegs[rx].Value || (mo.isDef() && LiveRegs[rx].Value != dv)) {
        kill(rx);
        setLiveReg(rx, dv);
      }
    }
  }
}
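
// Worked mask example (illustrative domain numbering): assume domain 0 is
// integer and domain 1 is floating point, so a soft instruction that exists
// in both starts with mask == 0b11. If a collapsed operand is only available
// in the fp domain, getCommonDomains returns 0b10 and available becomes 0b10.
// That is a power of two, so countTrailingZeros yields domain 1 and the
// instruction is collapsed into the fp domain via visitHardInstr above.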

void ExeDepsFix::processBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass) {
  enterBasicBlock(MBB);
  // If this block is not done, it makes little sense to make any decisions
  // based on clearance information. We need to make a second pass anyway,
  // and by then we'll have better information, so we can avoid doing the work
  // to try and break dependencies now.
  bool breakDependency = isBlockDone(MBB);
  for (MachineInstr &MI : *MBB) {
    if (!MI.isDebugValue()) {
      bool Kill = false;
      if (PrimaryPass)
        Kill = visitInstr(&MI);
      processDefs(&MI, breakDependency, Kill);
    }
  }
  if (breakDependency)
    processUndefReads(MBB);
  leaveBasicBlock(MBB);
}
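
// A block is "done" when it will not need to be visited again: every
// predecessor has run its primary pass (IncomingProcessed == pred_size()),
// and as many predecessors have finished *all* processing as had run their
// primary pass when this block ran its own (IncomingCompleted ==
// PrimaryIncoming). Predecessors that follow this block in reverse postorder
// (typically backedges) are deliberately not waited on, since that would
// create a cyclic wait. See https://reviews.llvm.org/D28759 for the full
// reasoning.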
bool ExeDepsFix::isBlockDone(MachineBasicBlock *MBB) {
  return MBBInfos[MBB].PrimaryCompleted &&
         MBBInfos[MBB].IncomingCompleted == MBBInfos[MBB].PrimaryIncoming &&
         MBBInfos[MBB].IncomingProcessed == MBB->pred_size();
}

void ExeDepsFix::updateSuccessors(MachineBasicBlock *MBB, bool Primary) {
  bool Done = isBlockDone(MBB);
  for (auto *Succ : MBB->successors()) {
    if (!isBlockDone(Succ)) {
      if (Primary) {
        MBBInfos[Succ].IncomingProcessed++;
      }
      if (Done) {
        MBBInfos[Succ].IncomingCompleted++;
      }
      if (isBlockDone(Succ)) {
        // Perform secondary processing for this successor. See the big comment
        // in runOnMachineFunction for an explanation of the iteration order.
        processBasicBlock(Succ, false);
        updateSuccessors(Succ, false);
      }
    }
  }
}

bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
  if (skipFunction(*mf.getFunction()))
    return false;
  MF = &mf;
  TII = MF->getSubtarget().getInstrInfo();
  TRI = MF->getSubtarget().getRegisterInfo();
  RegClassInfo.runOnMachineFunction(mf);
  LiveRegs = nullptr;
  assert(NumRegs == RC->getNumRegs() && "Bad regclass");

  DEBUG(dbgs() << "********** FIX EXECUTION DEPENDENCIES: "
               << TRI->getRegClassName(RC) << " **********\n");

  // If no relevant registers are used in the function, we can skip it
  // completely.
  bool anyregs = false;
  const MachineRegisterInfo &MRI = mf.getRegInfo();
  for (unsigned Reg : *RC) {
    if (MRI.isPhysRegUsed(Reg)) {
      anyregs = true;
      break;
    }
  }
  if (!anyregs) return false;

  // Initialize the AliasMap on the first use.
  if (AliasMap.empty()) {
    // Given a PhysReg, AliasMap[PhysReg] returns a list of indices into RC and
    // therefore the LiveRegs array.
    AliasMap.resize(TRI->getNumRegs());
    for (unsigned i = 0, e = RC->getNumRegs(); i != e; ++i)
      for (MCRegAliasIterator AI(RC->getRegister(i), TRI, true);
           AI.isValid(); ++AI)
        AliasMap[*AI].push_back(i);
  }
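
  // Illustrative example (assuming an X86 run over a 128-bit vector register
  // class): %xmm0 and its wider alias %ymm0 both map to the single RC index
  // of %xmm0, so an operand written in terms of either alias is translated by
  // regIndices() to the one LiveRegs slot that tracks it.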

  // Initialize the MBBInfos.
  for (auto &MBB : mf) {
    MBBInfo InitialInfo;
    MBBInfos.insert(std::make_pair(&MBB, InitialInfo));
  }

  /*
   *  We want to visit every instruction in every basic block in order to update
   *  its execution domain or break any false dependencies. However, for the
   *  dependency breaking, we need to know clearances from all predecessors
   *  (including any backedges). One way to do so would be to do two complete
   *  passes over all basic blocks/instructions, the first for recording
   *  clearances, the second to break the dependencies. However, for functions
   *  without backedges, or functions with a lot of straight-line code, and
   *  a small loop, that would be a lot of unnecessary work (since only the
   *  BBs that are part of the loop require two passes). As an example,
   *  consider the following loop.
   *
   *
   *     PH -> A -> B (xmm<Undef> -> xmm<Def>) -> C -> D -> EXIT
   *           ^                                  |
   *           +----------------------------------+
   *
   *  The iteration order is as follows:
   *  Naive: PH A B C D A' B' C' D'
   *  Optimized: PH A B C A' B' C' D
   *
   *  Note that we avoid processing D twice, because we can entirely process
   *  the predecessors before getting to D. We call a block that is ready
   *  for its second round of processing `done` (isBlockDone). Once we finish
   *  processing some block, we update the counters in MBBInfos and re-process
   *  any successors that are now done.
   */
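
  // Illustrative trace of the counters on the loop above: when A runs its
  // primary pass, only PH has been processed, so PrimaryIncoming(A) == 1 even
  // though A has two predecessors (PH and the backedge from C). Once C
  // finishes its primary pass, IncomingProcessed(A) == 2 == pred_size(), and
  // IncomingCompleted(A) == 1 == PrimaryIncoming(A) (PH was done), so A
  // becomes done and is reprocessed immediately, which in turn completes B,
  // then C, and finally lets D run a single pass with full information.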

  MachineBasicBlock *Entry = &*MF->begin();
  ReversePostOrderTraversal<MachineBasicBlock*> RPOT(Entry);
  for (ReversePostOrderTraversal<MachineBasicBlock*>::rpo_iterator
       MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) {
    MachineBasicBlock *MBB = *MBBI;
    // N.B: IncomingProcessed and IncomingCompleted were already updated while
    // processing this block's predecessors.
    MBBInfos[MBB].PrimaryCompleted = true;
    MBBInfos[MBB].PrimaryIncoming = MBBInfos[MBB].IncomingProcessed;
    processBasicBlock(MBB, true);
    updateSuccessors(MBB, true);
  }

  // We need to go through again and finalize any blocks that are not done yet.
  // This is possible if blocks have dead predecessors, so we didn't visit them
  // above.
  for (ReversePostOrderTraversal<MachineBasicBlock *>::rpo_iterator
           MBBI = RPOT.begin(),
           MBBE = RPOT.end();
       MBBI != MBBE; ++MBBI) {
    MachineBasicBlock *MBB = *MBBI;
    if (!isBlockDone(MBB)) {
      processBasicBlock(MBB, false);
      // Don't update successors here. We'll get to them anyway through this
      // loop.
    }
  }

  // Clear the LiveOuts vectors and collapse any remaining DomainValues.
  for (ReversePostOrderTraversal<MachineBasicBlock*>::rpo_iterator
       MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) {
    auto FI = MBBInfos.find(*MBBI);
    if (FI == MBBInfos.end() || !FI->second.OutRegs)
      continue;
    for (unsigned i = 0, e = NumRegs; i != e; ++i)
      if (FI->second.OutRegs[i].Value)
        release(FI->second.OutRegs[i].Value);
    delete[] FI->second.OutRegs;
  }

  MBBInfos.clear();
  UndefReads.clear();
  Avail.clear();
  Allocator.DestroyAll();

  return false;
}

FunctionPass *
llvm::createExecutionDependencyFixPass(const TargetRegisterClass *RC) {
  return new ExeDepsFix(RC);
}
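
// Usage sketch (illustrative; the exact register class a target passes in may
// differ between LLVM versions): a target adds this pass late in its pipeline
// over the register class whose execution domains it wants fixed, e.g.
//
//   addPass(createExecutionDependencyFixPass(&X86::VR128RegClass));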