[MCA] Improved cost computation for loop carried dependencies in the bottleneck analysis.

This patch introduces a cut-off threshold for dependency edge frequences with
the goal of simplifying the critical sequence computation.  This patch also
removes the cost normalization for loop carried dependencies.  We didn't really
need to artificially amplify the cost of loop-carried dependencies since it is
already computed as the integral over time of the delay (in cycle).

In the absence of backend stalls there is no need for computing a critical
sequence. With this patch we early exit from the critical sequence computation
if no bottleneck was reported during the simulation.

llvm-svn: 372337
This commit is contained in:
Andrea Di Biagio 2019-09-19 16:05:11 +00:00
parent 7cb60fb00f
commit e0900f285b
3 changed files with 43 additions and 17 deletions

View File

@ -67,18 +67,14 @@
# CHECK-NEXT: | 11. vmovaps %xmm7, %xmm4
# CHECK-NEXT: +----> 12. vfmadd213ps %xmm1, %xmm14, %xmm2 ## REGISTER dependency: %xmm1
# CHECK-NEXT: | 13. vmovaps %xmm6, %xmm1
# CHECK-NEXT: | 14. vfmadd213ps %xmm2, %xmm15, %xmm3
# CHECK-NEXT: | 15. vpermilps $170, %xmm3, %xmm0
# CHECK-NEXT: +----> 14. vfmadd213ps %xmm2, %xmm15, %xmm3 ## REGISTER dependency: %xmm2
# CHECK-NEXT: +----> 15. vpermilps $170, %xmm3, %xmm0 ## REGISTER dependency: %xmm3
# CHECK-NEXT: | 16. vmovups %xmm3, (%rdx,%rax)
# CHECK-NEXT: | 17. vpermilps $255, %xmm3, %xmm2
# CHECK-NEXT: | 18. addq $16, %rax
# CHECK-NEXT: | 19. decl %ecx
# CHECK-NEXT: | 20. vmovaps %xmm0, %xmm3
# CHECK-NEXT: | 21. jne .LBB0_4
# CHECK-NEXT: |
# CHECK-NEXT: | < loop carried >
# CHECK-NEXT: |
# CHECK-NEXT: +----> 2. vmulps -24(%rsp), %xmm7, %xmm8 ## RESOURCE interference: SKLPort1 [ probability: 45% ]
# CHECK-NEXT: +----> 20. vmovaps %xmm0, %xmm3 ## REGISTER dependency: %xmm0
# CHECK-NEXT: 21. jne .LBB0_4
# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps

View File

@ -165,10 +165,33 @@ void DependencyGraph::dumpDependencyEdge(raw_ostream &OS,
"Unsupported dependency type!");
OS << " - RESOURCE MASK: " << DE.ResourceOrRegID;
}
OS << " - CYCLES: " << DE.Cost << '\n';
OS << " - COST: " << DE.Cost << '\n';
}
#endif // NDEBUG
void DependencyGraph::pruneEdges(unsigned Iterations) {
for (DGNode &N : Nodes) {
unsigned NumPruned = 0;
const unsigned Size = N.OutgoingEdges.size();
// Use a cut-off threshold to prune edges with a low frequency.
for (unsigned I = 0, E = Size; I < E; ++I) {
DependencyEdge &Edge = N.OutgoingEdges[I];
if (Edge.Frequency == Iterations)
continue;
double Factor = (double)Edge.Frequency / Iterations;
if (0.10 < Factor)
continue;
Nodes[Edge.ToIID].NumPredecessors--;
std::swap(Edge, N.OutgoingEdges[E - 1]);
--E;
++NumPruned;
}
if (NumPruned)
N.OutgoingEdges.resize(Size - NumPruned);
}
}
void DependencyGraph::initializeRootSet(
SmallVectorImpl<unsigned> &RootSet) const {
for (unsigned I = 0, E = Nodes.size(); I < E; ++I) {
@ -179,7 +202,7 @@ void DependencyGraph::initializeRootSet(
}
void DependencyGraph::propagateThroughEdges(
SmallVectorImpl<unsigned> &RootSet) {
SmallVectorImpl<unsigned> &RootSet, unsigned Iterations) {
SmallVector<unsigned, 8> ToVisit;
// A critical sequence is computed as the longest path from a node of the
@ -189,6 +212,10 @@ void DependencyGraph::propagateThroughEdges(
// Each node of the graph starts with an initial default cost of zero. The
// cost of a node is a measure of criticality: the higher the cost, the bigger
// is the performance impact.
// For register and memory dependencies, the cost is a function of the write
// latency as well as the actual delay (in cycles) caused to users.
// For processor resource dependencies, the cost is a function of the resource
// pressure. Resource interferences with low frequency values are ignored.
//
// This algorithm is very similar to a (reverse) Dijkstra. Every iteration of
// the inner loop selects (i.e. visits) a node N from a set of `unvisited
@ -277,6 +304,10 @@ static void printInstruction(formatted_raw_ostream &FOS,
}
void BottleneckAnalysis::printCriticalSequence(raw_ostream &OS) const {
// Early exit if no bottlenecks were found during the simulation.
if (!SeenStallCycles || !BPI.PressureIncreaseCycles)
return;
SmallVector<const DependencyEdge *, 16> Seq;
DG.getCriticalSequence(Seq);
if (Seq.empty())
@ -432,7 +463,6 @@ void BottleneckAnalysis::addRegisterDep(unsigned From, unsigned To,
bool IsLoopCarried = From >= To;
unsigned SourceSize = Source.size();
if (IsLoopCarried) {
Cost *= Iterations / 2;
DG.addRegisterDep(From, To + SourceSize, RegID, Cost);
DG.addRegisterDep(From + SourceSize, To + (SourceSize * 2), RegID, Cost);
return;
@ -445,7 +475,6 @@ void BottleneckAnalysis::addMemoryDep(unsigned From, unsigned To,
bool IsLoopCarried = From >= To;
unsigned SourceSize = Source.size();
if (IsLoopCarried) {
Cost *= Iterations / 2;
DG.addMemoryDep(From, To + SourceSize, Cost);
DG.addMemoryDep(From + SourceSize, To + (SourceSize * 2), Cost);
return;
@ -458,7 +487,6 @@ void BottleneckAnalysis::addResourceDep(unsigned From, unsigned To,
bool IsLoopCarried = From >= To;
unsigned SourceSize = Source.size();
if (IsLoopCarried) {
Cost *= Iterations / 2;
DG.addResourceDep(From, To + SourceSize, Mask, Cost);
DG.addResourceDep(From + SourceSize, To + (SourceSize * 2), Mask, Cost);
return;
@ -514,7 +542,7 @@ void BottleneckAnalysis::onEvent(const HWInstructionEvent &Event) {
// Check if this is the last simulated instruction.
if (IID == ((Iterations * Source.size()) - 1))
DG.finalizeGraph();
DG.finalizeGraph(Iterations);
}
void BottleneckAnalysis::onEvent(const HWPressureEvent &Event) {

View File

@ -236,8 +236,9 @@ class DependencyGraph {
void addDependency(unsigned From, unsigned To,
DependencyEdge::Dependency &&DE);
void pruneEdges(unsigned Iterations);
void initializeRootSet(SmallVectorImpl<unsigned> &RootSet) const;
void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet);
void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet, unsigned Iterations);
#ifndef NDEBUG
void dumpDependencyEdge(raw_ostream &OS, const DependencyEdge &DE,
@ -263,10 +264,11 @@ public:
// Called by the bottleneck analysis at the end of simulation to propagate
// costs through the edges of the graph, and compute a critical path.
void finalizeGraph() {
void finalizeGraph(unsigned Iterations) {
SmallVector<unsigned, 16> RootSet;
pruneEdges(Iterations);
initializeRootSet(RootSet);
propagateThroughEdges(RootSet);
propagateThroughEdges(RootSet, Iterations);
}
// Returns a sequence of edges representing the critical sequence based on the