[MCA] Improved cost computation for loop carried dependencies in the bottleneck analysis.

This patch introduces a cut-off threshold for dependency edge frequences with the goal of simplifying the critical sequence computation. This patch also removes the cost normalization for loop carried dependencies. We didn't really need to artificially amplify the cost of loop-carried dependencies since it is already computed as the integral over time of the delay (in cycle). In the absence of backend stalls there is no need for computing a critical sequence. With this patch we early exit from the critical sequence computation if no bottleneck was reported during the simulation. llvm-svn: 372337
2019-09-19 16:05:11 +00:00 · 2019-09-19 16:05:11 +00:00 · e0900f285b
parent 7cb60fb00f
commit e0900f285b
3 changed files with 43 additions and 17 deletions
--- a/llvm/test/tools/llvm-mca/X86/SkylakeClient/bottleneck-analysis.s
+++ b/llvm/test/tools/llvm-mca/X86/SkylakeClient/bottleneck-analysis.s
@ -67,18 +67,14 @@
 # CHECK-NEXT:  |      11.   vmovaps	%xmm7, %xmm4
 # CHECK-NEXT:  +----> 12.   vfmadd213ps	%xmm1, %xmm14, %xmm2      ## REGISTER dependency:  %xmm1
 # CHECK-NEXT:  |      13.   vmovaps	%xmm6, %xmm1
-# CHECK-NEXT:  |      14.   vfmadd213ps	%xmm2, %xmm15, %xmm3
-# CHECK-NEXT:  |      15.   vpermilps	$170, %xmm3, %xmm0
+# CHECK-NEXT:  +----> 14.   vfmadd213ps	%xmm2, %xmm15, %xmm3      ## REGISTER dependency:  %xmm2
+# CHECK-NEXT:  +----> 15.   vpermilps	$170, %xmm3, %xmm0                ## REGISTER dependency:  %xmm3
 # CHECK-NEXT:  |      16.   vmovups	%xmm3, (%rdx,%rax)
 # CHECK-NEXT:  |      17.   vpermilps	$255, %xmm3, %xmm2
 # CHECK-NEXT:  |      18.   addq	$16, %rax
 # CHECK-NEXT:  |      19.   decl	%ecx
-# CHECK-NEXT:  |      20.   vmovaps	%xmm0, %xmm3
-# CHECK-NEXT:  |      21.   jne	.LBB0_4
-# CHECK-NEXT:  |
-# CHECK-NEXT:  |    < loop carried >
-# CHECK-NEXT:  |
-# CHECK-NEXT:  +----> 2.    vmulps	-24(%rsp), %xmm7, %xmm8           ## RESOURCE interference:  SKLPort1 [ probability: 45% ]
+# CHECK-NEXT:  +----> 20.   vmovaps	%xmm0, %xmm3                      ## REGISTER dependency:  %xmm0
+# CHECK-NEXT:         21.   jne	.LBB0_4

 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
--- a/llvm/tools/llvm-mca/Views/BottleneckAnalysis.cpp
+++ b/llvm/tools/llvm-mca/Views/BottleneckAnalysis.cpp
@ -165,10 +165,33 @@ void DependencyGraph::dumpDependencyEdge(raw_ostream &OS,
           "Unsupported dependency type!");
    OS << " - RESOURCE MASK: " << DE.ResourceOrRegID;
  }
-  OS << " - CYCLES: " << DE.Cost << '\n';
+  OS << " - COST: " << DE.Cost << '\n';
 }
 #endif // NDEBUG

+void DependencyGraph::pruneEdges(unsigned Iterations) {
+  for (DGNode &N : Nodes) {
+    unsigned NumPruned = 0;
+    const unsigned Size = N.OutgoingEdges.size();
+    // Use a cut-off threshold to prune edges with a low frequency.
+    for (unsigned I = 0, E = Size; I < E; ++I) {
+      DependencyEdge &Edge = N.OutgoingEdges[I];
+      if (Edge.Frequency == Iterations)
+        continue;
+      double Factor = (double)Edge.Frequency / Iterations;
+      if (0.10 < Factor)
+        continue;
+      Nodes[Edge.ToIID].NumPredecessors--;
+      std::swap(Edge, N.OutgoingEdges[E - 1]);
+      --E;
+      ++NumPruned;
+    }
+
+    if (NumPruned)
+      N.OutgoingEdges.resize(Size - NumPruned);
+  }
+}
+
 void DependencyGraph::initializeRootSet(
    SmallVectorImpl<unsigned> &RootSet) const {
  for (unsigned I = 0, E = Nodes.size(); I < E; ++I) {
@ -179,7 +202,7 @@ void DependencyGraph::initializeRootSet(
 }

 void DependencyGraph::propagateThroughEdges(
-    SmallVectorImpl<unsigned> &RootSet) {
+    SmallVectorImpl<unsigned> &RootSet, unsigned Iterations) {
  SmallVector<unsigned, 8> ToVisit;

  // A critical sequence is computed as the longest path from a node of the
@ -189,6 +212,10 @@ void DependencyGraph::propagateThroughEdges(
  // Each node of the graph starts with an initial default cost of zero.  The
  // cost of a node is a measure of criticality: the higher the cost, the bigger
  // is the performance impact.
+  // For register and memory dependencies, the cost is a function of the write
+  // latency as well as the actual delay (in cycles) caused to users.
+  // For processor resource dependencies, the cost is a function of the resource
+  // pressure. Resource interferences with low frequency values are ignored.
  //
  // This algorithm is very similar to a (reverse) Dijkstra.  Every iteration of
  // the inner loop selects (i.e. visits) a node N from a set of `unvisited
@ -277,6 +304,10 @@ static void printInstruction(formatted_raw_ostream &FOS,
 }

 void BottleneckAnalysis::printCriticalSequence(raw_ostream &OS) const {
+  // Early exit if no bottlenecks were found during the simulation.
+  if (!SeenStallCycles || !BPI.PressureIncreaseCycles)
+    return;
+
  SmallVector<const DependencyEdge *, 16> Seq;
  DG.getCriticalSequence(Seq);
  if (Seq.empty())
@ -432,7 +463,6 @@ void BottleneckAnalysis::addRegisterDep(unsigned From, unsigned To,
  bool IsLoopCarried = From >= To;
  unsigned SourceSize = Source.size();
  if (IsLoopCarried) {
-    Cost *= Iterations / 2;
    DG.addRegisterDep(From, To + SourceSize, RegID, Cost);
    DG.addRegisterDep(From + SourceSize, To + (SourceSize * 2), RegID, Cost);
    return;
@ -445,7 +475,6 @@ void BottleneckAnalysis::addMemoryDep(unsigned From, unsigned To,
  bool IsLoopCarried = From >= To;
  unsigned SourceSize = Source.size();
  if (IsLoopCarried) {
-    Cost *= Iterations / 2;
    DG.addMemoryDep(From, To + SourceSize, Cost);
    DG.addMemoryDep(From + SourceSize, To + (SourceSize * 2), Cost);
    return;
@ -458,7 +487,6 @@ void BottleneckAnalysis::addResourceDep(unsigned From, unsigned To,
  bool IsLoopCarried = From >= To;
  unsigned SourceSize = Source.size();
  if (IsLoopCarried) {
-    Cost *= Iterations / 2;
    DG.addResourceDep(From, To + SourceSize, Mask, Cost);
    DG.addResourceDep(From + SourceSize, To + (SourceSize * 2), Mask, Cost);
    return;
@ -514,7 +542,7 @@ void BottleneckAnalysis::onEvent(const HWInstructionEvent &Event) {

  // Check if this is the last simulated instruction.
  if (IID == ((Iterations * Source.size()) - 1))
-    DG.finalizeGraph();
+    DG.finalizeGraph(Iterations);
 }

 void BottleneckAnalysis::onEvent(const HWPressureEvent &Event) {
--- a/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h
+++ b/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h
@ -236,8 +236,9 @@ class DependencyGraph {
  void addDependency(unsigned From, unsigned To,
                     DependencyEdge::Dependency &&DE);

+  void pruneEdges(unsigned Iterations);
  void initializeRootSet(SmallVectorImpl<unsigned> &RootSet) const;
-  void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet);
+  void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet, unsigned Iterations);

 #ifndef NDEBUG
  void dumpDependencyEdge(raw_ostream &OS, const DependencyEdge &DE,
@ -263,10 +264,11 @@ public:

  // Called by the bottleneck analysis at the end of simulation to propagate
  // costs through the edges of the graph, and compute a critical path.
-  void finalizeGraph() {
+  void finalizeGraph(unsigned Iterations) {
    SmallVector<unsigned, 16> RootSet;
+    pruneEdges(Iterations);
    initializeRootSet(RootSet);
-    propagateThroughEdges(RootSet);
+    propagateThroughEdges(RootSet, Iterations);
  }

  // Returns a sequence of edges representing the critical sequence based on the