diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeClient/bottleneck-analysis.s b/llvm/test/tools/llvm-mca/X86/SkylakeClient/bottleneck-analysis.s index 1f5e2dbf4690..98c5376ff824 100644 --- a/llvm/test/tools/llvm-mca/X86/SkylakeClient/bottleneck-analysis.s +++ b/llvm/test/tools/llvm-mca/X86/SkylakeClient/bottleneck-analysis.s @@ -67,18 +67,14 @@ # CHECK-NEXT: | 11. vmovaps %xmm7, %xmm4 # CHECK-NEXT: +----> 12. vfmadd213ps %xmm1, %xmm14, %xmm2 ## REGISTER dependency: %xmm1 # CHECK-NEXT: | 13. vmovaps %xmm6, %xmm1 -# CHECK-NEXT: | 14. vfmadd213ps %xmm2, %xmm15, %xmm3 -# CHECK-NEXT: | 15. vpermilps $170, %xmm3, %xmm0 +# CHECK-NEXT: +----> 14. vfmadd213ps %xmm2, %xmm15, %xmm3 ## REGISTER dependency: %xmm2 +# CHECK-NEXT: +----> 15. vpermilps $170, %xmm3, %xmm0 ## REGISTER dependency: %xmm3 # CHECK-NEXT: | 16. vmovups %xmm3, (%rdx,%rax) # CHECK-NEXT: | 17. vpermilps $255, %xmm3, %xmm2 # CHECK-NEXT: | 18. addq $16, %rax # CHECK-NEXT: | 19. decl %ecx -# CHECK-NEXT: | 20. vmovaps %xmm0, %xmm3 -# CHECK-NEXT: | 21. jne .LBB0_4 -# CHECK-NEXT: | -# CHECK-NEXT: | < loop carried > -# CHECK-NEXT: | -# CHECK-NEXT: +----> 2. vmulps -24(%rsp), %xmm7, %xmm8 ## RESOURCE interference: SKLPort1 [ probability: 45% ] +# CHECK-NEXT: +----> 20. vmovaps %xmm0, %xmm3 ## REGISTER dependency: %xmm0 +# CHECK-NEXT: 21. jne .LBB0_4 # CHECK: Instruction Info: # CHECK-NEXT: [1]: #uOps diff --git a/llvm/tools/llvm-mca/Views/BottleneckAnalysis.cpp b/llvm/tools/llvm-mca/Views/BottleneckAnalysis.cpp index 560c6c6e8a33..feff0cd6d524 100644 --- a/llvm/tools/llvm-mca/Views/BottleneckAnalysis.cpp +++ b/llvm/tools/llvm-mca/Views/BottleneckAnalysis.cpp @@ -165,10 +165,33 @@ void DependencyGraph::dumpDependencyEdge(raw_ostream &OS, "Unsupported dependency type!"); OS << " - RESOURCE MASK: " << DE.ResourceOrRegID; } - OS << " - CYCLES: " << DE.Cost << '\n'; + OS << " - COST: " << DE.Cost << '\n'; } #endif // NDEBUG +void DependencyGraph::pruneEdges(unsigned Iterations) { + for (DGNode &N : Nodes) { + unsigned NumPruned = 0; + const unsigned Size = N.OutgoingEdges.size(); + // Use a cut-off threshold to prune edges with a low frequency. + for (unsigned I = 0, E = Size; I < E; ++I) { + DependencyEdge &Edge = N.OutgoingEdges[I]; + if (Edge.Frequency == Iterations) + continue; + double Factor = (double)Edge.Frequency / Iterations; + if (0.10 < Factor) + continue; + Nodes[Edge.ToIID].NumPredecessors--; + std::swap(Edge, N.OutgoingEdges[E - 1]); + --E; + ++NumPruned; + } + + if (NumPruned) + N.OutgoingEdges.resize(Size - NumPruned); + } +} + void DependencyGraph::initializeRootSet( SmallVectorImpl &RootSet) const { for (unsigned I = 0, E = Nodes.size(); I < E; ++I) { @@ -179,7 +202,7 @@ void DependencyGraph::initializeRootSet( } void DependencyGraph::propagateThroughEdges( - SmallVectorImpl &RootSet) { + SmallVectorImpl &RootSet, unsigned Iterations) { SmallVector ToVisit; // A critical sequence is computed as the longest path from a node of the @@ -189,6 +212,10 @@ void DependencyGraph::propagateThroughEdges( // Each node of the graph starts with an initial default cost of zero. The // cost of a node is a measure of criticality: the higher the cost, the bigger // is the performance impact. + // For register and memory dependencies, the cost is a function of the write + // latency as well as the actual delay (in cycles) caused to users. + // For processor resource dependencies, the cost is a function of the resource + // pressure. Resource interferences with low frequency values are ignored. // // This algorithm is very similar to a (reverse) Dijkstra. Every iteration of // the inner loop selects (i.e. visits) a node N from a set of `unvisited @@ -277,6 +304,10 @@ static void printInstruction(formatted_raw_ostream &FOS, } void BottleneckAnalysis::printCriticalSequence(raw_ostream &OS) const { + // Early exit if no bottlenecks were found during the simulation. + if (!SeenStallCycles || !BPI.PressureIncreaseCycles) + return; + SmallVector Seq; DG.getCriticalSequence(Seq); if (Seq.empty()) @@ -432,7 +463,6 @@ void BottleneckAnalysis::addRegisterDep(unsigned From, unsigned To, bool IsLoopCarried = From >= To; unsigned SourceSize = Source.size(); if (IsLoopCarried) { - Cost *= Iterations / 2; DG.addRegisterDep(From, To + SourceSize, RegID, Cost); DG.addRegisterDep(From + SourceSize, To + (SourceSize * 2), RegID, Cost); return; @@ -445,7 +475,6 @@ void BottleneckAnalysis::addMemoryDep(unsigned From, unsigned To, bool IsLoopCarried = From >= To; unsigned SourceSize = Source.size(); if (IsLoopCarried) { - Cost *= Iterations / 2; DG.addMemoryDep(From, To + SourceSize, Cost); DG.addMemoryDep(From + SourceSize, To + (SourceSize * 2), Cost); return; @@ -458,7 +487,6 @@ void BottleneckAnalysis::addResourceDep(unsigned From, unsigned To, bool IsLoopCarried = From >= To; unsigned SourceSize = Source.size(); if (IsLoopCarried) { - Cost *= Iterations / 2; DG.addResourceDep(From, To + SourceSize, Mask, Cost); DG.addResourceDep(From + SourceSize, To + (SourceSize * 2), Mask, Cost); return; @@ -514,7 +542,7 @@ void BottleneckAnalysis::onEvent(const HWInstructionEvent &Event) { // Check if this is the last simulated instruction. if (IID == ((Iterations * Source.size()) - 1)) - DG.finalizeGraph(); + DG.finalizeGraph(Iterations); } void BottleneckAnalysis::onEvent(const HWPressureEvent &Event) { diff --git a/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h b/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h index 7564b1a48206..9e3bd5978f09 100644 --- a/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h +++ b/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h @@ -236,8 +236,9 @@ class DependencyGraph { void addDependency(unsigned From, unsigned To, DependencyEdge::Dependency &&DE); + void pruneEdges(unsigned Iterations); void initializeRootSet(SmallVectorImpl &RootSet) const; - void propagateThroughEdges(SmallVectorImpl &RootSet); + void propagateThroughEdges(SmallVectorImpl &RootSet, unsigned Iterations); #ifndef NDEBUG void dumpDependencyEdge(raw_ostream &OS, const DependencyEdge &DE, @@ -263,10 +264,11 @@ public: // Called by the bottleneck analysis at the end of simulation to propagate // costs through the edges of the graph, and compute a critical path. - void finalizeGraph() { + void finalizeGraph(unsigned Iterations) { SmallVector RootSet; + pruneEdges(Iterations); initializeRootSet(RootSet); - propagateThroughEdges(RootSet); + propagateThroughEdges(RootSet, Iterations); } // Returns a sequence of edges representing the critical sequence based on the