From 3a845be7d1b7e2c4e7f4f2047960dc619389b31a Mon Sep 17 00:00:00 2001 From: River Riddle Date: Wed, 27 Mar 2019 14:02:26 -0700 Subject: [PATCH] Add support for multi-threaded pass timing. When multi-threading is enabled in the pass manager the meaning of the display slightly changes. First, a new timing column is added, `User Time`, that displays the total time spent across all threads. Secondly, the `Wall Time` column displays the longest individual time spent amongst all of the threads. This means that the `Wall Time` column will continue to give an indicator on the perceived time, or clock time, whereas the `User Time` will display the total cpu time. Example: $ mlir-opt foo.mlir -experimental-mt-pm -cse -canonicalize -convert-to-llvmir -pass-timing ===-------------------------------------------------------------------------=== ... Pass execution timing report ... ===-------------------------------------------------------------------------=== Total Execution Time: 0.0078 seconds ---User Time--- ---Wall Time--- --- Name --- 0.0175 ( 88.3%) 0.0055 ( 70.4%) Function Pipeline 0.0018 ( 9.3%) 0.0006 ( 8.1%) CSE 0.0013 ( 6.3%) 0.0004 ( 5.8%) (A) DominanceInfo 0.0017 ( 8.7%) 0.0006 ( 7.1%) FunctionVerifier 0.0128 ( 64.6%) 0.0039 ( 50.5%) Canonicalizer 0.0011 ( 5.7%) 0.0004 ( 4.7%) FunctionVerifier 0.0004 ( 2.1%) 0.0004 ( 5.2%) ModuleVerifier 0.0010 ( 5.3%) 0.0010 ( 13.4%) LLVMLowering 0.0009 ( 4.3%) 0.0009 ( 11.0%) ModuleVerifier 0.0198 (100.0%) 0.0078 (100.0%) Total PiperOrigin-RevId: 240636269 --- mlir/g3doc/WritingAPass.md | 31 +++++ mlir/lib/Pass/PassTiming.cpp | 204 +++++++++++++++++++++++++------- mlir/test/Pass/pass-timing.mlir | 49 +++++++- 3 files changed, 236 insertions(+), 48 deletions(-) diff --git a/mlir/g3doc/WritingAPass.md b/mlir/g3doc/WritingAPass.md index def3a07016b8..cfa3ecddccc2 100644 --- a/mlir/g3doc/WritingAPass.md +++ b/mlir/g3doc/WritingAPass.md @@ -463,6 +463,37 @@ $ mlir-opt foo.mlir -cse -canonicalize -convert-to-llvmir -pass-timing 0.0249 (100.0%) Total ``` +##### Multi-threaded Pass Timing + +When multi-threading is enabled in the pass manager the meaning of the display +slightly changes. First, a new timing column is added, `User Time`, that +displays the total time spent across all threads. Secondly, the `Wall Time` +column displays the longest individual time spent amongst all of the threads. +This means that the `Wall Time` column will continue to give an indicator on the +perceived time, or clock time, whereas the `User Time` will display the total +cpu time. + +```shell +$ mlir-opt foo.mlir -experimental-mt-pm -cse -canonicalize -convert-to-llvmir -pass-timing + +===-------------------------------------------------------------------------=== + ... Pass execution timing report ... +===-------------------------------------------------------------------------=== + Total Execution Time: 0.0078 seconds + + ---User Time--- ---Wall Time--- --- Name --- + 0.0175 ( 88.3%) 0.0055 ( 70.4%) Function Pipeline + 0.0018 ( 9.3%) 0.0006 ( 8.1%) CSE + 0.0013 ( 6.3%) 0.0004 ( 5.8%) (A) DominanceInfo + 0.0017 ( 8.7%) 0.0006 ( 7.1%) FunctionVerifier + 0.0128 ( 64.6%) 0.0039 ( 50.5%) Canonicalizer + 0.0011 ( 5.7%) 0.0004 ( 4.7%) FunctionVerifier + 0.0004 ( 2.1%) 0.0004 ( 5.2%) ModuleVerifier + 0.0010 ( 5.3%) 0.0010 ( 13.4%) LLVMLowering + 0.0009 ( 4.3%) 0.0009 ( 11.0%) ModuleVerifier + 0.0198 (100.0%) 0.0078 (100.0%) Total +``` + #### IR Printing When debugging it is often useful to dump the IR at various stages of a pass diff --git a/mlir/lib/Pass/PassTiming.cpp b/mlir/lib/Pass/PassTiming.cpp index d158ea116a75..0da018412974 100644 --- a/mlir/lib/Pass/PassTiming.cpp +++ b/mlir/lib/Pass/PassTiming.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Support/Format.h" #include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/Threading.h" #include using namespace mlir; @@ -31,6 +32,28 @@ constexpr llvm::StringLiteral kPassTimingDescription = "... Pass execution timing report ..."; namespace { +/// Simple record class to record timing information. +struct TimeRecord { + TimeRecord(double wall = 0.0, double user = 0.0) : wall(wall), user(user) {} + + TimeRecord &operator+=(const TimeRecord &other) { + wall += other.wall; + user += other.user; + return *this; + } + + /// Print the current time record to 'os', with a breakdown showing + /// contributions to the give 'total' time record. + void print(raw_ostream &os, const TimeRecord &total) { + if (total.user != total.wall) + os << llvm::format(" %7.4f (%5.1f%%) ", user, + 100.0 * user / total.user); + os << llvm::format(" %7.4f (%5.1f%%) ", wall, 100.0 * wall / total.wall); + } + + double wall, user; +}; + struct Timer { explicit Timer(std::string &&name) : name(std::move(name)) {} @@ -38,7 +61,11 @@ struct Timer { void start() { startTime = std::chrono::system_clock::now(); } /// Stop the timer. - void stop() { total += (std::chrono::system_clock::now() - startTime); } + void stop() { + auto newTime = std::chrono::system_clock::now() - startTime; + wallTime += newTime; + userTime += newTime; + } /// Get or create a child timer with the provided name and id. Timer *getChildTimer(const void *id, @@ -50,26 +77,73 @@ struct Timer { } /// Returns the total time for this timer in seconds. - double getTotalTime() { - // If the total has a count, then we directly compute the seconds. - if (total.count()) { - return std::chrono::duration_cast>(total) - .count(); + TimeRecord getTotalTime() { + // If we have a valid wall time, then we directly compute the seconds. + if (wallTime.count()) { + return TimeRecord( + std::chrono::duration_cast>(wallTime) + .count(), + std::chrono::duration_cast>(userTime) + .count()); } // Otheriwse, accumulate the timing from each of the children. - double totalTime = 0.0; + TimeRecord totalTime; for (auto &child : children) totalTime += child.second->getTotalTime(); return totalTime; } + /// A map of unique identifiers to child timers. + using ChildrenMap = llvm::MapVector>; + + /// Merge the timing data from 'other' into this timer. + void merge(Timer &&other) { + if (wallTime < other.wallTime) + wallTime = other.wallTime; + userTime += other.userTime; + mergeChildren(std::move(other.children), /*isStructural=*/false); + } + + /// Merge the timer chilren in 'otherChildren' with the children of this + /// timer. If 'isStructural' is true, the children are merged lexographically + /// and 'otherChildren' must have the same number of elements as the children + /// of this timer. Otherwise, the timer children are merged based upon the + /// given timer key. + void mergeChildren(ChildrenMap &&otherChildren, bool isStructural) { + // Check for an empty children list. + if (children.empty()) { + children = std::move(otherChildren); + return; + } + + if (isStructural) { + // If this is a structural merge, the number of children must be the same. + assert(children.size() == otherChildren.size() && + "structural merge requires the same number of children"); + auto it = children.begin(), otherIt = otherChildren.begin(); + for (auto e = children.end(); it != e; ++it, ++otherIt) + it->second->merge(std::move(*otherIt->second)); + return; + } + + // Otherwise, we merge based upon the child timers key. + for (auto &otherChild : otherChildren) { + auto &child = children[otherChild.first]; + if (!child) + child = std::move(otherChild.second); + else + child->merge(std::move(*otherChild.second)); + } + } + /// Raw timing information. std::chrono::time_point startTime; - std::chrono::nanoseconds total = std::chrono::nanoseconds(0); + std::chrono::nanoseconds wallTime = std::chrono::nanoseconds(0); + std::chrono::nanoseconds userTime = std::chrono::nanoseconds(0); /// A map of unique identifiers to child timers. - llvm::MapVector> children; + ChildrenMap children; /// A descriptive name for this timer. std::string name; @@ -110,23 +184,38 @@ struct PassTiming : public PassInstrumentation { void stopTimer(); /// Print the timing result in list mode. - void printResultsAsList(llvm::raw_ostream &os, double totalTime); + void printResultsAsList(raw_ostream &os, Timer *root, TimeRecord totalTime); /// Print the timing result in pipeline mode. - void printResultsAsPipeline(llvm::raw_ostream &os, double totalTime); + void printResultsAsPipeline(raw_ostream &os, Timer *root, + TimeRecord totalTime); /// Returns a timer for the provided identifier and name. Timer *getTimer(const void *id, std::function &&nameBuilder) { - if (activeTimers.empty()) - return rootTimer.getChildTimer(id, std::move(nameBuilder)); - return activeTimers.back()->getChildTimer(id, std::move(nameBuilder)); + auto tid = llvm::get_threadid(); + + // If there is no active timer then add to the root timer. + auto &activeTimers = activeThreadTimers[tid]; + if (activeTimers.empty()) { + auto &rootTimer = rootTimers[tid]; + if (!rootTimer) + rootTimer.reset(new Timer("root")); + auto *timer = rootTimer->getChildTimer(id, std::move(nameBuilder)); + activeTimers.push_back(timer); + return timer; + } + + // Otherwise, add this to the active timer. + auto timer = activeTimers.back()->getChildTimer(id, std::move(nameBuilder)); + activeTimers.push_back(timer); + return timer; } - /// The root top level timer. - Timer rootTimer = Timer("root"); + /// The root top level timers for each thread. + DenseMap> rootTimers; - /// A stack of the currently active pass timers. - SmallVector activeTimers; + /// A stack of the currently active pass timers per thread. + DenseMap> activeThreadTimers; /// The display mode to use when printing the timing results. PassTimingDisplayMode displayMode; @@ -140,7 +229,6 @@ void PassTiming::startPassTimer(Pass *pass) { return StringRef("Function Pipeline"); return pass->getName(); }); - activeTimers.push_back(timer); // We don't actually want to time the adaptor passes, they gather their total // from their held passes. @@ -151,15 +239,38 @@ void PassTiming::startPassTimer(Pass *pass) { /// Start a new timer for the given analysis. void PassTiming::startAnalysisTimer(llvm::StringRef name, AnalysisID *id) { Timer *timer = getTimer(id, [name] { return "(A) " + name.str(); }); - activeTimers.push_back(timer); timer->start(); } /// Stop a pass timer. void PassTiming::runAfterPass(Pass *pass, const llvm::Any &) { + auto tid = llvm::get_threadid(); + auto &activeTimers = activeThreadTimers[tid]; assert(!activeTimers.empty() && "expected active timer"); Timer *timer = activeTimers.pop_back_val(); + // If this is an ModuleToFunctionPassAdaptorParallel, then we need to merge in + // the timing data for the other threads. + if (auto *asyncMTFPass = + dyn_cast(pass)) { + // The asychronous pipeline timers should exist as children of root timers + // for other threads. + for (auto &rootTimer : llvm::make_early_inc_range(rootTimers)) { + // Skip the current thread. + if (rootTimer.first == tid) + continue; + // Check that this thread has no active timers. + assert(activeThreadTimers[tid].empty() && "expected no active timers"); + + // Structurally merge this timers children into the parallel + // module-to-function pass timer. + timer->mergeChildren(std::move(rootTimer.second->children), + /*isStructural=*/true); + rootTimers.erase(rootTimer.first); + } + return; + } + // Adapator passes aren't timed directly, so we don't need to stop their // timers. if (!isAdaptorPass(pass)) @@ -169,13 +280,14 @@ void PassTiming::runAfterPass(Pass *pass, const llvm::Any &) { /// Stop a timer. void PassTiming::runAfterAnalysis(llvm::StringRef, AnalysisID *, const llvm::Any &) { + auto &activeTimers = activeThreadTimers[llvm::get_threadid()]; assert(!activeTimers.empty() && "expected active timer"); Timer *timer = activeTimers.pop_back_val(); timer->stop(); } /// Utility to print the timer heading information. -static void printTimerHeader(llvm::raw_ostream &os, double total) { +static void printTimerHeader(llvm::raw_ostream &os, TimeRecord total) { os << "===" << std::string(73, '-') << "===\n"; // Figure out how many spaces to description name. unsigned Padding = (80 - kPassTimingDescription.size()) / 2; @@ -183,69 +295,76 @@ static void printTimerHeader(llvm::raw_ostream &os, double total) { os << "===" << std::string(73, '-') << "===\n"; // Print the total time followed by the section headers. - os << llvm::format(" Total Execution Time: %5.4f seconds\n\n", total); + os << llvm::format(" Total Execution Time: %5.4f seconds\n\n", total.wall); + if (total.user != total.wall) + os << " ---User Time---"; os << " ---Wall Time--- --- Name ---\n"; } /// Utility to print a single line entry in the timer output. static void printTimeEntry(raw_ostream &os, unsigned indent, StringRef name, - double time, double totalTime) { - os << llvm::format(" %7.4f (%5.1f%%) ", time, 100.0 * time / totalTime); + TimeRecord time, TimeRecord totalTime) { + time.print(os, totalTime); os.indent(indent) << name << "\n"; } /// Print out the current timing information. void PassTiming::print() { // Don't print anything if there is no timing data. - if (rootTimer.children.empty()) + if (rootTimers.empty()) return; + + assert(rootTimers.size() == 1 && "expected one remaining root timer"); + auto &rootTimer = rootTimers.begin()->second; auto os = llvm::CreateInfoOutputFile(); // Print the timer header. - double totalTime = rootTimer.getTotalTime(); + TimeRecord totalTime = rootTimer->getTotalTime(); printTimerHeader(*os, totalTime); // Defer to a specialized printer for each display mode. switch (displayMode) { case PassTimingDisplayMode::List: - printResultsAsList(*os, totalTime); + printResultsAsList(*os, rootTimer.get(), totalTime); break; case PassTimingDisplayMode::Pipeline: - printResultsAsPipeline(*os, totalTime); + printResultsAsPipeline(*os, rootTimer.get(), totalTime); break; } printTimeEntry(*os, 0, "Total", totalTime, totalTime); os->flush(); - // Reset root timer. - rootTimer.children.clear(); + // Reset root timers. + rootTimers.clear(); + activeThreadTimers.clear(); } /// Print the timing result in list mode. -void PassTiming::printResultsAsList(llvm::raw_ostream &os, double totalTime) { - llvm::StringMap mergedTimings; +void PassTiming::printResultsAsList(raw_ostream &os, Timer *root, + TimeRecord totalTime) { + llvm::StringMap mergedTimings; std::function addTimer = [&](Timer *timer) { // Check for timing information. - if (timer->total.count()) + if (timer->wallTime.count()) mergedTimings[timer->name] += timer->getTotalTime(); for (auto &children : timer->children) addTimer(children.second.get()); }; // Add each of the top level timers. - for (auto &topLevelTimer : rootTimer.children) + for (auto &topLevelTimer : root->children) addTimer(topLevelTimer.second.get()); - // Sort the timing information. - std::vector> timerNameAndTime; + // Sort the timing information by wall time. + std::vector> timerNameAndTime; for (auto &it : mergedTimings) timerNameAndTime.emplace_back(it.first(), it.second); llvm::array_pod_sort(timerNameAndTime.begin(), timerNameAndTime.end(), - [](const std::pair *lhs, - const std::pair *rhs) { + [](const std::pair *lhs, + const std::pair *rhs) { return llvm::array_pod_sort_comparator( - &rhs->second, &lhs->second); + &rhs->second.wall, &lhs->second.wall); }); // Print the timing information sequentially. @@ -254,18 +373,17 @@ void PassTiming::printResultsAsList(llvm::raw_ostream &os, double totalTime) { } /// Print the timing result in pipeline mode. -void PassTiming::printResultsAsPipeline(llvm::raw_ostream &os, - double totalTime) { +void PassTiming::printResultsAsPipeline(raw_ostream &os, Timer *root, + TimeRecord totalTime) { std::function printTimer = [&](unsigned indent, Timer *timer) { - // Check for timing information. printTimeEntry(os, indent, timer->name, timer->getTotalTime(), totalTime); for (auto &children : timer->children) printTimer(indent + 2, children.second.get()); }; // Print each of the top level timers. - for (auto &topLevelTimer : rootTimer.children) + for (auto &topLevelTimer : root->children) printTimer(0, topLevelTimer.second.get()); } diff --git a/mlir/test/Pass/pass-timing.mlir b/mlir/test/Pass/pass-timing.mlir index af2a156e1678..15eb6077da35 100644 --- a/mlir/test/Pass/pass-timing.mlir +++ b/mlir/test/Pass/pass-timing.mlir @@ -1,9 +1,7 @@ // RUN: mlir-opt %s -verify-each=true -cse -canonicalize -cse -pass-timing -pass-timing-display=list 2>&1 | FileCheck -check-prefix=LIST %s // RUN: mlir-opt %s -verify-each=true -cse -canonicalize -cse -pass-timing -pass-timing-display=pipeline 2>&1 | FileCheck -check-prefix=PIPELINE %s - -func @foo() { - return -} +// RUN: mlir-opt %s -experimental-mt-pm=true -verify-each=true -cse -canonicalize -cse -pass-timing -pass-timing-display=list 2>&1 | FileCheck -check-prefix=MT_LIST %s +// RUN: mlir-opt %s -experimental-mt-pm=true -verify-each=true -cse -canonicalize -cse -pass-timing -pass-timing-display=pipeline 2>&1 | FileCheck -check-prefix=MT_PIPELINE %s // LIST: Pass execution timing report // LIST: Total Execution Time: @@ -13,7 +11,7 @@ func @foo() { // LIST-DAG: CSE // LIST-DAG: ModuleVerifier // LIST-DAG: DominanceInfo -// LIST: {{.*}} Total +// LIST: Total // PIPELINE: Pass execution timing report // PIPELINE: Total Execution Time: @@ -29,3 +27,44 @@ func @foo() { // PIPELINE-NEXT: FunctionVerifier // PIPELINE-NEXT: ModuleVerifier // PIPELINE-NEXT: Total + +// MT_LIST: Pass execution timing report +// MT_LIST: Total Execution Time: +// MT_LIST: ---User Time--- ---Wall Time--- --- Name --- +// MT_LIST-DAG: Canonicalizer +// MT_LIST-DAG: FunctionVerifier +// MT_LIST-DAG: CSE +// MT_LIST-DAG: ModuleVerifier +// MT_LIST-DAG: DominanceInfo +// MT_LIST: Total + +// MT_PIPELINE: Pass execution timing report +// MT_PIPELINE: Total Execution Time: +// MT_PIPELINE: ---User Time--- ---Wall Time--- --- Name --- +// MT_PIPELINE-NEXT: Function Pipeline +// MT_PIPELINE-NEXT: CSE +// MT_PIPELINE-NEXT: (A) DominanceInfo +// MT_PIPELINE-NEXT: FunctionVerifier +// MT_PIPELINE-NEXT: Canonicalizer +// MT_PIPELINE-NEXT: FunctionVerifier +// MT_PIPELINE-NEXT: CSE +// MT_PIPELINE-NEXT: (A) DominanceInfo +// MT_PIPELINE-NEXT: FunctionVerifier +// MT_PIPELINE-NEXT: ModuleVerifier +// MT_PIPELINE-NEXT: Total + +func @foo() { + return +} + +func @bar() { + return +} + +func @baz() { + return +} + +func @foobar() { + return +}