Fix graph history issue for CPU and GPU

This commit is contained in:
Parastoo Ashtari 2022-04-11 14:34:31 -04:00
parent 0af22f071a
commit 0e7cc35549
6 changed files with 26 additions and 3 deletions

View File

@ -44,7 +44,8 @@ void CPUE2eDump::DumpCNodeData(const CNodePtr &node, uint32_t graph_id) {
void CPUE2eDump::DumpRunIter(const KernelGraphPtr &graph, uint32_t rank_id) {
auto &json_parser = DumpJsonParser::GetInstance();
if (!(json_parser.e2e_dump_enabled())) {
// avoid dumping same iteration over and over
if (!(json_parser.e2e_dump_enabled()) || json_parser.cur_dump_iter() == prev_run_iter_) {
return;
}
std::string execution_order_path = json_parser.path() + "/rank_" + std::to_string(rank_id) + "/execution_order/";
@ -65,6 +66,7 @@ void CPUE2eDump::DumpRunIter(const KernelGraphPtr &graph, uint32_t rank_id) {
fout << std::to_string(json_parser.cur_dump_iter()) + "\n";
fout.close();
ChangeFileMode(file_name, S_IRUSR);
prev_run_iter_ = json_parser.cur_dump_iter();
}
void CPUE2eDump::DumpCNodeInputs(const CNodePtr &node, const std::string &dump_path) {

View File

@ -51,6 +51,8 @@ class CPUE2eDump {
static void DumpInputImpl(const CNodePtr &node, const std::string &dump_path, std::string *kernel_name);
static void DumpOutputImpl(const CNodePtr &node, const std::string &dump_path, std::string *kernel_name);
inline static unsigned int prev_run_iter_ = UINT32_MAX;
};
} // namespace mindspore
#endif // MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_CPU_E_2_E_DUMP_H_

View File

@ -292,6 +292,7 @@ void Debugger::Reset() {
graph_proto_list_.clear();
graph_ptr_list_.clear();
graph_ptr_step_vec_.clear();
executed_graph_ptr_set_.clear();
parameters_mindRT_.clear();
visited_root_graph_ids_.clear();
MS_LOG(INFO) << "Release Debugger resource.";
@ -502,8 +503,10 @@ void Debugger::DumpParamsAndConstAndHistory() {
// Dump constant data for Ascend.
DumpConstantDataAscend(graph);
}
}
for (auto kernel_graph : executed_graph_ptr_set_) {
// Dump graph run history for each graph.
E2eDump::DumpRunIter(graph, GetRankID());
E2eDump::DumpRunIter(kernel_graph, GetRankID());
}
if (!cur_root_graph_checked) {
visited_root_graph_ids_.push_back(cur_root_graph_id_);
@ -583,6 +586,7 @@ void Debugger::PostExecuteGraphDebugger() {
debugger_->PostExecute();
}
E2eDump::UpdateIterMindRTDump();
executed_graph_ptr_set_.clear();
}
/*

View File

@ -22,6 +22,7 @@
#include <utility>
#include <vector>
#include <map>
#include <set>
#include "backend/common/session/kernel_graph.h"
#include "debug/debugger/grpc_client.h"
#include "debug/debug_services.h"
@ -174,6 +175,8 @@ class BACKEND_EXPORT Debugger : public std::enable_shared_from_this<Debugger> {
// Returns a copy (by value) of graph_ptr_step_vec_, the per-step vector of kernel graph pointers.
std::vector<KernelGraphPtr> GetStepGraphPtrList() const { return graph_ptr_step_vec_; }
// Records graph_ptr in executed_graph_ptr_set_, the set of graphs that have been run in the current step.
// The container is a std::set, so inserting a pointer that is already present leaves the set unchanged.
// NOTE(review): graph_ptr is not checked for null here; some callers pass the result of
// std::dynamic_pointer_cast, which can be nullptr - confirm consumers of the set tolerate a null entry.
void InsertExecutedGraph(const KernelGraphPtr &graph_ptr) { executed_graph_ptr_set_.insert(graph_ptr); }
// Stores graph_ptr in the graph_ptr_ member, replacing whatever it held before.
void SetGraphPtr(const KernelGraphPtr &graph_ptr) { graph_ptr_ = graph_ptr; }
// Returns a copy of the pointer currently held in graph_ptr_.
const KernelGraphPtr GetGraphPtr() const { return graph_ptr_; }
@ -317,8 +320,10 @@ class BACKEND_EXPORT Debugger : public std::enable_shared_from_this<Debugger> {
std::list<GraphProto> graph_proto_list_;
std::list<KernelGraphPtr> graph_ptr_list_;
// The vector of graph pointers that have been run in the current step.
// The vector of all the kernel graph pointers for the root graph that will execute in the current step.
std::vector<KernelGraphPtr> graph_ptr_step_vec_;
// The set of graph pointers that have been run in the current step.
std::set<KernelGraphPtr> executed_graph_ptr_set_;
// The vector of all the parameters for the current step for mindRT.
std::vector<AnfNodePtr> parameters_mindRT_;
std::vector<uint32_t> visited_root_graph_ids_;

View File

@ -67,6 +67,8 @@ void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_in
#ifdef ENABLE_DEBUGGER
auto debugger = Debugger::GetInstance();
if (debugger != nullptr) {
auto kernel_graph = std::dynamic_pointer_cast<session::KernelGraph>(cnode->func_graph());
debugger->InsertExecutedGraph(kernel_graph);
std::string kernel_name = cnode->fullname_with_scope();
debugger->SetCurNode(kernel_name);
bool read_data = CheckReadData(cnode);
@ -80,6 +82,8 @@ void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_in
#ifdef ENABLE_DEBUGGER
auto debugger = Debugger::GetInstance();
if (debugger != nullptr) {
auto kernel_graph = std::dynamic_pointer_cast<session::KernelGraph>(cnode->func_graph());
debugger->InsertExecutedGraph(kernel_graph);
debugger->SetAscendKernelByKernelFlag(true);
bool read_data = CheckReadData(cnode);
if (read_data) {
@ -108,6 +112,10 @@ void DebugActor::DebugForGraph(const KernelGraphPtr &graph, const DeviceContext
MS_EXCEPTION_IF_NULL(from_aid);
MS_LOG(DEBUG) << "Super kernel debug for graph: " << graph->graph_id() << ".";
#ifdef ENABLE_DEBUGGER
auto debugger = Debugger::GetInstance();
if (debugger != nullptr) {
debugger->InsertExecutedGraph(graph);
}
LoadDataForDebugger(graph);
// This function updates graph history file and cur_dump_iter if dump is enabled.
// When e2e dump is enabled, this function dumps the graph.

View File

@ -108,6 +108,8 @@ def run_multi_root_graph_dump(device, dump_mode, test_name):
assert len(os.listdir(execution_order_path)) == 8
check_graph_structure(dump_file_path, execution_order_path, '0', ['0', '2', '4'])
check_graph_structure(dump_file_path, execution_order_path, '1', ['1', '3', '5'])
check_graph_structure(dump_file_path, execution_order_path, '2', ['1', '3'])
check_graph_structure(dump_file_path, execution_order_path, '3', ['5'])
else:
# In Ascend, we have 2 root graphs folders under rank_0 dir.
# In graph history dir, there are 4 ms_execution_order files and 2 ms_global_execution_order files.