forked from mindspore-Ecosystem/mindspore
Fix graph history issue for CPU and GPU
This commit is contained in:
parent
0af22f071a
commit
0e7cc35549
|
@ -44,7 +44,8 @@ void CPUE2eDump::DumpCNodeData(const CNodePtr &node, uint32_t graph_id) {
|
|||
|
||||
void CPUE2eDump::DumpRunIter(const KernelGraphPtr &graph, uint32_t rank_id) {
|
||||
auto &json_parser = DumpJsonParser::GetInstance();
|
||||
if (!(json_parser.e2e_dump_enabled())) {
|
||||
// avoid dumping same iteration over and over
|
||||
if (!(json_parser.e2e_dump_enabled()) || json_parser.cur_dump_iter() == prev_run_iter_) {
|
||||
return;
|
||||
}
|
||||
std::string execution_order_path = json_parser.path() + "/rank_" + std::to_string(rank_id) + "/execution_order/";
|
||||
|
@ -65,6 +66,7 @@ void CPUE2eDump::DumpRunIter(const KernelGraphPtr &graph, uint32_t rank_id) {
|
|||
fout << std::to_string(json_parser.cur_dump_iter()) + "\n";
|
||||
fout.close();
|
||||
ChangeFileMode(file_name, S_IRUSR);
|
||||
prev_run_iter_ = json_parser.cur_dump_iter();
|
||||
}
|
||||
|
||||
void CPUE2eDump::DumpCNodeInputs(const CNodePtr &node, const std::string &dump_path) {
|
||||
|
|
|
@ -51,6 +51,8 @@ class CPUE2eDump {
|
|||
static void DumpInputImpl(const CNodePtr &node, const std::string &dump_path, std::string *kernel_name);
|
||||
|
||||
static void DumpOutputImpl(const CNodePtr &node, const std::string &dump_path, std::string *kernel_name);
|
||||
|
||||
inline static unsigned int prev_run_iter_ = UINT32_MAX;
|
||||
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_CPU_E_2_E_DUMP_H_
|
||||
|
|
|
@ -292,6 +292,7 @@ void Debugger::Reset() {
|
|||
graph_proto_list_.clear();
|
||||
graph_ptr_list_.clear();
|
||||
graph_ptr_step_vec_.clear();
|
||||
executed_graph_ptr_set_.clear();
|
||||
parameters_mindRT_.clear();
|
||||
visited_root_graph_ids_.clear();
|
||||
MS_LOG(INFO) << "Release Debugger resource.";
|
||||
|
@ -502,8 +503,10 @@ void Debugger::DumpParamsAndConstAndHistory() {
|
|||
// Dump constant data for Ascend.
|
||||
DumpConstantDataAscend(graph);
|
||||
}
|
||||
}
|
||||
for (auto kernel_graph : executed_graph_ptr_set_) {
|
||||
// Dump graph run history for each graph.
|
||||
E2eDump::DumpRunIter(graph, GetRankID());
|
||||
E2eDump::DumpRunIter(kernel_graph, GetRankID());
|
||||
}
|
||||
if (!cur_root_graph_checked) {
|
||||
visited_root_graph_ids_.push_back(cur_root_graph_id_);
|
||||
|
@ -583,6 +586,7 @@ void Debugger::PostExecuteGraphDebugger() {
|
|||
debugger_->PostExecute();
|
||||
}
|
||||
E2eDump::UpdateIterMindRTDump();
|
||||
executed_graph_ptr_set_.clear();
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
#include <utility>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include "backend/common/session/kernel_graph.h"
|
||||
#include "debug/debugger/grpc_client.h"
|
||||
#include "debug/debug_services.h"
|
||||
|
@ -174,6 +175,8 @@ class BACKEND_EXPORT Debugger : public std::enable_shared_from_this<Debugger> {
|
|||
|
||||
std::vector<KernelGraphPtr> GetStepGraphPtrList() const { return graph_ptr_step_vec_; }
|
||||
|
||||
void InsertExecutedGraph(const KernelGraphPtr &graph_ptr) { executed_graph_ptr_set_.insert(graph_ptr); }
|
||||
|
||||
void SetGraphPtr(const KernelGraphPtr &graph_ptr) { graph_ptr_ = graph_ptr; }
|
||||
|
||||
const KernelGraphPtr GetGraphPtr() const { return graph_ptr_; }
|
||||
|
@ -317,8 +320,10 @@ class BACKEND_EXPORT Debugger : public std::enable_shared_from_this<Debugger> {
|
|||
|
||||
std::list<GraphProto> graph_proto_list_;
|
||||
std::list<KernelGraphPtr> graph_ptr_list_;
|
||||
// The vector of graph pointers that have been run in the current step.
|
||||
// The vector of all the kernel graph pointers for the root graph that will execute in the current step.
|
||||
std::vector<KernelGraphPtr> graph_ptr_step_vec_;
|
||||
// The set of graph pointers that have been run in the current step.
|
||||
std::set<KernelGraphPtr> executed_graph_ptr_set_;
|
||||
// The vector of all the parameters for the current step for mindRT.
|
||||
std::vector<AnfNodePtr> parameters_mindRT_;
|
||||
std::vector<uint32_t> visited_root_graph_ids_;
|
||||
|
|
|
@ -67,6 +67,8 @@ void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_in
|
|||
#ifdef ENABLE_DEBUGGER
|
||||
auto debugger = Debugger::GetInstance();
|
||||
if (debugger != nullptr) {
|
||||
auto kernel_graph = std::dynamic_pointer_cast<session::KernelGraph>(cnode->func_graph());
|
||||
debugger->InsertExecutedGraph(kernel_graph);
|
||||
std::string kernel_name = cnode->fullname_with_scope();
|
||||
debugger->SetCurNode(kernel_name);
|
||||
bool read_data = CheckReadData(cnode);
|
||||
|
@ -80,6 +82,8 @@ void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_in
|
|||
#ifdef ENABLE_DEBUGGER
|
||||
auto debugger = Debugger::GetInstance();
|
||||
if (debugger != nullptr) {
|
||||
auto kernel_graph = std::dynamic_pointer_cast<session::KernelGraph>(cnode->func_graph());
|
||||
debugger->InsertExecutedGraph(kernel_graph);
|
||||
debugger->SetAscendKernelByKernelFlag(true);
|
||||
bool read_data = CheckReadData(cnode);
|
||||
if (read_data) {
|
||||
|
@ -108,6 +112,10 @@ void DebugActor::DebugForGraph(const KernelGraphPtr &graph, const DeviceContext
|
|||
MS_EXCEPTION_IF_NULL(from_aid);
|
||||
MS_LOG(DEBUG) << "Super kernel debug for graph: " << graph->graph_id() << ".";
|
||||
#ifdef ENABLE_DEBUGGER
|
||||
auto debugger = Debugger::GetInstance();
|
||||
if (debugger != nullptr) {
|
||||
debugger->InsertExecutedGraph(graph);
|
||||
}
|
||||
LoadDataForDebugger(graph);
|
||||
// This function updates graph history file and cur_dump_iter if dump is enabled.
|
||||
// When e2e dump is enabled, this function dumps the graph.
|
||||
|
|
|
@ -108,6 +108,8 @@ def run_multi_root_graph_dump(device, dump_mode, test_name):
|
|||
assert len(os.listdir(execution_order_path)) == 8
|
||||
check_graph_structure(dump_file_path, execution_order_path, '0', ['0', '2', '4'])
|
||||
check_graph_structure(dump_file_path, execution_order_path, '1', ['1', '3', '5'])
|
||||
check_graph_structure(dump_file_path, execution_order_path, '2', ['1', '3'])
|
||||
check_graph_structure(dump_file_path, execution_order_path, '3', ['5'])
|
||||
else:
|
||||
# In Ascend, we have 2 root graph folders under rank_0 dir.
|
||||
# In graph history dir, there are 4 ms_execution_order files and 2 ms_global_execution_order files.
|
||||
|
|
Loading…
Reference in New Issue