forked from mindspore-Ecosystem/mindspore
!49787 add pynative mode operator overflow check for dump
Merge pull request !49787 from maoyaomin/mym_debugger_kernel_dumper
This commit is contained in:
commit
9236c3b4e9
|
@ -88,9 +88,6 @@ bool DumpJsonParser::IsDumpEnabled() {
|
|||
|
||||
auto context = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(context);
|
||||
if (context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode) {
|
||||
MS_LOG(EXCEPTION) << "Dump is disabled in PyNative mode. Please set mode to GRAPH_MODE in context.";
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -555,12 +555,8 @@ void E2eDump::DumpRunIter(const KernelGraphPtr &graph, uint32_t rank_id) {
|
|||
return;
|
||||
}
|
||||
std::string execution_order_path = json_parser.path() + "/rank_" + std::to_string(rank_id) + "/execution_order/";
|
||||
std::string graph_str;
|
||||
if (Debugger::GetInstance()->GetAscendKernelByKernelFlag()) {
|
||||
graph_str = std::to_string(graph->graph_id());
|
||||
} else {
|
||||
graph_str = IsDeviceTargetGPU() ? std::to_string(graph->graph_id()) : std::to_string(graph->root_graph_id());
|
||||
}
|
||||
std::string graph_str =
|
||||
IsDeviceTargetGPU() ? std::to_string(graph->graph_id()) : std::to_string(graph->root_graph_id());
|
||||
std::string file_name_to_check = execution_order_path + "/ms_global_execution_order_graph_" + graph_str + ".csv";
|
||||
auto real_path = Common::CreatePrefixPath(file_name_to_check);
|
||||
if (!real_path.has_value()) {
|
||||
|
|
|
@ -513,7 +513,12 @@ void Debugger::DumpParamsAndConstAndHistory() {
|
|||
for (auto kernel_graph = executed_graph_ptr_set_.cbegin(); kernel_graph != executed_graph_ptr_set_.cend();
|
||||
++kernel_graph) {
|
||||
// Dump graph run history for each graph.
|
||||
E2eDump::DumpRunIter(*kernel_graph, GetRankID());
|
||||
if (Debugger::GetInstance()->GetAscendKernelByKernelFlag() &&
|
||||
(*kernel_graph)->graph_id() != (*kernel_graph)->root_graph_id()) {
|
||||
MS_LOG(INFO) << "current graph graph_id = " << (*kernel_graph)->graph_id() << " is not root graph.";
|
||||
} else {
|
||||
E2eDump::DumpRunIter(*kernel_graph, GetRankID());
|
||||
}
|
||||
}
|
||||
if (!cur_root_graph_checked) {
|
||||
visited_root_graph_ids_.push_back(cur_root_graph_id_);
|
||||
|
|
|
@ -215,7 +215,7 @@ void ReadDataAndDump(const CNodePtr &cnode, const KernelLaunchInfo *launch_info,
|
|||
MS_EXCEPTION_IF_NULL(kernel_graph);
|
||||
auto graph_id = kernel_graph->graph_id();
|
||||
// for GPU, nodes are dumped in graph_id directory.
|
||||
if (IsDeviceTargetGPU() || debugger->GetAscendKernelByKernelFlag()) {
|
||||
if (IsDeviceTargetGPU()) {
|
||||
debugger->DumpSingleNode(cnode, graph_id);
|
||||
} else {
|
||||
// for Ascend, nodes are dumped in root_graph_id directory.
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
|
||||
#include "plugin/device/ascend/hal/device/dump/kernel_dumper.h"
|
||||
#include <algorithm>
|
||||
#include <utility>
|
||||
#ifndef ENABLE_SECURITY
|
||||
#include "debug/data_dump/dump_json_parser.h"
|
||||
#endif
|
||||
|
@ -44,7 +45,7 @@ static constexpr uint64_t kOpDebugMemorySize = 2048;
|
|||
const size_t kDebugP2pSize = 8UL;
|
||||
} // namespace
|
||||
DUMPER_REG(kAscendDevice, KernelDumper);
|
||||
std::mutex KernelDumper::debug_register_mutex_;
|
||||
std::mutex KernelDumper::dumper_mutex_;
|
||||
std::map<rtStream_t, std::unique_ptr<OpDebugTask>> KernelDumper::op_debug_tasks;
|
||||
std::map<uint32_t, bool> KernelDumper::is_data_map;
|
||||
std::map<std::string, std::string> KernelDumper::stream_task_graphs;
|
||||
|
@ -80,9 +81,17 @@ KernelDumper::~KernelDumper() {
|
|||
}
|
||||
|
||||
void KernelDumper::OpLoadDumpInfo(const CNodePtr &kernel) {
|
||||
std::lock_guard<std::mutex> lock(debug_register_mutex_);
|
||||
aicpu::dump::OpMappingInfo dump_info;
|
||||
SetOpMappingInfo(NOT_NULL(&dump_info), kernel);
|
||||
auto stream = AscendStreamMng::GetInstance().GetStream(AnfAlgo::GetStreamId(kernel));
|
||||
if (stream == nullptr) {
|
||||
stream = AscendStreamMng::GetInstance().GetStream(kDefaultStreamIndex);
|
||||
}
|
||||
if (DumpJsonParser::GetInstance().op_debug_mode() > 0) {
|
||||
auto rt_ret = rtStreamSynchronize(stream);
|
||||
dumper_mutex_.unlock();
|
||||
if (rt_ret != ACL_ERROR_RT_AICORE_OVER_FLOW) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (!KernelNeedDump(kernel)) {
|
||||
return;
|
||||
|
@ -91,10 +100,9 @@ void KernelDumper::OpLoadDumpInfo(const CNodePtr &kernel) {
|
|||
MS_LOG(WARNING) << "[KernelDumper] kernel [" << kernel->UniqueName() << "] is a non-task node, skip dump.";
|
||||
return;
|
||||
}
|
||||
auto stream = AscendStreamMng::GetInstance().GetStream(AnfAlgo::GetStreamId(kernel));
|
||||
if (stream == nullptr) {
|
||||
stream = AscendStreamMng::GetInstance().GetStream(kDefaultStreamIndex);
|
||||
}
|
||||
aicpu::dump::OpMappingInfo dump_info;
|
||||
SetOpMappingInfo(NOT_NULL(&dump_info), kernel);
|
||||
|
||||
DumpJsonParser::GetInstance().MatchKernel(kernel->fullname_with_scope());
|
||||
aicpu::dump::Task task;
|
||||
ConstructDumpTask(NOT_NULL(kernel), NOT_NULL(&task));
|
||||
|
@ -105,7 +113,7 @@ void KernelDumper::OpLoadDumpInfo(const CNodePtr &kernel) {
|
|||
graph_id_ = AnfAlgo::GetGraphId(kernel.get());
|
||||
std::string stream_task_id = std::to_string(stream_id_) + std::to_string(task_id_);
|
||||
KernelDumper::stream_task_graphs.emplace(stream_task_id, kernel->fullname_with_scope());
|
||||
MS_LOG(INFO) << "[DataDump] Get runtime info graph_id:" << graph_id_ << " stream_id:" << stream_id_
|
||||
MS_LOG(INFO) << "[KernelDumper] Get runtime info graph_id:" << graph_id_ << " stream_id:" << stream_id_
|
||||
<< " task_id:" << task_id_ << " fullname:" << kernel->fullname_with_scope();
|
||||
}
|
||||
|
||||
|
@ -114,12 +122,12 @@ void KernelDumper::SetOpMappingInfo(NotNull<aicpu::dump::OpMappingInfo *> dump_i
|
|||
dump_info->set_dump_path(dump_path_);
|
||||
dump_info->set_model_name(net_name_);
|
||||
dump_info->set_dump_step(iteration_);
|
||||
auto graph_id = AnfAlgo::GetGraphId(kernel.get());
|
||||
dump_info->set_model_id(graph_id);
|
||||
dump_info->set_flag(kAicpuLoadFlag);
|
||||
|
||||
FuncGraphPtr f_graph = kernel->func_graph();
|
||||
auto kernel_graph_ = f_graph->cast<KernelGraphPtr>();
|
||||
auto root_graph_id = kernel_graph_->root_graph_id();
|
||||
dump_info->set_model_id(root_graph_id);
|
||||
dump_info->set_flag(kAicpuLoadFlag);
|
||||
|
||||
auto input_ctrl_tensors = kernel_graph_->device_loop_control_tensors();
|
||||
if (input_ctrl_tensors.size() > 0) {
|
||||
auto kCurLoopCountName = "current_loop_count";
|
||||
|
@ -225,7 +233,6 @@ void KernelDumper::ExecutorDumpOp(const aicpu::dump::OpMappingInfo &op_mapping_i
|
|||
MS_LOG(ERROR) << "[KernelDumper] Call rt api rtCpuKernelLaunch Failed, rt_ret = " << rt_ret;
|
||||
return;
|
||||
}
|
||||
rtStreamSynchronize(stream_);
|
||||
}
|
||||
|
||||
void KernelDumper::ConstructDumpTask(NotNull<const CNodePtr &> kernel, NotNull<aicpu::dump::Task *> dump_task) {
|
||||
|
@ -375,7 +382,6 @@ void KernelDumper::MallocP2PDebugMem(const void *const op_debug_addr) {
|
|||
}
|
||||
|
||||
void KernelDumper::OpDebugRegisterForStream(const CNodePtr &kernel) {
|
||||
std::lock_guard<std::mutex> lock(register_mutex_);
|
||||
uint32_t op_debug_mode = DumpJsonParser::GetInstance().op_debug_mode();
|
||||
auto iter = kOverflowModeStr.find(op_debug_mode);
|
||||
if (iter == kOverflowModeStr.end()) {
|
||||
|
@ -384,6 +390,7 @@ void KernelDumper::OpDebugRegisterForStream(const CNodePtr &kernel) {
|
|||
if (op_debug_mode == kNoOverflow) {
|
||||
return;
|
||||
}
|
||||
dumper_mutex_.lock();
|
||||
auto stream = AscendStreamMng::GetInstance().GetStream(AnfAlgo::GetStreamId(kernel));
|
||||
if (stream == nullptr) {
|
||||
stream = AscendStreamMng::GetInstance().GetStream(kDefaultStreamIndex);
|
||||
|
@ -391,6 +398,8 @@ void KernelDumper::OpDebugRegisterForStream(const CNodePtr &kernel) {
|
|||
if (KernelDumper::op_debug_tasks.find(stream) != KernelDumper::op_debug_tasks.end()) {
|
||||
return;
|
||||
} else {
|
||||
std::string stream_id = std::to_string(AnfAlgo::GetStreamId(kernel));
|
||||
KernelDumper::stream_task_graphs.emplace(stream_id, "KernelDumper");
|
||||
auto graph_id = AnfAlgo::GetGraphId(kernel.get());
|
||||
if (KernelDumper::is_data_map.find(graph_id) != KernelDumper::is_data_map.end()) {
|
||||
return;
|
||||
|
|
|
@ -72,6 +72,7 @@ class KernelDumper : public debug::OverflowDumper {
|
|||
static std::map<rtStream_t, std::unique_ptr<OpDebugTask>> op_debug_tasks;
|
||||
static std::map<uint32_t, bool> is_data_map;
|
||||
static std::map<std::string, std::string> stream_task_graphs;
|
||||
static std::mutex dumper_mutex_;
|
||||
|
||||
string dump_path_;
|
||||
string net_name_;
|
||||
|
@ -79,7 +80,6 @@ class KernelDumper : public debug::OverflowDumper {
|
|||
|
||||
private:
|
||||
// Support multi-thread.
|
||||
static std::mutex debug_register_mutex_;
|
||||
bool load_flag_;
|
||||
uint32_t graph_id_;
|
||||
uint32_t task_id_{0U};
|
||||
|
@ -91,7 +91,6 @@ class KernelDumper : public debug::OverflowDumper {
|
|||
void *dev_load_mem_ = nullptr;
|
||||
void *proto_dev_mem_ = nullptr;
|
||||
void *proto_size_dev_mem_ = nullptr;
|
||||
std::mutex register_mutex_;
|
||||
std::string overflow_dump_filename = "debug_files";
|
||||
void *p2p_debug_addr_ = nullptr;
|
||||
void SetOpMappingInfo(NotNull<aicpu::dump::OpMappingInfo *> dump_info, const CNodePtr &kernel);
|
||||
|
|
|
@ -43,6 +43,7 @@
|
|||
#include "plugin/device/ascend/hal/profiler/ascend_profiling.h"
|
||||
#include "plugin/device/ascend/hal/device/profiling/profiling_manager.h"
|
||||
#include "plugin/device/ascend/hal/device/dump/ascend_dump.h"
|
||||
#include "debug/data_dump/overflow_dumper.h"
|
||||
|
||||
using Adx::AdxRegDumpProcessCallBack;
|
||||
using mindspore::device::ascend::ProfilingManager;
|
||||
|
@ -370,7 +371,13 @@ bool AscendKernelExecutor::LaunchKernel(const CNodePtr &kernel, const vector<Add
|
|||
stream = AscendStreamMng::GetInstance().GetStream(kDefaultStreamIndex);
|
||||
}
|
||||
MS_EXCEPTION_IF_NULL(stream);
|
||||
|
||||
#ifdef ENABLE_DEBUGGER
|
||||
if (DumpJsonParser::GetInstance().async_dump_enabled()) {
|
||||
auto register_dumper = debug::OverflowDumper::GetInstance(kAscendDevice);
|
||||
register_dumper->Init();
|
||||
register_dumper->OpDebugRegisterForStream(kernel);
|
||||
}
|
||||
#endif
|
||||
bool is_dynamic_shape = common::AnfAlgo::IsDynamicShape(kernel);
|
||||
if (!is_dynamic_shape || !(common::AnfAlgo::GetBooleanAttr(kernel, kAttrMSFunction))) {
|
||||
auto iter = node_atomics_persistent_cache_.find(kernel);
|
||||
|
@ -399,6 +406,12 @@ bool AscendKernelExecutor::LaunchKernel(const CNodePtr &kernel, const vector<Add
|
|||
return false;
|
||||
}
|
||||
}
|
||||
#ifdef ENABLE_DEBUGGER
|
||||
if (DumpJsonParser::GetInstance().async_dump_enabled()) {
|
||||
auto kernel_dumper = debug::OverflowDumper::GetInstance(kAscendDevice);
|
||||
kernel_dumper->OpLoadDumpInfo(kernel);
|
||||
}
|
||||
#endif
|
||||
#ifndef ENABLE_SECURITY
|
||||
auto ascend_instance = profiler::ascend::AscendProfiler::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(ascend_instance);
|
||||
|
|
|
@ -24,7 +24,6 @@
|
|||
#ifndef ENABLE_SECURITY
|
||||
#include "debug/data_dump/cpu_e2e_dump.h"
|
||||
#include "debug/data_dump/e2e_dump.h"
|
||||
#include "debug/data_dump/overflow_dumper.h"
|
||||
#include "utils/ms_context.h"
|
||||
#endif
|
||||
#ifdef ENABLE_DEBUGGER
|
||||
|
@ -80,18 +79,9 @@ void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_in
|
|||
#endif
|
||||
} else if (device_context->GetDeviceType() == device::DeviceType::kAscend) {
|
||||
#ifdef ENABLE_DEBUGGER
|
||||
#ifndef ENABLE_SECURITY
|
||||
auto kernel_graph = std::dynamic_pointer_cast<session::KernelGraph>(cnode->func_graph());
|
||||
graph_id_sets_.insert(kernel_graph->graph_id());
|
||||
if (DumpJsonParser::GetInstance().async_dump_enabled()) {
|
||||
auto kernel_dumper = debug::OverflowDumper::GetInstance(kAscendDevice);
|
||||
kernel_dumper->Init();
|
||||
kernel_dumper->OpDebugRegisterForStream(cnode);
|
||||
kernel_dumper->OpLoadDumpInfo(cnode);
|
||||
}
|
||||
#endif
|
||||
auto debugger = Debugger::GetInstance();
|
||||
if (debugger != nullptr) {
|
||||
auto kernel_graph = std::dynamic_pointer_cast<session::KernelGraph>(cnode->func_graph());
|
||||
debugger->InsertExecutedGraph(kernel_graph);
|
||||
debugger->SetAscendKernelByKernelFlag(true);
|
||||
bool read_data = CheckReadData(cnode);
|
||||
|
@ -186,7 +176,7 @@ void DebugActor::DebugOnStepBegin(const std::vector<KernelGraphPtr> &graphs,
|
|||
return kernel->fullname_with_scope().find("InitDataSetQueue") != std::string::npos;
|
||||
});
|
||||
}
|
||||
if (!is_data_map_ && !graphs[0]->is_graph_run_mode()) {
|
||||
if (!is_data_map_) {
|
||||
auto kCurLoopCountName = "current_loop_count";
|
||||
for (size_t i = 0; i < graphs.size(); i++) {
|
||||
const auto &graph_ = graphs[i];
|
||||
|
@ -200,7 +190,7 @@ void DebugActor::DebugOnStepBegin(const std::vector<KernelGraphPtr> &graphs,
|
|||
}
|
||||
auto tensor = device_loop_control_tensors.at(kCurLoopCountName);
|
||||
MS_EXCEPTION_IF_NULL(tensor);
|
||||
auto *cur_val = static_cast<int32_t *>(tensor->data_c());
|
||||
auto *cur_val = static_cast<int64_t *>(tensor->data_c());
|
||||
MS_EXCEPTION_IF_NULL(cur_val);
|
||||
*cur_val = current_step;
|
||||
tensor->set_sync_status(kNeedSyncHostToDevice);
|
||||
|
@ -236,21 +226,6 @@ void DebugActor::DebugOnStepEnd(OpContext<DeviceTensor> *const op_context, const
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_DEBUGGER
|
||||
#ifndef ENABLE_SECURITY
|
||||
if (DumpJsonParser::GetInstance().async_dump_enabled() && DumpJsonParser::GetInstance().op_debug_mode() > 0 &&
|
||||
Debugger::GetInstance()->GetAscendKernelByKernelFlag()) {
|
||||
uint32_t rank_id = Debugger::GetRankID();
|
||||
std::set<uint32_t>::iterator graph_id_iter;
|
||||
for (graph_id_iter = graph_id_sets_.begin(); graph_id_iter != graph_id_sets_.end(); ++graph_id_iter) {
|
||||
auto graph_id = *graph_id_iter;
|
||||
DeleteNoOverflowFile(rank_id, graph_id);
|
||||
}
|
||||
graph_id_sets_.clear();
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_DEBUGGER
|
||||
auto debugger = Debugger::GetInstance();
|
||||
if (debugger != nullptr) {
|
||||
|
|
|
@ -55,7 +55,7 @@ class DebugActor : public ActorBase {
|
|||
|
||||
// The debug on step end.
|
||||
void DebugOnStepEnd(OpContext<DeviceTensor> *const op_context, const AID *from_aid);
|
||||
static inline uint32_t current_step{0};
|
||||
static inline uint64_t current_step{0};
|
||||
|
||||
private:
|
||||
// class members
|
||||
|
@ -63,7 +63,6 @@ class DebugActor : public ActorBase {
|
|||
|
||||
// Support multi-thread.
|
||||
std::mutex debug_mutex_;
|
||||
std::set<uint32_t> graph_id_sets_;
|
||||
};
|
||||
|
||||
} // namespace runtime
|
||||
|
|
|
@ -101,8 +101,8 @@ def run_multi_root_graph_dump(device, dump_mode, test_name):
|
|||
time.sleep(2)
|
||||
execution_order_path = os.path.join(dump_path, 'rank_0', 'execution_order')
|
||||
# Multi root graph script: check dump data dir and graph history files and see if iteration number is matched.
|
||||
if device == "GPU" or os.environ.get('GRAPH_OP_RUN') == "1":
|
||||
# In GPU or KernelByKernel, we have 4 kernel graphs folders under rank_0 dir.
|
||||
if device == "GPU":
|
||||
# In GPU, we have 4 kernel graphs folders under rank_0 dir.
|
||||
# In graph history dir, there are 2 files for each graph (ms_execution_order and ms_global_execution_order).
|
||||
assert len(os.listdir(dump_file_path)) == 4
|
||||
assert len(os.listdir(execution_order_path)) == 8
|
||||
|
@ -111,7 +111,7 @@ def run_multi_root_graph_dump(device, dump_mode, test_name):
|
|||
check_graph_structure(dump_file_path, execution_order_path, '2', ['1', '3'])
|
||||
check_graph_structure(dump_file_path, execution_order_path, '3', ['5'])
|
||||
else:
|
||||
# In Ascend Super Kernel, we have 2 root graphs folders under rank_0 dir.
|
||||
# In Ascend, we have 2 root graphs folders under rank_0 dir.
|
||||
# In graph history dir, there are 4 ms_execution_order files and 2 ms_global_execution_order files.
|
||||
# Each graph should have 3 iterations. Each graph was executed once per epoch.
|
||||
# Graph 0 was executed in even iterations, graph 1 was executed in odd iterations.
|
||||
|
|
Loading…
Reference in New Issue