add pynative mode operator overflow check for dump

This commit is contained in:
maoyaomin 2023-03-04 17:46:28 +08:00
parent f55df887d4
commit 67bfdcab09
10 changed files with 55 additions and 62 deletions

View File

@ -88,9 +88,6 @@ bool DumpJsonParser::IsDumpEnabled() {
auto context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context);
if (context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode) {
MS_LOG(EXCEPTION) << "Dump is disabled in PyNative mode. Please set mode to GRAPH_MODE in context.";
}
return true;
}

View File

@ -555,12 +555,8 @@ void E2eDump::DumpRunIter(const KernelGraphPtr &graph, uint32_t rank_id) {
return;
}
std::string execution_order_path = json_parser.path() + "/rank_" + std::to_string(rank_id) + "/execution_order/";
std::string graph_str;
if (Debugger::GetInstance()->GetAscendKernelByKernelFlag()) {
graph_str = std::to_string(graph->graph_id());
} else {
graph_str = IsDeviceTargetGPU() ? std::to_string(graph->graph_id()) : std::to_string(graph->root_graph_id());
}
std::string graph_str =
IsDeviceTargetGPU() ? std::to_string(graph->graph_id()) : std::to_string(graph->root_graph_id());
std::string file_name_to_check = execution_order_path + "/ms_global_execution_order_graph_" + graph_str + ".csv";
auto real_path = Common::CreatePrefixPath(file_name_to_check);
if (!real_path.has_value()) {

View File

@ -513,7 +513,12 @@ void Debugger::DumpParamsAndConstAndHistory() {
for (auto kernel_graph = executed_graph_ptr_set_.cbegin(); kernel_graph != executed_graph_ptr_set_.cend();
++kernel_graph) {
// Dump graph run history for each graph.
E2eDump::DumpRunIter(*kernel_graph, GetRankID());
if (Debugger::GetInstance()->GetAscendKernelByKernelFlag() &&
(*kernel_graph)->graph_id() != (*kernel_graph)->root_graph_id()) {
MS_LOG(INFO) << "current graph graph_id = " << (*kernel_graph)->graph_id() << " is not root graph.";
} else {
E2eDump::DumpRunIter(*kernel_graph, GetRankID());
}
}
if (!cur_root_graph_checked) {
visited_root_graph_ids_.push_back(cur_root_graph_id_);

View File

@ -215,7 +215,7 @@ void ReadDataAndDump(const CNodePtr &cnode, const KernelLaunchInfo *launch_info,
MS_EXCEPTION_IF_NULL(kernel_graph);
auto graph_id = kernel_graph->graph_id();
// for GPU, nodes are dumped in graph_id directory.
if (IsDeviceTargetGPU() || debugger->GetAscendKernelByKernelFlag()) {
if (IsDeviceTargetGPU()) {
debugger->DumpSingleNode(cnode, graph_id);
} else {
// for Ascend, nodes are dumped in root_graph_id directory.

View File

@ -16,6 +16,7 @@
#include "plugin/device/ascend/hal/device/dump/kernel_dumper.h"
#include <algorithm>
#include <utility>
#ifndef ENABLE_SECURITY
#include "debug/data_dump/dump_json_parser.h"
#endif
@ -44,7 +45,7 @@ static constexpr uint64_t kOpDebugMemorySize = 2048;
const size_t kDebugP2pSize = 8UL;
} // namespace
DUMPER_REG(kAscendDevice, KernelDumper);
std::mutex KernelDumper::debug_register_mutex_;
std::mutex KernelDumper::dumper_mutex_;
std::map<rtStream_t, std::unique_ptr<OpDebugTask>> KernelDumper::op_debug_tasks;
std::map<uint32_t, bool> KernelDumper::is_data_map;
std::map<std::string, std::string> KernelDumper::stream_task_graphs;
@ -80,9 +81,17 @@ KernelDumper::~KernelDumper() {
}
void KernelDumper::OpLoadDumpInfo(const CNodePtr &kernel) {
std::lock_guard<std::mutex> lock(debug_register_mutex_);
aicpu::dump::OpMappingInfo dump_info;
SetOpMappingInfo(NOT_NULL(&dump_info), kernel);
auto stream = AscendStreamMng::GetInstance().GetStream(AnfAlgo::GetStreamId(kernel));
if (stream == nullptr) {
stream = AscendStreamMng::GetInstance().GetStream(kDefaultStreamIndex);
}
if (DumpJsonParser::GetInstance().op_debug_mode() > 0) {
auto rt_ret = rtStreamSynchronize(stream);
dumper_mutex_.unlock();
if (rt_ret != ACL_ERROR_RT_AICORE_OVER_FLOW) {
return;
}
}
if (!KernelNeedDump(kernel)) {
return;
@ -91,10 +100,9 @@ void KernelDumper::OpLoadDumpInfo(const CNodePtr &kernel) {
MS_LOG(WARNING) << "[KernelDumper] kernel [" << kernel->UniqueName() << "] is a non-task node, skip dump.";
return;
}
auto stream = AscendStreamMng::GetInstance().GetStream(AnfAlgo::GetStreamId(kernel));
if (stream == nullptr) {
stream = AscendStreamMng::GetInstance().GetStream(kDefaultStreamIndex);
}
aicpu::dump::OpMappingInfo dump_info;
SetOpMappingInfo(NOT_NULL(&dump_info), kernel);
DumpJsonParser::GetInstance().MatchKernel(kernel->fullname_with_scope());
aicpu::dump::Task task;
ConstructDumpTask(NOT_NULL(kernel), NOT_NULL(&task));
@ -105,7 +113,7 @@ void KernelDumper::OpLoadDumpInfo(const CNodePtr &kernel) {
graph_id_ = AnfAlgo::GetGraphId(kernel.get());
std::string stream_task_id = std::to_string(stream_id_) + std::to_string(task_id_);
KernelDumper::stream_task_graphs.emplace(stream_task_id, kernel->fullname_with_scope());
MS_LOG(INFO) << "[DataDump] Get runtime info graph_id:" << graph_id_ << " stream_id:" << stream_id_
MS_LOG(INFO) << "[KernelDumper] Get runtime info graph_id:" << graph_id_ << " stream_id:" << stream_id_
<< " task_id:" << task_id_ << " fullname:" << kernel->fullname_with_scope();
}
@ -114,12 +122,12 @@ void KernelDumper::SetOpMappingInfo(NotNull<aicpu::dump::OpMappingInfo *> dump_i
dump_info->set_dump_path(dump_path_);
dump_info->set_model_name(net_name_);
dump_info->set_dump_step(iteration_);
auto graph_id = AnfAlgo::GetGraphId(kernel.get());
dump_info->set_model_id(graph_id);
dump_info->set_flag(kAicpuLoadFlag);
FuncGraphPtr f_graph = kernel->func_graph();
auto kernel_graph_ = f_graph->cast<KernelGraphPtr>();
auto root_graph_id = kernel_graph_->root_graph_id();
dump_info->set_model_id(root_graph_id);
dump_info->set_flag(kAicpuLoadFlag);
auto input_ctrl_tensors = kernel_graph_->device_loop_control_tensors();
if (input_ctrl_tensors.size() > 0) {
auto kCurLoopCountName = "current_loop_count";
@ -225,7 +233,6 @@ void KernelDumper::ExecutorDumpOp(const aicpu::dump::OpMappingInfo &op_mapping_i
MS_LOG(ERROR) << "[KernelDumper] Call rt api rtCpuKernelLaunch Failed, rt_ret = " << rt_ret;
return;
}
rtStreamSynchronize(stream_);
}
void KernelDumper::ConstructDumpTask(NotNull<const CNodePtr &> kernel, NotNull<aicpu::dump::Task *> dump_task) {
@ -375,7 +382,6 @@ void KernelDumper::MallocP2PDebugMem(const void *const op_debug_addr) {
}
void KernelDumper::OpDebugRegisterForStream(const CNodePtr &kernel) {
std::lock_guard<std::mutex> lock(register_mutex_);
uint32_t op_debug_mode = DumpJsonParser::GetInstance().op_debug_mode();
auto iter = kOverflowModeStr.find(op_debug_mode);
if (iter == kOverflowModeStr.end()) {
@ -384,6 +390,7 @@ void KernelDumper::OpDebugRegisterForStream(const CNodePtr &kernel) {
if (op_debug_mode == kNoOverflow) {
return;
}
dumper_mutex_.lock();
auto stream = AscendStreamMng::GetInstance().GetStream(AnfAlgo::GetStreamId(kernel));
if (stream == nullptr) {
stream = AscendStreamMng::GetInstance().GetStream(kDefaultStreamIndex);
@ -391,6 +398,8 @@ void KernelDumper::OpDebugRegisterForStream(const CNodePtr &kernel) {
if (KernelDumper::op_debug_tasks.find(stream) != KernelDumper::op_debug_tasks.end()) {
return;
} else {
std::string stream_id = std::to_string(AnfAlgo::GetStreamId(kernel));
KernelDumper::stream_task_graphs.emplace(stream_id, "KernelDumper");
auto graph_id = AnfAlgo::GetGraphId(kernel.get());
if (KernelDumper::is_data_map.find(graph_id) != KernelDumper::is_data_map.end()) {
return;

View File

@ -72,6 +72,7 @@ class KernelDumper : public debug::OverflowDumper {
static std::map<rtStream_t, std::unique_ptr<OpDebugTask>> op_debug_tasks;
static std::map<uint32_t, bool> is_data_map;
static std::map<std::string, std::string> stream_task_graphs;
static std::mutex dumper_mutex_;
string dump_path_;
string net_name_;
@ -79,7 +80,6 @@ class KernelDumper : public debug::OverflowDumper {
private:
// Support multi-thread.
static std::mutex debug_register_mutex_;
bool load_flag_;
uint32_t graph_id_;
uint32_t task_id_{0U};
@ -91,7 +91,6 @@ class KernelDumper : public debug::OverflowDumper {
void *dev_load_mem_ = nullptr;
void *proto_dev_mem_ = nullptr;
void *proto_size_dev_mem_ = nullptr;
std::mutex register_mutex_;
std::string overflow_dump_filename = "debug_files";
void *p2p_debug_addr_ = nullptr;
void SetOpMappingInfo(NotNull<aicpu::dump::OpMappingInfo *> dump_info, const CNodePtr &kernel);

View File

@ -43,6 +43,7 @@
#include "plugin/device/ascend/hal/profiler/ascend_profiling.h"
#include "plugin/device/ascend/hal/device/profiling/profiling_manager.h"
#include "plugin/device/ascend/hal/device/dump/ascend_dump.h"
#include "debug/data_dump/overflow_dumper.h"
using Adx::AdxRegDumpProcessCallBack;
using mindspore::device::ascend::ProfilingManager;
@ -370,7 +371,13 @@ bool AscendKernelExecutor::LaunchKernel(const CNodePtr &kernel, const vector<Add
stream = AscendStreamMng::GetInstance().GetStream(kDefaultStreamIndex);
}
MS_EXCEPTION_IF_NULL(stream);
#ifdef ENABLE_DEBUGGER
if (DumpJsonParser::GetInstance().async_dump_enabled()) {
auto register_dumper = debug::OverflowDumper::GetInstance(kAscendDevice);
register_dumper->Init();
register_dumper->OpDebugRegisterForStream(kernel);
}
#endif
bool is_dynamic_shape = common::AnfAlgo::IsDynamicShape(kernel);
if (!is_dynamic_shape || !(common::AnfAlgo::GetBooleanAttr(kernel, kAttrMSFunction))) {
auto iter = node_atomics_persistent_cache_.find(kernel);
@ -399,6 +406,12 @@ bool AscendKernelExecutor::LaunchKernel(const CNodePtr &kernel, const vector<Add
return false;
}
}
#ifdef ENABLE_DEBUGGER
if (DumpJsonParser::GetInstance().async_dump_enabled()) {
auto kernel_dumper = debug::OverflowDumper::GetInstance(kAscendDevice);
kernel_dumper->OpLoadDumpInfo(kernel);
}
#endif
#ifndef ENABLE_SECURITY
auto ascend_instance = profiler::ascend::AscendProfiler::GetInstance();
MS_EXCEPTION_IF_NULL(ascend_instance);

View File

@ -24,7 +24,6 @@
#ifndef ENABLE_SECURITY
#include "debug/data_dump/cpu_e2e_dump.h"
#include "debug/data_dump/e2e_dump.h"
#include "debug/data_dump/overflow_dumper.h"
#include "utils/ms_context.h"
#endif
#ifdef ENABLE_DEBUGGER
@ -80,18 +79,9 @@ void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_in
#endif
} else if (device_context->GetDeviceType() == device::DeviceType::kAscend) {
#ifdef ENABLE_DEBUGGER
#ifndef ENABLE_SECURITY
auto kernel_graph = std::dynamic_pointer_cast<session::KernelGraph>(cnode->func_graph());
graph_id_sets_.insert(kernel_graph->graph_id());
if (DumpJsonParser::GetInstance().async_dump_enabled()) {
auto kernel_dumper = debug::OverflowDumper::GetInstance(kAscendDevice);
kernel_dumper->Init();
kernel_dumper->OpDebugRegisterForStream(cnode);
kernel_dumper->OpLoadDumpInfo(cnode);
}
#endif
auto debugger = Debugger::GetInstance();
if (debugger != nullptr) {
auto kernel_graph = std::dynamic_pointer_cast<session::KernelGraph>(cnode->func_graph());
debugger->InsertExecutedGraph(kernel_graph);
debugger->SetAscendKernelByKernelFlag(true);
bool read_data = CheckReadData(cnode);
@ -186,7 +176,7 @@ void DebugActor::DebugOnStepBegin(const std::vector<KernelGraphPtr> &graphs,
return kernel->fullname_with_scope().find("InitDataSetQueue") != std::string::npos;
});
}
if (!is_data_map_ && !graphs[0]->is_graph_run_mode()) {
if (!is_data_map_) {
auto kCurLoopCountName = "current_loop_count";
for (size_t i = 0; i < graphs.size(); i++) {
const auto &graph_ = graphs[i];
@ -200,7 +190,7 @@ void DebugActor::DebugOnStepBegin(const std::vector<KernelGraphPtr> &graphs,
}
auto tensor = device_loop_control_tensors.at(kCurLoopCountName);
MS_EXCEPTION_IF_NULL(tensor);
auto *cur_val = static_cast<int32_t *>(tensor->data_c());
auto *cur_val = static_cast<int64_t *>(tensor->data_c());
MS_EXCEPTION_IF_NULL(cur_val);
*cur_val = current_step;
tensor->set_sync_status(kNeedSyncHostToDevice);
@ -236,21 +226,6 @@ void DebugActor::DebugOnStepEnd(OpContext<DeviceTensor> *const op_context, const
}
#endif
#ifdef ENABLE_DEBUGGER
#ifndef ENABLE_SECURITY
if (DumpJsonParser::GetInstance().async_dump_enabled() && DumpJsonParser::GetInstance().op_debug_mode() > 0 &&
Debugger::GetInstance()->GetAscendKernelByKernelFlag()) {
uint32_t rank_id = Debugger::GetRankID();
std::set<uint32_t>::iterator graph_id_iter;
for (graph_id_iter = graph_id_sets_.begin(); graph_id_iter != graph_id_sets_.end(); ++graph_id_iter) {
auto graph_id = *graph_id_iter;
DeleteNoOverflowFile(rank_id, graph_id);
}
graph_id_sets_.clear();
}
#endif
#endif
#ifdef ENABLE_DEBUGGER
auto debugger = Debugger::GetInstance();
if (debugger != nullptr) {

View File

@ -55,7 +55,7 @@ class DebugActor : public ActorBase {
// The debug on step end.
void DebugOnStepEnd(OpContext<DeviceTensor> *const op_context, const AID *from_aid);
static inline uint32_t current_step{0};
static inline uint64_t current_step{0};
private:
// class members
@ -63,7 +63,6 @@ class DebugActor : public ActorBase {
// Support multi-thread.
std::mutex debug_mutex_;
std::set<uint32_t> graph_id_sets_;
};
} // namespace runtime

View File

@ -101,8 +101,8 @@ def run_multi_root_graph_dump(device, dump_mode, test_name):
time.sleep(2)
execution_order_path = os.path.join(dump_path, 'rank_0', 'execution_order')
# Multi root graph script: check dump data dir and graph history files and see if iteration number is matched.
if device == "GPU" or os.environ.get('GRAPH_OP_RUN') == "1":
# In GPU or KernelByKernel, we have 4 kernel graphs folders under rank_0 dir.
if device == "GPU":
# In GPU, we have 4 kernel graphs folders under rank_0 dir.
# In graph history dir, there are 2 files for each graph (ms_execution_order and ms_global_execution_order).
assert len(os.listdir(dump_file_path)) == 4
assert len(os.listdir(execution_order_path)) == 8
@ -111,7 +111,7 @@ def run_multi_root_graph_dump(device, dump_mode, test_name):
check_graph_structure(dump_file_path, execution_order_path, '2', ['1', '3'])
check_graph_structure(dump_file_path, execution_order_path, '3', ['5'])
else:
# In Ascend Super Kernel, we have 2 root graphs folders under rank_0 dir.
# In Ascend, we have 2 root graphs folders under rank_0 dir.
# In graph history dir, there are 4 ms_execution_order files and 2 ms_global_execution_order files.
# Each graph should have 3 iterations. Each graph was executed once per epoch.
# Graph 0 was executed in even iterations, graph 1 was executed in odd iterations.