forked from mindspore-Ecosystem/mindspore
Data Dump Bug Fix
1. Remove the old e2e dump json files. 2. Remove warning log. 3. Add the device id to the e2e dump path. 4. Fix a bug where parsing the dump json failed after hccl_init.
This commit is contained in:
parent
b9c996484e
commit
1480c93d04
|
@ -4,7 +4,7 @@
|
||||||
"path": "/test",
|
"path": "/test",
|
||||||
"net_name": "ResNet50",
|
"net_name": "ResNet50",
|
||||||
"iteration": 0,
|
"iteration": 0,
|
||||||
"input_output": 0,
|
"input_output": 2,
|
||||||
"kernels": ["Default/Conv-op12"],
|
"kernels": ["Default/Conv-op12"],
|
||||||
"support_device": [0,1,2,3,4,5,6,7]
|
"support_device": [0,1,2,3,4,5,6,7]
|
||||||
},
|
},
|
||||||
|
|
|
@ -1,22 +0,0 @@
|
||||||
{
|
|
||||||
"DumpSettings": {
|
|
||||||
"enable": false,
|
|
||||||
"trans_flag": false,
|
|
||||||
"path": "/tmp/net/",
|
|
||||||
"net_name": "ResNet50",
|
|
||||||
"mode": 0,
|
|
||||||
"iteration": 0,
|
|
||||||
"kernels": ["Default/Conv2D-op2", "Default/TensorAdd-op10"]
|
|
||||||
},
|
|
||||||
|
|
||||||
"DumpSettingsSpec": {
|
|
||||||
"enable": "true: dump enable, false: dump disable",
|
|
||||||
"trans_flag": "true: trans to host format, false: not trans format",
|
|
||||||
"path": "the dump file folder",
|
|
||||||
"net_name": "net name eg:ResNet50",
|
|
||||||
"mode": "0: dump all kernels, 1: dump kernels in kernels list",
|
|
||||||
"iteration": "0: all iteration, others: specified iteration ",
|
|
||||||
"kernels": "op's full scope name which need to be dump"
|
|
||||||
},
|
|
||||||
"other": {}
|
|
||||||
}
|
|
|
@ -1,22 +0,0 @@
|
||||||
{
|
|
||||||
"DumpSettings": {
|
|
||||||
"enable": false,
|
|
||||||
"trans_flag": false,
|
|
||||||
"path": "/tmp/hccllog/0",
|
|
||||||
"net_name": "ResNet50",
|
|
||||||
"mode": 0,
|
|
||||||
"iteration": 0,
|
|
||||||
"kernels": ["Default/Conv2D-op2", "Default/TensorAdd-op10"]
|
|
||||||
},
|
|
||||||
|
|
||||||
"DumpSettingsSpec": {
|
|
||||||
"enable": "true: dump enable, false: dump disable",
|
|
||||||
"trans_flag": "true: trans to host format, false: not trans format",
|
|
||||||
"path": "the dump file folder",
|
|
||||||
"net_name": "net name eg:ResNet50",
|
|
||||||
"mode": "0: dump all kernels, 1: dump kernels in kernels list",
|
|
||||||
"iteration": "0: all iteration, others: specified iteration ",
|
|
||||||
"kernels": "op's full scope name which need to be dump"
|
|
||||||
},
|
|
||||||
"other": {}
|
|
||||||
}
|
|
|
@ -1,22 +0,0 @@
|
||||||
{
|
|
||||||
"DumpSettings": {
|
|
||||||
"enable": false,
|
|
||||||
"trans_flag": false,
|
|
||||||
"path": "/tmp/hccllog/1",
|
|
||||||
"net_name": "ResNet50",
|
|
||||||
"mode": 0,
|
|
||||||
"iteration": 0,
|
|
||||||
"kernels": ["Default/Conv2D-op2", "Default/TensorAdd-op10"]
|
|
||||||
},
|
|
||||||
|
|
||||||
"DumpSettingsSpec": {
|
|
||||||
"enable": "true: dump enable, false: dump disable",
|
|
||||||
"trans_flag": "true: trans to host format, false: not trans format",
|
|
||||||
"path": "the dump file folder",
|
|
||||||
"net_name": "net name eg:ResNet50",
|
|
||||||
"mode": "0: dump all kernels, 1: dump kernels in kernels list",
|
|
||||||
"iteration": "0: all iteration, others: specified iteration ",
|
|
||||||
"kernels": "op's full scope name which need to be dump"
|
|
||||||
},
|
|
||||||
"other": {}
|
|
||||||
}
|
|
|
@ -442,6 +442,7 @@ void AscendSession::InitRuntimeResource() {
|
||||||
if (!runtime_instance->Init()) {
|
if (!runtime_instance->Init()) {
|
||||||
MS_LOG(EXCEPTION) << "Kernel runtime init error.";
|
MS_LOG(EXCEPTION) << "Kernel runtime init error.";
|
||||||
}
|
}
|
||||||
|
DumpJsonParser::GetInstance().Parse();
|
||||||
MS_LOG(INFO) << "Finish!";
|
MS_LOG(INFO) << "Finish!";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -74,6 +74,10 @@ bool DumpJsonParser::IsDumpEnabled() {
|
||||||
|
|
||||||
void DumpJsonParser::Parse() {
|
void DumpJsonParser::Parse() {
|
||||||
std::lock_guard<std::mutex> guard(lock_);
|
std::lock_guard<std::mutex> guard(lock_);
|
||||||
|
if (already_parsed_) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
already_parsed_ = true;
|
||||||
if (!IsDumpEnabled()) {
|
if (!IsDumpEnabled()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -305,6 +309,8 @@ void DumpJsonParser::JudgeDumpEnabled() {
|
||||||
MS_LOG(WARNING) << "Dump not enabled. device_id:" << device_id << " not support";
|
MS_LOG(WARNING) << "Dump not enabled. device_id:" << device_id << " not support";
|
||||||
}
|
}
|
||||||
context->set_param<bool>(MS_CTX_ENABLE_MEM_REUSE, !e2e_dump_enabled_);
|
context->set_param<bool>(MS_CTX_ENABLE_MEM_REUSE, !e2e_dump_enabled_);
|
||||||
|
MS_LOG(INFO) << "Dump status, e2e_dump_enabled:" << e2e_dump_enabled_
|
||||||
|
<< " async_dump_enabled:" << async_dump_enabled_;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool DumpJsonParser::NeedDump(const std::string &op_full_name) const {
|
bool DumpJsonParser::NeedDump(const std::string &op_full_name) const {
|
||||||
|
@ -325,6 +331,9 @@ void DumpJsonParser::MatchKernel(const std::string &kernel_name) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void DumpJsonParser::PrintUnusedKernel() {
|
void DumpJsonParser::PrintUnusedKernel() {
|
||||||
|
if (!e2e_dump_enabled_ && !async_dump_enabled_) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
for (const auto &iter : kernels_) {
|
for (const auto &iter : kernels_) {
|
||||||
if (iter.second == 0) {
|
if (iter.second == 0) {
|
||||||
MS_LOG(WARNING) << "[DataDump] Unused Kernel in json:" << iter.first;
|
MS_LOG(WARNING) << "[DataDump] Unused Kernel in json:" << iter.first;
|
||||||
|
@ -362,16 +371,6 @@ bool DumpJsonParser::OutputNeedDump() const {
|
||||||
return input_output_ == kDumpInputAndOutput || input_output_ == kDumpOutputOnly;
|
return input_output_ == kDumpInputAndOutput || input_output_ == kDumpOutputOnly;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool NeedAsyncDump(const CNodePtr &kernel) {
|
|
||||||
if (AnfAlgo::GetKernelType(kernel) != TBE_KERNEL && AnfAlgo::GetKernelType(kernel) != AICPU_KERNEL &&
|
|
||||||
AnfAlgo::GetKernelType(kernel) != AKG_KERNEL) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
MS_EXCEPTION_IF_NULL(kernel);
|
|
||||||
// dump all kernel if mode is set 0 in data_dump.json
|
|
||||||
return DumpJsonParser::GetInstance().NeedDump(kernel->fullname_with_scope());
|
|
||||||
}
|
|
||||||
|
|
||||||
void DumpJsonParser::UpdateNeedDumpKernels(NotNull<const session::KernelGraph *> kernel_graph) {
|
void DumpJsonParser::UpdateNeedDumpKernels(NotNull<const session::KernelGraph *> kernel_graph) {
|
||||||
if (e2e_dump_enabled_) {
|
if (e2e_dump_enabled_) {
|
||||||
MS_LOG(INFO) << "E2e dump no need to update dump kernel list";
|
MS_LOG(INFO) << "E2e dump no need to update dump kernel list";
|
||||||
|
@ -391,9 +390,6 @@ void DumpJsonParser::UpdateNeedDumpKernels(NotNull<const session::KernelGraph *>
|
||||||
update_kernels.try_emplace(input->fullname_with_scope(), 0);
|
update_kernels.try_emplace(input->fullname_with_scope(), 0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if (NeedAsyncDump(kernel)) {
|
|
||||||
MS_LOG(INFO) << "[AsyncDump] Match Node:" << kernel->fullname_with_scope();
|
|
||||||
update_kernels.try_emplace(kernel->fullname_with_scope(), 0);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
kernels_.insert(update_kernels.begin(), update_kernels.end());
|
kernels_.insert(update_kernels.begin(), update_kernels.end());
|
||||||
|
|
|
@ -72,6 +72,7 @@ class DumpJsonParser {
|
||||||
uint32_t op_debug_mode_{0};
|
uint32_t op_debug_mode_{0};
|
||||||
bool trans_flag_{false};
|
bool trans_flag_{false};
|
||||||
uint32_t cur_dump_iter_{0};
|
uint32_t cur_dump_iter_{0};
|
||||||
|
bool already_parsed_{false};
|
||||||
|
|
||||||
void ParseCommonDumpSetting(const nlohmann::json &content);
|
void ParseCommonDumpSetting(const nlohmann::json &content);
|
||||||
void ParseAsyncDumpSetting(const nlohmann::json &content);
|
void ParseAsyncDumpSetting(const nlohmann::json &content);
|
||||||
|
|
|
@ -206,14 +206,17 @@ bool E2eDumpUtil::DumpData(const session::KernelGraph *graph, Debugger *debugger
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
MS_LOG(INFO) << "Start e2e dump. Current iteration is " << dump_json_parser.cur_dump_iter();
|
MS_LOG(INFO) << "Start e2e dump. Current iteration is " << dump_json_parser.cur_dump_iter();
|
||||||
|
auto context = MsContext::GetInstance();
|
||||||
|
MS_EXCEPTION_IF_NULL(context);
|
||||||
|
auto device_id = context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
|
||||||
|
|
||||||
std::string net_name = dump_json_parser.net_name();
|
std::string net_name = dump_json_parser.net_name();
|
||||||
std::string iterator = std::to_string(dump_json_parser.cur_dump_iter());
|
std::string iterator = std::to_string(dump_json_parser.cur_dump_iter());
|
||||||
std::string dump_path = dump_json_parser.path();
|
std::string dump_path = dump_json_parser.path();
|
||||||
if (dump_path.back() == '/') {
|
if (dump_path.back() != '/') {
|
||||||
dump_path = dump_path + net_name + '/' + iterator;
|
dump_path += "/";
|
||||||
} else {
|
|
||||||
dump_path = dump_path + '/' + net_name + '/' + iterator;
|
|
||||||
}
|
}
|
||||||
|
dump_path += (net_name + "/device_" + std::to_string(device_id) + "/iteration_" + iterator);
|
||||||
DumpInput(graph, dump_path, debugger);
|
DumpInput(graph, dump_path, debugger);
|
||||||
DumpOutput(graph, dump_path, debugger);
|
DumpOutput(graph, dump_path, debugger);
|
||||||
DumpParameters(graph, dump_path, debugger);
|
DumpParameters(graph, dump_path, debugger);
|
||||||
|
|
|
@ -206,11 +206,8 @@ bool AscendKernelRuntime::Init() {
|
||||||
SetContext();
|
SetContext();
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
bool ret = false;
|
|
||||||
|
|
||||||
DumpJsonParser::GetInstance().Parse();
|
|
||||||
// Start up profiling before rtSetDevice
|
// Start up profiling before rtSetDevice
|
||||||
ret = ProfilingManager::GetInstance().StartupProfiling(device_id_);
|
bool ret = ProfilingManager::GetInstance().StartupProfiling(device_id_);
|
||||||
if (!ret) {
|
if (!ret) {
|
||||||
MS_EXCEPTION(DeviceProcessError) << "StartupProfiling failed.";
|
MS_EXCEPTION(DeviceProcessError) << "StartupProfiling failed.";
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue