forked from mindspore-Ecosystem/mindspore
Data Dump Bug Fix
1. Remove the old e2e dump JSON files. 2. Remove a warning log. 3. Add the device id to the e2e dump path. 4. Fix the bug where parsing the dump JSON failed after hccl_init.
This commit is contained in:
parent
b9c996484e
commit
1480c93d04
|
@ -4,7 +4,7 @@
|
|||
"path": "/test",
|
||||
"net_name": "ResNet50",
|
||||
"iteration": 0,
|
||||
"input_output": 0,
|
||||
"input_output": 2,
|
||||
"kernels": ["Default/Conv-op12"],
|
||||
"support_device": [0,1,2,3,4,5,6,7]
|
||||
},
|
||||
|
|
|
@ -1,22 +0,0 @@
|
|||
{
|
||||
"DumpSettings": {
|
||||
"enable": false,
|
||||
"trans_flag": false,
|
||||
"path": "/tmp/net/",
|
||||
"net_name": "ResNet50",
|
||||
"mode": 0,
|
||||
"iteration": 0,
|
||||
"kernels": ["Default/Conv2D-op2", "Default/TensorAdd-op10"]
|
||||
},
|
||||
|
||||
"DumpSettingsSpec": {
|
||||
"enable": "true: dump enable, false: dump disable",
|
||||
"trans_flag": "true: trans to host format, false: not trans format",
|
||||
"path": "the dump file folder",
|
||||
"net_name": "net name eg:ResNet50",
|
||||
"mode": "0: dump all kernels, 1: dump kernels in kernels list",
|
||||
"iteration": "0: all iteration, others: specified iteration ",
|
||||
"kernels": "op's full scope name which need to be dump"
|
||||
},
|
||||
"other": {}
|
||||
}
|
|
@ -1,22 +0,0 @@
|
|||
{
|
||||
"DumpSettings": {
|
||||
"enable": false,
|
||||
"trans_flag": false,
|
||||
"path": "/tmp/hccllog/0",
|
||||
"net_name": "ResNet50",
|
||||
"mode": 0,
|
||||
"iteration": 0,
|
||||
"kernels": ["Default/Conv2D-op2", "Default/TensorAdd-op10"]
|
||||
},
|
||||
|
||||
"DumpSettingsSpec": {
|
||||
"enable": "true: dump enable, false: dump disable",
|
||||
"trans_flag": "true: trans to host format, false: not trans format",
|
||||
"path": "the dump file folder",
|
||||
"net_name": "net name eg:ResNet50",
|
||||
"mode": "0: dump all kernels, 1: dump kernels in kernels list",
|
||||
"iteration": "0: all iteration, others: specified iteration ",
|
||||
"kernels": "op's full scope name which need to be dump"
|
||||
},
|
||||
"other": {}
|
||||
}
|
|
@ -1,22 +0,0 @@
|
|||
{
|
||||
"DumpSettings": {
|
||||
"enable": false,
|
||||
"trans_flag": false,
|
||||
"path": "/tmp/hccllog/1",
|
||||
"net_name": "ResNet50",
|
||||
"mode": 0,
|
||||
"iteration": 0,
|
||||
"kernels": ["Default/Conv2D-op2", "Default/TensorAdd-op10"]
|
||||
},
|
||||
|
||||
"DumpSettingsSpec": {
|
||||
"enable": "true: dump enable, false: dump disable",
|
||||
"trans_flag": "true: trans to host format, false: not trans format",
|
||||
"path": "the dump file folder",
|
||||
"net_name": "net name eg:ResNet50",
|
||||
"mode": "0: dump all kernels, 1: dump kernels in kernels list",
|
||||
"iteration": "0: all iteration, others: specified iteration ",
|
||||
"kernels": "op's full scope name which need to be dump"
|
||||
},
|
||||
"other": {}
|
||||
}
|
|
@ -442,6 +442,7 @@ void AscendSession::InitRuntimeResource() {
|
|||
if (!runtime_instance->Init()) {
|
||||
MS_LOG(EXCEPTION) << "Kernel runtime init error.";
|
||||
}
|
||||
DumpJsonParser::GetInstance().Parse();
|
||||
MS_LOG(INFO) << "Finish!";
|
||||
}
|
||||
|
||||
|
|
|
@ -74,6 +74,10 @@ bool DumpJsonParser::IsDumpEnabled() {
|
|||
|
||||
void DumpJsonParser::Parse() {
|
||||
std::lock_guard<std::mutex> guard(lock_);
|
||||
if (already_parsed_) {
|
||||
return;
|
||||
}
|
||||
already_parsed_ = true;
|
||||
if (!IsDumpEnabled()) {
|
||||
return;
|
||||
}
|
||||
|
@ -305,6 +309,8 @@ void DumpJsonParser::JudgeDumpEnabled() {
|
|||
MS_LOG(WARNING) << "Dump not enabled. device_id:" << device_id << " not support";
|
||||
}
|
||||
context->set_param<bool>(MS_CTX_ENABLE_MEM_REUSE, !e2e_dump_enabled_);
|
||||
MS_LOG(INFO) << "Dump status, e2e_dump_enabled:" << e2e_dump_enabled_
|
||||
<< " async_dump_enabled:" << async_dump_enabled_;
|
||||
}
|
||||
|
||||
bool DumpJsonParser::NeedDump(const std::string &op_full_name) const {
|
||||
|
@ -325,6 +331,9 @@ void DumpJsonParser::MatchKernel(const std::string &kernel_name) {
|
|||
}
|
||||
|
||||
void DumpJsonParser::PrintUnusedKernel() {
|
||||
if (!e2e_dump_enabled_ && !async_dump_enabled_) {
|
||||
return;
|
||||
}
|
||||
for (const auto &iter : kernels_) {
|
||||
if (iter.second == 0) {
|
||||
MS_LOG(WARNING) << "[DataDump] Unused Kernel in json:" << iter.first;
|
||||
|
@ -362,16 +371,6 @@ bool DumpJsonParser::OutputNeedDump() const {
|
|||
return input_output_ == kDumpInputAndOutput || input_output_ == kDumpOutputOnly;
|
||||
}
|
||||
|
||||
bool NeedAsyncDump(const CNodePtr &kernel) {
|
||||
if (AnfAlgo::GetKernelType(kernel) != TBE_KERNEL && AnfAlgo::GetKernelType(kernel) != AICPU_KERNEL &&
|
||||
AnfAlgo::GetKernelType(kernel) != AKG_KERNEL) {
|
||||
return false;
|
||||
}
|
||||
MS_EXCEPTION_IF_NULL(kernel);
|
||||
// dump all kernel if mode is set 0 in data_dump.json
|
||||
return DumpJsonParser::GetInstance().NeedDump(kernel->fullname_with_scope());
|
||||
}
|
||||
|
||||
void DumpJsonParser::UpdateNeedDumpKernels(NotNull<const session::KernelGraph *> kernel_graph) {
|
||||
if (e2e_dump_enabled_) {
|
||||
MS_LOG(INFO) << "E2e dump no need to update dump kernel list";
|
||||
|
@ -391,9 +390,6 @@ void DumpJsonParser::UpdateNeedDumpKernels(NotNull<const session::KernelGraph *>
|
|||
update_kernels.try_emplace(input->fullname_with_scope(), 0);
|
||||
}
|
||||
}
|
||||
} else if (NeedAsyncDump(kernel)) {
|
||||
MS_LOG(INFO) << "[AsyncDump] Match Node:" << kernel->fullname_with_scope();
|
||||
update_kernels.try_emplace(kernel->fullname_with_scope(), 0);
|
||||
}
|
||||
}
|
||||
kernels_.insert(update_kernels.begin(), update_kernels.end());
|
||||
|
|
|
@ -72,6 +72,7 @@ class DumpJsonParser {
|
|||
uint32_t op_debug_mode_{0};
|
||||
bool trans_flag_{false};
|
||||
uint32_t cur_dump_iter_{0};
|
||||
bool already_parsed_{false};
|
||||
|
||||
void ParseCommonDumpSetting(const nlohmann::json &content);
|
||||
void ParseAsyncDumpSetting(const nlohmann::json &content);
|
||||
|
|
|
@ -206,14 +206,17 @@ bool E2eDumpUtil::DumpData(const session::KernelGraph *graph, Debugger *debugger
|
|||
}
|
||||
}
|
||||
MS_LOG(INFO) << "Start e2e dump. Current iteration is " << dump_json_parser.cur_dump_iter();
|
||||
auto context = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(context);
|
||||
auto device_id = context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
|
||||
|
||||
std::string net_name = dump_json_parser.net_name();
|
||||
std::string iterator = std::to_string(dump_json_parser.cur_dump_iter());
|
||||
std::string dump_path = dump_json_parser.path();
|
||||
if (dump_path.back() == '/') {
|
||||
dump_path = dump_path + net_name + '/' + iterator;
|
||||
} else {
|
||||
dump_path = dump_path + '/' + net_name + '/' + iterator;
|
||||
if (dump_path.back() != '/') {
|
||||
dump_path += "/";
|
||||
}
|
||||
dump_path += (net_name + "/device_" + std::to_string(device_id) + "/iteration_" + iterator);
|
||||
DumpInput(graph, dump_path, debugger);
|
||||
DumpOutput(graph, dump_path, debugger);
|
||||
DumpParameters(graph, dump_path, debugger);
|
||||
|
|
|
@ -206,11 +206,8 @@ bool AscendKernelRuntime::Init() {
|
|||
SetContext();
|
||||
return true;
|
||||
}
|
||||
bool ret = false;
|
||||
|
||||
DumpJsonParser::GetInstance().Parse();
|
||||
// Start up profiling before rtSetDevice
|
||||
ret = ProfilingManager::GetInstance().StartupProfiling(device_id_);
|
||||
bool ret = ProfilingManager::GetInstance().StartupProfiling(device_id_);
|
||||
if (!ret) {
|
||||
MS_EXCEPTION(DeviceProcessError) << "StartupProfiling failed.";
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue