Data Dump Bug Fix

1. Remove the old e2e dump JSON files
2. Remove warning log
3. Add the device id to the e2e dump path
4. Fix dump JSON parsing failing after hccl_init
This commit is contained in:
caifubi 2020-09-14 18:47:30 +08:00
parent b9c996484e
commit 1480c93d04
9 changed files with 20 additions and 88 deletions

View File

@ -4,7 +4,7 @@
"path": "/test",
"net_name": "ResNet50",
"iteration": 0,
"input_output": 0,
"input_output": 2,
"kernels": ["Default/Conv-op12"],
"support_device": [0,1,2,3,4,5,6,7]
},

View File

@ -1,22 +0,0 @@
{
"DumpSettings": {
"enable": false,
"trans_flag": false,
"path": "/tmp/net/",
"net_name": "ResNet50",
"mode": 0,
"iteration": 0,
"kernels": ["Default/Conv2D-op2", "Default/TensorAdd-op10"]
},
"DumpSettingsSpec": {
"enable": "true: dump enable, false: dump disable",
"trans_flag": "true: trans to host format, false: not trans format",
"path": "the dump file folder",
"net_name": "net name eg:ResNet50",
"mode": "0: dump all kernels, 1: dump kernels in kernels list",
"iteration": "0: all iteration, others: specified iteration ",
"kernels": "op's full scope name which need to be dump"
},
"other": {}
}

View File

@ -1,22 +0,0 @@
{
"DumpSettings": {
"enable": false,
"trans_flag": false,
"path": "/tmp/hccllog/0",
"net_name": "ResNet50",
"mode": 0,
"iteration": 0,
"kernels": ["Default/Conv2D-op2", "Default/TensorAdd-op10"]
},
"DumpSettingsSpec": {
"enable": "true: dump enable, false: dump disable",
"trans_flag": "true: trans to host format, false: not trans format",
"path": "the dump file folder",
"net_name": "net name eg:ResNet50",
"mode": "0: dump all kernels, 1: dump kernels in kernels list",
"iteration": "0: all iteration, others: specified iteration ",
"kernels": "op's full scope name which need to be dump"
},
"other": {}
}

View File

@ -1,22 +0,0 @@
{
"DumpSettings": {
"enable": false,
"trans_flag": false,
"path": "/tmp/hccllog/1",
"net_name": "ResNet50",
"mode": 0,
"iteration": 0,
"kernels": ["Default/Conv2D-op2", "Default/TensorAdd-op10"]
},
"DumpSettingsSpec": {
"enable": "true: dump enable, false: dump disable",
"trans_flag": "true: trans to host format, false: not trans format",
"path": "the dump file folder",
"net_name": "net name eg:ResNet50",
"mode": "0: dump all kernels, 1: dump kernels in kernels list",
"iteration": "0: all iteration, others: specified iteration ",
"kernels": "op's full scope name which need to be dump"
},
"other": {}
}

View File

@ -442,6 +442,7 @@ void AscendSession::InitRuntimeResource() {
if (!runtime_instance->Init()) {
MS_LOG(EXCEPTION) << "Kernel runtime init error.";
}
DumpJsonParser::GetInstance().Parse();
MS_LOG(INFO) << "Finish!";
}

View File

@ -74,6 +74,10 @@ bool DumpJsonParser::IsDumpEnabled() {
void DumpJsonParser::Parse() {
std::lock_guard<std::mutex> guard(lock_);
if (already_parsed_) {
return;
}
already_parsed_ = true;
if (!IsDumpEnabled()) {
return;
}
@ -305,6 +309,8 @@ void DumpJsonParser::JudgeDumpEnabled() {
MS_LOG(WARNING) << "Dump not enabled. device_id:" << device_id << " not support";
}
context->set_param<bool>(MS_CTX_ENABLE_MEM_REUSE, !e2e_dump_enabled_);
MS_LOG(INFO) << "Dump status, e2e_dump_enabled:" << e2e_dump_enabled_
<< " async_dump_enabled:" << async_dump_enabled_;
}
bool DumpJsonParser::NeedDump(const std::string &op_full_name) const {
@ -325,6 +331,9 @@ void DumpJsonParser::MatchKernel(const std::string &kernel_name) {
}
void DumpJsonParser::PrintUnusedKernel() {
if (!e2e_dump_enabled_ && !async_dump_enabled_) {
return;
}
for (const auto &iter : kernels_) {
if (iter.second == 0) {
MS_LOG(WARNING) << "[DataDump] Unused Kernel in json:" << iter.first;
@ -362,16 +371,6 @@ bool DumpJsonParser::OutputNeedDump() const {
return input_output_ == kDumpInputAndOutput || input_output_ == kDumpOutputOnly;
}
// Decides whether `kernel` should be added to the async-dump kernel list.
//
// Returns false for any kernel whose type is not TBE_KERNEL, AICPU_KERNEL or
// AKG_KERNEL; otherwise defers to DumpJsonParser::NeedDump with the kernel's
// full scope name.
//
// Raises (via MS_EXCEPTION_IF_NULL) when `kernel` is null.
bool NeedAsyncDump(const CNodePtr &kernel) {
  // Validate the pointer before it is handed to any helper. The previous
  // ordering passed `kernel` to GetKernelType first, so the null check could
  // never protect that call.
  MS_EXCEPTION_IF_NULL(kernel);
  // Query the kernel type once instead of once per comparison.
  const auto kernel_type = AnfAlgo::GetKernelType(kernel);
  if (kernel_type != TBE_KERNEL && kernel_type != AICPU_KERNEL && kernel_type != AKG_KERNEL) {
    return false;
  }
  // dump all kernel if mode is set 0 in data_dump.json
  return DumpJsonParser::GetInstance().NeedDump(kernel->fullname_with_scope());
}
void DumpJsonParser::UpdateNeedDumpKernels(NotNull<const session::KernelGraph *> kernel_graph) {
if (e2e_dump_enabled_) {
MS_LOG(INFO) << "E2e dump no need to update dump kernel list";
@ -391,9 +390,6 @@ void DumpJsonParser::UpdateNeedDumpKernels(NotNull<const session::KernelGraph *>
update_kernels.try_emplace(input->fullname_with_scope(), 0);
}
}
} else if (NeedAsyncDump(kernel)) {
MS_LOG(INFO) << "[AsyncDump] Match Node:" << kernel->fullname_with_scope();
update_kernels.try_emplace(kernel->fullname_with_scope(), 0);
}
}
kernels_.insert(update_kernels.begin(), update_kernels.end());

View File

@ -72,6 +72,7 @@ class DumpJsonParser {
uint32_t op_debug_mode_{0};
bool trans_flag_{false};
uint32_t cur_dump_iter_{0};
bool already_parsed_{false};
void ParseCommonDumpSetting(const nlohmann::json &content);
void ParseAsyncDumpSetting(const nlohmann::json &content);

View File

@ -206,14 +206,17 @@ bool E2eDumpUtil::DumpData(const session::KernelGraph *graph, Debugger *debugger
}
}
MS_LOG(INFO) << "Start e2e dump. Current iteration is " << dump_json_parser.cur_dump_iter();
auto context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context);
auto device_id = context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
std::string net_name = dump_json_parser.net_name();
std::string iterator = std::to_string(dump_json_parser.cur_dump_iter());
std::string dump_path = dump_json_parser.path();
if (dump_path.back() == '/') {
dump_path = dump_path + net_name + '/' + iterator;
} else {
dump_path = dump_path + '/' + net_name + '/' + iterator;
if (dump_path.back() != '/') {
dump_path += "/";
}
dump_path += (net_name + "/device_" + std::to_string(device_id) + "/iteration_" + iterator);
DumpInput(graph, dump_path, debugger);
DumpOutput(graph, dump_path, debugger);
DumpParameters(graph, dump_path, debugger);

View File

@ -206,11 +206,8 @@ bool AscendKernelRuntime::Init() {
SetContext();
return true;
}
bool ret = false;
DumpJsonParser::GetInstance().Parse();
// Start up profiling before rtSetDevice
ret = ProfilingManager::GetInstance().StartupProfiling(device_id_);
bool ret = ProfilingManager::GetInstance().StartupProfiling(device_id_);
if (!ret) {
MS_EXCEPTION(DeviceProcessError) << "StartupProfiling failed.";
}