From 1d97eefbb7958796091c271c1a979f01b66bf147 Mon Sep 17 00:00:00 2001
From: yelihua
Date: Wed, 2 Mar 2022 16:34:41 +0800
Subject: [PATCH] enable dump when met exception during train

---
Review notes (this section sits after the three-dash line, so it is not part
of the commit message):
- Adds GenDataFilePath() and E2eDump::DumpInputData()/DumpOutputData(), and
  makes AscendKernelRuntime::DumpTaskExceptionInfo() call them instead of
  E2eDump::DumpInputImpl()/DumpOutputImpl().
- The early-return log message in DumpOutputData() names DumpOutputData
  rather than repeating the DumpInputData text.
- NOTE(review): line breaks and indentation were reconstructed from a
  whitespace-collapsed copy of this patch (2-space style assumed for context
  lines) -- confirm with `git apply --check` against the target tree.

 mindspore/ccsrc/debug/data_dump/e2e_dump.cc   | 71 ++++++++++++++++++-
 mindspore/ccsrc/debug/data_dump/e2e_dump.h    |  5 ++
 .../device/ascend/ascend_kernel_runtime.cc    |  4 +-
 3 files changed, 75 insertions(+), 5 deletions(-)

diff --git a/mindspore/ccsrc/debug/data_dump/e2e_dump.cc b/mindspore/ccsrc/debug/data_dump/e2e_dump.cc
index 649d10dc89e..85dfe7894be 100644
--- a/mindspore/ccsrc/debug/data_dump/e2e_dump.cc
+++ b/mindspore/ccsrc/debug/data_dump/e2e_dump.cc
@@ -97,6 +97,20 @@ const std::map kDataTypetoMSTypeMap = {
   {ProtoDataType::DT_STRING, mindspore::TypeId::kObjectTypeString}};
 #endif
 
+std::string GenDataFilePath(const CNodePtr &node, const std::string &kernel_name, const std::string &dump_path,
+                            size_t slot, bool is_input) {
+  std::string op_type = AnfAlgo::GetCNodeName(node);
+  std::string op_name = GetOpNameWithoutScope(kernel_name);
+  uint64_t timestamp = GetTimeStamp();
+  uint32_t task_id = 0;
+  uint32_t stream_id = 0;
+  std::string tensor_type = is_input ? ".input." : ".output.";
+  std::string file_path = dump_path + '/' + op_type + '.' + op_name + '.' + std::to_string(task_id) + '.' +
+                          std::to_string(stream_id) + '.' + std::to_string(timestamp) + tensor_type +
+                          std::to_string(slot);
+  return file_path;
+}
+
 bool E2eDump::IsDeviceTargetGPU() {
   auto context = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(context);
@@ -193,6 +207,31 @@ void E2eDump::DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::s
   }
 }
 
+void E2eDump::DumpOutputData(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
+                             std::string *kernel_name) {
+  auto debugger = Debugger::GetInstance();
+  MS_EXCEPTION_IF_NULL(debugger);
+  if (IsDeviceTargetGPU() || debugger->GetAscendKernelByKernelFlag()) {
+    MS_LOG(INFO) << "DumpOutputData is only for graph mode on Ascend";
+    return;
+  }
+  MS_EXCEPTION_IF_NULL(node);
+  GetFileKernelName(NOT_NULL(kernel_name));
+  auto output_size = AnfAlgo::GetOutputTensorNum(node);
+  for (size_t j = 0; j < output_size; ++j) {
+    if (!AnfAlgo::OutputAddrExist(node, j)) {
+      continue;
+    }
+    auto addr = AnfAlgo::GetOutputAddr(node, j);
+    MS_EXCEPTION_IF_NULL(addr);
+    ShapeVector int_shapes;
+    GetDumpIntShape(node, j, NOT_NULL(&int_shapes), trans_flag);
+    auto type = AnfAlgo::GetOutputInferDataType(node, j);
+    std::string file_path = GenDataFilePath(node, *kernel_name, dump_path, j, false);
+    DumpMemToFile(file_path, *addr, int_shapes, type, trans_flag);
+  }
+}
+
 void E2eDump::DumpInput(const session::KernelGraph *graph, const std::string &dump_path, const Debugger *debugger) {
   MS_EXCEPTION_IF_NULL(graph);
   auto &dump_json_parser = DumpJsonParser::GetInstance();
@@ -255,9 +294,6 @@ void E2eDump::DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::st
     if (!AnfAlgo::OutputAddrExist(input, index)) {
       continue;
     }
-    auto addr = AnfAlgo::GetOutputAddr(input, index);
-    MS_EXCEPTION_IF_NULL(addr);
-
     std::string tensor_name = GetKernelNodeName(node);
     size_t slot = j;
     if (IsDeviceTargetGPU() || Debugger::GetInstance()->GetAscendKernelByKernelFlag()) {
@@ -277,6 +313,7 @@ void E2eDump::DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::st
     uint32_t stream_id = 0;
     std::string file_path = dump_path + '/' + op_type + '.' + op_name + '.' + std::to_string(task_id) + '.' +
                             std::to_string(stream_id) + '.' + std::to_string(timestamp) + ".input." + std::to_string(j);
+    auto addr = AnfAlgo::GetOutputAddr(input, index);
     MS_EXCEPTION_IF_NULL(addr);
     if (DumpJsonParser::GetInstance().IsStatisticDump() &&
         (IsDeviceTargetGPU() || Debugger::GetInstance()->GetAscendKernelByKernelFlag())) {
@@ -297,6 +334,34 @@ void E2eDump::DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::st
   }
 }
 
+void E2eDump::DumpInputData(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
+                            std::string *kernel_name) {
+  auto debugger = Debugger::GetInstance();
+  MS_EXCEPTION_IF_NULL(debugger);
+  if (IsDeviceTargetGPU() || debugger->GetAscendKernelByKernelFlag()) {
+    MS_LOG(INFO) << "DumpInputData is only for graph mode on Ascend";
+    return;
+  }
+  MS_EXCEPTION_IF_NULL(node);
+  GetFileKernelName(NOT_NULL(kernel_name));
+  auto input_size = AnfAlgo::GetInputTensorNum(node);
+  for (size_t j = 0; j < input_size; ++j) {
+    auto kernel_with_index = AnfAlgo::GetPrevNodeOutput(node, j);
+    auto input = kernel_with_index.first;
+    auto index = kernel_with_index.second;
+    if (!AnfAlgo::OutputAddrExist(input, index)) {
+      continue;
+    }
+    auto addr = AnfAlgo::GetOutputAddr(input, index);
+    MS_EXCEPTION_IF_NULL(addr);
+    ShapeVector int_shapes;
+    GetDumpIntShape(input, index, NOT_NULL(&int_shapes), trans_flag);
+    auto type = AnfAlgo::GetOutputInferDataType(input, index);
+    std::string file_path = GenDataFilePath(node, *kernel_name, dump_path, j, true);
+    DumpMemToFile(file_path, *addr, int_shapes, type, trans_flag);
+  }
+}
+
 void E2eDump::DumpSingleAnfNode(const AnfNodePtr &anf_node, const size_t output_index, const std::string &dump_path,
                                 bool trans_flag, const Debugger *debugger) {
   MS_EXCEPTION_IF_NULL(anf_node);
diff --git a/mindspore/ccsrc/debug/data_dump/e2e_dump.h b/mindspore/ccsrc/debug/data_dump/e2e_dump.h
index f59b5704ead..59643129352 100644
--- a/mindspore/ccsrc/debug/data_dump/e2e_dump.h
+++ b/mindspore/ccsrc/debug/data_dump/e2e_dump.h
@@ -63,6 +63,11 @@ class E2eDump {
 
   static void DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
                              std::string *kernel_name, const Debugger *debugger);
+  // Dump input/output data without additional check, used for exception case only
+  static void DumpInputData(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
+                            std::string *kernel_name);
+  static void DumpOutputData(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
+                             std::string *kernel_name);
 
   static bool DumpDirExists(const std::string &dump_path);
 
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
index 47030f3bfcb..4627c158765 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
@@ -717,8 +717,8 @@ void AscendKernelRuntime::DumpTaskExceptionInfo(const session::KernelGraph & /*
     auto full_scope_name = node->fullname_with_scope();
     MS_LOG(ERROR) << "Dump node (" << full_scope_name << ") task error input/output data to: " << path
                   << trace::DumpSourceLines(node);
-    E2eDump::DumpInputImpl(node, false, path, &full_scope_name, nullptr);
-    E2eDump::DumpOutputImpl(node, false, path, &full_scope_name, nullptr);
+    E2eDump::DumpInputData(node, false, path, &full_scope_name);
+    E2eDump::DumpOutputData(node, false, path, &full_scope_name);
   }
 }
 #endif