Enable dump when an exception is encountered during training
This commit is contained in:
parent
181addec81
commit
1d97eefbb7
|
@ -97,6 +97,20 @@ const std::map<ProtoDataType, mindspore::TypeId> kDataTypetoMSTypeMap = {
|
|||
{ProtoDataType::DT_STRING, mindspore::TypeId::kObjectTypeString}};
|
||||
#endif
|
||||
|
||||
std::string GenDataFilePath(const CNodePtr &node, const std::string &kernel_name, const std::string &dump_path,
|
||||
size_t slot, bool is_input) {
|
||||
std::string op_type = AnfAlgo::GetCNodeName(node);
|
||||
std::string op_name = GetOpNameWithoutScope(kernel_name);
|
||||
uint64_t timestamp = GetTimeStamp();
|
||||
uint32_t task_id = 0;
|
||||
uint32_t stream_id = 0;
|
||||
std::string tensor_type = is_input ? ".input." : ".output.";
|
||||
std::string file_path = dump_path + '/' + op_type + '.' + op_name + '.' + std::to_string(task_id) + '.' +
|
||||
std::to_string(stream_id) + '.' + std::to_string(timestamp) + tensor_type +
|
||||
std::to_string(slot);
|
||||
return file_path;
|
||||
}
|
||||
|
||||
bool E2eDump::IsDeviceTargetGPU() {
|
||||
auto context = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(context);
|
||||
|
@ -193,6 +207,31 @@ void E2eDump::DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::s
|
|||
}
|
||||
}
|
||||
|
||||
void E2eDump::DumpOutputData(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
|
||||
std::string *kernel_name) {
|
||||
auto debugger = Debugger::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(debugger);
|
||||
if (IsDeviceTargetGPU() || debugger->GetAscendKernelByKernelFlag()) {
|
||||
MS_LOG(INFO) << "DumpInputData is only for graph mode on Ascend";
|
||||
return;
|
||||
}
|
||||
MS_EXCEPTION_IF_NULL(node);
|
||||
GetFileKernelName(NOT_NULL(kernel_name));
|
||||
auto output_size = AnfAlgo::GetOutputTensorNum(node);
|
||||
for (size_t j = 0; j < output_size; ++j) {
|
||||
if (!AnfAlgo::OutputAddrExist(node, j)) {
|
||||
continue;
|
||||
}
|
||||
auto addr = AnfAlgo::GetOutputAddr(node, j);
|
||||
MS_EXCEPTION_IF_NULL(addr);
|
||||
ShapeVector int_shapes;
|
||||
GetDumpIntShape(node, j, NOT_NULL(&int_shapes), trans_flag);
|
||||
auto type = AnfAlgo::GetOutputInferDataType(node, j);
|
||||
std::string file_path = GenDataFilePath(node, *kernel_name, dump_path, j, false);
|
||||
DumpMemToFile(file_path, *addr, int_shapes, type, trans_flag);
|
||||
}
|
||||
}
|
||||
|
||||
void E2eDump::DumpInput(const session::KernelGraph *graph, const std::string &dump_path, const Debugger *debugger) {
|
||||
MS_EXCEPTION_IF_NULL(graph);
|
||||
auto &dump_json_parser = DumpJsonParser::GetInstance();
|
||||
|
@ -255,9 +294,6 @@ void E2eDump::DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::st
|
|||
if (!AnfAlgo::OutputAddrExist(input, index)) {
|
||||
continue;
|
||||
}
|
||||
auto addr = AnfAlgo::GetOutputAddr(input, index);
|
||||
MS_EXCEPTION_IF_NULL(addr);
|
||||
|
||||
std::string tensor_name = GetKernelNodeName(node);
|
||||
size_t slot = j;
|
||||
if (IsDeviceTargetGPU() || Debugger::GetInstance()->GetAscendKernelByKernelFlag()) {
|
||||
|
@ -277,6 +313,7 @@ void E2eDump::DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::st
|
|||
uint32_t stream_id = 0;
|
||||
std::string file_path = dump_path + '/' + op_type + '.' + op_name + '.' + std::to_string(task_id) + '.' +
|
||||
std::to_string(stream_id) + '.' + std::to_string(timestamp) + ".input." + std::to_string(j);
|
||||
auto addr = AnfAlgo::GetOutputAddr(input, index);
|
||||
MS_EXCEPTION_IF_NULL(addr);
|
||||
if (DumpJsonParser::GetInstance().IsStatisticDump() &&
|
||||
(IsDeviceTargetGPU() || Debugger::GetInstance()->GetAscendKernelByKernelFlag())) {
|
||||
|
@ -297,6 +334,34 @@ void E2eDump::DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::st
|
|||
}
|
||||
}
|
||||
|
||||
void E2eDump::DumpInputData(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
|
||||
std::string *kernel_name) {
|
||||
auto debugger = Debugger::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(debugger);
|
||||
if (IsDeviceTargetGPU() || debugger->GetAscendKernelByKernelFlag()) {
|
||||
MS_LOG(INFO) << "DumpInputData is only for graph mode on Ascend";
|
||||
return;
|
||||
}
|
||||
MS_EXCEPTION_IF_NULL(node);
|
||||
GetFileKernelName(NOT_NULL(kernel_name));
|
||||
auto input_size = AnfAlgo::GetInputTensorNum(node);
|
||||
for (size_t j = 0; j < input_size; ++j) {
|
||||
auto kernel_with_index = AnfAlgo::GetPrevNodeOutput(node, j);
|
||||
auto input = kernel_with_index.first;
|
||||
auto index = kernel_with_index.second;
|
||||
if (!AnfAlgo::OutputAddrExist(input, index)) {
|
||||
continue;
|
||||
}
|
||||
auto addr = AnfAlgo::GetOutputAddr(input, index);
|
||||
MS_EXCEPTION_IF_NULL(addr);
|
||||
ShapeVector int_shapes;
|
||||
GetDumpIntShape(input, index, NOT_NULL(&int_shapes), trans_flag);
|
||||
auto type = AnfAlgo::GetOutputInferDataType(input, index);
|
||||
std::string file_path = GenDataFilePath(node, *kernel_name, dump_path, j, true);
|
||||
DumpMemToFile(file_path, *addr, int_shapes, type, trans_flag);
|
||||
}
|
||||
}
|
||||
|
||||
void E2eDump::DumpSingleAnfNode(const AnfNodePtr &anf_node, const size_t output_index, const std::string &dump_path,
|
||||
bool trans_flag, const Debugger *debugger) {
|
||||
MS_EXCEPTION_IF_NULL(anf_node);
|
||||
|
|
|
@ -63,6 +63,11 @@ class E2eDump {
|
|||
|
||||
static void DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
|
||||
std::string *kernel_name, const Debugger *debugger);
|
||||
// Dump input/output data without additional checks; used for the exception case only.
|
||||
static void DumpInputData(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
|
||||
std::string *kernel_name);
|
||||
static void DumpOutputData(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
|
||||
std::string *kernel_name);
|
||||
|
||||
static bool DumpDirExists(const std::string &dump_path);
|
||||
|
||||
|
|
|
@ -717,8 +717,8 @@ void AscendKernelRuntime::DumpTaskExceptionInfo(const session::KernelGraph & /*
|
|||
auto full_scope_name = node->fullname_with_scope();
|
||||
MS_LOG(ERROR) << "Dump node (" << full_scope_name << ") task error input/output data to: " << path
|
||||
<< trace::DumpSourceLines(node);
|
||||
E2eDump::DumpInputImpl(node, false, path, &full_scope_name, nullptr);
|
||||
E2eDump::DumpOutputImpl(node, false, path, &full_scope_name, nullptr);
|
||||
E2eDump::DumpInputData(node, false, path, &full_scope_name);
|
||||
E2eDump::DumpOutputData(node, false, path, &full_scope_name);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
|
Loading…
Reference in New Issue