From f11d5fa8ad1d4b83c76dc344c03052af8e74bd48 Mon Sep 17 00:00:00 2001 From: TinaMengtingZhang Date: Mon, 31 May 2021 14:39:09 -0400 Subject: [PATCH] Unify file name for parameter and CPU dump --- .../ccsrc/debug/data_dump/cpu_e2e_dump.cc | 23 +++++++++++++-- .../ccsrc/debug/data_dump/dump_json_parser.cc | 6 +++- mindspore/ccsrc/debug/data_dump/dump_utils.cc | 19 ++++++++++-- mindspore/ccsrc/debug/data_dump/dump_utils.h | 3 ++ mindspore/ccsrc/debug/data_dump/e2e_dump.cc | 29 ++++++++++--------- mindspore/ccsrc/debug/data_dump/e2e_dump.h | 5 +--- mindspore/ccsrc/debug/debugger/debugger.cc | 2 +- .../runtime/device/cpu/cpu_kernel_runtime.cc | 4 +-- tests/st/dump/test_data_dump.py | 23 ++++++++------- 9 files changed, 77 insertions(+), 37 deletions(-) diff --git a/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.cc b/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.cc index cbdd272c3fd..e2b2187229c 100644 --- a/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.cc +++ b/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.cc @@ -71,7 +71,13 @@ void CPUE2eDump::DumpInputImpl(const CNodePtr &node, const std::string &dump_pat ShapeVector int_shapes; GetDumpIntShape(input, index, NOT_NULL(&int_shapes)); auto type = AnfAlgo::GetOutputInferDataType(input, index); - std::string file_path = dump_path + '/' + *kernel_name + '_' + "input_" + std::to_string(j); + std::string op_type = AnfAlgo::GetCNodeName(node); + std::string op_name = GetOpNameWithoutScope(*kernel_name); + uint64_t timestamp = GetTimeStamp(); + const uint32_t kTaskId = 0; + const uint32_t kStreamId = 0; + std::string file_path = dump_path + '/' + op_type + '.' + op_name + '.' + std::to_string(kTaskId) + '.' + + std::to_string(kStreamId) + '.' + std::to_string(timestamp) + ".input." + std::to_string(j); DumpMemToFile(file_path, NOT_NULL(addr), int_shapes, type); } } @@ -88,7 +94,14 @@ void CPUE2eDump::DumpOutputImpl(const CNodePtr &node, const std::string &dump_pa ShapeVector int_shapes; GetDumpIntShape(node, j, NOT_NULL(&int_shapes)); auto type = AnfAlgo::GetOutputInferDataType(node, j); - std::string file_path = dump_path + '/' + *kernel_name + '_' + "output_" + std::to_string(j); + std::string op_type = AnfAlgo::GetCNodeName(node); + std::string op_name = GetOpNameWithoutScope(*kernel_name); + const uint32_t kTaskId = 0; + const uint32_t kStreamId = 0; + uint64_t timestamp = GetTimeStamp(); + std::string file_path = dump_path + '/' + op_type + '.' + op_name + '.' + std::to_string(kTaskId) + '.' + + std::to_string(kStreamId) + '.' + std::to_string(timestamp) + ".output." + + std::to_string(j); DumpMemToFile(file_path, NOT_NULL(addr), int_shapes, type); } } @@ -125,7 +138,11 @@ void CPUE2eDump::DumpSingleAnfNode(const AnfNodePtr &anf_node, const size_t outp GetDumpIntShape(anf_node, output_index, NOT_NULL(&int_shapes)); auto type = AnfAlgo::GetOutputInferDataType(anf_node, output_index); - std::string file_path = dump_path + '/' + dump_name + '_' + "output_0"; + uint64_t timestamp = GetTimeStamp(); + const uint32_t kTaskId = 0; + const uint32_t kStreamId = 0; + std::string file_path = dump_path + "/Parameter." + dump_name + '.' + std::to_string(kTaskId) + '.' + + std::to_string(kStreamId) + '.' + std::to_string(timestamp) + ".output.0"; DumpMemToFile(file_path, NOT_NULL(addr), int_shapes, type); } diff --git a/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc b/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc index de6c083323f..b7271052ed3 100644 --- a/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc +++ b/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc @@ -141,7 +141,11 @@ void DumpJsonParser::CopyHcclJsonToDir(uint32_t device_id) { } std::string config_path = common::GetEnv("MINDSPORE_HCCL_CONFIG_PATH"); if (config_path.empty()) { - return; + config_path = common::GetEnv("RANK_TABLE_FILE"); + if (config_path.empty()) { + MS_LOG(INFO) << "Get hccl json config failed."; + return; + } } std::ifstream json_file(config_path); auto realpath = Common::GetRealPath(path_ + "/rank_" + std::to_string(device_id) + "/.dump_metadata/hccl.json"); diff --git a/mindspore/ccsrc/debug/data_dump/dump_utils.cc b/mindspore/ccsrc/debug/data_dump/dump_utils.cc index e07341d8be1..1c02ec4b51f 100644 --- a/mindspore/ccsrc/debug/data_dump/dump_utils.cc +++ b/mindspore/ccsrc/debug/data_dump/dump_utils.cc @@ -42,9 +42,8 @@ std::string GenerateDumpPath(uint32_t graph_id, const uint32_t *device_id) { if (dump_path.back() != '/') { dump_path += "/"; } - uint32_t physical_device = device_id == nullptr ? 0 : ConvertPhysicalDeviceId(*device_id); dump_path += - ("rank_" + std::to_string(physical_device) + "/" + net_name + "/" + std::to_string(graph_id) + "/" + iterator); + ("rank_" + std::to_string(*device_id) + "/" + net_name + "/" + std::to_string(graph_id) + "/" + iterator); return dump_path; } @@ -124,4 +123,20 @@ void DumpMemToFile(const std::string &file_path, NotNull(cur_sys_time.time_since_epoch()).count(); + return timestamp; +} + +std::string GetOpNameWithoutScope(const std::string &fullname_with_scope) { + std::size_t found = fullname_with_scope.rfind("--"); + std::string op_name; + if (found != std::string::npos) { + op_name = fullname_with_scope.substr(found + 2); + } + return op_name; +} + } // namespace mindspore diff --git a/mindspore/ccsrc/debug/data_dump/dump_utils.h b/mindspore/ccsrc/debug/data_dump/dump_utils.h index 5daf41b9504..1a887755ebd 100644 --- a/mindspore/ccsrc/debug/data_dump/dump_utils.h +++ b/mindspore/ccsrc/debug/data_dump/dump_utils.h @@ -37,6 +37,9 @@ void GetDumpIntShape(const AnfNodePtr &node, size_t index, NotNull addr, const ShapeVector &int_shapes, const TypeId &type, bool trans_flag = false); +// Get time stamp since epoch in microseconds +uint64_t GetTimeStamp(); +std::string GetOpNameWithoutScope(const std::string &fullname_with_scope); } // namespace mindspore #endif // MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_DUMP_UTILS_H_ diff --git a/mindspore/ccsrc/debug/data_dump/e2e_dump.cc b/mindspore/ccsrc/debug/data_dump/e2e_dump.cc index e73771aab4e..8fa2a3b53de 100644 --- a/mindspore/ccsrc/debug/data_dump/e2e_dump.cc +++ b/mindspore/ccsrc/debug/data_dump/e2e_dump.cc @@ -1,5 +1,5 @@ /** - * Copyright 2020 Huawei Technologies Co., Ltd + * Copyright 2020-2021 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -90,10 +90,11 @@ void E2eDump::DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::s auto type = AnfAlgo::GetOutputInferDataType(node, j); auto device_type = AnfAlgo::GetOutputDeviceDataType(node, j); std::string op_type = AnfAlgo::GetCNodeName(node); + std::string op_name = GetOpNameWithoutScope(*kernel_name); uint32_t task_id = 0; uint32_t stream_id = 0; uint64_t timestamp = GetTimeStamp(); - std::string file_path = dump_path + '/' + op_type + '.' + *kernel_name + '.' + std::to_string(task_id) + '.' + + std::string file_path = dump_path + '/' + op_type + '.' + op_name + '.' + std::to_string(task_id) + '.' + std::to_string(stream_id) + '.' + std::to_string(timestamp) + ".output." + std::to_string(j); if (IsDeviceTargetGPU()) { @@ -105,12 +106,6 @@ void E2eDump::DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::s } } -uint64_t E2eDump::GetTimeStamp() { - auto cur_sys_time = std::chrono::system_clock::now(); - uint64_t timestamp = std::chrono::duration_cast(cur_sys_time.time_since_epoch()).count(); - return timestamp; -} - void E2eDump::DumpInput(const session::KernelGraph *graph, const std::string &dump_path, const Debugger *debugger) { MS_EXCEPTION_IF_NULL(graph); auto &dump_json_parser = DumpJsonParser::GetInstance(); @@ -161,10 +156,11 @@ void E2eDump::DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::st auto type = AnfAlgo::GetOutputInferDataType(input, index); auto device_type = AnfAlgo::GetOutputDeviceDataType(input, index); std::string op_type = AnfAlgo::GetCNodeName(node); + std::string op_name = GetOpNameWithoutScope(*kernel_name); uint64_t timestamp = GetTimeStamp(); uint32_t task_id = 0; uint32_t stream_id = 0; - std::string file_path = dump_path + '/' + op_type + '.' + *kernel_name + '.' + std::to_string(task_id) + '.' + + std::string file_path = dump_path + '/' + op_type + '.' + op_name + '.' + std::to_string(task_id) + '.' + std::to_string(stream_id) + '.' + std::to_string(timestamp) + ".input." + std::to_string(j); if (IsDeviceTargetGPU()) { DumpGPUMemToFile(file_path, tensor_name, NOT_NULL(addr), int_shapes, type, device_type, trans_flag, slot, @@ -207,7 +203,11 @@ void E2eDump::DumpSingleAnfNode(const AnfNodePtr &anf_node, const size_t output_ GetDumpIntShape(anf_node, output_index, NOT_NULL(&int_shapes), trans_flag); auto type = AnfAlgo::GetOutputInferDataType(anf_node, output_index); auto device_type = AnfAlgo::GetOutputDeviceDataType(anf_node, output_index); - std::string file_path = dump_path + '/' + dump_name + "_output_0"; + uint64_t timestamp = GetTimeStamp(); + uint32_t task_id = 0; + uint32_t stream_id = 0; + std::string file_path = dump_path + "/Parameter." + dump_name + '.' + std::to_string(task_id) + '.' + + std::to_string(stream_id) + '.' + std::to_string(timestamp) + ".input.0"; if (IsDeviceTargetGPU()) { DumpGPUMemToFile(file_path, node_name, NOT_NULL(addr), int_shapes, type, device_type, trans_flag, 0, debugger); } else { @@ -281,9 +281,10 @@ bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t device_id, co uint32_t graph_id = graph->graph_id(); if (starting_graph_id == INT32_MAX) { starting_graph_id = graph_id; - } - if (starting_graph_id == graph_id) { - dump_json_parser.UpdateDumpIter(); + } else { + if (starting_graph_id == graph_id) { + dump_json_parser.UpdateDumpIter(); + } } if (dump_json_parser.GetIterDumpFlag()) { @@ -296,7 +297,7 @@ bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t device_id, co DumpParametersAndConst(graph, dump_path, debugger); return true; } else if (dump_json_parser.AsyncDumpEnabled()) { - uint32_t prev_dump_iter = dump_json_parser.cur_dump_iter() - 1; + uint32_t prev_dump_iter = dump_json_parser.cur_dump_iter(); auto zero_dir_dump_path = dump_json_parser.path() + "/rank_" + std::to_string(device_id) + "/_/" + std::to_string(graph->graph_id()) + "/0"; diff --git a/mindspore/ccsrc/debug/data_dump/e2e_dump.h b/mindspore/ccsrc/debug/data_dump/e2e_dump.h index 74ed788af1c..c7e1543aaf6 100644 --- a/mindspore/ccsrc/debug/data_dump/e2e_dump.h +++ b/mindspore/ccsrc/debug/data_dump/e2e_dump.h @@ -1,5 +1,5 @@ /** - * Copyright 2020 Huawei Technologies Co., Ltd + * Copyright 2020-2021 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -57,9 +57,6 @@ class E2eDump { static void DumpSingleAnfNode(const AnfNodePtr &anf_node, const size_t output_index, const std::string &dump_path, bool trans_flag, std::map *const_map, const Debugger *debugger); inline static unsigned int starting_graph_id = INT32_MAX; - - // Get time stamp since epoch in microseconds - static uint64_t GetTimeStamp(); }; } // namespace mindspore #endif // MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_E_2_E_DUMP_UTIL_H_ diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc index 470acf03ea7..2ab0bfd175f 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.cc +++ b/mindspore/ccsrc/debug/debugger/debugger.cc @@ -1090,7 +1090,7 @@ std::vector Debugger::CheckOpOverflow() { } MS_LOG(INFO) << "Open overflow bin file " << file_name; const uint32_t offset = 321; - infile.seekg(offset, std::ios::beg); + (void)infile.seekg(offset, std::ios::beg); std::vector buffer; const size_t buf_size = 256; buffer.resize(buf_size); diff --git a/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc index 9b612e39af5..b463bd535d1 100644 --- a/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc @@ -1,5 +1,5 @@ /** - * Copyright 2019 Huawei Technologies Co., Ltd + * Copyright 2019-2021 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -380,7 +380,6 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, bool) { MS_EXCEPTION_IF_NULL(profiler_inst); auto &dump_json_parser = DumpJsonParser::GetInstance(); - dump_json_parser.UpdateDumpIter(); bool iter_dump_flag = dump_json_parser.GetIterDumpFlag(); uint32_t graph_id = kernel_graph->graph_id(); @@ -444,6 +443,7 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, bool) { if (iter_dump_flag) { CPUE2eDump::DumpParametersAndConst(kernel_graph, graph_id); } + dump_json_parser.UpdateDumpIter(); return true; } } // namespace cpu diff --git a/tests/st/dump/test_data_dump.py b/tests/st/dump/test_data_dump.py index dd0040e9079..6394c41c149 100644 --- a/tests/st/dump/test_data_dump.py +++ b/tests/st/dump/test_data_dump.py @@ -1,4 +1,4 @@ -# Copyright 2020 Huawei Technologies Co., Ltd +# Copyright 2020-2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ import json import sys import time import shutil +import glob import numpy as np import pytest @@ -85,21 +86,23 @@ def run_e2e_dump(): device_id = context.get_context("device_id") else: device_id = 0 - dump_file_path = dump_path + '/rank_{}/Net/0/1/'.format(device_id) + dump_file_path = dump_path + '/rank_{}/Net/0/0/'.format(device_id) if os.path.isdir(dump_path): shutil.rmtree(dump_path) add = Net() add(Tensor(x), Tensor(y)) time.sleep(5) assert len(os.listdir(dump_file_path)) == 5 - if context.get_context("device_target") == "CPU": - output_name = "Default--Add-op3_output_0.DefaultFormat.npy" - output_path = dump_file_path + output_name - real_path = os.path.realpath(output_path) - output = np.load(real_path) - expect = np.array([[8, 10, 12], [14, 16, 18]], np.float32) - assert output.dtype == expect.dtype - assert np.array_equal(output, expect) + if context.get_context("device_target") == "Ascend": + output_name = "Add.Add-op1.0.0.*.output.0.DefaultFormat.npy" + else: + output_name = "Add.Add-op3.0.0.*.output.0.DefaultFormat.npy" + output_path = glob.glob(dump_file_path + output_name)[0] + real_path = os.path.realpath(output_path) + output = np.load(real_path) + expect = np.array([[8, 10, 12], [14, 16, 18]], np.float32) + assert output.dtype == expect.dtype + assert np.array_equal(output, expect) @pytest.mark.level0