Unify file name for parameter and CPU dump

This commit is contained in:
TinaMengtingZhang 2021-05-31 14:39:09 -04:00
parent 28848a97b9
commit f11d5fa8ad
9 changed files with 77 additions and 37 deletions

View File

@ -71,7 +71,13 @@ void CPUE2eDump::DumpInputImpl(const CNodePtr &node, const std::string &dump_pat
ShapeVector int_shapes;
GetDumpIntShape(input, index, NOT_NULL(&int_shapes));
auto type = AnfAlgo::GetOutputInferDataType(input, index);
std::string file_path = dump_path + '/' + *kernel_name + '_' + "input_" + std::to_string(j);
std::string op_type = AnfAlgo::GetCNodeName(node);
std::string op_name = GetOpNameWithoutScope(*kernel_name);
uint64_t timestamp = GetTimeStamp();
const uint32_t kTaskId = 0;
const uint32_t kStreamId = 0;
std::string file_path = dump_path + '/' + op_type + '.' + op_name + '.' + std::to_string(kTaskId) + '.' +
std::to_string(kStreamId) + '.' + std::to_string(timestamp) + ".input." + std::to_string(j);
DumpMemToFile(file_path, NOT_NULL(addr), int_shapes, type);
}
}
@ -88,7 +94,14 @@ void CPUE2eDump::DumpOutputImpl(const CNodePtr &node, const std::string &dump_pa
ShapeVector int_shapes;
GetDumpIntShape(node, j, NOT_NULL(&int_shapes));
auto type = AnfAlgo::GetOutputInferDataType(node, j);
std::string file_path = dump_path + '/' + *kernel_name + '_' + "output_" + std::to_string(j);
std::string op_type = AnfAlgo::GetCNodeName(node);
std::string op_name = GetOpNameWithoutScope(*kernel_name);
const uint32_t kTaskId = 0;
const uint32_t kStreamId = 0;
uint64_t timestamp = GetTimeStamp();
std::string file_path = dump_path + '/' + op_type + '.' + op_name + '.' + std::to_string(kTaskId) + '.' +
std::to_string(kStreamId) + '.' + std::to_string(timestamp) + ".output." +
std::to_string(j);
DumpMemToFile(file_path, NOT_NULL(addr), int_shapes, type);
}
}
@ -125,7 +138,11 @@ void CPUE2eDump::DumpSingleAnfNode(const AnfNodePtr &anf_node, const size_t outp
GetDumpIntShape(anf_node, output_index, NOT_NULL(&int_shapes));
auto type = AnfAlgo::GetOutputInferDataType(anf_node, output_index);
std::string file_path = dump_path + '/' + dump_name + '_' + "output_0";
uint64_t timestamp = GetTimeStamp();
const uint32_t kTaskId = 0;
const uint32_t kStreamId = 0;
std::string file_path = dump_path + "/Parameter." + dump_name + '.' + std::to_string(kTaskId) + '.' +
std::to_string(kStreamId) + '.' + std::to_string(timestamp) + ".output.0";
DumpMemToFile(file_path, NOT_NULL(addr), int_shapes, type);
}

View File

@ -141,7 +141,11 @@ void DumpJsonParser::CopyHcclJsonToDir(uint32_t device_id) {
}
std::string config_path = common::GetEnv("MINDSPORE_HCCL_CONFIG_PATH");
if (config_path.empty()) {
return;
config_path = common::GetEnv("RANK_TABLE_FILE");
if (config_path.empty()) {
MS_LOG(INFO) << "Get hccl json config failed.";
return;
}
}
std::ifstream json_file(config_path);
auto realpath = Common::GetRealPath(path_ + "/rank_" + std::to_string(device_id) + "/.dump_metadata/hccl.json");

View File

@ -42,9 +42,8 @@ std::string GenerateDumpPath(uint32_t graph_id, const uint32_t *device_id) {
if (dump_path.back() != '/') {
dump_path += "/";
}
uint32_t physical_device = device_id == nullptr ? 0 : ConvertPhysicalDeviceId(*device_id);
dump_path +=
("rank_" + std::to_string(physical_device) + "/" + net_name + "/" + std::to_string(graph_id) + "/" + iterator);
("rank_" + std::to_string(*device_id) + "/" + net_name + "/" + std::to_string(graph_id) + "/" + iterator);
return dump_path;
}
@ -124,4 +123,20 @@ void DumpMemToFile(const std::string &file_path, NotNull<const device::DeviceAdd
<< ".!";
}
}
// Returns the wall-clock time elapsed since the Unix epoch, in microseconds.
// Used to build unique dump file names of the form
// <op_type>.<op_name>.<task>.<stream>.<timestamp>.<io>.<slot>.
uint64_t GetTimeStamp() {
  using std::chrono::duration_cast;
  using std::chrono::microseconds;
  using std::chrono::system_clock;
  const auto since_epoch = system_clock::now().time_since_epoch();
  return static_cast<uint64_t>(duration_cast<microseconds>(since_epoch).count());
}
// Strips the scope prefix from a node's full name and returns the trailing op
// name, e.g. "Default--Add-op3" -> "Add-op3".
// @param fullname_with_scope  Node name, with scope segments joined by "--".
// @return The op name after the last "--". If no separator is present the
//         input is already an unscoped op name and is returned unchanged.
//         (Previously this case returned an empty string, which produced
//         malformed dump file names such as "Add..0.0.<ts>.output.0".)
std::string GetOpNameWithoutScope(const std::string &fullname_with_scope) {
  static const std::string kScopeSeparator = "--";
  const std::size_t found = fullname_with_scope.rfind(kScopeSeparator);
  if (found == std::string::npos) {
    return fullname_with_scope;
  }
  return fullname_with_scope.substr(found + kScopeSeparator.size());
}
} // namespace mindspore

View File

@ -37,6 +37,9 @@ void GetDumpIntShape(const AnfNodePtr &node, size_t index, NotNull<ShapeVector *
void DumpMemToFile(const std::string &file_path, NotNull<const device::DeviceAddress *> addr,
const ShapeVector &int_shapes, const TypeId &type, bool trans_flag = false);
// Get time stamp since epoch in microseconds
uint64_t GetTimeStamp();
std::string GetOpNameWithoutScope(const std::string &fullname_with_scope);
} // namespace mindspore
#endif // MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_DUMP_UTILS_H_

View File

@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -90,10 +90,11 @@ void E2eDump::DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::s
auto type = AnfAlgo::GetOutputInferDataType(node, j);
auto device_type = AnfAlgo::GetOutputDeviceDataType(node, j);
std::string op_type = AnfAlgo::GetCNodeName(node);
std::string op_name = GetOpNameWithoutScope(*kernel_name);
uint32_t task_id = 0;
uint32_t stream_id = 0;
uint64_t timestamp = GetTimeStamp();
std::string file_path = dump_path + '/' + op_type + '.' + *kernel_name + '.' + std::to_string(task_id) + '.' +
std::string file_path = dump_path + '/' + op_type + '.' + op_name + '.' + std::to_string(task_id) + '.' +
std::to_string(stream_id) + '.' + std::to_string(timestamp) + ".output." +
std::to_string(j);
if (IsDeviceTargetGPU()) {
@ -105,12 +106,6 @@ void E2eDump::DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::s
}
}
uint64_t E2eDump::GetTimeStamp() {
auto cur_sys_time = std::chrono::system_clock::now();
uint64_t timestamp = std::chrono::duration_cast<std::chrono::microseconds>(cur_sys_time.time_since_epoch()).count();
return timestamp;
}
void E2eDump::DumpInput(const session::KernelGraph *graph, const std::string &dump_path, const Debugger *debugger) {
MS_EXCEPTION_IF_NULL(graph);
auto &dump_json_parser = DumpJsonParser::GetInstance();
@ -161,10 +156,11 @@ void E2eDump::DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::st
auto type = AnfAlgo::GetOutputInferDataType(input, index);
auto device_type = AnfAlgo::GetOutputDeviceDataType(input, index);
std::string op_type = AnfAlgo::GetCNodeName(node);
std::string op_name = GetOpNameWithoutScope(*kernel_name);
uint64_t timestamp = GetTimeStamp();
uint32_t task_id = 0;
uint32_t stream_id = 0;
std::string file_path = dump_path + '/' + op_type + '.' + *kernel_name + '.' + std::to_string(task_id) + '.' +
std::string file_path = dump_path + '/' + op_type + '.' + op_name + '.' + std::to_string(task_id) + '.' +
std::to_string(stream_id) + '.' + std::to_string(timestamp) + ".input." + std::to_string(j);
if (IsDeviceTargetGPU()) {
DumpGPUMemToFile(file_path, tensor_name, NOT_NULL(addr), int_shapes, type, device_type, trans_flag, slot,
@ -207,7 +203,11 @@ void E2eDump::DumpSingleAnfNode(const AnfNodePtr &anf_node, const size_t output_
GetDumpIntShape(anf_node, output_index, NOT_NULL(&int_shapes), trans_flag);
auto type = AnfAlgo::GetOutputInferDataType(anf_node, output_index);
auto device_type = AnfAlgo::GetOutputDeviceDataType(anf_node, output_index);
std::string file_path = dump_path + '/' + dump_name + "_output_0";
uint64_t timestamp = GetTimeStamp();
uint32_t task_id = 0;
uint32_t stream_id = 0;
std::string file_path = dump_path + "/Parameter." + dump_name + '.' + std::to_string(task_id) + '.' +
std::to_string(stream_id) + '.' + std::to_string(timestamp) + ".input.0";
if (IsDeviceTargetGPU()) {
DumpGPUMemToFile(file_path, node_name, NOT_NULL(addr), int_shapes, type, device_type, trans_flag, 0, debugger);
} else {
@ -281,9 +281,10 @@ bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t device_id, co
uint32_t graph_id = graph->graph_id();
if (starting_graph_id == INT32_MAX) {
starting_graph_id = graph_id;
}
if (starting_graph_id == graph_id) {
dump_json_parser.UpdateDumpIter();
} else {
if (starting_graph_id == graph_id) {
dump_json_parser.UpdateDumpIter();
}
}
if (dump_json_parser.GetIterDumpFlag()) {
@ -296,7 +297,7 @@ bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t device_id, co
DumpParametersAndConst(graph, dump_path, debugger);
return true;
} else if (dump_json_parser.AsyncDumpEnabled()) {
uint32_t prev_dump_iter = dump_json_parser.cur_dump_iter() - 1;
uint32_t prev_dump_iter = dump_json_parser.cur_dump_iter();
auto zero_dir_dump_path =
dump_json_parser.path() + "/rank_" + std::to_string(device_id) + "/_/" + std::to_string(graph->graph_id()) + "/0";

View File

@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -57,9 +57,6 @@ class E2eDump {
static void DumpSingleAnfNode(const AnfNodePtr &anf_node, const size_t output_index, const std::string &dump_path,
bool trans_flag, std::map<std::string, size_t> *const_map, const Debugger *debugger);
inline static unsigned int starting_graph_id = INT32_MAX;
// Get time stamp since epoch in microseconds
static uint64_t GetTimeStamp();
};
} // namespace mindspore
#endif // MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_E_2_E_DUMP_UTIL_H_

View File

@ -1090,7 +1090,7 @@ std::vector<std::string> Debugger::CheckOpOverflow() {
}
MS_LOG(INFO) << "Open overflow bin file " << file_name;
const uint32_t offset = 321;
infile.seekg(offset, std::ios::beg);
(void)infile.seekg(offset, std::ios::beg);
std::vector<char> buffer;
const size_t buf_size = 256;
buffer.resize(buf_size);

View File

@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -380,7 +380,6 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, bool) {
MS_EXCEPTION_IF_NULL(profiler_inst);
auto &dump_json_parser = DumpJsonParser::GetInstance();
dump_json_parser.UpdateDumpIter();
bool iter_dump_flag = dump_json_parser.GetIterDumpFlag();
uint32_t graph_id = kernel_graph->graph_id();
@ -444,6 +443,7 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, bool) {
if (iter_dump_flag) {
CPUE2eDump::DumpParametersAndConst(kernel_graph, graph_id);
}
dump_json_parser.UpdateDumpIter();
return true;
}
} // namespace cpu

View File

@ -1,4 +1,4 @@
# Copyright 2020 Huawei Technologies Co., Ltd
# Copyright 2020-2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -17,6 +17,7 @@ import json
import sys
import time
import shutil
import glob
import numpy as np
import pytest
@ -85,21 +86,23 @@ def run_e2e_dump():
device_id = context.get_context("device_id")
else:
device_id = 0
dump_file_path = dump_path + '/rank_{}/Net/0/1/'.format(device_id)
dump_file_path = dump_path + '/rank_{}/Net/0/0/'.format(device_id)
if os.path.isdir(dump_path):
shutil.rmtree(dump_path)
add = Net()
add(Tensor(x), Tensor(y))
time.sleep(5)
assert len(os.listdir(dump_file_path)) == 5
if context.get_context("device_target") == "CPU":
output_name = "Default--Add-op3_output_0.DefaultFormat.npy"
output_path = dump_file_path + output_name
real_path = os.path.realpath(output_path)
output = np.load(real_path)
expect = np.array([[8, 10, 12], [14, 16, 18]], np.float32)
assert output.dtype == expect.dtype
assert np.array_equal(output, expect)
if context.get_context("device_target") == "Ascend":
output_name = "Add.Add-op1.0.0.*.output.0.DefaultFormat.npy"
else:
output_name = "Add.Add-op3.0.0.*.output.0.DefaultFormat.npy"
output_path = glob.glob(dump_file_path + output_name)[0]
real_path = os.path.realpath(output_path)
output = np.load(real_path)
expect = np.array([[8, 10, 12], [14, 16, 18]], np.float32)
assert output.dtype == expect.dtype
assert np.array_equal(output, expect)
@pytest.mark.level0