forked from mindspore-Ecosystem/mindspore
Unify file name for parameter and CPU dump
This commit is contained in:
parent
28848a97b9
commit
f11d5fa8ad
|
@ -71,7 +71,13 @@ void CPUE2eDump::DumpInputImpl(const CNodePtr &node, const std::string &dump_pat
|
|||
ShapeVector int_shapes;
|
||||
GetDumpIntShape(input, index, NOT_NULL(&int_shapes));
|
||||
auto type = AnfAlgo::GetOutputInferDataType(input, index);
|
||||
std::string file_path = dump_path + '/' + *kernel_name + '_' + "input_" + std::to_string(j);
|
||||
std::string op_type = AnfAlgo::GetCNodeName(node);
|
||||
std::string op_name = GetOpNameWithoutScope(*kernel_name);
|
||||
uint64_t timestamp = GetTimeStamp();
|
||||
const uint32_t kTaskId = 0;
|
||||
const uint32_t kStreamId = 0;
|
||||
std::string file_path = dump_path + '/' + op_type + '.' + op_name + '.' + std::to_string(kTaskId) + '.' +
|
||||
std::to_string(kStreamId) + '.' + std::to_string(timestamp) + ".input." + std::to_string(j);
|
||||
DumpMemToFile(file_path, NOT_NULL(addr), int_shapes, type);
|
||||
}
|
||||
}
|
||||
|
@ -88,7 +94,14 @@ void CPUE2eDump::DumpOutputImpl(const CNodePtr &node, const std::string &dump_pa
|
|||
ShapeVector int_shapes;
|
||||
GetDumpIntShape(node, j, NOT_NULL(&int_shapes));
|
||||
auto type = AnfAlgo::GetOutputInferDataType(node, j);
|
||||
std::string file_path = dump_path + '/' + *kernel_name + '_' + "output_" + std::to_string(j);
|
||||
std::string op_type = AnfAlgo::GetCNodeName(node);
|
||||
std::string op_name = GetOpNameWithoutScope(*kernel_name);
|
||||
const uint32_t kTaskId = 0;
|
||||
const uint32_t kStreamId = 0;
|
||||
uint64_t timestamp = GetTimeStamp();
|
||||
std::string file_path = dump_path + '/' + op_type + '.' + op_name + '.' + std::to_string(kTaskId) + '.' +
|
||||
std::to_string(kStreamId) + '.' + std::to_string(timestamp) + ".output." +
|
||||
std::to_string(j);
|
||||
DumpMemToFile(file_path, NOT_NULL(addr), int_shapes, type);
|
||||
}
|
||||
}
|
||||
|
@ -125,7 +138,11 @@ void CPUE2eDump::DumpSingleAnfNode(const AnfNodePtr &anf_node, const size_t outp
|
|||
GetDumpIntShape(anf_node, output_index, NOT_NULL(&int_shapes));
|
||||
auto type = AnfAlgo::GetOutputInferDataType(anf_node, output_index);
|
||||
|
||||
std::string file_path = dump_path + '/' + dump_name + '_' + "output_0";
|
||||
uint64_t timestamp = GetTimeStamp();
|
||||
const uint32_t kTaskId = 0;
|
||||
const uint32_t kStreamId = 0;
|
||||
std::string file_path = dump_path + "/Parameter." + dump_name + '.' + std::to_string(kTaskId) + '.' +
|
||||
std::to_string(kStreamId) + '.' + std::to_string(timestamp) + ".output.0";
|
||||
DumpMemToFile(file_path, NOT_NULL(addr), int_shapes, type);
|
||||
}
|
||||
|
||||
|
|
|
@ -141,7 +141,11 @@ void DumpJsonParser::CopyHcclJsonToDir(uint32_t device_id) {
|
|||
}
|
||||
std::string config_path = common::GetEnv("MINDSPORE_HCCL_CONFIG_PATH");
|
||||
if (config_path.empty()) {
|
||||
return;
|
||||
config_path = common::GetEnv("RANK_TABLE_FILE");
|
||||
if (config_path.empty()) {
|
||||
MS_LOG(INFO) << "Get hccl json config failed.";
|
||||
return;
|
||||
}
|
||||
}
|
||||
std::ifstream json_file(config_path);
|
||||
auto realpath = Common::GetRealPath(path_ + "/rank_" + std::to_string(device_id) + "/.dump_metadata/hccl.json");
|
||||
|
|
|
@ -42,9 +42,8 @@ std::string GenerateDumpPath(uint32_t graph_id, const uint32_t *device_id) {
|
|||
if (dump_path.back() != '/') {
|
||||
dump_path += "/";
|
||||
}
|
||||
uint32_t physical_device = device_id == nullptr ? 0 : ConvertPhysicalDeviceId(*device_id);
|
||||
dump_path +=
|
||||
("rank_" + std::to_string(physical_device) + "/" + net_name + "/" + std::to_string(graph_id) + "/" + iterator);
|
||||
("rank_" + std::to_string(*device_id) + "/" + net_name + "/" + std::to_string(graph_id) + "/" + iterator);
|
||||
return dump_path;
|
||||
}
|
||||
|
||||
|
@ -124,4 +123,20 @@ void DumpMemToFile(const std::string &file_path, NotNull<const device::DeviceAdd
|
|||
<< ".!";
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the current std::chrono::system_clock time, expressed as the number of
// microseconds elapsed since the clock's epoch.
uint64_t GetTimeStamp() {
  const auto since_epoch = std::chrono::system_clock::now().time_since_epoch();
  const auto micros = std::chrono::duration_cast<std::chrono::microseconds>(since_epoch).count();
  // duration::count() yields a signed value; convert explicitly instead of relying on an
  // implicit signed-to-unsigned conversion at the return.
  return static_cast<uint64_t>(micros);
}
|
||||
|
||||
// Strips the scope prefix from an operator name whose scope levels are joined with "--",
// e.g. "Default--Add-op3" -> "Add-op3".
//
// fullname_with_scope: operator name, optionally prefixed by one or more "--"-terminated scopes.
// Returns the text following the last "--". If the name contains no "--" it has no scope
// prefix to remove, so it is returned unchanged.
std::string GetOpNameWithoutScope(const std::string &fullname_with_scope) {
  const std::string scope_separator = "--";
  const std::size_t found = fullname_with_scope.rfind(scope_separator);
  if (found == std::string::npos) {
    // Returning an empty string here would drop the operator name entirely and leave an
    // empty field in the dump file name built by the callers.
    return fullname_with_scope;
  }
  return fullname_with_scope.substr(found + scope_separator.size());
}
|
||||
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -37,6 +37,9 @@ void GetDumpIntShape(const AnfNodePtr &node, size_t index, NotNull<ShapeVector *
|
|||
|
||||
void DumpMemToFile(const std::string &file_path, NotNull<const device::DeviceAddress *> addr,
|
||||
const ShapeVector &int_shapes, const TypeId &type, bool trans_flag = false);
|
||||
// Get time stamp since epoch in microseconds
|
||||
uint64_t GetTimeStamp();
|
||||
// Get the operator name with its scope prefix removed, i.e. the part after the last "--"
std::string GetOpNameWithoutScope(const std::string &fullname_with_scope);
|
||||
} // namespace mindspore
|
||||
|
||||
#endif // MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_DUMP_UTILS_H_
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
@ -90,10 +90,11 @@ void E2eDump::DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::s
|
|||
auto type = AnfAlgo::GetOutputInferDataType(node, j);
|
||||
auto device_type = AnfAlgo::GetOutputDeviceDataType(node, j);
|
||||
std::string op_type = AnfAlgo::GetCNodeName(node);
|
||||
std::string op_name = GetOpNameWithoutScope(*kernel_name);
|
||||
uint32_t task_id = 0;
|
||||
uint32_t stream_id = 0;
|
||||
uint64_t timestamp = GetTimeStamp();
|
||||
std::string file_path = dump_path + '/' + op_type + '.' + *kernel_name + '.' + std::to_string(task_id) + '.' +
|
||||
std::string file_path = dump_path + '/' + op_type + '.' + op_name + '.' + std::to_string(task_id) + '.' +
|
||||
std::to_string(stream_id) + '.' + std::to_string(timestamp) + ".output." +
|
||||
std::to_string(j);
|
||||
if (IsDeviceTargetGPU()) {
|
||||
|
@ -105,12 +106,6 @@ void E2eDump::DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::s
|
|||
}
|
||||
}
|
||||
|
||||
// Returns the current std::chrono::system_clock time since epoch, in microseconds.
uint64_t E2eDump::GetTimeStamp() {
|
||||
auto cur_sys_time = std::chrono::system_clock::now();
|
||||
uint64_t timestamp = std::chrono::duration_cast<std::chrono::microseconds>(cur_sys_time.time_since_epoch()).count();
|
||||
return timestamp;
|
||||
}
|
||||
|
||||
void E2eDump::DumpInput(const session::KernelGraph *graph, const std::string &dump_path, const Debugger *debugger) {
|
||||
MS_EXCEPTION_IF_NULL(graph);
|
||||
auto &dump_json_parser = DumpJsonParser::GetInstance();
|
||||
|
@ -161,10 +156,11 @@ void E2eDump::DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::st
|
|||
auto type = AnfAlgo::GetOutputInferDataType(input, index);
|
||||
auto device_type = AnfAlgo::GetOutputDeviceDataType(input, index);
|
||||
std::string op_type = AnfAlgo::GetCNodeName(node);
|
||||
std::string op_name = GetOpNameWithoutScope(*kernel_name);
|
||||
uint64_t timestamp = GetTimeStamp();
|
||||
uint32_t task_id = 0;
|
||||
uint32_t stream_id = 0;
|
||||
std::string file_path = dump_path + '/' + op_type + '.' + *kernel_name + '.' + std::to_string(task_id) + '.' +
|
||||
std::string file_path = dump_path + '/' + op_type + '.' + op_name + '.' + std::to_string(task_id) + '.' +
|
||||
std::to_string(stream_id) + '.' + std::to_string(timestamp) + ".input." + std::to_string(j);
|
||||
if (IsDeviceTargetGPU()) {
|
||||
DumpGPUMemToFile(file_path, tensor_name, NOT_NULL(addr), int_shapes, type, device_type, trans_flag, slot,
|
||||
|
@ -207,7 +203,11 @@ void E2eDump::DumpSingleAnfNode(const AnfNodePtr &anf_node, const size_t output_
|
|||
GetDumpIntShape(anf_node, output_index, NOT_NULL(&int_shapes), trans_flag);
|
||||
auto type = AnfAlgo::GetOutputInferDataType(anf_node, output_index);
|
||||
auto device_type = AnfAlgo::GetOutputDeviceDataType(anf_node, output_index);
|
||||
std::string file_path = dump_path + '/' + dump_name + "_output_0";
|
||||
uint64_t timestamp = GetTimeStamp();
|
||||
uint32_t task_id = 0;
|
||||
uint32_t stream_id = 0;
|
||||
std::string file_path = dump_path + "/Parameter." + dump_name + '.' + std::to_string(task_id) + '.' +
|
||||
std::to_string(stream_id) + '.' + std::to_string(timestamp) + ".input.0";
|
||||
if (IsDeviceTargetGPU()) {
|
||||
DumpGPUMemToFile(file_path, node_name, NOT_NULL(addr), int_shapes, type, device_type, trans_flag, 0, debugger);
|
||||
} else {
|
||||
|
@ -281,9 +281,10 @@ bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t device_id, co
|
|||
uint32_t graph_id = graph->graph_id();
|
||||
if (starting_graph_id == INT32_MAX) {
|
||||
starting_graph_id = graph_id;
|
||||
}
|
||||
if (starting_graph_id == graph_id) {
|
||||
dump_json_parser.UpdateDumpIter();
|
||||
} else {
|
||||
if (starting_graph_id == graph_id) {
|
||||
dump_json_parser.UpdateDumpIter();
|
||||
}
|
||||
}
|
||||
|
||||
if (dump_json_parser.GetIterDumpFlag()) {
|
||||
|
@ -296,7 +297,7 @@ bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t device_id, co
|
|||
DumpParametersAndConst(graph, dump_path, debugger);
|
||||
return true;
|
||||
} else if (dump_json_parser.AsyncDumpEnabled()) {
|
||||
uint32_t prev_dump_iter = dump_json_parser.cur_dump_iter() - 1;
|
||||
uint32_t prev_dump_iter = dump_json_parser.cur_dump_iter();
|
||||
|
||||
auto zero_dir_dump_path =
|
||||
dump_json_parser.path() + "/rank_" + std::to_string(device_id) + "/_/" + std::to_string(graph->graph_id()) + "/0";
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
@ -57,9 +57,6 @@ class E2eDump {
|
|||
static void DumpSingleAnfNode(const AnfNodePtr &anf_node, const size_t output_index, const std::string &dump_path,
|
||||
bool trans_flag, std::map<std::string, size_t> *const_map, const Debugger *debugger);
|
||||
inline static unsigned int starting_graph_id = INT32_MAX;
|
||||
|
||||
// Get time stamp since epoch in microseconds
|
||||
static uint64_t GetTimeStamp();
|
||||
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_E_2_E_DUMP_UTIL_H_
|
||||
|
|
|
@ -1090,7 +1090,7 @@ std::vector<std::string> Debugger::CheckOpOverflow() {
|
|||
}
|
||||
MS_LOG(INFO) << "Open overflow bin file " << file_name;
|
||||
const uint32_t offset = 321;
|
||||
infile.seekg(offset, std::ios::beg);
|
||||
(void)infile.seekg(offset, std::ios::beg);
|
||||
std::vector<char> buffer;
|
||||
const size_t buf_size = 256;
|
||||
buffer.resize(buf_size);
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/**
|
||||
* Copyright 2019 Huawei Technologies Co., Ltd
|
||||
* Copyright 2019-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
@ -380,7 +380,6 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, bool) {
|
|||
MS_EXCEPTION_IF_NULL(profiler_inst);
|
||||
|
||||
auto &dump_json_parser = DumpJsonParser::GetInstance();
|
||||
dump_json_parser.UpdateDumpIter();
|
||||
bool iter_dump_flag = dump_json_parser.GetIterDumpFlag();
|
||||
uint32_t graph_id = kernel_graph->graph_id();
|
||||
|
||||
|
@ -444,6 +443,7 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, bool) {
|
|||
if (iter_dump_flag) {
|
||||
CPUE2eDump::DumpParametersAndConst(kernel_graph, graph_id);
|
||||
}
|
||||
dump_json_parser.UpdateDumpIter();
|
||||
return true;
|
||||
}
|
||||
} // namespace cpu
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
# Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
|
@ -17,6 +17,7 @@ import json
|
|||
import sys
|
||||
import time
|
||||
import shutil
|
||||
import glob
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
@ -85,21 +86,23 @@ def run_e2e_dump():
|
|||
device_id = context.get_context("device_id")
|
||||
else:
|
||||
device_id = 0
|
||||
dump_file_path = dump_path + '/rank_{}/Net/0/1/'.format(device_id)
|
||||
dump_file_path = dump_path + '/rank_{}/Net/0/0/'.format(device_id)
|
||||
if os.path.isdir(dump_path):
|
||||
shutil.rmtree(dump_path)
|
||||
add = Net()
|
||||
add(Tensor(x), Tensor(y))
|
||||
time.sleep(5)
|
||||
assert len(os.listdir(dump_file_path)) == 5
|
||||
if context.get_context("device_target") == "CPU":
|
||||
output_name = "Default--Add-op3_output_0.DefaultFormat.npy"
|
||||
output_path = dump_file_path + output_name
|
||||
real_path = os.path.realpath(output_path)
|
||||
output = np.load(real_path)
|
||||
expect = np.array([[8, 10, 12], [14, 16, 18]], np.float32)
|
||||
assert output.dtype == expect.dtype
|
||||
assert np.array_equal(output, expect)
|
||||
if context.get_context("device_target") == "Ascend":
|
||||
output_name = "Add.Add-op1.0.0.*.output.0.DefaultFormat.npy"
|
||||
else:
|
||||
output_name = "Add.Add-op3.0.0.*.output.0.DefaultFormat.npy"
|
||||
output_path = glob.glob(dump_file_path + output_name)[0]
|
||||
real_path = os.path.realpath(output_path)
|
||||
output = np.load(real_path)
|
||||
expect = np.array([[8, 10, 12], [14, 16, 18]], np.float32)
|
||||
assert output.dtype == expect.dtype
|
||||
assert np.array_equal(output, expect)
|
||||
|
||||
|
||||
@pytest.mark.level0
|
||||
|
|
Loading…
Reference in New Issue