bugfix: fix task id extraction error in offline debugger
This commit is contained in:
parent
0fd1dfafd0
commit
79f920f499
|
@ -43,6 +43,7 @@ namespace mindspore {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static constexpr const char *constant_prefix = "Default--data-";
|
static constexpr const char *constant_prefix = "Default--data-";
|
||||||
|
static constexpr const char *kNpyExt = ".npy";
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
#ifdef __APPLE__
|
#ifdef __APPLE__
|
||||||
|
@ -815,7 +816,6 @@ void DebugServices::ReadTensorFromNpy(const std::string &tensor_name, const std:
|
||||||
* converted npy file name into AsyncFilePool. It's for Ascend async dump only.
|
* converted npy file name into AsyncFilePool. It's for Ascend async dump only.
|
||||||
*/
|
*/
|
||||||
void DebugServices::ConvertToHostFormat(const DirMap &dir_to_files_map, AsyncFilePool *const result_list) {
|
void DebugServices::ConvertToHostFormat(const DirMap &dir_to_files_map, AsyncFilePool *const result_list) {
|
||||||
std::string file_format = "npy";
|
|
||||||
for (auto const &d : dir_to_files_map) {
|
for (auto const &d : dir_to_files_map) {
|
||||||
std::vector<std::string> files_to_convert_in_dir;
|
std::vector<std::string> files_to_convert_in_dir;
|
||||||
std::vector<std::string> files_after_convert_in_dir;
|
std::vector<std::string> files_after_convert_in_dir;
|
||||||
|
@ -845,7 +845,7 @@ void DebugServices::ConvertToHostFormat(const DirMap &dir_to_files_map, AsyncFil
|
||||||
MS_LOG(EXCEPTION) << "Failed to convert async dump data: " << e.what();
|
MS_LOG(EXCEPTION) << "Failed to convert async dump data: " << e.what();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ProcessConvertToHostFormat(files_after_convert_in_dir, dump_key, result_list, file_format);
|
ProcessConvertToHostFormat(files_after_convert_in_dir, dump_key, result_list);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -858,8 +858,7 @@ void DebugServices::ConvertToHostFormat(const DirMap &dir_to_files_map, AsyncFil
|
||||||
* append into AsyncFilePool. It's for Ascend async dump only.
|
* append into AsyncFilePool. It's for Ascend async dump only.
|
||||||
*/
|
*/
|
||||||
void DebugServices::ProcessConvertToHostFormat(const std::vector<std::string> &files_after_convert_in_dir,
|
void DebugServices::ProcessConvertToHostFormat(const std::vector<std::string> &files_after_convert_in_dir,
|
||||||
const std::string &dump_key, AsyncFilePool *const result_list,
|
const std::string &dump_key, AsyncFilePool *const result_list) {
|
||||||
const std::string &file_format) {
|
|
||||||
std::string real_dump_iter_dir = RealPath(dump_key);
|
std::string real_dump_iter_dir = RealPath(dump_key);
|
||||||
DIR *d_handle = opendir(real_dump_iter_dir.c_str());
|
DIR *d_handle = opendir(real_dump_iter_dir.c_str());
|
||||||
if (d_handle == nullptr) {
|
if (d_handle == nullptr) {
|
||||||
|
@ -884,7 +883,7 @@ void DebugServices::ProcessConvertToHostFormat(const std::vector<std::string> &f
|
||||||
if (last_slash_pos != std::string::npos) {
|
if (last_slash_pos != std::string::npos) {
|
||||||
file_n = file_to_find.substr(last_slash_pos + 1);
|
file_n = file_to_find.substr(last_slash_pos + 1);
|
||||||
}
|
}
|
||||||
if (candidate.find(file_n) != std::string::npos && candidate.rfind(file_format) != std::string::npos) {
|
if (candidate.find(file_n) != std::string::npos && candidate.rfind(kNpyExt) != std::string::npos) {
|
||||||
// we found a converted file for this op
|
// we found a converted file for this op
|
||||||
std::string found_file = dump_key + "/" + candidate;
|
std::string found_file = dump_key + "/" + candidate;
|
||||||
result_list->insert(found_file);
|
result_list->insert(found_file);
|
||||||
|
@ -927,7 +926,6 @@ std::string GetNodeNameWithoutScope(const std::string &dump_style_name) {
|
||||||
void DebugServices::ConvertReadTensors(std::vector<std::string> backend_name, std::vector<size_t> slot,
|
void DebugServices::ConvertReadTensors(std::vector<std::string> backend_name, std::vector<size_t> slot,
|
||||||
std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
|
std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
|
||||||
std::vector<unsigned int> root_graph_id, AsyncFilePool *const result_list) {
|
std::vector<unsigned int> root_graph_id, AsyncFilePool *const result_list) {
|
||||||
std::string file_format = "npy";
|
|
||||||
DirMap dir_to_files_map;
|
DirMap dir_to_files_map;
|
||||||
for (unsigned int i = 0; i < backend_name.size(); i++) {
|
for (unsigned int i = 0; i < backend_name.size(); i++) {
|
||||||
// form prefix of the tensor file to read from graph pb node name
|
// form prefix of the tensor file to read from graph pb node name
|
||||||
|
@ -954,7 +952,7 @@ void DebugServices::ConvertReadTensors(std::vector<std::string> backend_name, st
|
||||||
MS_LOG(INFO) << "Directory does not exist in ConvertReadTensors.";
|
MS_LOG(INFO) << "Directory does not exist in ConvertReadTensors.";
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
ProcessConvertList(prefix_dump_file_name, file_format, specific_dump_dir, &dir_to_files_map, result_list);
|
ProcessConvertList(prefix_dump_file_name, specific_dump_dir, &dir_to_files_map, result_list);
|
||||||
(void)closedir(d);
|
(void)closedir(d);
|
||||||
}
|
}
|
||||||
ConvertToHostFormat(dir_to_files_map, result_list);
|
ConvertToHostFormat(dir_to_files_map, result_list);
|
||||||
|
@ -962,7 +960,6 @@ void DebugServices::ConvertReadTensors(std::vector<std::string> backend_name, st
|
||||||
|
|
||||||
void DebugServices::ConvertWatchPointNodes(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
|
void DebugServices::ConvertWatchPointNodes(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
|
||||||
const std::string &specific_dump_dir, AsyncFilePool *const result_list) {
|
const std::string &specific_dump_dir, AsyncFilePool *const result_list) {
|
||||||
std::string file_format = "npy";
|
|
||||||
DirMap dir_to_files_map;
|
DirMap dir_to_files_map;
|
||||||
for (const auto &node : proto_dump) {
|
for (const auto &node : proto_dump) {
|
||||||
std::string dump_name = std::get<1>(node);
|
std::string dump_name = std::get<1>(node);
|
||||||
|
@ -974,15 +971,14 @@ void DebugServices::ConvertWatchPointNodes(const std::vector<std::tuple<std::str
|
||||||
MS_LOG(INFO) << "Directory " << specific_dump_dir.c_str() << " does not exist in ConvertWatchPointNodes.";
|
MS_LOG(INFO) << "Directory " << specific_dump_dir.c_str() << " does not exist in ConvertWatchPointNodes.";
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
ProcessConvertList(dump_name, file_format, specific_dump_dir, &dir_to_files_map, result_list);
|
ProcessConvertList(dump_name, specific_dump_dir, &dir_to_files_map, result_list);
|
||||||
(void)closedir(d);
|
(void)closedir(d);
|
||||||
}
|
}
|
||||||
ConvertToHostFormat(dir_to_files_map, result_list);
|
ConvertToHostFormat(dir_to_files_map, result_list);
|
||||||
}
|
}
|
||||||
|
|
||||||
void DebugServices::ProcessConvertList(const std::string &prefix_dump_file_name, const std::string &file_format,
|
void DebugServices::ProcessConvertList(const std::string &prefix_dump_file_name, const std::string &specific_dump_dir,
|
||||||
const std::string &specific_dump_dir, DirMap *dir_to_files_map,
|
DirMap *dir_to_files_map, AsyncFilePool *const result_list) {
|
||||||
AsyncFilePool *const result_list) {
|
|
||||||
MS_EXCEPTION_IF_NULL(dir_to_files_map);
|
MS_EXCEPTION_IF_NULL(dir_to_files_map);
|
||||||
DIR *d = opendir(specific_dump_dir.c_str());
|
DIR *d = opendir(specific_dump_dir.c_str());
|
||||||
struct dirent *dir = nullptr;
|
struct dirent *dir = nullptr;
|
||||||
|
@ -1006,7 +1002,7 @@ void DebugServices::ProcessConvertList(const std::string &prefix_dump_file_name,
|
||||||
file_name.find(prefix_dump_file_name + ".", type_pos + 1) == std::string::npos) {
|
file_name.find(prefix_dump_file_name + ".", type_pos + 1) == std::string::npos) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (file_name.rfind(file_format) == std::string::npos) {
|
if (file_name.rfind(kNpyExt) == std::string::npos) {
|
||||||
std::size_t second_dot = file_name.find(".", file_name.find(prefix_dump_file_name + ".", type_pos + 1));
|
std::size_t second_dot = file_name.find(".", file_name.find(prefix_dump_file_name + ".", type_pos + 1));
|
||||||
file_name_w_o_perfix.replace(type_pos + 1, second_dot - type_pos - 1, prefix_dump_file_name);
|
file_name_w_o_perfix.replace(type_pos + 1, second_dot - type_pos - 1, prefix_dump_file_name);
|
||||||
// if file matches prefix and is in device format add to candidate files to convert.
|
// if file matches prefix and is in device format add to candidate files to convert.
|
||||||
|
@ -1924,7 +1920,7 @@ void DebugServices::AddOpOverflowOpNames(const std::string overflow_bin_path, st
|
||||||
<< ".";
|
<< ".";
|
||||||
task_stream_hit.push_back(std::make_pair(task_id, stream_id));
|
task_stream_hit.push_back(std::make_pair(task_id, stream_id));
|
||||||
} else {
|
} else {
|
||||||
// regular bin file
|
// regular bin file or npy file
|
||||||
bool success_parse = GetAttrsFromFilename(file_name, &node_name, &task_id, &stream_id);
|
bool success_parse = GetAttrsFromFilename(file_name, &node_name, &task_id, &stream_id);
|
||||||
if (success_parse) {
|
if (success_parse) {
|
||||||
task_stream_to_opname[std::make_pair(task_id, stream_id)] = node_name;
|
task_stream_to_opname[std::make_pair(task_id, stream_id)] = node_name;
|
||||||
|
@ -1951,10 +1947,13 @@ void DebugServices::AddOpOverflowOpNames(const std::string overflow_bin_path, st
|
||||||
* Target device group: Ascend.
|
* Target device group: Ascend.
|
||||||
* Runtime category: Old runtime, MindRT.
|
* Runtime category: Old runtime, MindRT.
|
||||||
* Description: Checks whether for the given node the operator overflow happened or not by checking the overflow
|
* Description: Checks whether for the given node the operator overflow happened or not by checking the overflow
|
||||||
* directory.
|
* directory. This function is for async mode only.
|
||||||
*/
|
*/
|
||||||
bool DebugServices::CheckOpOverflow(std::string node_name_to_find, unsigned int device_id, unsigned int root_graph_id,
|
bool DebugServices::CheckOpOverflow(std::string node_name_to_find, unsigned int device_id, unsigned int root_graph_id,
|
||||||
unsigned int iteration) {
|
unsigned int iteration) {
|
||||||
|
if (is_sync_mode_) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
std::string overflow_bin_path = "";
|
std::string overflow_bin_path = "";
|
||||||
#ifdef ONLINE_DBG_MODE
|
#ifdef ONLINE_DBG_MODE
|
||||||
overflow_bin_path = GetOnlineOpOverflowDir();
|
overflow_bin_path = GetOnlineOpOverflowDir();
|
||||||
|
@ -2048,12 +2047,27 @@ bool DebugServices::GetTaskIdStreamId(std::string file_name, std::string overflo
|
||||||
|
|
||||||
bool DebugServices::GetAttrsFromFilename(const std::string &file_name, std::string *const node_name, uint64_t *task_id,
|
bool DebugServices::GetAttrsFromFilename(const std::string &file_name, std::string *const node_name, uint64_t *task_id,
|
||||||
uint64_t *stream_id) {
|
uint64_t *stream_id) {
|
||||||
// get the node_name, task_id, and stream_id from dump filename
|
// get the node_name, task_id, and stream_id from dump filename in the following two formats:
|
||||||
// node_type.node_name.task_id.stream_id.{etcetera}
|
// 1. bin file: node_type.node_name.task_id.stream_id.timestamp
|
||||||
|
// 2. npy file: node_type.node_name.task_id.stream_id.timestamp.output_input.slot.format.npy
|
||||||
|
// Please note that node_name might contain dots (e.g. Parameter). So to search for the location of the second dot, we need
|
||||||
|
// to search the file name from right to left.
|
||||||
size_t first_dot = file_name.find(".");
|
size_t first_dot = file_name.find(".");
|
||||||
size_t second_dot = file_name.find(".", first_dot + 1);
|
size_t fourth_dot;
|
||||||
size_t third_dot = file_name.find(".", second_dot + 1);
|
if (file_name.rfind(kNpyExt) != std::string::npos) {
|
||||||
size_t fourth_dot = file_name.find(".", third_dot + 1);
|
// npy format file (converted file or A+M dump file)
|
||||||
|
size_t pos = file_name.rfind(".");
|
||||||
|
const int kFourthFromRight = 4;
|
||||||
|
for (int cnt = 0; cnt < kFourthFromRight; cnt++) {
|
||||||
|
pos = file_name.rfind(".", pos - 1);
|
||||||
|
}
|
||||||
|
fourth_dot = pos;
|
||||||
|
} else {
|
||||||
|
// bin format file
|
||||||
|
fourth_dot = file_name.rfind(".");
|
||||||
|
}
|
||||||
|
size_t third_dot = file_name.rfind(".", fourth_dot - 1);
|
||||||
|
size_t second_dot = file_name.rfind(".", third_dot - 1);
|
||||||
|
|
||||||
// check if dots were found
|
// check if dots were found
|
||||||
if (first_dot == std::string::npos || second_dot == std::string::npos || third_dot == std::string::npos ||
|
if (first_dot == std::string::npos || second_dot == std::string::npos || third_dot == std::string::npos ||
|
||||||
|
|
|
@ -372,8 +372,7 @@ class DebugServices {
|
||||||
void ConvertToHostFormat(const DirMap &dir_to_files_map, AsyncFilePool *const result_list);
|
void ConvertToHostFormat(const DirMap &dir_to_files_map, AsyncFilePool *const result_list);
|
||||||
|
|
||||||
void ProcessConvertToHostFormat(const std::vector<std::string> &files_after_convert_in_dir,
|
void ProcessConvertToHostFormat(const std::vector<std::string> &files_after_convert_in_dir,
|
||||||
const std::string &dump_key, AsyncFilePool *const result_list,
|
const std::string &dump_key, AsyncFilePool *const result_list);
|
||||||
const std::string &file_format);
|
|
||||||
|
|
||||||
void ConvertReadTensors(std::vector<std::string> backend_name, std::vector<size_t> slot,
|
void ConvertReadTensors(std::vector<std::string> backend_name, std::vector<size_t> slot,
|
||||||
std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
|
std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
|
||||||
|
@ -382,9 +381,8 @@ class DebugServices {
|
||||||
void ConvertWatchPointNodes(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
|
void ConvertWatchPointNodes(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
|
||||||
const std::string &specific_dump_dir, AsyncFilePool *const result_list);
|
const std::string &specific_dump_dir, AsyncFilePool *const result_list);
|
||||||
|
|
||||||
void ProcessConvertList(const std::string &prefix_dump_file_name, const std::string &file_format,
|
void ProcessConvertList(const std::string &prefix_dump_file_name, const std::string &specific_dump_dir,
|
||||||
const std::string &specific_dump_dir, DirMap *dir_to_files_map,
|
DirMap *dir_to_files_map, AsyncFilePool *const result_list);
|
||||||
AsyncFilePool *const result_list);
|
|
||||||
|
|
||||||
void GetTensorDataInfoAsync(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
|
void GetTensorDataInfoAsync(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
|
||||||
const std::string &specific_dump_dir, uint32_t iteration, uint32_t device_id,
|
const std::string &specific_dump_dir, uint32_t iteration, uint32_t device_id,
|
||||||
|
|
Loading…
Reference in New Issue