bugfix: extract task id error in offline dbg

This commit is contained in:
TinaMengtingZhang 2022-01-25 13:43:55 -05:00
parent 0fd1dfafd0
commit 79f920f499
2 changed files with 37 additions and 25 deletions

View File

@ -43,6 +43,7 @@ namespace mindspore {
#endif
static constexpr const char *constant_prefix = "Default--data-";
static constexpr const char *kNpyExt = ".npy";
namespace {
#ifdef __APPLE__
@ -815,7 +816,6 @@ void DebugServices::ReadTensorFromNpy(const std::string &tensor_name, const std:
* converted npy file name into AsyncFilePool. It's for Ascend async dump only.
*/
void DebugServices::ConvertToHostFormat(const DirMap &dir_to_files_map, AsyncFilePool *const result_list) {
std::string file_format = "npy";
for (auto const &d : dir_to_files_map) {
std::vector<std::string> files_to_convert_in_dir;
std::vector<std::string> files_after_convert_in_dir;
@ -845,7 +845,7 @@ void DebugServices::ConvertToHostFormat(const DirMap &dir_to_files_map, AsyncFil
MS_LOG(EXCEPTION) << "Failed to convert async dump data: " << e.what();
}
}
ProcessConvertToHostFormat(files_after_convert_in_dir, dump_key, result_list, file_format);
ProcessConvertToHostFormat(files_after_convert_in_dir, dump_key, result_list);
}
}
}
@ -858,8 +858,7 @@ void DebugServices::ConvertToHostFormat(const DirMap &dir_to_files_map, AsyncFil
* append into AsyncFilePool. It's for Ascend async dump only.
*/
void DebugServices::ProcessConvertToHostFormat(const std::vector<std::string> &files_after_convert_in_dir,
const std::string &dump_key, AsyncFilePool *const result_list,
const std::string &file_format) {
const std::string &dump_key, AsyncFilePool *const result_list) {
std::string real_dump_iter_dir = RealPath(dump_key);
DIR *d_handle = opendir(real_dump_iter_dir.c_str());
if (d_handle == nullptr) {
@ -884,7 +883,7 @@ void DebugServices::ProcessConvertToHostFormat(const std::vector<std::string> &f
if (last_slash_pos != std::string::npos) {
file_n = file_to_find.substr(last_slash_pos + 1);
}
if (candidate.find(file_n) != std::string::npos && candidate.rfind(file_format) != std::string::npos) {
if (candidate.find(file_n) != std::string::npos && candidate.rfind(kNpyExt) != std::string::npos) {
// we found a converted file for this op
std::string found_file = dump_key + "/" + candidate;
result_list->insert(found_file);
@ -927,7 +926,6 @@ std::string GetNodeNameWithoutScope(const std::string &dump_style_name) {
void DebugServices::ConvertReadTensors(std::vector<std::string> backend_name, std::vector<size_t> slot,
std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
std::vector<unsigned int> root_graph_id, AsyncFilePool *const result_list) {
std::string file_format = "npy";
DirMap dir_to_files_map;
for (unsigned int i = 0; i < backend_name.size(); i++) {
// form prefix of the tensor file to read from graph pb node name
@ -954,7 +952,7 @@ void DebugServices::ConvertReadTensors(std::vector<std::string> backend_name, st
MS_LOG(INFO) << "Directory does not exist in ConvertReadTensors.";
return;
}
ProcessConvertList(prefix_dump_file_name, file_format, specific_dump_dir, &dir_to_files_map, result_list);
ProcessConvertList(prefix_dump_file_name, specific_dump_dir, &dir_to_files_map, result_list);
(void)closedir(d);
}
ConvertToHostFormat(dir_to_files_map, result_list);
@ -962,7 +960,6 @@ void DebugServices::ConvertReadTensors(std::vector<std::string> backend_name, st
void DebugServices::ConvertWatchPointNodes(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
const std::string &specific_dump_dir, AsyncFilePool *const result_list) {
std::string file_format = "npy";
DirMap dir_to_files_map;
for (const auto &node : proto_dump) {
std::string dump_name = std::get<1>(node);
@ -974,15 +971,14 @@ void DebugServices::ConvertWatchPointNodes(const std::vector<std::tuple<std::str
MS_LOG(INFO) << "Directory " << specific_dump_dir.c_str() << " does not exist in ConvertWatchPointNodes.";
return;
}
ProcessConvertList(dump_name, file_format, specific_dump_dir, &dir_to_files_map, result_list);
ProcessConvertList(dump_name, specific_dump_dir, &dir_to_files_map, result_list);
(void)closedir(d);
}
ConvertToHostFormat(dir_to_files_map, result_list);
}
void DebugServices::ProcessConvertList(const std::string &prefix_dump_file_name, const std::string &file_format,
const std::string &specific_dump_dir, DirMap *dir_to_files_map,
AsyncFilePool *const result_list) {
void DebugServices::ProcessConvertList(const std::string &prefix_dump_file_name, const std::string &specific_dump_dir,
DirMap *dir_to_files_map, AsyncFilePool *const result_list) {
MS_EXCEPTION_IF_NULL(dir_to_files_map);
DIR *d = opendir(specific_dump_dir.c_str());
struct dirent *dir = nullptr;
@ -1006,7 +1002,7 @@ void DebugServices::ProcessConvertList(const std::string &prefix_dump_file_name,
file_name.find(prefix_dump_file_name + ".", type_pos + 1) == std::string::npos) {
continue;
}
if (file_name.rfind(file_format) == std::string::npos) {
if (file_name.rfind(kNpyExt) == std::string::npos) {
std::size_t second_dot = file_name.find(".", file_name.find(prefix_dump_file_name + ".", type_pos + 1));
file_name_w_o_perfix.replace(type_pos + 1, second_dot - type_pos - 1, prefix_dump_file_name);
// if file matches prefix and is in device format add to candidate files to convert.
@ -1924,7 +1920,7 @@ void DebugServices::AddOpOverflowOpNames(const std::string overflow_bin_path, st
<< ".";
task_stream_hit.push_back(std::make_pair(task_id, stream_id));
} else {
// regular bin file
// regular bin file or npy file
bool success_parse = GetAttrsFromFilename(file_name, &node_name, &task_id, &stream_id);
if (success_parse) {
task_stream_to_opname[std::make_pair(task_id, stream_id)] = node_name;
@ -1951,10 +1947,13 @@ void DebugServices::AddOpOverflowOpNames(const std::string overflow_bin_path, st
* Target device group: Ascend.
* Runtime category: Old runtime, MindRT.
* Description: Checks whether for the given node the operator overflow happened or not by checking the overflow
* directory.
* directory. This function is for async mode only.
*/
bool DebugServices::CheckOpOverflow(std::string node_name_to_find, unsigned int device_id, unsigned int root_graph_id,
unsigned int iteration) {
if (is_sync_mode_) {
return false;
}
std::string overflow_bin_path = "";
#ifdef ONLINE_DBG_MODE
overflow_bin_path = GetOnlineOpOverflowDir();
@ -2048,12 +2047,27 @@ bool DebugServices::GetTaskIdStreamId(std::string file_name, std::string overflo
bool DebugServices::GetAttrsFromFilename(const std::string &file_name, std::string *const node_name, uint64_t *task_id,
uint64_t *stream_id) {
// get the node_name, task_id, and stream_id from dump filename
// node_type.node_name.task_id.stream_id.{etcetera}
// get the node_name, task_id, and stream_id from dump filename in the following two formats:
// 1. bin file: node_type.node_name.task_id.stream_id.timestamp
// 2. npy file: node_type.node_name.task_id.stream_id.timestamp.output_input.slot.format.npy
// Please note that node_name might contain dot (i.e. Parameter). So to search for the location of second dot, we need
// to search the file name from right to left.
size_t first_dot = file_name.find(".");
size_t second_dot = file_name.find(".", first_dot + 1);
size_t third_dot = file_name.find(".", second_dot + 1);
size_t fourth_dot = file_name.find(".", third_dot + 1);
size_t fourth_dot;
if (file_name.rfind(kNpyExt) != std::string::npos) {
// npy format file (converted file or A+M dump file)
size_t pos = file_name.rfind(".");
const int kFourthFromRight = 4;
for (int cnt = 0; cnt < kFourthFromRight; cnt++) {
pos = file_name.rfind(".", pos - 1);
}
fourth_dot = pos;
} else {
// bin format file
fourth_dot = file_name.rfind(".");
}
size_t third_dot = file_name.rfind(".", fourth_dot - 1);
size_t second_dot = file_name.rfind(".", third_dot - 1);
// check if dots were found
if (first_dot == std::string::npos || second_dot == std::string::npos || third_dot == std::string::npos ||

View File

@ -372,8 +372,7 @@ class DebugServices {
void ConvertToHostFormat(const DirMap &dir_to_files_map, AsyncFilePool *const result_list);
void ProcessConvertToHostFormat(const std::vector<std::string> &files_after_convert_in_dir,
const std::string &dump_key, AsyncFilePool *const result_list,
const std::string &file_format);
const std::string &dump_key, AsyncFilePool *const result_list);
void ConvertReadTensors(std::vector<std::string> backend_name, std::vector<size_t> slot,
std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
@ -382,9 +381,8 @@ class DebugServices {
void ConvertWatchPointNodes(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
const std::string &specific_dump_dir, AsyncFilePool *const result_list);
void ProcessConvertList(const std::string &prefix_dump_file_name, const std::string &file_format,
const std::string &specific_dump_dir, DirMap *dir_to_files_map,
AsyncFilePool *const result_list);
void ProcessConvertList(const std::string &prefix_dump_file_name, const std::string &specific_dump_dir,
DirMap *dir_to_files_map, AsyncFilePool *const result_list);
void GetTensorDataInfoAsync(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
const std::string &specific_dump_dir, uint32_t iteration, uint32_t device_id,