From 39294bedfd699df4586e9271f43a846294f01e37 Mon Sep 17 00:00:00 2001 From: John Tzanakakis Date: Fri, 9 Jul 2021 20:04:21 -0400 Subject: [PATCH] fix online debug overflow wp --- .../ccsrc/debug/data_dump/dump_json_parser.cc | 18 ++- .../ccsrc/debug/data_dump/dump_json_parser.h | 2 +- mindspore/ccsrc/debug/debugger/debugger.cc | 148 ++++++------------ mindspore/ccsrc/debug/debugger/debugger.h | 2 - .../runtime/device/ascend/dump/data_dumper.cc | 14 +- .../runtime/device/ascend/dump/data_dumper.h | 1 + 6 files changed, 74 insertions(+), 111 deletions(-) diff --git a/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc b/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc index 05ea17d829e..5e30142e3e0 100644 --- a/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc +++ b/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc @@ -22,6 +22,7 @@ #include "backend/session/anf_runtime_algorithm.h" #include "debug/data_dump/npy_header.h" #include "debug/anf_ir_utils.h" +#include "utils/comm_manager.h" namespace { constexpr auto kCommonDumpSettings = "common_dump_settings"; @@ -511,18 +512,29 @@ void DumpJsonParser::PrintUnusedKernel() { } } -std::string DumpJsonParser::GetOpOverflowBinPath(uint32_t graph_id, uint32_t device_id) const { +std::string DumpJsonParser::GetOpOverflowBinPath(uint32_t graph_id) const { std::string bin_path; bin_path.append(path_); bin_path.append("/"); bin_path.append("rank_"); - bin_path.append(std::to_string(device_id)); + + uint32_t rank_id = 0; + auto env_table_file = common::GetEnv("RANK_TABLE_FILE"); + auto env_rank_id = common::GetEnv("RANK_ID"); + if (!(env_table_file.empty() || env_rank_id.empty())) { + // get actual rank id if it's distribution training case. + if (!CommManager::GetInstance().GetRankID(kHcclWorldGroup, &rank_id)) { + MS_LOG(INFO) << "Failed to get rank id."; + } + } + bin_path.append(std::to_string(rank_id)); + bin_path.append("/"); bin_path.append(net_name_); bin_path.append("/"); bin_path.append(std::to_string(graph_id)); bin_path.append("/"); - bin_path.append(iteration_); + bin_path.append(std::to_string(cur_dump_iter_)); bin_path.append("/"); return bin_path; diff --git a/mindspore/ccsrc/debug/data_dump/dump_json_parser.h b/mindspore/ccsrc/debug/data_dump/dump_json_parser.h index cc0524b9be2..6e187c8c33b 100644 --- a/mindspore/ccsrc/debug/data_dump/dump_json_parser.h +++ b/mindspore/ccsrc/debug/data_dump/dump_json_parser.h @@ -60,7 +60,7 @@ class DumpJsonParser { bool GetIterDumpFlag() const; bool InputNeedDump() const; bool OutputNeedDump() const; - std::string GetOpOverflowBinPath(uint32_t graph_id, uint32_t device_id) const; + std::string GetOpOverflowBinPath(uint32_t graph_id) const; void UpdateNeedDumpKernels(NotNull kernel_graph); void ClearGraph() { graphs_.clear(); } diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc index 79776a8c8d6..8e89418459e 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.cc +++ b/mindspore/ccsrc/debug/debugger/debugger.cc @@ -188,39 +188,6 @@ void Debugger::EnableDebugger() { debug_services_ = std::make_unique(); } -void Debugger::SetOpOverflowBinPath(uint32_t graph_id) { -#ifdef ENABLE_D - // set operation overflow info - overflow_bin_path_.insert(std::pair( - graph_id, DumpJsonParser::GetInstance().GetOpOverflowBinPath(graph_id, device_id_))); - // new overflow dump files will have a timestamp greater than last_overflow_bin_ - auto overflow_bin_path = overflow_bin_path_.find(graph_id)->second; - MS_LOG(INFO) << "overflow_bin_path = " << overflow_bin_path; - DIR *d = opendir(overflow_bin_path.c_str()); - if (d != nullptr) { - struct dirent *dir; - while ((dir = readdir(d)) != nullptr) { - if (dir->d_type == DT_REG) { - std::string file_path = overflow_bin_path; - file_path.append(dir->d_name); - std::size_t found = file_path.find_last_of("."); - if (found == std::string::npos) { - continue; - } - std::string overflow_time = file_path.substr(found + 1); - if (stod(overflow_time) <= last_overflow_bin_) { - MS_LOG(INFO) << "Old op overflow bin folder" << file_path; - continue; - } - last_overflow_bin_ = stod(overflow_time); - } - } - MS_LOG(INFO) << "last op overflow bin folder" << last_overflow_bin_; - closedir(d); - } -#endif -} - void Debugger::CheckDatasetSinkMode() { if (CheckDebuggerDumpEnabled() && ConfigManager::GetInstance().dataset_mode() == DS_SINK_MODE) { MS_EXCEPTION(NotSupportError) @@ -572,9 +539,6 @@ void Debugger::LoadGraphs(const KernelGraphPtr &graph_ptr) { // add new graph proto to graph_proto_list_ graph_proto_list_.push_back(graph_proto); graph_ptr_list_.push_back(graph_ptr); -#ifdef ENABLE_D - SetOpOverflowBinPath(graph_ptr->graph_id()); -#endif not_dataset_graph_sum_++; } // reset is_dataset_graph to be false @@ -1175,83 +1139,59 @@ uint64_t BytestoUInt64(const std::vector &buffer) { } std::vector Debugger::CheckOpOverflow() { - std::vector bin_list; std::vector op_names; - for (const auto &[graph_id, overflow_bin_path] : overflow_bin_path_) { - DIR *d = opendir(overflow_bin_path.c_str()); - MS_LOG(INFO) << "processing bin file path " << overflow_bin_path << ", graph id " << graph_id; - if (d != nullptr) { - struct dirent *dir = nullptr; - while ((dir = readdir(d)) != nullptr) { - if (dir->d_type == DT_REG) { - std::string file_path = overflow_bin_path; - file_path.append(dir->d_name); - std::string file_name = dir->d_name; - std::size_t found = file_name.find_last_of("."); - if (found == std::string::npos) { - continue; - } - std::string overflow_time = file_name.substr(found + 1); - if (stod(overflow_time) <= last_overflow_bin_) { - MS_LOG(INFO) << "File already processed " << file_name; - continue; - } - bin_list.push_back(stod(overflow_time)); - std::fstream infile; - infile.open(file_path.c_str(), std::ios::binary | std::ios::in); - if (!infile.is_open()) { - MS_LOG(ERROR) << "Failed to open overflow bin file " << file_name; - continue; - } - MS_LOG(INFO) << "Open overflow bin file " << file_name; - const uint32_t offset = 321; - (void)infile.seekg(offset, std::ios::beg); - std::vector buffer; - const size_t buf_size = 256; - buffer.resize(buf_size); - (void)infile.read(buffer.data(), buf_size); - const uint8_t stream_id_offset = 16; - const uint8_t task_id_offset = 24; - // The stream_id and task_id in the dump file are 8 byte fields for extensibility purpose, but only hold 4 - // byte values currently. - uint64_t stream_id = BytestoUInt64(std::vector(buffer.begin() + stream_id_offset, buffer.end())); - uint64_t task_id = BytestoUInt64(std::vector(buffer.begin() + task_id_offset, buffer.end())); - MS_LOG(INFO) << "Overflow stream_id " << stream_id << ", task_id " << task_id << "."; - auto op = debugger_->stream_task_to_opname_.find(std::make_pair(stream_id, task_id)); - if (op != debugger_->stream_task_to_opname_.end()) { - MS_LOG(ERROR) << "Overflow detected on node " << op->second << std::endl; - op_names.push_back(op->second); - } else { - MS_LOG(INFO) << "No overflow is detected " << std::endl; - } - infile.close(); + std::string overflow_bin_path = DumpJsonParser::GetInstance().GetOpOverflowBinPath(graph_ptr_->graph_id()); + MS_LOG(INFO) << "Processing bin file path " << overflow_bin_path; + DIR *d = opendir(overflow_bin_path.c_str()); + if (d != nullptr) { + struct dirent *dir = nullptr; + while ((dir = readdir(d)) != nullptr) { + if (dir->d_type == DT_REG) { + std::string file_path = overflow_bin_path; + std::string file_name = dir->d_name; + file_path.append(file_name); + std::fstream infile; + infile.open(file_path.c_str(), std::ios::binary | std::ios::in); + if (!infile.is_open()) { + MS_LOG(ERROR) << "Failed to open overflow bin file " << file_name; + continue; } + // start of op overflow data in bin file + const uint32_t offset = 321; + (void)infile.seekg(offset, std::ios::beg); + std::vector buffer; + // size of op overflow info section + const size_t buf_size = 256; + buffer.resize(buf_size); + (void)infile.read(buffer.data(), buf_size); + const uint8_t stream_id_offset = 16; + const uint8_t task_id_offset = 24; + // The stream_id and task_id in the dump file are 8 byte fields for extensibility purpose, but only hold 4 + // byte values currently. + uint64_t stream_id = BytestoUInt64(std::vector(buffer.begin() + stream_id_offset, buffer.end())); + uint64_t task_id = BytestoUInt64(std::vector(buffer.begin() + task_id_offset, buffer.end())); + MS_LOG(INFO) << "Overflow bin file " << file_name << ", overflow stream_id " << stream_id << ", task_id " + << task_id << "."; + auto op = debugger_->stream_task_to_opname_.find(std::make_pair(stream_id, task_id)); + if (op != debugger_->stream_task_to_opname_.end()) { + MS_LOG(INFO) << "Overflow detected on node " << op->second << std::endl; + op_names.push_back(op->second); + } else { + MS_LOG(INFO) << "No overflow is detected " << std::endl; + } + infile.close(); } - } else { - MS_LOG(INFO) << "OverFlow bin directory does not exist!"; } - closedir(d); + } else { + MS_LOG(INFO) << "OverFlow bin directory does not exist!"; } + closedir(d); if (!op_names.empty()) { - MS_LOG(ERROR) << "These operation overflows are detected " << op_names; + MS_LOG(INFO) << "These operation overflows are detected " << op_names; } - for (auto &i : bin_list) { - if (i > last_overflow_bin_) { - last_overflow_bin_ = i; - } - } - - auto iter_op_names = overflow_ops_.find(num_step_); - if (iter_op_names == overflow_ops_.end()) { - overflow_ops_.insert(std::pair>(num_step_, op_names)); - - return op_names; - } - iter_op_names->second.insert(std::end(iter_op_names->second), std::begin(op_names), std::end(op_names)); - - return iter_op_names->second; + return op_names; } void Debugger::SetTrainingDone(bool training_done) { training_done_ = training_done; } diff --git a/mindspore/ccsrc/debug/debugger/debugger.h b/mindspore/ccsrc/debug/debugger/debugger.h index c71afc129f8..d171848aadf 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.h +++ b/mindspore/ccsrc/debug/debugger/debugger.h @@ -179,8 +179,6 @@ class Debugger : public std::enable_shared_from_this { // read env variable for grpc client void EnableDebugger(); - void SetOpOverflowBinPath(uint32_t graph_id); - // check if debugger enabled bool CheckDebuggerEnabled() const; diff --git a/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc b/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc index c115985a657..6fc0a8d3014 100644 --- a/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc +++ b/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc @@ -123,7 +123,7 @@ void DataDumper::LoadDumpInfo() { std::inserter(stream_task_to_opname, stream_task_to_opname.end()), [](const std::pair> &p) -> std::pair, std::string> { - return {{std::get<1>(*p.second), std::get<0>(*p.second)}, p.first}; + return {{std::get<1>(*p.second), std::get<0>(*p.second)}, StripUniqueId(p.first)}; }); debugger->SetStreamTaskToOpnameMap(stream_task_to_opname); } @@ -464,6 +464,18 @@ void DataDumper::DumpKernelInput(const CNodePtr &kernel, void *args, NotNull *> kernel_map) const; static void DumpKernelOutput(const CNodePtr &kernel, void *args, NotNull task); static void DumpKernelInput(const CNodePtr &kernel, void *args, NotNull task); + static std::string StripUniqueId(const std::string node_name); static void RtLoadDumpData(const aicpu::dump::OpMappingInfo &dump_info, void **ptr); std::function model_handle_;