forked from mindspore-Ecosystem/mindspore
!20116 fix online debug overflow wp
Merge pull request !20116 from john_tzanakakis/jt_bug_fixes
This commit is contained in:
commit
960b8cb1a8
|
@ -22,6 +22,7 @@
|
|||
#include "backend/session/anf_runtime_algorithm.h"
|
||||
#include "debug/data_dump/npy_header.h"
|
||||
#include "debug/anf_ir_utils.h"
|
||||
#include "utils/comm_manager.h"
|
||||
|
||||
namespace {
|
||||
constexpr auto kCommonDumpSettings = "common_dump_settings";
|
||||
|
@ -511,18 +512,29 @@ void DumpJsonParser::PrintUnusedKernel() {
|
|||
}
|
||||
}
|
||||
|
||||
std::string DumpJsonParser::GetOpOverflowBinPath(uint32_t graph_id, uint32_t device_id) const {
|
||||
std::string DumpJsonParser::GetOpOverflowBinPath(uint32_t graph_id) const {
|
||||
std::string bin_path;
|
||||
bin_path.append(path_);
|
||||
bin_path.append("/");
|
||||
bin_path.append("rank_");
|
||||
bin_path.append(std::to_string(device_id));
|
||||
|
||||
uint32_t rank_id = 0;
|
||||
auto env_table_file = common::GetEnv("RANK_TABLE_FILE");
|
||||
auto env_rank_id = common::GetEnv("RANK_ID");
|
||||
if (!(env_table_file.empty() || env_rank_id.empty())) {
|
||||
// get actual rank id if it's distribution training case.
|
||||
if (!CommManager::GetInstance().GetRankID(kHcclWorldGroup, &rank_id)) {
|
||||
MS_LOG(INFO) << "Failed to get rank id.";
|
||||
}
|
||||
}
|
||||
bin_path.append(std::to_string(rank_id));
|
||||
|
||||
bin_path.append("/");
|
||||
bin_path.append(net_name_);
|
||||
bin_path.append("/");
|
||||
bin_path.append(std::to_string(graph_id));
|
||||
bin_path.append("/");
|
||||
bin_path.append(iteration_);
|
||||
bin_path.append(std::to_string(cur_dump_iter_));
|
||||
bin_path.append("/");
|
||||
|
||||
return bin_path;
|
||||
|
|
|
@ -60,7 +60,7 @@ class DumpJsonParser {
|
|||
bool GetIterDumpFlag() const;
|
||||
bool InputNeedDump() const;
|
||||
bool OutputNeedDump() const;
|
||||
std::string GetOpOverflowBinPath(uint32_t graph_id, uint32_t device_id) const;
|
||||
std::string GetOpOverflowBinPath(uint32_t graph_id) const;
|
||||
void UpdateNeedDumpKernels(NotNull<const session::KernelGraph *> kernel_graph);
|
||||
|
||||
void ClearGraph() { graphs_.clear(); }
|
||||
|
|
|
@ -188,39 +188,6 @@ void Debugger::EnableDebugger() {
|
|||
debug_services_ = std::make_unique<DebugServices>();
|
||||
}
|
||||
|
||||
void Debugger::SetOpOverflowBinPath(uint32_t graph_id) {
|
||||
#ifdef ENABLE_D
|
||||
// set operation overflow info
|
||||
overflow_bin_path_.insert(std::pair<uint32_t, std::string>(
|
||||
graph_id, DumpJsonParser::GetInstance().GetOpOverflowBinPath(graph_id, device_id_)));
|
||||
// new overflow dump files will have a timestamp greater than last_overflow_bin_
|
||||
auto overflow_bin_path = overflow_bin_path_.find(graph_id)->second;
|
||||
MS_LOG(INFO) << "overflow_bin_path = " << overflow_bin_path;
|
||||
DIR *d = opendir(overflow_bin_path.c_str());
|
||||
if (d != nullptr) {
|
||||
struct dirent *dir;
|
||||
while ((dir = readdir(d)) != nullptr) {
|
||||
if (dir->d_type == DT_REG) {
|
||||
std::string file_path = overflow_bin_path;
|
||||
file_path.append(dir->d_name);
|
||||
std::size_t found = file_path.find_last_of(".");
|
||||
if (found == std::string::npos) {
|
||||
continue;
|
||||
}
|
||||
std::string overflow_time = file_path.substr(found + 1);
|
||||
if (stod(overflow_time) <= last_overflow_bin_) {
|
||||
MS_LOG(INFO) << "Old op overflow bin folder" << file_path;
|
||||
continue;
|
||||
}
|
||||
last_overflow_bin_ = stod(overflow_time);
|
||||
}
|
||||
}
|
||||
MS_LOG(INFO) << "last op overflow bin folder" << last_overflow_bin_;
|
||||
closedir(d);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void Debugger::CheckDatasetSinkMode() {
|
||||
if (CheckDebuggerDumpEnabled() && ConfigManager::GetInstance().dataset_mode() == DS_SINK_MODE) {
|
||||
MS_EXCEPTION(NotSupportError)
|
||||
|
@ -572,9 +539,6 @@ void Debugger::LoadGraphs(const KernelGraphPtr &graph_ptr) {
|
|||
// add new graph proto to graph_proto_list_
|
||||
graph_proto_list_.push_back(graph_proto);
|
||||
graph_ptr_list_.push_back(graph_ptr);
|
||||
#ifdef ENABLE_D
|
||||
SetOpOverflowBinPath(graph_ptr->graph_id());
|
||||
#endif
|
||||
not_dataset_graph_sum_++;
|
||||
}
|
||||
// reset is_dataset_graph to be false
|
||||
|
@ -1189,83 +1153,59 @@ uint64_t BytestoUInt64(const std::vector<char> &buffer) {
|
|||
}
|
||||
|
||||
std::vector<std::string> Debugger::CheckOpOverflow() {
|
||||
std::vector<double> bin_list;
|
||||
std::vector<std::string> op_names;
|
||||
for (const auto &[graph_id, overflow_bin_path] : overflow_bin_path_) {
|
||||
DIR *d = opendir(overflow_bin_path.c_str());
|
||||
MS_LOG(INFO) << "processing bin file path " << overflow_bin_path << ", graph id " << graph_id;
|
||||
if (d != nullptr) {
|
||||
struct dirent *dir = nullptr;
|
||||
while ((dir = readdir(d)) != nullptr) {
|
||||
if (dir->d_type == DT_REG) {
|
||||
std::string file_path = overflow_bin_path;
|
||||
file_path.append(dir->d_name);
|
||||
std::string file_name = dir->d_name;
|
||||
std::size_t found = file_name.find_last_of(".");
|
||||
if (found == std::string::npos) {
|
||||
continue;
|
||||
}
|
||||
std::string overflow_time = file_name.substr(found + 1);
|
||||
if (stod(overflow_time) <= last_overflow_bin_) {
|
||||
MS_LOG(INFO) << "File already processed " << file_name;
|
||||
continue;
|
||||
}
|
||||
bin_list.push_back(stod(overflow_time));
|
||||
std::fstream infile;
|
||||
infile.open(file_path.c_str(), std::ios::binary | std::ios::in);
|
||||
if (!infile.is_open()) {
|
||||
MS_LOG(ERROR) << "Failed to open overflow bin file " << file_name;
|
||||
continue;
|
||||
}
|
||||
MS_LOG(INFO) << "Open overflow bin file " << file_name;
|
||||
const uint32_t offset = 321;
|
||||
(void)infile.seekg(offset, std::ios::beg);
|
||||
std::vector<char> buffer;
|
||||
const size_t buf_size = 256;
|
||||
buffer.resize(buf_size);
|
||||
(void)infile.read(buffer.data(), buf_size);
|
||||
const uint8_t stream_id_offset = 16;
|
||||
const uint8_t task_id_offset = 24;
|
||||
// The stream_id and task_id in the dump file are 8 byte fields for extensibility purpose, but only hold 4
|
||||
// byte values currently.
|
||||
uint64_t stream_id = BytestoUInt64(std::vector<char>(buffer.begin() + stream_id_offset, buffer.end()));
|
||||
uint64_t task_id = BytestoUInt64(std::vector<char>(buffer.begin() + task_id_offset, buffer.end()));
|
||||
MS_LOG(INFO) << "Overflow stream_id " << stream_id << ", task_id " << task_id << ".";
|
||||
auto op = debugger_->stream_task_to_opname_.find(std::make_pair(stream_id, task_id));
|
||||
if (op != debugger_->stream_task_to_opname_.end()) {
|
||||
MS_LOG(ERROR) << "Overflow detected on node " << op->second << std::endl;
|
||||
op_names.push_back(op->second);
|
||||
} else {
|
||||
MS_LOG(INFO) << "No overflow is detected " << std::endl;
|
||||
}
|
||||
infile.close();
|
||||
std::string overflow_bin_path = DumpJsonParser::GetInstance().GetOpOverflowBinPath(graph_ptr_->graph_id());
|
||||
MS_LOG(INFO) << "Processing bin file path " << overflow_bin_path;
|
||||
DIR *d = opendir(overflow_bin_path.c_str());
|
||||
if (d != nullptr) {
|
||||
struct dirent *dir = nullptr;
|
||||
while ((dir = readdir(d)) != nullptr) {
|
||||
if (dir->d_type == DT_REG) {
|
||||
std::string file_path = overflow_bin_path;
|
||||
std::string file_name = dir->d_name;
|
||||
file_path.append(file_name);
|
||||
std::fstream infile;
|
||||
infile.open(file_path.c_str(), std::ios::binary | std::ios::in);
|
||||
if (!infile.is_open()) {
|
||||
MS_LOG(ERROR) << "Failed to open overflow bin file " << file_name;
|
||||
continue;
|
||||
}
|
||||
// start of op overflow data in bin file
|
||||
const uint32_t offset = 321;
|
||||
(void)infile.seekg(offset, std::ios::beg);
|
||||
std::vector<char> buffer;
|
||||
// size of op overflow info section
|
||||
const size_t buf_size = 256;
|
||||
buffer.resize(buf_size);
|
||||
(void)infile.read(buffer.data(), buf_size);
|
||||
const uint8_t stream_id_offset = 16;
|
||||
const uint8_t task_id_offset = 24;
|
||||
// The stream_id and task_id in the dump file are 8 byte fields for extensibility purpose, but only hold 4
|
||||
// byte values currently.
|
||||
uint64_t stream_id = BytestoUInt64(std::vector<char>(buffer.begin() + stream_id_offset, buffer.end()));
|
||||
uint64_t task_id = BytestoUInt64(std::vector<char>(buffer.begin() + task_id_offset, buffer.end()));
|
||||
MS_LOG(INFO) << "Overflow bin file " << file_name << ", overflow stream_id " << stream_id << ", task_id "
|
||||
<< task_id << ".";
|
||||
auto op = debugger_->stream_task_to_opname_.find(std::make_pair(stream_id, task_id));
|
||||
if (op != debugger_->stream_task_to_opname_.end()) {
|
||||
MS_LOG(INFO) << "Overflow detected on node " << op->second << std::endl;
|
||||
op_names.push_back(op->second);
|
||||
} else {
|
||||
MS_LOG(INFO) << "No overflow is detected " << std::endl;
|
||||
}
|
||||
infile.close();
|
||||
}
|
||||
} else {
|
||||
MS_LOG(INFO) << "OverFlow bin directory does not exist!";
|
||||
}
|
||||
closedir(d);
|
||||
} else {
|
||||
MS_LOG(INFO) << "OverFlow bin directory does not exist!";
|
||||
}
|
||||
closedir(d);
|
||||
|
||||
if (!op_names.empty()) {
|
||||
MS_LOG(ERROR) << "These operation overflows are detected " << op_names;
|
||||
MS_LOG(INFO) << "These operation overflows are detected " << op_names;
|
||||
}
|
||||
|
||||
for (auto &i : bin_list) {
|
||||
if (i > last_overflow_bin_) {
|
||||
last_overflow_bin_ = i;
|
||||
}
|
||||
}
|
||||
|
||||
auto iter_op_names = overflow_ops_.find(num_step_);
|
||||
if (iter_op_names == overflow_ops_.end()) {
|
||||
overflow_ops_.insert(std::pair<uint32_t, std::vector<std::string>>(num_step_, op_names));
|
||||
|
||||
return op_names;
|
||||
}
|
||||
iter_op_names->second.insert(std::end(iter_op_names->second), std::begin(op_names), std::end(op_names));
|
||||
|
||||
return iter_op_names->second;
|
||||
return op_names;
|
||||
}
|
||||
|
||||
void Debugger::SetTrainingDone(bool training_done) { training_done_ = training_done; }
|
||||
|
|
|
@ -181,8 +181,6 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
|
|||
// read env variable for grpc client
|
||||
void EnableDebugger();
|
||||
|
||||
void SetOpOverflowBinPath(uint32_t graph_id);
|
||||
|
||||
// check if debugger enabled
|
||||
bool CheckDebuggerEnabled() const;
|
||||
|
||||
|
|
|
@ -123,7 +123,7 @@ void DataDumper::LoadDumpInfo() {
|
|||
std::inserter(stream_task_to_opname, stream_task_to_opname.end()),
|
||||
[](const std::pair<std::string, std::shared_ptr<RuntimeInfo>> &p)
|
||||
-> std::pair<std::pair<uint32_t, uint32_t>, std::string> {
|
||||
return {{std::get<1>(*p.second), std::get<0>(*p.second)}, p.first};
|
||||
return {{std::get<1>(*p.second), std::get<0>(*p.second)}, StripUniqueId(p.first)};
|
||||
});
|
||||
debugger->SetStreamTaskToOpnameMap(stream_task_to_opname);
|
||||
}
|
||||
|
@ -464,6 +464,18 @@ void DataDumper::DumpKernelInput(const CNodePtr &kernel, void *args, NotNull<aic
|
|||
offset += sizeof(void *);
|
||||
}
|
||||
}
|
||||
|
||||
std::string DataDumper::StripUniqueId(const std::string node_name) {
|
||||
size_t last_underscore = node_name.find_last_of('_');
|
||||
std::string stripped_node_name;
|
||||
if (last_underscore == string::npos) {
|
||||
MS_LOG(ERROR) << "Could not strip unique ID from " << node_name;
|
||||
stripped_node_name = node_name;
|
||||
} else {
|
||||
stripped_node_name = node_name.substr(0, last_underscore);
|
||||
}
|
||||
return stripped_node_name;
|
||||
}
|
||||
} // namespace ascend
|
||||
} // namespace device
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -67,6 +67,7 @@ class DataDumper {
|
|||
void GetNeedDumpKernelList(NotNull<std::map<std::string, CNodePtr> *> kernel_map) const;
|
||||
static void DumpKernelOutput(const CNodePtr &kernel, void *args, NotNull<aicpu::dump::Task *> task);
|
||||
static void DumpKernelInput(const CNodePtr &kernel, void *args, NotNull<aicpu::dump::Task *> task);
|
||||
static std::string StripUniqueId(const std::string node_name);
|
||||
static void RtLoadDumpData(const aicpu::dump::OpMappingInfo &dump_info, void **ptr);
|
||||
|
||||
std::function<void *()> model_handle_;
|
||||
|
|
Loading…
Reference in New Issue