forked from mindspore-Ecosystem/mindspore
!9740 Operator overflow watchpoint support for multiple graphs
From: @adelshafiei Reviewed-by: @john_tzanakakis Signed-off-by:
This commit is contained in:
commit
b76a852be6
|
@ -364,6 +364,7 @@ void DumpJsonParser::PrintUnusedKernel() {
|
|||
std::string DumpJsonParser::GetOpOverflowBinPath(uint32_t graph_id, uint32_t device_id) const {
|
||||
std::string bin_path;
|
||||
bin_path.append(path_);
|
||||
bin_path.append("/");
|
||||
bin_path.append("device_");
|
||||
bin_path.append(std::to_string(device_id));
|
||||
bin_path.append("/");
|
||||
|
|
|
@ -69,7 +69,6 @@ Debugger::Debugger()
|
|||
is_dataset_graph_(false),
|
||||
partial_memory_(false),
|
||||
last_overflow_bin_(0),
|
||||
overflow_bin_path_(""),
|
||||
initial_suspend_(true),
|
||||
not_dataset_graph_sum_(0),
|
||||
version_("") {
|
||||
|
@ -161,43 +160,45 @@ void Debugger::EnableDebugger() {
|
|||
}
|
||||
MS_LOG(INFO) << "Environment variable MS_DEBUGGER_PORT doesn't exist. Using default debugger port: 50051";
|
||||
}
|
||||
|
||||
#ifdef ENABLE_D
|
||||
// set operation overflow info
|
||||
overflow_bin_path_ = DumpJsonParser::GetInstance().GetOpOverflowBinPath(graph_ptr_->graph_id(), device_id_);
|
||||
// new overflow dump files will have a timestamp greater than last_overflow_bin_
|
||||
last_overflow_bin_ = 0;
|
||||
DIR *d;
|
||||
d = opendir(overflow_bin_path_.c_str());
|
||||
if (d != nullptr) {
|
||||
struct dirent *dir;
|
||||
while ((dir = readdir(d)) != NULL) {
|
||||
if (dir->d_type == DT_REG) {
|
||||
std::string file_path = overflow_bin_path_;
|
||||
file_path.append(dir->d_name);
|
||||
std::size_t found = file_path.find_last_of(".");
|
||||
if (found == std::string::npos) {
|
||||
continue;
|
||||
}
|
||||
std::string overflow_time = file_path.substr(found + 1);
|
||||
if (stod(overflow_time) <= last_overflow_bin_) {
|
||||
MS_LOG(INFO) << "Old op overflow bin folder" << file_path;
|
||||
continue;
|
||||
}
|
||||
last_overflow_bin_ = stod(overflow_time);
|
||||
}
|
||||
}
|
||||
MS_LOG(INFO) << "last op overflow bin folder" << last_overflow_bin_;
|
||||
closedir(d);
|
||||
}
|
||||
#endif
|
||||
|
||||
// initialize grpc client
|
||||
grpc_client_ = std::make_unique<GrpcClient>(host, port);
|
||||
}
|
||||
debug_services_ = std::make_unique<DebugServices>();
|
||||
}
|
||||
|
||||
void Debugger::SetOpOverflowBinPath(uint32_t graph_id) {
|
||||
#ifdef ENABLE_D
|
||||
// set operation overflow info
|
||||
overflow_bin_path_.insert(std::pair<uint32_t, std::string>(
|
||||
graph_id, DumpJsonParser::GetInstance().GetOpOverflowBinPath(graph_id, device_id_)));
|
||||
// new overflow dump files will have a timestamp greater than last_overflow_bin_
|
||||
auto overflow_bin_path = overflow_bin_path_.find(graph_id)->second;
|
||||
DIR *d;
|
||||
d = opendir(overflow_bin_path.c_str());
|
||||
if (d != nullptr) {
|
||||
struct dirent *dir;
|
||||
while ((dir = readdir(d)) != NULL) {
|
||||
if (dir->d_type == DT_REG) {
|
||||
std::string file_path = overflow_bin_path;
|
||||
file_path.append(dir->d_name);
|
||||
std::size_t found = file_path.find_last_of(".");
|
||||
if (found == std::string::npos) {
|
||||
continue;
|
||||
}
|
||||
std::string overflow_time = file_path.substr(found + 1);
|
||||
if (stod(overflow_time) <= last_overflow_bin_) {
|
||||
MS_LOG(INFO) << "Old op overflow bin folder" << file_path;
|
||||
continue;
|
||||
}
|
||||
last_overflow_bin_ = stod(overflow_time);
|
||||
}
|
||||
}
|
||||
MS_LOG(INFO) << "last op overflow bin folder" << last_overflow_bin_;
|
||||
closedir(d);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void Debugger::CheckDatasetSinkMode() {
|
||||
if (CheckDebuggerDumpEnabled() && ConfigManager::GetInstance().dataset_mode() == DS_SINK_MODE) {
|
||||
MS_EXCEPTION(NotSupportError)
|
||||
|
@ -256,7 +257,7 @@ void Debugger::Reset() {
|
|||
grpc_client_ = nullptr;
|
||||
debug_services_ = nullptr;
|
||||
last_overflow_bin_ = 0;
|
||||
overflow_bin_path_ = "";
|
||||
overflow_bin_path_.clear();
|
||||
stream_task_to_opname_.clear();
|
||||
}
|
||||
|
||||
|
@ -390,6 +391,9 @@ void Debugger::LoadGraphs(const KernelGraphPtr &graph_ptr) {
|
|||
// add new graph proto to graph_proto_list_
|
||||
graph_proto_list_.push_back(graph_proto);
|
||||
graph_ptr_list_.push_back(graph_ptr);
|
||||
#ifdef ENABLE_D
|
||||
SetOpOverflowBinPath(graph_ptr->graph_id());
|
||||
#endif
|
||||
not_dataset_graph_sum_++;
|
||||
}
|
||||
// reset is_dataset_graph to be false
|
||||
|
@ -991,52 +995,55 @@ uint64_t BytestoInt64(const std::vector<char> &buffer) {
|
|||
std::vector<std::string> Debugger::CheckOpOverflow() {
|
||||
std::vector<double> bin_list;
|
||||
std::vector<std::string> op_names;
|
||||
DIR *d;
|
||||
struct dirent *dir = nullptr;
|
||||
d = opendir(overflow_bin_path_.c_str());
|
||||
if (d != nullptr) {
|
||||
while ((dir = readdir(d)) != NULL) {
|
||||
if (dir->d_type == DT_REG) {
|
||||
std::string file_path = overflow_bin_path_;
|
||||
file_path.append(dir->d_name);
|
||||
std::string file_name = dir->d_name;
|
||||
std::size_t found = file_name.find_last_of(".");
|
||||
if (found == std::string::npos) {
|
||||
continue;
|
||||
for (const auto &[graph_id, overflow_bin_path] : overflow_bin_path_) {
|
||||
DIR *d;
|
||||
d = opendir(overflow_bin_path.c_str());
|
||||
MS_LOG(INFO) << "processing bin file path " << overflow_bin_path << ", graph id " << graph_id;
|
||||
if (d != nullptr) {
|
||||
struct dirent *dir = nullptr;
|
||||
while ((dir = readdir(d)) != NULL) {
|
||||
if (dir->d_type == DT_REG) {
|
||||
std::string file_path = overflow_bin_path;
|
||||
file_path.append(dir->d_name);
|
||||
std::string file_name = dir->d_name;
|
||||
std::size_t found = file_name.find_last_of(".");
|
||||
if (found == std::string::npos) {
|
||||
continue;
|
||||
}
|
||||
std::string overflow_time = file_name.substr(found + 1);
|
||||
if (stod(overflow_time) <= last_overflow_bin_) {
|
||||
MS_LOG(INFO) << "File already processed " << file_name;
|
||||
continue;
|
||||
}
|
||||
bin_list.push_back(stod(overflow_time));
|
||||
std::fstream infile;
|
||||
infile.open(file_path.c_str(), std::ios::binary | std::ios::in);
|
||||
if (!infile.is_open()) {
|
||||
MS_LOG(ERROR) << "Failed to open overflow bin file " << file_name;
|
||||
continue;
|
||||
}
|
||||
infile.seekg(313, std::ios::beg);
|
||||
std::vector<char> buffer;
|
||||
buffer.resize(BUF_SIZ);
|
||||
infile.read(buffer.data(), BUF_SIZ);
|
||||
uint64_t stream_id = BytestoInt64(std::vector<char>(buffer.begin() + 8, buffer.end()));
|
||||
uint64_t task_id = BytestoInt64(std::vector<char>(buffer.begin() + 16, buffer.end()));
|
||||
MS_LOG(INFO) << "Overflow stream_id " << stream_id << ", task_id " << task_id << ".";
|
||||
auto op = debugger_->stream_task_to_opname_.find(std::make_pair(stream_id, task_id));
|
||||
if (op != debugger_->stream_task_to_opname_.end()) {
|
||||
MS_LOG(ERROR) << "Overflow detected on node " << op->second << std::endl;
|
||||
op_names.push_back(op->second);
|
||||
} else {
|
||||
MS_LOG(INFO) << "No overflow is detected " << std::endl;
|
||||
}
|
||||
infile.close();
|
||||
}
|
||||
std::string overflow_time = file_name.substr(found + 1);
|
||||
if (stod(overflow_time) <= last_overflow_bin_) {
|
||||
MS_LOG(INFO) << "File already processed " << file_name;
|
||||
continue;
|
||||
}
|
||||
bin_list.push_back(stod(overflow_time));
|
||||
std::fstream infile;
|
||||
infile.open(file_path.c_str(), std::ios::binary | std::ios::in);
|
||||
if (!infile.is_open()) {
|
||||
MS_LOG(ERROR) << "Failed to open overflow bin file " << file_name;
|
||||
continue;
|
||||
}
|
||||
infile.seekg(313, std::ios::beg);
|
||||
std::vector<char> buffer;
|
||||
buffer.resize(BUF_SIZ);
|
||||
infile.read(buffer.data(), BUF_SIZ);
|
||||
uint64_t stream_id = BytestoInt64(std::vector<char>(buffer.begin() + 8, buffer.end()));
|
||||
uint64_t task_id = BytestoInt64(std::vector<char>(buffer.begin() + 16, buffer.end()));
|
||||
MS_LOG(INFO) << "Overflow stream_id " << stream_id << ", task_id " << task_id << ".";
|
||||
auto op = debugger_->stream_task_to_opname_.find(std::make_pair(stream_id, task_id));
|
||||
if (op != debugger_->stream_task_to_opname_.end()) {
|
||||
MS_LOG(ERROR) << "Overflow detected on node " << op->second << std::endl;
|
||||
op_names.push_back(op->second);
|
||||
} else {
|
||||
MS_LOG(INFO) << "No overflow is detected " << std::endl;
|
||||
}
|
||||
infile.close();
|
||||
}
|
||||
} else {
|
||||
MS_LOG(INFO) << "OverFlow bin directory does not exist!";
|
||||
}
|
||||
} else {
|
||||
MS_LOG(INFO) << "OverFlow bin directory does not exist!";
|
||||
closedir(d);
|
||||
}
|
||||
closedir(d);
|
||||
|
||||
if (!op_names.empty()) {
|
||||
MS_LOG(ERROR) << "These operation overflows are detected " << op_names;
|
||||
|
|
|
@ -156,6 +156,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
|
|||
// read env variable for grpc client
|
||||
void EnableDebugger();
|
||||
|
||||
void SetOpOverflowBinPath(uint32_t graph_id);
|
||||
|
||||
// check if dump using debugger backend is enabled
|
||||
bool CheckDebuggerDumpEnabled();
|
||||
|
||||
|
@ -232,7 +234,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
|
|||
std::mutex access_lock_;
|
||||
std::map<std::pair<uint32_t, uint32_t>, std::string> stream_task_to_opname_;
|
||||
double last_overflow_bin_;
|
||||
std::string overflow_bin_path_;
|
||||
std::map<uint32_t, std::string> overflow_bin_path_;
|
||||
// flag to keep track of the very first suspension of debugger
|
||||
bool initial_suspend_;
|
||||
std::list<GraphProto> graph_proto_list_;
|
||||
|
|
Loading…
Reference in New Issue