!27335 change log level for read not exist tensors

Merge pull request !27335 from yelihua/dev
This commit is contained in:
i-robot 2021-12-07 13:08:20 +00:00 committed by Gitee
commit b80ed801d5
4 changed files with 151 additions and 132 deletions

View File

@ -758,7 +758,7 @@ void DebugServices::ProcessConvertToHostFormat(const std::vector<std::string> &f
std::string real_dump_iter_dir = RealPath(dump_key);
DIR *d_handle = opendir(real_dump_iter_dir.c_str());
if (d_handle == nullptr) {
MS_LOG(ERROR) << "Directory does not exist in ConvertToHostFormat.";
MS_LOG(INFO) << "Directory does not exist in ConvertToHostFormat.";
return;
}
struct dirent *dir = nullptr;
@ -839,7 +839,7 @@ void DebugServices::ConvertReadTensors(std::vector<std::string> backend_name, st
std::string abspath = RealPath(specific_dump_dir);
DIR *d = opendir(abspath.c_str());
if (d == nullptr) {
MS_LOG(ERROR) << "Directory does not exist in ConvertReadTensors.";
MS_LOG(INFO) << "Directory does not exist in ConvertReadTensors.";
return;
}
ProcessConvertList(prefix_dump_file_name, file_format, specific_dump_dir, &dir_to_files_map, result_list);
@ -860,7 +860,7 @@ void DebugServices::ConvertWatchPointNodes(const std::vector<std::tuple<std::str
std::string abspath = RealPath(specific_dump_dir);
DIR *d = opendir(abspath.c_str());
if (d == nullptr) {
MS_LOG(ERROR) << "Directory " << specific_dump_dir.c_str() << " does not exist in ConvertWatchPointNodes.";
MS_LOG(INFO) << "Directory " << specific_dump_dir.c_str() << " does not exist in ConvertWatchPointNodes.";
return;
}
ProcessConvertList(dump_name, file_format, specific_dump_dir, &dir_to_files_map, result_list);
@ -1039,7 +1039,7 @@ void DebugServices::ReadGraphsHistory(uint32_t rank_id, uint32_t root_graph_id)
std::string file_to_check = "ms_global_execution_order_graph_" + std::to_string(root_graph_id) + ".csv";
DIR *d_handle = opendir(exec_order_path.c_str());
if (d_handle == nullptr) {
MS_LOG(ERROR) << "Directory does not exist.";
MS_LOG(ERROR) << "Execution order directory does not exist.";
return;
}
// read file and store the info
@ -1327,11 +1327,15 @@ std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
std::tuple<uint32_t, uint32_t> rank_and_graph = rank_and_graph_item.first;
uint32_t rank_id = std::get<0>(rank_and_graph);
uint32_t root_graph_id = std::get<1>(rank_and_graph);
std::vector<std::tuple<std::string, bool>> wp_nodes = rank_and_graph_item.second;
std::vector<std::tuple<std::string, std::string>> proto_to_dump;
std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(rank_id) + "/" + net_name_ + "/" +
std::to_string(root_graph_id) + "/" + IterationString(iteration);
std::string real_dump_dir = RealPath(specific_dump_dir);
if (real_dump_dir.empty()) {
MS_LOG(INFO) << "Dump dir " << specific_dump_dir << " doesn't exist. Skit it.";
continue;
}
std::vector<std::tuple<std::string, bool>> wp_nodes = rank_and_graph_item.second;
std::vector<std::tuple<std::string, std::string>> proto_to_dump;
// convert node names to dump style
for (auto node : wp_nodes) {
@ -1353,13 +1357,12 @@ std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
if (is_sync_mode_) {
// search files in dir for the one that meets the filename prefix and read the file into memory
std::string abspath = RealPath(specific_dump_dir);
ProcessTensorDataSync(proto_to_dump, abspath, specific_dump_dir, iteration, rank_id, root_graph_id, &tensor_list,
ProcessTensorDataSync(proto_to_dump, real_dump_dir, iteration, rank_id, root_graph_id, &tensor_list,
error_on_no_value);
} else {
// convert all files in proto_to_dump to npy and add to pool of async file names
ConvertWatchPointNodes(proto_to_dump, specific_dump_dir, async_file_pool);
GetTensorDataInfoAsync(proto_to_dump, specific_dump_dir, iteration, rank_id, root_graph_id, *async_file_pool,
ConvertWatchPointNodes(proto_to_dump, real_dump_dir, async_file_pool);
GetTensorDataInfoAsync(proto_to_dump, real_dump_dir, iteration, rank_id, root_graph_id, *async_file_pool,
&tensor_list);
}
}
@ -1368,24 +1371,23 @@ std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
}
void DebugServices::ProcessTensorDataSync(const std::vector<std::tuple<std::string, std::string>> &proto_to_dump,
const std::string &abspath, const std::string &specific_dump_dir,
unsigned int iteration, unsigned int device_id, unsigned int root_graph_id,
const std::string &specific_dump_dir, unsigned int iteration,
unsigned int device_id, unsigned int root_graph_id,
std::vector<std::shared_ptr<TensorData>> *const tensor_list,
bool error_on_no_value) {
DIR *d = opendir(abspath.c_str());
DIR *d = opendir(specific_dump_dir.c_str());
if (d == nullptr) {
MS_LOG(INFO) << "Directory " << specific_dump_dir.c_str() << " does not exist in ReadNeededDumpedTensors.";
MS_LOG(INFO) << "Directory " << specific_dump_dir.c_str() << " does not exist in ProcessTensorDataSync.";
} else {
struct dirent *dir = nullptr;
while ((dir = readdir(d)) != nullptr) {
struct stat st;
std::string name = abspath + std::string("/") + std::string(dir->d_name);
std::string name = specific_dump_dir + std::string("/") + std::string(dir->d_name);
int ret = stat(name.c_str(), &st);
if (ret == 0 && S_ISREG(st.st_mode)) {
std::string file_name = dir->d_name;
for (auto &node : proto_to_dump) {
std::string dump_name = std::get<1>(node);
std::string stripped_file_name = GetStrippedFilename(file_name);
if (stripped_file_name.empty() || stripped_file_name.length() <= dump_name.length()) {
continue;
@ -1568,8 +1570,109 @@ std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensor(const CNod
}
#endif
// Returns the operator-overflow bin directory used by the online debugger for
// the graph currently held by the Debugger instance, terminated with '/'.
// (Per the original note, this is only called in online debugger mode.)
//
// An empty string is returned when:
//   - ONLINE_DBG_MODE is not defined at build time,
//   - the dump config path is empty,
//   - the debugger holds no current graph, or
//   - the overflow path cannot be resolved to a real path.
std::string GetOnlineOpOverflowDir() {
#ifdef ONLINE_DBG_MODE
  if (DumpJsonParser::GetInstance().path().empty()) {
    MS_LOG(INFO) << "Dump config is not set.";
    return std::string();
  }

  auto debugger_inst = Debugger::GetInstance();
  MS_EXCEPTION_IF_NULL(debugger_inst);
  auto graph_ptr = debugger_inst->GetGraphPtr();
  if (graph_ptr == nullptr) {
    return std::string();
  }

  // Ask the dump config for the overflow directory of this root graph, then
  // resolve it to a real path before handing it back.
  const std::string raw_dir = DumpJsonParser::GetInstance().GetOpOverflowBinPath(graph_ptr->root_graph_id());
  const auto resolved_dir = FileUtils::GetRealPath(raw_dir.c_str());
  if (!resolved_dir.has_value()) {
    MS_LOG(INFO) << "Get real path failed for overflow_bin_path.";
    return std::string();
  }
  return resolved_dir.value() + '/';
#else
  return std::string();
#endif
}
// Scans overflow_bin_path and appends to op_names the node names whose
// (task_id, stream_id) pair also appears on an overflow marker file.
//
// Every regular file in the directory is classified by name:
//   - names starting with "Opdebug.Node_OpDebug." are overflow markers; their
//     (task_id, stream_id) is recorded as a hit;
//   - any other name is parsed with GetAttrsFromFilename, and on success its
//     node name is remembered under its (task_id, stream_id).
// Files that cannot be opened are logged and skipped. If the directory itself
// cannot be opened, this is logged and op_names is left untouched.
//
// overflow_bin_path: directory to scan.
// op_names: output list, must not be null; matching names are appended.
void DebugServices::AddOpOverflowOpNames(const std::string overflow_bin_path, std::vector<std::string> *op_names) {
  MS_EXCEPTION_IF_NULL(op_names);
  using TaskStreamKey = std::pair<uint64_t, uint64_t>;
  std::map<TaskStreamKey, std::string> opname_by_task_stream;
  std::vector<TaskStreamKey> overflow_hits;
  const std::string overflow_file_prefix = "Opdebug.Node_OpDebug.";
  MS_LOG(INFO) << "Processing bin file path " << overflow_bin_path;

  DIR *dir_handle = opendir(overflow_bin_path.c_str());
  if (dir_handle == nullptr) {
    // Nothing was collected, so there is nothing to match below.
    MS_LOG(INFO) << "OverFlow bin directory does not exist!";
    return;
  }

  for (struct dirent *entry = readdir(dir_handle); entry != nullptr; entry = readdir(dir_handle)) {
    std::string file_name = entry->d_name;
    const std::string file_path = overflow_bin_path + "/" + file_name;

    // Only regular files are of interest; skip sub-directories and the like.
    struct stat file_stat;
    if (stat(file_path.c_str(), &file_stat) != 0 || !S_ISREG(file_stat.st_mode)) {
      continue;
    }

    // The file is opened only to confirm it is readable; the stream is
    // closed by its destructor at the end of each iteration.
    std::ifstream bin_file(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
    if (!bin_file.is_open()) {
      MS_LOG(ERROR) << "Failed to open overflow bin file " << file_name << " Errno:" << errno;
      continue;
    }

    uint64_t task_id = 0;
    uint64_t stream_id = 0;
    const bool is_overflow_marker = (file_name.rfind(overflow_file_prefix, 0) == 0);
    if (is_overflow_marker) {
      if (GetTaskIdStreamId(file_name, overflow_file_prefix, &task_id, &stream_id)) {
        MS_LOG(INFO) << "Overflow bin file " << file_name << ", task_id " << task_id << ", stream_id " << stream_id
                     << ".";
        overflow_hits.push_back(std::make_pair(task_id, stream_id));
      }
      continue;
    }

    // Regular bin file: remember which op produced this (task, stream) pair.
    std::string node_name;
    if (GetAttrsFromFilename(file_name, &node_name, &task_id, &stream_id)) {
      opname_by_task_stream[std::make_pair(task_id, stream_id)] = node_name;
    }
  }
  (void)closedir(dir_handle);

  // Report every overflow hit for which an op name is known.
  for (const auto &hit : overflow_hits) {
    const std::string &op_name = opname_by_task_stream[hit];
    if (op_name.empty()) {
      continue;
    }
    MS_LOG(INFO) << "Operation overflow detected in " << op_name;
    op_names->push_back(op_name);
  }
}
bool DebugServices::CheckOpOverflow(std::string node_name_to_find, unsigned int device_id, unsigned int root_graph_id,
unsigned int iteration) {
std::string overflow_bin_path = "";
#ifdef ONLINE_DBG_MODE
overflow_bin_path = GetOnlineOpOverflowDir();
#else
overflow_bin_path = dump_dir_ + "/rank_" + std::to_string(device_id) + "/" + net_name_ + "/" +
std::to_string(root_graph_id) + "/" + IterationString(iteration) + "/";
overflow_bin_path = RealPath(overflow_bin_path);
#endif
if (overflow_bin_path.empty()) {
MS_LOG(INFO) << "Get real path failed for overflow_bin_path.";
return false;
}
// remove kernel_graph_#
std::string op_name_find_with_path = RemoveKernelGraphPrefix(node_name_to_find);
std::replace(op_name_find_with_path.begin(), op_name_find_with_path.end(), '/', '_');
@ -1583,26 +1686,6 @@ bool DebugServices::CheckOpOverflow(std::string node_name_to_find, unsigned int
std::replace(node_name_to_find.begin(), node_name_to_find.end(), '/', '_');
std::vector<std::string> op_names;
std::string overflow_bin_path;
#ifdef ONLINE_DBG_MODE
if (DumpJsonParser::GetInstance().path().empty()) {
// Dump config is not set.
return false;
}
auto debugger = Debugger::GetInstance();
overflow_bin_path = DumpJsonParser::GetInstance().GetOpOverflowBinPath(debugger->GetGraphPtr()->root_graph_id());
auto realpath = FileUtils::GetRealPath(overflow_bin_path.c_str());
if (!realpath.has_value()) {
MS_LOG(INFO) << "Get real path failed for overflow_bin_path.";
return false;
}
overflow_bin_path = realpath.value() + '/';
#else
overflow_bin_path = dump_dir_ + "/rank_" + std::to_string(device_id) + "/" + net_name_ + "/" +
std::to_string(root_graph_id) + "/" + IterationString(iteration) + "/";
overflow_bin_path = RealPath(overflow_bin_path);
#endif
overflow_wp_lock_.lock();
@ -1612,68 +1695,7 @@ bool DebugServices::CheckOpOverflow(std::string node_name_to_find, unsigned int
MS_LOG(INFO) << "Found already computed overflows for " << overflow_bin_path;
op_names = overflow_ops_[overflow_bin_path];
} else {
std::map<std::pair<uint64_t, uint64_t>, std::string> task_stream_to_opname;
std::vector<std::pair<uint64_t, uint64_t>> task_stream_hit;
const std::string overflow_file_prefix = "Opdebug.Node_OpDebug.";
MS_LOG(INFO) << "Processing bin file path " << overflow_bin_path;
std::string abspath = RealPath(overflow_bin_path);
DIR *d = opendir(abspath.c_str());
if (d == nullptr) {
MS_LOG(ERROR) << "OverFlow bin directory does not exist!";
} else {
struct dirent *dir = nullptr;
while ((dir = readdir(d)) != nullptr) {
struct stat st;
std::string name = abspath + std::string("/") + std::string(dir->d_name);
int ret = stat(name.c_str(), &st);
if (ret == 0 && S_ISREG(st.st_mode)) {
// form fully qualified filename
std::string file_path = overflow_bin_path;
std::string file_name = dir->d_name;
(void)file_path.append(file_name);
// attempt to read the file
std::ifstream infile;
infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
if (!infile.is_open()) {
MS_LOG(ERROR) << "Failed to open overflow bin file " << file_name << " Errno:" << errno;
continue;
}
std::string node_name;
uint64_t task_id = 0;
uint64_t stream_id = 0;
// detect overflow bin file
if (file_name.rfind(overflow_file_prefix, 0) == 0) {
if (!GetTaskIdStreamId(file_name, overflow_file_prefix, &task_id, &stream_id)) {
continue;
}
MS_LOG(INFO) << "Overflow bin file " << file_name << ", task_id " << task_id << ", stream_id " << stream_id
<< ".";
task_stream_hit.push_back(std::make_pair(task_id, stream_id));
} else {
// regular bin file
bool success_parse = GetAttrsFromFilename(file_name, &node_name, &task_id, &stream_id);
if (success_parse) {
task_stream_to_opname[std::make_pair(task_id, stream_id)] = node_name;
}
}
infile.close();
}
}
(void)closedir(d);
}
// find the op_names with an overflow hit
for (auto &task_stream : task_stream_hit) {
auto op_name = task_stream_to_opname[task_stream];
if (!op_name.empty()) {
MS_LOG(INFO) << "Operation overflow detected in " << op_name;
op_names.push_back(op_name);
}
}
AddOpOverflowOpNames(overflow_bin_path, &op_names);
overflow_ops_[overflow_bin_path] = op_names;
}

View File

@ -277,6 +277,8 @@ class DebugServices {
std::vector<unsigned int> *device_id, std::vector<unsigned int> *root_graph_id,
bool error_on_no_value = false);
void AddOpOverflowOpNames(const std::string overflow_bin_path, std::vector<std::string> *op_names);
void CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot, std::vector<int> *condition,
std::vector<unsigned int> *const watchpoint_id,
std::vector<std::vector<parameter_t>> *parameters, std::vector<int32_t> *error_code,
@ -333,9 +335,8 @@ class DebugServices {
std::vector<std::shared_ptr<TensorData>> *const result_list, bool *no_mem_to_read = nullptr);
void ProcessTensorDataSync(const std::vector<std::tuple<std::string, std::string>> &proto_to_dump,
const std::string &abspath, const std::string &specific_dump_dir, unsigned int iteration,
unsigned int device_id, unsigned int root_graph_id,
std::vector<std::shared_ptr<TensorData>> *const tensor_list,
const std::string &specific_dump_dir, unsigned int iteration, unsigned int device_id,
unsigned int root_graph_id, std::vector<std::shared_ptr<TensorData>> *const tensor_list,
bool error_on_no_value = false);
void ReadFileAndAddToTensor(const bool found, const std::vector<std::string> &matched_paths,

View File

@ -33,31 +33,29 @@ def build_dump_structure(path, tensor_name_list, tensor_list, net_name, tensor_i
rank_id = str(tensor_info.rank_id)
root_graph_id = str(tensor_info.root_graph_id)
is_output = str(tensor_info.is_output)
if rank_id not in ranks_run_history:
graphs_run_history = ranks_run_history.get(rank_id)
if graphs_run_history is None:
graphs_run_history = {}
ranks_run_history[rank_id] = graphs_run_history
if root_graph_id not in ranks_run_history[rank_id]:
iter_list = []
iter_list.append(iteration)
graphs_run_history[root_graph_id] = iter_list
elif iteration not in graphs_run_history[root_graph_id]:
if root_graph_id not in graphs_run_history:
graphs_run_history[root_graph_id] = [iteration]
if iteration not in graphs_run_history[root_graph_id]:
bisect.insort(graphs_run_history[root_graph_id], iteration)
path = os.path.join(temp_dir, "rank_" + rank_id, net_name, root_graph_id, iteration)
os.makedirs(path, exist_ok=True)
if is_output == "True":
file = tempfile.mkstemp(prefix=tensor_name, suffix=".output." + slot +
".DefaultFormat.npy", dir=path)
file_name = f'{tensor_name}.output.{slot}.DefaultFormat.npy'
else:
file = tempfile.mkstemp(prefix=tensor_name, suffix=".input." + slot +
".DefaultFormat.npy", dir=path)
full_path = file[1]
file_name = f'{tensor_name}.input.{slot}.DefaultFormat.npy'
full_path = os.path.join(path, file_name)
np.save(full_path, tensor)
build_global_execution_order(temp_dir, ranks_run_history)
return temp_dir
def build_global_execution_order(path, ranks_run_history):
"""Build global execution order."""
for rank_id in ranks_run_history.keys():
exec_order_path = path + "/rank_" + rank_id + "/" + "execution_order"
os.makedirs(exec_order_path, exist_ok=True)

View File

@ -26,38 +26,36 @@ import numpy as np
def build_dump_structure(tensor_name_list, tensor_list, net_name, tensor_info_list):
"""Build dump file structure from tensor_list."""
ranks_run_history = {}
temp_dir = tempfile.mkdtemp(prefix=net_name, dir="./")
for x, _ in enumerate(tensor_info_list):
slot = str(tensor_info_list[x].slot)
iteration = str(tensor_info_list[x].iteration)
rank_id = str(tensor_info_list[x].rank_id)
root_graph_id = str(tensor_info_list[x].root_graph_id)
is_output = str(tensor_info_list[x].is_output)
if rank_id not in ranks_run_history:
temp_dir = tempfile.mkdtemp(prefix=net_name, dir="/tmp")
for tensor_name, tensor, tensor_info in zip(tensor_name_list, tensor_list, tensor_info_list):
slot = str(tensor_info.slot)
iteration = str(tensor_info.iteration)
rank_id = str(tensor_info.rank_id)
root_graph_id = str(tensor_info.root_graph_id)
is_output = str(tensor_info.is_output)
graphs_run_history = ranks_run_history.get(rank_id)
if graphs_run_history is None:
graphs_run_history = {}
ranks_run_history[rank_id] = graphs_run_history
if root_graph_id not in ranks_run_history[rank_id]:
iter_list = []
iter_list.append(iteration)
graphs_run_history[root_graph_id] = iter_list
elif iteration not in graphs_run_history[root_graph_id]:
if root_graph_id not in graphs_run_history:
graphs_run_history[root_graph_id] = [iteration]
if iteration not in graphs_run_history[root_graph_id]:
bisect.insort(graphs_run_history[root_graph_id], iteration)
path = os.path.join(temp_dir, "rank_" + rank_id, net_name, root_graph_id, iteration)
os.makedirs(path, exist_ok=True)
if is_output == "True":
file = tempfile.mkstemp(prefix=tensor_name_list[x], suffix=".output." + slot +
".DefaultFormat.npy", dir=path)
file_name = f'{tensor_name}.output.{slot}.DefaultFormat.npy'
else:
file = tempfile.mkstemp(prefix=tensor_name_list[x], suffix=".input." + slot +
".DefaultFormat.npy", dir=path)
full_path = file[1]
np.save(full_path, tensor_list[x])
file_name = f'{tensor_name}.input.{slot}.DefaultFormat.npy'
full_path = os.path.join(path, file_name)
np.save(full_path, tensor)
build_global_execution_order(temp_dir, ranks_run_history)
return temp_dir
def build_global_execution_order(path, ranks_run_history):
"""Build global execution order."""
for rank_id in ranks_run_history.keys():
exec_order_path = path + "/rank_" + rank_id + "/" + "execution_order"
os.makedirs(exec_order_path, exist_ok=True)