!18897 use -1 iteration for init state and fix missing tensor read outputs

Merge pull request !18897 from john_tzanakakis/jt_bug_fixes
This commit is contained in:
i-robot 2021-06-26 02:53:08 +00:00 committed by Gitee
commit 007707904e
5 changed files with 28 additions and 4 deletions

View File

@ -529,7 +529,7 @@ void DebugServices::ConvertReadTensors(std::vector<std::string> backend_name, st
std::string prefix_dump_file_name = dump_style_kernel_name; std::string prefix_dump_file_name = dump_style_kernel_name;
std::string specific_dump_dir = dump_dir + "/rank_" + std::to_string(device_id[i]) + "/" + net_name + "/" + std::string specific_dump_dir = dump_dir + "/rank_" + std::to_string(device_id[i]) + "/" + net_name + "/" +
std::to_string(root_graph_id[i]) + "/" + std::to_string(iteration[i]); std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
// search files in dir for the one that meets the filename prefix and read the file into memory // search files in dir for the one that meets the filename prefix and read the file into memory
DIR *d; DIR *d;
@ -702,7 +702,7 @@ void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std:
SetPrefixToCheck(&prefix_dump_file_name, &dump_style_kernel_name, slot[i], is_output[i]); SetPrefixToCheck(&prefix_dump_file_name, &dump_style_kernel_name, slot[i], is_output[i]);
std::string specific_dump_dir = dump_dir + "/rank_" + std::to_string(device_id[i]) + "/" + net_name + "/" + std::string specific_dump_dir = dump_dir + "/rank_" + std::to_string(device_id[i]) + "/" + net_name + "/" +
std::to_string(root_graph_id[i]) + "/" + std::to_string(iteration[i]); std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
// search files in dir for the one that meets the filename prefix and read the file into memory // search files in dir for the one that meets the filename prefix and read the file into memory
std::vector<char> *buffer = NULL; std::vector<char> *buffer = NULL;
@ -741,6 +741,8 @@ void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std:
type_name, shape, buffer, result_list); type_name, shape, buffer, result_list);
} }
} else { } else {
AddToTensorData(backend_name[i], slot[i], iteration[i], device_id[i], root_graph_id[i], is_output[i], 0,
type_name, shape, buffer, result_list);
MS_LOG(INFO) << "directory does not exist!"; MS_LOG(INFO) << "directory does not exist!";
} }
closedir(d); closedir(d);
@ -821,7 +823,7 @@ std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
std::vector<std::tuple<std::string, std::string>> proto_to_dump; std::vector<std::tuple<std::string, std::string>> proto_to_dump;
std::string specific_dump_dir = dump_dir + "/rank_" + std::to_string(device_id) + "/" + net_name + "/" + std::string specific_dump_dir = dump_dir + "/rank_" + std::to_string(device_id) + "/" + net_name + "/" +
std::to_string(root_graph_id) + "/" + std::to_string(iteration); std::to_string(root_graph_id) + "/" + IterationString(iteration);
// convert node names to dump style // convert node names to dump style
for (auto node : wp_nodes) { for (auto node : wp_nodes) {
@ -890,6 +892,17 @@ std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
return tensor_list; return tensor_list;
} }
// Builds the text used for the iteration component of a dump directory path.
//
// @param iteration  Iteration number requested by the caller.
// @return "init" when `iteration` equals UINT_MAX, otherwise the decimal
//         representation of `iteration`.
std::string DebugServices::IterationString(unsigned int iteration) {
  // UINT_MAX is treated as a sentinel rather than a real iteration count.
  // NOTE(review): presumably this is what the Python layer passes for an
  // iteration of -1 (the initial state) - confirm against the caller.
  if (iteration == UINT_MAX) {
    return "init";
  }
  return std::to_string(iteration);
}
#endif #endif
void DebugServices::ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *const ret_name, void DebugServices::ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *const ret_name,

View File

@ -259,6 +259,8 @@ class DebugServices {
std::vector<std::shared_ptr<TensorData>> *tensor_list); std::vector<std::shared_ptr<TensorData>> *tensor_list);
std::string GetStrippedFilename(const std::string &file_name); std::string GetStrippedFilename(const std::string &file_name);
std::string IterationString(unsigned int iteration);
#endif #endif
void ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *ret_name, void ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *ret_name,
std::vector<char *> *data_ptr, std::vector<ssize_t> *data_size, std::vector<char *> *data_ptr, std::vector<ssize_t> *data_size,

View File

@ -176,10 +176,12 @@ std::vector<watchpoint_hit_t> DbgServices::CheckWatchpoints(unsigned int iterati
std::vector<std::shared_ptr<TensorData>> tensor_list; std::vector<std::shared_ptr<TensorData>> tensor_list;
std::vector<std::string> file_paths; std::vector<std::string> file_paths;
const bool init_dbg_suspend = (iteration == UINT_MAX);
tensor_list = debug_services->ReadNeededDumpedTensors(iteration, &file_paths); tensor_list = debug_services->ReadNeededDumpedTensors(iteration, &file_paths);
debug_services->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, &parameters, &error_codes, overflow_ops, debug_services->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, &parameters, &error_codes, overflow_ops,
file_paths, &tensor_list, false, true, true, &rank_id, &root_graph_id); file_paths, &tensor_list, init_dbg_suspend, true, true, &rank_id, &root_graph_id);
std::vector<watchpoint_hit_t> hits; std::vector<watchpoint_hit_t> hits;
for (unsigned int i = 0; i < name.size(); i++) { for (unsigned int i = 0; i < name.size(); i++) {

View File

@ -18,6 +18,7 @@ The module DbgServices provides offline debugger APIs.
import mindspore._mindspore_offline_debug as cds import mindspore._mindspore_offline_debug as cds
from mindspore.offline_debug.mi_validators import check_init, check_initialize, check_add_watchpoint, check_remove_watchpoint, check_check_watchpoints, check_read_tensors, check_initialize_done, check_tensor_info_init, check_tensor_data_init, check_watchpoint_hit_init, check_parameter_init from mindspore.offline_debug.mi_validators import check_init, check_initialize, check_add_watchpoint, check_remove_watchpoint, check_check_watchpoints, check_read_tensors, check_initialize_done, check_tensor_info_init, check_tensor_data_init, check_watchpoint_hit_init, check_parameter_init
from mindspore.offline_debug.mi_validator_helpers import replace_minus_one
def get_version(): def get_version():
@ -212,6 +213,7 @@ class DbgServices():
""" """
log("in Python CheckWatchpoints iteration ", iteration) log("in Python CheckWatchpoints iteration ", iteration)
iteration = replace_minus_one(iteration)
watchpoint_list = self.dbg_instance.CheckWatchpoints(iteration) watchpoint_list = self.dbg_instance.CheckWatchpoints(iteration)
watchpoint_hit_list = [] watchpoint_hit_list = []
for watchpoint in watchpoint_list: for watchpoint in watchpoint_list:
@ -298,6 +300,7 @@ class TensorInfo():
@check_tensor_info_init @check_tensor_info_init
def __init__(self, node_name, slot, iteration, rank_id, root_graph_id, is_output=True): def __init__(self, node_name, slot, iteration, rank_id, root_graph_id, is_output=True):
iteration = replace_minus_one(iteration)
self.instance = cds.tensor_info(node_name, slot, iteration, rank_id, root_graph_id, is_output) self.instance = cds.tensor_info(node_name, slot, iteration, rank_id, root_graph_id, is_output)
@property @property

View File

@ -126,3 +126,7 @@ def type_check_list(args, types, arg_names):
arg_names = ["{0}[{1}]".format(arg_names, i) for i in range(len(args))] arg_names = ["{0}[{1}]".format(arg_names, i) for i in range(len(args))]
for arg, arg_name in zip(args, arg_names): for arg, arg_name in zip(args, arg_names):
type_check(arg, types, arg_name) type_check(arg, types, arg_name)
def replace_minus_one(value):
    """
    Replace the placeholder -1 with UINT32_MAX.

    Args:
        value: the value to normalise.

    Returns:
        `value` unchanged when it is not equal to -1, otherwise UINT32_MAX.
    """
    if value != -1:
        return value
    return UINT32_MAX