!18897 use -1 iteration for init state and fix missing tensor read outputs
Merge pull request !18897 from john_tzanakakis/jt_bug_fixes
This commit is contained in:
commit
007707904e
|
@ -529,7 +529,7 @@ void DebugServices::ConvertReadTensors(std::vector<std::string> backend_name, st
|
||||||
std::string prefix_dump_file_name = dump_style_kernel_name;
|
std::string prefix_dump_file_name = dump_style_kernel_name;
|
||||||
|
|
||||||
std::string specific_dump_dir = dump_dir + "/rank_" + std::to_string(device_id[i]) + "/" + net_name + "/" +
|
std::string specific_dump_dir = dump_dir + "/rank_" + std::to_string(device_id[i]) + "/" + net_name + "/" +
|
||||||
std::to_string(root_graph_id[i]) + "/" + std::to_string(iteration[i]);
|
std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
|
||||||
|
|
||||||
// search files in dir for the one that meets the filename prefix and read the file into memory
|
// search files in dir for the one that meets the filename prefix and read the file into memory
|
||||||
DIR *d;
|
DIR *d;
|
||||||
|
@ -702,7 +702,7 @@ void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std:
|
||||||
SetPrefixToCheck(&prefix_dump_file_name, &dump_style_kernel_name, slot[i], is_output[i]);
|
SetPrefixToCheck(&prefix_dump_file_name, &dump_style_kernel_name, slot[i], is_output[i]);
|
||||||
|
|
||||||
std::string specific_dump_dir = dump_dir + "/rank_" + std::to_string(device_id[i]) + "/" + net_name + "/" +
|
std::string specific_dump_dir = dump_dir + "/rank_" + std::to_string(device_id[i]) + "/" + net_name + "/" +
|
||||||
std::to_string(root_graph_id[i]) + "/" + std::to_string(iteration[i]);
|
std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
|
||||||
|
|
||||||
// search files in dir for the one that meets the filename prefix and read the file into memory
|
// search files in dir for the one that meets the filename prefix and read the file into memory
|
||||||
std::vector<char> *buffer = NULL;
|
std::vector<char> *buffer = NULL;
|
||||||
|
@ -741,6 +741,8 @@ void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std:
|
||||||
type_name, shape, buffer, result_list);
|
type_name, shape, buffer, result_list);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
AddToTensorData(backend_name[i], slot[i], iteration[i], device_id[i], root_graph_id[i], is_output[i], 0,
|
||||||
|
type_name, shape, buffer, result_list);
|
||||||
MS_LOG(INFO) << "directory does not exist!";
|
MS_LOG(INFO) << "directory does not exist!";
|
||||||
}
|
}
|
||||||
closedir(d);
|
closedir(d);
|
||||||
|
@ -821,7 +823,7 @@ std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
|
||||||
std::vector<std::tuple<std::string, std::string>> proto_to_dump;
|
std::vector<std::tuple<std::string, std::string>> proto_to_dump;
|
||||||
|
|
||||||
std::string specific_dump_dir = dump_dir + "/rank_" + std::to_string(device_id) + "/" + net_name + "/" +
|
std::string specific_dump_dir = dump_dir + "/rank_" + std::to_string(device_id) + "/" + net_name + "/" +
|
||||||
std::to_string(root_graph_id) + "/" + std::to_string(iteration);
|
std::to_string(root_graph_id) + "/" + IterationString(iteration);
|
||||||
|
|
||||||
// convert node names to dump style
|
// convert node names to dump style
|
||||||
for (auto node : wp_nodes) {
|
for (auto node : wp_nodes) {
|
||||||
|
@ -890,6 +892,17 @@ std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
|
||||||
|
|
||||||
return tensor_list;
|
return tensor_list;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Converts an iteration number into the string used as the last component of a dump directory path.
// UINT_MAX is the sentinel for the initial state and yields "init"; every other value yields its
// decimal representation.
std::string DebugServices::IterationString(unsigned int iteration) {
  if (iteration != UINT_MAX) {
    return std::to_string(iteration);
  }
  return "init";
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
void DebugServices::ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *const ret_name,
|
void DebugServices::ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *const ret_name,
|
||||||
|
|
|
@ -259,6 +259,8 @@ class DebugServices {
|
||||||
std::vector<std::shared_ptr<TensorData>> *tensor_list);
|
std::vector<std::shared_ptr<TensorData>> *tensor_list);
|
||||||
|
|
||||||
std::string GetStrippedFilename(const std::string &file_name);
|
std::string GetStrippedFilename(const std::string &file_name);
|
||||||
|
|
||||||
|
std::string IterationString(unsigned int iteration);
|
||||||
#endif
|
#endif
|
||||||
void ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *ret_name,
|
void ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *ret_name,
|
||||||
std::vector<char *> *data_ptr, std::vector<ssize_t> *data_size,
|
std::vector<char *> *data_ptr, std::vector<ssize_t> *data_size,
|
||||||
|
|
|
@ -176,10 +176,12 @@ std::vector<watchpoint_hit_t> DbgServices::CheckWatchpoints(unsigned int iterati
|
||||||
std::vector<std::shared_ptr<TensorData>> tensor_list;
|
std::vector<std::shared_ptr<TensorData>> tensor_list;
|
||||||
std::vector<std::string> file_paths;
|
std::vector<std::string> file_paths;
|
||||||
|
|
||||||
|
const bool init_dbg_suspend = (iteration == UINT_MAX);
|
||||||
|
|
||||||
tensor_list = debug_services->ReadNeededDumpedTensors(iteration, &file_paths);
|
tensor_list = debug_services->ReadNeededDumpedTensors(iteration, &file_paths);
|
||||||
|
|
||||||
debug_services->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, &parameters, &error_codes, overflow_ops,
|
debug_services->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, &parameters, &error_codes, overflow_ops,
|
||||||
file_paths, &tensor_list, false, true, true, &rank_id, &root_graph_id);
|
file_paths, &tensor_list, init_dbg_suspend, true, true, &rank_id, &root_graph_id);
|
||||||
|
|
||||||
std::vector<watchpoint_hit_t> hits;
|
std::vector<watchpoint_hit_t> hits;
|
||||||
for (unsigned int i = 0; i < name.size(); i++) {
|
for (unsigned int i = 0; i < name.size(); i++) {
|
||||||
|
|
|
@ -18,6 +18,7 @@ The module DbgServices provides offline debugger APIs.
|
||||||
|
|
||||||
import mindspore._mindspore_offline_debug as cds
|
import mindspore._mindspore_offline_debug as cds
|
||||||
from mindspore.offline_debug.mi_validators import check_init, check_initialize, check_add_watchpoint, check_remove_watchpoint, check_check_watchpoints, check_read_tensors, check_initialize_done, check_tensor_info_init, check_tensor_data_init, check_watchpoint_hit_init, check_parameter_init
|
from mindspore.offline_debug.mi_validators import check_init, check_initialize, check_add_watchpoint, check_remove_watchpoint, check_check_watchpoints, check_read_tensors, check_initialize_done, check_tensor_info_init, check_tensor_data_init, check_watchpoint_hit_init, check_parameter_init
|
||||||
|
from mindspore.offline_debug.mi_validator_helpers import replace_minus_one
|
||||||
|
|
||||||
|
|
||||||
def get_version():
|
def get_version():
|
||||||
|
@ -212,6 +213,7 @@ class DbgServices():
|
||||||
"""
|
"""
|
||||||
|
|
||||||
log("in Python CheckWatchpoints iteration ", iteration)
|
log("in Python CheckWatchpoints iteration ", iteration)
|
||||||
|
iteration = replace_minus_one(iteration)
|
||||||
watchpoint_list = self.dbg_instance.CheckWatchpoints(iteration)
|
watchpoint_list = self.dbg_instance.CheckWatchpoints(iteration)
|
||||||
watchpoint_hit_list = []
|
watchpoint_hit_list = []
|
||||||
for watchpoint in watchpoint_list:
|
for watchpoint in watchpoint_list:
|
||||||
|
@ -298,6 +300,7 @@ class TensorInfo():
|
||||||
|
|
||||||
@check_tensor_info_init
|
@check_tensor_info_init
|
||||||
def __init__(self, node_name, slot, iteration, rank_id, root_graph_id, is_output=True):
|
def __init__(self, node_name, slot, iteration, rank_id, root_graph_id, is_output=True):
|
||||||
|
iteration = replace_minus_one(iteration)
|
||||||
self.instance = cds.tensor_info(node_name, slot, iteration, rank_id, root_graph_id, is_output)
|
self.instance = cds.tensor_info(node_name, slot, iteration, rank_id, root_graph_id, is_output)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|
|
@ -126,3 +126,7 @@ def type_check_list(args, types, arg_names):
|
||||||
arg_names = ["{0}[{1}]".format(arg_names, i) for i in range(len(args))]
|
arg_names = ["{0}[{1}]".format(arg_names, i) for i in range(len(args))]
|
||||||
for arg, arg_name in zip(args, arg_names):
|
for arg, arg_name in zip(args, arg_names):
|
||||||
type_check(arg, types, arg_name)
|
type_check(arg, types, arg_name)
|
||||||
|
|
||||||
|
def replace_minus_one(value):
    """Return UINT32_MAX when value is -1; otherwise return value unchanged."""
    if value != -1:
        return value
    return UINT32_MAX
|
||||||
|
|
Loading…
Reference in New Issue