From c85d65c0ef7ddc28b8d515e1abe517236f6375ae Mon Sep 17 00:00:00 2001 From: John Tzanakakis Date: Fri, 25 Jun 2021 18:49:55 -0400 Subject: [PATCH] use -1 iteration for init state and fix missing tensor read outputs --- mindspore/ccsrc/debug/debug_services.cc | 19 ++++++++++++++++--- mindspore/ccsrc/debug/debug_services.h | 2 ++ .../debugger/offline_debug/dbg_services.cc | 4 +++- mindspore/offline_debug/dbg_services.py | 3 +++ .../offline_debug/mi_validator_helpers.py | 4 ++++ 5 files changed, 28 insertions(+), 4 deletions(-) diff --git a/mindspore/ccsrc/debug/debug_services.cc b/mindspore/ccsrc/debug/debug_services.cc index ce71d83f7bc..fa2c1720ced 100644 --- a/mindspore/ccsrc/debug/debug_services.cc +++ b/mindspore/ccsrc/debug/debug_services.cc @@ -529,7 +529,7 @@ void DebugServices::ConvertReadTensors(std::vector backend_name, st std::string prefix_dump_file_name = dump_style_kernel_name; std::string specific_dump_dir = dump_dir + "/rank_" + std::to_string(device_id[i]) + "/" + net_name + "/" + - std::to_string(root_graph_id[i]) + "/" + std::to_string(iteration[i]); + std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]); // search files in dir for the one that meets the filename prefix and read the file into memory DIR *d; @@ -702,7 +702,7 @@ void DebugServices::ReadDumpedTensor(std::vector backend_name, std: SetPrefixToCheck(&prefix_dump_file_name, &dump_style_kernel_name, slot[i], is_output[i]); std::string specific_dump_dir = dump_dir + "/rank_" + std::to_string(device_id[i]) + "/" + net_name + "/" + - std::to_string(root_graph_id[i]) + "/" + std::to_string(iteration[i]); + std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]); // search files in dir for the one that meets the filename prefix and read the file into memory std::vector *buffer = NULL; @@ -741,6 +741,8 @@ void DebugServices::ReadDumpedTensor(std::vector backend_name, std: type_name, shape, buffer, result_list); } } else { + AddToTensorData(backend_name[i], 
slot[i], iteration[i], device_id[i], root_graph_id[i], is_output[i], 0, + type_name, shape, buffer, result_list); MS_LOG(INFO) << "directory does not exist!"; } closedir(d); @@ -821,7 +823,7 @@ std::vector> DebugServices::ReadNeededDumpedTensors( std::vector> proto_to_dump; std::string specific_dump_dir = dump_dir + "/rank_" + std::to_string(device_id) + "/" + net_name + "/" + - std::to_string(root_graph_id) + "/" + std::to_string(iteration); + std::to_string(root_graph_id) + "/" + IterationString(iteration); // convert node names to dump style for (auto node : wp_nodes) { @@ -890,6 +892,17 @@ std::vector> DebugServices::ReadNeededDumpedTensors( return tensor_list; } + +std::string DebugServices::IterationString(unsigned int iteration) { + std::string iteration_string; + bool init_dbg_suspend = (iteration == UINT_MAX); + if (init_dbg_suspend) { + iteration_string = "init"; + } else { + iteration_string = std::to_string(iteration); + } + return iteration_string; +} #endif void DebugServices::ReadNodesTensors(const std::vector &name, std::vector *const ret_name, diff --git a/mindspore/ccsrc/debug/debug_services.h b/mindspore/ccsrc/debug/debug_services.h index aee6e40ef73..7b669adc772 100644 --- a/mindspore/ccsrc/debug/debug_services.h +++ b/mindspore/ccsrc/debug/debug_services.h @@ -259,6 +259,8 @@ class DebugServices { std::vector> *tensor_list); std::string GetStrippedFilename(const std::string &file_name); + + std::string IterationString(unsigned int iteration); #endif void ReadNodesTensors(const std::vector &name, std::vector *ret_name, std::vector *data_ptr, std::vector *data_size, diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.cc b/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.cc index 4cd9f2a40b5..8cc08fb8e25 100644 --- a/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.cc +++ b/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.cc @@ -176,10 +176,12 @@ std::vector DbgServices::CheckWatchpoints(unsigned int 
iterati std::vector> tensor_list; std::vector file_paths; + const bool init_dbg_suspend = (iteration == UINT_MAX); + tensor_list = debug_services->ReadNeededDumpedTensors(iteration, &file_paths); debug_services->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, &parameters, &error_codes, overflow_ops, - file_paths, &tensor_list, false, true, true, &rank_id, &root_graph_id); + file_paths, &tensor_list, init_dbg_suspend, true, true, &rank_id, &root_graph_id); std::vector hits; for (unsigned int i = 0; i < name.size(); i++) { diff --git a/mindspore/offline_debug/dbg_services.py b/mindspore/offline_debug/dbg_services.py index f079d7e8861..c2fda38252d 100644 --- a/mindspore/offline_debug/dbg_services.py +++ b/mindspore/offline_debug/dbg_services.py @@ -18,6 +18,7 @@ The module DbgServices provides offline debugger APIs. import mindspore._mindspore_offline_debug as cds from mindspore.offline_debug.mi_validators import check_init, check_initialize, check_add_watchpoint, check_remove_watchpoint, check_check_watchpoints, check_read_tensors, check_initialize_done, check_tensor_info_init, check_tensor_data_init, check_watchpoint_hit_init, check_parameter_init +from mindspore.offline_debug.mi_validator_helpers import replace_minus_one def get_version(): @@ -212,6 +213,7 @@ class DbgServices(): """ log("in Python CheckWatchpoints iteration ", iteration) + iteration = replace_minus_one(iteration) watchpoint_list = self.dbg_instance.CheckWatchpoints(iteration) watchpoint_hit_list = [] for watchpoint in watchpoint_list: @@ -298,6 +300,7 @@ class TensorInfo(): @check_tensor_info_init def __init__(self, node_name, slot, iteration, rank_id, root_graph_id, is_output=True): + iteration = replace_minus_one(iteration) self.instance = cds.tensor_info(node_name, slot, iteration, rank_id, root_graph_id, is_output) @property diff --git a/mindspore/offline_debug/mi_validator_helpers.py b/mindspore/offline_debug/mi_validator_helpers.py index 643626fbbe0..675c3732d7c 100644 ---
a/mindspore/offline_debug/mi_validator_helpers.py +++ b/mindspore/offline_debug/mi_validator_helpers.py @@ -126,3 +126,7 @@ def type_check_list(args, types, arg_names): arg_names = ["{0}[{1}]".format(arg_names, i) for i in range(len(args))] for arg, arg_name in zip(args, arg_names): type_check(arg, types, arg_name) + +def replace_minus_one(value): + """ replace -1 with a default value """ + return value if value != -1 else UINT32_MAX