use -1 iteration for init state and fix missing tensor read outputs

2021-06-25 18:49:55 -04:00 · 2021-06-25 18:49:55 -04:00 · c85d65c0ef
parent 29e42efd98
commit c85d65c0ef
5 changed files with 28 additions and 4 deletions
--- a/mindspore/ccsrc/debug/debug_services.cc
+++ b/mindspore/ccsrc/debug/debug_services.cc
@ -529,7 +529,7 @@ void DebugServices::ConvertReadTensors(std::vector<std::string> backend_name, st
    std::string prefix_dump_file_name = dump_style_kernel_name;

    std::string specific_dump_dir = dump_dir + "/rank_" + std::to_string(device_id[i]) + "/" + net_name + "/" +
-                                    std::to_string(root_graph_id[i]) + "/" + std::to_string(iteration[i]);
+                                    std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);

    // search files in dir for the one that meets the filename prefix and read the file into memory
    DIR *d;
@ -702,7 +702,7 @@ void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std:
    SetPrefixToCheck(&prefix_dump_file_name, &dump_style_kernel_name, slot[i], is_output[i]);

    std::string specific_dump_dir = dump_dir + "/rank_" + std::to_string(device_id[i]) + "/" + net_name + "/" +
-                                    std::to_string(root_graph_id[i]) + "/" + std::to_string(iteration[i]);
+                                    std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);

    // search files in dir for the one that meets the filename prefix and read the file into memory
    std::vector<char> *buffer = NULL;
@ -741,6 +741,8 @@ void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std:
                          type_name, shape, buffer, result_list);
        }
      } else {
+        AddToTensorData(backend_name[i], slot[i], iteration[i], device_id[i], root_graph_id[i], is_output[i], 0,
+                        type_name, shape, buffer, result_list);
        MS_LOG(INFO) << "directory does not exist!";
      }
      closedir(d);
@ -821,7 +823,7 @@ std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
    std::vector<std::tuple<std::string, std::string>> proto_to_dump;

    std::string specific_dump_dir = dump_dir + "/rank_" + std::to_string(device_id) + "/" + net_name + "/" +
-                                    std::to_string(root_graph_id) + "/" + std::to_string(iteration);
+                                    std::to_string(root_graph_id) + "/" + IterationString(iteration);

    // convert node names to dump style
    for (auto node : wp_nodes) {
@ -890,6 +892,17 @@ std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(

  return tensor_list;
 }
+
+// Returns the dump sub-directory name for an iteration: "init" for the UINT_MAX sentinel,
+// otherwise the decimal text of the iteration number.
+std::string DebugServices::IterationString(unsigned int iteration) {
+  if (iteration == UINT_MAX) {
+    // UINT_MAX is the sentinel for the init state (the Python wrapper maps iteration -1 to UINT32_MAX).
+    return "init";
+  }
+
+  return std::to_string(iteration);
+}
 #endif

 void DebugServices::ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *const ret_name,
--- a/mindspore/ccsrc/debug/debug_services.h
+++ b/mindspore/ccsrc/debug/debug_services.h
@ -259,6 +259,8 @@ class DebugServices {
                              std::vector<std::shared_ptr<TensorData>> *tensor_list);

  std::string GetStrippedFilename(const std::string &file_name);
+
+  std::string IterationString(unsigned int iteration);
 #endif
  void ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *ret_name,
                        std::vector<char *> *data_ptr, std::vector<ssize_t> *data_size,
--- a/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.cc
+++ b/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.cc
@ -176,10 +176,12 @@ std::vector<watchpoint_hit_t> DbgServices::CheckWatchpoints(unsigned int iterati
  std::vector<std::shared_ptr<TensorData>> tensor_list;
  std::vector<std::string> file_paths;

+  const bool init_dbg_suspend = (iteration == UINT_MAX);
+
  tensor_list = debug_services->ReadNeededDumpedTensors(iteration, &file_paths);

  debug_services->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, &parameters, &error_codes, overflow_ops,
-                                   file_paths, &tensor_list, false, true, true, &rank_id, &root_graph_id);
+                                   file_paths, &tensor_list, init_dbg_suspend, true, true, &rank_id, &root_graph_id);

  std::vector<watchpoint_hit_t> hits;
  for (unsigned int i = 0; i < name.size(); i++) {
--- a/mindspore/offline_debug/dbg_services.py
+++ b/mindspore/offline_debug/dbg_services.py
@ -18,6 +18,7 @@ The module DbgServices provides offline debugger APIs.

 import mindspore._mindspore_offline_debug as cds
 from mindspore.offline_debug.mi_validators import check_init, check_initialize, check_add_watchpoint, check_remove_watchpoint, check_check_watchpoints, check_read_tensors, check_initialize_done, check_tensor_info_init, check_tensor_data_init, check_watchpoint_hit_init, check_parameter_init
+from mindspore.offline_debug.mi_validator_helpers import replace_minus_one


 def get_version():
@ -212,6 +213,7 @@ class DbgServices():
        """

        log("in Python CheckWatchpoints iteration ", iteration)
+        iteration = replace_minus_one(iteration)
        watchpoint_list = self.dbg_instance.CheckWatchpoints(iteration)
        watchpoint_hit_list = []
        for watchpoint in watchpoint_list:
@ -298,6 +300,7 @@ class TensorInfo():

    @check_tensor_info_init
    def __init__(self, node_name, slot, iteration, rank_id, root_graph_id, is_output=True):
+        iteration = replace_minus_one(iteration)  # a requested iteration of -1 becomes UINT32_MAX
        self.instance = cds.tensor_info(node_name, slot, iteration, rank_id, root_graph_id, is_output)

    @property
--- a/mindspore/offline_debug/mi_validator_helpers.py
+++ b/mindspore/offline_debug/mi_validator_helpers.py
@ -126,3 +126,7 @@ def type_check_list(args, types, arg_names):
        arg_names = ["{0}[{1}]".format(arg_names, i) for i in range(len(args))]
    for arg, arg_name in zip(args, arg_names):
        type_check(arg, types, arg_name)
+
+def replace_minus_one(value):
+    """Map the sentinel -1 to UINT32_MAX; return any other value unchanged."""
+    return UINT32_MAX if value == -1 else value