!22333 Add tensor base and stat to online debugger

Merge pull request !22333 from parastooashtari/tensor_level_info_online

Commit 158536b9e2

debug_services.cc

@@ -193,18 +193,26 @@ void DebugServices::AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end,
     auto wp = std::get<1>(w_table_item);
-    // check ONLY init conditions on initial suspended state.
-    if (init_dbg_suspend && (wp.condition.type != INIT)) continue;
+    // skip other conditions on initial suspended state
+    if (init_dbg_suspend && (wp.condition.type != INIT)) {
+      continue;
+    }
     // skip init condition if not init suspend
-    if ((wp.condition.type == INIT) && !init_dbg_suspend) continue;
+    if ((wp.condition.type == INIT) && !init_dbg_suspend) {
+      continue;
+    }
     // check change conditions only on step end.
-    if (wp.change_condition() && !step_end) continue;
+    if (wp.change_condition() && !step_end) {
+      continue;
+    }
     // if recheck, ignore the cache results and reanalyze everything.
     // if not a recheck, check only unanalyzed tensors
     if (!recheck) {
       wp_lock_.lock();
       bool wp_cache_hit = wp_id_cache_[tensor_name].count(wp.id);
       wp_lock_.unlock();
-      if (wp_cache_hit) continue;
+      if (wp_cache_hit) {
+        continue;
+      }
     }
     std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot);
     if (!found.empty()) {

@@ -258,7 +266,9 @@ void DebugServices::CheckWatchpointsForTensor(
     const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
     const auto tensor_slot = std::to_string(tensor->GetSlot());
     // no elements to analyze
-    if (tensor->GetByteSize() == 0) continue;
+    if (tensor->GetByteSize() == 0) {
+      continue;
+    }
     (*chunk_tensor_byte_size)[chunk_id] += tensor->GetByteSize();
     int tensor_dtype = tensor->GetType();
     std::vector<watchpoint_t> watchpoints_to_check;

@@ -269,7 +279,9 @@ void DebugServices::CheckWatchpointsForTensor(
     AddWatchPointsToCheck(init_dbg_suspend, step_end, recheck, tensor_name, tensor_name_no_slot,
                           &previous_iter_tensor_needed, &qualified_tensor_name, &watchpoints_to_check);
     // no wp set on current tensor
-    if (watchpoints_to_check.empty()) continue;
+    if (watchpoints_to_check.empty()) {
+      continue;
+    }
     uint32_t num_elements = tensor->GetNumElements();
 #ifdef OFFLINE_DBG_MODE
     void *previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed);

@@ -1032,6 +1044,15 @@ void DebugServices::ReadNodesTensors(const std::vector<std::string> &name, std::
   }
 }
 
+void DebugServices::SearchNodesTensors(const std::vector<std::string> &name,
+                                       std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> *result_list) {
+  if (!result_list) {
+    MS_LOG(DEBUG) << "result_list is nullptr.";
+    return;
+  }
+  tensor_loader_->SearchTensors(name, result_list);
+}
+
 #ifdef ONLINE_DBG_MODE
 bool DebugServices::IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel) const {
   bool ret = false;

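Note: the new SearchNodesTensors is a null-checked wrapper over tensor_loader_->SearchTensors. A minimal caller sketch, assuming a live DebugServices instance (the tensor name below is illustrative; the TensorData accessors are the ones this change set already uses elsewhere):

#include <memory>
#include <string>
#include <tuple>
#include <vector>

void InspectTensors(DebugServices *debug_services) {
  // Illustrative full name in "node_name:slot" form.
  std::vector<std::string> names = {"Default/network/conv1.weight:0"};
  std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> results;
  debug_services->SearchNodesTensors(names, &results);
  for (const auto &entry : results) {
    const auto &tensor = std::get<1>(entry);
    if (tensor == nullptr) {
      continue;  // tensor not loaded for this step
    }
    // Accessors seen elsewhere in this diff:
    auto bytes = tensor->GetByteSize();
    auto dtype = tensor->GetType();
    auto shape = tensor->GetShape();
    (void)bytes; (void)dtype; (void)shape;
  }
}
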
debug_services.h

@@ -186,6 +186,15 @@ class DebugServices {
     }
   };
 
+  struct TensorBase {
+    TensorBase(uint64_t data_size, int dtype, const std::vector<int64_t> &shape)
+        : data_size(data_size), dtype(dtype), shape(shape) {}
+    TensorBase() = default;
+    uint64_t data_size = 0;
+    int dtype = 0;
+    std::vector<int64_t> shape;
+  };
+
   struct TensorStat {
     TensorStat(uint64_t data_size, int dtype, const std::vector<int64_t> &shape, bool is_bool, double max_value,
                double min_value, double avg_value, int count, int neg_zero_count, int pos_zero_count, int nan_count,

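The added TensorBase struct is a plain value holder for byte size, dtype id, and shape. A standalone sketch of how it composes (the struct is mirrored from the hunk above so the snippet compiles on its own; the dtype id is an illustrative placeholder, not MindSpore's real type enum):

#include <cstdint>
#include <iostream>
#include <vector>

// Local mirror of DebugServices::TensorBase as defined above.
struct TensorBase {
  TensorBase(uint64_t data_size, int dtype, const std::vector<int64_t> &shape)
      : data_size(data_size), dtype(dtype), shape(shape) {}
  TensorBase() = default;
  uint64_t data_size = 0;
  int dtype = 0;
  std::vector<int64_t> shape;
};

int main() {
  // A 2x3 float32 tensor: 6 elements * 4 bytes = 24 bytes.
  TensorBase base(24, /*dtype=*/11, {2, 3});
  std::cout << "bytes=" << base.data_size << " rank=" << base.shape.size() << std::endl;
  return 0;
}
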
@@ -313,6 +322,9 @@ class DebugServices {
   void ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *ret_name,
                         std::vector<char *> *data_ptr, std::vector<ssize_t> *data_size,
                         std::vector<unsigned int> *dtype, std::vector<std::vector<int64_t>> *const shape);
+
+  void SearchNodesTensors(const std::vector<std::string> &name,
+                          std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> *result_list);
 #ifdef ONLINE_DBG_MODE
   bool IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel = nullptr) const;
 

debug_grpc.proto

@@ -1,5 +1,5 @@
 /**
- * Copyright 2019 Huawei Technologies Co., Ltd
+ * Copyright 2019-2021 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

@@ -25,6 +25,8 @@ service EventListener {
   rpc SendMetadata (Metadata) returns (EventReply) {};
   rpc SendGraph (stream Chunk) returns (EventReply) {};
   rpc SendTensors (stream TensorProto) returns (EventReply) {};
+  rpc SendTensorBase (TensorBase) returns (EventReply) {};
+  rpc SendTensorStats (TensorSummary) returns (EventReply) {};
   rpc SendWatchpointHits (stream WatchpointHit) returns (EventReply) {};
   rpc SendMultiGraphs (stream Chunk) returns (EventReply) {};
   rpc SendHeartbeat (Heartbeat) returns (EventReply) {};

@@ -87,6 +89,12 @@ message SetCMD {
 
 message ViewCMD {
   repeated TensorProto tensors = 1;
+  enum Level {
+    value = 0;
+    statistics = 1;
+    base = 2;
+  }
+  Level level = 2;
 }
 
 message WatchCondition {

@@ -142,3 +150,28 @@ message Heartbeat {
   string message = 1;
   int32 period = 2;
 }
+
+message TensorSummary{
+  TensorBase tensor_base = 1;
+  Statistics statistics = 2;
+}
+
+message Statistics {
+  bool is_bool = 1;
+  float max_value = 2;
+  float min_value = 3;
+  float avg_value = 4;
+  int32 count = 5;
+  int32 neg_zero_count = 6;
+  int32 pos_zero_count = 7;
+  int32 nan_count = 8;
+  int32 neg_inf_count = 9;
+  int32 pos_inf_count = 10;
+  int32 zero_count = 11;
+}
+
+message TensorBase{
+  int32 data_type = 1;
+  repeated int64 shape = 2;
+  int64 data_size = 3;
+}

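The Statistics fields are simple reductions over the tensor's elements. A standalone sketch of one plausible computation — the authoritative numbers come from DebugServices::GetTensorStatistics, whose body is not part of this diff, and reading neg_zero_count/pos_zero_count as "negative/positive element counts" is an assumption:

#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

// Plausible reduction filling the Statistics fields from a float buffer.
struct Stats {
  float max_value = std::numeric_limits<float>::lowest();
  float min_value = std::numeric_limits<float>::max();
  float avg_value = 0.0f;
  int count = 0;
  int neg_zero_count = 0;  // assumed: elements < 0
  int pos_zero_count = 0;  // assumed: elements > 0
  int zero_count = 0;
  int nan_count = 0;
  int neg_inf_count = 0;
  int pos_inf_count = 0;
};

Stats ComputeStats(const std::vector<float> &data) {
  Stats s;
  double sum = 0.0;
  for (float v : data) {
    ++s.count;
    if (std::isnan(v)) {
      ++s.nan_count;
      continue;
    }
    if (std::isinf(v)) {
      v < 0 ? ++s.neg_inf_count : ++s.pos_inf_count;
      continue;
    }
    if (v < 0) {
      ++s.neg_zero_count;
    } else if (v > 0) {
      ++s.pos_zero_count;
    } else {
      ++s.zero_count;
    }
    sum += v;
    s.max_value = std::max(s.max_value, v);
    s.min_value = std::min(s.min_value, v);
  }
  if (s.count > 0) {
    s.avg_value = static_cast<float>(sum / s.count);
  }
  return s;
}
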
debugger.cc

@@ -48,6 +48,7 @@ using debugger::Chunk;
 using debugger::EventReply;
 using debugger::GraphProto;
 using debugger::ModelProto;
+using debugger::Statistics;
 using debugger::TensorProto;
 using debugger::WatchCondition;
 using debugger::WatchCondition_Condition_inf;

@@ -839,6 +840,29 @@ void Debugger::ProcessKViewCMD(const EventReply &reply) {
     MS_LOG(INFO) << "tensor iter: " << received_tensor.iter();
     MS_LOG(INFO) << "tensor truncate: " << std::boolalpha << received_tensor.truncate() << std::noboolalpha;
   }
+
+  switch (reply.view_cmd().level()) {
+    case debugger::ViewCMD_Level::ViewCMD_Level_base:
+      MS_LOG(INFO) << "Tensor base request.";
+      ViewBaseLevel(reply);
+      break;
+
+    case debugger::ViewCMD_Level::ViewCMD_Level_statistics:
+      MS_LOG(INFO) << "Tensor statistics request.";
+      ViewStatLevel(reply);
+      break;
+
+    case debugger::ViewCMD_Level::ViewCMD_Level_value:
+      MS_LOG(INFO) << "Tensor value request.";
+      ViewValueLevel(reply);
+      break;
+    default:
+      MS_LOG(DEBUG) << "Debug: Unknown tensor info level";
+      break;
+  }
+}
+
+void Debugger::ViewValueLevel(const EventReply &reply) {
   MS_LOG(INFO) << "Sending tensors";
   std::list<TensorProto> tensors = LoadTensors(GetTensors(reply));
   // print view cmd reply

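The dispatch keys off the new ViewCMD.level field. A hedged fragment showing how a request would select the statistics path, reusing the generated enum constants the switch above already names (assumes the generated protobuf header for debug_grpc.proto is on the include path):

debugger::ViewCMD view_cmd;
view_cmd.set_level(debugger::ViewCMD_Level::ViewCMD_Level_statistics);
// Once wrapped in an EventReply, this reaches Debugger::ProcessKViewCMD and
// the switch above routes it to ViewStatLevel(reply).
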
@@ -860,6 +884,30 @@ void Debugger::ProcessKViewCMD(const EventReply &reply) {
   }
 }
+
+void Debugger::ViewStatLevel(const EventReply &reply) {
+  std::list<TensorSummary> tensor_stat_list = LoadTensorsStat(GetTensors(reply));
+  int index = 0;
+  for (auto tensor_stat : tensor_stat_list) {
+    EventReply send_tensors_stat_reply = grpc_client_->SendTensorStats(tensor_stat);
+    if (send_tensors_stat_reply.status() != debugger::EventReply::OK) {
+      MS_LOG(ERROR) << "Error: SendTensorsStats failed for tensor index " << index << ".";
+    }
+    index++;
+  }
+}
+
+void Debugger::ViewBaseLevel(const EventReply &reply) {
+  std::list<TensorBase> tensors_base_list = LoadTensorsBase(GetTensors(reply));
+  int index = 0;
+  for (auto tensor_base : tensors_base_list) {
+    EventReply send_tensors_base_reply = grpc_client_->SendTensorBase(tensor_base);
+    if (send_tensors_base_reply.status() != debugger::EventReply::OK) {
+      MS_LOG(ERROR) << "Error: SendTensorsBase failed for tensor index " << index << ".";
+    }
+    index++;
+  }
+}
 
 void AddTensorProtoInfo(TensorProto *tensor_item, const TensorProto &tensor) {
   tensor_item->set_node_name(tensor.node_name());
   tensor_item->set_slot(tensor.slot());

@@ -870,6 +918,35 @@ void AddTensorProtoInfo(TensorProto *tensor_item, const TensorProto &tensor) {
   tensor_item->clear_dims();
 }
 
+void AddTensorStatInfo(const DebugServices::TensorStat &tensor_stat, std::list<TensorSummary> *tensor_summary_list) {
+  if (!tensor_summary_list) {
+    MS_LOG(DEBUG) << "tensor_summary_list is nullptr.";
+    return;
+  }
+  TensorSummary tensor_summary_item;
+  TensorBase *tensor_base = tensor_summary_item.mutable_tensor_base();
+  tensor_base->set_data_type(tensor_stat.dtype);
+  tensor_base->set_data_size(tensor_stat.data_size);
+  for (auto elem : tensor_stat.shape) {
+    tensor_base->add_shape(elem);
+  }
+
+  Statistics *tensor_statistics = tensor_summary_item.mutable_statistics();
+  tensor_statistics->set_is_bool(tensor_stat.is_bool);
+  tensor_statistics->set_max_value(tensor_stat.max_value);
+  tensor_statistics->set_min_value(tensor_stat.min_value);
+  tensor_statistics->set_avg_value(tensor_stat.avg_value);
+  tensor_statistics->set_count(tensor_stat.count);
+  tensor_statistics->set_neg_zero_count(tensor_stat.neg_zero_count);
+  tensor_statistics->set_pos_zero_count(tensor_stat.pos_zero_count);
+  tensor_statistics->set_nan_count(tensor_stat.nan_count);
+  tensor_statistics->set_neg_inf_count(tensor_stat.neg_inf_count);
+  tensor_statistics->set_pos_inf_count(tensor_stat.pos_inf_count);
+  tensor_statistics->set_zero_count(tensor_stat.zero_count);
+
+  tensor_summary_list->push_back(tensor_summary_item);
+}
+
 void Debugger::SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCondition &condition, const int32_t id,
                              const ProtoVector<WatchCondition_Parameter> &parameters) {
   std::vector<std::tuple<std::string, bool>> check_node_list;

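AddTensorStatInfo is the producer side of TensorSummary; a consumer reads it back with the matching generated getters. A hedged fragment using standard protobuf C++ accessors for the messages defined in debug_grpc.proto (the include name is hypothetical):

#include <iostream>
// #include "debug_grpc.pb.h"  // hypothetical generated header name

void PrintSummary(const debugger::TensorSummary &summary) {
  const debugger::TensorBase &base = summary.tensor_base();
  std::cout << "dtype=" << base.data_type() << " bytes=" << base.data_size() << "\n";
  const debugger::Statistics &stats = summary.statistics();
  std::cout << "min=" << stats.min_value() << " max=" << stats.max_value()
            << " nan=" << stats.nan_count() << "\n";
}
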
@@ -945,6 +1022,56 @@ std::list<TensorProto> Debugger::LoadTensors(const ProtoVector<TensorProto> &ten
   return tensor_list;
 }
 
+std::list<TensorBase> Debugger::LoadTensorsBase(const ProtoVector<TensorProto> &tensors) const {
+  std::list<TensorBase> tensor_base_list;
+  std::vector<std::string> name;
+  std::transform(tensors.begin(), tensors.end(), std::back_inserter(name), GetTensorFullName);
+  std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
+  debug_services_->SearchNodesTensors(name, &result_list);
+  for (auto result : result_list) {
+    auto tensor = std::get<1>(result);
+    if (!tensor) {
+      // tensor was not found, creating empty tensor base.
+      TensorBase tensor_base_item;
+      tensor_base_item.set_data_size(0);
+      tensor_base_item.set_data_type(0);
+      tensor_base_item.add_shape(0);
+      tensor_base_list.push_back(tensor_base_item);
+      continue;
+    }
+    // tensor was found creating tensor base object.
+    TensorBase tensor_base_item;
+    tensor_base_item.set_data_size(tensor->GetByteSize());
+    tensor_base_item.set_data_type(tensor->GetType());
+    for (auto elem : tensor->GetShape()) {
+      tensor_base_item.add_shape(elem);
+    }
+    tensor_base_list.push_back(tensor_base_item);
+  }
+  return tensor_base_list;
+}
+
+std::list<TensorSummary> Debugger::LoadTensorsStat(const ProtoVector<TensorProto> &tensors) const {
+  std::list<TensorSummary> tensor_summary_list;
+  std::vector<std::string> name;
+  std::transform(tensors.begin(), tensors.end(), std::back_inserter(name), GetTensorFullName);
+  std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
+  debug_services_->SearchNodesTensors(name, &result_list);
+  for (auto result : result_list) {
+    auto tensor = std::get<1>(result);
+    if (!tensor) {
+      // tensor was not found, creating empty tensor summary.
+      DebugServices::TensorStat tensor_stat;
+      AddTensorStatInfo(tensor_stat, &tensor_summary_list);
+      continue;
+    }
+    // tensor was found creating tensor summary object.
+    DebugServices::TensorStat tensor_stat = debug_services_->GetTensorStatistics(tensor);
+    AddTensorStatInfo(tensor_stat, &tensor_summary_list);
+  }
+  return tensor_summary_list;
+}
+
 void Debugger::Exit() {
   // clear resource before exit
   // debugger will notify main thread to exit because main thread can only exit at step boundary.

debugger.h

@@ -1,5 +1,5 @@
 /**
- * Copyright 2020 Huawei Technologies Co., Ltd
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

@@ -32,6 +32,7 @@ using debugger::DataType;
 using debugger::EventReply;
 using debugger::GraphProto;
 using debugger::ModelProto;
+using debugger::Statistics;
 using debugger::TensorProto;
 using debugger::WatchCondition;
 using debugger::WatchCondition_Parameter;

@@ -216,6 +217,12 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
   void ProcessKSetCMD(const EventReply &reply);
   // Process the KViewCMD
   void ProcessKViewCMD(const EventReply &reply);
+  // ViewCMD base level
+  void ViewBaseLevel(const EventReply &reply);
+  // ViewCMD statistics level
+  void ViewStatLevel(const EventReply &reply);
+  // ViewCMD value level
+  void ViewValueLevel(const EventReply &reply);
   // set what nodes and conditions to watch
   void SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCondition &condition, const int32_t id,
                      const ProtoVector<WatchCondition_Parameter> &parameters);

@@ -226,6 +233,12 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
   // load tensor for view command
   std::list<TensorProto> LoadTensors(const ProtoVector<TensorProto> &tensors) const;
 
+  // load tensor base for view command
+  std::list<TensorBase> LoadTensorsBase(const ProtoVector<TensorProto> &tensors) const;
+
+  // load tensor statistics for view command
+  std::list<TensorSummary> LoadTensorsStat(const ProtoVector<TensorProto> &tensors) const;
+
   // terminate training process
   void Exit();
 

grpc_client.cc

@@ -26,7 +26,9 @@ using debugger::EventReply_Status_FAILED;
 using debugger::GraphProto;
 using debugger::Heartbeat;
 using debugger::Metadata;
+using debugger::TensorBase;
 using debugger::TensorProto;
+using debugger::TensorSummary;
 using debugger::WatchpointHit;
 
 namespace mindspore {

@@ -200,4 +202,32 @@ EventReply GrpcClient::SendHeartbeat(const Heartbeat &heartbeat) {
   }
   return reply;
 }
+
+EventReply GrpcClient::SendTensorBase(const TensorBase &tensor_base) {
+  EventReply reply;
+  grpc::ClientContext context;
+
+  grpc::Status status = stub_->SendTensorBase(&context, tensor_base, &reply);
+
+  if (!status.ok()) {
+    MS_LOG(ERROR) << "RPC failed: SendTensorBase";
+    MS_LOG(ERROR) << status.error_code() << ": " << status.error_message();
+    reply.set_status(EventReply_Status_FAILED);
+  }
+  return reply;
+}
+
+EventReply GrpcClient::SendTensorStats(const TensorSummary &tensor_summary) {
+  EventReply reply;
+  grpc::ClientContext context;
+
+  grpc::Status status = stub_->SendTensorStats(&context, tensor_summary, &reply);
+
+  if (!status.ok()) {
+    MS_LOG(ERROR) << "RPC failed: SendTensorStats";
+    MS_LOG(ERROR) << status.error_code() << ": " << status.error_message();
+    reply.set_status(EventReply_Status_FAILED);
+  }
+  return reply;
+}
 }  // namespace mindspore

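Both new RPCs are unary, unlike the streaming SendTensors/SendGraph defined in the proto above: one TensorBase or TensorSummary message per reply. A hedged caller fragment, assuming a connected GrpcClient named client and reusing the setters AddTensorStatInfo uses (the dtype id and sizes are illustrative):

debugger::TensorBase tensor_base;
tensor_base.set_data_type(11);   // illustrative dtype id
tensor_base.set_data_size(24);   // e.g. 2x3 float32 = 24 bytes
tensor_base.add_shape(2);
tensor_base.add_shape(3);
EventReply reply = client.SendTensorBase(tensor_base);
if (reply.status() != debugger::EventReply::OK) {
  // failure already logged by GrpcClient; decide whether to retry
}
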
grpc_client.h

@@ -1,5 +1,5 @@
 /**
- * Copyright 2020 Huawei Technologies Co., Ltd
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

@@ -29,7 +29,9 @@ using debugger::EventReply;
 using debugger::GraphProto;
 using debugger::Heartbeat;
 using debugger::Metadata;
+using debugger::TensorBase;
 using debugger::TensorProto;
+using debugger::TensorSummary;
 using debugger::WatchpointHit;
 
 namespace mindspore {

@@ -55,6 +57,10 @@ class GrpcClient {
 
   EventReply SendTensors(const std::list<TensorProto> &tensors);
 
+  EventReply SendTensorBase(const TensorBase &tensor_base);
+
+  EventReply SendTensorStats(const TensorSummary &tensor_summary);
+
   EventReply SendMultiGraphs(const std::list<Chunk> &chunks);
 
   EventReply SendWatchpointHits(const std::list<WatchpointHit> &watchpoints);
