!5737 Add support for more watchpoint conditions in debugger.

Merge pull request !5737 from HarshvardhanGupta/add-new-wp
2020-09-11 20:58:18 +08:00 · 2020-09-11 20:58:18 +08:00 · 8fe3cf6991
parent 939737c017 e751c2e069
commit 8fe3cf6991
4 changed files with 194 additions and 169 deletions
--- a/mindspore/ccsrc/debug/debug_services.cc
+++ b/mindspore/ccsrc/debug/debug_services.cc
@ -13,6 +13,7 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
+#include <algorithm>
 #include "debug/debug_services.h"
 namespace mindspore {

@ -37,25 +38,18 @@ DebugServices &DebugServices::operator=(const DebugServices &other) {

 DebugServices::~DebugServices() { delete tensor_loader_; }

-void DebugServices::AddWatchpoint(unsigned int id, unsigned int watch_condition,
+void DebugServices::AddWatchpoint(unsigned int id, unsigned int watch_condition, float parameter,
                                  const std::vector<std::tuple<std::string, bool>> &check_node_list) {
  std::lock_guard<std::mutex> lg(lock_);

  watchpoint_t watchpoint_item;
-
  watchpoint_item.id = id;
-
-  if (watch_condition == 0) {
-    watchpoint_item.conditions.nan.enabled = true;
-  } else if (watch_condition == 1) {
-    watchpoint_item.conditions.inf.enabled = true;
-    watchpoint_item.conditions.neg_inf.enabled = true;
-  } else if (watch_condition == 2) {
-    watchpoint_item.conditions.overflow.enabled = true;
-  }
-
+  watchpoint_item.condition.type = static_cast<CONDITION_TYPE>(watch_condition);
+  watchpoint_item.condition.parameter = parameter;
+  if (watch_condition > 2)
+    // odd condition values are greater-than checks and even condition values are less-than checks
+    watchpoint_item.condition.comparison = (watch_condition & 1) == 0 ? "LT" : "GT";
  watchpoint_item.check_node_list = check_node_list;
-
  watchpoint_table[id] = watchpoint_item;
 }

@ -64,135 +58,109 @@ void DebugServices::RemoveWatchpoint(unsigned int id) {
  watchpoint_table.erase(id);
 }

+DebugServices::tensor_stats DebugServices::SummarizeTensor(const float *start, unsigned int n, bool need_min_max,
+                                                           bool need_mean_sd) {
+  tensor_stats stats;
+  for (unsigned int i = 0; i < n; ++i) {
+    float val = start[i];
+    stats.has_nan = stats.has_nan || isnan(val);
+    stats.has_inf = stats.has_inf || isinf(val);
+    if (stats.has_inf && stats.has_nan) {
+      // other statistics don't make sense in this case
+      break;
+    }
+
+    if (need_min_max) {
+      stats.min = std::min(stats.min, val);
+      stats.max = std::max(stats.max, val);
+    }
+
+    if (need_mean_sd) {
+      // for mean and sd calculation see
+      // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
+      float delta = val - stats.mean;
+      stats.mean += delta / (i + 1);
+      stats.m2 += delta * (val - stats.mean);
+    }
+  }
+  stats.n = n;
+  return stats;
+}
+
 void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot,
                                     std::vector<int> *condition, std::vector<unsigned int> *watchpoint_id,
                                     const std::vector<std::string> &op_overflows,
                                     const std::vector<std::shared_ptr<TensorData>> &tensor_list) {
  std::lock_guard<std::mutex> lg(lock_);
-  std::string current_tensor_name;
-  std::unordered_map<unsigned int, watchpoint_t> watchpoints_to_check_table;
-  const size_t location = 0;
-
-  for (std::size_t i = 0; i < tensor_list.size(); i++) {
-    current_tensor_name = tensor_list[i]->GetName();
-    std::string tensor_slot = std::to_string(tensor_list[i]->GetSlot());
-    mindspore::tensor::TensorPtr tensor_ptr = tensor_list[i]->GetTensor();
-    int tensor_data_type = tensor_ptr->data_type_c();
-
-    // check if we need to analyze this node and for which watchpoints we will check
-    // create a list of watchpoints to check
-    watchpoints_to_check_table.clear();
-    for (auto w_table_item : watchpoint_table) {
-      // if the watchpoint is checking for a nan or inf and the current tensor is not of a float type, then
-      // don't check the watchpoint for this tensor
-      if (std::get<1>(w_table_item).conditions.inf.enabled || std::get<1>(w_table_item).conditions.neg_inf.enabled ||
-          std::get<1>(w_table_item).conditions.nan.enabled) {
-        if (tensor_data_type != kNumberTypeFloat16 && tensor_data_type != kNumberTypeFloat &&
-            tensor_data_type != kNumberTypeFloat32 && tensor_data_type != kNumberTypeFloat64) {
-          continue;
-        }
-      }
-
-      auto check_node_list = std::get<1>(w_table_item).check_node_list;
-
-      for (auto check_node : check_node_list) {
-        std::string w_name = std::get<0>(check_node);
-        bool w_type = std::get<1>(check_node);
-
-        // check if the current node tensor name is included the watchpoint
-        std::string current_node_name = current_tensor_name.substr(0, current_tensor_name.find_first_of(":"));
-        if ((w_type == true && (current_tensor_name.find(w_name) == location || w_name == "*")) ||
-            (w_type == false && current_node_name == w_name)) {
-          watchpoints_to_check_table[w_table_item.second.id] = w_table_item.second;
-          break;
-        }
-      }
-    }
-    std::vector<unsigned int> hit_encountered;
-
-    // handle watchpoint conditions that do not require per element checks
-    for (auto it_w_table_check = watchpoints_to_check_table.begin();
-         it_w_table_check != watchpoints_to_check_table.end(); ++it_w_table_check) {
-      if (it_w_table_check->second.conditions.overflow.enabled) {
-        std::string name_no_slot = current_tensor_name.substr(0, current_tensor_name.find_first_of(":"));
-        if (std::find(op_overflows.begin(), op_overflows.end(), name_no_slot) != op_overflows.end()) {
-          hit_encountered.push_back(it_w_table_check->second.id);
-        }
-      }
-    }
-
-    if (hit_encountered.size()) {
-      HandleWatchpointHits(hit_encountered, name, slot, condition, watchpoint_id, current_tensor_name,
-                           &watchpoints_to_check_table, tensor_slot);
-      hit_encountered.clear();
-    }
-
-    // need to add support for float16 and float64, and other types when we support conditions beyond inf and nan
-    if (tensor_data_type != kNumberTypeFloat && tensor_data_type != kNumberTypeFloat32) {
-      continue;
-    }
-
-    // check if no watchpoints are remaining
-    if (watchpoints_to_check_table.empty()) {
-      continue;
-    }
-
-    float *start_addr = reinterpret_cast<float *>(tensor_ptr->data_c());
-    unsigned int num_elements = (tensor_ptr->data().nbytes()) / sizeof(float);
-    std::unordered_map<unsigned int, watchpoint_t>::iterator it_w_table_check;
-
-    for (unsigned int index = 0; index < num_elements; index++) {
-      float x = start_addr[index];
-      it_w_table_check = watchpoints_to_check_table.begin();
-
-      while (it_w_table_check != watchpoints_to_check_table.end()) {
-        if ((it_w_table_check->second.conditions.inf.enabled || it_w_table_check->second.conditions.neg_inf.enabled) &&
-            isinf(x)) {
-          hit_encountered.push_back(it_w_table_check->second.id);
-        } else if (it_w_table_check->second.conditions.nan.enabled && isnan(x)) {
-          hit_encountered.push_back(it_w_table_check->second.id);
-        }
-        ++it_w_table_check;
-      }
-
-      if (hit_encountered.size()) {
-        HandleWatchpointHits(hit_encountered, name, slot, condition, watchpoint_id, current_tensor_name,
-                             &watchpoints_to_check_table, tensor_slot);
-        hit_encountered.clear();
-      }
-
-      if (watchpoints_to_check_table.empty()) {
-        break;
-      }
-    }
+  if (watchpoint_table.empty()) {
+    return;
  }
-}

-void DebugServices::HandleWatchpointHits(const std::vector<unsigned int> &hit_encountered,
-                                         std::vector<std::string> *name, std::vector<std::string> *slot,
-                                         std::vector<int> *condition, std::vector<unsigned int> *watchpoint_id,
-                                         std::string current_tensor_name,
-                                         std::unordered_map<unsigned int, watchpoint_t> *watchpoints_to_check_table,
-                                         std::string tensor_slot) {
-  for (auto it_hit_id = hit_encountered.begin(); it_hit_id != hit_encountered.end(); ++it_hit_id) {
-    if (watchpoint_table.find(*it_hit_id) != watchpoint_table.end()) {
-      std::string name_no_slot = current_tensor_name.substr(0, current_tensor_name.find_first_of(":"));
-      name->push_back(name_no_slot);
-      slot->push_back(tensor_slot);
+  for (const auto &tensor : tensor_list) {
+    const auto tensor_name = tensor->GetName();
+    const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
+    const auto tensor_slot = std::to_string(tensor->GetSlot());
+    mindspore::tensor::TensorPtr tensor_ptr = tensor->GetTensor();
+    int tensor_dtype = tensor_ptr->data_type_c();
+    std::vector<unsigned int> hit_encountered;
+    std::unordered_map<unsigned int, watchpoint_t> watchpoints_to_check_table;
+    bool min_max_enabled = false;
+    bool mean_sd_enabled = false;
+    bool inf_nan_enabled = false;
+    for (auto w_table_item : watchpoint_table) {
+      auto wp = std::get<1>(w_table_item);

-      int condition_item = -1;
-      if (watchpoint_table[*it_hit_id].conditions.nan.enabled) {
-        condition_item = 0;
-      } else if (watchpoint_table[*it_hit_id].conditions.inf.enabled ||
-                 watchpoint_table[*it_hit_id].conditions.neg_inf.enabled) {
-        condition_item = 1;
-      } else if (watchpoint_table[*it_hit_id].conditions.overflow.enabled) {
-        condition_item = 2;
+      // if (!wp.conditions.condition_list[IS_OVERFLOW].enabled) {
+      if (wp.condition.type != IS_OVERFLOW) {
+        // only the overflow condition supports all data types; all other conditions are skipped for non-float32 tensors
+        if (tensor_dtype != kNumberTypeFloat && tensor_dtype != kNumberTypeFloat32) continue;
+      }
+
+      if (wp.IsNodeIncluded(tensor_name_no_slot)) {
+        min_max_enabled |= wp.min_max_enabled();
+        mean_sd_enabled |= wp.mean_sd_enabled();
+        inf_nan_enabled |= wp.inf_nan_enabled();
+        watchpoints_to_check_table[w_table_item.second.id] = w_table_item.second;
      }
-      condition->push_back(condition_item);
-      watchpoint_id->push_back(*it_hit_id);
    }
-    watchpoints_to_check_table->erase(*it_hit_id);
+    tensor_stats stats;
+
+    if (min_max_enabled || mean_sd_enabled || inf_nan_enabled) {
+      auto *start_addr = reinterpret_cast<float *>(tensor_ptr->data_c());
+      unsigned int num_elements = (tensor_ptr->data().nbytes()) / sizeof(float);
+      stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled);
+    }
+
+    for (auto &it : watchpoints_to_check_table) {
+      auto wp_id = it.second.id;
+      CONDITION_TYPE enabled_condition = it.second.condition.type;
+      bool hit = (enabled_condition == HAS_NAN && stats.has_nan) || (enabled_condition == HAS_INF && stats.has_inf) ||
+                 (enabled_condition == IS_OVERFLOW &&
+                  std::find(op_overflows.begin(), op_overflows.end(), tensor_name_no_slot) != op_overflows.end());
+
+      if (enabled_condition > 2) {
+        if (stats.has_inf || stats.has_nan) {
+          MS_LOG(WARNING) << "NaN or/and INF present in tensor: " << tensor_name << ". Cannot check "
+                          << condition_label[enabled_condition] << " watchpoint.";
+        } else {
+          bool gt = stats.statLookup(enabled_condition) > it.second.condition.parameter;
+          bool lt = stats.statLookup(enabled_condition) < it.second.condition.parameter;
+          hit |= it.second.condition.comparison == "GT" ? gt : lt;
+        }
+      }
+      if (hit) hit_encountered.push_back(wp_id);
+    }
+
+    for (auto it_hit_id = hit_encountered.begin(); it_hit_id != hit_encountered.end(); ++it_hit_id) {
+      if (watchpoint_table.find(*it_hit_id) != watchpoint_table.end()) {
+        name->push_back(tensor_name_no_slot);
+        slot->push_back(tensor_slot);
+        int condition_item = watchpoint_table.find(*it_hit_id)->second.condition.type;
+        condition->push_back(condition_item);
+        watchpoint_id->push_back(*it_hit_id);
+      }
+      watchpoints_to_check_table.erase(*it_hit_id);
+    }
  }
 }

--- a/mindspore/ccsrc/debug/debug_services.h
+++ b/mindspore/ccsrc/debug/debug_services.h
@ -23,6 +23,7 @@
 #include <tuple>
 #include <unordered_map>
 #include <mutex>
+#include <limits>
 #include "debug/tensor_load.h"
 #include "debug/tensor_data.h"
 #include "ir/dtype.h"
@ -38,39 +39,91 @@ class DebugServices {

  ~DebugServices();

-  typedef struct condition_no_param {
-    bool enabled = false;
-  } condition_no_param_t;
+  enum CONDITION_TYPE {
+    HAS_NAN,
+    HAS_INF,
+    IS_OVERFLOW,
+    MAX_GT,
+    MAX_LT,
+    MIN_GT,
+    MIN_LT,
+    MAX_MIN_GT,
+    MAX_MIN_LT,
+    MEAN_GT,
+    MEAN_LT,
+    SD_GT,
+    SD_LT
+  };

-  typedef struct condition_with_param {
-    bool enabled = false;
+  typedef struct condition {
+    CONDITION_TYPE type;
    float parameter = 0;
-  } condition_with_param_t;
-
-  typedef struct conditions {
-    condition_no_param_t inf;
-    condition_no_param_t neg_inf;
-    condition_no_param_t nan;
-    condition_no_param_t overflow;
-    condition_with_param_t max_below;
-    condition_with_param_t max_above;
-    condition_with_param_t min_below;
-    condition_with_param_t min_above;
-    condition_with_param_t max_minus_min_below;
-    condition_with_param_t max_minus_min_above;
-    condition_with_param_t mean_below;
-    condition_with_param_t mean_above;
-    condition_with_param_t std_dev_below;
-    condition_with_param_t std_dev_above;
-  } conditions_t;
+    std::string comparison;
+  } condition_t;

  typedef struct watchpoint {
    unsigned int id;
-    conditions_t conditions;
+    condition_t condition;
    std::vector<std::tuple<std::string, bool>> check_node_list;
+    size_t location = 0;
+
+    bool IsNodeIncluded(const std::string &tensor_name) {
+      std::string node_name = tensor_name.substr(0, tensor_name.find_first_of(':'));
+      for (auto check_node : check_node_list) {
+        std::string w_name = std::get<0>(check_node);
+        bool w_type = std::get<1>(check_node);
+        if ((w_type && (tensor_name.find(w_name) == location || w_name == "*")) || (!w_type && node_name == w_name)) {
+          return true;
+        }
+      }
+      return false;
+    }
+
+    bool min_max_enabled() {
+      return condition.type == MAX_LT || condition.type == MAX_GT || condition.type == MIN_LT ||
+             condition.type == MIN_GT || condition.type == MAX_MIN_LT || condition.type == MAX_MIN_GT;
+    }
+    // inf or nan related condition set
+    bool inf_nan_enabled() { return condition.type == HAS_INF || condition.type == HAS_NAN; }
+    // mean or sd related condition set
+    bool mean_sd_enabled() {
+      return condition.type == MEAN_LT || condition.type == MEAN_GT || condition.type == SD_LT ||
+             condition.type == SD_GT;
+    }
  } watchpoint_t;

-  void AddWatchpoint(unsigned int id, unsigned int watch_condition,
+  struct tensor_stats {
+    float min = std::numeric_limits<float>::max();
+    float max = std::numeric_limits<float>::lowest();
+    bool has_inf = false;
+    bool has_nan = false;
+    unsigned int n = 0;
+    float mean = 0.0;
+    float m2 = 0.0;
+
+    float statLookup(CONDITION_TYPE type) const {
+      if (type == MAX_GT || type == MAX_LT) return max;
+      if (type == MIN_GT || type == MIN_LT) return min;
+      if (type == MAX_MIN_GT || type == MAX_MIN_LT) return (max - min);
+      if (type == MEAN_GT || type == MEAN_LT) return mean;
+      if (type == SD_GT || type == SD_LT) return getStandardDeviation();
+      return std::numeric_limits<float>::quiet_NaN();
+    }
+
+    float getMean() const { return mean; }
+
+    float getVariance() const {
+      if (n > 1) {
+        return m2 / (n - 1);
+      } else {
+        return 0.0;
+      }
+    }
+
+    float getStandardDeviation() const { return sqrt(getVariance()); }
+  };
+
+  void AddWatchpoint(unsigned int id, unsigned int watch_condition, float parameter,
                     const std::vector<std::tuple<std::string, bool>> &check_node_list);

  void RemoveWatchpoint(unsigned int id);
@ -93,14 +146,13 @@ class DebugServices {
  std::mutex lock_;

  std::unordered_map<unsigned int, watchpoint_t> watchpoint_table;
+  std::vector<std::string> condition_label = {"HAS_NAN", "HAS_INF", "IS_OVERFLOW", "MAX_GT",     "MAX_LT",
+                                              "MIN_GT",  "MIN_LT",  "MAX_MIN_GT",  "MAX_MIN_LT", "MEAN_GT",
+                                              "MEAN_LT", "SD_GT",   "SD_LT"};

  TensorLoader *tensor_loader_;

-  void HandleWatchpointHits(const std::vector<unsigned int> &hit_encountered, std::vector<std::string> *name,
-                            std::vector<std::string> *slot, std::vector<int> *condition,
-                            std::vector<unsigned int> *watchpoint_id, std::string current_tensor_name,
-                            std::unordered_map<unsigned int, watchpoint_t> *watchpoints_to_check_table,
-                            std::string tensor_slot);
+  static tensor_stats SummarizeTensor(const float *start, unsigned int n, bool need_min_max, bool need_mean_sd);
 };
 }  // namespace mindspore

--- a/mindspore/ccsrc/debug/debugger/debug_grpc.proto
+++ b/mindspore/ccsrc/debug/debugger/debug_grpc.proto
@ -84,14 +84,19 @@ message WatchCondition {
    nan = 0;
    inf = 1;
    overflow = 2;
-    ge = 3;  // greater than and equal to
-    gt = 4;  // greater than
-    le = 5;  // less than and equal to
-    lt = 6;  // less than
-    between = 7;  // between
+    max_gt = 3;
+    max_lt = 4;
+    min_gt = 5;
+    min_lt = 6;
+    max_min_gt = 7;
+    max_min_lt = 8;
+    mean_gt = 9;
+    mean_lt = 10;
+    sd_gt = 11;
+    sd_lt = 12;
  }
  Condition condition = 1;
-  repeated float value = 2;  // for between condition, there will be two values
+  float value = 2;  // threshold that the tensor statistic is compared against for the *_gt / *_lt conditions
  repeated bool include = 3;  // for between condition, define the value is included or not
 }

--- a/mindspore/ccsrc/debug/debugger/debugger.cc
+++ b/mindspore/ccsrc/debug/debugger/debugger.cc
@ -493,7 +493,7 @@ void Debugger::SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCon
                 [](WatchNode node) -> std::tuple<std::string, bool> {
                   return make_tuple(node.node_name(), node.node_type() == "scope");
                 });
-  debug_services_->AddWatchpoint(id, condition.condition(), check_node_list);
+  debug_services_->AddWatchpoint(id, condition.condition(), condition.value(), check_node_list);
 }

 void Debugger::RemoveWatchpoint(const int32_t id) { debug_services_->RemoveWatchpoint(id); }