!5737 Add support for more watchpoint conditions in debugger.

Merge pull request !5737 from HarshvardhanGupta/add-new-wp
This commit is contained in:
mindspore-ci-bot 2020-09-11 20:58:18 +08:00 committed by Gitee
commit 8fe3cf6991
4 changed files with 194 additions and 169 deletions

View File

@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <algorithm>
#include "debug/debug_services.h"
namespace mindspore {
@ -37,25 +38,18 @@ DebugServices &DebugServices::operator=(const DebugServices &other) {
// Destroys the tensor loader held in tensor_loader_.
DebugServices::~DebugServices() {
  delete tensor_loader_;
}
// Registers (or overwrites) the watchpoint stored under `id` in watchpoint_table.
//   id              - key used in watchpoint_table
//   watch_condition - numeric condition code; cast to CONDITION_TYPE below
//   parameter       - stored as the condition's threshold
//   check_node_list - (name, flag) pairs copied into the watchpoint
// NOTE(review): this span shows two signature lines and two styles of condition setup
// (per-condition `conditions.*.enabled` flags and a single `condition` record) back to back;
// it looks like removed and added diff lines rendered together — confirm which lines are live.
void DebugServices::AddWatchpoint(unsigned int id, unsigned int watch_condition,
void DebugServices::AddWatchpoint(unsigned int id, unsigned int watch_condition, float parameter,
const std::vector<std::tuple<std::string, bool>> &check_node_list) {
// lock_ is held until the function returns
std::lock_guard<std::mutex> lg(lock_);
watchpoint_t watchpoint_item;
watchpoint_item.id = id;
if (watch_condition == 0) {
watchpoint_item.conditions.nan.enabled = true;
} else if (watch_condition == 1) {
watchpoint_item.conditions.inf.enabled = true;
watchpoint_item.conditions.neg_inf.enabled = true;
} else if (watch_condition == 2) {
watchpoint_item.conditions.overflow.enabled = true;
}
// the numeric code is converted directly to the enum; no range check is performed here
watchpoint_item.condition.type = static_cast<CONDITION_TYPE>(watch_condition);
watchpoint_item.condition.parameter = parameter;
if (watch_condition > 2)
// codes above 2 are threshold conditions: odd codes are "greater than" conditions and
// even codes are "less than" conditions
watchpoint_item.condition.comparison = (watch_condition & 1) == 0 ? "LT" : "GT";
watchpoint_item.check_node_list = check_node_list;
// operator[] inserts a new entry or replaces an existing one with the same id
watchpoint_table[id] = watchpoint_item;
}
@ -64,135 +58,109 @@ void DebugServices::RemoveWatchpoint(unsigned int id) {
watchpoint_table.erase(id);
}
// Computes summary statistics over the `n` floats starting at `start` in a single pass.
//   start        - first element of the buffer; must be readable for `n` elements
//   n            - number of elements to scan; also stored in the returned stats
//   need_min_max - when true, track the minimum and maximum
//   need_mean_sd - when true, track the running mean and the sum of squared deviations (m2)
// Returns a tensor_stats whose has_nan / has_inf flags are always computed. The scan stops
// early once both a NaN and an Inf have been seen, because the remaining statistics are
// meaningless in that case.
DebugServices::tensor_stats DebugServices::SummarizeTensor(const float *start, unsigned int n, bool need_min_max,
                                                           bool need_mean_sd) {
  tensor_stats stats;
  stats.n = n;
  // Accumulate in double: with float accumulators the per-element update delta / (i + 1)
  // falls below the mean's precision on large tensors and the result drifts.
  double running_mean = stats.mean;
  double running_m2 = stats.m2;
  for (unsigned int i = 0; i < n; ++i) {
    const float val = start[i];
    stats.has_nan = stats.has_nan || std::isnan(val);
    stats.has_inf = stats.has_inf || std::isinf(val);
    if (stats.has_inf && stats.has_nan) {
      // other statistics don't make sense in this case
      break;
    }
    if (need_min_max) {
      stats.min = std::min(stats.min, val);
      stats.max = std::max(stats.max, val);
    }
    if (need_mean_sd) {
      // for mean and sd calculation see
      // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
      const double delta = static_cast<double>(val) - running_mean;
      running_mean += delta / (static_cast<double>(i) + 1.0);
      running_m2 += delta * (static_cast<double>(val) - running_mean);
    }
  }
  // narrow once at the end; the struct fields are float
  stats.mean = static_cast<float>(running_mean);
  stats.m2 = static_cast<float>(running_m2);
  return stats;
}
// Checks the tensors in tensor_list against the entries of watchpoint_table and, for every hit,
// appends one element to each of the output vectors name / slot / condition / watchpoint_id.
// op_overflows lists node names (without slot) that are compared against for the overflow condition.
// NOTE(review): this span contains lines from two definitions (CheckWatchpoints and
// HandleWatchpointHits) and from two different implementations (per-element checks on
// `conditions.*.enabled` flags, and a per-tensor summary via SummarizeTensor on `condition.type`).
// It looks like removed and added diff lines rendered together — confirm which lines are live
// before relying on any comment below.
void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot,
std::vector<int> *condition, std::vector<unsigned int> *watchpoint_id,
const std::vector<std::string> &op_overflows,
const std::vector<std::shared_ptr<TensorData>> &tensor_list) {
// lock_ is held for the whole call
std::lock_guard<std::mutex> lg(lock_);
std::string current_tensor_name;
std::unordered_map<unsigned int, watchpoint_t> watchpoints_to_check_table;
const size_t location = 0;
for (std::size_t i = 0; i < tensor_list.size(); i++) {
current_tensor_name = tensor_list[i]->GetName();
std::string tensor_slot = std::to_string(tensor_list[i]->GetSlot());
mindspore::tensor::TensorPtr tensor_ptr = tensor_list[i]->GetTensor();
int tensor_data_type = tensor_ptr->data_type_c();
// check if we need to analyze this node and for which watchpoints we will check
// create a list of watchpoints to check
watchpoints_to_check_table.clear();
for (auto w_table_item : watchpoint_table) {
// if the watchpoint is checking for a nan or inf and the current tensor is not of a float type, then
// don't check the watchpoint for this tensor
if (std::get<1>(w_table_item).conditions.inf.enabled || std::get<1>(w_table_item).conditions.neg_inf.enabled ||
std::get<1>(w_table_item).conditions.nan.enabled) {
if (tensor_data_type != kNumberTypeFloat16 && tensor_data_type != kNumberTypeFloat &&
tensor_data_type != kNumberTypeFloat32 && tensor_data_type != kNumberTypeFloat64) {
continue;
}
}
auto check_node_list = std::get<1>(w_table_item).check_node_list;
for (auto check_node : check_node_list) {
std::string w_name = std::get<0>(check_node);
bool w_type = std::get<1>(check_node);
// check if the current node tensor name is included the watchpoint
std::string current_node_name = current_tensor_name.substr(0, current_tensor_name.find_first_of(":"));
if ((w_type == true && (current_tensor_name.find(w_name) == location || w_name == "*")) ||
(w_type == false && current_node_name == w_name)) {
watchpoints_to_check_table[w_table_item.second.id] = w_table_item.second;
break;
}
}
}
std::vector<unsigned int> hit_encountered;
// handle watchpoint conditions that do not require per element checks
for (auto it_w_table_check = watchpoints_to_check_table.begin();
it_w_table_check != watchpoints_to_check_table.end(); ++it_w_table_check) {
if (it_w_table_check->second.conditions.overflow.enabled) {
std::string name_no_slot = current_tensor_name.substr(0, current_tensor_name.find_first_of(":"));
if (std::find(op_overflows.begin(), op_overflows.end(), name_no_slot) != op_overflows.end()) {
hit_encountered.push_back(it_w_table_check->second.id);
}
}
}
if (hit_encountered.size()) {
HandleWatchpointHits(hit_encountered, name, slot, condition, watchpoint_id, current_tensor_name,
&watchpoints_to_check_table, tensor_slot);
hit_encountered.clear();
}
// need to add support for float16 and float64, and other types when we support conditions beyond inf and nan
if (tensor_data_type != kNumberTypeFloat && tensor_data_type != kNumberTypeFloat32) {
continue;
}
// check if no watchpoints are remaining
if (watchpoints_to_check_table.empty()) {
continue;
}
float *start_addr = reinterpret_cast<float *>(tensor_ptr->data_c());
unsigned int num_elements = (tensor_ptr->data().nbytes()) / sizeof(float);
std::unordered_map<unsigned int, watchpoint_t>::iterator it_w_table_check;
for (unsigned int index = 0; index < num_elements; index++) {
float x = start_addr[index];
it_w_table_check = watchpoints_to_check_table.begin();
while (it_w_table_check != watchpoints_to_check_table.end()) {
if ((it_w_table_check->second.conditions.inf.enabled || it_w_table_check->second.conditions.neg_inf.enabled) &&
isinf(x)) {
hit_encountered.push_back(it_w_table_check->second.id);
} else if (it_w_table_check->second.conditions.nan.enabled && isnan(x)) {
hit_encountered.push_back(it_w_table_check->second.id);
}
++it_w_table_check;
}
if (hit_encountered.size()) {
HandleWatchpointHits(hit_encountered, name, slot, condition, watchpoint_id, current_tensor_name,
&watchpoints_to_check_table, tensor_slot);
hit_encountered.clear();
}
if (watchpoints_to_check_table.empty()) {
break;
}
}
// nothing to do when no watchpoint is registered
if (watchpoint_table.empty()) {
return;
}
}
// NOTE(review): a second function signature starts here without the previous body being
// closed in the usual way — presumably the removed HandleWatchpointHits definition; verify.
void DebugServices::HandleWatchpointHits(const std::vector<unsigned int> &hit_encountered,
std::vector<std::string> *name, std::vector<std::string> *slot,
std::vector<int> *condition, std::vector<unsigned int> *watchpoint_id,
std::string current_tensor_name,
std::unordered_map<unsigned int, watchpoint_t> *watchpoints_to_check_table,
std::string tensor_slot) {
for (auto it_hit_id = hit_encountered.begin(); it_hit_id != hit_encountered.end(); ++it_hit_id) {
if (watchpoint_table.find(*it_hit_id) != watchpoint_table.end()) {
std::string name_no_slot = current_tensor_name.substr(0, current_tensor_name.find_first_of(":"));
name->push_back(name_no_slot);
slot->push_back(tensor_slot);
// per-tensor pass: the name is split at the first ':' to obtain the node name without slot
for (const auto &tensor : tensor_list) {
const auto tensor_name = tensor->GetName();
const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
const auto tensor_slot = std::to_string(tensor->GetSlot());
mindspore::tensor::TensorPtr tensor_ptr = tensor->GetTensor();
int tensor_dtype = tensor_ptr->data_type_c();
std::vector<unsigned int> hit_encountered;
std::unordered_map<unsigned int, watchpoint_t> watchpoints_to_check_table;
// which groups of statistics the selected watchpoints need for this tensor
bool min_max_enabled = false;
bool mean_sd_enabled = false;
bool inf_nan_enabled = false;
// select the watchpoints that apply to this tensor (each table entry is copied)
for (auto w_table_item : watchpoint_table) {
auto wp = std::get<1>(w_table_item);
int condition_item = -1;
if (watchpoint_table[*it_hit_id].conditions.nan.enabled) {
condition_item = 0;
} else if (watchpoint_table[*it_hit_id].conditions.inf.enabled ||
watchpoint_table[*it_hit_id].conditions.neg_inf.enabled) {
condition_item = 1;
} else if (watchpoint_table[*it_hit_id].conditions.overflow.enabled) {
condition_item = 2;
// if (!wp.conditions.condition_list[IS_OVERFLOW].enabled) {
if (wp.condition.type != IS_OVERFLOW) {
// only overflow condition supports all data types
if (tensor_dtype != kNumberTypeFloat && tensor_dtype != kNumberTypeFloat32) continue;
}
if (wp.IsNodeIncluded(tensor_name_no_slot)) {
min_max_enabled |= wp.min_max_enabled();
mean_sd_enabled |= wp.mean_sd_enabled();
inf_nan_enabled |= wp.inf_nan_enabled();
watchpoints_to_check_table[w_table_item.second.id] = w_table_item.second;
}
condition->push_back(condition_item);
watchpoint_id->push_back(*it_hit_id);
}
watchpoints_to_check_table->erase(*it_hit_id);
// the buffer is reinterpreted as float and summarized once, only if some selected watchpoint
// needs statistics; otherwise `stats` keeps its default values
tensor_stats stats;
if (min_max_enabled || mean_sd_enabled || inf_nan_enabled) {
auto *start_addr = reinterpret_cast<float *>(tensor_ptr->data_c());
unsigned int num_elements = (tensor_ptr->data().nbytes()) / sizeof(float);
stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled);
}
// evaluate each selected watchpoint against the summary
for (auto &it : watchpoints_to_check_table) {
auto wp_id = it.second.id;
CONDITION_TYPE enabled_condition = it.second.condition.type;
bool hit = (enabled_condition == HAS_NAN && stats.has_nan) || (enabled_condition == HAS_INF && stats.has_inf) ||
(enabled_condition == IS_OVERFLOW &&
std::find(op_overflows.begin(), op_overflows.end(), tensor_name_no_slot) != op_overflows.end());
// condition types above 2 compare a statistic from statLookup against the stored parameter;
// they are skipped with a warning when the tensor contains NaN or Inf
if (enabled_condition > 2) {
if (stats.has_inf || stats.has_nan) {
MS_LOG(WARNING) << "NaN or/and INF present in tensor: " << tensor_name << ". Cannot check "
<< condition_label[enabled_condition] << " watchpoint.";
} else {
bool gt = stats.statLookup(enabled_condition) > it.second.condition.parameter;
bool lt = stats.statLookup(enabled_condition) < it.second.condition.parameter;
hit |= it.second.condition.comparison == "GT" ? gt : lt;
}
}
if (hit) hit_encountered.push_back(wp_id);
}
// report every hit whose id is still present in watchpoint_table, then drop it from the local table
for (auto it_hit_id = hit_encountered.begin(); it_hit_id != hit_encountered.end(); ++it_hit_id) {
if (watchpoint_table.find(*it_hit_id) != watchpoint_table.end()) {
name->push_back(tensor_name_no_slot);
slot->push_back(tensor_slot);
int condition_item = watchpoint_table.find(*it_hit_id)->second.condition.type;
condition->push_back(condition_item);
watchpoint_id->push_back(*it_hit_id);
}
watchpoints_to_check_table.erase(*it_hit_id);
}
}
}

View File

@ -23,6 +23,7 @@
#include <tuple>
#include <unordered_map>
#include <mutex>
#include <limits>
#include <cmath>
#include "debug/tensor_load.h"
#include "debug/tensor_data.h"
#include "ir/dtype.h"
@ -38,39 +39,91 @@ class DebugServices {
~DebugServices();
// Condition that takes no threshold: it carries only an on/off flag (off by default).
typedef struct condition_no_param {
bool enabled = false;
} condition_no_param_t;
// Kinds of watchpoint condition. The numeric values are spelled out because other code
// depends on them: AddWatchpoint casts a raw integer to this enum and derives "GT"/"LT"
// from the value's parity, and condition_label is indexed by the value.
enum CONDITION_TYPE {
  HAS_NAN = 0,      // tensor contains a NaN
  HAS_INF = 1,      // tensor contains an Inf
  IS_OVERFLOW = 2,  // node is listed among the overflowed ops
  // threshold conditions: odd values are "greater than", even values are "less than"
  MAX_GT = 3,
  MAX_LT = 4,
  MIN_GT = 5,
  MIN_LT = 6,
  MAX_MIN_GT = 7,  // (max - min) compared against the threshold
  MAX_MIN_LT = 8,
  MEAN_GT = 9,
  MEAN_LT = 10,
  SD_GT = 11,  // standard deviation compared against the threshold
  SD_LT = 12
};
// NOTE(review): two struct layouts appear interleaved in this span — one with an `enabled` flag
// per named condition (condition_with_param_t / conditions_t) and one with a single
// `type` / `parameter` / `comparison` record (condition_t). It looks like removed and added diff
// lines rendered together; confirm which declarations are live before relying on this span.
typedef struct condition_with_param {
bool enabled = false;
typedef struct condition {
// which CONDITION_TYPE this record describes
CONDITION_TYPE type;
// threshold value; defaults to 0
float parameter = 0;
} condition_with_param_t;
typedef struct conditions {
condition_no_param_t inf;
condition_no_param_t neg_inf;
condition_no_param_t nan;
condition_no_param_t overflow;
condition_with_param_t max_below;
condition_with_param_t max_above;
condition_with_param_t min_below;
condition_with_param_t min_above;
condition_with_param_t max_minus_min_below;
condition_with_param_t max_minus_min_above;
condition_with_param_t mean_below;
condition_with_param_t mean_above;
condition_with_param_t std_dev_below;
condition_with_param_t std_dev_above;
} conditions_t;
// comparison direction as text; AddWatchpoint assigns "GT" or "LT"
std::string comparison;
} condition_t;
// One registered watchpoint: its id, the condition it checks, and the nodes it applies to.
typedef struct watchpoint {
  unsigned int id = 0;
  conditions_t conditions;
  condition_t condition;
  // (name, flag) pairs; a true flag makes `name` a prefix pattern (or the wildcard "*"),
  // a false flag requires an exact node-name match
  std::vector<std::tuple<std::string, bool>> check_node_list;
  // position at which a prefix pattern must be found in the tensor name
  size_t location = 0;

  // Returns true when `tensor_name` is covered by any entry of check_node_list.
  // The node name used for exact matches is `tensor_name` up to the first ':'.
  bool IsNodeIncluded(const std::string &tensor_name) const {
    const std::string node_name = tensor_name.substr(0, tensor_name.find_first_of(':'));
    // iterate by reference: each entry holds a std::string, so copying per iteration is wasted work
    for (const auto &check_node : check_node_list) {
      const std::string &w_name = std::get<0>(check_node);
      const bool is_prefix_pattern = std::get<1>(check_node);
      if (is_prefix_pattern) {
        if (w_name == "*" || tensor_name.find(w_name) == location) {
          return true;
        }
      } else if (node_name == w_name) {
        return true;
      }
    }
    return false;
  }

  // min, max or (max - min) related condition set
  bool min_max_enabled() const {
    return condition.type == MAX_LT || condition.type == MAX_GT || condition.type == MIN_LT ||
           condition.type == MIN_GT || condition.type == MAX_MIN_LT || condition.type == MAX_MIN_GT;
  }

  // inf or nan related condition set
  bool inf_nan_enabled() const { return condition.type == HAS_INF || condition.type == HAS_NAN; }

  // mean or sd related condition set
  bool mean_sd_enabled() const {
    return condition.type == MEAN_LT || condition.type == MEAN_GT || condition.type == SD_LT ||
           condition.type == SD_GT;
  }
} watchpoint_t;
void AddWatchpoint(unsigned int id, unsigned int watch_condition,
// Summary statistics of a float buffer, as filled in by SummarizeTensor.
// min / max start at sentinel values so the first element always replaces them;
// m2 is the running sum of squared deviations from the mean.
struct tensor_stats {
  float min = std::numeric_limits<float>::max();
  float max = std::numeric_limits<float>::lowest();
  bool has_inf = false;
  bool has_nan = false;
  unsigned int n = 0;
  float mean = 0.0f;
  float m2 = 0.0f;

  // Returns the statistic that the given threshold condition is compared against.
  // Returns NaN for condition types without a statistic, and for an empty summary (n == 0):
  // min / max would still hold their sentinels there, and NaN makes both the "greater than"
  // and the "less than" comparison false, so an empty tensor never triggers a threshold hit.
  float statLookup(CONDITION_TYPE type) const {
    if (n == 0) return std::numeric_limits<float>::quiet_NaN();
    switch (type) {
      case MAX_GT:
      case MAX_LT:
        return max;
      case MIN_GT:
      case MIN_LT:
        return min;
      case MAX_MIN_GT:
      case MAX_MIN_LT:
        return (max - min);
      case MEAN_GT:
      case MEAN_LT:
        return mean;
      case SD_GT:
      case SD_LT:
        return getStandardDeviation();
      default:
        return std::numeric_limits<float>::quiet_NaN();
    }
  }

  float getMean() const { return mean; }

  // Sample variance (divides by n - 1); 0 when fewer than two elements were summarized.
  float getVariance() const {
    if (n > 1) {
      return m2 / (n - 1);
    }
    return 0.0f;
  }

  float getStandardDeviation() const { return std::sqrt(getVariance()); }
};
void AddWatchpoint(unsigned int id, unsigned int watch_condition, float parameter,
const std::vector<std::tuple<std::string, bool>> &check_node_list);
void RemoveWatchpoint(unsigned int id);
@ -93,14 +146,13 @@ class DebugServices {
std::mutex lock_;
std::unordered_map<unsigned int, watchpoint_t> watchpoint_table;
std::vector<std::string> condition_label = {"HAS_NAN", "HAS_INF", "IS_OVERFLOW", "MAX_GT", "MAX_LT",
"MIN_GT", "MIN_LT", "MAX_MIN_GT", "MAX_MIN_LT", "MEAN_GT",
"MEAN_LT", "SD_GT", "SD_LT"};
TensorLoader *tensor_loader_;
void HandleWatchpointHits(const std::vector<unsigned int> &hit_encountered, std::vector<std::string> *name,
std::vector<std::string> *slot, std::vector<int> *condition,
std::vector<unsigned int> *watchpoint_id, std::string current_tensor_name,
std::unordered_map<unsigned int, watchpoint_t> *watchpoints_to_check_table,
std::string tensor_slot);
static tensor_stats SummarizeTensor(const float *start, unsigned int n, bool need_min_max, bool need_mean_sd);
};
} // namespace mindspore

View File

@ -84,14 +84,19 @@ message WatchCondition {
nan = 0;
inf = 1;
overflow = 2;
ge = 3; // greater than and equal to
gt = 4; // greater than
le = 5; // less than and equal to
lt = 6; // less than
between = 7; // between
max_gt = 3;
max_lt = 4;
min_gt = 5;
min_lt = 6;
max_min_gt = 7;
max_min_lt = 8;
mean_gt = 9;
mean_lt = 10;
sd_gt = 11;
sd_lt = 12;
}
Condition condition = 1;
repeated float value = 2; // for between condition, there will be two values
float value = 2; // for between condition, there will be two values
repeated bool include = 3; // for between condition, define the value is included or not
}

View File

@ -493,7 +493,7 @@ void Debugger::SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCon
[](WatchNode node) -> std::tuple<std::string, bool> {
return make_tuple(node.node_name(), node.node_type() == "scope");
});
debug_services_->AddWatchpoint(id, condition.condition(), check_node_list);
debug_services_->AddWatchpoint(id, condition.condition(), condition.value(), check_node_list);
}
// Forwards removal of the watchpoint with the given id to debug_services_.
void Debugger::RemoveWatchpoint(const int32_t id) {
  debug_services_->RemoveWatchpoint(id);
}