forked from mindspore-Ecosystem/mindspore
!5737 Add support for more watchpoint conditions in debugger.
Merge pull request !5737 from HarshvardhanGupta/add-new-wp
This commit is contained in:
commit
8fe3cf6991
|
@ -13,6 +13,7 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include <algorithm>
|
||||
#include "debug/debug_services.h"
|
||||
namespace mindspore {
|
||||
|
||||
|
@ -37,25 +38,18 @@ DebugServices &DebugServices::operator=(const DebugServices &other) {
|
|||
|
||||
DebugServices::~DebugServices() { delete tensor_loader_; }
|
||||
|
||||
void DebugServices::AddWatchpoint(unsigned int id, unsigned int watch_condition,
|
||||
void DebugServices::AddWatchpoint(unsigned int id, unsigned int watch_condition, float parameter,
|
||||
const std::vector<std::tuple<std::string, bool>> &check_node_list) {
|
||||
std::lock_guard<std::mutex> lg(lock_);
|
||||
|
||||
watchpoint_t watchpoint_item;
|
||||
|
||||
watchpoint_item.id = id;
|
||||
|
||||
if (watch_condition == 0) {
|
||||
watchpoint_item.conditions.nan.enabled = true;
|
||||
} else if (watch_condition == 1) {
|
||||
watchpoint_item.conditions.inf.enabled = true;
|
||||
watchpoint_item.conditions.neg_inf.enabled = true;
|
||||
} else if (watch_condition == 2) {
|
||||
watchpoint_item.conditions.overflow.enabled = true;
|
||||
}
|
||||
|
||||
watchpoint_item.condition.type = static_cast<CONDITION_TYPE>(watch_condition);
|
||||
watchpoint_item.condition.parameter = parameter;
|
||||
if (watch_condition > 2)
|
||||
// odd condition values are "greater than" checks and even values are "less than" checks
|
||||
watchpoint_item.condition.comparison = (watch_condition & 1) == 0 ? "LT" : "GT";
|
||||
watchpoint_item.check_node_list = check_node_list;
|
||||
|
||||
watchpoint_table[id] = watchpoint_item;
|
||||
}
|
||||
|
||||
|
@ -64,135 +58,109 @@ void DebugServices::RemoveWatchpoint(unsigned int id) {
|
|||
watchpoint_table.erase(id);
|
||||
}
|
||||
|
||||
// Makes a single pass over `n` floats beginning at `start` and collects summary statistics.
//
// start        - first element to read; assumes at least `n` readable floats (not checked here).
// n            - number of elements to scan; always recorded in the returned stats.n.
// need_min_max - when true, the running minimum and maximum are maintained.
// need_mean_sd - when true, the running mean and sum of squared deviations (m2) are maintained.
//
// NaN / Inf presence is always tracked. The scan stops early once both a NaN and an Inf have
// been seen, so min/max/mean/m2 then only reflect the elements visited before that point.
DebugServices::tensor_stats DebugServices::SummarizeTensor(const float *start, unsigned int n, bool need_min_max,
                                                           bool need_mean_sd) {
  tensor_stats summary;
  summary.n = n;

  for (unsigned int idx = 0; idx < n; ++idx) {
    const float value = start[idx];

    // the flags are sticky: once set they are never re-evaluated
    if (!summary.has_nan && isnan(value)) {
      summary.has_nan = true;
    }
    if (!summary.has_inf && isinf(value)) {
      summary.has_inf = true;
    }
    if (summary.has_inf && summary.has_nan) {
      // the remaining statistics are meaningless once both kinds are present
      break;
    }

    if (need_min_max) {
      summary.min = std::min(summary.min, value);
      summary.max = std::max(summary.max, value);
    }

    if (need_mean_sd) {
      // Welford's online update, see
      // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
      const float diff = value - summary.mean;
      summary.mean += diff / (idx + 1);
      summary.m2 += diff * (value - summary.mean);
    }
  }

  return summary;
}
|
||||
|
||||
void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot,
|
||||
std::vector<int> *condition, std::vector<unsigned int> *watchpoint_id,
|
||||
const std::vector<std::string> &op_overflows,
|
||||
const std::vector<std::shared_ptr<TensorData>> &tensor_list) {
|
||||
std::lock_guard<std::mutex> lg(lock_);
|
||||
std::string current_tensor_name;
|
||||
std::unordered_map<unsigned int, watchpoint_t> watchpoints_to_check_table;
|
||||
const size_t location = 0;
|
||||
|
||||
for (std::size_t i = 0; i < tensor_list.size(); i++) {
|
||||
current_tensor_name = tensor_list[i]->GetName();
|
||||
std::string tensor_slot = std::to_string(tensor_list[i]->GetSlot());
|
||||
mindspore::tensor::TensorPtr tensor_ptr = tensor_list[i]->GetTensor();
|
||||
int tensor_data_type = tensor_ptr->data_type_c();
|
||||
|
||||
// check if we need to analyze this node and for which watchpoints we will check
|
||||
// create a list of watchpoints to check
|
||||
watchpoints_to_check_table.clear();
|
||||
for (auto w_table_item : watchpoint_table) {
|
||||
// if the watchpoint is checking for a nan or inf and the current tensor is not of a float type, then
|
||||
// don't check the watchpoint for this tensor
|
||||
if (std::get<1>(w_table_item).conditions.inf.enabled || std::get<1>(w_table_item).conditions.neg_inf.enabled ||
|
||||
std::get<1>(w_table_item).conditions.nan.enabled) {
|
||||
if (tensor_data_type != kNumberTypeFloat16 && tensor_data_type != kNumberTypeFloat &&
|
||||
tensor_data_type != kNumberTypeFloat32 && tensor_data_type != kNumberTypeFloat64) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
auto check_node_list = std::get<1>(w_table_item).check_node_list;
|
||||
|
||||
for (auto check_node : check_node_list) {
|
||||
std::string w_name = std::get<0>(check_node);
|
||||
bool w_type = std::get<1>(check_node);
|
||||
|
||||
// check if the current tensor's node name is included in the watchpoint
|
||||
std::string current_node_name = current_tensor_name.substr(0, current_tensor_name.find_first_of(":"));
|
||||
if ((w_type == true && (current_tensor_name.find(w_name) == location || w_name == "*")) ||
|
||||
(w_type == false && current_node_name == w_name)) {
|
||||
watchpoints_to_check_table[w_table_item.second.id] = w_table_item.second;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
std::vector<unsigned int> hit_encountered;
|
||||
|
||||
// handle watchpoint conditions that do not require per element checks
|
||||
for (auto it_w_table_check = watchpoints_to_check_table.begin();
|
||||
it_w_table_check != watchpoints_to_check_table.end(); ++it_w_table_check) {
|
||||
if (it_w_table_check->second.conditions.overflow.enabled) {
|
||||
std::string name_no_slot = current_tensor_name.substr(0, current_tensor_name.find_first_of(":"));
|
||||
if (std::find(op_overflows.begin(), op_overflows.end(), name_no_slot) != op_overflows.end()) {
|
||||
hit_encountered.push_back(it_w_table_check->second.id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (hit_encountered.size()) {
|
||||
HandleWatchpointHits(hit_encountered, name, slot, condition, watchpoint_id, current_tensor_name,
|
||||
&watchpoints_to_check_table, tensor_slot);
|
||||
hit_encountered.clear();
|
||||
}
|
||||
|
||||
// need to add support for float16 and float64, and other types when we support conditions beyond inf and nan
|
||||
if (tensor_data_type != kNumberTypeFloat && tensor_data_type != kNumberTypeFloat32) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// check if no watchpoints are remaining
|
||||
if (watchpoints_to_check_table.empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
float *start_addr = reinterpret_cast<float *>(tensor_ptr->data_c());
|
||||
unsigned int num_elements = (tensor_ptr->data().nbytes()) / sizeof(float);
|
||||
std::unordered_map<unsigned int, watchpoint_t>::iterator it_w_table_check;
|
||||
|
||||
for (unsigned int index = 0; index < num_elements; index++) {
|
||||
float x = start_addr[index];
|
||||
it_w_table_check = watchpoints_to_check_table.begin();
|
||||
|
||||
while (it_w_table_check != watchpoints_to_check_table.end()) {
|
||||
if ((it_w_table_check->second.conditions.inf.enabled || it_w_table_check->second.conditions.neg_inf.enabled) &&
|
||||
isinf(x)) {
|
||||
hit_encountered.push_back(it_w_table_check->second.id);
|
||||
} else if (it_w_table_check->second.conditions.nan.enabled && isnan(x)) {
|
||||
hit_encountered.push_back(it_w_table_check->second.id);
|
||||
}
|
||||
++it_w_table_check;
|
||||
}
|
||||
|
||||
if (hit_encountered.size()) {
|
||||
HandleWatchpointHits(hit_encountered, name, slot, condition, watchpoint_id, current_tensor_name,
|
||||
&watchpoints_to_check_table, tensor_slot);
|
||||
hit_encountered.clear();
|
||||
}
|
||||
|
||||
if (watchpoints_to_check_table.empty()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (watchpoint_table.empty()) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
void DebugServices::HandleWatchpointHits(const std::vector<unsigned int> &hit_encountered,
|
||||
std::vector<std::string> *name, std::vector<std::string> *slot,
|
||||
std::vector<int> *condition, std::vector<unsigned int> *watchpoint_id,
|
||||
std::string current_tensor_name,
|
||||
std::unordered_map<unsigned int, watchpoint_t> *watchpoints_to_check_table,
|
||||
std::string tensor_slot) {
|
||||
for (auto it_hit_id = hit_encountered.begin(); it_hit_id != hit_encountered.end(); ++it_hit_id) {
|
||||
if (watchpoint_table.find(*it_hit_id) != watchpoint_table.end()) {
|
||||
std::string name_no_slot = current_tensor_name.substr(0, current_tensor_name.find_first_of(":"));
|
||||
name->push_back(name_no_slot);
|
||||
slot->push_back(tensor_slot);
|
||||
for (const auto &tensor : tensor_list) {
|
||||
const auto tensor_name = tensor->GetName();
|
||||
const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
|
||||
const auto tensor_slot = std::to_string(tensor->GetSlot());
|
||||
mindspore::tensor::TensorPtr tensor_ptr = tensor->GetTensor();
|
||||
int tensor_dtype = tensor_ptr->data_type_c();
|
||||
std::vector<unsigned int> hit_encountered;
|
||||
std::unordered_map<unsigned int, watchpoint_t> watchpoints_to_check_table;
|
||||
bool min_max_enabled = false;
|
||||
bool mean_sd_enabled = false;
|
||||
bool inf_nan_enabled = false;
|
||||
for (auto w_table_item : watchpoint_table) {
|
||||
auto wp = std::get<1>(w_table_item);
|
||||
|
||||
int condition_item = -1;
|
||||
if (watchpoint_table[*it_hit_id].conditions.nan.enabled) {
|
||||
condition_item = 0;
|
||||
} else if (watchpoint_table[*it_hit_id].conditions.inf.enabled ||
|
||||
watchpoint_table[*it_hit_id].conditions.neg_inf.enabled) {
|
||||
condition_item = 1;
|
||||
} else if (watchpoint_table[*it_hit_id].conditions.overflow.enabled) {
|
||||
condition_item = 2;
|
||||
// if (!wp.conditions.condition_list[IS_OVERFLOW].enabled) {
|
||||
if (wp.condition.type != IS_OVERFLOW) {
|
||||
// only overflow condition supports all data types
|
||||
if (tensor_dtype != kNumberTypeFloat && tensor_dtype != kNumberTypeFloat32) continue;
|
||||
}
|
||||
|
||||
if (wp.IsNodeIncluded(tensor_name_no_slot)) {
|
||||
min_max_enabled |= wp.min_max_enabled();
|
||||
mean_sd_enabled |= wp.mean_sd_enabled();
|
||||
inf_nan_enabled |= wp.inf_nan_enabled();
|
||||
watchpoints_to_check_table[w_table_item.second.id] = w_table_item.second;
|
||||
}
|
||||
condition->push_back(condition_item);
|
||||
watchpoint_id->push_back(*it_hit_id);
|
||||
}
|
||||
watchpoints_to_check_table->erase(*it_hit_id);
|
||||
tensor_stats stats;
|
||||
|
||||
if (min_max_enabled || mean_sd_enabled || inf_nan_enabled) {
|
||||
auto *start_addr = reinterpret_cast<float *>(tensor_ptr->data_c());
|
||||
unsigned int num_elements = (tensor_ptr->data().nbytes()) / sizeof(float);
|
||||
stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled);
|
||||
}
|
||||
|
||||
for (auto &it : watchpoints_to_check_table) {
|
||||
auto wp_id = it.second.id;
|
||||
CONDITION_TYPE enabled_condition = it.second.condition.type;
|
||||
bool hit = (enabled_condition == HAS_NAN && stats.has_nan) || (enabled_condition == HAS_INF && stats.has_inf) ||
|
||||
(enabled_condition == IS_OVERFLOW &&
|
||||
std::find(op_overflows.begin(), op_overflows.end(), tensor_name_no_slot) != op_overflows.end());
|
||||
|
||||
if (enabled_condition > 2) {
|
||||
if (stats.has_inf || stats.has_nan) {
|
||||
MS_LOG(WARNING) << "NaN or/and INF present in tensor: " << tensor_name << ". Cannot check "
|
||||
<< condition_label[enabled_condition] << " watchpoint.";
|
||||
} else {
|
||||
bool gt = stats.statLookup(enabled_condition) > it.second.condition.parameter;
|
||||
bool lt = stats.statLookup(enabled_condition) < it.second.condition.parameter;
|
||||
hit |= it.second.condition.comparison == "GT" ? gt : lt;
|
||||
}
|
||||
}
|
||||
if (hit) hit_encountered.push_back(wp_id);
|
||||
}
|
||||
|
||||
for (auto it_hit_id = hit_encountered.begin(); it_hit_id != hit_encountered.end(); ++it_hit_id) {
|
||||
if (watchpoint_table.find(*it_hit_id) != watchpoint_table.end()) {
|
||||
name->push_back(tensor_name_no_slot);
|
||||
slot->push_back(tensor_slot);
|
||||
int condition_item = watchpoint_table.find(*it_hit_id)->second.condition.type;
|
||||
condition->push_back(condition_item);
|
||||
watchpoint_id->push_back(*it_hit_id);
|
||||
}
|
||||
watchpoints_to_check_table.erase(*it_hit_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
#include <tuple>
|
||||
#include <unordered_map>
|
||||
#include <mutex>
|
||||
#include <limits>
|
||||
#include "debug/tensor_load.h"
|
||||
#include "debug/tensor_data.h"
|
||||
#include "ir/dtype.h"
|
||||
|
@ -38,39 +39,91 @@ class DebugServices {
|
|||
|
||||
~DebugServices();
|
||||
|
||||
typedef struct condition_no_param {
|
||||
bool enabled = false;
|
||||
} condition_no_param_t;
|
||||
// Kinds of watchpoint condition.
//
// The numeric values matter and must not be reordered:
//  - AddWatchpoint casts the raw integer watch_condition directly to this enum;
//  - for values above IS_OVERFLOW, odd values are "greater than" checks and even values are
//    "less than" checks (AddWatchpoint derives the comparison from the low bit);
//  - the value is used as an index into condition_label.
enum CONDITION_TYPE {
  HAS_NAN = 0,      // tensor contains a NaN
  HAS_INF = 1,      // tensor contains an Inf
  IS_OVERFLOW = 2,  // node is listed in the reported op overflows
  MAX_GT = 3,       // max > parameter
  MAX_LT = 4,       // max < parameter
  MIN_GT = 5,       // min > parameter
  MIN_LT = 6,       // min < parameter
  MAX_MIN_GT = 7,   // (max - min) > parameter
  MAX_MIN_LT = 8,   // (max - min) < parameter
  MEAN_GT = 9,      // mean > parameter
  MEAN_LT = 10,     // mean < parameter
  SD_GT = 11,       // standard deviation > parameter
  SD_LT = 12        // standard deviation < parameter
};
|
||||
|
||||
typedef struct condition_with_param {
|
||||
bool enabled = false;
|
||||
typedef struct condition {
|
||||
CONDITION_TYPE type;
|
||||
float parameter = 0;
|
||||
} condition_with_param_t;
|
||||
|
||||
typedef struct conditions {
|
||||
condition_no_param_t inf;
|
||||
condition_no_param_t neg_inf;
|
||||
condition_no_param_t nan;
|
||||
condition_no_param_t overflow;
|
||||
condition_with_param_t max_below;
|
||||
condition_with_param_t max_above;
|
||||
condition_with_param_t min_below;
|
||||
condition_with_param_t min_above;
|
||||
condition_with_param_t max_minus_min_below;
|
||||
condition_with_param_t max_minus_min_above;
|
||||
condition_with_param_t mean_below;
|
||||
condition_with_param_t mean_above;
|
||||
condition_with_param_t std_dev_below;
|
||||
condition_with_param_t std_dev_above;
|
||||
} conditions_t;
|
||||
std::string comparison;
|
||||
} condition_t;
|
||||
|
||||
typedef struct watchpoint {
|
||||
unsigned int id;
|
||||
conditions_t conditions;
|
||||
condition_t condition;
|
||||
std::vector<std::tuple<std::string, bool>> check_node_list;
|
||||
size_t location = 0;
|
||||
|
||||
bool IsNodeIncluded(const std::string &tensor_name) {
|
||||
std::string node_name = tensor_name.substr(0, tensor_name.find_first_of(':'));
|
||||
for (auto check_node : check_node_list) {
|
||||
std::string w_name = std::get<0>(check_node);
|
||||
bool w_type = std::get<1>(check_node);
|
||||
if ((w_type && (tensor_name.find(w_name) == location || w_name == "*")) || (!w_type && node_name == w_name)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool min_max_enabled() {
|
||||
return condition.type == MAX_LT || condition.type == MAX_GT || condition.type == MIN_LT ||
|
||||
condition.type == MIN_GT || condition.type == MAX_MIN_LT || condition.type == MAX_MIN_GT;
|
||||
}
|
||||
// inf or nan related condition set
|
||||
bool inf_nan_enabled() { return condition.type == HAS_INF || condition.type == HAS_NAN; }
|
||||
// mean or sd related condition set
|
||||
bool mean_sd_enabled() {
|
||||
return condition.type == MEAN_LT || condition.type == MEAN_GT || condition.type == SD_LT ||
|
||||
condition.type == SD_GT;
|
||||
}
|
||||
} watchpoint_t;
|
||||
|
||||
void AddWatchpoint(unsigned int id, unsigned int watch_condition,
|
||||
struct tensor_stats {
|
||||
float min = std::numeric_limits<float>::max();
|
||||
float max = std::numeric_limits<float>::lowest();
|
||||
bool has_inf = false;
|
||||
bool has_nan = false;
|
||||
unsigned int n = 0;
|
||||
float mean = 0.0;
|
||||
float m2 = 0.0;
|
||||
|
||||
float statLookup(CONDITION_TYPE type) const {
|
||||
if (type == MAX_GT || type == MAX_LT) return max;
|
||||
if (type == MIN_GT || type == MIN_LT) return min;
|
||||
if (type == MAX_MIN_GT || type == MAX_MIN_LT) return (max - min);
|
||||
if (type == MEAN_GT || type == MEAN_LT) return mean;
|
||||
if (type == SD_GT || type == SD_LT) return getStandardDeviation();
|
||||
return std::numeric_limits<float>::quiet_NaN();
|
||||
}
|
||||
|
||||
float getMean() const { return mean; }
|
||||
|
||||
float getVariance() const {
|
||||
if (n > 1) {
|
||||
return m2 / (n - 1);
|
||||
} else {
|
||||
return 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
float getStandardDeviation() const { return sqrt(getVariance()); }
|
||||
};
|
||||
|
||||
void AddWatchpoint(unsigned int id, unsigned int watch_condition, float parameter,
|
||||
const std::vector<std::tuple<std::string, bool>> &check_node_list);
|
||||
|
||||
void RemoveWatchpoint(unsigned int id);
|
||||
|
@ -93,14 +146,13 @@ class DebugServices {
|
|||
std::mutex lock_;
|
||||
|
||||
std::unordered_map<unsigned int, watchpoint_t> watchpoint_table;
|
||||
std::vector<std::string> condition_label = {"HAS_NAN", "HAS_INF", "IS_OVERFLOW", "MAX_GT", "MAX_LT",
|
||||
"MIN_GT", "MIN_LT", "MAX_MIN_GT", "MAX_MIN_LT", "MEAN_GT",
|
||||
"MEAN_LT", "SD_GT", "SD_LT"};
|
||||
|
||||
TensorLoader *tensor_loader_;
|
||||
|
||||
void HandleWatchpointHits(const std::vector<unsigned int> &hit_encountered, std::vector<std::string> *name,
|
||||
std::vector<std::string> *slot, std::vector<int> *condition,
|
||||
std::vector<unsigned int> *watchpoint_id, std::string current_tensor_name,
|
||||
std::unordered_map<unsigned int, watchpoint_t> *watchpoints_to_check_table,
|
||||
std::string tensor_slot);
|
||||
static tensor_stats SummarizeTensor(const float *start, unsigned int n, bool need_min_max, bool need_mean_sd);
|
||||
};
|
||||
} // namespace mindspore
|
||||
|
||||
|
|
|
@ -84,14 +84,19 @@ message WatchCondition {
|
|||
nan = 0;
|
||||
inf = 1;
|
||||
overflow = 2;
|
||||
ge = 3; // greater than and equal to
|
||||
gt = 4; // greater than
|
||||
le = 5; // less than and equal to
|
||||
lt = 6; // less than
|
||||
between = 7; // between
|
||||
max_gt = 3;
|
||||
max_lt = 4;
|
||||
min_gt = 5;
|
||||
min_lt = 6;
|
||||
max_min_gt = 7;
|
||||
max_min_lt = 8;
|
||||
mean_gt = 9;
|
||||
mean_lt = 10;
|
||||
sd_gt = 11;
|
||||
sd_lt = 12;
|
||||
}
|
||||
Condition condition = 1;
|
||||
repeated float value = 2; // for between condition, there will be two values
|
||||
float value = 2; // for between condition, there will be two values
|
||||
repeated bool include = 3; // for between condition, define the value is included or not
|
||||
}
|
||||
|
||||
|
|
|
@ -493,7 +493,7 @@ void Debugger::SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCon
|
|||
[](WatchNode node) -> std::tuple<std::string, bool> {
|
||||
return make_tuple(node.node_name(), node.node_type() == "scope");
|
||||
});
|
||||
debug_services_->AddWatchpoint(id, condition.condition(), check_node_list);
|
||||
debug_services_->AddWatchpoint(id, condition.condition(), condition.value(), check_node_list);
|
||||
}
|
||||
|
||||
void Debugger::RemoveWatchpoint(const int32_t id) { debug_services_->RemoveWatchpoint(id); }
|
||||
|
|
Loading…
Reference in New Issue