!21389 Add Tensor Base and Stat info to offline debugger

Merge pull request !21389 from parastooashtari/tensor_info_levels
This commit is contained in:
i-robot 2021-08-25 02:08:34 +00:00 committed by Gitee
commit 2edaba38bf
11 changed files with 1226 additions and 154 deletions

View File

@ -131,6 +131,30 @@ std::unique_ptr<ITensorSummary> GetSummaryPtr(const std::shared_ptr<TensorData>
}
}
// Computes statistics for a single tensor (min/max/avg, NaN/Inf/zero counts,
// element count, dtype, shape). Returns a default-constructed TensorStat when
// the tensor is null or when no summary object can be created for its dtype.
DebugServices::TensorStat DebugServices::GetTensorStatistics(const std::shared_ptr<TensorData> &tensor) {
  if (tensor == nullptr) {
    MS_LOG(WARNING) << "Tensor is nullptr, returning empty tensor statistics.";
    return TensorStat();
  }
  // No previous-iteration tensor is needed for pure statistics, so pass nullptr.
  auto summary = GetSummaryPtr(tensor, nullptr, tensor->GetNumElements(), tensor->GetType());
  if (summary == nullptr) {
    MS_LOG(WARNING) << "base_summary_ptr is nullptr, returning empty tensor statistics.";
    return TensorStat();
  }
  // Populate the summary's statistic fields before reading them out.
  summary->TensorStatistics(tensor->GetType());
  return TensorStat(tensor->GetByteSize(), tensor->GetType(), tensor->GetShape(), summary->is_bool(),
                    summary->max_value(), summary->min_value(), summary->avg_value(), summary->count(),
                    summary->neg_zero_count(), summary->pos_zero_count(), summary->nan_count(),
                    summary->neg_inf_count(), summary->pos_inf_count(), summary->zero_count());
}
#ifdef OFFLINE_DBG_MODE
void *DebugServices::GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed) {
void *previous_tensor_ptr = nullptr;
@ -317,7 +341,11 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *const name, std::
MS_LOG(INFO) << "tensor list size: " << tensor_list_size;
if (tensor_list_size == 0) return;
// default value for number of threads
const int max_thread_num = 32;
const int default_thread_num = 32;
int max_thread_num = default_thread_num;
if (max_thread_num > tensor_list_size) {
max_thread_num = tensor_list_size;
}
MS_LOG(INFO) << "Number of threads used for checkwatchpoint: " << max_thread_num;
int chunk_size = tensor_list_size / max_thread_num;
int remainder = tensor_list_size % max_thread_num;
@ -757,78 +785,100 @@ void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std:
std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
// search files in dir for the one that meets the filename prefix and read the file into memory
std::vector<char> *buffer = NULL;
std::string type_name = "";
std::vector<int64_t> shape;
uint64_t data_size = 0;
if (is_sync_mode_) {
std::string abspath = RealPath(specific_dump_dir);
DIR *d = opendir(abspath.c_str());
bool found_file = false;
std::vector<std::string> matched_paths;
if (d == nullptr) {
MS_LOG(ERROR) << "Directory " << specific_dump_dir << " does not exist!";
} else {
struct dirent *dir = nullptr;
while ((dir = readdir(d)) != NULL) {
if (dir->d_type == DT_REG) {
std::string file_name = dir->d_name;
std::string stripped_file_name = GetStrippedFilename(file_name);
if (stripped_file_name.empty()) {
continue;
}
std::size_t found = stripped_file_name.rfind(prefix_dump_file_name, 0);
if (found != 0) {
continue;
}
std::string full_path = specific_dump_dir + "/" + file_name;
matched_paths.push_back(full_path);
found_file = true;
}
}
(void)closedir(d);
}
if (found_file) {
shape.clear();
std::string result_path = GetNewestFilePath(matched_paths);
ReadTensorFromNpy(result_path, &type_name, &data_size, &shape, &buffer);
AddToTensorData(backend_name[i], slot[i], iteration[i], device_id[i], root_graph_id[i], is_output[i], data_size,
type_name, shape, buffer, result_list);
} else {
AddToTensorData(backend_name[i], slot[i], iteration[i], device_id[i], root_graph_id[i], is_output[i], 0,
type_name, shape, buffer, result_list);
MS_LOG(INFO) << "Target tensor has not been found.";
}
ReadDumpedTensorSync(prefix_dump_file_name, specific_dump_dir, backend_name[i], slot[i], device_id[i],
iteration[i], root_graph_id[i], is_output[i], result_list);
} else {
bool found = false;
std::vector<std::string> matched_paths;
// if async mode
for (const std::string &file_path : async_file_pool) {
if (file_path.find(specific_dump_dir) != std::string::npos &&
file_path.find(prefix_dump_to_check) != std::string::npos &&
file_path.find(slot_string_to_check) != std::string::npos) {
matched_paths.push_back(file_path);
found = true;
}
}
if (found) {
shape.clear();
std::string result_path = GetNewestFilePath(matched_paths);
ReadTensorFromNpy(result_path, &type_name, &data_size, &shape, &buffer);
AddToTensorData(backend_name[i], slot[i], iteration[i], device_id[i], root_graph_id[i], is_output[i], data_size,
type_name, shape, buffer, result_list);
} else {
// If no npy file is found, add empty tensor data.
AddToTensorData(backend_name[i], slot[i], iteration[i], device_id[i], root_graph_id[i], is_output[i], 0,
type_name, shape, buffer, result_list);
MS_LOG(INFO) << "Target tensor has not been found.";
}
ReadDumpedTensorAsync(specific_dump_dir, prefix_dump_to_check, slot_string_to_check, backend_name[i], slot[i],
device_id[i], iteration[i], root_graph_id[i], is_output[i], async_file_pool, result_list);
}
}
}
// Reads a dumped tensor in synchronous mode: scans the dump directory for
// regular files whose stripped name starts with prefix_dump_file_name, loads
// the newest matching npy file, and appends the tensor to result_list.
//
// On any failure (missing directory or no matching file) an EMPTY tensor entry
// is still appended so result_list stays aligned one-to-one with the requests
// made by the caller (matching the async path and the pre-refactor behavior).
void DebugServices::ReadDumpedTensorSync(const std::string &prefix_dump_file_name, const std::string &specific_dump_dir,
                                         const std::string &backend_name, size_t slot, unsigned int device_id,
                                         unsigned int iteration, unsigned int root_graph_id, const bool &is_output,
                                         std::vector<std::shared_ptr<TensorData>> *result_list) {
  std::vector<char> *buffer = nullptr;
  std::string type_name = "";
  std::vector<int64_t> shape;
  uint64_t data_size = 0;
  std::string abspath = RealPath(specific_dump_dir);
  std::vector<std::string> matched_paths;
  DIR *d = opendir(abspath.c_str());
  if (d == nullptr) {
    MS_LOG(ERROR) << "Directory " << specific_dump_dir << " does not exist!";
    // BUGFIX: still append an empty entry; an early return here would drop the
    // per-request slot in result_list and desynchronize it from the caller.
    AddToTensorData(backend_name, slot, iteration, device_id, root_graph_id, is_output, 0, type_name, shape, buffer,
                    result_list);
    return;
  }
  struct dirent *dir = nullptr;
  while ((dir = readdir(d)) != nullptr) {
    if (dir->d_type != DT_REG) {
      continue;
    }
    std::string file_name = dir->d_name;
    std::string stripped_file_name = GetStrippedFilename(file_name);
    if (stripped_file_name.empty()) {
      continue;
    }
    // Keep only files whose stripped name begins with the expected prefix.
    if (stripped_file_name.rfind(prefix_dump_file_name, 0) != 0) {
      continue;
    }
    matched_paths.push_back(specific_dump_dir + "/" + file_name);
  }
  // Close the directory handle as soon as the scan is done.
  (void)closedir(d);
  if (!matched_paths.empty()) {
    std::string result_path = GetNewestFilePath(matched_paths);
    // ReadTensorFromNpy fills type_name/data_size/shape and allocates buffer
    // (ownership presumably transferred via AddToTensorData — TODO confirm).
    ReadTensorFromNpy(result_path, &type_name, &data_size, &shape, &buffer);
    AddToTensorData(backend_name, slot, iteration, device_id, root_graph_id, is_output, data_size, type_name, shape,
                    buffer, result_list);
  } else {
    AddToTensorData(backend_name, slot, iteration, device_id, root_graph_id, is_output, 0, type_name, shape, buffer,
                    result_list);
    MS_LOG(INFO) << "Target tensor has not been found.";
  }
}
// Reads a dumped tensor in asynchronous mode: selects files from the
// pre-collected async_file_pool that match the dump directory, node-name
// prefix, and slot string, then loads the newest match into result_list.
// When nothing matches, an empty tensor entry is appended instead.
void DebugServices::ReadDumpedTensorAsync(const std::string &specific_dump_dir, const std::string &prefix_dump_to_check,
                                          const std::string &slot_string_to_check, const std::string &backend_name,
                                          size_t slot, unsigned int device_id, unsigned int iteration,
                                          unsigned int root_graph_id, const bool &is_output,
                                          const std::vector<std::string> &async_file_pool,
                                          std::vector<std::shared_ptr<TensorData>> *result_list) {
  std::vector<char> *buffer = nullptr;
  std::string type_name = "";
  std::vector<int64_t> shape;
  uint64_t data_size = 0;
  // Gather every pooled file belonging to this dump dir, prefix and slot.
  std::vector<std::string> matched_paths;
  for (const std::string &file_path : async_file_pool) {
    bool in_dump_dir = file_path.find(specific_dump_dir) != std::string::npos;
    bool has_prefix = file_path.find(prefix_dump_to_check) != std::string::npos;
    bool has_slot = file_path.find(slot_string_to_check) != std::string::npos;
    if (in_dump_dir && has_prefix && has_slot) {
      matched_paths.push_back(file_path);
    }
  }
  if (matched_paths.empty()) {
    // If no npy file is found, add empty tensor data.
    AddToTensorData(backend_name, slot, iteration, device_id, root_graph_id, is_output, 0, type_name, shape, buffer,
                    result_list);
    MS_LOG(INFO) << "Target tensor has not been found.";
    return;
  }
  std::string result_path = GetNewestFilePath(matched_paths);
  ReadTensorFromNpy(result_path, &type_name, &data_size, &shape, &buffer);
  AddToTensorData(backend_name, slot, iteration, device_id, root_graph_id, is_output, data_size, type_name, shape,
                  buffer, result_list);
}
std::string DebugServices::GetStrippedFilename(const std::string &file_name) {
// strip off the task_id, stream_id, and timestamp, then compare
size_t first_dot = file_name.find(".");

View File

@ -186,6 +186,45 @@ class DebugServices {
}
};
// Aggregated statistics for one tensor, produced by
// DebugServices::GetTensorStatistics. A default-constructed TensorStat
// represents "tensor not found / no data" and carries neutral sentinels.
struct TensorStat {
TensorStat(uint64_t data_size, int dtype, const std::vector<int64_t> &shape, bool is_bool, double max_value,
double min_value, double avg_value, int count, int neg_zero_count, int pos_zero_count, int nan_count,
int neg_inf_count, int pos_inf_count, int zero_count)
: data_size(data_size),
dtype(dtype),
shape(shape),
is_bool(is_bool),
max_value(max_value),
min_value(min_value),
avg_value(avg_value),
count(count),
neg_zero_count(neg_zero_count),
pos_zero_count(pos_zero_count),
nan_count(nan_count),
neg_inf_count(neg_inf_count),
pos_inf_count(pos_inf_count),
zero_count(zero_count) {}
// Default state used when statistics cannot be computed.
TensorStat() = default;
// Tensor payload size in bytes (0 when the tensor was not found).
uint64_t data_size = 0;
// Numeric dtype identifier of the tensor elements.
int dtype = 0;
std::vector<int64_t> shape = {0};
// True when the tensor dtype is boolean.
bool is_bool = false;
// Sentinels chosen so any real element updates them on first comparison.
double max_value = std::numeric_limits<double>::lowest();
double min_value = std::numeric_limits<double>::max();
// Mean over finite (non-NaN, non-Inf) elements.
double avg_value = 0.0;
// Total number of elements in the tensor.
int count = 0;
// Counts of finite negative / finite positive non-zero elements.
int neg_zero_count = 0;
int pos_zero_count = 0;
// Counts of NaN, -Inf, +Inf and exact-zero elements.
int nan_count = 0;
int neg_inf_count = 0;
int pos_inf_count = 0;
int zero_count = 0;
};
TensorStat GetTensorStatistics(const std::shared_ptr<TensorData> &tensor);
void AddWatchpoint(
unsigned int id, unsigned int watch_condition, float parameter,
const std::vector<std::tuple<std::string, bool>> &check_node_list, const std::vector<parameter_t> &parameter_list,
@ -233,6 +272,17 @@ class DebugServices {
const std::vector<std::string> &async_file_pool,
std::vector<std::shared_ptr<TensorData>> *result_list);
void ReadDumpedTensorSync(const std::string &prefix_dump_file_name, const std::string &specific_dump_dir,
const std::string &backend_name, size_t slot, unsigned int device_id,
unsigned int iteration, unsigned int root_graph_id, const bool &is_output,
std::vector<std::shared_ptr<TensorData>> *result_list);
void ReadDumpedTensorAsync(const std::string &specific_dump_dir, const std::string &prefix_dump_to_check,
const std::string &slot_string_to_check, const std::string &backend_name, size_t slot,
unsigned int device_id, unsigned int iteration, unsigned int root_graph_id,
const bool &is_output, const std::vector<std::string> &async_file_pool,
std::vector<std::shared_ptr<TensorData>> *result_list);
std::vector<std::shared_ptr<TensorData>> ReadNeededDumpedTensors(unsigned int iteration,
std::vector<std::string> *async_file_pool);

View File

@ -226,7 +226,7 @@ unsigned int GetTensorSlot(tensor_info_t info) { return info.slot; }
bool GetTensorIsOutput(tensor_info_t info) { return info.is_output; }
std::vector<tensor_data_t> DbgServices::ReadTensors(std::vector<tensor_info_t> info) {
std::vector<std::shared_ptr<TensorData>> DbgServices::ReadTensorsUtil(std::vector<tensor_info_t> info) {
for (auto i : info) {
MS_LOG(INFO) << "cpp DbgServices ReadTensor info name " << i.node_name << ", slot " << i.slot << ", iteration "
<< i.iteration << ", rank_id " << i.rank_id << ", root_graph_id " << i.root_graph_id << ", is_output "
@ -238,7 +238,6 @@ std::vector<tensor_data_t> DbgServices::ReadTensors(std::vector<tensor_info_t> i
std::vector<unsigned int> iteration;
std::vector<size_t> slot;
std::vector<std::shared_ptr<TensorData>> result_list;
std::vector<tensor_data_t> tensors_read;
std::vector<bool> is_output;
std::transform(info.begin(), info.end(), std::back_inserter(backend_name), GetTensorFullName);
@ -264,10 +263,60 @@ std::vector<tensor_data_t> DbgServices::ReadTensors(std::vector<tensor_info_t> i
MS_LOG(INFO) << "ReadTensors Took: " << ms_double.count() / 1000 << "s";
MS_LOG(INFO) << "cpp after";
return result_list;
}
std::vector<tensor_data_t> DbgServices::ReadTensors(std::vector<tensor_info_t> info) {
std::vector<tensor_data_t> tensors_read;
std::vector<std::shared_ptr<TensorData>> result_list;
result_list = ReadTensorsUtil(info);
for (auto result : result_list) {
tensor_data_t tensor_data_item(result->GetDataPtr(), result->GetByteSize(), result->GetType(), result->GetShape());
tensors_read.push_back(tensor_data_item);
}
MS_LOG(INFO) << "cpp end";
return tensors_read;
}
std::vector<TensorBaseData> DbgServices::ReadTensorsBase(std::vector<tensor_info_t> info) {
std::vector<TensorBaseData> tensors_read_base;
std::vector<std::shared_ptr<TensorData>> result_list;
result_list = ReadTensorsUtil(info);
for (auto result : result_list) {
if (!result->GetByteSize()) {
// tensor not found, adding empty tensor base.
TensorBaseData tensor_data_item(0, 0, {0});
tensors_read_base.push_back(tensor_data_item);
continue;
}
TensorBaseData tensor_data_item(result->GetByteSize(), result->GetType(), result->GetShape());
tensors_read_base.push_back(tensor_data_item);
}
return tensors_read_base;
}
std::vector<TensorStatData> DbgServices::ReadTensorsStat(std::vector<tensor_info_t> info) {
std::vector<TensorStatData> tensors_read_stat;
std::vector<std::shared_ptr<TensorData>> result_list;
result_list = ReadTensorsUtil(info);
for (auto result : result_list) {
if (!result->GetByteSize()) {
DebugServices::TensorStat tensor_statistics;
TensorStatData tensor_data_item(
tensor_statistics.data_size, tensor_statistics.dtype, tensor_statistics.shape, tensor_statistics.is_bool,
tensor_statistics.max_value, tensor_statistics.min_value, tensor_statistics.avg_value, tensor_statistics.count,
tensor_statistics.neg_zero_count, tensor_statistics.pos_zero_count, tensor_statistics.nan_count,
tensor_statistics.neg_inf_count, tensor_statistics.pos_inf_count, tensor_statistics.zero_count);
tensors_read_stat.push_back(tensor_data_item);
continue;
}
DebugServices::TensorStat tensor_statistics = debug_services_->GetTensorStatistics(result);
TensorStatData tensor_data_item(
tensor_statistics.data_size, tensor_statistics.dtype, tensor_statistics.shape, tensor_statistics.is_bool,
tensor_statistics.max_value, tensor_statistics.min_value, tensor_statistics.avg_value, tensor_statistics.count,
tensor_statistics.neg_zero_count, tensor_statistics.pos_zero_count, tensor_statistics.nan_count,
tensor_statistics.neg_inf_count, tensor_statistics.pos_inf_count, tensor_statistics.zero_count);
tensors_read_stat.push_back(tensor_data_item);
}
return tensors_read_stat;
}

View File

@ -117,6 +117,68 @@ struct tensor_data_t {
std::vector<int64_t> shape;
};
// Base metadata of one tensor (byte size, dtype id, shape) as exported to
// Python through pybind11. Plain value type: copyable, Rule of Zero.
struct TensorBaseData {
  TensorBaseData(uint64_t data_size, int dtype, const std::vector<int64_t> &shape)
      : data_size_(data_size), dtype_(dtype), shape_(shape) {}
  // Payload size in bytes (0 when the tensor was not found).
  // NOTE: dropped the meaningless top-level `const` on by-value returns
  // (clang-tidy readability-const-return-type); callers are unaffected.
  uint64_t data_size() const { return data_size_; }
  // Numeric dtype identifier.
  int dtype() const { return dtype_; }
  // Tensor dimensions.
  const std::vector<int64_t> &shape() const { return shape_; }
  uint64_t data_size_;
  int dtype_;
  std::vector<int64_t> shape_;
};
// Full statistics of one tensor as exported to Python through pybind11.
// Mirrors DebugServices::TensorStat. Plain value type: copyable, Rule of Zero.
// NOTE: dropped the meaningless top-level `const` on by-value returns
// (clang-tidy readability-const-return-type); callers are unaffected.
struct TensorStatData {
  TensorStatData(uint64_t data_size, int dtype, const std::vector<int64_t> &shape, bool is_bool, double max_value,
                 double min_value, double avg_value, int count, int neg_zero_count, int pos_zero_count, int nan_count,
                 int neg_inf_count, int pos_inf_count, int zero_count)
      : data_size_(data_size),
        dtype_(dtype),
        shape_(shape),
        is_bool_(is_bool),
        max_value_(max_value),
        min_value_(min_value),
        avg_value_(avg_value),
        count_(count),
        neg_zero_count_(neg_zero_count),
        pos_zero_count_(pos_zero_count),
        nan_count_(nan_count),
        neg_inf_count_(neg_inf_count),
        pos_inf_count_(pos_inf_count),
        zero_count_(zero_count) {}
  // Payload size in bytes.
  uint64_t data_size() const { return data_size_; }
  // Numeric dtype identifier.
  int dtype() const { return dtype_; }
  // Tensor dimensions.
  const std::vector<int64_t> &shape() const { return shape_; }
  // True when the tensor dtype is boolean.
  bool is_bool() const { return is_bool_; }
  double max_value() const { return max_value_; }
  double min_value() const { return min_value_; }
  double avg_value() const { return avg_value_; }
  // Total element count.
  int count() const { return count_; }
  // Counts of negative / positive non-zero finite elements.
  int neg_zero_count() const { return neg_zero_count_; }
  int pos_zero_count() const { return pos_zero_count_; }
  // Counts of NaN, -Inf, +Inf and exact-zero elements.
  int nan_count() const { return nan_count_; }
  int neg_inf_count() const { return neg_inf_count_; }
  int pos_inf_count() const { return pos_inf_count_; }
  int zero_count() const { return zero_count_; }
  uint64_t data_size_;
  int dtype_;
  std::vector<int64_t> shape_;
  bool is_bool_;
  double max_value_;
  double min_value_;
  double avg_value_;
  int count_;
  int neg_zero_count_;
  int pos_zero_count_;
  int nan_count_;
  int neg_inf_count_;
  int pos_inf_count_;
  int zero_count_;
};
class DbgServices {
private:
DebugServices *debug_services_;
@ -141,8 +203,14 @@ class DbgServices {
std::vector<watchpoint_hit_t> CheckWatchpoints(unsigned int iteration);
std::vector<std::shared_ptr<TensorData>> ReadTensorsUtil(std::vector<tensor_info_t> info);
std::vector<tensor_data_t> ReadTensors(std::vector<tensor_info_t> info);
std::vector<TensorBaseData> ReadTensorsBase(std::vector<tensor_info_t> info);
std::vector<TensorStatData> ReadTensorsStat(std::vector<tensor_info_t> info);
std::string GetVersion();
};

View File

@ -27,6 +27,8 @@ PYBIND11_MODULE(_mindspore_offline_debug, m) {
.def("RemoveWatchpoint", &DbgServices::RemoveWatchpoint)
.def("CheckWatchpoints", &DbgServices::CheckWatchpoints)
.def("ReadTensors", &DbgServices::ReadTensors)
.def("ReadTensorsBase", &DbgServices::ReadTensorsBase)
.def("ReadTensorsStat", &DbgServices::ReadTensorsStat)
.def("GetVersion", &DbgServices::GetVersion);
py::class_<parameter_t>(m, "parameter")
@ -63,4 +65,28 @@ PYBIND11_MODULE(_mindspore_offline_debug, m) {
.def("get_data_size", &tensor_data_t::get_data_size)
.def("get_dtype", &tensor_data_t::get_dtype)
.def("get_shape", &tensor_data_t::get_shape);
py::class_<TensorBaseData>(m, "TensorBaseData")
.def(py::init<uint64_t, int, std::vector<int64_t>>())
.def("data_size", &TensorBaseData::data_size)
.def("dtype", &TensorBaseData::dtype)
.def("shape", &TensorBaseData::shape);
py::class_<TensorStatData>(m, "TensorStatData")
.def(
py::init<uint64_t, int, std::vector<int64_t>, bool, double, double, double, int, int, int, int, int, int, int>())
.def("data_size", &TensorStatData::data_size)
.def("dtype", &TensorStatData::dtype)
.def("shape", &TensorStatData::shape)
.def("is_bool", &TensorStatData::is_bool)
.def("max_value", &TensorStatData::max_value)
.def("min_value", &TensorStatData::min_value)
.def("avg_value", &TensorStatData::avg_value)
.def("count", &TensorStatData::count)
.def("neg_zero_count", &TensorStatData::neg_zero_count)
.def("pos_zero_count", &TensorStatData::pos_zero_count)
.def("nan_count", &TensorStatData::nan_count)
.def("neg_inf_count", &TensorStatData::neg_inf_count)
.def("pos_inf_count", &TensorStatData::pos_inf_count)
.def("zero_count", &TensorStatData::zero_count);
}

View File

@ -20,6 +20,7 @@
#include <memory>
#include <bitset>
#include <tuple>
#include <type_traits>
#include "debug/debugger/tensor_summary.h"
#ifdef OFFLINE_DBG_MODE
@ -92,39 +93,45 @@ double VarianceAndMeanCalculator::GetStandardDeviation() { return sqrt(GetVarian
template <typename T>
TensorSummary<T>::TensorSummary(void *current_tensor_ptr, void *const previous_tensor_ptr, uint32_t num_elements)
: current_tensor_ptr(reinterpret_cast<T *>(current_tensor_ptr)),
prev_tensor_ptr(reinterpret_cast<T *>(previous_tensor_ptr)),
num_elements(num_elements),
min(std::numeric_limits<double>::max()),
max(std::numeric_limits<double>::lowest()),
inf_count(0),
nan_count(0),
zero_count(0),
epsilon(1.0e-9),
mean_sd_cal_enabled(false) {}
: current_tensor_ptr_(reinterpret_cast<T *>(current_tensor_ptr)),
prev_tensor_ptr_(reinterpret_cast<T *>(previous_tensor_ptr)),
num_elements_(num_elements),
min_(std::numeric_limits<double>::max()),
max_(std::numeric_limits<double>::lowest()),
avg_(0.0),
is_bool_(false),
neg_zero_count_(0),
pos_zero_count_(0),
pos_inf_count_(0),
neg_inf_count_(0),
inf_count_(0),
nan_count_(0),
zero_count_(0),
epsilon_(1.0e-9),
mean_sd_cal_enabled_(false) {}
template <typename T>
void TensorSummary<T>::SummarizeTensor(const std::vector<DebugServices::watchpoint_t> &wps) {
InitCalculators(wps);
for (size_t i = 0; i < num_elements; ++i) {
auto current_value = static_cast<double>(current_tensor_ptr[i]);
for (size_t i = 0; i < num_elements_; ++i) {
auto current_value = static_cast<double>(current_tensor_ptr_[i]);
double previous_value =
prev_tensor_ptr ? static_cast<double>(prev_tensor_ptr[i]) : std::numeric_limits<double>::quiet_NaN();
inf_count += std::isinf(current_value);
nan_count += std::isnan(current_value);
zero_count += (current_value == 0);
max = std::max(max, current_value);
min = std::min(min, current_value);
if (mean_sd_cal_enabled) {
current_mean_variance.ProcessElement(current_value);
prev_tensor_ptr_ ? static_cast<double>(prev_tensor_ptr_[i]) : std::numeric_limits<double>::quiet_NaN();
inf_count_ += std::isinf(current_value);
nan_count_ += std::isnan(current_value);
zero_count_ += (current_value == 0);
max_ = std::max(max_, current_value);
min_ = std::min(min_, current_value);
if (mean_sd_cal_enabled_) {
current_mean_variance_.ProcessElement(current_value);
}
for (auto &it : all_close) {
for (auto &it : all_close_) {
it.second->ProcessElement(current_value, previous_value);
}
for (auto &range_count : range_counts) {
for (auto &range_count : range_counts_) {
range_count.second->ProcessElement(current_value);
}
for (auto &mean : means) {
for (auto &mean : means_) {
if (mean.first == "curr_prev_diff_mean") {
mean.second->ProcessElement(std::abs(current_value - previous_value));
} else if (mean.first == "abs_prev_mean") {
@ -136,6 +143,39 @@ void TensorSummary<T>::SummarizeTensor(const std::vector<DebugServices::watchpoi
}
}
template <typename T>
// Computes statistics over the current tensor buffer: +/-Inf, NaN and zero
// counts; sign counts, min/max and mean over finite elements; and sets
// is_bool_ when the dtype is DT_BOOL.
// NOTE(review): the counters are accumulated (`+=`) on top of any existing
// member state rather than reset here — callers appear to use a fresh
// TensorSummary per call; confirm before reusing an instance.
void TensorSummary<T>::TensorStatistics(DbgDataType dtype_value) {
  if (dtype_value == DT_BOOL) {
    is_bool_ = true;
  }
  double sum_elements = 0.0;
  for (size_t i = 0; i < num_elements_; ++i) {
    auto current_value = static_cast<double>(current_tensor_ptr_[i]);
    if (std::isinf(current_value)) {
      if (current_value > 0) {
        pos_inf_count_ += 1;
      } else {
        neg_inf_count_ += 1;
      }
    }
    zero_count_ += (current_value == 0);
    nan_count_ += std::isnan(current_value);
    if (!(std::isnan(current_value) || std::isinf(current_value))) {
      // only considering tensor elements with value
      if (std::signbit(current_value) && !(current_value == 0)) {
        neg_zero_count_ += 1;
      } else if (!(current_value == 0)) {
        pos_zero_count_ += 1;
      }
      max_ = std::max(max_, current_value);
      min_ = std::min(min_, current_value);
      sum_elements += current_value;
    }
  }
  // value_count is the number of finite elements (zeros + negatives + positives).
  int value_count = zero_count_ + neg_zero_count_ + pos_zero_count_;
  // BUGFIX: guard the division — with an empty tensor or one containing only
  // NaN/Inf elements, value_count is 0 and the mean would become NaN/Inf.
  avg_ = (value_count > 0) ? (sum_elements / value_count) : 0.0;
}
template <typename T>
std::tuple<bool, int, std::vector<DebugServices::parameter_t>> TensorSummary<T>::IsWatchpointHit(
DebugServices::watchpoint_t wp) {
@ -145,24 +185,24 @@ std::tuple<bool, int, std::vector<DebugServices::parameter_t>> TensorSummary<T>:
std::bitset<bit_size> error_code;
CONDITION_TYPE type = wp.condition.type;
// bit 0 denotes presence of nan
error_code.set(0, nan_count > 0);
error_code.set(0, nan_count_ > 0);
// bit 1 denotes presence of inf
error_code.set(1, inf_count > 0);
error_code.set(1, inf_count_ > 0);
if (type == CONDITION_TYPE::HAS_NAN) {
error_code.reset();
hit = nan_count > 0;
hit = nan_count_ > 0;
} else if (type == CONDITION_TYPE::HAS_INF) {
error_code.reset();
hit = inf_count > 0;
hit = inf_count_ > 0;
} else if (type == CONDITION_TYPE::GENERAL_OVERFLOW) {
error_code.reset();
hit = (nan_count + inf_count) > 0;
} else if (type == CONDITION_TYPE::NOT_CHANGED && prev_tensor_ptr && error_code.none()) {
hit = all_close[wp.id]->IsAllClose();
hit = (nan_count_ + inf_count_) > 0;
} else if (type == CONDITION_TYPE::NOT_CHANGED && prev_tensor_ptr_ && error_code.none()) {
hit = all_close_[wp.id]->IsAllClose();
} else if ((type == CONDITION_TYPE::NOT_CHANGED || type == CONDITION_TYPE::CHANGE_TOO_LARGE ||
type == CONDITION_TYPE::CHANGE_TOO_SMALL) &&
!prev_tensor_ptr) {
!prev_tensor_ptr_) {
// bit 2 denotes absence of previous tensor
error_code.set(2, true);
}
@ -196,26 +236,26 @@ double_t TensorSummary<T>::StatLookup(const std::string &parameter_name, const D
}
if (param_type == "max") {
return max;
return max_;
} else if (param_type == "min") {
return min;
return min_;
} else if (param_type == "max_min") {
return max - min;
return max_ - min_;
} else if (param_type == "mean") {
return current_mean_variance.GetMean();
return current_mean_variance_.GetMean();
} else if (param_type == "sd") {
return current_mean_variance.GetStandardDeviation();
return current_mean_variance_.GetStandardDeviation();
} else if (param_type == "abs_mean") {
if (means.find("abs_current_mean") != means.end()) {
return means["abs_current_mean"]->GetMean();
if (means_.find("abs_current_mean") != means_.end()) {
return means_["abs_current_mean"]->GetMean();
}
} else if (param_type == "abs_mean_update_ratio" && prev_tensor_ptr) {
if (means.find("curr_prev_diff_mean") != means.end() && means.find("abs_prev_mean") != means.end()) {
return means["curr_prev_diff_mean"]->GetMean() / (means["abs_prev_mean"]->GetMean() + epsilon);
} else if (param_type == "abs_mean_update_ratio" && prev_tensor_ptr_) {
if (means_.find("curr_prev_diff_mean") != means_.end() && means_.find("abs_prev_mean") != means_.end()) {
return means_["curr_prev_diff_mean"]->GetMean() / (means_["abs_prev_mean"]->GetMean() + epsilon_);
}
} else if (param_type == "range_percentage") {
if (range_counts.find(wp.id) != range_counts.end()) {
return range_counts[wp.id]->GetPercentInRange();
if (range_counts_.find(wp.id) != range_counts_.end()) {
return range_counts_[wp.id]->GetPercentInRange();
}
} else if (param_type == "zero_percentage") {
return GetZeroValPercent();
@ -227,54 +267,54 @@ template <typename T>
double_t TensorSummary<T>::StatLookup(const DebugServices::watchpoint_t &wp) {
CONDITION_TYPE type = wp.condition.type;
if (type == CONDITION_TYPE::MAX_LT || type == CONDITION_TYPE::MAX_GT) {
return max;
return max_;
} else if (type == CONDITION_TYPE::MIN_LT || type == CONDITION_TYPE::MIN_GT) {
return min;
return min_;
} else if (type == CONDITION_TYPE::MEAN_LT || type == CONDITION_TYPE::MEAN_GT) {
return current_mean_variance.GetMean();
return current_mean_variance_.GetMean();
} else if (type == CONDITION_TYPE::SD_LT || type == CONDITION_TYPE::SD_GT) {
return current_mean_variance.GetStandardDeviation();
return current_mean_variance_.GetStandardDeviation();
} else if (type == CONDITION_TYPE::MAX_MIN_GT || type == CONDITION_TYPE::MAX_MIN_LT) {
return max - min;
return max_ - min_;
}
return std::numeric_limits<double_t>::quiet_NaN();
}
template <typename T>
double_t TensorSummary<T>::GetZeroValPercent() {
if (num_elements == 0) {
if (num_elements_ == 0) {
return 0;
}
return (zero_count * 100.0) / num_elements;
return (zero_count_ * 100.0) / num_elements_;
}
template <typename T>
void TensorSummary<T>::InitCalculators(const std::vector<DebugServices::watchpoint_t> &wps) {
for (auto &wp : wps) {
auto wp_id = wp.id;
mean_sd_cal_enabled = mean_sd_cal_enabled || wp.mean_sd_enabled();
if (wp.allclose_enabled() && prev_tensor_ptr) {
all_close[wp_id] = std::make_unique<AllCloseCalculator>();
mean_sd_cal_enabled_ = mean_sd_cal_enabled_ || wp.mean_sd_enabled();
if (wp.allclose_enabled() && prev_tensor_ptr_) {
all_close_[wp_id] = std::make_unique<AllCloseCalculator>();
if (!wp.parameter_list[0].disabled) {
all_close[wp_id]->set_atol(wp.parameter_list[0].value);
all_close_[wp_id]->set_atol(wp.parameter_list[0].value);
}
if (!wp.parameter_list[1].disabled) {
all_close[wp_id]->set_rtol(wp.parameter_list[1].value);
all_close_[wp_id]->set_rtol(wp.parameter_list[1].value);
}
} else if (wp.range_enabled()) {
range_counts[wp_id] = std::make_unique<RangeCountCalculator>();
range_counts_[wp_id] = std::make_unique<RangeCountCalculator>();
if (!wp.parameter_list[0].disabled) {
range_counts[wp_id]->set_range_start_inclusive(wp.parameter_list[0].value);
range_counts_[wp_id]->set_range_start_inclusive(wp.parameter_list[0].value);
}
if (!wp.parameter_list[1].disabled) {
range_counts[wp_id]->set_range_end_inclusive(wp.parameter_list[1].value);
range_counts_[wp_id]->set_range_end_inclusive(wp.parameter_list[1].value);
}
} else if (wp.tensor_update_ratio_mean_enabled() && prev_tensor_ptr) {
means.insert({"curr_prev_diff_mean", std::make_unique<MeanCalculator>()});
means.insert({"abs_prev_mean", std::make_unique<MeanCalculator>()});
} else if (wp.tensor_update_ratio_mean_enabled() && prev_tensor_ptr_) {
means_.insert({"curr_prev_diff_mean", std::make_unique<MeanCalculator>()});
means_.insert({"abs_prev_mean", std::make_unique<MeanCalculator>()});
} else if (wp.abs_mean_enabled()) {
means.insert({"abs_current_mean", std::make_unique<MeanCalculator>()});
means_.insert({"abs_current_mean", std::make_unique<MeanCalculator>()});
}
}
}

View File

@ -92,6 +92,18 @@ class ITensorSummary {
virtual void SummarizeTensor(const std::vector<DebugServices::watchpoint_t> &) = 0;
virtual std::tuple<bool, int32_t, std::vector<DebugServices::parameter_t>> IsWatchpointHit(
DebugServices::watchpoint_t) = 0;
virtual void TensorStatistics(DbgDataType) = 0;
virtual const bool is_bool() const = 0;
virtual const double max_value() const = 0;
virtual const double min_value() const = 0;
virtual const double avg_value() const = 0;
virtual const int count() const = 0;
virtual const int neg_zero_count() const = 0;
virtual const int pos_zero_count() const = 0;
virtual const int nan_count() const = 0;
virtual const int neg_inf_count() const = 0;
virtual const int pos_inf_count() const = 0;
virtual const int zero_count() const = 0;
};
template <typename T>
@ -103,22 +115,40 @@ class TensorSummary : public ITensorSummary {
void SummarizeTensor(const std::vector<DebugServices::watchpoint_t> &) override;
// returns hit, error_code, parameter_list
std::tuple<bool, int, std::vector<DebugServices::parameter_t>> IsWatchpointHit(DebugServices::watchpoint_t) override;
void TensorStatistics(DbgDataType) override;
const bool is_bool() const override { return is_bool_; }
const double max_value() const override { return max_; }
const double min_value() const override { return min_; }
const double avg_value() const override { return avg_; }
const int count() const override { return num_elements_; }
const int neg_zero_count() const override { return neg_zero_count_; }
const int pos_zero_count() const override { return pos_zero_count_; }
const int nan_count() const override { return nan_count_; }
const int neg_inf_count() const override { return neg_inf_count_; }
const int pos_inf_count() const override { return pos_inf_count_; }
const int zero_count() const override { return zero_count_; }
private:
T *current_tensor_ptr;
T *prev_tensor_ptr;
uint32_t num_elements;
double min;
double max;
uint32_t inf_count;
uint32_t nan_count;
uint32_t zero_count;
double epsilon;
bool mean_sd_cal_enabled;
VarianceAndMeanCalculator current_mean_variance;
std::unordered_map<std::string, std::unique_ptr<MeanCalculator>> means;
std::unordered_map<uint32_t, std::unique_ptr<AllCloseCalculator>> all_close;
std::unordered_map<uint32_t, std::unique_ptr<RangeCountCalculator>> range_counts;
T *current_tensor_ptr_;
T *prev_tensor_ptr_;
uint32_t num_elements_;
double min_;
double max_;
double avg_;
bool is_bool_;
uint32_t neg_zero_count_;
uint32_t pos_zero_count_;
uint32_t pos_inf_count_;
uint32_t neg_inf_count_;
uint32_t inf_count_;
uint32_t nan_count_;
uint32_t zero_count_;
double epsilon_;
bool mean_sd_cal_enabled_;
VarianceAndMeanCalculator current_mean_variance_;
std::unordered_map<std::string, std::unique_ptr<MeanCalculator>> means_;
std::unordered_map<uint32_t, std::unique_ptr<AllCloseCalculator>> all_close_;
std::unordered_map<uint32_t, std::unique_ptr<RangeCountCalculator>> range_counts_;
double_t StatLookup(const DebugServices::watchpoint_t &);
double_t StatLookup(const std::string &, const DebugServices::watchpoint_t &);
double_t GetZeroValPercent();

View File

@ -17,7 +17,10 @@ The module DbgServices provides offline debugger APIs.
"""
import mindspore._mindspore_offline_debug as cds
from mindspore.offline_debug.mi_validators import check_init, check_initialize, check_add_watchpoint, check_remove_watchpoint, check_check_watchpoints, check_read_tensors, check_initialize_done, check_tensor_info_init, check_tensor_data_init, check_watchpoint_hit_init, check_parameter_init
from mindspore.offline_debug.mi_validators import check_init, check_initialize, check_add_watchpoint,\
check_remove_watchpoint, check_check_watchpoints, check_read_tensor_info, check_initialize_done, \
check_tensor_info_init, check_tensor_data_init, check_tensor_base_data_init, check_tensor_stat_data_init,\
check_watchpoint_hit_init, check_parameter_init
from mindspore.offline_debug.mi_validator_helpers import replace_minus_one
@ -238,7 +241,7 @@ class DbgServices():
return watchpoint_hit_list
@check_initialize_done
@check_read_tensors
@check_read_tensor_info
def read_tensors(self, info):
"""
Returning tensor data object describing the tensor requested tensor.
@ -277,6 +280,83 @@ class DbgServices():
tensor_data_list_ret.append(tensor_data)
return tensor_data_list_ret
@check_initialize_done
@check_read_tensor_info
def read_tensor_base(self, info):
    """
    Return base data (size in bytes, dtype encoding, shape) for each requested tensor.

    Args:
        info (list): List of TensorInfo objects identifying the tensors to query.

    Returns:
        list, TensorBaseData objects, one per entry of ``info``.

    Examples:
        >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
        >>> d = dbg_services.DbgServices(dump_file_path="dump_file_path", verbose=True)
        >>> d_init = d.initialize(is_sync_mode=True)
        >>> tensor_base_data_list = d_init.read_tensor_base(
        ...     [dbg_services.TensorInfo(node_name="conv2.bias", slot=0, iteration=8,
        ...                              rank_id=5, root_graph_id=0, is_output=True)])
    """
    log("in Python ReadTensorsBase info ", info)
    raw_infos = []
    for item in info:
        log("in Python ReadTensorsBase info ", info)
        raw_infos.append(item.instance)
    # Query the C++ backend, then wrap each result in the Python-facing class.
    results = []
    for raw in self.dbg_instance.ReadTensorsBase(raw_infos):
        results.append(TensorBaseData(raw.data_size(), raw.dtype(), raw.shape()))
    return results
@check_initialize_done
@check_read_tensor_info
def read_tensor_stats(self, info):
    """
    Return statistics (base info plus extrema, average and special-value counts)
    for each requested tensor.

    Args:
        info (list): List of TensorInfo objects identifying the tensors to query.

    Returns:
        list, TensorStatData objects, one per entry of ``info``.

    Examples:
        >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
        >>> d = dbg_services.DbgServices(dump_file_path="dump_file_path", verbose=True)
        >>> d_init = d.initialize(is_sync_mode=True)
        >>> tensor_stat_data_list = d_init.read_tensor_stats(
        ...     [dbg_services.TensorInfo(node_name="conv2.bias", slot=0, iteration=8,
        ...                              rank_id=5, root_graph_id=0, is_output=True)])
    """
    log("in Python ReadTensorsStat info ", info)
    raw_infos = []
    for item in info:
        log("in Python ReadTensorsStat info ", info)
        raw_infos.append(item.instance)
    # Query the C++ backend and rewrap every statistics record.
    results = []
    for raw in self.dbg_instance.ReadTensorsStat(raw_infos):
        results.append(TensorStatData(raw.data_size(), raw.dtype(),
                                      raw.shape(), raw.is_bool(),
                                      raw.max_value(), raw.min_value(),
                                      raw.avg_value(), raw.count(), raw.neg_zero_count(),
                                      raw.pos_zero_count(), raw.nan_count(), raw.neg_inf_count(),
                                      raw.pos_inf_count(), raw.zero_count()))
    return results
class TensorInfo():
"""
Tensor Information class.
@ -527,6 +607,406 @@ class TensorData():
return self.instance.get_shape()
class TensorBaseData():
    """
    TensorBaseData class.

    Thin Python wrapper around the pybind ``cds.TensorBaseData`` object holding
    a tensor's base information: byte size, dtype encoding and shape.

    Args:
        data_size (int): Size of data in bytes.
        dtype (int): An encoding representing the type of TensorData.
        shape (list): Shape of tensor.

    Examples:
        >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
        >>> tensor_base_data = dbg_services.TensorBaseData(data_size=4,
        ...                                                dtype=0,
        ...                                                shape=[2, 2])
    """

    @check_tensor_base_data_init
    def __init__(self, data_size, dtype, shape):
        self.instance = cds.TensorBaseData(data_size, dtype, shape)

    @property
    def data_size(self):
        """
        Size of the tensor data in bytes.

        Returns:
            int, data_size of TensorBaseData instance.
        """
        return self.instance.data_size()

    @property
    def dtype(self):
        """
        Encoded data type of the tensor.

        Returns:
            int, dtype of TensorBaseData instance.
        """
        return self.instance.dtype()

    @property
    def shape(self):
        """
        Shape of the tensor.

        Returns:
            list, shape of TensorBaseData instance.
        """
        return self.instance.shape()
class TensorStatData():
    """
    TensorStatData class.

    Thin Python wrapper around the pybind ``cds.TensorStatData`` object holding
    a tensor's base information plus element-wise statistics.

    Args:
        data_size (int): Size of data in bytes.
        dtype (int): An encoding representing the type of TensorData.
        shape (list): Shape of tensor.
        is_bool (bool): Whether the data type is bool.
        max_value (float): Maximum value in tensor's elements.
        min_value (float): Minimum value in tensor's elements.
        avg_value (float): Average value of all tensor's elements.
        count (int): Number of elements in tensor.
        neg_zero_count (int): Number of negative elements in tensor.
        pos_zero_count (int): Number of positive elements in tensor.
        nan_count (int): Number of nan elements in tensor.
        neg_inf_count (int): Number of negative infinity elements in tensor.
        pos_inf_count (int): Number of positive infinity elements in tensor.
        zero_count (int): Total number of zero elements in tensor.

    Examples:
        >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
        >>> tensor_stat_data = dbg_services.TensorStatData(
        ...     data_size=4, dtype=0, shape=[2, 2], is_bool=False,
        ...     max_value=10.0, min_value=0.0, avg_value=5.0, count=4,
        ...     neg_zero_count=0, pos_zero_count=4, nan_count=0,
        ...     neg_inf_count=0, pos_inf_count=0, zero_count=1)
    """

    @check_tensor_stat_data_init
    def __init__(self, data_size, dtype, shape, is_bool, max_value, min_value, avg_value, count,
                 neg_zero_count, pos_zero_count, nan_count, neg_inf_count, pos_inf_count, zero_count):
        self.instance = cds.TensorStatData(data_size, dtype, shape, is_bool, max_value,
                                           min_value, avg_value, count, neg_zero_count,
                                           pos_zero_count, nan_count, neg_inf_count,
                                           pos_inf_count, zero_count)

    @property
    def data_size(self):
        """
        Size of the tensor data in bytes.

        Returns:
            int, data_size of TensorStatData instance.
        """
        return self.instance.data_size()

    @property
    def dtype(self):
        """
        Encoded data type of the tensor.

        Returns:
            int, dtype of TensorStatData instance.
        """
        return self.instance.dtype()

    @property
    def shape(self):
        """
        Shape of the tensor.

        Returns:
            list, shape of TensorStatData instance.
        """
        return self.instance.shape()

    @property
    def is_bool(self):
        """
        Whether the tensor elements are of bool type.

        Returns:
            bool, True if the tensor elements are bool.
        """
        return self.instance.is_bool()

    @property
    def max_value(self):
        """
        Maximum element value.

        Returns:
            float, max_value of TensorStatData instance.
        """
        return self.instance.max_value()

    @property
    def min_value(self):
        """
        Minimum element value.

        Returns:
            float, min_value of TensorStatData instance.
        """
        return self.instance.min_value()

    @property
    def avg_value(self):
        """
        Average of all element values.

        Returns:
            float, avg_value of TensorStatData instance.
        """
        return self.instance.avg_value()

    @property
    def count(self):
        """
        Number of elements in the tensor.

        Returns:
            int, count of TensorStatData instance.
        """
        return self.instance.count()

    @property
    def neg_zero_count(self):
        """
        Number of negative elements.

        Returns:
            int, neg_zero_count of TensorStatData instance.
        """
        return self.instance.neg_zero_count()

    @property
    def pos_zero_count(self):
        """
        Number of positive elements.

        Returns:
            int, pos_zero_count of TensorStatData instance.
        """
        return self.instance.pos_zero_count()

    @property
    def zero_count(self):
        """
        Total number of zero elements.

        Returns:
            int, zero_count of TensorStatData instance.
        """
        return self.instance.zero_count()

    @property
    def nan_count(self):
        """
        Number of NaN elements.

        Returns:
            int, nan_count of TensorStatData instance.
        """
        return self.instance.nan_count()

    @property
    def neg_inf_count(self):
        """
        Number of negative-infinity elements.

        Returns:
            int, neg_inf_count of TensorStatData instance.
        """
        return self.instance.neg_inf_count()

    @property
    def pos_inf_count(self):
        """
        Number of positive-infinity elements.

        Returns:
            int, pos_inf_count of TensorStatData instance.
        """
        return self.instance.pos_inf_count()
class WatchpointHit():
"""
WatchpointHit class.
@ -583,7 +1063,7 @@ class WatchpointHit():
>>> name = watchpoint_hit.name
"""
return self.instance.get_name()
return self.instance.name()
@property
def slot(self):
@ -606,7 +1086,7 @@ class WatchpointHit():
>>> slot = watchpoint_hit.slot
"""
return self.instance.get_slot()
return self.instance.slot()
@property
def condition(self):
@ -629,7 +1109,7 @@ class WatchpointHit():
>>> condition = watchpoint_hit.condition
"""
return self.instance.get_condition()
return self.instance.condition()
@property
def watchpoint_id(self):
@ -652,7 +1132,7 @@ class WatchpointHit():
>>> watchpoint_id = watchpoint_hit.watchpoint_id
"""
return self.instance.get_watchpoint_id()
return self.instance.watchpoint_id()
@property
def parameters(self):
@ -675,7 +1155,7 @@ class WatchpointHit():
>>> parameters = watchpoint_hit.parameters
"""
params = self.instance.get_parameters()
params = self.instance.parameters()
param_list = []
for elem in params:
tmp = Parameter(elem.get_name(),

View File

@ -121,7 +121,7 @@ def check_check_watchpoints(method):
return new_method
def check_read_tensors(method):
def check_read_tensor_info(method):
"""Wrapper method to check the parameters of DbgServices ReadTensors."""
@wraps(method)
@ -189,6 +189,52 @@ def check_tensor_data_init(method):
return new_method
def check_tensor_base_data_init(method):
    """Wrapper method to check the parameters of DbgServices TensorBaseData init."""

    @wraps(method)
    def new_method(self, *args, **kwargs):
        [data_size, dtype, shape], _ = parse_user_args(method, *args, **kwargs)
        # data_size is a byte count, so it must fit in an unsigned 64-bit value.
        check_uint64(data_size, "data_size")
        type_check(dtype, (int,), "dtype")
        # Every dimension of the shape must be an int.
        type_check_list(shape, (int,), ["shape_{0}".format(i) for i in range(len(shape))])
        return method(self, *args, **kwargs)

    return new_method
def check_tensor_stat_data_init(method):
    """Wrapper method to check the parameters of DbgServices TensorStatData init."""

    @wraps(method)
    def new_method(self, *args, **kwargs):
        [data_size, dtype, shape, is_bool, max_value, min_value,
         avg_value, count, neg_zero_count, pos_zero_count,
         nan_count, neg_inf_count, pos_inf_count,
         zero_count], _ = parse_user_args(method, *args, **kwargs)
        # data_size is a byte count, so it must fit in an unsigned 64-bit value.
        check_uint64(data_size, "data_size")
        type_check(dtype, (int,), "dtype")
        # Every dimension of the shape must be an int.
        shape_names = ["shape_{0}".format(i) for i in range(len(shape))]
        type_check_list(shape, (int,), shape_names)
        type_check(is_bool, (bool,), "is_bool")
        # Statistics are reported as floats; counts are reported as ints.
        type_check(max_value, (float,), "max_value")
        type_check(min_value, (float,), "min_value")
        type_check(avg_value, (float,), "avg_value")
        type_check(count, (int,), "count")
        type_check(neg_zero_count, (int,), "neg_zero_count")
        type_check(pos_zero_count, (int,), "pos_zero_count")
        type_check(nan_count, (int,), "nan_count")
        type_check(neg_inf_count, (int,), "neg_inf_count")
        type_check(pos_inf_count, (int,), "pos_inf_count")
        type_check(zero_count, (int,), "zero_count")
        return method(self, *args, **kwargs)

    return new_method
def check_watchpoint_hit_init(method):
"""Wrapper method to check the parameters of DbgServices WatchpointHit init."""

View File

@ -0,0 +1,87 @@
-----------------------------------------------------------
tensor_info_1 attributes:
node name = Default/Add-op4
slot = 0
iteration = 0
rank_id = 0
root_graph_id = 0
is_output = True
tensor_base_info:
size in bytes = 24
debugger dtype = 11
shape = [2, 3]
tensor_stat_info:
size in bytes = 24
debugger dtype = 11
shape = [2, 3]
is_bool = False
max_value = 10.0
min_value = -11.0
avg_value = 0.880000114440918
count = 6
neg_zero_count = 2
pos_zero_count = 3
nan_count = 0
neg_inf_count = 0
pos_inf_count = 0
zero_count = 1
-----------------------------------------------------------
tensor_info_2 attributes:
node name = Default/Reciprocal-op3
slot = 0
iteration = 0
rank_id = 0
root_graph_id = 0
is_output = True
tensor_base_info:
size in bytes = 40
debugger dtype = 11
shape = [2, 5]
tensor_stat_info:
size in bytes = 40
debugger dtype = 11
shape = [2, 5]
is_bool = False
max_value = 1.0
min_value = 1.0
avg_value = 1.0
count = 10
neg_zero_count = 0
pos_zero_count = 2
nan_count = 0
neg_inf_count = 3
pos_inf_count = 5
zero_count = 0
-----------------------------------------------------------
tensor_info_3 attributes:
node name = Default/network-WithLossCell/_backbone-MockModel/ReduceMean-op92
slot = 0
iteration = 0
rank_id = 0
root_graph_id = 0
is_output = True
tensor_base_info:
size in bytes = 20
debugger dtype = 11
shape = [5]
tensor_stat_info:
size in bytes = 20
debugger dtype = 11
shape = [5]
is_bool = False
max_value = 1.9901361465454102
min_value = -2.175431728363037
avg_value = -0.6648297309875488
count = 5
neg_zero_count = 2
pos_zero_count = 1
nan_count = 2
neg_inf_count = 0
pos_inf_count = 0
zero_count = 0

View File

@ -0,0 +1,146 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Read tensor base and statistics test script for offline debugger APIs.
"""
import tempfile
import os
import shutil
import numpy as np
import mindspore.offline_debug.dbg_services as d
from dump_test_utils import compare_actual_with_expected
GENERATE_GOLDEN = False
test_name = "sync_read_tensors_base_stat"
def test_sync_read_tensors_base_stat():
    """Exercise read_tensor_base/read_tensor_stats over normal, inf and nan tensors.

    Builds three synthetic dump trees, reads base and statistics info for one
    tensor from each, writes the result via print_read_tensors and compares it
    against the golden file.
    """
    value_tensor = np.array([[7.5, 8.56, -9.78], [10.0, -11.0, 0.0]], np.float32)
    inf_tensor = np.array([[1., -np.inf, np.inf, -np.inf, np.inf], [np.inf, 1., -np.inf, np.inf, np.inf]], np.float32)
    nan_tensor = np.array([-2.1754317, 1.9901361, np.nan, np.nan, -1.8091936], np.float32)
    value_path = build_dump_file_structure(value_tensor, "Add", "Add.Add-op4.0.0.")
    inf_path = build_dump_file_structure(inf_tensor, "Inf", "Reciprocal.Reciprocal-op3.0.0.")
    nan_path = build_dump_file_structure(nan_tensor, "Nan", "ReduceMean.ReduceMean-op92.0.0.")

    # Clean up the temporary dump trees even if the debugger calls below raise,
    # so a failing run does not leave directories behind.
    try:
        debugger_backend = d.DbgServices(
            dump_file_path=value_path, verbose=True)
        _ = debugger_backend.initialize(
            net_name="Add", is_sync_mode=True)

        debugger_backend_2 = d.DbgServices(
            dump_file_path=inf_path, verbose=True)
        _ = debugger_backend_2.initialize(
            net_name="Inf", is_sync_mode=True)

        debugger_backend_3 = d.DbgServices(
            dump_file_path=nan_path, verbose=True)
        _ = debugger_backend_3.initialize(
            net_name="Nan", is_sync_mode=True)

        info1 = d.TensorInfo(node_name="Default/Add-op4",
                             slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=True)
        info2 = d.TensorInfo(node_name="Default/Reciprocal-op3",
                             slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=True)
        info3 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-MockModel/ReduceMean-op92",
                             slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=True)

        tensor_info_1 = [info1]
        tensor_info_2 = [info2]
        tensor_info_3 = [info3]
        tensor_info = [info1, info2, info3]

        tensor_base_data_list = debugger_backend.read_tensor_base(tensor_info_1)
        tensor_base_data_list_2 = debugger_backend_2.read_tensor_base(tensor_info_2)
        tensor_base_data_list.extend(tensor_base_data_list_2)
        tensor_base_data_list_3 = debugger_backend_3.read_tensor_base(tensor_info_3)
        tensor_base_data_list.extend(tensor_base_data_list_3)

        tensor_stat_data_list = debugger_backend.read_tensor_stats(tensor_info_1)
        tensor_stat_data_list_2 = debugger_backend_2.read_tensor_stats(tensor_info_2)
        tensor_stat_data_list.extend(tensor_stat_data_list_2)
        tensor_stat_data_list_3 = debugger_backend_3.read_tensor_stats(tensor_info_3)
        tensor_stat_data_list.extend(tensor_stat_data_list_3)
    finally:
        shutil.rmtree(value_path)
        shutil.rmtree(inf_path)
        shutil.rmtree(nan_path)

    print_read_tensors(tensor_info, tensor_base_data_list, tensor_stat_data_list)
    assert compare_actual_with_expected(test_name)
def build_dump_file_structure(tensor_array, net_name, tensor_name):
    """Create a minimal dump directory tree with one saved tensor file.

    Layout: <temp_root>/rank_0/<net_name>/0/0/<tensor_name>XXXX.output.0.DefaultFormat.npy

    Args:
        tensor_array (numpy.ndarray): Tensor data to save into the dump file.
        net_name (str): Network name; used as the temp-dir prefix and a path component.
        tensor_name (str): Prefix of the dump file name.

    Returns:
        str, path of the created temporary root directory (caller removes it).
    """
    debugger_temp_dir = tempfile.mkdtemp(prefix=net_name, dir="./")
    print(debugger_temp_dir)
    path = os.path.join(debugger_temp_dir, "rank_0", net_name, "0", "0")
    print(path)
    os.makedirs(path, exist_ok=True)
    # mkstemp returns an OPEN OS-level file descriptor; close it immediately,
    # otherwise every call leaks a descriptor (np.save opens the path itself).
    fd, full_path = tempfile.mkstemp(prefix=tensor_name, suffix=".output.0.DefaultFormat.npy", dir=path)
    os.close(fd)
    # Suffix already ends in ".npy", so np.save writes to full_path verbatim.
    np.save(full_path, tensor_array)
    return debugger_temp_dir
def print_read_tensors(tensor_info, tensor_base_data_list, tensor_stat_data_list):
    """Print read tensors info.

    Writes one record per tensor (info attributes, base info, statistics) to
    "<test_name>.expected" when GENERATE_GOLDEN is set, else "<test_name>.actual".

    Args:
        tensor_info (list): TensorInfo objects that were queried.
        tensor_base_data_list (list): TensorBaseData results, parallel to tensor_info.
        tensor_stat_data_list (list): TensorStatData results, parallel to tensor_info.
    """
    out_name = test_name + (".expected" if GENERATE_GOLDEN else ".actual")
    # Use a context manager so the file is closed even if a lookup below raises.
    with open(out_name, "w") as f_write:
        for x, _ in enumerate(tensor_info):
            f_write.write(
                "-----------------------------------------------------------\n")
            f_write.write("tensor_info_" + str(x+1) + " attributes:\n")
            f_write.write("node name = " + tensor_info[x].node_name + "\n")
            f_write.write("slot = " + str(tensor_info[x].slot) + "\n")
            f_write.write("iteration = " + str(tensor_info[x].iteration) + "\n")
            f_write.write("rank_id = " + str(tensor_info[x].rank_id) + "\n")
            f_write.write("root_graph_id = " +
                          str(tensor_info[x].root_graph_id) + "\n")
            f_write.write("is_output = " +
                          str(tensor_info[x].is_output) + "\n")
            f_write.write("\n")
            f_write.write("tensor_base_info:\n")
            f_write.write("size in bytes = " +
                          str(tensor_base_data_list[x].data_size) + "\n")
            f_write.write("debugger dtype = " + str(tensor_base_data_list[x].dtype) + "\n")
            f_write.write("shape = " + str(tensor_base_data_list[x].shape) + "\n")
            f_write.write("\n")
            f_write.write("tensor_stat_info:\n")
            f_write.write("size in bytes = " +
                          str(tensor_stat_data_list[x].data_size) + "\n")
            f_write.write("debugger dtype = " + str(tensor_stat_data_list[x].dtype) + "\n")
            f_write.write("shape = " + str(tensor_stat_data_list[x].shape) + "\n")
            f_write.write("is_bool = " + str(tensor_stat_data_list[x].is_bool) + "\n")
            f_write.write("max_value = " + str(tensor_stat_data_list[x].max_value) + "\n")
            f_write.write("min_value = " + str(tensor_stat_data_list[x].min_value) + "\n")
            f_write.write("avg_value = " + str(tensor_stat_data_list[x].avg_value) + "\n")
            f_write.write("count = " + str(tensor_stat_data_list[x].count) + "\n")
            f_write.write("neg_zero_count = " + str(tensor_stat_data_list[x].neg_zero_count) + "\n")
            f_write.write("pos_zero_count = " + str(tensor_stat_data_list[x].pos_zero_count) + "\n")
            f_write.write("nan_count = " + str(tensor_stat_data_list[x].nan_count) + "\n")
            f_write.write("neg_inf_count = " + str(tensor_stat_data_list[x].neg_inf_count) + "\n")
            f_write.write("pos_inf_count = " + str(tensor_stat_data_list[x].pos_inf_count) + "\n")
            f_write.write("zero_count = " + str(tensor_stat_data_list[x].zero_count) + "\n")