From ff83dee7043ab1cee9fd6546cd4d0581c122d905 Mon Sep 17 00:00:00 2001 From: Parastoo Ashtari Date: Wed, 4 Aug 2021 11:45:37 -0400 Subject: [PATCH] Add tensor base and stat info to offline debugger --- mindspore/ccsrc/debug/debug_services.cc | 184 ++++--- mindspore/ccsrc/debug/debug_services.h | 50 ++ .../debugger/offline_debug/dbg_services.cc | 55 +- .../debugger/offline_debug/dbg_services.h | 68 +++ .../offline_debug/mi_pybind_register.cc | 26 + .../ccsrc/debug/debugger/tensor_summary.cc | 164 +++--- .../ccsrc/debug/debugger/tensor_summary.h | 58 +- mindspore/offline_debug/dbg_services.py | 494 +++++++++++++++++- mindspore/offline_debug/mi_validators.py | 48 +- .../sync_read_tensors_base_stat.expected | 87 +++ .../test_sync_read_tensors_base_stat.py | 146 ++++++ 11 files changed, 1226 insertions(+), 154 deletions(-) create mode 100644 tests/ut/data/dump/gpu_dumps/golden/sync_read_tensors_base_stat.expected create mode 100644 tests/ut/python/debugger/gpu_tests/test_sync_read_tensors_base_stat.py diff --git a/mindspore/ccsrc/debug/debug_services.cc b/mindspore/ccsrc/debug/debug_services.cc index 3b75437b8b4..af5e5ca4ac4 100644 --- a/mindspore/ccsrc/debug/debug_services.cc +++ b/mindspore/ccsrc/debug/debug_services.cc @@ -131,6 +131,30 @@ std::unique_ptr GetSummaryPtr(const std::shared_ptr } } +DebugServices::TensorStat DebugServices::GetTensorStatistics(const std::shared_ptr &tensor) { + if (tensor == nullptr) { + MS_LOG(WARNING) << "Tensor is nullptr, returning empty tensor statistics."; + TensorStat empty_tensor_stat_data; + return empty_tensor_stat_data; + } + std::unique_ptr base_summary_ptr; + void *previous_tensor_ptr = nullptr; + base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, tensor->GetNumElements(), tensor->GetType()); + if (base_summary_ptr == nullptr) { + MS_LOG(WARNING) << "base_summary_ptr is nullptr, returning empty tensor statistics."; + TensorStat empty_tensor_stat_data; + return empty_tensor_stat_data; + } + 
base_summary_ptr->TensorStatistics(tensor->GetType()); + TensorStat tensor_stat_data(tensor->GetByteSize(), tensor->GetType(), tensor->GetShape(), base_summary_ptr->is_bool(), + base_summary_ptr->max_value(), base_summary_ptr->min_value(), + base_summary_ptr->avg_value(), base_summary_ptr->count(), + base_summary_ptr->neg_zero_count(), base_summary_ptr->pos_zero_count(), + base_summary_ptr->nan_count(), base_summary_ptr->neg_inf_count(), + base_summary_ptr->pos_inf_count(), base_summary_ptr->zero_count()); + + return tensor_stat_data; +} #ifdef OFFLINE_DBG_MODE void *DebugServices::GetPrevTensor(const std::shared_ptr &tensor, bool previous_iter_tensor_needed) { void *previous_tensor_ptr = nullptr; @@ -317,7 +341,11 @@ void DebugServices::CheckWatchpoints(std::vector *const name, std:: MS_LOG(INFO) << "tensor list size: " << tensor_list_size; if (tensor_list_size == 0) return; // default value for number of threads - const int max_thread_num = 32; + const int default_thread_num = 32; + int max_thread_num = default_thread_num; + if (max_thread_num > tensor_list_size) { + max_thread_num = tensor_list_size; + } MS_LOG(INFO) << "Number of threads used for checkwatchpoint: " << max_thread_num; int chunk_size = tensor_list_size / max_thread_num; int remainder = tensor_list_size % max_thread_num; @@ -757,78 +785,100 @@ void DebugServices::ReadDumpedTensor(std::vector backend_name, std: std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]); // search files in dir for the one that meets the filename prefix and read the file into memory - std::vector *buffer = NULL; - std::string type_name = ""; - std::vector shape; - uint64_t data_size = 0; if (is_sync_mode_) { - std::string abspath = RealPath(specific_dump_dir); - DIR *d = opendir(abspath.c_str()); - bool found_file = false; - std::vector matched_paths; - if (d == nullptr) { - MS_LOG(ERROR) << "Directory " << specific_dump_dir << " does not exist!"; - } else { - struct dirent *dir = nullptr; - while ((dir = 
readdir(d)) != NULL) { - if (dir->d_type == DT_REG) { - std::string file_name = dir->d_name; - std::string stripped_file_name = GetStrippedFilename(file_name); - if (stripped_file_name.empty()) { - continue; - } - std::size_t found = stripped_file_name.rfind(prefix_dump_file_name, 0); - if (found != 0) { - continue; - } - - std::string full_path = specific_dump_dir + "/" + file_name; - matched_paths.push_back(full_path); - found_file = true; - } - } - (void)closedir(d); - } - - if (found_file) { - shape.clear(); - std::string result_path = GetNewestFilePath(matched_paths); - ReadTensorFromNpy(result_path, &type_name, &data_size, &shape, &buffer); - AddToTensorData(backend_name[i], slot[i], iteration[i], device_id[i], root_graph_id[i], is_output[i], data_size, - type_name, shape, buffer, result_list); - } else { - AddToTensorData(backend_name[i], slot[i], iteration[i], device_id[i], root_graph_id[i], is_output[i], 0, - type_name, shape, buffer, result_list); - MS_LOG(INFO) << "Target tensor has not been found."; - } + ReadDumpedTensorSync(prefix_dump_file_name, specific_dump_dir, backend_name[i], slot[i], device_id[i], + iteration[i], root_graph_id[i], is_output[i], result_list); } else { - bool found = false; - std::vector matched_paths; - // if async mode - for (const std::string &file_path : async_file_pool) { - if (file_path.find(specific_dump_dir) != std::string::npos && - file_path.find(prefix_dump_to_check) != std::string::npos && - file_path.find(slot_string_to_check) != std::string::npos) { - matched_paths.push_back(file_path); - found = true; - } - } - if (found) { - shape.clear(); - std::string result_path = GetNewestFilePath(matched_paths); - ReadTensorFromNpy(result_path, &type_name, &data_size, &shape, &buffer); - AddToTensorData(backend_name[i], slot[i], iteration[i], device_id[i], root_graph_id[i], is_output[i], data_size, - type_name, shape, buffer, result_list); - } else { - // If no npy file is found, add empty tensor data. 
- AddToTensorData(backend_name[i], slot[i], iteration[i], device_id[i], root_graph_id[i], is_output[i], 0, - type_name, shape, buffer, result_list); - MS_LOG(INFO) << "Target tensor has not been found."; - } + ReadDumpedTensorAsync(specific_dump_dir, prefix_dump_to_check, slot_string_to_check, backend_name[i], slot[i], + device_id[i], iteration[i], root_graph_id[i], is_output[i], async_file_pool, result_list); } } } +void DebugServices::ReadDumpedTensorSync(const std::string &prefix_dump_file_name, const std::string &specific_dump_dir, + const std::string &backend_name, size_t slot, unsigned int device_id, + unsigned int iteration, unsigned int root_graph_id, const bool &is_output, + std::vector> *result_list) { + std::vector *buffer = NULL; + std::string type_name = ""; + std::vector shape; + uint64_t data_size = 0; + std::string abspath = RealPath(specific_dump_dir); + DIR *d = opendir(abspath.c_str()); + bool found_file = false; + std::vector matched_paths; + if (d == nullptr) { + MS_LOG(ERROR) << "Directory " << specific_dump_dir << " does not exist!"; + return; + } + struct dirent *dir = nullptr; + while ((dir = readdir(d)) != NULL) { + if (dir->d_type == DT_REG) { + std::string file_name = dir->d_name; + std::string stripped_file_name = GetStrippedFilename(file_name); + if (stripped_file_name.empty()) { + continue; + } + std::size_t found = stripped_file_name.rfind(prefix_dump_file_name, 0); + if (found != 0) { + continue; + } + + std::string full_path = specific_dump_dir + "/" + file_name; + matched_paths.push_back(full_path); + found_file = true; + } + } + + if (found_file) { + shape.clear(); + std::string result_path = GetNewestFilePath(matched_paths); + ReadTensorFromNpy(result_path, &type_name, &data_size, &shape, &buffer); + AddToTensorData(backend_name, slot, iteration, device_id, root_graph_id, is_output, data_size, type_name, shape, + buffer, result_list); + } else { + AddToTensorData(backend_name, slot, iteration, device_id, root_graph_id, 
is_output, 0, type_name, shape, buffer, + result_list); + MS_LOG(INFO) << "Target tensor has not been found."; + } + (void)closedir(d); +} + +void DebugServices::ReadDumpedTensorAsync(const std::string &specific_dump_dir, const std::string &prefix_dump_to_check, + const std::string &slot_string_to_check, const std::string &backend_name, + size_t slot, unsigned int device_id, unsigned int iteration, + unsigned int root_graph_id, const bool &is_output, + const std::vector &async_file_pool, + std::vector> *result_list) { + std::vector *buffer = NULL; + std::string type_name = ""; + std::vector shape; + uint64_t data_size = 0; + bool found = false; + std::vector matched_paths; + // if async mode + for (const std::string &file_path : async_file_pool) { + if (file_path.find(specific_dump_dir) != std::string::npos && + file_path.find(prefix_dump_to_check) != std::string::npos && + file_path.find(slot_string_to_check) != std::string::npos) { + matched_paths.push_back(file_path); + found = true; + } + } + if (found) { + shape.clear(); + std::string result_path = GetNewestFilePath(matched_paths); + ReadTensorFromNpy(result_path, &type_name, &data_size, &shape, &buffer); + AddToTensorData(backend_name, slot, iteration, device_id, root_graph_id, is_output, data_size, type_name, shape, + buffer, result_list); + } else { + // If no npy file is found, add empty tensor data. 
+ AddToTensorData(backend_name, slot, iteration, device_id, root_graph_id, is_output, 0, type_name, shape, buffer, + result_list); + MS_LOG(INFO) << "Target tensor has not been found."; + } +} + std::string DebugServices::GetStrippedFilename(const std::string &file_name) { // strip off the task_id, stream_id, and timestamp, then compare size_t first_dot = file_name.find("."); diff --git a/mindspore/ccsrc/debug/debug_services.h b/mindspore/ccsrc/debug/debug_services.h index 9866475688e..062fc349473 100644 --- a/mindspore/ccsrc/debug/debug_services.h +++ b/mindspore/ccsrc/debug/debug_services.h @@ -186,6 +186,45 @@ class DebugServices { } }; + struct TensorStat { + TensorStat(uint64_t data_size, int dtype, const std::vector &shape, bool is_bool, double max_value, + double min_value, double avg_value, int count, int neg_zero_count, int pos_zero_count, int nan_count, + int neg_inf_count, int pos_inf_count, int zero_count) + : data_size(data_size), + dtype(dtype), + shape(shape), + is_bool(is_bool), + max_value(max_value), + min_value(min_value), + avg_value(avg_value), + count(count), + neg_zero_count(neg_zero_count), + pos_zero_count(pos_zero_count), + nan_count(nan_count), + neg_inf_count(neg_inf_count), + pos_inf_count(pos_inf_count), + zero_count(zero_count) {} + + TensorStat() = default; + + uint64_t data_size = 0; + int dtype = 0; + std::vector shape = {0}; + bool is_bool = false; + double max_value = std::numeric_limits::lowest(); + double min_value = std::numeric_limits::max(); + double avg_value = 0.0; + int count = 0; + int neg_zero_count = 0; + int pos_zero_count = 0; + int nan_count = 0; + int neg_inf_count = 0; + int pos_inf_count = 0; + int zero_count = 0; + }; + + TensorStat GetTensorStatistics(const std::shared_ptr &tensor); + void AddWatchpoint( unsigned int id, unsigned int watch_condition, float parameter, const std::vector> &check_node_list, const std::vector ¶meter_list, @@ -233,6 +272,17 @@ class DebugServices { const std::vector &async_file_pool, 
std::vector> *result_list); + void ReadDumpedTensorSync(const std::string &prefix_dump_file_name, const std::string &specific_dump_dir, + const std::string &backend_name, size_t slot, unsigned int device_id, + unsigned int iteration, unsigned int root_graph_id, const bool &is_output, + std::vector> *result_list); + + void ReadDumpedTensorAsync(const std::string &specific_dump_dir, const std::string &prefix_dump_to_check, + const std::string &slot_string_to_check, const std::string &backend_name, size_t slot, + unsigned int device_id, unsigned int iteration, unsigned int root_graph_id, + const bool &is_output, const std::vector &async_file_pool, + std::vector> *result_list); + std::vector> ReadNeededDumpedTensors(unsigned int iteration, std::vector *async_file_pool); diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.cc b/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.cc index 4a0075b341a..e64a9810061 100644 --- a/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.cc +++ b/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.cc @@ -226,7 +226,7 @@ unsigned int GetTensorSlot(tensor_info_t info) { return info.slot; } bool GetTensorIsOutput(tensor_info_t info) { return info.is_output; } -std::vector DbgServices::ReadTensors(std::vector info) { +std::vector> DbgServices::ReadTensorsUtil(std::vector info) { for (auto i : info) { MS_LOG(INFO) << "cpp DbgServices ReadTensor info name " << i.node_name << ", slot " << i.slot << ", iteration " << i.iteration << ", rank_id " << i.rank_id << ", root_graph_id " << i.root_graph_id << ", is_output " @@ -238,7 +238,6 @@ std::vector DbgServices::ReadTensors(std::vector i std::vector iteration; std::vector slot; std::vector> result_list; - std::vector tensors_read; std::vector is_output; std::transform(info.begin(), info.end(), std::back_inserter(backend_name), GetTensorFullName); @@ -264,10 +263,60 @@ std::vector DbgServices::ReadTensors(std::vector i MS_LOG(INFO) << "ReadTensors Took: " << 
ms_double.count() / 1000 << "s"; MS_LOG(INFO) << "cpp after"; + return result_list; +} + +std::vector DbgServices::ReadTensors(std::vector info) { + std::vector tensors_read; + std::vector> result_list; + result_list = ReadTensorsUtil(info); for (auto result : result_list) { tensor_data_t tensor_data_item(result->GetDataPtr(), result->GetByteSize(), result->GetType(), result->GetShape()); tensors_read.push_back(tensor_data_item); } - MS_LOG(INFO) << "cpp end"; return tensors_read; } + +std::vector DbgServices::ReadTensorsBase(std::vector info) { + std::vector tensors_read_base; + std::vector> result_list; + result_list = ReadTensorsUtil(info); + for (auto result : result_list) { + if (!result->GetByteSize()) { + // tensor not found, adding empty tensor base. + TensorBaseData tensor_data_item(0, 0, {0}); + tensors_read_base.push_back(tensor_data_item); + continue; + } + TensorBaseData tensor_data_item(result->GetByteSize(), result->GetType(), result->GetShape()); + tensors_read_base.push_back(tensor_data_item); + } + return tensors_read_base; +} + +std::vector DbgServices::ReadTensorsStat(std::vector info) { + std::vector tensors_read_stat; + std::vector> result_list; + result_list = ReadTensorsUtil(info); + for (auto result : result_list) { + if (!result->GetByteSize()) { + DebugServices::TensorStat tensor_statistics; + TensorStatData tensor_data_item( + tensor_statistics.data_size, tensor_statistics.dtype, tensor_statistics.shape, tensor_statistics.is_bool, + tensor_statistics.max_value, tensor_statistics.min_value, tensor_statistics.avg_value, tensor_statistics.count, + tensor_statistics.neg_zero_count, tensor_statistics.pos_zero_count, tensor_statistics.nan_count, + tensor_statistics.neg_inf_count, tensor_statistics.pos_inf_count, tensor_statistics.zero_count); + tensors_read_stat.push_back(tensor_data_item); + continue; + } + DebugServices::TensorStat tensor_statistics = debug_services_->GetTensorStatistics(result); + TensorStatData tensor_data_item( + 
tensor_statistics.data_size, tensor_statistics.dtype, tensor_statistics.shape, tensor_statistics.is_bool, + tensor_statistics.max_value, tensor_statistics.min_value, tensor_statistics.avg_value, tensor_statistics.count, + tensor_statistics.neg_zero_count, tensor_statistics.pos_zero_count, tensor_statistics.nan_count, + tensor_statistics.neg_inf_count, tensor_statistics.pos_inf_count, tensor_statistics.zero_count); + tensors_read_stat.push_back(tensor_data_item); + } + + return tensors_read_stat; +} diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.h b/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.h index 5243c413a6e..1bfef4f49aa 100644 --- a/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.h +++ b/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.h @@ -117,6 +117,68 @@ struct tensor_data_t { std::vector shape; }; +struct TensorBaseData { + TensorBaseData(uint64_t data_size, int dtype, const std::vector &shape) + : data_size_(data_size), dtype_(dtype), shape_(shape) {} + + const uint64_t data_size() const { return data_size_; } + const int dtype() const { return dtype_; } + const std::vector &shape() const { return shape_; } + uint64_t data_size_; + int dtype_; + std::vector shape_; +}; + +struct TensorStatData { + TensorStatData(uint64_t data_size, int dtype, const std::vector &shape, bool is_bool, double max_value, + double min_value, double avg_value, int count, int neg_zero_count, int pos_zero_count, int nan_count, + int neg_inf_count, int pos_inf_count, int zero_count) + : data_size_(data_size), + dtype_(dtype), + shape_(shape), + is_bool_(is_bool), + max_value_(max_value), + min_value_(min_value), + avg_value_(avg_value), + count_(count), + neg_zero_count_(neg_zero_count), + pos_zero_count_(pos_zero_count), + nan_count_(nan_count), + neg_inf_count_(neg_inf_count), + pos_inf_count_(pos_inf_count), + zero_count_(zero_count) {} + + const uint64_t data_size() const { return data_size_; } + const int dtype() const { 
return dtype_; } + const std::vector &shape() const { return shape_; } + const bool is_bool() const { return is_bool_; } + const double max_value() const { return max_value_; } + const double min_value() const { return min_value_; } + const double avg_value() const { return avg_value_; } + const int count() const { return count_; } + const int neg_zero_count() const { return neg_zero_count_; } + const int pos_zero_count() const { return pos_zero_count_; } + const int nan_count() const { return nan_count_; } + const int neg_inf_count() const { return neg_inf_count_; } + const int pos_inf_count() const { return pos_inf_count_; } + const int zero_count() const { return zero_count_; } + + uint64_t data_size_; + int dtype_; + std::vector shape_; + bool is_bool_; + double max_value_; + double min_value_; + double avg_value_; + int count_; + int neg_zero_count_; + int pos_zero_count_; + int nan_count_; + int neg_inf_count_; + int pos_inf_count_; + int zero_count_; +}; + class DbgServices { private: DebugServices *debug_services_; @@ -141,8 +203,14 @@ class DbgServices { std::vector CheckWatchpoints(unsigned int iteration); + std::vector> ReadTensorsUtil(std::vector info); + std::vector ReadTensors(std::vector info); + std::vector ReadTensorsBase(std::vector info); + + std::vector ReadTensorsStat(std::vector info); + std::string GetVersion(); }; diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/mi_pybind_register.cc b/mindspore/ccsrc/debug/debugger/offline_debug/mi_pybind_register.cc index 3edc82f8c2f..1f14bd7cb69 100644 --- a/mindspore/ccsrc/debug/debugger/offline_debug/mi_pybind_register.cc +++ b/mindspore/ccsrc/debug/debugger/offline_debug/mi_pybind_register.cc @@ -27,6 +27,8 @@ PYBIND11_MODULE(_mindspore_offline_debug, m) { .def("RemoveWatchpoint", &DbgServices::RemoveWatchpoint) .def("CheckWatchpoints", &DbgServices::CheckWatchpoints) .def("ReadTensors", &DbgServices::ReadTensors) + .def("ReadTensorsBase", &DbgServices::ReadTensorsBase) + 
.def("ReadTensorsStat", &DbgServices::ReadTensorsStat) .def("GetVersion", &DbgServices::GetVersion); py::class_(m, "parameter") @@ -63,4 +65,28 @@ PYBIND11_MODULE(_mindspore_offline_debug, m) { .def("get_data_size", &tensor_data_t::get_data_size) .def("get_dtype", &tensor_data_t::get_dtype) .def("get_shape", &tensor_data_t::get_shape); + + py::class_(m, "TensorBaseData") + .def(py::init>()) + .def("data_size", &TensorBaseData::data_size) + .def("dtype", &TensorBaseData::dtype) + .def("shape", &TensorBaseData::shape); + + py::class_(m, "TensorStatData") + .def( + py::init, bool, double, double, double, int, int, int, int, int, int, int>()) + .def("data_size", &TensorStatData::data_size) + .def("dtype", &TensorStatData::dtype) + .def("shape", &TensorStatData::shape) + .def("is_bool", &TensorStatData::is_bool) + .def("max_value", &TensorStatData::max_value) + .def("min_value", &TensorStatData::min_value) + .def("avg_value", &TensorStatData::avg_value) + .def("count", &TensorStatData::count) + .def("neg_zero_count", &TensorStatData::neg_zero_count) + .def("pos_zero_count", &TensorStatData::pos_zero_count) + .def("nan_count", &TensorStatData::nan_count) + .def("neg_inf_count", &TensorStatData::neg_inf_count) + .def("pos_inf_count", &TensorStatData::pos_inf_count) + .def("zero_count", &TensorStatData::zero_count); } diff --git a/mindspore/ccsrc/debug/debugger/tensor_summary.cc b/mindspore/ccsrc/debug/debugger/tensor_summary.cc index 3bee9cc5bc6..56a90c73d5e 100644 --- a/mindspore/ccsrc/debug/debugger/tensor_summary.cc +++ b/mindspore/ccsrc/debug/debugger/tensor_summary.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include "debug/debugger/tensor_summary.h" #ifdef OFFLINE_DBG_MODE @@ -92,39 +93,45 @@ double VarianceAndMeanCalculator::GetStandardDeviation() { return sqrt(GetVarian template TensorSummary::TensorSummary(void *current_tensor_ptr, void *const previous_tensor_ptr, uint32_t num_elements) - : current_tensor_ptr(reinterpret_cast(current_tensor_ptr)), - 
prev_tensor_ptr(reinterpret_cast(previous_tensor_ptr)), - num_elements(num_elements), - min(std::numeric_limits::max()), - max(std::numeric_limits::lowest()), - inf_count(0), - nan_count(0), - zero_count(0), - epsilon(1.0e-9), - mean_sd_cal_enabled(false) {} + : current_tensor_ptr_(reinterpret_cast(current_tensor_ptr)), + prev_tensor_ptr_(reinterpret_cast(previous_tensor_ptr)), + num_elements_(num_elements), + min_(std::numeric_limits::max()), + max_(std::numeric_limits::lowest()), + avg_(0.0), + is_bool_(false), + neg_zero_count_(0), + pos_zero_count_(0), + pos_inf_count_(0), + neg_inf_count_(0), + inf_count_(0), + nan_count_(0), + zero_count_(0), + epsilon_(1.0e-9), + mean_sd_cal_enabled_(false) {} template void TensorSummary::SummarizeTensor(const std::vector &wps) { InitCalculators(wps); - for (size_t i = 0; i < num_elements; ++i) { - auto current_value = static_cast(current_tensor_ptr[i]); + for (size_t i = 0; i < num_elements_; ++i) { + auto current_value = static_cast(current_tensor_ptr_[i]); double previous_value = - prev_tensor_ptr ? static_cast(prev_tensor_ptr[i]) : std::numeric_limits::quiet_NaN(); - inf_count += std::isinf(current_value); - nan_count += std::isnan(current_value); - zero_count += (current_value == 0); - max = std::max(max, current_value); - min = std::min(min, current_value); - if (mean_sd_cal_enabled) { - current_mean_variance.ProcessElement(current_value); + prev_tensor_ptr_ ? 
static_cast(prev_tensor_ptr_[i]) : std::numeric_limits::quiet_NaN(); + inf_count_ += std::isinf(current_value); + nan_count_ += std::isnan(current_value); + zero_count_ += (current_value == 0); + max_ = std::max(max_, current_value); + min_ = std::min(min_, current_value); + if (mean_sd_cal_enabled_) { + current_mean_variance_.ProcessElement(current_value); } - for (auto &it : all_close) { + for (auto &it : all_close_) { it.second->ProcessElement(current_value, previous_value); } - for (auto &range_count : range_counts) { + for (auto &range_count : range_counts_) { range_count.second->ProcessElement(current_value); } - for (auto &mean : means) { + for (auto &mean : means_) { if (mean.first == "curr_prev_diff_mean") { mean.second->ProcessElement(std::abs(current_value - previous_value)); } else if (mean.first == "abs_prev_mean") { @@ -136,6 +143,39 @@ void TensorSummary::SummarizeTensor(const std::vector +void TensorSummary::TensorStatistics(DbgDataType dtype_value) { + if (dtype_value == DT_BOOL) { + is_bool_ = true; + } + double sum_elements = 0.0; + for (size_t i = 0; i < num_elements_; ++i) { + auto current_value = static_cast(current_tensor_ptr_[i]); + if (std::isinf(current_value)) { + if (current_value > 0) { + pos_inf_count_ += 1; + } else { + neg_inf_count_ += 1; + } + } + zero_count_ += (current_value == 0); + nan_count_ += std::isnan(current_value); + if (!(std::isnan(current_value) || std::isinf(current_value))) { + // only considering tensor elements with value + if (std::signbit(current_value) && !(current_value == 0)) { + neg_zero_count_ += 1; + } else if (!(current_value == 0)) { + pos_zero_count_ += 1; + } + max_ = std::max(max_, current_value); + min_ = std::min(min_, current_value); + sum_elements += current_value; + } + } + int value_count = zero_count_ + neg_zero_count_ + pos_zero_count_; + avg_ = sum_elements / value_count; +} + template std::tuple> TensorSummary::IsWatchpointHit( DebugServices::watchpoint_t wp) { @@ -145,24 +185,24 @@ 
std::tuple> TensorSummary: std::bitset error_code; CONDITION_TYPE type = wp.condition.type; // bit 0 denotes presence of nan - error_code.set(0, nan_count > 0); + error_code.set(0, nan_count_ > 0); // bit 1 denotes presence of inf - error_code.set(1, inf_count > 0); + error_code.set(1, inf_count_ > 0); if (type == CONDITION_TYPE::HAS_NAN) { error_code.reset(); - hit = nan_count > 0; + hit = nan_count_ > 0; } else if (type == CONDITION_TYPE::HAS_INF) { error_code.reset(); - hit = inf_count > 0; + hit = inf_count_ > 0; } else if (type == CONDITION_TYPE::GENERAL_OVERFLOW) { error_code.reset(); - hit = (nan_count + inf_count) > 0; - } else if (type == CONDITION_TYPE::NOT_CHANGED && prev_tensor_ptr && error_code.none()) { - hit = all_close[wp.id]->IsAllClose(); + hit = (nan_count_ + inf_count_) > 0; + } else if (type == CONDITION_TYPE::NOT_CHANGED && prev_tensor_ptr_ && error_code.none()) { + hit = all_close_[wp.id]->IsAllClose(); } else if ((type == CONDITION_TYPE::NOT_CHANGED || type == CONDITION_TYPE::CHANGE_TOO_LARGE || type == CONDITION_TYPE::CHANGE_TOO_SMALL) && - !prev_tensor_ptr) { + !prev_tensor_ptr_) { // bit 2 denotes absence of previous tensor error_code.set(2, true); } @@ -196,26 +236,26 @@ double_t TensorSummary::StatLookup(const std::string ¶meter_name, const D } if (param_type == "max") { - return max; + return max_; } else if (param_type == "min") { - return min; + return min_; } else if (param_type == "max_min") { - return max - min; + return max_ - min_; } else if (param_type == "mean") { - return current_mean_variance.GetMean(); + return current_mean_variance_.GetMean(); } else if (param_type == "sd") { - return current_mean_variance.GetStandardDeviation(); + return current_mean_variance_.GetStandardDeviation(); } else if (param_type == "abs_mean") { - if (means.find("abs_current_mean") != means.end()) { - return means["abs_current_mean"]->GetMean(); + if (means_.find("abs_current_mean") != means_.end()) { + return 
means_["abs_current_mean"]->GetMean(); } - } else if (param_type == "abs_mean_update_ratio" && prev_tensor_ptr) { - if (means.find("curr_prev_diff_mean") != means.end() && means.find("abs_prev_mean") != means.end()) { - return means["curr_prev_diff_mean"]->GetMean() / (means["abs_prev_mean"]->GetMean() + epsilon); + } else if (param_type == "abs_mean_update_ratio" && prev_tensor_ptr_) { + if (means_.find("curr_prev_diff_mean") != means_.end() && means_.find("abs_prev_mean") != means_.end()) { + return means_["curr_prev_diff_mean"]->GetMean() / (means_["abs_prev_mean"]->GetMean() + epsilon_); } } else if (param_type == "range_percentage") { - if (range_counts.find(wp.id) != range_counts.end()) { - return range_counts[wp.id]->GetPercentInRange(); + if (range_counts_.find(wp.id) != range_counts_.end()) { + return range_counts_[wp.id]->GetPercentInRange(); } } else if (param_type == "zero_percentage") { return GetZeroValPercent(); @@ -227,54 +267,54 @@ template double_t TensorSummary::StatLookup(const DebugServices::watchpoint_t &wp) { CONDITION_TYPE type = wp.condition.type; if (type == CONDITION_TYPE::MAX_LT || type == CONDITION_TYPE::MAX_GT) { - return max; + return max_; } else if (type == CONDITION_TYPE::MIN_LT || type == CONDITION_TYPE::MIN_GT) { - return min; + return min_; } else if (type == CONDITION_TYPE::MEAN_LT || type == CONDITION_TYPE::MEAN_GT) { - return current_mean_variance.GetMean(); + return current_mean_variance_.GetMean(); } else if (type == CONDITION_TYPE::SD_LT || type == CONDITION_TYPE::SD_GT) { - return current_mean_variance.GetStandardDeviation(); + return current_mean_variance_.GetStandardDeviation(); } else if (type == CONDITION_TYPE::MAX_MIN_GT || type == CONDITION_TYPE::MAX_MIN_LT) { - return max - min; + return max_ - min_; } return std::numeric_limits::quiet_NaN(); } template double_t TensorSummary::GetZeroValPercent() { - if (num_elements == 0) { + if (num_elements_ == 0) { return 0; } - return (zero_count * 100.0) / num_elements; + 
return (zero_count_ * 100.0) / num_elements_; } template void TensorSummary::InitCalculators(const std::vector &wps) { for (auto &wp : wps) { auto wp_id = wp.id; - mean_sd_cal_enabled = mean_sd_cal_enabled || wp.mean_sd_enabled(); - if (wp.allclose_enabled() && prev_tensor_ptr) { - all_close[wp_id] = std::make_unique(); + mean_sd_cal_enabled_ = mean_sd_cal_enabled_ || wp.mean_sd_enabled(); + if (wp.allclose_enabled() && prev_tensor_ptr_) { + all_close_[wp_id] = std::make_unique(); if (!wp.parameter_list[0].disabled) { - all_close[wp_id]->set_atol(wp.parameter_list[0].value); + all_close_[wp_id]->set_atol(wp.parameter_list[0].value); } if (!wp.parameter_list[1].disabled) { - all_close[wp_id]->set_rtol(wp.parameter_list[1].value); + all_close_[wp_id]->set_rtol(wp.parameter_list[1].value); } } else if (wp.range_enabled()) { - range_counts[wp_id] = std::make_unique(); + range_counts_[wp_id] = std::make_unique(); if (!wp.parameter_list[0].disabled) { - range_counts[wp_id]->set_range_start_inclusive(wp.parameter_list[0].value); + range_counts_[wp_id]->set_range_start_inclusive(wp.parameter_list[0].value); } if (!wp.parameter_list[1].disabled) { - range_counts[wp_id]->set_range_end_inclusive(wp.parameter_list[1].value); + range_counts_[wp_id]->set_range_end_inclusive(wp.parameter_list[1].value); } - } else if (wp.tensor_update_ratio_mean_enabled() && prev_tensor_ptr) { - means.insert({"curr_prev_diff_mean", std::make_unique()}); - means.insert({"abs_prev_mean", std::make_unique()}); + } else if (wp.tensor_update_ratio_mean_enabled() && prev_tensor_ptr_) { + means_.insert({"curr_prev_diff_mean", std::make_unique()}); + means_.insert({"abs_prev_mean", std::make_unique()}); } else if (wp.abs_mean_enabled()) { - means.insert({"abs_current_mean", std::make_unique()}); + means_.insert({"abs_current_mean", std::make_unique()}); } } } diff --git a/mindspore/ccsrc/debug/debugger/tensor_summary.h b/mindspore/ccsrc/debug/debugger/tensor_summary.h index ec6f181cb21..793a06ef2ec 
100644 --- a/mindspore/ccsrc/debug/debugger/tensor_summary.h +++ b/mindspore/ccsrc/debug/debugger/tensor_summary.h @@ -92,6 +92,18 @@ class ITensorSummary { virtual void SummarizeTensor(const std::vector &) = 0; virtual std::tuple> IsWatchpointHit( DebugServices::watchpoint_t) = 0; + virtual void TensorStatistics(DbgDataType) = 0; + virtual const bool is_bool() const = 0; + virtual const double max_value() const = 0; + virtual const double min_value() const = 0; + virtual const double avg_value() const = 0; + virtual const int count() const = 0; + virtual const int neg_zero_count() const = 0; + virtual const int pos_zero_count() const = 0; + virtual const int nan_count() const = 0; + virtual const int neg_inf_count() const = 0; + virtual const int pos_inf_count() const = 0; + virtual const int zero_count() const = 0; }; template @@ -103,22 +115,40 @@ class TensorSummary : public ITensorSummary { void SummarizeTensor(const std::vector &) override; // returns hit, error_code, parameter_list std::tuple> IsWatchpointHit(DebugServices::watchpoint_t) override; + void TensorStatistics(DbgDataType) override; + const bool is_bool() const override { return is_bool_; } + const double max_value() const override { return max_; } + const double min_value() const override { return min_; } + const double avg_value() const override { return avg_; } + const int count() const override { return num_elements_; } + const int neg_zero_count() const override { return neg_zero_count_; } + const int pos_zero_count() const override { return pos_zero_count_; } + const int nan_count() const override { return nan_count_; } + const int neg_inf_count() const override { return neg_inf_count_; } + const int pos_inf_count() const override { return pos_inf_count_; } + const int zero_count() const override { return zero_count_; } private: - T *current_tensor_ptr; - T *prev_tensor_ptr; - uint32_t num_elements; - double min; - double max; - uint32_t inf_count; - uint32_t nan_count; - uint32_t 
zero_count; - double epsilon; - bool mean_sd_cal_enabled; - VarianceAndMeanCalculator current_mean_variance; - std::unordered_map> means; - std::unordered_map> all_close; - std::unordered_map> range_counts; + T *current_tensor_ptr_; + T *prev_tensor_ptr_; + uint32_t num_elements_; + double min_; + double max_; + double avg_; + bool is_bool_; + uint32_t neg_zero_count_; + uint32_t pos_zero_count_; + uint32_t pos_inf_count_; + uint32_t neg_inf_count_; + uint32_t inf_count_; + uint32_t nan_count_; + uint32_t zero_count_; + double epsilon_; + bool mean_sd_cal_enabled_; + VarianceAndMeanCalculator current_mean_variance_; + std::unordered_map> means_; + std::unordered_map> all_close_; + std::unordered_map> range_counts_; double_t StatLookup(const DebugServices::watchpoint_t &); double_t StatLookup(const std::string &, const DebugServices::watchpoint_t &); double_t GetZeroValPercent(); diff --git a/mindspore/offline_debug/dbg_services.py b/mindspore/offline_debug/dbg_services.py index c2fda38252d..045a3d87b02 100644 --- a/mindspore/offline_debug/dbg_services.py +++ b/mindspore/offline_debug/dbg_services.py @@ -17,7 +17,10 @@ The module DbgServices provides offline debugger APIs. 
""" import mindspore._mindspore_offline_debug as cds -from mindspore.offline_debug.mi_validators import check_init, check_initialize, check_add_watchpoint, check_remove_watchpoint, check_check_watchpoints, check_read_tensors, check_initialize_done, check_tensor_info_init, check_tensor_data_init, check_watchpoint_hit_init, check_parameter_init +from mindspore.offline_debug.mi_validators import check_init, check_initialize, check_add_watchpoint,\ + check_remove_watchpoint, check_check_watchpoints, check_read_tensor_info, check_initialize_done, \ + check_tensor_info_init, check_tensor_data_init, check_tensor_base_data_init, check_tensor_stat_data_init,\ + check_watchpoint_hit_init, check_parameter_init from mindspore.offline_debug.mi_validator_helpers import replace_minus_one @@ -238,7 +241,7 @@ class DbgServices(): return watchpoint_hit_list @check_initialize_done - @check_read_tensors + @check_read_tensor_info def read_tensors(self, info): """ Returning tensor data object describing the tensor requested tensor. @@ -277,6 +280,83 @@ class DbgServices(): tensor_data_list_ret.append(tensor_data) return tensor_data_list_ret + @check_initialize_done + @check_read_tensor_info + def read_tensor_base(self, info): + """ + Returning tensor base data object describing the requested tensor. + + Args: + info (list): List of TensorInfo objects. + + Returns: + list, TensorBaseData list. 
+ + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> d = dbg_services.DbgServices(dump_file_path="dump_file_path", + >>> verbose=True) + >>> d_init = d.initialize(is_sync_mode=True) + >>> tensor_base_data_list = d_init.read_tensor_base([dbg_services.TensorInfo(node_name="conv2.bias", + >>> slot=0, + >>> iteration=8, + >>> rank_id=5, + >>> root_graph_id=0, + >>> is_output=True)]) + """ + log("in Python ReadTensorsBase info ", info) + info_list_inst = [] + for elem in info: + log("in Python ReadTensorsBase info ", info) + info_list_inst.append(elem.instance) + tensor_base_data_list = self.dbg_instance.ReadTensorsBase(info_list_inst) + tensor_base_data_list_ret = [] + for elem in tensor_base_data_list: + tensor_base_data = TensorBaseData(elem.data_size(), elem.dtype(), elem.shape()) + tensor_base_data_list_ret.append(tensor_base_data) + return tensor_base_data_list_ret + + @check_initialize_done + @check_read_tensor_info + def read_tensor_stats(self, info): + """ + Returning tensor statistics object describing the requested tensor. + + Args: + info (list): List of TensorInfo objects. + + Returns: + list, TensorStatData list. 
+ + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> d = dbg_services.DbgServices(dump_file_path="dump_file_path", + >>> verbose=True) + >>> d_init = d.initialize(is_sync_mode=True) + >>> tensor_stat_data_list = d_init.read_tensor_stats([dbg_services.TensorInfo(node_name="conv2.bias", + >>> slot=0, + >>> iteration=8, + >>> rank_id=5, + >>> root_graph_id=0, + >>> is_output=True)]) + """ + log("in Python ReadTensorsStat info ", info) + info_list_inst = [] + for elem in info: + log("in Python ReadTensorsStat info ", info) + info_list_inst.append(elem.instance) + tensor_stat_data_list = self.dbg_instance.ReadTensorsStat(info_list_inst) + tensor_stat_data_list_ret = [] + for elem in tensor_stat_data_list: + tensor_stat_data = TensorStatData(elem.data_size(), elem.dtype(), + elem.shape(), elem.is_bool(), + elem.max_value(), elem.min_value(), + elem.avg_value(), elem.count(), elem.neg_zero_count(), + elem.pos_zero_count(), elem.nan_count(), elem.neg_inf_count(), + elem.pos_inf_count(), elem.zero_count()) + tensor_stat_data_list_ret.append(tensor_stat_data) + return tensor_stat_data_list_ret + class TensorInfo(): """ Tensor Information class. @@ -527,6 +607,406 @@ class TensorData(): return self.instance.get_shape() +class TensorBaseData(): + + """ + TensorBaseData class. + + Args: + data_size (int): Size of data in bytes. + dtype (int): An encoding representing the type of TensorData. + shape (list): Shape of tensor. + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> tensor_base_data = dbg_services.TensorBaseData(data_size=4, + >>> dtype=0, + >>> shape=[2, 2]) + """ + @check_tensor_base_data_init + def __init__(self, data_size, dtype, shape): + self.instance = cds.TensorBaseData(data_size, dtype, shape) + + @property + def data_size(self): + """ + Function to receive TensorBaseData data_size. + + Returns: + int, data_size of TensorBaseData instance. 
+ + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> tensor_base_data = dbg_services.TensorBaseData(data_size=4, + >>> dtype=0, + >>> shape=[2, 2]) + >>> data_size = tensor_base_data.data_size + """ + + return self.instance.data_size() + + @property + def dtype(self): + """ + Function to receive TensorBaseData dtype. + + Returns: + int, dtype of TensorBaseData instance. + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> tensor_base_data = dbg_services.TensorBaseData(data_size=4, + >>> dtype=0, + >>> shape=[2, 2]) + >>> dtype = tensor_base_data.dtype + """ + + return self.instance.dtype() + + @property + def shape(self): + """ + Function to receive TensorBaseData shape. + + Returns: + list, shape of TensorBaseData instance. + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> tensor_base_data = dbg_services.TensorBaseData(data_size=4, + >>> dtype=0, + >>> shape=[2, 2]) + >>> shape = tensor_base_data.shape + """ + + return self.instance.shape() +class TensorStatData(): + + """ + TensorStatData class. + + Args: + data_size (int): Size of data in bytes. + dtype (int): An encoding representing the type of TensorData. + shape (list): Shape of tensor. 
+ is_bool (bool): Whether the data type is bool + max_value (float): Maximum value in tensor's elements + min_value (float): Minimum value in tensor's elements + avg_value (float): Average value of all tensor's elements + count (int): Number of elements in tensor + neg_zero_count (int): Number of negative elements in tensor + pos_zero_count (int): Number of positive elements in tensor + nan_count (int): Number of nan elements in tensor + neg_inf_count (int): Number of negative infinity elements in tensor + pos_inf_count (int): Number of positive infinity elements in tensor + zero_count (int): Total number of zero elements in tensor + + + + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> tensor_stat_data = dbg_services.TensorStatData( + >>> data_size=4, + >>> dtype=0, + >>> shape=[2, 2], is_bool = False, max_value = 10.0, + >>> min_value = 0.0, avg_value = 5.0, + >>> count = 4, neg_zero_count = 0, pos_zero_count = 4, nan_count = 0, + >>> neg_inf_count = 0, pos_inf_count = 0, zero_count = 1) + """ + @check_tensor_stat_data_init + def __init__(self, data_size, dtype, shape, is_bool, max_value, min_value, avg_value, count, + neg_zero_count, pos_zero_count, nan_count, neg_inf_count, pos_inf_count, zero_count): + self.instance = cds.TensorStatData(data_size, dtype, shape, is_bool, max_value, + min_value, avg_value, count, neg_zero_count, + pos_zero_count, nan_count, neg_inf_count, + pos_inf_count, zero_count) + + + @property + def data_size(self): + """ + Function to receive TensorStatData data_size. + + Returns: + int, data_size of TensorStatData instance. 
+ + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> tensor_stat_data = dbg_services.TensorStatData( + >>> data_size=4, + >>> dtype=0, + >>> shape=[2, 2], is_bool = False, max_value = 10.0, + >>> min_value = 0.0, avg_value = 5.0, + >>> count = 4, neg_zero_count = 0, pos_zero_count = 4, + >>> nan_count = 0, neg_inf_count = 0, pos_inf_count = 0, zero_count = 1) + >>> data_size = tensor_stat_data.data_size + """ + + return self.instance.data_size() + + @property + def dtype(self): + """ + Function to receive TensorStatData dtype. + + Returns: + int, dtype of TensorStatData instance. + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> tensor_stat_data = dbg_services.TensorStatData(data_size=4, + >>> dtype=0, + >>> shape=[2, 2], is_bool = False, max_value = 10.0, + >>> min_value = 0.0, avg_value = 5.0, + >>> count = 4, neg_zero_count = 0, pos_zero_count = 4, nan_count = 0, + >>> neg_inf_count = 0, pos_inf_count = 0, zero_count = 1) + >>> dtype = tensor_stat_data.dtype + """ + + return self.instance.dtype() + + @property + def shape(self): + """ + Function to receive TensorStatData shape. + + Returns: + list, shape of TensorStatData instance. + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> tensor_stat_data = dbg_services.TensorStatData(data_size=4, + >>> dtype=0, + >>> shape=[2, 2], is_bool = False, max_value = 10.0, + >>> min_value = 0.0, avg_value = 5.0, + >>> count = 4, neg_zero_count = 0, pos_zero_count = 4, nan_count = 0, + >>> neg_inf_count = 0, pos_inf_count = 0, zero_count = 1) + >>> shape = tensor_stat_data.shape + """ + + return self.instance.shape() + + @property + def is_bool(self): + """ + Function to receive TensorStatData is_bool. + + Returns: + bool, Whether the tensor elements are bool. 
+ + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> tensor_stat_data = dbg_services.TensorStatData(data_size=4, + >>> dtype=0, + >>> shape=[2, 2], is_bool = False, max_value = 10.0, + >>> min_value = 0.0, avg_value = 5.0, + >>> count = 4, neg_zero_count = 0, pos_zero_count = 4, nan_count = 0, + >>> neg_inf_count = 0, pos_inf_count = 0, zero_count = 1) + >>> is_bool = tensor_stat_data.is_bool + """ + return self.instance.is_bool() + + @property + def max_value(self): + """ + Function to receive TensorStatData max_value. + + Returns: + float, max_value of TensorStatData instance. + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> tensor_stat_data = dbg_services.TensorStatData(data_size=4, + >>> dtype=0, + >>> shape=[2, 2], is_bool = False, max_value = 10.0, + >>> min_value = 0.0, avg_value = 5.0, + >>> count = 4, neg_zero_count = 0, pos_zero_count = 4, nan_count = 0, + >>> neg_inf_count = 0, pos_inf_count = 0, zero_count = 1) + >>> max_value = tensor_stat_data.max_value + """ + return self.instance.max_value() + + @property + def min_value(self): + """ + Function to receive TensorStatData min_value. + + Returns: + float, min_value of TensorStatData instance. + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> tensor_stat_data = dbg_services.TensorStatData(data_size=4, + >>> dtype=0, + >>> shape=[2, 2], is_bool = False, max_value = 10.0, + >>> min_value = 0.0, avg_value = 5.0, + >>> count = 4, neg_zero_count = 0, pos_zero_count = 4, nan_count = 0, + >>> neg_inf_count = 0, pos_inf_count = 0, zero_count = 1) + >>> min_value = tensor_stat_data.min_value + """ + return self.instance.min_value() + + @property + def avg_value(self): + """ + Function to receive TensorStatData avg_value. + + Returns: + float, avg_value of TensorStatData instance. 
+ + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> tensor_stat_data = dbg_services.TensorStatData(data_size=4, + >>> dtype=0, + >>> shape=[2, 2], is_bool = False, max_value = 10.0, + >>> min_value = 0.0, avg_value = 5.0, + >>> count = 4, neg_zero_count = 0, pos_zero_count = 4, nan_count = 0, + >>> neg_inf_count = 0, pos_inf_count = 0, zero_count = 1) + >>> avg_value = tensor_stat_data.avg_value + """ + return self.instance.avg_value() + + @property + def count(self): + """ + Function to receive TensorStatData count. + + Returns: + int, count of TensorStatData instance. + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> tensor_stat_data = dbg_services.TensorStatData(data_size=4, + >>> dtype=0, + >>> shape=[2, 2], is_bool = False, max_value = 10.0, + >>> min_value = 0.0, avg_value = 5.0, + >>> count = 4, neg_zero_count = 0, pos_zero_count = 4, nan_count = 0, + >>> neg_inf_count = 0, pos_inf_count = 0, zero_count = 1) + >>> count = tensor_stat_data.count + """ + return self.instance.count() + + @property + def neg_zero_count(self): + """ + Function to receive TensorStatData neg_zero_count. + + Returns: + int, neg_zero_count of TensorStatData instance. + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> tensor_stat_data = dbg_services.TensorStatData(data_size=4, + >>> dtype=0, + >>> shape=[2, 2], is_bool = False, max_value = 10.0, + >>> min_value = 0.0, avg_value = 5.0, + >>> count = 4, neg_zero_count = 0, pos_zero_count = 4, nan_count = 0, + >>> neg_inf_count = 0, pos_inf_count = 0, zero_count = 1) + >>> neg_zero_count = tensor_stat_data.neg_zero_count + """ + return self.instance.neg_zero_count() + + @property + def pos_zero_count(self): + """ + Function to receive TensorStatData pos_zero_count. + + Returns: + int, pos_zero_count of TensorStatData instance. 
+ + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> tensor_stat_data = dbg_services.TensorStatData(data_size=4, + >>> dtype=0, + >>> shape=[2, 2], is_bool = False, max_value = 10.0, + >>> min_value = 0.0, avg_value = 5.0, + >>> count = 4, neg_zero_count = 0, pos_zero_count = 4, nan_count = 0, + >>> neg_inf_count = 0, pos_inf_count = 0, zero_count = 1) + >>> pos_zero_count = tensor_stat_data.pos_zero_count + """ + return self.instance.pos_zero_count() + + @property + def zero_count(self): + """ + Function to receive TensorStatData zero_count. + + Returns: + int, zero_count of TensorStatData instance. + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> tensor_stat_data = dbg_services.TensorStatData(data_size=4, + >>> dtype=0, + >>> shape=[2, 2], is_bool = False, max_value = 10.0, + >>> min_value = 0.0, avg_value = 5.0, + >>> count = 4, neg_zero_count = 0, pos_zero_count = 4, nan_count = 0, + >>> neg_inf_count = 0, pos_inf_count = 0, zero_count = 1) + >>> zero_count = tensor_stat_data.zero_count + """ + return self.instance.zero_count() + + @property + def nan_count(self): + """ + Function to receive TensorStatData nan_count. + + Returns: + int, nan_count of TensorStatData instance. + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> tensor_stat_data = dbg_services.TensorStatData(data_size=4, + >>> dtype=0, + >>> shape=[2, 2], is_bool = False, max_value = 10.0, + >>> min_value = 0.0, avg_value = 5.0, + >>> count = 4, neg_zero_count = 0, pos_zero_count = 4, nan_count = 0, + >>> neg_inf_count = 0, pos_inf_count = 0, zero_count = 1) + >>> nan_count = tensor_stat_data.nan_count + """ + return self.instance.nan_count() + + @property + def neg_inf_count(self): + """ + Function to receive TensorStatData neg_inf_count. + + Returns: + int, neg_inf_count of TensorStatData instance. 
+ + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> tensor_stat_data = dbg_services.TensorStatData(data_size=4, + >>> dtype=0, + >>> shape=[2, 2], is_bool = False, max_value = 10.0, + >>> min_value = 0.0, avg_value = 5.0, + >>> count = 4, neg_zero_count = 0, pos_zero_count = 4, nan_count = 0, + >>> neg_inf_count = 0, pos_inf_count = 0, zero_count = 1) + >>> neg_inf_count = tensor_stat_data.neg_inf_count + """ + return self.instance.neg_inf_count() + + @property + def pos_inf_count(self): + """ + Function to receive TensorStatData pos_inf_count. + + Returns: + int, pos_inf_count of TensorStatData instance. + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> tensor_stat_data = dbg_services.TensorStatData(data_size=4, + >>> dtype=0, + >>> shape=[2, 2], is_bool = False, max_value = 10.0, + >>> min_value = 0.0, avg_value = 5.0, + >>> count = 4, neg_zero_count = 0, pos_zero_count = 1, nan_count = 0, + >>> neg_inf_count = 0, pos_inf_count = 0, zero_count = 1) + >>> pos_inf_count = tensor_stat_data.pos_inf_count + """ + return self.instance.pos_inf_count() + class WatchpointHit(): """ WatchpointHit class. 
@@ -583,7 +1063,7 @@ class WatchpointHit(): >>> name = watchpoint_hit.name """ - return self.instance.get_name() + return self.instance.name() @property def slot(self): @@ -606,7 +1086,7 @@ class WatchpointHit(): >>> slot = watchpoint_hit.slot """ - return self.instance.get_slot() + return self.instance.slot() @property def condition(self): @@ -629,7 +1109,7 @@ class WatchpointHit(): >>> condition = watchpoint_hit.condition """ - return self.instance.get_condition() + return self.instance.condition() @property def watchpoint_id(self): @@ -652,7 +1132,7 @@ class WatchpointHit(): >>> watchpoint_id = watchpoint_hit.watchpoint_id """ - return self.instance.get_watchpoint_id() + return self.instance.watchpoint_id() @property def parameters(self): @@ -675,7 +1155,7 @@ class WatchpointHit(): >>> parameters = watchpoint_hit.parameters """ - params = self.instance.get_parameters() + params = self.instance.parameters() param_list = [] for elem in params: tmp = Parameter(elem.get_name(), diff --git a/mindspore/offline_debug/mi_validators.py b/mindspore/offline_debug/mi_validators.py index 134c075add3..ad9b3fcd69f 100644 --- a/mindspore/offline_debug/mi_validators.py +++ b/mindspore/offline_debug/mi_validators.py @@ -121,7 +121,7 @@ def check_check_watchpoints(method): return new_method -def check_read_tensors(method): +def check_read_tensor_info(method): """Wrapper method to check the parameters of DbgServices ReadTensors.""" @wraps(method) @@ -189,6 +189,52 @@ def check_tensor_data_init(method): return new_method +def check_tensor_base_data_init(method): + """Wrapper method to check the parameters of DbgServices TensorBaseData init.""" + + @wraps(method) + def new_method(self, *args, **kwargs): + [data_size, dtype, shape], _ = parse_user_args(method, *args, **kwargs) + + check_uint64(data_size, "data_size") + type_check(dtype, (int,), "dtype") + shape_names = ["shape_{0}".format(i) for i in range(len(shape))] + type_check_list(shape, (int,), shape_names) + + return 
method(self, *args, **kwargs) + + return new_method + +def check_tensor_stat_data_init(method): + """Wrapper method to check the parameters of DbgServices TensorStatData init.""" + + @wraps(method) + def new_method(self, *args, **kwargs): + [data_size, dtype, shape, is_bool, max_value, min_value, + avg_value, count, neg_zero_count, pos_zero_count, + nan_count, neg_inf_count, pos_inf_count, + zero_count], _ = parse_user_args(method, *args, **kwargs) + + check_uint64(data_size, "data_size") + type_check(dtype, (int,), "dtype") + shape_names = ["shape_{0}".format(i) for i in range(len(shape))] + type_check_list(shape, (int,), shape_names) + type_check(is_bool, (bool,), "is_bool") + type_check(max_value, (float,), "max_value") + type_check(min_value, (float,), "min_value") + type_check(avg_value, (float,), "avg_value") + type_check(count, (int,), "count") + type_check(neg_zero_count, (int,), "neg_zero_count") + type_check(pos_zero_count, (int,), "pos_zero_count") + type_check(nan_count, (int,), "nan_count") + type_check(neg_inf_count, (int,), "neg_inf_count") + type_check(pos_inf_count, (int,), "pos_inf_count") + type_check(zero_count, (int,), "zero_count") + + + return method(self, *args, **kwargs) + + return new_method def check_watchpoint_hit_init(method): """Wrapper method to check the parameters of DbgServices WatchpointHit init.""" diff --git a/tests/ut/data/dump/gpu_dumps/golden/sync_read_tensors_base_stat.expected b/tests/ut/data/dump/gpu_dumps/golden/sync_read_tensors_base_stat.expected new file mode 100644 index 00000000000..71c83ef5cfe --- /dev/null +++ b/tests/ut/data/dump/gpu_dumps/golden/sync_read_tensors_base_stat.expected @@ -0,0 +1,87 @@ +----------------------------------------------------------- +tensor_info_1 attributes: +node name = Default/Add-op4 +slot = 0 +iteration = 0 +rank_id = 0 +root_graph_id = 0 +is_output = True + +tensor_base_info: +size in bytes = 24 +debugger 
dtype = 11 +shape = [2, 3] +is_bool = False +max_value = 10.0 +min_value = -11.0 +avg_value = 0.880000114440918 +count = 6 +neg_zero_count = 2 +pos_zero_count = 3 +nan_count = 0 +neg_inf_count = 0 +pos_inf_count = 0 +zero_count = 1 +----------------------------------------------------------- +tensor_info_2 attributes: +node name = Default/Reciprocal-op3 +slot = 0 +iteration = 0 +rank_id = 0 +root_graph_id = 0 +is_output = True + +tensor_base_info: +size in bytes = 40 +debugger dtype = 11 +shape = [2, 5] + +tensor_stat_info: +size in bytes = 40 +debugger dtype = 11 +shape = [2, 5] +is_bool = False +max_value = 1.0 +min_value = 1.0 +avg_value = 1.0 +count = 10 +neg_zero_count = 0 +pos_zero_count = 2 +nan_count = 0 +neg_inf_count = 3 +pos_inf_count = 5 +zero_count = 0 +----------------------------------------------------------- +tensor_info_3 attributes: +node name = Default/network-WithLossCell/_backbone-MockModel/ReduceMean-op92 +slot = 0 +iteration = 0 +rank_id = 0 +root_graph_id = 0 +is_output = True + +tensor_base_info: +size in bytes = 20 +debugger dtype = 11 +shape = [5] + +tensor_stat_info: +size in bytes = 20 +debugger dtype = 11 +shape = [5] +is_bool = False +max_value = 1.9901361465454102 +min_value = -2.175431728363037 +avg_value = -0.6648297309875488 +count = 5 +neg_zero_count = 2 +pos_zero_count = 1 +nan_count = 2 +neg_inf_count = 0 +pos_inf_count = 0 +zero_count = 0 diff --git a/tests/ut/python/debugger/gpu_tests/test_sync_read_tensors_base_stat.py b/tests/ut/python/debugger/gpu_tests/test_sync_read_tensors_base_stat.py new file mode 100644 index 00000000000..0c7b429e7a8 --- /dev/null +++ b/tests/ut/python/debugger/gpu_tests/test_sync_read_tensors_base_stat.py @@ -0,0 +1,146 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +""" +Read tensor base and statistics test script for offline debugger APIs. +""" + +import tempfile +import os +import shutil +import numpy as np +import mindspore.offline_debug.dbg_services as d +from dump_test_utils import compare_actual_with_expected + +GENERATE_GOLDEN = False +test_name = "sync_read_tensors_base_stat" + + +def test_sync_read_tensors_base_stat(): + + value_tensor = np.array([[7.5, 8.56, -9.78], [10.0, -11.0, 0.0]], np.float32) + inf_tensor = np.array([[1., -np.inf, np.inf, -np.inf, np.inf], [np.inf, 1., -np.inf, np.inf, np.inf]], np.float32) + nan_tensor = np.array([-2.1754317, 1.9901361, np.nan, np.nan, -1.8091936], np.float32) + + value_path = build_dump_file_structure(value_tensor, "Add", "Add.Add-op4.0.0.") + inf_path = build_dump_file_structure(inf_tensor, "Inf", "Reciprocal.Reciprocal-op3.0.0.") + nan_path = build_dump_file_structure(nan_tensor, "Nan", "ReduceMean.ReduceMean-op92.0.0.") + + debugger_backend = d.DbgServices( + dump_file_path=value_path, verbose=True) + + _ = debugger_backend.initialize( + net_name="Add", is_sync_mode=True) + + debugger_backend_2 = d.DbgServices( + dump_file_path=inf_path, verbose=True) + + _ = debugger_backend_2.initialize( + net_name="Inf", is_sync_mode=True) + + debugger_backend_3 = d.DbgServices( + dump_file_path=nan_path, verbose=True) + + _ = debugger_backend_3.initialize( + net_name="Nan", is_sync_mode=True) + + info1 = d.TensorInfo(node_name="Default/Add-op4", + slot=0, iteration=0, rank_id=0, root_graph_id=0, 
is_output=True) + info2 = d.TensorInfo(node_name="Default/Reciprocal-op3", + slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=True) + info3 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-MockModel/ReduceMean-op92", + slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=True) + + + tensor_info_1 = [info1] + tensor_info_2 = [info2] + tensor_info_3 = [info3] + tensor_info = [info1, info2, info3] + + tensor_base_data_list = debugger_backend.read_tensor_base(tensor_info_1) + tensor_base_data_list_2 = debugger_backend_2.read_tensor_base(tensor_info_2) + tensor_base_data_list.extend(tensor_base_data_list_2) + tensor_base_data_list_3 = debugger_backend_3.read_tensor_base(tensor_info_3) + tensor_base_data_list.extend(tensor_base_data_list_3) + + tensor_stat_data_list = debugger_backend.read_tensor_stats(tensor_info_1) + tensor_stat_data_list_2 = debugger_backend_2.read_tensor_stats(tensor_info_2) + tensor_stat_data_list.extend(tensor_stat_data_list_2) + tensor_stat_data_list_3 = debugger_backend_3.read_tensor_stats(tensor_info_3) + tensor_stat_data_list.extend(tensor_stat_data_list_3) + + shutil.rmtree(value_path) + shutil.rmtree(inf_path) + shutil.rmtree(nan_path) + print_read_tensors(tensor_info, tensor_base_data_list, tensor_stat_data_list) + assert compare_actual_with_expected(test_name) + + +def build_dump_file_structure(tensor_array, net_name, tensor_name): + debugger_temp_dir = tempfile.mkdtemp(prefix=net_name, dir="./") + print(debugger_temp_dir) + path = os.path.join(debugger_temp_dir, "rank_0", net_name, "0", "0") + print(path) + os.makedirs(path, exist_ok=True) + file = tempfile.mkstemp(prefix=tensor_name, suffix=".output.0.DefaultFormat.npy", dir=path) + full_path = file[1] + np.save(full_path, tensor_array) + + return debugger_temp_dir + +def print_read_tensors(tensor_info, tensor_base_data_list, tensor_stat_data_list): + """Print read tensors info.""" + if GENERATE_GOLDEN: + f_write = open(test_name + ".expected", "w") + else: 
+ f_write = open(test_name + ".actual", "w") + + for x, _ in enumerate(tensor_info): + f_write.write( + "-----------------------------------------------------------\n") + f_write.write("tensor_info_" + str(x+1) + " attributes:\n") + f_write.write("node name = " + tensor_info[x].node_name + "\n") + f_write.write("slot = " + str(tensor_info[x].slot) + "\n") + f_write.write("iteration = " + str(tensor_info[x].iteration) + "\n") + f_write.write("rank_id = " + str(tensor_info[x].rank_id) + "\n") + f_write.write("root_graph_id = " + + str(tensor_info[x].root_graph_id) + "\n") + f_write.write("is_output = " + + str(tensor_info[x].is_output) + "\n") + f_write.write("\n") + f_write.write("tensor_base_info:\n") + f_write.write("size in bytes = " + + str(tensor_base_data_list[x].data_size) + "\n") + f_write.write("debugger dtype = " + str(tensor_base_data_list[x].dtype) + "\n") + f_write.write("shape = " + str(tensor_base_data_list[x].shape) + "\n") + + f_write.write("\n") + f_write.write("tensor_stat_info:\n") + + f_write.write("size in bytes = " + + str(tensor_stat_data_list[x].data_size) + "\n") + f_write.write("debugger dtype = " + str(tensor_stat_data_list[x].dtype) + "\n") + f_write.write("shape = " + str(tensor_stat_data_list[x].shape) + "\n") + f_write.write("is_bool = " + str(tensor_stat_data_list[x].is_bool) + "\n") + f_write.write("max_value = " + str(tensor_stat_data_list[x].max_value) + "\n") + f_write.write("min_value = " + str(tensor_stat_data_list[x].min_value) + "\n") + f_write.write("avg_value = " + str(tensor_stat_data_list[x].avg_value) + "\n") + f_write.write("count = " + str(tensor_stat_data_list[x].count) + "\n") + f_write.write("neg_zero_count = " + str(tensor_stat_data_list[x].neg_zero_count) + "\n") + f_write.write("pos_zero_count = " + str(tensor_stat_data_list[x].pos_zero_count) + "\n") + f_write.write("nan_count = " + str(tensor_stat_data_list[x].nan_count) + "\n") + f_write.write("neg_inf_count = " + str(tensor_stat_data_list[x].neg_inf_count) + 
"\n") + f_write.write("pos_inf_count = " + str(tensor_stat_data_list[x].pos_inf_count) + "\n") + f_write.write("zero_count = " + str(tensor_stat_data_list[x].zero_count) + "\n") + f_write.close()