From c3ab951fa9d5cc0f73aa0b9b1cda33bb43da04cb Mon Sep 17 00:00:00 2001 From: Jimmy Qi Date: Mon, 13 Dec 2021 22:53:46 +0000 Subject: [PATCH] Parallelize calculating tensor statistics --- .../ccsrc/debug/debugger/tensor_summary.cc | 60 +++++++++++++++++-- .../ccsrc/debug/debugger/tensor_summary.h | 1 + .../read_tensors_base_stat_expected.json | 2 +- 3 files changed, 58 insertions(+), 5 deletions(-) diff --git a/mindspore/ccsrc/debug/debugger/tensor_summary.cc b/mindspore/ccsrc/debug/debugger/tensor_summary.cc index f0cb9d50aca..7c003e6d187 100644 --- a/mindspore/ccsrc/debug/debugger/tensor_summary.cc +++ b/mindspore/ccsrc/debug/debugger/tensor_summary.cc @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -160,7 +161,59 @@ void TensorSummary::TensorStatistics(DbgDataType dtype_value) { if (dtype_value == DT_BOOL) { is_bool_ = true; } - double sum_elements = 0.0; + const int default_threads = 32; + const int default_elements_per_thread = 10000; + + if (num_elements_ <= default_elements_per_thread) { + return TensorStatisticsSingleThread(); + } + int desired_threads = num_elements_ / default_elements_per_thread; + int actual_threads = std::min(desired_threads, default_threads); + int actual_elements_per_thread = num_elements_ / actual_threads; + + // Use multithread to calculate statistic on chunks of data + void *previous_tensor_ptr = nullptr; + size_t offset = 0; + std::vector>> summary_vec; + std::vector> summary_future_vec; + for (int i = 0; i < actual_threads; i++) { + int num_elements_for_thread; + if (i == actual_threads - 1) { + num_elements_for_thread = num_elements_ - offset; + } else { + num_elements_for_thread = actual_elements_per_thread; + } + summary_vec.emplace_back(std::make_unique>(current_tensor_ptr_ + offset, previous_tensor_ptr, + num_elements_for_thread, 0)); + summary_future_vec.emplace_back( + std::async(std::launch::async, &TensorSummary::TensorStatisticsSingleThread, summary_vec[i].get())); + offset += num_elements_for_thread; + } + + // Aggregate results of all chunks + num_elements_ = 0; // Let current tensor weight 0 in the aggregation + for (unsigned int i = 0; i < summary_future_vec.size(); i++) { + summary_future_vec[i].wait(); + summary_future_vec[i].get(); + auto &cur_summary = *(summary_vec[i]); + num_elements_ += cur_summary.num_elements_; + min_ = std::min(min_, cur_summary.min_); + max_ = std::max(max_, cur_summary.max_); + double avg_delta = cur_summary.avg_ - avg_; + avg_ += avg_delta * (cur_summary.num_elements_ / num_elements_); + neg_zero_count_ += cur_summary.neg_zero_count_; + pos_zero_count_ += cur_summary.pos_zero_count_; + neg_inf_count_ += cur_summary.neg_inf_count_; + pos_inf_count_ += cur_summary.pos_inf_count_; + inf_count_ += cur_summary.inf_count_; + nan_count_ += cur_summary.nan_count_; + zero_count_ += cur_summary.zero_count_; + } +} + +template +void TensorSummary::TensorStatisticsSingleThread() { + MeanCalculator mean_calc = MeanCalculator(); for (size_t i = 0; i < num_elements_; ++i) { auto current_value = static_cast(current_tensor_ptr_[i]); if (std::isinf(current_value)) { @@ -185,11 +238,10 @@ void TensorSummary::TensorStatistics(DbgDataType dtype_value) { } max_ = std::max(max_, current_value); min_ = std::min(min_, current_value); - sum_elements += current_value; + mean_calc.ProcessElement(current_value); } } - unsigned int value_count = zero_count_ + neg_zero_count_ + pos_zero_count_; - avg_ = sum_elements / value_count; + avg_ = mean_calc.GetMean(); } template diff --git a/mindspore/ccsrc/debug/debugger/tensor_summary.h b/mindspore/ccsrc/debug/debugger/tensor_summary.h index 080c88b27f5..1b0dc4a0840 100644 --- a/mindspore/ccsrc/debug/debugger/tensor_summary.h +++ b/mindspore/ccsrc/debug/debugger/tensor_summary.h @@ -161,6 +161,7 @@ class TensorSummary : public ITensorSummary { double_t StatLookup(const DebugServices::watchpoint_t &); double_t StatLookup(const std::string &, const DebugServices::watchpoint_t &); double_t GetZeroValPercent(); + void TensorStatisticsSingleThread(); void InitCalculators(const std::vector &); }; #ifdef ONLINE_DBG_MODE diff --git a/tests/ut/data/dump/gpu_dumps/golden/read_tensors_base_stat_expected.json b/tests/ut/data/dump/gpu_dumps/golden/read_tensors_base_stat_expected.json index f6dfb2d27a6..32213dc4eb6 100644 --- a/tests/ut/data/dump/gpu_dumps/golden/read_tensors_base_stat_expected.json +++ b/tests/ut/data/dump/gpu_dumps/golden/read_tensors_base_stat_expected.json @@ -27,7 +27,7 @@ "is_bool": false, "max_vaue": 10.0, "min_value": -11.0, - "avg_value": 0.880000114440918, + "avg_value": 0.8800001144409179, "count": 6, "neg_zero_count": 2, "pos_zero_count": 3,