Add statistic dump for Ascend

Jimmy Qi 2021-12-02 16:44:33 +00:00
parent 811d7128c7
commit b9d1a4920c
7 changed files with 175 additions and 55 deletions

View File

@@ -382,9 +382,9 @@ void DumpJsonParser::ParseSavedData(const nlohmann::json &content) {
                       << saved_data_ << ". Please set saved_data to either statistic, tensor, or full";
   }
   auto context = MsContext::GetInstance();
-  if (IsStatisticDump() && context->get_param<std::string>(MS_CTX_DEVICE_TARGET) != kGPUDevice) {
-    MS_LOG(EXCEPTION) << "Dump Json parse failed, storing statistic dump is only supported on GPU device, please set "
-                         "saved_data to tensor or use a GPU device";
+  if (IsStatisticDump() && context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kCPUDevice) {
+    MS_LOG(EXCEPTION) << "Dump Json parse failed, storing statistic dump is only supported on GPU and Ascend, please "
+                         "set saved_data to tensor or use a GPU or Ascend device";
   }
 }
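Note that the check above now rejects only the CPU target, so both GPU and Ascend pass validation. For orientation, the two predicates it relies on map the three `saved_data` modes as follows; a minimal sketch, with a hypothetical member layout (the predicate names are the real ones used throughout this commit):

```cpp
#include <string>

// Sketch of the saved_data -> dump-mode mapping assumed by this commit:
// "full" turns on both the statistic CSV and the tensor npy dumps.
struct SavedDataModes {
  std::string saved_data_;  // "statistic", "tensor", or "full"
  bool IsStatisticDump() const { return saved_data_ == "statistic" || saved_data_ == "full"; }
  bool IsTensorDump() const { return saved_data_ == "tensor" || saved_data_ == "full"; }
};
```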

View File

@@ -19,6 +19,7 @@
 #include <unistd.h>
 #include <algorithm>
 #include <map>
+#include <memory>
 #include <set>
 #include <utility>
 #include <vector>
@@ -511,7 +512,7 @@ void E2eDump::DumpTensorToFile(const std::string &dump_path, const debugger::dum
   for (uint32_t slot = 0; slot < input_tensors.size(); slot++) {
     auto in_tensor = input_tensors[slot];
     std::string in_slot_path = in_path + std::to_string(slot) + ".";
-    auto succ = ConvertFormatForTensorAndDump(in_slot_path, in_tensor, data_ptr + offset);
+    auto succ = ConvertFormatForTensorAndDump(in_slot_path, in_tensor, data_ptr + offset, "input", slot);
     if (!succ) {
       MS_LOG(INFO) << "Failed to convert format for tensor " << in_slot_path;
     }
@@ -524,7 +525,7 @@ void E2eDump::DumpTensorToFile(const std::string &dump_path, const debugger::dum
   for (uint32_t slot = 0; slot < output_tensors.size(); slot++) {
     auto out_tensor = output_tensors[slot];
     std::string out_slot_path = out_path + std::to_string(slot) + ".";
-    auto succ = ConvertFormatForTensorAndDump(out_slot_path, out_tensor, data_ptr + offset);
+    auto succ = ConvertFormatForTensorAndDump(out_slot_path, out_tensor, data_ptr + offset, "output", slot);
     if (!succ) {
       MS_LOG(INFO) << "Failed to convert format for tensor " << out_slot_path;
     }
@@ -533,7 +534,40 @@ void E2eDump::DumpTensorToFile(const std::string &dump_path, const debugger::dum
 }
 
 template <typename T>
-bool E2eDump::ConvertFormatForTensorAndDump(std::string dump_path, const T &tensor, char *data_ptr) {
+bool DumpTensorStatsIfNeeded(const std::string &dump_path, const T &tensor, char *data_ptr, const std::string &io,
+                             uint32_t slot, const ShapeVector &shape, TypeId type) {
+  if (!DumpJsonParser::GetInstance().IsStatisticDump()) {
+    return true;
+  }
+  size_t pos = dump_path.rfind("/");
+  std::string file_name = dump_path.substr(pos + 1);
+  size_t first_dot = file_name.find(".");
+  size_t second_dot = file_name.find(".", first_dot + 1);
+  size_t third_dot = file_name.find(".", second_dot + 1);
+  size_t fourth_dot = file_name.find(".", third_dot + 1);
+  size_t fifth_dot = file_name.find(".", fourth_dot + 1);
+  std::string op_type = file_name.substr(0, first_dot);
+  std::string op_name = file_name.substr(first_dot + 1, second_dot - first_dot - 1);
+  std::string task_id = file_name.substr(second_dot + 1, third_dot - second_dot - 1);
+  std::string stream_id = file_name.substr(third_dot + 1, fourth_dot - third_dot - 1);
+  std::string timestamp = file_name.substr(fourth_dot + 1, fifth_dot - fourth_dot - 1);
+  TensorStatDump stat_dump(op_type, op_name, task_id, stream_id, timestamp, io, slot, slot);
+  std::shared_ptr<TensorData> data = std::make_shared<TensorData>();
+  try {
+    data->ConvertMsToDbgType(type);
+  } catch (...) {
+    MS_LOG(ERROR) << "Data type of operator " << file_name << " is not supported by statistic dump";
+    return false;
+  }
+  data->SetByteSize((size_t)tensor.size());
+  data->SetShape(shape);
+  data->SetDataPtr(data_ptr);
+  return stat_dump.DumpTensorStatsToFile(dump_path.substr(0, pos), data);
+}
+
+template <typename T>
+bool E2eDump::ConvertFormatForTensorAndDump(std::string dump_path, const T &tensor, char *data_ptr,
+                                            const std::string &io, uint32_t slot) {
   // get format
   auto iter_fmt = kFormatToStringMap.find(tensor.format());
   if (iter_fmt == kFormatToStringMap.end()) {
@@ -584,13 +618,21 @@ bool E2eDump::ConvertFormatForTensorAndDump(std::string dump_path, const T &tensor,
     }
   }
   // dump tensor data into npy file
-  bool dump_success = false;
+  bool dump_success = true;
   if (trans_success) {
-    dump_path += host_format;
-    dump_success = DumpJsonParser::DumpToFile(dump_path, trans_buf.data(), data_size, shape_to, src_type);
+    dump_success = DumpTensorStatsIfNeeded(dump_path, tensor, reinterpret_cast<char *>(trans_buf.data()), io, slot,
+                                           shape_to, src_type);
+    if (DumpJsonParser::GetInstance().IsTensorDump()) {
+      dump_path += host_format;
+      dump_success =
+        DumpJsonParser::DumpToFile(dump_path, trans_buf.data(), data_size, shape_to, src_type) && dump_success;
+    }
   } else {
-    dump_path += device_format;
-    dump_success = DumpJsonParser::DumpToFile(dump_path, data_ptr, data_size, shape_to, src_type);
+    dump_success = DumpTensorStatsIfNeeded(dump_path, tensor, data_ptr, io, slot, shape_to, src_type);
+    if (DumpJsonParser::GetInstance().IsTensorDump()) {
+      dump_path += device_format;
+      dump_success = DumpJsonParser::DumpToFile(dump_path, data_ptr, data_size, shape_to, src_type) && dump_success;
+    }
   }
   return dump_success;
 }
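The new `DumpTensorStatsIfNeeded` recovers operator metadata by splitting the base name of `dump_path` on its first five dots, following the `op_type.op_name.task_id.stream_id.timestamp` naming scheme of Ascend dump files. A standalone sketch of that split, using a hypothetical file name:

```cpp
#include <iostream>
#include <string>

// Demonstrates the dot-delimited split used by DumpTensorStatsIfNeeded on a
// made-up name of the form op_type.op_name.task_id.stream_id.timestamp.io.slot.
int main() {
  std::string file_name = "Add.Add-op1.0.0.1638463473402051.input.0.";
  const char *fields[] = {"op_type", "op_name", "task_id", "stream_id", "timestamp"};
  size_t start = 0;
  for (const char *field : fields) {
    size_t dot = file_name.find('.', start);
    std::cout << field << " = " << file_name.substr(start, dot - start) << "\n";
    start = dot + 1;
  }
  // Prints: op_type = Add, op_name = Add-op1, task_id = 0, stream_id = 0,
  //         timestamp = 1638463473402051
  return 0;
}
```

Note also the semantic change to `dump_success` above: it now starts out true and each enabled stage (statistics, then the optional npy dump) ANDs its own result in, so a statistic-only run no longer reports failure merely because the npy dump was skipped.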

View File

@@ -95,7 +95,8 @@ class E2eDump {
   static nlohmann::json ParseOverflowInfo(char *data_ptr);
 
   template <typename T>
-  static bool ConvertFormatForTensorAndDump(std::string dump_path, const T &tensor, char *data_ptr);
+  static bool ConvertFormatForTensorAndDump(std::string dump_path, const T &tensor, char *data_ptr,
+                                            const std::string &io, uint32_t slot);
 #endif
   inline static unsigned int starting_graph_id = INT32_MAX;

View File

@@ -41,7 +41,7 @@ bool CsvWriter::OpenFile(const std::string &path, const std::string &header) {
   }
   auto file_path = Common::CreatePrefixPath(path);
   if (!file_path.has_value()) {
-    MS_LOG(WARNING) << "CreatePrefixPath failed.";
+    MS_LOG(WARNING) << "CreatePrefixPath failed, skipping current statistics";
     return false;
   }
   // try to open file
@@ -55,7 +55,7 @@ bool CsvWriter::OpenFile(const std::string &path, const std::string &header) {
     file_.open(file_path_value, std::ios::out | std::ios::app | std::ios::binary);
   }
   if (!file_.is_open()) {
-    MS_LOG(WARNING) << "Open file " << path << " failed." << ErrnoToString(errno);
+    MS_LOG(WARNING) << "Open file " << file_path_value << " failed." << ErrnoToString(errno);
     return false;
   }
   if (first_time_opening) {
@@ -63,7 +63,7 @@ bool CsvWriter::OpenFile(const std::string &path, const std::string &header) {
     file_.flush();
     file_path_str_ = path;
   }
-  MS_LOG(INFO) << "Opened file: " << path;
+  MS_LOG(INFO) << "Opened file: " << file_path_value;
   return true;
 }
@@ -93,9 +93,9 @@ TensorStatDump::TensorStatDump(const std::string &op_type, const std::string &op_name,
                                size_t tensor_loader_slot)
     : op_type_{op_type},
       op_name_{op_name},
-      task_id_{task_id},
-      stream_id_{stream_id},
-      timestamp_{timestamp},
+      task_id_{std::to_string(task_id)},
+      stream_id_{std::to_string(stream_id)},
+      timestamp_{std::to_string(timestamp)},
       slot_{slot},
       tensor_loader_slot_{tensor_loader_slot} {
   if (input) {
@@ -105,6 +105,22 @@ TensorStatDump::TensorStatDump(const std::string &op_type, const std::string &op_name,
   }
 }
 
+TensorStatDump::TensorStatDump(const std::string &op_type, const std::string &op_name, const std::string &task_id,
+                               const std::string &stream_id, const std::string &timestamp, const std::string &io,
+                               size_t slot, size_t tensor_loader_slot)
+    : op_type_{op_type},
+      op_name_{op_name},
+      task_id_{task_id},
+      stream_id_{stream_id},
+      timestamp_{timestamp},
+      io_{io},
+      slot_{slot},
+      tensor_loader_slot_{tensor_loader_slot} {
+  if (io_ != kInput && io_ != kOutput) {
+    MS_LOG(EXCEPTION) << "Cannot instantiate TensorStatDump, io needs to be either " << kInput << " or " << kOutput;
+  }
+}
+
 bool TensorStatDump::OpenStatisticsFile(const std::string &dump_path) {
   std::string filename = dump_path + "/" + kCsvFileName;
   // try to open file
@@ -125,16 +141,24 @@ bool TensorStatDump::OpenStatisticsFile(const std::string &dump_path) {
 bool TensorStatDump::DumpTensorStatsToFile(const std::string &original_kernel_name, const std::string &dump_path,
                                            const Debugger *debugger) {
-  if (!OpenStatisticsFile(dump_path)) {
-    return false;
-  }
-  // get tensor statistics using debugger
+  // get tensor data using debugger
   std::string tensor_loader_name = original_kernel_name + ":" + std::to_string(tensor_loader_slot_);
   std::shared_ptr<TensorData> data = debugger->GetTensor(tensor_loader_name);
   if (data == nullptr) {
     MS_LOG(WARNING) << "Failed to find " << tensor_loader_name << " in tensor loader, skipping current statistics";
     return false;
   }
+  return DumpTensorStatsToFile(dump_path, data);
+}
+
+bool TensorStatDump::DumpTensorStatsToFile(const std::string &dump_path, std::shared_ptr<TensorData> data) {
+  if (data == nullptr) {
+    MS_LOG(WARNING) << "Tensor data is empty, skipping current statistics";
+    return false;
+  }
+  if (!OpenStatisticsFile(dump_path)) {
+    return false;
+  }
   const DebugServices::TensorStat &stat = DebugServices::GetTensorStatistics(data);
   // write tensor statistics to csv file
   std::ostringstream shape;
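Together with the new string-based constructor, the refactor leaves two entry points: the GPU e2e path still resolves the tensor through the debugger's tensor loader, while the Ascend path hands over a TensorData the caller assembled itself. A hedged sketch of the two call shapes (the include path, names, paths, and values are hypothetical):

```cpp
#include <memory>
#include "debug/data_dump/tensor_stat_dump.h"  // assumed include path

void DumpStatsBothWays(const mindspore::Debugger *debugger,
                       const std::shared_ptr<mindspore::TensorData> &data) {
  // GPU e2e dump: numeric ids, tensor looked up via the debugger.
  mindspore::TensorStatDump gpu_stat("Add", "Add-op1", 0, 0, 1638463473402051ULL,
                                     /*input=*/true, /*slot=*/0, /*tensor_loader_slot=*/0);
  gpu_stat.DumpTensorStatsToFile("Default/Add-op1", "/dump/rank_0/Net/0/0", debugger);
  // Ascend dump: ids arrive as strings parsed from the file name, io as "input"/"output".
  mindspore::TensorStatDump ascend_stat("Add", "Add-op1", "0", "0", "1638463473402051",
                                        "input", /*slot=*/0, /*tensor_loader_slot=*/0);
  ascend_stat.DumpTensorStatsToFile("/dump/rank_0/Net/0/0", data);
}
```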

View File

@@ -17,6 +17,7 @@
 #ifndef MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_TENSOR_STAT_DUMP_H_
 #define MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_TENSOR_STAT_DUMP_H_
 
+#include <memory>
 #include <string>
 #include <fstream>
@@ -24,6 +25,7 @@
 namespace mindspore {
 class Debugger;
+class TensorData;
 
 class CsvWriter {
  public:
  static CsvWriter &GetInstance() {
@@ -31,13 +33,6 @@ class CsvWriter {
     return instance;
   }
 
- private:
-  const std::string kSeparator = ",";
-  const std::string kEndLine = "\n";
-  std::ofstream file_;
-  std::string file_path_str_ = "";
-
- public:
   CsvWriter() = default;
   ~CsvWriter();
   DISABLE_COPY_AND_ASSIGN(CsvWriter)
@@ -45,28 +40,39 @@ class CsvWriter {
   void CloseFile();
   template <typename T>
   void WriteToCsv(const T &val, bool end_line = false);
+
+ private:
+  const std::string kSeparator = ",";
+  const std::string kEndLine = "\n";
+  std::ofstream file_;
+  std::string file_path_str_ = "";
 };
 
 class TensorStatDump {
-  static const char CSV_HEADER[];
-  static const char CSV_FILE_NAME[];
-  const std::string &op_type_;
-  const std::string &op_name_;
-  uint32_t task_id_;
-  uint32_t stream_id_;
-  uint64_t timestamp_;
-  std::string io_;
-  size_t slot_;
-  size_t tensor_loader_slot_;
-
  public:
   static bool OpenStatisticsFile(const std::string &dump_path);
 
   TensorStatDump(const std::string &op_type, const std::string &op_name, uint32_t task_id, uint32_t stream_id,
                  uint64_t timestamp, bool input, size_t slot, size_t tensor_loader_slot_);
+  TensorStatDump(const std::string &op_type, const std::string &op_name, const std::string &task_id,
+                 const std::string &stream_id, const std::string &timestamp, const std::string &io, size_t slot,
+                 size_t tensor_loader_slot);
+  bool DumpTensorStatsToFile(const std::string &dump_path, std::shared_ptr<TensorData> data);
   bool DumpTensorStatsToFile(const std::string &original_kernel_name, const std::string &dump_path,
                              const Debugger *debugger);
+
+ private:
+  static const char CSV_HEADER[];
+  static const char CSV_FILE_NAME[];
+  const std::string op_type_;
+  const std::string op_name_;
+  const std::string task_id_;
+  const std::string stream_id_;
+  const std::string timestamp_;
+  std::string io_;
+  size_t slot_;
+  size_t tensor_loader_slot_;
 };
 }  // namespace mindspore
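With CsvWriter's public surface now grouped ahead of its private state, its intended use reads directly off the header; a minimal usage sketch (the include path is assumed, and the header string here is illustrative — the real one is the CSV_HEADER constant defined in tensor_stat_dump.cc):

```cpp
#include <string>
#include "debug/data_dump/tensor_stat_dump.h"  // assumed include path

void WriteOneStatRow() {
  auto &csv = mindspore::CsvWriter::GetInstance();
  // OpenFile writes the header only the first time this path is opened.
  if (!csv.OpenFile("/tmp/statistic.csv", "Op Type,Op Name,IO,Min Value,Max Value\n")) {
    return;  // OpenFile already logged a warning
  }
  csv.WriteToCsv(std::string("Add"));     // fields are joined with kSeparator...
  csv.WriteToCsv(std::string("Add-op1"));
  csv.WriteToCsv(std::string("output"));
  csv.WriteToCsv(8);
  csv.WriteToCsv(18, /*end_line=*/true);  // ...and end_line appends kEndLine
  csv.CloseFile();
}
```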

View File

@@ -158,11 +158,15 @@ def generate_statistic_dump_json(dump_path, json_file_name, test_key, saved_data):
     """
     if test_key == "test_gpu_e2e_dump":
         data = e2e_dump_dict
-        data["common_dump_settings"]["path"] = dump_path
-        data["common_dump_settings"]["saved_data"] = saved_data
+    elif test_key == "test_async_dump":
+        data = async_dump_dict
+        data["common_dump_settings"]["input_output"] = 0
+        data["common_dump_settings"]["file_format"] = "npy"
     else:
         raise ValueError(
             "Failed to generate statistic dump json file. The test name value " + test_key + " is invalid.")
+    data["common_dump_settings"]["path"] = dump_path
+    data["common_dump_settings"]["saved_data"] = saved_data
     with open(json_file_name, 'w') as f:
         json.dump(data, f)
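With the shared assignments hoisted out of the branches, both scenarios get `path` and `saved_data` set the same way. For orientation, the generated async config has roughly this shape; only the keys touched in the function above are certain from this diff, the rest follow the usual MindSpore dump-config schema and are labeled as assumptions:

```python
# Hypothetical contents of the generated json for the 'test_async_dump'
# scenario; keys not set by generate_statistic_dump_json are assumed defaults.
async_dump_config = {
    "common_dump_settings": {
        "dump_mode": 0,                  # assumed
        "path": "/tmp/test_saved_data",  # set from dump_path
        "net_name": "Net",               # assumed
        "iteration": "all",              # assumed
        "saved_data": "statistic",       # set from saved_data
        "input_output": 0,               # set above: dump inputs and outputs
        "kernels": [],                   # assumed
        "op_debug_mode": 0,              # assumed
        "file_format": "npy",            # set above: host-readable npy files
    }
}
```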

View File

@@ -409,36 +409,37 @@ def check_statistic_dump(dump_file_path):
     real_path = os.path.realpath(output_path)
     with open(real_path) as f:
         reader = csv.DictReader(f)
-        input1 = next(reader)
+        stats = list(reader)
+        input1 = stats[0]
+        assert input1['IO'] == 'input'
         assert input1['Min Value'] == '1'
         assert input1['Max Value'] == '6'
-        input2 = next(reader)
+        input2 = stats[1]
+        assert input2['IO'] == 'input'
         assert input2['Min Value'] == '7'
         assert input2['Max Value'] == '12'
-        output = next(reader)
+        output = stats[2]
+        assert output['IO'] == 'output'
         assert output['Min Value'] == '8'
         assert output['Max Value'] == '18'
 
 
 def check_data_dump(dump_file_path):
-    output_name = "Add.Add-op*.0.0.*.output.0.DefaultFormat.npy"
+    output_name = "Add.Add-op*.output.0.*.npy"
     output_path = glob.glob(os.path.join(dump_file_path, output_name))[0]
     real_path = os.path.realpath(output_path)
     output = np.load(real_path)
     expect = np.array([[8, 10, 12], [14, 16, 18]], np.float32)
     assert np.array_equal(output, expect)
 
 
-def run_gpu_e2e_dump(saved_data):
-    """Run gpu e2e dump"""
+def run_saved_data_dump_test(scenario, saved_data):
+    """Run e2e dump on scenario, testing statistic dump"""
     if sys.platform != 'linux':
         return
     pwd = os.getcwd()
     with tempfile.TemporaryDirectory(dir=pwd) as tmp_dir:
-        dump_path = os.path.join(tmp_dir, 'gpu_e2e_dump')
-        dump_config_path = os.path.join(tmp_dir, 'gpu_e2e_dump.json')
-        generate_statistic_dump_json(dump_path, dump_config_path, 'test_gpu_e2e_dump', saved_data)
+        dump_path = os.path.join(tmp_dir, 'test_saved_data')
+        dump_config_path = os.path.join(tmp_dir, 'test_saved_data.json')
+        generate_statistic_dump_json(dump_path, dump_config_path, scenario, saved_data)
         os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
         dump_file_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0')
         if os.path.isdir(dump_path):
@@ -473,7 +474,7 @@ def test_gpu_e2e_statistic_dump():
     Expectation: Statistics are stored in statistic.csv files
     """
     context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
-    run_gpu_e2e_dump('statistic')
+    run_saved_data_dump_test('test_gpu_e2e_dump', 'statistic')
 
 
 @pytest.mark.level0
 @pytest.mark.platform_x86_gpu_training
@@ -486,7 +487,7 @@ def test_gpu_e2e_tensor_dump():
     Expectation: Tensor data are stored in npy files
     """
     context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
-    run_gpu_e2e_dump('tensor')
+    run_saved_data_dump_test('test_gpu_e2e_dump', 'tensor')
 
 
 @pytest.mark.level0
 @pytest.mark.platform_x86_gpu_training
@@ -499,4 +500,46 @@ def test_gpu_e2e_full_dump():
     Expectation: Tensor are stored in npy files and their statistics stored in statistic.csv
     """
     context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
-    run_gpu_e2e_dump('full')
+    run_saved_data_dump_test('test_gpu_e2e_dump', 'full')
+
+
+@pytest.mark.level0
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
+@security_off_wrap
+def test_ascend_statistic_dump():
+    """
+    Feature: Ascend Statistics Dump
+    Description: Test Ascend statistics dump
+    Expectation: Statistics are stored in statistic.csv files
+    """
+    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
+    run_saved_data_dump_test('test_async_dump', 'statistic')
+
+
+@pytest.mark.level0
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
+@security_off_wrap
+def test_ascend_tensor_dump():
+    """
+    Feature: Ascend Tensor Dump
+    Description: Test Ascend tensor dump
+    Expectation: Tensors are stored in npy files
+    """
+    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
+    run_saved_data_dump_test('test_async_dump', 'tensor')
+
+
+@pytest.mark.level0
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
+@security_off_wrap
+def test_ascend_full_dump():
+    """
+    Feature: Ascend Full Dump
+    Description: Test Ascend full dump
+    Expectation: Tensors are stored in npy files and their statistics stored in statistic.csv
+    """
+    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
+    run_saved_data_dump_test('test_async_dump', 'full')
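For reference, the rows check_statistic_dump walks over correspond to the Add network's two inputs and its output. Restricted to the columns the test actually asserts (the real statistic.csv carries the full CSV_HEADER column set), they look like:

```python
# Asserted subset of the statistic.csv rows for x + y, with
# x = [[1, 2, 3], [4, 5, 6]] and y = [[7, 8, 9], [10, 11, 12]] (inferred from
# the expected values; the actual network lives elsewhere in this test file).
expected_stats = [
    {'IO': 'input',  'Min Value': '1', 'Max Value': '6'},
    {'IO': 'input',  'Min Value': '7', 'Max Value': '12'},
    {'IO': 'output', 'Min Value': '8', 'Max Value': '18'},
]
```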