Fix the stream_id error for async dump

Revert "takedown dump related testcases to ensure gate stability". This reverts commit 24321ded4d.
2022-06-22 17:44:13 +08:00 · 2022-06-22 17:44:13 +08:00 · d3f2d391a7
parent 1f63418d09
commit d3f2d391a7
6 changed files with 41 additions and 25 deletions
--- a/mindspore/ccsrc/plugin/device/ascend/hal/device/dump/data_dumper.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/dump/data_dumper.cc
@ -290,6 +290,10 @@ void DataDumper::ConstructDumpTask(NotNull<const CNodePtr &> kernel, NotNull<aic
 }

 void DataDumper::SetOpDebugMappingInfo(const NotNull<aicpu::dump::OpMappingInfo *> dump_info) const {
+  if (!is_op_debug_) {
+    MS_LOG(INFO) << "[DataDump] Not op debug mode, no need to set OpDebugMappingInfo.";
+    return;
+  }
  MS_LOG(INFO) << "[DataDump] Add op debug info to OpMappingInfo, task id = " << debug_task_id_
               << ", stream id = " << debug_stream_id_;
  aicpu::dump::Task task;
@ -333,6 +337,7 @@ void DataDumper::OpDebugRegister() {
    return;
  }

+  is_op_debug_ = true;
  int64_t value = 0;
  rtError_t rt_ret = rtGetRtCapability(FEATURE_TYPE_MEMORY, MEMORY_INFO_TS_LIMITED, &value);
  if (rt_ret != RT_ERROR_NONE) {
--- a/mindspore/ccsrc/plugin/device/ascend/hal/device/dump/data_dumper.h
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/dump/data_dumper.h
@ -40,15 +40,16 @@ class DataDumper {
 public:
  DataDumper(const session::KernelGraph *kernel_graph, NotNull<std::function<void *()>> model_handle)
      : model_handle_(model_handle),
-        debug_task_id_(-1),
-        debug_stream_id_(-1),
+        debug_task_id_(0U),
+        debug_stream_id_(0U),
        op_debug_buffer_addr_(nullptr),
        op_debug_dump_args_(nullptr),
        load_flag_(false),
        dev_load_mem_(nullptr),
        dev_unload_mem_(nullptr),
        graph_id_(UINT32_MAX),
-        kernel_graph_(kernel_graph) {}
+        kernel_graph_(kernel_graph),
+        is_op_debug_(false) {}
  ~DataDumper();
  void set_runtime_info(const std::map<std::string, std::shared_ptr<RuntimeInfo>> &runtime_info) {
    runtime_info_map_ = runtime_info;
@ -88,6 +89,7 @@ class DataDumper {
  std::vector<std::string> dump_kernel_names_;
  const session::KernelGraph *kernel_graph_;
  std::map<std::string, std::shared_ptr<RuntimeInfo>> runtime_info_map_;
+  bool is_op_debug_;
 };
 }  // namespace ascend
 }  // namespace device
--- a/tests/st/dump/test_async_a_plus_m_dump.py
+++ b/tests/st/dump/test_async_a_plus_m_dump.py
@ -70,7 +70,7 @@ def run_async_dump(test_name):
        del os.environ['MINDSPORE_DUMP_CONFIG']


-@pytest.mark.level1
+@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@ -84,7 +84,7 @@ def test_async_dump_npy():
    run_async_dump("test_async_dump_npy")


-@pytest.mark.level1
+@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
--- a/tests/st/dump/test_cell_dump.py
+++ b/tests/st/dump/test_cell_dump.py
@ -75,7 +75,7 @@ def run_multi_layer_train(is_set_dump):
    train_network(inputs, label)


-@pytest.mark.level1
+@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@ -118,7 +118,7 @@ def test_ascend_cell_dump():
        del os.environ['MINDSPORE_DUMP_CONFIG']


-@pytest.mark.level1
+@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@ -150,7 +150,7 @@ def test_ascend_not_cell_dump():
        del os.environ['MINDSPORE_DUMP_CONFIG']


-@pytest.mark.level1
+@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@ -179,7 +179,7 @@ def test_ascend_cell_empty_dump():
        del os.environ['MINDSPORE_DUMP_CONFIG']


-@pytest.mark.level1
+@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@ -222,7 +222,7 @@ class OperateSymbolNet(Cell):
        return x


-@pytest.mark.level1
+@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
--- a/tests/st/dump/test_data_dump.py
+++ b/tests/st/dump/test_data_dump.py
@ -14,6 +14,7 @@
 # ============================================================================
 import os
 import sys
+import json
 import tempfile
 import time
 import shutil
@ -205,7 +206,7 @@ class ReluReduceMeanDenseRelu(Cell):
        return x_


-@pytest.mark.level1
+@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@ -345,12 +346,20 @@ def run_overflow_dump():
        overflow_stream_id = overflow_file_name[1][overflow_third_dot_pos+1:overflow_fourth_dot_pos]
        assert output_task_id == overflow_task_id
        assert output_stream_id == overflow_stream_id
+        # Convert the overflow file into json format
+        convert_tool = '/usr/local/Ascend/latest/tools/operator_cmp/compare/msaccucmp.py'
+        convert_to_json_cmd = 'python {0} convert -d {1} -out {2}'.format(convert_tool, overflow_path, exe_graph_path)
+        _ = os.system(convert_to_json_cmd)
+        overflow_json_path = overflow_path + '.output.0.json'
+        for _ in range(3):
+            if not os.path.exists(overflow_json_path):
+                time.sleep(2)
        # check if overflow dump file contains same task and stream id as file name
-        with open(overflow_path, 'rb') as f:
-            f.seek(321, 0)
-            raw_data = f.read()
-            task_id_infile = int.from_bytes(raw_data[24:25], 'little')
-            stream_id_infile = int.from_bytes(raw_data[16:17], 'little')
+        with open(overflow_json_path) as f:
+            file_content = json.load(f)
+            ai_core = file_content["AI Core"]
+            task_id_infile = ai_core["task_id"]
+            stream_id_infile = ai_core["stream_id"]
            assert output_task_id == str(task_id_infile)
            assert output_stream_id == str(stream_id_infile)
        del os.environ['MINDSPORE_DUMP_CONFIG']
@ -376,7 +385,7 @@ def run_not_overflow_dump():
        assert not os.path.exists(exe_graph_path)
        del os.environ['MINDSPORE_DUMP_CONFIG']

-@pytest.mark.level1
+@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@ -415,13 +424,13 @@ def check_statistic_dump(dump_file_path):
        num_tensors = len(stats)
        assert num_tensors == 3
        for tensor in stats:
-            if (tensor['IO'] == 'input' and tensor['Slot'] == 0):
+            if tensor['IO'] == 'input' and tensor['Slot'] == 0:
                assert tensor['Min Value'] == '1'
                assert tensor['Max Value'] == '6'
-            elif (tensor['IO'] == 'input' and tensor['Slot'] == 1):
+            elif tensor['IO'] == 'input' and tensor['Slot'] == 1:
                assert tensor['Min Value'] == '7'
                assert tensor['Max Value'] == '12'
-            elif (tensor['IO'] == 'output' and tensor['Slot'] == 0):
+            elif tensor['IO'] == 'output' and tensor['Slot'] == 0:
                assert tensor['Min Value'] == '8'
                assert tensor['Max Value'] == '18'

@ -548,7 +557,7 @@ def test_stat_dump_nulls():
            assert output['Avg Value'] == 'null'


-@pytest.mark.level1
+@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@ -581,7 +590,7 @@ def test_ascend_statistic_dump_kernel_by_kernel():
    del os.environ['GRAPH_OP_RUN']


-@pytest.mark.level1
+@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@ -596,7 +605,7 @@ def test_ascend_tensor_dump():
    run_saved_data_dump_test('test_async_dump', 'tensor')


-@pytest.mark.level1
+@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@ -642,7 +651,7 @@ class ConstantNet(nn.Cell):
        return self.relu(construct_tensor(ops.shape(x_)))


-@pytest.mark.level1
+@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
--- a/tests/st/dump/test_dump_format.py
+++ b/tests/st/dump/test_dump_format.py
@ -144,7 +144,7 @@ def test_ascend_kernel_by_kernel_trans_false():
    del os.environ['GRAPH_OP_RUN']


-@pytest.mark.level1
+@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard