From 332e0dbb0f4e691d05fe04f107db581440e3a07f Mon Sep 17 00:00:00 2001
From: sabrinasun
Date: Fri, 22 Oct 2021 06:28:58 +0800
Subject: [PATCH] add overflow st test and check_dump_structure test

---
 .../golden/overflow_watchpoint_expected.json  |  14 ++
 tests/st/debugger/test_watchpoints.py         |  87 +++++++++++-
 tests/st/dump/dump_test_utils.py              |  62 +++++++++
 tests/st/dump/test_data_dump.py               | 129 +++++++++++++++++-
 4 files changed, 288 insertions(+), 4 deletions(-)
 create mode 100644 tests/st/debugger/golden/overflow_watchpoint_expected.json

diff --git a/tests/st/debugger/golden/overflow_watchpoint_expected.json b/tests/st/debugger/golden/overflow_watchpoint_expected.json
new file mode 100644
index 00000000000..1773b7daf66
--- /dev/null
+++ b/tests/st/debugger/golden/overflow_watchpoint_expected.json
@@ -0,0 +1,14 @@
+[
+    {
+        "watchpoint_hit1": {
+            "name": "Default/Add-op0",
+            "slot": 0,
+            "condition": 2,
+            "watchpoint_id": 1,
+            "parameter": [],
+            "error_code": 0,
+            "rank_id": 0,
+            "root_graph_id": 0
+        }
+    }
+]
\ No newline at end of file
diff --git a/tests/st/debugger/test_watchpoints.py b/tests/st/debugger/test_watchpoints.py
index 400ea07589f..66e4ee32824 100644
--- a/tests/st/debugger/test_watchpoints.py
+++ b/tests/st/debugger/test_watchpoints.py
@@ -20,8 +20,8 @@ import os
 import json
 import tempfile
 import numpy as np
-import mindspore.offline_debug.dbg_services as d
 import pytest
+import mindspore.offline_debug.dbg_services as d
 from tests.security_utils import security_off_wrap
 from dump_test_utils import build_dump_structure
 
@@ -70,7 +70,7 @@ def run_watchpoints(is_sync):
         temp_dir = build_dump_structure(tmp_dir, tensor_name, tensor_list, "Test", tensor_info)
 
     debugger_backend = d.DbgServices(dump_file_path=temp_dir)
-    debugger_backend.initialize(net_name="Test", is_sync_mode=False)
+    debugger_backend.initialize(net_name="Test", is_sync_mode=is_sync)
 
     # NOTES:
     # -> watch_condition=6 is MIN_LT
@@ -146,6 +146,89 @@ def test_async_watchpoints():
     run_watchpoints(False)
 
 
+def run_overflow_watchpoint(is_overflow):
+    test_name = "overflow_watchpoint"
+    tensor = np.array([65504, 65504], np.float16)
+    task_id = 2
+    stream_id = 7
+    pwd = os.getcwd()
+    with tempfile.TemporaryDirectory(dir=pwd) as tmp_dir:
+        path = os.path.join(tmp_dir, "rank_0", "Add", "0", "0")
+        os.makedirs(path, exist_ok=True)
+        add_file = tempfile.mkstemp(prefix="Add.Default_Add-op0."+str(task_id)+"."+str(stream_id)+
+                                    ".1", dir=path)
+        with open(add_file[1], 'wb') as add_f:
+            add_f.write(b'1')
+            add_f.seek(8)
+            add_f.write(b'\n\x032.0\x10\x83\xf7\xef\x9f\x99\xc8\xf3\x02\x1a\x10\x08\x02\x10\x02\x1a\x03')
+            add_f.write(b'\n\x01\x020\x04:\x03\n\x01\x022\x0f')
+            add_f.write(b'Default/Add-op0')
+            add_f.write(tensor)
+        overflow_file = tempfile.mkstemp(prefix="Opdebug.Node_OpDebug."+str(task_id)+"."+str(stream_id)+
+                                         ".0", dir=path)
+        with open(overflow_file[1], 'wb') as f:
+            f.seek(321, 0)
+            byte_list = []
+            for i in range(256):
+                if i == 16:
+                    byte_list.append(stream_id)
+                elif i == 24:
+                    if is_overflow:
+                        byte_list.append(task_id)
+                    else:
+                        # wrong task_id, so no overflow watchpoint hit should be generated
+                        byte_list.append(task_id+1)
+                else:
+                    byte_list.append(0)
+            new_file_byte_array = bytearray(byte_list)
+            f.write(bytes(new_file_byte_array))
+        debugger_backend = d.DbgServices(dump_file_path=tmp_dir)
+        debugger_backend.initialize(net_name="Add", is_sync_mode=False)
+        debugger_backend.add_watchpoint(watchpoint_id=1, watch_condition=2,
+                                        check_node_list={"Default/Add-op0":
+                                                         {"rank_id": [0], "root_graph_id": [0], "is_output": True
+                                                          }}, parameter_list=[])
+
+        watchpoint_hits_test = debugger_backend.check_watchpoints(iteration=0)
+
+        if is_overflow:
+            assert len(watchpoint_hits_test) == 1
+            if GENERATE_GOLDEN:
+                print_watchpoint_hits(watchpoint_hits_test, 0, True, test_name)
+            else:
+                compare_expect_actual_result(watchpoint_hits_test, 0, test_name)
+        else:
+            assert not watchpoint_hits_test
+
+
+@pytest.mark.level0
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
+@security_off_wrap
+def test_async_overflow_watchpoints_hit():
+    """
+    Feature: Offline Debugger CheckWatchpoint
+    Description: Test checking an overflow watchpoint that is hit
+    Expectation: Overflow watchpoint is hit
+    """
+    run_overflow_watchpoint(True)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
+@security_off_wrap
+def test_async_overflow_watchpoints_not_hit():
+    """
+    Feature: Offline Debugger CheckWatchpoint
+    Description: Test checking an overflow watchpoint that is not hit
+    Expectation: Overflow watchpoint is not hit
+    """
+    run_overflow_watchpoint(False)
+
+
 def compare_expect_actual_result(watchpoint_hits_list, test_index, test_name):
     """Compare actual result with golden file."""
     pwd = os.getcwd()
diff --git a/tests/st/dump/dump_test_utils.py b/tests/st/dump/dump_test_utils.py
index 6a4b6bdf62f..3b3a0d64078 100644
--- a/tests/st/dump/dump_test_utils.py
+++ b/tests/st/dump/dump_test_utils.py
@@ -17,6 +17,7 @@ Utils for testing dump feature.
 """
 
 import json
+import os
 
 async_dump_dict = {
     "common_dump_settings": {
@@ -87,3 +88,64 @@ def generate_dump_json(dump_path, json_file_name, test_key):
             "Failed to generate dump json file. The test name value " + test_key + " is invalid.")
     with open(json_file_name, 'w') as f:
         json.dump(data, f)
+
+
+def generate_dump_json_with_overflow(dump_path, json_file_name, test_key, op):
+    """
+    Util function to generate a dump configuration json file with overflow detection enabled.
+    """
+    if test_key == "test_async_dump":
+        data = async_dump_dict
+        data["common_dump_settings"]["path"] = dump_path
+        data["common_dump_settings"]["op_debug_mode"] = op
+    else:
+        raise ValueError(
+            "Failed to generate dump json file. Overflow is only supported in async dump.")
+    with open(json_file_name, 'w') as f:
+        json.dump(data, f)
+
+
+def check_dump_structure(dump_path, json_file_path, num_card, num_graph, num_iteration):
+    """
+    Util to check if the dump structure is correct.
+    """
+    with open(json_file_path) as f:
+        data = json.load(f)
+    net_name = data["common_dump_settings"]["net_name"]
+    assert os.path.isdir(dump_path)
+    for rank_id in range(num_card):
+        rank_path = os.path.join(dump_path, "rank_"+str(rank_id))
+        assert os.path.exists(rank_path)
+
+        net_name_path = os.path.join(rank_path, net_name)
+        assert os.path.exists(net_name_path)
+        graph_path = os.path.join(rank_path, "graphs")
+        assert os.path.exists(graph_path)
+        execution_order_path = os.path.join(rank_path, "execution_order")
+        assert os.path.exists(execution_order_path)
+
+        for graph_id in range(num_graph):
+            graph_id_path = os.path.join(net_name_path, str(graph_id))
+            assert os.path.exists(graph_id_path)
+
+            graph_pb_file = os.path.join(graph_path, "ms_output_trace_code_graph_" + str(graph_id) + ".pb")
+            graph_ir_file = os.path.join(graph_path, "ms_output_trace_code_graph_" + str(graph_id) + ".ir")
+            assert os.path.exists(graph_pb_file)
+            assert os.path.exists(graph_ir_file)
+
+            execution_order_file = os.path.join(execution_order_path, "ms_execution_order_graph_" +
+                                                str(graph_id) + ".csv")
+            assert os.path.exists(execution_order_file)
+
+            for iteration_id in range(num_iteration):
+                it_id_path = os.path.join(graph_id_path, str(iteration_id))
+                assert os.path.isdir(it_id_path)
+
+
+def find_nth_pos(string, substring, n):
+    """Find the position of the nth occurrence of substring in string; return -1 if there is none."""
+    start = string.find(substring)
+    while n > 1 and start >= 0:
+        start = string.find(substring, start + len(substring))
+        n -= 1
+    return start
diff --git a/tests/st/dump/test_data_dump.py b/tests/st/dump/test_data_dump.py
index 9b7c5f39516..791078e8eac 100644
--- a/tests/st/dump/test_data_dump.py
+++ b/tests/st/dump/test_data_dump.py
@@ -32,7 +32,8 @@ from mindspore.nn import SoftmaxCrossEntropyWithLogits
 from mindspore.nn import Momentum
 from mindspore.nn import TrainOneStepCell
 from mindspore.nn import WithLossCell
-from dump_test_utils import generate_dump_json
+from dump_test_utils import generate_dump_json, generate_dump_json_with_overflow, \
+    check_dump_structure, find_nth_pos
 from tests.security_utils import security_off_wrap
 
 
@@ -67,8 +68,12 @@ def test_async_dump():
         shutil.rmtree(dump_path)
     add = Net()
     add(Tensor(x), Tensor(y))
-    time.sleep(5)
+    for _ in range(3):
+        if not os.path.exists(dump_file_path):
+            time.sleep(2)
+    check_dump_structure(dump_path, dump_config_path, 1, 1, 1)
     assert len(os.listdir(dump_file_path)) == 1
+    del os.environ['MINDSPORE_DUMP_CONFIG']
 
 
 def run_e2e_dump():
@@ -100,6 +105,11 @@ def run_e2e_dump():
     expect = np.array([[8, 10, 12], [14, 16, 18]], np.float32)
     assert output.dtype == expect.dtype
     assert np.array_equal(output, expect)
+    for _ in range(3):
+        if not os.path.exists(dump_file_path):
+            time.sleep(2)
+    check_dump_structure(dump_path, dump_config_path, 1, 1, 1)
+    del os.environ['MINDSPORE_DUMP_CONFIG']
 
 
 @pytest.mark.level0
@@ -122,6 +132,8 @@ def test_e2e_dump_with_hccl_env():
     os.environ["RANK_TABLE_FILE"] = "invalid_file.json"
     os.environ["RANK_ID"] = "4"
     run_e2e_dump()
+    del os.environ['RANK_TABLE_FILE']
+    del os.environ['RANK_ID']
 
 
 @pytest.mark.level0
@@ -142,6 +154,8 @@ def test_cpu_e2e_dump_with_hccl_set():
     os.environ["RANK_TABLE_FILE"] = "invalid_file.json"
     os.environ["RANK_ID"] = "4"
     run_e2e_dump()
+    del os.environ['RANK_TABLE_FILE']
+    del os.environ['RANK_ID']
 
 
 @pytest.mark.level0
@@ -162,6 +176,8 @@ def test_gpu_e2e_dump_with_hccl_set():
     os.environ["RANK_TABLE_FILE"] = "invalid_file.json"
     os.environ["RANK_ID"] = "4"
     run_e2e_dump()
+    del os.environ['RANK_TABLE_FILE']
+    del os.environ['RANK_ID']
 
 
 class ReluReduceMeanDenseRelu(Cell):
@@ -221,6 +237,7 @@ def test_async_dump_net_multi_layer_mode1():
             assert value.asnumpy() == dump_result[index]
     else:
         print('Failed to find hisi convert tools: msaccucmp.py or msaccucmp.pyc.')
+    del os.environ['MINDSPORE_DUMP_CONFIG']
 
 
 @pytest.mark.level0
@@ -247,6 +264,8 @@ def test_dump_with_diagnostic_path():
     add = Net()
     add(Tensor(x), Tensor(y))
     assert len(os.listdir(dump_file_path)) == 5
+    del os.environ['MINDSPORE_DUMP_CONFIG']
+    del os.environ['MS_DIAGNOSTIC_DATA_PATH']
 
 
 def run_e2e_dump_execution_graph():
@@ -265,6 +284,7 @@ def run_e2e_dump_execution_graph():
     add(Tensor(x), Tensor(y))
     exe_graph_path = os.path.join(dump_path, 'rank_0', 'execution_order')
     assert len(os.listdir(exe_graph_path)) == 1
+    del os.environ['MINDSPORE_DUMP_CONFIG']
 
 
 @pytest.mark.level0
@@ -275,3 +295,108 @@ def test_dump_with_execution_graph():
     """Test dump with execution graph on GPU."""
     context.set_context(mode=context.GRAPH_MODE, device_target='GPU')
     run_e2e_dump_execution_graph()
+
+
+def run_overflow_dump():
+    """Run async dump and generate an overflow."""
+    if sys.platform != 'linux':
+        return
+    pwd = os.getcwd()
+    overflow_x = np.array([60000, 60000]).astype(np.float16)
+    with tempfile.TemporaryDirectory(dir=pwd) as tmp_dir:
+        dump_path = os.path.join(tmp_dir, 'overflow_dump')
+        dump_config_path = os.path.join(tmp_dir, 'overflow_dump.json')
+        generate_dump_json_with_overflow(dump_path, dump_config_path, 'test_async_dump', 3)
+        os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
+        if os.path.isdir(dump_path):
+            shutil.rmtree(dump_path)
+        add = Net()
+        add(Tensor(overflow_x), Tensor(overflow_x))
+        exe_graph_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0')
+        for _ in range(5):
+            if not os.path.exists(exe_graph_path):
+                time.sleep(2)
+        check_dump_structure(dump_path, dump_config_path, 1, 1, 1)
+        # check that overflow dump generates exactly two files with the expected names
+        assert len(os.listdir(exe_graph_path)) == 2
+        output_files = glob.glob(os.path.join(exe_graph_path, "Add.Default_Add-op0.*.*.*"))
+        overflow_files = glob.glob(os.path.join(exe_graph_path, "Opdebug.Node_OpDebug.*.*.*"))
+        assert output_files and overflow_files
+        output_path, overflow_path = output_files[0], overflow_files[0]
+        # check that the two generated file names carry matching task and stream ids
+        output_file_name = os.path.basename(output_path)
+        overflow_file_name = os.path.basename(overflow_path)
+        output_second_dot_pos = find_nth_pos(output_file_name, ".", 2)
+        output_third_dot_pos = find_nth_pos(output_file_name, ".", 3)
+        output_fourth_dot_pos = find_nth_pos(output_file_name, ".", 4)
+        output_task_id = output_file_name[output_second_dot_pos+1:output_third_dot_pos]
+        output_stream_id = output_file_name[output_third_dot_pos+1:output_fourth_dot_pos]
+
+        overflow_second_dot_pos = find_nth_pos(overflow_file_name, ".", 2)
+        overflow_third_dot_pos = find_nth_pos(overflow_file_name, ".", 3)
+        overflow_fourth_dot_pos = find_nth_pos(overflow_file_name, ".", 4)
+        overflow_task_id = overflow_file_name[overflow_second_dot_pos+1:overflow_third_dot_pos]
+        overflow_stream_id = overflow_file_name[overflow_third_dot_pos+1:overflow_fourth_dot_pos]
+        assert output_task_id == overflow_task_id
+        assert output_stream_id == overflow_stream_id
+        # check that the overflow dump file records the same task and stream ids as the file name
+        with open(overflow_path, 'rb') as f:
+            f.seek(321, 0)
+            raw_data = f.read()
+            task_id_infile = int.from_bytes(raw_data[24:25], 'little')
+            stream_id_infile = int.from_bytes(raw_data[16:17], 'little')
+            assert output_task_id == str(task_id_infile)
+            assert output_stream_id == str(stream_id_infile)
+    del os.environ['MINDSPORE_DUMP_CONFIG']
+
+
+def run_not_overflow_dump():
+    """Run async dump without generating an overflow."""
+    if sys.platform != 'linux':
+        return
+    pwd = os.getcwd()
+    overflow_x = np.array([60000, 60000]).astype(np.float16)
+    overflow_y = np.array([2, 2]).astype(np.float16)
+    with tempfile.TemporaryDirectory(dir=pwd) as tmp_dir:
+        dump_path = os.path.join(tmp_dir, 'overflow_dump')
+        dump_config_path = os.path.join(tmp_dir, 'overflow_dump.json')
+        generate_dump_json_with_overflow(dump_path, dump_config_path, 'test_async_dump', 3)
+        os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
+        if os.path.isdir(dump_path):
+            shutil.rmtree(dump_path)
+        add = Net()
+        add(Tensor(overflow_x), Tensor(overflow_y))
+        exe_graph_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0')
+        # no overflow happens, so the dump path should not be generated
+        assert not os.path.exists(exe_graph_path)
+        del os.environ['MINDSPORE_DUMP_CONFIG']
+
+
+@pytest.mark.level0
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
+@security_off_wrap
+def test_ascend_overflow_dump():
+    """
+    Feature: Overflow Dump
+    Description: Test overflow dump
+    Expectation: Overflow occurs, and the overflow dump file is generated in the correct format
+    """
+    context.set_context(mode=context.GRAPH_MODE, device_target='Ascend')
+    run_overflow_dump()
+
+
+@pytest.mark.level0
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
+@security_off_wrap
+def test_ascend_not_overflow_dump():
+    """
+    Feature: Overflow Dump
+    Description: Test overflow dump when no overflow occurs
+    Expectation: Overflow does not occur, and no overflow dump file is generated
+    """
+    context.set_context(mode=context.GRAPH_MODE, device_target='Ascend')
+    run_not_overflow_dump()
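
Reviewer note (commentary appended after the diff, not part of the patch): the overflow
checks above rely on a convention that is worth spelling out. Both run_overflow_watchpoint
and run_overflow_dump treat the Opdebug.Node_OpDebug file as containing a 256-byte debug
record starting at file offset 321, with the stream id as a single byte at record offset 16
and the task id as a single byte at record offset 24; the same two ids appear as the third
and fourth dot-separated fields of the dump file names. Below is a minimal sketch of that
convention, assuming exactly the layout these tests fabricate. The helper names
(RECORD_OFFSET, parse_overflow_record, parse_ids_from_name) are hypothetical, and the
321/16/24 offsets come from this patch rather than any public Ascend specification:

    import os

    RECORD_OFFSET = 321  # start of the debug record, as written by the tests (assumption)

    def parse_overflow_record(overflow_path):
        """Return (task_id, stream_id) stored inside an Opdebug.Node_OpDebug file."""
        with open(overflow_path, 'rb') as f:
            f.seek(RECORD_OFFSET)
            record = f.read(256)
        # single bytes at record offsets 24 and 16, matching the asserts in run_overflow_dump
        return record[24], record[16]

    def parse_ids_from_name(file_name):
        """Split 'Opdebug.Node_OpDebug.<task>.<stream>.<timestamp>' into (task_id, stream_id)."""
        parts = os.path.basename(file_name).split('.')
        return int(parts[2]), int(parts[3])

If the record layout ever changes on the Ascend side, run_overflow_watchpoint and
run_overflow_dump must be updated together, since both hard-code these offsets.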