add overflow st test and checkdumpstructure test

2021-10-22 06:28:58 +08:00 · 2021-10-22 06:28:58 +08:00 · 332e0dbb0f
parent b414db49b6
commit 332e0dbb0f
4 changed files with 286 additions and 4 deletions
--- a/tests/st/debugger/golden/overflow_watchpoint_expected.json
+++ b/tests/st/debugger/golden/overflow_watchpoint_expected.json
@ -0,0 +1,14 @@
+[
+    {
+        "watchpoint_hit1": {
+            "name": "Default/Add-op0",
+            "slot": 0,
+            "condition": 2,
+            "watchpoint_id": 1,
+            "parameter": [],
+            "error_code": 0,
+            "rank_id": 0,
+            "root_graph_id": 0
+        }
+    }
+]
--- a/tests/st/debugger/test_watchpoints.py
+++ b/tests/st/debugger/test_watchpoints.py
@ -20,8 +20,8 @@ import os
 import json
 import tempfile
 import numpy as np
-import mindspore.offline_debug.dbg_services as d
 import pytest
+import mindspore.offline_debug.dbg_services as d
 from tests.security_utils import security_off_wrap
 from dump_test_utils import build_dump_structure

@ -70,7 +70,7 @@ def run_watchpoints(is_sync):
        temp_dir = build_dump_structure(tmp_dir, tensor_name, tensor_list, "Test", tensor_info)

        debugger_backend = d.DbgServices(dump_file_path=temp_dir)
-        debugger_backend.initialize(net_name="Test", is_sync_mode=False)
+        debugger_backend.initialize(net_name="Test", is_sync_mode=is_sync)

        # NOTES:
        # -> watch_condition=6 is MIN_LT
@ -146,6 +146,89 @@ def test_async_watchpoints():
    run_watchpoints(False)


+def run_overflow_watchpoint(is_overflow):
+    """
+    Build a minimal async dump by hand and check the overflow watchpoint against it.
+
+    Two files are written under ``rank_0/Add/0/0`` of a temporary directory: a
+    dump file for ``Default/Add-op0`` and an ``Opdebug.Node_OpDebug`` file whose
+    256-byte record (written at offset 321) stores the stream id at byte 16 and
+    a task id at byte 24.
+
+    Args:
+        is_overflow (bool): If True, the record carries the same task id as the
+            Add dump file and exactly one watchpoint hit is expected. If False,
+            a different task id is written and no hit is expected.
+    """
+    test_name = "overflow_watchpoint"
+    tensor = np.array([65504, 65504], np.float16)
+    task_id = 2
+    stream_id = 7
+    id_part = str(task_id) + "." + str(stream_id)
+    pwd = os.getcwd()
+    with tempfile.TemporaryDirectory(dir=pwd) as tmp_dir:
+        path = os.path.join(tmp_dir, "rank_0", "Add", "0", "0")
+        os.makedirs(path, exist_ok=True)
+        # mkstemp returns an already-open OS-level descriptor; wrap it with
+        # os.fdopen so it is closed together with the file object instead of
+        # being leaked while the file is reopened by path.
+        add_fd, _ = tempfile.mkstemp(prefix="Add.Default_Add-op0." + id_part + ".1", dir=path)
+        with os.fdopen(add_fd, 'wb') as add_f:
+            # NOTE(review): the first byte is ASCII '1' (49), which equals the
+            # total length of the three byte strings written from offset 8 --
+            # presumably a length prefix; confirm against the dump file format.
+            add_f.write(b'1')
+            add_f.seek(8)
+            add_f.write(b'\n\x032.0\x10\x83\xf7\xef\x9f\x99\xc8\xf3\x02\x1a\x10\x08\x02\x10\x02\x1a\x03')
+            add_f.write(b'\n\x01\x020\x04:\x03\n\x01\x022\x0f')
+            add_f.write(b'Default/Add-op0')
+            add_f.write(tensor)
+        overflow_fd, _ = tempfile.mkstemp(prefix="Opdebug.Node_OpDebug." + id_part + ".0", dir=path)
+        with os.fdopen(overflow_fd, 'wb') as f:
+            f.seek(321, 0)
+            # 256 zero bytes, with only the stream id and task id filled in
+            record = bytearray(256)
+            record[16] = stream_id
+            # wrong task_id, should not generate overflow watchpoint hit
+            record[24] = task_id if is_overflow else task_id + 1
+            f.write(bytes(record))
+        debugger_backend = d.DbgServices(dump_file_path=tmp_dir)
+        debugger_backend.initialize(net_name="Add", is_sync_mode=False)
+        debugger_backend.add_watchpoint(watchpoint_id=1, watch_condition=2,
+                                        check_node_list={"Default/Add-op0":
+                                                         {"rank_id": [0], "root_graph_id": [0], "is_output": True
+                                                         }}, parameter_list=[])
+
+        watchpoint_hits_test = debugger_backend.check_watchpoints(iteration=0)
+
+        if is_overflow:
+            assert len(watchpoint_hits_test) == 1
+            if GENERATE_GOLDEN:
+                print_watchpoint_hits(watchpoint_hits_test, 0, True, test_name)
+            else:
+                compare_expect_actual_result(watchpoint_hits_test, 0, test_name)
+        else:
+            assert not watchpoint_hits_test
+
+
+@pytest.mark.level0
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
+@security_off_wrap
+def test_async_overflow_watchpoints_hit():
+    """
+    Feature: Offline Debugger CheckWatchpoint
+    Description: Check the overflow watchpoint when the overflow record has the same task id as the dumped op
+    Expectation: Overflow watchpoint is hit
+    """
+    run_overflow_watchpoint(True)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
+@security_off_wrap
+def test_async_overflow_watchpoints_not_hit():
+    """
+    Feature: Offline Debugger CheckWatchpoint
+    Description: Check the overflow watchpoint when the overflow record has a different task id than the dumped op
+    Expectation: Overflow watchpoint is not hit
+    """
+    run_overflow_watchpoint(False)
+
+
 def compare_expect_actual_result(watchpoint_hits_list, test_index, test_name):
    """Compare actual result with golden file."""
    pwd = os.getcwd()
--- a/tests/st/dump/dump_test_utils.py
+++ b/tests/st/dump/dump_test_utils.py
@ -17,6 +17,7 @@ Utils for testing dump feature.
 """

 import json
+import os

 async_dump_dict = {
    "common_dump_settings": {
@ -87,3 +88,63 @@ def generate_dump_json(dump_path, json_file_name, test_key):
            "Failed to generate dump json file. The test name value " + test_key + " is invalid.")
    with open(json_file_name, 'w') as f:
        json.dump(data, f)
+
+
+def generate_dump_json_with_overflow(dump_path, json_file_name, test_key, op):
+    """
+    Util function to generate an async dump configuration json file with op_debug_mode set.
+
+    Args:
+        dump_path (str): Value stored in ``common_dump_settings.path``.
+        json_file_name (str): Path of the json file to write.
+        test_key (str): Only "test_async_dump" is accepted.
+        op: Value stored in ``common_dump_settings.op_debug_mode``.
+
+    Raises:
+        ValueError: If test_key is not "test_async_dump".
+    """
+    import copy
+
+    if test_key != "test_async_dump":
+        raise ValueError(
+            "Failed to generate dump json file. Overflow only support in async dump")
+    # Work on a copy: writing into the module-level template would leave
+    # op_debug_mode (and the path) set for every later user of async_dump_dict.
+    data = copy.deepcopy(async_dump_dict)
+    data["common_dump_settings"]["path"] = dump_path
+    data["common_dump_settings"]["op_debug_mode"] = op
+    with open(json_file_name, 'w') as f:
+        json.dump(data, f)
+
+
+def check_dump_structure(dump_path, json_file_path, num_card, num_graph, num_iteration):
+    """
+    Assert that a dump directory has the expected layout.
+
+    The net name is read from ``common_dump_settings.net_name`` of the dump
+    config. For every rank the net, ``graphs`` and ``execution_order``
+    directories must exist; for every graph id the graph directory, the
+    ``.pb``/``.ir`` trace files and the execution order csv must exist; and
+    every iteration id must be a directory below the graph directory.
+
+    Args:
+        dump_path (str): Root directory of the dump.
+        json_file_path (str): Path of the dump configuration json file.
+        num_card (int): Number of ``rank_<id>`` directories to check.
+        num_graph (int): Number of graph ids to check per rank.
+        num_iteration (int): Number of iteration ids to check per graph.
+    """
+    with open(json_file_path) as config_file:
+        config = json.load(config_file)
+    net_name = config["common_dump_settings"]["net_name"]
+    assert os.path.isdir(dump_path)
+    for rank in range(num_card):
+        rank_dir = os.path.join(dump_path, "rank_" + str(rank))
+        assert os.path.exists(rank_dir)
+
+        net_dir = os.path.join(rank_dir, net_name)
+        graphs_dir = os.path.join(rank_dir, "graphs")
+        order_dir = os.path.join(rank_dir, "execution_order")
+        for required_dir in (net_dir, graphs_dir, order_dir):
+            assert os.path.exists(required_dir)
+
+        for graph in range(num_graph):
+            graph_tag = str(graph)
+            graph_dir = os.path.join(net_dir, graph_tag)
+            assert os.path.exists(graph_dir)
+
+            # graph trace files share one stem and differ only in extension
+            trace_stem = os.path.join(graphs_dir, "ms_output_trace_code_graph_" + graph_tag)
+            for extension in (".pb", ".ir"):
+                assert os.path.exists(trace_stem + extension)
+
+            order_csv = os.path.join(order_dir, "ms_execution_order_graph_" + graph_tag + ".csv")
+            assert os.path.exists(order_csv)
+
+            for iteration in range(num_iteration):
+                assert os.path.isdir(os.path.join(graph_dir, str(iteration)))
+
+
+def find_nth_pos(string, substring, n):
+    """
+    Return the index of the n-th non-overlapping occurrence of substring in string.
+
+    The result is -1 when there are fewer than n occurrences. For n <= 1 the
+    index of the first occurrence (or -1) is returned.
+    """
+    position = string.find(substring)
+    stride = len(substring)
+    remaining = n
+    while remaining > 1:
+        if position < 0:
+            break
+        position = string.find(substring, position + stride)
+        remaining -= 1
+    return position
--- a/tests/st/dump/test_data_dump.py
+++ b/tests/st/dump/test_data_dump.py
@ -32,7 +32,8 @@ from mindspore.nn import SoftmaxCrossEntropyWithLogits
 from mindspore.nn import Momentum
 from mindspore.nn import TrainOneStepCell
 from mindspore.nn import WithLossCell
-from dump_test_utils import generate_dump_json
+from dump_test_utils import generate_dump_json, generate_dump_json_with_overflow, \
+    check_dump_structure, find_nth_pos
 from tests.security_utils import security_off_wrap


@ -67,8 +68,12 @@ def test_async_dump():
            shutil.rmtree(dump_path)
        add = Net()
        add(Tensor(x), Tensor(y))
-        time.sleep(5)
+        for _ in range(3):
+            if not os.path.exists(dump_file_path):
+                time.sleep(2)
+        check_dump_structure(dump_path, dump_config_path, 1, 1, 1)
        assert len(os.listdir(dump_file_path)) == 1
+        del os.environ['MINDSPORE_DUMP_CONFIG']


 def run_e2e_dump():
@ -100,6 +105,11 @@ def run_e2e_dump():
        expect = np.array([[8, 10, 12], [14, 16, 18]], np.float32)
        assert output.dtype == expect.dtype
        assert np.array_equal(output, expect)
+        for _ in range(3):
+            if not os.path.exists(dump_file_path):
+                time.sleep(2)
+        check_dump_structure(dump_path, dump_config_path, 1, 1, 1)
+        del os.environ['MINDSPORE_DUMP_CONFIG']


@pytest.mark.level0
@ -122,6 +132,8 @@ def test_e2e_dump_with_hccl_env():
    os.environ["RANK_TABLE_FILE"] = "invalid_file.json"
    os.environ["RANK_ID"] = "4"
    run_e2e_dump()
+    del os.environ['RANK_TABLE_FILE']
+    del os.environ['RANK_ID']


@pytest.mark.level0
@ -142,6 +154,8 @@ def test_cpu_e2e_dump_with_hccl_set():
    os.environ["RANK_TABLE_FILE"] = "invalid_file.json"
    os.environ["RANK_ID"] = "4"
    run_e2e_dump()
+    del os.environ['RANK_TABLE_FILE']
+    del os.environ['RANK_ID']


@pytest.mark.level0
@ -162,6 +176,8 @@ def test_gpu_e2e_dump_with_hccl_set():
    os.environ["RANK_TABLE_FILE"] = "invalid_file.json"
    os.environ["RANK_ID"] = "4"
    run_e2e_dump()
+    del os.environ['RANK_TABLE_FILE']
+    del os.environ['RANK_ID']


 class ReluReduceMeanDenseRelu(Cell):
@ -221,6 +237,7 @@ def test_async_dump_net_multi_layer_mode1():
                assert value.asnumpy() == dump_result[index]
        else:
            print('Failed to find hisi convert tools: msaccucmp.py or msaccucmp.pyc.')
+        del os.environ['MINDSPORE_DUMP_CONFIG']


@pytest.mark.level0
@ -247,6 +264,8 @@ def test_dump_with_diagnostic_path():
        add = Net()
        add(Tensor(x), Tensor(y))
        assert len(os.listdir(dump_file_path)) == 5
+        del os.environ['MINDSPORE_DUMP_CONFIG']
+        del os.environ['MS_DIAGNOSTIC_DATA_PATH']


 def run_e2e_dump_execution_graph():
@ -265,6 +284,7 @@ def run_e2e_dump_execution_graph():
        add(Tensor(x), Tensor(y))
        exe_graph_path = os.path.join(dump_path, 'rank_0', 'execution_order')
        assert len(os.listdir(exe_graph_path)) == 1
+        del os.environ['MINDSPORE_DUMP_CONFIG']


@pytest.mark.level0
@ -275,3 +295,107 @@ def test_dump_with_execution_graph():
    """Test dump with execution graph on GPU."""
    context.set_context(mode=context.GRAPH_MODE, device_target='GPU')
    run_e2e_dump_execution_graph()
+
+
+def _get_task_and_stream_id(file_name):
+    """Return the text between the 2nd/3rd and between the 3rd/4th dots of a dump file name."""
+    second_dot = find_nth_pos(file_name, ".", 2)
+    third_dot = find_nth_pos(file_name, ".", 3)
+    fourth_dot = find_nth_pos(file_name, ".", 4)
+    return file_name[second_dot + 1:third_dot], file_name[third_dot + 1:fourth_dot]
+
+
+def run_overflow_dump():
+    """
+    Run async dump on an overflowing float16 addition and validate the dump files.
+
+    Checks that exactly two files are produced (the Add output and the OpDebug
+    overflow file), that both file names carry the same task and stream id,
+    and that the ids stored inside the overflow file match the file name.
+    Does nothing on non-linux platforms.
+    """
+    if sys.platform != 'linux':
+        return
+    pwd = os.getcwd()
+    overflow_x = np.array([60000, 60000]).astype(np.float16)
+    with tempfile.TemporaryDirectory(dir=pwd) as tmp_dir:
+        dump_path = os.path.join(tmp_dir, 'overflow_dump')
+        dump_config_path = os.path.join(tmp_dir, 'overflow_dump.json')
+        generate_dump_json_with_overflow(dump_path, dump_config_path, 'test_async_dump', 3)
+        os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
+        try:
+            if os.path.isdir(dump_path):
+                shutil.rmtree(dump_path)
+            add = Net()
+            add(Tensor(overflow_x), Tensor(overflow_x))
+            exe_graph_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0')
+            # the dump may appear with a delay; poll for up to 10 seconds
+            for _ in range(5):
+                if os.path.exists(exe_graph_path):
+                    break
+                time.sleep(2)
+            check_dump_structure(dump_path, dump_config_path, 1, 1, 1)
+            # check if overflow dump generate exact two files, and the naming format
+            assert len(os.listdir(exe_graph_path)) == 2
+            output_paths = glob.glob(os.path.join(exe_graph_path, "Add.Default_Add-op0.*.*.*"))
+            overflow_paths = glob.glob(os.path.join(exe_graph_path, "Opdebug.Node_OpDebug.*.*.*"))
+            # assert on the lists so a missing file fails here, not with an IndexError
+            assert output_paths
+            assert overflow_paths
+            output_path = output_paths[0]
+            overflow_path = overflow_paths[0]
+            # check if generated files have matching task and stream id
+            output_task_id, output_stream_id = _get_task_and_stream_id(os.path.split(output_path)[1])
+            overflow_task_id, overflow_stream_id = _get_task_and_stream_id(os.path.split(overflow_path)[1])
+            assert output_task_id == overflow_task_id
+            assert output_stream_id == overflow_stream_id
+            # check if overflow dump file contains same task and stream id as file name
+            with open(overflow_path, 'rb') as f:
+                f.seek(321, 0)
+                raw_data = f.read()
+                task_id_infile = int.from_bytes(raw_data[24:25], 'little')
+                stream_id_infile = int.from_bytes(raw_data[16:17], 'little')
+                assert output_task_id == str(task_id_infile)
+                assert output_stream_id == str(stream_id_infile)
+        finally:
+            # always drop the variable so a failing assertion cannot leak the
+            # dump configuration into tests that run afterwards
+            os.environ.pop('MINDSPORE_DUMP_CONFIG', None)
+
+
+def run_not_overflow_dump():
+    """
+    Run async dump on a float16 addition that does not overflow.
+
+    Asserts that the ``rank_0/Net/0/0`` dump directory is not created.
+    Does nothing on non-linux platforms.
+    """
+    if sys.platform != 'linux':
+        return
+    pwd = os.getcwd()
+    overflow_x = np.array([60000, 60000]).astype(np.float16)
+    overflow_y = np.array([2, 2]).astype(np.float16)
+    with tempfile.TemporaryDirectory(dir=pwd) as tmp_dir:
+        dump_path = os.path.join(tmp_dir, 'overflow_dump')
+        dump_config_path = os.path.join(tmp_dir, 'overflow_dump.json')
+        generate_dump_json_with_overflow(dump_path, dump_config_path, 'test_async_dump', 3)
+        os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
+        try:
+            if os.path.isdir(dump_path):
+                shutil.rmtree(dump_path)
+            add = Net()
+            add(Tensor(overflow_x), Tensor(overflow_y))
+            exe_graph_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0')
+            # check no overflow is happening, and path should not be generated
+            # NOTE(review): the check runs immediately after the call, without
+            # the polling used by the overflow case -- confirm a late dump
+            # cannot slip past it.
+            assert not os.path.exists(exe_graph_path)
+        finally:
+            # always drop the variable so a failing assertion cannot leak the
+            # dump configuration into tests that run afterwards
+            os.environ.pop('MINDSPORE_DUMP_CONFIG', None)
+
+@pytest.mark.level0
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
+@security_off_wrap
+def test_ascend_overflow_dump():
+    """
+    Feature: Overflow Dump
+    Description: Test overflow dump in graph mode on Ascend when the addition overflows
+    Expectation: Overflow occurs, and the overflow dump files are in the correct format
+    """
+    context.set_context(mode=context.GRAPH_MODE, device_target='Ascend')
+    run_overflow_dump()
+
+
+@pytest.mark.level0
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
+@security_off_wrap
+def test_ascend_not_overflow_dump():
+    """
+    Feature: Overflow Dump
+    Description: Test overflow dump in graph mode on Ascend when the addition does not overflow
+    Expectation: Overflow does not occur, and no overflow dump file is generated
+    """
+    context.set_context(mode=context.GRAPH_MODE, device_target='Ascend')
+    run_not_overflow_dump()