diff --git a/tests/st/debugger/golden/overflow_watchpoint_expected.json b/tests/st/debugger/golden/overflow_watchpoint_expected.json new file mode 100644 index 00000000000..1773b7daf66 --- /dev/null +++ b/tests/st/debugger/golden/overflow_watchpoint_expected.json @@ -0,0 +1,14 @@ +[ + { + "watchpoint_hit1": { + "name": "Default/Add-op0", + "slot": 0, + "condition": 2, + "watchpoint_id": 1, + "parameter": [], + "error_code": 0, + "rank_id": 0, + "root_graph_id": 0 + } + } +] \ No newline at end of file diff --git a/tests/st/debugger/test_watchpoints.py b/tests/st/debugger/test_watchpoints.py index 400ea07589f..66e4ee32824 100644 --- a/tests/st/debugger/test_watchpoints.py +++ b/tests/st/debugger/test_watchpoints.py @@ -20,8 +20,8 @@ import os import json import tempfile import numpy as np -import mindspore.offline_debug.dbg_services as d import pytest +import mindspore.offline_debug.dbg_services as d from tests.security_utils import security_off_wrap from dump_test_utils import build_dump_structure @@ -70,7 +70,7 @@ def run_watchpoints(is_sync): temp_dir = build_dump_structure(tmp_dir, tensor_name, tensor_list, "Test", tensor_info) debugger_backend = d.DbgServices(dump_file_path=temp_dir) - debugger_backend.initialize(net_name="Test", is_sync_mode=False) + debugger_backend.initialize(net_name="Test", is_sync_mode=is_sync) # NOTES: # -> watch_condition=6 is MIN_LT @@ -146,6 +146,89 @@ def test_async_watchpoints(): run_watchpoints(False) + +def run_overflow_watchpoint(is_overflow): + test_name = "overflow_watchpoint" + tensor = np.array([65504, 65504], np.float16) + task_id = 2 + stream_id = 7 + pwd = os.getcwd() + with tempfile.TemporaryDirectory(dir=pwd) as tmp_dir: + path = os.path.join(tmp_dir, "rank_0", "Add", "0", "0") + os.makedirs(path, exist_ok=True) + add_fd, _ = tempfile.mkstemp(prefix="Add.Default_Add-op0."+str(task_id)+"."+str(stream_id)+ + ".1", dir=path) + with os.fdopen(add_fd, 'wb') as add_f: # wrap the fd mkstemp already opened so it is closed on exit + add_f.write(b'1') + add_f.seek(8) +
add_f.write(b'\n\x032.0\x10\x83\xf7\xef\x9f\x99\xc8\xf3\x02\x1a\x10\x08\x02\x10\x02\x1a\x03') + add_f.write(b'\n\x01\x020\x04:\x03\n\x01\x022\x0f') + add_f.write(b'Default/Add-op0') + add_f.write(tensor) + overflow_fd, _ = tempfile.mkstemp(prefix="Opdebug.Node_OpDebug."+str(task_id)+"." +str(stream_id)+ + ".0", dir=path) + with os.fdopen(overflow_fd, 'wb') as f: # wrap the fd mkstemp already opened so it is closed on exit + f.seek(321, 0) + byte_list = [] + for i in range(256): + if i == 16: + byte_list.append(stream_id) + elif i == 24: + if is_overflow: + byte_list.append(task_id) + else: + # wrong task_id, should not generate overflow watchpoint hit + byte_list.append(task_id+1) + else: + byte_list.append(0) + new_file_byte_array = bytearray(byte_list) + f.write(bytes(new_file_byte_array)) + debugger_backend = d.DbgServices(dump_file_path=tmp_dir) + debugger_backend.initialize(net_name="Add", is_sync_mode=False) + debugger_backend.add_watchpoint(watchpoint_id=1, watch_condition=2, + check_node_list={"Default/Add-op0": + {"rank_id": [0], "root_graph_id": [0], "is_output": True + }}, parameter_list=[]) + + watchpoint_hits_test = debugger_backend.check_watchpoints(iteration=0) + + if is_overflow: + assert len(watchpoint_hits_test) == 1 + if GENERATE_GOLDEN: + print_watchpoint_hits(watchpoint_hits_test, 0, True, test_name) + else: + compare_expect_actual_result(watchpoint_hits_test, 0, test_name) + else: + assert not watchpoint_hits_test + + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend_training +@pytest.mark.platform_x86_ascend_training +@pytest.mark.env_onecard +@security_off_wrap +def test_async_overflow_watchpoints_hit(): + """ + Feature: Offline Debugger CheckWatchpoint + Description: Test check overflow watchpoint hit + Expectation: Overflow watchpoint is hit + """ + run_overflow_watchpoint(True) + + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend_training +@pytest.mark.platform_x86_ascend_training +@pytest.mark.env_onecard +@security_off_wrap +def test_async_overflow_watchpoints_not_hit(): + """ + Feature:
Offline Debugger CheckWatchpoint + Description: Test check overflow watchpoint not hit + Expectation: Overflow watchpoint is not hit + """ + run_overflow_watchpoint(False) + + def compare_expect_actual_result(watchpoint_hits_list, test_index, test_name): """Compare actual result with golden file.""" pwd = os.getcwd() diff --git a/tests/st/dump/dump_test_utils.py b/tests/st/dump/dump_test_utils.py index 6a4b6bdf62f..3b3a0d64078 100644 --- a/tests/st/dump/dump_test_utils.py +++ b/tests/st/dump/dump_test_utils.py @@ -17,6 +17,7 @@ Utils for testing dump feature. """ import json +import os async_dump_dict = { "common_dump_settings": { @@ -87,3 +88,63 @@ def generate_dump_json(dump_path, json_file_name, test_key): "Failed to generate dump json file. The test name value " + test_key + " is invalid.") with open(json_file_name, 'w') as f: json.dump(data, f) + + +def generate_dump_json_with_overflow(dump_path, json_file_name, test_key, op): + """ + Util function to generate async dump configuration json file with op_debug_mode set to op. + """ + if test_key == "test_async_dump": + data = json.loads(json.dumps(async_dump_dict)) # deep copy: do not mutate the shared module-level dict + data["common_dump_settings"]["path"] = dump_path + data["common_dump_settings"]["op_debug_mode"] = op + else: + raise ValueError( + "Failed to generate dump json file. Overflow only support in async dump") + with open(json_file_name, 'w') as f: + json.dump(data, f) + + +def check_dump_structure(dump_path, json_file_path, num_card, num_graph, num_iteration): + """ + Util to check if the dump structure is correct.
+ """ + with open(json_file_path) as f: + data = json.load(f) + net_name = data["common_dump_settings"]["net_name"] + assert os.path.isdir(dump_path) + for rank_id in range(num_card): + rank_path = os.path.join(dump_path, "rank_"+str(rank_id)) + assert os.path.exists(rank_path) + + net_name_path = os.path.join(rank_path, net_name) + assert os.path.exists(net_name_path) + graph_path = os.path.join(rank_path, "graphs") + assert os.path.exists(graph_path) + execution_order_path = os.path.join(rank_path, "execution_order") + assert os.path.exists(execution_order_path) + + for graph_id in range(num_graph): + graph_id_path = os.path.join(net_name_path, str(graph_id)) + assert os.path.exists(graph_id_path) + + graph_pb_file = os.path.join(graph_path, "ms_output_trace_code_graph_" + str(graph_id) + ".pb") + graph_ir_file = os.path.join(graph_path, "ms_output_trace_code_graph_" + str(graph_id) + ".ir") + assert os.path.exists(graph_pb_file) + assert os.path.exists(graph_ir_file) + + execution_order_file = os.path.join(execution_order_path, "ms_execution_order_graph_" + + str(graph_id) + ".csv") + assert os.path.exists(execution_order_file) + + for iteration_id in range(num_iteration): + it_id_path = os.path.join(graph_id_path, str(iteration_id)) + assert os.path.isdir(it_id_path) + + +def find_nth_pos(string, substring, n): + start = string.find(substring) + while n > 1 and start >= 0: + start = string.find(substring, start + len(substring)) + n -= 1 + return start diff --git a/tests/st/dump/test_data_dump.py b/tests/st/dump/test_data_dump.py index 9b7c5f39516..791078e8eac 100644 --- a/tests/st/dump/test_data_dump.py +++ b/tests/st/dump/test_data_dump.py @@ -32,7 +32,8 @@ from mindspore.nn import SoftmaxCrossEntropyWithLogits from mindspore.nn import Momentum from mindspore.nn import TrainOneStepCell from mindspore.nn import WithLossCell -from dump_test_utils import generate_dump_json +from dump_test_utils import generate_dump_json, generate_dump_json_with_overflow, \ + 
check_dump_structure, find_nth_pos from tests.security_utils import security_off_wrap @@ -67,8 +68,12 @@ def test_async_dump(): shutil.rmtree(dump_path) add = Net() add(Tensor(x), Tensor(y)) - time.sleep(5) + for _ in range(3): + if not os.path.exists(dump_file_path): + time.sleep(2) + check_dump_structure(dump_path, dump_config_path, 1, 1, 1) assert len(os.listdir(dump_file_path)) == 1 + del os.environ['MINDSPORE_DUMP_CONFIG'] def run_e2e_dump(): @@ -100,6 +105,11 @@ def run_e2e_dump(): expect = np.array([[8, 10, 12], [14, 16, 18]], np.float32) assert output.dtype == expect.dtype assert np.array_equal(output, expect) + for _ in range(3): + if not os.path.exists(dump_file_path): + time.sleep(2) + check_dump_structure(dump_path, dump_config_path, 1, 1, 1) + del os.environ['MINDSPORE_DUMP_CONFIG'] @pytest.mark.level0 @@ -122,6 +132,8 @@ def test_e2e_dump_with_hccl_env(): os.environ["RANK_TABLE_FILE"] = "invalid_file.json" os.environ["RANK_ID"] = "4" run_e2e_dump() + del os.environ['RANK_TABLE_FILE'] + del os.environ['RANK_ID'] @pytest.mark.level0 @@ -142,6 +154,8 @@ def test_cpu_e2e_dump_with_hccl_set(): os.environ["RANK_TABLE_FILE"] = "invalid_file.json" os.environ["RANK_ID"] = "4" run_e2e_dump() + del os.environ['RANK_TABLE_FILE'] + del os.environ['RANK_ID'] @pytest.mark.level0 @@ -162,6 +176,8 @@ def test_gpu_e2e_dump_with_hccl_set(): os.environ["RANK_TABLE_FILE"] = "invalid_file.json" os.environ["RANK_ID"] = "4" run_e2e_dump() + del os.environ['RANK_TABLE_FILE'] + del os.environ['RANK_ID'] class ReluReduceMeanDenseRelu(Cell): @@ -221,6 +237,7 @@ def test_async_dump_net_multi_layer_mode1(): assert value.asnumpy() == dump_result[index] else: print('Failed to find hisi convert tools: msaccucmp.py or msaccucmp.pyc.') + del os.environ['MINDSPORE_DUMP_CONFIG'] @pytest.mark.level0 @@ -247,6 +264,8 @@ def test_dump_with_diagnostic_path(): add = Net() add(Tensor(x), Tensor(y)) assert len(os.listdir(dump_file_path)) == 5 + del os.environ['MINDSPORE_DUMP_CONFIG'] + del 
os.environ['MS_DIAGNOSTIC_DATA_PATH'] def run_e2e_dump_execution_graph(): @@ -265,6 +284,7 @@ def run_e2e_dump_execution_graph(): add(Tensor(x), Tensor(y)) exe_graph_path = os.path.join(dump_path, 'rank_0', 'execution_order') assert len(os.listdir(exe_graph_path)) == 1 + del os.environ['MINDSPORE_DUMP_CONFIG'] @pytest.mark.level0 @@ -275,3 +295,107 @@ def test_dump_with_execution_graph(): """Test dump with execution graph on GPU.""" context.set_context(mode=context.GRAPH_MODE, device_target='GPU') run_e2e_dump_execution_graph() + + +def run_overflow_dump(): + """Run async dump and generate overflow""" + if sys.platform != 'linux': + return + pwd = os.getcwd() + overflow_x = np.array([60000, 60000]).astype(np.float16) + with tempfile.TemporaryDirectory(dir=pwd) as tmp_dir: + dump_path = os.path.join(tmp_dir, 'overflow_dump') + dump_config_path = os.path.join(tmp_dir, 'overflow_dump.json') + generate_dump_json_with_overflow(dump_path, dump_config_path, 'test_async_dump', 3) + os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path + if os.path.isdir(dump_path): + shutil.rmtree(dump_path) + add = Net() + add(Tensor(overflow_x), Tensor(overflow_x)) + exe_graph_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0') + for _ in range(5): + if not os.path.exists(exe_graph_path): + time.sleep(2) + check_dump_structure(dump_path, dump_config_path, 1, 1, 1) + # check that the overflow dump generates exactly two files with the expected naming format + assert len(os.listdir(exe_graph_path)) == 2 + output_path = glob.glob(os.path.join(exe_graph_path, "Add.Default_Add-op0.*.*.*"))[0] + overflow_path = glob.glob(os.path.join(exe_graph_path, "Opdebug.Node_OpDebug.*.*.*"))[0] + assert output_path + assert overflow_path + # check if generated files have matching task and stream id + output_file_name = os.path.split(output_path) + overflow_file_name = os.path.split(overflow_path) + output_second_dot_pos = find_nth_pos(output_file_name[1], ".", 2) + output_third_dot_pos =
find_nth_pos(output_file_name[1], ".", 3) + output_fourth_dot_pos = find_nth_pos(output_file_name[1], ".", 4) + output_task_id = output_file_name[1][output_second_dot_pos+1:output_third_dot_pos] + output_stream_id = output_file_name[1][output_third_dot_pos+1:output_fourth_dot_pos] + + overflow_second_dot_pos = find_nth_pos(overflow_file_name[1], ".", 2) + overflow_third_dot_pos = find_nth_pos(overflow_file_name[1], ".", 3) + overflow_fourth_dot_pos = find_nth_pos(overflow_file_name[1], ".", 4) + overflow_task_id = overflow_file_name[1][overflow_second_dot_pos+1:overflow_third_dot_pos] + overflow_stream_id = overflow_file_name[1][overflow_third_dot_pos+1:overflow_fourth_dot_pos] + assert output_task_id == overflow_task_id + assert output_stream_id == overflow_stream_id + # check if overflow dump file contains same task and stream id as file name + with open(overflow_path, 'rb') as f: + f.seek(321, 0) + raw_data = f.read() + task_id_infile = int.from_bytes(raw_data[24:25], 'little') + stream_id_infile = int.from_bytes(raw_data[16:17], 'little') + assert output_task_id == str(task_id_infile) + assert output_stream_id == str(stream_id_infile) + del os.environ['MINDSPORE_DUMP_CONFIG'] + + +def run_not_overflow_dump(): + """Run async dump and not generate overflow""" + if sys.platform != 'linux': + return + pwd = os.getcwd() + overflow_x = np.array([60000, 60000]).astype(np.float16) + overflow_y = np.array([2, 2]).astype(np.float16) + with tempfile.TemporaryDirectory(dir=pwd) as tmp_dir: + dump_path = os.path.join(tmp_dir, 'overflow_dump') + dump_config_path = os.path.join(tmp_dir, 'overflow_dump.json') + generate_dump_json_with_overflow(dump_path, dump_config_path, 'test_async_dump', 3) + os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path + if os.path.isdir(dump_path): + shutil.rmtree(dump_path) + add = Net() + add(Tensor(overflow_x), Tensor(overflow_y)) + exe_graph_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0') + # check no overflow is happening, and 
path should not be generated + assert not os.path.exists(exe_graph_path) + del os.environ['MINDSPORE_DUMP_CONFIG'] + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend_training +@pytest.mark.platform_x86_ascend_training +@pytest.mark.env_onecard +@security_off_wrap +def test_ascend_overflow_dump(): + """ + Feature: Overflow Dump + Description: Test overflow dump when overflow occurs + Expectation: Overflow occurs, and the overflow dump file is in the correct format + """ + context.set_context(mode=context.GRAPH_MODE, device_target='Ascend') + run_overflow_dump() + + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend_training +@pytest.mark.platform_x86_ascend_training +@pytest.mark.env_onecard +@security_off_wrap +def test_ascend_not_overflow_dump(): + """ + Feature: Overflow Dump + Description: Test overflow dump when no overflow occurs + Expectation: Overflow does not occur, and no overflow dump file is generated + """ + context.set_context(mode=context.GRAPH_MODE, device_target='Ascend') + run_not_overflow_dump()