Add overflow ST test and check_dump_structure test

This commit is contained in:
sabrinasun 2021-10-22 06:28:58 +08:00
parent b414db49b6
commit 332e0dbb0f
4 changed files with 286 additions and 4 deletions

View File

@ -0,0 +1,14 @@
[
{
"watchpoint_hit1": {
"name": "Default/Add-op0",
"slot": 0,
"condition": 2,
"watchpoint_id": 1,
"parameter": [],
"error_code": 0,
"rank_id": 0,
"root_graph_id": 0
}
}
]

View File

@ -20,8 +20,8 @@ import os
import json
import tempfile
import numpy as np
import mindspore.offline_debug.dbg_services as d
import pytest
import mindspore.offline_debug.dbg_services as d
from tests.security_utils import security_off_wrap
from dump_test_utils import build_dump_structure
@ -70,7 +70,7 @@ def run_watchpoints(is_sync):
temp_dir = build_dump_structure(tmp_dir, tensor_name, tensor_list, "Test", tensor_info)
debugger_backend = d.DbgServices(dump_file_path=temp_dir)
debugger_backend.initialize(net_name="Test", is_sync_mode=False)
debugger_backend.initialize(net_name="Test", is_sync_mode=is_sync)
# NOTES:
# -> watch_condition=6 is MIN_LT
@ -146,6 +146,89 @@ def test_async_watchpoints():
run_watchpoints(False)
def run_overflow_watchpoint(is_overflow):
    """Hand-build a fake async-dump directory and check the overflow watchpoint.

    Writes a raw Add-op dump file plus an Opdebug overflow record into the
    async-dump layout, then runs the offline debugger over it with
    watch_condition=2 (the overflow condition used throughout this test).
    When is_overflow is True the overflow record's task id matches the Add
    dump file, so exactly one watchpoint hit is expected; otherwise the task
    id is deliberately wrong and no hit is expected.
    """
    test_name = "overflow_watchpoint"
    # 65504 is the largest finite float16 value.
    tensor = np.array([65504, 65504], np.float16)
    task_id = 2
    stream_id = 7
    pwd = os.getcwd()
    with tempfile.TemporaryDirectory(dir=pwd) as tmp_dir:
        # Async dump layout: <root>/rank_0/<net_name>/<graph_id>/<iteration>
        path = os.path.join(tmp_dir, "rank_0", "Add", "0", "0")
        os.makedirs(path, exist_ok=True)
        # Dump file name format: <op_type>.<op_name>.<task_id>.<stream_id>.<timestamp...>
        add_file = tempfile.mkstemp(prefix="Add.Default_Add-op0."+str(task_id)+"."+str(stream_id)+
                                    ".1", dir=path)
        with open(add_file[1], 'wb') as add_f:
            # Minimal hand-crafted dump header followed by the tensor payload.
            add_f.write(b'1')
            add_f.seek(8)
            add_f.write(b'\n\x032.0\x10\x83\xf7\xef\x9f\x99\xc8\xf3\x02\x1a\x10\x08\x02\x10\x02\x1a\x03')
            add_f.write(b'\n\x01\x020\x04:\x03\n\x01\x022\x0f')
            add_f.write(b'Default/Add-op0')
            add_f.write(tensor)
        # Companion overflow record as produced by the Ascend op-debug feature.
        overflow_file = tempfile.mkstemp(prefix="Opdebug.Node_OpDebug."+str(task_id)+"." +str(stream_id)+
                                         ".0", dir=path)
        with open(overflow_file[1], 'wb') as f:
            # The debugger reads the overflow payload starting at byte offset
            # 321; within it byte 16 holds the stream id and byte 24 the task
            # id (the same offsets run_overflow_dump reads back).
            f.seek(321, 0)
            byte_list = []
            for i in range(256):
                if i == 16:
                    byte_list.append(stream_id)
                elif i == 24:
                    if is_overflow:
                        byte_list.append(task_id)
                    else:
                        # wrong task_id, should not generate overflow watchpoint hit
                        byte_list.append(task_id+1)
                else:
                    byte_list.append(0)
            newFileByteArray = bytearray(byte_list)
            f.write(bytes(newFileByteArray))
        debugger_backend = d.DbgServices(dump_file_path=tmp_dir)
        debugger_backend.initialize(net_name="Add", is_sync_mode=False)
        # watch_condition=2 matches the "condition": 2 recorded in the golden
        # watchpoint-hit file for this test.
        debugger_backend.add_watchpoint(watchpoint_id=1, watch_condition=2,
                                        check_node_list={"Default/Add-op0":
                                                         {"rank_id": [0], "root_graph_id": [0], "is_output": True
                                                          }}, parameter_list=[])
        watchpoint_hits_test = debugger_backend.check_watchpoints(iteration=0)
        if is_overflow:
            assert len(watchpoint_hits_test) == 1
            # Either regenerate the golden file or compare against it.
            if GENERATE_GOLDEN:
                print_watchpoint_hits(watchpoint_hits_test, 0, True, test_name)
            else:
                compare_expect_actual_result(watchpoint_hits_test, 0, test_name)
        else:
            assert not watchpoint_hits_test
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_async_overflow_watchpoints_hit():
    """
    Feature: Offline Debugger CheckWatchpoint
    Description: Test checking an overflow watchpoint when the overflow record's task id matches the dump file
    Expectation: Overflow watchpoint is hit
    """
    run_overflow_watchpoint(True)
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_async_overflow_watchpoints_not_hit():
    """
    Feature: Offline Debugger CheckWatchpoint
    Description: Test checking an overflow watchpoint when the overflow record's task id does not match
    Expectation: Overflow watchpoint is not hit
    """
    run_overflow_watchpoint(False)
def compare_expect_actual_result(watchpoint_hits_list, test_index, test_name):
"""Compare actual result with golden file."""
pwd = os.getcwd()

View File

@ -17,6 +17,7 @@ Utils for testing dump feature.
"""
import copy
import json
import os
async_dump_dict = {
"common_dump_settings": {
@ -87,3 +88,63 @@ def generate_dump_json(dump_path, json_file_name, test_key):
"Failed to generate dump json file. The test name value " + test_key + " is invalid.")
with open(json_file_name, 'w') as f:
json.dump(data, f)
def generate_dump_json_with_overflow(dump_path, json_file_name, test_key, op):
    """
    Util function to generate a dump configuration json file with overflow detection.

    Args:
        dump_path: directory the dump settings should point at.
        json_file_name: path of the json file to write.
        test_key: must be "test_async_dump"; overflow dump only exists in async dump.
        op: value written to common_dump_settings.op_debug_mode.

    Raises:
        ValueError: if test_key is not "test_async_dump".
    """
    if test_key == "test_async_dump":
        # Deep-copy the shared module-level template so this call does not
        # leak "path"/"op_debug_mode" into every later use of async_dump_dict.
        data = copy.deepcopy(async_dump_dict)
        data["common_dump_settings"]["path"] = dump_path
        data["common_dump_settings"]["op_debug_mode"] = op
    else:
        raise ValueError(
            "Failed to generate dump json file. Overflow only support in async dump")
    with open(json_file_name, 'w') as f:
        json.dump(data, f)
def check_dump_structure(dump_path, json_file_path, num_card, num_graph, num_iteration):
    """Verify a dump directory tree matches the layout implied by its config file.

    Reads net_name from the json config, then asserts that every expected
    per-rank, per-graph and per-iteration directory exists, along with the
    graph .pb/.ir files and the execution-order .csv file for each graph.
    """
    with open(json_file_path) as cfg:
        net_name = json.load(cfg)["common_dump_settings"]["net_name"]
    assert os.path.isdir(dump_path)
    for card in range(num_card):
        rank_dir = os.path.join(dump_path, "rank_" + str(card))
        net_dir = os.path.join(rank_dir, net_name)
        graphs_dir = os.path.join(rank_dir, "graphs")
        order_dir = os.path.join(rank_dir, "execution_order")
        for directory in (rank_dir, net_dir, graphs_dir, order_dir):
            assert os.path.exists(directory)
        for graph in range(num_graph):
            graph_dir = os.path.join(net_dir, str(graph))
            assert os.path.exists(graph_dir)
            trace_prefix = "ms_output_trace_code_graph_" + str(graph)
            assert os.path.exists(os.path.join(graphs_dir, trace_prefix + ".pb"))
            assert os.path.exists(os.path.join(graphs_dir, trace_prefix + ".ir"))
            order_csv = os.path.join(order_dir,
                                     "ms_execution_order_graph_" + str(graph) + ".csv")
            assert os.path.exists(order_csv)
            for iteration in range(num_iteration):
                assert os.path.isdir(os.path.join(graph_dir, str(iteration)))
def find_nth_pos(string, substring, n):
    """Return the index of the n-th non-overlapping occurrence of substring in string.

    Returns -1 when there are fewer than n occurrences. For n <= 1 this is
    the index of the first occurrence (same as str.find).
    """
    pos = string.find(substring)
    for _ in range(n - 1):
        if pos < 0:
            break
        pos = string.find(substring, pos + len(substring))
    return pos

View File

@ -32,7 +32,8 @@ from mindspore.nn import SoftmaxCrossEntropyWithLogits
from mindspore.nn import Momentum
from mindspore.nn import TrainOneStepCell
from mindspore.nn import WithLossCell
from dump_test_utils import generate_dump_json
from dump_test_utils import generate_dump_json, generate_dump_json_with_overflow, \
check_dump_structure, find_nth_pos
from tests.security_utils import security_off_wrap
@ -67,8 +68,12 @@ def test_async_dump():
shutil.rmtree(dump_path)
add = Net()
add(Tensor(x), Tensor(y))
time.sleep(5)
for _ in range(3):
if not os.path.exists(dump_file_path):
time.sleep(2)
check_dump_structure(dump_path, dump_config_path, 1, 1, 1)
assert len(os.listdir(dump_file_path)) == 1
del os.environ['MINDSPORE_DUMP_CONFIG']
def run_e2e_dump():
@ -100,6 +105,11 @@ def run_e2e_dump():
expect = np.array([[8, 10, 12], [14, 16, 18]], np.float32)
assert output.dtype == expect.dtype
assert np.array_equal(output, expect)
for _ in range(3):
if not os.path.exists(dump_file_path):
time.sleep(2)
check_dump_structure(dump_path, dump_config_path, 1, 1, 1)
del os.environ['MINDSPORE_DUMP_CONFIG']
@pytest.mark.level0
@ -122,6 +132,8 @@ def test_e2e_dump_with_hccl_env():
os.environ["RANK_TABLE_FILE"] = "invalid_file.json"
os.environ["RANK_ID"] = "4"
run_e2e_dump()
del os.environ['RANK_TABLE_FILE']
del os.environ['RANK_ID']
@pytest.mark.level0
@ -142,6 +154,8 @@ def test_cpu_e2e_dump_with_hccl_set():
os.environ["RANK_TABLE_FILE"] = "invalid_file.json"
os.environ["RANK_ID"] = "4"
run_e2e_dump()
del os.environ['RANK_TABLE_FILE']
del os.environ['RANK_ID']
@pytest.mark.level0
@ -162,6 +176,8 @@ def test_gpu_e2e_dump_with_hccl_set():
os.environ["RANK_TABLE_FILE"] = "invalid_file.json"
os.environ["RANK_ID"] = "4"
run_e2e_dump()
del os.environ['RANK_TABLE_FILE']
del os.environ['RANK_ID']
class ReluReduceMeanDenseRelu(Cell):
@ -221,6 +237,7 @@ def test_async_dump_net_multi_layer_mode1():
assert value.asnumpy() == dump_result[index]
else:
print('Failed to find hisi convert tools: msaccucmp.py or msaccucmp.pyc.')
del os.environ['MINDSPORE_DUMP_CONFIG']
@pytest.mark.level0
@ -247,6 +264,8 @@ def test_dump_with_diagnostic_path():
add = Net()
add(Tensor(x), Tensor(y))
assert len(os.listdir(dump_file_path)) == 5
del os.environ['MINDSPORE_DUMP_CONFIG']
del os.environ['MS_DIAGNOSTIC_DATA_PATH']
def run_e2e_dump_execution_graph():
@ -265,6 +284,7 @@ def run_e2e_dump_execution_graph():
add(Tensor(x), Tensor(y))
exe_graph_path = os.path.join(dump_path, 'rank_0', 'execution_order')
assert len(os.listdir(exe_graph_path)) == 1
del os.environ['MINDSPORE_DUMP_CONFIG']
@pytest.mark.level0
@ -275,3 +295,107 @@ def test_dump_with_execution_graph():
"""Test dump with execution graph on GPU."""
context.set_context(mode=context.GRAPH_MODE, device_target='GPU')
run_e2e_dump_execution_graph()
def run_overflow_dump():
    """Run an async overflow dump on an overflowing float16 add and validate it.

    Checks that exactly two files are produced (the op dump and the Opdebug
    overflow record), that their file names carry matching task/stream ids,
    and that those ids match the values embedded in the overflow record.
    """
    if sys.platform != 'linux':
        # This flow is only exercised on linux hosts.
        return
    pwd = os.getcwd()
    # 60000 + 60000 exceeds the float16 maximum (65504), forcing an overflow.
    overflow_x = np.array([60000, 60000]).astype(np.float16)
    with tempfile.TemporaryDirectory(dir=pwd) as tmp_dir:
        dump_path = os.path.join(tmp_dir, 'overflow_dump')
        dump_config_path = os.path.join(tmp_dir, 'overflow_dump.json')
        # op=3 is written to op_debug_mode in the dump config.
        generate_dump_json_with_overflow(dump_path, dump_config_path, 'test_async_dump', 3)
        os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
        if os.path.isdir(dump_path):
            shutil.rmtree(dump_path)
        add = Net()
        add(Tensor(overflow_x), Tensor(overflow_x))
        exe_graph_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0')
        # Dump files are written asynchronously; poll for up to ~10 seconds.
        for _ in range(5):
            if not os.path.exists(exe_graph_path):
                time.sleep(2)
        check_dump_structure(dump_path, dump_config_path, 1, 1, 1)
        # check if overflow dump generate exact two files, and the naming format
        assert len(os.listdir(exe_graph_path)) == 2
        output_path = glob.glob(os.path.join(exe_graph_path, "Add.Default_Add-op0.*.*.*"))[0]
        overflow_path = glob.glob(os.path.join(exe_graph_path, "Opdebug.Node_OpDebug.*.*.*"))[0]
        assert output_path
        assert overflow_path
        # check if generated files have matching task and stream id
        # (task id sits between the 2nd and 3rd dot of the file name,
        # stream id between the 3rd and 4th)
        output_file_name = os.path.split(output_path)
        overflow_file_name = os.path.split(overflow_path)
        output_second_dot_pos = find_nth_pos(output_file_name[1], ".", 2)
        output_third_dot_pos = find_nth_pos(output_file_name[1], ".", 3)
        output_fourth_dot_pos = find_nth_pos(output_file_name[1], ".", 4)
        output_task_id = output_file_name[1][output_second_dot_pos+1:output_third_dot_pos]
        output_stream_id = output_file_name[1][output_third_dot_pos+1:output_fourth_dot_pos]
        overflow_second_dot_pos = find_nth_pos(overflow_file_name[1], ".", 2)
        overflow_third_dot_pos = find_nth_pos(overflow_file_name[1], ".", 3)
        overflow_fourth_dot_pos = find_nth_pos(overflow_file_name[1], ".", 4)
        overflow_task_id = overflow_file_name[1][overflow_second_dot_pos+1:overflow_third_dot_pos]
        overflow_stream_id = overflow_file_name[1][overflow_third_dot_pos+1:overflow_fourth_dot_pos]
        assert output_task_id == overflow_task_id
        assert output_stream_id == overflow_stream_id
        # check if overflow dump file contains same task and stream id as file name
        with open(overflow_path, 'rb') as f:
            # Overflow payload starts at byte offset 321; within it byte 16 is
            # the stream id and byte 24 the task id.
            f.seek(321, 0)
            raw_data = f.read()
            task_id_infile = int.from_bytes(raw_data[24:25], 'little')
            stream_id_infile = int.from_bytes(raw_data[16:17], 'little')
            assert output_task_id == str(task_id_infile)
            assert output_stream_id == str(stream_id_infile)
        del os.environ['MINDSPORE_DUMP_CONFIG']
def run_not_overflow_dump():
    """Run an async overflow dump where the add stays in float16 range.

    Since no overflow occurs, no dump directory should be produced at all.
    """
    if sys.platform != 'linux':
        return
    cwd = os.getcwd()
    # 60000 + 2 stays below the float16 maximum (65504), so no overflow.
    lhs = np.array([60000, 60000]).astype(np.float16)
    rhs = np.array([2, 2]).astype(np.float16)
    with tempfile.TemporaryDirectory(dir=cwd) as work_dir:
        dump_dir = os.path.join(work_dir, 'overflow_dump')
        config_file = os.path.join(work_dir, 'overflow_dump.json')
        generate_dump_json_with_overflow(dump_dir, config_file, 'test_async_dump', 3)
        os.environ['MINDSPORE_DUMP_CONFIG'] = config_file
        if os.path.isdir(dump_dir):
            shutil.rmtree(dump_dir)
        net = Net()
        net(Tensor(lhs), Tensor(rhs))
        iteration_dir = os.path.join(dump_dir, 'rank_0', 'Net', '0', '0')
        # check no overflow is happening, and path should not be generated
        assert not os.path.exists(iteration_dir)
        del os.environ['MINDSPORE_DUMP_CONFIG']
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_ascend_overflow_dump():
    """
    Feature: Overflow Dump
    Description: Test overflow dump when an overflow occurs
    Expectation: Overflow occurs, and the overflow dump files are in the correct format
    """
    context.set_context(mode=context.GRAPH_MODE, device_target='Ascend')
    run_overflow_dump()
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_ascend_not_overflow_dump():
    """
    Feature: Overflow Dump
    Description: Test overflow dump when no overflow occurs
    Expectation: Overflow does not occur, and no overflow dump file is generated
    """
    context.set_context(mode=context.GRAPH_MODE, device_target='Ascend')
    run_not_overflow_dump()