forked from mindspore-Ecosystem/mindspore
add overflow st test and checkdumpstructure test
This commit is contained in:
parent
b414db49b6
commit
332e0dbb0f
|
@ -0,0 +1,14 @@
|
|||
[
|
||||
{
|
||||
"watchpoint_hit1": {
|
||||
"name": "Default/Add-op0",
|
||||
"slot": 0,
|
||||
"condition": 2,
|
||||
"watchpoint_id": 1,
|
||||
"parameter": [],
|
||||
"error_code": 0,
|
||||
"rank_id": 0,
|
||||
"root_graph_id": 0
|
||||
}
|
||||
}
|
||||
]
|
|
@ -20,8 +20,8 @@ import os
|
|||
import json
|
||||
import tempfile
|
||||
import numpy as np
|
||||
import mindspore.offline_debug.dbg_services as d
|
||||
import pytest
|
||||
import mindspore.offline_debug.dbg_services as d
|
||||
from tests.security_utils import security_off_wrap
|
||||
from dump_test_utils import build_dump_structure
|
||||
|
||||
|
@ -70,7 +70,7 @@ def run_watchpoints(is_sync):
|
|||
temp_dir = build_dump_structure(tmp_dir, tensor_name, tensor_list, "Test", tensor_info)
|
||||
|
||||
debugger_backend = d.DbgServices(dump_file_path=temp_dir)
|
||||
debugger_backend.initialize(net_name="Test", is_sync_mode=False)
|
||||
debugger_backend.initialize(net_name="Test", is_sync_mode=is_sync)
|
||||
|
||||
# NOTES:
|
||||
# -> watch_condition=6 is MIN_LT
|
||||
|
@ -146,6 +146,89 @@ def test_async_watchpoints():
|
|||
run_watchpoints(False)
|
||||
|
||||
|
||||
def run_overflow_watchpoint(is_overflow):
    """
    Build a minimal fake async dump directory and check the overflow watchpoint on it.

    Two files are created under ``<tmp_dir>/rank_0/Add/0/0``:

    * a tensor dump file for ``Default/Add-op0`` whose name embeds the task id and stream id;
    * an ``Opdebug.Node_OpDebug`` file whose 256-byte payload starts at file offset 321 and
      stores the stream id at payload byte 16 and the task id at payload byte 24.

    Args:
        is_overflow (bool): If True, the op-debug payload carries the same task id as the
            tensor dump file and exactly one watchpoint hit is expected. If False, a
            different task id is written and no watchpoint hit is expected.
    """
    test_name = "overflow_watchpoint"
    tensor = np.array([65504, 65504], np.float16)
    task_id = 2
    stream_id = 7
    with tempfile.TemporaryDirectory(dir=os.getcwd()) as tmp_dir:
        path = os.path.join(tmp_dir, "rank_0", "Add", "0", "0")
        os.makedirs(path, exist_ok=True)

        add_prefix = "Add.Default_Add-op0." + str(task_id) + "." + str(stream_id) + ".1"
        add_fd, _ = tempfile.mkstemp(prefix=add_prefix, dir=path)
        # mkstemp returns an already-open descriptor; wrapping it with os.fdopen makes sure
        # it is closed when the block exits instead of leaking until the process ends.
        with os.fdopen(add_fd, 'wb') as add_f:
            add_f.write(b'1')
            add_f.seek(8)
            add_f.write(b'\n\x032.0\x10\x83\xf7\xef\x9f\x99\xc8\xf3\x02\x1a\x10\x08\x02\x10\x02\x1a\x03')
            add_f.write(b'\n\x01\x020\x04:\x03\n\x01\x022\x0f')
            add_f.write(b'Default/Add-op0')
            add_f.write(tensor)

        overflow_prefix = "Opdebug.Node_OpDebug." + str(task_id) + "." + str(stream_id) + ".0"
        overflow_fd, _ = tempfile.mkstemp(prefix=overflow_prefix, dir=path)
        # 256 zero bytes with only the stream id and task id slots filled in.
        payload = bytearray(256)
        payload[16] = stream_id
        # A wrong task id must not generate an overflow watchpoint hit.
        payload[24] = task_id if is_overflow else task_id + 1
        with os.fdopen(overflow_fd, 'wb') as overflow_f:
            overflow_f.seek(321, 0)
            overflow_f.write(bytes(payload))

        debugger_backend = d.DbgServices(dump_file_path=tmp_dir)
        debugger_backend.initialize(net_name="Add", is_sync_mode=False)
        # NOTE(review): watch_condition=2 is presumably the overflow condition - confirm
        # against the DbgServices watch condition table.
        debugger_backend.add_watchpoint(watchpoint_id=1, watch_condition=2,
                                        check_node_list={"Default/Add-op0":
                                                         {"rank_id": [0], "root_graph_id": [0], "is_output": True
                                                          }}, parameter_list=[])

        watchpoint_hits_test = debugger_backend.check_watchpoints(iteration=0)

        if not is_overflow:
            assert not watchpoint_hits_test
            return

        assert len(watchpoint_hits_test) == 1
        if GENERATE_GOLDEN:
            print_watchpoint_hits(watchpoint_hits_test, 0, True, test_name)
        else:
            compare_expect_actual_result(watchpoint_hits_test, 0, test_name)
|
||||
|
||||
|
||||
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_async_overflow_watchpoints_hit():
    """
    Feature: Offline Debugger CheckWatchpoint
    Description: Check the overflow watchpoint when the op-debug file matches the dumped node
    Expectation: The overflow watchpoint is hit
    """
    run_overflow_watchpoint(True)
|
||||
|
||||
|
||||
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_async_overflow_watchpoints_not_hit():
    """
    Feature: Offline Debugger CheckWatchpoint
    Description: Check the overflow watchpoint when the op-debug file has a different task id
    Expectation: The overflow watchpoint is not hit
    """
    run_overflow_watchpoint(False)
|
||||
|
||||
|
||||
def compare_expect_actual_result(watchpoint_hits_list, test_index, test_name):
|
||||
"""Compare actual result with golden file."""
|
||||
pwd = os.getcwd()
|
||||
|
|
|
@ -17,6 +17,7 @@ Utils for testing dump feature.
|
|||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
|
||||
async_dump_dict = {
|
||||
"common_dump_settings": {
|
||||
|
@ -87,3 +88,63 @@ def generate_dump_json(dump_path, json_file_name, test_key):
|
|||
"Failed to generate dump json file. The test name value " + test_key + " is invalid.")
|
||||
with open(json_file_name, 'w') as f:
|
||||
json.dump(data, f)
|
||||
|
||||
|
||||
def generate_dump_json_with_overflow(dump_path, json_file_name, test_key, op):
    """
    Util function to generate an async dump configuration json file with op_debug_mode set.

    Args:
        dump_path (str): Value written to ``common_dump_settings.path``.
        json_file_name (str): Path of the json configuration file to write.
        test_key (str): Must be ``"test_async_dump"``; no other key is accepted here.
        op (int): Value written to ``common_dump_settings.op_debug_mode``.

    Raises:
        ValueError: If ``test_key`` is not ``"test_async_dump"``.
    """
    import copy  # local import keeps this helper self-contained

    if test_key != "test_async_dump":
        raise ValueError(
            "Failed to generate dump json file. Overflow only support in async dump")
    # Work on a private copy: writing into the shared async_dump_dict template would leak
    # op_debug_mode (and the path) into every config generated later in the same process.
    data = copy.deepcopy(async_dump_dict)
    data["common_dump_settings"]["path"] = dump_path
    data["common_dump_settings"]["op_debug_mode"] = op
    with open(json_file_name, 'w') as f:
        json.dump(data, f)
|
||||
|
||||
|
||||
def check_dump_structure(dump_path, json_file_path, num_card, num_graph, num_iteration):
    """
    Util to check if the dump structure is correct.

    The net name is read from ``common_dump_settings.net_name`` in the json config. For each
    rank the function asserts that ``rank_<id>/<net_name>``, ``rank_<id>/graphs`` and
    ``rank_<id>/execution_order`` exist; for each graph it asserts the graph directory, the
    ``.pb``/``.ir`` trace files and the execution order ``.csv`` exist; for each iteration
    it asserts the iteration directory exists.

    Args:
        dump_path (str): Root directory of the dump.
        json_file_path (str): Path of the dump configuration json file.
        num_card (int): Number of ``rank_<id>`` directories to check, starting at 0.
        num_graph (int): Number of graph ids to check per rank, starting at 0.
        num_iteration (int): Number of iteration ids to check per graph, starting at 0.

    Raises:
        AssertionError: If any expected file or directory is missing.
    """
    with open(json_file_path) as config_file:
        config = json.load(config_file)
    net_name = config["common_dump_settings"]["net_name"]
    assert os.path.isdir(dump_path)

    for card in range(num_card):
        rank_dir = os.path.join(dump_path, "rank_" + str(card))
        assert os.path.exists(rank_dir)

        net_dir = os.path.join(rank_dir, net_name)
        assert os.path.exists(net_dir)
        graphs_dir = os.path.join(rank_dir, "graphs")
        assert os.path.exists(graphs_dir)
        order_dir = os.path.join(rank_dir, "execution_order")
        assert os.path.exists(order_dir)

        for graph in range(num_graph):
            graph_dir = os.path.join(net_dir, str(graph))
            assert os.path.exists(graph_dir)

            trace_stem = "ms_output_trace_code_graph_" + str(graph)
            pb_file = os.path.join(graphs_dir, trace_stem + ".pb")
            ir_file = os.path.join(graphs_dir, trace_stem + ".ir")
            assert os.path.exists(pb_file)
            assert os.path.exists(ir_file)

            order_name = "ms_execution_order_graph_" + str(graph) + ".csv"
            assert os.path.exists(os.path.join(order_dir, order_name))

            for step in range(num_iteration):
                assert os.path.isdir(os.path.join(graph_dir, str(step)))
|
||||
|
||||
|
||||
def find_nth_pos(string, substring, n):
    """
    Return the index of the n-th non-overlapping occurrence of substring in string.

    Occurrences are counted from 1. The result is -1 when there are fewer than n
    occurrences. For n smaller than 1 the index of the first occurrence is returned.
    """
    step = len(substring)
    pos = string.find(substring)
    for _ in range(n - 1):
        if pos < 0:
            break
        pos = string.find(substring, pos + step)
    return pos
|
||||
|
|
|
@ -32,7 +32,8 @@ from mindspore.nn import SoftmaxCrossEntropyWithLogits
|
|||
from mindspore.nn import Momentum
|
||||
from mindspore.nn import TrainOneStepCell
|
||||
from mindspore.nn import WithLossCell
|
||||
from dump_test_utils import generate_dump_json
|
||||
from dump_test_utils import generate_dump_json, generate_dump_json_with_overflow, \
|
||||
check_dump_structure, find_nth_pos
|
||||
from tests.security_utils import security_off_wrap
|
||||
|
||||
|
||||
|
@ -67,8 +68,12 @@ def test_async_dump():
|
|||
shutil.rmtree(dump_path)
|
||||
add = Net()
|
||||
add(Tensor(x), Tensor(y))
|
||||
time.sleep(5)
|
||||
for _ in range(3):
|
||||
if not os.path.exists(dump_file_path):
|
||||
time.sleep(2)
|
||||
check_dump_structure(dump_path, dump_config_path, 1, 1, 1)
|
||||
assert len(os.listdir(dump_file_path)) == 1
|
||||
del os.environ['MINDSPORE_DUMP_CONFIG']
|
||||
|
||||
|
||||
def run_e2e_dump():
|
||||
|
@ -100,6 +105,11 @@ def run_e2e_dump():
|
|||
expect = np.array([[8, 10, 12], [14, 16, 18]], np.float32)
|
||||
assert output.dtype == expect.dtype
|
||||
assert np.array_equal(output, expect)
|
||||
for _ in range(3):
|
||||
if not os.path.exists(dump_file_path):
|
||||
time.sleep(2)
|
||||
check_dump_structure(dump_path, dump_config_path, 1, 1, 1)
|
||||
del os.environ['MINDSPORE_DUMP_CONFIG']
|
||||
|
||||
|
||||
@pytest.mark.level0
|
||||
|
@ -122,6 +132,8 @@ def test_e2e_dump_with_hccl_env():
|
|||
os.environ["RANK_TABLE_FILE"] = "invalid_file.json"
|
||||
os.environ["RANK_ID"] = "4"
|
||||
run_e2e_dump()
|
||||
del os.environ['RANK_TABLE_FILE']
|
||||
del os.environ['RANK_ID']
|
||||
|
||||
|
||||
@pytest.mark.level0
|
||||
|
@ -142,6 +154,8 @@ def test_cpu_e2e_dump_with_hccl_set():
|
|||
os.environ["RANK_TABLE_FILE"] = "invalid_file.json"
|
||||
os.environ["RANK_ID"] = "4"
|
||||
run_e2e_dump()
|
||||
del os.environ['RANK_TABLE_FILE']
|
||||
del os.environ['RANK_ID']
|
||||
|
||||
|
||||
@pytest.mark.level0
|
||||
|
@ -162,6 +176,8 @@ def test_gpu_e2e_dump_with_hccl_set():
|
|||
os.environ["RANK_TABLE_FILE"] = "invalid_file.json"
|
||||
os.environ["RANK_ID"] = "4"
|
||||
run_e2e_dump()
|
||||
del os.environ['RANK_TABLE_FILE']
|
||||
del os.environ['RANK_ID']
|
||||
|
||||
|
||||
class ReluReduceMeanDenseRelu(Cell):
|
||||
|
@ -221,6 +237,7 @@ def test_async_dump_net_multi_layer_mode1():
|
|||
assert value.asnumpy() == dump_result[index]
|
||||
else:
|
||||
print('Failed to find hisi convert tools: msaccucmp.py or msaccucmp.pyc.')
|
||||
del os.environ['MINDSPORE_DUMP_CONFIG']
|
||||
|
||||
|
||||
@pytest.mark.level0
|
||||
|
@ -247,6 +264,8 @@ def test_dump_with_diagnostic_path():
|
|||
add = Net()
|
||||
add(Tensor(x), Tensor(y))
|
||||
assert len(os.listdir(dump_file_path)) == 5
|
||||
del os.environ['MINDSPORE_DUMP_CONFIG']
|
||||
del os.environ['MS_DIAGNOSTIC_DATA_PATH']
|
||||
|
||||
|
||||
def run_e2e_dump_execution_graph():
|
||||
|
@ -265,6 +284,7 @@ def run_e2e_dump_execution_graph():
|
|||
add(Tensor(x), Tensor(y))
|
||||
exe_graph_path = os.path.join(dump_path, 'rank_0', 'execution_order')
|
||||
assert len(os.listdir(exe_graph_path)) == 1
|
||||
del os.environ['MINDSPORE_DUMP_CONFIG']
|
||||
|
||||
|
||||
@pytest.mark.level0
|
||||
|
@ -275,3 +295,107 @@ def test_dump_with_execution_graph():
|
|||
"""Test dump with execution graph on GPU."""
|
||||
context.set_context(mode=context.GRAPH_MODE, device_target='GPU')
|
||||
run_e2e_dump_execution_graph()
|
||||
|
||||
|
||||
def _task_and_stream_id(file_name):
    """Return the text between the 2nd/3rd and the 3rd/4th dots of a dump file name."""
    second_dot = find_nth_pos(file_name, ".", 2)
    third_dot = find_nth_pos(file_name, ".", 3)
    fourth_dot = find_nth_pos(file_name, ".", 4)
    return file_name[second_dot + 1:third_dot], file_name[third_dot + 1:fourth_dot]


def run_overflow_dump():
    """
    Run async dump and generate overflow.

    Adds two float16 tensors holding 60000 with op_debug_mode 3, then checks that the dump
    directory has the expected structure, that exactly two files are produced (the Add
    output and the Opdebug file), that both file names carry the same task id and stream
    id, and that the Opdebug payload stores the same ids. Does nothing on non-linux
    platforms.
    """
    if sys.platform != 'linux':
        return
    pwd = os.getcwd()
    overflow_x = np.array([60000, 60000]).astype(np.float16)
    with tempfile.TemporaryDirectory(dir=pwd) as tmp_dir:
        dump_path = os.path.join(tmp_dir, 'overflow_dump')
        dump_config_path = os.path.join(tmp_dir, 'overflow_dump.json')
        generate_dump_json_with_overflow(dump_path, dump_config_path, 'test_async_dump', 3)
        os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
        # The finally clause removes the variable even when an assertion below fails, so a
        # failing run cannot leak the dump configuration into later tests.
        try:
            if os.path.isdir(dump_path):
                shutil.rmtree(dump_path)
            add = Net()
            add(Tensor(overflow_x), Tensor(overflow_x))
            exe_graph_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0')
            # Poll for the dump directory: at most 5 waits of 2 seconds each.
            for _ in range(5):
                if os.path.exists(exe_graph_path):
                    break
                time.sleep(2)
            check_dump_structure(dump_path, dump_config_path, 1, 1, 1)
            # check if overflow dump generate exact two files, and the naming format
            assert len(os.listdir(exe_graph_path)) == 2
            output_files = glob.glob(os.path.join(exe_graph_path, "Add.Default_Add-op0.*.*.*"))
            overflow_files = glob.glob(os.path.join(exe_graph_path, "Opdebug.Node_OpDebug.*.*.*"))
            # Assert on the match lists so a missing file fails the assertion instead of
            # raising IndexError on the [0] lookup.
            assert output_files
            assert overflow_files
            output_path = output_files[0]
            overflow_path = overflow_files[0]
            # check if generated files have matching task and stream id
            output_task_id, output_stream_id = _task_and_stream_id(os.path.split(output_path)[1])
            overflow_task_id, overflow_stream_id = _task_and_stream_id(os.path.split(overflow_path)[1])
            assert output_task_id == overflow_task_id
            assert output_stream_id == overflow_stream_id
            # check if overflow dump file contains same task and stream id as file name
            with open(overflow_path, 'rb') as f:
                # The ids are read from the data that starts at file offset 321.
                f.seek(321, 0)
                raw_data = f.read()
            task_id_infile = int.from_bytes(raw_data[24:25], 'little')
            stream_id_infile = int.from_bytes(raw_data[16:17], 'little')
            assert output_task_id == str(task_id_infile)
            assert output_stream_id == str(stream_id_infile)
        finally:
            os.environ.pop('MINDSPORE_DUMP_CONFIG', None)
|
||||
|
||||
|
||||
def run_not_overflow_dump():
    """
    Run async dump and not generate overflow.

    Adds a float16 tensor holding 60000 to one holding 2 with op_debug_mode 3 and checks
    that the iteration dump directory is not created. Does nothing on non-linux platforms.
    """
    if sys.platform != 'linux':
        return
    pwd = os.getcwd()
    overflow_x = np.array([60000, 60000]).astype(np.float16)
    overflow_y = np.array([2, 2]).astype(np.float16)
    with tempfile.TemporaryDirectory(dir=pwd) as tmp_dir:
        dump_path = os.path.join(tmp_dir, 'overflow_dump')
        dump_config_path = os.path.join(tmp_dir, 'overflow_dump.json')
        generate_dump_json_with_overflow(dump_path, dump_config_path, 'test_async_dump', 3)
        os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
        # The finally clause removes the variable even when the assertion below fails, so a
        # failing run cannot leak the dump configuration into later tests.
        try:
            if os.path.isdir(dump_path):
                shutil.rmtree(dump_path)
            add = Net()
            add(Tensor(overflow_x), Tensor(overflow_y))
            exe_graph_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0')
            # check no overflow is happening, and path should not be generated
            # NOTE(review): the check runs immediately, without the polling used in
            # run_overflow_dump - confirm a late dump cannot make this pass by accident.
            assert not os.path.exists(exe_graph_path)
        finally:
            os.environ.pop('MINDSPORE_DUMP_CONFIG', None)
|
||||
|
||||
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_ascend_overflow_dump():
    """
    Feature: Overflow Dump
    Description: Run the overflow dump scenario in graph mode on Ascend
    Expectation: Overflow occurs, and the overflow dump file is in the correct format
    """
    context.set_context(mode=context.GRAPH_MODE, device_target='Ascend')
    run_overflow_dump()
|
||||
|
||||
|
||||
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_ascend_not_overflow_dump():
    """
    Feature: Overflow Dump
    Description: Run the no-overflow dump scenario in graph mode on Ascend
    Expectation: Overflow does not occur, and no overflow dump file is generated
    """
    context.set_context(mode=context.GRAPH_MODE, device_target='Ascend')
    run_not_overflow_dump()
|
||||
|
|
Loading…
Reference in New Issue