Add st for dump

This commit is contained in:
TinaMengtingZhang 2022-04-06 17:33:09 -04:00
parent e84f16ac7d
commit 67c98923d5
4 changed files with 242 additions and 31 deletions

View File

@ -1,4 +1,4 @@
# Copyright 2021 Huawei Technologies Co., Ltd # Copyright 2021-2022 Huawei Technologies Co., Ltd
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
@ -106,7 +106,7 @@ def generate_dump_json(dump_path, json_file_name, test_key):
if test_key == "test_async_dump": if test_key == "test_async_dump":
data = async_dump_dict data = async_dump_dict
data["common_dump_settings"]["path"] = dump_path data["common_dump_settings"]["path"] = dump_path
elif test_key == "test_e2e_dump": elif test_key in ("test_e2e_dump", "test_e2e_dump_trans_false"):
data = e2e_dump_dict data = e2e_dump_dict
data["common_dump_settings"]["path"] = dump_path data["common_dump_settings"]["path"] = dump_path
elif test_key == "test_async_dump_net_multi_layer_mode1": elif test_key == "test_async_dump_net_multi_layer_mode1":
@ -126,6 +126,10 @@ def generate_dump_json(dump_path, json_file_name, test_key):
data = async_dump_dict data = async_dump_dict
data["common_dump_settings"]["path"] = dump_path data["common_dump_settings"]["path"] = dump_path
data["common_dump_settings"]["file_format"] = "bin" data["common_dump_settings"]["file_format"] = "bin"
elif test_key == "test_e2e_dump_trans_true":
data = e2e_dump_dict
data["common_dump_settings"]["path"] = dump_path
data["e2e_dump_settings"]["trans_flag"] = True
else: else:
raise ValueError( raise ValueError(
"Failed to generate dump json file. The test name value " + test_key + " is invalid.") "Failed to generate dump json file. The test name value " + test_key + " is invalid.")

View File

@ -1,4 +1,4 @@
# Copyright 2020-2021 Huawei Technologies Co., Ltd # Copyright 2020-2022 Huawei Technologies Co., Ltd
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
@ -611,6 +611,23 @@ def test_ascend_full_dump():
run_saved_data_dump_test('test_async_dump', 'full') run_saved_data_dump_test('test_async_dump', 'full')
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_ascend_full_dump_kernel_by_kernel():
    """
    Feature: Ascend Full Dump in kernel-by-kernel (MindRT) mode
    Description: Test Ascend full dump
    Expectation: Tensors are stored in npy files and their statistics stored in statistic.csv
    """
    os.environ['GRAPH_OP_RUN'] = "1"
    try:
        context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
        run_saved_data_dump_test('test_async_dump', 'full')
    finally:
        # Always drop the switch, even when the dump check fails, so it cannot
        # leak into tests that run later in the same process.
        os.environ.pop('GRAPH_OP_RUN', None)
@constexpr @constexpr
def construct_tensor(cst): def construct_tensor(cst):
return Tensor(np.array(cst)) return Tensor(np.array(cst))

View File

@ -0,0 +1,159 @@
# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import os
import sys
import tempfile
import glob
import shutil
import pytest
import numpy as np
import mindspore as ms
import mindspore.context as context
import mindspore.nn as nn
import mindspore.ops as ops
from mindspore import Tensor
from dump_test_utils import generate_dump_json, check_dump_structure
from tests.security_utils import security_off_wrap
class ConvNet(nn.Cell):
    """Cell that wraps a single ``ops.Conv2D`` primitive (out_channel=3, kernel_size=1)."""

    def __init__(self):
        super().__init__()
        self.conv2 = ops.Conv2D(out_channel=3, kernel_size=1)

    def construct(self, x, weight):
        """Return the result of applying the Conv2D primitive to ``x`` with ``weight``."""
        return self.conv2(x, weight)
def _load_dumped_output(dump_data_path, file_pattern):
    """Load the first dumped .npy file under ``dump_data_path`` whose name matches ``file_pattern``."""
    matches = glob.glob(os.path.join(dump_data_path, file_pattern))
    # Fail with a readable message instead of an IndexError when nothing was dumped.
    assert matches, "No dump file matching '{}' found in {}".format(file_pattern, dump_data_path)
    return np.load(os.path.realpath(matches[0]))


def run_trans_flag(test_name):
    """
    Run ConvNet once with dump enabled and check the dumped Conv2D output file.

    Args:
        test_name (str): Key handed to ``generate_dump_json`` to select the dump
            configuration; it also names the dump directory and the JSON config
            file. ``"test_e2e_dump_trans_true"`` and ``"test_e2e_dump_trans_false"``
            have dedicated checks; any other value is checked against an
            ``NCHW``-named output file.

    Returns:
        None. Returns immediately, without running anything, when
        ``sys.platform`` is not ``'linux'``.
    """
    if sys.platform != 'linux':
        return
    with tempfile.TemporaryDirectory(dir='/tmp') as tmp_dir:
        dump_path = os.path.join(tmp_dir, test_name)
        dump_config_path = os.path.join(tmp_dir, '{}.json'.format(test_name))
        generate_dump_json(dump_path, dump_config_path, test_name)
        os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
        try:
            if os.path.isdir(dump_path):
                shutil.rmtree(dump_path)
            net = ConvNet()
            tensor = Tensor(np.ones([1, 3, 3, 3]), ms.float32)
            weight = Tensor(np.ones([3, 3, 1, 1]), ms.float32)
            expect = net(tensor, weight)
            check_dump_structure(dump_path, dump_config_path, 1, 1, 1)
            # NOTE(review): path components look like rank / net name / graph id /
            # iteration -- confirm against the dump directory layout.
            dump_data_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0')
            assert os.path.exists(dump_data_path)
            if test_name == "test_e2e_dump_trans_true":
                # tensor data in host format.
                output = _load_dumped_output(
                    dump_data_path, "Conv2D.Conv2D-op*.0.0.*.output.0.DefaultFormat.npy")
                assert output.shape == (1, 3, 3, 3)
                assert np.array_equal(output, expect)
            elif test_name == "test_e2e_dump_trans_false":
                # tensor data in device format; only the shape is checked here.
                output = _load_dumped_output(
                    dump_data_path, "Conv2D.Conv2D-op*.0.0.*.output.0.NC1HWC0.npy")
                assert output.shape == (1, 1, 3, 3, 16)
            else:
                # tensor data in host format.
                output = _load_dumped_output(
                    dump_data_path, "Conv2D.Conv2D-op*.*.*.*.output.0.NCHW.npy")
                assert output.shape == (1, 3, 3, 3)
                assert np.array_equal(output, expect)
        finally:
            # Remove the dump config even when a check above fails, so the
            # variable never points at a deleted temp file in later tests.
            os.environ.pop('MINDSPORE_DUMP_CONFIG', None)
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_ascend_e2e_trans_true():
    """
    Feature: Ascend e2e dump.
    Description: Run e2e dump on Ascend with trans_flag configured to true.
    Expectation: Dump files have tensor data in host format (4 dimensions).
    """
    dump_case = "test_e2e_dump_trans_true"
    context.set_context(device_target="Ascend", mode=context.GRAPH_MODE)
    run_trans_flag(dump_case)
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_ascend_e2e_trans_false():
    """
    Feature: Ascend e2e dump.
    Description: Run e2e dump on Ascend with trans_flag configured to false.
    Expectation: Dump files have tensor data in device format.
    """
    dump_case = "test_e2e_dump_trans_false"
    context.set_context(device_target="Ascend", mode=context.GRAPH_MODE)
    run_trans_flag(dump_case)
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_ascend_kernel_by_kernel_trans_true():
    """
    Feature: Ascend kernel by kernel dump.
    Description: Test kernel by kernel dump in Ascend with trans_flag is configured to true.
    Expectation: Dump files have tensor data in host format (4 dimensions).
    """
    os.environ['GRAPH_OP_RUN'] = "1"
    try:
        context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
        run_trans_flag("test_e2e_dump_trans_true")
    finally:
        # Always drop the switch, even when the dump check fails, so it cannot
        # leak into tests that run later in the same process.
        os.environ.pop('GRAPH_OP_RUN', None)
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_ascend_kernel_by_kernel_trans_false():
    """
    Feature: Ascend kernel by kernel dump.
    Description: Test kernel by kernel dump in Ascend with trans_flag is configured to false.
    Expectation: Dump files have tensor data in device format.
    """
    os.environ['GRAPH_OP_RUN'] = "1"
    try:
        context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
        run_trans_flag("test_e2e_dump_trans_false")
    finally:
        # Always drop the switch, even when the dump check fails, so it cannot
        # leak into tests that run later in the same process.
        os.environ.pop('GRAPH_OP_RUN', None)
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_ascend_a_plus_m_conversion():
    """
    Feature: Ascend A+M dump.
    Description: Run A+M dump on Ascend and check the format of the dump data.
    Expectation: Dump files have tensor data in host format (4 dimensions).
    """
    dump_case = "test_async_dump_npy"
    context.set_context(device_target="Ascend", mode=context.GRAPH_MODE)
    run_trans_flag(dump_case)

View File

@ -1,4 +1,4 @@
# Copyright 2021 Huawei Technologies Co., Ltd # Copyright 2021-2022 Huawei Technologies Co., Ltd
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
@ -43,8 +43,12 @@ class NewAddNet(Cell):
super(NewAddNet, self).__init__() super(NewAddNet, self).__init__()
self.add = P.AddN() self.add = P.AddN()
def construct(self, x, y): def construct(self, b1, b2, x, y):
z = self.add([x, y, y]) z = self.add([x, y, y])
if b1 < b2:
z = self.add([x, y, y])
else:
z = self.add([x, x, y])
return z return z
@ -52,13 +56,29 @@ def train_addnet(epoch):
net = AddNet() net = AddNet()
net2 = NewAddNet() net2 = NewAddNet()
output_list = [] output_list = []
b1 = Tensor(np.array(1).astype(np.float32))
b2 = Tensor(np.array(3).astype(np.float32))
input_x = Tensor(np.ones([2, 1, 2, 1]).astype(np.float32)) input_x = Tensor(np.ones([2, 1, 2, 1]).astype(np.float32))
input_y = Tensor(np.ones([2, 1, 2, 1]).astype(np.float32)) input_y = Tensor(np.ones([2, 1, 2, 1]).astype(np.float32))
for _ in range(epoch): for _ in range(epoch):
out_put = net(input_x, input_y) out_put = net(input_x, input_y)
out2 = net2(out_put, input_x) out2 = net2(b1, b2, out_put, input_x)
output_list.append(out2.asnumpy()) output_list.append(out2.asnumpy())
input_x = input_x + input_y input_x = input_x + input_y
b1 = b1+1
return output_list
def check_graph_structure(dump_file_path, execution_order_path, graph_id, expect_steps):
    """
    Check one graph's dumped step directories and its global execution order file.

    Args:
        dump_file_path (str): Directory containing one sub-directory per graph id.
        execution_order_path (str): Directory containing the
            ``ms_global_execution_order_graph_<graph_id>.csv`` files.
        graph_id (str): Graph id; used as the sub-directory name and inside the CSV file name.
        expect_steps (list[str]): Expected step names. They are compared with the
            sorted directory listing and with the first column of the CSV, row by row.

    Raises:
        AssertionError: If the step directories differ from ``expect_steps``, the
            CSV file does not exist, or its first column differs from ``expect_steps``.
    """
    dump_data_path = os.path.join(dump_file_path, graph_id)
    found_steps = sorted(os.listdir(dump_data_path))
    assert found_steps == expect_steps, \
        "graph {}: dumped steps {} != expected {}".format(graph_id, found_steps, expect_steps)
    graph_history_file_path = os.path.join(
        execution_order_path, 'ms_global_execution_order_graph_{}.csv'.format(graph_id))
    assert path.exists(graph_history_file_path), \
        "missing graph history file {}".format(graph_history_file_path)
    # The csv module expects files opened with newline='' so it can handle line endings itself.
    with open(graph_history_file_path, newline='') as csvfile:
        history_graph = csv.reader(csvfile)
        # A blank row becomes '' so it fails the comparison below instead of raising IndexError.
        iter_list_graph = [row[0] if row else '' for row in history_graph]
    assert iter_list_graph == expect_steps, \
        "graph {}: history steps {} != expected {}".format(graph_id, iter_list_graph, expect_steps)
def run_multi_root_graph_dump(device, dump_mode, test_name): def run_multi_root_graph_dump(device, dump_mode, test_name):
@ -79,31 +99,24 @@ def run_multi_root_graph_dump(device, dump_mode, test_name):
for _ in range(3): for _ in range(3):
if not os.path.exists(dump_file_path): if not os.path.exists(dump_file_path):
time.sleep(2) time.sleep(2)
# Multi root graph script : we have 2 graphs under rank_0 dir
# Each graph should have 3 iteration
# Each graph was executed once per epoch,
# Graph 0 was executed in even iterations, graph one was executed in odd iterations
assert len(os.listdir(dump_file_path)) == 2
dump_path_graph_0 = os.path.join(dump_file_path, '0')
dump_path_graph_1 = os.path.join(dump_file_path, '1')
assert sorted(os.listdir(dump_path_graph_0)) == ['0', '2', '4']
assert sorted(os.listdir(dump_path_graph_1)) == ['1', '3', '5']
execution_order_path = os.path.join(dump_path, 'rank_0', 'execution_order') execution_order_path = os.path.join(dump_path, 'rank_0', 'execution_order')
# Four files in execution_order dir. # Multi root graph script: check dump data dir and graph history files and see if iteration number is matched.
# Two files for each graph (ms_execution_order and ms_global_execution_order) if device == "GPU":
assert len(os.listdir(execution_order_path)) == 4 # In GPU, we have 4 kernel graphs folders under rank_0 dir.
global_exec_order_graph_0 = os.path.join(execution_order_path, 'ms_global_execution_order_graph_0.csv') # In graph history dir, there are 2 files for each graph (ms_execution_order and ms_global_execution_order).
assert path.exists(global_exec_order_graph_0) assert len(os.listdir(dump_file_path)) == 4
with open(global_exec_order_graph_0) as csvfile: assert len(os.listdir(execution_order_path)) == 8
history_graph_0 = csv.reader(csvfile) check_graph_structure(dump_file_path, execution_order_path, '0', ['0', '2', '4'])
iter_list_graph_0 = list(history_graph_0) check_graph_structure(dump_file_path, execution_order_path, '1', ['1', '3', '5'])
assert iter_list_graph_0 == [['0'], ['2'], ['4']] else:
global_exec_order_graph_1 = os.path.join(execution_order_path, 'ms_global_execution_order_graph_1.csv') # In Ascend, we have 2 root graphs folders under rank_0 dir.
assert path.exists(global_exec_order_graph_1) # In graph history dir, there are 4 ms_execution_order files and 2 ms_global_execution_order files.
with open(global_exec_order_graph_1) as csvfile: # Each graph should have 3 iterations. Each graph was executed once per epoch.
history_graph_1 = csv.reader(csvfile) # Graph 0 was executed in even iterations, graph 1 was executed in odd iterations.
iter_list_graph_1 = list(history_graph_1) assert len(os.listdir(dump_file_path)) == 2
assert iter_list_graph_1 == [['1'], ['3'], ['5']] assert len(os.listdir(execution_order_path)) == 6
check_graph_structure(dump_file_path, execution_order_path, '0', ['0', '2', '4'])
check_graph_structure(dump_file_path, execution_order_path, '1', ['1', '3', '5'])
@pytest.mark.level0 @pytest.mark.level0
@ -154,5 +167,23 @@ def test_Ascend_async_multi_root_graph_dump():
Expectation: Expectation:
Dump for two different graphs, graph 0 even iterations and graph 1 odd iterations. Dump for two different graphs, graph 0 even iterations and graph 1 odd iterations.
""" """
run_multi_root_graph_dump("Ascend", "async_dump", "test_Ascend_async_multi_root_graph_dump") run_multi_root_graph_dump("Ascend", "async_dump", "test_Ascend_async_multi_root_graph_dump")
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_ascend_multi_root_graph_dump_kernel_by_kernel():
    """
    Feature:
        Multi root graph dump for Ascend kernel by kernel.
    Description:
        Test multi root graph dump in Ascend kernel by kernel.
    Expectation:
        Dump for two different graphs, graph 0 even iterations and graph 1 odd iterations.
    """
    os.environ['GRAPH_OP_RUN'] = "1"
    try:
        run_multi_root_graph_dump("Ascend", "e2e_dump", "test_Ascend_e2e_multi_root_graph_dump")
    finally:
        # Always drop the switch, even when the dump check fails, so it cannot
        # leak into tests that run later in the same process.
        os.environ.pop('GRAPH_OP_RUN', None)