!22468 Add st for error manager

Merge pull request !22468 from tanghuikang/tbe_em
This commit is contained in:
i-robot 2021-08-28 01:12:41 +00:00 committed by Gitee
commit c4fee1dff0
7 changed files with 188 additions and 1 deletions

View File

@ -305,7 +305,7 @@ std::string AscendKernelCompileManager::FormatSelectResultProcess(const nlohmann
}
void AscendKernelCompileManager::QueryResultProcess(const nlohmann::json &json, TargetJobStatus *task_info,
int adjust_log_level = 3) {
int adjust_log_level = EXCEPTION) {
auto job_type = GetJsonValue<std::string>(json, kJobType);
auto json_name = GetJsonValue<std::string>(json, kFusionOpName);
MS_LOG(DEBUG) << "Job: " << job_type << " post processing";

View File

@ -0,0 +1 @@
Fake rank table file

View File

@ -0,0 +1,26 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import os
from mindspore import context
from mindspore.communication.management import init
def test_hccl_init_fail():
    """Call HCCL ``init()`` with a fake rank table so that it fails.

    The rank table path is taken from the ``FAKE_RANK_TABLE_FILE``
    environment variable and re-exported as ``RANK_TABLE_FILE``.
    ``RANK_ID`` is set to "2" and the Ascend context is bound to device 2
    before ``init()`` is called. Judging by the test name, ``init()`` is
    expected to fail here.

    Raises:
        RuntimeError: if ``FAKE_RANK_TABLE_FILE`` is not set.
    """
    fake_rank_table_file = os.getenv("FAKE_RANK_TABLE_FILE")
    if fake_rank_table_file is None:
        # Assigning None into os.environ raises an opaque TypeError; fail with
        # a message that names the missing variable instead.
        raise RuntimeError(
            "Environment variable FAKE_RANK_TABLE_FILE must be set to the "
            "path of the fake rank table file.")
    os.environ["RANK_TABLE_FILE"] = fake_rank_table_file
    os.environ["RANK_ID"] = "2"
    context.set_context(device_id=2, device_target="Ascend")
    init()

View File

@ -0,0 +1,32 @@
#!/bin/bash
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
# Resolve the directory that contains this script so that nothing below
# depends on the caller's working directory.
BASE_PATH=$(
  cd "$(dirname "$0")" || exit 1
  pwd
)
[ -n "${BASE_PATH}" ] || exit 1
export FAKE_RANK_TABLE_FILE="${BASE_PATH}/fake_rank_table_file.json"

# Run from the script directory: the pytest target and the log file are both
# given as relative paths.
cd "${BASE_PATH}" || exit 1
pytest -s -v hccl_init_fail.py > test_hccl_init_fail.log 2>&1 &
process_pid=$!
wait "${process_pid}"
status=$?

# The case is expected to fail, so a zero pytest status is the error here.
# Use the POSIX numeric test: "==" is a bashism and breaks when the script is
# started with a plain POSIX "sh".
if [ "${status}" -eq 0 ]; then
  exit 1
fi
exit 0

View File

@ -0,0 +1,26 @@
#!/bin/bash
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
# Resolve the directory that contains this script and run from there: the
# pytest target and the log file are both given as relative paths.
BASE_PATH=$(
  cd "$(dirname "$0")" || exit 1
  pwd
)
[ -n "${BASE_PATH}" ] || exit 1
cd "${BASE_PATH}" || exit 1

pytest -s -v tbe_compile_fail.py > test_tbe_compile_fail.log 2>&1 &
process_pid=$!
wait "${process_pid}"
status=$?

# The case is expected to fail, so a zero pytest status is the error here.
# Use the POSIX numeric test: "==" is a bashism and breaks when the script is
# started with a plain POSIX "sh".
if [ "${status}" -eq 0 ]; then
  exit 1
fi
exit 0

View File

@ -0,0 +1,53 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.common.api import ms_function
from mindspore.common.initializer import initializer
from mindspore.common.parameter import Parameter
from mindspore.ops import operations as P
context.set_context(device_target="Ascend")
class Net(nn.Cell):
    """Cell wrapping a single ``P.Conv2D`` built with ``stride=99``.

    The convolution uses 64 output channels, a 7x7 kernel, "valid" padding
    and a normally initialised weight of shape [64, 3, 7, 7].
    """

    def __init__(self):
        super().__init__()
        channels_out = 64
        kernel = 7
        # stride=99 is kept exactly as in the original configuration.
        conv_options = dict(mode=1,
                            pad_mode="valid",
                            pad=0,
                            stride=99,
                            dilation=1,
                            group=1)
        self.conv = P.Conv2D(channels_out, kernel, **conv_options)
        weight_shape = [channels_out, 3, kernel, kernel]
        self.w = Parameter(initializer('normal', weight_shape), name='w')

    @ms_function
    def construct(self, x):
        """Apply the convolution to ``x`` using the cell's weight."""
        return self.conv(x, self.w)
def test_tbe_compile_faile():
    """Feed a random float32 batch of shape (32, 3, 224, 224) through ``Net``.

    The result is converted with ``asnumpy()`` and printed.
    """
    batch = np.random.randn(32, 3, 224, 224).astype(np.float32)
    net = Net()
    result = net(Tensor(batch))
    print(result.asnumpy())

View File

@ -0,0 +1,49 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import os
import pytest
@pytest.mark.level1
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_single
def test_hccl_init_fail():
    """Run ``run_hccl_init_fail.sh`` and check its log for the error report.

    The script must exit with status 0, and ``test_hccl_init_fail.log`` in
    this file's directory must contain each of the expected error strings.
    """
    # Local import keeps this block self-contained.
    import subprocess

    sh_path = os.path.split(os.path.realpath(__file__))[0]
    script = os.path.join(sh_path, "run_hccl_init_fail.sh")
    log_file = os.path.join(sh_path, "test_hccl_init_fail.log")

    # Argument lists avoid shell word-splitting when the checkout path has
    # spaces or shell metacharacters. cwd is pinned to this directory because
    # the log is looked up here.
    ret = subprocess.run(["sh", script], cwd=sh_path, check=False).returncode
    assert ret == 0, f"{script} exited with status {ret}"

    expected_patterns = (
        "Ascend error occurred, error message:",
        "EI0004:",
        "Invalid ranktable, with rankID",
    )
    for pattern in expected_patterns:
        # grep exits with 0 only when at least one line matches.
        grep_ret = subprocess.run(["grep", "-c", pattern, log_file],
                                  check=False).returncode
        assert grep_ret == 0, f"pattern {pattern!r} not found in {log_file}"
@pytest.mark.level1
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_single
def test_tbe_compile_fail():
    """Run ``run_tbe_compile_fail.sh`` and check its log for the error report.

    The script must exit with status 0, and ``test_tbe_compile_fail.log`` in
    this file's directory must contain each of the expected error strings.
    """
    # Local import keeps this block self-contained.
    import subprocess

    sh_path = os.path.split(os.path.realpath(__file__))[0]
    script = os.path.join(sh_path, "run_tbe_compile_fail.sh")
    log_file = os.path.join(sh_path, "test_tbe_compile_fail.log")

    # Argument lists avoid shell word-splitting when the checkout path has
    # spaces or shell metacharacters. cwd is pinned to this directory because
    # the log is looked up here.
    ret = subprocess.run(["sh", script], cwd=sh_path, check=False).returncode
    assert ret == 0, f"{script} exited with status {ret}"

    # The patterns are grep basic regular expressions; the brackets in the
    # last one are escaped so that they match literally.
    expected_patterns = (
        "Ascend error occurred, error message:",
        "E60011:",
        r"In op\[conv2d\], the \[strideh\] must in range \[1, 63\], actual is \[99\]",
    )
    for pattern in expected_patterns:
        # grep exits with 0 only when at least one line matches.
        grep_ret = subprocess.run(["grep", "-c", pattern, log_file],
                                  check=False).returncode
        assert grep_ret == 0, f"pattern {pattern!r} not found in {log_file}"