From 4cd237eee4495ad838a312164d0d4b3e43be9232 Mon Sep 17 00:00:00 2001 From: ZPaC Date: Thu, 16 Apr 2020 09:56:43 +0800 Subject: [PATCH] Add GPU NCCL CI test cases. --- tests/st/nccl/test_nccl_all.py | 44 ++++++++++++++++++++ tests/st/nccl/test_nccl_all_reduce_op.py | 2 +- tests/st/nccl/test_nccl_lenet.py | 3 +- tests/st/nccl/test_nccl_reduce_scatter_op.py | 2 - 4 files changed, 47 insertions(+), 4 deletions(-) create mode 100644 tests/st/nccl/test_nccl_all.py diff --git a/tests/st/nccl/test_nccl_all.py b/tests/st/nccl/test_nccl_all.py new file mode 100644 index 00000000000..99494bb7411 --- /dev/null +++ b/tests/st/nccl/test_nccl_all.py @@ -0,0 +1,44 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================
+import os
+import pytest
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_single
+def test_nccl_lenet():  # Shells out to mpirun (-n 8) and requires a zero status.
+    exit_status = os.system("mpirun -n 8 pytest -s test_nccl_lenet.py")
+    assert exit_status == 0
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_single
+def test_nccl_all_reduce_op():  # Shells out to mpirun (-n 8) and requires a zero status.
+    exit_status = os.system("mpirun -n 8 pytest -s test_nccl_all_reduce_op.py")
+    assert exit_status == 0
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_single
+def test_nccl_all_gather_op():  # Shells out to mpirun (-n 8) and requires a zero status.
+    exit_status = os.system("mpirun -n 8 pytest -s test_nccl_all_gather_op.py")
+    assert exit_status == 0
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_single
+def test_nccl_reduce_scatter_op():  # Shells out to mpirun (-n 8) and requires a zero status.
+    exit_status = os.system("mpirun -n 8 pytest -s test_nccl_reduce_scatter_op.py")
+    assert exit_status == 0
diff --git a/tests/st/nccl/test_nccl_all_reduce_op.py b/tests/st/nccl/test_nccl_all_reduce_op.py index 3ba8b219e48..7c2e5794637 100644 --- a/tests/st/nccl/test_nccl_all_reduce_op.py +++ b/tests/st/nccl/test_nccl_all_reduce_op.py @@ -20,7 +20,7 @@ import mindspore.context as context from mindspore.common.initializer import initializer from mindspore.common.parameter import Parameter from mindspore.communication.management import init, NCCL_WORLD_COMM_GROUP, get_rank, get_group_size -context.set_context(mode=context.GRAPH_MODE, device_target='GPU') +context.set_context(mode=context.GRAPH_MODE, device_target='GPU', enable_dynamic_memory=False) init('nccl') rank = get_rank() diff --git a/tests/st/nccl/test_nccl_lenet.py b/tests/st/nccl/test_nccl_lenet.py index 5642603d42f..2aebc5da501 100644 --- a/tests/st/nccl/test_nccl_lenet.py +++ b/tests/st/nccl/test_nccl_lenet.py @@ -27,7 +27,7 @@ context.set_context(mode=context.GRAPH_MODE, device_target="GPU") init('nccl') epoch = 2 -total = 50000
+total = 5000 batch_size = 32 mini_batch = total // batch_size @@ -94,3 +94,4 @@ def test_lenet_nccl(): with open("ms_loss.txt", "w") as fo2: fo2.write("loss:") fo2.write(str(losses[-5:])) + assert(losses[-1] < 0.01) diff --git a/tests/st/nccl/test_nccl_reduce_scatter_op.py b/tests/st/nccl/test_nccl_reduce_scatter_op.py index af22c7690fd..32c1f31788c 100644 --- a/tests/st/nccl/test_nccl_reduce_scatter_op.py +++ b/tests/st/nccl/test_nccl_reduce_scatter_op.py @@ -62,8 +62,6 @@ def test_ReduceScatter(): expect1 = np.ones([1, 1, 3, 3]).astype(np.float32) * 0.01 * size diff1 = output[1].asnumpy() - expect1 error1 = np.ones(shape=expect1.shape) * 1.0e-5 - print(expect1) - print(output[1]) assert np.all(diff1 < error1) assert (output[1].shape() == expect1.shape)