forked from mindspore-Ecosystem/mindspore
add some allreduce st test cases in pynative mode
parent 837d6e71de
commit 6ae6a27688
@@ -16,12 +16,11 @@
"""test bert thor performance with 8p on mlperf dataset"""

import os
from multiprocessing import Process
from multiprocessing import Process, Queue
import pytest
import numpy as np
import mindspore.nn as nn
from mindspore import Tensor
import mindspore.dataset as dataset
from mindspore import dtype as mstype
from mindspore.ops import operations as P
import mindspore.communication.management as D
@@ -31,7 +30,6 @@ from mindspore.context import ParallelMode
MINDSPORE_HCCL_CONFIG_PATH = "/home/workspace/mindspore_config/hccl/rank_table_8p.json"

np.random.seed(1)
dataset.config.set_seed(1)
os.environ['GLOG_v'] = str(2)

class AllReduceNet(nn.Cell):
@@ -42,7 +40,7 @@ class AllReduceNet(nn.Cell):
    def construct(self, x):
        return self.all_reduce(x)

def train_allreduce_8p(device_id, device_num):
def train_allreduce_8p(q, device_id, device_num):
    os.system("mkdir " + str(device_id))
    os.chdir(str(device_id))
    context.set_context(mode=context.PYNATIVE_MODE, device_target="Ascend", device_id=device_id)
@@ -58,7 +56,7 @@ def train_allreduce_8p(device_id, device_num):
    input_x = np.ones([32, 255, 255, 3]).astype(np.float32)
    except_output = input_x * 8
    output = net(Tensor(input_x, mstype.float32))
    assert np.allclose(output.asnumpy(), except_output)
    q.put(np.allclose(output.asnumpy(), except_output))

@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@@ -67,9 +65,10 @@ def train_allreduce_8p(device_id, device_num):
def test_pynative_hccl_8p():
    device_num = 8
    process = []
    q = Queue()
    for i in range(device_num):
        device_id = i
        process.append(Process(target=train_allreduce_8p, args=(device_id, device_num)))
        process.append(Process(target=train_allreduce_8p, args=(q, device_id, device_num)))

    for i in range(device_num):
        process[i].start()
@@ -79,6 +78,10 @@ def test_pynative_hccl_8p():
    for i in range(device_num):
        process[i].join()

    # check result
    for i in range(device_num):
        assert q.get()

    for i in range(device_num):
        os.system("rm -rf " + str(i))
@@ -0,0 +1,99 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

"""test hccl allreduce performance with 8p"""

import os
from multiprocessing import Process, Queue
import pytest
import numpy as np
import mindspore.nn as nn
from mindspore import Tensor
from mindspore import dtype as mstype
from mindspore.ops import operations as P
import mindspore.communication.management as D
from mindspore import context
from mindspore.context import ParallelMode

MINDSPORE_HCCL_CONFIG_PATH = "/home/workspace/mindspore_config/hccl/rank_table_8p.json"

np.random.seed(1)
os.environ['GLOG_v'] = str(2)

class AllReduceNet(nn.Cell):
    def __init__(self):
        super(AllReduceNet, self).__init__()
        self.mul = P.Mul()
        self.all_reduce = P.AllReduce()
        self.add = P.Add()

    def construct(self, x):
        x = self.mul(x, 2)
        y1 = Tensor(np.array([[2, 2, 2, 2], [2, 2, 2, 2], [2, 2, 2, 2]])).astype(np.float32)
        z = self.add(x, y1)
        z = self.all_reduce(z)
        y2 = Tensor(np.array([[-16, -16, -16, -16], [-16, -16, -16, -16], [-16, -16, -16, -16]])).astype(np.float32)
        out = self.add(z, y2)
        out = self.all_reduce(out)
        out = self.mul(out, 2)
        return out

def train_allreduce_8p(q, device_id, device_num):
    os.system("mkdir " + str(device_id))
    os.chdir(str(device_id))
    context.set_context(mode=context.PYNATIVE_MODE, device_target="Ascend", device_id=device_id)
    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = MINDSPORE_HCCL_CONFIG_PATH
    os.environ['RANK_ID'] = str(device_id)
    os.environ['RANK_SIZE'] = str(device_num)
    D.init()
    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=False,
                                      device_num=device_num)

    net = AllReduceNet()
    input_x = np.ones([3, 4]).astype(np.float32)
    output = net(Tensor(input_x, mstype.float32))
    q.put(output)

@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_single
def test_pynative_hccl_allreduce_8p():
    device_num = 8
    process = []
    q = Queue()
    for i in range(device_num):
        device_id = i
        process.append(Process(target=train_allreduce_8p, args=(q, device_id, device_num)))

    for i in range(device_num):
        process[i].start()

    print("Waiting for all subprocesses done...")

    for i in range(device_num):
        process[i].join()

    # check result
    for i in range(device_num):
        expect_output = [[256, 256, 256, 256], [256, 256, 256, 256], [256, 256, 256, 256]]
        output = Tensor(q.get())
        assert np.allclose(output.asnumpy(), expect_output)

    for i in range(device_num):
        os.system("rm -rf " + str(i))

    print("End training...")
@@ -0,0 +1,24 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

import os
import pytest

@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_single
def test_pynative_nccl_allreduce():
    return_code = os.system("mpirun -n 8 pytest -s test_pynative_nccl_allreduce.py")
    assert return_code == 0
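This wrapper test passes only when every rank launched by mpirun succeeds, because os.system returns a non-zero status when the wrapped pytest command fails. A trivial illustration of that exit-status behaviour (using standard shell commands, not part of the test itself):

import os

# os.system returns 0 only when the wrapped command exits successfully.
assert os.system("true") == 0
assert os.system("false") != 0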
@@ -0,0 +1,55 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

"""test nccl allreduce performance with 8p"""

import os
import numpy as np
import mindspore.nn as nn
from mindspore import Tensor
from mindspore import dtype as mstype
from mindspore.ops import operations as P
from mindspore import context
from mindspore.communication.management import init

np.random.seed(1)
os.environ['GLOG_v'] = str(2)
context.set_context(mode=context.PYNATIVE_MODE, device_target="GPU")
init()

class AllReduceNet(nn.Cell):
    def __init__(self):
        super(AllReduceNet, self).__init__()
        self.mul = P.Mul()
        self.all_reduce = P.AllReduce()
        self.add = P.Add()

    def construct(self, x):
        x = self.mul(x, 2)
        y1 = Tensor(np.array([[2, 2, 2, 2], [2, 2, 2, 2], [2, 2, 2, 2]])).astype(np.float32)
        z = self.add(x, y1)
        z = self.all_reduce(z)
        y2 = Tensor(np.array([[-16, -16, -16, -16], [-16, -16, -16, -16], [-16, -16, -16, -16]])).astype(np.float32)
        out = self.add(z, y2)
        out = self.all_reduce(out)
        out = self.mul(out, 2)
        return out

def test_pynative_nccl_allreduce_8p():
    net = AllReduceNet()
    input_x = np.ones([3, 4]).astype(np.float32)
    expect_output = [[256, 256, 256, 256], [256, 256, 256, 256], [256, 256, 256, 256]]
    output = net(Tensor(input_x, mstype.float32))
    assert np.allclose(output.asnumpy(), expect_output)