add pynative 8p st

This commit is contained in:
caifubi 2021-09-16 10:42:06 +08:00
parent 0f85e02b91
commit fae514124b
5 changed files with 60 additions and 12 deletions

View File

@ -217,6 +217,7 @@ def test_bert_thor_8p():
sum_cost_list.append(0.0) sum_cost_list.append(0.0)
for _ in range(device_num): for _ in range(device_num):
assert not q.empty()
output = q.get() output = q.get()
loss_list = output['loss'] loss_list = output['loss']
cost_list = output['cost'] cost_list = output['cost']

View File

@ -360,6 +360,7 @@ def test_resnet_and_resnet_thor_imagenet_4p():
acc = 0.0 acc = 0.0
cost = 0.0 cost = 0.0
for i in range(device_num): for i in range(device_num):
assert not q.empty()
output = q.get() output = q.get()
acc += output['acc'] acc += output['acc']
cost += output['cost'] cost += output['cost']

View File

@ -80,6 +80,7 @@ def test_pynative_hccl_8p():
# check result # check result
for i in range(device_num): for i in range(device_num):
assert not q.empty()
assert q.get() assert q.get()
for i in range(device_num): for i in range(device_num):
@ -87,7 +88,7 @@ def test_pynative_hccl_8p():
print("End training...") print("End training...")
@pytest.mark.level0 @pytest.mark.level1
@pytest.mark.platform_arm_ascend_training @pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training @pytest.mark.platform_x86_ascend_training
@pytest.mark.env_single @pytest.mark.env_single
@ -110,6 +111,7 @@ def test_pynative_hccl_8pv2():
# check result # check result
for i in range(device_num): for i in range(device_num):
assert not q.empty()
assert q.get() assert q.get()
for i in range(device_num): for i in range(device_num):

View File

@ -90,6 +90,7 @@ def test_pynative_hccl_allreduce_8p():
# check result # check result
for i in range(device_num): for i in range(device_num):
expect_output = [[256, 256, 256, 256], [256, 256, 256, 256], [256, 256, 256, 256]] expect_output = [[256, 256, 256, 256], [256, 256, 256, 256], [256, 256, 256, 256]]
assert not q.empty()
output = Tensor(q.get()) output = Tensor(q.get())
assert np.allclose(output.asnumpy(), expect_output) assert np.allclose(output.asnumpy(), expect_output)

View File

@ -13,8 +13,10 @@
# limitations under the License. # limitations under the License.
# ============================================================================ # ============================================================================
import os
import time import time
import random import random
from multiprocessing import Process, Queue
import numpy as np import numpy as np
import pytest import pytest
@ -32,10 +34,18 @@ from mindspore.nn import Cell
from mindspore.ops import operations as P from mindspore.ops import operations as P
from mindspore.ops import composite as CP from mindspore.ops import composite as CP
from mindspore.nn.optim.momentum import Momentum from mindspore.nn.optim.momentum import Momentum
from mindspore.train.callback import LossMonitor, Callback from mindspore.train.callback import Callback
from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
from mindspore.train.loss_scale_manager import FixedLossScaleManager from mindspore.train.loss_scale_manager import FixedLossScaleManager
from mindspore.train.model import Model from mindspore.train.model import Model
from mindspore.context import ParallelMode
import mindspore.communication.management as D
MINDSPORE_HCCL_CONFIG_PATH = "/home/workspace/mindspore_config/hccl/rank_table_8p.json"
np.random.seed(1)
os.environ['GLOG_v'] = str(2)
os.environ['ASCEND_GLOBAL_LOG_LEVEL'] = str(3)
os.environ['ASCEND_GLOBAL_EVENT_ENABLE'] = str(0)
class MyTimeMonitor(Callback): class MyTimeMonitor(Callback):
def __init__(self, data_size): def __init__(self, data_size):
@ -56,7 +66,7 @@ class MyTimeMonitor(Callback):
def step_end(self, run_context): def step_end(self, run_context):
step_msseconds = (time.time() - self.step_time) * 1000 step_msseconds = (time.time() - self.step_time) * 1000
if step_msseconds < 275: if step_msseconds < 370:
self.total = self.total + 1 self.total = self.total + 1
print(f"step time:{step_msseconds}", flush=True) print(f"step time:{step_msseconds}", flush=True)
@ -405,13 +415,7 @@ class GradWrap(Cell):
return grad_by_list(self.network, weights)(x, label) return grad_by_list(self.network, weights)(x, label)
@pytest.mark.level1
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_single
def test_pynative_resnet50(): def test_pynative_resnet50():
context.set_context(mode=context.PYNATIVE_MODE, device_target="Ascend")
batch_size = 32 batch_size = 32
num_classes = 10 num_classes = 10
loss_scale = 128 loss_scale = 128
@ -423,8 +427,7 @@ def test_pynative_resnet50():
# define callbacks # define callbacks
time_cb = MyTimeMonitor(data_size=data_set.get_dataset_size()) time_cb = MyTimeMonitor(data_size=data_set.get_dataset_size())
loss_cb = LossMonitor() cb = [time_cb]
cb = [time_cb, loss_cb]
loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
loss_scale = FixedLossScaleManager(loss_scale=loss_scale, drop_overflow_update=False) loss_scale = FixedLossScaleManager(loss_scale=loss_scale, drop_overflow_update=False)
@ -435,4 +438,44 @@ def test_pynative_resnet50():
model.train(1, data_set, callbacks=cb, model.train(1, data_set, callbacks=cb,
sink_size=data_set.get_dataset_size(), dataset_sink_mode=True) sink_size=data_set.get_dataset_size(), dataset_sink_mode=True)
assert time_cb.good_step() > 10 return time_cb.good_step()
def test_pynative_resnet50_with_env(queue, device_id, device_num):
    """Run the PyNative ResNet-50 training on one device and report its result.

    Written to be used as the target of a ``multiprocessing.Process``: it
    switches into a per-device working directory, configures the MindSpore
    context and the HCCL-related environment variables for this rank,
    initialises communication, enables data-parallel mode and finally runs
    ``test_pynative_resnet50()``.

    Args:
        queue: Object exposing ``put``; receives the value returned by
            ``test_pynative_resnet50()``.
        device_id: Device index; also used as the working directory name and
            as the ``RANK_ID`` value.
        device_num: Total number of participating devices; exported as
            ``RANK_SIZE`` and passed to the auto-parallel context.

    NOTE(review): the ``test_`` prefix means pytest may try to collect this
    helper and look for fixtures named after its parameters -- confirm that
    the marker-based selection used by the test runner excludes it.
    """
    work_dir = str(device_id)
    # Create the per-device directory without going through a shell;
    # exist_ok avoids an error when a previous run left the directory behind.
    os.makedirs(work_dir, exist_ok=True)
    os.chdir(work_dir)

    context.set_context(mode=context.PYNATIVE_MODE, device_target="Ascend", device_id=device_id)

    # These variables are set before D.init() is called below.
    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = MINDSPORE_HCCL_CONFIG_PATH
    os.environ['RANK_ID'] = str(device_id)
    os.environ['RANK_SIZE'] = str(device_num)
    D.init()

    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=False,
                                      device_num=device_num)

    good_steps = test_pynative_resnet50()
    # Hand the result back to the parent process.
    queue.put(good_steps)
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.env_single
def test_pynative_resnet50_8p():
    """Launch the PyNative ResNet-50 run on 8 devices and check every result.

    One process per device is created with
    ``test_pynative_resnet50_with_env`` as its target. All processes are
    started, then all are joined, and finally one value per device is taken
    from the shared queue and required to be greater than 10.
    """
    device_num = 8
    q = Queue()
    workers = [
        Process(target=test_pynative_resnet50_with_env, args=(q, rank, device_num))
        for rank in range(device_num)
    ]

    # Start every worker before waiting on any of them.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    # check result
    for _ in range(device_num):
        # An empty queue here means a worker exited without reporting.
        assert not q.empty()
        assert q.get() > 10