From 89a4ebf8a155624e7bc0dbd236df3f08322a09bb Mon Sep 17 00:00:00 2001
From: Yi Huaijie
Date: Thu, 20 Aug 2020 09:36:06 +0800
Subject: [PATCH] parallel mode must be set before creating an initializer

---
 mindspore/ccsrc/frontend/parallel/context.cc  |  2 +
 mindspore/ccsrc/frontend/parallel/context.h   |  4 ++
 mindspore/ccsrc/pipeline/jit/init.cc          |  2 +
 mindspore/common/api.py                       |  3 +-
 mindspore/common/initializer.py               |  2 +
 mindspore/context.py                          |  3 ++
 mindspore/parallel/_auto_parallel_context.py  | 20 ++++++++
 mindspore/parallel/_utils.py                  | 13 +++++
 tests/st/nccl/test_nccl_lenet.py              |  2 +-
 .../communication/test_data_parallel_lenet.py |  2 +
 tests/ut/python/model/test_mix_precision.py   |  4 +-
 tests/ut/python/nn/test_parameter.py          |  3 +-
 .../parallel/test_add_relu_redistribution.py  |  7 ++-
 .../python/parallel/test_allreduce_fusion.py  | 16 +++++--
 tests/ut/python/parallel/test_alltoall.py     |  3 +-
 tests/ut/python/parallel/test_arithmetic.py   | 38 ++++++++-------
 .../parallel/test_auto_parallel_BN_PReLU.py   |  2 +-
 .../test_auto_parallel_double_subgraphs.py    |  2 +-
 .../test_auto_parallel_tuple_depend.py        |  2 +-
 .../parallel/test_auto_parallel_two_bn.py     |  4 +-
 .../ut/python/parallel/test_batch_parallel.py |  2 +-
 .../parallel/test_batchnorm_batch_parallel.py |  6 +--
 .../ut/python/parallel/test_bn_prelu_cell.py  |  2 +-
 tests/ut/python/parallel/test_get_next.py     |  8 ++--
 .../parallel/test_initializer_weight_slice.py | 47 ++++++++++++++-----
 tests/ut/python/parallel/test_linear.py       |  2 +-
 .../parallel/test_loss_and_optimizer.py       | 10 ++--
 tests/ut/python/parallel/test_one_hot_net.py  |  8 ++--
 .../parallel/test_operator_model_parallel.py  | 10 ++--
 tests/ut/python/parallel/test_optimizer.py    |  4 +-
 .../parallel/test_optimizer_clone_weight.py   |  4 +-
 tests/ut/python/parallel/test_reshape.py      |  4 +-
 .../test_using_seed_for_initializer.py        |  3 ++
 .../parallel/test_virtual_dataset_3_input.py  |  6 +--
 tests/ut/python/train/test_amp.py             |  8 ++--
 35 files changed, 174 insertions(+), 84 deletions(-)

diff --git a/mindspore/ccsrc/frontend/parallel/context.cc b/mindspore/ccsrc/frontend/parallel/context.cc
index 20445f9c6a7..0ba91857550 100644
--- a/mindspore/ccsrc/frontend/parallel/context.cc
+++ b/mindspore/ccsrc/frontend/parallel/context.cc
@@ -81,6 +81,8 @@ void ParallelContext::set_mirror_mean(bool mirror_mean) { mirror_mean_ = mirror_
 
 void ParallelContext::set_full_batch(bool full_batch) { full_batch_ = full_batch; }
 
+void ParallelContext::set_has_initializer(bool has_initializer) { has_initializer_ = has_initializer; }
+
 void ParallelContext::set_cast_before_mirror(bool cast_before_mirror) { cast_before_mirror_ = cast_before_mirror; }
 
 void ParallelContext::set_loss_repeated_mean(bool loss_repeated_mean) { loss_repeated_mean_ = loss_repeated_mean; }
diff --git a/mindspore/ccsrc/frontend/parallel/context.h b/mindspore/ccsrc/frontend/parallel/context.h
index 34363726411..e32ef855e3e 100644
--- a/mindspore/ccsrc/frontend/parallel/context.h
+++ b/mindspore/ccsrc/frontend/parallel/context.h
@@ -58,6 +58,9 @@ class ParallelContext {
   void set_full_batch(bool full_batch);
   bool full_batch() const { return full_batch_; }
 
+  void set_has_initializer(bool has_initializer);
+  bool has_initializer() const { return has_initializer_; }
+
   void set_cast_before_mirror(bool cast_before_mirror);
   bool cast_before_mirror() const { return cast_before_mirror_; }
 
@@ -112,6 +115,7 @@ class ParallelContext {
   static std::shared_ptr<ParallelContext> inst_context_;
   bool mirror_mean_;
   bool full_batch_;
+  bool has_initializer_ = false;
   bool cast_before_mirror_;
   bool loss_repeated_mean_;
   int32_t device_num_;
diff --git a/mindspore/ccsrc/pipeline/jit/init.cc b/mindspore/ccsrc/pipeline/jit/init.cc
index 3b96b8938e0..3d4e7c6bdcc 100644
--- a/mindspore/ccsrc/pipeline/jit/init.cc
+++ b/mindspore/ccsrc/pipeline/jit/init.cc
@@ -193,6 +193,8 @@ PYBIND11_MODULE(_c_expression, m) {
     .def("get_strategy_ckpt_save_file", &ParallelContext::strategy_ckpt_save_file,
          "Get strategy checkpoint save file.")
     .def("set_full_batch", &ParallelContext::set_full_batch, "Set whether load full batch on each device.")
     .def("get_full_batch", &ParallelContext::full_batch, "Get whether load full batch on each device.")
+    .def("set_has_initializer", &ParallelContext::set_has_initializer, "Set whether any Initializer has been created.")
+    .def("get_has_initializer", &ParallelContext::has_initializer, "Get whether any Initializer has been created.")
     .def("set_enable_parallel_optimizer", &ParallelContext::set_enable_parallel_optimizer,
          "Set enable/disable parallel optimizer.")
     .def("get_enable_parallel_optimizer", &ParallelContext::enable_parallel_optimizer,
diff --git a/mindspore/common/api.py b/mindspore/common/api.py
index b827ffe3455..b30d7ed0d12 100644
--- a/mindspore/common/api.py
+++ b/mindspore/common/api.py
@@ -24,7 +24,7 @@ from mindspore import log as logger
 from .._c_expression import generate_key, Executor_, Tensor, MetaTensor, PynativeExecutor_
 from .._c_expression import verify_inputs_signature, init_exec_dataset, _set_dataset_mode_config, init_backend
 from .tensor import Tensor as MsTensor
-from ..parallel._utils import _get_device_num, _get_global_rank, _need_to_full, _to_full_tensor
+from ..parallel._utils import _get_device_num, _get_global_rank, _need_to_full, _to_full_tensor, _set_has_initializer
 
 # store ms_function class compiled pipeline cache
 ms_compile_cache = {}
@@ -383,6 +383,7 @@ class _Executor:
             Str, the full phase of the cell.
             Bool, if the graph has been compiled before, return False, else return True.
         """
+        _set_has_initializer(False)
         obj.check_names()
         args_names, args_list = _generate_pip_args(obj, *args)
         dic = dict(zip(args_names, args_list))
diff --git a/mindspore/common/initializer.py b/mindspore/common/initializer.py
index 546d1e99b15..4982243f044 100644
--- a/mindspore/common/initializer.py
+++ b/mindspore/common/initializer.py
@@ -24,6 +24,7 @@ from mindspore import log as logger
 from . import dtype as mstype
 from .tensor import Tensor
 from .._c_expression import random_normal
+from ..parallel._utils import _set_has_initializer
 
 _INITIALIZER_ALIAS = dict()
 
@@ -42,6 +43,7 @@ class Initializer:
         self._kwargs = kwargs
         self.shape = None
         self.dtype = None
+        _set_has_initializer(True)
 
     def _initialize(self, *kwargs):
         raise NotImplementedError('Must be overridden!')
diff --git a/mindspore/context.py b/mindspore/context.py
index 6240cbcadb1..1f5fe65f716 100644
--- a/mindspore/context.py
+++ b/mindspore/context.py
@@ -437,6 +437,8 @@ def set_auto_parallel_context(**kwargs):
     If a program has tasks with different parallel modes, then before setting new parallel mode for next task,
     interface mindspore.context.reset_auto_parallel_context() needs to be called to reset the configuration.
+    The parallel mode must be set or changed before any Initializer is created, or a RuntimeError
+    will be raised.
 
     Args:
         device_num (int): Available device number, the value must be in [1, 4096]. Default: 1.
@@ -477,6 +479,7 @@ def set_auto_parallel_context(**kwargs):
 
     Raises:
         ValueError: If input key is not attribute in auto parallel context.
+        RuntimeError: If an Initializer has been created before setting or changing parallel_mode.
 
     Examples:
         >>> context.set_auto_parallel_context(device_num=8)
diff --git a/mindspore/parallel/_auto_parallel_context.py b/mindspore/parallel/_auto_parallel_context.py
index e2369c4aa69..7da25b8fd37 100644
--- a/mindspore/parallel/_auto_parallel_context.py
+++ b/mindspore/parallel/_auto_parallel_context.py
@@ -176,8 +176,12 @@ class _AutoParallelContext:
 
         Raises:
             ValueError: If parallel mode is not supported.
+            RuntimeError: If an Initializer has been created before setting or changing parallel_mode.
         """
         self.check_context_handle()
+        if self.get_has_initializer():
+            self.set_has_initializer(False)
+            raise RuntimeError("Parallel mode must be set or changed before any Initializer is created.")
         ret = self._context_handle.set_parallel_mode(parallel_mode)
         if ret is False:
             raise ValueError("Parallel mode does not support {}".format(parallel_mode))
@@ -249,6 +253,21 @@ class _AutoParallelContext:
         self.check_context_handle()
         return self._context_handle.get_full_batch()
 
+    def set_has_initializer(self, has_initializer):
+        """
+        Set whether any Initializer has been created.
+
+        Args:
+            has_initializer (bool): True if an Initializer has been created.
+        """
+        self.check_context_handle()
+        self._context_handle.set_has_initializer(has_initializer)
+
+    def get_has_initializer(self):
+        """Get whether any Initializer has been created."""
+        self.check_context_handle()
+        return self._context_handle.get_has_initializer()
+
     def set_strategy_ckpt_save_file(self, strategy_ckpt_save_file):
         """
         Set strategy checkpoint save path.
@@ -543,6 +562,7 @@ def _set_auto_parallel_context(**kwargs):
 
     Raises:
         ValueError: If input key is not attribute in auto parallel context.
+        RuntimeError: If an Initializer has been created before setting or changing parallel_mode.
     """
     for key, value in kwargs.items():
         if key not in _set_auto_parallel_context_func_map:
diff --git a/mindspore/parallel/_utils.py b/mindspore/parallel/_utils.py
index ff1fbcc6c2b..3ed2416b359 100644
--- a/mindspore/parallel/_utils.py
+++ b/mindspore/parallel/_utils.py
@@ -32,6 +32,19 @@ def _get_full_batch():
     """Get whether to use full_batch."""
     return auto_parallel_context().get_full_batch()
 
+def _get_has_initializer():
+    """Get whether any Initializer has been created."""
+    return auto_parallel_context().get_has_initializer()
+
+def _set_has_initializer(has_initializer):
+    """
+    Set whether any Initializer has been created.
+
+    Args:
+        has_initializer (bool): True if an Initializer has been created.
+ """ + auto_parallel_context().set_has_initializer(has_initializer) + def _need_to_full(): """Check whether to convert input to full shape or tensor.""" diff --git a/tests/st/nccl/test_nccl_lenet.py b/tests/st/nccl/test_nccl_lenet.py index b22bde5ab13..37fd6363c06 100644 --- a/tests/st/nccl/test_nccl_lenet.py +++ b/tests/st/nccl/test_nccl_lenet.py @@ -78,6 +78,7 @@ def multisteplr(total_steps, gap, base_lr=0.9, gamma=0.1, dtype=mstype.float32): def test_lenet_nccl(): + context.set_auto_parallel_context(parallel_mode="data_parallel", mirror_mean=True, device_num=get_group_size()) net = LeNet() net.set_train() @@ -86,7 +87,6 @@ def test_lenet_nccl(): mom_optimizer = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate, momentum) criterion = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True) net_with_criterion = WithLossCell(net, criterion) - context.set_auto_parallel_context(parallel_mode="data_parallel", mirror_mean=True, device_num=get_group_size()) train_network = TrainOneStepCell(net_with_criterion, mom_optimizer) train_network.set_train() losses = [] diff --git a/tests/ut/python/communication/test_data_parallel_lenet.py b/tests/ut/python/communication/test_data_parallel_lenet.py index 7a5062b9410..42fc122adec 100755 --- a/tests/ut/python/communication/test_data_parallel_lenet.py +++ b/tests/ut/python/communication/test_data_parallel_lenet.py @@ -24,6 +24,7 @@ import mindspore.nn as nn from mindspore import Tensor, Model, ParallelMode from mindspore.nn.optim import Momentum from mindspore.ops import operations as P +from mindspore.parallel._utils import _set_has_initializer _current_dir = os.path.dirname(os.path.realpath(__file__)) + "/../test_data" @@ -89,3 +90,4 @@ def test_lenet5_train_step_training_pynative(): Model(network=network, loss_fn=loss_fn, optimizer=optimizer) context.set_context(mode=context.GRAPH_MODE) context.reset_auto_parallel_context() + _set_has_initializer(False) diff --git a/tests/ut/python/model/test_mix_precision.py b/tests/ut/python/model/test_mix_precision.py index f1fc2cc2f71..89a71bd37ce 100644 --- a/tests/ut/python/model/test_mix_precision.py +++ b/tests/ut/python/model/test_mix_precision.py @@ -96,6 +96,8 @@ def test_on_momentum(): def test_data_parallel_with_cast(): """test_data_parallel_with_cast""" + context.reset_auto_parallel_context() + context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True, device_num=8) predict = Tensor(np.ones([1, 1, 32, 32]).astype(np.float32) * 0.01) label = Tensor(np.zeros([1, 10]).astype(np.float32)) net = LeNet5() @@ -107,8 +109,6 @@ def test_data_parallel_with_cast(): learning_rate=0.1, momentum=0.9) net = WithLossCell(net, loss_fn) - context.reset_auto_parallel_context() - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True, device_num=8) net = TrainOneStepCell(net, optimizer) _executor.compile(net, predict, label) diff --git a/tests/ut/python/nn/test_parameter.py b/tests/ut/python/nn/test_parameter.py index 0ff0949b3d2..58a118fded6 100644 --- a/tests/ut/python/nn/test_parameter.py +++ b/tests/ut/python/nn/test_parameter.py @@ -21,7 +21,7 @@ from mindspore import context, Tensor, Parameter, ParameterTuple from mindspore._checkparam import _check_str_by_regular from mindspore.common import dtype as mstype from mindspore.common.initializer import initializer - +from mindspore.parallel._utils import _set_has_initializer def test_parameter_init(): dat = np.array([[1, 2, 3], [2, 3, 4]]) @@ -170,6 +170,7 @@ def 
test_scalar_parameter_update(): def test_parameter_lazy_init(): + _set_has_initializer(False) # support lazy init in SEMI_AUTO_PARALLEL mode context.reset_auto_parallel_context() context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=8) diff --git a/tests/ut/python/parallel/test_add_relu_redistribution.py b/tests/ut/python/parallel/test_add_relu_redistribution.py index 08ef18699ac..9a8a43b9bf8 100644 --- a/tests/ut/python/parallel/test_add_relu_redistribution.py +++ b/tests/ut/python/parallel/test_add_relu_redistribution.py @@ -20,6 +20,7 @@ from mindspore import context from mindspore.common.api import _executor from mindspore.ops import composite as C from mindspore.ops import operations as P +from mindspore.parallel._utils import _set_has_initializer from tests.ut.python.ops.test_math_ops import VirtualLoss @@ -60,12 +61,13 @@ def compile_net(net, x, y): def test_add_relu_stride_slice(): + _set_has_initializer(False) context.set_auto_parallel_context(device_num=8, global_rank=7) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") strategy0 = ((1, 1), (1, 1)) strategy1 = ((8, 1),) net = Grad(NetWithLoss(AddRelu(strategy0, strategy1))) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") x = Tensor(np.ones([128, 32]), dtype=ms.float32) y = Tensor(np.ones([128, 32]), dtype=ms.float32) @@ -73,12 +75,13 @@ def test_add_relu_stride_slice(): def test_add_relu_all_gather(): + _set_has_initializer(False) context.set_auto_parallel_context(device_num=8, global_rank=7) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") strategy0 = ((8, 1), (8, 1)) strategy1 = ((1, 1),) net = Grad(NetWithLoss(AddRelu(strategy0, strategy1))) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") x = Tensor(np.ones([128, 32]), dtype=ms.float32) y = Tensor(np.ones([128, 32]), dtype=ms.float32) diff --git a/tests/ut/python/parallel/test_allreduce_fusion.py b/tests/ut/python/parallel/test_allreduce_fusion.py index 607213f8064..bd1f88f85f8 100644 --- a/tests/ut/python/parallel/test_allreduce_fusion.py +++ b/tests/ut/python/parallel/test_allreduce_fusion.py @@ -23,6 +23,7 @@ from mindspore.nn.optim.momentum import Momentum from mindspore.parallel import _cost_model_context as cost_model_context from mindspore.parallel._auto_parallel_context import auto_parallel_context from mindspore.train import Model, ParallelMode +from mindspore.parallel._utils import _set_has_initializer from tests.dataset_mock import MindData @@ -105,10 +106,8 @@ def train_common(net): momentum = 0.9 epoch_size = 2 device_num = 4 - context.reset_auto_parallel_context() auto_parallel_context().set_enable_all_reduce_fusion(enable_all_reduce_fusion=True) - context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=device_num, - parameter_broadcast=False) + context.set_auto_parallel_context(device_num=device_num, parameter_broadcast=False) context.set_context(mode=context.GRAPH_MODE) predict = Tensor(np.ones([batch_size, 128]), dtype=ms.float32) @@ -183,9 +182,12 @@ def test_allreduce_fusion_parameters(): def test_allreduce_fusion1(): + _set_has_initializer(False) cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_algorithm=1) cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_times=2) cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_tail_percent=0.5) + context.reset_auto_parallel_context() + 
context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL) net = SimpleDMLNet(DenseNet1(has_bias=False, activation=None), DenseNet2(has_bias=False, activation=None)) allreduce_fusion_dict = train_common(net) expect_dict = {'backbone2.fc8.weight': 2, @@ -210,6 +212,8 @@ def test_allreduce_fusion2(): cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_times=2) cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_tail_percent=0.5) cost_model_context.reset_cost_model_context() + context.reset_auto_parallel_context() + context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL) net = SimpleDMLNet(DenseNet1(has_bias=False, activation=None), DenseNet2(has_bias=False, activation=None)) allreduce_fusion_dict = train_common(net) expect_dict = {} @@ -221,6 +225,8 @@ def test_allreduce_fusion3(): cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_algorithm=1) cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_times=3) cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_tail_percent=0.3333333) + context.reset_auto_parallel_context() + context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL) net = SimpleDMLNet(DenseNet1(has_bias=True, activation='relu'), DenseNet2(has_bias=False, activation='relu')) allreduce_fusion_dict = train_common(net) expect_dict = {'backbone2.fc8.weight': 3, @@ -247,6 +253,8 @@ def test_allreduce_fusion4(): cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_algorithm=1) cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_times=2) cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_tail_percent=0.5) + context.reset_auto_parallel_context() + context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL) net = SimpleDMLNet(DenseNet2(has_bias=False, activation=None), DenseNet2(has_bias=False, activation=None)) allreduce_fusion_dict = train_common(net) expect_dict = {'backbone2.fc8.weight': 2, @@ -276,6 +284,8 @@ def test_allreduce_fusion5(): cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_allreduce_inherent_time=0.05) cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_allreduce_bandwidth=0.000001) cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_computation_time_parameter=0.0000015) + context.reset_auto_parallel_context() + context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL) net = SimpleDMLNet(DenseNet2(has_bias=False, activation=None), DenseNet2(has_bias=False, activation=None)) allreduce_fusion_dict = train_common(net) diff --git a/tests/ut/python/parallel/test_alltoall.py b/tests/ut/python/parallel/test_alltoall.py index 96ff8435046..26f19d722bb 100644 --- a/tests/ut/python/parallel/test_alltoall.py +++ b/tests/ut/python/parallel/test_alltoall.py @@ -23,7 +23,7 @@ from mindspore.common.parameter import Parameter from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits from mindspore.nn.optim.momentum import Momentum from mindspore.ops import operations as P -from mindspore.parallel._utils import _reset_op_id +from mindspore.parallel._utils import _reset_op_id, _set_has_initializer from mindspore.train import Model, ParallelMode from tests.dataset_mock import MindData @@ -90,6 +90,7 @@ def all_to_all_common(strategy1): def test_all_to_all(): + _set_has_initializer(False) strategy1 = ((8, 1),) context.set_context(mode=context.GRAPH_MODE, save_graphs=False) _reset_op_id() 
diff --git a/tests/ut/python/parallel/test_arithmetic.py b/tests/ut/python/parallel/test_arithmetic.py index 1101c0ecd29..134685a620b 100644 --- a/tests/ut/python/parallel/test_arithmetic.py +++ b/tests/ut/python/parallel/test_arithmetic.py @@ -20,6 +20,7 @@ from mindspore import Parameter, Tensor, context from mindspore.common.api import _executor from mindspore.ops import composite as C from mindspore.ops import operations as P +from mindspore.parallel._utils import _set_has_initializer from tests.ut.python.ops.test_math_ops import VirtualLoss @@ -60,11 +61,12 @@ def test_matmul_sub(): out = self.sub(out, b) return out + _set_has_initializer(False) context.set_auto_parallel_context(device_num=8, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") strategy1 = ((2, 2), (2, 2)) strategy2 = ((4, 2), (4, 2)) net = GradWrap(NetWithLoss(Net(strategy1, strategy2))) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") x = Tensor(np.ones([64, 32]), dtype=ms.float32) y = Tensor(np.ones([32, 64]), dtype=ms.float32) @@ -85,10 +87,10 @@ def test_matmul_add(): return out context.set_auto_parallel_context(device_num=8, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") strategy1 = ((2, 2), (2, 2)) strategy2 = ((4, 2), (4, 2)) net = GradWrap(NetWithLoss(Net(strategy1, strategy2))) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") x = Tensor(np.ones([64, 32]), dtype=ms.float32) y = Tensor(np.ones([32, 64]), dtype=ms.float32) @@ -109,10 +111,10 @@ def test_matmul_mul(): return out context.set_auto_parallel_context(device_num=8, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") strategy1 = ((2, 2), (2, 2)) strategy2 = ((4, 2), (4, 2)) net = GradWrap(NetWithLoss(Net(strategy1, strategy2))) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") x = Tensor(np.ones([64, 32]), dtype=ms.float32) y = Tensor(np.ones([32, 64]), dtype=ms.float32) @@ -133,10 +135,10 @@ def test_matmul_div(): return out context.set_auto_parallel_context(device_num=8, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") strategy1 = ((2, 2), (2, 2)) strategy2 = ((4, 2), (4, 2)) net = GradWrap(NetWithLoss(Net(strategy1, strategy2))) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") x = Tensor(np.ones([64, 32]), dtype=ms.float32) y = Tensor(np.ones([32, 64]), dtype=ms.float32) @@ -157,10 +159,10 @@ def test_matmul_greater(): return out context.set_auto_parallel_context(device_num=8, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") strategy1 = ((2, 2), (2, 2)) strategy2 = ((4, 2), (4, 2)) net = GradWrap(NetWithLoss(Net(strategy1, strategy2))) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") x = Tensor(np.ones([64, 32]), dtype=ms.float32) y = Tensor(np.ones([32, 64]), dtype=ms.float32) @@ -181,10 +183,10 @@ def test_matmul_add_broadcast(): return out context.set_auto_parallel_context(device_num=8, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") strategy1 = ((2, 2), (2, 2)) strategy2 = ((4, 2), (2,)) net = GradWrap(NetWithLoss(Net(strategy1, strategy2))) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") x = Tensor(np.ones([64, 32]), dtype=ms.float32) y = Tensor(np.ones([32, 64]), dtype=ms.float32) @@ -205,10 +207,10 @@ def test_matmul_add_broadcast2(): return out 
context.set_auto_parallel_context(device_num=8, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") strategy1 = ((2, 4), (4, 1)) strategy2 = ((4, 1), (1, 2)) net = GradWrap(NetWithLoss(Net(strategy1, strategy2))) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") x = Tensor(np.ones([64, 32]), dtype=ms.float32) y = Tensor(np.ones([32, 1]), dtype=ms.float32) @@ -229,10 +231,10 @@ def test_matmul_sub_broadcast(): return out context.set_auto_parallel_context(device_num=8, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") strategy1 = ((2, 2), (2, 2)) strategy2 = ((4, 2), (2,)) net = GradWrap(NetWithLoss(Net(strategy1, strategy2))) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") x = Tensor(np.ones([64, 32]), dtype=ms.float32) y = Tensor(np.ones([32, 64]), dtype=ms.float32) @@ -253,10 +255,10 @@ def test_matmul_sub_broadcast2(): return out context.set_auto_parallel_context(device_num=8, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") strategy1 = ((2, 4), (4, 1)) strategy2 = ((4, 1), (1, 2)) net = GradWrap(NetWithLoss(Net(strategy1, strategy2))) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") x = Tensor(np.ones([64, 32]), dtype=ms.float32) y = Tensor(np.ones([32, 1]), dtype=ms.float32) @@ -277,10 +279,10 @@ def test_matmul_mul_broadcast(): return out context.set_auto_parallel_context(device_num=8, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") strategy1 = ((2, 2), (2, 2)) strategy2 = ((4, 2), (2,)) net = GradWrap(NetWithLoss(Net(strategy1, strategy2))) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") x = Tensor(np.ones([64, 32]), dtype=ms.float32) y = Tensor(np.ones([32, 64]), dtype=ms.float32) @@ -301,10 +303,10 @@ def test_matmul_mul_broadcast2(): return out context.set_auto_parallel_context(device_num=8, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") strategy1 = ((2, 4), (4, 1)) strategy2 = ((4, 1), (1, 2)) net = GradWrap(NetWithLoss(Net(strategy1, strategy2))) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") x = Tensor(np.ones([64, 32]), dtype=ms.float32) y = Tensor(np.ones([32, 1]), dtype=ms.float32) @@ -325,10 +327,10 @@ def test_matmul_div_broadcast(): return out context.set_auto_parallel_context(device_num=8, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") strategy1 = ((2, 2), (2, 2)) strategy2 = ((4, 2), (2,)) net = GradWrap(NetWithLoss(Net(strategy1, strategy2))) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") x = Tensor(np.ones([64, 32]), dtype=ms.float32) y = Tensor(np.ones([32, 64]), dtype=ms.float32) @@ -349,10 +351,10 @@ def test_matmul_div_broadcast2(): return out context.set_auto_parallel_context(device_num=8, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") strategy1 = ((2, 4), (4, 1)) strategy2 = ((4, 1), (1, 2)) net = GradWrap(NetWithLoss(Net(strategy1, strategy2))) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") x = Tensor(np.ones([64, 32]), dtype=ms.float32) y = Tensor(np.ones([32, 1]), dtype=ms.float32) @@ -373,10 +375,10 @@ def test_matmul_greater_broadcast(): return out context.set_auto_parallel_context(device_num=8, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") strategy1 = ((2, 2), (2, 2)) strategy2 = ((4, 
2), (2,)) net = GradWrap(NetWithLoss(Net(strategy1, strategy2))) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") x = Tensor(np.ones([64, 32]), dtype=ms.float32) y = Tensor(np.ones([32, 64]), dtype=ms.float32) @@ -397,10 +399,10 @@ def test_matmul_greater_broadcast2(): return out context.set_auto_parallel_context(device_num=8, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") strategy1 = ((2, 4), (4, 1)) strategy2 = ((4, 1), (1, 2)) net = GradWrap(NetWithLoss(Net(strategy1, strategy2))) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") x = Tensor(np.ones([64, 32]), dtype=ms.float32) y = Tensor(np.ones([32, 1]), dtype=ms.float32) @@ -421,10 +423,10 @@ def test_matmul_floordiv(): return out context.set_auto_parallel_context(device_num=8, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") strategy1 = ((2, 2), (2, 2)) strategy2 = ((4, 2), (4, 2)) net = GradWrap(NetWithLoss(Net(strategy1, strategy2))) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") x = Tensor(np.ones([64, 32]), dtype=ms.float32) y = Tensor(np.ones([32, 64]), dtype=ms.float32) @@ -445,10 +447,10 @@ def test_matmul_floordiv_broadcast(): return out context.set_auto_parallel_context(device_num=8, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") strategy1 = ((2, 2), (2, 2)) strategy2 = ((4, 2), (2,)) net = GradWrap(NetWithLoss(Net(strategy1, strategy2))) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") x = Tensor(np.ones([64, 32]), dtype=ms.float32) y = Tensor(np.ones([32, 64]), dtype=ms.float32) @@ -469,10 +471,10 @@ def test_matmul_floordiv_broadcast2(): return out context.set_auto_parallel_context(device_num=8, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") strategy1 = ((2, 4), (4, 1)) strategy2 = ((4, 1), (1, 2)) net = GradWrap(NetWithLoss(Net(strategy1, strategy2))) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") x = Tensor(np.ones([64, 32]), dtype=ms.float32) y = Tensor(np.ones([32, 1]), dtype=ms.float32) diff --git a/tests/ut/python/parallel/test_auto_parallel_BN_PReLU.py b/tests/ut/python/parallel/test_auto_parallel_BN_PReLU.py index 087065a9a3b..27b9248e483 100644 --- a/tests/ut/python/parallel/test_auto_parallel_BN_PReLU.py +++ b/tests/ut/python/parallel/test_auto_parallel_BN_PReLU.py @@ -64,10 +64,10 @@ def test_auto_parallel_bn_with_prelu(): size = 8 context.set_auto_parallel_context(device_num=size, global_rank=0) + context.set_auto_parallel_context(parallel_mode="auto_parallel") x = Tensor(np.random.rand(16, 16, 32, 64), dtype=ms.float32) net = GradWrap(NetWithLoss(Net())) - context.set_auto_parallel_context(parallel_mode="auto_parallel") net.set_auto_parallel() _executor.compile(net, x) diff --git a/tests/ut/python/parallel/test_auto_parallel_double_subgraphs.py b/tests/ut/python/parallel/test_auto_parallel_double_subgraphs.py index 5dd259ec841..6003c0cc28a 100644 --- a/tests/ut/python/parallel/test_auto_parallel_double_subgraphs.py +++ b/tests/ut/python/parallel/test_auto_parallel_double_subgraphs.py @@ -106,8 +106,8 @@ def test_double_subgraphs(): cost_model_context.set_cost_model_context(multi_subgraphs=True) context.set_context(save_graphs=True) context.set_auto_parallel_context(device_num=8, global_rank=0) - net = TrainStepWarp(NetWithLoss(Net())) context.set_auto_parallel_context(parallel_mode="auto_parallel") + net = 
TrainStepWarp(NetWithLoss(Net())) net.set_auto_parallel() x = Tensor(np.ones([8, 8, 8, 8]), dtype=ms.float32) diff --git a/tests/ut/python/parallel/test_auto_parallel_tuple_depend.py b/tests/ut/python/parallel/test_auto_parallel_tuple_depend.py index a80ccb550a4..d1c77bdd338 100644 --- a/tests/ut/python/parallel/test_auto_parallel_tuple_depend.py +++ b/tests/ut/python/parallel/test_auto_parallel_tuple_depend.py @@ -68,9 +68,9 @@ def test_virtual_dataset_3_input(): out = self.matmul2(out, b) return out - net = GradWrap(NetWithLoss(Net())) context.set_auto_parallel_context(parallel_mode="auto_parallel") context.set_auto_parallel_context(device_num=8, global_rank=0) + net = GradWrap(NetWithLoss(Net())) net.set_auto_parallel() x = Tensor(np.ones([128, 32]), dtype=ms.float32) y = Tensor(np.ones([32, 64]), dtype=ms.float32) diff --git a/tests/ut/python/parallel/test_auto_parallel_two_bn.py b/tests/ut/python/parallel/test_auto_parallel_two_bn.py index 029d85ab3ce..d771fc794cb 100644 --- a/tests/ut/python/parallel/test_auto_parallel_two_bn.py +++ b/tests/ut/python/parallel/test_auto_parallel_two_bn.py @@ -68,11 +68,11 @@ def test_two_bn(): out = self.block2(out) return out - net = NetWithLoss(Net()) - x = Tensor(np.ones([64, 64]), dtype=ms.float32) context.set_context(save_graphs=True) context.set_auto_parallel_context(device_num=8, global_rank=0) context.set_auto_parallel_context(parallel_mode="auto_parallel") + net = NetWithLoss(Net()) + x = Tensor(np.ones([64, 64]), dtype=ms.float32) net.set_auto_parallel() set_algo_parameters(elementwise_op_strategy_follow=True) reset_op_id() diff --git a/tests/ut/python/parallel/test_batch_parallel.py b/tests/ut/python/parallel/test_batch_parallel.py index 6e505f2e462..ffd060ac751 100644 --- a/tests/ut/python/parallel/test_batch_parallel.py +++ b/tests/ut/python/parallel/test_batch_parallel.py @@ -94,12 +94,12 @@ def test_batch(): return out4 context.set_auto_parallel_context(device_num=8, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") strategy1 = ((8, 1, 1, 1), (1, 1, 1, 1)) strategy2 = ((1, 1, 1, 8), (1, 1, 1, 8)) strategy3 = ((4, 1, 1, 2), (4, 1, 1, 2)) net = GradWrap(NetWithLoss(Net(strategy1, strategy2, strategy3))) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") net.set_auto_parallel() x = Tensor(np.ones([128, 16, 34, 34]), dtype=ms.float32) diff --git a/tests/ut/python/parallel/test_batchnorm_batch_parallel.py b/tests/ut/python/parallel/test_batchnorm_batch_parallel.py index 5935c44441c..21d5003b4c0 100644 --- a/tests/ut/python/parallel/test_batchnorm_batch_parallel.py +++ b/tests/ut/python/parallel/test_batchnorm_batch_parallel.py @@ -118,6 +118,9 @@ def batchnorm_net(num_classes): def test_batchnorm_batch_parallel(): + context.reset_auto_parallel_context() + context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=dev_num) + context.set_context(mode=context.GRAPH_MODE) num_classes = 1001 batch_size = 32 learning_rate = 0.1 @@ -134,9 +137,6 @@ def test_batchnorm_batch_parallel(): loss.softmax_cross_entropy.set_strategy(((dev_num, 1), (dev_num, 1))) opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate, momentum) - context.reset_auto_parallel_context() - context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=dev_num) - context.set_context(mode=context.GRAPH_MODE) model = Model(net, loss, opt) model.train(epoch_size, dataset, dataset_sink_mode=False) diff --git 
a/tests/ut/python/parallel/test_bn_prelu_cell.py b/tests/ut/python/parallel/test_bn_prelu_cell.py index 526ab3203fb..07f5d3906be 100644 --- a/tests/ut/python/parallel/test_bn_prelu_cell.py +++ b/tests/ut/python/parallel/test_bn_prelu_cell.py @@ -198,6 +198,7 @@ def bn_net(): def bn_common(parallel_mode, train_flag, strategy_loss=None): context.set_context(mode=context.GRAPH_MODE) + context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=8) learning_rate = 0.1 momentum = 0.9 epoch_size = 2 @@ -218,7 +219,6 @@ def bn_common(parallel_mode, train_flag, strategy_loss=None): if parallel_mode == ParallelMode.DATA_PARALLEL: context.set_auto_parallel_context(parameter_broadcast=True) - context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=8) model = Model(net, loss, opt) if train_flag: model.train(epoch_size, dataset, dataset_sink_mode=False) diff --git a/tests/ut/python/parallel/test_get_next.py b/tests/ut/python/parallel/test_get_next.py index 7bd84820271..4bb532630af 100644 --- a/tests/ut/python/parallel/test_get_next.py +++ b/tests/ut/python/parallel/test_get_next.py @@ -88,13 +88,13 @@ def test_get_next_semi_auto_parallel(): return x context.set_auto_parallel_context(device_num=4, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") network = Net(strategy1=((1, 4),), strategy2=((4, 1), (1,))) strategy3 = ((4, 1), (), ()) strategy4 = ((4, 1), (4, 1)) net_with_loss = NetWithLoss(network, [ms.float32, ms.int32], [[32, 64], [32]], 2, strategy3=strategy3, strategy4=strategy4) net = GradWrap(net_with_loss) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") compile_net(net) @@ -112,13 +112,13 @@ def test_get_next_semi_auto_parallel1(): return x context.set_auto_parallel_context(device_num=4, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") network = Net(strategy1=((1, 4),), strategy2=((4, 1), (1,))) strategy3 = ((1, 4), (), ()) strategy4 = ((4, 1), (4, 1)) net_with_loss = NetWithLoss(network, [ms.float32, ms.int32], [[32, 64], [32]], 2, strategy3=strategy3, strategy4=strategy4) net = GradWrap(net_with_loss) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") compile_net(net) @@ -136,10 +136,10 @@ def test_get_next_auto_parallel(): return x context.set_auto_parallel_context(device_num=4, global_rank=0) + context.set_auto_parallel_context(parallel_mode="auto_parallel") network = Net() net_with_loss = NetWithLoss(network, [ms.float32, ms.int32], [[32, 64], [32]], 2) net = GradWrap(net_with_loss) - context.set_auto_parallel_context(parallel_mode="auto_parallel") compile_net(net) @@ -153,6 +153,6 @@ def test_only_one_get_next(): return self.get_next() context.set_auto_parallel_context(device_num=4, global_rank=0) - net = Net() context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") + net = Net() compile_net(net) diff --git a/tests/ut/python/parallel/test_initializer_weight_slice.py b/tests/ut/python/parallel/test_initializer_weight_slice.py index 2d78e1d4a02..7065087901d 100644 --- a/tests/ut/python/parallel/test_initializer_weight_slice.py +++ b/tests/ut/python/parallel/test_initializer_weight_slice.py @@ -13,6 +13,7 @@ # limitations under the License. 
import numpy as np +import pytest from mindspore import context import mindspore.nn as nn from mindspore.ops import operations as P @@ -22,20 +23,19 @@ import mindspore.common.api as me from mindspore.common.initializer import initializer from hccl_test.manage.api import Hccl +class Net(nn.Cell): + def __init__(self, strategy1, strategy2, weight): + super().__init__() + self.weight = Parameter(weight, "w1") + self.matmul = P.MatMul(transpose_a=False, transpose_b=True).set_strategy(strategy1) + self.relu = P.ReLU().set_strategy(strategy2) + + def construct(self, x): + out = self.matmul(x, self.weight) + out = self.relu(out) + return out def check_initializer_weight_slice(init_name="Uniform"): - class Net(nn.Cell): - def __init__(self, strategy1, strategy2, weight): - super().__init__() - self.weight = Parameter(weight, "w1") - self.matmul = P.MatMul(transpose_a=False, transpose_b=True).set_strategy(strategy1) - self.relu = P.ReLU().set_strategy(strategy2) - - def construct(self, x): - out = self.matmul(x, self.weight) - out = self.relu(out) - return out - def get_slice(rank): hccl = Hccl() rank_save = hccl.rank_id @@ -77,5 +77,28 @@ def test_initializer_weight_slice(): for init_name in initializers: check_initializer_weight_slice(init_name) +def test_wrong_order_set_parallel_mode_with_initializer(): + weight = initializer("Normal", [64, 32], ms.float32) + strategy1 = ((2, 1), (4, 1)) + strategy2 = ((2, 4),) + net = Net(strategy1, strategy2, weight) + exe = me._executor + x = Tensor(np.ones([32, 32]), dtype=ms.float32) + with pytest.raises(RuntimeError): + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=8, global_rank=0) + net.set_auto_parallel() + exe.compile(net, x, auto_parallel_mode=True, phase='train') + +def test_wrong_order_set_parallel_mode_without_initializer(): + weight = Tensor(np.ones([64, 32]), ms.float32) + strategy1 = ((2, 1), (4, 1)) + strategy2 = ((2, 4),) + net = Net(strategy1, strategy2, weight) + exe = me._executor + x = Tensor(np.ones([32, 32]), dtype=ms.float32) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=8, global_rank=0) + net.set_auto_parallel() + exe.compile(net, x, auto_parallel_mode=True, phase='train') + if __name__ == '__main__': test_initializer_weight_slice() diff --git a/tests/ut/python/parallel/test_linear.py b/tests/ut/python/parallel/test_linear.py index 795a0f604f8..0e4f39b38c9 100644 --- a/tests/ut/python/parallel/test_linear.py +++ b/tests/ut/python/parallel/test_linear.py @@ -58,12 +58,12 @@ def test_linear(): return out context.set_auto_parallel_context(device_num=16, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") strategy0 = ((2, 4), (2, 4)) strategy1 = ((2, 4), (4,)) strategy2 = ((2, 8),) strategy3 = ((16, 1), (16, 1)) net = GradWrap(NetWithLoss(Net(strategy0, strategy1, strategy2), strategy3)) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") net.set_auto_parallel() x = Tensor(np.ones([64, 32]), dtype=ms.float32) diff --git a/tests/ut/python/parallel/test_loss_and_optimizer.py b/tests/ut/python/parallel/test_loss_and_optimizer.py index 91be7682abd..b04f447b2af 100644 --- a/tests/ut/python/parallel/test_loss_and_optimizer.py +++ b/tests/ut/python/parallel/test_loss_and_optimizer.py @@ -54,6 +54,7 @@ def test_momentum(): return out context.set_auto_parallel_context(device_num=4, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") strategy1 = ((2, 1), (2, 1)) strategy2 = ((4, 1),) 
strategy3 = ((4, 1), (4, 1)) @@ -69,7 +70,6 @@ def test_momentum(): net_with_loss = NetWithLoss(net, strategy3) train_net = TrainOneStepCell(net_with_loss, optimizer) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") compile_net(train_net, x, b) @@ -88,6 +88,7 @@ def test_momentum_with_loss_scale(): return out context.set_auto_parallel_context(device_num=4, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") strategy1 = ((2, 1), (2, 1)) strategy2 = ((4, 1),) strategy3 = ((4, 1), (4, 1)) @@ -103,7 +104,6 @@ def test_momentum_with_loss_scale(): net_with_loss = NetWithLoss(net, strategy3) train_net = TrainOneStepCell(net_with_loss, optimizer) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") compile_net(train_net, x, b) @@ -122,6 +122,7 @@ def test_momentum_with_dynamic_lr(): return out context.set_auto_parallel_context(device_num=4, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") strategy1 = ((2, 1), (2, 1)) strategy2 = ((4, 1),) strategy3 = ((4, 1), (4, 1)) @@ -138,7 +139,6 @@ def test_momentum_with_dynamic_lr(): net_with_loss = NetWithLoss(net, strategy3) train_net = TrainOneStepCell(net_with_loss, optimizer) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") compile_net(train_net, x, b) @@ -157,6 +157,7 @@ def test_momentum_with_loss_scale_and_dynamic_lr(): return out context.set_auto_parallel_context(device_num=4, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") strategy1 = ((2, 1), (2, 1)) strategy2 = ((4, 1),) @@ -174,7 +175,6 @@ def test_momentum_with_loss_scale_and_dynamic_lr(): net_with_loss = NetWithLoss(net, strategy3) train_net = TrainOneStepCell(net_with_loss, optimizer) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") compile_net(train_net, x, b) @@ -193,6 +193,7 @@ def test_lars(): return out context.set_auto_parallel_context(device_num=4, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") strategy1 = ((2, 1), (2, 1)) strategy2 = ((4, 1),) strategy3 = ((4, 1), (4, 1)) @@ -209,6 +210,5 @@ def test_lars(): lars_filter=lambda x: 'bn' not in x.name) net_with_loss = NetWithLoss(net, strategy3) train_net = TrainOneStepCell(net_with_loss, optimizer) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") compile_net(train_net, x, b) diff --git a/tests/ut/python/parallel/test_one_hot_net.py b/tests/ut/python/parallel/test_one_hot_net.py index c6757cef6ad..08e23be1375 100644 --- a/tests/ut/python/parallel/test_one_hot_net.py +++ b/tests/ut/python/parallel/test_one_hot_net.py @@ -266,11 +266,11 @@ class BNReshapeDenseBNNet(nn.Cell): def test_bn_reshape_dense_bn_train_loss(): batch_size = 16 context.set_auto_parallel_context(device_num=device_num, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") input_ = Tensor(np.ones([batch_size, 2, 32, 32]).astype(np.float32) * 0.01) label = Tensor(np.ones([batch_size]), dtype=ms.int32) net = GradWrap(NetWithLoss(BNReshapeDenseBNNet())) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") net.set_auto_parallel() _executor.compile(net, input_, label) @@ -279,12 +279,12 @@ def test_bn_reshape_dense_bn_train_loss(): def test_semi_one_hot_net_batch(): batch_size = 16 context.set_auto_parallel_context(device_num=device_num, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") input_ = Tensor(np.ones([batch_size * 1, 
512]).astype(np.float32) * 0.01) label = Tensor(np.ones([batch_size]), dtype=ms.int32) net = SemiAutoOneHotNet(args=Args(), strategy=StrategyBatch()) net = GradWrap(NetWithLoss(net)) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") net.set_auto_parallel() _executor.compile(net, input_, label) @@ -300,10 +300,10 @@ def test_semi_one_hot_net_model(): label = Tensor(np.ones([batch_size]), dtype=ms.int32) dataset = Dataset(predict, label, 2, input_num=2) - net = SemiAutoOneHotNet(args=Args(), strategy=StrategyModel()) - opt = Momentum(net.trainable_params(), learning_rate, momentum) context.reset_auto_parallel_context() context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=16) context.set_context(mode=context.GRAPH_MODE) + net = SemiAutoOneHotNet(args=Args(), strategy=StrategyModel()) + opt = Momentum(net.trainable_params(), learning_rate, momentum) model = Model(net, optimizer=opt) model.train(epoch_size, dataset, dataset_sink_mode=False) diff --git a/tests/ut/python/parallel/test_operator_model_parallel.py b/tests/ut/python/parallel/test_operator_model_parallel.py index 26f804537ba..788521c5258 100644 --- a/tests/ut/python/parallel/test_operator_model_parallel.py +++ b/tests/ut/python/parallel/test_operator_model_parallel.py @@ -353,6 +353,8 @@ def test_resnet_operator_batch_parallel(): context.reset_auto_parallel_context() context.set_auto_parallel_context(device_num=dev_num, global_rank=0) + context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=dev_num) + context.set_context(mode=context.GRAPH_MODE) predict = Tensor(np.ones([batch_size, 3, 224, 224]), dtype=ms.float32) label = Tensor(np.ones([batch_size]), dtype=ms.int32) @@ -363,9 +365,6 @@ def test_resnet_operator_batch_parallel(): loss.softmax_cross_entropy.set_strategy(((dev_num, 1), (dev_num, 1))) opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate, momentum) - context.reset_auto_parallel_context() - context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=dev_num) - context.set_context(mode=context.GRAPH_MODE) model = Model(net, loss, opt) model.train(epoch_size, dataset, dataset_sink_mode=False) @@ -379,6 +378,8 @@ def test_resnet_model_parallel(): context.reset_auto_parallel_context() context.set_auto_parallel_context(device_num=dev_num, global_rank=0) + context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=dev_num) + context.set_context(mode=context.GRAPH_MODE) predict = Tensor(np.ones([batch_size, 64, 112, 112]), dtype=ms.float32) label = Tensor(np.ones([batch_size]), dtype=ms.int32) @@ -389,9 +390,6 @@ def test_resnet_model_parallel(): loss.softmax_cross_entropy.set_strategy(((dev_num, 1), (dev_num, 1))) opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate, momentum) - context.reset_auto_parallel_context() - context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=dev_num) - context.set_context(mode=context.GRAPH_MODE) model = Model(net, loss, opt) model.train(epoch_size, dataset, dataset_sink_mode=False) diff --git a/tests/ut/python/parallel/test_optimizer.py b/tests/ut/python/parallel/test_optimizer.py index 10e8e261542..e848f4ed9b0 100644 --- a/tests/ut/python/parallel/test_optimizer.py +++ b/tests/ut/python/parallel/test_optimizer.py @@ -45,6 +45,8 @@ class Net(nn.Cell): def test_dense_gen_graph(): context.set_context(mode=context.GRAPH_MODE) + 
context.reset_auto_parallel_context() + context.set_auto_parallel_context(parallel_mode=ParallelMode.HYBRID_PARALLEL, mirror_mean=True, device_num=8) init() network = Net(512, 128) @@ -53,8 +55,6 @@ def test_dense_gen_graph(): learning_rate=0.1, momentum=0.9) network = WithLossCell(network, loss_fn) - context.reset_auto_parallel_context() - context.set_auto_parallel_context(parallel_mode=ParallelMode.HYBRID_PARALLEL, mirror_mean=True, device_num=8) network = TrainOneStepCell(network, optimizer) predict = Tensor(np.ones([64, 512]).astype(np.float32) * 0.01) diff --git a/tests/ut/python/parallel/test_optimizer_clone_weight.py b/tests/ut/python/parallel/test_optimizer_clone_weight.py index baf5e748618..bdafcba10a9 100644 --- a/tests/ut/python/parallel/test_optimizer_clone_weight.py +++ b/tests/ut/python/parallel/test_optimizer_clone_weight.py @@ -54,6 +54,7 @@ def test_optimizer_clone_weight(): return out context.set_auto_parallel_context(device_num=4, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") strategy1 = ((2, 1), (2, 1)) strategy2 = ((4, 1),) @@ -70,7 +71,6 @@ def test_optimizer_clone_weight(): net_with_loss = NetWithLoss(net, strategy3) train_net = TrainOneStepCell(net_with_loss, optimizer) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") compile_net(train_net, x, b) @@ -89,6 +89,7 @@ def test_optimizer_clone_weight2(): return out context.set_auto_parallel_context(device_num=4, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") strategy1 = ((2, 1), (2, 1)) strategy2 = ((4, 1),) @@ -105,6 +106,5 @@ def test_optimizer_clone_weight2(): net_with_loss = NetWithLoss(net, strategy3) train_net = TrainOneStepCell(net_with_loss, optimizer) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") compile_net(train_net, x, b) diff --git a/tests/ut/python/parallel/test_reshape.py b/tests/ut/python/parallel/test_reshape.py index 5fe48da1339..65bfed83694 100644 --- a/tests/ut/python/parallel/test_reshape.py +++ b/tests/ut/python/parallel/test_reshape.py @@ -320,10 +320,10 @@ def reshape_net2(backbone): batch_size = 16 device_num = 16 context.set_auto_parallel_context(device_num=device_num, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") input_ = Tensor(np.ones([batch_size * device_num, 512, 7, 7]).astype(np.float32) * 0.01) net = GradWrap(NetWithLoss(backbone)) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") compile_net(net, input_) @@ -530,10 +530,10 @@ def test_bn_reshape_dense_bn_train(): batch_size = 16 device_num = 16 context.set_auto_parallel_context(device_num=device_num, global_rank=0) + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") input_ = Tensor(np.ones([batch_size, 2, 32, 32]).astype(np.float32) * 0.01) net = GradWrap(NetWithLoss(BNReshapeDenseBNNet())) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") compile_net(net, input_) diff --git a/tests/ut/python/parallel/test_using_seed_for_initializer.py b/tests/ut/python/parallel/test_using_seed_for_initializer.py index 9e601efccdd..a8426ebf587 100644 --- a/tests/ut/python/parallel/test_using_seed_for_initializer.py +++ b/tests/ut/python/parallel/test_using_seed_for_initializer.py @@ -18,6 +18,7 @@ from numpy import allclose import mindspore.common.initializer as init import mindspore.nn as nn from mindspore import Parameter +from mindspore.parallel._utils import _set_has_initializer parameter_shape = [16, 4] @@ -46,6 +47,7 
@@ def test_using_same_seed_for_initializer(): np.random.seed(0) net2 = ParameterNet() net2.init_parameters_data() + _set_has_initializer(False) for key in net1.parameters_dict(): if key not in net2.parameters_dict(): assert False @@ -60,6 +62,7 @@ def test_using_diffserent_seed_for_initializer(): np.random.seed(1) net2 = ParameterNet() net2.init_parameters_data() + _set_has_initializer(False) for key in net1.parameters_dict(): if key not in net2.parameters_dict(): assert False diff --git a/tests/ut/python/parallel/test_virtual_dataset_3_input.py b/tests/ut/python/parallel/test_virtual_dataset_3_input.py index e7ea717dc92..206570cc243 100644 --- a/tests/ut/python/parallel/test_virtual_dataset_3_input.py +++ b/tests/ut/python/parallel/test_virtual_dataset_3_input.py @@ -62,13 +62,13 @@ def test_virtual_dataset_3_input(): out = self.matmul2(out, b) return out + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") + context.set_auto_parallel_context(device_num=8, global_rank=0) strategy0 = ((2, 1), (2, 1), (2, 1)) strategy1 = ((2, 2), (2, 2)) strategy2 = ((2, 2), (2, 2)) strategy3 = ((2, 4),) net = GradWrap(NetWithLoss(Net(strategy0, strategy1, strategy2, strategy3))) - context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") - context.set_auto_parallel_context(device_num=8, global_rank=0) x = Tensor(np.ones([128, 32]), dtype=ms.float32) y = Tensor(np.ones([32, 64]), dtype=ms.float32) b = Tensor(np.ones([64, 2048]), dtype=ms.float32) @@ -89,10 +89,10 @@ def test_virtualdataset_cell_3_inputs(): out = self.matmul2(out, b) return out - net = GradWrap(VirtualDatasetCellTriple(NetWithLoss(Net(None, None, None)))) context.set_context(save_graphs=True) context.set_auto_parallel_context(parallel_mode="auto_parallel") context.set_auto_parallel_context(device_num=8, global_rank=0) + net = GradWrap(VirtualDatasetCellTriple(NetWithLoss(Net(None, None, None)))) x = Tensor(np.ones([128, 32]), dtype=ms.float32) y = Tensor(np.ones([32, 64]), dtype=ms.float32) b = Tensor(np.ones([64, 2048]), dtype=ms.float32) diff --git a/tests/ut/python/train/test_amp.py b/tests/ut/python/train/test_amp.py index 6bb4ec54642..6406a4b80d3 100644 --- a/tests/ut/python/train/test_amp.py +++ b/tests/ut/python/train/test_amp.py @@ -146,6 +146,10 @@ def test_compile_model_train_O2(): def test_compile_model_train_O2_parallel(): dataset_types = (np.float32, np.float32) dataset_shapes = ((16, 16), (16, 16)) + context.set_auto_parallel_context( + global_rank=0, device_num=8, + mirror_mean=True, parameter_broadcast=True, + parallel_mode=ParallelMode.DATA_PARALLEL) dataset = MindDataSet(dataset_types, dataset_shapes) @@ -153,10 +157,6 @@ def test_compile_model_train_O2_parallel(): loss = nn.MSELoss() optimizer = nn.Momentum(net.trainable_params(), 0.1, 0.9, 0.00004, 1024.0) - context.set_auto_parallel_context( - global_rank=0, device_num=8, - mirror_mean=True, parameter_broadcast=True, - parallel_mode=ParallelMode.DATA_PARALLEL) init() model = Model(net, loss_fn=loss, optimizer=optimizer, metrics={"acc"}, amp_level="O2")
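
Reviewer note: a minimal sketch of the ordering this patch enforces, assuming a
MindSpore build that includes it. It mirrors the new
test_wrong_order_set_parallel_mode_with_initializer UT above; the shapes and the
"Normal" init are illustrative only.

    import mindspore as ms
    from mindspore import context
    from mindspore.common.initializer import initializer

    # Correct order: fix the parallel mode first, then create Initializer-backed
    # weights, so their slice layout is derived under the final mode.
    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=8)
    weight = initializer("Normal", [64, 32], ms.float32)

    # Wrong order: an Initializer already exists, so changing the mode raises
    # RuntimeError("Parallel mode must be set or changed before any Initializer is created.")
    try:
        context.set_auto_parallel_context(parallel_mode="auto_parallel")
    except RuntimeError as err:
        print(err)

Because set_parallel_mode clears the flag when it raises, and _Executor.compile
resets it before each compilation, a retry after the error succeeds; the updated
unit tests call _set_has_initializer(False) between cases for the same reason.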