diff --git a/mindspore/communication/management.py b/mindspore/communication/management.py index 3fb4e7b9477..be07b538610 100755 --- a/mindspore/communication/management.py +++ b/mindspore/communication/management.py @@ -14,6 +14,7 @@ # ============================================================================ """Communication management API""" import os +from mindspore import context from mindspore.parallel._auto_parallel_context import auto_parallel_context from ._comm_helper import Backend, _get_rank_helper, _get_size_helper, \ _get_world_rank_from_group_rank_helper, _get_group_rank_from_world_rank_helper, \ @@ -45,7 +46,7 @@ class GlobalComm: WORLD_COMM_GROUP = DEFAULT_WORLD_COMM_GROUP -def init(backend_name="hccl"): +def init(backend_name=None): """ Init distributed backend, e.g., hccl/nccl, it is required before communication service can be used. @@ -57,11 +58,20 @@ backend_name (str): Backend. Raises: - TypeError: If backend name is not a string. + TypeError: If backend_name is not a string. + RuntimeError: If device target is invalid. RuntimeError: If backend is invalid or distributed init fails. 
""" if MS_ROLE in ("MS_PSERVER", "MS_SCHED"): return + if backend_name is None: + device_target = context.get_context("device_target") + if device_target == "Ascend": + backend_name = "hccl" + elif device_target == "GPU": + backend_name = "nccl" + else: + raise RuntimeError("Device target {} is not supported.".format(device_target)) if not isinstance(backend_name, str): raise TypeError("Backend name must be a string, but got {}".format(type(backend_name))) diff --git a/mindspore/ops/operations/comm_ops.py b/mindspore/ops/operations/comm_ops.py index 2b84741fea6..6aff5037796 100644 --- a/mindspore/ops/operations/comm_ops.py +++ b/mindspore/ops/operations/comm_ops.py @@ -73,7 +73,7 @@ class AllReduce(PrimitiveWithInfer): >>> import mindspore.nn as nn >>> import mindspore.ops.operations as P >>> - >>> init('nccl') + >>> init() >>> class Net(nn.Cell): >>> def __init__(self): >>> super(Net, self).__init__() @@ -136,7 +136,7 @@ class AllGather(PrimitiveWithInfer): >>> from mindspore.communication import init >>> from mindspore import Tensor >>> - >>> init('nccl') + >>> init() >>> class Net(nn.Cell): >>> def __init__(self): >>> super(Net, self).__init__() @@ -246,7 +246,7 @@ class ReduceScatter(PrimitiveWithInfer): >>> import mindspore.nn as nn >>> import mindspore.ops.operations as P >>> - >>> init('nccl') + >>> init() >>> class Net(nn.Cell): >>> def __init__(self): >>> super(Net, self).__init__() @@ -360,7 +360,7 @@ class Broadcast(PrimitiveWithInfer): >>> import mindspore.nn as nn >>> import mindspore.ops.operations as P >>> - >>> init('nccl') + >>> init() >>> class Net(nn.Cell): >>> def __init__(self): >>> super(Net, self).__init__() diff --git a/model_zoo/official/cv/googlenet/train.py b/model_zoo/official/cv/googlenet/train.py index 5181f9c484a..65300a673d2 100644 --- a/model_zoo/official/cv/googlenet/train.py +++ b/model_zoo/official/cv/googlenet/train.py @@ -81,7 +81,7 @@ if __name__ == '__main__': mirror_mean=True) init() elif device_target == "GPU": - 
init("nccl") + init() if device_num > 1: context.reset_auto_parallel_context() diff --git a/model_zoo/official/cv/inceptionv3/train.py b/model_zoo/official/cv/inceptionv3/train.py index f2d2256eef5..d07d3855fb3 100644 --- a/model_zoo/official/cv/inceptionv3/train.py +++ b/model_zoo/official/cv/inceptionv3/train.py @@ -57,10 +57,7 @@ if __name__ == '__main__': cfg = config_ascend if args_opt.platform == 'Ascend' else config_gpu # init distributed if args_opt.is_distributed: - if args_opt.platform == "Ascend": - init() - else: - init("nccl") + init() cfg.rank = get_rank() cfg.group_size = get_group_size() parallel_mode = ParallelMode.DATA_PARALLEL diff --git a/model_zoo/official/cv/mobilenetv2/train.py b/model_zoo/official/cv/mobilenetv2/train.py index 4fb800d6ddd..a282742381a 100644 --- a/model_zoo/official/cv/mobilenetv2/train.py +++ b/model_zoo/official/cv/mobilenetv2/train.py @@ -64,7 +64,7 @@ elif args_opt.device_target == "GPU": context.set_context(mode=context.GRAPH_MODE, device_target="GPU", save_graphs=False) - init("nccl") + init() context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True) diff --git a/model_zoo/official/cv/mobilenetv2_quant/train.py b/model_zoo/official/cv/mobilenetv2_quant/train.py index ebe60996cf9..eb59d37c367 100644 --- a/model_zoo/official/cv/mobilenetv2_quant/train.py +++ b/model_zoo/official/cv/mobilenetv2_quant/train.py @@ -57,7 +57,7 @@ if args_opt.device_target == "Ascend": device_target="Ascend", device_id=device_id, save_graphs=False) elif args_opt.device_target == "GPU": - init("nccl") + init() context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True) diff --git a/model_zoo/official/cv/mobilenetv3/train.py b/model_zoo/official/cv/mobilenetv3/train.py index 60f3723244e..ea1f9ec27ec 100644 --- a/model_zoo/official/cv/mobilenetv3/train.py +++ b/model_zoo/official/cv/mobilenetv3/train.py @@ -54,7 +54,7 
@@ if args_opt.device_target == "GPU": context.set_context(mode=context.GRAPH_MODE, device_target="GPU", save_graphs=False) - init("nccl") + init() context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True) diff --git a/model_zoo/official/cv/resnet/src/dataset.py b/model_zoo/official/cv/resnet/src/dataset.py index 9d39b5ab77d..86ffecb1ec1 100755 --- a/model_zoo/official/cv/resnet/src/dataset.py +++ b/model_zoo/official/cv/resnet/src/dataset.py @@ -38,7 +38,7 @@ def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target= if target == "Ascend": device_num, rank_id = _get_rank_info() else: - init("nccl") + init() rank_id = get_rank() device_num = get_group_size() @@ -93,7 +93,7 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target= if target == "Ascend": device_num, rank_id = _get_rank_info() else: - init("nccl") + init() rank_id = get_rank() device_num = get_group_size() diff --git a/model_zoo/official/cv/resnet/train.py b/model_zoo/official/cv/resnet/train.py index 81c7c08a625..8426c6b6458 100755 --- a/model_zoo/official/cv/resnet/train.py +++ b/model_zoo/official/cv/resnet/train.py @@ -85,7 +85,7 @@ if __name__ == '__main__': init() # GPU target else: - init("nccl") + init() context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True) if args_opt.net == "resnet50": diff --git a/model_zoo/official/cv/resnet50_quant/src/dataset.py b/model_zoo/official/cv/resnet50_quant/src/dataset.py index 3c35adeaaa0..7841bc403b7 100755 --- a/model_zoo/official/cv/resnet50_quant/src/dataset.py +++ b/model_zoo/official/cv/resnet50_quant/src/dataset.py @@ -46,7 +46,7 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target=" device_num = int(os.getenv("RANK_SIZE")) rank_id = int(os.getenv("RANK_ID")) else: - init("nccl") + init() rank_id = get_rank() device_num = get_group_size() @@ 
-114,7 +114,7 @@ def create_dataset_py(dataset_path, do_train, repeat_num=1, batch_size=32, targe device_num = int(os.getenv("RANK_SIZE")) rank_id = int(os.getenv("RANK_ID")) else: - init("nccl") + init() rank_id = get_rank() device_num = get_group_size() diff --git a/model_zoo/official/cv/resnet_thor/src/dataset.py b/model_zoo/official/cv/resnet_thor/src/dataset.py index fc6dc1bac98..3f3b006fb5c 100644 --- a/model_zoo/official/cv/resnet_thor/src/dataset.py +++ b/model_zoo/official/cv/resnet_thor/src/dataset.py @@ -40,7 +40,7 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target=" if target == "Ascend": device_num, rank_id = _get_rank_info() else: - init("nccl") + init() rank_id = get_rank() device_num = get_group_size() diff --git a/model_zoo/official/cv/resnet_thor/train.py b/model_zoo/official/cv/resnet_thor/train.py index d7c667dffae..cef6ac0a3c4 100644 --- a/model_zoo/official/cv/resnet_thor/train.py +++ b/model_zoo/official/cv/resnet_thor/train.py @@ -106,7 +106,7 @@ if __name__ == '__main__': init() # GPU target else: - init("nccl") + init() context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True) ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(get_rank()) + "/" diff --git a/model_zoo/official/cv/resnext50/eval.py b/model_zoo/official/cv/resnext50/eval.py index 4dc2aa485ab..06b4acfe05a 100644 --- a/model_zoo/official/cv/resnext50/eval.py +++ b/model_zoo/official/cv/resnext50/eval.py @@ -112,10 +112,7 @@ def test(cloud_args=None): # init distributed if args.is_distributed: - if args.platform == "Ascend": - init() - elif args.platform == "GPU": - init("nccl") + init() args.rank = get_rank() args.group_size = get_group_size() parallel_mode = ParallelMode.DATA_PARALLEL diff --git a/model_zoo/official/cv/resnext50/train.py b/model_zoo/official/cv/resnext50/train.py index d2cb72d5d21..6b31cbd8f1c 100644 --- a/model_zoo/official/cv/resnext50/train.py +++ 
b/model_zoo/official/cv/resnext50/train.py @@ -172,10 +172,7 @@ def train(cloud_args=None): # init distributed if args.is_distributed: - if args.platform == "Ascend": - init() - else: - init("nccl") + init() args.rank = get_rank() args.group_size = get_group_size() parallel_mode = ParallelMode.DATA_PARALLEL diff --git a/model_zoo/official/cv/vgg16/train.py b/model_zoo/official/cv/vgg16/train.py index ae2f934e1e7..d0f04271b10 100644 --- a/model_zoo/official/cv/vgg16/train.py +++ b/model_zoo/official/cv/vgg16/train.py @@ -135,7 +135,7 @@ if __name__ == '__main__': init() context.set_context(device_id=args.device_id) elif args.device_target == "GPU": - init("nccl") + init() args.rank = get_rank() args.group_size = get_group_size() diff --git a/model_zoo/official/cv/warpctc/train.py b/model_zoo/official/cv/warpctc/train.py index 380308653fd..2746b83fe7f 100755 --- a/model_zoo/official/cv/warpctc/train.py +++ b/model_zoo/official/cv/warpctc/train.py @@ -60,7 +60,7 @@ if __name__ == '__main__': device_num = int(os.environ.get("RANK_SIZE")) rank = int(os.environ.get("RANK_ID")) else: - init('nccl') + init() lr_scale = 0.5 device_num = get_group_size() rank = get_rank() diff --git a/model_zoo/official/nlp/bert/run_pretrain.py b/model_zoo/official/nlp/bert/run_pretrain.py index 73b1021003e..2a12b7913da 100644 --- a/model_zoo/official/nlp/bert/run_pretrain.py +++ b/model_zoo/official/nlp/bert/run_pretrain.py @@ -70,11 +70,11 @@ def run_pretrain(): ckpt_save_dir = args_opt.save_checkpoint_path if args_opt.distribute == "true": if args_opt.device_target == 'Ascend': - D.init('hccl') + D.init() device_num = args_opt.device_num rank = args_opt.device_id % device_num else: - D.init('nccl') + D.init() device_num = D.get_group_size() rank = D.get_rank() ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(rank) + '/' diff --git a/model_zoo/official/nlp/bert_thor/run_pretrain.py b/model_zoo/official/nlp/bert_thor/run_pretrain.py index 5c5e1c282f7..0d9e4a4a204 100644 --- 
a/model_zoo/official/nlp/bert_thor/run_pretrain.py +++ b/model_zoo/official/nlp/bert_thor/run_pretrain.py @@ -73,11 +73,11 @@ def run_pretrain(): ckpt_save_dir = args_opt.save_checkpoint_path if args_opt.distribute == "true": if args_opt.device_target == 'Ascend': - D.init('hccl') + D.init() device_num = args_opt.device_num rank = args_opt.device_id % device_num else: - D.init('nccl') + D.init() device_num = D.get_group_size() rank = D.get_rank() ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(rank) + '/' diff --git a/model_zoo/official/nlp/mass/train.py b/model_zoo/official/nlp/mass/train.py index 81854bcb8c2..97a87b0ff35 100644 --- a/model_zoo/official/nlp/mass/train.py +++ b/model_zoo/official/nlp/mass/train.py @@ -227,10 +227,7 @@ def _build_training_pipeline(config: TransformerConfig, def _setup_parallel_env(platform): context.reset_auto_parallel_context() - if platform == "GPU": - MultiAscend.init("nccl") - else: - MultiAscend.init() + MultiAscend.init() context.set_auto_parallel_context( parallel_mode=ParallelMode.DATA_PARALLEL, device_num=MultiAscend.get_group_size(), diff --git a/model_zoo/official/nlp/tinybert/run_general_distill.py b/model_zoo/official/nlp/tinybert/run_general_distill.py index 8fdc86b8bcd..f1d7620ae0c 100644 --- a/model_zoo/official/nlp/tinybert/run_general_distill.py +++ b/model_zoo/official/nlp/tinybert/run_general_distill.py @@ -67,11 +67,11 @@ def run_general_distill(): if args_opt.distribute == "true": if args_opt.device_target == 'Ascend': - D.init('hccl') + D.init() device_num = args_opt.device_num rank = args_opt.device_id % device_num else: - D.init('nccl') + D.init() device_num = D.get_group_size() rank = D.get_rank() save_ckpt_dir = save_ckpt_dir + '_ckpt_' + str(rank) diff --git a/model_zoo/official/recommend/deepfm/train.py b/model_zoo/official/recommend/deepfm/train.py index 95810c3a7e0..3dd8373c835 100644 --- a/model_zoo/official/recommend/deepfm/train.py +++ b/model_zoo/official/recommend/deepfm/train.py @@ 
-59,7 +59,7 @@ if __name__ == '__main__': init() rank_id = int(os.environ.get('RANK_ID')) elif args_opt.device_target == "GPU": - init("nccl") + init() context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target) context.reset_auto_parallel_context() context.set_auto_parallel_context(device_num=get_group_size(), diff --git a/model_zoo/official/recommend/wide_and_deep/train_and_eval_auto_parallel.py b/model_zoo/official/recommend/wide_and_deep/train_and_eval_auto_parallel.py index 4af6366e42d..38904a52c99 100644 --- a/model_zoo/official/recommend/wide_and_deep/train_and_eval_auto_parallel.py +++ b/model_zoo/official/recommend/wide_and_deep/train_and_eval_auto_parallel.py @@ -128,10 +128,7 @@ if __name__ == "__main__": context.set_context(variable_memory_max_size="24GB") context.set_context(enable_sparse=True) set_multi_subgraphs() - if wide_deep_config.device_target == "Ascend": - init("hccl") - elif wide_deep_config.device_target == "GPU": - init("nccl") + init() if wide_deep_config.host_device_mix == 1: context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, mirror_mean=True) else: diff --git a/model_zoo/official/recommend/wide_and_deep/train_and_eval_distribute.py b/model_zoo/official/recommend/wide_and_deep/train_and_eval_distribute.py index 9e70cd1d681..96616171c99 100644 --- a/model_zoo/official/recommend/wide_and_deep/train_and_eval_distribute.py +++ b/model_zoo/official/recommend/wide_and_deep/train_and_eval_distribute.py @@ -122,10 +122,7 @@ if __name__ == "__main__": wide_deep_config.argparse_init() context.set_context(mode=context.GRAPH_MODE, device_target=wide_deep_config.device_target, save_graphs=True) - if wide_deep_config.device_target == "Ascend": - init("hccl") - elif wide_deep_config.device_target == "GPU": - init("nccl") + init() context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True, device_num=get_group_size()) diff --git 
a/model_zoo/official/recommend/wide_and_deep/train_and_eval_parameter_server.py b/model_zoo/official/recommend/wide_and_deep/train_and_eval_parameter_server.py index bab19acdc46..2ddce02725f 100644 --- a/model_zoo/official/recommend/wide_and_deep/train_and_eval_parameter_server.py +++ b/model_zoo/official/recommend/wide_and_deep/train_and_eval_parameter_server.py @@ -119,10 +119,7 @@ if __name__ == "__main__": wide_deep_config.argparse_init() context.set_context(mode=context.GRAPH_MODE, device_target=wide_deep_config.device_target) - if wide_deep_config.device_target == "Ascend": - init("hccl") - elif wide_deep_config.device_target == "GPU": - init("nccl") + init() context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True, device_num=get_group_size()) diff --git a/tests/st/nccl/test_nccl_all_gather_op.py b/tests/st/nccl/test_nccl_all_gather_op.py index 9d7ad205c9e..afaf6167215 100644 --- a/tests/st/nccl/test_nccl_all_gather_op.py +++ b/tests/st/nccl/test_nccl_all_gather_op.py @@ -24,7 +24,7 @@ from mindspore.ops import operations as P context.set_context(mode=context.GRAPH_MODE, device_target='GPU') -init('nccl') +init() rank = get_rank() size = get_group_size() x = np.ones([1, 1, 3, 3]).astype(np.float32) * 0.01 * (rank + 1) diff --git a/tests/st/nccl/test_nccl_all_reduce_op.py b/tests/st/nccl/test_nccl_all_reduce_op.py index 0f00c6aef7e..c40bc48b9fa 100644 --- a/tests/st/nccl/test_nccl_all_reduce_op.py +++ b/tests/st/nccl/test_nccl_all_reduce_op.py @@ -24,7 +24,7 @@ from mindspore.ops import operations as P context.set_context(mode=context.GRAPH_MODE, device_target='GPU') -init('nccl') +init() rank = get_rank() size = get_group_size() x = np.ones([3, 1, 3, 3]).astype(np.float32) * 0.01 * (rank + 1) diff --git a/tests/st/nccl/test_nccl_broadcast_op.py b/tests/st/nccl/test_nccl_broadcast_op.py index 4541bf6e6b7..fe3955dc396 100644 --- a/tests/st/nccl/test_nccl_broadcast_op.py +++ b/tests/st/nccl/test_nccl_broadcast_op.py @@ -24,7 
+24,7 @@ from mindspore.ops import operations as P context.set_context(mode=context.GRAPH_MODE, device_target='GPU') -init('nccl') +init() rank = get_rank() size = get_group_size() x = np.ones([3, 1, 3, 3]).astype(np.float32) * 0.01 * (rank + 1) diff --git a/tests/st/nccl/test_nccl_lenet.py b/tests/st/nccl/test_nccl_lenet.py index 37fd6363c06..632e8ec575b 100644 --- a/tests/st/nccl/test_nccl_lenet.py +++ b/tests/st/nccl/test_nccl_lenet.py @@ -25,7 +25,7 @@ from mindspore.nn.optim import Momentum from mindspore.ops import operations as P context.set_context(mode=context.GRAPH_MODE, device_target="GPU") -init('nccl') +init() epoch = 5 total = 5000 diff --git a/tests/st/nccl/test_nccl_reduce_scatter_op.py b/tests/st/nccl/test_nccl_reduce_scatter_op.py index 59ede9cd76e..14b4c574b5e 100644 --- a/tests/st/nccl/test_nccl_reduce_scatter_op.py +++ b/tests/st/nccl/test_nccl_reduce_scatter_op.py @@ -24,7 +24,7 @@ from mindspore.ops import operations as P context.set_context(mode=context.GRAPH_MODE, device_target='GPU') -init('nccl') +init() rank = get_rank() size = get_group_size() x = np.ones([size, 1, 3, 3]).astype(np.float32) * 0.01 * (rank + 1) diff --git a/tests/st/ps/multi_full_ps/test_multi_full_ps.py b/tests/st/ps/multi_full_ps/test_multi_full_ps.py index 30bf6176923..181d2512626 100644 --- a/tests/st/ps/multi_full_ps/test_multi_full_ps.py +++ b/tests/st/ps/multi_full_ps/test_multi_full_ps.py @@ -30,7 +30,7 @@ args, _ = parser.parse_known_args() device_target = args.device_target context.set_context(mode=context.GRAPH_MODE, device_target=device_target) if device_target == "GPU": - init('nccl') + init() def conv(in_channels, out_channels, kernel_size, stride=1, padding=0): diff --git a/tests/ut/python/train/test_dataset_helper.py b/tests/ut/python/train/test_dataset_helper.py index 6540adfe12f..916d87caad8 100644 --- a/tests/ut/python/train/test_dataset_helper.py +++ b/tests/ut/python/train/test_dataset_helper.py @@ -75,7 +75,7 @@ def test_dataset_iter_normal(): 
@pytest.mark.skipif('not context.get_context("enable_ge")') def test_dataset_iter_ge(): - init() + init("hccl") dataset = get_dataset(32) dataset_helper = DatasetHelper(dataset, dataset_sink_mode=True, sink_size=10) count = 0 @@ -87,7 +87,7 @@ def test_dataset_iter_ge(): @pytest.mark.skipif('context.get_context("enable_ge")') def test_dataset_iter_ms_loop_sink(): - init() + init("hccl") context.set_context(enable_loop_sink=True) dataset = get_dataset(32) dataset_helper = DatasetHelper(dataset, dataset_sink_mode=True, sink_size=10) @@ -101,7 +101,7 @@ def test_dataset_iter_ms_loop_sink(): @pytest.mark.skipif('context.get_context("enable_ge")') def test_dataset_iter_ms(): - init() + init("hccl") context.set_context(enable_loop_sink=False) dataset = get_dataset(32) DatasetHelper(dataset, dataset_sink_mode=True, sink_size=10)