add API for enable_alltoall

b00518648 2022-03-16 15:31:37 +08:00
parent e55ce35b0f
commit 9f177a1cf2
2 changed files with 10 additions and 3 deletions


@@ -180,7 +180,7 @@ MindSpore context is used to configure the current execution environment, including the execution mode, execution
parallel_mode strategy_ckpt_load_file
all_reduce_fusion_config strategy_ckpt_save_file
enable_parallel_optimizer dataset_strategy
-\ pipeline_stages
+enable_alltoall pipeline_stages
\ grad_accumulation_step
========================= =========================
@@ -211,6 +211,7 @@ MindSpore context is used to configure the current execution environment, including the execution mode, execution
- **full_batch** (bool) - If the whole batch of the dataset is loaded in auto_parallel mode, this parameter should be set to True. Default: False. Using this interface is no longer recommended; use dataset_strategy to replace it.
- **dataset_strategy** (Union[str, tuple]) - Indicates the dataset sharding strategy. Default: data_parallel. dataset_strategy="data_parallel" is equivalent to full_batch=False, and dataset_strategy="full_batch" is equivalent to full_batch=True. For a dataset loaded into the network with a model parallel strategy, such as ds_stra ((1, 8), (1, 8)), set_auto_parallel_context(dataset_strategy=ds_stra) must be used.
- **enable_parallel_optimizer** (bool) - A feature under development that shards the weight update computation for data parallel training to save time and memory. Currently, auto and semi-auto parallel modes support all optimizers on both Ascend and GPU. Data parallel mode only supports `Lamb` and `AdamWeightDecay` on Ascend. Default: False.
+- **enable_alltoall** (bool) - A switch that allows the `AllToAll` communication operator to be generated during communication. If its value is False, a combination of communication operators such as `AllGather`, `Split` and `Concat` is used instead of `AllToAll` (a usage sketch follows this list). Default: False.
- **all_reduce_fusion_config** (list) - Set the AllReduce fusion strategy by parameter indices. Only ReduceOp.SUM and HCCL_WORLD_GROUP/NCCL_WORLD_GROUP are supported. There is no default value; if it is not set, operator fusion is disabled.
- **pipeline_stages** (int) - Set the stage information for pipeline parallelism. This indicates how the devices are individually distributed over the pipeline. All devices will be divided into pipeline_stages stages. Currently this can only be used when semi_auto_parallel mode is enabled. Default: 1.
- **grad_accumulation_step** (int) - Set the gradient accumulation steps in auto and semi-auto parallel mode. Its value should be a positive integer. Default: 1.
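
For reference, a minimal usage sketch of the enable_alltoall option documented above (assuming a working MindSpore install; the parallel mode chosen here is only a placeholder):

    from mindspore import context

    # Allow AllToAll operators to be generated during communication.
    context.set_auto_parallel_context(parallel_mode=context.ParallelMode.SEMI_AUTO_PARALLEL,
                                      enable_alltoall=True)
    # With the default enable_alltoall=False, combinations of
    # AllGather/Split/Concat are generated instead.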
@@ -260,6 +261,7 @@ MindSpore context is used to configure the current execution environment, including the execution mode, execution
- strategy_ckpt_save_file: ''.
- full_batch: False.
- enable_parallel_optimizer: False.
+- enable_alltoall: False.
- pipeline_stages: 1.
.. py:class:: mindspore.context.ParallelMode


@@ -398,7 +398,7 @@ def _context():
@args_type_check(device_num=int, global_rank=int, gradients_mean=bool, gradient_fp32_sync=bool, parallel_mode=str,
auto_parallel_search_mode=str, search_mode=str, parameter_broadcast=bool, strategy_ckpt_load_file=str,
-strategy_ckpt_save_file=str, full_batch=bool, enable_parallel_optimizer=bool,
+strategy_ckpt_save_file=str, full_batch=bool, enable_parallel_optimizer=bool, enable_alltoall=bool,
all_reduce_fusion_config=list, pipeline_stages=int, grad_accumulation_step=int,
parallel_optimizer_config=dict, comm_fusion=dict)
def set_auto_parallel_context(**kwargs):
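
The hunk above registers enable_alltoall=bool with the args_type_check decorator, so the new keyword is type-checked like the existing ones. As a rough illustration only (a simplified stand-in with a hypothetical set_flags helper, not MindSpore's actual implementation), such a decorator can be sketched as:

    import functools

    def args_type_check(**type_spec):
        """Reject keyword arguments whose values do not match the declared types."""
        def decorator(func):
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                for name, value in kwargs.items():
                    expected = type_spec.get(name)
                    if expected is not None and not isinstance(value, expected):
                        raise TypeError(f"For '{func.__name__}', the argument '{name}' "
                                        f"must be {expected.__name__}, but got "
                                        f"{type(value).__name__}.")
                return func(*args, **kwargs)
            return wrapper
        return decorator

    @args_type_check(enable_alltoall=bool)
    def set_flags(**kwargs):            # hypothetical helper for the illustration
        return kwargs

    set_flags(enable_alltoall=True)     # accepted
    # set_flags(enable_alltoall="yes")  # would be rejected with a TypeError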
@@ -427,7 +427,7 @@ def set_auto_parallel_context(**kwargs):
all_reduce_fusion_config strategy_ckpt_save_file
enable_parallel_optimizer dataset_strategy
parallel_optimizer_config pipeline_stages
-\ grad_accumulation_step
+enable_alltoall grad_accumulation_step
\ auto_parallel_search_mode
\ comm_fusion
=========================== ===========================
@@ -481,6 +481,9 @@
data parallel training in the benefit of time and memory saving. Currently, auto and semi auto
parallel mode support all optimizers in both Ascend and GPU. Data parallel mode only supports
`Lamb` and `AdamWeightDecay` in Ascend . Default: False.
+enable_alltoall (bool): A switch that allows AllToAll operators to be generated during communication. If its
+    value is False, a combination of operators such as AllGather, Split and Concat is used
+    instead of AllToAll. Default: False.
all_reduce_fusion_config (list): Set allreduce fusion strategy by parameters indices. Only support ReduceOp.SUM
and HCCL_WORLD_GROUP/NCCL_WORLD_GROUP. No Default, if it is not set, the fusion is closed.
pipeline_stages (int): Set the stage information for pipeline parallel. This indicates how the devices are
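
The docstring above states that with enable_alltoall=False the AllToAll exchange is replaced by AllGather, Split and Concat. A plain NumPy sketch (conceptual only; no MindSpore or real communication involved, the shapes are made up for the illustration) shows why the two forms are equivalent:

    import numpy as np

    ranks = 4
    # Each simulated rank holds one value destined for every other rank.
    send = [np.arange(ranks) + 10 * r for r in range(ranks)]   # send[r][d] goes to rank d

    # AllToAll semantics: rank d ends up with element d from every rank.
    alltoall = [np.array([send[r][d] for r in range(ranks)]) for d in range(ranks)]

    # Emulation: AllGather all buffers, Split them back, Concat the pieces each rank needs.
    gathered = np.concatenate(send)              # AllGather: every rank sees all the data
    per_rank = np.split(gathered, ranks)         # Split back into per-rank buffers
    emulated = [np.concatenate([np.split(buf, ranks)[d] for buf in per_rank])
                for d in range(ranks)]

    for d in range(ranks):
        assert np.array_equal(alltoall[d], emulated[d])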
@@ -545,6 +548,7 @@ def set_auto_parallel_context(**kwargs):
>>> context.set_auto_parallel_context(strategy_ckpt_save_file="./strategy_stage1.ckpt")
>>> context.set_auto_parallel_context(dataset_strategy=((1, 8), (1, 8)))
>>> context.set_auto_parallel_context(enable_parallel_optimizer=False)
+>>> context.set_auto_parallel_context(enable_alltoall=False)
>>> context.set_auto_parallel_context(all_reduce_fusion_config=[8, 160])
>>> context.set_auto_parallel_context(pipeline_stages=2)
>>> parallel_config = {"gradient_accumulation_shard": True, "parallel_optimizer_threshold": 24}
@@ -592,6 +596,7 @@ def reset_auto_parallel_context():
- strategy_ckpt_save_file: ''.
- full_batch: False.
- enable_parallel_optimizer: False.
+- enable_alltoall: False.
- pipeline_stages: 1.
- fusion_threshold: 64.
"""