From 7f8c5b10daa640aabb7d08b2d61cf27db32f7c82 Mon Sep 17 00:00:00 2001
From: huangxinjing
Date: Fri, 9 Jul 2021 15:05:10 +0800
Subject: [PATCH] Add pipeline dataset imports

Fix document
---
 mindspore/nn/wrap/grad_reducer.py               |  2 +-
 mindspore/ops/primitive.py                      | 11 ++++++++++-
 .../official/nlp/pangu_alpha/src/dataset.py     |  2 ++
 model_zoo/official/nlp/pangu_alpha/train.py     | 16 ++++++----------
 4 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/mindspore/nn/wrap/grad_reducer.py b/mindspore/nn/wrap/grad_reducer.py
index 29836aec927..d7680f2c44e 100644
--- a/mindspore/nn/wrap/grad_reducer.py
+++ b/mindspore/nn/wrap/grad_reducer.py
@@ -291,7 +291,7 @@ class DistributedGradReducer(Cell):
         ValueError: If degree is not a int or less than 0.
 
     Supported Platforms:
-        ``Ascend``, ``GPU``
+        ``Ascend`` ``GPU``
 
     Examples:
         >>> # This example should be run with multiple processes.
diff --git a/mindspore/ops/primitive.py b/mindspore/ops/primitive.py
index b73dca4546c..42f0f24c6af 100644
--- a/mindspore/ops/primitive.py
+++ b/mindspore/ops/primitive.py
@@ -140,9 +140,13 @@ class Primitive(Primitive_):
         Note:
             It is valid only in semi auto parallel.
             In other parallel modes, please set it to be 0.
-
         Args:
             stage (int): The stage id for the current operation.
+        Example:
+            >>> from mindspore.ops import operations as P
+            >>> add = P.Add()
+            >>> print(add.set_stage(0))
+            Prim[Add]
         """
         self.add_prim_attr("stage", stage)
         return self
@@ -157,6 +161,11 @@ class Primitive(Primitive_):
 
         Args:
             strategy (tuple): Strategy describes the distributed parallel mode of the current primitive.
+        Example:
+            >>> from mindspore.ops import operations as P
+            >>> add = P.Add()
+            >>> print(add.shard(((1, 1), (1, 1))))
+            Prim[Add]
         """
         mode = context.get_auto_parallel_context("parallel_mode")
         if strategy is not None:
diff --git a/model_zoo/official/nlp/pangu_alpha/src/dataset.py b/model_zoo/official/nlp/pangu_alpha/src/dataset.py
index 6f6fc147b31..b8966d870c4 100644
--- a/model_zoo/official/nlp/pangu_alpha/src/dataset.py
+++ b/model_zoo/official/nlp/pangu_alpha/src/dataset.py
@@ -95,6 +95,8 @@ def create_dataset(batch_size, data_path, device_num=1, rank=0, drop=True, full_
         os.path.join(home_path, name) for name in files
         if not name.endswith(".db")
     ]
+    # Ensure the order of the mindrecord files is the same on every machine; otherwise the loss may fail to converge.
+    data.sort()
     # Load data files and preprocess
     dataset = ds.MindDataset(data[data_start_index:], columns_list=[column_name],
                              shuffle=False)
diff --git a/model_zoo/official/nlp/pangu_alpha/train.py b/model_zoo/official/nlp/pangu_alpha/train.py
index 976e71063da..a12cbf73ba1 100644
--- a/model_zoo/official/nlp/pangu_alpha/train.py
+++ b/model_zoo/official/nlp/pangu_alpha/train.py
@@ -279,20 +279,16 @@ def run_train_pipeline(args_opt):
         optimizer = nn.Lamb(group_params, learning_rate=lr)
     else:
         optimizer = nn.AdamWeightDecay(group_params, learning_rate=lr, beta1=0.9, beta2=0.95, eps=1e-8)
-    if context.get_auto_parallel_context("full_batch"):
-        ds = create_dataset(config.batch_size, data_path=cache_url, eod_reset=True,
-                            data_start_index=0, full_batch=True, column_name=args_opt.data_column_name)
-    else:
-        if batch_size % stage_device_num != 0:
-            raise ValueError("Batch_size should be divisible by device_num")
-        ds = create_dataset(config.batch_size, data_path=cache_url, device_num=stage_device_num,
-                            rank=rank_id, eod_reset=True, data_start_index=0, full_batch=False,
-                            column_name=args_opt.data_column_name)
+
+    ds = create_dataset(config.batch_size, data_path=cache_url, device_num=stage_device_num,
+                        rank=rank_id % stage_device_num, eod_reset=True, data_start_index=0,
+                        full_batch=context.get_auto_parallel_context("full_batch"),
+                        column_name=args_opt.data_column_name)
     epoch_num = args_opt.epoch_size
     step_per_epoch = ds.get_dataset_size()
     callback_size = args_opt.sink_size
     actual_epoch_num = int(epoch_num * step_per_epoch / callback_size)
-    callback = [TimeMonitor(callback_size), LossCallBack(callback_size, rank_id, config.stage_num, config.micro_size)]
+    callback = [TimeMonitor(callback_size), LossCallBack(callback_size, rank_id, micro_size=config.micro_size)]
     loss_scale_value = math.pow(2, 32)
     update_cell = DynamicLossScaleUpdateCell(loss_scale_value=loss_scale_value, scale_factor=2, scale_window=1000)
     pangu_alpha_with_grads = PanguAlphaTrainPipelineWithLossScaleCell(
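
Note (not part of the patch): a short plain-Python sketch of the two behaviours the
change depends on. The data.sort() call makes every machine shard the mindrecord
files in the same order, since os.listdir gives no ordering guarantee, and the new
create_dataset call in train.py passes rank_id % stage_device_num as the shard rank,
which is the rank within one pipeline stage assuming ranks are grouped into
consecutive blocks per stage. The helper names list_mindrecords and stage_local_rank
below are illustrative only and do not exist in the repository.

    import os

    def list_mindrecords(data_path):
        """Collect mindrecord files (skipping the .db index files) in a fixed order."""
        files = os.listdir(data_path)
        data = [
            os.path.join(data_path, name) for name in files
            if not name.endswith(".db")
        ]
        # os.listdir gives no ordering guarantee, so without this sort each
        # machine could feed the shards in a different order.
        data.sort()
        return data

    def stage_local_rank(rank_id, stage_device_num):
        """Rank used for dataset sharding inside one pipeline stage."""
        return rank_id % stage_device_num

    if __name__ == "__main__":
        # e.g. 8 devices split into pipeline stages of 4 devices each
        for rank_id in range(8):
            print(rank_id, "->", stage_local_rank(rank_id, stage_device_num=4))

With stage_device_num=4, global ranks 0..7 map to local ranks 0,1,2,3,0,1,2,3, so each
pipeline stage shards the (sorted) dataset consistently across its data-parallel group.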