cloud
This commit is contained in: parent d919560902, commit 2c4c157ae5
@@ -18,12 +18,11 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import ast
import collections
import logging
import numpy as np
import src.tokenization as tokenization
from src.model_utils.config import config
from mindspore.mindrecord import FileWriter


class SampleInstance():

@@ -121,33 +120,17 @@ def create_training_instance(source_words, target_words, max_seq_length, clip_to
    return instance


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str, required=True,
                        help='Input raw text file (or comma-separated list of files).')
    parser.add_argument("--output_file", type=str, required=True, help='Output MindRecord file.')
    parser.add_argument("--num_splits", type=int, default=16,
                        help='The MindRecord file will be split into the number of partitions.')
    parser.add_argument("--vocab_file", type=str, required=True,
                        help='The vocabulary file that the Transformer model was trained on.')
    parser.add_argument("--clip_to_max_len", type=bool, default=False,
                        help='clip sequences to maximum sequence length.')
    parser.add_argument("--max_seq_length", type=int, default=128, help='Maximum sequence length.')
    parser.add_argument("--bucket", type=ast.literal_eval, default=[16, 32, 48, 64, 128],
                        help='bucket sequence length')

    args = parser.parse_args()

    tokenizer = tokenization.WhiteSpaceTokenizer(vocab_file=args.vocab_file)
    tokenizer = tokenization.WhiteSpaceTokenizer(vocab_file=config.vocab_file)

    input_files = []
    for input_pattern in args.input_file.split(","):
    for input_pattern in config.input_file.split(","):
        input_files.append(input_pattern)

    logging.info("*** Read from input files ***")
    for input_file in input_files:
        logging.info("  %s", input_file)

    output_file = args.output_file
    output_file = config.output_file
    logging.info("*** Write to output files ***")
    logging.info("  %s", output_file)

@@ -155,7 +138,7 @@ def main():
    total_read = 0

    feature_dict = {}
    for i in args.bucket:
    for i in config.bucket:
        feature_dict[i] = []

    for input_file in input_files:

@@ -174,17 +157,17 @@ def main():
            source_tokens = tokenizer.tokenize(source_line)
            target_tokens = tokenizer.tokenize(target_line)

            if len(source_tokens) >= args.max_seq_length or len(target_tokens) >= args.max_seq_length:
            if len(source_tokens) >= config.max_seq_length or len(target_tokens) >= config.max_seq_length:
                logging.info("ignore long sentence!")
                continue

            instance = create_training_instance(source_tokens, target_tokens, args.max_seq_length,
                                                clip_to_max_len=args.clip_to_max_len)
            instance = create_training_instance(source_tokens, target_tokens, config.max_seq_length,
                                                clip_to_max_len=config.clip_to_max_len)
            if instance is None:
                continue

            features, seq_max_bucket_length = get_instance_features(instance, tokenizer, args.max_seq_length,
                                                                    args.bucket)
            features, seq_max_bucket_length = get_instance_features(instance, tokenizer, config.max_seq_length,
                                                                    config.bucket)
            for key in feature_dict:
                if key == seq_max_bucket_length:
                    feature_dict[key].append(features)

@@ -200,12 +183,12 @@ def main():
            feature = features[feature_name]
            logging.info("%s: %s", feature_name, feature)

    for i in args.bucket:
        if args.num_splits == 1:
    for i in config.bucket:
        if config.num_splits == 1:
            output_file_name = output_file
        else:
            output_file_name = output_file + '_' + str(i) + '_'
        writer = FileWriter(output_file_name, args.num_splits)
        writer = FileWriter(output_file_name, config.num_splits)
        data_schema = {"source_sos_ids": {"type": "int64", "shape": [-1]},
                       "source_sos_mask": {"type": "int64", "shape": [-1]},
                       "source_eos_ids": {"type": "int64", "shape": [-1]},
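As a side note on the FileWriter usage above: a schema is registered once, then raw samples are written and committed. A minimal sketch with assumed toy data (the file name, shard count, and feature values here are illustrative, not from this commit):

    import numpy as np
    from mindspore.mindrecord import FileWriter

    # Toy example: write two variable-length records into a MindRecord
    # file split across 4 partitions, mirroring the schema above.
    data_schema = {"source_sos_ids": {"type": "int64", "shape": [-1]},
                   "source_sos_mask": {"type": "int64", "shape": [-1]}}

    writer = FileWriter("demo.mindrecord", 4)           # like FileWriter(output_file_name, config.num_splits)
    writer.add_schema(data_schema, "transformer demo")  # schema must be registered before writing
    samples = [{"source_sos_ids": np.array([1, 5, 9], dtype=np.int64),
                "source_sos_mask": np.array([1, 1, 1], dtype=np.int64)},
               {"source_sos_ids": np.array([1, 7], dtype=np.int64),
                "source_sos_mask": np.array([1, 1], dtype=np.int64)}]
    writer.write_raw_data(samples)                      # buffered write of a batch of dicts
    writer.commit()                                     # flush the index and finalize the files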
@@ -0,0 +1,133 @@
# Builtin Configurations (DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
data_url: ""
train_url: ""
checkpoint_url: ""
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path"
checkpoint_path: ''
device_target: Ascend
enable_profiling: False

# ==============================================================================
# config/cfg edict
transformer_network: 'base'
init_loss_scale_value: 1024
scale_factor: 2
scale_window: 2000
optimizer: 'Adam'
optimizer_adam_beta2: 0.997
# lr_schedule: edict({'learning_rate': 2.0, 'warmup_steps': 8000, 'start_decay_step': 16000, 'min_lr': 0.0,})

# transformer_net_cfg
batch_size: 96
seq_length: 128
vocab_size: 36560
hidden_size: 512
num_hidden_layers: 6
num_attention_heads: 8
intermediate_size: 2048
hidden_act: "relu"
hidden_dropout_prob: 0.2
attention_probs_dropout_prob: 0.2
max_position_embeddings: 128
initializer_range: 0.02
label_smoothing: 0.1
dtype: mstype.float32
compute_type: mstype.float16

# eval_config/cfg edict
data_file: '/cache/data'
model_file: './transformer/transformer_trained.ckpt'
output_file: './output_eval.txt'

# transformer_net_cfg
batch_size_ev: 1
hidden_dropout_prob_ev: 0.0
attention_probs_dropout_prob_ev: 0.0
beam_width: 4
max_decode_length: 80
length_penalty_weight: 1.0

# ==============================================================================
# train.py / Argparse init.
distribute: "false"
epoch_size: 52
device_id: 0
device_num: 1
enable_lossscale: "true"
do_shuffle: "true"
enable_save_ckpt: "true"
save_checkpoint_steps: 2500
save_checkpoint_num: 30
save_checkpoint_path: "./"
bucket_boundaries: [16, 32, 48, 64, 128]
accumulation_steps: 1

# export.py / eval_config - transformer export
file_name: "transformer"
file_format: 'AIR'

# postprocess / from eval_config
result_dir: "./result_Files"

# preprocess / from eval_config
result_path: "./preprocess_Result/"

# src/process_output.py "rescore nbest with smoothed sentence-level bleu."
vocab_file: ""

# create_data.py
input_file: ''
num_splits: 16
clip_to_max_len: False
max_seq_length: 128
bucket: [16, 32, 48, 64, 128]

---
# Config description for each option
enable_modelarts: 'Whether training on modelarts, default: False'
data_url: 'Dataset url for obs'
train_url: 'Training output url for obs'
data_path: 'Dataset path for local, it is better to use absolute path'
output_path: 'Training output path for local'
ann_file: 'Ann file, default is val.json.'

device_target: "device where the code will be implemented, default is Ascend"
checkpoint_path: "Checkpoint file path"
data_file: '/your/path/evaluation.mindrecord'
model_file: '/your/path/checkpoint_file'
output_file: './output_eval.txt'

distribute: "Run distribute, default is false."
epoch_size: "Epoch size, default is 52."
device_id: "Device id, default is 0."
device_num: "Use device nums, default is 1."
enable_lossscale: "Use lossscale or not, default is true."
do_shuffle: "Enable shuffle for dataset, default is true."
enable_save_ckpt: "Enable save checkpoint, default is true."
save_checkpoint_steps: "Save checkpoint steps, default is 2500."
save_checkpoint_num: "Save checkpoint numbers, default is 30."
save_checkpoint_path: "Save checkpoint file path"
bucket_boundaries: "sequence length for different bucket"
accumulation_steps: "Gradient accumulation steps, default is 1."

file_name: "output file name."
file_format: 'file format'
result_dir: "./result_Files"
result_path: "./preprocess_Result/"
vocab_file: "vocab file path."
input_file: 'Input raw text file (or comma-separated list of files).'
num_splits: 'The MindRecord file will be split into the number of partitions.'
clip_to_max_len: 'clip sequences to maximum sequence length.'
max_seq_length: 'Maximum sequence length.'
bucket: 'bucket sequence length'

---
device_target: ["Ascend", "GPU", "CPU"]
file_format: ["AIR", "ONNX", "MINDIR"]
distribute: ['true', 'false']
enable_lossscale: ['true', 'false']
do_shuffle: ['true', 'false']
enable_save_ckpt: ['true', 'false']
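Note the file's layout: three YAML documents separated by `---` lines — default values, per-key help text, and per-key allowed choices. A standalone sketch of how such a layout splits into documents (toy keys; only PyYAML assumed):

    import yaml

    # Toy three-document config in the same layout as the yaml above:
    # values, help descriptions, and allowed choices, separated by '---'.
    text = """\
    device_target: Ascend
    batch_size: 96
    ---
    device_target: 'device where the code will be implemented'
    batch_size: 'batch size for training'
    ---
    device_target: ["Ascend", "GPU", "CPU"]
    """

    values, helpers, choices = yaml.load_all(text, Loader=yaml.FullLoader)
    print(values["batch_size"])      # 96
    print(choices["device_target"])  # ['Ascend', 'GPU', 'CPU']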
@@ -0,0 +1,134 @@
# Builtin Configurations (DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
data_url: ""
train_url: ""
checkpoint_url: ""
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path"
checkpoint_path: ''
device_target: Ascend
enable_profiling: False

# ==============================================================================
# config/cfg edict
transformer_network: 'large'
init_loss_scale_value: 1024
scale_factor: 2
scale_window: 2000
optimizer: 'Adam'
optimizer_adam_beta2: 0.997
# lr_schedule: edict({'learning_rate': 2.0, 'warmup_steps': 8000, 'start_decay_step': 16000, 'min_lr': 0.0,})

# transformer_net_cfg
batch_size: 96
seq_length: 128
vocab_size: 36560
hidden_size: 1024
num_hidden_layers: 6
num_attention_heads: 16
intermediate_size: 4096
hidden_act: "relu"
hidden_dropout_prob: 0.2
attention_probs_dropout_prob: 0.2
max_position_embeddings: 128
initializer_range: 0.02
label_smoothing: 0.1
dtype: mstype.float32
compute_type: mstype.float16

# eval_config/cfg edict
data_file: '/cache/data'
model_file: './transformer/transformer_trained.ckpt'
output_file: './output_eval.txt'

# transformer_net_cfg
batch_size_ev: 1
hidden_dropout_prob_ev: 0.0
attention_probs_dropout_prob_ev: 0.0
beam_width: 4
max_decode_length: 80
length_penalty_weight: 1.0

# ==============================================================================
# train.py / Argparse init.
distribute: "false"
epoch_size: 52
device_id: 0
device_num: 1
enable_lossscale: "true"
do_shuffle: "true"
enable_save_ckpt: "true"
save_checkpoint_steps: 2500
save_checkpoint_num: 30
save_checkpoint_path: "./"
bucket_boundaries: [16, 32, 48, 64, 128]
accumulation_steps: 1

# export.py / eval_config - transformer export
file_name: "transformer"
file_format: 'AIR'

# postprocess / from eval_config
result_dir: "./result_Files"

# preprocess / from eval_config
result_path: "./preprocess_Result/"

# src/process_output.py "rescore nbest with smoothed sentence-level bleu."
vocab_file: ""

# create_data.py
input_file: ''
num_splits: 16
clip_to_max_len: False
max_seq_length: 128
bucket: [16, 32, 48, 64, 128]

---
# Config description for each option
enable_modelarts: 'Whether training on modelarts, default: False'
data_url: 'Dataset url for obs'
train_url: 'Training output url for obs'
data_path: 'Dataset path for local, it is better to use absolute path'
output_path: 'Training output path for local'
ann_file: 'Ann file, default is val.json.'

device_target: "device where the code will be implemented, default is Ascend"
checkpoint_path: "Checkpoint file path"
data_file: '/your/path/evaluation.mindrecord'
model_file: '/your/path/checkpoint_file'
output_file: './output_eval.txt'

distribute: "Run distribute, default is false."
epoch_size: "Epoch size, default is 52."
device_id: "Device id, default is 0."
device_num: "Use device nums, default is 1."
enable_lossscale: "Use lossscale or not, default is true."
do_shuffle: "Enable shuffle for dataset, default is true."
enable_save_ckpt: "Enable save checkpoint, default is true."
save_checkpoint_steps: "Save checkpoint steps, default is 2500."
save_checkpoint_num: "Save checkpoint numbers, default is 30."
save_checkpoint_path: "Save checkpoint file path"
bucket_boundaries: "sequence length for different bucket"
accumulation_steps: "Gradient accumulation steps, default is 1."

file_name: "output file name."
file_format: 'file format'
result_dir: "./result_Files"
result_path: "./preprocess_Result/"
vocab_file: "vocab file path."
input_file: 'Input raw text file (or comma-separated list of files).'
num_splits: 'The MindRecord file will be split into the number of partitions.'
clip_to_max_len: 'clip sequences to maximum sequence length.'
max_seq_length: 'Maximum sequence length.'
bucket: 'bucket sequence length'

---
device_target: ["Ascend", "GPU", "CPU"]
file_format: ["AIR", "ONNX", "MINDIR"]
distribute: ['true', 'false']
enable_lossscale: ['true', 'false']
do_shuffle: ['true', 'false']
enable_save_ckpt: ['true', 'false']
@@ -0,0 +1,134 @@
# Builtin Configurations (DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
data_url: ""
train_url: ""
checkpoint_url: ""
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path"
checkpoint_path: ''
device_target: GPU
enable_profiling: False

# ==============================================================================
# config/cfg edict
transformer_network: 'large'
init_loss_scale_value: 1024
scale_factor: 2
scale_window: 2000
optimizer: 'Adam'
optimizer_adam_beta2: 0.997
# lr_schedule: edict({'learning_rate': 2.0, 'warmup_steps': 8000, 'start_decay_step': 16000, 'min_lr': 0.0,})

# transformer_net_cfg_gpu
batch_size: 32
seq_length: 128
vocab_size: 36560
hidden_size: 1024
num_hidden_layers: 6
num_attention_heads: 16
intermediate_size: 4096
hidden_act: "relu"
hidden_dropout_prob: 0.2
attention_probs_dropout_prob: 0.2
max_position_embeddings: 128
initializer_range: 0.02
label_smoothing: 0.1
dtype: mstype.float32
compute_type: mstype.float16

# eval_config/cfg edict
data_file: '/cache/data'
model_file: './transformer/transformer_trained.ckpt'
output_file: './output_eval.txt'

# transformer_net_cfg
batch_size_ev: 1
hidden_dropout_prob_ev: 0.0
attention_probs_dropout_prob_ev: 0.0
beam_width: 4
max_decode_length: 80
length_penalty_weight: 1.0

# ==============================================================================
# train.py / Argparse init.
distribute: "false"
epoch_size: 52
device_id: 0
device_num: 1
enable_lossscale: "true"
do_shuffle: "true"
enable_save_ckpt: "true"
save_checkpoint_steps: 2500
save_checkpoint_num: 30
save_checkpoint_path: "./"
bucket_boundaries: [16, 32, 48, 64, 128]
accumulation_steps: 1

# export.py / eval_config - transformer export
file_name: "transformer"
file_format: 'AIR'

# postprocess / from eval_config
result_dir: "./result_Files"

# preprocess / from eval_config
result_path: "./preprocess_Result/"

# src/process_output.py "rescore nbest with smoothed sentence-level bleu."
vocab_file: ""

# create_data.py
input_file: ''
num_splits: 16
clip_to_max_len: False
max_seq_length: 128
bucket: [16, 32, 48, 64, 128]

---
# Config description for each option
enable_modelarts: 'Whether training on modelarts, default: False'
data_url: 'Dataset url for obs'
train_url: 'Training output url for obs'
data_path: 'Dataset path for local, it is better to use absolute path'
output_path: 'Training output path for local'
ann_file: 'Ann file, default is val.json.'

device_target: "device where the code will be implemented, default is Ascend"
checkpoint_path: "Checkpoint file path"
data_file: '/your/path/evaluation.mindrecord'
model_file: '/your/path/checkpoint_file'
output_file: './output_eval.txt'

distribute: "Run distribute, default is false."
epoch_size: "Epoch size, default is 52."
device_id: "Device id, default is 0."
device_num: "Use device nums, default is 1."
enable_lossscale: "Use lossscale or not, default is true."
do_shuffle: "Enable shuffle for dataset, default is true."
enable_save_ckpt: "Enable save checkpoint, default is true."
save_checkpoint_steps: "Save checkpoint steps, default is 2500."
save_checkpoint_num: "Save checkpoint numbers, default is 30."
save_checkpoint_path: "Save checkpoint file path"
bucket_boundaries: "sequence length for different bucket"
accumulation_steps: "Gradient accumulation steps, default is 1."

file_name: "output file name."
file_format: 'file format'
result_dir: "./result_Files"
result_path: "./preprocess_Result/"
vocab_file: "vocab file path."
input_file: 'Input raw text file (or comma-separated list of files).'
num_splits: 'The MindRecord file will be split into the number of partitions.'
clip_to_max_len: 'clip sequences to maximum sequence length.'
max_seq_length: 'Maximum sequence length.'
bucket: 'bucket sequence length'

---
device_target: ["Ascend", "GPU", "CPU"]
file_format: ["AIR", "ONNX", "MINDIR"]
distribute: ['true', 'false']
enable_lossscale: ['true', 'false']
do_shuffle: ['true', 'false']
enable_save_ckpt: ['true', 'false']
@@ -14,7 +14,7 @@
# ============================================================================
"""Transformer evaluation script."""

import argparse
import os
import numpy as np

import mindspore.nn as nn

@@ -28,8 +28,15 @@ import mindspore.dataset.transforms.c_transforms as deC
from mindspore import context

from src.transformer_model import TransformerModel
from src.eval_config import cfg, transformer_net_cfg
from src.model_utils.config import config
from src.model_utils.moxing_adapter import moxing_wrapper
from src.model_utils.device_adapter import get_device_id

config.dtype = mstype.float32
config.compute_type = mstype.float16
config.batch_size = config.batch_size_ev
config.hidden_dropout_prob = config.hidden_dropout_prob_ev
config.attention_probs_dropout_prob = config.attention_probs_dropout_prob_ev

def load_test_data(batch_size=1, data_file=None):
    """

@@ -57,7 +64,6 @@ class TransformerInferCell(nn.Cell):
    """
    Encapsulation class of transformer network infer.
    """

    def __init__(self, network):
        super(TransformerInferCell, self).__init__(auto_prefix=False)
        self.network = network

@@ -98,23 +104,22 @@ def load_weights(model_path):
    return parameter_dict


def modelarts_pre_process():
    config.output_file = os.path.join(config.output_path, config.output_file)
    config.data_file = os.path.join(config.data_file, 'ende-l128-mindrecord_128_00')

@moxing_wrapper(pre_process=modelarts_pre_process)
def run_transformer_eval():
    """
    Transformer evaluation.
    """
    parser = argparse.ArgumentParser(description='transformer')
    parser.add_argument("--device_target", type=str, default="Ascend",
                        help="device where the code will be implemented, default is Ascend")
    parser.add_argument('--device_id', type=int, default=0, help='device id of GPU or Ascend, default is 0')
    args = parser.parse_args()
    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, reserve_class_name_in_scope=False,
                        device_id=get_device_id())

    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, reserve_class_name_in_scope=False,
                        device_id=args.device_id)
    dataset = load_test_data(batch_size=config.batch_size, data_file=config.data_file)
    tfm_model = TransformerModel(config=config, is_training=False, use_one_hot_embeddings=False)

    dataset = load_test_data(batch_size=transformer_net_cfg.batch_size, data_file=cfg.data_file)
    tfm_model = TransformerModel(config=transformer_net_cfg, is_training=False, use_one_hot_embeddings=False)

    parameter_dict = load_weights(cfg.model_file)
    parameter_dict = load_weights(config.model_file)
    load_param_into_net(tfm_model, parameter_dict)

    tfm_infer = TransformerInferCell(tfm_model)

@@ -132,9 +137,9 @@ def run_transformer_eval():
        predictions.append(predicted_ids.asnumpy())

    # decode and write to file
    f = open(cfg.output_file, 'w')
    f = open(config.output_file, 'w')
    for batch_out in predictions:
        for i in range(transformer_net_cfg.batch_size):
        for i in range(config.batch_size):
            if batch_out.ndim == 3:
                batch_out = batch_out[:, 0]
            token_ids = [str(x) for x in batch_out[i].tolist()]
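A note on the `batch_out.ndim == 3` branch above: beam-search output arrives shaped (batch, beam_width, length), and the code keeps only the top beam. A minimal NumPy illustration with toy shapes (not from this commit):

    import numpy as np

    # Toy beam-search output: batch of 2, beam width 4, decode length 5.
    batch_out = np.arange(2 * 4 * 5).reshape(2, 4, 5)

    if batch_out.ndim == 3:          # (batch, beam, length)
        batch_out = batch_out[:, 0]  # keep the highest-scoring beam per sample

    print(batch_out.shape)  # (2, 5): one token-id sequence per sample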
@@ -14,35 +14,41 @@
# ============================================================================
""" export checkpoint file into models"""

import argparse
import numpy as np

from mindspore import Tensor, context
from mindspore.train.serialization import load_param_into_net, export

from src.transformer_model import TransformerModel
from src.eval_config import cfg, transformer_net_cfg
from src.model_utils.config import config
from src.model_utils.moxing_adapter import moxing_wrapper
from src.model_utils.device_adapter import get_device_id
from eval import load_weights

parser = argparse.ArgumentParser(description='transformer export')
parser.add_argument("--device_id", type=int, default=0, help="Device id")
parser.add_argument("--file_name", type=str, default="transformer", help="output file name.")
parser.add_argument('--file_format', type=str, choices=["AIR", "ONNX", "MINDIR"], default='AIR', help='file format')
parser.add_argument("--device_target", type=str, default="Ascend",
                    choices=["Ascend", "GPU", "CPU"], help="device target (default: Ascend)")
args = parser.parse_args()

context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
if args.device_target == "Ascend":
    context.set_context(device_id=args.device_id)
config.batch_size = config.batch_size_ev
config.hidden_dropout_prob = config.hidden_dropout_prob_ev
config.attention_probs_dropout_prob = config.attention_probs_dropout_prob_ev

if __name__ == '__main__':
    tfm_model = TransformerModel(config=transformer_net_cfg, is_training=False, use_one_hot_embeddings=False)
context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
if config.device_target == "Ascend":
    context.set_context(device_id=get_device_id())

    parameter_dict = load_weights(cfg.model_file)
def modelarts_pre_process():
    pass

@moxing_wrapper(pre_process=modelarts_pre_process)
def export_transformer():
    """ export_transformer """
    tfm_model = TransformerModel(config=config, is_training=False, use_one_hot_embeddings=False)

    parameter_dict = load_weights(config.model_file)
    load_param_into_net(tfm_model, parameter_dict)

    source_ids = Tensor(np.ones((transformer_net_cfg.batch_size, transformer_net_cfg.seq_length)).astype(np.int32))
    source_mask = Tensor(np.ones((transformer_net_cfg.batch_size, transformer_net_cfg.seq_length)).astype(np.int32))
    source_ids = Tensor(np.ones((config.batch_size, config.seq_length)).astype(np.int32))
    source_mask = Tensor(np.ones((config.batch_size, config.seq_length)).astype(np.int32))

    export(tfm_model, source_ids, source_mask, file_name=args.file_name, file_format=args.file_format)
    export(tfm_model, source_ids, source_mask, file_name=config.file_name, file_format=config.file_format)

if __name__ == '__main__':
    export_transformer()
@@ -15,31 +15,25 @@
"""Transformer evaluation script."""

import os
import argparse
import numpy as np
from src.model_utils.config import config

from src.eval_config import cfg, transformer_net_cfg

parser = argparse.ArgumentParser(description='postprocess')
parser.add_argument("--result_dir", type=str, default="./result_Files",
                    help="infer result path.")
args = parser.parse_args()

def generate_output():
    '''
    Generate output.
    '''
    predictions = []
    file_num = len(os.listdir(args.result_dir))
    file_num = len(os.listdir(config.result_dir))
    for i in range(file_num):
        batch = "transformer_bs_" + str(transformer_net_cfg.batch_size) + "_" + str(i) + "_0.bin"
        pred = np.fromfile(os.path.join(args.result_dir, batch), np.int32)
        predictions.append(pred.reshape(1, 1, transformer_net_cfg.max_decode_length + 1))
        batch = "transformer_bs_" + str(config.batch_size) + "_" + str(i) + "_0.bin"
        pred = np.fromfile(os.path.join(config.result_dir, batch), np.int32)
        predictions.append(pred.reshape(1, 1, config.max_decode_length + 1))

    # decode and write to file
    f = open(cfg.output_file, 'w')
    f = open(config.output_file, 'w')
    for batch_out in predictions:
        for i in range(transformer_net_cfg.batch_size):
        for i in range(config.batch_size):
            if batch_out.ndim == 3:
                batch_out = batch_out[:, 0]
            token_ids = [str(x) for x in batch_out[i].tolist()]
@@ -15,23 +15,16 @@
"""Transformer evaluation script."""

import os
import argparse

from src.eval_config import cfg, transformer_net_cfg
from src.model_utils.config import config
from eval import load_test_data

parser = argparse.ArgumentParser(description='preprocess')
parser.add_argument("--result_path", type=str, default="./preprocess_Result/",
                    help="preprocess result path.")
args = parser.parse_args()


def generate_bin():
    '''
    Generate bin files.
    '''
    dataset = load_test_data(batch_size=transformer_net_cfg.batch_size, data_file=cfg.data_file)
    cur_dir = args.result_path
    dataset = load_test_data(batch_size=config.batch_size, data_file=config.data_file)
    cur_dir = config.result_path

    source_eos_ids_path = os.path.join(cur_dir, "00_source_eos_ids")
    source_eos_mask_path = os.path.join(cur_dir, "01_source_eos_mask")

@@ -41,7 +34,7 @@ def generate_bin():
    if not os.path.isdir(source_eos_mask_path):
        os.makedirs(source_eos_mask_path)

    batch_size = transformer_net_cfg.batch_size
    batch_size = config.batch_size

    for i, data in enumerate(dataset.create_dict_iterator(output_numpy=True, num_epochs=1)):
        file_name = "transformer_bs_" + str(batch_size) + "_" + str(i) + ".bin"
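The preprocess/postprocess pair above communicates through flat .bin dumps, so the array shape must be re-imposed on read. A self-contained round-trip illustration (toy sizes; the naming pattern follows the scripts):

    import numpy as np

    batch_size, seq_len = 1, 128
    ids = np.random.randint(0, 36560, size=(batch_size, seq_len)).astype(np.int32)

    # preprocess side: flat binary dump, named like the scripts expect
    file_name = "transformer_bs_{}_{}.bin".format(batch_size, 0)
    ids.tofile(file_name)

    # postprocess side: np.fromfile loses the shape, so it must be restored
    back = np.fromfile(file_name, np.int32).reshape(batch_size, seq_len)
    assert (back == ids).all()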
@@ -28,7 +28,7 @@ eval_output=$2
vocab_file=$3

cat $eval_output \
| python src/process_output.py --vocab_file $vocab_file \
| python src/process_output.py --config_path="./default_config_large.yaml" --vocab_file $vocab_file \
| sed 's/@@ //g' > ${eval_output}.processed

perl -ple 's/(\S)-(\S)/$1 #@#-#@# $2/g' < $ref_data | perl ${BASEDIR}/replace-quote.perl > ${ref_data}.forbleu
@@ -13,11 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
if [ $# != 4 ] ; then
if [ $# != 5 ] ; then
echo "=============================================================================================================="
echo "Please run the script as: "
echo "sh run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_PATH RANK_TABLE_FILE"
echo "for example: sh run_distribute_pretrain.sh 8 52 /path/ende-l128-mindrecord00 /path/hccl.json"
echo "sh run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_PATH RANK_TABLE_FILE CONFIG_PATH"
echo "for example: sh run_distribute_pretrain.sh 8 52 /path/ende-l128-mindrecord00 /path/hccl.json ./default_config_large.yaml"
echo "It is better to use absolute path."
echo "=============================================================================================================="
exit 1;

@@ -32,6 +32,7 @@ DATA_PATH=$3

export HCCL_CONNECT_TIMEOUT=600
export RANK_TABLE_FILE=$4
export CONFIG_PATH=$5
export RANK_SIZE=$1
export HCCL_FLAG=1
export DEPLOY_MODE=0

@@ -43,11 +44,12 @@ do
    export GE_USE_STATIC_MEMORY=1

    mkdir helper$i
    cp -rf ../src/ ../train.py ./helper$i
    cp -rf ../src/ ../train.py ../*.yaml ./helper$i
    cd ./helper$i || exit
    echo "start training for rank $i, device $DEVICE_ID"
    env > env.log
    python train.py \
    --config_path=$CONFIG_PATH \
    --distribute="true" \
    --epoch_size=$EPOCH_SIZE \
    --device_id=$DEVICE_ID \

@@ -58,8 +60,7 @@ do
    --checkpoint_path="" \
    --save_checkpoint_steps=2500 \
    --save_checkpoint_num=30 \
    --data_path=$DATA_PATH \
    --bucket_boundaries=[16,32,48,64,128] > log.txt 2>&1 &
    --data_path=$DATA_PATH > log.txt 2>&1 &
    cd ../
done
cd ..
@@ -13,11 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
if [ $# != 5 ] ; then
if [ $# != 6 ] ; then
echo "=============================================================================================================="
echo "Please run the script as: "
echo "sh run_distribute_train_ascend_multi_machines.sh DEVICE_NUM SERVER_ID EPOCH_SIZE DATA_PATH RANK_TABLE_FILE"
echo "for example: sh run_distribute_train_ascend_multi_machines.sh 32 0 52 /path/ende-l128-mindrecord00 /path/hccl.json"
echo "sh run_distribute_train_ascend_multi_machines.sh DEVICE_NUM SERVER_ID EPOCH_SIZE DATA_PATH RANK_TABLE_FILE CONFIG_PATH"
echo "for example: sh run_distribute_train_ascend_multi_machines.sh 32 0 52 /path/ende-l128-mindrecord00 /path/hccl.json ./default_config_large.yaml"
echo "It is better to use absolute path."
echo "=============================================================================================================="
exit 1;

@@ -32,6 +32,7 @@ DATA_PATH=$4

export HCCL_CONNECT_TIMEOUT=600
export RANK_TABLE_FILE=$5
export CONFIG_PATH=$6
export RANK_SIZE=$1
export SERVER_ID=$2
export DEVICE_NUM=8

@@ -47,11 +48,12 @@ do
    export GE_USE_STATIC_MEMORY=1

    mkdir helper$i
    cp -rf ../src/ ../train.py ./helper$i
    cp -rf ../src/ ../train.py ../*.yaml ./helper$i
    cd ./helper$i || exit
    echo "start training for rank $i, device $DEVICE_ID"
    env > env.log
    python train.py \
    --config_path=$CONFIG_PATH \
    --distribute="true" \
    --epoch_size=$EPOCH_SIZE \
    --device_id=$DEVICE_ID \
@@ -13,11 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
if [ $# != 3 ] ; then
if [ $# != 4 ] ; then
echo "=============================================================================================================="
echo "Please run the script as: "
echo "sh run_distribute_train_gpu.sh DEVICE_NUM EPOCH_SIZE DATA_PATH"
echo "for example: sh run_distribute_train_gpu.sh 8 55 /path/ende-l128-mindrecord00"
echo "sh run_distribute_train_gpu.sh DEVICE_NUM EPOCH_SIZE DATA_PATH CONFIG_PATH"
echo "for example: sh run_distribute_train_gpu.sh 8 55 /path/ende-l128-mindrecord00 ./default_config_large_gpu.yaml"
echo "It is better to use absolute path."
echo "=============================================================================================================="
exit 1;

@@ -25,16 +25,18 @@ fi

rm -rf run_distribute_train
mkdir run_distribute_train
cp -rf ./src/ train.py ./run_distribute_train
cp -rf ./src/ train.py ./*.yaml ./run_distribute_train
cd run_distribute_train || exit

export RANK_SIZE=$1
export CONFIG_PATH=$4
EPOCH_SIZE=$2
DATA_PATH=$3
echo $RANK_SIZE

mpirun -n $RANK_SIZE \
    python train.py \
    --config_path=$CONFIG_PATH \
    --distribute="true" \
    --device_target="GPU" \
    --epoch_size=$EPOCH_SIZE \
@@ -13,19 +13,36 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
if [ $# != 2 ] ; then
if [ $# != 5 ] ; then
echo "=============================================================================================================="
echo "Please run the script as: "
echo "sh run_eval.sh DEVICE_TARGET DEVICE_ID"
echo "for example: sh run_eval.sh Ascend 0"
echo "Note: set the checkpoint and dataset path in src/eval_config.py"
echo "sh run_eval.sh DEVICE_TARGET DEVICE_ID MINDRECORD_DATA CKPT_PATH CONFIG_PATH"
echo "for example: sh run_eval.sh Ascend 0 /your/path/evaluation.mindrecord /your/path/checkpoint_file ./default_config_large_gpu.yaml"
echo "Note: set the checkpoint and dataset path in default_config.yaml"
echo "=============================================================================================================="
exit 1;
fi

export DEVICE_TARGET=$1
export CONFIG_PATH=$5
DEVICE_ID=$2

get_real_path(){
    if [ "${1:0:1}" == "/" ]; then
        echo "$1"
    else
        echo "$(realpath -m $PWD/$1)"
    fi
}

PATH3=$(get_real_path $3)
PATH4=$(get_real_path $4)
echo $PATH3
echo $PATH4

python eval.py \
    --config_path=$CONFIG_PATH \
    --device_target=$DEVICE_TARGET \
    --device_id=$DEVICE_ID \
    --data_file=$PATH3 \
    --model_file=$PATH4 > log_eval.txt 2>&1 &
@@ -66,7 +66,7 @@ function preprocess_data()
        rm -rf ./preprocess_Result
    fi
    mkdir preprocess_Result
    python3.7 ../preprocess.py --result_path=./preprocess_Result/
    python3.7 ../preprocess.py --config_path="./default_config_large.yaml" --result_path=./preprocess_Result/
}

function compile_app()

@@ -93,7 +93,7 @@ function infer()

function cal_acc()
{
    python3.7 ../postprocess.py --result_dir=./result_Files &> acc.log
    python3.7 ../postprocess.py --config_path="./default_config_large.yaml" --result_dir=./result_Files &> acc.log
}

if [ $need_preprocess == "y" ]; then
@@ -25,7 +25,7 @@ fi

rm -rf run_standalone_train
mkdir run_standalone_train
cp -rf ./src/ train.py ./run_standalone_train
cp -rf ./src/ train.py ./*.yaml ./run_standalone_train
cd run_standalone_train || exit

export DEVICE_TARGET=$1

@@ -36,6 +36,7 @@ DATA_PATH=$5

if [ $DEVICE_TARGET == 'Ascend' ];then
    python train.py \
    --config_path="./default_config_large.yaml" \
    --distribute="false" \
    --epoch_size=$EPOCH_SIZE \
    --accumulation_steps=$GRADIENT_ACCUMULATE_STEP \

@@ -53,6 +54,7 @@ elif [ $DEVICE_TARGET == 'GPU' ];then
    export CUDA_VISIBLE_DEVICES="$2"

    python train.py \
    --config_path="./default_config_large_gpu.yaml" \
    --distribute="false" \
    --epoch_size=$EPOCH_SIZE \
    --device_target=$DEVICE_TARGET \
@@ -17,7 +17,8 @@
import mindspore.common.dtype as mstype
import mindspore.dataset as de
import mindspore.dataset.transforms.c_transforms as deC
from .config import transformer_net_cfg, transformer_net_cfg_gpu
from .model_utils.config import config

de.config.set_seed(1)
def create_transformer_dataset(epoch_count=1, rank_size=1, rank_id=0, do_shuffle="true", dataset_path=None,
                               bucket_boundaries=None, device_target="Ascend"):

@@ -38,11 +39,8 @@ def create_transformer_dataset(epoch_count=1, rank_size=1, rank_id=0, do_shuffle
    ds = ds.map(operations=type_cast_op, input_columns="target_eos_mask")

    # apply batch operations
    if device_target == "Ascend":
        ds = ds.batch(transformer_net_cfg.batch_size, drop_remainder=True)
    else:
        ds = ds.batch(transformer_net_cfg_gpu.batch_size, drop_remainder=True)

    ds = ds.batch(config.batch_size, drop_remainder=True)
    ds = ds.repeat(epoch_count)
    return ds
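The change above drops the per-backend branch: the batch size now always comes from the merged config. For intuition, batching and repeating compose like this (toy in-memory dataset, same `mindspore.dataset` API):

    import numpy as np
    import mindspore.dataset as de

    data = {"source_eos_ids": np.arange(8).reshape(8, 1).astype(np.int64)}
    ds = de.NumpySlicesDataset(data, shuffle=False)
    ds = ds.batch(4, drop_remainder=True)  # same call for Ascend and GPU now
    ds = ds.repeat(2)
    print(ds.get_dataset_size())  # 4 batches: (8 / 4) per epoch * 2 repeats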
@@ -0,0 +1,127 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

"""Parse arguments"""

import os
import ast
import argparse
from pprint import pprint, pformat
import yaml

class Config:
    """
    Configuration namespace. Convert dictionary to members.
    """
    def __init__(self, cfg_dict):
        for k, v in cfg_dict.items():
            if isinstance(v, (list, tuple)):
                setattr(self, k, [Config(x) if isinstance(x, dict) else x for x in v])
            else:
                setattr(self, k, Config(v) if isinstance(v, dict) else v)

    def __str__(self):
        return pformat(self.__dict__)

    def __repr__(self):
        return self.__str__()


def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path="default_config.yaml"):
    """
    Parse command line arguments to the configuration according to the default yaml.

    Args:
        parser: Parent parser.
        cfg: Base configuration.
        helper: Helper description.
        cfg_path: Path to the default yaml config.
    """
    parser = argparse.ArgumentParser(description="[REPLACE THIS at config.py]",
                                     parents=[parser])
    helper = {} if helper is None else helper
    choices = {} if choices is None else choices
    for item in cfg:
        if not isinstance(cfg[item], list) and not isinstance(cfg[item], dict):
            help_description = helper[item] if item in helper else "Please reference to {}".format(cfg_path)
            choice = choices[item] if item in choices else None
            if isinstance(cfg[item], bool):
                parser.add_argument("--" + item, type=ast.literal_eval, default=cfg[item], choices=choice,
                                    help=help_description)
            else:
                parser.add_argument("--" + item, type=type(cfg[item]), default=cfg[item], choices=choice,
                                    help=help_description)
    args = parser.parse_args()
    return args


def parse_yaml(yaml_path):
    """
    Parse the yaml config file.

    Args:
        yaml_path: Path to the yaml config.
    """
    with open(yaml_path, 'r') as fin:
        try:
            cfgs = yaml.load_all(fin.read(), Loader=yaml.FullLoader)
            cfgs = [x for x in cfgs]
            if len(cfgs) == 1:
                cfg_helper = {}
                cfg = cfgs[0]
                cfg_choices = {}
            elif len(cfgs) == 2:
                cfg, cfg_helper = cfgs
                cfg_choices = {}
            elif len(cfgs) == 3:
                cfg, cfg_helper, cfg_choices = cfgs
            else:
                raise ValueError("At most 3 docs (config, description for help, choices) are supported in config yaml")
            print(cfg_helper)
        except:
            raise ValueError("Failed to parse yaml")
    return cfg, cfg_helper, cfg_choices


def merge(args, cfg):
    """
    Merge the base config from yaml file and command line arguments.

    Args:
        args: Command line arguments.
        cfg: Base configuration.
    """
    args_var = vars(args)
    for item in args_var:
        cfg[item] = args_var[item]
    return cfg


def get_config():
    """
    Get Config according to the yaml file and cli arguments.
    """
    parser = argparse.ArgumentParser(description="default name", add_help=False)
    current_dir = os.path.dirname(os.path.abspath(__file__))
    parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, "../../default_config.yaml"),
                        help="Config file path")
    path_args, _ = parser.parse_known_args()
    default, helper, choices = parse_yaml(path_args.config_path)
    pprint(default)
    args = parse_cli_to_yaml(parser=parser, cfg=default, helper=helper, choices=choices, cfg_path=path_args.config_path)
    final_config = merge(args, default)
    return Config(final_config)

config = get_config()
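To illustrate how this module behaves end to end, here is a hypothetical invocation (the override values are illustrative, not from this commit):

    # Hypothetical usage sketch of src/model_utils/config.py, assuming a
    # default_config_large.yaml with the three documents shown earlier.
    #
    #   $ python train.py --config_path=./default_config_large.yaml --batch_size=64
    #
    # get_config() first parses --config_path, loads the three YAML documents
    # (values, help strings, choices), then re-registers every scalar key as a
    # CLI flag so the command line can override the YAML defaults:

    from src.model_utils.config import config

    print(config.batch_size)     # 64: the CLI value wins over the YAML default of 96
    print(config.device_target)  # "Ascend" unless overridden; constrained by the
                                 # third YAML document to ["Ascend", "GPU", "CPU"]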
@@ -0,0 +1,27 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

"""Device adapter for ModelArts"""

from .config import config

if config.enable_modelarts:
    from .moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
else:
    from .local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id

__all__ = [
    "get_device_id", "get_device_num", "get_rank_id", "get_job_id"
]
@@ -0,0 +1,36 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

"""Local adapter"""

import os

def get_device_id():
    device_id = os.getenv('DEVICE_ID', '0')
    return int(device_id)


def get_device_num():
    device_num = os.getenv('RANK_SIZE', '1')
    return int(device_num)


def get_rank_id():
    global_rank_id = os.getenv('RANK_ID', '0')
    return int(global_rank_id)


def get_job_id():
    return "Local Job"
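These helpers make rank information uniform across launchers; the distributed shell scripts above export DEVICE_ID and RANK_SIZE before train.py starts. A quick illustration with toy values:

    import os

    # Simulate the environment the distributed launch scripts export.
    os.environ["DEVICE_ID"] = "3"
    os.environ["RANK_SIZE"] = "8"

    from src.model_utils.local_adapter import get_device_id, get_device_num

    print(get_device_id())   # 3
    print(get_device_num())  # 8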
@@ -0,0 +1,122 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

"""Moxing adapter for ModelArts"""

import os
import functools
from mindspore import context
from mindspore.profiler import Profiler
from .config import config

_global_sync_count = 0

def get_device_id():
    device_id = os.getenv('DEVICE_ID', '0')
    return int(device_id)


def get_device_num():
    device_num = os.getenv('RANK_SIZE', '1')
    return int(device_num)


def get_rank_id():
    global_rank_id = os.getenv('RANK_ID', '0')
    return int(global_rank_id)


def get_job_id():
    job_id = os.getenv('JOB_ID')
    job_id = job_id if job_id != "" else "default"
    return job_id

def sync_data(from_path, to_path):
    """
    Download data from remote obs to local directory if the first url is remote url and the second one is local path
    Upload data from local directory to remote obs in contrast.
    """
    import moxing as mox
    import time
    global _global_sync_count
    sync_lock = "/tmp/copy_sync.lock" + str(_global_sync_count)
    _global_sync_count += 1

    # Each server contains 8 devices at most.
    if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
        print("from path: ", from_path)
        print("to path: ", to_path)
        mox.file.copy_parallel(from_path, to_path)
        print("===finish data synchronization===")
        try:
            os.mknod(sync_lock)
        except IOError:
            pass
        print("===save flag===")

    while True:
        if os.path.exists(sync_lock):
            break
        time.sleep(1)

    print("Finish sync data from {} to {}.".format(from_path, to_path))


def moxing_wrapper(pre_process=None, post_process=None):
    """
    Moxing wrapper to download dataset and upload outputs.
    """
    def wrapper(run_func):
        @functools.wraps(run_func)
        def wrapped_func(*args, **kwargs):
            # Download data from data_url
            if config.enable_modelarts:
                if config.data_url:
                    sync_data(config.data_url, config.data_path)
                    print("Dataset downloaded: ", os.listdir(config.data_path))
                if config.checkpoint_url:
                    sync_data(config.checkpoint_url, config.load_path)
                    print("Preload downloaded: ", os.listdir(config.load_path))
                if config.train_url:
                    sync_data(config.train_url, config.output_path)
                    print("Workspace downloaded: ", os.listdir(config.output_path))

                context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
                config.device_num = get_device_num()
                config.device_id = get_device_id()
                if not os.path.exists(config.output_path):
                    os.makedirs(config.output_path)

                if pre_process:
                    pre_process()

            if config.enable_profiling:
                profiler = Profiler()

            run_func(*args, **kwargs)

            if config.enable_profiling:
                profiler.analyse()

            # Upload data to train_url
            if config.enable_modelarts:
                if post_process:
                    post_process()

                if config.train_url:
                    print("Start to copy output directory")
                    sync_data(config.output_path, config.train_url)
        return wrapped_func
    return wrapper
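A sketch of how an entry point consumes this decorator (hypothetical function names; config keys as in the YAML files above). Run locally, the wrapper only handles optional profiling; with enable_modelarts set, it syncs OBS data in before the run, calls pre_process, and syncs outputs back afterwards:

    from src.model_utils.config import config
    from src.model_utils.moxing_adapter import moxing_wrapper

    def prepare_paths():
        # only reached on ModelArts, after the dataset has been pulled to /cache
        print("data lives in", config.data_path)

    @moxing_wrapper(pre_process=prepare_paths)
    def main():
        print("training with batch_size =", config.batch_size)

    if __name__ == "__main__":
        main()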
@@ -18,22 +18,18 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import sys

import sys
import tokenization
from .model_utils.config import config

# Explicitly set the encoding
sys.stdin = open(sys.stdin.fileno(), mode='r', encoding='utf-8', buffering=True)
sys.stdout = open(sys.stdout.fileno(), mode='w', encoding='utf-8', buffering=True)

def main():
    parser = argparse.ArgumentParser(
        description="rescore nbest with smoothed sentence-level bleu.")
    parser.add_argument("--vocab_file", type=str, default="", required=True, help="vocab file path.")
    args = parser.parse_args()

    tokenizer = tokenization.WhiteSpaceTokenizer(vocab_file=args.vocab_file)
    tokenizer = tokenization.WhiteSpaceTokenizer(vocab_file=config.vocab_file)

    for line in sys.stdin:
        token_ids = [int(x) for x in line.strip().split()]
@ -16,9 +16,8 @@
|
|||
|
||||
import os
|
||||
import time
|
||||
import argparse
|
||||
import ast
|
||||
|
||||
from easydict import EasyDict as edict
|
||||
import mindspore.common.dtype as mstype
|
||||
from mindspore.common.tensor import Tensor
|
||||
from mindspore.nn.optim import Adam
|
||||
|
@ -36,21 +35,30 @@ from mindspore.common import set_seed
|
|||
from src.transformer_for_train import TransformerTrainOneStepCell, TransformerNetworkWithLoss, \
|
||||
TransformerTrainOneStepWithLossScaleCell, \
|
||||
TransformerTrainAccumulationAllReducePostWithLossScaleCell
|
||||
from src.config import cfg, transformer_net_cfg, transformer_net_cfg_gpu
|
||||
from src.dataset import create_transformer_dataset
|
||||
from src.lr_schedule import create_dynamic_lr
|
||||
from src.model_utils.config import config
|
||||
from src.model_utils.moxing_adapter import moxing_wrapper
|
||||
from src.model_utils.device_adapter import get_device_id
|
||||
|
||||
|
||||
set_seed(1)
|
||||
|
||||
|
||||
def get_ms_timestamp():
|
||||
t = time.time()
|
||||
return int(round(t * 1000))
|
||||
|
||||
|
||||
time_stamp_init = False
|
||||
time_stamp_first = 0
|
||||
|
||||
config.dtype = mstype.float32
|
||||
config.compute_type = mstype.float16
|
||||
config.lr_schedule = edict({
|
||||
'learning_rate': 2.0,
|
||||
'warmup_steps': 8000,
|
||||
'start_decay_step': 16000,
|
||||
'min_lr': 0.0,
|
||||
})
|
||||
|
||||
class LossCallBack(Callback):
|
||||
"""
|
||||
|
@ -82,7 +90,12 @@ class LossCallBack(Callback):
|
|||
cb_params.cur_epoch_num,
|
||||
cb_params.cur_step_num,
|
||||
str(cb_params.net_outputs)))
|
||||
with open("./loss_{}.log".format(self.rank_id), "a+") as f:
|
||||
|
||||
loss_file = "./loss_{}.log"
|
||||
if config.enable_modelarts:
|
||||
loss_file = "/cache/train/loss_{}.log"
|
||||
|
||||
with open(loss_file.format(self.rank_id), "a+") as f:
|
||||
f.write("time: {}, epoch: {}, step: {}, loss: {}, overflow: {}, loss_scale: {}".format(
|
||||
time_stamp_current - time_stamp_first,
|
||||
cb_params.cur_epoch_num,
|
||||
|
@@ -93,121 +106,90 @@ class LossCallBack(Callback):
            f.write('\n')


def argparse_init():
    """
    Argparse init.
    """
    parser = argparse.ArgumentParser(description='transformer')
    parser.add_argument("--distribute", type=str, default="false", choices=['true', 'false'],
                        help="Run distribute, default is false.")
    parser.add_argument("--epoch_size", type=int, default=52, help="Epoch size, default is 52.")
    parser.add_argument("--device_target", type=str, default="Ascend",
                        help="device where the code will be implemented, default is Ascend")
    parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.")
    parser.add_argument("--device_num", type=int, default=1, help="Use device nums, default is 1.")
    parser.add_argument("--enable_lossscale", type=str, default="true", choices=['true', 'false'],
                        help="Use lossscale or not, default is true.")
    parser.add_argument("--do_shuffle", type=str, default="true", choices=['true', 'false'],
                        help="Enable shuffle for dataset, default is true.")
    parser.add_argument("--checkpoint_path", type=str, default="", help="Checkpoint file path")
    parser.add_argument("--enable_save_ckpt", type=str, default="true", choices=['true', 'false'],
                        help="Enable save checkpoint, default is true.")
    parser.add_argument("--save_checkpoint_steps", type=int, default=2500, help="Save checkpoint steps, "
                                                                                "default is 2500.")
    parser.add_argument("--save_checkpoint_num", type=int, default=30, help="Save checkpoint numbers, default is 30.")
    parser.add_argument("--save_checkpoint_path", type=str, default="./", help="Save checkpoint file path")
    parser.add_argument("--data_path", type=str, default="", help="Data path, it is better to use absolute path")
    parser.add_argument("--bucket_boundaries", type=ast.literal_eval, default=[16, 32, 48, 64, 128],
                        help="sequence length for different bucket")
    parser.add_argument("--accumulation_steps", type=int, default=1, help="Gradient accumulation steps, default is 1.")

    return parser
def modelarts_pre_process():
    config.save_checkpoint_path = config.output_path
    config.data_path = os.path.join(config.data_path, 'ende-l128-mindrecord')


@moxing_wrapper(pre_process=modelarts_pre_process)
def run_transformer_train():
    """
    Transformer training.
    """
    parser = argparse_init()
    args, _ = parser.parse_known_args()
    if args.device_target == "Ascend":
        context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=args.device_id)
    if config.device_target == "Ascend":
        context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=get_device_id())
    else:
        context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
        context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
    context.set_context(reserve_class_name_in_scope=False, enable_auto_mixed_precision=False)

    if args.device_target == "GPU":
    if config.device_target == "GPU":
        # Enable graph kernel
        context.set_context(enable_graph_kernel=True, graph_kernel_flags="--enable_parallel_fusion")
    if args.distribute == "true":
        if args.device_target == "Ascend":
            device_num = args.device_num
    if config.distribute == "true":
        if config.device_target == "Ascend":
            device_num = config.device_num
            D.init('hccl')
        else:
            D.init('nccl')
            device_num = D.get_group_size()
            rank = get_rank()
            args.device_id = rank
            config.device_id = rank
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
                                          device_num=device_num)
        rank_id = args.device_id % device_num
        save_ckpt_path = os.path.join(args.save_checkpoint_path, 'ckpt_' + str(get_rank()) + '/')
        rank_id = config.device_id % device_num
        save_ckpt_path = os.path.join(config.save_checkpoint_path, 'ckpt_' + str(get_rank()) + '/')
    else:
        device_num = 1
        rank_id = 0
        save_ckpt_path = os.path.join(args.save_checkpoint_path, 'ckpt_0/')
        save_ckpt_path = os.path.join(config.save_checkpoint_path, 'ckpt_0/')
    dataset = create_transformer_dataset(epoch_count=1, rank_size=device_num,
                                         rank_id=rank_id, do_shuffle=args.do_shuffle,
                                         dataset_path=args.data_path,
                                         bucket_boundaries=args.bucket_boundaries,
                                         device_target=args.device_target)
    if args.device_target == "Ascend":
        netwithloss = TransformerNetworkWithLoss(transformer_net_cfg, True)
    else:
        netwithloss = TransformerNetworkWithLoss(transformer_net_cfg_gpu, True)
                                         rank_id=rank_id, do_shuffle=config.do_shuffle,
                                         dataset_path=config.data_path,
                                         bucket_boundaries=config.bucket_boundaries,
                                         device_target=config.device_target)

    if args.checkpoint_path:
        parameter_dict = load_checkpoint(args.checkpoint_path)
    netwithloss = TransformerNetworkWithLoss(config, True)

    if config.checkpoint_path:
        parameter_dict = load_checkpoint(config.checkpoint_path)
        load_param_into_net(netwithloss, parameter_dict)

    hidden_size = transformer_net_cfg.hidden_size if args.device_target == "Ascend" \
        else transformer_net_cfg_gpu.hidden_size
    learning_rate = cfg.lr_schedule.learning_rate if args.device_target == "Ascend" \
        else 1.0
    hidden_size = config.hidden_size
    learning_rate = config.lr_schedule.learning_rate if config.device_target == "Ascend" else 1.0
    lr = Tensor(create_dynamic_lr(schedule="constant*rsqrt_hidden*linear_warmup*rsqrt_decay",
                                  training_steps=dataset.get_dataset_size()*args.epoch_size,
                                  training_steps=dataset.get_dataset_size()*config.epoch_size,
                                  learning_rate=learning_rate,
                                  warmup_steps=cfg.lr_schedule.warmup_steps,
                                  warmup_steps=config.lr_schedule.warmup_steps,
                                  hidden_size=hidden_size,
                                  start_decay_step=cfg.lr_schedule.start_decay_step,
                                  min_lr=cfg.lr_schedule.min_lr), mstype.float32)
                                  start_decay_step=config.lr_schedule.start_decay_step,
                                  min_lr=config.lr_schedule.min_lr), mstype.float32)
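    # The schedule string is a product of per-step factors; with warmup steps w,
    # hidden size h and step s it evaluates to roughly
    #   lr(s) = learning_rate * h**-0.5 * min(1.0, s / w) * max(s, w)**-0.5,
    # i.e. a linear warmup followed by inverse-sqrt decay (see src/lr_schedule.py).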

    if args.device_target == "GPU" and cfg.transformer_network == "large":
        optimizer = Adam(netwithloss.trainable_params(), lr, beta2=cfg.optimizer_adam_beta2)
    if config.device_target == "GPU" and config.transformer_network == "large":
        optimizer = Adam(netwithloss.trainable_params(), lr, beta2=config.optimizer_adam_beta2)
    else:
        optimizer = Adam(netwithloss.trainable_params(), lr)

    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(rank_id=rank_id)]
    if args.enable_save_ckpt == "true":
    if config.enable_save_ckpt == "true":
        if device_num == 1 or (device_num > 1 and rank_id == 0):
            if args.device_target == "Ascend":
                ckpt_config = CheckpointConfig(save_checkpoint_steps=args.save_checkpoint_steps,
                                               keep_checkpoint_max=args.save_checkpoint_num)
            if config.device_target == "Ascend":
                ckpt_config = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_steps,
                                               keep_checkpoint_max=config.save_checkpoint_num)
            else:
                ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset.get_dataset_size(),
                                               keep_checkpoint_max=args.save_checkpoint_num)
                                               keep_checkpoint_max=config.save_checkpoint_num)
            ckpoint_cb = ModelCheckpoint(prefix='transformer', directory=save_ckpt_path, config=ckpt_config)
            callbacks.append(ckpoint_cb)

    if args.enable_lossscale == "true":
        scale_manager = DynamicLossScaleManager(init_loss_scale=cfg.init_loss_scale_value,
                                                scale_factor=cfg.scale_factor,
                                                scale_window=cfg.scale_window)
    if config.enable_lossscale == "true":
        scale_manager = DynamicLossScaleManager(init_loss_scale=config.init_loss_scale_value,
                                                scale_factor=config.scale_factor,
                                                scale_window=config.scale_window)
        update_cell = scale_manager.get_update_cell()
        if args.accumulation_steps > 1:
        if config.accumulation_steps > 1:
            netwithgrads = TransformerTrainAccumulationAllReducePostWithLossScaleCell(netwithloss, optimizer,
                                                                                      update_cell,
                                                                                      args.accumulation_steps)
                                                                                      config.accumulation_steps)
        else:
            netwithgrads = TransformerTrainOneStepWithLossScaleCell(netwithloss, optimizer=optimizer,
                                                                    scale_update_cell=update_cell)

@@ -217,7 +199,7 @@ def run_transformer_train():
    netwithgrads.set_train(True)
    model = Model(netwithgrads)

    model.train(args.epoch_size, dataset, callbacks=callbacks, dataset_sink_mode=False)
    model.train(config.epoch_size, dataset, callbacks=callbacks, dataset_sink_mode=False)


if __name__ == '__main__':
@@ -0,0 +1,281 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Transformer beam search module."""

import numpy as np
import mindspore.common.dtype as mstype
import mindspore.nn as nn
from mindspore.ops import operations as P
from mindspore.common.tensor import Tensor

INF = 1. * 1e9

class LengthPenalty(nn.Cell):
    """
    Normalize scores of translations according to their length.

    Args:
        weight (float): Weight of length penalty. Default: 1.0.
        compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mstype.float32.
    """
    def __init__(self,
                 weight=1.0,
                 compute_type=mstype.float32):
        super(LengthPenalty, self).__init__()
        self.weight = weight
        self.add = P.Add()
        self.pow = P.Pow()
        self.div = P.RealDiv()
        self.cast = P.Cast()
        self.five = Tensor(5.0, mstype.float32)
        self.six = Tensor(6.0, mstype.float32)

    def construct(self, length_tensor):
        length_tensor = self.cast(length_tensor, mstype.float32)
        output = self.add(length_tensor, self.five)
        output = self.div(output, self.six)
        output = self.pow(output, self.weight)
        return output
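
# LengthPenalty implements the GNMT-style normalizer lp(l) = ((l + 5) / 6) ** weight.
# Final beam scores are divided by lp(l), so with weight 1.0 a length-7 hypothesis is
# normalized by (7 + 5) / 6 = 2.0 instead of by its raw length.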


class TileBeam(nn.Cell):
    """
    TileBeam.

    Args:
        beam_width (int): beam width setting. Default: 4.
        compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mstype.float32.
    """
    def __init__(self,
                 beam_width,
                 compute_type=mstype.float32):
        super(TileBeam, self).__init__()
        self.beam_width = beam_width
        self.expand = P.ExpandDims()
        self.tile = P.Tile()
        self.reshape = P.Reshape()
        self.shape = P.Shape()

    def construct(self, input_tensor):
        """
        input_tensor: shape [batch, dim1, dim2]
        output_tensor: shape [batch*beam, dim1, dim2]
        """
        shape = self.shape(input_tensor)
        input_tensor = self.expand(input_tensor, 1)
        tile_shape = (1,) + (self.beam_width,)
        for _ in range(len(shape)-1):
            tile_shape = tile_shape + (1,)
        output = self.tile(input_tensor, tile_shape)
        out_shape = (shape[0]*self.beam_width,) + shape[1:]
        output = self.reshape(output, out_shape)
        return output
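
# Shape walk-through for TileBeam with beam_width=4 and input [batch, d1, d2]:
# expand -> [batch, 1, d1, d2]; tile by (1, 4, 1, 1) -> [batch, 4, d1, d2];
# reshape -> [batch*4, d1, d2], i.e. one copy of the encoder states per beam.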


class Mod(nn.Cell):
    """
    Mod function.

    Args:
        compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mstype.float32.
    """
    def __init__(self,
                 compute_type=mstype.float32):
        super(Mod, self).__init__()
        self.compute_type = compute_type
        self.floor_div = P.FloorDiv()
        self.sub = P.Sub()
        self.multiply = P.Mul()

    def construct(self, input_x, input_y):
        x = self.floor_div(input_x, input_y)
        x = self.multiply(x, input_y)
        x = self.sub(input_x, x)
        return x
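
# Mod computes input_x - floor(input_x / input_y) * input_y, i.e. an elementwise
# modulo assembled from primitives that are available on all backends.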


class BeamSearchDecoder(nn.Cell):
    """
    Beam search decoder.

    Args:
        batch_size (int): Batch size of input dataset.
        seq_length (int): Length of input sequence.
        vocab_size (int): Size of vocabulary.
        decoder (:class:`TransformerDecoderStep`): Decoder module.
        beam_width (int): beam width setting. Default: 4.
        length_penalty_weight (float): Weight of length penalty. Default: 1.0.
        max_decode_length (int): max decode length. Default: 128.
        sos_id (int): Id of sequence start token. Default: 1.
        eos_id (int): Id of sequence end token. Default: 2.
        compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mstype.float32.
    """
    def __init__(self,
                 batch_size,
                 seq_length,
                 vocab_size,
                 decoder,
                 beam_width=4,
                 length_penalty_weight=1.0,
                 max_decode_length=128,
                 sos_id=1,
                 eos_id=2,
                 compute_type=mstype.float32):
        super(BeamSearchDecoder, self).__init__(auto_prefix=False)
        self.seq_length = seq_length
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.beam_width = beam_width
        self.length_penalty_weight = length_penalty_weight
        self.max_decode_length = max_decode_length
        self.decoder = decoder

        self.add = P.Add()
        self.expand = P.ExpandDims()
        self.reshape = P.Reshape()
        self.shape_flat = (-1,)
        self.shape = P.Shape()

        self.zero_tensor = Tensor(np.zeros([batch_size, beam_width]), mstype.float32)
        self.ninf_tensor = Tensor(np.full([batch_size, beam_width], -INF), mstype.float32)

        self.select = P.Select()
        self.flat_shape = (batch_size, beam_width * vocab_size)
        self.topk = P.TopK(sorted=True)
        self.floor_div = P.FloorDiv()
        self.vocab_size_tensor = Tensor(self.vocab_size, mstype.int32)
        self.real_div = P.RealDiv()
        self.mod = Mod()
        self.equal = P.Equal()
        self.eos_ids = Tensor(np.full([batch_size, beam_width], eos_id), mstype.int32)

        beam_ids = np.tile(np.arange(beam_width).reshape((1, beam_width)), [batch_size, 1])
        self.beam_ids = Tensor(beam_ids, mstype.int32)
        batch_ids = np.arange(batch_size*beam_width).reshape((batch_size, beam_width)) // beam_width
        self.batch_ids = Tensor(batch_ids, mstype.int32)
        self.concat = P.Concat(axis=-1)
        self.gather_nd = P.GatherNd()

        self.greater_equal = P.GreaterEqual()
        self.sub = P.Sub()
        self.cast = P.Cast()
        self.zeroslike = P.ZerosLike()

        # init inputs and states
        self.start_ids = Tensor(np.full([batch_size * beam_width, 1], sos_id), mstype.int32)
        self.init_seq = Tensor(np.full([batch_size, beam_width, 1], sos_id), mstype.int32)
        init_scores = np.tile(np.array([[0.] + [-INF]*(beam_width-1)]), [batch_size, 1])
        self.init_scores = Tensor(init_scores, mstype.float32)
        self.init_finished = Tensor(np.zeros([batch_size, beam_width], dtype=np.bool))
        self.init_length = Tensor(np.zeros([batch_size, beam_width], dtype=np.int32))
        self.length_penalty = LengthPenalty(weight=length_penalty_weight)
        self.one = Tensor(1, mstype.int32)

    def one_step(self, cur_input_ids, enc_states, enc_attention_mask, state_log_probs,
                 state_seq, state_finished, state_length):
        """
        One step for decode
        """
        log_probs = self.decoder(cur_input_ids, enc_states, enc_attention_mask, self.seq_length)
        log_probs = self.reshape(log_probs, (self.batch_size, self.beam_width, self.vocab_size))

        # select topk indices
        total_log_probs = self.add(log_probs, self.expand(state_log_probs, -1))

        # mask finished beams
        mask_tensor = self.select(state_finished, self.ninf_tensor, self.zero_tensor)
        total_log_probs = self.add(total_log_probs, self.expand(mask_tensor, -1))

        # reshape scores to [batch, beam*vocab]
        flat_scores = self.reshape(total_log_probs, self.flat_shape)
        # select topk
        topk_scores, topk_indices = self.topk(flat_scores, self.beam_width)

        temp = topk_indices
        beam_indices = self.zeroslike(topk_indices)
        for _ in range(self.beam_width - 1):
            temp = self.sub(temp, self.vocab_size_tensor)
            res = self.cast(self.greater_equal(temp, 0), mstype.int32)
            beam_indices = beam_indices + res
        word_indices = topk_indices - beam_indices * self.vocab_size_tensor
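        # The loop above recovers beam_indices = topk_indices // vocab_size by repeated
        # subtraction (each subtraction that stays non-negative adds 1), and the line
        # above then gives word_indices = topk_indices % vocab_size, splitting each
        # flat top-k index back into (source beam, token id).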
        #======================================================================

        # mask finished indices
        beam_indices = self.select(state_finished, self.beam_ids, beam_indices)
        word_indices = self.select(state_finished, self.eos_ids, word_indices)
        topk_scores = self.select(state_finished, state_log_probs, topk_scores)

        ###### put finished sequences to the end
        # sort according to scores with -inf for finished beams
        tmp_log_probs = self.select(
            self.equal(word_indices, self.eos_ids),
            self.ninf_tensor,
            topk_scores)
        _, tmp_indices = self.topk(tmp_log_probs, self.beam_width)
        # update
        tmp_gather_indices = self.concat((self.expand(self.batch_ids, -1), self.expand(tmp_indices, -1)))
        beam_indices = self.gather_nd(beam_indices, tmp_gather_indices)
        word_indices = self.gather_nd(word_indices, tmp_gather_indices)
        topk_scores = self.gather_nd(topk_scores, tmp_gather_indices)

        ###### generate new beam_search states
        # gather indices for selecting alive beams
        gather_indices = self.concat((self.expand(self.batch_ids, -1), self.expand(beam_indices, -1)))

        # length add 1 if not finished in the previous step
        length_add = self.add(state_length, self.one)
        state_length = self.select(state_finished, state_length, length_add)
        state_length = self.gather_nd(state_length, gather_indices)

        # concat seq
        seq = self.gather_nd(state_seq, gather_indices)
        state_seq = self.concat((seq, self.expand(word_indices, -1)))

        # new finished flag and log_probs
        state_finished = self.equal(word_indices, self.eos_ids)
        state_log_probs = topk_scores

        ###### generate new inputs and decoder states
        cur_input_ids = self.reshape(state_seq, (self.batch_size*self.beam_width, -1))
        return cur_input_ids, state_log_probs, state_seq, state_finished, state_length

    def construct(self, enc_states, enc_attention_mask):
        """Get beam search result."""
        cur_input_ids = self.start_ids
        # beam search states
        state_log_probs = self.init_scores
        state_seq = self.init_seq
        state_finished = self.init_finished
        state_length = self.init_length

        for _ in range(self.max_decode_length):
            # run one step decoder to get outputs of the current step
            # shape [batch*beam, 1, vocab]
            cur_input_ids, state_log_probs, state_seq, state_finished, state_length = self.one_step(
                cur_input_ids, enc_states, enc_attention_mask, state_log_probs, state_seq, state_finished, state_length)

        # add length penalty scores
        penalty_len = self.length_penalty(state_length)
        # get penalty length
        log_probs = self.real_div(state_log_probs, penalty_len)

        # sort according to scores
        _, top_beam_indices = self.topk(log_probs, self.beam_width)
        gather_indices = self.concat((self.expand(self.batch_ids, -1), self.expand(top_beam_indices, -1)))
        # sort sequence
        predicted_ids = self.gather_nd(state_seq, gather_indices)
        # take the first one
        predicted_ids = predicted_ids[::, 0:1:1, ::]
        return predicted_ids
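
# A minimal usage sketch (wiring is illustrative; the decoder construction and the
# sizes below are assumptions, not part of this file):
#   decoder = TransformerDecoderStep(...)   # any Cell returning per-step log-probs
#   searcher = BeamSearchDecoder(batch_size=1, seq_length=128, vocab_size=36560,
#                                decoder=decoder, beam_width=4)
#   predicted_ids = searcher(enc_states, enc_attention_mask)
#   # shape [batch, 1, max_decode_length + 1]: the best beam, sos id included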

@@ -1,4 +1,4 @@
# Copyright 2020 Huawei Technologies Co., Ltd
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -0,0 +1,58 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Data operations, will be used in train.py."""

import mindspore.common.dtype as mstype
import mindspore.dataset as de
import mindspore.dataset.transforms.c_transforms as deC
from .config import transformer_net_cfg, transformer_net_cfg_gpu

de.config.set_seed(1)

def create_transformer_dataset(epoch_count=1, rank_size=1, rank_id=0, do_shuffle="true", dataset_path=None,
                               bucket_boundaries=None, device_target="Ascend"):
    """create dataset"""
    def batch_per_bucket(bucket_len, dataset_path):
        dataset_path = dataset_path + "_" + str(bucket_len) + "_00"
        ds = de.MindDataset(dataset_path,
                            columns_list=["source_eos_ids", "source_eos_mask",
                                          "target_sos_ids", "target_sos_mask",
                                          "target_eos_ids", "target_eos_mask"],
                            shuffle=(do_shuffle == "true"), num_shards=rank_size, shard_id=rank_id)
        type_cast_op = deC.TypeCast(mstype.int32)
        ds = ds.map(operations=type_cast_op, input_columns="source_eos_ids")
        ds = ds.map(operations=type_cast_op, input_columns="source_eos_mask")
        ds = ds.map(operations=type_cast_op, input_columns="target_sos_ids")
        ds = ds.map(operations=type_cast_op, input_columns="target_sos_mask")
        ds = ds.map(operations=type_cast_op, input_columns="target_eos_ids")
        ds = ds.map(operations=type_cast_op, input_columns="target_eos_mask")

        # apply batch operations
        if device_target == "Ascend":
            ds = ds.batch(transformer_net_cfg.batch_size, drop_remainder=True)
        else:
            ds = ds.batch(transformer_net_cfg_gpu.batch_size, drop_remainder=True)

        ds = ds.repeat(epoch_count)
        return ds

    for i, _ in enumerate(bucket_boundaries):
        bucket_len = bucket_boundaries[i]
        ds_per = batch_per_bucket(bucket_len, dataset_path)
        if i == 0:
            ds = ds_per
        else:
            ds = ds + ds_per
    ds = ds.shuffle(ds.get_dataset_size())
    ds.channel_name = 'transformer'
    return ds
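
# File naming convention implied by batch_per_bucket: each bucket length L reads
# "<dataset_path>_L_00", so with dataset_path ".../ende-l128-mindrecord" and
# bucket_boundaries [16, 32, 48, 64, 128] the loader expects files such as
# ".../ende-l128-mindrecord_16_00" through ".../ende-l128-mindrecord_128_00";
# the per-bucket datasets are then concatenated and shuffled together.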

@@ -1,4 +1,4 @@
# Copyright 2020 Huawei Technologies Co., Ltd
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -0,0 +1,52 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Learning rate utilities."""

def linear_warmup(warmup_steps, current_step):
    return min([1.0, float(current_step)/float(warmup_steps)])

def rsqrt_decay(warmup_steps, current_step):
    return float(max([current_step, warmup_steps])) ** -0.5

def rsqrt_hidden(hidden_size):
    return float(hidden_size) ** -0.5

def create_dynamic_lr(schedule, training_steps, learning_rate, warmup_steps, hidden_size,
                      start_decay_step=0, min_lr=0.):
    """
    Generate dynamic learning rate.
    """
    if start_decay_step < warmup_steps:
        start_decay_step = warmup_steps
    lr = []
    for current_step in range(1, training_steps+1):
        cur_lr = 1.0
        for name in schedule.split("*"):
            if name == "constant":
                cur_lr *= float(learning_rate)
            elif name == "rsqrt_hidden":
                cur_lr *= rsqrt_hidden(hidden_size)
            elif name == "linear_warmup":
                cur_lr *= linear_warmup(warmup_steps, current_step)
            elif name == "rsqrt_decay":
                cur_lr *= rsqrt_decay(warmup_steps, current_step-start_decay_step+warmup_steps)
            else:
                raise ValueError("unknown learning rate schedule")
        if warmup_steps < current_step < start_decay_step:
            cur_lr = lr[-1]
        if current_step > warmup_steps:
            cur_lr = max([cur_lr, min_lr])
        lr.append(cur_lr)
    return lr
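
# Worked example (numbers are illustrative): with schedule
# "constant*rsqrt_hidden*linear_warmup*rsqrt_decay", learning_rate=2.0,
# hidden_size=1024, warmup_steps=4000 and start_decay_step clamped to 4000,
# step 4000 yields 2.0 * 1024**-0.5 * 1.0 * 4000**-0.5 ≈ 9.9e-4, after which
# the rate decays as 1/sqrt(step), never dropping below min_lr.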

@@ -0,0 +1,47 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Convert ids to tokens."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import sys

import tokenization

# Explicitly set the encoding
sys.stdin = open(sys.stdin.fileno(), mode='r', encoding='utf-8', buffering=True)
sys.stdout = open(sys.stdout.fileno(), mode='w', encoding='utf-8', buffering=True)

def main():
    parser = argparse.ArgumentParser(
        description="rescore nbest with smoothed sentence-level bleu.")
parser.add_argument("--vocab_file", type=str, default="", required=True, help="vocab file path.")
|
||||
args = parser.parse_args()
|
||||
|
||||
tokenizer = tokenization.WhiteSpaceTokenizer(vocab_file=args.vocab_file)
|
||||
|
||||
for line in sys.stdin:
|
||||
token_ids = [int(x) for x in line.strip().split()]
|
||||
tokens = tokenizer.convert_ids_to_tokens(token_ids)
|
||||
sent = " ".join(tokens)
|
||||
sent = sent.split("<s>")[-1]
|
||||
sent = sent.split("</s>")[0]
|
||||
print(sent.strip())
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
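
# Typical invocation (script and file names are assumptions): pipe one line of
# space-separated token ids per sentence into the script, e.g.
#   cat output_ids.txt | python process_output.py --vocab_file vocab.bpe.32000 > output.txt
# Everything before the last "<s>" and after the first "</s>" is stripped.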

@@ -0,0 +1,158 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Tokenization utilities."""

import sys
import collections
import unicodedata

def convert_to_printable(text):
    """
    Converts `text` to a printable coding format.
    """
    if sys.version_info[0] == 3:
        if isinstance(text, str):
            return text
        if isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
        raise ValueError("Only support type `str` or `bytes`, while text type is `%s`" % (type(text)))
    if sys.version_info[0] == 2:
        if isinstance(text, str):
            return text
        if isinstance(text, unicode):
            return text.encode("utf-8")
        raise ValueError("Only support type `str` or `unicode`, while text type is `%s`" % (type(text)))
    raise ValueError("Only supported when running on Python2 or Python3.")


def convert_to_unicode(text):
    """
    Converts `text` to Unicode format.
    """
    if sys.version_info[0] == 3:
        if isinstance(text, str):
            return text
        if isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
        raise ValueError("Only support type `str` or `bytes`, while text type is `%s`" % (type(text)))
    if sys.version_info[0] == 2:
        if isinstance(text, str):
            return text.decode("utf-8", "ignore")
        if isinstance(text, unicode):
            return text
        raise ValueError("Only support type `str` or `unicode`, while text type is `%s`" % (type(text)))
    raise ValueError("Only supported when running on Python2 or Python3.")


def load_vocab_file(vocab_file):
    """
    Loads a vocabulary file and turns into a {token:id} dictionary.
    """
    vocab_dict = collections.OrderedDict()
    index = 0
    with open(vocab_file, "r") as vocab:
        while True:
            token = convert_to_unicode(vocab.readline())
            if not token:
                break
            token = token.strip()
            vocab_dict[token] = index
            index += 1
    return vocab_dict


def convert_by_vocab_dict(vocab_dict, items):
    """
    Converts a sequence of [tokens|ids] according to the vocab dict.
    """
    output = []
    for item in items:
        if item in vocab_dict:
            output.append(vocab_dict[item])
        else:
            output.append(vocab_dict["<unk>"])
    return output


class WhiteSpaceTokenizer():
    """
    Whitespace tokenizer.
    """
    def __init__(self, vocab_file):
        self.vocab_dict = load_vocab_file(vocab_file)
        self.inv_vocab_dict = {index: token for token, index in self.vocab_dict.items()}
    def _is_whitespace_char(self, char):
        """
        Checks if it is a whitespace character (regards "\t", "\n", "\r" as whitespace here).
        """
        if char in (" ", "\t", "\n", "\r"):
            return True
        uni = unicodedata.category(char)
        if uni == "Zs":
            return True
        return False

    def _is_control_char(self, char):
        """
        Checks if it is a control character.
        """
        if char in ("\t", "\n", "\r"):
            return False
        uni = unicodedata.category(char)
        if uni in ("Cc", "Cf"):
            return True
        return False

    def _clean_text(self, text):
        """
        Remove invalid characters and cleanup whitespace.
        """
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xfffd or self._is_control_char(char):
                continue
            if self._is_whitespace_char(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _whitespace_tokenize(self, text):
        """
        Clean whitespace and split text into tokens.
        """
        text = text.strip()
        if not text:
            tokens = []
        else:
            tokens = text.split()
        return tokens

    def tokenize(self, text):
        """
        Tokenizes text.
        """
        text = convert_to_unicode(text)
        text = self._clean_text(text)
        tokens = self._whitespace_tokenize(text)
        return tokens

    def convert_tokens_to_ids(self, tokens):
        return convert_by_vocab_dict(self.vocab_dict, tokens)

    def convert_ids_to_tokens(self, ids):
        return convert_by_vocab_dict(self.inv_vocab_dict, ids)
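
# Usage sketch (vocab layout assumed: one token per line, ids assigned in file
# order, with an "<unk>" entry present):
#   tokenizer = WhiteSpaceTokenizer("vocab.txt")
#   ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Hello world"))
#   tokens = tokenizer.convert_ids_to_tokens(ids)  # unseen tokens map to "<unk>"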

@@ -0,0 +1,472 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Transformer for training."""
import numpy as np

from mindspore.common.initializer import initializer
import mindspore.nn as nn
from mindspore.ops import operations as P
from mindspore.ops import functional as F
from mindspore.ops import composite as C
from mindspore.common.tensor import Tensor
from mindspore.common.parameter import Parameter
from mindspore.common import dtype as mstype
from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
from mindspore.communication.management import get_group_size
from mindspore.context import ParallelMode
from mindspore import context

from .transformer_model import TransformerModel

GRADIENT_CLIP_TYPE = 1
GRADIENT_CLIP_VALUE = 5.0

clip_grad = C.MultitypeFuncGraph("clip_grad")


@clip_grad.register("Number", "Number", "Tensor")
def _clip_grad(clip_type, clip_value, grad):
    """
    Clip gradients.

    Inputs:
        clip_type (int): The way to clip, 0 for 'value', 1 for 'norm'.
        clip_value (float): Specifies how much to clip.
        grad (tuple[Tensor]): Gradients.

    Outputs:
        tuple[Tensor], clipped gradients.
    """
    if clip_type not in (0, 1):
        return grad
    dt = F.dtype(grad)
    if clip_type == 0:
        new_grad = C.clip_by_value(grad, F.cast(F.tuple_to_array((-clip_value,)), dt),
                                   F.cast(F.tuple_to_array((clip_value,)), dt))
    else:
        new_grad = nn.ClipByNorm()(grad, F.cast(F.tuple_to_array((clip_value,)), dt))
    return new_grad


class TransformerTrainingLoss(nn.Cell):
    """
    Provide transformer training loss.

    Args:
        config (TransformerConfig): The config of Transformer.

    Returns:
        Tensor, total loss.
    """
    def __init__(self, config):
        super(TransformerTrainingLoss, self).__init__(auto_prefix=False)
        self.vocab_size = config.vocab_size
        self.onehot = P.OneHot()
        self.on_value = Tensor(float(1 - config.label_smoothing), mstype.float32)
        self.off_value = Tensor(config.label_smoothing / float(self.vocab_size - 1), mstype.float32)
        self.reduce_sum = P.ReduceSum()
        self.reduce_mean = P.ReduceMean()
        self.reshape = P.Reshape()
        self.last_idx = (-1,)
        self.flatten = P.Flatten()
        self.neg = P.Neg()
        self.cast = P.Cast()
        self.batch_size = config.batch_size

    def construct(self, prediction_scores, label_ids, label_weights, seq_length):
        """Defines the computation performed."""
        flat_shape = (self.batch_size * seq_length,)
        label_ids = self.reshape(label_ids, flat_shape)
        label_weights = self.cast(self.reshape(label_weights, flat_shape), mstype.float32)
        one_hot_labels = self.onehot(label_ids, self.vocab_size, self.on_value, self.off_value)

        per_example_loss = self.neg(self.reduce_sum(prediction_scores * one_hot_labels, self.last_idx))
        numerator = self.reduce_sum(label_weights * per_example_loss, ())
        denominator = self.reduce_sum(label_weights, ()) + \
                      self.cast(F.tuple_to_array((1e-5,)), mstype.float32)
        loss = numerator / denominator
        return loss
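
# Note on TransformerTrainingLoss: the smoothed target distribution puts
# (1 - label_smoothing) on the gold token and label_smoothing / (vocab_size - 1)
# on every other id; the loss is the padding-weighted mean of
# -sum(targets * log_probs), and the 1e-5 term guards against an all-padding batch.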


class TransformerNetworkWithLoss(nn.Cell):
    """
    Provide transformer training loss through network.

    Args:
        config (TransformerConfig): The config of Transformer.
        is_training (bool): Specifies whether to use the training mode.
        use_one_hot_embeddings (bool): Specifies whether to use one-hot for embeddings. Default: False.

    Returns:
        Tensor, the loss of the network.
    """
    def __init__(self, config, is_training, use_one_hot_embeddings=False):
        super(TransformerNetworkWithLoss, self).__init__(auto_prefix=False)
        self.transformer = TransformerModel(config, is_training, use_one_hot_embeddings)
        self.loss = TransformerTrainingLoss(config)
        self.cast = P.Cast()
        self.shape = P.Shape()

    def construct(self,
                  source_ids,
                  source_mask,
                  target_ids,
                  target_mask,
                  label_ids,
                  label_weights):
        """Transformer network with loss."""
        prediction_scores = self.transformer(source_ids, source_mask, target_ids, target_mask)
        seq_length = self.shape(source_ids)[1]
        total_loss = self.loss(prediction_scores, label_ids, label_weights, seq_length)
        return self.cast(total_loss, mstype.float32)


class TransformerTrainOneStepCell(nn.TrainOneStepCell):
    """
    Encapsulation class of transformer network training.

    Append an optimizer to the training network after that the construct
    function can be called to create the backward graph.

    Args:
        network (Cell): The training network. Note that loss function should have been added.
        optimizer (Optimizer): Optimizer for updating the weights.
        sens (Number): The adjust parameter. Default: 1.0.
    """
    def __init__(self, network, optimizer, sens=1.0):
        super(TransformerTrainOneStepCell, self).__init__(network, optimizer, sens)

        self.cast = P.Cast()
        self.hyper_map = C.HyperMap()

    def set_sens(self, value):
        self.sens = value

    def construct(self,
                  source_eos_ids,
                  source_eos_mask,
                  target_sos_ids,
                  target_sos_mask,
                  target_eos_ids,
                  target_eos_mask,):
        """Defines the computation performed."""
        source_ids = source_eos_ids
        source_mask = source_eos_mask
        target_ids = target_sos_ids
        target_mask = target_sos_mask
        label_ids = target_eos_ids
        label_weights = target_eos_mask

        weights = self.weights
        loss = self.network(source_ids,
                            source_mask,
                            target_ids,
                            target_mask,
                            label_ids,
                            label_weights)
        grads = self.grad(self.network, weights)(source_ids,
                                                 source_mask,
                                                 target_ids,
                                                 target_mask,
                                                 label_ids,
                                                 label_weights,
                                                 self.cast(F.tuple_to_array((self.sens,)),
                                                           mstype.float32))
        grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
        # apply grad reducer on grads
        grads = self.grad_reducer(grads)
        succ = self.optimizer(grads)
        return F.depend(loss, succ)


grad_scale = C.MultitypeFuncGraph("grad_scale")
reciprocal = P.Reciprocal()


@grad_scale.register("Tensor", "Tensor")
def tensor_grad_scale(scale, grad):
    return grad * F.cast(reciprocal(scale), F.dtype(grad))
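
# grad_scale undoes loss scaling: the backward pass was run against a loss
# multiplied by `scale` (times the allreduce degree in distributed mode), so each
# gradient is multiplied by 1/scale here to recover its true magnitude before clipping.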

_grad_overflow = C.MultitypeFuncGraph("_grad_overflow")
grad_overflow = P.FloatStatus()

@_grad_overflow.register("Tensor")
def _tensor_grad_overflow(grad):
    return grad_overflow(grad)

class TransformerTrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell):
    """
    Encapsulation class of Transformer network training.

    Append an optimizer to the training network after that the construct
    function can be called to create the backward graph.

    Args:
        network (Cell): The training network. Note that loss function should have been added.
        optimizer (Optimizer): Optimizer for updating the weights.
        scale_update_cell (Cell): Cell to do the loss scale. Default: None.
    """
    def __init__(self, network, optimizer, scale_update_cell=None):
        super(TransformerTrainOneStepWithLossScaleCell, self).__init__(network, optimizer, scale_update_cell)
        self.cast = P.Cast()
        self.degree = 1
        if self.reducer_flag:
            self.degree = get_group_size()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)

        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32))

    def construct(self,
                  source_eos_ids,
                  source_eos_mask,
                  target_sos_ids,
                  target_sos_mask,
                  target_eos_ids,
                  target_eos_mask,
                  sens=None):
        """Defines the computation performed."""
        source_ids = source_eos_ids
        source_mask = source_eos_mask
        target_ids = target_sos_ids
        target_mask = target_sos_mask
        label_ids = target_eos_ids
        label_weights = target_eos_mask

        weights = self.weights
        loss = self.network(source_ids,
                            source_mask,
                            target_ids,
                            target_mask,
                            label_ids,
                            label_weights)
        if sens is None:
            scaling_sens = self.loss_scale
        else:
            scaling_sens = sens
        status, scaling_sens = self.start_overflow_check(loss, scaling_sens)
        grads = self.grad(self.network, weights)(source_ids,
                                                 source_mask,
                                                 target_ids,
                                                 target_mask,
                                                 label_ids,
                                                 label_weights,
                                                 self.cast(scaling_sens,
                                                           mstype.float32))

        # apply grad reducer on grads
        grads = self.grad_reducer(grads)
        grads = self.hyper_map(F.partial(grad_scale, scaling_sens * self.degree), grads)
        grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)

        cond = self.get_overflow_status(status, grads)
        overflow = cond
        if sens is None:
            overflow = self.loss_scaling_manager(self.loss_scale, cond)
        if overflow:
            succ = False
        else:
            succ = self.optimizer(grads)
        ret = (loss, cond, scaling_sens)
        return F.depend(ret, succ)
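
# Dynamic loss-scaling flow: the backward pass runs with the loss multiplied by
# scaling_sens; if any gradient overflowed (inf/nan), the optimizer step is skipped
# and the DynamicLossScaleManager shrinks the scale by scale_factor, otherwise the
# scale may grow again after scale_window consecutive clean steps.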


cast = P.Cast()
add_grads = C.MultitypeFuncGraph("add_grads")


@add_grads.register("Tensor", "Tensor")
def _add_grads(accu_grad, grad):
    return accu_grad + cast(grad, mstype.float32)

update_accu_grads = C.MultitypeFuncGraph("update_accu_grads")

@update_accu_grads.register("Tensor", "Tensor")
def _update_accu_grads(accu_grad, grad):
    succ = True
    return F.depend(succ, F.assign(accu_grad, cast(grad, mstype.float32)))

accumulate_accu_grads = C.MultitypeFuncGraph("accumulate_accu_grads")

@accumulate_accu_grads.register("Tensor", "Tensor")
def _accumulate_accu_grads(accu_grad, grad):
    succ = True
    return F.depend(succ, F.assign_add(accu_grad, cast(grad, mstype.float32)))


zeroslike = P.ZerosLike()
reset_accu_grads = C.MultitypeFuncGraph("reset_accu_grads")


@reset_accu_grads.register("Tensor")
def _reset_accu_grads(accu_grad):
    succ = True
    return F.depend(succ, F.assign(accu_grad, zeroslike(accu_grad)))


class TransformerTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell):
    """
    Encapsulation class of transformer network training.

    Append an optimizer to the training network after that the construct
    function can be called to create the backward graph.

    To mimic higher batch size, gradients are accumulated N times before weight update.

    For distribution mode, allreduce will only be implemented in the weight updated step,
    i.e. the sub-step after gradients accumulated N times.

    Args:
        network (Cell): The training network. Note that loss function should have been added.
        optimizer (Optimizer): Optimizer for updating the weights.
        scale_update_cell (Cell): Cell to do the loss scale. Default: None.
        accumulation_steps (int): Number of accumulation steps before gradient update. The global batch size =
                                  batch_size * accumulation_steps. Default: 1.
    """

    def __init__(self, network, optimizer, scale_update_cell=None, accumulation_steps=8, enable_global_norm=False):
        super(TransformerTrainAccumulationAllReducePostWithLossScaleCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.accumulation_steps = accumulation_steps
        self.enable_global_norm = enable_global_norm
        self.one = Tensor(np.array([1]).astype(np.int32))
        self.zero = Tensor(np.array([0]).astype(np.int32))
        self.local_step = Parameter(initializer(0, [1], mstype.int32))
        self.accu_grads = self.weights.clone(prefix="accu_grads", init='zeros')
        self.accu_overflow = Parameter(initializer(0, [1], mstype.int32))
        self.accu_loss = Parameter(initializer(0, [1], mstype.float32))

        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.reducer_flag = False
        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
        if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
            self.reducer_flag = True
        self.grad_reducer = F.identity
        self.degree = 1
        if self.reducer_flag:
            self.degree = get_group_size()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)
        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
        self.overflow_reducer = F.identity
        if self.is_distributed:
            self.overflow_reducer = P.AllReduce()
        self.cast = P.Cast()
        self.alloc_status = P.NPUAllocFloatStatus()
        self.get_status = P.NPUGetFloatStatus()
        self.clear_status = P.NPUClearFloatStatus()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = P.LessEqual()
        self.logical_or = P.LogicalOr()
        self.not_equal = P.NotEqual()
        self.select = P.Select()
        self.reshape = P.Reshape()
        self.hyper_map = C.HyperMap()
        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32))

    def construct(self,
                  source_eos_ids,
                  source_eos_mask,
                  target_sos_ids,
                  target_sos_mask,
                  target_eos_ids,
                  target_eos_mask,
                  sens=None):
        """Defines the computation performed."""
        source_ids = source_eos_ids
        source_mask = source_eos_mask
        target_ids = target_sos_ids
        target_mask = target_sos_mask
        label_ids = target_eos_ids
        label_weights = target_eos_mask

        weights = self.weights
        loss = self.network(source_ids,
                            source_mask,
                            target_ids,
                            target_mask,
                            label_ids,
                            label_weights)
        if sens is None:
            scaling_sens = self.loss_scale
        else:
            scaling_sens = sens
        # alloc status and clear should be right before gradoperation
        init = self.alloc_status()
        init = F.depend(init, loss)
        clear_status = self.clear_status(init)
        scaling_sens = F.depend(scaling_sens, clear_status)
        # update accumulation parameters
        is_accu_step = self.not_equal(self.local_step, self.accumulation_steps)
        self.local_step = self.select(is_accu_step, self.local_step + self.one, self.one)
        self.accu_loss = self.select(is_accu_step, self.accu_loss + loss, loss)
        mean_loss = self.accu_loss / self.local_step
        is_accu_step = self.not_equal(self.local_step, self.accumulation_steps)

        grads = self.grad(self.network, weights)(source_ids,
                                                 source_mask,
                                                 target_ids,
                                                 target_mask,
                                                 label_ids,
                                                 label_weights,
                                                 self.cast(scaling_sens,
                                                           mstype.float32))

        accu_succ = self.hyper_map(accumulate_accu_grads, self.accu_grads, grads)
        mean_loss = F.depend(mean_loss, accu_succ)

        init = F.depend(init, mean_loss)
        get_status = self.get_status(init)
        init = F.depend(init, get_status)
        flag_sum = self.reduce_sum(init, (0,))
        overflow = self.less_equal(self.base, flag_sum)
        overflow = self.logical_or(self.not_equal(self.accu_overflow, self.zero), overflow)
        accu_overflow = self.select(overflow, self.one, self.zero)
        self.accu_overflow = self.select(is_accu_step, accu_overflow, self.zero)

        if is_accu_step:
            succ = False
        else:
            # apply grad reducer on grads
            grads = self.grad_reducer(self.accu_grads)
            scaling = scaling_sens * self.degree * self.accumulation_steps
            grads = self.hyper_map(F.partial(grad_scale, scaling), grads)
            if self.enable_global_norm:
                grads = C.clip_by_global_norm(grads, 1.0, None)
            else:
                grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
            accu_overflow = F.depend(accu_overflow, grads)
            accu_overflow = self.overflow_reducer(accu_overflow)
            overflow = self.less_equal(self.base, accu_overflow)
            accu_succ = self.hyper_map(reset_accu_grads, self.accu_grads)
            overflow = F.depend(overflow, accu_succ)
            overflow = self.reshape(overflow, (()))
            if sens is None:
                overflow = self.loss_scaling_manager(self.loss_scale, overflow)
            if overflow:
                succ = False
            else:
                succ = self.optimizer(grads)

        ret = (mean_loss, overflow, scaling_sens)
        return F.depend(ret, succ)
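
# Accumulation flow: for accumulation_steps N, each micro-step adds its scaled
# gradients into accu_grads and only records overflow; on the N-th step the
# accumulated gradients are allreduced, rescaled by 1/(loss_scale * degree * N),
# clipped, and applied in a single optimizer update, then accu_grads is reset.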

File diff suppressed because it is too large
@@ -0,0 +1,52 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Weight init utilities."""

import math
import numpy as np
from mindspore.common.tensor import Tensor

def _average_units(shape):
    """
    Average shape dim.
    """
    if not shape:
        return 1.
    if len(shape) == 1:
        return float(shape[0])
    if len(shape) == 2:
        return float(shape[0] + shape[1]) / 2.
    raise RuntimeError("not support shape.")

def weight_variable(shape):
    scale_shape = shape
    avg_units = _average_units(scale_shape)
    scale = 1.0 / max(1., avg_units)
    limit = math.sqrt(3.0 * scale)
    values = np.random.uniform(-limit, limit, shape).astype(np.float32)
    return Tensor(values)

def one_weight(shape):
    ones = np.ones(shape).astype(np.float32)
    return Tensor(ones)

def zero_weight(shape):
    zeros = np.zeros(shape).astype(np.float32)
    return Tensor(zeros)

def normal_weight(shape, num_units):
    norm = np.random.normal(0.0, num_units**-0.5, shape).astype(np.float32)
    return Tensor(norm)
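
# weight_variable draws from U(-sqrt(3/avg_units), +sqrt(3/avg_units)), which has
# variance 1/avg_units (a Glorot/LeCun-style uniform init); normal_weight uses
# std = num_units**-0.5, a common choice for Transformer embedding tables.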

@@ -27,12 +27,10 @@ from mindspore.train.callback import Callback
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as deC
from mindspore import context
from model_zoo.official.nlp.transformer.src.transformer_model import TransformerConfig
from model_zoo.official.nlp.transformer.src.transformer_for_train import TransformerNetworkWithLoss, \
    TransformerTrainOneStepWithLossScaleCell
from model_zoo.official.nlp.transformer.src.config import cfg, transformer_net_cfg
from model_zoo.official.nlp.transformer.src.lr_schedule import create_dynamic_lr

from src.transformer_model import TransformerConfig
from src.transformer_for_train import TransformerNetworkWithLoss, TransformerTrainOneStepWithLossScaleCell
from src.config import cfg, transformer_net_cfg
from src.lr_schedule import create_dynamic_lr
from tests.st.model_zoo_tests import utils

DATA_DIR = ["/home/workspace/mindspore_dataset/transformer/test-mindrecord"]
@@ -216,14 +214,14 @@ def test_transformer_export_mindir():
    export_file = "transformer80_bs_0"
    ckpt_path = os.path.join(utils.ckpt_root, "transformer/transformer_trained.ckpt")
    print("ckpt_path:", ckpt_path)
    old_list = ["'model_file': '/your/path/checkpoint_file'"]
    new_list = ["'model_file': '{}'".format(ckpt_path)]
    utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "src/eval_config.py"))
    old_list = ["context.set_context(device_id=args.device_id)"]
    old_list = ["model_file: './transformer/transformer_trained.ckpt'"]
    new_list = ["model_file: '{}'".format(ckpt_path)]
    utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "default_config_large.yaml"))
    old_list = ["context.set_context(device_id=get_device_id())"]
    new_list = ["context.set_context()"]
    utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "export.py"))
    exec_export_shell = "cd transformer; python -u export.py --file_name={}" \
                        " --file_format=MINDIR".format(export_file)
                        " --file_format=MINDIR --config_path=./default_config_large.yaml".format(export_file)
    os.system(exec_export_shell)
    assert os.path.exists(os.path.join(cur_model_path, "{}.mindir".format(export_file)))