From 8c88411ce65a05fa5e279e9a5b60f7a138594ac8 Mon Sep 17 00:00:00 2001 From: huchunmei Date: Wed, 21 Jul 2021 14:03:01 +0800 Subject: [PATCH] clould --- .../model_zoo_tests/transformer/__init__.py | 0 .../transformer/src/__init__.py | 0 .../transformer/src/beam_search.py | 281 ---- .../model_zoo_tests/transformer/src/config.py | 86 -- .../transformer/src/dataset.py | 58 - .../transformer/src/eval_config.py | 67 - .../transformer/src/lr_schedule.py | 52 - .../transformer/src/process_output.py | 47 - .../transformer/src/tokenization.py | 158 --- .../transformer/src/transformer_for_train.py | 472 ------- .../transformer/src/transformer_model.py | 1153 ----------------- .../transformer/src/weight_init.py | 52 - .../transformer/test_transformer.py | 32 +- 13 files changed, 24 insertions(+), 2434 deletions(-) delete mode 100644 tests/st/model_zoo_tests/transformer/__init__.py delete mode 100644 tests/st/model_zoo_tests/transformer/src/__init__.py delete mode 100644 tests/st/model_zoo_tests/transformer/src/beam_search.py delete mode 100644 tests/st/model_zoo_tests/transformer/src/config.py delete mode 100644 tests/st/model_zoo_tests/transformer/src/dataset.py delete mode 100644 tests/st/model_zoo_tests/transformer/src/eval_config.py delete mode 100644 tests/st/model_zoo_tests/transformer/src/lr_schedule.py delete mode 100644 tests/st/model_zoo_tests/transformer/src/process_output.py delete mode 100644 tests/st/model_zoo_tests/transformer/src/tokenization.py delete mode 100644 tests/st/model_zoo_tests/transformer/src/transformer_for_train.py delete mode 100644 tests/st/model_zoo_tests/transformer/src/transformer_model.py delete mode 100644 tests/st/model_zoo_tests/transformer/src/weight_init.py diff --git a/tests/st/model_zoo_tests/transformer/__init__.py b/tests/st/model_zoo_tests/transformer/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tests/st/model_zoo_tests/transformer/src/__init__.py b/tests/st/model_zoo_tests/transformer/src/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tests/st/model_zoo_tests/transformer/src/beam_search.py b/tests/st/model_zoo_tests/transformer/src/beam_search.py deleted file mode 100644 index 53c765f7223..00000000000 --- a/tests/st/model_zoo_tests/transformer/src/beam_search.py +++ /dev/null @@ -1,281 +0,0 @@ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Transformer beam search module.""" - -import numpy as np -import mindspore.common.dtype as mstype -import mindspore.nn as nn -from mindspore.ops import operations as P -from mindspore.common.tensor import Tensor - -INF = 1. * 1e9 - -class LengthPenalty(nn.Cell): - """ - Normalize scores of translations according to their length. - - Args: - weight (float): Weight of length penalty. Default: 1.0. - compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mstype.float32. 
- """ - def __init__(self, - weight=1.0, - compute_type=mstype.float32): - super(LengthPenalty, self).__init__() - self.weight = weight - self.add = P.Add() - self.pow = P.Pow() - self.div = P.RealDiv() - self.cast = P.Cast() - self.five = Tensor(5.0, mstype.float32) - self.six = Tensor(6.0, mstype.float32) - - def construct(self, length_tensor): - length_tensor = self.cast(length_tensor, mstype.float32) - output = self.add(length_tensor, self.five) - output = self.div(output, self.six) - output = self.pow(output, self.weight) - return output - - -class TileBeam(nn.Cell): - """ - TileBeam. - - Args: - beam_width (int): beam width setting. Default: 4. - compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mstype.float32. - """ - def __init__(self, - beam_width, - compute_type=mstype.float32): - super(TileBeam, self).__init__() - self.beam_width = beam_width - self.expand = P.ExpandDims() - self.tile = P.Tile() - self.reshape = P.Reshape() - self.shape = P.Shape() - - def construct(self, input_tensor): - """ - input_tensor: shape [batch, dim1, dim2] - output_tensor: shape [batch*beam, dim1, dim2] - """ - shape = self.shape(input_tensor) - input_tensor = self.expand(input_tensor, 1) - tile_shape = (1,) + (self.beam_width,) - for _ in range(len(shape)-1): - tile_shape = tile_shape + (1,) - output = self.tile(input_tensor, tile_shape) - out_shape = (shape[0]*self.beam_width,) + shape[1:] - output = self.reshape(output, out_shape) - return output - - -class Mod(nn.Cell): - """ - Mod function. - - Args: - compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mstype.float32. - """ - def __init__(self, - compute_type=mstype.float32): - super(Mod, self).__init__() - self.compute_type = compute_type - self.floor_div = P.FloorDiv() - self.sub = P.Sub() - self.multiply = P.Mul() - - def construct(self, input_x, input_y): - x = self.floor_div(input_x, input_y) - x = self.multiply(x, input_y) - x = self.sub(input_x, x) - return x - - -class BeamSearchDecoder(nn.Cell): - """ - Beam search decoder. - - Args: - batch_size (int): Batch size of input dataset. - seq_length (int): Length of input sequence. - vocab_size (int): Size of vocabulary. - decoder (:class:`TransformerDecoderStep`): Decoder module. - beam_width (int): beam width setting. Default: 4. - length_penalty_weight (float): Weight of length penalty. Default: 1.0. - max_decode_length (int): max decode length. Default: 128. - sos_id (int): Id of sequence start token. Default: 1. - eos_id (int): Id of sequence end token. Default: 2. - compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mstype.float32. 
- """ - def __init__(self, - batch_size, - seq_length, - vocab_size, - decoder, - beam_width=4, - length_penalty_weight=1.0, - max_decode_length=128, - sos_id=1, - eos_id=2, - compute_type=mstype.float32): - super(BeamSearchDecoder, self).__init__(auto_prefix=False) - self.seq_length = seq_length - self.batch_size = batch_size - self.vocab_size = vocab_size - self.beam_width = beam_width - self.length_penalty_weight = length_penalty_weight - self.max_decode_length = max_decode_length - self.decoder = decoder - - self.add = P.Add() - self.expand = P.ExpandDims() - self.reshape = P.Reshape() - self.shape_flat = (-1,) - self.shape = P.Shape() - - self.zero_tensor = Tensor(np.zeros([batch_size, beam_width]), mstype.float32) - self.ninf_tensor = Tensor(np.full([batch_size, beam_width], -INF), mstype.float32) - - self.select = P.Select() - self.flat_shape = (batch_size, beam_width * vocab_size) - self.topk = P.TopK(sorted=True) - self.floor_div = P.FloorDiv() - self.vocab_size_tensor = Tensor(self.vocab_size, mstype.int32) - self.real_div = P.RealDiv() - self.mod = Mod() - self.equal = P.Equal() - self.eos_ids = Tensor(np.full([batch_size, beam_width], eos_id), mstype.int32) - - beam_ids = np.tile(np.arange(beam_width).reshape((1, beam_width)), [batch_size, 1]) - self.beam_ids = Tensor(beam_ids, mstype.int32) - batch_ids = np.arange(batch_size*beam_width).reshape((batch_size, beam_width)) // beam_width - self.batch_ids = Tensor(batch_ids, mstype.int32) - self.concat = P.Concat(axis=-1) - self.gather_nd = P.GatherNd() - - self.greater_equal = P.GreaterEqual() - self.sub = P.Sub() - self.cast = P.Cast() - self.zeroslike = P.ZerosLike() - - # init inputs and states - self.start_ids = Tensor(np.full([batch_size * beam_width, 1], sos_id), mstype.int32) - self.init_seq = Tensor(np.full([batch_size, beam_width, 1], sos_id), mstype.int32) - init_scores = np.tile(np.array([[0.] 
+ [-INF]*(beam_width-1)]), [batch_size, 1]) - self.init_scores = Tensor(init_scores, mstype.float32) - self.init_finished = Tensor(np.zeros([batch_size, beam_width], dtype=np.bool)) - self.init_length = Tensor(np.zeros([batch_size, beam_width], dtype=np.int32)) - self.length_penalty = LengthPenalty(weight=length_penalty_weight) - self.one = Tensor(1, mstype.int32) - - def one_step(self, cur_input_ids, enc_states, enc_attention_mask, state_log_probs, - state_seq, state_finished, state_length): - """ - One step for decode - """ - log_probs = self.decoder(cur_input_ids, enc_states, enc_attention_mask, self.seq_length) - log_probs = self.reshape(log_probs, (self.batch_size, self.beam_width, self.vocab_size)) - - # select topk indices - total_log_probs = self.add(log_probs, self.expand(state_log_probs, -1)) - - # mask finished beams - mask_tensor = self.select(state_finished, self.ninf_tensor, self.zero_tensor) - total_log_probs = self.add(total_log_probs, self.expand(mask_tensor, -1)) - - # reshape scores to [batch, beam*vocab] - flat_scores = self.reshape(total_log_probs, self.flat_shape) - # select topk - topk_scores, topk_indices = self.topk(flat_scores, self.beam_width) - - temp = topk_indices - beam_indices = self.zeroslike(topk_indices) - for _ in range(self.beam_width - 1): - temp = self.sub(temp, self.vocab_size_tensor) - res = self.cast(self.greater_equal(temp, 0), mstype.int32) - beam_indices = beam_indices + res - word_indices = topk_indices - beam_indices * self.vocab_size_tensor - #====================================================================== - - # mask finished indices - beam_indices = self.select(state_finished, self.beam_ids, beam_indices) - word_indices = self.select(state_finished, self.eos_ids, word_indices) - topk_scores = self.select(state_finished, state_log_probs, topk_scores) - - ###### put finished sequences to the end - # sort according to scores with -inf for finished beams - tmp_log_probs = self.select( - self.equal(word_indices, self.eos_ids), - self.ninf_tensor, - topk_scores) - _, tmp_indices = self.topk(tmp_log_probs, self.beam_width) - # update - tmp_gather_indices = self.concat((self.expand(self.batch_ids, -1), self.expand(tmp_indices, -1))) - beam_indices = self.gather_nd(beam_indices, tmp_gather_indices) - word_indices = self.gather_nd(word_indices, tmp_gather_indices) - topk_scores = self.gather_nd(topk_scores, tmp_gather_indices) - - ###### generate new beam_search states - # gather indices for selecting alive beams - gather_indices = self.concat((self.expand(self.batch_ids, -1), self.expand(beam_indices, -1))) - - # length add 1 if not finished in the previous step - length_add = self.add(state_length, self.one) - state_length = self.select(state_finished, state_length, length_add) - state_length = self.gather_nd(state_length, gather_indices) - - # concat seq - seq = self.gather_nd(state_seq, gather_indices) - state_seq = self.concat((seq, self.expand(word_indices, -1))) - - # new finished flag and log_probs - state_finished = self.equal(word_indices, self.eos_ids) - state_log_probs = topk_scores - - ###### generate new inputs and decoder states - cur_input_ids = self.reshape(state_seq, (self.batch_size*self.beam_width, -1)) - return cur_input_ids, state_log_probs, state_seq, state_finished, state_length - - def construct(self, enc_states, enc_attention_mask): - """Get beam search result.""" - cur_input_ids = self.start_ids - # beam search states - state_log_probs = self.init_scores - state_seq = self.init_seq - state_finished = 
self.init_finished - state_length = self.init_length - - for _ in range(self.max_decode_length): - # run one step decoder to get outputs of the current step - # shape [batch*beam, 1, vocab] - cur_input_ids, state_log_probs, state_seq, state_finished, state_length = self.one_step( - cur_input_ids, enc_states, enc_attention_mask, state_log_probs, state_seq, state_finished, state_length) - - # add length penalty scores - penalty_len = self.length_penalty(state_length) - # get penalty length - log_probs = self.real_div(state_log_probs, penalty_len) - - # sort according to scores - _, top_beam_indices = self.topk(log_probs, self.beam_width) - gather_indices = self.concat((self.expand(self.batch_ids, -1), self.expand(top_beam_indices, -1))) - # sort sequence - predicted_ids = self.gather_nd(state_seq, gather_indices) - # take the first one - predicted_ids = predicted_ids[::, 0:1:1, ::] - return predicted_ids diff --git a/tests/st/model_zoo_tests/transformer/src/config.py b/tests/st/model_zoo_tests/transformer/src/config.py deleted file mode 100644 index 58d5ee5f721..00000000000 --- a/tests/st/model_zoo_tests/transformer/src/config.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Network config setting, will be used in dataset.py, train.py.""" - -from easydict import EasyDict as edict -import mindspore.common.dtype as mstype -from .transformer_model import TransformerConfig -cfg = edict({ - 'transformer_network': 'large', - 'init_loss_scale_value': 1024, - 'scale_factor': 2, - 'scale_window': 2000, - 'optimizer': 'Adam', - 'optimizer_adam_beta2': 0.997, - 'lr_schedule': edict({ - 'learning_rate': 2.0, - 'warmup_steps': 8000, - 'start_decay_step': 16000, - 'min_lr': 0.0, - }), -}) -''' -two kinds of transformer model version -''' -if cfg.transformer_network == 'large': - transformer_net_cfg = TransformerConfig( - batch_size=96, - seq_length=128, - vocab_size=36560, - hidden_size=1024, - num_hidden_layers=6, - num_attention_heads=16, - intermediate_size=4096, - hidden_act="relu", - hidden_dropout_prob=0.2, - attention_probs_dropout_prob=0.2, - max_position_embeddings=128, - initializer_range=0.02, - label_smoothing=0.1, - dtype=mstype.float32, - compute_type=mstype.float16) - transformer_net_cfg_gpu = TransformerConfig( - batch_size=32, - seq_length=128, - vocab_size=36560, - hidden_size=1024, - num_hidden_layers=6, - num_attention_heads=16, - intermediate_size=4096, - hidden_act="relu", - hidden_dropout_prob=0.2, - attention_probs_dropout_prob=0.2, - max_position_embeddings=128, - initializer_range=0.02, - label_smoothing=0.1, - dtype=mstype.float32, - compute_type=mstype.float16) -if cfg.transformer_network == 'base': - transformer_net_cfg = TransformerConfig( - batch_size=96, - seq_length=128, - vocab_size=36560, - hidden_size=512, - num_hidden_layers=6, - num_attention_heads=8, - intermediate_size=2048, - hidden_act="relu", - 
hidden_dropout_prob=0.2, - attention_probs_dropout_prob=0.2, - max_position_embeddings=128, - initializer_range=0.02, - label_smoothing=0.1, - dtype=mstype.float32, - compute_type=mstype.float16) diff --git a/tests/st/model_zoo_tests/transformer/src/dataset.py b/tests/st/model_zoo_tests/transformer/src/dataset.py deleted file mode 100644 index b485fd7ddd6..00000000000 --- a/tests/st/model_zoo_tests/transformer/src/dataset.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Data operations, will be used in train.py.""" - -import mindspore.common.dtype as mstype -import mindspore.dataset as de -import mindspore.dataset.transforms.c_transforms as deC -from .config import transformer_net_cfg, transformer_net_cfg_gpu -de.config.set_seed(1) -def create_transformer_dataset(epoch_count=1, rank_size=1, rank_id=0, do_shuffle="true", dataset_path=None, - bucket_boundaries=None, device_target="Ascend"): - """create dataset""" - def batch_per_bucket(bucket_len, dataset_path): - dataset_path = dataset_path + "_" + str(bucket_len) + "_00" - ds = de.MindDataset(dataset_path, - columns_list=["source_eos_ids", "source_eos_mask", - "target_sos_ids", "target_sos_mask", - "target_eos_ids", "target_eos_mask"], - shuffle=(do_shuffle == "true"), num_shards=rank_size, shard_id=rank_id) - type_cast_op = deC.TypeCast(mstype.int32) - ds = ds.map(operations=type_cast_op, input_columns="source_eos_ids") - ds = ds.map(operations=type_cast_op, input_columns="source_eos_mask") - ds = ds.map(operations=type_cast_op, input_columns="target_sos_ids") - ds = ds.map(operations=type_cast_op, input_columns="target_sos_mask") - ds = ds.map(operations=type_cast_op, input_columns="target_eos_ids") - ds = ds.map(operations=type_cast_op, input_columns="target_eos_mask") - - # apply batch operations - if device_target == "Ascend": - ds = ds.batch(transformer_net_cfg.batch_size, drop_remainder=True) - else: - ds = ds.batch(transformer_net_cfg_gpu.batch_size, drop_remainder=True) - - ds = ds.repeat(epoch_count) - return ds - - for i, _ in enumerate(bucket_boundaries): - bucket_len = bucket_boundaries[i] - ds_per = batch_per_bucket(bucket_len, dataset_path) - if i == 0: - ds = ds_per - else: - ds = ds + ds_per - ds = ds.shuffle(ds.get_dataset_size()) - ds.channel_name = 'transformer' - return ds diff --git a/tests/st/model_zoo_tests/transformer/src/eval_config.py b/tests/st/model_zoo_tests/transformer/src/eval_config.py deleted file mode 100644 index 512e2c489a1..00000000000 --- a/tests/st/model_zoo_tests/transformer/src/eval_config.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Network evaluation config setting, will be used in eval.py.""" - -from easydict import EasyDict as edict -import mindspore.common.dtype as mstype -from .transformer_model import TransformerConfig - -cfg = edict({ - 'transformer_network': 'large', - 'data_file': '/your/path/evaluation.mindrecord', - 'model_file': '/your/path/checkpoint_file', - 'output_file': '/your/path/output', -}) -''' -two kinds of transformer model version -''' -if cfg.transformer_network == 'large': - transformer_net_cfg = TransformerConfig( - batch_size=1, - seq_length=128, - vocab_size=36560, - hidden_size=1024, - num_hidden_layers=6, - num_attention_heads=16, - intermediate_size=4096, - hidden_act="relu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - max_position_embeddings=128, - label_smoothing=0.1, - beam_width=4, - max_decode_length=80, - length_penalty_weight=1.0, - dtype=mstype.float32, - compute_type=mstype.float16) -if cfg.transformer_network == 'base': - transformer_net_cfg = TransformerConfig( - batch_size=1, - seq_length=128, - vocab_size=36560, - hidden_size=512, - num_hidden_layers=6, - num_attention_heads=8, - intermediate_size=2048, - hidden_act="relu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - max_position_embeddings=128, - label_smoothing=0.1, - beam_width=4, - max_decode_length=80, - length_penalty_weight=1.0, - dtype=mstype.float32, - compute_type=mstype.float16) diff --git a/tests/st/model_zoo_tests/transformer/src/lr_schedule.py b/tests/st/model_zoo_tests/transformer/src/lr_schedule.py deleted file mode 100644 index c246283478a..00000000000 --- a/tests/st/model_zoo_tests/transformer/src/lr_schedule.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Learning rate utilities.""" - -def linear_warmup(warmup_steps, current_step): - return min([1.0, float(current_step)/float(warmup_steps)]) - -def rsqrt_decay(warmup_steps, current_step): - return float(max([current_step, warmup_steps])) ** -0.5 - -def rsqrt_hidden(hidden_size): - return float(hidden_size) ** -0.5 - -def create_dynamic_lr(schedule, training_steps, learning_rate, warmup_steps, hidden_size, - start_decay_step=0, min_lr=0.): - """ - Generate dynamic learning rate. 
- """ - if start_decay_step < warmup_steps: - start_decay_step = warmup_steps - lr = [] - for current_step in range(1, training_steps+1): - cur_lr = 1.0 - for name in schedule.split("*"): - if name == "constant": - cur_lr *= float(learning_rate) - elif name == "rsqrt_hidden": - cur_lr *= rsqrt_hidden(hidden_size) - elif name == "linear_warmup": - cur_lr *= linear_warmup(warmup_steps, current_step) - elif name == "rsqrt_decay": - cur_lr *= rsqrt_decay(warmup_steps, current_step-start_decay_step+warmup_steps) - else: - raise ValueError("unknown learning rate schedule") - if warmup_steps < current_step < start_decay_step: - cur_lr = lr[-1] - if current_step > warmup_steps: - cur_lr = max([cur_lr, min_lr]) - lr.append(cur_lr) - return lr diff --git a/tests/st/model_zoo_tests/transformer/src/process_output.py b/tests/st/model_zoo_tests/transformer/src/process_output.py deleted file mode 100644 index f69ea6a0d7d..00000000000 --- a/tests/st/model_zoo_tests/transformer/src/process_output.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Convert ids to tokens.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse -import sys - -import tokenization - -# Explicitly set the encoding -sys.stdin = open(sys.stdin.fileno(), mode='r', encoding='utf-8', buffering=True) -sys.stdout = open(sys.stdout.fileno(), mode='w', encoding='utf-8', buffering=True) - -def main(): - parser = argparse.ArgumentParser( - description="recore nbest with smoothed sentence-level bleu.") - parser.add_argument("--vocab_file", type=str, default="", required=True, help="vocab file path.") - args = parser.parse_args() - - tokenizer = tokenization.WhiteSpaceTokenizer(vocab_file=args.vocab_file) - - for line in sys.stdin: - token_ids = [int(x) for x in line.strip().split()] - tokens = tokenizer.convert_ids_to_tokens(token_ids) - sent = " ".join(tokens) - sent = sent.split("")[-1] - sent = sent.split("")[0] - print(sent.strip()) - -if __name__ == "__main__": - main() diff --git a/tests/st/model_zoo_tests/transformer/src/tokenization.py b/tests/st/model_zoo_tests/transformer/src/tokenization.py deleted file mode 100644 index b4121f6c365..00000000000 --- a/tests/st/model_zoo_tests/transformer/src/tokenization.py +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Tokenization utilities.""" - -import sys -import collections -import unicodedata - -def convert_to_printable(text): - """ - Converts `text` to a printable coding format. - """ - if sys.version_info[0] == 3: - if isinstance(text, str): - return text - if isinstance(text, bytes): - return text.decode("utf-8", "ignore") - raise ValueError("Only support type `str` or `bytes`, while text type is `%s`" % (type(text))) - if sys.version_info[0] == 2: - if isinstance(text, str): - return text - if isinstance(text, unicode): - return text.encode("utf-8") - raise ValueError("Only support type `str` or `unicode`, while text type is `%s`" % (type(text))) - raise ValueError("Only supported when running on Python2 or Python3.") - - -def convert_to_unicode(text): - """ - Converts `text` to Unicode format. - """ - if sys.version_info[0] == 3: - if isinstance(text, str): - return text - if isinstance(text, bytes): - return text.decode("utf-8", "ignore") - raise ValueError("Only support type `str` or `bytes`, while text type is `%s`" % (type(text))) - if sys.version_info[0] == 2: - if isinstance(text, str): - return text.decode("utf-8", "ignore") - if isinstance(text, unicode): - return text - raise ValueError("Only support type `str` or `unicode`, while text type is `%s`" % (type(text))) - raise ValueError("Only supported when running on Python2 or Python3.") - - -def load_vocab_file(vocab_file): - """ - Loads a vocabulary file and turns into a {token:id} dictionary. - """ - vocab_dict = collections.OrderedDict() - index = 0 - with open(vocab_file, "r") as vocab: - while True: - token = convert_to_unicode(vocab.readline()) - if not token: - break - token = token.strip() - vocab_dict[token] = index - index += 1 - return vocab_dict - - -def convert_by_vocab_dict(vocab_dict, items): - """ - Converts a sequence of [tokens|ids] according to the vocab dict. - """ - output = [] - for item in items: - if item in vocab_dict: - output.append(vocab_dict[item]) - else: - output.append(vocab_dict[""]) - return output - - -class WhiteSpaceTokenizer(): - """ - Whitespace tokenizer. - """ - def __init__(self, vocab_file): - self.vocab_dict = load_vocab_file(vocab_file) - self.inv_vocab_dict = {index: token for token, index in self.vocab_dict.items()} - - def _is_whitespace_char(self, char): - """ - Checks if it is a whitespace character(regard "\t", "\n", "\r" as whitespace here). - """ - if char in (" ", "\t", "\n", "\r"): - return True - uni = unicodedata.category(char) - if uni == "Zs": - return True - return False - - def _is_control_char(self, char): - """ - Checks if it is a control character. - """ - if char in ("\t", "\n", "\r"): - return False - uni = unicodedata.category(char) - if uni in ("Cc", "Cf"): - return True - return False - - def _clean_text(self, text): - """ - Remove invalid characters and cleanup whitespace. - """ - output = [] - for char in text: - cp = ord(char) - if cp == 0 or cp == 0xfffd or self._is_control_char(char): - continue - if self._is_whitespace_char(char): - output.append(" ") - else: - output.append(char) - return "".join(output) - - def _whitespace_tokenize(self, text): - """ - Clean whitespace and split text into tokens. - """ - text = text.strip() - if not text: - tokens = [] - else: - tokens = text.split() - return tokens - - def tokenize(self, text): - """ - Tokenizes text. 
- """ - text = convert_to_unicode(text) - text = self._clean_text(text) - tokens = self._whitespace_tokenize(text) - return tokens - - def convert_tokens_to_ids(self, tokens): - return convert_by_vocab_dict(self.vocab_dict, tokens) - - def convert_ids_to_tokens(self, ids): - return convert_by_vocab_dict(self.inv_vocab_dict, ids) diff --git a/tests/st/model_zoo_tests/transformer/src/transformer_for_train.py b/tests/st/model_zoo_tests/transformer/src/transformer_for_train.py deleted file mode 100644 index 153a98d621f..00000000000 --- a/tests/st/model_zoo_tests/transformer/src/transformer_for_train.py +++ /dev/null @@ -1,472 +0,0 @@ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Transformer for training.""" -import numpy as np - -from mindspore.common.initializer import initializer -import mindspore.nn as nn -from mindspore.ops import operations as P -from mindspore.ops import functional as F -from mindspore.ops import composite as C -from mindspore.common.tensor import Tensor -from mindspore.common.parameter import Parameter -from mindspore.common import dtype as mstype -from mindspore.nn.wrap.grad_reducer import DistributedGradReducer -from mindspore.communication.management import get_group_size -from mindspore.context import ParallelMode -from mindspore import context - -from .transformer_model import TransformerModel - -GRADIENT_CLIP_TYPE = 1 -GRADIENT_CLIP_VALUE = 5.0 - -clip_grad = C.MultitypeFuncGraph("clip_grad") - - -@clip_grad.register("Number", "Number", "Tensor") -def _clip_grad(clip_type, clip_value, grad): - """ - Clip gradients. - - Inputs: - clip_type (int): The way to clip, 0 for 'value', 1 for 'norm'. - clip_value (float): Specifies how much to clip. - grad (tuple[Tensor]): Gradients. - - Outputs: - tuple[Tensor], clipped gradients. - """ - if clip_type not in (0, 1): - return grad - dt = F.dtype(grad) - if clip_type == 0: - new_grad = C.clip_by_value(grad, F.cast(F.tuple_to_array((-clip_value,)), dt), - F.cast(F.tuple_to_array((clip_value,)), dt)) - else: - new_grad = nn.ClipByNorm()(grad, F.cast(F.tuple_to_array((clip_value,)), dt)) - return new_grad - - -class TransformerTrainingLoss(nn.Cell): - """ - Provide transformer training loss. - - Args: - config (TransformerConfig): The config of Transformer. - - Returns: - Tensor, total loss. 
- """ - def __init__(self, config): - super(TransformerTrainingLoss, self).__init__(auto_prefix=False) - self.vocab_size = config.vocab_size - self.onehot = P.OneHot() - self.on_value = Tensor(float(1 - config.label_smoothing), mstype.float32) - self.off_value = Tensor(config.label_smoothing / float(self.vocab_size - 1), mstype.float32) - self.reduce_sum = P.ReduceSum() - self.reduce_mean = P.ReduceMean() - self.reshape = P.Reshape() - self.last_idx = (-1,) - self.flatten = P.Flatten() - self.neg = P.Neg() - self.cast = P.Cast() - self.batch_size = config.batch_size - - def construct(self, prediction_scores, label_ids, label_weights, seq_length): - """Defines the computation performed.""" - flat_shape = (self.batch_size * seq_length,) - label_ids = self.reshape(label_ids, flat_shape) - label_weights = self.cast(self.reshape(label_weights, flat_shape), mstype.float32) - one_hot_labels = self.onehot(label_ids, self.vocab_size, self.on_value, self.off_value) - - per_example_loss = self.neg(self.reduce_sum(prediction_scores * one_hot_labels, self.last_idx)) - numerator = self.reduce_sum(label_weights * per_example_loss, ()) - denominator = self.reduce_sum(label_weights, ()) + \ - self.cast(F.tuple_to_array((1e-5,)), mstype.float32) - loss = numerator / denominator - return loss - - -class TransformerNetworkWithLoss(nn.Cell): - """ - Provide transformer training loss through network. - - Args: - config (TransformerConfig): The config of Transformer. - is_training (bool): Specifies whether to use the training mode. - use_one_hot_embeddings (bool): Specifies whether to use one-hot for embeddings. Default: False. - - Returns: - Tensor, the loss of the network. - """ - def __init__(self, config, is_training, use_one_hot_embeddings=False): - super(TransformerNetworkWithLoss, self).__init__(auto_prefix=False) - self.transformer = TransformerModel(config, is_training, use_one_hot_embeddings) - self.loss = TransformerTrainingLoss(config) - self.cast = P.Cast() - self.shape = P.Shape() - - def construct(self, - source_ids, - source_mask, - target_ids, - target_mask, - label_ids, - label_weights): - """Transformer network with loss.""" - prediction_scores = self.transformer(source_ids, source_mask, target_ids, target_mask) - seq_length = self.shape(source_ids)[1] - total_loss = self.loss(prediction_scores, label_ids, label_weights, seq_length) - return self.cast(total_loss, mstype.float32) - - -class TransformerTrainOneStepCell(nn.TrainOneStepCell): - """ - Encapsulation class of transformer network training. - - Append an optimizer to the training network after that the construct - function can be called to create the backward graph. - - Args: - network (Cell): The training network. Note that loss function should have been added. - optimizer (Optimizer): Optimizer for updating the weights. - sens (Number): The adjust parameter. Default: 1.0. 
- """ - def __init__(self, network, optimizer, sens=1.0): - super(TransformerTrainOneStepCell, self).__init__(network, optimizer, sens) - - self.cast = P.Cast() - self.hyper_map = C.HyperMap() - - def set_sens(self, value): - self.sens = value - - def construct(self, - source_eos_ids, - source_eos_mask, - target_sos_ids, - target_sos_mask, - target_eos_ids, - target_eos_mask,): - """Defines the computation performed.""" - source_ids = source_eos_ids - source_mask = source_eos_mask - target_ids = target_sos_ids - target_mask = target_sos_mask - label_ids = target_eos_ids - label_weights = target_eos_mask - - weights = self.weights - loss = self.network(source_ids, - source_mask, - target_ids, - target_mask, - label_ids, - label_weights) - grads = self.grad(self.network, weights)(source_ids, - source_mask, - target_ids, - target_mask, - label_ids, - label_weights, - self.cast(F.tuple_to_array((self.sens,)), - mstype.float32)) - grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads) - # apply grad reducer on grads - grads = self.grad_reducer(grads) - succ = self.optimizer(grads) - return F.depend(loss, succ) - - -grad_scale = C.MultitypeFuncGraph("grad_scale") -reciprocal = P.Reciprocal() - - -@grad_scale.register("Tensor", "Tensor") -def tensor_grad_scale(scale, grad): - return grad * F.cast(reciprocal(scale), F.dtype(grad)) - -_grad_overflow = C.MultitypeFuncGraph("_grad_overflow") -grad_overflow = P.FloatStatus() - -@_grad_overflow.register("Tensor") -def _tensor_grad_overflow(grad): - return grad_overflow(grad) - -class TransformerTrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell): - """ - Encapsulation class of Transformer network training. - - Append an optimizer to the training network after that the construct - function can be called to create the backward graph. - - Args: - network (Cell): The training network. Note that loss function should have been added. - optimizer (Optimizer): Optimizer for updating the weights. - scale_update_cell (Cell): Cell to do the loss scale. Default: None. 
- """ - def __init__(self, network, optimizer, scale_update_cell=None): - super(TransformerTrainOneStepWithLossScaleCell, self).__init__(network, optimizer, scale_update_cell) - self.cast = P.Cast() - self.degree = 1 - if self.reducer_flag: - self.degree = get_group_size() - self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree) - - self.loss_scale = None - self.loss_scaling_manager = scale_update_cell - if scale_update_cell: - self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32)) - - def construct(self, - source_eos_ids, - source_eos_mask, - target_sos_ids, - target_sos_mask, - target_eos_ids, - target_eos_mask, - sens=None): - """Defines the computation performed.""" - source_ids = source_eos_ids - source_mask = source_eos_mask - target_ids = target_sos_ids - target_mask = target_sos_mask - label_ids = target_eos_ids - label_weights = target_eos_mask - - weights = self.weights - loss = self.network(source_ids, - source_mask, - target_ids, - target_mask, - label_ids, - label_weights) - if sens is None: - scaling_sens = self.loss_scale - else: - scaling_sens = sens - status, scaling_sens = self.start_overflow_check(loss, scaling_sens) - grads = self.grad(self.network, weights)(source_ids, - source_mask, - target_ids, - target_mask, - label_ids, - label_weights, - self.cast(scaling_sens, - mstype.float32)) - - # apply grad reducer on grads - grads = self.grad_reducer(grads) - grads = self.hyper_map(F.partial(grad_scale, scaling_sens * self.degree), grads) - grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads) - - cond = self.get_overflow_status(status, grads) - overflow = cond - if sens is None: - overflow = self.loss_scaling_manager(self.loss_scale, cond) - if overflow: - succ = False - else: - succ = self.optimizer(grads) - ret = (loss, cond, scaling_sens) - return F.depend(ret, succ) - - -cast = P.Cast() -add_grads = C.MultitypeFuncGraph("add_grads") - - -@add_grads.register("Tensor", "Tensor") -def _add_grads(accu_grad, grad): - return accu_grad + cast(grad, mstype.float32) - -update_accu_grads = C.MultitypeFuncGraph("update_accu_grads") - -@update_accu_grads.register("Tensor", "Tensor") -def _update_accu_grads(accu_grad, grad): - succ = True - return F.depend(succ, F.assign(accu_grad, cast(grad, mstype.float32))) - -accumulate_accu_grads = C.MultitypeFuncGraph("accumulate_accu_grads") - -@accumulate_accu_grads.register("Tensor", "Tensor") -def _accumulate_accu_grads(accu_grad, grad): - succ = True - return F.depend(succ, F.assign_add(accu_grad, cast(grad, mstype.float32))) - - -zeroslike = P.ZerosLike() -reset_accu_grads = C.MultitypeFuncGraph("reset_accu_grads") - - -@reset_accu_grads.register("Tensor") -def _reset_accu_grads(accu_grad): - succ = True - return F.depend(succ, F.assign(accu_grad, zeroslike(accu_grad))) - - -class TransformerTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell): - """ - Encapsulation class of bert network training. - - Append an optimizer to the training network after that the construct - function can be called to create the backward graph. - - To mimic higher batch size, gradients are accumulated N times before weight update. - - For distribution mode, allreduce will only be implemented in the weight updated step, - i.e. the sub-step after gradients accumulated N times. - - Args: - network (Cell): The training network. Note that loss function should have been added. - optimizer (Optimizer): Optimizer for updating the weights. 
- scale_update_cell (Cell): Cell to do the loss scale. Default: None. - accumulation_steps (int): Number of accumulation steps before gradient update. The global batch size = - batch_size * accumulation_steps. Default: 1. - """ - - def __init__(self, network, optimizer, scale_update_cell=None, accumulation_steps=8, enable_global_norm=False): - super(TransformerTrainAccumulationAllReducePostWithLossScaleCell, self).__init__(auto_prefix=False) - self.network = network - self.network.set_grad() - self.weights = optimizer.parameters - self.optimizer = optimizer - self.accumulation_steps = accumulation_steps - self.enable_global_norm = enable_global_norm - self.one = Tensor(np.array([1]).astype(np.int32)) - self.zero = Tensor(np.array([0]).astype(np.int32)) - self.local_step = Parameter(initializer(0, [1], mstype.int32)) - self.accu_grads = self.weights.clone(prefix="accu_grads", init='zeros') - self.accu_overflow = Parameter(initializer(0, [1], mstype.int32)) - self.accu_loss = Parameter(initializer(0, [1], mstype.float32)) - - self.grad = C.GradOperation(get_by_list=True, sens_param=True) - self.reducer_flag = False - self.parallel_mode = context.get_auto_parallel_context("parallel_mode") - if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]: - self.reducer_flag = True - self.grad_reducer = F.identity - self.degree = 1 - if self.reducer_flag: - self.degree = get_group_size() - self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree) - self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE) - self.overflow_reducer = F.identity - if self.is_distributed: - self.overflow_reducer = P.AllReduce() - self.cast = P.Cast() - self.alloc_status = P.NPUAllocFloatStatus() - self.get_status = P.NPUGetFloatStatus() - self.clear_status = P.NPUClearFloatStatus() - self.reduce_sum = P.ReduceSum(keep_dims=False) - self.base = Tensor(1, mstype.float32) - self.less_equal = P.LessEqual() - self.logical_or = P.LogicalOr() - self.not_equal = P.NotEqual() - self.select = P.Select() - self.reshape = P.Reshape() - self.hyper_map = C.HyperMap() - self.loss_scale = None - self.loss_scaling_manager = scale_update_cell - if scale_update_cell: - self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32)) - - def construct(self, - source_eos_ids, - source_eos_mask, - target_sos_ids, - target_sos_mask, - target_eos_ids, - target_eos_mask, - sens=None): - """Defines the computation performed.""" - source_ids = source_eos_ids - source_mask = source_eos_mask - target_ids = target_sos_ids - target_mask = target_sos_mask - label_ids = target_eos_ids - label_weights = target_eos_mask - - weights = self.weights - loss = self.network(source_ids, - source_mask, - target_ids, - target_mask, - label_ids, - label_weights) - if sens is None: - scaling_sens = self.loss_scale - else: - scaling_sens = sens - # alloc status and clear should be right before gradoperation - init = self.alloc_status() - init = F.depend(init, loss) - clear_status = self.clear_status(init) - scaling_sens = F.depend(scaling_sens, clear_status) - # update accumulation parameters - is_accu_step = self.not_equal(self.local_step, self.accumulation_steps) - self.local_step = self.select(is_accu_step, self.local_step + self.one, self.one) - self.accu_loss = self.select(is_accu_step, self.accu_loss + loss, loss) - mean_loss = self.accu_loss / self.local_step - is_accu_step = self.not_equal(self.local_step, self.accumulation_steps) - - grads = self.grad(self.network, 
weights)(source_ids, - source_mask, - target_ids, - target_mask, - label_ids, - label_weights, - self.cast(scaling_sens, - mstype.float32)) - - accu_succ = self.hyper_map(accumulate_accu_grads, self.accu_grads, grads) - mean_loss = F.depend(mean_loss, accu_succ) - - init = F.depend(init, mean_loss) - get_status = self.get_status(init) - init = F.depend(init, get_status) - flag_sum = self.reduce_sum(init, (0,)) - overflow = self.less_equal(self.base, flag_sum) - overflow = self.logical_or(self.not_equal(self.accu_overflow, self.zero), overflow) - accu_overflow = self.select(overflow, self.one, self.zero) - self.accu_overflow = self.select(is_accu_step, accu_overflow, self.zero) - - if is_accu_step: - succ = False - else: - # apply grad reducer on grads - grads = self.grad_reducer(self.accu_grads) - scaling = scaling_sens * self.degree * self.accumulation_steps - grads = self.hyper_map(F.partial(grad_scale, scaling), grads) - if self.enable_global_norm: - grads = C.clip_by_global_norm(grads, 1.0, None) - else: - grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads) - accu_overflow = F.depend(accu_overflow, grads) - accu_overflow = self.overflow_reducer(accu_overflow) - overflow = self.less_equal(self.base, accu_overflow) - accu_succ = self.hyper_map(reset_accu_grads, self.accu_grads) - overflow = F.depend(overflow, accu_succ) - overflow = self.reshape(overflow, (())) - if sens is None: - overflow = self.loss_scaling_manager(self.loss_scale, overflow) - if overflow: - succ = False - else: - succ = self.optimizer(grads) - - ret = (mean_loss, overflow, scaling_sens) - return F.depend(ret, succ) diff --git a/tests/st/model_zoo_tests/transformer/src/transformer_model.py b/tests/st/model_zoo_tests/transformer/src/transformer_model.py deleted file mode 100644 index 5e0aa6aa5b7..00000000000 --- a/tests/st/model_zoo_tests/transformer/src/transformer_model.py +++ /dev/null @@ -1,1153 +0,0 @@ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Transformer model.""" - -import math -import copy -import numpy as np -import mindspore.common.dtype as mstype -import mindspore.nn as nn -import mindspore.ops.functional as F -from mindspore.ops import operations as P -from mindspore.common.tensor import Tensor -from mindspore.common.parameter import Parameter -from mindspore.ops.primitive import constexpr -from .beam_search import BeamSearchDecoder, TileBeam -from .weight_init import normal_weight, weight_variable - -class TransformerConfig: - """ - Configuration for `Transformer`. - - Args: - batch_size (int): Batch size of input dataset. - seq_length (int): Length of input sequence. Default: 128. - vocab_size (int): The shape of each embedding vector. Default: 36560. - hidden_size (int): Size of the layers. Default: 1024. - num_hidden_layers (int): Number of hidden layers in the Transformer encoder/decoder - cell. Default: 6. 
- num_attention_heads (int): Number of attention heads in the Transformer - encoder/decoder cell. Default: 16. - intermediate_size (int): Size of intermediate layer in the Transformer - encoder/decoder cell. Default: 4096. - hidden_act (str): Activation function used in the Transformer encoder/decoder - cell. Default: "relu". - hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.3. - attention_probs_dropout_prob (float): The dropout probability for - MultiheadAttention. Default: 0.3. - max_position_embeddings (int): Maximum length of sequences used in this - model. Default: 128. - initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. - label_smoothing (float): label smoothing setting. Default: 0.1 - beam_width (int): beam width setting. Default: 4 - max_decode_length (int): max decode length in evaluation. Default: 80 - length_penalty_weight (float): normalize scores of translations according to their length. Default: 1.0 - dtype (:class:`mindspore.dtype`): Data type of the input. Default: mstype.float32. - compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mstype.float32. - """ - def __init__(self, - batch_size, - seq_length=128, - vocab_size=36560, - hidden_size=1024, - num_hidden_layers=6, - num_attention_heads=16, - intermediate_size=4096, - hidden_act="relu", - hidden_dropout_prob=0.3, - attention_probs_dropout_prob=0.3, - max_position_embeddings=128, - initializer_range=0.02, - label_smoothing=0.1, - beam_width=4, - max_decode_length=80, - length_penalty_weight=1.0, - dtype=mstype.float32, - compute_type=mstype.float32): - self.batch_size = batch_size - self.seq_length = seq_length - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.label_smoothing = label_smoothing - self.beam_width = beam_width - self.max_decode_length = max_decode_length - self.length_penalty_weight = length_penalty_weight - self.dtype = dtype - self.compute_type = compute_type - - -class EmbeddingLookup(nn.Cell): - """ - A embeddings lookup table with a fixed dictionary and size. - - Args: - vocab_size (int): Size of the dictionary of embeddings. - embedding_size (int): The size of each embedding vector. - use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. - initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. 
- """ - def __init__(self, - vocab_size, - embedding_size, - use_one_hot_embeddings=False, - initializer_range=0.02): - super(EmbeddingLookup, self).__init__() - self.vocab_size = vocab_size - self.embedding_size = embedding_size - self.use_one_hot_embeddings = use_one_hot_embeddings - self.embedding_table = Parameter(normal_weight([vocab_size, embedding_size], embedding_size)) - self.expand = P.ExpandDims() - self.shape_flat = (-1,) - self.gather = P.Gather() - self.one_hot = P.OneHot() - self.on_value = Tensor(1.0, mstype.float32) - self.off_value = Tensor(0.0, mstype.float32) - self.array_mul = P.MatMul() - self.reshape = P.Reshape() - self.shape = P.Shape() - - def construct(self, input_ids): - """Get a embeddings lookup table with a fixed dictionary and size.""" - input_shape = self.shape(input_ids) - - flat_ids = self.reshape(input_ids, self.shape_flat) - if self.use_one_hot_embeddings: - one_hot_ids = self.one_hot(flat_ids, self.vocab_size, self.on_value, self.off_value) - output_for_reshape = self.array_mul(one_hot_ids, self.embedding_table) - else: - output_for_reshape = self.gather(self.embedding_table, flat_ids, 0) - - out_shape = input_shape + (self.embedding_size,) - output = self.reshape(output_for_reshape, out_shape) - return output, self.embedding_table - - -def position_encoding(length, - depth, - min_timescale=1, - max_timescale=1e4): - """ - Create Tensor of sinusoids of different frequencies. - - Args: - length (int): Length of the Tensor to create, i.e. Number of steps. - depth (int): Hidden size. - min_timescale (float): Default: 1. - max_timescale (float): Default: 10000. - - Returns: - Tensor of shape (length, depth) - """ - depth = depth // 2 - positions = np.arange(length, dtype=np.float32) - log_timescale_increment = (np.log(max_timescale / min_timescale) / (depth - 1)) - inv_timescales = min_timescale * np.exp(np.arange(depth, dtype=np.float32) * -log_timescale_increment) - scaled_time = np.expand_dims(positions, 1) * np.expand_dims(inv_timescales, 0) - x = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1) - return x - - -class EmbeddingPostprocessor(nn.Cell): - """ - Postprocessors apply positional embeddings to word embeddings. - - Args: - embedding_size (int): The size of each embedding vector. - use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. - initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. - max_position_embeddings (int): Maximum length of sequences used in this - model. Default: 128. - dropout_prob (float): The dropout probability. Default: 0.1. 
- """ - def __init__(self, - embedding_size, - use_one_hot_embeddings=False, - initializer_range=0.02, - max_position_embeddings=128, - dropout_prob=0.1): - super(EmbeddingPostprocessor, self).__init__() - self.scores_mul = Tensor([math.sqrt(float(embedding_size))], dtype=mstype.float32) - self.multiply = P.Mul() - self.add = P.Add() - self.dropout = nn.Dropout(1 - dropout_prob, dtype=mstype.float32) - self.use_dropout = dropout_prob > 0 - self.expand_dims = P.ExpandDims() - self.position_embedding_table = Tensor(position_encoding(max_position_embeddings, embedding_size), - mstype.float32) - self.shape = P.Shape() - - def construct(self, word_embeddings): - """Postprocessors apply positional embeddings to word embeddings.""" - input_shape = self.shape(word_embeddings) - input_len = input_shape[1] - - output = self.multiply(word_embeddings, self.scores_mul) - - # add position embeddings - position_embeddings = self.position_embedding_table[0:input_len:1, ::] - position_embeddings = self.expand_dims(position_embeddings, 0) - output = self.add(output, position_embeddings) - - if self.use_dropout: - output = self.dropout(output) - return output - - -class CastWrapper(nn.Cell): - """ - Cast wrapper. - """ - def __init__(self, src_type=mstype.float32, dst_type=mstype.float32): - super(CastWrapper, self).__init__() - self.cast = P.Cast() - self.dst_type = dst_type - - def construct(self, x): - return self.cast(x, self.dst_type) - - -class LayerPreprocess(nn.Cell): - """ - preprocess input of each layer. - """ - def __init__(self, - in_channels=None): - super(LayerPreprocess, self).__init__() - self.layernorm = nn.LayerNorm((in_channels,)) - self.cast = P.Cast() - self.get_dtype = P.DType() - - def construct(self, input_tensor): - output = self.cast(input_tensor, mstype.float32) - output = self.layernorm(output) - output = self.cast(output, self.get_dtype(input_tensor)) - return output - - -class LayerPostprocess(nn.Cell): - """ - postprocess output of each layer. - """ - def __init__(self, - dropout_prob=0.1): - super(LayerPostprocess, self).__init__() - self.add = P.Add() - self.dropout = nn.Dropout(1 - dropout_prob) - self.use_dropout = dropout_prob > 0 - - def construct(self, hidden_tensor, input_tensor): - output = hidden_tensor - if self.use_dropout: - output = self.dropout(output) - output = self.add(output, input_tensor) - return output - - -class MultiheadAttention(nn.Cell): - """ - Apply multi-headed attention from "from_tensor" to "to_tensor". - - Args: - batch_size (int): Batch size of input datasets. - from_tensor_width (int): Size of last dim of from_tensor. - to_tensor_width (int): Size of last dim of to_tensor. - from_seq_length (int): Length of from_tensor sequence. - to_seq_length (int): Length of to_tensor sequence. - num_attention_heads (int): Number of attention heads. Default: 1. - size_per_head (int): Size of each attention head. Default: 512. - query_act (str): Activation function for the query transform. Default: None. - key_act (str): Activation function for the key transform. Default: None. - value_act (str): Activation function for the value transform. Default: None. - has_attention_mask (bool): Specifies whether to use attention mask. Default: False. - attention_probs_dropout_prob (float): The dropout probability for - MultiheadAttention. Default: 0.0. - use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. - initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. 
- do_return_2d_tensor (bool): True for return 2d tensor. False for return 3d - tensor. Default: False. - compute_type (:class:`mindspore.dtype`): Compute type in MultiheadAttention. Default: mstype.float32. - """ - def __init__(self, - batch_size, - from_tensor_width, - to_tensor_width, - out_tensor_width, - num_attention_heads=1, - size_per_head=512, - query_act=None, - key_act=None, - value_act=None, - out_act=None, - has_attention_mask=True, - attention_probs_dropout_prob=0.0, - use_one_hot_embeddings=False, - initializer_range=0.02, - do_return_2d_tensor=True, - compute_type=mstype.float32): - super(MultiheadAttention, self).__init__() - self.batch_size = batch_size - self.num_attention_heads = num_attention_heads - self.size_per_head = size_per_head - self.has_attention_mask = has_attention_mask - assert has_attention_mask - self.use_one_hot_embeddings = use_one_hot_embeddings - self.initializer_range = initializer_range - self.do_return_2d_tensor = do_return_2d_tensor - - self.scores_mul = Tensor([1.0 / math.sqrt(float(self.size_per_head))], dtype=compute_type) - self.reshape = P.Reshape() - self.shape_from_2d = (-1, from_tensor_width) - self.shape_to_2d = (-1, to_tensor_width) - units = num_attention_heads * size_per_head - self.query_layer = nn.Dense(from_tensor_width, - units, - activation=query_act, - has_bias=False, - weight_init=weight_variable([units, from_tensor_width])).to_float(compute_type) - self.key_layer = nn.Dense(to_tensor_width, - units, - activation=key_act, - has_bias=False, - weight_init=weight_variable([units, to_tensor_width])).to_float(compute_type) - self.value_layer = nn.Dense(to_tensor_width, - units, - activation=value_act, - has_bias=False, - weight_init=weight_variable([units, to_tensor_width])).to_float(compute_type) - self.out_layer = nn.Dense(units, - out_tensor_width, - activation=out_act, - has_bias=False, - weight_init=weight_variable([out_tensor_width, units])).to_float(compute_type) - - self.matmul_trans_b = P.BatchMatMul(transpose_b=True) - self.multiply = P.Mul() - self.transpose = P.Transpose() - self.trans_shape = (0, 2, 1, 3) - self.trans_shape_relative = (2, 0, 1, 3) - self.trans_shape_position = (1, 2, 0, 3) - self.multiply_data = Tensor([-10000.0,], dtype=compute_type) - self.batch_num = batch_size * num_attention_heads - self.matmul = P.BatchMatMul() - - self.softmax = nn.Softmax() - self.dropout = nn.Dropout(1 - attention_probs_dropout_prob) - self.use_dropout = attention_probs_dropout_prob > 0 - - if self.has_attention_mask: - self.expand_dims = P.ExpandDims() - self.sub = P.Sub() - self.add = P.Add() - self.cast = P.Cast() - self.get_dtype = P.DType() - - self.cast_compute_type = CastWrapper(dst_type=compute_type) - self.softmax_cast = P.Cast() - - def construct(self, from_tensor, to_tensor, seq_length, enc_seq_length, attention_mask=None): - """Apply multihead attention.""" - from_seq_length = seq_length - to_seq_length = enc_seq_length - shape_from = (self.batch_size, from_seq_length, self.num_attention_heads, self.size_per_head) - shape_to = (self.batch_size, to_seq_length, self.num_attention_heads, self.size_per_head) - if self.do_return_2d_tensor: - shape_return = (self.batch_size * from_seq_length, self.num_attention_heads * self.size_per_head) - if from_seq_length == -1: - shape_return = (-1, self.num_attention_heads * self.size_per_head) - else: - shape_return = (self.batch_size, from_seq_length, self.num_attention_heads * self.size_per_head) - - # reshape 2d/3d input tensors to 2d - from_tensor_2d = self.reshape(from_tensor, 
self.shape_from_2d) - to_tensor_2d = self.reshape(to_tensor, self.shape_to_2d) - query_out = self.query_layer(from_tensor_2d) - key_out = self.key_layer(to_tensor_2d) - value_out = self.value_layer(to_tensor_2d) - - query_layer = self.reshape(query_out, shape_from) - query_layer = self.transpose(query_layer, self.trans_shape) - key_layer = self.reshape(key_out, shape_to) - key_layer = self.transpose(key_layer, self.trans_shape) - - attention_scores = self.matmul_trans_b(query_layer, key_layer) - attention_scores = self.multiply(attention_scores, self.scores_mul) - - if self.has_attention_mask: - attention_mask = self.expand_dims(attention_mask, 1) - multiply_out = self.sub(self.cast(F.tuple_to_array((1.0,)), self.get_dtype(attention_scores)), - self.cast(attention_mask, self.get_dtype(attention_scores))) - adder = self.multiply(multiply_out, self.multiply_data) - attention_scores = self.add(adder, attention_scores) - - attention_scores = self.softmax_cast(attention_scores, mstype.float32) - attention_probs = self.softmax(attention_scores) - attention_probs = self.softmax_cast(attention_probs, self.get_dtype(key_layer)) - if self.use_dropout: - attention_probs = self.dropout(attention_probs) - - value_layer = self.reshape(value_out, shape_to) - value_layer = self.transpose(value_layer, self.trans_shape) - context_layer = self.matmul(attention_probs, value_layer) - - context_layer = self.transpose(context_layer, self.trans_shape) - context_layer = self.reshape(context_layer, shape_return) - context_layer = self.out_layer(context_layer) - return context_layer - - -class SelfAttention(nn.Cell): - """ - Apply self-attention. - - Args: - batch_size (int): Batch size of input dataset. - from_seq_length (int): Length of query sequence. - to_seq_length (int): Length of memory sequence. - hidden_size (int): Size of attention layers. - num_attention_heads (int): Number of attention heads. Default: 16. - attention_probs_dropout_prob (float): The dropout probability for - SelfAttention. Default: 0.1. - use_one_hot_embeddings (bool): Specifies whether to use one_hot encoding form. Default: False. - initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. - hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1. - has_attention_mask (bool): Specifies whether has attention mask. Default: True. - is_encdec_att (bool): Specifies whether query sequence and memory sequence are different. Default: False. - compute_type (:class:`mindspore.dtype`): Compute type in MultiheadAttention. Default: mstype.float32. 
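# Per attention head, the construct above is standard scaled dot-product attention with an
# additive mask: positions where attention_mask is 0 receive a -10000 penalty before the
# softmax, so they contribute almost nothing. A single-head NumPy sketch of the same
# arithmetic (batching, head splitting and dropout omitted; illustration only):
import numpy as np

def scaled_dot_product_attention(q, k, v, mask, size_per_head):
    """q: [seq_q, d], k and v: [seq_k, d], mask: [seq_q, seq_k] with 1 = may attend."""
    scores = (q @ k.T) / np.sqrt(float(size_per_head))      # matmul_trans_b + scores_mul
    scores = scores + (1.0 - mask) * -10000.0               # additive mask (multiply_data)
    scores = scores - scores.max(axis=-1, keepdims=True)    # numerically stable softmax
    probs = np.exp(scores)
    probs = probs / probs.sum(axis=-1, keepdims=True)
    return probs @ v                                        # context vectors, [seq_q, d]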
- """ - def __init__(self, - batch_size, - hidden_size, - num_attention_heads=16, - attention_probs_dropout_prob=0.1, - use_one_hot_embeddings=False, - initializer_range=0.02, - hidden_dropout_prob=0.1, - has_attention_mask=True, - is_encdec_att=False, - compute_type=mstype.float32): - super(SelfAttention, self).__init__() - if hidden_size % num_attention_heads != 0: - raise ValueError("The hidden size (%d) is not a multiple of the number " - "of attention heads (%d)" % (hidden_size, num_attention_heads)) - self.size_per_head = int(hidden_size / num_attention_heads) - self.is_encdec_att = is_encdec_att - - self.attention = MultiheadAttention( - batch_size=batch_size, - from_tensor_width=hidden_size, - to_tensor_width=hidden_size, - out_tensor_width=hidden_size, - num_attention_heads=num_attention_heads, - size_per_head=self.size_per_head, - attention_probs_dropout_prob=attention_probs_dropout_prob, - use_one_hot_embeddings=use_one_hot_embeddings, - initializer_range=initializer_range, - has_attention_mask=has_attention_mask, - do_return_2d_tensor=True, - compute_type=compute_type) - - self.preprocess = LayerPreprocess(in_channels=hidden_size) - self.postprocess = LayerPostprocess(dropout_prob=hidden_dropout_prob) - - self.reshape = P.Reshape() - self.shape = (-1, hidden_size) - def construct(self, input_tensor, memory_tensor, attention_mask, seq_length, enc_seq_length): - """Apply self-attention.""" - input_tensor = self.reshape(input_tensor, self.shape) - memory_tensor = self.reshape(memory_tensor, self.shape) - - output = self.preprocess(input_tensor) - - if not self.is_encdec_att: - memory_tensor = output - - attention_output = self.attention(output, memory_tensor, seq_length, enc_seq_length, attention_mask) - output = self.postprocess(attention_output, input_tensor) - return output - - -class FeedForward(nn.Cell): - """ - Apply two-layer feed forward - - Args: - in_channels (int): Size of the input layer. - hidden_size (int): Size of the hidden layer. - out_channels (int): Size of the output layers. - hidden_act (str): name of the activation function. Default: relu - initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. - hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1. - compute_type (:class:`mindspore.dtype`): Compute type in FeedForward. Default: mstype.float32. 
- """ - def __init__(self, - in_channels, - hidden_size, - out_channels, - hidden_act="relu", - initializer_range=0.02, - hidden_dropout_prob=0.1, - compute_type=mstype.float32): - super(FeedForward, self).__init__() - - self.conv1 = nn.Dense(in_channels, - hidden_size, - activation=hidden_act, - weight_init=weight_variable([hidden_size, in_channels])).to_float(compute_type) - self.conv2 = nn.Dense(hidden_size, - out_channels, - weight_init=weight_variable([out_channels, hidden_size])).to_float(compute_type) - - self.preprocess = LayerPreprocess(in_channels=in_channels) - self.postprocess = LayerPostprocess(dropout_prob=hidden_dropout_prob) - - self.reshape = P.Reshape() - self.shape = (-1, in_channels) - self.dropout = nn.Dropout(1 - hidden_dropout_prob) - self.use_dropout = hidden_dropout_prob > 0 - - def construct(self, input_tensor): - input_tensor = self.reshape(input_tensor, self.shape) - output = self.preprocess(input_tensor) - output = self.conv1(output) - if self.use_dropout: - output = self.dropout(output) - output = self.conv2(output) - output = self.postprocess(output, input_tensor) - return output - - -class EncoderCell(nn.Cell): - """ - Encoder cells used in Transformer. - - Args: - batch_size (int): Batch size of input dataset. - hidden_size (int): Size of the encoder layers. Default: 1024. - seq_length (int): Length of input sequence. Default: 128. - num_attention_heads (int): Number of attention heads. Default: 16. - intermediate_size (int): Size of intermediate layer. Default: 4096. - attention_probs_dropout_prob (float): The dropout probability for - SelfAttention. Default: 0.02. - use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. - initializer_range (float): Initialization value of TruncatedNormal. Default: 0.1. - hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1. - hidden_act (str): Activation function. Default: "relu". - compute_type (:class:`mindspore.dtype`): Compute type in attention. Default: mstype.float32. - """ - def __init__(self, - batch_size, - hidden_size=1024, - num_attention_heads=16, - intermediate_size=4096, - attention_probs_dropout_prob=0.1, - use_one_hot_embeddings=False, - initializer_range=0.02, - hidden_dropout_prob=0.1, - hidden_act="relu", - compute_type=mstype.float32): - super(EncoderCell, self).__init__() - self.attention = SelfAttention( - batch_size=batch_size, - hidden_size=hidden_size, - num_attention_heads=num_attention_heads, - attention_probs_dropout_prob=attention_probs_dropout_prob, - use_one_hot_embeddings=use_one_hot_embeddings, - initializer_range=initializer_range, - hidden_dropout_prob=hidden_dropout_prob, - is_encdec_att=False, - compute_type=compute_type) - self.feedforward = FeedForward( - in_channels=hidden_size, - hidden_size=intermediate_size, - out_channels=hidden_size, - hidden_act=hidden_act, - initializer_range=initializer_range, - hidden_dropout_prob=hidden_dropout_prob, - compute_type=compute_type) - - def construct(self, hidden_states, attention_mask, seq_length): - # self-attention with ln, res - attention_output = self.attention(hidden_states, hidden_states, attention_mask, seq_length, seq_length) - # feed forward with ln, res - output = self.feedforward(attention_output) - return output - - -class TransformerEncoder(nn.Cell): - """ - Multi-layer transformer encoder. - - Args: - batch_size (int): Batch size of input dataset. - hidden_size (int): Size of the encoder layers. - seq_length (int): Length of input sequence. 
- num_hidden_layers (int): Number of hidden layers in encoder cells. - num_attention_heads (int): Number of attention heads in encoder cells. Default: 16. - intermediate_size (int): Size of intermediate layer in encoder cells. Default: 4096. - attention_probs_dropout_prob (float): The dropout probability for - SelfAttention. Default: 0.1. - use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. - initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. - hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1.. - hidden_act (str): Activation function used in the encoder cells. Default: "gelu". - compute_type (:class:`mindspore.dtype`): Compute type. Default: mstype.float32. - """ - def __init__(self, - batch_size, - hidden_size, - num_hidden_layers, - num_attention_heads=16, - intermediate_size=4096, - attention_probs_dropout_prob=0.1, - use_one_hot_embeddings=False, - initializer_range=0.02, - hidden_dropout_prob=0.1, - hidden_act="relu", - compute_type=mstype.float32): - super(TransformerEncoder, self).__init__() - self.num_hidden_layers = num_hidden_layers - self.batch_size = batch_size - self.hidden_size = hidden_size - - layers = [] - for _ in range(num_hidden_layers): - layer = EncoderCell(batch_size=batch_size, - hidden_size=hidden_size, - num_attention_heads=num_attention_heads, - intermediate_size=intermediate_size, - attention_probs_dropout_prob=attention_probs_dropout_prob, - use_one_hot_embeddings=use_one_hot_embeddings, - initializer_range=initializer_range, - hidden_dropout_prob=hidden_dropout_prob, - hidden_act=hidden_act, - compute_type=compute_type) - layers.append(layer) - self.layers = nn.CellList(layers) - - self.layer_preprocess = LayerPreprocess(in_channels=hidden_size) - - self.reshape = P.Reshape() - self.shape = (-1, hidden_size) - - def construct(self, input_tensor, attention_mask, seq_length): - """Apply encoder.""" - out_shape = (self.batch_size, seq_length, self.hidden_size) - prev_output = self.reshape(input_tensor, self.shape) - - for layer_module in self.layers: - layer_output = layer_module(prev_output, attention_mask, seq_length) - prev_output = layer_output - - prev_output = self.layer_preprocess(prev_output) - output = self.reshape(prev_output, out_shape) - return output - - -class DecoderCell(nn.Cell): - """ - decoder cells used in Transformer. - - Args: - batch_size (int): Batch size of input dataset. - hidden_size (int): Size of the Transformer decoder layers. Default: 1024. - seq_length (int): Length of input sequence. Default: 128. - enc_seq_length (int): Length of source sentences. Default:128 - num_attention_heads (int): Number of attention heads. Default: 12. - intermediate_size (int): Size of intermediate layer. Default: 4096. - attention_probs_dropout_prob (float): The dropout probability for - SelfAttention. Default: 0.02. - use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. - initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. - hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1. - hidden_act (str): Activation function. Default: "relu". - compute_type (:class:`mindspore.dtype`): Compute type in attention. Default: mstype.float32. 
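# TransformerEncoder flattens the input to [batch*seq, hidden], runs it through
# num_hidden_layers EncoderCells and, because the cells are pre-LayerNorm, applies one final
# LayerPreprocess before reshaping back to [batch, seq, hidden]. DecoderCell below follows the
# same pattern with three sublayers: masked self-attention, encoder-decoder cross-attention,
# then feed-forward. A minimal sketch of the encoder loop (illustration only):
def run_encoder(x, attention_mask, cells, final_layer_norm):
    for cell in cells:            # each cell: self-attention + feed-forward, both with residuals
        x = cell(x, attention_mask)
    return final_layer_norm(x)    # the trailing LayerNorm used with pre-LN blocks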
- """ - def __init__(self, - batch_size, - hidden_size=1024, - num_attention_heads=12, - intermediate_size=4096, - attention_probs_dropout_prob=0.02, - use_one_hot_embeddings=False, - initializer_range=0.02, - hidden_dropout_prob=0.1, - hidden_act="relu", - compute_type=mstype.float32): - super(DecoderCell, self).__init__() - self.self_attention = SelfAttention( - batch_size=batch_size, - hidden_size=hidden_size, - num_attention_heads=num_attention_heads, - attention_probs_dropout_prob=attention_probs_dropout_prob, - use_one_hot_embeddings=use_one_hot_embeddings, - initializer_range=initializer_range, - is_encdec_att=False, - hidden_dropout_prob=hidden_dropout_prob, - compute_type=compute_type) - self.cross_attention = SelfAttention( - batch_size=batch_size, - hidden_size=hidden_size, - num_attention_heads=num_attention_heads, - attention_probs_dropout_prob=attention_probs_dropout_prob, - use_one_hot_embeddings=use_one_hot_embeddings, - initializer_range=initializer_range, - is_encdec_att=True, - hidden_dropout_prob=hidden_dropout_prob, - compute_type=compute_type) - self.feedforward = FeedForward( - in_channels=hidden_size, - hidden_size=intermediate_size, - out_channels=hidden_size, - hidden_act=hidden_act, - initializer_range=initializer_range, - hidden_dropout_prob=hidden_dropout_prob, - compute_type=compute_type) - - def construct(self, hidden_states, attention_mask, enc_states, enc_attention_mask, seq_length, enc_seq_length): - # self-attention with ln, res - attention_output = self.self_attention(hidden_states, hidden_states, attention_mask, seq_length, seq_length) - # cross-attention with ln, res - attention_output = self.cross_attention(attention_output, enc_states, enc_attention_mask, - seq_length, enc_seq_length) - # feed forward with ln, res - output = self.feedforward(attention_output) - return output - - -class TransformerDecoder(nn.Cell): - """ - Multi-layer transformer decoder. - - Args: - batch_size (int): Batch size of input dataset. - hidden_size (int): Size of the encoder layers. - seq_length (int): Length of input sequence. - enc_seq_length (int): Length of source sentences. - num_hidden_layers (int): Number of hidden layers in encoder cells. - num_attention_heads (int): Number of attention heads in encoder cells. Default: 16. - intermediate_size (int): Size of intermediate layer in encoder cells. Default: 4096. - attention_probs_dropout_prob (float): The dropout probability for - SelfAttention. Default: 0.1. - use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. - initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. - hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1. - hidden_act (str): Activation function used in the encoder cells. Default: "gelu". - compute_type (:class:`mindspore.dtype`): Compute type. Default: mstype.float32. 
- """ - def __init__(self, - batch_size, - hidden_size, - num_hidden_layers, - num_attention_heads=16, - intermediate_size=4096, - attention_probs_dropout_prob=0.1, - use_one_hot_embeddings=False, - initializer_range=0.02, - hidden_dropout_prob=0.1, - hidden_act="relu", - compute_type=mstype.float32): - super(TransformerDecoder, self).__init__() - self.num_hidden_layers = num_hidden_layers - - layers = [] - for _ in range(num_hidden_layers): - layer = DecoderCell(batch_size=batch_size, - hidden_size=hidden_size, - num_attention_heads=num_attention_heads, - intermediate_size=intermediate_size, - attention_probs_dropout_prob=attention_probs_dropout_prob, - use_one_hot_embeddings=use_one_hot_embeddings, - initializer_range=initializer_range, - hidden_dropout_prob=hidden_dropout_prob, - hidden_act=hidden_act, - compute_type=compute_type) - layers.append(layer) - self.layers = nn.CellList(layers) - - self.layer_preprocess = LayerPreprocess(in_channels=hidden_size) - - self.reshape = P.Reshape() - self.shape = (-1, hidden_size) - self.hidden_size = hidden_size - self.batch_size = batch_size - - def construct(self, input_tensor, attention_mask, enc_states, enc_attention_mask, seq_length, enc_seq_length): - """Apply decoder.""" - out_shape = (self.batch_size, seq_length, self.hidden_size) - prev_output = self.reshape(input_tensor, self.shape) - - for layer_module in self.layers: - layer_output = layer_module(prev_output, attention_mask, enc_states, enc_attention_mask, - seq_length, enc_seq_length) - prev_output = layer_output - - prev_output = self.layer_preprocess(prev_output) - output = self.reshape(prev_output, out_shape) - return output - - -class CreateAttentionMaskFromInputMask(nn.Cell): - """ - Create attention mask according to input mask. - - Args: - config (:class:`TransformerConfig`): Configuration for Transformer. - """ - def __init__(self): - super(CreateAttentionMaskFromInputMask, self).__init__() - self.cast = P.Cast() - self.reshape = P.Reshape() - self.shape = P.Shape() - self.batch_matmul = P.BatchMatMul() - - def construct(self, input_mask): - """Create attention mask according to input mask.""" - input_shape = self.shape(input_mask) - shape_right = (input_shape[0], 1, input_shape[1]) - shape_left = input_shape + (1,) - - input_mask = self.cast(input_mask, mstype.float32) - mask_left = self.reshape(input_mask, shape_left) - mask_right = self.reshape(input_mask, shape_right) - attention_mask = self.batch_matmul(mask_left, mask_right) - - return attention_mask - - -class PredLogProbs(nn.Cell): - """ - Get log probs. - - Args: - batch_size (int): Batch size. - seq_length (int): Length of input sequence. - width (int): Hidden size. - compute_type (:class:`mindspore.dtype`): Compute type. Default: mstype.float32. - dtype (:class:`mindspore.dtype`): Compute type to compute log_softmax. Default: mstype.float32. 
- """ - def __init__(self, - batch_size, - width, - compute_type=mstype.float32, - dtype=mstype.float32): - super(PredLogProbs, self).__init__() - self.batch_size = batch_size - self.width = width - self.compute_type = compute_type - self.dtype = dtype - - self.reshape = P.Reshape() - self.matmul = P.MatMul(transpose_b=True) - self.log_softmax = nn.LogSoftmax(axis=-1) - self.cast = P.Cast() - - def construct(self, - input_tensor, - output_weights, - seq_length): - """Get log probs.""" - shape_flat_sequence_tensor = (self.batch_size * seq_length, self.width) - - input_tensor = self.reshape(input_tensor, shape_flat_sequence_tensor) - input_tensor = self.cast(input_tensor, self.compute_type) - output_weights = self.cast(output_weights, self.compute_type) - - logits = self.matmul(input_tensor, output_weights) - logits = self.cast(logits, self.dtype) - - log_probs = self.log_softmax(logits) - return log_probs - - -class TransformerDecoderStep(nn.Cell): - """ - Multi-layer transformer decoder step. - - Args: - batch_size (int): Batch size of input dataset. - hidden_size (int): Size of the encoder layers. - max_decode_length (int): Max decode length. - enc_seq_length (int): Length of source sentences. - num_hidden_layers (int): Number of hidden layers in encoder cells. - num_attention_heads (int): Number of attention heads in encoder cells. Default: 16. - intermediate_size (int): Size of intermediate layer in encoder cells. Default: 4096. - attention_probs_dropout_prob (float): The dropout probability for - SelfAttention. Default: 0.1. - use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. - initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. - hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1. - hidden_act (str): Activation function used in the encoder cells. Default: "gelu". - compute_type (:class:`mindspore.dtype`): Compute type. Default: mstype.float32. - embedding_lookup (:class:`EmbeddingLookup`): Embedding lookup module. - embedding_processor (:class:`EmbeddingPostprocessor`) Embedding postprocessor module. 
- projection (:class:`PredLogProbs`): PredLogProbs module - """ - def __init__(self, - batch_size, - hidden_size, - max_decode_length, - num_hidden_layers, - num_attention_heads=16, - intermediate_size=4096, - attention_probs_dropout_prob=0.3, - use_one_hot_embeddings=False, - initializer_range=0.02, - hidden_dropout_prob=0.3, - hidden_act="relu", - compute_type=mstype.float32, - embedding_lookup=None, - embedding_processor=None, - projection=None): - super(TransformerDecoderStep, self).__init__(auto_prefix=False) - self.num_hidden_layers = num_hidden_layers - - self.tfm_embedding_lookup = embedding_lookup - self.tfm_embedding_processor = embedding_processor - self.projection = projection - - self.tfm_decoder = TransformerDecoder( - batch_size=batch_size, - hidden_size=hidden_size, - num_attention_heads=num_attention_heads, - num_hidden_layers=num_hidden_layers, - intermediate_size=intermediate_size, - attention_probs_dropout_prob=attention_probs_dropout_prob, - use_one_hot_embeddings=use_one_hot_embeddings, - initializer_range=initializer_range, - hidden_dropout_prob=hidden_dropout_prob, - hidden_act=hidden_act, - compute_type=compute_type) - - self.ones_like = P.OnesLike() - self.shape = P.Shape() - - self._create_attention_mask_from_input_mask = CreateAttentionMaskFromInputMask() - self.expand = P.ExpandDims() - self.multiply = P.Mul() - - ones = np.ones(shape=(max_decode_length, max_decode_length)) - self.future_mask = Tensor(np.tril(ones), dtype=mstype.float32) - - self.cast_compute_type = CastWrapper(dst_type=compute_type) - - def construct(self, input_ids, enc_states, enc_attention_mask, seq_length): - """ - Multi-layer transformer decoder step. - input_ids: [batch_size * beam_width] - """ - # process embedding - input_embedding, embedding_tables = self.tfm_embedding_lookup(input_ids) - input_embedding = self.tfm_embedding_processor(input_embedding) - input_embedding = self.cast_compute_type(input_embedding) - - input_shape = self.shape(input_ids) - input_len = input_shape[1] - future_mask = self.future_mask[0:input_len:1, 0:input_len:1] - - input_mask = self.ones_like(input_ids) - input_mask = self._create_attention_mask_from_input_mask(input_mask) - input_mask = self.multiply(input_mask, self.expand(future_mask, 0)) - input_mask = self.cast_compute_type(input_mask) - - enc_attention_mask = enc_attention_mask[::, 0:input_len:1, ::] - - # call TransformerDecoder - decoder_output = self.tfm_decoder(input_embedding, input_mask, enc_states, enc_attention_mask, -1, seq_length) - - # take the last step - decoder_output = decoder_output[::, input_len-1:input_len:1, ::] - - # projection and log_prob - log_probs = self.projection(decoder_output, embedding_tables, 1) - - return log_probs - - -@constexpr -def convert_np_to_tensor_encoder(seq_length): - ones = np.ones(shape=(seq_length, seq_length)) - return Tensor(np.tril(ones), dtype=mstype.float32) - - -class TransformerModel(nn.Cell): - """ - Transformer with encoder and decoder. - - Args: - config (Class): Configuration for Transformer. - is_training (bool): True for training mode. False for eval mode. - use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. 
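# During incremental decoding, TransformerDecoderStep builds its self-attention mask from two
# pieces: the square mask derived from the input mask (all ones here, via OnesLike) and a
# lower-triangular "future" mask, so position t may only attend to positions <= t. NumPy
# sketch (illustration only):
import numpy as np

def decoder_step_mask(input_mask):
    """input_mask: [batch, cur_len] of 0/1 for the tokens decoded so far."""
    m = np.asarray(input_mask, dtype=np.float32)
    cur_len = m.shape[1]
    square = m[:, :, None] * m[:, None, :]                            # [batch, len, len]
    future = np.tril(np.ones((cur_len, cur_len), dtype=np.float32))   # allow only <= t
    return square * future[None, :, :]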
- """ - def __init__(self, - config, - is_training, - use_one_hot_embeddings=False): - super(TransformerModel, self).__init__() - config = copy.deepcopy(config) - self.is_training = is_training - if not is_training: - config.hidden_dropout_prob = 0.0 - config.attention_probs_dropout_prob = 0.0 - - self.batch_size = config.batch_size - self.hidden_size = config.hidden_size - self.num_hidden_layers = config.num_hidden_layers - self.embedding_size = config.hidden_size - - self.last_idx = self.num_hidden_layers - 1 - self.beam_width = config.beam_width - self.max_decode_length = config.max_decode_length - - self.tfm_embedding_lookup = EmbeddingLookup( - vocab_size=config.vocab_size, - embedding_size=self.embedding_size, - use_one_hot_embeddings=use_one_hot_embeddings, - initializer_range=config.initializer_range) - self.tfm_embedding_postprocessor_for_encoder = EmbeddingPostprocessor( - embedding_size=self.embedding_size, - use_one_hot_embeddings=use_one_hot_embeddings, - initializer_range=0.02, - max_position_embeddings=config.max_position_embeddings, - dropout_prob=config.hidden_dropout_prob) - self.tfm_embedding_postprocessor_for_decoder = EmbeddingPostprocessor( - embedding_size=self.embedding_size, - use_one_hot_embeddings=use_one_hot_embeddings, - initializer_range=0.02, - max_position_embeddings=config.max_position_embeddings, - dropout_prob=config.hidden_dropout_prob) - self.tfm_encoder = TransformerEncoder( - batch_size=self.batch_size, - hidden_size=self.hidden_size, - num_attention_heads=config.num_attention_heads, - num_hidden_layers=self.num_hidden_layers, - intermediate_size=config.intermediate_size, - attention_probs_dropout_prob=config.attention_probs_dropout_prob, - use_one_hot_embeddings=use_one_hot_embeddings, - initializer_range=config.initializer_range, - hidden_dropout_prob=config.hidden_dropout_prob, - hidden_act=config.hidden_act, - compute_type=config.compute_type) - - if is_training: - self.projection = PredLogProbs( - batch_size=self.batch_size, - width=self.hidden_size, - compute_type=config.compute_type, - dtype=config.dtype) - self.tfm_decoder = TransformerDecoder( - batch_size=self.batch_size, - hidden_size=self.hidden_size, - num_attention_heads=config.num_attention_heads, - num_hidden_layers=self.num_hidden_layers, - intermediate_size=config.intermediate_size, - attention_probs_dropout_prob=config.attention_probs_dropout_prob, - use_one_hot_embeddings=use_one_hot_embeddings, - initializer_range=config.initializer_range, - hidden_dropout_prob=config.hidden_dropout_prob, - hidden_act=config.hidden_act, - compute_type=config.compute_type) - else: - self.projection = PredLogProbs( - batch_size=self.batch_size * config.beam_width, - width=self.hidden_size, - compute_type=config.compute_type, - dtype=config.dtype) - self.tfm_decoder = TransformerDecoderStep( - batch_size=self.batch_size * config.beam_width, - hidden_size=self.hidden_size, - max_decode_length=config.max_decode_length, - num_hidden_layers=config.num_hidden_layers, - num_attention_heads=config.num_attention_heads, - intermediate_size=config.intermediate_size, - attention_probs_dropout_prob=config.attention_probs_dropout_prob, - use_one_hot_embeddings=False, - initializer_range=config.initializer_range, - hidden_dropout_prob=config.hidden_dropout_prob, - hidden_act=config.hidden_act, - compute_type=config.compute_type, - embedding_lookup=self.tfm_embedding_lookup, - embedding_processor=self.tfm_embedding_postprocessor_for_decoder, - projection=self.projection) - self.tfm_decoder = BeamSearchDecoder( - 
batch_size=config.batch_size, - seq_length=config.seq_length, - vocab_size=config.vocab_size, - decoder=self.tfm_decoder, - beam_width=config.beam_width, - length_penalty_weight=config.length_penalty_weight, - max_decode_length=config.max_decode_length) - - self.tfm_decoder.add_flags(loop_can_unroll=True) - self.tile_beam = TileBeam(beam_width=self.beam_width) - ones = np.ones(shape=(self.batch_size, self.max_decode_length)) - self.encdec_mask = Tensor(ones, mstype.float32) - - self.cast = P.Cast() - self.dtype = config.dtype - self.cast_compute_type = CastWrapper(dst_type=config.compute_type) - self.expand = P.ExpandDims() - self.multiply = P.Mul() - self.shape = P.Shape() - - self._create_attention_mask_from_input_mask = CreateAttentionMaskFromInputMask() - - def construct(self, source_ids, source_mask, target_ids=None, target_mask=None): - """Transformer with encoder and decoder.""" - seq_length = self.shape(source_ids)[1] - - # process source sentence - src_word_embeddings, embedding_tables = self.tfm_embedding_lookup(source_ids) - src_embedding_output = self.tfm_embedding_postprocessor_for_encoder(src_word_embeddings) - # attention mask [batch_size, seq_length, seq_length] - enc_attention_mask = self._create_attention_mask_from_input_mask(source_mask) - # transformer encoder - encoder_output = self.tfm_encoder(self.cast_compute_type(src_embedding_output), - self.cast_compute_type(enc_attention_mask), - seq_length) - - if self.is_training: - future_mask = convert_np_to_tensor_encoder(seq_length) - # process target sentence - tgt_word_embeddings, _ = self.tfm_embedding_lookup(target_ids) - tgt_embedding_output = self.tfm_embedding_postprocessor_for_decoder(tgt_word_embeddings) - # attention mask [batch_size, seq_length, seq_length] - tgt_attention_mask = self._create_attention_mask_from_input_mask(target_mask) - tgt_attention_mask = self.multiply(tgt_attention_mask, self.expand(future_mask, 0)) - # transformer decoder - decoder_output = self.tfm_decoder(self.cast_compute_type(tgt_embedding_output), - self.cast_compute_type(tgt_attention_mask), - encoder_output, enc_attention_mask, - seq_length, seq_length) - # calculate logits and log_probs - log_probs = self.projection(decoder_output, embedding_tables, seq_length) - ret = log_probs - else: - beam_encoder_output = self.tile_beam(encoder_output) - - enc_attention_mask = self.multiply(enc_attention_mask[::, 0:1:1, ::], self.expand(self.encdec_mask, -1)) - - beam_enc_attention_mask = self.tile_beam(enc_attention_mask) - beam_enc_attention_mask = self.cast_compute_type(beam_enc_attention_mask) - predicted_ids = self.tfm_decoder(beam_encoder_output, beam_enc_attention_mask) - ret = predicted_ids - return ret diff --git a/tests/st/model_zoo_tests/transformer/src/weight_init.py b/tests/st/model_zoo_tests/transformer/src/weight_init.py deleted file mode 100644 index 460a1c67c43..00000000000 --- a/tests/st/model_zoo_tests/transformer/src/weight_init.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Weight init utilities.""" - -import math -import numpy as np -from mindspore.common.tensor import Tensor - -def _average_units(shape): - """ - Average shape dim. - """ - if not shape: - return 1. - if len(shape) == 1: - return float(shape[0]) - if len(shape) == 2: - return float(shape[0] + shape[1]) / 2. - raise RuntimeError("not support shape.") - -def weight_variable(shape): - scale_shape = shape - avg_units = _average_units(scale_shape) - scale = 1.0 / max(1., avg_units) - limit = math.sqrt(3.0 * scale) - values = np.random.uniform(-limit, limit, shape).astype(np.float32) - return Tensor(values) - -def one_weight(shape): - ones = np.ones(shape).astype(np.float32) - return Tensor(ones) - -def zero_weight(shape): - zeros = np.zeros(shape).astype(np.float32) - return Tensor(zeros) - -def normal_weight(shape, num_units): - norm = np.random.normal(0.0, num_units**-0.5, shape).astype(np.float32) - return Tensor(norm) - \ No newline at end of file diff --git a/tests/st/model_zoo_tests/transformer/test_transformer.py b/tests/st/model_zoo_tests/transformer/test_transformer.py index 7ee8769944a..8ace3c49c2d 100644 --- a/tests/st/model_zoo_tests/transformer/test_transformer.py +++ b/tests/st/model_zoo_tests/transformer/test_transformer.py @@ -27,14 +27,30 @@ from mindspore.train.callback import Callback import mindspore.dataset as ds import mindspore.dataset.transforms.c_transforms as deC from mindspore import context -from src.transformer_model import TransformerConfig -from src.transformer_for_train import TransformerNetworkWithLoss, TransformerTrainOneStepWithLossScaleCell -from src.config import cfg, transformer_net_cfg -from src.lr_schedule import create_dynamic_lr +from easydict import EasyDict as edict +from model_zoo.official.nlp.transformer.src.transformer_model import TransformerConfig +from model_zoo.official.nlp.transformer.src.transformer_for_train import TransformerNetworkWithLoss, TransformerTrainOneStepWithLossScaleCell +from model_zoo.official.nlp.transformer.src.lr_schedule import create_dynamic_lr from tests.st.model_zoo_tests import utils + DATA_DIR = ["/home/workspace/mindspore_dataset/transformer/test-mindrecord"] +cfg = edict({ + 'transformer_network': 'large', + 'init_loss_scale_value': 1024, + 'scale_factor': 2, + 'scale_window': 2000, + 'optimizer': 'Adam', + 'optimizer_adam_beta2': 0.997, + 'lr_schedule': edict({ + 'learning_rate': 2.0, + 'warmup_steps': 8000, + 'start_decay_step': 16000, + 'min_lr': 0.0, + }), +}) + def get_config(version='base', batch_size=1): """get config""" @@ -129,7 +145,7 @@ class TimeMonitor(Callback): self.per_step_mseconds_list.append(epoch_mseconds / self.data_size) -@pytest.mark.level2 +@pytest.mark.level0 @pytest.mark.platform_arm_ascend_training @pytest.mark.platform_x86_ascend_training @pytest.mark.env_onecard @@ -144,7 +160,7 @@ def test_transformer(): batch_size = 96 epoch_size = 3 config = get_config(version=version, batch_size=batch_size) - dataset = load_test_data(batch_size=transformer_net_cfg.batch_size, data_file=DATA_DIR) + dataset = load_test_data(batch_size=config.batch_size, data_file=DATA_DIR) netwithloss = TransformerNetworkWithLoss(config, True) @@ -171,7 +187,7 @@ def test_transformer(): # assertion occurs while the loss value, overflow state or loss_scale value is wrong loss_value = np.array(callback.loss_list) - assert 
np.allclose(loss_value[0], 11.241606, 0, 0.000005) + assert np.allclose(loss_value[0], 11.241601, 0, 0.000005) expect_loss_value = [11.241606, 11.243232, 11.217459, 11.204157, 11.213804, 11.215373, 11.190564, 11.150393, 11.191823, 11.160045] @@ -201,7 +217,7 @@ def test_transformer(): assert per_step_mseconds <= expect_per_step_mseconds + 10 -@pytest.mark.level1 +@pytest.mark.level0 @pytest.mark.platform_arm_ascend_training @pytest.mark.platform_x86_ascend_training @pytest.mark.env_onecard
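# The test above builds its optimizer schedule with create_dynamic_lr and the cfg.lr_schedule
# values (learning_rate=2.0, warmup_steps=8000, start_decay_step=16000, min_lr=0.0). The exact
# implementation lives in the model_zoo lr_schedule.py and is not shown here; the sketch below
# is only a plausible warmup-then-inverse-sqrt-decay schedule using those same parameters plus
# a hypothetical hidden_size factor, not the original code:
def illustrative_dynamic_lr(step, base_lr=2.0, hidden_size=1024, warmup_steps=8000,
                            start_decay_step=16000, min_lr=0.0):
    step = max(int(step), 1)
    warmup = min(1.0, step / warmup_steps)       # linear warmup to the base scale
    decay_step = max(step, start_decay_step)     # hold flat until decay starts
    lr = base_lr * hidden_size ** -0.5 * warmup * decay_step ** -0.5
    return max(lr, min_lr)                       # never drop below min_lr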