diff --git a/tests/st/model_zoo_tests/transformer/__init__.py b/tests/st/model_zoo_tests/transformer/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/tests/st/model_zoo_tests/transformer/src/__init__.py b/tests/st/model_zoo_tests/transformer/src/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/tests/st/model_zoo_tests/transformer/src/beam_search.py b/tests/st/model_zoo_tests/transformer/src/beam_search.py
deleted file mode 100644
index 53c765f7223..00000000000
--- a/tests/st/model_zoo_tests/transformer/src/beam_search.py
+++ /dev/null
@@ -1,281 +0,0 @@
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-"""Transformer beam search module."""
-
-import numpy as np
-import mindspore.common.dtype as mstype
-import mindspore.nn as nn
-from mindspore.ops import operations as P
-from mindspore.common.tensor import Tensor
-
-INF = 1. * 1e9
-
-class LengthPenalty(nn.Cell):
- """
- Normalize scores of translations according to their length.
-
- Args:
- weight (float): Weight of length penalty. Default: 1.0.
- compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mstype.float32.
- """
- def __init__(self,
- weight=1.0,
- compute_type=mstype.float32):
- super(LengthPenalty, self).__init__()
- self.weight = weight
- self.add = P.Add()
- self.pow = P.Pow()
- self.div = P.RealDiv()
- self.cast = P.Cast()
- self.five = Tensor(5.0, mstype.float32)
- self.six = Tensor(6.0, mstype.float32)
-
- def construct(self, length_tensor):
- length_tensor = self.cast(length_tensor, mstype.float32)
- output = self.add(length_tensor, self.five)
- output = self.div(output, self.six)
- output = self.pow(output, self.weight)
- return output
-
-
-class TileBeam(nn.Cell):
- """
- TileBeam.
-
- Args:
- beam_width (int): beam width setting. Default: 4.
- compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mstype.float32.
- """
- def __init__(self,
- beam_width,
- compute_type=mstype.float32):
- super(TileBeam, self).__init__()
- self.beam_width = beam_width
- self.expand = P.ExpandDims()
- self.tile = P.Tile()
- self.reshape = P.Reshape()
- self.shape = P.Shape()
-
- def construct(self, input_tensor):
- """
- input_tensor: shape [batch, dim1, dim2]
- output_tensor: shape [batch*beam, dim1, dim2]
- """
- shape = self.shape(input_tensor)
- input_tensor = self.expand(input_tensor, 1)
- tile_shape = (1,) + (self.beam_width,)
- for _ in range(len(shape)-1):
- tile_shape = tile_shape + (1,)
- output = self.tile(input_tensor, tile_shape)
- out_shape = (shape[0]*self.beam_width,) + shape[1:]
- output = self.reshape(output, out_shape)
- return output
-
-
-class Mod(nn.Cell):
- """
- Mod function.
-
- Args:
- compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mstype.float32.
- """
- def __init__(self,
- compute_type=mstype.float32):
- super(Mod, self).__init__()
- self.compute_type = compute_type
- self.floor_div = P.FloorDiv()
- self.sub = P.Sub()
- self.multiply = P.Mul()
-
- def construct(self, input_x, input_y):
- x = self.floor_div(input_x, input_y)
- x = self.multiply(x, input_y)
- x = self.sub(input_x, x)
- return x
-
-
-class BeamSearchDecoder(nn.Cell):
- """
- Beam search decoder.
-
- Args:
- batch_size (int): Batch size of input dataset.
- seq_length (int): Length of input sequence.
- vocab_size (int): Size of vocabulary.
- decoder (:class:`TransformerDecoderStep`): Decoder module.
- beam_width (int): beam width setting. Default: 4.
- length_penalty_weight (float): Weight of length penalty. Default: 1.0.
- max_decode_length (int): max decode length. Default: 128.
- sos_id (int): Id of sequence start token. Default: 1.
- eos_id (int): Id of sequence end token. Default: 2.
- compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mstype.float32.
- """
- def __init__(self,
- batch_size,
- seq_length,
- vocab_size,
- decoder,
- beam_width=4,
- length_penalty_weight=1.0,
- max_decode_length=128,
- sos_id=1,
- eos_id=2,
- compute_type=mstype.float32):
- super(BeamSearchDecoder, self).__init__(auto_prefix=False)
- self.seq_length = seq_length
- self.batch_size = batch_size
- self.vocab_size = vocab_size
- self.beam_width = beam_width
- self.length_penalty_weight = length_penalty_weight
- self.max_decode_length = max_decode_length
- self.decoder = decoder
-
- self.add = P.Add()
- self.expand = P.ExpandDims()
- self.reshape = P.Reshape()
- self.shape_flat = (-1,)
- self.shape = P.Shape()
-
- self.zero_tensor = Tensor(np.zeros([batch_size, beam_width]), mstype.float32)
- self.ninf_tensor = Tensor(np.full([batch_size, beam_width], -INF), mstype.float32)
-
- self.select = P.Select()
- self.flat_shape = (batch_size, beam_width * vocab_size)
- self.topk = P.TopK(sorted=True)
- self.floor_div = P.FloorDiv()
- self.vocab_size_tensor = Tensor(self.vocab_size, mstype.int32)
- self.real_div = P.RealDiv()
- self.mod = Mod()
- self.equal = P.Equal()
- self.eos_ids = Tensor(np.full([batch_size, beam_width], eos_id), mstype.int32)
-
- beam_ids = np.tile(np.arange(beam_width).reshape((1, beam_width)), [batch_size, 1])
- self.beam_ids = Tensor(beam_ids, mstype.int32)
- batch_ids = np.arange(batch_size*beam_width).reshape((batch_size, beam_width)) // beam_width
- self.batch_ids = Tensor(batch_ids, mstype.int32)
- self.concat = P.Concat(axis=-1)
- self.gather_nd = P.GatherNd()
-
- self.greater_equal = P.GreaterEqual()
- self.sub = P.Sub()
- self.cast = P.Cast()
- self.zeroslike = P.ZerosLike()
-
- # init inputs and states
- self.start_ids = Tensor(np.full([batch_size * beam_width, 1], sos_id), mstype.int32)
- self.init_seq = Tensor(np.full([batch_size, beam_width, 1], sos_id), mstype.int32)
- init_scores = np.tile(np.array([[0.] + [-INF]*(beam_width-1)]), [batch_size, 1])
- self.init_scores = Tensor(init_scores, mstype.float32)
- self.init_finished = Tensor(np.zeros([batch_size, beam_width], dtype=np.bool))
- self.init_length = Tensor(np.zeros([batch_size, beam_width], dtype=np.int32))
- self.length_penalty = LengthPenalty(weight=length_penalty_weight)
- self.one = Tensor(1, mstype.int32)
-
- def one_step(self, cur_input_ids, enc_states, enc_attention_mask, state_log_probs,
- state_seq, state_finished, state_length):
- """
- One step for decode
- """
- log_probs = self.decoder(cur_input_ids, enc_states, enc_attention_mask, self.seq_length)
- log_probs = self.reshape(log_probs, (self.batch_size, self.beam_width, self.vocab_size))
-
- # select topk indices
- total_log_probs = self.add(log_probs, self.expand(state_log_probs, -1))
-
- # mask finished beams
- mask_tensor = self.select(state_finished, self.ninf_tensor, self.zero_tensor)
- total_log_probs = self.add(total_log_probs, self.expand(mask_tensor, -1))
-
- # reshape scores to [batch, beam*vocab]
- flat_scores = self.reshape(total_log_probs, self.flat_shape)
- # select topk
- topk_scores, topk_indices = self.topk(flat_scores, self.beam_width)
-
- temp = topk_indices
- beam_indices = self.zeroslike(topk_indices)
- for _ in range(self.beam_width - 1):
- temp = self.sub(temp, self.vocab_size_tensor)
- res = self.cast(self.greater_equal(temp, 0), mstype.int32)
- beam_indices = beam_indices + res
- word_indices = topk_indices - beam_indices * self.vocab_size_tensor
- #======================================================================
-
- # mask finished indices
- beam_indices = self.select(state_finished, self.beam_ids, beam_indices)
- word_indices = self.select(state_finished, self.eos_ids, word_indices)
- topk_scores = self.select(state_finished, state_log_probs, topk_scores)
-
- ###### put finished sequences to the end
- # sort according to scores with -inf for finished beams
- tmp_log_probs = self.select(
- self.equal(word_indices, self.eos_ids),
- self.ninf_tensor,
- topk_scores)
- _, tmp_indices = self.topk(tmp_log_probs, self.beam_width)
- # update
- tmp_gather_indices = self.concat((self.expand(self.batch_ids, -1), self.expand(tmp_indices, -1)))
- beam_indices = self.gather_nd(beam_indices, tmp_gather_indices)
- word_indices = self.gather_nd(word_indices, tmp_gather_indices)
- topk_scores = self.gather_nd(topk_scores, tmp_gather_indices)
-
- ###### generate new beam_search states
- # gather indices for selecting alive beams
- gather_indices = self.concat((self.expand(self.batch_ids, -1), self.expand(beam_indices, -1)))
-
- # length add 1 if not finished in the previous step
- length_add = self.add(state_length, self.one)
- state_length = self.select(state_finished, state_length, length_add)
- state_length = self.gather_nd(state_length, gather_indices)
-
- # concat seq
- seq = self.gather_nd(state_seq, gather_indices)
- state_seq = self.concat((seq, self.expand(word_indices, -1)))
-
- # new finished flag and log_probs
- state_finished = self.equal(word_indices, self.eos_ids)
- state_log_probs = topk_scores
-
- ###### generate new inputs and decoder states
- cur_input_ids = self.reshape(state_seq, (self.batch_size*self.beam_width, -1))
- return cur_input_ids, state_log_probs, state_seq, state_finished, state_length
-
- def construct(self, enc_states, enc_attention_mask):
- """Get beam search result."""
- cur_input_ids = self.start_ids
- # beam search states
- state_log_probs = self.init_scores
- state_seq = self.init_seq
- state_finished = self.init_finished
- state_length = self.init_length
-
- for _ in range(self.max_decode_length):
- # run one step decoder to get outputs of the current step
- # shape [batch*beam, 1, vocab]
- cur_input_ids, state_log_probs, state_seq, state_finished, state_length = self.one_step(
- cur_input_ids, enc_states, enc_attention_mask, state_log_probs, state_seq, state_finished, state_length)
-
- # add length penalty scores
- penalty_len = self.length_penalty(state_length)
- # get penalty length
- log_probs = self.real_div(state_log_probs, penalty_len)
-
- # sort according to scores
- _, top_beam_indices = self.topk(log_probs, self.beam_width)
- gather_indices = self.concat((self.expand(self.batch_ids, -1), self.expand(top_beam_indices, -1)))
- # sort sequence
- predicted_ids = self.gather_nd(state_seq, gather_indices)
- # take the first one
- predicted_ids = predicted_ids[::, 0:1:1, ::]
- return predicted_ids
diff --git a/tests/st/model_zoo_tests/transformer/src/config.py b/tests/st/model_zoo_tests/transformer/src/config.py
deleted file mode 100644
index 58d5ee5f721..00000000000
--- a/tests/st/model_zoo_tests/transformer/src/config.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-"""Network config setting, will be used in dataset.py, train.py."""
-
-from easydict import EasyDict as edict
-import mindspore.common.dtype as mstype
-from .transformer_model import TransformerConfig
-cfg = edict({
- 'transformer_network': 'large',
- 'init_loss_scale_value': 1024,
- 'scale_factor': 2,
- 'scale_window': 2000,
- 'optimizer': 'Adam',
- 'optimizer_adam_beta2': 0.997,
- 'lr_schedule': edict({
- 'learning_rate': 2.0,
- 'warmup_steps': 8000,
- 'start_decay_step': 16000,
- 'min_lr': 0.0,
- }),
-})
-'''
-two kinds of transformer model version
-'''
-if cfg.transformer_network == 'large':
- transformer_net_cfg = TransformerConfig(
- batch_size=96,
- seq_length=128,
- vocab_size=36560,
- hidden_size=1024,
- num_hidden_layers=6,
- num_attention_heads=16,
- intermediate_size=4096,
- hidden_act="relu",
- hidden_dropout_prob=0.2,
- attention_probs_dropout_prob=0.2,
- max_position_embeddings=128,
- initializer_range=0.02,
- label_smoothing=0.1,
- dtype=mstype.float32,
- compute_type=mstype.float16)
- transformer_net_cfg_gpu = TransformerConfig(
- batch_size=32,
- seq_length=128,
- vocab_size=36560,
- hidden_size=1024,
- num_hidden_layers=6,
- num_attention_heads=16,
- intermediate_size=4096,
- hidden_act="relu",
- hidden_dropout_prob=0.2,
- attention_probs_dropout_prob=0.2,
- max_position_embeddings=128,
- initializer_range=0.02,
- label_smoothing=0.1,
- dtype=mstype.float32,
- compute_type=mstype.float16)
-if cfg.transformer_network == 'base':
- transformer_net_cfg = TransformerConfig(
- batch_size=96,
- seq_length=128,
- vocab_size=36560,
- hidden_size=512,
- num_hidden_layers=6,
- num_attention_heads=8,
- intermediate_size=2048,
- hidden_act="relu",
- hidden_dropout_prob=0.2,
- attention_probs_dropout_prob=0.2,
- max_position_embeddings=128,
- initializer_range=0.02,
- label_smoothing=0.1,
- dtype=mstype.float32,
- compute_type=mstype.float16)
diff --git a/tests/st/model_zoo_tests/transformer/src/dataset.py b/tests/st/model_zoo_tests/transformer/src/dataset.py
deleted file mode 100644
index b485fd7ddd6..00000000000
--- a/tests/st/model_zoo_tests/transformer/src/dataset.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-"""Data operations, will be used in train.py."""
-
-import mindspore.common.dtype as mstype
-import mindspore.dataset as de
-import mindspore.dataset.transforms.c_transforms as deC
-from .config import transformer_net_cfg, transformer_net_cfg_gpu
-de.config.set_seed(1)
-def create_transformer_dataset(epoch_count=1, rank_size=1, rank_id=0, do_shuffle="true", dataset_path=None,
- bucket_boundaries=None, device_target="Ascend"):
- """create dataset"""
- def batch_per_bucket(bucket_len, dataset_path):
- dataset_path = dataset_path + "_" + str(bucket_len) + "_00"
- ds = de.MindDataset(dataset_path,
- columns_list=["source_eos_ids", "source_eos_mask",
- "target_sos_ids", "target_sos_mask",
- "target_eos_ids", "target_eos_mask"],
- shuffle=(do_shuffle == "true"), num_shards=rank_size, shard_id=rank_id)
- type_cast_op = deC.TypeCast(mstype.int32)
- ds = ds.map(operations=type_cast_op, input_columns="source_eos_ids")
- ds = ds.map(operations=type_cast_op, input_columns="source_eos_mask")
- ds = ds.map(operations=type_cast_op, input_columns="target_sos_ids")
- ds = ds.map(operations=type_cast_op, input_columns="target_sos_mask")
- ds = ds.map(operations=type_cast_op, input_columns="target_eos_ids")
- ds = ds.map(operations=type_cast_op, input_columns="target_eos_mask")
-
- # apply batch operations
- if device_target == "Ascend":
- ds = ds.batch(transformer_net_cfg.batch_size, drop_remainder=True)
- else:
- ds = ds.batch(transformer_net_cfg_gpu.batch_size, drop_remainder=True)
-
- ds = ds.repeat(epoch_count)
- return ds
-
- for i, _ in enumerate(bucket_boundaries):
- bucket_len = bucket_boundaries[i]
- ds_per = batch_per_bucket(bucket_len, dataset_path)
- if i == 0:
- ds = ds_per
- else:
- ds = ds + ds_per
- ds = ds.shuffle(ds.get_dataset_size())
- ds.channel_name = 'transformer'
- return ds
diff --git a/tests/st/model_zoo_tests/transformer/src/eval_config.py b/tests/st/model_zoo_tests/transformer/src/eval_config.py
deleted file mode 100644
index 512e2c489a1..00000000000
--- a/tests/st/model_zoo_tests/transformer/src/eval_config.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-"""Network evaluation config setting, will be used in eval.py."""
-
-from easydict import EasyDict as edict
-import mindspore.common.dtype as mstype
-from .transformer_model import TransformerConfig
-
-cfg = edict({
- 'transformer_network': 'large',
- 'data_file': '/your/path/evaluation.mindrecord',
- 'model_file': '/your/path/checkpoint_file',
- 'output_file': '/your/path/output',
-})
-'''
-two kinds of transformer model version
-'''
-if cfg.transformer_network == 'large':
- transformer_net_cfg = TransformerConfig(
- batch_size=1,
- seq_length=128,
- vocab_size=36560,
- hidden_size=1024,
- num_hidden_layers=6,
- num_attention_heads=16,
- intermediate_size=4096,
- hidden_act="relu",
- hidden_dropout_prob=0.0,
- attention_probs_dropout_prob=0.0,
- max_position_embeddings=128,
- label_smoothing=0.1,
- beam_width=4,
- max_decode_length=80,
- length_penalty_weight=1.0,
- dtype=mstype.float32,
- compute_type=mstype.float16)
-if cfg.transformer_network == 'base':
- transformer_net_cfg = TransformerConfig(
- batch_size=1,
- seq_length=128,
- vocab_size=36560,
- hidden_size=512,
- num_hidden_layers=6,
- num_attention_heads=8,
- intermediate_size=2048,
- hidden_act="relu",
- hidden_dropout_prob=0.0,
- attention_probs_dropout_prob=0.0,
- max_position_embeddings=128,
- label_smoothing=0.1,
- beam_width=4,
- max_decode_length=80,
- length_penalty_weight=1.0,
- dtype=mstype.float32,
- compute_type=mstype.float16)
diff --git a/tests/st/model_zoo_tests/transformer/src/lr_schedule.py b/tests/st/model_zoo_tests/transformer/src/lr_schedule.py
deleted file mode 100644
index c246283478a..00000000000
--- a/tests/st/model_zoo_tests/transformer/src/lr_schedule.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-"""Learning rate utilities."""
-
-def linear_warmup(warmup_steps, current_step):
- return min([1.0, float(current_step)/float(warmup_steps)])
-
-def rsqrt_decay(warmup_steps, current_step):
- return float(max([current_step, warmup_steps])) ** -0.5
-
-def rsqrt_hidden(hidden_size):
- return float(hidden_size) ** -0.5
-
-def create_dynamic_lr(schedule, training_steps, learning_rate, warmup_steps, hidden_size,
- start_decay_step=0, min_lr=0.):
- """
- Generate dynamic learning rate.
- """
- if start_decay_step < warmup_steps:
- start_decay_step = warmup_steps
- lr = []
- for current_step in range(1, training_steps+1):
- cur_lr = 1.0
- for name in schedule.split("*"):
- if name == "constant":
- cur_lr *= float(learning_rate)
- elif name == "rsqrt_hidden":
- cur_lr *= rsqrt_hidden(hidden_size)
- elif name == "linear_warmup":
- cur_lr *= linear_warmup(warmup_steps, current_step)
- elif name == "rsqrt_decay":
- cur_lr *= rsqrt_decay(warmup_steps, current_step-start_decay_step+warmup_steps)
- else:
- raise ValueError("unknown learning rate schedule")
- if warmup_steps < current_step < start_decay_step:
- cur_lr = lr[-1]
- if current_step > warmup_steps:
- cur_lr = max([cur_lr, min_lr])
- lr.append(cur_lr)
- return lr
diff --git a/tests/st/model_zoo_tests/transformer/src/process_output.py b/tests/st/model_zoo_tests/transformer/src/process_output.py
deleted file mode 100644
index f69ea6a0d7d..00000000000
--- a/tests/st/model_zoo_tests/transformer/src/process_output.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-"""Convert ids to tokens."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import sys
-
-import tokenization
-
-# Explicitly set the encoding
-sys.stdin = open(sys.stdin.fileno(), mode='r', encoding='utf-8', buffering=True)
-sys.stdout = open(sys.stdout.fileno(), mode='w', encoding='utf-8', buffering=True)
-
-def main():
- parser = argparse.ArgumentParser(
- description="recore nbest with smoothed sentence-level bleu.")
- parser.add_argument("--vocab_file", type=str, default="", required=True, help="vocab file path.")
- args = parser.parse_args()
-
- tokenizer = tokenization.WhiteSpaceTokenizer(vocab_file=args.vocab_file)
-
- for line in sys.stdin:
- token_ids = [int(x) for x in line.strip().split()]
- tokens = tokenizer.convert_ids_to_tokens(token_ids)
- sent = " ".join(tokens)
- sent = sent.split("")[-1]
- sent = sent.split("")[0]
- print(sent.strip())
-
-if __name__ == "__main__":
- main()
diff --git a/tests/st/model_zoo_tests/transformer/src/tokenization.py b/tests/st/model_zoo_tests/transformer/src/tokenization.py
deleted file mode 100644
index b4121f6c365..00000000000
--- a/tests/st/model_zoo_tests/transformer/src/tokenization.py
+++ /dev/null
@@ -1,158 +0,0 @@
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-"""Tokenization utilities."""
-
-import sys
-import collections
-import unicodedata
-
-def convert_to_printable(text):
- """
- Converts `text` to a printable coding format.
- """
- if sys.version_info[0] == 3:
- if isinstance(text, str):
- return text
- if isinstance(text, bytes):
- return text.decode("utf-8", "ignore")
- raise ValueError("Only support type `str` or `bytes`, while text type is `%s`" % (type(text)))
- if sys.version_info[0] == 2:
- if isinstance(text, str):
- return text
- if isinstance(text, unicode):
- return text.encode("utf-8")
- raise ValueError("Only support type `str` or `unicode`, while text type is `%s`" % (type(text)))
- raise ValueError("Only supported when running on Python2 or Python3.")
-
-
-def convert_to_unicode(text):
- """
- Converts `text` to Unicode format.
- """
- if sys.version_info[0] == 3:
- if isinstance(text, str):
- return text
- if isinstance(text, bytes):
- return text.decode("utf-8", "ignore")
- raise ValueError("Only support type `str` or `bytes`, while text type is `%s`" % (type(text)))
- if sys.version_info[0] == 2:
- if isinstance(text, str):
- return text.decode("utf-8", "ignore")
- if isinstance(text, unicode):
- return text
- raise ValueError("Only support type `str` or `unicode`, while text type is `%s`" % (type(text)))
- raise ValueError("Only supported when running on Python2 or Python3.")
-
-
-def load_vocab_file(vocab_file):
- """
- Loads a vocabulary file and turns into a {token:id} dictionary.
- """
- vocab_dict = collections.OrderedDict()
- index = 0
- with open(vocab_file, "r") as vocab:
- while True:
- token = convert_to_unicode(vocab.readline())
- if not token:
- break
- token = token.strip()
- vocab_dict[token] = index
- index += 1
- return vocab_dict
-
-
-def convert_by_vocab_dict(vocab_dict, items):
- """
- Converts a sequence of [tokens|ids] according to the vocab dict.
- """
- output = []
- for item in items:
- if item in vocab_dict:
- output.append(vocab_dict[item])
- else:
- output.append(vocab_dict[""])
- return output
-
-
-class WhiteSpaceTokenizer():
- """
- Whitespace tokenizer.
- """
- def __init__(self, vocab_file):
- self.vocab_dict = load_vocab_file(vocab_file)
- self.inv_vocab_dict = {index: token for token, index in self.vocab_dict.items()}
-
- def _is_whitespace_char(self, char):
- """
- Checks if it is a whitespace character(regard "\t", "\n", "\r" as whitespace here).
- """
- if char in (" ", "\t", "\n", "\r"):
- return True
- uni = unicodedata.category(char)
- if uni == "Zs":
- return True
- return False
-
- def _is_control_char(self, char):
- """
- Checks if it is a control character.
- """
- if char in ("\t", "\n", "\r"):
- return False
- uni = unicodedata.category(char)
- if uni in ("Cc", "Cf"):
- return True
- return False
-
- def _clean_text(self, text):
- """
- Remove invalid characters and cleanup whitespace.
- """
- output = []
- for char in text:
- cp = ord(char)
- if cp == 0 or cp == 0xfffd or self._is_control_char(char):
- continue
- if self._is_whitespace_char(char):
- output.append(" ")
- else:
- output.append(char)
- return "".join(output)
-
- def _whitespace_tokenize(self, text):
- """
- Clean whitespace and split text into tokens.
- """
- text = text.strip()
- if not text:
- tokens = []
- else:
- tokens = text.split()
- return tokens
-
- def tokenize(self, text):
- """
- Tokenizes text.
- """
- text = convert_to_unicode(text)
- text = self._clean_text(text)
- tokens = self._whitespace_tokenize(text)
- return tokens
-
- def convert_tokens_to_ids(self, tokens):
- return convert_by_vocab_dict(self.vocab_dict, tokens)
-
- def convert_ids_to_tokens(self, ids):
- return convert_by_vocab_dict(self.inv_vocab_dict, ids)
diff --git a/tests/st/model_zoo_tests/transformer/src/transformer_for_train.py b/tests/st/model_zoo_tests/transformer/src/transformer_for_train.py
deleted file mode 100644
index 153a98d621f..00000000000
--- a/tests/st/model_zoo_tests/transformer/src/transformer_for_train.py
+++ /dev/null
@@ -1,472 +0,0 @@
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-"""Transformer for training."""
-import numpy as np
-
-from mindspore.common.initializer import initializer
-import mindspore.nn as nn
-from mindspore.ops import operations as P
-from mindspore.ops import functional as F
-from mindspore.ops import composite as C
-from mindspore.common.tensor import Tensor
-from mindspore.common.parameter import Parameter
-from mindspore.common import dtype as mstype
-from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
-from mindspore.communication.management import get_group_size
-from mindspore.context import ParallelMode
-from mindspore import context
-
-from .transformer_model import TransformerModel
-
-GRADIENT_CLIP_TYPE = 1
-GRADIENT_CLIP_VALUE = 5.0
-
-clip_grad = C.MultitypeFuncGraph("clip_grad")
-
-
-@clip_grad.register("Number", "Number", "Tensor")
-def _clip_grad(clip_type, clip_value, grad):
- """
- Clip gradients.
-
- Inputs:
- clip_type (int): The way to clip, 0 for 'value', 1 for 'norm'.
- clip_value (float): Specifies how much to clip.
- grad (tuple[Tensor]): Gradients.
-
- Outputs:
- tuple[Tensor], clipped gradients.
- """
- if clip_type not in (0, 1):
- return grad
- dt = F.dtype(grad)
- if clip_type == 0:
- new_grad = C.clip_by_value(grad, F.cast(F.tuple_to_array((-clip_value,)), dt),
- F.cast(F.tuple_to_array((clip_value,)), dt))
- else:
- new_grad = nn.ClipByNorm()(grad, F.cast(F.tuple_to_array((clip_value,)), dt))
- return new_grad
-
-
-class TransformerTrainingLoss(nn.Cell):
- """
- Provide transformer training loss.
-
- Args:
- config (TransformerConfig): The config of Transformer.
-
- Returns:
- Tensor, total loss.
- """
- def __init__(self, config):
- super(TransformerTrainingLoss, self).__init__(auto_prefix=False)
- self.vocab_size = config.vocab_size
- self.onehot = P.OneHot()
- self.on_value = Tensor(float(1 - config.label_smoothing), mstype.float32)
- self.off_value = Tensor(config.label_smoothing / float(self.vocab_size - 1), mstype.float32)
- self.reduce_sum = P.ReduceSum()
- self.reduce_mean = P.ReduceMean()
- self.reshape = P.Reshape()
- self.last_idx = (-1,)
- self.flatten = P.Flatten()
- self.neg = P.Neg()
- self.cast = P.Cast()
- self.batch_size = config.batch_size
-
- def construct(self, prediction_scores, label_ids, label_weights, seq_length):
- """Defines the computation performed."""
- flat_shape = (self.batch_size * seq_length,)
- label_ids = self.reshape(label_ids, flat_shape)
- label_weights = self.cast(self.reshape(label_weights, flat_shape), mstype.float32)
- one_hot_labels = self.onehot(label_ids, self.vocab_size, self.on_value, self.off_value)
-
- per_example_loss = self.neg(self.reduce_sum(prediction_scores * one_hot_labels, self.last_idx))
- numerator = self.reduce_sum(label_weights * per_example_loss, ())
- denominator = self.reduce_sum(label_weights, ()) + \
- self.cast(F.tuple_to_array((1e-5,)), mstype.float32)
- loss = numerator / denominator
- return loss
-
-
-class TransformerNetworkWithLoss(nn.Cell):
- """
- Provide transformer training loss through network.
-
- Args:
- config (TransformerConfig): The config of Transformer.
- is_training (bool): Specifies whether to use the training mode.
- use_one_hot_embeddings (bool): Specifies whether to use one-hot for embeddings. Default: False.
-
- Returns:
- Tensor, the loss of the network.
- """
- def __init__(self, config, is_training, use_one_hot_embeddings=False):
- super(TransformerNetworkWithLoss, self).__init__(auto_prefix=False)
- self.transformer = TransformerModel(config, is_training, use_one_hot_embeddings)
- self.loss = TransformerTrainingLoss(config)
- self.cast = P.Cast()
- self.shape = P.Shape()
-
- def construct(self,
- source_ids,
- source_mask,
- target_ids,
- target_mask,
- label_ids,
- label_weights):
- """Transformer network with loss."""
- prediction_scores = self.transformer(source_ids, source_mask, target_ids, target_mask)
- seq_length = self.shape(source_ids)[1]
- total_loss = self.loss(prediction_scores, label_ids, label_weights, seq_length)
- return self.cast(total_loss, mstype.float32)
-
-
-class TransformerTrainOneStepCell(nn.TrainOneStepCell):
- """
- Encapsulation class of transformer network training.
-
- Append an optimizer to the training network after that the construct
- function can be called to create the backward graph.
-
- Args:
- network (Cell): The training network. Note that loss function should have been added.
- optimizer (Optimizer): Optimizer for updating the weights.
- sens (Number): The adjust parameter. Default: 1.0.
- """
- def __init__(self, network, optimizer, sens=1.0):
- super(TransformerTrainOneStepCell, self).__init__(network, optimizer, sens)
-
- self.cast = P.Cast()
- self.hyper_map = C.HyperMap()
-
- def set_sens(self, value):
- self.sens = value
-
- def construct(self,
- source_eos_ids,
- source_eos_mask,
- target_sos_ids,
- target_sos_mask,
- target_eos_ids,
- target_eos_mask,):
- """Defines the computation performed."""
- source_ids = source_eos_ids
- source_mask = source_eos_mask
- target_ids = target_sos_ids
- target_mask = target_sos_mask
- label_ids = target_eos_ids
- label_weights = target_eos_mask
-
- weights = self.weights
- loss = self.network(source_ids,
- source_mask,
- target_ids,
- target_mask,
- label_ids,
- label_weights)
- grads = self.grad(self.network, weights)(source_ids,
- source_mask,
- target_ids,
- target_mask,
- label_ids,
- label_weights,
- self.cast(F.tuple_to_array((self.sens,)),
- mstype.float32))
- grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
- # apply grad reducer on grads
- grads = self.grad_reducer(grads)
- succ = self.optimizer(grads)
- return F.depend(loss, succ)
-
-
-grad_scale = C.MultitypeFuncGraph("grad_scale")
-reciprocal = P.Reciprocal()
-
-
-@grad_scale.register("Tensor", "Tensor")
-def tensor_grad_scale(scale, grad):
- return grad * F.cast(reciprocal(scale), F.dtype(grad))
-
-_grad_overflow = C.MultitypeFuncGraph("_grad_overflow")
-grad_overflow = P.FloatStatus()
-
-@_grad_overflow.register("Tensor")
-def _tensor_grad_overflow(grad):
- return grad_overflow(grad)
-
-class TransformerTrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell):
- """
- Encapsulation class of Transformer network training.
-
- Append an optimizer to the training network after that the construct
- function can be called to create the backward graph.
-
- Args:
- network (Cell): The training network. Note that loss function should have been added.
- optimizer (Optimizer): Optimizer for updating the weights.
- scale_update_cell (Cell): Cell to do the loss scale. Default: None.
- """
- def __init__(self, network, optimizer, scale_update_cell=None):
- super(TransformerTrainOneStepWithLossScaleCell, self).__init__(network, optimizer, scale_update_cell)
- self.cast = P.Cast()
- self.degree = 1
- if self.reducer_flag:
- self.degree = get_group_size()
- self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)
-
- self.loss_scale = None
- self.loss_scaling_manager = scale_update_cell
- if scale_update_cell:
- self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32))
-
- def construct(self,
- source_eos_ids,
- source_eos_mask,
- target_sos_ids,
- target_sos_mask,
- target_eos_ids,
- target_eos_mask,
- sens=None):
- """Defines the computation performed."""
- source_ids = source_eos_ids
- source_mask = source_eos_mask
- target_ids = target_sos_ids
- target_mask = target_sos_mask
- label_ids = target_eos_ids
- label_weights = target_eos_mask
-
- weights = self.weights
- loss = self.network(source_ids,
- source_mask,
- target_ids,
- target_mask,
- label_ids,
- label_weights)
- if sens is None:
- scaling_sens = self.loss_scale
- else:
- scaling_sens = sens
- status, scaling_sens = self.start_overflow_check(loss, scaling_sens)
- grads = self.grad(self.network, weights)(source_ids,
- source_mask,
- target_ids,
- target_mask,
- label_ids,
- label_weights,
- self.cast(scaling_sens,
- mstype.float32))
-
- # apply grad reducer on grads
- grads = self.grad_reducer(grads)
- grads = self.hyper_map(F.partial(grad_scale, scaling_sens * self.degree), grads)
- grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
-
- cond = self.get_overflow_status(status, grads)
- overflow = cond
- if sens is None:
- overflow = self.loss_scaling_manager(self.loss_scale, cond)
- if overflow:
- succ = False
- else:
- succ = self.optimizer(grads)
- ret = (loss, cond, scaling_sens)
- return F.depend(ret, succ)
-
-
-cast = P.Cast()
-add_grads = C.MultitypeFuncGraph("add_grads")
-
-
-@add_grads.register("Tensor", "Tensor")
-def _add_grads(accu_grad, grad):
- return accu_grad + cast(grad, mstype.float32)
-
-update_accu_grads = C.MultitypeFuncGraph("update_accu_grads")
-
-@update_accu_grads.register("Tensor", "Tensor")
-def _update_accu_grads(accu_grad, grad):
- succ = True
- return F.depend(succ, F.assign(accu_grad, cast(grad, mstype.float32)))
-
-accumulate_accu_grads = C.MultitypeFuncGraph("accumulate_accu_grads")
-
-@accumulate_accu_grads.register("Tensor", "Tensor")
-def _accumulate_accu_grads(accu_grad, grad):
- succ = True
- return F.depend(succ, F.assign_add(accu_grad, cast(grad, mstype.float32)))
-
-
-zeroslike = P.ZerosLike()
-reset_accu_grads = C.MultitypeFuncGraph("reset_accu_grads")
-
-
-@reset_accu_grads.register("Tensor")
-def _reset_accu_grads(accu_grad):
- succ = True
- return F.depend(succ, F.assign(accu_grad, zeroslike(accu_grad)))
-
-
-class TransformerTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell):
- """
- Encapsulation class of Transformer network training.
-
- Append an optimizer to the training network after that the construct
- function can be called to create the backward graph.
-
- To mimic higher batch size, gradients are accumulated N times before weight update.
-
- For distribution mode, allreduce will only be implemented in the weight updated step,
- i.e. the sub-step after gradients accumulated N times.
-
- Args:
- network (Cell): The training network. Note that loss function should have been added.
- optimizer (Optimizer): Optimizer for updating the weights.
- scale_update_cell (Cell): Cell to do the loss scale. Default: None.
- accumulation_steps (int): Number of accumulation steps before gradient update. The global batch size =
- batch_size * accumulation_steps. Default: 1.
- """
-
- def __init__(self, network, optimizer, scale_update_cell=None, accumulation_steps=8, enable_global_norm=False):
- super(TransformerTrainAccumulationAllReducePostWithLossScaleCell, self).__init__(auto_prefix=False)
- self.network = network
- self.network.set_grad()
- self.weights = optimizer.parameters
- self.optimizer = optimizer
- self.accumulation_steps = accumulation_steps
- self.enable_global_norm = enable_global_norm
- self.one = Tensor(np.array([1]).astype(np.int32))
- self.zero = Tensor(np.array([0]).astype(np.int32))
- self.local_step = Parameter(initializer(0, [1], mstype.int32))
- self.accu_grads = self.weights.clone(prefix="accu_grads", init='zeros')
- self.accu_overflow = Parameter(initializer(0, [1], mstype.int32))
- self.accu_loss = Parameter(initializer(0, [1], mstype.float32))
-
- self.grad = C.GradOperation(get_by_list=True, sens_param=True)
- self.reducer_flag = False
- self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
- if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
- self.reducer_flag = True
- self.grad_reducer = F.identity
- self.degree = 1
- if self.reducer_flag:
- self.degree = get_group_size()
- self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)
- self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
- self.overflow_reducer = F.identity
- if self.is_distributed:
- self.overflow_reducer = P.AllReduce()
- self.cast = P.Cast()
- self.alloc_status = P.NPUAllocFloatStatus()
- self.get_status = P.NPUGetFloatStatus()
- self.clear_status = P.NPUClearFloatStatus()
- self.reduce_sum = P.ReduceSum(keep_dims=False)
- self.base = Tensor(1, mstype.float32)
- self.less_equal = P.LessEqual()
- self.logical_or = P.LogicalOr()
- self.not_equal = P.NotEqual()
- self.select = P.Select()
- self.reshape = P.Reshape()
- self.hyper_map = C.HyperMap()
- self.loss_scale = None
- self.loss_scaling_manager = scale_update_cell
- if scale_update_cell:
- self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32))
-
- def construct(self,
- source_eos_ids,
- source_eos_mask,
- target_sos_ids,
- target_sos_mask,
- target_eos_ids,
- target_eos_mask,
- sens=None):
- """Defines the computation performed."""
- source_ids = source_eos_ids
- source_mask = source_eos_mask
- target_ids = target_sos_ids
- target_mask = target_sos_mask
- label_ids = target_eos_ids
- label_weights = target_eos_mask
-
- weights = self.weights
- loss = self.network(source_ids,
- source_mask,
- target_ids,
- target_mask,
- label_ids,
- label_weights)
- if sens is None:
- scaling_sens = self.loss_scale
- else:
- scaling_sens = sens
- # alloc status and clear should be right before gradoperation
- init = self.alloc_status()
- init = F.depend(init, loss)
- clear_status = self.clear_status(init)
- scaling_sens = F.depend(scaling_sens, clear_status)
- # update accumulation parameters
- is_accu_step = self.not_equal(self.local_step, self.accumulation_steps)
- self.local_step = self.select(is_accu_step, self.local_step + self.one, self.one)
- self.accu_loss = self.select(is_accu_step, self.accu_loss + loss, loss)
- mean_loss = self.accu_loss / self.local_step
- is_accu_step = self.not_equal(self.local_step, self.accumulation_steps)
-
- grads = self.grad(self.network, weights)(source_ids,
- source_mask,
- target_ids,
- target_mask,
- label_ids,
- label_weights,
- self.cast(scaling_sens,
- mstype.float32))
-
- accu_succ = self.hyper_map(accumulate_accu_grads, self.accu_grads, grads)
- mean_loss = F.depend(mean_loss, accu_succ)
-
- init = F.depend(init, mean_loss)
- get_status = self.get_status(init)
- init = F.depend(init, get_status)
- flag_sum = self.reduce_sum(init, (0,))
- overflow = self.less_equal(self.base, flag_sum)
- overflow = self.logical_or(self.not_equal(self.accu_overflow, self.zero), overflow)
- accu_overflow = self.select(overflow, self.one, self.zero)
- self.accu_overflow = self.select(is_accu_step, accu_overflow, self.zero)
-
- if is_accu_step:
- succ = False
- else:
- # apply grad reducer on grads
- grads = self.grad_reducer(self.accu_grads)
- scaling = scaling_sens * self.degree * self.accumulation_steps
- grads = self.hyper_map(F.partial(grad_scale, scaling), grads)
- if self.enable_global_norm:
- grads = C.clip_by_global_norm(grads, 1.0, None)
- else:
- grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
- accu_overflow = F.depend(accu_overflow, grads)
- accu_overflow = self.overflow_reducer(accu_overflow)
- overflow = self.less_equal(self.base, accu_overflow)
- accu_succ = self.hyper_map(reset_accu_grads, self.accu_grads)
- overflow = F.depend(overflow, accu_succ)
- overflow = self.reshape(overflow, (()))
- if sens is None:
- overflow = self.loss_scaling_manager(self.loss_scale, overflow)
- if overflow:
- succ = False
- else:
- succ = self.optimizer(grads)
-
- ret = (mean_loss, overflow, scaling_sens)
- return F.depend(ret, succ)
diff --git a/tests/st/model_zoo_tests/transformer/src/transformer_model.py b/tests/st/model_zoo_tests/transformer/src/transformer_model.py
deleted file mode 100644
index 5e0aa6aa5b7..00000000000
--- a/tests/st/model_zoo_tests/transformer/src/transformer_model.py
+++ /dev/null
@@ -1,1153 +0,0 @@
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-"""Transformer model."""
-
-import math
-import copy
-import numpy as np
-import mindspore.common.dtype as mstype
-import mindspore.nn as nn
-import mindspore.ops.functional as F
-from mindspore.ops import operations as P
-from mindspore.common.tensor import Tensor
-from mindspore.common.parameter import Parameter
-from mindspore.ops.primitive import constexpr
-from .beam_search import BeamSearchDecoder, TileBeam
-from .weight_init import normal_weight, weight_variable
-
-class TransformerConfig:
- """
- Configuration for `Transformer`.
-
- Args:
- batch_size (int): Batch size of input dataset.
- seq_length (int): Length of input sequence. Default: 128.
- vocab_size (int): The shape of each embedding vector. Default: 36560.
- hidden_size (int): Size of the layers. Default: 1024.
- num_hidden_layers (int): Number of hidden layers in the Transformer encoder/decoder
- cell. Default: 6.
- num_attention_heads (int): Number of attention heads in the Transformer
- encoder/decoder cell. Default: 16.
- intermediate_size (int): Size of intermediate layer in the Transformer
- encoder/decoder cell. Default: 4096.
- hidden_act (str): Activation function used in the Transformer encoder/decoder
- cell. Default: "relu".
- hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.3.
- attention_probs_dropout_prob (float): The dropout probability for
- MultiheadAttention. Default: 0.3.
- max_position_embeddings (int): Maximum length of sequences used in this
- model. Default: 128.
- initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02.
- label_smoothing (float): label smoothing setting. Default: 0.1
- beam_width (int): beam width setting. Default: 4
- max_decode_length (int): max decode length in evaluation. Default: 80
- length_penalty_weight (float): normalize scores of translations according to their length. Default: 1.0
- dtype (:class:`mindspore.dtype`): Data type of the input. Default: mstype.float32.
- compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mstype.float32.
- """
- def __init__(self,
- batch_size,
- seq_length=128,
- vocab_size=36560,
- hidden_size=1024,
- num_hidden_layers=6,
- num_attention_heads=16,
- intermediate_size=4096,
- hidden_act="relu",
- hidden_dropout_prob=0.3,
- attention_probs_dropout_prob=0.3,
- max_position_embeddings=128,
- initializer_range=0.02,
- label_smoothing=0.1,
- beam_width=4,
- max_decode_length=80,
- length_penalty_weight=1.0,
- dtype=mstype.float32,
- compute_type=mstype.float32):
- self.batch_size = batch_size
- self.seq_length = seq_length
- self.vocab_size = vocab_size
- self.hidden_size = hidden_size
- self.num_hidden_layers = num_hidden_layers
- self.num_attention_heads = num_attention_heads
- self.hidden_act = hidden_act
- self.intermediate_size = intermediate_size
- self.hidden_dropout_prob = hidden_dropout_prob
- self.attention_probs_dropout_prob = attention_probs_dropout_prob
- self.max_position_embeddings = max_position_embeddings
- self.initializer_range = initializer_range
- self.label_smoothing = label_smoothing
- self.beam_width = beam_width
- self.max_decode_length = max_decode_length
- self.length_penalty_weight = length_penalty_weight
- self.dtype = dtype
- self.compute_type = compute_type
-
-
-class EmbeddingLookup(nn.Cell):
- """
- A embeddings lookup table with a fixed dictionary and size.
-
- Args:
- vocab_size (int): Size of the dictionary of embeddings.
- embedding_size (int): The size of each embedding vector.
- use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False.
- initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02.
- """
- def __init__(self,
- vocab_size,
- embedding_size,
- use_one_hot_embeddings=False,
- initializer_range=0.02):
- super(EmbeddingLookup, self).__init__()
- self.vocab_size = vocab_size
- self.embedding_size = embedding_size
- self.use_one_hot_embeddings = use_one_hot_embeddings
- self.embedding_table = Parameter(normal_weight([vocab_size, embedding_size], embedding_size))
- self.expand = P.ExpandDims()
- self.shape_flat = (-1,)
- self.gather = P.Gather()
- self.one_hot = P.OneHot()
- self.on_value = Tensor(1.0, mstype.float32)
- self.off_value = Tensor(0.0, mstype.float32)
- self.array_mul = P.MatMul()
- self.reshape = P.Reshape()
- self.shape = P.Shape()
-
- def construct(self, input_ids):
- """Get a embeddings lookup table with a fixed dictionary and size."""
- input_shape = self.shape(input_ids)
-
- flat_ids = self.reshape(input_ids, self.shape_flat)
- if self.use_one_hot_embeddings:
- one_hot_ids = self.one_hot(flat_ids, self.vocab_size, self.on_value, self.off_value)
- output_for_reshape = self.array_mul(one_hot_ids, self.embedding_table)
- else:
- output_for_reshape = self.gather(self.embedding_table, flat_ids, 0)
-
- out_shape = input_shape + (self.embedding_size,)
- output = self.reshape(output_for_reshape, out_shape)
- return output, self.embedding_table
-
-
-def position_encoding(length,
- depth,
- min_timescale=1,
- max_timescale=1e4):
- """
- Create Tensor of sinusoids of different frequencies.
-
- Args:
- length (int): Length of the Tensor to create, i.e. Number of steps.
- depth (int): Hidden size.
- min_timescale (float): Default: 1.
- max_timescale (float): Default: 10000.
-
- Returns:
- Tensor of shape (length, depth)
- """
- depth = depth // 2
- positions = np.arange(length, dtype=np.float32)
- log_timescale_increment = (np.log(max_timescale / min_timescale) / (depth - 1))
- inv_timescales = min_timescale * np.exp(np.arange(depth, dtype=np.float32) * -log_timescale_increment)
- scaled_time = np.expand_dims(positions, 1) * np.expand_dims(inv_timescales, 0)
- x = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)
- return x
-
-
-class EmbeddingPostprocessor(nn.Cell):
- """
- Postprocessors apply positional embeddings to word embeddings.
-
- Args:
- embedding_size (int): The size of each embedding vector.
- use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False.
- initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02.
- max_position_embeddings (int): Maximum length of sequences used in this
- model. Default: 128.
- dropout_prob (float): The dropout probability. Default: 0.1.
- """
- def __init__(self,
- embedding_size,
- use_one_hot_embeddings=False,
- initializer_range=0.02,
- max_position_embeddings=128,
- dropout_prob=0.1):
- super(EmbeddingPostprocessor, self).__init__()
- self.scores_mul = Tensor([math.sqrt(float(embedding_size))], dtype=mstype.float32)
- self.multiply = P.Mul()
- self.add = P.Add()
- self.dropout = nn.Dropout(1 - dropout_prob, dtype=mstype.float32)
- self.use_dropout = dropout_prob > 0
- self.expand_dims = P.ExpandDims()
- self.position_embedding_table = Tensor(position_encoding(max_position_embeddings, embedding_size),
- mstype.float32)
- self.shape = P.Shape()
-
- def construct(self, word_embeddings):
- """Postprocessors apply positional embeddings to word embeddings."""
- input_shape = self.shape(word_embeddings)
- input_len = input_shape[1]
-
- output = self.multiply(word_embeddings, self.scores_mul)
-
- # add position embeddings
- position_embeddings = self.position_embedding_table[0:input_len:1, ::]
- position_embeddings = self.expand_dims(position_embeddings, 0)
- output = self.add(output, position_embeddings)
-
- if self.use_dropout:
- output = self.dropout(output)
- return output
-
-
-class CastWrapper(nn.Cell):
- """
- Cast wrapper.
- """
- def __init__(self, src_type=mstype.float32, dst_type=mstype.float32):
- super(CastWrapper, self).__init__()
- self.cast = P.Cast()
- self.dst_type = dst_type
-
- def construct(self, x):
- return self.cast(x, self.dst_type)
-
-
-class LayerPreprocess(nn.Cell):
- """
- preprocess input of each layer.
- """
- def __init__(self,
- in_channels=None):
- super(LayerPreprocess, self).__init__()
- self.layernorm = nn.LayerNorm((in_channels,))
- self.cast = P.Cast()
- self.get_dtype = P.DType()
-
- def construct(self, input_tensor):
- output = self.cast(input_tensor, mstype.float32)
- output = self.layernorm(output)
- output = self.cast(output, self.get_dtype(input_tensor))
- return output
-
-
-class LayerPostprocess(nn.Cell):
- """
- postprocess output of each layer.
- """
- def __init__(self,
- dropout_prob=0.1):
- super(LayerPostprocess, self).__init__()
- self.add = P.Add()
- self.dropout = nn.Dropout(1 - dropout_prob)
- self.use_dropout = dropout_prob > 0
-
- def construct(self, hidden_tensor, input_tensor):
- output = hidden_tensor
- if self.use_dropout:
- output = self.dropout(output)
- output = self.add(output, input_tensor)
- return output
-
-
-class MultiheadAttention(nn.Cell):
- """
- Apply multi-headed attention from "from_tensor" to "to_tensor".
-
- Args:
- batch_size (int): Batch size of input datasets.
- from_tensor_width (int): Size of last dim of from_tensor.
- to_tensor_width (int): Size of last dim of to_tensor.
- from_seq_length (int): Length of from_tensor sequence.
- to_seq_length (int): Length of to_tensor sequence.
- num_attention_heads (int): Number of attention heads. Default: 1.
- size_per_head (int): Size of each attention head. Default: 512.
- query_act (str): Activation function for the query transform. Default: None.
- key_act (str): Activation function for the key transform. Default: None.
- value_act (str): Activation function for the value transform. Default: None.
- has_attention_mask (bool): Specifies whether to use attention mask. Default: False.
- attention_probs_dropout_prob (float): The dropout probability for
- MultiheadAttention. Default: 0.0.
- use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False.
- initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02.
- do_return_2d_tensor (bool): True for return 2d tensor. False for return 3d
- tensor. Default: False.
- compute_type (:class:`mindspore.dtype`): Compute type in MultiheadAttention. Default: mstype.float32.
- """
- def __init__(self,
- batch_size,
- from_tensor_width,
- to_tensor_width,
- out_tensor_width,
- num_attention_heads=1,
- size_per_head=512,
- query_act=None,
- key_act=None,
- value_act=None,
- out_act=None,
- has_attention_mask=True,
- attention_probs_dropout_prob=0.0,
- use_one_hot_embeddings=False,
- initializer_range=0.02,
- do_return_2d_tensor=True,
- compute_type=mstype.float32):
- super(MultiheadAttention, self).__init__()
- self.batch_size = batch_size
- self.num_attention_heads = num_attention_heads
- self.size_per_head = size_per_head
- self.has_attention_mask = has_attention_mask
- assert has_attention_mask
- self.use_one_hot_embeddings = use_one_hot_embeddings
- self.initializer_range = initializer_range
- self.do_return_2d_tensor = do_return_2d_tensor
-
- self.scores_mul = Tensor([1.0 / math.sqrt(float(self.size_per_head))], dtype=compute_type)
- self.reshape = P.Reshape()
- self.shape_from_2d = (-1, from_tensor_width)
- self.shape_to_2d = (-1, to_tensor_width)
- units = num_attention_heads * size_per_head
- self.query_layer = nn.Dense(from_tensor_width,
- units,
- activation=query_act,
- has_bias=False,
- weight_init=weight_variable([units, from_tensor_width])).to_float(compute_type)
- self.key_layer = nn.Dense(to_tensor_width,
- units,
- activation=key_act,
- has_bias=False,
- weight_init=weight_variable([units, to_tensor_width])).to_float(compute_type)
- self.value_layer = nn.Dense(to_tensor_width,
- units,
- activation=value_act,
- has_bias=False,
- weight_init=weight_variable([units, to_tensor_width])).to_float(compute_type)
- self.out_layer = nn.Dense(units,
- out_tensor_width,
- activation=out_act,
- has_bias=False,
- weight_init=weight_variable([out_tensor_width, units])).to_float(compute_type)
-
- self.matmul_trans_b = P.BatchMatMul(transpose_b=True)
- self.multiply = P.Mul()
- self.transpose = P.Transpose()
- self.trans_shape = (0, 2, 1, 3)
- self.trans_shape_relative = (2, 0, 1, 3)
- self.trans_shape_position = (1, 2, 0, 3)
- self.multiply_data = Tensor([-10000.0,], dtype=compute_type)
- self.batch_num = batch_size * num_attention_heads
- self.matmul = P.BatchMatMul()
-
- self.softmax = nn.Softmax()
- self.dropout = nn.Dropout(1 - attention_probs_dropout_prob)
- self.use_dropout = attention_probs_dropout_prob > 0
-
- if self.has_attention_mask:
- self.expand_dims = P.ExpandDims()
- self.sub = P.Sub()
- self.add = P.Add()
- self.cast = P.Cast()
- self.get_dtype = P.DType()
-
- self.cast_compute_type = CastWrapper(dst_type=compute_type)
- self.softmax_cast = P.Cast()
-
- def construct(self, from_tensor, to_tensor, seq_length, enc_seq_length, attention_mask=None):
- """Apply multihead attention."""
- from_seq_length = seq_length
- to_seq_length = enc_seq_length
- shape_from = (self.batch_size, from_seq_length, self.num_attention_heads, self.size_per_head)
- shape_to = (self.batch_size, to_seq_length, self.num_attention_heads, self.size_per_head)
- if self.do_return_2d_tensor:
- shape_return = (self.batch_size * from_seq_length, self.num_attention_heads * self.size_per_head)
- if from_seq_length == -1:
- shape_return = (-1, self.num_attention_heads * self.size_per_head)
- else:
- shape_return = (self.batch_size, from_seq_length, self.num_attention_heads * self.size_per_head)
-
- # reshape 2d/3d input tensors to 2d
- from_tensor_2d = self.reshape(from_tensor, self.shape_from_2d)
- to_tensor_2d = self.reshape(to_tensor, self.shape_to_2d)
- query_out = self.query_layer(from_tensor_2d)
- key_out = self.key_layer(to_tensor_2d)
- value_out = self.value_layer(to_tensor_2d)
-
- query_layer = self.reshape(query_out, shape_from)
- query_layer = self.transpose(query_layer, self.trans_shape)
- key_layer = self.reshape(key_out, shape_to)
- key_layer = self.transpose(key_layer, self.trans_shape)
-
- attention_scores = self.matmul_trans_b(query_layer, key_layer)
- attention_scores = self.multiply(attention_scores, self.scores_mul)
-
- if self.has_attention_mask:
- attention_mask = self.expand_dims(attention_mask, 1)
- multiply_out = self.sub(self.cast(F.tuple_to_array((1.0,)), self.get_dtype(attention_scores)),
- self.cast(attention_mask, self.get_dtype(attention_scores)))
- adder = self.multiply(multiply_out, self.multiply_data)
- attention_scores = self.add(adder, attention_scores)
-
- attention_scores = self.softmax_cast(attention_scores, mstype.float32)
- attention_probs = self.softmax(attention_scores)
- attention_probs = self.softmax_cast(attention_probs, self.get_dtype(key_layer))
- if self.use_dropout:
- attention_probs = self.dropout(attention_probs)
-
- value_layer = self.reshape(value_out, shape_to)
- value_layer = self.transpose(value_layer, self.trans_shape)
- context_layer = self.matmul(attention_probs, value_layer)
-
- context_layer = self.transpose(context_layer, self.trans_shape)
- context_layer = self.reshape(context_layer, shape_return)
- context_layer = self.out_layer(context_layer)
- return context_layer
-
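-# Illustrative usage sketch (not part of the original network; the concrete sizes are
-# assumed for a "large"-style configuration, e.g. 1024 hidden units split over 16 heads):
-#
-#   attention = MultiheadAttention(batch_size=96,
-#                                  from_tensor_width=1024,
-#                                  to_tensor_width=1024,
-#                                  out_tensor_width=1024,
-#                                  num_attention_heads=16,
-#                                  size_per_head=64)
-#   # from_tensor/to_tensor are flattened to 2d internally; the query and memory
-#   # sequence lengths plus a [batch, seq, seq] attention mask are passed at call time.
-#   context = attention(from_tensor, to_tensor, seq_length, enc_seq_length, attention_mask)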
-
-class SelfAttention(nn.Cell):
- """
- Apply self-attention.
-
- Args:
- batch_size (int): Batch size of input dataset.
-        hidden_size (int): Size of attention layers.
- num_attention_heads (int): Number of attention heads. Default: 16.
- attention_probs_dropout_prob (float): The dropout probability for
- SelfAttention. Default: 0.1.
- use_one_hot_embeddings (bool): Specifies whether to use one_hot encoding form. Default: False.
- initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02.
- hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1.
- has_attention_mask (bool): Specifies whether has attention mask. Default: True.
- is_encdec_att (bool): Specifies whether query sequence and memory sequence are different. Default: False.
- compute_type (:class:`mindspore.dtype`): Compute type in MultiheadAttention. Default: mstype.float32.
- """
- def __init__(self,
- batch_size,
- hidden_size,
- num_attention_heads=16,
- attention_probs_dropout_prob=0.1,
- use_one_hot_embeddings=False,
- initializer_range=0.02,
- hidden_dropout_prob=0.1,
- has_attention_mask=True,
- is_encdec_att=False,
- compute_type=mstype.float32):
- super(SelfAttention, self).__init__()
- if hidden_size % num_attention_heads != 0:
- raise ValueError("The hidden size (%d) is not a multiple of the number "
- "of attention heads (%d)" % (hidden_size, num_attention_heads))
- self.size_per_head = int(hidden_size / num_attention_heads)
- self.is_encdec_att = is_encdec_att
-
- self.attention = MultiheadAttention(
- batch_size=batch_size,
- from_tensor_width=hidden_size,
- to_tensor_width=hidden_size,
- out_tensor_width=hidden_size,
- num_attention_heads=num_attention_heads,
- size_per_head=self.size_per_head,
- attention_probs_dropout_prob=attention_probs_dropout_prob,
- use_one_hot_embeddings=use_one_hot_embeddings,
- initializer_range=initializer_range,
- has_attention_mask=has_attention_mask,
- do_return_2d_tensor=True,
- compute_type=compute_type)
-
- self.preprocess = LayerPreprocess(in_channels=hidden_size)
- self.postprocess = LayerPostprocess(dropout_prob=hidden_dropout_prob)
-
- self.reshape = P.Reshape()
-        self.shape = (-1, hidden_size)
-
-    def construct(self, input_tensor, memory_tensor, attention_mask, seq_length, enc_seq_length):
- """Apply self-attention."""
- input_tensor = self.reshape(input_tensor, self.shape)
- memory_tensor = self.reshape(memory_tensor, self.shape)
-
- output = self.preprocess(input_tensor)
-
- if not self.is_encdec_att:
- memory_tensor = output
-
- attention_output = self.attention(output, memory_tensor, seq_length, enc_seq_length, attention_mask)
- output = self.postprocess(attention_output, input_tensor)
- return output
-
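-# Sketch of the data flow implemented by SelfAttention above (pre-norm residual block,
-# stated informally; LayerPreprocess/LayerPostprocess are defined earlier in this file):
-#
-#   normed = LayerPreprocess(x)                      # normalization of the block input
-#   memory = normed if not is_encdec_att else enc    # self- vs. encoder-decoder attention
-#   out    = x + Dropout(MultiheadAttention(normed, memory, ...))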
-
-class FeedForward(nn.Cell):
- """
-    Apply a two-layer feed-forward network.
-
- Args:
- in_channels (int): Size of the input layer.
- hidden_size (int): Size of the hidden layer.
-        out_channels (int): Size of the output layer.
-        hidden_act (str): Name of the activation function. Default: "relu".
- initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02.
- hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1.
- compute_type (:class:`mindspore.dtype`): Compute type in FeedForward. Default: mstype.float32.
- """
- def __init__(self,
- in_channels,
- hidden_size,
- out_channels,
- hidden_act="relu",
- initializer_range=0.02,
- hidden_dropout_prob=0.1,
- compute_type=mstype.float32):
- super(FeedForward, self).__init__()
-
- self.conv1 = nn.Dense(in_channels,
- hidden_size,
- activation=hidden_act,
- weight_init=weight_variable([hidden_size, in_channels])).to_float(compute_type)
- self.conv2 = nn.Dense(hidden_size,
- out_channels,
- weight_init=weight_variable([out_channels, hidden_size])).to_float(compute_type)
-
- self.preprocess = LayerPreprocess(in_channels=in_channels)
- self.postprocess = LayerPostprocess(dropout_prob=hidden_dropout_prob)
-
- self.reshape = P.Reshape()
- self.shape = (-1, in_channels)
- self.dropout = nn.Dropout(1 - hidden_dropout_prob)
- self.use_dropout = hidden_dropout_prob > 0
-
- def construct(self, input_tensor):
- input_tensor = self.reshape(input_tensor, self.shape)
- output = self.preprocess(input_tensor)
- output = self.conv1(output)
- if self.use_dropout:
- output = self.dropout(output)
- output = self.conv2(output)
- output = self.postprocess(output, input_tensor)
- return output
-
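-# Illustrative sketch (sizes assumed): FeedForward expands and contracts the hidden
-# dimension around a residual connection, e.g.
-#
-#   ffn = FeedForward(in_channels=1024, hidden_size=4096, out_channels=1024)
-#   y = ffn(x)   # [batch*seq, 1024] -> conv1/relu -> [batch*seq, 4096] -> conv2 -> [batch*seq, 1024]
-#
-# preprocess normalizes the input first; dropout is applied after conv1 when
-# hidden_dropout_prob > 0, and postprocess adds dropout plus the residual input.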
-
-class EncoderCell(nn.Cell):
- """
- Encoder cells used in Transformer.
-
- Args:
- batch_size (int): Batch size of input dataset.
- hidden_size (int): Size of the encoder layers. Default: 1024.
-        num_attention_heads (int): Number of attention heads. Default: 16.
-        intermediate_size (int): Size of intermediate layer. Default: 4096.
-        attention_probs_dropout_prob (float): The dropout probability for
-                                      SelfAttention. Default: 0.1.
-        use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False.
-        initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02.
- hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1.
- hidden_act (str): Activation function. Default: "relu".
- compute_type (:class:`mindspore.dtype`): Compute type in attention. Default: mstype.float32.
- """
- def __init__(self,
- batch_size,
- hidden_size=1024,
- num_attention_heads=16,
- intermediate_size=4096,
- attention_probs_dropout_prob=0.1,
- use_one_hot_embeddings=False,
- initializer_range=0.02,
- hidden_dropout_prob=0.1,
- hidden_act="relu",
- compute_type=mstype.float32):
- super(EncoderCell, self).__init__()
- self.attention = SelfAttention(
- batch_size=batch_size,
- hidden_size=hidden_size,
- num_attention_heads=num_attention_heads,
- attention_probs_dropout_prob=attention_probs_dropout_prob,
- use_one_hot_embeddings=use_one_hot_embeddings,
- initializer_range=initializer_range,
- hidden_dropout_prob=hidden_dropout_prob,
- is_encdec_att=False,
- compute_type=compute_type)
- self.feedforward = FeedForward(
- in_channels=hidden_size,
- hidden_size=intermediate_size,
- out_channels=hidden_size,
- hidden_act=hidden_act,
- initializer_range=initializer_range,
- hidden_dropout_prob=hidden_dropout_prob,
- compute_type=compute_type)
-
- def construct(self, hidden_states, attention_mask, seq_length):
- # self-attention with ln, res
- attention_output = self.attention(hidden_states, hidden_states, attention_mask, seq_length, seq_length)
- # feed forward with ln, res
- output = self.feedforward(attention_output)
- return output
-
-
-class TransformerEncoder(nn.Cell):
- """
- Multi-layer transformer encoder.
-
- Args:
- batch_size (int): Batch size of input dataset.
- hidden_size (int): Size of the encoder layers.
- num_hidden_layers (int): Number of hidden layers in encoder cells.
- num_attention_heads (int): Number of attention heads in encoder cells. Default: 16.
- intermediate_size (int): Size of intermediate layer in encoder cells. Default: 4096.
- attention_probs_dropout_prob (float): The dropout probability for
- SelfAttention. Default: 0.1.
- use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False.
- initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02.
-        hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1.
-        hidden_act (str): Activation function used in the encoder cells. Default: "relu".
- compute_type (:class:`mindspore.dtype`): Compute type. Default: mstype.float32.
- """
- def __init__(self,
- batch_size,
- hidden_size,
- num_hidden_layers,
- num_attention_heads=16,
- intermediate_size=4096,
- attention_probs_dropout_prob=0.1,
- use_one_hot_embeddings=False,
- initializer_range=0.02,
- hidden_dropout_prob=0.1,
- hidden_act="relu",
- compute_type=mstype.float32):
- super(TransformerEncoder, self).__init__()
- self.num_hidden_layers = num_hidden_layers
- self.batch_size = batch_size
- self.hidden_size = hidden_size
-
- layers = []
- for _ in range(num_hidden_layers):
- layer = EncoderCell(batch_size=batch_size,
- hidden_size=hidden_size,
- num_attention_heads=num_attention_heads,
- intermediate_size=intermediate_size,
- attention_probs_dropout_prob=attention_probs_dropout_prob,
- use_one_hot_embeddings=use_one_hot_embeddings,
- initializer_range=initializer_range,
- hidden_dropout_prob=hidden_dropout_prob,
- hidden_act=hidden_act,
- compute_type=compute_type)
- layers.append(layer)
- self.layers = nn.CellList(layers)
-
- self.layer_preprocess = LayerPreprocess(in_channels=hidden_size)
-
- self.reshape = P.Reshape()
- self.shape = (-1, hidden_size)
-
- def construct(self, input_tensor, attention_mask, seq_length):
- """Apply encoder."""
- out_shape = (self.batch_size, seq_length, self.hidden_size)
- prev_output = self.reshape(input_tensor, self.shape)
-
- for layer_module in self.layers:
- layer_output = layer_module(prev_output, attention_mask, seq_length)
- prev_output = layer_output
-
- prev_output = self.layer_preprocess(prev_output)
- output = self.reshape(prev_output, out_shape)
- return output
-
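-# Illustrative usage sketch (layer count and sizes assumed):
-#
-#   encoder = TransformerEncoder(batch_size=96, hidden_size=1024, num_hidden_layers=6,
-#                                num_attention_heads=16, intermediate_size=4096)
-#   # input_tensor: [batch, seq, hidden] embeddings, attention_mask: [batch, seq, seq]
-#   enc_states = encoder(input_tensor, attention_mask, seq_length)   # [batch, seq, hidden]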
-
-class DecoderCell(nn.Cell):
- """
-    Decoder cells used in Transformer.
-
- Args:
- batch_size (int): Batch size of input dataset.
- hidden_size (int): Size of the Transformer decoder layers. Default: 1024.
- num_attention_heads (int): Number of attention heads. Default: 12.
- intermediate_size (int): Size of intermediate layer. Default: 4096.
- attention_probs_dropout_prob (float): The dropout probability for
- SelfAttention. Default: 0.02.
- use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False.
- initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02.
- hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1.
- hidden_act (str): Activation function. Default: "relu".
- compute_type (:class:`mindspore.dtype`): Compute type in attention. Default: mstype.float32.
- """
- def __init__(self,
- batch_size,
- hidden_size=1024,
- num_attention_heads=12,
- intermediate_size=4096,
- attention_probs_dropout_prob=0.02,
- use_one_hot_embeddings=False,
- initializer_range=0.02,
- hidden_dropout_prob=0.1,
- hidden_act="relu",
- compute_type=mstype.float32):
- super(DecoderCell, self).__init__()
- self.self_attention = SelfAttention(
- batch_size=batch_size,
- hidden_size=hidden_size,
- num_attention_heads=num_attention_heads,
- attention_probs_dropout_prob=attention_probs_dropout_prob,
- use_one_hot_embeddings=use_one_hot_embeddings,
- initializer_range=initializer_range,
- is_encdec_att=False,
- hidden_dropout_prob=hidden_dropout_prob,
- compute_type=compute_type)
- self.cross_attention = SelfAttention(
- batch_size=batch_size,
- hidden_size=hidden_size,
- num_attention_heads=num_attention_heads,
- attention_probs_dropout_prob=attention_probs_dropout_prob,
- use_one_hot_embeddings=use_one_hot_embeddings,
- initializer_range=initializer_range,
- is_encdec_att=True,
- hidden_dropout_prob=hidden_dropout_prob,
- compute_type=compute_type)
- self.feedforward = FeedForward(
- in_channels=hidden_size,
- hidden_size=intermediate_size,
- out_channels=hidden_size,
- hidden_act=hidden_act,
- initializer_range=initializer_range,
- hidden_dropout_prob=hidden_dropout_prob,
- compute_type=compute_type)
-
- def construct(self, hidden_states, attention_mask, enc_states, enc_attention_mask, seq_length, enc_seq_length):
- # self-attention with ln, res
- attention_output = self.self_attention(hidden_states, hidden_states, attention_mask, seq_length, seq_length)
- # cross-attention with ln, res
- attention_output = self.cross_attention(attention_output, enc_states, enc_attention_mask,
- seq_length, enc_seq_length)
- # feed forward with ln, res
- output = self.feedforward(attention_output)
- return output
-
-
-class TransformerDecoder(nn.Cell):
- """
- Multi-layer transformer decoder.
-
- Args:
- batch_size (int): Batch size of input dataset.
- hidden_size (int): Size of the encoder layers.
-        num_hidden_layers (int): Number of hidden layers in decoder cells.
-        num_attention_heads (int): Number of attention heads in decoder cells. Default: 16.
-        intermediate_size (int): Size of intermediate layer in decoder cells. Default: 4096.
- attention_probs_dropout_prob (float): The dropout probability for
- SelfAttention. Default: 0.1.
- use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False.
- initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02.
- hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1.
-        hidden_act (str): Activation function used in the decoder cells. Default: "relu".
- compute_type (:class:`mindspore.dtype`): Compute type. Default: mstype.float32.
- """
- def __init__(self,
- batch_size,
- hidden_size,
- num_hidden_layers,
- num_attention_heads=16,
- intermediate_size=4096,
- attention_probs_dropout_prob=0.1,
- use_one_hot_embeddings=False,
- initializer_range=0.02,
- hidden_dropout_prob=0.1,
- hidden_act="relu",
- compute_type=mstype.float32):
- super(TransformerDecoder, self).__init__()
- self.num_hidden_layers = num_hidden_layers
-
- layers = []
- for _ in range(num_hidden_layers):
- layer = DecoderCell(batch_size=batch_size,
- hidden_size=hidden_size,
- num_attention_heads=num_attention_heads,
- intermediate_size=intermediate_size,
- attention_probs_dropout_prob=attention_probs_dropout_prob,
- use_one_hot_embeddings=use_one_hot_embeddings,
- initializer_range=initializer_range,
- hidden_dropout_prob=hidden_dropout_prob,
- hidden_act=hidden_act,
- compute_type=compute_type)
- layers.append(layer)
- self.layers = nn.CellList(layers)
-
- self.layer_preprocess = LayerPreprocess(in_channels=hidden_size)
-
- self.reshape = P.Reshape()
- self.shape = (-1, hidden_size)
- self.hidden_size = hidden_size
- self.batch_size = batch_size
-
- def construct(self, input_tensor, attention_mask, enc_states, enc_attention_mask, seq_length, enc_seq_length):
- """Apply decoder."""
- out_shape = (self.batch_size, seq_length, self.hidden_size)
- prev_output = self.reshape(input_tensor, self.shape)
-
- for layer_module in self.layers:
- layer_output = layer_module(prev_output, attention_mask, enc_states, enc_attention_mask,
- seq_length, enc_seq_length)
- prev_output = layer_output
-
- prev_output = self.layer_preprocess(prev_output)
- output = self.reshape(prev_output, out_shape)
- return output
-
-
-class CreateAttentionMaskFromInputMask(nn.Cell):
- """
- Create attention mask according to input mask.
- """
- def __init__(self):
- super(CreateAttentionMaskFromInputMask, self).__init__()
- self.cast = P.Cast()
- self.reshape = P.Reshape()
- self.shape = P.Shape()
- self.batch_matmul = P.BatchMatMul()
-
- def construct(self, input_mask):
- """Create attention mask according to input mask."""
- input_shape = self.shape(input_mask)
- shape_right = (input_shape[0], 1, input_shape[1])
- shape_left = input_shape + (1,)
-
- input_mask = self.cast(input_mask, mstype.float32)
- mask_left = self.reshape(input_mask, shape_left)
- mask_right = self.reshape(input_mask, shape_right)
- attention_mask = self.batch_matmul(mask_left, mask_right)
-
- return attention_mask
-
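-# Sketch of what the cell above computes, in plain numpy (values assumed): the
-# [batch, seq, seq] mask is the per-sample outer product of the input mask with itself.
-#
-#   import numpy as np
-#   input_mask = np.array([[1, 1, 1, 0]], dtype=np.float32)           # [batch=1, seq=4]
-#   attention_mask = input_mask[:, :, None] * input_mask[:, None, :]  # [1, 4, 4]
-#
-# Positions involving padding end up as 0 and are later pushed to -10000 inside
-# MultiheadAttention before the softmax.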
-
-class PredLogProbs(nn.Cell):
- """
- Get log probs.
-
- Args:
- batch_size (int): Batch size.
- width (int): Hidden size.
- compute_type (:class:`mindspore.dtype`): Compute type. Default: mstype.float32.
- dtype (:class:`mindspore.dtype`): Compute type to compute log_softmax. Default: mstype.float32.
- """
- def __init__(self,
- batch_size,
- width,
- compute_type=mstype.float32,
- dtype=mstype.float32):
- super(PredLogProbs, self).__init__()
- self.batch_size = batch_size
- self.width = width
- self.compute_type = compute_type
- self.dtype = dtype
-
- self.reshape = P.Reshape()
- self.matmul = P.MatMul(transpose_b=True)
- self.log_softmax = nn.LogSoftmax(axis=-1)
- self.cast = P.Cast()
-
- def construct(self,
- input_tensor,
- output_weights,
- seq_length):
- """Get log probs."""
- shape_flat_sequence_tensor = (self.batch_size * seq_length, self.width)
-
- input_tensor = self.reshape(input_tensor, shape_flat_sequence_tensor)
- input_tensor = self.cast(input_tensor, self.compute_type)
- output_weights = self.cast(output_weights, self.compute_type)
-
- logits = self.matmul(input_tensor, output_weights)
- logits = self.cast(logits, self.dtype)
-
- log_probs = self.log_softmax(logits)
- return log_probs
-
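-# Sketch of the projection above: decoder states are multiplied by the transposed
-# embedding table (weight tying) and normalized with a log-softmax, roughly
-#
-#   flat      = hidden.reshape(batch_size * seq_length, width)
-#   logits    = flat @ embedding_table.T        # [batch*seq, vocab_size]
-#   log_probs = log_softmax(logits, axis=-1)
-#
-# The table is passed in as output_weights by TransformerModel.construct below.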
-
-class TransformerDecoderStep(nn.Cell):
- """
- Multi-layer transformer decoder step.
-
- Args:
- batch_size (int): Batch size of input dataset.
- hidden_size (int): Size of the encoder layers.
- max_decode_length (int): Max decode length.
-        num_hidden_layers (int): Number of hidden layers in decoder cells.
-        num_attention_heads (int): Number of attention heads in decoder cells. Default: 16.
-        intermediate_size (int): Size of intermediate layer in decoder cells. Default: 4096.
-        attention_probs_dropout_prob (float): The dropout probability for
-                                      SelfAttention. Default: 0.3.
-        use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False.
-        initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02.
-        hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.3.
-        hidden_act (str): Activation function used in the decoder cells. Default: "relu".
- compute_type (:class:`mindspore.dtype`): Compute type. Default: mstype.float32.
- embedding_lookup (:class:`EmbeddingLookup`): Embedding lookup module.
-        embedding_processor (:class:`EmbeddingPostprocessor`): Embedding postprocessor module.
-        projection (:class:`PredLogProbs`): PredLogProbs module.
- """
- def __init__(self,
- batch_size,
- hidden_size,
- max_decode_length,
- num_hidden_layers,
- num_attention_heads=16,
- intermediate_size=4096,
- attention_probs_dropout_prob=0.3,
- use_one_hot_embeddings=False,
- initializer_range=0.02,
- hidden_dropout_prob=0.3,
- hidden_act="relu",
- compute_type=mstype.float32,
- embedding_lookup=None,
- embedding_processor=None,
- projection=None):
- super(TransformerDecoderStep, self).__init__(auto_prefix=False)
- self.num_hidden_layers = num_hidden_layers
-
- self.tfm_embedding_lookup = embedding_lookup
- self.tfm_embedding_processor = embedding_processor
- self.projection = projection
-
- self.tfm_decoder = TransformerDecoder(
- batch_size=batch_size,
- hidden_size=hidden_size,
- num_attention_heads=num_attention_heads,
- num_hidden_layers=num_hidden_layers,
- intermediate_size=intermediate_size,
- attention_probs_dropout_prob=attention_probs_dropout_prob,
- use_one_hot_embeddings=use_one_hot_embeddings,
- initializer_range=initializer_range,
- hidden_dropout_prob=hidden_dropout_prob,
- hidden_act=hidden_act,
- compute_type=compute_type)
-
- self.ones_like = P.OnesLike()
- self.shape = P.Shape()
-
- self._create_attention_mask_from_input_mask = CreateAttentionMaskFromInputMask()
- self.expand = P.ExpandDims()
- self.multiply = P.Mul()
-
- ones = np.ones(shape=(max_decode_length, max_decode_length))
- self.future_mask = Tensor(np.tril(ones), dtype=mstype.float32)
-
- self.cast_compute_type = CastWrapper(dst_type=compute_type)
-
- def construct(self, input_ids, enc_states, enc_attention_mask, seq_length):
- """
- Multi-layer transformer decoder step.
- input_ids: [batch_size * beam_width]
- """
- # process embedding
- input_embedding, embedding_tables = self.tfm_embedding_lookup(input_ids)
- input_embedding = self.tfm_embedding_processor(input_embedding)
- input_embedding = self.cast_compute_type(input_embedding)
-
- input_shape = self.shape(input_ids)
- input_len = input_shape[1]
- future_mask = self.future_mask[0:input_len:1, 0:input_len:1]
-
- input_mask = self.ones_like(input_ids)
- input_mask = self._create_attention_mask_from_input_mask(input_mask)
- input_mask = self.multiply(input_mask, self.expand(future_mask, 0))
- input_mask = self.cast_compute_type(input_mask)
-
- enc_attention_mask = enc_attention_mask[::, 0:input_len:1, ::]
-
- # call TransformerDecoder
- decoder_output = self.tfm_decoder(input_embedding, input_mask, enc_states, enc_attention_mask, -1, seq_length)
-
- # take the last step
- decoder_output = decoder_output[::, input_len-1:input_len:1, ::]
-
- # projection and log_prob
- log_probs = self.projection(decoder_output, embedding_tables, 1)
-
- return log_probs
-
-
-@constexpr
-def convert_np_to_tensor_encoder(seq_length):
- ones = np.ones(shape=(seq_length, seq_length))
- return Tensor(np.tril(ones), dtype=mstype.float32)
-
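-# Example: the future mask is lower-triangular, e.g. convert_np_to_tensor_encoder(4)
-# wraps
-#
-#   [[1, 0, 0, 0],
-#    [1, 1, 0, 0],
-#    [1, 1, 1, 0],
-#    [1, 1, 1, 1]]
-#
-# so that, multiplied into the target attention mask, each position can only attend
-# to itself and earlier positions during training.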
-
-class TransformerModel(nn.Cell):
- """
- Transformer with encoder and decoder.
-
- Args:
- config (Class): Configuration for Transformer.
- is_training (bool): True for training mode. False for eval mode.
- use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False.
- """
- def __init__(self,
- config,
- is_training,
- use_one_hot_embeddings=False):
- super(TransformerModel, self).__init__()
- config = copy.deepcopy(config)
- self.is_training = is_training
- if not is_training:
- config.hidden_dropout_prob = 0.0
- config.attention_probs_dropout_prob = 0.0
-
- self.batch_size = config.batch_size
- self.hidden_size = config.hidden_size
- self.num_hidden_layers = config.num_hidden_layers
- self.embedding_size = config.hidden_size
-
- self.last_idx = self.num_hidden_layers - 1
- self.beam_width = config.beam_width
- self.max_decode_length = config.max_decode_length
-
- self.tfm_embedding_lookup = EmbeddingLookup(
- vocab_size=config.vocab_size,
- embedding_size=self.embedding_size,
- use_one_hot_embeddings=use_one_hot_embeddings,
- initializer_range=config.initializer_range)
- self.tfm_embedding_postprocessor_for_encoder = EmbeddingPostprocessor(
- embedding_size=self.embedding_size,
- use_one_hot_embeddings=use_one_hot_embeddings,
- initializer_range=0.02,
- max_position_embeddings=config.max_position_embeddings,
- dropout_prob=config.hidden_dropout_prob)
- self.tfm_embedding_postprocessor_for_decoder = EmbeddingPostprocessor(
- embedding_size=self.embedding_size,
- use_one_hot_embeddings=use_one_hot_embeddings,
- initializer_range=0.02,
- max_position_embeddings=config.max_position_embeddings,
- dropout_prob=config.hidden_dropout_prob)
- self.tfm_encoder = TransformerEncoder(
- batch_size=self.batch_size,
- hidden_size=self.hidden_size,
- num_attention_heads=config.num_attention_heads,
- num_hidden_layers=self.num_hidden_layers,
- intermediate_size=config.intermediate_size,
- attention_probs_dropout_prob=config.attention_probs_dropout_prob,
- use_one_hot_embeddings=use_one_hot_embeddings,
- initializer_range=config.initializer_range,
- hidden_dropout_prob=config.hidden_dropout_prob,
- hidden_act=config.hidden_act,
- compute_type=config.compute_type)
-
- if is_training:
- self.projection = PredLogProbs(
- batch_size=self.batch_size,
- width=self.hidden_size,
- compute_type=config.compute_type,
- dtype=config.dtype)
- self.tfm_decoder = TransformerDecoder(
- batch_size=self.batch_size,
- hidden_size=self.hidden_size,
- num_attention_heads=config.num_attention_heads,
- num_hidden_layers=self.num_hidden_layers,
- intermediate_size=config.intermediate_size,
- attention_probs_dropout_prob=config.attention_probs_dropout_prob,
- use_one_hot_embeddings=use_one_hot_embeddings,
- initializer_range=config.initializer_range,
- hidden_dropout_prob=config.hidden_dropout_prob,
- hidden_act=config.hidden_act,
- compute_type=config.compute_type)
- else:
- self.projection = PredLogProbs(
- batch_size=self.batch_size * config.beam_width,
- width=self.hidden_size,
- compute_type=config.compute_type,
- dtype=config.dtype)
- self.tfm_decoder = TransformerDecoderStep(
- batch_size=self.batch_size * config.beam_width,
- hidden_size=self.hidden_size,
- max_decode_length=config.max_decode_length,
- num_hidden_layers=config.num_hidden_layers,
- num_attention_heads=config.num_attention_heads,
- intermediate_size=config.intermediate_size,
- attention_probs_dropout_prob=config.attention_probs_dropout_prob,
- use_one_hot_embeddings=False,
- initializer_range=config.initializer_range,
- hidden_dropout_prob=config.hidden_dropout_prob,
- hidden_act=config.hidden_act,
- compute_type=config.compute_type,
- embedding_lookup=self.tfm_embedding_lookup,
- embedding_processor=self.tfm_embedding_postprocessor_for_decoder,
- projection=self.projection)
- self.tfm_decoder = BeamSearchDecoder(
- batch_size=config.batch_size,
- seq_length=config.seq_length,
- vocab_size=config.vocab_size,
- decoder=self.tfm_decoder,
- beam_width=config.beam_width,
- length_penalty_weight=config.length_penalty_weight,
- max_decode_length=config.max_decode_length)
-
- self.tfm_decoder.add_flags(loop_can_unroll=True)
- self.tile_beam = TileBeam(beam_width=self.beam_width)
- ones = np.ones(shape=(self.batch_size, self.max_decode_length))
- self.encdec_mask = Tensor(ones, mstype.float32)
-
- self.cast = P.Cast()
- self.dtype = config.dtype
- self.cast_compute_type = CastWrapper(dst_type=config.compute_type)
- self.expand = P.ExpandDims()
- self.multiply = P.Mul()
- self.shape = P.Shape()
-
- self._create_attention_mask_from_input_mask = CreateAttentionMaskFromInputMask()
-
- def construct(self, source_ids, source_mask, target_ids=None, target_mask=None):
- """Transformer with encoder and decoder."""
- seq_length = self.shape(source_ids)[1]
-
- # process source sentence
- src_word_embeddings, embedding_tables = self.tfm_embedding_lookup(source_ids)
- src_embedding_output = self.tfm_embedding_postprocessor_for_encoder(src_word_embeddings)
- # attention mask [batch_size, seq_length, seq_length]
- enc_attention_mask = self._create_attention_mask_from_input_mask(source_mask)
- # transformer encoder
- encoder_output = self.tfm_encoder(self.cast_compute_type(src_embedding_output),
- self.cast_compute_type(enc_attention_mask),
- seq_length)
-
- if self.is_training:
- future_mask = convert_np_to_tensor_encoder(seq_length)
- # process target sentence
- tgt_word_embeddings, _ = self.tfm_embedding_lookup(target_ids)
- tgt_embedding_output = self.tfm_embedding_postprocessor_for_decoder(tgt_word_embeddings)
- # attention mask [batch_size, seq_length, seq_length]
- tgt_attention_mask = self._create_attention_mask_from_input_mask(target_mask)
- tgt_attention_mask = self.multiply(tgt_attention_mask, self.expand(future_mask, 0))
- # transformer decoder
- decoder_output = self.tfm_decoder(self.cast_compute_type(tgt_embedding_output),
- self.cast_compute_type(tgt_attention_mask),
- encoder_output, enc_attention_mask,
- seq_length, seq_length)
- # calculate logits and log_probs
- log_probs = self.projection(decoder_output, embedding_tables, seq_length)
- ret = log_probs
- else:
- beam_encoder_output = self.tile_beam(encoder_output)
-
- enc_attention_mask = self.multiply(enc_attention_mask[::, 0:1:1, ::], self.expand(self.encdec_mask, -1))
-
- beam_enc_attention_mask = self.tile_beam(enc_attention_mask)
- beam_enc_attention_mask = self.cast_compute_type(beam_enc_attention_mask)
- predicted_ids = self.tfm_decoder(beam_encoder_output, beam_enc_attention_mask)
- ret = predicted_ids
- return ret
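-
-
-# Illustrative usage sketch (config object assumed to be a populated TransformerConfig):
-#
-#   model = TransformerModel(config, is_training=True)
-#   log_probs = model(source_ids, source_mask, target_ids, target_mask)   # training
-#
-#   model = TransformerModel(config, is_training=False)
-#   predicted_ids = model(source_ids, source_mask)                        # beam search decoding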
diff --git a/tests/st/model_zoo_tests/transformer/src/weight_init.py b/tests/st/model_zoo_tests/transformer/src/weight_init.py
deleted file mode 100644
index 460a1c67c43..00000000000
--- a/tests/st/model_zoo_tests/transformer/src/weight_init.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-"""Weight init utilities."""
-
-import math
-import numpy as np
-from mindspore.common.tensor import Tensor
-
-def _average_units(shape):
- """
- Average shape dim.
- """
- if not shape:
- return 1.
- if len(shape) == 1:
- return float(shape[0])
- if len(shape) == 2:
- return float(shape[0] + shape[1]) / 2.
-    raise RuntimeError("Unsupported shape: only 1-D and 2-D shapes are supported.")
-
-def weight_variable(shape):
- scale_shape = shape
- avg_units = _average_units(scale_shape)
- scale = 1.0 / max(1., avg_units)
- limit = math.sqrt(3.0 * scale)
- values = np.random.uniform(-limit, limit, shape).astype(np.float32)
- return Tensor(values)
-
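-# Example of the scale used by weight_variable (Xavier/Glorot-style fan average):
-# for shape (1024, 1024), avg_units = 1024, scale = 1/1024, and
-# limit = sqrt(3/1024) ~= 0.054, so values are drawn from U(-0.054, 0.054).
-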
-def one_weight(shape):
- ones = np.ones(shape).astype(np.float32)
- return Tensor(ones)
-
-def zero_weight(shape):
- zeros = np.zeros(shape).astype(np.float32)
- return Tensor(zeros)
-
-def normal_weight(shape, num_units):
- norm = np.random.normal(0.0, num_units**-0.5, shape).astype(np.float32)
- return Tensor(norm)
-
\ No newline at end of file
diff --git a/tests/st/model_zoo_tests/transformer/test_transformer.py b/tests/st/model_zoo_tests/transformer/test_transformer.py
index 7ee8769944a..8ace3c49c2d 100644
--- a/tests/st/model_zoo_tests/transformer/test_transformer.py
+++ b/tests/st/model_zoo_tests/transformer/test_transformer.py
@@ -27,14 +27,30 @@ from mindspore.train.callback import Callback
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as deC
from mindspore import context
-from src.transformer_model import TransformerConfig
-from src.transformer_for_train import TransformerNetworkWithLoss, TransformerTrainOneStepWithLossScaleCell
-from src.config import cfg, transformer_net_cfg
-from src.lr_schedule import create_dynamic_lr
+from easydict import EasyDict as edict
+from model_zoo.official.nlp.transformer.src.transformer_model import TransformerConfig
+from model_zoo.official.nlp.transformer.src.transformer_for_train import TransformerNetworkWithLoss, TransformerTrainOneStepWithLossScaleCell
+from model_zoo.official.nlp.transformer.src.lr_schedule import create_dynamic_lr
from tests.st.model_zoo_tests import utils
+
DATA_DIR = ["/home/workspace/mindspore_dataset/transformer/test-mindrecord"]
+cfg = edict({
+ 'transformer_network': 'large',
+ 'init_loss_scale_value': 1024,
+ 'scale_factor': 2,
+ 'scale_window': 2000,
+ 'optimizer': 'Adam',
+ 'optimizer_adam_beta2': 0.997,
+ 'lr_schedule': edict({
+ 'learning_rate': 2.0,
+ 'warmup_steps': 8000,
+ 'start_decay_step': 16000,
+ 'min_lr': 0.0,
+ }),
+})
+
def get_config(version='base', batch_size=1):
"""get config"""
@@ -129,7 +145,7 @@ class TimeMonitor(Callback):
self.per_step_mseconds_list.append(epoch_mseconds / self.data_size)
-@pytest.mark.level2
+@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@@ -144,7 +160,7 @@ def test_transformer():
batch_size = 96
epoch_size = 3
config = get_config(version=version, batch_size=batch_size)
- dataset = load_test_data(batch_size=transformer_net_cfg.batch_size, data_file=DATA_DIR)
+ dataset = load_test_data(batch_size=config.batch_size, data_file=DATA_DIR)
netwithloss = TransformerNetworkWithLoss(config, True)
@@ -171,7 +187,7 @@ def test_transformer():
# assertion occurs while the loss value, overflow state or loss_scale value is wrong
loss_value = np.array(callback.loss_list)
- assert np.allclose(loss_value[0], 11.241606, 0, 0.000005)
+ assert np.allclose(loss_value[0], 11.241601, 0, 0.000005)
expect_loss_value = [11.241606, 11.243232, 11.217459, 11.204157, 11.213804,
11.215373, 11.190564, 11.150393, 11.191823, 11.160045]
@@ -201,7 +217,7 @@ def test_transformer():
assert per_step_mseconds <= expect_per_step_mseconds + 10
-@pytest.mark.level1
+@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard