From 8c88411ce65a05fa5e279e9a5b60f7a138594ac8 Mon Sep 17 00:00:00 2001 From: huchunmei Date: Wed, 21 Jul 2021 14:03:01 +0800 Subject: [PATCH] clould --- .../model_zoo_tests/transformer/__init__.py | 0 .../transformer/src/__init__.py | 0 .../transformer/src/beam_search.py | 281 ---- .../model_zoo_tests/transformer/src/config.py | 86 -- .../transformer/src/dataset.py | 58 - .../transformer/src/eval_config.py | 67 - .../transformer/src/lr_schedule.py | 52 - .../transformer/src/process_output.py | 47 - .../transformer/src/tokenization.py | 158 --- .../transformer/src/transformer_for_train.py | 472 ------- .../transformer/src/transformer_model.py | 1153 ----------------- .../transformer/src/weight_init.py | 52 - .../transformer/test_transformer.py | 32 +- 13 files changed, 24 insertions(+), 2434 deletions(-) delete mode 100644 tests/st/model_zoo_tests/transformer/__init__.py delete mode 100644 tests/st/model_zoo_tests/transformer/src/__init__.py delete mode 100644 tests/st/model_zoo_tests/transformer/src/beam_search.py delete mode 100644 tests/st/model_zoo_tests/transformer/src/config.py delete mode 100644 tests/st/model_zoo_tests/transformer/src/dataset.py delete mode 100644 tests/st/model_zoo_tests/transformer/src/eval_config.py delete mode 100644 tests/st/model_zoo_tests/transformer/src/lr_schedule.py delete mode 100644 tests/st/model_zoo_tests/transformer/src/process_output.py delete mode 100644 tests/st/model_zoo_tests/transformer/src/tokenization.py delete mode 100644 tests/st/model_zoo_tests/transformer/src/transformer_for_train.py delete mode 100644 tests/st/model_zoo_tests/transformer/src/transformer_model.py delete mode 100644 tests/st/model_zoo_tests/transformer/src/weight_init.py diff --git a/tests/st/model_zoo_tests/transformer/__init__.py b/tests/st/model_zoo_tests/transformer/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tests/st/model_zoo_tests/transformer/src/__init__.py b/tests/st/model_zoo_tests/transformer/src/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tests/st/model_zoo_tests/transformer/src/beam_search.py b/tests/st/model_zoo_tests/transformer/src/beam_search.py deleted file mode 100644 index 53c765f7223..00000000000 --- a/tests/st/model_zoo_tests/transformer/src/beam_search.py +++ /dev/null @@ -1,281 +0,0 @@ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Transformer beam search module.""" - -import numpy as np -import mindspore.common.dtype as mstype -import mindspore.nn as nn -from mindspore.ops import operations as P -from mindspore.common.tensor import Tensor - -INF = 1. * 1e9 - -class LengthPenalty(nn.Cell): - """ - Normalize scores of translations according to their length. - - Args: - weight (float): Weight of length penalty. Default: 1.0. - compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mstype.float32. 
- """ - def __init__(self, - weight=1.0, - compute_type=mstype.float32): - super(LengthPenalty, self).__init__() - self.weight = weight - self.add = P.Add() - self.pow = P.Pow() - self.div = P.RealDiv() - self.cast = P.Cast() - self.five = Tensor(5.0, mstype.float32) - self.six = Tensor(6.0, mstype.float32) - - def construct(self, length_tensor): - length_tensor = self.cast(length_tensor, mstype.float32) - output = self.add(length_tensor, self.five) - output = self.div(output, self.six) - output = self.pow(output, self.weight) - return output - - -class TileBeam(nn.Cell): - """ - TileBeam. - - Args: - beam_width (int): beam width setting. Default: 4. - compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mstype.float32. - """ - def __init__(self, - beam_width, - compute_type=mstype.float32): - super(TileBeam, self).__init__() - self.beam_width = beam_width - self.expand = P.ExpandDims() - self.tile = P.Tile() - self.reshape = P.Reshape() - self.shape = P.Shape() - - def construct(self, input_tensor): - """ - input_tensor: shape [batch, dim1, dim2] - output_tensor: shape [batch*beam, dim1, dim2] - """ - shape = self.shape(input_tensor) - input_tensor = self.expand(input_tensor, 1) - tile_shape = (1,) + (self.beam_width,) - for _ in range(len(shape)-1): - tile_shape = tile_shape + (1,) - output = self.tile(input_tensor, tile_shape) - out_shape = (shape[0]*self.beam_width,) + shape[1:] - output = self.reshape(output, out_shape) - return output - - -class Mod(nn.Cell): - """ - Mod function. - - Args: - compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mstype.float32. - """ - def __init__(self, - compute_type=mstype.float32): - super(Mod, self).__init__() - self.compute_type = compute_type - self.floor_div = P.FloorDiv() - self.sub = P.Sub() - self.multiply = P.Mul() - - def construct(self, input_x, input_y): - x = self.floor_div(input_x, input_y) - x = self.multiply(x, input_y) - x = self.sub(input_x, x) - return x - - -class BeamSearchDecoder(nn.Cell): - """ - Beam search decoder. - - Args: - batch_size (int): Batch size of input dataset. - seq_length (int): Length of input sequence. - vocab_size (int): Size of vocabulary. - decoder (:class:`TransformerDecoderStep`): Decoder module. - beam_width (int): beam width setting. Default: 4. - length_penalty_weight (float): Weight of length penalty. Default: 1.0. - max_decode_length (int): max decode length. Default: 128. - sos_id (int): Id of sequence start token. Default: 1. - eos_id (int): Id of sequence end token. Default: 2. - compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mstype.float32. 
- """ - def __init__(self, - batch_size, - seq_length, - vocab_size, - decoder, - beam_width=4, - length_penalty_weight=1.0, - max_decode_length=128, - sos_id=1, - eos_id=2, - compute_type=mstype.float32): - super(BeamSearchDecoder, self).__init__(auto_prefix=False) - self.seq_length = seq_length - self.batch_size = batch_size - self.vocab_size = vocab_size - self.beam_width = beam_width - self.length_penalty_weight = length_penalty_weight - self.max_decode_length = max_decode_length - self.decoder = decoder - - self.add = P.Add() - self.expand = P.ExpandDims() - self.reshape = P.Reshape() - self.shape_flat = (-1,) - self.shape = P.Shape() - - self.zero_tensor = Tensor(np.zeros([batch_size, beam_width]), mstype.float32) - self.ninf_tensor = Tensor(np.full([batch_size, beam_width], -INF), mstype.float32) - - self.select = P.Select() - self.flat_shape = (batch_size, beam_width * vocab_size) - self.topk = P.TopK(sorted=True) - self.floor_div = P.FloorDiv() - self.vocab_size_tensor = Tensor(self.vocab_size, mstype.int32) - self.real_div = P.RealDiv() - self.mod = Mod() - self.equal = P.Equal() - self.eos_ids = Tensor(np.full([batch_size, beam_width], eos_id), mstype.int32) - - beam_ids = np.tile(np.arange(beam_width).reshape((1, beam_width)), [batch_size, 1]) - self.beam_ids = Tensor(beam_ids, mstype.int32) - batch_ids = np.arange(batch_size*beam_width).reshape((batch_size, beam_width)) // beam_width - self.batch_ids = Tensor(batch_ids, mstype.int32) - self.concat = P.Concat(axis=-1) - self.gather_nd = P.GatherNd() - - self.greater_equal = P.GreaterEqual() - self.sub = P.Sub() - self.cast = P.Cast() - self.zeroslike = P.ZerosLike() - - # init inputs and states - self.start_ids = Tensor(np.full([batch_size * beam_width, 1], sos_id), mstype.int32) - self.init_seq = Tensor(np.full([batch_size, beam_width, 1], sos_id), mstype.int32) - init_scores = np.tile(np.array([[0.] 
+ [-INF]*(beam_width-1)]), [batch_size, 1]) - self.init_scores = Tensor(init_scores, mstype.float32) - self.init_finished = Tensor(np.zeros([batch_size, beam_width], dtype=np.bool)) - self.init_length = Tensor(np.zeros([batch_size, beam_width], dtype=np.int32)) - self.length_penalty = LengthPenalty(weight=length_penalty_weight) - self.one = Tensor(1, mstype.int32) - - def one_step(self, cur_input_ids, enc_states, enc_attention_mask, state_log_probs, - state_seq, state_finished, state_length): - """ - One step for decode - """ - log_probs = self.decoder(cur_input_ids, enc_states, enc_attention_mask, self.seq_length) - log_probs = self.reshape(log_probs, (self.batch_size, self.beam_width, self.vocab_size)) - - # select topk indices - total_log_probs = self.add(log_probs, self.expand(state_log_probs, -1)) - - # mask finished beams - mask_tensor = self.select(state_finished, self.ninf_tensor, self.zero_tensor) - total_log_probs = self.add(total_log_probs, self.expand(mask_tensor, -1)) - - # reshape scores to [batch, beam*vocab] - flat_scores = self.reshape(total_log_probs, self.flat_shape) - # select topk - topk_scores, topk_indices = self.topk(flat_scores, self.beam_width) - - temp = topk_indices - beam_indices = self.zeroslike(topk_indices) - for _ in range(self.beam_width - 1): - temp = self.sub(temp, self.vocab_size_tensor) - res = self.cast(self.greater_equal(temp, 0), mstype.int32) - beam_indices = beam_indices + res - word_indices = topk_indices - beam_indices * self.vocab_size_tensor - #====================================================================== - - # mask finished indices - beam_indices = self.select(state_finished, self.beam_ids, beam_indices) - word_indices = self.select(state_finished, self.eos_ids, word_indices) - topk_scores = self.select(state_finished, state_log_probs, topk_scores) - - ###### put finished sequences to the end - # sort according to scores with -inf for finished beams - tmp_log_probs = self.select( - self.equal(word_indices, self.eos_ids), - self.ninf_tensor, - topk_scores) - _, tmp_indices = self.topk(tmp_log_probs, self.beam_width) - # update - tmp_gather_indices = self.concat((self.expand(self.batch_ids, -1), self.expand(tmp_indices, -1))) - beam_indices = self.gather_nd(beam_indices, tmp_gather_indices) - word_indices = self.gather_nd(word_indices, tmp_gather_indices) - topk_scores = self.gather_nd(topk_scores, tmp_gather_indices) - - ###### generate new beam_search states - # gather indices for selecting alive beams - gather_indices = self.concat((self.expand(self.batch_ids, -1), self.expand(beam_indices, -1))) - - # length add 1 if not finished in the previous step - length_add = self.add(state_length, self.one) - state_length = self.select(state_finished, state_length, length_add) - state_length = self.gather_nd(state_length, gather_indices) - - # concat seq - seq = self.gather_nd(state_seq, gather_indices) - state_seq = self.concat((seq, self.expand(word_indices, -1))) - - # new finished flag and log_probs - state_finished = self.equal(word_indices, self.eos_ids) - state_log_probs = topk_scores - - ###### generate new inputs and decoder states - cur_input_ids = self.reshape(state_seq, (self.batch_size*self.beam_width, -1)) - return cur_input_ids, state_log_probs, state_seq, state_finished, state_length - - def construct(self, enc_states, enc_attention_mask): - """Get beam search result.""" - cur_input_ids = self.start_ids - # beam search states - state_log_probs = self.init_scores - state_seq = self.init_seq - state_finished = 
self.init_finished - state_length = self.init_length - - for _ in range(self.max_decode_length): - # run one step decoder to get outputs of the current step - # shape [batch*beam, 1, vocab] - cur_input_ids, state_log_probs, state_seq, state_finished, state_length = self.one_step( - cur_input_ids, enc_states, enc_attention_mask, state_log_probs, state_seq, state_finished, state_length) - - # add length penalty scores - penalty_len = self.length_penalty(state_length) - # get penalty length - log_probs = self.real_div(state_log_probs, penalty_len) - - # sort according to scores - _, top_beam_indices = self.topk(log_probs, self.beam_width) - gather_indices = self.concat((self.expand(self.batch_ids, -1), self.expand(top_beam_indices, -1))) - # sort sequence - predicted_ids = self.gather_nd(state_seq, gather_indices) - # take the first one - predicted_ids = predicted_ids[::, 0:1:1, ::] - return predicted_ids diff --git a/tests/st/model_zoo_tests/transformer/src/config.py b/tests/st/model_zoo_tests/transformer/src/config.py deleted file mode 100644 index 58d5ee5f721..00000000000 --- a/tests/st/model_zoo_tests/transformer/src/config.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Network config setting, will be used in dataset.py, train.py.""" - -from easydict import EasyDict as edict -import mindspore.common.dtype as mstype -from .transformer_model import TransformerConfig -cfg = edict({ - 'transformer_network': 'large', - 'init_loss_scale_value': 1024, - 'scale_factor': 2, - 'scale_window': 2000, - 'optimizer': 'Adam', - 'optimizer_adam_beta2': 0.997, - 'lr_schedule': edict({ - 'learning_rate': 2.0, - 'warmup_steps': 8000, - 'start_decay_step': 16000, - 'min_lr': 0.0, - }), -}) -''' -two kinds of transformer model version -''' -if cfg.transformer_network == 'large': - transformer_net_cfg = TransformerConfig( - batch_size=96, - seq_length=128, - vocab_size=36560, - hidden_size=1024, - num_hidden_layers=6, - num_attention_heads=16, - intermediate_size=4096, - hidden_act="relu", - hidden_dropout_prob=0.2, - attention_probs_dropout_prob=0.2, - max_position_embeddings=128, - initializer_range=0.02, - label_smoothing=0.1, - dtype=mstype.float32, - compute_type=mstype.float16) - transformer_net_cfg_gpu = TransformerConfig( - batch_size=32, - seq_length=128, - vocab_size=36560, - hidden_size=1024, - num_hidden_layers=6, - num_attention_heads=16, - intermediate_size=4096, - hidden_act="relu", - hidden_dropout_prob=0.2, - attention_probs_dropout_prob=0.2, - max_position_embeddings=128, - initializer_range=0.02, - label_smoothing=0.1, - dtype=mstype.float32, - compute_type=mstype.float16) -if cfg.transformer_network == 'base': - transformer_net_cfg = TransformerConfig( - batch_size=96, - seq_length=128, - vocab_size=36560, - hidden_size=512, - num_hidden_layers=6, - num_attention_heads=8, - intermediate_size=2048, - hidden_act="relu", - 
hidden_dropout_prob=0.2, - attention_probs_dropout_prob=0.2, - max_position_embeddings=128, - initializer_range=0.02, - label_smoothing=0.1, - dtype=mstype.float32, - compute_type=mstype.float16) diff --git a/tests/st/model_zoo_tests/transformer/src/dataset.py b/tests/st/model_zoo_tests/transformer/src/dataset.py deleted file mode 100644 index b485fd7ddd6..00000000000 --- a/tests/st/model_zoo_tests/transformer/src/dataset.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Data operations, will be used in train.py.""" - -import mindspore.common.dtype as mstype -import mindspore.dataset as de -import mindspore.dataset.transforms.c_transforms as deC -from .config import transformer_net_cfg, transformer_net_cfg_gpu -de.config.set_seed(1) -def create_transformer_dataset(epoch_count=1, rank_size=1, rank_id=0, do_shuffle="true", dataset_path=None, - bucket_boundaries=None, device_target="Ascend"): - """create dataset""" - def batch_per_bucket(bucket_len, dataset_path): - dataset_path = dataset_path + "_" + str(bucket_len) + "_00" - ds = de.MindDataset(dataset_path, - columns_list=["source_eos_ids", "source_eos_mask", - "target_sos_ids", "target_sos_mask", - "target_eos_ids", "target_eos_mask"], - shuffle=(do_shuffle == "true"), num_shards=rank_size, shard_id=rank_id) - type_cast_op = deC.TypeCast(mstype.int32) - ds = ds.map(operations=type_cast_op, input_columns="source_eos_ids") - ds = ds.map(operations=type_cast_op, input_columns="source_eos_mask") - ds = ds.map(operations=type_cast_op, input_columns="target_sos_ids") - ds = ds.map(operations=type_cast_op, input_columns="target_sos_mask") - ds = ds.map(operations=type_cast_op, input_columns="target_eos_ids") - ds = ds.map(operations=type_cast_op, input_columns="target_eos_mask") - - # apply batch operations - if device_target == "Ascend": - ds = ds.batch(transformer_net_cfg.batch_size, drop_remainder=True) - else: - ds = ds.batch(transformer_net_cfg_gpu.batch_size, drop_remainder=True) - - ds = ds.repeat(epoch_count) - return ds - - for i, _ in enumerate(bucket_boundaries): - bucket_len = bucket_boundaries[i] - ds_per = batch_per_bucket(bucket_len, dataset_path) - if i == 0: - ds = ds_per - else: - ds = ds + ds_per - ds = ds.shuffle(ds.get_dataset_size()) - ds.channel_name = 'transformer' - return ds diff --git a/tests/st/model_zoo_tests/transformer/src/eval_config.py b/tests/st/model_zoo_tests/transformer/src/eval_config.py deleted file mode 100644 index 512e2c489a1..00000000000 --- a/tests/st/model_zoo_tests/transformer/src/eval_config.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Network evaluation config setting, will be used in eval.py.""" - -from easydict import EasyDict as edict -import mindspore.common.dtype as mstype -from .transformer_model import TransformerConfig - -cfg = edict({ - 'transformer_network': 'large', - 'data_file': '/your/path/evaluation.mindrecord', - 'model_file': '/your/path/checkpoint_file', - 'output_file': '/your/path/output', -}) -''' -two kinds of transformer model version -''' -if cfg.transformer_network == 'large': - transformer_net_cfg = TransformerConfig( - batch_size=1, - seq_length=128, - vocab_size=36560, - hidden_size=1024, - num_hidden_layers=6, - num_attention_heads=16, - intermediate_size=4096, - hidden_act="relu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - max_position_embeddings=128, - label_smoothing=0.1, - beam_width=4, - max_decode_length=80, - length_penalty_weight=1.0, - dtype=mstype.float32, - compute_type=mstype.float16) -if cfg.transformer_network == 'base': - transformer_net_cfg = TransformerConfig( - batch_size=1, - seq_length=128, - vocab_size=36560, - hidden_size=512, - num_hidden_layers=6, - num_attention_heads=8, - intermediate_size=2048, - hidden_act="relu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - max_position_embeddings=128, - label_smoothing=0.1, - beam_width=4, - max_decode_length=80, - length_penalty_weight=1.0, - dtype=mstype.float32, - compute_type=mstype.float16) diff --git a/tests/st/model_zoo_tests/transformer/src/lr_schedule.py b/tests/st/model_zoo_tests/transformer/src/lr_schedule.py deleted file mode 100644 index c246283478a..00000000000 --- a/tests/st/model_zoo_tests/transformer/src/lr_schedule.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Learning rate utilities.""" - -def linear_warmup(warmup_steps, current_step): - return min([1.0, float(current_step)/float(warmup_steps)]) - -def rsqrt_decay(warmup_steps, current_step): - return float(max([current_step, warmup_steps])) ** -0.5 - -def rsqrt_hidden(hidden_size): - return float(hidden_size) ** -0.5 - -def create_dynamic_lr(schedule, training_steps, learning_rate, warmup_steps, hidden_size, - start_decay_step=0, min_lr=0.): - """ - Generate dynamic learning rate. 
- """ - if start_decay_step < warmup_steps: - start_decay_step = warmup_steps - lr = [] - for current_step in range(1, training_steps+1): - cur_lr = 1.0 - for name in schedule.split("*"): - if name == "constant": - cur_lr *= float(learning_rate) - elif name == "rsqrt_hidden": - cur_lr *= rsqrt_hidden(hidden_size) - elif name == "linear_warmup": - cur_lr *= linear_warmup(warmup_steps, current_step) - elif name == "rsqrt_decay": - cur_lr *= rsqrt_decay(warmup_steps, current_step-start_decay_step+warmup_steps) - else: - raise ValueError("unknown learning rate schedule") - if warmup_steps < current_step < start_decay_step: - cur_lr = lr[-1] - if current_step > warmup_steps: - cur_lr = max([cur_lr, min_lr]) - lr.append(cur_lr) - return lr diff --git a/tests/st/model_zoo_tests/transformer/src/process_output.py b/tests/st/model_zoo_tests/transformer/src/process_output.py deleted file mode 100644 index f69ea6a0d7d..00000000000 --- a/tests/st/model_zoo_tests/transformer/src/process_output.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Convert ids to tokens.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse -import sys - -import tokenization - -# Explicitly set the encoding -sys.stdin = open(sys.stdin.fileno(), mode='r', encoding='utf-8', buffering=True) -sys.stdout = open(sys.stdout.fileno(), mode='w', encoding='utf-8', buffering=True) - -def main(): - parser = argparse.ArgumentParser( - description="recore nbest with smoothed sentence-level bleu.") - parser.add_argument("--vocab_file", type=str, default="", required=True, help="vocab file path.") - args = parser.parse_args() - - tokenizer = tokenization.WhiteSpaceTokenizer(vocab_file=args.vocab_file) - - for line in sys.stdin: - token_ids = [int(x) for x in line.strip().split()] - tokens = tokenizer.convert_ids_to_tokens(token_ids) - sent = " ".join(tokens) - sent = sent.split("")[-1] - sent = sent.split("")[0] - print(sent.strip()) - -if __name__ == "__main__": - main() diff --git a/tests/st/model_zoo_tests/transformer/src/tokenization.py b/tests/st/model_zoo_tests/transformer/src/tokenization.py deleted file mode 100644 index b4121f6c365..00000000000 --- a/tests/st/model_zoo_tests/transformer/src/tokenization.py +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Tokenization utilities.""" - -import sys -import collections -import unicodedata - -def convert_to_printable(text): - """ - Converts `text` to a printable coding format. - """ - if sys.version_info[0] == 3: - if isinstance(text, str): - return text - if isinstance(text, bytes): - return text.decode("utf-8", "ignore") - raise ValueError("Only support type `str` or `bytes`, while text type is `%s`" % (type(text))) - if sys.version_info[0] == 2: - if isinstance(text, str): - return text - if isinstance(text, unicode): - return text.encode("utf-8") - raise ValueError("Only support type `str` or `unicode`, while text type is `%s`" % (type(text))) - raise ValueError("Only supported when running on Python2 or Python3.") - - -def convert_to_unicode(text): - """ - Converts `text` to Unicode format. - """ - if sys.version_info[0] == 3: - if isinstance(text, str): - return text - if isinstance(text, bytes): - return text.decode("utf-8", "ignore") - raise ValueError("Only support type `str` or `bytes`, while text type is `%s`" % (type(text))) - if sys.version_info[0] == 2: - if isinstance(text, str): - return text.decode("utf-8", "ignore") - if isinstance(text, unicode): - return text - raise ValueError("Only support type `str` or `unicode`, while text type is `%s`" % (type(text))) - raise ValueError("Only supported when running on Python2 or Python3.") - - -def load_vocab_file(vocab_file): - """ - Loads a vocabulary file and turns into a {token:id} dictionary. - """ - vocab_dict = collections.OrderedDict() - index = 0 - with open(vocab_file, "r") as vocab: - while True: - token = convert_to_unicode(vocab.readline()) - if not token: - break - token = token.strip() - vocab_dict[token] = index - index += 1 - return vocab_dict - - -def convert_by_vocab_dict(vocab_dict, items): - """ - Converts a sequence of [tokens|ids] according to the vocab dict. - """ - output = [] - for item in items: - if item in vocab_dict: - output.append(vocab_dict[item]) - else: - output.append(vocab_dict[""]) - return output - - -class WhiteSpaceTokenizer(): - """ - Whitespace tokenizer. - """ - def __init__(self, vocab_file): - self.vocab_dict = load_vocab_file(vocab_file) - self.inv_vocab_dict = {index: token for token, index in self.vocab_dict.items()} - - def _is_whitespace_char(self, char): - """ - Checks if it is a whitespace character(regard "\t", "\n", "\r" as whitespace here). - """ - if char in (" ", "\t", "\n", "\r"): - return True - uni = unicodedata.category(char) - if uni == "Zs": - return True - return False - - def _is_control_char(self, char): - """ - Checks if it is a control character. - """ - if char in ("\t", "\n", "\r"): - return False - uni = unicodedata.category(char) - if uni in ("Cc", "Cf"): - return True - return False - - def _clean_text(self, text): - """ - Remove invalid characters and cleanup whitespace. - """ - output = [] - for char in text: - cp = ord(char) - if cp == 0 or cp == 0xfffd or self._is_control_char(char): - continue - if self._is_whitespace_char(char): - output.append(" ") - else: - output.append(char) - return "".join(output) - - def _whitespace_tokenize(self, text): - """ - Clean whitespace and split text into tokens. - """ - text = text.strip() - if not text: - tokens = [] - else: - tokens = text.split() - return tokens - - def tokenize(self, text): - """ - Tokenizes text. 
- """ - text = convert_to_unicode(text) - text = self._clean_text(text) - tokens = self._whitespace_tokenize(text) - return tokens - - def convert_tokens_to_ids(self, tokens): - return convert_by_vocab_dict(self.vocab_dict, tokens) - - def convert_ids_to_tokens(self, ids): - return convert_by_vocab_dict(self.inv_vocab_dict, ids) diff --git a/tests/st/model_zoo_tests/transformer/src/transformer_for_train.py b/tests/st/model_zoo_tests/transformer/src/transformer_for_train.py deleted file mode 100644 index 153a98d621f..00000000000 --- a/tests/st/model_zoo_tests/transformer/src/transformer_for_train.py +++ /dev/null @@ -1,472 +0,0 @@ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Transformer for training.""" -import numpy as np - -from mindspore.common.initializer import initializer -import mindspore.nn as nn -from mindspore.ops import operations as P -from mindspore.ops import functional as F -from mindspore.ops import composite as C -from mindspore.common.tensor import Tensor -from mindspore.common.parameter import Parameter -from mindspore.common import dtype as mstype -from mindspore.nn.wrap.grad_reducer import DistributedGradReducer -from mindspore.communication.management import get_group_size -from mindspore.context import ParallelMode -from mindspore import context - -from .transformer_model import TransformerModel - -GRADIENT_CLIP_TYPE = 1 -GRADIENT_CLIP_VALUE = 5.0 - -clip_grad = C.MultitypeFuncGraph("clip_grad") - - -@clip_grad.register("Number", "Number", "Tensor") -def _clip_grad(clip_type, clip_value, grad): - """ - Clip gradients. - - Inputs: - clip_type (int): The way to clip, 0 for 'value', 1 for 'norm'. - clip_value (float): Specifies how much to clip. - grad (tuple[Tensor]): Gradients. - - Outputs: - tuple[Tensor], clipped gradients. - """ - if clip_type not in (0, 1): - return grad - dt = F.dtype(grad) - if clip_type == 0: - new_grad = C.clip_by_value(grad, F.cast(F.tuple_to_array((-clip_value,)), dt), - F.cast(F.tuple_to_array((clip_value,)), dt)) - else: - new_grad = nn.ClipByNorm()(grad, F.cast(F.tuple_to_array((clip_value,)), dt)) - return new_grad - - -class TransformerTrainingLoss(nn.Cell): - """ - Provide transformer training loss. - - Args: - config (TransformerConfig): The config of Transformer. - - Returns: - Tensor, total loss. 
- """ - def __init__(self, config): - super(TransformerTrainingLoss, self).__init__(auto_prefix=False) - self.vocab_size = config.vocab_size - self.onehot = P.OneHot() - self.on_value = Tensor(float(1 - config.label_smoothing), mstype.float32) - self.off_value = Tensor(config.label_smoothing / float(self.vocab_size - 1), mstype.float32) - self.reduce_sum = P.ReduceSum() - self.reduce_mean = P.ReduceMean() - self.reshape = P.Reshape() - self.last_idx = (-1,) - self.flatten = P.Flatten() - self.neg = P.Neg() - self.cast = P.Cast() - self.batch_size = config.batch_size - - def construct(self, prediction_scores, label_ids, label_weights, seq_length): - """Defines the computation performed.""" - flat_shape = (self.batch_size * seq_length,) - label_ids = self.reshape(label_ids, flat_shape) - label_weights = self.cast(self.reshape(label_weights, flat_shape), mstype.float32) - one_hot_labels = self.onehot(label_ids, self.vocab_size, self.on_value, self.off_value) - - per_example_loss = self.neg(self.reduce_sum(prediction_scores * one_hot_labels, self.last_idx)) - numerator = self.reduce_sum(label_weights * per_example_loss, ()) - denominator = self.reduce_sum(label_weights, ()) + \ - self.cast(F.tuple_to_array((1e-5,)), mstype.float32) - loss = numerator / denominator - return loss - - -class TransformerNetworkWithLoss(nn.Cell): - """ - Provide transformer training loss through network. - - Args: - config (TransformerConfig): The config of Transformer. - is_training (bool): Specifies whether to use the training mode. - use_one_hot_embeddings (bool): Specifies whether to use one-hot for embeddings. Default: False. - - Returns: - Tensor, the loss of the network. - """ - def __init__(self, config, is_training, use_one_hot_embeddings=False): - super(TransformerNetworkWithLoss, self).__init__(auto_prefix=False) - self.transformer = TransformerModel(config, is_training, use_one_hot_embeddings) - self.loss = TransformerTrainingLoss(config) - self.cast = P.Cast() - self.shape = P.Shape() - - def construct(self, - source_ids, - source_mask, - target_ids, - target_mask, - label_ids, - label_weights): - """Transformer network with loss.""" - prediction_scores = self.transformer(source_ids, source_mask, target_ids, target_mask) - seq_length = self.shape(source_ids)[1] - total_loss = self.loss(prediction_scores, label_ids, label_weights, seq_length) - return self.cast(total_loss, mstype.float32) - - -class TransformerTrainOneStepCell(nn.TrainOneStepCell): - """ - Encapsulation class of transformer network training. - - Append an optimizer to the training network after that the construct - function can be called to create the backward graph. - - Args: - network (Cell): The training network. Note that loss function should have been added. - optimizer (Optimizer): Optimizer for updating the weights. - sens (Number): The adjust parameter. Default: 1.0. 
- """ - def __init__(self, network, optimizer, sens=1.0): - super(TransformerTrainOneStepCell, self).__init__(network, optimizer, sens) - - self.cast = P.Cast() - self.hyper_map = C.HyperMap() - - def set_sens(self, value): - self.sens = value - - def construct(self, - source_eos_ids, - source_eos_mask, - target_sos_ids, - target_sos_mask, - target_eos_ids, - target_eos_mask,): - """Defines the computation performed.""" - source_ids = source_eos_ids - source_mask = source_eos_mask - target_ids = target_sos_ids - target_mask = target_sos_mask - label_ids = target_eos_ids - label_weights = target_eos_mask - - weights = self.weights - loss = self.network(source_ids, - source_mask, - target_ids, - target_mask, - label_ids, - label_weights) - grads = self.grad(self.network, weights)(source_ids, - source_mask, - target_ids, - target_mask, - label_ids, - label_weights, - self.cast(F.tuple_to_array((self.sens,)), - mstype.float32)) - grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads) - # apply grad reducer on grads - grads = self.grad_reducer(grads) - succ = self.optimizer(grads) - return F.depend(loss, succ) - - -grad_scale = C.MultitypeFuncGraph("grad_scale") -reciprocal = P.Reciprocal() - - -@grad_scale.register("Tensor", "Tensor") -def tensor_grad_scale(scale, grad): - return grad * F.cast(reciprocal(scale), F.dtype(grad)) - -_grad_overflow = C.MultitypeFuncGraph("_grad_overflow") -grad_overflow = P.FloatStatus() - -@_grad_overflow.register("Tensor") -def _tensor_grad_overflow(grad): - return grad_overflow(grad) - -class TransformerTrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell): - """ - Encapsulation class of Transformer network training. - - Append an optimizer to the training network after that the construct - function can be called to create the backward graph. - - Args: - network (Cell): The training network. Note that loss function should have been added. - optimizer (Optimizer): Optimizer for updating the weights. - scale_update_cell (Cell): Cell to do the loss scale. Default: None. 
- """ - def __init__(self, network, optimizer, scale_update_cell=None): - super(TransformerTrainOneStepWithLossScaleCell, self).__init__(network, optimizer, scale_update_cell) - self.cast = P.Cast() - self.degree = 1 - if self.reducer_flag: - self.degree = get_group_size() - self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree) - - self.loss_scale = None - self.loss_scaling_manager = scale_update_cell - if scale_update_cell: - self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32)) - - def construct(self, - source_eos_ids, - source_eos_mask, - target_sos_ids, - target_sos_mask, - target_eos_ids, - target_eos_mask, - sens=None): - """Defines the computation performed.""" - source_ids = source_eos_ids - source_mask = source_eos_mask - target_ids = target_sos_ids - target_mask = target_sos_mask - label_ids = target_eos_ids - label_weights = target_eos_mask - - weights = self.weights - loss = self.network(source_ids, - source_mask, - target_ids, - target_mask, - label_ids, - label_weights) - if sens is None: - scaling_sens = self.loss_scale - else: - scaling_sens = sens - status, scaling_sens = self.start_overflow_check(loss, scaling_sens) - grads = self.grad(self.network, weights)(source_ids, - source_mask, - target_ids, - target_mask, - label_ids, - label_weights, - self.cast(scaling_sens, - mstype.float32)) - - # apply grad reducer on grads - grads = self.grad_reducer(grads) - grads = self.hyper_map(F.partial(grad_scale, scaling_sens * self.degree), grads) - grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads) - - cond = self.get_overflow_status(status, grads) - overflow = cond - if sens is None: - overflow = self.loss_scaling_manager(self.loss_scale, cond) - if overflow: - succ = False - else: - succ = self.optimizer(grads) - ret = (loss, cond, scaling_sens) - return F.depend(ret, succ) - - -cast = P.Cast() -add_grads = C.MultitypeFuncGraph("add_grads") - - -@add_grads.register("Tensor", "Tensor") -def _add_grads(accu_grad, grad): - return accu_grad + cast(grad, mstype.float32) - -update_accu_grads = C.MultitypeFuncGraph("update_accu_grads") - -@update_accu_grads.register("Tensor", "Tensor") -def _update_accu_grads(accu_grad, grad): - succ = True - return F.depend(succ, F.assign(accu_grad, cast(grad, mstype.float32))) - -accumulate_accu_grads = C.MultitypeFuncGraph("accumulate_accu_grads") - -@accumulate_accu_grads.register("Tensor", "Tensor") -def _accumulate_accu_grads(accu_grad, grad): - succ = True - return F.depend(succ, F.assign_add(accu_grad, cast(grad, mstype.float32))) - - -zeroslike = P.ZerosLike() -reset_accu_grads = C.MultitypeFuncGraph("reset_accu_grads") - - -@reset_accu_grads.register("Tensor") -def _reset_accu_grads(accu_grad): - succ = True - return F.depend(succ, F.assign(accu_grad, zeroslike(accu_grad))) - - -class TransformerTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell): - """ - Encapsulation class of bert network training. - - Append an optimizer to the training network after that the construct - function can be called to create the backward graph. - - To mimic higher batch size, gradients are accumulated N times before weight update. - - For distribution mode, allreduce will only be implemented in the weight updated step, - i.e. the sub-step after gradients accumulated N times. - - Args: - network (Cell): The training network. Note that loss function should have been added. - optimizer (Optimizer): Optimizer for updating the weights. 
- scale_update_cell (Cell): Cell to do the loss scale. Default: None. - accumulation_steps (int): Number of accumulation steps before gradient update. The global batch size = - batch_size * accumulation_steps. Default: 1. - """ - - def __init__(self, network, optimizer, scale_update_cell=None, accumulation_steps=8, enable_global_norm=False): - super(TransformerTrainAccumulationAllReducePostWithLossScaleCell, self).__init__(auto_prefix=False) - self.network = network - self.network.set_grad() - self.weights = optimizer.parameters - self.optimizer = optimizer - self.accumulation_steps = accumulation_steps - self.enable_global_norm = enable_global_norm - self.one = Tensor(np.array([1]).astype(np.int32)) - self.zero = Tensor(np.array([0]).astype(np.int32)) - self.local_step = Parameter(initializer(0, [1], mstype.int32)) - self.accu_grads = self.weights.clone(prefix="accu_grads", init='zeros') - self.accu_overflow = Parameter(initializer(0, [1], mstype.int32)) - self.accu_loss = Parameter(initializer(0, [1], mstype.float32)) - - self.grad = C.GradOperation(get_by_list=True, sens_param=True) - self.reducer_flag = False - self.parallel_mode = context.get_auto_parallel_context("parallel_mode") - if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]: - self.reducer_flag = True - self.grad_reducer = F.identity - self.degree = 1 - if self.reducer_flag: - self.degree = get_group_size() - self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree) - self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE) - self.overflow_reducer = F.identity - if self.is_distributed: - self.overflow_reducer = P.AllReduce() - self.cast = P.Cast() - self.alloc_status = P.NPUAllocFloatStatus() - self.get_status = P.NPUGetFloatStatus() - self.clear_status = P.NPUClearFloatStatus() - self.reduce_sum = P.ReduceSum(keep_dims=False) - self.base = Tensor(1, mstype.float32) - self.less_equal = P.LessEqual() - self.logical_or = P.LogicalOr() - self.not_equal = P.NotEqual() - self.select = P.Select() - self.reshape = P.Reshape() - self.hyper_map = C.HyperMap() - self.loss_scale = None - self.loss_scaling_manager = scale_update_cell - if scale_update_cell: - self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32)) - - def construct(self, - source_eos_ids, - source_eos_mask, - target_sos_ids, - target_sos_mask, - target_eos_ids, - target_eos_mask, - sens=None): - """Defines the computation performed.""" - source_ids = source_eos_ids - source_mask = source_eos_mask - target_ids = target_sos_ids - target_mask = target_sos_mask - label_ids = target_eos_ids - label_weights = target_eos_mask - - weights = self.weights - loss = self.network(source_ids, - source_mask, - target_ids, - target_mask, - label_ids, - label_weights) - if sens is None: - scaling_sens = self.loss_scale - else: - scaling_sens = sens - # alloc status and clear should be right before gradoperation - init = self.alloc_status() - init = F.depend(init, loss) - clear_status = self.clear_status(init) - scaling_sens = F.depend(scaling_sens, clear_status) - # update accumulation parameters - is_accu_step = self.not_equal(self.local_step, self.accumulation_steps) - self.local_step = self.select(is_accu_step, self.local_step + self.one, self.one) - self.accu_loss = self.select(is_accu_step, self.accu_loss + loss, loss) - mean_loss = self.accu_loss / self.local_step - is_accu_step = self.not_equal(self.local_step, self.accumulation_steps) - - grads = self.grad(self.network, 
weights)(source_ids, - source_mask, - target_ids, - target_mask, - label_ids, - label_weights, - self.cast(scaling_sens, - mstype.float32)) - - accu_succ = self.hyper_map(accumulate_accu_grads, self.accu_grads, grads) - mean_loss = F.depend(mean_loss, accu_succ) - - init = F.depend(init, mean_loss) - get_status = self.get_status(init) - init = F.depend(init, get_status) - flag_sum = self.reduce_sum(init, (0,)) - overflow = self.less_equal(self.base, flag_sum) - overflow = self.logical_or(self.not_equal(self.accu_overflow, self.zero), overflow) - accu_overflow = self.select(overflow, self.one, self.zero) - self.accu_overflow = self.select(is_accu_step, accu_overflow, self.zero) - - if is_accu_step: - succ = False - else: - # apply grad reducer on grads - grads = self.grad_reducer(self.accu_grads) - scaling = scaling_sens * self.degree * self.accumulation_steps - grads = self.hyper_map(F.partial(grad_scale, scaling), grads) - if self.enable_global_norm: - grads = C.clip_by_global_norm(grads, 1.0, None) - else: - grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads) - accu_overflow = F.depend(accu_overflow, grads) - accu_overflow = self.overflow_reducer(accu_overflow) - overflow = self.less_equal(self.base, accu_overflow) - accu_succ = self.hyper_map(reset_accu_grads, self.accu_grads) - overflow = F.depend(overflow, accu_succ) - overflow = self.reshape(overflow, (())) - if sens is None: - overflow = self.loss_scaling_manager(self.loss_scale, overflow) - if overflow: - succ = False - else: - succ = self.optimizer(grads) - - ret = (mean_loss, overflow, scaling_sens) - return F.depend(ret, succ) diff --git a/tests/st/model_zoo_tests/transformer/src/transformer_model.py b/tests/st/model_zoo_tests/transformer/src/transformer_model.py deleted file mode 100644 index 5e0aa6aa5b7..00000000000 --- a/tests/st/model_zoo_tests/transformer/src/transformer_model.py +++ /dev/null @@ -1,1153 +0,0 @@ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Transformer model.""" - -import math -import copy -import numpy as np -import mindspore.common.dtype as mstype -import mindspore.nn as nn -import mindspore.ops.functional as F -from mindspore.ops import operations as P -from mindspore.common.tensor import Tensor -from mindspore.common.parameter import Parameter -from mindspore.ops.primitive import constexpr -from .beam_search import BeamSearchDecoder, TileBeam -from .weight_init import normal_weight, weight_variable - -class TransformerConfig: - """ - Configuration for `Transformer`. - - Args: - batch_size (int): Batch size of input dataset. - seq_length (int): Length of input sequence. Default: 128. - vocab_size (int): The shape of each embedding vector. Default: 36560. - hidden_size (int): Size of the layers. Default: 1024. - num_hidden_layers (int): Number of hidden layers in the Transformer encoder/decoder - cell. Default: 6. 
- num_attention_heads (int): Number of attention heads in the Transformer - encoder/decoder cell. Default: 16. - intermediate_size (int): Size of intermediate layer in the Transformer - encoder/decoder cell. Default: 4096. - hidden_act (str): Activation function used in the Transformer encoder/decoder - cell. Default: "relu". - hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.3. - attention_probs_dropout_prob (float): The dropout probability for - MultiheadAttention. Default: 0.3. - max_position_embeddings (int): Maximum length of sequences used in this - model. Default: 128. - initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. - label_smoothing (float): label smoothing setting. Default: 0.1 - beam_width (int): beam width setting. Default: 4 - max_decode_length (int): max decode length in evaluation. Default: 80 - length_penalty_weight (float): normalize scores of translations according to their length. Default: 1.0 - dtype (:class:`mindspore.dtype`): Data type of the input. Default: mstype.float32. - compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mstype.float32. - """ - def __init__(self, - batch_size, - seq_length=128, - vocab_size=36560, - hidden_size=1024, - num_hidden_layers=6, - num_attention_heads=16, - intermediate_size=4096, - hidden_act="relu", - hidden_dropout_prob=0.3, - attention_probs_dropout_prob=0.3, - max_position_embeddings=128, - initializer_range=0.02, - label_smoothing=0.1, - beam_width=4, - max_decode_length=80, - length_penalty_weight=1.0, - dtype=mstype.float32, - compute_type=mstype.float32): - self.batch_size = batch_size - self.seq_length = seq_length - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.label_smoothing = label_smoothing - self.beam_width = beam_width - self.max_decode_length = max_decode_length - self.length_penalty_weight = length_penalty_weight - self.dtype = dtype - self.compute_type = compute_type - - -class EmbeddingLookup(nn.Cell): - """ - A embeddings lookup table with a fixed dictionary and size. - - Args: - vocab_size (int): Size of the dictionary of embeddings. - embedding_size (int): The size of each embedding vector. - use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. - initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. 
- """ - def __init__(self, - vocab_size, - embedding_size, - use_one_hot_embeddings=False, - initializer_range=0.02): - super(EmbeddingLookup, self).__init__() - self.vocab_size = vocab_size - self.embedding_size = embedding_size - self.use_one_hot_embeddings = use_one_hot_embeddings - self.embedding_table = Parameter(normal_weight([vocab_size, embedding_size], embedding_size)) - self.expand = P.ExpandDims() - self.shape_flat = (-1,) - self.gather = P.Gather() - self.one_hot = P.OneHot() - self.on_value = Tensor(1.0, mstype.float32) - self.off_value = Tensor(0.0, mstype.float32) - self.array_mul = P.MatMul() - self.reshape = P.Reshape() - self.shape = P.Shape() - - def construct(self, input_ids): - """Get a embeddings lookup table with a fixed dictionary and size.""" - input_shape = self.shape(input_ids) - - flat_ids = self.reshape(input_ids, self.shape_flat) - if self.use_one_hot_embeddings: - one_hot_ids = self.one_hot(flat_ids, self.vocab_size, self.on_value, self.off_value) - output_for_reshape = self.array_mul(one_hot_ids, self.embedding_table) - else: - output_for_reshape = self.gather(self.embedding_table, flat_ids, 0) - - out_shape = input_shape + (self.embedding_size,) - output = self.reshape(output_for_reshape, out_shape) - return output, self.embedding_table - - -def position_encoding(length, - depth, - min_timescale=1, - max_timescale=1e4): - """ - Create Tensor of sinusoids of different frequencies. - - Args: - length (int): Length of the Tensor to create, i.e. Number of steps. - depth (int): Hidden size. - min_timescale (float): Default: 1. - max_timescale (float): Default: 10000. - - Returns: - Tensor of shape (length, depth) - """ - depth = depth // 2 - positions = np.arange(length, dtype=np.float32) - log_timescale_increment = (np.log(max_timescale / min_timescale) / (depth - 1)) - inv_timescales = min_timescale * np.exp(np.arange(depth, dtype=np.float32) * -log_timescale_increment) - scaled_time = np.expand_dims(positions, 1) * np.expand_dims(inv_timescales, 0) - x = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1) - return x - - -class EmbeddingPostprocessor(nn.Cell): - """ - Postprocessors apply positional embeddings to word embeddings. - - Args: - embedding_size (int): The size of each embedding vector. - use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. - initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. - max_position_embeddings (int): Maximum length of sequences used in this - model. Default: 128. - dropout_prob (float): The dropout probability. Default: 0.1. 
- """ - def __init__(self, - embedding_size, - use_one_hot_embeddings=False, - initializer_range=0.02, - max_position_embeddings=128, - dropout_prob=0.1): - super(EmbeddingPostprocessor, self).__init__() - self.scores_mul = Tensor([math.sqrt(float(embedding_size))], dtype=mstype.float32) - self.multiply = P.Mul() - self.add = P.Add() - self.dropout = nn.Dropout(1 - dropout_prob, dtype=mstype.float32) - self.use_dropout = dropout_prob > 0 - self.expand_dims = P.ExpandDims() - self.position_embedding_table = Tensor(position_encoding(max_position_embeddings, embedding_size), - mstype.float32) - self.shape = P.Shape() - - def construct(self, word_embeddings): - """Postprocessors apply positional embeddings to word embeddings.""" - input_shape = self.shape(word_embeddings) - input_len = input_shape[1] - - output = self.multiply(word_embeddings, self.scores_mul) - - # add position embeddings - position_embeddings = self.position_embedding_table[0:input_len:1, ::] - position_embeddings = self.expand_dims(position_embeddings, 0) - output = self.add(output, position_embeddings) - - if self.use_dropout: - output = self.dropout(output) - return output - - -class CastWrapper(nn.Cell): - """ - Cast wrapper. - """ - def __init__(self, src_type=mstype.float32, dst_type=mstype.float32): - super(CastWrapper, self).__init__() - self.cast = P.Cast() - self.dst_type = dst_type - - def construct(self, x): - return self.cast(x, self.dst_type) - - -class LayerPreprocess(nn.Cell): - """ - preprocess input of each layer. - """ - def __init__(self, - in_channels=None): - super(LayerPreprocess, self).__init__() - self.layernorm = nn.LayerNorm((in_channels,)) - self.cast = P.Cast() - self.get_dtype = P.DType() - - def construct(self, input_tensor): - output = self.cast(input_tensor, mstype.float32) - output = self.layernorm(output) - output = self.cast(output, self.get_dtype(input_tensor)) - return output - - -class LayerPostprocess(nn.Cell): - """ - postprocess output of each layer. - """ - def __init__(self, - dropout_prob=0.1): - super(LayerPostprocess, self).__init__() - self.add = P.Add() - self.dropout = nn.Dropout(1 - dropout_prob) - self.use_dropout = dropout_prob > 0 - - def construct(self, hidden_tensor, input_tensor): - output = hidden_tensor - if self.use_dropout: - output = self.dropout(output) - output = self.add(output, input_tensor) - return output - - -class MultiheadAttention(nn.Cell): - """ - Apply multi-headed attention from "from_tensor" to "to_tensor". - - Args: - batch_size (int): Batch size of input datasets. - from_tensor_width (int): Size of last dim of from_tensor. - to_tensor_width (int): Size of last dim of to_tensor. - from_seq_length (int): Length of from_tensor sequence. - to_seq_length (int): Length of to_tensor sequence. - num_attention_heads (int): Number of attention heads. Default: 1. - size_per_head (int): Size of each attention head. Default: 512. - query_act (str): Activation function for the query transform. Default: None. - key_act (str): Activation function for the key transform. Default: None. - value_act (str): Activation function for the value transform. Default: None. - has_attention_mask (bool): Specifies whether to use attention mask. Default: False. - attention_probs_dropout_prob (float): The dropout probability for - MultiheadAttention. Default: 0.0. - use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. - initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. 
- do_return_2d_tensor (bool): True for return 2d tensor. False for return 3d - tensor. Default: False. - compute_type (:class:`mindspore.dtype`): Compute type in MultiheadAttention. Default: mstype.float32. - """ - def __init__(self, - batch_size, - from_tensor_width, - to_tensor_width, - out_tensor_width, - num_attention_heads=1, - size_per_head=512, - query_act=None, - key_act=None, - value_act=None, - out_act=None, - has_attention_mask=True, - attention_probs_dropout_prob=0.0, - use_one_hot_embeddings=False, - initializer_range=0.02, - do_return_2d_tensor=True, - compute_type=mstype.float32): - super(MultiheadAttention, self).__init__() - self.batch_size = batch_size - self.num_attention_heads = num_attention_heads - self.size_per_head = size_per_head - self.has_attention_mask = has_attention_mask - assert has_attention_mask - self.use_one_hot_embeddings = use_one_hot_embeddings - self.initializer_range = initializer_range - self.do_return_2d_tensor = do_return_2d_tensor - - self.scores_mul = Tensor([1.0 / math.sqrt(float(self.size_per_head))], dtype=compute_type) - self.reshape = P.Reshape() - self.shape_from_2d = (-1, from_tensor_width) - self.shape_to_2d = (-1, to_tensor_width) - units = num_attention_heads * size_per_head - self.query_layer = nn.Dense(from_tensor_width, - units, - activation=query_act, - has_bias=False, - weight_init=weight_variable([units, from_tensor_width])).to_float(compute_type) - self.key_layer = nn.Dense(to_tensor_width, - units, - activation=key_act, - has_bias=False, - weight_init=weight_variable([units, to_tensor_width])).to_float(compute_type) - self.value_layer = nn.Dense(to_tensor_width, - units, - activation=value_act, - has_bias=False, - weight_init=weight_variable([units, to_tensor_width])).to_float(compute_type) - self.out_layer = nn.Dense(units, - out_tensor_width, - activation=out_act, - has_bias=False, - weight_init=weight_variable([out_tensor_width, units])).to_float(compute_type) - - self.matmul_trans_b = P.BatchMatMul(transpose_b=True) - self.multiply = P.Mul() - self.transpose = P.Transpose() - self.trans_shape = (0, 2, 1, 3) - self.trans_shape_relative = (2, 0, 1, 3) - self.trans_shape_position = (1, 2, 0, 3) - self.multiply_data = Tensor([-10000.0,], dtype=compute_type) - self.batch_num = batch_size * num_attention_heads - self.matmul = P.BatchMatMul() - - self.softmax = nn.Softmax() - self.dropout = nn.Dropout(1 - attention_probs_dropout_prob) - self.use_dropout = attention_probs_dropout_prob > 0 - - if self.has_attention_mask: - self.expand_dims = P.ExpandDims() - self.sub = P.Sub() - self.add = P.Add() - self.cast = P.Cast() - self.get_dtype = P.DType() - - self.cast_compute_type = CastWrapper(dst_type=compute_type) - self.softmax_cast = P.Cast() - - def construct(self, from_tensor, to_tensor, seq_length, enc_seq_length, attention_mask=None): - """Apply multihead attention.""" - from_seq_length = seq_length - to_seq_length = enc_seq_length - shape_from = (self.batch_size, from_seq_length, self.num_attention_heads, self.size_per_head) - shape_to = (self.batch_size, to_seq_length, self.num_attention_heads, self.size_per_head) - if self.do_return_2d_tensor: - shape_return = (self.batch_size * from_seq_length, self.num_attention_heads * self.size_per_head) - if from_seq_length == -1: - shape_return = (-1, self.num_attention_heads * self.size_per_head) - else: - shape_return = (self.batch_size, from_seq_length, self.num_attention_heads * self.size_per_head) - - # reshape 2d/3d input tensors to 2d - from_tensor_2d = self.reshape(from_tensor, 
self.shape_from_2d) - to_tensor_2d = self.reshape(to_tensor, self.shape_to_2d) - query_out = self.query_layer(from_tensor_2d) - key_out = self.key_layer(to_tensor_2d) - value_out = self.value_layer(to_tensor_2d) - - query_layer = self.reshape(query_out, shape_from) - query_layer = self.transpose(query_layer, self.trans_shape) - key_layer = self.reshape(key_out, shape_to) - key_layer = self.transpose(key_layer, self.trans_shape) - - attention_scores = self.matmul_trans_b(query_layer, key_layer) - attention_scores = self.multiply(attention_scores, self.scores_mul) - - if self.has_attention_mask: - attention_mask = self.expand_dims(attention_mask, 1) - multiply_out = self.sub(self.cast(F.tuple_to_array((1.0,)), self.get_dtype(attention_scores)), - self.cast(attention_mask, self.get_dtype(attention_scores))) - adder = self.multiply(multiply_out, self.multiply_data) - attention_scores = self.add(adder, attention_scores) - - attention_scores = self.softmax_cast(attention_scores, mstype.float32) - attention_probs = self.softmax(attention_scores) - attention_probs = self.softmax_cast(attention_probs, self.get_dtype(key_layer)) - if self.use_dropout: - attention_probs = self.dropout(attention_probs) - - value_layer = self.reshape(value_out, shape_to) - value_layer = self.transpose(value_layer, self.trans_shape) - context_layer = self.matmul(attention_probs, value_layer) - - context_layer = self.transpose(context_layer, self.trans_shape) - context_layer = self.reshape(context_layer, shape_return) - context_layer = self.out_layer(context_layer) - return context_layer - - -class SelfAttention(nn.Cell): - """ - Apply self-attention. - - Args: - batch_size (int): Batch size of input dataset. - from_seq_length (int): Length of query sequence. - to_seq_length (int): Length of memory sequence. - hidden_size (int): Size of attention layers. - num_attention_heads (int): Number of attention heads. Default: 16. - attention_probs_dropout_prob (float): The dropout probability for - SelfAttention. Default: 0.1. - use_one_hot_embeddings (bool): Specifies whether to use one_hot encoding form. Default: False. - initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. - hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1. - has_attention_mask (bool): Specifies whether has attention mask. Default: True. - is_encdec_att (bool): Specifies whether query sequence and memory sequence are different. Default: False. - compute_type (:class:`mindspore.dtype`): Compute type in MultiheadAttention. Default: mstype.float32. 
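# Per attention head, the construct above is standard scaled dot-product attention with an
# additive mask: positions where attention_mask is 0 receive a -10000 penalty before the
# softmax, so they contribute almost nothing. A single-head NumPy sketch of the same
# arithmetic (batching, head splitting and dropout omitted; illustration only):
import numpy as np

def scaled_dot_product_attention(q, k, v, mask, size_per_head):
    """q: [seq_q, d], k and v: [seq_k, d], mask: [seq_q, seq_k] with 1 = may attend."""
    scores = (q @ k.T) / np.sqrt(float(size_per_head))      # matmul_trans_b + scores_mul
    scores = scores + (1.0 - mask) * -10000.0               # additive mask (multiply_data)
    scores = scores - scores.max(axis=-1, keepdims=True)    # numerically stable softmax
    probs = np.exp(scores)
    probs = probs / probs.sum(axis=-1, keepdims=True)
    return probs @ v                                        # context vectors, [seq_q, d]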
- """ - def __init__(self, - batch_size, - hidden_size, - num_attention_heads=16, - attention_probs_dropout_prob=0.1, - use_one_hot_embeddings=False, - initializer_range=0.02, - hidden_dropout_prob=0.1, - has_attention_mask=True, - is_encdec_att=False, - compute_type=mstype.float32): - super(SelfAttention, self).__init__() - if hidden_size % num_attention_heads != 0: - raise ValueError("The hidden size (%d) is not a multiple of the number " - "of attention heads (%d)" % (hidden_size, num_attention_heads)) - self.size_per_head = int(hidden_size / num_attention_heads) - self.is_encdec_att = is_encdec_att - - self.attention = MultiheadAttention( - batch_size=batch_size, - from_tensor_width=hidden_size, - to_tensor_width=hidden_size, - out_tensor_width=hidden_size, - num_attention_heads=num_attention_heads, - size_per_head=self.size_per_head, - attention_probs_dropout_prob=attention_probs_dropout_prob, - use_one_hot_embeddings=use_one_hot_embeddings, - initializer_range=initializer_range, - has_attention_mask=has_attention_mask, - do_return_2d_tensor=True, - compute_type=compute_type) - - self.preprocess = LayerPreprocess(in_channels=hidden_size) - self.postprocess = LayerPostprocess(dropout_prob=hidden_dropout_prob) - - self.reshape = P.Reshape() - self.shape = (-1, hidden_size) - def construct(self, input_tensor, memory_tensor, attention_mask, seq_length, enc_seq_length): - """Apply self-attention.""" - input_tensor = self.reshape(input_tensor, self.shape) - memory_tensor = self.reshape(memory_tensor, self.shape) - - output = self.preprocess(input_tensor) - - if not self.is_encdec_att: - memory_tensor = output - - attention_output = self.attention(output, memory_tensor, seq_length, enc_seq_length, attention_mask) - output = self.postprocess(attention_output, input_tensor) - return output - - -class FeedForward(nn.Cell): - """ - Apply two-layer feed forward - - Args: - in_channels (int): Size of the input layer. - hidden_size (int): Size of the hidden layer. - out_channels (int): Size of the output layers. - hidden_act (str): name of the activation function. Default: relu - initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. - hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1. - compute_type (:class:`mindspore.dtype`): Compute type in FeedForward. Default: mstype.float32. 
- """ - def __init__(self, - in_channels, - hidden_size, - out_channels, - hidden_act="relu", - initializer_range=0.02, - hidden_dropout_prob=0.1, - compute_type=mstype.float32): - super(FeedForward, self).__init__() - - self.conv1 = nn.Dense(in_channels, - hidden_size, - activation=hidden_act, - weight_init=weight_variable([hidden_size, in_channels])).to_float(compute_type) - self.conv2 = nn.Dense(hidden_size, - out_channels, - weight_init=weight_variable([out_channels, hidden_size])).to_float(compute_type) - - self.preprocess = LayerPreprocess(in_channels=in_channels) - self.postprocess = LayerPostprocess(dropout_prob=hidden_dropout_prob) - - self.reshape = P.Reshape() - self.shape = (-1, in_channels) - self.dropout = nn.Dropout(1 - hidden_dropout_prob) - self.use_dropout = hidden_dropout_prob > 0 - - def construct(self, input_tensor): - input_tensor = self.reshape(input_tensor, self.shape) - output = self.preprocess(input_tensor) - output = self.conv1(output) - if self.use_dropout: - output = self.dropout(output) - output = self.conv2(output) - output = self.postprocess(output, input_tensor) - return output - - -class EncoderCell(nn.Cell): - """ - Encoder cells used in Transformer. - - Args: - batch_size (int): Batch size of input dataset. - hidden_size (int): Size of the encoder layers. Default: 1024. - seq_length (int): Length of input sequence. Default: 128. - num_attention_heads (int): Number of attention heads. Default: 16. - intermediate_size (int): Size of intermediate layer. Default: 4096. - attention_probs_dropout_prob (float): The dropout probability for - SelfAttention. Default: 0.02. - use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. - initializer_range (float): Initialization value of TruncatedNormal. Default: 0.1. - hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1. - hidden_act (str): Activation function. Default: "relu". - compute_type (:class:`mindspore.dtype`): Compute type in attention. Default: mstype.float32. - """ - def __init__(self, - batch_size, - hidden_size=1024, - num_attention_heads=16, - intermediate_size=4096, - attention_probs_dropout_prob=0.1, - use_one_hot_embeddings=False, - initializer_range=0.02, - hidden_dropout_prob=0.1, - hidden_act="relu", - compute_type=mstype.float32): - super(EncoderCell, self).__init__() - self.attention = SelfAttention( - batch_size=batch_size, - hidden_size=hidden_size, - num_attention_heads=num_attention_heads, - attention_probs_dropout_prob=attention_probs_dropout_prob, - use_one_hot_embeddings=use_one_hot_embeddings, - initializer_range=initializer_range, - hidden_dropout_prob=hidden_dropout_prob, - is_encdec_att=False, - compute_type=compute_type) - self.feedforward = FeedForward( - in_channels=hidden_size, - hidden_size=intermediate_size, - out_channels=hidden_size, - hidden_act=hidden_act, - initializer_range=initializer_range, - hidden_dropout_prob=hidden_dropout_prob, - compute_type=compute_type) - - def construct(self, hidden_states, attention_mask, seq_length): - # self-attention with ln, res - attention_output = self.attention(hidden_states, hidden_states, attention_mask, seq_length, seq_length) - # feed forward with ln, res - output = self.feedforward(attention_output) - return output - - -class TransformerEncoder(nn.Cell): - """ - Multi-layer transformer encoder. - - Args: - batch_size (int): Batch size of input dataset. - hidden_size (int): Size of the encoder layers. - seq_length (int): Length of input sequence. 
- num_hidden_layers (int): Number of hidden layers in encoder cells. - num_attention_heads (int): Number of attention heads in encoder cells. Default: 16. - intermediate_size (int): Size of intermediate layer in encoder cells. Default: 4096. - attention_probs_dropout_prob (float): The dropout probability for - SelfAttention. Default: 0.1. - use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. - initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. - hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1.. - hidden_act (str): Activation function used in the encoder cells. Default: "gelu". - compute_type (:class:`mindspore.dtype`): Compute type. Default: mstype.float32. - """ - def __init__(self, - batch_size, - hidden_size, - num_hidden_layers, - num_attention_heads=16, - intermediate_size=4096, - attention_probs_dropout_prob=0.1, - use_one_hot_embeddings=False, - initializer_range=0.02, - hidden_dropout_prob=0.1, - hidden_act="relu", - compute_type=mstype.float32): - super(TransformerEncoder, self).__init__() - self.num_hidden_layers = num_hidden_layers - self.batch_size = batch_size - self.hidden_size = hidden_size - - layers = [] - for _ in range(num_hidden_layers): - layer = EncoderCell(batch_size=batch_size, - hidden_size=hidden_size, - num_attention_heads=num_attention_heads, - intermediate_size=intermediate_size, - attention_probs_dropout_prob=attention_probs_dropout_prob, - use_one_hot_embeddings=use_one_hot_embeddings, - initializer_range=initializer_range, - hidden_dropout_prob=hidden_dropout_prob, - hidden_act=hidden_act, - compute_type=compute_type) - layers.append(layer) - self.layers = nn.CellList(layers) - - self.layer_preprocess = LayerPreprocess(in_channels=hidden_size) - - self.reshape = P.Reshape() - self.shape = (-1, hidden_size) - - def construct(self, input_tensor, attention_mask, seq_length): - """Apply encoder.""" - out_shape = (self.batch_size, seq_length, self.hidden_size) - prev_output = self.reshape(input_tensor, self.shape) - - for layer_module in self.layers: - layer_output = layer_module(prev_output, attention_mask, seq_length) - prev_output = layer_output - - prev_output = self.layer_preprocess(prev_output) - output = self.reshape(prev_output, out_shape) - return output - - -class DecoderCell(nn.Cell): - """ - decoder cells used in Transformer. - - Args: - batch_size (int): Batch size of input dataset. - hidden_size (int): Size of the Transformer decoder layers. Default: 1024. - seq_length (int): Length of input sequence. Default: 128. - enc_seq_length (int): Length of source sentences. Default:128 - num_attention_heads (int): Number of attention heads. Default: 12. - intermediate_size (int): Size of intermediate layer. Default: 4096. - attention_probs_dropout_prob (float): The dropout probability for - SelfAttention. Default: 0.02. - use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. - initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. - hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1. - hidden_act (str): Activation function. Default: "relu". - compute_type (:class:`mindspore.dtype`): Compute type in attention. Default: mstype.float32. 
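# TransformerEncoder flattens the input to [batch*seq, hidden], runs it through
# num_hidden_layers EncoderCells and, because the cells are pre-LayerNorm, applies one final
# LayerPreprocess before reshaping back to [batch, seq, hidden]. DecoderCell below follows the
# same pattern with three sublayers: masked self-attention, encoder-decoder cross-attention,
# then feed-forward. A minimal sketch of the encoder loop (illustration only):
def run_encoder(x, attention_mask, cells, final_layer_norm):
    for cell in cells:            # each cell: self-attention + feed-forward, both with residuals
        x = cell(x, attention_mask)
    return final_layer_norm(x)    # the trailing LayerNorm used with pre-LN blocks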
- """ - def __init__(self, - batch_size, - hidden_size=1024, - num_attention_heads=12, - intermediate_size=4096, - attention_probs_dropout_prob=0.02, - use_one_hot_embeddings=False, - initializer_range=0.02, - hidden_dropout_prob=0.1, - hidden_act="relu", - compute_type=mstype.float32): - super(DecoderCell, self).__init__() - self.self_attention = SelfAttention( - batch_size=batch_size, - hidden_size=hidden_size, - num_attention_heads=num_attention_heads, - attention_probs_dropout_prob=attention_probs_dropout_prob, - use_one_hot_embeddings=use_one_hot_embeddings, - initializer_range=initializer_range, - is_encdec_att=False, - hidden_dropout_prob=hidden_dropout_prob, - compute_type=compute_type) - self.cross_attention = SelfAttention( - batch_size=batch_size, - hidden_size=hidden_size, - num_attention_heads=num_attention_heads, - attention_probs_dropout_prob=attention_probs_dropout_prob, - use_one_hot_embeddings=use_one_hot_embeddings, - initializer_range=initializer_range, - is_encdec_att=True, - hidden_dropout_prob=hidden_dropout_prob, - compute_type=compute_type) - self.feedforward = FeedForward( - in_channels=hidden_size, - hidden_size=intermediate_size, - out_channels=hidden_size, - hidden_act=hidden_act, - initializer_range=initializer_range, - hidden_dropout_prob=hidden_dropout_prob, - compute_type=compute_type) - - def construct(self, hidden_states, attention_mask, enc_states, enc_attention_mask, seq_length, enc_seq_length): - # self-attention with ln, res - attention_output = self.self_attention(hidden_states, hidden_states, attention_mask, seq_length, seq_length) - # cross-attention with ln, res - attention_output = self.cross_attention(attention_output, enc_states, enc_attention_mask, - seq_length, enc_seq_length) - # feed forward with ln, res - output = self.feedforward(attention_output) - return output - - -class TransformerDecoder(nn.Cell): - """ - Multi-layer transformer decoder. - - Args: - batch_size (int): Batch size of input dataset. - hidden_size (int): Size of the encoder layers. - seq_length (int): Length of input sequence. - enc_seq_length (int): Length of source sentences. - num_hidden_layers (int): Number of hidden layers in encoder cells. - num_attention_heads (int): Number of attention heads in encoder cells. Default: 16. - intermediate_size (int): Size of intermediate layer in encoder cells. Default: 4096. - attention_probs_dropout_prob (float): The dropout probability for - SelfAttention. Default: 0.1. - use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. - initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. - hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1. - hidden_act (str): Activation function used in the encoder cells. Default: "gelu". - compute_type (:class:`mindspore.dtype`): Compute type. Default: mstype.float32. 
- """ - def __init__(self, - batch_size, - hidden_size, - num_hidden_layers, - num_attention_heads=16, - intermediate_size=4096, - attention_probs_dropout_prob=0.1, - use_one_hot_embeddings=False, - initializer_range=0.02, - hidden_dropout_prob=0.1, - hidden_act="relu", - compute_type=mstype.float32): - super(TransformerDecoder, self).__init__() - self.num_hidden_layers = num_hidden_layers - - layers = [] - for _ in range(num_hidden_layers): - layer = DecoderCell(batch_size=batch_size, - hidden_size=hidden_size, - num_attention_heads=num_attention_heads, - intermediate_size=intermediate_size, - attention_probs_dropout_prob=attention_probs_dropout_prob, - use_one_hot_embeddings=use_one_hot_embeddings, - initializer_range=initializer_range, - hidden_dropout_prob=hidden_dropout_prob, - hidden_act=hidden_act, - compute_type=compute_type) - layers.append(layer) - self.layers = nn.CellList(layers) - - self.layer_preprocess = LayerPreprocess(in_channels=hidden_size) - - self.reshape = P.Reshape() - self.shape = (-1, hidden_size) - self.hidden_size = hidden_size - self.batch_size = batch_size - - def construct(self, input_tensor, attention_mask, enc_states, enc_attention_mask, seq_length, enc_seq_length): - """Apply decoder.""" - out_shape = (self.batch_size, seq_length, self.hidden_size) - prev_output = self.reshape(input_tensor, self.shape) - - for layer_module in self.layers: - layer_output = layer_module(prev_output, attention_mask, enc_states, enc_attention_mask, - seq_length, enc_seq_length) - prev_output = layer_output - - prev_output = self.layer_preprocess(prev_output) - output = self.reshape(prev_output, out_shape) - return output - - -class CreateAttentionMaskFromInputMask(nn.Cell): - """ - Create attention mask according to input mask. - - Args: - config (:class:`TransformerConfig`): Configuration for Transformer. - """ - def __init__(self): - super(CreateAttentionMaskFromInputMask, self).__init__() - self.cast = P.Cast() - self.reshape = P.Reshape() - self.shape = P.Shape() - self.batch_matmul = P.BatchMatMul() - - def construct(self, input_mask): - """Create attention mask according to input mask.""" - input_shape = self.shape(input_mask) - shape_right = (input_shape[0], 1, input_shape[1]) - shape_left = input_shape + (1,) - - input_mask = self.cast(input_mask, mstype.float32) - mask_left = self.reshape(input_mask, shape_left) - mask_right = self.reshape(input_mask, shape_right) - attention_mask = self.batch_matmul(mask_left, mask_right) - - return attention_mask - - -class PredLogProbs(nn.Cell): - """ - Get log probs. - - Args: - batch_size (int): Batch size. - seq_length (int): Length of input sequence. - width (int): Hidden size. - compute_type (:class:`mindspore.dtype`): Compute type. Default: mstype.float32. - dtype (:class:`mindspore.dtype`): Compute type to compute log_softmax. Default: mstype.float32. 
- """ - def __init__(self, - batch_size, - width, - compute_type=mstype.float32, - dtype=mstype.float32): - super(PredLogProbs, self).__init__() - self.batch_size = batch_size - self.width = width - self.compute_type = compute_type - self.dtype = dtype - - self.reshape = P.Reshape() - self.matmul = P.MatMul(transpose_b=True) - self.log_softmax = nn.LogSoftmax(axis=-1) - self.cast = P.Cast() - - def construct(self, - input_tensor, - output_weights, - seq_length): - """Get log probs.""" - shape_flat_sequence_tensor = (self.batch_size * seq_length, self.width) - - input_tensor = self.reshape(input_tensor, shape_flat_sequence_tensor) - input_tensor = self.cast(input_tensor, self.compute_type) - output_weights = self.cast(output_weights, self.compute_type) - - logits = self.matmul(input_tensor, output_weights) - logits = self.cast(logits, self.dtype) - - log_probs = self.log_softmax(logits) - return log_probs - - -class TransformerDecoderStep(nn.Cell): - """ - Multi-layer transformer decoder step. - - Args: - batch_size (int): Batch size of input dataset. - hidden_size (int): Size of the encoder layers. - max_decode_length (int): Max decode length. - enc_seq_length (int): Length of source sentences. - num_hidden_layers (int): Number of hidden layers in encoder cells. - num_attention_heads (int): Number of attention heads in encoder cells. Default: 16. - intermediate_size (int): Size of intermediate layer in encoder cells. Default: 4096. - attention_probs_dropout_prob (float): The dropout probability for - SelfAttention. Default: 0.1. - use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. - initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. - hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1. - hidden_act (str): Activation function used in the encoder cells. Default: "gelu". - compute_type (:class:`mindspore.dtype`): Compute type. Default: mstype.float32. - embedding_lookup (:class:`EmbeddingLookup`): Embedding lookup module. - embedding_processor (:class:`EmbeddingPostprocessor`) Embedding postprocessor module. 
- projection (:class:`PredLogProbs`): PredLogProbs module - """ - def __init__(self, - batch_size, - hidden_size, - max_decode_length, - num_hidden_layers, - num_attention_heads=16, - intermediate_size=4096, - attention_probs_dropout_prob=0.3, - use_one_hot_embeddings=False, - initializer_range=0.02, - hidden_dropout_prob=0.3, - hidden_act="relu", - compute_type=mstype.float32, - embedding_lookup=None, - embedding_processor=None, - projection=None): - super(TransformerDecoderStep, self).__init__(auto_prefix=False) - self.num_hidden_layers = num_hidden_layers - - self.tfm_embedding_lookup = embedding_lookup - self.tfm_embedding_processor = embedding_processor - self.projection = projection - - self.tfm_decoder = TransformerDecoder( - batch_size=batch_size, - hidden_size=hidden_size, - num_attention_heads=num_attention_heads, - num_hidden_layers=num_hidden_layers, - intermediate_size=intermediate_size, - attention_probs_dropout_prob=attention_probs_dropout_prob, - use_one_hot_embeddings=use_one_hot_embeddings, - initializer_range=initializer_range, - hidden_dropout_prob=hidden_dropout_prob, - hidden_act=hidden_act, - compute_type=compute_type) - - self.ones_like = P.OnesLike() - self.shape = P.Shape() - - self._create_attention_mask_from_input_mask = CreateAttentionMaskFromInputMask() - self.expand = P.ExpandDims() - self.multiply = P.Mul() - - ones = np.ones(shape=(max_decode_length, max_decode_length)) - self.future_mask = Tensor(np.tril(ones), dtype=mstype.float32) - - self.cast_compute_type = CastWrapper(dst_type=compute_type) - - def construct(self, input_ids, enc_states, enc_attention_mask, seq_length): - """ - Multi-layer transformer decoder step. - input_ids: [batch_size * beam_width] - """ - # process embedding - input_embedding, embedding_tables = self.tfm_embedding_lookup(input_ids) - input_embedding = self.tfm_embedding_processor(input_embedding) - input_embedding = self.cast_compute_type(input_embedding) - - input_shape = self.shape(input_ids) - input_len = input_shape[1] - future_mask = self.future_mask[0:input_len:1, 0:input_len:1] - - input_mask = self.ones_like(input_ids) - input_mask = self._create_attention_mask_from_input_mask(input_mask) - input_mask = self.multiply(input_mask, self.expand(future_mask, 0)) - input_mask = self.cast_compute_type(input_mask) - - enc_attention_mask = enc_attention_mask[::, 0:input_len:1, ::] - - # call TransformerDecoder - decoder_output = self.tfm_decoder(input_embedding, input_mask, enc_states, enc_attention_mask, -1, seq_length) - - # take the last step - decoder_output = decoder_output[::, input_len-1:input_len:1, ::] - - # projection and log_prob - log_probs = self.projection(decoder_output, embedding_tables, 1) - - return log_probs - - -@constexpr -def convert_np_to_tensor_encoder(seq_length): - ones = np.ones(shape=(seq_length, seq_length)) - return Tensor(np.tril(ones), dtype=mstype.float32) - - -class TransformerModel(nn.Cell): - """ - Transformer with encoder and decoder. - - Args: - config (Class): Configuration for Transformer. - is_training (bool): True for training mode. False for eval mode. - use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. 
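# During incremental decoding, TransformerDecoderStep builds its self-attention mask from two
# pieces: the square mask derived from the input mask (all ones here, via OnesLike) and a
# lower-triangular "future" mask, so position t may only attend to positions <= t. NumPy
# sketch (illustration only):
import numpy as np

def decoder_step_mask(input_mask):
    """input_mask: [batch, cur_len] of 0/1 for the tokens decoded so far."""
    m = np.asarray(input_mask, dtype=np.float32)
    cur_len = m.shape[1]
    square = m[:, :, None] * m[:, None, :]                            # [batch, len, len]
    future = np.tril(np.ones((cur_len, cur_len), dtype=np.float32))   # allow only <= t
    return square * future[None, :, :]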
- """ - def __init__(self, - config, - is_training, - use_one_hot_embeddings=False): - super(TransformerModel, self).__init__() - config = copy.deepcopy(config) - self.is_training = is_training - if not is_training: - config.hidden_dropout_prob = 0.0 - config.attention_probs_dropout_prob = 0.0 - - self.batch_size = config.batch_size - self.hidden_size = config.hidden_size - self.num_hidden_layers = config.num_hidden_layers - self.embedding_size = config.hidden_size - - self.last_idx = self.num_hidden_layers - 1 - self.beam_width = config.beam_width - self.max_decode_length = config.max_decode_length - - self.tfm_embedding_lookup = EmbeddingLookup( - vocab_size=config.vocab_size, - embedding_size=self.embedding_size, - use_one_hot_embeddings=use_one_hot_embeddings, - initializer_range=config.initializer_range) - self.tfm_embedding_postprocessor_for_encoder = EmbeddingPostprocessor( - embedding_size=self.embedding_size, - use_one_hot_embeddings=use_one_hot_embeddings, - initializer_range=0.02, - max_position_embeddings=config.max_position_embeddings, - dropout_prob=config.hidden_dropout_prob) - self.tfm_embedding_postprocessor_for_decoder = EmbeddingPostprocessor( - embedding_size=self.embedding_size, - use_one_hot_embeddings=use_one_hot_embeddings, - initializer_range=0.02, - max_position_embeddings=config.max_position_embeddings, - dropout_prob=config.hidden_dropout_prob) - self.tfm_encoder = TransformerEncoder( - batch_size=self.batch_size, - hidden_size=self.hidden_size, - num_attention_heads=config.num_attention_heads, - num_hidden_layers=self.num_hidden_layers, - intermediate_size=config.intermediate_size, - attention_probs_dropout_prob=config.attention_probs_dropout_prob, - use_one_hot_embeddings=use_one_hot_embeddings, - initializer_range=config.initializer_range, - hidden_dropout_prob=config.hidden_dropout_prob, - hidden_act=config.hidden_act, - compute_type=config.compute_type) - - if is_training: - self.projection = PredLogProbs( - batch_size=self.batch_size, - width=self.hidden_size, - compute_type=config.compute_type, - dtype=config.dtype) - self.tfm_decoder = TransformerDecoder( - batch_size=self.batch_size, - hidden_size=self.hidden_size, - num_attention_heads=config.num_attention_heads, - num_hidden_layers=self.num_hidden_layers, - intermediate_size=config.intermediate_size, - attention_probs_dropout_prob=config.attention_probs_dropout_prob, - use_one_hot_embeddings=use_one_hot_embeddings, - initializer_range=config.initializer_range, - hidden_dropout_prob=config.hidden_dropout_prob, - hidden_act=config.hidden_act, - compute_type=config.compute_type) - else: - self.projection = PredLogProbs( - batch_size=self.batch_size * config.beam_width, - width=self.hidden_size, - compute_type=config.compute_type, - dtype=config.dtype) - self.tfm_decoder = TransformerDecoderStep( - batch_size=self.batch_size * config.beam_width, - hidden_size=self.hidden_size, - max_decode_length=config.max_decode_length, - num_hidden_layers=config.num_hidden_layers, - num_attention_heads=config.num_attention_heads, - intermediate_size=config.intermediate_size, - attention_probs_dropout_prob=config.attention_probs_dropout_prob, - use_one_hot_embeddings=False, - initializer_range=config.initializer_range, - hidden_dropout_prob=config.hidden_dropout_prob, - hidden_act=config.hidden_act, - compute_type=config.compute_type, - embedding_lookup=self.tfm_embedding_lookup, - embedding_processor=self.tfm_embedding_postprocessor_for_decoder, - projection=self.projection) - self.tfm_decoder = BeamSearchDecoder( - 
batch_size=config.batch_size, - seq_length=config.seq_length, - vocab_size=config.vocab_size, - decoder=self.tfm_decoder, - beam_width=config.beam_width, - length_penalty_weight=config.length_penalty_weight, - max_decode_length=config.max_decode_length) - - self.tfm_decoder.add_flags(loop_can_unroll=True) - self.tile_beam = TileBeam(beam_width=self.beam_width) - ones = np.ones(shape=(self.batch_size, self.max_decode_length)) - self.encdec_mask = Tensor(ones, mstype.float32) - - self.cast = P.Cast() - self.dtype = config.dtype - self.cast_compute_type = CastWrapper(dst_type=config.compute_type) - self.expand = P.ExpandDims() - self.multiply = P.Mul() - self.shape = P.Shape() - - self._create_attention_mask_from_input_mask = CreateAttentionMaskFromInputMask() - - def construct(self, source_ids, source_mask, target_ids=None, target_mask=None): - """Transformer with encoder and decoder.""" - seq_length = self.shape(source_ids)[1] - - # process source sentence - src_word_embeddings, embedding_tables = self.tfm_embedding_lookup(source_ids) - src_embedding_output = self.tfm_embedding_postprocessor_for_encoder(src_word_embeddings) - # attention mask [batch_size, seq_length, seq_length] - enc_attention_mask = self._create_attention_mask_from_input_mask(source_mask) - # transformer encoder - encoder_output = self.tfm_encoder(self.cast_compute_type(src_embedding_output), - self.cast_compute_type(enc_attention_mask), - seq_length) - - if self.is_training: - future_mask = convert_np_to_tensor_encoder(seq_length) - # process target sentence - tgt_word_embeddings, _ = self.tfm_embedding_lookup(target_ids) - tgt_embedding_output = self.tfm_embedding_postprocessor_for_decoder(tgt_word_embeddings) - # attention mask [batch_size, seq_length, seq_length] - tgt_attention_mask = self._create_attention_mask_from_input_mask(target_mask) - tgt_attention_mask = self.multiply(tgt_attention_mask, self.expand(future_mask, 0)) - # transformer decoder - decoder_output = self.tfm_decoder(self.cast_compute_type(tgt_embedding_output), - self.cast_compute_type(tgt_attention_mask), - encoder_output, enc_attention_mask, - seq_length, seq_length) - # calculate logits and log_probs - log_probs = self.projection(decoder_output, embedding_tables, seq_length) - ret = log_probs - else: - beam_encoder_output = self.tile_beam(encoder_output) - - enc_attention_mask = self.multiply(enc_attention_mask[::, 0:1:1, ::], self.expand(self.encdec_mask, -1)) - - beam_enc_attention_mask = self.tile_beam(enc_attention_mask) - beam_enc_attention_mask = self.cast_compute_type(beam_enc_attention_mask) - predicted_ids = self.tfm_decoder(beam_encoder_output, beam_enc_attention_mask) - ret = predicted_ids - return ret diff --git a/tests/st/model_zoo_tests/transformer/src/weight_init.py b/tests/st/model_zoo_tests/transformer/src/weight_init.py deleted file mode 100644 index 460a1c67c43..00000000000 --- a/tests/st/model_zoo_tests/transformer/src/weight_init.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Weight init utilities.""" - -import math -import numpy as np -from mindspore.common.tensor import Tensor - -def _average_units(shape): - """ - Average shape dim. - """ - if not shape: - return 1. - if len(shape) == 1: - return float(shape[0]) - if len(shape) == 2: - return float(shape[0] + shape[1]) / 2. - raise RuntimeError("not support shape.") - -def weight_variable(shape): - scale_shape = shape - avg_units = _average_units(scale_shape) - scale = 1.0 / max(1., avg_units) - limit = math.sqrt(3.0 * scale) - values = np.random.uniform(-limit, limit, shape).astype(np.float32) - return Tensor(values) - -def one_weight(shape): - ones = np.ones(shape).astype(np.float32) - return Tensor(ones) - -def zero_weight(shape): - zeros = np.zeros(shape).astype(np.float32) - return Tensor(zeros) - -def normal_weight(shape, num_units): - norm = np.random.normal(0.0, num_units**-0.5, shape).astype(np.float32) - return Tensor(norm) - \ No newline at end of file diff --git a/tests/st/model_zoo_tests/transformer/test_transformer.py b/tests/st/model_zoo_tests/transformer/test_transformer.py index 7ee8769944a..8ace3c49c2d 100644 --- a/tests/st/model_zoo_tests/transformer/test_transformer.py +++ b/tests/st/model_zoo_tests/transformer/test_transformer.py @@ -27,14 +27,30 @@ from mindspore.train.callback import Callback import mindspore.dataset as ds import mindspore.dataset.transforms.c_transforms as deC from mindspore import context -from src.transformer_model import TransformerConfig -from src.transformer_for_train import TransformerNetworkWithLoss, TransformerTrainOneStepWithLossScaleCell -from src.config import cfg, transformer_net_cfg -from src.lr_schedule import create_dynamic_lr +from easydict import EasyDict as edict +from model_zoo.official.nlp.transformer.src.transformer_model import TransformerConfig +from model_zoo.official.nlp.transformer.src.transformer_for_train import TransformerNetworkWithLoss, TransformerTrainOneStepWithLossScaleCell +from model_zoo.official.nlp.transformer.src.lr_schedule import create_dynamic_lr from tests.st.model_zoo_tests import utils + DATA_DIR = ["/home/workspace/mindspore_dataset/transformer/test-mindrecord"] +cfg = edict({ + 'transformer_network': 'large', + 'init_loss_scale_value': 1024, + 'scale_factor': 2, + 'scale_window': 2000, + 'optimizer': 'Adam', + 'optimizer_adam_beta2': 0.997, + 'lr_schedule': edict({ + 'learning_rate': 2.0, + 'warmup_steps': 8000, + 'start_decay_step': 16000, + 'min_lr': 0.0, + }), +}) + def get_config(version='base', batch_size=1): """get config""" @@ -129,7 +145,7 @@ class TimeMonitor(Callback): self.per_step_mseconds_list.append(epoch_mseconds / self.data_size) -@pytest.mark.level2 +@pytest.mark.level0 @pytest.mark.platform_arm_ascend_training @pytest.mark.platform_x86_ascend_training @pytest.mark.env_onecard @@ -144,7 +160,7 @@ def test_transformer(): batch_size = 96 epoch_size = 3 config = get_config(version=version, batch_size=batch_size) - dataset = load_test_data(batch_size=transformer_net_cfg.batch_size, data_file=DATA_DIR) + dataset = load_test_data(batch_size=config.batch_size, data_file=DATA_DIR) netwithloss = TransformerNetworkWithLoss(config, True) @@ -171,7 +187,7 @@ def test_transformer(): # assertion occurs while the loss value, overflow state or loss_scale value is wrong loss_value = np.array(callback.loss_list) - assert 
np.allclose(loss_value[0], 11.241606, 0, 0.000005) + assert np.allclose(loss_value[0], 11.241601, 0, 0.000005) expect_loss_value = [11.241606, 11.243232, 11.217459, 11.204157, 11.213804, 11.215373, 11.190564, 11.150393, 11.191823, 11.160045] @@ -201,7 +217,7 @@ def test_transformer(): assert per_step_mseconds <= expect_per_step_mseconds + 10 -@pytest.mark.level1 +@pytest.mark.level0 @pytest.mark.platform_arm_ascend_training @pytest.mark.platform_x86_ascend_training @pytest.mark.env_onecard
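# The test above builds its optimizer schedule with create_dynamic_lr and the cfg.lr_schedule
# values (learning_rate=2.0, warmup_steps=8000, start_decay_step=16000, min_lr=0.0). The exact
# implementation lives in the model_zoo lr_schedule.py and is not shown here; the sketch below
# is only a plausible warmup-then-inverse-sqrt-decay schedule using those same parameters plus
# a hypothetical hidden_size factor, not the original code:
def illustrative_dynamic_lr(step, base_lr=2.0, hidden_size=1024, warmup_steps=8000,
                            start_decay_step=16000, min_lr=0.0):
    step = max(int(step), 1)
    warmup = min(1.0, step / warmup_steps)       # linear warmup to the base scale
    decay_step = max(step, start_decay_step)     # hold flat until decay starts
    lr = base_lr * hidden_size ** -0.5 * warmup * decay_step ** -0.5
    return max(lr, min_lr)                       # never drop below min_lr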