From b787c5c8c8a731d5f72b7af43097678ae34fe66f Mon Sep 17 00:00:00 2001
From: huangxinjing
Date: Fri, 17 Sep 2021 20:47:31 +0800
Subject: [PATCH] Fix spell error

---
 mindspore/parallel/nn/layers.py             |  22 +-
 mindspore/parallel/nn/loss.py               |  10 +-
 mindspore/parallel/nn/op_parallel_config.py |   4 +-
 mindspore/parallel/nn/transformer.py        | 232 ++++++++++----------
 4 files changed, 139 insertions(+), 129 deletions(-)

diff --git a/mindspore/parallel/nn/layers.py b/mindspore/parallel/nn/layers.py
index 6dbf3d4a4bd..3a6e77e0320 100644
--- a/mindspore/parallel/nn/layers.py
+++ b/mindspore/parallel/nn/layers.py
@@ -376,14 +376,14 @@ class FixedSparseAttention(nn.Cell):
             only supports 64, 128 for now

     Inputs:
-        - **q** - Tensor uery (:class:`mstype.fp16` [batch_size, seq_length, hidden_size]): Sequence of
+        - **q** (Tensor) - Tensor query (:class:`mstype.fp16` [batch_size, seq_length, hidden_size]): Sequence of
          queries to query the context.
-        - **k** - Tensor key (:class:`mstype.fp16` [batch_size, seq_length, hidden_size]): Sequence of
+        - **k** (Tensor) - Tensor key (:class:`mstype.fp16` [batch_size, seq_length, hidden_size]): Sequence of
          queries to query the context.
-        - **v** - Tensor value (:class:`mstype.fp16` [batch size, sequence length, Embedding Size]): Sequence of
-          queries to query the context.
-        - **attention_mask** - Tensor the mask of (:class:`mstype.fp32` [batch_size, seq_length, seq_length]):
-          Lower triangular matrix to pass masked information.
+        - **v** (Tensor) - Tensor value (:class:`mstype.fp16` [batch size, sequence length, Embedding Size]):
+          Sequence of values in the context.
+        - **attention_mask** (Tensor) - Float Tensor, the mask of (:class:`mstype.fp32`, :class:`mstype.fp16`
+          [batch_size, seq_length, seq_length]): Lower triangular matrix to pass masked information.

     Outputs:
         A Tensor. The output of the attention with shape [batch_size, seq_length, hidden_size]
@@ -396,10 +396,10 @@
         ...                              num_heads=8,
         ...                              size_per_head=64,
         ...                              block_size=64)
-        >>> q = Tensor(np.ones((2, 1024, 8*64)), dtype.float16)
-        >>> k = Tensor(np.ones((2, 1024, 8*64)), dtype.float16)
-        >>> v = Tensor(np.ones((2, 1024, 8*64)), dtype.float16)
-        >>> attention_mask = Tensor(np.ones((2, 1024, 1024)), dtype.float16)
+        >>> q = Tensor(np.ones((2, 1024, 8*64)), mstype.float16)
+        >>> k = Tensor(np.ones((2, 1024, 8*64)), mstype.float16)
+        >>> v = Tensor(np.ones((2, 1024, 8*64)), mstype.float16)
+        >>> attention_mask = Tensor(np.ones((2, 1024, 1024)), mstype.float32)
         >>> output = model(q, k, v, attention_mask)
         >>> print(output.shape)
         (2, 1024, 512)
@@ -550,7 +550,7 @@
         _check_input_dtype(F.dtype(v), "v", [mstype.float16], self.cls_name)
         _check_shape_equal(F.shape(attention_mask), "attention_mask", self.cls_name,
                            [self.batch_size, self.seq_length, self.seq_length])
-        _check_input_dtype(F.dtype(attention_mask), "attention_mask", [mstype.float32], self.cls_name)
+        _check_input_dtype(F.dtype(attention_mask), "attention_mask", [mstype.float32, mstype.float16], self.cls_name)

         q, k, v = self._transpose_inputs(q, k, v)
         local_mask, global_mask = self._generate_attention_mask(attention_mask)
diff --git a/mindspore/parallel/nn/loss.py b/mindspore/parallel/nn/loss.py
index b543abd4926..7ab96e3eaee 100644
--- a/mindspore/parallel/nn/loss.py
+++ b/mindspore/parallel/nn/loss.py
@@ -34,7 +34,7 @@ class CrossEntropyLoss(Cell):

     Args:
         parallel_config (OpParallelConfig): The parallel configure.
Default `default_dpmp_config`,
-            a instance of `OpParallelConfig` with default args.
+            an instance of `OpParallelConfig` with default args.

     Inputs:
         - **logits** (Tensor) - Tensor of shape (N, C). Data type must be float16 or float32. the output logits of
@@ -48,8 +48,9 @@
     Outputs:
         Tensor. the corresponding cross entropy loss

-    Exapmes:
-        >>> loss = mindspore.parallel.nn.CrossEntropyLoss()
+    Examples:
+        >>> from mindspore.parallel.nn import CrossEntropyLoss
+        >>> loss = CrossEntropyLoss()
         >>> logits = Tensor(np.array([[3, 5, 6, 9, 12, 33, 42, 12, 32, 72]]), mindspore.float32)
         >>> labels_np = np.array([1]).astype(np.int32)
         >>> input_mask = Tensor(np.ones(1).astype(np.float32))
@@ -88,9 +89,6 @@
         self.div2 = P.RealDiv()

     def construct(self, logits, label, input_mask):
-        r"""
-        Compute loss using logits, label and input mask
-        """
         self._check_input(logits, label, input_mask)

         # the shape is [bs*seq_length, vocab_size]
diff --git a/mindspore/parallel/nn/op_parallel_config.py b/mindspore/parallel/nn/op_parallel_config.py
index 37a2a2a1081..98930939b79 100644
--- a/mindspore/parallel/nn/op_parallel_config.py
+++ b/mindspore/parallel/nn/op_parallel_config.py
@@ -41,7 +41,7 @@ class _Config:

 class OpParallelConfig(_Config):
     r"""
-        OpParallelConfig for the setting the data parallel and model parallel.
+        OpParallelConfig for setting the data parallel and model parallel.

     Args:
         data_parallel (int): The data parallel way. Default: 1
@@ -81,7 +81,7 @@

 class _PipeLineConfig(_Config):
     r"""
-        PPConfig for the setting the data parallel, model parallel
+        PPConfig for setting the data parallel and model parallel

     Args:
         pipeline_stage (int): The number of the pipeline stages. Default: 1
diff --git a/mindspore/parallel/nn/transformer.py b/mindspore/parallel/nn/transformer.py
index 3a27169c318..187cec3505d 100644
--- a/mindspore/parallel/nn/transformer.py
+++ b/mindspore/parallel/nn/transformer.py
@@ -53,7 +53,7 @@ __all__ = [

 class EmbeddingOpParallelConfig(_Config):
     r"""
-        EmbeddingOpParallelConfig for the setting the data parallel or row slice for the embedding table.
+        EmbeddingOpParallelConfig for setting the data parallel or row slice for the embedding table.

     Args:
         data_parallel (int): The data parallel way. Default: 1
@@ -100,7 +100,7 @@
     @property
     def dp_mp_config(self):
         r"""
-            To obtain the DPMPlConfig for the setting the data parallel, model parallel
+            To obtain the OpParallelConfig for setting the data parallel and model parallel

            Supported Platforms:
                ``Ascend`` ``GPU``
@@ -114,21 +114,21 @@

 class TransformerOpParallelConfig(_Config):
     r"""
-        TransformerOpParallelConfig for the setting the global data parallel, model parallel and fusion group.
+        TransformerOpParallelConfig for setting the global data parallel, model parallel and fusion group.
         The parallel configure setting.

     Note:
         Except the recompute argument, other arguments will not be effective when the user doesn't set
        auto_parallel_context to `SEMI_AUTO_PARALLEL` or `AUTO_PARALLEL`.
-        The micro_batch_num must be greater then or equal to pipeline_stage. The data_parallel\*model_parallel
-        \*pipeline_stage must be equal to the device. When setting the pipeline stage and
+        The micro_batch_num must be greater than or equal to pipeline_stage. The data_parallel\*model_parallel
+        \*pipeline_stage must be less than or equal to the device number.
When setting the pipeline stage and
        optimizer_shard, the config will overwrite the auto_parallel_context.

     Args:
         data_parallel (int): The data parallel way. Default: 1.
         model_parallel (int): The model parallel way. Default: 1.
         pipeline_stage (int): The number of the pipeline stage. Should be a positive value. Default: 1.
-        micro_batch_num (int): The micore size of the batches for the pipeline training. Default: 1.
+        micro_batch_num (int): The number of micro batches for the pipeline training. Default: 1.
         optimizer_shard (bool): Whether to enable optimizer shard. Default False.
         gradient_aggregation_group (int): The fusion group size of the optimizer state sharding. Default: 4.
         recompute (bool): Enable recomputation of the transformer block or not. Default: False.
@@ -221,7 +221,7 @@
     @property
     def embedding_dp_mp_config(self):
         r"""
-            To obtain the EmbeddingParallelConfig for the setting the data parallel, model parallel amd embedding
+            To obtain the EmbeddingOpParallelConfig for setting the data parallel, model parallel and embedding
             parallel.

            Supported Platforms:
@@ -236,7 +236,7 @@
     @property
     def dp_mp_config(self):
         r"""
-            To obtain the EmbeddingParallelConfig for the setting the data parallel, model parallel amd embedding
+            To obtain the OpParallelConfig for setting the data parallel and model
             parallel.

            Supported Platforms:
@@ -274,10 +274,11 @@ class FeedForward(Cell):
             'hsigmoid', 'logsigmoid' and so on. Default: gelu.
         expert_num (int): The number of experts used in Linear. For the case expert_num > 1, BatchMatMul is used
             and the first dimension in BatchMatMul indicate expert_num. Default: 1.
-        param_init_type (dtype.Number): The parameter initialization type. Can be dtype.float32 or dtype.float16.
+        param_init_type (dtype.Number): The parameter initialization type. Should be dtype.float32 or dtype.float16.
+            Default: dtype.float32.
         parallel_config(OpParallelConfig): The config of parallel setting, see `OpParallelConfig`.
-            Default `default_dpmp_config`, a instance of `OpParallelConfig` with default
-            args.
+            Default `default_dpmp_config`, an instance of `OpParallelConfig` with
+            default args.

     Inputs:
         - **x** (Tensor) - should be `[batch, seq_length, hidden_size]`. Float tensor.
@@ -296,7 +297,7 @@
     Examples:
         >>> model = FeedForward(hidden_size=15, ffn_hidden_size=30, dropout_rate=0.1)
-        >>> tensor = Tensor(np.ones((2, 20, 15)), dtype.float32)
+        >>> tensor = Tensor(np.ones((2, 20, 15)), mstype.float32)
         >>> output = model(tensor)
         >>> print(output.shape)
         (2, 20, 15)
@@ -383,19 +384,19 @@ class AttentionMask(Cell):
         with 1 and 0. 1 indicates the current position is a valid token, otherwise not.

     Args:
-        seq_length(int): the sequence length of the input tensor.
-        parallel_config(OpParallelConfig): the parallel configure. Default `default_dpmp_config`,
-            a instance of `OpParallelConfig` with default args.
+        seq_length(int): The sequence length of the input tensor.
+        parallel_config(OpParallelConfig): The parallel configure. Default `default_dpmp_config`,
+            an instance of `OpParallelConfig` with default args.

     Inputs:
-        - **input_mask** (Tensor) - the mask indicating whether each position is a valid input with
+        - **input_mask** (Tensor) - The mask indicating whether each position is a valid input with
          (batch_size, seq_length).

     Outputs:
-        Tensor. the attention mask matrix with shape (batch_size, seq_length, seq_length).
+        Tensor.
The attention mask matrix with shape (batch_size, seq_length, seq_length). Raises: - TypeError: `seq_length` is not a int. + TypeError: `seq_length` is not an integer. ValueError: `seq_length` is not a positive value. TypeError: `parallel_config` is not a subclass of OpParallelConfig. @@ -403,15 +404,16 @@ class AttentionMask(Cell): ``Ascend`` ``GPU`` Examples: - >>> mask = mindspore.parallel.nn.AttentionMask(seq_length=4) + >>> from mindspore.parallel.nn import AttentionMask + >>> mask = AttentionMask(seq_length=4) >>> mask_array = np.array([[1, 1, 1, 0]], np.float32) >>> inputs = Tensor(mask_array) >>> res = mask(inputs) >>> print(res) - Tensor(shape=[1, 4, 4], dtype=Float32,value=[[[1, 0, 0, 0], + [[[1, 0, 0, 0], [1, 1, 0, 0], [1, 1, 1, 0], - [0, 0, 0, 0]]]) + [0, 0, 0, 0]]] """ @_args_type_validator_check(seq_length=Validator.check_positive_int, @@ -452,8 +454,8 @@ class VocabEmbedding(Cell): """ The embedding lookup table from the 0-th dim of the parameter table. When the parallel_config.vocab_emb_dp is True and in the `AUTO_PARALLEL_MODE`, the embedding lookup will be a `parallel_config.data_parallel` - data parallel way, or will shard the parameter at the 0-th dimension in `parallel_config.model_parallel`, so called - row slice of the embedding table + data parallel way, or will shard the parameter at the 0-th dimension in `parallel_config.model_parallel`, so-called + row slice of the embedding table. Args: vocab_size (int): Size of the dictionary of embeddings. @@ -461,11 +463,11 @@ class VocabEmbedding(Cell): param_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the embedding_table. Refer to class `initializer` for the values of string when a string is specified. Default: 'normal'. - parallel_config(EmbeddingOpParallelConfig): the parallel config of network. Default - `default_embedding_parallel_config`, a instance of `EmbeddingOpParallelConfig` with default args. + parallel_config(EmbeddingOpParallelConfig): The parallel config of network. Default + `default_embedding_parallel_config`, an instance of `EmbeddingOpParallelConfig` with default args. Inputs: - **input_ids** (Tensor) - the tokenized inputs with datatype int32 with shape (batch_size, seq_length) + **input_ids** (Tensor) - The tokenized inputs with datatype int32 with shape (batch_size, seq_length) Outputs: Tuple, a tuple contains (`output`, `embedding_table`) @@ -486,7 +488,7 @@ class VocabEmbedding(Cell): Examples: >>> model = VocabEmbedding(vocab_size=30, embedding_size=30) - >>> tensor = Tensor(np.ones((20, 15)), dtype.int32) + >>> tensor = Tensor(np.ones((20, 15)), mstype.int32) >>> output, table = model(tensor) >>> print(output.shape) (20, 15, 30) @@ -526,7 +528,7 @@ class MultiHeadAttention(Cell): r""" This is an implementation of multihead attention in the paper `Attention is all you need `_. Given the query vector with source length, and the - key and value vector with target length, the attention will be performered as the following + key and value vector with target length, the attention will be performed as the following .. math:: MultiHeadAttention(query, key, vector) = Concat(head_1, \dots, head_h)W^O @@ -543,13 +545,15 @@ class MultiHeadAttention(Cell): num_heads(int): The number of the heads. hidden_dropout_rate(float): The dropout rate of the final output of the layer. Default:0.1 attention_dropout_rate(float): The dropout rate of the attention scores. Default:0.1 - compute_dtype(dtype.Number): The computation type. Default dtype.float16. 
The computation of the
-            softmax will be converted to the float32.
+        compute_dtype(dtype.Number): The computation type of the dense layer. Default dtype.float16.
+            Should be dtype.float32 or dtype.float16.
         param_init_type(dtype.Number). The parameter initialization type of the module. Default dtype.float32.
-            Can be dtype.float32 or dtype.float16.
+            Should be dtype.float32 or dtype.float16.
+        softmax_compute_type(dtype.Number): The computation type of the softmax module. Default dtype.float32.
+            Should be dtype.float32 or dtype.float16.
         use_past(bool): Use the past state to compute, used for incremental prediction. Default False.
         parallel_config(OpParallelConfig): The parallel configure. Default `default_dpmp_config`,
-            a instance of `OpParallelConfig` with default args.
+            an instance of `OpParallelConfig` with default args.

     Inputs:
         - **query_tensor** (Tensor) - the query vector with shape (batch_size, src_seq_length, hidden_size).
@@ -572,7 +576,7 @@
         - **output** (Tensor) - Tensor, the float tensor of the output of the layer with shape (batch_size,
          src_seq_length, hidden_size)

-        - **layer_present** (Tuple) - A tuple of the Tensor the projected key and value vector with
+        - **layer_present** (Tuple) - A tuple of the Tensor of the projected key and value vector with
          ((batch_size, num_heads, size_per_head, tgt_seq_length),
          (batch_size, num_heads, tgt_seq_length, size_per_head)).

@@ -582,9 +586,9 @@
     Examples:
         >>> model = MultiHeadAttention(batch_size=2, hidden_size=15, src_seq_length=20, tgt_seq_length=20,
         ...                            num_heads=3)
-        >>> from_tensor = Tensor(np.ones((2, 20, 15)), dtype.float32)
-        >>> to_tensor = Tensor(np.ones((2, 20, 15)), dtype.float16)
-        >>> attention_mask = Tensor(np.ones((2, 20, 20)), dtype.float16)
+        >>> from_tensor = Tensor(np.ones((2, 20, 15)), mstype.float32)
+        >>> to_tensor = Tensor(np.ones((2, 20, 15)), mstype.float16)
+        >>> attention_mask = Tensor(np.ones((2, 20, 20)), mstype.float16)
         >>> attn_out, past = model(from_tensor, to_tensor, to_tensor, attention_mask)
         >>> print(attn_out.shape)
         (2, 20, 15)
@@ -601,6 +605,8 @@
                                 tgt_seq_length=Validator.check_positive_int,
                                 attention_dropout_rate=Validator.check_non_negative_float,
                                 hidden_dropout_rate=Validator.check_non_negative_float,
+                                compute_dtype=_valid_value_checks([mstype.float32, mstype.float16],
+                                                                  "MultiHeadAttention"),
                                 softmax_compute_type=_valid_value_checks([mstype.float32, mstype.float16],
                                                                          "MultiHeadAttention"),
                                 param_init_type=_valid_value_checks([mstype.float32, mstype.float16],
@@ -915,7 +921,7 @@
 class TransformerEncoderLayer(Cell):
     r"""
     Transformer Encoder Layer. This is an implementation of the single layer of the transformer
-    encoder layer including multihead attention and feedward layer.
+    encoder layer, including multihead attention and feedforward layer.

     Args:
         batch_size(int): The batch size of the input tensor.
@@ -930,22 +936,22 @@
             'relu6', 'tanh', 'gelu', 'fast_gelu', 'elu', 'sigmoid', 'prelu', 'leakyrelu', 'hswish',
             'hsigmoid', 'logsigmoid' and so on. Default: gelu.
         layernorm_compute_type(dtype.Number): The computation type of the layernorm.
-            Can be dtype.float32 or dtype.float16. Default dtype.float16.
+            Should be dtype.float32 or dtype.float16. Default dtype.float32.
         softmax_compute_type(dtype.Number): The computation type of the softmax in the attention.
-            Can be dtype.float32 or dtype.float16. Default mstype.float16.
+            Should be dtype.float32 or dtype.float16. Default mstype.float32.
         param_init_type(dtype.Number): The parameter initialization type of the module.
-            Can be dtype.float32 or dtype.float16. Default dtype.float32.
+            Should be dtype.float32 or dtype.float16. Default dtype.float32.
         use_past(bool): Use the past state to compute, used for incremental prediction. Default False.
         moe_config(MoEConfig): The configuration of MoE (Mixture of Expert).
         parallel_config(OpParallelConfig): The parallel configure. Default `default_dpmp_config`,
-            a instance of `OpParallelConfig` with default args.
+            an instance of `OpParallelConfig` with default args.

     Inputs:
         - **x** (Tensor) - Float Tensor, shape should be [batch_size, seq_length, hidden_size].
         - **input_mask** (Tensor) - Float Tensor, attention mask with shape [batch_size, seq_length, seq_length].
-        - **init_reset** (Tensor) - A bool tensor with shape [batch_size,], used to clear the past key parameter and
+        - **init_reset** (Tensor) - A bool tensor with shape [batch_size], used to clear the past key parameter and
          past value parameter used in the incremental prediction. Only valid when use_past is True. Default True.
-        - **batch_valid_length** (Tensor) - Int32 tensor with shape (batch_size,) the past calculated the index. Used
+        - **batch_valid_length** (Tensor) - Int32 tensor with shape [batch_size], the past calculated index. Used
          for incremental prediction when the use_past is True. Default None.

     Outputs:
@@ -954,7 +960,7 @@
         - **output** (Tensor) - The float tensor of the output of the layer with shape (batch_size,
          seq_length, hidden_size).

-        - **layer_present** (Tuple) - A tuple of the Tensor the projected key and value vector with
+        - **layer_present** (Tuple) - A tuple of the Tensor of the projected key and value vector with
          ((batch_size, num_heads, size_per_head, seq_length),
          (batch_size, num_heads, seq_length, size_per_head)).

@@ -964,8 +970,8 @@
     Examples:
         >>> model = TransformerEncoderLayer(batch_size=2, hidden_size=8, ffn_hidden_size=64, seq_length=16,
         ...                                 num_heads=2)
-        >>> encoder_input_value = Tensor(np.ones((2, 16, 8)), dtype.float32)
-        >>> encoder_input_mask = Tensor(np.ones((2, 16, 16)), dtype.float16)
+        >>> encoder_input_value = Tensor(np.ones((2, 16, 8)), mstype.float32)
+        >>> encoder_input_mask = Tensor(np.ones((2, 16, 16)), mstype.float16)
         >>> output, past = model(encoder_input_value, encoder_input_mask)
         >>> print(output.shape)
         (2, 16, 8)
@@ -1159,7 +1165,7 @@
 class TransformerDecoderLayer(Cell):
     r"""
     Transformer Decoder Layer. This is an implementation of the single layer of the transformer
-    decoder layer including self-attention, cross attention and feedward layer. When the encoder_output is None,
+    decoder layer, including self-attention, cross attention and feedforward layer. When the encoder_output is None,
     the cross attention will not be effective.

     Args:
@@ -1176,15 +1182,15 @@
             'relu6', 'tanh', 'gelu', 'fast_gelu', 'elu', 'sigmoid', 'prelu', 'leakyrelu', 'hswish',
             'hsigmoid', 'logsigmoid' and so on. Default: gelu.
         layernorm_compute_type(dtype.Number): The computation type of the layernorm.
-            Can be dtype.float32 or dtype.float16. Default dtype.float16.
+            Should be dtype.float32 or dtype.float16. Default dtype.float32.
         softmax_compute_type(dtype.Number): The computation type of the softmax in the attention.
-            Can be dtype.float32 or dtype.float16. Default mstype.float16.
- param_init_type: The parameter initialization type of the module. Can be dtype.float32 or dtype.float16. - Default dtype.float32. + Should be dtype.float32 or dtype.float16. Default mstype.float32. + param_init_type(dtype.Number): The parameter initialization type of the module. + Should be dtype.float32 or dtype.float16. Default dtype.float32. use_past(bool): Use the past state to compute, used for incremental prediction. Default False. moe_config(MoEConfig): The configuration of MoE (Mixture of Expert). parallel_config(OpParallelConfig): The parallel configure. Default `default_dpmp_config`, - a instance of `OpParallelConfig` with default args. + an instance of `OpParallelConfig` with default args. Inputs: - **hidden_stats** (Tensor) - the input tensor with shape [batch_size, tgt_seq_length, hidden_size]. @@ -1193,18 +1199,18 @@ class TransformerDecoderLayer(Cell): - **encoder_output** (Tensor) - the output of the encoder with shape [batch_size, seq_length, hidden_size]. - **memory_mask** (Tensor) - the memory mask of the cross attention with shape [batch, tgt_seq_length, src_seq_length], where tgt_seq_length is the length of the decoder. - - **init_reset** (Tensor) - A bool tensor with shape [batch_size,], used to clear the past key parameter and + - **init_reset** (Tensor) - A bool tensor with shape [batch_size], used to clear the past key parameter and past value parameter used in the incremental prediction. Only valid when use_past is True. Default True. - - **batch_valid_length** (Tensor) - Int32 tensor with shape (batch_size,) the past calculated the index. Used + - **batch_valid_length** (Tensor) - Int32 tensor with shape [batch_size] the past calculated the index. Used for incremental prediction when the use_past is True. Default None. Outputs: Tuple, a tuple contains(`output`, `layer_present`) - **output** (Tensor) - the output logit of this layer. The shape is [batch, seq_length, hidden_size] - - **layer_present** (Tensor) - A tuple, where each tuple is the tensor the projected key and value + - **layer_present** (Tensor) - A tuple, where each tuple is the tensor of the projected key and value vector in self attention with shape ((batch_size, num_heads, size_per_head, tgt_seq_length), - (batch_size, num_heads, tgt_seq_length, size_per_head), and the projected key and value vector + (batch_size, num_heads, tgt_seq_length, size_per_head), and of the projected key and value vector in cross attention with shape (batch_size, num_heads, size_per_head, src_seq_length), (batch_size, num_heads, src_seq_length, size_per_head)). @@ -1214,10 +1220,10 @@ class TransformerDecoderLayer(Cell): Examples: >>> model = TransformerDecoderLayer(batch_size=2, hidden_size=64, ffn_hidden_size=64, num_heads=2, ... 
src_seq_length=20, tgt_seq_length=10)
-        >>> encoder_input_value = Tensor(np.ones((2, 20, 64)), dtype.float32)
-        >>> decoder_input_value = Tensor(np.ones((2, 10, 64)), dtype.float32)
-        >>> decoder_input_mask = Tensor(np.ones((2, 10, 10)), dtype.float16)
-        >>> memory_mask = Tensor(np.ones((2, 10, 20)), dtype.float16)
+        >>> encoder_input_value = Tensor(np.ones((2, 20, 64)), mstype.float32)
+        >>> decoder_input_value = Tensor(np.ones((2, 10, 64)), mstype.float32)
+        >>> decoder_input_mask = Tensor(np.ones((2, 10, 10)), mstype.float16)
+        >>> memory_mask = Tensor(np.ones((2, 10, 20)), mstype.float16)
         >>> output, past = model(decoder_input_value, decoder_input_mask, encoder_input_value, memory_mask)
         >>> print(output.shape)
         (2, 10, 64)
@@ -1477,7 +1483,7 @@ def _get_lambda_func(total_layer=None):

     Args:
         network(Cell) - Represents the transformer block
         layer_id(int) - Means the layer index for the current module, counts from zero.
-        offset(int) - Means the layer_index needs a offset, if there are other modules in the net.
+        offset(int) - Means the layer_index needs an offset, if there are other modules in the net.
         layers(int) - The total layers used for the model.
     """
     # override the layers
@@ -1522,30 +1528,30 @@
             'relu6', 'tanh', 'gelu', 'fast_gelu', 'elu', 'sigmoid', 'prelu', 'leakyrelu', 'hswish',
             'hsigmoid', 'logsigmoid' and so on. Default: gelu.
         layernorm_compute_type(dtype.Number): The computation type of the layernorm.
-            Can be dtype.float32 or dtype.float16. Default dtype.float16.
+            Should be dtype.float32 or dtype.float16. Default dtype.float32.
         softmax_compute_type(dtype.Number): The computation type of the softmax in the attention.
-            Can be dtype.float32 or dtype.float16. Default mstype.float16.
-        param_init_type: The parameter initialization type of the module. Can be dtype.float32 or dtype.float16.
-            Default dtype.float32.
+            Should be dtype.float32 or dtype.float16. Default mstype.float32.
+        param_init_type(dtype.Number): The parameter initialization type of the module.
+            Should be dtype.float32 or dtype.float16. Default dtype.float32.
         use_past(bool): Use the past state to compute, used for incremental prediction. Default False.
-        lambda_func: A function can specific the fusion index, pipeline stages and recompute attribute. If the user
-            wants to specific the pipeline stage and gradient aggregation fusion, the user can pass a function
+        lambda_func: A function that can determine the fusion index, pipeline stages and recompute attribute. If the
+            user wants to determine the pipeline stage and gradient aggregation fusion, the user can pass a function
            that accepts `network`, `layer_id`, `offset`, `parallel_config`, `layers`. The `network(Cell)`
            represents the transformer block, `layer_id(int)` means the layer index for the current module, counts from
-            zero, `offset(int)` means the layer_index needs a offset, if there are other modules in the net. The
+            zero, `offset(int)` means the layer_index needs an offset, if there are other modules in the net. The
            default setting for the pipeline is: `(layer_id + offset) // (layers / pipeline_stage)`.
         offset(int): The initial layer index for the `decoder`. Used for setting the fusion id and stage id, to not
            overlap with the encoder layer.
         moe_config(MoEConfig): The configuration of MoE (Mixture of Expert).
         parallel_config(TransformerOpParallelConfig): The parallel configure. Default `default_transformer_config`,
-            a instance of `TransformerOpParallelConfig` with default args.
+            an instance of `TransformerOpParallelConfig` with default args.

     Inputs:
         - **hidden_states** (Tensor) - Tensor, shape should be [batch_size, seq_length, hidden_size]
         - **attention_mask** (Tensor) - Tensor, attention mask with shape [batch_size, seq_length, seq_length]
-        - **init_reset** (Tensor) - A bool tensor with shape [batch_size,], used to clear the past key parameter and
+        - **init_reset** (Tensor) - A bool tensor with shape [batch_size], used to clear the past key parameter and
          past value parameter used in the incremental prediction. Only valid when use_past is True. Default True
-        - **batch_valid_length** (Tensor) - Int32 tensor with shape (batch_size,) the past calculated the index. Used
+        - **batch_valid_length** (Tensor) - Int32 tensor with shape [batch_size], the past calculated index. Used
          for incremental prediction when the use_past is True. Default None.

     Outputs:
@@ -1563,8 +1569,8 @@
     Examples:
         >>> model = TransformerEncoder(batch_size=2, num_layers=2, hidden_size=8, ffn_hidden_size=64, seq_length=16,
         ...                            num_heads=2)
-        >>> encoder_input_value = Tensor(np.ones((2, 16, 8)), dtype.float32)
-        >>> encoder_input_mask = Tensor(np.ones((2, 16, 16)), dtype.float16)
+        >>> encoder_input_value = Tensor(np.ones((2, 16, 8)), mstype.float32)
+        >>> encoder_input_mask = Tensor(np.ones((2, 16, 16)), mstype.float16)
         >>> output, past = model(encoder_input_value, encoder_input_mask)
         >>> print(output.shape)
         (2, 16, 8)
@@ -1692,23 +1698,23 @@
             'relu6', 'tanh', 'gelu', 'fast_gelu', 'elu', 'sigmoid', 'prelu', 'leakyrelu', 'hswish',
             'hsigmoid', 'logsigmoid' and so on. Default: gelu.
         layernorm_compute_type(dtype.Number): The computation type of the layernorm.
-            Can be dtype.float32 or dtype.float16. Default dtype.float16.
+            Should be dtype.float32 or dtype.float16. Default dtype.float32.
         softmax_compute_type(dtype.Number): The computation type of the softmax in the attention.
-            Can be dtype.float32 or dtype.float16. Default mstype.float16.
-        param_init_type: The parameter initialization type of the module. Can be dtype.float32 or dtype.float16.
-            Default dtype.float32.
+            Should be dtype.float32 or dtype.float16. Default mstype.float32.
+        param_init_type(dtype.Number): The parameter initialization type of the module.
+            Should be dtype.float32 or dtype.float16. Default dtype.float32.
         offset(int): The initial layer index for the `decoder`. Used for setting the fusion id and stage id, to not
            overlap with the encoder layer.
-        lambda_func: A function can specific the fusion index, pipeline stages and recompute attribute. If the user
-            wants to specific the pipeline stage and gradient aggregation fusion, the user can pass a function
+        lambda_func: A function that can determine the fusion index, pipeline stages and recompute attribute. If the
+            user wants to determine the pipeline stage and gradient aggregation fusion, the user can pass a function
            that accepts `network`, `layer_id`, `offset`, `parallel_config`, `layers`. The `network(Cell)`
            represents the transformer block, `layer_id(int)` means the layer index for the current module, counts from
-            zero, `offset(int)` means the layer_index needs a offset, if there are other modules in the net. The
+            zero, `offset(int)` means the layer_index needs an offset, if there are other modules in the net. The
            default setting for the pipeline is: `(layer_id + offset) // (layers / pipeline_stage)`. Default: None
         moe_config(MoEConfig): The configuration of MoE (Mixture of Expert).
parallel_config(TransformerOpParallelConfig): The parallel configure. Default `default_transformer_config`,
-            a instance of `TransformerOpParallelConfig` with default args.
+            an instance of `TransformerOpParallelConfig` with default args.

     Inputs:
         - **hidden_stats** (Tensor) - the input tensor with shape [batch_size, seq_length, hidden_size]
         - **attention_mask** (Tensor) - the attention mask for decoder with shape [batch, tgt_seq_length,
          tgt_seq_length]
         - **memory_mask** (Tensor) - the memory mask of the cross attention with shape [batch, tgt_seq_length,
          src_seq_length] where tgt_seq_length is the length of the decoder. the output of the encoder with shape
          [batch_size, seq_length, hidden_size],
-        - **init_reset** (Tensor) - A bool tensor with shape [batch_size,], used to clear the past key parameter and
+        - **init_reset** (Tensor) - A bool tensor with shape [batch_size], used to clear the past key parameter and
          past value parameter used in the incremental prediction. Only valid when use_past is True. Default True
-        - **batch_valid_length** (Tensor) - Int32 tensor with shape (batch_size,) the past calculated the index.
+        - **batch_valid_length** (Tensor) - Int32 tensor with shape [batch_size], the past calculated index.
          Used for incremental prediction when the use_past is True. Default None.

     Outputs:
         Tuple, a tuple contains(`output`, `layer_present`)

         - **output** (Tensor) - The output logit of this layer. The shape is [batch, tgt_seq_length, hidden_size]
-        - **layer_present** (Tuple) - A tuple with size of num_layers, where each tuple is the tensor the projected
+        - **layer_present** (Tuple) - A tuple with size of num_layers, where each tuple is the tensor of the projected
          key and value vector in self attention with shape ((batch_size, num_heads, size_per_head, tgt_seq_length),
-          (batch_size, num_heads, tgt_seq_length, size_per_head), and the projected key and value vector
+          (batch_size, num_heads, tgt_seq_length, size_per_head), and of the projected key and value vector
          in cross attention with shape (batch_size, num_heads, size_per_head, src_seq_length),
          (batch_size, num_heads, src_seq_length, size_per_head)).

@@ -1738,10 +1744,10 @@
     Examples:
         >>> model = TransformerDecoder(batch_size=2, num_layers=1, hidden_size=64, ffn_hidden_size=64,
         ...                            num_heads=2, src_seq_length=20, tgt_seq_length=10)
-        >>> encoder_input_value = Tensor(np.ones((2, 20, 64)), dtype.float32)
-        >>> decoder_input_value = Tensor(np.ones((2, 10, 64)), dtype.float32)
-        >>> decoder_input_mask = Tensor(np.ones((2, 10, 10)), dtype.float16)
-        >>> memory_mask = Tensor(np.ones((2, 10, 20)), dtype.float16)
+        >>> encoder_input_value = Tensor(np.ones((2, 20, 64)), mstype.float32)
+        >>> decoder_input_value = Tensor(np.ones((2, 10, 64)), mstype.float32)
+        >>> decoder_input_mask = Tensor(np.ones((2, 10, 10)), mstype.float16)
+        >>> memory_mask = Tensor(np.ones((2, 10, 20)), mstype.float16)
         >>> output, past = model(decoder_input_value, decoder_input_mask, encoder_input_value, memory_mask)
         >>> print(output.shape)
         (2, 10, 64)
@@ -1868,7 +1874,7 @@
 class Transformer(Cell):
     r"""
     Transformer module including encoder and decoder. The difference with the original implements is the module use
     the residual addition before the layer normalization. And the default hidden act is `gelu`.
-    The detials can be found in `Attention is all you need `_.
+    The details can be found in `Attention is all you need `_.

     Note:
        This is an experimental interface that is subject to change and/or deletion.
@@ -1881,37 +1887,43 @@
         ffn_hidden_size(int): The hidden size of bottleneck in the feedforward layer.
         src_seq_length(int): The seq_length of the encoder's input tensor.
         tgt_seq_length(int): The seq_length of the decoder's input tensor.
-        num_heads(int): The number of the heads.
+        num_heads(int): The number of the heads. Default: 2.
         hidden_dropout_rate(float): The dropout rate of the final output of the layer. Default:0.1
         attention_dropout_rate(float): The dropout rate of the attention scores. Default:0.1
         post_layernorm_residual(bool): Do residuals adds before the layernorm. Default False.
+        layernorm_compute_type(dtype.Number): The computation type of the layernorm.
+            Should be dtype.float32 or dtype.float16. Default dtype.float32.
+        softmax_compute_type(dtype.Number): The computation type of the softmax in the attention.
+            Should be dtype.float32 or dtype.float16. Default mstype.float32.
+        param_init_type(dtype.Number): The parameter initialization type of the module.
+            Should be dtype.float32 or dtype.float16. Default dtype.float32.
         hidden_act(str): The activation of the internal feedforward layer. Supports 'relu',
             'relu6', 'tanh', 'gelu', 'fast_gelu', 'elu', 'sigmoid', 'prelu', 'leakyrelu', 'hswish',
             'hsigmoid', 'logsigmoid' and so on. Default: gelu.
-        lambda_func: A function can specific the fusion index, pipeline stages and recompute attribute. If the user
-            wants to specific the pipeline stage and gradient aggregation fusion, the user can pass a function
+        moe_config(MoEConfig): The configuration of MoE (Mixture of Expert).
+        lambda_func: A function that can determine the fusion index, pipeline stages and recompute attribute. If the
+            user wants to determine the pipeline stage and gradient aggregation fusion, the user can pass a function
            that accepts `network`, `layer_id`, `offset`, `parallel_config`, `layers`. The `network(Cell)`
            represents the transformer block, `layer_id(int)` means the layer index for the current module, counts from
-            zero, `offset(int)` means the layer_index needs a offset, if there are other modules in the net. The
+            zero, `offset(int)` means the layer_index needs an offset, if there are other modules in the net. The
            default setting for the pipeline is: `(layer_id + offset) //
            ((encoder_layers + decoder_length) / pipeline_stage)`.
         parallel_config(TransformerOpParallelConfig): The parallel configure. Default `default_transformer_config`,
-            a instance of `TransformerOpParallelConfig` with default args.
+            an instance of `TransformerOpParallelConfig` with default args.

     Inputs:
         - **encoder_inputs** (Tensor) - the input tensor with shape [batch_size, seq_length, hidden_size].
         - **encoder_masks** (Tensor) - the attention mask for decoder with shape [batch_size, seq_length,
          seq_length].
         - **decoder_inputs** (Tensor) - the output of the encoder with shape [batch_size, seq_length, hidden_size],
-            this can be none if the decoder layer is 0.
-        - **decoder_masks** (Tensor) - the attention mask for decoder with shape [batch_size, 1,
-          seq_length, seq_length]
+            this should be none if the decoder layer is 0.
+        - **decoder_masks** (Tensor) - the attention mask for decoder with shape [batch_size, seq_length, seq_length]
         - **memory_mask** (Tensor) - the memory mask of the cross attention with shape [batch, tgt_seq_length,
          src_seq_length] where tgt_seq_length is the length of the decoder. the output of the encoder with shape [batch_size,
-          seq_length, hidden_size], this can be none if the decoder layer is 0.
+          seq_length, hidden_size], this should be none if the decoder layer is 0.
-        - **init_reset** (Tensor) - A bool tensor with shape [batch_size,], used to clear the past key parameter and
+        - **init_reset** (Tensor) - A bool tensor with shape [batch_size], used to clear the past key parameter and
          past value parameter used in the incremental prediction. Only valid when use_past is True. Default True
-        - **batch_valid_length** (Tensor) - Int32 tensor with shape (batch_size,) the past calculated the index. Used
+        - **batch_valid_length** (Tensor) - Int32 tensor with shape [batch_size], the past calculated index. Used
          for incremental prediction when the use_past is True. Default None.

     Outputs:
         Tuple, a tuple contains(`output`, `encoder_layer_present`, `encoder_layer_present`)

         - **output** (Tensor) - If there is only encoder, the output logit of the encoder layer. The shape is
          [batch, src_seq_length, hidden_size], if there are encoder and decoders, the output is from the
          decoder layer. The shape is [batch, tgt_seq_length, hidden_size].
         - **encoder_layer_present** (Tuple) - A tuple with size of num_layers, where each tuple is the tensor the
          projected key and value vector in self attention with shape ((batch_size, num_heads, size_per_head,
-          src_seq_length), (batch_size, num_heads, src_seq_length, size_per_head).
+          src_seq_length), (batch_size, num_heads, src_seq_length, size_per_head)).
         - **decoder_layer_present** (Tuple) - A tuple with size of num_layers, where each tuple is the tensor
-          the projected key and value vector in self attention with shape ((batch_size, num_heads, size_per_head,
-          tgt_seq_length), (batch_size, num_heads, tgt_seq_length, size_per_head), and the
+          of the projected key and value vector in self attention with shape ((batch_size, num_heads, size_per_head,
+          tgt_seq_length), (batch_size, num_heads, tgt_seq_length, size_per_head)), and the
          projected key and value vector in cross attention with shape
          (batch_size, num_heads, size_per_head, src_seq_length),
          (batch_size, num_heads, src_seq_length, size_per_head)). If the decoder is not set, the

@@ -1937,11 +1949,11 @@
     Examples:
         >>> model = Transformer(batch_size=2, encoder_layers=1, decoder_layers=2, hidden_size=64, ffn_hidden_size=64,
         ...                     src_seq_length=20, tgt_seq_length=10)
-        >>> encoder_input_value = Tensor(np.ones((2, 20, 64)), dtype.float32)
-        >>> encoder_input_mask = Tensor(np.ones((2, 20, 20)), dtype.float16)
-        >>> decoder_input_value = Tensor(np.ones((2, 10, 64)), dtype.float32)
-        >>> decoder_input_mask = Tensor(np.ones((2, 10, 10)), dtype.float16)
-        >>> memory_mask = Tensor(np.ones((2, 10, 20)), dtype.float16)
+        >>> encoder_input_value = Tensor(np.ones((2, 20, 64)), mstype.float32)
+        >>> encoder_input_mask = Tensor(np.ones((2, 20, 20)), mstype.float16)
+        >>> decoder_input_value = Tensor(np.ones((2, 10, 64)), mstype.float32)
+        >>> decoder_input_mask = Tensor(np.ones((2, 10, 10)), mstype.float16)
+        >>> memory_mask = Tensor(np.ones((2, 10, 20)), mstype.float16)
         >>> output, en_past, de_past = model(encoder_input_value, encoder_input_mask, decoder_input_value,
         ...                                  decoder_input_mask, memory_mask)
         >>> print(output.shape)
@@ -2094,7 +2106,7 @@
         output = encoder_output

         if self.decoder is not None:
-            # decoder mask can be created outside of the model
+            # decoder mask should be created outside of the model
             if self.use_moe is True:
                 decoder_output, decoder_layer_present, decoder_aux_loss = self.decoder(decoder_inputs, decoder_masks,
                                                                                        encoder_output, memory_mask,
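
The following is a minimal, self-contained sketch of the widened attention_mask dtype check in FixedSparseAttention, mirroring the docstring example in the layers.py hunk above. It assumes an Ascend/GPU MindSpore build that includes this patch; shapes and arguments are taken from that docstring.

# Sketch: FixedSparseAttention now accepts a float16 attention_mask
# (previously only float32 passed _check_input_dtype).
import numpy as np
import mindspore.common.dtype as mstype
from mindspore import Tensor
from mindspore.parallel.nn import FixedSparseAttention

model = FixedSparseAttention(batch_size=2,
                             num_heads=8,
                             size_per_head=64,
                             block_size=64)
q = Tensor(np.ones((2, 1024, 8 * 64)), mstype.float16)
k = Tensor(np.ones((2, 1024, 8 * 64)), mstype.float16)
v = Tensor(np.ones((2, 1024, 8 * 64)), mstype.float16)
# float16 mask passes the widened dtype check; float32 still works as before.
attention_mask = Tensor(np.ones((2, 1024, 1024)), mstype.float16)
output = model(q, k, v, attention_mask)
print(output.shape)  # (2, 1024, 512) per the docstring example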
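A similar sketch completes the corrected CrossEntropyLoss example from the loss.py hunk, whose doctest is truncated in the diff. The label tensor and the final call are assumptions inferred from the construct(logits, label, input_mask) signature shown above.

# Sketch: corrected import style from the patched Examples section.
import numpy as np
import mindspore
from mindspore import Tensor
from mindspore.parallel.nn import CrossEntropyLoss

loss = CrossEntropyLoss()
logits = Tensor(np.array([[3, 5, 6, 9, 12, 33, 42, 12, 32, 72]]), mindspore.float32)
labels = Tensor(np.array([1]).astype(np.int32))     # assumed: int32 labels of shape (N,)
input_mask = Tensor(np.ones(1).astype(np.float32))  # 1.0 marks valid positions
output = loss(logits, labels, input_mask)           # assumed call order: (logits, label, input_mask)
print(output)  # the corresponding cross entropy loss, per the docstring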