!23108 Fix spelling error for transformer

Merge pull request !23108 from huangxinjing/fix_spell_error
This commit is contained in:
i-robot 2021-09-10 01:29:32 +00:00 committed by Gitee
commit 0abff9ad65
4 changed files with 132 additions and 71 deletions

View File

@ -184,7 +184,7 @@ class _LayerNorm(Cell):
Args:
strategy (tuple): The strategy for the dropout. Should be the same shape as the inputs.
Examples:
>>> net = nn.parallel.transformer.LayerNorm(normalized_shape=(1024, 10))
>>> net = mindspore.parallel.nn.transformer.LayerNorm(normalized_shape=(1024, 10))
>>> net.shard(((10, 2, 1),))
"""
self.mean.shard(strategy)

View File

@ -30,6 +30,8 @@ from mindspore.ops.primitive import constexpr
from mindspore.nn.cell import Cell
from mindspore._checkparam import Validator
from mindspore import log as logger
from mindspore.parallel._utils import _get_parallel_mode
from mindspore.context import ParallelMode
from .layers import _LayerNorm, _Linear, _check_input_shape, \
_args_type_validator_check, _valid_type_checks, _valid_value_checks, \
_check_shape_equal, _check_past_none_input_none, _check_input_dtype, _check_input_shape_value, Router
@ -284,7 +286,11 @@ class FeedForward(Cell):
will project the input dimension from hidden_size to ffn_hidden_size, the second linear will project the
dimension from ffn_hidden_size to hidden_size. The first linear is sharded on the relative dimension,
the second linear is sharded on the output dimension. The overview process can be
`DROPOUT(FFN(FFN(x)))`
.. math::
Dropout((xW_1+b_1)W_2 + b_2)
where the W_1, W_2, b_1 and b_2 are trainable parameters.
Args:
hidden_size (int): The dimension of the inputs.
@ -308,7 +314,7 @@ class FeedForward(Cell):
Raises:
ValueError: `hidden_act` is not a string.
ValueError: `parallel_config` is not a subclass of OpParallelConfig.
TypeError: `parallel_config` is not a subclass of OpParallelConfig.
ValueError: `ffn_hidden_size` is not a multiple of the model parallel way.
ValueError: `hidden_size` is not a multiple of the model parallel way.
@ -343,12 +349,12 @@ class FeedForward(Cell):
dp = parallel_config.data_parallel
mp = parallel_config.model_parallel
if ffn_hidden_size % mp != 0:
raise ValueError("ffn_hidden_size {ffn_hidden_size} should be a multiple of the model parallel way {mp}")
raise ValueError(f"ffn_hidden_size {ffn_hidden_size} should be a multiple of the model parallel way {mp}")
if hidden_size % mp != 0:
raise ValueError("hidden_size {hidden_size} should be a multiple of the model parallel way {mp}")
raise ValueError(f"hidden_size {hidden_size} should be a multiple of the model parallel way {mp}")
if dropout_rate < 0 or dropout_rate >= 1:
raise ValueError("dropout_rate probability should be a number in range [0, 1.0), "
"but got {}".format(dropout_rate))
# Each adjacent literal needs its own f prefix; without it "{dropout_rate}" is emitted verbatim.
raise ValueError(f"dropout_rate probability should be a number in range [0, 1.0), "
                 f"but got {dropout_rate}")
input_size = hidden_size
output_size = ffn_hidden_size
# Here, 'ep' stands for expert parallel number, which is equal to data parallel number.
@ -360,6 +366,7 @@ class FeedForward(Cell):
transpose_b=False,
expert_num=expert_num,
param_init_type=param_init_type)
if expert_num > 1:
self.mapping.shard(strategy_matmul=((ep, 1, 1), (ep, 1, mp)),
strategy_bias=((ep, 1, mp), (mp,)),
@ -368,7 +375,7 @@ class FeedForward(Cell):
self.mapping.shard(strategy_matmul=((dp, 1), (1, mp)),
strategy_bias=((dp, mp), (mp,)),
strategy_activation=((dp, 1, mp),))
# Project back to embedding_size
# Project back to hidden_size
self.projection = _Linear(in_channels=output_size,
out_channels=input_size,
transpose_b=False,
@ -515,6 +522,7 @@ class MoE(Cell):
aux_loss = self.mul(self.aux_loss_factor, aux_loss)
return combined_output, aux_loss
class AttentionMask(Cell):
r"""
Get the Lower triangular matrix from the input mask. The input mask is a 2D tensor (batch_size, seq_length)
@ -535,14 +543,14 @@ class AttentionMask(Cell):
Raises:
TypeError: `seq_length` is not a int.
ValueError: `seq_length` is not a positive value.
ValueError: `parallel_config` is not a subclass of OpParallelConfig.
TypeError: `parallel_config` is not a subclass of OpParallelConfig.
Supported Platforms:
``Ascend`` ``GPU``
Examples:
>>> mask = mindspore.parallel.nn.AttentionMask(seq_length=4)
>>> mask_array = np.array([[1, 1, 1, 0]], np.int32)
>>> mask_array = np.array([[1, 1, 1, 0]], np.float32)
>>> inputs = Tensor(mask_array)
>>> res = mask(inputs)
>>> print(res)
@ -617,7 +625,7 @@ class VocabEmbedding(Cell):
parallel_config.model_parallel
ValueError: `vocab_size` is not a positive value.
ValueError: `embedding_size` is not a positive value.
ValueError: `parallel_config` is not a subclass of OpParallelConfig.
TypeError: `parallel_config` is not a subclass of OpParallelConfig.
Supported Platforms:
``Ascend`` ``GPU``
@ -661,9 +669,17 @@ class VocabEmbedding(Cell):
class MultiHeadAttention(Cell):
"""
r"""
This is an implementation of multihead attention in the paper `Attention is all you need
<https://arxiv.org/pdf/1706.03762v5.pdf>`_.
<https://arxiv.org/pdf/1706.03762v5.pdf>`_. Given the query vector with source length, and the
key and value vectors with target length, the attention will be performed as follows:
.. math::
MultiHeadAttention(query, key, value) = Concat(head_1, \dots, head_h)W^O
where :math:`head_i = Attention(QW_i^Q, KW_i^K, VW_i^V)`. The default is with a bias.
If the query, key and value tensors are the same, this is self-attention.
Args:
batch_size(int): The batch size of the input tensor.
@ -714,7 +730,7 @@ class MultiHeadAttention(Cell):
... num_heads=3)
>>> from_tensor = Tensor(np.ones((2, 20, 15)), dtype.float32)
>>> to_tensor = Tensor(np.ones((2, 20, 15)), dtype.float16)
>>> attention_mask = Tensor(np.ones((2, 1, 20, 20)), dtype.float16)
>>> attention_mask = Tensor(np.ones((2, 20, 20)), dtype.float16)
>>> attn_out, past = model(from_tensor, to_tensor, to_tensor, attention_mask)
>>> print(attn_out.shape)
(2, 20, 15)
@ -731,7 +747,7 @@ class MultiHeadAttention(Cell):
tgt_seq_length=Validator.check_positive_int,
attention_dropout_rate=Validator.check_non_negative_float,
hidden_dropout_rate=Validator.check_non_negative_float,
softmax_comptue_type=_valid_value_checks([mstype.float32, mstype.float16],
softmax_compute_type=_valid_value_checks([mstype.float32, mstype.float16],
"MultiHeadAttention"),
param_init_type=_valid_value_checks([mstype.float32, mstype.float16],
"MultiHeadAttention"),
@ -746,7 +762,7 @@ class MultiHeadAttention(Cell):
hidden_dropout_rate=0.1,
attention_dropout_rate=0.1,
compute_dtype=mstype.float16,
softmax_comptue_type=mstype.float32,
softmax_compute_type=mstype.float32,
param_init_type=mstype.float32,
use_past=False,
parallel_config=default_dpmp_config):
@ -757,11 +773,11 @@ class MultiHeadAttention(Cell):
self.hidden_size = hidden_size
self.batch_size = batch_size
if hidden_dropout_rate < 0 or hidden_dropout_rate >= 1:
raise ValueError("hidden_dropout_rate probability should be a number in range [0, 1.0), "
"but got {}".format(hidden_dropout_rate))
# Each adjacent literal needs its own f prefix; without it "{hidden_dropout_rate}" is emitted verbatim.
raise ValueError(f"hidden_dropout_rate probability should be a number in range [0, 1.0), "
                 f"but got {hidden_dropout_rate}")
if attention_dropout_rate < 0 or attention_dropout_rate >= 1:
raise ValueError("attention_dropout_rate probability should be a number in range [0, 1.0), "
"but got {}".format(attention_dropout_rate))
# Each adjacent literal needs its own f prefix; without it "{attention_dropout_rate}" is emitted verbatim.
raise ValueError(f"attention_dropout_rate probability should be a number in range [0, 1.0), "
                 f"but got {attention_dropout_rate}")
if hidden_size % num_heads != 0:
raise ValueError(f"The hidden size {hidden_size} should be a multiple of num_heads {num_heads}")
if num_heads % parallel_config.model_parallel != 0:
@ -837,7 +853,7 @@ class MultiHeadAttention(Cell):
strategy_bias=((parallel_config.data_parallel, parallel_config.model_parallel),
(parallel_config.model_parallel,)))
self.dtype = compute_dtype
self.softmax_dtype = softmax_comptue_type
self.softmax_dtype = softmax_compute_type
if self.use_past:
# operators used for state reuse
seq_range = np.arange(src_seq_length).reshape(1, 1, -1)
@ -933,11 +949,10 @@ class MultiHeadAttention(Cell):
layer_present = (key_present, value_present)
# multi head attention considering attention mask
# [bs, seq_length, hidden_size]
attention = self._attn(query, key, value, attention_mask)
# [bs, seq_length, embedding_size]
attention_merge = self._merge_heads(attention)
# Output
output = self.projection(attention_merge)
output = self.projection(attention)
output = self.dropout(output)
return output, layer_present
@ -1038,7 +1053,8 @@ class MultiHeadAttention(Cell):
attention_probs = self.prob_dropout(attention_probs)
# Weighted sum output [bs, num_heads, seq_length, size_per_head]
weighted_values = self.batch_matmul(attention_probs, value)
return weighted_values
attention_merge = self._merge_heads(weighted_values)
return attention_merge
class TransformerEncoderLayer(Cell):
@ -1060,7 +1076,7 @@ class TransformerEncoderLayer(Cell):
'hsigmoid', 'logsigmoid' and so on. Default: gelu.
layernorm_compute_type(dtype.Number): The computation type of the layernorm.
Can be dtype.float32 or dtype.float16. Default dtype.float32.
softmax_comptue_type(dtype.Number): The computation type of the softmax in the attention.
softmax_compute_type(dtype.Number): The computation type of the softmax in the attention.
Can be dtype.float32 or dtype.float16. Default mstype.float32.
param_init_type(dtype.Number): The parameter initialization type of the module.
Can be dtype.float32 or dtype.float16. Default dtype.float32.
@ -1115,7 +1131,7 @@ class TransformerEncoderLayer(Cell):
post_layernorm_residual=Validator.check_bool,
layernorm_compute_type=_valid_value_checks([mstype.float32, mstype.float16],
"TransformerEncoderLayer"),
softmax_comptue_type=_valid_value_checks([mstype.float32, mstype.float16],
softmax_compute_type=_valid_value_checks([mstype.float32, mstype.float16],
"TransformerEncoderLayer"),
param_init_type=_valid_value_checks([mstype.float32, mstype.float16],
"TransformerEncoderLayer"),
@ -1132,7 +1148,7 @@ class TransformerEncoderLayer(Cell):
hidden_dropout_rate=0.1,
post_layernorm_residual=False,
layernorm_compute_type=mstype.float32,
softmax_comptue_type=mstype.float32,
softmax_compute_type=mstype.float32,
param_init_type=mstype.float32,
hidden_act='gelu',
use_past=False,
@ -1142,8 +1158,16 @@ class TransformerEncoderLayer(Cell):
_check_config(parallel_config)
if num_heads % parallel_config.model_parallel != 0:
raise ValueError(
f"num heads must be divisibled by the model parallel way {parallel_config.model_parallel},"
f"num heads must be divisibled by the model parallel way {parallel_config.model_parallel}, "
f"but found {num_heads}")
if hidden_size % parallel_config.model_parallel != 0:
raise ValueError(
f"hidden_size must be divisibled by the model parallel way {parallel_config.model_parallel}, "
f"but found {hidden_size}")
if ffn_hidden_size % parallel_config.model_parallel != 0:
raise ValueError(
f"ffn_hidden_size must be divisibled by the model parallel way {parallel_config.model_parallel}, "
f"but found {ffn_hidden_size}")
self.use_past = use_past
self.seq_length = seq_length
self.hidden_size = hidden_size
@ -1160,7 +1184,7 @@ class TransformerEncoderLayer(Cell):
num_heads=num_heads,
hidden_dropout_rate=hidden_dropout_rate,
attention_dropout_rate=attention_dropout_rate,
softmax_comptue_type=softmax_comptue_type,
softmax_compute_type=softmax_compute_type,
param_init_type=param_init_type,
use_past=use_past,
parallel_config=parallel_config)
@ -1298,7 +1322,7 @@ class TransformerDecoderLayer(Cell):
'hsigmoid', 'logsigmoid' and so on. Default: gelu.
layernorm_compute_type(dtype.Number): The computation type of the layernorm.
Can be dtype.float32 or dtype.float16. Default dtype.float32.
softmax_comptue_type(dtype.Number): The computation type of the softmax in the attention.
softmax_compute_type(dtype.Number): The computation type of the softmax in the attention.
Can be dtype.float32 or dtype.float16. Default mstype.float32.
param_init_type: The parameter initialization type of the module. Can be dtype.float32 or dtype.float16.
Default dtype.float32.
@ -1337,8 +1361,8 @@ class TransformerDecoderLayer(Cell):
... src_seq_length=20, tgt_seq_length=10)
>>> encoder_input_value = Tensor(np.ones((2, 20, 64)), dtype.float32)
>>> decoder_input_value = Tensor(np.ones((2, 10, 64)), dtype.float32)
>>> decoder_input_mask = Tensor(np.ones((2, 1, 10, 10)), dtype.float16)
>>> memory_mask = Tensor(np.ones((2, 1, 10, 20)), dtype.float16)
>>> decoder_input_mask = Tensor(np.ones((2, 10, 10)), dtype.float16)
>>> memory_mask = Tensor(np.ones((2, 10, 20)), dtype.float16)
>>> output, past = model(decoder_input_value, decoder_input_mask, encoder_input_value, memory_mask)
>>> print(output.shape)
(2, 10, 64)
@ -1364,7 +1388,7 @@ class TransformerDecoderLayer(Cell):
post_layernorm_residual=Validator.check_bool,
layernorm_compute_type=_valid_value_checks([mstype.float32, mstype.float16],
"TransformerDecoderLayer"),
softmax_comptue_type=_valid_value_checks([mstype.float32, mstype.float16],
softmax_compute_type=_valid_value_checks([mstype.float32, mstype.float16],
"TransformerDecoderLayer"),
param_init_type=_valid_value_checks([mstype.float32, mstype.float16],
"TransformerDecoderLayer"),
@ -1382,16 +1406,28 @@ class TransformerDecoderLayer(Cell):
post_layernorm_residual=False,
use_past=False,
layernorm_compute_type=mstype.float32,
softmax_comptue_type=mstype.float32,
softmax_compute_type=mstype.float32,
param_init_type=mstype.float32,
hidden_act='gelu',
moe_config=default_moe_config,
parallel_config=default_dpmp_config):
super(TransformerDecoderLayer, self).__init__()
_check_config(parallel_config)
if num_heads % parallel_config.model_parallel != 0:
raise ValueError(
f"num heads must be divisibled by the model parallel way {parallel_config.model_parallel}, "
f"but found {num_heads}")
if hidden_size % parallel_config.model_parallel != 0:
raise ValueError(
f"hidden_size must be divisibled by the model parallel way {parallel_config.model_parallel}, "
f"but found {hidden_size}")
if ffn_hidden_size % parallel_config.model_parallel != 0:
raise ValueError(
f"ffn_hidden_size must be divisibled by the model parallel way {parallel_config.model_parallel}, "
f"but found {ffn_hidden_size}")
self.batch_size = batch_size
self.use_past = use_past
self.softmax_comptue_type = softmax_comptue_type
self.softmax_compute_type = softmax_compute_type
self.src_seq_length = src_seq_length
self.tgt_seq_length = tgt_seq_length
@ -1411,7 +1447,7 @@ class TransformerDecoderLayer(Cell):
hidden_dropout_rate=hidden_dropout_rate,
attention_dropout_rate=attention_dropout_rate,
use_past=use_past,
softmax_comptue_type=softmax_comptue_type,
softmax_compute_type=softmax_compute_type,
param_init_type=param_init_type,
parallel_config=parallel_config)
# Cross attention with the output of encoder as memory tensor
@ -1422,7 +1458,7 @@ class TransformerDecoderLayer(Cell):
tgt_seq_length=src_seq_length,
hidden_dropout_rate=hidden_dropout_rate,
attention_dropout_rate=attention_dropout_rate,
softmax_comptue_type=softmax_comptue_type,
softmax_compute_type=softmax_compute_type,
use_past=use_past,
param_init_type=param_init_type,
parallel_config=parallel_config)
@ -1614,7 +1650,8 @@ def _get_lambda_func(total_layer=None):
class TransformerEncoder(Cell):
r"""
Transformer Encoder module with multi-layer stacled of `TransformerEncoderLayer`.
Transformer Encoder module with multiple stacked `TransformerEncoderLayer` layers, including multihead self
attention and feedforward layer.
Args:
batch_size(int): The batch size of the input tensor.
@ -1631,7 +1668,7 @@ class TransformerEncoder(Cell):
'hsigmoid', 'logsigmoid' and so on. Default: gelu.
layernorm_compute_type(dtype.Number): The computation type of the layernorm.
Can be dtype.float32 or dtype.float16. Default dtype.float32.
softmax_comptue_type(dtype.Number): The computation type of the softmax in the attention.
softmax_compute_type(dtype.Number): The computation type of the softmax in the attention.
Can be dtype.float32 or dtype.float16. Default mstype.float32.
param_init_type: The parameter initialization type of the module. Can be dtype.float32 or dtype.float16.
Default dtype.float32.
@ -1697,7 +1734,7 @@ class TransformerEncoder(Cell):
post_layernorm_residual=Validator.check_bool,
layernorm_compute_type=_valid_value_checks([mstype.float32, mstype.float16],
"TransformerEncoder"),
softmax_comptue_type=_valid_value_checks([mstype.float32, mstype.float16],
softmax_compute_type=_valid_value_checks([mstype.float32, mstype.float16],
"TransformerEncoder"),
param_init_type=_valid_value_checks([mstype.float32, mstype.float16],
"TransformerEncoder"),
@ -1716,7 +1753,7 @@ class TransformerEncoder(Cell):
hidden_act='gelu',
post_layernorm_residual=False,
layernorm_compute_type=mstype.float32,
softmax_comptue_type=mstype.float32,
softmax_compute_type=mstype.float32,
param_init_type=mstype.float32,
lambda_func=None,
offset=0,
@ -1729,6 +1766,8 @@ class TransformerEncoder(Cell):
self.use_moe = (moe_config.expert_num > 1)
self.add = P.TensorAdd().shard(((), ()))
self.aux_loss = Tensor(0.0, mstype.float32)
if _get_parallel_mode() in (ParallelMode.AUTO_PARALLEL,):
raise RuntimeError(f"The {self.cls_name} does not support auto parallel mode now.")
self.num_layers = num_layers
self.blocks = nn.CellList()
for i in range(num_layers):
@ -1739,7 +1778,7 @@ class TransformerEncoder(Cell):
attention_dropout_rate=attention_dropout_rate,
hidden_dropout_rate=hidden_dropout_rate,
layernorm_compute_type=layernorm_compute_type,
softmax_comptue_type=softmax_comptue_type,
softmax_compute_type=softmax_compute_type,
num_heads=num_heads,
hidden_act=hidden_act,
post_layernorm_residual=post_layernorm_residual,
@ -1780,7 +1819,8 @@ class TransformerEncoder(Cell):
class TransformerDecoder(Cell):
r"""
Transformer Decoder module with multi-layer stacled of `TransformerDecoderLayer`.
Transformer Decoder module with multiple stacked `TransformerDecoderLayer` layers, including multihead self
attention, cross attention and feedforward layer.
Args:
batch_size(int): The batch size of the input tensor.
@ -1798,7 +1838,7 @@ class TransformerDecoder(Cell):
'hsigmoid', 'logsigmoid' and so on. Default: gelu.
layernorm_compute_type(dtype.Number): The computation type of the layernorm.
Can be dtype.float32 or dtype.float16. Default dtype.float32.
softmax_comptue_type(dtype.Number): The computation type of the softmax in the attention.
softmax_compute_type(dtype.Number): The computation type of the softmax in the attention.
Can be dtype.float32 or dtype.float16. Default mstype.float32.
param_init_type: The parameter initialization type of the module. Can be dtype.float32 or dtype.float16.
Default dtype.float32.
@ -1826,6 +1866,7 @@ class TransformerDecoder(Cell):
past value parameter used in the incremental prediction. Only valid when use_past is True. Default True
- **batch_valid_length** (Tensor) - Int32 tensor with shape (batch_size,) the past calculated the index.
Used for incremental prediction when the use_past is True. Default None.
Outputs:
Tuple, a tuple contains(`output`, `layer_present`)
@ -1844,8 +1885,8 @@ class TransformerDecoder(Cell):
... num_heads=2, src_seq_length=20, tgt_seq_length=10)
>>> encoder_input_value = Tensor(np.ones((2, 20, 64)), dtype.float32)
>>> decoder_input_value = Tensor(np.ones((2, 10, 64)), dtype.float32)
>>> decoder_input_mask = Tensor(np.ones((2, 1, 10, 10)), dtype.float16)
>>> memory_mask = Tensor(np.ones((2, 1, 10, 20)), dtype.float16)
>>> decoder_input_mask = Tensor(np.ones((2, 10, 10)), dtype.float16)
>>> memory_mask = Tensor(np.ones((2, 10, 20)), dtype.float16)
>>> output, past = model(decoder_input_value, decoder_input_mask, encoder_input_value, memory_mask)
>>> print(output.shape)
(2, 10, 64)
@ -1876,7 +1917,7 @@ class TransformerDecoder(Cell):
post_layernorm_residual=Validator.check_bool,
layernorm_compute_type=_valid_value_checks([mstype.float32, mstype.float16],
"TransformerDecoder"),
softmax_comptue_type=_valid_value_checks([mstype.float32, mstype.float16],
softmax_compute_type=_valid_value_checks([mstype.float32, mstype.float16],
"TransformerDecoder"),
param_init_type=_valid_value_checks([mstype.float32, mstype.float16],
"TransformerDecoder"),
@ -1895,7 +1936,7 @@ class TransformerDecoder(Cell):
hidden_dropout_rate=0.1,
post_layernorm_residual=False,
layernorm_compute_type=mstype.float32,
softmax_comptue_type=mstype.float32,
softmax_compute_type=mstype.float32,
param_init_type=mstype.float32,
hidden_act='gelu',
lambda_func=None,
@ -1908,6 +1949,8 @@ class TransformerDecoder(Cell):
self.add = P.TensorAdd().shard(((), ()))
self.aux_loss = Tensor(0.0, mstype.float32)
if _get_parallel_mode() in (ParallelMode.AUTO_PARALLEL,):
raise RuntimeError(f"The {self.cls_name} does not support auto parallel mode now.")
self.num_layers = num_layers
self.blocks = nn.CellList()
self.use_moe = (moe_config.expert_num > 1)
@ -1921,7 +1964,7 @@ class TransformerDecoder(Cell):
hidden_dropout_rate=hidden_dropout_rate,
num_heads=num_heads,
layernorm_compute_type=layernorm_compute_type,
softmax_comptue_type=softmax_comptue_type,
softmax_compute_type=softmax_compute_type,
hidden_act=hidden_act,
use_past=use_past,
param_init_type=param_init_type,
@ -1969,7 +2012,7 @@ class TransformerDecoder(Cell):
class Transformer(Cell):
r"""
Transformer module including encoder and decoder. The difference from the original implementation is that the module uses
the residual addition before the layernormalization. And the default hidden act is `gelu`.
the residual addition before the layer normalization. And the default hidden act is `gelu`.
The details can be found in `Attention is all you need <https://arxiv.org/pdf/1706.03762v5.pdf>`_.
Note:
@ -2037,13 +2080,13 @@ class Transformer(Cell):
``Ascend`` ``GPU``
Examples:
>>> model = Transformer(encoder_layers=1, decoder_layers=2, hidden_size=64, ffn_hidden_size=64,
>>> model = Transformer(batch_size=2, encoder_layers=1, decoder_layers=2, hidden_size=64, ffn_hidden_size=64,
... src_seq_length=20, tgt_seq_length=10)
>>> encoder_input_value = Tensor(np.ones((2, 20, 64)), dtype.float32)
>>> encoder_input_mask = Tensor(np.ones((2, 1, 20, 20)), dtype.float16)
>>> encoder_input_mask = Tensor(np.ones((2, 20, 20)), dtype.float16)
>>> decoder_input_value = Tensor(np.ones((2, 10, 64)), dtype.float32)
>>> decoder_input_mask = Tensor(np.ones((2, 1, 10, 10)), dtype.float16)
>>> memory_mask = Tensor(np.ones((2, 1, 10, 20)), dtype.float16)
>>> decoder_input_mask = Tensor(np.ones((2, 10, 10)), dtype.float16)
>>> memory_mask = Tensor(np.ones((2, 10, 20)), dtype.float16)
>>> output, en_past, de_past = model(encoder_input_value, encoder_input_mask, decoder_input_value,
... decoder_input_mask, memory_mask)
>>> print(output.shape)
@ -2079,11 +2122,11 @@ class Transformer(Cell):
hidden_dropout_rate=Validator.check_non_negative_float,
hidden_act=_valid_type_checks([str], "Transformer"),
post_layernorm_residual=Validator.check_bool,
layernorm_compute_type=_valid_type_checks([mstype.float32, mstype.float16],
layernorm_compute_type=_valid_value_checks([mstype.float32, mstype.float16],
"Transformer"),
softmax_comptue_type=_valid_type_checks([mstype.float32, mstype.float16],
softmax_compute_type=_valid_value_checks([mstype.float32, mstype.float16],
"Transformer"),
param_init_type=_valid_type_checks([mstype.float32, mstype.float16], "Transformer"),
param_init_type=_valid_value_checks([mstype.float32, mstype.float16], "Transformer"),
parallel_config=_valid_type_checks([TransformerOpParallelConfig], "Transformer"),
use_past=Validator.check_bool)
def __init__(self,
@ -2100,7 +2143,7 @@ class Transformer(Cell):
hidden_act='gelu',
post_layernorm_residual=False,
layernorm_compute_type=mstype.float32,
softmax_comptue_type=mstype.float32,
softmax_compute_type=mstype.float32,
param_init_type=mstype.float32,
lambda_func=None,
use_past=False,
@ -2118,7 +2161,9 @@ class Transformer(Cell):
f"layer {decoder_layers}, please use TransformerDecoder")
if encoder_layers > 0 and decoder_layers > 0 and use_past is True:
raise ValueError("The transformer with encoder and decoder does not support use_past=True.")
# The shard setting of Transformer is set within the class StackedTransformer
if _get_parallel_mode() in (ParallelMode.AUTO_PARALLEL,):
raise RuntimeError(f"The {self.cls_name} does not support auto parallel mode now.")
# The shard setting of Transformer is set within the TransformerEncoderLayer
if not lambda_func:
lambda_func = _get_lambda_func(total_layer=encoder_layers + decoder_layers)
@ -2136,7 +2181,7 @@ class Transformer(Cell):
hidden_dropout_rate=hidden_dropout_rate,
hidden_act=hidden_act,
layernorm_compute_type=layernorm_compute_type,
softmax_comptue_type=softmax_comptue_type,
softmax_compute_type=softmax_compute_type,
post_layernorm_residual=post_layernorm_residual,
param_init_type=param_init_type,
lambda_func=lambda_func,
@ -2162,7 +2207,7 @@ class Transformer(Cell):
hidden_act=hidden_act,
post_layernorm_residual=post_layernorm_residual,
layernorm_compute_type=layernorm_compute_type,
softmax_comptue_type=softmax_comptue_type,
softmax_compute_type=softmax_compute_type,
lambda_func=lambda_func,
use_past=use_past,
param_init_type=param_init_type,

View File

@ -398,7 +398,7 @@ class PanGUAlphaWithLoss(Cell):
self.not_equal = P.NotEqual().shard(((dp, 1), ()))
self.batch_size = config.batch_size
self.len = config.seq_length
self.expand = P.ExpandDims().shard(((dp, 1, 1),))
self.slice2 = P.StridedSlice().shard(((dp, 1, 1),))
self.micro_batch_step = 1
if config.parallel_config.pipeline_stage > 1:
self.micro_batch_step = config.parallel_config.micro_batch_num
@ -407,13 +407,14 @@ class PanGUAlphaWithLoss(Cell):
r"""Forward process of the pangu alpha model"""
tokens = self.slice(input_ids, (0, 0), (self.batch_size, -1), (1, 1))
input_position = self.slice(input_position, (0, 0), (self.batch_size, self.len), (1, 1))
encoder_attention_masks = attention_mask
decoder_attention_masks = self.slice2(attention_mask, (0, 0, 0), (self.batch_size, self.len, self.len),
(1, 1, 1))
input_mask = F.cast(self.not_equal(tokens, self.eod_token),
mstype.float32)
logits = self.network(tokens,
input_position,
encoder_attention_masks)
decoder_attention_masks)
# Get label corresponding to input tokens
labels = self.slice(input_ids, (0, 1), (self.batch_size, self.len + 1),
(1, 1))

View File

@ -74,14 +74,15 @@ class NetWithLossFiveInputs(nn.Cell):
def run_total_transformer_model_head(e_layer,
d_layer,
arg_parallel_config):
arg_parallel_config,
mode=ParallelMode.SEMI_AUTO_PARALLEL):
dp = arg_parallel_config.data_parallel
mp = arg_parallel_config.model_parallel
pp = arg_parallel_config.pipeline_stage
if dp * mp * pp != 1:
set_auto_parallel_context(device_num=8,
full_batch=True,
global_rank=0, parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL)
global_rank=0, parallel_mode=mode)
class Net(nn.Cell):
def __init__(self, en_layer, de_layer, parallel_config):
@ -208,6 +209,13 @@ def test_transformer_model_head_stand_alone():
run_total_transformer_model_head(e_layer=2, d_layer=2, arg_parallel_config=local_config)
def test_transformer_model_auto_parallel_no_support():
local_config = TransformerOpParallelConfig(data_parallel=8, model_parallel=1)
with pytest.raises(RuntimeError):
run_total_transformer_model_head(e_layer=2, d_layer=2, arg_parallel_config=local_config,
mode=ParallelMode.AUTO_PARALLEL)
def test_pipeline_single_transformer():
set_auto_parallel_context(device_num=32,
full_batch=True,
@ -405,6 +413,7 @@ def test_sparse_attention_parallel_mp():
model = Model(net)
model.train(1, dataset, dataset_sink_mode=False)
def test_sparse_attention_parallel_mix():
set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode=ParallelMode.AUTO_PARALLEL)
set_algo_parameters(fully_use_devices=False)
@ -423,6 +432,7 @@ def test_sparse_attention_parallel_mix():
model = Model(net)
model.train(1, dataset, dataset_sink_mode=False)
def test_sparse_attention_parallel_mix1():
set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode=ParallelMode.AUTO_PARALLEL)
set_algo_parameters(fully_use_devices=False)
@ -441,6 +451,7 @@ def test_sparse_attention_parallel_mix1():
model = Model(net)
model.train(1, dataset, dataset_sink_mode=False)
def test_sparse_attention_parallel_dp():
set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode=ParallelMode.AUTO_PARALLEL)
set_algo_parameters(fully_use_devices=False)
@ -459,6 +470,7 @@ def test_sparse_attention_parallel_dp():
model = Model(net)
model.train(1, dataset, dataset_sink_mode=False)
def test_parallel_cross_entroy_loss_semi_auto_parallel():
set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode=ParallelMode.AUTO_PARALLEL)
@ -496,7 +508,7 @@ def test_transformer_args():
with pytest.raises(TypeError):
Transformer(hidden_size=10, batch_size=2, ffn_hidden_size=20, src_seq_length=10,
tgt_seq_length=20, softmax_comptue_type=mstype.int64)
tgt_seq_length=20, softmax_compute_type=mstype.int64)
with pytest.raises(TypeError):
Transformer(hidden_size=10, batch_size=2, ffn_hidden_size=20, src_seq_length=10,
@ -510,6 +522,9 @@ def test_transformer_args():
Transformer(hidden_size=10, batch_size=2, ffn_hidden_size=20, src_seq_length=10,
tgt_seq_length=20, hidden_dropout_rate=mstype.int64)
Transformer(hidden_size=10, batch_size=2, ffn_hidden_size=20, src_seq_length=10,
tgt_seq_length=20, softmax_compute_type=mstype.float16)
def test_transformer_parallel_config():
parallel_test_config = TransformerOpParallelConfig(data_parallel=1, model_parallel=3)