forked from mindspore-Ecosystem/mindspore
Fix spell error
This commit is contained in:
parent
a17915b2c4
commit
b787c5c8c8
|
@ -376,14 +376,14 @@ class FixedSparseAttention(nn.Cell):
|
||||||
only supports 64, 128 for now
|
only supports 64, 128 for now
|
||||||
|
|
||||||
Inputs:
|
Inputs:
|
||||||
- **q** - Tensor uery (:class:`mstype.fp16` [batch_size, seq_length, hidden_size]): Sequence of
|
- **q** (Tensor) - Tensor query (:class:`mstype.fp16` [batch_size, seq_length, hidden_size]): Sequence of
|
||||||
queries to query the context.
|
queries to query the context.
|
||||||
- **k** - Tensor key (:class:`mstype.fp16` [batch_size, seq_length, hidden_size]): Sequence of
|
- **k** (Tensor) - Tensor key (:class:`mstype.fp16` [batch_size, seq_length, hidden_size]): Sequence of
|
||||||
queries to query the context.
|
queries to query the context.
|
||||||
- **v** - Tensor value (:class:`mstype.fp16` [batch size, sequence length, Embedding Size]): Sequence of
|
- **v** (Tensor) - Tensor value (:class:`mstype.fp16` [batch size, sequence length, Embedding Size]):
|
||||||
queries to query the context.
|
Sequence of queries to query the context.
|
||||||
- **attention_mask** - Tensor the mask of (:class:`mstype.fp32` [batch_size, seq_length, seq_length]):
|
- **attention_mask** (Tensor) - Float Tensor the mask of (:class:`mstype.fp32`, :class:`mstype.fp16`
|
||||||
Lower triangular matrix to pass masked information.
|
[batch_size, seq_length, seq_length]): Lower triangular matrix to pass masked information.
|
||||||
|
|
||||||
Outputs:
|
Outputs:
|
||||||
A Tensor. The output of the attention with shape [batch_size, seq_length, hidden_size]
|
A Tensor. The output of the attention with shape [batch_size, seq_length, hidden_size]
|
||||||
|
@ -396,10 +396,10 @@ class FixedSparseAttention(nn.Cell):
|
||||||
... num_heads=8,
|
... num_heads=8,
|
||||||
... size_per_head=64,
|
... size_per_head=64,
|
||||||
... block_size=64)
|
... block_size=64)
|
||||||
>>> q = Tensor(np.ones((2, 1024, 8*64)), dtype.float16)
|
>>> q = Tensor(np.ones((2, 1024, 8*64)), mstype.float16)
|
||||||
>>> k = Tensor(np.ones((2, 1024, 8*64)), dtype.float16)
|
>>> k = Tensor(np.ones((2, 1024, 8*64)), mstype.float16)
|
||||||
>>> v = Tensor(np.ones((2, 1024, 8*64)), dtype.float16)
|
>>> v = Tensor(np.ones((2, 1024, 8*64)), mstype.float16)
|
||||||
>>> attention_mask = Tensor(np.ones((2, 1024, 1024)), dtype.float16)
|
>>> attention_mask = Tensor(np.ones((2, 1024, 1024)), mstype.float32)
|
||||||
>>> output = model(q, k, v, attention_mask)
|
>>> output = model(q, k, v, attention_mask)
|
||||||
>>> print(output.shape)
|
>>> print(output.shape)
|
||||||
(2, 1024, 512)
|
(2, 1024, 512)
|
||||||
|
@ -550,7 +550,7 @@ class FixedSparseAttention(nn.Cell):
|
||||||
_check_input_dtype(F.dtype(v), "v", [mstype.float16], self.cls_name)
|
_check_input_dtype(F.dtype(v), "v", [mstype.float16], self.cls_name)
|
||||||
_check_shape_equal(F.shape(attention_mask), "attention_mask", self.cls_name,
|
_check_shape_equal(F.shape(attention_mask), "attention_mask", self.cls_name,
|
||||||
[self.batch_size, self.seq_length, self.seq_length])
|
[self.batch_size, self.seq_length, self.seq_length])
|
||||||
_check_input_dtype(F.dtype(attention_mask), "attention_mask", [mstype.float32], self.cls_name)
|
_check_input_dtype(F.dtype(attention_mask), "attention_mask", [mstype.float32, mstype.float16], self.cls_name)
|
||||||
|
|
||||||
q, k, v = self._transpose_inputs(q, k, v)
|
q, k, v = self._transpose_inputs(q, k, v)
|
||||||
local_mask, global_mask = self._generate_attention_mask(attention_mask)
|
local_mask, global_mask = self._generate_attention_mask(attention_mask)
|
||||||
|
|
|
@ -34,7 +34,7 @@ class CrossEntropyLoss(Cell):
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
parallel_config (OpParallelConfig): The parallel configure. Default `default_dpmp_config`,
|
parallel_config (OpParallelConfig): The parallel configure. Default `default_dpmp_config`,
|
||||||
a instance of `OpParallelConfig` with default args.
|
an instance of `OpParallelConfig` with default args.
|
||||||
|
|
||||||
Inputs:
|
Inputs:
|
||||||
- **logits** (Tensor) - Tensor of shape (N, C). Data type must be float16 or float32. the output logits of
|
- **logits** (Tensor) - Tensor of shape (N, C). Data type must be float16 or float32. the output logits of
|
||||||
|
@ -48,8 +48,9 @@ class CrossEntropyLoss(Cell):
|
||||||
Outputs:
|
Outputs:
|
||||||
Tensor. the corresponding cross entropy loss
|
Tensor. the corresponding cross entropy loss
|
||||||
|
|
||||||
Exapmes:
|
Examples:
|
||||||
>>> loss = mindspore.parallel.nn.CrossEntropyLoss()
|
>>> from mindspore.parallel.nn import CrossEntropyLoss
|
||||||
|
>>> loss = CrossEntropyLoss()
|
||||||
>>> logits = Tensor(np.array([[3, 5, 6, 9, 12, 33, 42, 12, 32, 72]]), mindspore.float32)
|
>>> logits = Tensor(np.array([[3, 5, 6, 9, 12, 33, 42, 12, 32, 72]]), mindspore.float32)
|
||||||
>>> labels_np = np.array([1]).astype(np.int32)
|
>>> labels_np = np.array([1]).astype(np.int32)
|
||||||
>>> input_mask = Tensor(np.ones(1).astype(np.float32))
|
>>> input_mask = Tensor(np.ones(1).astype(np.float32))
|
||||||
|
@ -88,9 +89,6 @@ class CrossEntropyLoss(Cell):
|
||||||
self.div2 = P.RealDiv()
|
self.div2 = P.RealDiv()
|
||||||
|
|
||||||
def construct(self, logits, label, input_mask):
|
def construct(self, logits, label, input_mask):
|
||||||
r"""
|
|
||||||
Compute loss using logits, label and input mask
|
|
||||||
"""
|
|
||||||
self._check_input(logits, label, input_mask)
|
self._check_input(logits, label, input_mask)
|
||||||
|
|
||||||
# the shape is [bs*seq_length, vocab_size]
|
# the shape is [bs*seq_length, vocab_size]
|
||||||
|
|
|
@ -41,7 +41,7 @@ class _Config:
|
||||||
|
|
||||||
class OpParallelConfig(_Config):
|
class OpParallelConfig(_Config):
|
||||||
r"""
|
r"""
|
||||||
OpParallelConfig for the setting the data parallel and model parallel.
|
OpParallelConfig for setting data parallel and model parallel.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
data_parallel (int): The data parallel way. Default: 1
|
data_parallel (int): The data parallel way. Default: 1
|
||||||
|
@ -81,7 +81,7 @@ class OpParallelConfig(_Config):
|
||||||
|
|
||||||
class _PipeLineConfig(_Config):
|
class _PipeLineConfig(_Config):
|
||||||
r"""
|
r"""
|
||||||
PPConfig for the setting the data parallel, model parallel
|
PPConfig for setting data parallel, model parallel
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
pipeline_stage (int): The number of the pipeline stages. Default: 1
|
pipeline_stage (int): The number of the pipeline stages. Default: 1
|
||||||
|
|
|
@ -53,7 +53,7 @@ __all__ = [
|
||||||
|
|
||||||
class EmbeddingOpParallelConfig(_Config):
|
class EmbeddingOpParallelConfig(_Config):
|
||||||
r"""
|
r"""
|
||||||
EmbeddingOpParallelConfig for the setting the data parallel or row slice for the embedding table.
|
EmbeddingOpParallelConfig for setting data parallel or row slice for the embedding table.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
data_parallel (int): The data parallel way. Default: 1
|
data_parallel (int): The data parallel way. Default: 1
|
||||||
|
@ -100,7 +100,7 @@ class EmbeddingOpParallelConfig(_Config):
|
||||||
@property
|
@property
|
||||||
def dp_mp_config(self):
|
def dp_mp_config(self):
|
||||||
r"""
|
r"""
|
||||||
To obtain the DPMPlConfig for the setting the data parallel, model parallel
|
To obtain the DPMPlConfig for setting data parallel, model parallel
|
||||||
|
|
||||||
Supported Platforms:
|
Supported Platforms:
|
||||||
``Ascend`` ``GPU``
|
``Ascend`` ``GPU``
|
||||||
|
@ -114,21 +114,21 @@ class EmbeddingOpParallelConfig(_Config):
|
||||||
|
|
||||||
class TransformerOpParallelConfig(_Config):
|
class TransformerOpParallelConfig(_Config):
|
||||||
r"""
|
r"""
|
||||||
TransformerOpParallelConfig for the setting the global data parallel, model parallel and fusion group.
|
TransformerOpParallelConfig for setting global data parallel, model parallel and fusion group.
|
||||||
The parallel configure setting.
|
The parallel configure setting.
|
||||||
|
|
||||||
Note:
|
Note:
|
||||||
Except the recompute argument, other arguments will not be effective when the user doesn't set
|
Except the recompute argument, other arguments will not be effective when the user doesn't set
|
||||||
auto_parallel_context to `SEMI_AUTO_PARALLEL` or `AUTO_PARALLEL`.
|
auto_parallel_context to `SEMI_AUTO_PARALLEL` or `AUTO_PARALLEL`.
|
||||||
The micro_batch_num must be greater then or equal to pipeline_stage. The data_parallel\*model_parallel
|
The micro_batch_num must be greater than or equal to pipeline_stage. The data_parallel\*model_parallel
|
||||||
\*pipeline_stage must be equal to the device. When setting the pipeline stage and
|
\*pipeline_stage must be less than or equal to the number of devices. When setting the pipeline stage and
|
||||||
optimizer_shard, the config will overwrite the auto_parallel_context.
|
optimizer_shard, the config will overwrite the auto_parallel_context.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
data_parallel (int): The data parallel way. Default: 1.
|
data_parallel (int): The data parallel way. Default: 1.
|
||||||
model_parallel (int): The model parallel way. Default: 1.
|
model_parallel (int): The model parallel way. Default: 1.
|
||||||
pipeline_stage (int): The number of the pipeline stage. Should be a positive value. Default: 1.
|
pipeline_stage (int): The number of the pipeline stage. Should be a positive value. Default: 1.
|
||||||
micro_batch_num (int): The micore size of the batches for the pipeline training. Default: 1.
|
micro_batch_num (int): The micro size of the batches for the pipeline training. Default: 1.
|
||||||
optimizer_shard (bool): Whether to enable optimizer shard. Default False.
|
optimizer_shard (bool): Whether to enable optimizer shard. Default False.
|
||||||
gradient_aggregation_group (int): The fusion group size of the optimizer state sharding. Default: 4.
|
gradient_aggregation_group (int): The fusion group size of the optimizer state sharding. Default: 4.
|
||||||
recompute (bool): Enable recomputation of the transformer block or not. Default: False.
|
recompute (bool): Enable recomputation of the transformer block or not. Default: False.
|
||||||
|
@ -221,7 +221,7 @@ class TransformerOpParallelConfig(_Config):
|
||||||
@property
|
@property
|
||||||
def embedding_dp_mp_config(self):
|
def embedding_dp_mp_config(self):
|
||||||
r"""
|
r"""
|
||||||
To obtain the EmbeddingParallelConfig for the setting the data parallel, model parallel amd embedding
|
To obtain the EmbeddingParallelConfig for setting data parallel, model parallel and embedding
|
||||||
parallel.
|
parallel.
|
||||||
|
|
||||||
Supported Platforms:
|
Supported Platforms:
|
||||||
|
@ -236,7 +236,7 @@ class TransformerOpParallelConfig(_Config):
|
||||||
@property
|
@property
|
||||||
def dp_mp_config(self):
|
def dp_mp_config(self):
|
||||||
r"""
|
r"""
|
||||||
To obtain the EmbeddingParallelConfig for the setting the data parallel, model parallel amd embedding
|
To obtain the EmbeddingParallelConfig for setting data parallel, model parallel and embedding
|
||||||
parallel.
|
parallel.
|
||||||
|
|
||||||
Supported Platforms:
|
Supported Platforms:
|
||||||
|
@ -274,10 +274,11 @@ class FeedForward(Cell):
|
||||||
'hsigmoid', 'logsigmoid' and so on. Default: gelu.
|
'hsigmoid', 'logsigmoid' and so on. Default: gelu.
|
||||||
expert_num (int): The number of experts used in Linear. For the case expert_num > 1, BatchMatMul is used
|
expert_num (int): The number of experts used in Linear. For the case expert_num > 1, BatchMatMul is used
|
||||||
and the first dimension in BatchMatMul indicate expert_num. Default: 1.
|
and the first dimension in BatchMatMul indicate expert_num. Default: 1.
|
||||||
param_init_type (dtype.Number): The parameter initialization type. Can be dtype.float32 or dtype.float16.
|
param_init_type (dtype.Number): The parameter initialization type. Should be dtype.float32 or dtype.float16.
|
||||||
|
Default: dtype.float32.
|
||||||
parallel_config(OpParallelConfig): The config of parallel setting, see `OpParallelConfig`.
|
parallel_config(OpParallelConfig): The config of parallel setting, see `OpParallelConfig`.
|
||||||
Default `default_dpmp_config`, a instance of `OpParallelConfig` with default
|
Default `default_dpmp_config`, an instance of `OpParallelConfig` with
|
||||||
args.
|
default args.
|
||||||
|
|
||||||
Inputs:
|
Inputs:
|
||||||
- **x** (Tensor) - should be `[batch, seq_length, hidden_size]`. Float tensor.
|
- **x** (Tensor) - should be `[batch, seq_length, hidden_size]`. Float tensor.
|
||||||
|
@ -296,7 +297,7 @@ class FeedForward(Cell):
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
>>> model = FeedForward(hidden_size=15, ffn_hidden_size=30, dropout_rate=0.1)
|
>>> model = FeedForward(hidden_size=15, ffn_hidden_size=30, dropout_rate=0.1)
|
||||||
>>> tensor = Tensor(np.ones((2, 20, 15)), dtype.float32)
|
>>> tensor = Tensor(np.ones((2, 20, 15)), mstype.float32)
|
||||||
>>> output = model(tensor)
|
>>> output = model(tensor)
|
||||||
>>> print(output.shape)
|
>>> print(output.shape)
|
||||||
(2, 20, 15)
|
(2, 20, 15)
|
||||||
|
@ -383,19 +384,19 @@ class AttentionMask(Cell):
|
||||||
with 1 and 0. 1 indicates the current position is a valid token, otherwise not.
|
with 1 and 0. 1 indicates the current position is a valid token, otherwise not.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
seq_length(int): the sequence length of the input tensor.
|
seq_length(int): The sequence length of the input tensor.
|
||||||
parallel_config(OpParallelConfig): the parallel configure. Default `default_dpmp_config`,
|
parallel_config(OpParallelConfig): The parallel configure. Default `default_dpmp_config`,
|
||||||
a instance of `OpParallelConfig` with default args.
|
an instance of `OpParallelConfig` with default args.
|
||||||
|
|
||||||
Inputs:
|
Inputs:
|
||||||
- **input_mask** (Tensor) - the mask indicating whether each position is a valid input with
|
- **input_mask** (Tensor) - The mask indicating whether each position is a valid input with
|
||||||
(batch_size, seq_length).
|
(batch_size, seq_length).
|
||||||
|
|
||||||
Outputs:
|
Outputs:
|
||||||
Tensor. the attention mask matrix with shape (batch_size, seq_length, seq_length).
|
Tensor. The attention mask matrix with shape (batch_size, seq_length, seq_length).
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
TypeError: `seq_length` is not a int.
|
TypeError: `seq_length` is not an integer.
|
||||||
ValueError: `seq_length` is not a positive value.
|
ValueError: `seq_length` is not a positive value.
|
||||||
TypeError: `parallel_config` is not a subclass of OpParallelConfig.
|
TypeError: `parallel_config` is not a subclass of OpParallelConfig.
|
||||||
|
|
||||||
|
@ -403,15 +404,16 @@ class AttentionMask(Cell):
|
||||||
``Ascend`` ``GPU``
|
``Ascend`` ``GPU``
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
>>> mask = mindspore.parallel.nn.AttentionMask(seq_length=4)
|
>>> from mindspore.parallel.nn import AttentionMask
|
||||||
|
>>> mask = AttentionMask(seq_length=4)
|
||||||
>>> mask_array = np.array([[1, 1, 1, 0]], np.float32)
|
>>> mask_array = np.array([[1, 1, 1, 0]], np.float32)
|
||||||
>>> inputs = Tensor(mask_array)
|
>>> inputs = Tensor(mask_array)
|
||||||
>>> res = mask(inputs)
|
>>> res = mask(inputs)
|
||||||
>>> print(res)
|
>>> print(res)
|
||||||
Tensor(shape=[1, 4, 4], dtype=Float32,value=[[[1, 0, 0, 0],
|
[[[1, 0, 0, 0],
|
||||||
[1, 1, 0, 0],
|
[1, 1, 0, 0],
|
||||||
[1, 1, 1, 0],
|
[1, 1, 1, 0],
|
||||||
[0, 0, 0, 0]]])
|
[0, 0, 0, 0]]]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@_args_type_validator_check(seq_length=Validator.check_positive_int,
|
@_args_type_validator_check(seq_length=Validator.check_positive_int,
|
||||||
|
@ -452,8 +454,8 @@ class VocabEmbedding(Cell):
|
||||||
"""
|
"""
|
||||||
The embedding lookup table from the 0-th dim of the parameter table. When the parallel_config.vocab_emb_dp is
|
The embedding lookup table from the 0-th dim of the parameter table. When the parallel_config.vocab_emb_dp is
|
||||||
True and in the `AUTO_PARALLEL_MODE`, the embedding lookup will be a `parallel_config.data_parallel`
|
True and in the `AUTO_PARALLEL_MODE`, the embedding lookup will be a `parallel_config.data_parallel`
|
||||||
data parallel way, or will shard the parameter at the 0-th dimension in `parallel_config.model_parallel`, so called
|
data parallel way, or will shard the parameter at the 0-th dimension in `parallel_config.model_parallel`, so-called
|
||||||
row slice of the embedding table
|
row slice of the embedding table.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vocab_size (int): Size of the dictionary of embeddings.
|
vocab_size (int): Size of the dictionary of embeddings.
|
||||||
|
@ -461,11 +463,11 @@ class VocabEmbedding(Cell):
|
||||||
param_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the embedding_table.
|
param_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the embedding_table.
|
||||||
Refer to class `initializer` for the values of string when a string
|
Refer to class `initializer` for the values of string when a string
|
||||||
is specified. Default: 'normal'.
|
is specified. Default: 'normal'.
|
||||||
parallel_config(EmbeddingOpParallelConfig): the parallel config of network. Default
|
parallel_config(EmbeddingOpParallelConfig): The parallel config of network. Default
|
||||||
`default_embedding_parallel_config`, a instance of `EmbeddingOpParallelConfig` with default args.
|
`default_embedding_parallel_config`, an instance of `EmbeddingOpParallelConfig` with default args.
|
||||||
|
|
||||||
Inputs:
|
Inputs:
|
||||||
**input_ids** (Tensor) - the tokenized inputs with datatype int32 with shape (batch_size, seq_length)
|
**input_ids** (Tensor) - The tokenized inputs with datatype int32 with shape (batch_size, seq_length)
|
||||||
|
|
||||||
Outputs:
|
Outputs:
|
||||||
Tuple, a tuple contains (`output`, `embedding_table`)
|
Tuple, a tuple contains (`output`, `embedding_table`)
|
||||||
|
@ -486,7 +488,7 @@ class VocabEmbedding(Cell):
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
>>> model = VocabEmbedding(vocab_size=30, embedding_size=30)
|
>>> model = VocabEmbedding(vocab_size=30, embedding_size=30)
|
||||||
>>> tensor = Tensor(np.ones((20, 15)), dtype.int32)
|
>>> tensor = Tensor(np.ones((20, 15)), mstype.int32)
|
||||||
>>> output, table = model(tensor)
|
>>> output, table = model(tensor)
|
||||||
>>> print(output.shape)
|
>>> print(output.shape)
|
||||||
(20, 15, 30)
|
(20, 15, 30)
|
||||||
|
@ -526,7 +528,7 @@ class MultiHeadAttention(Cell):
|
||||||
r"""
|
r"""
|
||||||
This is an implementation of multihead attention in the paper `Attention is all you need
|
This is an implementation of multihead attention in the paper `Attention is all you need
|
||||||
<https://arxiv.org/pdf/1706.03762v5.pdf>`_. Given the query vector with source length, and the
|
<https://arxiv.org/pdf/1706.03762v5.pdf>`_. Given the query vector with source length, and the
|
||||||
key and value vector with target length, the attention will be performered as the following
|
key and value vector with target length, the attention will be performed as the following
|
||||||
|
|
||||||
.. math::
|
.. math::
|
||||||
MultiHeadAttention(query, key, vector) = Concat(head_1, \dots, head_h)W^O
|
MultiHeadAttention(query, key, vector) = Concat(head_1, \dots, head_h)W^O
|
||||||
|
@ -543,13 +545,15 @@ class MultiHeadAttention(Cell):
|
||||||
num_heads(int): The number of the heads.
|
num_heads(int): The number of the heads.
|
||||||
hidden_dropout_rate(float): The dropout rate of the final output of the layer. Default:0.1
|
hidden_dropout_rate(float): The dropout rate of the final output of the layer. Default:0.1
|
||||||
attention_dropout_rate(float): The dropout rate of the attention scores. Default:0.1
|
attention_dropout_rate(float): The dropout rate of the attention scores. Default:0.1
|
||||||
compute_dtype(dtype.Number): The computation type. Default dtype.float16. The computation of the
|
compute_dtype(dtype.Number): The computation type of dense. Default dtype.float16.
|
||||||
softmax will be converted to the float32.
|
Should be dtype.float32 or dtype.float16.
|
||||||
param_init_type(dtype.Number). The parameter initialization type of the module. Default dtype.float32.
|
param_init_type(dtype.Number). The parameter initialization type of the module. Default dtype.float32.
|
||||||
Can be dtype.float32 or dtype.float16.
|
Should be dtype.float32 or dtype.float16.
|
||||||
|
softmax_compute_type(dtype.Number). The type of softmax computation module. Default dtype.float32.
|
||||||
|
Should be dtype.float32 or dtype.float16.
|
||||||
use_past(bool): Use the past state to compute, used for incremental prediction. Default False.
|
use_past(bool): Use the past state to compute, used for incremental prediction. Default False.
|
||||||
parallel_config(OpParallelConfig): The parallel configure. Default `default_dpmp_config`,
|
parallel_config(OpParallelConfig): The parallel configure. Default `default_dpmp_config`,
|
||||||
a instance of `OpParallelConfig` with default args.
|
an instance of `OpParallelConfig` with default args.
|
||||||
|
|
||||||
Inputs:
|
Inputs:
|
||||||
- **query_tensor** (Tensor) - the query vector with shape (batch_size, src_seq_length, hidden_size).
|
- **query_tensor** (Tensor) - the query vector with shape (batch_size, src_seq_length, hidden_size).
|
||||||
|
@ -572,7 +576,7 @@ class MultiHeadAttention(Cell):
|
||||||
- **output** (Tensor) - Tensor, the float tensor of the output of the layer with
|
- **output** (Tensor) - Tensor, the float tensor of the output of the layer with
|
||||||
shape (batch_size, src_seq_length, hidden_size)
|
shape (batch_size, src_seq_length, hidden_size)
|
||||||
|
|
||||||
- **layer_present** (Tuple) - A tuple of the Tensor the projected key and value vector with
|
- **layer_present** (Tuple) - A tuple of the Tensor of the projected key and value vector with
|
||||||
((batch_size, num_heads, size_per_head, tgt_seq_length),
|
((batch_size, num_heads, size_per_head, tgt_seq_length),
|
||||||
(batch_size, num_heads, tgt_seq_length, size_per_head)).
|
(batch_size, num_heads, tgt_seq_length, size_per_head)).
|
||||||
|
|
||||||
|
@ -582,9 +586,9 @@ class MultiHeadAttention(Cell):
|
||||||
Examples:
|
Examples:
|
||||||
>>> model = MultiHeadAttention(batch_size=2, hidden_size=15, src_seq_length=20, tgt_seq_length=20,
|
>>> model = MultiHeadAttention(batch_size=2, hidden_size=15, src_seq_length=20, tgt_seq_length=20,
|
||||||
... num_heads=3)
|
... num_heads=3)
|
||||||
>>> from_tensor = Tensor(np.ones((2, 20, 15)), dtype.float32)
|
>>> from_tensor = Tensor(np.ones((2, 20, 15)), mstype.float32)
|
||||||
>>> to_tensor = Tensor(np.ones((2, 20, 15)), dtype.float16)
|
>>> to_tensor = Tensor(np.ones((2, 20, 15)), mstype.float16)
|
||||||
>>> attention_mask = Tensor(np.ones((2, 20, 20)), dtype.float16)
|
>>> attention_mask = Tensor(np.ones((2, 20, 20)), mstype.float16)
|
||||||
>>> attn_out, past = model(from_tensor, to_tensor, to_tensor, attention_mask)
|
>>> attn_out, past = model(from_tensor, to_tensor, to_tensor, attention_mask)
|
||||||
>>> print(attn_out.shape)
|
>>> print(attn_out.shape)
|
||||||
(2, 20, 15)
|
(2, 20, 15)
|
||||||
|
@ -601,6 +605,8 @@ class MultiHeadAttention(Cell):
|
||||||
tgt_seq_length=Validator.check_positive_int,
|
tgt_seq_length=Validator.check_positive_int,
|
||||||
attention_dropout_rate=Validator.check_non_negative_float,
|
attention_dropout_rate=Validator.check_non_negative_float,
|
||||||
hidden_dropout_rate=Validator.check_non_negative_float,
|
hidden_dropout_rate=Validator.check_non_negative_float,
|
||||||
|
compute_dtype=_valid_value_checks([mstype.float32, mstype.float16],
|
||||||
|
"MultiHeadAttention"),
|
||||||
softmax_compute_type=_valid_value_checks([mstype.float32, mstype.float16],
|
softmax_compute_type=_valid_value_checks([mstype.float32, mstype.float16],
|
||||||
"MultiHeadAttention"),
|
"MultiHeadAttention"),
|
||||||
param_init_type=_valid_value_checks([mstype.float32, mstype.float16],
|
param_init_type=_valid_value_checks([mstype.float32, mstype.float16],
|
||||||
|
@ -915,7 +921,7 @@ class MultiHeadAttention(Cell):
|
||||||
class TransformerEncoderLayer(Cell):
|
class TransformerEncoderLayer(Cell):
|
||||||
r"""
|
r"""
|
||||||
Transformer Encoder Layer. This is an implementation of the single layer of the transformer
|
Transformer Encoder Layer. This is an implementation of the single layer of the transformer
|
||||||
encoder layer including multihead attention and feedward layer.
|
encoder layer, including multihead attention and feedforward layer.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
batch_size(int): The batch size of the input tensor.
|
batch_size(int): The batch size of the input tensor.
|
||||||
|
@ -930,22 +936,22 @@ class TransformerEncoderLayer(Cell):
|
||||||
'relu6', 'tanh', 'gelu', 'fast_gelu', 'elu', 'sigmoid', 'prelu', 'leakyrelu', 'hswish',
|
'relu6', 'tanh', 'gelu', 'fast_gelu', 'elu', 'sigmoid', 'prelu', 'leakyrelu', 'hswish',
|
||||||
'hsigmoid', 'logsigmoid' and so on. Default: gelu.
|
'hsigmoid', 'logsigmoid' and so on. Default: gelu.
|
||||||
layernorm_compute_type(dtype.Number): The computation type of the layernorm.
|
layernorm_compute_type(dtype.Number): The computation type of the layernorm.
|
||||||
Can be dtype.float32 or dtype.float16. Default dtype.float16.
|
Should be dtype.float32 or dtype.float16. Default dtype.float32.
|
||||||
softmax_compute_type(dtype.Number): The computation type of the softmax in the attention.
|
softmax_compute_type(dtype.Number): The computation type of the softmax in the attention.
|
||||||
Can be dtype.float32 or dtype.float16. Default mstype.float16.
|
Should be dtype.float32 or dtype.float16. Default mstype.float32.
|
||||||
param_init_type(dtype.Number): The parameter initialization type of the module.
|
param_init_type(dtype.Number): The parameter initialization type of the module.
|
||||||
Can be dtype.float32 or dtype.float16. Default dtype.float32.
|
Should be dtype.float32 or dtype.float16. Default dtype.float32.
|
||||||
use_past(bool): Use the past state to compute, used for incremental prediction. Default False.
|
use_past(bool): Use the past state to compute, used for incremental prediction. Default False.
|
||||||
moe_config(MoEConfig): The configuration of MoE (Mixture of Expert).
|
moe_config(MoEConfig): The configuration of MoE (Mixture of Expert).
|
||||||
parallel_config(OpParallelConfig): The parallel configure. Default `default_dpmp_config`,
|
parallel_config(OpParallelConfig): The parallel configure. Default `default_dpmp_config`,
|
||||||
a instance of `OpParallelConfig` with default args.
|
an instance of `OpParallelConfig` with default args.
|
||||||
|
|
||||||
Inputs:
|
Inputs:
|
||||||
- **x** (Tensor) - Float Tensor, shape should be [batch_size, seq_length, hidden_size].
|
- **x** (Tensor) - Float Tensor, shape should be [batch_size, seq_length, hidden_size].
|
||||||
- **input_mask** (Tensor) - Float Tensor, attention mask with shape [batch_size, seq_length, seq_length].
|
- **input_mask** (Tensor) - Float Tensor, attention mask with shape [batch_size, seq_length, seq_length].
|
||||||
- **init_reset** (Tensor) - A bool tensor with shape [batch_size,], used to clear the past key parameter and
|
- **init_reset** (Tensor) - A bool tensor with shape [batch_size], used to clear the past key parameter and
|
||||||
past value parameter used in the incremental prediction. Only valid when use_past is True. Default True.
|
past value parameter used in the incremental prediction. Only valid when use_past is True. Default True.
|
||||||
- **batch_valid_length** (Tensor) - Int32 tensor with shape (batch_size,) the past calculated the index. Used
|
- **batch_valid_length** (Tensor) - Int32 tensor with shape [batch_size] the past calculated the index. Used
|
||||||
for incremental prediction when the use_past is True. Default None.
|
for incremental prediction when the use_past is True. Default None.
|
||||||
|
|
||||||
Outputs:
|
Outputs:
|
||||||
|
@ -954,7 +960,7 @@ class TransformerEncoderLayer(Cell):
|
||||||
- **output** (Tensor) - The float tensor of the output of the layer with
|
- **output** (Tensor) - The float tensor of the output of the layer with
|
||||||
shape (batch_size, seq_length, hidden_size).
|
shape (batch_size, seq_length, hidden_size).
|
||||||
|
|
||||||
- **layer_present** (Tuple) - A tuple of the Tensor the projected key and value vector with
|
- **layer_present** (Tuple) - A tuple of the Tensor of the projected key and value vector with
|
||||||
((batch_size, num_heads, size_per_head, seq_length),
|
((batch_size, num_heads, size_per_head, seq_length),
|
||||||
(batch_size, num_heads, seq_length, size_per_head)).
|
(batch_size, num_heads, seq_length, size_per_head)).
|
||||||
|
|
||||||
|
@ -964,8 +970,8 @@ class TransformerEncoderLayer(Cell):
|
||||||
Examples:
|
Examples:
|
||||||
>>> model = TransformerEncoderLayer(batch_size=2, hidden_size=8, ffn_hidden_size=64, seq_length=16,
|
>>> model = TransformerEncoderLayer(batch_size=2, hidden_size=8, ffn_hidden_size=64, seq_length=16,
|
||||||
... num_heads=2)
|
... num_heads=2)
|
||||||
>>> encoder_input_value = Tensor(np.ones((2, 16, 8)), dtype.float32)
|
>>> encoder_input_value = Tensor(np.ones((2, 16, 8)), mstype.float32)
|
||||||
>>> encoder_input_mask = Tensor(np.ones((2, 16, 16)), dtype.float16)
|
>>> encoder_input_mask = Tensor(np.ones((2, 16, 16)), mstype.float16)
|
||||||
>>> output, past = model(encoder_input_value, encoder_input_mask)
|
>>> output, past = model(encoder_input_value, encoder_input_mask)
|
||||||
>>> print(output.shape)
|
>>> print(output.shape)
|
||||||
(2, 16, 8)
|
(2, 16, 8)
|
||||||
|
@ -1159,7 +1165,7 @@ class TransformerEncoderLayer(Cell):
|
||||||
class TransformerDecoderLayer(Cell):
|
class TransformerDecoderLayer(Cell):
|
||||||
r"""
|
r"""
|
||||||
Transformer Decoder Layer. This is an implementation of the single layer of the transformer
|
Transformer Decoder Layer. This is an implementation of the single layer of the transformer
|
||||||
decoder layer including self-attention, cross attention and feedward layer. When the encoder_output is None,
|
decoder layer, including self-attention, cross attention and feedforward layer. When the encoder_output is None,
|
||||||
the cross attention will not be effective.
|
the cross attention will not be effective.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
@ -1176,15 +1182,15 @@ class TransformerDecoderLayer(Cell):
|
||||||
'relu6', 'tanh', 'gelu', 'fast_gelu', 'elu', 'sigmoid', 'prelu', 'leakyrelu', 'hswish',
|
'relu6', 'tanh', 'gelu', 'fast_gelu', 'elu', 'sigmoid', 'prelu', 'leakyrelu', 'hswish',
|
||||||
'hsigmoid', 'logsigmoid' and so on. Default: gelu.
|
'hsigmoid', 'logsigmoid' and so on. Default: gelu.
|
||||||
layernorm_compute_type(dtype.Number): The computation type of the layernorm.
|
layernorm_compute_type(dtype.Number): The computation type of the layernorm.
|
||||||
Can be dtype.float32 or dtype.float16. Default dtype.float16.
|
Should be dtype.float32 or dtype.float16. Default dtype.float32.
|
||||||
softmax_compute_type(dtype.Number): The computation type of the softmax in the attention.
|
softmax_compute_type(dtype.Number): The computation type of the softmax in the attention.
|
||||||
Can be dtype.float32 or dtype.float16. Default mstype.float16.
|
Should be dtype.float32 or dtype.float16. Default mstype.float32.
|
||||||
param_init_type: The parameter initialization type of the module. Can be dtype.float32 or dtype.float16.
|
param_init_type(dtype.Number): The parameter initialization type of the module.
|
||||||
Default dtype.float32.
|
Should be dtype.float32 or dtype.float16. Default dtype.float32.
|
||||||
use_past(bool): Use the past state to compute, used for incremental prediction. Default False.
|
use_past(bool): Use the past state to compute, used for incremental prediction. Default False.
|
||||||
moe_config(MoEConfig): The configuration of MoE (Mixture of Expert).
|
moe_config(MoEConfig): The configuration of MoE (Mixture of Expert).
|
||||||
parallel_config(OpParallelConfig): The parallel configure. Default `default_dpmp_config`,
|
parallel_config(OpParallelConfig): The parallel configure. Default `default_dpmp_config`,
|
||||||
a instance of `OpParallelConfig` with default args.
|
an instance of `OpParallelConfig` with default args.
|
||||||
|
|
||||||
Inputs:
|
Inputs:
|
||||||
- **hidden_stats** (Tensor) - the input tensor with shape [batch_size, tgt_seq_length, hidden_size].
|
- **hidden_stats** (Tensor) - the input tensor with shape [batch_size, tgt_seq_length, hidden_size].
|
||||||
|
@ -1193,18 +1199,18 @@ class TransformerDecoderLayer(Cell):
|
||||||
- **encoder_output** (Tensor) - the output of the encoder with shape [batch_size, seq_length, hidden_size].
|
- **encoder_output** (Tensor) - the output of the encoder with shape [batch_size, seq_length, hidden_size].
|
||||||
- **memory_mask** (Tensor) - the memory mask of the cross attention with shape [batch, tgt_seq_length,
|
- **memory_mask** (Tensor) - the memory mask of the cross attention with shape [batch, tgt_seq_length,
|
||||||
src_seq_length], where tgt_seq_length is the length of the decoder.
|
src_seq_length], where tgt_seq_length is the length of the decoder.
|
||||||
- **init_reset** (Tensor) - A bool tensor with shape [batch_size,], used to clear the past key parameter and
|
- **init_reset** (Tensor) - A bool tensor with shape [batch_size], used to clear the past key parameter and
|
||||||
past value parameter used in the incremental prediction. Only valid when use_past is True. Default True.
|
past value parameter used in the incremental prediction. Only valid when use_past is True. Default True.
|
||||||
- **batch_valid_length** (Tensor) - Int32 tensor with shape (batch_size,) the past calculated the index. Used
|
- **batch_valid_length** (Tensor) - Int32 tensor with shape [batch_size] the past calculated the index. Used
|
||||||
for incremental prediction when the use_past is True. Default None.
|
for incremental prediction when the use_past is True. Default None.
|
||||||
|
|
||||||
Outputs:
|
Outputs:
|
||||||
Tuple, a tuple contains(`output`, `layer_present`)
|
Tuple, a tuple containing (`output`, `layer_present`)
|
||||||
|
|
||||||
- **output** (Tensor) - the output logit of this layer. The shape is [batch, seq_length, hidden_size]
|
- **output** (Tensor) - the output logit of this layer. The shape is [batch, seq_length, hidden_size]
|
||||||
- **layer_present** (Tensor) - A tuple, where each tuple is the tensor the projected key and value
|
- **layer_present** (Tensor) - A tuple, where each tuple is the tensor of the projected key and value
|
||||||
vector in self attention with shape ((batch_size, num_heads, size_per_head, tgt_seq_length),
|
vector in self attention with shape ((batch_size, num_heads, size_per_head, tgt_seq_length),
|
||||||
(batch_size, num_heads, tgt_seq_length, size_per_head), and the projected key and value vector
|
(batch_size, num_heads, tgt_seq_length, size_per_head), and of the projected key and value vector
|
||||||
in cross attention with shape (batch_size, num_heads, size_per_head, src_seq_length),
|
in cross attention with shape (batch_size, num_heads, size_per_head, src_seq_length),
|
||||||
(batch_size, num_heads, src_seq_length, size_per_head)).
|
(batch_size, num_heads, src_seq_length, size_per_head)).
|
||||||
|
|
||||||
|
@ -1214,10 +1220,10 @@ class TransformerDecoderLayer(Cell):
|
||||||
Examples:
|
Examples:
|
||||||
>>> model = TransformerDecoderLayer(batch_size=2, hidden_size=64, ffn_hidden_size=64, num_heads=2,
|
>>> model = TransformerDecoderLayer(batch_size=2, hidden_size=64, ffn_hidden_size=64, num_heads=2,
|
||||||
... src_seq_length=20, tgt_seq_length=10)
|
... src_seq_length=20, tgt_seq_length=10)
|
||||||
>>> encoder_input_value = Tensor(np.ones((2, 20, 64)), dtype.float32)
|
>>> encoder_input_value = Tensor(np.ones((2, 20, 64)), mstype.float32)
|
||||||
>>> decoder_input_value = Tensor(np.ones((2, 10, 64)), dtype.float32)
|
>>> decoder_input_value = Tensor(np.ones((2, 10, 64)), mstype.float32)
|
||||||
>>> decoder_input_mask = Tensor(np.ones((2, 10, 10)), dtype.float16)
|
>>> decoder_input_mask = Tensor(np.ones((2, 10, 10)), mstype.float16)
|
||||||
>>> memory_mask = Tensor(np.ones((2, 10, 20)), dtype.float16)
|
>>> memory_mask = Tensor(np.ones((2, 10, 20)), mstype.float16)
|
||||||
>>> output, past = model(decoder_input_value, decoder_input_mask, encoder_input_value, memory_mask)
|
>>> output, past = model(decoder_input_value, decoder_input_mask, encoder_input_value, memory_mask)
|
||||||
>>> print(output.shape)
|
>>> print(output.shape)
|
||||||
(2, 10, 64)
|
(2, 10, 64)
|
||||||
|
@ -1477,7 +1483,7 @@ def _get_lambda_func(total_layer=None):
|
||||||
Args:
|
Args:
|
||||||
network(Cell) - Represents the transformer block
|
network(Cell) - Represents the transformer block
|
||||||
layer_id(int) - Means the layer index for the current module, counts from zero.
|
layer_id(int) - Means the layer index for the current module, counts from zero.
|
||||||
offset(int) - Means the layer_index needs a offset, if there are other modules in the net.
|
offset(int) - Means the layer_index needs an offset, if there are other modules in the net.
|
||||||
layers(int) - The total layers used for the model.
|
layers(int) - The total layers used for the model.
|
||||||
"""
|
"""
|
||||||
# override the layers
|
# override the layers
|
||||||
|
@ -1522,30 +1528,30 @@ class TransformerEncoder(Cell):
|
||||||
'relu6', 'tanh', 'gelu', 'fast_gelu', 'elu', 'sigmoid', 'prelu', 'leakyrelu', 'hswish',
|
'relu6', 'tanh', 'gelu', 'fast_gelu', 'elu', 'sigmoid', 'prelu', 'leakyrelu', 'hswish',
|
||||||
'hsigmoid', 'logsigmoid' and so on. Default: gelu.
|
'hsigmoid', 'logsigmoid' and so on. Default: gelu.
|
||||||
layernorm_compute_type(dtype.Number): The computation type of the layernorm.
|
layernorm_compute_type(dtype.Number): The computation type of the layernorm.
|
||||||
Can be dtype.float32 or dtype.float16. Default dtype.float16.
|
Should be dtype.float32 or dtype.float16. Default dtype.float32.
|
||||||
softmax_compute_type(dtype.Number): The computation type of the softmax in the attention.
|
softmax_compute_type(dtype.Number): The computation type of the softmax in the attention.
|
||||||
Can be dtype.float32 or dtype.float16. Default mstype.float16.
|
Should be dtype.float32 or dtype.float16. Default mstype.float32.
|
||||||
param_init_type: The parameter initialization type of the module. Can be dtype.float32 or dtype.float16.
|
param_init_type(dtype.Number): The parameter initialization type of the module.
|
||||||
Default dtype.float32.
|
Should be dtype.float32 or dtype.float16. Default dtype.float32.
|
||||||
use_past(bool): Use the past state to compute, used for incremental prediction. Default False.
|
use_past(bool): Use the past state to compute, used for incremental prediction. Default False.
|
||||||
lambda_func: A function can specific the fusion index, pipeline stages and recompute attribute. If the user
|
lambda_func: A function can determine the fusion index, pipeline stages and recompute attribute. If the user
|
||||||
wants to specific the pipeline stage and gradient aggregation fusion, the user can pass a function
|
wants to determine the pipeline stage and gradient aggregation fusion, the user can pass a function
|
||||||
that accepts `network`, `layer_id`, `offset`, `parallel_config`, `layers`. The `network(Cell)`
|
that accepts `network`, `layer_id`, `offset`, `parallel_config`, `layers`. The `network(Cell)`
|
||||||
represents the transformer block, `layer_id(int)` means the layer index for the current module, counts from
|
represents the transformer block, `layer_id(int)` means the layer index for the current module, counts from
|
||||||
zero, `offset(int)` means the layer_index needs a offset, if there are other modules in the net. The
|
zero, `offset(int)` means the layer_index needs an offset, if there are other modules in the net. The
|
||||||
default setting for the pipeline is: `(layer_id + offset) // (layers / pipeline_stage)`.
|
default setting for the pipeline is: `(layer_id + offset) // (layers / pipeline_stage)`.
|
||||||
offset(int): The initial layer index for the `decoder`. Used for setting the fusion id and stage id, to not
|
offset(int): The initial layer index for the `decoder`. Used for setting the fusion id and stage id, to not
|
||||||
overlap with the encoder layer.
|
overlap with the encoder layer.
|
||||||
moe_config(MoEConfig): The configuration of MoE (Mixture of Expert).
|
moe_config(MoEConfig): The configuration of MoE (Mixture of Expert).
|
||||||
parallel_config(TransformerOpParallelConfig): The parallel configure. Default `default_transformer_config`,
|
parallel_config(TransformerOpParallelConfig): The parallel configuration. Default `default_transformer_config`,
|
||||||
a instance of `TransformerOpParallelConfig` with default args.
|
an instance of `TransformerOpParallelConfig` with default args.
|
||||||
|
|
||||||
Inputs:
|
Inputs:
|
||||||
- **hidden_states** (Tensor) - Tensor, shape should be [batch_size, seq_length, hidden_size]
|
- **hidden_states** (Tensor) - Tensor, shape should be [batch_size, seq_length, hidden_size]
|
||||||
- **attention_mask** (Tensor) - Tensor, attention mask with shape [batch_size, seq_length, seq_length]
|
- **attention_mask** (Tensor) - Tensor, attention mask with shape [batch_size, seq_length, seq_length]
|
||||||
- **init_reset** (Tensor) - A bool tensor with shape [batch_size,], used to clear the past key parameter and
|
- **init_reset** (Tensor) - A bool tensor with shape [batch_size], used to clear the past key parameter and
|
||||||
past value parameter used in the incremental prediction. Only valid when use_past is True. Default True
|
past value parameter used in the incremental prediction. Only valid when use_past is True. Default True
|
||||||
- **batch_valid_length** (Tensor) - Int32 tensor with shape (batch_size,) the past calculated the index. Used
|
- **batch_valid_length** (Tensor) - Int32 tensor with shape [batch_size] the past calculated the index. Used
|
||||||
for incremental prediction when the use_past is True. Default None.
|
for incremental prediction when the use_past is True. Default None.
|
||||||
|
|
||||||
Outputs:
|
Outputs:
|
||||||
|
@ -1563,8 +1569,8 @@ class TransformerEncoder(Cell):
|
||||||
Examples:
|
Examples:
|
||||||
>>> model = TransformerEncoder(batch_size=2, num_layers=2, hidden_size=8, ffn_hidden_size=64, seq_length=16,
|
>>> model = TransformerEncoder(batch_size=2, num_layers=2, hidden_size=8, ffn_hidden_size=64, seq_length=16,
|
||||||
... num_heads=2)
|
... num_heads=2)
|
||||||
>>> encoder_input_value = Tensor(np.ones((2, 16, 8)), dtype.float32)
|
>>> encoder_input_value = Tensor(np.ones((2, 16, 8)), mstype.float32)
|
||||||
>>> encoder_input_mask = Tensor(np.ones((2, 16, 16)), dtype.float16)
|
>>> encoder_input_mask = Tensor(np.ones((2, 16, 16)), mstype.float16)
|
||||||
>>> output, past = model(encoder_input_value, encoder_input_mask)
|
>>> output, past = model(encoder_input_value, encoder_input_mask)
|
||||||
>>> print(output.shape)
|
>>> print(output.shape)
|
||||||
(2, 16, 8)
|
(2, 16, 8)
|
||||||
|
@ -1692,23 +1698,23 @@ class TransformerDecoder(Cell):
|
||||||
'relu6', 'tanh', 'gelu', 'fast_gelu', 'elu', 'sigmoid', 'prelu', 'leakyrelu', 'hswish',
|
'relu6', 'tanh', 'gelu', 'fast_gelu', 'elu', 'sigmoid', 'prelu', 'leakyrelu', 'hswish',
|
||||||
'hsigmoid', 'logsigmoid' and so on. Default: gelu.
|
'hsigmoid', 'logsigmoid' and so on. Default: gelu.
|
||||||
layernorm_compute_type(dtype.Number): The computation type of the layernorm.
|
layernorm_compute_type(dtype.Number): The computation type of the layernorm.
|
||||||
Can be dtype.float32 or dtype.float16. Default dtype.float16.
|
Should be dtype.float32 or dtype.float16. Default dtype.float32.
|
||||||
softmax_compute_type(dtype.Number): The computation type of the softmax in the attention.
|
softmax_compute_type(dtype.Number): The computation type of the softmax in the attention.
|
||||||
Can be dtype.float32 or dtype.float16. Default mstype.float16.
|
Should be dtype.float32 or dtype.float16. Default mstype.float32.
|
||||||
param_init_type: The parameter initialization type of the module. Can be dtype.float32 or dtype.float16.
|
param_init_type(dtype.Number): The parameter initialization type of the module.
|
||||||
Default dtype.float32.
|
Should be dtype.float32 or dtype.float16. Default dtype.float32.
|
||||||
offset(int): The initial layer index for the `decoder`. Used for setting the fusion id and stage id, to not
|
offset(int): The initial layer index for the `decoder`. Used for setting the fusion id and stage id, to not
|
||||||
overlap with the encoder layer.
|
overlap with the encoder layer.
|
||||||
lambda_func: A function can specific the fusion index, pipeline stages and recompute attribute. If the user
|
lambda_func: A function can determine the fusion index, pipeline stages and recompute attribute. If the user
|
||||||
wants to specific the pipeline stage and gradient aggregation fusion, the user can pass a function
|
wants to determine the pipeline stage and gradient aggregation fusion, the user can pass a function
|
||||||
that accepts `network`, `layer_id`, `offset`, `parallel_config`, `layers`. The `network(Cell)`
|
that accepts `network`, `layer_id`, `offset`, `parallel_config`, `layers`. The `network(Cell)`
|
||||||
represents the transformer block, `layer_id(int)` means the layer index for the current module, counts from
|
represents the transformer block, `layer_id(int)` means the layer index for the current module, counts from
|
||||||
zero, `offset(int)` means the layer_index needs a offset, if there are other modules in the net. The
|
zero, `offset(int)` means the layer_index needs an offset, if there are other modules in the net. The
|
||||||
default setting for the pipeline is: `(layer_id + offset) // (layers / pipeline_stage)`.
|
default setting for the pipeline is: `(layer_id + offset) // (layers / pipeline_stage)`.
|
||||||
Default: None
|
Default: None
|
||||||
moe_config(MoEConfig): The configuration of MoE (Mixture of Expert).
|
moe_config(MoEConfig): The configuration of MoE (Mixture of Expert).
|
||||||
parallel_config(TransformerOpParallelConfig): The parallel configure. Default `default_transformer_config`,
|
parallel_config(TransformerOpParallelConfig): The parallel configuration. Default `default_transformer_config`,
|
||||||
a instance of `TransformerOpParallelConfig` with default args.
|
an instance of `TransformerOpParallelConfig` with default args.
|
||||||
|
|
||||||
Inputs:
|
Inputs:
|
||||||
- **hidden_stats** (Tensor) - the input tensor with shape [batch_size, seq_length, hidden_size]
|
- **hidden_stats** (Tensor) - the input tensor with shape [batch_size, seq_length, hidden_size]
|
||||||
|
@ -1717,18 +1723,18 @@ class TransformerDecoder(Cell):
|
||||||
- **memory_mask** (Tensor) - the memory mask of the cross attention with shape [batch, tgt_seq_length,
|
- **memory_mask** (Tensor) - the memory mask of the cross attention with shape [batch, tgt_seq_length,
|
||||||
src_seq_length] where tgt_seq_length is the length of the decoder. the output of the encoder with shape
|
src_seq_length] where tgt_seq_length is the length of the decoder. the output of the encoder with shape
|
||||||
[batch_size, seq_length, hidden_size],
|
[batch_size, seq_length, hidden_size],
|
||||||
- **init_reset** (Tensor) - A bool tensor with shape [batch_size,], used to clear the past key parameter and
|
- **init_reset** (Tensor) - A bool tensor with shape [batch_size], used to clear the past key parameter and
|
||||||
past value parameter used in the incremental prediction. Only valid when use_past is True. Default True
|
past value parameter used in the incremental prediction. Only valid when use_past is True. Default True
|
||||||
- **batch_valid_length** (Tensor) - Int32 tensor with shape (batch_size,) the past calculated the index.
|
- **batch_valid_length** (Tensor) - Int32 tensor with shape [batch_size] the past calculated the index.
|
||||||
Used for incremental prediction when the use_past is True. Default None.
|
Used for incremental prediction when the use_past is True. Default None.
|
||||||
|
|
||||||
Outputs:
|
Outputs:
|
||||||
Tuple, a tuple contains(`output`, `layer_present`)
|
Tuple, a tuple containing (`output`, `layer_present`)
|
||||||
|
|
||||||
- **output** (Tensor) - The output logit of this layer. The shape is [batch, tgt_seq_length, hidden_size]
|
- **output** (Tensor) - The output logit of this layer. The shape is [batch, tgt_seq_length, hidden_size]
|
||||||
- **layer_present** (Tuple) - A tuple with size of num_layers, where each tuple is the tensor the projected
|
- **layer_present** (Tuple) - A tuple with size of num_layers, where each tuple is the tensor of the projected
|
||||||
key and value vector in self attention with shape ((batch_size, num_heads, size_per_head, tgt_seq_length),
|
key and value vector in self attention with shape ((batch_size, num_heads, size_per_head, tgt_seq_length),
|
||||||
(batch_size, num_heads, tgt_seq_length, size_per_head), and the projected key and value vector
|
(batch_size, num_heads, tgt_seq_length, size_per_head), and of the projected key and value vector
|
||||||
in cross attention with shape (batch_size, num_heads, size_per_head, src_seq_length),
|
in cross attention with shape (batch_size, num_heads, size_per_head, src_seq_length),
|
||||||
(batch_size, num_heads, src_seq_length, size_per_head)).
|
(batch_size, num_heads, src_seq_length, size_per_head)).
|
||||||
|
|
||||||
|
@ -1738,10 +1744,10 @@ class TransformerDecoder(Cell):
|
||||||
Examples:
|
Examples:
|
||||||
>>> model = TransformerDecoder(batch_size=2, num_layers=1, hidden_size=64, ffn_hidden_size=64,
|
>>> model = TransformerDecoder(batch_size=2, num_layers=1, hidden_size=64, ffn_hidden_size=64,
|
||||||
... num_heads=2, src_seq_length=20, tgt_seq_length=10)
|
... num_heads=2, src_seq_length=20, tgt_seq_length=10)
|
||||||
>>> encoder_input_value = Tensor(np.ones((2, 20, 64)), dtype.float32)
|
>>> encoder_input_value = Tensor(np.ones((2, 20, 64)), mstype.float32)
|
||||||
>>> decoder_input_value = Tensor(np.ones((2, 10, 64)), dtype.float32)
|
>>> decoder_input_value = Tensor(np.ones((2, 10, 64)), mstype.float32)
|
||||||
>>> decoder_input_mask = Tensor(np.ones((2, 10, 10)), dtype.float16)
|
>>> decoder_input_mask = Tensor(np.ones((2, 10, 10)), mstype.float16)
|
||||||
>>> memory_mask = Tensor(np.ones((2, 10, 20)), dtype.float16)
|
>>> memory_mask = Tensor(np.ones((2, 10, 20)), mstype.float16)
|
||||||
>>> output, past = model(decoder_input_value, decoder_input_mask, encoder_input_value, memory_mask)
|
>>> output, past = model(decoder_input_value, decoder_input_mask, encoder_input_value, memory_mask)
|
||||||
>>> print(output.shape)
|
>>> print(output.shape)
|
||||||
(2, 10, 64)
|
(2, 10, 64)
|
||||||
|
@ -1868,7 +1874,7 @@ class Transformer(Cell):
|
||||||
r"""
|
r"""
|
||||||
Transformer module including encoder and decoder. The difference with the original implements is the module use
|
Transformer module including encoder and decoder. The difference from the original implementation is that it uses
|
||||||
the residual addition before the layer normalization. And the default hidden act is `gelu`.
|
the residual addition before the layer normalization. And the default hidden act is `gelu`.
|
||||||
The detials can be found in `Attention is all you need <https://arxiv.org/pdf/1706.03762v5.pdf>`_.
|
The details can be found in `Attention is all you need <https://arxiv.org/pdf/1706.03762v5.pdf>`_.
|
||||||
|
|
||||||
Note:
|
Note:
|
||||||
This is an experimental interface that is subject to change and/or deletion.
|
This is an experimental interface that is subject to change and/or deletion.
|
||||||
|
@ -1881,37 +1887,43 @@ class Transformer(Cell):
|
||||||
ffn_hidden_size(int): The hidden size of bottleneck in the feedforward layer.
|
ffn_hidden_size(int): The hidden size of bottleneck in the feedforward layer.
|
||||||
src_seq_length(int): The seq_length of the encoder's input tensor.
|
src_seq_length(int): The seq_length of the encoder's input tensor.
|
||||||
tgt_seq_length(int): The seq_length of the decoder's input tensor.
|
tgt_seq_length(int): The seq_length of the decoder's input tensor.
|
||||||
num_heads(int): The number of the heads.
|
num_heads(int): The number of the heads. Default: 2.
|
||||||
hidden_dropout_rate(float): The dropout rate of the final output of the layer. Default:0.1
|
hidden_dropout_rate(float): The dropout rate of the final output of the layer. Default:0.1
|
||||||
attention_dropout_rate(float): The dropout rate of the attention scores. Default:0.1
|
attention_dropout_rate(float): The dropout rate of the attention scores. Default:0.1
|
||||||
post_layernorm_residual(bool): Do residuals adds before the layernorm. Default False.
|
post_layernorm_residual(bool): Do residuals adds before the layernorm. Default False.
|
||||||
|
layernorm_compute_type(dtype.Number): The computation type of the layernorm.
|
||||||
|
Should be dtype.float32 or dtype.float16. Default dtype.float32.
|
||||||
|
softmax_compute_type(dtype.Number): The computation type of the softmax in the attention.
|
||||||
|
Should be dtype.float32 or dtype.float16. Default mstype.float32.
|
||||||
|
param_init_type(dtype.Number): The parameter initialization type of the module.
|
||||||
|
Should be dtype.float32 or dtype.float16. Default dtype.float32.
|
||||||
hidden_act(str): The activation of the internal feedforward layer. Supports 'relu',
|
hidden_act(str): The activation of the internal feedforward layer. Supports 'relu',
|
||||||
'relu6', 'tanh', 'gelu', 'fast_gelu', 'elu', 'sigmoid', 'prelu', 'leakyrelu', 'hswish',
|
'relu6', 'tanh', 'gelu', 'fast_gelu', 'elu', 'sigmoid', 'prelu', 'leakyrelu', 'hswish',
|
||||||
'hsigmoid', 'logsigmoid' and so on. Default: gelu.
|
'hsigmoid', 'logsigmoid' and so on. Default: gelu.
|
||||||
lambda_func: A function can specific the fusion index, pipeline stages and recompute attribute. If the user
|
moe_config(MoEConfig): The configuration of MoE (Mixture of Expert).
|
||||||
wants to specific the pipeline stage and gradient aggregation fusion, the user can pass a function
|
lambda_func: A function can determine the fusion index, pipeline stages and recompute attribute. If the user
|
||||||
|
wants to determine the pipeline stage and gradient aggregation fusion, the user can pass a function
|
||||||
that accepts `network`, `layer_id`, `offset`, `parallel_config`, `layers`. The `network(Cell)`
|
that accepts `network`, `layer_id`, `offset`, `parallel_config`, `layers`. The `network(Cell)`
|
||||||
represents the transformer block, `layer_id(int)` means the layer index for the current module, counts from
|
represents the transformer block, `layer_id(int)` means the layer index for the current module, counts from
|
||||||
zero, `offset(int)` means the layer_index needs a offset, if there are other modules in the net. The
|
zero, `offset(int)` means the layer_index needs an offset, if there are other modules in the net. The
|
||||||
default setting for the pipeline is: `(layer_id + offset) // ((encoder_layers + decoder_length)
|
default setting for the pipeline is: `(layer_id + offset) // ((encoder_layers + decoder_length)
|
||||||
/ pipeline_stage)`.
|
/ pipeline_stage)`.
|
||||||
parallel_config(TransformerOpParallelConfig): The parallel configure. Default `default_transformer_config`,
|
parallel_config(TransformerOpParallelConfig): The parallel configuration. Default `default_transformer_config`,
|
||||||
a instance of `TransformerOpParallelConfig` with default args.
|
an instance of `TransformerOpParallelConfig` with default args.
|
||||||
|
|
||||||
Inputs:
|
Inputs:
|
||||||
- **encoder_inputs** (Tensor) - the input tensor with shape [batch_size, seq_length, hidden_size].
|
- **encoder_inputs** (Tensor) - the input tensor with shape [batch_size, seq_length, hidden_size].
|
||||||
- **encoder_masks** (Tensor) - the attention mask for decoder with shape [batch_size, seq_length, seq_length].
|
- **encoder_masks** (Tensor) - the attention mask for decoder with shape [batch_size, seq_length, seq_length].
|
||||||
- **decoder_inputs** (Tensor) - the output of the encoder with shape [batch_size, seq_length, hidden_size],
|
- **decoder_inputs** (Tensor) - the output of the encoder with shape [batch_size, seq_length, hidden_size],
|
||||||
this can be none if the decoder layer is 0.
|
this should be none if the decoder layer is 0.
|
||||||
- **decoder_masks** (Tensor) - the attention mask for decoder with shape [batch_size, 1,
|
- **decoder_masks** (Tensor) - the attention mask for decoder with shape [batch_size, seq_length, seq_length]
|
||||||
seq_length, seq_length]
|
|
||||||
- **memory_mask** (Tensor) - the memory mask of the cross attention with shape [batch, tgt_seq_length,
|
- **memory_mask** (Tensor) - the memory mask of the cross attention with shape [batch, tgt_seq_length,
|
||||||
src_seq_length]
|
src_seq_length]
|
||||||
where tgt_seq_length is the length of the decoder. the output of the encoder with shape [batch_size,
|
where tgt_seq_length is the length of the decoder. the output of the encoder with shape [batch_size,
|
||||||
seq_length, hidden_size], this can be none if the decoder layer is 0.
|
seq_length, hidden_size], this should be none if the decoder layer is 0.
|
||||||
- **init_reset** (Tensor) - A bool tensor with shape [batch_size,], used to clear the past key parameter and
|
- **init_reset** (Tensor) - A bool tensor with shape [batch_size], used to clear the past key parameter and
|
||||||
past value parameter used in the incremental prediction. Only valid when use_past is True. Default True
|
past value parameter used in the incremental prediction. Only valid when use_past is True. Default True
|
||||||
- **batch_valid_length** (Tensor) - Int32 tensor with shape (batch_size,) the past calculated the index. Used
|
- **batch_valid_length** (Tensor) - Int32 tensor with shape [batch_size] the past calculated the index. Used
|
||||||
for incremental prediction when the use_past is True. Default None.
|
for incremental prediction when the use_past is True. Default None.
|
||||||
|
|
||||||
Outputs:
|
Outputs:
|
||||||
|
@ -1922,10 +1934,10 @@ class Transformer(Cell):
|
||||||
decoder layer. The shape is [batch, tgt_seq_length, hidden_size].
|
decoder layer. The shape is [batch, tgt_seq_length, hidden_size].
|
||||||
- **encoder_layer_present** (Tuple) - A tuple with size of num_layers, where each tuple is the tensor the
|
- **encoder_layer_present** (Tuple) - A tuple with size of num_layers, where each tuple is the tensor of the
|
||||||
projected key and value vector in self attention with shape ((batch_size, num_heads, size_per_head,
|
projected key and value vector in self attention with shape ((batch_size, num_heads, size_per_head,
|
||||||
src_seq_length), (batch_size, num_heads, src_seq_length, size_per_head).
|
src_seq_length), (batch_size, num_heads, src_seq_length, size_per_head)).
|
||||||
- **decoder_layer_present** (Tuple) - A tuple with size of num_layers, where each tuple is the tensor
|
- **decoder_layer_present** (Tuple) - A tuple with size of num_layers, where each tuple is the tensor
|
||||||
the projected key and value vector in self attention with shape ((batch_size, num_heads, size_per_head,
|
of the projected key and value vector in self attention with shape ((batch_size, num_heads, size_per_head,
|
||||||
tgt_seq_length), (batch_size, num_heads, tgt_seq_length, size_per_head), and the
|
tgt_seq_length), (batch_size, num_heads, tgt_seq_length, size_per_head)), and the
|
||||||
projected key and value vector in cross attention with shape
|
projected key and value vector in cross attention with shape
|
||||||
(batch_size, num_heads, size_per_head, src_seq_length),
|
(batch_size, num_heads, size_per_head, src_seq_length),
|
||||||
(batch_size, num_heads, src_seq_length, size_per_head)). If the decoder is not set, the
|
(batch_size, num_heads, src_seq_length, size_per_head)). If the decoder is not set, the
|
||||||
|
@ -1937,11 +1949,11 @@ class Transformer(Cell):
|
||||||
Examples:
|
Examples:
|
||||||
>>> model = Transformer(batch_size=2, encoder_layers=1, decoder_layers=2, hidden_size=64, ffn_hidden_size=64,
|
>>> model = Transformer(batch_size=2, encoder_layers=1, decoder_layers=2, hidden_size=64, ffn_hidden_size=64,
|
||||||
... src_seq_length=20, tgt_seq_length=10)
|
... src_seq_length=20, tgt_seq_length=10)
|
||||||
>>> encoder_input_value = Tensor(np.ones((2, 20, 64)), dtype.float32)
|
>>> encoder_input_value = Tensor(np.ones((2, 20, 64)), mstype.float32)
|
||||||
>>> encoder_input_mask = Tensor(np.ones((2, 20, 20)), dtype.float16)
|
>>> encoder_input_mask = Tensor(np.ones((2, 20, 20)), mstype.float16)
|
||||||
>>> decoder_input_value = Tensor(np.ones((2, 10, 64)), dtype.float32)
|
>>> decoder_input_value = Tensor(np.ones((2, 10, 64)), mstype.float32)
|
||||||
>>> decoder_input_mask = Tensor(np.ones((2, 10, 10)), dtype.float16)
|
>>> decoder_input_mask = Tensor(np.ones((2, 10, 10)), mstype.float16)
|
||||||
>>> memory_mask = Tensor(np.ones((2, 10, 20)), dtype.float16)
|
>>> memory_mask = Tensor(np.ones((2, 10, 20)), mstype.float16)
|
||||||
>>> output, en_past, de_past = model(encoder_input_value, encoder_input_mask, decoder_input_value,
|
>>> output, en_past, de_past = model(encoder_input_value, encoder_input_mask, decoder_input_value,
|
||||||
... decoder_input_mask, memory_mask)
|
... decoder_input_mask, memory_mask)
|
||||||
>>> print(output.shape)
|
>>> print(output.shape)
|
||||||
|
@ -2094,7 +2106,7 @@ class Transformer(Cell):
|
||||||
output = encoder_output
|
output = encoder_output
|
||||||
|
|
||||||
if self.decoder is not None:
|
if self.decoder is not None:
|
||||||
# decoder mask can be created outside of the model
|
# decoder mask should be created outside of the model
|
||||||
if self.use_moe is True:
|
if self.use_moe is True:
|
||||||
decoder_output, decoder_layer_present, decoder_aux_loss = self.decoder(decoder_inputs, decoder_masks,
|
decoder_output, decoder_layer_present, decoder_aux_loss = self.decoder(decoder_inputs, decoder_masks,
|
||||||
encoder_output, memory_mask,
|
encoder_output, memory_mask,
|
||||||
|
|
Loading…
Reference in New Issue