@@ -139,7 +139,7 @@ class TransformerRecomputeConfig(_Config):
     Examples:
         >>> from mindspore.nn.transformer import TransformerRecomputeConfig
         >>> config=TransformerRecomputeConfig(recompute=True, parallel_optimizer_comm_recompute=True, \
-        mp_comm_recompute=True, recompute_slice_activation=True)
+        ... mp_comm_recompute=True, recompute_slice_activation=True)
     """
 
     def __init__(self, recompute=False, parallel_optimizer_comm_recompute=False,
@@ -226,7 +226,7 @@ class TransformerOpParallelConfig(_Config):
     Examples:
         >>> from mindspore.nn.transformer import TransformerRecomputeConfig
         >>> recompute_config=TransformerRecomputeConfig(recompute=True, parallel_optimizer_comm_recompute=True, \
-        mp_comm_recompute=True, recompute_slice_activation=True)
+        ... mp_comm_recompute=True, recompute_slice_activation=True)
         >>> config=TransformerOpParallelConfig(data_parallel=1, model_parallel=1, recompute=recompute_config)
     """
@@ -521,9 +521,9 @@ class AttentionMask(Cell):
         >>> inputs = Tensor(mask_array)
         >>> res = mask(inputs)
         >>> print(res)
-        [[[1. 0. 0. 0],
-         [1. 1. 0. 0],
-         [1. 1. 1. 0],
+        [[[1. 0. 0. 0]
+         [1. 1. 0. 0]
+         [1. 1. 1. 0]
          [0. 0. 0. 0]]]
     """
@@ -734,15 +734,15 @@ class MultiHeadAttention(Cell):
         (2, 3, 5, 20)
         >>> print(past[1].shape)
         (2, 3, 20, 5)
-        # When use use_past=True, it includes two steps to implement the incremental prediction.
-        # Step 1: set is_first_iteration=True, and input the full sequence length's state.
-        # We need to prepare the memory parameters for saving key and value states firstly.
+        >>> # When use use_past=True, it includes two steps to implement the incremental prediction.
+        >>> # Step 1: set is_first_iteration=True, and input the full sequence length's state.
+        >>> # We need to prepare the memory parameters for saving key and value states firstly.
         >>> model = MultiHeadAttention(batch_size=2, hidden_size=15, src_seq_length=20, tgt_seq_length=20,
         ... num_heads=3, use_past=True)
         >>> key_past = Tensor(np.zeros(shape=(2, 3, 5, 20)), mstype.float16)
         >>> value_past = Tensor(np.zeros(shape=(2, 3, 20, 5)), mstype.float16)
         >>> batch_valid_length = Tensor(np.ones((2,)), mstype.int32)
-        # Set is_first_iteration=True to generate the full memory states
+        >>> # Set is_first_iteration=True to generate the full memory states
         >>> model.add_flags_recursive(is_first_iteration=True)
         >>> attn_out, past = model(from_tensor, to_tensor, to_tensor, attention_mask, key_past, value_past,
         ... batch_valid_length)
@@ -755,8 +755,8 @@ class MultiHeadAttention(Cell):
         >>> from_tensor = Tensor(np.ones((2, 1, 15)), mstype.float32)
         >>> to_tensor = Tensor(np.ones((2, 1, 15)), mstype.float16)
         >>> attention_mask = Tensor(np.ones((2, 1, 20)), mstype.float16)
-        # Step 2: set is_first_iteration=False, and pass the single word to run the prediction rather than the full
-        # sequence.
+        >>> # Step 2: set is_first_iteration=False, and pass the single word to run the prediction rather than the full
+        >>> # sequence.
         >>> model.add_flags_recursive(is_first_iteration=False)
         >>> attn_out, past = model(from_tensor, to_tensor, to_tensor, attention_mask, key_past, value_past,
         ... batch_valid_length)
@@ -1209,11 +1209,11 @@ class TransformerEncoderLayer(Cell):
         (2, 2, 4, 16)
         >>> print(past[1].shape)
         (2, 2, 16, 4)
-        # When use use_past=True, it includes two steps to implement the incremental prediction.
-        # Step 1: set is_first_iteration=True, and input the full sequence length's state.
+        >>> # When use use_past=True, it includes two steps to implement the incremental prediction.
+        >>> # Step 1: set is_first_iteration=True, and input the full sequence length's state.
         >>> batch_valid_length = Tensor(np.ones((2,)), mstype.int32)
         >>> init_reset = Tensor([True], mstype.bool_)
-        # Set is_first_iteration=True to generate the full memory states
+        >>> # Set is_first_iteration=True to generate the full memory states
         >>> model = TransformerEncoderLayer(batch_size=2, hidden_size=8, ffn_hidden_size=64, seq_length=16,
         ... num_heads=2, use_past=True)
         >>> model.add_flags_recursive(is_first_iteration=True)
@@ -1227,8 +1227,8 @@ class TransformerEncoderLayer(Cell):
         >>> encoder_input_value = Tensor(np.ones((2, 1, 8)), mstype.float32)
         >>> encoder_input_mask = Tensor(np.ones((2, 1, 16)), mstype.float16)
         >>> init_reset = Tensor([False], mstype.bool_)
-        # Step 2: set is_first_iteration=False, and pass the single word to run the prediction rather than the full
-        # sequence.
+        >>> # Step 2: set is_first_iteration=False, and pass the single word to run the prediction rather than the full
+        >>> # sequence.
         >>> model.add_flags_recursive(is_first_iteration=False)
         >>> hidden, past = model(encoder_input_value, encoder_input_mask, init_reset, batch_valid_length)
         >>> print(hidden.shape)
@@ -1847,8 +1847,8 @@ def _get_lambda_func(total_layer=None):
                 network.recompute()
         else:
             if parallel_config.recompute.recompute:
-                network.recompute(parallel_optimizer_comm_recompute=
-                                  parallel_config.recompute.parallel_optimizer_comm_recompute,
+                paralel_op_comm_compute = parallel_config.recompute.parallel_optimizer_comm_recompute
+                network.recompute(parallel_optimizer_comm_recompute=paralel_op_comm_compute,
                                   mp_comm_recompute=parallel_config.recompute.mp_comm_recompute,
                                   recompute_slice_activation=parallel_config.recompute.recompute_slice_activation)
@@ -1940,11 +1940,11 @@ class TransformerEncoder(Cell):
         (2, 2, 4, 16)
         >>> print(past[0][1].shape)
         (2, 2, 16, 4)
-        # When use use_past=True, it includes two steps to implement the incremental prediction.
-        # Step 1: set is_first_iteration=True, and input the full sequence length's state.
+        >>> # When use use_past=True, it includes two steps to implement the incremental prediction.
+        >>> # Step 1: set is_first_iteration=True, and input the full sequence length's state.
         >>> batch_valid_length = Tensor(np.ones((2,)), mstype.int32)
         >>> init_reset = Tensor([True], mstype.bool_)
-        # Set is_first_iteration=True to generate the full memory states
+        >>> # Set is_first_iteration=True to generate the full memory states
         >>> model = TransformerEncoder(batch_size=2, hidden_size=8, ffn_hidden_size=64, seq_length=16,
         ... num_heads=2, num_layers=2, use_past=True)
         >>> model.add_flags_recursive(is_first_iteration=True)
@@ -1958,8 +1958,8 @@ class TransformerEncoder(Cell):
         >>> encoder_input_value = Tensor(np.ones((2, 1, 8)), mstype.float32)
         >>> encoder_input_mask = Tensor(np.ones((2, 1, 16)), mstype.float16)
         >>> init_reset = Tensor([False], mstype.bool_)
-        # Step 2: set is_first_iteration=False, and pass the single word to run the prediction rather than the full
-        # sequence.
+        >>> # Step 2: set is_first_iteration=False, and pass the single word to run the prediction rather than the full
+        >>> # sequence.
         >>> model.add_flags_recursive(is_first_iteration=False)
         >>> hidden, past = model(encoder_input_value, encoder_input_mask, init_reset, batch_valid_length)
         >>> print(hidden.shape)