enhance api description
parent 0cd459fb47
commit d480a78c21
@@ -327,9 +327,10 @@ class SentencePieceTokenizer(TextTensorOperation):
    Tokenize scalar token or 1-D tokens to tokens by sentencepiece.

    Args:
        mode (Union[str, SentencePieceVocab]): If the input parameter is a file, then it is of type string.
            If the input parameter is a SentencePieceVocab object, then it is of type SentencePieceVocab.
        out_type (SPieceTokenizerOutType): The type of output, the type is int or string
        mode (Union[str, SentencePieceVocab]): If the input parameter is a file, then its type should be string.
            If the input parameter is a SentencePieceVocab object, then its type should be SentencePieceVocab.
        out_type (SPieceTokenizerOutType): The type of output, it can be any of [SPieceTokenizerOutType.STRING,
            SPieceTokenizerOutType.INT].

    Examples:
        >>> from mindspore.dataset.text import SentencePieceModel, SPieceTokenizerOutType
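
For context, a minimal usage sketch (not part of this patch) showing how the two out_type values are used once a trained model file exists; the file name "m.model" and the in-memory dataset are illustrative assumptions:

    import mindspore.dataset as ds
    import mindspore.dataset.text as text
    from mindspore.dataset.text import SPieceTokenizerOutType

    # out_type selects whether tokens come back as subword strings or as vocabulary ids.
    dataset = ds.NumpySlicesDataset({"text": ["Hello world."]}, shuffle=False)
    tokenizer = text.SentencePieceTokenizer("m.model", out_type=SPieceTokenizerOutType.STRING)
    # tokenizer = text.SentencePieceTokenizer("m.model", out_type=SPieceTokenizerOutType.INT)
    dataset = dataset.map(operations=tokenizer, input_columns=["text"])
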
@@ -350,7 +351,7 @@ class SentencePieceTokenizer(TextTensorOperation):

class SlidingWindow(TextTensorOperation):
    """
    TensorOp to construct a tensor from data (only 1-D for now), where each element in the dimension axis
    Construct a tensor from given data (only support 1-D for now), where each element in the dimension axis
    is a slice of data starting at the corresponding position, with a specified width.

    Args:
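
A short sketch (not from the patch) of the sliding-window behaviour described above; the input data is an illustrative assumption:

    import mindspore.dataset as ds
    import mindspore.dataset.text as text

    # [1, 2, 3, 4, 5] with width=3 becomes [[1, 2, 3], [2, 3, 4], [3, 4, 5]].
    dataset = ds.NumpySlicesDataset([[1, 2, 3, 4, 5]], column_names=["col1"], shuffle=False)
    dataset = dataset.map(operations=text.SlidingWindow(3, 0), input_columns=["col1"])
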
@@ -387,15 +388,13 @@ class ToNumber(TextTensorOperation):
    """
    Tensor operation to convert every element of a string tensor to a number.

    Strings are cast according to the rules specified in the following links:
    Strings are cast according to the rules specified in the following links, except that any strings which represent
    negative numbers cannot be cast to an unsigned integer type, rules links are as follows:
    https://en.cppreference.com/w/cpp/string/basic_string/stof,
    https://en.cppreference.com/w/cpp/string/basic_string/stoul,
    except that any strings which represent negative numbers cannot be cast to an
    unsigned integer type.

    Args:
        data_type (mindspore.dtype): mindspore.dtype to be cast to. Must be
            a numeric type.
        data_type (mindspore.dtype): Type to be cast to. Must be a numeric type in mindspore.dtype.

    Raises:
        RuntimeError: If strings are invalid to cast, or are out of range after being cast.
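
A hedged sketch (not part of this patch) of the casting rule: numeric strings are converted to the requested mindspore.dtype, while an out-of-range value or a negative-to-unsigned cast raises RuntimeError; the sample data is assumed:

    import mindspore.dataset as ds
    import mindspore.dataset.text as text
    from mindspore import dtype as mstype

    # "1", "2.5" and "3" are cast to float32; a value like "-1" would fail for an unsigned type.
    dataset = ds.NumpySlicesDataset({"text": ["1", "2.5", "3"]}, shuffle=False)
    dataset = dataset.map(operations=text.ToNumber(mstype.float32), input_columns=["text"])
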
@@ -521,7 +520,7 @@ class WordpieceTokenizer(TextTensorOperation):

class PythonTokenizer:
    """
    Callable class to be used for user-defined string tokenizer.
    Class that apply user-defined string tokenizer into input string.

    Args:
        tokenizer (Callable): Python function that takes a `str` and returns a list of `str` as tokens.
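
To illustrate the Callable contract, a sketch (not part of the patch; the whitespace tokenizer and sample sentence are assumptions):

    import mindspore.dataset as ds
    import mindspore.dataset.text as text

    def my_tokenizer(line):
        # A trivial user-defined tokenizer: split the input string on whitespace.
        return line.split()

    dataset = ds.NumpySlicesDataset({"text": ["Welcome to Beijing"]}, shuffle=False)
    dataset = dataset.map(operations=text.PythonTokenizer(my_tokenizer), input_columns=["text"])
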
@@ -752,9 +751,9 @@ if platform.system().lower() != 'windows':

class RegexReplace(TextTensorOperation):
    """
    Replace UTF-8 string tensor with 'replace' according to regular expression 'pattern'.
    Replace a part of UTF-8 string tensor with given text according to regular expressions.

    See http://userguide.icu-project.org/strings/regexp for support regex pattern.
    See http://userguide.icu-project.org/strings/regexp for supported regex pattern.

    Note:
        RegexReplace is not supported on Windows platform yet.
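
A minimal sketch (not from the patch, non-Windows platforms only) of the pattern/replace pair; the sample text is an assumption:

    import mindspore.dataset as ds
    import mindspore.dataset.text as text

    # Replace every run of whitespace with a single underscore.
    dataset = ds.NumpySlicesDataset({"text": ["one two   three"]}, shuffle=False)
    replace_op = text.RegexReplace(pattern=r"\s+", replace="_")
    dataset = dataset.map(operations=replace_op, input_columns=["text"])
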
@@ -786,7 +785,7 @@ if platform.system().lower() != 'windows':
    """
    Tokenize a scalar tensor of UTF-8 string by regex expression pattern.

    See http://userguide.icu-project.org/strings/regexp for support regex pattern.
    See http://userguide.icu-project.org/strings/regexp for supported regex pattern.

    Note:
        RegexTokenizer is not supported on Windows platform yet.
@@ -795,16 +794,16 @@ if platform.system().lower() != 'windows':
        delim_pattern (str): The pattern of regex delimiters.
            The original string will be split by matched elements.
        keep_delim_pattern (str, optional): The string matched by 'delim_pattern' can be kept as a token
            if it can be matched by 'keep_delim_pattern'. The default value is an empty str ('')
            if it can be matched by 'keep_delim_pattern'. The default value is an empty str
            which means that delimiters will not be kept as an output token (default='').
        with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).
        with_offsets (bool, optional): Whether or not output offsets of tokens(default=False).

    Examples:
        >>> # If with_offsets=False, default output one column {["text", dtype=str]}
        >>> # If with_offsets=False, default output is one column {["text", dtype=str]}
        >>> delim_pattern = r"[ |,]"
        >>> tokenizer_op = text.RegexTokenizer(delim_pattern, with_offsets=False)
        >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
        >>> # If with_offsets=False, then output three columns {["token", dtype=str],
        >>> # If with_offsets=True, then output three columns {["token", dtype=str],
        >>> # ["offsets_start", dtype=uint32],
        >>> # ["offsets_limit", dtype=uint32]}
        >>> tokenizer_op = text.RegexTokenizer(delim_pattern, with_offsets=True)
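
The keep_delim_pattern behaviour may be easier to see in a sketch (not part of the patch, non-Windows only; the sample text is assumed): delimiters that also match keep_delim_pattern are emitted as tokens instead of being dropped.

    import mindspore.dataset as ds
    import mindspore.dataset.text as text

    # Split on commas but keep each comma as its own token.
    dataset = ds.NumpySlicesDataset({"text": ["a,b,c"]}, shuffle=False)
    tokenizer_op = text.RegexTokenizer(delim_pattern=",", keep_delim_pattern=",")
    dataset = dataset.map(operations=tokenizer_op, input_columns=["text"])
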
@@ -827,7 +826,7 @@ if platform.system().lower() != 'windows':

class UnicodeScriptTokenizer(TextTensorOperation):
    """
    Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.
    Tokenize a scalar tensor of UTF-8 string based on Unicode script boundaries.

    Note:
        UnicodeScriptTokenizer is not supported on Windows platform yet.
@@ -840,9 +839,9 @@ if platform.system().lower() != 'windows':
        >>> # If with_offsets=False, default output one column {["text", dtype=str]}
        >>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=False)
        >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
        >>> # If with_offsets=False, then output three columns {["token", dtype=str],
        >>> # ["offsets_start", dtype=uint32],
        >>> # ["offsets_limit", dtype=uint32]}
        >>> # If with_offsets=True, then output three columns {["token", dtype=str],
        >>> # ["offsets_start", dtype=uint32],
        >>> # ["offsets_limit", dtype=uint32]}
        >>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=True)
        >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op, input_columns=["text"],
        ...                                           output_columns=["token", "offsets_start", "offsets_limit"],
@@ -56,12 +56,12 @@ class Vocab(cde.Vocab):
            min_frequency=1. max_frequency > total_words is the same as max_frequency = total_words.
            min_frequency/max_frequency can be None, which corresponds to 0/total_words separately
            (default=None, all words are included).
        top_k(int, optional): top_k > 0. Number of words to be built into vocab. top_k most frequent words are
            taken. top_k is taken after freq_range. If not enough top_k, all words will be taken (default=None,
            all words are included).
        special_tokens(list, optional): a list of strings, each one is a special token. for example
        top_k(int, optional): top_k is greater than 0. Number of words to be built into vocab. top_k means most
            frequent words are taken. top_k is taken after freq_range. If not enough top_k, all words will be taken
            (default=None, all words are included).
        special_tokens(list, optional): A list of strings, each one is a special token. For example
            special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
        special_first(bool, optional): whether special_tokens will be prepended/appended to vocab. If special_tokens
        special_first(bool, optional): Whether special_tokens will be prepended/appended to vocab. If special_tokens
            is specified and special_first is set to True, special_tokens will be prepended (default=True).

    Returns:
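
As a sketch of how top_k and the special tokens interact (not part of the patch; the toy dataset is an assumption), Vocab.from_dataset might be called like this:

    import mindspore.dataset as ds
    import mindspore.dataset.text as text

    # Build a vocab from a pre-tokenized column, keep the 100 most frequent words,
    # and prepend "<pad>"/"<unk>" so they receive ids 0 and 1.
    dataset = ds.NumpySlicesDataset({"text": [["hello", "world"], ["hello", "mindspore"]]}, shuffle=False)
    vocab = text.Vocab.from_dataset(dataset, columns=["text"], freq_range=None, top_k=100,
                                    special_tokens=["<pad>", "<unk>"], special_first=True)
    dataset = dataset.map(operations=text.Lookup(vocab, "<unk>"), input_columns=["text"])
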
@@ -76,10 +76,10 @@ class Vocab(cde.Vocab):
    Build a vocab object from a list of word.

    Args:
        word_list(list): a list of string where each element is a word of type string.
        special_tokens(list, optional): a list of strings, each one is a special token. for example
        word_list(list): A list of string where each element is a word of type string.
        special_tokens(list, optional): A list of strings, each one is a special token. for example
            special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
        special_first(bool, optional): whether special_tokens is prepended or appended to vocab. If special_tokens
        special_first(bool, optional): Whether special_tokens is prepended or appended to vocab. If special_tokens
            is specified and special_first is set to True, special_tokens will be prepended (default=True).

    Returns:
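
A brief sketch (not part of the patch) of special_first: with special_first=True the special tokens are prepended and take the lowest ids:

    import mindspore.dataset.text as text

    # "<pad>" gets id 0 and "<unk>" id 1; the ordinary words follow.
    vocab = text.Vocab.from_list(["home", "world", "hello"],
                                 special_tokens=["<pad>", "<unk>"], special_first=True)
    lookup_op = text.Lookup(vocab, "<unk>")
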
@@ -96,13 +96,13 @@ class Vocab(cde.Vocab):
    Build a vocab object from a list of word.

    Args:
        file_path (str): path to the file which contains the vocab list.
        delimiter (str, optional): a delimiter to break up each line in file, the first element is taken to be
        file_path (str): Path to the file which contains the vocab list.
        delimiter (str, optional): A delimiter to break up each line in file, the first element is taken to be
            the word (default="").
        vocab_size (int, optional): number of words to read from file_path (default=None, all words are taken).
        special_tokens (list, optional): a list of strings, each one is a special token. for example
        vocab_size (int, optional): Number of words to read from file_path (default=None, all words are taken).
        special_tokens (list, optional): A list of strings, each one is a special token. for example
            special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
        special_first (bool, optional): whether special_tokens will be prepended/appended to vocab,
        special_first (bool, optional): Whether special_tokens will be prepended/appended to vocab,
            If special_tokens is specified and special_first is set to True,
            special_tokens will be prepended (default=True).
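
A self-contained sketch (not from the patch; the file name and contents are assumptions) of the expected file format, one word per line with anything after the delimiter ignored:

    import mindspore.dataset.text as text

    # Each line holds a word, optionally followed by extra fields after the delimiter.
    with open("vocab.txt", "w") as f:
        f.write("home,1\nworld,2\nhello,3\n")
    vocab = text.Vocab.from_file("vocab.txt", delimiter=",", special_tokens=["<unk>"])
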
@@ -122,7 +122,7 @@ class Vocab(cde.Vocab):
    Build a vocab object from a dict.

    Args:
        word_dict (dict): dict contains word and id pairs, where word should be str and id be int. id is recommended
        word_dict (dict): Dict contains word and id pairs, where word should be str and id be int. id is recommended
            to start from 0 and be continuous. ValueError will be raised if id is negative.

    Returns:
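
For clarity, a minimal sketch (not part of the patch) of the recommended id layout, starting at 0 and continuous:

    import mindspore.dataset.text as text

    # A negative id would raise ValueError.
    vocab = text.Vocab.from_dict({"<pad>": 0, "<unk>": 1, "home": 2, "world": 3})
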
@@ -134,24 +134,26 @@ class Vocab(cde.Vocab):

class SentencePieceVocab(cde.SentencePieceVocab):
    """
    SentencePiece obiect that is used to segmentate words
    SentencePiece object that is used to do words segmentation.
    """

    @classmethod
    @check_from_dataset_sentencepiece
    def from_dataset(cls, dataset, col_names, vocab_size, character_coverage, model_type, params):
        """
        Build a sentencepiece from a dataset
        Build a SentencePiece from a dataset.

        Args:
            dataset(Dataset): Dataset to build sentencepiece.
            dataset(Dataset): Dataset to build SentencePiece.
            col_names(list): The list of the col name.
            vocab_size(int): Vocabulary size.
            character_coverage(float): Amount of characters covered by the model, good defaults are: 0.9995 for
                languages. with rich character set like Japanese or Chinese and 1.0 for other languages with small
                languages with rich character set like Japanese or Chinese and 1.0 for other languages with small
                character set.
            model_type(SentencePieceModel): Choose from UNIGRAM (default), BPE, CHAR, or WORD. The input sentence
                must be pretokenized when using word type.
            model_type(SentencePieceModel): It can be any of [SentencePieceModel.UNIGRAM, SentencePieceModel.BPE,
                SentencePieceModel.CHAR, SentencePieceModel.WORD], default is SentencePieceModel.UNIGRAM. The input
                sentence must be pre-tokenized when using word type.

            params(dict): A dictionary with no incoming parameters.

        Returns:
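
A hedged sketch (not in the patch) of training a SentencePiece model from a text dataset; "corpus.txt" is an assumed one-sentence-per-line corpus:

    import mindspore.dataset as ds
    import mindspore.dataset.text as text
    from mindspore.dataset.text import SentencePieceModel

    corpus = ds.TextFileDataset("corpus.txt", shuffle=False)
    vocab = text.SentencePieceVocab.from_dataset(corpus, ["text"], 5000, 0.9995,
                                                 SentencePieceModel.UNIGRAM, {})
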
@@ -168,13 +170,15 @@ class SentencePieceVocab(cde.SentencePieceVocab):
        Build a SentencePiece object from a list of word.

        Args:
            file_path(list): Path to the file which contains the sentencepiece list.
            vocab_size(int): Vocabulary size, the type of uint32_t.
            file_path(list): Path to the file which contains the SentencePiece list.
            vocab_size(int): Vocabulary size.
            character_coverage(float): Amount of characters covered by the model, good defaults are: 0.9995 for
                languages. with rich character set like Japanse or Chinese and 1.0 for other languages with small
                languages with rich character set like Japanese or Chinese and 1.0 for other languages with small
                character set.
            model_type(SentencePieceModel): Choose from unigram (default), bpe, char, or word. The input sentence
                must be pretokenized when using word type.
            model_type(SentencePieceModel): It can be any of [SentencePieceModel.UNIGRAM, SentencePieceModel.BPE,
                SentencePieceModel.CHAR, SentencePieceModel.WORD], default is SentencePieceModel.UNIGRAM. The input
                sentence must be pre-tokenized when using word type.

            params(dict): A dictionary with no incoming parameters(The parameters are derived from SentencePiece
                library).
@@ -193,10 +197,10 @@ class SentencePieceVocab(cde.SentencePieceVocab):
    @check_save_model
    def save_model(cls, vocab, path, filename):
        """
        Save model to filepath
        Save model into given filepath.

        Args:
            vocab(SentencePieceVocab): A sentencepiece object.
            vocab(SentencePieceVocab): A SentencePiece object.
            path(str): Path to store model.
            filename(str): The name of the file.
        """
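
Tying from_file and save_model together, a sketch (not part of the patch; "corpus.txt" and the output name are assumptions):

    import mindspore.dataset.text as text
    from mindspore.dataset.text import SentencePieceModel

    # Train from a raw text file, then serialize the model for later use
    # with SentencePieceTokenizer.
    vocab = text.SentencePieceVocab.from_file(["corpus.txt"], 5000, 0.9995,
                                              SentencePieceModel.UNIGRAM, {})
    text.SentencePieceVocab.save_model(vocab, ".", "m.model")
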
@@ -208,7 +212,7 @@ def to_str(array, encoding='utf8'):
    Convert NumPy array of `bytes` to array of `str` by decoding each element based on charset `encoding`.

    Args:
        array (numpy.ndarray): Array of type `bytes` representing strings.
        array (numpy.ndarray): Array of `bytes` type representing strings.
        encoding (str): Indicating the charset for decoding.

    Returns:
@@ -226,7 +230,7 @@ def to_bytes(array, encoding='utf8'):
    Convert NumPy array of `str` to array of `bytes` by encoding each element based on charset `encoding`.

    Args:
        array (numpy.ndarray): Array of type `str` representing strings.
        array (numpy.ndarray): Array of `str` type representing strings.
        encoding (str): Indicating the charset for encoding.

    Returns:
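
Since to_str and to_bytes mirror each other, one round-trip sketch (not part of the patch) covers both:

    import numpy as np
    from mindspore.dataset.text import to_str, to_bytes

    encoded = to_bytes(np.array(["hello", "world"]))  # str -> bytes, utf8 by default
    decoded = to_str(encoded)                          # bytes -> str
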
@@ -169,8 +169,7 @@ class Slice(TensorOperation):
    """
    Slice operation to extract a tensor out using the given n slices.

    The functionality of Slice is similar to NumPy's indexing feature.
    (Currently only rank-1 tensors are supported).
    The functionality of Slice is similar to NumPy's indexing feature (Currently only rank-1 tensors are supported).

    Args:
        slices (Union[int, list[int], slice, None, Ellipsis]):
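
A small sketch (not from the patch; the input tensor is assumed) of the NumPy-like indexing on a rank-1 tensor:

    import mindspore.dataset as ds
    import mindspore.dataset.transforms.c_transforms as c_transforms

    # Keep the first and third elements, like data[[0, 2]] in NumPy: [1, 2, 3] -> [1, 3].
    dataset = ds.NumpySlicesDataset([[1, 2, 3]], column_names=["col"], shuffle=False)
    dataset = dataset.map(operations=c_transforms.Slice([0, 2]), input_columns=["col"])
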
@@ -234,10 +233,11 @@ class Mask(TensorOperation):
    Any element of the tensor that matches the predicate will be evaluated to True, otherwise False.

    Args:
        operator (Relational): One of the relational operators EQ, NE LT, GT, LE or GE
        operator (Relational): relational operators, it can be any of [Relational.EQ, Relational.NE, Relational.LT,
            Relational.GT, Relational.LE, Relational.GE], take Relational.EQ as example, EQ refers to equal.
        constant (Union[str, int, float, bool]): Constant to be compared to.
            Constant will be cast to the type of the input tensor.
        dtype (mindspore.dtype, optional): Type of the generated mask (Default to bool).
        dtype (mindspore.dtype, optional): Type of the generated mask (Default mstype.bool_).

    Examples:
        >>> from mindspore.dataset.transforms.c_transforms import Relational
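
To make the operator/constant pair concrete, a sketch (not part of the patch; the input tensor is assumed):

    import mindspore.dataset as ds
    import mindspore.dataset.transforms.c_transforms as c_transforms
    from mindspore.dataset.transforms.c_transforms import Relational

    # Relational.GT with constant 1 turns [1, 2, 3] into the mask [False, True, True].
    dataset = ds.NumpySlicesDataset([[1, 2, 3]], column_names=["col"], shuffle=False)
    dataset = dataset.map(operations=c_transforms.Mask(Relational.GT, 1), input_columns=["col"])
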
@@ -268,7 +268,7 @@ class Mask(TensorOperation):

class PadEnd(TensorOperation):
    """
    Pad input tensor according to pad_shape, need to have same rank.
    Pad input tensor according to pad_shape, input tensor needs to have same rank.

    Args:
        pad_shape (list(int)): List of integers representing the shape needed. Dimensions that set to `None` will
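
A sketch of pad_shape and pad_value (not part of the patch; the input tensor is assumed):

    import mindspore.dataset as ds
    import mindspore.dataset.transforms.c_transforms as c_transforms

    # Pad each rank-1 tensor to length 4 with 0: [1, 2, 3] -> [1, 2, 3, 0].
    dataset = ds.NumpySlicesDataset([[1, 2, 3]], column_names=["col"], shuffle=False)
    dataset = dataset.map(operations=c_transforms.PadEnd(pad_shape=[4], pad_value=0),
                          input_columns=["col"])
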