forked from mindspore-Ecosystem/mindspore
!2433 add comment for dataset.text
Merge pull request !2433 from qianlong21st/add_text_comment
This commit is contained in:
commit 3654f0f9ac
@@ -11,9 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
mindspore.dataset.text
This module supports text processing for NLP. It includes two parts:
transforms and utils. transforms is a high performance
NLP text processing module which is developed with icu4c and cppjieba.
utils provides some general methods for NLP text processing.
"""
import platform
from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer, TruncateSequencePair, \
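As a quick orientation for the split described in this docstring, here is a minimal sketch of how the two parts fit together: transforms operators go into dataset.map, while utils helpers such as to_str are called directly in Python. The file path and the 'text' column name are illustrative assumptions mirroring the example added to transforms.py below.

>>> import mindspore.dataset as ds
>>> import mindspore.dataset.text as text
>>> dataset = ds.TextFileDataset("path/to/text_file_path", shuffle=False)
>>> dataset = dataset.map(operations=[text.UnicodeCharTokenizer()])  # transforms part
>>> for row in dataset.create_dict_iterator():
>>>     print(text.to_str(row['text']))                              # utils part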
@@ -12,9 +12,37 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
c transforms for all text related operators
"""
The module text.transforms is inherited from _c_dataengine
which is implemented based on icu4c and cppjieba in C++.
It is a high performance module to process NLP text.
Users can use Vocab to build their own dictionary,
use appropriate tokenizers to split sentences into different tokens,
and use Lookup to find the index of tokens in Vocab.

.. Note::
    Constructor's arguments for every class in this module must be saved into the
    class attributes (self.xxx) to support save() and load().

Examples:
    >>> import mindspore.dataset as ds
    >>> import mindspore.dataset.text as text
    >>> dataset_file = "path/to/text_file_path"
    >>> # sentences as line data saved in a file
    >>> dataset = ds.TextFileDataset(dataset_file, shuffle=False)
    >>> # tokenize sentences into unicode characters
    >>> tokenizer = text.UnicodeCharTokenizer()
    >>> # load vocabulary from a list
    >>> vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您'])
    >>> # lookup is an operation for mapping tokens to ids
    >>> lookup = text.Lookup(vocab)
    >>> dataset = dataset.map(operations=[tokenizer, lookup])
    >>> for i in dataset.create_dict_iterator():
    >>>     print(i)
    >>> # if a text line in dataset_file is:
    >>> # 深圳欢迎您
    >>> # then the output will be:
    >>> # {'text': array([0, 1, 2, 3, 4], dtype=int32)}
"""
import os
import re
import platform
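The .. Note:: block about storing constructor arguments on self refers to a pattern like the following. This is a hypothetical sketch only; the class and argument names are illustrative and not taken from the diff.

# Hypothetical sketch of the convention described in the .. Note:: block:
# every constructor argument is mirrored into a self.xxx attribute so the
# operator can be serialized by save() and rebuilt by load().
class MyTextOp:
    def __init__(self, vocab, unknown_token='[UNK]'):
        self.vocab = vocab                  # saved into a class attribute
        self.unknown_token = unknown_token  # saved into a class attribute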
@@ -203,8 +231,8 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp):

    Args:
        vocab (Vocab): a Vocab object.
        suffix_indicator (str, optional): Used to show that the subword is the last part of a word(default '##').
        max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split(default 100).
        suffix_indicator (str, optional): Used to show that the subword is the last part of a word(default='##').
        max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split(default=100).
        unknown_token (str, optional): When the token can not be found: if 'unknown_token' is an empty string,
            return the token directly, else return 'unknown_token'(default='[UNK]').
    """
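A hedged usage sketch for the arguments above; the vocabulary content is illustrative. WordPiece splits a word into subwords found in the vocab, marking non-initial pieces with the suffix indicator, and falls back to unknown_token when no split is possible.

>>> import mindspore.dataset.text as text
>>> vocab = text.Vocab.from_list(['my', 'favor', '##ite', '[UNK]'])
>>> tokenizer = text.WordpieceTokenizer(vocab, suffix_indicator='##',
...                                     max_bytes_per_token=100, unknown_token='[UNK]')
>>> # applied via dataset.map, an input token "favorite" would come out
>>> # roughly as ["favor", "##ite"], and a token not covered by the vocab
>>> # would come out as "[UNK]"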
@@ -299,7 +327,7 @@ if platform.system().lower() != 'windows':
                The original string will be split by matched elements.
            keep_delim_pattern(str, optional): The string matched by 'delim_pattern' can be kept as a token
                if it can be matched by 'keep_delim_pattern'. And the default value is empty str(''),
                in this situation, delimiters will not kept as a output token.
                in this situation, delimiters will not be kept as an output token(default='').
        """

        def __init__(self, delim_pattern, keep_delim_pattern=''):
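The class name sits outside this hunk, but the parameters match the regex-based tokenizer; a hedged sketch of the delimiter-keeping behaviour (available on non-Windows platforms only, per the surrounding platform check):

>>> import mindspore.dataset.text as text
>>> # split on whitespace and drop the delimiters
>>> tokenizer = text.RegexTokenizer(delim_pattern='\\s+')
>>> # split on whitespace but keep each matched delimiter as a token,
>>> # because it is also matched by keep_delim_pattern
>>> tokenizer_keep = text.RegexTokenizer(delim_pattern='\\s+', keep_delim_pattern='\\s+')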
@@ -12,7 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Some basic function for text
The module text.utils provides some general methods for NLP text processing.
For example, you can use Vocab to build a dictionary,
and use to_bytes and to_str to encode and decode strings into a specified format.
"""
from enum import IntEnum
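A hedged sketch of the encode/decode helpers mentioned above; the input array is illustrative. to_str decodes a numpy array of byte strings into unicode strings, and to_bytes does the reverse.

>>> import numpy as np
>>> import mindspore.dataset.text as text
>>> arr = np.array([b'hello', b'world'])
>>> strings = text.to_str(arr, encoding='utf8')     # array(['hello', 'world'], dtype='<U5')
>>> back = text.to_bytes(strings, encoding='utf8')  # back to a bytes array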
@@ -52,12 +54,12 @@ class Vocab(cde.Vocab):
                min_frequency/max_frequency can be None, which corresponds to 0/total_words separately
                (default=None, all words are included).
            top_k(int, optional): top_k > 0. Number of words to be built into vocab. top_k most frequent words are
                taken. top_k is taken after freq_range. If not enough top_k, all words will be taken. (default=None,
                taken. top_k is taken after freq_range. If not enough top_k, all words will be taken (default=None,
                all words are included).
            special_tokens(list, optional): a list of strings, each one is a special token. for example
                special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
            special_first(bool, optional): whether special_tokens will be prepended/appended to vocab. If special_tokens
                is specified and special_first is set to None, special_tokens will be prepended. (default=None).
                is specified and special_first is set to None, special_tokens will be prepended (default=None).

        Returns:
            Vocab, Vocab object built from dataset.
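A hedged sketch of from_dataset with the arguments documented above; the file path, column name and thresholds are illustrative assumptions.

>>> import mindspore.dataset as ds
>>> import mindspore.dataset.text as text
>>> dataset = ds.TextFileDataset("path/to/text_file_path", shuffle=False)
>>> dataset = dataset.map(operations=text.UnicodeCharTokenizer())
>>> # keep the 100 most frequent tokens seen at least twice,
>>> # and prepend the special tokens to the front of the vocab
>>> vocab = text.Vocab.from_dataset(dataset, columns=["text"], freq_range=(2, None),
...                                 top_k=100, special_tokens=["<pad>", "<unk>"],
...                                 special_first=True)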
@@ -81,7 +83,7 @@ class Vocab(cde.Vocab):
            special_tokens(list, optional): a list of strings, each one is a special token. for example
                special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
            special_first(bool, optional): whether special_tokens will be prepended/appended to vocab. If special_tokens
                is specified and special_first is set to None, special_tokens will be prepended. (default=None).
                is specified and special_first is set to None, special_tokens will be prepended (default=None).
        """

        return super().from_list(word_list, special_tokens, special_first)
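A hedged sketch of special_tokens and special_first with from_list; the ids shown in the comments follow from whether the special tokens occupy the lowest or the highest indices.

>>> import mindspore.dataset.text as text
>>> # special tokens prepended: <pad>=0, <unk>=1, 深=2, 圳=3, ...
>>> vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您'],
...                              special_tokens=["<pad>", "<unk>"], special_first=True)
>>> # special tokens appended instead: 深=0, ..., 您=4, <pad>=5, <unk>=6
>>> vocab_tail = text.Vocab.from_list(['深', '圳', '欢', '迎', '您'],
...                                   special_tokens=["<pad>", "<unk>"], special_first=False)
>>> lookup = text.Lookup(vocab)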
@@ -101,7 +103,7 @@ class Vocab(cde.Vocab):
                special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
            special_first (bool, optional): whether special_tokens will be prepended/appended to vocab.
                If special_tokens is specified and special_first is set to None,
                special_tokens will be prepended. (default=None).
                special_tokens will be prepended (default=None).
        """

        return super().from_file(file_path, delimiter, vocab_size, special_tokens, special_first)
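A hedged sketch of from_file, matching the arguments forwarded to the C++ layer above; the vocab file path and its comma-delimited layout are assumptions.

>>> import mindspore.dataset.text as text
>>> # each line of vocab.txt holds one word, optionally followed by extra
>>> # comma-separated fields which the delimiter argument strips off
>>> vocab = text.Vocab.from_file("path/to/vocab.txt", delimiter=",",
...                              vocab_size=10000,
...                              special_tokens=["<unk>"], special_first=True)
>>> lookup = text.Lookup(vocab)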
@@ -157,12 +159,14 @@ def to_bytes(array, encoding='utf8'):


class JiebaMode(IntEnum):
    """An enumeration for JiebaTokenizer, effective enumeration types are MIX, MP, HMM."""
    MIX = 0
    MP = 1
    HMM = 2


class NormalizeForm(IntEnum):
    """An enumeration for NormalizeUTF8, effective enumeration types are NONE, NFC, NFKC, NFD, NFKD."""
    NONE = 0
    NFC = 1
    NFKC = 2
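The diff is cut off here, so the remaining NFD/NFKD members are not shown. A hedged sketch of how the two enums are consumed; the dictionary file paths passed to JiebaTokenizer are placeholders.

>>> import mindspore.dataset.text as text
>>> from mindspore.dataset.text import JiebaMode, NormalizeForm
>>> # choose the jieba segmentation algorithm through JiebaMode
>>> jieba_op = text.JiebaTokenizer("path/to/hmm_model.utf8", "path/to/jieba.dict.utf8",
...                                mode=JiebaMode.MP)
>>> # choose the unicode normalization form through NormalizeForm (non-Windows only)
>>> normalize_op = text.NormalizeUTF8(normalize_form=NormalizeForm.NFKC)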