From e0aaf847aa2219c03f2c371caceeab768a38fe39 Mon Sep 17 00:00:00 2001 From: Cathy Wong Date: Tue, 29 Sep 2020 16:46:56 -0400 Subject: [PATCH] dataset API docstring: Update/add text examples --- mindspore/dataset/core/config.py | 2 +- mindspore/dataset/text/transforms.py | 177 +++++++++++++----- mindspore/dataset/transforms/c_transforms.py | 1 + mindspore/dataset/transforms/py_transforms.py | 14 +- 4 files changed, 141 insertions(+), 53 deletions(-) diff --git a/mindspore/dataset/core/config.py b/mindspore/dataset/core/config.py index b279c69e615..e50a5983b56 100644 --- a/mindspore/dataset/core/config.py +++ b/mindspore/dataset/core/config.py @@ -159,7 +159,7 @@ def get_monitor_sampling_interval(): Get the default interval of performance monitor sampling. Returns: - Interval: interval (in milliseconds) for performance monitor sampling. + Int, interval (in milliseconds) for performance monitor sampling. """ return _config.get_monitor_sampling_interval() diff --git a/mindspore/dataset/text/transforms.py b/mindspore/dataset/text/transforms.py index 16886b7b1e3..7ff283933d7 100644 --- a/mindspore/dataset/text/transforms.py +++ b/mindspore/dataset/text/transforms.py @@ -28,16 +28,16 @@ Examples: >>> import mindspore.dataset.text as text >>> >>> dataset_file = "path/to/text_file_path" - >>> # sentences as line data saved in a file - >>> dataset = ds.TextFileDataset(dataset_file, shuffle=False) - >>> # tokenize sentence to unicode characters + >>> # Create a dataset for text sentences saved as line data in a file + >>> data1 = ds.TextFileDataset(dataset_file, shuffle=False) + >>> # Tokenize sentences to unicode characters >>> tokenizer = text.UnicodeCharTokenizer() - >>> # load vocabulary form list + >>> # Load vocabulary from list >>> vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您']) - >>> # lookup is an operation for mapping tokens to ids + >>> # Use Lookup operator to map tokens to ids >>> lookup = text.Lookup(vocab) - >>> dataset = dataset.map(operations=[tokenizer, lookup]) - >>> for i in dataset.create_dict_iterator(): + >>> data1 = data1.map(operations=[tokenizer, lookup]) + >>> for i in data1.create_dict_iterator(): >>> print(i) >>> # if text line in dataset_file is: >>> # 深圳欢迎您 @@ -69,6 +69,15 @@ class Lookup(cde.LookupOp): unknown_token (str, optional): Word used for lookup if the word being looked up is out-of-vocabulary (OOV). If unknown_token is OOV, a runtime error will be thrown (default=None). data_type (mindspore.dtype, optional): mindspore.dtype that lookup maps string to (default=mstype.int32) + + Examples: + >>> import mindspore.dataset.text as text + >>> + >>> # Load vocabulary from list + >>> vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您']) + >>> # Use Lookup operator to map tokens to ids + >>> lookup = text.Lookup(vocab, "") + >>> data1 = data1.map(operations=[lookup]) """ @check_lookup @@ -86,12 +95,14 @@ class SlidingWindow(cde.SlidingWindowOp): axis (int, optional): The axis along which the sliding window is computed (default=0). Examples: + >>> import mindspore.dataset.text as text + >>> >>> # Data before >>> # | col1 | >>> # +-------------+ >>> # | [1,2,3,4,5] | >>> # +-------------+ - >>> data = data.map(operations=SlidingWindow(3, 0)) + >>> data1 = data1.map(operations=text.SlidingWindow(3, 0)) >>> # Data after >>> # | col1 | >>> # +-------------+ @@ -125,6 +136,11 @@ class Ngram(cde.NgramOp): separator (str, optional): symbol used to join strings together. For example. 
if 2-gram is ["mindspore", "amazing"] with separator="-", the result would be ["mindspore-amazing"] (default=None, which means whitespace is used). + + Examples: + >>> import mindspore.dataset.text as text + >>> + >>> data1 = data1.map(operations=text.Ngram(3, separator=" ")) """ @check_ngram @@ -157,15 +173,17 @@ class JiebaTokenizer(cde.JiebaTokenizerOp): with_offsets (bool, optional): If or not output offsets of tokens (default=False). Examples: + >>> import mindspore.dataset.text as text + >>> >>> # If with_offsets=False, default output one column {["text", dtype=str]} - >>> tokenizer_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=False) - >>> data = data.map(operations=tokenizer_op) + >>> tokenizer_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=False) + >>> data1 = data1.map(operations=tokenizer_op) >>> # If with_offsets=False, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32], >>> # ["offsets_limit", dtype=uint32]} - >>> tokenizer_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True) - >>> data = data.map(operations=tokenizer_op, input_columns=["text"], - >>> output_columns=["token", "offsets_start", "offsets_limit"], - >>> column_order=["token", "offsets_start", "offsets_limit"]) + >>> tokenizer_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True) + >>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"], + >>> output_columns=["token", "offsets_start", "offsets_limit"], + >>> column_order=["token", "offsets_start", "offsets_limit"]) """ @check_jieba_init @@ -191,6 +209,16 @@ class JiebaTokenizer(cde.JiebaTokenizerOp): The added word will not be written into the built-in dictionary on disk. freq (int, optional): The frequency of the word to be added. The higher the frequency, the better chance the word will be tokenized (default=None, use default frequency). + + Examples: + >>> import mindspore.dataset.text as text + >>> + >>> jieba_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=text.JiebaMode.MP) + >>> with open(VOCAB_FILE, 'r') as f: + >>> for line in f: + >>> word = line.split(',')[0] + >>> jieba_op.add_word(word) + >>> data1 = data1.map(operations=jieba_op, input_columns=["text"]) """ if freq is None: @@ -213,6 +241,14 @@ class JiebaTokenizer(cde.JiebaTokenizerOp): word1 freq1 word2 word3 freq3 + + Examples: + >>> import mindspore.dataset.text as text + >>> + >>> user_dict = {"男默女泪": 10} + >>> jieba_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) + >>> jieba_op.add_dict(user_dict) + >>> data1 = data1.map(operations=jieba_op, input_columns=["text"]) """ if isinstance(user_dict, str): @@ -277,15 +313,17 @@ class UnicodeCharTokenizer(cde.UnicodeCharTokenizerOp): with_offsets (bool, optional): If or not output offsets of tokens (default=False). 
Examples: + >>> import mindspore.dataset.text as text + >>> >>> # If with_offsets=False, default output one column {["text", dtype=str]} >>> tokenizer_op = text.UnicodeCharTokenizer() - >>> dataset = dataset.map(operations=tokenizer_op) + >>> data1 = data1.map(operations=tokenizer_op) >>> # If with_offsets=False, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32], >>> # ["offsets_limit", dtype=uint32]} >>> tokenizer_op = text.UnicodeCharTokenizer(True) - >>> data = data.map(operations=tokenizer_op, input_columns=["text"], - >>> output_columns=["token", "offsets_start", "offsets_limit"], - >>> column_order=["token", "offsets_start", "offsets_limit"]) + >>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"], + >>> output_columns=["token", "offsets_start", "offsets_limit"], + >>> column_order=["token", "offsets_start", "offsets_limit"]) """ @check_with_offsets @@ -307,17 +345,19 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp): with_offsets (bool, optional): If or not output offsets of tokens (default=False). Examples: + >>> import mindspore.dataset.text as text + >>> >>> # If with_offsets=False, default output one column {["text", dtype=str]} >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token=['UNK'], >>> max_bytes_per_token=100, with_offsets=False) - >>> dataset = dataset.map(operations=tokenizer_op) + >>> data1 = data1.map(operations=tokenizer_op) >>> # If with_offsets=False, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32], >>> # ["offsets_limit", dtype=uint32]} >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token=['UNK'], >>> max_bytes_per_token=100, with_offsets=True) - >>> data = data.map(operations=tokenizer_op, - >>> input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], - >>> column_order=["token", "offsets_start", "offsets_limit"]) + >>> data2 = data2.map(operations=tokenizer_op, + >>> input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], + >>> column_order=["token", "offsets_start", "offsets_limit"]) """ @check_wordpiece_tokenizer @@ -351,6 +391,13 @@ class SentencePieceTokenizer(cde.SentencePieceTokenizerOp): mode (Union[str, SentencePieceVocab]): If the input parameter is a file, then it is of type string. If the input parameter is a SentencePieceVocab object, then it is of type SentencePieceVocab. out_type (Union[str, int]): The type of output. + + Examples: + >>> import mindspore.dataset.text as text + >>> + >>> vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {}) + >>> tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING) + >>> data1 = data1.map(operations=tokenizer) """ def __init__(self, mode, out_type): @@ -374,16 +421,18 @@ if platform.system().lower() != 'windows': with_offsets (bool, optional): If or not output offsets of tokens (default=False). 
Examples: + >>> import mindspore.dataset.text as text + >>> >>> # If with_offsets=False, default output one column {["text", dtype=str]} >>> tokenizer_op = text.WhitespaceTokenizer() - >>> dataset = dataset.map(operations=tokenizer_op) + >>> data1 = data1.map(operations=tokenizer_op) >>> # If with_offsets=False, then output three columns {["token", dtype=str], >>> # ["offsets_start", dtype=uint32], >>> # ["offsets_limit", dtype=uint32]} >>> tokenizer_op = text.WhitespaceTokenizer(True) - >>> data = data.map(operations=tokenizer_op, input_columns=["text"], - >>> output_columns=["token", "offsets_start", "offsets_limit"], - >>> column_order=["token", "offsets_start", "offsets_limit"]) + >>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"], + >>> output_columns=["token", "offsets_start", "offsets_limit"], + >>> column_order=["token", "offsets_start", "offsets_limit"]) """ @check_with_offsets @@ -401,16 +450,18 @@ if platform.system().lower() != 'windows': with_offsets (bool, optional): If or not output offsets of tokens (default=False). Examples: + >>> import mindspore.dataset.text as text + >>> >>> # If with_offsets=False, default output one column {["text", dtype=str]} >>> tokenizer_op = text.UnicodeScriptTokenizerOp(keep_whitespace=True, with_offsets=False) - >>> dataset = dataset.map(operations=tokenizer_op) + >>> data1 = data1.map(operations=tokenizer_op) >>> # If with_offsets=False, then output three columns {["token", dtype=str], >>> # ["offsets_start", dtype=uint32], >>> # ["offsets_limit", dtype=uint32]} >>> tokenizer_op = text.UnicodeScriptTokenizerOp(keep_whitespace=True, with_offsets=True) - >>> data = data.map(operations=tokenizer_op, input_columns=["text"], - >>> output_columns=["token", "offsets_start", "offsets_limit"], - >>> column_order=["token", "offsets_start", "offsets_limit"]) + >>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"], + >>> output_columns=["token", "offsets_start", "offsets_limit"], + >>> column_order=["token", "offsets_start", "offsets_limit"]) """ @check_unicode_script_tokenizer @@ -423,6 +474,12 @@ if platform.system().lower() != 'windows': class CaseFold(cde.CaseFoldOp): """ Apply case fold operation on utf-8 string tensor. + + Examples: + >>> import mindspore.dataset.text as text + >>> + >>> case_op = text.CaseFold() + >>> data1 = data1.map(operations=case_op) """ @@ -434,7 +491,6 @@ if platform.system().lower() != 'windows': NormalizeForm.NFKD: cde.NormalizeForm.DE_NORMALIZE_NFKD } - class NormalizeUTF8(cde.NormalizeUTF8Op): """ Apply normalize operation on utf-8 string tensor. @@ -450,6 +506,12 @@ if platform.system().lower() != 'windows': - NormalizeForm.NFKC, normalize with Normalization Form KC. - NormalizeForm.NFD, normalize with Normalization Form D. - NormalizeForm.NFKD, normalize with Normalization Form KD. + + Examples: + >>> import mindspore.dataset.text as text + >>> + >>> normalize_op = text.NormalizeUTF8(normalize_form=NormalizeForm.NFC) + >>> data1 = data1.map(operations=normalize_op) """ def __init__(self, normalize_form=NormalizeForm.NFKC): @@ -471,6 +533,14 @@ if platform.system().lower() != 'windows': replace (str): the string to replace matched element. replace_all (bool, optional): If False, only replace first matched element; if True, replace all matched elements (default=True). 
+ + Examples: + >>> import mindspore.dataset.text as text + >>> + >>> pattern = 'Canada' + >>> replace = 'China' + >>> replace_op = text.RegexReplace(pattern, replace) + >>> data1 = data1.map(operations=replace_op) """ def __init__(self, pattern, replace, replace_all=True): @@ -495,16 +565,18 @@ if platform.system().lower() != 'windows': with_offsets (bool, optional): If or not output offsets of tokens (default=False). Examples: + >>> import mindspore.dataset.text as text + >>> >>> # If with_offsets=False, default output one column {["text", dtype=str]} >>> tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern, with_offsets=False) - >>> dataset = dataset.map(operations=tokenizer_op) + >>> data1 = data1.map(operations=tokenizer_op) >>> # If with_offsets=False, then output three columns {["token", dtype=str], >>> # ["offsets_start", dtype=uint32], >>> # ["offsets_limit", dtype=uint32]} >>> tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern, with_offsets=True) - >>> data = data.map(operations=tokenizer_op, input_columns=["text"], - >>> output_columns=["token", "offsets_start", "offsets_limit"], - >>> column_order=["token", "offsets_start", "offsets_limit"]) + >>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"], + >>> output_columns=["token", "offsets_start", "offsets_limit"], + >>> column_order=["token", "offsets_start", "offsets_limit"]) """ @check_regex_tokenizer @@ -531,13 +603,15 @@ if platform.system().lower() != 'windows': with_offsets (bool, optional): If or not output offsets of tokens (default=False). Examples: + >>> import mindspore.dataset.text as text + >>> >>> # If with_offsets=False, default output one column {["text", dtype=str]} >>> tokenizer_op = text.BasicTokenizer(lower_case=False, >>> keep_whitespace=False, >>> normalization_form=NormalizeForm.NONE, >>> preserve_unused_token=True, >>> with_offsets=False) - >>> dataset = dataset.map(operations=tokenizer_op) + >>> data1 = data1.map(operations=tokenizer_op) >>> # If with_offsets=False, then output three columns {["token", dtype=str], >>> # ["offsets_start", dtype=uint32], >>> # ["offsets_limit", dtype=uint32]} @@ -546,9 +620,9 @@ if platform.system().lower() != 'windows': >>> normalization_form=NormalizeForm.NONE, >>> preserve_unused_token=True, >>> with_offsets=True) - >>> data = data.map(operations=tokenizer_op, input_columns=["text"], - >>> output_columns=["token", "offsets_start", "offsets_limit"], - >>> column_order=["token", "offsets_start", "offsets_limit"]) + >>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"], + >>> output_columns=["token", "offsets_start", "offsets_limit"], + >>> column_order=["token", "offsets_start", "offsets_limit"]) """ @check_basic_tokenizer @@ -587,12 +661,14 @@ if platform.system().lower() != 'windows': with_offsets (bool, optional): If or not output offsets of tokens (default=False). 
         Examples:
+            >>> import mindspore.dataset.text as text
+            >>>
             >>> # If with_offsets=False, default output one column {["text", dtype=str]}
             >>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100,
             >>>                                   unknown_token=100, lower_case=False, keep_whitespace=False,
             >>>                                   normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
             >>>                                   with_offsets=False)
-            >>> dataset = dataset.map(operations=tokenizer_op)
+            >>> data1 = data1.map(operations=tokenizer_op)
             >>> # If with_offsets=False, then output three columns {["token", dtype=str],
             >>> #                                                   ["offsets_start", dtype=uint32],
             >>> #                                                   ["offsets_limit", dtype=uint32]}
             >>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100,
             >>>                                   unknown_token=100, lower_case=False, keep_whitespace=False,
             >>>                                   normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
             >>>                                   with_offsets=True)
-            >>> data = data.map(operations=tokenizer_op, input_columns=["text"],
-            >>>                 output_columns=["token", "offsets_start", "offsets_limit"],
-            >>>                 column_order=["token", "offsets_start", "offsets_limit"])
+            >>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"],
+            >>>                   output_columns=["token", "offsets_start", "offsets_limit"],
+            >>>                   column_order=["token", "offsets_start", "offsets_limit"])
         """
 
         @check_bert_tokenizer
@@ -636,12 +712,14 @@ class TruncateSequencePair(cde.TruncateSequencePairOp):
         max_length (int): Maximum length required.
 
     Examples:
+        >>> import mindspore.dataset.text as text
+        >>>
         >>> # Data before
        >>> # | col1    | col2    |
         >>> # +---------+---------|
         >>> # | [1,2,3] | [4,5]   |
         >>> # +---------+---------+
-        >>> data = data.map(operations=TruncateSequencePair(4))
+        >>> data1 = data1.map(operations=text.TruncateSequencePair(4))
         >>> # Data after
         >>> # | col1    | col2    |
         >>> # +---------+---------+
@@ -670,6 +748,13 @@ class ToNumber(cde.ToNumberOp):
 
     Raises:
         RuntimeError: If strings are invalid to cast, or are out of range after being casted.
+
+    Examples:
+        >>> import mindspore.dataset.text as text
+        >>> import mindspore.common.dtype as mstype
+        >>>
+        >>> to_number_op = text.ToNumber(mstype.int8)
+        >>> data1 = data1.map(operations=to_number_op)
     """
 
     @check_to_number
@@ -687,9 +772,11 @@ class PythonTokenizer:
         tokenizer (Callable): Python function that takes a `str` and returns a list of `str` as tokens.
 
     Examples:
+        >>> import mindspore.dataset.text as text
+        >>>
         >>> def my_tokenizer(line):
         >>>     return line.split()
-        >>> data = data.map(operations=PythonTokenizer(my_tokenizer))
+        >>> data1 = data1.map(operations=text.PythonTokenizer(my_tokenizer))
     """
 
     @check_python_tokenizer
diff --git a/mindspore/dataset/transforms/c_transforms.py b/mindspore/dataset/transforms/c_transforms.py
index 243e0477e5c..7c19bfd4295 100644
--- a/mindspore/dataset/transforms/c_transforms.py
+++ b/mindspore/dataset/transforms/c_transforms.py
@@ -83,6 +83,7 @@ class TypeCast(cde.TypeCastOp):
 
     Examples:
         >>> import mindspore.dataset.transforms.c_transforms as c_transforms
+        >>> import mindspore.common.dtype as mstype
         >>>
         >>> type_cast_op = c_transforms.TypeCast(mstype.int32)
     """
diff --git a/mindspore/dataset/transforms/py_transforms.py b/mindspore/dataset/transforms/py_transforms.py
index 610907586dd..8814a264b81 100644
--- a/mindspore/dataset/transforms/py_transforms.py
+++ b/mindspore/dataset/transforms/py_transforms.py
@@ -77,7 +77,7 @@ class Compose:
         >>>
         >>> dataset_dir = "path/to/imagefolder_directory"
         >>> # create a dataset that reads all files in dataset_dir with 8 threads
-        >>> dataset = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=8)
+        >>> data1 = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=8)
         >>> # create a list of transformations to be applied to the image data
         >>> transform = py_transforms.Compose([py_vision.Decode(),
         >>>                                    py_vision.RandomHorizontalFlip(0.5),
@@ -85,7 +85,7 @@
         >>>                                    py_vision.Normalize((0.491, 0.482, 0.447), (0.247, 0.243, 0.262)),
         >>>                                    py_vision.RandomErasing()])
         >>> # apply the transform to the dataset through dataset.map()
-        >>> dataset = dataset.map(operations=transform, input_columns="image")
+        >>> data1 = data1.map(operations=transform, input_columns="image")
         >>>
         >>> # Compose is also be invoked implicitly, by just passing in a list of ops
         >>> # the above example then becomes:
@@ -96,7 +96,7 @@
         >>>                   py_vision.RandomErasing()]
         >>>
         >>> # apply the transform to the dataset through dataset.map()
-        >>> dataset = dataset.map(operations=transform_list, input_columns="image")
+        >>> data2 = data2.map(operations=transform_list, input_columns="image")
         >>>
         >>> # Certain C++ and Python ops can be combined, but not all of them
         >>> # An example of combined operations
@@ -104,20 +104,20 @@
         >>> import mindspore.dataset.transforms.c_transforms as c_transforms
         >>> import mindspore.dataset.vision.c_transforms as c_vision
         >>>
-        >>> data = ds.NumpySlicesDataset(arr, column_names=["cols"], shuffle=False)
+        >>> data3 = ds.NumpySlicesDataset(arr, column_names=["cols"], shuffle=False)
         >>> transformed_list = [py_transforms.OneHotOp(2), c_transforms.Mask(c_transforms.Relational.EQ, 1)]
-        >>> data = data.map(operations=transformed_list, input_columns=["cols"])
+        >>> data3 = data3.map(operations=transformed_list, input_columns=["cols"])
         >>>
         >>> # Here is an example of mixing vision ops
         >>> data_dir = "/path/to/imagefolder_directory"
-        >>> data1 = ds.ImageFolderDataset(dataset_dir=data_dir, shuffle=False)
+        >>> data4 = ds.ImageFolderDataset(dataset_dir=data_dir, shuffle=False)
         >>> input_columns = ["column_names"]
         >>> op_list=[c_vision.Decode(),
         >>>          c_vision.Resize((224, 244)),
         >>>          py_vision.ToPIL(),
         >>>          np.array, # need to convert PIL image to a NumPy array to pass it to C++ operation
         >>>          c_vision.Resize((24, 24))]
-        >>> data1 = data1.map(operations=op_list, input_columns=input_columns)
+        >>> data4 = data4.map(operations=op_list, input_columns=input_columns)
     """
 
     @check_compose_list
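
Reviewer note (illustrative only, not part of the diff): the sketch below simply strings together the examples this patch adds — TextFileDataset, UnicodeCharTokenizer, Vocab/Lookup, map — into one self-contained pipeline. The file path is a placeholder and, as in the updated module docstring, the input file is assumed to contain lines made up of the vocabulary characters (e.g. "深圳欢迎您"); other characters would be out-of-vocabulary for this tiny Vocab.

    import mindspore.dataset as ds
    import mindspore.dataset.text as text

    # Placeholder path; point this at a real line-oriented text file.
    dataset_file = "path/to/text_file_path"

    # Read lines, split each line into unicode characters, then map characters
    # to ids, mirroring the pipeline shown in the updated docstrings.
    data1 = ds.TextFileDataset(dataset_file, shuffle=False)
    tokenizer = text.UnicodeCharTokenizer()
    vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您'])
    lookup = text.Lookup(vocab)
    data1 = data1.map(operations=[tokenizer, lookup])

    for row in data1.create_dict_iterator():
        print(row)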