diff --git a/mindspore/dataset/text/transforms.py b/mindspore/dataset/text/transforms.py
index fbed2194da5..9f91bd8a27e 100644
--- a/mindspore/dataset/text/transforms.py
+++ b/mindspore/dataset/text/transforms.py
@@ -327,9 +327,10 @@ class SentencePieceTokenizer(TextTensorOperation):
     Tokenize scalar token or 1-D tokens to tokens by sentencepiece.

     Args:
-        mode (Union[str, SentencePieceVocab]): If the input parameter is a file, then it is of type string.
-            If the input parameter is a SentencePieceVocab object, then it is of type SentencePieceVocab.
-        out_type (SPieceTokenizerOutType): The type of output, the type is int or string
+        mode (Union[str, SentencePieceVocab]): If the input parameter is a file, then its type should be string.
+            If the input parameter is a SentencePieceVocab object, then its type should be SentencePieceVocab.
+        out_type (SPieceTokenizerOutType): The type of the output. It can be any of [SPieceTokenizerOutType.STRING,
+            SPieceTokenizerOutType.INT].

     Examples:
         >>> from mindspore.dataset.text import SentencePieceModel, SPieceTokenizerOutType
@@ -350,7 +351,7 @@ class SentencePieceTokenizer(TextTensorOperation):

 class SlidingWindow(TextTensorOperation):
     """
-    TensorOp to construct a tensor from data (only 1-D for now), where each element in the dimension axis
+    Construct a tensor from given data (only 1-D data is supported for now), where each element in the dimension axis
     is a slice of data starting at the corresponding position, with a specified width.

     Args:
@@ -387,15 +388,13 @@ class ToNumber(TextTensorOperation):
     """
     Tensor operation to convert every element of a string tensor to a number.

-    Strings are cast according to the rules specified in the following links:
+    Strings are cast according to the rules specified in the following links, except that any strings which represent
+    negative numbers cannot be cast to an unsigned integer type. The rules are described at the following links:
     https://en.cppreference.com/w/cpp/string/basic_string/stof,
     https://en.cppreference.com/w/cpp/string/basic_string/stoul,
-    except that any strings which represent negative numbers cannot be cast to an
-    unsigned integer type.

     Args:
-        data_type (mindspore.dtype): mindspore.dtype to be cast to. Must be
-            a numeric type.
+        data_type (mindspore.dtype): Type to be cast to. Must be a numeric type in mindspore.dtype.

     Raises:
         RuntimeError: If strings are invalid to cast, or are out of range after being cast.
@@ -521,7 +520,7 @@ class WordpieceTokenizer(TextTensorOperation):

 class PythonTokenizer:
     """
-    Callable class to be used for user-defined string tokenizer.
+    Class that applies a user-defined string tokenizer to the input string.

     Args:
         tokenizer (Callable): Python function that takes a `str` and returns a list of `str` as tokens.
@@ -752,9 +751,9 @@ if platform.system().lower() != 'windows':

     class RegexReplace(TextTensorOperation):
         """
-        Replace UTF-8 string tensor with 'replace' according to regular expression 'pattern'.
+        Replace a part of a UTF-8 string tensor with given text according to regular expressions.

-        See http://userguide.icu-project.org/strings/regexp for support regex pattern.
+        See http://userguide.icu-project.org/strings/regexp for supported regex patterns.

         Note:
             RegexReplace is not supported on Windows platform yet.
@@ -786,7 +785,7 @@ if platform.system().lower() != 'windows':
         """
         Tokenize a scalar tensor of UTF-8 string by regex expression pattern.

-        See http://userguide.icu-project.org/strings/regexp for support regex pattern.
+        See http://userguide.icu-project.org/strings/regexp for supported regex patterns.

         Note:
             RegexTokenizer is not supported on Windows platform yet.
@@ -795,16 +794,16 @@ if platform.system().lower() != 'windows':
            delim_pattern (str): The pattern of regex delimiters.
                The original string will be split by matched elements.
            keep_delim_pattern (str, optional): The string matched by 'delim_pattern' can be kept as a token
-               if it can be matched by 'keep_delim_pattern'. The default value is an empty str ('')
+               if it can be matched by 'keep_delim_pattern'. The default value is an empty str which means that
                delimiters will not be kept as an output token (default='').
-           with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).
+           with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False).

        Examples:
-           >>> # If with_offsets=False, default output one column {["text", dtype=str]}
+           >>> # If with_offsets=False, the default output is one column {["text", dtype=str]}
            >>> delim_pattern = r"[ |,]"
            >>> tokenizer_op = text.RegexTokenizer(delim_pattern, with_offsets=False)
            >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
-           >>> # If with_offsets=False, then output three columns {["token", dtype=str],
+           >>> # If with_offsets=True, then output three columns {["token", dtype=str],
            >>> #                                                  ["offsets_start", dtype=uint32],
            >>> #                                                  ["offsets_limit", dtype=uint32]}
            >>> tokenizer_op = text.RegexTokenizer(delim_pattern, with_offsets=True)
@@ -827,7 +826,7 @@ if platform.system().lower() != 'windows':

    class UnicodeScriptTokenizer(TextTensorOperation):
        """
-       Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.
+       Tokenize a scalar tensor of UTF-8 string based on Unicode script boundaries.

        Note:
            UnicodeScriptTokenizer is not supported on Windows platform yet.
@@ -840,9 +839,9 @@ if platform.system().lower() != 'windows':
            >>> # If with_offsets=False, default output one column {["text", dtype=str]}
            >>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=False)
            >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
-           >>> # If with_offsets=False, then output three columns {["token", dtype=str],
-           >>> # ["offsets_start", dtype=uint32],
-           >>> # ["offsets_limit", dtype=uint32]}
+           >>> # If with_offsets=True, then output three columns {["token", dtype=str],
+           >>> #                                                  ["offsets_start", dtype=uint32],
+           >>> #                                                  ["offsets_limit", dtype=uint32]}
            >>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=True)
            >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op, input_columns=["text"],
            ...                                           output_columns=["token", "offsets_start", "offsets_limit"],
diff --git a/mindspore/dataset/text/utils.py b/mindspore/dataset/text/utils.py
index 7dde91a4c95..143d5a9ccd2 100644
--- a/mindspore/dataset/text/utils.py
+++ b/mindspore/dataset/text/utils.py
@@ -56,12 +56,12 @@ class Vocab(cde.Vocab):
                min_frequency=1. max_frequency > total_words is the same as max_frequency = total_words.
                min_frequency/max_frequency can be None, which corresponds to 0/total_words separately
                (default=None, all words are included).
-           top_k(int, optional): top_k > 0. Number of words to be built into vocab. top_k most frequent words are
-               taken. top_k is taken after freq_range. If not enough top_k, all words will be taken (default=None,
-               all words are included).
-           special_tokens(list, optional): a list of strings, each one is a special token. for example
+           top_k(int, optional): Number of words to be built into vocab, where top_k must be greater than 0. The
+               top_k most frequent words are taken.
+               top_k is taken after freq_range. If there are fewer than top_k words, all words will be taken
+               (default=None, all words are included).
+           special_tokens(list, optional): A list of strings, each one is a special token. For example
                special_tokens=["",""] (default=None, no special tokens will be added).
-           special_first(bool, optional): whether special_tokens will be prepended/appended to vocab. If special_tokens
+           special_first(bool, optional): Whether special_tokens will be prepended/appended to vocab. If special_tokens
                is specified and special_first is set to True, special_tokens will be prepended (default=True).

        Returns:
@@ -76,10 +76,10 @@ class Vocab(cde.Vocab):
        Build a vocab object from a list of word.

        Args:
-           word_list(list): a list of string where each element is a word of type string.
-           special_tokens(list, optional): a list of strings, each one is a special token. for example
+           word_list(list): A list of strings where each element is a word of type string.
+           special_tokens(list, optional): A list of strings, each one is a special token. For example
                special_tokens=["",""] (default=None, no special tokens will be added).
-           special_first(bool, optional): whether special_tokens is prepended or appended to vocab. If special_tokens
+           special_first(bool, optional): Whether special_tokens is prepended or appended to vocab. If special_tokens
                is specified and special_first is set to True, special_tokens will be prepended (default=True).

        Returns:
@@ -96,13 +96,13 @@ class Vocab(cde.Vocab):
        Build a vocab object from a list of word.

        Args:
-           file_path (str): path to the file which contains the vocab list.
-           delimiter (str, optional): a delimiter to break up each line in file, the first element is taken to be
+           file_path (str): Path to the file which contains the vocab list.
+           delimiter (str, optional): A delimiter to break up each line in the file, the first element is taken to be
                the word (default="").
-           vocab_size (int, optional): number of words to read from file_path (default=None, all words are taken).
-           special_tokens (list, optional): a list of strings, each one is a special token. for example
+           vocab_size (int, optional): Number of words to read from file_path (default=None, all words are taken).
+           special_tokens (list, optional): A list of strings, each one is a special token. For example
                special_tokens=["",""] (default=None, no special tokens will be added).
-           special_first (bool, optional): whether special_tokens will be prepended/appended to vocab,
+           special_first (bool, optional): Whether special_tokens will be prepended/appended to vocab.
                If special_tokens is specified and special_first is set to True, special_tokens will be prepended
                (default=True).

@@ -122,7 +122,7 @@ class Vocab(cde.Vocab):
        Build a vocab object from a dict.

        Args:
-           word_dict (dict): dict contains word and id pairs, where word should be str and id be int. id is recommended
+           word_dict (dict): Dict that contains word and id pairs, where word should be str and id be int. id is recommended
                to start from 0 and be continuous. ValueError will be raised if id is negative.

        Returns:
@@ -134,24 +134,26 @@ class SentencePieceVocab(cde.SentencePieceVocab):

 class SentencePieceVocab(cde.SentencePieceVocab):
     """
-    SentencePiece obiect that is used to segmentate words
+    SentencePiece object that is used for word segmentation.
     """

     @classmethod
     @check_from_dataset_sentencepiece
     def from_dataset(cls, dataset, col_names, vocab_size, character_coverage, model_type, params):
         """
-        Build a sentencepiece from a dataset
+        Build a SentencePiece object from a dataset.

        Args:
-           dataset(Dataset): Dataset to build sentencepiece.
+           dataset(Dataset): Dataset to build SentencePiece.
            col_names(list): The list of the col name.
            vocab_size(int): Vocabulary size.
            character_coverage(float): Amount of characters covered by the model, good defaults are: 0.9995 for
-               languages. with rich character set like Japanese or Chinese and 1.0 for other languages with small
+               languages with rich character set like Japanese or Chinese and 1.0 for other languages with small
                character set.
-           model_type(SentencePieceModel): Choose from UNIGRAM (default), BPE, CHAR, or WORD. The input sentence
-               must be pretokenized when using word type.
+           model_type(SentencePieceModel): It can be any of [SentencePieceModel.UNIGRAM, SentencePieceModel.BPE,
+               SentencePieceModel.CHAR, SentencePieceModel.WORD], default is SentencePieceModel.UNIGRAM. The input
+               sentence must be pre-tokenized when using word type.
+           params(dict): A dictionary with no incoming parameters.

        Returns:
@@ -168,13 +170,15 @@ class SentencePieceVocab(cde.SentencePieceVocab):
        Build a SentencePiece object from a list of word.

        Args:
-           file_path(list): Path to the file which contains the sentencepiece list.
-           vocab_size(int): Vocabulary size, the type of uint32_t.
+           file_path(list): Path to the file which contains the SentencePiece list.
+           vocab_size(int): Vocabulary size.
            character_coverage(float): Amount of characters covered by the model, good defaults are: 0.9995 for
-               languages. with rich character set like Japanse or Chinese and 1.0 for other languages with small
+               languages with rich character set like Japanese or Chinese and 1.0 for other languages with small
                character set.
-           model_type(SentencePieceModel): Choose from unigram (default), bpe, char, or word. The input sentence
-               must be pretokenized when using word type.
+           model_type(SentencePieceModel): It can be any of [SentencePieceModel.UNIGRAM, SentencePieceModel.BPE,
+               SentencePieceModel.CHAR, SentencePieceModel.WORD], default is SentencePieceModel.UNIGRAM. The input
+               sentence must be pre-tokenized when using word type.
+           params(dict): A dictionary with no incoming parameters (the parameters are derived from the SentencePiece library).

@@ -193,10 +197,10 @@ class SentencePieceVocab(cde.SentencePieceVocab):
    @check_save_model
    def save_model(cls, vocab, path, filename):
        """
-       Save model to filepath
+       Save the model to the given file path.

        Args:
-           vocab(SentencePieceVocab): A sentencepiece object.
+           vocab(SentencePieceVocab): A SentencePiece object.
            path(str): Path to store model.
            filename(str): The name of the file.
        """
@@ -208,7 +212,7 @@ def to_str(array, encoding='utf8'):
    Convert NumPy array of `bytes` to array of `str` by decoding each element based on charset `encoding`.

    Args:
-       array (numpy.ndarray): Array of type `bytes` representing strings.
+       array (numpy.ndarray): Array of `bytes` type representing strings.
        encoding (str): Indicating the charset for decoding.

    Returns:
@@ -226,7 +230,7 @@ def to_bytes(array, encoding='utf8'):
    Convert NumPy array of `str` to array of `bytes` by encoding each element based on charset `encoding`.

    Args:
-       array (numpy.ndarray): Array of type `str` representing strings.
+       array (numpy.ndarray): Array of `str` type representing strings.
        encoding (str): Indicating the charset for encoding.

    Returns:
diff --git a/mindspore/dataset/transforms/c_transforms.py b/mindspore/dataset/transforms/c_transforms.py
index a5545fa6c11..4767162ed44 100644
--- a/mindspore/dataset/transforms/c_transforms.py
+++ b/mindspore/dataset/transforms/c_transforms.py
@@ -169,8 +169,7 @@ class Slice(TensorOperation):
    """
    Slice operation to extract a tensor out using the given n slices.

-   The functionality of Slice is similar to NumPy's indexing feature.
-   (Currently only rank-1 tensors are supported).
+   The functionality of Slice is similar to NumPy's indexing feature (currently only rank-1 tensors are supported).

    Args:
        slices (Union[int, list[int], slice, None, Ellipsis]):
@@ -234,10 +233,11 @@ class Mask(TensorOperation):
    Any element of the tensor that matches the predicate will be evaluated to True, otherwise False.

    Args:
-       operator (Relational): One of the relational operators EQ, NE LT, GT, LE or GE
+       operator (Relational): Relational operator. It can be any of [Relational.EQ, Relational.NE, Relational.LT,
+           Relational.GT, Relational.LE, Relational.GE]. For example, Relational.EQ means equal to.
        constant (Union[str, int, float, bool]): Constant to be compared to.
            Constant will be cast to the type of the input tensor.
-       dtype (mindspore.dtype, optional): Type of the generated mask (Default to bool).
+       dtype (mindspore.dtype, optional): Type of the generated mask (default=mstype.bool_).

    Examples:
        >>> from mindspore.dataset.transforms.c_transforms import Relational
@@ -268,7 +268,7 @@ class Mask(TensorOperation):

 class PadEnd(TensorOperation):
    """
-   Pad input tensor according to pad_shape, need to have same rank.
+   Pad input tensor according to pad_shape; the input tensor needs to have the same rank as pad_shape.

    Args:
        pad_shape (list(int)): List of integers representing the shape needed. Dimensions that set to `None` will