enhance api description

This commit is contained in:
ms_yan 2021-06-07 20:00:36 +08:00
parent 0cd459fb47
commit d480a78c21
3 changed files with 58 additions and 55 deletions


@ -327,9 +327,10 @@ class SentencePieceTokenizer(TextTensorOperation):
Tokenize a scalar token or 1-D tokens into tokens by sentencepiece.
Args:
mode (Union[str, SentencePieceVocab]): If the input parameter is a file, then it is of type string.
If the input parameter is a SentencePieceVocab object, then it is of type SentencePieceVocab.
out_type (SPieceTokenizerOutType): The type of output, the type is int or string
mode (Union[str, SentencePieceVocab]): If the input parameter is a file, then its type should be string.
If the input parameter is a SentencePieceVocab object, then its type should be SentencePieceVocab.
out_type (SPieceTokenizerOutType): The type of the output; it can be any of [SPieceTokenizerOutType.STRING,
SPieceTokenizerOutType.INT].
Examples:
>>> from mindspore.dataset.text import SentencePieceModel, SPieceTokenizerOutType
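A minimal usage sketch for the `mode` and `out_type` parameters described above (a hedged illustration; the model file path and the `text_file_dataset` variable are assumptions, not part of the diff):
>>> import mindspore.dataset.text as text
>>> from mindspore.dataset.text import SPieceTokenizerOutType
>>> # mode given as a path to a trained SentencePiece model file, output emitted as string tokens
>>> tokenizer_op = text.SentencePieceTokenizer("./m.model", out_type=SPieceTokenizerOutType.STRING)
>>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)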
@ -350,7 +351,7 @@ class SentencePieceTokenizer(TextTensorOperation):
class SlidingWindow(TextTensorOperation):
"""
TensorOp to construct a tensor from data (only 1-D for now), where each element in the dimension axis
Construct a tensor from given data (only 1-D is supported for now), where each element in the dimension axis
is a slice of data starting at the corresponding position, with a specified width.
Args:
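A sketch of the sliding-window behaviour on a 1-D input (illustrative toy values; NumpySlicesDataset is only used here to build a small input):
>>> import mindspore.dataset as ds
>>> import mindspore.dataset.text as text
>>> # width=3 on [1, 2, 3, 4, 5] yields [[1, 2, 3], [2, 3, 4], [3, 4, 5]]
>>> dataset = ds.NumpySlicesDataset(data=[[1, 2, 3, 4, 5]], column_names=["col1"], shuffle=False)
>>> dataset = dataset.map(operations=text.SlidingWindow(3, 0))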
@ -387,15 +388,13 @@ class ToNumber(TextTensorOperation):
"""
Tensor operation to convert every element of a string tensor to a number.
Strings are cast according to the rules specified in the following links:
Strings are cast according to the rules specified in the following links, except that any strings which represent
negative numbers cannot be cast to an unsigned integer type. The rule links are as follows:
https://en.cppreference.com/w/cpp/string/basic_string/stof,
https://en.cppreference.com/w/cpp/string/basic_string/stoul,
except that any strings which represent negative numbers cannot be cast to an
unsigned integer type.
Args:
data_type (mindspore.dtype): mindspore.dtype to be cast to. Must be
a numeric type.
data_type (mindspore.dtype): Type to be cast to. Must be a numeric type in mindspore.dtype.
Raises:
RuntimeError: If strings are invalid to cast, or are out of range after being cast.
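A hedged sketch of casting string elements to a numeric mindspore.dtype (toy data; the column name is illustrative):
>>> import mindspore.common.dtype as mstype
>>> import mindspore.dataset as ds
>>> import mindspore.dataset.text as text
>>> # Cast the string elements "1", "2", "3" to int32
>>> dataset = ds.NumpySlicesDataset(data=["1", "2", "3"], column_names=["text"], shuffle=False)
>>> dataset = dataset.map(operations=text.ToNumber(mstype.int32), input_columns=["text"])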
@ -521,7 +520,7 @@ class WordpieceTokenizer(TextTensorOperation):
class PythonTokenizer:
"""
Callable class to be used for user-defined string tokenizer.
Class that applies a user-defined string tokenizer to the input string.
Args:
tokenizer (Callable): Python function that takes a `str` and returns a list of `str` as tokens.
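A minimal sketch of a user-defined tokenizer; `text_file_dataset` follows the dataset name used in the surrounding examples and is assumed to exist:
>>> import mindspore.dataset.text as text
>>> # A plain Python callable that splits a line on whitespace
>>> def my_tokenizer(line):
...     return line.split()
>>> tokenizer_op = text.PythonTokenizer(my_tokenizer)
>>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)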
@ -752,9 +751,9 @@ if platform.system().lower() != 'windows':
class RegexReplace(TextTensorOperation):
"""
Replace UTF-8 string tensor with 'replace' according to regular expression 'pattern'.
Replace a part of UTF-8 string tensor with given text according to regular expressions.
See http://userguide.icu-project.org/strings/regexp for support regex pattern.
See http://userguide.icu-project.org/strings/regexp for supported regex patterns.
Note:
RegexReplace is not supported on Windows platform yet.
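A hedged usage sketch (the pattern and replacement text are illustrative; `text_file_dataset` follows the surrounding examples):
>>> import mindspore.dataset.text as text
>>> # Replace every run of whitespace with a single underscore
>>> replace_op = text.RegexReplace(pattern=r"\s+", replace="_")
>>> text_file_dataset = text_file_dataset.map(operations=replace_op)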
@ -786,7 +785,7 @@ if platform.system().lower() != 'windows':
"""
Tokenize a scalar tensor of UTF-8 string by regex expression pattern.
See http://userguide.icu-project.org/strings/regexp for support regex pattern.
See http://userguide.icu-project.org/strings/regexp for supported regex patterns.
Note:
RegexTokenizer is not supported on Windows platform yet.
@ -795,16 +794,16 @@ if platform.system().lower() != 'windows':
delim_pattern (str): The pattern of regex delimiters.
The original string will be split by matched elements.
keep_delim_pattern (str, optional): The string matched by 'delim_pattern' can be kept as a token
if it can be matched by 'keep_delim_pattern'. The default value is an empty str ('')
if it can be matched by 'keep_delim_pattern'. The default value is an empty str
which means that delimiters will not be kept as an output token (default='').
with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).
with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False).
Examples:
>>> # If with_offsets=False, default output one column {["text", dtype=str]}
>>> # If with_offsets=False, default output is one column {["text", dtype=str]}
>>> delim_pattern = r"[ |,]"
>>> tokenizer_op = text.RegexTokenizer(delim_pattern, with_offsets=False)
>>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
>>> # If with_offsets=False, then output three columns {["token", dtype=str],
>>> # If with_offsets=True, then output three columns {["token", dtype=str],
>>> # ["offsets_start", dtype=uint32],
>>> # ["offsets_limit", dtype=uint32]}
>>> tokenizer_op = text.RegexTokenizer(delim_pattern, with_offsets=True)
@ -827,7 +826,7 @@ if platform.system().lower() != 'windows':
class UnicodeScriptTokenizer(TextTensorOperation):
"""
Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.
Tokenize a scalar tensor of UTF-8 string based on Unicode script boundaries.
Note:
UnicodeScriptTokenizer is not supported on Windows platform yet.
@ -840,9 +839,9 @@ if platform.system().lower() != 'windows':
>>> # If with_offsets=False, default output one column {["text", dtype=str]}
>>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=False)
>>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
>>> # If with_offsets=False, then output three columns {["token", dtype=str],
>>> # ["offsets_start", dtype=uint32],
>>> # ["offsets_limit", dtype=uint32]}
>>> # If with_offsets=True, then output three columns {["token", dtype=str],
>>> # ["offsets_start", dtype=uint32],
>>> # ["offsets_limit", dtype=uint32]}
>>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=True)
>>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op, input_columns=["text"],
... output_columns=["token", "offsets_start", "offsets_limit"],


@ -56,12 +56,12 @@ class Vocab(cde.Vocab):
min_frequency=1. max_frequency > total_words is the same as max_frequency = total_words.
min_frequency/max_frequency can be None, which corresponds to 0/total_words respectively
(default=None, all words are included).
top_k(int, optional): top_k > 0. Number of words to be built into vocab. top_k most frequent words are
taken. top_k is taken after freq_range. If not enough top_k, all words will be taken (default=None,
all words are included).
special_tokens(list, optional): a list of strings, each one is a special token. for example
top_k(int, optional): top_k must be greater than 0. Number of words to be built into vocab. The top_k most
frequent words are taken. top_k is applied after freq_range. If there are fewer than top_k words, all words
will be taken (default=None, all words are included).
special_tokens(list, optional): A list of strings, each one is a special token. For example
special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
special_first(bool, optional): whether special_tokens will be prepended/appended to vocab. If special_tokens
special_first(bool, optional): Whether special_tokens will be prepended/appended to vocab. If special_tokens
is specified and special_first is set to True, special_tokens will be prepended (default=True).
Returns:
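A hedged sketch tying the parameters above together (the dataset variable, column name, frequency range, and token list are illustrative assumptions):
>>> from mindspore.dataset.text import Vocab
>>> # Keep words seen at least twice, take at most 5000 of them,
>>> # and prepend the special tokens because special_first=True
>>> vocab = Vocab.from_dataset(text_file_dataset, columns=["text"], freq_range=(2, None), top_k=5000,
...                            special_tokens=["<pad>", "<unk>"], special_first=True)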
@ -76,10 +76,10 @@ class Vocab(cde.Vocab):
Build a vocab object from a list of words.
Args:
word_list(list): a list of string where each element is a word of type string.
special_tokens(list, optional): a list of strings, each one is a special token. for example
word_list(list): A list of string where each element is a word of type string.
special_tokens(list, optional): A list of strings, each one is a special token. For example
special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
special_first(bool, optional): whether special_tokens is prepended or appended to vocab. If special_tokens
special_first(bool, optional): Whether special_tokens is prepended or appended to vocab. If special_tokens
is specified and special_first is set to True, special_tokens will be prepended (default=True).
Returns:
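A minimal sketch (the word list is illustrative):
>>> from mindspore.dataset.text import Vocab
>>> # "<pad>" and "<unk>" are prepended to the vocab because special_first=True
>>> vocab = Vocab.from_list(["home", "behind", "the", "world"],
...                         special_tokens=["<pad>", "<unk>"], special_first=True)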
@ -96,13 +96,13 @@ class Vocab(cde.Vocab):
Build a vocab object from a file.
Args:
file_path (str): path to the file which contains the vocab list.
delimiter (str, optional): a delimiter to break up each line in file, the first element is taken to be
file_path (str): Path to the file which contains the vocab list.
delimiter (str, optional): A delimiter to break up each line in the file; the first element is taken to be
the word (default="").
vocab_size (int, optional): number of words to read from file_path (default=None, all words are taken).
special_tokens (list, optional): a list of strings, each one is a special token. for example
vocab_size (int, optional): Number of words to read from file_path (default=None, all words are taken).
special_tokens (list, optional): A list of strings, each one is a special token. For example
special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
special_first (bool, optional): whether special_tokens will be prepended/appended to vocab,
special_first (bool, optional): Whether special_tokens will be prepended/appended to vocab,
If special_tokens is specified and special_first is set to True,
special_tokens will be prepended (default=True).
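A hedged sketch (the vocab file path is hypothetical; each line is split on the delimiter and the first field is taken as the word):
>>> from mindspore.dataset.text import Vocab
>>> vocab = Vocab.from_file("/path/to/vocab.txt", delimiter=",", vocab_size=None,
...                         special_tokens=["<pad>", "<unk>"], special_first=True)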
@ -122,7 +122,7 @@ class Vocab(cde.Vocab):
Build a vocab object from a dict.
Args:
word_dict (dict): dict contains word and id pairs, where word should be str and id be int. id is recommended
word_dict (dict): Dict that contains word and id pairs, where word should be str and id should be int. id is recommended
to start from 0 and be continuous. ValueError will be raised if id is negative.
Returns:
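A minimal sketch (the word/id pairs are illustrative; ids start from 0 and are continuous, as recommended above):
>>> from mindspore.dataset.text import Vocab
>>> vocab = Vocab.from_dict({"home": 0, "behind": 1, "the": 2, "world": 3, "<unk>": 4})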
@ -134,24 +134,26 @@ class Vocab(cde.Vocab):
class SentencePieceVocab(cde.SentencePieceVocab):
"""
SentencePiece obiect that is used to segmentate words
SentencePiece object that is used for word segmentation.
"""
@classmethod
@check_from_dataset_sentencepiece
def from_dataset(cls, dataset, col_names, vocab_size, character_coverage, model_type, params):
"""
Build a sentencepiece from a dataset
Build a SentencePiece from a dataset.
Args:
dataset(Dataset): Dataset to build sentencepiece.
dataset(Dataset): Dataset to build SentencePiece.
col_names(list): The list of column names.
vocab_size(int): Vocabulary size.
character_coverage(float): Amount of characters covered by the model, good defaults are: 0.9995 for
languages. with rich character set like Japanese or Chinese and 1.0 for other languages with small
languages with rich character set like Japanese or Chinese and 1.0 for other languages with small
character set.
model_type(SentencePieceModel): Choose from UNIGRAM (default), BPE, CHAR, or WORD. The input sentence
must be pretokenized when using word type.
model_type(SentencePieceModel): It can be any of [SentencePieceModel.UNIGRAM, SentencePieceModel.BPE,
SentencePieceModel.CHAR, SentencePieceModel.WORD], default is SentencePieceModel.UNIGRAM. The input
sentence must be pre-tokenized when using word type.
params(dict): A dictionary with no incoming parameters.
Returns:
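A hedged sketch of the call (the dataset variable, column name, and vocab size are illustrative assumptions):
>>> from mindspore.dataset.text import SentencePieceVocab, SentencePieceModel
>>> vocab = SentencePieceVocab.from_dataset(text_file_dataset, ["text"], 5000, 0.9995,
...                                         SentencePieceModel.UNIGRAM, {})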
@ -168,13 +170,15 @@ class SentencePieceVocab(cde.SentencePieceVocab):
Build a SentencePiece object from a file.
Args:
file_path(list): Path to the file which contains the sentencepiece list.
vocab_size(int): Vocabulary size, the type of uint32_t.
file_path(list): Path to the file which contains the SentencePiece list.
vocab_size(int): Vocabulary size.
character_coverage(float): Amount of characters covered by the model, good defaults are: 0.9995 for
languages. with rich character set like Japanse or Chinese and 1.0 for other languages with small
languages with rich character set like Japanese or Chinese and 1.0 for other languages with small
character set.
model_type(SentencePieceModel): Choose from unigram (default), bpe, char, or word. The input sentence
must be pretokenized when using word type.
model_type(SentencePieceModel): It can be any of [SentencePieceModel.UNIGRAM, SentencePieceModel.BPE,
SentencePieceModel.CHAR, SentencePieceModel.WORD], default is SentencePieceModel.UNIGRAM. The input
sentence must be pre-tokenized when using word type.
params(dict): A dictionary with no incoming parameters(The parameters are derived from SentencePiece
library).
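A hedged sketch (the corpus file path is hypothetical; file_path takes a list of files):
>>> from mindspore.dataset.text import SentencePieceVocab, SentencePieceModel
>>> vocab = SentencePieceVocab.from_file(["/path/to/corpus.txt"], 5000, 0.9995,
...                                      SentencePieceModel.UNIGRAM, {})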
@ -193,10 +197,10 @@ class SentencePieceVocab(cde.SentencePieceVocab):
@check_save_model
def save_model(cls, vocab, path, filename):
"""
Save model to filepath
Save the model to the given file path.
Args:
vocab(SentencePieceVocab): A sentencepiece object.
vocab(SentencePieceVocab): A SentencePiece object.
path(str): Path to store model.
filename(str): The name of the file.
"""
@ -208,7 +212,7 @@ def to_str(array, encoding='utf8'):
Convert NumPy array of `bytes` to array of `str` by decoding each element based on charset `encoding`.
Args:
array (numpy.ndarray): Array of type `bytes` representing strings.
array (numpy.ndarray): Array of `bytes` type representing strings.
encoding (str): Indicating the charset for decoding.
Returns:
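A minimal sketch of decoding a bytes array (toy data):
>>> import numpy as np
>>> from mindspore.dataset.text import to_str
>>> # Decode each `bytes` element into `str` using utf8
>>> strings = to_str(np.array([b'hello', b'world']), encoding='utf8')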
@ -226,7 +230,7 @@ def to_bytes(array, encoding='utf8'):
Convert NumPy array of `str` to array of `bytes` by encoding each element based on charset `encoding`.
Args:
array (numpy.ndarray): Array of type `str` representing strings.
array (numpy.ndarray): Array of `str` type representing strings.
encoding (str): Indicating the charset for encoding.
Returns:
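A minimal sketch of the reverse direction (toy data):
>>> import numpy as np
>>> from mindspore.dataset.text import to_bytes
>>> # Encode each `str` element into `bytes` using utf8
>>> raw = to_bytes(np.array(['hello', 'world']), encoding='utf8')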


@ -169,8 +169,7 @@ class Slice(TensorOperation):
"""
Slice operation to extract a tensor out using the given n slices.
The functionality of Slice is similar to NumPy's indexing feature.
(Currently only rank-1 tensors are supported).
The functionality of Slice is similar to NumPy's indexing feature (currently only rank-1 tensors are supported).
Args:
slices (Union[int, list[int], slice, None, Ellipsis]):
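A hedged sketch of Slice on a rank-1 input (toy data; a Python `slice` object is one accepted form of the argument):
>>> import mindspore.dataset as ds
>>> import mindspore.dataset.transforms.c_transforms as c_transforms
>>> # Take the first three elements: [1, 2, 3, 4, 5] -> [1, 2, 3]
>>> dataset = ds.NumpySlicesDataset(data=[[1, 2, 3, 4, 5]], column_names=["col1"], shuffle=False)
>>> dataset = dataset.map(operations=c_transforms.Slice(slice(0, 3)))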
@ -234,10 +233,11 @@ class Mask(TensorOperation):
Any element of the tensor that matches the predicate will be evaluated to True, otherwise False.
Args:
operator (Relational): One of the relational operators EQ, NE LT, GT, LE or GE
operator (Relational): One of the relational operators; it can be any of [Relational.EQ, Relational.NE, Relational.LT,
Relational.GT, Relational.LE, Relational.GE]. Take Relational.EQ as an example: EQ refers to equal.
constant (Union[str, int, float, bool]): Constant to be compared to.
Constant will be cast to the type of the input tensor.
dtype (mindspore.dtype, optional): Type of the generated mask (Default to bool).
dtype (mindspore.dtype, optional): Type of the generated mask (default=mstype.bool_).
Examples:
>>> from mindspore.dataset.transforms.c_transforms import Relational
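A hedged sketch continuing the example above, which already imports Relational (toy data; the operator and constant are illustrative):
>>> import mindspore.dataset as ds
>>> import mindspore.dataset.transforms.c_transforms as c_transforms
>>> # Compare [1, 2, 3] with constant 2 using Relational.EQ -> [False, True, False]
>>> dataset = ds.NumpySlicesDataset(data=[[1, 2, 3]], column_names=["col1"], shuffle=False)
>>> dataset = dataset.map(operations=c_transforms.Mask(Relational.EQ, 2))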
@ -268,7 +268,7 @@ class Mask(TensorOperation):
class PadEnd(TensorOperation):
"""
Pad input tensor according to pad_shape, need to have same rank.
Pad the input tensor according to pad_shape; the input tensor needs to have the same rank as pad_shape.
Args:
pad_shape (list(int)): List of integers representing the shape needed. Dimensions that are set to `None` will