enhance api description
parent 0cd459fb47
commit d480a78c21
@@ -327,9 +327,10 @@ class SentencePieceTokenizer(TextTensorOperation):
    Tokenize scalar token or 1-D tokens to tokens by sentencepiece.

    Args:
        mode (Union[str, SentencePieceVocab]): If the input parameter is a file, then it is of type string.
            If the input parameter is a SentencePieceVocab object, then it is of type SentencePieceVocab.
        out_type (SPieceTokenizerOutType): The type of output, the type is int or string
        mode (Union[str, SentencePieceVocab]): If the input parameter is a file, then its type should be string.
            If the input parameter is a SentencePieceVocab object, then its type should be SentencePieceVocab.
        out_type (SPieceTokenizerOutType): The type of output, it can be any of [SPieceTokenizerOutType.STRING,
            SPieceTokenizerOutType.INT].

    Examples:
        >>> from mindspore.dataset.text import SentencePieceModel, SPieceTokenizerOutType
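
For context, a minimal usage sketch (not part of this patch) showing how the two out_type values are used once a trained model file exists; the file name "m.model" and the in-memory dataset are illustrative assumptions:

    import mindspore.dataset as ds
    import mindspore.dataset.text as text
    from mindspore.dataset.text import SPieceTokenizerOutType

    # out_type selects whether tokens come back as subword strings or as vocabulary ids.
    dataset = ds.NumpySlicesDataset({"text": ["Hello world."]}, shuffle=False)
    tokenizer = text.SentencePieceTokenizer("m.model", out_type=SPieceTokenizerOutType.STRING)
    # tokenizer = text.SentencePieceTokenizer("m.model", out_type=SPieceTokenizerOutType.INT)
    dataset = dataset.map(operations=tokenizer, input_columns=["text"])
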
@@ -350,7 +351,7 @@ class SentencePieceTokenizer(TextTensorOperation):

class SlidingWindow(TextTensorOperation):
    """
    TensorOp to construct a tensor from data (only 1-D for now), where each element in the dimension axis
    Construct a tensor from given data (only support 1-D for now), where each element in the dimension axis
    is a slice of data starting at the corresponding position, with a specified width.

    Args:
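
A short sketch (not from the patch) of the sliding-window behaviour described above; the input data is an illustrative assumption:

    import mindspore.dataset as ds
    import mindspore.dataset.text as text

    # [1, 2, 3, 4, 5] with width=3 becomes [[1, 2, 3], [2, 3, 4], [3, 4, 5]].
    dataset = ds.NumpySlicesDataset([[1, 2, 3, 4, 5]], column_names=["col1"], shuffle=False)
    dataset = dataset.map(operations=text.SlidingWindow(3, 0), input_columns=["col1"])
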
@@ -387,15 +388,13 @@ class ToNumber(TextTensorOperation):
    """
    Tensor operation to convert every element of a string tensor to a number.

    Strings are cast according to the rules specified in the following links:
    Strings are cast according to the rules specified in the following links, except that any strings which represent
    negative numbers cannot be cast to an unsigned integer type, rules links are as follows:
    https://en.cppreference.com/w/cpp/string/basic_string/stof,
    https://en.cppreference.com/w/cpp/string/basic_string/stoul,
    except that any strings which represent negative numbers cannot be cast to an
    unsigned integer type.

    Args:
        data_type (mindspore.dtype): mindspore.dtype to be cast to. Must be
            a numeric type.
        data_type (mindspore.dtype): Type to be cast to. Must be a numeric type in mindspore.dtype.

    Raises:
        RuntimeError: If strings are invalid to cast, or are out of range after being cast.
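
A hedged sketch (not part of this patch) of the casting rule: numeric strings are converted to the requested mindspore.dtype, while an out-of-range value or a negative-to-unsigned cast raises RuntimeError; the sample data is assumed:

    import mindspore.dataset as ds
    import mindspore.dataset.text as text
    from mindspore import dtype as mstype

    # "1", "2.5" and "3" are cast to float32; a value like "-1" would fail for an unsigned type.
    dataset = ds.NumpySlicesDataset({"text": ["1", "2.5", "3"]}, shuffle=False)
    dataset = dataset.map(operations=text.ToNumber(mstype.float32), input_columns=["text"])
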
@@ -521,7 +520,7 @@ class WordpieceTokenizer(TextTensorOperation):

class PythonTokenizer:
    """
    Callable class to be used for user-defined string tokenizer.
    Class that apply user-defined string tokenizer into input string.

    Args:
        tokenizer (Callable): Python function that takes a `str` and returns a list of `str` as tokens.
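
To illustrate the Callable contract, a sketch (not part of the patch; the whitespace tokenizer and sample sentence are assumptions):

    import mindspore.dataset as ds
    import mindspore.dataset.text as text

    def my_tokenizer(line):
        # A trivial user-defined tokenizer: split the input string on whitespace.
        return line.split()

    dataset = ds.NumpySlicesDataset({"text": ["Welcome to Beijing"]}, shuffle=False)
    dataset = dataset.map(operations=text.PythonTokenizer(my_tokenizer), input_columns=["text"])
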
@@ -752,9 +751,9 @@ if platform.system().lower() != 'windows':

class RegexReplace(TextTensorOperation):
    """
    Replace UTF-8 string tensor with 'replace' according to regular expression 'pattern'.
    Replace a part of UTF-8 string tensor with given text according to regular expressions.

    See http://userguide.icu-project.org/strings/regexp for support regex pattern.
    See http://userguide.icu-project.org/strings/regexp for supported regex pattern.

    Note:
        RegexReplace is not supported on Windows platform yet.
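
A minimal sketch (not from the patch, non-Windows platforms only) of the pattern/replace pair; the sample text is an assumption:

    import mindspore.dataset as ds
    import mindspore.dataset.text as text

    # Replace every run of whitespace with a single underscore.
    dataset = ds.NumpySlicesDataset({"text": ["one two   three"]}, shuffle=False)
    replace_op = text.RegexReplace(pattern=r"\s+", replace="_")
    dataset = dataset.map(operations=replace_op, input_columns=["text"])
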
@@ -786,7 +785,7 @@ if platform.system().lower() != 'windows':
    """
    Tokenize a scalar tensor of UTF-8 string by regex expression pattern.

    See http://userguide.icu-project.org/strings/regexp for support regex pattern.
    See http://userguide.icu-project.org/strings/regexp for supported regex pattern.

    Note:
        RegexTokenizer is not supported on Windows platform yet.
@@ -795,16 +794,16 @@ if platform.system().lower() != 'windows':
        delim_pattern (str): The pattern of regex delimiters.
            The original string will be split by matched elements.
        keep_delim_pattern (str, optional): The string matched by 'delim_pattern' can be kept as a token
            if it can be matched by 'keep_delim_pattern'. The default value is an empty str ('')
            if it can be matched by 'keep_delim_pattern'. The default value is an empty str
            which means that delimiters will not be kept as an output token (default='').
        with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).
        with_offsets (bool, optional): Whether or not output offsets of tokens(default=False).

    Examples:
        >>> # If with_offsets=False, default output one column {["text", dtype=str]}
        >>> # If with_offsets=False, default output is one column {["text", dtype=str]}
        >>> delim_pattern = r"[ |,]"
        >>> tokenizer_op = text.RegexTokenizer(delim_pattern, with_offsets=False)
        >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
        >>> # If with_offsets=False, then output three columns {["token", dtype=str],
        >>> # If with_offsets=True, then output three columns {["token", dtype=str],
        >>> # ["offsets_start", dtype=uint32],
        >>> # ["offsets_limit", dtype=uint32]}
        >>> tokenizer_op = text.RegexTokenizer(delim_pattern, with_offsets=True)
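
The keep_delim_pattern behaviour may be easier to see in a sketch (not part of the patch, non-Windows only; the sample text is assumed): delimiters that also match keep_delim_pattern are emitted as tokens instead of being dropped.

    import mindspore.dataset as ds
    import mindspore.dataset.text as text

    # Split on commas but keep each comma as its own token.
    dataset = ds.NumpySlicesDataset({"text": ["a,b,c"]}, shuffle=False)
    tokenizer_op = text.RegexTokenizer(delim_pattern=",", keep_delim_pattern=",")
    dataset = dataset.map(operations=tokenizer_op, input_columns=["text"])
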
@@ -827,7 +826,7 @@ if platform.system().lower() != 'windows':

class UnicodeScriptTokenizer(TextTensorOperation):
    """
    Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.
    Tokenize a scalar tensor of UTF-8 string based on Unicode script boundaries.

    Note:
        UnicodeScriptTokenizer is not supported on Windows platform yet.
@@ -840,9 +839,9 @@ if platform.system().lower() != 'windows':
        >>> # If with_offsets=False, default output one column {["text", dtype=str]}
        >>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=False)
        >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
        >>> # If with_offsets=False, then output three columns {["token", dtype=str],
        >>> # ["offsets_start", dtype=uint32],
        >>> # ["offsets_limit", dtype=uint32]}
        >>> # If with_offsets=True, then output three columns {["token", dtype=str],
        >>> # ["offsets_start", dtype=uint32],
        >>> # ["offsets_limit", dtype=uint32]}
        >>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=True)
        >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op, input_columns=["text"],
        ...                                           output_columns=["token", "offsets_start", "offsets_limit"],
@@ -56,12 +56,12 @@ class Vocab(cde.Vocab):
            min_frequency=1. max_frequency > total_words is the same as max_frequency = total_words.
            min_frequency/max_frequency can be None, which corresponds to 0/total_words separately
            (default=None, all words are included).
        top_k(int, optional): top_k > 0. Number of words to be built into vocab. top_k most frequent words are
            taken. top_k is taken after freq_range. If not enough top_k, all words will be taken (default=None,
            all words are included).
        special_tokens(list, optional): a list of strings, each one is a special token. for example
        top_k(int, optional): top_k is greater than 0. Number of words to be built into vocab. top_k means most
            frequent words are taken. top_k is taken after freq_range. If not enough top_k, all words will be taken
            (default=None, all words are included).
        special_tokens(list, optional): A list of strings, each one is a special token. For example
            special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
        special_first(bool, optional): whether special_tokens will be prepended/appended to vocab. If special_tokens
        special_first(bool, optional): Whether special_tokens will be prepended/appended to vocab. If special_tokens
            is specified and special_first is set to True, special_tokens will be prepended (default=True).

    Returns:
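
As a sketch of how top_k and the special tokens interact (not part of the patch; the toy dataset is an assumption), Vocab.from_dataset might be called like this:

    import mindspore.dataset as ds
    import mindspore.dataset.text as text

    # Build a vocab from a pre-tokenized column, keep the 100 most frequent words,
    # and prepend "<pad>"/"<unk>" so they receive ids 0 and 1.
    dataset = ds.NumpySlicesDataset({"text": [["hello", "world"], ["hello", "mindspore"]]}, shuffle=False)
    vocab = text.Vocab.from_dataset(dataset, columns=["text"], freq_range=None, top_k=100,
                                    special_tokens=["<pad>", "<unk>"], special_first=True)
    dataset = dataset.map(operations=text.Lookup(vocab, "<unk>"), input_columns=["text"])
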
@@ -76,10 +76,10 @@ class Vocab(cde.Vocab):
    Build a vocab object from a list of word.

    Args:
        word_list(list): a list of string where each element is a word of type string.
        special_tokens(list, optional): a list of strings, each one is a special token. for example
        word_list(list): A list of string where each element is a word of type string.
        special_tokens(list, optional): A list of strings, each one is a special token. for example
            special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
        special_first(bool, optional): whether special_tokens is prepended or appended to vocab. If special_tokens
        special_first(bool, optional): Whether special_tokens is prepended or appended to vocab. If special_tokens
            is specified and special_first is set to True, special_tokens will be prepended (default=True).

    Returns:
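
A brief sketch (not part of the patch) of special_first: with special_first=True the special tokens are prepended and take the lowest ids:

    import mindspore.dataset.text as text

    # "<pad>" gets id 0 and "<unk>" id 1; the ordinary words follow.
    vocab = text.Vocab.from_list(["home", "world", "hello"],
                                 special_tokens=["<pad>", "<unk>"], special_first=True)
    lookup_op = text.Lookup(vocab, "<unk>")
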
@@ -96,13 +96,13 @@ class Vocab(cde.Vocab):
    Build a vocab object from a list of word.

    Args:
        file_path (str): path to the file which contains the vocab list.
        delimiter (str, optional): a delimiter to break up each line in file, the first element is taken to be
        file_path (str): Path to the file which contains the vocab list.
        delimiter (str, optional): A delimiter to break up each line in file, the first element is taken to be
            the word (default="").
        vocab_size (int, optional): number of words to read from file_path (default=None, all words are taken).
        special_tokens (list, optional): a list of strings, each one is a special token. for example
        vocab_size (int, optional): Number of words to read from file_path (default=None, all words are taken).
        special_tokens (list, optional): A list of strings, each one is a special token. for example
            special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
        special_first (bool, optional): whether special_tokens will be prepended/appended to vocab,
        special_first (bool, optional): Whether special_tokens will be prepended/appended to vocab,
            If special_tokens is specified and special_first is set to True,
            special_tokens will be prepended (default=True).
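
A self-contained sketch (not from the patch; the file name and contents are assumptions) of the expected file format, one word per line with anything after the delimiter ignored:

    import mindspore.dataset.text as text

    # Each line holds a word, optionally followed by extra fields after the delimiter.
    with open("vocab.txt", "w") as f:
        f.write("home,1\nworld,2\nhello,3\n")
    vocab = text.Vocab.from_file("vocab.txt", delimiter=",", special_tokens=["<unk>"])
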
@@ -122,7 +122,7 @@ class Vocab(cde.Vocab):
    Build a vocab object from a dict.

    Args:
        word_dict (dict): dict contains word and id pairs, where word should be str and id be int. id is recommended
        word_dict (dict): Dict contains word and id pairs, where word should be str and id be int. id is recommended
            to start from 0 and be continuous. ValueError will be raised if id is negative.

    Returns:
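
For clarity, a minimal sketch (not part of the patch) of the recommended id layout, starting at 0 and continuous:

    import mindspore.dataset.text as text

    # A negative id would raise ValueError.
    vocab = text.Vocab.from_dict({"<pad>": 0, "<unk>": 1, "home": 2, "world": 3})
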
@@ -134,24 +134,26 @@ class Vocab(cde.Vocab):

class SentencePieceVocab(cde.SentencePieceVocab):
    """
    SentencePiece obiect that is used to segmentate words
    SentencePiece object that is used to do words segmentation.
    """

    @classmethod
    @check_from_dataset_sentencepiece
    def from_dataset(cls, dataset, col_names, vocab_size, character_coverage, model_type, params):
        """
        Build a sentencepiece from a dataset
        Build a SentencePiece from a dataset.

        Args:
            dataset(Dataset): Dataset to build sentencepiece.
            dataset(Dataset): Dataset to build SentencePiece.
            col_names(list): The list of the col name.
            vocab_size(int): Vocabulary size.
            character_coverage(float): Amount of characters covered by the model, good defaults are: 0.9995 for
                languages. with rich character set like Japanese or Chinese and 1.0 for other languages with small
                languages with rich character set like Japanese or Chinese and 1.0 for other languages with small
                character set.
            model_type(SentencePieceModel): Choose from UNIGRAM (default), BPE, CHAR, or WORD. The input sentence
                must be pretokenized when using word type.
            model_type(SentencePieceModel): It can be any of [SentencePieceModel.UNIGRAM, SentencePieceModel.BPE,
                SentencePieceModel.CHAR, SentencePieceModel.WORD], default is SentencePieceModel.UNIGRAM. The input
                sentence must be pre-tokenized when using word type.

            params(dict): A dictionary with no incoming parameters.

        Returns:
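
A hedged sketch (not in the patch) of training a SentencePiece model from a text dataset; "corpus.txt" is an assumed one-sentence-per-line corpus:

    import mindspore.dataset as ds
    import mindspore.dataset.text as text
    from mindspore.dataset.text import SentencePieceModel

    corpus = ds.TextFileDataset("corpus.txt", shuffle=False)
    vocab = text.SentencePieceVocab.from_dataset(corpus, ["text"], 5000, 0.9995,
                                                 SentencePieceModel.UNIGRAM, {})
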
@@ -168,13 +170,15 @@ class SentencePieceVocab(cde.SentencePieceVocab):
        Build a SentencePiece object from a list of word.

        Args:
            file_path(list): Path to the file which contains the sentencepiece list.
            vocab_size(int): Vocabulary size, the type of uint32_t.
            file_path(list): Path to the file which contains the SentencePiece list.
            vocab_size(int): Vocabulary size.
            character_coverage(float): Amount of characters covered by the model, good defaults are: 0.9995 for
                languages. with rich character set like Japanse or Chinese and 1.0 for other languages with small
                languages with rich character set like Japanese or Chinese and 1.0 for other languages with small
                character set.
            model_type(SentencePieceModel): Choose from unigram (default), bpe, char, or word. The input sentence
                must be pretokenized when using word type.
            model_type(SentencePieceModel): It can be any of [SentencePieceModel.UNIGRAM, SentencePieceModel.BPE,
                SentencePieceModel.CHAR, SentencePieceModel.WORD], default is SentencePieceModel.UNIGRAM. The input
                sentence must be pre-tokenized when using word type.

            params(dict): A dictionary with no incoming parameters(The parameters are derived from SentencePiece
                library).
@@ -193,10 +197,10 @@ class SentencePieceVocab(cde.SentencePieceVocab):
    @check_save_model
    def save_model(cls, vocab, path, filename):
        """
        Save model to filepath
        Save model into given filepath.

        Args:
            vocab(SentencePieceVocab): A sentencepiece object.
            vocab(SentencePieceVocab): A SentencePiece object.
            path(str): Path to store model.
            filename(str): The name of the file.
        """
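
Tying from_file and save_model together, a sketch (not part of the patch; "corpus.txt" and the output name are assumptions):

    import mindspore.dataset.text as text
    from mindspore.dataset.text import SentencePieceModel

    # Train from a raw text file, then serialize the model for later use
    # with SentencePieceTokenizer.
    vocab = text.SentencePieceVocab.from_file(["corpus.txt"], 5000, 0.9995,
                                              SentencePieceModel.UNIGRAM, {})
    text.SentencePieceVocab.save_model(vocab, ".", "m.model")
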
@@ -208,7 +212,7 @@ def to_str(array, encoding='utf8'):
    Convert NumPy array of `bytes` to array of `str` by decoding each element based on charset `encoding`.

    Args:
        array (numpy.ndarray): Array of type `bytes` representing strings.
        array (numpy.ndarray): Array of `bytes` type representing strings.
        encoding (str): Indicating the charset for decoding.

    Returns:
@@ -226,7 +230,7 @@ def to_bytes(array, encoding='utf8'):
    Convert NumPy array of `str` to array of `bytes` by encoding each element based on charset `encoding`.

    Args:
        array (numpy.ndarray): Array of type `str` representing strings.
        array (numpy.ndarray): Array of `str` type representing strings.
        encoding (str): Indicating the charset for encoding.

    Returns:
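
Since to_str and to_bytes mirror each other, one round-trip sketch (not part of the patch) covers both:

    import numpy as np
    from mindspore.dataset.text import to_str, to_bytes

    encoded = to_bytes(np.array(["hello", "world"]))  # str -> bytes, utf8 by default
    decoded = to_str(encoded)                          # bytes -> str
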
@@ -169,8 +169,7 @@ class Slice(TensorOperation):
    """
    Slice operation to extract a tensor out using the given n slices.

    The functionality of Slice is similar to NumPy's indexing feature.
    (Currently only rank-1 tensors are supported).
    The functionality of Slice is similar to NumPy's indexing feature (Currently only rank-1 tensors are supported).

    Args:
        slices (Union[int, list[int], slice, None, Ellipsis]):
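
A small sketch (not from the patch; the input tensor is assumed) of the NumPy-like indexing on a rank-1 tensor:

    import mindspore.dataset as ds
    import mindspore.dataset.transforms.c_transforms as c_transforms

    # Keep the first and third elements, like data[[0, 2]] in NumPy: [1, 2, 3] -> [1, 3].
    dataset = ds.NumpySlicesDataset([[1, 2, 3]], column_names=["col"], shuffle=False)
    dataset = dataset.map(operations=c_transforms.Slice([0, 2]), input_columns=["col"])
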
@@ -234,10 +233,11 @@ class Mask(TensorOperation):
    Any element of the tensor that matches the predicate will be evaluated to True, otherwise False.

    Args:
        operator (Relational): One of the relational operators EQ, NE LT, GT, LE or GE
        operator (Relational): relational operators, it can be any of [Relational.EQ, Relational.NE, Relational.LT,
            Relational.GT, Relational.LE, Relational.GE], take Relational.EQ as example, EQ refers to equal.
        constant (Union[str, int, float, bool]): Constant to be compared to.
            Constant will be cast to the type of the input tensor.
        dtype (mindspore.dtype, optional): Type of the generated mask (Default to bool).
        dtype (mindspore.dtype, optional): Type of the generated mask (Default mstype.bool_).

    Examples:
        >>> from mindspore.dataset.transforms.c_transforms import Relational
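
To make the operator/constant pair concrete, a sketch (not part of the patch; the input tensor is assumed):

    import mindspore.dataset as ds
    import mindspore.dataset.transforms.c_transforms as c_transforms
    from mindspore.dataset.transforms.c_transforms import Relational

    # Relational.GT with constant 1 turns [1, 2, 3] into the mask [False, True, True].
    dataset = ds.NumpySlicesDataset([[1, 2, 3]], column_names=["col"], shuffle=False)
    dataset = dataset.map(operations=c_transforms.Mask(Relational.GT, 1), input_columns=["col"])
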
@@ -268,7 +268,7 @@ class Mask(TensorOperation):

class PadEnd(TensorOperation):
    """
    Pad input tensor according to pad_shape, need to have same rank.
    Pad input tensor according to pad_shape, input tensor needs to have same rank.

    Args:
        pad_shape (list(int)): List of integers representing the shape needed. Dimensions that set to `None` will
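
A sketch of pad_shape and pad_value (not part of the patch; the input tensor is assumed):

    import mindspore.dataset as ds
    import mindspore.dataset.transforms.c_transforms as c_transforms

    # Pad each rank-1 tensor to length 4 with 0: [1, 2, 3] -> [1, 2, 3, 0].
    dataset = ds.NumpySlicesDataset([[1, 2, 3]], column_names=["col"], shuffle=False)
    dataset = dataset.map(operations=c_transforms.PadEnd(pad_shape=[4], pad_value=0),
                          input_columns=["col"])
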