dataset API docstring: Update datasets, samplers, graphdata and text

This commit is contained in:
Cathy Wong 2020-09-10 18:08:51 -04:00
parent d0e49c5cf8
commit f7adf648e9
7 changed files with 423 additions and 412 deletions

File diff suppressed because it is too large


@ -34,29 +34,36 @@ class GraphData:
Reads the graph dataset used for GNN training from the shared file and database.
Args:
dataset_file (str): One of file names in dataset.
num_parallel_workers (int, optional): Number of workers to process the Dataset in parallel
dataset_file (str): One of file names in the dataset.
num_parallel_workers (int, optional): Number of workers to process the dataset in parallel
(default=None).
working_mode (str, optional): Set working mode, now support 'local'/'client'/'server' (default='local').
working_mode (str, optional): Set working mode, now supports 'local'/'client'/'server' (default='local').
- 'local', used in non-distributed training scenarios.
- 'client', used in distributed training scenarios, the client does not load data,
- 'client', used in distributed training scenarios. The client does not load data,
but obtains data from the server.
- 'server', used in distributed training scenarios, the server loads the data
- 'server', used in distributed training scenarios. The server loads the data
and is available to the client.
hostname (str, optional): Valid when working_mode is set to 'client' or 'server',
set the hostname of the graph data server (default='127.0.0.1').
port (int, optional): Valid when working_mode is set to 'client' or 'server',
set the port of the graph data server, the range is 1024-65535 (default=50051).
num_client (int, optional): Valid when working_mode is set to 'server',
set the number of clients expected to connect, and the server will allocate corresponding
resources according to this parameter (default=1).
hostname (str, optional): Hostname of the graph data server. This parameter is only valid when
working_mode is set to 'client' or 'server' (default='127.0.0.1').
port (int, optional): Port of the graph data server. The range is 1024-65535. This parameter is
only valid when working_mode is set to 'client' or 'server' (default=50051).
num_client (int, optional): Maximum number of clients expected to connect to the server. The server will
allocate resources according to this parameter. This parameter is only valid when working_mode
is set to 'server' (default=1).
auto_shutdown (bool, optional): Valid when working_mode is set to 'server'. When the number of
connected clients reaches num_client and no client is connected anymore,
the server automatically exits (default=True).
Examples:
>>> import mindspore.dataset as ds
>>>
>>> data_graph = ds.GraphData('dataset_file', 2)
>>> nodes = data_graph.get_all_nodes(0)
>>> features = data_graph.get_node_feature(nodes, [1])
"""
@check_gnn_graphdata
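A minimal sketch of the client/server working mode described above, for illustration only; the file name 'cora_graph.dataset' is a placeholder rather than a file from this commit.

import mindspore.dataset as ds

# Server process: loads the graph once and serves up to two clients.
# With auto_shutdown=True the server exits after the expected clients have connected and then disconnected.
graph_server = ds.GraphData('cora_graph.dataset', working_mode='server',
                            hostname='127.0.0.1', port=50051,
                            num_client=2, auto_shutdown=True)

# Client process: loads no data locally and fetches it from the server instead.
graph_client = ds.GraphData('cora_graph.dataset', working_mode='client',
                            hostname='127.0.0.1', port=50051)
nodes = graph_client.get_all_nodes(node_type=0)
features = graph_client.get_node_feature(nodes, [1])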
@ -94,10 +101,11 @@ class GraphData:
node_type (int): Specify the type of node.
Returns:
numpy.ndarray: array of nodes.
numpy.ndarray: Array of nodes.
Examples:
>>> import mindspore.dataset as ds
>>>
>>> data_graph = ds.GraphData('dataset_file', 2)
>>> nodes = data_graph.get_all_nodes(0)
@ -121,6 +129,7 @@ class GraphData:
Examples:
>>> import mindspore.dataset as ds
>>>
>>> data_graph = ds.GraphData('dataset_file', 2)
>>> edges = data_graph.get_all_edges(0)
@ -140,7 +149,7 @@ class GraphData:
edge_list (Union[list, numpy.ndarray]): The given list of edges.
Returns:
numpy.ndarray: array of nodes.
numpy.ndarray: Array of nodes.
Raises:
TypeError: If `edge_list` is not list or ndarray.
@ -159,10 +168,11 @@ class GraphData:
neighbor_type (int): Specify the type of neighbor.
Returns:
numpy.ndarray: array of nodes.
numpy.ndarray: Array of nodes.
Examples:
>>> import mindspore.dataset as ds
>>>
>>> data_graph = ds.GraphData('dataset_file', 2)
>>> nodes = data_graph.get_all_nodes(0)
>>> neighbors = data_graph.get_all_neighbors(nodes, 0)
@ -192,13 +202,14 @@ class GraphData:
neighbor_types (Union[list, numpy.ndarray]): Neighbor type sampled per hop.
Returns:
numpy.ndarray: array of nodes.
numpy.ndarray: Array of nodes.
Examples:
>>> import mindspore.dataset as ds
>>>
>>> data_graph = ds.GraphData('dataset_file', 2)
>>> nodes = data_graph.get_all_nodes(0)
>>> neighbors = data_graph.get_all_neighbors(nodes, [2, 2], [0, 0])
>>> neighbors = data_graph.get_sampled_neighbors(nodes, [2, 2], [0, 0])
Raises:
TypeError: If `node_list` is not list or ndarray.
@ -221,10 +232,11 @@ class GraphData:
neg_neighbor_type (int): Specify the type of negative neighbor.
Returns:
numpy.ndarray: array of nodes.
numpy.ndarray: Array of nodes.
Examples:
>>> import mindspore.dataset as ds
>>>
>>> data_graph = ds.GraphData('dataset_file', 2)
>>> nodes = data_graph.get_all_nodes(0)
>>> neg_neighbors = data_graph.get_neg_sampled_neighbors(nodes, 5, 0)
@ -253,6 +265,7 @@ class GraphData:
Examples:
>>> import mindspore.dataset as ds
>>>
>>> data_graph = ds.GraphData('dataset_file', 2)
>>> nodes = data_graph.get_all_nodes(0)
>>> features = data_graph.get_node_feature(nodes, [1])
@ -284,6 +297,7 @@ class GraphData:
Examples:
>>> import mindspore.dataset as ds
>>>
>>> data_graph = ds.GraphData('dataset_file', 2)
>>> edges = data_graph.get_all_edges(0)
>>> features = data_graph.get_edge_feature(edges, [1])
@ -334,10 +348,11 @@ class GraphData:
A default value of -1 indicates that no node is given.
Returns:
numpy.ndarray: array of nodes.
numpy.ndarray: Array of nodes.
Examples:
>>> import mindspore.dataset as ds
>>>
>>> data_graph = ds.GraphData('dataset_file', 2)
>>> nodes = data_graph.random_walk([1,2], [1,2,1,2,1])


@ -13,10 +13,10 @@
# limitations under the License.
# ==============================================================================
"""
Sampler module provides several samplers to generate sampling data from dataset.
There are following samplers: DistributedSampler, PKSampler, RandomSampler,
SequentialSampler, SubsetRandomSampler, WeightedRandomSampler.
User can also define custom sampler by extending from Sampler class.
The sampler module provides several samplers to generate data from datasets.
The provided samplers include: DistributedSampler, PKSampler, RandomSampler,
SequentialSampler, SubsetRandomSampler, and WeightedRandomSampler.
Users can also define a custom sampler by extending from the Sampler class.
"""
import numpy as np
@ -26,9 +26,9 @@ import mindspore.dataset as ds
class Sampler:
"""
Base class for user defined sampler.
User defined sampler can be used with any existing dataset with sampler support.
A user defined sampler can be used with any existing dataset with sampler support.
An required _iter_() method should by overridden by user for sample index generation.
A required __iter__() method should be overridden by the user for sample index generation.
An optional reset() method can be overridden for per-repeat reset.
dataset_size and num_samples will be set by the dataset once a dataset iterator is created.
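A short, hedged sketch of a user-defined sampler built on this base class; the dataset directory is a placeholder.

import mindspore.dataset as ds

class EveryOtherSampler(ds.Sampler):
    # Yields every second sample index; dataset_size is filled in by the dataset
    # once an iterator is created, as noted above.
    def __iter__(self):
        for i in range(0, self.dataset_size, 2):
            yield i

data = ds.ImageFolderDataset("path/to/imagefolder_directory",
                             sampler=EveryOtherSampler())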
@ -52,8 +52,7 @@ class Sampler:
def __iter__(self):
"""
User defined iterator, must be overridden.
_handshake is guaranteed to be called prior to iterator construction
_handshake is guaranteed to be called prior to iterator construction.
"""
raise NotImplementedError
@ -160,10 +159,10 @@ class BuiltinSampler:
def get_num_samples(self):
"""
All samplers can contain a numeric num_samples value (or it could be set to None).
Child sampler can exist or be None.
if child sampler exists, then the child sampler count can be a numeric value or None.
Given these conditions, we need to output what the sampler count is for this sampler.
All samplers can contain a numeric num_samples value (or it can be set to None).
A child sampler can exist or be None.
If a child sampler exists, then the child sampler count can be a numeric value or None.
These conditions impact the resultant sampler count that is used.
The following table shows the possible results from calling this function.
.. list-table::
@ -217,20 +216,20 @@ class BuiltinSampler:
class DistributedSampler(BuiltinSampler):
"""
Sampler that access a shard of the dataset.
A sampler that accesses a shard of the dataset.
Args:
num_shards (int): Number of shards to divide the dataset into.
shard_id (int): Shard ID of the current shard within num_shards.
shuffle (bool, optional): If true, the indices are shuffled (default=True).
shuffle (bool, optional): If True, the indices are shuffled (default=True).
num_samples (int, optional): The number of samples to draw (default=None, all elements).
offset(int, optional): Offset from shard when the element of dataset is allocated
offset (int, optional): Offset from the shard when elements of the dataset are allocated (default=-1).
Examples:
>>> import mindspore.dataset as ds
>>>
>>> dataset_dir = "path/to/imagefolder_directory"
>>>
>>> # creates a distributed sampler with 10 shards total. This shard is shard 5
>>> # creates a distributed sampler with 10 shards in total. This shard is shard 5.
>>> sampler = ds.DistributedSampler(10, 5)
>>> data = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=8, sampler=sampler)
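A hedged sketch of chaining samplers with add_child and reading back the resulting count via get_num_samples, following the table summarized above; the specific numbers are illustrative assumptions.

import mindspore.dataset as ds

parent = ds.DistributedSampler(num_shards=10, shard_id=5, num_samples=12)
parent.add_child(ds.SequentialSampler(num_samples=8))
# With both a sampler count (12) and a child count (8) set, the documented result is the smaller value.
print(parent.get_num_samples())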
@ -304,8 +303,8 @@ class PKSampler(BuiltinSampler):
Args:
num_val (int): Number of elements to sample for each class.
num_class (int, optional): Number of classes to sample (default=None, all classes).
shuffle (bool, optional): If true, the class IDs are shuffled (default=False).
class_column (str, optional): Name of column to classify dataset(default='label'), for MindDataset.
shuffle (bool, optional): If True, the class IDs are shuffled (default=False).
class_column (str, optional): Name of column with class labels for MindDataset (default='label').
num_samples (int, optional): The number of samples to draw (default=None, all elements).
Examples:
@ -372,6 +371,7 @@ class PKSampler(BuiltinSampler):
c_sampler.add_child(c_child_sampler)
return c_sampler
class RandomSampler(BuiltinSampler):
"""
Samples the elements randomly.
@ -437,7 +437,7 @@ class SequentialSampler(BuiltinSampler):
Samples the dataset elements sequentially, same as not having a sampler.
Args:
start_index (int, optional): Index to start sampling at. (dafault=None starts at first id)
start_index (int, optional): Index to start sampling at (default=None, start at first ID).
num_samples (int, optional): Number of elements to sample (default=None, all elements).
Examples:


@ -12,10 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module is to support text processing for nlp. It includes two parts:
This module is to support text processing for NLP. It includes two parts:
transforms and utils. transforms is a high performance
nlp text processing module which is developed with icu4c and cppjieba.
utils provides some general methods for nlp text processing.
NLP text processing module which is developed with ICU4C and cppjieba.
utils provides some general methods for NLP text processing.
"""
import platform
from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer, TruncateSequencePair, \


@ -13,35 +13,36 @@
# limitations under the License.
"""
The module text.transforms is inherited from _c_dataengine
which is implemented basing on icu4c and cppjieba in C++.
It's a high performance module to process nlp text.
and is implemented based on ICU4C and cppjieba in C++.
It's a high performance module to process NLP text.
Users can use Vocab to build their own dictionary,
use appropriate tokenizers to split sentences into different tokens,
and use Lookup to find the index of tokens in Vocab.
.. Note::
Constructor's arguments for every class in this module must be saved into the
A constructor's arguments for every class in this module must be saved into the
class attributes (self.xxx) to support save() and load().
Examples:
>>> import mindspore.dataset as ds
>>> import mindspore.dataset.text as text
>>> dataset_file = "path/to/text_file_path"
>>> # sentences as line data saved in a file
>>> dataset = ds.TextFileDataset(dataset_file, shuffle=False)
>>> # tokenize sentence to unicode characters
>>> tokenizer = text.UnicodeCharTokenizer()
>>> # load vocabulary form list
>>> vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您'])
>>> # lookup is an operation for mapping tokens to ids
>>> lookup = text.Lookup(vocab)
>>> dataset = dataset.map(operations=[tokenizer, lookup])
>>> for i in dataset.create_dict_iterator():
>>> print(i)
>>> # if text line in dataset_file is:
>>> # 深圳欢迎您
>>> # then the output will be:
>>> # {'text': array([0, 1, 2, 3, 4], dtype=int32)}
Examples:
>>> import mindspore.dataset as ds
>>> import mindspore.dataset.text as text
>>>
>>> dataset_file = "path/to/text_file_path"
>>> # sentences as line data saved in a file
>>> dataset = ds.TextFileDataset(dataset_file, shuffle=False)
>>> # tokenize sentence to unicode characters
>>> tokenizer = text.UnicodeCharTokenizer()
>>> # load vocabulary from list
>>> vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您'])
>>> # lookup is an operation for mapping tokens to ids
>>> lookup = text.Lookup(vocab)
>>> dataset = dataset.map(operations=[tokenizer, lookup])
>>> for i in dataset.create_dict_iterator():
>>> print(i)
>>> # if text line in dataset_file is:
>>> # 深圳欢迎您
>>> # then the output will be:
>>> # {'text': array([0, 1, 2, 3, 4], dtype=int32)}
"""
import os
import re
@ -64,10 +65,10 @@ class Lookup(cde.LookupOp):
Lookup operator that looks up a word to an id.
Args:
vocab(Vocab): a Vocab object.
unknown_token(str, optional): word to use for lookup if the word being looked up is out of Vocabulary (oov).
If unknown_token is oov, runtime error will be thrown (default=None).
data_type (mindspore.dtype, optional): mindspore.dtype lookup maps string to (default=mstype.int32)
vocab (Vocab): A vocabulary object.
unknown_token (str, optional): Word used for lookup if the word being looked up is out-of-vocabulary (OOV).
If unknown_token is OOV, a runtime error will be thrown (default=None).
data_type (mindspore.dtype, optional): mindspore.dtype that lookup maps string to (default=mstype.int32)
"""
@check_lookup
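A hedged sketch of Lookup with an explicit unknown_token, so that out-of-vocabulary words map to a known id instead of triggering a runtime error; the token list is made up for illustration.

import mindspore.dataset.text as text

vocab = text.Vocab.from_list(["home", "is", "behind", "the", "world", "<unk>"])
lookup = text.Lookup(vocab, unknown_token="<unk>")  # OOV words fall back to the id of "<unk>"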
@ -81,8 +82,8 @@ class SlidingWindow(cde.SlidingWindowOp):
is a slice of data starting at the corresponding position, with a specified width.
Args:
width (int): The width of the window. Must be an integer and greater than zero.
axis (int, optional): The axis along which sliding window is computed (default=0).
width (int): The width of the window. It must be an integer and greater than zero.
axis (int, optional): The axis along which the sliding window is computed (default=0).
Examples:
>>> # Data before
@ -112,18 +113,18 @@ class Ngram(cde.NgramOp):
Refer to https://en.wikipedia.org/wiki/N-gram#Examples for an overview of what n-gram is and how it works.
Args:
n (list[int]): n in n-gram, n >= 1. n is a list of positive integers, for e.g. n=[4,3], The result
would be a 4-gram followed by a 3-gram in the same tensor. If number of words is not enough to make up for
a n-gram, an empty string would be returned. For e.g. 3 grams on ["mindspore","best"] would result in an
empty string be produced.
n (list[int]): n in n-gram, n >= 1. n is a list of positive integers. For example, if n=[4, 3], the result
would be a 4-gram followed by a 3-gram in the same tensor. If the number of words is not enough to make up
an n-gram, an empty string will be returned. For example, 3 grams on ["mindspore", "best"] will result in
an empty string being produced.
left_pad (tuple, optional): ("pad_token", pad_width). Padding performed on the left side of the sequence.
pad_width will be capped at n-1. left_pad=("_", 2) would pad the left side of the sequence with "__" (default=None).
right_pad (tuple, optional): ("pad_token", pad_width). Padding performed on the right side of the sequence.
pad_width will be capped at n-1. right_pad=("-", 2) would pad the right side of the sequence with "--"
(default=None).
separator (str, optional): symbol used to join strings together. for e.g. if 2-gram the ["mindspore", "amazing"]
with separator="-" the result would be ["mindspore-amazing"] (default=None, which means whitespace is
used).
separator (str, optional): Symbol used to join strings together. For example, if 2-gram is applied to
["mindspore", "amazing"] with separator="-", the result would be ["mindspore-amazing"]
(default=None, which means whitespace is used).
"""
@check_ngram
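A hedged sketch of the padding and separator behaviour described above; the input words are illustrative.

import mindspore.dataset.text as text

# 2-grams, left-padded with "_" (pad_width capped at n-1 = 1) and joined with "-".
ngram_op = text.Ngram([2], left_pad=("_", 1), separator="-")
# Applied to ["mindspore", "amazing"], this would produce ["_-mindspore", "mindspore-amazing"].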
@ -143,10 +144,10 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
Tokenize Chinese string into words based on dictionary.
Args:
hmm_path (str): the dictionary file is used by HMMSegment algorithm,
the dictionary can be obtained on the official website of cppjieba.
mp_path (str): the dictionary file is used by MPSegment algorithm,
the dictionary can be obtained on the official website of cppjieba.
hmm_path (str): Dictionary file used by the HMMSegment algorithm.
The dictionary can be obtained on the official website of cppjieba.
mp_path (str): Dictionary file used by the MPSegment algorithm.
The dictionary can be obtained on the official website of cppjieba.
mode (JiebaMode, optional): Valid values can be any of [JiebaMode.MP, JiebaMode.HMM,
JiebaMode.MIX](default=JiebaMode.MIX).
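A hedged sketch of constructing the tokenizer from the two cppjieba dictionary files; the paths are placeholders.

import mindspore.dataset.text as text
from mindspore.dataset.text import JiebaMode

jieba_op = text.JiebaTokenizer("path/to/hmm_model.utf8", "path/to/jieba.dict.utf8",
                               mode=JiebaMode.MP)
# The operator can then be applied with dataset.map(operations=jieba_op).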
@ -188,8 +189,8 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
Args:
word (str): The word to be added to the JiebaTokenizer instance.
The added word will not be written into the built-in dictionary on disk.
freq (int, optional): The frequency of the word to be added, The higher the frequency,
the better change the word will be tokenized(default=None, use default frequency).
freq (int, optional): The frequency of the word to be added. The higher the frequency,
the better the chance that the word will be tokenized (default=None, use default frequency).
"""
if freq is None:
@ -298,11 +299,11 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp):
Tokenize scalar token or 1-D tokens to 1-D subword tokens.
Args:
vocab (Vocab): a Vocab object.
suffix_indicator (str, optional): Used to show that the subword is the last part of a word(default='##').
max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split(default=100).
unknown_token (str, optional): When we can not found the token: if 'unknown_token' is empty string,
return the token directly, else return 'unknown_token'(default='[UNK]').
vocab (Vocab): A vocabulary object.
suffix_indicator (str, optional): Used to show that the subword is the last part of a word (default='##').
max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split (default=100).
unknown_token (str, optional): When a token cannot be found: if 'unknown_token' is empty string,
return the token directly, else return 'unknown_token' (default='[UNK]').
with_offsets (bool, optional): Whether or not to output the offsets of tokens (default=False).
Examples:
@ -347,9 +348,9 @@ class SentencePieceTokenizer(cde.SentencePieceTokenizerOp):
Tokenize scalar token or 1-D tokens to tokens by sentencepiece.
Args:
mode(Union[str, SentencePieceVocab]): If the input parameter is a file, then it is of type string,
if the input parameter is a SentencePieceVocab object, then it is of type SentencePieceVocab.
out_type(Union[str, int]): The type of output.
mode (Union[str, SentencePieceVocab]): If the input parameter is a file, then it is of type string.
If the input parameter is a SentencePieceVocab object, then it is of type SentencePieceVocab.
out_type (Union[str, int]): The type of output.
"""
def __init__(self, mode, out_type):
@ -367,7 +368,7 @@ class SentencePieceTokenizer(cde.SentencePieceTokenizerOp):
if platform.system().lower() != 'windows':
class WhitespaceTokenizer(cde.WhitespaceTokenizerOp):
"""
Tokenize a scalar tensor of UTF-8 string on ICU defined whitespaces(such as: ' ', '\\\\t', '\\\\r', '\\\\n').
Tokenize a scalar tensor of UTF-8 string on ICU4C defined whitespaces, such as: ' ', '\\\\t', '\\\\r', '\\\\n'.
Args:
with_offsets (bool, optional): Whether or not to output the offsets of tokens (default=False).
@ -442,7 +443,7 @@ if platform.system().lower() != 'windows':
normalize_form (NormalizeForm, optional): Valid values can be any of [NormalizeForm.NONE,
NormalizeForm.NFC, NormalizeForm.NFKC, NormalizeForm.NFD,
NormalizeForm.NFKD](default=NormalizeForm.NFKC).
And you can see http://unicode.org/reports/tr15/ for details.
See http://unicode.org/reports/tr15/ for details.
- NormalizeForm.NONE, do nothing for input string tensor.
- NormalizeForm.NFC, normalize with Normalization Form C.
@ -466,10 +467,10 @@ if platform.system().lower() != 'windows':
See http://userguide.icu-project.org/strings/regexp for supported regex patterns.
Args:
pattern(str): the regex expression patterns.
replace(str): the string to replace matched element.
replace_all(bool, optional): If False, only replace first matched element;
if True, replace all matched elements(default=True).
pattern (str): The regex expression pattern.
replace (str): The string to replace the matched element.
replace_all (bool, optional): If False, only replace the first matched element;
if True, replace all matched elements (default=True).
"""
def __init__(self, pattern, replace, replace_all=True):
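A hedged sketch of RegexReplace collapsing runs of whitespace into a single space; the pattern is chosen purely for illustration.

import mindspore.dataset.text as text

# Only available on non-Windows platforms, as guarded above.
replace_op = text.RegexReplace(pattern=r"\s+", replace=" ", replace_all=True)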
@ -486,11 +487,11 @@ if platform.system().lower() != 'windows':
See http://userguide.icu-project.org/strings/regexp for supported regex patterns.
Args:
delim_pattern(str): The pattern of regex delimiters.
delim_pattern (str): The pattern of regex delimiters.
The original string will be split by matched elements.
keep_delim_pattern(str, optional): The string matched by 'delim_pattern' can be kept as a token
if it can be matched by 'keep_delim_pattern'. And the default value is empty str(''),
in this situation, delimiters will not kept as an output token(default='').
keep_delim_pattern (str, optional): The string matched by 'delim_pattern' can be kept as a token
if it can be matched by 'keep_delim_pattern'. The default value is an empty str ('')
which means that delimiters will not be kept as an output token (default='').
with_offsets (bool, optional): Whether or not to output the offsets of tokens (default=False).
Examples:
@ -519,14 +520,14 @@ if platform.system().lower() != 'windows':
Tokenize a scalar tensor of UTF-8 string by specific rules.
Args:
lower_case(bool, optional): If True, apply CaseFold, NormalizeUTF8(NFD mode), RegexReplace operation
on input text to make the text to lower case and strip accents characters; If False, only apply
NormalizeUTF8('normalization_form' mode) operation on input text(default=False).
keep_whitespace(bool, optional): If True, the whitespace will be kept in out tokens(default=False).
normalization_form(NormalizeForm, optional): Used to specify a specific normalize mode,
only effective when 'lower_case' is False. See NormalizeUTF8 for details(default=NormalizeForm.NONE).
preserve_unused_token(bool, optional): If True, do not split special tokens like
'[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]'(default=True).
lower_case (bool, optional): If True, apply CaseFold, NormalizeUTF8 (NFD mode), and RegexReplace operations
on the input text to fold the text to lower case and strip accented characters. If False, only apply
the NormalizeUTF8('normalization_form' mode) operation on the input text (default=False).
keep_whitespace (bool, optional): If True, the whitespace will be kept in out tokens (default=False).
normalization_form (NormalizeForm, optional): Used to specify a specific normalize mode. This is
only effective when 'lower_case' is False. See NormalizeUTF8 for details (default=NormalizeForm.NONE).
preserve_unused_token (bool, optional): If True, do not split special tokens like
'[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True).
with_offsets (bool, optional): Whether or not to output the offsets of tokens (default=False).
Examples:
@ -570,19 +571,19 @@ if platform.system().lower() != 'windows':
Tokenizer used for Bert text processing.
Args:
vocab(Vocab): a Vocab object.
suffix_indicator(str, optional): Used to show that the subword is the last part of a word(default='##').
max_bytes_per_token(int, optional): Tokens exceeding this length will not be further split(default=100).
unknown_token(str, optional): When we can not found the token: if 'unknown_token' is empty string,
vocab (Vocab): A vocabulary object.
suffix_indicator (str, optional): Used to show that the subword is the last part of a word (default='##').
max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split (default=100).
unknown_token (str, optional): When a token cannot be found: if 'unknown_token' is empty string,
return the token directly, else return 'unknown_token' (default='[UNK]').
lower_case(bool, optional): If True, apply CaseFold, NormalizeUTF8(NFD mode), RegexReplace operation
on input text to make the text to lower case and strip accents characters; If False, only apply
NormalizeUTF8('normalization_form' mode) operation on input text(default=False).
keep_whitespace(bool, optional): If True, the whitespace will be kept in out tokens(default=False).
normalization_form(NormalizeForm, optional): Used to specify a specific normlaize mode,
only effective when 'lower_case' is False. See NormalizeUTF8 for details(default='NONE').
preserve_unused_token(bool, optional): If True, do not split special tokens like
'[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]'(default=True).
lower_case (bool, optional): If True, apply CaseFold, NormalizeUTF8(NFD mode), RegexReplace operation
on input text to fold the text to lower case and strip accented characters. If False, only apply
NormalizeUTF8('normalization_form' mode) operation on input text (default=False).
keep_whitespace (bool, optional): If True, the whitespace will be kept in out tokens (default=False).
normalization_form (NormalizeForm, optional): Used to specify a specific normalize mode,
only effective when 'lower_case' is False. See NormalizeUTF8 for details (default='NONE').
preserve_unused_token (bool, optional): If True, do not split special tokens like
'[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True).
with_offsets (bool, optional): Whether or not to output the offsets of tokens (default=False).
Examples:
@ -632,7 +633,7 @@ class TruncateSequencePair(cde.TruncateSequencePairOp):
This operation takes two input tensors and returns two output tensors.
Args:
max_length(int): Maximum length required.
max_length (int): Maximum length required.
Examples:
>>> # Data before
@ -660,7 +661,7 @@ class ToNumber(cde.ToNumberOp):
Strings are cast according to the rules specified in the following links:
https://en.cppreference.com/w/cpp/string/basic_string/stof,
https://en.cppreference.com/w/cpp/string/basic_string/stoul,
except that any strings which represent negative numbers cannot be casted to an
except that any strings which represent negative numbers cannot be cast to an
unsigned integer type.
Args:
@ -681,6 +682,7 @@ class ToNumber(cde.ToNumberOp):
class PythonTokenizer:
"""
Callable class to be used for user-defined string tokenizer.
Args:
tokenizer (Callable): Python function that takes a `str` and returns a list of `str` as tokens.
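A hedged sketch of wrapping a plain Python function as a tokenizer; the function itself is a made-up example.

import mindspore.dataset.text as text

def whitespace_split(line):
    # Takes a str and returns a list of str tokens, as required above.
    return line.split()

tokenizer_op = text.PythonTokenizer(whitespace_split)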


@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The module text.utils provides some general methods for nlp text processing.
The module text.utils provides some general methods for NLP text processing.
For example, you can use Vocab to build a dictionary,
use to_bytes and to_str to encode and decode strings into a specified format.
"""


@ -131,7 +131,7 @@ def test_cv_minddataset_pk_sample_error_class_column():
create_cv_mindrecord(1)
columns_list = ["data", "file_name", "label"]
num_readers = 4
sampler = ds.PKSampler(5, None, True, 'no_exsit_column')
sampler = ds.PKSampler(5, None, True, 'no_exist_column')
with pytest.raises(Exception, match="MindRecordOp launch failed"):
data_set = ds.MindDataset(CV_FILE_NAME, columns_list, num_readers, sampler=sampler)
num_iter = 0