forked from mindspore-Ecosystem/mindspore
dataset API docstring: Update datasets, samplers, graphdata and text
parent d0e49c5cf8
commit f7adf648e9
@@ -34,29 +34,36 @@ class GraphData:
    Reads the graph dataset used for GNN training from the shared file and database.

    Args:
-       dataset_file (str): One of file names in dataset.
-       num_parallel_workers (int, optional): Number of workers to process the Dataset in parallel
+       dataset_file (str): One of file names in the dataset.
+       num_parallel_workers (int, optional): Number of workers to process the dataset in parallel
            (default=None).
-       working_mode (str, optional): Set working mode, now support 'local'/'client'/'server' (default='local').
+       working_mode (str, optional): Set working mode, now supports 'local'/'client'/'server' (default='local').

            - 'local', used in non-distributed training scenarios.

-           - 'client', used in distributed training scenarios, the client does not load data,
+           - 'client', used in distributed training scenarios. The client does not load data,
              but obtains data from the server.

-           - 'server', used in distributed training scenarios, the server loads the data
+           - 'server', used in distributed training scenarios. The server loads the data
              and is available to the client.

-       hostname (str, optional): Valid when working_mode is set to 'client' or 'server',
-           set the hostname of the graph data server (default='127.0.0.1').
-       port (int, optional): Valid when working_mode is set to 'client' or 'server',
-           set the port of the graph data server, the range is 1024-65535 (default=50051).
-       num_client (int, optional): Valid when working_mode is set to 'server',
-           set the number of clients expected to connect, and the server will allocate corresponding
-           resources according to this parameter (default=1).
+       hostname (str, optional): Hostname of the graph data server. This parameter is only valid when
+           working_mode is set to 'client' or 'server' (default='127.0.0.1').
+       port (int, optional): Port of the graph data server. The range is 1024-65535. This parameter is
+           only valid when working_mode is set to 'client' or 'server' (default=50051).
+       num_client (int, optional): Maximum number of clients expected to connect to the server. The server will
+           allocate resources according to this parameter. This parameter is only valid when working_mode
+           is set to 'server' (default=1).
        auto_shutdown (bool, optional): Valid when working_mode is set to 'server',
            when the number of connected clients reaches num_client and no client is being connected,
            the server automatically exits (default=True).

    Examples:
        >>> import mindspore.dataset as ds
        >>>
        >>> data_graph = ds.GraphData('dataset_file', 2)
        >>> nodes = data_graph.get_all_nodes(0)
        >>> features = data_graph.get_node_feature(nodes, [1])
    """

    @check_gnn_graphdata
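The reworked Args section above spells out the client and server roles. As a rough sketch of how the two modes could be combined in a distributed job (not part of the diff; the file name, hostname, and port are the illustrative defaults from the docstring):

>>> import mindspore.dataset as ds
>>>
>>> # On the data-hosting machine, start a graph data server for up to 8 clients.
>>> # With auto_shutdown=True it exits once all clients have connected and disconnected.
>>> data_graph = ds.GraphData('dataset_file', working_mode='server', hostname='127.0.0.1', port=50051, num_client=8)
>>>
>>> # On each training worker, connect as a client and fetch data from the server.
>>> data_graph = ds.GraphData('dataset_file', working_mode='client', hostname='127.0.0.1', port=50051)
>>> nodes = data_graph.get_all_nodes(0)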
@@ -94,10 +101,11 @@ class GraphData:
        node_type (int): Specify the type of node.

    Returns:
-       numpy.ndarray: array of nodes.
+       numpy.ndarray: Array of nodes.

    Examples:
        >>> import mindspore.dataset as ds
        >>>
        >>> data_graph = ds.GraphData('dataset_file', 2)
        >>> nodes = data_graph.get_all_nodes(0)
@@ -121,6 +129,7 @@ class GraphData:

    Examples:
        >>> import mindspore.dataset as ds
        >>>
        >>> data_graph = ds.GraphData('dataset_file', 2)
        >>> nodes = data_graph.get_all_edges(0)
@@ -140,7 +149,7 @@ class GraphData:
        edge_list (Union[list, numpy.ndarray]): The given list of edges.

    Returns:
-       numpy.ndarray: array of nodes.
+       numpy.ndarray: Array of nodes.

    Raises:
        TypeError: If `edge_list` is not list or ndarray.
@@ -159,10 +168,11 @@ class GraphData:
        neighbor_type (int): Specify the type of neighbor.

    Returns:
-       numpy.ndarray: array of nodes.
+       numpy.ndarray: Array of nodes.

    Examples:
        >>> import mindspore.dataset as ds
        >>>
        >>> data_graph = ds.GraphData('dataset_file', 2)
        >>> nodes = data_graph.get_all_nodes(0)
        >>> neighbors = data_graph.get_all_neighbors(nodes, 0)
@@ -192,13 +202,14 @@ class GraphData:
        neighbor_types (Union[list, numpy.ndarray]): Neighbor type sampled per hop.

    Returns:
-       numpy.ndarray: array of nodes.
+       numpy.ndarray: Array of nodes.

    Examples:
        >>> import mindspore.dataset as ds
        >>>
        >>> data_graph = ds.GraphData('dataset_file', 2)
        >>> nodes = data_graph.get_all_nodes(0)
-       >>> neighbors = data_graph.get_all_neighbors(nodes, [2, 2], [0, 0])
+       >>> neighbors = data_graph.get_sampled_neighbors(nodes, [2, 2], [0, 0])

    Raises:
        TypeError: If `node_list` is not list or ndarray.
@@ -221,10 +232,11 @@ class GraphData:
        neg_neighbor_type (int): Specify the type of negative neighbor.

    Returns:
-       numpy.ndarray: array of nodes.
+       numpy.ndarray: Array of nodes.

    Examples:
        >>> import mindspore.dataset as ds
        >>>
        >>> data_graph = ds.GraphData('dataset_file', 2)
        >>> nodes = data_graph.get_all_nodes(0)
        >>> neg_neighbors = data_graph.get_neg_sampled_neighbors(nodes, 5, 0)
@@ -253,6 +265,7 @@ class GraphData:

    Examples:
        >>> import mindspore.dataset as ds
        >>>
        >>> data_graph = ds.GraphData('dataset_file', 2)
        >>> nodes = data_graph.get_all_nodes(0)
        >>> features = data_graph.get_node_feature(nodes, [1])
@@ -284,6 +297,7 @@ class GraphData:

    Examples:
        >>> import mindspore.dataset as ds
        >>>
        >>> data_graph = ds.GraphData('dataset_file', 2)
        >>> edges = data_graph.get_all_edges(0)
        >>> features = data_graph.get_edge_feature(edges, [1])
@@ -334,10 +348,11 @@ class GraphData:
            A default value of -1 indicates that no node is given.

    Returns:
-       numpy.ndarray: array of nodes.
+       numpy.ndarray: Array of nodes.

    Examples:
        >>> import mindspore.dataset as ds
        >>>
        >>> data_graph = ds.GraphData('dataset_file', 2)
        >>> nodes = data_graph.random_walk([1,2], [1,2,1,2,1])
@@ -13,10 +13,10 @@
# limitations under the License.
# ==============================================================================
"""
-Sampler module provides several samplers to generate sampling data from dataset.
-There are following samplers: DistributedSampler, PKSampler, RandomSampler,
-SequentialSampler, SubsetRandomSampler, WeightedRandomSampler.
-User can also define custom sampler by extending from Sampler class.
+The sampler module provides several samplers to generate data from datasets.
+The provided samplers include: DistributedSampler, PKSampler, RandomSampler,
+SequentialSampler, SubsetRandomSampler, and WeightedRandomSampler.
+Users can also define a custom sampler by extending from the Sampler class.
"""

import numpy as np
@@ -26,9 +26,9 @@ import mindspore.dataset as ds
class Sampler:
    """
    Base class for user defined sampler.
-   User defined sampler can be used with any existing dataset with sampler support.
+   A user defined sampler can be used with any existing dataset with sampler support.

-   An required _iter_() method should by overridden by user for sample index generation.
+   A required _iter_() method should be overridden by the user for sample index generation.
    An optional reset() method can be overridden for per repeat reset,

    dataset_size and num_samples will be set by dataset once a dataset iterator is created.
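A minimal sketch of such a user-defined sampler (illustrative only, not part of the diff): subclass Sampler and yield sample indices from __iter__.

>>> import mindspore.dataset as ds
>>>
>>> class EveryOtherSampler(ds.Sampler):
...     def __iter__(self):
...         # dataset_size is filled in by the dataset once an iterator is created
...         for i in range(0, self.dataset_size, 2):
...             yield i
>>>
>>> # Usable with any dataset that supports samplers, e.g.
>>> # data = ds.ImageFolderDataset(dataset_dir, sampler=EveryOtherSampler())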
@@ -52,8 +52,7 @@ class Sampler:
    def __iter__(self):
        """
        User defined iterator, must be overridden.
-       _handshake is guaranteed to be called prior to iterator construction
-
+       _handshake is guaranteed to be called prior to iterator construction.
        """
        raise NotImplementedError
@@ -160,10 +159,10 @@ class BuiltinSampler:
    def get_num_samples(self):
        """
-       All samplers can contain a numeric num_samples value (or it could be set to None).
-       Child sampler can exist or be None.
-       if child sampler exists, then the child sampler count can be a numeric value or None.
-       Given these conditions, we need to output what the sampler count is for this sampler.
+       All samplers can contain a numeric num_samples value (or it can be set to None).
+       A child sampler can exist or be None.
+       If a child sampler exists, then the child sampler count can be a numeric value or None.
+       These conditions impact the resultant sampler count that is used.
        The following table shows the possible results from calling this function.

        .. list-table::
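The table referenced above spells out how a sampler's own num_samples combines with a child sampler's count. As a rough usage sketch (not from the diff; the sample counts are illustrative):

>>> import mindspore.dataset as ds
>>>
>>> sampler = ds.SequentialSampler(num_samples=8)
>>> sampler.add_child(ds.RandomSampler(num_samples=4))
>>> # Reports the resultant count according to the table above.
>>> print(sampler.get_num_samples())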
@@ -217,20 +216,20 @@ class BuiltinSampler:
class DistributedSampler(BuiltinSampler):
    """
-   Sampler that access a shard of the dataset.
+   A sampler that accesses a shard of the dataset.

    Args:
        num_shards (int): Number of shards to divide the dataset into.
        shard_id (int): Shard ID of the current shard within num_shards.
-       shuffle (bool, optional): If true, the indices are shuffled (default=True).
+       shuffle (bool, optional): If True, the indices are shuffled (default=True).
        num_samples (int, optional): The number of samples to draw (default=None, all elements).
-       offset(int, optional): Offset from shard when the element of dataset is allocated
+       offset(int, optional): Offset from shard when the element of dataset is allocated (default=-1).
    Examples:
        >>> import mindspore.dataset as ds
        >>>
        >>> dataset_dir = "path/to/imagefolder_directory"
        >>>
-       >>> # creates a distributed sampler with 10 shards total. This shard is shard 5
+       >>> # creates a distributed sampler with 10 shards in total. This shard is shard 5.
        >>> sampler = ds.DistributedSampler(10, 5)
        >>> data = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=8, sampler=sampler)
@@ -304,8 +303,8 @@ class PKSampler(BuiltinSampler):
    Args:
        num_val (int): Number of elements to sample for each class.
        num_class (int, optional): Number of classes to sample (default=None, all classes).
-       shuffle (bool, optional): If true, the class IDs are shuffled (default=False).
-       class_column (str, optional): Name of column to classify dataset(default='label'), for MindDataset.
+       shuffle (bool, optional): If True, the class IDs are shuffled (default=False).
+       class_column (str, optional): Name of column with class labels for MindDataset (default='label').
        num_samples (int, optional): The number of samples to draw (default=None, all elements).

    Examples:
@@ -372,6 +371,7 @@ class PKSampler(BuiltinSampler):
        c_sampler.add_child(c_child_sampler)
        return c_sampler


class RandomSampler(BuiltinSampler):
    """
    Samples the elements randomly.
@@ -437,7 +437,7 @@ class SequentialSampler(BuiltinSampler):
    Samples the dataset elements sequentially, same as not having a sampler.

    Args:
-       start_index (int, optional): Index to start sampling at. (dafault=None starts at first id)
+       start_index (int, optional): Index to start sampling at. (default=None, start at first ID)
        num_samples (int, optional): Number of elements to sample (default=None, all elements).

    Examples:
@@ -12,10 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
-This module is to support text processing for nlp. It includes two parts:
+This module is to support text processing for NLP. It includes two parts:
transforms and utils. transforms is a high performance
-nlp text processing module which is developed with icu4c and cppjieba.
-utils provides some general methods for nlp text processing.
+NLP text processing module which is developed with ICU4C and cppjieba.
+utils provides some general methods for NLP text processing.
"""
import platform
from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer, TruncateSequencePair, \
@@ -13,19 +13,20 @@
# limitations under the License.
"""
The module text.transforms is inheritted from _c_dataengine
-which is implemented basing on icu4c and cppjieba in C++.
-It's a high performance module to process nlp text.
+and is implemented based on ICU4C and cppjieba in C++.
+It's a high performance module to process NLP text.
Users can use Vocab to build their own dictionary,
use appropriate tokenizers to split sentences into different tokens,
and use Lookup to find the index of tokens in Vocab.

.. Note::
-   Constructor's arguments for every class in this module must be saved into the
+   A constructor's arguments for every class in this module must be saved into the
    class attributes (self.xxx) to support save() and load().

Examples:
    >>> import mindspore.dataset as ds
    >>> import mindspore.dataset.text as text
    >>>
    >>> dataset_file = "path/to/text_file_path"
    >>> # sentences as line data saved in a file
    >>> dataset = ds.TextFileDataset(dataset_file, shuffle=False)
@@ -64,10 +65,10 @@ class Lookup(cde.LookupOp):
    Lookup operator that looks up a word to an id.

    Args:
-       vocab(Vocab): a Vocab object.
-       unknown_token(str, optional): word to use for lookup if the word being looked up is out of Vocabulary (oov).
-           If unknown_token is oov, runtime error will be thrown (default=None).
-       data_type (mindspore.dtype, optional): mindspore.dtype lookup maps string to (default=mstype.int32)
+       vocab (Vocab): A vocabulary object.
+       unknown_token (str, optional): Word used for lookup if the word being looked up is out-of-vocabulary (OOV).
+           If unknown_token is OOV, a runtime error will be thrown (default=None).
+       data_type (mindspore.dtype, optional): mindspore.dtype that lookup maps string to (default=mstype.int32)
    """

    @check_lookup
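For a concrete picture of the arguments above, a small sketch (not part of the diff; the vocabulary words are made up):

>>> import mindspore.dataset.text as text
>>>
>>> # Build a small in-memory vocabulary and map each token to its id.
>>> vocab = text.Vocab.from_list(["home", "behind", "the", "world", "<unk>"])
>>> lookup = text.Lookup(vocab, unknown_token="<unk>")
>>> dataset = dataset.map(operations=lookup)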
@@ -81,8 +82,8 @@ class SlidingWindow(cde.SlidingWindowOp):
    is a slice of data starting at the corresponding position, with a specified width.

    Args:
-       width (int): The width of the window. Must be an integer and greater than zero.
-       axis (int, optional): The axis along which sliding window is computed (default=0).
+       width (int): The width of the window. It must be an integer and greater than zero.
+       axis (int, optional): The axis along which the sliding window is computed (default=0).

    Examples:
        >>> # Data before
@@ -112,18 +113,18 @@ class Ngram(cde.NgramOp):
    Refer to https://en.wikipedia.org/wiki/N-gram#Examples for an overview of what n-gram is and how it works.

    Args:
-       n (list[int]): n in n-gram, n >= 1. n is a list of positive integers, for e.g. n=[4,3], The result
-           would be a 4-gram followed by a 3-gram in the same tensor. If number of words is not enough to make up for
-           a n-gram, an empty string would be returned. For e.g. 3 grams on ["mindspore","best"] would result in an
-           empty string be produced.
+       n (list[int]): n in n-gram, n >= 1. n is a list of positive integers. For example, if n=[4,3], then the result
+           would be a 4-gram followed by a 3-gram in the same tensor. If the number of words is not enough to make up
+           for a n-gram, an empty string will be returned. For example, 3 grams on ["mindspore","best"] will result in
+           an empty string produced.
        left_pad (tuple, optional): ("pad_token", pad_width). Padding performed on left side of the sequence. pad_width
            will be capped at n-1. left_pad=("_",2) would pad left side of the sequence with "__" (default=None).
        right_pad (tuple, optional): ("pad_token", pad_width). Padding performed on right side of the sequence.
            pad_width will be capped at n-1. right_pad=("-":2) would pad right side of the sequence with "--"
            (default=None).
-       separator (str, optional): symbol used to join strings together. for e.g. if 2-gram the ["mindspore", "amazing"]
-           with separator="-" the result would be ["mindspore-amazing"] (default=None, which means whitespace is
-           used).
+       separator (str, optional): symbol used to join strings together. For example, if 2-gram is
+           ["mindspore", "amazing"] with separator="-", the result would be ["mindspore-amazing"]
+           (default=None, which means whitespace is used).
    """

    @check_ngram
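Putting the arguments above together, a brief sketch (not from the diff; the pad token and separator are illustrative):

>>> import mindspore.dataset.text as text
>>>
>>> # Produce 2-grams joined by a hyphen, padding the left edge with one "_" token.
>>> ngram_op = text.Ngram([2], left_pad=("_", 1), separator="-")
>>> # Applied to ["mindspore", "amazing"] this yields ["_-mindspore", "mindspore-amazing"].
>>> dataset = dataset.map(operations=ngram_op)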
@@ -143,10 +144,10 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
    Tokenize Chinese string into words based on dictionary.

    Args:
-       hmm_path (str): the dictionary file is used by HMMSegment algorithm,
-           the dictionary can be obtained on the official website of cppjieba.
-       mp_path (str): the dictionary file is used by MPSegment algorithm,
-           the dictionary can be obtained on the official website of cppjieba.
+       hmm_path (str): Dictionary file is used by HMMSegment algorithm.
+           The dictionary can be obtained on the official website of cppjieba.
+       mp_path (str): Dictionary file is used by MPSegment algorithm.
+           The dictionary can be obtained on the official website of cppjieba.
        mode (JiebaMode, optional): Valid values can be any of [JiebaMode.MP, JiebaMode.HMM,
            JiebaMode.MIX](default=JiebaMode.MIX).
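A hedged construction sketch for the two dictionary files described above (the paths are placeholders, not from the diff):

>>> import mindspore.dataset.text as text
>>> from mindspore.dataset.text import JiebaMode
>>>
>>> # hmm_path and mp_path point to the cppjieba dictionary files on disk.
>>> jieba_op = text.JiebaTokenizer("path/to/hmm_model.utf8", "path/to/jieba.dict.utf8", mode=JiebaMode.MIX)
>>> dataset = dataset.map(operations=jieba_op)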
@@ -188,8 +189,8 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
        Args:
            word (str): The word to be added to the JiebaTokenizer instance.
                The added word will not be written into the built-in dictionary on disk.
-           freq (int, optional): The frequency of the word to be added, The higher the frequency,
-               the better change the word will be tokenized(default=None, use default frequency).
+           freq (int, optional): The frequency of the word to be added. The higher the frequency,
+               the better chance the word will be tokenized (default=None, use default frequency).
        """

        if freq is None:
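Assuming this hunk documents JiebaTokenizer's add_word method (the method name is not visible in the hunk), a short usage sketch continuing the jieba_op from the previous example; the word and frequency are illustrative:

>>> # Register a custom word so it is kept as one token; a higher freq raises its priority.
>>> jieba_op.add_word("MindSpore", freq=100)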
@@ -298,11 +299,11 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp):
    Tokenize scalar token or 1-D tokens to 1-D subword tokens.

    Args:
-       vocab (Vocab): a Vocab object.
-       suffix_indicator (str, optional): Used to show that the subword is the last part of a word(default='##').
-       max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split(default=100).
-       unknown_token (str, optional): When we can not found the token: if 'unknown_token' is empty string,
-           return the token directly, else return 'unknown_token'(default='[UNK]').
+       vocab (Vocab): A vocabulary object.
+       suffix_indicator (str, optional): Used to show that the subword is the last part of a word (default='##').
+       max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split (default=100).
+       unknown_token (str, optional): When a token cannot be found: if 'unknown_token' is empty string,
+           return the token directly, else return 'unknown_token' (default='[UNK]').
        with_offsets (bool, optional): If or not output offsets of tokens (default=False).

    Examples:
@@ -347,9 +348,9 @@ class SentencePieceTokenizer(cde.SentencePieceTokenizerOp):
    Tokenize scalar token or 1-D tokens to tokens by sentencepiece.

    Args:
-       mode(Union[str, SentencePieceVocab]): If the input parameter is a file, then it is of type string,
-           if the input parameter is a SentencePieceVocab object, then it is of type SentencePieceVocab.
-       out_type(Union[str, int]): The type of output.
+       mode (Union[str, SentencePieceVocab]): If the input parameter is a file, then it is of type string.
+           If the input parameter is a SentencePieceVocab object, then it is of type SentencePieceVocab.
+       out_type (Union[str, int]): The type of output.
    """

    def __init__(self, mode, out_type):
@@ -367,7 +368,7 @@ class SentencePieceTokenizer(cde.SentencePieceTokenizerOp):
if platform.system().lower() != 'windows':
    class WhitespaceTokenizer(cde.WhitespaceTokenizerOp):
        """
-       Tokenize a scalar tensor of UTF-8 string on ICU defined whitespaces(such as: ' ', '\\\\t', '\\\\r', '\\\\n').
+       Tokenize a scalar tensor of UTF-8 string on ICU4C defined whitespaces, such as: ' ', '\\\\t', '\\\\r', '\\\\n'.

        Args:
            with_offsets (bool, optional): If or not output offsets of tokens (default=False).
@@ -442,7 +443,7 @@ if platform.system().lower() != 'windows':
        normalize_form (NormalizeForm, optional): Valid values can be any of [NormalizeForm.NONE,
            NormalizeForm.NFC, NormalizeForm.NFKC, NormalizeForm.NFD,
            NormalizeForm.NFKD](default=NormalizeForm.NFKC).
-           And you can see http://unicode.org/reports/tr15/ for details.
+           See http://unicode.org/reports/tr15/ for details.

            - NormalizeForm.NONE, do nothing for input string tensor.
            - NormalizeForm.NFC, normalize with Normalization Form C.
@@ -466,10 +467,10 @@ if platform.system().lower() != 'windows':
        See http://userguide.icu-project.org/strings/regexp for support regex pattern.

        Args:
-           pattern(str): the regex expression patterns.
-           replace(str): the string to replace matched element.
-           replace_all(bool, optional): If False, only replace first matched element;
-               if True, replace all matched elements(default=True).
+           pattern (str): the regex expression patterns.
+           replace (str): the string to replace matched element.
+           replace_all (bool, optional): If False, only replace first matched element;
+               if True, replace all matched elements (default=True).
        """

        def __init__(self, pattern, replace, replace_all=True):
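A quick sketch of the arguments just listed (the pattern and replacement are examples only; per the surrounding code this operator is only available on non-Windows platforms):

>>> import mindspore.dataset.text as text
>>>
>>> # Collapse runs of whitespace into a single space in every text sample.
>>> replace_op = text.RegexReplace(pattern=r'\s+', replace=' ', replace_all=True)
>>> dataset = dataset.map(operations=replace_op)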
@@ -486,11 +487,11 @@ if platform.system().lower() != 'windows':
        See http://userguide.icu-project.org/strings/regexp for support regex pattern.

        Args:
-           delim_pattern(str): The pattern of regex delimiters.
+           delim_pattern (str): The pattern of regex delimiters.
                The original string will be split by matched elements.
-           keep_delim_pattern(str, optional): The string matched by 'delim_pattern' can be kept as a token
-               if it can be matched by 'keep_delim_pattern'. And the default value is empty str(''),
-               in this situation, delimiters will not kept as an output token(default='').
+           keep_delim_pattern (str, optional): The string matched by 'delim_pattern' can be kept as a token
+               if it can be matched by 'keep_delim_pattern'. The default value is an empty str ('')
+               which means that delimiters will not be kept as an output token (default='').
            with_offsets (bool, optional): If or not output offsets of tokens (default=False).

        Examples:
@@ -519,14 +520,14 @@ if platform.system().lower() != 'windows':
        Tokenize a scalar tensor of UTF-8 string by specific rules.

        Args:
-           lower_case(bool, optional): If True, apply CaseFold, NormalizeUTF8(NFD mode), RegexReplace operation
-               on input text to make the text to lower case and strip accents characters; If False, only apply
-               NormalizeUTF8('normalization_form' mode) operation on input text(default=False).
-           keep_whitespace(bool, optional): If True, the whitespace will be kept in out tokens(default=False).
-           normalization_form(NormalizeForm, optional): Used to specify a specific normalize mode,
-               only effective when 'lower_case' is False. See NormalizeUTF8 for details(default=NormalizeForm.NONE).
-           preserve_unused_token(bool, optional): If True, do not split special tokens like
-               '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]'(default=True).
+           lower_case (bool, optional): If True, apply CaseFold, NormalizeUTF8(NFD mode), RegexReplace operation
+               on input text to fold the text to lower case and strip accents characters. If False, only apply
+               NormalizeUTF8('normalization_form' mode) operation on input text (default=False).
+           keep_whitespace (bool, optional): If True, the whitespace will be kept in out tokens (default=False).
+           normalization_form (NormalizeForm, optional): Used to specify a specific normalize mode. This is
+               only effective when 'lower_case' is False. See NormalizeUTF8 for details (default=NormalizeForm.NONE).
+           preserve_unused_token (bool, optional): If True, do not split special tokens like
+               '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True).
            with_offsets (bool, optional): If or not output offsets of tokens (default=False).

        Examples:
@@ -570,19 +571,19 @@ if platform.system().lower() != 'windows':
        Tokenizer used for Bert text process.

        Args:
-           vocab(Vocab): a Vocab object.
-           suffix_indicator(str, optional): Used to show that the subword is the last part of a word(default='##').
-           max_bytes_per_token(int, optional): Tokens exceeding this length will not be further split(default=100).
-           unknown_token(str, optional): When we can not found the token: if 'unknown_token' is empty string,
+           vocab (Vocab): A vocabulary object.
+           suffix_indicator (str, optional): Used to show that the subword is the last part of a word (default='##').
+           max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split (default=100).
+           unknown_token (str, optional): When a token cannot be found: if 'unknown_token' is empty string,
                return the token directly, else return 'unknown_token'(default='[UNK]').
-           lower_case(bool, optional): If True, apply CaseFold, NormalizeUTF8(NFD mode), RegexReplace operation
-               on input text to make the text to lower case and strip accents characters; If False, only apply
-               NormalizeUTF8('normalization_form' mode) operation on input text(default=False).
-           keep_whitespace(bool, optional): If True, the whitespace will be kept in out tokens(default=False).
-           normalization_form(NormalizeForm, optional): Used to specify a specific normlaize mode,
-               only effective when 'lower_case' is False. See NormalizeUTF8 for details(default='NONE').
-           preserve_unused_token(bool, optional): If True, do not split special tokens like
-               '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]'(default=True).
+           lower_case (bool, optional): If True, apply CaseFold, NormalizeUTF8(NFD mode), RegexReplace operation
+               on input text to fold the text to lower case and strip accented characters. If False, only apply
+               NormalizeUTF8('normalization_form' mode) operation on input text (default=False).
+           keep_whitespace (bool, optional): If True, the whitespace will be kept in out tokens (default=False).
+           normalization_form (NormalizeForm, optional): Used to specify a specific normalize mode,
+               only effective when 'lower_case' is False. See NormalizeUTF8 for details (default='NONE').
+           preserve_unused_token (bool, optional): If True, do not split special tokens like
+               '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True).
            with_offsets (bool, optional): If or not output offsets of tokens (default=False).

        Examples:
@@ -632,7 +633,7 @@ class TruncateSequencePair(cde.TruncateSequencePairOp):
    This operation takes two input tensors and returns two output Tenors.

    Args:
-       max_length(int): Maximum length required.
+       max_length (int): Maximum length required.

    Examples:
        >>> # Data before
@@ -660,7 +661,7 @@ class ToNumber(cde.ToNumberOp):
    Strings are casted according to the rules specified in the following links:
    https://en.cppreference.com/w/cpp/string/basic_string/stof,
    https://en.cppreference.com/w/cpp/string/basic_string/stoul,
-   except that any strings which represent negative numbers cannot be casted to an
+   except that any strings which represent negative numbers cannot be cast to an
    unsigned integer type.

    Args:
@@ -681,6 +682,7 @@ class ToNumber(cde.ToNumberOp):
class PythonTokenizer:
    """
    Callable class to be used for user-defined string tokenizer.

    Args:
        tokenizer (Callable): Python function that takes a `str` and returns a list of `str` as tokens.
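Only the Args entry is visible in this hunk, so as a hedged sketch of how such a callable could be plugged in (the splitting rule is illustrative):

>>> import mindspore.dataset.text as text
>>>
>>> def my_tokenizer(line):
...     # Any Python function mapping str -> list of str works here.
...     return line.split()
>>>
>>> tokenize_op = text.PythonTokenizer(my_tokenizer)
>>> dataset = dataset.map(operations=tokenize_op)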
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
-The module text.utils provides some general methods for nlp text processing.
+The module text.utils provides some general methods for NLP text processing.
For example, you can use Vocab to build a dictionary,
use to_bytes and to_str to encode and decode strings into a specified format.
"""
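A brief sketch of the utilities named in this docstring (not part of the diff; the word list is made up):

>>> import numpy as np
>>> from mindspore.dataset.text import Vocab, to_str
>>>
>>> # Build a dictionary from a word list and decode a bytes array back to str.
>>> vocab = Vocab.from_list(["hello", "world"])
>>> decoded = to_str(np.array([b"hello", b"world"]))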
@@ -131,7 +131,7 @@ def test_cv_minddataset_pk_sample_error_class_column():
    create_cv_mindrecord(1)
    columns_list = ["data", "file_name", "label"]
    num_readers = 4
-   sampler = ds.PKSampler(5, None, True, 'no_exsit_column')
+   sampler = ds.PKSampler(5, None, True, 'no_exist_column')
    with pytest.raises(Exception, match="MindRecordOp launch failed"):
        data_set = ds.MindDataset(CV_FILE_NAME, columns_list, num_readers, sampler=sampler)
        num_iter = 0