forked from mindspore-Ecosystem/mindspore

!2258 address some review comments on doc
Merge pull request !2258 from ZiruiWu/fix_api_doc
commit ef08dc0d21
@@ -141,6 +141,11 @@ Status BuildVocabOp::CollectorThread() {
     }
   }
   int64_t num_words = std::min(static_cast<int64_t>(words.size()), top_k_);
+  if (num_words == 0) {
+    MS_LOG(WARNING) << "No word falls in the frequency range: (" << freq_range_.first << "," << freq_range_.second
+                    << ") vocab would be empty (except for special tokens).";
+  }
+
   // this would take the top-k most frequent words
   std::partial_sort(words.begin(), words.begin() + num_words, words.end(),
                     [this](const std::string &w1, const std::string &w2) {
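Note on the added warning: it fires when freq_range filters out every collected word. A minimal Python-side sketch of a pipeline that reaches this branch (illustrative only, not part of this diff; it assumes the text.Vocab.from_dataset API touched later in this PR):

import numpy as np
import mindspore.dataset as ds
import mindspore.dataset.text as text

def gen():
    # two short rows; no word occurs 10 or more times
    for line in ["home sweet home", "best of the best"]:
        yield (np.array(line.split(" "), dtype='S'),)

data = ds.GeneratorDataset(gen(), column_names=["text"])
# freq_range=(10, 20) keeps no word, so CollectorThread logs the new
# "vocab would be empty (except for special tokens)" warning
vocab = text.Vocab.from_dataset(data, columns=["text"], freq_range=(10, 20))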
@@ -323,9 +323,7 @@ Status ImageFolderOp::PrescanWorkerEntry(int32_t worker_id) {
 // if mRecursive == false, don't go into folder of folders
 Status ImageFolderOp::RecursiveWalkFolder(Path *dir) {
   std::shared_ptr<Path::DirIterator> dir_itr = Path::DirIterator::OpenDirectory(dir);
-  if (dir_itr == nullptr) {
-    RETURN_STATUS_UNEXPECTED("Error encountered when indexing files");
-  }
+  RETURN_UNEXPECTED_IF_NULL(dir_itr);
   while (dir_itr->hasNext()) {
     Path subdir = dir_itr->next();
     if (subdir.IsDirectory()) {
@@ -32,6 +32,5 @@ __all__ = ["config", "ConfigurationManager", "zip",
            "ImageFolderDatasetV2", "MnistDataset",
            "MindDataset", "GeneratorDataset", "TFRecordDataset", "CLUEDataset",
            "ManifestDataset", "Cifar10Dataset", "Cifar100Dataset", "CelebADataset",
-           "VOCDataset", "CocoDataset", "TextFileDataset", "BuildVocabDataset", "Schema", "Schema",
-           "DistributedSampler", "PKSampler",
-           "RandomSampler", "SequentialSampler", "SubsetRandomSampler", "WeightedRandomSampler"]
+           "VOCDataset", "CocoDataset", "TextFileDataset", "Schema", "DistributedSampler",
+           "PKSampler", "RandomSampler", "SequentialSampler", "SubsetRandomSampler", "WeightedRandomSampler"]
@@ -42,8 +42,8 @@ from .iterators import DictIterator, TupleIterator
 from .validators import check_batch, check_shuffle, check_map, check_filter, check_repeat, check_skip, check_zip, \
     check_rename, check_numpyslicesdataset, \
     check_take, check_project, check_imagefolderdatasetv2, check_mnist_cifar_dataset, check_manifestdataset, \
-    check_tfrecorddataset, check_vocdataset, check_cocodataset, check_celebadataset, check_minddataset,\
-    check_generatordataset, check_sync_wait, check_zip_dataset, check_add_column, check_textfiledataset, check_concat,\
+    check_tfrecorddataset, check_vocdataset, check_cocodataset, check_celebadataset, check_minddataset, \
+    check_generatordataset, check_sync_wait, check_zip_dataset, check_add_column, check_textfiledataset, check_concat, \
     check_split, check_bucket_batch_by_length, check_cluedataset
 from ..core.datatypes import mstype_to_detype, mstypelist_to_detypelist
 
@@ -895,26 +895,7 @@ class Dataset:
         return ProjectDataset(self, columns)
 
     def build_vocab(self, vocab, columns, freq_range, top_k):
-        """
-        Build a vocab from a dataset. This would collect all the unique words in a dataset and return a vocab
-        which contains top_k most frequent words (if top_k is specified)
-        This function is not meant to be called directly by user. To build vocab, please use the function
-        text.Vocab.from_dataset()
-
-        Args:
-            vocab(Vocab): vocab object
-            columns(str or list, optional): column names to get words from. It can be a list of column names.
-                (Default is None where all columns will be used. If any column isn't string type, will return error)
-            freq_range(tuple, optional): A tuple of integers (min_frequency, max_frequency). Words within the frequency
-                range would be kept. 0 <= min_frequency <= max_frequency <= total_words. min_frequency/max_frequency
-                can be None, which corresponds to 0/total_words separately (default is None, all words are included)
-            top_k(int, optional): top_k > 0. Number of words to be built into vocab. top_k most frequent words are
-                taken. top_k is taken after freq_range. If not enough top_k, all words will be taken. (default is None
-                all words are included)
-
-        Returns:
-            BuildVocabDataset
-        """
+        """ Internal function for building a vocab"""
         return BuildVocabDataset(self, vocab, columns, freq_range, top_k)
 
     def apply(self, apply_func):
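With build_vocab now documented as internal, the supported entry point is text.Vocab.from_dataset, which wraps it. A short sketch of the public path (illustrative; the column name and data are made up):

import numpy as np
import mindspore.dataset as ds
import mindspore.dataset.text as text

def gen():
    yield (np.array(["one", "way", "to", "build", "a", "vocab"], dtype='S'),)

data = ds.GeneratorDataset(gen(), column_names=["text"])
# from_dataset drives the internal build_vocab pipeline and returns the Vocab
vocab = text.Vocab.from_dataset(data, columns=["text"], freq_range=None, top_k=None)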
@@ -1468,6 +1449,7 @@ class DatasetOp(Dataset):
 
     # No need for __init__ since it is the same as the super's init
 
+
 class BucketBatchByLengthDataset(DatasetOp):
     """
     The result of applying BucketBatchByLength operator to the input dataset.
@@ -1608,7 +1590,7 @@ class BatchDataset(DatasetOp):
 
         Args:
             dataset (Dataset): dataset to be checked.
-            batchsize (int): batch size to notify.
+            batch_size (int): batch size to notify.
         """
         if isinstance(dataset, SyncWaitDataset):
             dataset.update_sync_batch_size(batch_size)
@@ -1646,7 +1628,7 @@ class BlockReleasePair:
 
     Args:
         init_release_rows (int): Number of lines to allow through the pipeline.
-        callback (function): The callback funciton that will be called when release is called.
+        callback (function): The callback function that will be called when release is called.
     """
 
     def __init__(self, init_release_rows, callback=None):
@@ -1710,7 +1692,7 @@ class SyncWaitDataset(DatasetOp):
         input_dataset (Dataset): Input dataset to apply flow control.
         num_batch (int): the number of batches without blocking at the start of each epoch.
         condition_name (str): The condition name that is used to toggle sending next row.
-        callback (function): The callback funciton that will be invoked when sync_update is called.
+        callback (function): The callback function that will be invoked when sync_update is called.
 
     Raises:
         RuntimeError: If condition name already exists.
@@ -2066,7 +2048,7 @@ class SkipDataset(DatasetOp):
     The result of applying Skip operator to the input Dataset.
 
     Args:
-        datasets (tuple): A tuple of datasets to be skipped.
+        input_dataset (tuple): A tuple of datasets to be skipped.
         count (int): Number of rows the dataset should be skipped.
     """
 
@@ -3055,7 +3037,7 @@ class GeneratorDataset(MappableDataset):
             provide either column_names or schema.
         column_types (list[mindspore.dtype], optional): List of column data types of the dataset (default=None).
             If provided, sanity check will be performed on generator output.
-        schema (Schema/String, optional): Path to the json schema file or schema object (default=None). Users are
+        schema (Schema/str, optional): Path to the json schema file or schema object (default=None). Users are
             required to provide either column_names or schema. If both are provided, schema will be used.
         num_samples (int, optional): The number of samples to be included in the dataset
             (default=None, all images).
@@ -4343,7 +4325,7 @@ class CelebADataset(MappableDataset):
         dataset_dir (str): Path to the root directory that contains the dataset.
         num_parallel_workers (int, optional): Number of workers to read the data (default=value set in the config).
         shuffle (bool, optional): Whether to perform shuffle on the dataset (default=None).
-        dataset_type (string): one of 'all', 'train', 'valid' or 'test'.
+        dataset_type (str): one of 'all', 'train', 'valid' or 'test'.
         sampler (Sampler, optional): Object used to choose samples from the dataset (default=None).
         decode (bool, optional): decode the images after reading (default=False).
         extensions (list[str], optional): List of file extensions to be
@@ -4874,18 +4856,15 @@ class BuildVocabDataset(DatasetOp):
     text.Vocab.from_dataset()
 
     Args:
-        vocab(Vocab): vocab object.
+        vocab(Vocab): text.vocab object.
         columns(str or list, optional): column names to get words from. It can be a list of column names (Default is
             None, all columns are used, return error if any column isn't string).
         freq_range(tuple, optional): A tuple of integers (min_frequency, max_frequency). Words within the frequency
             range would be kept. 0 <= min_frequency <= max_frequency <= total_words. min_frequency/max_frequency
-            can be None, which corresponds to 0/total_words separately (default is None, all words are included).
+            can be None, which corresponds to 0/total_words separately (default=None, all words are included).
         top_k(int, optional): top_k > 0. Number of words to be built into vocab. top_k most frequent words are
-            taken. The top_k is taken after freq_range. If not enough top_k, all words will be taken (default is None
+            taken. The top_k is taken after freq_range. If not enough top_k, all words will be taken (default=None,
             all words are included).
-
-    Returns:
-        BuildVocabDataset
     """
 
     def __init__(self, input_dataset, vocab, columns, freq_range, top_k, prefetch_size=None):
@@ -30,8 +30,8 @@ class Lookup(cde.LookupOp):
     """
     Lookup operator that looks up a word to an id
     Args:
-        vocab(Vocab): a Vocab object
-        unknown(None,int): default id to lookup a word that is out of vocab
+        vocab(Vocab): a Vocab object.
+        unknown(int): default id to lookup a word that is out of vocab (default is None).
     """
 
     @check_lookup
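Usage sketch for the clarified unknown default (illustrative only; the vocab contents and the id are made up):

import numpy as np
import mindspore.dataset as ds
import mindspore.dataset.text as text

vocab = text.Vocab.from_list(["home", "behind", "world"])

def gen():
    yield (np.array(["home", "is", "behind"], dtype='S'),)

data = ds.GeneratorDataset(gen(), column_names=["text"])
# unknown=2 is the id assigned to the out-of-vocab word "is";
# the non-negative int requirement is enforced by check_lookup below
data = data.map(input_columns=["text"], operations=text.Lookup(vocab, unknown=2))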
@@ -45,16 +45,21 @@ class Lookup(cde.LookupOp):
 class Ngram(cde.NgramOp):
     """
     TensorOp to generate n-gram from a 1-D string Tensor
-    Refer to https://en.wikipedia.org/wiki/N-gram#Examples for an explanation of what n-gram is.
+    Refer to https://en.wikipedia.org/wiki/N-gram#Examples for an overview of what n-gram is and how it works.
+
     Args:
-        n(int or list): n in n-gram, n >= 1. n is a list of positive integers, for e.g. n=[4,3], The result
-            would be a 4-gram followed by a 3-gram in the same tensor.
-        left_pad(tuple, optional): ("pad_token",pad_width). Padding performed on left side of the sequence. pad_width
-            will be capped at n-1. left_pad=("_",2) would pad left side of the sequence with "__". (Default is None)
-        right_pad(tuple, optional): ("pad_token",pad_width). Padding performed on right side of the sequence. pad_width
-            will be capped at n-1. right_pad=("-":2) would pad right side of the sequence with "--". (Default is None)
+        n([int, list]): n in n-gram, n >= 1. n is a list of positive integers, for e.g. n=[4,3], The result
+            would be a 4-gram followed by a 3-gram in the same tensor. If number of words is not enough to make up for
+            a n-gram, an empty string would be returned. For e.g. 3 grams on ["mindspore","best"] would result in an
+            empty string be produced.
+        left_pad(tuple, optional): ("pad_token", pad_width). Padding performed on left side of the sequence. pad_width
+            will be capped at n-1. left_pad=("_",2) would pad left side of the sequence with "__" (Default is None).
+        right_pad(tuple, optional): ("pad_token", pad_width). Padding performed on right side of the sequence.
+            pad_width will be capped at n-1. right_pad=("-":2) would pad right side of the sequence with "--"
+            (Default is None).
         separator(str,optional): symbol used to join strings together. for e.g. if 2-gram the ["mindspore", "amazing"]
-            with separator="-" the result would be ["mindspore-amazing"]. (Default is None which means whitespace is used)
+            with separator="-" the result would be ["mindspore-amazing"] (Default is None which means whitespace is
+            used).
     """
 
     @check_ngram
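A worked example of the padding and separator semantics documented above (illustrative; it mirrors the updated tests at the end of this diff):

import numpy as np
import mindspore.dataset as ds
import mindspore.dataset.text as text

def gen():
    yield (np.array(["WildBill", "India12"], dtype='S'),)

data = ds.GeneratorDataset(gen(), column_names=["text"])
# 2-grams with one "_" pad on each side (pad_width capped at n-1) and "-" as
# the separator yield: ["_-WildBill", "WildBill-India12", "India12-_"]
data = data.map(input_columns=["text"],
                operations=text.Ngram(2, ("_", 1), ("_", 1), separator="-"))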
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Some basic function for nlp
+Some basic function for text
 """
 from enum import IntEnum
 
@@ -25,42 +25,47 @@ from .validators import check_from_file, check_from_list, check_from_dict, check
 
 class Vocab(cde.Vocab):
     """
-    Vocab object that is used for lookup word
+    Vocab object that is used for lookup word.
     """
 
     @classmethod
     @check_from_dataset
     def from_dataset(cls, dataset, columns=None, freq_range=None, top_k=None):
         """
-        Build a vocab from a dataset. This would collect all the unique words in a dataset and return a vocab
-        which contains top_k most frequent words (if top_k is specified)
+        Build a vocab from a dataset. This would collect all unique words in a dataset and return a vocab within
+        the frequency range specified by user in freq_range. User would be warned if no words fall into the frequency.
+        Words in vocab are ordered from highest frequency to lowest frequency. Words with the same frequency would be
+        ordered lexicographically.
 
         Args:
             dataset(Dataset): dataset to build vocab from.
-            columns(str or list, optional): column names to get words from. It can be a list of column names.
-                (Default is None where all columns will be used. If any column isn't string type, will return error)
+            columns([str, list], optional): column names to get words from. It can be a list of column names.
+                (Default=None where all columns will be used. If any column isn't string type, will return error)
             freq_range(tuple, optional): A tuple of integers (min_frequency, max_frequency). Words within the frequency
-                range would be kept. 0 <= min_frequency <= max_frequency <= total_words. min_frequency/max_frequency
-                can be None, which corresponds to 0/total_words separately (default is None, all words are included)
+                range would be kept. 0 <= min_frequency <= max_frequency <= total_words. min_frequency=0 is the same as
+                min_frequency=1. max_frequency > total_words is the same as max_frequency = total_words.
+                min_frequency/max_frequency can be None, which corresponds to 0/total_words separately
+                (default=None, all words are included).
             top_k(int, optional): top_k > 0. Number of words to be built into vocab. top_k most frequent words are
-                taken. top_k is taken after freq_range. If not enough top_k, all words will be taken. (default is None
-                all words are included)
+                taken. top_k is taken after freq_range. If not enough top_k, all words will be taken. (default=None
+                all words are included).
         return:
-            text.Vocab: vocab object built from dataset.
+            text.Vocab: Vocab object built from dataset.
         """
         vocab = Vocab()
         root = copy.deepcopy(dataset).build_vocab(vocab, columns, freq_range, top_k)
         for d in root.create_dict_iterator():
             if d is not None:
-                raise ValueError("from_dataset should receive data other than None")
+                raise ValueError("from_dataset should receive data other than None.")
         return vocab
 
     @classmethod
     @check_from_list
     def from_list(cls, word_list):
         """
-        build a vocab object from a list of word
+        build a vocab object from a list of word.
         Args:
-            word_list(list): a list of string where each element is a word
+            word_list(list): a list of string where each element is a word.
         """
         return super().from_list(word_list)
 
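The new ordering guarantee (highest frequency first, ties broken lexicographically) can be checked with a small sketch (illustrative; the words and counts are made up):

import numpy as np
import mindspore.dataset as ds
import mindspore.dataset.text as text

def gen():
    # "b" occurs twice, "a" and "c" once each
    for line in ["b a", "b c"]:
        yield (np.array(line.split(" "), dtype='S'),)

data = ds.GeneratorDataset(gen(), column_names=["text"])
# resulting order: "b" (frequency 2) first, then "a" and "c" (same frequency,
# lexicographic order); top_k=2 would keep just "b" and "a"
vocab = text.Vocab.from_dataset(data, columns=["text"], freq_range=None, top_k=None)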
@@ -68,11 +73,12 @@ class Vocab(cde.Vocab):
     @check_from_file
     def from_file(cls, file_path, delimiter=None, vocab_size=None):
         """
-        build a vocab object from a list of word
+        build a vocab object from a list of word.
         Args:
-            file_path(str): path to the file which contains the vocab list
-            delimiter(None, str): a delimiter to break up each line in file, the first element is taken to be the word
-            vocab_size(None, int): number of words to read from file_path
+            file_path(str): path to the file which contains the vocab list.
+            delimiter(str, optional): a delimiter to break up each line in file, the first element is taken to be
+                the word (default=None).
+            vocab_size(int, optional): number of words to read from file_path (default=None, all words are taken).
         """
         return super().from_file(file_path, delimiter, vocab_size)
 
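A sketch of the clarified from_file defaults (the file name and its contents are hypothetical):

import mindspore.dataset.text as text

# vocab.txt (hypothetical), one entry per line, e.g. "home 1" and "behind 2";
# delimiter=" " takes the first field ("home", "behind") as the word, while
# delimiter=None takes each whole line; vocab_size=None reads all lines
vocab = text.Vocab.from_file("vocab.txt", delimiter=" ", vocab_size=None)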
@@ -82,7 +88,7 @@ class Vocab(cde.Vocab):
         """
         build a vocab object from a dict.
         Args:
-            word_dict(dict): dict contains word, id pairs. id should start from 2 and continuous
+            word_dict(dict): dict contains word, id pairs. id should start from 2 and be continuous.
         """
         return super().from_dict(word_dict)
 
@@ -100,7 +106,7 @@ def to_str(array, encoding='utf8'):
     """
 
     if not isinstance(array, np.ndarray):
-        raise ValueError('input should be a numpy array')
+        raise ValueError('input should be a numpy array.')
 
     return np.char.decode(array, encoding)
 
@@ -118,7 +124,7 @@ def to_bytes(array, encoding='utf8'):
     """
 
     if not isinstance(array, np.ndarray):
-        raise ValueError('input should be a numpy array')
+        raise ValueError('input should be a numpy array.')
 
     return np.char.encode(array, encoding)
 
@@ -24,7 +24,7 @@ from ..transforms.validators import check_uint32, check_pos_int64
 
 
 def check_lookup(method):
-    """A wrapper that wrap a parameter checker to the original function(crop operation)."""
+    """A wrapper that wrap a parameter checker to the original function."""
 
     @wraps(method)
     def new_method(self, *args, **kwargs):
@@ -35,10 +35,10 @@ def check_lookup(method):
         unknown = kwargs.get("unknown")
         if unknown is not None:
             if not (isinstance(unknown, int) and unknown >= 0):
-                raise ValueError("unknown needs to be a non-negative integer")
+                raise ValueError("unknown needs to be a non-negative integer.")
 
         if not isinstance(vocab, cde.Vocab):
-            raise ValueError("vocab is not an instance of cde.Vocab")
+            raise ValueError("vocab is not an instance of cde.Vocab.")
 
         kwargs["vocab"] = vocab
         kwargs["unknown"] = unknown
@@ -48,7 +48,7 @@ def check_lookup(method):
 
 
 def check_from_file(method):
-    """A wrapper that wrap a parameter checker to the original function(crop operation)."""
+    """A wrapper that wrap a parameter checker to the original function."""
 
     @wraps(method)
     def new_method(self, *args, **kwargs):
@@ -61,16 +61,16 @@ def check_from_file(method):
         vocab_size = kwargs.get("vocab_size")
 
         if not isinstance(file_path, str):
-            raise ValueError("file_path needs to be str")
+            raise ValueError("file_path needs to be str.")
 
         if delimiter is not None:
             if not isinstance(delimiter, str):
-                raise ValueError("delimiter needs to be str")
+                raise ValueError("delimiter needs to be str.")
         else:
             delimiter = ""
         if vocab_size is not None:
             if not (isinstance(vocab_size, int) and vocab_size > 0):
-                raise ValueError("vocab size needs to be a positive integer")
+                raise ValueError("vocab size needs to be a positive integer.")
         else:
             vocab_size = -1
         kwargs["file_path"] = file_path
@@ -82,7 +82,7 @@ def check_from_file(method):
 
 
 def check_from_list(method):
-    """A wrapper that wrap a parameter checker to the original function(crop operation)."""
+    """A wrapper that wrap a parameter checker to the original function."""
 
     @wraps(method)
     def new_method(self, *args, **kwargs):
@@ -90,10 +90,10 @@ def check_from_list(method):
         if "word_list" in kwargs:
             word_list = kwargs.get("word_list")
         if not isinstance(word_list, list):
-            raise ValueError("word_list needs to be a list of words")
+            raise ValueError("word_list needs to be a list of words.")
         for word in word_list:
             if not isinstance(word, str):
-                raise ValueError("each word in word list needs to be type str")
+                raise ValueError("each word in word list needs to be type str.")
 
         kwargs["word_list"] = word_list
         return method(self, **kwargs)
@@ -102,7 +102,7 @@ def check_from_list(method):
 
 
 def check_from_dict(method):
-    """A wrapper that wrap a parameter checker to the original function(crop operation)."""
+    """A wrapper that wrap a parameter checker to the original function."""
 
     @wraps(method)
     def new_method(self, *args, **kwargs):
@@ -110,12 +110,12 @@ def check_from_dict(method):
         if "word_dict" in kwargs:
             word_dict = kwargs.get("word_dict")
         if not isinstance(word_dict, dict):
-            raise ValueError("word_dict needs to be a list of word,id pairs")
+            raise ValueError("word_dict needs to be a list of word,id pairs.")
         for word, word_id in word_dict.items():
             if not isinstance(word, str):
-                raise ValueError("each word in word_dict needs to be type str")
+                raise ValueError("each word in word_dict needs to be type str.")
             if not (isinstance(word_id, int) and word_id >= 0):
-                raise ValueError("each word id needs to be positive integer")
+                raise ValueError("each word id needs to be positive integer.")
         kwargs["word_dict"] = word_dict
         return method(self, **kwargs)
 
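What the from_dict checker above accepts and rejects, as a sketch (the words and ids are hypothetical):

import mindspore.dataset.text as text

# passes check_from_dict: str words, non-negative int ids,
# starting from 2 and continuous as the from_dict docstring asks
vocab = text.Vocab.from_dict({"home": 2, "behind": 3, "world": 4})

# each of these would raise ValueError in the checker:
#   text.Vocab.from_dict({"home": "2"})  # id is not an int
#   text.Vocab.from_dict({1: 2})         # word is not a str
#   text.Vocab.from_dict({"home": -1})   # id is negative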
@@ -135,11 +135,11 @@ def check_jieba_init(method):
         mp_path = kwargs.get("mp_path")
         if hmm_path is None:
             raise ValueError(
-                "the dict of HMMSegment in cppjieba is not provided")
+                "the dict of HMMSegment in cppjieba is not provided.")
         kwargs["hmm_path"] = hmm_path
         if mp_path is None:
             raise ValueError(
-                "the dict of MPSegment in cppjieba is not provided")
+                "the dict of MPSegment in cppjieba is not provided.")
         kwargs["mp_path"] = mp_path
         if model is not None:
             kwargs["model"] = model
@@ -160,7 +160,7 @@ def check_jieba_add_word(method):
         if "freq" in kwargs:
             freq = kwargs.get("freq")
         if word is None:
-            raise ValueError("word is not provided")
+            raise ValueError("word is not provided.")
         kwargs["word"] = word
         if freq is not None:
             check_uint32(freq)
@@ -179,7 +179,7 @@ def check_jieba_add_dict(method):
         if "user_dict" in kwargs:
             user_dict = kwargs.get("user_dict")
         if user_dict is None:
-            raise ValueError("user_dict is not provided")
+            raise ValueError("user_dict is not provided.")
         kwargs["user_dict"] = user_dict
         return method(self, **kwargs)
 
@@ -187,7 +187,7 @@ def check_jieba_add_dict(method):
 
 
 def check_from_dataset(method):
-    """A wrapper that wrap a parameter checker to the original function(crop operation)."""
+    """A wrapper that wrap a parameter checker to the original function."""
 
     # def from_dataset(cls, dataset, columns, freq_range=None, top_k=None):
     @wraps(method)
@@ -210,27 +210,27 @@ def check_from_dataset(method):
 
         for column in columns:
             if not isinstance(column, str):
-                raise ValueError("columns need to be a list of strings")
+                raise ValueError("columns need to be a list of strings.")
 
         if freq_range is None:
             freq_range = (None, None)
 
         if not isinstance(freq_range, tuple) or len(freq_range) != 2:
-            raise ValueError("freq_range needs to be either None or a tuple of 2 integers or an int and a None")
+            raise ValueError("freq_range needs to be either None or a tuple of 2 integers or an int and a None.")
 
         for num in freq_range:
            if num is not None and (not isinstance(num, int)):
-                raise ValueError("freq_range needs to be either None or a tuple of 2 integers or an int and a None")
+                raise ValueError("freq_range needs to be either None or a tuple of 2 integers or an int and a None.")
 
         if isinstance(freq_range[0], int) and isinstance(freq_range[1], int):
             if freq_range[0] > freq_range[1] or freq_range[0] < 0:
-                raise ValueError("frequency range [a,b] should be 0 <= a <= b (a,b are inclusive)")
+                raise ValueError("frequency range [a,b] should be 0 <= a <= b (a,b are inclusive).")
 
         if top_k is not None and (not isinstance(top_k, int)):
-            raise ValueError("top_k needs to be a positive integer")
+            raise ValueError("top_k needs to be a positive integer.")
 
         if isinstance(top_k, int) and top_k <= 0:
-            raise ValueError("top_k needs to be a positive integer")
+            raise ValueError("top_k needs to be a positive integer.")
 
         kwargs["dataset"] = dataset
         kwargs["columns"] = columns
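Concretely, the freq_range/top_k rules enforced above (a comment-only sketch of accepted and rejected values):

# accepted by check_from_dataset:
#   freq_range=None        -> normalized to (None, None)
#   freq_range=(0, 10)     -> 0 <= a <= b
#   freq_range=(None, 10)  -> open lower bound
#   top_k=None, top_k=5
# rejected with ValueError:
#   freq_range=(10, 2)     -> a > b
#   freq_range=(-1, 5)     -> a < 0
#   freq_range=(1, 2, 3)   -> not a 2-tuple
#   top_k=0, top_k=-3      -> top_k must be a positive integer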
@@ -243,7 +243,7 @@ def check_from_dataset(method):
 
 
 def check_ngram(method):
-    """A wrapper that wrap a parameter checker to the original function(crop operation)."""
+    """A wrapper that wrap a parameter checker to the original function."""
 
     @wraps(method)
     def new_method(self, *args, **kwargs):
@@ -261,11 +261,11 @@ def check_ngram(method):
             n = [n]
 
         if not (isinstance(n, list) and n != []):
-            raise ValueError("n needs to be a non-empty list of positive integers")
+            raise ValueError("n needs to be a non-empty list of positive integers.")
 
         for gram in n:
             if not (isinstance(gram, int) and gram > 0):
-                raise ValueError("n in ngram needs to be a positive number\n")
+                raise ValueError("n in ngram needs to be a positive number.")
 
         if left_pad is None:
             left_pad = ("", 0)
@@ -275,20 +275,20 @@ def check_ngram(method):
 
         if not (isinstance(left_pad, tuple) and len(left_pad) == 2 and isinstance(left_pad[0], str) and isinstance(
                 left_pad[1], int)):
-            raise ValueError("left_pad needs to be a tuple of (str, int) str is pad token and int is pad_width")
+            raise ValueError("left_pad needs to be a tuple of (str, int) str is pad token and int is pad_width.")
 
         if not (isinstance(right_pad, tuple) and len(right_pad) == 2 and isinstance(right_pad[0], str) and isinstance(
                 right_pad[1], int)):
-            raise ValueError("right_pad needs to be a tuple of (str, int) str is pad token and int is pad_width")
+            raise ValueError("right_pad needs to be a tuple of (str, int) str is pad token and int is pad_width.")
 
         if not (left_pad[1] >= 0 and right_pad[1] >= 0):
-            raise ValueError("padding width need to be positive numbers")
+            raise ValueError("padding width need to be positive numbers.")
 
         if separator is None:
             separator = " "
 
         if not isinstance(separator, str):
-            raise ValueError("separator needs to be a string")
+            raise ValueError("separator needs to be a string.")
 
         kwargs["n"] = n
         kwargs["left_pad"] = left_pad
@@ -16,7 +16,7 @@
 Testing Ngram in mindspore.dataset
 """
 import mindspore.dataset as ds
-import mindspore.dataset.text as nlp
+import mindspore.dataset.text as text
 import numpy as np
 
 
@@ -39,7 +39,7 @@ def test_multiple_ngrams():
             yield (np.array(line.split(" "), dtype='S'),)
 
     dataset = ds.GeneratorDataset(gen(plates_mottos), column_names=["text"])
-    dataset = dataset.map(input_columns=["text"], operations=nlp.Ngram([1, 2, 3], ("_", 2), ("_", 2), " "))
+    dataset = dataset.map(input_columns=["text"], operations=text.Ngram([1, 2, 3], ("_", 2), ("_", 2), " "))
 
     i = 0
     for data in dataset.create_dict_iterator():
@@ -61,7 +61,7 @@ def test_simple_ngram():
             yield (np.array(line.split(" "), dtype='S'),)
 
     dataset = ds.GeneratorDataset(gen(plates_mottos), column_names=["text"])
-    dataset = dataset.map(input_columns=["text"], operations=nlp.Ngram(3, separator=None))
+    dataset = dataset.map(input_columns=["text"], operations=text.Ngram(3, separator=None))
 
     i = 0
     for data in dataset.create_dict_iterator():
@@ -73,11 +73,11 @@ def test_corner_cases():
     """ testing various corner cases and exceptions"""
 
    def test_config(input_line, output_line, n, l_pad=None, r_pad=None, sep=None):
-        def gen(text):
-            yield (np.array(text.split(" "), dtype='S'),)
+        def gen(texts):
+            yield (np.array(texts.split(" "), dtype='S'),)
 
         dataset = ds.GeneratorDataset(gen(input_line), column_names=["text"])
-        dataset = dataset.map(input_columns=["text"], operations=nlp.Ngram(n, l_pad, r_pad, separator=sep))
+        dataset = dataset.map(input_columns=["text"], operations=text.Ngram(n, l_pad, r_pad, separator=sep))
         for data in dataset.create_dict_iterator():
             assert [d.decode("utf8") for d in data["text"]] == output_line, output_line
 