forked from mindspore-Ecosystem/mindspore
fix minddata api doc
This commit is contained in:
parent
8539de0e01
commit
e5f2c75233
|
@ -1809,7 +1809,8 @@ class TextBaseDataset(Dataset):
|
|||
|
||||
def build_vocab(self, columns, freq_range, top_k, special_tokens, special_first):
|
||||
"""
|
||||
Function to create a Vocab from source dataset
|
||||
Function to create a Vocab from source dataset.
|
||||
Desired source dataset is a text type dataset.
|
||||
|
||||
Build a vocab from a dataset. This would collect all the unique words in a dataset and return a vocab
|
||||
which contains top_k most frequent words (if top_k is specified)
|
||||
|
@ -1879,7 +1880,8 @@ class TextBaseDataset(Dataset):
|
|||
|
||||
def build_sentencepiece_vocab(self, columns, vocab_size, character_coverage, model_type, params):
|
||||
"""
|
||||
Function to create a SentencePieceVocab from source dataset
|
||||
Function to create a SentencePieceVocab from source dataset.
|
||||
Desired source dataset is a text type dataset.
|
||||
|
||||
Args:
|
||||
|
||||
|
@ -1899,6 +1901,7 @@ class TextBaseDataset(Dataset):
|
|||
Examples:
|
||||
>>> from mindspore.dataset.text import SentencePieceModel
|
||||
>>>
|
||||
>>> # You can construct any text dataset as source, take TextFileDataset as example.
|
||||
>>> dataset = ds.TextFileDataset("/path/to/sentence/piece/vocab/file", shuffle=False)
|
||||
>>> dataset = dataset.build_sentencepiece_vocab(["text"], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
|
||||
"""
|
||||
|
|
|
@ -46,11 +46,11 @@ class MindDataset(MappableDataset, TextBaseDataset):
|
|||
num_parallel_workers (int, optional): The number of readers (default=None).
|
||||
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
|
||||
(default=None, performs global shuffle).
|
||||
If shuffle is False, no shuffling will be performed;
|
||||
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
|
||||
Otherwise, there are three levels of shuffling:
|
||||
If shuffle is False, no shuffling will be performed.
|
||||
If shuffle is True, performs global shuffle.
|
||||
There are three levels of shuffling, desired shuffle enum defined by mindspore.dataset.Shuffle.
|
||||
|
||||
- Shuffle.GLOBAL: Global shuffle of all rows of data in dataset.
|
||||
- Shuffle.GLOBAL: Global shuffle of all rows of data in dataset, same as setting shuffle to True.
|
||||
|
||||
- Shuffle.FILES: Shuffle the file sequence but keep the order of data within each file.
|
||||
|
||||
|
@ -74,8 +74,8 @@ class MindDataset(MappableDataset, TextBaseDataset):
|
|||
(default=None, which means no cache is used).
|
||||
|
||||
Raises:
|
||||
RuntimeError: If dataset_files are not valid or do not exist.
|
||||
RuntimeError: If num_parallel_workers exceeds the max thread numbers.
|
||||
ValueError: If dataset_files are not valid or do not exist.
|
||||
ValueError: If num_parallel_workers exceeds the max thread numbers.
|
||||
RuntimeError: If num_shards is specified but shard_id is None.
|
||||
RuntimeError: If shard_id is specified but num_shards is None.
|
||||
ValueError: If shard_id is invalid (< 0 or >= num_shards).
|
||||
|
@ -180,11 +180,11 @@ class TFRecordDataset(SourceDataset, TextBaseDataset):
|
|||
(default=None, number set in the config).
|
||||
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
|
||||
(default=Shuffle.GLOBAL).
|
||||
If shuffle is False, no shuffling will be performed;
|
||||
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
|
||||
Otherwise, there are two levels of shuffling:
|
||||
If shuffle is False, no shuffling will be performed.
|
||||
If shuffle is True, performs global shuffle.
|
||||
There are two levels of shuffling, desired shuffle enum defined by mindspore.dataset.Shuffle.
|
||||
|
||||
- Shuffle.GLOBAL: Shuffle both the files and samples.
|
||||
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
|
||||
|
||||
- Shuffle.FILES: Shuffle files only.
|
||||
|
||||
|
@ -201,8 +201,8 @@ class TFRecordDataset(SourceDataset, TextBaseDataset):
|
|||
(default=None, which means no cache is used).
|
||||
|
||||
Raises:
|
||||
RuntimeError: If dataset_files are not valid or do not exist.
|
||||
RuntimeError: If num_parallel_workers exceeds the max thread numbers.
|
||||
ValueError: If dataset_files are not valid or do not exist.
|
||||
ValueError: If num_parallel_workers exceeds the max thread numbers.
|
||||
RuntimeError: If num_shards is specified but shard_id is None.
|
||||
RuntimeError: If shard_id is specified but num_shards is None.
|
||||
ValueError: If shard_id is invalid (< 0 or >= num_shards).
|
||||
|
|
|
@ -47,11 +47,11 @@ class AGNewsDataset(SourceDataset, TextBaseDataset):
|
|||
(default=None, number set in the config).
|
||||
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
|
||||
(default=Shuffle.GLOBAL).
|
||||
If shuffle is False, no shuffling will be performed;
|
||||
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
|
||||
Otherwise, there are two levels of shuffling:
|
||||
If shuffle is False, no shuffling will be performed.
|
||||
If shuffle is True, performs global shuffle.
|
||||
There are two levels of shuffling, desired shuffle enum defined by mindspore.dataset.Shuffle.
|
||||
|
||||
- Shuffle.GLOBAL: Shuffle both the files and samples.
|
||||
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
|
||||
|
||||
- Shuffle.FILES: Shuffle files only.
|
||||
|
||||
|
@ -139,11 +139,11 @@ class AmazonReviewDataset(SourceDataset):
|
|||
num_samples (int, optional): Number of samples (rows) to be read (default=None, reads the full dataset).
|
||||
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
|
||||
(default=Shuffle.GLOBAL).
|
||||
If shuffle is False, no shuffling will be performed;
|
||||
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
|
||||
Otherwise, there are two levels of shuffling:
|
||||
If shuffle is False, no shuffling will be performed.
|
||||
If shuffle is True, performs global shuffle.
|
||||
There are two levels of shuffling, desired shuffle enum defined by mindspore.dataset.Shuffle.
|
||||
|
||||
- Shuffle.GLOBAL: Shuffle both the files and samples.
|
||||
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
|
||||
|
||||
- Shuffle.FILES: Shuffle files only.
|
||||
num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
|
||||
|
@ -217,61 +217,6 @@ class CLUEDataset(SourceDataset, TextBaseDataset):
|
|||
A source dataset that reads and parses CLUE datasets.
|
||||
Supported CLUE classification tasks: `AFQMC`, `TNEWS`, `IFLYTEK`, `CMNLI`, `WSC` and `CSL`.
|
||||
|
||||
The generated dataset with different task setting has different output columns:
|
||||
|
||||
- task = :py:obj:`AFQMC`
|
||||
- usage = :py:obj:`train`, output columns: :py:obj:`[sentence1, dtype=string]`, \
|
||||
:py:obj:`[sentence2, dtype=string]`, :py:obj:`[label, dtype=string]`.
|
||||
- usage = :py:obj:`test`, output columns: :py:obj:`[id, dtype=uint8]`, \
|
||||
:py:obj:`[sentence1, dtype=string]`, :py:obj:`[sentence2, dtype=string]`.
|
||||
- usage = :py:obj:`eval`, output columns: :py:obj:`[sentence1, dtype=string]`, \
|
||||
:py:obj:`[sentence2, dtype=string]`, :py:obj:`[label, dtype=string]`.
|
||||
|
||||
- task = :py:obj:`TNEWS`
|
||||
- usage = :py:obj:`train`, output columns: :py:obj:`[label, dtype=string]`, \
|
||||
:py:obj:`[label_des, dtype=string]`, :py:obj:`[sentence, dtype=string]`, :py:obj:`[keywords, dtype=string]`.
|
||||
- usage = :py:obj:`test`, output columns: :py:obj:`[label, dtype=string]`, \
|
||||
:py:obj:`[label_des, dtype=string]`, :py:obj:`[sentence, dtype=string]`, :py:obj:`[keywords, dtype=string]`.
|
||||
- usage = :py:obj:`eval`, output columns: :py:obj:`[label, dtype=string]`, \
|
||||
:py:obj:`[label_des, dtype=string]`, :py:obj:`[sentence, dtype=string]`, :py:obj:`[keywords, dtype=string]`.
|
||||
|
||||
- task = :py:obj:`IFLYTEK`
|
||||
- usage = :py:obj:`train`, output columns: :py:obj:`[label, dtype=string]`, \
|
||||
:py:obj:`[label_des, dtype=string]`, :py:obj:`[sentence, dtype=string]`.
|
||||
- usage = :py:obj:`test`, output columns: :py:obj:`[id, dtype=string]`, \
|
||||
:py:obj:`[sentence, dtype=string]`.
|
||||
- usage = :py:obj:`eval`, output columns: :py:obj:`[label, dtype=string]`, \
|
||||
:py:obj:`[label_des, dtype=string]`, :py:obj:`[sentence, dtype=string]`.
|
||||
|
||||
- task = :py:obj:`CMNLI`
|
||||
- usage = :py:obj:`train`, output columns: :py:obj:`[sentence1, dtype=string]`, \
|
||||
:py:obj:`[sentence2, dtype=string]`, :py:obj:`[label, dtype=string]`.
|
||||
- usage = :py:obj:`test`, output columns: :py:obj:`[id, dtype=uint8]`, \
|
||||
:py:obj:`[sentence1, dtype=string]`, :py:obj:`[sentence2, dtype=string]`.
|
||||
- usage = :py:obj:`eval`, output columns: :py:obj:`[sentence1, dtype=string]`, \
|
||||
:py:obj:`[sentence2, dtype=string]`, :py:obj:`[label, dtype=string]`.
|
||||
|
||||
- task = :py:obj:`WSC`
|
||||
- usage = :py:obj:`train`, output columns: :py:obj:`[span1_index, dtype=uint8]`, \
|
||||
:py:obj:`[span2_index, dtype=uint8]`, :py:obj:`[span1_text, dtype=string]`, \
|
||||
:py:obj:`[span2_text, dtype=string]`, :py:obj:`[idx, dtype=uint8]`, \
|
||||
:py:obj:`[text, dtype=string]`, :py:obj:`[label, dtype=string]`.
|
||||
- usage = :py:obj:`test`, output columns: :py:obj:`[span1_index, dtype=uint8]`, \
|
||||
:py:obj:`[span2_index, dtype=uint8]`, :py:obj:`[span1_text, dtype=string]`, \
|
||||
:py:obj:`[span2_text, dtype=string]`, :py:obj:`[idx, dtype=uint8]`, :py:obj:`[text, dtype=string]`.
|
||||
- usage = :py:obj:`eval`, output columns: :py:obj:`[span1_index, dtype=uint8]`, \
|
||||
:py:obj:`[span2_index, dtype=uint8]`, :py:obj:`[span1_text, dtype=string]`, \
|
||||
:py:obj:`[span2_text, dtype=string]`, :py:obj:`[idx, dtype=uint8]`, \
|
||||
:py:obj:`[text, dtype=string]`, :py:obj:`[label, dtype=string]`.
|
||||
|
||||
- task = :py:obj:`CSL`
|
||||
- usage = :py:obj:`train`, output columns: :py:obj:`[id, dtype=uint8]`, \
|
||||
:py:obj:`[abst, dtype=string]`, :py:obj:`[keyword, dtype=string]`, :py:obj:`[label, dtype=string]`.
|
||||
- usage = :py:obj:`test`, output columns: :py:obj:`[id, dtype=uint8]`, \
|
||||
:py:obj:`[abst, dtype=string]`, :py:obj:`[keyword, dtype=string]`.
|
||||
- usage = :py:obj:`eval`, output columns: :py:obj:`[id, dtype=uint8]`, \
|
||||
:py:obj:`[abst, dtype=string]`, :py:obj:`[keyword, dtype=string]`, :py:obj:`[label, dtype=string]`.
|
||||
|
||||
Args:
|
||||
dataset_files (Union[str, list[str]]): String or list of files to be read or glob strings to search for
|
||||
a pattern of files. The list will be sorted in a lexicographical order.
|
||||
|
@ -284,11 +229,11 @@ class CLUEDataset(SourceDataset, TextBaseDataset):
|
|||
(default=None, number set in the config).
|
||||
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
|
||||
(default=Shuffle.GLOBAL).
|
||||
If shuffle is False, no shuffling will be performed;
|
||||
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
|
||||
Otherwise, there are two levels of shuffling:
|
||||
If shuffle is False, no shuffling will be performed.
|
||||
If shuffle is True, performs global shuffle.
|
||||
There are two levels of shuffling, desired shuffle enum defined by mindspore.dataset.Shuffle.
|
||||
|
||||
- Shuffle.GLOBAL: Shuffle both the files and samples.
|
||||
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
|
||||
|
||||
- Shuffle.FILES: Shuffle files only.
|
||||
|
||||
|
@ -299,11 +244,72 @@ class CLUEDataset(SourceDataset, TextBaseDataset):
|
|||
cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
|
||||
(default=None, which means no cache is used).
|
||||
|
||||
Note:
|
||||
The generated dataset with different task setting has different output columns:
|
||||
|
||||
- task = :py:obj:`AFQMC`
|
||||
- usage = :py:obj:`train`, output columns: :py:obj:`[sentence1, dtype=string]`, \
|
||||
:py:obj:`[sentence2, dtype=string]`, :py:obj:`[label, dtype=string]`.
|
||||
- usage = :py:obj:`test`, output columns: :py:obj:`[id, dtype=uint32]`, \
|
||||
:py:obj:`[sentence1, dtype=string]`, :py:obj:`[sentence2, dtype=string]`.
|
||||
- usage = :py:obj:`eval`, output columns: :py:obj:`[sentence1, dtype=string]`, \
|
||||
:py:obj:`[sentence2, dtype=string]`, :py:obj:`[label, dtype=string]`.
|
||||
|
||||
- task = :py:obj:`TNEWS`
|
||||
- usage = :py:obj:`train`, output columns: :py:obj:`[label, dtype=string]`, \
|
||||
:py:obj:`[label_desc, dtype=string]`, :py:obj:`[sentence, dtype=string]`, \
|
||||
:py:obj:`[keywords, dtype=string]`.
|
||||
- usage = :py:obj:`test`, output columns: :py:obj:`[id, dtype=uint32]`, \
|
||||
:py:obj:`[keywords, dtype=string]`, :py:obj:`[sentence, dtype=string]`.
|
||||
- usage = :py:obj:`eval`, output columns: :py:obj:`[label, dtype=string]`, \
|
||||
:py:obj:`[label_desc, dtype=string]`, :py:obj:`[sentence, dtype=string]`,\
|
||||
:py:obj:`[keywords, dtype=string]`.
|
||||
|
||||
- task = :py:obj:`IFLYTEK`
|
||||
- usage = :py:obj:`train`, output columns: :py:obj:`[label, dtype=string]`, \
|
||||
:py:obj:`[label_des, dtype=string]`, :py:obj:`[sentence, dtype=string]`.
|
||||
- usage = :py:obj:`test`, output columns: :py:obj:`[id, dtype=uint32]`, \
|
||||
:py:obj:`[sentence, dtype=string]`.
|
||||
- usage = :py:obj:`eval`, output columns: :py:obj:`[label, dtype=string]`, \
|
||||
:py:obj:`[label_des, dtype=string]`, :py:obj:`[sentence, dtype=string]`.
|
||||
|
||||
- task = :py:obj:`CMNLI`
|
||||
- usage = :py:obj:`train`, output columns: :py:obj:`[sentence1, dtype=string]`, \
|
||||
:py:obj:`[sentence2, dtype=string]`, :py:obj:`[label, dtype=string]`.
|
||||
- usage = :py:obj:`test`, output columns: :py:obj:`[id, dtype=uint32]`, \
|
||||
:py:obj:`[sentence1, dtype=string]`, :py:obj:`[sentence2, dtype=string]`.
|
||||
- usage = :py:obj:`eval`, output columns: :py:obj:`[sentence1, dtype=string]`, \
|
||||
:py:obj:`[sentence2, dtype=string]`, :py:obj:`[label, dtype=string]`.
|
||||
|
||||
- task = :py:obj:`WSC`
|
||||
- usage = :py:obj:`train`, output columns: :py:obj:`[span1_index, dtype=uint32]`, \
|
||||
:py:obj:`[span2_index, dtype=uint32]`, :py:obj:`[span1_text, dtype=string]`, \
|
||||
:py:obj:`[span2_text, dtype=string]`, :py:obj:`[idx, dtype=uint32]`, \
|
||||
:py:obj:`[text, dtype=string]`, :py:obj:`[label, dtype=string]`.
|
||||
- usage = :py:obj:`test`, output columns: :py:obj:`[span1_index, dtype=uint32]`, \
|
||||
:py:obj:`[span2_index, dtype=uint32]`, :py:obj:`[span1_text, dtype=string]`, \
|
||||
:py:obj:`[span2_text, dtype=string]`, :py:obj:`[idx, dtype=uint32]`, :py:obj:`[text, dtype=string]`.
|
||||
- usage = :py:obj:`eval`, output columns: :py:obj:`[span1_index, dtype=uint32]`, \
|
||||
:py:obj:`[span2_index, dtype=uint32]`, :py:obj:`[span1_text, dtype=string]`, \
|
||||
:py:obj:`[span2_text, dtype=string]`, :py:obj:`[idx, dtype=uint32]`, \
|
||||
:py:obj:`[text, dtype=string]`, :py:obj:`[label, dtype=string]`.
|
||||
|
||||
- task = :py:obj:`CSL`
|
||||
- usage = :py:obj:`train`, output columns: :py:obj:`[id, dtype=uint32]`, \
|
||||
:py:obj:`[abst, dtype=string]`, :py:obj:`[keyword, dtype=string]`, :py:obj:`[label, dtype=string]`.
|
||||
- usage = :py:obj:`test`, output columns: :py:obj:`[id, dtype=uint32]`, \
|
||||
:py:obj:`[abst, dtype=string]`, :py:obj:`[keyword, dtype=string]`.
|
||||
- usage = :py:obj:`eval`, output columns: :py:obj:`[id, dtype=uint32]`, \
|
||||
:py:obj:`[abst, dtype=string]`, :py:obj:`[keyword, dtype=string]`, :py:obj:`[label, dtype=string]`.
|
||||
|
||||
Raises:
|
||||
RuntimeError: If dataset_files are not valid or do not exist.
|
||||
RuntimeError: If num_parallel_workers exceeds the max thread numbers.
|
||||
ValueError: If dataset_files are not valid or do not exist.
|
||||
ValueError: task is not in 'AFQMC', 'TNEWS', 'IFLYTEK', 'CMNLI', 'WSC' or 'CSL'.
|
||||
ValueError: usage is not in 'train', 'test' or 'eval'.
|
||||
ValueError: If num_parallel_workers exceeds the max thread numbers.
|
||||
RuntimeError: If num_shards is specified but shard_id is None.
|
||||
RuntimeError: If shard_id is specified but num_shards is None.
|
||||
ValueError: If shard_id is invalid (< 0 or >= num_shards).
|
||||
|
||||
Examples:
|
||||
>>> clue_dataset_dir = ["/path/to/clue_dataset_file"] # contains 1 or multiple clue files
|
||||
|
@ -373,11 +379,11 @@ class CoNLL2000Dataset(SourceDataset):
|
|||
num_samples (int, optional): Number of samples (rows) to read (default=None, reads the full dataset).
|
||||
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
|
||||
(default=Shuffle.GLOBAL).
|
||||
If shuffle is False, no shuffling will be performed;
|
||||
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
|
||||
Otherwise, there are two levels of shuffling:
|
||||
If shuffle is False, no shuffling will be performed.
|
||||
If shuffle is True, performs global shuffle.
|
||||
There are two levels of shuffling, desired shuffle enum defined by mindspore.dataset.Shuffle.
|
||||
|
||||
- Shuffle.GLOBAL: Shuffle both the files and samples.
|
||||
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
|
||||
|
||||
- Shuffle.FILES: Shuffle files only.
|
||||
|
||||
|
@ -416,7 +422,8 @@ class CoNLL2000Dataset(SourceDataset):
|
|||
|
||||
class CSVDataset(SourceDataset, TextBaseDataset):
|
||||
"""
|
||||
A source dataset that reads and parses comma-separated values (CSV) datasets.
|
||||
A source dataset that reads and parses comma-separated values
|
||||
`(CSV) <http://en.volupedia.org/wiki/Comma-separated_values>`_ files as dataset.
|
||||
The columns of generated dataset depend on the source CSV files.
|
||||
|
||||
Args:
|
||||
|
@ -434,11 +441,11 @@ class CSVDataset(SourceDataset, TextBaseDataset):
|
|||
(default=None, number set in the config).
|
||||
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
|
||||
(default=Shuffle.GLOBAL).
|
||||
If shuffle is False, no shuffling will be performed;
|
||||
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
|
||||
Otherwise, there are two levels of shuffling:
|
||||
If shuffle is False, no shuffling will be performed.
|
||||
If shuffle is True, performs global shuffle.
|
||||
There are two levels of shuffling, desired shuffle enum defined by mindspore.dataset.Shuffle.
|
||||
|
||||
- Shuffle.GLOBAL: Shuffle both the files and samples.
|
||||
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
|
||||
|
||||
- Shuffle.FILES: Shuffle files only.
|
||||
|
||||
|
@ -451,9 +458,11 @@ class CSVDataset(SourceDataset, TextBaseDataset):
|
|||
|
||||
Raises:
|
||||
RuntimeError: If dataset_files are not valid or do not exist.
|
||||
RuntimeError: If num_parallel_workers exceeds the max thread numbers.
|
||||
ValueError: If field_delim is invalid.
|
||||
ValueError: If num_parallel_workers exceeds the max thread numbers.
|
||||
RuntimeError: If num_shards is specified but shard_id is None.
|
||||
RuntimeError: If shard_id is specified but num_shards is None.
|
||||
ValueError: If shard_id is invalid (< 0 or >= num_shards).
|
||||
|
||||
Examples:
|
||||
>>> csv_dataset_dir = ["/path/to/csv_dataset_file"] # contains 1 or multiple csv files
|
||||
|
@ -497,11 +506,11 @@ class DBpediaDataset(SourceDataset, TextBaseDataset):
|
|||
(default=None, number set in the config).
|
||||
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
|
||||
(default=Shuffle.GLOBAL).
|
||||
If shuffle is False, no shuffling will be performed;
|
||||
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL;
|
||||
Otherwise, there are two levels of shuffling:
|
||||
If shuffle is False, no shuffling will be performed.
|
||||
If shuffle is True, performs global shuffle.
|
||||
There are two levels of shuffling, desired shuffle enum defined by mindspore.dataset.Shuffle.
|
||||
|
||||
- Shuffle.GLOBAL: Shuffle both the files and samples.
|
||||
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
|
||||
|
||||
- Shuffle.FILES: Shuffle files only.
|
||||
|
||||
|
@ -586,11 +595,11 @@ class EnWik9Dataset(SourceDataset):
|
|||
(default=None, number set in the config).
|
||||
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
|
||||
(default=True).
|
||||
If shuffle is False, no shuffling will be performed;
|
||||
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
|
||||
Otherwise, there are two levels of shuffling:
|
||||
If shuffle is False, no shuffling will be performed.
|
||||
If shuffle is True, performs global shuffle.
|
||||
There are two levels of shuffling, desired shuffle enum defined by mindspore.dataset.Shuffle.
|
||||
|
||||
- Shuffle.GLOBAL: Shuffle both the files and samples.
|
||||
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
|
||||
|
||||
- Shuffle.FILES: Shuffle files only.
|
||||
|
||||
|
@ -812,11 +821,11 @@ class IWSLT2016Dataset(SourceDataset, TextBaseDataset):
|
|||
num_samples (int, optional): Number of samples (rows) to read (default=None, reads the full dataset).
|
||||
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
|
||||
(default=Shuffle.GLOBAL).
|
||||
If shuffle is False, no shuffling will be performed;
|
||||
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
|
||||
Otherwise, there are two levels of shuffling:
|
||||
If shuffle is False, no shuffling will be performed.
|
||||
If shuffle is True, performs global shuffle.
|
||||
There are two levels of shuffling, desired shuffle enum defined by mindspore.dataset.Shuffle.
|
||||
|
||||
- Shuffle.GLOBAL: Shuffle both the files and samples.
|
||||
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
|
||||
|
||||
- Shuffle.FILES: Shuffle files only.
|
||||
num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
|
||||
|
@ -933,11 +942,11 @@ class IWSLT2017Dataset(SourceDataset, TextBaseDataset):
|
|||
num_samples (int, optional): Number of samples (rows) to read (default=None, reads the full dataset).
|
||||
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
|
||||
(default=Shuffle.GLOBAL).
|
||||
If shuffle is False, no shuffling will be performed;
|
||||
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
|
||||
Otherwise, there are two levels of shuffling:
|
||||
If shuffle is False, no shuffling will be performed.
|
||||
If shuffle is True, performs global shuffle.
|
||||
There are two levels of shuffling, desired shuffle enum defined by mindspore.dataset.Shuffle.
|
||||
|
||||
- Shuffle.GLOBAL: Shuffle both the files and samples.
|
||||
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
|
||||
|
||||
- Shuffle.FILES: Shuffle files only.
|
||||
num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
|
||||
|
@ -1030,11 +1039,11 @@ class PennTreebankDataset(SourceDataset, TextBaseDataset):
|
|||
(default=None, number set in the config).
|
||||
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
|
||||
(default=Shuffle.GLOBAL).
|
||||
If shuffle is False, no shuffling will be performed;
|
||||
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
|
||||
Otherwise, there are two levels of shuffling:
|
||||
If shuffle is False, no shuffling will be performed.
|
||||
If shuffle is True, performs global shuffle.
|
||||
There are two levels of shuffling, desired shuffle enum defined by mindspore.dataset.Shuffle.
|
||||
|
||||
- Shuffle.GLOBAL: Shuffle both the files and samples.
|
||||
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
|
||||
|
||||
- Shuffle.FILES: Shuffle files only.
|
||||
|
||||
|
@ -1117,11 +1126,11 @@ class SogouNewsDataset(SourceDataset):
|
|||
num_samples (int, optional): Number of samples (rows) to read (default=None, read all samples).
|
||||
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
|
||||
(default=Shuffle.GLOBAL).
|
||||
If shuffle is False, no shuffling will be performed;
|
||||
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
|
||||
Otherwise, there are two levels of shuffling:
|
||||
If shuffle is False, no shuffling will be performed.
|
||||
If shuffle is True, performs global shuffle.
|
||||
There are two levels of shuffling, desired shuffle enum defined by mindspore.dataset.Shuffle.
|
||||
|
||||
- Shuffle.GLOBAL: Shuffle both the files and samples.
|
||||
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
|
||||
|
||||
- Shuffle.FILES: Shuffle files only.
|
||||
num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
|
||||
|
@ -1201,11 +1210,11 @@ class TextFileDataset(SourceDataset, TextBaseDataset):
|
|||
(default=None, number set in the config).
|
||||
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
|
||||
(default=Shuffle.GLOBAL).
|
||||
If shuffle is False, no shuffling will be performed;
|
||||
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
|
||||
Otherwise, there are two levels of shuffling:
|
||||
If shuffle is False, no shuffling will be performed.
|
||||
If shuffle is True, performs global shuffle.
|
||||
There are two levels of shuffling, desired shuffle enum defined by mindspore.dataset.Shuffle.
|
||||
|
||||
- Shuffle.GLOBAL: Shuffle both the files and samples.
|
||||
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
|
||||
|
||||
- Shuffle.FILES: Shuffle files only.
|
||||
|
||||
|
@ -1217,10 +1226,11 @@ class TextFileDataset(SourceDataset, TextBaseDataset):
|
|||
(default=None, which means no cache is used).
|
||||
|
||||
Raises:
|
||||
RuntimeError: If dataset_files are not valid or do not exist.
|
||||
RuntimeError: If num_parallel_workers exceeds the max thread numbers.
|
||||
ValueError: If dataset_files are not valid or do not exist.
|
||||
ValueError: If num_parallel_workers exceeds the max thread numbers.
|
||||
RuntimeError: If num_shards is specified but shard_id is None.
|
||||
RuntimeError: If shard_id is specified but num_shards is None.
|
||||
ValueError: If shard_id is invalid (< 0 or >= num_shards).
|
||||
|
||||
Examples:
|
||||
>>> text_file_dataset_dir = ["/path/to/text_file_dataset_file"] # contains 1 or multiple text files
|
||||
|
|
|
@ -467,11 +467,11 @@ class GeneratorDataset(MappableDataset, TextBaseDataset):
|
|||
Raises:
|
||||
RuntimeError: If source raises an exception during execution.
|
||||
RuntimeError: If len of column_names does not match output len of source.
|
||||
RuntimeError: If num_parallel_workers exceeds the max thread numbers.
|
||||
RuntimeError: If sampler and shuffle are specified at the same time.
|
||||
RuntimeError: If sampler and sharding are specified at the same time.
|
||||
RuntimeError: If num_shards is specified but shard_id is None.
|
||||
RuntimeError: If shard_id is specified but num_shards is None.
|
||||
ValueError: If num_parallel_workers exceeds the max thread numbers.
|
||||
ValueError: If sampler and shuffle are specified at the same time.
|
||||
ValueError: If sampler and sharding are specified at the same time.
|
||||
ValueError: If num_shards is specified but shard_id is None.
|
||||
ValueError: If shard_id is specified but num_shards is None.
|
||||
ValueError: If shard_id is invalid (< 0 or >= num_shards).
|
||||
|
||||
Note:
|
||||
|
|
|
@ -72,6 +72,10 @@ DE_C_INTER_OUTPUT_FORMAT = {
|
|||
class GraphData:
|
||||
"""
|
||||
Reads the graph dataset used for GNN training from the shared file and database.
|
||||
Support reading graph datasets like Cora, Citeseer and PubMed.
|
||||
|
||||
About how to load raw graph dataset into MindSpore please
|
||||
refer to `Loading Graph Dataset <https://mindspore.cn/docs/programming_guide/zh-CN/master/load_dataset_gnn.html>`_.
|
||||
|
||||
Args:
|
||||
dataset_file (str): One of file names in the dataset.
|
||||
|
@ -98,6 +102,17 @@ class GraphData:
|
|||
when the number of connected clients reaches num_client and no client is being connected,
|
||||
the server automatically exits (default=True).
|
||||
|
||||
Raises:
|
||||
ValueError: If `dataset_file` does not exist or permission denied.
|
||||
TypeError: If `num_parallel_workers` exceeds the max thread numbers.
|
||||
ValueError: If `working_mode` is not 'local', 'client' or 'server'.
|
||||
TypeError: If `hostname` is illegal.
|
||||
ValueError: If `port` is not in range [1024, 65535].
|
||||
ValueError: If `num_client` is not in range [1, 255].
|
||||
|
||||
Supported Platforms:
|
||||
``CPU``
|
||||
|
||||
Examples:
|
||||
>>> graph_dataset_dir = "/path/to/graph_dataset_file"
|
||||
>>> graph_dataset = ds.GraphData(dataset_file=graph_dataset_dir, num_parallel_workers=2)
|
||||
|
|
|
@ -81,10 +81,14 @@ class Iterator:
|
|||
|
||||
self._transform_tensor = lambda t: t.as_array()
|
||||
if not output_numpy:
|
||||
if do_copy:
|
||||
self._transform_tensor = lambda t: Tensor(t.as_array())
|
||||
else:
|
||||
self._transform_tensor = lambda t: Tensor.from_numpy(t.as_array())
|
||||
def _transform(t, do_copy):
|
||||
array = t.as_array()
|
||||
if array.dtype.type is np.bytes_:
|
||||
array = array.astype(np.str_)
|
||||
if do_copy:
|
||||
return Tensor(array)
|
||||
return Tensor.from_numpy(array)
|
||||
self._transform_tensor = lambda t: _transform(t, do_copy)
|
||||
self.__index = 0
|
||||
|
||||
self.offload_model = None
|
||||
|
|
|
@ -109,9 +109,9 @@ def show(dataset, indentation=2):
|
|||
Do not indent if indentation is None (default=2).
|
||||
|
||||
Examples:
|
||||
>>> dataset = ds.MnistDataset(mnist_dataset_dir, 100)
|
||||
>>> dataset = ds.MnistDataset(mnist_dataset_dir, num_samples=100)
|
||||
>>> one_hot_encode = c_transforms.OneHot(10)
|
||||
>>> dataset = dataset.map(operation=one_hot_encode, input_column_names="label")
|
||||
>>> dataset = dataset.map(operations=one_hot_encode, input_columns="label")
|
||||
>>> dataset = dataset.batch(batch_size=10, drop_remainder=True)
|
||||
>>> ds.show(dataset)
|
||||
"""
|
||||
|
|
|
@ -192,12 +192,12 @@ class Vocab:
|
|||
>>> # cat,00
|
||||
>>> # --- end of file ---
|
||||
>>>
|
||||
>>> # Read file through this API and specify "," as delimiter,
|
||||
>>> # then the delimiter will break up each line in file, the first element is taken to be the word.
|
||||
>>> # Read file through this API and specify "," as delimiter.
|
||||
>>> # The delimiter will break up each line in file, then the first element is taken to be the word.
|
||||
>>> vocab = text.Vocab.from_file("/path/to/simple/vocab/file", ",", None, ["<pad>", "<unk>"], True)
|
||||
>>>
|
||||
>>> # Finally, there are 5 words in the vocab: "<pad>", "<unk>", "apple", "banana", "cat"
|
||||
>>> print(vocab.vocab())
|
||||
>>> # Finally, there are 5 words in the vocab: "<pad>", "<unk>", "apple", "banana", "cat".
|
||||
>>> vocabulary = vocab.vocab()
|
||||
"""
|
||||
if vocab_size is None:
|
||||
vocab_size = -1
|
||||
|
|
|
@ -182,16 +182,56 @@ def test_textline_dataset_repeat():
|
|||
assert count == 9
|
||||
|
||||
|
||||
def test_textline_dataset_output_tensor():
|
||||
"""
|
||||
Feature: Test text dataset output string and construct mindspore.Tensor.
|
||||
Description: set output_numpy=False in create_dict_iterator.
|
||||
Expectation: output tensor successfully.
|
||||
"""
|
||||
data = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
||||
expected_text = ["This is a text file.", "Be happy every day.", "Good luck to everyone."]
|
||||
|
||||
count = 0
|
||||
for i in data.create_dict_iterator(num_epochs=1, output_numpy=False):
|
||||
logger.info("{}".format(i["text"]))
|
||||
assert expected_text[count] == str(i["text"])
|
||||
count += 1
|
||||
assert count == 3
|
||||
|
||||
count = 0
|
||||
for i in data.create_tuple_iterator(num_epochs=1, output_numpy=False, do_copy=True):
|
||||
logger.info("{}".format(i[0]))
|
||||
assert expected_text[count] == str(i[0])
|
||||
count += 1
|
||||
assert count == 3
|
||||
|
||||
count = 0
|
||||
for i in data.create_tuple_iterator(num_epochs=1, output_numpy=False, do_copy=False):
|
||||
logger.info("{}".format(i[0]))
|
||||
assert expected_text[count] == str(i[0])
|
||||
count += 1
|
||||
assert count == 3
|
||||
|
||||
count = 0
|
||||
for i in data:
|
||||
logger.info("{}".format(i[0]))
|
||||
assert expected_text[count] == str(i[0])
|
||||
count += 1
|
||||
assert count == 3
|
||||
|
||||
|
||||
def test_textline_dataset_get_datasetsize():
|
||||
data = ds.TextFileDataset(DATA_FILE)
|
||||
size = data.get_dataset_size()
|
||||
assert size == 3
|
||||
|
||||
|
||||
def test_textline_dataset_to_device():
|
||||
data = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
||||
data = data.to_device()
|
||||
data.send()
|
||||
|
||||
|
||||
def test_textline_dataset_exceptions():
|
||||
with pytest.raises(ValueError) as error_info:
|
||||
_ = ds.TextFileDataset(DATA_FILE, num_samples=-1)
|
||||
|
|
Loading…
Reference in New Issue