fix minddata api doc

This commit is contained in:
luoyang 2022-01-13 15:28:36 +08:00
parent 8539de0e01
commit e5f2c75233
9 changed files with 210 additions and 138 deletions

View File

@ -1809,7 +1809,8 @@ class TextBaseDataset(Dataset):
def build_vocab(self, columns, freq_range, top_k, special_tokens, special_first):
"""
Function to create a Vocab from source dataset
Function to create a Vocab from source dataset.
The desired source dataset is a text dataset.
Build a vocab from a dataset. This would collect all the unique words in a dataset and return a vocab
which contains top_k most frequent words (if top_k is specified)
@ -1879,7 +1880,8 @@ class TextBaseDataset(Dataset):
def build_sentencepiece_vocab(self, columns, vocab_size, character_coverage, model_type, params):
"""
Function to create a SentencePieceVocab from source dataset
Function to create a SentencePieceVocab from source dataset.
The desired source dataset is a text dataset.
Args:
@ -1899,6 +1901,7 @@ class TextBaseDataset(Dataset):
Examples:
>>> from mindspore.dataset.text import SentencePieceModel
>>>
>>> # You can construct any text dataset as source, take TextFileDataset as example.
>>> dataset = ds.TextFileDataset("/path/to/sentence/piece/vocab/file", shuffle=False)
>>> dataset = dataset.build_sentencepiece_vocab(["text"], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
"""

View File

@ -46,11 +46,11 @@ class MindDataset(MappableDataset, TextBaseDataset):
num_parallel_workers (int, optional): The number of readers (default=None).
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
(default=None, performs global shuffle).
If shuffle is False, no shuffling will be performed;
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
Otherwise, there are three levels of shuffling:
If shuffle is False, no shuffling will be performed.
If shuffle is True, performs global shuffle.
There are three levels of shuffling, and the desired shuffle enum is defined by mindspore.dataset.Shuffle.
- Shuffle.GLOBAL: Global shuffle of all rows of data in dataset.
- Shuffle.GLOBAL: Global shuffle of all rows of data in dataset, same as setting shuffle to True.
- Shuffle.FILES: Shuffle the file sequence but keep the order of data within each file.
@ -74,8 +74,8 @@ class MindDataset(MappableDataset, TextBaseDataset):
(default=None, which means no cache is used).
Raises:
RuntimeError: If dataset_files are not valid or do not exist.
RuntimeError: If num_parallel_workers exceeds the max thread numbers.
ValueError: If dataset_files are not valid or do not exist.
ValueError: If num_parallel_workers exceeds the max thread numbers.
RuntimeError: If num_shards is specified but shard_id is None.
RuntimeError: If shard_id is specified but num_shards is None.
ValueError: If shard_id is invalid (< 0 or >= num_shards).
@ -180,11 +180,11 @@ class TFRecordDataset(SourceDataset, TextBaseDataset):
(default=None, number set in the config).
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
(default=Shuffle.GLOBAL).
If shuffle is False, no shuffling will be performed;
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
Otherwise, there are two levels of shuffling:
If shuffle is False, no shuffling will be performed.
If shuffle is True, performs global shuffle.
There are three levels of shuffling, and the desired shuffle enum is defined by mindspore.dataset.Shuffle.
- Shuffle.GLOBAL: Shuffle both the files and samples.
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
- Shuffle.FILES: Shuffle files only.
@ -201,8 +201,8 @@ class TFRecordDataset(SourceDataset, TextBaseDataset):
(default=None, which means no cache is used).
Raises:
RuntimeError: If dataset_files are not valid or do not exist.
RuntimeError: If num_parallel_workers exceeds the max thread numbers.
ValueError: If dataset_files are not valid or do not exist.
ValueError: If num_parallel_workers exceeds the max thread numbers.
RuntimeError: If num_shards is specified but shard_id is None.
RuntimeError: If shard_id is specified but num_shards is None.
ValueError: If shard_id is invalid (< 0 or >= num_shards).

View File

@ -47,11 +47,11 @@ class AGNewsDataset(SourceDataset, TextBaseDataset):
(default=None, number set in the config).
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
(default=Shuffle.GLOBAL).
If shuffle is False, no shuffling will be performed;
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
Otherwise, there are two levels of shuffling:
If shuffle is False, no shuffling will be performed.
If shuffle is True, performs global shuffle.
There are three levels of shuffling, and the desired shuffle enum is defined by mindspore.dataset.Shuffle.
- Shuffle.GLOBAL: Shuffle both the files and samples.
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
- Shuffle.FILES: Shuffle files only.
@ -139,11 +139,11 @@ class AmazonReviewDataset(SourceDataset):
num_samples (int, optional): Number of samples (rows) to be read (default=None, reads the full dataset).
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
(default=Shuffle.GLOBAL).
If shuffle is False, no shuffling will be performed;
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
Otherwise, there are two levels of shuffling:
If shuffle is False, no shuffling will be performed.
If shuffle is True, performs global shuffle.
There are three levels of shuffling, and the desired shuffle enum is defined by mindspore.dataset.Shuffle.
- Shuffle.GLOBAL: Shuffle both the files and samples.
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
- Shuffle.FILES: Shuffle files only.
num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
@ -217,61 +217,6 @@ class CLUEDataset(SourceDataset, TextBaseDataset):
A source dataset that reads and parses CLUE datasets.
Supported CLUE classification tasks: `AFQMC`, `TNEWS`, `IFLYTEK`, `CMNLI`, `WSC` and `CSL`.
The generated dataset with different task setting has different output columns:
- task = :py:obj:`AFQMC`
- usage = :py:obj:`train`, output columns: :py:obj:`[sentence1, dtype=string]`, \
:py:obj:`[sentence2, dtype=string]`, :py:obj:`[label, dtype=string]`.
- usage = :py:obj:`test`, output columns: :py:obj:`[id, dtype=uint8]`, \
:py:obj:`[sentence1, dtype=string]`, :py:obj:`[sentence2, dtype=string]`.
- usage = :py:obj:`eval`, output columns: :py:obj:`[sentence1, dtype=string]`, \
:py:obj:`[sentence2, dtype=string]`, :py:obj:`[label, dtype=string]`.
- task = :py:obj:`TNEWS`
- usage = :py:obj:`train`, output columns: :py:obj:`[label, dtype=string]`, \
:py:obj:`[label_des, dtype=string]`, :py:obj:`[sentence, dtype=string]`, :py:obj:`[keywords, dtype=string]`.
- usage = :py:obj:`test`, output columns: :py:obj:`[label, dtype=string]`, \
:py:obj:`[label_des, dtype=string]`, :py:obj:`[sentence, dtype=string]`, :py:obj:`[keywords, dtype=string]`.
- usage = :py:obj:`eval`, output columns: :py:obj:`[label, dtype=string]`, \
:py:obj:`[label_des, dtype=string]`, :py:obj:`[sentence, dtype=string]`, :py:obj:`[keywords, dtype=string]`.
- task = :py:obj:`IFLYTEK`
- usage = :py:obj:`train`, output columns: :py:obj:`[label, dtype=string]`, \
:py:obj:`[label_des, dtype=string]`, :py:obj:`[sentence, dtype=string]`.
- usage = :py:obj:`test`, output columns: :py:obj:`[id, dtype=string]`, \
:py:obj:`[sentence, dtype=string]`.
- usage = :py:obj:`eval`, output columns: :py:obj:`[label, dtype=string]`, \
:py:obj:`[label_des, dtype=string]`, :py:obj:`[sentence, dtype=string]`.
- task = :py:obj:`CMNLI`
- usage = :py:obj:`train`, output columns: :py:obj:`[sentence1, dtype=string]`, \
:py:obj:`[sentence2, dtype=string]`, :py:obj:`[label, dtype=string]`.
- usage = :py:obj:`test`, output columns: :py:obj:`[id, dtype=uint8]`, \
:py:obj:`[sentence1, dtype=string]`, :py:obj:`[sentence2, dtype=string]`.
- usage = :py:obj:`eval`, output columns: :py:obj:`[sentence1, dtype=string]`, \
:py:obj:`[sentence2, dtype=string]`, :py:obj:`[label, dtype=string]`.
- task = :py:obj:`WSC`
- usage = :py:obj:`train`, output columns: :py:obj:`[span1_index, dtype=uint8]`, \
:py:obj:`[span2_index, dtype=uint8]`, :py:obj:`[span1_text, dtype=string]`, \
:py:obj:`[span2_text, dtype=string]`, :py:obj:`[idx, dtype=uint8]`, \
:py:obj:`[text, dtype=string]`, :py:obj:`[label, dtype=string]`.
- usage = :py:obj:`test`, output columns: :py:obj:`[span1_index, dtype=uint8]`, \
:py:obj:`[span2_index, dtype=uint8]`, :py:obj:`[span1_text, dtype=string]`, \
:py:obj:`[span2_text, dtype=string]`, :py:obj:`[idx, dtype=uint8]`, :py:obj:`[text, dtype=string]`.
- usage = :py:obj:`eval`, output columns: :py:obj:`[span1_index, dtype=uint8]`, \
:py:obj:`[span2_index, dtype=uint8]`, :py:obj:`[span1_text, dtype=string]`, \
:py:obj:`[span2_text, dtype=string]`, :py:obj:`[idx, dtype=uint8]`, \
:py:obj:`[text, dtype=string]`, :py:obj:`[label, dtype=string]`.
- task = :py:obj:`CSL`
- usage = :py:obj:`train`, output columns: :py:obj:`[id, dtype=uint8]`, \
:py:obj:`[abst, dtype=string]`, :py:obj:`[keyword, dtype=string]`, :py:obj:`[label, dtype=string]`.
- usage = :py:obj:`test`, output columns: :py:obj:`[id, dtype=uint8]`, \
:py:obj:`[abst, dtype=string]`, :py:obj:`[keyword, dtype=string]`.
- usage = :py:obj:`eval`, output columns: :py:obj:`[id, dtype=uint8]`, \
:py:obj:`[abst, dtype=string]`, :py:obj:`[keyword, dtype=string]`, :py:obj:`[label, dtype=string]`.
Args:
dataset_files (Union[str, list[str]]): String or list of files to be read or glob strings to search for
a pattern of files. The list will be sorted in a lexicographical order.
@ -284,11 +229,11 @@ class CLUEDataset(SourceDataset, TextBaseDataset):
(default=None, number set in the config).
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
(default=Shuffle.GLOBAL).
If shuffle is False, no shuffling will be performed;
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
Otherwise, there are two levels of shuffling:
If shuffle is False, no shuffling will be performed.
If shuffle is True, performs global shuffle.
There are three levels of shuffling, and the desired shuffle enum is defined by mindspore.dataset.Shuffle.
- Shuffle.GLOBAL: Shuffle both the files and samples.
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
- Shuffle.FILES: Shuffle files only.
@ -299,11 +244,72 @@ class CLUEDataset(SourceDataset, TextBaseDataset):
cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
(default=None, which means no cache is used).
Note:
The generated dataset with different task setting has different output columns:
- task = :py:obj:`AFQMC`
- usage = :py:obj:`train`, output columns: :py:obj:`[sentence1, dtype=string]`, \
:py:obj:`[sentence2, dtype=string]`, :py:obj:`[label, dtype=string]`.
- usage = :py:obj:`test`, output columns: :py:obj:`[id, dtype=uint32]`, \
:py:obj:`[sentence1, dtype=string]`, :py:obj:`[sentence2, dtype=string]`.
- usage = :py:obj:`eval`, output columns: :py:obj:`[sentence1, dtype=string]`, \
:py:obj:`[sentence2, dtype=string]`, :py:obj:`[label, dtype=string]`.
- task = :py:obj:`TNEWS`
- usage = :py:obj:`train`, output columns: :py:obj:`[label, dtype=string]`, \
:py:obj:`[label_des, dtype=string]`, :py:obj:`[sentence, dtype=string]`, \
:py:obj:`[keywords, dtype=string]`.
- usage = :py:obj:`test`, output columns: :py:obj:`[id, dtype=uint32]`, \
:py:obj:`[keywords, dtype=string]`, :py:obj:`[sentence, dtype=string]`.
- usage = :py:obj:`eval`, output columns: :py:obj:`[label, dtype=string]`, \
:py:obj:`[label_desc, dtype=string]`, :py:obj:`[sentence, dtype=string]`,\
:py:obj:`[keywords, dtype=string]`.
- task = :py:obj:`IFLYTEK`
- usage = :py:obj:`train`, output columns: :py:obj:`[label, dtype=string]`, \
:py:obj:`[label_des, dtype=string]`, :py:obj:`[sentence, dtype=string]`.
- usage = :py:obj:`test`, output columns: :py:obj:`[id, dtype=uint32]`, \
:py:obj:`[sentence, dtype=string]`.
- usage = :py:obj:`eval`, output columns: :py:obj:`[label, dtype=string]`, \
:py:obj:`[label_des, dtype=string]`, :py:obj:`[sentence, dtype=string]`.
- task = :py:obj:`CMNLI`
- usage = :py:obj:`train`, output columns: :py:obj:`[sentence1, dtype=string]`, \
:py:obj:`[sentence2, dtype=string]`, :py:obj:`[label, dtype=string]`.
- usage = :py:obj:`test`, output columns: :py:obj:`[id, dtype=uint32]`, \
:py:obj:`[sentence1, dtype=string]`, :py:obj:`[sentence2, dtype=string]`.
- usage = :py:obj:`eval`, output columns: :py:obj:`[sentence1, dtype=string]`, \
:py:obj:`[sentence2, dtype=string]`, :py:obj:`[label, dtype=string]`.
- task = :py:obj:`WSC`
- usage = :py:obj:`train`, output columns: :py:obj:`[span1_index, dtype=uint32]`, \
:py:obj:`[span2_index, dtype=uint32]`, :py:obj:`[span1_text, dtype=string]`, \
:py:obj:`[span2_text, dtype=string]`, :py:obj:`[idx, dtype=uint32]`, \
:py:obj:`[text, dtype=string]`, :py:obj:`[label, dtype=string]`.
- usage = :py:obj:`test`, output columns: :py:obj:`[span1_index, dtype=uint32]`, \
:py:obj:`[span2_index, dtype=uint32]`, :py:obj:`[span1_text, dtype=string]`, \
:py:obj:`[span2_text, dtype=string]`, :py:obj:`[idx, dtype=uint32]`, :py:obj:`[text, dtype=string]`.
- usage = :py:obj:`eval`, output columns: :py:obj:`[span1_index, dtype=uint32]`, \
:py:obj:`[span2_index, dtype=uint32]`, :py:obj:`[span1_text, dtype=string]`, \
:py:obj:`[span2_text, dtype=string]`, :py:obj:`[idx, dtype=uint32]`, \
:py:obj:`[text, dtype=string]`, :py:obj:`[label, dtype=string]`.
- task = :py:obj:`CSL`
- usage = :py:obj:`train`, output columns: :py:obj:`[id, dtype=uint32]`, \
:py:obj:`[abst, dtype=string]`, :py:obj:`[keyword, dtype=string]`, :py:obj:`[label, dtype=string]`.
- usage = :py:obj:`test`, output columns: :py:obj:`[id, dtype=uint32]`, \
:py:obj:`[abst, dtype=string]`, :py:obj:`[keyword, dtype=string]`.
- usage = :py:obj:`eval`, output columns: :py:obj:`[id, dtype=uint32]`, \
:py:obj:`[abst, dtype=string]`, :py:obj:`[keyword, dtype=string]`, :py:obj:`[label, dtype=string]`.
Raises:
RuntimeError: If dataset_files are not valid or do not exist.
RuntimeError: If num_parallel_workers exceeds the max thread numbers.
ValueError: If dataset_files are not valid or do not exist.
ValueError: If task is not in 'AFQMC', 'TNEWS', 'IFLYTEK', 'CMNLI', 'WSC' or 'CSL'.
ValueError: If usage is not in 'train', 'test' or 'eval'.
ValueError: If num_parallel_workers exceeds the max thread numbers.
RuntimeError: If num_shards is specified but shard_id is None.
RuntimeError: If shard_id is specified but num_shards is None.
ValueError: If shard_id is invalid (< 0 or >= num_shards).
Examples:
>>> clue_dataset_dir = ["/path/to/clue_dataset_file"] # contains 1 or multiple clue files
@ -373,11 +379,11 @@ class CoNLL2000Dataset(SourceDataset):
num_samples (int, optional): Number of samples (rows) to read (default=None, reads the full dataset).
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
(default=Shuffle.GLOBAL).
If shuffle is False, no shuffling will be performed;
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
Otherwise, there are two levels of shuffling:
If shuffle is False, no shuffling will be performed.
If shuffle is True, performs global shuffle.
There are three levels of shuffling, and the desired shuffle enum is defined by mindspore.dataset.Shuffle.
- Shuffle.GLOBAL: Shuffle both the files and samples.
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
- Shuffle.FILES: Shuffle files only.
@ -416,7 +422,8 @@ class CoNLL2000Dataset(SourceDataset):
class CSVDataset(SourceDataset, TextBaseDataset):
"""
A source dataset that reads and parses comma-separated values (CSV) datasets.
A source dataset that reads and parses comma-separated values
`(CSV) <https://en.wikipedia.org/wiki/Comma-separated_values>`_ files as a dataset.
The columns of generated dataset depend on the source CSV files.
Args:
@ -434,11 +441,11 @@ class CSVDataset(SourceDataset, TextBaseDataset):
(default=None, number set in the config).
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
(default=Shuffle.GLOBAL).
If shuffle is False, no shuffling will be performed;
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
Otherwise, there are two levels of shuffling:
If shuffle is False, no shuffling will be performed.
If shuffle is True, performs global shuffle.
There are three levels of shuffling, and the desired shuffle enum is defined by mindspore.dataset.Shuffle.
- Shuffle.GLOBAL: Shuffle both the files and samples.
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
- Shuffle.FILES: Shuffle files only.
@ -451,9 +458,11 @@ class CSVDataset(SourceDataset, TextBaseDataset):
Raises:
RuntimeError: If dataset_files are not valid or do not exist.
RuntimeError: If num_parallel_workers exceeds the max thread numbers.
ValueError: If field_delim is invalid.
ValueError: If num_parallel_workers exceeds the max thread numbers.
RuntimeError: If num_shards is specified but shard_id is None.
RuntimeError: If shard_id is specified but num_shards is None.
ValueError: If shard_id is invalid (< 0 or >= num_shards).
Examples:
>>> csv_dataset_dir = ["/path/to/csv_dataset_file"] # contains 1 or multiple csv files
@ -497,11 +506,11 @@ class DBpediaDataset(SourceDataset, TextBaseDataset):
(default=None, number set in the config).
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
(default=Shuffle.GLOBAL).
If shuffle is False, no shuffling will be performed;
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL;
Otherwise, there are two levels of shuffling:
If shuffle is False, no shuffling will be performed.
If shuffle is True, performs global shuffle.
There are three levels of shuffling, and the desired shuffle enum is defined by mindspore.dataset.Shuffle.
- Shuffle.GLOBAL: Shuffle both the files and samples.
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
- Shuffle.FILES: Shuffle files only.
@ -586,11 +595,11 @@ class EnWik9Dataset(SourceDataset):
(default=None, number set in the config).
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
(default=True).
If shuffle is False, no shuffling will be performed;
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
Otherwise, there are two levels of shuffling:
If shuffle is False, no shuffling will be performed.
If shuffle is True, performs global shuffle.
There are three levels of shuffling, and the desired shuffle enum is defined by mindspore.dataset.Shuffle.
- Shuffle.GLOBAL: Shuffle both the files and samples.
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
- Shuffle.FILES: Shuffle files only.
@ -812,11 +821,11 @@ class IWSLT2016Dataset(SourceDataset, TextBaseDataset):
num_samples (int, optional): Number of samples (rows) to read (default=None, reads the full dataset).
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
(default=Shuffle.GLOBAL).
If shuffle is False, no shuffling will be performed;
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
Otherwise, there are two levels of shuffling:
If shuffle is False, no shuffling will be performed.
If shuffle is True, performs global shuffle.
There are three levels of shuffling, and the desired shuffle enum is defined by mindspore.dataset.Shuffle.
- Shuffle.GLOBAL: Shuffle both the files and samples.
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
- Shuffle.FILES: Shuffle files only.
num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
@ -933,11 +942,11 @@ class IWSLT2017Dataset(SourceDataset, TextBaseDataset):
num_samples (int, optional): Number of samples (rows) to read (default=None, reads the full dataset).
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
(default=Shuffle.GLOBAL).
If shuffle is False, no shuffling will be performed;
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
Otherwise, there are two levels of shuffling:
If shuffle is False, no shuffling will be performed.
If shuffle is True, performs global shuffle.
There are three levels of shuffling, and the desired shuffle enum is defined by mindspore.dataset.Shuffle.
- Shuffle.GLOBAL: Shuffle both the files and samples.
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
- Shuffle.FILES: Shuffle files only.
num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
@ -1030,11 +1039,11 @@ class PennTreebankDataset(SourceDataset, TextBaseDataset):
(default=None, number set in the config).
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
(default=Shuffle.GLOBAL).
If shuffle is False, no shuffling will be performed;
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
Otherwise, there are two levels of shuffling:
If shuffle is False, no shuffling will be performed.
If shuffle is True, performs global shuffle.
There are three levels of shuffling, and the desired shuffle enum is defined by mindspore.dataset.Shuffle.
- Shuffle.GLOBAL: Shuffle both the files and samples.
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
- Shuffle.FILES: Shuffle files only.
@ -1117,11 +1126,11 @@ class SogouNewsDataset(SourceDataset):
num_samples (int, optional): Number of samples (rows) to read (default=None, read all samples).
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
(default=Shuffle.GLOBAL).
If shuffle is False, no shuffling will be performed;
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
Otherwise, there are two levels of shuffling:
If shuffle is False, no shuffling will be performed.
If shuffle is True, performs global shuffle.
There are three levels of shuffling, and the desired shuffle enum is defined by mindspore.dataset.Shuffle.
- Shuffle.GLOBAL: Shuffle both the files and samples.
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
- Shuffle.FILES: Shuffle files only.
num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
@ -1201,11 +1210,11 @@ class TextFileDataset(SourceDataset, TextBaseDataset):
(default=None, number set in the config).
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
(default=Shuffle.GLOBAL).
If shuffle is False, no shuffling will be performed;
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
Otherwise, there are two levels of shuffling:
If shuffle is False, no shuffling will be performed.
If shuffle is True, performs global shuffle.
There are three levels of shuffling, and the desired shuffle enum is defined by mindspore.dataset.Shuffle.
- Shuffle.GLOBAL: Shuffle both the files and samples.
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
- Shuffle.FILES: Shuffle files only.
@ -1217,10 +1226,11 @@ class TextFileDataset(SourceDataset, TextBaseDataset):
(default=None, which means no cache is used).
Raises:
RuntimeError: If dataset_files are not valid or do not exist.
RuntimeError: If num_parallel_workers exceeds the max thread numbers.
ValueError: If dataset_files are not valid or do not exist.
ValueError: If num_parallel_workers exceeds the max thread numbers.
RuntimeError: If num_shards is specified but shard_id is None.
RuntimeError: If shard_id is specified but num_shards is None.
ValueError: If shard_id is invalid (< 0 or >= num_shards).
Examples:
>>> text_file_dataset_dir = ["/path/to/text_file_dataset_file"] # contains 1 or multiple text files

View File

@ -467,11 +467,11 @@ class GeneratorDataset(MappableDataset, TextBaseDataset):
Raises:
RuntimeError: If source raises an exception during execution.
RuntimeError: If len of column_names does not match output len of source.
RuntimeError: If num_parallel_workers exceeds the max thread numbers.
RuntimeError: If sampler and shuffle are specified at the same time.
RuntimeError: If sampler and sharding are specified at the same time.
RuntimeError: If num_shards is specified but shard_id is None.
RuntimeError: If shard_id is specified but num_shards is None.
ValueError: If num_parallel_workers exceeds the max thread numbers.
ValueError: If sampler and shuffle are specified at the same time.
ValueError: If sampler and sharding are specified at the same time.
ValueError: If num_shards is specified but shard_id is None.
ValueError: If shard_id is specified but num_shards is None.
ValueError: If shard_id is invalid (< 0 or >= num_shards).
Note:

View File

@ -72,6 +72,10 @@ DE_C_INTER_OUTPUT_FORMAT = {
class GraphData:
"""
Reads the graph dataset used for GNN training from the shared file and database.
Support reading graph datasets like Cora, Citeseer and PubMed.
About how to load raw graph dataset into MindSpore please
refer to `Loading Graph Dataset <https://mindspore.cn/docs/programming_guide/zh-CN/master/load_dataset_gnn.html>`_.
Args:
dataset_file (str): One of file names in the dataset.
@ -98,6 +102,17 @@ class GraphData:
when the number of connected clients reaches num_client and no client is being connected,
the server automatically exits (default=True).
Raises:
ValueError: If `dataset_file` does not exist or permission denied.
TypeError: If `num_parallel_workers` exceeds the max thread numbers.
ValueError: If `working_mode` is not 'local', 'client' or 'server'.
TypeError: If `hostname` is invalid.
ValueError: If `port` is not in range [1024, 65535].
ValueError: If `num_client` is not in range [1, 255].
Supported Platforms:
``CPU``
Examples:
>>> graph_dataset_dir = "/path/to/graph_dataset_file"
>>> graph_dataset = ds.GraphData(dataset_file=graph_dataset_dir, num_parallel_workers=2)

View File

@ -81,10 +81,14 @@ class Iterator:
self._transform_tensor = lambda t: t.as_array()
if not output_numpy:
if do_copy:
self._transform_tensor = lambda t: Tensor(t.as_array())
else:
self._transform_tensor = lambda t: Tensor.from_numpy(t.as_array())
def _transform(t, do_copy):
array = t.as_array()
if array.dtype.type is np.bytes_:
array = array.astype(np.str_)
if do_copy:
return Tensor(array)
return Tensor.from_numpy(array)
self._transform_tensor = lambda t: _transform(t, do_copy)
self.__index = 0
self.offload_model = None

View File

@ -109,9 +109,9 @@ def show(dataset, indentation=2):
Do not indent if indentation is None (default=2).
Examples:
>>> dataset = ds.MnistDataset(mnist_dataset_dir, 100)
>>> dataset = ds.MnistDataset(mnist_dataset_dir, num_samples=100)
>>> one_hot_encode = c_transforms.OneHot(10)
>>> dataset = dataset.map(operation=one_hot_encode, input_column_names="label")
>>> dataset = dataset.map(operations=one_hot_encode, input_columns="label")
>>> dataset = dataset.batch(batch_size=10, drop_remainder=True)
>>> ds.show(dataset)
"""

View File

@ -192,12 +192,12 @@ class Vocab:
>>> # cat,00
>>> # --- end of file ---
>>>
>>> # Read file through this API and specify "," as delimiter,
>>> # then the delimiter will break up each line in file, the first element is taken to be the word.
>>> # Read file through this API and specify "," as delimiter.
>>> # The delimiter will break up each line in file, then the first element is taken to be the word.
>>> vocab = text.Vocab.from_file("/path/to/simple/vocab/file", ",", None, ["<pad>", "<unk>"], True)
>>>
>>> # Finally, there are 5 words in the vocab: "<pad>", "<unk>", "apple", "banana", "cat"
>>> print(vocab.vocab())
>>> # Finally, there are 5 words in the vocab: "<pad>", "<unk>", "apple", "banana", "cat".
>>> vocabulary = vocab.vocab()
"""
if vocab_size is None:
vocab_size = -1

View File

@ -182,16 +182,56 @@ def test_textline_dataset_repeat():
assert count == 9
def test_textline_dataset_output_tensor():
    """
    Feature: Test text dataset output string and construct mindspore.Tensor.
    Description: Set output_numpy=False when creating iterators (dict, tuple with/without
        do_copy, and plain dataset iteration).
    Expectation: Each iterator yields the 3 expected text rows, in order.
    """
    data = ds.TextFileDataset(DATA_FILE, shuffle=False)
    expected_text = ["This is a text file.", "Be happy every day.", "Good luck to everyone."]

    def _check_rows(iterator, extract):
        # Verify every row matches the expected text in order and that exactly
        # 3 rows are produced; `extract` pulls the text column out of one row.
        count = 0
        for item in iterator:
            value = extract(item)
            logger.info("{}".format(value))
            assert expected_text[count] == str(value)
            count += 1
        assert count == 3

    # Dict iterator: rows are dicts keyed by column name.
    _check_rows(data.create_dict_iterator(num_epochs=1, output_numpy=False),
                lambda i: i["text"])
    # Tuple iterator with do_copy=True: Tensor constructed by copy.
    _check_rows(data.create_tuple_iterator(num_epochs=1, output_numpy=False, do_copy=True),
                lambda i: i[0])
    # Tuple iterator with do_copy=False: Tensor constructed via from_numpy.
    _check_rows(data.create_tuple_iterator(num_epochs=1, output_numpy=False, do_copy=False),
                lambda i: i[0])
    # Default iteration over the dataset object itself.
    _check_rows(data, lambda i: i[0])
def test_textline_dataset_get_datasetsize():
    """
    Feature: TextFileDataset.
    Description: Test get_dataset_size on a text file dataset.
    Expectation: Dataset size equals 3.
    """
    data = ds.TextFileDataset(DATA_FILE)
    size = data.get_dataset_size()
    assert size == 3
def test_textline_dataset_to_device():
    """
    Feature: TextFileDataset.
    Description: Transfer the dataset to device via to_device() and start sending with send().
    Expectation: Runs without raising an exception.
    """
    data = ds.TextFileDataset(DATA_FILE, shuffle=False)
    data = data.to_device()
    data.send()
def test_textline_dataset_exceptions():
with pytest.raises(ValueError) as error_info:
_ = ds.TextFileDataset(DATA_FILE, num_samples=-1)