fix minddata api doc

This commit is contained in:
luoyang 2022-01-13 15:28:36 +08:00
parent 8539de0e01
commit e5f2c75233
9 changed files with 210 additions and 138 deletions

View File

@ -1809,7 +1809,8 @@ class TextBaseDataset(Dataset):
def build_vocab(self, columns, freq_range, top_k, special_tokens, special_first):
"""
Function to create a Vocab from source dataset
Function to create a Vocab from source dataset.
The desired source dataset is a text dataset.
Build a vocab from a dataset. This would collect all the unique words in a dataset and return a vocab
which contains top_k most frequent words (if top_k is specified)
@ -1879,7 +1880,8 @@ class TextBaseDataset(Dataset):
def build_sentencepiece_vocab(self, columns, vocab_size, character_coverage, model_type, params):
"""
Function to create a SentencePieceVocab from source dataset
Function to create a SentencePieceVocab from source dataset.
The desired source dataset is a text dataset.
Args:
@ -1899,6 +1901,7 @@ class TextBaseDataset(Dataset):
Examples:
>>> from mindspore.dataset.text import SentencePieceModel
>>>
>>> # You can construct any text dataset as source, take TextFileDataset as example.
>>> dataset = ds.TextFileDataset("/path/to/sentence/piece/vocab/file", shuffle=False)
>>> dataset = dataset.build_sentencepiece_vocab(["text"], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
"""

View File

@ -46,11 +46,11 @@ class MindDataset(MappableDataset, TextBaseDataset):
num_parallel_workers (int, optional): The number of readers (default=None).
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
(default=None, performs global shuffle).
If shuffle is False, no shuffling will be performed;
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
Otherwise, there are three levels of shuffling:
If shuffle is False, no shuffling will be performed.
If shuffle is True, performs global shuffle.
There are three levels of shuffling, and the desired shuffle enum is defined by mindspore.dataset.Shuffle.
- Shuffle.GLOBAL: Global shuffle of all rows of data in dataset.
- Shuffle.GLOBAL: Global shuffle of all rows of data in dataset, same as setting shuffle to True.
- Shuffle.FILES: Shuffle the file sequence but keep the order of data within each file.
@ -74,8 +74,8 @@ class MindDataset(MappableDataset, TextBaseDataset):
(default=None, which means no cache is used).
Raises:
RuntimeError: If dataset_files are not valid or do not exist.
RuntimeError: If num_parallel_workers exceeds the max thread numbers.
ValueError: If dataset_files are not valid or do not exist.
ValueError: If num_parallel_workers exceeds the max thread numbers.
RuntimeError: If num_shards is specified but shard_id is None.
RuntimeError: If shard_id is specified but num_shards is None.
ValueError: If shard_id is invalid (< 0 or >= num_shards).
@ -180,11 +180,11 @@ class TFRecordDataset(SourceDataset, TextBaseDataset):
(default=None, number set in the config).
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
(default=Shuffle.GLOBAL).
If shuffle is False, no shuffling will be performed;
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
Otherwise, there are two levels of shuffling:
If shuffle is False, no shuffling will be performed.
If shuffle is True, performs global shuffle.
There are three levels of shuffling, and the desired shuffle enum is defined by mindspore.dataset.Shuffle.
- Shuffle.GLOBAL: Shuffle both the files and samples.
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
- Shuffle.FILES: Shuffle files only.
@ -201,8 +201,8 @@ class TFRecordDataset(SourceDataset, TextBaseDataset):
(default=None, which means no cache is used).
Raises:
RuntimeError: If dataset_files are not valid or do not exist.
RuntimeError: If num_parallel_workers exceeds the max thread numbers.
ValueError: If dataset_files are not valid or do not exist.
ValueError: If num_parallel_workers exceeds the max thread numbers.
RuntimeError: If num_shards is specified but shard_id is None.
RuntimeError: If shard_id is specified but num_shards is None.
ValueError: If shard_id is invalid (< 0 or >= num_shards).

View File

@ -47,11 +47,11 @@ class AGNewsDataset(SourceDataset, TextBaseDataset):
(default=None, number set in the config).
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
(default=Shuffle.GLOBAL).
If shuffle is False, no shuffling will be performed;
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
Otherwise, there are two levels of shuffling:
If shuffle is False, no shuffling will be performed.
If shuffle is True, performs global shuffle.
There are three levels of shuffling, and the desired shuffle enum is defined by mindspore.dataset.Shuffle.
- Shuffle.GLOBAL: Shuffle both the files and samples.
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
- Shuffle.FILES: Shuffle files only.
@ -139,11 +139,11 @@ class AmazonReviewDataset(SourceDataset):
num_samples (int, optional): Number of samples (rows) to be read (default=None, reads the full dataset).
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
(default=Shuffle.GLOBAL).
If shuffle is False, no shuffling will be performed;
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
Otherwise, there are two levels of shuffling:
If shuffle is False, no shuffling will be performed.
If shuffle is True, performs global shuffle.
There are three levels of shuffling, and the desired shuffle enum is defined by mindspore.dataset.Shuffle.
- Shuffle.GLOBAL: Shuffle both the files and samples.
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
- Shuffle.FILES: Shuffle files only.
num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
@ -217,61 +217,6 @@ class CLUEDataset(SourceDataset, TextBaseDataset):
A source dataset that reads and parses CLUE datasets.
Supported CLUE classification tasks: `AFQMC`, `TNEWS`, `IFLYTEK`, `CMNLI`, `WSC` and `CSL`.
The generated dataset with different task setting has different output columns:
- task = :py:obj:`AFQMC`
- usage = :py:obj:`train`, output columns: :py:obj:`[sentence1, dtype=string]`, \
:py:obj:`[sentence2, dtype=string]`, :py:obj:`[label, dtype=string]`.
- usage = :py:obj:`test`, output columns: :py:obj:`[id, dtype=uint8]`, \
:py:obj:`[sentence1, dtype=string]`, :py:obj:`[sentence2, dtype=string]`.
- usage = :py:obj:`eval`, output columns: :py:obj:`[sentence1, dtype=string]`, \
:py:obj:`[sentence2, dtype=string]`, :py:obj:`[label, dtype=string]`.
- task = :py:obj:`TNEWS`
- usage = :py:obj:`train`, output columns: :py:obj:`[label, dtype=string]`, \
:py:obj:`[label_des, dtype=string]`, :py:obj:`[sentence, dtype=string]`, :py:obj:`[keywords, dtype=string]`.
- usage = :py:obj:`test`, output columns: :py:obj:`[label, dtype=string]`, \
:py:obj:`[label_des, dtype=string]`, :py:obj:`[sentence, dtype=string]`, :py:obj:`[keywords, dtype=string]`.
- usage = :py:obj:`eval`, output columns: :py:obj:`[label, dtype=string]`, \
:py:obj:`[label_des, dtype=string]`, :py:obj:`[sentence, dtype=string]`, :py:obj:`[keywords, dtype=string]`.
- task = :py:obj:`IFLYTEK`
- usage = :py:obj:`train`, output columns: :py:obj:`[label, dtype=string]`, \
:py:obj:`[label_des, dtype=string]`, :py:obj:`[sentence, dtype=string]`.
- usage = :py:obj:`test`, output columns: :py:obj:`[id, dtype=string]`, \
:py:obj:`[sentence, dtype=string]`.
- usage = :py:obj:`eval`, output columns: :py:obj:`[label, dtype=string]`, \
:py:obj:`[label_des, dtype=string]`, :py:obj:`[sentence, dtype=string]`.
- task = :py:obj:`CMNLI`
- usage = :py:obj:`train`, output columns: :py:obj:`[sentence1, dtype=string]`, \
:py:obj:`[sentence2, dtype=string]`, :py:obj:`[label, dtype=string]`.
- usage = :py:obj:`test`, output columns: :py:obj:`[id, dtype=uint8]`, \
:py:obj:`[sentence1, dtype=string]`, :py:obj:`[sentence2, dtype=string]`.
- usage = :py:obj:`eval`, output columns: :py:obj:`[sentence1, dtype=string]`, \
:py:obj:`[sentence2, dtype=string]`, :py:obj:`[label, dtype=string]`.
- task = :py:obj:`WSC`
- usage = :py:obj:`train`, output columns: :py:obj:`[span1_index, dtype=uint8]`, \
:py:obj:`[span2_index, dtype=uint8]`, :py:obj:`[span1_text, dtype=string]`, \
:py:obj:`[span2_text, dtype=string]`, :py:obj:`[idx, dtype=uint8]`, \
:py:obj:`[text, dtype=string]`, :py:obj:`[label, dtype=string]`.
- usage = :py:obj:`test`, output columns: :py:obj:`[span1_index, dtype=uint8]`, \
:py:obj:`[span2_index, dtype=uint8]`, :py:obj:`[span1_text, dtype=string]`, \
:py:obj:`[span2_text, dtype=string]`, :py:obj:`[idx, dtype=uint8]`, :py:obj:`[text, dtype=string]`.
- usage = :py:obj:`eval`, output columns: :py:obj:`[span1_index, dtype=uint8]`, \
:py:obj:`[span2_index, dtype=uint8]`, :py:obj:`[span1_text, dtype=string]`, \
:py:obj:`[span2_text, dtype=string]`, :py:obj:`[idx, dtype=uint8]`, \
:py:obj:`[text, dtype=string]`, :py:obj:`[label, dtype=string]`.
- task = :py:obj:`CSL`
- usage = :py:obj:`train`, output columns: :py:obj:`[id, dtype=uint8]`, \
:py:obj:`[abst, dtype=string]`, :py:obj:`[keyword, dtype=string]`, :py:obj:`[label, dtype=string]`.
- usage = :py:obj:`test`, output columns: :py:obj:`[id, dtype=uint8]`, \
:py:obj:`[abst, dtype=string]`, :py:obj:`[keyword, dtype=string]`.
- usage = :py:obj:`eval`, output columns: :py:obj:`[id, dtype=uint8]`, \
:py:obj:`[abst, dtype=string]`, :py:obj:`[keyword, dtype=string]`, :py:obj:`[label, dtype=string]`.
Args:
dataset_files (Union[str, list[str]]): String or list of files to be read or glob strings to search for
a pattern of files. The list will be sorted in a lexicographical order.
@ -284,11 +229,11 @@ class CLUEDataset(SourceDataset, TextBaseDataset):
(default=None, number set in the config).
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
(default=Shuffle.GLOBAL).
If shuffle is False, no shuffling will be performed;
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
Otherwise, there are two levels of shuffling:
If shuffle is False, no shuffling will be performed.
If shuffle is True, performs global shuffle.
There are three levels of shuffling, and the desired shuffle enum is defined by mindspore.dataset.Shuffle.
- Shuffle.GLOBAL: Shuffle both the files and samples.
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
- Shuffle.FILES: Shuffle files only.
@ -299,11 +244,72 @@ class CLUEDataset(SourceDataset, TextBaseDataset):
cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
(default=None, which means no cache is used).
Note:
The generated dataset with different task setting has different output columns:
- task = :py:obj:`AFQMC`
- usage = :py:obj:`train`, output columns: :py:obj:`[sentence1, dtype=string]`, \
:py:obj:`[sentence2, dtype=string]`, :py:obj:`[label, dtype=string]`.
- usage = :py:obj:`test`, output columns: :py:obj:`[id, dtype=uint32]`, \
:py:obj:`[sentence1, dtype=string]`, :py:obj:`[sentence2, dtype=string]`.
- usage = :py:obj:`eval`, output columns: :py:obj:`[sentence1, dtype=string]`, \
:py:obj:`[sentence2, dtype=string]`, :py:obj:`[label, dtype=string]`.
- task = :py:obj:`TNEWS`
- usage = :py:obj:`train`, output columns: :py:obj:`[label, dtype=string]`, \
:py:obj:`[label_des, dtype=string]`, :py:obj:`[sentence, dtype=string]`, \
:py:obj:`[keywords, dtype=string]`.
- usage = :py:obj:`test`, output columns: :py:obj:`[id, dtype=uint32]`, \
:py:obj:`[keywords, dtype=string]`, :py:obj:`[sentence, dtype=string]`.
- usage = :py:obj:`eval`, output columns: :py:obj:`[label, dtype=string]`, \
:py:obj:`[label_desc, dtype=string]`, :py:obj:`[sentence, dtype=string]`,\
:py:obj:`[keywords, dtype=string]`.
- task = :py:obj:`IFLYTEK`
- usage = :py:obj:`train`, output columns: :py:obj:`[label, dtype=string]`, \
:py:obj:`[label_des, dtype=string]`, :py:obj:`[sentence, dtype=string]`.
- usage = :py:obj:`test`, output columns: :py:obj:`[id, dtype=uint32]`, \
:py:obj:`[sentence, dtype=string]`.
- usage = :py:obj:`eval`, output columns: :py:obj:`[label, dtype=string]`, \
:py:obj:`[label_des, dtype=string]`, :py:obj:`[sentence, dtype=string]`.
- task = :py:obj:`CMNLI`
- usage = :py:obj:`train`, output columns: :py:obj:`[sentence1, dtype=string]`, \
:py:obj:`[sentence2, dtype=string]`, :py:obj:`[label, dtype=string]`.
- usage = :py:obj:`test`, output columns: :py:obj:`[id, dtype=uint32]`, \
:py:obj:`[sentence1, dtype=string]`, :py:obj:`[sentence2, dtype=string]`.
- usage = :py:obj:`eval`, output columns: :py:obj:`[sentence1, dtype=string]`, \
:py:obj:`[sentence2, dtype=string]`, :py:obj:`[label, dtype=string]`.
- task = :py:obj:`WSC`
- usage = :py:obj:`train`, output columns: :py:obj:`[span1_index, dtype=uint32]`, \
:py:obj:`[span2_index, dtype=uint32]`, :py:obj:`[span1_text, dtype=string]`, \
:py:obj:`[span2_text, dtype=string]`, :py:obj:`[idx, dtype=uint32]`, \
:py:obj:`[text, dtype=string]`, :py:obj:`[label, dtype=string]`.
- usage = :py:obj:`test`, output columns: :py:obj:`[span1_index, dtype=uint32]`, \
:py:obj:`[span2_index, dtype=uint32]`, :py:obj:`[span1_text, dtype=string]`, \
:py:obj:`[span2_text, dtype=string]`, :py:obj:`[idx, dtype=uint32]`, :py:obj:`[text, dtype=string]`.
- usage = :py:obj:`eval`, output columns: :py:obj:`[span1_index, dtype=uint32]`, \
:py:obj:`[span2_index, dtype=uint32]`, :py:obj:`[span1_text, dtype=string]`, \
:py:obj:`[span2_text, dtype=string]`, :py:obj:`[idx, dtype=uint32]`, \
:py:obj:`[text, dtype=string]`, :py:obj:`[label, dtype=string]`.
- task = :py:obj:`CSL`
- usage = :py:obj:`train`, output columns: :py:obj:`[id, dtype=uint32]`, \
:py:obj:`[abst, dtype=string]`, :py:obj:`[keyword, dtype=string]`, :py:obj:`[label, dtype=string]`.
- usage = :py:obj:`test`, output columns: :py:obj:`[id, dtype=uint32]`, \
:py:obj:`[abst, dtype=string]`, :py:obj:`[keyword, dtype=string]`.
- usage = :py:obj:`eval`, output columns: :py:obj:`[id, dtype=uint32]`, \
:py:obj:`[abst, dtype=string]`, :py:obj:`[keyword, dtype=string]`, :py:obj:`[label, dtype=string]`.
Raises:
RuntimeError: If dataset_files are not valid or do not exist.
RuntimeError: If num_parallel_workers exceeds the max thread numbers.
ValueError: If dataset_files are not valid or do not exist.
ValueError: If task is not in 'AFQMC', 'TNEWS', 'IFLYTEK', 'CMNLI', 'WSC' or 'CSL'.
ValueError: If usage is not in 'train', 'test' or 'eval'.
ValueError: If num_parallel_workers exceeds the max thread numbers.
RuntimeError: If num_shards is specified but shard_id is None.
RuntimeError: If shard_id is specified but num_shards is None.
ValueError: If shard_id is invalid (< 0 or >= num_shards).
Examples:
>>> clue_dataset_dir = ["/path/to/clue_dataset_file"] # contains 1 or multiple clue files
@ -373,11 +379,11 @@ class CoNLL2000Dataset(SourceDataset):
num_samples (int, optional): Number of samples (rows) to read (default=None, reads the full dataset).
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
(default=Shuffle.GLOBAL).
If shuffle is False, no shuffling will be performed;
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
Otherwise, there are two levels of shuffling:
If shuffle is False, no shuffling will be performed.
If shuffle is True, performs global shuffle.
There are three levels of shuffling, and the desired shuffle enum is defined by mindspore.dataset.Shuffle.
- Shuffle.GLOBAL: Shuffle both the files and samples.
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
- Shuffle.FILES: Shuffle files only.
@ -416,7 +422,8 @@ class CoNLL2000Dataset(SourceDataset):
class CSVDataset(SourceDataset, TextBaseDataset):
"""
A source dataset that reads and parses comma-separated values (CSV) datasets.
A source dataset that reads and parses comma-separated values
`(CSV) <https://en.wikipedia.org/wiki/Comma-separated_values>`_ files as a dataset.
The columns of generated dataset depend on the source CSV files.
Args:
@ -434,11 +441,11 @@ class CSVDataset(SourceDataset, TextBaseDataset):
(default=None, number set in the config).
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
(default=Shuffle.GLOBAL).
If shuffle is False, no shuffling will be performed;
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
Otherwise, there are two levels of shuffling:
If shuffle is False, no shuffling will be performed.
If shuffle is True, performs global shuffle.
There are three levels of shuffling, and the desired shuffle enum is defined by mindspore.dataset.Shuffle.
- Shuffle.GLOBAL: Shuffle both the files and samples.
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
- Shuffle.FILES: Shuffle files only.
@ -451,9 +458,11 @@ class CSVDataset(SourceDataset, TextBaseDataset):
Raises:
RuntimeError: If dataset_files are not valid or do not exist.
RuntimeError: If num_parallel_workers exceeds the max thread numbers.
ValueError: If field_delim is invalid.
ValueError: If num_parallel_workers exceeds the max thread numbers.
RuntimeError: If num_shards is specified but shard_id is None.
RuntimeError: If shard_id is specified but num_shards is None.
ValueError: If shard_id is invalid (< 0 or >= num_shards).
Examples:
>>> csv_dataset_dir = ["/path/to/csv_dataset_file"] # contains 1 or multiple csv files
@ -497,11 +506,11 @@ class DBpediaDataset(SourceDataset, TextBaseDataset):
(default=None, number set in the config).
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
(default=Shuffle.GLOBAL).
If shuffle is False, no shuffling will be performed;
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL;
Otherwise, there are two levels of shuffling:
If shuffle is False, no shuffling will be performed.
If shuffle is True, performs global shuffle.
There are three levels of shuffling, and the desired shuffle enum is defined by mindspore.dataset.Shuffle.
- Shuffle.GLOBAL: Shuffle both the files and samples.
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
- Shuffle.FILES: Shuffle files only.
@ -586,11 +595,11 @@ class EnWik9Dataset(SourceDataset):
(default=None, number set in the config).
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
(default=True).
If shuffle is False, no shuffling will be performed;
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
Otherwise, there are two levels of shuffling:
If shuffle is False, no shuffling will be performed.
If shuffle is True, performs global shuffle.
There are three levels of shuffling, and the desired shuffle enum is defined by mindspore.dataset.Shuffle.
- Shuffle.GLOBAL: Shuffle both the files and samples.
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
- Shuffle.FILES: Shuffle files only.
@ -812,11 +821,11 @@ class IWSLT2016Dataset(SourceDataset, TextBaseDataset):
num_samples (int, optional): Number of samples (rows) to read (default=None, reads the full dataset).
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
(default=Shuffle.GLOBAL).
If shuffle is False, no shuffling will be performed;
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
Otherwise, there are two levels of shuffling:
If shuffle is False, no shuffling will be performed.
If shuffle is True, performs global shuffle.
There are three levels of shuffling, and the desired shuffle enum is defined by mindspore.dataset.Shuffle.
- Shuffle.GLOBAL: Shuffle both the files and samples.
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
- Shuffle.FILES: Shuffle files only.
num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
@ -933,11 +942,11 @@ class IWSLT2017Dataset(SourceDataset, TextBaseDataset):
num_samples (int, optional): Number of samples (rows) to read (default=None, reads the full dataset).
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
(default=Shuffle.GLOBAL).
If shuffle is False, no shuffling will be performed;
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
Otherwise, there are two levels of shuffling:
If shuffle is False, no shuffling will be performed.
If shuffle is True, performs global shuffle.
There are three levels of shuffling, and the desired shuffle enum is defined by mindspore.dataset.Shuffle.
- Shuffle.GLOBAL: Shuffle both the files and samples.
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
- Shuffle.FILES: Shuffle files only.
num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
@ -1030,11 +1039,11 @@ class PennTreebankDataset(SourceDataset, TextBaseDataset):
(default=None, number set in the config).
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
(default=Shuffle.GLOBAL).
If shuffle is False, no shuffling will be performed;
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
Otherwise, there are two levels of shuffling:
If shuffle is False, no shuffling will be performed.
If shuffle is True, performs global shuffle.
There are three levels of shuffling, and the desired shuffle enum is defined by mindspore.dataset.Shuffle.
- Shuffle.GLOBAL: Shuffle both the files and samples.
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
- Shuffle.FILES: Shuffle files only.
@ -1117,11 +1126,11 @@ class SogouNewsDataset(SourceDataset):
num_samples (int, optional): Number of samples (rows) to read (default=None, read all samples).
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
(default=Shuffle.GLOBAL).
If shuffle is False, no shuffling will be performed;
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
Otherwise, there are two levels of shuffling:
If shuffle is False, no shuffling will be performed.
If shuffle is True, performs global shuffle.
There are three levels of shuffling, and the desired shuffle enum is defined by mindspore.dataset.Shuffle.
- Shuffle.GLOBAL: Shuffle both the files and samples.
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
- Shuffle.FILES: Shuffle files only.
num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
@ -1201,11 +1210,11 @@ class TextFileDataset(SourceDataset, TextBaseDataset):
(default=None, number set in the config).
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
(default=Shuffle.GLOBAL).
If shuffle is False, no shuffling will be performed;
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
Otherwise, there are two levels of shuffling:
If shuffle is False, no shuffling will be performed.
If shuffle is True, performs global shuffle.
There are three levels of shuffling, and the desired shuffle enum is defined by mindspore.dataset.Shuffle.
- Shuffle.GLOBAL: Shuffle both the files and samples.
- Shuffle.GLOBAL: Shuffle both the files and samples, same as setting shuffle to True.
- Shuffle.FILES: Shuffle files only.
@ -1217,10 +1226,11 @@ class TextFileDataset(SourceDataset, TextBaseDataset):
(default=None, which means no cache is used).
Raises:
RuntimeError: If dataset_files are not valid or do not exist.
RuntimeError: If num_parallel_workers exceeds the max thread numbers.
ValueError: If dataset_files are not valid or do not exist.
ValueError: If num_parallel_workers exceeds the max thread numbers.
RuntimeError: If num_shards is specified but shard_id is None.
RuntimeError: If shard_id is specified but num_shards is None.
ValueError: If shard_id is invalid (< 0 or >= num_shards).
Examples:
>>> text_file_dataset_dir = ["/path/to/text_file_dataset_file"] # contains 1 or multiple text files

View File

@ -467,11 +467,11 @@ class GeneratorDataset(MappableDataset, TextBaseDataset):
Raises:
RuntimeError: If source raises an exception during execution.
RuntimeError: If len of column_names does not match output len of source.
RuntimeError: If num_parallel_workers exceeds the max thread numbers.
RuntimeError: If sampler and shuffle are specified at the same time.
RuntimeError: If sampler and sharding are specified at the same time.
RuntimeError: If num_shards is specified but shard_id is None.
RuntimeError: If shard_id is specified but num_shards is None.
ValueError: If num_parallel_workers exceeds the max thread numbers.
ValueError: If sampler and shuffle are specified at the same time.
ValueError: If sampler and sharding are specified at the same time.
ValueError: If num_shards is specified but shard_id is None.
ValueError: If shard_id is specified but num_shards is None.
ValueError: If shard_id is invalid (< 0 or >= num_shards).
Note:

View File

@ -72,6 +72,10 @@ DE_C_INTER_OUTPUT_FORMAT = {
class GraphData:
"""
Reads the graph dataset used for GNN training from the shared file and database.
Support reading graph datasets like Cora, Citeseer and PubMed.
About how to load raw graph dataset into MindSpore please
refer to `Loading Graph Dataset <https://mindspore.cn/docs/programming_guide/zh-CN/master/load_dataset_gnn.html>`_.
Args:
dataset_file (str): One of file names in the dataset.
@ -98,6 +102,17 @@ class GraphData:
when the number of connected clients reaches num_client and no client is being connected,
the server automatically exits (default=True).
Raises:
ValueError: If `dataset_file` does not exist or permission denied.
TypeError: If `num_parallel_workers` exceeds the max thread numbers.
ValueError: If `working_mode` is not 'local', 'client' or 'server'.
TypeError: If `hostname` is invalid.
ValueError: If `port` is not in range [1024, 65535].
ValueError: If `num_client` is not in range [1, 255].
Supported Platforms:
``CPU``
Examples:
>>> graph_dataset_dir = "/path/to/graph_dataset_file"
>>> graph_dataset = ds.GraphData(dataset_file=graph_dataset_dir, num_parallel_workers=2)

View File

@ -81,10 +81,14 @@ class Iterator:
self._transform_tensor = lambda t: t.as_array()
if not output_numpy:
if do_copy:
self._transform_tensor = lambda t: Tensor(t.as_array())
else:
self._transform_tensor = lambda t: Tensor.from_numpy(t.as_array())
def _transform(t, do_copy):
array = t.as_array()
if array.dtype.type is np.bytes_:
array = array.astype(np.str_)
if do_copy:
return Tensor(array)
return Tensor.from_numpy(array)
self._transform_tensor = lambda t: _transform(t, do_copy)
self.__index = 0
self.offload_model = None

View File

@ -109,9 +109,9 @@ def show(dataset, indentation=2):
Do not indent if indentation is None (default=2).
Examples:
>>> dataset = ds.MnistDataset(mnist_dataset_dir, 100)
>>> dataset = ds.MnistDataset(mnist_dataset_dir, num_samples=100)
>>> one_hot_encode = c_transforms.OneHot(10)
>>> dataset = dataset.map(operation=one_hot_encode, input_column_names="label")
>>> dataset = dataset.map(operations=one_hot_encode, input_columns="label")
>>> dataset = dataset.batch(batch_size=10, drop_remainder=True)
>>> ds.show(dataset)
"""

View File

@ -192,12 +192,12 @@ class Vocab:
>>> # cat,00
>>> # --- end of file ---
>>>
>>> # Read file through this API and specify "," as delimiter,
>>> # then the delimiter will break up each line in file, the first element is taken to be the word.
>>> # Read file through this API and specify "," as delimiter.
>>> # The delimiter will break up each line in file, then the first element is taken to be the word.
>>> vocab = text.Vocab.from_file("/path/to/simple/vocab/file", ",", None, ["<pad>", "<unk>"], True)
>>>
>>> # Finally, there are 5 words in the vocab: "<pad>", "<unk>", "apple", "banana", "cat"
>>> print(vocab.vocab())
>>> # Finally, there are 5 words in the vocab: "<pad>", "<unk>", "apple", "banana", "cat".
>>> vocabulary = vocab.vocab()
"""
if vocab_size is None:
vocab_size = -1

View File

@ -182,16 +182,56 @@ def test_textline_dataset_repeat():
assert count == 9
def test_textline_dataset_output_tensor():
    """
    Feature: Test text dataset output string and construct mindspore.Tensor.
    Description: Set output_numpy=False when creating iterators (dict, tuple with/without
        do_copy, and plain dataset iteration).
    Expectation: Each iterator yields the 3 expected text rows, in order.
    """
    data = ds.TextFileDataset(DATA_FILE, shuffle=False)
    expected_text = ["This is a text file.", "Be happy every day.", "Good luck to everyone."]

    def _check_rows(iterator, extract):
        # Verify every row matches the expected text in order and that exactly
        # 3 rows are produced; `extract` pulls the text column out of one row.
        count = 0
        for item in iterator:
            value = extract(item)
            logger.info("{}".format(value))
            assert expected_text[count] == str(value)
            count += 1
        assert count == 3

    # Dict iterator: rows are dicts keyed by column name.
    _check_rows(data.create_dict_iterator(num_epochs=1, output_numpy=False),
                lambda i: i["text"])
    # Tuple iterator with do_copy=True: Tensor constructed by copy.
    _check_rows(data.create_tuple_iterator(num_epochs=1, output_numpy=False, do_copy=True),
                lambda i: i[0])
    # Tuple iterator with do_copy=False: Tensor constructed via from_numpy.
    _check_rows(data.create_tuple_iterator(num_epochs=1, output_numpy=False, do_copy=False),
                lambda i: i[0])
    # Default iteration over the dataset object itself.
    _check_rows(data, lambda i: i[0])
def test_textline_dataset_get_datasetsize():
    """
    Feature: TextFileDataset.
    Description: Test get_dataset_size on a text file dataset.
    Expectation: Dataset size equals 3.
    """
    data = ds.TextFileDataset(DATA_FILE)
    size = data.get_dataset_size()
    assert size == 3
def test_textline_dataset_to_device():
    """
    Feature: TextFileDataset.
    Description: Transfer the dataset to device via to_device() and start sending with send().
    Expectation: Runs without raising an exception.
    """
    data = ds.TextFileDataset(DATA_FILE, shuffle=False)
    data = data.to_device()
    data.send()
def test_textline_dataset_exceptions():
with pytest.raises(ValueError) as error_info:
_ = ds.TextFileDataset(DATA_FILE, num_samples=-1)