update dataset chinese comment

2021-11-20 11:18:45 +08:00 · 2021-11-20 11:18:45 +08:00 · bfa57ebd34
parent ddc5399ca7
commit bfa57ebd34
10 changed files with 1049 additions and 1 deletions
--- a/docs/api/api_python/dataset/mindspore.dataset.CLUEDataset.rst
+++ b/docs/api/api_python/dataset/mindspore.dataset.CLUEDataset.rst
@ -0,0 +1,90 @@
+Class mindspore.dataset.CLUEDataset(dataset_files, task='AFQMC', usage='train', num_samples=None, num_parallel_workers=None, shuffle=<Shuffle.GLOBAL: 'global'>, num_shards=None, shard_id=None, cache=None)
+
+    读取和解析CLUE数据集的源数据集文件。
+    目前支持的CLUE分类任务包括：`AFQMC`、`TNEWS`、`IFLYTEK`、`CMNLI`、`WSC`和`CSL`。
+
+    根据给定的`task`配置，数据集会生成不同的输出列：
+
+    - task = :py:obj:`AFQMC`
+        - usage = :py:obj:`train`，输出列: :py:obj:`[sentence1, dtype=string]`, :py:obj:`[sentence2, dtype=string]`, :py:obj:`[label, dtype=string]`.
+        - usage = :py:obj:`test`，输出列: :py:obj:`[id, dtype=uint8]`, :py:obj:`[sentence1, dtype=string]`, :py:obj:`[sentence2, dtype=string]`.
+        - usage = :py:obj:`eval`，输出列: :py:obj:`[sentence1, dtype=string]`, :py:obj:`[sentence2, dtype=string]`, :py:obj:`[label, dtype=string]`.
+
+    - task = :py:obj:`TNEWS`
+        - usage = :py:obj:`train`，输出列: :py:obj:`[label, dtype=string]`, :py:obj:`[label_des, dtype=string]`, :py:obj:`[sentence, dtype=string]`, :py:obj:`[keywords, dtype=string]`.
+        - usage = :py:obj:`test`，输出列: :py:obj:`[label, dtype=string]`, :py:obj:`[label_des, dtype=string]`, :py:obj:`[sentence, dtype=string]`, :py:obj:`[keywords, dtype=string]`.
+        - usage = :py:obj:`eval`，输出列: :py:obj:`[label, dtype=string]`, :py:obj:`[label_des, dtype=string]`, :py:obj:`[sentence, dtype=string]`, :py:obj:`[keywords, dtype=string]`.
+
+    - task = :py:obj:`IFLYTEK`
+        - usage = :py:obj:`train`，输出列: :py:obj:`[label, dtype=string]`, :py:obj:`[label_des, dtype=string]`, :py:obj:`[sentence, dtype=string]`.
+        - usage = :py:obj:`test`，输出列: :py:obj:`[id, dtype=string]`, :py:obj:`[sentence, dtype=string]`.
+        - usage = :py:obj:`eval`，输出列: :py:obj:`[label, dtype=string]`, :py:obj:`[label_des, dtype=string]`, :py:obj:`[sentence, dtype=string]`.
+
+    - task = :py:obj:`CMNLI`
+        - usage = :py:obj:`train`，输出列: :py:obj:`[sentence1, dtype=string]`, :py:obj:`[sentence2, dtype=string]`, :py:obj:`[label, dtype=string]`.
+        - usage = :py:obj:`test`，输出列: :py:obj:`[id, dtype=uint8]`, :py:obj:`[sentence1, dtype=string]`, :py:obj:`[sentence2, dtype=string]`.
+        - usage = :py:obj:`eval`，输出列: :py:obj:`[sentence1, dtype=string]`, :py:obj:`[sentence2, dtype=string]`, :py:obj:`[label, dtype=string]`.
+
+    - task = :py:obj:`WSC`
+        - usage = :py:obj:`train`，输出列: :py:obj:`[span1_index, dtype=uint8]`, :py:obj:`[span2_index, dtype=uint8]`, :py:obj:`[span1_text, dtype=string]`, :py:obj:`[span2_text, dtype=string]`, :py:obj:`[idx, dtype=uint8]`, :py:obj:`[text, dtype=string]`, :py:obj:`[label, dtype=string]`.
+        - usage = :py:obj:`test`，输出列: :py:obj:`[span1_index, dtype=uint8]`, :py:obj:`[span2_index, dtype=uint8]`, :py:obj:`[span1_text, dtype=string]`, :py:obj:`[span2_text, dtype=string]`, :py:obj:`[idx, dtype=uint8]`, :py:obj:`[text, dtype=string]`.
+        - usage = :py:obj:`eval`，输出列: :py:obj:`[span1_index, dtype=uint8]`, :py:obj:`[span2_index, dtype=uint8]`, :py:obj:`[span1_text, dtype=string]`, :py:obj:`[span2_text, dtype=string]`, :py:obj:`[idx, dtype=uint8]`, :py:obj:`[text, dtype=string]`, :py:obj:`[label, dtype=string]`.
+
+    - task = :py:obj:`CSL`
+        - usage = :py:obj:`train`，输出列: :py:obj:`[id, dtype=uint8]`, :py:obj:`[abst, dtype=string]`, :py:obj:`[keyword, dtype=string]`, :py:obj:`[label, dtype=string]`.
+        - usage = :py:obj:`test`，输出列: :py:obj:`[id, dtype=uint8]`, :py:obj:`[abst, dtype=string]`, :py:obj:`[keyword, dtype=string]`.
+        - usage = :py:obj:`eval`，输出列: :py:obj:`[id, dtype=uint8]`, :py:obj:`[abst, dtype=string]`, :py:obj:`[keyword, dtype=string]`, :py:obj:`[label, dtype=string]`.
+
+    **参数：**
+
+        - **dataset_files** (Union[str, list[str]])：数据集文件路径，支持单文件路径字符串、多文件路径字符串列表或可被glob库模式匹配的字符串，文件列表将在内部进行字典排序。
+        - **task** (str, 可选)：任务类型，可取值为`AFQMC`、`TNEWS`、`IFLYTEK`、`CMNLI`、`WSC`或`CSL`（默认为`AFQMC`）。
+        - **usage** (str, 可选)：指定数据集的子集，可取值为`train`、`test`或`eval`（默认为`train`）。
+        - **num_samples** (int, 可选)：指定从数据集中读取的样本数（默认为None，即读取所有样本）。
+        - **num_parallel_workers** (int, 可选)：指定读取数据的工作线程数（默认值None，即使用mindspore.dataset.config中配置的线程数）。
+        - **shuffle** (Union[bool, Shuffle level], 可选)：每个epoch中数据混洗的模式（默认为mindspore.dataset.Shuffle.GLOBAL）。
+          如果为False，则不混洗；如果为True，等同于将`shuffle`设置为mindspore.dataset.Shuffle.GLOBAL。另外也可以传入枚举变量设置shuffle级别：
+          - Shuffle.GLOBAL：混洗文件和样本。
+          - Shuffle.FILES：仅混洗文件。
+        - **num_shards** (int, 可选)：指定分布式训练时将数据集进行划分的分片数（默认值None）。指定此参数后, `num_samples` 表示每个分片的最大样本数。
+        - **shard_id** (int, 可选)：指定分布式训练时使用的分片ID号（默认值None）。只有当指定了 `num_shards` 时才能指定此参数。
+        - **cache** (DatasetCache, 可选)：数据缓存客户端实例，用于加快数据集处理速度（默认为None，不使用缓存）。
+
+    **异常：**
+        - **RuntimeError**：`dataset_files` 所指的文件无效或不存在。
+        - **RuntimeError**：`num_parallel_workers` 超过系统最大线程数。
+        - **RuntimeError**：指定了`num_shards`参数，但是未指定`shard_id`参数。
+        - **RuntimeError**：指定了`shard_id`参数，但是未指定`num_shards`参数。
+
+    **示例：**
+        >>> clue_dataset_dir = ["/path/to/clue_dataset_file"] # 包含一个或多个CLUE数据集文件
+        >>> dataset = ds.CLUEDataset(dataset_files=clue_dataset_dir, task='AFQMC', usage='train')
+
+    **关于CLUE数据集：**
+
+    CLUE，又名中文语言理解测评基准，包含许多有代表性的数据集，涵盖单句分类、句对分类和机器阅读理解等任务。
+
+    您可以将数据集解压成如下的文件结构，并通过MindSpore的API进行读取，以`afqmc`数据集为例：
+
+    .. code-block::
+
+        .
+        └── afqmc_public
+             ├── train.json
+             ├── test.json
+             └── dev.json
+
+    **引用：**
+
+    .. code-block::
+
+        @article{CLUEbenchmark,
+        title   = {CLUE: A Chinese Language Understanding Evaluation Benchmark},
+        author  = {Liang Xu, Xuanwei Zhang, Lu Li, Hai Hu, Chenjie Cao, Weitang Liu, Junyi Li, Yudong Li,
+                Kai Sun, Yechen Xu, Yiming Cui, Cong Yu, Qianqian Dong, Yin Tian, Dian Yu, Bo Shi, Jun Zeng,
+                Rongzhao Wang, Weijian Xie, Yanting Li, Yina Patterson, Zuoyu Tian, Yiwen Zhang, He Zhou,
+                Shaoweihua Liu, Qipeng Zhao, Cong Yue, Xinrui Zhang, Zhengliang Yang, Zhenzhong Lan},
+        journal = {arXiv preprint arXiv:2004.05986},
+        year    = {2020},
+        howpublished = {https://github.com/CLUEbenchmark/CLUE}
+        }
--- a/docs/api/api_python/dataset/mindspore.dataset.GraphData.rst
+++ b/docs/api/api_python/dataset/mindspore.dataset.GraphData.rst
@ -0,0 +1,321 @@
+Class mindspore.dataset.GraphData(dataset_file, num_parallel_workers=None, working_mode='local', hostname='127.0.0.1', port=50051, num_client=1, auto_shutdown=True)
+
+    从共享文件和数据库中读取用于GNN训练的图数据集。
+
+    **参数：**
+
+        - **dataset_file** (str)：数据集文件路径。
+        - **num_parallel_workers** (int, 可选)：读取数据的工作线程数（默认为None）。
+        - **working_mode** (str, 可选)：设置工作模式，目前支持'local'/'client'/'server'（默认为'local'）。
+            - 'local'，用于非分布式训练场景。
+            - 'client'，用于分布式训练场景。客户端不加载数据，而是从服务器获取数据。
+            - 'server'，用于分布式训练场景。服务器加载数据并可供客户端使用。
+        - **hostname** (str, 可选)：图数据集服务器的主机名。该参数仅在工作模式设置为'client'或'server'时有效（默认为'127.0.0.1'）。
+        - **port** (int, 可选)：图数据服务器的端口，取值范围为1024-65535。该参数仅在工作模式设置为'client'或'server'时有效（默认为50051）。
+        - **num_client** (int, 可选)：期望连接到服务器的最大客户端数。服务器将根据该参数分配资源。该参数仅在工作模式设置为'server'时有效（默认为1）。
+        - **auto_shutdown** (bool, 可选)：当工作模式设置为'server'时有效。当连接的客户端数量达到num_client，且没有客户端正在连接时，服务器将自动退出（默认为True）。
+
+    **示例：**
+        >>> graph_dataset_dir = "/path/to/graph_dataset_file"
+        >>> graph_dataset = ds.GraphData(dataset_file=graph_dataset_dir, num_parallel_workers=2)
+        >>> nodes = graph_dataset.get_all_nodes(node_type=1)
+        >>> features = graph_dataset.get_node_feature(node_list=nodes, feature_types=[1])
+
+
+get_all_edges(edge_type)
+
+        获取图的所有边。
+
+        **参数：**
+            - **edge_type** (int)：指定边的类型。
+
+        **返回：**
+            numpy.ndarray，包含边的数组。
+
+        **示例：**
+            >>> edges = graph_dataset.get_all_edges(edge_type=0)
+
+        **异常：**
+            - **TypeError**：参数`edge_type`的类型不为整型。
+
+
+get_all_neighbors(node_list, neighbor_type, output_format=<OutputFormat.NORMAL: 0>)
+
+        获取`node_list`所有节点的邻居，以`neighbor_type`类型返回。
+        格式的定义参见以下示例：1表示两个节点之间连接，0表示不连接。
+
+        .. list-table:: 邻接矩阵
+           :widths: 20 20 20 20 20
+           :header-rows: 1
+
+           * -
+             - 0
+             - 1
+             - 2
+             - 3
+           * - 0
+             - 0
+             - 1
+             - 0
+             - 0
+           * - 1
+             - 0
+             - 0
+             - 1
+             - 0
+           * - 2
+             - 1
+             - 0
+             - 0
+             - 1
+           * - 3
+             - 1
+             - 0
+             - 0
+             - 0
+
+        .. list-table:: 普通格式
+           :widths: 20 20 20 20 20
+           :header-rows: 1
+
+           * - src
+             - 0
+             - 1
+             - 2
+             - 3
+           * - dst_0
+             - 1
+             - 2
+             - 0
+             - 1
+           * - dst_1
+             - -1
+             - -1
+             - 3
+             - -1
+
+        .. list-table:: COO格式
+           :widths: 20 20 20 20 20 20
+           :header-rows: 1
+
+           * - src
+             - 0
+             - 1
+             - 2
+             - 2
+             - 3
+           * - dst
+             - 1
+             - 2
+             - 0
+             - 3
+             - 1
+
+        .. list-table:: CSR格式
+           :widths: 40 20 20 20 20 20
+           :header-rows: 1
+
+           * - offsetTable
+             - 0
+             - 1
+             - 2
+             - 4
+             -
+           * - dstTable
+             - 1
+             - 2
+             - 0
+             - 3
+             - 1
+
+        **参数：**
+            - **node_list** (Union[list, numpy.ndarray])：给定的节点列表。
+            - **neighbor_type** (int)：指定邻居节点的类型。
+            - **output_format** (OutputFormat, 可选)：输出存储格式（默认为mindspore.dataset.engine.OutputFormat.NORMAL）。取值范围：[OutputFormat.NORMAL, OutputFormat.COO, OutputFormat.CSR]。
+
+        **返回：**
+            对于普通格式或COO格式，
+            将返回numpy.ndarray类型的数组表示邻居节点。
+            如果指定了CSR格式，将返回两个numpy.ndarray数组，第一个表示偏移表，第二个表示邻居节点。
+
+        **示例：**
+            >>> from mindspore.dataset.engine import OutputFormat
+            >>> nodes = graph_dataset.get_all_nodes(node_type=1)
+            >>> neighbors = graph_dataset.get_all_neighbors(node_list=nodes, neighbor_type=2)
+            >>> neighbors_coo = graph_dataset.get_all_neighbors(node_list=nodes, neighbor_type=2,
+            ...                                                 output_format=OutputFormat.COO)
+            >>> offset_table, neighbors_csr = graph_dataset.get_all_neighbors(node_list=nodes, neighbor_type=2,
+            ...                                                               output_format=OutputFormat.CSR)
+
+        **异常：**
+            - **TypeError**：参数`node_list`的类型不为列表或numpy.ndarray。
+            - **TypeError**：参数`neighbor_type`的类型不为整型。
+
+
+get_all_nodes(node_type)
+
+        获取图中的所有节点。
+
+        **参数：**
+            - **node_type** (int)：指定节点的类型。
+
+        **返回：**
+            numpy.ndarray，包含节点的数组。
+
+        **示例：**
+            >>> nodes = graph_dataset.get_all_nodes(node_type=1)
+
+        **异常：**
+            - **TypeError**：参数`node_type`的类型不为整型。
+
+
+get_edges_from_nodes(node_list)
+
+        从节点获取边。
+
+        **参数：**
+            - **node_list** (Union[list[tuple], numpy.ndarray])：含一个或多个图节点ID对的列表。
+
+        **返回：**
+            numpy.ndarray，含一个或多个边ID的数组。
+
+        **示例：**
+            >>> edges = graph_dataset.get_edges_from_nodes(node_list=[(101, 201), (103, 207)])
+
+        **异常：**
+            - **TypeError**：参数`node_list`的类型不为列表或numpy.ndarray。
+
+
+get_edge_feature(edge_list, feature_types)
+
+        获取`edge_list`列表中边的特征，以`feature_types`类型返回。
+
+        **参数：**
+            - **edge_list** (Union[list, numpy.ndarray])：包含边的列表。
+            - **feature_types** (Union[list, numpy.ndarray])：包含给定特征类型的列表。
+
+        **返回：**
+            numpy.ndarray，包含特征的数组。
+
+        **示例：**
+            >>> edges = graph_dataset.get_all_edges(edge_type=0)
+            >>> features = graph_dataset.get_edge_feature(edge_list=edges, feature_types=[1])
+
+        **异常：**
+            - **TypeError**：参数`edge_list`的类型不为列表或numpy.ndarray。
+            - **TypeError**：参数`feature_types`的类型不为列表或numpy.ndarray。
+
+
+get_neg_sampled_neighbors(node_list, neg_neighbor_num, neg_neighbor_type)
+
+        获取`node_list`列表中所有节点的负样本邻居，以`neg_neighbor_type`类型返回。
+
+        **参数：**
+            - **node_list** (Union[list, numpy.ndarray])：包含节点的列表。
+            - **neg_neighbor_num** (int)：采样的邻居数量。
+            - **neg_neighbor_type** (int)：指定负样本邻居的类型。
+
+        **返回：**
+            numpy.ndarray，包含邻居的数组。
+
+        **示例：**
+            >>> nodes = graph_dataset.get_all_nodes(node_type=1)
+            >>> neg_neighbors = graph_dataset.get_neg_sampled_neighbors(node_list=nodes, neg_neighbor_num=5,
+            ...                                                         neg_neighbor_type=2)
+
+        **异常：**
+            - **TypeError**：参数`node_list`的类型不为列表或numpy.ndarray。
+            - **TypeError**：参数`neg_neighbor_num`的类型不为整型。
+            - **TypeError**：参数`neg_neighbor_type`的类型不为整型。
+
+
+get_nodes_from_edges(edge_list)
+
+        从图中的边获取节点。
+
+        **参数：**
+            - **edge_list** (Union[list, numpy.ndarray])：包含边的列表。
+
+        **返回：**
+            numpy.ndarray，包含节点的数组。
+
+        **异常：**
+            - **TypeError**：参数`edge_list`的类型不为列表或numpy.ndarray。
+
+
+get_node_feature(node_list, feature_types)
+
+        获取`node_list`中节点的特征，以`feature_types`类型返回。
+
+        **参数：**
+            - **node_list** (Union[list, numpy.ndarray])：包含节点的列表。
+            - **feature_types** (Union[list, numpy.ndarray])：指定特征的类型。
+
+        **返回：**
+            numpy.ndarray，包含特征的数组。
+
+        **示例：**
+            >>> nodes = graph_dataset.get_all_nodes(node_type=1)
+            >>> features = graph_dataset.get_node_feature(node_list=nodes, feature_types=[2, 3])
+
+        **异常：**
+            - **TypeError**：参数`node_list`的类型不为列表或numpy.ndarray。
+            - **TypeError**：参数`feature_types`的类型不为列表或numpy.ndarray。
+
+
+get_sampled_neighbors(node_list, neighbor_nums, neighbor_types, strategy=<SamplingStrategy.RANDOM: 0>)
+
+        获取已采样邻居信息。此API支持多跳邻居采样。即将上一次采样结果作为下一跳采样的输入，最多允许6跳。
+        采样结果平铺成列表，格式为[input node, 1-hop sampling result, 2-hop sampling result ...]。
+
+        **参数：**
+            - **node_list** (Union[list, numpy.ndarray])：包含节点的列表。
+            - **neighbor_nums** (Union[list, numpy.ndarray])：每跳采样的邻居数。
+            - **neighbor_types** (Union[list, numpy.ndarray])：每跳采样的邻居类型。
+            - **strategy** (SamplingStrategy, 可选)：采样策略（默认为mindspore.dataset.engine.SamplingStrategy.RANDOM）。取值范围：[SamplingStrategy.RANDOM, SamplingStrategy.EDGE_WEIGHT]。
+                - SamplingStrategy.RANDOM，随机抽样，带放回采样。
+                - SamplingStrategy.EDGE_WEIGHT，以边的权重为概率进行采样。
+
+        **返回：**
+            numpy.ndarray，包含邻居的数组。
+
+        **示例：**
+            >>> nodes = graph_dataset.get_all_nodes(node_type=1)
+            >>> neighbors = graph_dataset.get_sampled_neighbors(node_list=nodes, neighbor_nums=[2, 2],
+            ...                                                 neighbor_types=[2, 1])
+
+        **异常：**
+            - **TypeError**：参数`node_list`的类型不为列表或numpy.ndarray。
+            - **TypeError**：参数`neighbor_nums`的类型不为列表或numpy.ndarray。
+            - **TypeError**：参数`neighbor_types`的类型不为列表或numpy.ndarray。
+
+
+graph_info()
+
+        获取图的元信息，包括节点数、节点类型、节点特征信息、边数、边类型、边特征信息。
+
+        **返回：**
+            dict，图的元信息。键为node_num、node_type、node_feature_type、edge_num、edge_type和edge_feature_type。
+
+
+random_walk(target_nodes, meta_path, step_home_param=1.0, step_away_param=1.0, default_node=-1)
+
+        在节点中进行随机游走。
+
+        **参数：**
+            - **target_nodes** (list[int])：随机游走中的起始节点列表。
+            - **meta_path** (list[int])：每个步长的节点类型。
+            - **step_home_param** (float, 可选)：node2vec算法中的返回（return）超参（默认为1.0）。
+            - **step_away_param** (float, 可选)：node2vec算法中的in和out超参（默认为1.0）。
+            - **default_node** (int, 可选)：如果找不到更多邻居，则为默认节点（默认值为-1，表示不给定节点）。
+
+        **返回：**
+            numpy.ndarray，包含节点的数组。
+
+        **示例：**
+            >>> nodes = graph_dataset.get_all_nodes(node_type=1)
+            >>> walks = graph_dataset.random_walk(target_nodes=nodes, meta_path=[2, 1, 2])
+
+        **异常：**
+            - **TypeError**：参数`target_nodes`的类型不为列表或numpy.ndarray。
+            - **TypeError**：参数`meta_path`的类型不为列表或numpy.ndarray。
--- a/docs/api/api_python/dataset/mindspore.dataset.ImageFolderDataset.rst
+++ b/docs/api/api_python/dataset/mindspore.dataset.ImageFolderDataset.rst
@ -0,0 +1,96 @@
+Class mindspore.dataset.ImageFolderDataset(dataset_dir, num_samples=None, num_parallel_workers=None, shuffle=None, sampler=None, extensions=None, class_indexing=None, decode=False, num_shards=None, shard_id=None, cache=None)
+
+    从树状结构的文件目录中读取图像作为源数据集，同一个文件夹中的所有图像都具有相同的标签。
+
+    生成的数据集有两列： :py:obj:`[image, label]`。列 :py:obj:`image` 的数据为uint8类型，列 :py:obj:`label` 的数据是uint32类型的标量。
+
+    **参数：**
+        - **dataset_dir** (str)：包含数据集文件的根目录的路径。
+        - **num_samples** (int, 可选): 指定从数据集中读取的样本数（可以小于数据集总数，默认值为None，即全部样本图片）。
+        - **num_parallel_workers** (int, 可选): 指定读取数据的工作线程数（默认值None，即使用mindspore.dataset.config中配置的线程数）。
+        - **shuffle** (bool, 可选): 是否混洗数据集（默认为None，下表中会展示不同配置的预期行为）。
+        - **sampler** (Sampler, 可选): 指定从数据集中选取样本的采样器（默认为None，下表中会展示不同配置的预期行为）。
+        - **extensions** (list[str], 可选)：指定文件扩展后缀，仅读取这些后缀的文件到数据集中（默认为None）。
+        - **class_indexing** (dict, 可选)：指定文件夹名称到类标签的映射，要求映射规则为str到int（默认为None，文件夹名称将按字母顺序排列，每类都有一个唯一的索引，从0开始）。
+        - **decode** (bool, 可选)：是否对读取的图像进行解码操作（默认为False）。
+        - **num_shards** (int, 可选): 分布式训练时，将数据集划分成指定的分片数（默认值None）。指定此参数后，`num_samples` 表示每个分片的最大样本数。
+        - **shard_id** (int, 可选): 分布式训练时，指定使用的分片ID号（默认值None）。只有当指定了 `num_shards` 时才能指定此参数。
+        - **cache** (DatasetCache, 可选): 单节点数据缓存，能够加快数据加载和处理的速度（默认值None, 即不使用缓存加速）。
+
+    **异常：**
+        - **RuntimeError**：dataset_dir不包含任何数据文件。
+        - **RuntimeError**：num_parallel_workers超过系统最大线程数。
+        - **RuntimeError**：同时指定了采样器和shuffle。
+        - **RuntimeError**：同时指定了采样器和分片。
+        - **RuntimeError**: 指定了`num_shards`参数，但是未指定`shard_id`参数。
+        - **RuntimeError**: 指定了`shard_id`参数，但是未指定`num_shards`参数。
+        - **RuntimeError**：class_indexing的类型不是字典。
+        - **ValueError**: `shard_id`参数错误（小于0或者大于等于 `num_shards`）。
+
+    **注：**
+        - 如果`decode`参数指定为False，则`image`列的shape为[image_size]，否则为[H,W,C]。
+        - 此数据集可以指定`sampler`参数，但`sampler` 和 `shuffle` 是互斥的。下表展示了几种合法的输入参数及预期的行为。
+
+    .. list-table:: 配置`sampler`和`shuffle`的不同组合得到的预期排序结果
+       :widths: 25 25 50
+       :header-rows: 1
+
+       * - 参数`sampler`
+         - 参数`shuffle`
+         - 预期数据顺序
+       * - None
+         - None
+         - 随机排列
+       * - None
+         - True
+         - 随机排列
+       * - None
+         - False
+         - 顺序排列
+       * - 参数`sampler`
+         - None
+         - 由`sampler`行为定义的顺序
+       * - 参数`sampler`
+         - True
+         - 不允许
+       * - 参数`sampler`
+         - False
+         - 不允许
+
+    **示例：**
+        >>> image_folder_dataset_dir = "/path/to/image_folder_dataset_directory"
+        >>>
+        >>> # 1）使用8个线程读取image_folder_dataset_dir中的所有图像文件。
+        >>> dataset = ds.ImageFolderDataset(dataset_dir=image_folder_dataset_dir,
+        ...                                 num_parallel_workers=8)
+        >>>
+        >>> # 2）从cat文件夹（标签为0）和dog文件夹（标签为1）中读取所有图像文件。
+        >>> dataset = ds.ImageFolderDataset(dataset_dir=image_folder_dataset_dir,
+        ...                                 class_indexing={"cat":0, "dog":1})
+        >>>
+        >>> # 3）读取image_folder_dataset_dir中所有扩展名为.JPEG和.png（区分大小写）的图像文件。
+        >>> dataset = ds.ImageFolderDataset(dataset_dir=image_folder_dataset_dir,
+        ...                                 extensions=[".JPEG", ".png"])
+
+    **关于ImageFolderDataset：**
+
+    您可以将图像数据文件构建成如下目录结构，并通过MindSpore的API进行读取。
+
+    .. code-block::
+
+        .
+        └── image_folder_dataset_directory
+             ├── class1
+             │    ├── 000000000001.jpg
+             │    ├── 000000000002.jpg
+             │    ├── ...
+             ├── class2
+             │    ├── 000000000001.jpg
+             │    ├── 000000000002.jpg
+             │    ├── ...
+             ├── class3
+             │    ├── 000000000001.jpg
+             │    ├── 000000000002.jpg
+             │    ├── ...
+             ├── classN
+             ├── ...
--- a/docs/api/api_python/dataset/mindspore.dataset.MnistDataset.rst
+++ b/docs/api/api_python/dataset/mindspore.dataset.MnistDataset.rst
@ -0,0 +1,99 @@
+mindspore.dataset.MnistDataset
+===============================
+
+.. py:class:: MnistDataset(dataset_dir, usage=None, num_samples=None, num_parallel_workers=None, shuffle=None, sampler=None, num_shards=None, shard_id=None, cache=None)
+
+    用于读取和解析MNIST数据集的源数据集文件。
+
+    生成的数据集有两列: `[image, label]`。 `image` 列的数据类型为uint8。`label` 列的数据为uint32的标量。
+
+    **参数：**
+
+        - **dataset_dir** (str): 包含数据集文件的根目录路径。
+        - **usage** (str, 可选): 指定数据集的子集，可取值为 `train`、`test` 或 `all`。使用 `train` 参数将会读取60,000个训练样本，`test` 将会读取10,000个测试样本，`all` 将会读取全部70,000个样本（默认值为None，即全部样本图片）。
+        - **num_samples** (int, 可选): 指定从数据集中读取的样本数（可以小于数据集总数，默认值为None，即全部样本图片）。
+        - **num_parallel_workers** (int, 可选): 指定读取数据的工作线程数（默认值None，即使用mindspore.dataset.config中配置的线程数）。
+        - **shuffle** (bool, 可选): 是否混洗数据集（默认为None，下表中会展示不同配置的预期行为）。
+        - **sampler** (Sampler, 可选): 指定从数据集中选取样本的采样器（默认为None，下表中会展示不同配置的预期行为）。
+        - **num_shards** (int, 可选): 分布式训练时，将数据集划分成指定的分片数（默认值None）。指定此参数后, `num_samples` 表示每个分片的最大样本数。
+        - **shard_id** (int, 可选): 分布式训练时，指定使用的分片ID号（默认值None）。只有当指定了 `num_shards` 时才能指定此参数。
+        - **cache** (DatasetCache, 可选): 单节点数据缓存，能够加快数据加载和处理的速度（默认值None，即不使用缓存加速）。
+
+    **异常：**
+
+        - **RuntimeError**: `dataset_dir` 路径下不包含数据文件。
+        - **RuntimeError**: `num_parallel_workers` 超过系统最大线程数。
+        - **RuntimeError**: 同时指定了`sampler`和`shuffle`参数。
+        - **RuntimeError**: 同时指定了`sampler`和`num_shards`参数。
+        - **RuntimeError**: 指定了`num_shards`参数，但是未指定`shard_id`参数。
+        - **RuntimeError**: 指定了`shard_id`参数，但是未指定`num_shards`参数。
+        - **ValueError**: `shard_id`参数错误（小于0或者大于等于 `num_shards`）。
+
+    **注：**
+
+        此数据集可以指定`sampler`参数，但`sampler` 和 `shuffle` 是互斥的。下表展示了几种合法的输入参数及预期的行为。
+
+    .. list-table:: 配置`sampler`和`shuffle`的不同组合得到的预期排序结果
+       :widths: 25 25 50
+       :header-rows: 1
+
+       * - 参数`sampler`
+         - 参数`shuffle`
+         - 预期数据顺序
+       * - None
+         - None
+         - 随机排列
+       * - None
+         - True
+         - 随机排列
+       * - None
+         - False
+         - 顺序排列
+       * - 参数`sampler`
+         - None
+         - 由`sampler`行为定义的顺序
+       * - 参数`sampler`
+         - True
+         - 不允许
+       * - 参数`sampler`
+         - False
+         - 不允许
+
+    **样例：**
+    
+        .. code-block::
+
+            >>> mnist_dataset_dir = "/path/to/mnist_dataset_directory"
+            >>>
+            >>> # 从MNIST数据集中随机读取3个样本
+            >>> dataset = ds.MnistDataset(dataset_dir=mnist_dataset_dir, num_samples=3)
+            >>>
+            >>> # 提示：在MNIST数据集生成的数据集对象中，每一次迭代得到的数据行都有"image"和"label"两个键
+
+    **关于MNIST数据集:**
+    
+    MNIST手写数字数据集是NIST数据集的子集，共有60,000个训练样本和10,000个测试样本。
+
+    以下为原始MNIST数据集结构，您可以将数据集解压成如下的文件结构，并通过MindSpore的API进行读取。
+
+    .. code-block::
+
+        . 
+        └── mnist_dataset_dir
+            ├── t10k-images-idx3-ubyte
+            ├── t10k-labels-idx1-ubyte
+            ├── train-images-idx3-ubyte
+            └── train-labels-idx1-ubyte
+
+    **引用：**
+
+    .. code-block::
+
+        @article{lecun2010mnist,
+        title        = {MNIST handwritten digit database},
+        author       = {LeCun, Yann and Cortes, Corinna and Burges, CJ},
+        journal      = {ATT Labs [Online]},
+        volume       = {2},
+        year         = {2010},
+        howpublished = {http://yann.lecun.com/exdb/mnist}
+        }
--- a/docs/api/api_python/dataset/mindspore.dataset.NumpySlicesDataset.rst
+++ b/docs/api/api_python/dataset/mindspore.dataset.NumpySlicesDataset.rst
@ -0,0 +1,73 @@
+Class mindspore.dataset.NumpySlicesDataset(data, column_names=None, num_samples=None, num_parallel_workers=1, shuffle=None, sampler=None, num_shards=None, shard_id=None)
+
+    由Python数据构建源数据集。
+    生成的数据集的列名和列类型取决于用户传入的Python数据。
+
+    **参数：**
+        - **data** (Union[list, tuple, dict])：输入的Python数据。支持的数据类型包括：list、tuple、dict和其他NumPy格式。
+          输入数据将沿着第一个维度切片，并生成额外的行。如果输入是单个list，则将生成一个数据列，若是嵌套多个list，则生成多个数据列。
+          不建议通过这种方式加载大量的数据，因为可能会在数据加载到内存时等待较长时间。
+        - **column_names** (list[str], 可选): 指定数据集生成的列名（默认值为None）。
+          如果未指定列名称，且当输入数据的类型是dict时，输出列名称将被命名为dict的键名，否则它们将被命名为column_0，column_1...。
+        - **num_samples** (int, 可选): 指定从数据集中读取的样本数（默认值为None，所有样本）。
+        - **num_parallel_workers** (int, 可选): 指定读取数据的工作线程数（默认值为1）。
+        - **shuffle** (bool, 可选): 是否混洗数据集。只有输入的`data`参数带有可随机访问属性（__getitem__）时，才可以指定该参数。（默认值为None，下表中会展示不同配置的预期行为）。
+        - **sampler** (Union[Sampler, Iterable], 可选): 指定从数据集中选取样本的采样器。只有输入的`data`参数带有可随机访问属性（__getitem__）时，才可以指定该参数（默认值为None，下表中会展示不同配置的预期行为）。
+        - **num_shards** (int, 可选): 分布式训练时，将数据集划分成指定的分片数（默认值None）。指定此参数后，`num_samples` 表示每个分片的最大样本数。需要输入`data`支持可随机访问才能指定该参数。
+        - **shard_id** (int, 可选): 分布式训练时，指定使用的分片ID号（默认值None）。只有当指定了 `num_shards` 时才能指定此参数。
+
+    **注：**
+        - 此数据集可以指定`sampler`参数，但`sampler` 和 `shuffle` 是互斥的。下表展示了几种合法的输入参数及预期的行为。
+
+    .. list-table:: 配置`sampler`和`shuffle`的不同组合得到的预期排序结果
+       :widths: 25 25 50
+       :header-rows: 1
+
+       * - 参数`sampler`
+         - 参数`shuffle`
+         - 预期数据顺序
+       * - None
+         - None
+         - 随机排列
+       * - None
+         - True
+         - 随机排列
+       * - None
+         - False
+         - 顺序排列
+       * - 参数`sampler`
+         - None
+         - 由`sampler`行为定义的顺序
+       * - 参数`sampler`
+         - True
+         - 不允许
+       * - 参数`sampler`
+         - False
+         - 不允许
+
+    **异常：**
+        - **RuntimeError**: column_names列表的长度与数据的输出列表长度不匹配。
+        - **RuntimeError**: num_parallel_workers超过系统最大线程数。
+        - **RuntimeError**: 同时指定了sampler和shuffle。
+        - **RuntimeError**: 同时指定了sampler和num_shards。
+        - **RuntimeError**: 指定了`num_shards`参数，但是未指定`shard_id`参数。
+        - **RuntimeError**: 指定了`shard_id`参数，但是未指定`num_shards`参数。
+        - **ValueError**: `shard_id`参数错误（小于0或者大于等于 `num_shards`）。
+
+    **示例：**
+        >>> # 1) 输入的`data`参数类型为list
+        >>> data = [1, 2, 3]
+        >>> dataset = ds.NumpySlicesDataset(data=data, column_names=["column_1"])
+        >>>
+        >>> # 2) 输入的`data`参数类型为dict，并且使用column_names的默认行为，即采用键名作为生成列名。
+        >>> data = {"a": [1, 2], "b": [3, 4]}
+        >>> dataset = ds.NumpySlicesDataset(data=data)
+        >>>
+        >>> # 3) 输入的`data`参数类型是由list（或NumPy数组）组成的tuple，tuple中的每个元素分别生成一个输出列，共三个输出列
+        >>> data = ([1, 2], [3, 4], [5, 6])
+        >>> dataset = ds.NumpySlicesDataset(data=data, column_names=["column_1", "column_2", "column_3"])
+        >>>
+        >>> # 4) 从CSV文件加载数据
+        >>> import pandas as pd
+        >>> df = pd.read_csv(filepath_or_buffer=csv_dataset_dir[0])
+        >>> dataset = ds.NumpySlicesDataset(data=dict(df), shuffle=False)
--- a/docs/api/api_python/dataset/mindspore.dataset.PaddedDataset.rst
+++ b/docs/api/api_python/dataset/mindspore.dataset.PaddedDataset.rst
@ -0,0 +1,17 @@
+Class mindspore.dataset.PaddedDataset(padded_samples)
+
+    使用用户提供的填充数据创建数据集。
+    可用于在分布式训练时给原始数据集添加样本，使数据集能平均分配给不同的分片。
+
+    **参数：**
+        - **padded_samples** (list(dict)): 用户提供的样本数据。
+
+    **异常：**
+        - **TypeError**：padded_samples的类型不为list。
+        - **TypeError**：padded_samples的元素类型不为dict。
+        - **ValueError**：padded_samples为空列表。
+
+    **示例：**
+        >>> import numpy as np
+        >>> data = [{'image': np.zeros(1, np.uint8)}, {'image': np.zeros(2, np.uint8)}]
+        >>> dataset = ds.PaddedDataset(padded_samples=data)
--- a/docs/api/api_python/dataset/mindspore.dataset.VOCDataset.rst
+++ b/docs/api/api_python/dataset/mindspore.dataset.VOCDataset.rst
@ -0,0 +1,143 @@
+Class mindspore.dataset.VOCDataset(dataset_dir, task='Segmentation', usage='train', class_indexing=None, num_samples=None, num_parallel_workers=None, shuffle=None, decode=False, sampler=None, num_shards=None, shard_id=None, cache=None, extra_metadata=False)
+
+    用于读取和解析VOC数据集的源数据集文件。
+
+    根据给定的task配置，数据集会生成不同的输出列：
+
+    - task = :py:obj:`Detection`，输出列：:py:obj:`[image, dtype=uint8]`, :py:obj:`[bbox, dtype=float32]`, :py:obj:`[label, dtype=uint32]`, :py:obj:`[difficult, dtype=uint32]`, :py:obj:`[truncate, dtype=uint32]`。
+    - task = :py:obj:`Segmentation`，输出列： :py:obj:`[image, dtype=uint8]`, :py:obj:`[target, dtype=uint8]`。
+
+    **参数：**
+        - **dataset_dir** (str): 包含数据集文件的根目录的路径。
+        - **task** (str, 可选): 指定读取VOC数据的任务类型，现在只支持`Segmentation`或`Detection`（默认值`Segmentation`）。
+        - **usage** (str, 可选): 指定数据集的子集（默认值`train`）。
+          如果`task`参数为`Segmentation`，则将在./ImageSets/Segmentation/usage + ".txt"中加载数据集图像和标注信息；
+          如果`task`参数为`Detection`，则将在./ImageSets/Main/usage + ".txt"中加载数据集图像和标注信息；
+          如果未设置任务和用法，默认将加载./ImageSets/Segmentation/train.txt中的数据集图像和标注信息。
+        - **class_indexing** (dict, 可选): 指定标签名称到类标签的映射，要求映射规则为str到int，
+          仅在`Detection`任务中有效（默认值None，文件夹名称将按字母顺序排列，每类都有一个唯一的索引，从0开始）。
+        - **num_samples** (int, 可选): 指定从数据集中读取的样本数（默认值为None，所有图像样本）。
+        - **num_parallel_workers** (int, 可选): 指定读取数据的工作线程数（默认值None，即使用mindspore.dataset.config中配置的线程数）。
+        - **shuffle** (bool, 可选): 是否混洗数据集（默认为None，下表中会展示不同配置的预期行为）。
+        - **decode** (bool, 可选): 是否对读取的图像进行解码操作（默认值为False）。
+        - **sampler** (Sampler, 可选): 指定从数据集中选取样本的采样器（默认为None，下表中会展示不同配置的预期行为）。
+        - **num_shards** (int, 可选): 分布式训练时，将数据集划分成指定的分片数（默认值None）。指定此参数后，`num_samples` 表示每个分片的最大样本数。
+        - **shard_id** (int, 可选): 分布式训练时，指定使用的分片ID号（默认值None）。只有当指定了 `num_shards` 时才能指定此参数。
+        - **cache** (DatasetCache, 可选): 数据缓存客户端实例，用于加快数据集处理速度（默认为None，不使用缓存）。
+        - **extra_metadata** (bool, 可选): 用于指定是否额外输出一列数据用于表示图像元信息。如果为True，则将额外输出一列数据，名为 :py:obj:`[_meta-filename, dtype=string]` （默认值为False）。
+
+    **异常：**
+        - **RuntimeError**: dataset_dir不包含任何数据文件。
+        - **RuntimeError**: num_parallel_workers超过系统最大线程数。
+        - **RuntimeError**: 标注的xml文件格式异常或无效。
+        - **RuntimeError**: 标注的xml文件缺失`object`属性。
+        - **RuntimeError**: 标注的xml文件缺失`bndbox`属性。
+        - **RuntimeError**: 同时指定了`sampler`和`shuffle`。
+        - **RuntimeError**: 同时指定了`sampler`和`num_shards`。
+        - **RuntimeError**: 指定了`num_shards`参数，但是未指定`shard_id`参数。
+        - **RuntimeError**: 指定了`shard_id`参数，但是未指定`num_shards`参数。
+        - **ValueError**: 指定的任务不为'Segmentation'或'Detection'。
+        - **ValueError**: 指定任务为'Segmentation'时，class_indexing不为None。
+        - **ValueError**: 与usage相关的txt文件不存在。
+        - **ValueError**: `shard_id`参数错误（小于0或者大于等于 `num_shards`）。
+
+    **注：**
+        - 当指定`extra_metadata`为True时，除非显式使用rename算子以删除元信息列名的前缀('_meta-')，
+          否则迭代的数据行中不会出现'[_meta-filename, dtype=string]'列。
+          
+        - 此数据集可以指定`sampler`参数，但`sampler` 和 `shuffle` 是互斥的。下表展示了几种合法的输入参数及预期的行为。
+
+    .. list-table:: 配置`sampler`和`shuffle`的不同组合得到的预期排序结果
+       :widths: 25 25 50
+       :header-rows: 1
+
+       * - 参数`sampler`
+         - 参数`shuffle`
+         - 预期数据顺序
+       * - None
+         - None
+         - 随机排列
+       * - None
+         - True
+         - 随机排列
+       * - None
+         - False
+         - 顺序排列
+       * - 参数`sampler`
+         - None
+         - 由`sampler`行为定义的顺序
+       * - 参数`sampler`
+         - True
+         - 不允许
+       * - 参数`sampler`
+         - False
+         - 不允许
+
+    **示例：**
+        >>> voc_dataset_dir = "/path/to/voc_dataset_directory"
+        >>>
+        >>> # 1) 读取VOC数据的Segmentation任务中的train部分进行训练
+        >>> dataset = ds.VOCDataset(dataset_dir=voc_dataset_dir, task="Segmentation", usage="train")
+        >>>
+        >>> # 2) 读取VOC数据的Detection任务中的train部分进行训练
+        >>> dataset = ds.VOCDataset(dataset_dir=voc_dataset_dir, task="Detection", usage="train")
+        >>>
+        >>> # 3) 以8个线程随机顺序读取voc_dataset_dir中的所有VOC数据集样本
+        >>> dataset = ds.VOCDataset(dataset_dir=voc_dataset_dir, task="Detection", usage="train",
+        ...                         num_parallel_workers=8)
+        >>>
+        >>> # 4) 按顺序读取voc_dataset_dir中的所有VOC数据集图片样本，且对图像进行解码
+        >>> dataset = ds.VOCDataset(dataset_dir=voc_dataset_dir, task="Detection", usage="train",
+        ...                         decode=True, shuffle=False)
+        >>>
+        >>> # 在VOC数据集中，如果task='Segmentation'，每一次迭代得到的数据行都有"image"和"target"两个键。
+        >>> # 在VOC数据集中，如果task='Detection'，每一次迭代得到的数据行都有"image"、"bbox"、"label"、"difficult"和"truncate"五个键。
+
+    **关于VOC数据集：**
+
+    PASCAL Visual Object Classes（VOC）是视觉目标识别和检测的挑战赛，它为视觉和机器学习社区提供了图像和标注的标准数据集，称为VOC数据集。
+
+    您可以解压缩原始VOC-2012数据集文件到如下目录结构，并通过MindSpore的API进行读取。
+
+    .. code-block::
+
+        .
+        └── voc2012_dataset_dir
+            ├── Annotations
+            │    ├── 2007_000027.xml
+            │    ├── 2007_000032.xml
+            │    ├── ...
+            ├── ImageSets
+            │    ├── Action
+            │    ├── Layout
+            │    ├── Main
+            │    └── Segmentation
+            ├── JPEGImages
+            │    ├── 2007_000027.jpg
+            │    ├── 2007_000032.jpg
+            │    ├── ...
+            ├── SegmentationClass
+            │    ├── 2007_000032.png
+            │    ├── 2007_000033.png
+            │    ├── ...
+            └── SegmentationObject
+                 ├── 2007_000032.png
+                 ├── 2007_000033.png
+                 ├── ...
+
+    **引用：**
+
+    .. code-block::
+
+        @article{Everingham10,
+        author       = {Everingham, M. and Van~Gool, L. and Williams, C. K. I. and Winn, J. and Zisserman, A.},
+        title        = {The Pascal Visual Object Classes (VOC) Challenge},
+        journal      = {International Journal of Computer Vision},
+        volume       = {88},
+        year         = {2010},
+        number       = {2},
+        month        = {jun},
+        pages        = {303--338},
+        biburl       = {http://host.robots.ox.ac.uk/pascal/VOC/pubs/everingham10.html#bibtex},
+        howpublished = {http://host.robots.ox.ac.uk/pascal/VOC/voc2012/index.html}
+        }
--- a/docs/api/api_python/dataset/mindspore.dataset.cifar100dataset.rst
+++ b/docs/api/api_python/dataset/mindspore.dataset.cifar100dataset.rst
@ -0,0 +1,101 @@
+mindspore.dataset.Cifar100Dataset
+=================================
+
+.. py:class:: Cifar100Dataset(dataset_dir, usage=None, num_samples=None, num_parallel_workers=None, shuffle=None, sampler=None, num_shards=None, shard_id=None, cache=None)
+
+    用于读取和解析CIFAR-100数据集的源数据文件。
+
+    生成的数据集有三列: `[image, coarse_label, fine_label]`。`image` 列的数据类型是uint8。`coarse_label` 和 `fine_label` 列的数据是uint32类型的标量。
+
+    **参数：**
+
+        - **dataset_dir** (str): 包含数据集文件的根目录路径。
+        - **usage** (str, 可选): 指定数据集的子集，可取值为 `train`，`test`或`all`。使用`train`参数将会读取50,000个训练样本，`test` 将会读取10,000个测试样本，`all` 将会读取全部60,000个样本（默认值为None，即全部样本图片）。
+        - **num_samples** (int, 可选): 指定从数据集中读取的样本数（可以小于数据集总数，默认值为None，即全部样本图片）。
+        - **num_parallel_workers** (int, 可选): 指定读取数据的工作线程数（默认值None，即使用mindspore.dataset.config中配置的线程数）。
+        - **shuffle** (bool, 可选): 是否混洗数据集（默认为None，下表中会展示不同配置的预期行为）。
+        - **sampler** (Sampler, 可选): 指定从数据集中选取样本的采样器（默认为None，下表中会展示不同配置的预期行为）。
+        - **num_shards** (int, 可选): 分布式训练时，将数据集划分成指定的分片数（默认值None）。指定此参数后, `num_samples` 表示每个分片的最大样本数。
+        - **shard_id** (int, 可选): 分布式训练时，指定使用的分片ID号（默认值None）。只有当指定了 `num_shards` 时才能指定此参数。
+        - **cache** (DatasetCache, 可选): 单节点数据缓存，能够加快数据加载和处理的速度（默认值None，即不使用缓存加速）。
+
+    **异常：**
+
+        - **RuntimeError:** `dataset_dir` 路径下不包含数据文件。
+        - **RuntimeError:** `num_parallel_workers` 超过系统最大线程数。
+        - **RuntimeError:** 同时指定了`sampler`和`shuffle`参数。
+        - **RuntimeError:** 同时指定了`sampler`和`num_shards`参数。
+        - **RuntimeError:** 指定了`num_shards`参数，但是未指定`shard_id`参数。
+        - **RuntimeError:** 指定了`shard_id`参数，但是未指定`num_shards`参数。
+        - **ValueError:**  `shard_id`参数错误（小于0或者大于等于 `num_shards`）。
+
+    **注：**
+
+        此数据集可以指定`sampler`参数，但`sampler` 和 `shuffle` 是互斥的。下表展示了几种合法的输入参数及预期的行为。
+
+    .. list-table:: 配置`sampler`和`shuffle`的不同组合得到的预期排序结果
+       :widths: 25 25 50
+       :header-rows: 1
+
+       * - 参数`sampler`
+         - 参数`shuffle`
+         - 预期数据顺序
+       * - None
+         - None
+         - 随机排列
+       * - None
+         - True
+         - 随机排列
+       * - None
+         - False
+         - 顺序排列
+       * - 参数`sampler`
+         - None
+         - 由`sampler`行为定义的顺序
+       * - 参数`sampler`
+         - True
+         - 不允许
+       * - 参数`sampler`
+         - False
+         - 不允许
+
+    **样例：**
+    
+        .. code-block::
+
+            >>> cifar100_dataset_dir = "/path/to/cifar100_dataset_directory"
+            >>>
+            >>> # 1)  按数据集文件的读取顺序，依次获取CIFAR-100数据集中的所有样本
+            >>> dataset = ds.Cifar100Dataset(dataset_dir=cifar100_dataset_dir, shuffle=False)
+            >>>
+            >>> # 2)  从CIFAR100数据集中随机抽取350个样本
+            >>> dataset = ds.Cifar100Dataset(dataset_dir=cifar100_dataset_dir, num_samples=350, shuffle=True)
+            >>>
+            >>> #  提示： 在CIFAR-100数据集生成的数据集对象中，每一次迭代得到的数据行都有"image", "fine_label" 和 "coarse_label"三个键
+
+    **关于CIFAR-100数据集:**
+
+    CIFAR-100数据集和CIFAR-10数据集非常相似，CIFAR-100有100个类别，每类包含600张图片，其中500张训练图片和100张测试图片。这100个类别又被分成20个超类。每个图片都有一个"fine"标签（所属子类）和一个"coarse"标签(所属超类)。
+    
+    以下为原始CIFAR-100数据集结构。您可以将数据集解压成如下的文件结构，并通过MindSpore的API进行读取。
+
+    .. code-block::
+
+        . 
+        └── cifar-100-binary
+            ├── train.bin
+            ├── test.bin
+            ├── fine_label_names.txt
+            └── coarse_label_names.txt
+
+    **引用：**
+
+    .. code-block::
+
+        @techreport{Krizhevsky09,
+        author       = {Alex Krizhevsky},
+        title        = {Learning multiple layers of features from tiny images},
+        institution  = {},
+        year         = {2009},
+        howpublished = {http://www.cs.toronto.edu/~kriz/cifar.html}
+        }
--- a/docs/api/api_python/dataset/mindspore.dataset.cifar10dataset.rst
+++ b/docs/api/api_python/dataset/mindspore.dataset.cifar10dataset.rst
@ -0,0 +1,108 @@
+mindspore.dataset.Cifar10Dataset
+================================
+
+.. py:class:: Cifar10Dataset(dataset_dir, usage=None, num_samples=None, num_parallel_workers=None, shuffle=None, sampler=None, num_shards=None, shard_id=None, cache=None)
+
+    用于读取和解析CIFAR-10数据集的源数据集文件。该API目前仅支持解析二进制版本的CIFAR-10文件（CIFAR-10 binary version）。
+
+    生成的数据集有两列: `[image, label]`。`image` 列的数据类型是uint8。`label` 列的数据是uint32类型的标量。
+
+    **参数：**
+
+        - **dataset_dir** (str): 包含数据集文件的根目录路径。
+        - **usage** (str, 可选): 指定数据集的子集，可取值为 `train`，`test`或`all`。使用`train`参数将会读取50,000个训练样本，`test` 将会读取10,000个测试样本，`all` 将会读取全部60,000个样本（默认值为None，即全部样本图片）。
+        - **num_samples** (int, 可选): 指定从数据集中读取的样本数（可以小于数据集总数，默认值为None，即全部样本图片）。
+        - **num_parallel_workers** (int, 可选): 指定读取数据的工作线程数（默认值None，即使用mindspore.dataset.config中配置的线程数）。
+        - **shuffle** (bool, 可选): 是否混洗数据集（默认为None，下表中会展示不同配置的预期行为）。
+        - **sampler** (Sampler, 可选): 指定从数据集中选取样本的采样器（默认为None，下表中会展示不同配置的预期行为）。
+        - **num_shards** (int, 可选): 分布式训练时，将数据集划分成指定的分片数（默认值None）。指定此参数后, `num_samples` 表示每个分片的最大样本数。
+        - **shard_id** (int, 可选): 分布式训练时，指定使用的分片ID号（默认值None）。只有当指定了 `num_shards` 时才能指定此参数。
+        - **cache** (DatasetCache, 可选): 单节点数据缓存，能够加快数据加载和处理的速度（默认值None，即不使用缓存加速）。
+
+    **异常：**
+
+        - **RuntimeError:** `dataset_dir` 路径下不包含数据文件。
+        - **RuntimeError:** `num_parallel_workers` 超过系统最大线程数。
+        - **RuntimeError:** 同时指定了`sampler`和`shuffle`参数。
+        - **RuntimeError:** 同时指定了`sampler`和`num_shards`参数。
+        - **RuntimeError:** 指定了`num_shards`参数，但是未指定`shard_id`参数。
+        - **RuntimeError:** 指定了`shard_id`参数，但是未指定`num_shards`参数。
+        - **ValueError:**  `shard_id`参数错误（小于0或者大于等于 `num_shards`）。
+
+    **注：**
+
+        此数据集可以通过`sampler`指定任意采样器，但参数`sampler` 和 `shuffle` 的行为是互斥的。下表展示了几种合法的输入参数及预期的行为。
+
+    .. list-table:: 配置`sampler`和`shuffle`的不同组合得到的预期排序结果
+       :widths: 25 25 50
+       :header-rows: 1
+
+       * - 参数`sampler`
+         - 参数`shuffle`
+         - 预期数据顺序
+       * - None
+         - None
+         - 随机排列
+       * - None
+         - True
+         - 随机排列
+       * - None
+         - False
+         - 顺序排列
+       * - 参数`sampler`
+         - None
+         - 由`sampler`行为定义的顺序
+       * - 参数`sampler`
+         - True
+         - 不允许
+       * - 参数`sampler`
+         - False
+         - 不允许
+
+    **样例：**
+    
+        .. code-block::
+
+            >>> cifar10_dataset_dir = "/path/to/cifar10_dataset_directory"
+            >>>
+            >>> # 1) 按数据集文件的读取顺序，获取CIFAR-10数据集中的所有样本
+            >>> dataset = ds.Cifar10Dataset(dataset_dir=cifar10_dataset_dir, shuffle=False)
+            >>>
+            >>> # 2) 从CIFAR10数据集中随机抽取350个样本
+            >>> dataset = ds.Cifar10Dataset(dataset_dir=cifar10_dataset_dir, num_samples=350, shuffle=True)
+            >>>
+            >>> # 3) 对CIFAR10数据集进行分布式训练，并将数据集拆分为2个分片，当前数据集仅加载分片ID号为0的数据
+            >>> dataset = ds.Cifar10Dataset(dataset_dir=cifar10_dataset_dir, num_shards=2, shard_id=0)
+            >>>
+            >>> # 提示：在CIFAR-10数据集生成的数据集对象中，每一次迭代得到的数据行都有"image"和"label"两个键
+
+    **关于CIFAR-10数据集:**
+
+    CIFAR-10数据集由10类60000张32x32彩色图片组成，每类6000张图片。有50000个训练样本和10000个测试样本。图片分为飞机、汽车、鸟类、猫、鹿、狗、青蛙、马、船和卡车这10类。
+
+    以下为原始CIFAR-10 数据集结构。您可以将数据集解压成如下的文件结构，并通过MindSpore的API进行读取。
+
+    .. code-block::
+
+        .
+        └── cifar-10-batches-bin
+            ├── data_batch_1.bin
+            ├── data_batch_2.bin
+            ├── data_batch_3.bin
+            ├── data_batch_4.bin
+            ├── data_batch_5.bin
+            ├── test_batch.bin
+            ├── readme.html
+            └── batches.meta.txt
+
+    **引用：**
+
+    .. code-block::
+
+        @techreport{Krizhevsky09,
+        author       = {Alex Krizhevsky},
+        title        = {Learning multiple layers of features from tiny images},
+        institution  = {},
+        year         = {2009},
+        howpublished = {http://www.cs.toronto.edu/~kriz/cifar.html}
+        }
--- a/mindspore/dataset/engine/datasets.py
+++ b/mindspore/dataset/engine/datasets.py
@ -6435,7 +6435,7 @@ class CLUEDataset(SourceDataset):
            :py:obj:`[span2_index, dtype=uint8]`, :py:obj:`[span1_text, dtype=string]`, \
            :py:obj:`[span2_text, dtype=string]`, :py:obj:`[idx, dtype=uint8]`, \
            :py:obj:`[text, dtype=string]`, :py:obj:`[label, dtype=string]`.
-        - usage = output columns: :py:obj:`[span1_index, dtype=uint8]`, \
+        - usage = :py:obj:`test`, output columns: :py:obj:`[span1_index, dtype=uint8]`, \
            :py:obj:`[span2_index, dtype=uint8]`, :py:obj:`[span1_text, dtype=string]`, \
            :py:obj:`[span2_text, dtype=string]`, :py:obj:`[idx, dtype=uint8]`, :py:obj:`[text, dtype=string]`.
        - usage = :py:obj:`eval`, output columns: :py:obj:`[span1_index, dtype=uint8]`, \