fix example code and docs

Xiao Tianci 2022-04-27 10:59:22 +08:00
parent 5b4e912625
commit 82b4d74ef5
6 changed files with 42 additions and 31 deletions

@@ -61,6 +61,7 @@
# MindData
"mindspore/mindspore/python/mindspore/dataset/__init__.py" "redefined-builtin"
"mindspore/mindspore/python/mindspore/dataset/audio/transforms.py" "super-init-not-called"
"mindspore/mindspore/python/mindspore/dataset/engine/__init__.py" "redefined-builtin"
"mindspore/mindspore/python/mindspore/dataset/engine/datasets.py" "redefined-builtin"
"mindspore/mindspore/python/mindspore/dataset/engine/datasets.py" "broad-except"

@@ -32,10 +32,11 @@
- **drop_remainder** (bool, optional) - Whether to drop the last batch when it contains fewer entries than `batch_size`, instead of passing it on to the next operation. Default: False, do not drop.
- **num_parallel_workers** (int, optional) - Number of parallel processes/threads used by the `batch` operation (the `python_multiprocessing` parameter determines whether multiprocessing or multithreading is used).
  Default: None, use the number of workers configured in mindspore.dataset.config.
- - **per_batch_map** (callable, optional) - A callable that takes (list[Tensor], list[Tensor], ..., BatchInfo) as input and returns (list[Tensor], list[Tensor], ...) as the new data columns.
-   Each list[Tensor] in the input holds one batch of Tensors for a given column; the number of list[Tensor] should match the number of column names passed in `input_columns`,
-   and the returned (list[Tensor], list[Tensor], ...) should contain as many list[Tensor] as the input. If the number of output columns differs from the number of input columns, `output_columns` must be specified.
-   The last input parameter of the callable is always a BatchInfo object, used to query information about the dataset; see Example 2 for usage.
+ - **per_batch_map** (callable, optional) - A callable that takes (list[numpy.ndarray], list[numpy.ndarray], ..., BatchInfo) as input
+   and returns (list[numpy.ndarray], list[numpy.ndarray], ...) as the new data columns. Each list[numpy.ndarray] in the input holds one batch of numpy.ndarray for a given column;
+   the number of list[numpy.ndarray] should match the number of column names passed in `input_columns`, and the returned (list[numpy.ndarray], list[numpy.ndarray], ...)
+   should contain as many list[numpy.ndarray] as the input. If the number of output columns differs from the number of input columns, `output_columns` must be specified.
+   The last input parameter of the callable is always a BatchInfo object, used to query information about the dataset; see Example 2 for usage (a sketch also follows this list).
- **input_columns** (Union[str, list[str]], optional) - Columns to be used as input to the `batch` operation.
  If `per_batch_map` is not None, the number of column names in the list should match the number of columns handled by `per_batch_map`. Default: None, not specified.
- **output_columns** (Union[str, list[str]], optional) - Columns to be output by the `batch` operation. This parameter must be specified if the number of output columns differs from the number of input columns.
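
The per_batch_map contract above, as a minimal runnable sketch (the column name "image" and the rescaling logic are illustrative, not taken from this commit):

    import numpy as np
    import mindspore.dataset as ds

    def rescale_batch(images, batch_info):
        # 'images' is a list[numpy.ndarray], one entry per sample in the batch;
        # BatchInfo reports which epoch/batch is currently being built.
        scale = 1.0 / (batch_info.get_batch_num() + 1)
        return ([img * scale for img in images],)

    data = np.random.random((10, 2, 2)).astype(np.float32)
    dataset = ds.NumpySlicesDataset(data, column_names=["image"], shuffle=False)
    dataset = dataset.batch(batch_size=2, per_batch_map=rescale_batch,
                            input_columns=["image"])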

@@ -150,7 +150,7 @@ class MS_API Dataset : public std::enable_shared_from_this<Dataset> {
}
/// \brief Function to transfer data through a device.
- /// \notes If device is Ascend, features of data will be transferred one by one. The limitation
+ /// \note If device is Ascend, features of data will be transferred one by one. The limitation
/// of data transmission per time is 256M.
/// \param[in] queue_name Channel name (default="", create new unique name).
/// \param[in] device_type Type of device (default="", get from MSContext).
@@ -193,7 +193,7 @@ class MS_API Dataset : public std::enable_shared_from_this<Dataset> {
}
/// \brief Function to create a BatchDataset
- /// \notes Combines batch_size number of consecutive rows into batches
+ /// \note Combines batch_size number of consecutive rows into batches
/// \param[in] batch_size The number of rows each batch is created with
/// \param[in] drop_remainder Determines whether or not to drop the last possibly incomplete
/// batch. If true, and if there are less than batch_size rows
@@ -209,7 +209,7 @@ class MS_API Dataset : public std::enable_shared_from_this<Dataset> {
std::shared_ptr<BatchDataset> Batch(int32_t batch_size, bool drop_remainder = false);
/// \brief Function to create a MapDataset
- /// \notes Applies each operation in operations to this dataset
+ /// \note Applies each operation in operations to this dataset
/// \param[in] operations Vector of raw pointers to TensorTransform objects to be applied on the dataset. Operations
/// are applied in the order they appear in this list
/// \param[in] input_columns Vector of the names of the columns that will be passed to the first
@@ -274,7 +274,7 @@ class MS_API Dataset : public std::enable_shared_from_this<Dataset> {
}
/// \brief Function to create a MapDataset
- /// \notes Applies each operation in operations to this dataset
+ /// \note Applies each operation in operations to this dataset
/// \param[in] operations Vector of shared pointers to TensorTransform objects to be applied on the dataset.
/// Operations are applied in the order they appear in this list
/// \param[in] input_columns Vector of the names of the columns that will be passed to the first
@@ -306,7 +306,7 @@ class MS_API Dataset : public std::enable_shared_from_this<Dataset> {
}
/// \brief Function to create a MapDataset
- /// \notes Applies each operation in operations to this dataset
+ /// \note Applies each operation in operations to this dataset
/// \param[in] operations Vector of TensorTransform objects to be applied on the dataset. Operations are applied in
/// the order they appear in this list
/// \param[in] input_columns Vector of the names of the columns that will be passed to the first
@@ -336,7 +336,7 @@ class MS_API Dataset : public std::enable_shared_from_this<Dataset> {
}
/// \brief Function to create a Project Dataset
- /// \notes Applies project to the dataset
+ /// \note Applies project to the dataset
/// \param[in] columns The name of columns to project
/// \return Shared pointer to the current Dataset
/// \par Example
@@ -350,7 +350,7 @@ class MS_API Dataset : public std::enable_shared_from_this<Dataset> {
}
/// \brief Function to create a Shuffle Dataset
- /// \notes Randomly shuffles the rows of this dataset
+ /// \note Randomly shuffles the rows of this dataset
/// \param[in] buffer_size The size of the buffer (must be larger than 1) for shuffling
/// \return Shared pointer to the current ShuffleDataset
/// \par Example
@@ -576,7 +576,7 @@ class MS_API AlbumDataset : public Dataset {
};
/// \brief Function to create an AlbumDataset
- /// \notes The generated dataset is specified through setting a schema
+ /// \note The generated dataset is specified through setting a schema
/// \param[in] dataset_dir Path to the root directory that contains the dataset
/// \param[in] data_schema Path to dataset schema file
/// \param[in] column_names Column names used to specify columns to load, if empty, will read all columns.
@@ -611,7 +611,7 @@ Album(const std::string &dataset_dir, const std::string &data_schema, const std:
}
/// \brief Function to create an AlbumDataset
- /// \notes The generated dataset is specified through setting a schema
+ /// \note The generated dataset is specified through setting a schema
/// \param[in] dataset_dir Path to the root directory that contains the dataset
/// \param[in] data_schema Path to dataset schema file
/// \param[in] column_names Column names used to specify columns to load
@@ -628,7 +628,7 @@ inline std::shared_ptr<AlbumDataset> MS_API Album(const std::string &dataset_dir
}
/// \brief Function to create an AlbumDataset
- /// \notes The generated dataset is specified through setting a schema
+ /// \note The generated dataset is specified through setting a schema
/// \param[in] dataset_dir Path to the root directory that contains the dataset
/// \param[in] data_schema Path to dataset schema file
/// \param[in] column_names Column names used to specify columns to load
@@ -676,7 +676,7 @@ class MS_API MnistDataset : public Dataset {
};
/// \brief Function to create a MnistDataset
- /// \notes The generated dataset has two columns ["image", "label"]
+ /// \note The generated dataset has two columns ["image", "label"]
/// \param[in] dataset_dir Path to the root directory that contains the dataset
/// \param[in] usage of MNIST, can be "train", "test" or "all" (default = "all").
/// \param[in] sampler Shared pointer to a sampler object used to choose samples from the dataset. If sampler is not
@@ -705,7 +705,7 @@ Mnist(const std::string &dataset_dir, const std::string &usage = "all",
}
/// \brief Function to create a MnistDataset
- /// \notes The generated dataset has two columns ["image", "label"]
+ /// \note The generated dataset has two columns ["image", "label"]
/// \param[in] dataset_dir Path to the root directory that contains the dataset
/// \param[in] usage of MNIST, can be "train", "test" or "all"
/// \param[in] sampler Raw pointer to a sampler object used to choose samples from the dataset.
@@ -718,7 +718,7 @@ inline std::shared_ptr<MnistDataset> MS_API Mnist(const std::string &dataset_dir
}
/// \brief Function to create a MnistDataset
- /// \notes The generated dataset has two columns ["image", "label"]
+ /// \note The generated dataset has two columns ["image", "label"]
/// \param[in] dataset_dir Path to the root directory that contains the dataset
/// \param[in] usage of MNIST, can be "train", "test" or "all"
/// \param[in] sampler Sampler object used to choose samples from the dataset.
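
These C++ entry points mirror the Python dataset API; a rough Python sketch of the pipeline the comments above describe, assuming an extracted MNIST directory at the placeholder path "./mnist":

    import mindspore.dataset as ds
    import mindspore.dataset.vision.c_transforms as c_vision

    # Dataset source, then Map/Shuffle/Batch, matching the C++ factory functions above.
    mnist = ds.MnistDataset("./mnist", usage="train")
    mnist = mnist.map(operations=[c_vision.HWC2CHW()], input_columns=["image"])
    mnist = mnist.shuffle(buffer_size=10000)
    mnist = mnist.batch(batch_size=32, drop_remainder=True)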

@@ -1409,9 +1409,9 @@ class PhaseVocoder(AudioTensorOperation):
Examples:
>>> import numpy as np
>>>
- >>> waveform = np.random.randn(2, 44, 10, 2)
+ >>> waveform = np.random.random([2, 44, 10, 2])
>>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
- >>> phase_advance = np.random.randn(44, 1)
+ >>> phase_advance = np.random.random([44, 1])
>>> transforms = [audio.PhaseVocoder(rate=2, phase_advance=phase_advance)]
>>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
"""
@@ -1661,7 +1661,7 @@ class TimeStretch(AudioTensorOperation):
Examples:
>>> import numpy as np
>>>
- >>> waveform = np.random.random([1, 30])
+ >>> waveform = np.random.random([44, 10, 2])
>>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
>>> transforms = [audio.TimeStretch()]
>>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
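
TimeStretch consumes the same (..., freq, num_frame, complex=2) layout, which is why the example's shape changes from [1, 30] to [44, 10, 2]. A hedged numpy sketch of building such an array from a real signal (the framing below is a toy STFT, for shape illustration only):

    import numpy as np

    signal = np.random.random(2048).astype(np.float32)
    frames = signal.reshape(-1, 256)               # (num_frame=8, frame_len=256)
    spec = np.fft.rfft(frames, axis=-1).T          # (freq=129, num_frame=8), complex
    # Pack real/imag parts into a trailing axis of size 2.
    spec_2ch = np.stack([spec.real, spec.imag], axis=-1).astype(np.float32)
    print(spec_2ch.shape)                          # (129, 8, 2)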

@@ -574,11 +574,12 @@ class Dataset:
num_parallel_workers (int, optional): Number of workers(threads) to process the dataset in parallel
(default=None).
per_batch_map (callable, optional): Per batch map callable (default=None). A callable which takes
- (list[Tensor], list[Tensor], ..., BatchInfo) as input parameters. Each list[Tensor] represents a batch
- of Tensors on a given column. The number of lists should match with the number of entries in
- input_columns. The last parameter of the callable should always be a BatchInfo object. Per_batch_map
- should return (list[Tensor], list[Tensor], ...). The length of each list in output should be the same as
- the input. output_columns is required if the number of output lists is different from input.
+ (list[numpy.ndarray], list[numpy.ndarray], ..., BatchInfo) as input parameters. Each
+ list[numpy.ndarray] represents a batch of numpy.ndarray on a given column. The number of lists should
+ match with the number of entries in input_columns. The last parameter of the callable should always be
+ a BatchInfo object. Per_batch_map should return (list[numpy.ndarray], list[numpy.ndarray], ...). The
+ length of each list in output should be the same as the input. output_columns is required if the number
+ of output lists is different from input.
input_columns (Union[str, list[str]], optional): List of names of the input columns. The size of the list
should match with signature of per_batch_map callable (default=None).
output_columns (Union[str, list[str]], optional): List of names assigned to the columns
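
When per_batch_map emits a different number of columns than it receives, output_columns supplies the new names. A minimal sketch under assumed, illustrative names ("feat", "min", "max"):

    import numpy as np
    import mindspore.dataset as ds

    def split_stats(feats, batch_info):
        # One input column in, two output columns out: per-sample min and max.
        mins = [np.min(x, keepdims=True) for x in feats]
        maxs = [np.max(x, keepdims=True) for x in feats]
        return (mins, maxs)

    data = np.random.random((8, 4)).astype(np.float32)
    dataset = ds.NumpySlicesDataset(data, column_names=["feat"], shuffle=False)
    # column_order pins the final column layout once the column set changes.
    dataset = dataset.batch(batch_size=4, per_batch_map=split_stats,
                            input_columns=["feat"], output_columns=["min", "max"],
                            column_order=["min", "max"])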

@@ -488,7 +488,7 @@ class HWC2CHW(py_transforms.PyTensorOperation):
Examples:
>>> from mindspore.dataset.transforms.py_transforms import Compose
>>>
- >>> transforms_list = Compose([py_vision.Decode(),
+ >>> transforms_list = Compose([c_vision.Decode(),
... py_vision.HWC2CHW()])
>>> # apply the transform to dataset through map function
>>> image_folder_dataset = image_folder_dataset.map(operations=transforms_list,
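
A plausible reading of this fix: py_vision.Decode yields a PIL Image, while py_transforms.HWC2CHW transposes a numpy array in HWC layout, which is what c_vision.Decode produces, so the corrected Compose chains types that actually match.
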
@@ -627,11 +627,19 @@ class MixUp(py_transforms.PyTensorOperation):
``CPU``
Examples:
- >>> # Setup multi-batch mixup transformation
- >>> transform = [py_vision.MixUp(batch_size=16, alpha=0.2, is_single=False)]
- >>> # Apply the transform to the dataset through dataset.map()
- >>> image_folder_dataset = image_folder_dataset.map(input_columns="image",
- ... operations=transform)
+ >>> # first decode the image
+ >>> image_folder_dataset = image_folder_dataset.map(operations=c_vision.Decode(),
+ ... input_columns="image")
+ >>> # then one-hot encode the label
+ >>> image_folder_dataset = image_folder_dataset.map(operations=c_transforms.OneHot(10),
+ ... input_columns="label")
+ >>> # batch the samples
+ >>> batch_size = 4
+ >>> image_folder_dataset = image_folder_dataset.batch(batch_size=batch_size)
+ >>> # finally mix up the images and labels
+ >>> image_folder_dataset = image_folder_dataset.map(
+ ...     operations=py_vision.MixUp(batch_size=batch_size, alpha=0.2),
+ ...     input_columns=["image", "label"])
"""
@check_mix_up
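
The reworked example decodes and one-hot encodes before batching because the Python MixUp transform operates on an entire batch of images together with their one-hot labels; with the default is_single=True the batch is mixed with itself, so no second batch is needed.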