From 82b4d74ef59cb5ad8d64c7cbd89f3de6a35bd314 Mon Sep 17 00:00:00 2001
From: Xiao Tianci
Date: Wed, 27 Apr 2022 10:59:22 +0800
Subject: [PATCH] fix example code and docs

---
 .jenkins/check/config/filter_pylint.txt       |  1 +
 .../dataset/mindspore.dataset.Dataset.rst     |  9 ++++---
 .../dataset/liteapi/include/datasets.h        | 26 +++++++++----------
 .../mindspore/dataset/audio/transforms.py     |  6 ++---
 .../mindspore/dataset/engine/datasets.py      | 11 ++++----
 .../mindspore/dataset/vision/py_transforms.py | 20 +++++++++-----
 6 files changed, 42 insertions(+), 31 deletions(-)

diff --git a/.jenkins/check/config/filter_pylint.txt b/.jenkins/check/config/filter_pylint.txt
index 4fa9220446e..412cec87ef0 100644
--- a/.jenkins/check/config/filter_pylint.txt
+++ b/.jenkins/check/config/filter_pylint.txt
@@ -61,6 +61,7 @@
 # MindData
 "mindspore/mindspore/python/mindspore/dataset/__init__.py" "redefined-builtin"
+"mindspore/mindspore/python/mindspore/dataset/audio/transforms.py" "super-init-not-called"
 "mindspore/mindspore/python/mindspore/dataset/engine/__init__.py" "redefined-builtin"
 "mindspore/mindspore/python/mindspore/dataset/engine/datasets.py" "redefined-builtin"
 "mindspore/mindspore/python/mindspore/dataset/engine/datasets.py" "broad-except"
diff --git a/docs/api/api_python/dataset/mindspore.dataset.Dataset.rst b/docs/api/api_python/dataset/mindspore.dataset.Dataset.rst
index ef39b9de5f7..3223ab317f6 100644
--- a/docs/api/api_python/dataset/mindspore.dataset.Dataset.rst
+++ b/docs/api/api_python/dataset/mindspore.dataset.Dataset.rst
@@ -32,10 +32,11 @@
     - **drop_remainder** (bool, optional) - Whether to drop the last batch when it contains fewer entries than `batch_size`, rather than passing it on to the next operation. Default: False, do not drop.
     - **num_parallel_workers** (int, optional) - Number of parallel processes/threads used by the `batch` operation (whether multi-process or multi-thread mode is used is determined by `python_multiprocessing`).
       Default: None, use the number of threads configured in mindspore.dataset.config.
-    - **per_batch_map** (callable, optional) - A callable that takes (list[Tensor], list[Tensor], ..., BatchInfo) as input and returns (list[Tensor], list[Tensor], ...) as the new data columns.
-      Each list[Tensor] in the input represents a batch of Tensors for a given column, and the number of list[Tensor] should match the number of column names passed in `input_columns`.
-      In the returned (list[Tensor], list[Tensor], ...), the number of list[Tensor] should be the same as in the input; if the number of output columns differs from the number of input columns, `output_columns` must be specified.
-      The last input parameter of the callable is always BatchInfo, which is used to obtain information about the dataset; see example (2) for usage.
+    - **per_batch_map** (callable, optional) - A callable that takes (list[numpy.ndarray], list[numpy.ndarray], ..., BatchInfo) as input
+      and returns (list[numpy.ndarray], list[numpy.ndarray], ...) as the new data columns. Each list[numpy.ndarray] in the input represents a batch of numpy.ndarray for a given column,
+      and the number of list[numpy.ndarray] should match the number of column names passed in `input_columns`. In the returned (list[numpy.ndarray], list[numpy.ndarray], ...),
+      the number of list[numpy.ndarray] should be the same as in the input; if the number of output columns differs from the number of input columns, `output_columns` must be specified.
+      The last input parameter of the callable is always BatchInfo, which is used to obtain information about the dataset; see example (2) for usage.
     - **input_columns** (Union[str, list[str]], optional) - Columns to be used as input to the `batch` operation.
       If `per_batch_map` is not None, the number of column names in the list should match the number of columns handled by `per_batch_map`. Default: None, not specified.
    - **output_columns** (Union[str, list[str]], optional) - Columns to be output by the `batch` operation. This parameter must be specified if the number of output columns differs from the number of input columns.
diff --git a/mindspore/ccsrc/minddata/dataset/liteapi/include/datasets.h b/mindspore/ccsrc/minddata/dataset/liteapi/include/datasets.h
index cb719fa01a6..c07201733e9 100644
--- a/mindspore/ccsrc/minddata/dataset/liteapi/include/datasets.h
+++ b/mindspore/ccsrc/minddata/dataset/liteapi/include/datasets.h
@@ -150,7 +150,7 @@ class MS_API Dataset : public std::enable_shared_from_this<Dataset> {
   }
 
   /// \brief Function to transfer data through a device.
-  /// \notes If device is Ascend, features of data will be transferred one by one. The limitation
+  /// \note If device is Ascend, features of data will be transferred one by one. The limitation
   ///     of data transmission per time is 256M.
   /// \param[in] queue_name Channel name (default="", create new unique name).
   /// \param[in] device_type Type of device (default="", get from MSContext).
@@ -193,7 +193,7 @@ class MS_API Dataset : public std::enable_shared_from_this<Dataset> {
   }
 
   /// \brief Function to create a BatchDataset
-  /// \notes Combines batch_size number of consecutive rows into batches
+  /// \note Combines batch_size number of consecutive rows into batches
   /// \param[in] batch_size The number of rows each batch is created with
   /// \param[in] drop_remainder Determines whether or not to drop the last possibly incomplete
   ///     batch. If true, and if there are less than batch_size rows
@@ -209,7 +209,7 @@
   std::shared_ptr<BatchDataset> Batch(int32_t batch_size, bool drop_remainder = false);
 
   /// \brief Function to create a MapDataset
-  /// \notes Applies each operation in operations to this dataset
+  /// \note Applies each operation in operations to this dataset
   /// \param[in] operations Vector of raw pointers to TensorTransform objects to be applied on the dataset. Operations
   ///     are applied in the order they appear in this list
   /// \param[in] input_columns Vector of the names of the columns that will be passed to the first
@@ -274,7 +274,7 @@ class MS_API Dataset : public std::enable_shared_from_this<Dataset> {
   }
 
   /// \brief Function to create a MapDataset
-  /// \notes Applies each operation in operations to this dataset
+  /// \note Applies each operation in operations to this dataset
   /// \param[in] operations Vector of shared pointers to TensorTransform objects to be applied on the dataset.
   ///     Operations are applied in the order they appear in this list
   /// \param[in] input_columns Vector of the names of the columns that will be passed to the first
@@ -306,7 +306,7 @@ class MS_API Dataset : public std::enable_shared_from_this<Dataset> {
   }
 
   /// \brief Function to create a MapDataset
-  /// \notes Applies each operation in operations to this dataset
+  /// \note Applies each operation in operations to this dataset
   /// \param[in] operations Vector of TensorTransform objects to be applied on the dataset. Operations are applied in
   ///     the order they appear in this list
   /// \param[in] input_columns Vector of the names of the columns that will be passed to the first
@@ -336,7 +336,7 @@ class MS_API Dataset : public std::enable_shared_from_this<Dataset> {
   }
 
   /// \brief Function to create a Project Dataset
-  /// \notes Applies project to the dataset
+  /// \note Applies project to the dataset
   /// \param[in] columns The name of columns to project
   /// \return Shared pointer to the current Dataset
   /// \par Example
@@ -350,7 +350,7 @@ class MS_API Dataset : public std::enable_shared_from_this<Dataset> {
   }
 
   /// \brief Function to create a Shuffle Dataset
-  /// \notes Randomly shuffles the rows of this dataset
+  /// \note Randomly shuffles the rows of this dataset
   /// \param[in] buffer_size The size of the buffer (must be larger than 1) for shuffling
   /// \return Shared pointer to the current ShuffleDataset
   /// \par Example
@@ -576,7 +576,7 @@ class MS_API AlbumDataset : public Dataset {
 };
 
 /// \brief Function to create an AlbumDataset
-/// \notes The generated dataset is specified through setting a schema
+/// \note The generated dataset is specified through setting a schema
 /// \param[in] dataset_dir Path to the root directory that contains the dataset
 /// \param[in] data_schema Path to dataset schema file
 /// \param[in] column_names Column names used to specify columns to load, if empty, will read all columns.
@@ -611,7 +611,7 @@ Album(const std::string &dataset_dir, const std::string &data_schema, const std:
 }
 
 /// \brief Function to create an AlbumDataset
-/// \notes The generated dataset is specified through setting a schema
+/// \note The generated dataset is specified through setting a schema
 /// \param[in] dataset_dir Path to the root directory that contains the dataset
 /// \param[in] data_schema Path to dataset schema file
 /// \param[in] column_names Column names used to specify columns to load
@@ -628,7 +628,7 @@ inline std::shared_ptr<AlbumDataset> MS_API Album(const std::string &dataset_dir
 }
 
 /// \brief Function to create an AlbumDataset
-/// \notes The generated dataset is specified through setting a schema
+/// \note The generated dataset is specified through setting a schema
 /// \param[in] dataset_dir Path to the root directory that contains the dataset
 /// \param[in] data_schema Path to dataset schema file
 /// \param[in] column_names Column names used to specify columns to load
@@ -676,7 +676,7 @@ class MS_API MnistDataset : public Dataset {
 };
 
 /// \brief Function to create a MnistDataset
-/// \notes The generated dataset has two columns ["image", "label"]
+/// \note The generated dataset has two columns ["image", "label"]
 /// \param[in] dataset_dir Path to the root directory that contains the dataset
 /// \param[in] usage of MNIST, can be "train", "test" or "all" (default = "all").
 /// \param[in] sampler Shared pointer to a sampler object used to choose samples from the dataset. If sampler is not
@@ -705,7 +705,7 @@ Mnist(const std::string &dataset_dir, const std::string &usage = "all",
 }
 
 /// \brief Function to create a MnistDataset
-/// \notes The generated dataset has two columns ["image", "label"]
+/// \note The generated dataset has two columns ["image", "label"]
 /// \param[in] dataset_dir Path to the root directory that contains the dataset
 /// \param[in] usage of MNIST, can be "train", "test" or "all"
 /// \param[in] sampler Raw pointer to a sampler object used to choose samples from the dataset.
@@ -718,7 +718,7 @@ inline std::shared_ptr<MnistDataset> MS_API Mnist(const std::string &dataset_dir
 }
 
 /// \brief Function to create a MnistDataset
-/// \notes The generated dataset has two columns ["image", "label"]
+/// \note The generated dataset has two columns ["image", "label"]
 /// \param[in] dataset_dir Path to the root directory that contains the dataset
 /// \param[in] usage of MNIST, can be "train", "test" or "all"
 /// \param[in] sampler Sampler object used to choose samples from the dataset.
diff --git a/mindspore/python/mindspore/dataset/audio/transforms.py b/mindspore/python/mindspore/dataset/audio/transforms.py
index 54b826cf2b7..8e394f5379f 100644
--- a/mindspore/python/mindspore/dataset/audio/transforms.py
+++ b/mindspore/python/mindspore/dataset/audio/transforms.py
@@ -1409,9 +1409,9 @@ class PhaseVocoder(AudioTensorOperation):
     Examples:
         >>> import numpy as np
         >>>
-        >>> waveform = np.random.randn(2, 44, 10, 2)
+        >>> waveform = np.random.random([2, 44, 10, 2])
         >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
-        >>> phase_advance = np.random.randn(44, 1)
+        >>> phase_advance = np.random.random([44, 1])
         >>> transforms = [audio.PhaseVocoder(rate=2, phase_advance=phase_advance)]
         >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
@@ -1661,7 +1661,7 @@ class TimeStretch(AudioTensorOperation):
     Examples:
         >>> import numpy as np
         >>>
-        >>> waveform = np.random.random([1, 30])
+        >>> waveform = np.random.random([44, 10, 2])
         >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
         >>> transforms = [audio.TimeStretch()]
         >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
diff --git a/mindspore/python/mindspore/dataset/engine/datasets.py b/mindspore/python/mindspore/dataset/engine/datasets.py
index 118681eadd7..28fbed879ee 100644
--- a/mindspore/python/mindspore/dataset/engine/datasets.py
+++ b/mindspore/python/mindspore/dataset/engine/datasets.py
@@ -574,11 +574,12 @@ class Dataset:
             num_parallel_workers (int, optional): Number of workers(threads) to process the dataset in parallel
                 (default=None).
             per_batch_map (callable, optional): Per batch map callable (default=None). A callable which takes
-                (list[Tensor], list[Tensor], ..., BatchInfo) as input parameters. Each list[Tensor] represents a batch
-                of Tensors on a given column. The number of lists should match with the number of entries in
-                input_columns. The last parameter of the callable should always be a BatchInfo object. Per_batch_map
-                should return (list[Tensor], list[Tensor], ...). The length of each list in output should be the same as
-                the input. output_columns is required if the number of output lists is different from input.
+                (list[numpy.ndarray], list[numpy.ndarray], ..., BatchInfo) as input parameters. Each
+                list[numpy.ndarray] represents a batch of numpy.ndarray on a given column. The number of lists should
+                match with the number of entries in input_columns. The last parameter of the callable should always be
+                a BatchInfo object. Per_batch_map should return (list[numpy.ndarray], list[numpy.ndarray], ...). The
+                length of each list in output should be the same as the input. output_columns is required if the number
+                of output lists is different from input.
             input_columns (Union[str, list[str]], optional): List of names of the input columns. The size of the list
                 should match with signature of per_batch_map callable (default=None).
             output_columns (Union[str, list[str]], optional): List of names assigned to the columns
diff --git a/mindspore/python/mindspore/dataset/vision/py_transforms.py b/mindspore/python/mindspore/dataset/vision/py_transforms.py
index 7c0ab61e35f..7a81dcebccc 100644
--- a/mindspore/python/mindspore/dataset/vision/py_transforms.py
+++ b/mindspore/python/mindspore/dataset/vision/py_transforms.py
@@ -488,7 +488,7 @@ class HWC2CHW(py_transforms.PyTensorOperation):
     Examples:
         >>> from mindspore.dataset.transforms.py_transforms import Compose
         >>>
-        >>> transforms_list = Compose([py_vision.Decode(),
+        >>> transforms_list = Compose([c_vision.Decode(),
         ...                            py_vision.HWC2CHW()])
         >>> # apply the transform to dataset through map function
         >>> image_folder_dataset = image_folder_dataset.map(operations=transforms_list,
@@ -627,11 +627,19 @@ class MixUp(py_transforms.PyTensorOperation):
         ``CPU``
 
     Examples:
-        >>> # Setup multi-batch mixup transformation
-        >>> transform = [py_vision.MixUp(batch_size=16, alpha=0.2, is_single=False)]
-        >>> # Apply the transform to the dataset through dataset.map()
-        >>> image_folder_dataset = image_folder_dataset.map(input_columns="image",
-        ...                                                 operations=transform)
+        >>> # first decode the image
+        >>> image_folder_dataset = image_folder_dataset.map(operations=c_vision.Decode(),
+        ...                                                 input_columns="image")
+        >>> # then one-hot encode the label
+        >>> image_folder_dataset = image_folder_dataset.map(operations=c_transforms.OneHot(10),
+        ...                                                 input_columns="label")
+        >>> # batch the samples
+        >>> batch_size = 4
+        >>> image_folder_dataset = image_folder_dataset.batch(batch_size=batch_size)
+        >>> # finally mix up the images and labels
+        >>> image_folder_dataset = image_folder_dataset.map(
+        ...     operations=py_vision.MixUp(batch_size=batch_size, alpha=0.2),
+        ...     input_columns=["image", "label"])
     """
 
     @check_mix_up
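
For reference, the per_batch_map contract documented above can be exercised with a minimal, self-contained sketch like the following. It assumes a MindSpore build from this era, where Dataset.batch() accepts per_batch_map; the column name "data" and the helper name double_batch are illustrative only and are not part of this patch:

    import numpy as np
    import mindspore.dataset as ds

    def double_batch(col, batch_info):
        # `col` is a list[numpy.ndarray]: one ndarray per row in the batch.
        # BatchInfo is always the last parameter of a per_batch_map callable.
        print("batch number:", batch_info.get_batch_num())
        # Return a tuple of list[numpy.ndarray], one list per output column.
        return ([2 * x for x in col],)

    data = [np.array([i], dtype=np.float32) for i in range(8)]
    dataset = ds.NumpySlicesDataset(data, column_names=["data"], shuffle=False)
    dataset = dataset.batch(batch_size=4,
                            per_batch_map=double_batch,
                            input_columns=["data"])

    for row in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        print(row["data"])  # two (4, 1) float32 batches with doubled values

Each input column arrives as a list[numpy.ndarray] with one entry per row, and the callable returns a tuple of such lists, which is exactly the type change this patch documents (numpy.ndarray rather than Tensor).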