fix example code and docs

Xiao Tianci 2022-04-27 10:59:22 +08:00
parent 5b4e912625
commit 82b4d74ef5
6 changed files with 42 additions and 31 deletions

@@ -61,6 +61,7 @@
# MindData
"mindspore/mindspore/python/mindspore/dataset/__init__.py" "redefined-builtin"
"mindspore/mindspore/python/mindspore/dataset/audio/transforms.py" "super-init-not-called"
"mindspore/mindspore/python/mindspore/dataset/engine/__init__.py" "redefined-builtin"
"mindspore/mindspore/python/mindspore/dataset/engine/datasets.py" "redefined-builtin"
"mindspore/mindspore/python/mindspore/dataset/engine/datasets.py" "broad-except"

@@ -32,10 +32,11 @@
- **drop_remainder** (bool, optional) - Whether to drop the last batch when it contains fewer entries than `batch_size`, instead of passing it on to the next operation. Default: False, do not drop.
- **num_parallel_workers** (int, optional) - Number of parallel processes/threads used by the `batch` operation (the `python_multiprocessing` parameter determines whether multiprocessing or multithreading is used).
  Default: None, use the number of workers configured in mindspore.dataset.config.
- - **per_batch_map** (callable, optional) - A callable that takes (list[Tensor], list[Tensor], ..., BatchInfo) as input and returns (list[Tensor], list[Tensor], ...) as the new data columns.
-   Each list[Tensor] in the input holds one batch of Tensors for a given column; the number of list[Tensor] should match the number of column names passed in `input_columns`,
-   and the returned (list[Tensor], list[Tensor], ...) should contain as many list[Tensor] as the input. If the number of output columns differs from the number of input columns, `output_columns` must be specified.
-   The last input parameter of the callable is always a BatchInfo object, used to query information about the dataset; see Example 2 for usage.
+ - **per_batch_map** (callable, optional) - A callable that takes (list[numpy.ndarray], list[numpy.ndarray], ..., BatchInfo) as input
+   and returns (list[numpy.ndarray], list[numpy.ndarray], ...) as the new data columns. Each list[numpy.ndarray] in the input holds one batch of numpy.ndarray for a given column;
+   the number of list[numpy.ndarray] should match the number of column names passed in `input_columns`, and the returned (list[numpy.ndarray], list[numpy.ndarray], ...)
+   should contain as many list[numpy.ndarray] as the input. If the number of output columns differs from the number of input columns, `output_columns` must be specified.
+   The last input parameter of the callable is always a BatchInfo object, used to query information about the dataset; see Example 2 for usage (a sketch also follows this list).
- **input_columns** (Union[str, list[str]], optional) - Columns to be used as input to the `batch` operation.
  If `per_batch_map` is not None, the number of column names in the list should match the number of columns handled by `per_batch_map`. Default: None, not specified.
- **output_columns** (Union[str, list[str]], optional) - Columns to be output by the `batch` operation. This parameter must be specified if the number of output columns differs from the number of input columns.
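
The per_batch_map contract above, as a minimal runnable sketch (the column name "image" and the rescaling logic are illustrative, not taken from this commit):

    import numpy as np
    import mindspore.dataset as ds

    def rescale_batch(images, batch_info):
        # 'images' is a list[numpy.ndarray], one entry per sample in the batch;
        # BatchInfo reports which epoch/batch is currently being built.
        scale = 1.0 / (batch_info.get_batch_num() + 1)
        return ([img * scale for img in images],)

    data = np.random.random((10, 2, 2)).astype(np.float32)
    dataset = ds.NumpySlicesDataset(data, column_names=["image"], shuffle=False)
    dataset = dataset.batch(batch_size=2, per_batch_map=rescale_batch,
                            input_columns=["image"])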

@@ -150,7 +150,7 @@ class MS_API Dataset : public std::enable_shared_from_this<Dataset> {
}
/// \brief Function to transfer data through a device.
- /// \notes If device is Ascend, features of data will be transferred one by one. The limitation
+ /// \note If device is Ascend, features of data will be transferred one by one. The limitation
/// of data transmission per time is 256M.
/// \param[in] queue_name Channel name (default="", create new unique name).
/// \param[in] device_type Type of device (default="", get from MSContext).
@@ -193,7 +193,7 @@ class MS_API Dataset : public std::enable_shared_from_this<Dataset> {
}
/// \brief Function to create a BatchDataset
- /// \notes Combines batch_size number of consecutive rows into batches
+ /// \note Combines batch_size number of consecutive rows into batches
/// \param[in] batch_size The number of rows each batch is created with
/// \param[in] drop_remainder Determines whether or not to drop the last possibly incomplete
/// batch. If true, and if there are less than batch_size rows
@@ -209,7 +209,7 @@ class MS_API Dataset : public std::enable_shared_from_this<Dataset> {
std::shared_ptr<BatchDataset> Batch(int32_t batch_size, bool drop_remainder = false);
/// \brief Function to create a MapDataset
- /// \notes Applies each operation in operations to this dataset
+ /// \note Applies each operation in operations to this dataset
/// \param[in] operations Vector of raw pointers to TensorTransform objects to be applied on the dataset. Operations
/// are applied in the order they appear in this list
/// \param[in] input_columns Vector of the names of the columns that will be passed to the first
@@ -274,7 +274,7 @@ class MS_API Dataset : public std::enable_shared_from_this<Dataset> {
}
/// \brief Function to create a MapDataset
- /// \notes Applies each operation in operations to this dataset
+ /// \note Applies each operation in operations to this dataset
/// \param[in] operations Vector of shared pointers to TensorTransform objects to be applied on the dataset.
/// Operations are applied in the order they appear in this list
/// \param[in] input_columns Vector of the names of the columns that will be passed to the first
@@ -306,7 +306,7 @@ class MS_API Dataset : public std::enable_shared_from_this<Dataset> {
}
/// \brief Function to create a MapDataset
- /// \notes Applies each operation in operations to this dataset
+ /// \note Applies each operation in operations to this dataset
/// \param[in] operations Vector of TensorTransform objects to be applied on the dataset. Operations are applied in
/// the order they appear in this list
/// \param[in] input_columns Vector of the names of the columns that will be passed to the first
@@ -336,7 +336,7 @@ class MS_API Dataset : public std::enable_shared_from_this<Dataset> {
}
/// \brief Function to create a Project Dataset
- /// \notes Applies project to the dataset
+ /// \note Applies project to the dataset
/// \param[in] columns The name of columns to project
/// \return Shared pointer to the current Dataset
/// \par Example
@@ -350,7 +350,7 @@ class MS_API Dataset : public std::enable_shared_from_this<Dataset> {
}
/// \brief Function to create a Shuffle Dataset
- /// \notes Randomly shuffles the rows of this dataset
+ /// \note Randomly shuffles the rows of this dataset
/// \param[in] buffer_size The size of the buffer (must be larger than 1) for shuffling
/// \return Shared pointer to the current ShuffleDataset
/// \par Example
@@ -576,7 +576,7 @@ class MS_API AlbumDataset : public Dataset {
};
/// \brief Function to create an AlbumDataset
- /// \notes The generated dataset is specified through setting a schema
+ /// \note The generated dataset is specified through setting a schema
/// \param[in] dataset_dir Path to the root directory that contains the dataset
/// \param[in] data_schema Path to dataset schema file
/// \param[in] column_names Column names used to specify columns to load, if empty, will read all columns.
@@ -611,7 +611,7 @@ Album(const std::string &dataset_dir, const std::string &data_schema, const std:
}
/// \brief Function to create an AlbumDataset
- /// \notes The generated dataset is specified through setting a schema
+ /// \note The generated dataset is specified through setting a schema
/// \param[in] dataset_dir Path to the root directory that contains the dataset
/// \param[in] data_schema Path to dataset schema file
/// \param[in] column_names Column names used to specify columns to load
@@ -628,7 +628,7 @@ inline std::shared_ptr<AlbumDataset> MS_API Album(const std::string &dataset_dir
}
/// \brief Function to create an AlbumDataset
- /// \notes The generated dataset is specified through setting a schema
+ /// \note The generated dataset is specified through setting a schema
/// \param[in] dataset_dir Path to the root directory that contains the dataset
/// \param[in] data_schema Path to dataset schema file
/// \param[in] column_names Column names used to specify columns to load
@@ -676,7 +676,7 @@ class MS_API MnistDataset : public Dataset {
};
/// \brief Function to create a MnistDataset
- /// \notes The generated dataset has two columns ["image", "label"]
+ /// \note The generated dataset has two columns ["image", "label"]
/// \param[in] dataset_dir Path to the root directory that contains the dataset
/// \param[in] usage of MNIST, can be "train", "test" or "all" (default = "all").
/// \param[in] sampler Shared pointer to a sampler object used to choose samples from the dataset. If sampler is not
@@ -705,7 +705,7 @@ Mnist(const std::string &dataset_dir, const std::string &usage = "all",
}
/// \brief Function to create a MnistDataset
- /// \notes The generated dataset has two columns ["image", "label"]
+ /// \note The generated dataset has two columns ["image", "label"]
/// \param[in] dataset_dir Path to the root directory that contains the dataset
/// \param[in] usage of MNIST, can be "train", "test" or "all"
/// \param[in] sampler Raw pointer to a sampler object used to choose samples from the dataset.
@@ -718,7 +718,7 @@ inline std::shared_ptr<MnistDataset> MS_API Mnist(const std::string &dataset_dir
}
/// \brief Function to create a MnistDataset
- /// \notes The generated dataset has two columns ["image", "label"]
+ /// \note The generated dataset has two columns ["image", "label"]
/// \param[in] dataset_dir Path to the root directory that contains the dataset
/// \param[in] usage of MNIST, can be "train", "test" or "all"
/// \param[in] sampler Sampler object used to choose samples from the dataset.
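
These C++ entry points mirror the Python dataset API; a rough Python sketch of the pipeline the comments above describe, assuming an extracted MNIST directory at the placeholder path "./mnist":

    import mindspore.dataset as ds
    import mindspore.dataset.vision.c_transforms as c_vision

    # Dataset source, then Map/Shuffle/Batch, matching the C++ factory functions above.
    mnist = ds.MnistDataset("./mnist", usage="train")
    mnist = mnist.map(operations=[c_vision.HWC2CHW()], input_columns=["image"])
    mnist = mnist.shuffle(buffer_size=10000)
    mnist = mnist.batch(batch_size=32, drop_remainder=True)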

@@ -1409,9 +1409,9 @@ class PhaseVocoder(AudioTensorOperation):
Examples:
>>> import numpy as np
>>>
- >>> waveform = np.random.randn(2, 44, 10, 2)
+ >>> waveform = np.random.random([2, 44, 10, 2])
>>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
- >>> phase_advance = np.random.randn(44, 1)
+ >>> phase_advance = np.random.random([44, 1])
>>> transforms = [audio.PhaseVocoder(rate=2, phase_advance=phase_advance)]
>>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
"""
@@ -1661,7 +1661,7 @@ class TimeStretch(AudioTensorOperation):
Examples:
>>> import numpy as np
>>>
- >>> waveform = np.random.random([1, 30])
+ >>> waveform = np.random.random([44, 10, 2])
>>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
>>> transforms = [audio.TimeStretch()]
>>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
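
TimeStretch consumes the same (..., freq, num_frame, complex=2) layout, which is why the example's shape changes from [1, 30] to [44, 10, 2]. A hedged numpy sketch of building such an array from a real signal (the framing below is a toy STFT, for shape illustration only):

    import numpy as np

    signal = np.random.random(2048).astype(np.float32)
    frames = signal.reshape(-1, 256)               # (num_frame=8, frame_len=256)
    spec = np.fft.rfft(frames, axis=-1).T          # (freq=129, num_frame=8), complex
    # Pack real/imag parts into a trailing axis of size 2.
    spec_2ch = np.stack([spec.real, spec.imag], axis=-1).astype(np.float32)
    print(spec_2ch.shape)                          # (129, 8, 2)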

@@ -574,11 +574,12 @@ class Dataset:
num_parallel_workers (int, optional): Number of workers(threads) to process the dataset in parallel
(default=None).
per_batch_map (callable, optional): Per batch map callable (default=None). A callable which takes
- (list[Tensor], list[Tensor], ..., BatchInfo) as input parameters. Each list[Tensor] represents a batch
- of Tensors on a given column. The number of lists should match with the number of entries in
- input_columns. The last parameter of the callable should always be a BatchInfo object. Per_batch_map
- should return (list[Tensor], list[Tensor], ...). The length of each list in output should be the same as
- the input. output_columns is required if the number of output lists is different from input.
+ (list[numpy.ndarray], list[numpy.ndarray], ..., BatchInfo) as input parameters. Each
+ list[numpy.ndarray] represents a batch of numpy.ndarray on a given column. The number of lists should
+ match with the number of entries in input_columns. The last parameter of the callable should always be
+ a BatchInfo object. Per_batch_map should return (list[numpy.ndarray], list[numpy.ndarray], ...). The
+ length of each list in output should be the same as the input. output_columns is required if the number
+ of output lists is different from input.
input_columns (Union[str, list[str]], optional): List of names of the input columns. The size of the list
should match with signature of per_batch_map callable (default=None).
output_columns (Union[str, list[str]], optional): List of names assigned to the columns
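
When per_batch_map emits a different number of columns than it receives, output_columns supplies the new names. A minimal sketch under assumed, illustrative names ("feat", "min", "max"):

    import numpy as np
    import mindspore.dataset as ds

    def split_stats(feats, batch_info):
        # One input column in, two output columns out: per-sample min and max.
        mins = [np.min(x, keepdims=True) for x in feats]
        maxs = [np.max(x, keepdims=True) for x in feats]
        return (mins, maxs)

    data = np.random.random((8, 4)).astype(np.float32)
    dataset = ds.NumpySlicesDataset(data, column_names=["feat"], shuffle=False)
    # column_order pins the final column layout once the column set changes.
    dataset = dataset.batch(batch_size=4, per_batch_map=split_stats,
                            input_columns=["feat"], output_columns=["min", "max"],
                            column_order=["min", "max"])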

@@ -488,7 +488,7 @@ class HWC2CHW(py_transforms.PyTensorOperation):
Examples:
>>> from mindspore.dataset.transforms.py_transforms import Compose
>>>
- >>> transforms_list = Compose([py_vision.Decode(),
+ >>> transforms_list = Compose([c_vision.Decode(),
... py_vision.HWC2CHW()])
>>> # apply the transform to dataset through map function
>>> image_folder_dataset = image_folder_dataset.map(operations=transforms_list,
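
A plausible reading of this fix: py_vision.Decode yields a PIL Image, while py_transforms.HWC2CHW transposes a numpy array in HWC layout, which is what c_vision.Decode produces, so the corrected Compose chains types that actually match.
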
@@ -627,11 +627,19 @@ class MixUp(py_transforms.PyTensorOperation):
``CPU``
Examples:
- >>> # Setup multi-batch mixup transformation
- >>> transform = [py_vision.MixUp(batch_size=16, alpha=0.2, is_single=False)]
- >>> # Apply the transform to the dataset through dataset.map()
- >>> image_folder_dataset = image_folder_dataset.map(input_columns="image",
- ... operations=transform)
+ >>> # first decode the image
+ >>> image_folder_dataset = image_folder_dataset.map(operations=c_vision.Decode(),
+ ... input_columns="image")
+ >>> # then one-hot encode the label
+ >>> image_folder_dataset = image_folder_dataset.map(operations=c_transforms.OneHot(10),
+ ... input_columns="label")
+ >>> # batch the samples
+ >>> batch_size = 4
+ >>> image_folder_dataset = image_folder_dataset.batch(batch_size=batch_size)
+ >>> # finally mix up the images and labels
+ >>> image_folder_dataset = image_folder_dataset.map(
+ ...     operations=py_vision.MixUp(batch_size=batch_size, alpha=0.2),
+ ...     input_columns=["image", "label"])
"""
@check_mix_up
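
The reworked example decodes and one-hot encodes before batching because the Python MixUp transform operates on an entire batch of images together with their one-hot labels; with the default is_single=True the batch is mixed with itself, so no second batch is needed.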