diff --git a/docs/api/api_python/dataset/mindspore.dataset.GeneratorDataset.rst b/docs/api/api_python/dataset/mindspore.dataset.GeneratorDataset.rst
index 40ba8830d80..c63d4c9784f 100644
--- a/docs/api/api_python/dataset/mindspore.dataset.GeneratorDataset.rst
+++ b/docs/api/api_python/dataset/mindspore.dataset.GeneratorDataset.rst
@@ -36,6 +36,12 @@
         - **ValueError** - `shard_id` 参数错误,小于0或者大于等于 `num_shards` 。
 
     .. note::
+        - 如果配置 `python_multiprocessing=True(默认值:True)` 和 `num_parallel_workers>1(默认值:1)` 表示启动了多进程方式进行数据load加速,
+          此时随着数据集迭代,子进程的内存占用会逐渐增加,主要是因为自定义数据集的子进程以 Copy-On-Write 的方式获取主进程中的成员变量。
+          举例:如果自定义数据集 `__init__` 函数中包含大量成员变量数据(例如:在数据集构建时加载了一个非常大的文件名列表)并且使用了多进程方式,
+          那这可能会导致产生OOM的问题(总内存的预估使用量是:(子进程数量 + 1) * 父进程的内存大小)。最简单的解决方法是成员变量用非引用数据类型
+          (如:Pandas、Numpy或PyArrow对象)替换Python对象(如:list / dict / int / float / string等),或者配置 `python_multiprocessing=False`
+          使用多线程方式。
        - `source` 参数接收用户自定义的Python函数(PyFuncs),不要将 `mindspore.nn` 和 `mindspore.ops` 目录下或其他的网络计算算子添加
          到 `source` 中。
        - 此数据集可以指定参数 `sampler` ,但参数 `sampler` 和参数 `shuffle` 的行为是互斥的。下表展示了几种合法的输入参数组合及预期的行为。
diff --git a/docs/api/api_python/mindrecord/mindspore.mindrecord.ImageNetToMR.rst b/docs/api/api_python/mindrecord/mindspore.mindrecord.ImageNetToMR.rst
index a78863e3ac0..a53601d2f36 100644
--- a/docs/api/api_python/mindrecord/mindspore.mindrecord.ImageNetToMR.rst
+++ b/docs/api/api_python/mindrecord/mindspore.mindrecord.ImageNetToMR.rst
@@ -4,7 +4,7 @@
     将ImageNet数据集转换为MindRecord格式数据集。
 
     参数:
-        - **map_file** (str) - 标签映射文件的路径。该文件可通过命令: `ls -l [image_dir] | grep -vE "总用量|total|\." | awk -F " " '{print $9, NR-1;}' > [file_path]` 生成,其中 `image_dir` 为ImageNet数据集的目录路径, `file_path` 为生成的 `map_file` 文件 。 `map_file` 文件内容示例如下:
+        - **map_file** (str) - 标签映射文件的路径。该文件可通过命令: `ls -l [image_dir] | grep -vE "总用量|total|\\\." | awk -F " " '{print $9, NR-1;}' > [file_path]` 生成,其中 `image_dir` 为ImageNet数据集的目录路径, `file_path` 为生成的 `map_file` 文件 。 `map_file` 文件内容示例如下:
 
     .. code-block::
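The `map_file` bullet above documents a shell pipeline for building the label-mapping file. For readers who prefer to generate the same `<class_dir> <index>` lines from Python, a minimal sketch follows; it is illustrative only, `write_imagenet_map_file` is a hypothetical helper (not a MindSpore API), it assumes one sub-directory per class under `image_dir`, and the paths in the usage comment are placeholders:

import os

def write_imagenet_map_file(image_dir, file_path):
    """Write one '<class_dir> <index>' line per class directory, like the documented shell pipeline."""
    class_dirs = sorted(d for d in os.listdir(image_dir)
                        if os.path.isdir(os.path.join(image_dir, d)))
    with open(file_path, "w") as f:
        for index, class_dir in enumerate(class_dirs):
            f.write("{} {}\n".format(class_dir, index))

# Hypothetical usage:
# write_imagenet_map_file("./imagenet/train", "./labels_map.txt")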
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/image_utils.cc b/mindspore/ccsrc/minddata/dataset/kernels/image/image_utils.cc
old mode 100755
new mode 100644
index 95b891d579b..ad1c3f46068
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/image_utils.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/image_utils.cc
@@ -320,24 +320,34 @@ Status Resize(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *out
   }
 }
 
+const unsigned char kJpegMagic[] = "\xFF\xD8\xFF";
+constexpr dsize_t kJpegMagicLen = 3;
+const unsigned char kPngMagic[] = "\x89\x50\x4E\x47";
+constexpr dsize_t kPngMagicLen = 4;
+
 bool IsNonEmptyJPEG(const std::shared_ptr<Tensor> &input) {
-  const unsigned char kJpegMagic[] = "\xFF\xD8\xFF";
-  constexpr dsize_t kJpegMagicLen = 3;
   return input->SizeInBytes() > kJpegMagicLen && memcmp(input->GetBuffer(), kJpegMagic, kJpegMagicLen) == 0;
 }
 
 bool IsNonEmptyPNG(const std::shared_ptr<Tensor> &input) {
-  const unsigned char kPngMagic[] = "\x89\x50\x4E\x47";
-  constexpr dsize_t kPngMagicLen = 4;
   return input->SizeInBytes() > kPngMagicLen && memcmp(input->GetBuffer(), kPngMagic, kPngMagicLen) == 0;
 }
 
 Status Decode(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
+  RETURN_IF_NOT_OK(CheckUnsupportedImage(input));
+
+  Status ret;
   if (IsNonEmptyJPEG(input)) {
-    return JpegCropAndDecode(input, output);
+    ret = JpegCropAndDecode(input, output);
   } else {
-    return DecodeCv(input, output);
+    ret = DecodeCv(input, output);
   }
+
+  // decode failed, dump the image to disk for inspection
+  if (ret != Status::OK()) {
+    return DumpImageAndAppendStatus(input, ret);
+  }
+  return ret;
 }
 
 Status DecodeCv(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
@@ -2590,5 +2600,82 @@ Status WritePng(const std::string &filename, const std::shared_ptr<Tensor> &imag
   fs.close();
   return Status::OK();
 }
+
+// support list
+const unsigned char kBmpMagic[] = "\x42\x4D";
+constexpr dsize_t kBmpMagicLen = 2;
+const unsigned char kTiffMagic1[] = "\x4D\x4D";
+const unsigned char kTiffMagic2[] = "\x49\x49";
+constexpr dsize_t kTiffMagicLen = 2;
+
+Status DumpImageAndAppendStatus(const std::shared_ptr<Tensor> &image, const Status &status) {
+  Status local_status = status;
+  std::string file_name = "./abnormal_image.";
+  std::string file_suffix = "";
+  std::string error_info = local_status.GetErrDescription();
+  if (image->SizeInBytes() == 0) {
+    return local_status;
+  }
+
+  if (memcmp(image->GetBuffer(), kJpegMagic, kJpegMagicLen) == 0) {  // support
+    file_suffix = "jpg";
+  } else if (memcmp(image->GetBuffer(), kPngMagic, kPngMagicLen) == 0) {  // support
+    file_suffix = "png";
+  } else if (memcmp(image->GetBuffer(), kBmpMagic, kBmpMagicLen) == 0) {  // support
+    file_suffix = "bmp";
+  } else if (memcmp(image->GetBuffer(), kTiffMagic1, kTiffMagicLen) == 0 ||  // support
+             memcmp(image->GetBuffer(), kTiffMagic2, kTiffMagicLen) == 0) {
+    file_suffix = "tif";
+  } else {
+    file_suffix = "exception";
+    error_info += " Unknown image type.";
+  }
+
+  auto ret = WriteFile(file_name + file_suffix, image);
+  if (ret == Status::OK()) {
+    error_info += " Dump the abnormal image to [" + (file_name + file_suffix) +
+                  "]. You can check this image first through the image viewer. If you find that " +
+                  "the image is abnormal, delete it from the dataset and re-run.";
+  }
+  local_status.SetErrDescription(error_info);
+  return local_status;
+}
+
+// unsupported list
+const unsigned char kGifMagic[] = "\x47\x49\x46";
+constexpr dsize_t kGifMagicLen = 3;
+const unsigned char kWebpMagic[] = "\x00\x57\x45\x42";
+constexpr dsize_t kWebpMagicLen = 4;
+
+Status CheckUnsupportedImage(const std::shared_ptr<Tensor> &image) {
+  bool unsupport_flag = false;
+
+  std::string file_name = "./unsupported_image.";
+  std::string file_suffix = "";
+  if (image->SizeInBytes() == 0) {
+    RETURN_STATUS_UNEXPECTED("Image file size is 0.");
+  }
+
+  if (memcmp(image->GetBuffer(), kGifMagic, kGifMagicLen) == 0) {  // unsupported
+    file_suffix = "gif";
+    unsupport_flag = true;
+  } else if (memcmp(image->GetBuffer() + 7, kWebpMagic, kWebpMagicLen) == 0) {  // unsupported: skip the 7 bytes
+    file_suffix = "webp";
+    unsupport_flag = true;
+  }
+
+  if (unsupport_flag) {
+    auto ret = WriteFile(file_name + file_suffix, image);
+    if (ret == Status::OK()) {
+      RETURN_STATUS_UNEXPECTED("Unsupported image type [" + file_suffix + "] and dump the image to [" +
+                               (file_name + file_suffix) + "]. Please delete it from the dataset and re-run.");
+    } else {
+      ret.SetErrDescription("Unsupported image type [" + file_suffix + "], but dump the image failed. " +
+                            "Error info: " + ret.GetErrDescription());
+      return ret;
+    }
+  }
+  return Status::OK();
+}
 }  // namespace dataset
 }  // namespace mindspore
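The decode path above now rejects GIF and WebP buffers up front and, when a decode fails, classifies the buffer by its magic bytes before dumping it to disk. The Python sketch below mirrors those checks for readers who want to pre-filter their own data; it is a rough illustration of the constants in image_utils.cc, `sniff_image_format` is a hypothetical helper (not a MindSpore API), and the WebP comparison starts at offset 7 exactly as in the C++ check:

# Illustrative re-implementation of the magic-number checks used above.
JPEG_MAGIC = b"\xFF\xD8\xFF"
PNG_MAGIC = b"\x89\x50\x4E\x47"
BMP_MAGIC = b"\x42\x4D"
TIFF_MAGIC = (b"\x4D\x4D", b"\x49\x49")
GIF_MAGIC = b"\x47\x49\x46"
WEBP_MAGIC = b"\x00\x57\x45\x42"  # matched at offset 7, as in the C++ code

def sniff_image_format(buffer: bytes) -> str:
    """Return a rough format tag for a raw image buffer."""
    if buffer.startswith(JPEG_MAGIC):
        return "jpg"    # decoded by JpegCropAndDecode
    if buffer.startswith(PNG_MAGIC):
        return "png"    # decoded by OpenCV
    if buffer.startswith(BMP_MAGIC):
        return "bmp"    # decoded by OpenCV
    if buffer.startswith(TIFF_MAGIC):
        return "tif"    # decoded by OpenCV
    if buffer.startswith(GIF_MAGIC):
        return "gif"    # rejected by CheckUnsupportedImage
    if len(buffer) > 10 and buffer[7:11] == WEBP_MAGIC:
        return "webp"   # rejected by CheckUnsupportedImage
    return "unknown"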
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/image_utils.h b/mindspore/ccsrc/minddata/dataset/kernels/image/image_utils.h
index 76cea0bf5f8..dc258c9b0a6 100755
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/image_utils.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/image_utils.h
@@ -557,6 +557,17 @@ Status WriteJpeg(const std::string &filename, const std::shared_ptr<Tensor> &ima
 /// \param[in] compression_level The compression level for PNG file, in range of [0, 9]. Default: 6.
 /// \return Status code.
 Status WritePng(const std::string &filename, const std::shared_ptr<Tensor> &image, int compression_level = 6);
+
+/// \brief Dump the abnormal image to disk so that the user can check it.
+/// \param[in] image The data Tensor.
+/// \param[in] status The previous error status, to which more information is appended.
+/// \return Status code.
+Status DumpImageAndAppendStatus(const std::shared_ptr<Tensor> &image, const Status &status);
+
+/// \brief Check for an unsupported image and dump it to disk.
+/// \param[in] image The data Tensor.
+/// \return Status code.
+Status CheckUnsupportedImage(const std::shared_ptr<Tensor> &image);
 }  // namespace dataset
 }  // namespace mindspore
 #endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IMAGE_IMAGE_UTILS_H_
diff --git a/mindspore/python/mindspore/dataset/engine/datasets_user_defined.py b/mindspore/python/mindspore/dataset/engine/datasets_user_defined.py
index 42997393eac..61a770c1857 100644
--- a/mindspore/python/mindspore/dataset/engine/datasets_user_defined.py
+++ b/mindspore/python/mindspore/dataset/engine/datasets_user_defined.py
@@ -555,6 +555,17 @@ class GeneratorDataset(MappableDataset, UnionBaseDataset):
         ValueError: If `shard_id` is invalid (< 0 or >= `num_shards`).
 
     Note:
+        - If `python_multiprocessing=True` (default: True) and `num_parallel_workers>1` (default: 1) are set,
+          the multi-process mode is used to accelerate data loading. In this case, the memory usage of the
+          subprocesses will gradually increase as the dataset is iterated, mainly because the subprocesses of
+          the user-defined dataset obtain the member variables of the main process in a Copy-On-Write manner.
+          For example, if the `__init__` of the user-defined dataset holds a large amount of member variable
+          data (e.g. a very large file name list is loaded when the dataset is constructed) and the
+          multi-process mode is used, this may cause an OOM problem (the estimated total memory usage is:
+          `(num_parallel_workers + 1) * size of the parent process` ). The simplest solution is to replace the
+          Python objects (such as list / dict / int / float / string) held by member variables with
+          non-referenced data types (such as Pandas, NumPy or PyArrow objects), or to set
+          `python_multiprocessing=False` to use the multi-threading mode instead.
        - Input `source` accepts user-defined Python functions (PyFuncs), Do not add network computing
          operators from mindspore.nn and mindspore.ops or others into this `source` .
        - This dataset can take in a `sampler` . `sampler` and `shuffle` are mutually exclusive.
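The note added above recommends keeping large member variables in reference-free containers so that Copy-On-Write does not gradually duplicate them into every worker process. A minimal sketch of that advice, assuming a hypothetical user-defined `MyDataset` whose file list is known at construction time (the usage lines are commented out because the file names are placeholders):

import numpy as np
from mindspore.dataset import GeneratorDataset

class MyDataset:
    """User-defined dataset; the file list is kept as a NumPy array, not a Python list."""

    def __init__(self, file_names):
        # A NumPy array of fixed-width strings holds no per-element Python objects,
        # so touching it in a worker does not trigger copy-on-write of reference counts.
        self._file_names = np.array(file_names)

    def __getitem__(self, index):
        # Load and return one sample; np.fromfile is used here only as an example.
        data = np.fromfile(str(self._file_names[index]), dtype=np.uint8)
        return (data,)

    def __len__(self):
        return len(self._file_names)

# Hypothetical usage: memory stays bounded under multi-process loading, or set
# python_multiprocessing=False to fall back to the multi-threading mode.
# dataset = GeneratorDataset(MyDataset(["a.bin", "b.bin"]), column_names=["data"],
#                            num_parallel_workers=4, python_multiprocessing=True)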
diff --git a/mindspore/python/mindspore/mindrecord/tools/imagenet_to_mr.py b/mindspore/python/mindspore/mindrecord/tools/imagenet_to_mr.py
index 82f7005adeb..be772dbab3d 100644
--- a/mindspore/python/mindspore/mindrecord/tools/imagenet_to_mr.py
+++ b/mindspore/python/mindspore/mindrecord/tools/imagenet_to_mr.py
@@ -32,9 +32,9 @@ class ImageNetToMR:
     Args:
         map_file (str): The map file that indicates label. This file can be generated by command
-            `ls -l [image_dir] | grep -vE "total|\." | awk -F " " '{print $9, NR-1;}' > [file_path]` , where `image_dir`
-            is image directory contains n02119789, n02100735, n02110185 and n02096294 directory and `file_path` is
-            the generated `map_file` . An example of `map_file` is as below:
+            `ls -l [image_dir] | grep -vE "total|\\\." | awk -F " " '{print $9, NR-1;}' > [file_path]` ,
+            where `image_dir` is the image directory that contains the n02119789, n02100735, n02110185 and
+            n02096294 directories, and `file_path` is the generated `map_file` . An example of `map_file` is as below:
 
         .. code-block::
diff --git a/tests/ut/data/dataset/testFormats/abnormal_apple.jpg b/tests/ut/data/dataset/testFormats/abnormal_apple.jpg
new file mode 100644
index 00000000000..8062f048b66
Binary files /dev/null and b/tests/ut/data/dataset/testFormats/abnormal_apple.jpg differ
diff --git a/tests/ut/python/dataset/test_decode.py b/tests/ut/python/dataset/test_decode.py
index 7e8549fa1d7..4fa2858c7fc 100644
--- a/tests/ut/python/dataset/test_decode.py
+++ b/tests/ut/python/dataset/test_decode.py
@@ -15,6 +15,9 @@
 """
 Testing Decode op in DE
 """
+import glob
+import os
+
 import cv2
 import numpy as np
 import pytest
@@ -85,16 +88,35 @@ def test_decode_op_support_format():
 
     # gif: Opencv[×] Pillow[√]
     gif_image = np.fromfile("../data/dataset/testFormats/apple.gif", np.uint8)
-    with pytest.raises(RuntimeError):
+    with pytest.raises(RuntimeError, match="Unsupported image type"):
         c_decode(gif_image)
     p_decode(gif_image)
 
+    assert len(glob.glob('unsupported_image.gif')) == 1
+    # delete the dump file which is not supported
+    os.remove(glob.glob('unsupported_image.gif')[0])
+
     # webp: Opencv[×] Pillow[√]
     webp_image = np.fromfile("../data/dataset/testFormats/apple.webp", np.uint8)
-    with pytest.raises(RuntimeError):
+    with pytest.raises(RuntimeError, match="Unsupported image type"):
         c_decode(webp_image)
     p_decode(webp_image)
 
+    assert len(glob.glob('unsupported_image.webp')) == 1
+    # delete the dump file which is not supported
+    os.remove(glob.glob('unsupported_image.webp')[0])
+
+    # abnormal image: Opencv[x] Pillow[x]
+    abnormal_image = np.fromfile("../data/dataset/testFormats/abnormal_apple.jpg", np.uint8)
+    with pytest.raises(RuntimeError, match="Dump the abnormal image to"):
+        c_decode(abnormal_image)
+    with pytest.raises(ValueError, match="image file is truncated"):
+        p_decode(abnormal_image)
+
+    assert len(glob.glob('abnormal_image.jpg')) == 1
+    # delete the dump file which is abnormal
+    os.remove(glob.glob('abnormal_image.jpg')[0])
+
 
 class ImageDataset:
     """Custom class to generate and read image dataset"""
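The updated test pins down the user-visible contract of the new error paths: decoding a GIF or WebP buffer raises a RuntimeError that names the dumped `unsupported_image.*` file, and a corrupt JPEG raises one that points at `abnormal_image.jpg`. A hedged sketch of how application code might report such a failure and clean up the dump afterwards; it assumes the eager `mindspore.dataset.vision.Decode()` call style exercised by the test, `try_decode` is a hypothetical helper, and the dump-file names follow the constants in image_utils.cc:

import glob
import os

import numpy as np
import mindspore.dataset.vision as vision

def try_decode(image_path):
    """Decode one image file, reporting and removing any dump left behind by a failure."""
    buffer = np.fromfile(image_path, np.uint8)
    try:
        return vision.Decode()(buffer)
    except RuntimeError as err:
        # The error message contains the dump location, e.g. ./abnormal_image.jpg
        print("Decode failed for {}: {}".format(image_path, err))
        for dump in glob.glob("./abnormal_image.*") + glob.glob("./unsupported_image.*"):
            os.remove(dump)
        return None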