!46343 dump the image when decode failed
Merge pull request !46343 from guozhijian/dump_image_when_decode_failed
This commit is contained in:
commit
85056c10ff
|
@ -36,6 +36,12 @@
|
|||
- **ValueError** - `shard_id` 参数错误,小于0或者大于等于 `num_shards` 。
|
||||
|
||||
.. note::
|
||||
- 如果配置 `python_multiprocessing=True(默认值:True)` 和 `num_parallel_workers>1(默认值:1)` 表示启动了多进程方式进行数据load加速,
|
||||
此时随着数据集迭代,子进程的内存占用会逐渐增加,主要是因为自定义数据集的子进程以 Copy-On-Write 的方式获取主进程中的成员变量。
|
||||
举例:如果自定义数据集 `__init__` 函数中包含大量成员变量数据(例如:在数据集构建时加载了一个非常大的文件名列表)并且使用了多进程方式,
|
||||
那这可能会导致产生OOM的问题(总内存的预估使用量是:(子进程数量 + 1) * 父进程的内存大小)。最简单的解决方法是成员变量用非引用数据类型
|
||||
(如:Pandas、Numpy或PyArrow对象)替换Python对象(如:list / dict / int / float / string等),或者配置 `python_multiprocessing=False`
|
||||
使用多线程方式。
|
||||
- `source` 参数接收用户自定义的Python函数(PyFuncs),不要将 `mindspore.nn` 和 `mindspore.ops` 目录下或其他的网络计算算子添加
|
||||
到 `source` 中。
|
||||
- 此数据集可以指定参数 `sampler` ,但参数 `sampler` 和参数 `shuffle` 的行为是互斥的。下表展示了几种合法的输入参数组合及预期的行为。
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
将ImageNet数据集转换为MindRecord格式数据集。
|
||||
|
||||
参数:
|
||||
- **map_file** (str) - 标签映射文件的路径。该文件可通过命令: `ls -l [image_dir] | grep -vE "总用量|total|\." | awk -F " " '{print $9, NR-1;}' > [file_path]` 生成,其中 `image_dir` 为ImageNet数据集的目录路径, `file_path` 为生成的 `map_file` 文件 。 `map_file` 文件内容示例如下:
|
||||
- **map_file** (str) - 标签映射文件的路径。该文件可通过命令: `ls -l [image_dir] | grep -vE "总用量|total|\\\." | awk -F " " '{print $9, NR-1;}' > [file_path]` 生成,其中 `image_dir` 为ImageNet数据集的目录路径, `file_path` 为生成的 `map_file` 文件 。 `map_file` 文件内容示例如下:
|
||||
|
||||
.. code-block::
|
||||
|
||||
|
|
|
@ -320,24 +320,34 @@ Status Resize(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *out
|
|||
}
|
||||
}
|
||||
|
||||
const unsigned char kJpegMagic[] = "\xFF\xD8\xFF";
|
||||
constexpr dsize_t kJpegMagicLen = 3;
|
||||
const unsigned char kPngMagic[] = "\x89\x50\x4E\x47";
|
||||
constexpr dsize_t kPngMagicLen = 4;
|
||||
|
||||
bool IsNonEmptyJPEG(const std::shared_ptr<Tensor> &input) {
|
||||
const unsigned char kJpegMagic[] = "\xFF\xD8\xFF";
|
||||
constexpr dsize_t kJpegMagicLen = 3;
|
||||
return input->SizeInBytes() > kJpegMagicLen && memcmp(input->GetBuffer(), kJpegMagic, kJpegMagicLen) == 0;
|
||||
}
|
||||
|
||||
bool IsNonEmptyPNG(const std::shared_ptr<Tensor> &input) {
|
||||
const unsigned char kPngMagic[] = "\x89\x50\x4E\x47";
|
||||
constexpr dsize_t kPngMagicLen = 4;
|
||||
return input->SizeInBytes() > kPngMagicLen && memcmp(input->GetBuffer(), kPngMagic, kPngMagicLen) == 0;
|
||||
}
|
||||
|
||||
// Decode a raw image buffer into an image tensor.
// Known-unsupported formats (GIF/WebP) are rejected up front; when decoding
// fails, the raw buffer is dumped to disk and the dump location is appended to
// the returned error so the user can inspect the offending image.
// \param[in] input The raw (encoded) image tensor.
// \param[out] output The decoded image tensor.
// \return Status code.
Status Decode(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  RETURN_IF_NOT_OK(CheckUnsupportedImage(input));

  // Keep the decode result instead of returning immediately so a failure can
  // be routed through DumpImageAndAppendStatus below.
  Status ret;
  if (IsNonEmptyJPEG(input)) {
    ret = JpegCropAndDecode(input, output);
  } else {
    ret = DecodeCv(input, output);
  }

  // decode failed and dump it
  if (ret != Status::OK()) {
    return DumpImageAndAppendStatus(input, ret);
  }
  return ret;
}
|
||||
|
||||
Status DecodeCv(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
|
||||
|
@ -2590,5 +2600,82 @@ Status WritePng(const std::string &filename, const std::shared_ptr<Tensor> &imag
|
|||
fs.close();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// support list
|
||||
const unsigned char kBmpMagic[] = "\x42\x4D";
|
||||
constexpr dsize_t kBmpMagicLen = 2;
|
||||
const unsigned char kTiffMagic1[] = "\x4D\x4D";
|
||||
const unsigned char kTiffMagic2[] = "\x49\x49";
|
||||
constexpr dsize_t kTiffMagicLen = 2;
|
||||
|
||||
// Dump an image that failed to decode into the current directory and append
// the dump location to the error description so the user can inspect the file.
// \param[in] image The raw (encoded) image tensor that failed to decode.
// \param[in] status The previous decode error to be enriched and returned.
// \return The enriched status (always a failure, mirroring the input status).
Status DumpImageAndAppendStatus(const std::shared_ptr<Tensor> &image, const Status &status) {
  Status local_status = status;
  const std::string file_name = "./abnormal_image.";
  std::string file_suffix;
  std::string error_info = local_status.GetErrDescription();
  // Nothing to dump for an empty buffer.
  if (image->SizeInBytes() == 0) {
    return local_status;
  }

  // Compare the buffer head against a magic number, guarding against buffers
  // shorter than the magic itself (an unconditional memcmp could read past the
  // end of a 1-3 byte buffer).
  auto matches_magic = [&image](const unsigned char *magic, dsize_t magic_len) {
    return image->SizeInBytes() >= magic_len && memcmp(image->GetBuffer(), magic, magic_len) == 0;
  };

  if (matches_magic(kJpegMagic, kJpegMagicLen)) {  // support
    file_suffix = "jpg";
  } else if (matches_magic(kPngMagic, kPngMagicLen)) {  // support
    file_suffix = "png";
  } else if (matches_magic(kBmpMagic, kBmpMagicLen)) {  // support
    file_suffix = "bmp";
  } else if (matches_magic(kTiffMagic1, kTiffMagicLen) ||  // support
             matches_magic(kTiffMagic2, kTiffMagicLen)) {
    file_suffix = "tif";
  } else {
    file_suffix = "exception";
    error_info += " Unknown image type.";
  }

  // Best effort: only advertise the dump file when the write actually succeeded.
  auto ret = WriteFile(file_name + file_suffix, image);
  if (ret == Status::OK()) {
    error_info += " Dump the abnormal image to [" + (file_name + file_suffix) +
                  "]. You can check this image first through the image viewer. If you find that " +
                  "the image is abnormal, delete it from the dataset and re-run.";
  }
  local_status.SetErrDescription(error_info);
  return local_status;
}
|
||||
|
||||
// unsupported list
|
||||
const unsigned char kGifMagic[] = "\x47\x49\x46";
|
||||
constexpr dsize_t kGifMagicLen = 3;
|
||||
const unsigned char kWebpMagic[] = "\x00\x57\x45\x42";
|
||||
constexpr dsize_t kWebpMagicLen = 4;
|
||||
|
||||
// Reject image formats that Decode cannot handle (GIF/WebP). The offending
// image is dumped to the current directory so the user can find and delete it.
// \param[in] image The raw (encoded) image tensor.
// \return OK when the image is not a known-unsupported format; an error
//         describing the unsupported type (and dump location) otherwise.
Status CheckUnsupportedImage(const std::shared_ptr<Tensor> &image) {
  bool unsupport_flag = false;

  const std::string file_name = "./unsupported_image.";
  std::string file_suffix;
  const dsize_t buffer_size = image->SizeInBytes();
  if (buffer_size == 0) {
    RETURN_STATUS_UNEXPECTED("Image file size is 0.");
  }

  // Guard every memcmp with a length check so tiny buffers cannot trigger an
  // out-of-bounds read (previously only the empty-buffer case was rejected;
  // the WebP branch reads bytes 7..10 and needs at least 11 bytes).
  constexpr dsize_t kWebpMagicOffset = 7;  // skip "RIFF" and 3 bytes of the size field
  if (buffer_size >= kGifMagicLen && memcmp(image->GetBuffer(), kGifMagic, kGifMagicLen) == 0) {  // unsupported
    file_suffix = "gif";
    unsupport_flag = true;
  } else if (buffer_size >= kWebpMagicOffset + kWebpMagicLen &&  // unsupported
             memcmp(image->GetBuffer() + kWebpMagicOffset, kWebpMagic, kWebpMagicLen) == 0) {
    file_suffix = "webp";
    unsupport_flag = true;
  }

  if (unsupport_flag) {
    // When the dump succeeds, point the user at the file; when it fails,
    // report both the unsupported type and the dump failure.
    auto ret = WriteFile(file_name + file_suffix, image);
    if (ret == Status::OK()) {
      RETURN_STATUS_UNEXPECTED("Unsupported image type [" + file_suffix + "] and dump the image to [" +
                               (file_name + file_suffix) + "]. Please delete it from the dataset and re-run.");
    } else {
      ret.SetErrDescription("Unsupported image type [" + file_suffix + "], but dump the image failed. " +
                            "Error info: " + ret.GetErrDescription());
      return ret;
    }
  }
  return Status::OK();
}
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -557,6 +557,17 @@ Status WriteJpeg(const std::string &filename, const std::shared_ptr<Tensor> &ima
|
|||
/// \param[in] compression_level The compression level for PNG file, in range of [0, 9]. Default: 6.
|
||||
/// \return Status code.
|
||||
Status WritePng(const std::string &filename, const std::shared_ptr<Tensor> &image, int compression_level = 6);
|
||||
|
||||
/// \brief Dump the abnormal image to disk and facilitate user to check it.
|
||||
/// \param[in] image The data Tensor.
|
||||
/// \param[in] status The previous error status which is needed to append more info.
|
||||
/// \return Status code.
|
||||
Status DumpImageAndAppendStatus(const std::shared_ptr<Tensor> &image, const Status &status);
|
||||
|
||||
/// \brief Check the unsupported image and dump it to disk.
|
||||
/// \param[in] image The data Tensor.
|
||||
/// \return Status code.
|
||||
Status CheckUnsupportedImage(const std::shared_ptr<Tensor> &image);
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IMAGE_IMAGE_UTILS_H_
|
||||
|
|
|
@ -555,6 +555,17 @@ class GeneratorDataset(MappableDataset, UnionBaseDataset):
|
|||
ValueError: If `shard_id` is invalid (< 0 or >= `num_shards`).
|
||||
|
||||
Note:
|
||||
- If you configure `python_multiprocessing=True (default: True)` and `num_parallel_workers>1 (default: 1)`
|
||||
it means that multi-process mode is enabled to accelerate data loading. At this time, as the dataset
|
||||
iterates, the memory consumption of the subprocess will gradually increase, mainly because the subprocess
|
||||
of the user-defined dataset obtains the member variables from the main process in the Copy On Write way.
|
||||
Example: If you define a dataset whose `__init__` function contains a large amount of member variable
|
||||
data (for example, a very large file name list is loaded during the dataset construction) and uses the
|
||||
multi-process mode, which may cause the problem of OOM (the estimated total memory usage is:
|
||||
`(num_parallel_workers+1) * size of the parent process` ). The simplest solution is to replace python objects
|
||||
(such as list/dict/int/float/string) with non-reference data types
|
||||
(such as Pandas, Numpy or PyArrow objects) for member variables, or configure `python_multiprocessing=False`
|
||||
to use multi-threading mode instead.
|
||||
- Input `source` accepts user-defined Python functions (PyFuncs). Do not add network computing operators from
|
||||
mindspore.nn and mindspore.ops or others into this `source` .
|
||||
- This dataset can take in a `sampler` . `sampler` and `shuffle` are mutually exclusive.
|
||||
|
|
|
@ -32,9 +32,9 @@ class ImageNetToMR:
|
|||
|
||||
Args:
|
||||
map_file (str): The map file that indicates label. This file can be generated by command
|
||||
`ls -l [image_dir] | grep -vE "total|\." | awk -F " " '{print $9, NR-1;}' > [file_path]` , where `image_dir`
|
||||
is image directory contains n02119789, n02100735, n02110185 and n02096294 directory and `file_path` is
|
||||
the generated `map_file` . An example of `map_file` is as below:
|
||||
`ls -l [image_dir] | grep -vE "total|\\\." | awk -F " " '{print $9, NR-1;}' > [file_path]` ,
|
||||
where `image_dir` is image directory contains n02119789, n02100735, n02110185 and n02096294 directory
|
||||
and `file_path` is the generated `map_file` . An example of `map_file` is as below:
|
||||
|
||||
.. code-block::
|
||||
|
||||
|
|
Binary file not shown.
After Width: | Height: | Size: 58 KiB |
|
@ -15,6 +15,9 @@
|
|||
"""
|
||||
Testing Decode op in DE
|
||||
"""
|
||||
import glob
|
||||
import os
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
@ -85,16 +88,35 @@ def test_decode_op_support_format():
|
|||
|
||||
# gif: Opencv[×] Pillow[√]
|
||||
gif_image = np.fromfile("../data/dataset/testFormats/apple.gif", np.uint8)
|
||||
with pytest.raises(RuntimeError):
|
||||
with pytest.raises(RuntimeError, match="Unsupported image type"):
|
||||
c_decode(gif_image)
|
||||
p_decode(gif_image)
|
||||
|
||||
assert len(glob.glob('unsupported_image.gif')) == 1
|
||||
# delete the dump file which is not supported
|
||||
os.remove(glob.glob('unsupported_image.gif')[0])
|
||||
|
||||
# webp: Opencv[×] Pillow[√]
|
||||
webp_image = np.fromfile("../data/dataset/testFormats/apple.webp", np.uint8)
|
||||
with pytest.raises(RuntimeError):
|
||||
with pytest.raises(RuntimeError, match="Unsupported image type"):
|
||||
c_decode(webp_image)
|
||||
p_decode(webp_image)
|
||||
|
||||
assert len(glob.glob('unsupported_image.webp')) == 1
|
||||
# delete the dump file which is not supported
|
||||
os.remove(glob.glob('unsupported_image.webp')[0])
|
||||
|
||||
# abnormal image: Opencv[x] Pillow[x]
|
||||
abnormal_image = np.fromfile("../data/dataset/testFormats/abnormal_apple.jpg", np.uint8)
|
||||
with pytest.raises(RuntimeError, match="Dump the abnormal image to"):
|
||||
c_decode(abnormal_image)
|
||||
with pytest.raises(ValueError, match="image file is truncated"):
|
||||
p_decode(abnormal_image)
|
||||
|
||||
assert len(glob.glob('abnormal_image.jpg')) == 1
|
||||
# delete the dump file which is abnormal
|
||||
os.remove(glob.glob('abnormal_image.jpg')[0])
|
||||
|
||||
|
||||
class ImageDataset:
|
||||
"""Custom class to generate and read image dataset"""
|
||||
|
|
Loading…
Reference in New Issue