!46343 dump the image when decode failed

Merge pull request !46343 from guozhijian/dump_image_when_decode_failed
i-robot 2022-12-09 02:26:30 +00:00 committed by Gitee
commit 85056c10ff
8 changed files with 149 additions and 12 deletions

View File

@@ -36,6 +36,12 @@
- **ValueError** - The `shard_id` parameter is invalid (less than 0 or greater than or equal to `num_shards`).
.. note::
- If `python_multiprocessing=True` (default: True) and `num_parallel_workers>1` (default: 1) are configured, the
  multi-process mode is used to accelerate data loading. In this case, as the dataset is iterated, the memory usage
  of the child processes gradually increases, mainly because the child processes of the user-defined dataset obtain
  the member variables of the main process in a Copy-On-Write manner.
  For example: if the `__init__` function of the user-defined dataset contains a large amount of member variable data
  (e.g. a very large list of file names is loaded when the dataset is constructed) and the multi-process mode is used,
  this may lead to an OOM problem (the estimated total memory usage is: (number of child processes + 1) * memory size
  of the parent process). The simplest solution is to replace the Python objects (list / dict / int / float / string,
  etc.) used as member variables with non-referenced data types (Pandas, NumPy or PyArrow objects), or to set
  `python_multiprocessing=False` to use the multi-thread mode.
- The `source` parameter accepts user-defined Python functions (PyFuncs). Do not add network computing operators from
  `mindspore.nn`, `mindspore.ops` or elsewhere into `source`.
- This dataset can be configured with a `sampler` parameter, but `sampler` and `shuffle` are mutually exclusive. The
  table below shows several valid combinations of input parameters and the expected behavior.

View File

@@ -4,7 +4,7 @@
Convert the ImageNet dataset into the MindRecord format.
Args:
- **map_file** (str) - Path of the label map file. The file can be generated by the command: `ls -l [image_dir] | grep -vE "总用量|total|\\\." | awk -F " " '{print $9, NR-1;}' > [file_path]` , where `image_dir` is the directory of the ImageNet dataset and `file_path` is the generated `map_file` . An example of the content of `map_file` is as follows:
.. code-block::

View File

@@ -320,24 +320,34 @@ Status Resize(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *out
}
}
const unsigned char kJpegMagic[] = "\xFF\xD8\xFF";
constexpr dsize_t kJpegMagicLen = 3;
const unsigned char kPngMagic[] = "\x89\x50\x4E\x47";
constexpr dsize_t kPngMagicLen = 4;

bool IsNonEmptyJPEG(const std::shared_ptr<Tensor> &input) {
  return input->SizeInBytes() > kJpegMagicLen && memcmp(input->GetBuffer(), kJpegMagic, kJpegMagicLen) == 0;
}

bool IsNonEmptyPNG(const std::shared_ptr<Tensor> &input) {
  return input->SizeInBytes() > kPngMagicLen && memcmp(input->GetBuffer(), kPngMagic, kPngMagicLen) == 0;
}

Status Decode(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  RETURN_IF_NOT_OK(CheckUnsupportedImage(input));
  Status ret;
  if (IsNonEmptyJPEG(input)) {
    ret = JpegCropAndDecode(input, output);
  } else {
    ret = DecodeCv(input, output);
  }
  // decode failed and dump it
  if (ret != Status::OK()) {
    return DumpImageAndAppendStatus(input, ret);
  }
  return ret;
}

Status DecodeCv(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
@@ -2590,5 +2600,82 @@ Status WritePng(const std::string &filename, const std::shared_ptr<Tensor> &imag
  fs.close();
  return Status::OK();
}
// support list
const unsigned char kBmpMagic[] = "\x42\x4D";
constexpr dsize_t kBmpMagicLen = 2;
const unsigned char kTiffMagic1[] = "\x4D\x4D";
const unsigned char kTiffMagic2[] = "\x49\x49";
constexpr dsize_t kTiffMagicLen = 2;

Status DumpImageAndAppendStatus(const std::shared_ptr<Tensor> &image, const Status &status) {
  Status local_status = status;
  std::string file_name = "./abnormal_image.";
  std::string file_suffix = "";
  std::string error_info = local_status.GetErrDescription();
  if (image->SizeInBytes() == 0) {
    return local_status;
  }
  if (memcmp(image->GetBuffer(), kJpegMagic, kJpegMagicLen) == 0) {  // support
    file_suffix = "jpg";
  } else if (memcmp(image->GetBuffer(), kPngMagic, kPngMagicLen) == 0) {  // support
    file_suffix = "png";
  } else if (memcmp(image->GetBuffer(), kBmpMagic, kBmpMagicLen) == 0) {  // support
    file_suffix = "bmp";
  } else if (memcmp(image->GetBuffer(), kTiffMagic1, kTiffMagicLen) == 0 ||  // support
             memcmp(image->GetBuffer(), kTiffMagic2, kTiffMagicLen) == 0) {
    file_suffix = "tif";
  } else {
    file_suffix = "exception";
    error_info += " Unknown image type.";
  }
  auto ret = WriteFile(file_name + file_suffix, image);
  if (ret == Status::OK()) {
    error_info += " Dump the abnormal image to [" + (file_name + file_suffix) +
                  "]. You can check this image first through the image viewer. If you find that " +
                  "the image is abnormal, delete it from the dataset and re-run.";
  }
  local_status.SetErrDescription(error_info);
  return local_status;
}

// unsupported list
const unsigned char kGifMagic[] = "\x47\x49\x46";
constexpr dsize_t kGifMagicLen = 3;
const unsigned char kWebpMagic[] = "\x00\x57\x45\x42";
constexpr dsize_t kWebpMagicLen = 4;

Status CheckUnsupportedImage(const std::shared_ptr<Tensor> &image) {
  bool unsupport_flag = false;
  std::string file_name = "./unsupported_image.";
  std::string file_suffix = "";
  if (image->SizeInBytes() == 0) {
    RETURN_STATUS_UNEXPECTED("Image file size is 0.");
  }
  if (memcmp(image->GetBuffer(), kGifMagic, kGifMagicLen) == 0) {  // unsupported
    file_suffix = "gif";
    unsupport_flag = true;
  } else if (memcmp(image->GetBuffer() + 7, kWebpMagic, kWebpMagicLen) == 0) {  // unsupported: skip the 7 bytes
    file_suffix = "webp";
    unsupport_flag = true;
  }
  if (unsupport_flag) {
    auto ret = WriteFile(file_name + file_suffix, image);
    if (ret == Status::OK()) {
      RETURN_STATUS_UNEXPECTED("Unsupported image type [" + file_suffix + "] and dump the image to [" +
                               (file_name + file_suffix) + "]. Please delete it from the dataset and re-run.");
    } else {
      ret.SetErrDescription("Unsupported image type [" + file_suffix + "], but dump the image failed. " +
                            "Error info: " + ret.GetErrDescription());
      return ret;
    }
  }
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore
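
To make the new behavior concrete, the following is a minimal Python sketch of what a user sees when decoding a broken JPEG, assuming `mindspore.dataset.vision.Decode` is the eager decode op that reaches the C++ path above and that `corrupted.jpg` is a hypothetical file that starts with the JPEG magic bytes but cannot be decoded:

import os
import numpy as np
import mindspore.dataset.vision as vision

# hypothetical broken JPEG: carries the JPEG magic bytes but fails to decode
raw = np.fromfile("corrupted.jpg", dtype=np.uint8)
try:
    vision.Decode()(raw)
except RuntimeError as err:
    # the error description now names the dumped copy, e.g.
    # "... Dump the abnormal image to [./abnormal_image.jpg]. ..."
    print(err)
    assert os.path.exists("./abnormal_image.jpg")

Unsupported formats (GIF, WebP) follow the same pattern through CheckUnsupportedImage and are dumped to ./unsupported_image.<suffix> instead.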

View File

@@ -557,6 +557,17 @@ Status WriteJpeg(const std::string &filename, const std::shared_ptr<Tensor> &ima
/// \param[in] compression_level The compression level for PNG file, in range of [0, 9]. Default: 6.
/// \return Status code.
Status WritePng(const std::string &filename, const std::shared_ptr<Tensor> &image, int compression_level = 6);

/// \brief Dump the abnormal image to disk so that the user can inspect it.
/// \param[in] image The data Tensor.
/// \param[in] status The previous error status, to which the dump information is appended.
/// \return Status code.
Status DumpImageAndAppendStatus(const std::shared_ptr<Tensor> &image, const Status &status);

/// \brief Check whether the image type is unsupported and, if so, dump the image to disk.
/// \param[in] image The data Tensor.
/// \return Status code.
Status CheckUnsupportedImage(const std::shared_ptr<Tensor> &image);
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IMAGE_IMAGE_UTILS_H_

View File

@@ -555,6 +555,17 @@ class GeneratorDataset(MappableDataset, UnionBaseDataset):
ValueError: If `shard_id` is invalid (< 0 or >= `num_shards`).
Note:
- If `python_multiprocessing=True` (default: True) and `num_parallel_workers>1` (default: 1) are configured,
  the multi-process mode is used to accelerate data loading. In this case, as the dataset is iterated, the
  memory usage of the child processes gradually increases, mainly because the child processes of the
  user-defined dataset obtain the member variables of the main process in a Copy-On-Write manner.
  Example: if the `__init__` function of the user-defined dataset contains a large amount of member variable
  data (for example, a very large file name list is loaded during dataset construction) and the multi-process
  mode is used, this may cause an OOM problem (the estimated total memory usage is:
  `(num_parallel_workers + 1) * size of the parent process` ). The simplest solution is to replace Python
  objects (such as list/dict/int/float/string) with non-referenced data types (such as Pandas, NumPy or
  PyArrow objects) for member variables, or to set `python_multiprocessing=False` to use the multi-thread
  mode; a short sketch follows this note.
- Input `source` accepts user-defined Python functions (PyFuncs). Do not add network computing operators from
  `mindspore.nn`, `mindspore.ops` or other operators into this `source` .
- This dataset can take in a `sampler` . `sampler` and `shuffle` are mutually exclusive.
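
A minimal sketch of the member-variable advice above; the class and file names are hypothetical, and any non-referenced container (Pandas, NumPy, PyArrow) would do:

import numpy as np

class FileListSource:
    """Hypothetical user-defined source for GeneratorDataset."""

    def __init__(self, file_list):
        # Each entry of a plain Python list is a separate Python object; reference
        # counting inside every worker process writes to those objects and forces
        # Copy-On-Write copies of the shared pages, so memory grows per worker.
        #   self.files = list(file_list)
        # A NumPy array keeps the same strings in one contiguous buffer, so the
        # pages stay shared between the main process and the workers.
        self.files = np.array(file_list)

    def __getitem__(self, index):
        # return the raw bytes of one image; decoding happens later in the pipeline
        return (np.fromfile(self.files[index], dtype=np.uint8),)

    def __len__(self):
        return len(self.files)

Such a source is then passed to `GeneratorDataset(source=FileListSource(files), column_names=["image"])` as usual.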

View File

@@ -32,9 +32,9 @@ class ImageNetToMR:
Args:
map_file (str): The map file that indicates the label of each image. This file can be generated by the command
    `ls -l [image_dir] | grep -vE "total|\\\." | awk -F " " '{print $9, NR-1;}' > [file_path]` ,
    where `image_dir` is the image directory containing the n02119789, n02100735, n02110185 and n02096294
    directories and `file_path` is the generated `map_file` . An example of `map_file` is as below:
.. code-block::
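
The hunk ends before the docstring's own example. Purely as an illustration, assuming `image_dir` contains only the four class directories named above and that `ls` lists them alphabetically, the generated `map_file` would look roughly like:

n02096294 0
n02100735 1
n02110185 2
n02119789 3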

Binary file not shown (new image, 58 KiB)

View File

@@ -15,6 +15,9 @@
"""
Testing Decode op in DE
"""
import glob
import os
import cv2
import numpy as np
import pytest
@@ -85,16 +88,35 @@ def test_decode_op_support_format():
    # gif: Opencv[×] Pillow[√]
    gif_image = np.fromfile("../data/dataset/testFormats/apple.gif", np.uint8)
    with pytest.raises(RuntimeError, match="Unsupported image type"):
        c_decode(gif_image)
    p_decode(gif_image)
    assert len(glob.glob('unsupported_image.gif')) == 1
    # delete the dump file which is not supported
    os.remove(glob.glob('unsupported_image.gif')[0])

    # webp: Opencv[×] Pillow[√]
    webp_image = np.fromfile("../data/dataset/testFormats/apple.webp", np.uint8)
    with pytest.raises(RuntimeError, match="Unsupported image type"):
        c_decode(webp_image)
    p_decode(webp_image)
    assert len(glob.glob('unsupported_image.webp')) == 1
    # delete the dump file which is not supported
    os.remove(glob.glob('unsupported_image.webp')[0])

    # abnormal image: Opencv[x] Pillow[x]
    abnormal_image = np.fromfile("../data/dataset/testFormats/abnormal_apple.jpg", np.uint8)
    with pytest.raises(RuntimeError, match="Dump the abnormal image to"):
        c_decode(abnormal_image)
    with pytest.raises(ValueError, match="image file is truncated"):
        p_decode(abnormal_image)
    assert len(glob.glob('abnormal_image.jpg')) == 1
    # delete the dump file which is abnormal
    os.remove(glob.glob('abnormal_image.jpg')[0])

class ImageDataset:
"""Custom class to generate and read image dataset"""