!8370 [MD] Use neon instruction set to accelerate the Substract and Divide ops

From: @jiangzhiwen8 Reviewed-by: @xulei2020 Signed-off-by:
2020-11-13 16:14:00 +08:00 · 2020-11-13 16:14:00 +08:00 · edcb0cd86b
parent 9c8828761d 03bbe5f2a8
commit edcb0cd86b
5 changed files with 319 additions and 244 deletions
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/image_process.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/image_process.cc
@ -19,8 +19,6 @@
 #include <string.h>
 #include <cmath>
 #include <vector>
-#include <algorithm>
-#include <limits>

 namespace mindspore {
 namespace dataset {
@ -549,73 +547,39 @@ template <typename T>
 static void PadWithConstant(const LiteMat &src, LiteMat &dst, const int top, const int bottom, const int left,
                            const int right, const PaddBorderType pad_type, uint8_t fill_b_or_gray, uint8_t fill_g,
                            uint8_t fill_r) {
-  dst.Init(src.width_ + left + right, src.height_ + top + bottom, src.channel_, src.data_type_);
-  const T *src_start_p = src;
-  T *dst_start_p = dst;
-  // padd top
-  for (int h = 0; h < top; h++) {
-    for (int w = 0; w < dst.width_; w++) {
-      uint32_t index = (h * dst.width_ + w) * dst.channel_;
-      if (dst.channel_ == 1) {
-        dst_start_p[index] = fill_b_or_gray;
-      } else if (dst.channel_ == 3) {
-        dst_start_p[index] = fill_b_or_gray;
-        dst_start_p[index + 1] = fill_g;
-        dst_start_p[index + 2] = fill_r;
-      } else {
-      }
+  std::vector<uint8_t> row_buffer(dst.width_ * dst.channel_);
+  uint8_t *const_ptr = row_buffer.data();
+  int src_step = src.width_ * dst.channel_;
+  int dst_step = dst.width_ * dst.channel_;
+  if (dst.channel_ == 1) {
+    for (int i = 0; i < dst_step; i++) {
+      const_ptr[i] = fill_b_or_gray;
    }
-  }
-  // padd bottom
-  for (int h = dst.height_ - bottom; h < dst.height_; h++) {
-    for (int w = 0; w < dst.width_; w++) {
-      uint32_t index = (h * dst.width_ + w) * dst.channel_;
-      if (dst.channel_ == 1) {
-        dst_start_p[index] = fill_b_or_gray;
-      } else if (dst.channel_ == 3) {
-        dst_start_p[index] = fill_b_or_gray;
-        dst_start_p[index + 1] = fill_g;
-        dst_start_p[index + 2] = fill_r;
-      } else {
-      }
+  } else if (dst.channel_ == 3) {
+    for (int i = 0; i < dst.width_; i++) {
+      const_ptr[i * dst.channel_] = fill_b_or_gray;
+      const_ptr[i * dst.channel_ + 1] = fill_g;
+      const_ptr[i * dst.channel_ + 2] = fill_r;
    }
  }

-  // padd left
-  for (int h = top; h < dst.height_ - bottom; h++) {
-    for (int w = 0; w < left; w++) {
-      uint32_t index = (h * dst.width_ + w) * dst.channel_;
-      if (dst.channel_ == 1) {
-        dst_start_p[index] = fill_b_or_gray;
-      } else if (dst.channel_ == 3) {
-        dst_start_p[index] = fill_b_or_gray;
-        dst_start_p[index + 1] = fill_g;
-        dst_start_p[index + 2] = fill_r;
-      } else {
-      }
-    }
+  uint8_t *dst_ptr = reinterpret_cast<uint8_t *>(dst.data_ptr_);
+  uint8_t *src_ptr = reinterpret_cast<uint8_t *>(src.data_ptr_);
+  for (int i = 0; i < top; i++) {
+    memcpy(dst_ptr + i * dst_step, const_ptr, dst_step);
  }

-  // padd right
-  for (int h = top; h < dst.height_ - bottom; h++) {
-    for (int w = dst.width_ - right; w < dst.width_; w++) {
-      uint32_t index = (h * dst.width_ + w) * dst.channel_;
-      if (dst.channel_ == 1) {
-        dst_start_p[index] = fill_b_or_gray;
-      } else if (dst.channel_ == 3) {
-        dst_start_p[index] = fill_b_or_gray;
-        dst_start_p[index + 1] = fill_g;
-        dst_start_p[index + 2] = fill_r;
-      } else {
-      }
-    }
+  int left_size = left * dst.channel_;
+  int right_size = right * dst.channel_;
+  uint8_t *dst_raw_data = dst_ptr + top * dst_step + left_size;
+  for (int i = 0; i < src.width_; i++, dst_raw_data += dst_step, src_ptr += src_step) {
+    memcpy(dst_raw_data, src_ptr, src_step);
+    memcpy(dst_raw_data - left_size, const_ptr, left_size);
+    memcpy(dst_raw_data + src_step, const_ptr, right_size);
  }
-  // image data
-  dst_start_p = dst_start_p + (top * dst.width_ + left) * dst.channel_;
-  for (int i_h = 0; i_h < src.height_; i_h++) {
-    const T *src_index_p = src_start_p + i_h * src.width_ * src.channel_;
-    T *dst_index_p = dst_start_p + i_h * dst.width_ * dst.channel_;
-    (void)memcpy(dst_index_p, src_index_p, src.width_ * src.channel_ * sizeof(T));
+
+  for (int i = dst.height_ - bottom; i < dst.height_; i++) {
+    memcpy(dst_ptr + i * dst_step, const_ptr, dst_step);
  }
 }

@ -758,6 +722,15 @@ bool Pad(const LiteMat &src, LiteMat &dst, int top, int bottom, int left, int ri
  if (src.IsEmpty()) {
    return false;
  }
+  int dst_width = src.width_ + left + right;
+  int dst_height = src.height_ + top + bottom;
+  if (dst.IsEmpty()) {
+    dst.Init(dst_width, dst_height, src.channel_, src.data_type_);
+  } else if (dst.width_ != dst_width || dst.height_ != dst_height || src.channel_ != dst.channel_) {
+    return false;
+  } else if (src.data_type_ != dst.data_type_) {
+    return false;
+  }
  if (pad_type == PADD_BORDER_CONSTANT && src.data_type_ == LDataType::FLOAT32) {
    PadWithConstant<float>(src, dst, top, bottom, left, right, pad_type, fill_b_or_gray, fill_g, fill_r);
  } else if (pad_type == PADD_BORDER_CONSTANT && src.data_type_ == LDataType::UINT8) {
@ -921,167 +894,5 @@ bool Affine(LiteMat &src, LiteMat &out_img, const double M[6], std::vector<size_
  return ImplementAffine(src, out_img, M, dsize, borderValue);
 }

-template <typename T>
-inline void SubtractImpl(const T *src1_ptr, const T *src2_ptr, T *dst, size_t total_size) {
-  for (size_t i = 0; i < total_size; i++) {
-    dst[i] = src1_ptr[i] - src2_ptr[i];
-  }
-}
-
-template <>
-inline void SubtractImpl(const uint8_t *src1_ptr, const uint8_t *src2_ptr, uint8_t *dst, size_t total_size) {
-  for (size_t i = 0; i < total_size; i++) {
-    int val = static_cast<int>(src1_ptr[i]) - src2_ptr[i];
-    dst[i] =
-      std::max<int>(std::numeric_limits<uint8_t>::min(), std::min<int>(std::numeric_limits<uint8_t>::max(), val));
-  }
-}
-
-template <>
-inline void SubtractImpl(const uint16_t *src1_ptr, const uint16_t *src2_ptr, uint16_t *dst, size_t total_size) {
-  for (size_t i = 0; i < total_size; i++) {
-    int val = static_cast<int>(src1_ptr[i]) - src2_ptr[i];
-    dst[i] =
-      std::max<int>(std::numeric_limits<uint16_t>::min(), std::min<int>(std::numeric_limits<uint16_t>::max(), val));
-  }
-}
-
-template <>
-inline void SubtractImpl(const uint32_t *src1_ptr, const uint32_t *src2_ptr, uint32_t *dst, size_t total_size) {
-  for (size_t i = 0; i < total_size; i++) {
-    int64_t val = static_cast<int64_t>(src1_ptr[i]) - src2_ptr[i];
-    dst[i] = std::max<int64_t>(std::numeric_limits<uint32_t>::min(),
-                               std::min<int64_t>(std::numeric_limits<uint32_t>::max(), val));
-  }
-}
-
-bool Subtract(const LiteMat &src1, const LiteMat &src2, LiteMat &dst) {
-  if (src1.width_ != src2.width_ || src1.height_ != src2.height_ || src1.channel_ != src2.channel_) {
-    return false;
-  }
-
-  if (src1.data_type_ != src2.data_type_) {
-    return false;
-  }
-
-  if (dst.IsEmpty()) {
-    dst.Init(src1.width_, src1.height_, src1.channel_, src1.data_type_);
-  } else if (src1.width_ != dst.width_ || src1.height_ != dst.height_ || src1.channel_ != dst.channel_) {
-    return false;
-  } else if (src1.data_type_ != dst.data_type_) {
-    return false;
-  }
-
-  size_t total_size = src1.height_ * src1.width_ * src1.channel_;
-
-  if (src1.data_type_ == LDataType::BOOL) {
-    SubtractImpl<bool>(src1, src2, dst, total_size);
-  } else if (src1.data_type_ == LDataType::INT8) {
-    SubtractImpl<int8_t>(src1, src2, dst, total_size);
-  } else if (src1.data_type_ == LDataType::UINT8) {
-    SubtractImpl<uint8_t>(src1, src2, dst, total_size);
-  } else if (src1.data_type_ == LDataType::INT16) {
-    SubtractImpl<int16_t>(src1, src2, dst, total_size);
-  } else if (src1.data_type_ == LDataType::UINT16) {
-    SubtractImpl<uint16_t>(src1, src2, dst, total_size);
-  } else if (src1.data_type_ == LDataType::INT32) {
-    SubtractImpl<int32_t>(src1, src2, dst, total_size);
-  } else if (src1.data_type_ == LDataType::UINT32) {
-    SubtractImpl<uint32_t>(src1, src2, dst, total_size);
-  } else if (src1.data_type_ == LDataType::INT64) {
-    SubtractImpl<int64_t>(src1, src2, dst, total_size);
-  } else if (src1.data_type_ == LDataType::UINT64) {
-    SubtractImpl<uint64_t>(src1, src2, dst, total_size);
-  } else if (src1.data_type_ == LDataType::FLOAT32) {
-    SubtractImpl<float>(src1, src2, dst, total_size);
-  } else if (src1.data_type_ == LDataType::FLOAT64) {
-    SubtractImpl<double>(src1, src2, dst, total_size);
-  } else {
-    return false;
-  }
-
-  return true;
-}
-
-template <typename T>
-inline void DivideImpl(const T *src1_ptr, const T *src2_ptr, T *dst, size_t total_size) {
-  for (size_t i = 0; i < total_size; i++) {
-    dst[i] = src1_ptr[i] / (src2_ptr[i] + std::numeric_limits<float>::min());
-  }
-}
-
-template <>
-inline void DivideImpl(const uint8_t *src1_ptr, const uint8_t *src2_ptr, uint8_t *dst, size_t total_size) {
-  for (size_t i = 0; i < total_size; i++) {
-    int val = std::round(src1_ptr[i] / (src2_ptr[i] + std::numeric_limits<float>::min()));
-    dst[i] =
-      std::max<int>(std::numeric_limits<uint8_t>::min(), std::min<int>(std::numeric_limits<uint8_t>::max(), val));
-  }
-}
-
-template <>
-inline void DivideImpl(const uint16_t *src1_ptr, const uint16_t *src2_ptr, uint16_t *dst, size_t total_size) {
-  for (size_t i = 0; i < total_size; i++) {
-    int val = std::round(src1_ptr[i] / (src2_ptr[i] + std::numeric_limits<float>::min()));
-    dst[i] =
-      std::max<int>(std::numeric_limits<uint16_t>::min(), std::min<int>(std::numeric_limits<uint16_t>::max(), val));
-  }
-}
-
-template <>
-inline void DivideImpl(const uint32_t *src1_ptr, const uint32_t *src2_ptr, uint32_t *dst, size_t total_size) {
-  for (size_t i = 0; i < total_size; i++) {
-    int64_t val = std::round(src1_ptr[i] / (src2_ptr[i] + std::numeric_limits<double>::min()));
-    dst[i] = std::max<int64_t>(std::numeric_limits<uint32_t>::min(),
-                               std::min<int64_t>(std::numeric_limits<uint32_t>::max(), val));
-  }
-}
-
-bool Divide(const LiteMat &src1, const LiteMat &src2, LiteMat &dst) {
-  if (src1.width_ != src2.width_ || src1.height_ != src2.height_ || src1.channel_ != src2.channel_) {
-    return false;
-  }
-
-  if (src1.data_type_ != src2.data_type_) {
-    return false;
-  }
-
-  if (dst.IsEmpty()) {
-    dst.Init(src1.width_, src1.height_, src1.channel_, src1.data_type_);
-  } else if (src1.width_ != dst.width_ || src1.height_ != dst.height_ || src1.channel_ != dst.channel_) {
-    return false;
-  } else if (src1.data_type_ != dst.data_type_) {
-    return false;
-  }
-
-  size_t total_size = src1.height_ * src1.width_ * src1.channel_;
-
-  if (src1.data_type_ == LDataType::INT8) {
-    DivideImpl<int8_t>(src1, src2, dst, total_size);
-  } else if (src1.data_type_ == LDataType::UINT8) {
-    DivideImpl<uint8_t>(src1, src2, dst, total_size);
-  } else if (src1.data_type_ == LDataType::INT16) {
-    DivideImpl<int16_t>(src1, src2, dst, total_size);
-  } else if (src1.data_type_ == LDataType::UINT16) {
-    DivideImpl<uint16_t>(src1, src2, dst, total_size);
-  } else if (src1.data_type_ == LDataType::INT32) {
-    DivideImpl<int32_t>(src1, src2, dst, total_size);
-  } else if (src1.data_type_ == LDataType::UINT32) {
-    DivideImpl<uint32_t>(src1, src2, dst, total_size);
-  } else if (src1.data_type_ == LDataType::INT64) {
-    DivideImpl<int64_t>(src1, src2, dst, total_size);
-  } else if (src1.data_type_ == LDataType::UINT64) {
-    DivideImpl<uint64_t>(src1, src2, dst, total_size);
-  } else if (src1.data_type_ == LDataType::FLOAT32) {
-    DivideImpl<float>(src1, src2, dst, total_size);
-  } else if (src1.data_type_ == LDataType::FLOAT64) {
-    DivideImpl<double>(src1, src2, dst, total_size);
-  } else {
-    return false;
-  }
-
-  return true;
-}
-
 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/image_process.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/image_process.h
@ -107,12 +107,6 @@ void ConvertBoxes(std::vector<std::vector<float>> &boxes, const std::vector<std:
 std::vector<int> ApplyNms(const std::vector<std::vector<float>> &all_boxes, std::vector<float> &all_scores, float thres,
                          int max_boxes);

-/// \brief Calculates the difference between the two images for each element
-bool Subtract(const LiteMat &src1, const LiteMat &src2, LiteMat &dst);
-
-/// \brief Calculates the division between the two images for each element
-bool Divide(const LiteMat &src1, const LiteMat &src2, LiteMat &dst);
-
 }  // namespace dataset
 }  // namespace mindspore
 #endif  // IMAGE_PROCESS_H_
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/lite_mat.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/lite_mat.cc
@ -15,6 +15,16 @@
 */
 #include "minddata/dataset/kernels/image/lite_cv/lite_mat.h"

+#include <algorithm>
+#include <limits>
+#include <cmath>
+#ifdef ENABLE_ANDROID
+#if defined(__arm__) || defined(__aarch64__) || defined(_M_ARM) || defined(_M_ARM64)
+#define USE_NEON
+#include <arm_neon.h>
+#endif
+#endif
+
 namespace mindspore {
 namespace dataset {

@ -221,5 +231,258 @@ void LiteMat::AlignFree(void *ptr) {

 inline void LiteMat::InitElemSize(LDataType data_type) { elem_size_ = data_type.SizeInBytes(); }

+template <typename T>
+inline void SubtractImpl(const T *src0, const T *src1, T *dst, int64_t total_size) {
+  for (int64_t i = 0; i < total_size; i++) {
+    dst[i] = src0[i] - src1[i];
+  }
+}
+
+template <>
+inline void SubtractImpl(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, int64_t total_size) {
+  int64_t x = 0;
+#ifdef USE_NEON
+  const int64_t step = 32;
+  for (; x <= total_size - step; x += step) {
+    uint8x16_t v_src00 = vld1q_u8(src0 + x);
+    uint8x16_t v_src01 = vld1q_u8(src0 + x + 16);
+    uint8x16_t v_src10 = vld1q_u8(src1 + x);
+    uint8x16_t v_src11 = vld1q_u8(src1 + x + 16);
+    uint8x16_t v_dst;
+
+    v_dst = vqsubq_u8(v_src00, v_src10);
+    vst1q_u8(dst + x, v_dst);
+
+    v_dst = vqsubq_u8(v_src01, v_src11);
+    vst1q_u8(dst + x + 16, v_dst);
+  }
+#endif
+  for (; x < total_size; x++) {
+    int32_t val = static_cast<int32_t>(src0[x]) - src1[x];
+    dst[x] = std::max<int32_t>(std::numeric_limits<uint8_t>::min(),
+                               std::min<int32_t>(std::numeric_limits<uint8_t>::max(), val));
+  }
+}
+
+template <>
+inline void SubtractImpl(const uint16_t *src0, const uint16_t *src1, uint16_t *dst, int64_t total_size) {
+  for (int64_t i = 0; i < total_size; i++) {
+    int32_t val = static_cast<int32_t>(src0[i]) - src1[i];
+    dst[i] = std::max<int32_t>(std::numeric_limits<uint16_t>::min(),
+                               std::min<int32_t>(std::numeric_limits<uint16_t>::max(), val));
+  }
+}
+
+template <>
+inline void SubtractImpl(const uint32_t *src0, const uint32_t *src1, uint32_t *dst, int64_t total_size) {
+  for (int64_t i = 0; i < total_size; i++) {
+    int64_t val = static_cast<int64_t>(src0[i]) - src1[i];
+    dst[i] = std::max<int64_t>(std::numeric_limits<uint32_t>::min(),
+                               std::min<int64_t>(std::numeric_limits<uint32_t>::max(), val));
+  }
+}
+
+bool Subtract(const LiteMat &src_a, const LiteMat &src_b, LiteMat *dst) {
+  if (src_a.width_ != src_b.width_ || src_a.height_ != src_b.height_ || src_a.channel_ != src_b.channel_) {
+    return false;
+  }
+
+  if (src_a.data_type_ != src_b.data_type_) {
+    return false;
+  }
+
+  if (dst->IsEmpty()) {
+    dst->Init(src_a.width_, src_a.height_, src_a.channel_, src_a.data_type_);
+  } else if (src_a.width_ != dst->width_ || src_a.height_ != dst->height_ || src_a.channel_ != dst->channel_) {
+    return false;
+  } else if (src_a.data_type_ != dst->data_type_) {
+    return false;
+  }
+
+  int64_t total_size = src_a.height_ * src_a.width_ * src_a.channel_;
+  if (src_a.data_type_ == LDataType::BOOL) {
+    SubtractImpl<bool>(src_a, src_b, *dst, total_size);
+  } else if (src_a.data_type_ == LDataType::INT8) {
+    SubtractImpl<int8_t>(src_a, src_b, *dst, total_size);
+  } else if (src_a.data_type_ == LDataType::UINT8) {
+    SubtractImpl<uint8_t>(src_a, src_b, *dst, total_size);
+  } else if (src_a.data_type_ == LDataType::INT16) {
+    SubtractImpl<int16_t>(src_a, src_b, *dst, total_size);
+  } else if (src_a.data_type_ == LDataType::UINT16) {
+    SubtractImpl<uint16_t>(src_a, src_b, *dst, total_size);
+  } else if (src_a.data_type_ == LDataType::INT32) {
+    SubtractImpl<int32_t>(src_a, src_b, *dst, total_size);
+  } else if (src_a.data_type_ == LDataType::UINT32) {
+    SubtractImpl<uint32_t>(src_a, src_b, *dst, total_size);
+  } else if (src_a.data_type_ == LDataType::INT64) {
+    SubtractImpl<int64_t>(src_a, src_b, *dst, total_size);
+  } else if (src_a.data_type_ == LDataType::UINT64) {
+    SubtractImpl<uint64_t>(src_a, src_b, *dst, total_size);
+  } else if (src_a.data_type_ == LDataType::FLOAT32) {
+    SubtractImpl<float>(src_a, src_b, *dst, total_size);
+  } else if (src_a.data_type_ == LDataType::FLOAT64) {
+    SubtractImpl<double>(src_a, src_b, *dst, total_size);
+  } else {
+    return false;
+  }
+
+  return true;
+}
+
+#ifdef USE_NEON
+inline float32x4_t reciprocal_simd(float32x4_t val) {
+  // get an initial estimate of 1/val
+  float32x4_t reciprocal = vrecpeq_f32(val);
+
+  // use Newton-Raphson steps to refine the estimate
+  reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
+  reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
+  return reciprocal;
+}
+
+inline float32x4_t round_simd(const float32x4_t &v) {
+  const int32x4_t signMask = vdupq_n_s32(1U << 31);
+  const int32x4_t half = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
+  float32x4_t v_addition = vreinterpretq_f32_s32(vorrq_s32(half, vandq_s32(signMask, vreinterpretq_s32_f32(v))));
+  return vaddq_f32(v, v_addition);
+}
+#endif
+
+template <typename T>
+inline void DivideImpl(const T *src0, const T *src1, T *dst, int64_t total_size) {
+  for (size_t i = 0; i < total_size; i++) {
+    dst[i] = src1[i] ? src0[i] / src1[i] : 0;
+  }
+}
+
+template <>
+inline void DivideImpl(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, int64_t total_size) {
+  int64_t x = 0;
+#ifdef USE_NEON
+  const int64_t step = 16;
+  for (; x <= total_size - step; x += step) {
+    __builtin_prefetch(reinterpret_cast<const char *>(src0 + x) + 32 * 10);
+    __builtin_prefetch(reinterpret_cast<const char *>(src1 + x) + 32 * 10);
+
+    uint8x16_t v_a = vld1q_u8(src0 + x);
+    uint8x16_t v_b = vld1q_u8(src1 + x);
+    uint8x16_t v_mask = vtstq_u8(v_b, v_b);
+
+    uint16x8_t va_l_16x8 = vmovl_u8(vget_low_u8(v_a));
+    uint16x8_t va_h_16x8 = vmovl_u8(vget_high_u8(v_a));
+    uint16x8_t vb_l_16x8 = vmovl_u8(vget_low_u8(v_b));
+    uint16x8_t vb_h_16x8 = vmovl_u8(vget_high_u8(v_b));
+
+    float32x4_t va_ll_f32x4 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(va_l_16x8)));
+    float32x4_t va_lh_f32x4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(va_l_16x8)));
+    float32x4_t va_hl_f32x4 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(va_h_16x8)));
+    float32x4_t va_hh_f32x4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(va_h_16x8)));
+    float32x4_t vb_ll_f32x4 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vb_l_16x8)));
+    float32x4_t vb_lh_f32x4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vb_l_16x8)));
+    float32x4_t vb_hl_f32x4 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vb_h_16x8)));
+    float32x4_t vb_hh_f32x4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vb_h_16x8)));
+
+    float32x4_t vb_ll_re_f32x4 = reciprocal_simd(vb_ll_f32x4);
+    float32x4_t vb_lh_re_f32x4 = reciprocal_simd(vb_lh_f32x4);
+    float32x4_t vb_hl_re_f32x4 = reciprocal_simd(vb_hl_f32x4);
+    float32x4_t vb_hh_re_f32x4 = reciprocal_simd(vb_hh_f32x4);
+
+    float32x4_t dst_ll_f32x4 = round_simd(vmulq_f32(va_ll_f32x4, vb_ll_re_f32x4));
+    float32x4_t dst_lh_f32x4 = round_simd(vmulq_f32(va_lh_f32x4, vb_lh_re_f32x4));
+    float32x4_t dst_hl_f32x4 = round_simd(vmulq_f32(va_hl_f32x4, vb_hl_re_f32x4));
+    float32x4_t dst_hh_f32x4 = round_simd(vmulq_f32(va_hh_f32x4, vb_hh_re_f32x4));
+
+    uint32x4_t dst_ll_32x4 = vcvtq_u32_f32(dst_ll_f32x4);
+    uint32x4_t dst_lh_32x4 = vcvtq_u32_f32(dst_lh_f32x4);
+    uint32x4_t dst_hl_32x4 = vcvtq_u32_f32(dst_hl_f32x4);
+    uint32x4_t dst_hh_32x4 = vcvtq_u32_f32(dst_hh_f32x4);
+
+    uint16x4_t dst_ll_16x4 = vqmovn_u32(dst_ll_32x4);
+    uint16x4_t dst_lh_16x4 = vqmovn_u32(dst_lh_32x4);
+    uint16x4_t dst_hl_16x4 = vqmovn_u32(dst_hl_32x4);
+    uint16x4_t dst_hh_16x4 = vqmovn_u32(dst_hh_32x4);
+
+    uint16x8_t dst_l_16x8 = vcombine_u16(dst_ll_16x4, dst_lh_16x4);
+    uint16x8_t dst_h_16x8 = vcombine_u16(dst_hl_16x4, dst_hh_16x4);
+
+    int8x8_t dst_l_8x8 = vqmovn_u16(dst_l_16x8);
+    int8x8_t dst_h_8x8 = vqmovn_u16(dst_h_16x8);
+    int8x16_t dst_8x16 = vcombine_u8(dst_l_8x8, dst_h_8x8);
+
+    dst_8x16 = vandq_u8(dst_8x16, v_mask);
+    vst1q_u8(dst + x, dst_8x16);
+  }
+#endif
+  for (; x < total_size; x++) {
+    int32_t val = src1[x] ? std::round(src0[x] / src1[x]) : 0;
+    dst[x] = std::max<int32_t>(std::numeric_limits<uint8_t>::min(),
+                               std::min<int32_t>(std::numeric_limits<uint8_t>::max(), val));
+  }
+}
+
+template <>
+inline void DivideImpl(const uint16_t *src0, const uint16_t *src1, uint16_t *dst, int64_t total_size) {
+  for (size_t i = 0; i < total_size; i++) {
+    int32_t val = src1[i] ? std::round(src0[i] / src1[i]) : 0;
+    dst[i] = std::max<int32_t>(std::numeric_limits<uint16_t>::min(),
+                               std::min<int32_t>(std::numeric_limits<uint16_t>::max(), val));
+  }
+}
+
+template <>
+inline void DivideImpl(const uint32_t *src0, const uint32_t *src1, uint32_t *dst, int64_t total_size) {
+  for (size_t i = 0; i < total_size; i++) {
+    int64_t val = src1[i] ? std::round(src0[i] / src1[i]) : 0;
+    dst[i] = std::max<int64_t>(std::numeric_limits<uint32_t>::min(),
+                               std::min<int64_t>(std::numeric_limits<uint32_t>::max(), val));
+  }
+}
+
+bool Divide(const LiteMat &src_a, const LiteMat &src_b, LiteMat *dst) {
+  if (src_a.width_ != src_b.width_ || src_a.height_ != src_b.height_ || src_a.channel_ != src_b.channel_) {
+    return false;
+  }
+
+  if (src_a.data_type_ != src_b.data_type_) {
+    return false;
+  }
+
+  if (dst->IsEmpty()) {
+    dst->Init(src_a.width_, src_a.height_, src_a.channel_, src_a.data_type_);
+  } else if (src_a.width_ != dst->width_ || src_a.height_ != dst->height_ || src_a.channel_ != dst->channel_) {
+    return false;
+  } else if (src_a.data_type_ != dst->data_type_) {
+    return false;
+  }
+
+  int64_t total_size = src_a.height_ * src_a.width_ * src_a.channel_;
+  if (src_a.data_type_ == LDataType::BOOL) {
+    DivideImpl<bool>(src_a, src_b, *dst, total_size);
+  } else if (src_a.data_type_ == LDataType::INT8) {
+    DivideImpl<int8_t>(src_a, src_b, *dst, total_size);
+  } else if (src_a.data_type_ == LDataType::UINT8) {
+    DivideImpl<uint8_t>(src_a, src_b, *dst, total_size);
+  } else if (src_a.data_type_ == LDataType::INT16) {
+    DivideImpl<int16_t>(src_a, src_b, *dst, total_size);
+  } else if (src_a.data_type_ == LDataType::UINT16) {
+    DivideImpl<uint16_t>(src_a, src_b, *dst, total_size);
+  } else if (src_a.data_type_ == LDataType::INT32) {
+    DivideImpl<int32_t>(src_a, src_b, *dst, total_size);
+  } else if (src_a.data_type_ == LDataType::UINT32) {
+    DivideImpl<uint32_t>(src_a, src_b, *dst, total_size);
+  } else if (src_a.data_type_ == LDataType::INT64) {
+    DivideImpl<int64_t>(src_a, src_b, *dst, total_size);
+  } else if (src_a.data_type_ == LDataType::UINT64) {
+    DivideImpl<uint64_t>(src_a, src_b, *dst, total_size);
+  } else if (src_a.data_type_ == LDataType::FLOAT32) {
+    DivideImpl<float>(src_a, src_b, *dst, total_size);
+  } else if (src_a.data_type_ == LDataType::FLOAT64) {
+    DivideImpl<double>(src_a, src_b, *dst, total_size);
+  } else {
+    return false;
+  }
+  return true;
+}
+
 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/lite_mat.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/lite_mat.h
@ -247,6 +247,13 @@ class LiteMat {
  LDataType data_type_;
  int *ref_count_;
 };
+
+/// \brief Calculates the difference between the two images for each element
+bool Subtract(const LiteMat &src_a, const LiteMat &src_b, LiteMat *dst);
+
+/// \brief Calculates the division between the two images for each element
+bool Divide(const LiteMat &src_a, const LiteMat &src_b, LiteMat *dst);
+
 }  // namespace dataset
 }  // namespace mindspore
 #endif  // MINI_MAT_H_
--- a/tests/ut/cpp/dataset/image_process_test.cc
+++ b/tests/ut/cpp/dataset/image_process_test.cc
@ -538,7 +538,7 @@ TEST_F(MindDataImageProcess, TestSubtractUint8) {
    static_cast<UINT8_C1 *>(expect_uint8.data_ptr_)[i] = 1;
  }
  LiteMat dst_uint8;
-  EXPECT_TRUE(Subtract(src1_uint8, src2_uint8, dst_uint8));
+  EXPECT_TRUE(Subtract(src1_uint8, src2_uint8, &dst_uint8));
  for (size_t i = 0; i < cols; i++) {
    EXPECT_EQ(static_cast<UINT8_C1 *>(expect_uint8.data_ptr_)[i].c1,
              static_cast<UINT8_C1 *>(dst_uint8.data_ptr_)[i].c1);
@ -557,7 +557,7 @@ TEST_F(MindDataImageProcess, TestSubtractInt8) {
    static_cast<INT8_C1 *>(expect_int8.data_ptr_)[i] = -1;
  }
  LiteMat dst_int8;
-  EXPECT_TRUE(Subtract(src1_int8, src2_int8, dst_int8));
+  EXPECT_TRUE(Subtract(src1_int8, src2_int8, &dst_int8));
  for (size_t i = 0; i < cols; i++) {
    EXPECT_EQ(static_cast<INT8_C1 *>(expect_int8.data_ptr_)[i].c1, static_cast<INT8_C1 *>(dst_int8.data_ptr_)[i].c1);
  }
@ -575,7 +575,7 @@ TEST_F(MindDataImageProcess, TestSubtractUInt16) {
    static_cast<UINT16_C1 *>(expect_uint16.data_ptr_)[i] = 0;
  }
  LiteMat dst_uint16;
-  EXPECT_TRUE(Subtract(src1_uint16, src2_uint16, dst_uint16));
+  EXPECT_TRUE(Subtract(src1_uint16, src2_uint16, &dst_uint16));
  for (size_t i = 0; i < cols; i++) {
    EXPECT_EQ(static_cast<UINT16_C1 *>(expect_uint16.data_ptr_)[i].c1,
              static_cast<UINT16_C1 *>(dst_uint16.data_ptr_)[i].c1);
@ -594,7 +594,7 @@ TEST_F(MindDataImageProcess, TestSubtractInt16) {
    static_cast<INT16_C1 *>(expect_int16.data_ptr_)[i] = -1;
  }
  LiteMat dst_int16;
-  EXPECT_TRUE(Subtract(src1_int16, src2_int16, dst_int16));
+  EXPECT_TRUE(Subtract(src1_int16, src2_int16, &dst_int16));
  for (size_t i = 0; i < cols; i++) {
    EXPECT_EQ(static_cast<INT16_C1 *>(expect_int16.data_ptr_)[i].c1,
              static_cast<INT16_C1 *>(dst_int16.data_ptr_)[i].c1);
@ -613,7 +613,7 @@ TEST_F(MindDataImageProcess, TestSubtractUInt32) {
    static_cast<UINT32_C1 *>(expect_uint32.data_ptr_)[i] = 0;
  }
  LiteMat dst_uint32;
-  EXPECT_TRUE(Subtract(src1_uint32, src2_uint32, dst_uint32));
+  EXPECT_TRUE(Subtract(src1_uint32, src2_uint32, &dst_uint32));
  for (size_t i = 0; i < cols; i++) {
    EXPECT_EQ(static_cast<UINT32_C1 *>(expect_uint32.data_ptr_)[i].c1,
              static_cast<UINT32_C1 *>(dst_uint32.data_ptr_)[i].c1);
@ -632,7 +632,7 @@ TEST_F(MindDataImageProcess, TestSubtractInt32) {
    static_cast<INT32_C1 *>(expect_int32.data_ptr_)[i] = -2;
  }
  LiteMat dst_int32;
-  EXPECT_TRUE(Subtract(src1_int32, src2_int32, dst_int32));
+  EXPECT_TRUE(Subtract(src1_int32, src2_int32, &dst_int32));
  for (size_t i = 0; i < cols; i++) {
    EXPECT_EQ(static_cast<INT32_C1 *>(expect_int32.data_ptr_)[i].c1,
              static_cast<INT32_C1 *>(dst_int32.data_ptr_)[i].c1);
@ -651,7 +651,7 @@ TEST_F(MindDataImageProcess, TestSubtractFloat) {
    static_cast<FLOAT32_C1 *>(expect_float.data_ptr_)[i] = -2.3;
  }
  LiteMat dst_float;
-  EXPECT_TRUE(Subtract(src1_float, src2_float, dst_float));
+  EXPECT_TRUE(Subtract(src1_float, src2_float, &dst_float));
  for (size_t i = 0; i < cols; i++) {
    EXPECT_FLOAT_EQ(static_cast<FLOAT32_C1 *>(expect_float.data_ptr_)[i].c1,
                    static_cast<FLOAT32_C1 *>(dst_float.data_ptr_)[i].c1);
@ -670,7 +670,7 @@ TEST_F(MindDataImageProcess, TestDivideUint8) {
    static_cast<UINT8_C1 *>(expect_uint8.data_ptr_)[i] = 2;
  }
  LiteMat dst_uint8;
-  EXPECT_TRUE(Divide(src1_uint8, src2_uint8, dst_uint8));
+  EXPECT_TRUE(Divide(src1_uint8, src2_uint8, &dst_uint8));
  for (size_t i = 0; i < cols; i++) {
    EXPECT_EQ(static_cast<UINT8_C1 *>(expect_uint8.data_ptr_)[i].c1,
              static_cast<UINT8_C1 *>(dst_uint8.data_ptr_)[i].c1);
@ -689,7 +689,7 @@ TEST_F(MindDataImageProcess, TestDivideInt8) {
    static_cast<INT8_C1 *>(expect_int8.data_ptr_)[i] = -2;
  }
  LiteMat dst_int8;
-  EXPECT_TRUE(Divide(src1_int8, src2_int8, dst_int8));
+  EXPECT_TRUE(Divide(src1_int8, src2_int8, &dst_int8));
  for (size_t i = 0; i < cols; i++) {
    EXPECT_EQ(static_cast<INT8_C1 *>(expect_int8.data_ptr_)[i].c1, static_cast<INT8_C1 *>(dst_int8.data_ptr_)[i].c1);
  }
@ -707,7 +707,7 @@ TEST_F(MindDataImageProcess, TestDivideUInt16) {
    static_cast<UINT16_C1 *>(expect_uint16.data_ptr_)[i] = 2;
  }
  LiteMat dst_uint16;
-  EXPECT_TRUE(Divide(src1_uint16, src2_uint16, dst_uint16));
+  EXPECT_TRUE(Divide(src1_uint16, src2_uint16, &dst_uint16));
  for (size_t i = 0; i < cols; i++) {
    EXPECT_EQ(static_cast<UINT16_C1 *>(expect_uint16.data_ptr_)[i].c1,
              static_cast<UINT16_C1 *>(dst_uint16.data_ptr_)[i].c1);
@ -726,7 +726,7 @@ TEST_F(MindDataImageProcess, TestDivideInt16) {
    static_cast<INT16_C1 *>(expect_int16.data_ptr_)[i] = -10000;
  }
  LiteMat dst_int16;
-  EXPECT_TRUE(Divide(src1_int16, src2_int16, dst_int16));
+  EXPECT_TRUE(Divide(src1_int16, src2_int16, &dst_int16));
  for (size_t i = 0; i < cols; i++) {
    EXPECT_EQ(static_cast<INT16_C1 *>(expect_int16.data_ptr_)[i].c1,
              static_cast<INT16_C1 *>(dst_int16.data_ptr_)[i].c1);
@ -745,7 +745,7 @@ TEST_F(MindDataImageProcess, TestDivideUInt32) {
    static_cast<UINT32_C1 *>(expect_uint32.data_ptr_)[i] = 1000000000;
  }
  LiteMat dst_uint32;
-  EXPECT_TRUE(Divide(src1_uint32, src2_uint32, dst_uint32));
+  EXPECT_TRUE(Divide(src1_uint32, src2_uint32, &dst_uint32));
  for (size_t i = 0; i < cols; i++) {
    EXPECT_EQ(static_cast<UINT32_C1 *>(expect_uint32.data_ptr_)[i].c1,
              static_cast<UINT32_C1 *>(dst_uint32.data_ptr_)[i].c1);
@ -764,7 +764,7 @@ TEST_F(MindDataImageProcess, TestDivideInt32) {
    static_cast<INT32_C1 *>(expect_int32.data_ptr_)[i] = -1000000000;
  }
  LiteMat dst_int32;
-  EXPECT_TRUE(Divide(src1_int32, src2_int32, dst_int32));
+  EXPECT_TRUE(Divide(src1_int32, src2_int32, &dst_int32));
  for (size_t i = 0; i < cols; i++) {
    EXPECT_EQ(static_cast<INT32_C1 *>(expect_int32.data_ptr_)[i].c1,
              static_cast<INT32_C1 *>(dst_int32.data_ptr_)[i].c1);
@ -783,7 +783,7 @@ TEST_F(MindDataImageProcess, TestDivideFloat) {
    static_cast<FLOAT32_C1 *>(expect_float.data_ptr_)[i] = -6.17f;
  }
  LiteMat dst_float;
-  EXPECT_TRUE(Divide(src1_float, src2_float, dst_float));
+  EXPECT_TRUE(Divide(src1_float, src2_float, &dst_float));
  for (size_t i = 0; i < cols; i++) {
    EXPECT_FLOAT_EQ(static_cast<FLOAT32_C1 *>(expect_float.data_ptr_)[i].c1,
                    static_cast<FLOAT32_C1 *>(dst_float.data_ptr_)[i].c1);