forked from mindspore-Ecosystem/mindspore
!8370 [MD] Use neon instruction set to accelerate the Substract and Divide ops
From: @jiangzhiwen8 Reviewed-by: @xulei2020 Signed-off-by:
This commit is contained in:
commit
edcb0cd86b
|
@ -19,8 +19,6 @@
|
|||
#include <string.h>
|
||||
#include <cmath>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
@ -549,73 +547,39 @@ template <typename T>
|
|||
static void PadWithConstant(const LiteMat &src, LiteMat &dst, const int top, const int bottom, const int left,
|
||||
const int right, const PaddBorderType pad_type, uint8_t fill_b_or_gray, uint8_t fill_g,
|
||||
uint8_t fill_r) {
|
||||
dst.Init(src.width_ + left + right, src.height_ + top + bottom, src.channel_, src.data_type_);
|
||||
const T *src_start_p = src;
|
||||
T *dst_start_p = dst;
|
||||
// padd top
|
||||
for (int h = 0; h < top; h++) {
|
||||
for (int w = 0; w < dst.width_; w++) {
|
||||
uint32_t index = (h * dst.width_ + w) * dst.channel_;
|
||||
if (dst.channel_ == 1) {
|
||||
dst_start_p[index] = fill_b_or_gray;
|
||||
} else if (dst.channel_ == 3) {
|
||||
dst_start_p[index] = fill_b_or_gray;
|
||||
dst_start_p[index + 1] = fill_g;
|
||||
dst_start_p[index + 2] = fill_r;
|
||||
} else {
|
||||
}
|
||||
std::vector<uint8_t> row_buffer(dst.width_ * dst.channel_);
|
||||
uint8_t *const_ptr = row_buffer.data();
|
||||
int src_step = src.width_ * dst.channel_;
|
||||
int dst_step = dst.width_ * dst.channel_;
|
||||
if (dst.channel_ == 1) {
|
||||
for (int i = 0; i < dst_step; i++) {
|
||||
const_ptr[i] = fill_b_or_gray;
|
||||
}
|
||||
}
|
||||
// padd bottom
|
||||
for (int h = dst.height_ - bottom; h < dst.height_; h++) {
|
||||
for (int w = 0; w < dst.width_; w++) {
|
||||
uint32_t index = (h * dst.width_ + w) * dst.channel_;
|
||||
if (dst.channel_ == 1) {
|
||||
dst_start_p[index] = fill_b_or_gray;
|
||||
} else if (dst.channel_ == 3) {
|
||||
dst_start_p[index] = fill_b_or_gray;
|
||||
dst_start_p[index + 1] = fill_g;
|
||||
dst_start_p[index + 2] = fill_r;
|
||||
} else {
|
||||
}
|
||||
} else if (dst.channel_ == 3) {
|
||||
for (int i = 0; i < dst.width_; i++) {
|
||||
const_ptr[i * dst.channel_] = fill_b_or_gray;
|
||||
const_ptr[i * dst.channel_ + 1] = fill_g;
|
||||
const_ptr[i * dst.channel_ + 2] = fill_r;
|
||||
}
|
||||
}
|
||||
|
||||
// padd left
|
||||
for (int h = top; h < dst.height_ - bottom; h++) {
|
||||
for (int w = 0; w < left; w++) {
|
||||
uint32_t index = (h * dst.width_ + w) * dst.channel_;
|
||||
if (dst.channel_ == 1) {
|
||||
dst_start_p[index] = fill_b_or_gray;
|
||||
} else if (dst.channel_ == 3) {
|
||||
dst_start_p[index] = fill_b_or_gray;
|
||||
dst_start_p[index + 1] = fill_g;
|
||||
dst_start_p[index + 2] = fill_r;
|
||||
} else {
|
||||
}
|
||||
}
|
||||
uint8_t *dst_ptr = reinterpret_cast<uint8_t *>(dst.data_ptr_);
|
||||
uint8_t *src_ptr = reinterpret_cast<uint8_t *>(src.data_ptr_);
|
||||
for (int i = 0; i < top; i++) {
|
||||
memcpy(dst_ptr + i * dst_step, const_ptr, dst_step);
|
||||
}
|
||||
|
||||
// padd right
|
||||
for (int h = top; h < dst.height_ - bottom; h++) {
|
||||
for (int w = dst.width_ - right; w < dst.width_; w++) {
|
||||
uint32_t index = (h * dst.width_ + w) * dst.channel_;
|
||||
if (dst.channel_ == 1) {
|
||||
dst_start_p[index] = fill_b_or_gray;
|
||||
} else if (dst.channel_ == 3) {
|
||||
dst_start_p[index] = fill_b_or_gray;
|
||||
dst_start_p[index + 1] = fill_g;
|
||||
dst_start_p[index + 2] = fill_r;
|
||||
} else {
|
||||
}
|
||||
}
|
||||
int left_size = left * dst.channel_;
|
||||
int right_size = right * dst.channel_;
|
||||
uint8_t *dst_raw_data = dst_ptr + top * dst_step + left_size;
|
||||
for (int i = 0; i < src.width_; i++, dst_raw_data += dst_step, src_ptr += src_step) {
|
||||
memcpy(dst_raw_data, src_ptr, src_step);
|
||||
memcpy(dst_raw_data - left_size, const_ptr, left_size);
|
||||
memcpy(dst_raw_data + src_step, const_ptr, right_size);
|
||||
}
|
||||
// image data
|
||||
dst_start_p = dst_start_p + (top * dst.width_ + left) * dst.channel_;
|
||||
for (int i_h = 0; i_h < src.height_; i_h++) {
|
||||
const T *src_index_p = src_start_p + i_h * src.width_ * src.channel_;
|
||||
T *dst_index_p = dst_start_p + i_h * dst.width_ * dst.channel_;
|
||||
(void)memcpy(dst_index_p, src_index_p, src.width_ * src.channel_ * sizeof(T));
|
||||
|
||||
for (int i = dst.height_ - bottom; i < dst.height_; i++) {
|
||||
memcpy(dst_ptr + i * dst_step, const_ptr, dst_step);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -758,6 +722,15 @@ bool Pad(const LiteMat &src, LiteMat &dst, int top, int bottom, int left, int ri
|
|||
if (src.IsEmpty()) {
|
||||
return false;
|
||||
}
|
||||
int dst_width = src.width_ + left + right;
|
||||
int dst_height = src.height_ + top + bottom;
|
||||
if (dst.IsEmpty()) {
|
||||
dst.Init(dst_width, dst_height, src.channel_, src.data_type_);
|
||||
} else if (dst.width_ != dst_width || dst.height_ != dst_height || src.channel_ != dst.channel_) {
|
||||
return false;
|
||||
} else if (src.data_type_ != dst.data_type_) {
|
||||
return false;
|
||||
}
|
||||
if (pad_type == PADD_BORDER_CONSTANT && src.data_type_ == LDataType::FLOAT32) {
|
||||
PadWithConstant<float>(src, dst, top, bottom, left, right, pad_type, fill_b_or_gray, fill_g, fill_r);
|
||||
} else if (pad_type == PADD_BORDER_CONSTANT && src.data_type_ == LDataType::UINT8) {
|
||||
|
@ -921,167 +894,5 @@ bool Affine(LiteMat &src, LiteMat &out_img, const double M[6], std::vector<size_
|
|||
return ImplementAffine(src, out_img, M, dsize, borderValue);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void SubtractImpl(const T *src1_ptr, const T *src2_ptr, T *dst, size_t total_size) {
|
||||
for (size_t i = 0; i < total_size; i++) {
|
||||
dst[i] = src1_ptr[i] - src2_ptr[i];
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void SubtractImpl(const uint8_t *src1_ptr, const uint8_t *src2_ptr, uint8_t *dst, size_t total_size) {
|
||||
for (size_t i = 0; i < total_size; i++) {
|
||||
int val = static_cast<int>(src1_ptr[i]) - src2_ptr[i];
|
||||
dst[i] =
|
||||
std::max<int>(std::numeric_limits<uint8_t>::min(), std::min<int>(std::numeric_limits<uint8_t>::max(), val));
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void SubtractImpl(const uint16_t *src1_ptr, const uint16_t *src2_ptr, uint16_t *dst, size_t total_size) {
|
||||
for (size_t i = 0; i < total_size; i++) {
|
||||
int val = static_cast<int>(src1_ptr[i]) - src2_ptr[i];
|
||||
dst[i] =
|
||||
std::max<int>(std::numeric_limits<uint16_t>::min(), std::min<int>(std::numeric_limits<uint16_t>::max(), val));
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void SubtractImpl(const uint32_t *src1_ptr, const uint32_t *src2_ptr, uint32_t *dst, size_t total_size) {
|
||||
for (size_t i = 0; i < total_size; i++) {
|
||||
int64_t val = static_cast<int64_t>(src1_ptr[i]) - src2_ptr[i];
|
||||
dst[i] = std::max<int64_t>(std::numeric_limits<uint32_t>::min(),
|
||||
std::min<int64_t>(std::numeric_limits<uint32_t>::max(), val));
|
||||
}
|
||||
}
|
||||
|
||||
bool Subtract(const LiteMat &src1, const LiteMat &src2, LiteMat &dst) {
|
||||
if (src1.width_ != src2.width_ || src1.height_ != src2.height_ || src1.channel_ != src2.channel_) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (src1.data_type_ != src2.data_type_) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (dst.IsEmpty()) {
|
||||
dst.Init(src1.width_, src1.height_, src1.channel_, src1.data_type_);
|
||||
} else if (src1.width_ != dst.width_ || src1.height_ != dst.height_ || src1.channel_ != dst.channel_) {
|
||||
return false;
|
||||
} else if (src1.data_type_ != dst.data_type_) {
|
||||
return false;
|
||||
}
|
||||
|
||||
size_t total_size = src1.height_ * src1.width_ * src1.channel_;
|
||||
|
||||
if (src1.data_type_ == LDataType::BOOL) {
|
||||
SubtractImpl<bool>(src1, src2, dst, total_size);
|
||||
} else if (src1.data_type_ == LDataType::INT8) {
|
||||
SubtractImpl<int8_t>(src1, src2, dst, total_size);
|
||||
} else if (src1.data_type_ == LDataType::UINT8) {
|
||||
SubtractImpl<uint8_t>(src1, src2, dst, total_size);
|
||||
} else if (src1.data_type_ == LDataType::INT16) {
|
||||
SubtractImpl<int16_t>(src1, src2, dst, total_size);
|
||||
} else if (src1.data_type_ == LDataType::UINT16) {
|
||||
SubtractImpl<uint16_t>(src1, src2, dst, total_size);
|
||||
} else if (src1.data_type_ == LDataType::INT32) {
|
||||
SubtractImpl<int32_t>(src1, src2, dst, total_size);
|
||||
} else if (src1.data_type_ == LDataType::UINT32) {
|
||||
SubtractImpl<uint32_t>(src1, src2, dst, total_size);
|
||||
} else if (src1.data_type_ == LDataType::INT64) {
|
||||
SubtractImpl<int64_t>(src1, src2, dst, total_size);
|
||||
} else if (src1.data_type_ == LDataType::UINT64) {
|
||||
SubtractImpl<uint64_t>(src1, src2, dst, total_size);
|
||||
} else if (src1.data_type_ == LDataType::FLOAT32) {
|
||||
SubtractImpl<float>(src1, src2, dst, total_size);
|
||||
} else if (src1.data_type_ == LDataType::FLOAT64) {
|
||||
SubtractImpl<double>(src1, src2, dst, total_size);
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void DivideImpl(const T *src1_ptr, const T *src2_ptr, T *dst, size_t total_size) {
|
||||
for (size_t i = 0; i < total_size; i++) {
|
||||
dst[i] = src1_ptr[i] / (src2_ptr[i] + std::numeric_limits<float>::min());
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void DivideImpl(const uint8_t *src1_ptr, const uint8_t *src2_ptr, uint8_t *dst, size_t total_size) {
|
||||
for (size_t i = 0; i < total_size; i++) {
|
||||
int val = std::round(src1_ptr[i] / (src2_ptr[i] + std::numeric_limits<float>::min()));
|
||||
dst[i] =
|
||||
std::max<int>(std::numeric_limits<uint8_t>::min(), std::min<int>(std::numeric_limits<uint8_t>::max(), val));
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void DivideImpl(const uint16_t *src1_ptr, const uint16_t *src2_ptr, uint16_t *dst, size_t total_size) {
|
||||
for (size_t i = 0; i < total_size; i++) {
|
||||
int val = std::round(src1_ptr[i] / (src2_ptr[i] + std::numeric_limits<float>::min()));
|
||||
dst[i] =
|
||||
std::max<int>(std::numeric_limits<uint16_t>::min(), std::min<int>(std::numeric_limits<uint16_t>::max(), val));
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void DivideImpl(const uint32_t *src1_ptr, const uint32_t *src2_ptr, uint32_t *dst, size_t total_size) {
|
||||
for (size_t i = 0; i < total_size; i++) {
|
||||
int64_t val = std::round(src1_ptr[i] / (src2_ptr[i] + std::numeric_limits<double>::min()));
|
||||
dst[i] = std::max<int64_t>(std::numeric_limits<uint32_t>::min(),
|
||||
std::min<int64_t>(std::numeric_limits<uint32_t>::max(), val));
|
||||
}
|
||||
}
|
||||
|
||||
bool Divide(const LiteMat &src1, const LiteMat &src2, LiteMat &dst) {
|
||||
if (src1.width_ != src2.width_ || src1.height_ != src2.height_ || src1.channel_ != src2.channel_) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (src1.data_type_ != src2.data_type_) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (dst.IsEmpty()) {
|
||||
dst.Init(src1.width_, src1.height_, src1.channel_, src1.data_type_);
|
||||
} else if (src1.width_ != dst.width_ || src1.height_ != dst.height_ || src1.channel_ != dst.channel_) {
|
||||
return false;
|
||||
} else if (src1.data_type_ != dst.data_type_) {
|
||||
return false;
|
||||
}
|
||||
|
||||
size_t total_size = src1.height_ * src1.width_ * src1.channel_;
|
||||
|
||||
if (src1.data_type_ == LDataType::INT8) {
|
||||
DivideImpl<int8_t>(src1, src2, dst, total_size);
|
||||
} else if (src1.data_type_ == LDataType::UINT8) {
|
||||
DivideImpl<uint8_t>(src1, src2, dst, total_size);
|
||||
} else if (src1.data_type_ == LDataType::INT16) {
|
||||
DivideImpl<int16_t>(src1, src2, dst, total_size);
|
||||
} else if (src1.data_type_ == LDataType::UINT16) {
|
||||
DivideImpl<uint16_t>(src1, src2, dst, total_size);
|
||||
} else if (src1.data_type_ == LDataType::INT32) {
|
||||
DivideImpl<int32_t>(src1, src2, dst, total_size);
|
||||
} else if (src1.data_type_ == LDataType::UINT32) {
|
||||
DivideImpl<uint32_t>(src1, src2, dst, total_size);
|
||||
} else if (src1.data_type_ == LDataType::INT64) {
|
||||
DivideImpl<int64_t>(src1, src2, dst, total_size);
|
||||
} else if (src1.data_type_ == LDataType::UINT64) {
|
||||
DivideImpl<uint64_t>(src1, src2, dst, total_size);
|
||||
} else if (src1.data_type_ == LDataType::FLOAT32) {
|
||||
DivideImpl<float>(src1, src2, dst, total_size);
|
||||
} else if (src1.data_type_ == LDataType::FLOAT64) {
|
||||
DivideImpl<double>(src1, src2, dst, total_size);
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -107,12 +107,6 @@ void ConvertBoxes(std::vector<std::vector<float>> &boxes, const std::vector<std:
|
|||
std::vector<int> ApplyNms(const std::vector<std::vector<float>> &all_boxes, std::vector<float> &all_scores, float thres,
|
||||
int max_boxes);
|
||||
|
||||
/// \brief Calculates the difference between the two images for each element
|
||||
bool Subtract(const LiteMat &src1, const LiteMat &src2, LiteMat &dst);
|
||||
|
||||
/// \brief Calculates the division between the two images for each element
|
||||
bool Divide(const LiteMat &src1, const LiteMat &src2, LiteMat &dst);
|
||||
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // IMAGE_PROCESS_H_
|
||||
|
|
|
@ -15,6 +15,16 @@
|
|||
*/
|
||||
#include "minddata/dataset/kernels/image/lite_cv/lite_mat.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
#include <cmath>
|
||||
#ifdef ENABLE_ANDROID
|
||||
#if defined(__arm__) || defined(__aarch64__) || defined(_M_ARM) || defined(_M_ARM64)
|
||||
#define USE_NEON
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
|
@ -221,5 +231,258 @@ void LiteMat::AlignFree(void *ptr) {
|
|||
|
||||
inline void LiteMat::InitElemSize(LDataType data_type) { elem_size_ = data_type.SizeInBytes(); }
|
||||
|
||||
template <typename T>
|
||||
inline void SubtractImpl(const T *src0, const T *src1, T *dst, int64_t total_size) {
|
||||
for (int64_t i = 0; i < total_size; i++) {
|
||||
dst[i] = src0[i] - src1[i];
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void SubtractImpl(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, int64_t total_size) {
|
||||
int64_t x = 0;
|
||||
#ifdef USE_NEON
|
||||
const int64_t step = 32;
|
||||
for (; x <= total_size - step; x += step) {
|
||||
uint8x16_t v_src00 = vld1q_u8(src0 + x);
|
||||
uint8x16_t v_src01 = vld1q_u8(src0 + x + 16);
|
||||
uint8x16_t v_src10 = vld1q_u8(src1 + x);
|
||||
uint8x16_t v_src11 = vld1q_u8(src1 + x + 16);
|
||||
uint8x16_t v_dst;
|
||||
|
||||
v_dst = vqsubq_u8(v_src00, v_src10);
|
||||
vst1q_u8(dst + x, v_dst);
|
||||
|
||||
v_dst = vqsubq_u8(v_src01, v_src11);
|
||||
vst1q_u8(dst + x + 16, v_dst);
|
||||
}
|
||||
#endif
|
||||
for (; x < total_size; x++) {
|
||||
int32_t val = static_cast<int32_t>(src0[x]) - src1[x];
|
||||
dst[x] = std::max<int32_t>(std::numeric_limits<uint8_t>::min(),
|
||||
std::min<int32_t>(std::numeric_limits<uint8_t>::max(), val));
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void SubtractImpl(const uint16_t *src0, const uint16_t *src1, uint16_t *dst, int64_t total_size) {
|
||||
for (int64_t i = 0; i < total_size; i++) {
|
||||
int32_t val = static_cast<int32_t>(src0[i]) - src1[i];
|
||||
dst[i] = std::max<int32_t>(std::numeric_limits<uint16_t>::min(),
|
||||
std::min<int32_t>(std::numeric_limits<uint16_t>::max(), val));
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void SubtractImpl(const uint32_t *src0, const uint32_t *src1, uint32_t *dst, int64_t total_size) {
|
||||
for (int64_t i = 0; i < total_size; i++) {
|
||||
int64_t val = static_cast<int64_t>(src0[i]) - src1[i];
|
||||
dst[i] = std::max<int64_t>(std::numeric_limits<uint32_t>::min(),
|
||||
std::min<int64_t>(std::numeric_limits<uint32_t>::max(), val));
|
||||
}
|
||||
}
|
||||
|
||||
bool Subtract(const LiteMat &src_a, const LiteMat &src_b, LiteMat *dst) {
|
||||
if (src_a.width_ != src_b.width_ || src_a.height_ != src_b.height_ || src_a.channel_ != src_b.channel_) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (src_a.data_type_ != src_b.data_type_) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (dst->IsEmpty()) {
|
||||
dst->Init(src_a.width_, src_a.height_, src_a.channel_, src_a.data_type_);
|
||||
} else if (src_a.width_ != dst->width_ || src_a.height_ != dst->height_ || src_a.channel_ != dst->channel_) {
|
||||
return false;
|
||||
} else if (src_a.data_type_ != dst->data_type_) {
|
||||
return false;
|
||||
}
|
||||
|
||||
int64_t total_size = src_a.height_ * src_a.width_ * src_a.channel_;
|
||||
if (src_a.data_type_ == LDataType::BOOL) {
|
||||
SubtractImpl<bool>(src_a, src_b, *dst, total_size);
|
||||
} else if (src_a.data_type_ == LDataType::INT8) {
|
||||
SubtractImpl<int8_t>(src_a, src_b, *dst, total_size);
|
||||
} else if (src_a.data_type_ == LDataType::UINT8) {
|
||||
SubtractImpl<uint8_t>(src_a, src_b, *dst, total_size);
|
||||
} else if (src_a.data_type_ == LDataType::INT16) {
|
||||
SubtractImpl<int16_t>(src_a, src_b, *dst, total_size);
|
||||
} else if (src_a.data_type_ == LDataType::UINT16) {
|
||||
SubtractImpl<uint16_t>(src_a, src_b, *dst, total_size);
|
||||
} else if (src_a.data_type_ == LDataType::INT32) {
|
||||
SubtractImpl<int32_t>(src_a, src_b, *dst, total_size);
|
||||
} else if (src_a.data_type_ == LDataType::UINT32) {
|
||||
SubtractImpl<uint32_t>(src_a, src_b, *dst, total_size);
|
||||
} else if (src_a.data_type_ == LDataType::INT64) {
|
||||
SubtractImpl<int64_t>(src_a, src_b, *dst, total_size);
|
||||
} else if (src_a.data_type_ == LDataType::UINT64) {
|
||||
SubtractImpl<uint64_t>(src_a, src_b, *dst, total_size);
|
||||
} else if (src_a.data_type_ == LDataType::FLOAT32) {
|
||||
SubtractImpl<float>(src_a, src_b, *dst, total_size);
|
||||
} else if (src_a.data_type_ == LDataType::FLOAT64) {
|
||||
SubtractImpl<double>(src_a, src_b, *dst, total_size);
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
#ifdef USE_NEON
|
||||
inline float32x4_t reciprocal_simd(float32x4_t val) {
|
||||
// get an initial estimate of 1/val
|
||||
float32x4_t reciprocal = vrecpeq_f32(val);
|
||||
|
||||
// use Newton-Raphson steps to refine the estimate
|
||||
reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
|
||||
reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
|
||||
return reciprocal;
|
||||
}
|
||||
|
||||
inline float32x4_t round_simd(const float32x4_t &v) {
|
||||
const int32x4_t signMask = vdupq_n_s32(1U << 31);
|
||||
const int32x4_t half = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
|
||||
float32x4_t v_addition = vreinterpretq_f32_s32(vorrq_s32(half, vandq_s32(signMask, vreinterpretq_s32_f32(v))));
|
||||
return vaddq_f32(v, v_addition);
|
||||
}
|
||||
#endif
|
||||
|
||||
template <typename T>
|
||||
inline void DivideImpl(const T *src0, const T *src1, T *dst, int64_t total_size) {
|
||||
for (size_t i = 0; i < total_size; i++) {
|
||||
dst[i] = src1[i] ? src0[i] / src1[i] : 0;
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void DivideImpl(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, int64_t total_size) {
|
||||
int64_t x = 0;
|
||||
#ifdef USE_NEON
|
||||
const int64_t step = 16;
|
||||
for (; x <= total_size - step; x += step) {
|
||||
__builtin_prefetch(reinterpret_cast<const char *>(src0 + x) + 32 * 10);
|
||||
__builtin_prefetch(reinterpret_cast<const char *>(src1 + x) + 32 * 10);
|
||||
|
||||
uint8x16_t v_a = vld1q_u8(src0 + x);
|
||||
uint8x16_t v_b = vld1q_u8(src1 + x);
|
||||
uint8x16_t v_mask = vtstq_u8(v_b, v_b);
|
||||
|
||||
uint16x8_t va_l_16x8 = vmovl_u8(vget_low_u8(v_a));
|
||||
uint16x8_t va_h_16x8 = vmovl_u8(vget_high_u8(v_a));
|
||||
uint16x8_t vb_l_16x8 = vmovl_u8(vget_low_u8(v_b));
|
||||
uint16x8_t vb_h_16x8 = vmovl_u8(vget_high_u8(v_b));
|
||||
|
||||
float32x4_t va_ll_f32x4 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(va_l_16x8)));
|
||||
float32x4_t va_lh_f32x4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(va_l_16x8)));
|
||||
float32x4_t va_hl_f32x4 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(va_h_16x8)));
|
||||
float32x4_t va_hh_f32x4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(va_h_16x8)));
|
||||
float32x4_t vb_ll_f32x4 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vb_l_16x8)));
|
||||
float32x4_t vb_lh_f32x4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vb_l_16x8)));
|
||||
float32x4_t vb_hl_f32x4 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vb_h_16x8)));
|
||||
float32x4_t vb_hh_f32x4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vb_h_16x8)));
|
||||
|
||||
float32x4_t vb_ll_re_f32x4 = reciprocal_simd(vb_ll_f32x4);
|
||||
float32x4_t vb_lh_re_f32x4 = reciprocal_simd(vb_lh_f32x4);
|
||||
float32x4_t vb_hl_re_f32x4 = reciprocal_simd(vb_hl_f32x4);
|
||||
float32x4_t vb_hh_re_f32x4 = reciprocal_simd(vb_hh_f32x4);
|
||||
|
||||
float32x4_t dst_ll_f32x4 = round_simd(vmulq_f32(va_ll_f32x4, vb_ll_re_f32x4));
|
||||
float32x4_t dst_lh_f32x4 = round_simd(vmulq_f32(va_lh_f32x4, vb_lh_re_f32x4));
|
||||
float32x4_t dst_hl_f32x4 = round_simd(vmulq_f32(va_hl_f32x4, vb_hl_re_f32x4));
|
||||
float32x4_t dst_hh_f32x4 = round_simd(vmulq_f32(va_hh_f32x4, vb_hh_re_f32x4));
|
||||
|
||||
uint32x4_t dst_ll_32x4 = vcvtq_u32_f32(dst_ll_f32x4);
|
||||
uint32x4_t dst_lh_32x4 = vcvtq_u32_f32(dst_lh_f32x4);
|
||||
uint32x4_t dst_hl_32x4 = vcvtq_u32_f32(dst_hl_f32x4);
|
||||
uint32x4_t dst_hh_32x4 = vcvtq_u32_f32(dst_hh_f32x4);
|
||||
|
||||
uint16x4_t dst_ll_16x4 = vqmovn_u32(dst_ll_32x4);
|
||||
uint16x4_t dst_lh_16x4 = vqmovn_u32(dst_lh_32x4);
|
||||
uint16x4_t dst_hl_16x4 = vqmovn_u32(dst_hl_32x4);
|
||||
uint16x4_t dst_hh_16x4 = vqmovn_u32(dst_hh_32x4);
|
||||
|
||||
uint16x8_t dst_l_16x8 = vcombine_u16(dst_ll_16x4, dst_lh_16x4);
|
||||
uint16x8_t dst_h_16x8 = vcombine_u16(dst_hl_16x4, dst_hh_16x4);
|
||||
|
||||
int8x8_t dst_l_8x8 = vqmovn_u16(dst_l_16x8);
|
||||
int8x8_t dst_h_8x8 = vqmovn_u16(dst_h_16x8);
|
||||
int8x16_t dst_8x16 = vcombine_u8(dst_l_8x8, dst_h_8x8);
|
||||
|
||||
dst_8x16 = vandq_u8(dst_8x16, v_mask);
|
||||
vst1q_u8(dst + x, dst_8x16);
|
||||
}
|
||||
#endif
|
||||
for (; x < total_size; x++) {
|
||||
int32_t val = src1[x] ? std::round(src0[x] / src1[x]) : 0;
|
||||
dst[x] = std::max<int32_t>(std::numeric_limits<uint8_t>::min(),
|
||||
std::min<int32_t>(std::numeric_limits<uint8_t>::max(), val));
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void DivideImpl(const uint16_t *src0, const uint16_t *src1, uint16_t *dst, int64_t total_size) {
|
||||
for (size_t i = 0; i < total_size; i++) {
|
||||
int32_t val = src1[i] ? std::round(src0[i] / src1[i]) : 0;
|
||||
dst[i] = std::max<int32_t>(std::numeric_limits<uint16_t>::min(),
|
||||
std::min<int32_t>(std::numeric_limits<uint16_t>::max(), val));
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void DivideImpl(const uint32_t *src0, const uint32_t *src1, uint32_t *dst, int64_t total_size) {
|
||||
for (size_t i = 0; i < total_size; i++) {
|
||||
int64_t val = src1[i] ? std::round(src0[i] / src1[i]) : 0;
|
||||
dst[i] = std::max<int64_t>(std::numeric_limits<uint32_t>::min(),
|
||||
std::min<int64_t>(std::numeric_limits<uint32_t>::max(), val));
|
||||
}
|
||||
}
|
||||
|
||||
bool Divide(const LiteMat &src_a, const LiteMat &src_b, LiteMat *dst) {
|
||||
if (src_a.width_ != src_b.width_ || src_a.height_ != src_b.height_ || src_a.channel_ != src_b.channel_) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (src_a.data_type_ != src_b.data_type_) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (dst->IsEmpty()) {
|
||||
dst->Init(src_a.width_, src_a.height_, src_a.channel_, src_a.data_type_);
|
||||
} else if (src_a.width_ != dst->width_ || src_a.height_ != dst->height_ || src_a.channel_ != dst->channel_) {
|
||||
return false;
|
||||
} else if (src_a.data_type_ != dst->data_type_) {
|
||||
return false;
|
||||
}
|
||||
|
||||
int64_t total_size = src_a.height_ * src_a.width_ * src_a.channel_;
|
||||
if (src_a.data_type_ == LDataType::BOOL) {
|
||||
DivideImpl<bool>(src_a, src_b, *dst, total_size);
|
||||
} else if (src_a.data_type_ == LDataType::INT8) {
|
||||
DivideImpl<int8_t>(src_a, src_b, *dst, total_size);
|
||||
} else if (src_a.data_type_ == LDataType::UINT8) {
|
||||
DivideImpl<uint8_t>(src_a, src_b, *dst, total_size);
|
||||
} else if (src_a.data_type_ == LDataType::INT16) {
|
||||
DivideImpl<int16_t>(src_a, src_b, *dst, total_size);
|
||||
} else if (src_a.data_type_ == LDataType::UINT16) {
|
||||
DivideImpl<uint16_t>(src_a, src_b, *dst, total_size);
|
||||
} else if (src_a.data_type_ == LDataType::INT32) {
|
||||
DivideImpl<int32_t>(src_a, src_b, *dst, total_size);
|
||||
} else if (src_a.data_type_ == LDataType::UINT32) {
|
||||
DivideImpl<uint32_t>(src_a, src_b, *dst, total_size);
|
||||
} else if (src_a.data_type_ == LDataType::INT64) {
|
||||
DivideImpl<int64_t>(src_a, src_b, *dst, total_size);
|
||||
} else if (src_a.data_type_ == LDataType::UINT64) {
|
||||
DivideImpl<uint64_t>(src_a, src_b, *dst, total_size);
|
||||
} else if (src_a.data_type_ == LDataType::FLOAT32) {
|
||||
DivideImpl<float>(src_a, src_b, *dst, total_size);
|
||||
} else if (src_a.data_type_ == LDataType::FLOAT64) {
|
||||
DivideImpl<double>(src_a, src_b, *dst, total_size);
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -247,6 +247,13 @@ class LiteMat {
|
|||
LDataType data_type_;
|
||||
int *ref_count_;
|
||||
};
|
||||
|
||||
/// \brief Calculates the difference between the two images for each element
|
||||
bool Subtract(const LiteMat &src_a, const LiteMat &src_b, LiteMat *dst);
|
||||
|
||||
/// \brief Calculates the division between the two images for each element
|
||||
bool Divide(const LiteMat &src_a, const LiteMat &src_b, LiteMat *dst);
|
||||
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // MINI_MAT_H_
|
||||
|
|
|
@ -538,7 +538,7 @@ TEST_F(MindDataImageProcess, TestSubtractUint8) {
|
|||
static_cast<UINT8_C1 *>(expect_uint8.data_ptr_)[i] = 1;
|
||||
}
|
||||
LiteMat dst_uint8;
|
||||
EXPECT_TRUE(Subtract(src1_uint8, src2_uint8, dst_uint8));
|
||||
EXPECT_TRUE(Subtract(src1_uint8, src2_uint8, &dst_uint8));
|
||||
for (size_t i = 0; i < cols; i++) {
|
||||
EXPECT_EQ(static_cast<UINT8_C1 *>(expect_uint8.data_ptr_)[i].c1,
|
||||
static_cast<UINT8_C1 *>(dst_uint8.data_ptr_)[i].c1);
|
||||
|
@ -557,7 +557,7 @@ TEST_F(MindDataImageProcess, TestSubtractInt8) {
|
|||
static_cast<INT8_C1 *>(expect_int8.data_ptr_)[i] = -1;
|
||||
}
|
||||
LiteMat dst_int8;
|
||||
EXPECT_TRUE(Subtract(src1_int8, src2_int8, dst_int8));
|
||||
EXPECT_TRUE(Subtract(src1_int8, src2_int8, &dst_int8));
|
||||
for (size_t i = 0; i < cols; i++) {
|
||||
EXPECT_EQ(static_cast<INT8_C1 *>(expect_int8.data_ptr_)[i].c1, static_cast<INT8_C1 *>(dst_int8.data_ptr_)[i].c1);
|
||||
}
|
||||
|
@ -575,7 +575,7 @@ TEST_F(MindDataImageProcess, TestSubtractUInt16) {
|
|||
static_cast<UINT16_C1 *>(expect_uint16.data_ptr_)[i] = 0;
|
||||
}
|
||||
LiteMat dst_uint16;
|
||||
EXPECT_TRUE(Subtract(src1_uint16, src2_uint16, dst_uint16));
|
||||
EXPECT_TRUE(Subtract(src1_uint16, src2_uint16, &dst_uint16));
|
||||
for (size_t i = 0; i < cols; i++) {
|
||||
EXPECT_EQ(static_cast<UINT16_C1 *>(expect_uint16.data_ptr_)[i].c1,
|
||||
static_cast<UINT16_C1 *>(dst_uint16.data_ptr_)[i].c1);
|
||||
|
@ -594,7 +594,7 @@ TEST_F(MindDataImageProcess, TestSubtractInt16) {
|
|||
static_cast<INT16_C1 *>(expect_int16.data_ptr_)[i] = -1;
|
||||
}
|
||||
LiteMat dst_int16;
|
||||
EXPECT_TRUE(Subtract(src1_int16, src2_int16, dst_int16));
|
||||
EXPECT_TRUE(Subtract(src1_int16, src2_int16, &dst_int16));
|
||||
for (size_t i = 0; i < cols; i++) {
|
||||
EXPECT_EQ(static_cast<INT16_C1 *>(expect_int16.data_ptr_)[i].c1,
|
||||
static_cast<INT16_C1 *>(dst_int16.data_ptr_)[i].c1);
|
||||
|
@ -613,7 +613,7 @@ TEST_F(MindDataImageProcess, TestSubtractUInt32) {
|
|||
static_cast<UINT32_C1 *>(expect_uint32.data_ptr_)[i] = 0;
|
||||
}
|
||||
LiteMat dst_uint32;
|
||||
EXPECT_TRUE(Subtract(src1_uint32, src2_uint32, dst_uint32));
|
||||
EXPECT_TRUE(Subtract(src1_uint32, src2_uint32, &dst_uint32));
|
||||
for (size_t i = 0; i < cols; i++) {
|
||||
EXPECT_EQ(static_cast<UINT32_C1 *>(expect_uint32.data_ptr_)[i].c1,
|
||||
static_cast<UINT32_C1 *>(dst_uint32.data_ptr_)[i].c1);
|
||||
|
@ -632,7 +632,7 @@ TEST_F(MindDataImageProcess, TestSubtractInt32) {
|
|||
static_cast<INT32_C1 *>(expect_int32.data_ptr_)[i] = -2;
|
||||
}
|
||||
LiteMat dst_int32;
|
||||
EXPECT_TRUE(Subtract(src1_int32, src2_int32, dst_int32));
|
||||
EXPECT_TRUE(Subtract(src1_int32, src2_int32, &dst_int32));
|
||||
for (size_t i = 0; i < cols; i++) {
|
||||
EXPECT_EQ(static_cast<INT32_C1 *>(expect_int32.data_ptr_)[i].c1,
|
||||
static_cast<INT32_C1 *>(dst_int32.data_ptr_)[i].c1);
|
||||
|
@ -651,7 +651,7 @@ TEST_F(MindDataImageProcess, TestSubtractFloat) {
|
|||
static_cast<FLOAT32_C1 *>(expect_float.data_ptr_)[i] = -2.3;
|
||||
}
|
||||
LiteMat dst_float;
|
||||
EXPECT_TRUE(Subtract(src1_float, src2_float, dst_float));
|
||||
EXPECT_TRUE(Subtract(src1_float, src2_float, &dst_float));
|
||||
for (size_t i = 0; i < cols; i++) {
|
||||
EXPECT_FLOAT_EQ(static_cast<FLOAT32_C1 *>(expect_float.data_ptr_)[i].c1,
|
||||
static_cast<FLOAT32_C1 *>(dst_float.data_ptr_)[i].c1);
|
||||
|
@ -670,7 +670,7 @@ TEST_F(MindDataImageProcess, TestDivideUint8) {
|
|||
static_cast<UINT8_C1 *>(expect_uint8.data_ptr_)[i] = 2;
|
||||
}
|
||||
LiteMat dst_uint8;
|
||||
EXPECT_TRUE(Divide(src1_uint8, src2_uint8, dst_uint8));
|
||||
EXPECT_TRUE(Divide(src1_uint8, src2_uint8, &dst_uint8));
|
||||
for (size_t i = 0; i < cols; i++) {
|
||||
EXPECT_EQ(static_cast<UINT8_C1 *>(expect_uint8.data_ptr_)[i].c1,
|
||||
static_cast<UINT8_C1 *>(dst_uint8.data_ptr_)[i].c1);
|
||||
|
@ -689,7 +689,7 @@ TEST_F(MindDataImageProcess, TestDivideInt8) {
|
|||
static_cast<INT8_C1 *>(expect_int8.data_ptr_)[i] = -2;
|
||||
}
|
||||
LiteMat dst_int8;
|
||||
EXPECT_TRUE(Divide(src1_int8, src2_int8, dst_int8));
|
||||
EXPECT_TRUE(Divide(src1_int8, src2_int8, &dst_int8));
|
||||
for (size_t i = 0; i < cols; i++) {
|
||||
EXPECT_EQ(static_cast<INT8_C1 *>(expect_int8.data_ptr_)[i].c1, static_cast<INT8_C1 *>(dst_int8.data_ptr_)[i].c1);
|
||||
}
|
||||
|
@ -707,7 +707,7 @@ TEST_F(MindDataImageProcess, TestDivideUInt16) {
|
|||
static_cast<UINT16_C1 *>(expect_uint16.data_ptr_)[i] = 2;
|
||||
}
|
||||
LiteMat dst_uint16;
|
||||
EXPECT_TRUE(Divide(src1_uint16, src2_uint16, dst_uint16));
|
||||
EXPECT_TRUE(Divide(src1_uint16, src2_uint16, &dst_uint16));
|
||||
for (size_t i = 0; i < cols; i++) {
|
||||
EXPECT_EQ(static_cast<UINT16_C1 *>(expect_uint16.data_ptr_)[i].c1,
|
||||
static_cast<UINT16_C1 *>(dst_uint16.data_ptr_)[i].c1);
|
||||
|
@ -726,7 +726,7 @@ TEST_F(MindDataImageProcess, TestDivideInt16) {
|
|||
static_cast<INT16_C1 *>(expect_int16.data_ptr_)[i] = -10000;
|
||||
}
|
||||
LiteMat dst_int16;
|
||||
EXPECT_TRUE(Divide(src1_int16, src2_int16, dst_int16));
|
||||
EXPECT_TRUE(Divide(src1_int16, src2_int16, &dst_int16));
|
||||
for (size_t i = 0; i < cols; i++) {
|
||||
EXPECT_EQ(static_cast<INT16_C1 *>(expect_int16.data_ptr_)[i].c1,
|
||||
static_cast<INT16_C1 *>(dst_int16.data_ptr_)[i].c1);
|
||||
|
@ -745,7 +745,7 @@ TEST_F(MindDataImageProcess, TestDivideUInt32) {
|
|||
static_cast<UINT32_C1 *>(expect_uint32.data_ptr_)[i] = 1000000000;
|
||||
}
|
||||
LiteMat dst_uint32;
|
||||
EXPECT_TRUE(Divide(src1_uint32, src2_uint32, dst_uint32));
|
||||
EXPECT_TRUE(Divide(src1_uint32, src2_uint32, &dst_uint32));
|
||||
for (size_t i = 0; i < cols; i++) {
|
||||
EXPECT_EQ(static_cast<UINT32_C1 *>(expect_uint32.data_ptr_)[i].c1,
|
||||
static_cast<UINT32_C1 *>(dst_uint32.data_ptr_)[i].c1);
|
||||
|
@ -764,7 +764,7 @@ TEST_F(MindDataImageProcess, TestDivideInt32) {
|
|||
static_cast<INT32_C1 *>(expect_int32.data_ptr_)[i] = -1000000000;
|
||||
}
|
||||
LiteMat dst_int32;
|
||||
EXPECT_TRUE(Divide(src1_int32, src2_int32, dst_int32));
|
||||
EXPECT_TRUE(Divide(src1_int32, src2_int32, &dst_int32));
|
||||
for (size_t i = 0; i < cols; i++) {
|
||||
EXPECT_EQ(static_cast<INT32_C1 *>(expect_int32.data_ptr_)[i].c1,
|
||||
static_cast<INT32_C1 *>(dst_int32.data_ptr_)[i].c1);
|
||||
|
@ -783,7 +783,7 @@ TEST_F(MindDataImageProcess, TestDivideFloat) {
|
|||
static_cast<FLOAT32_C1 *>(expect_float.data_ptr_)[i] = -6.17f;
|
||||
}
|
||||
LiteMat dst_float;
|
||||
EXPECT_TRUE(Divide(src1_float, src2_float, dst_float));
|
||||
EXPECT_TRUE(Divide(src1_float, src2_float, &dst_float));
|
||||
for (size_t i = 0; i < cols; i++) {
|
||||
EXPECT_FLOAT_EQ(static_cast<FLOAT32_C1 *>(expect_float.data_ptr_)[i].c1,
|
||||
static_cast<FLOAT32_C1 *>(dst_float.data_ptr_)[i].c1);
|
||||
|
|
Loading…
Reference in New Issue