!31536 [MSLITE][CPU] AVX512/256/SSE/NENO Advanced packaging, and Arithmetic Op (total 14) Refactoring and optimization

Merge pull request !31536 from Greatpan/avx512_arithmetic_self
This commit is contained in:
i-robot 2022-03-21 02:55:57 +00:00 committed by Gitee
commit f301c9e40f
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
1 changed files with 268 additions and 36 deletions

View File

@ -1,5 +1,5 @@
/** /**
* Copyright 2020 Huawei Technologies Co., Ltd * Copyright 2020-2022 Huawei Technologies Co., Ltd
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
@ -19,20 +19,62 @@
#define ACCURACY_DATA 0.00000001 #define ACCURACY_DATA 0.00000001
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
#define SimdElementFloorModCoreCalc(block_size, block_num, in0, in1, out, size, i) \
for (int block_max_size = size - block_num + 1; i < block_max_size; i += block_num) { \
MS_FLOAT_32xN(block_num) in0_tmp = MS_LD_F32(block_size, in0 + i); \
MS_FLOAT_32xN(block_num) in1_tmp = MS_LD_F32(block_size, in1 + i); \
MS_FLOAT_32xN(block_num) floor_tmp = MS_FLOOR_F32(block_size, MS_DIV_F32(block_size, in0_tmp, in1_tmp)); \
MS_FLOAT_32xN(block_num) out_tmp = MS_SUB_F32(block_size, in0_tmp, MS_MUL_F32(block_size, floor_tmp, in1_tmp)); \
MS_ST_F32(block_size, out + i, out_tmp); \
}
int ElementFloorMod(const float *in0, const float *in1, float *out, int size) { int ElementFloorMod(const float *in0, const float *in1, float *out, int size) {
for (int i = 0; i < size; i++) { int i = 0;
MS_SIMD_RUN_X86_NO_SCALAR(SimdElementFloorModCoreCalc, in0, in1, out, size, i); // neon no floor instruction
for (; i < size; i++) {
out[i] = in0[i] - floorf(in0[i] / in1[i]) * in1[i]; out[i] = in0[i] - floorf(in0[i] / in1[i]) * in1[i];
} }
return NNACL_OK; return NNACL_OK;
} }
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
#define SimdElementOptFloorModCoreCalc1(block_size, block_num, in0, in1, out, size, i) \
do { \
MS_FLOAT_32xN(block_num) in0_tmp = MS_MOVN_F32(block_size, in0[0]); \
for (int block_max_size = size - block_num + 1; i < block_max_size; i += block_num) { \
MS_FLOAT_32xN(block_num) in1_tmp = MS_LD_F32(block_size, in1 + i); \
MS_FLOAT_32xN(block_num) floor_tmp = MS_FLOOR_F32(block_size, MS_DIV_F32(block_size, in0_tmp, in1_tmp)); \
MS_FLOAT_32xN(block_num) out_tmp = MS_SUB_F32(block_size, in0_tmp, MS_MUL_F32(block_size, floor_tmp, in1_tmp)); \
MS_ST_F32(block_size, out + i, out_tmp); \
} \
} while (0)
#define SimdElementOptFloorModCoreCalc2(block_size, block_num, in0, in1, out, size, i) \
do { \
MS_FLOAT_32xN(block_num) in1_tmp = MS_MOVN_F32(block_size, in1[0]); \
for (int block_max_size = size - block_num + 1; i < block_max_size; i += block_num) { \
MS_FLOAT_32xN(block_num) in0_tmp = MS_LD_F32(block_size, in0 + i); \
MS_FLOAT_32xN(block_num) floor_tmp = MS_FLOOR_F32(block_size, MS_DIV_F32(block_size, in0_tmp, in1_tmp)); \
MS_FLOAT_32xN(block_num) out_tmp = MS_SUB_F32(block_size, in0_tmp, MS_MUL_F32(block_size, floor_tmp, in1_tmp)); \
MS_ST_F32(block_size, out + i, out_tmp); \
} \
} while (0)
int ElementOptFloorMod(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { int ElementOptFloorMod(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
int i = 0; int i = 0;
if (param->in_elements_num0_ == 1) { if (param->in_elements_num0_ == 1) {
MS_SIMD_RUN_X86_NO_SCALAR(SimdElementOptFloorModCoreCalc1, in0, in1, out, size, i); // neon no floor instruction
for (; i < size; i++) { for (; i < size; i++) {
out[i] = in0[0] - floorf(in0[0] / in1[i]) * in1[i]; out[i] = in0[0] - floorf(in0[0] / in1[i]) * in1[i];
} }
} else { } else {
MS_SIMD_RUN_X86_NO_SCALAR(SimdElementOptFloorModCoreCalc2, in0, in1, out, size, i); // neon no floor instruction
for (; i < size; i++) { for (; i < size; i++) {
out[i] = in0[i] - floorf(in0[i] / in1[0]) * in1[0]; out[i] = in0[i] - floorf(in0[i] / in1[0]) * in1[0];
} }
@ -113,20 +155,59 @@ int ElementOptModInt(const int *in0, const int *in1, int *out, int size, const A
return NNACL_OK; return NNACL_OK;
} }
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
#define SimdElementFloorDivCoreCalc(block_size, block_num, in0, in1, out, size, i) \
for (int block_max_size = size - block_num + 1; i < block_max_size; i += block_num) { \
MS_FLOAT_32xN(block_num) in0_tmp = MS_LD_F32(block_size, in0 + i); \
MS_FLOAT_32xN(block_num) in1_tmp = MS_LD_F32(block_size, in1 + i); \
MS_FLOAT_32xN(block_num) floor_tmp = MS_FLOOR_F32(block_size, MS_DIV_F32(block_size, in0_tmp, in1_tmp)); \
MS_ST_F32(block_size, out + i, floor_tmp); \
}
int ElementFloorDiv(const float *in0, const float *in1, float *out, int size) { int ElementFloorDiv(const float *in0, const float *in1, float *out, int size) {
for (int i = 0; i < size; i++) { int i = 0;
MS_SIMD_RUN_X86_NO_SCALAR(SimdElementFloorDivCoreCalc, in0, in1, out, size, i); // neon no floor instruction
for (; i < size; i++) {
out[i] = floorf(in0[i] / in1[i]); out[i] = floorf(in0[i] / in1[i]);
} }
return NNACL_OK; return NNACL_OK;
} }
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
#define SimdElementOptFloorDivCoreCalc1(block_size, block_num, in0, in1, out, size, i) \
do { \
MS_FLOAT_32xN(block_num) in0_tmp = MS_MOVN_F32(block_size, in0[0]); \
for (int block_max_size = size - block_num + 1; i < block_max_size; i += block_num) { \
MS_FLOAT_32xN(block_num) in1_tmp = MS_LD_F32(block_size, in1 + i); \
MS_FLOAT_32xN(block_num) out_tmp = MS_FLOOR_F32(block_size, MS_DIV_F32(block_size, in0_tmp, in1_tmp)); \
MS_ST_F32(block_size, out + i, out_tmp); \
} \
} while (0)
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
#define SimdElementOptFloorDivCoreCalc2(block_size, block_num, in0, in1, out, size, i) \
do { \
MS_FLOAT_32xN(block_num) in1_tmp = MS_MOVN_F32(block_size, in1[0]); \
for (int block_max_size = size - block_num + 1; i < block_max_size; i += block_num) { \
MS_FLOAT_32xN(block_num) in0_tmp = MS_LD_F32(block_size, in0 + i); \
MS_FLOAT_32xN(block_num) out_tmp = MS_FLOOR_F32(block_size, MS_DIV_F32(block_size, in0_tmp, in1_tmp)); \
MS_ST_F32(block_size, out + i, out_tmp); \
} \
} while (0)
int ElementOptFloorDiv(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { int ElementOptFloorDiv(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
int i = 0; int i = 0;
if (param->in_elements_num0_ == 1) { if (param->in_elements_num0_ == 1) {
MS_SIMD_RUN_X86_NO_SCALAR(SimdElementOptFloorDivCoreCalc1, in0, in1, out, size, i); // neon no floor instruction
for (; i < size; i++) { for (; i < size; i++) {
out[i] = floorf(in0[0] / in1[i]); out[i] = floorf(in0[0] / in1[i]);
} }
} else { } else {
MS_SIMD_RUN_X86_NO_SCALAR(SimdElementOptFloorDivCoreCalc2, in0, in1, out, size, i); // neon no floor instruction
for (; i < size; i++) { for (; i < size; i++) {
out[i] = floorf(in0[i] / in1[0]); out[i] = floorf(in0[i] / in1[0]);
} }
@ -135,23 +216,61 @@ int ElementOptFloorDiv(const float *in0, const float *in1, float *out, int size,
return NNACL_OK; return NNACL_OK;
} }
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
#define SimdElementFloorDivIntCoreCalc(block_size, block_num, in0, in1, out, size, i) \
for (int block_max_size = size - block_num + 1; i < block_max_size; i += block_num) { \
MS_INT_32xN(block_num) in0_tmp = MS_LD_EPI32(block_size, in0 + i); \
MS_INT_32xN(block_num) in1_tmp = MS_LD_EPI32(block_size, in1 + i); \
MS_INT_32xN(block_num) out_tmp = MS_DIV_EPI32(block_size, in0_tmp, in1_tmp); \
MS_ST_EPI32(block_size, out + i, out_tmp); \
}
int ElementFloorDivInt(const int *in0, const int *in1, int *out, int size) { int ElementFloorDivInt(const int *in0, const int *in1, int *out, int size) {
for (int i = 0; i < size; i++) { int i = 0;
MS_SIMD_RUN_NO_SCALAR(SimdElementFloorDivIntCoreCalc, in0, in1, out, size, i);
for (; i < size; i++) {
NNACL_CHECK_ZERO_RETURN_ERR(in1[i]); NNACL_CHECK_ZERO_RETURN_ERR(in1[i]);
out[i] = in0[i] / in1[i]; out[i] = in0[i] / in1[i];
} }
return NNACL_OK; return NNACL_OK;
} }
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
#define SimdElementOptFloorDivIntCoreCalc1(block_size, block_num, in0, in1, out, size, i) \
do { \
MS_INT_32xN(block_num) in0_tmp = MS_MOVN_EPI32(block_size, in0[0]); \
for (int block_max_size = size - block_num + 1; i < block_max_size; i += block_num) { \
MS_INT_32xN(block_num) in1_tmp = MS_LD_EPI32(block_size, in1 + i); \
MS_INT_32xN(block_num) out_tmp = MS_DIV_EPI32(block_size, in0_tmp, in1_tmp); \
MS_ST_EPI32(block_size, out + i, out_tmp); \
} \
} while (0)
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
#define SimdElementOptFloorDivIntCoreCalc2(block_size, block_num, in0, in1, out, size, i) \
do { \
MS_INT_32xN(block_num) in1_tmp = MS_MOVN_EPI32(block_size, in1[0]); \
for (int block_max_size = size - block_num + 1; i < block_max_size; i += block_num) { \
MS_INT_32xN(block_num) in0_tmp = MS_LD_EPI32(block_size, in0 + i); \
MS_INT_32xN(block_num) out_tmp = MS_DIV_EPI32(block_size, in0_tmp, in1_tmp); \
MS_ST_EPI32(block_size, out + i, out_tmp); \
} \
} while (0)
int ElementOptFloorDivInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { int ElementOptFloorDivInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
int i = 0; int i = 0;
if (param->in_elements_num0_ == 1) { if (param->in_elements_num0_ == 1) {
MS_SIMD_RUN_NO_SCALAR(SimdElementOptFloorDivIntCoreCalc1, in0, in1, out, size, i);
for (; i < size; i++) { for (; i < size; i++) {
NNACL_CHECK_ZERO_RETURN_ERR(in1[i]); NNACL_CHECK_ZERO_RETURN_ERR(in1[i]);
out[i] = in0[0] / in1[i]; out[i] = in0[0] / in1[i];
} }
} else { } else {
NNACL_CHECK_ZERO_RETURN_ERR(in1[0]); NNACL_CHECK_ZERO_RETURN_ERR(in1[0]);
MS_SIMD_RUN_NO_SCALAR(SimdElementOptFloorDivIntCoreCalc2, in0, in1, out, size, i);
for (; i < size; i++) { for (; i < size; i++) {
out[i] = in0[i] / in1[0]; out[i] = in0[i] / in1[0];
} }
@ -300,30 +419,58 @@ int ElementOptLogicalOrBool(const bool *in0, const bool *in1, bool *out, int siz
return NNACL_OK; return NNACL_OK;
} }
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
#define SimdElementMaximumCoreCalc(block_size, block_num, in0, in1, out, size, index) \
for (int block_max_size = size - block_num + 1; index < block_max_size; index += block_num) { \
MS_FLOAT_32xN(block_num) in0_tmp = MS_LD_F32(block_size, in0 + index); \
MS_FLOAT_32xN(block_num) in1_tmp = MS_LD_F32(block_size, in1 + index); \
MS_FLOAT_32xN(block_num) out_tmp = MS_MAX_F32(block_size, in0_tmp, in1_tmp); \
MS_ST_F32(block_size, out + index, out_tmp); \
}
int ElementMaximum(const float *in0, const float *in1, float *out, int size) { int ElementMaximum(const float *in0, const float *in1, float *out, int size) {
int index = 0; int index = 0;
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) { MS_SIMD_RUN_NO_SCALAR(SimdElementMaximumCoreCalc, in0, in1, out, size, index);
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vmaxq_f32(vin0, vin1);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) { for (; index < size; index++) {
out[index] = in0[index] > in1[index] ? in0[index] : in1[index]; out[index] = in0[index] > in1[index] ? in0[index] : in1[index];
} }
return NNACL_OK; return NNACL_OK;
} }
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
#define SimdElementOptMaximumCoreCalc1(block_size, block_num, in0, in1, out, size, index) \
do { \
MS_FLOAT_32xN(block_num) in0_tmp = MS_MOVN_F32(block_size, in0[0]); \
for (int block_max_size = size - block_num + 1; index < block_max_size; index += block_num) { \
MS_FLOAT_32xN(block_num) in1_tmp = MS_LD_F32(block_size, in1 + index); \
MS_FLOAT_32xN(block_num) out_tmp = MS_MAX_F32(block_size, in0_tmp, in1_tmp); \
MS_ST_F32(block_size, out + index, out_tmp); \
} \
} while (0)
#define SimdElementOptMaximumCoreCalc2(block_size, block_num, in0, in1, out, size, index) \
do { \
MS_FLOAT_32xN(block_num) in1_tmp = MS_MOVN_F32(block_size, in1[0]); \
for (int block_max_size = size - block_num + 1; index < block_max_size; index += block_num) { \
MS_FLOAT_32xN(block_num) in0_tmp = MS_LD_F32(block_size, in0 + index); \
MS_FLOAT_32xN(block_num) out_tmp = MS_MAX_F32(block_size, in0_tmp, in1_tmp); \
MS_ST_F32(block_size, out + index, out_tmp); \
} \
} while (0)
int ElementOptMaximum(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { int ElementOptMaximum(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
int index = 0; int index = 0;
if (param->in_elements_num0_ == 1) { if (param->in_elements_num0_ == 1) {
MS_SIMD_RUN_NO_SCALAR(SimdElementOptMaximumCoreCalc1, in0, in1, out, size, index);
for (; index < size; index++) { for (; index < size; index++) {
out[index] = in0[0] > in1[index] ? in0[0] : in1[index]; out[index] = in0[0] > in1[index] ? in0[0] : in1[index];
} }
} else { } else {
MS_SIMD_RUN_NO_SCALAR(SimdElementOptMaximumCoreCalc2, in0, in1, out, size, index);
for (; index < size; index++) { for (; index < size; index++) {
out[index] = in0[index] > in1[0] ? in0[index] : in1[0]; out[index] = in0[index] > in1[0] ? in0[index] : in1[0];
} }
@ -332,29 +479,57 @@ int ElementOptMaximum(const float *in0, const float *in1, float *out, int size,
return NNACL_OK; return NNACL_OK;
} }
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
#define SimdElementMaximumIntCoreCalc(block_size, block_num, in0, in1, out, size, index) \
for (int block_max_size = size - block_num + 1; index < block_max_size; index += block_num) { \
MS_INT_32xN(block_num) in0_tmp = MS_LD_EPI32(block_size, in0 + index); \
MS_INT_32xN(block_num) in1_tmp = MS_LD_EPI32(block_size, in1 + index); \
MS_INT_32xN(block_num) out_tmp = MS_MAX_EPI32(block_size, in0_tmp, in1_tmp); \
MS_ST_EPI32(block_size, out + index, out_tmp); \
}
int ElementMaximumInt(const int *in0, const int *in1, int *out, int size) { int ElementMaximumInt(const int *in0, const int *in1, int *out, int size) {
int index = 0; int index = 0;
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) { MS_SIMD_RUN_NO_SCALAR(SimdElementMaximumIntCoreCalc, in0, in1, out, size, index);
int32x4_t vin0 = vld1q_s32(in0 + index);
int32x4_t vin1 = vld1q_s32(in1 + index);
int32x4_t vout = vmaxq_s32(vin0, vin1);
vst1q_s32(out + index, vout);
}
#endif
for (; index < size; index++) { for (; index < size; index++) {
out[index] = in0[index] > in1[index] ? in0[index] : in1[index]; out[index] = in0[index] > in1[index] ? in0[index] : in1[index];
} }
return NNACL_OK; return NNACL_OK;
} }
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
#define SimdElementOptMaximumIntCoreCalc1(block_size, block_num, in0, in1, out, size, index) \
do { \
MS_INT_32xN(block_num) in0_tmp = MS_MOVN_EPI32(block_size, in0[0]); \
for (int block_max_size = size - block_num + 1; index < block_max_size; index += block_num) { \
MS_INT_32xN(block_num) in1_tmp = MS_LD_EPI32(block_size, in1 + index); \
MS_INT_32xN(block_num) out_tmp = MS_MAX_EPI32(block_size, in0_tmp, in1_tmp); \
MS_ST_EPI32(block_size, out + index, out_tmp); \
} \
} while (0)
#define SimdElementOptMaximumIntCoreCalc2(block_size, block_num, in0, in1, out, size, index) \
do { \
MS_INT_32xN(block_num) in1_tmp = MS_MOVN_EPI32(block_size, in1[0]); \
for (int block_max_size = size - block_num + 1; index < block_max_size; index += block_num) { \
MS_INT_32xN(block_num) in0_tmp = MS_LD_EPI32(block_size, in0 + index); \
MS_INT_32xN(block_num) out_tmp = MS_MAX_EPI32(block_size, in0_tmp, in1_tmp); \
MS_ST_EPI32(block_size, out + index, out_tmp); \
} \
} while (0)
int ElementOptMaximumInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { int ElementOptMaximumInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
int index = 0; int index = 0;
if (param->in_elements_num0_ == 1) { if (param->in_elements_num0_ == 1) {
MS_SIMD_RUN_NO_SCALAR(SimdElementOptMaximumIntCoreCalc1, in0, in1, out, size, index);
for (; index < size; index++) { for (; index < size; index++) {
out[index] = in0[0] > in1[index] ? in0[0] : in1[index]; out[index] = in0[0] > in1[index] ? in0[0] : in1[index];
} }
} else { } else {
MS_SIMD_RUN_NO_SCALAR(SimdElementOptMaximumIntCoreCalc2, in0, in1, out, size, index);
for (; index < size; index++) { for (; index < size; index++) {
out[index] = in0[index] > in1[0] ? in0[index] : in1[0]; out[index] = in0[index] > in1[0] ? in0[index] : in1[0];
} }
@ -363,30 +538,58 @@ int ElementOptMaximumInt(const int *in0, const int *in1, int *out, int size, con
return NNACL_OK; return NNACL_OK;
} }
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
#define SimdElementMinimumIntCoreCalc(block_size, block_num, in0, in1, out, size, index) \
for (int block_max_size = size - block_num + 1; index < block_max_size; index += block_num) { \
MS_INT_32xN(block_num) in0_tmp = MS_LD_EPI32(block_size, in0 + index); \
MS_INT_32xN(block_num) in1_tmp = MS_LD_EPI32(block_size, in1 + index); \
MS_INT_32xN(block_num) out_tmp = MS_MIN_EPI32(block_size, in0_tmp, in1_tmp); \
MS_ST_EPI32(block_size, out + index, out_tmp); \
}
int ElementMinimumInt(const int *input0, const int *input1, int *output, int size) { int ElementMinimumInt(const int *input0, const int *input1, int *output, int size) {
int index = 0; int index = 0;
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) { MS_SIMD_RUN_NO_SCALAR(SimdElementMinimumIntCoreCalc, input0, input1, output, size, index);
int32x4_t vin0 = vld1q_s32(input0 + index);
int32x4_t vin1 = vld1q_s32(input1 + index);
int32x4_t vout = vminq_s32(vin0, vin1);
vst1q_s32(output + index, vout);
}
#endif
for (; index < size; index++) { for (; index < size; index++) {
output[index] = input0[index] > input1[index] ? input1[index] : input0[index]; output[index] = input0[index] > input1[index] ? input1[index] : input0[index];
} }
return NNACL_OK; return NNACL_OK;
} }
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
#define SimdElementOptMinimumIntCoreCalc1(block_size, block_num, in0, in1, out, size, index) \
do { \
MS_INT_32xN(block_num) in0_tmp = MS_MOVN_EPI32(block_size, in0[0]); \
for (int block_max_size = size - block_num + 1; index < block_max_size; index += block_num) { \
MS_INT_32xN(block_num) in1_tmp = MS_LD_EPI32(block_size, in1 + index); \
MS_INT_32xN(block_num) out_tmp = MS_MIN_EPI32(block_size, in0_tmp, in1_tmp); \
MS_ST_EPI32(block_size, out + index, out_tmp); \
} \
} while (0)
#define SimdElementOptMinimumIntCoreCalc2(block_size, block_num, in0, in1, out, size, index) \
do { \
MS_INT_32xN(block_num) in1_tmp = MS_MOVN_EPI32(block_size, in1[0]); \
for (int block_max_size = size - block_num + 1; index < block_max_size; index += block_num) { \
MS_INT_32xN(block_num) in0_tmp = MS_LD_EPI32(block_size, in0 + index); \
MS_INT_32xN(block_num) out_tmp = MS_MIN_EPI32(block_size, in0_tmp, in1_tmp); \
MS_ST_EPI32(block_size, out + index, out_tmp); \
} \
} while (0)
int ElementOptMinimumInt(const int *input0, const int *input1, int *output, int size, int ElementOptMinimumInt(const int *input0, const int *input1, int *output, int size,
const ArithmeticParameter *param) { const ArithmeticParameter *param) {
int index = 0; int index = 0;
if (param->in_elements_num0_ == 1) { if (param->in_elements_num0_ == 1) {
MS_SIMD_RUN_NO_SCALAR(SimdElementOptMinimumIntCoreCalc1, input0, input1, output, size, index);
for (; index < size; index++) { for (; index < size; index++) {
output[index] = input0[0] > input1[index] ? input1[index] : input0[0]; output[index] = input0[0] > input1[index] ? input1[index] : input0[0];
} }
} else { } else {
MS_SIMD_RUN_NO_SCALAR(SimdElementOptMinimumIntCoreCalc2, input0, input1, output, size, index);
for (; index < size; index++) { for (; index < size; index++) {
output[index] = input0[index] > input1[0] ? input1[0] : input0[index]; output[index] = input0[index] > input1[0] ? input1[0] : input0[index];
} }
@ -395,29 +598,58 @@ int ElementOptMinimumInt(const int *input0, const int *input1, int *output, int
return NNACL_OK; return NNACL_OK;
} }
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
#define SimdElementMinimumCoreCalc(block_size, block_num, in0, in1, out, size, index) \
for (int block_max_size = size - block_num + 1; index < block_max_size; index += block_num) { \
MS_FLOAT_32xN(block_num) in0_tmp = MS_LD_F32(block_size, in0 + index); \
MS_FLOAT_32xN(block_num) in1_tmp = MS_LD_F32(block_size, in1 + index); \
MS_FLOAT_32xN(block_num) out_tmp = MS_MIN_F32(block_size, in0_tmp, in1_tmp); \
MS_ST_F32(block_size, out + index, out_tmp); \
}
int ElementMinimum(const float *in0, const float *in1, float *out, int size) { int ElementMinimum(const float *in0, const float *in1, float *out, int size) {
int index = 0; int index = 0;
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) { MS_SIMD_RUN_NO_SCALAR(SimdElementMinimumCoreCalc, in0, in1, out, size, index);
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vminq_f32(vin0, vin1);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) { for (; index < size; index++) {
out[index] = in0[index] > in1[index] ? in1[index] : in0[index]; out[index] = in0[index] > in1[index] ? in1[index] : in0[index];
} }
return NNACL_OK; return NNACL_OK;
} }
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
#define SimdElementOptMinimumCoreCalc1(block_size, block_num, in0, in1, out, size, index) \
do { \
MS_FLOAT_32xN(block_num) in0_tmp = MS_MOVN_F32(block_size, in0[0]); \
for (int block_max_size = size - block_num + 1; index < block_max_size; index += block_num) { \
MS_FLOAT_32xN(block_num) in1_tmp = MS_LD_F32(block_size, in1 + index); \
MS_FLOAT_32xN(block_num) out_tmp = MS_MIN_F32(block_size, in0_tmp, in1_tmp); \
MS_ST_F32(block_size, out + index, out_tmp); \
} \
} while (0)
#define SimdElementOptMinimumCoreCalc2(block_size, block_num, in0, in1, out, size, index) \
do { \
MS_FLOAT_32xN(block_num) in1_tmp = MS_MOVN_F32(block_size, in1[0]); \
for (int block_max_size = size - block_num + 1; index < block_max_size; index += block_num) { \
MS_FLOAT_32xN(block_num) in0_tmp = MS_LD_F32(block_size, in0 + index); \
MS_FLOAT_32xN(block_num) out_tmp = MS_MIN_F32(block_size, in0_tmp, in1_tmp); \
MS_ST_F32(block_size, out + index, out_tmp); \
} \
} while (0)
int ElementOptMinimum(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { int ElementOptMinimum(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
int index = 0; int index = 0;
if (param->in_elements_num0_ == 1) { if (param->in_elements_num0_ == 1) {
MS_SIMD_RUN_NO_SCALAR(SimdElementOptMinimumCoreCalc1, in0, in1, out, size, index);
for (; index < size; index++) { for (; index < size; index++) {
out[index] = in0[0] > in1[index] ? in1[index] : in0[0]; out[index] = in0[0] > in1[index] ? in1[index] : in0[0];
} }
} else { } else {
MS_SIMD_RUN_NO_SCALAR(SimdElementOptMinimumCoreCalc2, in0, in1, out, size, index);
for (; index < size; index++) { for (; index < size; index++) {
out[index] = in0[index] > in1[0] ? in1[0] : in0[index]; out[index] = in0[index] > in1[0] ? in1[0] : in0[index];
} }