!6473 optimize cpu op reduce_sum

Merge pull request !6473 from 陶云浩/master
This commit is contained in:
mindspore-ci-bot 2020-09-23 09:35:11 +08:00 committed by Gitee
commit c52a076d6a
1 changed file with 19 additions and 3 deletions

View File

@ -14,8 +14,8 @@
* limitations under the License.
*/
#include <float.h>
#include "nnacl/fp32/reduce.h"
#include <float.h>
#include "nnacl/errorcode.h"
#include "nnacl/common_func.h"
@ -45,11 +45,27 @@ int ReduceSum(const int outer_size, const int inner_size, const int axis_size, c
if (src_data == NULL || dst_data == NULL) {
return NNACL_NULL_PTR;
}
int i, j, k;
int i, j;
#ifdef ENABLE_NEON
int block_mod = inner_size % C4NUM;
int block_c4 = inner_size - block_mod;
#endif
for (j = tid; j < outer_size; j += thread_num) {
const float *outer_src = src_data + j * axis_size * inner_size;
float *outer_dst = dst_data + j * inner_size;
for (k = 0; k < inner_size; k++) {
int k = 0;
#ifdef ENABLE_NEON
for (; k < block_c4; k += C4NUM) {
const float *inner_src = outer_src + k;
float *inner_dst = outer_dst + k;
float32x4_t tmp = {0, 0, 0, 0};
for (i = 0; i < axis_size; i++) {
tmp = vaddq_f32(tmp, vld1q_f32(inner_src + i * inner_size));
}
vst1q_f32(inner_dst, tmp);
}
#endif
for (; k < inner_size; k++) {
const float *inner_src = outer_src + k;
float *inner_dst = outer_dst + k;
float tmp = 0.0f;