forked from mindspore-Ecosystem/mindspore
!6473 optimize cpu op reduce_sum
Merge pull request !6473 from 陶云浩/master
This commit is contained in:
commit
c52a076d6a
|
@ -14,8 +14,8 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <float.h>
|
||||
#include "nnacl/fp32/reduce.h"
|
||||
#include <float.h>
|
||||
#include "nnacl/errorcode.h"
|
||||
#include "nnacl/common_func.h"
|
||||
|
||||
|
@ -45,11 +45,27 @@ int ReduceSum(const int outer_size, const int inner_size, const int axis_size, c
|
|||
if (src_data == NULL || dst_data == NULL) {
|
||||
return NNACL_NULL_PTR;
|
||||
}
|
||||
int i, j, k;
|
||||
int i, j;
|
||||
#ifdef ENABLE_NEON
|
||||
int block_mod = inner_size % C4NUM;
|
||||
int block_c4 = inner_size - block_mod;
|
||||
#endif
|
||||
for (j = tid; j < outer_size; j += thread_num) {
|
||||
const float *outer_src = src_data + j * axis_size * inner_size;
|
||||
float *outer_dst = dst_data + j * inner_size;
|
||||
for (k = 0; k < inner_size; k++) {
|
||||
int k = 0;
|
||||
#ifdef ENABLE_NEON
|
||||
for (; k < block_c4; k += C4NUM) {
|
||||
const float *inner_src = outer_src + k;
|
||||
float *inner_dst = outer_dst + k;
|
||||
float32x4_t tmp = {0, 0, 0, 0};
|
||||
for (i = 0; i < axis_size; i++) {
|
||||
tmp = vaddq_f32(tmp, vld1q_f32(inner_src + i * inner_size));
|
||||
}
|
||||
vst1q_f32(inner_dst, tmp);
|
||||
}
|
||||
#endif
|
||||
for (; k < inner_size; k++) {
|
||||
const float *inner_src = outer_src + k;
|
||||
float *inner_dst = outer_dst + k;
|
||||
float tmp = 0.0f;
|
||||
|
|
Loading…
Reference in New Issue