diff --git a/mindspore/lite/nnacl/assembly/arm64/matrix_add.S b/mindspore/lite/nnacl/assembly/arm64/matrix_add.S deleted file mode 100644 index d3611903611..00000000000 --- a/mindspore/lite/nnacl/assembly/arm64/matrix_add.S +++ /dev/null @@ -1,103 +0,0 @@ - -#ifdef __aarch64__ - .text - .align 5 - //.p2align 5,,15 - .global MatrixAdd -#ifndef __APPLE__ - .type MatrixAdd, %function -#endif - - - -//void MatrixAdd(const float* matDataA, const float* matDataB, float* matDataC, -// size_t aStride, size_t bStride, size_t cStride, size_t width, size_t height) - -//Auto: x0: matDataA, x1:matDataB, x2:matDatac, -//x3:aStride, x4:bStride, x5:cStride, x6:width, x7:height - -MatrixAdd: -mov x12, #4 //sizeof(float) -mul x3, x12, x3 -mul x4, x12, x4 -mul x5, x12, x5 - -loopH: -mov x8, x0 -mov x9, x1 -mov x10, x2 - -mov x11, x6 - -loop16LineIn: -cmp x11, #4 -blt L8 -sub x11, x11, #4 -ld1 {v0.4s, v1.4s}, [x0], #32 -ld1 {v2.4s, v3.4s}, [x1], #32 - -fadd v4.4s, v0.4s, v2.4s -fadd v5.4s, v1.4s, v3.4s - -ld1 {v6.4s, v7.4s}, [x0], #32 -ld1 {v8.4s, v9.4s}, [x1], #32 - -cmp x11, #4 -blt loop16LineOut - -loop16: -st1 {v4.4s, v5.4s}, [x2], #32 -fadd v10.4s, v6.4s, v8.4s -fadd v11.4s, v7.4s, v9.4s -ld1 {v0.4s, v1.4s}, [x0], #32 -ld1 {v2.4s, v3.4s}, [x1], #32 - -st1 {v10.4s, v11.4s}, [x2], #32 -fadd v4.4s, v0.4s, v2.4s -fadd v5.4s, v1.4s, v3.4s -ld1 {v6.4s, v7.4s}, [x0], #32 -ld1 {v8.4s, v9.4s}, [x1], #32 - -sub x11, x11, #4 -cmp x11, #4 -bge loop16 - -loop16LineOut: -st1 {v4.4s, v5.4s}, [x2], #32 -fadd v10.4s, v6.4s, v8.4s -fadd v11.4s, v7.4s, v9.4s -st1 {v10.4s, v11.4s}, [x2], #32 - - -L8: -cmp x11, #2 -blt L4 -ld1 {v0.4s, v1.4s}, [x0], #32 -ld1 {v2.4s, v3.4s}, [x1], #32 -fadd v4.4s, v0.4s, v2.4s -fadd v5.4s, v1.4s, v3.4s -sub x11, x11, #2 -st1 {v4.4s, v5.4s}, [x2], #32 - - -cmp x11, #0 -beq loop16EndLine - -L4: -ld1 {v0.4s}, [x0], #16 -ld1 {v1.4s}, [x1], #16 -fadd v0.4s, v0.4s, v1.4s -sub x11, x11, #1 -st1 {v0.4s}, [x2], #16 -//bne L4 - -loop16EndLine: -add x0, x8, x3 -add x1, x9, x4 -add x2, x10, x5 - -subs x7, x7, #1 -bne loopH - -ret -#endif diff --git a/mindspore/lite/nnacl/assembly/arm64/matrix_sub.S b/mindspore/lite/nnacl/assembly/arm64/matrix_sub.S deleted file mode 100644 index 7ac5f56a391..00000000000 --- a/mindspore/lite/nnacl/assembly/arm64/matrix_sub.S +++ /dev/null @@ -1,105 +0,0 @@ - -#ifdef __aarch64__ - .text - .align 5 - //.p2align 5,,15 - .global MatrixSub -#ifndef __APPLE__ - .type MatrixSub, %function -#endif - - - -//void MatrixSub(const float* matDataA, const float* matDataB, float* matDataC, -// size_t aStride, size_t bStride, size_t cStride, size_t width, size_t height) - -//Auto: x0: matDataA, x1:matDataB, x2:matDatac, -//x3:aStride, x4:bStride, x5:cStride, x6:width, x7:height - -MatrixSub: -mov x12, #4 //sizeof(float) -mul x3, x12, x3 -mul x4, x12, x4 -mul x5, x12, x5 - -loopH: -mov x8, x0 -mov x9, x1 -mov x10, x2 - -mov x11, x6 - -loop16LineIn: -cmp x11, #4 -blt L8 -sub x11, x11, #4 -ld1 {v0.4s, v1.4s}, [x0], #32 -ld1 {v2.4s, v3.4s}, [x1], #32 - -fsub v4.4s, v0.4s, v2.4s -fsub v5.4s, v1.4s, v3.4s - -ld1 {v6.4s, v7.4s}, [x0], #32 -ld1 {v8.4s, v9.4s}, [x1], #32 - -cmp x11, #4 -blt loop16LineOut - -loop16: -st1 {v4.4s, v5.4s}, [x2], #32 -fsub v10.4s, v6.4s, v8.4s -fsub v11.4s, v7.4s, v9.4s -ld1 {v0.4s, v1.4s}, [x0], #32 -ld1 {v2.4s, v3.4s}, [x1], #32 - -st1 {v10.4s, v11.4s}, [x2], #32 -fsub v4.4s, v0.4s, v2.4s -fsub v5.4s, v1.4s, v3.4s -ld1 {v6.4s, v7.4s}, [x0], #32 -ld1 {v8.4s, v9.4s}, [x1], #32 - -sub x11, x11, #4 -cmp x11, #4 -bge loop16 - -loop16LineOut: -st1 {v4.4s, v5.4s}, [x2], #32 -fsub v10.4s, v6.4s, v8.4s -fsub v11.4s, v7.4s, v9.4s -st1 {v10.4s, v11.4s}, [x2], #32 - -L8: -cmp x11, #2 -blt L4 - -ld1 {v0.4s, v1.4s}, [x0], #32 -ld1 {v2.4s, v3.4s}, [x1], #32 - -fsub v4.4s, v0.4s, v2.4s -fsub v5.4s, v1.4s, v3.4s - -sub x11, x11, #2 -st1 {v4.4s, v5.4s}, [x2], #32 - - -cmp x11, #0 -beq loop16EndLine - -L4: -ld1 {v0.4s}, [x0], #16 -ld1 {v1.4s}, [x1], #16 -fsub v0.4s, v0.4s, v1.4s -sub x11, x11, #1 -st1 {v0.4s}, [x2], #16 - - -loop16EndLine: -add x0, x8, x3 -add x1, x9, x4 -add x2, x10, x5 - -subs x7, x7, #1 -bne loopH - -ret -#endif diff --git a/mindspore/lite/nnacl/fp16/pooling_fp16.c b/mindspore/lite/nnacl/fp16/pooling_fp16.c index 7eff5509242..5dad6783f00 100644 --- a/mindspore/lite/nnacl/fp16/pooling_fp16.c +++ b/mindspore/lite/nnacl/fp16/pooling_fp16.c @@ -51,6 +51,12 @@ void AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa int in_w_index = out_w_index * stride_w - pad_w; int in_h_index = out_h_index * stride_h - pad_h; int out_plane_offset = out_batch_offset + index * channel; + + int real_win_h_start = MSMAX(0, -in_h_index); + int real_win_h_end = MSMIN(win_h, in_h - in_h_index); + int resl_win_w_start = MSMAX(0, -in_w_index); + int real_win_w_end = MSMIN(win_w, in_w - in_w_index); + for (int j = 0; j < c8; j++) { int in_channel_offset = in_batch_offset + j * C8NUM; int out_channel_offset = out_plane_offset + j * C8NUM; @@ -60,22 +66,17 @@ void AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa float16_t tmp_avg[8]{0}; #endif int real_count = 0; - for (int h = 0; h < win_h; h++) { - for (int w = 0; w < win_w; w++) { - if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 || - (in_w_index + w) >= in_w) { - continue; - } else { - int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel; + for (int h = real_win_h_start; h < real_win_h_end; h++) { + for (int w = resl_win_w_start; w < real_win_w_end; w++) { + int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel; #ifdef ENABLE_NEON - tmp_avg = vaddq_f16(tmp_avg, vld1q_f16(input_ptr + in_offset)); + tmp_avg = vaddq_f16(tmp_avg, vld1q_f16(input_ptr + in_offset)); #else - for (int t = 0; t < 8; t++) { - tmp_avg[t] += *(input_ptr + in_offset + t); - } -#endif - ++real_count; + for (int t = 0; t < 8; t++) { + tmp_avg[t] += *(input_ptr + in_offset + t); } +#endif + ++real_count; } // win_w loop } // win_h loop #ifdef ENABLE_NEON @@ -97,22 +98,17 @@ void AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa float16_t tmp_avg[4]{0}; #endif int real_count = 0; - for (int h = 0; h < win_h; h++) { - for (int w = 0; w < win_w; w++) { - if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 || - (in_w_index + w) >= in_w) { - continue; - } else { - int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel; + for (int h = real_win_h_start; h < real_win_h_end; h++) { + for (int w = resl_win_w_start; w < real_win_w_end; w++) { + int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel; #ifdef ENABLE_NEON - tmp_avg = vadd_f16(tmp_avg, vld1_f16(input_ptr + in_offset)); + tmp_avg = vadd_f16(tmp_avg, vld1_f16(input_ptr + in_offset)); #else - for (int j = 0; j < C4NUM; ++j) { - tmp_avg[j] += *(input_ptr + in_offset); - } -#endif - ++real_count; + for (int j = 0; j < C4NUM; ++j) { + tmp_avg[j] += *(input_ptr + in_offset); } +#endif + ++real_count; } // win_w loop } // win_h loop #ifdef ENABLE_NEON @@ -130,16 +126,11 @@ void AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa int out_channel_offset = out_plane_offset + k; float16_t tmp_avg = 0; int real_count = 0; - for (int h = 0; h < win_h; h++) { - for (int w = 0; w < win_w; w++) { - if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 || - (in_w_index + w) >= in_w) { - continue; - } else { - int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel; - tmp_avg += *(input_ptr + in_offset); - ++real_count; - } + for (int h = real_win_h_start; h < real_win_h_end; h++) { + for (int w = resl_win_w_start; w < real_win_w_end; w++) { + int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel; + tmp_avg += *(input_ptr + in_offset); + ++real_count; } // win_w loop } // win_h loop *(output_ptr + out_channel_offset) = tmp_avg / (float16_t)real_count; @@ -148,7 +139,6 @@ void AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa } // out_plane loop } // out_batch loop } - void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingParameter *pooling_param, int task_id) { int stride_w = pooling_param->stride_w_; int stride_h = pooling_param->stride_h_; @@ -183,6 +173,12 @@ void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa int in_w_index = out_w_index * stride_w - pad_w; int in_h_index = out_h_index * stride_h - pad_h; int out_plane_offset = out_batch_offset + index * channel; + + int real_win_h_start = MSMAX(0, -in_h_index); + int real_win_h_end = MSMIN(win_h, in_h - in_h_index); + int resl_win_w_start = MSMAX(0, -in_w_index); + int real_win_w_end = MSMIN(win_w, in_w - in_w_index); + for (int j = 0; j < c8; j++) { int in_channel_offset = in_batch_offset + j * C8NUM; int out_channel_offset = out_plane_offset + j * C8NUM; @@ -191,21 +187,16 @@ void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa #else float16_t tmp_max[8]{-FLT_MAX}; #endif - for (int h = 0; h < win_h; h++) { - for (int w = 0; w < win_w; w++) { - if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 || - (in_w_index + w) >= in_w) { - continue; - } else { - int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel; + for (int h = real_win_h_start; h < real_win_h_end; h++) { + for (int w = resl_win_w_start; w < real_win_w_end; w++) { + int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel; #ifdef ENABLE_NEON - tmp_max = vmaxq_f16(tmp_max, vld1q_f16(input_ptr + in_offset)); + tmp_max = vmaxq_f16(tmp_max, vld1q_f16(input_ptr + in_offset)); #else - for (int k = 0; k < C8NUM; k++) { - tmp_max[k] = fmax(tmp_max[k], *(input_ptr + in_offset + k)); - } -#endif + for (int k = 0; k < C8NUM; k++) { + tmp_max[k] = fmax(tmp_max[k], *(input_ptr + in_offset + k)); } +#endif } // win_w loop } // win_h loop #ifdef ENABLE_NEON @@ -226,21 +217,16 @@ void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa #else float16_t tmp_max[4]{-FLT_MAX}; #endif - for (int h = 0; h < win_h; h++) { - for (int w = 0; w < win_w; w++) { - if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 || - (in_w_index + w) >= in_w) { - continue; - } else { - int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel; + for (int h = real_win_h_start; h < real_win_h_end; h++) { + for (int w = resl_win_w_start; w < real_win_w_end; w++) { + int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel; #ifdef ENABLE_NEON - tmp_max = vmax_f16(tmp_max, vld1_f16(input_ptr + in_offset)); + tmp_max = vmax_f16(tmp_max, vld1_f16(input_ptr + in_offset)); #else - for (int k = 0; k < C4NUM; k++) { - tmp_max[k] = fmax(tmp_max[k], *(input_ptr + in_offset + k)); - } -#endif + for (int k = 0; k < C4NUM; k++) { + tmp_max[k] = fmax(tmp_max[k], *(input_ptr + in_offset + k)); } +#endif } // win_w loop } // win_h loop #ifdef ENABLE_NEON @@ -257,15 +243,10 @@ void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa int in_channel_offset = in_batch_offset + k; int out_channel_offset = out_plane_offset + k; float16_t tmp_max = -FLT_MAX; - for (int h = 0; h < win_h; h++) { - for (int w = 0; w < win_w; w++) { - if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 || - (in_w_index + w) >= in_w) { - continue; - } else { - int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel; - tmp_max = fmax(tmp_max, *(input_ptr + in_offset)); - } + for (int h = real_win_h_start; h < real_win_h_end; h++) { + for (int w = resl_win_w_start; w < real_win_w_end; w++) { + int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel; + tmp_max = fmax(tmp_max, *(input_ptr + in_offset)); } // win_w loop } // win_h loop *(output_ptr + out_channel_offset) = tmp_max; diff --git a/mindspore/lite/nnacl/fp32/common_func.c b/mindspore/lite/nnacl/fp32/common_func.c index 4c130ba9956..188f39b804b 100644 --- a/mindspore/lite/nnacl/fp32/common_func.c +++ b/mindspore/lite/nnacl/fp32/common_func.c @@ -15,54 +15,6 @@ */ #include "nnacl/fp32/common_func.h" - -#ifndef ENABLE_ARM64 -void MatrixAdd(const float *a_ptr, const float *b_ptr, float *dst, size_t a_stride, size_t b_stride, size_t c_stride, - size_t row, size_t col) { - for (int r = 0; r < row; r++) { - for (int c = 0; c < col; c++) { - int a_index = c * a_stride + r * C4NUM; - int b_index = c * b_stride + r * C4NUM; - int c_index = c * c_stride + r * C4NUM; - for (int i = 0; i < C4NUM; i++) { - dst[c_index + i] = a_ptr[a_index + i] + b_ptr[b_index + i]; - } - } - } - return; -} - -void MatrixSub(const float *a_ptr, const float *b_ptr, float *dst, size_t a_stride, size_t b_stride, size_t c_stride, - size_t row, size_t col) { - for (int r = 0; r < row; r++) { - for (int c = 0; c < col; c++) { - int a_index = c * a_stride + r * C4NUM; - int b_index = c * b_stride + r * C4NUM; - int c_index = c * c_stride + r * C4NUM; - for (int i = 0; i < C4NUM; i++) { - dst[c_index + i] = a_ptr[a_index + i] - b_ptr[b_index + i]; - } - } - } - return; -} -#endif - -void MatrixMultiAdd(float *c11, float *c12, float *c21, float *c22, float *x_ptr, size_t row, size_t col, - size_t c_stride, size_t x_stride) { - /* U2 = P1 + P6 */ - MatrixAdd(x_ptr, c12, c12, x_stride, c_stride, c_stride, row, col); - /* U3 = U2 + P7 */ - MatrixAdd(c12, c21, c21, c_stride, c_stride, c_stride, row, col); - /* U4 = U2 + P5 */ - MatrixAdd(c12, c22, c12, c_stride, c_stride, c_stride, row, col); - /* U7 = U3 + P5 */ - MatrixAdd(c21, c22, c22, c_stride, c_stride, c_stride, row, col); - /* U5 = U4 + P3 */ - MatrixAdd(c12, c11, c12, c_stride, c_stride, c_stride, row, col); - return; -} - void PostConvFuncComm(const float *src_ptr_, float *out_ptr, const float *bias_ptr, size_t output_channel, size_t plane_size, size_t stride, bool is_relu, bool is_relu6, int size) { for (int oc = 0; oc < output_channel; oc++) { diff --git a/mindspore/lite/nnacl/fp32/common_func.h b/mindspore/lite/nnacl/fp32/common_func.h index 0c3b72caabf..873768f5084 100644 --- a/mindspore/lite/nnacl/fp32/common_func.h +++ b/mindspore/lite/nnacl/fp32/common_func.h @@ -31,12 +31,6 @@ void PostConvFuncFp32C4(const float *c4_out_ptr, float *out_ptr, const float *bi size_t plane_size, size_t stride, bool is_relu, bool is_relu6); void PostConvFuncFp32C8(const float *c8_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel, size_t plane_size, size_t stride, bool is_relu, bool is_relu6); -void MatrixAdd(const float *a_ptr, const float *b_ptr, float *dst, size_t a_stride, size_t b_stride, size_t c_stride, - size_t row, size_t col); -void MatrixSub(const float *a_ptr, const float *b_ptr, float *dst, size_t a_stride, size_t b_stride, size_t c_stride, - size_t row, size_t col); -void MatrixMultiAdd(float *c11, float *c12, float *c21, float *c22, float *x_ptr, size_t row, size_t col, - size_t c_stride, size_t x_stride); float ShortToFloat32(uint16_t srcValue); uint16_t Float32ToShort(float srcValue); diff --git a/mindspore/lite/nnacl/fp32/pooling.c b/mindspore/lite/nnacl/fp32/pooling.c index 54bc5788157..5ce7f18797b 100644 --- a/mindspore/lite/nnacl/fp32/pooling.c +++ b/mindspore/lite/nnacl/fp32/pooling.c @@ -17,7 +17,8 @@ #include "nnacl/fp32/pooling.h" #include -void AvgPooling(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id) { +void AvgPooling(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id, float minf, + float maxf) { int stride_w = pooling_param->stride_w_; int stride_h = pooling_param->stride_h_; int pad_w = pooling_param->pad_l_; @@ -25,7 +26,7 @@ void AvgPooling(const float *input_ptr, float *output_ptr, PoolingParameter *poo int win_w = pooling_param->window_w_; int win_h = pooling_param->window_h_; int channel = pooling_param->input_channel_; - int c4 = UP_DIV(channel, C4NUM); + int c4 = UP_DIV(channel, C4NUM); /* oc && ic */ int in_w = pooling_param->input_w_; int in_h = pooling_param->input_h_; int output_w = pooling_param->output_w_; @@ -34,11 +35,15 @@ void AvgPooling(const float *input_ptr, float *output_ptr, PoolingParameter *poo int out_plane = output_w * output_h; int out_tile_count = UP_DIV(out_plane, TILE_NUM); int thread_num = pooling_param->thread_num_; - // input channel is equal to output channel + +#ifdef ENABLE_NEON + float32x4_t min_value = vdupq_n_f32(minf); + float32x4_t max_value = vdupq_n_f32(maxf); +#endif for (int batch = 0; batch < output_batch; batch++) { - int in_batch_offset = batch * in_h * in_w * channel; - int out_batch_offset = batch * output_h * output_w * channel; + const float *src_b_ptr = input_ptr + batch * in_h * in_w * channel; + float *dst_b_ptr = output_ptr + batch * output_h * output_w * channel; for (int thread_id = task_id; thread_id < out_tile_count; thread_id += thread_num) { int cal_start_index = thread_id * TILE_NUM; int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? TILE_NUM : (out_plane - cal_start_index); @@ -48,10 +53,18 @@ void AvgPooling(const float *input_ptr, float *output_ptr, PoolingParameter *poo int out_h_index = index / output_w; int in_w_index = out_w_index * stride_w - pad_w; int in_h_index = out_h_index * stride_h - pad_h; - int out_plane_offset = out_batch_offset + index * channel; - for (int j = 0; j < c4 - 1; j++) { - int in_channel_offset = in_batch_offset + j * C4NUM; - int out_channel_offset = out_plane_offset + j * C4NUM; + + const float *src_plane_ptr = src_b_ptr; + float *dst_plane_ptr = dst_b_ptr + index * channel; + + int real_win_h_start = MSMAX(0, -in_h_index); + int real_win_h_end = MSMIN(win_h, in_h - in_h_index); + int resl_win_w_start = MSMAX(0, -in_w_index); + int real_win_w_end = MSMIN(win_w, in_w - in_w_index); + + for (int ci = 0; ci < c4 - 1; ci++) { + const float *src_c_ptr = src_plane_ptr + ci * C4NUM; + float *dst_c_ptr = dst_plane_ptr + ci * C4NUM; #ifdef ENABLE_NEON float32x4_t tmp_avg = vdupq_n_f32(0); #else @@ -61,60 +74,69 @@ void AvgPooling(const float *input_ptr, float *output_ptr, PoolingParameter *poo float tmp_avg4 = 0; #endif int real_count = 0; - for (int h = 0; h < win_h; h++) { - for (int w = 0; w < win_w; w++) { - if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 || - (in_w_index + w) >= in_w) { - continue; - } else { - int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel; + for (int h = real_win_h_start; h < real_win_h_end; h++) { + for (int w = resl_win_w_start; w < real_win_w_end; w++) { + const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel; #ifdef ENABLE_NEON - tmp_avg = vaddq_f32(tmp_avg, vld1q_f32(input_ptr + in_offset)); + tmp_avg = vaddq_f32(tmp_avg, vld1q_f32(src_win_ptr)); #else - tmp_avg1 += *(input_ptr + in_offset); - tmp_avg2 += *(input_ptr + in_offset + 1); - tmp_avg3 += *(input_ptr + in_offset + 2); - tmp_avg4 += *(input_ptr + in_offset + 3); + tmp_avg1 += src_win_ptr[0]; + tmp_avg2 += src_win_ptr[1]; + tmp_avg3 += src_win_ptr[2]; + tmp_avg4 += src_win_ptr[3]; #endif - ++real_count; - } + ++real_count; } // win_w loop } // win_h loop #ifdef ENABLE_NEON - vst1q_f32(output_ptr + out_channel_offset, tmp_avg / vdupq_n_f32(real_count)); + tmp_avg = tmp_avg / vdupq_n_f32(real_count); + tmp_avg = vmaxq_f32(tmp_avg, min_value); + tmp_avg = vminq_f32(tmp_avg, max_value); + vst1q_f32(dst_c_ptr, tmp_avg); #else - *(output_ptr + out_channel_offset) = tmp_avg1 / (float)real_count; - *(output_ptr + out_channel_offset + 1) = tmp_avg2 / (float)real_count; - *(output_ptr + out_channel_offset + 2) = tmp_avg3 / (float)real_count; - *(output_ptr + out_channel_offset + 3) = tmp_avg4 / (float)real_count; + tmp_avg1 /= (float)real_count; + tmp_avg2 /= (float)real_count; + tmp_avg3 /= (float)real_count; + tmp_avg4 /= (float)real_count; + tmp_avg1 = fmax(tmp_avg1, minf); + tmp_avg2 = fmax(tmp_avg2, minf); + tmp_avg3 = fmax(tmp_avg3, minf); + tmp_avg4 = fmax(tmp_avg4, minf); + tmp_avg1 = fmin(tmp_avg1, maxf); + tmp_avg2 = fmin(tmp_avg2, maxf); + tmp_avg3 = fmin(tmp_avg3, maxf); + tmp_avg4 = fmin(tmp_avg4, maxf); + dst_c_ptr[0] = tmp_avg1; + dst_c_ptr[1] = tmp_avg2; + dst_c_ptr[2] = tmp_avg3; + dst_c_ptr[3] = tmp_avg4; #endif } // ic4-1 loop int channel_s = (c4 - 1) * C4NUM; - for (int k = channel_s; k < channel; k++) { - int in_channel_offset = in_batch_offset + k; - int out_channel_offset = out_plane_offset + k; + for (int ci = channel_s; ci < channel; ci++) { + const float *src_c_ptr = src_plane_ptr + ci; + float *dst_c_ptr = dst_plane_ptr + ci; float tmp_avg = 0; int real_count = 0; - for (int h = 0; h < win_h; h++) { - for (int w = 0; w < win_w; w++) { - if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 || - (in_w_index + w) >= in_w) { - continue; - } else { - int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel; - tmp_avg += *(input_ptr + in_offset); - ++real_count; - } + for (int h = real_win_h_start; h < real_win_h_end; h++) { + for (int w = resl_win_w_start; w < real_win_w_end; w++) { + const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel; + tmp_avg += src_win_ptr[0]; + ++real_count; } // win_w loop } // win_h loop - *(output_ptr + out_channel_offset) = tmp_avg / (float)real_count; + tmp_avg = tmp_avg / (float)real_count; + tmp_avg = fmax(tmp_avg, minf); + tmp_avg = fmin(tmp_avg, maxf); + dst_c_ptr[0] = tmp_avg; } // channel_res loop } // real_cal_num loop } // out_plane loop } // out_batch loop } -void MaxPooling(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id) { +void MaxPooling(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id, float minf, + float maxf) { int stride_w = pooling_param->stride_w_; int stride_h = pooling_param->stride_h_; int pad_w = pooling_param->pad_l_; @@ -132,207 +154,9 @@ void MaxPooling(const float *input_ptr, float *output_ptr, PoolingParameter *poo int thread_num = pooling_param->thread_num_; int c4 = UP_DIV(channel, C4NUM); /* oc && ic */ - for (int batch = 0; batch < output_batch; batch++) { - const float *src_b_ptr = input_ptr + batch * in_h * in_w * channel; - float *dst_b_ptr = output_ptr + batch * output_h * output_w * channel; - for (int thread_id = task_id; thread_id < out_tile_count; thread_id += thread_num) { - int cal_start_index = thread_id * TILE_NUM; - int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? TILE_NUM : (out_plane - cal_start_index); - for (int i = 0; i < real_cal_num; i++) { - int index = cal_start_index + i; - int out_w_index = index % output_w; - int out_h_index = index / output_w; - int in_w_index = out_w_index * stride_w - pad_w; - int in_h_index = out_h_index * stride_h - pad_h; - - const float *src_plane_ptr = src_b_ptr; - float *dst_plane_ptr = dst_b_ptr + index * channel; - - int real_win_h_start = MSMAX(0, -in_h_index); - int real_win_h_end = MSMIN(win_h, in_h - in_h_index); - int resl_win_w_start = MSMAX(0, -in_w_index); - int real_win_w_end = MSMIN(win_w, in_w - in_w_index); - - for (int ci = 0; ci < c4 - 1; ci++) { - const float *src_c_ptr = src_plane_ptr + ci * C4NUM; - float *dst_c_ptr = dst_plane_ptr + ci * C4NUM; #ifdef ENABLE_NEON - float32x4_t tmp_max = vdupq_n_f32(-FLT_MAX); -#else - float tmp_max1 = -FLT_MAX; - float tmp_max2 = -FLT_MAX; - float tmp_max3 = -FLT_MAX; - float tmp_max4 = -FLT_MAX; -#endif - - for (int kh = real_win_h_start; kh < real_win_h_end; kh++) { - for (int kw = resl_win_w_start; kw < real_win_w_end; kw++) { - const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel; -#ifdef ENABLE_NEON - tmp_max = vmaxq_f32(tmp_max, vld1q_f32(src_win_ptr)); -#else - tmp_max1 = fmax(tmp_max1, src_win_ptr[0]); - tmp_max2 = fmax(tmp_max2, src_win_ptr[1]); - tmp_max3 = fmax(tmp_max3, src_win_ptr[2]); - tmp_max4 = fmax(tmp_max4, src_win_ptr[3]); - -#endif - } // win_w loop - } // win_h loop -#ifdef ENABLE_NEON - vst1q_f32(dst_c_ptr, tmp_max); -#else - dst_c_ptr[0] = tmp_max1; - dst_c_ptr[1] = tmp_max2; - dst_c_ptr[2] = tmp_max3; - dst_c_ptr[3] = tmp_max4; -#endif - } // ic4-1 loop - int channel_s = (c4 - 1) * C4NUM; - for (int ci = channel_s; ci < channel; ci++) { - float *dst_c_ptr = dst_plane_ptr + ci; - const float *src_c_ptr = src_plane_ptr + ci; - float tmp_max = -FLT_MAX; - - for (int kh = real_win_h_start; kh < real_win_h_end; kh++) { - for (int kw = resl_win_w_start; kw < real_win_w_end; kw++) { - const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel; - tmp_max = fmax(tmp_max, src_win_ptr[0]); - } // win_w loop - } // win_h loop - dst_c_ptr[0] = tmp_max; - } // channel_res loop - } // real_cal_num loop - } // out_plane loop - } // out_batch loop -} - -void AvgPoolingRelu(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id) { - int stride_w = pooling_param->stride_w_; - int stride_h = pooling_param->stride_h_; - int pad_w = pooling_param->pad_l_; - int pad_h = pooling_param->pad_u_; - int win_w = pooling_param->window_w_; - int win_h = pooling_param->window_h_; - int channel = pooling_param->input_channel_; - int c4 = UP_DIV(channel, C4NUM); - int in_w = pooling_param->input_w_; - int in_h = pooling_param->input_h_; - int output_w = pooling_param->output_w_; - int output_h = pooling_param->output_h_; - int output_batch = pooling_param->output_batch_; - int out_plane = output_w * output_h; - int out_tile_count = UP_DIV(out_plane, TILE_NUM); - int thread_num = pooling_param->thread_num_; -#ifdef ENABLE_NEON - float32x4_t zeros = vdupq_n_f32(0); -#endif - - for (int batch = 0; batch < output_batch; batch++) { - int in_batch_offset = batch * in_h * in_w * channel; - int out_batch_offset = batch * output_h * output_w * channel; - for (int thread_id = task_id; thread_id < out_tile_count; thread_id += thread_num) { - int cal_start_index = thread_id * TILE_NUM; - int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? TILE_NUM : (out_plane - cal_start_index); - for (int i = 0; i < real_cal_num; i++) { - int index = cal_start_index + i; - int out_w_index = index % output_w; - int out_h_index = index / output_w; - int in_w_index = out_w_index * stride_w - pad_w; - int in_h_index = out_h_index * stride_h - pad_h; - int out_plane_offset = out_batch_offset + index * channel; - for (int j = 0; j < c4 - 1; j++) { - int in_channel_offset = in_batch_offset + j * C4NUM; - int out_channel_offset = out_plane_offset + j * C4NUM; -#ifdef ENABLE_NEON - float32x4_t tmp_avg = vdupq_n_f32(0); -#else - float tmp_avg1 = 0; - float tmp_avg2 = 0; - float tmp_avg3 = 0; - float tmp_avg4 = 0; -#endif - int real_count = 0; - for (int h = 0; h < win_h; h++) { - for (int w = 0; w < win_w; w++) { - if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 || - (in_w_index + w) >= in_w) { - continue; - } else { - int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel; -#ifdef ENABLE_NEON - tmp_avg = vaddq_f32(tmp_avg, vld1q_f32(input_ptr + in_offset)); -#else - tmp_avg1 += *(input_ptr + in_offset); - tmp_avg2 += *(input_ptr + in_offset + 1); - tmp_avg3 += *(input_ptr + in_offset + 2); - tmp_avg4 += *(input_ptr + in_offset + 3); -#endif - ++real_count; - } - } // win_w loop - } // win_h loop -#ifdef ENABLE_NEON - tmp_avg = vmaxq_f32(tmp_avg, zeros); - vst1q_f32(output_ptr + out_channel_offset, tmp_avg / vdupq_n_f32(real_count)); -#else - tmp_avg1 = fmax(tmp_avg1, 0); - tmp_avg2 = fmax(tmp_avg2, 0); - tmp_avg3 = fmax(tmp_avg3, 0); - tmp_avg4 = fmax(tmp_avg4, 0); - - *(output_ptr + out_channel_offset) = tmp_avg1 / (float)real_count; - *(output_ptr + out_channel_offset + 1) = tmp_avg2 / (float)real_count; - *(output_ptr + out_channel_offset + 2) = tmp_avg3 / (float)real_count; - *(output_ptr + out_channel_offset + 3) = tmp_avg4 / (float)real_count; -#endif - } // ic4-1 loop - int channel_s = (c4 - 1) * C4NUM; - for (int k = channel_s; k < channel; k++) { - int in_channel_offset = in_batch_offset + k; - int out_channel_offset = out_plane_offset + k; - float tmp_avg = 0; - int real_count = 0; - for (int h = 0; h < win_h; h++) { - for (int w = 0; w < win_w; w++) { - if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 || - (in_w_index + w) >= in_w) { - continue; - } else { - int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel; - tmp_avg += *(input_ptr + in_offset); - ++real_count; - } - } // win_w loop - } // win_h loop - tmp_avg = fmax(tmp_avg, 0); - *(output_ptr + out_channel_offset) = tmp_avg / (float)real_count; - } // channel_res loop - } // real_cal_num loop - } // out_plane loop - } // out_batch loop -} - -void MaxPoolingRelu(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id) { - int stride_w = pooling_param->stride_w_; - int stride_h = pooling_param->stride_h_; - int pad_w = pooling_param->pad_l_; - int pad_h = pooling_param->pad_u_; - int win_w = pooling_param->window_w_; - int win_h = pooling_param->window_h_; - int channel = pooling_param->input_channel_; - int in_w = pooling_param->input_w_; - int in_h = pooling_param->input_h_; - int output_w = pooling_param->output_w_; - int output_h = pooling_param->output_h_; - int output_batch = pooling_param->output_batch_; - int out_plane = output_w * output_h; - int out_tile_count = UP_DIV(out_plane, TILE_NUM); - int thread_num = pooling_param->thread_num_; - int c4 = UP_DIV(channel, C4NUM); - -#ifdef ENABLE_NEON - float32x4_t zeros = vdupq_n_f32(0); + float32x4_t min_value = vdupq_n_f32(minf); + float32x4_t max_value = vdupq_n_f32(maxf); #endif for (int batch = 0; batch < output_batch; batch++) { @@ -378,251 +202,22 @@ void MaxPoolingRelu(const float *input_ptr, float *output_ptr, PoolingParameter tmp_max2 = fmax(tmp_max2, src_win_ptr[1]); tmp_max3 = fmax(tmp_max3, src_win_ptr[2]); tmp_max4 = fmax(tmp_max4, src_win_ptr[3]); - #endif } // win_w loop } // win_h loop #ifdef ENABLE_NEON - tmp_max = vmaxq_f32(tmp_max, zeros); + tmp_max = vmaxq_f32(tmp_max, min_value); + tmp_max = vminq_f32(tmp_max, max_value); vst1q_f32(dst_c_ptr, tmp_max); #else - // relu: - tmp_max1 = fmax(tmp_max1, 0); - tmp_max2 = fmax(tmp_max2, 0); - tmp_max3 = fmax(tmp_max3, 0); - tmp_max4 = fmax(tmp_max4, 0); - - dst_c_ptr[0] = tmp_max1; - dst_c_ptr[1] = tmp_max2; - dst_c_ptr[2] = tmp_max3; - dst_c_ptr[3] = tmp_max4; -#endif - } // ic4-1 loop - int channel_s = (c4 - 1) * C4NUM; - for (int ci = channel_s; ci < channel; ci++) { - float *dst_c_ptr = dst_plane_ptr + ci; - const float *src_c_ptr = src_plane_ptr + ci; - float tmp_max = -FLT_MAX; - - for (int kh = real_win_h_start; kh < real_win_h_end; kh++) { - for (int kw = resl_win_w_start; kw < real_win_w_end; kw++) { - const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel; - tmp_max = fmax(tmp_max, src_win_ptr[0]); - } // win_w loop - } // win_h loop - dst_c_ptr[0] = tmp_max; - } // channel_res loop - } // real_cal_num loop - } // out_plane loop - } // out_batch loop -} - -void AvgPoolingRelu6(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id) { - int stride_w = pooling_param->stride_w_; - int stride_h = pooling_param->stride_h_; - int pad_w = pooling_param->pad_l_; - int pad_h = pooling_param->pad_u_; - int win_w = pooling_param->window_w_; - int win_h = pooling_param->window_h_; - int channel = pooling_param->input_channel_; - int c4 = UP_DIV(channel, C4NUM); - int in_w = pooling_param->input_w_; - int in_h = pooling_param->input_h_; - int output_w = pooling_param->output_w_; - int output_h = pooling_param->output_h_; - int output_batch = pooling_param->output_batch_; - int out_plane = output_w * output_h; - int out_tile_count = UP_DIV(out_plane, TILE_NUM); - int thread_num = pooling_param->thread_num_; - // input channel is equal to output channel - -#ifdef ENABLE_NEON - float32x4_t zeros = vdupq_n_f32(0); - float32x4_t bounds = vdupq_n_f32(6); -#endif - - for (int batch = 0; batch < output_batch; batch++) { - int in_batch_offset = batch * in_h * in_w * channel; - int out_batch_offset = batch * output_h * output_w * channel; - for (int thread_id = task_id; thread_id < out_tile_count; thread_id += thread_num) { - int cal_start_index = thread_id * TILE_NUM; - int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? TILE_NUM : (out_plane - cal_start_index); - for (int i = 0; i < real_cal_num; i++) { - int index = cal_start_index + i; - int out_w_index = index % output_w; - int out_h_index = index / output_w; - int in_w_index = out_w_index * stride_w - pad_w; - int in_h_index = out_h_index * stride_h - pad_h; - int out_plane_offset = out_batch_offset + index * channel; - for (int j = 0; j < c4 - 1; j++) { - int in_channel_offset = in_batch_offset + j * C4NUM; - int out_channel_offset = out_plane_offset + j * C4NUM; -#ifdef ENABLE_NEON - float32x4_t tmp_avg = vdupq_n_f32(0); -#else - float tmp_avg1 = 0; - float tmp_avg2 = 0; - float tmp_avg3 = 0; - float tmp_avg4 = 0; -#endif - int real_count = 0; - for (int h = 0; h < win_h; h++) { - for (int w = 0; w < win_w; w++) { - if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 || - (in_w_index + w) >= in_w) { - continue; - } else { - int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel; -#ifdef ENABLE_NEON - tmp_avg = vaddq_f32(tmp_avg, vld1q_f32(input_ptr + in_offset)); -#else - tmp_avg1 += *(input_ptr + in_offset); - tmp_avg2 += *(input_ptr + in_offset + 1); - tmp_avg3 += *(input_ptr + in_offset + 2); - tmp_avg4 += *(input_ptr + in_offset + 3); -#endif - ++real_count; - } - } // win_w loop - } // win_h loop -#ifdef ENABLE_NEON - tmp_avg = tmp_avg / vdupq_n_f32(real_count); - tmp_avg = vmaxq_f32(tmp_avg, zeros); - tmp_avg = vminq_f32(tmp_avg, bounds); - vst1q_f32(output_ptr + out_channel_offset, tmp_avg); -#else - tmp_avg1 /= (float)real_count; - tmp_avg2 /= (float)real_count; - tmp_avg3 /= (float)real_count; - tmp_avg4 /= (float)real_count; - tmp_avg1 = fmax(tmp_avg1, 0); - tmp_avg2 = fmax(tmp_avg2, 0); - tmp_avg3 = fmax(tmp_avg3, 0); - tmp_avg4 = fmax(tmp_avg4, 0); - tmp_avg1 = fmin(tmp_avg1, 6); - tmp_avg2 = fmin(tmp_avg2, 6); - tmp_avg3 = fmin(tmp_avg3, 6); - tmp_avg4 = fmin(tmp_avg4, 6); - - *(output_ptr + out_channel_offset) = tmp_avg1; - *(output_ptr + out_channel_offset + 1) = tmp_avg2; - *(output_ptr + out_channel_offset + 2) = tmp_avg3; - *(output_ptr + out_channel_offset + 3) = tmp_avg4; -#endif - } // ic4-1 loop - int channel_s = (c4 - 1) * C4NUM; - for (int k = channel_s; k < channel; k++) { - int in_channel_offset = in_batch_offset + k; - int out_channel_offset = out_plane_offset + k; - float tmp_avg = 0; - int real_count = 0; - for (int h = 0; h < win_h; h++) { - for (int w = 0; w < win_w; w++) { - if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 || - (in_w_index + w) >= in_w) { - continue; - } else { - int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel; - tmp_avg += *(input_ptr + in_offset); - ++real_count; - } - } // win_w loop - } // win_h loop - tmp_avg /= (float)real_count; - tmp_avg = fmax(tmp_avg, 0); - tmp_avg = fmin(tmp_avg, 6); - *(output_ptr + out_channel_offset) = tmp_avg; - } // channel_res loop - } // real_cal_num loop - } // out_plane loop - } // out_batch loop -} - -void MaxPoolingRelu6(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id) { - int stride_w = pooling_param->stride_w_; - int stride_h = pooling_param->stride_h_; - int pad_w = pooling_param->pad_l_; - int pad_h = pooling_param->pad_u_; - int win_w = pooling_param->window_w_; - int win_h = pooling_param->window_h_; - int channel = pooling_param->input_channel_; - int in_w = pooling_param->input_w_; - int in_h = pooling_param->input_h_; - int output_w = pooling_param->output_w_; - int output_h = pooling_param->output_h_; - int output_batch = pooling_param->output_batch_; - int out_plane = output_w * output_h; - int out_tile_count = UP_DIV(out_plane, TILE_NUM); - int thread_num = pooling_param->thread_num_; - int c4 = UP_DIV(channel, C4NUM); - -#ifdef ENABLE_NEON - float32x4_t zeros = vdupq_n_f32(0); - float32x4_t bounds = vdupq_n_f32(6); -#endif - - for (int batch = 0; batch < output_batch; batch++) { - const float *src_b_ptr = input_ptr + batch * in_h * in_w * channel; - float *dst_b_ptr = output_ptr + batch * output_h * output_w * channel; - for (int thread_id = task_id; thread_id < out_tile_count; thread_id += thread_num) { - int cal_start_index = thread_id * TILE_NUM; - int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? TILE_NUM : (out_plane - cal_start_index); - for (int i = 0; i < real_cal_num; i++) { - int index = cal_start_index + i; - int out_w_index = index % output_w; - int out_h_index = index / output_w; - int in_w_index = out_w_index * stride_w - pad_w; - int in_h_index = out_h_index * stride_h - pad_h; - - const float *src_plane_ptr = src_b_ptr; - float *dst_plane_ptr = dst_b_ptr + index * channel; - - int real_win_h_start = MSMAX(0, -in_h_index); - int real_win_h_end = MSMIN(win_h, in_h - in_h_index); - int resl_win_w_start = MSMAX(0, -in_w_index); - int real_win_w_end = MSMIN(win_w, in_w - in_w_index); - - for (int ci = 0; ci < c4 - 1; ci++) { - const float *src_c_ptr = src_plane_ptr + ci * C4NUM; - float *dst_c_ptr = dst_plane_ptr + ci * C4NUM; -#ifdef ENABLE_NEON - float32x4_t tmp_max = vdupq_n_f32(-FLT_MAX); -#else - float tmp_max1 = -FLT_MAX; - float tmp_max2 = -FLT_MAX; - float tmp_max3 = -FLT_MAX; - float tmp_max4 = -FLT_MAX; -#endif - - for (int kh = real_win_h_start; kh < real_win_h_end; kh++) { - for (int kw = resl_win_w_start; kw < real_win_w_end; kw++) { - const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel; -#ifdef ENABLE_NEON - tmp_max = vmaxq_f32(tmp_max, vld1q_f32(src_win_ptr)); -#else - tmp_max1 = fmax(tmp_max1, src_win_ptr[0]); - tmp_max2 = fmax(tmp_max2, src_win_ptr[1]); - tmp_max3 = fmax(tmp_max3, src_win_ptr[2]); - tmp_max4 = fmax(tmp_max4, src_win_ptr[3]); - -#endif - } // win_w loop - } // win_h loop -#ifdef ENABLE_NEON - tmp_max = vmaxq_f32(tmp_max, zeros); - tmp_max = vminq_f32(tmp_max, bounds); - vst1q_f32(dst_c_ptr, tmp_max); -#else - // relu: - tmp_max1 = fmax(tmp_max1, 0); - tmp_max2 = fmax(tmp_max2, 0); - tmp_max3 = fmax(tmp_max3, 0); - tmp_max4 = fmax(tmp_max4, 0); - tmp_max1 = fmin(tmp_max1, 6); - tmp_max2 = fmin(tmp_max2, 6); - tmp_max3 = fmin(tmp_max3, 6); - tmp_max4 = fmin(tmp_max4, 6); - + tmp_max1 = fmax(tmp_max1, minf); + tmp_max2 = fmax(tmp_max2, minf); + tmp_max3 = fmax(tmp_max3, minf); + tmp_max4 = fmax(tmp_max4, minf); + tmp_max1 = fmin(tmp_max1, maxf); + tmp_max2 = fmin(tmp_max2, maxf); + tmp_max3 = fmin(tmp_max3, maxf); + tmp_max4 = fmin(tmp_max4, maxf); dst_c_ptr[0] = tmp_max1; dst_c_ptr[1] = tmp_max2; dst_c_ptr[2] = tmp_max3; @@ -641,6 +236,8 @@ void MaxPoolingRelu6(const float *input_ptr, float *output_ptr, PoolingParameter tmp_max = fmax(tmp_max, src_win_ptr[0]); } // win_w loop } // win_h loop + tmp_max = fmax(tmp_max, minf); + tmp_max = fmin(tmp_max, maxf); dst_c_ptr[0] = tmp_max; } // channel_res loop } // real_cal_num loop diff --git a/mindspore/lite/nnacl/fp32/pooling.h b/mindspore/lite/nnacl/fp32/pooling.h index ae62f973905..302f0f7b85a 100644 --- a/mindspore/lite/nnacl/fp32/pooling.h +++ b/mindspore/lite/nnacl/fp32/pooling.h @@ -27,17 +27,10 @@ #ifdef __cplusplus extern "C" { #endif -void AvgPooling(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id); - -void MaxPooling(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id); - -void AvgPoolingRelu(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id); - -void MaxPoolingRelu(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id); - -void AvgPoolingRelu6(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id); - -void MaxPoolingRelu6(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id); +void AvgPooling(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id, float minf, + float maxf); +void MaxPooling(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id, float minf, + float maxf); #ifdef __cplusplus } #endif diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/pooling.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/pooling.cc index 2891ed0494d..a7c3b4377fc 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/pooling.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/pooling.cc @@ -15,6 +15,7 @@ */ #include "src/runtime/kernel/arm/fp32/pooling.h" +#include #include "nnacl/fp32/pooling.h" #include "src/kernel_registry.h" #include "src/runtime/runtime_api.h" @@ -52,28 +53,18 @@ int PoolingCPUKernel::ReSize() { int PoolingCPUKernel::RunImpl(int task_id) { auto input_ptr = reinterpret_cast(in_tensors_.at(kInputIndex)->Data()); auto output_ptr = reinterpret_cast(out_tensors_.at(kOutputIndex)->Data()); + float minf = -FLT_MAX; + float maxf = FLT_MAX; + if (pooling_param_->act_type_ == ActType_Relu) { + minf = 0.f; + } else if (pooling_param_->act_type_ == ActType_Relu6) { + minf = 0.f; + maxf = 6.f; + } if (pooling_param_->pool_mode_ == PoolMode_MaxPool) { - switch (pooling_param_->act_type_) { - case ActType_Relu: - MaxPoolingRelu(input_ptr, output_ptr, pooling_param_, task_id); - break; - case ActType_Relu6: - MaxPoolingRelu6(input_ptr, output_ptr, pooling_param_, task_id); - break; - default: - MaxPooling(input_ptr, output_ptr, pooling_param_, task_id); - } + MaxPooling(input_ptr, output_ptr, pooling_param_, task_id, minf, maxf); } else { - switch (pooling_param_->act_type_) { - case ActType_Relu: - AvgPoolingRelu(input_ptr, output_ptr, pooling_param_, task_id); - break; - case ActType_Relu6: - AvgPoolingRelu6(input_ptr, output_ptr, pooling_param_, task_id); - break; - default: - AvgPooling(input_ptr, output_ptr, pooling_param_, task_id); - } + AvgPooling(input_ptr, output_ptr, pooling_param_, task_id, minf, maxf); } return RET_OK; }