From 0baf9d53b6ef3845bbe9045bc4290e43c488b7eb Mon Sep 17 00:00:00 2001
From: greatpanc
Date: Wed, 15 Dec 2021 10:13:38 +0800
Subject: [PATCH] avgpooling optimization from nc4hw4 -> nhwc

Add AvgPoolingFromNC4HW4ToNHWC, which pools the complete SIMD tile groups and
the tail channels of an NC4HW4 input into NHWC output, and call it from the CPU
pooling kernel instead of logging an unsupported-format error.

Rename the MaxPoolingForm* helpers to MaxPoolingFrom* and split the tail-channel
max pooling work across threads by (channel, row, column).
---
 .../cpu/nnacl/fp32/pooling_fp32.c             | 333 ++++++++++++++++--
 .../cpu/nnacl/fp32/pooling_fp32.h             |   4 +-
 .../runtime/kernel/arm/fp32/pooling_fp32.cc   |   5 +-
 3 files changed, 315 insertions(+), 27 deletions(-)

diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/pooling_fp32.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/pooling_fp32.c
index b8388e077e2..9395070d8ac 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/pooling_fp32.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/pooling_fp32.c
@@ -140,6 +140,268 @@ int AvgPooling(const float *input_ptr, float *output_ptr, const PoolingParameter
   return NNACL_OK;
 }
 
+// Average-pools the tail channels of one batch, i.e. the channel % c_xtile channels that do not fill a
+// complete SIMD tile group, and stores the results in NHWC order.
+// The (tail channel, output row, output column) index space is divided across pooling_param->thread_num_
+// tasks and task_id selects the slice computed by this call. Every average is clamped to [minf, maxf].
+// Returns NNACL_OK on success and NNACL_ERR when a pooling window covers no input element (real_count == 0).
+int AvgPoolingFromNC4HW4ToNHWCLessC(const float *src_b_ptr, float *dst_b_ptr, const PoolingParameter *pooling_param,
+                                    int task_id, float minf, float maxf) {
+  int in_w = pooling_param->input_w_, in_h = pooling_param->input_h_;
+  int win_w = pooling_param->window_w_, win_h = pooling_param->window_h_;
+  int output_w = pooling_param->output_w_, output_h = pooling_param->output_h_;
+  int channel = pooling_param->input_channel_;
+
+  int out_plane = output_w * output_h;
+  int in_plane = in_w * in_h;
+  // out_plane and output_w are divisors below, so neither output dimension may be zero.
+  NNACL_CHECK_ZERO_RETURN_ERR(out_plane);
+
+#ifdef ENABLE_AVX
+  const int c_tile = C8NUM;
+  const int once_calc_num = 2;
+#elif defined(ENABLE_NEON) || defined(ENABLE_SSE)
+  const int c_tile = C4NUM;
+  const int once_calc_num = 1;
+#else
+  const int c_tile = 1;
+  const int once_calc_num = 1;
+#endif
+
+  const int c_xtile = once_calc_num * c_tile;
+
+  int cur_c = (channel / c_xtile) * c_xtile;
+  int last_c_size = channel - cur_c;
+
+  int less_out_plane = out_plane * last_c_size;
+  int calc_tile = UP_DIV(less_out_plane, pooling_param->thread_num_);
+
+  int index_begin = task_id * calc_tile;
+  int index_end = (index_begin + calc_tile) < less_out_plane ? (index_begin + calc_tile) : less_out_plane;
+
+  // Split the linear slice bounds into (channel, row, column); one output row holds output_w elements.
+  int c_start = (index_begin / out_plane) + cur_c;
+  int index_less = index_begin % out_plane;
+  int h_start = index_less / output_w;
+  int w_start = index_less % output_w;
+
+  int c_end = (index_end / out_plane) + cur_c;
+  index_less = index_end % out_plane;
+  int h_end = index_less / output_w;
+  int w_end = index_less % output_w;
+
+  int c = c_start;
+  int h = h_start;
+  int w = w_start;
+
+  int in_w_cx_line = in_w * last_c_size;
+  // The tail block begins after the cur_c tiled channels and holds last_c_size floats per pixel, so all
+  // tail channels share this base pointer; (c - cur_c) picks the channel inside a pixel.
+  const float *src_c_ptr = src_b_ptr + cur_c * in_plane;
+  for (; c < channel; c++) {
+    for (; h < output_h; h++) {
+      // NOTE(review): the end is derived from the clamped start, so a window that begins inside the padding
+      // is not shortened (same in the Batch variant); confirm this and the use of pad_d_ are intended.
+      int cur_index_in_h_start = MSMAX(h * pooling_param->stride_h_ - pooling_param->pad_d_, 0);
+      int cur_index_in_h_end = MSMIN(cur_index_in_h_start + win_h, in_h);
+
+      for (; w < output_w; w++) {
+        // Return as soon as the end of this task's slice is reached.
+        MS_CHECK_TRUE_RET((c < c_end || h < h_end || w < w_end), NNACL_OK);
+        float tmp_avg = 0.0;
+
+        int cur_index_in_w_start = MSMAX(w * pooling_param->stride_w_ - pooling_param->pad_l_, 0);
+        int cur_index_in_w_end = MSMIN(cur_index_in_w_start + win_w, in_w);
+
+        int real_count = (cur_index_in_w_end - cur_index_in_w_start) * (cur_index_in_h_end - cur_index_in_h_start);
+        MS_CHECK_TRUE_RET(real_count != 0, NNACL_ERR);
+
+        for (int cur_index_in_h = cur_index_in_h_start; cur_index_in_h < cur_index_in_h_end; cur_index_in_h++) {
+          const float *src_c_ptr_h_line = src_c_ptr + cur_index_in_h * in_w_cx_line;
+          for (int cur_index_in_w = cur_index_in_w_start; cur_index_in_w < cur_index_in_w_end; cur_index_in_w++) {
+            const float *cur_input_index = src_c_ptr_h_line + cur_index_in_w * last_c_size + (c - cur_c);
+            tmp_avg += cur_input_index[0];
+          }
+        }
+
+        float *dst_c_ptr = dst_b_ptr + h * output_w * channel + w * channel + c;
+        tmp_avg = tmp_avg / (float)real_count;
+        tmp_avg = fmaxf(tmp_avg, minf);
+        tmp_avg = fminf(tmp_avg, maxf);
+        dst_c_ptr[0] = tmp_avg;
+      }
+      w = 0;
+    }
+    h = 0;
+  }
+  return NNACL_OK;
+}
+
+// Average-pools the channels of one batch that form complete SIMD tile groups (c_xtile channels per step)
+// from the tiled input layout and stores the results in NHWC order.
+// The (tile group, output row, output column) index space is divided across pooling_param->thread_num_
+// tasks and task_id selects the slice computed by this call. Every average is clamped to [minf, maxf].
+// Returns NNACL_OK on success and NNACL_ERR when a pooling window covers no input element (real_count == 0).
+int AvgPoolingFromNC4HW4ToNHWCBatch(const float *src_b_ptr, float *dst_b_ptr, const PoolingParameter *pooling_param,
+                                    int task_id, float minf, float maxf) {
+  int in_w = pooling_param->input_w_, in_h = pooling_param->input_h_;
+  int win_w = pooling_param->window_w_, win_h = pooling_param->window_h_;
+  int output_w = pooling_param->output_w_, output_h = pooling_param->output_h_;
+  int channel = pooling_param->input_channel_;
+
+  int out_plane = output_w * output_h;
+  int in_plane = in_w * in_h;
+  // out_plane and output_w are divisors below, so neither output dimension may be zero.
+  NNACL_CHECK_ZERO_RETURN_ERR(out_plane);
+
+#ifdef ENABLE_AVX
+  const int c_tile = C8NUM;
+  const int once_calc_num = 2;
+  MS_FLOAT32X8 min_value_8 = MS_MOV256_F32(minf);
+  MS_FLOAT32X8 max_value_8 = MS_MOV256_F32(maxf);
+#elif defined(ENABLE_NEON) || defined(ENABLE_SSE)
+  const int c_tile = C4NUM;
+  const int once_calc_num = 1;
+  MS_FLOAT32X4 min_value = MS_MOVQ_F32(minf);
+  MS_FLOAT32X4 max_value = MS_MOVQ_F32(maxf);
+#else
+  const int c_tile = 1;
+  const int once_calc_num = 1;
+#endif
+
+  int in_w_cx_line = in_w * c_tile;
+  const int c_xtile = once_calc_num * c_tile;
+  int c_tile_num = channel / c_xtile;
+  int all_out_plane = out_plane * c_tile_num;
+  int calc_tile = UP_DIV(all_out_plane, pooling_param->thread_num_);
+
+  int index_begin = task_id * calc_tile;
+  int index_end = (index_begin + calc_tile) < all_out_plane ? (index_begin + calc_tile) : all_out_plane;
+
+  // Split the linear slice bounds into (tile group, row, column); one output row holds output_w elements.
+  int c_start = (index_begin / out_plane) * c_xtile;
+  int index_less = index_begin % out_plane;
+  int h_start = index_less / output_w;
+  int w_start = index_less % output_w;
+
+  int c_end = (index_end / out_plane) * c_xtile;
+  index_less = index_end % out_plane;
+  int h_end = index_less / output_w;
+  int w_end = index_less % output_w;
+
+  int c = c_start;
+  int h = h_start;
+  int w = w_start;
+  for (; c < channel; c += c_xtile) {
+    const float *src_c_ptr = src_b_ptr + c * in_plane;
+    for (; h < output_h; h++) {
+      int cur_index_in_h_start = MSMAX(h * pooling_param->stride_h_ - pooling_param->pad_d_, 0);
+      int cur_index_in_h_end = MSMIN(cur_index_in_h_start + win_h, in_h);
+
+      for (; w < output_w; w++) {
+        // Return as soon as the end of this task's slice is reached.
+        MS_CHECK_TRUE_RET((c < c_end || h < h_end || w < w_end), NNACL_OK);
+
+#ifdef ENABLE_AVX
+        MS_FLOAT32X8 tmp_avg = MS_MOV256_F32(0);
+        MS_FLOAT32X8 tmp_avg2 = MS_MOV256_F32(0);
+#elif defined(ENABLE_NEON) || defined(ENABLE_SSE)
+        MS_FLOAT32X4 tmp_avg = MS_MOVQ_F32(0);
+#else
+        float tmp_avg = 0;
+#endif
+
+        int cur_index_in_w_start = MSMAX(w * pooling_param->stride_w_ - pooling_param->pad_l_, 0);
+        int cur_index_in_w_end = MSMIN(cur_index_in_w_start + win_w, in_w);
+
+        int real_count = (cur_index_in_w_end - cur_index_in_w_start) * (cur_index_in_h_end - cur_index_in_h_start);
+        MS_CHECK_TRUE_RET(real_count != 0, NNACL_ERR);
+
+        for (int cur_index_in_h = cur_index_in_h_start; cur_index_in_h < cur_index_in_h_end; cur_index_in_h++) {
+          const float *src_c_ptr_h_line = src_c_ptr + cur_index_in_h * in_w_cx_line;
+          for (int cur_index_in_w = cur_index_in_w_start; cur_index_in_w < cur_index_in_w_end; cur_index_in_w++) {
+            const float *cur_input_index = src_c_ptr_h_line + cur_index_in_w * c_tile;
+#ifdef ENABLE_AVX
+            tmp_avg = MS_ADD256_F32(tmp_avg, MS_LD256_F32(cur_input_index));
+#elif defined(ENABLE_NEON) || defined(ENABLE_SSE)
+            tmp_avg = MS_ADDQ_F32(tmp_avg, MS_LDQ_F32(cur_input_index));
+#else
+            tmp_avg += cur_input_index[0];
+#endif
+          }
+
+#ifdef ENABLE_AVX
+          // AVX processes two tiles per step; the second tile starts c_tile * in_plane floats after the first.
+          const float *src_c2_ptr_h_line = src_c_ptr_h_line + c_tile * in_plane;
+          for (int cur_index_in_w = cur_index_in_w_start; cur_index_in_w < cur_index_in_w_end; cur_index_in_w++) {
+            const float *cur_input_index = src_c2_ptr_h_line + cur_index_in_w * c_tile;
+
+            tmp_avg2 = MS_ADD256_F32(tmp_avg2, MS_LD256_F32(cur_input_index));
+          }
+#endif
+        }
+
+        float *dst_c_ptr = dst_b_ptr + h * output_w * channel + w * channel + c;
+#ifdef ENABLE_AVX
+        float *dst_c2_ptr = dst_c_ptr + c_tile;
+
+        tmp_avg = MS_DIV256_F32(tmp_avg, MS_MOV256_F32(real_count));
+        tmp_avg = MS_MAX256_F32(tmp_avg, min_value_8);
+        tmp_avg = MS_MIN256_F32(tmp_avg, max_value_8);
+        MS_ST256_F32(dst_c_ptr, tmp_avg);
+
+        tmp_avg2 = MS_DIV256_F32(tmp_avg2, MS_MOV256_F32(real_count));
+        tmp_avg2 = MS_MAX256_F32(tmp_avg2, min_value_8);
+        tmp_avg2 = MS_MIN256_F32(tmp_avg2, max_value_8);
+        MS_ST256_F32(dst_c2_ptr, tmp_avg2);
+#elif defined(ENABLE_NEON) || defined(ENABLE_SSE)
+        tmp_avg = MS_DIVQ_F32(tmp_avg, MS_MOVQ_F32(real_count));
+        tmp_avg = MS_MAXQ_F32(tmp_avg, min_value);
+        tmp_avg = MS_MINQ_F32(tmp_avg, max_value);
+        MS_STQ_F32(dst_c_ptr, tmp_avg);
+#else
+        tmp_avg = tmp_avg / (float)real_count;
+        tmp_avg = fmaxf(tmp_avg, minf);
+        tmp_avg = fminf(tmp_avg, maxf);
+        dst_c_ptr[0] = tmp_avg;
+#endif
+      }
+      w = 0;
+    }
+    h = 0;
+  }
+
+  return NNACL_OK;
+}
+
+// Average pooling for an NC4HW4 input tensor producing NHWC output.
+// For every batch, first pools the channels that fill complete SIMD tile groups and then the leftover
+// tail channels; task_id selects this task's share of the work. Returns the first non-NNACL_OK status.
+int AvgPoolingFromNC4HW4ToNHWC(const float *input_ptr, float *output_ptr, const PoolingParameter *pooling_param,
+                               int task_id, float minf, float maxf) {
+  int in_w = pooling_param->input_w_;
+  int in_h = pooling_param->input_h_;
+  int output_w = pooling_param->output_w_;
+  int output_h = pooling_param->output_h_;
+  int channel = pooling_param->input_channel_;
+  int output_batch = pooling_param->output_batch_;
+
+  for (int batch = 0; batch < output_batch; batch++) {
+    const float *src_b_ptr = input_ptr + batch * in_h * in_w * channel;
+    float *dst_b_ptr = output_ptr + batch * output_h * output_w * channel;
+    int ret = AvgPoolingFromNC4HW4ToNHWCBatch(src_b_ptr, dst_b_ptr, pooling_param, task_id, minf, maxf);
+    if (ret != NNACL_OK) {
+      return ret;
+    }
+
+    ret = AvgPoolingFromNC4HW4ToNHWCLessC(src_b_ptr, dst_b_ptr, pooling_param, task_id, minf, maxf);
+    if (ret != NNACL_OK) {
+      return ret;
+    }
+  }
+  return NNACL_OK;
+}
+
 int MaxPoolingBatch(const float *src_b_ptr, float *dst_b_ptr, const PoolingParameter *pooling_param, int task_id,
                     float minf, float maxf) {
   int in_w = pooling_param->input_w_, in_h = pooling_param->input_h_;
@@ -249,7 +511,7 @@ int MaxPooling(const float *input_ptr, float *output_ptr, const PoolingParameter
   return NNACL_OK;
 }
 
-int MaxPoolingFormNC4HW4ToNHWCLessC(const float *src_b_ptr, float *dst_b_ptr, const PoolingParameter *pooling_param,
+int MaxPoolingFromNC4HW4ToNHWCLessC(const float *src_b_ptr, float *dst_b_ptr, const PoolingParameter *pooling_param,
                                     int task_id, float minf, float maxf) {
   int in_w = pooling_param->input_w_, in_h = pooling_param->input_h_;
   int win_w = pooling_param->window_w_, win_h = pooling_param->window_h_;
@@ -276,36 +538,61 @@ int MaxPoolingFormNC4HW4ToNHWCLessC(const float *src_b_ptr, float *dst_b_ptr, co
   int cur_c = (channel / c_xtile) * c_xtile;
   int last_c_size = channel - cur_c;
 
-  int calc_tile = UP_DIV(out_plane, pooling_param->thread_num_);
+  int less_out_plane = out_plane * last_c_size;
+  int calc_tile = UP_DIV(less_out_plane, pooling_param->thread_num_);
 
   int index_begin = task_id * calc_tile;
-  int index_end = (index_begin + calc_tile) < out_plane ? (index_begin + calc_tile) : out_plane;
+  int index_end = (index_begin + calc_tile) < less_out_plane ? (index_begin + calc_tile) : less_out_plane;
 
-  for (int c = cur_c; c < channel; c++) {
-    for (int index = index_begin; index < index_end; index++) {
-      const float *src_c_ptr = src_b_ptr + c * in_plane;
-      int h = index / output_h;
-      int w = index % output_h;
+  int c_start = (index_begin / out_plane) + cur_c;
+  int index_less = index_begin % out_plane;
+  int h_start = index_less / output_w;
+  int w_start = index_less % output_w;
 
-      float tmp_max = -FLT_MAX;
-      for (int kh = 0; kh < win_h; kh++) {
-        for (int kw = 0; kw < win_w; kw++) {
-          const float *src_win_ptr = src_c_ptr + (kh + win_h * h) * in_w * last_c_size + (kw + win_w * w) * last_c_size;
-          tmp_max = fmaxf(tmp_max, src_win_ptr[0]);
+  int c_end = (index_end / out_plane) + cur_c;
+  index_less = index_end % out_plane;
+  int h_end = index_less / output_w;
+  int w_end = index_less % output_w;
+
+  int c = c_start;
+  int h = h_start;
+  int w = w_start;
+
+  int in_w_cx_line = in_w * last_c_size;
+  const float *src_c_ptr = src_b_ptr + cur_c * in_plane;
+  for (; c < channel; c++) {
+    for (; h < output_h; h++) {
+      int cur_index_in_h_start = MSMAX(h * pooling_param->stride_h_ - pooling_param->pad_d_, 0);
+      int cur_index_in_h_end = MSMIN(cur_index_in_h_start + win_h, in_h);
+
+      for (; w < output_w; w++) {
+        MS_CHECK_TRUE_RET((c < c_end || h < h_end || w < w_end), NNACL_OK);
+        float tmp_max = -FLT_MAX;
+
+        int cur_index_in_w_start = MSMAX(w * pooling_param->stride_w_ - pooling_param->pad_l_, 0);
+        int cur_index_in_w_end = MSMIN(cur_index_in_w_start + win_w, in_w);
+
+        for (int cur_index_in_h = cur_index_in_h_start; cur_index_in_h < cur_index_in_h_end; cur_index_in_h++) {
+          const float *src_c_ptr_h_line = src_c_ptr + cur_index_in_h * in_w_cx_line;
+          for (int cur_index_in_w = cur_index_in_w_start; cur_index_in_w < cur_index_in_w_end; cur_index_in_w++) {
+            const float *cur_input_index = src_c_ptr_h_line + cur_index_in_w * last_c_size + (c - cur_c);
+            tmp_max = fmaxf(tmp_max, cur_input_index[0]);
+          }
         }
+
+        float *dst_c_ptr = dst_b_ptr + h * output_w * channel + w * channel + c;
+        tmp_max = fmaxf(tmp_max, minf);
+        tmp_max = fminf(tmp_max, maxf);
+        dst_c_ptr[0] = tmp_max;
       }
-
-      float *dst_c_ptr = dst_b_ptr + h * output_w * channel + w * channel + c;
-      tmp_max = fmaxf(tmp_max, minf);
-      tmp_max = fminf(tmp_max, maxf);
-      dst_c_ptr[0] = tmp_max;
+      w = 0;
     }
+    h = 0;
   }
-
   return NNACL_OK;
 }
 
-int MaxPoolingFormNC4HW4ToNHWCBatch(const float *src_b_ptr, float *dst_b_ptr, const PoolingParameter *pooling_param,
+int MaxPoolingFromNC4HW4ToNHWCBatch(const float *src_b_ptr, float *dst_b_ptr, const PoolingParameter *pooling_param,
                                     int task_id, float minf, float maxf) {
   int in_w = pooling_param->input_w_, in_h = pooling_param->input_h_;
   int win_w = pooling_param->window_w_, win_h = pooling_param->window_h_;
@@ -426,7 +713,7 @@ int MaxPoolingFormNC4HW4ToNHWCBatch(const float *src_b_ptr, float *dst_b_ptr, co
   return NNACL_OK;
 }
 
-int MaxPoolingFormNC4HW4ToNHWC(const float *input_ptr, float *output_ptr, const PoolingParameter *pooling_param,
+int MaxPoolingFromNC4HW4ToNHWC(const float *input_ptr, float *output_ptr, const PoolingParameter *pooling_param,
                                int task_id, float minf, float maxf) {
   int in_w = pooling_param->input_w_;
   int in_h = pooling_param->input_h_;
@@ -438,12 +725,12 @@ int MaxPoolingFormNC4HW4ToNHWC(const float *input_ptr, float *output_ptr, const
   for (int batch = 0; batch < output_batch; batch++) {
     const float *src_b_ptr = input_ptr + batch * in_h * in_w * channel;
     float *dst_b_ptr = output_ptr + batch * output_h * output_w * channel;
-    int ret = MaxPoolingFormNC4HW4ToNHWCBatch(src_b_ptr, dst_b_ptr, pooling_param, task_id, minf, maxf);
+    int ret = MaxPoolingFromNC4HW4ToNHWCBatch(src_b_ptr, dst_b_ptr, pooling_param, task_id, minf, maxf);
     if (ret != NNACL_OK) {
       return ret;
     }
 
-    ret = MaxPoolingFormNC4HW4ToNHWCLessC(src_b_ptr, dst_b_ptr, pooling_param, task_id, minf, maxf);
+    ret = MaxPoolingFromNC4HW4ToNHWCLessC(src_b_ptr, dst_b_ptr, pooling_param, task_id, minf, maxf);
     if (ret != NNACL_OK) {
       return ret;
     }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/pooling_fp32.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/pooling_fp32.h
index 9764354e514..dd0b1ebf3bc 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/pooling_fp32.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/pooling_fp32.h
@@ -29,9 +29,11 @@ extern "C" {
 #endif
 int AvgPooling(const float *input_ptr, float *output_ptr, const PoolingParameter *pooling_param, int task_id,
                float minf, float maxf);
+int AvgPoolingFromNC4HW4ToNHWC(const float *input_ptr, float *output_ptr, const PoolingParameter *pooling_param,
+                               int task_id, float minf, float maxf);
 int MaxPooling(const float *input_ptr, float *output_ptr, const PoolingParameter *pooling_param, int task_id,
                float minf, float maxf);
-int MaxPoolingFormNC4HW4ToNHWC(const float *input_ptr, float *output_ptr, const PoolingParameter *pooling_param,
+int MaxPoolingFromNC4HW4ToNHWC(const float *input_ptr, float *output_ptr, const PoolingParameter *pooling_param,
                                int task_id, float minf, float maxf);
 #ifdef __cplusplus
 }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/pooling_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/pooling_fp32.cc
index ff8c87cb2f3..22ceb1f2d38 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/pooling_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/pooling_fp32.cc
@@ -67,10 +67,9 @@ int PoolingCPUKernel::RunImpl(int task_id) const {
 
   if (in_tensors_[0]->format() == NC4HW4) {
     if (pooling_param_->pool_mode_ == PoolMode_MaxPool) {
-      ret = MaxPoolingFormNC4HW4ToNHWC(input_ptr, output_ptr, pooling_param_, task_id, minf, maxf);
+      ret = MaxPoolingFromNC4HW4ToNHWC(input_ptr, output_ptr, pooling_param_, task_id, minf, maxf);
     } else {
-      // ret = AvgPoolingFormNC4HW4ToNHWC(input_ptr, output_ptr, pooling_param_, task_id, minf, maxf);
-      MS_LOG(ERROR) << "Do not support NC4HW4 AvgPooling input format.";
+      ret = AvgPoolingFromNC4HW4ToNHWC(input_ptr, output_ptr, pooling_param_, task_id, minf, maxf);
     }
   } else if (in_tensors_[0]->format() == NHWC) {
     if (pooling_param_->pool_mode_ == PoolMode_MaxPool) {