From 8230f06ca283e008a207cf163243b1d78505c4d4 Mon Sep 17 00:00:00 2001 From: ling Date: Tue, 26 Oct 2021 20:12:59 +0800 Subject: [PATCH] [MSLITE] code clean --- .../cpu/nnacl/fp32/pack_fp32.c | 55 +-- .../cpu/nnacl/fp32/pooling_fp32.c | 371 ++++++++++-------- .../kernel_compiler/cpu/nnacl/op_base.h | 1 + mindspore/lite/src/lite_session.cc | 4 + .../src/runtime/kernel/arm/fp16/pad_fp16.cc | 42 +- .../src/runtime/kernel/arm/fp16/pad_fp16.h | 7 +- .../src/runtime/kernel/arm/fp16/prelu_fp16.cc | 2 +- .../src/runtime/kernel/arm/fp16/prelu_fp16.h | 2 +- .../runtime/kernel/arm/fp32/gatherNd_fp32.cc | 6 +- .../runtime/kernel/arm/fp32/gatherNd_fp32.h | 2 +- .../runtime/kernel/arm/fp32/gather_fp32.cc | 6 +- .../src/runtime/kernel/arm/fp32/gather_fp32.h | 2 +- .../src/runtime/kernel/arm/fp32/glu_fp32.cc | 27 +- .../src/runtime/kernel/arm/fp32/glu_fp32.h | 12 +- .../src/runtime/kernel/arm/fp32/gru_fp32.h | 2 +- .../kernel/arm/fp32/instance_norm_fp32.cc | 6 +- .../kernel/arm/fp32/instance_norm_fp32.h | 4 +- .../runtime/kernel/arm/fp32/l2_norm_fp32.cc | 18 +- .../runtime/kernel/arm/fp32/l2_norm_fp32.h | 6 +- .../kernel/arm/fp32/layer_norm_fp32.cc | 6 +- .../runtime/kernel/arm/fp32/layer_norm_fp32.h | 2 +- .../arm/fp32/local_response_norm_fp32.cc | 6 +- .../arm/fp32/local_response_norm_fp32.h | 2 +- .../kernel/arm/fp32/log_softmax_fp32.cc | 6 +- .../kernel/arm/fp32/log_softmax_fp32.h | 2 +- .../src/runtime/kernel/arm/fp32/lstm_fp32.cc | 6 +- .../src/runtime/kernel/arm/fp32/lstm_fp32.h | 10 +- .../kernel/arm/fp32/matmul_fp32_base.cc | 50 ++- .../kernel/arm/fp32/matmul_fp32_base.h | 4 +- .../arm/fp32/non_max_suppression_fp32.cc | 6 +- .../arm/fp32/non_max_suppression_fp32.h | 3 +- .../src/runtime/kernel/arm/fp32/pad_fp32.cc | 50 +-- .../src/runtime/kernel/arm/fp32/pad_fp32.h | 9 +- .../runtime/kernel/arm/fp32/pooling_fp32.cc | 6 +- .../runtime/kernel/arm/fp32/pooling_fp32.h | 2 +- .../src/runtime/kernel/arm/fp32/power_fp32.cc | 7 +- .../src/runtime/kernel/arm/fp32/power_fp32.h | 2 +- .../src/runtime/kernel/arm/fp32/prelu_fp32.cc | 6 +- .../src/runtime/kernel/arm/fp32/prelu_fp32.h | 2 +- mindspore/lite/src/runtime/runtime_pass.cc | 4 +- 40 files changed, 413 insertions(+), 353 deletions(-) diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/pack_fp32.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/pack_fp32.c index deba0d28f76..4d40298cd63 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/pack_fp32.c +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/pack_fp32.c @@ -258,35 +258,36 @@ void PackNHWCToNXHWCXFp32(int kernel_h, int kernel_w, int output_channel, int oc tmp_weight[oc_remainder + oc_remainder_step * ic] = src[ic + oc_remainder * input_channel]; } } - } else { - for (; oc < oc_block8; oc += (oc_block / C8NUM)) { - oc_block = MSMIN(C4NUM, oc_block8 - oc) * C8NUM; // max_tile = 32 ==> 24 ==> 16 ==> 8 - for (int oc_tmp = 0; oc_tmp < oc_block; oc_tmp += C8NUM) { - for (int hw = 0; hw < plane; ++hw) { - int ic = 0; - for (; ic < ic8; ic += C8NUM) { - Transpose8X8Fp32Avx(src + hw * input_channel + ic, - tmp_weight + hw * oc_block * input_channel + ic * oc_block + oc_tmp, - input_channel * plane, oc_block); - } - for (; ic < input_channel; ++ic) { - for (int j = 0; j < C8NUM; ++j) { - tmp_weight[ic * oc_block + oc_tmp + j + hw * oc_block * input_channel] = - src[ic + input_channel * j * plane + hw * input_channel]; - } - } - } - src += C8NUM * plane * input_channel; - } - tmp_weight += oc_block * input_channel * plane; - } - oc = output_channel - 
oc_block8 * C8NUM; - for (int oc_remainder = 0; oc_remainder < oc; ++oc_remainder) { + return; + } + + for (; oc < oc_block8; oc += (oc_block / C8NUM)) { + oc_block = MSMIN(C4NUM, oc_block8 - oc) * C8NUM; // max_tile = 32 ==> 24 ==> 16 ==> 8 + for (int oc_tmp = 0; oc_tmp < oc_block; oc_tmp += C8NUM) { for (int hw = 0; hw < plane; ++hw) { - for (int ic = 0; ic < input_channel; ++ic) { - tmp_weight[oc_remainder + oc_remainder_step * ic + hw * input_channel * oc_remainder_step] = - src[ic + (oc_remainder * plane + hw) * input_channel]; + int ic = 0; + for (; ic < ic8; ic += C8NUM) { + Transpose8X8Fp32Avx(src + hw * input_channel + ic, + tmp_weight + hw * oc_block * input_channel + ic * oc_block + oc_tmp, + input_channel * plane, oc_block); } + for (; ic < input_channel; ++ic) { + for (int j = 0; j < C8NUM; ++j) { + tmp_weight[ic * oc_block + oc_tmp + j + hw * oc_block * input_channel] = + src[ic + input_channel * j * plane + hw * input_channel]; + } + } + } + src += C8NUM * plane * input_channel; + } + tmp_weight += oc_block * input_channel * plane; + } + oc = output_channel - oc_block8 * C8NUM; + for (int oc_remainder = 0; oc_remainder < oc; ++oc_remainder) { + for (int hw = 0; hw < plane; ++hw) { + for (int ic = 0; ic < input_channel; ++ic) { + tmp_weight[oc_remainder + oc_remainder_step * ic + hw * input_channel * oc_remainder_step] = + src[ic + (oc_remainder * plane + hw) * input_channel]; } } } diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/pooling_fp32.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/pooling_fp32.c index 770622243dd..400e4e2fd2c 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/pooling_fp32.c +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/pooling_fp32.c @@ -19,15 +19,12 @@ #include "nnacl/errorcode.h" #include "nnacl/op_base.h" -int AvgPooling(const float *input_ptr, float *output_ptr, const PoolingParameter *pooling_param, int task_id, - float minf, float maxf) { - int win_w = pooling_param->window_w_; - int win_h = pooling_param->window_h_; +int AvgPoolingBatch(const float *src_b_ptr, float *dst_b_ptr, const PoolingParameter *pooling_param, int task_id, + float minf, float maxf) { + int win_w = pooling_param->window_w_, win_h = pooling_param->window_h_; + int in_w = pooling_param->input_w_, in_h = pooling_param->input_h_; + int output_w = pooling_param->output_w_, output_h = pooling_param->output_h_; int channel = pooling_param->input_channel_; - int in_w = pooling_param->input_w_; - int in_h = pooling_param->input_h_; - int output_w = pooling_param->output_w_; - int output_h = pooling_param->output_h_; int out_plane = output_w * output_h; int out_tile_count = UP_DIV(out_plane, TILE_NUM); NNACL_CHECK_ZERO_RETURN_ERR(output_w); @@ -42,190 +39,218 @@ int AvgPooling(const float *input_ptr, float *output_ptr, const PoolingParameter MS_FLOAT32X4 max_value = MS_MOVQ_F32(maxf); #endif - for (int batch = 0; batch < pooling_param->output_batch_; batch++) { - const float *src_b_ptr = input_ptr + batch * in_h * in_w * channel; - float *dst_b_ptr = output_ptr + batch * output_h * output_w * channel; - for (int thread_id = task_id; thread_id < out_tile_count; thread_id += pooling_param->thread_num_) { - int cal_start_index = thread_id * TILE_NUM; - int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? 
TILE_NUM : (out_plane - cal_start_index); - for (int i = 0; i < real_cal_num; i++) { - int index = cal_start_index + i; - int out_w_index = index % output_w; - int out_h_index = index / output_w; - int in_w_index = out_w_index * pooling_param->stride_w_ - pooling_param->pad_l_; - int in_h_index = out_h_index * pooling_param->stride_h_ - pooling_param->pad_u_; + for (int thread_id = task_id; thread_id < out_tile_count; thread_id += pooling_param->thread_num_) { + int cal_start_index = thread_id * TILE_NUM; + int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? TILE_NUM : (out_plane - cal_start_index); + for (int i = 0; i < real_cal_num; i++) { + int index = cal_start_index + i; + int out_w_index = index % output_w; + int out_h_index = index / output_w; + int in_w_index = out_w_index * pooling_param->stride_w_ - pooling_param->pad_l_; + int in_h_index = out_h_index * pooling_param->stride_h_ - pooling_param->pad_u_; - const float *src_plane_ptr = src_b_ptr; - float *dst_plane_ptr = dst_b_ptr + index * channel; + const float *src_plane_ptr = src_b_ptr; + float *dst_plane_ptr = dst_b_ptr + index * channel; - int real_win_h_start = MSMAX(0, -in_h_index); - int real_win_h_end = MSMIN(win_h, in_h - in_h_index); - int real_win_w_start = MSMAX(0, -in_w_index); - int real_win_w_end = MSMIN(win_w, in_w - in_w_index); - int ci = 0; + int real_win_h_start = MSMAX(0, -in_h_index); + int real_win_h_end = MSMIN(win_h, in_h - in_h_index); + int real_win_w_start = MSMAX(0, -in_w_index); + int real_win_w_end = MSMIN(win_w, in_w - in_w_index); + int ci = 0; #ifdef ENABLE_AVX - for (; ci < c8; ci += C8NUM) { - const float *src_c_ptr = src_plane_ptr + ci; - float *dst_c_ptr = dst_plane_ptr + ci; - MS_FLOAT32X8 tmp_avg = MS_MOV256_F32(0); - int real_count = 0; - for (int h = real_win_h_start; h < real_win_h_end; h++) { - for (int w = real_win_w_start; w < real_win_w_end; w++) { - const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel; - tmp_avg = MS_ADD256_F32(tmp_avg, MS_LD256_F32(src_win_ptr)); - ++real_count; - } // win_w loop - } // win_h loop - if (real_count == 0) { - return NNACL_ERR; - } - tmp_avg = MS_DIV256_F32(tmp_avg, MS_MOV256_F32(real_count)); - tmp_avg = MS_MAX256_F32(tmp_avg, min_value_8); - tmp_avg = MS_MIN256_F32(tmp_avg, max_value_8); - MS_ST256_F32(dst_c_ptr, tmp_avg); - } // ic8-1 loop + for (; ci < c8; ci += C8NUM) { + const float *src_c_ptr = src_plane_ptr + ci; + float *dst_c_ptr = dst_plane_ptr + ci; + MS_FLOAT32X8 tmp_avg = MS_MOV256_F32(0); + int real_count = 0; + for (int h = real_win_h_start; h < real_win_h_end; h++) { + for (int w = real_win_w_start; w < real_win_w_end; w++) { + const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel; + tmp_avg = MS_ADD256_F32(tmp_avg, MS_LD256_F32(src_win_ptr)); + ++real_count; + } // win_w loop + } // win_h loop + if (real_count == 0) { + return NNACL_ERR; + } + tmp_avg = MS_DIV256_F32(tmp_avg, MS_MOV256_F32(real_count)); + tmp_avg = MS_MAX256_F32(tmp_avg, min_value_8); + tmp_avg = MS_MIN256_F32(tmp_avg, max_value_8); + MS_ST256_F32(dst_c_ptr, tmp_avg); + } // ic8-1 loop #endif #if defined(ENABLE_NEON) || defined(ENABLE_SSE) - for (; ci < c4; ci += C4NUM) { - const float *src_c_ptr = src_plane_ptr + ci; - float *dst_c_ptr = dst_plane_ptr + ci; - MS_FLOAT32X4 tmp_avg = MS_MOVQ_F32(0); - int real_count = 0; - for (int h = real_win_h_start; h < real_win_h_end; h++) { - for (int w = real_win_w_start; w < real_win_w_end; w++) { - const float *src_win_ptr = src_c_ptr + 
((in_h_index + h) * in_w + in_w_index + w) * channel; - tmp_avg = MS_ADDQ_F32(tmp_avg, MS_LDQ_F32(src_win_ptr)); - ++real_count; - } // win_w loop - } // win_h loop - if (real_count == 0) { - return NNACL_ERR; - } - tmp_avg = MS_DIVQ_F32(tmp_avg, MS_MOVQ_F32(real_count)); - tmp_avg = MS_MAXQ_F32(tmp_avg, min_value); - tmp_avg = MS_MINQ_F32(tmp_avg, max_value); - MS_STQ_F32(dst_c_ptr, tmp_avg); - } // ic4-1 loop + for (; ci < c4; ci += C4NUM) { + const float *src_c_ptr = src_plane_ptr + ci; + float *dst_c_ptr = dst_plane_ptr + ci; + MS_FLOAT32X4 tmp_avg = MS_MOVQ_F32(0); + int real_count = 0; + for (int h = real_win_h_start; h < real_win_h_end; h++) { + for (int w = real_win_w_start; w < real_win_w_end; w++) { + const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel; + tmp_avg = MS_ADDQ_F32(tmp_avg, MS_LDQ_F32(src_win_ptr)); + ++real_count; + } // win_w loop + } // win_h loop + if (real_count == 0) { + return NNACL_ERR; + } + tmp_avg = MS_DIVQ_F32(tmp_avg, MS_MOVQ_F32(real_count)); + tmp_avg = MS_MAXQ_F32(tmp_avg, min_value); + tmp_avg = MS_MINQ_F32(tmp_avg, max_value); + MS_STQ_F32(dst_c_ptr, tmp_avg); + } // ic4-1 loop #endif - for (; ci < channel; ci++) { - const float *src_c_ptr = src_plane_ptr + ci; - float *dst_c_ptr = dst_plane_ptr + ci; - float tmp_avg = 0; - int real_count = 0; - for (int h = real_win_h_start; h < real_win_h_end; h++) { - for (int w = real_win_w_start; w < real_win_w_end; w++) { - const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel; - tmp_avg += src_win_ptr[0]; - ++real_count; - } // win_w loop - } // win_h loop - if (real_count == 0) { - return NNACL_ERR; - } - tmp_avg = tmp_avg / (float)real_count; - tmp_avg = fmaxf(tmp_avg, minf); - tmp_avg = fminf(tmp_avg, maxf); - dst_c_ptr[0] = tmp_avg; - } // channel_res loop - } // real_cal_num loop - } // out_plane loop - } // out_batch loop + for (; ci < channel; ci++) { + const float *src_c_ptr = src_plane_ptr + ci; + float *dst_c_ptr = dst_plane_ptr + ci; + float tmp_avg = 0; + int real_count = 0; + for (int h = real_win_h_start; h < real_win_h_end; h++) { + for (int w = real_win_w_start; w < real_win_w_end; w++) { + const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel; + tmp_avg += src_win_ptr[0]; + ++real_count; + } // win_w loop + } // win_h loop + if (real_count == 0) { + return NNACL_ERR; + } + tmp_avg = tmp_avg / (float)real_count; + tmp_avg = fmaxf(tmp_avg, minf); + tmp_avg = fminf(tmp_avg, maxf); + dst_c_ptr[0] = tmp_avg; + } // channel_res loop + } // real_cal_num loop + } // out_plane loop + return NNACL_OK; +} + +int AvgPooling(const float *input_ptr, float *output_ptr, const PoolingParameter *pooling_param, int task_id, + float minf, float maxf) { + int in_w = pooling_param->input_w_; + int in_h = pooling_param->input_h_; + int output_w = pooling_param->output_w_; + int output_h = pooling_param->output_h_; + int channel = pooling_param->input_channel_; + int output_batch = pooling_param->output_batch_; + + for (int batch = 0; batch < output_batch; batch++) { + const float *src_b_ptr = input_ptr + batch * in_h * in_w * channel; + float *dst_b_ptr = output_ptr + batch * output_h * output_w * channel; + int ret = AvgPoolingBatch(src_b_ptr, dst_b_ptr, pooling_param, task_id, minf, maxf); + if (ret != NNACL_OK) { + return ret; + } + } + return NNACL_OK; +} + +int MaxPoolingBatch(const float *src_b_ptr, float *dst_b_ptr, const PoolingParameter *pooling_param, int task_id, + float minf, float maxf) { 
+ int in_w = pooling_param->input_w_, in_h = pooling_param->input_h_; + int win_w = pooling_param->window_w_, win_h = pooling_param->window_h_; + int output_w = pooling_param->output_w_, output_h = pooling_param->output_h_; + int channel = pooling_param->input_channel_; + int out_plane = output_w * output_h; + int out_tile_count = UP_DIV(out_plane, TILE_NUM); + NNACL_CHECK_ZERO_RETURN_ERR(output_w); +#ifdef ENABLE_AVX + int c8 = channel / C8NUM * C8NUM; + MS_FLOAT32X8 min_value_8 = MS_MOV256_F32(minf); + MS_FLOAT32X8 max_value_8 = MS_MOV256_F32(maxf); +#endif +#if defined(ENABLE_NEON) || defined(ENABLE_SSE) + int c4 = channel / C4NUM * C4NUM; + MS_FLOAT32X4 min_value = MS_MOVQ_F32(minf); + MS_FLOAT32X4 max_value = MS_MOVQ_F32(maxf); +#endif + + for (int thread_id = task_id; thread_id < out_tile_count; thread_id += pooling_param->thread_num_) { + int cal_start_index = thread_id * TILE_NUM; + int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? TILE_NUM : (out_plane - cal_start_index); + for (int i = 0; i < real_cal_num; i++) { + int index = cal_start_index + i; + int out_w_index = index % output_w; + int out_h_index = index / output_w; + int in_w_index = out_w_index * pooling_param->stride_w_ - pooling_param->pad_l_; + int in_h_index = out_h_index * pooling_param->stride_h_ - pooling_param->pad_u_; + + const float *src_plane_ptr = src_b_ptr; + float *dst_plane_ptr = dst_b_ptr + index * channel; + + int real_win_h_start = MSMAX(0, -in_h_index); + int real_win_h_end = MSMIN(win_h, in_h - in_h_index); + int real_win_w_start = MSMAX(0, -in_w_index); + int real_win_w_end = MSMIN(win_w, in_w - in_w_index); + int ci = 0; +#ifdef ENABLE_AVX + for (; ci < c8; ci += C8NUM) { + const float *src_c_ptr = src_plane_ptr + ci; + float *dst_c_ptr = dst_plane_ptr + ci; + MS_FLOAT32X8 tmp_max = MS_MOV256_F32(-FLT_MAX); + for (int kh = real_win_h_start; kh < real_win_h_end; kh++) { + for (int kw = real_win_w_start; kw < real_win_w_end; kw++) { + const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel; + tmp_max = MS_MAX256_F32(tmp_max, MS_LD256_F32(src_win_ptr)); + } // win_w loop + } // win_h loop + tmp_max = MS_MAX256_F32(tmp_max, min_value_8); + tmp_max = MS_MIN256_F32(tmp_max, max_value_8); + MS_ST256_F32(dst_c_ptr, tmp_max); + } // ic8 loop +#endif +#if defined(ENABLE_NEON) || defined(ENABLE_SSE) + for (; ci < c4; ci += C4NUM) { + const float *src_c_ptr = src_plane_ptr + ci; + float *dst_c_ptr = dst_plane_ptr + ci; + MS_FLOAT32X4 tmp_max = MS_MOVQ_F32(-FLT_MAX); + for (int kh = real_win_h_start; kh < real_win_h_end; kh++) { + for (int kw = real_win_w_start; kw < real_win_w_end; kw++) { + const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel; + tmp_max = MS_MAXQ_F32(tmp_max, MS_LDQ_F32(src_win_ptr)); + } // win_w loop + } // win_h loop + tmp_max = MS_MAXQ_F32(tmp_max, min_value); + tmp_max = MS_MINQ_F32(tmp_max, max_value); + MS_STQ_F32(dst_c_ptr, tmp_max); + } // ic4 loop +#endif + for (; ci < channel; ci++) { + float *dst_c_ptr = dst_plane_ptr + ci; + const float *src_c_ptr = src_plane_ptr + ci; + float tmp_max = -FLT_MAX; + for (int kh = real_win_h_start; kh < real_win_h_end; kh++) { + for (int kw = real_win_w_start; kw < real_win_w_end; kw++) { + const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel; + tmp_max = fmaxf(tmp_max, src_win_ptr[0]); + } // win_w loop + } // win_h loop + tmp_max = fmaxf(tmp_max, minf); + tmp_max = fminf(tmp_max, maxf); + dst_c_ptr[0] = tmp_max; + } // 
channel_res loop + } // real_cal_num loop + } // out_plane loop return NNACL_OK; } int MaxPooling(const float *input_ptr, float *output_ptr, const PoolingParameter *pooling_param, int task_id, float minf, float maxf) { - int win_w = pooling_param->window_w_; - int win_h = pooling_param->window_h_; - int channel = pooling_param->input_channel_; int in_w = pooling_param->input_w_; int in_h = pooling_param->input_h_; int output_w = pooling_param->output_w_; int output_h = pooling_param->output_h_; + int channel = pooling_param->input_channel_; int output_batch = pooling_param->output_batch_; - int out_plane = output_w * output_h; - int out_tile_count = UP_DIV(out_plane, TILE_NUM); - NNACL_CHECK_ZERO_RETURN_ERR(output_w); -#ifdef ENABLE_AVX - int c8 = channel / C8NUM * C8NUM; - MS_FLOAT32X8 min_value_8 = MS_MOV256_F32(minf); - MS_FLOAT32X8 max_value_8 = MS_MOV256_F32(maxf); -#endif -#if defined(ENABLE_NEON) || defined(ENABLE_SSE) - int c4 = channel / C4NUM * C4NUM; - MS_FLOAT32X4 min_value = MS_MOVQ_F32(minf); - MS_FLOAT32X4 max_value = MS_MOVQ_F32(maxf); -#endif for (int batch = 0; batch < output_batch; batch++) { const float *src_b_ptr = input_ptr + batch * in_h * in_w * channel; float *dst_b_ptr = output_ptr + batch * output_h * output_w * channel; - for (int thread_id = task_id; thread_id < out_tile_count; thread_id += pooling_param->thread_num_) { - int cal_start_index = thread_id * TILE_NUM; - int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? TILE_NUM : (out_plane - cal_start_index); - for (int i = 0; i < real_cal_num; i++) { - int index = cal_start_index + i; - int out_w_index = index % output_w; - int out_h_index = index / output_w; - int in_w_index = out_w_index * pooling_param->stride_w_ - pooling_param->pad_l_; - int in_h_index = out_h_index * pooling_param->stride_h_ - pooling_param->pad_u_; - - const float *src_plane_ptr = src_b_ptr; - float *dst_plane_ptr = dst_b_ptr + index * channel; - - int real_win_h_start = MSMAX(0, -in_h_index); - int real_win_h_end = MSMIN(win_h, in_h - in_h_index); - int real_win_w_start = MSMAX(0, -in_w_index); - int real_win_w_end = MSMIN(win_w, in_w - in_w_index); - int ci = 0; -#ifdef ENABLE_AVX - for (; ci < c8; ci += C8NUM) { - const float *src_c_ptr = src_plane_ptr + ci; - float *dst_c_ptr = dst_plane_ptr + ci; - MS_FLOAT32X8 tmp_max = MS_MOV256_F32(-FLT_MAX); - for (int kh = real_win_h_start; kh < real_win_h_end; kh++) { - for (int kw = real_win_w_start; kw < real_win_w_end; kw++) { - const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel; - tmp_max = MS_MAX256_F32(tmp_max, MS_LD256_F32(src_win_ptr)); - } // win_w loop - } // win_h loop - tmp_max = MS_MAX256_F32(tmp_max, min_value_8); - tmp_max = MS_MIN256_F32(tmp_max, max_value_8); - MS_ST256_F32(dst_c_ptr, tmp_max); - } // ic8 loop -#endif -#if defined(ENABLE_NEON) || defined(ENABLE_SSE) - for (; ci < c4; ci += C4NUM) { - const float *src_c_ptr = src_plane_ptr + ci; - float *dst_c_ptr = dst_plane_ptr + ci; - MS_FLOAT32X4 tmp_max = MS_MOVQ_F32(-FLT_MAX); - for (int kh = real_win_h_start; kh < real_win_h_end; kh++) { - for (int kw = real_win_w_start; kw < real_win_w_end; kw++) { - const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel; - tmp_max = MS_MAXQ_F32(tmp_max, MS_LDQ_F32(src_win_ptr)); - } // win_w loop - } // win_h loop - tmp_max = MS_MAXQ_F32(tmp_max, min_value); - tmp_max = MS_MINQ_F32(tmp_max, max_value); - MS_STQ_F32(dst_c_ptr, tmp_max); - } // ic4 loop -#endif - for (; ci < channel; ci++) { - 
float *dst_c_ptr = dst_plane_ptr + ci; - const float *src_c_ptr = src_plane_ptr + ci; - float tmp_max = -FLT_MAX; - for (int kh = real_win_h_start; kh < real_win_h_end; kh++) { - for (int kw = real_win_w_start; kw < real_win_w_end; kw++) { - const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel; - tmp_max = fmaxf(tmp_max, src_win_ptr[0]); - } // win_w loop - } // win_h loop - tmp_max = fmaxf(tmp_max, minf); - tmp_max = fminf(tmp_max, maxf); - dst_c_ptr[0] = tmp_max; - } // channel_res loop - } // real_cal_num loop - } // out_plane loop - } // out_batch loop + int ret = MaxPoolingBatch(src_b_ptr, dst_b_ptr, pooling_param, task_id, minf, maxf); + if (ret != NNACL_OK) { + return ret; + } + } return NNACL_OK; } diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/op_base.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/op_base.h index a1057518ea3..56e0eaf6eb7 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/op_base.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/op_base.h @@ -76,6 +76,7 @@ #define THIRD_INPUT 2 #define FOURTH_INPUT 3 #define FIFTH_INPUT 4 +#define SIXTH_INPUT 5 #define DIMENSION_1D 1 #define DIMENSION_2D 2 diff --git a/mindspore/lite/src/lite_session.cc b/mindspore/lite/src/lite_session.cc index 216207f695f..d5d4463e1c1 100644 --- a/mindspore/lite/src/lite_session.cc +++ b/mindspore/lite/src/lite_session.cc @@ -165,6 +165,10 @@ int LiteSession::ConvertTensorsData(const lite::Model *model, size_t tensor_inde auto ret = DecompressTensor(*src_tensor, dst_tensor); if (ret == RET_NO_CHANGE) { + if (src_tensor->data()->size() < dst_tensor->Size()) { + MS_LOG(ERROR) << "Tensor data shape invalid"; + return RET_ERROR; + } dst_tensor->set_data(const_cast(src_tensor->data()->data())); dst_tensor->set_own_data(false); } else if (ret != RET_OK) { diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/pad_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/pad_fp16.cc index 33f0ea2f87a..5b6b6fc140e 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/pad_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/pad_fp16.cc @@ -28,12 +28,32 @@ namespace mindspore::kernel { namespace { constexpr size_t kPadCommonInputSize = 2; } // namespace -int PadFp16CPUKernel::RunImpl(int task_id) { +int PadFp16CPUKernel::RunImpl(int task_id) const { PadFp16(input_, output_, in_, out_, pad_param_->paddings_, task_id, op_parameter_->thread_num_); return RET_OK; } -int PadFp16CPUKernel::RunMirrorPadImpl(int task_id) { +void PadFp16CPUKernel::RunMirrorPadImplFast(const MirrorPadBlock &block, const float16_t *input_data, + float16_t *output_data) const { + for (int a = 0; a < block.size_[0]; a++) { + int out_a_index = block.out_offset_ + a * block.out_stride_[0]; + for (int b = 0; b < block.size_[1]; b++) { + int out_b_index = out_a_index + b * block.out_stride_[1]; + for (int c = 0; c < block.size_[2]; ++c) { + int out_c_index = out_b_index + c * block.out_stride_[2]; + for (int d = 0; d < block.size_[3]; ++d) { + int out_d_index = out_c_index + d * block.out_stride_[3]; + for (int e = 0; e < block.size_[4]; ++e) { + int output_index = out_d_index + e * block.out_stride_[4]; + MirrorPadFp16(input_data, output_data, in_, pad_param_, output_index, output_index + block.size_[5]); + } + } + } + } + } +} + +int PadFp16CPUKernel::RunMirrorPadImpl(int task_id) const { auto input = in_tensors_.at(0); CHECK_NULL_RETURN(input); auto output = out_tensors_.at(0); @@ -51,23 +71,7 @@ int PadFp16CPUKernel::RunMirrorPadImpl(int task_id) { /* 
calculate region part */ for (size_t i = task_id; i < mirror_pad_block_.size(); i += op_parameter_->thread_num_) { auto block = mirror_pad_block_[i]; - - for (int a = 0; a < block.size_[0]; a++) { - int out_a_index = block.out_offset_ + a * block.out_stride_[0]; - for (int b = 0; b < block.size_[1]; b++) { - int out_b_index = out_a_index + b * block.out_stride_[1]; - for (int c = 0; c < block.size_[2]; ++c) { - int out_c_index = out_b_index + c * block.out_stride_[2]; - for (int d = 0; d < block.size_[3]; ++d) { - int out_d_index = out_c_index + d * block.out_stride_[3]; - for (int e = 0; e < block.size_[4]; ++e) { - int output_index = out_d_index + e * block.out_stride_[4]; - MirrorPadFp16(input_data, output_data, in_, pad_param_, output_index, output_index + block.size_[5]); - } - } - } - } - } + RunMirrorPadImplFast(block, input_data, output_data); } return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/pad_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/pad_fp16.h index 2e0d0d69f35..807831df120 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/pad_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/pad_fp16.h @@ -30,8 +30,11 @@ class PadFp16CPUKernel : public PadCPUKernel { ~PadFp16CPUKernel() {} int Run() override; - int RunImpl(int task_id) override; - int RunMirrorPadImpl(int task_id) override; + int RunImpl(int task_id) const override; + int RunMirrorPadImpl(int task_id) const override; + + private: + void RunMirrorPadImplFast(const MirrorPadBlock &block, const float16_t *input_data, float16_t *output_data) const; private: float16_t *input_ = nullptr; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/prelu_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/prelu_fp16.cc index a86fdb2e425..283a0cddbd6 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/prelu_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/prelu_fp16.cc @@ -25,7 +25,7 @@ using mindspore::lite::RET_OK; using mindspore::schema::PrimitiveType_PReLUFusion; namespace mindspore::kernel { -int PReluFp16CPUKernel::DoExcute(int task_id) { +int PReluFp16CPUKernel::DoExcute(int task_id) const { int thread_num = param_->op_parameter_.thread_num_; if (thread_num == 0) { MS_LOG(ERROR) << "thread_num is 0!"; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/prelu_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/prelu_fp16.h index 7f0f28ed737..c2f0123a5cd 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/prelu_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/prelu_fp16.h @@ -27,7 +27,7 @@ class PReluFp16CPUKernel : public PReluCPUKernel { : PReluCPUKernel(parameter, inputs, outputs, ctx) {} ~PReluFp16CPUKernel() = default; - int DoExcute(int task_id) override; + int DoExcute(int task_id) const override; }; } // namespace mindspore::kernel #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_PRELU_FP16_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/gatherNd_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/gatherNd_fp32.cc index 3d4a09c2bb2..74036ae2621 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/gatherNd_fp32.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/gatherNd_fp32.cc @@ -102,7 +102,7 @@ void GatherNdCPUKernel::InitOffset() { } } -int GatherNdCPUKernel::DoGatherNd(int task_id) { +int GatherNdCPUKernel::DoGatherNd(int task_id) const { int count = MSMIN(thread_sz_stride_, count_ - task_id * thread_sz_stride_); if (count <= 0) { return RET_OK; @@ -116,8 +116,8 @@ int GatherNdCPUKernel::DoGatherNd(int task_id) { return RET_OK; 
}

-int GatherNdRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
-  auto g_kernel = reinterpret_cast<GatherNdCPUKernel *>(cdata);
+int GatherNdRun(const void *cdata, int task_id, float, float) {
+  auto g_kernel = reinterpret_cast<const GatherNdCPUKernel *>(cdata);
   auto ret = g_kernel->DoGatherNd(task_id);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "GatherNdRun error task_id[" << task_id << "] error_code[" << ret << "]";
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/gatherNd_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/gatherNd_fp32.h
index 54f6cc14c36..be3ba3c2760 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/gatherNd_fp32.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/gatherNd_fp32.h
@@ -37,7 +37,7 @@ class GatherNdCPUKernel : public InnerKernel {
   int Prepare() override;
   int ReSize() override;
   int Run() override;
-  int DoGatherNd(int task_id);
+  int DoGatherNd(int task_id) const;

  private:
   void InitOffset();
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/gather_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/gather_fp32.cc
index 989eac7b7a5..785462b1d28 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/gather_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/gather_fp32.cc
@@ -42,7 +42,7 @@ int GatherCPUKernel::Prepare() {

 int GatherCPUKernel::ReSize() { return RET_OK; }

-int GatherCPUKernel::DoGather(int task_id) {
+int GatherCPUKernel::DoGather(int task_id) const {
   auto input_tensor = in_tensors_.at(0);
   auto indices_tensor = in_tensors_.at(1);
   auto out_tensor = out_tensors_.at(0);
@@ -81,8 +81,8 @@ int GatherCPUKernel::DoGather(int task_id) {
   return error_code;
 }

-int GatherRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
-  auto gather_kernel = reinterpret_cast<GatherCPUKernel *>(cdata);
+int GatherRun(const void *cdata, int task_id, float, float) {
+  auto gather_kernel = reinterpret_cast<const GatherCPUKernel *>(cdata);
   auto error_code = gather_kernel->DoGather(task_id);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "GatherRun error task_id[" << task_id << "] error_code[" << error_code << "]";
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/gather_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/gather_fp32.h
index 56199468f0d..b8a3c1195a9 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/gather_fp32.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/gather_fp32.h
@@ -34,7 +34,7 @@ class GatherCPUKernel : public InnerKernel {
   int Prepare() override;
   int ReSize() override;
   int Run() override;
-  int DoGather(int task_id);
+  int DoGather(int task_id) const;

  private:
   int *indices_data_ = nullptr;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/glu_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/glu_fp32.cc
index 001da1fcd69..dc31aef6974 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/glu_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/glu_fp32.cc
@@ -35,7 +35,7 @@ int GluCPUKernel::MallocTmpBuffer() {
   FreeTmpBuffer();
   auto in_tensor = in_tensors_.front();
   for (int i = 0; i < kSplitNum; i++) {
-    split_ptr_[i] = reinterpret_cast<int8_t *>(ms_context_->allocator->Malloc(in_tensor->Size() / kSplitNum));
+    split_ptr_[i] = ms_context_->allocator->Malloc(in_tensor->Size() / kSplitNum);
     if (split_ptr_[i] == nullptr) {
       MS_LOG(ERROR) << "GluCPUKernel malloc split ptr failed.";
       return RET_ERROR;
     }
@@ -96,8 +96,7 @@ int GluCPUKernel::ReSize() {
   return RET_OK;
 }

-int GluCPUKernel::Split(int task_id) {
-  input_ptr_ = in_tensors_.front()->data();
+int GluCPUKernel::Split(int task_id) const {
   MS_CHECK_INT_MUL_NOT_OVERFLOW(task_id, thread_n_stride_, RET_ERROR);
   int num_unit_thread
= MSMIN(thread_n_stride_, num_unit_ - task_id * thread_n_stride_); if (num_unit_thread <= 0) { @@ -105,8 +104,8 @@ int GluCPUKernel::Split(int task_id) { } int thread_offset = task_id * thread_n_stride_; auto ret = - DoSplit(input_ptr_, reinterpret_cast(split_ptr_.data()), in_tensors_.front()->shape().data(), - thread_offset, num_unit_thread, &split_param_, lite::DataTypeSize(in_tensors_.front()->data_type())); + DoSplit(input_ptr_, const_cast(split_ptr_.data()), in_tensors_.front()->shape().data(), thread_offset, + num_unit_thread, &split_param_, lite::DataTypeSize(in_tensors_.front()->data_type())); if (ret != RET_OK) { MS_LOG(ERROR) << "Split error task_id[" << task_id << "] error_code[" << ret << "]"; return RET_ERROR; @@ -114,7 +113,7 @@ int GluCPUKernel::Split(int task_id) { return RET_OK; } -int GluCPUKernel::Sigmoid(int task_id) { +int GluCPUKernel::Sigmoid(int task_id) const { auto input_addr = reinterpret_cast(split_ptr_.at(1)); auto output_addr = reinterpret_cast(sigmoid_ptr_); auto length = in_tensors_.at(0)->ElementsNum() / kGluBranchNum; @@ -128,7 +127,7 @@ int GluCPUKernel::Sigmoid(int task_id) { return ::Sigmoid(input_addr + stride * task_id, count, output_addr + stride * task_id); } -int GluCPUKernel::Mul(int task_id) { +int GluCPUKernel::Mul(int task_id) const { auto input_addr0 = reinterpret_cast(split_ptr_.at(0)); auto input_addr1 = reinterpret_cast(sigmoid_ptr_); auto output_addr = reinterpret_cast(out_tensors_.at(0)->data()); @@ -144,22 +143,24 @@ int GluCPUKernel::Mul(int task_id) { return ElementMul(input_addr0 + offset, input_addr1 + offset, output_addr + offset, count); } -static int SplitRun(void *cdata, int task_id, float, float) { - auto g_kernel = reinterpret_cast(cdata); +static int SplitRun(const void *cdata, int task_id, float, float) { + auto g_kernel = reinterpret_cast(cdata); return g_kernel->Split(task_id); } -static int SigmoidRun(void *cdata, int task_id, float, float) { - auto activation_kernel = reinterpret_cast(cdata); +static int SigmoidRun(const void *cdata, int task_id, float, float) { + auto activation_kernel = reinterpret_cast(cdata); return activation_kernel->Sigmoid(task_id); } -static int MulRun(void *cdata, int task_id, float, float) { - auto g_kernel = reinterpret_cast(cdata); +static int MulRun(const void *cdata, int task_id, float, float) { + auto g_kernel = reinterpret_cast(cdata); return g_kernel->Mul(task_id); } int GluCPUKernel::Run() { + input_ptr_ = in_tensors_.front()->data(); + auto ret = MallocTmpBuffer(); if (ret != RET_OK) { MS_LOG(ERROR) << "Malloc tmp buffer failed"; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/glu_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/glu_fp32.h index 5f2a1dadd44..3bc9142aafd 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/glu_fp32.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/glu_fp32.h @@ -43,9 +43,11 @@ class GluCPUKernel : public InnerKernel { int Prepare() override; int ReSize() override; int Run() override; - int Split(int task_id); - int Sigmoid(int task_id); - int Mul(int task_id); + int Split(int task_id) const; + int Sigmoid(int task_id) const; + int Mul(int task_id) const; + + private: void FreeTmpBuffer(); int MallocTmpBuffer(); @@ -54,8 +56,8 @@ class GluCPUKernel : public InnerKernel { GluParameter *glu_param_ = nullptr; void *input_ptr_ = nullptr; int8_t *sigmoid_ptr_ = nullptr; - std::vector split_ptr_; - int split_sizes_[kSplitNum]; + std::vector split_ptr_; + int split_sizes_[kSplitNum] = {0}; int thread_n_stride_ = 0; int usable_thread_num_ = 0; int 
num_unit_ = 0; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/gru_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/gru_fp32.h index a837f6cfae7..8c225c7bf98 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/gru_fp32.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/gru_fp32.h @@ -50,7 +50,7 @@ class GruCPUKernel : public InnerKernel { const int weight_r_index = 2; const int bias_index = 3; - float *buffer_[4]; + float *buffer_[4] = {nullptr}; const int gate_num = 3; const int packed_input_index = 0; const int input_gate_index = 1; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/instance_norm_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/instance_norm_fp32.cc index a97b68e8d96..c5f7882a052 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/instance_norm_fp32.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/instance_norm_fp32.cc @@ -45,7 +45,7 @@ int InstanceNormCPUKernel::ReSize() { return RET_OK; } -int InstanceNormCPUKernel::DoInstanceNorm(int task_id) { +int InstanceNormCPUKernel::DoInstanceNorm(int task_id) const { int ret = 0; if (in_tensors_[0]->format() == NC4HW4) { // arm64 x86-avx x86-sse x86 #ifdef ENABLE_AVX @@ -63,8 +63,8 @@ int InstanceNormCPUKernel::DoInstanceNorm(int task_id) { return RET_OK; } -int InstanceNormRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) { - auto kernel = reinterpret_cast(cdata); +int InstanceNormRun(const void *cdata, int task_id, float, float) { + auto kernel = reinterpret_cast(cdata); auto ret = kernel->DoInstanceNorm(task_id); if (ret != RET_OK) { MS_LOG(ERROR) << "InstanceNormRun error task_id[" << task_id << "] error_code[" << ret << "]"; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/instance_norm_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/instance_norm_fp32.h index 87c9499f990..457e10bc5e8 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/instance_norm_fp32.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/instance_norm_fp32.h @@ -35,7 +35,9 @@ class InstanceNormCPUKernel : public InnerKernel { int Prepare() override; int ReSize() override; int Run() override; - int DoInstanceNorm(int task_id); + int DoInstanceNorm(int task_id) const; + + private: void FreeTmpBuffer() { if (tmp_src_data_ != nullptr) { ms_context_->allocator->Free(tmp_src_data_); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/l2_norm_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/l2_norm_fp32.cc index 15f3e7782a4..0daec8e35a6 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/l2_norm_fp32.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/l2_norm_fp32.cc @@ -89,7 +89,7 @@ int L2NormCPUKernel::ReSize() { return RET_OK; } -int L2NormCPUKernel::CalcSquareSum(int task_id) { +int L2NormCPUKernel::CalcSquareSum(int task_id) const { int unit = UP_DIV(l2_norm_param_->data_num_, op_parameter_->thread_num_); if (INT_MUL_OVERFLOW(task_id, unit)) { MS_LOG(ERROR) << "int mul overflow."; @@ -100,7 +100,7 @@ int L2NormCPUKernel::CalcSquareSum(int task_id) { return CalcThreadSquareSum(input_ptr_, tmp_sum_ + task_id, begin, end); } -int L2NormCPUKernel::DivSqrtSum(int task_id) { +int L2NormCPUKernel::DivSqrtSum(int task_id) const { int unit = UP_DIV(l2_norm_param_->data_num_, op_parameter_->thread_num_); if (INT_MUL_OVERFLOW(task_id, unit)) { MS_LOG(ERROR) << "int mul overflow."; @@ -111,7 +111,7 @@ int L2NormCPUKernel::DivSqrtSum(int task_id) { return ThreadDivSqrtSum(input_ptr_, output_ptr_, l2_norm_param_, sqrt_sum_, begin, end); } -int L2NormCPUKernel::CalcL2NormTrailingAxis(int task_id) { 
+int L2NormCPUKernel::CalcL2NormTrailingAxis(int task_id) const { auto input = in_tensors_.at(0); if (input->shape().back() == 0) { MS_LOG(ERROR) << "input->shape().back() is 0"; @@ -128,8 +128,8 @@ int L2NormCPUKernel::CalcL2NormTrailingAxis(int task_id) { return ThreadTrailingAxis(input_ptr_, output_ptr_, l2_norm_param_, begin, end); } -int SquareSumRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) { - auto kernel = reinterpret_cast(cdata); +int SquareSumRun(const void *cdata, int task_id, float, float) { + auto kernel = reinterpret_cast(cdata); auto ret = kernel->CalcSquareSum(task_id); if (ret != RET_OK) { MS_LOG(ERROR) << "L2Norm SquareSumRun error task_id[" << task_id << "] error_code[" << ret << "]"; @@ -138,9 +138,9 @@ int SquareSumRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) { return RET_OK; } -int L2NormRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) { +int L2NormRun(const void *cdata, int task_id, float, float) { CHECK_NULL_RETURN(cdata); - auto kernel = reinterpret_cast(cdata); + auto kernel = reinterpret_cast(cdata); auto ret = kernel->DivSqrtSum(task_id); if (ret != RET_OK) { MS_LOG(ERROR) << "L2Norm L2NormRun error task_id[" << task_id << "] error_code[" << ret << "]"; @@ -149,9 +149,9 @@ int L2NormRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) { return RET_OK; } -int L2NormTrailingAxisRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) { +int L2NormTrailingAxisRun(const void *cdata, int task_id, float, float) { CHECK_NULL_RETURN(cdata); - auto kernel = reinterpret_cast(cdata); + auto kernel = reinterpret_cast(cdata); auto ret = kernel->CalcL2NormTrailingAxis(task_id); if (ret != RET_OK) { MS_LOG(ERROR) << "L2Norm TrailingAxisRun error task_id[" << task_id << "] error_code[" << ret << "]"; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/l2_norm_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/l2_norm_fp32.h index 54b523fe0c5..e78f36d8b81 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/l2_norm_fp32.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/l2_norm_fp32.h @@ -36,9 +36,9 @@ class L2NormCPUKernel : public InnerKernel { } ~L2NormCPUKernel() { FreeTmpBuffer(); } - int CalcSquareSum(int task_id); - int DivSqrtSum(int task_id); - int CalcL2NormTrailingAxis(int task_id); + int CalcSquareSum(int task_id) const; + int DivSqrtSum(int task_id) const; + int CalcL2NormTrailingAxis(int task_id) const; int Prepare() override; int ReSize() override; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/layer_norm_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/layer_norm_fp32.cc index a29bfabc279..042f529bc95 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/layer_norm_fp32.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/layer_norm_fp32.cc @@ -65,7 +65,7 @@ int LayerNormCPUKernel::ReSize() { return RET_OK; } -int LayerNormCPUKernel::DoLayerNorm(int thread_id) { +int LayerNormCPUKernel::DoLayerNorm(int thread_id) const { auto ret = LayerNorm(src_data_, gamma_data_, beta_data_, dst_data_, mean_data_, var_data_, param_, thread_id); if (ret != RET_OK) { MS_LOG(ERROR) << "DoLayerNorm error error_code[" << ret << "]"; @@ -74,8 +74,8 @@ int LayerNormCPUKernel::DoLayerNorm(int thread_id) { return RET_OK; } -int LayerNormRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) { - auto kernel = reinterpret_cast(cdata); +int LayerNormRun(const void *cdata, int task_id, float, float) { + auto kernel = reinterpret_cast(cdata); CHECK_NULL_RETURN(kernel); auto ret = 
kernel->DoLayerNorm(task_id); if (ret != RET_OK) { diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/layer_norm_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/layer_norm_fp32.h index 16ab54ca1dd..5c84cdb775f 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/layer_norm_fp32.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/layer_norm_fp32.h @@ -35,7 +35,7 @@ class LayerNormCPUKernel : public InnerKernel { int Prepare() override; int ReSize() override; int Run() override; - int DoLayerNorm(int thread_id); + int DoLayerNorm(int thread_id) const; private: LayerNormParameter *param_ = nullptr; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/local_response_norm_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/local_response_norm_fp32.cc index 1aa4766ce10..c9fe6d5e712 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/local_response_norm_fp32.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/local_response_norm_fp32.cc @@ -35,7 +35,7 @@ int LocalResponseNormCPUKernel::Prepare() { int LocalResponseNormCPUKernel::ReSize() { return RET_OK; } -int LocalResponseNormCPUKernel::DoLocalResponseNorm(int task_id) { +int LocalResponseNormCPUKernel::DoLocalResponseNorm(int task_id) const { auto input_tensor = in_tensors_.front(); auto out_tensor = out_tensors_.front(); auto input_ptr = reinterpret_cast(input_tensor->MutableData()); @@ -67,8 +67,8 @@ int LocalResponseNormCPUKernel::DoLocalResponseNorm(int task_id) { return RET_OK; } -int LocalResponseNormRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) { - auto lrn = reinterpret_cast(cdata); +int LocalResponseNormRun(const void *cdata, int task_id, float, float) { + auto lrn = reinterpret_cast(cdata); auto error_code = lrn->DoLocalResponseNorm(task_id); if (error_code != RET_OK) { MS_LOG(ERROR) << "LocalResponseNormRun error task_id[" << task_id << "] error_code[" << error_code << "]"; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/local_response_norm_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/local_response_norm_fp32.h index 44642d6db0b..a4199e0bc70 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/local_response_norm_fp32.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/local_response_norm_fp32.h @@ -32,7 +32,7 @@ class LocalResponseNormCPUKernel : public InnerKernel { int Prepare() override; int ReSize() override; int Run() override; - int DoLocalResponseNorm(int task_id); + int DoLocalResponseNorm(int task_id) const; private: int thread_count_; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/log_softmax_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/log_softmax_fp32.cc index 3da2a50ed05..7745bfa7b7d 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/log_softmax_fp32.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/log_softmax_fp32.cc @@ -79,7 +79,7 @@ int LogSoftmaxCPUKernel::ReSize() { return RET_OK; } -int LogSoftmaxCPUKernel::DoLogSoftmaxLastAxis(int task_id) { +int LogSoftmaxCPUKernel::DoLogSoftmaxLastAxis(int task_id) const { MS_CHECK_FALSE(op_parameter_->thread_num_ == 0, RET_ERROR); int unit = UP_DIV(out_plane_size_, op_parameter_->thread_num_); int begin = task_id * unit; @@ -94,8 +94,8 @@ int LogSoftmaxCPUKernel::DoLogSoftmaxLastAxis(int task_id) { return RET_OK; } -int LogSoftmaxLastAxisRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) { - auto kernel = reinterpret_cast(cdata); +int LogSoftmaxLastAxisRun(const void *cdata, int task_id, float, float) { + auto kernel = reinterpret_cast(cdata); CHECK_NULL_RETURN(kernel); auto ret = 
kernel->DoLogSoftmaxLastAxis(task_id); if (ret != RET_OK) { diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/log_softmax_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/log_softmax_fp32.h index ddc08645c10..7f946712f1a 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/log_softmax_fp32.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/log_softmax_fp32.h @@ -32,7 +32,7 @@ class LogSoftmaxCPUKernel : public SoftmaxBaseCPUKernel { int Prepare() override; int ReSize() override; int Run() override; - int DoLogSoftmaxLastAxis(int task_id); + int DoLogSoftmaxLastAxis(int task_id) const; private: float *tmp_data_ = nullptr; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/lstm_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/lstm_fp32.cc index 815db106867..08c63bb931a 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/lstm_fp32.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/lstm_fp32.cc @@ -322,7 +322,7 @@ int LstmCPUKernel::MallocRunBuffer() { return RET_OK; } -void LstmCPUKernel::InputWeightMatMul(int task_id) { +void LstmCPUKernel::InputWeightMatMul(int task_id) const { int current_start_oc = task_id * input_thread_stride_ * col_tile_; int current_rest_oc = 0; current_rest_oc = lstm_param_->hidden_size_ - current_start_oc; @@ -339,8 +339,8 @@ void LstmCPUKernel::InputWeightMatMul(int task_id) { cur_oc, lstm_param_->hidden_size_, OutType_Nhwc); } -int LstmInputMulWeightRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) { - auto kernel = reinterpret_cast(cdata); +int LstmInputMulWeightRun(const void *cdata, int task_id, float, float) { + auto kernel = reinterpret_cast(cdata); CHECK_NULL_RETURN(kernel); kernel->InputWeightMatMul(task_id); return RET_OK; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/lstm_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/lstm_fp32.h index b2db8e3ee20..04ca1325103 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/lstm_fp32.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/lstm_fp32.h @@ -36,7 +36,7 @@ class LstmCPUKernel : public InnerKernel { int ReSize() override; int Run() override; - void InputWeightMatMul(int task_id); + void InputWeightMatMul(int task_id) const; private: void FreeTmpBuffer(); @@ -50,9 +50,9 @@ class LstmCPUKernel : public InnerKernel { const float *state_bias, float *hidden_state, float *cell_state, bool is_backward); int InnerExecute(float *output, const float *input, float *hidden_state, float *cell_state); void RecordStates(const float *cell_state, int step); - const float *weight_loop_; - const float *bias_loop_; - float *gate_loop_; + const float *weight_loop_ = nullptr; + const float *bias_loop_ = nullptr; + float *gate_loop_ = nullptr; int input_thread_count_ = 0; int input_thread_stride_ = 0; @@ -64,7 +64,7 @@ class LstmCPUKernel : public InnerKernel { const int weight_h_index = 2; const int bias_index = 3; - float *buffer_[7]; + float *buffer_[7] = {nullptr}; const int gate_num = 4; const int packed_input_index = 0; const int input_gate_index = 1; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.cc index 8c933d642e1..72c2fc6b98d 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.cc @@ -21,9 +21,9 @@ using mindspore::lite::RET_NULL_PTR; namespace mindspore::kernel { -int MatmulBaseFloatRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) { +int MatmulBaseFloatRun(const void 
*cdata, int task_id, float, float) { CHECK_NULL_RETURN(cdata); - auto op = reinterpret_cast(cdata); + auto op = reinterpret_cast(cdata); auto error_code = op->FloatRun(task_id); if (error_code != RET_OK) { MS_LOG(ERROR) << "MatmulFp32Run error task_id[" << task_id << "] error_code[" << error_code << "]"; @@ -126,32 +126,44 @@ int MatmulFp32BaseCPUKernel::CalBroadCastBiasDataElements() { } int MatmulFp32BaseCPUKernel::InitBiasData() { - if (in_tensors_.size() == 3) { - auto bias_tensor = in_tensors_[2]; - size_t max_bias_data = UP_ROUND(bias_tensor->ElementsNum(), col_tile_); - // malloc addr need to aligned to 32 bytes + if (in_tensors_.size() != FOURTH_INPUT) { + return RET_OK; + } + auto bias_tensor = in_tensors_[THIRD_INPUT]; + if (bias_tensor == nullptr) { + MS_LOG(ERROR) << "bias_tensor invalid"; + return RET_ERROR; + } + + if (bias_tensor->ElementsNum() == 1) { + // broadcast bias data + size_t max_bias_data = CalBroadCastBiasDataElements(); bias_ptr_ = reinterpret_cast(malloc(max_bias_data * static_cast(sizeof(float)))); if (bias_ptr_ == nullptr) { MS_LOG(ERROR) << "malloc bias_ptr_ failed"; return RET_ERROR; } - // whether to broadcast bias data - if (bias_tensor->ElementsNum() == 1) { - max_bias_data = CalBroadCastBiasDataElements(); - float broadcast_data = (reinterpret_cast(bias_tensor->data()))[0]; - // broadcast bias data - for (size_t i = 0; i < max_bias_data; ++i) { - bias_ptr_[i] = broadcast_data; - } - } else { - memset(bias_ptr_, 0, max_bias_data * static_cast(sizeof(float))); - memcpy(bias_ptr_, bias_tensor->data(), bias_tensor->ElementsNum() * static_cast(sizeof(float))); + float broadcast_data = (reinterpret_cast(bias_tensor->data()))[0]; + // broadcast bias data + for (size_t i = 0; i < max_bias_data; ++i) { + bias_ptr_[i] = broadcast_data; } + return RET_OK; } + + size_t max_bias_data = UP_ROUND(bias_tensor->ElementsNum(), col_tile_); + // malloc addr need to aligned to 32 bytes + bias_ptr_ = reinterpret_cast(malloc(max_bias_data * static_cast(sizeof(float)))); + if (bias_ptr_ == nullptr) { + MS_LOG(ERROR) << "malloc bias_ptr_ failed"; + return RET_ERROR; + } + memset(bias_ptr_, 0, max_bias_data * static_cast(sizeof(float))); + memcpy(bias_ptr_, bias_tensor->data(), bias_tensor->ElementsNum() * static_cast(sizeof(float))); return RET_OK; } -int MatmulFp32BaseCPUKernel::InitMatrixA(const float *src_ptr) { +int MatmulFp32BaseCPUKernel::InitMatrixA(const float *src_ptr) const { CHECK_NULL_RETURN(src_ptr); #ifdef ENABLE_ARM64 if (vec_matmul_) { @@ -175,7 +187,7 @@ int MatmulFp32BaseCPUKernel::InitMatrixA(const float *src_ptr) { return RET_OK; } -int MatmulFp32BaseCPUKernel::InitMatrixB(const float *src_ptr) { +int MatmulFp32BaseCPUKernel::InitMatrixB(const float *src_ptr) const { CHECK_NULL_RETURN(src_ptr); for (int i = 0; i < params_->batch; i++) { const float *src = src_ptr + i * params_->deep_ * params_->col_; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.h b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.h index cf26c125171..d3b46057bdd 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.h @@ -47,8 +47,8 @@ class MatmulFp32BaseCPUKernel : public InnerKernel { protected: int InitBufferA(); int InitBufferB(); - int InitMatrixA(const float *src_ptr); - int InitMatrixB(const float *src_ptr); + int InitMatrixA(const float *src_ptr) const; + int InitMatrixB(const float *src_ptr) const; void FreeBiasBuf(); int InitBiasData(); void InitParameter(); 
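
The matmul hunks above repeat the convention applied across every kernel in this patch: the ParallelLaunch worker takes `const void *cdata`, the unused `lhs_scale`/`rhs_scale` parameters become anonymous, and the per-task compute method is marked `const`. A minimal sketch of that convention follows; `FooCPUKernel`, `DoCompute`, and `FooRun` are placeholder names standing in for the real kernels (Gather, L2Norm, Pooling, Power, PRelu, ...), while `RET_OK` and `MS_LOG` come from the surrounding MindSpore Lite code base.

// Sketch only: placeholder kernel showing the const-correct worker shape.
class FooCPUKernel {
 public:
  // const: a worker thread may read but never mutate shared kernel state.
  int DoCompute(int task_id) const;
};

// Same signature ParallelLaunch expects after this patch; the two unused
// float scale parameters are left anonymous instead of named.
int FooRun(const void *cdata, int task_id, float, float) {
  auto kernel = reinterpret_cast<const FooCPUKernel *>(cdata);
  auto ret = kernel->DoCompute(task_id);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "FooRun error task_id[" << task_id << "] error_code[" << ret << "]";
    return ret;
  }
  return RET_OK;
}
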
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/non_max_suppression_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/non_max_suppression_fp32.cc index 3aaa877d232..55dad3a699b 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/non_max_suppression_fp32.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/non_max_suppression_fp32.cc @@ -110,7 +110,7 @@ void ExpandDims(std::vector *shape, size_t size) { } int NonMaxSuppressionCPUKernel::Run_Selecte(bool simple_out, int box_num, int batch_num, int class_num, - float *scores_data, float *box_data) { + const float *scores_data, const float *box_data) { std::vector selected_box_per_class; selected_box_per_class.reserve(std::min(static_cast(box_num), max_output_per_class_)); std::vector selected_index; @@ -119,8 +119,8 @@ int NonMaxSuppressionCPUKernel::Run_Selecte(bool simple_out, int box_num, int ba int batch_offset = i * class_num * box_num; for (auto j = 0; j < class_num; ++j) { // per batch per class filter - float *per_class_scores = scores_data + batch_offset + j * box_num; - float *box = box_data + i * box_num * kBoxPointNum; + const float *per_class_scores = scores_data + batch_offset + j * box_num; + const float *box = box_data + i * box_num * kBoxPointNum; std::vector above_score_candidates; above_score_candidates.reserve(box_num); for (auto k = 0; k < box_num; ++k) { diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/non_max_suppression_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/non_max_suppression_fp32.h index c8c8bf0eab1..8bc608fee76 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/non_max_suppression_fp32.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/non_max_suppression_fp32.h @@ -41,7 +41,8 @@ class NonMaxSuppressionCPUKernel : public InnerKernel { private: int GetParams(); - int Run_Selecte(bool simple_out, int box_num, int batch_num, int class_num, float *scores_data, float *box_data); + int Run_Selecte(bool simple_out, int box_num, int batch_num, int class_num, const float *scores_data, + const float *box_data); private: int center_point_box_ = 0; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/pad_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/pad_fp32.cc index 87ef9dd6c14..3bfd59df55a 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/pad_fp32.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/pad_fp32.cc @@ -206,8 +206,8 @@ int PadCPUKernel::ExtendPaddings(int *paddings, int length, const int *ori_paddi return RET_OK; } -int PadImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale) { - auto padKernel = reinterpret_cast(cdata); +int PadImpl(const void *cdata, int task_id, float, float) { + auto padKernel = reinterpret_cast(cdata); int error_code = padKernel->RunImpl(task_id); if (error_code != NNACL_OK) { MS_LOG(ERROR) << "Pad Run error task_id[" << task_id << "] error_code[" << error_code << "]"; @@ -216,7 +216,7 @@ int PadImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale) { return RET_OK; } -int PadCPUKernel::RunImpl(int task_id) { +int PadCPUKernel::RunImpl(int task_id) const { auto input = in_tensors_.at(0); auto output = out_tensors_.at(0); auto input_data = reinterpret_cast(input->data()); @@ -228,8 +228,8 @@ int PadCPUKernel::RunImpl(int task_id) { return RET_OK; } -int MirrorPadImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale) { - auto padKernel = reinterpret_cast(cdata); +int MirrorPadImpl(const void *cdata, int task_id, float, float) { + auto padKernel = reinterpret_cast(cdata); int error_code = 
padKernel->RunMirrorPadImpl(task_id); if (error_code != NNACL_OK) { MS_LOG(ERROR) << "Pad Run error task_id[" << task_id << "] error_code[" << error_code << "]"; @@ -238,7 +238,27 @@ int MirrorPadImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale) { return RET_OK; } -int PadCPUKernel::RunMirrorPadImpl(int task_id) { +void PadCPUKernel::RunMirrorPadImplFast(const MirrorPadBlock &block, const float *input_data, + float *output_data) const { + for (int a = 0; a < block.size_[FIRST_INPUT]; a++) { + int out_a_index = block.out_offset_ + a * block.out_stride_[FIRST_INPUT]; + for (int b = 0; b < block.size_[SECOND_INPUT]; b++) { + int out_b_index = out_a_index + b * block.out_stride_[SECOND_INPUT]; + for (int c = 0; c < block.size_[THIRD_INPUT]; ++c) { + int out_c_index = out_b_index + c * block.out_stride_[THIRD_INPUT]; + for (int d = 0; d < block.size_[FOURTH_INPUT]; ++d) { + int out_d_index = out_c_index + d * block.out_stride_[FOURTH_INPUT]; + for (int e = 0; e < block.size_[FIFTH_INPUT]; ++e) { + int output_index = out_d_index + e * block.out_stride_[FIFTH_INPUT]; + MirrorPad(input_data, output_data, in_, pad_param_, output_index, output_index + block.size_[SIXTH_INPUT]); + } + } + } + } + } +} + +int PadCPUKernel::RunMirrorPadImpl(int task_id) const { auto input = in_tensors_.at(0); auto output = out_tensors_.at(0); auto input_data = reinterpret_cast(input->data()); @@ -253,23 +273,7 @@ int PadCPUKernel::RunMirrorPadImpl(int task_id) { /* calculate region part */ for (size_t i = task_id; i < mirror_pad_block_.size(); i += static_cast(op_parameter_->thread_num_)) { auto block = mirror_pad_block_[i]; - - for (int a = 0; a < block.size_[0]; a++) { - int out_a_index = block.out_offset_ + a * block.out_stride_[0]; - for (int b = 0; b < block.size_[1]; b++) { - int out_b_index = out_a_index + b * block.out_stride_[1]; - for (int c = 0; c < block.size_[2]; ++c) { - int out_c_index = out_b_index + c * block.out_stride_[2]; - for (int d = 0; d < block.size_[3]; ++d) { - int out_d_index = out_c_index + d * block.out_stride_[3]; - for (int e = 0; e < block.size_[4]; ++e) { - int output_index = out_d_index + e * block.out_stride_[4]; - MirrorPad(input_data, output_data, in_, pad_param_, output_index, output_index + block.size_[5]); - } - } - } - } - } + RunMirrorPadImplFast(block, input_data, output_data); } return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/pad_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/pad_fp32.h index 89a4bf7ee82..112693486ca 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/pad_fp32.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/pad_fp32.h @@ -41,8 +41,8 @@ class PadCPUKernel : public InnerKernel { int Prepare() override; int ReSize() override; int Run() override; - virtual int RunImpl(int task_id); - virtual int RunMirrorPadImpl(int task_id); + virtual int RunImpl(int task_id) const; + virtual int RunMirrorPadImpl(int task_id) const; private: int CheckPaddings(const int *paddings, int length, const int *input_shape, int mode); @@ -50,6 +50,7 @@ class PadCPUKernel : public InnerKernel { int ExtendShape(int *shape, int length, const int *ori_shape, int rank) const; int ExtendPaddings(int *paddings, int length, const int *ori_paddings, int ori_length) const; void InitMirrorPadBlock(); + void RunMirrorPadImplFast(const MirrorPadBlock &block, const float *input_data, float *output_data) const; protected: int HandleMirrorPad(); @@ -60,8 +61,8 @@ class PadCPUKernel : public InnerKernel { std::vector mirror_pad_block_; }; -int 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/pooling_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/pooling_fp32.cc
index 9c60e6258da..42fd203c097 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/pooling_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/pooling_fp32.cc
@@ -50,7 +50,7 @@ int PoolingCPUKernel::ReSize() {
   return RET_OK;
 }
 
-int PoolingCPUKernel::RunImpl(int task_id) {
+int PoolingCPUKernel::RunImpl(int task_id) const {
   auto input_ptr = reinterpret_cast<float *>(in_tensors_.at(kInputIndex)->MutableData());
   CHECK_NULL_RETURN(input_ptr);
   auto output_ptr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->MutableData());
@@ -76,8 +76,8 @@ int PoolingCPUKernel::RunImpl(int task_id) {
   return RET_OK;
 }
 
-int PoolingImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
-  auto pooling = reinterpret_cast<PoolingCPUKernel *>(cdata);
+int PoolingImpl(const void *cdata, int task_id, float, float) {
+  auto pooling = reinterpret_cast<const PoolingCPUKernel *>(cdata);
   auto error_code = pooling->RunImpl(task_id);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "Pooling Run error task_id[" << task_id << "] error_code[" << error_code << "]";
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/pooling_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/pooling_fp32.h
index 84ced7f8164..b6ae4b0b12c 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/pooling_fp32.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/pooling_fp32.h
@@ -32,7 +32,7 @@ class PoolingCPUKernel : public PoolingBaseCPUKernel {
   int Prepare() override;
   int ReSize() override;
   int Run() override;
-  int RunImpl(int task_id);
+  int RunImpl(int task_id) const;
 
  private:
 };
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/power_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/power_fp32.cc
index dd94c89cc30..1d1f49c69a7 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/power_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/power_fp32.cc
@@ -33,9 +33,8 @@ int PowerCPUKernel::Prepare() {
 
 int PowerCPUKernel::ReSize() { return RET_OK; }
 
-int PowerImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
-  CHECK_NULL_RETURN(cdata);
-  auto kernel = reinterpret_cast<PowerCPUKernel *>(cdata);
+int PowerImpl(const void *cdata, int task_id, float, float) {
+  auto kernel = reinterpret_cast<const PowerCPUKernel *>(cdata);
   CHECK_NULL_RETURN(kernel);
   auto ret = kernel->RunImpl(task_id);
   if (ret != RET_OK) {
@@ -54,7 +53,7 @@ int PowerCPUKernel::Run() {
   return RET_OK;
 }
 
-int PowerCPUKernel::RunImpl(int task_id) {
+int PowerCPUKernel::RunImpl(int task_id) const {
   auto x_addr = reinterpret_cast<float *>(in_tensors_.at(0)->MutableData());
   CHECK_NULL_RETURN(x_addr);
   auto output_addr = reinterpret_cast<float *>(out_tensors_.at(0)->MutableData());
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/power_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/power_fp32.h
index 65bf23586e2..195de7edc6b 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/power_fp32.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/power_fp32.h
@@ -36,7 +36,7 @@ class PowerCPUKernel : public InnerKernel {
   int Prepare() override;
   int ReSize() override;
   int Run() override;
-  int RunImpl(int task_id);
+  int RunImpl(int task_id) const;
 
  private:
   int thread_count_;
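The pooling and power changes follow the same recipe as pad: the task entry point registered with the ParallelLaunch-style runner takes const void *cdata, casts it to a pointer-to-const kernel, and the per-task RunImpl is const, so one kernel object can serve every worker thread without synchronization. A self-contained sketch of that shape (DemoKernel and DemoImpl are illustrative names, not MindSpore APIs):

    #include <iostream>

    class DemoKernel {
     public:
      // const-qualified per-task body: no kernel state is mutated, which is
      // what makes concurrent calls from several threads safe.
      int RunImpl(int task_id) const {
        std::cout << "task " << task_id << "\n";
        return 0;
      }
    };

    // Same signature shape as the callbacks in this patch: const user data and
    // a task id; the two float scale parameters are unused here, hence unnamed.
    int DemoImpl(const void *cdata, int task_id, float, float) {
      auto kernel = reinterpret_cast<const DemoKernel *>(cdata);
      return kernel->RunImpl(task_id);
    }

    int main() {
      DemoKernel k;
      for (int t = 0; t < 4; ++t) {
        if (DemoImpl(&k, t, 0.0f, 0.0f) != 0) {
          return 1;
        }
      }
      return 0;
    }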
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/prelu_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/prelu_fp32.cc
index ca368593392..63f5d8f8c5d 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/prelu_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/prelu_fp32.cc
@@ -27,8 +27,8 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_PReLUFusion;
 
 namespace mindspore::kernel {
-static int PReluRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
-  auto PRelu = reinterpret_cast<PReluCPUKernel *>(cdata);
+static int PReluRun(const void *cdata, int task_id, float, float) {
+  auto PRelu = reinterpret_cast<const PReluCPUKernel *>(cdata);
   auto ret = PRelu->DoExcute(task_id);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "PReluRun error task_id[" << task_id << "] error_code[" << ret << "]";
@@ -55,7 +55,7 @@ int PReluCPUKernel::Prepare() {
   return ReSize();
 }
 
-int PReluCPUKernel::DoExcute(int task_id) {
+int PReluCPUKernel::DoExcute(int task_id) const {
   int thread_num = param_->op_parameter_.thread_num_;
   if (thread_num == 0) {
     MS_LOG(ERROR) << "thread_num is 0!";
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/prelu_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/prelu_fp32.h
index 04c444a9602..304afa0f9e9 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/prelu_fp32.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/prelu_fp32.h
@@ -34,7 +34,7 @@ class PReluCPUKernel : public InnerKernel {
   int Prepare() override;
   int ReSize() override;
   int Run() override;
-  virtual int DoExcute(int task_id);
+  virtual int DoExcute(int task_id) const;
 
 protected:
   PReluParameter *param_;
diff --git a/mindspore/lite/src/runtime/runtime_pass.cc b/mindspore/lite/src/runtime/runtime_pass.cc
index fb982f53bcc..8f47387d174 100644
--- a/mindspore/lite/src/runtime/runtime_pass.cc
+++ b/mindspore/lite/src/runtime/runtime_pass.cc
@@ -84,7 +84,7 @@ void Nc4hw4PassReplace(std::vector<kernel::LiteKernel *> *kernels, std::vector<T
 }
 
-bool Nc4hw4PassMatch(std::vector<kernel::LiteKernel *> *kernels, size_t index) {
+bool Nc4hw4PassMatch(const std::vector<kernel::LiteKernel *> *kernels, size_t index) {
   kernel::LiteKernel *start_kernel = kernels->at(index);
   if (IsContain(Nc4hw4FormatOutOpList, start_kernel->type()) == false) {
     return false;
   }
@@ -179,7 +179,7 @@ void Nc4hw4PassAct(std::vector<kernel::LiteKernel *> *kernels, std::vector<Tenso
   conv_op->out_tensors().front()->set_format(NC4HW4);
   in_op->in_tensors().front()->set_format(NC4HW4);
 }
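Nc4hw4PassMatch only inspects the kernel list, so it now takes the vector through a pointer-to-const; mutation stays confined to the replace step. A toy version of a read-only match over an op list (stand-in types, not the lite runtime's):

    #include <cstddef>
    #include <vector>

    struct DemoOp {
      int type;
    };

    // Read-only pattern match: the pointer-to-const parameter documents that
    // matching never rewires the graph; only a separate replace pass mutates it.
    bool MatchAt(const std::vector<DemoOp *> *kernels, size_t index, int wanted_type) {
      if (index >= kernels->size()) {
        return false;
      }
      return kernels->at(index)->type == wanted_type;
    }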