diff --git a/mindspore/lite/nnacl/fp32/conv.c b/mindspore/lite/nnacl/fp32/conv.c
index 04e3e9115f..9db14f8b33 100644
--- a/mindspore/lite/nnacl/fp32/conv.c
+++ b/mindspore/lite/nnacl/fp32/conv.c
@@ -221,7 +221,6 @@ void ConvFp32(float *input_data, float *packed_input, float *packed_weight, cons
   int ic4 = UP_DIV(in_channel, C4NUM);
   int kernel_plane = kernel_h * kernel_w;
   int unit_size = kernel_plane * ic4 * C4NUM;
-  int packed_input_size = output_tile_count * TILE_NUM * unit_size;
   bool relu = conv_param->act_type_ == ActType_Relu;
   bool relu6 = conv_param->act_type_ == ActType_Relu6;
 
@@ -232,13 +231,14 @@ void ConvFp32(float *input_data, float *packed_input, float *packed_weight, cons
   size_t output_offset = out_channel * sizeof(float);
 
   for (int b = 0; b < in_batch; b++) {
-    int in_batch_offset = b * ic4 * C4NUM * in_h * in_w;
+    int in_batch_offset = b * in_channel * in_h * in_w;
     int out_batch_offset = b * out_channel * out_h * out_w;
-    int gemm_in_batch_offset = b * packed_input_size;
     for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) {
       int start_index = thread_id * TILE_NUM;
       int real_cal_num = (output_count - start_index) < TILE_NUM ? (output_count - start_index) : TILE_NUM;
-      float *gemm_input = packed_input + thread_id * unit_size * TILE_NUM + gemm_in_batch_offset;
+      float *gemm_input = packed_input + task_id * unit_size * TILE_NUM;
+      size_t packed_input_size = unit_size * TILE_NUM * sizeof(float);
+      memset(gemm_input, 0, packed_input_size);
       Im2ColPackUnitFp32(input_data + in_batch_offset, conv_param, gemm_input, real_cal_num, start_index);
 
       int out_offset = thread_id * TILE_NUM * out_channel + out_batch_offset;
@@ -291,7 +291,7 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_
   // step 1 : filter transform (pre-processed offline)
   // step 2 : input transform (online)
   for (int b = 0; b < in_batch; b++) {
-    int in_batch_offset = b * ic4 * C4NUM * conv_param->input_h_ * conv_param->input_w_;
+    int in_batch_offset = b * in_channel * conv_param->input_h_ * conv_param->input_w_;
     int out_batch_offset = b * out_channel * conv_param->output_w_ * conv_param->output_h_;
     for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_num) {
       int out_tile_index = thread_id * tile_num;
@@ -322,144 +322,9 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_
   }
 }
 
-void UnPackWinogradOutput(const float *src, float *dst, int batch, int height, int width, int channel,
-                          int output_unit) {
-  int out_h_block_num = UP_DIV(height, output_unit);
-  int out_w_block_num = UP_DIV(width, output_unit);
-  int c4 = UP_DIV(channel, C4NUM);
-  int c4_block = C4NUM * out_h_block_num * output_unit * out_w_block_num * output_unit;
-  for (int b = 0; b < batch; b++) {
-    int src_batch_offset = b * c4 * c4_block;
-    int dst_batch_offset = b * height * width * channel;
-    for (int h = 0; h < height; h++) {
-      int src_h_offset = src_batch_offset + C4NUM * (h * out_w_block_num * output_unit);
-      int dst_h_offset = dst_batch_offset + h * width * channel;
-      for (int w = 0; w < width; w++) {
-        int src_w_offset = src_h_offset + w * C4NUM;
-        int dst_w_offset = dst_h_offset + w * channel;
-        for (int c = 0; c < c4 - 1; c++) {
-          int src_c4_offset = src_w_offset + c * c4_block;
-          int dst_c4_offset = dst_w_offset + c * C4NUM;
-#ifdef ENABLE_NEON
-          vst1q_f32(dst + dst_c4_offset, vld1q_f32(src + src_c4_offset));
-#else
-          for (int i = 0; i < C4NUM; ++i) {
-            dst[dst_c4_offset + i] = src[src_c4_offset + i];
-          }
-#endif
-        }
-        int c_res = channel - (c4 - 1) * C4NUM;
-        int src_c_res_offset = (c4 - 1) * c4_block;
-        int dst_c_res_offset = (c4 - 1) * C4NUM;
-        for (int c = 0; c < c_res; c++) {
-          int src_c4_res_offset = src_w_offset + src_c_res_offset + c;
-          int dst_c4_res_offset = dst_w_offset + dst_c_res_offset + c;
-          dst[dst_c4_res_offset] = src[src_c4_res_offset];
-        }
-      }
-    }
-  }
-}
-
-void UnPackWinogradReluOutput(const float *src, float *dst, int batch, int height, int width, int channel,
-                              int output_unit) {
-  int out_h_block_num = UP_DIV(height, output_unit);
-  int out_w_block_num = UP_DIV(width, output_unit);
-  int c4 = UP_DIV(channel, C4NUM);
-  int c4_block = C4NUM * out_h_block_num * output_unit * out_w_block_num * output_unit;
-  for (int b = 0; b < batch; b++) {
-    int src_batch_offset = b * c4 * c4_block;
-    int dst_batch_offset = b * height * width * channel;
-    for (int h = 0; h < height; h++) {
-      int src_h_offset = src_batch_offset + C4NUM * (h * out_w_block_num * output_unit);
-      int dst_h_offset = dst_batch_offset + h * width * channel;
-      for (int w = 0; w < width; w++) {
-        int src_w_offset = src_h_offset + w * C4NUM;
-        int dst_w_offset = dst_h_offset + w * channel;
-        for (int c = 0; c < c4 - 1; c++) {
-          int src_c4_offset = src_w_offset + c * c4_block;
-          int dst_c4_offset = dst_w_offset + c * C4NUM;
-#ifdef ENABLE_NEON
-          float32x4_t input_ptr = vld1q_f32(src + src_c4_offset);
-          float32x4_t zero = vdupq_n_f32(0);
-          input_ptr = vmaxq_f32(zero, input_ptr);
-          vst1q_f32(dst + dst_c4_offset, input_ptr);
-#else
-          for (int i = 0; i < C4NUM; ++i) {
-            float input_data = src[src_c4_offset + i];
-            input_data = input_data < 0 ? 0 : input_data;
-            dst[dst_c4_offset + i] = input_data;
-          }
-#endif
-        }
-        int c_res = channel - (c4 - 1) * C4NUM;
-        int src_c_res_offset = (c4 - 1) * c4_block;
-        int dst_c_res_offset = (c4 - 1) * C4NUM;
-        for (int c = 0; c < c_res; c++) {
-          int src_c4_res_offset = src_w_offset + src_c_res_offset + c;
-          int dst_c4_res_offset = dst_w_offset + dst_c_res_offset + c;
-          float input_data = src[src_c4_res_offset];
-          input_data = input_data < 0 ? 0 : input_data;
-          dst[dst_c4_res_offset] = input_data;
-        }
-      }
-    }
-  }
-}
-
-void UnPackWinogradRelu6Output(const float *src, float *dst, int batch, int height, int width, int channel,
-                               int output_unit) {
-  int out_h_block_num = UP_DIV(height, output_unit);
-  int out_w_block_num = UP_DIV(width, output_unit);
-  int c4 = UP_DIV(channel, C4NUM);
-  int c4_block = C4NUM * out_h_block_num * output_unit * out_w_block_num * output_unit;
-  for (int b = 0; b < batch; b++) {
-    int src_batch_offset = b * c4 * c4_block;
-    int dst_batch_offset = b * height * width * channel;
-    for (int h = 0; h < height; h++) {
-      int src_h_offset = src_batch_offset + C4NUM * (h * out_w_block_num * output_unit);
-      int dst_h_offset = dst_batch_offset + h * width * channel;
-      for (int w = 0; w < width; w++) {
-        int src_w_offset = src_h_offset + w * C4NUM;
-        int dst_w_offset = dst_h_offset + w * channel;
-        for (int c = 0; c < c4 - 1; c++) {
-          int src_c4_offset = src_w_offset + c * c4_block;
-          int dst_c4_offset = dst_w_offset + c * C4NUM;
-#ifdef ENABLE_NEON
-          float32x4_t input_ptr = vld1q_f32(src + src_c4_offset);
-          float32x4_t zero = vdupq_n_f32(0);
-          float32x4_t six = vdupq_n_f32(6);
-          input_ptr = vmaxq_f32(zero, input_ptr);
-          input_ptr = vminq_f32(six, input_ptr);
-          vst1q_f32(dst + dst_c4_offset, input_ptr);
-#else
-          for (int i = 0; i < C4NUM; ++i) {
-            float input_data = src[src_c4_offset + i];
-            input_data = input_data < 0 ? 0 : input_data;
-            input_data = input_data > 6 ? 6 : input_data;
-            dst[dst_c4_offset + i] = input_data;
-          }
-#endif
-        }
-        int c_res = channel - (c4 - 1) * C4NUM;
-        int src_c_res_offset = (c4 - 1) * c4_block;
-        int dst_c_res_offset = (c4 - 1) * C4NUM;
-        for (int c = 0; c < c_res; c++) {
-          int src_c4_res_offset = src_w_offset + src_c_res_offset + c;
-          int dst_c4_res_offset = dst_w_offset + dst_c_res_offset + c;
-          float input_data = src[src_c4_res_offset];
-          input_data = input_data < 0 ? 0 : input_data;
-          input_data = input_data > 6 ? 6 : input_data;
-          dst[dst_c4_res_offset] = input_data;
-        }
-      }
-    }
-  }
-}
-
 // fp32 conv3x3
 void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, TmpBufferAddress *buffer_list,
-                 int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func) {
+                 int task_id, ConvParameter *conv_param) {
   int thread_count = conv_param->thread_num_;
   int ic4 = UP_DIV(conv_param->input_channel_, C4NUM);
   int output_channel = conv_param->output_channel_;
@@ -488,7 +353,7 @@ void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_dat
 
   int input_batch = conv_param->input_batch_;
   for (int batch = 0; batch < input_batch; batch++) {
-    int in_batch_offset = batch * ic4 * C4NUM * conv_param->input_h_ * conv_param->input_w_;
+    int in_batch_offset = batch * conv_param->input_channel_ * conv_param->input_h_ * conv_param->input_w_;
     int nc4hw4_buffer_offset = batch * oc4 * C4NUM * conv_param->output_h_ * conv_param->output_w_;
     for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) {
diff --git a/mindspore/lite/nnacl/fp32/conv.h b/mindspore/lite/nnacl/fp32/conv.h
index 9b280ed820..5aeba9fb1a 100644
--- a/mindspore/lite/nnacl/fp32/conv.h
+++ b/mindspore/lite/nnacl/fp32/conv.h
@@ -67,7 +67,7 @@ void UnPackWinogradRelu6Output(const float *src, float *dst, int batch, int heig
 
 // fp32 conv3x3
 void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, TmpBufferAddress *buffer_list,
-                 int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func);
+                 int task_id, ConvParameter *conv_param);
 #ifdef __cplusplus
 }
 #endif
diff --git a/mindspore/lite/nnacl/int8/conv_int8.c b/mindspore/lite/nnacl/int8/conv_int8.c
index 4d2e9aadd5..d0d1c23928 100644
--- a/mindspore/lite/nnacl/int8/conv_int8.c
+++ b/mindspore/lite/nnacl/int8/conv_int8.c
@@ -273,7 +273,6 @@ void ConvInt8(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight, c
   int kernel_plane = kernel_h * kernel_w;
   int plane_block = UP_DIV(kernel_plane, C4NUM);
   int unit_size = plane_block * C4NUM * ic4 * C4NUM;
-  int packed_input_size = output_tile_count * tile_n * unit_size;
   int input_sum_offset;
   if (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL) {
     input_sum_offset = tile_n * oc4 * C4NUM;
@@ -284,12 +283,11 @@ void ConvInt8(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight, c
   for (int b = 0; b < in_batch; b++) {
     int in_batch_offset = b * ic4 * C4NUM * in_h * in_w;
     int out_batch_offset = b * out_channel * out_h * out_w;
-    int gemm_in_batch_offset = b * packed_input_size;
     for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) {
       int start_index = thread_id * tile_n;
       int real_cal_num = (output_count - start_index) < tile_n ? (output_count - start_index) : tile_n;
       int32_t *tmp_input_sum = input_sum + task_id * input_sum_offset;
-      int8_t *gemm_input = packed_input + thread_id * unit_size * tile_n + gemm_in_batch_offset;
+      int8_t *gemm_input = packed_input + task_id * unit_size * tile_n;
       // clear tmp buffer before compute
       memset(gemm_input, (int8_t)input_zp, unit_size * tile_n);
       int out_offset = thread_id * tile_n * out_channel + out_batch_offset;
@@ -336,7 +334,6 @@ void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight
   int ic4 = UP_DIV(in_channel, C4NUM);
   int kernel_plane = kernel_h * kernel_w;
   int unit_size = kernel_plane * ic4 * C4NUM;
-  int packed_input_size = output_tile_count * tile_n * unit_size;
   int input_sum_offset;
   if (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL) {
     input_sum_offset = tile_n * oc4 * C4NUM;
@@ -347,12 +344,11 @@ void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight
   for (int b = 0; b < in_batch; b++) {
     int in_batch_offset = b * ic4 * C4NUM * in_h * in_w;
     int out_batch_offset = b * out_channel * out_h * out_w;
-    int gemm_in_batch_offset = b * packed_input_size;
     for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) {
       int start_index = thread_id * tile_n;
       int real_cal_num = (output_count - start_index) < tile_n ? (output_count - start_index) : tile_n;
       int32_t *tmp_input_sum = input_sum + task_id * input_sum_offset;
-      int8_t *gemm_input = packed_input + thread_id * unit_size * tile_n + gemm_in_batch_offset;
+      int8_t *gemm_input = packed_input + task_id * unit_size * tile_n;
       // clear tmp buffer before compute
       memset(gemm_input, (int8_t)input_zp, unit_size * tile_n);
       int out_offset = thread_id * tile_n * out_channel + out_batch_offset;
diff --git a/mindspore/lite/nnacl/pack.c b/mindspore/lite/nnacl/pack.c
index 83cb2dee3e..5c3aa18499 100644
--- a/mindspore/lite/nnacl/pack.c
+++ b/mindspore/lite/nnacl/pack.c
@@ -297,24 +297,24 @@ void Im2ColPackUnitFp32(const float *input_data, ConvParameter *conv_param, floa
   int in_h = conv_param->input_h_;
   int in_w = conv_param->input_w_;
   int out_w = conv_param->output_w_;
+  int ic4_minus = in_channel / C4NUM;
   int ic4 = UP_DIV(in_channel, C4NUM);
-  memset(packed_input, 0, kernel_h * kernel_w * ic4 * C4NUM * TILE_NUM * sizeof(float));
 
   for (int i = 0; i < real_cal_num; i++) {
     int block_start = block_index + i;
     int input_h = block_start / out_w * stride_h - pad_h;
     int input_w = block_start % out_w * stride_w - pad_w;
-    int input_stride = input_h * in_w * ic4 * C4NUM + input_w * ic4 * C4NUM;
+    int input_stride = (input_h * in_w + input_w) * in_channel;
     int kh_s = MSMAX(0, UP_DIV(-input_h, dilation_h));
     int kh_e = MSMIN(kernel_h, UP_DIV(in_h - input_h, dilation_h));
     int kw_s = MSMAX(0, UP_DIV(-input_w, dilation_w));
     int kw_e = MSMIN(kernel_w, UP_DIV(in_w - input_w, dilation_w));
     for (int j = kh_s; j < kh_e; j++) {
-      int input_y_stride = j * dilation_h * in_w * ic4 * C4NUM + input_stride;
+      int input_y_stride = j * dilation_h * in_w * in_channel + input_stride;
       for (int n = kw_s; n < kw_e; n++) {
-        int input_x_stride = input_y_stride + n * dilation_w * ic4 * C4NUM;
+        int input_x_stride = input_y_stride + n * dilation_w * in_channel;
         int input_plane_offset = (j * kernel_w + n) * C8NUM * C4NUM * ic4 + i * C4NUM;
-        for (int m = 0; m < ic4; m++) {
+        for (int m = 0; m < ic4_minus; m++) {
           int channel_block_stride = input_x_stride + m * C4NUM;
           int channel_block_offset = input_plane_offset + m * C8NUM * C4NUM;
 #ifdef ENABLE_NEON
@@ -325,9 +325,15 @@ void Im2ColPackUnitFp32(const float *input_data, ConvParameter *conv_param, floa
           }
 #endif
         }  // channel_block loop
-      }    // kernel_w loop
-    }      // kernel_h loop
-  }        // tile num loop
+        int ic_res = conv_param->input_channel_ - ic4_minus * C4NUM;
+        for (int l = 0; l < ic_res; ++l) {
+          int channel_block_stride = input_x_stride + ic4_minus * C4NUM + l;
+          int channel_block_offset = input_plane_offset + ic4_minus * C8NUM * C4NUM + l;
+          packed_input[channel_block_offset] = input_data[channel_block_stride];
+        }
+      }  // kernel_w loop
+    }    // kernel_h loop
+  }      // tile num loop
 }
 
 void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real_cal_num, int block_index,
@@ -346,6 +352,7 @@ void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real
   int in_channel = conv_param->input_channel_;
   int in_h = conv_param->input_h_;
   int in_w = conv_param->input_w_;
+  int ic4_minus = in_channel / C4NUM;
   int ic4 = UP_DIV(in_channel, C4NUM);
   int oc4 = UP_DIV(conv_param->output_channel_, C4NUM);
   int out_w = conv_param->output_w_;
@@ -362,19 +369,19 @@ void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real
         input_accumulator += ic4 * C4NUM * conv_param->conv_quant_arg_.input_quant_args_[0].zp_ * kernel_w;
         continue;
       }
-      int input_y_stride = input_y * in_w * ic4 * C4NUM;
+      int input_y_stride = input_y * in_w * in_channel;
       for (int n = 0; n < kernel_w; n++) {
         int input_x = input_w + n * dilation_w;
         if (input_x < 0 || input_x >= in_w) {
           input_accumulator += ic4 * C4NUM * conv_param->conv_quant_arg_.input_quant_args_[0].zp_;
           continue;
         }
-        int input_x_stride = input_y_stride + input_x * ic4 * C4NUM;
+        int input_x_stride = input_y_stride + input_x * in_channel;
         int plane_c4_block = (j * kernel_w + n) / C4NUM;
         int plane_c4_res = (j * kernel_w + n) % C4NUM;
         int input_plane_offset =
           plane_c4_block * tile_num * C4NUM * C4NUM * ic4 + plane_c4_res * C4NUM + input_cal_num_offset;
-        for (int m = 0; m < ic4; m++) {
+        for (int m = 0; m < ic4_minus; m++) {
           int channel_block_stride = input_x_stride + m * C4NUM;
           int channel_block_offset = input_plane_offset + m * tile_num * C4NUM * C4NUM;
           (packed_input + channel_block_offset)[0] = (input_data + channel_block_stride)[0];
@@ -386,8 +393,15 @@ void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real
           input_accumulator += (packed_input + channel_block_offset)[2];
           input_accumulator += (packed_input + channel_block_offset)[3];
         }  // channel_block loop
-      }    // kernel_w loop
-    }      // kernel_h loop
+        int ic_res = conv_param->input_channel_ - ic4_minus * C4NUM;
+        for (int l = 0; l < ic_res; ++l) {
+          int channel_block_stride = input_x_stride + ic4_minus * C4NUM + l;
+          int channel_block_offset = input_plane_offset + ic4_minus * tile_num * C4NUM + l;
+          packed_input[channel_block_offset] = input_data[channel_block_stride];
+          input_accumulator += (packed_input + channel_block_offset)[0];
+        }
+      }  // kernel_w loop
+    }    // kernel_h loop
     if (!(conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC)) {
       continue;
     } else if ((conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
@@ -419,6 +433,7 @@ void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int r
   int in_channel = conv_param->input_channel_;
   int in_h = conv_param->input_h_;
   int in_w = conv_param->input_w_;
+  int ic4_minus = in_channel / C4NUM;
   int ic4 = UP_DIV(in_channel, C4NUM);
   int oc4 = UP_DIV(conv_param->output_channel_, C4NUM);
   int out_w = conv_param->output_w_;
@@ -428,26 +443,29 @@ void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int r
     int block_start = block_index + i;
     int input_h = block_start / out_w * stride_h - pad_h;
     int input_w = block_start % out_w * stride_w - pad_w;
-    for (int j = 0; j < kernel_h; j++) {
-      int input_y = input_h + j * dilation_h;
-      if (input_y < 0 || input_y >= in_h) {
-        continue;
-      }
-      int input_y_stride = input_y * in_w * ic4 * C4NUM;
-      for (int n = 0; n < kernel_w; n++) {
-        int input_x = input_w + n * dilation_w;
-        if (input_x < 0 || input_x >= in_w) {
-          continue;
-        }
-        int input_x_stride = input_y_stride + input_x * ic4 * C4NUM;
+    int input_stride = input_h * in_w * in_channel + input_w * in_channel;
+    int kh_s = MSMAX(0, UP_DIV(-input_h, dilation_h));
+    int kh_e = MSMIN(kernel_h, UP_DIV(in_h - input_h, dilation_h));
+    int kw_s = MSMAX(0, UP_DIV(-input_w, dilation_w));
+    int kw_e = MSMIN(kernel_w, UP_DIV(in_w - input_w, dilation_w));
+    for (int j = kh_s; j < kh_e; j++) {
+      int input_y_stride = j * dilation_h * in_w * in_channel + input_stride;
+      for (int n = kw_s; n < kw_e; n++) {
+        int input_x_stride = input_y_stride + n * dilation_w * in_channel;
         int input_plane_offset = (j * kernel_w + n) * tile_num * C4NUM * ic4 + i * C4NUM;
-        for (int m = 0; m < ic4; m++) {
+        for (int m = 0; m < ic4_minus; m++) {
           int channel_block_stride = input_x_stride + m * C4NUM;
           int channel_block_offset = input_plane_offset + m * tile_num * C4NUM;
           memcpy(packed_input + channel_block_offset, input_data + channel_block_stride, 4);
         }  // channel_block loop
-      }    // kernel_w loop
-    }      // kernel_h loop
+        int ic_res = conv_param->input_channel_ - ic4_minus * C4NUM;
+        for (int l = 0; l < ic_res; ++l) {
+          int channel_block_stride = input_x_stride + ic4_minus * C4NUM + l;
+          int channel_block_offset = input_plane_offset + ic4_minus * tile_num * C4NUM + l;
+          packed_input[channel_block_offset] = input_data[channel_block_stride];
+        }
+      }  // kernel_w loop
+    }    // kernel_h loop
     int32_t input_accumulator = 0;
     for (int j = 0; j < block_size; j++) {
       int block_offset = j * tile_num * ic4 * C4NUM + i * C4NUM;
diff --git a/mindspore/lite/nnacl/winograd_transform.c b/mindspore/lite/nnacl/winograd_transform.c
index f59d422205..c8a7eb9851 100644
--- a/mindspore/lite/nnacl/winograd_transform.c
+++ b/mindspore/lite/nnacl/winograd_transform.c
@@ -41,30 +41,48 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float *
     int interval_x_e = src_x_e < input_w ? input_unit : (input_w - src_x_s);
     int interval_y_e = src_y_e < input_h ? input_unit : (input_h - src_y_s);
 
-    int src_plane_offset = ic4 * C4NUM * (src_y_s * input_w + src_x_s);
+    int src_plane_offset = in_channel * (src_y_s * input_w + src_x_s);
     int dst_plane_offset = c * C4NUM * ic4;
     for (int ic = 0; ic < ic4; ic++) {
       // clear tmp buffer
       memset(tmp_data, 0, input_unit * input_unit * C4NUM * sizeof(float));
-      // get real input block with padding
+      int real_c = in_channel - ic * C4NUM;
+      real_c = real_c > C4NUM ? C4NUM : real_c;
       int src_ic4_offset = src_plane_offset + ic * C4NUM;
-      for (int interval = interval_y_s; interval < interval_y_e; interval++) {
-        int src_y_offset = src_ic4_offset + (interval * input_w + interval_x_s) * ic4 * C4NUM;
-        int dst_y_offset = interval * input_unit * C4NUM + interval_x_s * C4NUM;
-        for (int j = 0; j < (interval_x_e - interval_x_s); j++) {
-          int src_x_offset = src_y_offset + j * ic4 * C4NUM;
-          int dst_x_offset = dst_y_offset + j * C4NUM;
-          float *src_addr = (float *)(input_data) + src_x_offset;
-          float *dst_addr = tmp_data + dst_x_offset;
+      // get real input block with padding
+      if (real_c == C4NUM) {
+        for (int interval = interval_y_s; interval < interval_y_e; interval++) {
+          int src_y_offset = src_ic4_offset + (interval * input_w + interval_x_s) * in_channel;
+          int dst_y_offset = interval * input_unit * C4NUM + interval_x_s * C4NUM;
+          for (int j = 0; j < (interval_x_e - interval_x_s); j++) {
+            int src_x_offset = src_y_offset + j * in_channel;
+            int dst_x_offset = dst_y_offset + j * C4NUM;
+            float *src_addr = (float *)(input_data) + src_x_offset;
+            float *dst_addr = tmp_data + dst_x_offset;
 #ifdef ENABLE_NEON
-          vst1q_f32(dst_addr, vld1q_f32(src_addr));
+            vst1q_f32(dst_addr, vld1q_f32(src_addr));
 #else
-          for (int k = 0; k < C4NUM; k++) {
-            dst_addr[k] = src_addr[k];
-          }
+            for (int k = 0; k < C4NUM; k++) {
+              dst_addr[k] = src_addr[k];
+            }
 #endif
-        }
+          }  // interval x loop
+        }    // interval y loop
+      } else {
+        for (int interval = interval_y_s; interval < interval_y_e; interval++) {
+          int src_y_offset = src_ic4_offset + (interval * input_w + interval_x_s) * in_channel;
+          int dst_y_offset = interval * input_unit * C4NUM + interval_x_s * C4NUM;
+          for (int j = 0; j < (interval_x_e - interval_x_s); j++) {
+            int src_x_offset = src_y_offset + j * in_channel;
+            int dst_x_offset = dst_y_offset + j * C4NUM;
+            float *src_addr = (float *)(input_data) + src_x_offset;
+            float *dst_addr = tmp_data + dst_x_offset;
+            for (int k = 0; k < real_c; k++) {
+              dst_addr[k] = src_addr[k];
+            }
+          }  // interval x loop
+        }    // interval y loop
       }
 
       // input transform
 #ifdef ENABLE_ARM32
@@ -314,29 +332,47 @@ void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, floa
     int real_y_start = origin_y > 0 ? 0 : -origin_y;
     int real_y_end = (origin_y + input_unit) < input_height ? input_unit : (input_height - origin_y);
 
-    int src_plane_offset = ic4 * C4NUM * (origin_y * input_width + origin_x);
+    int src_plane_offset = input_channel * (origin_y * input_width + origin_x);
     int dst_plane_offset = cal_id * C4NUM * ic4;
     for (int ic = 0; ic < ic4; ic++) {
      // clear tmp buffer
      memset(tmp_data, 0, input_unit * input_unit * C4NUM * sizeof(float));

+      int real_c = input_channel - ic * C4NUM;
+      real_c = real_c > C4NUM ? C4NUM : real_c;
       // get real input block with padding
       int src_ic4_offset = src_plane_offset + ic * C4NUM;
-      for (int interval = real_y_start; interval < real_y_end; interval++) {
-        int src_y_offset = src_ic4_offset + (interval * input_width + real_x_start) * ic4 * C4NUM;
-        int dst_y_offset = interval * input_unit * C4NUM + real_x_start * C4NUM;
-        for (int j = 0; j < (real_x_end - real_x_start); j++) {
-          int src_x_offset = src_y_offset + j * ic4 * C4NUM;
-          int dst_x_offset = dst_y_offset + j * C4NUM;
-          float *src_addr = (float *)(input_data) + src_x_offset;
-          float *dst_addr = tmp_data + dst_x_offset;
+      if (real_c == C4NUM) {
+        for (int interval = real_y_start; interval < real_y_end; interval++) {
+          int src_y_offset = src_ic4_offset + (interval * input_width + real_x_start) * input_channel;
+          int dst_y_offset = interval * input_unit * C4NUM + real_x_start * C4NUM;
+          for (int j = 0; j < (real_x_end - real_x_start); j++) {
+            int src_x_offset = src_y_offset + j * input_channel;
+            int dst_x_offset = dst_y_offset + j * C4NUM;
+            float *src_addr = (float *)(input_data) + src_x_offset;
+            float *dst_addr = tmp_data + dst_x_offset;
 #ifdef ENABLE_NEON
-          vst1q_f32(dst_addr, vld1q_f32(src_addr));
+            vst1q_f32(dst_addr, vld1q_f32(src_addr));
 #else
-          for (int k = 0; k < C4NUM; k++) {
-            (dst_addr + k)[0] = (src_addr + k)[0];
-          }
+            for (int k = 0; k < C4NUM; k++) {
+              dst_addr[k] = src_addr[k];
+            }
 #endif
+          }
+        }
+      } else {
+        for (int interval = real_y_start; interval < real_y_end; interval++) {
+          int src_y_offset = src_ic4_offset + (interval * input_width + real_x_start) * input_channel;
+          int dst_y_offset = interval * input_unit * C4NUM + real_x_start * C4NUM;
+          for (int j = 0; j < (real_x_end - real_x_start); j++) {
+            int src_x_offset = src_y_offset + j * input_channel;
+            int dst_x_offset = dst_y_offset + j * C4NUM;
+            float *src_addr = (float *)(input_data) + src_x_offset;
+            float *dst_addr = tmp_data + dst_x_offset;
+            for (int k = 0; k < real_c; k++) {
+              dst_addr[k] = src_addr[k];
+            }
+          }
         }
       }
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc
index 60b3ad10b9..1769f1a3df 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc
@@ -84,21 +84,8 @@ int ConvolutionCPUKernel::InitTmpBuffer() {
   MS_ASSERT(ctx_->allocator != nullptr);
   int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
 
-  size_t nhwc4_input_size =
-    ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
-  MS_ASSERT(nullptr != ctx_->allocator);
-  nhwc4_input_ = ctx_->allocator->Malloc(nhwc4_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4 input failed.";
-    return RET_ERROR;
-  }
-
-  int output_count = conv_param_->output_h_ * conv_param_->output_w_;
-  int output_tile_count = UP_DIV(output_count, TILE_NUM);
-  int unit_size = conv_param_->kernel_h_ * conv_param_->kernel_w_ * ic4 * C4NUM;
-  int packed_input_size = output_tile_count * TILE_NUM * unit_size;
-  packed_input_ =
-    reinterpret_cast<float *>(ctx_->allocator->Malloc(conv_param_->input_batch_ * packed_input_size * sizeof(float)));
+  int unit_size = conv_param_->kernel_h_ * conv_param_->kernel_w_ * ic4 * C4NUM * TILE_NUM * thread_count_;
+  packed_input_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(unit_size * sizeof(float)));
   if (packed_input_ == nullptr) {
     MS_LOG(ERROR) << "malloc packed input failed.";
     return RET_ERROR;
@@ -158,9 +145,11 @@ int ConvolutionCPUKernel::RunImpl(int task_id) {
     MS_LOG(ERROR) << "gemm_func is nullptr.";
     return RET_ERROR;
   }
+  auto input_tensor = in_tensors_.at(kInputIndex);
+  auto ori_input_data = reinterpret_cast<float *>(input_tensor->MutableData());
   auto output_addr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->MutableData());
-  ConvFp32(reinterpret_cast<float *>(nhwc4_input_), packed_input_, packed_weight_,
-           reinterpret_cast<float *>(bias_data_), tmp_output_block_, output_addr, task_id, conv_param_, gemm_func_);
+  ConvFp32(ori_input_data, packed_input_, packed_weight_, reinterpret_cast<float *>(bias_data_), tmp_output_block_,
+           output_addr, task_id, conv_param_, gemm_func_);
   return RET_OK;
 }
 
@@ -187,11 +176,6 @@ int ConvolutionCPUKernel::Run() {
     return RET_ERROR;
   }
 
-  auto input_tensor = in_tensors_.at(kInputIndex);
-  auto ori_input_data = input_tensor->MutableData();
-  PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, conv_param_->input_batch_,
-                      conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
-
   int error_code = ParallelLaunch(this->context_->thread_pool_, ConvolutionImpl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "conv error error_code[" << error_code << "]";
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.h
index 7237cb6ce3..97188bd4b6 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.h
@@ -51,10 +51,6 @@ class ConvolutionCPUKernel : public ConvolutionBaseCPUKernel {
       ctx_->allocator->Free(tmp_output_block_);
       tmp_output_block_ = nullptr;
     }
-    if (nhwc4_input_ != nullptr) {
-      ctx_->allocator->Free(nhwc4_input_);
-      nhwc4_input_ = nullptr;
-    }
     if (packed_input_ != nullptr) {
       ctx_->allocator->Free(packed_input_);
       packed_input_ = nullptr;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc
index 2c57806f32..f63dca1e81 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc
@@ -100,13 +100,6 @@ int Convolution3x3CPUKernel::InitTmpBuffer() {
 #else
   const int tile_num = 12;
 #endif
-  size_t nhwc4_input_size =
-    ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
-  nhwc4_input_ = ctx_->allocator->Malloc(nhwc4_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
-    return RET_ERROR;
-  }
 
   size_t tile_buffer_size = thread_count_ * tile_num * C16NUM * ic4 * C4NUM * sizeof(float);
   tile_buffer_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(tile_buffer_size));
@@ -152,16 +145,6 @@ int Convolution3x3CPUKernel::InitTmpBuffer() {
   return RET_OK;
 }
 
-void Convolution3x3CPUKernel::ConfigInputOutput() {
-  auto output_tensor = out_tensors_.at(kOutputIndex);
-  output_tensor->SetFormat(schema::Format::Format_NHWC);
-  // #ifdef ENABLE_ARM32
-  //   gemm_func_ = IndirectGemmFp32_8x4;
-  // #else
-  gemm_func_ = IndirectGemmFp32_8x8;
-  // #endif
-}
-
 int Convolution3x3CPUKernel::Init() {
   auto ret = InitWeightBias();
   if (ret != RET_OK) {
@@ -171,7 +154,6 @@ int Convolution3x3CPUKernel::Init() {
   if (!InferShapeDone()) {
     return RET_OK;
   }
-  ConfigInputOutput();
   return ReSize();
 }
 
@@ -191,12 +173,10 @@ int Convolution3x3CPUKernel::ReSize() {
 }
 
 int Convolution3x3CPUKernel::RunImpl(int task_id) {
-  if (gemm_func_ == nullptr) {
-    MS_LOG(ERROR) << "gemm_func is nullptr.";
-    return RET_ERROR;
-  }
-  Conv3x3Fp32(reinterpret_cast<float *>(nhwc4_input_), transformed_filter_addr_, reinterpret_cast<float *>(bias_data_),
-              tmp_buffer_address_list_, task_id, conv_param_, gemm_func_);
+  auto input_tensor = in_tensors_.at(kInputIndex);
+  auto ori_input_data = reinterpret_cast<float *>(input_tensor->MutableData());
+  Conv3x3Fp32(ori_input_data, transformed_filter_addr_, reinterpret_cast<float *>(bias_data_), tmp_buffer_address_list_,
+              task_id, conv_param_);
   return RET_OK;
 }
 
@@ -245,10 +225,6 @@ int Convolution3x3CPUKernel::Run() {
     MS_LOG(ERROR) << "Init tmp buffer failed.ret: " << ret;
     return RET_ERROR;
   }
-  auto input_tensor = in_tensors_.at(kInputIndex);
-  auto ori_input_data = input_tensor->MutableData();
-  PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, conv_param_->input_batch_,
-                      conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
 
   int error_code = ParallelLaunch(this->context_->thread_pool_, Convolution3x3Impl, this, thread_count_);
   if (error_code != RET_OK) {
@@ -260,6 +236,7 @@ int Convolution3x3CPUKernel::Run() {
   ret = PostProcess();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Post process failed.";
+    FreeTmpBuffer();
     return ret;
   }
   FreeTmpBuffer();
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h
index b269481acc..30bfad66bf 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h
@@ -40,15 +40,10 @@ class Convolution3x3CPUKernel : public ConvolutionBaseCPUKernel {
   int RunImpl(int task_id);
   int InitWeightBias();
   int InitTmpBuffer();
-  void ConfigInputOutput();
   int PostProcess();
 
  private:
   void FreeTmpBuffer() {
-    if (nhwc4_input_ != nullptr) {
-      ctx_->allocator->Free(nhwc4_input_);
-      nhwc4_input_ = nullptr;
-    }
     if (tile_buffer_ != nullptr) {
       ctx_->allocator->Free(tile_buffer_);
       tile_buffer_ = nullptr;
@@ -78,7 +73,6 @@ class Convolution3x3CPUKernel : public ConvolutionBaseCPUKernel {
   float *col_buffer_ = nullptr;
   float *nc4hw4_out_ = nullptr;
   TmpBufferAddress tmp_buffer_address_list_[5];
-  GEMM_FUNC_FP32 gemm_func_ = nullptr;
 };
 void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_param, int oc_block, int oc_block_num);
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
index af21fc75e6..e7fb418f47 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
@@ -184,14 +184,6 @@ int ConvolutionWinogradCPUKernel::InitTmpBuffer() {
 #endif
   MS_ASSERT(ctx_->allocator != nullptr);
 
-  size_t nhwc4_input_size =
-    ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
-  nhwc4_input_ = ctx_->allocator->Malloc(nhwc4_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
-    return RET_MEMORY_FAILED;
-  }
-
   size_t tile_buffer_size = thread_count_ * tile_num * input_unit_ * input_unit_ * ic4 * C4NUM * sizeof(float);
   trans_input_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(tile_buffer_size));
   if (trans_input_ == nullptr) {
@@ -298,9 +290,11 @@ int ConvolutionWinogradCPUKernel::ReSize() {
 }
 
 int ConvolutionWinogradCPUKernel::RunImpl(int task_id) {
+  auto input_tensor = in_tensors_.at(kInputIndex);
+  auto ori_input_data = reinterpret_cast<float *>(input_tensor->MutableData());
   auto output_data = reinterpret_cast<float *>(out_tensors_.front()->MutableData());
-  ConvWinogardFp32(reinterpret_cast<float *>(nhwc4_input_), trans_weight_, reinterpret_cast<float *>(bias_data_),
-                   output_data, tmp_buffer_address_list_, task_id, conv_param_, in_func_, out_func_);
+  ConvWinogardFp32(ori_input_data, trans_weight_, reinterpret_cast<float *>(bias_data_), output_data,
+                   tmp_buffer_address_list_, task_id, conv_param_, in_func_, out_func_);
   return RET_OK;
 }
 
@@ -314,30 +308,6 @@ int ConvolutionWinogradImpl(void *cdata, int task_id) {
   return RET_OK;
 }
 
-int ConvolutionWinogradCPUKernel::PostProcess() {
-  auto out_tensor = out_tensors_.front();
-  auto out_data = reinterpret_cast<float *>(out_tensor->MutableData());
-  auto act_type = conv_param_->act_type_;
-  switch (act_type) {
-    case ActType_No:
-      UnPackWinogradOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
-                           conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
-      break;
-    case ActType_Relu:
-      UnPackWinogradReluOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
-                               conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
-      break;
-    case ActType_Relu6:
-      UnPackWinogradRelu6Output(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
-                                conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
-      break;
-    default:
-      MS_LOG(ERROR) << "Unsupport activation type.";
-      return RET_ERROR;
-  }
-  return RET_OK;
-}
-
 int ConvolutionWinogradCPUKernel::Run() {
   auto prepare_ret = Prepare();
   if (prepare_ret != RET_OK) {
@@ -351,11 +321,6 @@ int ConvolutionWinogradCPUKernel::Run() {
     return RET_ERROR;
   }
 
-  auto input_tensor = in_tensors_.at(kInputIndex);
-  auto ori_input_data = input_tensor->MutableData();
-  PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, conv_param_->input_batch_,
-                      conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
-
   int error_code = ParallelLaunch(this->context_->thread_pool_, ConvolutionWinogradImpl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "conv winograd error error_code[" << error_code << "]";
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h
index f1e97291c3..319aaf1c4e 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h
@@ -45,15 +45,10 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
   int InitWeightBias();
   int InitTmpBuffer();
   int ConfigInputOutput();
-  int PostProcess();
   int WinogradFilterTransform(const float *weight_data, float *matrix_g, float *matrix_gt, int oc_block);
 
  private:
   void FreeTmpBuffer() {
-    if (nhwc4_input_ != nullptr) {
-      ctx_->allocator->Free(nhwc4_input_);
-      nhwc4_input_ = nullptr;
-    }
     if (trans_input_ != nullptr) {
       ctx_->allocator->Free(trans_input_);
       trans_input_ = nullptr;
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc
index 76833240ef..167b556f4e 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc
@@ -137,25 +137,15 @@ int ConvolutionInt8CPUKernel::InitTmpBuffer() {
   MS_ASSERT(ctx_->allocator != nullptr);
   int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
 
-  int output_count = conv_param_->output_h_ * conv_param_->output_w_;
-  int output_tile_count = UP_DIV(output_count, tile_num_);
   int kernel_plane = conv_param_->kernel_h_ * conv_param_->kernel_w_;
   int plane_c4 = UP_DIV(kernel_plane, C4NUM);
   int unit_size = plane_c4 * C4NUM * ic4 * C4NUM;
-  int packed_input_size = output_tile_count * tile_num_ * unit_size;
-  packed_input_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(conv_param_->input_batch_ * packed_input_size));
+  packed_input_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(unit_size * thread_count_ * tile_num_));
   if (packed_input_ == nullptr) {
     MS_LOG(ERROR) << "malloc packed_input_ failed.";
     return RET_ERROR;
   }
 
-  size_t nhwc4_input_size = ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_;
-  nhwc4_input_ = ctx_->allocator->Malloc(nhwc4_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4 input failed.";
-    return RET_ERROR;
-  }
-
   size_t tmp_dst_size = thread_count_ * tile_num_ * conv_param_->output_channel_ * sizeof(int32_t);
   tmp_dst_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(tmp_dst_size));
   if (tmp_dst_ == nullptr) {
@@ -322,15 +312,15 @@ int ConvolutionInt8CPUKernel::ReSize() {
 }
 
 int ConvolutionInt8CPUKernel::RunImpl(int task_id) {
+  auto input_tensor = in_tensors_.at(kInputIndex);
+  auto ori_input_data = reinterpret_cast<int8_t *>(input_tensor->MutableData());
   auto output_addr = reinterpret_cast<int8_t *>(out_tensors_.at(kOutputIndex)->MutableData());
   if (support_optimize_) {
-    ConvInt8Opt(reinterpret_cast<int8_t *>(nhwc4_input_), packed_input_, packed_weight_,
-                reinterpret_cast<int32_t *>(bias_data_), tmp_dst_, tmp_out_, output_addr, input_sum_, task_id,
-                conv_param_, gemm_func_);
+    ConvInt8Opt(ori_input_data, packed_input_, packed_weight_, reinterpret_cast<int32_t *>(bias_data_), tmp_dst_,
+                tmp_out_, output_addr, input_sum_, task_id, conv_param_, gemm_func_);
   } else {
-    ConvInt8(reinterpret_cast<int8_t *>(nhwc4_input_), packed_input_, packed_weight_,
-             reinterpret_cast<int32_t *>(bias_data_), tmp_dst_, tmp_out_, output_addr, input_sum_, task_id,
-             conv_param_);
+    ConvInt8(ori_input_data, packed_input_, packed_weight_, reinterpret_cast<int32_t *>(bias_data_), tmp_dst_, tmp_out_,
+             output_addr, input_sum_, task_id, conv_param_);
   }
   return RET_OK;
 }
@@ -358,11 +348,6 @@ int ConvolutionInt8CPUKernel::Run() {
     return RET_ERROR;
   }
 
-  auto input_tensor = in_tensors_.at(kInputIndex);
-  auto ori_input_data = input_tensor->MutableData();
-  PackNHWCToNHWC4Int8(ori_input_data, nhwc4_input_, conv_param_->input_batch_,
-                      conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
-
   int error_code = ParallelLaunch(this->context_->thread_pool_, ConvolutionInt8Impl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "conv int8 error error_code[" << error_code << "]";
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.h
index 76ee710178..d836da459a 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.h
@@ -55,10 +55,6 @@ class ConvolutionInt8CPUKernel : public ConvolutionBaseCPUKernel {
 
  private:
   void FreeTmpBuffer() {
-    if (nhwc4_input_ != nullptr) {
-      ctx_->allocator->Free(nhwc4_input_);
-      nhwc4_input_ = nullptr;
-    }
     if (packed_input_ != nullptr) {
       ctx_->allocator->Free(packed_input_);
       packed_input_ = nullptr;
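
Review note (appended commentary, not part of the patch): the core change above is that the kernels now read the raw NHWC input directly and pack im2col data into a small per-thread scratch block instead of a batch-sized buffer. Each worker packs into `packed_input + task_id * unit_size * TILE_NUM` and clears that block before every tile, so the allocation shrinks from `input_batch * output_tile_count * TILE_NUM * unit_size` to `thread_count * TILE_NUM * unit_size`, and the `nhwc4_input_` buffer plus the `PackNHWCToNHWC4Fp32`/`PackNHWCToNHWC4Int8` pre-pass become unnecessary. Below is a minimal standalone C sketch of just that indexing scheme; `PackTile`, the toy sizes, and the serial task loop are illustrative stand-ins (the real code packs via `Im2ColPackUnitFp32` under `ParallelLaunch`, and the int8 path memsets to the input zero point rather than 0):

```c
/* Sketch of the per-thread packed-buffer reuse introduced by this patch.
 * Assumptions: TILE_NUM and all sizes are toy values; PackTile() is a
 * hypothetical stand-in for Im2ColPackUnitFp32(). */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define TILE_NUM 8

/* stand-in for Im2ColPackUnitFp32: fill one tile's packed block */
static void PackTile(float *gemm_input, int unit_size, int tile_index) {
  for (int i = 0; i < unit_size * TILE_NUM; ++i) {
    gemm_input[i] = (float)tile_index; /* dummy payload */
  }
}

static void RunTask(float *packed_input, int task_id, int thread_count,
                    int output_tile_count, int unit_size) {
  /* task_id (the worker index, bounded by thread_count) selects the slot,
   * so the same memory is reused for every tile this worker processes */
  float *gemm_input = packed_input + task_id * unit_size * TILE_NUM;
  for (int tile = task_id; tile < output_tile_count; tile += thread_count) {
    /* clear before packing, mirroring the memset added in ConvFp32 */
    memset(gemm_input, 0, (size_t)unit_size * TILE_NUM * sizeof(float));
    PackTile(gemm_input, unit_size, tile);
    /* ... GEMM against packed weights would consume gemm_input here ... */
  }
}

int main(void) {
  int thread_count = 4, output_tile_count = 100;
  int unit_size = 3 * 3 * 4; /* kernel_h * kernel_w * ic4 * C4NUM, toy value */
  /* allocation is now thread_count * TILE_NUM * unit_size floats,
   * independent of batch size and output size */
  float *packed_input = malloc((size_t)thread_count * TILE_NUM * unit_size * sizeof(float));
  if (packed_input == NULL) {
    return 1;
  }
  for (int task_id = 0; task_id < thread_count; ++task_id) { /* serial stand-in for ParallelLaunch */
    RunTask(packed_input, task_id, thread_count, output_tile_count, unit_size);
  }
  printf("packed buffer: %zu floats instead of %zu\n",
         (size_t)thread_count * TILE_NUM * unit_size,
         (size_t)output_tile_count * TILE_NUM * unit_size);
  free(packed_input);
  return 0;
}
```

The key point is that the slot is keyed on `task_id` rather than the tile counter: the tile counter (called `thread_id` in the kernels above) grows with the output size, while `task_id` is bounded by `thread_count`, which is what makes the fixed-size buffer safe to reuse across tiles and batches.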