diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_1x1_x86_fp32.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_1x1_x86_fp32.c
index f830c1d1bf1..691272b54f7 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_1x1_x86_fp32.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_1x1_x86_fp32.c
@@ -20,6 +20,15 @@
 // sliding window to compate 1x1 conv in x86
 void Conv1x1SWFp32(const float *input_data, const float *packed_weight, const float *bias_data, float *output_data,
                    int task_id, ConvParameter *conv_param, SlidingWindowParam *sw_param) {
+  int output_w = conv_param->output_w_;
+  int output_h = conv_param->output_h_;
+  int ohw = output_h * output_w;
+  int ohw_step = UP_DIV(ohw, conv_param->thread_num_);
+  int ohw_start = ohw_step * task_id;
+  int ohw_end = MSMIN(ohw_start + ohw_step, ohw);
+  if (ohw_start >= ohw_end) {
+    return;
+  }
   int oc_tile_ = C8NUM;  // oc in algin to C8NUM in x86_64_avx
   int act_type = 0;
   if (conv_param->act_type_ == ActType_Relu6) {
@@ -28,8 +37,6 @@ void Conv1x1SWFp32(const float *input_data, const float *packed_weight, const fl
   if (conv_param->act_type_ == ActType_Relu || conv_param->act_type_ == ActType_Relu6) {
     act_type += 2;
   }
-  int output_w = conv_param->output_w_;
-  int output_h = conv_param->output_h_;
   int pad_d = conv_param->pad_d_;
   int pad_l = conv_param->pad_l_;
   int pad_r = conv_param->pad_r_;
@@ -43,10 +50,6 @@ void Conv1x1SWFp32(const float *input_data, const float *packed_weight, const fl
   int oc_num = sw_param->c_block_;
   int in_step = sw_param->in_step_;
   int out_step = sw_param->out_step_;
-  int ohw = output_h * output_w;
-  int ohw_step = UP_DIV(ohw, conv_param->thread_num_);
-  int ohw_start = ohw_step * task_id;
-  int ohw_end = MSMIN(ohw_start + ohw_step, ohw);
   const int ow_block_num[4] = {12, 6, 4, 3};
   const Conv1x1SWKernel kernel[4][2] = {{Conv1x1SW1x8Kernel, Conv1x1SW12x8Kernel},
                                         {Conv1x1SW1x16Kernel, Conv1x1SW6x16Kernel},
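Review note: each kernel touched by this patch now computes its per-thread slice once at function entry and returns early when a task id receives an empty range, instead of deriving the bounds after the activation/padding setup. For Conv1x1SWFp32 the unit of work is an output point (output_h_ * output_w_), so threads in excess of the output size exit immediately. A minimal standalone sketch of the split, assuming UP_DIV and MSMIN behave like the nnacl macros (the harness itself is hypothetical):

    #include <stdio.h>

    #define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
    #define MSMIN(x, y) ((x) < (y) ? (x) : (y))

    /* Chunked split: task_id gets rows [start, end); empty ranges bail out. */
    static void ComputeRows(int total_rows, int thread_num, int task_id) {
      int step = UP_DIV(total_rows, thread_num);
      int start = step * task_id;
      int end = MSMIN(start + step, total_rows);
      if (start >= end) {
        return; /* more threads than rows: nothing assigned to this task */
      }
      for (int oh = start; oh < end; ++oh) {
        printf("task %d handles row %d\n", task_id, oh);
      }
    }

    int main(void) {
      /* 3 rows over 4 threads: tasks 0-2 each take one row, task 3 returns. */
      for (int t = 0; t < 4; ++t) {
        ComputeRows(3, 4, t);
      }
      return 0;
    }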
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_common_fp32.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_common_fp32.c
index 673693e37c3..ae159e061a1 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_common_fp32.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_common_fp32.c
@@ -133,6 +133,13 @@ void SWBorder(float *dst, const float *src, const float *weight, const float *bi
 // fp32 sliding window common conv
 void ConvSWFp32(const float *input_data, const float *packed_weight, const float *bias_data, float *output_data,
                 int task_id, ConvParameter *conv_param, SlidingWindowParam *sw_param) {
+  int out_h = conv_param->output_h_;
+  int oh_step = UP_DIV(out_h, conv_param->thread_num_);
+  int oh_start = oh_step * task_id;
+  int oh_end = MSMIN(oh_start + oh_step, out_h);
+  if (oh_start >= oh_end) {
+    return;
+  }
   int oc_tile_ = C8NUM;  // oc in algin to C8NUM in x86_64_avx
   int act_type = 0;
   if (conv_param->act_type_ == ActType_Relu6) {
@@ -148,56 +155,75 @@ void ConvSWFp32(const float *input_data, const float *packed_weight, const float
   int in_sw_step = sw_param->in_sw_step_;
   int in_kw_step = sw_param->in_kw_step_;
   int in_kh_step = sw_param->in_kh_step_;
+  int in_sh_step = sw_param->in_sh_step_;
+  int out_h_step = sw_param->out_h_step_;
+  int kernel_step = sw_param->kernel_step_;
+  int in_step = sw_param->in_step_;
+  int out_step = sw_param->out_step_;
+  int c_block = sw_param->c_block_;
+  int top = sw_param->top_;
+  int left = sw_param->left_;
+  int right = sw_param->right_;
+  int bottom = sw_param->bottom_;
+  int block_channel = sw_param->block_channel_;
+  int stride_h = conv_param->stride_h_;
+  int stride_w = conv_param->stride_w_;
+  int out_w = conv_param->output_w_;
+  int pad_u = conv_param->pad_u_;
+  int pad_l = conv_param->pad_l_;
+  int in_h_step = sw_param->in_h_step_;
+  int out_batch = conv_param->output_batch_;
+  int in_h_start = top * stride_h - pad_u;
+  int in_w_start = left * stride_w - pad_l;
+  int center_step = in_h_start * in_h_step + in_w_start * ic_algin;
   const int ow_block_num[4] = {12, 6, 4, 3};
   const SWConvKernel kernel[4][2] = {{SWConv1x8Kernel, SWConv12x8Kernel},
                                      {SWConv1x16Kernel, SWConv6x16Kernel},
                                      {SWConv1x24Kernel, SWConv4x24Kernel},
                                      {SWConv1x32Kernel, SWConv3x32Kernel}};
-  for (int b = 0; b < conv_param->output_batch_; b++) {
-    for (int oh = task_id; oh < conv_param->output_h_; oh += conv_param->thread_num_) {
-      float *dst_oh = output_data + oh * sw_param->out_h_step_;
-      int in_h_start = sw_param->top_ * conv_param->stride_h_ - conv_param->pad_u_;
-      int in_w_start = sw_param->left_ * conv_param->stride_w_ - conv_param->pad_l_;
-      const float *src_h = input_data + in_h_start * sw_param->in_h_step_ + in_w_start * sw_param->ic_align_;
+  for (int b = 0; b < out_batch; b++) {
+    for (int oh = oh_start; oh < oh_end; oh += 1) {
+      float *dst_oh = output_data + oh * out_h_step;
+      const float *src_h = input_data + center_step;
       int oc_block = 0;
       const float *bias = bias_data;
-      for (int oc = 0; oc < sw_param->c_block_; oc += oc_block) {
-        oc_block = MSMIN(C4NUM, sw_param->c_block_ - oc);  // 4 3 2 1
-        const float *weight = packed_weight + oc * sw_param->kernel_step_;
+      for (int oc = 0; oc < c_block; oc += oc_block) {
+        oc_block = MSMIN(C4NUM, c_block - oc);  // 4 3 2 1
+        const float *weight = packed_weight + oc * kernel_step;
         if (bias != NULL) {
           bias = bias_data + oc * oc_tile_;
         }
         float *dst_w = dst_oh + oc * oc_tile_;
         const SWConvKernel kernel_border = kernel[oc_block - 1][0];
-        if (oh < sw_param->top_ || oh >= sw_param->bottom_) {  // oh in up or down border
-          SWBorder(dst_w, input_data, weight, bias, oh, oh + 1, 0, conv_param->output_w_, conv_param, sw_param,
-                   kernel_border, act_type, 1, oc_block);
+        if (oh < top || oh >= bottom) {  // oh in up or down border
+          SWBorder(dst_w, input_data, weight, bias, oh, oh + 1, 0, out_w, conv_param, sw_param, kernel_border, act_type,
+                   1, oc_block);
         } else {  // oh in center
           // ow in right
-          SWBorder(dst_w, input_data, weight, bias, oh, oh + 1, 0, sw_param->left_, conv_param, sw_param, kernel_border,
-                   act_type, 1, oc_block);
+          SWBorder(dst_w, input_data, weight, bias, oh, oh + 1, 0, left, conv_param, sw_param, kernel_border, act_type,
+                   1, oc_block);
           // ow in center
-          const float *src_w = src_h + (oh - sw_param->top_) * sw_param->in_sh_step_;
-          int ow_block = ow_block_num[oc_block - 1];                               // 12 6 4 3
-          for (int ow = sw_param->left_; ow < sw_param->right_; ow += ow_block) {  // left ~ right
-            ow_block = MSMIN(ow_block, sw_param->right_ - ow);
+          const float *src_w = src_h + (oh - top) * in_sh_step;
+          int ow_block = ow_block_num[oc_block - 1];         // 12 6 4 3
+          for (int ow = left; ow < right; ow += ow_block) {  // left ~ right
+            ow_block = MSMIN(ow_block, right - ow);
             if (ow_block < ow_block_num[oc_block - 1]) {  // ow is not enough and process one ow
               ow_block = 1;
             }
             kernel[oc_block - 1][ow_block / ow_block_num[oc_block - 1]](
-              dst_w + ow * sw_param->block_channel_, src_w, weight, bias, kernel_h, kernel_w, act_type, ow_block,
-              oc_block, oc_algin, ic_algin, in_kw_step, in_kh_step, in_sw_step, 0);
+              dst_w + ow * block_channel, src_w, weight, bias, kernel_h, kernel_w, act_type, ow_block, oc_block,
+              oc_algin, ic_algin, in_kw_step, in_kh_step, in_sw_step, 0);
             src_w += ow_block * in_sw_step;
           }
           // ow in left
-          SWBorder(dst_w, input_data, weight, bias, oh, oh + 1, sw_param->right_, conv_param->output_w_, conv_param,
-                   sw_param, kernel_border, act_type, 1, oc_block);
+          SWBorder(dst_w, input_data, weight, bias, oh, oh + 1, right, out_w, conv_param, sw_param, kernel_border,
+                   act_type, 1, oc_block);
         }
       }
     }  // output h loop
-    input_data += sw_param->in_step_;
-    output_data += sw_param->out_step_;
+    input_data += in_step;
+    output_data += out_step;
   }  // batch loop
 }
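Review note: besides the same early-return split, the ConvSWFp32 hunk switches the row distribution from interleaved (oh = task_id; oh += thread_num_) to contiguous chunks, and hoists every loop-invariant sw_param->/conv_param-> field into a local, folding in_h_start/in_w_start into a single precomputed center_step offset. A reduced illustration of the hoisting pattern; the Param struct and FillRows are hypothetical stand-ins, not code from this patch:

    /* Loop-invariant struct fields are read once, outside the hot loops. */
    typedef struct {
      int out_h_step_;
      int block_channel_;
    } Param;

    static void FillRows(float *dst, int rows, int cols, const Param *p) {
      int out_h_step = p->out_h_step_;       /* hoisted: no per-row load */
      int block_channel = p->block_channel_; /* hoisted: no per-point load */
      for (int oh = 0; oh < rows; ++oh) {
        float *dst_oh = dst + oh * out_h_step;
        for (int ow = 0; ow < cols; ++ow) {
          dst_oh[ow * block_channel] = 0.0f;
        }
      }
    }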
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_depthwise_fp32.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_depthwise_fp32.c
index 9c1d7423b6c..ace1e320cea 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_depthwise_fp32.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_depthwise_fp32.c
@@ -1040,6 +1040,12 @@ void DepthwiseBorderAvxFp32(float *dst, const float *src, const float *weight, c
 
 void DepthwiseSWAvxFp32(float *output_data, const float *input_data, const float *weight_data, const float *bias_data,
                         const ConvParameter *conv_param, const SlidingWindowParam *sw_param, int task_id) {
+  int oh_step = UP_DIV(conv_param->output_h_, conv_param->thread_num_);
+  int oh_start = oh_step * task_id;
+  int oh_end = MSMIN(oh_start + oh_step, conv_param->output_h_);
+  if (oh_start >= oh_end) {
+    return;
+  }
   // depthwise sw in x86 avx instructions
   int oc_tile_ = C8NUM;  // oc in algin to C8NUM in x86_64_avx
   int act_type = 0;
@@ -1064,6 +1070,8 @@ void DepthwiseSWAvxFp32(float *output_data, const float *input_data, const float
   int out_left = sw_param->left_;
   int out_top = sw_param->top_;
   int out_bottom = sw_param->bottom_;
+  int kernel_step = sw_param->kernel_step_;
+  int out_h_step = sw_param->out_h_step_;
   int in_h_start = out_top * conv_param->stride_h_ - conv_param->pad_u_;
   int in_w_start = out_left * conv_param->stride_w_ - conv_param->pad_l_;
   int in_start = in_h_start * sw_param->in_h_step_ + in_w_start * oc_algin;
@@ -1072,19 +1080,16 @@ void DepthwiseSWAvxFp32(float *output_data, const float *input_data, const float
                                               {DepthwiseSW1x16Kernel, DepthwiseSW4x16Kernel},
                                               {DepthwiseSW1x24Kernel, DepthwiseSW4x24Kernel},
                                               {DepthwiseSW1x32Kernel, DepthwiseSW3x32Kernel}};
-  int oh_step = UP_DIV(conv_param->output_h_, conv_param->thread_num_);
-  int oh_start = oh_step * task_id;
-  int oh_end = MSMIN(oh_start + oh_step, conv_param->output_h_);
   for (int b = 0; b < conv_param->output_batch_; b++) {
     for (int oh = oh_start; oh < oh_end; ++oh) {
-      float *dst_oh = output_data + oh * sw_param->out_h_step_;
+      float *dst_oh = output_data + oh * out_h_step;
       const float *src_h = input_data + in_start + (oh - out_top) * in_sh_step;
       int oc_block = 0;
       const float *bias = bias_data;
       for (int oc = 0; oc < oc_num; oc += oc_block) {
         oc_block = MSMIN(C4NUM, oc_num - oc);  // 4 3 2 1
         int oc_step = oc * oc_tile_;
-        const float *weight = weight_data + oc * sw_param->kernel_step_;
+        const float *weight = weight_data + oc * kernel_step;
         if (bias != NULL) {
           bias = bias_data + oc_step;
         }
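Review note: DepthwiseSWAvxFp32 gets the identical treatment; the oh_step/oh_start/oh_end computation moves above the setup code so empty tasks return before touching any state. The context lines also show the variable-width channel blocking the sliding-window kernels share; a small standalone demonstration of that idiom (the oc_num value is made up):

    #include <stdio.h>

    #define C4NUM 4
    #define MSMIN(x, y) ((x) < (y) ? (x) : (y))

    int main(void) {
      int oc_num = 10; /* hypothetical count of 8-float channel blocks */
      int oc_block = 0;
      for (int oc = 0; oc < oc_num; oc += oc_block) {
        /* up to 4 blocks per step, shrinking only at the tail: 4, 4, 2 */
        oc_block = MSMIN(C4NUM, oc_num - oc);
        printf("process channel blocks [%d, %d)\n", oc, oc + oc_block);
      }
      return 0;
    }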
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.cc
index 18733dd23f2..413614e9cc8 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.cc
@@ -144,7 +144,8 @@ kernel::InnerKernel *ConvolutionDelegateCPUKernel::CpuConvFp32KernelSelect() {
   if (conv_param->kernel_h_ == 1 && conv_param->kernel_w_ == 1) {
 #ifdef ENABLE_AVX
     if (conv_param->pad_d_ == 0 && conv_param->pad_l_ == 0 && conv_param->pad_r_ == 0 && conv_param->pad_u_ == 0 &&
-        conv_param->stride_h_ == 1 && conv_param->stride_w_ == 1 && conv_param->input_channel_ % 8 == 0) {
+        conv_param->stride_h_ == 1 && conv_param->stride_w_ == 1 && conv_param->input_channel_ % 8 == 0 &&
+        (conv_param->input_w_ * conv_param->input_h_ >= conv_param->thread_num_)) {
       kernel = new (std::nothrow) kernel::ConvolutionSWCPUKernel(
         op_parameter_, in_tensors_, out_tensors_, static_cast<const lite::InnerContext *>(this->context_),
         origin_weight_, origin_bias_);
@@ -165,21 +166,22 @@
         op_parameter_, in_tensors_, out_tensors_, static_cast<const lite::InnerContext *>(this->context_), out_unit,
         origin_weight_, origin_bias_);
     } else {
-      if (conv_param->input_channel_ / op_parameter_->thread_num_ > 64) {
+#ifdef ENABLE_AVX
+      if (conv_param->input_channel_ / op_parameter_->thread_num_ > 64 ||
+          conv_param->input_h_ < conv_param->thread_num_) {
         kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(
           op_parameter_, in_tensors_, out_tensors_, static_cast<const lite::InnerContext *>(this->context_),
           origin_weight_, origin_bias_);
       } else {
-#ifdef ENABLE_AVX
         kernel = new (std::nothrow) kernel::ConvolutionSWCPUKernel(
           op_parameter_, in_tensors_, out_tensors_, static_cast<const lite::InnerContext *>(this->context_),
           origin_weight_, origin_bias_);
-#else
-        kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(
-          op_parameter_, in_tensors_, out_tensors_, static_cast<const lite::InnerContext *>(this->context_),
-          origin_weight_, origin_bias_);
-#endif
       }
+#else
+      kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(op_parameter_, in_tensors_, out_tensors_,
+                                                               static_cast<const lite::InnerContext *>(this->context_),
+                                                               origin_weight_, origin_bias_);
+#endif
     }
   }
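Review note: the delegate now gates both AVX sliding-window paths on there being enough output to occupy every thread. The 1x1 path requires input_w_ * input_h_ >= thread_num_ (with stride 1 and no padding the output matches the input, so this bounds the per-point split above), and the common path falls back to ConvolutionCPUKernel when input_h_ < thread_num_ would leave the row-sliced workers idle. A hypothetical distillation of the added AVX predicate; the enum and function names are illustrative, not from the source:

    typedef enum { kConvGeneric, kConvSlidingWindow } ConvKernelKind;

    /* Mirrors the AVX branch above: deep channels or too few output rows
     * favor the generic kernel over the sliding-window one. */
    static ConvKernelKind SelectAvxConvKernel(int input_channel, int input_h, int thread_num) {
      if (input_channel / thread_num > 64 || input_h < thread_num) {
        return kConvGeneric;
      }
      return kConvSlidingWindow;
    }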