@ -20,6 +20,15 @@
// sliding window to compute 1x1 conv in x86
void Conv1x1SWFp32(const float *input_data, const float *packed_weight, const float *bias_data, float *output_data,
int task_id, ConvParameter *conv_param, SlidingWindowParam *sw_param) {
int output_w = conv_param->output_w_;
int output_h = conv_param->output_h_;
int ohw = output_h * output_w;
int ohw_step = UP_DIV(ohw, conv_param->thread_num_);
int ohw_start = ohw_step * task_id;
int ohw_end = MSMIN(ohw_start + ohw_step, ohw);
if (ohw_start >= ohw_end) {
int oc_tile_ = C8NUM; // oc in algin to C8NUM in x86_64_avx
int act_type = 0;
if (conv_param->act_type_ == ActType_Relu6) {
@ -28,8 +37,6 @@ void Conv1x1SWFp32(const float *input_data, const float *packed_weight, const fl
if (conv_param->act_type_ == ActType_Relu || conv_param->act_type_ == ActType_Relu6) {
act_type += 2;
int output_w = conv_param->output_w_;
int output_h = conv_param->output_h_;
int pad_d = conv_param->pad_d_;
int pad_l = conv_param->pad_l_;
int pad_r = conv_param->pad_r_;
@ -43,10 +50,6 @@ void Conv1x1SWFp32(const float *input_data, const float *packed_weight, const fl
int oc_num = sw_param->c_block_;
int in_step = sw_param->in_step_;
int out_step = sw_param->out_step_;
int ohw = output_h * output_w;
int ohw_step = UP_DIV(ohw, conv_param->thread_num_);
int ohw_start = ohw_step * task_id;
int ohw_end = MSMIN(ohw_start + ohw_step, ohw);
const int ow_block_num[4] = {12, 6, 4, 3};
const Conv1x1SWKernel kernel[4][2] = {{Conv1x1SW1x8Kernel, Conv1x1SW12x8Kernel},
{Conv1x1SW1x16Kernel, Conv1x1SW6x16Kernel},
@ -133,6 +133,13 @@ void SWBorder(float *dst, const float *src, const float *weight, const float *bi
// fp32 sliding window common conv
void ConvSWFp32(const float *input_data, const float *packed_weight, const float *bias_data, float *output_data,
int task_id, ConvParameter *conv_param, SlidingWindowParam *sw_param) {
int out_h = conv_param->output_h_;
int oh_step = UP_DIV(out_h, conv_param->thread_num_);
int oh_start = oh_step * task_id;
int oh_end = MSMIN(oh_start + oh_step, out_h);
if (oh_start >= oh_end) {
int oc_tile_ = C8NUM; // oc in algin to C8NUM in x86_64_avx
int act_type = 0;
if (conv_param->act_type_ == ActType_Relu6) {
@ -148,56 +155,75 @@ void ConvSWFp32(const float *input_data, const float *packed_weight, const float
int in_sw_step = sw_param->in_sw_step_;
int in_kw_step = sw_param->in_kw_step_;
int in_kh_step = sw_param->in_kh_step_;
int in_sh_step = sw_param->in_sh_step_;
int out_h_step = sw_param->out_h_step_;
int kernel_step = sw_param->kernel_step_;
int in_step = sw_param->in_step_;
int out_step = sw_param->out_step_;
int c_block = sw_param->c_block_;
int top = sw_param->top_;
int left = sw_param->left_;
int right = sw_param->right_;
int bottom = sw_param->bottom_;
int block_channel = sw_param->block_channel_;
int stride_h = conv_param->stride_h_;
int stride_w = conv_param->stride_w_;
int out_w = conv_param->output_w_;
int pad_u = conv_param->pad_u_;
int pad_l = conv_param->pad_l_;
int in_h_step = sw_param->in_h_step_;
int out_batch = conv_param->output_batch_;
int in_h_start = top * stride_h - pad_u;
int in_w_start = left * stride_w - pad_l;
int center_step = in_h_start * in_h_step + in_w_start * ic_algin;
const int ow_block_num[4] = {12, 6, 4, 3};
const SWConvKernel kernel[4][2] = {{SWConv1x8Kernel, SWConv12x8Kernel},
{SWConv1x16Kernel, SWConv6x16Kernel},
{SWConv1x24Kernel, SWConv4x24Kernel},
{SWConv1x32Kernel, SWConv3x32Kernel}};
for (int b = 0; b < conv_param->output_batch_; b++) {
for (int oh = task_id; oh < conv_param->output_h_; oh += conv_param->thread_num_) {
float *dst_oh = output_data + oh * sw_param->out_h_step_;
int in_h_start = sw_param->top_ * conv_param->stride_h_ - conv_param->pad_u_;
int in_w_start = sw_param->left_ * conv_param->stride_w_ - conv_param->pad_l_;
const float *src_h = input_data + in_h_start * sw_param->in_h_step_ + in_w_start * sw_param->ic_align_;
for (int b = 0; b < out_batch; b++) {
for (int oh = oh_start; oh < oh_end; oh += 1) {
float *dst_oh = output_data + oh * out_h_step;
const float *src_h = input_data + center_step;
int oc_block = 0;
const float *bias = bias_data;
for (int oc = 0; oc < sw_param->c_block_; oc += oc_block) {
oc_block = MSMIN(C4NUM, sw_param->c_block_ - oc); // 4 3 2 1
const float *weight = packed_weight + oc * sw_param->kernel_step_;
for (int oc = 0; oc < c_block; oc += oc_block) {
oc_block = MSMIN(C4NUM, c_block - oc); // 4 3 2 1
const float *weight = packed_weight + oc * kernel_step;
if (bias != NULL) {
bias = bias_data + oc * oc_tile_;
float *dst_w = dst_oh + oc * oc_tile_;
const SWConvKernel kernel_border = kernel[oc_block - 1][0];
if (oh < sw_param->top_ || oh >= sw_param->bottom_) { // oh in up or down border
SWBorder(dst_w, input_data, weight, bias, oh, oh + 1, 0, conv_param->output_w_, conv_param, sw_param,
kernel_border, act_type, 1, oc_block);
if (oh < top || oh >= bottom) { // oh in up or down border
SWBorder(dst_w, input_data, weight, bias, oh, oh + 1, 0, out_w, conv_param, sw_param, kernel_border, act_type,
1, oc_block);
} else { // oh in center
// ow in left border: columns [0, left)
SWBorder(dst_w, input_data, weight, bias, oh, oh + 1, 0, sw_param->left_, conv_param, sw_param, kernel_border,
act_type, 1, oc_block);
SWBorder(dst_w, input_data, weight, bias, oh, oh + 1, 0, left, conv_param, sw_param, kernel_border, act_type,
1, oc_block);
// ow in center
const float *src_w = src_h + (oh - sw_param->top_) * sw_param->in_sh_step_;
int ow_block = ow_block_num[oc_block - 1]; // 12 6 4 3
for (int ow = sw_param->left_; ow < sw_param->right_; ow += ow_block) { // left ~ right
ow_block = MSMIN(ow_block, sw_param->right_ - ow);
const float *src_w = src_h + (oh - top) * in_sh_step;
int ow_block = ow_block_num[oc_block - 1]; // 12 6 4 3
for (int ow = left; ow < right; ow += ow_block) { // left ~ right
ow_block = MSMIN(ow_block, right - ow);
if (ow_block < ow_block_num[oc_block - 1]) { // ow is not enough and process one ow
ow_block = 1;
kernel[oc_block - 1][ow_block / ow_block_num[oc_block - 1]](
dst_w + ow * sw_param->block_channel_, src_w, weight, bias, kernel_h, kernel_w, act_type, ow_block,
oc_block, oc_algin, ic_algin, in_kw_step, in_kh_step, in_sw_step, 0);
dst_w + ow * block_channel, src_w, weight, bias, kernel_h, kernel_w, act_type, ow_block, oc_block,
oc_algin, ic_algin, in_kw_step, in_kh_step, in_sw_step, 0);
src_w += ow_block * in_sw_step;
// ow in right border: columns [right, output width)
SWBorder(dst_w, input_data, weight, bias, oh, oh + 1, sw_param->right_, conv_param->output_w_, conv_param,
sw_param, kernel_border, act_type, 1, oc_block);
SWBorder(dst_w, input_data, weight, bias, oh, oh + 1, right, out_w, conv_param, sw_param, kernel_border,
act_type, 1, oc_block);
} // output h loop
input_data += sw_param->in_step_;
output_data += sw_param->out_step_;
input_data += in_step;
output_data += out_step;
} // batch loop
@ -1040,6 +1040,12 @@ void DepthwiseBorderAvxFp32(float *dst, const float *src, const float *weight, c
void DepthwiseSWAvxFp32(float *output_data, const float *input_data, const float *weight_data, const float *bias_data,
const ConvParameter *conv_param, const SlidingWindowParam *sw_param, int task_id) {
int oh_step = UP_DIV(conv_param->output_h_, conv_param->thread_num_);
int oh_start = oh_step * task_id;
int oh_end = MSMIN(oh_start + oh_step, conv_param->output_h_);
if (oh_start >= oh_end) {
// depthwise sw in x86 avx instructions
int oc_tile_ = C8NUM; // oc in algin to C8NUM in x86_64_avx
int act_type = 0;
@ -1064,6 +1070,8 @@ void DepthwiseSWAvxFp32(float *output_data, const float *input_data, const float
int out_left = sw_param->left_;
int out_top = sw_param->top_;
int out_bottom = sw_param->bottom_;
int kernel_step = sw_param->kernel_step_;
int out_h_step = sw_param->out_h_step_;
int in_h_start = out_top * conv_param->stride_h_ - conv_param->pad_u_;
int in_w_start = out_left * conv_param->stride_w_ - conv_param->pad_l_;
int in_start = in_h_start * sw_param->in_h_step_ + in_w_start * oc_algin;
@ -1072,19 +1080,16 @@ void DepthwiseSWAvxFp32(float *output_data, const float *input_data, const float
{DepthwiseSW1x16Kernel, DepthwiseSW4x16Kernel},
{DepthwiseSW1x24Kernel, DepthwiseSW4x24Kernel},
{DepthwiseSW1x32Kernel, DepthwiseSW3x32Kernel}};
int oh_step = UP_DIV(conv_param->output_h_, conv_param->thread_num_);
int oh_start = oh_step * task_id;
int oh_end = MSMIN(oh_start + oh_step, conv_param->output_h_);
for (int b = 0; b < conv_param->output_batch_; b++) {
for (int oh = oh_start; oh < oh_end; ++oh) {
float *dst_oh = output_data + oh * sw_param->out_h_step_;
float *dst_oh = output_data + oh * out_h_step;
const float *src_h = input_data + in_start + (oh - out_top) * in_sh_step;
int oc_block = 0;
const float *bias = bias_data;
for (int oc = 0; oc < oc_num; oc += oc_block) {
oc_block = MSMIN(C4NUM, oc_num - oc); // 4 3 2 1
int oc_step = oc * oc_tile_;
const float *weight = weight_data + oc * sw_param->kernel_step_;
const float *weight = weight_data + oc * kernel_step;
if (bias != NULL) {
bias = bias_data + oc_step;
@ -144,7 +144,8 @@ kernel::InnerKernel *ConvolutionDelegateCPUKernel::CpuConvFp32KernelSelect() {
if (conv_param->kernel_h_ == 1 && conv_param->kernel_w_ == 1) {
if (conv_param->pad_d_ == 0 && conv_param->pad_l_ == 0 && conv_param->pad_r_ == 0 && conv_param->pad_u_ == 0 &&
conv_param->stride_h_ == 1 && conv_param->stride_w_ == 1 && conv_param->input_channel_ % 8 == 0) {
conv_param->stride_h_ == 1 && conv_param->stride_w_ == 1 && conv_param->input_channel_ % 8 == 0 &&
(conv_param->input_w_ * conv_param->input_h_ >= conv_param->thread_num_)) {
kernel = new (std::nothrow) kernel::ConvolutionSWCPUKernel(
op_parameter_, in_tensors_, out_tensors_, static_cast<const lite::InnerContext *>(this->context_),
origin_weight_, origin_bias_);
@ -165,21 +166,22 @@ kernel::InnerKernel *ConvolutionDelegateCPUKernel::CpuConvFp32KernelSelect() {
op_parameter_, in_tensors_, out_tensors_, static_cast<const lite::InnerContext *>(this->context_), out_unit,
origin_weight_, origin_bias_);
} else {
if (conv_param->input_channel_ / op_parameter_->thread_num_ > 64) {
if (conv_param->input_channel_ / op_parameter_->thread_num_ > 64 ||
conv_param->input_h_ < conv_param->thread_num_) {
kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(
op_parameter_, in_tensors_, out_tensors_, static_cast<const lite::InnerContext *>(this->context_),
origin_weight_, origin_bias_);
} else {
kernel = new (std::nothrow) kernel::ConvolutionSWCPUKernel(
op_parameter_, in_tensors_, out_tensors_, static_cast<const lite::InnerContext *>(this->context_),
origin_weight_, origin_bias_);
kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(
op_parameter_, in_tensors_, out_tensors_, static_cast<const lite::InnerContext *>(this->context_),
origin_weight_, origin_bias_);
kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(op_parameter_, in_tensors_, out_tensors_,
static_cast<const lite::InnerContext *>(this->context_),
origin_weight_, origin_bias_);
