!19547 [MS][LITE][CPU] x86 thread partitioning optimization
Merge pull request !19547 from liuzhongkai/thread2
This commit is contained in:
commit cdeb8e86d0
@@ -20,6 +20,15 @@
 // sliding window to compute 1x1 conv in x86
 void Conv1x1SWFp32(const float *input_data, const float *packed_weight, const float *bias_data, float *output_data,
                    int task_id, ConvParameter *conv_param, SlidingWindowParam *sw_param) {
+  int output_w = conv_param->output_w_;
+  int output_h = conv_param->output_h_;
+  int ohw = output_h * output_w;
+  int ohw_step = UP_DIV(ohw, conv_param->thread_num_);
+  int ohw_start = ohw_step * task_id;
+  int ohw_end = MSMIN(ohw_start + ohw_step, ohw);
+  if (ohw_start >= ohw_end) {
+    return;
+  }
   int oc_tile_ = C8NUM;  // oc aligned to C8NUM in x86_64_avx
   int act_type = 0;
   if (conv_param->act_type_ == ActType_Relu6) {
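The hunk above switches the 1x1 convolution to a per-pixel thread split: the oh * ow output pixels are divided into ceiling-sized chunks, and any surplus thread returns immediately. Below is a minimal, self-contained sketch of that range arithmetic; the UP_DIV and MSMIN definitions are illustrative stand-ins for the nnacl macros, not copied from the repository.

#include <stdio.h>

/* Illustrative stand-ins for the nnacl macros used in the diff. */
#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
#define MSMIN(x, y) ((x) < (y) ? (x) : (y))

int main(void) {
  int output_h = 7, output_w = 9, thread_num = 4;
  int ohw = output_h * output_w;           /* 63 output pixels in total */
  int ohw_step = UP_DIV(ohw, thread_num);  /* 16 pixels per thread (ceiling) */
  for (int task_id = 0; task_id < thread_num; ++task_id) {
    int ohw_start = ohw_step * task_id;
    int ohw_end = MSMIN(ohw_start + ohw_step, ohw);
    if (ohw_start >= ohw_end) continue;    /* surplus threads get no work */
    printf("task %d handles pixels [%d, %d)\n", task_id, ohw_start, ohw_end);
  }
  return 0;
}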
@@ -28,8 +37,6 @@ void Conv1x1SWFp32(const float *input_data, const float *packed_weight, const fl
   if (conv_param->act_type_ == ActType_Relu || conv_param->act_type_ == ActType_Relu6) {
     act_type += 2;
   }
-  int output_w = conv_param->output_w_;
-  int output_h = conv_param->output_h_;
   int pad_d = conv_param->pad_d_;
   int pad_l = conv_param->pad_l_;
   int pad_r = conv_param->pad_r_;
@@ -43,10 +50,6 @@ void Conv1x1SWFp32(const float *input_data, const float *packed_weight, const fl
   int oc_num = sw_param->c_block_;
   int in_step = sw_param->in_step_;
   int out_step = sw_param->out_step_;
-  int ohw = output_h * output_w;
-  int ohw_step = UP_DIV(ohw, conv_param->thread_num_);
-  int ohw_start = ohw_step * task_id;
-  int ohw_end = MSMIN(ohw_start + ohw_step, ohw);
   const int ow_block_num[4] = {12, 6, 4, 3};
   const Conv1x1SWKernel kernel[4][2] = {{Conv1x1SW1x8Kernel, Conv1x1SW12x8Kernel},
                                         {Conv1x1SW1x16Kernel, Conv1x1SW6x16Kernel},
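The ow_block_num / kernel tables pair each output-channel width (1 to 4 blocks of 8 floats) with a one-column fallback kernel and a full-width kernel; the wider the channel tile, the fewer output columns fit in the AVX register file, hence 12/6/4/3. A sketch of the selection arithmetic with hypothetical stub kernels; only the indexing mirrors the diff.

#include <stdio.h>

typedef void (*Conv1x1SWKernel)(int ow_block);

/* Hypothetical stubs standing in for the real AVX microkernels. */
static void K1x8(int ow_block) { printf("1x8 kernel, ow_block=%d\n", ow_block); }
static void K12x8(int ow_block) { printf("12x8 kernel, ow_block=%d\n", ow_block); }

int main(void) {
  const int ow_block_num[4] = {12, 6, 4, 3};
  const Conv1x1SWKernel kernel[4][2] = {{K1x8, K12x8}};  /* 16/24/32-wide rows elided */
  int oc_block = 1;                       /* one block of 8 output channels */
  int full = ow_block_num[oc_block - 1];  /* 12 output columns per full call */
  /* ow_block / full is 1 only for a full tile, selecting the wide kernel;
     any partial tile is processed one column at a time by the 1x8 stub. */
  kernel[oc_block - 1][full / full](full);
  kernel[oc_block - 1][1 / full](1);
  return 0;
}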
@@ -133,6 +133,13 @@ void SWBorder(float *dst, const float *src, const float *weight, const float *bi
 // fp32 sliding window common conv
 void ConvSWFp32(const float *input_data, const float *packed_weight, const float *bias_data, float *output_data,
                 int task_id, ConvParameter *conv_param, SlidingWindowParam *sw_param) {
+  int out_h = conv_param->output_h_;
+  int oh_step = UP_DIV(out_h, conv_param->thread_num_);
+  int oh_start = oh_step * task_id;
+  int oh_end = MSMIN(oh_start + oh_step, out_h);
+  if (oh_start >= oh_end) {
+    return;
+  }
   int oc_tile_ = C8NUM;  // oc aligned to C8NUM in x86_64_avx
   int act_type = 0;
   if (conv_param->act_type_ == ActType_Relu6) {
@@ -148,56 +155,75 @@ void ConvSWFp32(const float *input_data, const float *packed_weight, const float
   int in_sw_step = sw_param->in_sw_step_;
   int in_kw_step = sw_param->in_kw_step_;
   int in_kh_step = sw_param->in_kh_step_;
+  int in_sh_step = sw_param->in_sh_step_;
+  int out_h_step = sw_param->out_h_step_;
+  int kernel_step = sw_param->kernel_step_;
+  int in_step = sw_param->in_step_;
+  int out_step = sw_param->out_step_;
+  int c_block = sw_param->c_block_;
+  int top = sw_param->top_;
+  int left = sw_param->left_;
+  int right = sw_param->right_;
+  int bottom = sw_param->bottom_;
+  int block_channel = sw_param->block_channel_;
+  int stride_h = conv_param->stride_h_;
+  int stride_w = conv_param->stride_w_;
+  int out_w = conv_param->output_w_;
+  int pad_u = conv_param->pad_u_;
+  int pad_l = conv_param->pad_l_;
+  int in_h_step = sw_param->in_h_step_;
+  int out_batch = conv_param->output_batch_;
+  int in_h_start = top * stride_h - pad_u;
+  int in_w_start = left * stride_w - pad_l;
+  int center_step = in_h_start * in_h_step + in_w_start * ic_algin;
   const int ow_block_num[4] = {12, 6, 4, 3};
   const SWConvKernel kernel[4][2] = {{SWConv1x8Kernel, SWConv12x8Kernel},
                                      {SWConv1x16Kernel, SWConv6x16Kernel},
                                      {SWConv1x24Kernel, SWConv4x24Kernel},
                                      {SWConv1x32Kernel, SWConv3x32Kernel}};
-  for (int b = 0; b < conv_param->output_batch_; b++) {
-    for (int oh = task_id; oh < conv_param->output_h_; oh += conv_param->thread_num_) {
-      float *dst_oh = output_data + oh * sw_param->out_h_step_;
-      int in_h_start = sw_param->top_ * conv_param->stride_h_ - conv_param->pad_u_;
-      int in_w_start = sw_param->left_ * conv_param->stride_w_ - conv_param->pad_l_;
-      const float *src_h = input_data + in_h_start * sw_param->in_h_step_ + in_w_start * sw_param->ic_align_;
+  for (int b = 0; b < out_batch; b++) {
+    for (int oh = oh_start; oh < oh_end; oh += 1) {
+      float *dst_oh = output_data + oh * out_h_step;
+      const float *src_h = input_data + center_step;
 
       int oc_block = 0;
       const float *bias = bias_data;
-      for (int oc = 0; oc < sw_param->c_block_; oc += oc_block) {
-        oc_block = MSMIN(C4NUM, sw_param->c_block_ - oc);  // 4 3 2 1
-        const float *weight = packed_weight + oc * sw_param->kernel_step_;
+      for (int oc = 0; oc < c_block; oc += oc_block) {
+        oc_block = MSMIN(C4NUM, c_block - oc);  // 4 3 2 1
+        const float *weight = packed_weight + oc * kernel_step;
         if (bias != NULL) {
           bias = bias_data + oc * oc_tile_;
         }
         float *dst_w = dst_oh + oc * oc_tile_;
         const SWConvKernel kernel_border = kernel[oc_block - 1][0];
-        if (oh < sw_param->top_ || oh >= sw_param->bottom_) {  // oh in up or down border
-          SWBorder(dst_w, input_data, weight, bias, oh, oh + 1, 0, conv_param->output_w_, conv_param, sw_param,
-                   kernel_border, act_type, 1, oc_block);
+        if (oh < top || oh >= bottom) {  // oh in up or down border
+          SWBorder(dst_w, input_data, weight, bias, oh, oh + 1, 0, out_w, conv_param, sw_param, kernel_border, act_type,
+                   1, oc_block);
         } else {  // oh in center
           // ow in left border
-          SWBorder(dst_w, input_data, weight, bias, oh, oh + 1, 0, sw_param->left_, conv_param, sw_param, kernel_border,
-                   act_type, 1, oc_block);
+          SWBorder(dst_w, input_data, weight, bias, oh, oh + 1, 0, left, conv_param, sw_param, kernel_border, act_type,
+                   1, oc_block);
           // ow in center
-          const float *src_w = src_h + (oh - sw_param->top_) * sw_param->in_sh_step_;
-          int ow_block = ow_block_num[oc_block - 1];  // 12 6 4 3
-          for (int ow = sw_param->left_; ow < sw_param->right_; ow += ow_block) {  // left ~ right
-            ow_block = MSMIN(ow_block, sw_param->right_ - ow);
+          const float *src_w = src_h + (oh - top) * in_sh_step;
+          int ow_block = ow_block_num[oc_block - 1];  // 12 6 4 3
+          for (int ow = left; ow < right; ow += ow_block) {  // left ~ right
+            ow_block = MSMIN(ow_block, right - ow);
             if (ow_block < ow_block_num[oc_block - 1]) {  // remaining ow too small for a full block, process one ow at a time
               ow_block = 1;
             }
             kernel[oc_block - 1][ow_block / ow_block_num[oc_block - 1]](
-              dst_w + ow * sw_param->block_channel_, src_w, weight, bias, kernel_h, kernel_w, act_type, ow_block,
-              oc_block, oc_algin, ic_algin, in_kw_step, in_kh_step, in_sw_step, 0);
+              dst_w + ow * block_channel, src_w, weight, bias, kernel_h, kernel_w, act_type, ow_block, oc_block,
+              oc_algin, ic_algin, in_kw_step, in_kh_step, in_sw_step, 0);
             src_w += ow_block * in_sw_step;
           }
           // ow in right border
-          SWBorder(dst_w, input_data, weight, bias, oh, oh + 1, sw_param->right_, conv_param->output_w_, conv_param,
-                   sw_param, kernel_border, act_type, 1, oc_block);
+          SWBorder(dst_w, input_data, weight, bias, oh, oh + 1, right, out_w, conv_param, sw_param, kernel_border,
+                   act_type, 1, oc_block);
        }
      }
    }  // output h loop
-    input_data += sw_param->in_step_;
-    output_data += sw_param->out_step_;
+    input_data += in_step;
+    output_data += out_step;
  }  // batch loop
 }
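The core of this hunk is the partitioning change: the old loop strided rows across threads (task_id, task_id + thread_num, ...), while the new one gives each thread a single contiguous band [oh_start, oh_end), keeping a thread's reads and writes adjacent in memory; the rest hoists sw_param/conv_param fields into locals. A before/after sketch with made-up sizes, again using illustrative macro stand-ins:

#include <stdio.h>

#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))  /* illustrative stand-ins */
#define MSMIN(x, y) ((x) < (y) ? (x) : (y))

int main(void) {
  int output_h = 10, thread_num = 3, task_id = 1;
  printf("old (strided):    ");
  for (int oh = task_id; oh < output_h; oh += thread_num) printf("%d ", oh);  /* 1 4 7 */
  int oh_step = UP_DIV(output_h, thread_num);  /* 4 rows per thread */
  int oh_start = oh_step * task_id;
  int oh_end = MSMIN(oh_start + oh_step, output_h);
  printf("\nnew (contiguous): ");
  for (int oh = oh_start; oh < oh_end; ++oh) printf("%d ", oh);  /* 4 5 6 7 */
  printf("\n");
  return 0;
}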
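Within each of its rows, ConvSWFp32 still carves the columns into a left border [0, left), a tiled center [left, right), and a right border [right, out_w); once a center tile comes up short of the full microkernel width, it falls back to one column at a time. A sketch with hypothetical bounds:

#include <stdio.h>

#define MSMIN(x, y) ((x) < (y) ? (x) : (y))

int main(void) {
  int out_w = 20, left = 2, right = 17;  /* hypothetical sliding-window bounds */
  int full = 12;                         /* full tile width of the chosen microkernel */
  printf("left border  [0, %d)\n", left);
  int ow_block = full;
  for (int ow = left; ow < right; ow += ow_block) {
    ow_block = MSMIN(ow_block, right - ow);
    if (ow_block < full) ow_block = 1;   /* partial tile -> single-column kernel */
    printf("center tile  [%d, %d)\n", ow, ow + ow_block);
  }
  printf("right border [%d, %d)\n", right, out_w);
  return 0;
}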
@@ -1040,6 +1040,12 @@ void DepthwiseBorderAvxFp32(float *dst, const float *src, const float *weight, c
 
 void DepthwiseSWAvxFp32(float *output_data, const float *input_data, const float *weight_data, const float *bias_data,
                         const ConvParameter *conv_param, const SlidingWindowParam *sw_param, int task_id) {
+  int oh_step = UP_DIV(conv_param->output_h_, conv_param->thread_num_);
+  int oh_start = oh_step * task_id;
+  int oh_end = MSMIN(oh_start + oh_step, conv_param->output_h_);
+  if (oh_start >= oh_end) {
+    return;
+  }
   // depthwise sw in x86 avx instructions
   int oc_tile_ = C8NUM;  // oc aligned to C8NUM in x86_64_avx
   int act_type = 0;
@@ -1064,6 +1070,8 @@ void DepthwiseSWAvxFp32(float *output_data, const float *input_data, const float
   int out_left = sw_param->left_;
   int out_top = sw_param->top_;
   int out_bottom = sw_param->bottom_;
+  int kernel_step = sw_param->kernel_step_;
+  int out_h_step = sw_param->out_h_step_;
   int in_h_start = out_top * conv_param->stride_h_ - conv_param->pad_u_;
   int in_w_start = out_left * conv_param->stride_w_ - conv_param->pad_l_;
   int in_start = in_h_start * sw_param->in_h_step_ + in_w_start * oc_algin;
@@ -1072,19 +1080,16 @@ void DepthwiseSWAvxFp32(float *output_data, const float *input_data, const float
                                          {DepthwiseSW1x16Kernel, DepthwiseSW4x16Kernel},
                                          {DepthwiseSW1x24Kernel, DepthwiseSW4x24Kernel},
                                          {DepthwiseSW1x32Kernel, DepthwiseSW3x32Kernel}};
-  int oh_step = UP_DIV(conv_param->output_h_, conv_param->thread_num_);
-  int oh_start = oh_step * task_id;
-  int oh_end = MSMIN(oh_start + oh_step, conv_param->output_h_);
   for (int b = 0; b < conv_param->output_batch_; b++) {
     for (int oh = oh_start; oh < oh_end; ++oh) {
-      float *dst_oh = output_data + oh * sw_param->out_h_step_;
+      float *dst_oh = output_data + oh * out_h_step;
       const float *src_h = input_data + in_start + (oh - out_top) * in_sh_step;
       int oc_block = 0;
       const float *bias = bias_data;
       for (int oc = 0; oc < oc_num; oc += oc_block) {
         oc_block = MSMIN(C4NUM, oc_num - oc);  // 4 3 2 1
         int oc_step = oc * oc_tile_;
-        const float *weight = weight_data + oc * sw_param->kernel_step_;
+        const float *weight = weight_data + oc * kernel_step;
         if (bias != NULL) {
           bias = bias_data + oc_step;
         }
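Both the common and the depthwise loops walk output channels the same way: weights are packed in units of 8 floats (C8NUM, one ymm register), and each kernel invocation covers up to 4 such units (C4NUM). A small sketch of that walk, with the constants written out for illustration:

#include <stdio.h>

#define C4NUM 4
#define MSMIN(x, y) ((x) < (y) ? (x) : (y))  /* illustrative stand-in */

int main(void) {
  int oc_num = 7;    /* 7 blocks of 8 channels = 56 output channels */
  int oc_tile = 8;   /* C8NUM: floats per packed block */
  int oc_block = 0;
  for (int oc = 0; oc < oc_num; oc += oc_block) {
    oc_block = MSMIN(C4NUM, oc_num - oc);  /* 4 blocks, then the 3 remaining */
    printf("kernel covers channels [%d, %d)\n", oc * oc_tile, (oc + oc_block) * oc_tile);
  }
  return 0;
}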
@@ -144,7 +144,8 @@ kernel::InnerKernel *ConvolutionDelegateCPUKernel::CpuConvFp32KernelSelect() {
   if (conv_param->kernel_h_ == 1 && conv_param->kernel_w_ == 1) {
 #ifdef ENABLE_AVX
     if (conv_param->pad_d_ == 0 && conv_param->pad_l_ == 0 && conv_param->pad_r_ == 0 && conv_param->pad_u_ == 0 &&
-        conv_param->stride_h_ == 1 && conv_param->stride_w_ == 1 && conv_param->input_channel_ % 8 == 0) {
+        conv_param->stride_h_ == 1 && conv_param->stride_w_ == 1 && conv_param->input_channel_ % 8 == 0 &&
+        (conv_param->input_w_ * conv_param->input_h_ >= conv_param->thread_num_)) {
       kernel = new (std::nothrow) kernel::ConvolutionSWCPUKernel(
         op_parameter_, in_tensors_, out_tensors_, static_cast<const lite::InnerContext *>(this->context_),
         origin_weight_, origin_bias_);
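The added condition keeps the 1x1 sliding window kernel off cases with fewer output pixels than threads (with stride 1 and no padding, input_w * input_h equals the per-batch output size), since some threads would otherwise receive empty ranges. A sketch of the predicate using a hypothetical parameter struct in place of ConvParameter:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for the relevant ConvParameter fields. */
typedef struct {
  int kernel_h, kernel_w, stride_h, stride_w;
  int pad_u, pad_d, pad_l, pad_r;
  int input_channel, input_w, input_h, thread_num;
} Params;

static bool UseConv1x1SW(const Params *p) {
  return p->kernel_h == 1 && p->kernel_w == 1 && p->pad_d == 0 && p->pad_l == 0 &&
         p->pad_r == 0 && p->pad_u == 0 && p->stride_h == 1 && p->stride_w == 1 &&
         p->input_channel % 8 == 0 && p->input_w * p->input_h >= p->thread_num;
}

int main(void) {
  Params p = {1, 1, 1, 1, 0, 0, 0, 0, 16, 2, 2, 8};
  printf("4 pixels, 8 threads -> SW kernel: %s\n", UseConv1x1SW(&p) ? "yes" : "no");
  p.thread_num = 4;
  printf("4 pixels, 4 threads -> SW kernel: %s\n", UseConv1x1SW(&p) ? "yes" : "no");
  return 0;
}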
@@ -165,21 +166,22 @@ kernel::InnerKernel *ConvolutionDelegateCPUKernel::CpuConvFp32KernelSelect() {
         op_parameter_, in_tensors_, out_tensors_, static_cast<const lite::InnerContext *>(this->context_), out_unit,
         origin_weight_, origin_bias_);
     } else {
-      if (conv_param->input_channel_ / op_parameter_->thread_num_ > 64) {
+#ifdef ENABLE_AVX
+      if (conv_param->input_channel_ / op_parameter_->thread_num_ > 64 ||
+          conv_param->input_h_ < conv_param->thread_num_) {
         kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(
           op_parameter_, in_tensors_, out_tensors_, static_cast<const lite::InnerContext *>(this->context_),
           origin_weight_, origin_bias_);
       } else {
-#ifdef ENABLE_AVX
         kernel = new (std::nothrow) kernel::ConvolutionSWCPUKernel(
           op_parameter_, in_tensors_, out_tensors_, static_cast<const lite::InnerContext *>(this->context_),
           origin_weight_, origin_bias_);
-#else
-        kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(
-          op_parameter_, in_tensors_, out_tensors_, static_cast<const lite::InnerContext *>(this->context_),
-          origin_weight_, origin_bias_);
-#endif
       }
+#else
+      kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(op_parameter_, in_tensors_, out_tensors_,
+                                                               static_cast<const lite::InnerContext *>(this->context_),
+                                                               origin_weight_, origin_bias_);
+#endif
     }
   }
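The restructured selector applies one AVX-only heuristic: fall back to the im2col/GEMM kernel when channels are deep relative to the thread count (input_channel / thread_num > 64) or when there are fewer input rows than threads; otherwise take the sliding window kernel. A minimal model of that choice; the function and return strings are illustrative stand-ins, not the delegate's API:

#include <stdio.h>

static const char *ChooseAvxConvKernel(int input_channel, int input_h, int thread_num) {
  if (input_channel / thread_num > 64 || input_h < thread_num) {
    return "ConvolutionCPUKernel";   /* im2col + GEMM path */
  }
  return "ConvolutionSWCPUKernel";   /* sliding window path */
}

int main(void) {
  printf("%s\n", ChooseAvxConvKernel(512, 14, 4));  /* deep channels -> GEMM */
  printf("%s\n", ChooseAvxConvKernel(32, 2, 4));    /* too few rows  -> GEMM */
  printf("%s\n", ChooseAvxConvKernel(32, 56, 4));   /* otherwise     -> SW   */
  return 0;
}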