!6768 [MS][LITE]optimize fp16 common conv preprocess

Merge pull request !6768 from fuzhiye/tmp
mindspore-ci-bot 2020-09-23 16:45:53 +08:00 committed by Gitee
commit fcaa5bc859
8 changed files with 56 additions and 69 deletions


@@ -344,7 +344,6 @@ void ConvFp16(float16_t *input_data, float16_t *packed_input, float16_t *packed_
int channel_block = UP_DIV(in_channel, C4NUM);
int kernel_plane = kernel_h * kernel_w;
int unit_size = kernel_plane * channel_block * C4NUM;
int packed_input_size = output_tile_count * tile_n * unit_size;
// we accumulate 4 channels per time for input blocks
int ic4 = UP_DIV(in_channel, C4NUM);
@@ -355,11 +354,10 @@ void ConvFp16(float16_t *input_data, float16_t *packed_input, float16_t *packed_
for (int b = 0; b < in_batch; b++) {
int in_batch_offset = b * ic4 * C4NUM * in_h * in_w;
int out_batch_offset = b * out_channel * out_h * out_w;
int gemm_in_batch_offset = b * packed_input_size;
for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) {
int start_index = thread_id * tile_n;
int real_cal_num = (output_count - start_index) < tile_n ? (output_count - start_index) : tile_n;
float16_t *gemm_input = (float16_t *)(packed_input + thread_id * unit_size * tile_n + gemm_in_batch_offset);
float16_t *gemm_input = (float16_t *)(packed_input + task_id * unit_size * tile_n);
Im2ColPackUnitFp16(input_data + in_batch_offset, conv_param, gemm_input, real_cal_num, start_index);
int out_offset = thread_id * tile_n * out_channel + out_batch_offset;
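
The two hunks above are the core of the change: the im2col scratch slice is now indexed only by the worker's task_id, so every tile a thread processes reuses the same slice instead of addressing a buffer large enough for every tile of every batch (the gemm_in_batch_offset and packed_input_size terms disappear). A minimal sketch of the new offset calculation, with the old formula kept as a comment:

#include <stddef.h>

/* Offset (in float16 elements) of the scratch slice one worker thread reuses. */
size_t PackedInputSliceOffset(int task_id, int unit_size, int tile_n) {
  /* before: thread_id * unit_size * tile_n + b * packed_input_size        */
  /* after:  task_id * unit_size * tile_n  (one slice per thread, reused)  */
  return (size_t)task_id * unit_size * tile_n;
}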


@@ -55,23 +55,24 @@ void Im2ColPackUnitFp16(float16_t *input_data, ConvParameter *conv_param, float1
int in_w = conv_param->input_w_;
int out_w = conv_param->output_w_;
int ic4 = UP_DIV(in_channel, 4);
int ic4_minus = in_channel / 4;
memset(packed_input, 0, kernel_w * kernel_h * ic4 * C4NUM * 16 * sizeof(float16_t));
for (int i = 0; i < real_cal_num; i++) {
int block_start = block_index + i;
int input_h = block_start / out_w * stride_h - pad_h;
int input_w = block_start % out_w * stride_w - pad_w;
int input_stride = input_h * in_w * ic4 * C4NUM + input_w * ic4 * C4NUM;
int input_stride = (input_h * in_w + input_w) * in_channel;
int kh_s = MSMAX(0, UP_DIV(-input_h, dilation_h));
int kh_e = MSMIN(kernel_h, UP_DIV(in_h - input_h, dilation_h));
int kw_s = MSMAX(0, UP_DIV(-input_w, dilation_w));
int kw_e = MSMIN(kernel_w, UP_DIV(in_w - input_w, dilation_w));
for (int j = kh_s; j < kh_e; j++) {
int input_y_stride = j * dilation_h * in_w * ic4 * C4NUM + input_stride;
int input_y_stride = j * dilation_h * in_w * in_channel + input_stride;
for (int n = kw_s; n < kw_e; n++) {
int input_x_stride = input_y_stride + n * dilation_w * ic4 * C4NUM;
int input_x_stride = input_y_stride + n * dilation_w * in_channel;
int input_plane_offset = (j * kernel_w + n) * 16 * C4NUM * ic4 + i * C4NUM;
for (int m = 0; m < ic4; m++) {
for (int m = 0; m < ic4_minus; m++) {
int channel_block_stride = input_x_stride + m * C4NUM;
int channel_block_offset = input_plane_offset + m * 16 * C4NUM;
#ifdef ENABLE_ARM64
@@ -82,9 +83,15 @@ void Im2ColPackUnitFp16(float16_t *input_data, ConvParameter *conv_param, float1
}
#endif
} // channel_block loop
} // kernel_w loop
} // kernel_h loop
} // tile num loop
int ic_res = in_channel - ic4_minus * C4NUM;
for (int l = 0; l < ic_res; ++l) {
int channel_block_stride = input_x_stride + ic4_minus * C4NUM + l;
int channel_block_offset = input_plane_offset + ic4_minus * 16 * C4NUM + l;
packed_input[channel_block_offset] = input_data[channel_block_stride];
}
} // kernel_w loop
} // kernel_h loop
} // tile num loop
}
void PackWeightFp16(float16_t *weight_data, ConvParameter *conv_param, float16_t *packed_weight) {
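
With the input now read in plain NHWC (stride in_channel rather than ic4 * C4NUM), the packer handles the in_channel % 4 remainder explicitly instead of relying on a pre-padded NHWC4 buffer. A standalone sketch of that tail copy; the flat offsets and the use of float are illustrative, not the kernel's float16 type or its 16 * C4NUM packing strides:

#define C4NUM 4

void CopyChannelTail(const float *src_pixel, float *packed_block, int in_channel) {
  int ic4_minus = in_channel / C4NUM;           /* channels covered by the 4-wide loop */
  int ic_res = in_channel - ic4_minus * C4NUM;  /* 0..3 leftover channels */
  for (int l = 0; l < ic_res; ++l) {
    /* leftover lanes are copied scalar by scalar; the unused lanes of the
       last 4-lane group keep the zeros written by the earlier memset */
    packed_block[ic4_minus * C4NUM + l] = src_pixel[ic4_minus * C4NUM + l];
  }
}
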
@@ -334,7 +341,8 @@ void PackNHWCToNCHWFp16(const void *src, void *dst, int batches, int plane, int
"st1 {v27.8h}, [x11], %[dstStride]\n"
"st1 {v31.8h}, [x10], %[dstStride]\n"
:
: [ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ srcStride ] "r"(srcStride), [ dstStride ] "r"(dstStride)
:
[ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ srcStride ] "r"(srcStride), [ dstStride ] "r"(dstStride)
: "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
"v30", "v31");


@@ -78,6 +78,7 @@ void PackWeightInt8(int8_t *weight_data, ConvParameter *conv_param, int8_t *pack
int plane_c4 = UP_DIV(kernel_plane, C4NUM);
int pack_weight_size = oc4 * C4NUM * ic4 * C4NUM * plane_c4 * C4NUM;
int block_size = pack_weight_size / oc4;
QuantArg *filter_args = conv_param->conv_quant_arg_.filter_quant_args_;
for (int m = 0; m < kernel_plane; m++) {
int kernel_plane_stride = m * in_channel;
@@ -101,7 +102,13 @@ void PackWeightInt8(int8_t *weight_data, ConvParameter *conv_param, int8_t *pack
int8_t *origin_data_ptr = weight_data + kernel_block_stride + k * kernel_plane * in_channel;
int8_t *packed_data_ptr = packed_weight + packed_kernel_block_size + k * C4NUM * C4NUM;
*packed_data_ptr = origin_data_ptr[0];
weight_sum[j * C4NUM + k] += (int32_t)packed_data_ptr[0];
int32_t f_zp;
if (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL) {
f_zp = filter_args[j * C4NUM + k].zp_;
} else {
f_zp = filter_args[0].zp_;
}
weight_sum[j * C4NUM + k] += (int32_t)(packed_data_ptr[0] - f_zp);
}
} // kernel block loop
} // inchannel block loop
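
The new f_zp lookup mirrors the quantization scheme: with per-channel filter quantization each output channel has its own zero point, otherwise the single per-tensor value is used. A minimal sketch of that selection, with struct and flag names simplified from the diff:

#include <stdint.h>

typedef struct {
  int32_t zp_;
} QuantArgSketch;

int32_t FilterZeroPoint(const QuantArgSketch *filter_args, int out_channel_index,
                        uint8_t per_channel_flags, uint8_t filter_per_channel_mask) {
  if (per_channel_flags & filter_per_channel_mask) {
    return filter_args[out_channel_index].zp_;  /* one zero point per output channel */
  }
  return filter_args[0].zp_;                    /* single per-tensor zero point */
}
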
@@ -121,6 +128,7 @@ void PackWeightInt8Opt(int8_t *weight_data, ConvParameter *conv_param, int8_t *p
int pack_weight_size = oc4 * ic4 * C4NUM * C4NUM * kernel_plane;
int unit_size = C4NUM * C4NUM;
int block_size = pack_weight_size / oc4;
QuantArg *filter_args = conv_param->conv_quant_arg_.filter_quant_args_;
for (int m = 0; m < kernel_plane; m++) {
int kernel_plane_stride = m * in_channel;
@@ -142,7 +150,13 @@ void PackWeightInt8Opt(int8_t *weight_data, ConvParameter *conv_param, int8_t *p
int8_t *origin_data_ptr = weight_data + kernel_block_stride + k * kernel_plane * in_channel;
int8_t *packed_data_ptr = packed_weight + packed_kernel_block_size + k * C4NUM;
*packed_data_ptr = origin_data_ptr[0];
weight_sum[j * C4NUM + k] += (int32_t)(packed_data_ptr[0]);
int32_t f_zp;
if (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL) {
f_zp = filter_args[j * C4NUM + k].zp_;
} else {
f_zp = filter_args[0].zp_;
}
weight_sum[j * C4NUM + k] += (int32_t)(packed_data_ptr[0] - f_zp);
}
} // kernel block loop
} // inchannel block loop
@@ -400,6 +414,9 @@ void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real
packed_input[channel_block_offset] = input_data[channel_block_stride];
input_accumulator += (packed_input + channel_block_offset)[0];
}
for (int l = 0; l < (C4NUM - ic_res); l++) {
input_accumulator += conv_param->conv_quant_arg_.input_quant_args_[0].zp_;
}
} // kernel_w loop
} // kernel_h loop
if (!(conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC)) {
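
The added loop counts the unused lanes of the last 4-lane group as if they held the input zero point, so the row accumulator covers all padded lanes of the block the int8 GEMM reads. A sketch of that compensation, assuming ic_res is the number of real channels in the last group:

#include <stdint.h>

#define C4NUM 4

int32_t PaddedLaneCompensation(int in_channel, int32_t input_zp) {
  int ic_res = in_channel % C4NUM;           /* real channels in the last group */
  int pad_lanes = (C4NUM - ic_res) % C4NUM;  /* 0 when in_channel is a multiple of 4 */
  return (int32_t)pad_lanes * input_zp;      /* added to input_accumulator */
}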


@@ -84,53 +84,29 @@ int ConvolutionFP16CPUKernel::InitWeightBias() {
}
int ConvolutionFP16CPUKernel::InitTmpBuffer() {
int in_batch = conv_param_->input_batch_;
int in_channel = conv_param_->input_channel_;
int out_channel = conv_param_->output_channel_;
int channel_block = UP_DIV(in_channel, C4NUM);
int cal_num = 16;
int output_count = conv_param_->output_h_ * conv_param_->output_w_;
int output_tile_count = UP_DIV(output_count, cal_num);
int kernel_plane = conv_param_->kernel_h_ * conv_param_->kernel_w_;
int unit_size = kernel_plane * channel_block * C4NUM;
int packed_input_size = output_tile_count * cal_num * unit_size;
int packed_input_size = thread_count_ * cal_num * unit_size;
packed_input_ =
reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(in_batch * packed_input_size * sizeof(float16_t)));
packed_input_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(packed_input_size * sizeof(float16_t)));
if (packed_input_ == nullptr) {
MS_LOG(ERROR) << "malloc packed_input_ failed.";
return RET_ERROR;
}
size_t nhwc4_input_size =
channel_block * C4NUM * in_batch * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
nhwc4_input_ = ctx_->allocator->Malloc(nhwc4_input_size);
if (nhwc4_input_ == nullptr) {
MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
return RET_ERROR;
}
tmp_output_block_ =
reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(thread_count_ * cal_num * out_channel * sizeof(float16_t)));
if (tmp_output_block_ == nullptr) {
MS_LOG(ERROR) << "malloc tmp_output_block_ failed.";
return RET_ERROR;
}
return RET_OK;
}
void ConvolutionFP16CPUKernel::ConfigInputOutput() {
auto input_tensor = in_tensors_.at(kInputIndex);
auto input_format = input_tensor->GetFormat();
schema::Format execute_format = schema::Format::Format_NHWC4;
convert_func_ = LayoutTransformFp16(input_format, execute_format);
if (convert_func_ == nullptr) {
MS_LOG(ERROR) << "layout convert func is nullptr.";
return;
}
}
int ConvolutionFP16CPUKernel::Init() {
auto ret = InitWeightBias();
if (ret != RET_OK) {
@@ -140,7 +116,6 @@ int ConvolutionFP16CPUKernel::Init() {
if (!InferShapeDone()) {
return RET_OK;
}
ConfigInputOutput();
return ReSize();
}
@@ -160,8 +135,8 @@ int ConvolutionFP16CPUKernel::ReSize() {
}
int ConvolutionFP16CPUKernel::RunImpl(int task_id) {
ConvFp16(reinterpret_cast<float16_t *>(nhwc4_input_), packed_input_, packed_weight_,
reinterpret_cast<float16_t *>(bias_data_), tmp_output_block_, execute_output_, task_id, conv_param_);
ConvFp16(execute_input_, packed_input_, packed_weight_, reinterpret_cast<float16_t *>(bias_data_), tmp_output_block_,
execute_output_, task_id, conv_param_);
return RET_OK;
}
@@ -194,12 +169,6 @@ int ConvolutionFP16CPUKernel::Run() {
return RET_ERROR;
}
int in_batch = conv_param_->input_batch_;
int in_h = conv_param_->input_h_;
int in_w = conv_param_->input_w_;
int in_channel = conv_param_->input_channel_;
convert_func_(reinterpret_cast<void *>(execute_input_), nhwc4_input_, in_batch, in_h * in_w, in_channel);
int error_code = ParallelLaunch(this->context_->thread_pool_, ConvolutionFp16Impl, this, thread_count_);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "conv fp16 error error_code[" << error_code << "]";
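
Taken together, the fp16 kernel no longer converts the input to NHWC4 up front (nhwc4_input_, convert_func_ and ConfigInputOutput are removed, and ConvFp16 reads execute_input_ directly), and the im2col scratch is sized thread_count_ * cal_num * unit_size instead of in_batch * output_tile_count * cal_num * unit_size. A rough sketch of the footprint difference, with purely illustrative shapes:

#include <stdio.h>

#define UP_DIV(x, y) (((x) + (y) - 1) / (y))
#define C4NUM 4

int main(void) {
  /* hypothetical shapes, only to show the order-of-magnitude difference */
  int in_channel = 32, kernel_h = 3, kernel_w = 3;
  int out_h = 56, out_w = 56, cal_num = 16, thread_count = 4;

  int unit_size = kernel_h * kernel_w * UP_DIV(in_channel, C4NUM) * C4NUM;
  int output_tile_count = UP_DIV(out_h * out_w, cal_num);

  size_t before = (size_t)output_tile_count * cal_num * unit_size;  /* old: all tiles of a batch */
  size_t after = (size_t)thread_count * cal_num * unit_size;        /* new: one tile per thread  */
  printf("packed_input fp16 elements: before=%zu, after=%zu\n", before, after);
  return 0;
}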


@@ -46,14 +46,9 @@ class ConvolutionFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
int RunImpl(int task_id);
int InitWeightBias();
int InitTmpBuffer();
void ConfigInputOutput();
private:
void FreeTmpBuffer() {
if (nhwc4_input_ != nullptr) {
ctx_->allocator->Free(nhwc4_input_);
nhwc4_input_ = nullptr;
}
if (packed_input_ != nullptr) {
ctx_->allocator->Free(packed_input_);
packed_input_ = nullptr;


@@ -236,12 +236,8 @@ kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector<lite::Tensor *> &
if (kernel_h == 1 && kernel_w == 1) {
kernel = new (std::nothrow) kernel::Convolution1x1CPUKernel(op_parameter, inputs, outputs, ctx, primitive);
} else if (use_winograd) {
if (kernel_h == 3 && kernel_w == 3 && out_unit == 2) {
kernel = new (std::nothrow) kernel::Convolution3x3CPUKernel(op_parameter, inputs, outputs, ctx, primitive);
} else {
kernel = new (std::nothrow)
kernel::ConvolutionWinogradCPUKernel(op_parameter, inputs, outputs, ctx, primitive, out_unit);
}
kernel =
new (std::nothrow) kernel::ConvolutionWinogradCPUKernel(op_parameter, inputs, outputs, ctx, primitive, out_unit);
} else {
kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, primitive);
}


@@ -89,7 +89,13 @@ int ConvolutionInt8CPUKernel::InitWeightBias() {
MS_LOG(ERROR) << "malloc weight_sum failed.";
return RET_ERROR;
}
for (int i = 0; i < output_channel; i++) weight_sum[i] = 0;
for (int i = 0; i < output_channel; i++) {
if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) {
weight_sum[i] = ic4 * C4NUM * kernel_plane * filter_arg[i].zp_;
} else {
weight_sum[i] = ic4 * C4NUM * kernel_plane * filter_arg[0].zp_;
}
}
PackWeightInt8(origin_weight, conv_param_, packed_weight_, weight_sum);
// init bias
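
Pre-initializing weight_sum[i] to ic4 * C4NUM * kernel_plane * zp and then accumulating (w - zp) during packing makes the padded weight lanes count as the zero point, analogous to the input-side padding compensation above. A worked sketch of the resulting sum, with illustrative names:

#include <stdint.h>

int32_t WeightSumSketch(const int8_t *weights, int real_count, int padded_count, int32_t zp) {
  int32_t sum = padded_count * zp;    /* init in InitWeightBias() */
  for (int k = 0; k < real_count; ++k) {
    sum += (int32_t)weights[k] - zp;  /* accumulation in PackWeightInt8() */
  }
  return sum;  /* equals sum(weights) + (padded_count - real_count) * zp */
}
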
@@ -190,7 +196,13 @@ int ConvolutionInt8CPUKernel::InitWeightBiasOpt() {
MS_LOG(ERROR) << "malloc weight_sum failed.";
return RET_ERROR;
}
for (int i = 0; i < output_channel; i++) weight_sum[i] = 0;
for (int i = 0; i < output_channel; i++) {
if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) {
weight_sum[i] = ic4 * C4NUM * kernel_plane * filter_arg[i].zp_;
} else {
weight_sum[i] = ic4 * C4NUM * kernel_plane * filter_arg[0].zp_;
}
}
PackWeightInt8Opt(origin_weight, conv_param_, packed_weight_, weight_sum);
// init bias
@@ -261,14 +273,7 @@ int ConvolutionInt8CPUKernel::InitTmpBufferOpt() {
return RET_OK;
}
void ConvolutionInt8CPUKernel::ConfigInputOutput() {
auto output_tensor = out_tensors_.at(kOutputIndex);
output_tensor->SetFormat(schema::Format::Format_NHWC);
}
int ConvolutionInt8CPUKernel::Init() {
// config input output
ConfigInputOutput();
CheckSupportOptimize();
auto ret = SetQuantParam();
if (ret != RET_OK) {


@@ -51,7 +51,6 @@ class ConvolutionInt8CPUKernel : public ConvolutionBaseCPUKernel {
int InitTmpBufferOpt();
int InitWeightBias();
int InitTmpBuffer();
void ConfigInputOutput();
private:
void FreeTmpBuffer() {