forked from mindspore-Ecosystem/mindspore
!6768 [MS][LITE] optimize fp16 common conv preprocess
Merge pull request !6768 from fuzhiye/tmp
commit fcaa5bc859
@@ -344,7 +344,6 @@ void ConvFp16(float16_t *input_data, float16_t *packed_input, float16_t *packed_
   int channel_block = UP_DIV(in_channel, C4NUM);
   int kernel_plane = kernel_h * kernel_w;
   int unit_size = kernel_plane * channel_block * C4NUM;
-  int packed_input_size = output_tile_count * tile_n * unit_size;

   // we accumulate 4 channels per time for input blocks
   int ic4 = UP_DIV(in_channel, C4NUM);
@@ -355,11 +354,10 @@ void ConvFp16(float16_t *input_data, float16_t *packed_input, float16_t *packed_
   for (int b = 0; b < in_batch; b++) {
     int in_batch_offset = b * ic4 * C4NUM * in_h * in_w;
     int out_batch_offset = b * out_channel * out_h * out_w;
-    int gemm_in_batch_offset = b * packed_input_size;
     for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) {
       int start_index = thread_id * tile_n;
       int real_cal_num = (output_count - start_index) < tile_n ? (output_count - start_index) : tile_n;
-      float16_t *gemm_input = (float16_t *)(packed_input + thread_id * unit_size * tile_n + gemm_in_batch_offset);
+      float16_t *gemm_input = (float16_t *)(packed_input + task_id * unit_size * tile_n);
       Im2ColPackUnitFp16(input_data + in_batch_offset, conv_param, gemm_input, real_cal_num, start_index);

       int out_offset = thread_id * tile_n * out_channel + out_batch_offset;
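Note: the gemm_input change above is the core of this hunk. The tile loop's thread_id variable actually walks output tiles, so the old offset gave every tile of every batch its own slot in packed_input (hence the packed_input_size and gemm_in_batch_offset bookkeeping that is deleted here); the new offset is keyed only by task_id, so each worker reuses one fixed slot. A minimal sketch of the two indexing schemes, with made-up sizes (not the library code):

/* Sketch of the im2col scratch-slot indexing before and after the change.
 * Toy sizes; UP_DIV and C4NUM mirror the macros used in the diff. */
#include <stdio.h>

#define UP_DIV(x, y) (((x) + (y)-1) / (y))
#define C4NUM 4

int main(void) {
  int in_channel = 3, kernel_plane = 3 * 3, tile_n = 16;
  int unit_size = kernel_plane * UP_DIV(in_channel, C4NUM) * C4NUM; /* elems per packed tile row */
  int thread_count = 2, output_tile_count = 5;

  for (int task_id = 0; task_id < thread_count; task_id++) {
    for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) {
      int old_offset = thread_id * unit_size * tile_n; /* one slot per output tile */
      int new_offset = task_id * unit_size * tile_n;   /* one slot per worker, reused */
      printf("task %d, tile %d: old slot offset %d, new slot offset %d\n", task_id, thread_id, old_offset, new_offset);
    }
  }
  return 0;
}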
@@ -55,23 +55,24 @@ void Im2ColPackUnitFp16(float16_t *input_data, ConvParameter *conv_param, float1
   int in_w = conv_param->input_w_;
   int out_w = conv_param->output_w_;
   int ic4 = UP_DIV(in_channel, 4);
+  int ic4_minus = in_channel / 4;
   memset(packed_input, 0, kernel_w * kernel_h * ic4 * C4NUM * 16 * sizeof(float16_t));

   for (int i = 0; i < real_cal_num; i++) {
     int block_start = block_index + i;
     int input_h = block_start / out_w * stride_h - pad_h;
     int input_w = block_start % out_w * stride_w - pad_w;
-    int input_stride = input_h * in_w * ic4 * C4NUM + input_w * ic4 * C4NUM;
+    int input_stride = (input_h * in_w + input_w) * in_channel;
     int kh_s = MSMAX(0, UP_DIV(-input_h, dilation_h));
     int kh_e = MSMIN(kernel_h, UP_DIV(in_h - input_h, dilation_h));
     int kw_s = MSMAX(0, UP_DIV(-input_w, dilation_w));
     int kw_e = MSMIN(kernel_w, UP_DIV(in_w - input_w, dilation_w));
     for (int j = kh_s; j < kh_e; j++) {
-      int input_y_stride = j * dilation_h * in_w * ic4 * C4NUM + input_stride;
+      int input_y_stride = j * dilation_h * in_w * in_channel + input_stride;
       for (int n = kw_s; n < kw_e; n++) {
-        int input_x_stride = input_y_stride + n * dilation_w * ic4 * C4NUM;
+        int input_x_stride = input_y_stride + n * dilation_w * in_channel;
         int input_plane_offset = (j * kernel_w + n) * 16 * C4NUM * ic4 + i * C4NUM;
-        for (int m = 0; m < ic4; m++) {
+        for (int m = 0; m < ic4_minus; m++) {
          int channel_block_stride = input_x_stride + m * C4NUM;
          int channel_block_offset = input_plane_offset + m * 16 * C4NUM;
 #ifdef ENABLE_ARM64
@@ -82,9 +83,15 @@ void Im2ColPackUnitFp16(float16_t *input_data, ConvParameter *conv_param, float1
           }
 #endif
         } // channel_block loop
-      } // kernel_w loop
-    } // kernel_h loop
-  } // tile num loop
+        int ic_res = in_channel - ic4_minus * C4NUM;
+        for (int l = 0; l < ic_res; ++l) {
+          int channel_block_stride = input_x_stride + ic4_minus * C4NUM + l;
+          int channel_block_offset = input_plane_offset + ic4_minus * 16 * C4NUM + l;
+          packed_input[channel_block_offset] = input_data[channel_block_stride];
+        }
+      } // kernel_w loop
+    } // kernel_h loop
+  } // tile num loop
 }

 void PackWeightFp16(float16_t *weight_data, ConvParameter *conv_param, float16_t *packed_weight) {
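Note: switching the source addressing from the padded NHWC4 stride (ic4 * C4NUM) to the plain NHWC stride (in_channel) is what lets Im2ColPackUnitFp16 read the original input tensor directly; the 4-channel blocking now happens only inside the packed patch, as whole groups of four channels plus a scalar tail of ic_res channels. A toy sketch of that copy pattern for a single pixel (illustrative only; the real routine works on float16 and interleaves 16 output points per channel block):

/* Copy one pixel's channels from plain NHWC into a C4-padded buffer:
 * bulk copy of whole 4-channel groups, then the remainder one by one. */
#include <stdio.h>
#include <string.h>

#define C4NUM 4

int main(void) {
  const float src[6] = {1, 2, 3, 4, 5, 6};     /* one pixel, in_channel = 6, plain NHWC */
  float dst[8] = {0};                          /* C4-padded destination: 2 blocks of 4 */
  int in_channel = 6;
  int ic4_minus = in_channel / C4NUM;          /* full 4-channel groups: 1 */
  int ic_res = in_channel - ic4_minus * C4NUM; /* leftover channels: 2 */

  for (int m = 0; m < ic4_minus; m++) {
    memcpy(dst + m * C4NUM, src + m * C4NUM, C4NUM * sizeof(float)); /* block copy */
  }
  for (int l = 0; l < ic_res; ++l) {
    dst[ic4_minus * C4NUM + l] = src[ic4_minus * C4NUM + l]; /* scalar tail */
  }

  for (int i = 0; i < 8; i++) {
    printf("%.0f ", dst[i]); /* prints: 1 2 3 4 5 6 0 0 */
  }
  printf("\n");
  return 0;
}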
@@ -334,7 +341,8 @@ void PackNHWCToNCHWFp16(const void *src, void *dst, int batches, int plane, int
       "st1 {v27.8h}, [x11], %[dstStride]\n"
       "st1 {v31.8h}, [x10], %[dstStride]\n"
       :
-      : [ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ srcStride ] "r"(srcStride), [ dstStride ] "r"(dstStride)
+      :
+      [ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ srcStride ] "r"(srcStride), [ dstStride ] "r"(dstStride)
       : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
         "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
         "v30", "v31");
@@ -78,6 +78,7 @@ void PackWeightInt8(int8_t *weight_data, ConvParameter *conv_param, int8_t *pack
   int plane_c4 = UP_DIV(kernel_plane, C4NUM);
   int pack_weight_size = oc4 * C4NUM * ic4 * C4NUM * plane_c4 * C4NUM;
   int block_size = pack_weight_size / oc4;
+  QuantArg *filter_args = conv_param->conv_quant_arg_.filter_quant_args_;

   for (int m = 0; m < kernel_plane; m++) {
     int kernel_plane_stride = m * in_channel;
@@ -101,7 +102,13 @@ void PackWeightInt8(int8_t *weight_data, ConvParameter *conv_param, int8_t *pack
           int8_t *origin_data_ptr = weight_data + kernel_block_stride + k * kernel_plane * in_channel;
           int8_t *packed_data_ptr = packed_weight + packed_kernel_block_size + k * C4NUM * C4NUM;
           *packed_data_ptr = origin_data_ptr[0];
-          weight_sum[j * C4NUM + k] += (int32_t)packed_data_ptr[0];
+          int32_t f_zp;
+          if (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL) {
+            f_zp = filter_args[j * C4NUM + k].zp_;
+          } else {
+            f_zp = filter_args[0].zp_;
+          }
+          weight_sum[j * C4NUM + k] += (int32_t)(packed_data_ptr[0] - f_zp);
         }
       } // kernel block loop
     } // inchannel block loop
@@ -121,6 +128,7 @@ void PackWeightInt8Opt(int8_t *weight_data, ConvParameter *conv_param, int8_t *p
   int pack_weight_size = oc4 * ic4 * C4NUM * C4NUM * kernel_plane;
   int unit_size = C4NUM * C4NUM;
   int block_size = pack_weight_size / oc4;
+  QuantArg *filter_args = conv_param->conv_quant_arg_.filter_quant_args_;

   for (int m = 0; m < kernel_plane; m++) {
     int kernel_plane_stride = m * in_channel;
@@ -142,7 +150,13 @@ void PackWeightInt8Opt(int8_t *weight_data, ConvParameter *conv_param, int8_t *p
           int8_t *origin_data_ptr = weight_data + kernel_block_stride + k * kernel_plane * in_channel;
           int8_t *packed_data_ptr = packed_weight + packed_kernel_block_size + k * C4NUM;
           *packed_data_ptr = origin_data_ptr[0];
-          weight_sum[j * C4NUM + k] += (int32_t)(packed_data_ptr[0]);
+          int32_t f_zp;
+          if (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL) {
+            f_zp = filter_args[j * C4NUM + k].zp_;
+          } else {
+            f_zp = filter_args[0].zp_;
+          }
+          weight_sum[j * C4NUM + k] += (int32_t)(packed_data_ptr[0] - f_zp);
         }
       } // kernel block loop
     } // inchannel block loop
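Note: both PackWeightInt8 and PackWeightInt8Opt now fold the filter zero point into the running weight sum, choosing a per-output-channel zero point when FILTER_PER_CHANNEL is set and the shared one otherwise. A hedged sketch of that selection; QuantParam and PER_CHANNEL_FLAG below are simplified stand-ins rather than the real nnacl definitions:

/* Per-channel vs. per-tensor filter zero-point selection (illustration). */
#include <stdio.h>

typedef struct {
  int zp_;
} QuantParam;

#define PER_CHANNEL_FLAG 0x1

static int filter_zp(const QuantParam *filter_args, int per_channel, int oc_index) {
  /* per-channel: one zero point per output channel; otherwise entry 0 is shared */
  return (per_channel & PER_CHANNEL_FLAG) ? filter_args[oc_index].zp_ : filter_args[0].zp_;
}

int main(void) {
  QuantParam args[3] = {{5}, {7}, {9}};
  printf("per-tensor zp for channel 2: %d\n", filter_zp(args, 0, 2));                 /* 5 */
  printf("per-channel zp for channel 2: %d\n", filter_zp(args, PER_CHANNEL_FLAG, 2)); /* 9 */
  return 0;
}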
@@ -400,6 +414,9 @@ void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real
           packed_input[channel_block_offset] = input_data[channel_block_stride];
           input_accumulator += (packed_input + channel_block_offset)[0];
         }
+        for (int l = 0; l < (C4NUM - ic_res); l++) {
+          input_accumulator += conv_param->conv_quant_arg_.input_quant_args_[0].zp_;
+        }
       } // kernel_w loop
     } // kernel_h loop
     if (!(conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC)) {
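Note: the added loop tops up input_accumulator for the C4NUM - ic_res zero-padded channel lanes as if they held the input zero point; the accumulator feeds the asymmetric-quantization correction term, so padded lanes must not be counted as real zeros. A toy illustration of that bookkeeping for one pixel with a partial channel block (values made up, not the nnacl data structures):

/* Sum one pixel's channels for the int8 zero-point correction,
 * counting padded lanes as the input zero point. */
#include <stdio.h>

#define C4NUM 4

int main(void) {
  const int input_zp = 3;
  const signed char pixel[3] = {10, -2, 7}; /* in_channel = 3: one padded lane in the last block */
  int ic_res = 3 % C4NUM;                   /* real channels in the partial block */

  int input_accumulator = 0;
  for (int c = 0; c < 3; c++) {
    input_accumulator += pixel[c];          /* real channels */
  }
  for (int l = 0; l < (C4NUM - ic_res); l++) {
    input_accumulator += input_zp;          /* padded lanes count as the zero point */
  }
  printf("accumulator = %d\n", input_accumulator); /* 10 - 2 + 7 + 3 = 18 */
  return 0;
}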
@@ -84,53 +84,29 @@ int ConvolutionFP16CPUKernel::InitWeightBias() {
 }

 int ConvolutionFP16CPUKernel::InitTmpBuffer() {
-  int in_batch = conv_param_->input_batch_;
   int in_channel = conv_param_->input_channel_;
   int out_channel = conv_param_->output_channel_;
   int channel_block = UP_DIV(in_channel, C4NUM);
   int cal_num = 16;
-  int output_count = conv_param_->output_h_ * conv_param_->output_w_;
-  int output_tile_count = UP_DIV(output_count, cal_num);
   int kernel_plane = conv_param_->kernel_h_ * conv_param_->kernel_w_;
   int unit_size = kernel_plane * channel_block * C4NUM;
-  int packed_input_size = output_tile_count * cal_num * unit_size;
+  int packed_input_size = thread_count_ * cal_num * unit_size;

-  packed_input_ =
-    reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(in_batch * packed_input_size * sizeof(float16_t)));
+  packed_input_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(packed_input_size * sizeof(float16_t)));
   if (packed_input_ == nullptr) {
     MS_LOG(ERROR) << "malloc packed_input_ failed.";
     return RET_ERROR;
   }

-  size_t nhwc4_input_size =
-    channel_block * C4NUM * in_batch * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
-  nhwc4_input_ = ctx_->allocator->Malloc(nhwc4_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
-    return RET_ERROR;
-  }
-
   tmp_output_block_ =
     reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(thread_count_ * cal_num * out_channel * sizeof(float16_t)));
   if (tmp_output_block_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_output_block_ failed.";
     return RET_ERROR;
   }

   return RET_OK;
 }

-void ConvolutionFP16CPUKernel::ConfigInputOutput() {
-  auto input_tensor = in_tensors_.at(kInputIndex);
-  auto input_format = input_tensor->GetFormat();
-  schema::Format execute_format = schema::Format::Format_NHWC4;
-  convert_func_ = LayoutTransformFp16(input_format, execute_format);
-  if (convert_func_ == nullptr) {
-    MS_LOG(ERROR) << "layout convert func is nullptr.";
-    return;
-  }
-}
-
 int ConvolutionFP16CPUKernel::Init() {
   auto ret = InitWeightBias();
   if (ret != RET_OK) {
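Note: because ConvFp16 now reuses one im2col slot per worker, InitTmpBuffer no longer sizes packed_input_ for every output tile of every batch, and the separate nhwc4_input_ staging buffer (together with the layout-conversion setup in ConfigInputOutput) disappears. A back-of-the-envelope comparison of the packed_input_ allocation before and after, for one made-up shape (fp16 = 2 bytes):

/* Rough comparison of the old and new packed_input_ scratch sizes.
 * Shape and thread count are illustrative only. */
#include <stdio.h>

#define UP_DIV(x, y) (((x) + (y)-1) / (y))
#define C4NUM 4

int main(void) {
  int in_batch = 1, in_channel = 32, out_h = 56, out_w = 56;
  int kernel_plane = 3 * 3, cal_num = 16, thread_count = 4;

  int channel_block = UP_DIV(in_channel, C4NUM);
  int unit_size = kernel_plane * channel_block * C4NUM;
  int output_tile_count = UP_DIV(out_h * out_w, cal_num);

  long old_bytes = (long)in_batch * output_tile_count * cal_num * unit_size * 2; /* per-tile, per-batch */
  long new_bytes = (long)thread_count * cal_num * unit_size * 2;                 /* per-thread only */
  printf("packed_input_: old %ld bytes, new %ld bytes\n", old_bytes, new_bytes);
  return 0;
}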
@@ -140,7 +116,6 @@ int ConvolutionFP16CPUKernel::Init() {
   if (!InferShapeDone()) {
     return RET_OK;
   }
-  ConfigInputOutput();
   return ReSize();
 }

@@ -160,8 +135,8 @@ int ConvolutionFP16CPUKernel::ReSize() {
 }

 int ConvolutionFP16CPUKernel::RunImpl(int task_id) {
-  ConvFp16(reinterpret_cast<float16_t *>(nhwc4_input_), packed_input_, packed_weight_,
-           reinterpret_cast<float16_t *>(bias_data_), tmp_output_block_, execute_output_, task_id, conv_param_);
+  ConvFp16(execute_input_, packed_input_, packed_weight_, reinterpret_cast<float16_t *>(bias_data_), tmp_output_block_,
+           execute_output_, task_id, conv_param_);
   return RET_OK;
 }

@@ -194,12 +169,6 @@ int ConvolutionFP16CPUKernel::Run() {
     return RET_ERROR;
   }

-  int in_batch = conv_param_->input_batch_;
-  int in_h = conv_param_->input_h_;
-  int in_w = conv_param_->input_w_;
-  int in_channel = conv_param_->input_channel_;
-  convert_func_(reinterpret_cast<void *>(execute_input_), nhwc4_input_, in_batch, in_h * in_w, in_channel);
-
   int error_code = ParallelLaunch(this->context_->thread_pool_, ConvolutionFp16Impl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "conv fp16 error error_code[" << error_code << "]";
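Note: RunImpl now hands execute_input_ (plain NHWC) straight to ConvFp16, so Run() can drop the whole-tensor convert_func_ pass into nhwc4_input_; the 4-channel blocking is done lazily inside Im2ColPackUnitFp16 for just the patches that are actually consumed. For reference, a small sketch of the index arithmetic of the two layouts (illustrative helper functions, not the library API):

/* Index of element (h, w, c) in NHWC vs. NHWC4 layouts, single batch.
 * NHWC4 pads the channel dimension up to a multiple of 4. */
#include <stdio.h>

#define UP_DIV(x, y) (((x) + (y)-1) / (y))
#define C4NUM 4

static int nhwc_index(int h, int w, int c, int in_w, int in_channel) {
  return (h * in_w + w) * in_channel + c; /* channel stride = in_channel */
}

static int nhwc4_index(int h, int w, int c, int in_w, int in_channel) {
  int ic4 = UP_DIV(in_channel, C4NUM);
  return (h * in_w + w) * ic4 * C4NUM + c; /* channel stride padded to ic4 * C4NUM */
}

int main(void) {
  int in_w = 8, in_channel = 6; /* per-pixel stride 6 vs. padded stride 8 */
  printf("NHWC  (1,2,5) -> %d\n", nhwc_index(1, 2, 5, in_w, in_channel));
  printf("NHWC4 (1,2,5) -> %d\n", nhwc4_index(1, 2, 5, in_w, in_channel));
  return 0;
}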
@@ -46,14 +46,9 @@ class ConvolutionFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
   int RunImpl(int task_id);
   int InitWeightBias();
   int InitTmpBuffer();
-  void ConfigInputOutput();

  private:
   void FreeTmpBuffer() {
-    if (nhwc4_input_ != nullptr) {
-      ctx_->allocator->Free(nhwc4_input_);
-      nhwc4_input_ = nullptr;
-    }
     if (packed_input_ != nullptr) {
       ctx_->allocator->Free(packed_input_);
       packed_input_ = nullptr;
@@ -236,12 +236,8 @@ kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector<lite::Tensor *> &
   if (kernel_h == 1 && kernel_w == 1) {
     kernel = new (std::nothrow) kernel::Convolution1x1CPUKernel(op_parameter, inputs, outputs, ctx, primitive);
   } else if (use_winograd) {
-    if (kernel_h == 3 && kernel_w == 3 && out_unit == 2) {
-      kernel = new (std::nothrow) kernel::Convolution3x3CPUKernel(op_parameter, inputs, outputs, ctx, primitive);
-    } else {
-      kernel = new (std::nothrow)
-        kernel::ConvolutionWinogradCPUKernel(op_parameter, inputs, outputs, ctx, primitive, out_unit);
-    }
+    kernel =
+      new (std::nothrow) kernel::ConvolutionWinogradCPUKernel(op_parameter, inputs, outputs, ctx, primitive, out_unit);
   } else {
     kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, primitive);
   }
@@ -89,7 +89,13 @@ int ConvolutionInt8CPUKernel::InitWeightBias() {
     MS_LOG(ERROR) << "malloc weight_sum failed.";
     return RET_ERROR;
   }
-  for (int i = 0; i < output_channel; i++) weight_sum[i] = 0;
+  for (int i = 0; i < output_channel; i++) {
+    if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) {
+      weight_sum[i] = ic4 * C4NUM * kernel_plane * filter_arg[i].zp_;
+    } else {
+      weight_sum[i] = ic4 * C4NUM * kernel_plane * filter_arg[0].zp_;
+    }
+  }
   PackWeightInt8(origin_weight, conv_param_, packed_weight_, weight_sum);

   // init bias
@@ -190,7 +196,13 @@ int ConvolutionInt8CPUKernel::InitWeightBiasOpt() {
     MS_LOG(ERROR) << "malloc weight_sum failed.";
     return RET_ERROR;
   }
-  for (int i = 0; i < output_channel; i++) weight_sum[i] = 0;
+  for (int i = 0; i < output_channel; i++) {
+    if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) {
+      weight_sum[i] = ic4 * C4NUM * kernel_plane * filter_arg[i].zp_;
+    } else {
+      weight_sum[i] = ic4 * C4NUM * kernel_plane * filter_arg[0].zp_;
+    }
+  }
   PackWeightInt8Opt(origin_weight, conv_param_, packed_weight_, weight_sum);

   // init bias
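Note: combined with the PackWeightInt8 / PackWeightInt8Opt change earlier in this PR, weight_sum[oc] is seeded with ic4 * C4NUM * kernel_plane * zp and then accumulates (w - zp) for each real weight, which works out to the sum of the real weights plus zp for every zero-padded lane. A tiny hedged check of that identity with toy numbers and a per-tensor zero point:

/* Checks: N_total * zp + sum(w - zp over real) == sum(w over real) + N_pad * zp. */
#include <stdio.h>

int main(void) {
  const int zp = 7;
  const int w[5] = {3, -1, 4, 1, 5}; /* real weights for one output channel */
  const int n_total = 8;             /* padded length, e.g. ic4 * C4NUM * kernel_plane */

  int lhs = n_total * zp;            /* seeding done in InitWeightBias */
  int sum_real = 0;
  for (int i = 0; i < 5; i++) {
    lhs += w[i] - zp;                /* accumulation done in PackWeightInt8 */
    sum_real += w[i];
  }
  int rhs = sum_real + (n_total - 5) * zp;  /* real weights + zp per padded lane */
  printf("lhs = %d, rhs = %d\n", lhs, rhs); /* both print 33 */
  return 0;
}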
@@ -261,14 +273,7 @@ int ConvolutionInt8CPUKernel::InitTmpBufferOpt() {
   return RET_OK;
 }

-void ConvolutionInt8CPUKernel::ConfigInputOutput() {
-  auto output_tensor = out_tensors_.at(kOutputIndex);
-  output_tensor->SetFormat(schema::Format::Format_NHWC);
-}
-
 int ConvolutionInt8CPUKernel::Init() {
-  // config input output
-  ConfigInputOutput();
   CheckSupportOptimize();
   auto ret = SetQuantParam();
   if (ret != RET_OK) {
@@ -51,7 +51,6 @@ class ConvolutionInt8CPUKernel : public ConvolutionBaseCPUKernel {
   int InitTmpBufferOpt();
   int InitWeightBias();
   int InitTmpBuffer();
-  void ConfigInputOutput();

  private:
   void FreeTmpBuffer() {