From a3cc26ffcc1559e8cbc5a4834e6323510928d097 Mon Sep 17 00:00:00 2001
From: ling
Date: Fri, 11 Sep 2020 14:56:36 +0800
Subject: [PATCH] [MSLITE][Develop]int8 conv1x1 support arm32

---
 mindspore/lite/nnacl/int8/matmul_int8.c       |  15 ++
 mindspore/lite/nnacl/int8/matmul_int8.h       |   3 +-
 mindspore/lite/nnacl/matmul_parameter.h       |   1 +
 mindspore/lite/nnacl/op_base.h                |   1 +
 .../kernel/arm/int8/convolution_1x1_int8.cc   | 132 +++++++++++++-----
 .../kernel/arm/int8/convolution_1x1_int8.h    |   2 +
 6 files changed, 117 insertions(+), 37 deletions(-)

diff --git a/mindspore/lite/nnacl/int8/matmul_int8.c b/mindspore/lite/nnacl/int8/matmul_int8.c
index 1e1241712c8..33da1b4ed76 100644
--- a/mindspore/lite/nnacl/int8/matmul_int8.c
+++ b/mindspore/lite/nnacl/int8/matmul_int8.c
@@ -43,6 +43,21 @@ void RowMajor2Row4x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
   }
 }
 
+void RowMajor2Row2x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
+  int col16 = UP_ROUND(col, C16NUM);
+  for (int r = 0; r < row; r++) {
+    int rd2 = r / C2NUM;
+    int rm2 = r % C2NUM;
+    for (int c = 0; c < col; c++) {
+      int cd16 = c / C16NUM;
+      int cm16 = c % C16NUM;
+      int dst_index = rd2 * col16 * C2NUM + cd16 * C2NUM * C16NUM + rm2 * C16NUM + cm16;
+      int src_index = r * col + c;
+      dst_ptr[dst_index] = src_ptr[src_index];
+    }
+  }
+}
+
 void RowMajor2Row8x4MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
   int col4 = UP_ROUND(col, C4NUM);
   for (int r = 0; r < row; r++) {
diff --git a/mindspore/lite/nnacl/int8/matmul_int8.h b/mindspore/lite/nnacl/int8/matmul_int8.h
index fe20548b8d4..11dd3a66c05 100644
--- a/mindspore/lite/nnacl/int8/matmul_int8.h
+++ b/mindspore/lite/nnacl/int8/matmul_int8.h
@@ -42,7 +42,6 @@ void MatMulInt8_8x8_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row,
                       bool per_channel);
 void RowMajor2Row8x4MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
 void RowMajor2Row4x8MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
-
 void RowMajor2Row4x16Major(int8_t *src, int row, int col, int8_t *dst, int col_16);
 void RowMajor2Col16x4Major(int8_t *src, int row, int col, int8_t *dst, int row_16);
 void CalcInputSums(int8_t *input, int row, int col, int weight_zp, int *dst, DataOrder order);
@@ -52,6 +51,8 @@ void MatmulInt8(const int8_t *a, const int8_t *b, int8_t *dst, const int *a_sums, const int *bias, int act_min,
                 int act_max, int out_zp, int multiplier, int left_shift, int right_shift, int row, int col, int deep16,
                 int stride);
 
+void RowMajor2Row2x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
+
 #ifdef ENABLE_ARM64
 void MatmulInt8Neon64(const int8_t *a, const int8_t *b, int8_t *dst, int row4, int col4, int deep16, const int *a_sums,
                       const int *bias, int act_min, int act_max, int out_zp, int multiplier, int left_shift,
diff --git a/mindspore/lite/nnacl/matmul_parameter.h b/mindspore/lite/nnacl/matmul_parameter.h
index 7be90402c8f..8f6b562974c 100644
--- a/mindspore/lite/nnacl/matmul_parameter.h
+++ b/mindspore/lite/nnacl/matmul_parameter.h
@@ -39,6 +39,7 @@ typedef struct MatMulParameter {
   int row_8_;
   int row_12_;
   int row_16_;
+  int col_2_;
   int col_4_;
   int col_8_;
   int deep_;
diff --git a/mindspore/lite/nnacl/op_base.h b/mindspore/lite/nnacl/op_base.h
index e5bf293ed31..b080dfd3341 100644
--- a/mindspore/lite/nnacl/op_base.h
+++ b/mindspore/lite/nnacl/op_base.h
@@ -21,6 +21,7 @@
 #include
 #include
 
+#define C2NUM 2
 #define C4NUM 4
 #define C8NUM 8
 #define C12NUM 12
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc
index d4ac9fcdbc8..0acd324d2a5 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc
@@ -86,44 +86,10 @@ void Convolution1x1Int8CPUKernel::CheckSupportOptimize() {
   return;
 }
 
-int Convolution1x1Int8CPUKernel::InitWeightBias() {
-  auto filter_tensor = in_tensors_.at(kWeightIndex);
-  auto input_channel = filter_tensor->Channel();
-  auto output_channel = filter_tensor->Batch();
-
-  /* weight */
-  size_t size = support_optimize_ ? UP_ROUND(input_channel, C4NUM) * UP_ROUND(output_channel, C8NUM) * sizeof(int8_t)
-                                  : UP_ROUND(input_channel, C16NUM) * UP_ROUND(output_channel, C4NUM) * sizeof(int8_t);
-  packed_weight_ = reinterpret_cast<int8_t *>(malloc(size));
-  if (packed_weight_ == nullptr) {
-    MS_LOG(ERROR) << "Conv1x1 int8 Malloc weight error!";
-    return RET_ERROR;
-  }
-  memset(packed_weight_, 0, size);
-  if (support_optimize_) {
-    RowMajor2Row8x4MajorInt8(reinterpret_cast<int8_t *>(filter_tensor->MutableData()), packed_weight_, output_channel,
-                             input_channel);
-  } else {
-    RowMajor2Row4x16MajorInt8(reinterpret_cast<int8_t *>(filter_tensor->MutableData()), packed_weight_, output_channel,
-                              input_channel);
-  }
-
+int Convolution1x1Int8CPUKernel::InitBiasByzp(void *src_weight, int input_channel, int output_channel) {
   /* bias = bias - v2 x zp1 + zp1 x zp2 */
-  int col4 = UP_ROUND(output_channel, C4NUM);
-  int col8 = UP_ROUND(output_channel, C8NUM);
-  size = support_optimize_ ? col8 * sizeof(int32_t) : col4 * sizeof(int32_t);
-  bias_data_ = malloc(size);
-  if (bias_data_ == nullptr) {
-    MS_LOG(ERROR) << "Conv1x1 int8 Malloc bias_ptr_ error!";
-    return RET_ERROR;
-  }
-  memset(bias_data_, 0, size);
-  if (in_tensors_.size() == 3) {
-    memcpy(bias_data_, in_tensors_[kBiasIndex]->MutableData(), output_channel * sizeof(int32_t));
-  }
-
   int32_t *bias_data = reinterpret_cast<int32_t *>(bias_data_);
-  int8_t *weight = reinterpret_cast<int8_t *>(filter_tensor->MutableData());
+  int8_t *weight = reinterpret_cast<int8_t *>(src_weight);
   int32_t input_zp = conv_param_->conv_quant_arg_.input_quant_args_[0].zp_;
   for (int oc = 0; oc < output_channel; oc++) {
     int32_t weight_sum_value = 0;
@@ -147,6 +113,77 @@ int Convolution1x1Int8CPUKernel::InitWeightBias() {
   return RET_OK;
 }
 
+int Convolution1x1Int8CPUKernel::InitWeightBias() {
+  auto filter_tensor = in_tensors_.at(kWeightIndex);
+  auto input_channel = filter_tensor->Channel();
+  auto output_channel = filter_tensor->Batch();
+
+  /* weight */
+  size_t size = support_optimize_ ? UP_ROUND(input_channel, C4NUM) * UP_ROUND(output_channel, C8NUM) * sizeof(int8_t)
+                                  : UP_ROUND(input_channel, C16NUM) * UP_ROUND(output_channel, C4NUM) * sizeof(int8_t);
+  packed_weight_ = reinterpret_cast<int8_t *>(malloc(size));
+  if (packed_weight_ == nullptr) {
+    MS_LOG(ERROR) << "Conv1x1 int8 Malloc weight error!";
+    return RET_ERROR;
+  }
+  memset(packed_weight_, 0, size);
+  if (support_optimize_) {
+    RowMajor2Row8x4MajorInt8(reinterpret_cast<int8_t *>(filter_tensor->MutableData()), packed_weight_, output_channel,
+                             input_channel);
+  } else {
+    RowMajor2Row4x16MajorInt8(reinterpret_cast<int8_t *>(filter_tensor->MutableData()), packed_weight_, output_channel,
+                              input_channel);
+  }
+
+  int col4 = UP_ROUND(output_channel, C4NUM);
+  int col8 = UP_ROUND(output_channel, C8NUM);
+  size = support_optimize_ ? col8 * sizeof(int32_t) : col4 * sizeof(int32_t);
+  bias_data_ = malloc(size);
+  if (bias_data_ == nullptr) {
+    MS_LOG(ERROR) << "Conv1x1 int8 Malloc bias_ptr_ error!";
+    return RET_ERROR;
+  }
+  memset(bias_data_, 0, size);
+  if (in_tensors_.size() == 3) {
+    memcpy(bias_data_, in_tensors_[kBiasIndex]->MutableData(), output_channel * sizeof(int32_t));
+  }
+
+  InitBiasByzp(filter_tensor->MutableData(), input_channel, output_channel);
+  return RET_OK;
+}
+
+int Convolution1x1Int8CPUKernel::InitWeightBiasArm32() {
+  auto filter_tensor = in_tensors_.at(kWeightIndex);
+  auto input_channel = filter_tensor->Channel();
+  auto output_channel = filter_tensor->Batch();
+
+  /* weight */
+  size_t size = UP_ROUND(input_channel, C16NUM) * UP_ROUND(output_channel, C2NUM) * sizeof(int8_t);
+  packed_weight_ = reinterpret_cast<int8_t *>(malloc(size));
+  if (packed_weight_ == nullptr) {
+    MS_LOG(ERROR) << "Conv1x1 int8 arm32 Malloc weight error!";
+    return RET_ERROR;
+  }
+  memset(packed_weight_, 0, size);
+  RowMajor2Row2x16MajorInt8(reinterpret_cast<int8_t *>(filter_tensor->MutableData()), packed_weight_, output_channel,
+                            input_channel);
+
+  /* bias */
+  int col2 = UP_ROUND(output_channel, C2NUM);
+  bias_data_ = malloc(col2 * sizeof(int32_t));
+  if (bias_data_ == nullptr) {
+    MS_LOG(ERROR) << "Conv1x1 int8 arm32 Malloc bias_ptr_ error!";
+    return RET_ERROR;
+  }
+  memset(bias_data_, 0, col2 * sizeof(int32_t));
+  if (in_tensors_.size() == 3) {
+    memcpy(bias_data_, in_tensors_[kBiasIndex]->MutableData(), output_channel * sizeof(int32_t));
+  }
+
+  InitBiasByzp(filter_tensor->MutableData(), input_channel, output_channel);
+  return RET_OK;
+}
+
 int Convolution1x1Int8CPUKernel::Init() {
   matmul_param_ = new (std::nothrow) MatMulParameter();
   if (matmul_param_ == nullptr) {
@@ -164,7 +201,11 @@ int Convolution1x1Int8CPUKernel::Init() {
 
   CheckSupportOptimize();
 
+#ifdef ENABLE_ARM32
+  ret = InitWeightBiasArm32();
+#else
   ret = InitWeightBias();
+#endif
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Init weight bias failed.";
     return ret;
@@ -183,6 +224,7 @@ int Convolution1x1Int8CPUKernel::InitParam() {
   matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_;
   matmul_param_->deep_ = conv_param_->input_channel_;
   matmul_param_->col_ = conv_param_->output_channel_;
+  matmul_param_->col_2_ = UP_ROUND(matmul_param_->col_, C2NUM);
   matmul_param_->col_4_ = UP_ROUND(matmul_param_->col_, C4NUM);
   matmul_param_->col_8_ = UP_ROUND(matmul_param_->col_, C8NUM);
   matmul_param_->row_4_ = UP_ROUND(matmul_param_->row_, C4NUM);
@@ -192,6 +234,10 @@ int Convolution1x1Int8CPUKernel::InitParam() {
 
   int row_pack_count = 0;
   int col_pack_count = 0;
+#ifdef ENABLE_ARM32
+  row_pack_count = C4NUM;
+  col_pack_count = C2NUM;
+#else
   if (support_optimize_) {
     row_pack_count = C8NUM;
     col_pack_count = C8NUM;
@@ -199,6 +245,7 @@ int Convolution1x1Int8CPUKernel::InitParam() {
     row_pack_count = C4NUM;
     col_pack_count = C4NUM;
   }
+#endif
 
   /* init input sum size */
   if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) {
@@ -260,6 +307,18 @@ int Convolution1x1Int8CPUKernel::RunImpl(int task_id) {
   int32_t *cur_right_shift = conv_param_->conv_quant_arg_.right_shift_;
   int32_t *cur_multiplier = conv_param_->conv_quant_arg_.quant_multiplier_;
 
+#ifdef ENABLE_ARM32
+  int cur_stride = thread_stride_ * C2NUM;
+  int res_stride = matmul_param_->col_ - task_id * thread_stride_ * C2NUM;
+  int cur_oc = MSMIN(cur_stride, res_stride);
+  if (cur_oc <= 0) {
+    return RET_OK;
+  }
+  Conv1x1Int8(packed_input_, packed_weight_ + task_id * thread_stride_ * C2NUM * matmul_param_->deep_16_,
+              output_ptr_ + task_id * thread_stride_ * C2NUM, cur_input_sum,
+              reinterpret_cast<int32_t *>(bias_data_) + task_id * thread_stride_ * C2NUM, matmul_param_->row_, cur_oc,
+              matmul_param_->deep_16_, cur_left_shift, cur_right_shift, cur_multiplier, conv_param_);
+#else
   if (support_optimize_) {
     int cur_stride = thread_stride_ * C8NUM;
     int res_stride = matmul_param_->col_ - task_id * thread_stride_ * C8NUM;
@@ -296,6 +355,7 @@ int Convolution1x1Int8CPUKernel::RunImpl(int task_id) {
                 reinterpret_cast<int32_t *>(bias_data_) + task_id * thread_stride_ * C4NUM, matmul_param_->row_, cur_oc,
                 matmul_param_->deep_16_, cur_left_shift, cur_right_shift, cur_multiplier, conv_param_);
   }
+#endif
 
   return RET_OK;
 }
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h
index 342aa3eff71..d8a57f2439b 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h
@@ -52,8 +52,10 @@ class Convolution1x1Int8CPUKernel : public ConvolutionBaseCPUKernel {
   void FreeResizeBuf();
   int InitParam();
   int InitWeightBias();
+  int InitWeightBiasArm32();
   void Pre1x1Trans(int8_t *src_input, int8_t *src_output);
   void CheckSupportOptimize();
+  int InitBiasByzp(void *src_weight, int input_channel, int output_channel);
 
  private:
   int32_t *input_sum_ = nullptr; /* per-channel: oc4 format */
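
Note on the 2x16 weight layout introduced by this patch: RowMajor2Row2x16MajorInt8 packs the row-major weight matrix (row = output channels, col = input channels) into tiles of 2 rows by 16 columns; rows are grouped in pairs, the depth dimension is padded up to a multiple of 16, and within each tile the 16 values of the first row precede the 16 values of the second. This matches the deep_16_ stride and col_2_ padding used by the arm32 path in RunImpl. The standalone C sketch below reproduces the patch's index arithmetic so it can be checked in isolation; the main() driver and the toy ROW/COL sizes are illustrative assumptions, not part of the patch.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define C2NUM 2
#define C16NUM 16
#define UP_ROUND(x, y) (((x) + (y) - (1)) / (y) * (y))

/* Same index arithmetic as the patch: pack a row-major (row x col) int8
 * matrix into row-pair / 16-column tiles. */
void RowMajor2Row2x16MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
  int col16 = UP_ROUND(col, C16NUM);
  for (int r = 0; r < row; r++) {
    int rd2 = r / C2NUM;
    int rm2 = r % C2NUM;
    for (int c = 0; c < col; c++) {
      int cd16 = c / C16NUM;
      int cm16 = c % C16NUM;
      /* tile origin: rd2 * (2 rows * col16) + cd16 * (2 * 16); inside a tile,
       * row 0's 16 values come first, then row 1's */
      dst_ptr[rd2 * col16 * C2NUM + cd16 * C2NUM * C16NUM + rm2 * C16NUM + cm16] = src_ptr[r * col + c];
    }
  }
}

int main(void) {
  enum { ROW = 3, COL = 20 }; /* e.g. 3 output channels, 20 input channels */
  int8_t src[ROW * COL];
  int8_t dst[UP_ROUND(ROW, C2NUM) * UP_ROUND(COL, C16NUM)];
  for (int i = 0; i < ROW * COL; i++) src[i] = (int8_t)i;
  memset(dst, 0, sizeof(dst)); /* padding stays zero, like the memset in InitWeightBiasArm32 */
  RowMajor2Row2x16MajorInt8(src, dst, ROW, COL);
  /* element (r=1, c=17): row pair 0, slot rm2=1; column block 1, offset 1
   * -> dst index 0*64 + 1*32 + 1*16 + 1 = 49; prints src(1,17)=37 dst[49]=37 */
  printf("src(1,17)=%d dst[49]=%d\n", src[1 * COL + 17], dst[49]);
  return 0;
}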
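A note on the terse comment /* bias = bias - v2 x zp1 + zp1 x zp2 */ in InitBiasByzp: it rests on the identity sum_k (a_k - zp1)(w_k - zp2) = sum_k a_k*w_k - zp2*sum(a) - zp1*sum(w) + deep*zp1*zp2. The last two terms depend only on the weights, so they can be folded into the bias once at init time (v2 being the per-output-channel weight sum); the -zp2*sum(a) term is applied at run time through the input sums (cf. CalcInputSums in matmul_int8.h). A toy check of that identity, with made-up values that are not from the patch:

#include <stdio.h>

int main(void) {
  const int deep = 4, zp1 = 5, zp2 = 3; /* zp1: input zero point, zp2: weight zero point */
  const int a[] = {12, -7, 100, 3};     /* quantized activations (illustrative) */
  const int w[] = {-5, 9, 1, -20};      /* quantized weights for one output channel */

  int real = 0, raw = 0, a_sum = 0, w_sum = 0;
  for (int k = 0; k < deep; k++) {
    real += (a[k] - zp1) * (w[k] - zp2); /* the product the kernel must effectively compute */
    raw += a[k] * w[k];                  /* what the int8 matmul actually accumulates */
    a_sum += a[k];
    w_sum += w[k];
  }
  int bias_fold = -zp1 * w_sum + deep * zp1 * zp2; /* folded at init: "- v2 x zp1 + zp1 x zp2" */
  int input_fold = -zp2 * a_sum;                   /* applied at run time via the input sums */
  printf("%d == %d\n", real, raw + bias_fold + input_fold); /* both sides agree */
  return 0;
}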