!12181 [MSLITE] int8 matmul base

From: @ling_qiao_min
Reviewed-by: 
Signed-off-by:
mindspore-ci-bot 2021-02-20 17:19:44 +08:00 committed by Gitee
commit 2f1d4f9ef9
14 changed files with 560 additions and 667 deletions

View File

@@ -182,40 +182,6 @@ void MatMulInt8_16x4(const int8_t *a, const int8_t *b, int *dst, int row_4, int
return;
}
void MatMulInt8_16x4_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16,
size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi,
bool peroc) {
/* support per-layer && weight per-channel */
/* row4x16-major * row16x4-major => (int8)row-major*/
for (int r = 0; r < row; r++) {
for (int c = 0; c < col; c++) {
int r4div = r / C4NUM, r4mod = r % C4NUM;
int c4div = c / C4NUM, c4mod = c % C4NUM;
size_t ci = r * stride + c;
int32_t value = 0;
for (int d = 0; d < deep_16; d++) {
int d16div = d / C16NUM, d16mod = d % C16NUM;
size_t ai = r4div * deep_16 * C4NUM + d16div * C4NUM * C16NUM + r4mod * C16NUM + d16mod;
size_t bi = c4div * deep_16 * C4NUM + d16div * C4NUM * C16NUM + c4mod * C16NUM + d16mod;
value = value + a[ai] * b[bi];
}
int32_t cur_input_sum =
peroc ? input_sum[c4div * UP_ROUND(row, C4NUM) * C4NUM + r * C4NUM + c4mod] : input_sum[r];
value -= cur_input_sum;
value += bias[c];
int32_t cur_left_shift = peroc ? left_shift[c] : left_shift[0];
int32_t cur_right_shift = peroc ? right_shift[c] : right_shift[0];
int32_t cur_multiplier = peroc ? multiplier[c] : multiplier[0];
value = MultiplyByQuantizedMultiplier(value, cur_multiplier, cur_left_shift, cur_right_shift) + output_zp;
value = MSMIN(maxi, value);
value = MSMAX(mini, value);
dst[ci] = (int8_t)value;
}
}
return;
}
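Editor's note (illustrative, not part of the patch): the reference kernel removed above makes the int8 quantization algebra visible. Expanding sum_k (a[r][k] - za) * (w[k][c] - zw) shows that only the raw int8 dot product is needed at run time, plus a precomputed per-row term input_sum[r] (the filter zero point times the row sum of a) and a per-column bias[c] into which the input-zero-point terms are folded; the 32-bit accumulator is then rescaled to the output scale with a fixed-point multiplier and shifts. A minimal sketch of that final step, approximating MultiplyByQuantizedMultiplier with double arithmetic, is:
#include <stdint.h>
/* Editor's sketch only: per-output requantization as in the removed loop above.
 * nnacl's MultiplyByQuantizedMultiplier uses an integer rounding-doubling
 * high-multiply; plain double math is used here for readability. */
static int8_t RequantizeOneOutput(int32_t acc, int32_t input_sum_r, int32_t bias_c,
                                  double real_multiplier /* in_scale * w_scale / out_scale */,
                                  int32_t output_zp, int32_t mini, int32_t maxi) {
  acc -= input_sum_r;                                    /* remove filter_zp * sum(a[r][:]) */
  acc += bias_c;                                         /* bias with input_zp terms folded in */
  double scaled = (double)acc * real_multiplier + (double)output_zp;
  int32_t out = (int32_t)(scaled < 0 ? scaled - 0.5 : scaled + 0.5);  /* round to nearest */
  if (out > maxi) out = maxi;                            /* clamp to the activation range */
  if (out < mini) out = mini;
  return (int8_t)out;
}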
void MatMulInt8_4x2_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16,
size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi,
@@ -353,6 +319,105 @@ void MatMulInt8_4x16_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row
return;
}
#ifdef ENABLE_ARM64
void PackInput4x4AndInputSumPert_arm64(const int8_t *src_ic, int8_t *pack_ic, int32_t *input_sum_r, size_t src_stride,
size_t ic_4div, size_t ic_4res, int32_t filter_zp) {
asm volatile(
"dup v2.4s, wzr \n"
"mov x14, %[input_sum_r] \n"
"dup v3.4s, %w[filter_zp] \n"
"mov x10, %[src_ic] \n"
"mov x11, %[pack_ic] \n"
"mov x15, #0 \n"
"1: \n"
"cmp x15, %[ic_4div] \n"
"add x15, x15, #4\n"
"mov x12, x10 \n"
"add x10, x10, #4\n"
"blt 2f \n"
"cmp %[ic_4res], #0\n"
"beq 6f \n"
"cmp %[ic_4res], #1\n"
"beq 3f \n"
"cmp %[ic_4res], #2\n"
"beq 4f \n"
"cmp %[ic_4res], #3\n"
"beq 5f \n"
"2: \n"
"ld1 {v0.s}[0], [x12], %[src_stride]\n"
"ld1 {v0.s}[1], [x12], %[src_stride]\n"
"ld1 {v0.s}[2], [x12], %[src_stride]\n"
"ld1 {v0.s}[3], [x12], %[src_stride]\n"
"st1 {v0.16b}, [x11], #16\n"
"saddlp v1.8h, v0.16b \n"
"saddlp v0.4s, v1.8h \n"
"add v2.4s, v2.4s, v0.4s \n"
"b 1b \n"
"3: \n" /* ic res 1 */
"dup v0.4s, wzr \n"
"ld1 {v0.b}[0], [x12], %[src_stride]\n"
"ld1 {v0.b}[4], [x12], %[src_stride]\n"
"ld1 {v0.b}[8], [x12], %[src_stride]\n"
"ld1 {v0.b}[12], [x12], %[src_stride]\n"
"st1 {v0.16b}, [x11], #16\n"
"saddlp v1.8h, v0.16b \n"
"saddlp v0.4s, v1.8h \n"
"add v2.4s, v2.4s, v0.4s \n"
"b 6f \n"
"4: \n" /* ic res 2 */
"dup v0.4s, wzr \n"
"ld1 {v0.h}[0], [x12], %[src_stride]\n"
"ld1 {v0.h}[2], [x12], %[src_stride]\n"
"ld1 {v0.h}[4], [x12], %[src_stride]\n"
"ld1 {v0.h}[6], [x12], %[src_stride]\n"
"st1 {v0.16b}, [x11], #16\n"
"saddlp v1.8h, v0.16b \n"
"saddlp v0.4s, v1.8h \n"
"add v2.4s, v2.4s, v0.4s \n"
"b 6f \n"
"5: \n" /* ic res 3 */
"dup v0.4s, wzr \n"
"add x13, x12, #2 \n"
"ld1 {v0.h}[0], [x12], %[src_stride]\n"
"ld1 {v0.b}[2], [x13], %[src_stride]\n"
"ld1 {v0.h}[2], [x12], %[src_stride]\n"
"ld1 {v0.b}[6], [x13], %[src_stride]\n"
"ld1 {v0.h}[4], [x12], %[src_stride]\n"
"ld1 {v0.b}[10], [x13], %[src_stride]\n"
"ld1 {v0.h}[6], [x12], %[src_stride]\n"
"ld1 {v0.b}[14], [x13], %[src_stride]\n"
"st1 {v0.16b}, [x11], #16\n"
"saddlp v1.8h, v0.16b \n"
"saddlp v0.4s, v1.8h \n"
"add v2.4s, v2.4s, v0.4s \n"
"b 6f \n"
"6: \n"
"mul v2.4s, v2.4s, v3.4s \n"
"st1 {v2.4s}, [x14], #16 \n"
:
: [ src_ic ] "r"(src_ic), [ pack_ic ] "r"(pack_ic), [ input_sum_r ] "r"(input_sum_r),
[ src_stride ] "r"(src_stride), [ ic_4div ] "r"(ic_4div), [ ic_4res ] "r"(ic_4res), [ filter_zp ] "r"(filter_zp)
: "x10", "x11", "x12", "x13", "x14", "x15", "v0", "v1", "v2", "v3");
return;
}
#endif
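Editor's note (illustrative, not part of the patch): for one tile of four plane rows, the assembly above stores the rows in [4-channel group][row 0..3][channel 0..3] order and, via the saddlp/mul sequence, produces filter_zp * (per-row channel sum) for the four rows. A rough scalar equivalent, with the remainder-channel cases (ic_4res of 1..3) omitted, is:
#include <stdint.h>
#include <stddef.h>
/* Editor's sketch only: scalar counterpart of PackInput4x4AndInputSumPert_arm64
 * for the full-channel-group path (label 2 above). */
static void PackInput4x4AndInputSumPertScalar(const int8_t *src_ic, int8_t *pack_ic,
                                              int32_t *input_sum_r, size_t src_stride,
                                              size_t ic_4div, int32_t filter_zp) {
  int32_t row_sum[4] = {0, 0, 0, 0};
  for (size_t ic = 0; ic < ic_4div; ic += 4) {
    for (int r = 0; r < 4; r++) {          /* four plane rows, src_stride apart */
      for (int c = 0; c < 4; c++) {        /* four input channels per group */
        int8_t v = src_ic[r * src_stride + ic + c];
        *pack_ic++ = v;                    /* one contiguous 16-byte tile per group */
        row_sum[r] += v;
      }
    }
  }
  for (int r = 0; r < 4; r++) {
    input_sum_r[r] = row_sum[r] * filter_zp;   /* matches the final mul/st1 above */
  }
}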
void PackInput4x4AndInputSumPert(const int8_t *src_input, int8_t *packed_input, int32_t *input_sum,
size_t input_channel, size_t plane_size, int32_t filter_zp) {
int ic4 = UP_ROUND(input_channel, C4NUM);
@@ -370,99 +435,7 @@ void PackInput4x4AndInputSumPert(const int8_t *src_input, int8_t *packed_input,
#ifdef ENABLE_ARM64
size_t src_stride = input_channel;
size_t ic_4res = input_channel - ic_4div;
asm volatile(
"dup v2.4s, wzr \n"
"mov x14, %[input_sum_r] \n"
"dup v3.4s, %w[filter_zp] \n"
"mov x10, %[src_ic] \n"
"mov x11, %[pack_ic] \n"
"mov x15, #0 \n"
"1: \n"
"cmp x15, %[ic_4div] \n"
"add x15, x15, #4\n"
"mov x12, x10 \n"
"add x10, x10, #4\n"
"blt 2f \n"
"cmp %[ic_4res], #0\n"
"beq 6f \n"
"cmp %[ic_4res], #1\n"
"beq 3f \n"
"cmp %[ic_4res], #2\n"
"beq 4f \n"
"cmp %[ic_4res], #3\n"
"beq 5f \n"
"2: \n"
"ld1 {v0.s}[0], [x12], %[src_stride]\n"
"ld1 {v0.s}[1], [x12], %[src_stride]\n"
"ld1 {v0.s}[2], [x12], %[src_stride]\n"
"ld1 {v0.s}[3], [x12], %[src_stride]\n"
"st1 {v0.16b}, [x11], #16\n"
"saddlp v1.8h, v0.16b \n"
"saddlp v0.4s, v1.8h \n"
"add v2.4s, v2.4s, v0.4s \n"
"b 1b \n"
"3: \n" /* ic res 1 */
"dup v0.4s, wzr \n"
"ld1 {v0.b}[0], [x12], %[src_stride]\n"
"ld1 {v0.b}[4], [x12], %[src_stride]\n"
"ld1 {v0.b}[8], [x12], %[src_stride]\n"
"ld1 {v0.b}[12], [x12], %[src_stride]\n"
"st1 {v0.16b}, [x11], #16\n"
"saddlp v1.8h, v0.16b \n"
"saddlp v0.4s, v1.8h \n"
"add v2.4s, v2.4s, v0.4s \n"
"b 6f \n"
"4: \n" /* ic res 2 */
"dup v0.4s, wzr \n"
"ld1 {v0.h}[0], [x12], %[src_stride]\n"
"ld1 {v0.h}[2], [x12], %[src_stride]\n"
"ld1 {v0.h}[4], [x12], %[src_stride]\n"
"ld1 {v0.h}[6], [x12], %[src_stride]\n"
"st1 {v0.16b}, [x11], #16\n"
"saddlp v1.8h, v0.16b \n"
"saddlp v0.4s, v1.8h \n"
"add v2.4s, v2.4s, v0.4s \n"
"b 6f \n"
"5: \n" /* ic res 3 */
"dup v0.4s, wzr \n"
"add x13, x12, #2 \n"
"ld1 {v0.h}[0], [x12], %[src_stride]\n"
"ld1 {v0.b}[2], [x13], %[src_stride]\n"
"ld1 {v0.h}[2], [x12], %[src_stride]\n"
"ld1 {v0.b}[6], [x13], %[src_stride]\n"
"ld1 {v0.h}[4], [x12], %[src_stride]\n"
"ld1 {v0.b}[10], [x13], %[src_stride]\n"
"ld1 {v0.h}[6], [x12], %[src_stride]\n"
"ld1 {v0.b}[14], [x13], %[src_stride]\n"
"st1 {v0.16b}, [x11], #16\n"
"saddlp v1.8h, v0.16b \n"
"saddlp v0.4s, v1.8h \n"
"add v2.4s, v2.4s, v0.4s \n"
"b 6f \n"
"6: \n"
"mul v2.4s, v2.4s, v3.4s \n"
"st1 {v2.4s}, [x14], #16 \n"
:
: [ src_ic ] "r"(src_ic), [ pack_ic ] "r"(pack_ic), [ input_sum_r ] "r"(input_sum_r),
[ src_stride ] "r"(src_stride), [ ic_4div ] "r"(ic_4div), [ ic_4res ] "r"(ic_4res), [ filter_zp ] "r"(filter_zp)
: "x10", "x11", "x12", "x13", "x14", "x15", "v0", "v1", "v2", "v3");
PackInput4x4AndInputSumPert_arm64(src_ic, pack_ic, input_sum_r, src_stride, ic_4div, ic_4res, filter_zp);
#else
int32_t tmp_sum_value[4] = {0};
for (int ici = 0; ici < ic_4div; ici += C4NUM) {

View File

@@ -25,12 +25,9 @@
extern "C" {
#endif
/* 4x16 16x4 -> 4x4 */
/* matmul */
void MatMulInt8_16x4(const int8_t *a, const int8_t *b, int *dst, int row_4, int col_4, int deep_16,
const int *input_sum, const int *bias);
void MatMulInt8_16x4_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16,
size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi,
bool per_channel);
void RowMajor2Row16x4MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
void RowMajor2Col16x4MajorInt8(int8_t *src, int row, int col, int8_t *dst);
void CalcInputSums(int8_t *input, int row, int col, int weight_zp, int *dst, DataOrder order);
@@ -41,6 +38,7 @@ void MatmulInt8Opt(const int8_t *a, const int8_t *b, int8_t *dst, int row, int c
int32_t *right_shift, size_t stride, size_t filter_peroc, int32_t *filter_zp);
/* 8x4 4x8 -> 8x8 */
/* optimize conv */
void RowMajor2Row8x4MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
void MatMulInt8_8x8_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4,
size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
@@ -48,6 +46,7 @@ void MatMulInt8_8x8_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row,
size_t per_channel);
/* 4x16 16x2 -> 4x2 */
/* arm32 conv1x1 */
void RowMajor2Row2x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
void MatMulInt8_4x2_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16,
size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
@@ -55,6 +54,7 @@ void MatMulInt8_4x2_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row,
bool peroc);
/* 4x4 4x16 -> 4x16 */
/* optimize conv1x1 */
void RowMajor2Row4x16MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
void PackInput4x4AndInputSumPert(const int8_t *src_input, int8_t *packed_input, int32_t *input_sum,
size_t input_channel, size_t plane_size, int32_t filter_zp);
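Editor's note (illustrative, not part of the patch): before this commit the 4x16 * 16x4 reference path was driven as in the unit test removed at the end of this diff: pack both operands, precompute the sums, derive the fixed-point multiplier, then call the gemm. A condensed sketch of that sequence, valid against the pre-commit header (identity scale, zero zero-points, so the sum and bias buffers stay zero):
#include <limits.h>
#include <stdint.h>
#include <stdlib.h>
#include "nnacl/int8/matmul_int8.h"
#include "nnacl/int8/quantize.h"
/* Editor's sketch only: mirrors the removed TestMatmulInt8.simple test; shapes are
 * padded to the 4-row / 4-col / 16-deep tiles the reference kernel expects. */
void RunReferenceInt8Matmul(int8_t *a, int8_t *b, int8_t *out, int row, int col, int deep) {
  int row4 = (row + 3) / 4 * 4, col4 = (col + 3) / 4 * 4, deep16 = (deep + 15) / 16 * 16;
  int8_t *a_pack = (int8_t *)calloc(row4 * deep16, sizeof(int8_t));
  int8_t *b_pack = (int8_t *)calloc(col4 * deep16, sizeof(int8_t));
  int32_t *a_sums = (int32_t *)calloc(row4, sizeof(int32_t));
  int32_t *bias = (int32_t *)calloc(col4, sizeof(int32_t));
  RowMajor2Row16x4MajorInt8(a, a_pack, row, deep);   /* pack activations */
  RowMajor2Col16x4MajorInt8(b, deep, col, b_pack);   /* pack weights */
  int32_t multiplier, ls, rs;
  QuantizeRoundParameterWithDoublePrecision(1.0, &multiplier, &ls, &rs);  /* identity scale */
  MatMulInt8_16x4_r(a_pack, b_pack, out, row, col, deep16, col, a_sums, bias,
                    &ls, &rs, &multiplier, 0, INT8_MIN, INT8_MAX, false);
  free(a_pack);
  free(b_pack);
  free(a_sums);
  free(bias);
}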

View File

@@ -66,17 +66,6 @@ typedef struct PreluQuantArg {
QuantArg out_quant_args_;
} PreluQuantArg;
typedef struct MatmulQuantArg {
QuantArg input;
QuantArg weight;
QuantArg output;
int32_t out_act_min;
int32_t out_act_max;
int32_t left_shift;
int32_t right_shift;
int32_t quant_multiplier;
} MatmulQuantArg;
typedef struct CropQuantArg {
QuantArg in_args_;
QuantArg out_args_;

View File

@@ -73,4 +73,15 @@ typedef struct MatmulQuantParameter {
int32_t *quant_multiplier_;
} MatmulQuantParameter;
typedef struct MatmulQuantArg {
QuantArg input;
QuantArg weight;
QuantArg output;
int32_t out_act_min;
int32_t out_act_max;
int32_t left_shift;
int32_t right_shift;
int32_t quant_multiplier;
} MatmulQuantArg;
#endif // MINDSPORE_LITE_NNACL_MATMUL_H_

View File

@@ -67,10 +67,5 @@ int FullconnectionCPUKernel::ReSize() {
return MatmulFp32BaseCPUKernel::ReSize();
}
int FullconnectionCPUKernel::Run() {
MatmulFp32BaseCPUKernel::Run();
return RET_OK;
}
REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_FullConnection, LiteKernelCreator<FullconnectionCPUKernel>)
} // namespace mindspore::kernel

View File

@@ -33,7 +33,6 @@ class FullconnectionCPUKernel : public MatmulFp32BaseCPUKernel {
~FullconnectionCPUKernel() = default;
int Init() override;
int ReSize() override;
int Run() override;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_FULLCONNECTION_H_

View File

@@ -15,258 +15,44 @@
*/
#include "src/runtime/kernel/arm/int8/fullconnection_int8.h"
#include "src/runtime/runtime_api.h"
#include "src/kernel_registry.h"
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_MEMORY_FAILED;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_FullConnection;
namespace mindspore::kernel {
void FullconnectionInt8CPUKernel::FreeQuantParam() {
if (quant_.filter_scale_ != nullptr) {
free(quant_.filter_scale_);
quant_.filter_scale_ = nullptr;
}
if (quant_.filter_zp_ != nullptr) {
free(quant_.filter_zp_);
quant_.filter_zp_ = nullptr;
}
if (quant_.left_shift_ != nullptr) {
free(quant_.left_shift_);
quant_.left_shift_ = nullptr;
}
if (quant_.right_shift_ != nullptr) {
free(quant_.right_shift_);
quant_.right_shift_ = nullptr;
}
if (quant_.quant_multiplier_ != nullptr) {
free(quant_.quant_multiplier_);
quant_.quant_multiplier_ = nullptr;
}
return;
}
void FullconnectionInt8CPUKernel::FreeTmpBuffer() {
if (pack_a_ptr_ != nullptr) {
free(pack_a_ptr_);
pack_a_ptr_ = nullptr;
}
if (pack_b_ptr_ != nullptr) {
free(pack_b_ptr_);
pack_b_ptr_ = nullptr;
}
if (input_sums_ != nullptr) {
free(input_sums_);
input_sums_ = nullptr;
}
if (weight_bias_sums_ != nullptr) {
free(weight_bias_sums_);
weight_bias_sums_ = nullptr;
}
if (bias_ptr_ != nullptr) {
free(bias_ptr_);
bias_ptr_ = nullptr;
}
return;
}
int FullconnectionInt8CPUKernel::MallocQuantParam() {
auto weight_tensor = in_tensors_.at(1);
auto weight_quant_params = weight_tensor->quant_params();
int col = weight_tensor->shape().front();
filter_per_channel_ = (weight_quant_params.size() > 1);
int init_size = filter_per_channel_ ? col : 1;
quant_.filter_scale_ = reinterpret_cast<float *>(malloc(init_size * sizeof(float)));
if (quant_.filter_scale_ == nullptr) {
return RET_ERROR;
}
quant_.filter_zp_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t)));
if (quant_.filter_zp_ == nullptr) {
return RET_ERROR;
}
quant_.left_shift_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t)));
if (quant_.left_shift_ == nullptr) {
return RET_ERROR;
}
quant_.right_shift_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t)));
if (quant_.right_shift_ == nullptr) {
return RET_ERROR;
}
quant_.quant_multiplier_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t)));
if (quant_.quant_multiplier_ == nullptr) {
return RET_ERROR;
}
return RET_OK;
}
int FullconnectionInt8CPUKernel::Init() {
auto ret = MallocQuantParam();
param_->batch = 1;
param_->a_transpose_ = false;
param_->b_transpose_ = true;
InitParameter();
auto ret = MatmulBaseInt8CPUKernel::Init();
if (ret != RET_OK) {
FreeQuantParam();
MS_LOG(ERROR) << "ParallelLaunch failed";
return ret;
}
auto in_quant_params = in_tensors_.at(0)->quant_params();
quant_.input_.zp_ = in_quant_params.front().zeroPoint;
quant_.input_.scale_ = in_quant_params.front().scale;
auto out_quant_params = out_tensors_.at(0)->quant_params();
quant_.output_.zp_ = out_quant_params.front().zeroPoint;
quant_.output_.scale_ = out_quant_params.front().scale;
auto weight_tensor = in_tensors_.at(1);
fc_param_->b_const_ = (weight_tensor->data_c() != nullptr);
int weight_quant_num = filter_per_channel_ ? weight_tensor->shape().front() : 1;
auto weight_quant_params = weight_tensor->quant_params();
for (int i = 0; i < weight_quant_num; i++) {
quant_.filter_zp_[i] = weight_quant_params[i].zeroPoint;
quant_.filter_scale_[i] = weight_quant_params[i].scale;
}
for (int i = 0; i < weight_quant_num; ++i) {
const double in_scale = static_cast<double>(quant_.input_.scale_ * quant_.filter_scale_[i]);
double real_multiplier = in_scale / static_cast<double>(quant_.output_.scale_);
QuantizeRoundParameterWithDoublePrecision(real_multiplier, &quant_.quant_multiplier_[i], &quant_.left_shift_[i],
&quant_.right_shift_[i]);
}
CalculateActivationRangeQuantized(fc_param_->act_type_ == ActType_Relu, fc_param_->act_type_ == ActType_Relu6,
quant_.output_.zp_, quant_.output_.scale_, &quant_.out_act_min_,
&quant_.out_act_max_);
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
}
void FullconnectionInt8CPUKernel::InitParam() {
int FullconnectionInt8CPUKernel::ReSize() {
int row = 1;
for (size_t i = 0; i < out_tensors_.at(0)->shape().size() - 1; ++i) {
row *= (out_tensors_.at(0)->shape()).at(i);
}
fc_param_->row_ = row;
fc_param_->col_ = out_tensors_.at(0)->shape().back();
fc_param_->deep_ = (in_tensors_.at(1)->shape()).at(1);
param_->row_ = row;
param_->col_ = out_tensors_.at(0)->shape().back();
param_->deep_ = (in_tensors_.at(1)->shape()).at(1);
fc_param_->row_4_ = UP_ROUND(fc_param_->row_, C4NUM);
fc_param_->col_4_ = UP_ROUND(fc_param_->col_, C4NUM);
fc_param_->col_8_ = UP_ROUND(fc_param_->col_, C8NUM);
fc_param_->deep_16_ = UP_ROUND(fc_param_->deep_, C16NUM);
thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(fc_param_->col_4_, C4NUM));
thread_stride_ = UP_DIV(UP_DIV(fc_param_->col_4_, C4NUM), thread_count_);
return;
}
int FullconnectionInt8CPUKernel::ReSize() {
FreeTmpBuffer();
InitParam();
pack_a_ptr_ = reinterpret_cast<int8_t *>(malloc(fc_param_->row_4_ * fc_param_->deep_16_ * sizeof(int8_t)));
if (pack_a_ptr_ == nullptr) {
FreeTmpBuffer();
return RET_ERROR;
}
pack_b_ptr_ = reinterpret_cast<int8_t *>(malloc(fc_param_->col_4_ * fc_param_->deep_16_ * sizeof(int8_t)));
if (pack_b_ptr_ == nullptr) {
FreeTmpBuffer();
return RET_ERROR;
}
input_sums_ = reinterpret_cast<int *>(malloc(fc_param_->row_4_ * sizeof(int)));
if (input_sums_ == nullptr) {
FreeTmpBuffer();
return RET_ERROR;
}
weight_bias_sums_ = reinterpret_cast<int *>(malloc(fc_param_->col_4_ * sizeof(int)));
if (weight_bias_sums_ == nullptr) {
FreeTmpBuffer();
return RET_ERROR;
}
memset(pack_a_ptr_, 0, fc_param_->row_4_ * fc_param_->deep_16_ * sizeof(int8_t));
memset(pack_b_ptr_, 0, fc_param_->col_4_ * fc_param_->deep_16_ * sizeof(int8_t));
memset(input_sums_, 0, fc_param_->row_4_ * sizeof(int));
memset(weight_bias_sums_, 0, fc_param_->col_4_ * sizeof(int));
if (in_tensors_.size() == 3) {
bias_ptr_ = reinterpret_cast<int *>(malloc(fc_param_->col_4_ * sizeof(int)));
if (bias_ptr_ == nullptr) {
MS_LOG(ERROR) << "Memory allocation failed";
FreeTmpBuffer();
return RET_MEMORY_FAILED;
}
memcpy(bias_ptr_, in_tensors_.at(2)->data_c(), fc_param_->col_ * sizeof(int));
} else {
bias_ptr_ = nullptr;
}
if (fc_param_->b_const_) {
auto weight_data = reinterpret_cast<int8_t *>(in_tensors_.at(1)->data_c());
RowMajor2Row16x4MajorInt8(weight_data, pack_b_ptr_, fc_param_->col_, fc_param_->deep_);
CalcWeightBiasSums(weight_data, fc_param_->deep_, fc_param_->col_, quant_.input_.zp_, quant_.filter_zp_, bias_ptr_,
weight_bias_sums_, ColMajor, filter_per_channel_);
}
return RET_OK;
}
int FullconnectionInt8CPUKernel::RunImpl(int task_id) {
int stride = thread_stride_ * C4NUM;
int cur_stride = task_id * stride;
int res_stride = fc_param_->col_ - cur_stride;
int cur_oc = MSMIN(stride, res_stride);
if (cur_oc <= 0) {
return RET_OK;
}
int32_t *cur_left = filter_per_channel_ ? quant_.left_shift_ + cur_stride : quant_.left_shift_;
int32_t *cur_right = filter_per_channel_ ? quant_.right_shift_ + cur_stride : quant_.right_shift_;
int32_t *cur_mul = filter_per_channel_ ? quant_.quant_multiplier_ + cur_stride : quant_.quant_multiplier_;
int32_t *cur_zp = filter_per_channel_ ? quant_.filter_zp_ + cur_stride : quant_.filter_zp_;
MatmulInt8Opt(pack_a_ptr_, pack_b_ptr_ + cur_stride * fc_param_->deep_16_, c_ptr_ + cur_stride, fc_param_->row_,
cur_oc, fc_param_->deep_16_, input_sums_, weight_bias_sums_ + cur_stride, quant_.out_act_min_,
quant_.out_act_max_, quant_.output_.zp_, cur_mul, cur_left, cur_right, fc_param_->col_,
filter_per_channel_, cur_zp);
return RET_OK;
}
int FcInt8Run(void *cdata, int task_id) {
auto fc = reinterpret_cast<FullconnectionInt8CPUKernel *>(cdata);
auto ret = fc->RunImpl(task_id);
auto ret = MatmulBaseInt8CPUKernel::ReSize();
if (ret != RET_OK) {
MS_LOG(ERROR) << "FcInt8Run error task_id[" << task_id << "] error_code[" << ret << "]";
return ret;
}
return RET_OK;
}
int FullconnectionInt8CPUKernel::Run() {
auto input_ptr = reinterpret_cast<int8_t *>(in_tensors_.at(0)->data_c());
RowMajor2Row16x4MajorInt8(input_ptr, pack_a_ptr_, fc_param_->row_, fc_param_->deep_);
int32_t tmp_weight_zp = filter_per_channel_ ? 1 : quant_.filter_zp_[0];
CalcInputSums(input_ptr, fc_param_->row_, fc_param_->deep_, tmp_weight_zp, input_sums_, RowMajor);
if (!fc_param_->b_const_) {
auto weight_data = reinterpret_cast<int8_t *>(in_tensors_.at(1)->data_c());
RowMajor2Row16x4MajorInt8(weight_data, pack_b_ptr_, fc_param_->col_, fc_param_->deep_);
CalcWeightBiasSums(weight_data, fc_param_->deep_, fc_param_->col_, quant_.input_.zp_, quant_.filter_zp_, bias_ptr_,
weight_bias_sums_, ColMajor, filter_per_channel_);
}
c_ptr_ = reinterpret_cast<int8_t *>(out_tensors_.at(0)->data_c());
auto ret = ParallelLaunch(this->context_->thread_pool_, FcInt8Run, this, thread_count_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "ParallelLaunch failed";
MS_LOG(ERROR) << "MatmulBaseInt8CPUKernel failed";
return ret;
}
return RET_OK;

View File

@@ -18,52 +18,19 @@
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_FULLCONNECTION_INT8_H_
#include <vector>
#include "src/lite_kernel.h"
#include "include/errorcode.h"
#include "mindspore/lite/nnacl/int8/quantize.h"
#include "nnacl/common_func.h"
#include "nnacl/int8/common_func_int8.h"
#include "nnacl/int8/matmul_int8.h"
#include "src/runtime/kernel/arm/int8/matmul_base_int8.h"
namespace mindspore::kernel {
class FullconnectionInt8CPUKernel : public LiteKernel {
class FullconnectionInt8CPUKernel : public MatmulBaseInt8CPUKernel {
public:
FullconnectionInt8CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const mindspore::lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: LiteKernel(parameter, inputs, outputs, ctx, primitive) {
fc_param_ = reinterpret_cast<MatMulParameter *>(op_parameter_);
}
~FullconnectionInt8CPUKernel() override {
FreeTmpBuffer();
FreeQuantParam();
}
: MatmulBaseInt8CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
~FullconnectionInt8CPUKernel() override = default;
int Init() override;
int ReSize() override;
int Run() override;
public:
int RunImpl(int task_id);
private:
void InitParam();
void FreeTmpBuffer();
void FreeQuantParam();
int MallocQuantParam();
private:
MatMulParameter *fc_param_ = nullptr;
MatmulQuantParameter quant_;
int thread_count_ = 1;
int thread_stride_ = 0;
int8_t *pack_a_ptr_ = nullptr;
int8_t *pack_b_ptr_ = nullptr;
int8_t *c_ptr_ = nullptr;
int *input_sums_ = nullptr;
int *weight_bias_sums_ = nullptr;
int *bias_ptr_ = nullptr;
bool filter_per_channel_ = true;
};
} // namespace mindspore::kernel

View File

@@ -0,0 +1,323 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/int8/matmul_base_int8.h"
#include "src/runtime/runtime_api.h"
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_MEMORY_FAILED;
using mindspore::lite::RET_OK;
namespace mindspore::kernel {
int MatmulBaseInt8Run(void *cdata, int task_id) {
auto op = reinterpret_cast<MatmulBaseInt8CPUKernel *>(cdata);
auto ret = op->RunImpl(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "MatmulInt8Run error task_id[" << task_id << "] error_code[" << ret << "]";
return ret;
}
return RET_OK;
}
int MatmulBaseInt8CPUKernel::RunImpl(int task_id) {
int stride = thread_stride_ * C4NUM;
int cur_stride = task_id * stride;
int res_stride = param_->col_ - cur_stride;
int cur_oc = MSMIN(stride, res_stride);
if (cur_oc <= 0) {
return RET_OK;
}
int32_t *cur_left = filter_per_channel_ ? quant_.left_shift_ + cur_stride : quant_.left_shift_;
int32_t *cur_right = filter_per_channel_ ? quant_.right_shift_ + cur_stride : quant_.right_shift_;
int32_t *cur_mul = filter_per_channel_ ? quant_.quant_multiplier_ + cur_stride : quant_.quant_multiplier_;
int32_t *cur_zp = filter_per_channel_ ? quant_.filter_zp_ + cur_stride : quant_.filter_zp_;
MatmulInt8Opt(pack_a_ptr_, batch_b_ptr_ + cur_stride * param_->deep_16_, batch_c_ptr_ + cur_stride, param_->row_,
cur_oc, param_->deep_16_, input_sums_, weight_bias_sums_ + cur_stride, quant_.out_act_min_,
quant_.out_act_max_, quant_.output_.zp_, cur_mul, cur_left, cur_right, param_->col_,
filter_per_channel_, cur_zp);
return RET_OK;
}
MatmulBaseInt8CPUKernel::~MatmulBaseInt8CPUKernel() {
FreeQuantParam();
FreeTmpBuffer();
if (bias_ptr_ != nullptr) {
free(bias_ptr_);
bias_ptr_ = nullptr;
}
return;
}
void MatmulBaseInt8CPUKernel::FreeQuantParam() {
if (quant_.filter_scale_ != nullptr) {
free(quant_.filter_scale_);
quant_.filter_scale_ = nullptr;
}
if (quant_.filter_zp_ != nullptr) {
free(quant_.filter_zp_);
quant_.filter_zp_ = nullptr;
}
if (quant_.left_shift_ != nullptr) {
free(quant_.left_shift_);
quant_.left_shift_ = nullptr;
}
if (quant_.right_shift_ != nullptr) {
free(quant_.right_shift_);
quant_.right_shift_ = nullptr;
}
if (quant_.quant_multiplier_ != nullptr) {
free(quant_.quant_multiplier_);
quant_.quant_multiplier_ = nullptr;
}
return;
}
int MatmulBaseInt8CPUKernel::MallocQuantParam() {
auto weight_tensor = in_tensors_.at(1);
auto weight_quant_params = weight_tensor->quant_params();
int col = weight_tensor->shape().front();
filter_per_channel_ = (weight_quant_params.size() > 1);
int init_size = filter_per_channel_ ? col : 1;
quant_.filter_scale_ = reinterpret_cast<float *>(malloc(init_size * sizeof(float)));
if (quant_.filter_scale_ == nullptr) {
return RET_ERROR;
}
quant_.filter_zp_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t)));
if (quant_.filter_zp_ == nullptr) {
return RET_ERROR;
}
quant_.left_shift_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t)));
if (quant_.left_shift_ == nullptr) {
return RET_ERROR;
}
quant_.right_shift_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t)));
if (quant_.right_shift_ == nullptr) {
return RET_ERROR;
}
quant_.quant_multiplier_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t)));
if (quant_.quant_multiplier_ == nullptr) {
return RET_ERROR;
}
return RET_OK;
}
void MatmulBaseInt8CPUKernel::InitQuantParam() {
auto in_quant_params = in_tensors_.at(0)->quant_params();
quant_.input_.zp_ = in_quant_params.front().zeroPoint;
quant_.input_.scale_ = in_quant_params.front().scale;
auto out_quant_params = out_tensors_.at(0)->quant_params();
quant_.output_.zp_ = out_quant_params.front().zeroPoint;
quant_.output_.scale_ = out_quant_params.front().scale;
auto weight_tensor = in_tensors_.at(1);
int weight_quant_num = filter_per_channel_ ? weight_tensor->shape().front() : 1;
auto weight_quant_params = weight_tensor->quant_params();
for (int i = 0; i < weight_quant_num; i++) {
quant_.filter_zp_[i] = weight_quant_params[i].zeroPoint;
quant_.filter_scale_[i] = weight_quant_params[i].scale;
}
for (int i = 0; i < weight_quant_num; ++i) {
const double in_scale = static_cast<double>(quant_.input_.scale_ * quant_.filter_scale_[i]);
double real_multiplier = in_scale / static_cast<double>(quant_.output_.scale_);
QuantizeRoundParameterWithDoublePrecision(real_multiplier, &quant_.quant_multiplier_[i], &quant_.left_shift_[i],
&quant_.right_shift_[i]);
}
CalculateActivationRangeQuantized(param_->act_type_ == ActType_Relu, param_->act_type_ == ActType_Relu6,
quant_.output_.zp_, quant_.output_.scale_, &quant_.out_act_min_,
&quant_.out_act_max_);
}
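Editor's note (illustrative, not part of the patch): QuantizeRoundParameterWithDoublePrecision above splits the real multiplier in_scale * w_scale / out_scale into an int32 fixed-point multiplier plus left/right shifts. A hedged sketch of the usual frexp-based decomposition follows; it reflects the common gemmlowp/TFLite scheme, and nnacl's exact rounding rules may differ:
#include <math.h>
#include <stdint.h>
/* Editor's sketch only: split real_multiplier into a Q31 significand and shifts. */
static void SplitRealMultiplier(double real_multiplier, int32_t *multiplier,
                                int32_t *left_shift, int32_t *right_shift) {
  int shift = 0;
  double q = frexp(real_multiplier, &shift);        /* real = q * 2^shift, q in [0.5, 1) */
  int64_t q31 = (int64_t)llround(q * (1ll << 31));  /* Q31 fixed-point significand */
  if (q31 == (1ll << 31)) {                         /* rounding pushed q up to 1.0 */
    q31 /= 2;
    shift += 1;
  }
  *multiplier = (int32_t)q31;
  *left_shift = shift > 0 ? shift : 0;              /* positive exponent -> pre-shift left */
  *right_shift = shift > 0 ? 0 : -shift;            /* negative exponent -> post-shift right */
}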
void MatmulBaseInt8CPUKernel::InitParameter() {
param_->a_const_ = (in_tensors_[0]->data_c() != nullptr);
param_->b_const_ = (in_tensors_[1]->data_c() != nullptr);
return;
}
void MatmulBaseInt8CPUKernel::ResizeParameter() {
param_->row_align_ = UP_ROUND(param_->row_, C4NUM);
param_->col_align_ = UP_ROUND(param_->col_, C4NUM);
param_->deep_16_ = UP_ROUND(param_->deep_, C16NUM);
thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(param_->col_align_, C4NUM));
thread_stride_ = UP_DIV(UP_DIV(param_->col_align_, C4NUM), thread_count_);
return;
}
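Editor's note (illustrative, not part of the patch): ResizeParameter above and RunImpl earlier in this file split the output columns into C4NUM-wide blocks and distribute the blocks over threads. A worked example with hypothetical sizes col_ = 15 and thread_num_ = 2:
#include <stdio.h>
/* Editor's sketch only: column partitioning with col = 15, 2 threads, C4NUM = 4. */
int main(void) {
  const int kC4 = 4, col = 15, thread_num = 2;
  int col_align = (col + kC4 - 1) / kC4 * kC4;                    /* UP_ROUND -> 16 */
  int blocks = (col_align + kC4 - 1) / kC4;                       /* UP_DIV   -> 4  */
  int thread_count = blocks < thread_num ? blocks : thread_num;   /* MSMIN    -> 2  */
  int thread_stride = (blocks + thread_count - 1) / thread_count; /* UP_DIV   -> 2  */
  for (int task_id = 0; task_id < thread_count; task_id++) {
    int cur_stride = task_id * thread_stride * kC4;
    int res_stride = col - cur_stride;
    int cur_oc = res_stride < thread_stride * kC4 ? res_stride : thread_stride * kC4;
    if (cur_oc > 0) {
      printf("task %d handles columns %d..%d\n", task_id, cur_stride, cur_stride + cur_oc - 1);
    }
  }
  return 0;  /* prints: task 0 handles columns 0..7, task 1 handles columns 8..14 */
}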
void MatmulBaseInt8CPUKernel::FreeTmpBuffer() {
if (pack_a_ptr_ != nullptr) {
free(pack_a_ptr_);
pack_a_ptr_ = nullptr;
}
if (pack_b_ptr_ != nullptr) {
free(pack_b_ptr_);
pack_b_ptr_ = nullptr;
}
if (input_sums_ != nullptr) {
free(input_sums_);
input_sums_ = nullptr;
}
if (weight_bias_sums_ != nullptr) {
free(weight_bias_sums_);
weight_bias_sums_ = nullptr;
}
return;
}
void MatmulBaseInt8CPUKernel::TransferB() {
auto weight_data = reinterpret_cast<int8_t *>(in_tensors_.at(1)->data_c());
for (int i = 0; i < param_->batch; i++) {
auto current_weight = weight_data + i * param_->deep_ * param_->col_;
auto current_b_pack = pack_b_ptr_ + i * param_->col_align_ * param_->deep_16_;
auto current_sums = weight_bias_sums_ + i * param_->col_align_;
if (param_->b_transpose_) {
RowMajor2Row16x4MajorInt8(current_weight, current_b_pack, param_->col_, param_->deep_);
CalcWeightBiasSums(current_weight, param_->deep_, param_->col_, quant_.input_.zp_, quant_.filter_zp_, bias_ptr_,
current_sums, ColMajor, filter_per_channel_);
} else {
RowMajor2Col16x4MajorInt8(current_weight, param_->deep_, param_->col_, current_b_pack);
CalcWeightBiasSums(current_weight, param_->deep_, param_->col_, quant_.input_.zp_, quant_.filter_zp_, bias_ptr_,
current_sums, RowMajor, false);
}
}
return;
}
int MatmulBaseInt8CPUKernel::InitTmpBuffer() {
pack_a_ptr_ = reinterpret_cast<int8_t *>(malloc(param_->row_align_ * param_->deep_16_ * sizeof(int8_t)));
if (pack_a_ptr_ == nullptr) {
FreeTmpBuffer();
return RET_ERROR;
}
pack_b_ptr_ =
reinterpret_cast<int8_t *>(malloc(param_->batch * param_->col_align_ * param_->deep_16_ * sizeof(int8_t)));
if (pack_b_ptr_ == nullptr) {
FreeTmpBuffer();
return RET_ERROR;
}
input_sums_ = reinterpret_cast<int *>(malloc(param_->row_align_ * sizeof(int)));
if (input_sums_ == nullptr) {
FreeTmpBuffer();
return RET_ERROR;
}
weight_bias_sums_ = reinterpret_cast<int *>(malloc(param_->batch * param_->col_align_ * sizeof(int)));
if (weight_bias_sums_ == nullptr) {
FreeTmpBuffer();
return RET_ERROR;
}
memset(pack_a_ptr_, 0, param_->row_align_ * param_->deep_16_ * sizeof(int8_t));
memset(pack_b_ptr_, 0, param_->batch * param_->col_align_ * param_->deep_16_ * sizeof(int8_t));
memset(input_sums_, 0, param_->row_align_ * sizeof(int));
memset(weight_bias_sums_, 0, param_->batch * param_->col_align_ * sizeof(int));
return RET_OK;
}
int MatmulBaseInt8CPUKernel::InitBias() {
if (in_tensors_.size() == 3) {
auto bias_tensor = in_tensors_[2];
int max_bias_data = UP_ROUND(bias_tensor->ElementsNum(), C4NUM);
bias_ptr_ = reinterpret_cast<int *>(malloc(max_bias_data * sizeof(int)));
if (bias_ptr_ == nullptr) {
MS_LOG(ERROR) << "Memory allocation failed";
FreeTmpBuffer();
return RET_MEMORY_FAILED;
}
memcpy(bias_ptr_, bias_tensor->data_c(), bias_tensor->ElementsNum() * sizeof(int));
} else {
bias_ptr_ = nullptr;
}
return RET_OK;
}
int MatmulBaseInt8CPUKernel::Init() {
auto ret = MallocQuantParam();
if (ret != RET_OK) {
FreeQuantParam();
return ret;
}
InitQuantParam();
ret = InitBias();
if (ret != RET_OK) {
FreeQuantParam();
return ret;
}
return RET_OK;
}
int MatmulBaseInt8CPUKernel::ReSize() {
FreeTmpBuffer();
ResizeParameter();
auto ret = InitTmpBuffer();
if (ret != RET_OK) {
FreeQuantParam();
return ret;
}
if (param_->b_const_ == true) {
TransferB();
}
return RET_OK;
}
int MatmulBaseInt8CPUKernel::Run() {
if (param_->b_const_ == false) {
TransferB();
}
int8_t *a_ptr = reinterpret_cast<int8_t *>(in_tensors_.at(0)->data_c());
int8_t *c_ptr = reinterpret_cast<int8_t *>(out_tensors_.at(0)->data_c());
int32_t tmp_weight_zp = filter_per_channel_ ? 1 : quant_.filter_zp_[0];
for (int i = 0; i < param_->batch; i++) {
auto current_src_a = a_ptr + i * param_->row_ * param_->deep_;
if (param_->a_transpose_) {
RowMajor2Col16x4MajorInt8(current_src_a, param_->deep_, param_->row_, pack_a_ptr_);
CalcInputSums(current_src_a, param_->row_, param_->deep_, tmp_weight_zp, input_sums_, ColMajor);
} else {
RowMajor2Row16x4MajorInt8(current_src_a, pack_a_ptr_, param_->row_, param_->deep_);
CalcInputSums(current_src_a, param_->row_, param_->deep_, tmp_weight_zp, input_sums_, RowMajor);
}
batch_b_ptr_ = pack_b_ptr_ + i * param_->col_align_ * param_->deep_16_;
batch_sums_ = weight_bias_sums_ + i * param_->col_align_;
batch_c_ptr_ = c_ptr + i * param_->row_ * param_->col_;
auto ret = ParallelLaunch(this->context_->thread_pool_, MatmulBaseInt8Run, this, thread_count_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "MatmulInt8Run error: [" << ret << "]";
return ret;
}
}
return RET_OK;
}
} // namespace mindspore::kernel

View File

@@ -0,0 +1,82 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_MATMUL_BASE_INT8_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_MATMUL_BASE_INT8_H_
#include <vector>
#include "include/errorcode.h"
#include "include/context.h"
#include "src/lite_kernel.h"
#include "nnacl/matmul_parameter.h"
#include "nnacl/common_func.h"
#include "nnacl/int8/quantize.h"
#include "nnacl/int8/common_func_int8.h"
#include "nnacl/int8/matmul_int8.h"
namespace mindspore::kernel {
class MatmulBaseInt8CPUKernel : public LiteKernel {
public:
MatmulBaseInt8CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: LiteKernel(parameter, inputs, outputs, ctx, primitive) {
param_ = reinterpret_cast<MatMulParameter *>(op_parameter_);
}
~MatmulBaseInt8CPUKernel() override;
int Init() override;
int ReSize() override;
int Run() override;
public:
int RunImpl(int task_id);
protected:
void InitParameter();
private:
void ResizeParameter();
int InitBias();
private:
int InitTmpBuffer();
void FreeTmpBuffer();
void TransferA();
void TransferB();
private:
int MallocQuantParam();
void FreeQuantParam();
void InitQuantParam();
protected:
MatMulParameter *param_ = nullptr;
MatmulQuantParameter quant_;
int thread_count_ = 1;
int thread_stride_ = 0;
int8_t *pack_a_ptr_ = nullptr;
int8_t *pack_b_ptr_ = nullptr;
int *input_sums_ = nullptr;
int *weight_bias_sums_ = nullptr;
int *bias_ptr_ = nullptr;
bool filter_per_channel_ = true;
int8_t *batch_b_ptr_ = nullptr;
int8_t *batch_c_ptr_ = nullptr;
int *batch_sums_ = nullptr;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_MATMUL_BASE_INT8_H_

View File

@@ -22,46 +22,27 @@
#include "src/kernel_registry.h"
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_MEMORY_FAILED;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_MatMul;
namespace mindspore::kernel {
MatmulInt8CPUKernel::~MatmulInt8CPUKernel() { FreeTmpBuffer(); }
void MatmulInt8CPUKernel::FreeTmpBuffer() {
if (a_r4x16_ptr_ != nullptr) {
context_->allocator->Free(a_r4x16_ptr_);
a_r4x16_ptr_ = nullptr;
}
if (input_sums_ != nullptr) {
context_->allocator->Free(input_sums_);
input_sums_ = nullptr;
}
if (b_c16x4_batch_ != nullptr) {
context_->allocator->Free(b_c16x4_batch_);
b_c16x4_batch_ = nullptr;
}
if (weight_bias_sums_batch_ != nullptr) {
context_->allocator->Free(weight_bias_sums_batch_);
weight_bias_sums_batch_ = nullptr;
}
if (bias_ptr_ != nullptr) {
context_->allocator->Free(bias_ptr_);
bias_ptr_ = nullptr;
}
return;
}
int MatmulInt8CPUKernel::Init() {
InitParameter();
auto ret = MatmulBaseInt8CPUKernel::Init();
if (ret != RET_OK) {
MS_LOG(ERROR) << "ParallelLaunch failed";
return ret;
}
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
}
int MatmulInt8CPUKernel::ReSize() {
FreeTmpBuffer();
int batch = 1;
auto x_shape = in_tensors_.at(0)->shape();
auto o_shape = out_tensors_.at(0)->shape();
@@ -69,159 +50,19 @@ int MatmulInt8CPUKernel::ReSize() {
for (size_t i = 0; i < x_shape.size() - 2; ++i) {
batch *= x_shape[i];
}
params_->batch = batch;
param_->batch = batch;
MS_ASSERT(o_shape.size() >= 2);
params_->row_ = o_shape[o_shape.size() - 2];
params_->col_ = o_shape[o_shape.size() - 1];
params_->deep_ = params_->a_transpose_ ? x_shape[x_shape.size() - 2] : x_shape[x_shape.size() - 1];
params_->row_4_ = UP_ROUND(params_->row_, 4);
params_->col_4_ = UP_ROUND(params_->col_, 4);
params_->deep_16_ = UP_ROUND(params_->deep_, 16);
a_r4x16_ptr_ =
reinterpret_cast<int8_t *>(context_->allocator->Malloc(params_->row_4_ * params_->deep_16_ * sizeof(int8_t)));
if (!a_r4x16_ptr_) return RET_MEMORY_FAILED;
memset(a_r4x16_ptr_, 0, params_->row_4_ * params_->deep_16_ * sizeof(int8_t));
input_sums_ = reinterpret_cast<int *>(context_->allocator->Malloc(params_->row_4_ * sizeof(int)));
if (!input_sums_) return RET_MEMORY_FAILED;
memset(input_sums_, 0, params_->row_4_ * sizeof(int));
b_c16x4_batch_ = reinterpret_cast<int8_t *>(
context_->allocator->Malloc(params_->batch * params_->col_4_ * params_->deep_16_ * sizeof(int8_t)));
if (!b_c16x4_batch_) return RET_MEMORY_FAILED;
memset(b_c16x4_batch_, 0, params_->batch * params_->col_4_ * params_->deep_16_ * sizeof(int8_t));
weight_bias_sums_batch_ =
reinterpret_cast<int *>(context_->allocator->Malloc(params_->batch * params_->col_4_ * sizeof(int)));
if (!weight_bias_sums_batch_) return RET_MEMORY_FAILED;
memset(weight_bias_sums_batch_, 0, params_->batch * params_->col_4_ * sizeof(int));
if (in_tensors_.size() == 3) {
auto bias_size = params_->col_4_ * sizeof(int);
bias_ptr_ = reinterpret_cast<int *>(context_->allocator->Malloc(bias_size));
if (!bias_ptr_) return RET_MEMORY_FAILED;
memcpy(bias_ptr_, in_tensors_[2]->data_c(), bias_size);
} else {
bias_ptr_ = NULL;
}
thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(params_->col_4_, 4));
thread_stride_ = UP_DIV(UP_DIV(params_->col_4_, 4), thread_count_);
param_->row_ = o_shape[o_shape.size() - 2];
param_->col_ = o_shape[o_shape.size() - 1];
param_->deep_ = param_->a_transpose_ ? x_shape[x_shape.size() - 2] : x_shape[x_shape.size() - 1];
auto input_tensor = in_tensors_.at(0);
auto params = input_tensor->quant_params();
MS_ASSERT(params.size() == 1);
quant_params_.input.zp_ = params.front().zeroPoint;
quant_params_.input.scale_ = params.front().scale;
auto weight_tensor = in_tensors_.at(1);
params = weight_tensor->quant_params();
MS_ASSERT(params.size() == 1);
quant_params_.weight.zp_ = params.front().zeroPoint;
quant_params_.weight.scale_ = params.front().scale;
auto output_tensor = out_tensors_.at(0);
params = output_tensor->quant_params();
MS_ASSERT(params.size() == 1);
quant_params_.output.zp_ = params.front().zeroPoint;
quant_params_.output.scale_ = params.front().scale;
params_->b_const_ = (in_tensors_.at(1)->data_c() != nullptr);
if (params_->b_const_) {
auto b_ptr = reinterpret_cast<int8_t *>(in_tensors_.at(1)->data_c());
for (int i = 0; i < params_->batch; ++i) {
auto cur_b = b_ptr + i * params_->deep_ * params_->col_;
auto cur_b_pack = b_c16x4_batch_ + i * params_->col_4_ * params_->deep_16_;
auto cur_sums = weight_bias_sums_batch_ + i * params_->col_4_;
if (params_->b_transpose_) {
RowMajor2Row16x4MajorInt8(cur_b, cur_b_pack, params_->col_, params_->deep_);
CalcWeightBiasSums(cur_b, params_->deep_, params_->col_, quant_params_.input.zp_, &quant_params_.weight.zp_,
bias_ptr_, cur_sums, ColMajor, false);
} else {
RowMajor2Col16x4MajorInt8(cur_b, params_->deep_, params_->col_, cur_b_pack);
CalcWeightBiasSums(cur_b, params_->deep_, params_->col_, quant_params_.input.zp_, &quant_params_.weight.zp_,
bias_ptr_, cur_sums, RowMajor, false);
}
}
}
double real_multiplier = quant_params_.input.scale_ * quant_params_.weight.scale_ / quant_params_.output.scale_;
QuantizeRoundParameterWithDoublePrecision(real_multiplier, &quant_params_.quant_multiplier, &quant_params_.left_shift,
&quant_params_.right_shift);
return RET_OK;
}
int MatmulInt8CPUKernel::RunImpl(int task_id) {
int cur_oc = MSMIN(thread_stride_, UP_DIV(params_->col_4_, 4) - task_id * thread_stride_);
if (cur_oc <= 0) {
return RET_OK;
}
int cur_oc_res = MSMIN(thread_stride_ * C4NUM, params_->col_ - task_id * thread_stride_ * C4NUM);
auto cur_b = b_c16x4_ptr_ + task_id * thread_stride_ * 4 * params_->deep_16_;
auto cur_bias = weight_bias_sums_ + task_id * thread_stride_ * 4;
auto cur_c = c_ptr_ + task_id * thread_stride_ * 4;
auto &p = quant_params_;
#ifdef ENABLE_ARM64
MatmulInt8Neon64(a_r4x16_ptr_, cur_b, cur_c, params_->row_4_, cur_oc * C4NUM, params_->deep_16_, input_sums_,
cur_bias, INT8_MIN, INT8_MAX, p.output.zp_, &p.quant_multiplier, &p.left_shift, &p.right_shift,
params_->row_, cur_oc_res, params_->col_ * sizeof(int8_t), false);
#else
MatMulInt8_16x4_r(a_r4x16_ptr_, cur_b, cur_c, params_->row_, cur_oc_res, params_->deep_16_, params_->col_,
input_sums_, cur_bias, &p.left_shift, &p.right_shift, &p.quant_multiplier, p.output.zp_, INT8_MIN,
INT8_MAX, false);
#endif
return RET_OK;
}
int MatmulInt8Run(void *cdata, int task_id) {
auto op = reinterpret_cast<MatmulInt8CPUKernel *>(cdata);
auto ret = op->RunImpl(task_id);
auto ret = MatmulBaseInt8CPUKernel::ReSize();
if (ret != RET_OK) {
MS_LOG(ERROR) << "MatmulInt8Run error task_id[" << task_id << "] error_code[" << ret << "]";
MS_LOG(ERROR) << "MatmulBaseInt8CPUKernel failed";
return ret;
}
return RET_OK;
}
int MatmulInt8CPUKernel::Run() {
auto a_ptr = reinterpret_cast<int8_t *>(in_tensors_.at(0)->data_c());
auto c_ptr = reinterpret_cast<int8_t *>(out_tensors_.at(0)->data_c());
auto a_stride = params_->row_ * params_->deep_;
auto b_stride = params_->deep_ * params_->col_;
auto c_stride = params_->row_ * params_->col_;
if (!params_->b_const_) {
auto b_ptr = reinterpret_cast<int8_t *>(in_tensors_.at(1)->data_c());
for (int i = 0; i < params_->batch; ++i) {
auto cur_b = b_ptr + i * b_stride;
auto cur_b_pack = b_c16x4_batch_ + i * params_->col_4_ * params_->deep_16_;
auto cur_sums = weight_bias_sums_batch_ + i * params_->col_4_;
if (params_->b_transpose_) {
RowMajor2Row16x4MajorInt8(cur_b, cur_b_pack, params_->col_, params_->deep_);
CalcWeightBiasSums(cur_b, params_->deep_, params_->col_, quant_params_.input.zp_, &quant_params_.weight.zp_,
bias_ptr_, cur_sums, ColMajor, false);
} else {
RowMajor2Col16x4MajorInt8(cur_b, params_->deep_, params_->col_, cur_b_pack);
CalcWeightBiasSums(cur_b, params_->deep_, params_->col_, quant_params_.input.zp_, &quant_params_.weight.zp_,
bias_ptr_, cur_sums, RowMajor, false);
}
}
}
for (int i = 0; i < params_->batch; ++i) {
auto cur_a_ptr = a_ptr + i * a_stride;
if (params_->a_transpose_) {
RowMajor2Col16x4MajorInt8(cur_a_ptr, params_->deep_, params_->row_, a_r4x16_ptr_);
CalcInputSums(cur_a_ptr, params_->row_, params_->deep_, quant_params_.weight.zp_, input_sums_, ColMajor);
} else {
RowMajor2Row16x4MajorInt8(cur_a_ptr, a_r4x16_ptr_, params_->row_, params_->deep_);
CalcInputSums(cur_a_ptr, params_->row_, params_->deep_, quant_params_.weight.zp_, input_sums_, RowMajor);
}
b_c16x4_ptr_ = b_c16x4_batch_ + i * params_->col_4_ * params_->deep_16_;
weight_bias_sums_ = weight_bias_sums_batch_ + i * params_->col_4_;
c_ptr_ = c_ptr + i * c_stride;
auto ret = ParallelLaunch(this->context_->thread_pool_, MatmulInt8Run, this, thread_count_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "MatmulInt8Run error: [" << ret << "]";
return ret;
}
}
return RET_OK;
}
REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_MatMul, LiteKernelCreator<MatmulInt8CPUKernel>)
} // namespace mindspore::kernel

View File

@@ -22,39 +22,18 @@
#include "nnacl/matmul_parameter.h"
#include "mindspore/lite/nnacl/int8/quantize.h"
#include "src/lite_kernel.h"
#include "src/runtime/kernel/arm/int8/matmul_base_int8.h"
using mindspore::lite::InnerContext;
namespace mindspore::kernel {
class MatmulInt8CPUKernel : public LiteKernel {
class MatmulInt8CPUKernel : public MatmulBaseInt8CPUKernel {
public:
MatmulInt8CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: LiteKernel(parameter, inputs, outputs, ctx, primitive) {
params_ = reinterpret_cast<MatMulParameter *>(op_parameter_);
}
~MatmulInt8CPUKernel() override;
: MatmulBaseInt8CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
~MatmulInt8CPUKernel() override = default;
int Init() override;
int ReSize() override;
int Run() override;
int RunImpl(int task_id);
private:
void FreeTmpBuffer();
private:
MatMulParameter *params_ = nullptr;
MatmulQuantArg quant_params_;
int8_t *a_r4x16_ptr_ = nullptr;
int8_t *b_c16x4_ptr_ = nullptr;
int8_t *c_ptr_ = nullptr;
int8_t *b_c16x4_batch_ = nullptr;
int *bias_ptr_ = nullptr;
int *input_sums_ = nullptr;
int *weight_bias_sums_ = nullptr;
int *weight_bias_sums_batch_ = nullptr;
int thread_stride_ = 0;
int thread_count_ = 0;
};
} // namespace mindspore::kernel

View File

@@ -599,9 +599,9 @@ function Run_x86() {
echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib:./third_party/libjpeg-turbo/lib:./third_party/opencv/lib;./benchmark/benchmark --modelFile='${ms_models_path}'/'${model_name}'.ms --inDataFile=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/input/'${model_name}'.ms.bin --benchmarkDataFile=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/'${model_name}'.ms.out --accuracyThreshold=${accuracy_limit}' >> "${run_x86_log_file}"
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib:./third_party/libjpeg-turbo/lib:./third_party/opencv/lib;./benchmark/benchmark --modelFile=${ms_models_path}/${model_name}_weightquant.ms --inDataFile=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/input/${model_name}.ms.bin --benchmarkDataFile=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/${model_name}.ms.out --accuracyThreshold=${accuracy_limit} >> "${run_x86_log_file}"
if [ $? = 0 ]; then
run_result='x86: '${model_name}'[weight quant] pass'; echo ${run_result} >> ${run_benchmark_result_file}
run_result='x86: '${model_name}'[weight_quant] pass'; echo ${run_result} >> ${run_benchmark_result_file}
else
run_result='x86: '${model_name}'[weight quant] failed'; echo ${run_result} >> ${run_benchmark_result_file}; return 1
run_result='x86: '${model_name}'[weight_quant] failed'; echo ${run_result} >> ${run_benchmark_result_file}; return 1
fi
done < ${models_mindspore_weightquant_config}

View File

@@ -79,58 +79,6 @@ void MMInt8TestInit(std::vector<lite::Tensor *> *inputs, std::vector<lite::Tenso
delete[] weight_data;
}
TEST_F(TestMatmulInt8, simple) {
#define ROW 10
#define COL 15
#define DEPTH 10
#define ROW4 UP_ROUND(ROW, 4)
#define COL4 UP_ROUND(COL, 4)
#define DEPTH16 UP_ROUND(DEPTH, 16)
int8_t a[ROW * DEPTH] = {-3, -3, 0, -2, -4, -2, 1, 0, -1, 0, 5, 1, 3, 4, 4, -3, -5, 2, -2, 4,
4, 5, 1, -1, 5, 5, 2, -1, 0, 4, -4, 2, 5, -2, 5, 3, -1, 2, -4, 5,
-5, 4, 5, 3, 5, 4, -2, 5, 5, -5, -5, -5, 2, -4, -3, 3, -3, -5, 5, 0,
2, -4, 4, 2, -5, 3, -1, 3, -3, 2, -5, -4, 0, -5, 2, 4, 0, -5, -1, 4,
3, 5, 5, 2, -5, -5, -4, -5, 3, 3, 3, 0, -2, 0, -2, -3, -2, 3, 5, -5};
int8_t b[DEPTH * COL] = {1, 2, -2, -5, -4, 2, 3, 2, -5, 4, -5, 4, 1, -2, 1, 5, 5, 5, 2, 5, -3, -3,
-1, -3, -1, 0, -4, 0, 1, -2, -2, -3, -5, 1, 1, 0, 4, 5, -3, -1, 4, 3, 5, 4,
2, 4, -3, -4, 1, 4, -4, 5, -1, -2, 3, 5, 5, 2, 1, -4, 1, 2, -3, 0, -2, 4,
-3, -3, 1, 3, 4, -1, 3, 1, -5, -1, 2, 0, 0, 5, -1, -5, 5, -5, 0, 3, -3, 4,
3, 1, -3, -3, 2, -2, -3, -3, 3, 4, 2, -1, 2, 0, -2, 4, 5, 3, -1, -3, -2, -1,
4, 3, -5, 1, 0, 0, -1, -4, -3, -2, 5, 3, 2, 1, -4, 1, 4, 5, -1, 2, -2, 2,
1, -2, 5, 2, -4, -4, 1, 1, 2, -1, -5, -4, 4, 1, -3, 4, -1, -4};
int8_t correct[ROW * COL] = {
-36, -33, 11, 4, -12, -7, 11, 0, 37, -30, -13, -2, -30, -3, 29, 46, -13, -84, -8, 6, 39, 26,
-67, -48, 57, 12, 32, 44, -24, -85, 22, 32, -8, -8, 20, 10, -45, 12, -69, 36, 22, -37, 58, 27,
-24, -11, -22, -50, 26, 50, 28, -56, -42, -23, -1, 70, -58, 54, 35, -61, 54, 40, -11, 35, 43, 3,
7, 30, -7, -13, 73, -3, 26, 26, -11, -37, 0, 19, 34, -4, 0, -22, 71, 8, -25, -6, -5, 31,
8, 63, -25, -55, -62, -17, 23, 1, 36, 12, -38, 2, 11, 27, 18, 5, 4, -59, -17, 1, 25, 9,
13, -77, 13, 9, -11, 26, -52, 42, 28, 6, 44, 4, 2, 26, 19, -31, 46, 23, -57, 15, -31, 39,
40, -9, 8, 38, 40, 27, -19, -47, 14, 50, 14, 18, 0, -59, 39, -48, -47, 35};
int8_t output[ROW * COL] = {0};
int8_t *a_r4x16 = new int8_t[ROW4 * DEPTH16];
memset(a_r4x16, 0, ROW4 * DEPTH16);
int8_t *b_c16x4 = new int8_t[COL4 * DEPTH16];
memset(b_c16x4, 0, COL4 * DEPTH16);
RowMajor2Row16x4MajorInt8(a, a_r4x16, ROW, DEPTH);
RowMajor2Col16x4MajorInt8(b, DEPTH, COL, b_c16x4);
int a_sums[ROW4] = {0};
int bias[COL4] = {0};
int multiplier, ls, rs;
QuantizeRoundParameterWithDoublePrecision(1.0f, &multiplier, &ls, &rs);
#ifdef ENABLE_ARM64
MatmulInt8Neon64(a_r4x16, b_c16x4, output, ROW4, COL4, DEPTH16, a_sums, bias, INT8_MIN, INT8_MAX, 0, &multiplier, &ls,
&rs, ROW, COL, COL, false);
#else
MatMulInt8_16x4_r(a_r4x16, b_c16x4, output, ROW, COL, DEPTH16, COL, a_sums, bias, &ls, &rs, &multiplier, 0, INT8_MIN,
INT8_MAX, false);
#endif
ASSERT_EQ(0, CompareOutputData(output, correct, ROW * COL, 0.1));
delete[] a_r4x16;
delete[] b_c16x4;
}
TEST_F(TestMatmulInt8, mmtest1) {
float in[] = {6.583835634764597, 11.337275140963907, -4.125256949459629, 10.994337291530833,
19.086065139532636, 3.620842999158455, 13.167624585590346, -18.326739299407755,