!30296 fix the DynamicMatmul4x4x16AIWI C++ kernel, the copy-bias bug, and the float_mode bug

Merge pull request !30296 from yeyunpeng2020/dynamic_quant
i-robot 2022-02-21 06:35:22 +00:00 committed by Gitee
commit 56a90003c4
8 changed files with 98 additions and 53 deletions

View File

@@ -20,7 +20,7 @@
 // void DynamicMatmulSdot4x4x16AIWI(const int8_t *a, const int8_t *b, float *out, size_t deep4, float *multi_scales,
 //                                  float *bias, size_t row, size_t col, size_t stride, const int *a_sums,
-//                                  const int *b_sums, size_t a_zp, size_t b_zp, size_t deep);
+//                                  const int *b_sums, int64_t a_zp, int64_t b_zp_sum);
 // x0: a(left matrix ptr)
 // x1: b(right matrix ptr)
 // x2: out ptr
@@ -33,7 +33,7 @@
 // x9: a_sums
 // x10: b_sums
 // x19/w19: a_zp
-// x19/w20: b_zp
+// x19/w20: b_zp_sum
 asm_function DynamicMatmulSdot4x4x16AIWI
     sub sp, sp, #144
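A note on the new trailing arguments: the (size_t a_zp, size_t b_zp, size_t deep) triple becomes (int64_t a_zp, int64_t b_zp_sum), with the weight zero point arriving pre-multiplied by the depth; the call site in the SDOT kernel hunk further down passes quant_param_->filter_zp_[0] * param_->deep_. A minimal sketch of that folding (the helper name is illustrative, not from this commit):

#include <cstdint>
#include <cstddef>

// Illustrative helper: fold the weight zero point and the depth into the
// single b_zp_sum argument the kernel now expects.
static inline int64_t FoldWeightZpSum(int32_t filter_zp, size_t deep) {
  return static_cast<int64_t>(filter_zp) * static_cast<int64_t>(deep);
}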

View File

@@ -17,30 +17,31 @@
 #include "nnacl/int8/dynamic_matmul_int8.h"
 #include "nnacl/int8/fixed_point.h"
 
-void DynamicMatmul4x4x16AIWI(const int8_t *a, const int8_t *b, const float *bias, float *dst, int row, int col,
-                             int deep4, size_t stride, float input_scale, const float *filter_scale,
-                             bool filter_per_channel) {
+void DynamicMatmul4x4x16AIWI(const int8_t *a, const int8_t *b, float *out, size_t deep4, float *multi_scales,
+                             float *bias, size_t row, size_t col, size_t stride, const int *a_sums, const int *b_sums,
+                             int64_t a_zp, int64_t b_zp_sum) {
   /* *
    * row4x4-major * row4x16-major => (int8)row-major
    * support activation per-layer symmetric && weight per-layer/per-channel symmetric
    * */
   for (int r = 0; r < row; r++) {
+    int64_t s2 = a_sums[r] * b_zp_sum;
     for (int c = 0; c < col; c++) {
       int r4div = r / C4NUM, r4mod = r % C4NUM;
       int c16div = c / C16NUM, c16mod = c % C16NUM;
-      int32_t value = 0;
+      int32_t s1 = 0;
       for (int d = 0; d < deep4; d++) {
         int d4div = d / C4NUM, d4mod = d % C4NUM;
         size_t ai = r4div * deep4 * C4NUM + d4div * C4NUM * C4NUM + r4mod * C4NUM + d4mod;
         size_t bi = c16div * deep4 * C16NUM + d4div * C4NUM * C16NUM + c16mod * C4NUM + d4mod;
-        value += a[ai] * b[bi];
+        s1 += a[ai] * b[bi];
       }
-      int filter_quant_index = filter_per_channel ? c : 0;
-      double multi_scale = input_scale * filter_scale[filter_quant_index];
-      size_t ci = r * stride + c;
-      dst[ci] = multi_scale * value;
+      int64_t s3 = b_sums[c] * a_zp;
+      int64_t s4 = a_zp * b_zp_sum;
+      size_t ci = r * stride / sizeof(float) + c;
+      out[ci] = multi_scales[c] * (s1 - s2 - s3 + s4);
       if (bias != NULL) {
-        dst[ci] += bias[c];
+        out[ci] += bias[c];
       }
     }
   }
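The rewritten fallback computes out = multi_scales[c] * (s1 - s2 - s3 + s4), i.e. the standard zero-point expansion of an int8 dot product. A self-contained sketch of the identity it relies on (variable names are illustrative; how a_sums is scaled so that s2 = a_sums[r] * b_zp_sum matches the b_zp * sum_d a[d] term depends on the packing code, which this diff does not show):

#include <cassert>
#include <cstdint>
#include <vector>

// Checks the zero-point expansion behind (s1 - s2 - s3 + s4):
//   sum_d (a[d]-a_zp)*(b[d]-b_zp)
//     == sum_d a[d]*b[d] - b_zp*sum_d a[d] - a_zp*sum_d b[d] + deep*a_zp*b_zp
// The kernel pre-folds b_zp*deep into b_zp_sum, so the constant term s4
// becomes a_zp*b_zp_sum.
int main() {
  const int64_t a_zp = 3, b_zp = -5;            // hypothetical zero points
  const std::vector<int8_t> a = {1, -2, 3, 4};  // one row of A
  const std::vector<int8_t> b = {7, 0, -1, 2};  // one column of B
  const int64_t deep = static_cast<int64_t>(a.size());

  int64_t direct = 0, s1 = 0, a_sum = 0, b_sum = 0;
  for (size_t d = 0; d < a.size(); ++d) {
    direct += (a[d] - a_zp) * (b[d] - b_zp);
    s1 += a[d] * b[d];
    a_sum += a[d];
    b_sum += b[d];
  }
  const int64_t s2 = b_zp * a_sum;        // activation-sum correction
  const int64_t s3 = a_zp * b_sum;        // weight-sum correction
  const int64_t s4 = deep * a_zp * b_zp;  // constant correction
  assert(direct == s1 - s2 - s3 + s4);
  return 0;
}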
@@ -74,7 +75,7 @@ void DynamicMatmul4x16x4AIWI(const int8_t *a, const int8_t *b, const float *bias
       }
       value = s0 - s1 - s2 + s3;
       int filter_quant_index = filter_per_channel ? c : 0;
-      double multi_scale = input_scale * filter_scale[filter_quant_index];
+      float multi_scale = input_scale * filter_scale[filter_quant_index];
       size_t ci = r * stride + c;
       dst[ci] = multi_scale * value;
       if (bias != NULL) {

View File

@@ -34,12 +34,11 @@ void CalcPartWeightSums(const int8_t *weight, int row, int stride, int cur_col,
 #ifdef ENABLE_ARM64
 void DynamicMatmulSdot4x4x16AIWI(const int8_t *a, const int8_t *b, float *out, size_t deep4, float *multi_scales,
                                  float *bias, size_t row, size_t col, size_t stride, const int *a_sums,
-                                 const int *b_sums, int64_t a_zp, int64_t b_zp);
-#else
-void DynamicMatmul4x4x16AIWI(const int8_t *a, const int8_t *b, const float *bias, float *dst, int row, int col,
-                             int deep4, size_t stride, float input_scale, const float *filter_scale,
-                             bool filter_per_channel);
+                                 const int *b_sums, int64_t a_zp, int64_t b_zp_sum);
 #endif
+void DynamicMatmul4x4x16AIWI(const int8_t *a, const int8_t *b, float *out, size_t deep4, float *multi_scales,
+                             float *bias, size_t row, size_t col, size_t stride, const int *a_sums, const int *b_sums,
+                             int64_t a_zp, int64_t b_zp_sum);
 #ifdef __cplusplus
 }
 #endif

View File

@@ -99,8 +99,16 @@ int MatmulDynamicBaseInt8CPUKernel::InitFilterQuantParam() {
   return RET_OK;
 }
 
-void MatmulDynamicBaseInt8CPUKernel::ResizeParameter() {
-  param_->row_align_ = UP_ROUND(param_->row_, row_tile_);
+void MatmulDynamicBaseInt8CPUKernel::ResizeMatrixBParameter() {
+  auto w_shape = in_tensors_.at(kWeightIndex)->shape();
+  int batch = 1;
+  for (size_t i = 0; i < w_shape.size() - kSize2; ++i) {
+    batch *= w_shape[i];
+  }
+  param_->batch = batch;
+  param_->col_ = param_->b_transpose_ ? w_shape[w_shape.size() - kSize2] : w_shape[w_shape.size() - kSize1];
+  param_->deep_ = param_->b_transpose_ ? w_shape[w_shape.size() - kSize1] : w_shape[w_shape.size() - kSize2];
   param_->col_align_ = UP_ROUND(param_->col_, col_tile_);
   param_->deep_align_ = UP_ROUND(param_->deep_, deep_tile_);
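To trace the shape bookkeeping above, a standalone sketch with hypothetical values (the shape, transpose flag, and tile sizes are illustrative; the 16/4 tiles merely echo the 4x4x16 kernel name):

#include <cstdio>
#include <vector>

// UP_ROUND in nnacl rounds x up to the next multiple of factor.
static int UpRound(int x, int factor) { return ((x + factor - 1) / factor) * factor; }

int main() {
  const std::vector<int> w_shape = {2, 70, 130};  // hypothetical weight shape
  const bool b_transpose = true;                  // hypothetical flag
  const size_t n = w_shape.size();
  int batch = 1;
  for (size_t i = 0; i < n - 2; ++i) {
    batch *= w_shape[i];  // product of all leading dimensions
  }
  const int col = b_transpose ? w_shape[n - 2] : w_shape[n - 1];
  const int deep = b_transpose ? w_shape[n - 1] : w_shape[n - 2];
  // Prints: batch=2 col=70 deep=130 col_align=80 deep_align=132
  printf("batch=%d col=%d deep=%d col_align=%d deep_align=%d\n",
         batch, col, deep, UpRound(col, 16), UpRound(deep, 4));
  return 0;
}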
@@ -126,6 +134,10 @@ void MatmulDynamicBaseInt8CPUKernel::FreeTmpBuffer() {
     free(weight_sums_);
     weight_sums_ = nullptr;
   }
+  if (fp32_bias_ptr_ != nullptr) {
+    free(fp32_bias_ptr_);
+    fp32_bias_ptr_ = nullptr;
+  }
   return;
 }
@@ -143,8 +155,6 @@ int MatmulDynamicBaseInt8CPUKernel::InitInputQuantParam() {
 
 int MatmulDynamicBaseInt8CPUKernel::TransferB() {
   auto weight_data = reinterpret_cast<int8_t *>(in_tensors_.at(kWeightIndex)->data());
   CHECK_NULL_RETURN(weight_data);
-  memset(pack_b_ptr_, quant_param_->filter_zp_[0],
-         param_->batch * param_->col_align_ * param_->deep_align_ * sizeof(int8_t));
   for (int i = 0; i < param_->batch; i++) {
     auto current_weight = weight_data + i * param_->deep_ * param_->col_;
     auto current_b_pack = pack_b_ptr_ + i * param_->col_align_ * param_->deep_align_;
@@ -161,31 +171,51 @@ int MatmulDynamicBaseInt8CPUKernel::TransferB() {
   return RET_OK;
 }
 
-int MatmulDynamicBaseInt8CPUKernel::InitTmpBuffer() {
+int MatmulDynamicBaseInt8CPUKernel::InitMatrixABuffer() {
+  if (pack_a_ptr_ != nullptr) {
+    free(pack_a_ptr_);
+    pack_a_ptr_ = nullptr;
+  }
   pack_a_ptr_ = reinterpret_cast<int8_t *>(malloc(param_->row_align_ * param_->deep_align_ * sizeof(int8_t)));
   if (pack_a_ptr_ == nullptr) {
     FreeTmpBuffer();
     return RET_ERROR;
   }
+  if (input_sums_ != nullptr) {
+    free(input_sums_);
+    input_sums_ = nullptr;
+  }
+  input_sums_ = reinterpret_cast<int *>(malloc(param_->row_align_ * sizeof(int)));
+  if (input_sums_ == nullptr) {
+    FreeTmpBuffer();
+    return RET_ERROR;
+  }
+  memset(pack_a_ptr_, 0, param_->row_align_ * param_->deep_align_ * sizeof(int8_t));
+  memset(input_sums_, 0, param_->row_align_ * sizeof(int));
+  return RET_OK;
+}
+
+int MatmulDynamicBaseInt8CPUKernel::InitMatrixBBuffer() {
+  if (pack_b_ptr_ != nullptr) {
+    free(pack_b_ptr_);
+    pack_b_ptr_ = nullptr;
+  }
   pack_b_ptr_ =
     reinterpret_cast<int8_t *>(malloc(param_->batch * param_->col_align_ * param_->deep_align_ * sizeof(int8_t)));
   if (pack_b_ptr_ == nullptr) {
     FreeTmpBuffer();
     return RET_ERROR;
   }
-  input_sums_ = reinterpret_cast<int *>(malloc(param_->row_align_ * sizeof(int)));
-  if (input_sums_ == nullptr) {
-    FreeTmpBuffer();
-    return RET_ERROR;
-  }
+  if (weight_sums_ != nullptr) {
+    free(weight_sums_);
+    weight_sums_ = nullptr;
+  }
   weight_sums_ = reinterpret_cast<int *>(malloc(param_->batch * param_->col_align_ * sizeof(int)));
   if (weight_sums_ == nullptr) {
     FreeTmpBuffer();
     return RET_ERROR;
   }
-  memset(pack_a_ptr_, 0, param_->row_align_ * param_->deep_align_ * sizeof(int8_t));
   memset(pack_b_ptr_, 0, param_->batch * param_->col_align_ * param_->deep_align_ * sizeof(int8_t));
-  memset(input_sums_, 0, param_->row_align_ * sizeof(int));
   memset(weight_sums_, 0, param_->batch * param_->col_align_ * sizeof(int));
   return RET_OK;
 }
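Both Init*Buffer methods repeat the same guarded re-initialization: release any previous allocation, allocate, zero. A generic sketch of that pattern (the helper is illustrative; the kernel keeps the steps inline and pairs malloc with free, matching FreeTmpBuffer above):

#include <cstdlib>
#include <cstring>

// Free any prior buffer, then allocate and zero a fresh one.
// Returns nullptr on allocation failure; the caller keeps ownership.
static void *ReallocZeroed(void *old_ptr, size_t size) {
  if (old_ptr != nullptr) {
    free(old_ptr);
  }
  void *ptr = malloc(size);
  if (ptr != nullptr) {
    memset(ptr, 0, size);
  }
  return ptr;
}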
@@ -193,7 +223,7 @@ int MatmulDynamicBaseInt8CPUKernel::InitTmpBuffer() {
 int MatmulDynamicBaseInt8CPUKernel::CopyBias() {
   if (in_tensors_.size() == kHasBiasSize) {
     auto bias_tensor = in_tensors_[kBiasIndex];
-    fp32_bias_ptr_ = reinterpret_cast<float *>(bias_tensor->data());
+    fp32_bias_ptr_ = static_cast<float *>(malloc(bias_tensor->Size()));
     if (fp32_bias_ptr_ == nullptr) {
       MS_LOG(ERROR) << "Memory allocation failed";
       FreeTmpBuffer();
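This is the copy-bias fix: fp32_bias_ptr_ previously aliased the bias tensor's buffer, so the "Memory allocation failed" branch below it guarded nothing, and FreeTmpBuffer() could never safely release the pointer. The kernel now takes a private copy, which the new FreeTmpBuffer branch above frees. A sketch of the fixed ownership, assuming the memcpy that must follow the truncated hunk:

#include <cstdlib>
#include <cstring>

// Illustrative stand-in for the fixed CopyBias flow: take a private copy of
// the tensor's bias data so the kernel can free it unconditionally later.
static float *CopyBiasOwned(const float *tensor_data, size_t byte_size) {
  float *owned = static_cast<float *>(malloc(byte_size));
  if (owned == nullptr) {
    return nullptr;  // caller logs the failure and frees other buffers
  }
  memcpy(owned, tensor_data, byte_size);
  return owned;
}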
@@ -216,12 +246,25 @@ int MatmulDynamicBaseInt8CPUKernel::Prepare() {
     return ret;
   }
   if (param_->b_const_) {
+    ResizeMatrixBParameter();
     ret = InitFilterQuantParam();
     if (ret != RET_OK) {
       FreeQuantParam();
       return ret;
     }
+    ret = InitMatrixBBuffer();
+    if (ret != RET_OK) {
+      FreeQuantParam();
+      return ret;
+    }
+    ret = TransferB();
+    if (ret != RET_OK) {
+      FreeQuantParam();
+      return ret;
+    }
   }
   ret = CopyBias();
   if (ret != RET_OK) {
     FreeQuantParam();
@@ -234,30 +277,27 @@ int MatmulDynamicBaseInt8CPUKernel::Prepare() {
 }
 
 int MatmulDynamicBaseInt8CPUKernel::ReSize() {
-  int batch = 1;
   auto x_shape = in_tensors_.at(0)->shape();
   auto o_shape = out_tensors_.at(0)->shape();
   MS_ASSERT(x_shape.size() >= kSize2);
-  for (size_t i = 0; i < x_shape.size() - kSize2; ++i) {
-    batch *= x_shape[i];
-  }
-  param_->batch = batch;
   MS_ASSERT(o_shape.size() >= kSize2);
   param_->row_ = o_shape[o_shape.size() - kSize2];
-  param_->col_ = o_shape[o_shape.size() - kSize1];
+  param_->row_align_ = UP_ROUND(param_->row_, row_tile_);
   param_->deep_ = param_->a_transpose_ ? x_shape[x_shape.size() - kSize2] : x_shape[x_shape.size() - kSize1];
   param_->deep_align_ = UP_ROUND(param_->deep_, deep_tile_);
 
-  FreeTmpBuffer();
-  ResizeParameter();
-  auto ret = InitTmpBuffer();
+  auto ret = InitMatrixABuffer();
   if (ret != RET_OK) {
     FreeQuantParam();
     return ret;
   }
 
-  if (param_->b_const_ == true) {
-    TransferB();
+  if (!param_->b_const_) {
+    ResizeMatrixBParameter();
+    ret = InitMatrixBBuffer();
+    if (ret != RET_OK) {
+      FreeQuantParam();
+      return ret;
+    }
   }
   return RET_OK;
 }
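Taken together, Prepare() and ReSize() now split the work cleanly: constant weights are sized, quantized, allocated, and packed once in Prepare(), variable weights are re-derived on every resize, and the matrix A buffers always follow the current input shape. A condensed, compilable skeleton of that flow (all functions are stubs standing in for the kernel's methods, with error handling omitted):

#include <cstdio>

static bool b_const = true;  // illustrative: weights known at load time
static void ResizeMatrixBParameter() { puts("size B"); }
static void InitFilterQuantParam()   { puts("quant B"); }
static void InitMatrixBBuffer()      { puts("alloc B"); }
static void TransferB()              { puts("pack B"); }
static void InitMatrixABuffer()      { puts("alloc A"); }
static void CopyBias()               { puts("copy bias"); }

static void Prepare() {  // B handled exactly once here when constant
  if (b_const) {
    ResizeMatrixBParameter();
    InitFilterQuantParam();
    InitMatrixBBuffer();
    TransferB();
  }
  CopyBias();
}

static void ReSize() {  // A always re-sized; B only when it can change
  InitMatrixABuffer();
  if (!b_const) {
    ResizeMatrixBParameter();
    InitMatrixBBuffer();
  }
}

int main() {
  Prepare();
  ReSize();
  return 0;
}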

View File

@@ -39,10 +39,10 @@ class MatmulDynamicBaseInt8CPUKernel : public InnerKernel {
   int ReSize() override;
 
  private:
-  void ResizeParameter();
+  void ResizeMatrixBParameter();
   int CopyBias();
-  int InitTmpBuffer();
+  int InitMatrixABuffer();
+  int InitMatrixBBuffer();
   int MallocQuantParam();
 
  protected:

View File

@@ -82,7 +82,6 @@ int MatMulDynamicSdotInt8Kernel::MatMulDynamicArm64SdotPre(int task_id) {
 }
 
 int MatMulDynamicSdotInt8Kernel::MatMulDynamicArm64SdotImpl(int task_id) {
-#if defined(ENABLE_ARM64) && !defined(SUPPORT_NNIE) && (!defined(MACHINE_LINUX_ARM64))
   // Multi-thread split by col.
   int stride = thread_stride_ * col_tile_;
   int cur_stride = task_id * stride;
@@ -128,12 +127,18 @@ int MatMulDynamicSdotInt8Kernel::MatMulDynamicArm64SdotImpl(int task_id) {
       if (bias != nullptr) {
         bias += col_offset;
       }
+#if defined(ENABLE_ARM64) && !defined(SUPPORT_NNIE) && (!defined(MACHINE_LINUX_ARM64))
       DynamicMatmulSdot4x4x16AIWI(a_ptr, b_ptr, out_ptr, param_->deep_align_, multi_scale.data() + c, bias, row, col,
                                   out_stride, input_sums_ptr, weight_sums_ptr, quant_param_->input_zp_,
                                   quant_param_->filter_zp_[0] * param_->deep_);
+#else
+      DynamicMatmul4x4x16AIWI(a_ptr, b_ptr, out_ptr, param_->deep_align_, multi_scale.data() + c, bias, row, col,
+                              out_stride, input_sums_ptr, weight_sums_ptr, quant_param_->input_zp_,
+                              quant_param_->filter_zp_[0] * param_->deep_);
+#endif
     }
   }
-#endif
   return RET_OK;
 }
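The fix here: the #if previously wrapped the whole body of MatMulDynamicArm64SdotImpl, so builds without SDOT support (for example MACHINE_LINUX_ARM64) compiled an empty function that returned RET_OK without ever writing the output. Now only the inner call is specialized, and both branches share one argument list. A compilable sketch of that guard-only-the-call pattern, assuming the nnacl header shown earlier is on the include path (the wrapper name is illustrative):

#include <cstdint>
#include <cstddef>
#include "nnacl/int8/dynamic_matmul_int8.h"

// Same argument list for both paths, so the preprocessor only selects the
// implementation; everything around the call exists in a single copy.
void DynamicMatmulDispatch(const int8_t *a, const int8_t *b, float *out, size_t deep4, float *multi_scales,
                           float *bias, size_t row, size_t col, size_t stride, const int *a_sums,
                           const int *b_sums, int64_t a_zp, int64_t b_zp_sum) {
#if defined(ENABLE_ARM64) && !defined(SUPPORT_NNIE) && (!defined(MACHINE_LINUX_ARM64))
  DynamicMatmulSdot4x4x16AIWI(a, b, out, deep4, multi_scales, bias, row, col, stride, a_sums, b_sums, a_zp, b_zp_sum);
#else
  DynamicMatmul4x4x16AIWI(a, b, out, deep4, multi_scales, bias, row, col, stride, a_sums, b_sums, a_zp, b_zp_sum);
#endif
}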

View File

@@ -664,7 +664,9 @@ int Scheduler::InferNodeShape(const lite::Model::Node *node) {
   parameter->quant_type_ = node->quant_type_;
   parameter->thread_num_ = context_->thread_num_;
+  if (context_->float_mode && parameter->quant_type_ == schema::QuantType_QUANT_ALL) {
+    parameter->quant_type_ = schema::QuantType_QUANT_WEIGHT;
+  }
   if (node->output_indices_.empty()) {
     MS_LOG(ERROR) << "The output size is invalid";
     if (parameter->destroy_func_ != nullptr) {
@@ -1000,9 +1002,6 @@ int Scheduler::FindCpuKernel(const std::vector<Tensor *> &in_tensors, const std:
     cpu_desc.data_type = kNumberTypeFloat16;
   }
   int ret;
-  if (context_->float_mode && op_parameter->quant_type_ == schema::QuantType_QUANT_ALL) {
-    op_parameter->quant_type_ = schema::QuantType_QUANT_WEIGHT;
-  }
 #ifndef WEIGHT_DECODE_CLIP
   ret = WeightDecoder::DequantNode(op_parameter, in_tensors, kernel_data_type, src_model_->version_);
   if (ret != RET_OK) {
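The float_mode fix moves the downgrade from FindCpuKernel into InferNodeShape, so a fully-quantized (QUANT_ALL) node is already re-tagged as weight-quantized before shape inference and kernel selection consult quant_type_. A minimal standalone sketch of the check (types abbreviated, not MindSpore's real declarations):

enum QuantType { kQuantNone, kQuantAll, kQuantWeight };
struct OpParameter { QuantType quant_type; };

// In float mode, fully-quantized ops fall back to weight quantization;
// applying this while the parameter is built means every later stage agrees.
static void AdjustQuantType(bool float_mode, OpParameter *param) {
  if (float_mode && param->quant_type == kQuantAll) {
    param->quant_type = kQuantWeight;
  }
}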

View File

@@ -88,6 +88,7 @@ function Convert() {
     done
   fi
   # start running converter
+  echo "Convert ${model_name} ${quant_type} ......"
   echo ${model_name} >> "$4"
   echo './converter_lite --fmk='${model_fmk}' --modelFile='${model_file}' --weightFile='${weight_file}' --outputFile='${output_file}\
       ' --inputDataType='${in_dtype}' --outputDataType='${out_dtype}' --inputShape='${spec_shapes}\