!30296 fix DynamicMatmul4x4x16AIWI C++ kernel, copy-bias bug and float_mode bug
Merge pull request !30296 from yeyunpeng2020/dynamic_quant
commit 56a90003c4
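In short: the portable C fallback DynamicMatmul4x4x16AIWI is rewritten to share the ARM64 SDOT kernel's interface and zero-point math, CopyBias now takes an owned copy of the bias tensor instead of aliasing its buffer, and the float_mode downgrade of fully quantized nodes to weight-quantized moves from FindCpuKernel into InferNodeShape.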
@ -20,7 +20,7 @@
 // void DynamicMatmulSdot4x4x16AIWI(const int8_t *a, const int8_t *b, float *out, size_t deep4, float *multi_scales,
 //                                  float *bias, size_t row, size_t col, size_t stride, const int *a_sums,
-//                                  const int *b_sums, size_t a_zp, size_t b_zp, size_t deep);
+//                                  const int *b_sums, int64_t a_zp, int64_t b_zp_sum);
 // x0: a(left matrix ptr)
 // x1: b(right matrix ptr)
 // x2: out ptr
@ -33,7 +33,7 @@
 // x9: a_sums
 // x10: b_sums
 // x19/w19: a_zp
-// x19/w20: b_zp
+// x19/w20: b_zp_sum

 asm_function DynamicMatmulSdot4x4x16AIWI
 sub sp, sp, #144

@ -17,30 +17,31 @@
 #include "nnacl/int8/dynamic_matmul_int8.h"
 #include "nnacl/int8/fixed_point.h"

-void DynamicMatmul4x4x16AIWI(const int8_t *a, const int8_t *b, const float *bias, float *dst, int row, int col,
-                             int deep4, size_t stride, float input_scale, const float *filter_scale,
-                             bool filter_per_channel) {
+void DynamicMatmul4x4x16AIWI(const int8_t *a, const int8_t *b, float *out, size_t deep4, float *multi_scales,
+                             float *bias, size_t row, size_t col, size_t stride, const int *a_sums, const int *b_sums,
+                             int64_t a_zp, int64_t b_zp_sum) {
   /* *
    * row4x4-major * row4x16-major => (int8)row-major
    * support activation per-layer symmetric && weight per-layer/per-channel symmetric
    * */
   for (int r = 0; r < row; r++) {
+    int64_t s2 = a_sums[r] * b_zp_sum;
     for (int c = 0; c < col; c++) {
       int r4div = r / C4NUM, r4mod = r % C4NUM;
       int c16div = c / C16NUM, c16mod = c % C16NUM;
-      int32_t value = 0;
+      int32_t s1 = 0;
       for (int d = 0; d < deep4; d++) {
         int d4div = d / C4NUM, d4mod = d % C4NUM;
         size_t ai = r4div * deep4 * C4NUM + d4div * C4NUM * C4NUM + r4mod * C4NUM + d4mod;
         size_t bi = c16div * deep4 * C16NUM + d4div * C4NUM * C16NUM + c16mod * C4NUM + d4mod;
-        value += a[ai] * b[bi];
+        s1 += a[ai] * b[bi];
       }
-      int filter_quant_index = filter_per_channel ? c : 0;
-      double multi_scale = input_scale * filter_scale[filter_quant_index];
-      size_t ci = r * stride + c;
-      dst[ci] = multi_scale * value;
+      int64_t s3 = b_sums[c] * a_zp;
+      int64_t s4 = a_zp * b_zp_sum;
+      size_t ci = r * stride / sizeof(float) + c;
+      out[ci] = multi_scales[c] * (s1 - s2 - s3 + s4);
       if (bias != NULL) {
-        dst[ci] += bias[c];
+        out[ci] += bias[c];
       }
     }
   }
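The rewritten fallback mirrors the SDOT kernel's dequantization: s1 is the raw int8 dot product, s3 and s4 are the a_zp correction terms of the standard zero-point expansion, and s2 pairs the precomputed row sums with b_zp_sum (the call site below passes filter_zp * deep for it). Note stride is now a byte stride, hence the / sizeof(float) when indexing out. A minimal standalone check of the underlying identity — illustrative only, using plain a_zp/b_zp and toy data rather than the kernel's folded b_zp_sum:

#include <assert.h>
#include <stdint.h>

// Verifies: sum((a - a_zp) * (b - b_zp)) == s1 - s2 - s3 + s4, where
//   s1 = sum(a*b), s2 = b_zp*sum(a), s3 = a_zp*sum(b), s4 = deep*a_zp*b_zp.
int main(void) {
  const int deep = 4;
  const int8_t a[4] = {1, -2, 3, 4};
  const int8_t b[4] = {5, 6, -7, 8};
  const int64_t a_zp = 3, b_zp = -1;
  int64_t lhs = 0, s1 = 0, a_sum = 0, b_sum = 0;
  for (int d = 0; d < deep; d++) {
    lhs += (int64_t)(a[d] - a_zp) * (b[d] - b_zp);  // exact dequantized product
    s1 += (int64_t)a[d] * b[d];                     // raw int8 dot product
    a_sum += a[d];
    b_sum += b[d];
  }
  const int64_t s2 = b_zp * a_sum;       // b zero-point correction
  const int64_t s3 = a_zp * b_sum;       // a zero-point correction
  const int64_t s4 = deep * a_zp * b_zp; // cross term
  assert(lhs == s1 - s2 - s3 + s4);
  return 0;
}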
@ -74,7 +75,7 @@ void DynamicMatmul4x16x4AIWI(const int8_t *a, const int8_t *b, const float *bias
       }
       value = s0 - s1 - s2 + s3;
       int filter_quant_index = filter_per_channel ? c : 0;
-      double multi_scale = input_scale * filter_scale[filter_quant_index];
+      float multi_scale = input_scale * filter_scale[filter_quant_index];
       size_t ci = r * stride + c;
       dst[ci] = multi_scale * value;
       if (bias != NULL) {
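The double → float change costs no output precision (dst is float either way) and presumably keeps this scalar path consistent with the float32 math of the assembly kernels.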
@ -34,12 +34,11 @@ void CalcPartWeightSums(const int8_t *weight, int row, int stride, int cur_col,
 #ifdef ENABLE_ARM64
 void DynamicMatmulSdot4x4x16AIWI(const int8_t *a, const int8_t *b, float *out, size_t deep4, float *multi_scales,
                                  float *bias, size_t row, size_t col, size_t stride, const int *a_sums,
-                                 const int *b_sums, int64_t a_zp, int64_t b_zp);
-#else
-void DynamicMatmul4x4x16AIWI(const int8_t *a, const int8_t *b, const float *bias, float *dst, int row, int col,
-                             int deep4, size_t stride, float input_scale, const float *filter_scale,
-                             bool filter_per_channel);
+                                 const int *b_sums, int64_t a_zp, int64_t b_zp_sum);
 #endif
+void DynamicMatmul4x4x16AIWI(const int8_t *a, const int8_t *b, float *out, size_t deep4, float *multi_scales,
+                             float *bias, size_t row, size_t col, size_t stride, const int *a_sums, const int *b_sums,
+                             int64_t a_zp, int64_t b_zp_sum);
 #ifdef __cplusplus
 }
 #endif
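With the old per-channel-scale fallback declaration gone, the SDOT assembly kernel and the C fallback now share one signature, so a call site can switch implementations with a preprocessor guard alone (see the MatMulDynamicArm64SdotImpl hunk below).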
@ -99,8 +99,16 @@ int MatmulDynamicBaseInt8CPUKernel::InitFilterQuantParam() {
   return RET_OK;
 }

-void MatmulDynamicBaseInt8CPUKernel::ResizeParameter() {
-  param_->row_align_ = UP_ROUND(param_->row_, row_tile_);
+void MatmulDynamicBaseInt8CPUKernel::ResizeMatrixBParameter() {
+  auto w_shape = in_tensors_.at(kWeightIndex)->shape();
+  int batch = 1;
+  for (size_t i = 0; i < w_shape.size() - kSize2; ++i) {
+    batch *= w_shape[i];
+  }
+  param_->batch = batch;
+  param_->col_ = param_->b_transpose_ ? w_shape[w_shape.size() - kSize2] : w_shape[w_shape.size() - kSize1];
+  param_->deep_ = param_->b_transpose_ ? w_shape[w_shape.size() - kSize1] : w_shape[w_shape.size() - kSize2];

   param_->col_align_ = UP_ROUND(param_->col_, col_tile_);
   param_->deep_align_ = UP_ROUND(param_->deep_, deep_tile_);
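ResizeParameter() is effectively split here: everything derived from the weight tensor (batch, col_, deep_ and their aligned sizes) moves into ResizeMatrixBParameter(), which can therefore run in Prepare() for const weights, before the input shapes are known.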
@ -126,6 +134,10 @@ void MatmulDynamicBaseInt8CPUKernel::FreeTmpBuffer() {
     free(weight_sums_);
     weight_sums_ = nullptr;
   }
+  if (fp32_bias_ptr_ != nullptr) {
+    free(fp32_bias_ptr_);
+    fp32_bias_ptr_ = nullptr;
+  }
   return;
 }

@ -143,8 +155,6 @@ int MatmulDynamicBaseInt8CPUKernel::InitInputQuantParam() {
 int MatmulDynamicBaseInt8CPUKernel::TransferB() {
   auto weight_data = reinterpret_cast<int8_t *>(in_tensors_.at(kWeightIndex)->data());
   CHECK_NULL_RETURN(weight_data);
-  memset(pack_b_ptr_, quant_param_->filter_zp_[0],
-         param_->batch * param_->col_align_ * param_->deep_align_ * sizeof(int8_t));
   for (int i = 0; i < param_->batch; i++) {
     auto current_weight = weight_data + i * param_->deep_ * param_->col_;
     auto current_b_pack = pack_b_ptr_ + i * param_->col_align_ * param_->deep_align_;
@ -161,31 +171,51 @@ int MatmulDynamicBaseInt8CPUKernel::TransferB() {
   return RET_OK;
 }

-int MatmulDynamicBaseInt8CPUKernel::InitTmpBuffer() {
+int MatmulDynamicBaseInt8CPUKernel::InitMatrixABuffer() {
+  if (pack_a_ptr_ != nullptr) {
+    delete pack_a_ptr_;
+    pack_a_ptr_ = nullptr;
+  }
   pack_a_ptr_ = reinterpret_cast<int8_t *>(malloc(param_->row_align_ * param_->deep_align_ * sizeof(int8_t)));
   if (pack_a_ptr_ == nullptr) {
     FreeTmpBuffer();
     return RET_ERROR;
   }
+  if (input_sums_ != nullptr) {
+    delete input_sums_;
+    input_sums_ = nullptr;
+  }
+  input_sums_ = reinterpret_cast<int *>(malloc(param_->row_align_ * sizeof(int)));
+  if (input_sums_ == nullptr) {
+    FreeTmpBuffer();
+    return RET_ERROR;
+  }
+  memset(pack_a_ptr_, 0, param_->row_align_ * param_->deep_align_ * sizeof(int8_t));
+  memset(input_sums_, 0, param_->row_align_ * sizeof(int));
+  return RET_OK;
+}
+
+int MatmulDynamicBaseInt8CPUKernel::InitMatrixBBuffer() {
+  if (pack_b_ptr_ != nullptr) {
+    delete pack_b_ptr_;
+    pack_b_ptr_ = nullptr;
+  }
   pack_b_ptr_ =
     reinterpret_cast<int8_t *>(malloc(param_->batch * param_->col_align_ * param_->deep_align_ * sizeof(int8_t)));
   if (pack_b_ptr_ == nullptr) {
     FreeTmpBuffer();
     return RET_ERROR;
   }
-  input_sums_ = reinterpret_cast<int *>(malloc(param_->row_align_ * sizeof(int)));
-  if (input_sums_ == nullptr) {
-    FreeTmpBuffer();
-    return RET_ERROR;
+  if (weight_sums_ != nullptr) {
+    delete weight_sums_;
+    weight_sums_ = nullptr;
   }
   weight_sums_ = reinterpret_cast<int *>(malloc(param_->batch * param_->col_align_ * sizeof(int)));
   if (weight_sums_ == nullptr) {
     FreeTmpBuffer();
     return RET_ERROR;
   }
-  memset(pack_a_ptr_, 0, param_->row_align_ * param_->deep_align_ * sizeof(int8_t));
   memset(pack_b_ptr_, 0, param_->batch * param_->col_align_ * param_->deep_align_ * sizeof(int8_t));
-  memset(input_sums_, 0, param_->row_align_ * sizeof(int));
   memset(weight_sums_, 0, param_->batch * param_->col_align_ * sizeof(int));
   return RET_OK;
 }
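InitTmpBuffer() is split along the same A/B line: InitMatrixABuffer() allocates only what depends on the input shape (pack_a_ptr_, input_sums_), InitMatrixBBuffer() only what depends on the weight shape (pack_b_ptr_, weight_sums_), and each frees any previous allocation first so repeated ReSize() calls don't leak.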
@ -193,7 +223,7 @@ int MatmulDynamicBaseInt8CPUKernel::InitTmpBuffer() {
 int MatmulDynamicBaseInt8CPUKernel::CopyBias() {
   if (in_tensors_.size() == kHasBiasSize) {
     auto bias_tensor = in_tensors_[kBiasIndex];
-    fp32_bias_ptr_ = reinterpret_cast<float *>(bias_tensor->data());
+    fp32_bias_ptr_ = static_cast<float *>(malloc(bias_tensor->Size()));
     if (fp32_bias_ptr_ == nullptr) {
       MS_LOG(ERROR) << "Memory allocation failed";
       FreeTmpBuffer();
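This is the copy-bias bug from the title, as far as the diff shows: fp32_bias_ptr_ used to alias the bias tensor's own buffer, yet FreeTmpBuffer() (above) now calls free() on it — freeing memory the kernel doesn't own. Allocating an owned copy makes that free well defined; presumably the tensor data is copied into it just past this hunk.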
@ -216,12 +246,25 @@ int MatmulDynamicBaseInt8CPUKernel::Prepare() {
     return ret;
   }
   if (param_->b_const_) {
+    ResizeMatrixBParameter();
     ret = InitFilterQuantParam();
     if (ret != RET_OK) {
       FreeQuantParam();
       return ret;
     }
+    ret = InitMatrixBBuffer();
+    if (ret != RET_OK) {
+      FreeQuantParam();
+      return ret;
+    }
+
+    ret = TransferB();
+    if (ret != RET_OK) {
+      FreeQuantParam();
+      return ret;
+    }
   }

   ret = CopyBias();
   if (ret != RET_OK) {
     FreeQuantParam();
@ -234,30 +277,27 @@ int MatmulDynamicBaseInt8CPUKernel::Prepare() {
 }

 int MatmulDynamicBaseInt8CPUKernel::ReSize() {
   int batch = 1;
   auto x_shape = in_tensors_.at(0)->shape();
   auto o_shape = out_tensors_.at(0)->shape();
   MS_ASSERT(x_shape.size() >= kSize2);
   for (size_t i = 0; i < x_shape.size() - kSize2; ++i) {
     batch *= x_shape[i];
   }
   param_->batch = batch;
   MS_ASSERT(o_shape.size() >= kSize2);
   param_->row_ = o_shape[o_shape.size() - kSize2];
-  param_->col_ = o_shape[o_shape.size() - kSize1];
+  param_->row_align_ = UP_ROUND(param_->row_, row_tile_);
   param_->deep_ = param_->a_transpose_ ? x_shape[x_shape.size() - kSize2] : x_shape[x_shape.size() - kSize1];
+  param_->deep_align_ = UP_ROUND(param_->deep_, deep_tile_);

   FreeTmpBuffer();

-  ResizeParameter();
-
-  auto ret = InitTmpBuffer();
+  auto ret = InitMatrixABuffer();
   if (ret != RET_OK) {
     FreeQuantParam();
     return ret;
   }
-  if (param_->b_const_ == true) {
-    TransferB();
+
+  if (!param_->b_const_) {
+    ResizeMatrixBParameter();
+    ret = InitMatrixBBuffer();
+    if (ret != RET_OK) {
+      FreeQuantParam();
+      return ret;
+    }
   }
   return RET_OK;
 }
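Net control flow after the refactor: matrix-A buffers are rebuilt on every ReSize(), while matrix-B parameters and buffers are built once in Prepare() when weights are const (and packed there via TransferB()), or per ReSize() otherwise.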
@ -39,10 +39,10 @@ class MatmulDynamicBaseInt8CPUKernel : public InnerKernel {
   int ReSize() override;

  private:
-  void ResizeParameter();
+  void ResizeMatrixBParameter();
   int CopyBias();
-  int InitTmpBuffer();
-
+  int InitMatrixABuffer();
+  int InitMatrixBBuffer();
   int MallocQuantParam();

  protected:

@ -82,7 +82,6 @@ int MatMulDynamicSdotInt8Kernel::MatMulDynamicArm64SdotPre(int task_id) {
 }

 int MatMulDynamicSdotInt8Kernel::MatMulDynamicArm64SdotImpl(int task_id) {
-#if defined(ENABLE_ARM64) && !defined(SUPPORT_NNIE) && (!defined(MACHINE_LINUX_ARM64))
   // Multi-thread split by col.
   int stride = thread_stride_ * col_tile_;
   int cur_stride = task_id * stride;
@ -128,12 +127,18 @@ int MatMulDynamicSdotInt8Kernel::MatMulDynamicArm64SdotImpl(int task_id) {
       if (bias != nullptr) {
         bias += col_offset;
       }

+#if defined(ENABLE_ARM64) && !defined(SUPPORT_NNIE) && (!defined(MACHINE_LINUX_ARM64))
       DynamicMatmulSdot4x4x16AIWI(a_ptr, b_ptr, out_ptr, param_->deep_align_, multi_scale.data() + c, bias, row, col,
                                   out_stride, input_sums_ptr, weight_sums_ptr, quant_param_->input_zp_,
                                   quant_param_->filter_zp_[0] * param_->deep_);
+#else
+      DynamicMatmul4x4x16AIWI(a_ptr, b_ptr, out_ptr, param_->deep_align_, multi_scale.data() + c, bias, row, col,
+                              out_stride, input_sums_ptr, weight_sums_ptr, quant_param_->input_zp_,
+                              quant_param_->filter_zp_[0] * param_->deep_);
+#endif
     }
   }
 #endif
   return RET_OK;
 }
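The build guard moves from the top of the function body down to the call site, so non-SDOT builds fall through to the C fallback rather than compiling the work out — which is what the matching signature above enables.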
@ -664,7 +664,9 @@ int Scheduler::InferNodeShape(const lite::Model::Node *node) {

   parameter->quant_type_ = node->quant_type_;
   parameter->thread_num_ = context_->thread_num_;

+  if (context_->float_mode && parameter->quant_type_ == schema::QuantType_QUANT_ALL) {
+    parameter->quant_type_ = schema::QuantType_QUANT_WEIGHT;
+  }
   if (node->output_indices_.empty()) {
     MS_LOG(ERROR) << "The output size is invalid";
     if (parameter->destroy_func_ != nullptr) {
@ -1000,9 +1002,6 @@ int Scheduler::FindCpuKernel(const std::vector<Tensor *> &in_tensors, const std:
     cpu_desc.data_type = kNumberTypeFloat16;
   }
   int ret;
-  if (context_->float_mode && op_parameter->quant_type_ == schema::QuantType_QUANT_ALL) {
-    op_parameter->quant_type_ = schema::QuantType_QUANT_WEIGHT;
-  }
 #ifndef WEIGHT_DECODE_CLIP
   ret = WeightDecoder::DequantNode(op_parameter, in_tensors, kernel_data_type, src_model_->version_);
   if (ret != RET_OK) {
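Moving the QUANT_ALL → QUANT_WEIGHT downgrade from FindCpuKernel() up into InferNodeShape() makes float_mode take effect before shape inference and kernel matching, not just on the CPU path — presumably the float_mode bug named in the title.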
@ -88,6 +88,7 @@ function Convert() {
     done
   fi
   # start running converter
   echo "Convert ${model_name} ${quant_type} ......"
+  echo ${model_name} >> "$4"
   echo './converter_lite --fmk='${model_fmk}' --modelFile='${model_file}' --weightFile='${weight_file}' --outputFile='${output_file}\
   ' --inputDataType='${in_dtype}' --outputDataType='${out_dtype}' --inputShape='${spec_shapes}\