forked from mindspore-Ecosystem/mindspore
!21462 fix mix using signed and unsigned values
Merge pull request !21462 from zhaozhenlong/lite/issue/codex_0803
Commit: 3acfeac239
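For context: the warnings this pull request addresses come from C/C++'s usual arithmetic conversions. When a signed `int` meets an unsigned type such as `size_t` in a comparison or subtraction, the signed operand is converted to unsigned, which can silently change the result. The sketch below is illustrative only (it is not code from this patch) and shows the two classic failure modes the changes guard against:

```cpp
#include <cstdio>
#include <cstddef>

int main() {
  size_t data_size = 4;
  size_t task_offset = 8;

  // Unsigned subtraction wraps around instead of going negative,
  // so a "count <= 0" guard written for int never fires.
  size_t count = data_size - task_offset;  // a huge value, not -4
  printf("count = %zu\n", count);

  // Mixed signed/unsigned comparison: i is converted to size_t, so -1
  // behaves like SIZE_MAX (and compilers warn, which is what most of
  // this patch is silencing).
  int i = -1;
  if (i < data_size) {
    printf("not reached: -1 converts to SIZE_MAX\n");
  }
  return 0;
}
```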
@@ -5,7 +5,8 @@
 
 //void MatmulInt8DpNeon64(const int8_t *a, const int8_t *b, int8_t *dst, int row8, int col8, int deep4,
 //      const int *a_sums, const int *bias, int act_min, int act_max, int out_zp,
-//      int *multiplier, int *left_shift, int *right_shift, int row, int col, int stride, int peroc);
+//      const int *multiplier, const int *left_shift, const int *right_shift, int row,
+//      int col, int stride, int peroc);
 
 // x0: a(left matrix ptr)
 // x1: b(right matrix ptr)

@@ -4,8 +4,9 @@
 .align 5
 
 //void MatmulInt8DpOpt(const int8_t *a, const int8_t *b, int8_t *dst, int row, int col, int deep4, const int *a_sums,
-//      const int *bias, int act_min, int act_max, int out_zp, int32_t *multiplier, int32_t *left_shift,
-//      int32_t *right_shift, size_t stride, size_t filter_peroc, int32_t *filter_zp)
+//      const int *bias, int act_min, int act_max, int out_zp, const int32_t *multiplier,
+//      const int32_t *left_shift, const int32_t *right_shift, size_t stride, size_t filter_peroc,
+//      const int32_t *filter_zp)
 
 // x0: a(left matrix ptr)
 // x1: b(right matrix ptr)
@@ -39,8 +39,8 @@ void DoArgMinMaxQuant(const int8_t *input, int8_t *output, const ArgMinMaxParame
 float bias = -in_quant_arg->zp_ * in_quant_arg->scale_;
 int32_t output_zp = out_quant_arg->zp_;
 for (int i = 0; i < pre_axis_count; ++i) {
-size_t output_offset = i * after_axis_count;
-size_t input_offset = output_offset * axis_count;
+int output_offset = i * after_axis_count;
+int input_offset = output_offset * axis_count;
 for (int j = 0; j < after_axis_count; ++j) {
 float value = -FLT_MAX;
 if (!param->get_max_) {

@@ -97,8 +97,8 @@ void Int8ArgMinMaxDim0(const int8_t *input, int8_t *output, const int *in_shape,
 int32_t output_zp = out_quant_arg->zp_;
 for (int32_t i = 0; i < param->in_strides_[0]; ++i) {
 for (int j = 0; j < in_shape[0]; ++j) {
-size_t offset = param->in_strides_[0] * j + i;
-param->arg_elements_[j].index_ = j;
+int offset = param->in_strides_[0] * j + i;
+param->arg_elements_[j].index_ = (uint32_t)j;
 param->arg_elements_[j].data_.f_data_ = input[offset] * in_quant_arg->scale_ + bias;
 }
 if (param->get_max_) {

@@ -108,7 +108,7 @@ void Int8ArgMinMaxDim0(const int8_t *input, int8_t *output, const int *in_shape,
 }
 
 for (int j = 0; j < param->topk_; ++j) {
-size_t out_offset = j * param->out_strides_[0] + i;
+int out_offset = j * param->out_strides_[0] + i;
 float real_out = out_value ? param->arg_elements_[j].data_.f_data_ : param->arg_elements_[j].index_;
 output[out_offset] = GetInt8Output(real_out, output_inverse_scale, output_zp);
 }

@@ -123,12 +123,12 @@ void Int8ArgMinMaxDim1(const int8_t *input, int8_t *output, const int *in_shape,
 int32_t output_zp = out_quant_arg->zp_;
 int in_shape1 = in_shape[1];
 for (int i = 0; i < in_shape[0]; ++i) {
-size_t in_dim0_offset = i * param->in_strides_[0];
-size_t out_dim0_offset = i * param->out_strides_[0];
+int in_dim0_offset = i * param->in_strides_[0];
+int out_dim0_offset = i * param->out_strides_[0];
 for (int j = 0; j < param->in_strides_[1]; ++j) {
 for (int k = 0; k < in_shape1; ++k) {
-size_t offset = param->in_strides_[1] * k + in_dim0_offset + j;
-param->arg_elements_[k].index_ = k;
+int offset = param->in_strides_[1] * k + in_dim0_offset + j;
+param->arg_elements_[k].index_ = (size_t)k;
 param->arg_elements_[k].data_.f_data_ = input[offset] * in_quant_arg->scale_ + bias;
 }
 if (param->get_max_) {

@@ -138,7 +138,7 @@ void Int8ArgMinMaxDim1(const int8_t *input, int8_t *output, const int *in_shape,
 }
 
 for (int k = 0; k < param->topk_; ++k) {
-size_t out_offset = out_dim0_offset + j + k * param->out_strides_[1];
+int out_offset = out_dim0_offset + j + k * param->out_strides_[1];
 float real_out = out_value ? param->arg_elements_[k].data_.f_data_ : param->arg_elements_[k].index_;
 output[out_offset] = GetInt8Output(real_out, output_inverse_scale, output_zp);
 }

@@ -155,15 +155,15 @@ void Int8ArgMinMaxDim2(const int8_t *input, int8_t *output, const int *in_shape,
 int in_shape1 = in_shape[1];
 int in_shape2 = in_shape[2];
 for (int i = 0; i < in_shape[0]; ++i) {
-size_t in_dim0_offset = i * param->in_strides_[0];
-size_t out_dim0_offset = i * param->out_strides_[0];
+int in_dim0_offset = i * param->in_strides_[0];
+int out_dim0_offset = i * param->out_strides_[0];
 for (int j = 0; j < in_shape1; ++j) {
-size_t in_dim1_offset = j * param->in_strides_[1] + in_dim0_offset;
-size_t out_dim1_offset = j * param->out_strides_[1] + out_dim0_offset;
+int in_dim1_offset = j * param->in_strides_[1] + in_dim0_offset;
+int out_dim1_offset = j * param->out_strides_[1] + out_dim0_offset;
 for (int k = 0; k < param->in_strides_[2]; ++k) {
 for (int l = 0; l < in_shape2; ++l) {
-size_t offset = param->in_strides_[2] * l + k + in_dim1_offset;
-param->arg_elements_[l].index_ = l;
+int offset = param->in_strides_[2] * l + k + in_dim1_offset;
+param->arg_elements_[l].index_ = (uint32_t)l;
 param->arg_elements_[l].data_.f_data_ = input[offset] * in_quant_arg->scale_ + bias;
 }
 if (param->get_max_) {

@@ -172,7 +172,7 @@ void Int8ArgMinMaxDim2(const int8_t *input, int8_t *output, const int *in_shape,
 qsort(param->arg_elements_, in_shape2, sizeof(ArgElement), ArgCompareAscInt8);
 }
 for (int l = 0; l < param->topk_; ++l) {
-size_t out_offset = out_dim1_offset + k + l * param->out_strides_[2];
+int out_offset = out_dim1_offset + k + l * param->out_strides_[2];
 float real_out = out_value ? param->arg_elements_[l].data_.f_data_ : param->arg_elements_[l].index_;
 output[out_offset] = GetInt8Output(real_out, output_inverse_scale, output_zp);
 }

@@ -191,17 +191,17 @@ void Int8ArgMinMaxDim3(const int8_t *input, int8_t *output, const int *in_shape,
 int in_shape2 = in_shape[2];
 int in_shape3 = in_shape[3];
 for (int i = 0; i < in_shape[0]; ++i) {
-size_t in_dim0_offset = i * param->in_strides_[0];
-size_t out_dim0_offset = i * param->out_strides_[0];
+int in_dim0_offset = i * param->in_strides_[0];
+int out_dim0_offset = i * param->out_strides_[0];
 for (int j = 0; j < in_shape1; ++j) {
-size_t in_dim1_offset = j * param->in_strides_[1] + in_dim0_offset;
-size_t out_dim1_offset = j * param->out_strides_[1] + out_dim0_offset;
+int in_dim1_offset = j * param->in_strides_[1] + in_dim0_offset;
+int out_dim1_offset = j * param->out_strides_[1] + out_dim0_offset;
 for (int k = 0; k < in_shape2; ++k) {
-size_t in_dim2_offset = k * param->in_strides_[2] + in_dim1_offset;
-size_t out_dim2_offset = k * param->out_strides_[2] + out_dim1_offset;
+int in_dim2_offset = k * param->in_strides_[2] + in_dim1_offset;
+int out_dim2_offset = k * param->out_strides_[2] + out_dim1_offset;
 for (int l = 0; l < in_shape3; ++l) {
-size_t offset = l + in_dim2_offset;
-param->arg_elements_[l].index_ = l;
+int offset = l + in_dim2_offset;
+param->arg_elements_[l].index_ = (uint32_t)l;
 param->arg_elements_[l].data_.f_data_ = input[offset] * in_quant_arg->scale_ + bias;
 }
 if (param->get_max_) {

@@ -210,7 +210,7 @@ void Int8ArgMinMaxDim3(const int8_t *input, int8_t *output, const int *in_shape,
 qsort(param->arg_elements_, in_shape3, sizeof(ArgElement), ArgCompareAscInt8);
 }
 for (int l = 0; l < param->topk_; ++l) {
-size_t out_offset = out_dim2_offset + l;
+int out_offset = out_dim2_offset + l;
 float real_out = out_value ? param->arg_elements_[l].data_.f_data_ : param->arg_elements_[l].index_;
 output[out_offset] = GetInt8Output(real_out, output_inverse_scale, output_zp);
 }
@@ -218,7 +218,7 @@ int16x4_t ClacSumHalfWord(int32x4_t scaled_input, int32x4_t left_shift_out_vec,
 void SquareInt8NEON(const int8_t *input_data, int8_t *output_data, int64_t element_size, ArithSelfQuantArg para,
 int *index) {
 int32x4_t output_multiplier_vec = vdupq_n_s32(para.output_multiplier_);
-int32x4_t left_shift_out_vec = vdupq_n_s32(1 << para.shift_left_);
+int32x4_t left_shift_out_vec = vdupq_n_s32(1 << (size_t)para.shift_left_);
 
 for (; (*index) <= element_size - 8; (*index) += 8) {
 int16x8_t input_val = LoadAndAddOffset(input_data, *index, para.in_args_.zp_);

@@ -812,11 +812,11 @@ void Conv3x3Int8InputTransform(const int16_t *input_data, int16_t *trans_input,
 for (int j = real_y_start; j < real_y_end; j++) {
 const int16_t *src = input_data + src_c8_offset + C8NUM * (j * input_width + real_x_start);
 int16_t *dst = tmp_data + C8NUM * (C4NUM * j + real_x_start);
-memcpy(dst, src, (real_x_end - real_x_start) * C8NUM * sizeof(int16_t));
+memcpy(dst, src, (size_t)(real_x_end - real_x_start) * C8NUM * sizeof(int16_t));
 }
 // input transform
 int dst_ic8_offset = dst_plane_offset + ic * TILE_NUM * C8NUM;
-size_t dst_step = ic8 * C8NUM * TILE_NUM;
+size_t dst_step = (size_t)ic8 * C8NUM * TILE_NUM;
 int16_t *trans_input_ptr = trans_input + dst_ic8_offset;
 Conv3x3Int8InputUnit(tmp_data, trans_input_ptr, dst_step, input_zp);
 }

@@ -826,7 +826,7 @@ void Conv3x3Int8InputTransform(const int16_t *input_data, int16_t *trans_input,
 void Conv3x3Int8Gemm(int32_t *dst, const int16_t *src, const int16_t *weight, int oc, int ic8, size_t real_cal_num) {
 int oc4 = UP_DIV(oc, C4NUM);
 #ifdef ENABLE_ARM
-IndirectGemmInt16to32_8x4(dst, src, weight, 16, ic8, oc4, oc4 * 4 * 16 * sizeof(int32_t));
+IndirectGemmInt16to32_8x4(dst, src, weight, 16, ic8, oc4, (size_t)oc4 * 4 * 16 * sizeof(int32_t));
 #else
 const int input_unit_square = 16;
 for (int c = 0; c < oc4; c++) {
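Many of the remaining hunks follow the same recipe seen here: an `int` byte-count expression is explicitly cast to `size_t` (or `static_cast<size_t>` in the C++ kernels) before being passed to `memcpy`, `memset`, or `malloc`. The conversion happened implicitly before; making it explicit keeps the signed/unsigned checker quiet and documents the assumption that the value is non-negative. A standalone illustration, not code from the patch and with invented names:

```cpp
#include <cstdio>
#include <cstring>

int main() {
  char src[16] = "hello";
  char dst[16] = {0};

  int real_x_end = 4, real_x_start = 1;         // hypothetical extents
  int count = (real_x_end - real_x_start) * 2;  // signed arithmetic
  // The third memcpy argument is size_t; casting makes the signed-to-unsigned
  // conversion visible instead of implicit.
  memcpy(dst, src, (size_t)count * sizeof(char));
  printf("%s\n", dst);

  // The cast does not rescue a negative length, it only documents intent:
  int bad = real_x_start - real_x_end;          // -3
  printf("as size_t: %zu\n", (size_t)bad);      // wraps to a huge value
  return 0;
}
```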
@@ -20,9 +20,9 @@
 int DeConvPostInt8C4(const int32_t *src, const int32_t *bias, int32_t *tmp, int8_t *out, int output_channel,
 const ConvParameter *conv_param) {
 /* row4x4-major(ih*iw x oc*kh*kw) -> row4-major(oh*ow x oc) */
-size_t input_plane = conv_param->input_w_ * conv_param->input_h_;
-size_t kernel_plane = conv_param->kernel_w_ * conv_param->kernel_h_;
-size_t output_plane = conv_param->output_w_ * conv_param->output_h_;
+int input_plane = conv_param->input_w_ * conv_param->input_h_;
+int kernel_plane = conv_param->kernel_w_ * conv_param->kernel_h_;
+int output_plane = conv_param->output_w_ * conv_param->output_h_;
 int oc4 = UP_DIV(output_channel, C4NUM);
 int in_plane4 = UP_ROUND(input_plane, C4NUM);
 

@@ -38,7 +38,7 @@ int DeConvPostInt8C4(const int32_t *src, const int32_t *bias, int32_t *tmp, int8
 for (int c = 0; c < oc4; c++) {
 int32_t *dst_ptr = tmp + c * output_plane * C4NUM;
 const int32_t *src_ptr = src + c * in_plane4 * kernel_plane * C4NUM;
-memset(dst_ptr, 0, output_plane * C4NUM * sizeof(int32_t));
+memset(dst_ptr, 0, (size_t)output_plane * C4NUM * sizeof(int32_t));
 
 for (int ih = 0; ih < conv_param->input_h_; ih++) {
 for (int iw = 0; iw < conv_param->input_w_; iw++) {

@@ -81,7 +81,7 @@ int DeConvPostInt8C4(const int32_t *src, const int32_t *bias, int32_t *tmp, int8
 } /*ih*/
 } /*oc*/
 
-PostFuncInt8C4(tmp, bias, out, output_channel, output_plane, conv_param->output_channel_,
+PostFuncInt8C4(tmp, bias, out, output_channel, (size_t)output_plane, conv_param->output_channel_,
 conv_param->conv_quant_arg_.quant_multiplier_[0], conv_param->conv_quant_arg_.left_shift_[0],
 conv_param->conv_quant_arg_.right_shift_[0], conv_param->conv_quant_arg_.output_quant_args_[0].zp_,
 conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0]);

@@ -39,7 +39,7 @@ int HSwishInt8(const int8_t *src, int length, int8_t *dst, HswishQuantArg *arg)
 if (arg->relu6_multiplier_exponent < 0) {
 relu6_value = RoundingDivideByPOT(relu6_value, -arg->relu6_multiplier_exponent);
 }
-relu6_value = (relu6_value + (1 << 15)) >> 1;
+relu6_value = (size_t)(relu6_value + (1 << 15)) >> 1;
 const int16_t preshift_output_value =
 SaturatingRoundingDoublingHighMulInt16(relu6_value, input_value_on_preshift_output_scale);
 
@@ -104,7 +104,7 @@ void RowMajor2Row16x4MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row,
 
 for (int ri = 0; ri < row_4div; ri += C4NUM) {
 for (int ci = 0; ci < col_16div; ci += C16NUM) {
-size_t col_offset = col;
+size_t col_offset = (size_t)col;
 int8_t *src_c = src_r + ci;
 int8_t *dst_c = dst_r + ci * C4NUM;
 #ifdef ENABLE_ARM64

@@ -207,7 +207,7 @@ void MatMulInt8_4x2_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row,
 int c2div = c / C2NUM, c2mod = c % C2NUM;
 size_t ci = r * stride + c;
 int32_t value = 0;
-for (int d = 0; d < deep_16; d++) {
+for (int d = 0; d < (int)deep_16; d++) {
 int d16div = d / C16NUM, d16mod = d % C16NUM;
 size_t ai = r4div * deep_16 * C4NUM + d16div * C4NUM * C16NUM + r4mod * C16NUM + d16mod;
 size_t bi = c2div * deep_16 * C2NUM + d16div * C2NUM * C16NUM + c2mod * C16NUM + d16mod;

@@ -269,9 +269,9 @@ void MatmulInt8Opt(const int8_t *a, const int8_t *b, int8_t *dst, int row, int c
 #endif
 
 void MatMulInt8_8x8_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4,
-size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
-int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi,
-size_t per_channel) {
+size_t stride, const int32_t *input_sum, const int32_t *bias, const int32_t *left_shift,
+const int32_t *right_shift, const int32_t *multiplier, int32_t output_zp, int32_t mini,
+int32_t maxi, size_t per_channel) {
 /* row8x4-major * row4x8-major => (int8)row-major */
 for (int r = 0; r < row; r++) {
 for (int c = 0; c < col; c++) {

@@ -279,7 +279,7 @@ void MatMulInt8_8x8_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row,
 int c8div = c / C8NUM, c8mod = c % C8NUM;
 size_t ci = r * stride + c;
 int32_t value = 0;
-for (int d = 0; d < deep_4; d++) {
+for (int d = 0; d < (int)deep_4; d++) {
 int d4div = d / C4NUM, d4mod = d % C4NUM;
 size_t ai = r8div * deep_4 * C8NUM + d4div * C8NUM * C4NUM + r8mod * C4NUM + d4mod;
 size_t bi = c8div * deep_4 * C8NUM + d4div * C8NUM * C4NUM + c8mod * C4NUM + d4mod;

@@ -302,9 +302,9 @@ void MatMulInt8_8x8_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row,
 }
 
 void MatMulInt8_4x16_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4,
-size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
-int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi,
-size_t per_channel, int32_t *filter_zp) {
+size_t stride, const int32_t *input_sum, const int32_t *bias, const int32_t *left_shift,
+const int32_t *right_shift, const int32_t *multiplier, int32_t output_zp, int32_t mini,
+int32_t maxi, size_t per_channel, const int32_t *filter_zp) {
 /* row4x4-major * row4x16-major => (int8)row-major */
 for (int r = 0; r < row; r++) {
 for (int c = 0; c < col; c++) {

@@ -312,7 +312,7 @@ void MatMulInt8_4x16_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row
 int c16div = c / C16NUM, c16mod = c % C16NUM;
 size_t ci = r * stride + c;
 int32_t value = 0;
-for (int d = 0; d < deep_4; d++) {
+for (int d = 0; d < (int)deep_4; d++) {
 int d4div = d / C4NUM, d4mod = d % C4NUM;
 size_t ai = r4div * deep_4 * C4NUM + d4div * C4NUM * C4NUM + r4mod * C4NUM + d4mod;
 size_t bi = c16div * deep_4 * C16NUM + d4div * C16NUM * C4NUM + c16mod * C4NUM + d4mod;

@@ -453,7 +453,7 @@ void PackInput4x4AndInputSumPert(const int8_t *src_input, int8_t *packed_input,
 #else
 int32_t tmp_sum_value[4] = {0};
 for (int ici = 0; ici < ic_4div; ici += C4NUM) {
-for (int i = 0; i < C4NUM; i++) {
+for (size_t i = 0; i < C4NUM; i++) {
 tmp_sum_value[i] += src_ic[0 + i * input_channel];
 tmp_sum_value[i] += src_ic[1 + i * input_channel];
 tmp_sum_value[i] += src_ic[2 + i * input_channel];
@@ -42,9 +42,9 @@ void MatmulInt8Opt(const int8_t *a, const int8_t *b, int8_t *dst, int row, int c
 /* optimize conv */
 void RowMajor2Row8x4MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
 void MatMulInt8_8x8_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4,
-size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
-int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi,
-size_t per_channel);
+size_t stride, const int32_t *input_sum, const int32_t *bias, const int32_t *left_shift,
+const int32_t *right_shift, const int32_t *multiplier, int32_t output_zp, int32_t mini,
+int32_t maxi, size_t per_channel);
 
 /* 4x16 16x2 -> 4x2 */
 /* arm32 conv1x1 */

@@ -61,9 +61,9 @@ void RowMajor2Row4x16MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row,
 void PackInput4x4AndInputSumPert(const int8_t *src_input, int8_t *packed_input, int32_t *input_sum,
 size_t input_channel, size_t plane_size, int32_t filter_zp);
 void MatMulInt8_4x16_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4,
-size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
-int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi,
-size_t per_channel, int32_t *filter_zp);
+size_t stride, const int32_t *input_sum, const int32_t *bias, const int32_t *left_shift,
+const int32_t *right_shift, const int32_t *multiplier, int32_t output_zp, int32_t mini,
+int32_t maxi, size_t per_channel, const int32_t *filter_zp);
 
 #ifdef ENABLE_ARM64
 void MatmulInt8Neon64(const int8_t *a, const int8_t *b, int8_t *dst, int row4, int col4, int deep16, const int *a_sums,
@@ -27,10 +27,10 @@ int16x4_t ClacSumHalfWordMul(int16x4_t scaled_input0, int16x4_t scaled_input1, i
 return vqmovn_s32(raw_sum);
 }
 
-void MulInt8NEON(int8_t *input0_data, int8_t *input1_data, int8_t *output_data, int64_t real_dst_count,
-MulQuantArg *quant_arg, int *index) {
+void MulInt8NEON(const int8_t *input0_data, const int8_t *input1_data, int8_t *output_data, int64_t real_dst_count,
+const MulQuantArg *quant_arg, int *index) {
 int32x4_t output_multiplier_vec = vdupq_n_s32(quant_arg->output_multiplier_);
-int32x4_t left_shift_out_vec = vdupq_n_s32(1 << quant_arg->shift_left_);
+int32x4_t left_shift_out_vec = vdupq_n_s32(1 << (size_t)quant_arg->shift_left_);
 int32x4_t right_shift_out_vec = vdupq_n_s32(-quant_arg->shift_right_);
 int16x8_t out_zp_vec = vdupq_n_s16(quant_arg->out_quant_arg_.zp_);
 int8x16_t out_min_vec = vdupq_n_s8(quant_arg->output_activation_min_);

@@ -104,8 +104,8 @@ void MulInt8NEON(int8_t *input0_data, int8_t *input1_data, int8_t *output_data,
 }
 #endif
 
-void FastMul(int8_t *input0_data, int8_t *input1_data, int8_t *output_data, int depth, int64_t real_dst_count,
-bool input1_broad, MulQuantArg *quant_arg) {
+void FastMul(const int8_t *input0_data, const int8_t *input1_data, int8_t *output_data, int depth,
+int64_t real_dst_count, bool input1_broad, const MulQuantArg *quant_arg) {
 // input0 need broadcast
 int32_t zp1 = quant_arg->in_quant_args_[0].zp_;
 int32_t zp2 = quant_arg->in_quant_args_[1].zp_;

@@ -215,8 +215,8 @@ void FastMul(int8_t *input0_data, int8_t *input1_data, int8_t *output_data, int
 return;
 }
 
-void Mul(int8_t *input0_data, int8_t *input1_data, int8_t *output_data, int64_t real_dst_count,
-MulQuantArg *quant_arg) {
+void Mul(const int8_t *input0_data, const int8_t *input1_data, int8_t *output_data, int64_t real_dst_count,
+const MulQuantArg *quant_arg) {
 int index = 0;
 #ifdef ENABLE_NEON
 MulInt8NEON(input0_data, input1_data, output_data, real_dst_count, quant_arg, &index);
@@ -28,9 +28,10 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-void Mul(int8_t *input0_data, int8_t *input1_data, int8_t *output_data, int64_t real_dst_count, MulQuantArg *quant_arg);
-void FastMul(int8_t *input0_data, int8_t *input1_data, int8_t *output_data, int depth, int64_t real_dst_count,
-bool input1_broad, MulQuantArg *quant_arg);
+void Mul(const int8_t *input0_data, const int8_t *input1_data, int8_t *output_data, int64_t real_dst_count,
+const MulQuantArg *quant_arg);
+void FastMul(const int8_t *input0_data, const int8_t *input1_data, int8_t *output_data, int depth,
+int64_t real_dst_count, bool input1_broad, const MulQuantArg *quant_arg);
 #ifdef __cplusplus
 }
 #endif
@@ -849,7 +849,8 @@ void PackInputToC8Int8(const int8_t *input_data, int16_t *packed_input, ConvPara
 }
 }
 
-void PackWeightToC8Int8(const int8_t *origin_weight_data, int16_t *packed_weight_data, ConvParameter *conv_param) {
+void PackWeightToC8Int8(const int8_t *origin_weight_data, int16_t *packed_weight_data,
+const ConvParameter *conv_param) {
 // origin weight format : ohwi
 int input_channel = conv_param->input_channel_;
 int ic8 = input_channel / C8NUM * C8NUM;

@@ -40,7 +40,7 @@ void PackInputSum16x4Int8(const int8_t *input, int32_t *input_sum, const int32_t
 const ConvParameter *conv_param);
 void PackInputSum16x4PerLayer(const int8_t *src, int32_t *dst, int32_t filter_zp, size_t row4, size_t col16);
 void PackInputToC8Int8(const int8_t *input_data, int16_t *packed_input, ConvParameter *conv_param);
-void PackWeightToC8Int8(const int8_t *origin_weight_data, int16_t *packed_weight_data, ConvParameter *conv_param);
+void PackWeightToC8Int8(const int8_t *origin_weight_data, int16_t *packed_weight_data, const ConvParameter *conv_param);
 void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int8_t *matmul_input, int real_cal_num,
 int block_index, const int32_t *filter_zp, int32_t *input_sum,
 const ConvParameter *conv_param, bool per_channel, bool is_optimize);

@@ -26,7 +26,7 @@ int PadConstant4D(const int8_t *in_data, int8_t *out_data, const int32_t *in_dim
 for (int w = 0; w < in_dims[2]; w++) {
 const int8_t *in = in_data + Offset(in_dims, n, h, w, 0);
 int8_t *out = out_data + Offset(out_dims, n + paddings[0], h + paddings[2], w + paddings[4], paddings[6]);
-memcpy(out, in, copy_size * sizeof(int8_t));
+memcpy(out, in, (size_t)copy_size * sizeof(int8_t));
 }
 }
 }

@@ -112,7 +112,7 @@ int UInt8ToInt8(const uint8_t *real_values, int8_t *quant_values, int size) {
 }
 
 for (int i = 0; i < size; ++i) {
-int temp = real_values[i] - 128;
+int temp = (int)real_values[i] - 128;
 if (temp > 127) {
 quant_values[i] = 127;
 } else if (temp < -128) {
@@ -34,8 +34,8 @@ int16x4_t ClacSumHalfWordMul3(int32x4_t scaled_input0, int32x4_t scaled_input1,
 const ScaleParameter *scale_param) {
 int32x4_t output_multiplier_vec = vdupq_n_s32(scale_param->scale_mul_arg_.multiplier_);
 int32x4_t output_multiplier_vec2 = vdupq_n_s32(scale_param->offset_mul_arg_.multiplier_);
-int32x4_t left_shift_out_vec = vdupq_n_s32(1 << scale_param->scale_mul_arg_.left_shift_);
-int32x4_t left_shift_out_vec2 = vdupq_n_s32(1 << scale_param->offset_mul_arg_.left_shift_);
+int32x4_t left_shift_out_vec = vdupq_n_s32(1 << (size_t)(scale_param->scale_mul_arg_.left_shift_));
+int32x4_t left_shift_out_vec2 = vdupq_n_s32(1 << (size_t)(scale_param->offset_mul_arg_.left_shift_));
 int32x4_t input_scale = vmulq_s32(scaled_input0, scaled_input1);
 int32x4_t raw_sum = RoundingDivideByPOTInt32x4(
 SaturatingRoundingDoublingHighMulInt32x4(vmulq_s32(input_scale, left_shift_out_vec), output_multiplier_vec),

@@ -24,7 +24,7 @@
 #ifdef ENABLE_NEON
 
 int16x4_t DoClacSumHalfWord(int32x4_t scaled_input0, int32x4_t scaled_input1, int32x4_t left_shift_out_vec,
-int32x4_t output_multiplier_vec, SubQuantArg *para) {
+int32x4_t output_multiplier_vec, const SubQuantArg *para) {
 int32x4_t raw_data = vsubq_s32(scaled_input0, scaled_input1);
 
 raw_data = RoundingDivideByPOTInt32x4(vqrdmulhq_s32(vmulq_s32(raw_data, left_shift_out_vec), output_multiplier_vec),

@@ -35,14 +35,14 @@ int16x4_t DoClacSumHalfWord(int32x4_t scaled_input0, int32x4_t scaled_input1, in
 return vqmovn_s32(raw_data);
 }
 
-void SubInt8NEON(int8_t *input0_data, int8_t *input1_data, int8_t *output_data, int64_t real_dst_count,
-SubQuantArg *para, int *index) {
+void SubInt8NEON(const int8_t *input0_data, const int8_t *input1_data, int8_t *output_data, int64_t real_dst_count,
+const SubQuantArg *para, int *index) {
 int32x4_t left_shift_result0_vec = vdupq_n_s32(para->left_shift_result0_);
 int32x4_t left_shift_result1_vec = vdupq_n_s32(para->left_shift_result1_);
 int32x4_t input0_multiplier_vec = vdupq_n_s32(para->input0_multiplier_);
 int32x4_t input1_multiplier_vec = vdupq_n_s32(para->input1_multiplier_);
 int32x4_t output_multiplier_vec = vdupq_n_s32(para->output_multiplier_);
-int32x4_t left_shift_out_vec = vdupq_n_s32((1 << para->left_shift_out_));
+int32x4_t left_shift_out_vec = vdupq_n_s32((1 << (size_t)para->left_shift_out_));
 int32x4_t right_shift0_vec = vdupq_n_s32(-para->right_shift0_);
 int32x4_t right_shift1_vec = vdupq_n_s32(-para->right_shift1_);
 
@@ -226,16 +226,16 @@ void TransposeDimsInt8(const int8_t *in_data, int8_t *out_data, const int *outpu
 const int *strides = transpose_param->strides_;
 const int *out_strides = transpose_param->out_strides_;
 int num_axes = transpose_param->num_axes_;
-size_t data_size = (*out_strides) * output_shape[0];
+size_t data_size = (size_t)((*out_strides) * output_shape[0]);
 size_t offset_size = UP_DIV(data_size, thread_num);
 size_t task_offset = offset_size * task_id;
-int count = data_size - task_offset;
-if (count <= 0) {
+size_t count = data_size - task_offset;
+if (data_size < task_offset) {
 return;
 }
 count = MSMIN(offset_size, count);
 for (size_t idx = task_offset; idx < task_offset + count; ++idx) {
-int pos = idx;
+int pos = (int)idx;
 int output_idx = 0;
 int input_idx = 0;
 for (int i = 0; i < num_axes; ++i) {
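This TransposeDimsInt8 hunk is the one place where a guard had to change shape, not just a type: with `count` now `size_t`, `data_size - task_offset` wraps instead of going negative, so the old `count <= 0` test could only ever catch an exact zero. The check is therefore rewritten as a comparison of the two unsigned operands, and the wrapped difference is never used on the early-return path. A self-contained illustration (names invented, not the kernel's API):

```cpp
#include <cstdio>
#include <cstddef>

// Hypothetical work splitter mirroring the pattern in the hunk above.
void do_slice(size_t data_size, size_t offset_size, int task_id) {
  size_t task_offset = offset_size * static_cast<size_t>(task_id);

  // Wrong once everything is unsigned: the subtraction wraps around,
  // so a "count <= 0" guard borrowed from the signed version never fires.
  //   size_t count = data_size - task_offset;
  //   if (count <= 0) return;   // only triggers when count == 0

  // Correct: compare the unsigned operands before relying on the difference.
  if (data_size < task_offset) {
    return;
  }
  size_t count = data_size - task_offset;
  printf("task %d sees %zu remaining elements\n", task_id, count);
}

int main() {
  do_slice(10, 4, 0);  // 10 remaining
  do_slice(10, 4, 1);  // 6 remaining
  do_slice(10, 4, 3);  // offset 12 > 10: returns instead of wrapping
  return 0;
}
```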
@@ -24,7 +24,7 @@ int Int8Unsqueeze(const int8_t *input_ptr, int8_t *output_ptr, UnSqueezeParamete
 float input_scale = para_->quant_arg.in_quant_args_.scale_;
 int8_t input_zp = para_->quant_arg.in_quant_args_.zp_;
 
-for (int i = task_id; i < data_size; i += para_->thread_count_) {
+for (int i = task_id; i < (int)data_size; i += para_->thread_count_) {
 output_ptr[i] = output_zp + round(1 / output_scale * input_scale * (input_ptr[i] - input_zp));
 }
 return 0;

@@ -23,14 +23,15 @@ typedef void (*MATMUL_OPT_R4_FUNC)(const int8_t *a, const int8_t *b, int *dst, i
 const int *input_sum, const int *bias);
 
 typedef void (*MATMUL_OPT_R_FUNC)(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4,
-size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
-int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini,
-int32_t maxi, size_t per_channel);
+size_t stride, const int32_t *input_sum, const int32_t *bias,
+const int32_t *left_shift, const int32_t *right_shift, const int32_t *multiplier,
+int32_t output_zp, int32_t mini, int32_t maxi, size_t per_channel);
 
 typedef void (*MATMUL_OPT_DP_FUNC)(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4,
-size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
-int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini,
-int32_t maxi, size_t per_channel, int *filter_zp);
+size_t stride, const int32_t *input_sum, const int32_t *bias,
+const int32_t *left_shift, const int32_t *right_shift, const int32_t *multiplier,
+int32_t output_zp, int32_t mini, int32_t maxi, size_t per_channel,
+const int *filter_zp);
 
 typedef enum OutType { OutType_C8 = 0, OutType_Nhwc = 1, OutType_TileC8 = 2 } OutType;
 
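These function-pointer typedefs have to stay in sync with the const-qualified implementations changed earlier in the patch: a function whose pointer parameters are `const int32_t *` is not assignable to a function pointer declared with plain `int32_t *` parameters (an error in C++, a constraint violation/diagnostic in C), so updating the matmul signatures forces the same update here. A reduced sketch with hypothetical names, not the library's actual typedef:

```cpp
#include <cstdint>
#include <cstddef>

// Reduced, hypothetical version of the pattern: the typedef and the
// implementation must agree on the const-ness of pointer parameters.
typedef void (*MATMUL_FUNC)(const int32_t *left_shift, size_t per_channel);

void MatMulConst(const int32_t *left_shift, size_t per_channel) {
  (void)left_shift;
  (void)per_channel;
}

void MatMulNonConst(int32_t *left_shift, size_t per_channel) {
  (void)left_shift;
  (void)per_channel;
}

int main() {
  MATMUL_FUNC fn = MatMulConst;        // OK: signatures match exactly
  // MATMUL_FUNC bad = MatMulNonConst; // rejected: int32_t * vs const int32_t *
  fn(nullptr, 0);
  return 0;
}
```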
@@ -165,7 +165,7 @@ int Conv2DINT8Coder::InitWeightBias(CoderContext *const context) {
 }
 
 int Conv2DINT8Coder::Prepare(CoderContext *const context) {
-Conv2DBaseCoder::Init();
+MS_CHECK_RET_CODE(Conv2DBaseCoder::Init(), "Conv2d base init failed.");
 CheckSupportOptimize();
 MS_CHECK_RET_CODE(SetQuantParam(), "Set quant param failed!");
 MS_CHECK_RET_CODE(InitWeightBias(context), "Init weight bias failed.");

@@ -24,7 +24,7 @@
 
 namespace mindspore::lite::micro {
 int ConvolutionDepthwiseINT8Coder::Prepare(CoderContext *const context) {
-Conv2DBaseCoder::Init();
+MS_CHECK_RET_CODE(Conv2DBaseCoder::Init(), "Conv2d base init failed.");
 // init sliding window param
 MS_CHECK_RET_CODE(SetQuantParam(), "Set quant param failed.");
 MS_CHECK_RET_CODE(InitWeightBias(context), "dwconvolution do init weightbais failed");

@@ -69,7 +69,7 @@ int ReduceInt8Coder::CalculateQuantArgs() {
 QuantizeMultiplierSmallerThanOne(prod_multiplier, &qm->multiplier_, &shift);
 qm->left_shift_ = shift < 0 ? -shift : 0;
 qm->right_shift_ = shift > 0 ? shift : 0;
-mean_multipliers_.push_back(qm);
+prod_multipliers_.push_back(qm);
 }
 }
 
@@ -30,7 +30,21 @@ class ReduceInt8Coder final : public ReduceBaseCoder {
 const Model::Node *node, size_t node_index, Target target)
 : ReduceBaseCoder(in_tensors, out_tensors, node, node_index, target) {}
 
-~ReduceInt8Coder() override { begin_src_data_ = nullptr; }
+~ReduceInt8Coder() override {
+begin_src_data_ = nullptr;
+for (auto &arg : mean_multipliers_) {
+delete arg;
+arg = nullptr;
+}
+for (auto &arg : prod_multipliers_) {
+delete arg;
+arg = nullptr;
+}
+for (auto &arg : sum_square_multipliers_) {
+delete arg;
+arg = nullptr;
+}
+}
 
 int Prepare(CoderContext *const context) override;
 int DoCode(CoderContext *const context) override;
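The expanded destructor closes a leak: the three multiplier vectors evidently hold raw pointers allocated in CalculateQuantArgs() (the .cc hunk above pushes `qm` into them), so they have to be deleted when the coder is destroyed. A generic sketch of the ownership pattern, with hypothetical type names rather than the coder's real API:

```cpp
#include <vector>

struct QuantArg {  // stand-in for the real multiplier struct
  int multiplier_ = 0;
  int left_shift_ = 0;
  int right_shift_ = 0;
};

class Owner {
 public:
  void Calculate() {
    // Raw pointers pushed into a member vector, as the coder does.
    multipliers_.push_back(new QuantArg{123, 0, 7});
  }
  ~Owner() {
    for (auto &arg : multipliers_) {  // release everything we allocated
      delete arg;
      arg = nullptr;
    }
  }

 private:
  std::vector<QuantArg *> multipliers_;
};

int main() {
  Owner owner;
  owner.Calculate();
  return 0;  // destructor frees the QuantArg instances
}
```

Holding the elements as `std::unique_ptr` would make this cleanup automatic; the patch keeps raw pointers and frees them explicitly in the destructor.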
@@ -29,7 +29,7 @@ using mindspore::schema::PrimitiveType_Softmax;
 
 namespace mindspore::lite::micro::nnacl {
 int SoftMaxInt8Coder::Prepare(CoderContext *const context) {
-SoftmaxBaseCoder::Init();
+MS_CHECK_RET_CODE(SoftmaxBaseCoder::Init(), "Softmax base init failed.");
 std::vector<LiteQuantParam> in_quant_args = input_tensor_->quant_params();
 quant_params_.in_quant_args_.scale_ = in_quant_args.at(0).scale;
 quant_params_.in_quant_args_.zp_ = -in_quant_args.at(0).zeroPoint;

@@ -59,8 +59,7 @@ int SoftMaxInt8Coder::Prepare(CoderContext *const context) {
 sum_data_size_ = inner_size * sizeof(int);
 sum_data_ = static_cast<int *>(allocator_->Malloc(kNumberTypeInt32, sum_data_size_, kWorkspace));
 MS_CHECK_PTR(sum_data_);
-ReSize();
-return RET_OK;
+return ReSize();
 }
 
 int SoftMaxInt8Coder::DoCode(CoderContext *const context) {

@@ -20,11 +20,12 @@ extern void MatMulOptR4Int8Neon64(const int8_t *a, const int8_t *b, int *dst, in
 const int *input_sum, const int *bias);
 extern void MatmulInt8DpNeon64(const int8_t *a, const int8_t *b, int8_t *dst, int row8, int col8, int deep4,
 const int *a_sums, const int *bias, int act_min, int act_max, int out_zp,
-int *multiplier, int *left_shift, int *right_shift, int row, int col, int stride,
-size_t peroc);
+const int *multiplier, const int *left_shift, const int *right_shift, int row, int col,
+int stride, size_t peroc);
 extern void MatmulInt8DpOpt(const int8_t *a, const int8_t *b, int8_t *dst, size_t row8, size_t col8, size_t deep4,
-const int *a_sums, const int *bias, int act_min, int act_max, int out_zp, int *multiplier,
-int *left_shift, int *right_shift, size_t stride, size_t peroc, int *filter_zp);
+const int *a_sums, const int *bias, int act_min, int act_max, int out_zp,
+const int *multiplier, const int *left_shift, const int *right_shift, size_t stride,
+size_t peroc, const int *filter_zp);
 
 #ifdef ENABLE_ARM64
 void MatMulR4Int8_optimize_handler(const int8_t *a, const int8_t *b, int *dst, int row4, int col4, int deep16,

@@ -33,16 +34,17 @@ void MatMulR4Int8_optimize_handler(const int8_t *a, const int8_t *b, int *dst, i
 }
 
 void MatMulRInt8_optimize_handler(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4,
-size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
-int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini,
-int32_t maxi, size_t per_channel) {
+size_t stride, const int32_t *input_sum, const int32_t *bias,
+const int32_t *left_shift, const int32_t *right_shift, const int32_t *multiplier,
+int32_t output_zp, int32_t mini, int32_t maxi, size_t per_channel) {
 return MatmulInt8DpNeon64(a, b, dst, UP_ROUND(row, C8NUM), UP_ROUND(col, C8NUM), deep_4, input_sum, bias, mini, maxi,
 output_zp, multiplier, left_shift, right_shift, row, col, stride, per_channel);
 }
 void MatMulDpInt8_optimize_handler(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4,
-size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
-int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini,
-int32_t maxi, size_t per_channel, int32_t *filter_zp) {
+size_t stride, const int32_t *input_sum, const int32_t *bias,
+const int32_t *left_shift, const int32_t *right_shift, const int32_t *multiplier,
+int32_t output_zp, int32_t mini, int32_t maxi, size_t per_channel,
+const int32_t *filter_zp) {
 return MatmulInt8DpOpt(a, b, dst, row, col, deep_4, input_sum, bias, mini, maxi, output_zp, multiplier, left_shift,
 right_shift, stride, per_channel, filter_zp);
 }

@@ -29,13 +29,14 @@ void MatMulR4Int8_optimize_handler(const int8_t *a, const int8_t *b, int *dst, i
 const int *input_sum, const int *bias);
 
 void MatMulRInt8_optimize_handler(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4,
-size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
-int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini,
-int32_t maxi, size_t per_channel);
+size_t stride, const int32_t *input_sum, const int32_t *bias,
+const int32_t *left_shift, const int32_t *right_shift, const int32_t *multiplier,
+int32_t output_zp, int32_t mini, int32_t maxi, size_t per_channel);
 void MatMulDpInt8_optimize_handler(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4,
-size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
-int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini,
-int32_t maxi, size_t per_channel, int32_t *filter_zp);
+size_t stride, const int32_t *input_sum, const int32_t *bias,
+const int32_t *left_shift, const int32_t *right_shift, const int32_t *multiplier,
+int32_t output_zp, int32_t mini, int32_t maxi, size_t per_channel,
+const int32_t *filter_zp);
 #endif
 
 #endif  // MINDSPORE_LITE_MICRO_CODER_OPERATOR_LIBRARY_OPTIMIZE_HANDLER_WRAPPER_H_
@@ -35,7 +35,7 @@ int Conv1x1Init(int8_t *src_weight, int32_t *src_bias, int32_t *filter_zps, int3
 memset(packed_weight_, 0, size);
 RowMajor2Row2x16MajorInt8(src_weight, packed_weight_, output_channel, input_channel);
 /* bias */
-size = UP_ROUND(output_channel, C2NUM);
+size = (size_t)UP_ROUND(output_channel, C2NUM);
 int32_t *bias_data_ = (int32_t *)malloc(size * sizeof(int32_t));
 if (bias_data_ == NULL) {
 free(packed_weight_);

@@ -43,7 +43,7 @@ int Conv1x1Init(int8_t *src_weight, int32_t *src_bias, int32_t *filter_zps, int3
 }
 memset(bias_data_, 0, size * sizeof(int32_t));
 if (src_bias != NULL) {
-memcpy(bias_data_, src_bias, output_channel * sizeof(int32_t));
+memcpy(bias_data_, src_bias, (size_t)output_channel * sizeof(int32_t));
 }
 #else
 /* InitWeightBias */

@@ -42,7 +42,7 @@ class ArithmeticInt8CPUKernel : public InnerKernel {
 int8_t *tile_data0_{nullptr};
 int8_t *tile_data1_{nullptr};
 ArithmeticRunInt8 arithmetic_run_{nullptr};
-ArithmeticQuantArg quant_args_;
+ArithmeticQuantArg quant_args_ = {};
 };
 }  // namespace mindspore::kernel
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_ARITHMETIC_INT8_H_
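`ArithmeticQuantArg quant_args_ = {};` (and the matching `GatherQuantArg param_ = {};` further down) value-initializes the aggregate member, so its fields start at zero rather than holding indeterminate values until an Init()/Prepare() path fills them in. A tiny sketch of the difference, using a hypothetical struct:

```cpp
#include <cstdio>

struct QuantArg {  // hypothetical stand-in for the kernel's quant structs
  double scale;
  int zero_point;
};

struct KernelA {
  QuantArg quant_args_;       // default-initialized: members are indeterminate
};

struct KernelB {
  QuantArg quant_args_ = {};  // value-initialized: scale == 0.0, zero_point == 0
};

int main() {
  KernelB b;
  printf("%f %d\n", b.quant_args_.scale, b.quant_args_.zero_point);  // 0.000000 0
  // Reading KernelA's quant_args_ before it is assigned would be undefined behavior.
  return 0;
}
```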
@@ -48,12 +48,12 @@ int BatchnormInt8CPUKernel::InitConstTensor() {
 
 auto mean_ptr = reinterpret_cast<int8_t *>(mean->MutableData());
 auto var_ptr = reinterpret_cast<int8_t *>(variance->MutableData());
-alpha_addr_ = reinterpret_cast<float *>(malloc(mean->ElementsNum() * sizeof(float)));
+alpha_addr_ = reinterpret_cast<float *>(malloc(static_cast<size_t>(mean->ElementsNum()) * sizeof(float)));
 if (alpha_addr_ == nullptr) {
 MS_LOG(ERROR) << "Malloc buffer failed.";
 return RET_ERROR;
 }
-beta_addr_ = reinterpret_cast<float *>(malloc(variance->ElementsNum() * sizeof(float)));
+beta_addr_ = reinterpret_cast<float *>(malloc(static_cast<size_t>(variance->ElementsNum()) * sizeof(float)));
 if (beta_addr_ == nullptr) {
 MS_LOG(ERROR) << "Malloc buffer failed.";
 return RET_ERROR;

@@ -92,12 +92,12 @@ int BatchnormInt8CPUKernel::InitFusedConstTensor() {
 auto mean_ptr = reinterpret_cast<int8_t *>(mean->MutableData());
 auto var_ptr = reinterpret_cast<int8_t *>(variance->MutableData());
 
-alpha_addr_ = reinterpret_cast<float *>(malloc(mean->ElementsNum() * sizeof(float)));
+alpha_addr_ = reinterpret_cast<float *>(malloc(static_cast<size_t>(mean->ElementsNum()) * sizeof(float)));
 if (alpha_addr_ == nullptr) {
 MS_LOG(ERROR) << "Malloc buffer failed.";
 return RET_ERROR;
 }
-beta_addr_ = reinterpret_cast<float *>(malloc(variance->ElementsNum() * sizeof(float)));
+beta_addr_ = reinterpret_cast<float *>(malloc(static_cast<size_t>(variance->ElementsNum()) * sizeof(float)));
 if (beta_addr_ == nullptr) {
 MS_LOG(ERROR) << "Malloc buffer failed.";
 return RET_ERROR;
|
|||
}
|
||||
|
||||
int ConcatInt8CPUKernel::ReSize() {
|
||||
concat_param_->axis_ =
|
||||
concat_param_->axis_ >= 0 ? concat_param_->axis_ : in_tensors_.front()->shape().size() + concat_param_->axis_;
|
||||
concat_param_->axis_ = concat_param_->axis_ >= 0
|
||||
? concat_param_->axis_
|
||||
: static_cast<int>(in_tensors_.front()->shape().size()) + concat_param_->axis_;
|
||||
|
||||
auto input_num = in_tensors_.size();
|
||||
concat_param_->input_num_ = input_num;
|
||||
concat_param_->input_num_ = static_cast<int>(input_num);
|
||||
concat_param_->input_shapes_ = reinterpret_cast<int **>(malloc(sizeof(int *) * input_num));
|
||||
if (concat_param_->input_shapes_ == nullptr) {
|
||||
MS_LOG(ERROR) << "malloc concat_param_->input_shapes_ failed.";
|
||||
|
@ -97,7 +98,7 @@ int ConcatInt8CPUKernel::ReSize() {
|
|||
memcpy(reinterpret_cast<void *>(concat_param_->output_shapes_), output_tensor->shape().data(),
|
||||
sizeof(int) * output_dim);
|
||||
|
||||
for (size_t i = concat_param_->axis_ + 1; i < output_dim; i++) {
|
||||
for (size_t i = static_cast<size_t>(concat_param_->axis_ + 1); i < output_dim; i++) {
|
||||
after_axis_size *= concat_param_->output_shapes_[i];
|
||||
}
|
||||
concat_param_->after_axis_size = after_axis_size;
|
||||
|
@ -122,21 +123,17 @@ int ConcatInt8CPUKernel::Run() {
|
|||
|
||||
int ConcatInt8Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
|
||||
auto concat = reinterpret_cast<ConcatInt8CPUKernel *>(cdata);
|
||||
auto ret = concat->DoExecute(task_id);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "ConcatInt8Run task_id " << task_id << " failed.";
|
||||
return ret;
|
||||
}
|
||||
concat->DoExecute(task_id);
|
||||
return lite::RET_OK;
|
||||
}
|
||||
|
||||
int ConcatInt8CPUKernel::DoExecute(int task_id) {
|
||||
void ConcatInt8CPUKernel::DoExecute(int task_id) {
|
||||
int64_t real_dst_count = MSMIN(before_axis_size - task_id * count_unit_, count_unit_);
|
||||
if (real_dst_count <= 0) {
|
||||
return lite::RET_OK;
|
||||
return;
|
||||
}
|
||||
Int8Concat(input_data_, output_data_, concat_param_, concat_param_->axis_, real_dst_count, task_id);
|
||||
return lite::RET_OK;
|
||||
return;
|
||||
}
|
||||
|
||||
REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_Concat, LiteKernelCreator<ConcatInt8CPUKernel>)
|
||||
|
|
|
@ -57,7 +57,7 @@ class ConcatInt8CPUKernel : public InnerKernel {
|
|||
int Init() override;
|
||||
int ReSize() override;
|
||||
int Run() override;
|
||||
int DoExecute(int task_id);
|
||||
void DoExecute(int task_id);
|
||||
|
||||
private:
|
||||
int64_t before_axis_size = 0;
|
||||
|
|
|
@@ -25,7 +25,7 @@ namespace mindspore::kernel {
 namespace {
 constexpr size_t kUnitBufferMultipler = 4 * 4;
 }  // namespace
-int ProcessFilterUint8(const int8_t *origin_weight, int16_t *dst_weight, ConvParameter *conv_param) {
+int ProcessFilterUint8(const int8_t *origin_weight, int16_t *dst_weight, const ConvParameter *conv_param) {
 auto input_channel = conv_param->input_channel_;
 auto output_channel = conv_param->output_channel_;
 auto kernel_plane = conv_param->kernel_w_ * conv_param->kernel_h_;

@@ -116,7 +116,7 @@ int Convolution3x3Int8CPUKernel::InitWeightBias() {
 memset(bias_data_, 0, new_bias_size);
 if (in_tensors_.size() == kInputSize2) {
 auto ori_bias_addr = reinterpret_cast<int32_t *>(in_tensors_.at(kBiasIndex)->MutableData());
-memcpy(bias_data_, ori_bias_addr, output_channel * sizeof(int32_t));
+memcpy(bias_data_, ori_bias_addr, static_cast<size_t>(output_channel) * sizeof(int32_t));
 } else {
 MS_ASSERT(in_tensors_.size() == kInputSize1);
 }

@@ -46,7 +46,7 @@ class Convolution3x3Int8CPUKernel : public ConvolutionBaseCPUKernel {
 int32_t *tmp_dst_buffer_ = nullptr;
 int8_t *tmp_out_ = nullptr;
 };
-int ProcessFilterUint8(const int8_t *origin_weight, int16_t *dst_weight, ConvParameter *conv_param);
+int ProcessFilterUint8(const int8_t *origin_weight, int16_t *dst_weight, const ConvParameter *conv_param);
 }  // namespace mindspore::kernel
 
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_3X3_INT8_H_
@@ -60,13 +60,13 @@ int ConvolutionDepthwise3x3Int8CPUKernel::InitWeightBias() {
 PackNCHWToNHWCInt8(origin_weight, tmp_weight, 1, weight_tensor->Height() * weight_tensor->Width(),
 weight_tensor->Batch());
 
-packed_weight_ = reinterpret_cast<int16_t *>(malloc(pack_weight_size * sizeof(int16_t)));
+packed_weight_ = reinterpret_cast<int16_t *>(malloc(static_cast<size_t>(pack_weight_size) * sizeof(int16_t)));
 if (packed_weight_ == nullptr) {
 MS_LOG(ERROR) << "Malloc buffer failed.";
 free(tmp_weight);
 return RET_ERROR;
 }
-bool filter_per_channel = conv_param_->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL;
+bool filter_per_channel = static_cast<bool>(conv_param_->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL);
 if (filter_per_channel) {
 for (int i = 0; i < weight_tensor->Height() * weight_tensor->Width(); i++) {
 for (int c = 0; c < channel; c++) {

@@ -87,16 +87,16 @@ int ConvolutionDepthwise3x3Int8CPUKernel::InitWeightBias() {
 }
 free(tmp_weight);
 
-bias_data_ = reinterpret_cast<int32_t *>(malloc(channel * sizeof(int32_t)));
+bias_data_ = reinterpret_cast<int32_t *>(malloc(static_cast<size_t>(channel) * sizeof(int32_t)));
 if (bias_data_ == nullptr) {
 MS_LOG(ERROR) << "Malloc buffer failed.";
 return RET_ERROR;
 }
-memset(bias_data_, 0, channel * sizeof(int32_t));
+memset(bias_data_, 0, static_cast<size_t>(channel) * sizeof(int32_t));
 if (in_tensors_.size() == kInputSize2) {
 auto bias_tensor = in_tensors_.at(kBiasIndex);
 auto ori_bias = reinterpret_cast<int32_t *>(bias_tensor->MutableData());
-memcpy(bias_data_, ori_bias, bias_tensor->ElementsNum() * sizeof(int32_t));
+memcpy(bias_data_, ori_bias, static_cast<size_t>(bias_tensor->ElementsNum()) * sizeof(int32_t));
 }
 return RET_OK;
 }

@@ -153,7 +153,8 @@ int ConvDw3x3Int8Run(void *cdata, int task_id, float lhs_scale, float rhs_scale)
 
 int ConvolutionDepthwise3x3Int8CPUKernel::InitBuffer() {
 int buffer_size = kConvDepthwise3x3BufferSize * conv_param_->thread_num_;
-buffer_ = reinterpret_cast<int8_t *>(ms_context_->allocator->Malloc(buffer_size * sizeof(int8_t)));
+buffer_ =
+reinterpret_cast<int8_t *>(ms_context_->allocator->Malloc(static_cast<size_t>(buffer_size) * sizeof(int8_t)));
 if (buffer_ == nullptr) {
 MS_LOG(ERROR) << "Malloc buffer failed.";
 return RET_ERROR;

@@ -55,7 +55,7 @@ int ConvolutionDepthwiseInt8CPUKernel::InitWeightBias() {
 return RET_ERROR;
 }
 
-bool filter_per_channel = conv_param_->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL;
+bool filter_per_channel = static_cast<bool>(conv_param_->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL);
 if (filter_per_channel) {
 for (int i = 0; i < weight_tensor->Height() * weight_tensor->Width(); i++) {
 for (int c = 0; c < channel; c++) {
@@ -42,7 +42,7 @@ int ConvolutionDepthwiseSWInt8CPUKernel::InitWeightBias() {
 auto origin_weight = reinterpret_cast<int8_t *>(weight_tensor->MutableData());
 int OC8 = UP_DIV(weight_tensor->Batch(), C8NUM);
 int pack_weight_size = C8NUM * OC8 * weight_tensor->Height() * weight_tensor->Width();
-packed_weight_ = reinterpret_cast<int16_t *>(malloc(pack_weight_size * sizeof(int16_t)));
+packed_weight_ = reinterpret_cast<int16_t *>(malloc(static_cast<size_t>(pack_weight_size) * sizeof(int16_t)));
 if (packed_weight_ == nullptr) {
 MS_LOG(ERROR) << "Malloc buffer failed.";
 return RET_ERROR;

@@ -50,16 +50,16 @@ int ConvolutionDepthwiseSWInt8CPUKernel::InitWeightBias() {
 PackDepthwiseInt8Weight(origin_weight, packed_weight_, weight_tensor->Height() * weight_tensor->Width(),
 weight_tensor->Batch(), &(conv_param_->conv_quant_arg_));
 
-bias_data_ = reinterpret_cast<int32_t *>(malloc(C8NUM * OC8 * sizeof(int32_t)));
+bias_data_ = reinterpret_cast<int32_t *>(malloc(static_cast<size_t>(C8NUM * OC8) * sizeof(int32_t)));
 if (bias_data_ == nullptr) {
 MS_LOG(ERROR) << "Malloc buffer failed.";
 return RET_ERROR;
 }
-memset(bias_data_, 0, C8NUM * OC8 * sizeof(int32_t));
+memset(bias_data_, 0, static_cast<size_t>(C8NUM * OC8) * sizeof(int32_t));
 if (in_tensors_.size() == kInputSize2) {
 auto bias_tensor = in_tensors_.at(kBiasIndex);
 auto ori_bias = reinterpret_cast<int32_t *>(bias_tensor->MutableData());
-memcpy(bias_data_, ori_bias, bias_tensor->ElementsNum() * sizeof(int32_t));
+memcpy(bias_data_, ori_bias, static_cast<size_t>(bias_tensor->ElementsNum()) * sizeof(int32_t));
 }
 
 conv_param_->thread_num_ = MSMIN(thread_count_, OC8);

@@ -72,7 +72,8 @@ int ConvolutionDepthwiseSWInt8CPUKernel::InitPackedInputOutput() {
 
 int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C8NUM *
 UP_DIV(conv_param_->input_channel_, C8NUM);
-packed_input_ = reinterpret_cast<int8_t *>(ms_context_->allocator->Malloc(pack_input_size * sizeof(int8_t)));
+packed_input_ =
+reinterpret_cast<int8_t *>(ms_context_->allocator->Malloc(static_cast<size_t>(pack_input_size) * sizeof(int8_t)));
 if (packed_input_ == nullptr) {
 MS_LOG(ERROR) << "Malloc buffer failed.";
 return RET_ERROR;

@@ -80,7 +81,8 @@ int ConvolutionDepthwiseSWInt8CPUKernel::InitPackedInputOutput() {
 
 int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C8NUM *
 UP_DIV(conv_param_->output_channel_, C8NUM);
-packed_output_ = reinterpret_cast<int8_t *>(ms_context_->allocator->Malloc(pack_output_size * sizeof(int8_t)));
+packed_output_ = reinterpret_cast<int8_t *>(
+ms_context_->allocator->Malloc(static_cast<size_t>(pack_output_size) * sizeof(int8_t)));
 if (packed_output_ == nullptr) {
 MS_LOG(ERROR) << "Malloc buffer failed.";
 return RET_ERROR;

@@ -150,10 +152,10 @@ int ConvolutionDepthwiseSWInt8CPUKernel::ReinitQuantParam() {
 
 auto input_tensor = in_tensors_.at(kInputIndex);
 auto channel = conv_param_->input_channel_;
-input_scale_ = reinterpret_cast<float *>(malloc(channel * sizeof(float)));
+input_scale_ = reinterpret_cast<float *>(malloc(static_cast<size_t>(channel) * sizeof(float)));
 MSLITE_CHECK_PTR(input_scale_);
 
-input_zp_ = reinterpret_cast<int8_t *>(malloc(channel * sizeof(int8_t)));
+input_zp_ = reinterpret_cast<int8_t *>(malloc(static_cast<size_t>(channel) * sizeof(int8_t)));
 MSLITE_CHECK_PTR(input_zp_);
 
 if (input_tensor->quant_params().size() == kPerTensor) {

@@ -171,10 +173,10 @@ int ConvolutionDepthwiseSWInt8CPUKernel::ReinitQuantParam() {
 }
 
 auto output_tensor = out_tensors_.at(kOutputIndex);
-output_scale_ = reinterpret_cast<float *>(malloc(channel * sizeof(float)));
+output_scale_ = reinterpret_cast<float *>(malloc(static_cast<size_t>(channel) * sizeof(float)));
 MSLITE_CHECK_PTR(output_scale_);
 
-output_zp_ = reinterpret_cast<int32_t *>(malloc(channel * sizeof(int32_t)));
+output_zp_ = reinterpret_cast<int32_t *>(malloc(static_cast<size_t>(channel) * sizeof(int32_t)));
 MSLITE_CHECK_PTR(output_zp_);
 
 if (output_tensor->quant_params().size() == kPerTensor) {

@@ -191,25 +193,26 @@ int ConvolutionDepthwiseSWInt8CPUKernel::ReinitQuantParam() {
 }
 }
 
-conv_quant_arg_->real_multiplier_ = reinterpret_cast<double *>(malloc(channel * sizeof(double)));
+conv_quant_arg_->real_multiplier_ = reinterpret_cast<double *>(malloc(static_cast<size_t>(channel) * sizeof(double)));
 MSLITE_CHECK_PTR(conv_quant_arg_->real_multiplier_);
 
-conv_quant_arg_->left_shift_ = reinterpret_cast<int32_t *>(malloc(channel * sizeof(int32_t)));
+conv_quant_arg_->left_shift_ = reinterpret_cast<int32_t *>(malloc(static_cast<size_t>(channel) * sizeof(int32_t)));
 MSLITE_CHECK_PTR(conv_quant_arg_->left_shift_);
 
-conv_quant_arg_->right_shift_ = reinterpret_cast<int32_t *>(malloc(channel * sizeof(int32_t)));
+conv_quant_arg_->right_shift_ = reinterpret_cast<int32_t *>(malloc(static_cast<size_t>(channel) * sizeof(int32_t)));
 MSLITE_CHECK_PTR(conv_quant_arg_->right_shift_);
 
-conv_quant_arg_->quant_multiplier_ = reinterpret_cast<int32_t *>(malloc(channel * sizeof(int32_t)));
+conv_quant_arg_->quant_multiplier_ =
+reinterpret_cast<int32_t *>(malloc(static_cast<size_t>(channel) * sizeof(int32_t)));
 MSLITE_CHECK_PTR(conv_quant_arg_->quant_multiplier_);
 
-conv_quant_arg_->out_act_min_ = reinterpret_cast<int32_t *>(malloc(channel * sizeof(int32_t)));
+conv_quant_arg_->out_act_min_ = reinterpret_cast<int32_t *>(malloc(static_cast<size_t>(channel) * sizeof(int32_t)));
 MSLITE_CHECK_PTR(conv_quant_arg_->out_act_min_);
 
-conv_quant_arg_->out_act_max_ = reinterpret_cast<int32_t *>(malloc(channel * sizeof(int32_t)));
+conv_quant_arg_->out_act_max_ = reinterpret_cast<int32_t *>(malloc(static_cast<size_t>(channel) * sizeof(int32_t)));
 MSLITE_CHECK_PTR(conv_quant_arg_->out_act_max_);
 
-weight_scale_ = reinterpret_cast<float *>(malloc(channel * sizeof(float)));
+weight_scale_ = reinterpret_cast<float *>(malloc(static_cast<size_t>(channel) * sizeof(float)));
 MSLITE_CHECK_PTR(weight_scale_);
 
 auto weight_tensor = in_tensors_.at(kWeightIndex);
@@ -98,12 +98,12 @@ int ConvolutionInt8CPUKernel::InitWeightBias() {
 memset(bias_data_, 0, bias_size);
 if (in_tensors_.size() == kInputSize2) {
 auto ori_bias = reinterpret_cast<int32_t *>(in_tensors_.at(kBiasIndex)->data_c());
-memcpy(bias_data_, ori_bias, output_channel * sizeof(int32_t));
+memcpy(bias_data_, ori_bias, static_cast<size_t>(output_channel) * sizeof(int32_t));
 } else {
 MS_ASSERT(in_tensors_.size() == kInputSize1);
 }
 auto *bias_data = reinterpret_cast<int32_t *>(bias_data_);
-bool filter_peroc = conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL;
+bool filter_peroc = static_cast<bool>(conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL);
 if (filter_peroc) {
 filter_zp_ptr_ = reinterpret_cast<int32_t *>(malloc(output_channel * sizeof(int32_t)));
 if (filter_zp_ptr_ == nullptr) {

@@ -126,9 +126,9 @@ int ConvolutionInt8CPUKernel::InitWeightBias() {
 
 size_t input_sum_size;
 if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) {
-input_sum_size = up_round_oc * tile_num_ * thread_count_ * sizeof(int32_t);
+input_sum_size = static_cast<size_t>(up_round_oc * tile_num_ * thread_count_) * sizeof(int32_t);
 } else {
-input_sum_size = tile_num_ * thread_count_ * sizeof(int32_t);
+input_sum_size = static_cast<size_t>(tile_num_ * thread_count_) * sizeof(int32_t);
 }
 input_sum_ = reinterpret_cast<int32_t *>(malloc(input_sum_size));
 if (input_sum_ == nullptr) {
@@ -57,21 +57,16 @@ int CropInt8CPUKernel::Run() {

int CropInt8Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto crop = reinterpret_cast<CropInt8CPUKernel *>(cdata);
auto ret = crop->DoExecute(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "CropInt8Run task id " << task_id << " run failed.";
return ret;
}
crop->DoExecute(task_id);
return RET_OK;
}

int CropInt8CPUKernel::DoExecute(int task_id) {
void CropInt8CPUKernel::DoExecute(int task_id) {
auto input_tensor = in_tensors_.at(kInputIndex);
auto out_tensor = out_tensors_.at(kOutputIndex);
int8_t *input_data = reinterpret_cast<int8_t *>(input_tensor->data_c());
int8_t *output_data = reinterpret_cast<int8_t *>(out_tensor->data_c());
Int8Crop(input_data, output_data, task_id, crop_para_);
return RET_OK;
}

REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_Crop, LiteKernelCreator<CropInt8CPUKernel>)
@@ -36,7 +36,7 @@ class CropInt8CPUKernel : public CropBaseCPUKernel {
int Init() override;
int ReSize() override;
int Run() override;
int DoExecute(int task_id);
void DoExecute(int task_id);
};

int CropInt8Run(void *cdata, int task_id, float lhs_scale, float rhs_scale);
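
The Crop change above illustrates the second recurring refactor in this commit: a per-task body that cannot fail is changed to return void, and the thread-pool callback keeps the int-returning signature and simply reports RET_OK. A simplified standalone sketch of that callback shape, with made-up stand-in names (CropLikeKernel, CropLikeRun) rather than the real kernel classes:

#include <iostream>

namespace {
constexpr int RET_OK = 0;  // stand-in for the runtime's RET_OK

// Stand-in for a kernel whose per-task body cannot fail, mirroring the new
// `void DoExecute(int task_id)` signature.
class CropLikeKernel {
 public:
  void DoExecute(int task_id) { std::cout << "task " << task_id << " done\n"; }
};

// Keeps the int(void *, int, float, float) shape a thread pool expects, but
// there is no longer a task-level error to forward.
int CropLikeRun(void *cdata, int task_id, float /*lhs_scale*/, float /*rhs_scale*/) {
  auto *kernel = static_cast<CropLikeKernel *>(cdata);
  kernel->DoExecute(task_id);
  return RET_OK;
}
}  // namespace

int main() {
  CropLikeKernel kernel;
  // A real runtime would hand CropLikeRun to its thread pool; here it is just
  // called directly for two task ids.
  CropLikeRun(&kernel, 0, 1.0f, 1.0f);
  CropLikeRun(&kernel, 1, 1.0f, 1.0f);
  return 0;
}
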
@@ -57,7 +57,7 @@ int GatherNdInt8CPUKernel::ReSize() {

auto indices_tensor = in_tensors_.at(1);
auto indices_shape = indices_tensor->shape();
int indices_rank = indices_shape.size();
int indices_rank = static_cast<size_t>(indices_shape.size());
count_ = 1;
for (int i = 0; i < indices_rank - 1; ++i) {
count_ *= indices_shape[i];
@@ -66,12 +66,12 @@ int GatherNdInt8CPUKernel::ReSize() {
MS_LOG(ERROR) << "count_ is invalid, count_: " << count_;
return RET_ERROR;
}
in_offset_ = reinterpret_cast<int *>(malloc(count_ * sizeof(int)));
in_offset_ = reinterpret_cast<int *>(malloc(static_cast<size_t>(count_) * sizeof(int)));
if (in_offset_ == nullptr) {
MS_LOG(ERROR) << "GatherNdInt8 Malloc in_offset_ error!";
return RET_ERROR;
}
(void)memset(in_offset_, 0, count_ * sizeof(int));
(void)memset(in_offset_, 0, static_cast<size_t>(count_) * sizeof(int));
thread_sz_count_ = MSMIN(thread_count_, count_);
if (thread_sz_count_ == 0) {
MS_LOG(ERROR) << "div zero";
@@ -85,9 +85,9 @@ int GatherNdInt8CPUKernel::InitOffset() {
auto ind_quant_args = in_tensors_.at(1)->quant_params();
auto indices_tensor = in_tensors_.at(1);
auto indices_shape = indices_tensor->shape();
int indices_rank = indices_shape.size();
int indices_rank = static_cast<size_t>(indices_shape.size());
auto in_shape = in_tensors_.front()->shape();
int in_rank = in_shape.size();
int in_rank = static_cast<size_t>(in_shape.size());
if (indices_rank < 1) {
MS_LOG(ERROR) << "inex out of bounds";
return RET_ERROR;
@@ -44,7 +44,7 @@ class GatherNdInt8CPUKernel : public InnerKernel {
int *in_offset_ = nullptr;
int8_t *in_ptr_ = nullptr;
int8_t *out_ptr_ = nullptr;
GatherQuantArg param_;
GatherQuantArg param_ = {};
};
} // namespace mindspore::kernel
@@ -29,7 +29,7 @@ int GroupConvolutionInt8CPUKernel::SeparateInput(int group_id) {
int8_t *src_ptr = reinterpret_cast<int8_t *>(ori_in_data_) + group_id * sub_in_channel;
int8_t *dst_ptr = sub_in_data;
for (int i = 0; i < in_plane; ++i) {
memcpy(dst_ptr, src_ptr, sub_in_channel * sizeof(int8_t));
memcpy(dst_ptr, src_ptr, static_cast<size_t>(sub_in_channel) * sizeof(int8_t));
src_ptr += ori_in_channel;
dst_ptr += sub_in_channel;
}
@@ -45,7 +45,7 @@ int GroupConvolutionInt8CPUKernel::PostConcat(int group_id) {
int8_t *src_ptr = sub_out_data;
int8_t *dst_ptr = reinterpret_cast<int8_t *>(ori_out_data_) + group_id * sub_out_channel;
for (int i = 0; i < out_plane; ++i) {
memcpy(dst_ptr, src_ptr, sub_out_channel * sizeof(int8_t));
memcpy(dst_ptr, src_ptr, static_cast<size_t>(sub_out_channel) * sizeof(int8_t));
src_ptr += sub_out_channel;
dst_ptr += ori_out_channel;
}
@@ -37,7 +37,7 @@ class HswishInt8CPUKernel : public InnerKernel {

private:
int thread_count_;
HswishQuantArg quant_arg_;
HswishQuantArg quant_arg_ = {};
void MultiplierInt32ToInt16(int32_t input, int16_t *output) const;
};
} // namespace mindspore::kernel
@@ -39,7 +39,7 @@ class LeakyReluInt8CPUKernel : public InnerKernel {
int DoExecute(int task_id);

private:
LeakyReluQuantArg quant_prelu_parm_;
LeakyReluQuantArg quant_prelu_parm_ = {};
};
} // namespace mindspore::kernel
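
The header hunks above add `= {}` to quant-arg members so the structs are value-initialized instead of holding indeterminate values until an Init path fills them. A small standalone sketch of the effect, using a made-up FakeQuantArg rather than the real HswishQuantArg/LeakyReluQuantArg types:

#include <cstdint>
#include <iostream>

// Made-up stand-in for a quant-arg POD such as HswishQuantArg.
struct FakeQuantArg {
  float scale;
  int32_t zero_point;
};

class FakeKernel {
 public:
  void Dump() const {
    std::cout << quant_arg_.scale << " " << quant_arg_.zero_point << "\n";
  }

 private:
  // Without the `= {}`, scale and zero_point would hold indeterminate values
  // if no Init path assigned them; value-initialization zeroes every field.
  FakeQuantArg quant_arg_ = {};
};

int main() {
  FakeKernel kernel;
  kernel.Dump();  // prints "0 0" deterministically
  return 0;
}
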
@@ -187,29 +187,21 @@ int MulInt8CPUKernel::Run() {

int FastHWBroadcastMulInt8Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto mul = reinterpret_cast<MulInt8CPUKernel *>(cdata);
auto ret = mul->FastDoExecute(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "FastHWBroadcastMulInt8Run task_id " << task_id << " failed.";
return ret;
}
mul->FastDoExecute(task_id);
return lite::RET_OK;
}

int MulInt8Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto mul = reinterpret_cast<MulInt8CPUKernel *>(cdata);
auto ret = mul->DoExecute(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "MulInt8Run task_id " << task_id << " failed.";
return ret;
}
mul->DoExecute(task_id);
return lite::RET_OK;
}

int MulInt8CPUKernel::FastDoExecute(int task_id) {
void MulInt8CPUKernel::FastDoExecute(int task_id) {
int depth = out_tensors_.front()->Channel();
int64_t real_dst_count = MSMIN(elements_num_ - task_id * count_unit_, count_unit_);
if (real_dst_count <= 0) {
return lite::RET_OK;
return;
}
int8_t *cur_input0_data = input0_data_;
int8_t *cur_input1_data = input1_data_ + task_id * count_unit_ * depth;
@@ -219,20 +211,19 @@ int MulInt8CPUKernel::FastDoExecute(int task_id) {
cur_input1_data = input0_data_ + task_id * count_unit_ * depth;
}
FastMul(cur_input0_data, cur_input1_data, cur_output_data, depth, real_dst_count, input1_hw_broadcast_, quant_args_);
return RET_OK;
}

int MulInt8CPUKernel::DoExecute(int task_id) {
void MulInt8CPUKernel::DoExecute(int task_id) {
int64_t real_dst_count = MSMIN(elements_num_ - task_id * count_unit_, count_unit_);
if (real_dst_count <= 0) {
return lite::RET_OK;
return;
}
int8_t *cur_input0_data = input0_data_ + task_id * count_unit_;
int8_t *cur_input1_data = input1_data_ + task_id * count_unit_;
int8_t *cur_output_data = output_data_ + task_id * count_unit_;

Mul(cur_input0_data, cur_input1_data, cur_output_data, real_dst_count, quant_args_);
return lite::RET_OK;
return;
}

REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_MulFusion, LiteKernelCreator<MulInt8CPUKernel>)
@@ -39,8 +39,8 @@ class MulInt8CPUKernel : public InnerKernel {
void CheckSameShapeSize(std::vector<int> in_tensor0_shape, std::vector<int> in_tensor1_shape);
void CheckIfFastImpl();
int Run() override;
int DoExecute(int task_id);
int FastDoExecute(int task_id);
void DoExecute(int task_id);
void FastDoExecute(int task_id);

private:
const lite::InnerContext *ctx_ = nullptr;
@@ -30,16 +30,17 @@ void MatMulR4Int8_optimize_handler(const int8_t *a, const int8_t *b, int *dst, i
}

void MatMulRInt8_optimize_handler(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4,
size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini,
int32_t maxi, size_t per_channel) {
size_t stride, const int32_t *input_sum, const int32_t *bias,
const int32_t *left_shift, const int32_t *right_shift, const int32_t *multiplier,
int32_t output_zp, int32_t mini, int32_t maxi, size_t per_channel) {
return MatmulInt8DpNeon64(a, b, dst, UP_ROUND(row, C8NUM), UP_ROUND(col, C8NUM), deep_4, input_sum, bias, mini, maxi,
output_zp, multiplier, left_shift, right_shift, row, col, stride, per_channel);
}
void MatMulDpInt8_optimize_handler(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4,
size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini,
int32_t maxi, size_t per_channel, int32_t *filter_zp) {
size_t stride, const int32_t *input_sum, const int32_t *bias,
const int32_t *left_shift, const int32_t *right_shift, const int32_t *multiplier,
int32_t output_zp, int32_t mini, int32_t maxi, size_t per_channel,
const int32_t *filter_zp) {
return MatmulInt8DpOpt(a, b, dst, row, col, deep_4, input_sum, bias, mini, maxi, output_zp, multiplier, left_shift,
right_shift, stride, per_channel, filter_zp);
}
@@ -25,11 +25,11 @@ extern "C" {
void MatMulOptR4Int8Neon64(const int8_t *a, const int8_t *b, int *dst, int row4, int col4, int deep16,
const int *input_sum, const int *bias);
void MatmulInt8DpNeon64(const int8_t *a, const int8_t *b, int8_t *dst, int row8, int col8, int deep4, const int *a_sums,
const int *bias, int act_min, int act_max, int out_zp, int *multiplier, int *left_shift,
int *right_shift, int row, int col, int stride, size_t peroc);
const int *bias, int act_min, int act_max, int out_zp, const int *multiplier,
const int *left_shift, const int *right_shift, int row, int col, int stride, size_t peroc);
void MatmulInt8DpOpt(const int8_t *a, const int8_t *b, int8_t *dst, size_t row8, size_t col8, size_t deep4,
const int *a_sums, const int *bias, int act_min, int act_max, int out_zp, int *multiplier,
int *left_shift, int *right_shift, size_t stride, size_t peroc, int *filter_zp);
const int *a_sums, const int *bias, int act_min, int act_max, int out_zp, const int *multiplier,
const int *left_shift, const int *right_shift, size_t stride, size_t peroc, const int *filter_zp);
#ifdef ENABLE_ARM64
void IndirectGemmInt8_optimize_handler(int8_t *dst, const int8_t *src, const int8_t *weight, const int32_t *bias,
size_t ksize, size_t ic4, size_t output_channel, size_t offset,
|
|||
const int *input_sum, const int *bias);
|
||||
|
||||
void MatMulRInt8_optimize_handler(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4,
|
||||
size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
|
||||
int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini,
|
||||
int32_t maxi, size_t per_channel);
|
||||
size_t stride, const int32_t *input_sum, const int32_t *bias,
|
||||
const int32_t *left_shift, const int32_t *right_shift, const int32_t *multiplier,
|
||||
int32_t output_zp, int32_t mini, int32_t maxi, size_t per_channel);
|
||||
void MatMulDpInt8_optimize_handler(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4,
|
||||
size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
|
||||
int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini,
|
||||
int32_t maxi, size_t per_channel, int32_t *filter_zp);
|
||||
size_t stride, const int32_t *input_sum, const int32_t *bias,
|
||||
const int32_t *left_shift, const int32_t *right_shift, const int32_t *multiplier,
|
||||
int32_t output_zp, int32_t mini, int32_t maxi, size_t per_channel,
|
||||
const int32_t *filter_zp);
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
|
|
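
The matmul changes above are const-correctness fixes: the multiplier and shift tables are only read, so both the assembly prototypes and the C wrappers now take them as const pointers, which also lets callers pass pointers to const data without a cast. A rough standalone sketch of the idea, with a made-up ApplyPerChannel helper that only hints at the real fixed-point math:

#include <cstdint>
#include <cstdio>

// Made-up helper: the quantization tables are only read, so taking them as
// const pointers documents that and allows const input arrays. The body only
// hints at the per-channel rescaling; the real kernels use rounding
// fixed-point multiplies.
int32_t ApplyPerChannel(const int32_t *multiplier, const int32_t *left_shift,
                        const int32_t *right_shift, int channel, int32_t acc) {
  acc <<= left_shift[channel];
  acc *= multiplier[channel];
  acc >>= right_shift[channel];
  return acc;
}

int main() {
  const int32_t multiplier[] = {3, 5};
  const int32_t left_shift[] = {1, 0};
  const int32_t right_shift[] = {2, 1};
  // With const parameters these const arrays can be passed without a cast.
  printf("%d\n", ApplyPerChannel(multiplier, left_shift, right_shift, 0, 8));
  return 0;
}
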
@@ -85,7 +85,7 @@ int PadInt8CPUKernel::SetQuantParam() {
int PadInt8CPUKernel::InitPadParam() {
auto in_dims = in_tensors_.at(0)->shape();
auto out_dims = out_tensors_.at(0)->shape();
int ndims = in_dims.size();
int ndims = static_cast<size_t>(in_dims.size());

int in[] = {1, 1, 1, 1};
int out[] = {1, 1, 1, 1};
@@ -267,7 +267,8 @@ int PadInt8CPUKernel::Run() {

int error_code;
if (pad_param_->pad_mode_ == static_cast<int>(schema::PaddingMode_CONSTANT)) {
memset(out_data_, pad_param_->pad_quant_arg_.constant_value_[0], out_tensors_[0]->ElementsNum() * sizeof(int8_t));
memset(out_data_, pad_param_->pad_quant_arg_.constant_value_[0],
static_cast<size_t>(out_tensors_[0]->ElementsNum()) * sizeof(int8_t));
error_code = ParallelLaunch(this->ms_context_, PadInt8Impl, this, op_parameter_->thread_num_);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "Resize run error, error_code[" << error_code << "]";
@@ -93,7 +93,7 @@ class ReduceInt8CPUKernel : public ReduceBaseCPUKernel {
bool valid_shape_ = false;
bool pattern_impl_ = false;
Four_DIMENSION_REDUCE_TEMPLATE pattern_;
QuantMulArg reduce_mean_quant_param_; // used in reduce mean 4D situation
QuantMulArg reduce_mean_quant_param_ = {}; // used in reduce mean 4D situation
Reducer reducer_ = nullptr;
LastReducer last_reducer_ = nullptr;
std::vector<QuantMulArg *> mean_multipliers_;
@@ -37,7 +37,7 @@ class ReluXInt8CPUKernel : public InnerKernel {
int Run() override;
int DoActivation(int task_id);

ReluXQuantArg quant_arg_;
ReluXQuantArg quant_arg_ = {};

private:
int type_{0};
@@ -63,18 +63,14 @@ int ReshapeInt8CPUKernel::Run() {

int ReshapeInt8Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto reshape = reinterpret_cast<ReshapeInt8CPUKernel *>(cdata);
auto ret = reshape->DoExecute(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "Reshapeint8Run task_id " << task_id << " failed.";
return ret;
}
reshape->DoExecute(task_id);
return lite::RET_OK;
}

int ReshapeInt8CPUKernel::DoExecute(int task_id) {
void ReshapeInt8CPUKernel::DoExecute(int task_id) {
int64_t real_dst_count = MSMIN(elements_num_ - task_id * count_unit_, count_unit_);
if (real_dst_count <= 0) {
return lite::RET_OK;
return;
}
MS_ASSERT(input_data_);
MS_ASSERT(output_data_);
@@ -82,7 +78,7 @@ int ReshapeInt8CPUKernel::DoExecute(int task_id) {
int8_t *cur_output_data = output_data_ + task_id * count_unit_;

Int8Reshape(cur_input0_data, cur_output_data, real_dst_count, reshape_param_->quant_para_);
return lite::RET_OK;
return;
}

REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_Reshape, LiteKernelCreator<ReshapeInt8CPUKernel>)
@@ -37,7 +37,7 @@ class ReshapeInt8CPUKernel : public InnerKernel {
int Init() override;
int ReSize() override;
int Run() override;
int DoExecute(int task_id);
void DoExecute(int task_id);

private:
int64_t elements_num_ = 0;
@@ -37,20 +37,32 @@ constexpr unsigned int OFFSET_BASE = 10;
} // namespace
void ResizeInt8CPUKernel::FreeResizeBiLinear() {
free(resize_quant_arg_.x_axis_index_);
resize_quant_arg_.x_axis_index_ = nullptr;
free(resize_quant_arg_.x_axis_lower_);
resize_quant_arg_.x_axis_lower_ = nullptr;
free(resize_quant_arg_.x_axis_upper_);
resize_quant_arg_.x_axis_upper_ = nullptr;
free(resize_quant_arg_.y_axis_index_);
resize_quant_arg_.y_axis_index_ = nullptr;
free(resize_quant_arg_.y_axis_lower_);
resize_quant_arg_.y_axis_lower_ = nullptr;
free(resize_quant_arg_.y_axis_upper_);
resize_quant_arg_.y_axis_upper_ = nullptr;
}

void ResizeInt8CPUKernel::FreeFloatResizeBiLinear() {
free(resize_float_quant_arg_.x_axis_index_);
resize_float_quant_arg_.x_axis_index_ = nullptr;
free(resize_float_quant_arg_.x_axis_lower_);
resize_float_quant_arg_.x_axis_lower_ = nullptr;
free(resize_float_quant_arg_.x_axis_upper_);
resize_float_quant_arg_.x_axis_upper_ = nullptr;
free(resize_float_quant_arg_.y_axis_index_);
resize_float_quant_arg_.y_axis_index_ = nullptr;
free(resize_float_quant_arg_.y_axis_lower_);
resize_float_quant_arg_.y_axis_lower_ = nullptr;
free(resize_float_quant_arg_.y_axis_upper_);
resize_float_quant_arg_.y_axis_upper_ = nullptr;
}

ResizeInt8CPUKernel::~ResizeInt8CPUKernel() {
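
The new FreeResizeBiLinear/FreeFloatResizeBiLinear bodies above reset every pointer to nullptr immediately after freeing it. A small standalone sketch of why that helps, using made-up FakeResizeArg/FreeFakeResizeArg names instead of the kernel's types (free(nullptr) is a no-op, so a repeated cleanup call cannot double-free):

#include <cstdint>
#include <cstdlib>

// Made-up stand-in for ResizeQuantArg: two heap-allocated lookup tables.
struct FakeResizeArg {
  int32_t *x_axis_index_ = nullptr;
  int32_t *y_axis_index_ = nullptr;
};

// Mirrors the free-then-null pattern: once every pointer is reset, calling
// this helper again (for example from both an error path and a destructor)
// frees nothing twice.
void FreeFakeResizeArg(FakeResizeArg *arg) {
  free(arg->x_axis_index_);
  arg->x_axis_index_ = nullptr;
  free(arg->y_axis_index_);
  arg->y_axis_index_ = nullptr;
}

int main() {
  FakeResizeArg arg;
  arg.x_axis_index_ = static_cast<int32_t *>(malloc(8 * sizeof(int32_t)));
  arg.y_axis_index_ = static_cast<int32_t *>(malloc(8 * sizeof(int32_t)));
  FreeFakeResizeArg(&arg);
  FreeFakeResizeArg(&arg);  // safe: every pointer is already nullptr
  return 0;
}
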
@@ -52,8 +52,8 @@ class ResizeInt8CPUKernel : public ResizeBaseCPUKernel {
QuantArg *quant_in_{nullptr};
QuantArg *quant_out_{nullptr};
QuantMulArg *multiplier_{nullptr};
ResizeQuantArg resize_quant_arg_;
ResizeFloatScaleQuantArg resize_float_quant_arg_;
ResizeQuantArg resize_quant_arg_ = {};
ResizeFloatScaleQuantArg resize_float_quant_arg_ = {};
};
} // namespace mindspore::kernel
@@ -64,7 +64,7 @@ int SqueezeInt8CPUKernel::Init() {
auto quant_params = output_tensor->quant_params();
MS_ASSERT(quant_params.size() == 1);
quant_squeeze_param_->out_quant_args_ = reinterpret_cast<QuantArg *>(malloc(sizeof(QuantArg)));
if (quant_squeeze_param_->in_quant_args_ == nullptr) {
if (quant_squeeze_param_->out_quant_args_ == nullptr) {
MS_LOG(ERROR) << "malloc QuantArg failed";
if (quant_squeeze_param_ != nullptr) {
if (quant_squeeze_param_->in_quant_args_ != nullptr) {
@@ -97,15 +97,11 @@ int SqueezeInt8CPUKernel::Run() {

int SqueezeInt8Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto Squeeze = reinterpret_cast<SqueezeInt8CPUKernel *>(cdata);
auto ret = Squeeze->DoExecute(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "SqueezeInt8Run task_id " << task_id << " failed.";
return ret;
}
Squeeze->DoExecute(task_id);
return RET_OK;
}

int SqueezeInt8CPUKernel::DoExecute(int task_id) {
void SqueezeInt8CPUKernel::DoExecute(int task_id) {
auto input_tensor = in_tensors_.at(kInputIndex);
MS_ASSERT(input_tensor);
auto out_tensor = out_tensors_.at(kOutputIndex);
@@ -117,7 +113,6 @@ int SqueezeInt8CPUKernel::DoExecute(int task_id) {

int num = input_tensor->ElementsNum();
SqueezeInt8(input_data, output_data, quant_squeeze_param_, num, task_id, op_parameter_->thread_num_);
return RET_OK;
}

REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_Squeeze, LiteKernelCreator<SqueezeInt8CPUKernel>)
@@ -36,7 +36,7 @@ class SqueezeInt8CPUKernel : public InnerKernel {
int Init() override;
int ReSize() override;
int Run() override;
int DoExecute(int tId);
void DoExecute(int tId);

private:
SqueezeQuantArg *quant_squeeze_param_{nullptr};
@@ -46,7 +46,7 @@ class TanhInt8CPUKernel : public InnerKernel {
int element_size_{0};
int thread_count_{0};
int thread_stride_{0};
TanhQuantParameter tanh_quant_;
TanhQuantParameter tanh_quant_ = {};
};
} // namespace mindspore::kernel
@@ -79,7 +79,7 @@ int TransposeInt8CPUKernel::DoTranspose(int task_id) {
return RET_OK;
}

void TransposeInt8CPUKernel::GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor,
void TransposeInt8CPUKernel::GetNHNCTransposeFunc(const lite::Tensor *in_tensor, const lite::Tensor *out_tensor,
const TransposeParameter *param) {
auto out_shape = out_tensor->shape();
if (in_tensor->shape().size() == DIMENSION_4D && param->perm_[0] == 0 && param->perm_[1] == 2 &&
@@ -44,7 +44,8 @@ class TransposeInt8CPUKernel : public InnerKernel {
int DoTranspose(int task_id);

private:
void GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor, const TransposeParameter *param);
void GetNHNCTransposeFunc(const lite::Tensor *in_tensor, const lite::Tensor *out_tensor,
const TransposeParameter *param);
TransposeParameter *transpose_param_;
TransposeFunc NHNCTransposeFunc_ = nullptr;
int8_t *in_ptr_ = nullptr;