!4896 [MS][LITE][Develop] int8 conv op
Merge pull request !4896 from ling/conv1x1
commit 9dd4ab0e3e
@@ -367,6 +367,26 @@ void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight
  }
}

void Conv1x1Int8(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum,
                 const int32_t *bias, int row, int col, int deep16, ConvParameter *conv_param,
                 MATMUL_OPT_R_FUNC matmul_func) {
  if (matmul_func != NULL) {
    matmul_func(packed_input, packed_weight, dst, row, col, deep16, conv_param->output_channel_, input_sum, bias,
                conv_param->conv_quant_arg_.left_shift_, conv_param->conv_quant_arg_.right_shift_,
                conv_param->conv_quant_arg_.quant_multiplier_, conv_param->conv_quant_arg_.output_quant_args_[0].zp_,
                conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0],
                (conv_param->conv_quant_arg_.filter_arg_num_ > 1));
  } else {
    MatMulInt8_16x4_r(packed_input, packed_weight, dst, row, col, deep16, conv_param->output_channel_, input_sum, bias,
                      conv_param->conv_quant_arg_.left_shift_, conv_param->conv_quant_arg_.right_shift_,
                      conv_param->conv_quant_arg_.quant_multiplier_,
                      conv_param->conv_quant_arg_.output_quant_args_[0].zp_,
                      conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0],
                      (conv_param->conv_quant_arg_.filter_arg_num_ > 1));
  }
  return;
}

// int8 convolution 3x3
void Conv3x3Int8(int16_t *input_data, int16_t *transed_weight, const int32_t *bias_data, int8_t *output_data,
                 int16_t *tile_buffer, int16_t *block_unit_buffer, int32_t *tmp_dst_buffer, int8_t *tmp_out,
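Both branches of the wrapper satisfy the MATMUL_OPT_R_FUNC contract, and the final boolean argument switches the callee into per-channel requantization whenever the filter carries more than one set of quant parameters. A minimal caller sketch (the loader name is hypothetical; passing NULL forces the portable MatMulInt8_16x4_r path):

/* Hypothetical caller of Conv1x1Int8: matmul may be a runtime-loaded assembly kernel or NULL. */
MATMUL_OPT_R_FUNC matmul = NULL;
#ifdef ENABLE_ARM64
matmul = (MATMUL_OPT_R_FUNC)LoadOptimizedMatMul(); /* hypothetical dlsym-style lookup, may stay NULL */
#endif
Conv1x1Int8(packed_input, packed_weight, dst, input_sum, bias, row, col, deep16, conv_param, matmul);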
@@ -25,6 +25,8 @@
#include "nnacl/conv_parameter.h"
#include "nnacl/winograd_utils.h"
#include "nnacl/quantization/quantize.h"
#include "nnacl/matmul_parameter.h"
#include "nnacl/int8/matmul_int8.h"

typedef void (*GEMM_FUNC)(int8_t *dst, const int8_t *src, const int8_t *weight, const int32_t *bias, size_t ksize,
                          size_t ic4, size_t output_channel, size_t offset, const int32_t *input_sum, size_t act_min,
@@ -51,6 +53,11 @@ void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight
                 int32_t *tmp_dst, int8_t *tmp_out, int8_t *output_data, int32_t *input_sum, int task_id,
                 ConvParameter *conv_param, GEMM_FUNC gemm_func);

// int8 convolution 1x1
void Conv1x1Int8(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum,
                 const int32_t *bias, int row, int col, int deep16, ConvParameter *conv_param,
                 MATMUL_OPT_R_FUNC matmul_func);

// int8 convolution 3x3
void Conv3x3Int8(int16_t *input_data, int16_t *transed_weight, const int32_t *bias_data, int8_t *output_data,
                 int16_t *tile_buffer, int16_t *block_unit_buffer, int32_t *tmp_dst_buffer, int8_t *tmp_out,
@@ -172,73 +172,7 @@ void DeConvPackWeightSum(int8_t *weight, int32_t *weight_sum, int32_t input_zp,
void DeConvPackInputSum(const int8_t *src, int32_t *dst, int32_t filter_zp, size_t row4, size_t col16,
                        bool suppport_opt) {
  /* optimize normal -> same layout */
#ifdef ENABLE_ARM64
  asm volatile(
    "mov x10, %[src] \n"
    "mov x11, %[dst] \n"
    "dup v15.4s, %w[filter_zp] \n"

    "mov x0, #0 \n"
    "1: \n"
    "cmp x0, %[row4] \n"
    "beq 4f \n"
    "add x0, x0, #4\n"
    "dup v10.4s, wzr \n"
    "mov x2, #0 \n"

    "2: \n"
    "cmp x2, %[col16] \n"
    "beq 3f \n"
    "add x2, x2, #16\n"

    "ld1 {v0.16b}, [x10], #16\n"
    "ld1 {v1.16b}, [x10], #16\n"
    "ld1 {v2.16b}, [x10], #16\n"
    "ld1 {v3.16b}, [x10], #16\n"

    "saddlp v4.8h, v0.16b \n"
    "saddlp v5.8h, v1.16b \n"
    "saddlp v6.8h, v2.16b \n"
    "saddlp v7.8h, v3.16b \n"

    "saddlp v0.4S, v4.8h \n"
    "saddlp v1.4S, v5.8h \n"
    "saddlp v2.4S, v6.8h \n"
    "saddlp v3.4S, v7.8h \n"

    "addv s4, v0.4S \n"
    "addv s5, v1.4S \n"
    "addv s6, v2.4S \n"
    "addv s7, v3.4S \n"

    "mov v0.s[0], v4.s[0] \n"
    "mov v0.s[1], v5.s[0] \n"
    "mov v0.s[2], v6.s[0] \n"
    "mov v0.s[3], v7.s[0] \n"

    "add v10.4s, v10.4s, v0.4s \n"
    "b 2b\n"

    "3: \n"
    "mul v10.4s, v10.4s, v15.4s \n"
    "st1 {v10.4s}, [x11], #16 \n"
    "beq 1b \n"

    "4: \n"

    :
    : [ dst ] "r"(dst), [ src ] "r"(src), [ row4 ] "r"(row4), [ col16 ] "r"(col16), [ filter_zp ] "r"(filter_zp)
    : "x0", "x1", "x2", "x3", "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v10", "v15");
#else
  for (int r = 0; r < row4; r++) {
    int32_t tmp_value = 0;
    for (int c = 0; c < col16; c++) {
      int r4div = r / C4NUM, r4mod = r % C4NUM, c16div = c / C16NUM, c16mod = c % C16NUM;
      int src_index = r4div * C4NUM * col16 + c16div * C16NUM * C4NUM + r4mod * C16NUM + c16mod;
      tmp_value += src[src_index];
    }
  }
#endif
  PackInputSum16x4PerLater(src, dst, filter_zp, row4, col16);
  return;
}
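The NEON path reduces each row of 16 int8 values with two saddlp pairwise widenings followed by an across-vector addv. A self-contained scalar sketch of the same per-row reduction, for illustration only:

#include <stdint.h>
/* Sum 16 int8 values the way saddlp/saddlp/addv does: widen pairwise twice, then add across. */
static int32_t SumRow16(const int8_t *p) {
  int16_t h[8];
  for (int i = 0; i < 8; ++i) h[i] = (int16_t)p[2 * i] + p[2 * i + 1]; /* saddlp .8h */
  int32_t s[4];
  for (int i = 0; i < 4; ++i) s[i] = (int32_t)h[2 * i] + h[2 * i + 1]; /* saddlp .4s */
  return s[0] + s[1] + s[2] + s[3];                                    /* addv     */
}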
@@ -28,6 +28,19 @@ void RowMajor2Row8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col)
  }
}

void RowMajor2Row4x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
  int col16 = UP_ROUND(col, C16NUM);
  for (int r = 0; r < row; r++) {
    int rd4 = r / C4NUM;
    int rm4 = r % C4NUM;
    for (int c = 0; c < col; c++) {
      int cd16 = c / C16NUM;
      int cm16 = c % C16NUM;
      dst_ptr[cd16 * col16 * C4NUM + rd4 * C4NUM * C16NUM + rm4 * C16NUM + cm16] = src_ptr[r * col + c];
    }
  }
}

void MatrixPack4x16UnitInt8(int8_t *src, int8_t *dst, int row, int col, int stride) {
  for (int r = 0; r < row; r++) {
    int8_t *src_r = src + r * stride;
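RowMajor2Row4x16MajorInt8 retiles the weight matrix into 4-row by 16-column blocks so that MatMulInt8_16x4_r can walk both operands with the same C4NUM/C16NUM strides. A sketch of the destination offset it computes for row-major element (r, c), directly mirroring the index arithmetic above (col16 = UP_ROUND(col, C16NUM)):

/* Sketch: packed offset of row-major element (r, c) under the 4x16 tiling above. */
static int Packed4x16Index(int r, int c, int col16) {
  return (c / 16) * col16 * 4 /* 16-wide column block    */
       + (r / 4) * 4 * 16     /* 4-high row block        */
       + (r % 4) * 16         /* row within the block    */
       + (c % 16);            /* column within the block */
}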
@@ -145,7 +158,38 @@ void MatMulInt8_16x4(const int8_t *a, const int8_t *b, int *dst, int row_4, int
  return;
}

#ifdef ENABLE_ARM64
void MatMulInt8_16x4_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16,
                       size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
                       int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi,
                       bool per_channel) {
  /* row4x16-major * row16x4-major => (int8)row-major : per-channel */
  for (int r = 0; r < row; r++) {
    for (int c = 0; c < col; c++) {
      int r4div = r / C4NUM, r4mod = r % C4NUM;
      int c4div = c / C4NUM, c4mod = c % C4NUM;
      size_t ci = r * stride + c;
      int32_t value = 0;
      for (int d = 0; d < deep_16; d++) {
        int d16div = d / C16NUM, d16mod = d % C16NUM;
        size_t ai = r4div * deep_16 * C4NUM + d16div * C4NUM * C16NUM + r4mod * C16NUM + d16mod;
        size_t bi = c4div * deep_16 * C4NUM + d16div * C4NUM * C16NUM + c4mod * C16NUM + d16mod;
        value = value + a[ai] * b[bi];
      }
      int32_t cur_input_sum = per_channel ? input_sum[c4div * UP_ROUND(row, C4NUM) + r * C4NUM + c4mod] : input_sum[r];
      value -= cur_input_sum;
      value += bias[c];
      int32_t cur_left_shift = per_channel ? left_shift[c] : left_shift[0];
      int32_t cur_right_shift = per_channel ? right_shift[c] : right_shift[0];
      int32_t cur_multiplier = per_channel ? multiplier[c] : multiplier[0];
      value = MultiplyByQuantizedMultiplier(value, cur_multiplier, cur_left_shift, cur_right_shift) + output_zp;
      value = MSMIN(maxi, value);
      value = MSMAX(mini, value);
      dst[ci] = (int8_t)value;
    }
  }
  return;
}

void RowMajor2Row4x16Major(int8_t *src, int row, int col, int8_t *dst, int col_16) {
  int stride = sizeof(int8_t) * 16 * 4;
  for (int r = 0; r < row; ++r) {
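The correction terms follow from the standard zero-point expansion: with stored int8 values a, w and zero points za, zw,
sum((a - za) * (w - zw)) = sum(a * w) - zw * sum(a) - za * sum(w) + depth * za * zw.
input_sum supplies the zw * sum(a) term, and InitWeightBias (later in this patch) folds the two za terms into the bias, so the inner loop accumulates only raw products. A gemmlowp-style requantization sketch, assuming MultiplyByQuantizedMultiplier follows that convention and that right_shift is stored non-positive (both assumptions, not verified here):

#include <stdint.h>

/* Sketch of fixed-point requantization; the INT32_MIN saturation corner of the
 * doubling high-mul is omitted for brevity. */
static int32_t SatRoundDoublingHighMul(int32_t a, int32_t b) {
  int64_t prod = (int64_t)a * (int64_t)b;
  int64_t nudge = prod >= 0 ? (1ll << 30) : (1 - (1ll << 30));
  return (int32_t)((prod + nudge) >> 31);
}

static int32_t RoundDivByPOT(int32_t x, int exponent) {
  const int32_t mask = (int32_t)((1ll << exponent) - 1);
  int32_t remainder = x & mask;
  int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
  return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

static int32_t Requantize(int32_t acc, int32_t multiplier, int32_t left_shift, int32_t right_shift) {
  return RoundDivByPOT(SatRoundDoublingHighMul(acc * (1 << left_shift), multiplier), -right_shift);
}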
@@ -201,4 +245,3 @@ void Row4x4Major2RowMajor(int8_t *src, int row4, int8_t *dst, int row, int cow)
    }
  }
}
#endif
@@ -28,17 +28,22 @@ void MatMulInt8(const int8_t *a, const int8_t *b, int *c, const int row8, const
                const int a_zp, const int b_zp);
void MatMulInt8_16x4(const int8_t *a, const int8_t *b, int *dst, int row_4, int col_4, int deep_16,
                     const int *input_sum, const int *bias);
void MatMulInt8_16x4_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16,
                       size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
                       int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi,
                       bool per_channel);
void RowMajor2Row8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
void RowMajor2Row4x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
void RowMajor2Col8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
void RowMajor2Row16x4MajorInt8(void *src_ptr, void *dst_ptr, int row, int col);

#ifdef ENABLE_ARM64
void RowMajor2Row4x16Major(int8_t *src, int row, int col, int8_t *dst, int col_16);
void RowMajor2Col16x4Major(int8_t *src, int row, int col, int8_t *dst, int row_16);
void RowMajor2Asums(int8_t *a, int row, int col, int b_zp, int *dst);
void RowMajor2Bbias(int8_t *b, int row, int col, int a_zp, int b_zp, int *bias, int *dst);
void Row4x4Major2RowMajor(int8_t *src, int row4, int8_t *dst, int row, int cow);

#ifdef ENABLE_ARM64
// bias = bias + depth * a_zp * b_zp - a_zp * b_sums
void MatmulInt8Neon64(const int8_t *a, const int8_t *b, int8_t *dst, int row4, int col4, int deep16, const int *a_sums,
                      const int *bias, int act_min, int act_max, int out_zp, int multiplier, int left_shift,
@@ -22,6 +22,11 @@
typedef void (*MATMUL_OPT_R4_FUNC)(const int8_t *a, const int8_t *b, int *dst, int row_4, int col_4, int deep_16,
                                   const int *input_sum, const int *bias);

typedef void (*MATMUL_OPT_R_FUNC)(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16,
                                  size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
                                  int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini,
                                  int32_t maxi, bool per_channel);

typedef void (*MAT_TRANS_FUNC)(void *dst, void *a, int row, int col);

typedef enum ActType { ActType_No, ActType_Relu, ActType_Relu6 } ActType;
@@ -15,6 +15,7 @@
 */

#include <stdlib.h>
#include <stdbool.h>

#ifdef __cplusplus
extern "C" {
@@ -45,4 +46,11 @@ void MatMulR4Int8_optimize_handler(const int8_t *a, const int8_t *b, int *dst, i
                                   const int *input_sum, const int *bias) {
  return MatMulOptR4Int8Neon64(a, b, dst, row4, col4, deep16, input_sum, bias);
}

void MatMulRInt8_optimize_handler(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16,
                                  size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
                                  int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini,
                                  int32_t maxi, bool per_channel) {
  return;
}
#endif
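MatMulRInt8_optimize_handler is still a stub in this commit. By analogy with MatMulR4Int8_optimize_handler just above, a finished version would presumably forward to an assembly kernel; the callee name below is hypothetical and does not exist in this patch:

/* Hypothetical body once an assembly kernel lands (MatMulRInt8Neon64 is assumed, not real here): */
return MatMulRInt8Neon64(a, b, dst, row, col, deep_16, stride, input_sum, bias, left_shift, right_shift,
                         multiplier, output_zp, mini, maxi, per_channel);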
@@ -153,22 +153,24 @@ void PackWeightInt8Opt(int8_t *weight_data, ConvParameter *conv_param, int8_t *p
  } // kernel plane loop
}

void Conv1x1InputPackFp32(const float *src, float *dst, ConvParameter *conv_param) {
void Conv1x1InputPack(const void *src_ptr, void *dst_ptr, ConvParameter *conv_param, int data_size) {
  /* support nhwc */
  char *src = (char *)src_ptr;
  char *dst = (char *)dst_ptr;
  for (int dst_h = 0; dst_h < conv_param->output_h_; dst_h++) {
    int src_h = dst_h * conv_param->stride_h_ - conv_param->pad_h_;
    if (src_h < 0 || src_h >= conv_param->input_h_) {
      continue;
    }
    const float *src_h_ptr = src + src_h * conv_param->input_w_ * conv_param->input_channel_;
    float *dst_h_ptr = dst + dst_h * conv_param->output_w_ * conv_param->input_channel_;
    const char *src_h_ptr = src + src_h * conv_param->input_w_ * conv_param->input_channel_ * data_size;
    char *dst_h_ptr = dst + dst_h * conv_param->output_w_ * conv_param->input_channel_ * data_size;
    for (int dst_w = 0; dst_w < conv_param->output_w_; dst_w++) {
      int src_w = dst_w * conv_param->stride_w_ - conv_param->pad_w_;
      if (src_w < 0 || src_w >= conv_param->input_w_) {
        continue;
      }
      memcpy(dst_h_ptr + dst_w * conv_param->input_channel_, src_h_ptr + src_w * conv_param->input_channel_,
             conv_param->input_channel_ * sizeof(float));
      memcpy(dst_h_ptr + dst_w * conv_param->input_channel_ * data_size,
             src_h_ptr + src_w * conv_param->input_channel_ * data_size, conv_param->input_channel_ * data_size);
    }
  }
  return;
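Generalizing Conv1x1InputPackFp32 into Conv1x1InputPack moves the element width into a parameter, so the same byte-level copy serves both data types. The call sites in this patch look like:

/* fp32 kernel (Convolution1x1CPUKernel::Pre1x1Trans) */
Conv1x1InputPack(src_input, input_ptr_, conv_param_, sizeof(float));
/* int8 kernel (Convolution1x1Int8CPUKernel::Pre1x1Trans) */
Conv1x1InputPack(src_input, input_ptr_, conv_param_, sizeof(int8_t));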
@@ -188,6 +190,105 @@ void Pack1x1WeightFp32(const float *weight_data, float *packed_weight, ConvParam
  return;
}

void PackInputSum16x4PerLater(const int8_t *src, int32_t *dst, int32_t filter_zp, size_t row4, size_t col16) {
  /* optimize normal -> same layout */
#ifdef ENABLE_ARM64
  asm volatile(
    "mov x10, %[src] \n"
    "mov x11, %[dst] \n"
    "dup v15.4s, %w[filter_zp] \n"

    "mov x0, #0 \n"
    "1: \n"
    "cmp x0, %[row4] \n"
    "beq 4f \n"
    "add x0, x0, #4\n"
    "dup v10.4s, wzr \n"
    "mov x2, #0 \n"

    "2: \n"
    "cmp x2, %[col16] \n"
    "beq 3f \n"
    "add x2, x2, #16\n"

    "ld1 {v0.16b}, [x10], #16\n"
    "ld1 {v1.16b}, [x10], #16\n"
    "ld1 {v2.16b}, [x10], #16\n"
    "ld1 {v3.16b}, [x10], #16\n"

    "saddlp v4.8h, v0.16b \n"
    "saddlp v5.8h, v1.16b \n"
    "saddlp v6.8h, v2.16b \n"
    "saddlp v7.8h, v3.16b \n"

    "saddlp v0.4S, v4.8h \n"
    "saddlp v1.4S, v5.8h \n"
    "saddlp v2.4S, v6.8h \n"
    "saddlp v3.4S, v7.8h \n"

    "addv s4, v0.4S \n"
    "addv s5, v1.4S \n"
    "addv s6, v2.4S \n"
    "addv s7, v3.4S \n"

    "mov v0.s[0], v4.s[0] \n"
    "mov v0.s[1], v5.s[0] \n"
    "mov v0.s[2], v6.s[0] \n"
    "mov v0.s[3], v7.s[0] \n"

    "add v10.4s, v10.4s, v0.4s \n"
    "b 2b\n"

    "3: \n"
    "mul v10.4s, v10.4s, v15.4s \n"
    "st1 {v10.4s}, [x11], #16 \n"
    "beq 1b \n"

    "4: \n"

    :
    : [ dst ] "r"(dst), [ src ] "r"(src), [ row4 ] "r"(row4), [ col16 ] "r"(col16), [ filter_zp ] "r"(filter_zp)
    : "x0", "x1", "x2", "x3", "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v10", "v15");
#else
  for (int r = 0; r < row4; r++) {
    int32_t tmp_value = 0;
    for (int c = 0; c < col16; c++) {
      int r4div = r / C4NUM, r4mod = r % C4NUM, c16div = c / C16NUM, c16mod = c % C16NUM;
      int src_index = r4div * C4NUM * col16 + c16div * C16NUM * C4NUM + r4mod * C16NUM + c16mod;
      tmp_value += src[src_index];
    }
    dst[r] = tmp_value * filter_zp;
  }
#endif
  return;
}

void PackInputSum16x4Int8(int8_t *input_value, int32_t *input_sum, size_t input_channel, size_t output_channel,
                          size_t plane_size, ConvParameter *conv_param) {
  size_t hw4 = UP_ROUND(plane_size, C4NUM);
  size_t ic16 = UP_ROUND(input_channel, C16NUM);
  if (conv_param->conv_quant_arg_.filter_arg_num_ == 1) {
    PackInputSum16x4PerLater(input_value, input_sum, conv_param->conv_quant_arg_.filter_quant_args_[0].zp_, hw4, ic16);
  } else {
    for (int ri = 0; ri < plane_size; ri++) {
      int ri4div = ri / C4NUM, ri4mod = ri % C4NUM;
      for (int ci = 0; ci < output_channel; ci++) {
        int32_t tmp_sum_value = 0;
        int ci4div = ci / C4NUM, ci4mod = ci % C4NUM;
        int32_t filter_zp = conv_param->conv_quant_arg_.filter_quant_args_[ci].zp_;
        for (int di = 0; di < input_channel; di++) {
          size_t di16div = di / C16NUM, di16mod = di % C16NUM;
          int src_index = ri4div * C4NUM * ic16 + di16div * C16NUM * C4NUM + ri4mod * C16NUM + di16mod;
          tmp_sum_value += input_value[src_index];
        }
        int dst_index = ci4div * C4NUM * hw4 + ri * C4NUM + ci4mod;
        input_sum[dst_index] = tmp_sum_value * filter_zp;
      }
    }
  }
  return;
}

void Im2ColPackUnitFp32(const float *input_data, ConvParameter *conv_param, float *packed_input, int real_cal_num,
                        int block_index) {
  // input format : nhwc
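PackInputSum16x4Int8 writes two different layouts depending on filter_arg_num_: per-tensor quantization needs only one sum per input row, while per-channel needs a sum per (row, output-channel) pair because each channel has its own filter zero point. For reference, MatMulInt8_16x4_r indexes the sums as:

/* Read-back indices used in MatMulInt8_16x4_r for the two input_sum layouts:
 *   per-tensor : input_sum[r]
 *   per-channel: input_sum[(c / C4NUM) * UP_ROUND(row, C4NUM) + r * C4NUM + (c % C4NUM)]
 */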
@@ -35,10 +35,15 @@ void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real
void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int real_cal_num, int block_index,
                           int32_t *input_sum, ConvParameter *conv_param);

void Conv1x1InputPackFp32(const float *src, float *dst, ConvParameter *conv_param);
void PackInputSum16x4PerLater(const int8_t *src, int32_t *dst, int32_t filter_zp, size_t row4, size_t col16);

void Conv1x1InputPack(const void *src_ptr, void *dst_ptr, ConvParameter *conv_param, int data_size);

void Pack1x1WeightFp32(const float *weight_data, float *packed_weight, ConvParameter *conv_param);

void PackInputSum16x4Int8(int8_t *input_value, int32_t *input_sum, size_t input_channel, size_t output_channel,
                          size_t plane_size, ConvParameter *conv_param);

void MatrixPack(const float *src, float *dst, int row, int ic4, int stride);

void PackInputToC8Int8(const int8_t *input_data, int16_t *packed_input, ConvParameter *conv_param);
@@ -118,10 +118,13 @@ int ConvolutionBaseCPUKernel::CheckLayout(lite::tensor::Tensor *input_tensor) {
}

int ConvolutionBaseCPUKernel::SetIfPerChannel() {
  auto filter_tensor = in_tensors_.at(kWeightIndex);
  auto input_channel = filter_tensor->Channel();
  auto output_channel = filter_tensor->Batch();

  uint8_t per_channel = 0b0;
  if (conv_quant_arg_->input_arg_num_ != kPerTensor) {
    int in_channel = conv_param_->input_channel_;
    if (static_cast<int>(conv_quant_arg_->input_arg_num_) != in_channel) {
    if (static_cast<int>(conv_quant_arg_->input_arg_num_) != input_channel) {
      MS_LOG(ERROR) << "input per channel quant param length is not equal to input channel.";
      return RET_ERROR;
    }
@@ -129,8 +132,7 @@ int ConvolutionBaseCPUKernel::SetIfPerChannel() {
  }

  if (conv_quant_arg_->filter_arg_num_ != kPerTensor) {
    int filter_num = conv_param_->output_channel_;
    if (static_cast<int>(conv_quant_arg_->filter_arg_num_) != filter_num) {
    if (static_cast<int>(conv_quant_arg_->filter_arg_num_) != output_channel) {
      MS_LOG(ERROR) << "weight per channel quant param length is not equal to filter num.";
      return RET_ERROR;
    }
@@ -138,8 +140,7 @@ int ConvolutionBaseCPUKernel::SetIfPerChannel() {
  }

  if (conv_quant_arg_->output_arg_num_ != kPerTensor) {
    int out_channel = conv_param_->output_channel_;
    if (static_cast<int>(conv_quant_arg_->output_arg_num_) != out_channel) {
    if (static_cast<int>(conv_quant_arg_->output_arg_num_) != output_channel) {
      MS_LOG(ERROR) << "output per channel quant param length is not equal to output channel.";
      return RET_ERROR;
    }
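SetIfPerChannel now validates the quant-param counts against the filter tensor's Channel()/Batch() rather than conv_param_ fields, which may not be populated yet at this point. The per_channel bits it accumulates are consumed later (see the FILTER_PER_CHANNEL test in Convolution1x1Int8CPUKernel::InitParam); a sketch of the presumed flag assembly, assuming the flag names used elsewhere in this diff:

// Sketch (assumption): how the per_channel bitmask is presumably assembled and stored.
uint8_t per_channel = 0b0;
if (conv_quant_arg_->filter_arg_num_ != kPerTensor) {
  per_channel = per_channel | FILTER_PER_CHANNEL;  // later tested via per_channel_ & FILTER_PER_CHANNEL
}
conv_quant_arg_->per_channel_ = per_channel;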
@@ -113,7 +113,7 @@ void Convolution1x1CPUKernel::Pre1x1Trans(float *src_input, float *src_output) {
  output_ptr_ = src_output;

  if (pre_trans_input_) {
    Conv1x1InputPackFp32(src_input, input_ptr_, conv_param_);
    Conv1x1InputPack(src_input, input_ptr_, conv_param_, sizeof(float));
  } else {
    input_ptr_ = src_input;
  }
@@ -0,0 +1,270 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/runtime/kernel/arm/int8/convolution_1x1_int8.h"
#include "src/runtime/runtime_api.h"

using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_MEMORY_FAILED;
using mindspore::lite::RET_OK;

namespace mindspore::kernel {

Convolution1x1Int8CPUKernel::~Convolution1x1Int8CPUKernel() {
  if (matmul_param_ != nullptr) {
    delete matmul_param_;
    matmul_param_ = nullptr;
  }
  if (packed_weight_ != nullptr) {
    free(packed_weight_);
    packed_weight_ = nullptr;
  }
  FreeResizeBuf();
  FreeQuantParam();
}

void Convolution1x1Int8CPUKernel::FreeResizeBuf() {
  if (packed_input_ != nullptr) {
    free(packed_input_);
    packed_input_ = nullptr;
  }
  if (input_sum_ != nullptr) {
    free(input_sum_);
    input_sum_ = nullptr;
  }
  return;
}

void Convolution1x1Int8CPUKernel::CheckSupportOptimize() {
  support_optimize_ = false;
  matmul_func_ = MatMulInt8_16x4_r;
#ifdef ENABLE_ARM64
  void *optimize_op_handler = OptimizeModule::GetInstance()->optimized_op_handler_;
  if (optimize_op_handler != nullptr) {
    dlerror();
    *(reinterpret_cast<void **>(&matmul_func_)) = dlsym(optimize_op_handler, "MatMulRInt8_optimize_handler");
    auto dlopen_error = dlerror();
    if (dlopen_error != nullptr) {
      MS_LOG(ERROR) << "load matmul func failed! " << dlopen_error << ".";
      support_optimize_ = false;
      matmul_func_ = nullptr;
    } else {
      support_optimize_ = true;
    }
  } else {
    support_optimize_ = false;
    matmul_func_ = nullptr;
  }
#endif
  return;
}

int Convolution1x1Int8CPUKernel::InitWeightBias() {
  auto filter_tensor = in_tensors_.at(kWeightIndex);
  auto input_channel = filter_tensor->Channel();
  auto output_channel = filter_tensor->Batch();

  /* weight */
  size_t size = UP_ROUND(input_channel, C16NUM) * UP_ROUND(output_channel, C4NUM) * sizeof(int8_t);
  packed_weight_ = reinterpret_cast<int8_t *>(malloc(size));
  if (packed_weight_ == nullptr) {
    MS_LOG(ERROR) << "Conv1x1 int8 Malloc weight error!";
    return RET_ERROR;
  }
  memset(packed_weight_, 0, size);
  RowMajor2Row4x16MajorInt8(reinterpret_cast<int8_t *>(filter_tensor->Data()), packed_weight_, output_channel,
                            input_channel);

  /* bias = bias + input_channel * filter_zp * input_zp - input_zp * weight_sum */
  int col4 = UP_ROUND(output_channel, C4NUM);
  bias_data_ = malloc(col4 * sizeof(int32_t));
  if (bias_data_ == nullptr) {
    MS_LOG(ERROR) << "Conv1x1 int8 Malloc bias_ptr_ error!";
    return RET_ERROR;
  }
  memset(bias_data_, 0, col4 * sizeof(int32_t));
  if (in_tensors_.size() == 3) {
    memcpy(bias_data_, in_tensors_[kBiasIndex]->Data(), output_channel * sizeof(int32_t));
  }

  int32_t *bias_data = reinterpret_cast<int32_t *>(bias_data_);
  int8_t *weight = reinterpret_cast<int8_t *>(filter_tensor->Data());
  int32_t input_zp = conv_param_->conv_quant_arg_.input_quant_args_[0].zp_;
  for (int oc = 0; oc < output_channel; oc++) {
    int32_t weight_sum_value = 0;
    int32_t filter_zp = (conv_param_->conv_quant_arg_.filter_arg_num_ == 1)
                          ? conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_
                          : conv_param_->conv_quant_arg_.filter_quant_args_[oc].zp_;
    for (int ic = 0; ic < input_channel; ic++) {
      weight_sum_value += weight[oc * input_channel + ic];
    }
    bias_data[oc] += filter_zp * input_zp * input_channel - weight_sum_value * input_zp;
  }
  return RET_OK;
}

int Convolution1x1Int8CPUKernel::Init() {
  if (!InferShapeDone()) {
    return RET_OK;
  }
  matmul_param_ = new (std::nothrow) MatMulParameter();
  if (matmul_param_ == nullptr) {
    MS_LOG(ERROR) << "Init matmul_param_ failed.";
    return RET_ERROR;
  }

  CheckSupportOptimize();

  auto ret = SetQuantParam();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Set quant param failed.";
    return ret;
  }

  ret = InitWeightBias();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Init weight bias failed.";
    return ret;
  }

  return ReSize();
}

int Convolution1x1Int8CPUKernel::InitParam() {
  pre_trans_input_ = (conv_param_->pad_h_ != 0 || conv_param_->pad_w_ != 0 || conv_param_->stride_h_ != 1 ||
                      conv_param_->stride_w_ != 1);

  matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_;
  matmul_param_->deep_ = conv_param_->input_channel_;
  matmul_param_->col_ = conv_param_->output_channel_;

  thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, C4NUM));
  thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, C4NUM), thread_count_);

  size_t size = UP_ROUND(matmul_param_->row_, C4NUM) * UP_ROUND(matmul_param_->deep_, C16NUM);
  packed_input_ = reinterpret_cast<int8_t *>(malloc(size * sizeof(int8_t)));
  if (packed_input_ == nullptr) {
    MS_LOG(ERROR) << "conv1x1 int8 Malloc packed_input_ error!";
    return RET_ERROR;
  }
  memset(packed_input_, 0, size * sizeof(int8_t));

  if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) {
    size = UP_ROUND(conv_param_->output_channel_, C4NUM) * UP_ROUND(matmul_param_->row_, C4NUM);
  } else {
    size = UP_ROUND(matmul_param_->row_, C4NUM);
  }
  input_sum_ = reinterpret_cast<int32_t *>(malloc(size * sizeof(int32_t)));
  if (input_sum_ == nullptr) {
    MS_LOG(ERROR) << "malloc input_sum_ failed.";
    return RET_ERROR;
  }
  memset(input_sum_, 0, size * sizeof(int32_t));

  return RET_OK;
}

int Convolution1x1Int8CPUKernel::ReSize() {
  FreeResizeBuf();

  ConvolutionBaseCPUKernel::Init();

  int error_code = InitParam();
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "Convolution base init failed.";
    return error_code;
  }
  return RET_OK;
}

void Convolution1x1Int8CPUKernel::Pre1x1Trans(int8_t *src_input, int8_t *src_output) {
  output_ptr_ = src_output;
  if (pre_trans_input_) {
    Conv1x1InputPack(src_input, input_ptr_, conv_param_, sizeof(int8_t));
  } else {
    input_ptr_ = src_input;
  }
  RowMajor2Row16x4MajorInt8(input_ptr_, packed_input_, matmul_param_->row_, matmul_param_->deep_);
  return;
}

int Convolution1x1Int8CPUKernel::RunImpl(int task_id) {
  int cur_oc = MSMIN(thread_stride_ * C4NUM, matmul_param_->col_ - task_id * thread_stride_ * C4NUM);
  if (cur_oc <= 0) {
    return RET_OK;
  }

  int32_t *bias = reinterpret_cast<int32_t *>(bias_data_) + thread_stride_ * C4NUM * task_id;

  Conv1x1Int8(packed_input_, packed_weight_ + task_id * thread_stride_ * C4NUM * matmul_param_->deep_,
              output_ptr_ + task_id * thread_stride_ * C4NUM, input_sum_, bias,
              matmul_param_->row_, cur_oc, UP_ROUND(matmul_param_->deep_, C16NUM), conv_param_, matmul_func_);
  return RET_OK;
}

int Convolution1x1Int8Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
  auto conv = reinterpret_cast<Convolution1x1Int8CPUKernel *>(cdata);
  auto error_code = conv->RunImpl(task_id);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "conv1x1 Int8 Run error task_id[" << task_id << "] error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

int Convolution1x1Int8CPUKernel::Run() {
  auto ret = Prepare();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Prepare failed.";
    return RET_ERROR;
  }

  if (pre_trans_input_) {
    input_ptr_ =
      reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(matmul_param_->row_ * matmul_param_->deep_ * sizeof(int8_t)));
    if (input_ptr_ == nullptr) {
      MS_LOG(ERROR) << "Conv1x1 int8 Malloc input_ptr_ error!";
      return RET_MEMORY_FAILED;
    }
  }

  int8_t *src_in = reinterpret_cast<int8_t *>(in_tensors_[0]->Data());
  int8_t *src_out = reinterpret_cast<int8_t *>(out_tensors_[0]->Data());

  for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) {
    Pre1x1Trans(src_in + batch_index * conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_channel_,
                src_out + batch_index * matmul_param_->row_ * matmul_param_->col_);

    PackInputSum16x4Int8(packed_input_, input_sum_, matmul_param_->deep_, matmul_param_->col_, matmul_param_->row_,
                         conv_param_);

    int error_code = LiteBackendParallelLaunch(Convolution1x1Int8Impl, this, thread_count_);
    if (error_code != RET_OK) {
      MS_LOG(ERROR) << "conv1x1 int8 error error_code[" << error_code << "]";
      return RET_ERROR;
    }
  }

  if (pre_trans_input_ && input_ptr_ != nullptr) {
    ctx_->allocator->Free(input_ptr_);
    input_ptr_ = nullptr;
  }

  return RET_OK;
}
} // namespace mindspore::kernel
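InitParam splits the output channels into C4NUM-wide blocks across threads, and RunImpl guards against tasks that fall past the end. A worked example of the arithmetic, assuming op_parameter_->thread_num_ = 2:

/* col_ = 10 output channels, thread_num_ = 2:
 *   UP_DIV(10, C4NUM) = 3 blocks, thread_count_ = MSMIN(2, 3) = 2, thread_stride_ = UP_DIV(3, 2) = 2 blocks.
 *   task 0 -> channels [0, 8),  cur_oc = MSMIN(8, 10 - 0) = 8
 *   task 1 -> channels [8, 10), cur_oc = MSMIN(8, 10 - 8) = 2
 */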
@@ -0,0 +1,68 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_1x1_INT8_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_1x1_INT8_H_

#include <vector>
#include "src/lite_kernel.h"
#include "include/errorcode.h"
#include "schema/model_generated.h"
#include "src/runtime/kernel/arm/base/convolution_base.h"
#include "nnacl/int8/conv_int8.h"
#include "nnacl/int8/matmul_int8.h"
#include "nnacl/matmul_parameter.h"
#include "nnacl/optimized_kernel.h"

namespace mindspore::kernel {
class Convolution1x1Int8CPUKernel : public ConvolutionBaseCPUKernel {
 public:
  Convolution1x1Int8CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                              const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
                              const mindspore::lite::PrimitiveC *primitive)
      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
  ~Convolution1x1Int8CPUKernel() override;

  int Init() override;
  int ReSize() override;
  int Run() override;

 public:
  int RunImpl(int task_id);

 private:
  void FreeResizeBuf();
  int InitParam();
  int InitWeightBias();
  void Pre1x1Trans(int8_t *src_input, int8_t *src_output);
  void CheckSupportOptimize();

 private:
  int32_t *input_sum_ = nullptr; /* per-channel: oc4 format */
  int8_t *packed_weight_ = nullptr;
  int8_t *packed_input_ = nullptr;
  int8_t *input_ptr_ = nullptr;
  int8_t *output_ptr_ = nullptr;
  size_t thread_count_ = 1;
  size_t thread_stride_ = 0;
  bool pre_trans_input_ = false;
  MatMulParameter *matmul_param_ = nullptr;
  MATMUL_OPT_R_FUNC matmul_func_ = nullptr;
  bool support_optimize_ = false;
};
} // namespace mindspore::kernel

#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_1x1_INT8_H_
@@ -16,6 +16,7 @@

#include "src/runtime/kernel/arm/int8/convolution_int8.h"
#include "src/runtime/kernel/arm/int8/convolution_3x3_int8.h"
#include "src/runtime/kernel/arm/int8/convolution_1x1_int8.h"
#include "nnacl/int8/conv_int8.h"
#include "src/runtime/kernel/arm/base/layout_transform.h"
#include "schema/model_generated.h"
@@ -400,6 +401,9 @@ kernel::LiteKernel *CpuConvInt8KernelCreator(const std::vector<lite::tensor::Ten
  kernel::LiteKernel *kernel;
  if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) {
    kernel = new (std::nothrow) kernel::Convolution3x3Int8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
  } else if (kernel_h == 1 && kernel_w == 1) {
    /* Convolution1x1Int8CPUKernel */
    kernel = new (std::nothrow) kernel::ConvolutionInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
  } else {
    kernel = new (std::nothrow) kernel::ConvolutionInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
  }
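Note the 1x1 branch still constructs the generic ConvolutionInt8CPUKernel; the specialized kernel is only referenced in the comment. Enabling it later would presumably be a one-line swap (hypothetical follow-up, not part of this commit):

// Hypothetical follow-up: route the 1x1 branch to the new kernel once it is ready.
kernel = new (std::nothrow) kernel::Convolution1x1Int8CPUKernel(opParameter, inputs, outputs, ctx, primitive);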
@@ -54,7 +54,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack1) {
  conv_param->pad_h_ = conv_param->pad_w_ = 2;

  float out[20] = {0};
  Conv1x1InputPackFp32(in, out, conv_param);
  Conv1x1InputPack(in, out, conv_param, sizeof(float));
  EXPECT_EQ(0, lite::CompareOutputData(out, correct, 20));
  delete conv_param;
}

@@ -95,7 +95,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack2) {
  conv_param->pad_h_ = conv_param->pad_w_ = 0;

  float out[28] = {0};
  Conv1x1InputPackFp32(in, out, conv_param);
  Conv1x1InputPack(in, out, conv_param, sizeof(float));
  CompareOutputData(out, correct, 28, 0.0001);
  delete conv_param;
}

@@ -114,7 +114,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack3) {
  float correct[] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 17.025112,
                     -5.052577, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};

  Conv1x1InputPackFp32(in, out, conv_param);
  Conv1x1InputPack(in, out, conv_param, sizeof(float));
  EXPECT_EQ(0, lite::CompareOutputData(out, correct, 18));
  delete conv_param;
}

@@ -136,7 +136,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack4) {
                     -1.770, 41.903, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                     0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
  float out[54] = {0};
  Conv1x1InputPackFp32(in, out, conv_param);
  Conv1x1InputPack(in, out, conv_param, sizeof(float));
  EXPECT_EQ(0, lite::CompareOutputData(out, correct, 54));
  delete conv_param;
}
@@ -0,0 +1,281 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "utils/log_adapter.h"
#include "common/common_test.h"
#include "mindspore/lite/src/lite_kernel.h"
#include "src/common/file_utils.h"
#include "nnacl/quantization/quantize.h"
#include "nnacl/common_func.h"
#include "mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h"

namespace mindspore {
using lite::tensor::Tensor;
class TestConv1x1Int8 : public mindspore::CommonTest {
 public:
  TestConv1x1Int8() {}
};

TEST_F(TestConv1x1Int8, Input1x1PrePack1) {
  auto conv_param = new ConvParameter();
  conv_param->input_channel_ = 6;
  conv_param->input_h_ = conv_param->input_w_ = 3;
  conv_param->output_h_ = conv_param->output_w_ = 3;
  conv_param->stride_h_ = conv_param->stride_w_ = 2;
  conv_param->pad_h_ = conv_param->pad_w_ = 1;
  int8_t in[] = {4,  13, -3, 16, 19, 8,  19, -6, -2, -9, 9,  18, 23, 8,  47, -14, 15, 4,
                 -0, 37, -0, 6,  0,  -1, 37, 13, 11, 1,  -1, 41, 9,  14, 3,  0,   8,  9,
                 14, -14, -8, -8, -8, 7,  19, 17, 13, 3,  9,  18, -1, -0, 18, 0,   4,  -2};
  int8_t correct[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 37, 13, 11,
                      1, -1, 41, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0,  0, 0};
  int8_t out[54] = {0};
  Conv1x1InputPack(in, out, conv_param, sizeof(int8_t));
  CompareOutputData(out, correct, 54, 0);
  delete conv_param;
}

TEST_F(TestConv1x1Int8, Input1x1PrePack2) {
  auto conv_param = new ConvParameter();
  int8_t in[] = {-0, -0, -7, -0, -6, 4,  9,  9,  12, -0, 6,  2,  13, 15, 16, -7, 9,  1,  10, 13, 17, 17, 4,  13,
                 -6, 5,  7,  -7, 15, 0,  1,  -5, -7, 18, 15, 19, -7, 13, 7,  -0, 16, -5, 16, -7, 6,  10, -5, 10,
                 9,  12, -9, -8, -4, 18, -5, 0,  7,  12, 13, 16, -9, -4, 18, -0, 8,  6,  2,  10, 16, 1,  -1, 2,
                 9,  8,  9,  13, 7,  -0, 15, -7, 0,  -0, 17, 19, 9,  17, -6, -2, 7,  -0, 10, -6, -6, 18, -0, 9,
                 9,  6,  3,  -1, -8, 10, 17, -9, 17, 6,  -3, 7,  -2, -0, -9, 1,  -3, 15, 13, 4,  18};
  int8_t correct[] = {0, 0, 0, 0, 0, 0, 15, -7, -7, 0, 0, 0, 9, 7, 0, 0, 0, 0, 0, 0};

  conv_param->input_h_ = 9;
  conv_param->input_w_ = 13;
  conv_param->input_channel_ = 1;
  conv_param->output_h_ = 4;
  conv_param->output_w_ = 5;
  conv_param->stride_h_ = conv_param->stride_w_ = 4;
  conv_param->pad_h_ = conv_param->pad_w_ = 2;

  int8_t out[20] = {0};
  Conv1x1InputPack(in, out, conv_param, sizeof(int8_t));
  CompareOutputData(out, correct, 20, 0);
  delete conv_param;
}

int Conv1x1Int8TestInit1_perchannel(std::vector<lite::tensor::Tensor *> *inputs_,
                                    std::vector<lite::tensor::Tensor *> *outputs_, ConvParameter *conv_param,
                                    int8_t **correct) {
  Tensor *in_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 4}, schema::Format_NHWC, static_cast<schema::NodeType>(1));
  auto in_quant_arg = new mindspore::lite::tensor::QuantArg();
  in_quant_arg->zeroPoint = -42, in_quant_arg->scale = 0.117647;
  in_t->AddQuantParam(*in_quant_arg);
  in_t->MallocData();
  int8_t in[] = {62,  -14, 88, 2,   -35, 43,  83,  -111, 75,  26, 14,  -121,
                 -78, 56,  37, -31, 15,  -75, -10, -115, -71, 74, -65, -15};
  memcpy(in_t->Data(), in, in_t->ElementsNum() * sizeof(int8_t));
  inputs_->push_back(in_t);

  Tensor *weight_t = new Tensor(kNumberTypeInt8, {3, 1, 1, 4}, schema::Format_NHWC, static_cast<schema::NodeType>(1));
  weight_t->MallocData();
  auto weight_quant_arg1 = new mindspore::lite::tensor::QuantArg();
  weight_quant_arg1->zeroPoint = 66, weight_quant_arg1->scale = 0.96439215686275;
  auto weight_quant_arg2 = new mindspore::lite::tensor::QuantArg();
  weight_quant_arg2->zeroPoint = 33, weight_quant_arg2->scale = 0.76439215686275;
  auto weight_quant_arg3 = new mindspore::lite::tensor::QuantArg();
  weight_quant_arg3->zeroPoint = -20, weight_quant_arg3->scale = 0.99117647;
  weight_t->AddQuantParam(*weight_quant_arg1);
  weight_t->AddQuantParam(*weight_quant_arg2);
  weight_t->AddQuantParam(*weight_quant_arg3);
  int8_t weight[] = {65, 67, 65, 65, 32, 33, 34, 33, -19, -20, -19, -20};
  memcpy(weight_t->Data(), weight, weight_t->ElementsNum() * sizeof(int8_t));
  inputs_->push_back(weight_t);

  Tensor *out_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 3}, schema::Format_NHWC, static_cast<schema::NodeType>(1));
  out_t->MallocData();
  auto output_quant_arg = new mindspore::lite::tensor::QuantArg();
  output_quant_arg->zeroPoint = 7, output_quant_arg->scale = 0.294321233;
  out_t->AddQuantParam(*output_quant_arg);
  outputs_->push_back(out_t);

  *correct = reinterpret_cast<int8_t *>(malloc(out_t->ElementsNum() * sizeof(int8_t)));
  int8_t nchw_co[] = {-83, 34, 100, 10, 113, 55, 3, 16, 63, 6, 93, 20, 5, 6, 42, 35, 28, -24};
  memcpy(*correct, nchw_co, out_t->ElementsNum() * sizeof(int8_t));

  conv_param->kernel_h_ = conv_param->kernel_w_ = 1;
  conv_param->stride_h_ = conv_param->stride_w_ = 1;
  conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
  conv_param->pad_h_ = conv_param->pad_w_ = 0;
  conv_param->is_relu_ = conv_param->is_relu6_ = false;
  return out_t->ElementsNum();
}

TEST_F(TestConv1x1Int8, Conv1x1TestPerChannel) {
  std::vector<lite::tensor::Tensor *> inputs_;
  std::vector<lite::tensor::Tensor *> outputs_;
  auto conv_param = new ConvParameter();
  int8_t *correct;
  auto ctx = new lite::Context;
  ctx->thread_num_ = 1;
  int total_size = Conv1x1Int8TestInit1_perchannel(&inputs_, &outputs_, conv_param, &correct);
  kernel::Convolution1x1Int8CPUKernel *conv1x1 = new kernel::Convolution1x1Int8CPUKernel(
    reinterpret_cast<OpParameter *>(conv_param), inputs_, outputs_, ctx, nullptr);

  conv1x1->Init();
  conv1x1->Run();
  CompareOutputData(reinterpret_cast<int8_t *>(outputs_[0]->Data()), correct, total_size, 70);

  delete conv1x1;
  for (auto t : inputs_) delete t;
  for (auto t : outputs_) delete t;
  free(correct);
}

int Conv1x1Int8TestInit1(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<lite::tensor::Tensor *> *outputs_,
                         ConvParameter *conv_param, int8_t **correct) {
  Tensor *in_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 4}, schema::Format_NHWC, static_cast<schema::NodeType>(1));
  auto in_quant_arg = new mindspore::lite::tensor::QuantArg();
  in_quant_arg->zeroPoint = -42, in_quant_arg->scale = 0.117647;
  in_t->AddQuantParam(*in_quant_arg);
  in_t->MallocData();
  float in[] = {12.216284, 3.3466918, 15.327419,  5.234958,  0.804376,  9.952188,  14.727955, -8.080715,
                13.71383,  8.055829,  6.5845337,  -9.25232,  -4.24519,  11.550042, 9.262012,  1.2780352,
                6.7263746, -3.9301445, 3.764492,  -8.602078, -3.3558068, 13.619035, -2.6694393, 3.2008505};
  Quantize(in, in_t->ElementsNum(), in_quant_arg->scale, in_quant_arg->zeroPoint,
           reinterpret_cast<int8_t *>(in_t->Data()));
  inputs_->push_back(in_t);

  Tensor *weight_t = new Tensor(kNumberTypeInt8, {3, 1, 1, 4}, schema::Format_NHWC, static_cast<schema::NodeType>(1));
  auto weight_quant_arg = new mindspore::lite::tensor::QuantArg();
  weight_quant_arg->zeroPoint = 66, weight_quant_arg->scale = 0.036439215686275;
  weight_t->AddQuantParam(*weight_quant_arg);
  weight_t->MallocData();
  float weight[] = {-0.7308652, 0.5257509,  -0.87825793, -1.123181, -1.2206168, 0.562695,
                    1.5382664,  -0.5020635, 0.8591602,   -0.26410004, 1.1262615, 0.073132955};
  Quantize(weight, weight_t->ElementsNum(), weight_quant_arg->scale, weight_quant_arg->zeroPoint,
           reinterpret_cast<int8_t *>(weight_t->Data()));
  inputs_->push_back(weight_t);

  Tensor *out_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 3}, schema::Format_NHWC, static_cast<schema::NodeType>(1));
  out_t->MallocData();
  auto output_quant_arg = new mindspore::lite::tensor::QuantArg();
  output_quant_arg->zeroPoint = 7, output_quant_arg->scale = 0.234321233;
  out_t->AddQuantParam(*output_quant_arg);
  outputs_->push_back(out_t);

  *correct = reinterpret_cast<int8_t *>(malloc(out_t->ElementsNum() * sizeof(int8_t)));
  float nchw_co[] = {-26.51016327, 7.92113757, 27.25741343, 0.785643655,  31.3307619, 14.05927672,
                     -1.178490666, 2.5676252,  16.39408946, -0.394793726, 25.2866881, 3.827249175,
                     -0.626854507, -0.3122176, 10.42769169, 8.362184085,  6.04617807, -9.252362384};
  Quantize(nchw_co, out_t->ElementsNum(), output_quant_arg->scale, output_quant_arg->zeroPoint, *correct);

  conv_param->kernel_h_ = conv_param->kernel_w_ = 1;
  conv_param->stride_h_ = conv_param->stride_w_ = 1;
  conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
  conv_param->pad_h_ = conv_param->pad_w_ = 0;
  conv_param->is_relu_ = conv_param->is_relu6_ = false;
  return out_t->ElementsNum();
}

TEST_F(TestConv1x1Int8, Conv1x1Int8Test1) {
  std::vector<lite::tensor::Tensor *> inputs_;
  std::vector<lite::tensor::Tensor *> outputs_;
  auto conv_param = new ConvParameter();
  int8_t *correct;
  auto ctx = new lite::Context;
  ctx->thread_num_ = 1;
  int total_size = Conv1x1Int8TestInit1(&inputs_, &outputs_, conv_param, &correct);
  kernel::Convolution1x1Int8CPUKernel *conv1x1 = new kernel::Convolution1x1Int8CPUKernel(
    reinterpret_cast<OpParameter *>(conv_param), inputs_, outputs_, ctx, nullptr);

  conv1x1->Init();
  conv1x1->Run();
  CompareOutputData(reinterpret_cast<int8_t *>(outputs_[0]->Data()), correct, total_size, 2);

  delete conv1x1;
  for (auto t : inputs_) delete t;
  for (auto t : outputs_) delete t;
  free(correct);
}

int Conv1x1Int8TestInit2(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<lite::tensor::Tensor *> *outputs_,
                         ConvParameter *conv_param, int8_t **correct) {
  size_t buffer_size;
  Tensor *in_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 4}, schema::Format_NHWC, static_cast<schema::NodeType>(1));
  auto in_quant_arg = new mindspore::lite::tensor::QuantArg();
  in_quant_arg->zeroPoint = -42, in_quant_arg->scale = 0.117647;
  in_t->AddQuantParam(*in_quant_arg);
  in_t->MallocData();
  std::string input_path = "./input";
  auto input = mindspore::lite::ReadFile(input_path.c_str(), &buffer_size);
  memcpy(in_t->Data(), input, buffer_size);
  inputs_->push_back(in_t);
  delete[] input;

  Tensor *weight_t = new Tensor(kNumberTypeInt8, {3, 1, 1, 4}, schema::Format_NHWC, static_cast<schema::NodeType>(1));
  auto weight_quant_arg = new mindspore::lite::tensor::QuantArg();
  weight_quant_arg->zeroPoint = 66, weight_quant_arg->scale = 0.036439215686275;
  weight_t->AddQuantParam(*weight_quant_arg);
  weight_t->MallocData();
  std::string weight_path = "./weight";
  auto weight = mindspore::lite::ReadFile(weight_path.c_str(), &buffer_size);
  memcpy(weight_t->Data(), weight, buffer_size);
  inputs_->push_back(weight_t);
  delete[] weight;

  Tensor *bias_t = new Tensor(kNumberTypeInt32, {4}, schema::Format_NHWC, static_cast<schema::NodeType>(1));
  bias_t->MallocData();
  std::string bias_path = "./bias";
  auto bias = mindspore::lite::ReadFile(bias_path.c_str(), &buffer_size);
  memcpy(bias_t->Data(), bias, buffer_size);
  inputs_->push_back(bias_t);
  delete[] bias;

  Tensor *out_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 3}, schema::Format_NHWC, static_cast<schema::NodeType>(1));
  out_t->MallocData();
  auto output_quant_arg = new mindspore::lite::tensor::QuantArg();
  output_quant_arg->zeroPoint = 7, output_quant_arg->scale = 0.234321233;
  out_t->AddQuantParam(*output_quant_arg);
  outputs_->push_back(out_t);

  *correct = reinterpret_cast<int8_t *>(malloc(out_t->ElementsNum() * sizeof(int8_t)));
  std::string output_path = "./output";
  auto output = mindspore::lite::ReadFile(output_path.c_str(), &buffer_size);
  memcpy(*correct, output, buffer_size);
  delete[] output;

  conv_param->kernel_h_ = conv_param->kernel_w_ = 1;
  conv_param->stride_h_ = conv_param->stride_w_ = 1;
  conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
  conv_param->pad_h_ = conv_param->pad_w_ = 0;
  conv_param->is_relu_ = conv_param->is_relu6_ = false;
  return out_t->ElementsNum();
}

TEST_F(TestConv1x1Int8, Conv1x1Int8Test2) {
  std::vector<lite::tensor::Tensor *> inputs_;
  std::vector<lite::tensor::Tensor *> outputs_;
  auto conv_param = new ConvParameter();
  int8_t *correct;
  auto ctx = new lite::Context;
  ctx->thread_num_ = 1;
  int total_size = Conv1x1Int8TestInit2(&inputs_, &outputs_, conv_param, &correct);
  kernel::Convolution1x1Int8CPUKernel *conv1x1 = new kernel::Convolution1x1Int8CPUKernel(
    reinterpret_cast<OpParameter *>(conv_param), inputs_, outputs_, ctx, nullptr);

  conv1x1->Init();
  conv1x1->Run();
  CompareOutputData(reinterpret_cast<int8_t *>(outputs_[0]->Data()), correct, total_size, 2);

  delete conv1x1;
  for (auto t : inputs_) delete t;
  for (auto t : outputs_) delete t;
  free(correct);
}
} // namespace mindspore
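The int8 fixtures are consistent with the float fixtures under the affine map q = round(x / scale) + zero_point; for example, with scale 0.117647 and zero point -42, the first input 12.216284 maps to round(103.84) - 42 = 62, matching in[0] of the per-channel test. A sketch of the mapping the Quantize() helper is assumed to apply:

#include <cmath>
#include <cstdint>

// Sketch (assumption): per-element affine quantization with clamping to the int8 range.
inline int8_t QuantizeOne(float x, float scale, int32_t zero_point) {
  int32_t q = static_cast<int32_t>(std::round(x / scale)) + zero_point;
  if (q < -128) q = -128;
  if (q > 127) q = 127;
  return static_cast<int8_t>(q);
}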