From 23f16b8778284159ffefd3525996a6f889664c10 Mon Sep 17 00:00:00 2001 From: zhaozhenlong Date: Tue, 29 Jun 2021 16:56:19 +0800 Subject: [PATCH] full connection vec matmul opt --- .../cpu/nnacl/fp16/matmul_fp16.c | 102 ++++++++++++++++++ .../cpu/nnacl/fp16/matmul_fp16.h | 4 + .../cpu/nnacl/fp32/matmul_fp32.c | 97 +++++++++++++++++ .../cpu/nnacl/fp32/matmul_fp32.h | 2 + .../kernel/arm/fp16/fullconnection_fp16.cc | 5 +- .../kernel/arm/fp16/matmul_base_fp16.cc | 43 +++++++- .../kernel/arm/fp32/matmul_fp32_base.cc | 11 ++ .../lite/test/config/models_caffe_fp16.cfg | 28 ++--- .../lite/test/config/models_onnx_fp16.cfg | 12 +-- mindspore/lite/test/config/models_tf_fp16.cfg | 8 +- .../lite/test/config/models_tflite_fp16.cfg | 16 +-- 11 files changed, 289 insertions(+), 39 deletions(-) diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/matmul_fp16.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/matmul_fp16.c index 3ce17b8e8b1..307395c9cda 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/matmul_fp16.c +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/matmul_fp16.c @@ -393,6 +393,91 @@ void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const floa return; } +#ifdef ENABLE_ARM64 +// 8 X 16 +void VecMatmulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, int depth, + int col) { + int align_col = UP_ROUND(col, C16NUM); + int ci = 0; + for (; ci < align_col - C16NUM + 1; ci += C16NUM) { + float16x8_t acc_0 = vdupq_n_f16((float16_t)0.0); + float16x8_t acc_1 = vdupq_n_f16((float16_t)0.0); + if (bias != NULL) { + acc_0 = vld1q_f16(bias + ci); + acc_1 = vld1q_f16(bias + ci + C8NUM); + } + const float16_t *bv_base = b + ci * depth; + int di = 0; + for (; di < depth - C8NUM + 1; di += C8NUM) { + float16x8_t av = vld1q_f16(a + di); + float16x8_t bv_0[C8NUM]; + float16x8_t bv_1[C8NUM]; + for (int i = 0; i < C8NUM; ++i) { + bv_0[i] = vld1q_f16(bv_base); + bv_1[i] = vld1q_f16(bv_base + C8NUM); + bv_base += C16NUM; + } + for (int i = 0; i < C8NUM; ++i) { + acc_0 = vfmaq_n_f16(acc_0, bv_0[i], av[i]); + acc_1 = vfmaq_n_f16(acc_1, bv_1[i], av[i]); + } + } + if (di < depth) { + for (; di < depth; ++di) { + float16_t ai = a[di]; + float16x8_t bv0 = vld1q_f16(bv_base); + float16x8_t bv1 = vld1q_f16(bv_base + C8NUM); + acc_0 = vfmaq_n_f16(acc_0, bv0, ai); + acc_1 = vfmaq_n_f16(acc_1, bv1, ai); + bv_base += C16NUM; + } + } // only save actual col num data + if (ci + C8NUM > col) { + int c_remain = col - ci; + for (int i = 0; i < c_remain; ++i) { + if (act_type == ActType_Relu) { + c[i] = MSMAX(acc_0[i], (float16_t)0.0); + } else if (act_type == ActType_Relu6) { + c[i] = MSMIN(MSMAX(acc_0[i], (float16_t)0.0), (float16_t)6.0); + } else { + c[i] = acc_0[i]; + } + } + return; + } + if (act_type == ActType_Relu) { + acc_0 = vmaxq_f16(acc_0, vdupq_n_f16((float16_t)0.0)); + } + if (act_type == ActType_Relu6) { + acc_0 = vminq_f16(vmaxq_f16(acc_0, vdupq_n_f16((float16_t)0.0)), vdupq_n_f16((float16_t)6.0)); + } + vst1q_f16(c, acc_0); + + if (ci + C16NUM > col) { + int c_remain = col - ci; + for (int i = 0; i < c_remain; ++i) { + if (act_type == ActType_Relu) { + c[C8NUM + i] = MSMAX(acc_1[i], (float16_t)0.0); + } else if (act_type == ActType_Relu6) { + c[C8NUM + i] = MSMIN(MSMAX(acc_1[i], (float16_t)0.0), (float16_t)6.0); + } else { + c[C8NUM + i] = acc_1[i]; + } + } + return; + } + if (act_type == ActType_Relu) { + acc_1 = vmaxq_f16(acc_1, vdupq_n_f16((float16_t)0.0)); + } + if (act_type == ActType_Relu6) { + acc_1 = vminq_f16(vmaxq_f16(acc_1, vdupq_n_f16((float16_t)0.0)), vdupq_n_f16((float16_t)6.0)); + } + vst1q_f16(c + C8NUM, acc_1); + c += C16NUM; + } +} +#endif + #ifdef ENABLE_ARM82_A32 void MatVecMulA32Fp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, int depth, int col) { @@ -675,6 +760,23 @@ void RowMajor2Row16MajorFp16(const void *src, float16_t *dst, int row, int col, } } +void RowMajor2Row16MajorFp16Opt(const float16_t *src, float16_t *dst, int row, int col) { + int col_align = UP_ROUND(col, C16NUM); + for (int r = 0; r < row; r++) { + int c = 0; + for (; c < col; c++) { + int c_div16 = c / C16NUM; + int c_mod16 = c % C16NUM; + dst[c_div16 * C16NUM * row + r * C16NUM + c_mod16] = src[r * col + c]; + } + for (; c < col_align; c++) { + int c_div16 = c / C16NUM; + int c_mod16 = c % C16NUM; + dst[c_div16 * C16NUM * row + r * C16NUM + c_mod16] = (float16_t)0.0; + } + } +} + void RowMajor2Row12MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src) { for (int r = 0; r < row; r++) { for (int c = 0; c < col; c++) { diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/matmul_fp16.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/matmul_fp16.h index 7264ef463a6..da9fafc4d45 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/matmul_fp16.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/matmul_fp16.h @@ -56,6 +56,8 @@ void MatmulFp16Neon64Opt(const float16_t *a, const float16_t *b, float16_t *c, c void MatVecMulFp16Neon64(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, int depth, int col); +void VecMatmulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, int depth, + int col); #elif ENABLE_ARM82_A32 void MatMul12x8A32Fp16(const float16_t *a, const float16_t *b, float16_t *dst, const float16_t *bias, ActType act_type, int deep, int row, int col, int stride, int write_mode); @@ -86,6 +88,8 @@ void RowMajor2Col16MajorFp16(const void *src, float16_t *dst, int row, int col, void RowMajor2Col12MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src); +void RowMajor2Row16MajorFp16Opt(const float16_t *src, float16_t *dst, int row, int col); + void RowMajor2Row16MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src); void RowMajor2Row12MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src); diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/matmul_fp32.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/matmul_fp32.c index 4a2f2955453..b027363aa41 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/matmul_fp32.c +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/matmul_fp32.c @@ -18,6 +18,9 @@ #ifdef ENABLE_SSE #include #endif +#ifdef ENABLE_ARM64 +#include +#endif void RowMajor2ColMajor(const float *src_ptr, float *dst_ptr, int row, int col) { for (int r = 0; r < row; ++r) { for (int c = 0; c < col; ++c) { @@ -881,6 +884,100 @@ void MatVecMulFp32(const float *a, const float *b, float *c, const float *bias, } } #endif + +#ifdef ENABLE_ARM64 +// 4x8 +void MatVecMulFp32Neon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col, + int align_col) { + int ci = 0; + for (; ci < align_col - C8NUM + 1; ci += C8NUM) { + float32x4_t acc_0; + float32x4_t acc_1; + if (bias != NULL) { + acc_0 = vld1q_f32(bias + ci); + acc_1 = vld1q_f32(bias + ci + C4NUM); + } else { + acc_0 = vdupq_n_f32(0.0f); + acc_1 = vdupq_n_f32(0.0f); + } + const float *bv_base = b + ci * depth; + int di = 0; + for (; di < depth - C4NUM + 1; di += C4NUM) { + float32x4_t av = vld1q_f32(a + di); + float32x4_t bv_00 = vld1q_f32(bv_base); + float32x4_t bv_10 = vld1q_f32(bv_base + C4NUM); + bv_base += C8NUM; + float32x4_t bv_01 = vld1q_f32(bv_base); + float32x4_t bv_11 = vld1q_f32(bv_base + C4NUM); + bv_base += C8NUM; + float32x4_t bv_02 = vld1q_f32(bv_base); + float32x4_t bv_12 = vld1q_f32(bv_base + C4NUM); + bv_base += C8NUM; + float32x4_t bv_03 = vld1q_f32(bv_base); + float32x4_t bv_13 = vld1q_f32(bv_base + C4NUM); + bv_base += C8NUM; + acc_0 = vmlaq_n_f32(acc_0, bv_00, av[0]); + acc_1 = vmlaq_n_f32(acc_1, bv_10, av[0]); + acc_0 = vmlaq_n_f32(acc_0, bv_01, av[1]); + acc_1 = vmlaq_n_f32(acc_1, bv_11, av[1]); + acc_0 = vmlaq_n_f32(acc_0, bv_02, av[2]); + acc_1 = vmlaq_n_f32(acc_1, bv_12, av[2]); + acc_0 = vmlaq_n_f32(acc_0, bv_03, av[3]); + acc_1 = vmlaq_n_f32(acc_1, bv_13, av[3]); + } + if (di < depth) { + for (; di < depth; ++di) { + float ai = a[di]; + float32x4_t bv0 = vld1q_f32(bv_base); + float32x4_t bv1 = vld1q_f32(bv_base + C4NUM); + acc_0 = vmlaq_n_f32(acc_0, bv0, ai); + acc_1 = vmlaq_n_f32(acc_1, bv1, ai); + bv_base += C8NUM; + } + } // only save actual col num data + if (ci + C4NUM - 1 >= col) { + int c_remain = col - ci; + for (int i = 0; i < c_remain; ++i) { + if (act_type == ActType_Relu) { + c[i] = MSMAX(acc_0[i], 0.0f); + } else if (act_type == ActType_Relu6) { + c[i] = MSMIN(MSMAX(acc_0[i], 0.0f), 6.0f); + } else { + c[i] = acc_0[i]; + } + } + return; + } + if (act_type == ActType_Relu) { + acc_0 = vmaxq_f32(acc_0, vdupq_n_f32(0.0f)); + } else if (act_type == ActType_Relu6) { + acc_0 = vminq_f32(vmaxq_f32(acc_0, vdupq_n_f32(0.0f)), vdupq_n_f32(6.0f)); + } + vst1q_f32(c, acc_0); + if (ci + C8NUM - 1 >= col) { + int c_remain = col - ci; + for (int i = 0; i < c_remain; ++i) { + if (act_type == ActType_Relu) { + c[C4NUM + i] = MSMAX(acc_1[i], 0.0f); + } else if (act_type == ActType_Relu6) { + c[C4NUM + i] = MSMIN(MSMAX(acc_1[i], 0.0f), 6.0f); + } else { + c[C4NUM + i] = acc_1[i]; + } + } + return; + } + if (act_type == ActType_Relu) { + acc_1 = vmaxq_f32(acc_1, vdupq_n_f32(0.0f)); + } else if (act_type == ActType_Relu6) { + acc_1 = vminq_f32(vmaxq_f32(acc_1, vdupq_n_f32(0.0f)), vdupq_n_f32(6.0f)); + } + vst1q_f32(c + C4NUM, acc_1); + c += C8NUM; + } +} +#endif + void MatMul12x8(const float *a, const float *b, float *dst, const float *bias, ActType act_type, int deep, int row, int col, int stride, int out_type) { if (out_type == OutType_Nhwc) { diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/matmul_fp32.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/matmul_fp32.h index a82587fd3bc..466cb33d0ec 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/matmul_fp32.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/matmul_fp32.h @@ -65,6 +65,8 @@ void MatmulFloatNeon64OptRow4(const float *a, const float *b, float *c, const fl int row, int col, size_t stride, size_t write_mode); void MatmulFloatNeon64OptRow12(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row, int col, size_t stride, size_t write_mode); +void MatVecMulFp32Neon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col, + int align_col); #elif ENABLE_ARM32 void MatmulFloatNeon32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row, int col, int stride, size_t writeNhwc, size_t WriteWino); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc index fbdcbde4436..47da33433ef 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc @@ -50,12 +50,11 @@ int FullconnectionFP16CPUKernel::Init() { params_->a_transpose_ = false; params_->b_transpose_ = true; - MatmulBaseFP16CPUKernel::InitParameter(); - + params_->a_const_ = (in_tensors_[0]->data_c() != nullptr); + params_->b_const_ = (in_tensors_[1]->data_c() != nullptr); if (params_->a_const_ == true) { InitAShape(); } - if (params_->b_const_ == true) { InitBShape(); } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.cc index 50f3a63c3ae..3e0c817d5d9 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.cc @@ -100,9 +100,18 @@ int MatmulBaseFP16CPUKernel::ReSize() { free(src_b_); src_b_ = nullptr; } - - thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(params_->col_, C8NUM)); - thread_stride_ = UP_DIV(UP_DIV(params_->col_, C8NUM), thread_count_) * C8NUM; + if (vec_matmul_) { +#ifdef ENABLE_ARM64 + thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(params_->col_, C16NUM)); + thread_stride_ = UP_DIV(UP_DIV(params_->col_, C16NUM), thread_count_) * C16NUM; +#else + thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(params_->col_, C8NUM)); + thread_stride_ = UP_DIV(UP_DIV(params_->col_, C8NUM), thread_count_) * C8NUM; +#endif + } else { + thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(params_->col_, C8NUM)); + thread_stride_ = UP_DIV(UP_DIV(params_->col_, C8NUM), thread_count_) * C8NUM; + } return RET_OK; } @@ -113,7 +122,11 @@ void MatmulBaseFP16CPUKernel::ResizeParameter() { if (vec_matmul_) { params_->row_align_ = 1; +#ifdef ENABLE_ARM64 + params_->col_align_ = UP_ROUND(params_->col_, C16NUM); +#else params_->col_align_ = params_->col_; +#endif } else { params_->row_align_ = UP_ROUND(params_->row_, row_tile_); params_->col_align_ = UP_ROUND(params_->col_, C8NUM); @@ -188,13 +201,27 @@ void MatmulBaseFP16CPUKernel::InitMatrixB(void *src_ptr, TypeId src_data_type) { Float32ToFloat16(reinterpret_cast(src_ptr), b_pack_ptr_, params_->batch * params_->col_ * params_->deep_); } else { +#ifdef ENABLE_ARM64 + for (auto i = 0; i < params_->batch; ++i) { + const auto *b_src = reinterpret_cast(src_ptr) + i * params_->col_align_ * params_->deep_; + auto *dst = b_pack_ptr_ + i * params_->col_align_ * params_->deep_; + RowMajor2Col16MajorFp16Opt(b_src, dst, params_->col_align_, params_->deep_); + } +#else memcpy(b_pack_ptr_, src_ptr, params_->batch * params_->col_ * params_->deep_ * sizeof(float16_t)); +#endif } } else { for (int i = 0; i < params_->batch; i++) { +#ifdef ENABLE_ARM64 + const auto *b_src = reinterpret_cast(src_ptr) + i * params_->col_align_ * params_->deep_; + auto *dst = b_pack_ptr_ + i * params_->col_align_ * params_->deep_; + RowMajor2Row16MajorFp16Opt(b_src, dst, params_->deep_, params_->col_); +#else const int8_t *batch_src = int8_src + i * params_->deep_ * params_->col_ * lite::DataTypeSize(src_data_type); float16_t *dst = b_pack_ptr_ + i * params_->deep_ * params_->col_; RowMajor2ColMajorFp16(batch_src, dst, params_->deep_, params_->col_, src_data_type == kNumberTypeFloat32); +#endif } } return; @@ -210,7 +237,7 @@ void MatmulBaseFP16CPUKernel::InitMatrixB(void *src_ptr, TypeId src_data_type) { } } return; -} +} // namespace mindspore::kernel int MatmulBaseFP16CPUKernel::Init() { ResizeParameter(); @@ -259,7 +286,11 @@ int MatmulBaseFP16CPUKernel::RunImpl(int task_id) { auto c = batch_c_ptr_ + task_id * thread_stride_; if (vec_matmul_) { +#ifdef ENABLE_ARM64 + VecMatmulFp16(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, cur_oc); +#else MatVecMulFp16(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, cur_oc); +#endif } else { MatMulFp16(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, params_->row_, cur_oc, params_->col_, OutType_Nhwc); @@ -288,7 +319,11 @@ int MatmulBaseFP16CPUKernel::Run() { for (int i = 0; i < params_->batch; ++i) { if (vec_matmul_) { batch_a_ptr_ = a_pack_ptr_ + i * params_->deep_; +#ifdef ENABLE_ARM64 + batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_align_; +#else batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_; +#endif batch_c_ptr_ = c_ptr + i * params_->row_ * params_->col_; } else { batch_a_ptr_ = a_pack_ptr_ + i * params_->row_align_ * params_->deep_; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.cc index c5f7d907c74..2d53861b376 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.cc @@ -53,6 +53,8 @@ void MatmulFp32BaseCPUKernel::ResizeParameter() { #ifdef ENABLE_AVX // vector matmul col is aligned to C8NUM in avx col_tile_ = C8NUM; +#elif defined(ENABLE_ARM64) + col_tile_ = C8NUM; #endif row_tile_ = 1; } @@ -60,6 +62,9 @@ void MatmulFp32BaseCPUKernel::ResizeParameter() { #ifdef ENABLE_AVX // avx is aligned to col_tile_ params_->col_align_ = UP_ROUND(params_->col_, col_tile_); +#elif defined(ENABLE_ARM64) + // no matter vec_matmul_ or not, use col_tile_ to get col_align_ + params_->col_align_ = UP_ROUND(params_->col_, col_tile_); #else params_->col_align_ = vec_matmul_ ? params_->col_ : UP_ROUND(params_->col_, col_tile_); #endif @@ -170,12 +175,16 @@ int MatmulFp32BaseCPUKernel::InitMatrixB(const float *src_ptr) { if (params_->b_transpose_) { #ifdef ENABLE_AVX RowMajor2Col32Major(src_data, dst, params_->deep_, params_->col_); +#elif defined(ENABLE_ARM64) + RowMajor2Col8Major(src_data, dst, params_->col_, params_->deep_); #else memcpy(dst, src_data, params_->col_ * params_->deep_ * sizeof(float)); #endif } else { #ifdef ENABLE_AVX RowMajor2Row32Major(src_data, dst, params_->col_, params_->deep_); +#elif defined(ENABLE_ARM64) + RowMajor2Row8Major(src_data, dst, params_->deep_, params_->col_); #else RowMajor2ColMajor(src_data, dst, params_->deep_, params_->col_); #endif @@ -248,6 +257,8 @@ int MatmulFp32BaseCPUKernel::FloatRun(int task_id) { if (vec_matmul_) { #ifdef ENABLE_AVX MatVecMulAvxFp32(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, cur_oc, params_->col_align_); +#elif defined(ENABLE_ARM64) + MatVecMulFp32Neon64(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, cur_oc, params_->col_align_); #else MatVecMulFp32(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, cur_oc); #endif diff --git a/mindspore/lite/test/config/models_caffe_fp16.cfg b/mindspore/lite/test/config/models_caffe_fp16.cfg index ba8218d9464..3c7bd66649c 100644 --- a/mindspore/lite/test/config/models_caffe_fp16.cfg +++ b/mindspore/lite/test/config/models_caffe_fp16.cfg @@ -6,7 +6,7 @@ beard 2 emotion 60 gender_res_large_deploy 0.1 glasses 4 -hat 1 +hat 2.5 isface 1 ml_bank_detect_0312_tmp 20 ml_face_div_parsing 8 @@ -24,8 +24,8 @@ mtk_detect-deeper-halfdeeper-mbv1-lastearlySSD-shortcut-400-400_nopostprocess_si # mtk_detect-deeper-halfdeeper-mbv1-shortcut-400-400_nopostprocess_simplified: precision is 5% detect-deeper-halfdeeper-mbv1-shortcut-400-400_nopostprocess_simplified 5.5 hiai_face_detect_rfb 4 -hiai_face_isface 0.1 -hiai_face_landmark 0.2 +hiai_face_isface 0.2 +hiai_face_landmark 0.3 hiai_face_pose_tuku 1.3 ml_hand_detection 8 ml_ocr_cn 6 @@ -45,17 +45,17 @@ model_hebing_3branch 40 hiai_cv_focusShootOCRModel_07 3 hiai_cv_focusShootOCRModel_03 60 hiai_cv_focusShootOCRModel_01 14 -hiai_face_hat1 1 +hiai_face_hat1 1.7 hiai_cv_focusShootOCRModel_04 8 hiai_cv_focusShootOCRModel_06 13 -hiai_cpu_face_hat 0.3 +hiai_cpu_face_hat 1.7 hiai_video_seg 1 hiai_semantic_seg 3 hiai_human_seg 28 hiai_face_recognition_1 10 hiai_cpu_face_detect 4.5 -hiai_cpu_face_attr 12 -hiai_face_attr1 12 +hiai_cpu_face_attr 82.3 # divded by small number causes big bias +hiai_face_attr1 82.3 # divded by small number causes big bias # mtk_detect-mbv1-shortcut-400-400_nopostprocess_simplified: precision is 5% mtk_detect-mbv1-shortcut-400-400_nopostprocess_simplified 5.5 mtk_detect_mbv1_640_480_nopostprocess_simplified 5 @@ -79,7 +79,7 @@ hdc_contour_pose_128 0.5 hdc_emotion 0.5 hdc_fivembnet 1 hdc_isface 0.5 -hdc_mobilenetface 8.5 +hdc_mobilenetface 11.5 # small output causes big bias hdc_retinaface 14 hdc_resnet 7 ml_video_edit_detect 2.5 @@ -94,13 +94,13 @@ ml_video_edit_video_segment_gauss_adaptis_part1 5 # When the input range is [-1,1], the precision is poor, and the output value is very small (10e-5). If the input range is adjusted to [0,255], the precision will decrease to 15.5415%, and the rest is cumulative error. ml_handpose 175 hdc_Face_Aesthetic_MTI_Aesthetic 0.5 -ml_face_compare 5.5 +ml_face_compare 8.7 ml_face_tracking 2.5 ml_face_beard 0.5 ml_face_age 3.5 ml_face_pose 1 ml_face_isface 0.5 -ml_face_glasses 2.5 +ml_face_glasses 3.4 # ml_segmentation_matting 26 # output value unstable ml_segmentation_atlanta_10 5 # ml_bodymask: The difference of output node divided by a very small value leads to a large error @@ -108,13 +108,13 @@ ml_bodymask 16 ml_Hand_deploy 4 # ml_hand_3d_detection: The difference of output node divided by a very small value leads to a large error ml_hand_3d_detection 12 -ml_hand_3d_regression 3 +ml_hand_3d_regression 5.4 # ml_ARengine23_bodypose: The difference of output node divided by a very small value leads to a large error ml_ARengine23_bodypose 56 ml_ocr_bank_card_detection_inception_tmp 20 ml_ocr_bank_card_recognition_fcny 0.5 hiai_cv_aestheticsEngineModel_osp 1.6 -ml_face_hat 0.5 +ml_face_hat 2.2 bank_card_recognition_fcny 17 bank_card_detection_inception_tmp 12 ml_ocr_identify_card_fcny 0.5 @@ -122,8 +122,8 @@ ml_ocr_identify_card_detect_tmp 2 identify_card_detect_tmp 0.5 ml_2012_ocr_detection_caffe_tmp 1 ml_2012_ocr_rec_caffe 0.5 -ml_lable_model_hebing_device 2 -ml_face_sex 0.5 +ml_lable_model_hebing_device 3 +ml_face_sex 0.6 # ml_face_mnet: The precision problem caused by cumulative error. ml_face_mnet 12 ml_segmentation_atlanta_1 0.5 diff --git a/mindspore/lite/test/config/models_onnx_fp16.cfg b/mindspore/lite/test/config/models_onnx_fp16.cfg index aff8571d28a..4c23284359c 100644 --- a/mindspore/lite/test/config/models_onnx_fp16.cfg +++ b/mindspore/lite/test/config/models_onnx_fp16.cfg @@ -61,8 +61,8 @@ ml_ei_facedetection.onnx 2 #ml_video_edit_art_generate.onnx #mul operator overflows, not suitable for fp16 #ml_voice_detect.onnx #conv operator overflows, not suitable for fp16 #ml_location_lane_counter.onnx has very small values during op computation (<1e-6), which causes the precision variation -ml_location_lane_counter.onnx 7 -ml_location_lane_counter0.onnx 0.5 +ml_location_lane_counter.onnx 7.5 +ml_location_lane_counter0.onnx 1.0 #The encoder an decoder model are used in ml_asr scene, both have value overflow. Not suitable for fp16. #But added for guarding process. encoder.onnx;1;1,32,83 1262 @@ -75,19 +75,19 @@ mtk_detect-mbv1-shortcut-400-400_nopostprocess_simplified_onnx.onnx 6.5 mtk_detect-deeper-halfdeeper-mbv1-lastearlySSD-shortcut-400-400_nopostprocess_simplified_onnx.onnx 2.5 mtk_detect_mbv1_640_480_nopostprocess_simplified_onnx.onnx;1;1,480,640,3 2 mtk_face_features_v2.onnx;1;1,256,192,3 0.5 -mtk_face_recognition_v3.onnx 0.5 +mtk_face_recognition_v3.onnx 1.1 mtk_face_recognition_v2.onnx 2.5 ml_2012_ocr_detection_tmp.onnx 0.5 -Harmony_Voiceprint_resnet18.onnx;1;1,150,40,1 4.5 +Harmony_Voiceprint_resnet18.onnx;1;1,150,40,1 5.5 bloom_hongmo_detection_tmp.onnx 0.5 -Q_face_recognition.onnx 3 +Q_face_recognition.onnx 3.2 ml_video_edit_enhance_update_tmp.onnx 0.5 Q888_face_recognition.onnx 3.5 Q888_iris_detect.onnx 0.5 ssd_mobilenet_v1_10.onnx;1;1,383,640,3 0.5 # The output from a conv in the later part contains many minus values, the following leakyRelu makes them become very # close to 0 (-e^-4). The fp16 precision lost a lot in this case and it affects the following computation. -Harmony_Voiceprint.onnx;1;1,200,40,1 5.5 +Harmony_Voiceprint.onnx;1;1,200,40,1 21.5 # small output causes big bias # A matmul op in the later part produces overflowed output values (>65504). #ml_video_edit_art_generate_20210513.onnx nan ml_asr_encoder_int8_202103.onnx;;;4 2.1 diff --git a/mindspore/lite/test/config/models_tf_fp16.cfg b/mindspore/lite/test/config/models_tf_fp16.cfg index 681e8f65e48..cc9fd6c97d2 100644 --- a/mindspore/lite/test/config/models_tf_fp16.cfg +++ b/mindspore/lite/test/config/models_tf_fp16.cfg @@ -33,7 +33,7 @@ mtk_face_features_v1.pb 26 model_normalize_object_scene_ps_20200519.pb;1;1,224,224,3 10 hiai_AADB_HADB_MBV2_model.pb;1;1,224,224,3 6 hiai_frozen_inference_graph.pb 8 -hiai_lm_inference_graph.pb 0.6 +hiai_lm_inference_graph.pb 1.2 hiai_ghostnet.pb 0.9 hiai_face_model_npu.pb 0.5 hiai_cv_focusShootOCRModel_02.pb 10.5 @@ -60,9 +60,9 @@ bolt_segment.pb 2 siteAI_wireless_depress_w.pb;1;1,36 0.5 siteAI_wireless_restore_w.pb;1;1,36 0.5 siteAI_trans_nonlinear.pb;1;1,137 0.5 -siteAI_trans_nonlinear40g.pb;1;1,271 0.5 +siteAI_trans_nonlinear40g.pb;1;1,271 0.6 siteAI_trans_nonlinear134g.pb;1;1,137 0.5 -siteAI_trans_nonlinear134g_nrz.pb;1;1,182 0.5 +siteAI_trans_nonlinear134g_nrz.pb;1;1,182 0.6 ml_vision_guide_detection2.pb;1;1,320,320,1 1 # ml_tts_encoder.pb has a round op, which will cause round-off error when the decimal of input value is near 0.5 ml_tts_encoder.pb;4;1:1,44:1:1 9 @@ -85,4 +85,4 @@ ml_tts_decoder_control_flow.pb;5 1 ml_tts_decoder.pb;5 2.5 ml_tts_vocoder.pb;66 53 hiai_transformer_encoder.pb;15 4 -decoder_step_nocumsum_v5.pb;13;1:1,512:1,1429,2:1,127:1,127:1,127:1,127,320:1,80:1,512:1,512:1,512:1,512:1,512 0.5 +decoder_step_nocumsum_v5.pb;13;1:1,512:1,1429,2:1,127:1,127:1,127:1,127,320:1,80:1,512:1,512:1,512:1,512:1,512 1.2 diff --git a/mindspore/lite/test/config/models_tflite_fp16.cfg b/mindspore/lite/test/config/models_tflite_fp16.cfg index 3e6acafcdce..24d175ad9ee 100644 --- a/mindspore/lite/test/config/models_tflite_fp16.cfg +++ b/mindspore/lite/test/config/models_tflite_fp16.cfg @@ -75,8 +75,8 @@ mtk_model_emotions_0725_fp16.tflite 3 mtk_face_features_v1_fp16.tflite 20 siteAI_digcom_AI_ECN.tflite 0.1 siteAI_digcom_g2v_keras.tflite 5 -siteAI_trans_nonlinear.tflite 0.1 -siteAI_trans_tcpclassify.tflite 5 +siteAI_trans_nonlinear.tflite 0.2 +siteAI_trans_tcpclassify.tflite 5.3 siteAI_wireless_depress_w.tflite 8 siteAI_wireless_restore_w.tflite 0.1 magenta_arbitrary-image-stylization-v1-256_fp16_prediction_1.tflite 5 @@ -123,7 +123,7 @@ lite-model_cartoongan_fp16_1.tflite 3 lite-model_arbitrary-image-stylization-inceptionv3_fp16_predict_1.tflite 6 gts_detect_5k_tf115.tflite 9.5 mtk_isface.tflite 0.2 -mtk_landmark.tflite 0.1 +mtk_landmark.tflite 0.3 mtk_new_detect.tflite 3 mtk_pose.tflite 2 mtk_model_emotions_0727_nosoftmax.tflite 2 @@ -132,7 +132,7 @@ mtk_276landmark_0913.tflite 16 mtk_face_recognition.tflite 8 mtk_convert_model.tflite 5 smartreply.tflite 0.1 -mindspore_text_classification_tflite.tflite 4 +mindspore_text_classification_tflite.tflite 9.2 # small output causes big bias #ml_location.tflite 0.1 ml_text_correction.tflite 1 # ml_pic_shopping.tflite involves subtract two close numbers. @@ -147,7 +147,7 @@ ml_ocr_jk_pb2tflite.tflite 0.5 ml_ocr_latin_pb2tflite.tflite 11.5 scan_hms_angle_pb2tflite.tflite 2.5 scan_hms_detect_pb2tflite.tflite 1.5 -ml_location.tflite 0.5 +ml_location.tflite 0.6 ml_face_openclose_tflite.tflite 0.5 ml_object_detect_pb2tflite.tflite 1.5 # lite-model_on_device_vision_classifier_landmarks_classifier* models' bias are caused by error accumulation and small @@ -189,7 +189,7 @@ Q_landmark.tflite 0.5 Q_new_detect.tflite 3.5 # the input of Q_object_scene model is between 0-255 Q_object_scene.tflite 3 -Q_pose.tflite 1.5 +Q_pose.tflite 4.1 Q_detect_fpn_add_inception-1448650.tflite 1 bloom_landmark.tflite 0.5 # input data: 0~255 @@ -197,7 +197,7 @@ Q888_age_gender_orderd.tflite 1.5 Q888_face_dress_mv3y.tflite 0.5 Q888_HADB_AADB_MBV2_model_fp32.tflite 2.5 Q888_landmark.tflite 0.5 -Q888_pose.tflite 5 +Q888_pose.tflite 6.1 # the output contains value less than e-7 Q888_lapa158_unet_0924.tflite 19 Q888_isface.tflite 1.0 @@ -219,4 +219,4 @@ hdc_tb_cn_neg.tflite;3 295 # The input of hiai_cv_labelDetectorModel_v3.tflite is between 0-255. hiai_cv_labelDetectorModel_v3.tflite;2 2 ml_headpose_pb2tflite.tflite;3;16:1,64,64,3:16 1 -ml_ei_headpose_pb2tflite.tflite;3;16:1,64,64,3:16 0.5 +ml_ei_headpose_pb2tflite.tflite;3;16:1,64,64,3:16 0.6