!16305 [MS][LITE][CPU] arm32 fp16 support npu and weight quant

From: @lzkcode
Reviewed-by: @zhanghaibo5,@zhang_xue_tong
Signed-off-by: @zhang_xue_tong
This commit is contained in:
mindspore-ci-bot 2021-05-17 09:23:43 +08:00 committed by Gitee
commit 5e7e39f925
7 changed files with 56 additions and 55 deletions

View File

@ -56,7 +56,7 @@ int Relu6Fp16(const float16_t *data, float16_t *dst, int ele_num) {
int LReluFp16(const float16_t *src, float16_t *dst, int ele_num, float16_t alpha) {
int i = 0;
#ifdef ENABLE_NEON
int ele_c8 = UP_ROUND(ele_num, C8NUM);
int ele_c8 = DOWN_ROUND(ele_num, C8NUM);
for (; i < ele_c8; i += C8NUM) {
float16x8_t src_tmp = vld1q_f16(src + i);
float16x8_t mul_tmp = vmulq_n_f16(src_tmp, alpha);
@ -187,7 +187,7 @@ int GeluFp16(const float16_t *src, int length, float16_t *dst, bool approximate)
if (approximate) {
// dst = 0.5 * x * (1 + tanh((2 / pi) ^ 0.5 * (x + 0.044715x^3)))
#ifdef ENABLE_NEON
int C8 = UP_ROUND(length, C8NUM);
int C8 = DOWN_ROUND(length, C8NUM);
for (; i < C8; i += C8NUM) {
float16x8_t in = vld1q_f16(src + i);
float16x8_t res =
@ -202,7 +202,7 @@ int GeluFp16(const float16_t *src, int length, float16_t *dst, bool approximate)
}
} else {
#ifdef ENABLE_NEON
int C8 = UP_ROUND(length, C8NUM);
int C8 = DOWN_ROUND(length, C8NUM);
for (; i < C8; i += C8NUM) {
float16x8_t in = vld1q_f16(src + i);
const float16x8_t res = 0.5f * in * (1.0f + MS_ERFX8_F16(in / (float16_t)1.4142135623730951f));

View File

@ -70,7 +70,7 @@ void PowerBroadCastFp16(const float16_t *input, const float16_t *exponent, float
}
int i = 0;
#ifdef ENABLE_NEON
int len_c8 = UP_ROUND(len, C8NUM);
int len_c8 = DOWN_ROUND(len, C8NUM);
float16x8_t scale_8 = vmovq_n_f16(scale);
float16x8_t shift_8 = vmovq_n_f16(shift);
for (; i < len_c8; i += C8NUM) {
@ -88,7 +88,7 @@ void PowerSingleFp16(const float16_t *input, const float16_t *exponent, float16_
int i = 0;
PowerScalarFunFp16 PowerScalarFunFp16_ = NULL;
#ifdef ENABLE_NEON
int len_c8 = UP_ROUND(len, C8NUM);
int len_c8 = DOWN_ROUND(len, C8NUM);
float16x8_t scale_8 = vmovq_n_f16(scale);
float16x8_t shift_8 = vmovq_n_f16(shift);
for (; i < len_c8; i += C8NUM) {

View File

@ -228,7 +228,7 @@ int Gelu(const float *src, int length, float *dst, bool approximate) {
if (approximate) {
// dst = 0.5 * x * (1 + tanh((2 / pi) ^ 0.5 * (x + 0.044715x^3)))
#if defined(ENABLE_AVX)
int C8 = UP_ROUND(length, C8NUM);
int C8 = DOWN_ROUND(length, C8NUM);
for (; i < C8; i += C8NUM) {
MS_FLOAT32X8 in = MS_LD256_F32(src + i);
MS_FLOAT32X8 res = 0.5 * in * (1.0 + MS_TANHX8_F32((0.79788456080287f + 0.035677408136f * in * in) * in));
@ -236,7 +236,7 @@ int Gelu(const float *src, int length, float *dst, bool approximate) {
}
#endif
#if defined(ENABLE_SSE) || defined(ENABLE_ARM)
int C4 = UP_ROUND(length, C4NUM);
int C4 = DOWN_ROUND(length, C4NUM);
for (; i < C4; i += C4NUM) {
MS_FLOAT32X4 in = MS_LDQ_F32(src + i);
MS_FLOAT32X4 res = 0.5 * in * (1.0 + MS_TANHX4_F32((0.79788456080287f + 0.035677408136f * in * in) * in));
@ -248,7 +248,7 @@ int Gelu(const float *src, int length, float *dst, bool approximate) {
}
} else {
#if defined(ENABLE_AVX) || defined(ENABLE_SSE) || defined(ENABLE_ARM)
int C4 = UP_ROUND(length, C4NUM);
int C4 = DOWN_ROUND(length, C4NUM);
for (; i < C4; i += C4NUM) {
MS_FLOAT32X4 in = MS_LDQ_F32(src + i);
MS_FLOAT32X4 res = 0.5 * in * (1.0 + MS_ERFX4_F32(in / 1.4142135623730951f));

View File

@ -158,9 +158,9 @@ void ConvSWFp32(const float *input_data, const float *packed_weight, const float
if (ow_block < ow_block_num[oc_block - 1]) { // ow is not enough and process one ow
ow_block = 1;
}
kernel[oc_block - 1][ow_block > 1](dst_w + ow * sw_param->block_channel_, src_w, weight, bias, kernel_h,
kernel_w, act_type, ow_block, oc_block, oc_algin, ic_algin, in_kw_step,
in_kh_step, in_sw_step, 0);
kernel[oc_block - 1][ow_block / ow_block_num[oc_block - 1]](
dst_w + ow * sw_param->block_channel_, src_w, weight, bias, kernel_h, kernel_w, act_type, ow_block,
oc_block, oc_algin, ic_algin, in_kw_step, in_kh_step, in_sw_step, 0);
src_w += ow_block * in_sw_step;
}
// ow in left
@ -177,11 +177,11 @@ void ConvSWFp32(const float *input_data, const float *packed_weight, const float
void SWConv3x32Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
in_kh_step *= 4;
in_sw_step *= 4;
in_kw_step *= 4;
oc_algin *= 4;
kw_remainder *= 4;
in_kh_step *= sizeof(float);
in_sw_step *= sizeof(float);
in_kw_step *= sizeof(float);
oc_algin *= sizeof(float);
kw_remainder *= sizeof(float);
asm volatile(
"cmpq $0, %2\n"
"je 0f\n"
@ -315,10 +315,10 @@ void SWConv3x32Kernel(float *dst, const float *src, const float *weight, const f
void SWConv1x32Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
in_kh_step *= 4;
in_kw_step *= 4;
oc_algin *= 4;
kw_remainder *= 4;
in_kh_step *= sizeof(float);
in_kw_step *= sizeof(float);
oc_algin *= sizeof(float);
kw_remainder *= sizeof(float);
asm volatile(
"cmpq $0, %2\n"
"je 0f\n"
@ -397,13 +397,13 @@ void SWConv1x32Kernel(float *dst, const float *src, const float *weight, const f
void SWConv4x24Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
in_kh_step *= 4;
in_kw_step *= 4;
in_sw_step *= 4;
kw_remainder *= 4;
in_kh_step *= sizeof(float);
in_kw_step *= sizeof(float);
in_sw_step *= sizeof(float);
kw_remainder *= sizeof(float);
size_t src_3_step = 3 * in_sw_step;
float *dst_3 = dst + 3 * oc_algin;
oc_algin *= 4;
oc_algin *= sizeof(float);
asm volatile(
"cmpq $0, %2\n"
"je 0f\n"
@ -545,10 +545,10 @@ void SWConv4x24Kernel(float *dst, const float *src, const float *weight, const f
void SWConv1x24Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
in_kh_step *= 4;
in_kw_step *= 4;
kw_remainder *= 4;
oc_algin *= 4;
in_kh_step *= sizeof(float);
in_kw_step *= sizeof(float);
kw_remainder *= sizeof(float);
oc_algin *= sizeof(float);
asm volatile(
"cmpq $0, %2\n"
"je 0f\n"
@ -621,13 +621,13 @@ void SWConv1x24Kernel(float *dst, const float *src, const float *weight, const f
void SWConv6x16Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
in_kh_step *= 4;
in_kw_step *= 4;
in_sw_step *= 4;
kw_remainder *= 4;
in_kh_step *= sizeof(float);
in_kw_step *= sizeof(float);
in_sw_step *= sizeof(float);
kw_remainder *= sizeof(float);
size_t src_3_step = 3 * in_sw_step;
float *dst_3 = dst + 3 * oc_algin;
oc_algin *= 4;
oc_algin *= sizeof(float);
asm volatile(
"cmpq $0, %2\n"
"je 0f\n"
@ -772,10 +772,10 @@ void SWConv6x16Kernel(float *dst, const float *src, const float *weight, const f
void SWConv1x16Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
in_kh_step *= 4;
in_kw_step *= 4;
kw_remainder *= 4;
oc_algin *= 4;
in_kh_step *= sizeof(float);
in_kw_step *= sizeof(float);
kw_remainder *= sizeof(float);
oc_algin *= sizeof(float);
asm volatile(
"cmpq $0, %2\n"
"je 0f\n"
@ -842,15 +842,15 @@ void SWConv1x16Kernel(float *dst, const float *src, const float *weight, const f
void SWConv12x8Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
in_kh_step *= 4;
in_sw_step *= 4;
in_kw_step *= 4;
kw_remainder *= 4;
in_kh_step *= sizeof(float);
in_sw_step *= sizeof(float);
in_kw_step *= sizeof(float);
kw_remainder *= sizeof(float);
size_t src_3_step = 3 * in_sw_step;
float *dst_3 = dst + 3 * oc_algin;
float *dst_5 = dst + 5 * oc_algin;
float *dst_9 = dst + 9 * oc_algin;
oc_algin *= 4;
oc_algin *= sizeof(float);
asm volatile(
"cmpq $0, %0\n"
"je 0f\n"
@ -1001,12 +1001,12 @@ void SWConv12x8Kernel(float *dst, const float *src, const float *weight, const f
void SWConv4x8Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
in_kh_step *= 4;
in_sw_step *= 4;
in_kw_step *= 4;
in_kh_step *= sizeof(float);
in_sw_step *= sizeof(float);
in_kw_step *= sizeof(float);
size_t src_step = 3 * in_sw_step;
float *dst_3 = dst + 3 * oc_algin;
oc_algin *= 4;
oc_algin *= sizeof(float);
asm volatile(
"cmpq $0, %2\n"
"je 0f\n"
@ -1091,10 +1091,10 @@ void SWConv4x8Kernel(float *dst, const float *src, const float *weight, const fl
void SWConv1x8Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
in_kh_step *= 4;
in_kw_step *= 4;
kw_remainder *= 4;
oc_algin *= 4;
in_kh_step *= sizeof(float);
in_kw_step *= sizeof(float);
kw_remainder *= sizeof(float);
oc_algin *= sizeof(float);
asm volatile(
"cmpq $0, %2\n"
"je 0f\n"

View File

@ -67,7 +67,7 @@ void PowerBroadCast(const float *input, const float *exponent, float *output, in
}
int i = 0;
#if defined(ENABLE_AVX) || defined(ENABLE_SSE) || defined(ENABLE_ARM)
int len_c4 = UP_ROUND(len, C4NUM);
int len_c4 = DOWN_ROUND(len, C4NUM);
MS_FLOAT32X4 scale_4 = MS_MOVQ_F32(scale);
MS_FLOAT32X4 shift_4 = MS_MOVQ_F32(shift);
for (; i < len_c4; i += C4NUM) {
@ -84,7 +84,7 @@ void PowerSingle(const float *input, const float *exponent, float *output, int l
int i = 0;
PowerScalarFun PowerScalarFun_ = NULL;
#if defined(ENABLE_AVX) || defined(ENABLE_SSE) || defined(ENABLE_ARM)
int len_c4 = UP_ROUND(len, C4NUM);
int len_c4 = DOWN_ROUND(len, C4NUM);
MS_FLOAT32X4 scale_4 = MS_MOVQ_F32(scale);
MS_FLOAT32X4 shift_4 = MS_MOVQ_F32(shift);
for (; i < len_c4; i += C4NUM) {

View File

@ -39,7 +39,8 @@
#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
#define UP_ROUND(x, y) (((x) + (y) - (1)) / (y) * (y))
#define UP_ROUND_DIV(x, y) (x % y == 0 ? (x / y) : (x / y) + 1)
#define DOWN_DIV(x, y) (((x) - (y) + (1)) / (y))
#define DOWN_DIV(x, y) ((x) / (y))
#define DOWN_ROUND(x, y) ((x) / (y) * (y))
#define MSVALID(left, x, right) (MSMIN((MSMAX(left, x)), right))

View File

@ -186,8 +186,8 @@ int ConvolutionSWCPUKernel::Run() {
FreeTmpBuffer();
return error_code;
}
auto out_data = reinterpret_cast<float *>(out_tensors_.front()->MutableData());
if (oc_res_ != 0) {
auto out_data = reinterpret_cast<float *>(out_tensors_.front()->MutableData());
PackNHWCXToNHWCFp32(output_data_, out_data, conv_param_->output_batch_,
conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_, oc_tile_);
}