forked from mindspore-Ecosystem/mindspore
!16305 [MS][LITE][CPU] arm32 fp16 support npu and weight quant
From: @lzkcode Reviewed-by: @zhanghaibo5,@zhang_xue_tong Signed-off-by: @zhang_xue_tong
This commit is contained in:
commit
5e7e39f925
|
@ -56,7 +56,7 @@ int Relu6Fp16(const float16_t *data, float16_t *dst, int ele_num) {
|
|||
int LReluFp16(const float16_t *src, float16_t *dst, int ele_num, float16_t alpha) {
|
||||
int i = 0;
|
||||
#ifdef ENABLE_NEON
|
||||
int ele_c8 = UP_ROUND(ele_num, C8NUM);
|
||||
int ele_c8 = DOWN_ROUND(ele_num, C8NUM);
|
||||
for (; i < ele_c8; i += C8NUM) {
|
||||
float16x8_t src_tmp = vld1q_f16(src + i);
|
||||
float16x8_t mul_tmp = vmulq_n_f16(src_tmp, alpha);
|
||||
|
@ -187,7 +187,7 @@ int GeluFp16(const float16_t *src, int length, float16_t *dst, bool approximate)
|
|||
if (approximate) {
|
||||
// dst = 0.5 * x * (1 + tanh((2 / pi) ^ 0.5 * (x + 0.044715x^3)))
|
||||
#ifdef ENABLE_NEON
|
||||
int C8 = UP_ROUND(length, C8NUM);
|
||||
int C8 = DOWN_ROUND(length, C8NUM);
|
||||
for (; i < C8; i += C8NUM) {
|
||||
float16x8_t in = vld1q_f16(src + i);
|
||||
float16x8_t res =
|
||||
|
@ -202,7 +202,7 @@ int GeluFp16(const float16_t *src, int length, float16_t *dst, bool approximate)
|
|||
}
|
||||
} else {
|
||||
#ifdef ENABLE_NEON
|
||||
int C8 = UP_ROUND(length, C8NUM);
|
||||
int C8 = DOWN_ROUND(length, C8NUM);
|
||||
for (; i < C8; i += C8NUM) {
|
||||
float16x8_t in = vld1q_f16(src + i);
|
||||
const float16x8_t res = 0.5f * in * (1.0f + MS_ERFX8_F16(in / (float16_t)1.4142135623730951f));
|
||||
|
|
|
@ -70,7 +70,7 @@ void PowerBroadCastFp16(const float16_t *input, const float16_t *exponent, float
|
|||
}
|
||||
int i = 0;
|
||||
#ifdef ENABLE_NEON
|
||||
int len_c8 = UP_ROUND(len, C8NUM);
|
||||
int len_c8 = DOWN_ROUND(len, C8NUM);
|
||||
float16x8_t scale_8 = vmovq_n_f16(scale);
|
||||
float16x8_t shift_8 = vmovq_n_f16(shift);
|
||||
for (; i < len_c8; i += C8NUM) {
|
||||
|
@ -88,7 +88,7 @@ void PowerSingleFp16(const float16_t *input, const float16_t *exponent, float16_
|
|||
int i = 0;
|
||||
PowerScalarFunFp16 PowerScalarFunFp16_ = NULL;
|
||||
#ifdef ENABLE_NEON
|
||||
int len_c8 = UP_ROUND(len, C8NUM);
|
||||
int len_c8 = DOWN_ROUND(len, C8NUM);
|
||||
float16x8_t scale_8 = vmovq_n_f16(scale);
|
||||
float16x8_t shift_8 = vmovq_n_f16(shift);
|
||||
for (; i < len_c8; i += C8NUM) {
|
||||
|
|
|
@ -228,7 +228,7 @@ int Gelu(const float *src, int length, float *dst, bool approximate) {
|
|||
if (approximate) {
|
||||
// dst = 0.5 * x * (1 + tanh((2 / pi) ^ 0.5 * (x + 0.044715x^3)))
|
||||
#if defined(ENABLE_AVX)
|
||||
int C8 = UP_ROUND(length, C8NUM);
|
||||
int C8 = DOWN_ROUND(length, C8NUM);
|
||||
for (; i < C8; i += C8NUM) {
|
||||
MS_FLOAT32X8 in = MS_LD256_F32(src + i);
|
||||
MS_FLOAT32X8 res = 0.5 * in * (1.0 + MS_TANHX8_F32((0.79788456080287f + 0.035677408136f * in * in) * in));
|
||||
|
@ -236,7 +236,7 @@ int Gelu(const float *src, int length, float *dst, bool approximate) {
|
|||
}
|
||||
#endif
|
||||
#if defined(ENABLE_SSE) || defined(ENABLE_ARM)
|
||||
int C4 = UP_ROUND(length, C4NUM);
|
||||
int C4 = DOWN_ROUND(length, C4NUM);
|
||||
for (; i < C4; i += C4NUM) {
|
||||
MS_FLOAT32X4 in = MS_LDQ_F32(src + i);
|
||||
MS_FLOAT32X4 res = 0.5 * in * (1.0 + MS_TANHX4_F32((0.79788456080287f + 0.035677408136f * in * in) * in));
|
||||
|
@ -248,7 +248,7 @@ int Gelu(const float *src, int length, float *dst, bool approximate) {
|
|||
}
|
||||
} else {
|
||||
#if defined(ENABLE_AVX) || defined(ENABLE_SSE) || defined(ENABLE_ARM)
|
||||
int C4 = UP_ROUND(length, C4NUM);
|
||||
int C4 = DOWN_ROUND(length, C4NUM);
|
||||
for (; i < C4; i += C4NUM) {
|
||||
MS_FLOAT32X4 in = MS_LDQ_F32(src + i);
|
||||
MS_FLOAT32X4 res = 0.5 * in * (1.0 + MS_ERFX4_F32(in / 1.4142135623730951f));
|
||||
|
|
|
@ -158,9 +158,9 @@ void ConvSWFp32(const float *input_data, const float *packed_weight, const float
|
|||
if (ow_block < ow_block_num[oc_block - 1]) { // ow is not enough and process one ow
|
||||
ow_block = 1;
|
||||
}
|
||||
kernel[oc_block - 1][ow_block > 1](dst_w + ow * sw_param->block_channel_, src_w, weight, bias, kernel_h,
|
||||
kernel_w, act_type, ow_block, oc_block, oc_algin, ic_algin, in_kw_step,
|
||||
in_kh_step, in_sw_step, 0);
|
||||
kernel[oc_block - 1][ow_block / ow_block_num[oc_block - 1]](
|
||||
dst_w + ow * sw_param->block_channel_, src_w, weight, bias, kernel_h, kernel_w, act_type, ow_block,
|
||||
oc_block, oc_algin, ic_algin, in_kw_step, in_kh_step, in_sw_step, 0);
|
||||
src_w += ow_block * in_sw_step;
|
||||
}
|
||||
// ow in left
|
||||
|
@ -177,11 +177,11 @@ void ConvSWFp32(const float *input_data, const float *packed_weight, const float
|
|||
void SWConv3x32Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
|
||||
size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
|
||||
size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
|
||||
in_kh_step *= 4;
|
||||
in_sw_step *= 4;
|
||||
in_kw_step *= 4;
|
||||
oc_algin *= 4;
|
||||
kw_remainder *= 4;
|
||||
in_kh_step *= sizeof(float);
|
||||
in_sw_step *= sizeof(float);
|
||||
in_kw_step *= sizeof(float);
|
||||
oc_algin *= sizeof(float);
|
||||
kw_remainder *= sizeof(float);
|
||||
asm volatile(
|
||||
"cmpq $0, %2\n"
|
||||
"je 0f\n"
|
||||
|
@ -315,10 +315,10 @@ void SWConv3x32Kernel(float *dst, const float *src, const float *weight, const f
|
|||
void SWConv1x32Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
|
||||
size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
|
||||
size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
|
||||
in_kh_step *= 4;
|
||||
in_kw_step *= 4;
|
||||
oc_algin *= 4;
|
||||
kw_remainder *= 4;
|
||||
in_kh_step *= sizeof(float);
|
||||
in_kw_step *= sizeof(float);
|
||||
oc_algin *= sizeof(float);
|
||||
kw_remainder *= sizeof(float);
|
||||
asm volatile(
|
||||
"cmpq $0, %2\n"
|
||||
"je 0f\n"
|
||||
|
@ -397,13 +397,13 @@ void SWConv1x32Kernel(float *dst, const float *src, const float *weight, const f
|
|||
void SWConv4x24Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
|
||||
size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
|
||||
size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
|
||||
in_kh_step *= 4;
|
||||
in_kw_step *= 4;
|
||||
in_sw_step *= 4;
|
||||
kw_remainder *= 4;
|
||||
in_kh_step *= sizeof(float);
|
||||
in_kw_step *= sizeof(float);
|
||||
in_sw_step *= sizeof(float);
|
||||
kw_remainder *= sizeof(float);
|
||||
size_t src_3_step = 3 * in_sw_step;
|
||||
float *dst_3 = dst + 3 * oc_algin;
|
||||
oc_algin *= 4;
|
||||
oc_algin *= sizeof(float);
|
||||
asm volatile(
|
||||
"cmpq $0, %2\n"
|
||||
"je 0f\n"
|
||||
|
@ -545,10 +545,10 @@ void SWConv4x24Kernel(float *dst, const float *src, const float *weight, const f
|
|||
void SWConv1x24Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
|
||||
size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
|
||||
size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
|
||||
in_kh_step *= 4;
|
||||
in_kw_step *= 4;
|
||||
kw_remainder *= 4;
|
||||
oc_algin *= 4;
|
||||
in_kh_step *= sizeof(float);
|
||||
in_kw_step *= sizeof(float);
|
||||
kw_remainder *= sizeof(float);
|
||||
oc_algin *= sizeof(float);
|
||||
asm volatile(
|
||||
"cmpq $0, %2\n"
|
||||
"je 0f\n"
|
||||
|
@ -621,13 +621,13 @@ void SWConv1x24Kernel(float *dst, const float *src, const float *weight, const f
|
|||
void SWConv6x16Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
|
||||
size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
|
||||
size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
|
||||
in_kh_step *= 4;
|
||||
in_kw_step *= 4;
|
||||
in_sw_step *= 4;
|
||||
kw_remainder *= 4;
|
||||
in_kh_step *= sizeof(float);
|
||||
in_kw_step *= sizeof(float);
|
||||
in_sw_step *= sizeof(float);
|
||||
kw_remainder *= sizeof(float);
|
||||
size_t src_3_step = 3 * in_sw_step;
|
||||
float *dst_3 = dst + 3 * oc_algin;
|
||||
oc_algin *= 4;
|
||||
oc_algin *= sizeof(float);
|
||||
asm volatile(
|
||||
"cmpq $0, %2\n"
|
||||
"je 0f\n"
|
||||
|
@ -772,10 +772,10 @@ void SWConv6x16Kernel(float *dst, const float *src, const float *weight, const f
|
|||
void SWConv1x16Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
|
||||
size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
|
||||
size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
|
||||
in_kh_step *= 4;
|
||||
in_kw_step *= 4;
|
||||
kw_remainder *= 4;
|
||||
oc_algin *= 4;
|
||||
in_kh_step *= sizeof(float);
|
||||
in_kw_step *= sizeof(float);
|
||||
kw_remainder *= sizeof(float);
|
||||
oc_algin *= sizeof(float);
|
||||
asm volatile(
|
||||
"cmpq $0, %2\n"
|
||||
"je 0f\n"
|
||||
|
@ -842,15 +842,15 @@ void SWConv1x16Kernel(float *dst, const float *src, const float *weight, const f
|
|||
void SWConv12x8Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
|
||||
size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
|
||||
size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
|
||||
in_kh_step *= 4;
|
||||
in_sw_step *= 4;
|
||||
in_kw_step *= 4;
|
||||
kw_remainder *= 4;
|
||||
in_kh_step *= sizeof(float);
|
||||
in_sw_step *= sizeof(float);
|
||||
in_kw_step *= sizeof(float);
|
||||
kw_remainder *= sizeof(float);
|
||||
size_t src_3_step = 3 * in_sw_step;
|
||||
float *dst_3 = dst + 3 * oc_algin;
|
||||
float *dst_5 = dst + 5 * oc_algin;
|
||||
float *dst_9 = dst + 9 * oc_algin;
|
||||
oc_algin *= 4;
|
||||
oc_algin *= sizeof(float);
|
||||
asm volatile(
|
||||
"cmpq $0, %0\n"
|
||||
"je 0f\n"
|
||||
|
@ -1001,12 +1001,12 @@ void SWConv12x8Kernel(float *dst, const float *src, const float *weight, const f
|
|||
void SWConv4x8Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
|
||||
size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
|
||||
size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
|
||||
in_kh_step *= 4;
|
||||
in_sw_step *= 4;
|
||||
in_kw_step *= 4;
|
||||
in_kh_step *= sizeof(float);
|
||||
in_sw_step *= sizeof(float);
|
||||
in_kw_step *= sizeof(float);
|
||||
size_t src_step = 3 * in_sw_step;
|
||||
float *dst_3 = dst + 3 * oc_algin;
|
||||
oc_algin *= 4;
|
||||
oc_algin *= sizeof(float);
|
||||
asm volatile(
|
||||
"cmpq $0, %2\n"
|
||||
"je 0f\n"
|
||||
|
@ -1091,10 +1091,10 @@ void SWConv4x8Kernel(float *dst, const float *src, const float *weight, const fl
|
|||
void SWConv1x8Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
|
||||
size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
|
||||
size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
|
||||
in_kh_step *= 4;
|
||||
in_kw_step *= 4;
|
||||
kw_remainder *= 4;
|
||||
oc_algin *= 4;
|
||||
in_kh_step *= sizeof(float);
|
||||
in_kw_step *= sizeof(float);
|
||||
kw_remainder *= sizeof(float);
|
||||
oc_algin *= sizeof(float);
|
||||
asm volatile(
|
||||
"cmpq $0, %2\n"
|
||||
"je 0f\n"
|
||||
|
|
|
@ -67,7 +67,7 @@ void PowerBroadCast(const float *input, const float *exponent, float *output, in
|
|||
}
|
||||
int i = 0;
|
||||
#if defined(ENABLE_AVX) || defined(ENABLE_SSE) || defined(ENABLE_ARM)
|
||||
int len_c4 = UP_ROUND(len, C4NUM);
|
||||
int len_c4 = DOWN_ROUND(len, C4NUM);
|
||||
MS_FLOAT32X4 scale_4 = MS_MOVQ_F32(scale);
|
||||
MS_FLOAT32X4 shift_4 = MS_MOVQ_F32(shift);
|
||||
for (; i < len_c4; i += C4NUM) {
|
||||
|
@ -84,7 +84,7 @@ void PowerSingle(const float *input, const float *exponent, float *output, int l
|
|||
int i = 0;
|
||||
PowerScalarFun PowerScalarFun_ = NULL;
|
||||
#if defined(ENABLE_AVX) || defined(ENABLE_SSE) || defined(ENABLE_ARM)
|
||||
int len_c4 = UP_ROUND(len, C4NUM);
|
||||
int len_c4 = DOWN_ROUND(len, C4NUM);
|
||||
MS_FLOAT32X4 scale_4 = MS_MOVQ_F32(scale);
|
||||
MS_FLOAT32X4 shift_4 = MS_MOVQ_F32(shift);
|
||||
for (; i < len_c4; i += C4NUM) {
|
||||
|
|
|
@ -39,7 +39,8 @@
|
|||
#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
|
||||
#define UP_ROUND(x, y) (((x) + (y) - (1)) / (y) * (y))
|
||||
#define UP_ROUND_DIV(x, y) (x % y == 0 ? (x / y) : (x / y) + 1)
|
||||
#define DOWN_DIV(x, y) (((x) - (y) + (1)) / (y))
|
||||
#define DOWN_DIV(x, y) ((x) / (y))
|
||||
#define DOWN_ROUND(x, y) ((x) / (y) * (y))
|
||||
|
||||
#define MSVALID(left, x, right) (MSMIN((MSMAX(left, x)), right))
|
||||
|
||||
|
|
|
@ -186,8 +186,8 @@ int ConvolutionSWCPUKernel::Run() {
|
|||
FreeTmpBuffer();
|
||||
return error_code;
|
||||
}
|
||||
auto out_data = reinterpret_cast<float *>(out_tensors_.front()->MutableData());
|
||||
if (oc_res_ != 0) {
|
||||
auto out_data = reinterpret_cast<float *>(out_tensors_.front()->MutableData());
|
||||
PackNHWCXToNHWCFp32(output_data_, out_data, conv_param_->output_batch_,
|
||||
conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_, oc_tile_);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue