diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/conv_depthwise_fp16.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/conv_depthwise_fp16.c index c7738c62b64..c29c98ef41e 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/conv_depthwise_fp16.c +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/conv_depthwise_fp16.c @@ -30,6 +30,371 @@ void ConvDwFp16Row(float16_t *output_ptr, const float16_t *input_ptr, const floa } #endif +#ifdef ENABLE_ARM +static void ConvDw3x3RowLeftFp16(const float16_t *src, float16_t *line, int lw, int channel) { + MS_FLOAT16X8 v0, v1, v2, v3; + v0 = MS_MOVQ_F16((float16_t)0.0); + int ic = 0; + for (; ic < channel - 7; ic += 8) { + v1 = MS_LDQ_F16(src + ic); + v2 = MS_LDQ_F16(src + channel + ic); + v3 = MS_LDQ_F16(src + 2 * channel + ic); + MS_FLOAT16X8 b0 = MS_SUBQ_F16(v0, v2); + MS_FLOAT16X8 b1 = MS_ADDQ_F16(v1, v2); + MS_FLOAT16X8 b2 = MS_SUBQ_F16(v2, v1); + MS_FLOAT16X8 b3 = MS_SUBQ_F16(v3, v1); + MS_STQ_F16(line + lw * ic, b0); + MS_STQ_F16(line + lw * ic + 8, b1); + MS_STQ_F16(line + lw * ic + 16, b2); + MS_STQ_F16(line + lw * ic + 24, b3); + } + if (ic < channel) { + float16_t *remain_line = line + ic * lw; + memset(remain_line, 0, 16); + memset(remain_line + 8, 0, 16); + memset(remain_line + 16, 0, 16); + memset(remain_line + 24, 0, 16); + for (int i = 0; i < channel - ic; i++) { + float16_t d1 = src[i + ic]; + float16_t d2 = src[i + ic + channel]; + float16_t d3 = src[i + ic + 2 * channel]; + remain_line[i] = (float16_t)0.0 - d2; + remain_line[i + 8] = d1 + d2; + remain_line[i + 16] = d2 - d1; + remain_line[i + 24] = d3 - d1; + } + } +} + +static void ConvDw3x3RowMiddleFp16(const float16_t *src, float16_t *line, int lw, int channel) { + MS_FLOAT16X8 v0, v1, v2, v3; + int ic = 0; + for (; ic < channel - 7; ic += 8) { + v0 = MS_LDQ_F16(src + ic); + v1 = MS_LDQ_F16(src + channel + ic); + v2 = MS_LDQ_F16(src + 2 * channel + ic); + v3 = MS_LDQ_F16(src + 3 * channel + ic); + MS_FLOAT16X8 b0 = MS_SUBQ_F16(v0, v2); + MS_FLOAT16X8 b1 = MS_ADDQ_F16(v1, v2); + MS_FLOAT16X8 b2 = MS_SUBQ_F16(v2, v1); + MS_FLOAT16X8 b3 = MS_SUBQ_F16(v3, v1); + MS_STQ_F16(line + lw * ic, b0); + MS_STQ_F16(line + lw * ic + 8, b1); + MS_STQ_F16(line + lw * ic + 16, b2); + MS_STQ_F16(line + lw * ic + 24, b3); + } + if (ic < channel) { + float16_t *remain_line = line + ic * lw; + memset(remain_line, 0, 16); + memset(remain_line + 8, 0, 16); + memset(remain_line + 16, 0, 16); + memset(remain_line + 24, 0, 16); + for (int i = 0; i < channel - ic; i++) { + float16_t d0 = src[i + ic]; + float16_t d1 = src[i + ic + channel]; + float16_t d2 = src[i + ic + 2 * channel]; + float16_t d3 = src[i + ic + 3 * channel]; + remain_line[i] = d0 - d2; + remain_line[i + 8] = d1 + d2; + remain_line[i + 16] = d2 - d1; + remain_line[i + 24] = d3 - d1; + } + } +} + +static void ConvDw3x3RowRightFp16(const float16_t *src, float16_t *line, int lw, int channel) { + MS_FLOAT16X8 v0, v1, v2, v3; + int ic = 0; + v3 = MS_MOVQ_F16((float16_t)0.0); + for (; ic < channel - 7; ic += 8) { + v0 = MS_LDQ_F16(src + ic); + v1 = MS_LDQ_F16(src + channel + ic); + v2 = MS_LDQ_F16(src + 2 * channel + ic); + MS_FLOAT16X8 b0 = MS_SUBQ_F16(v0, v2); + MS_FLOAT16X8 b1 = MS_ADDQ_F16(v1, v2); + MS_FLOAT16X8 b2 = MS_SUBQ_F16(v2, v1); + MS_FLOAT16X8 b3 = MS_SUBQ_F16(v3, v1); + MS_STQ_F16(line + lw * ic, b0); + MS_STQ_F16(line + lw * ic + 8, b1); + MS_STQ_F16(line + lw * ic + 16, b2); + MS_STQ_F16(line + lw * ic + 24, b3); + } + if (ic < channel) { + float16_t *remain_line = line + ic * lw; 
+ memset(remain_line, 0, 16); + memset(remain_line + 8, 0, 16); + memset(remain_line + 16, 0, 16); + memset(remain_line + 24, 0, 16); + for (int i = 0; i < channel - ic; i++) { + float16_t d0 = src[i + ic]; + float16_t d1 = src[i + ic + channel]; + float16_t d2 = src[i + ic + 2 * channel]; + remain_line[i] = d0 - d2; + remain_line[i + 8] = d1 + d2; + remain_line[i + 16] = d2 - d1; + remain_line[i + 24] = (float16_t)0.0 - d1; + } + } +} + +static void ConvDw3x3RowSingleFp16(const float16_t *src, float16_t *line, int lw, int channel) { + MS_FLOAT16X8 v0, v1, v2; + int ic = 0; + v2 = MS_MOVQ_F16((float16_t)0.0); + for (; ic < channel - 7; ic += 8) { + v0 = MS_LDQ_F16(src + ic); + v1 = MS_LDQ_F16(src + channel + ic); + MS_FLOAT16X8 b2 = MS_SUBQ_F16(v2, v1); + MS_STQ_F16(line + lw * ic, v0); + MS_STQ_F16(line + lw * ic + 8, v1); + MS_STQ_F16(line + lw * ic + 16, b2); + memset(line + lw * ic + 24, 0, 16); + } + if (ic < channel) { + float16_t *remain_line = line + ic * lw; + memset(remain_line, 0, 16); + memset(remain_line + 8, 0, 16); + memset(remain_line + 16, 0, 16); + memset(remain_line + 24, 0, 16); + for (int i = 0; i < channel - ic; i++) { + float16_t d0 = src[i + ic]; + float16_t d1 = src[i + ic + channel]; + remain_line[i] = d0; + remain_line[i + 8] = d1; + remain_line[i + 16] = (float16_t)0.0 - d1; + } + } +} + +static void ConvDw3x3InitTopFp16(const float16_t *src, float16_t **lines, int width, int channel) { + float16_t *line0 = lines[0]; + float16_t *line1 = lines[1]; + float16_t *line2 = lines[2]; + int c8 = UP_ROUND(channel, C8NUM); + int lw = UP_DIV(width, C2NUM) * C4NUM; + memset(line0, 0, c8 * lw * sizeof(float16_t)); + ConvDw3x3RowLeftFp16(src, line1, lw, channel); + ConvDw3x3RowLeftFp16(src + width * channel, line2, lw, channel); + int ow = 2; + for (; ow < width - 2; ow += 2) { + ConvDw3x3RowMiddleFp16(src + (ow - 1) * channel, line1 + 2 * ow * 8, lw, channel); + ConvDw3x3RowMiddleFp16(src + width * channel + (ow - 1) * channel, line2 + 2 * ow * 8, lw, channel); + } + int remain = width - ow; + if (remain == 2) { + ConvDw3x3RowRightFp16(src + (ow - 1) * channel, line1 + 2 * ow * 8, lw, channel); + ConvDw3x3RowRightFp16(src + width * channel + (ow - 1) * channel, line2 + 2 * ow * 8, lw, channel); + } else if (remain == 1) { + ConvDw3x3RowSingleFp16(src + (ow - 1) * channel, line1 + 2 * ow * 8, lw, channel); + ConvDw3x3RowSingleFp16(src + width * channel + (ow - 1) * channel, line2 + 2 * ow * 8, lw, channel); + } +} + +static void ConvDw3x3InitRowFp16(const float16_t *src, float16_t **lines, int width, int channel) { + float16_t *line0 = lines[0]; + float16_t *line1 = lines[1]; + float16_t *line2 = lines[2]; + int lw = UP_DIV(width, C2NUM) * C4NUM; + ConvDw3x3RowLeftFp16(src - width * channel, line0, lw, channel); + ConvDw3x3RowLeftFp16(src, line1, lw, channel); + ConvDw3x3RowLeftFp16(src + width * channel, line2, lw, channel); + int ow = 2; + for (; ow < width - 2; ow += 2) { + ConvDw3x3RowMiddleFp16(src - width * channel + (ow - 1) * channel, line0 + 2 * ow * 8, lw, channel); + ConvDw3x3RowMiddleFp16(src + (ow - 1) * channel, line1 + 2 * ow * 8, lw, channel); + ConvDw3x3RowMiddleFp16(src + width * channel + (ow - 1) * channel, line2 + 2 * ow * 8, lw, channel); + } + int remain = width - ow; + if (remain == 2) { + ConvDw3x3RowRightFp16(src - width * channel + (ow - 1) * channel, line0 + 2 * ow * 8, lw, channel); + ConvDw3x3RowRightFp16(src + (ow - 1) * channel, line1 + 2 * ow * 8, lw, channel); + ConvDw3x3RowRightFp16(src + width * channel + (ow - 1) * channel, line2 + 2 * ow 
* 8, lw, channel); + } else if (remain == 1) { + ConvDw3x3RowSingleFp16(src - width * channel + (ow - 1) * channel, line0 + 2 * ow * 8, lw, channel); + ConvDw3x3RowSingleFp16(src + (ow - 1) * channel, line1 + 2 * ow * 8, lw, channel); + ConvDw3x3RowSingleFp16(src + width * channel + (ow - 1) * channel, line2 + 2 * ow * 8, lw, channel); + } +} + +static void ConvDw3x3RowFp16(const float16_t *src, float16_t **lines, int width, int channel) { + float16_t *tmp = lines[0]; + lines[0] = lines[1]; + lines[1] = lines[2]; + lines[2] = tmp; + int c8 = UP_ROUND(channel, C8NUM); + int lw = UP_DIV(width, C2NUM) * C4NUM; + memset(tmp, 0, c8 * lw * sizeof(float16_t)); + ConvDw3x3RowLeftFp16(src, tmp, lw, channel); + int ow = 2; + for (; ow < width - 2; ow += 2) { + ConvDw3x3RowMiddleFp16(src + (ow - 1) * channel, tmp + 2 * ow * 8, lw, channel); + } + int remain = width - ow; + if (remain == 2) { + ConvDw3x3RowRightFp16(src + (ow - 1) * channel, tmp + 2 * ow * 8, lw, channel); + } else if (remain == 1) { + ConvDw3x3RowSingleFp16(src + (ow - 1) * channel, tmp + 2 * ow * 8, lw, channel); + } +} + +static void ConvDw3x3BottomFp16(float16_t **lines, int width, int channel) { + float16_t *tmp = lines[0]; + lines[0] = lines[1]; + lines[1] = lines[2]; + lines[2] = tmp; + int c8 = UP_ROUND(channel, C8NUM); + memset(tmp, 0, UP_DIV(width, C2NUM) * c8 * C4NUM * sizeof(float16_t)); +} + +void ConvDw3x3LineFp16(float16_t *dst, float16_t **lines, const float16_t *weight, const float16_t *bias_data, + int width, int ori_channel, bool relu, bool relu6) { + int channel = ori_channel; + float16_t *line0 = lines[0]; + float16_t *line1 = lines[1]; + float16_t *line2 = lines[2]; + for (; channel > 0; channel -= 8) { + MS_FLOAT16X8 bias = MS_LDQ_F16(bias_data); + bias_data += 8; + MS_FLOAT16X8 g00 = MS_LDQ_F16(weight); + MS_FLOAT16X8 g01 = MS_LDQ_F16(weight + 8); + MS_FLOAT16X8 g02 = MS_LDQ_F16(weight + 16); + MS_FLOAT16X8 g03 = MS_LDQ_F16(weight + 24); + MS_FLOAT16X8 g10 = MS_LDQ_F16(weight + 32); + MS_FLOAT16X8 g11 = MS_LDQ_F16(weight + 40); + MS_FLOAT16X8 g12 = MS_LDQ_F16(weight + 48); + MS_FLOAT16X8 g13 = MS_LDQ_F16(weight + 56); + MS_FLOAT16X8 g20 = MS_LDQ_F16(weight + 64); + MS_FLOAT16X8 g21 = MS_LDQ_F16(weight + 72); + MS_FLOAT16X8 g22 = MS_LDQ_F16(weight + 80); + MS_FLOAT16X8 g23 = MS_LDQ_F16(weight + 88); + weight += 96; + float16_t *cur_dst = dst; + int ow = 0; + for (; ow < width - 1; ow += 2) { + MS_FLOAT16X8 acc0 = MS_MULQ_F16(MS_LDQ_F16(line0), g00); + MS_FLOAT16X8 acc1 = MS_MULQ_F16(MS_LDQ_F16(line0 + 8), g01); + MS_FLOAT16X8 acc2 = MS_MULQ_F16(MS_LDQ_F16(line0 + 16), g02); + MS_FLOAT16X8 acc3 = MS_MULQ_F16(MS_LDQ_F16(line0 + 24), g03); + line0 += 32; + acc0 = MS_FMAQ_F16(acc0, MS_LDQ_F16(line1), g10); + acc1 = MS_FMAQ_F16(acc1, MS_LDQ_F16(line1 + 8), g11); + acc2 = MS_FMAQ_F16(acc2, MS_LDQ_F16(line1 + 16), g12); + acc3 = MS_FMAQ_F16(acc3, MS_LDQ_F16(line1 + 24), g13); + + line1 += 32; + acc0 = MS_FMAQ_F16(acc0, MS_LDQ_F16(line2), g20); + acc1 = MS_FMAQ_F16(acc1, MS_LDQ_F16(line2 + 8), g21); + acc2 = MS_FMAQ_F16(acc2, MS_LDQ_F16(line2 + 16), g22); + acc3 = MS_FMAQ_F16(acc3, MS_LDQ_F16(line2 + 24), g23); + + line2 += 32; + MS_FLOAT16X8 res0 = MS_ADDQ_F16(acc0, MS_ADDQ_F16(acc2, acc1)); + MS_FLOAT16X8 res1 = MS_ADDQ_F16(acc1, MS_SUBQ_F16(acc3, acc2)); + res0 = MS_ADDQ_F16(res0, bias); + res1 = MS_ADDQ_F16(res1, bias); + if (relu || relu6) { + res0 = MS_MAXQ_F16(res0, MS_MOVQ_F16((float16_t)0.0)); + res1 = MS_MAXQ_F16(res1, MS_MOVQ_F16((float16_t)0.0)); + } + if (relu6) { + res0 = MS_MINQ_F16(res0, 
MS_MOVQ_F16((float16_t)6.0)); + res1 = MS_MINQ_F16(res1, MS_MOVQ_F16((float16_t)6.0)); + } + if (channel >= 8) { + MS_STQ_F16(cur_dst, res0); + MS_STQ_F16(cur_dst + ori_channel, res1); + } else { + for (int i = 0; i < channel; i++) { + cur_dst[i] = res0[i]; + cur_dst[ori_channel + i] = res1[i]; + } + } + cur_dst += 2 * ori_channel; + } + if (ow < width) { + MS_FLOAT16X8 acc0 = MS_MULQ_F16(MS_LDQ_F16(line0), g00); + MS_FLOAT16X8 acc1 = MS_MULQ_F16(MS_LDQ_F16(line0 + 8), g01); + MS_FLOAT16X8 acc2 = MS_MULQ_F16(MS_LDQ_F16(line0 + 16), g02); + line0 += 32; + acc0 = MS_FMAQ_F16(acc0, MS_LDQ_F16(line1), g10); + acc1 = MS_FMAQ_F16(acc1, MS_LDQ_F16(line1 + 8), g11); + acc2 = MS_FMAQ_F16(acc2, MS_LDQ_F16(line1 + 16), g12); + + line1 += 32; + acc0 = MS_FMAQ_F16(acc0, MS_LDQ_F16(line2), g20); + acc1 = MS_FMAQ_F16(acc1, MS_LDQ_F16(line2 + 8), g21); + acc2 = MS_FMAQ_F16(acc2, MS_LDQ_F16(line2 + 16), g22); + + line2 += 32; + MS_FLOAT16X8 res0 = MS_ADDQ_F16(acc0, MS_ADDQ_F16(acc2, acc1)); + res0 = MS_ADDQ_F16(res0, bias); + if (relu || relu6) { + res0 = MS_MAXQ_F16(res0, MS_MOVQ_F16((float16_t)0.0)); + } + if (relu6) { + res0 = MS_MINQ_F16(res0, MS_MOVQ_F16((float16_t)6.0)); + } + if (channel >= 8) { + MS_STQ_F16(cur_dst, res0); + } else { + for (int i = 0; i < channel; i++) { + cur_dst[i] = res0[i]; + } + } + } + dst += 8; + } +} + +void ConvDw3x3Fp16(float16_t *output_data, float16_t *buffer, const float16_t *input_data, const float16_t *weight_data, + const float16_t *bias_data, const ConvParameter *conv_param, int start_oh, int end_oh) { + int units = UP_DIV(conv_param->output_w_, C2NUM); + int c8 = UP_ROUND(conv_param->input_channel_, C8NUM); + int line = conv_param->input_channel_ * conv_param->input_w_; + + bool relu = conv_param->act_type_ == ActType_Relu; + bool relu6 = conv_param->act_type_ == ActType_Relu6; + + for (int b = 0; b < conv_param->output_batch_; b++) { + const float16_t *src = input_data + b * conv_param->input_h_ * conv_param->input_w_ * conv_param->input_channel_; + float16_t *dst = output_data + b * conv_param->output_h_ * conv_param->output_w_ * conv_param->output_channel_; + float16_t *line0 = buffer; + float16_t *line1 = buffer + units * c8 * C4NUM; + float16_t *line2 = buffer + units * c8 * C4NUM * 2; + float16_t *lines[3] = {line0, line1, line2}; + int oh = start_oh; + if (oh == 0) { + // input trans + ConvDw3x3InitTopFp16(src, lines, conv_param->output_w_, conv_param->input_channel_); + } else { + // input trans + ConvDw3x3InitRowFp16(src + oh * line, lines, conv_param->output_w_, conv_param->input_channel_); + } + // dst calc and trans + ConvDw3x3LineFp16(dst + oh * line, lines, weight_data, bias_data, conv_param->output_w_, conv_param->input_channel_, + relu, relu6); + for (oh = start_oh + 1; oh < end_oh - 1; oh++) { + // input trans + ConvDw3x3RowFp16(src + oh * line + line, lines, conv_param->output_w_, conv_param->input_channel_); + // dst calc and trans + ConvDw3x3LineFp16(dst + oh * line, lines, weight_data, bias_data, conv_param->output_w_, + conv_param->input_channel_, relu, relu6); + } + if (oh == conv_param->output_h_ - 1) { + // input trans + ConvDw3x3BottomFp16(lines, conv_param->output_w_, conv_param->input_channel_); + } else { + // input trans + ConvDw3x3RowFp16(src + oh * line + line, lines, conv_param->output_w_, conv_param->input_channel_); + } + // dst calc and trans + ConvDw3x3LineFp16(dst + oh * line, lines, weight_data, bias_data, conv_param->output_w_, conv_param->input_channel_, + relu, relu6); + } +} + +#endif + void ConvDwFp16(float16_t 
*output_data, const float16_t *input_data, const float16_t *weight_data, const float16_t *bias_data, const ConvParameter *conv_param, int task_id) { int h_step = UP_DIV(conv_param->output_h_, conv_param->thread_num_); diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/conv_depthwise_fp16.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/conv_depthwise_fp16.h index e5b781dfd69..5e61c8aab4e 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/conv_depthwise_fp16.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/conv_depthwise_fp16.h @@ -50,6 +50,14 @@ void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const flo void DeconvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data, const float16_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id); + +#ifdef ENABLE_ARM +void ConvDw3x3LineFp16(float16_t *dst, float16_t **lines, const float16_t *weight, const float16_t *bias_data, + int width, int ori_channel, bool relu, bool relu6); +void ConvDw3x3Fp16(float16_t *output_data, float16_t *buffer, const float16_t *input_data, const float16_t *weight_data, + const float16_t *bias_data, const ConvParameter *conv_param, int start_oh, int end_oh); +#endif + #ifdef __cplusplus } #endif diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/pack_fp16.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/pack_fp16.c index c446b51934f..d53b2925667 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/pack_fp16.c +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/pack_fp16.c @@ -17,6 +17,26 @@ #include "nnacl/fp16/pack_fp16.h" #include +#ifdef ENABLE_ARM +void PackWeightConvDw3x3Fp16(const void *src, void *dst, int channel) { + // nchw to nc8hw8 with 1D F(2,3) + for (int i = 0; i < channel; i++) { + float16_t *src_kernel = (float16_t *)src + i * 9; + float16_t *dst_kernel = (float16_t *)dst + (i / 8) * 96 + i % 8; + for (int y = 0; y < 3; y++) { + float16_t g0 = src_kernel[3 * y]; + float16_t g1 = src_kernel[3 * y + 1]; + float16_t g2 = src_kernel[3 * y + 2]; + + dst_kernel[32 * y] = g0; + dst_kernel[32 * y + 8] = (float16_t)0.5 * (g0 + g1 + g2); + dst_kernel[32 * y + 16] = (float16_t)0.5 * (g0 - g1 + g2); + dst_kernel[32 * y + 24] = g2; + } + } +} +#endif + void Im2ColPackUnitFp16(float16_t *input_data, ConvParameter *conv_param, float16_t *packed_input, int real_cal_num, int block_index) { // input format : nhwc diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/pack_fp16.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/pack_fp16.h index 91e482a44df..992c254c37c 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/pack_fp16.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/pack_fp16.h @@ -81,6 +81,10 @@ void Transpose12x8A32Fp16(const float16_t *src, float16_t *dst, size_t src_strid void Transpose16x8ARM64Fp16(const float16_t *src, float16_t *dst, size_t src_stride, size_t dst_stride); #endif +#ifdef ENABLE_ARM +void PackWeightConvDw3x3Fp16(const void *src, void *dst, int channel); +#endif + #ifdef __cplusplus } #endif diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ms_simd_instructions_fp16.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ms_simd_instructions_fp16.h index fe83cafbd5f..6b7d937fdd7 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ms_simd_instructions_fp16.h +++ 
b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ms_simd_instructions_fp16.h @@ -99,6 +99,9 @@ static inline float16x4_t ms_vcvt_f16_f32(float32x4_t in) { #define MS_MAXQ_F16 vmaxq_f16 #define MS_LDQ_F16 vld1q_f16 #define MS_ADDQ_F16 vaddq_f16 +#define MS_SUBQ_F16 vsubq_f16 +#define MS_MULQ_F16 vmulq_f16 +#define MS_FMAQ_F16 vfmaq_f16 static inline float16x8_t MS_TANHX8_F16(float16x8_t src) { float32x4_t src_low = MS_CVT_F32_F16(vget_low_f16(src)); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.cc index 9ba5d54ddfc..89aaee606e0 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.cc @@ -22,6 +22,7 @@ #include "src/runtime/kernel/arm/fp16/group_convolution_fp16.h" #include "src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h" #include "src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h" +#include "src/runtime/kernel/arm/fp16/convolution_depthwise_3x3_fp16.h" #include "src/runtime/kernel/arm/base/group_convolution_creator.h" #include "schema/model_generated.h" #include "src/kernel_registry.h" @@ -117,6 +118,17 @@ kernel::InnerKernel *CpuConvDwFp16KernelCreator(const std::vector(opParameter); kernel::InnerKernel *kernel = nullptr; +#if defined(ENABLE_ARM) + if (CheckConvDw1DWinograd(conv_param, ctx->thread_num_)) { + kernel = new (std::nothrow) kernel::ConvolutionDepthwise3x3Fp16CPUKernel(opParameter, inputs, outputs, ctx); + if (kernel == nullptr) { + MS_LOG(ERROR) << "kernel is nullptr."; + free(opParameter); + return nullptr; + } + return kernel; + } +#endif if (conv_param->input_channel_ < 32) { kernel = new (std::nothrow) kernel::ConvolutionDepthwiseSWFp16CPUKernel(opParameter, inputs, outputs, ctx); } else { diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_3x3_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_3x3_fp16.cc new file mode 100644 index 00000000000..c81a69ec5bb --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_3x3_fp16.cc @@ -0,0 +1,150 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifdef ENABLE_ARM
+#include "src/runtime/kernel/arm/fp16/convolution_depthwise_3x3_fp16.h"
+#include "include/errorcode.h"
+#include "nnacl/fp16/pack_fp16.h"
+#include "nnacl/fp16/conv_depthwise_fp16.h"
+
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_INFER_INVALID;
+using mindspore::lite::RET_MEMORY_FAILED;
+using mindspore::lite::RET_OK;
+
+namespace mindspore::kernel {
+ConvolutionDepthwise3x3Fp16CPUKernel::~ConvolutionDepthwise3x3Fp16CPUKernel() {
+  if (packed_weight_ != nullptr) {
+    free(packed_weight_);
+    packed_weight_ = nullptr;
+  }
+}
+
+int ConvolutionDepthwise3x3Fp16CPUKernel::InitWeightBias() {
+  // init weight: k, h, w, c; k == group == output_channel, c == 1
+  auto weight_tensor = in_tensors_[kWeightIndex];
+  auto origin_weight = reinterpret_cast<float16_t *>(weight_tensor->MutableData());
+  int channel = weight_tensor->Batch();
+  int c8 = UP_ROUND(channel, C8NUM);
+  int pack_weight_size = c8 * C12NUM;
+
+  if (packed_weight_ == nullptr) {
+    packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t)));
+    if (packed_weight_ == nullptr) {
+      MS_LOG(ERROR) << "Malloc buffer failed.";
+      return RET_ERROR;
+    }
+  }
+  PackWeightConvDw3x3Fp16(origin_weight, packed_weight_, channel);
+
+  if (bias_data_ == nullptr) {
+    bias_data_ = reinterpret_cast<float16_t *>(malloc(c8 * sizeof(float16_t)));
+    if (bias_data_ == nullptr) {
+      MS_LOG(ERROR) << "Malloc buffer failed.";
+      return RET_ERROR;
+    }
+  }
+  memset(bias_data_, 0, c8 * sizeof(float16_t));
+  if (in_tensors_.size() == kInputSize2) {
+    auto bias_tensor = in_tensors_[kBiasIndex];
+    auto ori_bias = reinterpret_cast<float16_t *>(bias_tensor->MutableData());
+    memcpy(bias_data_, ori_bias, bias_tensor->ElementsNum() * sizeof(float16_t));
+  }
+
+  return RET_OK;
+}
+
+int ConvolutionDepthwise3x3Fp16CPUKernel::Init() {
+  auto ret = InitWeightBias();
+  if (ret != 0) {
+    MS_LOG(ERROR) << "Convolution depthwise 3x3 fp16 InitWeightBias failed.";
+    return RET_ERROR;
+  }
+  if (!InferShapeDone()) {
+    return RET_OK;
+  }
+  return ReSize();
+}
+
+int ConvolutionDepthwise3x3Fp16CPUKernel::ReSize() {
+  ConvolutionBaseCPUKernel::Init();
+  conv_param_->thread_num_ = MSMIN(thread_count_, conv_param_->output_h_);
+  return RET_OK;
+}
+
+int ConvolutionDepthwise3x3Fp16CPUKernel::Execute(int task_id) {
+  int units = UP_DIV(conv_param_->output_w_, C2NUM);  // F(2, 3) contains 2 conv units
+  int c8 = UP_ROUND(conv_param_->input_channel_, C8NUM);
+  auto buffer = buffer_ + C12NUM * c8 * units * task_id;
+  int step_oh = UP_DIV(conv_param_->output_h_, conv_param_->thread_num_);
+  int start_oh = step_oh * task_id;
+  int end_oh = MSMIN(start_oh + step_oh, conv_param_->output_h_);
+  ConvDw3x3Fp16(output_ptr_, buffer, input_ptr_, packed_weight_, reinterpret_cast<float16_t *>(bias_data_), conv_param_,
+                start_oh, end_oh);
+  return RET_OK;
+}
+
+int ConvDw3x3Fp16Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
+  auto conv_dw = reinterpret_cast<ConvolutionDepthwise3x3Fp16CPUKernel *>(cdata);
+  auto ret = conv_dw->Execute(task_id);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "ConvolutionDepthwise3x3Run error task_id[" << task_id << "] error_code[" << ret << "]";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int ConvolutionDepthwise3x3Fp16CPUKernel::Run() {
+  if (IsTrainable() && (IsTrain() || IsRepack())) {
+    auto ret = InitWeightBias();
+    if (ret != 0) {
+      MS_LOG(ERROR) << "Convolution depthwise fp16 repack weight failure";
+      return RET_ERROR;
+    }
+    is_repack_ = false;
+  }
+  int units = UP_DIV(conv_param_->output_w_, C2NUM);  // F(2, 3) contains 2 conv units
+  int c8 = UP_ROUND(conv_param_->input_channel_, C8NUM);
+  int buffer_size = units * c8 * C12NUM * conv_param_->thread_num_;
+  buffer_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(buffer_size * sizeof(float16_t)));
+  if (buffer_ == nullptr) {
+    MS_LOG(ERROR) << "ConvDw3x3Fp16Run failed to allocate buffer";
+    return RET_MEMORY_FAILED;
+  }
+
+  auto input_tensor = in_tensors_.at(kInputIndex);
+  input_ptr_ = reinterpret_cast<float16_t *>(input_tensor->data_c());
+
+  auto output_tensor = out_tensors_.at(kOutputIndex);
+  output_ptr_ = reinterpret_cast<float16_t *>(output_tensor->data_c());
+
+  auto ret = ParallelLaunch(this->context_, ConvDw3x3Fp16Run, this, conv_param_->thread_num_);
+  ctx_->allocator->Free(buffer_);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "ConvDw3x3Run error: error_code[" << ret << "]";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int ConvolutionDepthwise3x3Fp16CPUKernel::Eval() {
+  if (IsTrainable()) {
+    is_repack_ = true;
+  }
+  return InnerKernel::Eval();
+}
+}  // namespace mindspore::kernel
+#endif
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_3x3_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_3x3_fp16.h
new file mode 100644
index 00000000000..c6663837369
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_3x3_fp16.h
@@ -0,0 +1,50 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_DEPTHWISE_3X3_FP16_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_DEPTHWISE_3X3_FP16_H_
+
+#ifdef ENABLE_ARM
+#include <vector>
+#include "src/inner_kernel.h"
+#include "src/runtime/kernel/arm/base/convolution_base.h"
+#include "nnacl/fp32/conv_depthwise_fp32.h"
+
+namespace mindspore::kernel {
+class ConvolutionDepthwise3x3Fp16CPUKernel : public ConvolutionBaseCPUKernel {
+ public:
+  ConvolutionDepthwise3x3Fp16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
+                                       const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
+      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx) {}
+  ~ConvolutionDepthwise3x3Fp16CPUKernel() override;
+
+  int Init() override;
+  int ReSize() override;
+  int Run() override;
+
+  int InitWeightBias();
+  int Execute(int task_id);
+  int Eval() override;
+
+ private:
+  float16_t *packed_weight_ = nullptr;
+  float16_t *input_ptr_ = nullptr;
+  float16_t *output_ptr_ = nullptr;
+  float16_t *buffer_ = nullptr;
+};
+}  // namespace mindspore::kernel
+#endif
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_DEPTHWISE_3X3_FP16_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.cc
index ee22cd1229f..18733dd23f2 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.cc
@@ -118,7 +118,6 @@ int ConvolutionDelegateCPUKernel::ReSize() {
       MS_LOG(ERROR) << "Selecting execute kernel failed for conv_kernel, got a nullptr.";
       return RET_ERROR;
     }
-    // conv_kernel_->set_name(this->name_);
   }
   FreeCopiedData();
   return conv_kernel_->ReSize();
diff --git a/mindspore/lite/test/config/models_caffe_fp16.cfg b/mindspore/lite/test/config/models_caffe_fp16.cfg
index be183d3e49b..6cc90ef4de1 100644
--- a/mindspore/lite/test/config/models_caffe_fp16.cfg
+++ b/mindspore/lite/test/config/models_caffe_fp16.cfg
@@ -37,7 +37,7 @@ ml_ocr_sfz_add_final_0325 0.1
 ml_hardware_pose 2
 ml_bank_recog 0.1
 2012_ATLANTA_10class_20190131_v4.0 12
-mnet 9
+mnet 12
 recognition 10
 ml_face_landmark 1
 model_hebing_3branch 40
@@ -52,7 +52,7 @@ hiai_video_seg 1
 hiai_semantic_seg 3
 hiai_human_seg 28
 hiai_face_recognition_1 10
-hiai_cpu_face_detect 4
+hiai_cpu_face_detect 4.5
 hiai_cpu_face_attr 12
 hiai_face_attr1 12
 # mtk_detect-mbv1-shortcut-400-400_nopostprocess_simplified: precision is 5%
@@ -76,9 +76,9 @@ ml_video_edit_hairSeg_have_imageProcessLayer_interpTo145 0.5
 hdc_age_medium 6
 hdc_contour_pose_128 0.5
 hdc_emotion 0.5
-hdc_fivembnet 0.5
+hdc_fivembnet 1
 hdc_isface 0.5
-hdc_mobilenetface 7.5
+hdc_mobilenetface 8.5
 hdc_retinaface 14
 hdc_resnet 7
 ml_video_edit_detect 2.5
diff --git a/mindspore/lite/test/config/models_onnx_fp16.cfg b/mindspore/lite/test/config/models_onnx_fp16.cfg
index 9f549dbb00c..663068ea1c8 100644
--- a/mindspore/lite/test/config/models_onnx_fp16.cfg
+++ b/mindspore/lite/test/config/models_onnx_fp16.cfg
@@ -55,7 +55,7 @@ hdc_resnet_1w_class.onnx 6
 gts_text_detection.onnx;1;1,224,224,3 10
 hdc_Face_Emotion_MTI_Aesthetic.onnx 144
 ml_video_edit_imitate_filter.onnx 120
-ml_facedetector.onnx 3
+ml_facedetector.onnx 6
 ml_ei_facedetection.onnx 2
 #ml_video_edit_art_generate.onnx #mul operator overflows, not suitable for fp16
 #ml_voice_detect.onnx #conv operator overflows, not suitable for fp16
@@ -79,7 +79,7 @@ mtk_face_recognition_v2.onnx 2.5
 ml_2012_ocr_detection_tmp.onnx 0.5
Harmony_Voiceprint_resnet18.onnx;1;1,150,40,1 4.5 bloom_hongmo_detection_tmp.onnx 0.5 -Q_face_recognition.onnx 2 +Q_face_recognition.onnx 3 ml_video_edit_enhance_update_tmp.onnx 0.5 Q888_face_recognition.onnx 3.5 Q888_iris_detect.onnx 0.5 diff --git a/mindspore/lite/test/config/models_tf_fp16.cfg b/mindspore/lite/test/config/models_tf_fp16.cfg index 0c50eae29ee..c9b2d099fc8 100644 --- a/mindspore/lite/test/config/models_tf_fp16.cfg +++ b/mindspore/lite/test/config/models_tf_fp16.cfg @@ -6,10 +6,10 @@ ml_vision_guide_detection1.pb 0.5 ml_vision_guide_detection3.pb 0.5 ml_video_edit_generate_filter.pb 2 -ml_ocr_jk.pb 0.7 +ml_ocr_jk.pb 0.8 # The accumulated error causes the threshold to be exceeded ml_ocr_latin.pb 12 -scan_hms_angle.pb 2.5 +scan_hms_angle.pb 7 scan_hms_detect.pb 2.5 ml_face_openclose.pb;1;1,32,32,3 0.5 ml_object_detect.pb;1;1,288,288,3 2 @@ -23,7 +23,7 @@ hiai_PoseEstimation_Pcm.pb 0.5 hiai_cn_recognize_modify_padv2.pb;1;1,32,512,1 27 hiai_model_normalize_object_scene_ps_20200519.pb;1;1,224,224,3 17.1 # The output of mtk_model_ckpt.pb has small value -mtk_model_ckpt.pb 19 +mtk_model_ckpt.pb 19.5 mtk_age_gender.pb 0.5 # The Difference of output node divided by 0 results in cumulative deviation mtk_model_normalize_object_scene_ps_20200519.pb;1;1,224,224,3 10 diff --git a/mindspore/lite/test/config/models_tflite_fp16.cfg b/mindspore/lite/test/config/models_tflite_fp16.cfg index e56df57223d..8de9d641836 100644 --- a/mindspore/lite/test/config/models_tflite_fp16.cfg +++ b/mindspore/lite/test/config/models_tflite_fp16.cfg @@ -52,10 +52,10 @@ ml_ei_landmark.tflite 3 mnist.tflite 4 mobilenet.tflite 0.1 resnet.tflite 120 -scan_hms_angle1.tflite 4 +scan_hms_angle1.tflite 6 scan_hms_detect.tflite 12 hiai_latin_ocr.tflite 45 -hiai_latin_ocr_1.tflite 13 +hiai_latin_ocr_1.tflite 14.5 ml_ocr_jk.tflite 2 nasnet_mobile.tflite 3 nasnet_large.tflite 3 @@ -113,14 +113,14 @@ mnasnet_1.0_224_1_metadata_1.tflite 6 mnasnet_1.0_96_1_metadata_1.tflite 6 lite-model_on_device_vision_classifier_popular_us_products_V1_1.tflite 16 lite-model_on_device_vision_classifier_popular_wine_V1_1.tflite 80 -posenet_mobilenet_float_075_1_default_1.tflite 45 +posenet_mobilenet_float_075_1_default_1.tflite 49 deeplabv3_1_default_1.tflite 6 lite-model_deeplabv3-mobilenetv2_dm05-float16_1_default_1.tflite 13 lite-model_deeplabv3-mobilenetv2-float16_1_default_1.tflite 60 lite-model_east-text-detector_fp16_1.tflite 60 lite-model_cartoongan_fp16_1.tflite 3 lite-model_arbitrary-image-stylization-inceptionv3_fp16_predict_1.tflite 6 -gts_detect_5k_tf115.tflite 6 +gts_detect_5k_tf115.tflite 9.5 mtk_isface.tflite 0.2 mtk_landmark.tflite 0.1 mtk_new_detect.tflite 3 @@ -143,7 +143,7 @@ ml_vision_guide_detection3_pb2tflite.tflite 0.5 ml_vision_guide_detection1_pb2tflite.tflite 0.5 ml_pic_shopping_pb2tflite.tflite 95 ml_ocr_jk_pb2tflite.tflite 0.5 -ml_ocr_latin_pb2tflite.tflite 11 +ml_ocr_latin_pb2tflite.tflite 11.5 scan_hms_angle_pb2tflite.tflite 2.5 scan_hms_detect_pb2tflite.tflite 1.5 ml_location.tflite 0.5 @@ -154,19 +154,19 @@ ml_object_detect_pb2tflite.tflite 1.5 lite-model_on_device_vision_classifier_landmarks_classifier_africa_V1_1.tflite 10 lite-model_on_device_vision_classifier_landmarks_classifier_north_america_V1_1.tflite 19 lite-model_on_device_vision_classifier_landmarks_classifier_asia_V1_1.tflite 25 -lite-model_on_device_vision_classifier_landmarks_classifier_oceania_antarctica_V1_1.tflite 10 +lite-model_on_device_vision_classifier_landmarks_classifier_oceania_antarctica_V1_1.tflite 11 
lite-model_on_device_vision_classifier_landmarks_classifier_europe_V1_1.tflite 32 lite-model_on_device_vision_classifier_landmarks_classifier_south_america_V1_1.tflite 14 ml_ei_landmark_pb2tflite.tflite 2 -unet_mbv2_05_104pts.tflite 15 -hiai_AADB_HADB_MBV2_model_f16.tflite 2.5 +unet_mbv2_05_104pts.tflite 17 +hiai_AADB_HADB_MBV2_model_f16.tflite 3.5 hiai_AADB_HADB_MBV2_model_fp32.tflite 4.5 mtk_age_gender_fp16.tflite 26 hiai_detect_curve_model_float32.tflite 9 Q_language_model_hrmini_Q4_b4_17w.tflite 3.5 -lite-model_aiy_vision_classifier_food_V1_1.tflite 42 +lite-model_aiy_vision_classifier_food_V1_1.tflite 47.5 lite-model_disease-classification_1.tflite 70 -lite-model_models_mushroom-identification_v1_1.tflite 4.5 +lite-model_models_mushroom-identification_v1_1.tflite 5 smartreply_1_default_1.tflite 0.5 text_classification.tflite 0.5 Q_AADB_HADB_MBV2_model.tflite 5
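
Note on the 3x3 depthwise kernel above (illustration, not part of the patch): the new ConvDw3x3*Fp16 routines implement a per-channel 1D Winograd F(2,3) scheme. PackWeightConvDw3x3Fp16 applies the weight transform [g0, 0.5*(g0+g1+g2), 0.5*(g0-g1+g2), g2], the ConvDw3x3Row*Fp16 helpers apply the input transform [d0-d2, d1+d2, d2-d1, d3-d1] to each 4-element input tile (the Left/Right/Single variants are the same transform with the out-of-image samples zeroed, matching the padding of 1), and ConvDw3x3LineFp16 multiplies the two element-wise and reduces with [m0+m1+m2, m1-m2+m3], producing two output pixels per tile. The standalone scalar sketch below (plain C with float instead of float16_t; the file name and test values are invented for illustration) checks that these transforms reproduce a direct 3-tap convolution.

/* winograd_f23_check.c - hypothetical sanity check, not part of this change */
#include <stdio.h>

int main(void) {
  const float g[3] = {1.0f, 2.0f, 3.0f};        /* arbitrary 3-tap filter */
  const float d[4] = {4.0f, 5.0f, 6.0f, 7.0f};  /* 4 inputs -> 2 outputs */

  /* weight transform, as in PackWeightConvDw3x3Fp16 */
  const float gw[4] = {g[0], 0.5f * (g[0] + g[1] + g[2]), 0.5f * (g[0] - g[1] + g[2]), g[2]};
  /* input transform, as in ConvDw3x3RowMiddleFp16 */
  const float dt[4] = {d[0] - d[2], d[1] + d[2], d[2] - d[1], d[3] - d[1]};
  /* element-wise product of transformed weight and input */
  const float m[4] = {gw[0] * dt[0], gw[1] * dt[1], gw[2] * dt[2], gw[3] * dt[3]};
  /* output transform, as in ConvDw3x3LineFp16 */
  const float y0 = m[0] + m[1] + m[2];
  const float y1 = m[1] - m[2] + m[3];

  /* direct 3-tap convolution for reference */
  const float r0 = d[0] * g[0] + d[1] * g[1] + d[2] * g[2];
  const float r1 = d[1] * g[0] + d[2] * g[1] + d[3] * g[2];
  printf("winograd: %.1f %.1f\n", y0, y1);  /* 32.0 38.0 */
  printf("direct:   %.1f %.1f\n", r0, r1);  /* 32.0 38.0 */
  return 0;
}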