diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc index 7f29a101f0d..65a8c17662b 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc @@ -124,7 +124,7 @@ int DeConvFp16Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { } int DeConvolutionFp16CPUKernel::DoDeconv(int task_id) { - int oc = MSMIN(thread_stride_ * C8NUM, conv_param_->output_channel_ - task_id * thread_stride_ * C8NUM); + int oc = MSMIN(thread_stride_, UP_DIV(conv_param_->output_channel_, C8NUM) - task_id * thread_stride_); int oc_res = MSMIN(thread_stride_ * C8NUM, conv_param_->output_channel_ - task_id * thread_stride_ * C8NUM); if (oc <= 0) { return RET_OK; diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/PostFuncBiasReluC8.S b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/PostFuncBiasReluC8.S index f07e05f87f6..4a81030f118 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/PostFuncBiasReluC8.S +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/PostFuncBiasReluC8.S @@ -204,6 +204,9 @@ Loop_C1: beq End mov w13, w5 ld1 {v16.4s, v17.4s}, [x2], #32 + mov x25, #4 + mul x24, x10, x25 + add x0, x0, x24 cmp x4, #1 beq Loop_C1_1 diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/MatmulFp16.S b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/MatmulFp16.S index 09f408004d9..b96a397cff0 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/MatmulFp16.S +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/MatmulFp16.S @@ -863,7 +863,10 @@ End2: subs w7, w7, #8 // rhs col - 8 add x1, x1, x15 // rhs ptr + stride add x3, x3, #16 // bias ptr + stride + ldrb w13, [sp, #8] + cbz w13, NoDstStep add x2, x2, #16 // dst ptr + stride +NoDstStep: bgt L1 End1: diff --git 
a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/PostFuncBiasReluC8Fp16.S b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/PostFuncBiasReluC8Fp16.S index 491392386d0..6127435102b 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/PostFuncBiasReluC8Fp16.S +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/PostFuncBiasReluC8Fp16.S @@ -16,21 +16,20 @@ // v0 ~ v7 value // v16 bias data -// x24 x25 weite loop tmp buf +// x22 x23 x24 x25 write loop tmp buf // x26 relu6 #6; x27 relu #0 // w10 oc8 loop control // w13 hw loop control PostFuncBiasReluC8Fp16: - movi v26.8h, #6 - scvtf v26.8h, v26.8h + movi v26.8h, #0x46, lsl #8 dup v27.8h, wzr mov w10, #0 Loop_C8: cmp w10, w3 beq Loop_C1 - mov x25, #4 + mov x25, #2 mul x24, x10, x25 add x25, x0, x24 add w10, w10, #8 @@ -118,6 +117,7 @@ Write_4x8: st1 {v1.8h}, [x25], x6 st1 {v2.8h}, [x25], x6 st1 {v3.8h}, [x25], x6 + b Loop_4x8 Loop_1x8: cmp w7, #2 @@ -159,6 +159,9 @@ Loop_C1: beq End mov w13, w5 ld1 {v16.8h}, [x2], #16 + mov x25, #2 + mul x24, x10, x25 + add x22, x0, x24 cmp x4, #1 beq Loop_C1_1 @@ -189,7 +192,7 @@ Loop_C1_1_Relu6: fadd v0.8h, v0.8h, v16.8h fmin v0.8h, v0.8h, v26.8h fmax v0.8h, v0.8h, v27.8h - st1 {v1.h}[0], [x0], x6 + st1 {v0.h}[0], [x22], x6 b Loop_C1_1_Relu6 Loop_C1_1_Relu: cmp w13, #0 @@ -198,7 +201,7 @@ Loop_C1_1_Relu: ld1 {v0.8h}, [x1], #16 fadd v0.8h, v0.8h, v16.8h fmax v0.8h, v0.8h, v27.8h - st1 {v1.h}[0], [x0], x6 + st1 {v0.h}[0], [x22], x6 b Loop_C1_1_Relu Loop_C1_1_Write: cmp w13, #0 @@ -206,7 +209,7 @@ Loop_C1_1_Write: sub w13, w13, #1 ld1 {v0.8h}, [x1], #16 fadd v0.8h, v0.8h, v16.8h - st1 {v1.h}[0], [x0], x6 + st1 {v0.h}[0], [x22], x6 b Loop_C1_1_Write Loop_C1_2: @@ -224,8 +227,8 @@ Loop_C1_2_Relu6: fadd v0.8h, v0.8h, v16.8h fmin v0.8h, v0.8h, v26.8h fmax v0.8h, v0.8h, v27.8h - st1 {v1.h}[0], [x0], x6 - st1 {v1.h}[1], [x24], x6 + st1 {v0.h}[0], [x22], x6 + st1 {v0.h}[1], [x24], x6 b Loop_C1_2_Relu6 Loop_C1_2_Relu: cmp w13, #0 @@ -234,8 +237,8 @@ 
Loop_C1_2_Relu: ld1 {v0.8h}, [x1], #16 fadd v0.8h, v0.8h, v16.8h fmax v0.8h, v0.8h, v27.8h - st1 {v1.h}[0], [x0], x6 - st1 {v1.h}[1], [x24], x6 + st1 {v0.h}[0], [x22], x6 + st1 {v0.h}[1], [x24], x6 b Loop_C1_2_Relu Loop_C1_2_Write: cmp w13, #0 @@ -243,14 +246,14 @@ Loop_C1_2_Write: sub w13, w13, #1 ld1 {v0.8h}, [x1], #16 fadd v0.8h, v0.8h, v16.8h - st1 {v1.h}[0], [x0], x6 - st1 {v1.h}[1], [x24], x6 + st1 {v0.h}[0], [x22], x6 + st1 {v0.h}[1], [x24], x6 b Loop_C1_2_Write Loop_C1_3: - add x24, x0, #2 - add x25, x0, #4 + add x24, x22, #2 + add x25, x22, #4 cmp w7, #2 beq Loop_C1_3_Relu6 cmp w7, #1 @@ -264,9 +267,9 @@ Loop_C1_3_Relu6: fadd v0.8h, v0.8h, v16.8h fmin v0.8h, v0.8h, v26.8h fmax v0.8h, v0.8h, v27.8h - st1 {v1.h}[0], [x0], x6 - st1 {v1.h}[1], [x24], x6 - st1 {v1.h}[2], [x25], x6 + st1 {v0.h}[0], [x22], x6 + st1 {v0.h}[1], [x24], x6 + st1 {v0.h}[2], [x25], x6 b Loop_C1_3_Relu6 Loop_C1_3_Relu: cmp w13, #0 @@ -275,9 +278,9 @@ Loop_C1_3_Relu: ld1 {v0.8h}, [x1], #16 fadd v0.8h, v0.8h, v16.8h fmax v0.8h, v0.8h, v27.8h - st1 {v1.h}[0], [x0], x6 - st1 {v1.h}[1], [x24], x6 - st1 {v1.h}[2], [x25], x6 + st1 {v0.h}[0], [x22], x6 + st1 {v0.h}[1], [x24], x6 + st1 {v0.h}[2], [x25], x6 b Loop_C1_3_Relu Loop_C1_3_Write: cmp w13, #0 @@ -285,9 +288,9 @@ Loop_C1_3_Write: sub w13, w13, #1 ld1 {v0.8h}, [x1], #16 fadd v0.8h, v0.8h, v16.8h - st1 {v1.h}[0], [x0], x6 - st1 {v1.h}[1], [x24], x6 - st1 {v1.h}[2], [x25], x6 + st1 {v0.h}[0], [x22], x6 + st1 {v0.h}[1], [x24], x6 + st1 {v0.h}[2], [x25], x6 b Loop_C1_3_Write Loop_C1_4: @@ -304,7 +307,7 @@ Loop_C1_4_Relu6: fadd v0.8h, v0.8h, v16.8h fmin v0.8h, v0.8h, v26.8h fmax v0.8h, v0.8h, v27.8h - st1 {v0.4h}, [x0], x6 + st1 {v0.4h}, [x22], x6 b Loop_C1_4_Relu6 Loop_C1_4_Relu: cmp w13, #0 @@ -313,7 +316,7 @@ Loop_C1_4_Relu: ld1 {v0.8h}, [x1], #16 fadd v0.8h, v0.8h, v16.8h fmax v0.8h, v0.8h, v27.8h - st1 {v0.4h}, [x0], x6 - b Loop_C1_4_Relu6 + st1 {v0.4h}, [x22], x6 + b Loop_C1_4_Relu Loop_C1_4_Write: cmp w13, #0 @@ -321,11 +324,11 @@ Loop_C1_4_Write: sub w13, 
w13, #1 ld1 {v0.8h}, [x1], #16 fadd v0.8h, v0.8h, v16.8h - st1 {v0.4h}, [x0], x6 + st1 {v0.4h}, [x22], x6 b Loop_C1_4_Write Loop_C1_5: - add x25, x0, #16 + add x25, x22, #8 cmp w7, #2 beq Loop_C1_5_Relu6 cmp w7, #1 @@ -339,9 +342,8 @@ Loop_C1_5_Relu6: fadd v0.8h, v0.8h, v16.8h fmin v0.8h, v0.8h, v26.8h fmax v0.8h, v0.8h, v27.8h - st1 {v0.4h}, [x0], x6 - str h1, [x25] - add x25, x25, x6 + st1 {v0.4h}, [x22], x6 + st1 {v0.h}[4], [x25], x6 b Loop_C1_5_Relu6 Loop_C1_5_Relu: cmp w13, #0 @@ -350,9 +352,8 @@ Loop_C1_5_Relu: ld1 {v0.8h}, [x1], #16 fadd v0.8h, v0.8h, v16.8h fmax v0.8h, v0.8h, v27.8h - st1 {v0.4h}, [x0], x6 - str h1, [x25] - add x25, x25, x6 + st1 {v0.4h}, [x22], x6 + st1 {v0.h}[4], [x25], x6 b Loop_C1_5_Relu Loop_C1_5_Write: cmp w13, #0 @@ -360,14 +361,13 @@ Loop_C1_5_Write: sub w13, w13, #1 ld1 {v0.8h}, [x1], #16 fadd v0.8h, v0.8h, v16.8h - st1 {v0.4h}, [x0], x6 - str h1, [x25] - add x25, x25, x6 + st1 {v0.4h}, [x22], x6 + st1 {v0.h}[4], [x25], x6 b Loop_C1_5_Write Loop_C1_6: - add x23, x0, #8 - add x24, x0, #10 + add x23, x22, #8 + add x24, x22, #10 cmp w7, #2 beq Loop_C1_6_Relu6 cmp w7, #1 @@ -381,9 +381,9 @@ Loop_C1_6_Relu6: fadd v0.8h, v0.8h, v16.8h fmin v0.8h, v0.8h, v26.8h fmax v0.8h, v0.8h, v27.8h - st1 {v0.4h}, [x0], x6 - st1 {v1.h}[4], [x23], x6 - st1 {v1.h}[5], [x24], x6 + st1 {v0.4h}, [x22], x6 + st1 {v0.h}[4], [x23], x6 + st1 {v0.h}[5], [x24], x6 b Loop_C1_6_Relu6 Loop_C1_6_Relu: cmp w13, #0 @@ -392,9 +392,9 @@ Loop_C1_6_Relu: ld1 {v0.8h}, [x1], #16 fadd v0.8h, v0.8h, v16.8h fmax v0.8h, v0.8h, v27.8h - st1 {v0.4h}, [x0], x6 - st1 {v1.h}[4], [x23], x6 - st1 {v1.h}[5], [x24], x6 + st1 {v0.4h}, [x22], x6 + st1 {v0.h}[4], [x23], x6 + st1 {v0.h}[5], [x24], x6 b Loop_C1_6_Relu Loop_C1_6_Write: cmp w13, #0 @@ -402,15 +402,15 @@ Loop_C1_6_Write: sub w13, w13, #1 ld1 {v0.8h}, [x1], #16 fadd v0.8h, v0.8h, v16.8h - st1 {v0.4h}, [x0], x6 - st1 {v1.h}[4], [x23], x6 - st1 {v1.h}[5], [x24], x6 + st1 {v0.4h}, [x22], x6 + st1 {v0.h}[4], [x23], x6 + st1 
{v0.h}[5], [x24], x6 b Loop_C1_6_Write Loop_C1_7: - add x23, x0, #8 - add x24, x0, #10 - add x25, x0, #12 + add x23, x22, #8 + add x24, x22, #10 + add x25, x22, #12 cmp w7, #2 beq Loop_C1_7_Relu6 cmp w7, #1 @@ -424,10 +424,10 @@ Loop_C1_7_Relu6: fadd v0.8h, v0.8h, v16.8h fmin v0.8h, v0.8h, v26.8h fmax v0.8h, v0.8h, v27.8h - st1 {v0.4h}, [x0], x6 - st1 {v1.h}[4], [x23], x6 - st1 {v1.h}[5], [x24], x6 - st1 {v1.h}[6], [x25], x6 + st1 {v0.4h}, [x22], x6 + st1 {v0.h}[4], [x23], x6 + st1 {v0.h}[5], [x24], x6 + st1 {v0.h}[6], [x25], x6 b Loop_C1_7_Relu6 Loop_C1_7_Relu: cmp w13, #0 @@ -436,10 +436,10 @@ Loop_C1_7_Relu: ld1 {v0.8h}, [x1], #16 fadd v0.8h, v0.8h, v16.8h fmax v0.8h, v0.8h, v27.8h - st1 {v0.4h}, [x0], x6 - st1 {v1.h}[4], [x23], x6 - st1 {v1.h}[5], [x24], x6 - st1 {v1.h}[6], [x25], x6 + st1 {v0.4h}, [x22], x6 + st1 {v0.h}[4], [x23], x6 + st1 {v0.h}[5], [x24], x6 + st1 {v0.h}[6], [x25], x6 b Loop_C1_7_Relu Loop_C1_7_Write: cmp w13, #0 @@ -447,11 +447,10 @@ Loop_C1_7_Write: sub w13, w13, #1 ld1 {v0.8h}, [x1], #16 fadd v0.8h, v0.8h, v16.8h - fmax v0.8h, v0.8h, v27.8h - st1 {v0.4h}, [x0], x6 - st1 {v1.h}[4], [x23], x6 - st1 {v1.h}[5], [x24], x6 - st1 {v1.h}[6], [x25], x6 + st1 {v0.4h}, [x22], x6 + st1 {v0.h}[4], [x23], x6 + st1 {v0.h}[5], [x24], x6 + st1 {v0.h}[6], [x25], x6 b Loop_C1_7_Write End: diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/deconv_fp16.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/deconv_fp16.c index 050304a0ef7..63262197174 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/deconv_fp16.c +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/deconv_fp16.c @@ -37,16 +37,12 @@ void PostConvFuncCommFp16(float16_t *out_ptr, const float16_t *src_ptr_, const f void PostConvFuncFp16C8(const float16_t *c8_out_ptr, float16_t *out_ptr, const float16_t *bias_ptr, size_t output_channel, size_t plane_size, size_t stride, bool is_relu, bool is_relu6) { -#ifdef DEBUG_CODE - PostConvFuncCommFp16(out_ptr, c8_out_ptr, bias_ptr, 
output_channel, plane_size, stride, is_relu, is_relu6, C8NUM); -#else size_t oc8mod = output_channel % C8NUM; size_t oc8div = output_channel - oc8mod; size_t stride_size = stride * sizeof(float16_t); size_t relu_type = is_relu ? 1 : 0; relu_type = is_relu6 ? 2 : relu_type; PostFuncBiasReluC8Fp16(out_ptr, c8_out_ptr, bias_ptr, oc8div, oc8mod, plane_size, stride_size, relu_type); -#endif return; } diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/matmul_fp16.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/matmul_fp16.c index b59cba408d1..7d0b785fd26 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/matmul_fp16.c +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/matmul_fp16.c @@ -74,7 +74,6 @@ void MatMul16x8(const float16_t *a, const float16_t *b, float16_t *dst, const fl void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type, int depth, int row, int col, int stride, bool write_nhwc) { MatmulFp16Neon64(a, b, c, bias, (int)act_type, depth, row, col, stride, write_nhwc); - // MatMul16x8(a, b, c, bias, (int)act_type, depth, row, col, stride, write_nhwc); return; }