From 99cc44d25c2158bdff36bf7af51ea0125e674ac3 Mon Sep 17 00:00:00 2001
From: lixian <179220644@qq.com>
Date: Mon, 24 Aug 2020 09:50:59 +0800
Subject: [PATCH] add C4 output support for fp32 matmul kernel

The trailing bool write_nhwc parameter of MatMulOpt/MatmulFloatNeon64Opt
becomes two size_t flags (writeNhwc, writeC4), and the arm64 kernel gains
a WriteC4 store path so the 12x8 result tile can be written directly in
C4 (NC4HW4-style) layout.
---
 .../lite/nnacl/assembly/arm64/MatmulFp32Opt.S | 50 ++++++++++++++++---
 mindspore/lite/nnacl/fp32/matmul.c            | 10 ++--
 mindspore/lite/nnacl/fp32/matmul.h            |  4 +-
 .../kernel/arm/fp32/convolution_1x1.cc        |  2 +-
 4 files changed, 50 insertions(+), 16 deletions(-)

diff --git a/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S b/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S
index c1d3f4498c0..7ecb6ad3058 100644
--- a/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S
+++ b/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S
@@ -47,7 +47,7 @@
 /////////////////////////////////////////////////////////////////////////////////
 //
 // void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth
-//                        int row, int col, int stride, bool write_nhwc)
+//                        int row, int col, size_t stride, size_t writeNhwc, size_t writeC4)
 // x0: a
 // x1: b
 // x2: c
@@ -57,7 +57,7 @@
 // w6: row
 // w7: col
 // w17: stride
-// w13: writeC8
+// w13: c8_nhwc_c4
 
 MatmulFloatNeon64Opt:
     sub sp, sp, #128
@@ -209,8 +209,8 @@ Activation:
     b Write
 
 Relu6:
-    mov w8, #6
-    dup v2.4s, w8
+    mov w13, #6
+    dup v2.4s, w13
     scvtf v2.4s, v2.4s
     fmin v8.4s, v8.4s, v2.4s
     fmin v9.4s, v9.4s, v2.4s
@@ -265,8 +265,10 @@ Relu:
     fmax v31.4s, v31.4s, v3.4s
 
 Write:
-    ldrb w13, [sp, #8]
-    cbz w13, WriteC8
+    ldr w8, [sp, #8]
+    cbz w8, WriteC8
+    ldr w8, [sp, #16]
+    cbnz w8, WriteC4
     cmp w7, #1
     beq Write1
     cmp w7, #2
@@ -726,6 +728,33 @@ WriteC8:
     st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x2], #64
     st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x2], #64
     b WriteEnd
+WriteC4:
+    st1 {v8.8h}, [x2], #16
+    st1 {v10.8h}, [x2], #16
+    st1 {v12.8h}, [x2], #16
+    st1 {v14.8h}, [x2], #16
+    st1 {v16.8h}, [x2], #16
+    st1 {v18.8h}, [x2], #16
+    st1 {v20.8h}, [x2], #16
+    st1 {v22.8h}, [x2], #16
+    st1 {v24.8h}, [x2], #16
+    st1 {v26.8h}, [x2], #16
+    st1 {v28.8h}, [x2], #16
+    st1 {v30.8h}, [x2], #16
+    add x18, x2, x17
+    st1 {v9.8h}, [x18], #16
+    st1 {v11.8h}, [x18], #16
+    st1 {v13.8h}, [x18], #16
+    st1 {v15.8h}, [x18], #16
+    st1 {v17.8h}, [x18], #16
+    st1 {v19.8h}, [x18], #16
+    st1 {v21.8h}, [x18], #16
+    st1 {v23.8h}, [x18], #16
+    st1 {v25.8h}, [x18], #16
+    st1 {v27.8h}, [x18], #16
+    st1 {v29.8h}, [x18], #16
+    st1 {v31.8h}, [x18], #16
+    b WriteEnd
 Write8:
     st1 {v8.4s, v9.4s}, [x18], x17
     cmp w10, #1
@@ -770,9 +799,14 @@ End2:
     subs w7, w7, #8 // rhs col - 8
     add x1, x1, x15 // rhs ptr + stride
     add x3, x3, #32 // bias ptr + stride
-    ldrb w13, [sp, #8]
-    cbz w13, NoDstStep
+    ldr w8, [sp, #8]
+    cbz w8, NoDstStep
+    ldr w8, [sp, #16]
+    cbnz w8, C4DstStep
     add x2, x2, #32 // dst ptr + stride
+    b NoDstStep
+C4DstStep:
+    add x2, x2, x17
 NoDstStep:
     bgt L1
 
diff --git a/mindspore/lite/nnacl/fp32/matmul.c b/mindspore/lite/nnacl/fp32/matmul.c
index 97ff005637a..0caf28bb6f8 100644
--- a/mindspore/lite/nnacl/fp32/matmul.c
+++ b/mindspore/lite/nnacl/fp32/matmul.c
@@ -370,8 +370,8 @@ void MatMul8x8(const float *a, const float *b, float *dst, const float *bias, Ac
 }
 
 void MatMul12x8(const float *a, const float *b, float *dst, const float *bias, ActType act_type, int deep, int row,
-                int col, int stride, bool write_nhwc) {
-  if (write_nhwc) {
+                int col, size_t stride, size_t writeNhwc, size_t writeC4) {
+  if (writeNhwc != 0) {
     /* col8-major * row8-major => col-major */
     for (int r = 0; r < row; r++) {
       for (int c = 0; c < col; c++) {
@@ -404,10 +404,10 @@ void MatMul(const float *a, const float *b, float *c, const float *bias, ActType
 }
 
 void MatMulOpt(const float *a, const float *b, float *c, const float *bias, ActType act_type, int deep, int row,
-               int col, int stride, bool write_nhwc) {
+               int col, size_t stride, size_t writeNhwc, size_t writeC4) {
 #ifdef ENABLE_ARM64
-  MatmulFloatNeon64Opt(a, b, c, bias, (int)act_type, deep, row, col, stride, write_nhwc);
+  MatmulFloatNeon64Opt(a, b, c, bias, (int)act_type, deep, row, col, stride, writeNhwc, writeC4);
 #else
-  MatMul12x8(a, b, c, bias, act_type, deep, row, col, stride, write_nhwc);
+  MatMul12x8(a, b, c, bias, act_type, deep, row, col, stride, writeNhwc, writeC4);
 #endif
 }
diff --git a/mindspore/lite/nnacl/fp32/matmul.h b/mindspore/lite/nnacl/fp32/matmul.h
index 4bc74c1b3a3..9b47ca8f4f4 100644
--- a/mindspore/lite/nnacl/fp32/matmul.h
+++ b/mindspore/lite/nnacl/fp32/matmul.h
@@ -29,7 +29,7 @@ extern "C" {
 void MatMul(const float *a, const float *b, float *c, const float *bias, ActType act_type, int depth, int row,
             int col, int stride, bool write_nhwc);
 void MatMulOpt(const float *a, const float *b, float *c, const float *bias, ActType act_type, int depth, int row,
-               int col, int stride, bool write_nhwc);
+               int col, size_t stride, size_t writeNhwc, size_t writeC4);
 void RowMajor2Row8Major(float *src_ptr, float *dst_ptr, int row, int col);
 void RowMajor2Col8Major(float *src_ptr, float *dst_ptr, size_t row, size_t col);
 void RowMajor2Col12Major(float *src_ptr, float *dst_ptr, size_t row, size_t col);
@@ -38,7 +38,7 @@ void Row8x8Major2RowMajor(float *src_ptr, float *dst_ptr, size_t row, size_t col
 void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row,
                        int col, size_t stride, bool write_nhwc);
 void MatmulFloatNeon64Opt(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row,
-                          int col, size_t stride, bool write_nhwc);
+                          int col, size_t stride, size_t writeNhwc, size_t writeC4);
 #endif
 #ifdef __cplusplus
 }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc
index 798488a1bd5..617e1d998b7 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc
@@ -145,7 +145,7 @@ int Convolution1x1CPUKernel::DoConv1x1(int task_id) {
 
   MatMulOpt(pack_input_, weight_ptr_ + task_id * thread_stride_ * matmul_param_->deep_,
             output_ptr_ + task_id * thread_stride_, bias, matmul_param_->act_type_, matmul_param_->deep_,
-            matmul_param_->row_, cur_oc, matmul_param_->col_, true);
+            matmul_param_->row_, cur_oc, matmul_param_->col_, 1, 0);
   return RET_OK;
 }
 
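--
Reviewer note on the new WriteC4 path: each st1 {vN.8h}, [...], #16 above is
simply a 16-byte store (four fp32 values; the .8h arrangement only names the
register width, the data is float). The 12x8 result tile lives in v8-v31 with
two registers per row (v8/v9 = row 0, v10/v11 = row 1, ..., as the Write8 path
shows), so the even registers carry output channels 0-3 of each row and the
odd registers channels 4-7. WriteC4 therefore emits two C4 planes: channels
0-3 of all twelve rows go to x2 back to back, channels 4-7 to x2 + x17 (the
stride argument, already scaled to bytes where it is used for addressing).
A minimal C sketch of that store pattern; WriteTileC4, TILE_ROW and
stride_bytes are illustrative names, not part of the patch:

  #include <stddef.h>
  #include <string.h>

  #define C4NUM 4     /* floats per C4 channel group */
  #define TILE_ROW 12 /* rows produced by one kernel tile */

  /* Scatter one 12x8 tile into two C4 planes: columns 0-3 of every row
   * are stored contiguously at dst (one 16-byte store per row, like
   * st1 {vN.8h}, [x2], #16), and columns 4-7 at dst + stride_bytes
   * (like st1 {vM.8h}, [x18], #16 after add x18, x2, x17). */
  static void WriteTileC4(const float tile[TILE_ROW][2 * C4NUM],
                          float *dst, size_t stride_bytes) {
    float *lo = dst;
    float *hi = (float *)((char *)dst + stride_bytes);
    for (int r = 0; r < TILE_ROW; ++r) {
      memcpy(lo + r * C4NUM, &tile[r][0], C4NUM * sizeof(float));
      memcpy(hi + r * C4NUM, &tile[r][C4NUM], C4NUM * sizeof(float));
    }
  }

After a full block of 8 output channels is written, C4DstStep advances the
dst pointer by one stride (add x2, x2, x17) rather than the NHWC path's
fixed 32 bytes, so the next 8 channels land in the next pair of C4 planes.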
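On the flag semantics: the Write dispatch loads writeNhwc from [sp, #8] and
writeC4 from [sp, #16], and writeC4 is only consulted when writeNhwc is
nonzero (writeNhwc == 0 short-circuits to the packed WriteC8 path). A
hypothetical C mirror of that branch order, for reference only:

  #include <stddef.h>

  typedef enum { WRITE_C8, WRITE_NHWC, WRITE_C4 } WriteMode;

  /* Same check order as the assembly's Write label:
   *   ldr w8, [sp, #8]  ; cbz  w8, WriteC8
   *   ldr w8, [sp, #16] ; cbnz w8, WriteC4  */
  static WriteMode SelectWriteMode(size_t writeNhwc, size_t writeC4) {
    if (writeNhwc == 0) return WRITE_C8;
    if (writeC4 != 0) return WRITE_C4;
    return WRITE_NHWC;
  }

Existing callers keep the old write_nhwc=true behavior by passing 1, 0, as
the Convolution1x1CPUKernel::DoConv1x1 call above now does; a C4 consumer
would be expected to pass a nonzero value for both flags.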