From d83668bcca4917504987953e2834365fe1ad93ea Mon Sep 17 00:00:00 2001 From: yangruoqi713 Date: Wed, 24 Feb 2021 15:04:28 +0800 Subject: [PATCH] [MSLITE][Develop] optimize cpu arm64 fp32 lstm --- .../lite/nnacl/assembly/arm64/MatVecMulFp32.S | 56 +++++++++++++++---- mindspore/lite/nnacl/fp32/lstm_fp32.c | 3 +- 2 files changed, 47 insertions(+), 12 deletions(-) diff --git a/mindspore/lite/nnacl/assembly/arm64/MatVecMulFp32.S b/mindspore/lite/nnacl/assembly/arm64/MatVecMulFp32.S index 228dc502457..36383dfb300 100644 --- a/mindspore/lite/nnacl/assembly/arm64/MatVecMulFp32.S +++ b/mindspore/lite/nnacl/assembly/arm64/MatVecMulFp32.S @@ -46,42 +46,78 @@ Loop1x4: Depth8_1x4: cmp w9, #8 blt Depth4_1x4 - + sub w9, w9, #8 ld1 {v0.4s, v1.4s}, [x15], #32 ld1 {v2.4s, v3.4s}, [x7], #32 ld1 {v4.4s, v5.4s}, [x10], #32 + cmp w9, #8 + blt Depth8_1x4_Loop_End + +Depth8_1x4_Loop: fmla v10.4s, v0.4s, v2.4s fmla v10.4s, v1.4s, v3.4s + ld1 {v6.4s, v7.4s}, [x11], #32 fmla v11.4s, v0.4s, v4.4s fmla v11.4s, v1.4s, v5.4s + ld1 {v8.4s, v9.4s}, [x12], #32 + fmla v12.4s, v0.4s, v6.4s + fmla v12.4s, v1.4s, v7.4s + ld1 {v2.4s, v3.4s}, [x7], #32 + fmla v13.4s, v0.4s, v8.4s + fmla v13.4s, v1.4s, v9.4s + ld1 {v0.4s, v1.4s}, [x15], #32 + ld1 {v4.4s, v5.4s}, [x10], #32 + sub w9, w9, #8 + cmp w9, #8 + bge Depth8_1x4_Loop + +Depth8_1x4_Loop_End: + fmla v10.4s, v0.4s, v2.4s + fmla v10.4s, v1.4s, v3.4s ld1 {v6.4s, v7.4s}, [x11], #32 + fmla v11.4s, v0.4s, v4.4s + fmla v11.4s, v1.4s, v5.4s ld1 {v8.4s, v9.4s}, [x12], #32 fmla v12.4s, v0.4s, v6.4s fmla v12.4s, v1.4s, v7.4s fmla v13.4s, v0.4s, v8.4s fmla v13.4s, v1.4s, v9.4s - sub w9, w9, #8 - cbz w9, End1x4 - b Depth8_1x4 Depth4_1x4: cmp w9, #4 blt Depth1_1x4 - + sub w9, w9, #4 ld1 {v0.4s}, [x15], #16 ld1 {v1.4s}, [x7], #16 ld1 {v2.4s}, [x10], #16 - ld1 {v3.4s}, [x11], #16 - ld1 {v4.4s}, [x12], #16 + cmp w9, #4 + blt Depth4_1x4_Loop_End + +Depth4_1x4_Loop: fmla v10.4s, v1.4s, v0.4s + ld1 {v3.4s}, [x11], #16 fmla v11.4s, v2.4s, v0.4s + ld1 {v4.4s}, [x12], #16 + fmla v12.4s, v3.4s, v0.4s + ld1 {v1.4s}, [x7], #16 + fmla v13.4s, v4.4s, v0.4s + ld1 {v0.4s}, [x15], #16 + ld1 {v2.4s}, [x10], #16 + sub w9, w9, #4 + cmp w9, #4 + bge Depth4_1x4_Loop + +Depth4_1x4_Loop_End: + fmla v10.4s, v1.4s, v0.4s + ld1 {v3.4s}, [x11], #16 + fmla v11.4s, v2.4s, v0.4s + ld1 {v4.4s}, [x12], #16 fmla v12.4s, v3.4s, v0.4s fmla v13.4s, v4.4s, v0.4s - sub w9, w9, #4 - cbz w9, End1x4 - b Depth8_1x4 Depth1_1x4: + cmp w9, #0 + beq End1x4 ld1 {v0.s}[0], [x15], #4 ld1 {v1.s}[0], [x7], #4 ld1 {v1.s}[1], [x10], #4 diff --git a/mindspore/lite/nnacl/fp32/lstm_fp32.c b/mindspore/lite/nnacl/fp32/lstm_fp32.c index 85bbe7a6488..276375c8636 100644 --- a/mindspore/lite/nnacl/fp32/lstm_fp32.c +++ b/mindspore/lite/nnacl/fp32/lstm_fp32.c @@ -78,8 +78,7 @@ void MatMulAcc(float *output, const float *input, const float *weight, int rows, void LstmMatMul(float *c, const float *a, const float *b, const float *bias, int row, int deep, int col, bool is_vec) { if (is_vec) { - memcpy(c, bias, col * sizeof(float)); - MatMulAcc(c, a, b, row, col, deep); + MatVecMulFp32(a, b, c, bias, ActType_No, deep, col); } else { MatMulOpt(a, b, c, bias, ActType_No, deep, row, col, col, OutType_Nhwc); }