From d83668bcca4917504987953e2834365fe1ad93ea Mon Sep 17 00:00:00 2001
From: yangruoqi713 <yangruoqi@huawei.com>
Date: Wed, 24 Feb 2021 15:04:28 +0800
Subject: [PATCH] [MSLITE][Develop] optimize cpu arm64 fp32 lstm

---
 .../lite/nnacl/assembly/arm64/MatVecMulFp32.S | 56 +++++++++++++++----
 mindspore/lite/nnacl/fp32/lstm_fp32.c         |  3 +-
 2 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/mindspore/lite/nnacl/assembly/arm64/MatVecMulFp32.S b/mindspore/lite/nnacl/assembly/arm64/MatVecMulFp32.S
index 228dc502457..36383dfb300 100644
--- a/mindspore/lite/nnacl/assembly/arm64/MatVecMulFp32.S
+++ b/mindspore/lite/nnacl/assembly/arm64/MatVecMulFp32.S
@@ -46,42 +46,78 @@ Loop1x4:
 Depth8_1x4:
   cmp w9, #8
   blt Depth4_1x4
-
+  sub w9, w9, #8
   ld1 {v0.4s, v1.4s}, [x15], #32
   ld1 {v2.4s, v3.4s}, [x7], #32
   ld1 {v4.4s, v5.4s}, [x10], #32
+  cmp w9, #8
+  blt Depth8_1x4_Loop_End
+
+Depth8_1x4_Loop:
   fmla v10.4s, v0.4s, v2.4s
   fmla v10.4s, v1.4s, v3.4s
+  ld1 {v6.4s, v7.4s}, [x11], #32
   fmla v11.4s, v0.4s, v4.4s
   fmla v11.4s, v1.4s, v5.4s
+  ld1 {v8.4s, v9.4s}, [x12], #32
+  fmla v12.4s, v0.4s, v6.4s
+  fmla v12.4s, v1.4s, v7.4s
+  ld1 {v2.4s, v3.4s}, [x7], #32
+  fmla v13.4s, v0.4s, v8.4s
+  fmla v13.4s, v1.4s, v9.4s
+  ld1 {v0.4s, v1.4s}, [x15], #32
+  ld1 {v4.4s, v5.4s}, [x10], #32
+  sub w9, w9, #8
+  cmp w9, #8
+  bge Depth8_1x4_Loop
+
+Depth8_1x4_Loop_End:
+  fmla v10.4s, v0.4s, v2.4s
+  fmla v10.4s, v1.4s, v3.4s
   ld1 {v6.4s, v7.4s}, [x11], #32
+  fmla v11.4s, v0.4s, v4.4s
+  fmla v11.4s, v1.4s, v5.4s
   ld1 {v8.4s, v9.4s}, [x12], #32
   fmla v12.4s, v0.4s, v6.4s
   fmla v12.4s, v1.4s, v7.4s
   fmla v13.4s, v0.4s, v8.4s
   fmla v13.4s, v1.4s, v9.4s
-  sub w9, w9, #8
-  cbz w9, End1x4
-  b Depth8_1x4
 
 Depth4_1x4:
   cmp w9, #4
   blt Depth1_1x4
-
+  sub w9, w9, #4
   ld1 {v0.4s}, [x15], #16
   ld1 {v1.4s}, [x7], #16
   ld1 {v2.4s}, [x10], #16
-  ld1 {v3.4s}, [x11], #16
-  ld1 {v4.4s}, [x12], #16
+  cmp w9, #4
+  blt Depth4_1x4_Loop_End
+
+Depth4_1x4_Loop:
   fmla v10.4s, v1.4s, v0.4s
+  ld1 {v3.4s}, [x11], #16
   fmla v11.4s, v2.4s, v0.4s
+  ld1 {v4.4s}, [x12], #16
+  fmla v12.4s, v3.4s, v0.4s
+  ld1 {v1.4s}, [x7], #16
+  fmla v13.4s, v4.4s, v0.4s
+  ld1 {v0.4s}, [x15], #16
+  ld1 {v2.4s}, [x10], #16
+  sub w9, w9, #4
+  cmp w9, #4
+  bge Depth4_1x4_Loop
+
+Depth4_1x4_Loop_End:
+  fmla v10.4s, v1.4s, v0.4s
+  ld1 {v3.4s}, [x11], #16
+  fmla v11.4s, v2.4s, v0.4s
+  ld1 {v4.4s}, [x12], #16
   fmla v12.4s, v3.4s, v0.4s
   fmla v13.4s, v4.4s, v0.4s
-  sub w9, w9, #4
-  cbz w9, End1x4
-  b Depth8_1x4
 
 Depth1_1x4:
+  cmp w9, #0
+  beq End1x4
   ld1 {v0.s}[0], [x15], #4
   ld1 {v1.s}[0], [x7], #4
   ld1 {v1.s}[1], [x10], #4
diff --git a/mindspore/lite/nnacl/fp32/lstm_fp32.c b/mindspore/lite/nnacl/fp32/lstm_fp32.c
index 85bbe7a6488..276375c8636 100644
--- a/mindspore/lite/nnacl/fp32/lstm_fp32.c
+++ b/mindspore/lite/nnacl/fp32/lstm_fp32.c
@@ -78,8 +78,7 @@ void MatMulAcc(float *output, const float *input, const float *weight, int rows,
 
 void LstmMatMul(float *c, const float *a, const float *b, const float *bias, int row, int deep, int col, bool is_vec) {
   if (is_vec) {
-    memcpy(c, bias, col * sizeof(float));
-    MatMulAcc(c, a, b, row, col, deep);
+    MatVecMulFp32(a, b, c, bias, ActType_No, deep, col);
   } else {
     MatMulOpt(a, b, c, bias, ActType_No, deep, row, col, col, OutType_Nhwc);
   }