diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32Opt.S b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32Opt.S index 7dda0cfa8e4..999ae943bbb 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32Opt.S +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32Opt.S @@ -1345,239 +1345,162 @@ LoopRow4: st1 {v30.4s}, [x11], #16 b WriteEnd C4Write5: - add x19, x11, #16 - st1 {v8.4s}, [x11] - add x11, x11, #20 - str s9, [x19] - add x19, x19, #20 + add x19, x11, x8 + st1 {v8.4s}, [x11], #16 + str s9, [x19], #4 cmp x6, #1 beq WriteEnd - - st1 {v10.4s}, [x11] - add x11, x11, #20 - str s11, [x19] - add x19, x19, #20 + st1 {v10.4s}, [x11], #16 + str s11, [x19], #4 cmp x6, #2 beq WriteEnd - - st1 {v12.4s}, [x11] - add x11, x11, #20 - str s13, [x19] - add x19, x19, #20 + st1 {v12.4s}, [x11], #16 + str s13, [x19], #4 cmp x6, #3 beq WriteEnd - - st1 {v14.4s}, [x11] - add x11, x11, #20 - str s15, [x19] - add x19, x19, #20 + st1 {v14.4s}, [x11], #16 + str s15, [x19], #4 cmp x6, #4 beq WriteEnd - - st1 {v16.4s}, [x11] - add x11, x11, #20 - str s17, [x19] - add x19, x19, #20 + st1 {v16.4s}, [x11], #16 + str s17, [x19], #4 cmp x6, #5 beq WriteEnd - - st1 {v18.4s}, [x11] - add x11, x11, #20 - str s19, [x19] - add x19, x19, #20 + st1 {v18.4s}, [x11], #16 + str s19, [x19], #4 cmp x6, #6 beq WriteEnd - - st1 {v20.4s}, [x11] - add x11, x11, #20 - str s21, [x19] - add x19, x19, #20 + st1 {v20.4s}, [x11], #16 + str s21, [x19], #4 cmp x6, #7 beq WriteEnd - - st1 {v22.4s}, [x11] - add x11, x11, #20 - str s23, [x19] - add x19, x19, #20 + st1 {v22.4s}, [x11], #16 + str s23, [x19], #4 cmp x6, #8 beq WriteEnd - - st1 {v24.4s}, [x11] - add x11, x11, #20 - str s25, [x19] - add x19, x19, #20 + st1 {v24.4s}, [x11], #16 + str s25, [x19], #4 cmp x6, #9 beq WriteEnd - - st1 {v26.4s}, [x11] - add x11, x11, #20 - str s27, [x19] - add x19, x19, #20 + st1 {v26.4s}, [x11], #16 + str s27, [x19], #4 cmp x6, #10 beq WriteEnd - - st1 {v28.4s}, [x11] - add x11, x11, #20 - str s29, [x19] - add x19, x19, #20 + st1 {v28.4s}, [x11], #16 + str s29, [x19], #4 cmp x6, #11 beq WriteEnd - - st1 {v30.4s}, [x11] - str s31, [x19] + st1 {v30.4s}, [x11], #16 + str s31, [x19], #4 b WriteEnd C4Write6: - add x19, x11, #16 - st1 {v8.4s}, [x11] - add x11, x11, #24 - st1 {v9.2s}, [x19] - add x19, x19, #24 + add x19, x11, x8 + st1 {v8.4s}, [x11], #16 + st1 {v9.2s}, [x19], #8 cmp x6, #1 beq WriteEnd - - st1 {v10.4s}, [x11] - add x11, x11, #24 - st1 {v11.2s}, [x19] - add x19, x19, #24 + st1 {v10.4s}, [x11], #16 + st1 {v11.2s}, [x19], #8 cmp x6, #2 beq WriteEnd - - st1 {v12.4s}, [x11] - add x11, x11, #24 - st1 {v13.2s}, [x19] - add x19, x19, #24 + st1 {v12.4s}, [x11], #16 + st1 {v13.2s}, [x19], #8 cmp x6, #3 beq WriteEnd - - st1 {v14.4s}, [x11] - add x11, x11, #24 - st1 {v15.2s}, [x19] - add x19, x19, #24 + st1 {v14.4s}, [x11], #16 + st1 {v15.2s}, [x19], #8 cmp x6, #4 beq WriteEnd - - st1 {v16.4s}, [x11] - add x11, x11, #24 - st1 {v17.2s}, [x19] - add x19, x19, #24 + st1 {v16.4s}, [x11], #16 + st1 {v17.2s}, [x19], #8 cmp x6, #5 beq WriteEnd - - st1 {v18.4s}, [x11] - add x11, x11, #24 - st1 {v19.2s}, [x19] - add x19, x19, #24 + st1 {v18.4s}, [x11], #16 + st1 {v19.2s}, [x19], #8 cmp x6, #6 beq WriteEnd - - st1 {v20.4s}, [x11] - add x11, x11, #24 - st1 {v21.2s}, [x19] - add x19, x19, #24 + st1 {v20.4s}, [x11], #16 + st1 {v21.2s}, [x19], #8 cmp x6, #7 beq WriteEnd - - st1 {v22.4s}, [x11] - add x11, x11, #24 - st1 {v23.2s}, [x19] - add x19, x19, #24 + st1 {v22.4s}, [x11], #16 + st1 {v23.2s}, [x19], #8 cmp x6, #8 beq WriteEnd - - st1 {v24.4s}, [x11] - add x11, x11, #24 - st1 {v25.2s}, [x19] - add x19, x19, #24 + st1 {v24.4s}, [x11], #16 + st1 {v25.2s}, [x19], #8 cmp x6, #9 beq WriteEnd - - st1 {v26.4s}, [x11] - add x11, x11, #24 - st1 {v27.2s}, [x19] - add x19, x19, #24 + st1 {v26.4s}, [x11], #16 + st1 {v27.2s}, [x19], #8 cmp x6, #10 beq WriteEnd - - st1 {v28.4s}, [x11] - add x11, x11, #24 - st1 {v29.2s}, [x19] - add x19, x19, #24 + st1 {v28.4s}, [x11], #16 + st1 {v29.2s}, [x19], #8 cmp x6, #11 beq WriteEnd - - st1 {v30.4s}, [x11] - st1 {v31.2s}, [x19] + st1 {v30.4s}, [x11], #16 + st1 {v31.2s}, [x19], #8 b WriteEnd C4Write7: - add x19, x11, #16 - add x16, x11, #24 - mov x10, #28 - st1 {v8.4s}, [x11], x10 - st1 {v9.2s}, [x19], x10 - st1 {v9.s}[2], [x16], x10 + add x19, x11, x8 + add x16, x19, #8 + mov x15, #12 + st1 {v8.4s}, [x11], #16 + st1 {v9.2s}, [x19], x15 + st1 {v9.s}[2], [x16], x15 cmp x6, #1 beq WriteEnd - - st1 {v10.4s}, [x11], x10 - st1 {v11.2s}, [x19], x10 - st1 {v11.s}[2], [x16], x10 + st1 {v10.4s}, [x11], #16 + st1 {v11.2s}, [x19], x15 + st1 {v11.s}[2], [x16], x15 cmp x6, #2 beq WriteEnd - - st1 {v12.4s}, [x11], x10 - st1 {v13.2s}, [x19], x10 - st1 {v13.s}[2], [x16], x10 + st1 {v12.4s}, [x11], #16 + st1 {v13.2s}, [x19], x15 + st1 {v13.s}[2], [x16], x15 cmp x6, #3 beq WriteEnd - - st1 {v14.4s}, [x11], x10 - st1 {v15.2s}, [x19], x10 - st1 {v15.s}[2], [x16], x10 + st1 {v14.4s}, [x11], #16 + st1 {v15.2s}, [x19], x15 + st1 {v15.s}[2], [x16], x15 cmp x6, #4 beq WriteEnd - - st1 {v16.4s}, [x11], x10 - st1 {v17.2s}, [x19], x10 - st1 {v17.s}[2], [x16], x10 + st1 {v16.4s}, [x11], #16 + st1 {v17.2s}, [x19], x15 + st1 {v17.s}[2], [x16], x15 cmp x6, #5 beq WriteEnd - - st1 {v18.4s}, [x11], x10 - st1 {v19.2s}, [x19], x10 - st1 {v19.s}[2], [x16], x10 + st1 {v18.4s}, [x11], #16 + st1 {v19.2s}, [x19], x15 + st1 {v19.s}[2], [x16], x15 cmp x6, #6 beq WriteEnd - - st1 {v20.4s}, [x11], x10 - st1 {v21.2s}, [x19], x10 - st1 {v21.s}[2], [x16], x10 + st1 {v20.4s}, [x11], #16 + st1 {v21.2s}, [x19], x15 + st1 {v21.s}[2], [x16], x15 cmp x6, #7 beq WriteEnd - - st1 {v22.4s}, [x11], x10 - st1 {v23.2s}, [x19], x10 - st1 {v23.s}[2], [x16], x10 + st1 {v22.4s}, [x11], #16 + st1 {v23.2s}, [x19], x15 + st1 {v23.s}[2], [x16], x15 cmp x6, #8 beq WriteEnd - - st1 {v24.4s}, [x11], x10 - st1 {v25.2s}, [x19], x10 - st1 {v25.s}[2], [x16], x10 + st1 {v24.4s}, [x11], #16 + st1 {v25.2s}, [x19], x15 + st1 {v25.s}[2], [x16], x15 cmp x6, #9 beq WriteEnd - - st1 {v26.4s}, [x11], x10 - st1 {v27.2s}, [x19], x10 - st1 {v27.s}[2], [x16], x10 + st1 {v26.4s}, [x11], #16 + st1 {v27.2s}, [x19], x15 + st1 {v27.s}[2], [x16], x15 cmp x6, #10 beq WriteEnd - - st1 {v28.4s}, [x11], x10 - st1 {v29.2s}, [x19], x10 - st1 {v29.s}[2], [x16], x10 + st1 {v28.4s}, [x11], #16 + st1 {v29.2s}, [x19], x15 + st1 {v29.s}[2], [x16], x15 cmp x6, #11 beq WriteEnd - st1 {v30.4s}, [x11] st1 {v31.2s}, [x19] st1 {v31.s}[2], [x16] diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32OptRow12.S b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32OptRow12.S index eae7a436fea..15d9b14ecb3 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32OptRow12.S +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32OptRow12.S @@ -398,7 +398,7 @@ LoopRow: str s26, [x11] cmp x6, #10 beq WriteEnd -add x11, x11, x8 + add x11, x11, x8 str s28, [x11] cmp x6, #11 beq WriteEnd @@ -972,204 +972,160 @@ add x11, x11, x8 st1 {v30.4s}, [x11], #16 b WriteEnd C4Write5: - add x19, x11, #16 - st1 {v8.4s}, [x11] - add x11, x11, #20 - str s9, [x19] - add x19, x19, #20 + add x19, x11, x8 + st1 {v8.4s}, [x11], #16 + str s9, [x19], #4 cmp x6, #1 beq WriteEnd - st1 {v10.4s}, [x11] - add x11, x11, #20 - str s11, [x19] - add x19, x19, #20 + st1 {v10.4s}, [x11], #16 + str s11, [x19], #4 cmp x6, #2 beq WriteEnd - st1 {v12.4s}, [x11] - add x11, x11, #20 - str s13, [x19] - add x19, x19, #20 + st1 {v12.4s}, [x11], #16 + str s13, [x19], #4 cmp x6, #3 beq WriteEnd - st1 {v14.4s}, [x11] - add x11, x11, #20 - str s15, [x19] - add x19, x19, #20 + st1 {v14.4s}, [x11], #16 + str s15, [x19], #4 cmp x6, #4 beq WriteEnd - st1 {v16.4s}, [x11] - add x11, x11, #20 - str s17, [x19] - add x19, x19, #20 + st1 {v16.4s}, [x11], #16 + str s17, [x19], #4 cmp x6, #5 beq WriteEnd - st1 {v18.4s}, [x11] - add x11, x11, #20 - str s19, [x19] - add x19, x19, #20 + st1 {v18.4s}, [x11], #16 + str s19, [x19], #4 cmp x6, #6 beq WriteEnd - st1 {v20.4s}, [x11] - add x11, x11, #20 - str s21, [x19] - add x19, x19, #20 + st1 {v20.4s}, [x11], #16 + str s21, [x19], #4 cmp x6, #7 beq WriteEnd - st1 {v22.4s}, [x11] - add x11, x11, #20 - str s23, [x19] - add x19, x19, #20 + st1 {v22.4s}, [x11], #16 + str s23, [x19], #4 cmp x6, #8 beq WriteEnd - st1 {v24.4s}, [x11] - add x11, x11, #20 - str s25, [x19] - add x19, x19, #20 + st1 {v24.4s}, [x11], #16 + str s25, [x19], #4 cmp x6, #9 beq WriteEnd - st1 {v26.4s}, [x11] - add x11, x11, #20 - str s27, [x19] - add x19, x19, #20 + st1 {v26.4s}, [x11], #16 + str s27, [x19], #4 cmp x6, #10 beq WriteEnd - st1 {v28.4s}, [x11] - add x11, x11, #20 - str s29, [x19] - add x19, x19, #20 + st1 {v28.4s}, [x11], #16 + str s29, [x19], #4 cmp x6, #11 beq WriteEnd - st1 {v30.4s}, [x11] - str s31, [x19] + st1 {v30.4s}, [x11], #16 + str s31, [x19], #4 b WriteEnd C4Write6: - add x19, x11, #16 - st1 {v8.4s}, [x11] - add x11, x11, #24 - st1 {v9.2s}, [x19] - add x19, x19, #24 + add x19, x11, x8 + st1 {v8.4s}, [x11], #16 + st1 {v9.2s}, [x19], #8 cmp x6, #1 beq WriteEnd - st1 {v10.4s}, [x11] - add x11, x11, #24 - st1 {v11.2s}, [x19] - add x19, x19, #24 + st1 {v10.4s}, [x11], #16 + st1 {v11.2s}, [x19], #8 cmp x6, #2 beq WriteEnd - st1 {v12.4s}, [x11] - add x11, x11, #24 - st1 {v13.2s}, [x19] - add x19, x19, #24 + st1 {v12.4s}, [x11], #16 + st1 {v13.2s}, [x19], #8 cmp x6, #3 beq WriteEnd - st1 {v14.4s}, [x11] - add x11, x11, #24 - st1 {v15.2s}, [x19] - add x19, x19, #24 + st1 {v14.4s}, [x11], #16 + st1 {v15.2s}, [x19], #8 cmp x6, #4 beq WriteEnd - st1 {v16.4s}, [x11] - add x11, x11, #24 - st1 {v17.2s}, [x19] - add x19, x19, #24 + st1 {v16.4s}, [x11], #16 + st1 {v17.2s}, [x19], #8 cmp x6, #5 beq WriteEnd - st1 {v18.4s}, [x11] - add x11, x11, #24 - st1 {v19.2s}, [x19] - add x19, x19, #24 + st1 {v18.4s}, [x11], #16 + st1 {v19.2s}, [x19], #8 cmp x6, #6 beq WriteEnd - st1 {v20.4s}, [x11] - add x11, x11, #24 - st1 {v21.2s}, [x19] - add x19, x19, #24 + st1 {v20.4s}, [x11], #16 + st1 {v21.2s}, [x19], #8 cmp x6, #7 beq WriteEnd - st1 {v22.4s}, [x11] - add x11, x11, #24 - st1 {v23.2s}, [x19] - add x19, x19, #24 + st1 {v22.4s}, [x11], #16 + st1 {v23.2s}, [x19], #8 cmp x6, #8 beq WriteEnd - st1 {v24.4s}, [x11] - add x11, x11, #24 - st1 {v25.2s}, [x19] - add x19, x19, #24 + st1 {v24.4s}, [x11], #16 + st1 {v25.2s}, [x19], #8 cmp x6, #9 beq WriteEnd - st1 {v26.4s}, [x11] - add x11, x11, #24 - st1 {v27.2s}, [x19] - add x19, x19, #24 + st1 {v26.4s}, [x11], #16 + st1 {v27.2s}, [x19], #8 cmp x6, #10 beq WriteEnd - st1 {v28.4s}, [x11] - add x11, x11, #24 - st1 {v29.2s}, [x19] - add x19, x19, #24 + st1 {v28.4s}, [x11], #16 + st1 {v29.2s}, [x19], #8 cmp x6, #11 beq WriteEnd - st1 {v30.4s}, [x11] - st1 {v31.2s}, [x19] + st1 {v30.4s}, [x11], #16 + st1 {v31.2s}, [x19], #8 b WriteEnd C4Write7: - add x19, x11, #16 - add x16, x11, #24 - mov x10, #28 - st1 {v8.4s}, [x11], x10 - st1 {v9.2s}, [x19], x10 - st1 {v9.s}[2], [x16], x10 + add x19, x11, x8 + add x16, x19, #8 + mov x15, #12 + st1 {v8.4s}, [x11], #16 + st1 {v9.2s}, [x19], x15 + st1 {v9.s}[2], [x16], x15 cmp x6, #1 beq WriteEnd - st1 {v10.4s}, [x11], x10 - st1 {v11.2s}, [x19], x10 - st1 {v11.s}[2], [x16], x10 + st1 {v10.4s}, [x11], #16 + st1 {v11.2s}, [x19], x15 + st1 {v11.s}[2], [x16], x15 cmp x6, #2 beq WriteEnd - st1 {v12.4s}, [x11], x10 - st1 {v13.2s}, [x19], x10 - st1 {v13.s}[2], [x16], x10 + st1 {v12.4s}, [x11], #16 + st1 {v13.2s}, [x19], x15 + st1 {v13.s}[2], [x16], x15 cmp x6, #3 beq WriteEnd - st1 {v14.4s}, [x11], x10 - st1 {v15.2s}, [x19], x10 - st1 {v15.s}[2], [x16], x10 + st1 {v14.4s}, [x11], #16 + st1 {v15.2s}, [x19], x15 + st1 {v15.s}[2], [x16], x15 cmp x6, #4 beq WriteEnd - st1 {v16.4s}, [x11], x10 - st1 {v17.2s}, [x19], x10 - st1 {v17.s}[2], [x16], x10 + st1 {v16.4s}, [x11], #16 + st1 {v17.2s}, [x19], x15 + st1 {v17.s}[2], [x16], x15 cmp x6, #5 beq WriteEnd - st1 {v18.4s}, [x11], x10 - st1 {v19.2s}, [x19], x10 - st1 {v19.s}[2], [x16], x10 + st1 {v18.4s}, [x11], #16 + st1 {v19.2s}, [x19], x15 + st1 {v19.s}[2], [x16], x15 cmp x6, #6 beq WriteEnd - st1 {v20.4s}, [x11], x10 - st1 {v21.2s}, [x19], x10 - st1 {v21.s}[2], [x16], x10 + st1 {v20.4s}, [x11], #16 + st1 {v21.2s}, [x19], x15 + st1 {v21.s}[2], [x16], x15 cmp x6, #7 beq WriteEnd - st1 {v22.4s}, [x11], x10 - st1 {v23.2s}, [x19], x10 - st1 {v23.s}[2], [x16], x10 + st1 {v22.4s}, [x11], #16 + st1 {v23.2s}, [x19], x15 + st1 {v23.s}[2], [x16], x15 cmp x6, #8 beq WriteEnd - st1 {v24.4s}, [x11], x10 - st1 {v25.2s}, [x19], x10 - st1 {v25.s}[2], [x16], x10 + st1 {v24.4s}, [x11], #16 + st1 {v25.2s}, [x19], x15 + st1 {v25.s}[2], [x16], x15 cmp x6, #9 beq WriteEnd - st1 {v26.4s}, [x11], x10 - st1 {v27.2s}, [x19], x10 - st1 {v27.s}[2], [x16], x10 + st1 {v26.4s}, [x11], #16 + st1 {v27.2s}, [x19], x15 + st1 {v27.s}[2], [x16], x15 cmp x6, #10 beq WriteEnd - st1 {v28.4s}, [x11], x10 - st1 {v29.2s}, [x19], x10 - st1 {v29.s}[2], [x16], x10 + st1 {v28.4s}, [x11], #16 + st1 {v29.2s}, [x19], x15 + st1 {v29.s}[2], [x16], x15 cmp x6, #11 beq WriteEnd st1 {v30.4s}, [x11] diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32OptRow4.S b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32OptRow4.S index eaa9e47db50..78242eb717d 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32OptRow4.S +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32OptRow4.S @@ -475,73 +475,61 @@ LoopRow4: st1 {v14.4s}, [x11], #16 b WriteEnd C4Write5: - add x19, x11, #16 - st1 {v8.4s}, [x11] - add x11, x11, #20 - str s9, [x19] - add x19, x19, #20 + add x19, x11, x8 + st1 {v8.4s}, [x11], #16 + str s9, [x19], #4 cmp x6, #1 beq WriteEnd - st1 {v10.4s}, [x11] - add x11, x11, #20 - str s11, [x19] - add x19, x19, #20 + st1 {v10.4s}, [x11], #16 + str s11, [x19], #4 cmp x6, #2 beq WriteEnd - st1 {v12.4s}, [x11] - add x11, x11, #20 - str s13, [x19] - add x19, x19, #20 + st1 {v12.4s}, [x11], #16 + str s13, [x19], #4 cmp x6, #3 beq WriteEnd - st1 {v14.4s}, [x11] - str s15, [x19] + st1 {v14.4s}, [x11], #16 + str s15, [x19], #4 b WriteEnd C4Write6: - add x19, x11, #16 - st1 {v8.4s}, [x11] - add x11, x11, #24 - st1 {v9.2s}, [x19] - add x19, x19, #24 + add x19, x11, x8 + st1 {v8.4s}, [x11], #16 + st1 {v9.2s}, [x19], #8 cmp x6, #1 beq WriteEnd - st1 {v10.4s}, [x11] - add x11, x11, #24 - st1 {v11.2s}, [x19] - add x19, x19, #24 + st1 {v10.4s}, [x11], #16 + st1 {v11.2s}, [x19], #8 cmp x6, #2 beq WriteEnd - st1 {v12.4s}, [x11] - add x11, x11, #24 - st1 {v13.2s}, [x19] - add x19, x19, #24 + st1 {v12.4s}, [x11], #16 + st1 {v13.2s}, [x19], #8 cmp x6, #3 beq WriteEnd - st1 {v14.4s}, [x11] - st1 {v15.2s}, [x19] + st1 {v14.4s}, [x11], #16 + st1 {v15.2s}, [x19], #8 b WriteEnd C4Write7: - add x19, x11, #16 - add x16, x11, #24 - mov x10, #28 - st1 {v8.4s}, [x11], x10 - st1 {v9.2s}, [x19], x10 - st1 {v9.s}[2], [x16], x10 + add x19, x11, x8 + add x16, x19, #8 + mov x15, #12 + st1 {v8.4s}, [x11], #16 + st1 {v9.2s}, [x19], x15 + st1 {v9.s}[2], [x16], x15 cmp x6, #1 beq WriteEnd - st1 {v10.4s}, [x11], x10 - st1 {v11.2s}, [x19], x10 - st1 {v11.s}[2], [x16], x10 + st1 {v10.4s}, [x11], #16 + st1 {v11.2s}, [x19], x15 + st1 {v11.s}[2], [x16], x15 cmp x6, #2 beq WriteEnd - st1 {v12.4s}, [x11], x10 - st1 {v13.2s}, [x19], x10 - st1 {v13.s}[2], [x16], x10 + st1 {v12.4s}, [x11], #16 + st1 {v13.2s}, [x19], x15 + st1 {v13.s}[2], [x16], x15 cmp x6, #3 beq WriteEnd - st1 {v14.4s}, [x11], x10 - st1 {v15.2s}, [x19], x10 - st1 {v15.s}[2], [x16], x10 + st1 {v14.4s}, [x11], #16 + st1 {v15.2s}, [x19], x15 + st1 {v15.s}[2], [x16], x15 b WriteEnd C4Write8: add x19, x11, x8 diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32OptRow8.S b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32OptRow8.S index c6dc3191259..b262455782d 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32OptRow8.S +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32OptRow8.S @@ -722,141 +722,113 @@ LoopRow8: st1 {v22.4s}, [x11], #16 b WriteEnd C4Write5: - add x19, x11, #16 - st1 {v8.4s}, [x11] - add x11, x11, #20 - str s9, [x19] - add x19, x19, #20 + add x19, x11, x8 + st1 {v8.4s}, [x11], #16 + str s9, [x19], #4 cmp x6, #1 beq WriteEnd - st1 {v10.4s}, [x11] - add x11, x11, #20 - str s11, [x19] - add x19, x19, #20 + st1 {v10.4s}, [x11], #16 + str s11, [x19], #4 cmp x6, #2 beq WriteEnd - st1 {v12.4s}, [x11] - add x11, x11, #20 - str s13, [x19] - add x19, x19, #20 + st1 {v12.4s}, [x11], #16 + str s13, [x19], #4 cmp x6, #3 beq WriteEnd - st1 {v14.4s}, [x11] - add x11, x11, #20 - str s15, [x19] - add x19, x19, #20 + st1 {v14.4s}, [x11], #16 + str s15, [x19], #4 cmp x6, #4 beq WriteEnd - st1 {v16.4s}, [x11] - add x11, x11, #20 - str s17, [x19] - add x19, x19, #20 + st1 {v16.4s}, [x11], #16 + str s17, [x19], #4 cmp x6, #5 beq WriteEnd - st1 {v18.4s}, [x11] - add x11, x11, #20 - str s19, [x19] - add x19, x19, #20 + st1 {v18.4s}, [x11], #16 + str s19, [x19], #4 cmp x6, #6 beq WriteEnd - st1 {v20.4s}, [x11] - add x11, x11, #20 - str s21, [x19] - add x19, x19, #20 + st1 {v20.4s}, [x11], #16 + str s21, [x19], #4 cmp x6, #7 beq WriteEnd - st1 {v22.4s}, [x11] - str s23, [x19] + st1 {v22.4s}, [x11], #16 + str s23, [x19], #4 b WriteEnd C4Write6: - add x19, x11, #16 - st1 {v8.4s}, [x11] - add x11, x11, #24 - st1 {v9.2s}, [x19] - add x19, x19, #24 + add x19, x11, x8 + st1 {v8.4s}, [x11], #16 + st1 {v9.2s}, [x19], #8 cmp x6, #1 beq WriteEnd - st1 {v10.4s}, [x11] - add x11, x11, #24 - st1 {v11.2s}, [x19] - add x19, x19, #24 + st1 {v10.4s}, [x11], #16 + st1 {v11.2s}, [x19], #8 cmp x6, #2 beq WriteEnd - st1 {v12.4s}, [x11] - add x11, x11, #24 - st1 {v13.2s}, [x19] - add x19, x19, #24 + st1 {v12.4s}, [x11], #16 + st1 {v13.2s}, [x19], #8 cmp x6, #3 beq WriteEnd - st1 {v14.4s}, [x11] - add x11, x11, #24 - st1 {v15.2s}, [x19] - add x19, x19, #24 + st1 {v14.4s}, [x11], #16 + st1 {v15.2s}, [x19], #8 cmp x6, #4 beq WriteEnd - st1 {v16.4s}, [x11] - add x11, x11, #24 - st1 {v17.2s}, [x19] - add x19, x19, #24 + st1 {v16.4s}, [x11], #16 + st1 {v17.2s}, [x19], #8 cmp x6, #5 beq WriteEnd - st1 {v18.4s}, [x11] - add x11, x11, #24 - st1 {v19.2s}, [x19] - add x19, x19, #24 + st1 {v18.4s}, [x11], #16 + st1 {v19.2s}, [x19], #8 cmp x6, #6 beq WriteEnd - st1 {v20.4s}, [x11] - add x11, x11, #24 - st1 {v21.2s}, [x19] - add x19, x19, #24 + st1 {v20.4s}, [x11], #16 + st1 {v21.2s}, [x19], #8 cmp x6, #7 beq WriteEnd - st1 {v22.4s}, [x11] - st1 {v23.2s}, [x19] + st1 {v22.4s}, [x11], #16 + st1 {v23.2s}, [x19], #8 b WriteEnd C4Write7: - add x19, x11, #16 - add x16, x11, #24 - mov x10, #28 - st1 {v8.4s}, [x11], x10 - st1 {v9.2s}, [x19], x10 - st1 {v9.s}[2], [x16], x10 + add x19, x11, x8 + add x16, x19, #8 + mov x15, #12 + st1 {v8.4s}, [x11], #16 + st1 {v9.2s}, [x19], x15 + st1 {v9.s}[2], [x16], x15 cmp x6, #1 beq WriteEnd - st1 {v10.4s}, [x11], x10 - st1 {v11.2s}, [x19], x10 - st1 {v11.s}[2], [x16], x10 + st1 {v10.4s}, [x11], #16 + st1 {v11.2s}, [x19], x15 + st1 {v11.s}[2], [x16], x15 cmp x6, #2 beq WriteEnd - st1 {v12.4s}, [x11], x10 - st1 {v13.2s}, [x19], x10 - st1 {v13.s}[2], [x16], x10 + st1 {v12.4s}, [x11], #16 + st1 {v13.2s}, [x19], x15 + st1 {v13.s}[2], [x16], x15 cmp x6, #3 beq WriteEnd - st1 {v14.4s}, [x11], x10 - st1 {v15.2s}, [x19], x10 - st1 {v15.s}[2], [x16], x10 + st1 {v14.4s}, [x11], #16 + st1 {v15.2s}, [x19], x15 + st1 {v15.s}[2], [x16], x15 cmp x6, #4 beq WriteEnd - st1 {v16.4s}, [x11], x10 - st1 {v17.2s}, [x19], x10 - st1 {v17.s}[2], [x16], x10 + st1 {v16.4s}, [x11], #16 + st1 {v17.2s}, [x19], x15 + st1 {v17.s}[2], [x16], x15 cmp x6, #5 beq WriteEnd - st1 {v18.4s}, [x11], x10 - st1 {v19.2s}, [x19], x10 - st1 {v19.s}[2], [x16], x10 + st1 {v18.4s}, [x11], #16 + st1 {v19.2s}, [x19], x15 + st1 {v19.s}[2], [x16], x15 cmp x6, #6 beq WriteEnd - st1 {v20.4s}, [x11], x10 - st1 {v21.2s}, [x19], x10 - st1 {v21.s}[2], [x16], x10 + st1 {v20.4s}, [x11], #16 + st1 {v21.2s}, [x19], x15 + st1 {v21.s}[2], [x16], x15 cmp x6, #7 beq WriteEnd - st1 {v22.4s}, [x11], x10 - st1 {v23.2s}, [x19], x10 - st1 {v23.s}[2], [x16], x10 + st1 {v22.4s}, [x11], #16 + st1 {v23.2s}, [x19], x15 + st1 {v23.s}[2], [x16], x15 b WriteEnd C4Write8: add x19, x11, x8 diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.cc index 7fff594c1ab..bef73c917d6 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.cc @@ -195,13 +195,14 @@ int Convolution1x1CPUKernel::DoConv1x1Hw(int task_id) { MatMulOpt(thread_pack_input, reinterpret_cast(packed_weight_), cur_output, reinterpret_cast(bias_data_), matmul_param_->act_type_, matmul_param_->deep_, cur_rows, matmul_param_->col_, matmul_param_->col_, OutType_Nhwc); + cur_output += row_tile_ * matmul_param_->col_; } else { MatMulOpt(thread_pack_input, reinterpret_cast(packed_weight_), cur_output, reinterpret_cast(bias_data_), matmul_param_->act_type_, matmul_param_->deep_, cur_rows, matmul_param_->col_, matmul_param_->row_, OutType_NC4HW4); + cur_output += row_tile_ * MSMIN(matmul_param_->col_, C4NUM); } cur_intput += row_tile_ * matmul_param_->deep_; - cur_output += row_tile_ * matmul_param_->col_; } return RET_OK;