!22186 fix c4 output issue

Merge pull request !22186 from zhaozhenlong/lite/issue/fix_c4_output_offset_issue
This commit is contained in:
i-robot 2021-08-23 09:16:35 +00:00 committed by Gitee
commit 4661b47b52
5 changed files with 269 additions and 429 deletions

View File

@ -1345,239 +1345,162 @@ LoopRow4:
st1 {v30.4s}, [x11], #16
b WriteEnd
// C4Write5: stores up to 12 rows of 5 32-bit elements each. A row is the four lanes of an even
// vector register (v8, v10, ... v30) followed by the low element of the next odd register
// (s9, s11, ... s31). After each row x6 is compared with the number of rows written so far
// and control leaves through WriteEnd on a match.
// NOTE(review): this text looks like a rendered diff with the +/- markers stripped, so every
// row appears twice in succession - do not assemble as-is; confirm against the repository:
//   form A: 5th element stored at x11 + 16, both pointers stepped by 20 bytes per row;
//   form B: 5th element stored at x11 + x8, x11 post-incremented by 16 and x19 by 4.
// x8 is presumably the byte offset from one 4-channel output block to the next - TODO confirm.
C4Write5:
// Row 1 (v8 / s9), form A: x19 points 16 bytes past x11.
add x19, x11, #16
st1 {v8.4s}, [x11]
add x11, x11, #20
str s9, [x19]
add x19, x19, #20
// Row 1 (v8 / s9), form B: x19 points x8 bytes past x11; both stores post-increment.
add x19, x11, x8
st1 {v8.4s}, [x11], #16
str s9, [x19], #4
// Stop here when x6 == 1.
cmp x6, #1
beq WriteEnd
// Row 2 (v10 / s11): form A lines first, then form B lines.
st1 {v10.4s}, [x11]
add x11, x11, #20
str s11, [x19]
add x19, x19, #20
st1 {v10.4s}, [x11], #16
str s11, [x19], #4
cmp x6, #2
beq WriteEnd
// Row 3 (v12 / s13).
st1 {v12.4s}, [x11]
add x11, x11, #20
str s13, [x19]
add x19, x19, #20
st1 {v12.4s}, [x11], #16
str s13, [x19], #4
cmp x6, #3
beq WriteEnd
// Row 4 (v14 / s15).
st1 {v14.4s}, [x11]
add x11, x11, #20
str s15, [x19]
add x19, x19, #20
st1 {v14.4s}, [x11], #16
str s15, [x19], #4
cmp x6, #4
beq WriteEnd
// Row 5 (v16 / s17).
st1 {v16.4s}, [x11]
add x11, x11, #20
str s17, [x19]
add x19, x19, #20
st1 {v16.4s}, [x11], #16
str s17, [x19], #4
cmp x6, #5
beq WriteEnd
// Row 6 (v18 / s19).
st1 {v18.4s}, [x11]
add x11, x11, #20
str s19, [x19]
add x19, x19, #20
st1 {v18.4s}, [x11], #16
str s19, [x19], #4
cmp x6, #6
beq WriteEnd
// Row 7 (v20 / s21).
st1 {v20.4s}, [x11]
add x11, x11, #20
str s21, [x19]
add x19, x19, #20
st1 {v20.4s}, [x11], #16
str s21, [x19], #4
cmp x6, #7
beq WriteEnd
// Row 8 (v22 / s23).
st1 {v22.4s}, [x11]
add x11, x11, #20
str s23, [x19]
add x19, x19, #20
st1 {v22.4s}, [x11], #16
str s23, [x19], #4
cmp x6, #8
beq WriteEnd
// Row 9 (v24 / s25).
st1 {v24.4s}, [x11]
add x11, x11, #20
str s25, [x19]
add x19, x19, #20
st1 {v24.4s}, [x11], #16
str s25, [x19], #4
cmp x6, #9
beq WriteEnd
// Row 10 (v26 / s27).
st1 {v26.4s}, [x11]
add x11, x11, #20
str s27, [x19]
add x19, x19, #20
st1 {v26.4s}, [x11], #16
str s27, [x19], #4
cmp x6, #10
beq WriteEnd
// Row 11 (v28 / s29).
st1 {v28.4s}, [x11]
add x11, x11, #20
str s29, [x19]
add x19, x19, #20
st1 {v28.4s}, [x11], #16
str s29, [x19], #4
cmp x6, #11
beq WriteEnd
// Row 12 (v30 / s31): last row, no row-count check; form A omits the pointer advance.
st1 {v30.4s}, [x11]
str s31, [x19]
st1 {v30.4s}, [x11], #16
str s31, [x19], #4
b WriteEnd
// C4Write6: stores up to 12 rows of 6 32-bit elements each. A row is the four lanes of an even
// vector register (v8, v10, ... v30) followed by the low two lanes of the next odd register
// (v9, v11, ... v31). After each row x6 is compared with the number of rows written so far
// and control leaves through WriteEnd on a match.
// NOTE(review): this text looks like a rendered diff with the +/- markers stripped, so every
// row appears twice in succession - do not assemble as-is; confirm against the repository:
//   form A: lanes 5-6 stored at x11 + 16, both pointers stepped by 24 bytes per row;
//   form B: lanes 5-6 stored at x11 + x8, x11 post-incremented by 16 and x19 by 8.
// x8 is presumably the byte offset from one 4-channel output block to the next - TODO confirm.
C4Write6:
// Row 1 (v8 / v9), form A: x19 points 16 bytes past x11.
add x19, x11, #16
st1 {v8.4s}, [x11]
add x11, x11, #24
st1 {v9.2s}, [x19]
add x19, x19, #24
// Row 1 (v8 / v9), form B: x19 points x8 bytes past x11; both stores post-increment.
add x19, x11, x8
st1 {v8.4s}, [x11], #16
st1 {v9.2s}, [x19], #8
// Stop here when x6 == 1.
cmp x6, #1
beq WriteEnd
// Row 2 (v10 / v11): form A lines first, then form B lines.
st1 {v10.4s}, [x11]
add x11, x11, #24
st1 {v11.2s}, [x19]
add x19, x19, #24
st1 {v10.4s}, [x11], #16
st1 {v11.2s}, [x19], #8
cmp x6, #2
beq WriteEnd
// Row 3 (v12 / v13).
st1 {v12.4s}, [x11]
add x11, x11, #24
st1 {v13.2s}, [x19]
add x19, x19, #24
st1 {v12.4s}, [x11], #16
st1 {v13.2s}, [x19], #8
cmp x6, #3
beq WriteEnd
// Row 4 (v14 / v15).
st1 {v14.4s}, [x11]
add x11, x11, #24
st1 {v15.2s}, [x19]
add x19, x19, #24
st1 {v14.4s}, [x11], #16
st1 {v15.2s}, [x19], #8
cmp x6, #4
beq WriteEnd
// Row 5 (v16 / v17).
st1 {v16.4s}, [x11]
add x11, x11, #24
st1 {v17.2s}, [x19]
add x19, x19, #24
st1 {v16.4s}, [x11], #16
st1 {v17.2s}, [x19], #8
cmp x6, #5
beq WriteEnd
// Row 6 (v18 / v19).
st1 {v18.4s}, [x11]
add x11, x11, #24
st1 {v19.2s}, [x19]
add x19, x19, #24
st1 {v18.4s}, [x11], #16
st1 {v19.2s}, [x19], #8
cmp x6, #6
beq WriteEnd
// Row 7 (v20 / v21).
st1 {v20.4s}, [x11]
add x11, x11, #24
st1 {v21.2s}, [x19]
add x19, x19, #24
st1 {v20.4s}, [x11], #16
st1 {v21.2s}, [x19], #8
cmp x6, #7
beq WriteEnd
// Row 8 (v22 / v23).
st1 {v22.4s}, [x11]
add x11, x11, #24
st1 {v23.2s}, [x19]
add x19, x19, #24
st1 {v22.4s}, [x11], #16
st1 {v23.2s}, [x19], #8
cmp x6, #8
beq WriteEnd
// Row 9 (v24 / v25).
st1 {v24.4s}, [x11]
add x11, x11, #24
st1 {v25.2s}, [x19]
add x19, x19, #24
st1 {v24.4s}, [x11], #16
st1 {v25.2s}, [x19], #8
cmp x6, #9
beq WriteEnd
// Row 10 (v26 / v27).
st1 {v26.4s}, [x11]
add x11, x11, #24
st1 {v27.2s}, [x19]
add x19, x19, #24
st1 {v26.4s}, [x11], #16
st1 {v27.2s}, [x19], #8
cmp x6, #10
beq WriteEnd
// Row 11 (v28 / v29).
st1 {v28.4s}, [x11]
add x11, x11, #24
st1 {v29.2s}, [x19]
add x19, x19, #24
st1 {v28.4s}, [x11], #16
st1 {v29.2s}, [x19], #8
cmp x6, #11
beq WriteEnd
// Row 12 (v30 / v31): last row, no row-count check; form A omits the pointer advance.
st1 {v30.4s}, [x11]
st1 {v31.2s}, [x19]
st1 {v30.4s}, [x11], #16
st1 {v31.2s}, [x19], #8
b WriteEnd
C4Write7:
add x19, x11, #16
add x16, x11, #24
mov x10, #28
st1 {v8.4s}, [x11], x10
st1 {v9.2s}, [x19], x10
st1 {v9.s}[2], [x16], x10
add x19, x11, x8
add x16, x19, #8
mov x15, #12
st1 {v8.4s}, [x11], #16
st1 {v9.2s}, [x19], x15
st1 {v9.s}[2], [x16], x15
cmp x6, #1
beq WriteEnd
st1 {v10.4s}, [x11], x10
st1 {v11.2s}, [x19], x10
st1 {v11.s}[2], [x16], x10
st1 {v10.4s}, [x11], #16
st1 {v11.2s}, [x19], x15
st1 {v11.s}[2], [x16], x15
cmp x6, #2
beq WriteEnd
st1 {v12.4s}, [x11], x10
st1 {v13.2s}, [x19], x10
st1 {v13.s}[2], [x16], x10
st1 {v12.4s}, [x11], #16
st1 {v13.2s}, [x19], x15
st1 {v13.s}[2], [x16], x15
cmp x6, #3
beq WriteEnd
st1 {v14.4s}, [x11], x10
st1 {v15.2s}, [x19], x10
st1 {v15.s}[2], [x16], x10
st1 {v14.4s}, [x11], #16
st1 {v15.2s}, [x19], x15
st1 {v15.s}[2], [x16], x15
cmp x6, #4
beq WriteEnd
st1 {v16.4s}, [x11], x10
st1 {v17.2s}, [x19], x10
st1 {v17.s}[2], [x16], x10
st1 {v16.4s}, [x11], #16
st1 {v17.2s}, [x19], x15
st1 {v17.s}[2], [x16], x15
cmp x6, #5
beq WriteEnd
st1 {v18.4s}, [x11], x10
st1 {v19.2s}, [x19], x10
st1 {v19.s}[2], [x16], x10
st1 {v18.4s}, [x11], #16
st1 {v19.2s}, [x19], x15
st1 {v19.s}[2], [x16], x15
cmp x6, #6
beq WriteEnd
st1 {v20.4s}, [x11], x10
st1 {v21.2s}, [x19], x10
st1 {v21.s}[2], [x16], x10
st1 {v20.4s}, [x11], #16
st1 {v21.2s}, [x19], x15
st1 {v21.s}[2], [x16], x15
cmp x6, #7
beq WriteEnd
st1 {v22.4s}, [x11], x10
st1 {v23.2s}, [x19], x10
st1 {v23.s}[2], [x16], x10
st1 {v22.4s}, [x11], #16
st1 {v23.2s}, [x19], x15
st1 {v23.s}[2], [x16], x15
cmp x6, #8
beq WriteEnd
st1 {v24.4s}, [x11], x10
st1 {v25.2s}, [x19], x10
st1 {v25.s}[2], [x16], x10
st1 {v24.4s}, [x11], #16
st1 {v25.2s}, [x19], x15
st1 {v25.s}[2], [x16], x15
cmp x6, #9
beq WriteEnd
st1 {v26.4s}, [x11], x10
st1 {v27.2s}, [x19], x10
st1 {v27.s}[2], [x16], x10
st1 {v26.4s}, [x11], #16
st1 {v27.2s}, [x19], x15
st1 {v27.s}[2], [x16], x15
cmp x6, #10
beq WriteEnd
st1 {v28.4s}, [x11], x10
st1 {v29.2s}, [x19], x10
st1 {v29.s}[2], [x16], x10
st1 {v28.4s}, [x11], #16
st1 {v29.2s}, [x19], x15
st1 {v29.s}[2], [x16], x15
cmp x6, #11
beq WriteEnd
st1 {v30.4s}, [x11]
st1 {v31.2s}, [x19]
st1 {v31.s}[2], [x16]

View File

@ -398,7 +398,7 @@ LoopRow:
str s26, [x11]
cmp x6, #10
beq WriteEnd
add x11, x11, x8
add x11, x11, x8
str s28, [x11]
cmp x6, #11
beq WriteEnd
@ -972,204 +972,160 @@ add x11, x11, x8
st1 {v30.4s}, [x11], #16
b WriteEnd
// C4Write5: stores up to 12 rows of 5 32-bit elements each. A row is the four lanes of an even
// vector register (v8, v10, ... v30) followed by the low element of the next odd register
// (s9, s11, ... s31). After each row x6 is compared with the number of rows written so far
// and control leaves through WriteEnd on a match.
// NOTE(review): this text looks like a rendered diff with the +/- markers stripped, so every
// row appears twice in succession - do not assemble as-is; confirm against the repository:
//   form A: 5th element stored at x11 + 16, both pointers stepped by 20 bytes per row;
//   form B: 5th element stored at x11 + x8, x11 post-incremented by 16 and x19 by 4.
// x8 is presumably the byte offset from one 4-channel output block to the next - TODO confirm.
C4Write5:
// Row 1 (v8 / s9), form A: x19 points 16 bytes past x11.
add x19, x11, #16
st1 {v8.4s}, [x11]
add x11, x11, #20
str s9, [x19]
add x19, x19, #20
// Row 1 (v8 / s9), form B: x19 points x8 bytes past x11; both stores post-increment.
add x19, x11, x8
st1 {v8.4s}, [x11], #16
str s9, [x19], #4
// Stop here when x6 == 1.
cmp x6, #1
beq WriteEnd
// Row 2 (v10 / s11): form A lines first, then form B lines.
st1 {v10.4s}, [x11]
add x11, x11, #20
str s11, [x19]
add x19, x19, #20
st1 {v10.4s}, [x11], #16
str s11, [x19], #4
cmp x6, #2
beq WriteEnd
// Row 3 (v12 / s13).
st1 {v12.4s}, [x11]
add x11, x11, #20
str s13, [x19]
add x19, x19, #20
st1 {v12.4s}, [x11], #16
str s13, [x19], #4
cmp x6, #3
beq WriteEnd
// Row 4 (v14 / s15).
st1 {v14.4s}, [x11]
add x11, x11, #20
str s15, [x19]
add x19, x19, #20
st1 {v14.4s}, [x11], #16
str s15, [x19], #4
cmp x6, #4
beq WriteEnd
// Row 5 (v16 / s17).
st1 {v16.4s}, [x11]
add x11, x11, #20
str s17, [x19]
add x19, x19, #20
st1 {v16.4s}, [x11], #16
str s17, [x19], #4
cmp x6, #5
beq WriteEnd
// Row 6 (v18 / s19).
st1 {v18.4s}, [x11]
add x11, x11, #20
str s19, [x19]
add x19, x19, #20
st1 {v18.4s}, [x11], #16
str s19, [x19], #4
cmp x6, #6
beq WriteEnd
// Row 7 (v20 / s21).
st1 {v20.4s}, [x11]
add x11, x11, #20
str s21, [x19]
add x19, x19, #20
st1 {v20.4s}, [x11], #16
str s21, [x19], #4
cmp x6, #7
beq WriteEnd
// Row 8 (v22 / s23).
st1 {v22.4s}, [x11]
add x11, x11, #20
str s23, [x19]
add x19, x19, #20
st1 {v22.4s}, [x11], #16
str s23, [x19], #4
cmp x6, #8
beq WriteEnd
// Row 9 (v24 / s25).
st1 {v24.4s}, [x11]
add x11, x11, #20
str s25, [x19]
add x19, x19, #20
st1 {v24.4s}, [x11], #16
str s25, [x19], #4
cmp x6, #9
beq WriteEnd
// Row 10 (v26 / s27).
st1 {v26.4s}, [x11]
add x11, x11, #20
str s27, [x19]
add x19, x19, #20
st1 {v26.4s}, [x11], #16
str s27, [x19], #4
cmp x6, #10
beq WriteEnd
// Row 11 (v28 / s29).
st1 {v28.4s}, [x11]
add x11, x11, #20
str s29, [x19]
add x19, x19, #20
st1 {v28.4s}, [x11], #16
str s29, [x19], #4
cmp x6, #11
beq WriteEnd
// Row 12 (v30 / s31): last row, no row-count check; form A omits the pointer advance.
st1 {v30.4s}, [x11]
str s31, [x19]
st1 {v30.4s}, [x11], #16
str s31, [x19], #4
b WriteEnd
// C4Write6: stores up to 12 rows of 6 32-bit elements each. A row is the four lanes of an even
// vector register (v8, v10, ... v30) followed by the low two lanes of the next odd register
// (v9, v11, ... v31). After each row x6 is compared with the number of rows written so far
// and control leaves through WriteEnd on a match.
// NOTE(review): this text looks like a rendered diff with the +/- markers stripped, so every
// row appears twice in succession - do not assemble as-is; confirm against the repository:
//   form A: lanes 5-6 stored at x11 + 16, both pointers stepped by 24 bytes per row;
//   form B: lanes 5-6 stored at x11 + x8, x11 post-incremented by 16 and x19 by 8.
// x8 is presumably the byte offset from one 4-channel output block to the next - TODO confirm.
C4Write6:
// Row 1 (v8 / v9), form A: x19 points 16 bytes past x11.
add x19, x11, #16
st1 {v8.4s}, [x11]
add x11, x11, #24
st1 {v9.2s}, [x19]
add x19, x19, #24
// Row 1 (v8 / v9), form B: x19 points x8 bytes past x11; both stores post-increment.
add x19, x11, x8
st1 {v8.4s}, [x11], #16
st1 {v9.2s}, [x19], #8
// Stop here when x6 == 1.
cmp x6, #1
beq WriteEnd
// Row 2 (v10 / v11): form A lines first, then form B lines.
st1 {v10.4s}, [x11]
add x11, x11, #24
st1 {v11.2s}, [x19]
add x19, x19, #24
st1 {v10.4s}, [x11], #16
st1 {v11.2s}, [x19], #8
cmp x6, #2
beq WriteEnd
// Row 3 (v12 / v13).
st1 {v12.4s}, [x11]
add x11, x11, #24
st1 {v13.2s}, [x19]
add x19, x19, #24
st1 {v12.4s}, [x11], #16
st1 {v13.2s}, [x19], #8
cmp x6, #3
beq WriteEnd
// Row 4 (v14 / v15).
st1 {v14.4s}, [x11]
add x11, x11, #24
st1 {v15.2s}, [x19]
add x19, x19, #24
st1 {v14.4s}, [x11], #16
st1 {v15.2s}, [x19], #8
cmp x6, #4
beq WriteEnd
// Row 5 (v16 / v17).
st1 {v16.4s}, [x11]
add x11, x11, #24
st1 {v17.2s}, [x19]
add x19, x19, #24
st1 {v16.4s}, [x11], #16
st1 {v17.2s}, [x19], #8
cmp x6, #5
beq WriteEnd
// Row 6 (v18 / v19).
st1 {v18.4s}, [x11]
add x11, x11, #24
st1 {v19.2s}, [x19]
add x19, x19, #24
st1 {v18.4s}, [x11], #16
st1 {v19.2s}, [x19], #8
cmp x6, #6
beq WriteEnd
// Row 7 (v20 / v21).
st1 {v20.4s}, [x11]
add x11, x11, #24
st1 {v21.2s}, [x19]
add x19, x19, #24
st1 {v20.4s}, [x11], #16
st1 {v21.2s}, [x19], #8
cmp x6, #7
beq WriteEnd
// Row 8 (v22 / v23).
st1 {v22.4s}, [x11]
add x11, x11, #24
st1 {v23.2s}, [x19]
add x19, x19, #24
st1 {v22.4s}, [x11], #16
st1 {v23.2s}, [x19], #8
cmp x6, #8
beq WriteEnd
// Row 9 (v24 / v25).
st1 {v24.4s}, [x11]
add x11, x11, #24
st1 {v25.2s}, [x19]
add x19, x19, #24
st1 {v24.4s}, [x11], #16
st1 {v25.2s}, [x19], #8
cmp x6, #9
beq WriteEnd
// Row 10 (v26 / v27).
st1 {v26.4s}, [x11]
add x11, x11, #24
st1 {v27.2s}, [x19]
add x19, x19, #24
st1 {v26.4s}, [x11], #16
st1 {v27.2s}, [x19], #8
cmp x6, #10
beq WriteEnd
// Row 11 (v28 / v29).
st1 {v28.4s}, [x11]
add x11, x11, #24
st1 {v29.2s}, [x19]
add x19, x19, #24
st1 {v28.4s}, [x11], #16
st1 {v29.2s}, [x19], #8
cmp x6, #11
beq WriteEnd
// Row 12 (v30 / v31): last row, no row-count check; form A omits the pointer advance.
st1 {v30.4s}, [x11]
st1 {v31.2s}, [x19]
st1 {v30.4s}, [x11], #16
st1 {v31.2s}, [x19], #8
b WriteEnd
C4Write7:
add x19, x11, #16
add x16, x11, #24
mov x10, #28
st1 {v8.4s}, [x11], x10
st1 {v9.2s}, [x19], x10
st1 {v9.s}[2], [x16], x10
add x19, x11, x8
add x16, x19, #8
mov x15, #12
st1 {v8.4s}, [x11], #16
st1 {v9.2s}, [x19], x15
st1 {v9.s}[2], [x16], x15
cmp x6, #1
beq WriteEnd
st1 {v10.4s}, [x11], x10
st1 {v11.2s}, [x19], x10
st1 {v11.s}[2], [x16], x10
st1 {v10.4s}, [x11], #16
st1 {v11.2s}, [x19], x15
st1 {v11.s}[2], [x16], x15
cmp x6, #2
beq WriteEnd
st1 {v12.4s}, [x11], x10
st1 {v13.2s}, [x19], x10
st1 {v13.s}[2], [x16], x10
st1 {v12.4s}, [x11], #16
st1 {v13.2s}, [x19], x15
st1 {v13.s}[2], [x16], x15
cmp x6, #3
beq WriteEnd
st1 {v14.4s}, [x11], x10
st1 {v15.2s}, [x19], x10
st1 {v15.s}[2], [x16], x10
st1 {v14.4s}, [x11], #16
st1 {v15.2s}, [x19], x15
st1 {v15.s}[2], [x16], x15
cmp x6, #4
beq WriteEnd
st1 {v16.4s}, [x11], x10
st1 {v17.2s}, [x19], x10
st1 {v17.s}[2], [x16], x10
st1 {v16.4s}, [x11], #16
st1 {v17.2s}, [x19], x15
st1 {v17.s}[2], [x16], x15
cmp x6, #5
beq WriteEnd
st1 {v18.4s}, [x11], x10
st1 {v19.2s}, [x19], x10
st1 {v19.s}[2], [x16], x10
st1 {v18.4s}, [x11], #16
st1 {v19.2s}, [x19], x15
st1 {v19.s}[2], [x16], x15
cmp x6, #6
beq WriteEnd
st1 {v20.4s}, [x11], x10
st1 {v21.2s}, [x19], x10
st1 {v21.s}[2], [x16], x10
st1 {v20.4s}, [x11], #16
st1 {v21.2s}, [x19], x15
st1 {v21.s}[2], [x16], x15
cmp x6, #7
beq WriteEnd
st1 {v22.4s}, [x11], x10
st1 {v23.2s}, [x19], x10
st1 {v23.s}[2], [x16], x10
st1 {v22.4s}, [x11], #16
st1 {v23.2s}, [x19], x15
st1 {v23.s}[2], [x16], x15
cmp x6, #8
beq WriteEnd
st1 {v24.4s}, [x11], x10
st1 {v25.2s}, [x19], x10
st1 {v25.s}[2], [x16], x10
st1 {v24.4s}, [x11], #16
st1 {v25.2s}, [x19], x15
st1 {v25.s}[2], [x16], x15
cmp x6, #9
beq WriteEnd
st1 {v26.4s}, [x11], x10
st1 {v27.2s}, [x19], x10
st1 {v27.s}[2], [x16], x10
st1 {v26.4s}, [x11], #16
st1 {v27.2s}, [x19], x15
st1 {v27.s}[2], [x16], x15
cmp x6, #10
beq WriteEnd
st1 {v28.4s}, [x11], x10
st1 {v29.2s}, [x19], x10
st1 {v29.s}[2], [x16], x10
st1 {v28.4s}, [x11], #16
st1 {v29.2s}, [x19], x15
st1 {v29.s}[2], [x16], x15
cmp x6, #11
beq WriteEnd
st1 {v30.4s}, [x11]

View File

@ -475,73 +475,61 @@ LoopRow4:
st1 {v14.4s}, [x11], #16
b WriteEnd
// C4Write5: stores up to 4 rows of 5 32-bit elements each. A row is the four lanes of an even
// vector register (v8, v10, v12, v14) followed by the low element of the next odd register
// (s9, s11, s13, s15). After each row x6 is compared with the number of rows written so far
// and control leaves through WriteEnd on a match.
// NOTE(review): this text looks like a rendered diff with the +/- markers stripped, so every
// row appears twice in succession - do not assemble as-is; confirm against the repository:
//   form A: 5th element stored at x11 + 16, both pointers stepped by 20 bytes per row;
//   form B: 5th element stored at x11 + x8, x11 post-incremented by 16 and x19 by 4.
// x8 is presumably the byte offset from one 4-channel output block to the next - TODO confirm.
C4Write5:
// Row 1 (v8 / s9), form A: x19 points 16 bytes past x11.
add x19, x11, #16
st1 {v8.4s}, [x11]
add x11, x11, #20
str s9, [x19]
add x19, x19, #20
// Row 1 (v8 / s9), form B: x19 points x8 bytes past x11; both stores post-increment.
add x19, x11, x8
st1 {v8.4s}, [x11], #16
str s9, [x19], #4
// Stop here when x6 == 1.
cmp x6, #1
beq WriteEnd
// Row 2 (v10 / s11): form A lines first, then form B lines.
st1 {v10.4s}, [x11]
add x11, x11, #20
str s11, [x19]
add x19, x19, #20
st1 {v10.4s}, [x11], #16
str s11, [x19], #4
cmp x6, #2
beq WriteEnd
// Row 3 (v12 / s13).
st1 {v12.4s}, [x11]
add x11, x11, #20
str s13, [x19]
add x19, x19, #20
st1 {v12.4s}, [x11], #16
str s13, [x19], #4
cmp x6, #3
beq WriteEnd
// Row 4 (v14 / s15): last row, no row-count check; form A omits the pointer advance.
st1 {v14.4s}, [x11]
str s15, [x19]
st1 {v14.4s}, [x11], #16
str s15, [x19], #4
b WriteEnd
// C4Write6: stores up to 4 rows of 6 32-bit elements each. A row is the four lanes of an even
// vector register (v8, v10, v12, v14) followed by the low two lanes of the next odd register
// (v9, v11, v13, v15). After each row x6 is compared with the number of rows written so far
// and control leaves through WriteEnd on a match.
// NOTE(review): this text looks like a rendered diff with the +/- markers stripped, so every
// row appears twice in succession - do not assemble as-is; confirm against the repository:
//   form A: lanes 5-6 stored at x11 + 16, both pointers stepped by 24 bytes per row;
//   form B: lanes 5-6 stored at x11 + x8, x11 post-incremented by 16 and x19 by 8.
// x8 is presumably the byte offset from one 4-channel output block to the next - TODO confirm.
C4Write6:
// Row 1 (v8 / v9), form A: x19 points 16 bytes past x11.
add x19, x11, #16
st1 {v8.4s}, [x11]
add x11, x11, #24
st1 {v9.2s}, [x19]
add x19, x19, #24
// Row 1 (v8 / v9), form B: x19 points x8 bytes past x11; both stores post-increment.
add x19, x11, x8
st1 {v8.4s}, [x11], #16
st1 {v9.2s}, [x19], #8
// Stop here when x6 == 1.
cmp x6, #1
beq WriteEnd
// Row 2 (v10 / v11): form A lines first, then form B lines.
st1 {v10.4s}, [x11]
add x11, x11, #24
st1 {v11.2s}, [x19]
add x19, x19, #24
st1 {v10.4s}, [x11], #16
st1 {v11.2s}, [x19], #8
cmp x6, #2
beq WriteEnd
// Row 3 (v12 / v13).
st1 {v12.4s}, [x11]
add x11, x11, #24
st1 {v13.2s}, [x19]
add x19, x19, #24
st1 {v12.4s}, [x11], #16
st1 {v13.2s}, [x19], #8
cmp x6, #3
beq WriteEnd
// Row 4 (v14 / v15): last row, no row-count check; form A omits the pointer advance.
st1 {v14.4s}, [x11]
st1 {v15.2s}, [x19]
st1 {v14.4s}, [x11], #16
st1 {v15.2s}, [x19], #8
b WriteEnd
// C4Write7: stores up to 4 rows of 7 32-bit elements each. A row is the four lanes of an even
// vector register (v8, v10, v12, v14), then the low two lanes of the next odd register
// (v9, v11, v13, v15), then lane 2 of that same odd register. After each of the first three
// rows x6 is compared with the number of rows written so far and control leaves through
// WriteEnd on a match.
// NOTE(review): this text looks like a rendered diff with the +/- markers stripped, so every
// row appears twice in succession - do not assemble as-is; confirm against the repository:
//   form A: x19 = x11 + 16, x16 = x11 + 24, all three pointers post-incremented by x10 (28);
//   form B: x19 = x11 + x8, x16 = x19 + 8, x11 post-incremented by 16, x19/x16 by x15 (12).
// x8 is presumably the byte offset from one 4-channel output block to the next - TODO confirm.
C4Write7:
// Form A setup: secondary pointers at fixed offsets from x11, 28-byte step in x10.
add x19, x11, #16
add x16, x11, #24
mov x10, #28
// Row 1 (v8 / v9), form A.
st1 {v8.4s}, [x11], x10
st1 {v9.2s}, [x19], x10
st1 {v9.s}[2], [x16], x10
// Form B setup: secondary pointers based at x11 + x8, 12-byte step in x15.
add x19, x11, x8
add x16, x19, #8
mov x15, #12
// Row 1 (v8 / v9), form B.
st1 {v8.4s}, [x11], #16
st1 {v9.2s}, [x19], x15
st1 {v9.s}[2], [x16], x15
// Stop here when x6 == 1.
cmp x6, #1
beq WriteEnd
// Row 2 (v10 / v11): form A lines first, then form B lines.
st1 {v10.4s}, [x11], x10
st1 {v11.2s}, [x19], x10
st1 {v11.s}[2], [x16], x10
st1 {v10.4s}, [x11], #16
st1 {v11.2s}, [x19], x15
st1 {v11.s}[2], [x16], x15
cmp x6, #2
beq WriteEnd
// Row 3 (v12 / v13).
st1 {v12.4s}, [x11], x10
st1 {v13.2s}, [x19], x10
st1 {v13.s}[2], [x16], x10
st1 {v12.4s}, [x11], #16
st1 {v13.2s}, [x19], x15
st1 {v13.s}[2], [x16], x15
cmp x6, #3
beq WriteEnd
// Row 4 (v14 / v15): last row, no row-count check.
st1 {v14.4s}, [x11], x10
st1 {v15.2s}, [x19], x10
st1 {v15.s}[2], [x16], x10
st1 {v14.4s}, [x11], #16
st1 {v15.2s}, [x19], x15
st1 {v15.s}[2], [x16], x15
b WriteEnd
C4Write8:
add x19, x11, x8

View File

@ -722,141 +722,113 @@ LoopRow8:
st1 {v22.4s}, [x11], #16
b WriteEnd
// C4Write5: stores up to 8 rows of 5 32-bit elements each. A row is the four lanes of an even
// vector register (v8, v10, ... v22) followed by the low element of the next odd register
// (s9, s11, ... s23). After each row x6 is compared with the number of rows written so far
// and control leaves through WriteEnd on a match.
// NOTE(review): this text looks like a rendered diff with the +/- markers stripped, so every
// row appears twice in succession - do not assemble as-is; confirm against the repository:
//   form A: 5th element stored at x11 + 16, both pointers stepped by 20 bytes per row;
//   form B: 5th element stored at x11 + x8, x11 post-incremented by 16 and x19 by 4.
// x8 is presumably the byte offset from one 4-channel output block to the next - TODO confirm.
C4Write5:
// Row 1 (v8 / s9), form A: x19 points 16 bytes past x11.
add x19, x11, #16
st1 {v8.4s}, [x11]
add x11, x11, #20
str s9, [x19]
add x19, x19, #20
// Row 1 (v8 / s9), form B: x19 points x8 bytes past x11; both stores post-increment.
add x19, x11, x8
st1 {v8.4s}, [x11], #16
str s9, [x19], #4
// Stop here when x6 == 1.
cmp x6, #1
beq WriteEnd
// Row 2 (v10 / s11): form A lines first, then form B lines.
st1 {v10.4s}, [x11]
add x11, x11, #20
str s11, [x19]
add x19, x19, #20
st1 {v10.4s}, [x11], #16
str s11, [x19], #4
cmp x6, #2
beq WriteEnd
// Row 3 (v12 / s13).
st1 {v12.4s}, [x11]
add x11, x11, #20
str s13, [x19]
add x19, x19, #20
st1 {v12.4s}, [x11], #16
str s13, [x19], #4
cmp x6, #3
beq WriteEnd
// Row 4 (v14 / s15).
st1 {v14.4s}, [x11]
add x11, x11, #20
str s15, [x19]
add x19, x19, #20
st1 {v14.4s}, [x11], #16
str s15, [x19], #4
cmp x6, #4
beq WriteEnd
// Row 5 (v16 / s17).
st1 {v16.4s}, [x11]
add x11, x11, #20
str s17, [x19]
add x19, x19, #20
st1 {v16.4s}, [x11], #16
str s17, [x19], #4
cmp x6, #5
beq WriteEnd
// Row 6 (v18 / s19).
st1 {v18.4s}, [x11]
add x11, x11, #20
str s19, [x19]
add x19, x19, #20
st1 {v18.4s}, [x11], #16
str s19, [x19], #4
cmp x6, #6
beq WriteEnd
// Row 7 (v20 / s21).
st1 {v20.4s}, [x11]
add x11, x11, #20
str s21, [x19]
add x19, x19, #20
st1 {v20.4s}, [x11], #16
str s21, [x19], #4
cmp x6, #7
beq WriteEnd
// Row 8 (v22 / s23): last row, no row-count check; form A omits the pointer advance.
st1 {v22.4s}, [x11]
str s23, [x19]
st1 {v22.4s}, [x11], #16
str s23, [x19], #4
b WriteEnd
// C4Write6: stores up to 8 rows of 6 32-bit elements each. A row is the four lanes of an even
// vector register (v8, v10, ... v22) followed by the low two lanes of the next odd register
// (v9, v11, ... v23). After each row x6 is compared with the number of rows written so far
// and control leaves through WriteEnd on a match.
// NOTE(review): this text looks like a rendered diff with the +/- markers stripped, so every
// row appears twice in succession - do not assemble as-is; confirm against the repository:
//   form A: lanes 5-6 stored at x11 + 16, both pointers stepped by 24 bytes per row;
//   form B: lanes 5-6 stored at x11 + x8, x11 post-incremented by 16 and x19 by 8.
// x8 is presumably the byte offset from one 4-channel output block to the next - TODO confirm.
C4Write6:
// Row 1 (v8 / v9), form A: x19 points 16 bytes past x11.
add x19, x11, #16
st1 {v8.4s}, [x11]
add x11, x11, #24
st1 {v9.2s}, [x19]
add x19, x19, #24
// Row 1 (v8 / v9), form B: x19 points x8 bytes past x11; both stores post-increment.
add x19, x11, x8
st1 {v8.4s}, [x11], #16
st1 {v9.2s}, [x19], #8
// Stop here when x6 == 1.
cmp x6, #1
beq WriteEnd
// Row 2 (v10 / v11): form A lines first, then form B lines.
st1 {v10.4s}, [x11]
add x11, x11, #24
st1 {v11.2s}, [x19]
add x19, x19, #24
st1 {v10.4s}, [x11], #16
st1 {v11.2s}, [x19], #8
cmp x6, #2
beq WriteEnd
// Row 3 (v12 / v13).
st1 {v12.4s}, [x11]
add x11, x11, #24
st1 {v13.2s}, [x19]
add x19, x19, #24
st1 {v12.4s}, [x11], #16
st1 {v13.2s}, [x19], #8
cmp x6, #3
beq WriteEnd
// Row 4 (v14 / v15).
st1 {v14.4s}, [x11]
add x11, x11, #24
st1 {v15.2s}, [x19]
add x19, x19, #24
st1 {v14.4s}, [x11], #16
st1 {v15.2s}, [x19], #8
cmp x6, #4
beq WriteEnd
// Row 5 (v16 / v17).
st1 {v16.4s}, [x11]
add x11, x11, #24
st1 {v17.2s}, [x19]
add x19, x19, #24
st1 {v16.4s}, [x11], #16
st1 {v17.2s}, [x19], #8
cmp x6, #5
beq WriteEnd
// Row 6 (v18 / v19).
st1 {v18.4s}, [x11]
add x11, x11, #24
st1 {v19.2s}, [x19]
add x19, x19, #24
st1 {v18.4s}, [x11], #16
st1 {v19.2s}, [x19], #8
cmp x6, #6
beq WriteEnd
// Row 7 (v20 / v21).
st1 {v20.4s}, [x11]
add x11, x11, #24
st1 {v21.2s}, [x19]
add x19, x19, #24
st1 {v20.4s}, [x11], #16
st1 {v21.2s}, [x19], #8
cmp x6, #7
beq WriteEnd
// Row 8 (v22 / v23): last row, no row-count check; form A omits the pointer advance.
st1 {v22.4s}, [x11]
st1 {v23.2s}, [x19]
st1 {v22.4s}, [x11], #16
st1 {v23.2s}, [x19], #8
b WriteEnd
// C4Write7: stores up to 8 rows of 7 32-bit elements each. A row is the four lanes of an even
// vector register (v8, v10, ... v22), then the low two lanes of the next odd register
// (v9, v11, ... v23), then lane 2 of that same odd register. After each of the first seven
// rows x6 is compared with the number of rows written so far and control leaves through
// WriteEnd on a match.
// NOTE(review): this text looks like a rendered diff with the +/- markers stripped, so every
// row appears twice in succession - do not assemble as-is; confirm against the repository:
//   form A: x19 = x11 + 16, x16 = x11 + 24, all three pointers post-incremented by x10 (28);
//   form B: x19 = x11 + x8, x16 = x19 + 8, x11 post-incremented by 16, x19/x16 by x15 (12).
// x8 is presumably the byte offset from one 4-channel output block to the next - TODO confirm.
C4Write7:
// Form A setup: secondary pointers at fixed offsets from x11, 28-byte step in x10.
add x19, x11, #16
add x16, x11, #24
mov x10, #28
// Row 1 (v8 / v9), form A.
st1 {v8.4s}, [x11], x10
st1 {v9.2s}, [x19], x10
st1 {v9.s}[2], [x16], x10
// Form B setup: secondary pointers based at x11 + x8, 12-byte step in x15.
add x19, x11, x8
add x16, x19, #8
mov x15, #12
// Row 1 (v8 / v9), form B.
st1 {v8.4s}, [x11], #16
st1 {v9.2s}, [x19], x15
st1 {v9.s}[2], [x16], x15
// Stop here when x6 == 1.
cmp x6, #1
beq WriteEnd
// Row 2 (v10 / v11): form A lines first, then form B lines.
st1 {v10.4s}, [x11], x10
st1 {v11.2s}, [x19], x10
st1 {v11.s}[2], [x16], x10
st1 {v10.4s}, [x11], #16
st1 {v11.2s}, [x19], x15
st1 {v11.s}[2], [x16], x15
cmp x6, #2
beq WriteEnd
// Row 3 (v12 / v13).
st1 {v12.4s}, [x11], x10
st1 {v13.2s}, [x19], x10
st1 {v13.s}[2], [x16], x10
st1 {v12.4s}, [x11], #16
st1 {v13.2s}, [x19], x15
st1 {v13.s}[2], [x16], x15
cmp x6, #3
beq WriteEnd
// Row 4 (v14 / v15).
st1 {v14.4s}, [x11], x10
st1 {v15.2s}, [x19], x10
st1 {v15.s}[2], [x16], x10
st1 {v14.4s}, [x11], #16
st1 {v15.2s}, [x19], x15
st1 {v15.s}[2], [x16], x15
cmp x6, #4
beq WriteEnd
// Row 5 (v16 / v17).
st1 {v16.4s}, [x11], x10
st1 {v17.2s}, [x19], x10
st1 {v17.s}[2], [x16], x10
st1 {v16.4s}, [x11], #16
st1 {v17.2s}, [x19], x15
st1 {v17.s}[2], [x16], x15
cmp x6, #5
beq WriteEnd
// Row 6 (v18 / v19).
st1 {v18.4s}, [x11], x10
st1 {v19.2s}, [x19], x10
st1 {v19.s}[2], [x16], x10
st1 {v18.4s}, [x11], #16
st1 {v19.2s}, [x19], x15
st1 {v19.s}[2], [x16], x15
cmp x6, #6
beq WriteEnd
// Row 7 (v20 / v21).
st1 {v20.4s}, [x11], x10
st1 {v21.2s}, [x19], x10
st1 {v21.s}[2], [x16], x10
st1 {v20.4s}, [x11], #16
st1 {v21.2s}, [x19], x15
st1 {v21.s}[2], [x16], x15
cmp x6, #7
beq WriteEnd
// Row 8 (v22 / v23): last row, no row-count check.
st1 {v22.4s}, [x11], x10
st1 {v23.2s}, [x19], x10
st1 {v23.s}[2], [x16], x10
st1 {v22.4s}, [x11], #16
st1 {v23.2s}, [x19], x15
st1 {v23.s}[2], [x16], x15
b WriteEnd
C4Write8:
add x19, x11, x8

View File

@ -195,13 +195,14 @@ int Convolution1x1CPUKernel::DoConv1x1Hw(int task_id) {
MatMulOpt(thread_pack_input, reinterpret_cast<float *>(packed_weight_), cur_output,
reinterpret_cast<float *>(bias_data_), matmul_param_->act_type_, matmul_param_->deep_, cur_rows,
matmul_param_->col_, matmul_param_->col_, OutType_Nhwc);
cur_output += row_tile_ * matmul_param_->col_;
} else {
MatMulOpt(thread_pack_input, reinterpret_cast<float *>(packed_weight_), cur_output,
reinterpret_cast<float *>(bias_data_), matmul_param_->act_type_, matmul_param_->deep_, cur_rows,
matmul_param_->col_, matmul_param_->row_, OutType_NC4HW4);
cur_output += row_tile_ * MSMIN(matmul_param_->col_, C4NUM);
}
cur_intput += row_tile_ * matmul_param_->deep_;
cur_output += row_tile_ * matmul_param_->col_;
}
return RET_OK;