forked from mindspore-Ecosystem/mindspore
!22186 fix c4 output issue
Merge pull request !22186 from zhaozhenlong/lite/issue/fix_c4_output_offset_issue
This commit is contained in:
commit
4661b47b52
|
@ -1345,239 +1345,162 @@ LoopRow4:
|
|||
st1 {v30.4s}, [x11], #16
|
||||
b WriteEnd
|
||||
C4Write5:
|
||||
add x19, x11, #16
|
||||
st1 {v8.4s}, [x11]
|
||||
add x11, x11, #20
|
||||
str s9, [x19]
|
||||
add x19, x19, #20
|
||||
add x19, x11, x8
|
||||
st1 {v8.4s}, [x11], #16
|
||||
str s9, [x19], #4
|
||||
cmp x6, #1
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v10.4s}, [x11]
|
||||
add x11, x11, #20
|
||||
str s11, [x19]
|
||||
add x19, x19, #20
|
||||
st1 {v10.4s}, [x11], #16
|
||||
str s11, [x19], #4
|
||||
cmp x6, #2
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v12.4s}, [x11]
|
||||
add x11, x11, #20
|
||||
str s13, [x19]
|
||||
add x19, x19, #20
|
||||
st1 {v12.4s}, [x11], #16
|
||||
str s13, [x19], #4
|
||||
cmp x6, #3
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v14.4s}, [x11]
|
||||
add x11, x11, #20
|
||||
str s15, [x19]
|
||||
add x19, x19, #20
|
||||
st1 {v14.4s}, [x11], #16
|
||||
str s15, [x19], #4
|
||||
cmp x6, #4
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v16.4s}, [x11]
|
||||
add x11, x11, #20
|
||||
str s17, [x19]
|
||||
add x19, x19, #20
|
||||
st1 {v16.4s}, [x11], #16
|
||||
str s17, [x19], #4
|
||||
cmp x6, #5
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v18.4s}, [x11]
|
||||
add x11, x11, #20
|
||||
str s19, [x19]
|
||||
add x19, x19, #20
|
||||
st1 {v18.4s}, [x11], #16
|
||||
str s19, [x19], #4
|
||||
cmp x6, #6
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v20.4s}, [x11]
|
||||
add x11, x11, #20
|
||||
str s21, [x19]
|
||||
add x19, x19, #20
|
||||
st1 {v20.4s}, [x11], #16
|
||||
str s21, [x19], #4
|
||||
cmp x6, #7
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v22.4s}, [x11]
|
||||
add x11, x11, #20
|
||||
str s23, [x19]
|
||||
add x19, x19, #20
|
||||
st1 {v22.4s}, [x11], #16
|
||||
str s23, [x19], #4
|
||||
cmp x6, #8
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v24.4s}, [x11]
|
||||
add x11, x11, #20
|
||||
str s25, [x19]
|
||||
add x19, x19, #20
|
||||
st1 {v24.4s}, [x11], #16
|
||||
str s25, [x19], #4
|
||||
cmp x6, #9
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v26.4s}, [x11]
|
||||
add x11, x11, #20
|
||||
str s27, [x19]
|
||||
add x19, x19, #20
|
||||
st1 {v26.4s}, [x11], #16
|
||||
str s27, [x19], #4
|
||||
cmp x6, #10
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v28.4s}, [x11]
|
||||
add x11, x11, #20
|
||||
str s29, [x19]
|
||||
add x19, x19, #20
|
||||
st1 {v28.4s}, [x11], #16
|
||||
str s29, [x19], #4
|
||||
cmp x6, #11
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v30.4s}, [x11]
|
||||
str s31, [x19]
|
||||
st1 {v30.4s}, [x11], #16
|
||||
str s31, [x19], #4
|
||||
b WriteEnd
|
||||
C4Write6:
|
||||
add x19, x11, #16
|
||||
st1 {v8.4s}, [x11]
|
||||
add x11, x11, #24
|
||||
st1 {v9.2s}, [x19]
|
||||
add x19, x19, #24
|
||||
add x19, x11, x8
|
||||
st1 {v8.4s}, [x11], #16
|
||||
st1 {v9.2s}, [x19], #8
|
||||
cmp x6, #1
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v10.4s}, [x11]
|
||||
add x11, x11, #24
|
||||
st1 {v11.2s}, [x19]
|
||||
add x19, x19, #24
|
||||
st1 {v10.4s}, [x11], #16
|
||||
st1 {v11.2s}, [x19], #8
|
||||
cmp x6, #2
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v12.4s}, [x11]
|
||||
add x11, x11, #24
|
||||
st1 {v13.2s}, [x19]
|
||||
add x19, x19, #24
|
||||
st1 {v12.4s}, [x11], #16
|
||||
st1 {v13.2s}, [x19], #8
|
||||
cmp x6, #3
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v14.4s}, [x11]
|
||||
add x11, x11, #24
|
||||
st1 {v15.2s}, [x19]
|
||||
add x19, x19, #24
|
||||
st1 {v14.4s}, [x11], #16
|
||||
st1 {v15.2s}, [x19], #8
|
||||
cmp x6, #4
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v16.4s}, [x11]
|
||||
add x11, x11, #24
|
||||
st1 {v17.2s}, [x19]
|
||||
add x19, x19, #24
|
||||
st1 {v16.4s}, [x11], #16
|
||||
st1 {v17.2s}, [x19], #8
|
||||
cmp x6, #5
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v18.4s}, [x11]
|
||||
add x11, x11, #24
|
||||
st1 {v19.2s}, [x19]
|
||||
add x19, x19, #24
|
||||
st1 {v18.4s}, [x11], #16
|
||||
st1 {v19.2s}, [x19], #8
|
||||
cmp x6, #6
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v20.4s}, [x11]
|
||||
add x11, x11, #24
|
||||
st1 {v21.2s}, [x19]
|
||||
add x19, x19, #24
|
||||
st1 {v20.4s}, [x11], #16
|
||||
st1 {v21.2s}, [x19], #8
|
||||
cmp x6, #7
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v22.4s}, [x11]
|
||||
add x11, x11, #24
|
||||
st1 {v23.2s}, [x19]
|
||||
add x19, x19, #24
|
||||
st1 {v22.4s}, [x11], #16
|
||||
st1 {v23.2s}, [x19], #8
|
||||
cmp x6, #8
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v24.4s}, [x11]
|
||||
add x11, x11, #24
|
||||
st1 {v25.2s}, [x19]
|
||||
add x19, x19, #24
|
||||
st1 {v24.4s}, [x11], #16
|
||||
st1 {v25.2s}, [x19], #8
|
||||
cmp x6, #9
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v26.4s}, [x11]
|
||||
add x11, x11, #24
|
||||
st1 {v27.2s}, [x19]
|
||||
add x19, x19, #24
|
||||
st1 {v26.4s}, [x11], #16
|
||||
st1 {v27.2s}, [x19], #8
|
||||
cmp x6, #10
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v28.4s}, [x11]
|
||||
add x11, x11, #24
|
||||
st1 {v29.2s}, [x19]
|
||||
add x19, x19, #24
|
||||
st1 {v28.4s}, [x11], #16
|
||||
st1 {v29.2s}, [x19], #8
|
||||
cmp x6, #11
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v30.4s}, [x11]
|
||||
st1 {v31.2s}, [x19]
|
||||
st1 {v30.4s}, [x11], #16
|
||||
st1 {v31.2s}, [x19], #8
|
||||
b WriteEnd
|
||||
C4Write7:
|
||||
add x19, x11, #16
|
||||
add x16, x11, #24
|
||||
mov x10, #28
|
||||
st1 {v8.4s}, [x11], x10
|
||||
st1 {v9.2s}, [x19], x10
|
||||
st1 {v9.s}[2], [x16], x10
|
||||
add x19, x11, x8
|
||||
add x16, x19, #8
|
||||
mov x15, #12
|
||||
st1 {v8.4s}, [x11], #16
|
||||
st1 {v9.2s}, [x19], x15
|
||||
st1 {v9.s}[2], [x16], x15
|
||||
cmp x6, #1
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v10.4s}, [x11], x10
|
||||
st1 {v11.2s}, [x19], x10
|
||||
st1 {v11.s}[2], [x16], x10
|
||||
st1 {v10.4s}, [x11], #16
|
||||
st1 {v11.2s}, [x19], x15
|
||||
st1 {v11.s}[2], [x16], x15
|
||||
cmp x6, #2
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v12.4s}, [x11], x10
|
||||
st1 {v13.2s}, [x19], x10
|
||||
st1 {v13.s}[2], [x16], x10
|
||||
st1 {v12.4s}, [x11], #16
|
||||
st1 {v13.2s}, [x19], x15
|
||||
st1 {v13.s}[2], [x16], x15
|
||||
cmp x6, #3
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v14.4s}, [x11], x10
|
||||
st1 {v15.2s}, [x19], x10
|
||||
st1 {v15.s}[2], [x16], x10
|
||||
st1 {v14.4s}, [x11], #16
|
||||
st1 {v15.2s}, [x19], x15
|
||||
st1 {v15.s}[2], [x16], x15
|
||||
cmp x6, #4
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v16.4s}, [x11], x10
|
||||
st1 {v17.2s}, [x19], x10
|
||||
st1 {v17.s}[2], [x16], x10
|
||||
st1 {v16.4s}, [x11], #16
|
||||
st1 {v17.2s}, [x19], x15
|
||||
st1 {v17.s}[2], [x16], x15
|
||||
cmp x6, #5
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v18.4s}, [x11], x10
|
||||
st1 {v19.2s}, [x19], x10
|
||||
st1 {v19.s}[2], [x16], x10
|
||||
st1 {v18.4s}, [x11], #16
|
||||
st1 {v19.2s}, [x19], x15
|
||||
st1 {v19.s}[2], [x16], x15
|
||||
cmp x6, #6
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v20.4s}, [x11], x10
|
||||
st1 {v21.2s}, [x19], x10
|
||||
st1 {v21.s}[2], [x16], x10
|
||||
st1 {v20.4s}, [x11], #16
|
||||
st1 {v21.2s}, [x19], x15
|
||||
st1 {v21.s}[2], [x16], x15
|
||||
cmp x6, #7
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v22.4s}, [x11], x10
|
||||
st1 {v23.2s}, [x19], x10
|
||||
st1 {v23.s}[2], [x16], x10
|
||||
st1 {v22.4s}, [x11], #16
|
||||
st1 {v23.2s}, [x19], x15
|
||||
st1 {v23.s}[2], [x16], x15
|
||||
cmp x6, #8
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v24.4s}, [x11], x10
|
||||
st1 {v25.2s}, [x19], x10
|
||||
st1 {v25.s}[2], [x16], x10
|
||||
st1 {v24.4s}, [x11], #16
|
||||
st1 {v25.2s}, [x19], x15
|
||||
st1 {v25.s}[2], [x16], x15
|
||||
cmp x6, #9
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v26.4s}, [x11], x10
|
||||
st1 {v27.2s}, [x19], x10
|
||||
st1 {v27.s}[2], [x16], x10
|
||||
st1 {v26.4s}, [x11], #16
|
||||
st1 {v27.2s}, [x19], x15
|
||||
st1 {v27.s}[2], [x16], x15
|
||||
cmp x6, #10
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v28.4s}, [x11], x10
|
||||
st1 {v29.2s}, [x19], x10
|
||||
st1 {v29.s}[2], [x16], x10
|
||||
st1 {v28.4s}, [x11], #16
|
||||
st1 {v29.2s}, [x19], x15
|
||||
st1 {v29.s}[2], [x16], x15
|
||||
cmp x6, #11
|
||||
beq WriteEnd
|
||||
|
||||
st1 {v30.4s}, [x11]
|
||||
st1 {v31.2s}, [x19]
|
||||
st1 {v31.s}[2], [x16]
|
||||
|
|
|
@ -398,7 +398,7 @@ LoopRow:
|
|||
str s26, [x11]
|
||||
cmp x6, #10
|
||||
beq WriteEnd
|
||||
add x11, x11, x8
|
||||
add x11, x11, x8
|
||||
str s28, [x11]
|
||||
cmp x6, #11
|
||||
beq WriteEnd
|
||||
|
@ -972,204 +972,160 @@ add x11, x11, x8
|
|||
st1 {v30.4s}, [x11], #16
|
||||
b WriteEnd
|
||||
C4Write5:
|
||||
add x19, x11, #16
|
||||
st1 {v8.4s}, [x11]
|
||||
add x11, x11, #20
|
||||
str s9, [x19]
|
||||
add x19, x19, #20
|
||||
add x19, x11, x8
|
||||
st1 {v8.4s}, [x11], #16
|
||||
str s9, [x19], #4
|
||||
cmp x6, #1
|
||||
beq WriteEnd
|
||||
st1 {v10.4s}, [x11]
|
||||
add x11, x11, #20
|
||||
str s11, [x19]
|
||||
add x19, x19, #20
|
||||
st1 {v10.4s}, [x11], #16
|
||||
str s11, [x19], #4
|
||||
cmp x6, #2
|
||||
beq WriteEnd
|
||||
st1 {v12.4s}, [x11]
|
||||
add x11, x11, #20
|
||||
str s13, [x19]
|
||||
add x19, x19, #20
|
||||
st1 {v12.4s}, [x11], #16
|
||||
str s13, [x19], #4
|
||||
cmp x6, #3
|
||||
beq WriteEnd
|
||||
st1 {v14.4s}, [x11]
|
||||
add x11, x11, #20
|
||||
str s15, [x19]
|
||||
add x19, x19, #20
|
||||
st1 {v14.4s}, [x11], #16
|
||||
str s15, [x19], #4
|
||||
cmp x6, #4
|
||||
beq WriteEnd
|
||||
st1 {v16.4s}, [x11]
|
||||
add x11, x11, #20
|
||||
str s17, [x19]
|
||||
add x19, x19, #20
|
||||
st1 {v16.4s}, [x11], #16
|
||||
str s17, [x19], #4
|
||||
cmp x6, #5
|
||||
beq WriteEnd
|
||||
st1 {v18.4s}, [x11]
|
||||
add x11, x11, #20
|
||||
str s19, [x19]
|
||||
add x19, x19, #20
|
||||
st1 {v18.4s}, [x11], #16
|
||||
str s19, [x19], #4
|
||||
cmp x6, #6
|
||||
beq WriteEnd
|
||||
st1 {v20.4s}, [x11]
|
||||
add x11, x11, #20
|
||||
str s21, [x19]
|
||||
add x19, x19, #20
|
||||
st1 {v20.4s}, [x11], #16
|
||||
str s21, [x19], #4
|
||||
cmp x6, #7
|
||||
beq WriteEnd
|
||||
st1 {v22.4s}, [x11]
|
||||
add x11, x11, #20
|
||||
str s23, [x19]
|
||||
add x19, x19, #20
|
||||
st1 {v22.4s}, [x11], #16
|
||||
str s23, [x19], #4
|
||||
cmp x6, #8
|
||||
beq WriteEnd
|
||||
st1 {v24.4s}, [x11]
|
||||
add x11, x11, #20
|
||||
str s25, [x19]
|
||||
add x19, x19, #20
|
||||
st1 {v24.4s}, [x11], #16
|
||||
str s25, [x19], #4
|
||||
cmp x6, #9
|
||||
beq WriteEnd
|
||||
st1 {v26.4s}, [x11]
|
||||
add x11, x11, #20
|
||||
str s27, [x19]
|
||||
add x19, x19, #20
|
||||
st1 {v26.4s}, [x11], #16
|
||||
str s27, [x19], #4
|
||||
cmp x6, #10
|
||||
beq WriteEnd
|
||||
st1 {v28.4s}, [x11]
|
||||
add x11, x11, #20
|
||||
str s29, [x19]
|
||||
add x19, x19, #20
|
||||
st1 {v28.4s}, [x11], #16
|
||||
str s29, [x19], #4
|
||||
cmp x6, #11
|
||||
beq WriteEnd
|
||||
st1 {v30.4s}, [x11]
|
||||
str s31, [x19]
|
||||
st1 {v30.4s}, [x11], #16
|
||||
str s31, [x19], #4
|
||||
b WriteEnd
|
||||
C4Write6:
|
||||
add x19, x11, #16
|
||||
st1 {v8.4s}, [x11]
|
||||
add x11, x11, #24
|
||||
st1 {v9.2s}, [x19]
|
||||
add x19, x19, #24
|
||||
add x19, x11, x8
|
||||
st1 {v8.4s}, [x11], #16
|
||||
st1 {v9.2s}, [x19], #8
|
||||
cmp x6, #1
|
||||
beq WriteEnd
|
||||
st1 {v10.4s}, [x11]
|
||||
add x11, x11, #24
|
||||
st1 {v11.2s}, [x19]
|
||||
add x19, x19, #24
|
||||
st1 {v10.4s}, [x11], #16
|
||||
st1 {v11.2s}, [x19], #8
|
||||
cmp x6, #2
|
||||
beq WriteEnd
|
||||
st1 {v12.4s}, [x11]
|
||||
add x11, x11, #24
|
||||
st1 {v13.2s}, [x19]
|
||||
add x19, x19, #24
|
||||
st1 {v12.4s}, [x11], #16
|
||||
st1 {v13.2s}, [x19], #8
|
||||
cmp x6, #3
|
||||
beq WriteEnd
|
||||
st1 {v14.4s}, [x11]
|
||||
add x11, x11, #24
|
||||
st1 {v15.2s}, [x19]
|
||||
add x19, x19, #24
|
||||
st1 {v14.4s}, [x11], #16
|
||||
st1 {v15.2s}, [x19], #8
|
||||
cmp x6, #4
|
||||
beq WriteEnd
|
||||
st1 {v16.4s}, [x11]
|
||||
add x11, x11, #24
|
||||
st1 {v17.2s}, [x19]
|
||||
add x19, x19, #24
|
||||
st1 {v16.4s}, [x11], #16
|
||||
st1 {v17.2s}, [x19], #8
|
||||
cmp x6, #5
|
||||
beq WriteEnd
|
||||
st1 {v18.4s}, [x11]
|
||||
add x11, x11, #24
|
||||
st1 {v19.2s}, [x19]
|
||||
add x19, x19, #24
|
||||
st1 {v18.4s}, [x11], #16
|
||||
st1 {v19.2s}, [x19], #8
|
||||
cmp x6, #6
|
||||
beq WriteEnd
|
||||
st1 {v20.4s}, [x11]
|
||||
add x11, x11, #24
|
||||
st1 {v21.2s}, [x19]
|
||||
add x19, x19, #24
|
||||
st1 {v20.4s}, [x11], #16
|
||||
st1 {v21.2s}, [x19], #8
|
||||
cmp x6, #7
|
||||
beq WriteEnd
|
||||
st1 {v22.4s}, [x11]
|
||||
add x11, x11, #24
|
||||
st1 {v23.2s}, [x19]
|
||||
add x19, x19, #24
|
||||
st1 {v22.4s}, [x11], #16
|
||||
st1 {v23.2s}, [x19], #8
|
||||
cmp x6, #8
|
||||
beq WriteEnd
|
||||
st1 {v24.4s}, [x11]
|
||||
add x11, x11, #24
|
||||
st1 {v25.2s}, [x19]
|
||||
add x19, x19, #24
|
||||
st1 {v24.4s}, [x11], #16
|
||||
st1 {v25.2s}, [x19], #8
|
||||
cmp x6, #9
|
||||
beq WriteEnd
|
||||
st1 {v26.4s}, [x11]
|
||||
add x11, x11, #24
|
||||
st1 {v27.2s}, [x19]
|
||||
add x19, x19, #24
|
||||
st1 {v26.4s}, [x11], #16
|
||||
st1 {v27.2s}, [x19], #8
|
||||
cmp x6, #10
|
||||
beq WriteEnd
|
||||
st1 {v28.4s}, [x11]
|
||||
add x11, x11, #24
|
||||
st1 {v29.2s}, [x19]
|
||||
add x19, x19, #24
|
||||
st1 {v28.4s}, [x11], #16
|
||||
st1 {v29.2s}, [x19], #8
|
||||
cmp x6, #11
|
||||
beq WriteEnd
|
||||
st1 {v30.4s}, [x11]
|
||||
st1 {v31.2s}, [x19]
|
||||
st1 {v30.4s}, [x11], #16
|
||||
st1 {v31.2s}, [x19], #8
|
||||
b WriteEnd
|
||||
C4Write7:
|
||||
add x19, x11, #16
|
||||
add x16, x11, #24
|
||||
mov x10, #28
|
||||
st1 {v8.4s}, [x11], x10
|
||||
st1 {v9.2s}, [x19], x10
|
||||
st1 {v9.s}[2], [x16], x10
|
||||
add x19, x11, x8
|
||||
add x16, x19, #8
|
||||
mov x15, #12
|
||||
st1 {v8.4s}, [x11], #16
|
||||
st1 {v9.2s}, [x19], x15
|
||||
st1 {v9.s}[2], [x16], x15
|
||||
cmp x6, #1
|
||||
beq WriteEnd
|
||||
st1 {v10.4s}, [x11], x10
|
||||
st1 {v11.2s}, [x19], x10
|
||||
st1 {v11.s}[2], [x16], x10
|
||||
st1 {v10.4s}, [x11], #16
|
||||
st1 {v11.2s}, [x19], x15
|
||||
st1 {v11.s}[2], [x16], x15
|
||||
cmp x6, #2
|
||||
beq WriteEnd
|
||||
st1 {v12.4s}, [x11], x10
|
||||
st1 {v13.2s}, [x19], x10
|
||||
st1 {v13.s}[2], [x16], x10
|
||||
st1 {v12.4s}, [x11], #16
|
||||
st1 {v13.2s}, [x19], x15
|
||||
st1 {v13.s}[2], [x16], x15
|
||||
cmp x6, #3
|
||||
beq WriteEnd
|
||||
st1 {v14.4s}, [x11], x10
|
||||
st1 {v15.2s}, [x19], x10
|
||||
st1 {v15.s}[2], [x16], x10
|
||||
st1 {v14.4s}, [x11], #16
|
||||
st1 {v15.2s}, [x19], x15
|
||||
st1 {v15.s}[2], [x16], x15
|
||||
cmp x6, #4
|
||||
beq WriteEnd
|
||||
st1 {v16.4s}, [x11], x10
|
||||
st1 {v17.2s}, [x19], x10
|
||||
st1 {v17.s}[2], [x16], x10
|
||||
st1 {v16.4s}, [x11], #16
|
||||
st1 {v17.2s}, [x19], x15
|
||||
st1 {v17.s}[2], [x16], x15
|
||||
cmp x6, #5
|
||||
beq WriteEnd
|
||||
st1 {v18.4s}, [x11], x10
|
||||
st1 {v19.2s}, [x19], x10
|
||||
st1 {v19.s}[2], [x16], x10
|
||||
st1 {v18.4s}, [x11], #16
|
||||
st1 {v19.2s}, [x19], x15
|
||||
st1 {v19.s}[2], [x16], x15
|
||||
cmp x6, #6
|
||||
beq WriteEnd
|
||||
st1 {v20.4s}, [x11], x10
|
||||
st1 {v21.2s}, [x19], x10
|
||||
st1 {v21.s}[2], [x16], x10
|
||||
st1 {v20.4s}, [x11], #16
|
||||
st1 {v21.2s}, [x19], x15
|
||||
st1 {v21.s}[2], [x16], x15
|
||||
cmp x6, #7
|
||||
beq WriteEnd
|
||||
st1 {v22.4s}, [x11], x10
|
||||
st1 {v23.2s}, [x19], x10
|
||||
st1 {v23.s}[2], [x16], x10
|
||||
st1 {v22.4s}, [x11], #16
|
||||
st1 {v23.2s}, [x19], x15
|
||||
st1 {v23.s}[2], [x16], x15
|
||||
cmp x6, #8
|
||||
beq WriteEnd
|
||||
st1 {v24.4s}, [x11], x10
|
||||
st1 {v25.2s}, [x19], x10
|
||||
st1 {v25.s}[2], [x16], x10
|
||||
st1 {v24.4s}, [x11], #16
|
||||
st1 {v25.2s}, [x19], x15
|
||||
st1 {v25.s}[2], [x16], x15
|
||||
cmp x6, #9
|
||||
beq WriteEnd
|
||||
st1 {v26.4s}, [x11], x10
|
||||
st1 {v27.2s}, [x19], x10
|
||||
st1 {v27.s}[2], [x16], x10
|
||||
st1 {v26.4s}, [x11], #16
|
||||
st1 {v27.2s}, [x19], x15
|
||||
st1 {v27.s}[2], [x16], x15
|
||||
cmp x6, #10
|
||||
beq WriteEnd
|
||||
st1 {v28.4s}, [x11], x10
|
||||
st1 {v29.2s}, [x19], x10
|
||||
st1 {v29.s}[2], [x16], x10
|
||||
st1 {v28.4s}, [x11], #16
|
||||
st1 {v29.2s}, [x19], x15
|
||||
st1 {v29.s}[2], [x16], x15
|
||||
cmp x6, #11
|
||||
beq WriteEnd
|
||||
st1 {v30.4s}, [x11]
|
||||
|
|
|
@ -475,73 +475,61 @@ LoopRow4:
|
|||
st1 {v14.4s}, [x11], #16
|
||||
b WriteEnd
|
||||
C4Write5:
|
||||
add x19, x11, #16
|
||||
st1 {v8.4s}, [x11]
|
||||
add x11, x11, #20
|
||||
str s9, [x19]
|
||||
add x19, x19, #20
|
||||
add x19, x11, x8
|
||||
st1 {v8.4s}, [x11], #16
|
||||
str s9, [x19], #4
|
||||
cmp x6, #1
|
||||
beq WriteEnd
|
||||
st1 {v10.4s}, [x11]
|
||||
add x11, x11, #20
|
||||
str s11, [x19]
|
||||
add x19, x19, #20
|
||||
st1 {v10.4s}, [x11], #16
|
||||
str s11, [x19], #4
|
||||
cmp x6, #2
|
||||
beq WriteEnd
|
||||
st1 {v12.4s}, [x11]
|
||||
add x11, x11, #20
|
||||
str s13, [x19]
|
||||
add x19, x19, #20
|
||||
st1 {v12.4s}, [x11], #16
|
||||
str s13, [x19], #4
|
||||
cmp x6, #3
|
||||
beq WriteEnd
|
||||
st1 {v14.4s}, [x11]
|
||||
str s15, [x19]
|
||||
st1 {v14.4s}, [x11], #16
|
||||
str s15, [x19], #4
|
||||
b WriteEnd
|
||||
C4Write6:
|
||||
add x19, x11, #16
|
||||
st1 {v8.4s}, [x11]
|
||||
add x11, x11, #24
|
||||
st1 {v9.2s}, [x19]
|
||||
add x19, x19, #24
|
||||
add x19, x11, x8
|
||||
st1 {v8.4s}, [x11], #16
|
||||
st1 {v9.2s}, [x19], #8
|
||||
cmp x6, #1
|
||||
beq WriteEnd
|
||||
st1 {v10.4s}, [x11]
|
||||
add x11, x11, #24
|
||||
st1 {v11.2s}, [x19]
|
||||
add x19, x19, #24
|
||||
st1 {v10.4s}, [x11], #16
|
||||
st1 {v11.2s}, [x19], #8
|
||||
cmp x6, #2
|
||||
beq WriteEnd
|
||||
st1 {v12.4s}, [x11]
|
||||
add x11, x11, #24
|
||||
st1 {v13.2s}, [x19]
|
||||
add x19, x19, #24
|
||||
st1 {v12.4s}, [x11], #16
|
||||
st1 {v13.2s}, [x19], #8
|
||||
cmp x6, #3
|
||||
beq WriteEnd
|
||||
st1 {v14.4s}, [x11]
|
||||
st1 {v15.2s}, [x19]
|
||||
st1 {v14.4s}, [x11], #16
|
||||
st1 {v15.2s}, [x19], #8
|
||||
b WriteEnd
|
||||
C4Write7:
|
||||
add x19, x11, #16
|
||||
add x16, x11, #24
|
||||
mov x10, #28
|
||||
st1 {v8.4s}, [x11], x10
|
||||
st1 {v9.2s}, [x19], x10
|
||||
st1 {v9.s}[2], [x16], x10
|
||||
add x19, x11, x8
|
||||
add x16, x19, #8
|
||||
mov x15, #12
|
||||
st1 {v8.4s}, [x11], #16
|
||||
st1 {v9.2s}, [x19], x15
|
||||
st1 {v9.s}[2], [x16], x15
|
||||
cmp x6, #1
|
||||
beq WriteEnd
|
||||
st1 {v10.4s}, [x11], x10
|
||||
st1 {v11.2s}, [x19], x10
|
||||
st1 {v11.s}[2], [x16], x10
|
||||
st1 {v10.4s}, [x11], #16
|
||||
st1 {v11.2s}, [x19], x15
|
||||
st1 {v11.s}[2], [x16], x15
|
||||
cmp x6, #2
|
||||
beq WriteEnd
|
||||
st1 {v12.4s}, [x11], x10
|
||||
st1 {v13.2s}, [x19], x10
|
||||
st1 {v13.s}[2], [x16], x10
|
||||
st1 {v12.4s}, [x11], #16
|
||||
st1 {v13.2s}, [x19], x15
|
||||
st1 {v13.s}[2], [x16], x15
|
||||
cmp x6, #3
|
||||
beq WriteEnd
|
||||
st1 {v14.4s}, [x11], x10
|
||||
st1 {v15.2s}, [x19], x10
|
||||
st1 {v15.s}[2], [x16], x10
|
||||
st1 {v14.4s}, [x11], #16
|
||||
st1 {v15.2s}, [x19], x15
|
||||
st1 {v15.s}[2], [x16], x15
|
||||
b WriteEnd
|
||||
C4Write8:
|
||||
add x19, x11, x8
|
||||
|
|
|
@ -722,141 +722,113 @@ LoopRow8:
|
|||
st1 {v22.4s}, [x11], #16
|
||||
b WriteEnd
|
||||
C4Write5:
|
||||
add x19, x11, #16
|
||||
st1 {v8.4s}, [x11]
|
||||
add x11, x11, #20
|
||||
str s9, [x19]
|
||||
add x19, x19, #20
|
||||
add x19, x11, x8
|
||||
st1 {v8.4s}, [x11], #16
|
||||
str s9, [x19], #4
|
||||
cmp x6, #1
|
||||
beq WriteEnd
|
||||
st1 {v10.4s}, [x11]
|
||||
add x11, x11, #20
|
||||
str s11, [x19]
|
||||
add x19, x19, #20
|
||||
st1 {v10.4s}, [x11], #16
|
||||
str s11, [x19], #4
|
||||
cmp x6, #2
|
||||
beq WriteEnd
|
||||
st1 {v12.4s}, [x11]
|
||||
add x11, x11, #20
|
||||
str s13, [x19]
|
||||
add x19, x19, #20
|
||||
st1 {v12.4s}, [x11], #16
|
||||
str s13, [x19], #4
|
||||
cmp x6, #3
|
||||
beq WriteEnd
|
||||
st1 {v14.4s}, [x11]
|
||||
add x11, x11, #20
|
||||
str s15, [x19]
|
||||
add x19, x19, #20
|
||||
st1 {v14.4s}, [x11], #16
|
||||
str s15, [x19], #4
|
||||
cmp x6, #4
|
||||
beq WriteEnd
|
||||
st1 {v16.4s}, [x11]
|
||||
add x11, x11, #20
|
||||
str s17, [x19]
|
||||
add x19, x19, #20
|
||||
st1 {v16.4s}, [x11], #16
|
||||
str s17, [x19], #4
|
||||
cmp x6, #5
|
||||
beq WriteEnd
|
||||
st1 {v18.4s}, [x11]
|
||||
add x11, x11, #20
|
||||
str s19, [x19]
|
||||
add x19, x19, #20
|
||||
st1 {v18.4s}, [x11], #16
|
||||
str s19, [x19], #4
|
||||
cmp x6, #6
|
||||
beq WriteEnd
|
||||
st1 {v20.4s}, [x11]
|
||||
add x11, x11, #20
|
||||
str s21, [x19]
|
||||
add x19, x19, #20
|
||||
st1 {v20.4s}, [x11], #16
|
||||
str s21, [x19], #4
|
||||
cmp x6, #7
|
||||
beq WriteEnd
|
||||
st1 {v22.4s}, [x11]
|
||||
str s23, [x19]
|
||||
st1 {v22.4s}, [x11], #16
|
||||
str s23, [x19], #4
|
||||
b WriteEnd
|
||||
C4Write6:
|
||||
add x19, x11, #16
|
||||
st1 {v8.4s}, [x11]
|
||||
add x11, x11, #24
|
||||
st1 {v9.2s}, [x19]
|
||||
add x19, x19, #24
|
||||
add x19, x11, x8
|
||||
st1 {v8.4s}, [x11], #16
|
||||
st1 {v9.2s}, [x19], #8
|
||||
cmp x6, #1
|
||||
beq WriteEnd
|
||||
st1 {v10.4s}, [x11]
|
||||
add x11, x11, #24
|
||||
st1 {v11.2s}, [x19]
|
||||
add x19, x19, #24
|
||||
st1 {v10.4s}, [x11], #16
|
||||
st1 {v11.2s}, [x19], #8
|
||||
cmp x6, #2
|
||||
beq WriteEnd
|
||||
st1 {v12.4s}, [x11]
|
||||
add x11, x11, #24
|
||||
st1 {v13.2s}, [x19]
|
||||
add x19, x19, #24
|
||||
st1 {v12.4s}, [x11], #16
|
||||
st1 {v13.2s}, [x19], #8
|
||||
cmp x6, #3
|
||||
beq WriteEnd
|
||||
st1 {v14.4s}, [x11]
|
||||
add x11, x11, #24
|
||||
st1 {v15.2s}, [x19]
|
||||
add x19, x19, #24
|
||||
st1 {v14.4s}, [x11], #16
|
||||
st1 {v15.2s}, [x19], #8
|
||||
cmp x6, #4
|
||||
beq WriteEnd
|
||||
st1 {v16.4s}, [x11]
|
||||
add x11, x11, #24
|
||||
st1 {v17.2s}, [x19]
|
||||
add x19, x19, #24
|
||||
st1 {v16.4s}, [x11], #16
|
||||
st1 {v17.2s}, [x19], #8
|
||||
cmp x6, #5
|
||||
beq WriteEnd
|
||||
st1 {v18.4s}, [x11]
|
||||
add x11, x11, #24
|
||||
st1 {v19.2s}, [x19]
|
||||
add x19, x19, #24
|
||||
st1 {v18.4s}, [x11], #16
|
||||
st1 {v19.2s}, [x19], #8
|
||||
cmp x6, #6
|
||||
beq WriteEnd
|
||||
st1 {v20.4s}, [x11]
|
||||
add x11, x11, #24
|
||||
st1 {v21.2s}, [x19]
|
||||
add x19, x19, #24
|
||||
st1 {v20.4s}, [x11], #16
|
||||
st1 {v21.2s}, [x19], #8
|
||||
cmp x6, #7
|
||||
beq WriteEnd
|
||||
st1 {v22.4s}, [x11]
|
||||
st1 {v23.2s}, [x19]
|
||||
st1 {v22.4s}, [x11], #16
|
||||
st1 {v23.2s}, [x19], #8
|
||||
b WriteEnd
|
||||
C4Write7:
|
||||
add x19, x11, #16
|
||||
add x16, x11, #24
|
||||
mov x10, #28
|
||||
st1 {v8.4s}, [x11], x10
|
||||
st1 {v9.2s}, [x19], x10
|
||||
st1 {v9.s}[2], [x16], x10
|
||||
add x19, x11, x8
|
||||
add x16, x19, #8
|
||||
mov x15, #12
|
||||
st1 {v8.4s}, [x11], #16
|
||||
st1 {v9.2s}, [x19], x15
|
||||
st1 {v9.s}[2], [x16], x15
|
||||
cmp x6, #1
|
||||
beq WriteEnd
|
||||
st1 {v10.4s}, [x11], x10
|
||||
st1 {v11.2s}, [x19], x10
|
||||
st1 {v11.s}[2], [x16], x10
|
||||
st1 {v10.4s}, [x11], #16
|
||||
st1 {v11.2s}, [x19], x15
|
||||
st1 {v11.s}[2], [x16], x15
|
||||
cmp x6, #2
|
||||
beq WriteEnd
|
||||
st1 {v12.4s}, [x11], x10
|
||||
st1 {v13.2s}, [x19], x10
|
||||
st1 {v13.s}[2], [x16], x10
|
||||
st1 {v12.4s}, [x11], #16
|
||||
st1 {v13.2s}, [x19], x15
|
||||
st1 {v13.s}[2], [x16], x15
|
||||
cmp x6, #3
|
||||
beq WriteEnd
|
||||
st1 {v14.4s}, [x11], x10
|
||||
st1 {v15.2s}, [x19], x10
|
||||
st1 {v15.s}[2], [x16], x10
|
||||
st1 {v14.4s}, [x11], #16
|
||||
st1 {v15.2s}, [x19], x15
|
||||
st1 {v15.s}[2], [x16], x15
|
||||
cmp x6, #4
|
||||
beq WriteEnd
|
||||
st1 {v16.4s}, [x11], x10
|
||||
st1 {v17.2s}, [x19], x10
|
||||
st1 {v17.s}[2], [x16], x10
|
||||
st1 {v16.4s}, [x11], #16
|
||||
st1 {v17.2s}, [x19], x15
|
||||
st1 {v17.s}[2], [x16], x15
|
||||
cmp x6, #5
|
||||
beq WriteEnd
|
||||
st1 {v18.4s}, [x11], x10
|
||||
st1 {v19.2s}, [x19], x10
|
||||
st1 {v19.s}[2], [x16], x10
|
||||
st1 {v18.4s}, [x11], #16
|
||||
st1 {v19.2s}, [x19], x15
|
||||
st1 {v19.s}[2], [x16], x15
|
||||
cmp x6, #6
|
||||
beq WriteEnd
|
||||
st1 {v20.4s}, [x11], x10
|
||||
st1 {v21.2s}, [x19], x10
|
||||
st1 {v21.s}[2], [x16], x10
|
||||
st1 {v20.4s}, [x11], #16
|
||||
st1 {v21.2s}, [x19], x15
|
||||
st1 {v21.s}[2], [x16], x15
|
||||
cmp x6, #7
|
||||
beq WriteEnd
|
||||
st1 {v22.4s}, [x11], x10
|
||||
st1 {v23.2s}, [x19], x10
|
||||
st1 {v23.s}[2], [x16], x10
|
||||
st1 {v22.4s}, [x11], #16
|
||||
st1 {v23.2s}, [x19], x15
|
||||
st1 {v23.s}[2], [x16], x15
|
||||
b WriteEnd
|
||||
C4Write8:
|
||||
add x19, x11, x8
|
||||
|
|
|
@ -195,13 +195,14 @@ int Convolution1x1CPUKernel::DoConv1x1Hw(int task_id) {
|
|||
MatMulOpt(thread_pack_input, reinterpret_cast<float *>(packed_weight_), cur_output,
|
||||
reinterpret_cast<float *>(bias_data_), matmul_param_->act_type_, matmul_param_->deep_, cur_rows,
|
||||
matmul_param_->col_, matmul_param_->col_, OutType_Nhwc);
|
||||
cur_output += row_tile_ * matmul_param_->col_;
|
||||
} else {
|
||||
MatMulOpt(thread_pack_input, reinterpret_cast<float *>(packed_weight_), cur_output,
|
||||
reinterpret_cast<float *>(bias_data_), matmul_param_->act_type_, matmul_param_->deep_, cur_rows,
|
||||
matmul_param_->col_, matmul_param_->row_, OutType_NC4HW4);
|
||||
cur_output += row_tile_ * MSMIN(matmul_param_->col_, C4NUM);
|
||||
}
|
||||
cur_intput += row_tile_ * matmul_param_->deep_;
|
||||
cur_output += row_tile_ * matmul_param_->col_;
|
||||
}
|
||||
|
||||
return RET_OK;
|
||||
|
|
Loading…
Reference in New Issue