forked from mindspore-Ecosystem/mindspore
!4547 [MS][LITE]deconvolution fp16 post function
Merge pull request !4547 from ling/deconv
This commit is contained in:
commit
31ff088789
|
@ -124,7 +124,7 @@ int DeConvFp16Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
|
|||
}
|
||||
|
||||
int DeConvolutionFp16CPUKernel::DoDeconv(int task_id) {
|
||||
int oc = MSMIN(thread_stride_ * C8NUM, conv_param_->output_channel_ - task_id * thread_stride_ * C8NUM);
|
||||
int oc = MSMIN(thread_stride_, UP_DIV(conv_param_->output_channel_, C8NUM) - task_id * thread_stride_);
|
||||
int oc_res = MSMIN(thread_stride_ * C8NUM, conv_param_->output_channel_ - task_id * thread_stride_ * C8NUM);
|
||||
if (oc <= 0) {
|
||||
return RET_OK;
|
||||
|
|
|
@ -204,6 +204,9 @@ Loop_C1:
|
|||
beq End
|
||||
mov w13, w5
|
||||
ld1 {v16.4s, v17.4s}, [x2], #32
|
||||
mov x25, #4
|
||||
mul x24, x10, x25
|
||||
add x0, x0, x24
|
||||
|
||||
cmp x4, #1
|
||||
beq Loop_C1_1
|
||||
|
|
|
@ -863,7 +863,10 @@ End2:
|
|||
subs w7, w7, #8 // rhs col - 8
|
||||
add x1, x1, x15 // rhs ptr + stride
|
||||
add x3, x3, #16 // bias ptr + stride
|
||||
ldrb w13, [sp, #8]
|
||||
cbz w13, NoDstStep
|
||||
add x2, x2, #16 // dst ptr + stride
|
||||
NoDstStep:
|
||||
bgt L1
|
||||
|
||||
End1:
|
||||
|
|
|
@ -16,21 +16,20 @@
|
|||
|
||||
// v0 ~ v7 value
|
||||
// v16 bias data
|
||||
// x24 x25 weite loop tmp buf
|
||||
// x22 x23 x24 x25 write loop tmp buf
|
||||
// x26 relu6 #6; x27 relu #0
|
||||
// w10 oc8 loop control
|
||||
// w13 hw loop control
|
||||
|
||||
PostFuncBiasReluC8Fp16:
|
||||
movi v26.8h, #6
|
||||
scvtf v26.8h, v26.8h
|
||||
movi v26.8h, #0x46, lsl #8
|
||||
dup v27.8h, wzr
|
||||
mov w10, #0
|
||||
|
||||
Loop_C8:
|
||||
cmp w10, w3
|
||||
beq Loop_C1
|
||||
mov x25, #4
|
||||
mov x25, #2
|
||||
mul x24, x10, x25
|
||||
add x25, x0, x24
|
||||
add w10, w10, #8
|
||||
|
@ -118,6 +117,7 @@ Write_4x8:
|
|||
st1 {v1.8h}, [x25], x6
|
||||
st1 {v2.8h}, [x25], x6
|
||||
st1 {v3.8h}, [x25], x6
|
||||
b Loop_4x8
|
||||
|
||||
Loop_1x8:
|
||||
cmp w7, #2
|
||||
|
@ -159,6 +159,9 @@ Loop_C1:
|
|||
beq End
|
||||
mov w13, w5
|
||||
ld1 {v16.8h}, [x2], #16
|
||||
mov x25, #2
|
||||
mul x24, x10, x25
|
||||
add x22, x0, x24
|
||||
|
||||
cmp x4, #1
|
||||
beq Loop_C1_1
|
||||
|
@ -189,7 +192,7 @@ Loop_C1_1_Relu6:
|
|||
fadd v0.8h, v0.8h, v16.8h
|
||||
fmin v0.8h, v0.8h, v26.8h
|
||||
fmax v0.8h, v0.8h, v27.8h
|
||||
st1 {v1.h}[0], [x0], x6
|
||||
st1 {v0.h}[0], [x22], x6
|
||||
b Loop_C1_1_Relu6
|
||||
Loop_C1_1_Relu:
|
||||
cmp w13, #0
|
||||
|
@ -198,7 +201,7 @@ Loop_C1_1_Relu:
|
|||
ld1 {v0.8h}, [x1], #16
|
||||
fadd v0.8h, v0.8h, v16.8h
|
||||
fmax v0.8h, v0.8h, v27.8h
|
||||
st1 {v1.h}[0], [x0], x6
|
||||
st1 {v0.h}[0], [x22], x6
|
||||
b Loop_C1_1_Relu
|
||||
Loop_C1_1_Write:
|
||||
cmp w13, #0
|
||||
|
@ -206,7 +209,7 @@ Loop_C1_1_Write:
|
|||
sub w13, w13, #1
|
||||
ld1 {v0.8h}, [x1], #16
|
||||
fadd v0.8h, v0.8h, v16.8h
|
||||
st1 {v1.h}[0], [x0], x6
|
||||
st1 {v0.h}[0], [x22], x6
|
||||
b Loop_C1_1_Write
|
||||
|
||||
Loop_C1_2:
|
||||
|
@ -224,8 +227,8 @@ Loop_C1_2_Relu6:
|
|||
fadd v0.8h, v0.8h, v16.8h
|
||||
fmin v0.8h, v0.8h, v26.8h
|
||||
fmax v0.8h, v0.8h, v27.8h
|
||||
st1 {v1.h}[0], [x0], x6
|
||||
st1 {v1.h}[1], [x24], x6
|
||||
st1 {v0.h}[0], [x22], x6
|
||||
st1 {v0.h}[1], [x24], x6
|
||||
b Loop_C1_2_Relu6
|
||||
Loop_C1_2_Relu:
|
||||
cmp w13, #0
|
||||
|
@ -234,8 +237,8 @@ Loop_C1_2_Relu:
|
|||
ld1 {v0.8h}, [x1], #16
|
||||
fadd v0.8h, v0.8h, v16.8h
|
||||
fmax v0.8h, v0.8h, v27.8h
|
||||
st1 {v1.h}[0], [x0], x6
|
||||
st1 {v1.h}[1], [x24], x6
|
||||
st1 {v0.h}[0], [x22], x6
|
||||
st1 {v0.h}[1], [x24], x6
|
||||
b Loop_C1_2_Relu
|
||||
Loop_C1_2_Write:
|
||||
cmp w13, #0
|
||||
|
@ -243,14 +246,14 @@ Loop_C1_2_Write:
|
|||
sub w13, w13, #1
|
||||
ld1 {v0.8h}, [x1], #16
|
||||
fadd v0.8h, v0.8h, v16.8h
|
||||
st1 {v1.h}[0], [x0], x6
|
||||
st1 {v1.h}[1], [x24], x6
|
||||
st1 {v0.h}[0], [x22], x6
|
||||
st1 {v0.h}[1], [x24], x6
|
||||
b Loop_C1_2_Write
|
||||
|
||||
|
||||
Loop_C1_3:
|
||||
add x24, x0, #2
|
||||
add x25, x0, #4
|
||||
add x24, x22, #2
|
||||
add x25, x22, #4
|
||||
cmp w7, #2
|
||||
beq Loop_C1_3_Relu6
|
||||
cmp w7, #1
|
||||
|
@ -264,9 +267,9 @@ Loop_C1_3_Relu6:
|
|||
fadd v0.8h, v0.8h, v16.8h
|
||||
fmin v0.8h, v0.8h, v26.8h
|
||||
fmax v0.8h, v0.8h, v27.8h
|
||||
st1 {v1.h}[0], [x0], x6
|
||||
st1 {v1.h}[1], [x24], x6
|
||||
st1 {v1.h}[2], [x25], x6
|
||||
st1 {v0.h}[0], [x22], x6
|
||||
st1 {v0.h}[1], [x24], x6
|
||||
st1 {v0.h}[2], [x25], x6
|
||||
b Loop_C1_3_Relu6
|
||||
Loop_C1_3_Relu:
|
||||
cmp w13, #0
|
||||
|
@ -275,9 +278,9 @@ Loop_C1_3_Relu:
|
|||
ld1 {v0.8h}, [x1], #16
|
||||
fadd v0.8h, v0.8h, v16.8h
|
||||
fmax v0.8h, v0.8h, v27.8h
|
||||
st1 {v1.h}[0], [x0], x6
|
||||
st1 {v1.h}[1], [x24], x6
|
||||
st1 {v1.h}[2], [x25], x6
|
||||
st1 {v0.h}[0], [x22], x6
|
||||
st1 {v0.h}[1], [x24], x6
|
||||
st1 {v0.h}[2], [x25], x6
|
||||
b Loop_C1_3_Relu
|
||||
Loop_C1_3_Write:
|
||||
cmp w13, #0
|
||||
|
@ -285,9 +288,9 @@ Loop_C1_3_Write:
|
|||
sub w13, w13, #1
|
||||
ld1 {v0.8h}, [x1], #16
|
||||
fadd v0.8h, v0.8h, v16.8h
|
||||
st1 {v1.h}[0], [x0], x6
|
||||
st1 {v1.h}[1], [x24], x6
|
||||
st1 {v1.h}[2], [x25], x6
|
||||
st1 {v0.h}[0], [x22], x6
|
||||
st1 {v0.h}[1], [x24], x6
|
||||
st1 {v0.h}[2], [x25], x6
|
||||
b Loop_C1_3_Write
|
||||
|
||||
Loop_C1_4:
|
||||
|
@ -304,7 +307,7 @@ Loop_C1_4_Relu6:
|
|||
fadd v0.8h, v0.8h, v16.8h
|
||||
fmin v0.8h, v0.8h, v26.8h
|
||||
fmax v0.8h, v0.8h, v27.8h
|
||||
st1 {v0.4h}, [x0], x6
|
||||
st1 {v0.4h}, [x22], x6
|
||||
b Loop_C1_4_Relu6
|
||||
Loop_C1_4_Relu:
|
||||
cmp w13, #0
|
||||
|
@ -313,7 +316,7 @@ Loop_C1_4_Relu:
|
|||
ld1 {v0.8h}, [x1], #16
|
||||
fadd v0.8h, v0.8h, v16.8h
|
||||
fmax v0.8h, v0.8h, v27.8h
|
||||
st1 {v0.4h}, [x0], x6
|
||||
st1 {v0.4h}, [x22], x6
|
||||
b Loop_C1_4_Relu6
|
||||
Loop_C1_4_Write:
|
||||
cmp w13, #0
|
||||
|
@ -321,11 +324,11 @@ Loop_C1_4_Write:
|
|||
sub w13, w13, #1
|
||||
ld1 {v0.8h}, [x1], #16
|
||||
fadd v0.8h, v0.8h, v16.8h
|
||||
st1 {v0.4h}, [x0], x6
|
||||
st1 {v0.4h}, [x22], x6
|
||||
b Loop_C1_4_Write
|
||||
|
||||
Loop_C1_5:
|
||||
add x25, x0, #16
|
||||
add x25, x22, #8
|
||||
cmp w7, #2
|
||||
beq Loop_C1_5_Relu6
|
||||
cmp w7, #1
|
||||
|
@ -339,9 +342,8 @@ Loop_C1_5_Relu6:
|
|||
fadd v0.8h, v0.8h, v16.8h
|
||||
fmin v0.8h, v0.8h, v26.8h
|
||||
fmax v0.8h, v0.8h, v27.8h
|
||||
st1 {v0.4h}, [x0], x6
|
||||
str h1, [x25]
|
||||
add x25, x25, x6
|
||||
st1 {v0.4h}, [x22], x6
|
||||
st1 {v0.h}[4], [x25], x6
|
||||
b Loop_C1_5_Relu6
|
||||
Loop_C1_5_Relu:
|
||||
cmp w13, #0
|
||||
|
@ -350,9 +352,8 @@ Loop_C1_5_Relu:
|
|||
ld1 {v0.8h}, [x1], #16
|
||||
fadd v0.8h, v0.8h, v16.8h
|
||||
fmax v0.8h, v0.8h, v27.8h
|
||||
st1 {v0.4h}, [x0], x6
|
||||
str h1, [x25]
|
||||
add x25, x25, x6
|
||||
st1 {v0.4h}, [x22], x6
|
||||
st1 {v0.h}[4], [x25], x6
|
||||
b Loop_C1_5_Relu
|
||||
Loop_C1_5_Write:
|
||||
cmp w13, #0
|
||||
|
@ -360,14 +361,13 @@ Loop_C1_5_Write:
|
|||
sub w13, w13, #1
|
||||
ld1 {v0.8h}, [x1], #16
|
||||
fadd v0.8h, v0.8h, v16.8h
|
||||
st1 {v0.4h}, [x0], x6
|
||||
str h1, [x25]
|
||||
add x25, x25, x6
|
||||
st1 {v0.4h}, [x22], x6
|
||||
st1 {v0.h}[4], [x25], x6
|
||||
b Loop_C1_5_Write
|
||||
|
||||
Loop_C1_6:
|
||||
add x23, x0, #8
|
||||
add x24, x0, #10
|
||||
add x23, x22, #8
|
||||
add x24, x22, #10
|
||||
cmp w7, #2
|
||||
beq Loop_C1_6_Relu6
|
||||
cmp w7, #1
|
||||
|
@ -381,9 +381,9 @@ Loop_C1_6_Relu6:
|
|||
fadd v0.8h, v0.8h, v16.8h
|
||||
fmin v0.8h, v0.8h, v26.8h
|
||||
fmax v0.8h, v0.8h, v27.8h
|
||||
st1 {v0.4h}, [x0], x6
|
||||
st1 {v1.h}[4], [x23], x6
|
||||
st1 {v1.h}[5], [x24], x6
|
||||
st1 {v0.4h}, [x22], x6
|
||||
st1 {v0.h}[4], [x23], x6
|
||||
st1 {v0.h}[5], [x24], x6
|
||||
b Loop_C1_6_Relu6
|
||||
Loop_C1_6_Relu:
|
||||
cmp w13, #0
|
||||
|
@ -392,9 +392,9 @@ Loop_C1_6_Relu:
|
|||
ld1 {v0.8h}, [x1], #16
|
||||
fadd v0.8h, v0.8h, v16.8h
|
||||
fmax v0.8h, v0.8h, v27.8h
|
||||
st1 {v0.4h}, [x0], x6
|
||||
st1 {v1.h}[4], [x23], x6
|
||||
st1 {v1.h}[5], [x24], x6
|
||||
st1 {v0.4h}, [x22], x6
|
||||
st1 {v0.h}[4], [x23], x6
|
||||
st1 {v0.h}[5], [x24], x6
|
||||
b Loop_C1_6_Relu
|
||||
Loop_C1_6_Write:
|
||||
cmp w13, #0
|
||||
|
@ -402,15 +402,15 @@ Loop_C1_6_Write:
|
|||
sub w13, w13, #1
|
||||
ld1 {v0.8h}, [x1], #16
|
||||
fadd v0.8h, v0.8h, v16.8h
|
||||
st1 {v0.4h}, [x0], x6
|
||||
st1 {v1.h}[4], [x23], x6
|
||||
st1 {v1.h}[5], [x24], x6
|
||||
st1 {v0.4h}, [x22], x6
|
||||
st1 {v0.h}[4], [x23], x6
|
||||
st1 {v0.h}[5], [x24], x6
|
||||
b Loop_C1_6_Write
|
||||
|
||||
Loop_C1_7:
|
||||
add x23, x0, #8
|
||||
add x24, x0, #10
|
||||
add x25, x0, #12
|
||||
add x23, x22, #8
|
||||
add x24, x22, #10
|
||||
add x25, x22, #12
|
||||
cmp w7, #2
|
||||
beq Loop_C1_7_Relu6
|
||||
cmp w7, #1
|
||||
|
@ -424,10 +424,10 @@ Loop_C1_7_Relu6:
|
|||
fadd v0.8h, v0.8h, v16.8h
|
||||
fmin v0.8h, v0.8h, v26.8h
|
||||
fmax v0.8h, v0.8h, v27.8h
|
||||
st1 {v0.4h}, [x0], x6
|
||||
st1 {v1.h}[4], [x23], x6
|
||||
st1 {v1.h}[5], [x24], x6
|
||||
st1 {v1.h}[6], [x25], x6
|
||||
st1 {v0.4h}, [x22], x6
|
||||
st1 {v0.h}[4], [x23], x6
|
||||
st1 {v0.h}[5], [x24], x6
|
||||
st1 {v0.h}[6], [x25], x6
|
||||
b Loop_C1_7_Relu6
|
||||
Loop_C1_7_Relu:
|
||||
cmp w13, #0
|
||||
|
@ -436,10 +436,10 @@ Loop_C1_7_Relu:
|
|||
ld1 {v0.8h}, [x1], #16
|
||||
fadd v0.8h, v0.8h, v16.8h
|
||||
fmax v0.8h, v0.8h, v27.8h
|
||||
st1 {v0.4h}, [x0], x6
|
||||
st1 {v1.h}[4], [x23], x6
|
||||
st1 {v1.h}[5], [x24], x6
|
||||
st1 {v1.h}[6], [x25], x6
|
||||
st1 {v0.4h}, [x22], x6
|
||||
st1 {v0.h}[4], [x23], x6
|
||||
st1 {v0.h}[5], [x24], x6
|
||||
st1 {v0.h}[6], [x25], x6
|
||||
b Loop_C1_7_Relu
|
||||
Loop_C1_7_Write:
|
||||
cmp w13, #0
|
||||
|
@ -447,11 +447,10 @@ Loop_C1_7_Write:
|
|||
sub w13, w13, #1
|
||||
ld1 {v0.8h}, [x1], #16
|
||||
fadd v0.8h, v0.8h, v16.8h
|
||||
fmax v0.8h, v0.8h, v27.8h
|
||||
st1 {v0.4h}, [x0], x6
|
||||
st1 {v1.h}[4], [x23], x6
|
||||
st1 {v1.h}[5], [x24], x6
|
||||
st1 {v1.h}[6], [x25], x6
|
||||
st1 {v0.4h}, [x22], x6
|
||||
st1 {v0.h}[4], [x23], x6
|
||||
st1 {v0.h}[5], [x24], x6
|
||||
st1 {v0.h}[6], [x25], x6
|
||||
b Loop_C1_7_Write
|
||||
|
||||
End:
|
||||
|
|
|
@ -37,16 +37,12 @@ void PostConvFuncCommFp16(float16_t *out_ptr, const float16_t *src_ptr_, const f
|
|||
|
||||
// Applies bias and the optional activation to a C8-blocked fp16 buffer and
// writes the result to out_ptr.
//
// c8_out_ptr      source buffer (read only)
// out_ptr         destination buffer
// bias_ptr        bias values handed through to the post function
// output_channel  total channel count; split below into the part that is a
//                 multiple of C8NUM and the leftover channels
// plane_size      forwarded unchanged
// stride          destination stride in elements; converted to bytes for the
//                 assembly routine
// is_relu         request relu (selector value 1)
// is_relu6        request relu6 (selector value 2); wins over is_relu when
//                 both flags are set
//
// When DEBUG_CODE is defined the generic PostConvFuncCommFp16 path is used
// instead of PostFuncBiasReluC8Fp16.
void PostConvFuncFp16C8(const float16_t *c8_out_ptr, float16_t *out_ptr, const float16_t *bias_ptr,
                        size_t output_channel, size_t plane_size, size_t stride, bool is_relu, bool is_relu6) {
#ifndef DEBUG_CODE
  // Channels beyond the last full group of C8NUM, and the channels before them.
  const size_t tail_channels = output_channel % C8NUM;
  const size_t full_channels = output_channel - tail_channels;

  // The assembly routine takes the destination stride in bytes.
  const size_t stride_bytes = stride * sizeof(float16_t);

  // Activation selector: 0 = none, 1 = relu, 2 = relu6.
  size_t act_selector = 0;
  if (is_relu6) {
    act_selector = 2;
  } else if (is_relu) {
    act_selector = 1;
  }

  PostFuncBiasReluC8Fp16(out_ptr, c8_out_ptr, bias_ptr, full_channels, tail_channels, plane_size, stride_bytes,
                         act_selector);
#else
  PostConvFuncCommFp16(out_ptr, c8_out_ptr, bias_ptr, output_channel, plane_size, stride, is_relu, is_relu6, C8NUM);
#endif
}
|
||||
|
||||
|
|
|
@ -74,7 +74,6 @@ void MatMul16x8(const float16_t *a, const float16_t *b, float16_t *dst, const fl
|
|||
// Fp16 matrix-multiply entry point. Every argument is forwarded unchanged to
// MatmulFp16Neon64; act_type is passed on as a plain int.
//
// a, b        input operands (read only)
// c           destination buffer
// bias        bias values (read only)
// act_type    activation selector, cast to int for the kernel
// depth, row, col, stride, write_nhwc
//             forwarded as-is; their exact meaning is defined by
//             MatmulFp16Neon64, which is not visible here.
void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type,
                int depth, int row, int col, int stride, bool write_nhwc) {
  MatmulFp16Neon64(a, b, c, bias, (int)act_type, depth, row, col, stride, write_nhwc);
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue