!4547 [MS][LITE]deconvolution fp16 post function

Merge pull request !4547 from ling/deconv
This commit is contained in:
mindspore-ci-bot 2020-08-17 15:29:50 +08:00 committed by Gitee
commit 31ff088789
6 changed files with 70 additions and 70 deletions

View File

@ -124,7 +124,7 @@ int DeConvFp16Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
}
int DeConvolutionFp16CPUKernel::DoDeconv(int task_id) {
int oc = MSMIN(thread_stride_ * C8NUM, conv_param_->output_channel_ - task_id * thread_stride_ * C8NUM);
int oc = MSMIN(thread_stride_, UP_DIV(conv_param_->output_channel_, C8NUM) - task_id * thread_stride_);
int oc_res = MSMIN(thread_stride_ * C8NUM, conv_param_->output_channel_ - task_id * thread_stride_ * C8NUM);
if (oc <= 0) {
return RET_OK;

View File

@ -204,6 +204,9 @@ Loop_C1:
beq End
mov w13, w5
ld1 {v16.4s, v17.4s}, [x2], #32
mov x25, #4
mul x24, x10, x25
add x0, x0, x24
cmp x4, #1
beq Loop_C1_1

View File

@ -863,7 +863,10 @@ End2:
subs w7, w7, #8 // rhs col - 8
add x1, x1, x15 // rhs ptr + stride
add x3, x3, #16 // bias ptr + stride
ldrb w13, [sp, #8]
cbz w13, NoDstStep
add x2, x2, #16 // dst ptr + stride
NoDstStep:
bgt L1
End1:

View File

@ -16,21 +16,20 @@
// v0 ~ v7 value
// v16 bias data
// x24 x25 weite loop tmp buf
// x22 x23 x24 x25 write loop tmp buf
// v26 relu6 #6; v27 relu #0
// w10 oc8 loop control
// w13 hw loop control
PostFuncBiasReluC8Fp16:
movi v26.8h, #6
scvtf v26.8h, v26.8h
movi v26.8h, #0x46, lsl #8
dup v27.8h, wzr
mov w10, #0
Loop_C8:
cmp w10, w3
beq Loop_C1
mov x25, #4
mov x25, #2
mul x24, x10, x25
add x25, x0, x24
add w10, w10, #8
@ -118,6 +117,7 @@ Write_4x8:
st1 {v1.8h}, [x25], x6
st1 {v2.8h}, [x25], x6
st1 {v3.8h}, [x25], x6
b Loop_4x8
Loop_1x8:
cmp w7, #2
@ -159,6 +159,9 @@ Loop_C1:
beq End
mov w13, w5
ld1 {v16.8h}, [x2], #16
mov x25, #2
mul x24, x10, x25
add x22, x0, x24
cmp x4, #1
beq Loop_C1_1
@ -189,7 +192,7 @@ Loop_C1_1_Relu6:
fadd v0.8h, v0.8h, v16.8h
fmin v0.8h, v0.8h, v26.8h
fmax v0.8h, v0.8h, v27.8h
st1 {v1.h}[0], [x0], x6
st1 {v0.h}[0], [x22], x6
b Loop_C1_1_Relu6
Loop_C1_1_Relu:
cmp w13, #0
@ -198,7 +201,7 @@ Loop_C1_1_Relu:
ld1 {v0.8h}, [x1], #16
fadd v0.8h, v0.8h, v16.8h
fmax v0.8h, v0.8h, v27.8h
st1 {v1.h}[0], [x0], x6
st1 {v0.h}[0], [x22], x6
b Loop_C1_1_Relu
Loop_C1_1_Write:
cmp w13, #0
@ -206,7 +209,7 @@ Loop_C1_1_Write:
sub w13, w13, #1
ld1 {v0.8h}, [x1], #16
fadd v0.8h, v0.8h, v16.8h
st1 {v1.h}[0], [x0], x6
st1 {v0.h}[0], [x22], x6
b Loop_C1_1_Write
Loop_C1_2:
@ -224,8 +227,8 @@ Loop_C1_2_Relu6:
fadd v0.8h, v0.8h, v16.8h
fmin v0.8h, v0.8h, v26.8h
fmax v0.8h, v0.8h, v27.8h
st1 {v1.h}[0], [x0], x6
st1 {v1.h}[1], [x24], x6
st1 {v0.h}[0], [x22], x6
st1 {v0.h}[1], [x24], x6
b Loop_C1_2_Relu6
Loop_C1_2_Relu:
cmp w13, #0
@ -234,8 +237,8 @@ Loop_C1_2_Relu:
ld1 {v0.8h}, [x1], #16
fadd v0.8h, v0.8h, v16.8h
fmax v0.8h, v0.8h, v27.8h
st1 {v1.h}[0], [x0], x6
st1 {v1.h}[1], [x24], x6
st1 {v0.h}[0], [x22], x6
st1 {v0.h}[1], [x24], x6
b Loop_C1_2_Relu
Loop_C1_2_Write:
cmp w13, #0
@ -243,14 +246,14 @@ Loop_C1_2_Write:
sub w13, w13, #1
ld1 {v0.8h}, [x1], #16
fadd v0.8h, v0.8h, v16.8h
st1 {v1.h}[0], [x0], x6
st1 {v1.h}[1], [x24], x6
st1 {v0.h}[0], [x22], x6
st1 {v0.h}[1], [x24], x6
b Loop_C1_2_Write
Loop_C1_3:
add x24, x0, #2
add x25, x0, #4
add x24, x22, #2
add x25, x22, #4
cmp w7, #2
beq Loop_C1_3_Relu6
cmp w7, #1
@ -264,9 +267,9 @@ Loop_C1_3_Relu6:
fadd v0.8h, v0.8h, v16.8h
fmin v0.8h, v0.8h, v26.8h
fmax v0.8h, v0.8h, v27.8h
st1 {v1.h}[0], [x0], x6
st1 {v1.h}[1], [x24], x6
st1 {v1.h}[2], [x25], x6
st1 {v0.h}[0], [x22], x6
st1 {v0.h}[1], [x24], x6
st1 {v0.h}[2], [x25], x6
b Loop_C1_3_Relu6
Loop_C1_3_Relu:
cmp w13, #0
@ -275,9 +278,9 @@ Loop_C1_3_Relu:
ld1 {v0.8h}, [x1], #16
fadd v0.8h, v0.8h, v16.8h
fmax v0.8h, v0.8h, v27.8h
st1 {v1.h}[0], [x0], x6
st1 {v1.h}[1], [x24], x6
st1 {v1.h}[2], [x25], x6
st1 {v0.h}[0], [x22], x6
st1 {v0.h}[1], [x24], x6
st1 {v0.h}[2], [x25], x6
b Loop_C1_3_Relu
Loop_C1_3_Write:
cmp w13, #0
@ -285,9 +288,9 @@ Loop_C1_3_Write:
sub w13, w13, #1
ld1 {v0.8h}, [x1], #16
fadd v0.8h, v0.8h, v16.8h
st1 {v1.h}[0], [x0], x6
st1 {v1.h}[1], [x24], x6
st1 {v1.h}[2], [x25], x6
st1 {v0.h}[0], [x22], x6
st1 {v0.h}[1], [x24], x6
st1 {v0.h}[2], [x25], x6
b Loop_C1_3_Write
Loop_C1_4:
@ -304,7 +307,7 @@ Loop_C1_4_Relu6:
fadd v0.8h, v0.8h, v16.8h
fmin v0.8h, v0.8h, v26.8h
fmax v0.8h, v0.8h, v27.8h
st1 {v0.4h}, [x0], x6
st1 {v0.4h}, [x22], x6
b Loop_C1_4_Relu6
// relu variant of the 4-lane tail write: add the bias in v16, clamp below with v27,
// store the low four halfwords of v0, then advance the destination by x6.
Loop_C1_4_Relu:
cmp w13, #0
@ -313,7 +316,7 @@ Loop_C1_4_Relu:
ld1 {v0.8h}, [x1], #16
fadd v0.8h, v0.8h, v16.8h
fmax v0.8h, v0.8h, v27.8h
st1 {v0.4h}, [x0], x6
st1 {v0.4h}, [x22], x6
// Loop back to this relu loop. Branching to Loop_C1_4_Relu6 would also apply the
// fmin clamp against v26 to every later iteration.
b Loop_C1_4_Relu
Loop_C1_4_Write:
cmp w13, #0
@ -321,11 +324,11 @@ Loop_C1_4_Write:
sub w13, w13, #1
ld1 {v0.8h}, [x1], #16
fadd v0.8h, v0.8h, v16.8h
st1 {v0.4h}, [x0], x6
st1 {v0.4h}, [x22], x6
b Loop_C1_4_Write
Loop_C1_5:
add x25, x0, #16
add x25, x22, #8
cmp w7, #2
beq Loop_C1_5_Relu6
cmp w7, #1
@ -339,9 +342,8 @@ Loop_C1_5_Relu6:
fadd v0.8h, v0.8h, v16.8h
fmin v0.8h, v0.8h, v26.8h
fmax v0.8h, v0.8h, v27.8h
st1 {v0.4h}, [x0], x6
str h1, [x25]
add x25, x25, x6
st1 {v0.4h}, [x22], x6
st1 {v0.h}[4], [x25], x6
b Loop_C1_5_Relu6
Loop_C1_5_Relu:
cmp w13, #0
@ -350,9 +352,8 @@ Loop_C1_5_Relu:
ld1 {v0.8h}, [x1], #16
fadd v0.8h, v0.8h, v16.8h
fmax v0.8h, v0.8h, v27.8h
st1 {v0.4h}, [x0], x6
str h1, [x25]
add x25, x25, x6
st1 {v0.4h}, [x22], x6
st1 {v0.h}[4], [x25], x6
b Loop_C1_5_Relu
Loop_C1_5_Write:
cmp w13, #0
@ -360,14 +361,13 @@ Loop_C1_5_Write:
sub w13, w13, #1
ld1 {v0.8h}, [x1], #16
fadd v0.8h, v0.8h, v16.8h
st1 {v0.4h}, [x0], x6
str h1, [x25]
add x25, x25, x6
st1 {v0.4h}, [x22], x6
st1 {v0.h}[4], [x25], x6
b Loop_C1_5_Write
Loop_C1_6:
add x23, x0, #8
add x24, x0, #10
add x23, x22, #8
add x24, x22, #10
cmp w7, #2
beq Loop_C1_6_Relu6
cmp w7, #1
@ -381,9 +381,9 @@ Loop_C1_6_Relu6:
fadd v0.8h, v0.8h, v16.8h
fmin v0.8h, v0.8h, v26.8h
fmax v0.8h, v0.8h, v27.8h
st1 {v0.4h}, [x0], x6
st1 {v1.h}[4], [x23], x6
st1 {v1.h}[5], [x24], x6
st1 {v0.4h}, [x22], x6
st1 {v0.h}[4], [x23], x6
st1 {v0.h}[5], [x24], x6
b Loop_C1_6_Relu6
Loop_C1_6_Relu:
cmp w13, #0
@ -392,9 +392,9 @@ Loop_C1_6_Relu:
ld1 {v0.8h}, [x1], #16
fadd v0.8h, v0.8h, v16.8h
fmax v0.8h, v0.8h, v27.8h
st1 {v0.4h}, [x0], x6
st1 {v1.h}[4], [x23], x6
st1 {v1.h}[5], [x24], x6
st1 {v0.4h}, [x22], x6
st1 {v0.h}[4], [x23], x6
st1 {v0.h}[5], [x24], x6
b Loop_C1_6_Relu
Loop_C1_6_Write:
cmp w13, #0
@ -402,15 +402,15 @@ Loop_C1_6_Write:
sub w13, w13, #1
ld1 {v0.8h}, [x1], #16
fadd v0.8h, v0.8h, v16.8h
st1 {v0.4h}, [x0], x6
st1 {v1.h}[4], [x23], x6
st1 {v1.h}[5], [x24], x6
st1 {v0.4h}, [x22], x6
st1 {v0.h}[4], [x23], x6
st1 {v0.h}[5], [x24], x6
b Loop_C1_6_Write
Loop_C1_7:
add x23, x0, #8
add x24, x0, #10
add x25, x0, #12
add x23, x22, #8
add x24, x22, #10
add x25, x22, #12
cmp w7, #2
beq Loop_C1_7_Relu6
cmp w7, #1
@ -424,10 +424,10 @@ Loop_C1_7_Relu6:
fadd v0.8h, v0.8h, v16.8h
fmin v0.8h, v0.8h, v26.8h
fmax v0.8h, v0.8h, v27.8h
st1 {v0.4h}, [x0], x6
st1 {v1.h}[4], [x23], x6
st1 {v1.h}[5], [x24], x6
st1 {v1.h}[6], [x25], x6
st1 {v0.4h}, [x22], x6
st1 {v0.h}[4], [x23], x6
st1 {v0.h}[5], [x24], x6
st1 {v0.h}[6], [x25], x6
b Loop_C1_7_Relu6
Loop_C1_7_Relu:
cmp w13, #0
@ -436,10 +436,10 @@ Loop_C1_7_Relu:
ld1 {v0.8h}, [x1], #16
fadd v0.8h, v0.8h, v16.8h
fmax v0.8h, v0.8h, v27.8h
st1 {v0.4h}, [x0], x6
st1 {v1.h}[4], [x23], x6
st1 {v1.h}[5], [x24], x6
st1 {v1.h}[6], [x25], x6
st1 {v0.4h}, [x22], x6
st1 {v0.h}[4], [x23], x6
st1 {v0.h}[5], [x24], x6
st1 {v0.h}[6], [x25], x6
b Loop_C1_7_Relu
Loop_C1_7_Write:
cmp w13, #0
@ -447,11 +447,10 @@ Loop_C1_7_Write:
sub w13, w13, #1
ld1 {v0.8h}, [x1], #16
fadd v0.8h, v0.8h, v16.8h
fmax v0.8h, v0.8h, v27.8h
st1 {v0.4h}, [x0], x6
st1 {v1.h}[4], [x23], x6
st1 {v1.h}[5], [x24], x6
st1 {v1.h}[6], [x25], x6
st1 {v0.4h}, [x22], x6
st1 {v0.h}[4], [x23], x6
st1 {v0.h}[5], [x24], x6
st1 {v0.h}[6], [x25], x6
b Loop_C1_7_Write
End:

View File

@ -37,16 +37,12 @@ void PostConvFuncCommFp16(float16_t *out_ptr, const float16_t *src_ptr_, const f
// Post-processes c8_out_ptr into out_ptr by delegating to PostFuncBiasReluC8Fp16
// (or, when DEBUG_CODE is defined, to PostConvFuncCommFp16).
//   output_channel is split into a multiple-of-C8NUM part and a remainder,
//   stride is converted from elements to bytes,
//   and the activation is encoded as 0 = none, 1 = relu, 2 = relu6 (relu6 wins if both are set).
void PostConvFuncFp16C8(const float16_t *c8_out_ptr, float16_t *out_ptr, const float16_t *bias_ptr,
                        size_t output_channel, size_t plane_size, size_t stride, bool is_relu, bool is_relu6) {
#ifdef DEBUG_CODE
  PostConvFuncCommFp16(out_ptr, c8_out_ptr, bias_ptr, output_channel, plane_size, stride, is_relu, is_relu6, C8NUM);
#else
  // Activation selector handed to the assembly routine.
  size_t act_kind = 0;
  if (is_relu6) {
    act_kind = 2;
  } else if (is_relu) {
    act_kind = 1;
  }

  // Channels left over after the last full group of C8NUM, and the channels before them.
  const size_t tail_channels = output_channel % C8NUM;
  const size_t full_channels = output_channel - tail_channels;

  // The assembly routine takes the destination stride in bytes.
  const size_t stride_bytes = stride * sizeof(float16_t);

  PostFuncBiasReluC8Fp16(out_ptr, c8_out_ptr, bias_ptr, full_channels, tail_channels, plane_size, stride_bytes,
                         act_kind);
#endif
  return;
}

View File

@ -74,7 +74,6 @@ void MatMul16x8(const float16_t *a, const float16_t *b, float16_t *dst, const fl
// Thin wrapper: converts act_type to int and passes every argument through to MatmulFp16Neon64.
void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type,
                int depth, int row, int col, int stride, bool write_nhwc) {
  const int act_code = (int)act_type;
  MatmulFp16Neon64(a, b, c, bias, act_code, depth, row, col, stride, write_nhwc);
}