diff --git a/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S b/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S
index 7ecb6ad3058..b5a2ec2e4cb 100644
--- a/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S
+++ b/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S
@@ -47,7 +47,7 @@
 /////////////////////////////////////////////////////////////////////////////////
 //
 // void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth
-//                        int row, int col, size_t stride, size_t writeNhwc, size_t writeC4)
+//                        int row, int col, size_t stride, size_t writeNhwc, size_t writeWino)
 // x0: a
 // x1: b
 // x2: c
@@ -64,11 +64,20 @@ MatmulFloatNeon64Opt:
     st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
     st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64

+    ldr x9, [sp, #8]
+    ldr x14, [sp, #16]
+
     mov w18, #32 // sizeof(float) * 8
     mul w15, w5, w18 // block stride of lhs/rhs: sizeof(float) * 8 * depth
-    mov x11, x3 // bias flag
    mov x18, #4
     ldr x17, [sp]
+    cbz x14, NoWinoSteps
+    mul x8, x7, x17
+    mov x11, #8
+    mul x11, x11, x17
+    mul x8, x8, x18
+    mul x11, x11, x18
+NoWinoSteps:
     mul x17, x17, x18

 L1:
@@ -79,7 +88,6 @@ L1:
 L2:
     mov x16, x1 // reload rhs ptr
     mov w13, w5 // reload depth
-    mov x14, x3 // reload bias ptr
     dup v8.4s, wzr
     dup v9.4s, wzr
     dup v10.4s, wzr
@@ -150,7 +158,7 @@ Loop:
     fmla v11.4s, v4.4s, v0.s[1]
     fmla v13.4s, v4.4s, v0.s[2]
     fmla v15.4s, v4.4s, v0.s[3]
-
+
     subs w13, w13, #1
     bgt Loop

@@ -173,9 +181,10 @@ LoopEnd:
     fmla v31.4s, v4.4s, v2.s[3]

 Bias:
-    cbz x11, Activation
-    ld1 {v0.4s}, [x14], #16
-    ld1 {v1.4s}, [x14], #16
+    cbz x3, Activation
+    ld1 {v0.4s}, [x3], #16
+    ld1 {v1.4s}, [x3]
+    sub x3, x3, #16
     fadd v8.4s, v8.4s, v0.4s
     fadd v9.4s, v9.4s, v1.4s
     fadd v10.4s, v10.4s, v0.4s
@@ -265,10 +274,8 @@ Relu:
     fmax v31.4s, v31.4s, v3.4s

 Write:
-    ldr w8, [sp, #8]
-    cbz w8, WriteC8
-    ldr w8, [sp, #16]
-    cbnz w8, WriteC4
+    cbnz x14, WriteWino
+    cbz x9, WriteC8
     cmp w7, #1
     beq Write1
     cmp w7, #2
@@ -721,39 +728,26 @@ Write7:
     st1 {v31.s}[2], [x16], x17
     b WriteEnd
 WriteC8:
-    st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x2], #64
-    st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x2], #64
-    st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
-    st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64
-    st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x2], #64
-    st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x2], #64
+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x2], #64
+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x2], #64
+    st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x2], #64
+    st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x2], #64
+    st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x2], #64
+    st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x2], #64
     b WriteEnd
-WriteC4:
-    st1 {v8.8h}, [x2], #16
-    st1 {v10.8h}, [x2], #16
-    st1 {v12.8h}, [x2], #16
-    st1 {v14.8h}, [x2], #16
-    st1 {v16.8h}, [x2], #16
-    st1 {v18.8h}, [x2], #16
-    st1 {v20.8h}, [x2], #16
-    st1 {v22.8h}, [x2], #16
-    st1 {v24.8h}, [x2], #16
-    st1 {v26.8h}, [x2], #16
-    st1 {v28.8h}, [x2], #16
-    st1 {v30.8h}, [x2], #16
-    add x18, x2, x17
-    st1 {v9.8h}, [x18], #16
-    st1 {v11.8h}, [x18], #16
-    st1 {v13.8h}, [x18], #16
-    st1 {v15.8h}, [x18], #16
-    st1 {v17.8h}, [x18], #16
-    st1 {v19.8h}, [x18], #16
-    st1 {v21.8h}, [x18], #16
-    st1 {v23.8h}, [x18], #16
-    st1 {v25.8h}, [x18], #16
-    st1 {v27.8h}, [x18], #16
-    st1 {v29.8h}, [x18], #16
-    st1 {v31.8h}, [x18], #16
+WriteWino:
+    st1 {v8.4s, v9.4s}, [x18], x8
+    st1 {v10.4s, v11.4s}, [x18], x8
+    st1 {v12.4s, v13.4s}, [x18], x8
+    st1 {v14.4s, v15.4s}, [x18], x8
+    st1 {v16.4s, v17.4s}, [x18], x8
+    st1 {v18.4s, v19.4s}, [x18], x8
+    st1 {v20.4s, v21.4s}, [x18], x8
+    st1 {v22.4s, v23.4s}, [x18], x8
+    st1 {v24.4s, v25.4s}, [x18], x8
+    st1 {v26.4s, v27.4s}, [x18], x8
+    st1 {v28.4s, v29.4s}, [x18], x8
+    st1 {v30.4s, v31.4s}, [x18], x8
     b WriteEnd
 Write8:
     st1 {v8.4s, v9.4s}, [x18], x17
@@ -798,15 +792,15 @@ WriteEnd:

 End2:
     subs w7, w7, #8 // rhs col - 8
     add x1, x1, x15 // rhs ptr + stride
+    cbz x3, NoBiasStep
     add x3, x3, #32 // bias ptr + stride
-    ldr w8, [sp, #8]
-    cbz w8, NoDstStep
-    ldr w8, [sp, #16]
-    cbnz w8, C4DstStep
+NoBiasStep:
+    cbnz x14, WinoDstStep
+    cbz x9, NoDstStep
     add x2, x2, #32 // dst ptr + stride
     b NoDstStep
-C4DstStep:
-    add x2, x2, x17
+WinoDstStep:
+    add x2, x2, x11
 NoDstStep:
     bgt L1
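The new WriteWino path stores the 12x8 fp32 tile directly into the Winograd GEMM output layout: x8 holds the per-row destination step (col * stride floats, computed once under the x14 guard before NoWinoSteps) and x11 the per-column-block step (8 * stride floats) applied in WinoDstStep. A minimal C model of that store pattern, with the wrapper name invented here for illustration:

#include <stddef.h>

/* Illustrative model of the WriteWino stores above: acc is the 12x8 tile
 * kept in v8..v31, col/stride match the x7/x17 arguments, dst is x18 at
 * tile start. Each tile row lands col * stride floats after the previous
 * one (x8 in bytes is col * stride * sizeof(float)). */
static void WriteWinoModel(float *dst, const float acc[12][8], int col, size_t stride) {
  size_t row_step = (size_t)col * stride;
  for (int r = 0; r < 12; ++r) {
    for (int i = 0; i < 8; ++i) {
      dst[r * row_step + i] = acc[r][i];
    }
  }
  /* the caller then advances dst by 8 * stride floats (x11) per column block */
}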
diff --git a/mindspore/lite/nnacl/conv_parameter.h b/mindspore/lite/nnacl/conv_parameter.h
index 6a7f3c5a85a..ef01f73ab2a 100644
--- a/mindspore/lite/nnacl/conv_parameter.h
+++ b/mindspore/lite/nnacl/conv_parameter.h
@@ -32,8 +32,6 @@ typedef struct ConvParameter {
   int stride_w_;
   int dilation_h_;
   int dilation_w_;
-  int pad_h_;
-  int pad_w_;
   int pad_u_;
   int pad_d_;
   int pad_l_;
@@ -51,8 +49,7 @@ typedef struct ConvParameter {
   int thread_num_;
   int input_unit_;
   int output_unit_;
-  bool is_relu_;
-  bool is_relu6_;
+  ActType act_type_;
 } ConvParameter;

 typedef struct SlidingWindowParam {
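ConvParameter now carries only the four directional pads, and every read site of the old symmetric fields is rewritten below to use the up/left values. A hypothetical migration helper, not part of this patch, showing the mapping:

/* Hypothetical helper (illustration only): populate the directional pads
 * from the symmetric pad_h_/pad_w_ values this patch removes. */
void SetConvPadFromSymmetric(ConvParameter *param, int pad_h, int pad_w) {
  param->pad_u_ = pad_h; /* former pad_h_ read sites now use pad_u_ */
  param->pad_d_ = pad_h;
  param->pad_l_ = pad_w; /* former pad_w_ read sites now use pad_l_ */
  param->pad_r_ = pad_w;
}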
diff --git a/mindspore/lite/nnacl/fp16/conv_depthwise_fp16.c b/mindspore/lite/nnacl/fp16/conv_depthwise_fp16.c
index e5870f8f4b4..49777ad0138 100644
--- a/mindspore/lite/nnacl/fp16/conv_depthwise_fp16.c
+++ b/mindspore/lite/nnacl/fp16/conv_depthwise_fp16.c
@@ -53,16 +53,18 @@ void DepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float1
 void DepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
                          int top, int bottom, int left, int right, const ConvParameter *conv_param,
                          const SlidingWindowParam *sliding) {
+  bool relu = conv_param->act_type_ == ActType_Relu;
+  bool relu6 = conv_param->act_type_ == ActType_Relu6;
   float16_t *dst_h = dst + top * sliding->out_h_step_;
   for (int oh = top; oh < bottom; oh++) {
-    int ih = oh * conv_param->stride_h_ - conv_param->pad_h_;
+    int ih = oh * conv_param->stride_h_ - conv_param->pad_u_;
     int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_));
     int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_));
     const float16_t *src_h = src + ih * sliding->in_h_step_;

     float16_t *dst_kernel = dst_h + left * sliding->block_channel_;
     for (int ow = left; ow < right; ow++) {
-      int iw = ow * conv_param->stride_w_ - conv_param->pad_w_;
+      int iw = ow * conv_param->stride_w_ - conv_param->pad_l_;
       int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_));
       int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_));
       const float16_t *src_w = src_h + iw * sliding->block_channel_;
@@ -72,11 +74,10 @@ void DepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float16_t *
 #ifdef ENABLE_ARM64
       ConvDwFp16Border(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw,
                        sliding->in_kh_step_ * sizeof(float16_t), sliding->in_kw_step_ * sizeof(float16_t),
-                       conv_param->kernel_w_ * C8NUM * sizeof(float16_t), conv_param->is_relu_, conv_param->is_relu6_);
+                       conv_param->kernel_w_ * C8NUM * sizeof(float16_t), relu, relu6);
 #else
       DepthwiseBorderPixelFp16(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw,
-                               sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_ * C8NUM,
-                               conv_param->is_relu_, conv_param->is_relu6_);
+                               sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_ * C8NUM, relu, relu6);
 #endif
       dst_kernel += sliding->block_channel_;
     } // width loop
@@ -139,6 +140,8 @@ void DepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t *
 void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data,
                   const float16_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding,
                   int task_id) {
+  bool relu = conv_param->act_type_ == ActType_Relu;
+  bool relu6 = conv_param->act_type_ == ActType_Relu6;
   const float16_t *src = input_data;
   float16_t *dst = output_data;
   for (int b = 0; b < conv_param->output_batch_; b++) {
@@ -157,8 +160,8 @@ void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const flo
                          conv_param->output_w_, conv_param, sliding);

       if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) {
-        int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_;
-        int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_;
+        int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_u_;
+        int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_;
         const float16_t *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_;
         float16_t *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;
 #ifdef ENABLE_ARM64
@@ -166,12 +169,12 @@ void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const flo
                          conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_ * sizeof(float16_t),
                          sliding->block_channel_ * sizeof(float16_t), sliding->in_sh_step_ * sizeof(float16_t),
                          sliding->in_sw_step_ * sizeof(float16_t), sliding->in_kh_step_ * sizeof(float16_t),
-                         sliding->in_kw_step_ * sizeof(float16_t), conv_param->is_relu_, conv_param->is_relu6_);
+                         sliding->in_kw_step_ * sizeof(float16_t), relu, relu6);
 #else
         DepthwiseCenterFp16(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_,
                             sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_,
                             sliding->out_h_step_, sliding->block_channel_, sliding->in_sh_step_, sliding->in_sw_step_,
-                            sliding->in_kh_step_, sliding->in_kw_step_, conv_param->is_relu_, conv_param->is_relu6_);
+                            sliding->in_kh_step_, sliding->in_kw_step_, relu, relu6);
 #endif
       }
     } // output C8 loop
@@ -210,14 +213,14 @@ void DeconvDepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float
                                const SlidingWindowParam *sliding) {
   const float16_t *src_h = src + top * sliding->out_h_step_;
   for (int ih = top; ih < bottom; ih++) {
-    int oh = ih * conv_param->stride_h_ - conv_param->pad_h_;
+    int oh = ih * conv_param->stride_h_ - conv_param->pad_u_;
     int start_kh = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_));
     int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_));
     float16_t *dst_h = dst + oh * sliding->in_h_step_;

     const float16_t *src_kernel = src_h + left * sliding->block_channel_;
     for (int iw = left; iw < right; iw++) {
-      int ow = iw * conv_param->stride_w_ - conv_param->pad_w_;
+      int ow = iw * conv_param->stride_w_ - conv_param->pad_l_;
       int start_kw = MSMAX(0, UP_DIV(-ow, conv_param->dilation_w_));
       int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->output_w_ - ow, conv_param->dilation_w_));
       float16_t *dst_w = dst_h + ow * sliding->block_channel_;
@@ -282,12 +285,14 @@ void DeconvDepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float
 void DeconvDepthwisePostFuncFp16(float16_t *dst, const float16_t *bias, int block_channel,
                                  const ConvParameter *conv_param) {
+  bool relu = conv_param->act_type_ == ActType_Relu;
+  bool relu6 = conv_param->act_type_ == ActType_Relu6;
   float16_t *dst_k = dst;
   for (int k = 0; k < conv_param->output_h_ * conv_param->output_w_; k++) {
     for (int c = 0; c < C8NUM; c++) {
       dst_k[c] += bias[c];
-      dst_k[c] = (conv_param->is_relu_) ? (MSMAX(0, dst_k[c])) : (dst_k[c]);
-      dst_k[c] = (conv_param->is_relu6_) ? (MSMIN(6, MSMAX(0, dst_k[c]))) : (dst_k[c]);
+      dst_k[c] = (relu) ? (MSMAX(0, dst_k[c])) : (dst_k[c]);
+      dst_k[c] = (relu6) ? (MSMIN(6, MSMAX(0, dst_k[c]))) : (dst_k[c]);
     }
     dst_k += block_channel;
   }
@@ -315,8 +320,8 @@ void DeconvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const f
                               conv_param->input_w_, conv_param, sliding);

       if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) {
-        int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_;
-        int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_;
+        int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_u_;
+        int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_;
         float16_t *out_t = dst_data + oh_h_start * sliding->in_h_step_ + oh_w_start * sliding->block_channel_;
         const float16_t *in_t =
           src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;
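Each kernel now derives the two legacy booleans from act_type_ once at entry, so the inner loops and the assembly interfaces are untouched. The clamp semantics those flags encode, written out as a sketch (fp32 shown for brevity; MSMAX/MSMIN and ActType are the nnacl/op_base.h definitions):

/* Sketch of the activation semantics the relu/relu6 flags stand for. */
static inline float ActClamp(float v, ActType type) {
  if (type == ActType_Relu) return MSMAX(0, v);
  if (type == ActType_Relu6) return MSMIN(6, MSMAX(0, v));
  return v; /* ActType_No */
}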
diff --git a/mindspore/lite/nnacl/fp16/conv_fp16.c b/mindspore/lite/nnacl/fp16/conv_fp16.c
index 3a93351aed8..d00069834e0 100644
--- a/mindspore/lite/nnacl/fp16/conv_fp16.c
+++ b/mindspore/lite/nnacl/fp16/conv_fp16.c
@@ -173,16 +173,18 @@ void SWBorderPixel(float16_t *dst, const float16_t *src, const float16_t *weight
 void SWBorderFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, int top,
                   int bottom, int left, int right, const ConvParameter *conv_param,
                   const SlidingWindowParam *sliding) {
+  bool relu = conv_param->act_type_ == ActType_Relu;
+  bool relu6 = conv_param->act_type_ == ActType_Relu6;
   float16_t *dst_h = dst + top * sliding->out_h_step_;
   for (int oh = top; oh < bottom; oh++) {
-    int ih = oh * conv_param->stride_h_ - conv_param->pad_h_;
+    int ih = oh * conv_param->stride_h_ - conv_param->pad_u_;
     int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_));
     int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_));
     const float16_t *src_h = src + ih * sliding->in_h_step_;

     float16_t *dst_kernel = dst_h + left * sliding->block_channel_;
     for (int ow = left; ow < right; ow++) {
-      int iw = ow * conv_param->stride_w_ - conv_param->pad_w_;
+      int iw = ow * conv_param->stride_w_ - conv_param->pad_l_;
       int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_));
       int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_));
       const float16_t *src_w = src_h + iw * sliding->ic4_channel_;
@@ -192,7 +194,7 @@ void SWBorderFp16(float16_t *dst, const float16_t *src, const float16_t *weight,

       SWBorderPixel(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw,
                     sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_h_, conv_param->kernel_w_,
-                    sliding->ic4_channel_, conv_param->is_relu_, conv_param->is_relu6_);
+                    sliding->ic4_channel_, relu, relu6);

       dst_kernel += sliding->block_channel_;
     } // width loop
@@ -273,6 +275,8 @@ void SWCenterFp16(float16_t *dst, const float16_t *src, const float16_t *weight,
 void ConvSWFp16(const float16_t *input_data, const float16_t *packed_weight, const float16_t *bias_data,
                 float16_t *tmp_out_block, float16_t *output_data, int task_id, ConvParameter *conv_param,
                 SlidingWindowParam *slidingWindow_param) {
+  bool relu = conv_param->act_type_ == ActType_Relu;
+  bool relu6 = conv_param->act_type_ == ActType_Relu6;
   int oc4_res = conv_param->output_channel_ % C4NUM;
   const float16_t *src = input_data;
   float16_t *dst;
@@ -299,8 +303,8 @@ void ConvSWFp16(const float16_t *input_data, const float16_t *packed_weight, con

       if (slidingWindow_param->right_ > slidingWindow_param->left_ &&
           slidingWindow_param->bottom_ > slidingWindow_param->top_) {
-        int in_h_start = slidingWindow_param->top_ * conv_param->stride_h_ - conv_param->pad_h_;
-        int in_w_start = slidingWindow_param->left_ * conv_param->stride_w_ - conv_param->pad_w_;
+        int in_h_start = slidingWindow_param->top_ * conv_param->stride_h_ - conv_param->pad_u_;
+        int in_w_start = slidingWindow_param->left_ * conv_param->stride_w_ - conv_param->pad_l_;
         const float16_t *in_t = src_data + in_h_start * slidingWindow_param->in_h_step_ +
                                 in_w_start * slidingWindow_param->ic4_channel_;
         float16_t *out_t = dst_data + slidingWindow_param->top_ * slidingWindow_param->out_h_step_ +
@@ -310,7 +314,7 @@ void ConvSWFp16(const float16_t *input_data, const float16_t *packed_weight, con
                      conv_param->kernel_w_, slidingWindow_param->out_h_step_, slidingWindow_param->block_channel_,
                      slidingWindow_param->ic4_channel_, slidingWindow_param->in_sh_step_,
                      slidingWindow_param->in_sw_step_, slidingWindow_param->in_kh_step_,
-                     slidingWindow_param->in_kw_step_, conv_param->is_relu_, conv_param->is_relu6_);
+                     slidingWindow_param->in_kw_step_, relu, relu6);
       }
     } // output C4 loop
     src += slidingWindow_param->in_step_;
@@ -330,8 +334,8 @@ void ConvFp16(float16_t *input_data, float16_t *packed_input, float16_t *packed_
   int out_h = conv_param->output_h_;
   int out_w = conv_param->output_w_;
   int out_channel = conv_param->output_channel_;
-  bool relu = conv_param->is_relu_;
-  bool relu6 = conv_param->is_relu6_;
+  bool relu = conv_param->act_type_ == ActType_Relu;
+  bool relu6 = conv_param->act_type_ == ActType_Relu6;
   int thread_count = conv_param->thread_num_;
   const int tile_n = 16;
   int output_count = out_h * out_w;
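The border handling in these kernels clips each window with start_k = MSMAX(0, UP_DIV(-i, dilation)). Worked through once: with stride 2, pad_u_ = 3 and dilation_h_ = 2, the first output row gives ih = -3, and UP_DIV(3, 2) = 2 is the first kernel tap whose input row (-3 + 2*2 = 1) is in bounds. The same rule as a standalone sketch:

/* Smallest kernel tap k with i0 + k * dilation >= 0, as used by the border
 * clipping in this patch (UP_DIV/MSMAX as defined in nnacl/op_base.h). */
static int FirstValidTap(int i0, int dilation) {
  return MSMAX(0, UP_DIV(-i0, dilation));
}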
diff --git a/mindspore/lite/nnacl/fp16/deconv_fp16.c b/mindspore/lite/nnacl/fp16/deconv_fp16.c
index 2675ecf16ce..476c33f4572 100644
--- a/mindspore/lite/nnacl/fp16/deconv_fp16.c
+++ b/mindspore/lite/nnacl/fp16/deconv_fp16.c
@@ -73,8 +73,8 @@ int DeConvPostFp16(const float16_t *src, float16_t *tmp, const float16_t *bias,

   for (int ih = 0; ih < conv_param->input_h_; ih++) {
     for (int iw = 0; iw < conv_param->input_w_; iw++) {
-      int oh = ih * conv_param->stride_h_ - conv_param->pad_h_;
-      int ow = iw * conv_param->stride_w_ - conv_param->pad_w_;
+      int oh = ih * conv_param->stride_h_ - conv_param->pad_u_;
+      int ow = iw * conv_param->stride_w_ - conv_param->pad_l_;

       int kh_start = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_));
       int kh_end = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_));
@@ -112,7 +112,7 @@ int DeConvPostFp16(const float16_t *src, float16_t *tmp, const float16_t *bias,
     } /*ih*/
   }   /*oc8*/

-  PostConvFuncFp16C8(tmp, dst, bias, output_channel, output_plane, conv_param->output_channel_, conv_param->is_relu_,
-                     conv_param->is_relu6_);
+  PostConvFuncFp16C8(tmp, dst, bias, output_channel, output_plane, conv_param->output_channel_,
+                     conv_param->act_type_ == ActType_Relu, conv_param->act_type_ == ActType_Relu6);
   return NNACL_OK;
 }
diff --git a/mindspore/lite/nnacl/fp16/pack_fp16.c b/mindspore/lite/nnacl/fp16/pack_fp16.c
index e4d2f863f2a..b5ca307daea 100644
--- a/mindspore/lite/nnacl/fp16/pack_fp16.c
+++ b/mindspore/lite/nnacl/fp16/pack_fp16.c
@@ -21,14 +21,14 @@
 void Conv1x1InputPackFp16(const float16_t *src, float16_t *dst, ConvParameter *conv_param) {
   /* support nhwc */
   for (int dst_h = 0; dst_h < conv_param->output_h_; dst_h++) {
-    int src_h = dst_h * conv_param->stride_h_ - conv_param->pad_h_;
+    int src_h = dst_h * conv_param->stride_h_ - conv_param->pad_u_;
     if (src_h < 0 || src_h >= conv_param->input_h_) {
       continue;
     }
     const float16_t *src_h_ptr = src + src_h * conv_param->input_w_ * conv_param->input_channel_;
     float16_t *dst_h_ptr = dst + dst_h * conv_param->output_w_ * conv_param->input_channel_;
     for (int dst_w = 0; dst_w < conv_param->output_w_; dst_w++) {
-      int src_w = dst_w * conv_param->stride_w_ - conv_param->pad_w_;
+      int src_w = dst_w * conv_param->stride_w_ - conv_param->pad_l_;
       if (src_w < 0 || src_w >= conv_param->input_w_) {
         continue;
       }
@@ -46,8 +46,8 @@ void Im2ColPackUnitFp16(float16_t *input_data, ConvParameter *conv_param, float1
   int kernel_w = conv_param->kernel_w_;
   int stride_h = conv_param->stride_h_;
   int stride_w = conv_param->stride_w_;
-  int pad_h = conv_param->pad_h_;
-  int pad_w = conv_param->pad_w_;
+  int pad_h = conv_param->pad_u_;
+  int pad_w = conv_param->pad_l_;
   int dilation_h = conv_param->dilation_h_;
   int dilation_w = conv_param->dilation_w_;
   int in_channel = conv_param->input_channel_;
diff --git a/mindspore/lite/nnacl/fp16/winograd_transform_fp16.c b/mindspore/lite/nnacl/fp16/winograd_transform_fp16.c
index 38f402c2747..d683f07ab0e 100644
--- a/mindspore/lite/nnacl/fp16/winograd_transform_fp16.c
+++ b/mindspore/lite/nnacl/fp16/winograd_transform_fp16.c
@@ -230,8 +230,8 @@ void Conv3x3Fp16InputTransform(const float16_t *input_data, float16_t *trans_inp
   int input_channel = conv_param->input_channel_;
   int input_width = conv_param->input_w_;
   int input_height = conv_param->input_h_;
-  int pad_w = conv_param->pad_w_;
-  int pad_h = conv_param->pad_h_;
+  int pad_w = conv_param->pad_l_;
+  int pad_h = conv_param->pad_u_;
   int ic8 = UP_DIV(input_channel, C8NUM);
   if (out_w_block == 0) {
     return;
@@ -576,8 +576,8 @@ void WinogradInputTransformFp16(const float16_t *input_data, float16_t *trans_in
   int output_unit = conv_param->output_unit_;
   int in_channel = conv_param->input_channel_;
   int ic8 = UP_DIV(in_channel, C8NUM);
-  int pad_h = conv_param->pad_h_;
-  int pad_w = conv_param->pad_w_;
+  int pad_h = conv_param->pad_u_;
+  int pad_w = conv_param->pad_l_;
   int input_h = conv_param->input_h_;
   int input_w = conv_param->input_w_;
   if (out_w_block_num == 0) {
diff --git a/mindspore/lite/nnacl/fp32/conv.c b/mindspore/lite/nnacl/fp32/conv.c
index c22a55a29e7..ec5fdad0bb0 100644
--- a/mindspore/lite/nnacl/fp32/conv.c
+++ b/mindspore/lite/nnacl/fp32/conv.c
@@ -18,6 +18,7 @@
 #include <string.h>
 #include "nnacl/fp32/common_func.h"
 #include "nnacl/winograd_transform.h"
+#include "nnacl/fp32/matmul.h"

 void SWBorderPixel(float *dst, const float *src, const float *weight, const float *bias, int height, int width,
                    int in_kh_step, int in_kw_step, int kernel_h, int kernel_w, int ic4, bool is_relu, bool is_relu6) {
@@ -57,16 +58,18 @@ void SWBorderPixel(float *dst, const float *src, const float *weight, const floa
 void SWBorder(float *dst, const float *src, const float *weight, const float *bias, int top, int bottom, int left,
               int right, const ConvParameter *conv_param, const SlidingWindowParam *sliding) {
   int ic4 = sliding->ic4_channel_ / C4NUM;
+  bool relu = conv_param->act_type_ == ActType_Relu;
+  bool relu6 = conv_param->act_type_ == ActType_Relu6;
   float *dst_h = dst + top * sliding->out_h_step_;
   for (int oh = top; oh < bottom; oh++) {
-    int ih = oh * conv_param->stride_h_ - conv_param->pad_h_;
+    int ih = oh * conv_param->stride_h_ - conv_param->pad_u_;
     int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_));
     int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_));
     const float *src_h = src + ih * sliding->in_h_step_;

     float *dst_kernel = dst_h + left * sliding->block_channel_;
     for (int ow = left; ow < right; ow++) {
-      int iw = ow * conv_param->stride_w_ - conv_param->pad_w_;
+      int iw = ow * conv_param->stride_w_ - conv_param->pad_l_;
       int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_));
       int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_));
       const float *src_w = src_h + iw * sliding->ic4_channel_;
@@ -75,8 +78,8 @@ void SWBorder(float *dst, const float *src, const float *weight, const float *bi
       const float *weight_kernel = weight + (start_kh * conv_param->kernel_w_ + start_kw) * sliding->ic4_channel_;

       SWBorderPixel(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw,
-                    sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_h_, conv_param->kernel_w_, ic4,
-                    conv_param->is_relu_, conv_param->is_relu6_);
+                    sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_h_, conv_param->kernel_w_, ic4, relu,
+                    relu6);

       dst_kernel += sliding->block_channel_;
     } // width loop
@@ -144,6 +147,8 @@ void ConvSWFp32(const float *input_data, const float *packed_weight, const float
                 float *output_data, int task_id, ConvParameter *conv_param, SlidingWindowParam *slidingWindow_param) {
   int ic4 = slidingWindow_param->ic4_channel_ / C4NUM;
   int oc4_res = conv_param->output_channel_ % C4NUM;
+  bool relu = conv_param->act_type_ == ActType_Relu;
+  bool relu6 = conv_param->act_type_ == ActType_Relu6;
   const float *src = input_data;
   float *dst = NULL;
   if (oc4_res == 0) {
@@ -169,28 +174,26 @@ void ConvSWFp32(const float *input_data, const float *packed_weight, const float

       if (slidingWindow_param->right_ > slidingWindow_param->left_ &&
           slidingWindow_param->bottom_ > slidingWindow_param->top_) {
-        int in_h_start = slidingWindow_param->top_ * conv_param->stride_h_ - conv_param->pad_h_;
-        int in_w_start = slidingWindow_param->left_ * conv_param->stride_w_ - conv_param->pad_w_;
+        int in_h_start = slidingWindow_param->top_ * conv_param->stride_h_ - conv_param->pad_u_;
+        int in_w_start = slidingWindow_param->left_ * conv_param->stride_w_ - conv_param->pad_l_;
         const float *in_t =
           src_data + in_h_start * slidingWindow_param->in_h_step_ + in_w_start * slidingWindow_param->ic4_channel_;
         float *out_t = dst_data + slidingWindow_param->top_ * slidingWindow_param->out_h_step_ +
                        slidingWindow_param->left_ * slidingWindow_param->block_channel_;
 #ifdef ENABLE_ARM64
-        ConvSwFp32Center(out_t, in_t, weight, bias, slidingWindow_param->bottom_ - slidingWindow_param->top_,
-                         slidingWindow_param->right_ - slidingWindow_param->left_, conv_param->kernel_h_,
-                         conv_param->kernel_w_, slidingWindow_param->out_h_step_ * sizeof(float),
-                         slidingWindow_param->block_channel_ * sizeof(float), ic4,
-                         slidingWindow_param->in_sh_step_ * sizeof(float),
-                         slidingWindow_param->in_sw_step_ * sizeof(float),
-                         slidingWindow_param->in_kh_step_ * sizeof(float),
-                         slidingWindow_param->in_kw_step_ * sizeof(float),
-                         conv_param->is_relu_, conv_param->is_relu6_);
+        ConvSwFp32Center(
+          out_t, in_t, weight, bias, slidingWindow_param->bottom_ - slidingWindow_param->top_,
+          slidingWindow_param->right_ - slidingWindow_param->left_, conv_param->kernel_h_, conv_param->kernel_w_,
+          slidingWindow_param->out_h_step_ * sizeof(float), slidingWindow_param->block_channel_ * sizeof(float), ic4,
+          slidingWindow_param->in_sh_step_ * sizeof(float), slidingWindow_param->in_sw_step_ * sizeof(float),
+          slidingWindow_param->in_kh_step_ * sizeof(float), slidingWindow_param->in_kw_step_ * sizeof(float), relu,
+          relu6);
 #else
         SWCenter(out_t, in_t, weight, bias, slidingWindow_param->bottom_ - slidingWindow_param->top_,
-                 slidingWindow_param->right_ - slidingWindow_param->left_, conv_param->kernel_h_,
-                 conv_param->kernel_w_, slidingWindow_param->out_h_step_, slidingWindow_param->block_channel_, ic4,
+                 slidingWindow_param->right_ - slidingWindow_param->left_, conv_param->kernel_h_, conv_param->kernel_w_,
+                 slidingWindow_param->out_h_step_, slidingWindow_param->block_channel_, ic4,
                  slidingWindow_param->in_sh_step_, slidingWindow_param->in_sw_step_, slidingWindow_param->in_kh_step_,
-                 slidingWindow_param->in_kw_step_, conv_param->is_relu_, conv_param->is_relu6_);
+                 slidingWindow_param->in_kw_step_, relu, relu6);
 #endif
       }
     } // output C4 loop
@@ -219,6 +222,8 @@ void ConvFp32(float *input_data, float *packed_input, float *packed_weight, cons
   int kernel_plane = kernel_h * kernel_w;
   int unit_size = kernel_plane * ic4 * C4NUM;
   int packed_input_size = output_tile_count * TILE_NUM * unit_size;
+  bool relu = conv_param->act_type_ == ActType_Relu;
+  bool relu6 = conv_param->act_type_ == ActType_Relu6;

   // we accumulate 4 channels per time for input blocks
   int conv_depth = kernel_h * kernel_w;
@@ -240,11 +245,11 @@ void ConvFp32(float *input_data, float *packed_input, float *packed_weight, cons
       if (real_cal_num == TILE_NUM) {
         float *gemm_output = output_data + out_offset;
         gemm_func(gemm_output, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel, output_offset, 0, 0,
-                  conv_param->is_relu_, conv_param->is_relu6_);
+                  relu, relu6);
       } else {
         // res part
         gemm_func(tmp_out_block, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel, output_offset, 0,
-                  0, conv_param->is_relu_, conv_param->is_relu6_);
+                  0, relu, relu6);
         memcpy(output_data + out_offset, tmp_out_block, real_cal_num * out_channel * sizeof(float));
       }
     }
@@ -264,34 +269,42 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_
   int out_w_block = UP_DIV(conv_param->output_w_, out_unit);
   int out_h_block = UP_DIV(conv_param->output_h_, out_unit);
   int output_count = out_w_block * out_h_block;
-  int output_tile_count = UP_DIV(output_count, TILE_NUM);
+  int output_tile_count = UP_DIV(output_count, C12NUM);
   int out_channel = conv_param->output_channel_;
   int oc4 = UP_DIV(out_channel, C4NUM);
+  int oc8 = UP_DIV(out_channel, C8NUM);
   int input_unit_square = input_unit * input_unit;
-  size_t output_offset = oc4 * C4NUM * input_unit_square * sizeof(float);

   float *trans_input = buffer_list[0];
   float *gemm_out = buffer_list[1];
   float *tmp_out_data = buffer_list[2];
   float *tmp_data = buffer_list[3];
-  int trans_input_offset = TILE_NUM * input_unit_square * ic4 * C4NUM;
-  int gemm_out_offset = TILE_NUM * input_unit_square * oc4 * C4NUM;
+  float *col_buffer = buffer_list[4];
+  int trans_input_offset = C12NUM * input_unit_square * ic4 * C4NUM;
+  int gemm_out_offset = C12NUM * input_unit_square * oc8 * C8NUM;
   int tmp_data_offset = input_unit_square * C4NUM;
+  int col_buffer_offset = C12NUM * ic4 * C4NUM;
   // step 1 : filter transform (pre-processed offline)
   // step 2 : input transform (online)
   for (int b = 0; b < in_batch; b++) {
     int in_batch_offset = b * ic4 * C4NUM * conv_param->input_h_ * conv_param->input_w_;
     int tmp_out_batch_offset = b * out_w_block * out_h_block * out_unit * out_unit * oc4 * C4NUM;
     for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_num) {
-      int out_tile_index = thread_id * TILE_NUM;
-      int cal_num = output_count - thread_id * TILE_NUM;
-      cal_num = cal_num > TILE_NUM ? TILE_NUM : cal_num;
+      int out_tile_index = thread_id * C12NUM;
+      int cal_num = output_count - thread_id * C12NUM;
+      cal_num = cal_num > C12NUM ? C12NUM : cal_num;
       WinogradInputTransform(input_data + in_batch_offset, trans_input + task_id * trans_input_offset,
                              tmp_data + task_id * tmp_data_offset, cal_num, out_tile_index, out_w_block, conv_param,
                              input_trans_func);
       // step 3 : gemm
-      gemm_func(gemm_out + task_id * gemm_out_offset, trans_input + task_id * trans_input_offset, trans_weight, NULL,
-                input_unit_square, ic4, oc4 * C4NUM, output_offset, 1, 1, 0, 0);
+      float *src_ptr = trans_input + task_id * trans_input_offset;
+      float *dst_ptr = gemm_out + task_id * gemm_out_offset;
+      float *tmp_col_ptr = col_buffer + task_id * col_buffer_offset;
+      for (int i = 0; i < input_unit_square; ++i) {
+        RowMajor2Col12Major(src_ptr + i * C12NUM * ic4 * C4NUM, tmp_col_ptr, C12NUM, ic4 * C4NUM);
+        MatMulOpt(tmp_col_ptr, trans_weight + i * ic4 * C4NUM * oc8 * C8NUM, dst_ptr + i * C8NUM, NULL, 0, ic4 * C4NUM,
+                  C12NUM, oc8 * C8NUM, input_unit_square, 2);
+      }

       // step 4 : output transform
       WinogradOutputTransform(gemm_out + task_id * gemm_out_offset, tmp_out_data + tmp_out_batch_offset, bias_data,
@@ -442,18 +455,21 @@ void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_dat
   int ic4 = UP_DIV(conv_param->input_channel_, C4NUM);
   int output_channel = conv_param->output_channel_;
   int oc4 = UP_DIV(output_channel, C4NUM);
+  int oc8 = UP_DIV(output_channel, C8NUM);
   int out_w_block = UP_DIV(conv_param->output_w_, OUPUT_UNIT);
   int out_h_block = UP_DIV(conv_param->output_h_, OUPUT_UNIT);
   int output_count = out_w_block * out_h_block;
-  int output_tile_count = UP_DIV(output_count, TILE_NUM);
+  int output_tile_count = UP_DIV(output_count, C12NUM);
   const int input_unit_square = 4 * 4;

   float *tile_buffer = buffer_list[0];
   float *block_unit_buffer = buffer_list[1];
   float *tmp_dst_buffer = buffer_list[2];
   float *nc4hw4_out = buffer_list[3];
-  int tile_buffer_offset = TILE_NUM * input_unit_square * ic4 * C4NUM;
+  float *col_buffer = buffer_list[4];
+  int tile_buffer_offset = C12NUM * input_unit_square * ic4 * C4NUM;
   int block_unit_buffer_offset = input_unit_square * C4NUM;
-  int tmp_dst_buffer_offset = TILE_NUM * input_unit_square * oc4 * C4NUM;
+  int tmp_dst_buffer_offset = C12NUM * input_unit_square * oc8 * C8NUM;
+  int col_buffer_offset = C12NUM * ic4 * C4NUM;

   int input_batch = conv_param->input_batch_;
   for (int batch = 0; batch < input_batch; batch++) {
@@ -461,15 +477,20 @@ void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_dat
     int nc4hw4_buffer_offset = batch * oc4 * C4NUM * conv_param->output_h_ * conv_param->output_w_;

     for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) {
-      int start_index = thread_id * TILE_NUM;
-      int real_cal_num = (output_count - start_index) < TILE_NUM ? (output_count - start_index) : TILE_NUM;
+      int start_index = thread_id * C12NUM;
+      int real_cal_num = (output_count - start_index) < C12NUM ? (output_count - start_index) : C12NUM;
       Conv3x3Fp32InputTransform(input_data + in_batch_offset, tile_buffer + task_id * tile_buffer_offset,
                                 block_unit_buffer + task_id * block_unit_buffer_offset, start_index, real_cal_num,
                                 out_w_block, conv_param);

-      gemm_func(tmp_dst_buffer + task_id * tmp_dst_buffer_offset, tile_buffer + task_id * tile_buffer_offset,
-                transed_weight, NULL, input_unit_square, ic4, oc4 * C4NUM,
-                oc4 * C4NUM * input_unit_square * sizeof(float), 1, 1, 0, 0);
+      float *src_ptr = tile_buffer + task_id * tile_buffer_offset;
+      float *tmp_col_ptr = col_buffer + task_id * col_buffer_offset;
+      float *dst_ptr = tmp_dst_buffer + task_id * tmp_dst_buffer_offset;
+      for (int i = 0; i < input_unit_square; ++i) {
+        RowMajor2Col12Major(src_ptr + i * C12NUM * ic4 * C4NUM, tmp_col_ptr, C12NUM, ic4 * C4NUM);
+        MatMulOpt(tmp_col_ptr, transed_weight + i * ic4 * C4NUM * oc8 * C8NUM, dst_ptr + i * C8NUM, NULL, 0,
+                  ic4 * C4NUM, C12NUM, oc8 * C8NUM, input_unit_square, 2);
+      }

       Conv3x3Fp32OutputTransform(tmp_dst_buffer + task_id * tmp_dst_buffer_offset, nc4hw4_out + nc4hw4_buffer_offset,
                                  bias_data, start_index, real_cal_num, out_w_block, conv_param);
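Both Winograd paths above now run the same per-unit-point GEMM: the C12NUM tiles produced by the input transform are repacked to the col12-major layout MatMulOpt expects, then multiplied against the oc8-blocked filter with out_type 2 (OutType_TileC8), so each C8 column block lands input_unit_square floats apart. The shared step, factored out as a sketch (the wrapper name is invented here; deep stands for ic4 * C4NUM):

/* Sketch of the per-unit-point GEMM shared by ConvWinogardFp32 and
 * Conv3x3Fp32 above; i indexes the input_unit x input_unit plane. */
static void WinoPointGemm(float *src_ptr, const float *trans_weight, float *dst_ptr, float *col_buffer, int i,
                          int deep, int oc8, int input_unit_square) {
  /* 12 tiles x deep, row-major -> col12-major for the GEMM lhs */
  RowMajor2Col12Major(src_ptr + i * C12NUM * deep, col_buffer, C12NUM, deep);
  /* C is [C12NUM x oc8*C8NUM], written tile-C8 interleaved: one C8 block
   * every input_unit_square floats (out_type 2 == OutType_TileC8) */
  MatMulOpt(col_buffer, trans_weight + i * deep * oc8 * C8NUM, dst_ptr + i * C8NUM, NULL, ActType_No, deep, C12NUM,
            oc8 * C8NUM, input_unit_square, OutType_TileC8);
}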
diff --git a/mindspore/lite/nnacl/fp32/conv_depthwise.c b/mindspore/lite/nnacl/fp32/conv_depthwise.c
index c71e135ba78..26c93f6c7ab 100644
--- a/mindspore/lite/nnacl/fp32/conv_depthwise.c
+++ b/mindspore/lite/nnacl/fp32/conv_depthwise.c
@@ -38,13 +38,15 @@ void ConvDw(float *output_data, const float *input_data, const float *weight_dat
   int h_step = UP_DIV(conv_param->output_h_, conv_param->thread_num_);
   int h_start = h_step * task_id;
   int h_end = MSMIN(h_start + h_step, conv_param->output_h_);
+  bool relu = conv_param->act_type_ == ActType_Relu;
+  bool relu6 = conv_param->act_type_ == ActType_Relu6;
   for (int b = 0; b < conv_param->output_batch_; b++) {
     const float *src = input_data + b * conv_param->input_h_ * conv_param->input_w_ * conv_param->input_channel_;
     float *dst = output_data + b * conv_param->output_h_ * conv_param->output_w_ * conv_param->output_channel_;
     for (int oh = h_start; oh < h_end; oh++) {
       float *dst_data = dst + oh * conv_param->output_w_ * conv_param->output_channel_;
-      int ih_origin = oh * conv_param->stride_h_ - conv_param->pad_h_;
+      int ih_origin = oh * conv_param->stride_h_ - conv_param->pad_u_;
       int start_kh = MSMAX(0, UP_DIV(-ih_origin, conv_param->dilation_h_));
       int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih_origin, conv_param->dilation_h_));
@@ -60,13 +62,13 @@ void ConvDw(float *output_data, const float *input_data, const float *weight_dat
         int in_sw_step = conv_param->stride_w_ * conv_param->input_channel_;
         for (int kw = 0; kw < conv_param->kernel_w_; kw++) {
           int out_w_start = MSMAX(
-            0, (conv_param->pad_w_ - conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) / conv_param->stride_w_);
-          int out_w_end = MSMIN(conv_param->output_w_, (conv_param->input_w_ + conv_param->pad_w_ -
+            0, (conv_param->pad_l_ - conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) / conv_param->stride_w_);
+          int out_w_end = MSMIN(conv_param->output_w_, (conv_param->input_w_ + conv_param->pad_l_ -
                                                         conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) /
                                                          conv_param->stride_w_);

           float *dst_w = dst_data + out_w_start * conv_param->output_channel_;
-          int iw_origin = (out_w_start * conv_param->stride_w_) - conv_param->pad_w_ + conv_param->dilation_w_ * kw;
+          int iw_origin = (out_w_start * conv_param->stride_w_) - conv_param->pad_l_ + conv_param->dilation_w_ * kw;

           const float *src_kw = src_kh + iw_origin * conv_param->input_channel_;
           int num_pixels = out_w_end - out_w_start;
@@ -75,10 +77,10 @@ void ConvDw(float *output_data, const float *input_data, const float *weight_dat
           weight_kh += conv_param->output_channel_;
         }
       }
-      if (conv_param->is_relu_) {
+      if (relu) {
         ReluFp32(dst_data, dst_data, conv_param->output_w_ * conv_param->output_channel_);
       }
-      if (conv_param->is_relu6_) {
+      if (relu6) {
         Relu6Fp32(dst_data, dst_data, conv_param->output_w_ * conv_param->output_channel_);
       }
     }
@@ -91,16 +93,16 @@ void InitSlidingParam(SlidingWindowParam *sliding, const ConvParameter *conv_par
   int top = 0;
   int bottom = conv_param->output_h_;

-  for (; left * conv_param->stride_w_ < conv_param->pad_w_; left++) {
+  for (; left * conv_param->stride_w_ < conv_param->pad_l_; left++) {
   }
-  for (; (right - 1) * conv_param->stride_w_ - conv_param->pad_w_ + conv_param->kernel_w_ * conv_param->dilation_w_ >
+  for (; (right - 1) * conv_param->stride_w_ - conv_param->pad_l_ + conv_param->kernel_w_ * conv_param->dilation_w_ >
           conv_param->input_w_ && right > left;
        right--) {
   }
-  for (; top * conv_param->stride_h_ < conv_param->pad_h_; top++) {
+  for (; top * conv_param->stride_h_ < conv_param->pad_u_; top++) {
   }
-  for (; (bottom - 1) * conv_param->stride_h_ - conv_param->pad_h_ + conv_param->kernel_h_ * conv_param->dilation_h_ >
+  for (; (bottom - 1) * conv_param->stride_h_ - conv_param->pad_u_ + conv_param->kernel_h_ * conv_param->dilation_h_ >
          conv_param->input_h_ && bottom > top;
        bottom--) {
   }
@@ -181,16 +183,18 @@ void DepthwiseBorderPixel(float *dst, const float *src, const float *weight, con
 void DepthwiseBorder(float *dst, const float *src, const float *weight, const float *bias, int top, int bottom,
                      int left, int right, const ConvParameter *conv_param, const SlidingWindowParam *sliding) {
+  bool relu = conv_param->act_type_ == ActType_Relu;
+  bool relu6 = conv_param->act_type_ == ActType_Relu6;
   float *dst_h = dst + top * sliding->out_h_step_;
   for (int oh = top; oh < bottom; oh++) {
-    int ih = oh * conv_param->stride_h_ - conv_param->pad_h_;
+    int ih = oh * conv_param->stride_h_ - conv_param->pad_u_;
     int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_));
     int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_));
     const float *src_h = src + ih * sliding->in_h_step_;

     float *dst_kernel = dst_h + left * sliding->block_channel_;
     for (int ow = left; ow < right; ow++) {
-      int iw = ow * conv_param->stride_w_ - conv_param->pad_w_;
+      int iw = ow * conv_param->stride_w_ - conv_param->pad_l_;
       int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_));
       int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_));
       const float *src_w = src_h + iw * sliding->block_channel_;
@@ -201,11 +205,10 @@ void DepthwiseBorder(float *dst, const float *src, const float *weight, const fl
 #ifdef ENABLE_ARM64
       ConvDwFp32Border(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw,
                        sliding->in_kh_step_ * sizeof(float), sliding->in_kw_step_ * sizeof(float),
-                       conv_param->kernel_w_ * C4NUM * sizeof(float), conv_param->is_relu_, conv_param->is_relu6_);
+                       conv_param->kernel_w_ * C4NUM * sizeof(float), relu, relu6);
 #else
       DepthwiseBorderPixel(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw,
-                           sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_ * C4NUM,
-                           conv_param->is_relu_, conv_param->is_relu6_);
+                           sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_ * C4NUM, relu, relu6);
 #endif
       dst_kernel += sliding->block_channel_;
     } // width loop
@@ -259,6 +262,8 @@ void DepthwiseCenter(float *dst, const float *src, const float *weight, const fl
 // conv depthwise fp32: sliding window
 void ConvDwC4Fp32(float *output_data, const float *input_data, const float *weight_data, const float *bias_data,
                   const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id) {
+  bool relu = conv_param->act_type_ == ActType_Relu;
+  bool relu6 = conv_param->act_type_ == ActType_Relu6;
   const float *src = input_data;
   float *dst = output_data;
   for (int b = 0; b < conv_param->output_batch_; b++) {
@@ -277,8 +282,8 @@ void ConvDwC4Fp32(float *output_data, const float *input_data, const float *weig
                       conv_param->output_w_, conv_param, sliding);

       if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) {
-        int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_;
-        int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_;
+        int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_u_;
+        int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_;
         const float *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_;
         float *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;
 #ifdef ENABLE_ARM64
@@ -286,12 +291,12 @@ void ConvDwC4Fp32(float *output_data, const float *input_data, const float *weig
                         conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_ * sizeof(float),
                         sliding->block_channel_ * sizeof(float), sliding->in_sh_step_ * sizeof(float),
                         sliding->in_sw_step_ * sizeof(float), sliding->in_kh_step_ * sizeof(float),
-                        sliding->in_kw_step_ * sizeof(float), conv_param->is_relu_, conv_param->is_relu6_);
+                        sliding->in_kw_step_ * sizeof(float), relu, relu6);
 #else
         DepthwiseCenter(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, sliding->right_ - sliding->left_,
                         conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_, sliding->block_channel_,
-                        sliding->in_sh_step_, sliding->in_sw_step_, sliding->in_kh_step_, sliding->in_kw_step_,
-                        conv_param->is_relu_, conv_param->is_relu6_);
+                        sliding->in_sh_step_, sliding->in_sw_step_, sliding->in_kh_step_, sliding->in_kw_step_, relu,
+                        relu6);
 #endif
       }
     } // output C4 loop
@@ -454,11 +459,11 @@ void ConvDw3x3Fp32InputTrans(const float *input_data, float *trans_input, float
   memset(trans_input, 0, out_h_block * out_h_block * 16 * C4NUM * sizeof(float));

   for (int oh = 0; oh < out_h_block; oh++) {
-    int ih = oh * 2 - conv_param->pad_h_;
+    int ih = oh * 2 - conv_param->pad_u_;
     int real_h_start = ih > 0 ? 0 : -ih;
     int real_h_end = (ih + input_unit) < conv_param->input_h_ ? input_unit : (conv_param->input_h_ - ih);
     for (int ow = 0; ow < out_w_block; ow++) {
-      int iw = ow * 2 - conv_param->pad_w_;
+      int iw = ow * 2 - conv_param->pad_l_;
       int real_w_start = iw > 0 ? 0 : -iw;
       int real_w_end = (iw + input_unit) < conv_param->input_w_ ? input_unit : (conv_param->input_w_ - iw);
@@ -642,6 +647,8 @@ void ConvDw3x3Fp32OutputUnit(float *src_buf, float *dst_output, const float *bia
 void ConvDw3x3Fp32OutputTrans(float *trans_buffer, float *output_data, const float *bias, int out_h_block,
                               int out_w_block, const ConvParameter *conv_param) {
+  bool relu = conv_param->act_type_ == ActType_Relu;
+  bool relu6 = conv_param->act_type_ == ActType_Relu6;
   int oc4 = UP_DIV(conv_param->output_channel_, C4NUM);
   bool h_in_range = true;
   for (int oh = 0; oh < out_h_block; oh++) {
@@ -661,8 +668,8 @@ void ConvDw3x3Fp32OutputTrans(float *trans_buffer, float *output_data, const flo
         float *buf_ow = buf_oh + ow * 16 * C4NUM;
         float *output_ow = output_oh + real_ow * oc4 * C4NUM;

-        ConvDw3x3Fp32OutputUnit(buf_ow, output_ow, bias, oc4 * C4NUM, conv_param->output_w_, h_in_range, w_in_range,
-                                conv_param->is_relu_, conv_param->is_relu6_);
+        ConvDw3x3Fp32OutputUnit(buf_ow, output_ow, bias, oc4 * C4NUM, conv_param->output_w_, h_in_range, w_in_range, relu,
+                                relu6);
       }
     }
   }
@@ -727,14 +734,14 @@ void DeconvDepthwiseBorder(float *dst, const float *src, const float *weight, in
                            const ConvParameter *conv_param, const SlidingWindowParam *sliding) {
   const float *src_h = src + top * sliding->out_h_step_;
   for (int ih = top; ih < bottom; ih++) {
-    int oh = ih * conv_param->stride_h_ - conv_param->pad_h_;
+    int oh = ih * conv_param->stride_h_ - conv_param->pad_u_;
     int start_kh = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_));
     int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_));
     float *dst_h = dst + oh * sliding->in_h_step_;

     const float *src_kernel = src_h + left * sliding->block_channel_;
     for (int iw = left; iw < right; iw++) {
-      int ow = iw * conv_param->stride_w_ - conv_param->pad_w_;
+      int ow = iw * conv_param->stride_w_ - conv_param->pad_l_;
       int start_kw = MSMAX(0, UP_DIV(-ow, conv_param->dilation_w_));
       int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->output_w_ - ow, conv_param->dilation_w_));
       float *dst_w = dst_h + ow * sliding->block_channel_;
@@ -790,12 +797,14 @@ void DeconvDepthwiseCenter(float *dst, const float *src, const float *weight, in
 #endif

 void DeconvDepthwisePostFunc(float *dst, const float *bias, int block_channel, const ConvParameter *conv_param) {
+  bool relu = conv_param->act_type_ == ActType_Relu;
+  bool relu6 = conv_param->act_type_ == ActType_Relu6;
   float *dst_k = dst;
   for (int k = 0; k < conv_param->output_h_ * conv_param->output_w_; k++) {
     for (int c = 0; c < C4NUM; c++) {
       dst_k[c] += bias[c];
-      dst_k[c] = (conv_param->is_relu_) ? (MSMAX(0, dst_k[c])) : (dst_k[c]);
-      dst_k[c] = (conv_param->is_relu6_) ? (MSMIN(6, MSMAX(0, dst_k[c]))) : (dst_k[c]);
+      dst_k[c] = (relu) ? (MSMAX(0, dst_k[c])) : (dst_k[c]);
+      dst_k[c] = (relu6) ? (MSMIN(6, MSMAX(0, dst_k[c]))) : (dst_k[c]);
     }
     dst_k += block_channel;
   }
@@ -821,8 +830,8 @@ void DeconvDwC4Fp32(float *output_data, const float *input_data, const float *we
                             conv_param->input_w_, conv_param, sliding);

       if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) {
-        int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_;
-        int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_;
+        int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_u_;
+        int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_;
         float *out_t = dst_data + oh_h_start * sliding->in_h_step_ + oh_w_start * sliding->block_channel_;
         const float *in_t =
           src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;
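The InitSlidingParam loops earlier in this file's diff solve small inequalities by iteration; they admit a closed form, e.g. left is the smallest value with left * stride_w_ >= pad_l_, i.e. UP_DIV(pad_l_, stride_w_) (pad_l_ = 1, stride_w_ = 2 gives left = 1). A sketch of the width bounds under that reading (hypothetical helper; the lower clamp of right against left, and negative intermediate values, are omitted for brevity):

/* Closed form of the InitSlidingParam border search (sketch). */
static void SlidingWidthBounds(const ConvParameter *p, int *left, int *right) {
  *left = UP_DIV(p->pad_l_, p->stride_w_);
  *right = MSMIN(p->output_w_,
                 (p->input_w_ + p->pad_l_ - p->kernel_w_ * p->dilation_w_) / p->stride_w_ + 1);
}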
diff --git a/mindspore/lite/nnacl/fp32/deconv.c b/mindspore/lite/nnacl/fp32/deconv.c
index 5996e962ef7..b9a78a622c0 100644
--- a/mindspore/lite/nnacl/fp32/deconv.c
+++ b/mindspore/lite/nnacl/fp32/deconv.c
@@ -57,8 +57,8 @@ int DeConvPostFp32C12x8(const float *src, float *tmp, const float *bias, float *

   for (int ih = 0; ih < conv_param->input_h_; ih++) {
     for (int iw = 0; iw < conv_param->input_w_; iw++) {
-      int oh = ih * conv_param->stride_h_ - conv_param->pad_h_;
-      int ow = iw * conv_param->stride_w_ - conv_param->pad_w_;
+      int oh = ih * conv_param->stride_h_ - conv_param->pad_u_;
+      int ow = iw * conv_param->stride_w_ - conv_param->pad_l_;

       int kh_start = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_));
       int kh_end = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_));
@@ -97,7 +97,7 @@ int DeConvPostFp32C12x8(const float *src, float *tmp, const float *bias, float *
     } /*ih*/
   }   /*oc8*/

-  PostConvFuncFp32C8(tmp, dst, bias, output_channel, output_plane, conv_param->output_channel_, conv_param->is_relu_,
-                     conv_param->is_relu6_);
+  PostConvFuncFp32C8(tmp, dst, bias, output_channel, output_plane, conv_param->output_channel_,
+                     conv_param->act_type_ == ActType_Relu, conv_param->act_type_ == ActType_Relu6);
   return NNACL_OK;
 }
diff --git a/mindspore/lite/nnacl/fp32/matmul.c b/mindspore/lite/nnacl/fp32/matmul.c
index 286b65bfb60..dd5d2979144 100644
--- a/mindspore/lite/nnacl/fp32/matmul.c
+++ b/mindspore/lite/nnacl/fp32/matmul.c
@@ -356,7 +356,7 @@ void MatMul12x8(const float *a, const float *b, float *dst, const float *bias, A
         dst[ci] = value;
       }
     }
-  } else {
+  } else if (out_type == OutType_C8) {
     /* col8-major * row8-major => col12x8-major */
     int col_8 = UP_ROUND(col, C8NUM);
     int row_12 = UP_ROUND(row, C12NUM);
@@ -364,9 +364,7 @@ void MatMul12x8(const float *a, const float *b, float *dst, const float *bias, A
       for (int c = 0; c < col_8; c++) {
         int r12div = r / C12NUM, r12mod = r % C12NUM;
         int c8div = c / C8NUM, c8mod = c % C8NUM;
-        int c4div = c / C4NUM, c4mod = c % C4NUM;
-        size_t ci = (out_type == OutType_C4) ? (c4div * C4NUM * row_12 + r * C4NUM + c4mod)
-                                             : (c8div * C8NUM * row_12 + r * C8NUM + c8mod);
+        size_t ci = (c8div * C8NUM * row_12 + r * C8NUM + c8mod);
         float value = 0;
         for (int d = 0; d < deep; d++) {
           size_t ai = r12div * deep * C12NUM + d * C12NUM + r12mod;
@@ -379,6 +377,25 @@ void MatMul12x8(const float *a, const float *b, float *dst, const float *bias, A
         dst[ci] = value;
       }
     }
+  } else {
+    for (int i = 0; i < row; ++i) {
+      int src_r_offset = i;
+      int dst_r_offset = i * col * stride;
+      for (int j = 0; j < col; ++j) {
+        int c8div = j / 8, c8mod = j % 8;
+        size_t ci = dst_r_offset + c8div * 8 * stride + c8mod;
+        float value = 0;
+        for (int d = 0; d < deep; ++d) {
+          size_t ai = src_r_offset + d * row;
+          size_t bi = c8div * deep * 8 + d * 8 + c8mod;
+          value = value + a[ai] * b[bi];
+        }
+        if (bias != NULL) value += bias[j];
+        if (act_type == ActType_Relu6) value = MSMIN(6.0f, value);
+        if (act_type != ActType_No) value = MSMAX(0.0f, value);
+        dst[ci] = value;
+      }
+    }
   }
   return;
 }
@@ -387,7 +404,7 @@ void MatMulOpt(const float *a, const float *b, float *c, const float *bias, ActT
                int col, size_t stride, int out_type) {
 #ifdef ENABLE_ARM64
   MatmulFloatNeon64Opt(a, b, c, bias, (int)act_type, deep, row, col, stride, (int)(out_type == OutType_Nhwc),
-                       (int)(out_type == OutType_TileC8));
+                       (int)(out_type == OutType_TileC8));
 #else
   MatMul12x8(a, b, c, bias, act_type, deep, row, col, stride, out_type);
 #endif
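The new final branch is the portable fallback for OutType_TileC8, producing the same layout the rewritten WriteWino path emits on ARM64: element (r, c) of a row x col result lands at r * col * stride + (c / 8) * 8 * stride + (c % 8). The index math as a standalone sketch:

#include <stddef.h>

/* Destination index of element (r, c) in the OutType_TileC8 layout above:
 * rows are col * stride floats apart, and each 8-wide column block a
 * further 8 * stride floats apart. */
static size_t TileC8Index(int r, int c, int col, size_t stride) {
  return (size_t)r * col * stride + (size_t)(c / 8) * 8 * stride + (size_t)(c % 8);
}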
diff --git a/mindspore/lite/nnacl/fp32_grad/pack_ext.c b/mindspore/lite/nnacl/fp32_grad/pack_ext.c
index d95cc6daf9c..48665e83f26 100644
--- a/mindspore/lite/nnacl/fp32_grad/pack_ext.c
+++ b/mindspore/lite/nnacl/fp32_grad/pack_ext.c
@@ -20,9 +20,9 @@ static int is_a_ge_zero_and_a_lt_b(int a, int b) { return (unsigned)(a) < (unsig

 void im2col_hwc(const float *in_data, float *data_col, ConvParameter *conv_param) {
-  const int pad_left = /*conv_param->pad_l_*/ conv_param->pad_w_;
+  const int pad_left = conv_param->pad_l_;
   // const int pad_right = /*conv_param->pad_r_*/conv_param->pad_w_;
-  const int pad_up = /*conv_param->pad_u_*/ conv_param->pad_h_;
+  const int pad_up = conv_param->pad_u_;
   // const int pad_down = /*conv_param->pad_d/*/conv_param->pad_h_;

   const int stride_h = conv_param->stride_h_;
@@ -72,9 +72,9 @@ void im2col_hwc(const float *in_data, float *data_col, ConvParameter *conv_param

 // output matrix is (kernel_h*kernel_w*channels)X(output_h*output_w)
 void im2row_hwc(const float *in_data, float *data_row, ConvParameter *conv_param) {
-  const int pad_left = /*conv_param->pad_l_*/ conv_param->pad_w_;
+  const int pad_left = conv_param->pad_l_;
   // const int pad_right = /*conv_param->pad_r_*/conv_param->pad_w_;
-  const int pad_up = /*conv_param->pad_u_*/ conv_param->pad_h_;
+  const int pad_up = conv_param->pad_u_;
   // const int pad_down = /*conv_param->pad_d/*/conv_param->pad_h_;

   const int stride_h = conv_param->stride_h_;
diff --git a/mindspore/lite/nnacl/int8/conv_depthwise_int8.c b/mindspore/lite/nnacl/int8/conv_depthwise_int8.c
index fc173973873..817d549a39b 100644
--- a/mindspore/lite/nnacl/int8/conv_depthwise_int8.c
+++ b/mindspore/lite/nnacl/int8/conv_depthwise_int8.c
@@ -68,14 +68,14 @@ void DepthwiseBorderInt8(int8_t *dst, const int16_t *src, const int16_t *weight,
                          bool per_channel) {
   int8_t *dst_h = dst + top * sliding->out_h_step_;
   for (int oh = top; oh < bottom; oh++) {
-    int ih = oh * conv_param->stride_h_ - conv_param->pad_h_;
+    int ih = oh * conv_param->stride_h_ - conv_param->pad_u_;
     int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_));
     int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_));
     const int16_t *src_h = src + ih * sliding->in_h_step_;

     int8_t *dst_kernel = dst_h + left * sliding->block_channel_;
     for (int ow = left; ow < right; ow++) {
-      int iw = ow * conv_param->stride_w_ - conv_param->pad_w_;
+      int iw = ow * conv_param->stride_w_ - conv_param->pad_l_;
       int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_));
       int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_));
       const int16_t *src_w = src_h + iw * sliding->block_channel_;
@@ -186,8 +186,8 @@ void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *w
                          per_channel);

       if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) {
-        int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_;
-        int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_;
+        int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_u_;
+        int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_;
         const int16_t *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_;
         int8_t *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;
 #ifdef ENABLE_ARM64
@@ -241,14 +241,14 @@ void DeconvDepthwiseBorderInt8(int32_t *dst, const int16_t *src, const int16_t *
                                int right, const ConvParameter *conv_param, const SlidingWindowParam *sliding) {
   const int16_t *src_h = src + top * sliding->out_h_step_;
   for (int ih = top; ih < bottom; ih++) {
-    int oh = ih * conv_param->stride_h_ - conv_param->pad_h_;
+    int oh = ih * conv_param->stride_h_ - conv_param->pad_u_;
     int start_kh = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_));
     int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_));
     int32_t *dst_h = dst + oh * sliding->in_h_step_;

     const int16_t *src_kernel = src_h + left * sliding->block_channel_;
     for (int iw = left; iw < right; iw++) {
-      int ow = iw * conv_param->stride_w_ - conv_param->pad_w_;
+      int ow = iw * conv_param->stride_w_ - conv_param->pad_l_;
       int start_kw = MSMAX(0, UP_DIV(-ow, conv_param->dilation_w_));
       int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->output_w_ - ow, conv_param->dilation_w_));
       int32_t *dst_w = dst_h + ow * C4NUM;
@@ -341,8 +341,8 @@ void DeconvDwInt8(int8_t *output_data, int32_t *output_buffer, const int16_t *in
                               conv_param->input_w_, conv_param, sliding);

       if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) {
-        int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_;
-        int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_;
+        int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_u_;
+        int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_;
         int32_t *out_t = output_buffer + oh_h_start * sliding->in_h_step_ + oh_w_start * sliding->block_channel_;
         const int16_t *in_t =
           src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;
diff --git a/mindspore/lite/nnacl/int8/deconv.c b/mindspore/lite/nnacl/int8/deconv.c
index a00aed84251..0317ce5ea8f 100644
--- a/mindspore/lite/nnacl/int8/deconv.c
+++ b/mindspore/lite/nnacl/int8/deconv.c
@@ -33,8 +33,8 @@ int DeConvPostInt8C8(const int32_t *src, const int32_t *bias, int32_t *tmp, int8
   for (int ih = 0; ih < conv_param->input_h_; ih++) {
     for (int iw = 0; iw < conv_param->input_w_; iw++) {
-      int oh = ih * conv_param->stride_h_ - conv_param->pad_h_;
-      int ow = iw * conv_param->stride_w_ - conv_param->pad_w_;
+      int oh = ih * conv_param->stride_h_ - conv_param->pad_u_;
+      int ow = iw * conv_param->stride_w_ - conv_param->pad_l_;

       int kh_start = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_));
       int kh_end = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_));
@@ -88,8 +88,8 @@ int DeConvPostInt8C4(const int32_t *src, const int32_t *bias, int32_t *tmp, int8
   for (int ih = 0; ih < conv_param->input_h_; ih++) {
     for (int iw = 0; iw < conv_param->input_w_; iw++) {
-      int oh = ih * conv_param->stride_h_ - conv_param->pad_h_;
-      int ow = iw * conv_param->stride_w_ - conv_param->pad_w_;
+      int oh = ih * conv_param->stride_h_ - conv_param->pad_u_;
+      int ow = iw * conv_param->stride_w_ - conv_param->pad_l_;

       int kh_start = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_));
       int kh_end = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_));
diff --git a/mindspore/lite/nnacl/matmul_parameter.h b/mindspore/lite/nnacl/matmul_parameter.h
index 0533b2112c8..783d3631fed 100644
--- a/mindspore/lite/nnacl/matmul_parameter.h
+++ b/mindspore/lite/nnacl/matmul_parameter.h
@@ -29,9 +29,7 @@ typedef void (*MATMUL_OPT_R_FUNC)(const int8_t *a, const int8_t *b, int8_t *dst,

 typedef void (*MAT_TRANS_FUNC)(void *dst, void *a, int row, int col);

-typedef enum ActType { ActType_No, ActType_Relu, ActType_Relu6 } ActType;
-
-typedef enum OutType { OutType_C8 = 0, OutType_Nhwc = 1, OutType_C4 = 2 } OutType;
+typedef enum OutType { OutType_C8 = 0, OutType_Nhwc = 1, OutType_TileC8 = 2 } OutType;

 typedef struct MatMulParameter {
   OpParameter op_parameter_;
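ActType leaves matmul_parameter.h here and is re-added to op_base.h in the next file, so conv_parameter.h can reference it for the new act_type_ field without depending on the matmul header; OutType_C4 is renamed OutType_TileC8 to match the new write path. A small usage sketch once both headers are in their new form:

#include "nnacl/op_base.h"           /* ActType now lives here */
#include "nnacl/matmul_parameter.h"  /* OutType stays here */

/* Sketch: the renamed enumerator selects the Winograd tile-C8 write path
 * in MatMulOpt / MatmulFloatNeon64Opt. */
static int IsWinoOutType(OutType out_type) { return out_type == OutType_TileC8; }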
diff --git a/mindspore/lite/nnacl/op_base.h b/mindspore/lite/nnacl/op_base.h
index 11450bc4893..58b46a57c84 100644
--- a/mindspore/lite/nnacl/op_base.h
+++ b/mindspore/lite/nnacl/op_base.h
@@ -25,7 +25,6 @@
 #define C8NUM 8
 #define C12NUM 12
 #define C16NUM 16
-#define BLOCK 4
 #define TILE_NUM 8
 
 #define MSMIN(x, y) ((x) < (y) ? (x) : (y))
@@ -62,4 +61,6 @@ typedef struct OpParameter {
   int thread_num_;
 } OpParameter;
 
+typedef enum ActType { ActType_No, ActType_Relu, ActType_Relu6 } ActType;
+
 #endif  // MINDSPORE_LITE_NNACL_OP_BASE_H_
diff --git a/mindspore/lite/nnacl/pack.c b/mindspore/lite/nnacl/pack.c
index 5ae56ef406a..1a5e1c7d5d9 100644
--- a/mindspore/lite/nnacl/pack.c
+++ b/mindspore/lite/nnacl/pack.c
@@ -158,14 +158,14 @@ void Conv1x1InputPack(const void *src_ptr, void *dst_ptr, ConvParameter *conv_pa
   char *src = (char *)src_ptr;
   char *dst = (char *)dst_ptr;
   for (int dst_h = 0; dst_h < conv_param->output_h_; dst_h++) {
-    int src_h = dst_h * conv_param->stride_h_ - conv_param->pad_h_;
+    int src_h = dst_h * conv_param->stride_h_ - conv_param->pad_u_;
     if (src_h < 0 || src_h >= conv_param->input_h_) {
       continue;
     }
     const char *src_h_ptr = src + src_h * conv_param->input_w_ * conv_param->input_channel_ * data_size;
     char *dst_h_ptr = dst + dst_h * conv_param->output_w_ * conv_param->input_channel_ * data_size;
     for (int dst_w = 0; dst_w < conv_param->output_w_; dst_w++) {
-      int src_w = dst_w * conv_param->stride_w_ - conv_param->pad_w_;
+      int src_w = dst_w * conv_param->stride_w_ - conv_param->pad_l_;
       if (src_w < 0 || src_w >= conv_param->input_w_) {
         continue;
       }
@@ -296,8 +296,8 @@ void Im2ColPackUnitFp32(const float *input_data, ConvParameter *conv_param, floa
   int kernel_w = conv_param->kernel_w_;
   int stride_h = conv_param->stride_h_;
   int stride_w = conv_param->stride_w_;
-  int pad_h = conv_param->pad_h_;
-  int pad_w = conv_param->pad_w_;
+  int pad_h = conv_param->pad_u_;
+  int pad_w = conv_param->pad_l_;
   int dilation_h = conv_param->dilation_h_;
   int dilation_w = conv_param->dilation_w_;
   int in_channel = conv_param->input_channel_;
@@ -348,8 +348,8 @@ void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real
   int kernel_w = conv_param->kernel_w_;
   int stride_h = conv_param->stride_h_;
   int stride_w = conv_param->stride_w_;
-  int pad_h = conv_param->pad_h_;
-  int pad_w = conv_param->pad_w_;
+  int pad_h = conv_param->pad_u_;
+  int pad_w = conv_param->pad_l_;
   int dilation_h = conv_param->dilation_h_;
   int dilation_w = conv_param->dilation_w_;
   int in_channel = conv_param->input_channel_;
@@ -419,8 +419,8 @@ void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int r
   int kernel_w = conv_param->kernel_w_;
   int stride_h = conv_param->stride_h_;
   int stride_w = conv_param->stride_w_;
-  int pad_h = conv_param->pad_h_;
-  int pad_w = conv_param->pad_w_;
+  int pad_h = conv_param->pad_u_;
+  int pad_w = conv_param->pad_l_;
   int dilation_h = conv_param->dilation_h_;
   int dilation_w = conv_param->dilation_w_;
   int in_channel = conv_param->input_channel_;
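Conv1x1InputPack crops rather than clips: an output position whose source row or column falls outside the input is simply skipped, leaving the pre-zeroed destination as implicit padding. A self-contained sketch of that mapping (names are illustrative, not the nnacl API):

#include <string.h>

/* Sketch: crop/stride an HxWxC float tensor the way Conv1x1InputPack does,
 * assuming dst was zero-initialized by the caller. */
static void Crop1x1(const float *src, float *dst, int in_h, int in_w, int out_h, int out_w,
                    int channel, int stride_h, int stride_w, int pad_u, int pad_l) {
  for (int dst_h = 0; dst_h < out_h; dst_h++) {
    int src_h = dst_h * stride_h - pad_u;
    if (src_h < 0 || src_h >= in_h) continue;   /* padded row: keep zeros */
    for (int dst_w = 0; dst_w < out_w; dst_w++) {
      int src_w = dst_w * stride_w - pad_l;
      if (src_w < 0 || src_w >= in_w) continue; /* padded column: keep zeros */
      memcpy(dst + (dst_h * out_w + dst_w) * channel,
             src + (src_h * in_w + src_w) * channel, channel * sizeof(float));
    }
  }
}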
diff --git a/mindspore/lite/nnacl/winograd_transform.c b/mindspore/lite/nnacl/winograd_transform.c
index 3ff4b449463..25ba3caa5c8 100644
--- a/mindspore/lite/nnacl/winograd_transform.c
+++ b/mindspore/lite/nnacl/winograd_transform.c
@@ -24,8 +24,8 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float *
   int output_unit = conv_param->output_unit_;
   int in_channel = conv_param->input_channel_;
   int ic4 = UP_DIV(in_channel, C4NUM);
-  int pad_h = conv_param->pad_h_;
-  int pad_w = conv_param->pad_w_;
+  int pad_h = conv_param->pad_u_;
+  int pad_w = conv_param->pad_l_;
   int input_h = conv_param->input_h_;
   int input_w = conv_param->input_w_;
   if (out_w_block_num == 0) {
@@ -42,7 +42,7 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float *
     int interval_y_e = src_y_e < input_h ? input_unit : (input_h - src_y_s);
 
     int src_plane_offset = ic4 * C4NUM * (src_y_s * input_w + src_x_s);
-    int dst_plane_offset = c * C4NUM;
+    int dst_plane_offset = c * C4NUM * ic4;
     for (int ic = 0; ic < ic4; ic++) {
       // clear tmp buffer
       memset(tmp_data, 0, input_unit * input_unit * C4NUM * sizeof(float));
@@ -67,8 +67,8 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float *
       }
 
       // input transform
-      int dst_ic4_offset = dst_plane_offset + ic * TILE_NUM * C4NUM;
-      size_t dst_step = ic4 * C4NUM * TILE_NUM;
+      int dst_ic4_offset = dst_plane_offset + ic * C4NUM;
+      size_t dst_step = C12NUM * ic4 * C4NUM;
       float *trans_input_ptr = trans_input + dst_ic4_offset;
       input_trans_func(tmp_data, trans_input_ptr, C4NUM, dst_step);
     }
@@ -86,6 +86,7 @@ void WinogradOutputTransform(const float *gemm_out, float *tmp_out_data, const f
   int output_h_unit_block = UP_DIV(output_h, output_unit);
   int output_channel = conv_param->output_channel_;
   int oc4 = UP_DIV(output_channel, C4NUM);
+  int oc8 = UP_DIV(output_channel, C8NUM);
   int input_unit = conv_param->input_unit_;
   if (output_unit_num == 0) {
     return;
@@ -93,17 +94,19 @@ void WinogradOutputTransform(const float *gemm_out, float *tmp_out_data, const f
   for (int i = 0; i < cal_num; i++) {
     int dst_x_s = out_tile_index % output_unit_num;
     int dst_y_s = out_tile_index / output_unit_num;
-    int src_tile_offset = i * oc4 * C4NUM * input_unit * input_unit;
+    int src_tile_offset = i * oc8 * C8NUM * input_unit * input_unit;
     int dst_tile_offset = C4NUM * output_unit * (dst_x_s + dst_y_s * output_w_unit_block * output_unit);
 
     for (int j = 0; j < oc4; j++) {
-      int src_oc4_offset = src_tile_offset + j * input_unit * input_unit * C4NUM;
+      int c8_block = j / 2;
+      int c8_res = j % 2;
+      int src_oc4_offset = src_tile_offset + c8_block * input_unit * input_unit * C8NUM + c8_res * C4NUM;
       int dst_oc4_offset =
        dst_tile_offset + j * C4NUM * output_h_unit_block * output_w_unit_block * output_unit * output_unit;
      const float *src_ptr = gemm_out + src_oc4_offset;
      const float *bias_ptr = bias_data + j * C4NUM;
      float *dst_ptr = tmp_out_data + dst_oc4_offset;
-      output_trans_func(src_ptr, dst_ptr, bias_ptr, C4NUM, output_w_unit_block * output_unit);
+      output_trans_func(src_ptr, dst_ptr, bias_ptr, C8NUM, output_w_unit_block * output_unit);
     }
     out_tile_index++;
   }
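The input-transform hunks change the trans_input layout: tiles of one 12-tile GEMM block (C12NUM) become adjacent, with a tile's ic4 channel blocks interleaved inside it, and successive transform points striding over the whole block. A small index sketch under that reading:

#include <assert.h>

enum { C4 = 4, C12 = 12 };

/* Sketch: float4 offset of tile t (0..11), channel block ic (0..ic4-1),
 * transform point z, in the new trans_input layout. */
static int TransInputOffset(int t, int ic, int z, int ic4) {
  return z * (C12 * ic4 * C4)  /* dst_step between transform points */
         + t * (C4 * ic4)      /* dst_plane_offset: tiles are adjacent */
         + ic * C4;            /* channel blocks interleave within a tile */
}

int main(void) {
  /* with ic4 = 2: tile 1 / block 1 sits right after tile 1 / block 0 */
  assert(TransInputOffset(1, 0, 0, 2) == 8);
  assert(TransInputOffset(1, 1, 0, 2) == 12);
  return 0;
}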
@@ -283,8 +286,8 @@ void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, floa
   int input_channel = conv_param->input_channel_;
   int input_width = conv_param->input_w_;
   int input_height = conv_param->input_h_;
-  int pad_w = conv_param->pad_w_;
-  int pad_h = conv_param->pad_h_;
+  int pad_w = conv_param->pad_l_;
+  int pad_h = conv_param->pad_u_;
   int ic4 = UP_DIV(input_channel, C4NUM);
   const int input_unit = 4;
   if (out_w_block == 0) {
@@ -300,7 +303,7 @@ void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, floa
     int real_y_end = (origin_y + input_unit) < input_height ? input_unit : (input_height - origin_y);
 
     int src_plane_offset = ic4 * C4NUM * (origin_y * input_width + origin_x);
-    int dst_plane_offset = cal_id * C4NUM;
+    int dst_plane_offset = cal_id * C4NUM * ic4;
     for (int ic = 0; ic < ic4; ic++) {
       // clear tmp buffer
       memset(tmp_data, 0, input_unit * input_unit * C4NUM * sizeof(float));
@@ -326,8 +329,8 @@ void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, floa
       }
 
       // input transform
-      int dst_ic4_offset = dst_plane_offset + ic * TILE_NUM * C4NUM;
-      size_t dst_step = ic4 * C4NUM * TILE_NUM;
+      int dst_ic4_offset = dst_plane_offset + ic * C4NUM;
+      size_t dst_step = C12NUM * ic4 * C4NUM;
       float *trans_input_ptr = trans_input + dst_ic4_offset;
       Conv3x3Fp32InputUnit(tmp_data, trans_input_ptr, dst_step);
     }
@@ -336,8 +339,8 @@ void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, floa
 
 void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4, int output_channel,
                                 int kernel_plane, int oc_block) {
-  const int input_unit = 4;
-  int dst_step = iC4 * C4NUM * oc_block;
+  int oc_plane_block = UP_DIV(output_channel, oc_block);
+  int dst_step = iC4 * C4NUM * oc_block * oc_plane_block;
   if (oc_block == 0) {
     return;
   }
@@ -345,7 +348,7 @@ void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4
     int oc_block_num = o / oc_block;
     int oc_block_rem = o % oc_block;
     int src_oc_offset = o * iC4 * C4NUM * kernel_plane;
-    int dst_oc_offset = oc_block_num * oc_block * iC4 * C4NUM * input_unit * input_unit + oc_block_rem;
+    int dst_oc_offset = oc_block_num * oc_block * iC4 * C4NUM + oc_block_rem;
     for (int i = 0; i < iC4; i++) {
       float *src_ic4_ptr = weight_data + src_oc_offset + i * kernel_plane * C4NUM;
       float *dst_ic4_ptr = trans_weight + dst_oc_offset + i * oc_block * C4NUM;
@@ -559,24 +562,24 @@ void Conv3x3Fp32OutputUnit(const float *gemm_out, const float *bias_data, float
   float32x4_t bias_ptr = vld1q_f32(bias_data);
 
   float32x4_t s00 = vld1q_f32(gemm_out);
-  float32x4_t s01 = vld1q_f32(gemm_out + 4);
-  float32x4_t s02 = vld1q_f32(gemm_out + 8);
-  float32x4_t s03 = vld1q_f32(gemm_out + 12);
+  float32x4_t s01 = vld1q_f32(gemm_out + 8);
+  float32x4_t s02 = vld1q_f32(gemm_out + 16);
+  float32x4_t s03 = vld1q_f32(gemm_out + 24);
 
-  float32x4_t s10 = vld1q_f32(gemm_out + 16);
-  float32x4_t s11 = vld1q_f32(gemm_out + 20);
-  float32x4_t s12 = vld1q_f32(gemm_out + 24);
-  float32x4_t s13 = vld1q_f32(gemm_out + 28);
+  float32x4_t s10 = vld1q_f32(gemm_out + 32);
+  float32x4_t s11 = vld1q_f32(gemm_out + 40);
+  float32x4_t s12 = vld1q_f32(gemm_out + 48);
+  float32x4_t s13 = vld1q_f32(gemm_out + 56);
 
-  float32x4_t s20 = vld1q_f32(gemm_out + 32);
-  float32x4_t s21 = vld1q_f32(gemm_out + 36);
-  float32x4_t s22 = vld1q_f32(gemm_out + 40);
-  float32x4_t s23 = vld1q_f32(gemm_out + 44);
+  float32x4_t s20 = vld1q_f32(gemm_out + 64);
+  float32x4_t s21 = vld1q_f32(gemm_out + 72);
+  float32x4_t s22 = vld1q_f32(gemm_out + 80);
+  float32x4_t s23 = vld1q_f32(gemm_out + 88);
 
-  float32x4_t s30 = vld1q_f32(gemm_out + 48);
-  float32x4_t s31 = vld1q_f32(gemm_out + 52);
-  float32x4_t s32 = vld1q_f32(gemm_out + 56);
-  float32x4_t s33 = vld1q_f32(gemm_out + 60);
+  float32x4_t s30 = vld1q_f32(gemm_out + 96);
+  float32x4_t s31 = vld1q_f32(gemm_out + 104);
+  float32x4_t s32 = vld1q_f32(gemm_out + 112);
+  float32x4_t s33 = vld1q_f32(gemm_out + 120);
 
   float32x4_t t00 = vaddq_f32(vaddq_f32(s00, s10), s20);
   float32x4_t t01 = vaddq_f32(vaddq_f32(s01, s11), s21);
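The load offsets double because each transform point in the GEMM output now carries an 8-channel (C8) tile rather than a 4-channel one; a float4 slice for oc4 block j is one half of an oc8 block. A sketch of the indexing those vld1q_f32 offsets follow:

enum { C4NUM_ = 4, C8NUM_ = 8, UNIT = 4 };

/* Sketch: float offset of the 4-channel slice at transform point (y, x) for
 * oc4 block j, with gemm output laid out as [point][oc8][8] per tile. */
static int GemmOutOffset(int y, int x, int j) {
  int c8_block = j / 2; /* which 8-channel block */
  int c8_res = j % 2;   /* low or high float4 half of it */
  return c8_block * UNIT * UNIT * C8NUM_ + (y * UNIT + x) * C8NUM_ + c8_res * C4NUM_;
}
/* For j = 0 the points step by 8 floats: 0, 8, 16, 24, ... 120, which is
 * exactly the progression of the rewritten loads above. */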
@@ -609,24 +612,24 @@ void Conv3x3Fp32OutputUnit(const float *gemm_out, const float *bias_data, float
     const float *bias_ptr = bias_data + i;
 
     float s00 = local_ptr[0];
-    float s01 = (local_ptr + 4)[0];
-    float s02 = (local_ptr + 8)[0];
-    float s03 = (local_ptr + 12)[0];
+    float s01 = (local_ptr + 8)[0];
+    float s02 = (local_ptr + 16)[0];
+    float s03 = (local_ptr + 24)[0];
 
-    float s10 = (local_ptr + 16)[0];
-    float s11 = (local_ptr + 20)[0];
-    float s12 = (local_ptr + 24)[0];
-    float s13 = (local_ptr + 28)[0];
+    float s10 = (local_ptr + 32)[0];
+    float s11 = (local_ptr + 40)[0];
+    float s12 = (local_ptr + 48)[0];
+    float s13 = (local_ptr + 56)[0];
 
-    float s20 = (local_ptr + 32)[0];
-    float s21 = (local_ptr + 36)[0];
-    float s22 = (local_ptr + 40)[0];
-    float s23 = (local_ptr + 44)[0];
+    float s20 = (local_ptr + 64)[0];
+    float s21 = (local_ptr + 72)[0];
+    float s22 = (local_ptr + 80)[0];
+    float s23 = (local_ptr + 88)[0];
 
-    float s30 = (local_ptr + 48)[0];
-    float s31 = (local_ptr + 52)[0];
-    float s32 = (local_ptr + 56)[0];
-    float s33 = (local_ptr + 60)[0];
+    float s30 = (local_ptr + 96)[0];
+    float s31 = (local_ptr + 104)[0];
+    float s32 = (local_ptr + 112)[0];
+    float s33 = (local_ptr + 120)[0];
 
     float t00 = s00 + s10 + s20;
     float t01 = s01 + s11 + s21;
@@ -663,6 +666,7 @@ void Conv3x3Fp32OutputTransform(const float *gemm_out, float *out_data, const fl
   int output_w = conv_param->output_w_;
   int output_h = conv_param->output_h_;
   int oc4 = UP_DIV(output_channel, C4NUM);
+  int oc8 = UP_DIV(output_channel, C8NUM);
   const int input_unit = 4;
   if (out_w_block == 0) {
     return;
@@ -670,11 +674,13 @@ void Conv3x3Fp32OutputTransform(const float *gemm_out, float *out_data, const fl
   for (int i = 0; i < real_cal_num; i++) {
     int out_w_index = (start_index + i) % out_w_block;
     int out_h_index = (start_index + i) / out_w_block;
-    int src_tile_offset = i * oc4 * C4NUM * input_unit * input_unit;
+    int src_tile_offset = i * oc8 * C8NUM * input_unit * input_unit;
     int dst_tile_offset = C4NUM * (out_w_index * OUPUT_UNIT + out_h_index * OUPUT_UNIT * output_w);
 
     for (int j = 0; j < oc4; j++) {
-      int src_oc4_offset = src_tile_offset + j * input_unit * input_unit * C4NUM;
+      int c8_block = j / 2;
+      int c8_res = j % 2;
+      int src_oc4_offset = src_tile_offset + c8_block * input_unit * input_unit * C8NUM + c8_res * C4NUM;
       int dst_oc4_offset = dst_tile_offset + j * C4NUM * output_h * output_w;
       const float *src_ptr = gemm_out + src_oc4_offset;
       const float *bias_ptr = bias_data + j * C4NUM;
@@ -864,8 +870,8 @@ void Conv3x3Uint8InputTransform(const int16_t *input_data, int16_t *trans_input,
   int input_channel = conv_param->input_channel_;
   int input_width = conv_param->input_w_;
   int input_height = conv_param->input_h_;
-  int pad_w = conv_param->pad_w_;
-  int pad_h = conv_param->pad_h_;
+  int pad_w = conv_param->pad_l_;
+  int pad_h = conv_param->pad_u_;
   ConvQuantArg quant_arg = conv_param->conv_quant_arg_;
   int input_zp = quant_arg.input_quant_args_[0].zp_;
   const int ic8 = UP_DIV(input_channel, C8NUM);
@@ -1221,9 +1227,9 @@ void Conv3x3Uint8OutputUnit(const int32_t *gemm_out, const int32_t *bias_data, i
   int32x4_t ls;
   int32x4_t rs;
   if ((conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
-    out_multiplier = vld1q_s32(quant_multiplier);
-    ls = vld1q_s32(left_shift);
-    rs = vld1q_s32(right_shift);
+    out_multiplier = vld1q_s32(quant_multiplier + oc_start);
+    ls = vld1q_s32(left_shift + oc_start);
+    rs = vld1q_s32(right_shift + oc_start);
   } else {
     out_multiplier = vdupq_n_s32(quant_multiplier[0]);
    ls = vdupq_n_s32(left_shift[0]);
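The Conv3x3Uint8OutputUnit change is a correctness fix for per-channel quantization: every 4-channel output block must read its own multipliers and shifts, so the loads are rebased at oc_start instead of always reading channel 0. A scalar sketch of the rule (requantization math elided; FILTER_PER_CHANNEL is a bit flag as in nnacl):

/* Sketch: per-channel vs per-tensor requant parameter lookup for the four
 * output channels of a block starting at oc_start. */
static void LoadMultipliers(const int *quant_multiplier, int per_channel_flags,
                            int filter_per_channel_bit, int oc_start, int out[4]) {
  for (int k = 0; k < 4; k++) {
    out[k] = (per_channel_flags & filter_per_channel_bit)
                 ? quant_multiplier[oc_start + k] /* was [k]: wrong past block 0 */
                 : quant_multiplier[0];           /* per-tensor: one shared value */
  }
}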
diff --git a/mindspore/lite/nnacl/winograd_utils.c b/mindspore/lite/nnacl/winograd_utils.c
index 6a63d9a8440..f53c9197151 100644
--- a/mindspore/lite/nnacl/winograd_utils.c
+++ b/mindspore/lite/nnacl/winograd_utils.c
@@ -4649,43 +4649,41 @@ void OutputTransform8x7Unit(const float *src_data, float *dst_data, const float
 
 // Utilize cost model to compute performance gain.
 // If the gain is greater than got from Im2col, winograd algorithm will be chosen.
 int SelectOutputUnit(ConvParameter *conv_param) {
-  int input_batch = conv_param->input_batch_;
   int kernel_h = conv_param->kernel_h_;
   int kernel_w = conv_param->kernel_w_;
-  int in_channel = conv_param->input_channel_;
-  int out_h = conv_param->output_h_;
+  int in_c = conv_param->input_channel_;
   int out_w = conv_param->output_w_;
-  int out_channel = conv_param->output_channel_;
-  int out_plane = out_h * out_w;
+  int out_h = conv_param->output_h_;
+  int out_c = conv_param->output_channel_;
+  int unit2 = UP_DIV(out_w * out_h, C12NUM * conv_param->op_parameter_.thread_num_);
+  int max_out_unit = (int)(sqrtf((float)unit2));
+  max_out_unit = max_out_unit < MAX_UNIT ? MAX_UNIT : max_out_unit;
+  max_out_unit = max_out_unit > MIN_UNIT ? max_out_unit : MIN_UNIT;
 
-  int max_unit = sqrt((float)(out_plane));
-  max_unit = max_unit > MIN_UNIT ? max_unit : MIN_UNIT;
-  max_unit = max_unit < MAX_UNIT ? max_unit : MAX_UNIT;
-  int output_unit = 1;
-  float ratio = 0.0f;
-  // cost of conventional convolution multiplications
-  float ori_cost = out_plane * out_channel * in_channel * kernel_h * kernel_w;
+  int unit = 0;
+  float max_rate = 0.0f;
+  float common_cost = (float)out_h * out_w * in_c * out_c * kernel_h * kernel_w;
 
-  for (int u = MIN_UNIT; u < max_unit; u++) {
-    int input_unit = u + kernel_h - 1;
-    if (input_unit != 4 && input_unit != 8) {
+  for (int i = MIN_UNIT; i <= max_out_unit; ++i) {
+    int input_unit = i + kernel_w - 1;
+    OutputTransformUnitFunc output_trans_func = GetOutputTransFunc(input_unit, i);
+    if (output_trans_func == NULL) {
       continue;
     }
-    // don't count filter transform cost, because it can be processed once offline.
-    const float input_trans_unit_cost = 2 * input_unit * input_unit * input_unit * in_channel;
-    float gemm_unit_cost = input_unit * input_unit * in_channel * out_channel;
-    float output_trans_unit_cost = input_unit * u * (u + input_unit) * out_channel;
-    // equation (23) in papar
-    float winograd_cost = (input_trans_unit_cost + gemm_unit_cost + output_trans_unit_cost) *
-                          (UP_DIV(out_w, u) * (UP_DIV(out_h, u))) * input_batch;
-    float reduce_rate = ori_cost / winograd_cost;
-    if (reduce_rate > ratio && reduce_rate > 1) {
-      ratio = reduce_rate;
-      output_unit = u;
+    float penalty = ((float)input_unit * input_unit) / ((float)kernel_h * kernel_w) * 0.12f;
+    float wino_cost = ((2 + out_c) * (float)input_unit * input_unit * in_c + ((float)input_unit + i) * i * out_c) *
+                      UP_DIV(out_w, i) * UP_DIV(out_h, i);
+    float reduce_rate = common_cost / wino_cost - penalty;
+    if (reduce_rate > max_rate) {
+      max_rate = reduce_rate;
+      unit = i;
     }
   }
+  if (max_rate < 1.0f) {
+    return 1;
+  }
   // If output_unit is 1, then it is conventional convolution
-  return output_unit;
+  return unit;
 }
 
 InputTransformUnitFunc GetInputTransFunc(int input_unit) {
@@ -4719,17 +4717,6 @@ void CheckIfUseWinograd(bool *use_winograd, int *output_unit, ConvParameter *con
     *output_unit = SelectOutputUnit(conv_param);
     if (*output_unit > 1) {
       *use_winograd = true;
-      int input_unit = conv_param->kernel_h_ + *output_unit - 1;
-      input_trans_func = GetInputTransFunc(input_unit);
-      if (input_trans_func == NULL) {
-        *use_winograd = false;
-      }
-      output_trans_func = GetOutputTransFunc(input_unit, *output_unit);
-      if (output_trans_func == NULL) {
-        *use_winograd = false;
-      }
-    } else {
-      *use_winograd = false;
     }
   } else {
     *use_winograd = false;
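The rewritten cost model keeps the old Im2col-versus-Winograd comparison but folds the input-transform and GEMM terms into one expression, subtracts a penalty proportional to the transform tile area, and only commits to winograd if some unit beats rate 1. A compact standalone sketch of that model (constants mirror the hunk; transform-function availability, which the real code checks with GetOutputTransFunc, is assumed for every candidate here):

#include <stdio.h>

#define UP_DIV(x, y) (((x) + (y) - 1) / (y))

static int SelectUnitSketch(int kh, int kw, int in_c, int out_c, int out_h, int out_w,
                            int min_unit, int max_unit) {
  float common_cost = (float)out_h * out_w * in_c * out_c * kh * kw;
  int unit = 0;
  float max_rate = 0.0f;
  for (int i = min_unit; i <= max_unit; ++i) {
    int input_unit = i + kw - 1;
    float penalty = ((float)input_unit * input_unit) / ((float)kh * kw) * 0.12f;
    float wino_cost = ((2 + out_c) * (float)input_unit * input_unit * in_c +
                       ((float)input_unit + i) * i * out_c) *
                      UP_DIV(out_w, i) * UP_DIV(out_h, i);
    float rate = common_cost / wino_cost - penalty;
    if (rate > max_rate) {
      max_rate = rate;
      unit = i;
    }
  }
  return max_rate < 1.0f ? 1 : unit; /* 1 means: keep conventional convolution */
}

int main(void) {
  /* 3x3 kernel, 64 -> 64 channels, 56x56 output, candidate units 2..8 */
  printf("unit = %d\n", SelectUnitSketch(3, 3, 64, 64, 56, 56, 2, 8));
  return 0;
}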
diff --git a/mindspore/lite/src/ops/conv2d.cc b/mindspore/lite/src/ops/conv2d.cc
index cdf634875e6..bd000233f31 100644
--- a/mindspore/lite/src/ops/conv2d.cc
+++ b/mindspore/lite/src/ops/conv2d.cc
@@ -376,10 +376,18 @@ void Conv2D::ConvInferShape(int input_h, int input_w, int *output_h, int *output
     *output_h = std::ceil(static_cast<float>(input_h) / static_cast<float>(stride_h));
     auto pad_h_all = ((*output_h - 1) * stride_h + (kernel_h - 1) * dilate_h + 1 - input_h);
     auto pad_w_all = ((*output_w - 1) * stride_w + (kernel_w - 1) * dilate_w + 1 - input_w);
-    pad_u_ = pad_h_all / 2;
-    pad_d_ = pad_h_all - pad_u_;
-    pad_l_ = pad_w_all / 2;
-    pad_r_ = pad_w_all - pad_l_;
+    if (pad_h_all < 0) {
+      pad_u_ = pad_d_ = 0;
+    } else {
+      pad_u_ = pad_h_all / 2;
+      pad_d_ = pad_h_all - pad_u_;
+    }
+    if (pad_w_all < 0) {
+      pad_l_ = pad_r_ = 0;
+    } else {
+      pad_l_ = pad_w_all / 2;
+      pad_r_ = pad_w_all - pad_l_;
+    }
   } else {
     *output_w = std::ceil((static_cast<float>(input_w) + pad_l_ + pad_r_ -
                            (static_cast<float>(kernel_w) - 1) * static_cast<float>(dilate_w)) /
diff --git a/mindspore/lite/src/ops/deconv2d.cc b/mindspore/lite/src/ops/deconv2d.cc
index f50bf63297e..0e3fc989282 100644
--- a/mindspore/lite/src/ops/deconv2d.cc
+++ b/mindspore/lite/src/ops/deconv2d.cc
@@ -126,14 +126,12 @@ int DeConv2D::InferShape(std::vector inputs_, std::vecto
   output->set_shape(out_shape);
 
   if (pad_mode == schema::PadMode_SAME) {
-    pad_h_ = ((input_h - 1) * stride_h + (kernel_h - 1) * dilate_h + 1 - output_h) / 2;
-    pad_w_ = ((input_w - 1) * stride_w + (kernel_w - 1) * dilate_w + 1 - output_w) / 2;
+    pad_u_ = ((input_h - 1) * stride_h + (kernel_h - 1) * dilate_h + 1 - output_h) / 2;
+    pad_l_ = ((input_w - 1) * stride_w + (kernel_w - 1) * dilate_w + 1 - output_w) / 2;
   } else if (pad_mode == schema::PadMode_VALID) {
-    pad_h_ = 0;
-    pad_w_ = 0;
+    pad_u_ = 0;
+    pad_l_ = 0;
   } else if (pad_mode == schema::PadMode_CAFFE) {
-    pad_h_ = pad_u_;
-    pad_w_ = pad_l_;
   } else {
     MS_LOG(ERROR) << "unsupported pad mode for deconv";
   }
diff --git a/mindspore/lite/src/ops/deconv2d.h b/mindspore/lite/src/ops/deconv2d.h
index ab20b156ba1..89b7e9fccc5 100644
--- a/mindspore/lite/src/ops/deconv2d.h
+++ b/mindspore/lite/src/ops/deconv2d.h
@@ -74,16 +74,12 @@ class DeConv2D : public PrimitiveC {
   int PadDown() const { return this->pad_d_; }
   int PadLeft() const { return this->pad_l_; }
   int PadRight() const { return this->pad_r_; }
-  int PadH() const { return this->pad_h_; }
-  int PadW() const { return this->pad_w_; }
 
 protected:
   int pad_u_ = 0;
   int pad_d_ = 0;
   int pad_l_ = 0;
   int pad_r_ = 0;
-  int pad_h_ = 0;
-  int pad_w_ = 0;
 };
 }  // namespace lite
 }  // namespace mindspore
diff --git a/mindspore/lite/src/ops/pooling.cc b/mindspore/lite/src/ops/pooling.cc
index 6495f868ff3..ffdddfece20 100644
--- a/mindspore/lite/src/ops/pooling.cc
+++ b/mindspore/lite/src/ops/pooling.cc
@@ -170,10 +170,18 @@ int Pooling::InferShape(std::vector inputs_, std::vector
    output_h = std::ceil(static_cast<float>(input_h) / static_cast<float>(GetStrideH()));
    auto pad_h_all = ((output_h - 1) * GetStrideH() + (window_h - 1) + 1 - input_h);
    auto pad_w_all = ((output_w - 1) * GetStrideW() + (window_w - 1) + 1 - input_w);
-    pad_u_ = pad_h_all / 2;
-    pad_d_ = pad_h_all - pad_u_;
-    pad_l_ = pad_w_all / 2;
-    pad_r_ = pad_w_all - pad_l_;
+    if (pad_h_all < 0) {
+      pad_u_ = pad_d_ = 0;
+    } else {
+      pad_u_ = pad_h_all / 2;
+      pad_d_ = pad_h_all - pad_u_;
+    }
+    if (pad_w_all < 0) {
+      pad_l_ = pad_r_ = 0;
+    } else {
+      pad_l_ = pad_w_all / 2;
+      pad_r_ = pad_w_all - pad_l_;
+    }
   } else {
     auto round_mode = (schema::RoundMode)GetRoundMode();
     if (round_mode == schema::RoundMode_FLOOR) {
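The conv2d and pooling infer-shape fixes share one rule: with SAME semantics the total padding formula can go negative when the stride exceeds the window, and a negative total must clamp to zero instead of splitting into negative per-side pads. A sketch with a case that used to go wrong:

#include <stdio.h>

/* Sketch of the SAME-padding split with the new negative clamp. */
static void SplitPad(int pad_all, int *before, int *after) {
  if (pad_all < 0) {
    *before = *after = 0; /* stride > window: no padding, not negative pads */
  } else {
    *before = pad_all / 2;
    *after = pad_all - *before;
  }
}

int main(void) {
  int u, d;
  /* kernel 1, stride 4, input 7: output = ceil(7/4) = 2,
   * pad_all = (2 - 1) * 4 + (1 - 1) + 1 - 7 = -2.
   * The old code produced pad_u = -1, pad_d = -1; now both clamp to 0. */
  SplitPad(-2, &u, &d);
  printf("pad_u=%d pad_d=%d\n", u, d);
  return 0;
}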
diff --git a/mindspore/lite/src/populate_parameter.cc b/mindspore/lite/src/populate_parameter.cc
index 98da74c0b99..9106496e82e 100644
--- a/mindspore/lite/src/populate_parameter.cc
+++ b/mindspore/lite/src/populate_parameter.cc
@@ -376,8 +376,6 @@ OpParameter *PopulateConvParameter(const mindspore::lite::PrimitiveC *primitive)
   conv_param->pad_d_ = conv2d_lite_primitive->PadDown();
   conv_param->pad_l_ = conv2d_lite_primitive->PadLeft();
   conv_param->pad_r_ = conv2d_lite_primitive->PadRight();
-  conv_param->pad_h_ = conv2d_lite_primitive->PadUp();
-  conv_param->pad_w_ = conv2d_lite_primitive->PadLeft();
   conv_param->dilation_h_ = conv_primitive->GetDilateH();
   conv_param->dilation_w_ = conv_primitive->GetDilateW();
   conv_param->input_channel_ = conv_primitive->GetChannelIn();
@@ -386,16 +384,13 @@ OpParameter *PopulateConvParameter(const mindspore::lite::PrimitiveC *primitive)
   auto act_type = conv_primitive->GetActivationType();
   switch (act_type) {
     case schema::ActivationType_RELU:
-      conv_param->is_relu_ = true;
-      conv_param->is_relu6_ = false;
+      conv_param->act_type_ = ActType_Relu;
       break;
     case schema::ActivationType_RELU6:
-      conv_param->is_relu_ = false;
-      conv_param->is_relu6_ = true;
+      conv_param->act_type_ = ActType_Relu6;
      break;
    default:
-      conv_param->is_relu_ = false;
-      conv_param->is_relu6_ = false;
+      conv_param->act_type_ = ActType_No;
      break;
  }
  return reinterpret_cast<OpParameter *>(conv_param);
@@ -422,23 +417,18 @@ OpParameter *PopulateConvDwParameter(const mindspore::lite::PrimitiveC *primitiv
   conv_param->pad_d_ = convdw_lite_primitive->PadDown();
   conv_param->pad_l_ = convdw_lite_primitive->PadLeft();
   conv_param->pad_r_ = convdw_lite_primitive->PadRight();
-  conv_param->pad_h_ = convdw_lite_primitive->PadUp();
-  conv_param->pad_w_ = convdw_lite_primitive->PadLeft();
   conv_param->dilation_h_ = conv_primitive->GetDilateH();
   conv_param->dilation_w_ = conv_primitive->GetDilateW();
   auto act_type = conv_primitive->GetActivationType();
   switch (act_type) {
     case schema::ActivationType_RELU:
-      conv_param->is_relu_ = true;
-      conv_param->is_relu6_ = false;
+      conv_param->act_type_ = ActType_Relu;
       break;
     case schema::ActivationType_RELU6:
-      conv_param->is_relu_ = false;
-      conv_param->is_relu6_ = true;
+      conv_param->act_type_ = ActType_Relu6;
      break;
    default:
-      conv_param->is_relu_ = false;
-      conv_param->is_relu6_ = false;
+      conv_param->act_type_ = ActType_No;
      break;
  }
  return reinterpret_cast<OpParameter *>(conv_param);
@@ -464,23 +454,18 @@ OpParameter *PopulateDeconvDwParameter(const mindspore::lite::PrimitiveC *primit
   conv_param->pad_d_ = deconvdw_lite_primitive->PadDown();
   conv_param->pad_l_ = deconvdw_lite_primitive->PadLeft();
   conv_param->pad_r_ = deconvdw_lite_primitive->PadRight();
-  conv_param->pad_h_ = deconvdw_lite_primitive->PadUp();
-  conv_param->pad_w_ = deconvdw_lite_primitive->PadLeft();
   conv_param->dilation_h_ = conv_primitive->GetDilateH();
   conv_param->dilation_w_ = conv_primitive->GetDilateW();
   auto act_type = conv_primitive->GetActivationType();
   switch (act_type) {
     case schema::ActivationType_RELU:
-      conv_param->is_relu_ = true;
-      conv_param->is_relu6_ = false;
+      conv_param->act_type_ = ActType_Relu;
       break;
     case schema::ActivationType_RELU6:
-      conv_param->is_relu_ = false;
-      conv_param->is_relu6_ = true;
+      conv_param->act_type_ = ActType_Relu6;
      break;
    default:
-      conv_param->is_relu_ = false;
-      conv_param->is_relu6_ = false;
+      conv_param->act_type_ = ActType_No;
      break;
  }
  return reinterpret_cast<OpParameter *>(conv_param);
@@ -506,23 +491,18 @@ OpParameter *PopulateDeconvParameter(const mindspore::lite::PrimitiveC *primitiv
   conv_param->pad_d_ = deconv_lite_primitive->PadDown();
   conv_param->pad_l_ = deconv_lite_primitive->PadLeft();
   conv_param->pad_r_ = deconv_lite_primitive->PadRight();
-  conv_param->pad_h_ = deconv_lite_primitive->PadH();
-  conv_param->pad_w_ = deconv_lite_primitive->PadW();
   conv_param->dilation_h_ = conv_primitive->GetDilateH();
   conv_param->dilation_w_ = conv_primitive->GetDilateW();
   auto act_type = conv_primitive->GetActivationType();
   switch (act_type) {
     case schema::ActivationType_RELU:
-      conv_param->is_relu_ = true;
-      conv_param->is_relu6_ = false;
+      conv_param->act_type_ = ActType_Relu;
       break;
     case schema::ActivationType_RELU6:
-      conv_param->is_relu_ = false;
-      conv_param->is_relu6_ = true;
+      conv_param->act_type_ = ActType_Relu6;
      break;
    default:
-      conv_param->is_relu_ = false;
-      conv_param->is_relu6_ = false;
+      conv_param->act_type_ = ActType_No;
      break;
  }
  return reinterpret_cast<OpParameter *>(conv_param);
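All four Populate*Parameter switches collapse the mutually exclusive is_relu_/is_relu6_ pair into the single ActType enum now living in op_base.h; kernels that still want booleans derive them by comparison. A sketch of both directions (SchemaAct stands in for schema::ActivationType; the ActType definition matches nnacl):

typedef enum ActTypeSketch { Act_No, Act_Relu, Act_Relu6 } ActTypeSketch;
typedef enum SchemaAct { SCHEMA_NO_ACTIVATION, SCHEMA_RELU, SCHEMA_RELU6 } SchemaAct;

static ActTypeSketch FromSchema(SchemaAct a) {
  switch (a) {
    case SCHEMA_RELU:
      return Act_Relu;
    case SCHEMA_RELU6:
      return Act_Relu6;
    default:
      return Act_No;
  }
}

/* Consumers recover the old flags on demand, as the fp16/int8 kernels now do: */
static void ToFlags(ActTypeSketch t, int *relu, int *relu6) {
  *relu = (t == Act_Relu);
  *relu6 = (t == Act_Relu6);
}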
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc b/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc
index deb0e277ea4..6e7388801b5 100644
--- a/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc
@@ -322,10 +322,12 @@ int ConvolutionBaseCPUKernel::SetQuantParam() {
     return ret;
   }
   // now only consider per tensor for output
-  CalculateActivationRangeQuantized(
-    conv_param_->is_relu_, conv_param_->is_relu6_, conv_param_->conv_quant_arg_.output_quant_args_[0].zp_,
-    conv_param_->conv_quant_arg_.output_quant_args_[0].scale_, &conv_param_->conv_quant_arg_.out_act_min_[0],
-    &conv_param_->conv_quant_arg_.out_act_max_[0]);
+  bool relu = conv_param_->act_type_ == ActType_Relu;
+  bool relu6 = conv_param_->act_type_ == ActType_Relu6;
+  CalculateActivationRangeQuantized(relu, relu6, conv_param_->conv_quant_arg_.output_quant_args_[0].zp_,
+                                    conv_param_->conv_quant_arg_.output_quant_args_[0].scale_,
+                                    &conv_param_->conv_quant_arg_.out_act_min_[0],
+                                    &conv_param_->conv_quant_arg_.out_act_max_[0]);
   return RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
index b5babf681d6..2ccc2957fe6 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
@@ -38,8 +38,7 @@ int Convolution1x1FP16CPUKernel::InitMatmulParam() {
   matmul_param_->deep_ = conv_param_->input_channel_;
   matmul_param_->row_16_ = UP_ROUND(matmul_param_->row_, C16NUM);
   matmul_param_->col_8_ = UP_ROUND(matmul_param_->col_, C8NUM);
-  matmul_param_->act_type_ = (conv_param_->is_relu6_) ? ActType_Relu6 : ActType_No;
-  matmul_param_->act_type_ = (conv_param_->is_relu_) ? ActType_Relu : matmul_param_->act_type_;
+  matmul_param_->act_type_ = conv_param_->act_type_;
   return RET_OK;
 }
 
@@ -57,7 +56,7 @@ Convolution1x1FP16CPUKernel::~Convolution1x1FP16CPUKernel() {
 }
 
 int Convolution1x1FP16CPUKernel::InitConv1x1Param() {
-  pre_trans_input_ = (conv_param_->pad_h_ != 0 || conv_param_->pad_w_ != 0 || conv_param_->stride_h_ != 1 ||
+  pre_trans_input_ = (conv_param_->pad_u_ != 0 || conv_param_->pad_l_ != 0 || conv_param_->stride_h_ != 1 ||
                       conv_param_->stride_w_ != 1);
 
   thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, C8NUM));
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc
index 883d7bca0b7..8d11db725c9 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc
@@ -237,8 +237,8 @@ int Convolution3x3FP16CPUKernel::Run() {
   }
 
   // get real output
-  bool relu = conv_param_->is_relu_;
-  bool relu6 = conv_param_->is_relu6_;
+  bool relu = conv_param_->act_type_ == ActType_Relu;
+  bool relu6 = conv_param_->act_type_ == ActType_Relu6;
   if (relu) {
     UnPack3x3ReluOutputFp16(tmp_out_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
                             conv_param_->output_w_, conv_param_->output_channel_);
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
index 0f55fe633bc..ef2235b2895 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
@@ -391,10 +391,10 @@ int ConvolutionWinogradFP16CPUKernel::Run() {
   }
 
   // get real output
-  if (conv_param_->is_relu_) {
+  if (conv_param_->act_type_ == ActType_Relu) {
     UnPackWinogradReluOutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
                                  conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
-  } else if (conv_param_->is_relu6_) {
+  } else if (conv_param_->act_type_ == ActType_Relu6) {
     UnPackWinogradRelu6OutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
                                   conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
   } else {
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc
index 7d83c4fba1a..11964e9e6e4 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc
@@ -232,34 +232,31 @@ kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector
   auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
   int kernel_h = conv_param->kernel_h_;
   int kernel_w = conv_param->kernel_w_;
-  int stride_h = conv_param->stride_h_;
-  int stride_w = conv_param->stride_w_;
-  int dilation_h = conv_param->dilation_h_;
-  int dilation_w = conv_param->dilation_w_;
   conv_param->input_h_ = inputs.front()->Height();
   conv_param->input_w_ = inputs.front()->Width();
+  conv_param->input_channel_ = inputs.front()->Channel();
   conv_param->output_h_ = outputs.front()->Height();
   conv_param->output_w_ = outputs.front()->Width();
+  conv_param->output_channel_ = outputs.front()->Channel();
+  conv_param->op_parameter_.thread_num_ = ctx->thread_num_;
   bool use_winograd = false;
-  bool use_sw = false;
   int out_unit;
   InputTransformUnitFunc input_trans_func = nullptr;
   OutputTransformUnitFunc output_trans_func = nullptr;
   if (primitive != nullptr && primitive->GetInferFlag()) {
     CheckIfUseWinograd(&use_winograd, &out_unit, conv_param, input_trans_func, output_trans_func);
-    use_sw = CheckIfUseSlideWindow(conv_param);
   }
   kernel::LiteKernel *kernel;
   if (kernel_h == 1 && kernel_w == 1) {
     kernel = new (std::nothrow) kernel::Convolution1x1CPUKernel(op_parameter, inputs, outputs, ctx, primitive);
-  } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) {
-    kernel = new (std::nothrow) kernel::Convolution3x3CPUKernel(op_parameter, inputs, outputs, ctx, primitive);
   } else if (use_winograd) {
-    kernel =
-      new (std::nothrow) kernel::ConvolutionWinogradCPUKernel(op_parameter, inputs, outputs, ctx, primitive, out_unit);
-  } else if (use_sw) {
-    kernel = new (std::nothrow) kernel::ConvolutionSWCPUKernel(op_parameter, inputs, outputs, ctx, primitive);
+    if (kernel_h == 3 && kernel_w == 3 && out_unit == 2) {
+      kernel = new (std::nothrow) kernel::Convolution3x3CPUKernel(op_parameter, inputs, outputs, ctx, primitive);
+    } else {
+      kernel = new (std::nothrow)
+        kernel::ConvolutionWinogradCPUKernel(op_parameter, inputs, outputs, ctx, primitive, out_unit);
+    }
   } else {
     kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, primitive);
   }
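Kernel selection is now driven by the cost model rather than by shape alone: the dedicated 3x3 kernel is chosen only when winograd is profitable and the selected output unit is 2 (the F(2x2, 3x3) tiling it implements), the generic winograd kernel handles the other units, and the sliding-window path is dropped. A sketch of the resulting decision order (names are illustrative, not the C++ API):

typedef enum ConvKernelKind { KERNEL_1X1, KERNEL_3X3_WINOGRAD, KERNEL_WINOGRAD, KERNEL_IM2COL } ConvKernelKind;

static ConvKernelKind SelectConvKernel(int kernel_h, int kernel_w, int use_winograd, int out_unit) {
  if (kernel_h == 1 && kernel_w == 1) {
    return KERNEL_1X1; /* checked first: 1x1 never benefits from winograd */
  }
  if (use_winograd) {
    /* the dedicated 3x3 kernel implements exactly the F(2x2, 3x3) transform */
    return (kernel_h == 3 && kernel_w == 3 && out_unit == 2) ? KERNEL_3X3_WINOGRAD : KERNEL_WINOGRAD;
  }
  return KERNEL_IM2COL; /* conventional convolution; the old sliding-window path is gone */
}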
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc
index 1e5556bcade..62d7afce114 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc
@@ -65,8 +65,7 @@ void Convolution1x1CPUKernel::InitConv1x1MatmulParam() {
   matmul_param_->deep_ = conv_param_->input_channel_;
   matmul_param_->row_12_ = UP_ROUND(matmul_param_->row_, C12NUM);
   matmul_param_->col_8_ = UP_ROUND(matmul_param_->col_, C8NUM);
-  matmul_param_->act_type_ = (conv_param_->is_relu6_) ? ActType_Relu6 : ActType_No;
-  matmul_param_->act_type_ = (conv_param_->is_relu_) ? ActType_Relu : matmul_param_->act_type_;
+  matmul_param_->act_type_ = conv_param_->act_type_;
   return;
 }
 
@@ -98,7 +97,7 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() {
 }
 
 int Convolution1x1CPUKernel::InitConv1x1Param() {
-  pre_trans_input_ = (conv_param_->pad_h_ != 0 || conv_param_->pad_w_ != 0 || conv_param_->stride_h_ != 1 ||
+  pre_trans_input_ = (conv_param_->pad_u_ != 0 || conv_param_->pad_l_ != 0 || conv_param_->stride_h_ != 1 ||
                       conv_param_->stride_w_ != 1);
 
   thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, C8NUM));
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc
index d30a9c4402c..a0387e01653 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc
@@ -94,7 +94,9 @@ int Convolution3x3CPUKernel::InitWeightBias() {
 }
 
 int Convolution3x3CPUKernel::InitTmpBuffer() {
+  int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
   int oC4 = UP_DIV(conv_param_->output_channel_, C4NUM);
+  int oC8 = UP_DIV(conv_param_->output_channel_, C8NUM);
   const int k_plane = 16;
   MS_ASSERT(ctx_->allocator != nullptr);
 
@@ -105,13 +107,20 @@ int Convolution3x3CPUKernel::InitTmpBuffer() {
     return RET_ERROR;
   }
 
-  size_t tmp_dst_buffer_size = thread_count_ * TILE_NUM * k_plane * oC4 * C4NUM * sizeof(float);
+  size_t tmp_dst_buffer_size = thread_count_ * C12NUM * k_plane * oC8 * C8NUM * sizeof(float);
   tmp_dst_buffer_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(tmp_dst_buffer_size));
   if (tmp_dst_buffer_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_dst_buffer_ failed.";
     return RET_ERROR;
   }
 
+  size_t col_buffer_size = thread_count_ * C12NUM * C4NUM * ic4 * sizeof(float);
+  col_buffer_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(col_buffer_size));
+  if (col_buffer_ == nullptr) {
+    MS_LOG(ERROR) << "malloc col_buffer_ failed.";
+    return RET_ERROR;
+  }
+
   size_t nc4hw4_out_size =
     oC4 * C4NUM * conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * sizeof(float);
   nc4hw4_out_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(nc4hw4_out_size));
@@ -124,6 +133,7 @@ int Convolution3x3CPUKernel::InitTmpBuffer() {
   tmp_buffer_address_list_[1] = block_unit_buffer_;
   tmp_buffer_address_list_[2] = tmp_dst_buffer_;
   tmp_buffer_address_list_[3] = nc4hw4_out_;
+  tmp_buffer_address_list_[4] = col_buffer_;
   return RET_OK;
 }
 
@@ -182,7 +192,7 @@ int Convolution3x3CPUKernel::ReSize() {
   }
   memset(nhwc4_input_, 0, nhwc4_input_size);
 
-  size_t tile_buffer_size = thread_count_ * TILE_NUM * C16NUM * iC4 * C4NUM * sizeof(float);
+  size_t tile_buffer_size = thread_count_ * C12NUM * C16NUM * iC4 * C4NUM * sizeof(float);
   tile_buffer_ = reinterpret_cast<float *>(malloc(tile_buffer_size));
   if (tile_buffer_ == nullptr) {
     MS_LOG(ERROR) << "malloc tile buffer failed.";
@@ -237,8 +247,8 @@ int Convolution3x3CPUKernel::Run() {
     return RET_ERROR;
   }
 
-  auto is_relu = conv_param_->is_relu_;
-  auto is_relu6 = conv_param_->is_relu6_;
+  auto is_relu = conv_param_->act_type_ == ActType_Relu;
+  auto is_relu6 = conv_param_->act_type_ == ActType_Relu6;
   auto output_addr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->Data());
   if (is_relu) {
     PackNC4HW4ToNHWCReluFp32(nc4hw4_out_, output_addr, conv_param_->output_batch_,
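The new scratch-buffer sizes all follow from the switch to a 12-tile GEMM block: per thread, the tile buffer holds C12NUM tiles of 16 transform points times ic4*4 input channels, the GEMM destination holds C12NUM x 16 x oc8*8, and the added col_buffer stages C12NUM x ic4*4 rows for the packed matmul. A sketch of the byte counts under those assumptions:

#include <stdio.h>

#define UP_DIV(x, y) (((x) + (y) - 1) / (y))
enum { C4_ = 4, C8_ = 8, C12_ = 12, K_PLANE = 16 }; /* 16 = 4x4 transform points */

int main(void) {
  int in_c = 96, out_c = 256, threads = 4; /* illustrative shapes */
  int ic4 = UP_DIV(in_c, C4_), oc8 = UP_DIV(out_c, C8_);
  size_t tile = (size_t)threads * C12_ * K_PLANE * ic4 * C4_ * sizeof(float);
  size_t dst = (size_t)threads * C12_ * K_PLANE * oc8 * C8_ * sizeof(float);
  size_t col = (size_t)threads * C12_ * ic4 * C4_ * sizeof(float);
  printf("tile=%zu dst=%zu col=%zu bytes\n", tile, dst, col);
  return 0;
}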
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h
index 383c8f1bc62..acae0778964 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h
@@ -60,14 +60,19 @@ class Convolution3x3CPUKernel : public ConvolutionBaseCPUKernel {
       ctx_->allocator->Free(nc4hw4_out_);
       nc4hw4_out_ = nullptr;
     }
+    if (col_buffer_ != nullptr) {
+      ctx_->allocator->Free(col_buffer_);
+      col_buffer_ = nullptr;
+    }
   }
 
   float *transformed_filter_addr_ = nullptr;
   float *tile_buffer_ = nullptr;
   float *block_unit_buffer_ = nullptr;
   float *tmp_dst_buffer_ = nullptr;
+  float *col_buffer_ = nullptr;
   float *nc4hw4_out_ = nullptr;
-  TmpBufferAddress tmp_buffer_address_list_[4];
+  TmpBufferAddress tmp_buffer_address_list_[5];
   GEMM_FUNC_FP32 gemm_func_ = nullptr;
 };
 void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_param, int oc_block, int oc_block_num);
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
index 29504aa36d9..2262566480b 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
@@ -76,7 +76,7 @@ int WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int
     int out_c_block = i / oc_block;
     int out_c_res = i % oc_block;
     int input_oz_offset = i * kernel_unit * kernel_unit * channel_in;
-    int output_oz_offset = out_c_block * strides[1] * input_unit * input_unit + out_c_res;
+    int output_oz_offset = out_c_block * strides[1] + out_c_res;
     for (int j = 0; j < channel_in; j++) {
       int ic4_block = j / C4NUM;
       int ic4_res = j % C4NUM;
@@ -93,7 +93,7 @@ int WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int
       MatrixMultiply(tmp_data, matrix_gt_data, trans_out_data, input_unit, kernel_unit, input_unit, row);
 
       for (int z = 0; z < input_unit_square; z++) {
-        int output_xy_offset = output_iz_offset + z * strides[1];
+        int output_xy_offset = output_iz_offset + z * strides[0];
         *(trans_weight_data + output_xy_offset) = trans_out_data[z];
       }
     }
@@ -151,7 +151,7 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() {
 
 int ConvolutionWinogradCPUKernel::MallocFilterMatrix(int oc_block, int oc_block_num) {
   int channel_in = conv_param_->input_channel_;
-  int ic4 = UP_DIV(channel_in, BLOCK);
+  int ic4 = UP_DIV(channel_in, C4NUM);
 
   // set data
   auto trans_matrix_data_size = input_unit_ * input_unit_ * ic4 * C4NUM * oc_block_num * oc_block * sizeof(float);
@@ -196,10 +196,12 @@ int ConvolutionWinogradCPUKernel::InitTmpBuffer() {
   int output_h = conv_param_->output_h_;
   int output_w = conv_param_->output_w_;
   int oc4 = UP_DIV(channel_out, C4NUM);
+  int oc8 = UP_DIV(channel_out, C8NUM);
+  int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
   MS_ASSERT(ctx_->allocator != nullptr);
 
   gemm_out_ = reinterpret_cast<float *>(
-    ctx_->allocator->Malloc(thread_count_ * TILE_NUM * input_unit_ * input_unit_ * oc4 * C4NUM * sizeof(float)));
+    ctx_->allocator->Malloc(thread_count_ * C12NUM * input_unit_ * input_unit_ * oc8 * C8NUM * sizeof(float)));
   if (gemm_out_ == nullptr) {
     MS_LOG(ERROR) << "malloc gemm_out_ failed.";
     return RET_ERROR;
@@ -222,10 +224,18 @@ int ConvolutionWinogradCPUKernel::InitTmpBuffer() {
     return RET_ERROR;
   }
 
+  col_buffer_ =
+    reinterpret_cast<float *>(ctx_->allocator->Malloc(thread_count_ * C12NUM * ic4 * C4NUM * sizeof(float)));
+  if (col_buffer_ == nullptr) {
+    MS_LOG(ERROR) << "malloc col_buffer_ failed.";
+    return RET_ERROR;
+  }
+
   tmp_buffer_address_list_[0] = trans_input_;
   tmp_buffer_address_list_[1] = gemm_out_;
   tmp_buffer_address_list_[2] = tmp_out_data_;
   tmp_buffer_address_list_[3] = tmp_data_;
+  tmp_buffer_address_list_[4] = col_buffer_;
   return RET_OK;
 }
 
@@ -306,7 +316,7 @@ int ConvolutionWinogradCPUKernel::ReSize() {
   }
   memset(nhwc4_input_, 0, nhwc4_input_size);
 
-  size_t tile_buffer_size = thread_count_ * TILE_NUM * input_unit_ * input_unit_ * ic4 * C4NUM * sizeof(float);
+  size_t tile_buffer_size = thread_count_ * C12NUM * input_unit_ * input_unit_ * ic4 * C4NUM * sizeof(float);
   trans_input_ = reinterpret_cast<float *>(malloc(tile_buffer_size));
   if (trans_input_ == nullptr) {
     MS_LOG(ERROR) << "malloc trans_input_ failed.";
@@ -370,10 +380,10 @@ int ConvolutionWinogradCPUKernel::Run() {
   // get real output
   auto out_tensor = out_tensors_.front();
   auto out_data = reinterpret_cast<float *>(out_tensor->Data());
-  if (conv_param_->is_relu_) {
+  if (conv_param_->act_type_ == ActType_Relu) {
     UnPackWinogradReluOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
                              conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
-  } else if (conv_param_->is_relu6_) {
+  } else if (conv_param_->act_type_ == ActType_Relu6) {
     UnPackWinogradRelu6Output(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
                               conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
   } else {
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h
index 44ff046cb56..584dcde4efb 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h
@@ -66,6 +66,10 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
       ctx_->allocator->Free(tmp_out_data_);
       tmp_out_data_ = nullptr;
     }
+    if (col_buffer_ != nullptr) {
+      ctx_->allocator->Free(col_buffer_);
+      col_buffer_ = nullptr;
+    }
   }
   int kernel_unit_;
   int input_unit_;
@@ -74,6 +78,7 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
   float *trans_input_ = nullptr;
   float *gemm_out_ = nullptr;
   float *tmp_out_data_ = nullptr;
+  float *col_buffer_ = nullptr;
   Matrix *trans_weight_ = nullptr;
   InputTransformUnitFunc input_trans_func_;
   OutputTransformUnitFunc output_trans_func_;
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc
index 39d32f7a8aa..9e8c7968bea 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc
@@ -146,7 +146,7 @@ int Convolution1x1Int8CPUKernel::Init() {
 }
 
 int Convolution1x1Int8CPUKernel::InitParam() {
-  pre_trans_input_ = (conv_param_->pad_h_ != 0 || conv_param_->pad_w_ != 0 || conv_param_->stride_h_ != 1 ||
+  pre_trans_input_ = (conv_param_->pad_u_ != 0 || conv_param_->pad_l_ != 0 || conv_param_->stride_h_ != 1 ||
                       conv_param_->stride_w_ != 1);
 
   matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
index ae751adc316..5d2d33ab82f 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
@@ -36,7 +36,7 @@ int Conv2dTransposeOpenCLKernel::Init() {
     MS_LOG(ERROR) << "only support kh=kw=2 and stride_h=stride_w=2.";
     return RET_ERROR;
   }
-  if (param->pad_h_ != 0 || param->pad_w_ != 0) {
+  if (param->pad_u_ != 0 || param->pad_l_ != 0) {
     MS_LOG(ERROR) << "only support pad =0.";
     return RET_ERROR;
   }
@@ -170,7 +170,7 @@ int Conv2dTransposeOpenCLKernel::Run() {
   int co = out_tensors_[0]->Channel();
   int kh = param->kernel_h_;
   int kw = param->kernel_w_;
-  int pad = param->pad_h_;
+  int pad = param->pad_u_;
   int oh = out_tensors_[0]->Height();
   int ow = out_tensors_[0]->Width();
   int h = in_tensors_[0]->Height();
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
index 29ef635c42d..36b45ddb9a2 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
@@ -382,9 +382,9 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolution() {
     "  }\n\n";
 
   code += "  FLT4 out0_c4_bias = out0_c4 + bias[co_slice];\n";
-  if (param->is_relu_) {
+  if (param->act_type_ == ActType_Relu) {
     code += "  out0_c4_bias = max(out0_c4_bias, (FLT4)(0.0f));\n";
-  } else if (param->is_relu6_) {
+  } else if (param->act_type_ == ActType_Relu6) {
     code += "  out0_c4_bias = clamp(out0_c4_bias, (FLT4)(0.0f), (FLT4)(6.0f));\n";
   }
 
@@ -609,9 +609,9 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
     "    acc += bias[slice];\n";
 
   auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
-  if (param->is_relu_) {
+  if (param->act_type_ == ActType_Relu) {
     code += "    acc = max(acc, (float4)(0.0f));\n";
-  } else if (param->is_relu6_) {
+  } else if (param->act_type_ == ActType_Relu6) {
     code += "    acc = clamp(acc, (float4)(0.0f), (float4)(6.0f));\n";
   }
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
index b6c2e05b013..c3a9f681bc9 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
@@ -163,7 +163,7 @@ int DepthwiseConv2dOpenCLKernel::Run() {
   float relu_clip1 = 6.0;
   cl_int2 kernel_size = {parameter->kernel_h_, parameter->kernel_w_};
   cl_int2 stride = {parameter->stride_h_, parameter->stride_w_};
-  cl_int2 padding = {-parameter->pad_h_, -parameter->pad_w_};
+  cl_int2 padding = {-parameter->pad_u_, -parameter->pad_l_};
   cl_int2 dilation = {parameter->dilation_h_, parameter->dilation_w_};
   cl_int4 src_size = {in_tensors_[0]->Width(), in_tensors_[0]->Height(), (cl_int)CI4, in_tensors_[0]->Batch()};
   cl_int4 dst_size = {(cl_int)out_tensors_[0]->Width(), (cl_int)out_tensors_[0]->Height(), (cl_int)CO4,
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/common/pack_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/common/pack_tests.cc
index 5ad46139ba2..8220e31e475 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/common/pack_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/common/pack_tests.cc
@@ -47,8 +47,8 @@ void InitConvParamPack(ConvParameter *conv_param) {
   conv_param->dilation_h_ = 1;
   conv_param->dilation_w_ = 1;
 
-  conv_param->pad_h_ = 1;
-  conv_param->pad_w_ = 1;
+  conv_param->pad_u_ = 1;
+  conv_param->pad_l_ = 1;
 }
 
 TEST_F(TestPack, PackInputFp32) {
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp16/convolution_fp16_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp16/convolution_fp16_tests.cc
index 599e75739b1..bad7728b9ef 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp16/convolution_fp16_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp16/convolution_fp16_tests.cc
@@ -50,8 +50,8 @@ void InitConvParamGroup1Fp16(ConvParameter *conv_param) {
   conv_param->dilation_h_ = 1;
   conv_param->dilation_w_ = 1;
 
-  conv_param->pad_h_ = 1;
-  conv_param->pad_w_ = 1;
+  conv_param->pad_u_ = 1;
+  conv_param->pad_l_ = 1;
   conv_param->thread_num_ = 1;
 }
 
@@ -75,8 +75,8 @@ void InitConvParamGroup2Fp16(ConvParameter *conv_param) {
   conv_param->dilation_h_ = 1;
   conv_param->dilation_w_ = 1;
 
-  conv_param->pad_h_ = 1;
-  conv_param->pad_w_ = 1;
+  conv_param->pad_u_ = 1;
+  conv_param->pad_l_ = 1;
   conv_param->thread_num_ = 1;
 }
 
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/conv1x1_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/conv1x1_fp32_tests.cc
index 202faec8bc8..4fad7b355f3 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/conv1x1_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/conv1x1_fp32_tests.cc
@@ -50,7 +50,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack1) {
   conv_param->output_h_ = 4;
   conv_param->output_w_ = 5;
   conv_param->stride_h_ = conv_param->stride_w_ = 4;
-  conv_param->pad_h_ = conv_param->pad_w_ = 2;
+  conv_param->pad_u_ = conv_param->pad_l_ = 2;
 
   float out[20] = {0};
   Conv1x1InputPack(in, out, conv_param, sizeof(float));
@@ -91,7 +91,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack2) {
   conv_param->output_h_ = 7;
   conv_param->output_w_ = 4;
   conv_param->stride_h_ = conv_param->stride_w_ = 3;
-  conv_param->pad_h_ = conv_param->pad_w_ = 0;
+  conv_param->pad_u_ = conv_param->pad_l_ = 0;
 
   float out[28] = {0};
   Conv1x1InputPack(in, out, conv_param, sizeof(float));
@@ -105,7 +105,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack3) {
   conv_param->input_h_ = conv_param->input_w_ = 3;
   conv_param->output_h_ = conv_param->output_w_ = 3;
   conv_param->stride_h_ = conv_param->stride_w_ = 2;
-  conv_param->pad_h_ = conv_param->pad_w_ = 1;
+  conv_param->pad_u_ = conv_param->pad_l_ = 1;
 
   float in[] = {1.6767339, 12.25904,  19.018835, 3.0790641,  -9.252135, -8.685675, 3.6115494, 3.2282279, 17.025112,
                 -5.052577, 12.750252, 12.701241, -8.9477215, -9.080522, 19.03931,  -6.501229, -4.122992, 9.540845};
@@ -124,7 +124,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack4) {
   conv_param->input_h_ = conv_param->input_w_ = 3;
   conv_param->output_h_ = conv_param->output_w_ = 3;
   conv_param->stride_h_ = conv_param->stride_w_ = 2;
-  conv_param->pad_h_ = conv_param->pad_w_ = 1;
+  conv_param->pad_u_ = conv_param->pad_l_ = 1;
 
   float in[] = {4.1795, 13.142, -3.593, 16.505, 19.899, 8.5562, 19.969, -6.235, -2.380, -9.027, 9.5542,
                 18.974, 23.622, 8.3608, 47.325, -14.36, 15.370, 4.3049, -0.784, 37.925, -0.081, 6.1298,
                 0.6721, -1.517, 37.998, 13.719, 11.029, 1.7127, -1.770, 41.903, 9.0560, 14.988, 3.1866,
@@ -281,8 +281,8 @@ int Conv1x1TestInit1(std::vector *inputs_, std::vector
   conv_param->kernel_h_ = conv_param->kernel_w_ = 1;
   conv_param->stride_h_ = conv_param->stride_w_ = 2;
   conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
-  conv_param->pad_h_ = conv_param->pad_w_ = 1;
-  conv_param->is_relu_ = conv_param->is_relu6_ = false;
+  conv_param->pad_u_ = conv_param->pad_l_ = 1;
+  conv_param->act_type_ = ActType_No;
   return out_t->ElementsNum();
 }
 
@@ -348,9 +348,8 @@ int Conv1x1TestInit2(std::vector *inputs_, std::vector
   conv_param->kernel_h_ = conv_param->kernel_w_ = 1;
   conv_param->stride_h_ = conv_param->stride_w_ = 1;
   conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
-  conv_param->pad_h_ = conv_param->pad_w_ = 0;
-  conv_param->is_relu_ = false;
-  conv_param->is_relu6_ = false;
+  conv_param->pad_u_ = conv_param->pad_l_ = 0;
+  conv_param->act_type_ = ActType_No;
   return out_t->ElementsNum();
 }
 
diff --git
 a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32_tests.cc
index 38c25d694b8..becf047a691 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32_tests.cc
@@ -47,8 +47,8 @@ void InitConvDwParam(ConvParameter *conv_param) {
   conv_param->dilation_h_ = 1;
   conv_param->dilation_w_ = 1;
 
-  conv_param->pad_h_ = 1;
-  conv_param->pad_w_ = 1;
+  conv_param->pad_u_ = 1;
+  conv_param->pad_l_ = 1;
 }
 
 void InitConvDwCreator(std::vector *inputs, std::vector *outputs,
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/deconvolution_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/deconvolution_fp32_tests.cc
index 3b6ecadd9e1..4549f6d6417 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/deconvolution_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/deconvolution_fp32_tests.cc
@@ -468,7 +468,7 @@ int DeConvTestInit1(std::vector *inputs_, std::vector
   conv_param->kernel_h_ = conv_param->kernel_w_ = 3;
   conv_param->stride_h_ = conv_param->stride_w_ = 2;
   conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
-  conv_param->pad_h_ = conv_param->pad_w_ = 1;
+  conv_param->pad_u_ = conv_param->pad_l_ = 1;
   return out_t->ElementsNum();
 }
 
@@ -537,7 +537,7 @@ int DeConvTestInit2(std::vector *inputs_, std::vector
   conv_param->kernel_h_ = conv_param->kernel_w_ = 3;
   conv_param->stride_h_ = conv_param->stride_w_ = 2;
   conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
-  conv_param->pad_h_ = conv_param->pad_w_ = 1;
+  conv_param->pad_u_ = conv_param->pad_l_ = 1;
   return out_t->ElementsNum();
 }
 
@@ -616,7 +616,7 @@ int DeConvTestInit3(std::vector *inputs_, std::vector
   conv_param->kernel_h_ = conv_param->kernel_w_ = 2;
   conv_param->stride_h_ = conv_param->stride_w_ = 3;
   conv_param->dilation_h_ = conv_param->dilation_w_ = 2;
-  conv_param->pad_h_ = conv_param->pad_w_ = 0;
+  conv_param->pad_u_ = conv_param->pad_l_ = 0;
   return out_t->ElementsNum();
 }
 
@@ -685,8 +685,8 @@ int DeConvTestInit4(std::vector *inputs_, std::vector
   conv_param->kernel_h_ = conv_param->kernel_w_ = 3;
   conv_param->stride_h_ = conv_param->stride_w_ = 1;
   conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
-  conv_param->pad_h_ = conv_param->pad_w_ = 0;
-  conv_param->is_relu_ = conv_param->is_relu6_ = false;
+  conv_param->pad_u_ = conv_param->pad_l_ = 0;
+  conv_param->act_type_ = ActType_No;
   return out_t->ElementsNum();
 }
 
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32_grad/convolution_grad_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32_grad/convolution_grad_fp32_tests.cc
index cb8b72cff11..76fe27358c9 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32_grad/convolution_grad_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32_grad/convolution_grad_fp32_tests.cc
@@ -52,12 +52,11 @@ void InitConvParamGroup1FP32(ConvParameter *conv_param) {
   conv_param->dilation_h_ = 1;
   conv_param->dilation_w_ = 1;
 
-  conv_param->pad_h_ = 1;
-  conv_param->pad_w_ = 1;
+  conv_param->pad_u_ = 1;
+  conv_param->pad_l_ = 1;
 
   conv_param->group_ = 1;
-  conv_param->is_relu_ = false;
-  conv_param->is_relu6_ = false;
+  conv_param->act_type_ = ActType_No;
   conv_param->thread_num_ = 1;
 }
 
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/conv_1x1_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/conv_1x1_int8_tests.cc
index 00abebfcec4..3fe09b3e939 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/conv_1x1_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/conv_1x1_int8_tests.cc
@@ -34,7 +34,7 @@ TEST_F(TestConv1x1Int8, Input1x1PrePack1) {
   conv_param->input_h_ = conv_param->input_w_ = 3;
   conv_param->output_h_ = conv_param->output_w_ = 3;
   conv_param->stride_h_ = conv_param->stride_w_ = 2;
-  conv_param->pad_h_ = conv_param->pad_w_ = 1;
+  conv_param->pad_u_ = conv_param->pad_l_ = 1;
 
   int8_t in[] = {4,  13, -3, 16, 19, 8,  19, -6, -2, -9, 9,  18, 23, 8,  47, -14, 15, 4,
                 -0, 37, -0, 6,  0,  -1, 37, 13, 11, 1,  -1, 41, 9,  14, 3,  0,   8,  9,
                 14, -14, -8, -8, -8, 7,  19, 17, 13, 3,  9,  18, -1, -0, 18, 0,   4,  -2};
 
@@ -61,7 +61,7 @@ TEST_F(TestConv1x1Int8, Input1x1PrePack2) {
   conv_param->output_h_ = 4;
   conv_param->output_w_ = 5;
   conv_param->stride_h_ = conv_param->stride_w_ = 4;
-  conv_param->pad_h_ = conv_param->pad_w_ = 2;
+  conv_param->pad_u_ = conv_param->pad_l_ = 2;
 
   int8_t out[20] = {0};
   Conv1x1InputPack(in, out, conv_param, sizeof(int8_t));
@@ -111,8 +111,8 @@ int Conv1x1Int8TestInit1_perchannel(std::vector *inputs_
   conv_param->kernel_h_ = conv_param->kernel_w_ = 1;
   conv_param->stride_h_ = conv_param->stride_w_ = 1;
   conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
-  conv_param->pad_h_ = conv_param->pad_w_ = 0;
-  conv_param->is_relu_ = conv_param->is_relu6_ = false;
+  conv_param->pad_u_ = conv_param->pad_l_ = 0;
+  conv_param->act_type_ = ActType_No;
   return out_t->ElementsNum();
 }
 
@@ -178,8 +178,8 @@ int Conv1x1Int8TestInit1(std::vector *inputs_, std::vect
   conv_param->kernel_h_ = conv_param->kernel_w_ = 1;
   conv_param->stride_h_ = conv_param->stride_w_ = 1;
   conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
-  conv_param->pad_h_ = conv_param->pad_w_ = 0;
-  conv_param->is_relu_ = conv_param->is_relu6_ = false;
+  conv_param->pad_u_ = conv_param->pad_l_ = 0;
+  conv_param->act_type_ = ActType_No;
   return out_t->ElementsNum();
 }
 
@@ -253,8 +253,8 @@ int Conv1x1Int8TestInit2(std::vector *inputs_, std::vect
   conv_param->kernel_h_ = conv_param->kernel_w_ = 1;
   conv_param->stride_h_ = conv_param->stride_w_ = 1;
   conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
-  conv_param->pad_h_ = conv_param->pad_w_ = 0;
-  conv_param->is_relu_ = conv_param->is_relu6_ = false;
+  conv_param->pad_u_ = conv_param->pad_l_ = 0;
+  conv_param->act_type_ = ActType_No;
   return out_t->ElementsNum();
 }
 
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/deconv_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/deconv_int8_tests.cc
index d5adb010ad6..9484e6e5280 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/deconv_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/deconv_int8_tests.cc
@@ -343,7 +343,7 @@ int DeConvInt8TestInit1(std::vector *inputs_, std::vecto
   PackNCHWToNHWCInt8(co_nchw, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel());
 
   conv_param->kernel_h_ = conv_param->kernel_w_ = 3;
-  conv_param->pad_h_ = conv_param->pad_w_ = 1;
+  conv_param->pad_u_ = conv_param->pad_l_ = 1;
   conv_param->stride_h_ = conv_param->stride_w_ = 2;
   conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
   return out_t->ElementsNum();
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_transpose_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_transpose_tests.cc
index c79fe2e96b5..20e2dc5d377 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_transpose_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_transpose_tests.cc
@@ -119,8 +119,8 @@ void RunTestCase(const std::vector shape, const std::vector fi
   opParameter->kernel_w_ = kw;
   opParameter->stride_h_ = 2;
   opParameter->stride_w_ = 2;
-  opParameter->pad_h_ = pad;
-  opParameter->pad_w_ = pad;
+  opParameter->pad_u_ = pad;
+  opParameter->pad_l_ = pad;
   opParameter->input_channel_ = ci;
   opParameter->output_channel_ = co;
   auto op_kernel_ptr = std::make_unique(
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/depthwise_conv2d_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/depthwise_conv2d_tests.cc
index 08585fd94ec..a88a23a5c25 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/depthwise_conv2d_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/depthwise_conv2d_tests.cc
@@ -169,8 +169,8 @@ TEST_F(TestConvolutionDwOpenCL, NoPadNC4HW4Fp32) {
   conv_param->stride_w_ = 1;
   conv_param->dilation_h_ = 1;
   conv_param->dilation_w_ = 1;
-  conv_param->pad_h_ = 0;
-  conv_param->pad_w_ = 0;
+  conv_param->pad_u_ = 0;
+  conv_param->pad_l_ = 0;
 }
 
 // nhwc
@@ -214,8 +214,8 @@ TEST_F(TestConvolutionDwOpenCL, PadNC4HW4Fp32) {
   conv_param->stride_w_ = 1;
   conv_param->dilation_h_ = 1;
   conv_param->dilation_w_ = 1;
-  conv_param->pad_h_ = 1;
-  conv_param->pad_w_ = 1;
+  conv_param->pad_u_ = 1;
+  conv_param->pad_l_ = 1;
 }
 
 // nhwc
@@ -286,8 +286,8 @@ TEST_F(TestConvolutionDwOpenCL, NoPadNHWC4Fp32) {
   conv_param->stride_w_ = 1;
   conv_param->dilation_h_ = 1;
   conv_param->dilation_w_ = 1;
-  conv_param->pad_h_ = 0;
-  conv_param->pad_w_ = 0;
+  conv_param->pad_u_ = 0;
+  conv_param->pad_l_ = 0;
 }
 
 // nhwc
@@ -331,8 +331,8 @@ TEST_F(TestConvolutionDwOpenCL, PadNHWC4Fp32) {
   conv_param->stride_w_ = 1;
   conv_param->dilation_h_ = 1;
   conv_param->dilation_w_ = 1;
-  conv_param->pad_h_ = 1;
-  conv_param->pad_w_ = 1;
+  conv_param->pad_u_ = 1;
+  conv_param->pad_l_ = 1;
 }
 
 // nhwc
@@ -405,8 +405,8 @@ TEST_F(TestConvolutionDwOpenCL, ConvDwNoPadFp32) {
   conv_param->stride_w_ = 1;
   conv_param->dilation_h_ = 1;
   conv_param->dilation_w_ = 1;
-  conv_param->pad_h_ = 0;
-  conv_param->pad_w_ = 0;
+  conv_param->pad_u_ = 0;
+  conv_param->pad_l_ = 0;
 }
 
 // nhwc
@@ -529,8 +529,8 @@ TEST_F(TestConvolutionDwOpenCL, ConvDwPadFp32) {
   conv_param->stride_w_ = 1;
   conv_param->dilation_h_ = 1;
   conv_param->dilation_w_ = 1;
-  conv_param->pad_h_ = 1;
-  conv_param->pad_w_ = 1;
+  conv_param->pad_u_ = 1;
+  conv_param->pad_l_ = 1;
 }
 
 // nhwc
@@ -724,8 +724,8 @@ TEST_F(TestConvolutionDwOpenCL, ProfilingMobilenetv2) {
     conv_param->kernel_w_ = filter_shape[i][2];
     conv_param->stride_h_ = conv_param->output_h_ / conv_param->input_h_;
     conv_param->stride_w_ = conv_param->output_w_ / conv_param->input_w_;
-    conv_param->pad_h_ = (conv_param->kernel_h_ - 1) / 2;
-    conv_param->pad_w_ = (conv_param->kernel_w_ - 1) / 2;
+    conv_param->pad_u_ = (conv_param->kernel_h_ - 1) / 2;
+    conv_param->pad_l_ = (conv_param->kernel_w_ - 1) / 2;
     conv_param->dilation_h_ = 1;
     conv_param->dilation_w_ = 1;
   }
@@ -774,8 +774,8 @@ TEST_F(TestConvolutionDwOpenCL, Buffer2Image) {
   conv_param->kernel_w_ = filter_shape[2];
   conv_param->stride_h_ = conv_param->output_h_ / conv_param->input_h_;
   conv_param->stride_w_ = conv_param->output_w_ / conv_param->input_w_;
-  conv_param->pad_h_ = (conv_param->kernel_h_ - 1) / 2;
-  conv_param->pad_w_ = (conv_param->kernel_w_ - 1) / 2;
+  conv_param->pad_u_ = (conv_param->kernel_h_ - 1) / 2;
+  conv_param->pad_l_ = (conv_param->kernel_w_ - 1) / 2;
   conv_param->dilation_h_ = 1;
   conv_param->dilation_w_ = 1;
 }