forked from mindspore-Ecosystem/mindspore
optimize winograd
This commit is contained in:
parent
600263ccfe
commit
d34c620dce
|
@ -47,7 +47,7 @@
|
|||
/////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth
|
||||
// int row, int col, size_t stride, size_t writeNhwc, size_t writeC4)
|
||||
// int row, int col, size_t stride, size_t writeNhwc, size_t WriteWino)
|
||||
// x0: a
|
||||
// x1: b
|
||||
// x2: c
|
||||
|
@ -64,11 +64,20 @@ MatmulFloatNeon64Opt:
|
|||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
|
||||
|
||||
ldr x9, [sp, #8]
|
||||
ldr x14, [sp, #16]
|
||||
|
||||
mov w18, #32 // sizeof(float) * 8
|
||||
mul w15, w5, w18 // block stride of lhs/rhs: sizeof(float) * 8 * depth
|
||||
mov x11, x3 // bias flag
|
||||
mov x18, #4
|
||||
ldr x17, [sp]
|
||||
cbz x14, NoWinoSteps
|
||||
mul x8, x7, x17
|
||||
mov x11, #8
|
||||
mul x11, x11, x17
|
||||
mul x8, x8, x18
|
||||
mul x11, x11, x18
|
||||
NoWinoSteps:
|
||||
mul x17, x17, x18
|
||||
|
||||
L1:
|
||||
|
@ -79,7 +88,6 @@ L1:
|
|||
L2:
|
||||
mov x16, x1 // reload rhs ptr
|
||||
mov w13, w5 // reload depth
|
||||
mov x14, x3 // reload bias ptr
|
||||
dup v8.4s, wzr
|
||||
dup v9.4s, wzr
|
||||
dup v10.4s, wzr
|
||||
|
@ -173,9 +181,10 @@ LoopEnd:
|
|||
fmla v31.4s, v4.4s, v2.s[3]
|
||||
|
||||
Bias:
|
||||
cbz x11, Activation
|
||||
ld1 {v0.4s}, [x14], #16
|
||||
ld1 {v1.4s}, [x14], #16
|
||||
cbz x3, Activation
|
||||
ld1 {v0.4s}, [x3], #16
|
||||
ld1 {v1.4s}, [x3]
|
||||
sub x3, x3, #16
|
||||
fadd v8.4s, v8.4s, v0.4s
|
||||
fadd v9.4s, v9.4s, v1.4s
|
||||
fadd v10.4s, v10.4s, v0.4s
|
||||
|
@ -265,10 +274,8 @@ Relu:
|
|||
fmax v31.4s, v31.4s, v3.4s
|
||||
|
||||
Write:
|
||||
ldr w8, [sp, #8]
|
||||
cbz w8, WriteC8
|
||||
ldr w8, [sp, #16]
|
||||
cbnz w8, WriteC4
|
||||
cbnz x14, WriteWino
|
||||
cbz x9, WriteC8
|
||||
cmp w7, #1
|
||||
beq Write1
|
||||
cmp w7, #2
|
||||
|
@ -721,39 +728,26 @@ Write7:
|
|||
st1 {v31.s}[2], [x16], x17
|
||||
b WriteEnd
|
||||
WriteC8:
|
||||
st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x2], #64
|
||||
st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x2], #64
|
||||
st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
|
||||
st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64
|
||||
st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x2], #64
|
||||
st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x2], #64
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x2], #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x2], #64
|
||||
st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x2], #64
|
||||
st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x2], #64
|
||||
st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x2], #64
|
||||
st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x2], #64
|
||||
b WriteEnd
|
||||
WriteC4:
|
||||
st1 {v8.8h}, [x2], #16
|
||||
st1 {v10.8h}, [x2], #16
|
||||
st1 {v12.8h}, [x2], #16
|
||||
st1 {v14.8h}, [x2], #16
|
||||
st1 {v16.8h}, [x2], #16
|
||||
st1 {v18.8h}, [x2], #16
|
||||
st1 {v20.8h}, [x2], #16
|
||||
st1 {v22.8h}, [x2], #16
|
||||
st1 {v24.8h}, [x2], #16
|
||||
st1 {v26.8h}, [x2], #16
|
||||
st1 {v28.8h}, [x2], #16
|
||||
st1 {v30.8h}, [x2], #16
|
||||
add x18, x2, x17
|
||||
st1 {v9.8h}, [x18], #16
|
||||
st1 {v11.8h}, [x18], #16
|
||||
st1 {v13.8h}, [x18], #16
|
||||
st1 {v15.8h}, [x18], #16
|
||||
st1 {v17.8h}, [x18], #16
|
||||
st1 {v19.8h}, [x18], #16
|
||||
st1 {v21.8h}, [x18], #16
|
||||
st1 {v23.8h}, [x18], #16
|
||||
st1 {v25.8h}, [x18], #16
|
||||
st1 {v27.8h}, [x18], #16
|
||||
st1 {v29.8h}, [x18], #16
|
||||
st1 {v31.8h}, [x18], #16
|
||||
WriteWino:
|
||||
st1 {v8.4s, v9.4s}, [x18], x8
|
||||
st1 {v10.4s, v11.4s}, [x18], x8
|
||||
st1 {v12.4s, v13.4s}, [x18], x8
|
||||
st1 {v14.4s, v15.4s}, [x18], x8
|
||||
st1 {v16.4s, v17.4s}, [x18], x8
|
||||
st1 {v18.4s, v19.4s}, [x18], x8
|
||||
st1 {v20.4s, v21.4s}, [x18], x8
|
||||
st1 {v22.4s, v23.4s}, [x18], x8
|
||||
st1 {v24.4s, v25.4s}, [x18], x8
|
||||
st1 {v26.4s, v27.4s}, [x18], x8
|
||||
st1 {v28.4s, v29.4s}, [x18], x8
|
||||
st1 {v30.4s, v31.4s}, [x18], x8
|
||||
b WriteEnd
|
||||
Write8:
|
||||
st1 {v8.4s, v9.4s}, [x18], x17
|
||||
|
@ -798,15 +792,15 @@ WriteEnd:
|
|||
End2:
|
||||
subs w7, w7, #8 // rhs col - 8
|
||||
add x1, x1, x15 // rhs ptr + stride
|
||||
cbz x3, NoBiasStep
|
||||
add x3, x3, #32 // bias ptr + stride
|
||||
ldr w8, [sp, #8]
|
||||
cbz w8, NoDstStep
|
||||
ldr w8, [sp, #16]
|
||||
cbnz w8, C4DstStep
|
||||
NoBiasStep:
|
||||
cbnz x14, WinoDstStep
|
||||
cbz x9, NoDstStep
|
||||
add x2, x2, #32 // dst ptr + stride
|
||||
b NoDstStep
|
||||
C4DstStep:
|
||||
add x2, x2, x17
|
||||
WinoDstStep:
|
||||
add x2, x2, x11
|
||||
NoDstStep:
|
||||
bgt L1
|
||||
|
||||
|
|
|
@ -32,8 +32,6 @@ typedef struct ConvParameter {
|
|||
int stride_w_;
|
||||
int dilation_h_;
|
||||
int dilation_w_;
|
||||
int pad_h_;
|
||||
int pad_w_;
|
||||
int pad_u_;
|
||||
int pad_d_;
|
||||
int pad_l_;
|
||||
|
@ -51,8 +49,7 @@ typedef struct ConvParameter {
|
|||
int thread_num_;
|
||||
int input_unit_;
|
||||
int output_unit_;
|
||||
bool is_relu_;
|
||||
bool is_relu6_;
|
||||
ActType act_type_;
|
||||
} ConvParameter;
|
||||
|
||||
typedef struct SlidingWindowParam {
|
||||
|
|
|
@ -53,16 +53,18 @@ void DepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float1
|
|||
void DepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, int top,
|
||||
int bottom, int left, int right, const ConvParameter *conv_param,
|
||||
const SlidingWindowParam *sliding) {
|
||||
bool relu = conv_param->act_type_ == ActType_Relu;
|
||||
bool relu6 = conv_param->act_type_ == ActType_Relu6;
|
||||
float16_t *dst_h = dst + top * sliding->out_h_step_;
|
||||
for (int oh = top; oh < bottom; oh++) {
|
||||
int ih = oh * conv_param->stride_h_ - conv_param->pad_h_;
|
||||
int ih = oh * conv_param->stride_h_ - conv_param->pad_u_;
|
||||
int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_));
|
||||
int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_));
|
||||
const float16_t *src_h = src + ih * sliding->in_h_step_;
|
||||
|
||||
float16_t *dst_kernel = dst_h + left * sliding->block_channel_;
|
||||
for (int ow = left; ow < right; ow++) {
|
||||
int iw = ow * conv_param->stride_w_ - conv_param->pad_w_;
|
||||
int iw = ow * conv_param->stride_w_ - conv_param->pad_l_;
|
||||
int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_));
|
||||
int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_));
|
||||
const float16_t *src_w = src_h + iw * sliding->block_channel_;
|
||||
|
@ -72,11 +74,10 @@ void DepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float16_t *
|
|||
#ifdef ENABLE_ARM64
|
||||
ConvDwFp16Border(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw,
|
||||
sliding->in_kh_step_ * sizeof(float16_t), sliding->in_kw_step_ * sizeof(float16_t),
|
||||
conv_param->kernel_w_ * C8NUM * sizeof(float16_t), conv_param->is_relu_, conv_param->is_relu6_);
|
||||
conv_param->kernel_w_ * C8NUM * sizeof(float16_t), relu, relu6);
|
||||
#else
|
||||
DepthwiseBorderPixelFp16(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw,
|
||||
sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_ * C8NUM,
|
||||
conv_param->is_relu_, conv_param->is_relu6_);
|
||||
sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_ * C8NUM, relu, relu6);
|
||||
#endif
|
||||
dst_kernel += sliding->block_channel_;
|
||||
} // width loop
|
||||
|
@ -139,6 +140,8 @@ void DepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t *
|
|||
void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data,
|
||||
const float16_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding,
|
||||
int task_id) {
|
||||
bool relu = conv_param->act_type_ == ActType_Relu;
|
||||
bool relu6 = conv_param->act_type_ == ActType_Relu6;
|
||||
const float16_t *src = input_data;
|
||||
float16_t *dst = output_data;
|
||||
for (int b = 0; b < conv_param->output_batch_; b++) {
|
||||
|
@ -157,8 +160,8 @@ void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const flo
|
|||
conv_param->output_w_, conv_param, sliding);
|
||||
|
||||
if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) {
|
||||
int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_;
|
||||
int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_;
|
||||
int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_u_;
|
||||
int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_;
|
||||
const float16_t *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_;
|
||||
float16_t *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;
|
||||
#ifdef ENABLE_ARM64
|
||||
|
@ -166,12 +169,12 @@ void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const flo
|
|||
conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_ * sizeof(float16_t),
|
||||
sliding->block_channel_ * sizeof(float16_t), sliding->in_sh_step_ * sizeof(float16_t),
|
||||
sliding->in_sw_step_ * sizeof(float16_t), sliding->in_kh_step_ * sizeof(float16_t),
|
||||
sliding->in_kw_step_ * sizeof(float16_t), conv_param->is_relu_, conv_param->is_relu6_);
|
||||
sliding->in_kw_step_ * sizeof(float16_t), relu, relu6);
|
||||
#else
|
||||
DepthwiseCenterFp16(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_,
|
||||
sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_,
|
||||
sliding->out_h_step_, sliding->block_channel_, sliding->in_sh_step_, sliding->in_sw_step_,
|
||||
sliding->in_kh_step_, sliding->in_kw_step_, conv_param->is_relu_, conv_param->is_relu6_);
|
||||
sliding->in_kh_step_, sliding->in_kw_step_, relu, relu6);
|
||||
#endif
|
||||
}
|
||||
} // output C8 loop
|
||||
|
@ -210,14 +213,14 @@ void DeconvDepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float
|
|||
const SlidingWindowParam *sliding) {
|
||||
const float16_t *src_h = src + top * sliding->out_h_step_;
|
||||
for (int ih = top; ih < bottom; ih++) {
|
||||
int oh = ih * conv_param->stride_h_ - conv_param->pad_h_;
|
||||
int oh = ih * conv_param->stride_h_ - conv_param->pad_u_;
|
||||
int start_kh = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_));
|
||||
int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_));
|
||||
float16_t *dst_h = dst + oh * sliding->in_h_step_;
|
||||
|
||||
const float16_t *src_kernel = src_h + left * sliding->block_channel_;
|
||||
for (int iw = left; iw < right; iw++) {
|
||||
int ow = iw * conv_param->stride_w_ - conv_param->pad_w_;
|
||||
int ow = iw * conv_param->stride_w_ - conv_param->pad_l_;
|
||||
int start_kw = MSMAX(0, UP_DIV(-ow, conv_param->dilation_w_));
|
||||
int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->output_w_ - ow, conv_param->dilation_w_));
|
||||
float16_t *dst_w = dst_h + ow * sliding->block_channel_;
|
||||
|
@ -282,12 +285,14 @@ void DeconvDepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float
|
|||
|
||||
void DeconvDepthwisePostFuncFp16(float16_t *dst, const float16_t *bias, int block_channel,
|
||||
const ConvParameter *conv_param) {
|
||||
bool relu = conv_param->act_type_ == ActType_Relu;
|
||||
bool relu6 = conv_param->act_type_ == ActType_Relu6;
|
||||
float16_t *dst_k = dst;
|
||||
for (int k = 0; k < conv_param->output_h_ * conv_param->output_w_; k++) {
|
||||
for (int c = 0; c < C8NUM; c++) {
|
||||
dst_k[c] += bias[c];
|
||||
dst_k[c] = (conv_param->is_relu_) ? (MSMAX(0, dst_k[c])) : (dst_k[c]);
|
||||
dst_k[c] = (conv_param->is_relu6_) ? (MSMIN(6, MSMAX(0, dst_k[c]))) : (dst_k[c]);
|
||||
dst_k[c] = (relu) ? (MSMAX(0, dst_k[c])) : (dst_k[c]);
|
||||
dst_k[c] = (relu6) ? (MSMIN(6, MSMAX(0, dst_k[c]))) : (dst_k[c]);
|
||||
}
|
||||
dst_k += block_channel;
|
||||
}
|
||||
|
@ -315,8 +320,8 @@ void DeconvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const f
|
|||
conv_param->input_w_, conv_param, sliding);
|
||||
|
||||
if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) {
|
||||
int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_;
|
||||
int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_;
|
||||
int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_u_;
|
||||
int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_;
|
||||
float16_t *out_t = dst_data + oh_h_start * sliding->in_h_step_ + oh_w_start * sliding->block_channel_;
|
||||
const float16_t *in_t =
|
||||
src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;
|
||||
|
|
|
@ -173,16 +173,18 @@ void SWBorderPixel(float16_t *dst, const float16_t *src, const float16_t *weight
|
|||
|
||||
void SWBorderFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, int top,
|
||||
int bottom, int left, int right, const ConvParameter *conv_param, const SlidingWindowParam *sliding) {
|
||||
bool relu = conv_param->act_type_ == ActType_Relu;
|
||||
bool relu6 = conv_param->act_type_ == ActType_Relu6;
|
||||
float16_t *dst_h = dst + top * sliding->out_h_step_;
|
||||
for (int oh = top; oh < bottom; oh++) {
|
||||
int ih = oh * conv_param->stride_h_ - conv_param->pad_h_;
|
||||
int ih = oh * conv_param->stride_h_ - conv_param->pad_u_;
|
||||
int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_));
|
||||
int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_));
|
||||
const float16_t *src_h = src + ih * sliding->in_h_step_;
|
||||
|
||||
float16_t *dst_kernel = dst_h + left * sliding->block_channel_;
|
||||
for (int ow = left; ow < right; ow++) {
|
||||
int iw = ow * conv_param->stride_w_ - conv_param->pad_w_;
|
||||
int iw = ow * conv_param->stride_w_ - conv_param->pad_l_;
|
||||
int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_));
|
||||
int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_));
|
||||
const float16_t *src_w = src_h + iw * sliding->ic4_channel_;
|
||||
|
@ -192,7 +194,7 @@ void SWBorderFp16(float16_t *dst, const float16_t *src, const float16_t *weight,
|
|||
|
||||
SWBorderPixel(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw,
|
||||
sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_h_, conv_param->kernel_w_,
|
||||
sliding->ic4_channel_, conv_param->is_relu_, conv_param->is_relu6_);
|
||||
sliding->ic4_channel_, relu, relu6);
|
||||
|
||||
dst_kernel += sliding->block_channel_;
|
||||
} // width loop
|
||||
|
@ -273,6 +275,8 @@ void SWCenterFp16(float16_t *dst, const float16_t *src, const float16_t *weight,
|
|||
void ConvSWFp16(const float16_t *input_data, const float16_t *packed_weight, const float16_t *bias_data,
|
||||
float16_t *tmp_out_block, float16_t *output_data, int task_id, ConvParameter *conv_param,
|
||||
SlidingWindowParam *slidingWindow_param) {
|
||||
bool relu = conv_param->act_type_ == ActType_Relu;
|
||||
bool relu6 = conv_param->act_type_ == ActType_Relu6;
|
||||
int oc4_res = conv_param->output_channel_ % C4NUM;
|
||||
const float16_t *src = input_data;
|
||||
float16_t *dst;
|
||||
|
@ -299,8 +303,8 @@ void ConvSWFp16(const float16_t *input_data, const float16_t *packed_weight, con
|
|||
|
||||
if (slidingWindow_param->right_ > slidingWindow_param->left_ &&
|
||||
slidingWindow_param->bottom_ > slidingWindow_param->top_) {
|
||||
int in_h_start = slidingWindow_param->top_ * conv_param->stride_h_ - conv_param->pad_h_;
|
||||
int in_w_start = slidingWindow_param->left_ * conv_param->stride_w_ - conv_param->pad_w_;
|
||||
int in_h_start = slidingWindow_param->top_ * conv_param->stride_h_ - conv_param->pad_u_;
|
||||
int in_w_start = slidingWindow_param->left_ * conv_param->stride_w_ - conv_param->pad_l_;
|
||||
const float16_t *in_t =
|
||||
src_data + in_h_start * slidingWindow_param->in_h_step_ + in_w_start * slidingWindow_param->ic4_channel_;
|
||||
float16_t *out_t = dst_data + slidingWindow_param->top_ * slidingWindow_param->out_h_step_ +
|
||||
|
@ -310,7 +314,7 @@ void ConvSWFp16(const float16_t *input_data, const float16_t *packed_weight, con
|
|||
conv_param->kernel_w_, slidingWindow_param->out_h_step_, slidingWindow_param->block_channel_,
|
||||
slidingWindow_param->ic4_channel_, slidingWindow_param->in_sh_step_,
|
||||
slidingWindow_param->in_sw_step_, slidingWindow_param->in_kh_step_,
|
||||
slidingWindow_param->in_kw_step_, conv_param->is_relu_, conv_param->is_relu6_);
|
||||
slidingWindow_param->in_kw_step_, relu, relu6);
|
||||
}
|
||||
} // output C4 loop
|
||||
src += slidingWindow_param->in_step_;
|
||||
|
@ -330,8 +334,8 @@ void ConvFp16(float16_t *input_data, float16_t *packed_input, float16_t *packed_
|
|||
int out_h = conv_param->output_h_;
|
||||
int out_w = conv_param->output_w_;
|
||||
int out_channel = conv_param->output_channel_;
|
||||
bool relu = conv_param->is_relu_;
|
||||
bool relu6 = conv_param->is_relu6_;
|
||||
bool relu = conv_param->act_type_ == ActType_Relu;
|
||||
bool relu6 = conv_param->act_type_ == ActType_Relu6;
|
||||
int thread_count = conv_param->thread_num_;
|
||||
const int tile_n = 16;
|
||||
int output_count = out_h * out_w;
|
||||
|
|
|
@ -73,8 +73,8 @@ int DeConvPostFp16(const float16_t *src, float16_t *tmp, const float16_t *bias,
|
|||
|
||||
for (int ih = 0; ih < conv_param->input_h_; ih++) {
|
||||
for (int iw = 0; iw < conv_param->input_w_; iw++) {
|
||||
int oh = ih * conv_param->stride_h_ - conv_param->pad_h_;
|
||||
int ow = iw * conv_param->stride_w_ - conv_param->pad_w_;
|
||||
int oh = ih * conv_param->stride_h_ - conv_param->pad_u_;
|
||||
int ow = iw * conv_param->stride_w_ - conv_param->pad_l_;
|
||||
|
||||
int kh_start = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_));
|
||||
int kh_end = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_));
|
||||
|
@ -112,7 +112,7 @@ int DeConvPostFp16(const float16_t *src, float16_t *tmp, const float16_t *bias,
|
|||
} /*ih*/
|
||||
} /*oc8*/
|
||||
|
||||
PostConvFuncFp16C8(tmp, dst, bias, output_channel, output_plane, conv_param->output_channel_, conv_param->is_relu_,
|
||||
conv_param->is_relu6_);
|
||||
PostConvFuncFp16C8(tmp, dst, bias, output_channel, output_plane, conv_param->output_channel_,
|
||||
conv_param->act_type_ == ActType_Relu, conv_param->act_type_ == ActType_Relu6);
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
|
|
@ -21,14 +21,14 @@
|
|||
void Conv1x1InputPackFp16(const float16_t *src, float16_t *dst, ConvParameter *conv_param) {
|
||||
/* support nhwc */
|
||||
for (int dst_h = 0; dst_h < conv_param->output_h_; dst_h++) {
|
||||
int src_h = dst_h * conv_param->stride_h_ - conv_param->pad_h_;
|
||||
int src_h = dst_h * conv_param->stride_h_ - conv_param->pad_u_;
|
||||
if (src_h < 0 || src_h >= conv_param->input_h_) {
|
||||
continue;
|
||||
}
|
||||
const float16_t *src_h_ptr = src + src_h * conv_param->input_w_ * conv_param->input_channel_;
|
||||
float16_t *dst_h_ptr = dst + dst_h * conv_param->output_w_ * conv_param->input_channel_;
|
||||
for (int dst_w = 0; dst_w < conv_param->output_w_; dst_w++) {
|
||||
int src_w = dst_w * conv_param->stride_w_ - conv_param->pad_w_;
|
||||
int src_w = dst_w * conv_param->stride_w_ - conv_param->pad_l_;
|
||||
if (src_w < 0 || src_w >= conv_param->input_w_) {
|
||||
continue;
|
||||
}
|
||||
|
@ -46,8 +46,8 @@ void Im2ColPackUnitFp16(float16_t *input_data, ConvParameter *conv_param, float1
|
|||
int kernel_w = conv_param->kernel_w_;
|
||||
int stride_h = conv_param->stride_h_;
|
||||
int stride_w = conv_param->stride_w_;
|
||||
int pad_h = conv_param->pad_h_;
|
||||
int pad_w = conv_param->pad_w_;
|
||||
int pad_h = conv_param->pad_u_;
|
||||
int pad_w = conv_param->pad_l_;
|
||||
int dilation_h = conv_param->dilation_h_;
|
||||
int dilation_w = conv_param->dilation_w_;
|
||||
int in_channel = conv_param->input_channel_;
|
||||
|
|
|
@ -230,8 +230,8 @@ void Conv3x3Fp16InputTransform(const float16_t *input_data, float16_t *trans_inp
|
|||
int input_channel = conv_param->input_channel_;
|
||||
int input_width = conv_param->input_w_;
|
||||
int input_height = conv_param->input_h_;
|
||||
int pad_w = conv_param->pad_w_;
|
||||
int pad_h = conv_param->pad_h_;
|
||||
int pad_w = conv_param->pad_l_;
|
||||
int pad_h = conv_param->pad_u_;
|
||||
int ic8 = UP_DIV(input_channel, C8NUM);
|
||||
if (out_w_block == 0) {
|
||||
return;
|
||||
|
@ -576,8 +576,8 @@ void WinogradInputTransformFp16(const float16_t *input_data, float16_t *trans_in
|
|||
int output_unit = conv_param->output_unit_;
|
||||
int in_channel = conv_param->input_channel_;
|
||||
int ic8 = UP_DIV(in_channel, C8NUM);
|
||||
int pad_h = conv_param->pad_h_;
|
||||
int pad_w = conv_param->pad_w_;
|
||||
int pad_h = conv_param->pad_u_;
|
||||
int pad_w = conv_param->pad_l_;
|
||||
int input_h = conv_param->input_h_;
|
||||
int input_w = conv_param->input_w_;
|
||||
if (out_w_block_num == 0) {
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#include <string.h>
|
||||
#include "nnacl/fp32/common_func.h"
|
||||
#include "nnacl/winograd_transform.h"
|
||||
#include "nnacl/fp32/matmul.h"
|
||||
|
||||
void SWBorderPixel(float *dst, const float *src, const float *weight, const float *bias, int height, int width,
|
||||
int in_kh_step, int in_kw_step, int kernel_h, int kernel_w, int ic4, bool is_relu, bool is_relu6) {
|
||||
|
@ -57,16 +58,18 @@ void SWBorderPixel(float *dst, const float *src, const float *weight, const floa
|
|||
void SWBorder(float *dst, const float *src, const float *weight, const float *bias, int top, int bottom, int left,
|
||||
int right, const ConvParameter *conv_param, const SlidingWindowParam *sliding) {
|
||||
int ic4 = sliding->ic4_channel_ / C4NUM;
|
||||
bool relu = conv_param->act_type_ == ActType_Relu;
|
||||
bool relu6 = conv_param->act_type_ == ActType_Relu6;
|
||||
float *dst_h = dst + top * sliding->out_h_step_;
|
||||
for (int oh = top; oh < bottom; oh++) {
|
||||
int ih = oh * conv_param->stride_h_ - conv_param->pad_h_;
|
||||
int ih = oh * conv_param->stride_h_ - conv_param->pad_u_;
|
||||
int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_));
|
||||
int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_));
|
||||
const float *src_h = src + ih * sliding->in_h_step_;
|
||||
|
||||
float *dst_kernel = dst_h + left * sliding->block_channel_;
|
||||
for (int ow = left; ow < right; ow++) {
|
||||
int iw = ow * conv_param->stride_w_ - conv_param->pad_w_;
|
||||
int iw = ow * conv_param->stride_w_ - conv_param->pad_l_;
|
||||
int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_));
|
||||
int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_));
|
||||
const float *src_w = src_h + iw * sliding->ic4_channel_;
|
||||
|
@ -75,8 +78,8 @@ void SWBorder(float *dst, const float *src, const float *weight, const float *bi
|
|||
const float *weight_kernel = weight + (start_kh * conv_param->kernel_w_ + start_kw) * sliding->ic4_channel_;
|
||||
|
||||
SWBorderPixel(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw,
|
||||
sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_h_, conv_param->kernel_w_, ic4,
|
||||
conv_param->is_relu_, conv_param->is_relu6_);
|
||||
sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_h_, conv_param->kernel_w_, ic4, relu,
|
||||
relu6);
|
||||
|
||||
dst_kernel += sliding->block_channel_;
|
||||
} // width loop
|
||||
|
@ -144,6 +147,8 @@ void ConvSWFp32(const float *input_data, const float *packed_weight, const float
|
|||
float *output_data, int task_id, ConvParameter *conv_param, SlidingWindowParam *slidingWindow_param) {
|
||||
int ic4 = slidingWindow_param->ic4_channel_ / C4NUM;
|
||||
int oc4_res = conv_param->output_channel_ % C4NUM;
|
||||
bool relu = conv_param->act_type_ == ActType_Relu;
|
||||
bool relu6 = conv_param->act_type_ == ActType_Relu6;
|
||||
const float *src = input_data;
|
||||
float *dst = NULL;
|
||||
if (oc4_res == 0) {
|
||||
|
@ -169,28 +174,26 @@ void ConvSWFp32(const float *input_data, const float *packed_weight, const float
|
|||
|
||||
if (slidingWindow_param->right_ > slidingWindow_param->left_ &&
|
||||
slidingWindow_param->bottom_ > slidingWindow_param->top_) {
|
||||
int in_h_start = slidingWindow_param->top_ * conv_param->stride_h_ - conv_param->pad_h_;
|
||||
int in_w_start = slidingWindow_param->left_ * conv_param->stride_w_ - conv_param->pad_w_;
|
||||
int in_h_start = slidingWindow_param->top_ * conv_param->stride_h_ - conv_param->pad_u_;
|
||||
int in_w_start = slidingWindow_param->left_ * conv_param->stride_w_ - conv_param->pad_l_;
|
||||
const float *in_t =
|
||||
src_data + in_h_start * slidingWindow_param->in_h_step_ + in_w_start * slidingWindow_param->ic4_channel_;
|
||||
float *out_t = dst_data + slidingWindow_param->top_ * slidingWindow_param->out_h_step_ +
|
||||
slidingWindow_param->left_ * slidingWindow_param->block_channel_;
|
||||
#ifdef ENABLE_ARM64
|
||||
ConvSwFp32Center(out_t, in_t, weight, bias, slidingWindow_param->bottom_ - slidingWindow_param->top_,
|
||||
slidingWindow_param->right_ - slidingWindow_param->left_, conv_param->kernel_h_,
|
||||
conv_param->kernel_w_, slidingWindow_param->out_h_step_ * sizeof(float),
|
||||
slidingWindow_param->block_channel_ * sizeof(float), ic4,
|
||||
slidingWindow_param->in_sh_step_ * sizeof(float),
|
||||
slidingWindow_param->in_sw_step_ * sizeof(float),
|
||||
slidingWindow_param->in_kh_step_ * sizeof(float),
|
||||
slidingWindow_param->in_kw_step_ * sizeof(float),
|
||||
conv_param->is_relu_, conv_param->is_relu6_);
|
||||
ConvSwFp32Center(
|
||||
out_t, in_t, weight, bias, slidingWindow_param->bottom_ - slidingWindow_param->top_,
|
||||
slidingWindow_param->right_ - slidingWindow_param->left_, conv_param->kernel_h_, conv_param->kernel_w_,
|
||||
slidingWindow_param->out_h_step_ * sizeof(float), slidingWindow_param->block_channel_ * sizeof(float), ic4,
|
||||
slidingWindow_param->in_sh_step_ * sizeof(float), slidingWindow_param->in_sw_step_ * sizeof(float),
|
||||
slidingWindow_param->in_kh_step_ * sizeof(float), slidingWindow_param->in_kw_step_ * sizeof(float), relu,
|
||||
relu6);
|
||||
#else
|
||||
SWCenter(out_t, in_t, weight, bias, slidingWindow_param->bottom_ - slidingWindow_param->top_,
|
||||
slidingWindow_param->right_ - slidingWindow_param->left_, conv_param->kernel_h_,
|
||||
conv_param->kernel_w_, slidingWindow_param->out_h_step_, slidingWindow_param->block_channel_, ic4,
|
||||
slidingWindow_param->right_ - slidingWindow_param->left_, conv_param->kernel_h_, conv_param->kernel_w_,
|
||||
slidingWindow_param->out_h_step_, slidingWindow_param->block_channel_, ic4,
|
||||
slidingWindow_param->in_sh_step_, slidingWindow_param->in_sw_step_, slidingWindow_param->in_kh_step_,
|
||||
slidingWindow_param->in_kw_step_, conv_param->is_relu_, conv_param->is_relu6_);
|
||||
slidingWindow_param->in_kw_step_, relu, relu6);
|
||||
#endif
|
||||
}
|
||||
} // output C4 loop
|
||||
|
@ -219,6 +222,8 @@ void ConvFp32(float *input_data, float *packed_input, float *packed_weight, cons
|
|||
int kernel_plane = kernel_h * kernel_w;
|
||||
int unit_size = kernel_plane * ic4 * C4NUM;
|
||||
int packed_input_size = output_tile_count * TILE_NUM * unit_size;
|
||||
bool relu = conv_param->act_type_ == ActType_Relu;
|
||||
bool relu6 = conv_param->act_type_ == ActType_Relu6;
|
||||
|
||||
// we accumulate 4 channels per time for input blocks
|
||||
int conv_depth = kernel_h * kernel_w;
|
||||
|
@ -240,11 +245,11 @@ void ConvFp32(float *input_data, float *packed_input, float *packed_weight, cons
|
|||
if (real_cal_num == TILE_NUM) {
|
||||
float *gemm_output = output_data + out_offset;
|
||||
gemm_func(gemm_output, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel, output_offset, 0, 0,
|
||||
conv_param->is_relu_, conv_param->is_relu6_);
|
||||
relu, relu6);
|
||||
} else {
|
||||
// res part
|
||||
gemm_func(tmp_out_block, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel, output_offset, 0,
|
||||
0, conv_param->is_relu_, conv_param->is_relu6_);
|
||||
0, relu, relu6);
|
||||
memcpy(output_data + out_offset, tmp_out_block, real_cal_num * out_channel * sizeof(float));
|
||||
}
|
||||
}
|
||||
|
@ -264,34 +269,42 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_
|
|||
int out_w_block = UP_DIV(conv_param->output_w_, out_unit);
|
||||
int out_h_block = UP_DIV(conv_param->output_h_, out_unit);
|
||||
int output_count = out_w_block * out_h_block;
|
||||
int output_tile_count = UP_DIV(output_count, TILE_NUM);
|
||||
int output_tile_count = UP_DIV(output_count, C12NUM);
|
||||
int out_channel = conv_param->output_channel_;
|
||||
int oc4 = UP_DIV(out_channel, C4NUM);
|
||||
int oc8 = UP_DIV(out_channel, C8NUM);
|
||||
int input_unit_square = input_unit * input_unit;
|
||||
size_t output_offset = oc4 * C4NUM * input_unit_square * sizeof(float);
|
||||
|
||||
float *trans_input = buffer_list[0];
|
||||
float *gemm_out = buffer_list[1];
|
||||
float *tmp_out_data = buffer_list[2];
|
||||
float *tmp_data = buffer_list[3];
|
||||
int trans_input_offset = TILE_NUM * input_unit_square * ic4 * C4NUM;
|
||||
int gemm_out_offset = TILE_NUM * input_unit_square * oc4 * C4NUM;
|
||||
float *col_buffer = buffer_list[4];
|
||||
int trans_input_offset = C12NUM * input_unit_square * ic4 * C4NUM;
|
||||
int gemm_out_offset = C12NUM * input_unit_square * oc8 * C8NUM;
|
||||
int tmp_data_offset = input_unit_square * C4NUM;
|
||||
int col_buffer_offset = C12NUM * ic4 * C4NUM;
|
||||
// step 1 : filter transform (pre-processed offline)
|
||||
// step 2 : input transform (online)
|
||||
for (int b = 0; b < in_batch; b++) {
|
||||
int in_batch_offset = b * ic4 * C4NUM * conv_param->input_h_ * conv_param->input_w_;
|
||||
int tmp_out_batch_offset = b * out_w_block * out_h_block * out_unit * out_unit * oc4 * C4NUM;
|
||||
for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_num) {
|
||||
int out_tile_index = thread_id * TILE_NUM;
|
||||
int cal_num = output_count - thread_id * TILE_NUM;
|
||||
cal_num = cal_num > TILE_NUM ? TILE_NUM : cal_num;
|
||||
int out_tile_index = thread_id * C12NUM;
|
||||
int cal_num = output_count - thread_id * C12NUM;
|
||||
cal_num = cal_num > C12NUM ? C12NUM : cal_num;
|
||||
WinogradInputTransform(input_data + in_batch_offset, trans_input + task_id * trans_input_offset,
|
||||
tmp_data + task_id * tmp_data_offset, cal_num, out_tile_index, out_w_block, conv_param,
|
||||
input_trans_func);
|
||||
// step 3 : gemm
|
||||
gemm_func(gemm_out + task_id * gemm_out_offset, trans_input + task_id * trans_input_offset, trans_weight, NULL,
|
||||
input_unit_square, ic4, oc4 * C4NUM, output_offset, 1, 1, 0, 0);
|
||||
float *src_ptr = trans_input + task_id * trans_input_offset;
|
||||
float *dst_ptr = gemm_out + task_id * gemm_out_offset;
|
||||
float *tmp_col_ptr = col_buffer + task_id * col_buffer_offset;
|
||||
for (int i = 0; i < input_unit_square; ++i) {
|
||||
RowMajor2Col12Major(src_ptr + i * C12NUM * ic4 * C4NUM, tmp_col_ptr, C12NUM, ic4 * C4NUM);
|
||||
MatMulOpt(tmp_col_ptr, trans_weight + i * ic4 * C4NUM * oc8 * C8NUM, dst_ptr + i * C8NUM, NULL, 0, ic4 * C4NUM,
|
||||
C12NUM, oc8 * C8NUM, input_unit_square, 2);
|
||||
}
|
||||
|
||||
// step 4 : output transform
|
||||
WinogradOutputTransform(gemm_out + task_id * gemm_out_offset, tmp_out_data + tmp_out_batch_offset, bias_data,
|
||||
|
@ -442,18 +455,21 @@ void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_dat
|
|||
int ic4 = UP_DIV(conv_param->input_channel_, C4NUM);
|
||||
int output_channel = conv_param->output_channel_;
|
||||
int oc4 = UP_DIV(output_channel, C4NUM);
|
||||
int oc8 = UP_DIV(output_channel, C8NUM);
|
||||
int out_w_block = UP_DIV(conv_param->output_w_, OUPUT_UNIT);
|
||||
int out_h_block = UP_DIV(conv_param->output_h_, OUPUT_UNIT);
|
||||
int output_count = out_w_block * out_h_block;
|
||||
int output_tile_count = UP_DIV(output_count, TILE_NUM);
|
||||
int output_tile_count = UP_DIV(output_count, C12NUM);
|
||||
const int input_unit_square = 4 * 4;
|
||||
float *tile_buffer = buffer_list[0];
|
||||
float *block_unit_buffer = buffer_list[1];
|
||||
float *tmp_dst_buffer = buffer_list[2];
|
||||
float *nc4hw4_out = buffer_list[3];
|
||||
int tile_buffer_offset = TILE_NUM * input_unit_square * ic4 * C4NUM;
|
||||
float *col_buffer = buffer_list[4];
|
||||
int tile_buffer_offset = C12NUM * input_unit_square * ic4 * C4NUM;
|
||||
int block_unit_buffer_offset = input_unit_square * C4NUM;
|
||||
int tmp_dst_buffer_offset = TILE_NUM * input_unit_square * oc4 * C4NUM;
|
||||
int tmp_dst_buffer_offset = C12NUM * input_unit_square * oc8 * C8NUM;
|
||||
int col_buffer_offset = C12NUM * ic4 * C4NUM;
|
||||
|
||||
int input_batch = conv_param->input_batch_;
|
||||
for (int batch = 0; batch < input_batch; batch++) {
|
||||
|
@ -461,15 +477,20 @@ void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_dat
|
|||
int nc4hw4_buffer_offset = batch * oc4 * C4NUM * conv_param->output_h_ * conv_param->output_w_;
|
||||
|
||||
for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) {
|
||||
int start_index = thread_id * TILE_NUM;
|
||||
int real_cal_num = (output_count - start_index) < TILE_NUM ? (output_count - start_index) : TILE_NUM;
|
||||
int start_index = thread_id * C12NUM;
|
||||
int real_cal_num = (output_count - start_index) < C12NUM ? (output_count - start_index) : C12NUM;
|
||||
Conv3x3Fp32InputTransform(input_data + in_batch_offset, tile_buffer + task_id * tile_buffer_offset,
|
||||
block_unit_buffer + task_id * block_unit_buffer_offset, start_index, real_cal_num,
|
||||
out_w_block, conv_param);
|
||||
|
||||
gemm_func(tmp_dst_buffer + task_id * tmp_dst_buffer_offset, tile_buffer + task_id * tile_buffer_offset,
|
||||
transed_weight, NULL, input_unit_square, ic4, oc4 * C4NUM,
|
||||
oc4 * C4NUM * input_unit_square * sizeof(float), 1, 1, 0, 0);
|
||||
float *src_ptr = tile_buffer + task_id * tile_buffer_offset;
|
||||
float *tmp_col_ptr = col_buffer + task_id * col_buffer_offset;
|
||||
float *dst_ptr = tmp_dst_buffer + task_id * tmp_dst_buffer_offset;
|
||||
for (int i = 0; i < input_unit_square; ++i) {
|
||||
RowMajor2Col12Major(src_ptr + i * C12NUM * ic4 * C4NUM, tmp_col_ptr, C12NUM, ic4 * C4NUM);
|
||||
MatMulOpt(tmp_col_ptr, transed_weight + i * ic4 * C4NUM * oc8 * C8NUM, dst_ptr + i * C8NUM, NULL, 0,
|
||||
ic4 * C4NUM, C12NUM, oc8 * C8NUM, input_unit_square, 2);
|
||||
}
|
||||
|
||||
Conv3x3Fp32OutputTransform(tmp_dst_buffer + task_id * tmp_dst_buffer_offset, nc4hw4_out + nc4hw4_buffer_offset,
|
||||
bias_data, start_index, real_cal_num, out_w_block, conv_param);
|
||||
|
|
|
@ -38,13 +38,15 @@ void ConvDw(float *output_data, const float *input_data, const float *weight_dat
|
|||
int h_step = UP_DIV(conv_param->output_h_, conv_param->thread_num_);
|
||||
int h_start = h_step * task_id;
|
||||
int h_end = MSMIN(h_start + h_step, conv_param->output_h_);
|
||||
bool relu = conv_param->act_type_ == ActType_Relu;
|
||||
bool relu6 = conv_param->act_type_ == ActType_Relu6;
|
||||
for (int b = 0; b < conv_param->output_batch_; b++) {
|
||||
const float *src = input_data + b * conv_param->input_h_ * conv_param->input_w_ * conv_param->input_channel_;
|
||||
float *dst = output_data + b * conv_param->output_h_ * conv_param->output_w_ * conv_param->output_channel_;
|
||||
for (int oh = h_start; oh < h_end; oh++) {
|
||||
float *dst_data = dst + oh * conv_param->output_w_ * conv_param->output_channel_;
|
||||
|
||||
int ih_origin = oh * conv_param->stride_h_ - conv_param->pad_h_;
|
||||
int ih_origin = oh * conv_param->stride_h_ - conv_param->pad_u_;
|
||||
int start_kh = MSMAX(0, UP_DIV(-ih_origin, conv_param->dilation_h_));
|
||||
int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih_origin, conv_param->dilation_h_));
|
||||
|
||||
|
@ -60,13 +62,13 @@ void ConvDw(float *output_data, const float *input_data, const float *weight_dat
|
|||
int in_sw_step = conv_param->stride_w_ * conv_param->input_channel_;
|
||||
for (int kw = 0; kw < conv_param->kernel_w_; kw++) {
|
||||
int out_w_start = MSMAX(
|
||||
0, (conv_param->pad_w_ - conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) / conv_param->stride_w_);
|
||||
int out_w_end = MSMIN(conv_param->output_w_, (conv_param->input_w_ + conv_param->pad_w_ -
|
||||
0, (conv_param->pad_l_ - conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) / conv_param->stride_w_);
|
||||
int out_w_end = MSMIN(conv_param->output_w_, (conv_param->input_w_ + conv_param->pad_l_ -
|
||||
conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) /
|
||||
conv_param->stride_w_);
|
||||
|
||||
float *dst_w = dst_data + out_w_start * conv_param->output_channel_;
|
||||
int iw_origin = (out_w_start * conv_param->stride_w_) - conv_param->pad_w_ + conv_param->dilation_w_ * kw;
|
||||
int iw_origin = (out_w_start * conv_param->stride_w_) - conv_param->pad_l_ + conv_param->dilation_w_ * kw;
|
||||
|
||||
const float *src_kw = src_kh + iw_origin * conv_param->input_channel_;
|
||||
int num_pixels = out_w_end - out_w_start;
|
||||
|
@ -75,10 +77,10 @@ void ConvDw(float *output_data, const float *input_data, const float *weight_dat
|
|||
weight_kh += conv_param->output_channel_;
|
||||
}
|
||||
}
|
||||
if (conv_param->is_relu_) {
|
||||
if (relu) {
|
||||
ReluFp32(dst_data, dst_data, conv_param->output_w_ * conv_param->output_channel_);
|
||||
}
|
||||
if (conv_param->is_relu6_) {
|
||||
if (relu6) {
|
||||
Relu6Fp32(dst_data, dst_data, conv_param->output_w_ * conv_param->output_channel_);
|
||||
}
|
||||
}
|
||||
|
@ -91,16 +93,16 @@ void InitSlidingParam(SlidingWindowParam *sliding, const ConvParameter *conv_par
|
|||
int top = 0;
|
||||
int bottom = conv_param->output_h_;
|
||||
|
||||
for (; left * conv_param->stride_w_ < conv_param->pad_w_; left++) {
|
||||
for (; left * conv_param->stride_w_ < conv_param->pad_l_; left++) {
|
||||
}
|
||||
for (; (right - 1) * conv_param->stride_w_ - conv_param->pad_w_ + conv_param->kernel_w_ * conv_param->dilation_w_ >
|
||||
for (; (right - 1) * conv_param->stride_w_ - conv_param->pad_l_ + conv_param->kernel_w_ * conv_param->dilation_w_ >
|
||||
conv_param->input_w_ &&
|
||||
right > left;
|
||||
right--) {
|
||||
}
|
||||
for (; top * conv_param->stride_h_ < conv_param->pad_h_; top++) {
|
||||
for (; top * conv_param->stride_h_ < conv_param->pad_u_; top++) {
|
||||
}
|
||||
for (; (bottom - 1) * conv_param->stride_h_ - conv_param->pad_h_ + conv_param->kernel_h_ * conv_param->dilation_h_ >
|
||||
for (; (bottom - 1) * conv_param->stride_h_ - conv_param->pad_u_ + conv_param->kernel_h_ * conv_param->dilation_h_ >
|
||||
conv_param->input_h_ &&
|
||||
bottom > top;
|
||||
bottom--) {
|
||||
|
@ -181,16 +183,18 @@ void DepthwiseBorderPixel(float *dst, const float *src, const float *weight, con
|
|||
|
||||
void DepthwiseBorder(float *dst, const float *src, const float *weight, const float *bias, int top, int bottom,
|
||||
int left, int right, const ConvParameter *conv_param, const SlidingWindowParam *sliding) {
|
||||
bool relu = conv_param->act_type_ == ActType_Relu;
|
||||
bool relu6 = conv_param->act_type_ == ActType_Relu6;
|
||||
float *dst_h = dst + top * sliding->out_h_step_;
|
||||
for (int oh = top; oh < bottom; oh++) {
|
||||
int ih = oh * conv_param->stride_h_ - conv_param->pad_h_;
|
||||
int ih = oh * conv_param->stride_h_ - conv_param->pad_u_;
|
||||
int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_));
|
||||
int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_));
|
||||
const float *src_h = src + ih * sliding->in_h_step_;
|
||||
|
||||
float *dst_kernel = dst_h + left * sliding->block_channel_;
|
||||
for (int ow = left; ow < right; ow++) {
|
||||
int iw = ow * conv_param->stride_w_ - conv_param->pad_w_;
|
||||
int iw = ow * conv_param->stride_w_ - conv_param->pad_l_;
|
||||
int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_));
|
||||
int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_));
|
||||
const float *src_w = src_h + iw * sliding->block_channel_;
|
||||
|
@ -201,11 +205,10 @@ void DepthwiseBorder(float *dst, const float *src, const float *weight, const fl
|
|||
#ifdef ENABLE_ARM64
|
||||
ConvDwFp32Border(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw,
|
||||
sliding->in_kh_step_ * sizeof(float), sliding->in_kw_step_ * sizeof(float),
|
||||
conv_param->kernel_w_ * C4NUM * sizeof(float), conv_param->is_relu_, conv_param->is_relu6_);
|
||||
conv_param->kernel_w_ * C4NUM * sizeof(float), relu, relu6);
|
||||
#else
|
||||
DepthwiseBorderPixel(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw,
|
||||
sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_ * C4NUM,
|
||||
conv_param->is_relu_, conv_param->is_relu6_);
|
||||
sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_ * C4NUM, relu, relu6);
|
||||
#endif
|
||||
dst_kernel += sliding->block_channel_;
|
||||
} // width loop
|
||||
|
@ -259,6 +262,8 @@ void DepthwiseCenter(float *dst, const float *src, const float *weight, const fl
|
|||
// conv depthwise fp32: sliding window
|
||||
void ConvDwC4Fp32(float *output_data, const float *input_data, const float *weight_data, const float *bias_data,
|
||||
const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id) {
|
||||
bool relu = conv_param->act_type_ == ActType_Relu;
|
||||
bool relu6 = conv_param->act_type_ == ActType_Relu6;
|
||||
const float *src = input_data;
|
||||
float *dst = output_data;
|
||||
for (int b = 0; b < conv_param->output_batch_; b++) {
|
||||
|
@ -277,8 +282,8 @@ void ConvDwC4Fp32(float *output_data, const float *input_data, const float *weig
|
|||
conv_param->output_w_, conv_param, sliding);
|
||||
|
||||
if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) {
|
||||
int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_;
|
||||
int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_;
|
||||
int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_u_;
|
||||
int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_;
|
||||
const float *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_;
|
||||
float *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;
|
||||
#ifdef ENABLE_ARM64
|
||||
|
@ -286,12 +291,12 @@ void ConvDwC4Fp32(float *output_data, const float *input_data, const float *weig
|
|||
conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_ * sizeof(float),
|
||||
sliding->block_channel_ * sizeof(float), sliding->in_sh_step_ * sizeof(float),
|
||||
sliding->in_sw_step_ * sizeof(float), sliding->in_kh_step_ * sizeof(float),
|
||||
sliding->in_kw_step_ * sizeof(float), conv_param->is_relu_, conv_param->is_relu6_);
|
||||
sliding->in_kw_step_ * sizeof(float), relu, relu6);
|
||||
#else
|
||||
DepthwiseCenter(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, sliding->right_ - sliding->left_,
|
||||
conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_, sliding->block_channel_,
|
||||
sliding->in_sh_step_, sliding->in_sw_step_, sliding->in_kh_step_, sliding->in_kw_step_,
|
||||
conv_param->is_relu_, conv_param->is_relu6_);
|
||||
sliding->in_sh_step_, sliding->in_sw_step_, sliding->in_kh_step_, sliding->in_kw_step_, relu,
|
||||
relu6);
|
||||
#endif
|
||||
}
|
||||
} // output C4 loop
|
||||
|
@ -454,11 +459,11 @@ void ConvDw3x3Fp32InputTrans(const float *input_data, float *trans_input, float
|
|||
memset(trans_input, 0, out_h_block * out_h_block * 16 * C4NUM * sizeof(float));
|
||||
|
||||
for (int oh = 0; oh < out_h_block; oh++) {
|
||||
int ih = oh * 2 - conv_param->pad_h_;
|
||||
int ih = oh * 2 - conv_param->pad_u_;
|
||||
int real_h_start = ih > 0 ? 0 : -ih;
|
||||
int real_h_end = (ih + input_unit) < conv_param->input_h_ ? input_unit : (conv_param->input_h_ - ih);
|
||||
for (int ow = 0; ow < out_w_block; ow++) {
|
||||
int iw = ow * 2 - conv_param->pad_w_;
|
||||
int iw = ow * 2 - conv_param->pad_l_;
|
||||
int real_w_start = iw > 0 ? 0 : -iw;
|
||||
int real_w_end = (iw + input_unit) < conv_param->input_w_ ? input_unit : (conv_param->input_w_ - iw);
|
||||
|
||||
|
@ -642,6 +647,8 @@ void ConvDw3x3Fp32OutputUnit(float *src_buf, float *dst_output, const float *bia
|
|||
|
||||
void ConvDw3x3Fp32OutputTrans(float *trans_buffer, float *output_data, const float *bias, int out_h_block,
|
||||
int out_w_block, const ConvParameter *conv_param) {
|
||||
bool relu = conv_param->act_type_ == ActType_Relu;
|
||||
bool relu6 = conv_param->act_type_ == ActType_Relu6;
|
||||
int oc4 = UP_DIV(conv_param->output_channel_, C4NUM);
|
||||
bool h_in_range = true;
|
||||
for (int oh = 0; oh < out_h_block; oh++) {
|
||||
|
@ -661,8 +668,8 @@ void ConvDw3x3Fp32OutputTrans(float *trans_buffer, float *output_data, const flo
|
|||
float *buf_ow = buf_oh + ow * 16 * C4NUM;
|
||||
float *output_ow = output_oh + real_ow * oc4 * C4NUM;
|
||||
|
||||
ConvDw3x3Fp32OutputUnit(buf_ow, output_ow, bias, oc4 * C4NUM, conv_param->output_w_, h_in_range, w_in_range,
|
||||
conv_param->is_relu_, conv_param->is_relu6_);
|
||||
ConvDw3x3Fp32OutputUnit(buf_ow, output_ow, bias, oc4 * C4NUM, conv_param->output_w_, h_in_range, w_in_range, relu,
|
||||
relu6);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -727,14 +734,14 @@ void DeconvDepthwiseBorder(float *dst, const float *src, const float *weight, in
|
|||
const ConvParameter *conv_param, const SlidingWindowParam *sliding) {
|
||||
const float *src_h = src + top * sliding->out_h_step_;
|
||||
for (int ih = top; ih < bottom; ih++) {
|
||||
int oh = ih * conv_param->stride_h_ - conv_param->pad_h_;
|
||||
int oh = ih * conv_param->stride_h_ - conv_param->pad_u_;
|
||||
int start_kh = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_));
|
||||
int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_));
|
||||
float *dst_h = dst + oh * sliding->in_h_step_;
|
||||
|
||||
const float *src_kernel = src_h + left * sliding->block_channel_;
|
||||
for (int iw = left; iw < right; iw++) {
|
||||
int ow = iw * conv_param->stride_w_ - conv_param->pad_w_;
|
||||
int ow = iw * conv_param->stride_w_ - conv_param->pad_l_;
|
||||
int start_kw = MSMAX(0, UP_DIV(-ow, conv_param->dilation_w_));
|
||||
int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->output_w_ - ow, conv_param->dilation_w_));
|
||||
float *dst_w = dst_h + ow * sliding->block_channel_;
|
||||
|
@ -790,12 +797,14 @@ void DeconvDepthwiseCenter(float *dst, const float *src, const float *weight, in
|
|||
#endif
|
||||
|
||||
void DeconvDepthwisePostFunc(float *dst, const float *bias, int block_channel, const ConvParameter *conv_param) {
|
||||
bool relu = conv_param->act_type_ == ActType_Relu;
|
||||
bool relu6 = conv_param->act_type_ == ActType_Relu6;
|
||||
float *dst_k = dst;
|
||||
for (int k = 0; k < conv_param->output_h_ * conv_param->output_w_; k++) {
|
||||
for (int c = 0; c < C4NUM; c++) {
|
||||
dst_k[c] += bias[c];
|
||||
dst_k[c] = (conv_param->is_relu_) ? (MSMAX(0, dst_k[c])) : (dst_k[c]);
|
||||
dst_k[c] = (conv_param->is_relu6_) ? (MSMIN(6, MSMAX(0, dst_k[c]))) : (dst_k[c]);
|
||||
dst_k[c] = (relu) ? (MSMAX(0, dst_k[c])) : (dst_k[c]);
|
||||
dst_k[c] = (relu6) ? (MSMIN(6, MSMAX(0, dst_k[c]))) : (dst_k[c]);
|
||||
}
|
||||
dst_k += block_channel;
|
||||
}
|
||||
|
@ -821,8 +830,8 @@ void DeconvDwC4Fp32(float *output_data, const float *input_data, const float *we
|
|||
conv_param->input_w_, conv_param, sliding);
|
||||
|
||||
if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) {
|
||||
int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_;
|
||||
int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_;
|
||||
int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_u_;
|
||||
int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_;
|
||||
float *out_t = dst_data + oh_h_start * sliding->in_h_step_ + oh_w_start * sliding->block_channel_;
|
||||
const float *in_t = src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;
|
||||
|
||||
|
|
|
@ -57,8 +57,8 @@ int DeConvPostFp32C12x8(const float *src, float *tmp, const float *bias, float *
|
|||
|
||||
for (int ih = 0; ih < conv_param->input_h_; ih++) {
|
||||
for (int iw = 0; iw < conv_param->input_w_; iw++) {
|
||||
int oh = ih * conv_param->stride_h_ - conv_param->pad_h_;
|
||||
int ow = iw * conv_param->stride_w_ - conv_param->pad_w_;
|
||||
int oh = ih * conv_param->stride_h_ - conv_param->pad_u_;
|
||||
int ow = iw * conv_param->stride_w_ - conv_param->pad_l_;
|
||||
|
||||
int kh_start = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_));
|
||||
int kh_end = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_));
|
||||
|
@ -97,7 +97,7 @@ int DeConvPostFp32C12x8(const float *src, float *tmp, const float *bias, float *
|
|||
} /*ih*/
|
||||
} /*oc8*/
|
||||
|
||||
PostConvFuncFp32C8(tmp, dst, bias, output_channel, output_plane, conv_param->output_channel_, conv_param->is_relu_,
|
||||
conv_param->is_relu6_);
|
||||
PostConvFuncFp32C8(tmp, dst, bias, output_channel, output_plane, conv_param->output_channel_,
|
||||
conv_param->act_type_ == ActType_Relu, conv_param->act_type_ == ActType_Relu6);
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
|
|
@ -356,7 +356,7 @@ void MatMul12x8(const float *a, const float *b, float *dst, const float *bias, A
|
|||
dst[ci] = value;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
} else if (out_type == OutType_C8) {
|
||||
/* col8-major * row8-major => col12x8-major */
|
||||
int col_8 = UP_ROUND(col, C8NUM);
|
||||
int row_12 = UP_ROUND(row, C12NUM);
|
||||
|
@ -364,9 +364,7 @@ void MatMul12x8(const float *a, const float *b, float *dst, const float *bias, A
|
|||
for (int c = 0; c < col_8; c++) {
|
||||
int r12div = r / C12NUM, r12mod = r % C12NUM;
|
||||
int c8div = c / C8NUM, c8mod = c % C8NUM;
|
||||
int c4div = c / C4NUM, c4mod = c % C4NUM;
|
||||
size_t ci = (out_type == OutType_C4) ? (c4div * C4NUM * row_12 + r * C4NUM + c4mod)
|
||||
: (c8div * C8NUM * row_12 + r * C8NUM + c8mod);
|
||||
size_t ci = (c8div * C8NUM * row_12 + r * C8NUM + c8mod);
|
||||
float value = 0;
|
||||
for (int d = 0; d < deep; d++) {
|
||||
size_t ai = r12div * deep * C12NUM + d * C12NUM + r12mod;
|
||||
|
@ -379,6 +377,25 @@ void MatMul12x8(const float *a, const float *b, float *dst, const float *bias, A
|
|||
dst[ci] = value;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < row; ++i) {
|
||||
int src_r_offset = i;
|
||||
int dst_r_offset = i * col * stride;
|
||||
for (int j = 0; j < col; ++j) {
|
||||
int c8div = j / 8, c8mod = j % 8;
|
||||
size_t ci = dst_r_offset + c8div * 8 * stride + c8mod;
|
||||
float value = 0;
|
||||
for (int d = 0; d < deep; ++d) {
|
||||
size_t ai = src_r_offset + d * row;
|
||||
size_t bi = c8div * deep * 8 + d * 8 + c8mod;
|
||||
value = value + a[ai] * b[bi];
|
||||
}
|
||||
if (bias != NULL) value += bias[j];
|
||||
if (act_type == ActType_Relu6) value = MSMIN(6.0f, value);
|
||||
if (act_type != ActType_No) value = MSMAX(0.0f, value);
|
||||
dst[ci] = value;
|
||||
}
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
@ -387,7 +404,7 @@ void MatMulOpt(const float *a, const float *b, float *c, const float *bias, ActT
|
|||
int col, size_t stride, int out_type) {
|
||||
#ifdef ENABLE_ARM64
|
||||
MatmulFloatNeon64Opt(a, b, c, bias, (int)act_type, deep, row, col, stride, (int)(out_type == OutType_Nhwc),
|
||||
(int)(out_type == OutType_C4));
|
||||
(int)(out_type == OutType_TileC8));
|
||||
#else
|
||||
MatMul12x8(a, b, c, bias, act_type, deep, row, col, stride, out_type);
|
||||
#endif
|
||||
|
|
|
@ -20,9 +20,9 @@
|
|||
static int is_a_ge_zero_and_a_lt_b(int a, int b) { return (unsigned)(a) < (unsigned)(b); }
|
||||
|
||||
void im2col_hwc(const float *in_data, float *data_col, ConvParameter *conv_param) {
|
||||
const int pad_left = /*conv_param->pad_l_*/ conv_param->pad_w_;
|
||||
const int pad_left = /*conv_param->pad_l_*/ conv_param->pad_l_;
|
||||
// const int pad_right = /*conv_param->pad_r_*/conv_param->pad_w_;
|
||||
const int pad_up = /*conv_param->pad_u_*/ conv_param->pad_h_;
|
||||
const int pad_up = /*conv_param->pad_u_*/ conv_param->pad_u_;
|
||||
// const int pad_down = /*conv_param->pad_d/*/conv_param->pad_h_;
|
||||
|
||||
const int stride_h = conv_param->stride_h_;
|
||||
|
@ -72,9 +72,9 @@ void im2col_hwc(const float *in_data, float *data_col, ConvParameter *conv_param
|
|||
|
||||
// output matrix is (kernel_h*kernel_w*channels)X(output_h*output_w)
|
||||
void im2row_hwc(const float *in_data, float *data_row, ConvParameter *conv_param) {
|
||||
const int pad_left = /*conv_param->pad_l_*/ conv_param->pad_w_;
|
||||
const int pad_left = /*conv_param->pad_l_*/ conv_param->pad_l_;
|
||||
// const int pad_right = /*conv_param->pad_r_*/conv_param->pad_w_;
|
||||
const int pad_up = /*conv_param->pad_u_*/ conv_param->pad_h_;
|
||||
const int pad_up = /*conv_param->pad_u_*/ conv_param->pad_u_;
|
||||
// const int pad_down = /*conv_param->pad_d/*/conv_param->pad_h_;
|
||||
|
||||
const int stride_h = conv_param->stride_h_;
|
||||
|
|
|
@ -68,14 +68,14 @@ void DepthwiseBorderInt8(int8_t *dst, const int16_t *src, const int16_t *weight,
|
|||
bool per_channel) {
|
||||
int8_t *dst_h = dst + top * sliding->out_h_step_;
|
||||
for (int oh = top; oh < bottom; oh++) {
|
||||
int ih = oh * conv_param->stride_h_ - conv_param->pad_h_;
|
||||
int ih = oh * conv_param->stride_h_ - conv_param->pad_u_;
|
||||
int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_));
|
||||
int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_));
|
||||
const int16_t *src_h = src + ih * sliding->in_h_step_;
|
||||
|
||||
int8_t *dst_kernel = dst_h + left * sliding->block_channel_;
|
||||
for (int ow = left; ow < right; ow++) {
|
||||
int iw = ow * conv_param->stride_w_ - conv_param->pad_w_;
|
||||
int iw = ow * conv_param->stride_w_ - conv_param->pad_l_;
|
||||
int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_));
|
||||
int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_));
|
||||
const int16_t *src_w = src_h + iw * sliding->block_channel_;
|
||||
|
@ -186,8 +186,8 @@ void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *w
|
|||
per_channel);
|
||||
|
||||
if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) {
|
||||
int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_;
|
||||
int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_;
|
||||
int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_u_;
|
||||
int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_;
|
||||
const int16_t *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_;
|
||||
int8_t *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;
|
||||
#ifdef ENABLE_ARM64
|
||||
|
@ -241,14 +241,14 @@ void DeconvDepthwiseBorderInt8(int32_t *dst, const int16_t *src, const int16_t *
|
|||
int right, const ConvParameter *conv_param, const SlidingWindowParam *sliding) {
|
||||
const int16_t *src_h = src + top * sliding->out_h_step_;
|
||||
for (int ih = top; ih < bottom; ih++) {
|
||||
int oh = ih * conv_param->stride_h_ - conv_param->pad_h_;
|
||||
int oh = ih * conv_param->stride_h_ - conv_param->pad_u_;
|
||||
int start_kh = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_));
|
||||
int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_));
|
||||
int32_t *dst_h = dst + oh * sliding->in_h_step_;
|
||||
|
||||
const int16_t *src_kernel = src_h + left * sliding->block_channel_;
|
||||
for (int iw = left; iw < right; iw++) {
|
||||
int ow = iw * conv_param->stride_w_ - conv_param->pad_w_;
|
||||
int ow = iw * conv_param->stride_w_ - conv_param->pad_l_;
|
||||
int start_kw = MSMAX(0, UP_DIV(-ow, conv_param->dilation_w_));
|
||||
int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->output_w_ - ow, conv_param->dilation_w_));
|
||||
int32_t *dst_w = dst_h + ow * C4NUM;
|
||||
|
@ -341,8 +341,8 @@ void DeconvDwInt8(int8_t *output_data, int32_t *output_buffer, const int16_t *in
|
|||
conv_param->input_w_, conv_param, sliding);
|
||||
|
||||
if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) {
|
||||
int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_;
|
||||
int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_;
|
||||
int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_u_;
|
||||
int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_;
|
||||
int32_t *out_t = output_buffer + oh_h_start * sliding->in_h_step_ + oh_w_start * sliding->block_channel_;
|
||||
const int16_t *in_t =
|
||||
src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;
|
||||
|
|
|
@ -33,8 +33,8 @@ int DeConvPostInt8C8(const int32_t *src, const int32_t *bias, int32_t *tmp, int8
|
|||
|
||||
for (int ih = 0; ih < conv_param->input_h_; ih++) {
|
||||
for (int iw = 0; iw < conv_param->input_w_; iw++) {
|
||||
int oh = ih * conv_param->stride_h_ - conv_param->pad_h_;
|
||||
int ow = iw * conv_param->stride_w_ - conv_param->pad_w_;
|
||||
int oh = ih * conv_param->stride_h_ - conv_param->pad_u_;
|
||||
int ow = iw * conv_param->stride_w_ - conv_param->pad_l_;
|
||||
|
||||
int kh_start = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_));
|
||||
int kh_end = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_));
|
||||
|
@ -88,8 +88,8 @@ int DeConvPostInt8C4(const int32_t *src, const int32_t *bias, int32_t *tmp, int8
|
|||
|
||||
for (int ih = 0; ih < conv_param->input_h_; ih++) {
|
||||
for (int iw = 0; iw < conv_param->input_w_; iw++) {
|
||||
int oh = ih * conv_param->stride_h_ - conv_param->pad_h_;
|
||||
int ow = iw * conv_param->stride_w_ - conv_param->pad_w_;
|
||||
int oh = ih * conv_param->stride_h_ - conv_param->pad_u_;
|
||||
int ow = iw * conv_param->stride_w_ - conv_param->pad_l_;
|
||||
|
||||
int kh_start = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_));
|
||||
int kh_end = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_));
|
||||
|
|
|
@ -29,9 +29,7 @@ typedef void (*MATMUL_OPT_R_FUNC)(const int8_t *a, const int8_t *b, int8_t *dst,
|
|||
|
||||
typedef void (*MAT_TRANS_FUNC)(void *dst, void *a, int row, int col);
|
||||
|
||||
typedef enum ActType { ActType_No, ActType_Relu, ActType_Relu6 } ActType;
|
||||
|
||||
typedef enum OutType { OutType_C8 = 0, OutType_Nhwc = 1, OutType_C4 = 2 } OutType;
|
||||
typedef enum OutType { OutType_C8 = 0, OutType_Nhwc = 1, OutType_TileC8 = 2 } OutType;
|
||||
|
||||
typedef struct MatMulParameter {
|
||||
OpParameter op_parameter_;
|
||||
|
|
|
@ -25,7 +25,6 @@
|
|||
#define C8NUM 8
|
||||
#define C12NUM 12
|
||||
#define C16NUM 16
|
||||
#define BLOCK 4
|
||||
#define TILE_NUM 8
|
||||
|
||||
#define MSMIN(x, y) ((x) < (y) ? (x) : (y))
|
||||
|
@ -62,4 +61,6 @@ typedef struct OpParameter {
|
|||
int thread_num_;
|
||||
} OpParameter;
|
||||
|
||||
typedef enum ActType { ActType_No, ActType_Relu, ActType_Relu6 } ActType;
|
||||
|
||||
#endif // MINDSPORE_LITE_NNACL_OP_BASE_H_
|
||||
|
|
|
@ -158,14 +158,14 @@ void Conv1x1InputPack(const void *src_ptr, void *dst_ptr, ConvParameter *conv_pa
|
|||
char *src = (char *)src_ptr;
|
||||
char *dst = (char *)dst_ptr;
|
||||
for (int dst_h = 0; dst_h < conv_param->output_h_; dst_h++) {
|
||||
int src_h = dst_h * conv_param->stride_h_ - conv_param->pad_h_;
|
||||
int src_h = dst_h * conv_param->stride_h_ - conv_param->pad_u_;
|
||||
if (src_h < 0 || src_h >= conv_param->input_h_) {
|
||||
continue;
|
||||
}
|
||||
const char *src_h_ptr = src + src_h * conv_param->input_w_ * conv_param->input_channel_ * data_size;
|
||||
char *dst_h_ptr = dst + dst_h * conv_param->output_w_ * conv_param->input_channel_ * data_size;
|
||||
for (int dst_w = 0; dst_w < conv_param->output_w_; dst_w++) {
|
||||
int src_w = dst_w * conv_param->stride_w_ - conv_param->pad_w_;
|
||||
int src_w = dst_w * conv_param->stride_w_ - conv_param->pad_l_;
|
||||
if (src_w < 0 || src_w >= conv_param->input_w_) {
|
||||
continue;
|
||||
}
|
||||
|
@ -296,8 +296,8 @@ void Im2ColPackUnitFp32(const float *input_data, ConvParameter *conv_param, floa
|
|||
int kernel_w = conv_param->kernel_w_;
|
||||
int stride_h = conv_param->stride_h_;
|
||||
int stride_w = conv_param->stride_w_;
|
||||
int pad_h = conv_param->pad_h_;
|
||||
int pad_w = conv_param->pad_w_;
|
||||
int pad_h = conv_param->pad_u_;
|
||||
int pad_w = conv_param->pad_l_;
|
||||
int dilation_h = conv_param->dilation_h_;
|
||||
int dilation_w = conv_param->dilation_w_;
|
||||
int in_channel = conv_param->input_channel_;
|
||||
|
@ -348,8 +348,8 @@ void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real
|
|||
int kernel_w = conv_param->kernel_w_;
|
||||
int stride_h = conv_param->stride_h_;
|
||||
int stride_w = conv_param->stride_w_;
|
||||
int pad_h = conv_param->pad_h_;
|
||||
int pad_w = conv_param->pad_w_;
|
||||
int pad_h = conv_param->pad_u_;
|
||||
int pad_w = conv_param->pad_l_;
|
||||
int dilation_h = conv_param->dilation_h_;
|
||||
int dilation_w = conv_param->dilation_w_;
|
||||
int in_channel = conv_param->input_channel_;
|
||||
|
@ -419,8 +419,8 @@ void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int r
|
|||
int kernel_w = conv_param->kernel_w_;
|
||||
int stride_h = conv_param->stride_h_;
|
||||
int stride_w = conv_param->stride_w_;
|
||||
int pad_h = conv_param->pad_h_;
|
||||
int pad_w = conv_param->pad_w_;
|
||||
int pad_h = conv_param->pad_u_;
|
||||
int pad_w = conv_param->pad_l_;
|
||||
int dilation_h = conv_param->dilation_h_;
|
||||
int dilation_w = conv_param->dilation_w_;
|
||||
int in_channel = conv_param->input_channel_;
|
||||
|
|
|
@ -24,8 +24,8 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float *
|
|||
int output_unit = conv_param->output_unit_;
|
||||
int in_channel = conv_param->input_channel_;
|
||||
int ic4 = UP_DIV(in_channel, C4NUM);
|
||||
int pad_h = conv_param->pad_h_;
|
||||
int pad_w = conv_param->pad_w_;
|
||||
int pad_h = conv_param->pad_u_;
|
||||
int pad_w = conv_param->pad_l_;
|
||||
int input_h = conv_param->input_h_;
|
||||
int input_w = conv_param->input_w_;
|
||||
if (out_w_block_num == 0) {
|
||||
|
@ -42,7 +42,7 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float *
|
|||
int interval_y_e = src_y_e < input_h ? input_unit : (input_h - src_y_s);
|
||||
|
||||
int src_plane_offset = ic4 * C4NUM * (src_y_s * input_w + src_x_s);
|
||||
int dst_plane_offset = c * C4NUM;
|
||||
int dst_plane_offset = c * C4NUM * ic4;
|
||||
for (int ic = 0; ic < ic4; ic++) {
|
||||
// clear tmp buffer
|
||||
memset(tmp_data, 0, input_unit * input_unit * C4NUM * sizeof(float));
|
||||
|
@ -67,8 +67,8 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float *
|
|||
}
|
||||
}
|
||||
// input transform
|
||||
int dst_ic4_offset = dst_plane_offset + ic * TILE_NUM * C4NUM;
|
||||
size_t dst_step = ic4 * C4NUM * TILE_NUM;
|
||||
int dst_ic4_offset = dst_plane_offset + ic * C4NUM;
|
||||
size_t dst_step = C12NUM * ic4 * C4NUM;
|
||||
float *trans_input_ptr = trans_input + dst_ic4_offset;
|
||||
input_trans_func(tmp_data, trans_input_ptr, C4NUM, dst_step);
|
||||
}
|
||||
|
@ -86,6 +86,7 @@ void WinogradOutputTransform(const float *gemm_out, float *tmp_out_data, const f
|
|||
int output_h_unit_block = UP_DIV(output_h, output_unit);
|
||||
int output_channel = conv_param->output_channel_;
|
||||
int oc4 = UP_DIV(output_channel, C4NUM);
|
||||
int oc8 = UP_DIV(output_channel, C8NUM);
|
||||
int input_unit = conv_param->input_unit_;
|
||||
if (output_unit_num == 0) {
|
||||
return;
|
||||
|
@ -93,17 +94,19 @@ void WinogradOutputTransform(const float *gemm_out, float *tmp_out_data, const f
|
|||
for (int i = 0; i < cal_num; i++) {
|
||||
int dst_x_s = out_tile_index % output_unit_num;
|
||||
int dst_y_s = out_tile_index / output_unit_num;
|
||||
int src_tile_offset = i * oc4 * C4NUM * input_unit * input_unit;
|
||||
int src_tile_offset = i * oc8 * C8NUM * input_unit * input_unit;
|
||||
int dst_tile_offset = C4NUM * output_unit * (dst_x_s + dst_y_s * output_w_unit_block * output_unit);
|
||||
|
||||
for (int j = 0; j < oc4; j++) {
|
||||
int src_oc4_offset = src_tile_offset + j * input_unit * input_unit * C4NUM;
|
||||
int c8_block = j / 2;
|
||||
int c8_res = j % 2;
|
||||
int src_oc4_offset = src_tile_offset + c8_block * input_unit * input_unit * C8NUM + c8_res * C4NUM;
|
||||
int dst_oc4_offset =
|
||||
dst_tile_offset + j * C4NUM * output_h_unit_block * output_w_unit_block * output_unit * output_unit;
|
||||
const float *src_ptr = gemm_out + src_oc4_offset;
|
||||
const float *bias_ptr = bias_data + j * C4NUM;
|
||||
float *dst_ptr = tmp_out_data + dst_oc4_offset;
|
||||
output_trans_func(src_ptr, dst_ptr, bias_ptr, C4NUM, output_w_unit_block * output_unit);
|
||||
output_trans_func(src_ptr, dst_ptr, bias_ptr, C8NUM, output_w_unit_block * output_unit);
|
||||
}
|
||||
out_tile_index++;
|
||||
}
|
||||
|
@ -283,8 +286,8 @@ void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, floa
|
|||
int input_channel = conv_param->input_channel_;
|
||||
int input_width = conv_param->input_w_;
|
||||
int input_height = conv_param->input_h_;
|
||||
int pad_w = conv_param->pad_w_;
|
||||
int pad_h = conv_param->pad_h_;
|
||||
int pad_w = conv_param->pad_l_;
|
||||
int pad_h = conv_param->pad_u_;
|
||||
int ic4 = UP_DIV(input_channel, C4NUM);
|
||||
const int input_unit = 4;
|
||||
if (out_w_block == 0) {
|
||||
|
@ -300,7 +303,7 @@ void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, floa
|
|||
int real_y_end = (origin_y + input_unit) < input_height ? input_unit : (input_height - origin_y);
|
||||
|
||||
int src_plane_offset = ic4 * C4NUM * (origin_y * input_width + origin_x);
|
||||
int dst_plane_offset = cal_id * C4NUM;
|
||||
int dst_plane_offset = cal_id * C4NUM * ic4;
|
||||
for (int ic = 0; ic < ic4; ic++) {
|
||||
// clear tmp buffer
|
||||
memset(tmp_data, 0, input_unit * input_unit * C4NUM * sizeof(float));
|
||||
|
@ -326,8 +329,8 @@ void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, floa
|
|||
}
|
||||
|
||||
// input transform
|
||||
int dst_ic4_offset = dst_plane_offset + ic * TILE_NUM * C4NUM;
|
||||
size_t dst_step = ic4 * C4NUM * TILE_NUM;
|
||||
int dst_ic4_offset = dst_plane_offset + ic * C4NUM;
|
||||
size_t dst_step = C12NUM * ic4 * C4NUM;
|
||||
float *trans_input_ptr = trans_input + dst_ic4_offset;
|
||||
Conv3x3Fp32InputUnit(tmp_data, trans_input_ptr, dst_step);
|
||||
}
|
||||
|
@ -336,8 +339,8 @@ void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, floa
|
|||
|
||||
void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4, int output_channel, int kernel_plane,
|
||||
int oc_block) {
|
||||
const int input_unit = 4;
|
||||
int dst_step = iC4 * C4NUM * oc_block;
|
||||
int oc_plane_block = UP_DIV(output_channel, oc_block);
|
||||
int dst_step = iC4 * C4NUM * oc_block * oc_plane_block;
|
||||
if (oc_block == 0) {
|
||||
return;
|
||||
}
|
||||
|
@ -345,7 +348,7 @@ void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4
|
|||
int oc_block_num = o / oc_block;
|
||||
int oc_block_rem = o % oc_block;
|
||||
int src_oc_offset = o * iC4 * C4NUM * kernel_plane;
|
||||
int dst_oc_offset = oc_block_num * oc_block * iC4 * C4NUM * input_unit * input_unit + oc_block_rem;
|
||||
int dst_oc_offset = oc_block_num * oc_block * iC4 * C4NUM + oc_block_rem;
|
||||
for (int i = 0; i < iC4; i++) {
|
||||
float *src_ic4_ptr = weight_data + src_oc_offset + i * kernel_plane * C4NUM;
|
||||
float *dst_ic4_ptr = trans_weight + dst_oc_offset + i * oc_block * C4NUM;
|
||||
|
@ -559,24 +562,24 @@ void Conv3x3Fp32OutputUnit(const float *gemm_out, const float *bias_data, float
|
|||
float32x4_t bias_ptr = vld1q_f32(bias_data);
|
||||
|
||||
float32x4_t s00 = vld1q_f32(gemm_out);
|
||||
float32x4_t s01 = vld1q_f32(gemm_out + 4);
|
||||
float32x4_t s02 = vld1q_f32(gemm_out + 8);
|
||||
float32x4_t s03 = vld1q_f32(gemm_out + 12);
|
||||
float32x4_t s01 = vld1q_f32(gemm_out + 8);
|
||||
float32x4_t s02 = vld1q_f32(gemm_out + 16);
|
||||
float32x4_t s03 = vld1q_f32(gemm_out + 24);
|
||||
|
||||
float32x4_t s10 = vld1q_f32(gemm_out + 16);
|
||||
float32x4_t s11 = vld1q_f32(gemm_out + 20);
|
||||
float32x4_t s12 = vld1q_f32(gemm_out + 24);
|
||||
float32x4_t s13 = vld1q_f32(gemm_out + 28);
|
||||
float32x4_t s10 = vld1q_f32(gemm_out + 32);
|
||||
float32x4_t s11 = vld1q_f32(gemm_out + 40);
|
||||
float32x4_t s12 = vld1q_f32(gemm_out + 48);
|
||||
float32x4_t s13 = vld1q_f32(gemm_out + 56);
|
||||
|
||||
float32x4_t s20 = vld1q_f32(gemm_out + 32);
|
||||
float32x4_t s21 = vld1q_f32(gemm_out + 36);
|
||||
float32x4_t s22 = vld1q_f32(gemm_out + 40);
|
||||
float32x4_t s23 = vld1q_f32(gemm_out + 44);
|
||||
float32x4_t s20 = vld1q_f32(gemm_out + 64);
|
||||
float32x4_t s21 = vld1q_f32(gemm_out + 72);
|
||||
float32x4_t s22 = vld1q_f32(gemm_out + 80);
|
||||
float32x4_t s23 = vld1q_f32(gemm_out + 88);
|
||||
|
||||
float32x4_t s30 = vld1q_f32(gemm_out + 48);
|
||||
float32x4_t s31 = vld1q_f32(gemm_out + 52);
|
||||
float32x4_t s32 = vld1q_f32(gemm_out + 56);
|
||||
float32x4_t s33 = vld1q_f32(gemm_out + 60);
|
||||
float32x4_t s30 = vld1q_f32(gemm_out + 96);
|
||||
float32x4_t s31 = vld1q_f32(gemm_out + 104);
|
||||
float32x4_t s32 = vld1q_f32(gemm_out + 112);
|
||||
float32x4_t s33 = vld1q_f32(gemm_out + 120);
|
||||
|
||||
float32x4_t t00 = vaddq_f32(vaddq_f32(s00, s10), s20);
|
||||
float32x4_t t01 = vaddq_f32(vaddq_f32(s01, s11), s21);
|
||||
|
@ -609,24 +612,24 @@ void Conv3x3Fp32OutputUnit(const float *gemm_out, const float *bias_data, float
|
|||
const float *bias_ptr = bias_data + i;
|
||||
|
||||
float s00 = local_ptr[0];
|
||||
float s01 = (local_ptr + 4)[0];
|
||||
float s02 = (local_ptr + 8)[0];
|
||||
float s03 = (local_ptr + 12)[0];
|
||||
float s01 = (local_ptr + 8)[0];
|
||||
float s02 = (local_ptr + 16)[0];
|
||||
float s03 = (local_ptr + 24)[0];
|
||||
|
||||
float s10 = (local_ptr + 16)[0];
|
||||
float s11 = (local_ptr + 20)[0];
|
||||
float s12 = (local_ptr + 24)[0];
|
||||
float s13 = (local_ptr + 28)[0];
|
||||
float s10 = (local_ptr + 32)[0];
|
||||
float s11 = (local_ptr + 40)[0];
|
||||
float s12 = (local_ptr + 48)[0];
|
||||
float s13 = (local_ptr + 56)[0];
|
||||
|
||||
float s20 = (local_ptr + 32)[0];
|
||||
float s21 = (local_ptr + 36)[0];
|
||||
float s22 = (local_ptr + 40)[0];
|
||||
float s23 = (local_ptr + 44)[0];
|
||||
float s20 = (local_ptr + 64)[0];
|
||||
float s21 = (local_ptr + 72)[0];
|
||||
float s22 = (local_ptr + 80)[0];
|
||||
float s23 = (local_ptr + 88)[0];
|
||||
|
||||
float s30 = (local_ptr + 48)[0];
|
||||
float s31 = (local_ptr + 52)[0];
|
||||
float s32 = (local_ptr + 56)[0];
|
||||
float s33 = (local_ptr + 60)[0];
|
||||
float s30 = (local_ptr + 96)[0];
|
||||
float s31 = (local_ptr + 104)[0];
|
||||
float s32 = (local_ptr + 112)[0];
|
||||
float s33 = (local_ptr + 120)[0];
|
||||
|
||||
float t00 = s00 + s10 + s20;
|
||||
float t01 = s01 + s11 + s21;
|
||||
|
@ -663,6 +666,7 @@ void Conv3x3Fp32OutputTransform(const float *gemm_out, float *out_data, const fl
|
|||
int output_w = conv_param->output_w_;
|
||||
int output_h = conv_param->output_h_;
|
||||
int oc4 = UP_DIV(output_channel, C4NUM);
|
||||
int oc8 = UP_DIV(output_channel, C8NUM);
|
||||
const int input_unit = 4;
|
||||
if (out_w_block == 0) {
|
||||
return;
|
||||
|
@ -670,11 +674,13 @@ void Conv3x3Fp32OutputTransform(const float *gemm_out, float *out_data, const fl
|
|||
for (int i = 0; i < real_cal_num; i++) {
|
||||
int out_w_index = (start_index + i) % out_w_block;
|
||||
int out_h_index = (start_index + i) / out_w_block;
|
||||
int src_tile_offset = i * oc4 * C4NUM * input_unit * input_unit;
|
||||
int src_tile_offset = i * oc8 * C8NUM * input_unit * input_unit;
|
||||
int dst_tile_offset = C4NUM * (out_w_index * OUPUT_UNIT + out_h_index * OUPUT_UNIT * output_w);
|
||||
|
||||
for (int j = 0; j < oc4; j++) {
|
||||
int src_oc4_offset = src_tile_offset + j * input_unit * input_unit * C4NUM;
|
||||
int c8_block = j / 2;
|
||||
int c8_res = j % 2;
|
||||
int src_oc4_offset = src_tile_offset + c8_block * input_unit * input_unit * C8NUM + c8_res * C4NUM;
|
||||
int dst_oc4_offset = dst_tile_offset + j * C4NUM * output_h * output_w;
|
||||
const float *src_ptr = gemm_out + src_oc4_offset;
|
||||
const float *bias_ptr = bias_data + j * C4NUM;
|
||||
|
@ -864,8 +870,8 @@ void Conv3x3Uint8InputTransform(const int16_t *input_data, int16_t *trans_input,
|
|||
int input_channel = conv_param->input_channel_;
|
||||
int input_width = conv_param->input_w_;
|
||||
int input_height = conv_param->input_h_;
|
||||
int pad_w = conv_param->pad_w_;
|
||||
int pad_h = conv_param->pad_h_;
|
||||
int pad_w = conv_param->pad_l_;
|
||||
int pad_h = conv_param->pad_u_;
|
||||
ConvQuantArg quant_arg = conv_param->conv_quant_arg_;
|
||||
int input_zp = quant_arg.input_quant_args_[0].zp_;
|
||||
const int ic8 = UP_DIV(input_channel, C8NUM);
|
||||
|
@ -1221,9 +1227,9 @@ void Conv3x3Uint8OutputUnit(const int32_t *gemm_out, const int32_t *bias_data, i
|
|||
int32x4_t ls;
|
||||
int32x4_t rs;
|
||||
if ((conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
|
||||
out_multiplier = vld1q_s32(quant_multiplier);
|
||||
ls = vld1q_s32(left_shift);
|
||||
rs = vld1q_s32(right_shift);
|
||||
out_multiplier = vld1q_s32(quant_multiplier + oc_start);
|
||||
ls = vld1q_s32(left_shift + oc_start);
|
||||
rs = vld1q_s32(right_shift + oc_start);
|
||||
} else {
|
||||
out_multiplier = vdupq_n_s32(quant_multiplier[0]);
|
||||
ls = vdupq_n_s32(left_shift[0]);
|
||||
|
|
|
@ -4649,43 +4649,41 @@ void OutputTransform8x7Unit(const float *src_data, float *dst_data, const float
|
|||
// Utilize cost model to compute performance gain.
|
||||
// If the gain is greater than got from Im2col, winograd algorithm will be chosen.
|
||||
int SelectOutputUnit(ConvParameter *conv_param) {
|
||||
int input_batch = conv_param->input_batch_;
|
||||
int kernel_h = conv_param->kernel_h_;
|
||||
int kernel_w = conv_param->kernel_w_;
|
||||
int in_channel = conv_param->input_channel_;
|
||||
int out_h = conv_param->output_h_;
|
||||
int in_c = conv_param->input_channel_;
|
||||
int out_w = conv_param->output_w_;
|
||||
int out_channel = conv_param->output_channel_;
|
||||
int out_plane = out_h * out_w;
|
||||
int out_h = conv_param->output_h_;
|
||||
int out_c = conv_param->output_channel_;
|
||||
int unit2 = UP_DIV(out_w * out_h, C12NUM * conv_param->op_parameter_.thread_num_);
|
||||
int max_out_unit = (int)(sqrtf((float)unit2));
|
||||
max_out_unit = max_out_unit < MAX_UNIT ? MAX_UNIT : max_out_unit;
|
||||
max_out_unit = max_out_unit > MIN_UNIT ? max_out_unit : MIN_UNIT;
|
||||
|
||||
int max_unit = sqrt((float)(out_plane));
|
||||
max_unit = max_unit > MIN_UNIT ? max_unit : MIN_UNIT;
|
||||
max_unit = max_unit < MAX_UNIT ? max_unit : MAX_UNIT;
|
||||
int output_unit = 1;
|
||||
float ratio = 0.0f;
|
||||
// cost of conventional convolution multiplications
|
||||
float ori_cost = out_plane * out_channel * in_channel * kernel_h * kernel_w;
|
||||
int unit = 0;
|
||||
float max_rate = 0.0f;
|
||||
float common_cost = (float)out_h * out_w * in_c * out_c * kernel_h * kernel_w;
|
||||
|
||||
for (int u = MIN_UNIT; u < max_unit; u++) {
|
||||
int input_unit = u + kernel_h - 1;
|
||||
if (input_unit != 4 && input_unit != 8) {
|
||||
for (int i = MIN_UNIT; i <= max_out_unit; ++i) {
|
||||
int input_unit = i + kernel_w - 1;
|
||||
OutputTransformUnitFunc output_trans_func = GetOutputTransFunc(input_unit, i);
|
||||
if (output_trans_func == NULL) {
|
||||
continue;
|
||||
}
|
||||
// don't count filter transform cost, because it can be processed once offline.
|
||||
const float input_trans_unit_cost = 2 * input_unit * input_unit * input_unit * in_channel;
|
||||
float gemm_unit_cost = input_unit * input_unit * in_channel * out_channel;
|
||||
float output_trans_unit_cost = input_unit * u * (u + input_unit) * out_channel;
|
||||
// equation (23) in papar
|
||||
float winograd_cost = (input_trans_unit_cost + gemm_unit_cost + output_trans_unit_cost) *
|
||||
(UP_DIV(out_w, u) * (UP_DIV(out_h, u))) * input_batch;
|
||||
float reduce_rate = ori_cost / winograd_cost;
|
||||
if (reduce_rate > ratio && reduce_rate > 1) {
|
||||
ratio = reduce_rate;
|
||||
output_unit = u;
|
||||
float penalty = ((float)input_unit * input_unit) / ((float)kernel_h * kernel_w) * 0.12f;
|
||||
float wino_cost = ((2 + out_c) * (float)input_unit * input_unit * in_c + ((float)input_unit + i) * i * out_c) *
|
||||
UP_DIV(out_w, i) * UP_DIV(out_h, i);
|
||||
float reduce_rate = common_cost / wino_cost - penalty;
|
||||
if (reduce_rate > max_rate) {
|
||||
max_rate = reduce_rate;
|
||||
unit = i;
|
||||
}
|
||||
}
|
||||
if (max_rate < 1.0f) {
|
||||
return 1;
|
||||
}
|
||||
// If output_unit is 1, then it is conventional convolution
|
||||
return output_unit;
|
||||
return unit;
|
||||
}
|
||||
|
||||
InputTransformUnitFunc GetInputTransFunc(int input_unit) {
|
||||
|
@ -4719,17 +4717,6 @@ void CheckIfUseWinograd(bool *use_winograd, int *output_unit, ConvParameter *con
|
|||
*output_unit = SelectOutputUnit(conv_param);
|
||||
if (*output_unit > 1) {
|
||||
*use_winograd = true;
|
||||
int input_unit = conv_param->kernel_h_ + *output_unit - 1;
|
||||
input_trans_func = GetInputTransFunc(input_unit);
|
||||
if (input_trans_func == NULL) {
|
||||
*use_winograd = false;
|
||||
}
|
||||
output_trans_func = GetOutputTransFunc(input_unit, *output_unit);
|
||||
if (output_trans_func == NULL) {
|
||||
*use_winograd = false;
|
||||
}
|
||||
} else {
|
||||
*use_winograd = false;
|
||||
}
|
||||
} else {
|
||||
*use_winograd = false;
|
||||
|
|
|
@ -376,10 +376,18 @@ void Conv2D::ConvInferShape(int input_h, int input_w, int *output_h, int *output
|
|||
*output_h = std::ceil(static_cast<float>(input_h) / static_cast<float>(stride_h));
|
||||
auto pad_h_all = ((*output_h - 1) * stride_h + (kernel_h - 1) * dilate_h + 1 - input_h);
|
||||
auto pad_w_all = ((*output_w - 1) * stride_w + (kernel_w - 1) * dilate_w + 1 - input_w);
|
||||
pad_u_ = pad_h_all / 2;
|
||||
pad_d_ = pad_h_all - pad_u_;
|
||||
pad_l_ = pad_w_all / 2;
|
||||
pad_r_ = pad_w_all - pad_l_;
|
||||
if (pad_h_all < 0) {
|
||||
pad_u_ = pad_d_ = 0;
|
||||
} else {
|
||||
pad_u_ = pad_h_all / 2;
|
||||
pad_d_ = pad_h_all - pad_u_;
|
||||
}
|
||||
if (pad_w_all < 0) {
|
||||
pad_l_ = pad_r_ = 0;
|
||||
} else {
|
||||
pad_l_ = pad_w_all / 2;
|
||||
pad_r_ = pad_w_all - pad_l_;
|
||||
}
|
||||
} else {
|
||||
*output_w = std::ceil((static_cast<float>(input_w) + pad_l_ + pad_r_ -
|
||||
(static_cast<float>(kernel_w) - 1) * static_cast<float>(dilate_w)) /
|
||||
|
|
|
@ -126,14 +126,12 @@ int DeConv2D::InferShape(std::vector<lite::tensor::Tensor *> inputs_, std::vecto
|
|||
output->set_shape(out_shape);
|
||||
|
||||
if (pad_mode == schema::PadMode_SAME) {
|
||||
pad_h_ = ((input_h - 1) * stride_h + (kernel_h - 1) * dilate_h + 1 - output_h) / 2;
|
||||
pad_w_ = ((input_w - 1) * stride_w + (kernel_w - 1) * dilate_w + 1 - output_w) / 2;
|
||||
pad_u_ = ((input_h - 1) * stride_h + (kernel_h - 1) * dilate_h + 1 - output_h) / 2;
|
||||
pad_l_ = ((input_w - 1) * stride_w + (kernel_w - 1) * dilate_w + 1 - output_w) / 2;
|
||||
} else if (pad_mode == schema::PadMode_VALID) {
|
||||
pad_h_ = 0;
|
||||
pad_w_ = 0;
|
||||
pad_u_ = 0;
|
||||
pad_l_ = 0;
|
||||
} else if (pad_mode == schema::PadMode_CAFFE) {
|
||||
pad_h_ = pad_u_;
|
||||
pad_w_ = pad_l_;
|
||||
} else {
|
||||
MS_LOG(ERROR) << "unsupported pad mode for deconv";
|
||||
}
|
||||
|
|
|
@ -74,16 +74,12 @@ class DeConv2D : public PrimitiveC {
|
|||
int PadDown() const { return this->pad_d_; }
|
||||
int PadLeft() const { return this->pad_l_; }
|
||||
int PadRight() const { return this->pad_r_; }
|
||||
int PadH() const { return this->pad_h_; }
|
||||
int PadW() const { return this->pad_w_; }
|
||||
|
||||
protected:
|
||||
int pad_u_ = 0;
|
||||
int pad_d_ = 0;
|
||||
int pad_l_ = 0;
|
||||
int pad_r_ = 0;
|
||||
int pad_h_ = 0;
|
||||
int pad_w_ = 0;
|
||||
};
|
||||
} // namespace lite
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -170,10 +170,18 @@ int Pooling::InferShape(std::vector<tensor::Tensor *> inputs_, std::vector<tenso
|
|||
output_h = std::ceil(static_cast<float>(input_h) / static_cast<float>(GetStrideH()));
|
||||
auto pad_h_all = ((output_h - 1) * GetStrideH() + (window_h - 1) + 1 - input_h);
|
||||
auto pad_w_all = ((output_w - 1) * GetStrideW() + (window_w - 1) + 1 - input_w);
|
||||
pad_u_ = pad_h_all / 2;
|
||||
pad_d_ = pad_h_all - pad_u_;
|
||||
pad_l_ = pad_w_all / 2;
|
||||
pad_r_ = pad_w_all - pad_l_;
|
||||
if (pad_h_all < 0) {
|
||||
pad_u_ = pad_d_ = 0;
|
||||
} else {
|
||||
pad_u_ = pad_h_all / 2;
|
||||
pad_d_ = pad_h_all - pad_u_;
|
||||
}
|
||||
if (pad_w_all < 0) {
|
||||
pad_l_ = pad_r_ = 0;
|
||||
} else {
|
||||
pad_l_ = pad_w_all / 2;
|
||||
pad_r_ = pad_w_all - pad_l_;
|
||||
}
|
||||
} else {
|
||||
auto round_mode = (schema::RoundMode)GetRoundMode();
|
||||
if (round_mode == schema::RoundMode_FLOOR) {
|
||||
|
|
|
@ -376,8 +376,6 @@ OpParameter *PopulateConvParameter(const mindspore::lite::PrimitiveC *primitive)
|
|||
conv_param->pad_d_ = conv2d_lite_primitive->PadDown();
|
||||
conv_param->pad_l_ = conv2d_lite_primitive->PadLeft();
|
||||
conv_param->pad_r_ = conv2d_lite_primitive->PadRight();
|
||||
conv_param->pad_h_ = conv2d_lite_primitive->PadUp();
|
||||
conv_param->pad_w_ = conv2d_lite_primitive->PadLeft();
|
||||
conv_param->dilation_h_ = conv_primitive->GetDilateH();
|
||||
conv_param->dilation_w_ = conv_primitive->GetDilateW();
|
||||
conv_param->input_channel_ = conv_primitive->GetChannelIn();
|
||||
|
@ -386,16 +384,13 @@ OpParameter *PopulateConvParameter(const mindspore::lite::PrimitiveC *primitive)
|
|||
auto act_type = conv_primitive->GetActivationType();
|
||||
switch (act_type) {
|
||||
case schema::ActivationType_RELU:
|
||||
conv_param->is_relu_ = true;
|
||||
conv_param->is_relu6_ = false;
|
||||
conv_param->act_type_ = ActType_Relu;
|
||||
break;
|
||||
case schema::ActivationType_RELU6:
|
||||
conv_param->is_relu_ = false;
|
||||
conv_param->is_relu6_ = true;
|
||||
conv_param->act_type_ = ActType_Relu6;
|
||||
break;
|
||||
default:
|
||||
conv_param->is_relu_ = false;
|
||||
conv_param->is_relu6_ = false;
|
||||
conv_param->act_type_ = ActType_No;
|
||||
break;
|
||||
}
|
||||
return reinterpret_cast<OpParameter *>(conv_param);
|
||||
|
@ -422,23 +417,18 @@ OpParameter *PopulateConvDwParameter(const mindspore::lite::PrimitiveC *primitiv
|
|||
conv_param->pad_d_ = convdw_lite_primitive->PadDown();
|
||||
conv_param->pad_l_ = convdw_lite_primitive->PadLeft();
|
||||
conv_param->pad_r_ = convdw_lite_primitive->PadRight();
|
||||
conv_param->pad_h_ = convdw_lite_primitive->PadUp();
|
||||
conv_param->pad_w_ = convdw_lite_primitive->PadLeft();
|
||||
conv_param->dilation_h_ = conv_primitive->GetDilateH();
|
||||
conv_param->dilation_w_ = conv_primitive->GetDilateW();
|
||||
auto act_type = conv_primitive->GetActivationType();
|
||||
switch (act_type) {
|
||||
case schema::ActivationType_RELU:
|
||||
conv_param->is_relu_ = true;
|
||||
conv_param->is_relu6_ = false;
|
||||
conv_param->act_type_ = ActType_Relu;
|
||||
break;
|
||||
case schema::ActivationType_RELU6:
|
||||
conv_param->is_relu_ = false;
|
||||
conv_param->is_relu6_ = true;
|
||||
conv_param->act_type_ = ActType_Relu6;
|
||||
break;
|
||||
default:
|
||||
conv_param->is_relu_ = false;
|
||||
conv_param->is_relu6_ = false;
|
||||
conv_param->act_type_ = ActType_No;
|
||||
break;
|
||||
}
|
||||
return reinterpret_cast<OpParameter *>(conv_param);
|
||||
|
@ -464,23 +454,18 @@ OpParameter *PopulateDeconvDwParameter(const mindspore::lite::PrimitiveC *primit
|
|||
conv_param->pad_d_ = deconvdw_lite_primitive->PadDown();
|
||||
conv_param->pad_l_ = deconvdw_lite_primitive->PadLeft();
|
||||
conv_param->pad_r_ = deconvdw_lite_primitive->PadRight();
|
||||
conv_param->pad_h_ = deconvdw_lite_primitive->PadUp();
|
||||
conv_param->pad_w_ = deconvdw_lite_primitive->PadLeft();
|
||||
conv_param->dilation_h_ = conv_primitive->GetDilateH();
|
||||
conv_param->dilation_w_ = conv_primitive->GetDilateW();
|
||||
auto act_type = conv_primitive->GetActivationType();
|
||||
switch (act_type) {
|
||||
case schema::ActivationType_RELU:
|
||||
conv_param->is_relu_ = true;
|
||||
conv_param->is_relu6_ = false;
|
||||
conv_param->act_type_ = ActType_Relu;
|
||||
break;
|
||||
case schema::ActivationType_RELU6:
|
||||
conv_param->is_relu_ = false;
|
||||
conv_param->is_relu6_ = true;
|
||||
conv_param->act_type_ = ActType_Relu6;
|
||||
break;
|
||||
default:
|
||||
conv_param->is_relu_ = false;
|
||||
conv_param->is_relu6_ = false;
|
||||
conv_param->act_type_ = ActType_No;
|
||||
break;
|
||||
}
|
||||
return reinterpret_cast<OpParameter *>(conv_param);
|
||||
|
@ -506,23 +491,18 @@ OpParameter *PopulateDeconvParameter(const mindspore::lite::PrimitiveC *primitiv
|
|||
conv_param->pad_d_ = deconv_lite_primitive->PadDown();
|
||||
conv_param->pad_l_ = deconv_lite_primitive->PadLeft();
|
||||
conv_param->pad_r_ = deconv_lite_primitive->PadRight();
|
||||
conv_param->pad_h_ = deconv_lite_primitive->PadH();
|
||||
conv_param->pad_w_ = deconv_lite_primitive->PadW();
|
||||
conv_param->dilation_h_ = conv_primitive->GetDilateH();
|
||||
conv_param->dilation_w_ = conv_primitive->GetDilateW();
|
||||
auto act_type = conv_primitive->GetActivationType();
|
||||
switch (act_type) {
|
||||
case schema::ActivationType_RELU:
|
||||
conv_param->is_relu_ = true;
|
||||
conv_param->is_relu6_ = false;
|
||||
conv_param->act_type_ = ActType_Relu;
|
||||
break;
|
||||
case schema::ActivationType_RELU6:
|
||||
conv_param->is_relu_ = false;
|
||||
conv_param->is_relu6_ = true;
|
||||
conv_param->act_type_ = ActType_Relu6;
|
||||
break;
|
||||
default:
|
||||
conv_param->is_relu_ = false;
|
||||
conv_param->is_relu6_ = false;
|
||||
conv_param->act_type_ = ActType_No;
|
||||
break;
|
||||
}
|
||||
return reinterpret_cast<OpParameter *>(conv_param);
|
||||
|
|
|
@ -322,10 +322,12 @@ int ConvolutionBaseCPUKernel::SetQuantParam() {
|
|||
return ret;
|
||||
}
|
||||
// now only consider per tensor for output
|
||||
CalculateActivationRangeQuantized(
|
||||
conv_param_->is_relu_, conv_param_->is_relu6_, conv_param_->conv_quant_arg_.output_quant_args_[0].zp_,
|
||||
conv_param_->conv_quant_arg_.output_quant_args_[0].scale_, &conv_param_->conv_quant_arg_.out_act_min_[0],
|
||||
&conv_param_->conv_quant_arg_.out_act_max_[0]);
|
||||
bool relu = conv_param_->act_type_ == ActType_Relu;
|
||||
bool relu6 = conv_param_->act_type_ == ActType_Relu6;
|
||||
CalculateActivationRangeQuantized(relu, relu6, conv_param_->conv_quant_arg_.output_quant_args_[0].zp_,
|
||||
conv_param_->conv_quant_arg_.output_quant_args_[0].scale_,
|
||||
&conv_param_->conv_quant_arg_.out_act_min_[0],
|
||||
&conv_param_->conv_quant_arg_.out_act_max_[0]);
|
||||
|
||||
return RET_OK;
|
||||
}
|
||||
|
|
|
@ -38,8 +38,7 @@ int Convolution1x1FP16CPUKernel::InitMatmulParam() {
|
|||
matmul_param_->deep_ = conv_param_->input_channel_;
|
||||
matmul_param_->row_16_ = UP_ROUND(matmul_param_->row_, C16NUM);
|
||||
matmul_param_->col_8_ = UP_ROUND(matmul_param_->col_, C8NUM);
|
||||
matmul_param_->act_type_ = (conv_param_->is_relu6_) ? ActType_Relu6 : ActType_No;
|
||||
matmul_param_->act_type_ = (conv_param_->is_relu_) ? ActType_Relu : matmul_param_->act_type_;
|
||||
matmul_param_->act_type_ = conv_param_->act_type_;
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
@ -57,7 +56,7 @@ Convolution1x1FP16CPUKernel::~Convolution1x1FP16CPUKernel() {
|
|||
}
|
||||
|
||||
int Convolution1x1FP16CPUKernel::InitConv1x1Param() {
|
||||
pre_trans_input_ = (conv_param_->pad_h_ != 0 || conv_param_->pad_w_ != 0 || conv_param_->stride_h_ != 1 ||
|
||||
pre_trans_input_ = (conv_param_->pad_u_ != 0 || conv_param_->pad_l_ != 0 || conv_param_->stride_h_ != 1 ||
|
||||
conv_param_->stride_w_ != 1);
|
||||
|
||||
thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, C8NUM));
|
||||
|
|
|
@ -237,8 +237,8 @@ int Convolution3x3FP16CPUKernel::Run() {
|
|||
}
|
||||
|
||||
// get real output
|
||||
bool relu = conv_param_->is_relu_;
|
||||
bool relu6 = conv_param_->is_relu6_;
|
||||
bool relu = conv_param_->act_type_ == ActType_Relu;
|
||||
bool relu6 = conv_param_->act_type_ == ActType_Relu6;
|
||||
if (relu) {
|
||||
UnPack3x3ReluOutputFp16(tmp_out_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
|
||||
conv_param_->output_w_, conv_param_->output_channel_);
|
||||
|
|
|
@ -391,10 +391,10 @@ int ConvolutionWinogradFP16CPUKernel::Run() {
|
|||
}
|
||||
|
||||
// get real output
|
||||
if (conv_param_->is_relu_) {
|
||||
if (conv_param_->act_type_ == ActType_Relu) {
|
||||
UnPackWinogradReluOutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
|
||||
conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
|
||||
} else if (conv_param_->is_relu6_) {
|
||||
} else if (conv_param_->act_type_ == ActType_Relu6) {
|
||||
UnPackWinogradRelu6OutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
|
||||
conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
|
||||
} else {
|
||||
|
|
|
@ -232,34 +232,31 @@ kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector<lite::tensor::Ten
|
|||
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
|
||||
int kernel_h = conv_param->kernel_h_;
|
||||
int kernel_w = conv_param->kernel_w_;
|
||||
int stride_h = conv_param->stride_h_;
|
||||
int stride_w = conv_param->stride_w_;
|
||||
int dilation_h = conv_param->dilation_h_;
|
||||
int dilation_w = conv_param->dilation_w_;
|
||||
conv_param->input_h_ = inputs.front()->Height();
|
||||
conv_param->input_w_ = inputs.front()->Width();
|
||||
conv_param->input_channel_ = inputs.front()->Channel();
|
||||
conv_param->output_h_ = outputs.front()->Height();
|
||||
conv_param->output_w_ = outputs.front()->Width();
|
||||
conv_param->output_channel_ = outputs.front()->Channel();
|
||||
conv_param->op_parameter_.thread_num_ = ctx->thread_num_;
|
||||
bool use_winograd = false;
|
||||
bool use_sw = false;
|
||||
int out_unit;
|
||||
InputTransformUnitFunc input_trans_func = nullptr;
|
||||
OutputTransformUnitFunc output_trans_func = nullptr;
|
||||
if (primitive != nullptr && primitive->GetInferFlag()) {
|
||||
CheckIfUseWinograd(&use_winograd, &out_unit, conv_param, input_trans_func, output_trans_func);
|
||||
use_sw = CheckIfUseSlideWindow(conv_param);
|
||||
}
|
||||
|
||||
kernel::LiteKernel *kernel;
|
||||
if (kernel_h == 1 && kernel_w == 1) {
|
||||
kernel = new (std::nothrow) kernel::Convolution1x1CPUKernel(op_parameter, inputs, outputs, ctx, primitive);
|
||||
} else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) {
|
||||
kernel = new (std::nothrow) kernel::Convolution3x3CPUKernel(op_parameter, inputs, outputs, ctx, primitive);
|
||||
} else if (use_winograd) {
|
||||
kernel =
|
||||
new (std::nothrow) kernel::ConvolutionWinogradCPUKernel(op_parameter, inputs, outputs, ctx, primitive, out_unit);
|
||||
} else if (use_sw) {
|
||||
kernel = new (std::nothrow) kernel::ConvolutionSWCPUKernel(op_parameter, inputs, outputs, ctx, primitive);
|
||||
if (kernel_h == 3 && kernel_w == 3 && out_unit == 2) {
|
||||
kernel = new (std::nothrow) kernel::Convolution3x3CPUKernel(op_parameter, inputs, outputs, ctx, primitive);
|
||||
} else {
|
||||
kernel = new (std::nothrow)
|
||||
kernel::ConvolutionWinogradCPUKernel(op_parameter, inputs, outputs, ctx, primitive, out_unit);
|
||||
}
|
||||
} else {
|
||||
kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, primitive);
|
||||
}
|
||||
|
|
|
@ -65,8 +65,7 @@ void Convolution1x1CPUKernel::InitConv1x1MatmulParam() {
|
|||
matmul_param_->deep_ = conv_param_->input_channel_;
|
||||
matmul_param_->row_12_ = UP_ROUND(matmul_param_->row_, C12NUM);
|
||||
matmul_param_->col_8_ = UP_ROUND(matmul_param_->col_, C8NUM);
|
||||
matmul_param_->act_type_ = (conv_param_->is_relu6_) ? ActType_Relu6 : ActType_No;
|
||||
matmul_param_->act_type_ = (conv_param_->is_relu_) ? ActType_Relu : matmul_param_->act_type_;
|
||||
matmul_param_->act_type_ = conv_param_->act_type_;
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -98,7 +97,7 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() {
|
|||
}
|
||||
|
||||
int Convolution1x1CPUKernel::InitConv1x1Param() {
|
||||
pre_trans_input_ = (conv_param_->pad_h_ != 0 || conv_param_->pad_w_ != 0 || conv_param_->stride_h_ != 1 ||
|
||||
pre_trans_input_ = (conv_param_->pad_u_ != 0 || conv_param_->pad_l_ != 0 || conv_param_->stride_h_ != 1 ||
|
||||
conv_param_->stride_w_ != 1);
|
||||
|
||||
thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, C8NUM));
|
||||
|
|
|
@ -94,7 +94,9 @@ int Convolution3x3CPUKernel::InitWeightBias() {
|
|||
}
|
||||
|
||||
int Convolution3x3CPUKernel::InitTmpBuffer() {
|
||||
int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
|
||||
int oC4 = UP_DIV(conv_param_->output_channel_, C4NUM);
|
||||
int oC8 = UP_DIV(conv_param_->output_channel_, C8NUM);
|
||||
const int k_plane = 16;
|
||||
MS_ASSERT(ctx_->allocator != nullptr);
|
||||
|
||||
|
@ -105,13 +107,20 @@ int Convolution3x3CPUKernel::InitTmpBuffer() {
|
|||
return RET_ERROR;
|
||||
}
|
||||
|
||||
size_t tmp_dst_buffer_size = thread_count_ * TILE_NUM * k_plane * oC4 * C4NUM * sizeof(float);
|
||||
size_t tmp_dst_buffer_size = thread_count_ * C12NUM * k_plane * oC8 * C8NUM * sizeof(float);
|
||||
tmp_dst_buffer_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(tmp_dst_buffer_size));
|
||||
if (tmp_dst_buffer_ == nullptr) {
|
||||
MS_LOG(ERROR) << "malloc tmp_dst_buffer_ failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
size_t col_buffer_size = thread_count_ * C12NUM * C4NUM * ic4 * sizeof(float);
|
||||
col_buffer_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(col_buffer_size));
|
||||
if (col_buffer_ == nullptr) {
|
||||
MS_LOG(ERROR) << "malloc col_buffer_ failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
size_t nc4hw4_out_size =
|
||||
oC4 * C4NUM * conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * sizeof(float);
|
||||
nc4hw4_out_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(nc4hw4_out_size));
|
||||
|
@ -124,6 +133,7 @@ int Convolution3x3CPUKernel::InitTmpBuffer() {
|
|||
tmp_buffer_address_list_[1] = block_unit_buffer_;
|
||||
tmp_buffer_address_list_[2] = tmp_dst_buffer_;
|
||||
tmp_buffer_address_list_[3] = nc4hw4_out_;
|
||||
tmp_buffer_address_list_[4] = col_buffer_;
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
@ -182,7 +192,7 @@ int Convolution3x3CPUKernel::ReSize() {
|
|||
}
|
||||
memset(nhwc4_input_, 0, nhwc4_input_size);
|
||||
|
||||
size_t tile_buffer_size = thread_count_ * TILE_NUM * C16NUM * iC4 * C4NUM * sizeof(float);
|
||||
size_t tile_buffer_size = thread_count_ * C12NUM * C16NUM * iC4 * C4NUM * sizeof(float);
|
||||
tile_buffer_ = reinterpret_cast<float *>(malloc(tile_buffer_size));
|
||||
if (tile_buffer_ == nullptr) {
|
||||
MS_LOG(ERROR) << "malloc tile buffer failed.";
|
||||
|
@ -237,8 +247,8 @@ int Convolution3x3CPUKernel::Run() {
|
|||
return RET_ERROR;
|
||||
}
|
||||
|
||||
auto is_relu = conv_param_->is_relu_;
|
||||
auto is_relu6 = conv_param_->is_relu6_;
|
||||
auto is_relu = conv_param_->act_type_ == ActType_Relu;
|
||||
auto is_relu6 = conv_param_->act_type_ == ActType_Relu6;
|
||||
auto output_addr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->Data());
|
||||
if (is_relu) {
|
||||
PackNC4HW4ToNHWCReluFp32(nc4hw4_out_, output_addr, conv_param_->output_batch_,
|
||||
|
|
|
@ -60,14 +60,19 @@ class Convolution3x3CPUKernel : public ConvolutionBaseCPUKernel {
|
|||
ctx_->allocator->Free(nc4hw4_out_);
|
||||
nc4hw4_out_ = nullptr;
|
||||
}
|
||||
if (col_buffer_ != nullptr) {
|
||||
ctx_->allocator->Free(col_buffer_);
|
||||
col_buffer_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
float *transformed_filter_addr_ = nullptr;
|
||||
float *tile_buffer_ = nullptr;
|
||||
float *block_unit_buffer_ = nullptr;
|
||||
float *tmp_dst_buffer_ = nullptr;
|
||||
float *col_buffer_ = nullptr;
|
||||
float *nc4hw4_out_ = nullptr;
|
||||
TmpBufferAddress tmp_buffer_address_list_[4];
|
||||
TmpBufferAddress tmp_buffer_address_list_[5];
|
||||
GEMM_FUNC_FP32 gemm_func_ = nullptr;
|
||||
};
|
||||
void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_param, int oc_block, int oc_block_num);
|
||||
|
|
|
@ -76,7 +76,7 @@ int WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int
|
|||
int out_c_block = i / oc_block;
|
||||
int out_c_res = i % oc_block;
|
||||
int input_oz_offset = i * kernel_unit * kernel_unit * channel_in;
|
||||
int output_oz_offset = out_c_block * strides[1] * input_unit * input_unit + out_c_res;
|
||||
int output_oz_offset = out_c_block * strides[1] + out_c_res;
|
||||
for (int j = 0; j < channel_in; j++) {
|
||||
int ic4_block = j / C4NUM;
|
||||
int ic4_res = j % C4NUM;
|
||||
|
@ -93,7 +93,7 @@ int WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int
|
|||
MatrixMultiply(tmp_data, matrix_gt_data, trans_out_data, input_unit, kernel_unit, input_unit, row);
|
||||
|
||||
for (int z = 0; z < input_unit_square; z++) {
|
||||
int output_xy_offset = output_iz_offset + z * strides[1];
|
||||
int output_xy_offset = output_iz_offset + z * strides[0];
|
||||
*(trans_weight_data + output_xy_offset) = trans_out_data[z];
|
||||
}
|
||||
}
|
||||
|
@ -151,7 +151,7 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() {
|
|||
|
||||
int ConvolutionWinogradCPUKernel::MallocFilterMatrix(int oc_block, int oc_block_num) {
|
||||
int channel_in = conv_param_->input_channel_;
|
||||
int ic4 = UP_DIV(channel_in, BLOCK);
|
||||
int ic4 = UP_DIV(channel_in, C4NUM);
|
||||
|
||||
// set data
|
||||
auto trans_matrix_data_size = input_unit_ * input_unit_ * ic4 * C4NUM * oc_block_num * oc_block * sizeof(float);
|
||||
|
@ -196,10 +196,12 @@ int ConvolutionWinogradCPUKernel::InitTmpBuffer() {
|
|||
int output_h = conv_param_->output_h_;
|
||||
int output_w = conv_param_->output_w_;
|
||||
int oc4 = UP_DIV(channel_out, C4NUM);
|
||||
int oc8 = UP_DIV(channel_out, C8NUM);
|
||||
int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
|
||||
MS_ASSERT(ctx_->allocator != nullptr);
|
||||
|
||||
gemm_out_ = reinterpret_cast<float *>(
|
||||
ctx_->allocator->Malloc(thread_count_ * TILE_NUM * input_unit_ * input_unit_ * oc4 * C4NUM * sizeof(float)));
|
||||
ctx_->allocator->Malloc(thread_count_ * C12NUM * input_unit_ * input_unit_ * oc8 * C8NUM * sizeof(float)));
|
||||
if (gemm_out_ == nullptr) {
|
||||
MS_LOG(ERROR) << "malloc gemm_out_ failed.";
|
||||
return RET_ERROR;
|
||||
|
@ -222,10 +224,18 @@ int ConvolutionWinogradCPUKernel::InitTmpBuffer() {
|
|||
return RET_ERROR;
|
||||
}
|
||||
|
||||
col_buffer_ =
|
||||
reinterpret_cast<float *>(ctx_->allocator->Malloc(thread_count_ * C12NUM * ic4 * C4NUM * sizeof(float)));
|
||||
if (col_buffer_ == nullptr) {
|
||||
MS_LOG(ERROR) << "malloc col_buffer_ failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
tmp_buffer_address_list_[0] = trans_input_;
|
||||
tmp_buffer_address_list_[1] = gemm_out_;
|
||||
tmp_buffer_address_list_[2] = tmp_out_data_;
|
||||
tmp_buffer_address_list_[3] = tmp_data_;
|
||||
tmp_buffer_address_list_[4] = col_buffer_;
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
@ -306,7 +316,7 @@ int ConvolutionWinogradCPUKernel::ReSize() {
|
|||
}
|
||||
memset(nhwc4_input_, 0, nhwc4_input_size);
|
||||
|
||||
size_t tile_buffer_size = thread_count_ * TILE_NUM * input_unit_ * input_unit_ * ic4 * C4NUM * sizeof(float);
|
||||
size_t tile_buffer_size = thread_count_ * C12NUM * input_unit_ * input_unit_ * ic4 * C4NUM * sizeof(float);
|
||||
trans_input_ = reinterpret_cast<float *>(malloc(tile_buffer_size));
|
||||
if (trans_input_ == nullptr) {
|
||||
MS_LOG(ERROR) << "malloc trans_input_ failed.";
|
||||
|
@ -370,10 +380,10 @@ int ConvolutionWinogradCPUKernel::Run() {
|
|||
// get real output
|
||||
auto out_tensor = out_tensors_.front();
|
||||
auto out_data = reinterpret_cast<float *>(out_tensor->Data());
|
||||
if (conv_param_->is_relu_) {
|
||||
if (conv_param_->act_type_ == ActType_Relu) {
|
||||
UnPackWinogradReluOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
|
||||
conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
|
||||
} else if (conv_param_->is_relu6_) {
|
||||
} else if (conv_param_->act_type_ == ActType_Relu6) {
|
||||
UnPackWinogradRelu6Output(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
|
||||
conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
|
||||
} else {
|
||||
|
|
|
@ -66,6 +66,10 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
|
|||
ctx_->allocator->Free(tmp_out_data_);
|
||||
tmp_out_data_ = nullptr;
|
||||
}
|
||||
if (col_buffer_ != nullptr) {
|
||||
ctx_->allocator->Free(col_buffer_);
|
||||
col_buffer_ = nullptr;
|
||||
}
|
||||
}
|
||||
int kernel_unit_;
|
||||
int input_unit_;
|
||||
|
@ -74,6 +78,7 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
|
|||
float *trans_input_ = nullptr;
|
||||
float *gemm_out_ = nullptr;
|
||||
float *tmp_out_data_ = nullptr;
|
||||
float *col_buffer_ = nullptr;
|
||||
Matrix *trans_weight_ = nullptr;
|
||||
InputTransformUnitFunc input_trans_func_;
|
||||
OutputTransformUnitFunc output_trans_func_;
|
||||
|
|
|
@ -146,7 +146,7 @@ int Convolution1x1Int8CPUKernel::Init() {
|
|||
}
|
||||
|
||||
int Convolution1x1Int8CPUKernel::InitParam() {
|
||||
pre_trans_input_ = (conv_param_->pad_h_ != 0 || conv_param_->pad_w_ != 0 || conv_param_->stride_h_ != 1 ||
|
||||
pre_trans_input_ = (conv_param_->pad_u_ != 0 || conv_param_->pad_l_ != 0 || conv_param_->stride_h_ != 1 ||
|
||||
conv_param_->stride_w_ != 1);
|
||||
|
||||
matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_;
|
||||
|
|
|
@ -36,7 +36,7 @@ int Conv2dTransposeOpenCLKernel::Init() {
|
|||
MS_LOG(ERROR) << "only support kh=kw=2 and stride_h=stride_w=2.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (param->pad_h_ != 0 || param->pad_w_ != 0) {
|
||||
if (param->pad_u_ != 0 || param->pad_l_ != 0) {
|
||||
MS_LOG(ERROR) << "only support pad =0.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
@ -170,7 +170,7 @@ int Conv2dTransposeOpenCLKernel::Run() {
|
|||
int co = out_tensors_[0]->Channel();
|
||||
int kh = param->kernel_h_;
|
||||
int kw = param->kernel_w_;
|
||||
int pad = param->pad_h_;
|
||||
int pad = param->pad_u_;
|
||||
int oh = out_tensors_[0]->Height();
|
||||
int ow = out_tensors_[0]->Width();
|
||||
int h = in_tensors_[0]->Height();
|
||||
|
|
|
@ -382,9 +382,9 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolution() {
|
|||
" }\n\n";
|
||||
code += " FLT4 out0_c4_bias = out0_c4 + bias[co_slice];\n";
|
||||
|
||||
if (param->is_relu_) {
|
||||
if (param->act_type_ == ActType_Relu) {
|
||||
code += " out0_c4_bias = max(out0_c4_bias, (FLT4)(0.0f));\n";
|
||||
} else if (param->is_relu6_) {
|
||||
} else if (param->act_type_ == ActType_Relu6) {
|
||||
code += " out0_c4_bias = clamp(out0_c4_bias, (FLT4)(0.0f), (FLT4)(6.0f));\n";
|
||||
}
|
||||
|
||||
|
@ -609,9 +609,9 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
|
|||
" acc += bias[slice];\n";
|
||||
|
||||
auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
|
||||
if (param->is_relu_) {
|
||||
if (param->act_type_ == ActType_Relu) {
|
||||
code += " acc = max(acc, (float4)(0.0f));\n";
|
||||
} else if (param->is_relu6_) {
|
||||
} else if (param->act_type_ == ActType_Relu6) {
|
||||
code += " acc = clamp(acc, (float4)(0.0f), (float4)(6.0f));\n";
|
||||
}
|
||||
|
||||
|
|
|
@ -163,7 +163,7 @@ int DepthwiseConv2dOpenCLKernel::Run() {
|
|||
float relu_clip1 = 6.0;
|
||||
cl_int2 kernel_size = {parameter->kernel_h_, parameter->kernel_w_};
|
||||
cl_int2 stride = {parameter->stride_h_, parameter->stride_w_};
|
||||
cl_int2 padding = {-parameter->pad_h_, -parameter->pad_w_};
|
||||
cl_int2 padding = {-parameter->pad_u_, -parameter->pad_l_};
|
||||
cl_int2 dilation = {parameter->dilation_h_, parameter->dilation_w_};
|
||||
cl_int4 src_size = {in_tensors_[0]->Width(), in_tensors_[0]->Height(), (cl_int)CI4, in_tensors_[0]->Batch()};
|
||||
cl_int4 dst_size = {(cl_int)out_tensors_[0]->Width(), (cl_int)out_tensors_[0]->Height(), (cl_int)CO4,
|
||||
|
|
|
@ -47,8 +47,8 @@ void InitConvParamPack(ConvParameter *conv_param) {
|
|||
conv_param->dilation_h_ = 1;
|
||||
conv_param->dilation_w_ = 1;
|
||||
|
||||
conv_param->pad_h_ = 1;
|
||||
conv_param->pad_w_ = 1;
|
||||
conv_param->pad_u_ = 1;
|
||||
conv_param->pad_l_ = 1;
|
||||
}
|
||||
|
||||
TEST_F(TestPack, PackInputFp32) {
|
||||
|
|
|
@ -50,8 +50,8 @@ void InitConvParamGroup1Fp16(ConvParameter *conv_param) {
|
|||
conv_param->dilation_h_ = 1;
|
||||
conv_param->dilation_w_ = 1;
|
||||
|
||||
conv_param->pad_h_ = 1;
|
||||
conv_param->pad_w_ = 1;
|
||||
conv_param->pad_u_ = 1;
|
||||
conv_param->pad_l_ = 1;
|
||||
conv_param->thread_num_ = 1;
|
||||
}
|
||||
|
||||
|
@ -75,8 +75,8 @@ void InitConvParamGroup2Fp16(ConvParameter *conv_param) {
|
|||
conv_param->dilation_h_ = 1;
|
||||
conv_param->dilation_w_ = 1;
|
||||
|
||||
conv_param->pad_h_ = 1;
|
||||
conv_param->pad_w_ = 1;
|
||||
conv_param->pad_u_ = 1;
|
||||
conv_param->pad_l_ = 1;
|
||||
conv_param->thread_num_ = 1;
|
||||
}
|
||||
|
||||
|
|
|
@ -50,7 +50,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack1) {
|
|||
conv_param->output_h_ = 4;
|
||||
conv_param->output_w_ = 5;
|
||||
conv_param->stride_h_ = conv_param->stride_w_ = 4;
|
||||
conv_param->pad_h_ = conv_param->pad_w_ = 2;
|
||||
conv_param->pad_u_ = conv_param->pad_l_ = 2;
|
||||
|
||||
float out[20] = {0};
|
||||
Conv1x1InputPack(in, out, conv_param, sizeof(float));
|
||||
|
@ -91,7 +91,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack2) {
|
|||
conv_param->output_h_ = 7;
|
||||
conv_param->output_w_ = 4;
|
||||
conv_param->stride_h_ = conv_param->stride_w_ = 3;
|
||||
conv_param->pad_h_ = conv_param->pad_w_ = 0;
|
||||
conv_param->pad_u_ = conv_param->pad_l_ = 0;
|
||||
|
||||
float out[28] = {0};
|
||||
Conv1x1InputPack(in, out, conv_param, sizeof(float));
|
||||
|
@ -105,7 +105,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack3) {
|
|||
conv_param->input_h_ = conv_param->input_w_ = 3;
|
||||
conv_param->output_h_ = conv_param->output_w_ = 3;
|
||||
conv_param->stride_h_ = conv_param->stride_w_ = 2;
|
||||
conv_param->pad_h_ = conv_param->pad_w_ = 1;
|
||||
conv_param->pad_u_ = conv_param->pad_l_ = 1;
|
||||
|
||||
float in[] = {1.6767339, 12.25904, 19.018835, 3.0790641, -9.252135, -8.685675, 3.6115494, 3.2282279, 17.025112,
|
||||
-5.052577, 12.750252, 12.701241, -8.9477215, -9.080522, 19.03931, -6.501229, -4.122992, 9.540845};
|
||||
|
@ -124,7 +124,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack4) {
|
|||
conv_param->input_h_ = conv_param->input_w_ = 3;
|
||||
conv_param->output_h_ = conv_param->output_w_ = 3;
|
||||
conv_param->stride_h_ = conv_param->stride_w_ = 2;
|
||||
conv_param->pad_h_ = conv_param->pad_w_ = 1;
|
||||
conv_param->pad_u_ = conv_param->pad_l_ = 1;
|
||||
float in[] = {4.1795, 13.142, -3.593, 16.505, 19.899, 8.5562, 19.969, -6.235, -2.380, -9.027, 9.5542,
|
||||
18.974, 23.622, 8.3608, 47.325, -14.36, 15.370, 4.3049, -0.784, 37.925, -0.081, 6.1298,
|
||||
0.6721, -1.517, 37.998, 13.719, 11.029, 1.7127, -1.770, 41.903, 9.0560, 14.988, 3.1866,
|
||||
|
@ -281,8 +281,8 @@ int Conv1x1TestInit1(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<l
|
|||
conv_param->kernel_h_ = conv_param->kernel_w_ = 1;
|
||||
conv_param->stride_h_ = conv_param->stride_w_ = 2;
|
||||
conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
|
||||
conv_param->pad_h_ = conv_param->pad_w_ = 1;
|
||||
conv_param->is_relu_ = conv_param->is_relu6_ = false;
|
||||
conv_param->pad_u_ = conv_param->pad_l_ = 1;
|
||||
conv_param->act_type_ = ActType_No;
|
||||
return out_t->ElementsNum();
|
||||
}
|
||||
|
||||
|
@ -348,9 +348,8 @@ int Conv1x1TestInit2(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<l
|
|||
conv_param->kernel_h_ = conv_param->kernel_w_ = 1;
|
||||
conv_param->stride_h_ = conv_param->stride_w_ = 1;
|
||||
conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
|
||||
conv_param->pad_h_ = conv_param->pad_w_ = 0;
|
||||
conv_param->is_relu_ = false;
|
||||
conv_param->is_relu6_ = false;
|
||||
conv_param->pad_u_ = conv_param->pad_l_ = 0;
|
||||
conv_param->act_type_ = ActType_No;
|
||||
return out_t->ElementsNum();
|
||||
}
|
||||
|
||||
|
|
|
@ -47,8 +47,8 @@ void InitConvDwParam(ConvParameter *conv_param) {
|
|||
conv_param->dilation_h_ = 1;
|
||||
conv_param->dilation_w_ = 1;
|
||||
|
||||
conv_param->pad_h_ = 1;
|
||||
conv_param->pad_w_ = 1;
|
||||
conv_param->pad_u_ = 1;
|
||||
conv_param->pad_l_ = 1;
|
||||
}
|
||||
|
||||
void InitConvDwCreator(std::vector<lite::tensor::Tensor *> *inputs, std::vector<lite::tensor::Tensor *> *outputs,
|
||||
|
|
|
@ -468,7 +468,7 @@ int DeConvTestInit1(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<li
|
|||
conv_param->kernel_h_ = conv_param->kernel_w_ = 3;
|
||||
conv_param->stride_h_ = conv_param->stride_w_ = 2;
|
||||
conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
|
||||
conv_param->pad_h_ = conv_param->pad_w_ = 1;
|
||||
conv_param->pad_u_ = conv_param->pad_l_ = 1;
|
||||
return out_t->ElementsNum();
|
||||
}
|
||||
|
||||
|
@ -537,7 +537,7 @@ int DeConvTestInit2(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<li
|
|||
conv_param->kernel_h_ = conv_param->kernel_w_ = 3;
|
||||
conv_param->stride_h_ = conv_param->stride_w_ = 2;
|
||||
conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
|
||||
conv_param->pad_h_ = conv_param->pad_w_ = 1;
|
||||
conv_param->pad_u_ = conv_param->pad_l_ = 1;
|
||||
return out_t->ElementsNum();
|
||||
}
|
||||
|
||||
|
@ -616,7 +616,7 @@ int DeConvTestInit3(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<li
|
|||
conv_param->kernel_h_ = conv_param->kernel_w_ = 2;
|
||||
conv_param->stride_h_ = conv_param->stride_w_ = 3;
|
||||
conv_param->dilation_h_ = conv_param->dilation_w_ = 2;
|
||||
conv_param->pad_h_ = conv_param->pad_w_ = 0;
|
||||
conv_param->pad_u_ = conv_param->pad_l_ = 0;
|
||||
return out_t->ElementsNum();
|
||||
}
|
||||
|
||||
|
@ -685,8 +685,8 @@ int DeConvTestInit4(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<li
|
|||
conv_param->kernel_h_ = conv_param->kernel_w_ = 3;
|
||||
conv_param->stride_h_ = conv_param->stride_w_ = 1;
|
||||
conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
|
||||
conv_param->pad_h_ = conv_param->pad_w_ = 0;
|
||||
conv_param->is_relu_ = conv_param->is_relu6_ = false;
|
||||
conv_param->pad_u_ = conv_param->pad_l_ = 0;
|
||||
conv_param->act_type_ = ActType_No;
|
||||
|
||||
return out_t->ElementsNum();
|
||||
}
|
||||
|
|
|
@ -52,12 +52,11 @@ void InitConvParamGroup1FP32(ConvParameter *conv_param) {
|
|||
conv_param->dilation_h_ = 1;
|
||||
conv_param->dilation_w_ = 1;
|
||||
|
||||
conv_param->pad_h_ = 1;
|
||||
conv_param->pad_w_ = 1;
|
||||
conv_param->pad_u_ = 1;
|
||||
conv_param->pad_l_ = 1;
|
||||
|
||||
conv_param->group_ = 1;
|
||||
conv_param->is_relu_ = false;
|
||||
conv_param->is_relu6_ = false;
|
||||
conv_param->act_type_ = ActType_No;
|
||||
conv_param->thread_num_ = 1;
|
||||
}
|
||||
|
||||
|
|
|
@ -34,7 +34,7 @@ TEST_F(TestConv1x1Int8, Input1x1PrePack1) {
|
|||
conv_param->input_h_ = conv_param->input_w_ = 3;
|
||||
conv_param->output_h_ = conv_param->output_w_ = 3;
|
||||
conv_param->stride_h_ = conv_param->stride_w_ = 2;
|
||||
conv_param->pad_h_ = conv_param->pad_w_ = 1;
|
||||
conv_param->pad_u_ = conv_param->pad_l_ = 1;
|
||||
int8_t in[] = {4, 13, -3, 16, 19, 8, 19, -6, -2, -9, 9, 18, 23, 8, 47, -14, 15, 4,
|
||||
-0, 37, -0, 6, 0, -1, 37, 13, 11, 1, -1, 41, 9, 14, 3, 0, 8, 9,
|
||||
14, -14, -8, -8, -8, 7, 19, 17, 13, 3, 9, 18, -1, -0, 18, 0, 4, -2};
|
||||
|
@ -61,7 +61,7 @@ TEST_F(TestConv1x1Int8, Input1x1PrePack2) {
|
|||
conv_param->output_h_ = 4;
|
||||
conv_param->output_w_ = 5;
|
||||
conv_param->stride_h_ = conv_param->stride_w_ = 4;
|
||||
conv_param->pad_h_ = conv_param->pad_w_ = 2;
|
||||
conv_param->pad_u_ = conv_param->pad_l_ = 2;
|
||||
|
||||
int8_t out[20] = {0};
|
||||
Conv1x1InputPack(in, out, conv_param, sizeof(int8_t));
|
||||
|
@ -111,8 +111,8 @@ int Conv1x1Int8TestInit1_perchannel(std::vector<lite::tensor::Tensor *> *inputs_
|
|||
conv_param->kernel_h_ = conv_param->kernel_w_ = 1;
|
||||
conv_param->stride_h_ = conv_param->stride_w_ = 1;
|
||||
conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
|
||||
conv_param->pad_h_ = conv_param->pad_w_ = 0;
|
||||
conv_param->is_relu_ = conv_param->is_relu6_ = false;
|
||||
conv_param->pad_u_ = conv_param->pad_l_ = 0;
|
||||
conv_param->act_type_ = ActType_No;
|
||||
return out_t->ElementsNum();
|
||||
}
|
||||
|
||||
|
@ -178,8 +178,8 @@ int Conv1x1Int8TestInit1(std::vector<lite::tensor::Tensor *> *inputs_, std::vect
|
|||
conv_param->kernel_h_ = conv_param->kernel_w_ = 1;
|
||||
conv_param->stride_h_ = conv_param->stride_w_ = 1;
|
||||
conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
|
||||
conv_param->pad_h_ = conv_param->pad_w_ = 0;
|
||||
conv_param->is_relu_ = conv_param->is_relu6_ = false;
|
||||
conv_param->pad_u_ = conv_param->pad_l_ = 0;
|
||||
conv_param->act_type_ = ActType_No;
|
||||
return out_t->ElementsNum();
|
||||
}
|
||||
|
||||
|
@ -253,8 +253,8 @@ int Conv1x1Int8TestInit2(std::vector<lite::tensor::Tensor *> *inputs_, std::vect
|
|||
conv_param->kernel_h_ = conv_param->kernel_w_ = 1;
|
||||
conv_param->stride_h_ = conv_param->stride_w_ = 1;
|
||||
conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
|
||||
conv_param->pad_h_ = conv_param->pad_w_ = 0;
|
||||
conv_param->is_relu_ = conv_param->is_relu6_ = false;
|
||||
conv_param->pad_u_ = conv_param->pad_l_ = 0;
|
||||
conv_param->act_type_ = ActType_No;
|
||||
return out_t->ElementsNum();
|
||||
}
|
||||
|
||||
|
|
|
@ -343,7 +343,7 @@ int DeConvInt8TestInit1(std::vector<lite::tensor::Tensor *> *inputs_, std::vecto
|
|||
PackNCHWToNHWCInt8(co_nchw, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel());
|
||||
|
||||
conv_param->kernel_h_ = conv_param->kernel_w_ = 3;
|
||||
conv_param->pad_h_ = conv_param->pad_w_ = 1;
|
||||
conv_param->pad_u_ = conv_param->pad_l_ = 1;
|
||||
conv_param->stride_h_ = conv_param->stride_w_ = 2;
|
||||
conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
|
||||
return out_t->ElementsNum();
|
||||
|
|
|
@ -119,8 +119,8 @@ void RunTestCase(const std::vector<int> shape, const std::vector<std::string> fi
|
|||
opParameter->kernel_w_ = kw;
|
||||
opParameter->stride_h_ = 2;
|
||||
opParameter->stride_w_ = 2;
|
||||
opParameter->pad_h_ = pad;
|
||||
opParameter->pad_w_ = pad;
|
||||
opParameter->pad_u_ = pad;
|
||||
opParameter->pad_l_ = pad;
|
||||
opParameter->input_channel_ = ci;
|
||||
opParameter->output_channel_ = co;
|
||||
auto op_kernel_ptr = std::make_unique<kernel::Conv2dTransposeOpenCLKernel>(
|
||||
|
|
|
@ -169,8 +169,8 @@ TEST_F(TestConvolutionDwOpenCL, NoPadNC4HW4Fp32) {
|
|||
conv_param->stride_w_ = 1;
|
||||
conv_param->dilation_h_ = 1;
|
||||
conv_param->dilation_w_ = 1;
|
||||
conv_param->pad_h_ = 0;
|
||||
conv_param->pad_w_ = 0;
|
||||
conv_param->pad_u_ = 0;
|
||||
conv_param->pad_l_ = 0;
|
||||
}
|
||||
|
||||
// nhwc
|
||||
|
@ -214,8 +214,8 @@ TEST_F(TestConvolutionDwOpenCL, PadNC4HW4Fp32) {
|
|||
conv_param->stride_w_ = 1;
|
||||
conv_param->dilation_h_ = 1;
|
||||
conv_param->dilation_w_ = 1;
|
||||
conv_param->pad_h_ = 1;
|
||||
conv_param->pad_w_ = 1;
|
||||
conv_param->pad_u_ = 1;
|
||||
conv_param->pad_l_ = 1;
|
||||
}
|
||||
|
||||
// nhwc
|
||||
|
@ -286,8 +286,8 @@ TEST_F(TestConvolutionDwOpenCL, NoPadNHWC4Fp32) {
|
|||
conv_param->stride_w_ = 1;
|
||||
conv_param->dilation_h_ = 1;
|
||||
conv_param->dilation_w_ = 1;
|
||||
conv_param->pad_h_ = 0;
|
||||
conv_param->pad_w_ = 0;
|
||||
conv_param->pad_u_ = 0;
|
||||
conv_param->pad_l_ = 0;
|
||||
}
|
||||
|
||||
// nhwc
|
||||
|
@ -331,8 +331,8 @@ TEST_F(TestConvolutionDwOpenCL, PadNHWC4Fp32) {
|
|||
conv_param->stride_w_ = 1;
|
||||
conv_param->dilation_h_ = 1;
|
||||
conv_param->dilation_w_ = 1;
|
||||
conv_param->pad_h_ = 1;
|
||||
conv_param->pad_w_ = 1;
|
||||
conv_param->pad_u_ = 1;
|
||||
conv_param->pad_l_ = 1;
|
||||
}
|
||||
|
||||
// nhwc
|
||||
|
@ -405,8 +405,8 @@ TEST_F(TestConvolutionDwOpenCL, ConvDwNoPadFp32) {
|
|||
conv_param->stride_w_ = 1;
|
||||
conv_param->dilation_h_ = 1;
|
||||
conv_param->dilation_w_ = 1;
|
||||
conv_param->pad_h_ = 0;
|
||||
conv_param->pad_w_ = 0;
|
||||
conv_param->pad_u_ = 0;
|
||||
conv_param->pad_l_ = 0;
|
||||
}
|
||||
|
||||
// nhwc
|
||||
|
@ -529,8 +529,8 @@ TEST_F(TestConvolutionDwOpenCL, ConvDwPadFp32) {
|
|||
conv_param->stride_w_ = 1;
|
||||
conv_param->dilation_h_ = 1;
|
||||
conv_param->dilation_w_ = 1;
|
||||
conv_param->pad_h_ = 1;
|
||||
conv_param->pad_w_ = 1;
|
||||
conv_param->pad_u_ = 1;
|
||||
conv_param->pad_l_ = 1;
|
||||
}
|
||||
|
||||
// nhwc
|
||||
|
@ -724,8 +724,8 @@ TEST_F(TestConvolutionDwOpenCL, ProfilingMobilenetv2) {
|
|||
conv_param->kernel_w_ = filter_shape[i][2];
|
||||
conv_param->stride_h_ = conv_param->output_h_ / conv_param->input_h_;
|
||||
conv_param->stride_w_ = conv_param->output_w_ / conv_param->input_w_;
|
||||
conv_param->pad_h_ = (conv_param->kernel_h_ - 1) / 2;
|
||||
conv_param->pad_w_ = (conv_param->kernel_w_ - 1) / 2;
|
||||
conv_param->pad_u_ = (conv_param->kernel_h_ - 1) / 2;
|
||||
conv_param->pad_l_ = (conv_param->kernel_w_ - 1) / 2;
|
||||
conv_param->dilation_h_ = 1;
|
||||
conv_param->dilation_w_ = 1;
|
||||
}
|
||||
|
@ -774,8 +774,8 @@ TEST_F(TestConvolutionDwOpenCL, Buffer2Image) {
|
|||
conv_param->kernel_w_ = filter_shape[2];
|
||||
conv_param->stride_h_ = conv_param->output_h_ / conv_param->input_h_;
|
||||
conv_param->stride_w_ = conv_param->output_w_ / conv_param->input_w_;
|
||||
conv_param->pad_h_ = (conv_param->kernel_h_ - 1) / 2;
|
||||
conv_param->pad_w_ = (conv_param->kernel_w_ - 1) / 2;
|
||||
conv_param->pad_u_ = (conv_param->kernel_h_ - 1) / 2;
|
||||
conv_param->pad_l_ = (conv_param->kernel_w_ - 1) / 2;
|
||||
conv_param->dilation_h_ = 1;
|
||||
conv_param->dilation_w_ = 1;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue