forked from mindspore-Ecosystem/mindspore
!5202 [MS][LITE][Develop]fp16 pooling optimize
Merge pull request !5202 from ling/sr
This commit is contained in:
commit
572c8fa22f
|
@ -1,103 +0,0 @@
|
|||
|
||||
#ifdef __aarch64__
|
||||
.text
|
||||
.align 5
|
||||
//.p2align 5,,15
|
||||
.global MatrixAdd
|
||||
#ifndef __APPLE__
|
||||
.type MatrixAdd, %function
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
//void MatrixAdd(const float* matDataA, const float* matDataB, float* matDataC,
|
||||
// size_t aStride, size_t bStride, size_t cStride, size_t width, size_t height)
|
||||
|
||||
//Auto: x0: matDataA, x1:matDataB, x2:matDatac,
|
||||
//x3:aStride, x4:bStride, x5:cStride, x6:width, x7:height
|
||||
|
||||
MatrixAdd:
|
||||
mov x12, #4 //sizeof(float)
|
||||
mul x3, x12, x3
|
||||
mul x4, x12, x4
|
||||
mul x5, x12, x5
|
||||
|
||||
loopH:
|
||||
mov x8, x0
|
||||
mov x9, x1
|
||||
mov x10, x2
|
||||
|
||||
mov x11, x6
|
||||
|
||||
loop16LineIn:
|
||||
cmp x11, #4
|
||||
blt L8
|
||||
sub x11, x11, #4
|
||||
ld1 {v0.4s, v1.4s}, [x0], #32
|
||||
ld1 {v2.4s, v3.4s}, [x1], #32
|
||||
|
||||
fadd v4.4s, v0.4s, v2.4s
|
||||
fadd v5.4s, v1.4s, v3.4s
|
||||
|
||||
ld1 {v6.4s, v7.4s}, [x0], #32
|
||||
ld1 {v8.4s, v9.4s}, [x1], #32
|
||||
|
||||
cmp x11, #4
|
||||
blt loop16LineOut
|
||||
|
||||
loop16:
|
||||
st1 {v4.4s, v5.4s}, [x2], #32
|
||||
fadd v10.4s, v6.4s, v8.4s
|
||||
fadd v11.4s, v7.4s, v9.4s
|
||||
ld1 {v0.4s, v1.4s}, [x0], #32
|
||||
ld1 {v2.4s, v3.4s}, [x1], #32
|
||||
|
||||
st1 {v10.4s, v11.4s}, [x2], #32
|
||||
fadd v4.4s, v0.4s, v2.4s
|
||||
fadd v5.4s, v1.4s, v3.4s
|
||||
ld1 {v6.4s, v7.4s}, [x0], #32
|
||||
ld1 {v8.4s, v9.4s}, [x1], #32
|
||||
|
||||
sub x11, x11, #4
|
||||
cmp x11, #4
|
||||
bge loop16
|
||||
|
||||
loop16LineOut:
|
||||
st1 {v4.4s, v5.4s}, [x2], #32
|
||||
fadd v10.4s, v6.4s, v8.4s
|
||||
fadd v11.4s, v7.4s, v9.4s
|
||||
st1 {v10.4s, v11.4s}, [x2], #32
|
||||
|
||||
|
||||
L8:
|
||||
cmp x11, #2
|
||||
blt L4
|
||||
ld1 {v0.4s, v1.4s}, [x0], #32
|
||||
ld1 {v2.4s, v3.4s}, [x1], #32
|
||||
fadd v4.4s, v0.4s, v2.4s
|
||||
fadd v5.4s, v1.4s, v3.4s
|
||||
sub x11, x11, #2
|
||||
st1 {v4.4s, v5.4s}, [x2], #32
|
||||
|
||||
|
||||
cmp x11, #0
|
||||
beq loop16EndLine
|
||||
|
||||
L4:
|
||||
ld1 {v0.4s}, [x0], #16
|
||||
ld1 {v1.4s}, [x1], #16
|
||||
fadd v0.4s, v0.4s, v1.4s
|
||||
sub x11, x11, #1
|
||||
st1 {v0.4s}, [x2], #16
|
||||
//bne L4
|
||||
|
||||
loop16EndLine:
|
||||
add x0, x8, x3
|
||||
add x1, x9, x4
|
||||
add x2, x10, x5
|
||||
|
||||
subs x7, x7, #1
|
||||
bne loopH
|
||||
|
||||
ret
|
||||
#endif
|
|
@ -1,105 +0,0 @@
|
|||
|
||||
#ifdef __aarch64__
|
||||
.text
|
||||
.align 5
|
||||
//.p2align 5,,15
|
||||
.global MatrixSub
|
||||
#ifndef __APPLE__
|
||||
.type MatrixSub, %function
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
//void MatrixSub(const float* matDataA, const float* matDataB, float* matDataC,
|
||||
// size_t aStride, size_t bStride, size_t cStride, size_t width, size_t height)
|
||||
|
||||
//Auto: x0: matDataA, x1:matDataB, x2:matDatac,
|
||||
//x3:aStride, x4:bStride, x5:cStride, x6:width, x7:height
|
||||
|
||||
MatrixSub:
|
||||
mov x12, #4 //sizeof(float)
|
||||
mul x3, x12, x3
|
||||
mul x4, x12, x4
|
||||
mul x5, x12, x5
|
||||
|
||||
loopH:
|
||||
mov x8, x0
|
||||
mov x9, x1
|
||||
mov x10, x2
|
||||
|
||||
mov x11, x6
|
||||
|
||||
loop16LineIn:
|
||||
cmp x11, #4
|
||||
blt L8
|
||||
sub x11, x11, #4
|
||||
ld1 {v0.4s, v1.4s}, [x0], #32
|
||||
ld1 {v2.4s, v3.4s}, [x1], #32
|
||||
|
||||
fsub v4.4s, v0.4s, v2.4s
|
||||
fsub v5.4s, v1.4s, v3.4s
|
||||
|
||||
ld1 {v6.4s, v7.4s}, [x0], #32
|
||||
ld1 {v8.4s, v9.4s}, [x1], #32
|
||||
|
||||
cmp x11, #4
|
||||
blt loop16LineOut
|
||||
|
||||
loop16:
|
||||
st1 {v4.4s, v5.4s}, [x2], #32
|
||||
fsub v10.4s, v6.4s, v8.4s
|
||||
fsub v11.4s, v7.4s, v9.4s
|
||||
ld1 {v0.4s, v1.4s}, [x0], #32
|
||||
ld1 {v2.4s, v3.4s}, [x1], #32
|
||||
|
||||
st1 {v10.4s, v11.4s}, [x2], #32
|
||||
fsub v4.4s, v0.4s, v2.4s
|
||||
fsub v5.4s, v1.4s, v3.4s
|
||||
ld1 {v6.4s, v7.4s}, [x0], #32
|
||||
ld1 {v8.4s, v9.4s}, [x1], #32
|
||||
|
||||
sub x11, x11, #4
|
||||
cmp x11, #4
|
||||
bge loop16
|
||||
|
||||
loop16LineOut:
|
||||
st1 {v4.4s, v5.4s}, [x2], #32
|
||||
fsub v10.4s, v6.4s, v8.4s
|
||||
fsub v11.4s, v7.4s, v9.4s
|
||||
st1 {v10.4s, v11.4s}, [x2], #32
|
||||
|
||||
L8:
|
||||
cmp x11, #2
|
||||
blt L4
|
||||
|
||||
ld1 {v0.4s, v1.4s}, [x0], #32
|
||||
ld1 {v2.4s, v3.4s}, [x1], #32
|
||||
|
||||
fsub v4.4s, v0.4s, v2.4s
|
||||
fsub v5.4s, v1.4s, v3.4s
|
||||
|
||||
sub x11, x11, #2
|
||||
st1 {v4.4s, v5.4s}, [x2], #32
|
||||
|
||||
|
||||
cmp x11, #0
|
||||
beq loop16EndLine
|
||||
|
||||
L4:
|
||||
ld1 {v0.4s}, [x0], #16
|
||||
ld1 {v1.4s}, [x1], #16
|
||||
fsub v0.4s, v0.4s, v1.4s
|
||||
sub x11, x11, #1
|
||||
st1 {v0.4s}, [x2], #16
|
||||
|
||||
|
||||
loop16EndLine:
|
||||
add x0, x8, x3
|
||||
add x1, x9, x4
|
||||
add x2, x10, x5
|
||||
|
||||
subs x7, x7, #1
|
||||
bne loopH
|
||||
|
||||
ret
|
||||
#endif
|
|
@ -51,6 +51,12 @@ void AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa
|
|||
int in_w_index = out_w_index * stride_w - pad_w;
|
||||
int in_h_index = out_h_index * stride_h - pad_h;
|
||||
int out_plane_offset = out_batch_offset + index * channel;
|
||||
|
||||
int real_win_h_start = MSMAX(0, -in_h_index);
|
||||
int real_win_h_end = MSMIN(win_h, in_h - in_h_index);
|
||||
int resl_win_w_start = MSMAX(0, -in_w_index);
|
||||
int real_win_w_end = MSMIN(win_w, in_w - in_w_index);
|
||||
|
||||
for (int j = 0; j < c8; j++) {
|
||||
int in_channel_offset = in_batch_offset + j * C8NUM;
|
||||
int out_channel_offset = out_plane_offset + j * C8NUM;
|
||||
|
@ -60,22 +66,17 @@ void AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa
|
|||
float16_t tmp_avg[8]{0};
|
||||
#endif
|
||||
int real_count = 0;
|
||||
for (int h = 0; h < win_h; h++) {
|
||||
for (int w = 0; w < win_w; w++) {
|
||||
if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 ||
|
||||
(in_w_index + w) >= in_w) {
|
||||
continue;
|
||||
} else {
|
||||
int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
|
||||
for (int h = real_win_h_start; h < real_win_h_end; h++) {
|
||||
for (int w = resl_win_w_start; w < real_win_w_end; w++) {
|
||||
int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
|
||||
#ifdef ENABLE_NEON
|
||||
tmp_avg = vaddq_f16(tmp_avg, vld1q_f16(input_ptr + in_offset));
|
||||
tmp_avg = vaddq_f16(tmp_avg, vld1q_f16(input_ptr + in_offset));
|
||||
#else
|
||||
for (int t = 0; t < 8; t++) {
|
||||
tmp_avg[t] += *(input_ptr + in_offset + t);
|
||||
}
|
||||
#endif
|
||||
++real_count;
|
||||
for (int t = 0; t < 8; t++) {
|
||||
tmp_avg[t] += *(input_ptr + in_offset + t);
|
||||
}
|
||||
#endif
|
||||
++real_count;
|
||||
} // win_w loop
|
||||
} // win_h loop
|
||||
#ifdef ENABLE_NEON
|
||||
|
@ -97,22 +98,17 @@ void AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa
|
|||
float16_t tmp_avg[4]{0};
|
||||
#endif
|
||||
int real_count = 0;
|
||||
for (int h = 0; h < win_h; h++) {
|
||||
for (int w = 0; w < win_w; w++) {
|
||||
if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 ||
|
||||
(in_w_index + w) >= in_w) {
|
||||
continue;
|
||||
} else {
|
||||
int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
|
||||
for (int h = real_win_h_start; h < real_win_h_end; h++) {
|
||||
for (int w = resl_win_w_start; w < real_win_w_end; w++) {
|
||||
int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
|
||||
#ifdef ENABLE_NEON
|
||||
tmp_avg = vadd_f16(tmp_avg, vld1_f16(input_ptr + in_offset));
|
||||
tmp_avg = vadd_f16(tmp_avg, vld1_f16(input_ptr + in_offset));
|
||||
#else
|
||||
for (int j = 0; j < C4NUM; ++j) {
|
||||
tmp_avg[j] += *(input_ptr + in_offset);
|
||||
}
|
||||
#endif
|
||||
++real_count;
|
||||
for (int j = 0; j < C4NUM; ++j) {
|
||||
tmp_avg[j] += *(input_ptr + in_offset);
|
||||
}
|
||||
#endif
|
||||
++real_count;
|
||||
} // win_w loop
|
||||
} // win_h loop
|
||||
#ifdef ENABLE_NEON
|
||||
|
@ -130,16 +126,11 @@ void AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa
|
|||
int out_channel_offset = out_plane_offset + k;
|
||||
float16_t tmp_avg = 0;
|
||||
int real_count = 0;
|
||||
for (int h = 0; h < win_h; h++) {
|
||||
for (int w = 0; w < win_w; w++) {
|
||||
if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 ||
|
||||
(in_w_index + w) >= in_w) {
|
||||
continue;
|
||||
} else {
|
||||
int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
|
||||
tmp_avg += *(input_ptr + in_offset);
|
||||
++real_count;
|
||||
}
|
||||
for (int h = real_win_h_start; h < real_win_h_end; h++) {
|
||||
for (int w = resl_win_w_start; w < real_win_w_end; w++) {
|
||||
int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
|
||||
tmp_avg += *(input_ptr + in_offset);
|
||||
++real_count;
|
||||
} // win_w loop
|
||||
} // win_h loop
|
||||
*(output_ptr + out_channel_offset) = tmp_avg / (float16_t)real_count;
|
||||
|
@ -148,7 +139,6 @@ void AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa
|
|||
} // out_plane loop
|
||||
} // out_batch loop
|
||||
}
|
||||
|
||||
void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingParameter *pooling_param, int task_id) {
|
||||
int stride_w = pooling_param->stride_w_;
|
||||
int stride_h = pooling_param->stride_h_;
|
||||
|
@ -183,6 +173,12 @@ void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa
|
|||
int in_w_index = out_w_index * stride_w - pad_w;
|
||||
int in_h_index = out_h_index * stride_h - pad_h;
|
||||
int out_plane_offset = out_batch_offset + index * channel;
|
||||
|
||||
int real_win_h_start = MSMAX(0, -in_h_index);
|
||||
int real_win_h_end = MSMIN(win_h, in_h - in_h_index);
|
||||
int resl_win_w_start = MSMAX(0, -in_w_index);
|
||||
int real_win_w_end = MSMIN(win_w, in_w - in_w_index);
|
||||
|
||||
for (int j = 0; j < c8; j++) {
|
||||
int in_channel_offset = in_batch_offset + j * C8NUM;
|
||||
int out_channel_offset = out_plane_offset + j * C8NUM;
|
||||
|
@ -191,21 +187,16 @@ void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa
|
|||
#else
|
||||
float16_t tmp_max[8]{-FLT_MAX};
|
||||
#endif
|
||||
for (int h = 0; h < win_h; h++) {
|
||||
for (int w = 0; w < win_w; w++) {
|
||||
if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 ||
|
||||
(in_w_index + w) >= in_w) {
|
||||
continue;
|
||||
} else {
|
||||
int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
|
||||
for (int h = real_win_h_start; h < real_win_h_end; h++) {
|
||||
for (int w = resl_win_w_start; w < real_win_w_end; w++) {
|
||||
int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
|
||||
#ifdef ENABLE_NEON
|
||||
tmp_max = vmaxq_f16(tmp_max, vld1q_f16(input_ptr + in_offset));
|
||||
tmp_max = vmaxq_f16(tmp_max, vld1q_f16(input_ptr + in_offset));
|
||||
#else
|
||||
for (int k = 0; k < C8NUM; k++) {
|
||||
tmp_max[k] = fmax(tmp_max[k], *(input_ptr + in_offset + k));
|
||||
}
|
||||
#endif
|
||||
for (int k = 0; k < C8NUM; k++) {
|
||||
tmp_max[k] = fmax(tmp_max[k], *(input_ptr + in_offset + k));
|
||||
}
|
||||
#endif
|
||||
} // win_w loop
|
||||
} // win_h loop
|
||||
#ifdef ENABLE_NEON
|
||||
|
@ -226,21 +217,16 @@ void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa
|
|||
#else
|
||||
float16_t tmp_max[4]{-FLT_MAX};
|
||||
#endif
|
||||
for (int h = 0; h < win_h; h++) {
|
||||
for (int w = 0; w < win_w; w++) {
|
||||
if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 ||
|
||||
(in_w_index + w) >= in_w) {
|
||||
continue;
|
||||
} else {
|
||||
int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
|
||||
for (int h = real_win_h_start; h < real_win_h_end; h++) {
|
||||
for (int w = resl_win_w_start; w < real_win_w_end; w++) {
|
||||
int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
|
||||
#ifdef ENABLE_NEON
|
||||
tmp_max = vmax_f16(tmp_max, vld1_f16(input_ptr + in_offset));
|
||||
tmp_max = vmax_f16(tmp_max, vld1_f16(input_ptr + in_offset));
|
||||
#else
|
||||
for (int k = 0; k < C4NUM; k++) {
|
||||
tmp_max[k] = fmax(tmp_max[k], *(input_ptr + in_offset + k));
|
||||
}
|
||||
#endif
|
||||
for (int k = 0; k < C4NUM; k++) {
|
||||
tmp_max[k] = fmax(tmp_max[k], *(input_ptr + in_offset + k));
|
||||
}
|
||||
#endif
|
||||
} // win_w loop
|
||||
} // win_h loop
|
||||
#ifdef ENABLE_NEON
|
||||
|
@ -257,15 +243,10 @@ void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa
|
|||
int in_channel_offset = in_batch_offset + k;
|
||||
int out_channel_offset = out_plane_offset + k;
|
||||
float16_t tmp_max = -FLT_MAX;
|
||||
for (int h = 0; h < win_h; h++) {
|
||||
for (int w = 0; w < win_w; w++) {
|
||||
if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 ||
|
||||
(in_w_index + w) >= in_w) {
|
||||
continue;
|
||||
} else {
|
||||
int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
|
||||
tmp_max = fmax(tmp_max, *(input_ptr + in_offset));
|
||||
}
|
||||
for (int h = real_win_h_start; h < real_win_h_end; h++) {
|
||||
for (int w = resl_win_w_start; w < real_win_w_end; w++) {
|
||||
int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
|
||||
tmp_max = fmax(tmp_max, *(input_ptr + in_offset));
|
||||
} // win_w loop
|
||||
} // win_h loop
|
||||
*(output_ptr + out_channel_offset) = tmp_max;
|
||||
|
|
|
@ -15,54 +15,6 @@
|
|||
*/
|
||||
|
||||
#include "nnacl/fp32/common_func.h"
|
||||
|
||||
#ifndef ENABLE_ARM64
|
||||
void MatrixAdd(const float *a_ptr, const float *b_ptr, float *dst, size_t a_stride, size_t b_stride, size_t c_stride,
|
||||
size_t row, size_t col) {
|
||||
for (int r = 0; r < row; r++) {
|
||||
for (int c = 0; c < col; c++) {
|
||||
int a_index = c * a_stride + r * C4NUM;
|
||||
int b_index = c * b_stride + r * C4NUM;
|
||||
int c_index = c * c_stride + r * C4NUM;
|
||||
for (int i = 0; i < C4NUM; i++) {
|
||||
dst[c_index + i] = a_ptr[a_index + i] + b_ptr[b_index + i];
|
||||
}
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
void MatrixSub(const float *a_ptr, const float *b_ptr, float *dst, size_t a_stride, size_t b_stride, size_t c_stride,
|
||||
size_t row, size_t col) {
|
||||
for (int r = 0; r < row; r++) {
|
||||
for (int c = 0; c < col; c++) {
|
||||
int a_index = c * a_stride + r * C4NUM;
|
||||
int b_index = c * b_stride + r * C4NUM;
|
||||
int c_index = c * c_stride + r * C4NUM;
|
||||
for (int i = 0; i < C4NUM; i++) {
|
||||
dst[c_index + i] = a_ptr[a_index + i] - b_ptr[b_index + i];
|
||||
}
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
void MatrixMultiAdd(float *c11, float *c12, float *c21, float *c22, float *x_ptr, size_t row, size_t col,
|
||||
size_t c_stride, size_t x_stride) {
|
||||
/* U2 = P1 + P6 */
|
||||
MatrixAdd(x_ptr, c12, c12, x_stride, c_stride, c_stride, row, col);
|
||||
/* U3 = U2 + P7 */
|
||||
MatrixAdd(c12, c21, c21, c_stride, c_stride, c_stride, row, col);
|
||||
/* U4 = U2 + P5 */
|
||||
MatrixAdd(c12, c22, c12, c_stride, c_stride, c_stride, row, col);
|
||||
/* U7 = U3 + P5 */
|
||||
MatrixAdd(c21, c22, c22, c_stride, c_stride, c_stride, row, col);
|
||||
/* U5 = U4 + P3 */
|
||||
MatrixAdd(c12, c11, c12, c_stride, c_stride, c_stride, row, col);
|
||||
return;
|
||||
}
|
||||
|
||||
void PostConvFuncComm(const float *src_ptr_, float *out_ptr, const float *bias_ptr, size_t output_channel,
|
||||
size_t plane_size, size_t stride, bool is_relu, bool is_relu6, int size) {
|
||||
for (int oc = 0; oc < output_channel; oc++) {
|
||||
|
|
|
@ -31,12 +31,6 @@ void PostConvFuncFp32C4(const float *c4_out_ptr, float *out_ptr, const float *bi
|
|||
size_t plane_size, size_t stride, bool is_relu, bool is_relu6);
|
||||
void PostConvFuncFp32C8(const float *c8_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel,
|
||||
size_t plane_size, size_t stride, bool is_relu, bool is_relu6);
|
||||
void MatrixAdd(const float *a_ptr, const float *b_ptr, float *dst, size_t a_stride, size_t b_stride, size_t c_stride,
|
||||
size_t row, size_t col);
|
||||
void MatrixSub(const float *a_ptr, const float *b_ptr, float *dst, size_t a_stride, size_t b_stride, size_t c_stride,
|
||||
size_t row, size_t col);
|
||||
void MatrixMultiAdd(float *c11, float *c12, float *c21, float *c22, float *x_ptr, size_t row, size_t col,
|
||||
size_t c_stride, size_t x_stride);
|
||||
float ShortToFloat32(uint16_t srcValue);
|
||||
|
||||
uint16_t Float32ToShort(float srcValue);
|
||||
|
|
|
@ -17,7 +17,8 @@
|
|||
#include "nnacl/fp32/pooling.h"
|
||||
#include <float.h>
|
||||
|
||||
void AvgPooling(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id) {
|
||||
void AvgPooling(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id, float minf,
|
||||
float maxf) {
|
||||
int stride_w = pooling_param->stride_w_;
|
||||
int stride_h = pooling_param->stride_h_;
|
||||
int pad_w = pooling_param->pad_l_;
|
||||
|
@ -25,7 +26,7 @@ void AvgPooling(const float *input_ptr, float *output_ptr, PoolingParameter *poo
|
|||
int win_w = pooling_param->window_w_;
|
||||
int win_h = pooling_param->window_h_;
|
||||
int channel = pooling_param->input_channel_;
|
||||
int c4 = UP_DIV(channel, C4NUM);
|
||||
int c4 = UP_DIV(channel, C4NUM); /* oc && ic */
|
||||
int in_w = pooling_param->input_w_;
|
||||
int in_h = pooling_param->input_h_;
|
||||
int output_w = pooling_param->output_w_;
|
||||
|
@ -34,11 +35,15 @@ void AvgPooling(const float *input_ptr, float *output_ptr, PoolingParameter *poo
|
|||
int out_plane = output_w * output_h;
|
||||
int out_tile_count = UP_DIV(out_plane, TILE_NUM);
|
||||
int thread_num = pooling_param->thread_num_;
|
||||
// input channel is equal to output channel
|
||||
|
||||
#ifdef ENABLE_NEON
|
||||
float32x4_t min_value = vdupq_n_f32(minf);
|
||||
float32x4_t max_value = vdupq_n_f32(maxf);
|
||||
#endif
|
||||
|
||||
for (int batch = 0; batch < output_batch; batch++) {
|
||||
int in_batch_offset = batch * in_h * in_w * channel;
|
||||
int out_batch_offset = batch * output_h * output_w * channel;
|
||||
const float *src_b_ptr = input_ptr + batch * in_h * in_w * channel;
|
||||
float *dst_b_ptr = output_ptr + batch * output_h * output_w * channel;
|
||||
for (int thread_id = task_id; thread_id < out_tile_count; thread_id += thread_num) {
|
||||
int cal_start_index = thread_id * TILE_NUM;
|
||||
int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? TILE_NUM : (out_plane - cal_start_index);
|
||||
|
@ -48,10 +53,18 @@ void AvgPooling(const float *input_ptr, float *output_ptr, PoolingParameter *poo
|
|||
int out_h_index = index / output_w;
|
||||
int in_w_index = out_w_index * stride_w - pad_w;
|
||||
int in_h_index = out_h_index * stride_h - pad_h;
|
||||
int out_plane_offset = out_batch_offset + index * channel;
|
||||
for (int j = 0; j < c4 - 1; j++) {
|
||||
int in_channel_offset = in_batch_offset + j * C4NUM;
|
||||
int out_channel_offset = out_plane_offset + j * C4NUM;
|
||||
|
||||
const float *src_plane_ptr = src_b_ptr;
|
||||
float *dst_plane_ptr = dst_b_ptr + index * channel;
|
||||
|
||||
int real_win_h_start = MSMAX(0, -in_h_index);
|
||||
int real_win_h_end = MSMIN(win_h, in_h - in_h_index);
|
||||
int resl_win_w_start = MSMAX(0, -in_w_index);
|
||||
int real_win_w_end = MSMIN(win_w, in_w - in_w_index);
|
||||
|
||||
for (int ci = 0; ci < c4 - 1; ci++) {
|
||||
const float *src_c_ptr = src_plane_ptr + ci * C4NUM;
|
||||
float *dst_c_ptr = dst_plane_ptr + ci * C4NUM;
|
||||
#ifdef ENABLE_NEON
|
||||
float32x4_t tmp_avg = vdupq_n_f32(0);
|
||||
#else
|
||||
|
@ -61,60 +74,69 @@ void AvgPooling(const float *input_ptr, float *output_ptr, PoolingParameter *poo
|
|||
float tmp_avg4 = 0;
|
||||
#endif
|
||||
int real_count = 0;
|
||||
for (int h = 0; h < win_h; h++) {
|
||||
for (int w = 0; w < win_w; w++) {
|
||||
if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 ||
|
||||
(in_w_index + w) >= in_w) {
|
||||
continue;
|
||||
} else {
|
||||
int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
|
||||
for (int h = real_win_h_start; h < real_win_h_end; h++) {
|
||||
for (int w = resl_win_w_start; w < real_win_w_end; w++) {
|
||||
const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel;
|
||||
#ifdef ENABLE_NEON
|
||||
tmp_avg = vaddq_f32(tmp_avg, vld1q_f32(input_ptr + in_offset));
|
||||
tmp_avg = vaddq_f32(tmp_avg, vld1q_f32(src_win_ptr));
|
||||
#else
|
||||
tmp_avg1 += *(input_ptr + in_offset);
|
||||
tmp_avg2 += *(input_ptr + in_offset + 1);
|
||||
tmp_avg3 += *(input_ptr + in_offset + 2);
|
||||
tmp_avg4 += *(input_ptr + in_offset + 3);
|
||||
tmp_avg1 += src_win_ptr[0];
|
||||
tmp_avg2 += src_win_ptr[1];
|
||||
tmp_avg3 += src_win_ptr[2];
|
||||
tmp_avg4 += src_win_ptr[3];
|
||||
#endif
|
||||
++real_count;
|
||||
}
|
||||
++real_count;
|
||||
} // win_w loop
|
||||
} // win_h loop
|
||||
#ifdef ENABLE_NEON
|
||||
vst1q_f32(output_ptr + out_channel_offset, tmp_avg / vdupq_n_f32(real_count));
|
||||
tmp_avg = tmp_avg / vdupq_n_f32(real_count);
|
||||
tmp_avg = vmaxq_f32(tmp_avg, min_value);
|
||||
tmp_avg = vminq_f32(tmp_avg, max_value);
|
||||
vst1q_f32(dst_c_ptr, tmp_avg);
|
||||
#else
|
||||
*(output_ptr + out_channel_offset) = tmp_avg1 / (float)real_count;
|
||||
*(output_ptr + out_channel_offset + 1) = tmp_avg2 / (float)real_count;
|
||||
*(output_ptr + out_channel_offset + 2) = tmp_avg3 / (float)real_count;
|
||||
*(output_ptr + out_channel_offset + 3) = tmp_avg4 / (float)real_count;
|
||||
tmp_avg1 /= (float)real_count;
|
||||
tmp_avg2 /= (float)real_count;
|
||||
tmp_avg3 /= (float)real_count;
|
||||
tmp_avg4 /= (float)real_count;
|
||||
tmp_avg1 = fmax(tmp_avg1, minf);
|
||||
tmp_avg2 = fmax(tmp_avg2, minf);
|
||||
tmp_avg3 = fmax(tmp_avg3, minf);
|
||||
tmp_avg4 = fmax(tmp_avg4, minf);
|
||||
tmp_avg1 = fmin(tmp_avg1, maxf);
|
||||
tmp_avg2 = fmin(tmp_avg2, maxf);
|
||||
tmp_avg3 = fmin(tmp_avg3, maxf);
|
||||
tmp_avg4 = fmin(tmp_avg4, maxf);
|
||||
dst_c_ptr[0] = tmp_avg1;
|
||||
dst_c_ptr[1] = tmp_avg2;
|
||||
dst_c_ptr[2] = tmp_avg3;
|
||||
dst_c_ptr[3] = tmp_avg4;
|
||||
#endif
|
||||
} // ic4-1 loop
|
||||
int channel_s = (c4 - 1) * C4NUM;
|
||||
for (int k = channel_s; k < channel; k++) {
|
||||
int in_channel_offset = in_batch_offset + k;
|
||||
int out_channel_offset = out_plane_offset + k;
|
||||
for (int ci = channel_s; ci < channel; ci++) {
|
||||
const float *src_c_ptr = src_plane_ptr + ci;
|
||||
float *dst_c_ptr = dst_plane_ptr + ci;
|
||||
float tmp_avg = 0;
|
||||
int real_count = 0;
|
||||
for (int h = 0; h < win_h; h++) {
|
||||
for (int w = 0; w < win_w; w++) {
|
||||
if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 ||
|
||||
(in_w_index + w) >= in_w) {
|
||||
continue;
|
||||
} else {
|
||||
int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
|
||||
tmp_avg += *(input_ptr + in_offset);
|
||||
++real_count;
|
||||
}
|
||||
for (int h = real_win_h_start; h < real_win_h_end; h++) {
|
||||
for (int w = resl_win_w_start; w < real_win_w_end; w++) {
|
||||
const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel;
|
||||
tmp_avg += src_win_ptr[0];
|
||||
++real_count;
|
||||
} // win_w loop
|
||||
} // win_h loop
|
||||
*(output_ptr + out_channel_offset) = tmp_avg / (float)real_count;
|
||||
tmp_avg = tmp_avg / (float)real_count;
|
||||
tmp_avg = fmax(tmp_avg, minf);
|
||||
tmp_avg = fmin(tmp_avg, maxf);
|
||||
dst_c_ptr[0] = tmp_avg;
|
||||
} // channel_res loop
|
||||
} // real_cal_num loop
|
||||
} // out_plane loop
|
||||
} // out_batch loop
|
||||
}
|
||||
|
||||
void MaxPooling(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id) {
|
||||
void MaxPooling(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id, float minf,
|
||||
float maxf) {
|
||||
int stride_w = pooling_param->stride_w_;
|
||||
int stride_h = pooling_param->stride_h_;
|
||||
int pad_w = pooling_param->pad_l_;
|
||||
|
@ -132,207 +154,9 @@ void MaxPooling(const float *input_ptr, float *output_ptr, PoolingParameter *poo
|
|||
int thread_num = pooling_param->thread_num_;
|
||||
int c4 = UP_DIV(channel, C4NUM); /* oc && ic */
|
||||
|
||||
for (int batch = 0; batch < output_batch; batch++) {
|
||||
const float *src_b_ptr = input_ptr + batch * in_h * in_w * channel;
|
||||
float *dst_b_ptr = output_ptr + batch * output_h * output_w * channel;
|
||||
for (int thread_id = task_id; thread_id < out_tile_count; thread_id += thread_num) {
|
||||
int cal_start_index = thread_id * TILE_NUM;
|
||||
int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? TILE_NUM : (out_plane - cal_start_index);
|
||||
for (int i = 0; i < real_cal_num; i++) {
|
||||
int index = cal_start_index + i;
|
||||
int out_w_index = index % output_w;
|
||||
int out_h_index = index / output_w;
|
||||
int in_w_index = out_w_index * stride_w - pad_w;
|
||||
int in_h_index = out_h_index * stride_h - pad_h;
|
||||
|
||||
const float *src_plane_ptr = src_b_ptr;
|
||||
float *dst_plane_ptr = dst_b_ptr + index * channel;
|
||||
|
||||
int real_win_h_start = MSMAX(0, -in_h_index);
|
||||
int real_win_h_end = MSMIN(win_h, in_h - in_h_index);
|
||||
int resl_win_w_start = MSMAX(0, -in_w_index);
|
||||
int real_win_w_end = MSMIN(win_w, in_w - in_w_index);
|
||||
|
||||
for (int ci = 0; ci < c4 - 1; ci++) {
|
||||
const float *src_c_ptr = src_plane_ptr + ci * C4NUM;
|
||||
float *dst_c_ptr = dst_plane_ptr + ci * C4NUM;
|
||||
#ifdef ENABLE_NEON
|
||||
float32x4_t tmp_max = vdupq_n_f32(-FLT_MAX);
|
||||
#else
|
||||
float tmp_max1 = -FLT_MAX;
|
||||
float tmp_max2 = -FLT_MAX;
|
||||
float tmp_max3 = -FLT_MAX;
|
||||
float tmp_max4 = -FLT_MAX;
|
||||
#endif
|
||||
|
||||
for (int kh = real_win_h_start; kh < real_win_h_end; kh++) {
|
||||
for (int kw = resl_win_w_start; kw < real_win_w_end; kw++) {
|
||||
const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel;
|
||||
#ifdef ENABLE_NEON
|
||||
tmp_max = vmaxq_f32(tmp_max, vld1q_f32(src_win_ptr));
|
||||
#else
|
||||
tmp_max1 = fmax(tmp_max1, src_win_ptr[0]);
|
||||
tmp_max2 = fmax(tmp_max2, src_win_ptr[1]);
|
||||
tmp_max3 = fmax(tmp_max3, src_win_ptr[2]);
|
||||
tmp_max4 = fmax(tmp_max4, src_win_ptr[3]);
|
||||
|
||||
#endif
|
||||
} // win_w loop
|
||||
} // win_h loop
|
||||
#ifdef ENABLE_NEON
|
||||
vst1q_f32(dst_c_ptr, tmp_max);
|
||||
#else
|
||||
dst_c_ptr[0] = tmp_max1;
|
||||
dst_c_ptr[1] = tmp_max2;
|
||||
dst_c_ptr[2] = tmp_max3;
|
||||
dst_c_ptr[3] = tmp_max4;
|
||||
#endif
|
||||
} // ic4-1 loop
|
||||
int channel_s = (c4 - 1) * C4NUM;
|
||||
for (int ci = channel_s; ci < channel; ci++) {
|
||||
float *dst_c_ptr = dst_plane_ptr + ci;
|
||||
const float *src_c_ptr = src_plane_ptr + ci;
|
||||
float tmp_max = -FLT_MAX;
|
||||
|
||||
for (int kh = real_win_h_start; kh < real_win_h_end; kh++) {
|
||||
for (int kw = resl_win_w_start; kw < real_win_w_end; kw++) {
|
||||
const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel;
|
||||
tmp_max = fmax(tmp_max, src_win_ptr[0]);
|
||||
} // win_w loop
|
||||
} // win_h loop
|
||||
dst_c_ptr[0] = tmp_max;
|
||||
} // channel_res loop
|
||||
} // real_cal_num loop
|
||||
} // out_plane loop
|
||||
} // out_batch loop
|
||||
}
|
||||
|
||||
void AvgPoolingRelu(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id) {
|
||||
int stride_w = pooling_param->stride_w_;
|
||||
int stride_h = pooling_param->stride_h_;
|
||||
int pad_w = pooling_param->pad_l_;
|
||||
int pad_h = pooling_param->pad_u_;
|
||||
int win_w = pooling_param->window_w_;
|
||||
int win_h = pooling_param->window_h_;
|
||||
int channel = pooling_param->input_channel_;
|
||||
int c4 = UP_DIV(channel, C4NUM);
|
||||
int in_w = pooling_param->input_w_;
|
||||
int in_h = pooling_param->input_h_;
|
||||
int output_w = pooling_param->output_w_;
|
||||
int output_h = pooling_param->output_h_;
|
||||
int output_batch = pooling_param->output_batch_;
|
||||
int out_plane = output_w * output_h;
|
||||
int out_tile_count = UP_DIV(out_plane, TILE_NUM);
|
||||
int thread_num = pooling_param->thread_num_;
|
||||
#ifdef ENABLE_NEON
|
||||
float32x4_t zeros = vdupq_n_f32(0);
|
||||
#endif
|
||||
|
||||
for (int batch = 0; batch < output_batch; batch++) {
|
||||
int in_batch_offset = batch * in_h * in_w * channel;
|
||||
int out_batch_offset = batch * output_h * output_w * channel;
|
||||
for (int thread_id = task_id; thread_id < out_tile_count; thread_id += thread_num) {
|
||||
int cal_start_index = thread_id * TILE_NUM;
|
||||
int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? TILE_NUM : (out_plane - cal_start_index);
|
||||
for (int i = 0; i < real_cal_num; i++) {
|
||||
int index = cal_start_index + i;
|
||||
int out_w_index = index % output_w;
|
||||
int out_h_index = index / output_w;
|
||||
int in_w_index = out_w_index * stride_w - pad_w;
|
||||
int in_h_index = out_h_index * stride_h - pad_h;
|
||||
int out_plane_offset = out_batch_offset + index * channel;
|
||||
for (int j = 0; j < c4 - 1; j++) {
|
||||
int in_channel_offset = in_batch_offset + j * C4NUM;
|
||||
int out_channel_offset = out_plane_offset + j * C4NUM;
|
||||
#ifdef ENABLE_NEON
|
||||
float32x4_t tmp_avg = vdupq_n_f32(0);
|
||||
#else
|
||||
float tmp_avg1 = 0;
|
||||
float tmp_avg2 = 0;
|
||||
float tmp_avg3 = 0;
|
||||
float tmp_avg4 = 0;
|
||||
#endif
|
||||
int real_count = 0;
|
||||
for (int h = 0; h < win_h; h++) {
|
||||
for (int w = 0; w < win_w; w++) {
|
||||
if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 ||
|
||||
(in_w_index + w) >= in_w) {
|
||||
continue;
|
||||
} else {
|
||||
int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
|
||||
#ifdef ENABLE_NEON
|
||||
tmp_avg = vaddq_f32(tmp_avg, vld1q_f32(input_ptr + in_offset));
|
||||
#else
|
||||
tmp_avg1 += *(input_ptr + in_offset);
|
||||
tmp_avg2 += *(input_ptr + in_offset + 1);
|
||||
tmp_avg3 += *(input_ptr + in_offset + 2);
|
||||
tmp_avg4 += *(input_ptr + in_offset + 3);
|
||||
#endif
|
||||
++real_count;
|
||||
}
|
||||
} // win_w loop
|
||||
} // win_h loop
|
||||
#ifdef ENABLE_NEON
|
||||
tmp_avg = vmaxq_f32(tmp_avg, zeros);
|
||||
vst1q_f32(output_ptr + out_channel_offset, tmp_avg / vdupq_n_f32(real_count));
|
||||
#else
|
||||
tmp_avg1 = fmax(tmp_avg1, 0);
|
||||
tmp_avg2 = fmax(tmp_avg2, 0);
|
||||
tmp_avg3 = fmax(tmp_avg3, 0);
|
||||
tmp_avg4 = fmax(tmp_avg4, 0);
|
||||
|
||||
*(output_ptr + out_channel_offset) = tmp_avg1 / (float)real_count;
|
||||
*(output_ptr + out_channel_offset + 1) = tmp_avg2 / (float)real_count;
|
||||
*(output_ptr + out_channel_offset + 2) = tmp_avg3 / (float)real_count;
|
||||
*(output_ptr + out_channel_offset + 3) = tmp_avg4 / (float)real_count;
|
||||
#endif
|
||||
} // ic4-1 loop
|
||||
int channel_s = (c4 - 1) * C4NUM;
|
||||
for (int k = channel_s; k < channel; k++) {
|
||||
int in_channel_offset = in_batch_offset + k;
|
||||
int out_channel_offset = out_plane_offset + k;
|
||||
float tmp_avg = 0;
|
||||
int real_count = 0;
|
||||
for (int h = 0; h < win_h; h++) {
|
||||
for (int w = 0; w < win_w; w++) {
|
||||
if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 ||
|
||||
(in_w_index + w) >= in_w) {
|
||||
continue;
|
||||
} else {
|
||||
int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
|
||||
tmp_avg += *(input_ptr + in_offset);
|
||||
++real_count;
|
||||
}
|
||||
} // win_w loop
|
||||
} // win_h loop
|
||||
tmp_avg = fmax(tmp_avg, 0);
|
||||
*(output_ptr + out_channel_offset) = tmp_avg / (float)real_count;
|
||||
} // channel_res loop
|
||||
} // real_cal_num loop
|
||||
} // out_plane loop
|
||||
} // out_batch loop
|
||||
}
|
||||
|
||||
void MaxPoolingRelu(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id) {
|
||||
int stride_w = pooling_param->stride_w_;
|
||||
int stride_h = pooling_param->stride_h_;
|
||||
int pad_w = pooling_param->pad_l_;
|
||||
int pad_h = pooling_param->pad_u_;
|
||||
int win_w = pooling_param->window_w_;
|
||||
int win_h = pooling_param->window_h_;
|
||||
int channel = pooling_param->input_channel_;
|
||||
int in_w = pooling_param->input_w_;
|
||||
int in_h = pooling_param->input_h_;
|
||||
int output_w = pooling_param->output_w_;
|
||||
int output_h = pooling_param->output_h_;
|
||||
int output_batch = pooling_param->output_batch_;
|
||||
int out_plane = output_w * output_h;
|
||||
int out_tile_count = UP_DIV(out_plane, TILE_NUM);
|
||||
int thread_num = pooling_param->thread_num_;
|
||||
int c4 = UP_DIV(channel, C4NUM);
|
||||
|
||||
#ifdef ENABLE_NEON
|
||||
float32x4_t zeros = vdupq_n_f32(0);
|
||||
float32x4_t min_value = vdupq_n_f32(minf);
|
||||
float32x4_t max_value = vdupq_n_f32(maxf);
|
||||
#endif
|
||||
|
||||
for (int batch = 0; batch < output_batch; batch++) {
|
||||
|
@ -378,251 +202,22 @@ void MaxPoolingRelu(const float *input_ptr, float *output_ptr, PoolingParameter
|
|||
tmp_max2 = fmax(tmp_max2, src_win_ptr[1]);
|
||||
tmp_max3 = fmax(tmp_max3, src_win_ptr[2]);
|
||||
tmp_max4 = fmax(tmp_max4, src_win_ptr[3]);
|
||||
|
||||
#endif
|
||||
} // win_w loop
|
||||
} // win_h loop
|
||||
#ifdef ENABLE_NEON
|
||||
tmp_max = vmaxq_f32(tmp_max, zeros);
|
||||
tmp_max = vmaxq_f32(tmp_max, min_value);
|
||||
tmp_max = vminq_f32(tmp_max, max_value);
|
||||
vst1q_f32(dst_c_ptr, tmp_max);
|
||||
#else
|
||||
// relu:
|
||||
tmp_max1 = fmax(tmp_max1, 0);
|
||||
tmp_max2 = fmax(tmp_max2, 0);
|
||||
tmp_max3 = fmax(tmp_max3, 0);
|
||||
tmp_max4 = fmax(tmp_max4, 0);
|
||||
|
||||
dst_c_ptr[0] = tmp_max1;
|
||||
dst_c_ptr[1] = tmp_max2;
|
||||
dst_c_ptr[2] = tmp_max3;
|
||||
dst_c_ptr[3] = tmp_max4;
|
||||
#endif
|
||||
} // ic4-1 loop
|
||||
int channel_s = (c4 - 1) * C4NUM;
|
||||
for (int ci = channel_s; ci < channel; ci++) {
|
||||
float *dst_c_ptr = dst_plane_ptr + ci;
|
||||
const float *src_c_ptr = src_plane_ptr + ci;
|
||||
float tmp_max = -FLT_MAX;
|
||||
|
||||
for (int kh = real_win_h_start; kh < real_win_h_end; kh++) {
|
||||
for (int kw = resl_win_w_start; kw < real_win_w_end; kw++) {
|
||||
const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel;
|
||||
tmp_max = fmax(tmp_max, src_win_ptr[0]);
|
||||
} // win_w loop
|
||||
} // win_h loop
|
||||
dst_c_ptr[0] = tmp_max;
|
||||
} // channel_res loop
|
||||
} // real_cal_num loop
|
||||
} // out_plane loop
|
||||
} // out_batch loop
|
||||
}
|
||||
|
||||
void AvgPoolingRelu6(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id) {
|
||||
int stride_w = pooling_param->stride_w_;
|
||||
int stride_h = pooling_param->stride_h_;
|
||||
int pad_w = pooling_param->pad_l_;
|
||||
int pad_h = pooling_param->pad_u_;
|
||||
int win_w = pooling_param->window_w_;
|
||||
int win_h = pooling_param->window_h_;
|
||||
int channel = pooling_param->input_channel_;
|
||||
int c4 = UP_DIV(channel, C4NUM);
|
||||
int in_w = pooling_param->input_w_;
|
||||
int in_h = pooling_param->input_h_;
|
||||
int output_w = pooling_param->output_w_;
|
||||
int output_h = pooling_param->output_h_;
|
||||
int output_batch = pooling_param->output_batch_;
|
||||
int out_plane = output_w * output_h;
|
||||
int out_tile_count = UP_DIV(out_plane, TILE_NUM);
|
||||
int thread_num = pooling_param->thread_num_;
|
||||
// input channel is equal to output channel
|
||||
|
||||
#ifdef ENABLE_NEON
|
||||
float32x4_t zeros = vdupq_n_f32(0);
|
||||
float32x4_t bounds = vdupq_n_f32(6);
|
||||
#endif
|
||||
|
||||
for (int batch = 0; batch < output_batch; batch++) {
|
||||
int in_batch_offset = batch * in_h * in_w * channel;
|
||||
int out_batch_offset = batch * output_h * output_w * channel;
|
||||
for (int thread_id = task_id; thread_id < out_tile_count; thread_id += thread_num) {
|
||||
int cal_start_index = thread_id * TILE_NUM;
|
||||
int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? TILE_NUM : (out_plane - cal_start_index);
|
||||
for (int i = 0; i < real_cal_num; i++) {
|
||||
int index = cal_start_index + i;
|
||||
int out_w_index = index % output_w;
|
||||
int out_h_index = index / output_w;
|
||||
int in_w_index = out_w_index * stride_w - pad_w;
|
||||
int in_h_index = out_h_index * stride_h - pad_h;
|
||||
int out_plane_offset = out_batch_offset + index * channel;
|
||||
for (int j = 0; j < c4 - 1; j++) {
|
||||
int in_channel_offset = in_batch_offset + j * C4NUM;
|
||||
int out_channel_offset = out_plane_offset + j * C4NUM;
|
||||
#ifdef ENABLE_NEON
|
||||
float32x4_t tmp_avg = vdupq_n_f32(0);
|
||||
#else
|
||||
float tmp_avg1 = 0;
|
||||
float tmp_avg2 = 0;
|
||||
float tmp_avg3 = 0;
|
||||
float tmp_avg4 = 0;
|
||||
#endif
|
||||
int real_count = 0;
|
||||
for (int h = 0; h < win_h; h++) {
|
||||
for (int w = 0; w < win_w; w++) {
|
||||
if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 ||
|
||||
(in_w_index + w) >= in_w) {
|
||||
continue;
|
||||
} else {
|
||||
int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
|
||||
#ifdef ENABLE_NEON
|
||||
tmp_avg = vaddq_f32(tmp_avg, vld1q_f32(input_ptr + in_offset));
|
||||
#else
|
||||
tmp_avg1 += *(input_ptr + in_offset);
|
||||
tmp_avg2 += *(input_ptr + in_offset + 1);
|
||||
tmp_avg3 += *(input_ptr + in_offset + 2);
|
||||
tmp_avg4 += *(input_ptr + in_offset + 3);
|
||||
#endif
|
||||
++real_count;
|
||||
}
|
||||
} // win_w loop
|
||||
} // win_h loop
|
||||
#ifdef ENABLE_NEON
|
||||
tmp_avg = tmp_avg / vdupq_n_f32(real_count);
|
||||
tmp_avg = vmaxq_f32(tmp_avg, zeros);
|
||||
tmp_avg = vminq_f32(tmp_avg, bounds);
|
||||
vst1q_f32(output_ptr + out_channel_offset, tmp_avg);
|
||||
#else
|
||||
tmp_avg1 /= (float)real_count;
|
||||
tmp_avg2 /= (float)real_count;
|
||||
tmp_avg3 /= (float)real_count;
|
||||
tmp_avg4 /= (float)real_count;
|
||||
tmp_avg1 = fmax(tmp_avg1, 0);
|
||||
tmp_avg2 = fmax(tmp_avg2, 0);
|
||||
tmp_avg3 = fmax(tmp_avg3, 0);
|
||||
tmp_avg4 = fmax(tmp_avg4, 0);
|
||||
tmp_avg1 = fmin(tmp_avg1, 6);
|
||||
tmp_avg2 = fmin(tmp_avg2, 6);
|
||||
tmp_avg3 = fmin(tmp_avg3, 6);
|
||||
tmp_avg4 = fmin(tmp_avg4, 6);
|
||||
|
||||
*(output_ptr + out_channel_offset) = tmp_avg1;
|
||||
*(output_ptr + out_channel_offset + 1) = tmp_avg2;
|
||||
*(output_ptr + out_channel_offset + 2) = tmp_avg3;
|
||||
*(output_ptr + out_channel_offset + 3) = tmp_avg4;
|
||||
#endif
|
||||
} // ic4-1 loop
|
||||
int channel_s = (c4 - 1) * C4NUM;
|
||||
for (int k = channel_s; k < channel; k++) {
|
||||
int in_channel_offset = in_batch_offset + k;
|
||||
int out_channel_offset = out_plane_offset + k;
|
||||
float tmp_avg = 0;
|
||||
int real_count = 0;
|
||||
for (int h = 0; h < win_h; h++) {
|
||||
for (int w = 0; w < win_w; w++) {
|
||||
if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 ||
|
||||
(in_w_index + w) >= in_w) {
|
||||
continue;
|
||||
} else {
|
||||
int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
|
||||
tmp_avg += *(input_ptr + in_offset);
|
||||
++real_count;
|
||||
}
|
||||
} // win_w loop
|
||||
} // win_h loop
|
||||
tmp_avg /= (float)real_count;
|
||||
tmp_avg = fmax(tmp_avg, 0);
|
||||
tmp_avg = fmin(tmp_avg, 6);
|
||||
*(output_ptr + out_channel_offset) = tmp_avg;
|
||||
} // channel_res loop
|
||||
} // real_cal_num loop
|
||||
} // out_plane loop
|
||||
} // out_batch loop
|
||||
}
|
||||
|
||||
void MaxPoolingRelu6(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id) {
|
||||
int stride_w = pooling_param->stride_w_;
|
||||
int stride_h = pooling_param->stride_h_;
|
||||
int pad_w = pooling_param->pad_l_;
|
||||
int pad_h = pooling_param->pad_u_;
|
||||
int win_w = pooling_param->window_w_;
|
||||
int win_h = pooling_param->window_h_;
|
||||
int channel = pooling_param->input_channel_;
|
||||
int in_w = pooling_param->input_w_;
|
||||
int in_h = pooling_param->input_h_;
|
||||
int output_w = pooling_param->output_w_;
|
||||
int output_h = pooling_param->output_h_;
|
||||
int output_batch = pooling_param->output_batch_;
|
||||
int out_plane = output_w * output_h;
|
||||
int out_tile_count = UP_DIV(out_plane, TILE_NUM);
|
||||
int thread_num = pooling_param->thread_num_;
|
||||
int c4 = UP_DIV(channel, C4NUM);
|
||||
|
||||
#ifdef ENABLE_NEON
|
||||
float32x4_t zeros = vdupq_n_f32(0);
|
||||
float32x4_t bounds = vdupq_n_f32(6);
|
||||
#endif
|
||||
|
||||
for (int batch = 0; batch < output_batch; batch++) {
|
||||
const float *src_b_ptr = input_ptr + batch * in_h * in_w * channel;
|
||||
float *dst_b_ptr = output_ptr + batch * output_h * output_w * channel;
|
||||
for (int thread_id = task_id; thread_id < out_tile_count; thread_id += thread_num) {
|
||||
int cal_start_index = thread_id * TILE_NUM;
|
||||
int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? TILE_NUM : (out_plane - cal_start_index);
|
||||
for (int i = 0; i < real_cal_num; i++) {
|
||||
int index = cal_start_index + i;
|
||||
int out_w_index = index % output_w;
|
||||
int out_h_index = index / output_w;
|
||||
int in_w_index = out_w_index * stride_w - pad_w;
|
||||
int in_h_index = out_h_index * stride_h - pad_h;
|
||||
|
||||
const float *src_plane_ptr = src_b_ptr;
|
||||
float *dst_plane_ptr = dst_b_ptr + index * channel;
|
||||
|
||||
int real_win_h_start = MSMAX(0, -in_h_index);
|
||||
int real_win_h_end = MSMIN(win_h, in_h - in_h_index);
|
||||
int resl_win_w_start = MSMAX(0, -in_w_index);
|
||||
int real_win_w_end = MSMIN(win_w, in_w - in_w_index);
|
||||
|
||||
for (int ci = 0; ci < c4 - 1; ci++) {
|
||||
const float *src_c_ptr = src_plane_ptr + ci * C4NUM;
|
||||
float *dst_c_ptr = dst_plane_ptr + ci * C4NUM;
|
||||
#ifdef ENABLE_NEON
|
||||
float32x4_t tmp_max = vdupq_n_f32(-FLT_MAX);
|
||||
#else
|
||||
float tmp_max1 = -FLT_MAX;
|
||||
float tmp_max2 = -FLT_MAX;
|
||||
float tmp_max3 = -FLT_MAX;
|
||||
float tmp_max4 = -FLT_MAX;
|
||||
#endif
|
||||
|
||||
for (int kh = real_win_h_start; kh < real_win_h_end; kh++) {
|
||||
for (int kw = resl_win_w_start; kw < real_win_w_end; kw++) {
|
||||
const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel;
|
||||
#ifdef ENABLE_NEON
|
||||
tmp_max = vmaxq_f32(tmp_max, vld1q_f32(src_win_ptr));
|
||||
#else
|
||||
tmp_max1 = fmax(tmp_max1, src_win_ptr[0]);
|
||||
tmp_max2 = fmax(tmp_max2, src_win_ptr[1]);
|
||||
tmp_max3 = fmax(tmp_max3, src_win_ptr[2]);
|
||||
tmp_max4 = fmax(tmp_max4, src_win_ptr[3]);
|
||||
|
||||
#endif
|
||||
} // win_w loop
|
||||
} // win_h loop
|
||||
#ifdef ENABLE_NEON
|
||||
tmp_max = vmaxq_f32(tmp_max, zeros);
|
||||
tmp_max = vminq_f32(tmp_max, bounds);
|
||||
vst1q_f32(dst_c_ptr, tmp_max);
|
||||
#else
|
||||
// relu:
|
||||
tmp_max1 = fmax(tmp_max1, 0);
|
||||
tmp_max2 = fmax(tmp_max2, 0);
|
||||
tmp_max3 = fmax(tmp_max3, 0);
|
||||
tmp_max4 = fmax(tmp_max4, 0);
|
||||
tmp_max1 = fmin(tmp_max1, 6);
|
||||
tmp_max2 = fmin(tmp_max2, 6);
|
||||
tmp_max3 = fmin(tmp_max3, 6);
|
||||
tmp_max4 = fmin(tmp_max4, 6);
|
||||
|
||||
tmp_max1 = fmax(tmp_max1, minf);
|
||||
tmp_max2 = fmax(tmp_max2, minf);
|
||||
tmp_max3 = fmax(tmp_max3, minf);
|
||||
tmp_max4 = fmax(tmp_max4, minf);
|
||||
tmp_max1 = fmin(tmp_max1, maxf);
|
||||
tmp_max2 = fmin(tmp_max2, maxf);
|
||||
tmp_max3 = fmin(tmp_max3, maxf);
|
||||
tmp_max4 = fmin(tmp_max4, maxf);
|
||||
dst_c_ptr[0] = tmp_max1;
|
||||
dst_c_ptr[1] = tmp_max2;
|
||||
dst_c_ptr[2] = tmp_max3;
|
||||
|
@ -641,6 +236,8 @@ void MaxPoolingRelu6(const float *input_ptr, float *output_ptr, PoolingParameter
|
|||
tmp_max = fmax(tmp_max, src_win_ptr[0]);
|
||||
} // win_w loop
|
||||
} // win_h loop
|
||||
tmp_max = fmax(tmp_max, minf);
|
||||
tmp_max = fmin(tmp_max, maxf);
|
||||
dst_c_ptr[0] = tmp_max;
|
||||
} // channel_res loop
|
||||
} // real_cal_num loop
|
||||
|
|
|
@ -27,17 +27,10 @@
|
|||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
void AvgPooling(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id);
|
||||
|
||||
void MaxPooling(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id);
|
||||
|
||||
void AvgPoolingRelu(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id);
|
||||
|
||||
void MaxPoolingRelu(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id);
|
||||
|
||||
void AvgPoolingRelu6(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id);
|
||||
|
||||
void MaxPoolingRelu6(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id);
|
||||
void AvgPooling(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id, float minf,
|
||||
float maxf);
|
||||
void MaxPooling(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id, float minf,
|
||||
float maxf);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
*/
|
||||
|
||||
#include "src/runtime/kernel/arm/fp32/pooling.h"
|
||||
#include <float.h>
|
||||
#include "nnacl/fp32/pooling.h"
|
||||
#include "src/kernel_registry.h"
|
||||
#include "src/runtime/runtime_api.h"
|
||||
|
@ -52,28 +53,18 @@ int PoolingCPUKernel::ReSize() {
|
|||
int PoolingCPUKernel::RunImpl(int task_id) {
|
||||
auto input_ptr = reinterpret_cast<float *>(in_tensors_.at(kInputIndex)->Data());
|
||||
auto output_ptr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->Data());
|
||||
float minf = -FLT_MAX;
|
||||
float maxf = FLT_MAX;
|
||||
if (pooling_param_->act_type_ == ActType_Relu) {
|
||||
minf = 0.f;
|
||||
} else if (pooling_param_->act_type_ == ActType_Relu6) {
|
||||
minf = 0.f;
|
||||
maxf = 6.f;
|
||||
}
|
||||
if (pooling_param_->pool_mode_ == PoolMode_MaxPool) {
|
||||
switch (pooling_param_->act_type_) {
|
||||
case ActType_Relu:
|
||||
MaxPoolingRelu(input_ptr, output_ptr, pooling_param_, task_id);
|
||||
break;
|
||||
case ActType_Relu6:
|
||||
MaxPoolingRelu6(input_ptr, output_ptr, pooling_param_, task_id);
|
||||
break;
|
||||
default:
|
||||
MaxPooling(input_ptr, output_ptr, pooling_param_, task_id);
|
||||
}
|
||||
MaxPooling(input_ptr, output_ptr, pooling_param_, task_id, minf, maxf);
|
||||
} else {
|
||||
switch (pooling_param_->act_type_) {
|
||||
case ActType_Relu:
|
||||
AvgPoolingRelu(input_ptr, output_ptr, pooling_param_, task_id);
|
||||
break;
|
||||
case ActType_Relu6:
|
||||
AvgPoolingRelu6(input_ptr, output_ptr, pooling_param_, task_id);
|
||||
break;
|
||||
default:
|
||||
AvgPooling(input_ptr, output_ptr, pooling_param_, task_id);
|
||||
}
|
||||
AvgPooling(input_ptr, output_ptr, pooling_param_, task_id, minf, maxf);
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue