optimize int8 pooling

This commit is contained in:
fuzhiye 2020-08-27 18:53:04 +08:00
parent 29070d60a1
commit d0aa719a80
14 changed files with 313 additions and 166 deletions

View File

@ -264,7 +264,8 @@ void ConvInt8(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight, c
int output_tile_count = UP_DIV(output_count, tile_n); int output_tile_count = UP_DIV(output_count, tile_n);
int ic4 = UP_DIV(in_channel, C4NUM); int ic4 = UP_DIV(in_channel, C4NUM);
int kernel_plane = kernel_h * kernel_w; int kernel_plane = kernel_h * kernel_w;
int unit_size = kernel_plane * ic4 * C4NUM; int plane_block = UP_DIV(kernel_plane, C4NUM);
int unit_size = plane_block * C4NUM * ic4 * C4NUM;
int packed_input_size = output_tile_count * tile_n * unit_size; int packed_input_size = output_tile_count * tile_n * unit_size;
int input_sum_offset; int input_sum_offset;
if (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL) { if (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL) {

View File

@ -89,8 +89,13 @@ void AvgPoolingOptInt8(const int8_t *input_ptr, int8_t *output_ptr, PoolingParam
int output_batch = pooling_param->output_batch_; int output_batch = pooling_param->output_batch_;
int out_plane = output_w * output_h; int out_plane = output_w * output_h;
int out_tile_count = UP_DIV(out_plane, TILE_NUM); int out_tile_count = UP_DIV(out_plane, TILE_NUM);
int thread_num = pooling_param->thread_num_; int thread_num = out_tile_count < pooling_param->thread_num_ ? out_tile_count : pooling_param->thread_num_;
int c8 = UP_DIV(channel, C8NUM); float input_scale = pooling_param->quant_args_[0][0].scale_;
int input_zp = pooling_param->quant_args_[0][0].zp_;
float output_scale = pooling_param->quant_args_[1][0].scale_;
int output_zp = pooling_param->quant_args_[1][0].zp_;
double real_multiplier = input_scale / output_scale;
int c16 = channel / C16NUM;
const int8_t out_min = INT8_MIN; const int8_t out_min = INT8_MIN;
const int8_t out_max = INT8_MAX; const int8_t out_max = INT8_MAX;
@ -107,89 +112,159 @@ void AvgPoolingOptInt8(const int8_t *input_ptr, int8_t *output_ptr, PoolingParam
int in_w_index = out_w_index * stride_w - pad_w; int in_w_index = out_w_index * stride_w - pad_w;
int in_h_index = out_h_index * stride_h - pad_h; int in_h_index = out_h_index * stride_h - pad_h;
int out_plane_offset = out_batch_offset + index * channel; int out_plane_offset = out_batch_offset + index * channel;
for (int j = 0; j < c8 - 1; j++) { int input_stride = (in_h_index * in_w + in_w_index) * channel;
int in_channel_offset = in_batch_offset + j * C8NUM; int kw_s = MSMAX(0, -in_w_index);
int out_channel_offset = out_plane_offset + j * C8NUM; int kw_e = MSMIN(win_w, in_w - in_w_index);
int16_t tmp_avg1 = 0; int kh_s = MSMAX(0, -in_h_index);
int16_t tmp_avg2 = 0; int kh_e = MSMIN(win_h, in_h - in_h_index);
int16_t tmp_avg3 = 0; int real_count = (kw_e - kw_s) * (kh_e - kh_s);
int16_t tmp_avg4 = 0;
int16_t tmp_avg5 = 0; // 16 channels
int16_t tmp_avg6 = 0; for (int j = 0; j < c16; j++) {
int16_t tmp_avg7 = 0; #ifdef ENABLE_NEON
int16_t tmp_avg8 = 0; int16x8_t tmp_avg[2];
int real_count = 0; tmp_avg[0] = vmovq_n_s16(0);
for (int h = 0; h < win_h; h++) { tmp_avg[1] = vmovq_n_s16(0);
for (int w = 0; w < win_w; w++) { #else
if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 || int16_t tmp_avg[16];
(in_w_index + w) >= in_w) { int16_t real_out[16];
continue; for (int m = 0; m < C16NUM; ++m) {
} else { tmp_avg[m] = 0;
int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
tmp_avg1 += *(input_ptr + in_offset);
tmp_avg2 += *(input_ptr + in_offset + 1);
tmp_avg3 += *(input_ptr + in_offset + 2);
tmp_avg4 += *(input_ptr + in_offset + 3);
tmp_avg5 += *(input_ptr + in_offset + 4);
tmp_avg6 += *(input_ptr + in_offset + 5);
tmp_avg7 += *(input_ptr + in_offset + 6);
tmp_avg8 += *(input_ptr + in_offset + 7);
++real_count;
} }
#endif
int in_channel_offset = in_batch_offset + j * C16NUM;
int out_channel_offset = out_plane_offset + j * C16NUM;
for (int h = kh_s; h < kh_e; h++) {
for (int w = kw_s; w < kw_e; w++) {
int in_offset = in_channel_offset + input_stride + (h * in_w + w) * channel;
#ifdef ENABLE_NEON
int8x16_t in_ptr = vld1q_s8(input_ptr + in_offset);
int8x8_t in_data1 = vget_low_s8(in_ptr);
int8x8_t in_data2 = vget_high_s8(in_ptr);
int16x8_t data1 = vmovl_s8(in_data1);
int16x8_t data2 = vmovl_s8(in_data2);
tmp_avg[0] = vaddq_s16(tmp_avg[0], data1);
tmp_avg[1] = vaddq_s16(tmp_avg[1], data2);
#else
for (int k = 0; k < C16NUM; ++k) {
tmp_avg[k] += input_ptr[in_offset + k];
}
#endif
} // win_w loop } // win_w loop
} // win_h loop } // win_h loop
int16_t tmp_out1 = round((float)tmp_avg1 / (float)real_count); #ifdef ENABLE_NEON
int16_t tmp_out2 = round((float)tmp_avg2 / (float)real_count); int16_t tmp_data[8];
int16_t tmp_out3 = round((float)tmp_avg3 / (float)real_count); int16_t tmp_out[8];
int16_t tmp_out4 = round((float)tmp_avg4 / (float)real_count); int16_t tmp_data1[8];
int16_t tmp_out5 = round((float)tmp_avg5 / (float)real_count); int16_t tmp_out1[8];
int16_t tmp_out6 = round((float)tmp_avg6 / (float)real_count); for (int l = 0; l < C8NUM; l++) {
int16_t tmp_out7 = round((float)tmp_avg7 / (float)real_count); tmp_data[l] = tmp_avg[0][l] + 128 * real_count;
int16_t tmp_out8 = round((float)tmp_avg8 / (float)real_count); tmp_out[l] = (tmp_data[l] + real_count / 2) / real_count;
int16_t real_out1 = tmp_out1 < out_min ? out_min : tmp_out1; tmp_out[l] -= 128;
int16_t real_out2 = tmp_out2 < out_min ? out_min : tmp_out2; tmp_out[l] = round((tmp_out[l] - input_zp) * real_multiplier) + output_zp;
int16_t real_out3 = tmp_out3 < out_min ? out_min : tmp_out3; }
int16_t real_out4 = tmp_out4 < out_min ? out_min : tmp_out4; for (int l = 0; l < C8NUM; l++) {
int16_t real_out5 = tmp_out5 < out_min ? out_min : tmp_out5; tmp_data1[l] = tmp_avg[1][l] + 128 * real_count;
int16_t real_out6 = tmp_out6 < out_min ? out_min : tmp_out6; tmp_out1[l] = (tmp_data1[l] + real_count / 2) / real_count;
int16_t real_out7 = tmp_out7 < out_min ? out_min : tmp_out7; tmp_out1[l] -= 128;
int16_t real_out8 = tmp_out8 < out_min ? out_min : tmp_out8; tmp_out1[l] = round((tmp_out1[l] - input_zp) * real_multiplier) + output_zp;
real_out1 = real_out1 > out_max ? out_max : real_out1; }
real_out2 = real_out2 > out_max ? out_max : real_out2; int8x8_t real_out[2];
real_out3 = real_out3 > out_max ? out_max : real_out3; int8x8_t output_min = vdup_n_s8(out_min);
real_out4 = real_out4 > out_max ? out_max : real_out4; int8x8_t output_max = vdup_n_s8(out_max);
real_out5 = real_out5 > out_max ? out_max : real_out5; real_out[0] = vqmovn_s16(vld1q_s16(tmp_out));
real_out6 = real_out6 > out_max ? out_max : real_out6; real_out[0] = vmin_s8(real_out[0], output_max);
real_out7 = real_out7 > out_max ? out_max : real_out7; real_out[0] = vmax_s8(real_out[0], output_min);
real_out8 = real_out8 > out_max ? out_max : real_out8; vst1_s8(output_ptr + out_channel_offset, real_out[0]);
*(output_ptr + out_channel_offset) = (int8_t)real_out1; real_out[1] = vqmovn_s16(vld1q_s16(tmp_out1));
*(output_ptr + out_channel_offset + 1) = (int8_t)real_out2; real_out[1] = vmin_s8(real_out[1], output_max);
*(output_ptr + out_channel_offset + 2) = (int8_t)real_out3; real_out[1] = vmax_s8(real_out[1], output_min);
*(output_ptr + out_channel_offset + 3) = (int8_t)real_out4; vst1_s8(output_ptr + out_channel_offset + 8, real_out[1]);
*(output_ptr + out_channel_offset + 4) = (int8_t)real_out5; #else
*(output_ptr + out_channel_offset + 5) = (int8_t)real_out6; for (int l = 0; l < C16NUM; ++l) {
*(output_ptr + out_channel_offset + 6) = (int8_t)real_out7; int16_t tmp_data = tmp_avg[l] + 128 * real_count;
*(output_ptr + out_channel_offset + 7) = (int8_t)real_out8; real_out[l] = (tmp_data + real_count / 2) / real_count - 128;
} // in_channel loop real_out[l] = (int8_t)(round((real_out[l] - input_zp) * real_multiplier) + output_zp);
int channel_s = (c8 - 1) * C8NUM; real_out[l] = real_out[l] < out_min ? out_min : real_out[l];
for (int k = channel_s; k < channel; k++) { real_out[l] = real_out[l] > out_max ? out_max : real_out[l];
int in_channel_offset = in_batch_offset + k; *(output_ptr + out_channel_offset + l) = (int8_t)real_out[l];
int out_channel_offset = out_plane_offset + k; }
#endif
}
// 8 channels
int channel_16_res = channel - c16 * C16NUM;
int c8 = channel_16_res / C8NUM;
int in_c16_offset = in_batch_offset + c16 * C16NUM;
int out_c16_offset = out_plane_offset + c16 * C16NUM;
for (int j = 0; j < c8; j++) {
#ifdef ENABLE_NEON
int16x8_t tmp_avg = vmovq_n_s16(0);
#else
int16_t tmp_avg[8] = {0, 0, 0, 0, 0, 0, 0, 0};
int16_t real_out[8];
#endif
int in_channel_offset = in_c16_offset + j * C8NUM;
int out_channel_offset = out_c16_offset + j * C8NUM;
for (int h = kh_s; h < kh_e; h++) {
for (int w = kw_s; w < kw_e; w++) {
int in_offset = in_channel_offset + input_stride + (h * in_w + w) * channel;
#ifdef ENABLE_NEON
int8x8_t in_ptr = vld1_s8(input_ptr + in_offset);
int16x8_t data = vmovl_s8(in_ptr);
tmp_avg = vaddq_s16(tmp_avg, data);
#else
for (int k = 0; k < C8NUM; ++k) {
tmp_avg[k] += input_ptr[in_offset + k];
}
#endif
} // win_w loop
} // win_h loop
#ifdef ENABLE_NEON
int16_t tmp_data[8];
int16_t tmp_out[8];
for (int l = 0; l < C8NUM; l++) {
tmp_data[l] = tmp_avg[l] + 128 * real_count;
tmp_out[l] = (tmp_data[l] + real_count / 2) / real_count;
tmp_out[l] -= 128;
tmp_out[l] = round((tmp_out[l] - input_zp) * real_multiplier) + output_zp;
}
int8x8_t real_out;
int8x8_t output_min = vdup_n_s8(out_min);
int8x8_t output_max = vdup_n_s8(out_max);
real_out = vqmovn_s16(vld1q_s16(tmp_out));
real_out = vmin_s8(real_out, output_max);
real_out = vmax_s8(real_out, output_min);
vst1_s8(output_ptr + out_channel_offset, real_out);
#else
for (int l = 0; l < C8NUM; ++l) {
int16_t tmp_data = tmp_avg[l] + 128 * real_count;
real_out[l] = (tmp_data + real_count / 2) / real_count - 128;
real_out[l] = (int8_t)(round((real_out[l] - input_zp) * real_multiplier) + output_zp);
real_out[l] = real_out[l] < out_min ? out_min : real_out[l];
real_out[l] = real_out[l] > out_max ? out_max : real_out[l];
*(output_ptr + out_channel_offset + l) = (int8_t)real_out[l];
}
#endif
}
// less than 8 channel
int channel_8_res = channel_16_res - c8 * C8NUM;
int in_c8_offset = in_c16_offset + c8 * C8NUM;
int out_c8_offset = out_c16_offset + c8 * C8NUM;
for (int k = 0; k < channel_8_res; k++) {
int in_channel_offset = in_c8_offset + k;
int out_channel_offset = out_c8_offset + k;
int16_t tmp_avg = 0; int16_t tmp_avg = 0;
int real_count = 0; for (int h = kh_s; h < kh_e; h++) {
for (int h = 0; h < win_h; h++) { for (int w = kw_s; w < kw_e; w++) {
for (int w = 0; w < win_w; w++) { int in_offset = in_channel_offset + input_stride + (h * in_w + w) * channel;
if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 || tmp_avg += input_ptr[in_offset];
(in_w_index + w) >= in_w) {
continue;
} else {
int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
tmp_avg += *(input_ptr + in_offset);
++real_count;
}
} // win_w loop } // win_w loop
} // win_h loop } // win_h loop
int16_t tmp_out = round((float)tmp_avg / (float)real_count); int16_t tmp_out = round((float)tmp_avg / (float)real_count + 128) - 128;
tmp_out = (int8_t)(round((tmp_out - input_zp) * real_multiplier) + output_zp);
int16_t real_out = tmp_out < out_min ? out_min : tmp_out; int16_t real_out = tmp_out < out_min ? out_min : tmp_out;
real_out = real_out > out_max ? out_max : real_out; real_out = real_out > out_max ? out_max : real_out;
*(output_ptr + out_channel_offset) = (int8_t)real_out; *(output_ptr + out_channel_offset) = (int8_t)real_out;
@ -249,6 +324,109 @@ void MaxPoolingInt8(const int8_t *input_ptr, int8_t *output_ptr, PoolingParamete
} // out_batch loop } // out_batch loop
} }
// Requantizes one pooled int8 value from the input quantization parameters to the output ones:
//   round((value - input_zp) * real_multiplier) + output_zp
// and saturates the result to [INT8_MIN, INT8_MAX] before narrowing, so an out-of-range result
// can no longer wrap around (or hit an undefined floating-point to integer conversion).
static int8_t MaxPoolRequantSaturateInt8(int8_t value, int input_zp, double real_multiplier, int output_zp) {
  double requantized = round((value - input_zp) * real_multiplier) + output_zp;
  // Negated comparison on purpose: a NaN result (e.g. 0 * inf from a zero output scale) also
  // takes this branch instead of reaching the cast below.
  if (!(requantized > INT8_MIN)) {
    return INT8_MIN;
  }
  if (requantized > INT8_MAX) {
    return INT8_MAX;
  }
  return (int8_t)requantized;
}

// Int8 max pooling followed by requantization from the input to the output quantization parameters.
//
// input_ptr / output_ptr: int8 buffers indexed with the channel as the innermost dimension, i.e.
//   offset = batch_offset + ((h * width + w) * channel + c).
// pooling_param: window, stride, padding, shapes, thread count and quant_args_, where
//   quant_args_[0][0] is read as the input scale/zero point and quant_args_[1][0] as the output ones.
// task_id: index of the calling task. The output plane is split into tiles of TILE_NUM positions
//   and this task handles tiles task_id, task_id + thread_num, ...
//
// Window taps that fall outside the input are skipped; a window with no valid tap yields INT8_MIN
// before requantization.
void MaxPoolingWithQuantInt8(const int8_t *input_ptr, int8_t *output_ptr, PoolingParameter *pooling_param,
                             int task_id) {
  int stride_w = pooling_param->stride_w_;
  int stride_h = pooling_param->stride_h_;
  int pad_w = pooling_param->pad_l_;
  int pad_h = pooling_param->pad_u_;
  int win_w = pooling_param->window_w_;
  int win_h = pooling_param->window_h_;
  int channel = pooling_param->input_channel_;
  int in_w = pooling_param->input_w_;
  int in_h = pooling_param->input_h_;
  int output_w = pooling_param->output_w_;
  int output_h = pooling_param->output_h_;
  int output_batch = pooling_param->output_batch_;
  int out_plane = output_w * output_h;
  int out_tile_count = UP_DIV(out_plane, TILE_NUM);
  // Never use more tasks than there are tiles.
  int thread_num = out_tile_count < pooling_param->thread_num_ ? out_tile_count : pooling_param->thread_num_;
  // Nothing to do for an empty tensor. Also guards the tile loop against a non-positive stride and
  // the residual channel loop against starting at a negative channel index.
  if (channel <= 0 || thread_num <= 0) {
    return;
  }
  int c16 = UP_DIV(channel, C16NUM);
  // input channel is equal to output channel
  float input_scale = pooling_param->quant_args_[0][0].scale_;
  int input_zp = pooling_param->quant_args_[0][0].zp_;
  float output_scale = pooling_param->quant_args_[1][0].scale_;
  int output_zp = pooling_param->quant_args_[1][0].zp_;
  double real_multiplier = input_scale / output_scale;

  for (int batch = 0; batch < output_batch; batch++) {
    int in_batch_offset = batch * in_h * in_w * channel;
    int out_batch_offset = batch * output_h * output_w * channel;
    for (int thread_id = task_id; thread_id < out_tile_count; thread_id += thread_num) {
      int cal_start_index = thread_id * TILE_NUM;
      int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? TILE_NUM : (out_plane - cal_start_index);
      for (int i = 0; i < real_cal_num; i++) {
        int index = cal_start_index + i;
        int out_w_index = index % output_w;
        int out_h_index = index / output_w;
        int in_w_index = out_w_index * stride_w - pad_w;
        int in_h_index = out_h_index * stride_h - pad_h;
        int out_plane_offset = out_batch_offset + index * channel;
        // Clip the window to the input once per output position instead of testing every tap.
        int kh_s = in_h_index < 0 ? -in_h_index : 0;
        int kh_e = (in_h - in_h_index) < win_h ? (in_h - in_h_index) : win_h;
        int kw_s = in_w_index < 0 ? -in_w_index : 0;
        int kw_e = (in_w - in_w_index) < win_w ? (in_w - in_w_index) : win_w;

        // Full groups of 16 channels. The last group (even when complete) is left to the scalar
        // loop below, so a 16-byte access never runs past the channel dimension.
        for (int j = 0; j < c16 - 1; j++) {
          int in_channel_offset = in_batch_offset + j * C16NUM;
          int out_channel_offset = out_plane_offset + j * C16NUM;
#ifdef ENABLE_NEON
          int8x16_t max_vec = vdupq_n_s8(INT8_MIN);
#else
          int8_t tmp_max[C16NUM];
          for (int m = 0; m < C16NUM; ++m) {
            tmp_max[m] = INT8_MIN;
          }
#endif
          for (int h = kh_s; h < kh_e; h++) {
            for (int w = kw_s; w < kw_e; w++) {
              int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
#ifdef ENABLE_NEON
              max_vec = vmaxq_s8(max_vec, vld1q_s8(input_ptr + in_offset));
#else
              for (int k = 0; k < C16NUM; ++k) {
                tmp_max[k] = MaxInt8(tmp_max[k], *(input_ptr + in_offset + k));
              }
#endif
            }  // win_w loop
          }    // win_h loop
#ifdef ENABLE_NEON
          // Spill the lanes so the requantization below is shared with the scalar build and does
          // not rely on subscripting a NEON vector type.
          int8_t tmp_max[C16NUM];
          vst1q_s8(tmp_max, max_vec);
#endif
          for (int l = 0; l < C16NUM; ++l) {
            *(output_ptr + out_channel_offset + l) =
              MaxPoolRequantSaturateInt8(tmp_max[l], input_zp, real_multiplier, output_zp);
          }
        }  // in_channel loop

        // res channel
        int channel_s = (c16 - 1) * C16NUM;
        for (int k = channel_s; k < channel; k++) {
          int in_channel_offset = in_batch_offset + k;
          int out_channel_offset = out_plane_offset + k;
          int8_t tmp_max = INT8_MIN;
          for (int h = kh_s; h < kh_e; h++) {
            for (int w = kw_s; w < kw_e; w++) {
              int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
              tmp_max = MaxInt8(tmp_max, *(input_ptr + in_offset));
            }  // win_w loop
          }    // win_h loop
          *(output_ptr + out_channel_offset) =
            MaxPoolRequantSaturateInt8(tmp_max, input_zp, real_multiplier, output_zp);
        }  // channel_res loop
      }    // out_plane loop
    }      // out_tile loop
  }        // out_batch loop
}
void MaxPoolingOptInt8(const int8_t *input_ptr, int8_t *output_ptr, PoolingParameter *pooling_param, int task_id) { void MaxPoolingOptInt8(const int8_t *input_ptr, int8_t *output_ptr, PoolingParameter *pooling_param, int task_id) {
int stride_w = pooling_param->stride_w_; int stride_w = pooling_param->stride_w_;
int stride_h = pooling_param->stride_h_; int stride_h = pooling_param->stride_h_;
@ -264,7 +442,7 @@ void MaxPoolingOptInt8(const int8_t *input_ptr, int8_t *output_ptr, PoolingParam
int output_batch = pooling_param->output_batch_; int output_batch = pooling_param->output_batch_;
int out_plane = output_w * output_h; int out_plane = output_w * output_h;
int out_tile_count = UP_DIV(out_plane, TILE_NUM); int out_tile_count = UP_DIV(out_plane, TILE_NUM);
int thread_num = pooling_param->thread_num_; int thread_num = out_tile_count < pooling_param->thread_num_ ? out_tile_count : pooling_param->thread_num_;
int c16 = UP_DIV(channel, 16); int c16 = UP_DIV(channel, 16);
for (int batch = 0; batch < output_batch; batch++) { for (int batch = 0; batch < output_batch; batch++) {
@ -286,22 +464,10 @@ void MaxPoolingOptInt8(const int8_t *input_ptr, int8_t *output_ptr, PoolingParam
#ifdef ENABLE_NEON #ifdef ENABLE_NEON
int8x16_t tmp_max = vdupq_n_s8(INT8_MIN); int8x16_t tmp_max = vdupq_n_s8(INT8_MIN);
#else #else
int8_t tmp_max1 = INT8_MIN; int8_t tmp_max[16];
int8_t tmp_max2 = INT8_MIN; for (int m = 0; m < C16NUM; ++m) {
int8_t tmp_max3 = INT8_MIN; tmp_max[m] = INT8_MIN;
int8_t tmp_max4 = INT8_MIN; }
int8_t tmp_max5 = INT8_MIN;
int8_t tmp_max6 = INT8_MIN;
int8_t tmp_max7 = INT8_MIN;
int8_t tmp_max8 = INT8_MIN;
int8_t tmp_max9 = INT8_MIN;
int8_t tmp_max10 = INT8_MIN;
int8_t tmp_max11 = INT8_MIN;
int8_t tmp_max12 = INT8_MIN;
int8_t tmp_max13 = INT8_MIN;
int8_t tmp_max14 = INT8_MIN;
int8_t tmp_max15 = INT8_MIN;
int8_t tmp_max16 = INT8_MIN;
#endif #endif
for (int h = 0; h < win_h; h++) { for (int h = 0; h < win_h; h++) {
for (int w = 0; w < win_w; w++) { for (int w = 0; w < win_w; w++) {
@ -313,22 +479,9 @@ void MaxPoolingOptInt8(const int8_t *input_ptr, int8_t *output_ptr, PoolingParam
#ifdef ENABLE_NEON #ifdef ENABLE_NEON
tmp_max = vmaxq_s8(tmp_max, vld1q_s8(input_ptr + in_offset)); tmp_max = vmaxq_s8(tmp_max, vld1q_s8(input_ptr + in_offset));
#else #else
tmp_max1 = MaxInt8(tmp_max1, *(input_ptr + in_offset)); for (int k = 0; k < C16NUM; ++k) {
tmp_max2 = MaxInt8(tmp_max2, *(input_ptr + in_offset + 1)); tmp_max[k] = MaxInt8(tmp_max[k], *(input_ptr + in_offset + k));
tmp_max3 = MaxInt8(tmp_max3, *(input_ptr + in_offset + 2)); }
tmp_max4 = MaxInt8(tmp_max4, *(input_ptr + in_offset + 3));
tmp_max5 = MaxInt8(tmp_max5, *(input_ptr + in_offset + 4));
tmp_max6 = MaxInt8(tmp_max6, *(input_ptr + in_offset + 5));
tmp_max7 = MaxInt8(tmp_max7, *(input_ptr + in_offset + 6));
tmp_max8 = MaxInt8(tmp_max8, *(input_ptr + in_offset + 7));
tmp_max9 = MaxInt8(tmp_max9, *(input_ptr + in_offset + 8));
tmp_max10 = MaxInt8(tmp_max10, *(input_ptr + in_offset + 9));
tmp_max11 = MaxInt8(tmp_max11, *(input_ptr + in_offset + 10));
tmp_max12 = MaxInt8(tmp_max12, *(input_ptr + in_offset + 11));
tmp_max13 = MaxInt8(tmp_max13, *(input_ptr + in_offset + 12));
tmp_max14 = MaxInt8(tmp_max14, *(input_ptr + in_offset + 13));
tmp_max15 = MaxInt8(tmp_max15, *(input_ptr + in_offset + 14));
tmp_max16 = MaxInt8(tmp_max16, *(input_ptr + in_offset + 15));
#endif #endif
} }
} // win_w loop } // win_w loop
@ -336,24 +489,13 @@ void MaxPoolingOptInt8(const int8_t *input_ptr, int8_t *output_ptr, PoolingParam
#ifdef ENABLE_NEON #ifdef ENABLE_NEON
vst1q_s8(output_ptr + out_channel_offset, tmp_max); vst1q_s8(output_ptr + out_channel_offset, tmp_max);
#else #else
*(output_ptr + out_channel_offset) = tmp_max1; for (int l = 0; l < C16NUM; ++l) {
*(output_ptr + out_channel_offset + 1) = tmp_max2; *(output_ptr + out_channel_offset + l) = tmp_max[l];
*(output_ptr + out_channel_offset + 2) = tmp_max3; }
*(output_ptr + out_channel_offset + 3) = tmp_max4;
*(output_ptr + out_channel_offset + 4) = tmp_max5;
*(output_ptr + out_channel_offset + 5) = tmp_max6;
*(output_ptr + out_channel_offset + 6) = tmp_max7;
*(output_ptr + out_channel_offset + 7) = tmp_max8;
*(output_ptr + out_channel_offset + 8) = tmp_max9;
*(output_ptr + out_channel_offset + 9) = tmp_max10;
*(output_ptr + out_channel_offset + 10) = tmp_max11;
*(output_ptr + out_channel_offset + 11) = tmp_max12;
*(output_ptr + out_channel_offset + 12) = tmp_max13;
*(output_ptr + out_channel_offset + 13) = tmp_max14;
*(output_ptr + out_channel_offset + 14) = tmp_max15;
*(output_ptr + out_channel_offset + 15) = tmp_max16;
#endif #endif
} // in_channel loop } // in_channel loop
// res channel
int channel_s = (c16 - 1) * 16; int channel_s = (c16 - 1) * 16;
for (int k = channel_s; k < channel; k++) { for (int k = channel_s; k < channel; k++) {
int in_channel_offset = in_batch_offset + k; int in_channel_offset = in_batch_offset + k;

View File

@ -32,6 +32,8 @@ void AvgPoolingOptInt8(const int8_t *input_ptr, int8_t *output_ptr, PoolingParam
void MaxPoolingInt8(const int8_t *input_ptr, int8_t *output_ptr, PoolingParameter *pooling_param, int task_id); void MaxPoolingInt8(const int8_t *input_ptr, int8_t *output_ptr, PoolingParameter *pooling_param, int task_id);
void MaxPoolingWithQuantInt8(const int8_t *input_ptr, int8_t *output_ptr, PoolingParameter *pooling_param, int task_id);
void MaxPoolingOptInt8(const int8_t *input_ptr, int8_t *output_ptr, PoolingParameter *pooling_param, int task_id); void MaxPoolingOptInt8(const int8_t *input_ptr, int8_t *output_ptr, PoolingParameter *pooling_param, int task_id);
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -19,14 +19,16 @@
#include "nnacl/op_base.h" #include "nnacl/op_base.h"
#include "nnacl/quantization/quantize.h" #include "nnacl/quantization/quantize.h"
// Pooling operation held in PoolingParameter.pool_mode_.
// PoolMode_No is the fallback the parameter-populate code assigns when the schema pooling mode is
// neither max pooling nor mean pooling.
typedef enum PoolMode { PoolMode_No, PoolMode_MaxPool, PoolMode_AvgPool } PoolMode;
// Rounding mode held in PoolingParameter.round_mode_.
// RoundMode_No is the fallback the parameter-populate code assigns when the schema round mode is
// neither FLOOR nor CEIL.
typedef enum RoundMode { RoundMode_No, RoundMode_Ceil, RoundMode_Floor } RoundMode;
typedef struct PoolingParameter { typedef struct PoolingParameter {
OpParameter op_parameter_; OpParameter op_parameter_;
PoolMode pool_mode_;
RoundMode round_mode_;
ActType act_type_;
QuantArg **quant_args_; QuantArg **quant_args_;
bool global_;
bool max_pooling_;
bool avg_pooling_;
bool round_ceil_;
bool round_floor_;
int window_w_; int window_w_;
int window_h_; int window_h_;
int input_w_; int input_w_;
@ -44,7 +46,8 @@ typedef struct PoolingParameter {
int stride_w_; int stride_w_;
int stride_h_; int stride_h_;
int thread_num_; int thread_num_;
ActType act_type_; bool global_;
bool quantize_;
} PoolingParameter; } PoolingParameter;
#endif // MINDSPORE_LITE_NNACL_POOLING_PARAMETER_H_ #endif // MINDSPORE_LITE_NNACL_POOLING_PARAMETER_H_

View File

@ -294,32 +294,26 @@ OpParameter *PopulatePoolingParameter(const mindspore::lite::PrimitiveC *primiti
auto pool_mode = pooling_primitive->GetPoolingMode(); auto pool_mode = pooling_primitive->GetPoolingMode();
switch (pool_mode) { switch (pool_mode) {
case schema::PoolMode_MAX_POOLING: case schema::PoolMode_MAX_POOLING:
pooling_param->max_pooling_ = true; pooling_param->pool_mode_ = PoolMode_MaxPool;
pooling_param->avg_pooling_ = false;
break; break;
case schema::PoolMode_MEAN_POOLING: case schema::PoolMode_MEAN_POOLING:
pooling_param->max_pooling_ = false; pooling_param->pool_mode_ = PoolMode_AvgPool;
pooling_param->avg_pooling_ = true;
break; break;
default: default:
pooling_param->max_pooling_ = false; pooling_param->pool_mode_ = PoolMode_No;
pooling_param->avg_pooling_ = false;
break; break;
} }
auto round_mode = pooling_primitive->GetRoundMode(); auto round_mode = pooling_primitive->GetRoundMode();
switch (round_mode) { switch (round_mode) {
case schema::RoundMode_FLOOR: case schema::RoundMode_FLOOR:
pooling_param->round_floor_ = true; pooling_param->round_mode_ = RoundMode_Floor;
pooling_param->round_ceil_ = false;
break; break;
case schema::RoundMode_CEIL: case schema::RoundMode_CEIL:
pooling_param->round_floor_ = false; pooling_param->round_mode_ = RoundMode_Ceil;
pooling_param->round_ceil_ = true;
break; break;
default: default:
pooling_param->round_floor_ = false; pooling_param->round_mode_ = RoundMode_No;
pooling_param->round_ceil_ = false;
break; break;
} }

View File

@ -42,6 +42,12 @@ int PoolingBaseCPUKernel::SetQuantParam() {
pooling_quant_arg_[1][0].scale_ = out_quant_arg.front().scale; pooling_quant_arg_[1][0].scale_ = out_quant_arg.front().scale;
pooling_quant_arg_[1][0].zp_ = out_quant_arg.front().zeroPoint; pooling_quant_arg_[1][0].zp_ = out_quant_arg.front().zeroPoint;
pooling_param_->quant_args_ = pooling_quant_arg_; pooling_param_->quant_args_ = pooling_quant_arg_;
if (pooling_quant_arg_[0][0].scale_ == pooling_quant_arg_[1][0].scale_ &&
pooling_quant_arg_[0][0].zp_ == pooling_quant_arg_[1][0].zp_) {
pooling_param_->quantize_ = false;
} else {
pooling_param_->quantize_ = true;
}
return RET_OK; return RET_OK;
} }

View File

@ -53,7 +53,7 @@ int PoolingFp16CPUKernel::ReSize() {
} }
int PoolingFp16CPUKernel::RunImpl(int task_id) { int PoolingFp16CPUKernel::RunImpl(int task_id) {
if (pooling_param_->max_pooling_) { if (pooling_param_->pool_mode_ == PoolMode_MaxPool) {
MaxPoolingFp16(fp16_input_, fp16_output_, pooling_param_, task_id); MaxPoolingFp16(fp16_input_, fp16_output_, pooling_param_, task_id);
} else { } else {
AvgPoolingFp16(fp16_input_, fp16_output_, pooling_param_, task_id); AvgPoolingFp16(fp16_input_, fp16_output_, pooling_param_, task_id);

View File

@ -52,7 +52,7 @@ int PoolingCPUKernel::ReSize() {
int PoolingCPUKernel::RunImpl(int task_id) { int PoolingCPUKernel::RunImpl(int task_id) {
auto input_ptr = reinterpret_cast<float *>(in_tensors_.at(kInputIndex)->Data()); auto input_ptr = reinterpret_cast<float *>(in_tensors_.at(kInputIndex)->Data());
auto output_ptr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->Data()); auto output_ptr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->Data());
if (pooling_param_->max_pooling_) { if (pooling_param_->pool_mode_ == PoolMode_MaxPool) {
switch (pooling_param_->act_type_) { switch (pooling_param_->act_type_) {
case ActType_Relu: case ActType_Relu:
MaxPoolingRelu(input_ptr, output_ptr, pooling_param_, task_id); MaxPoolingRelu(input_ptr, output_ptr, pooling_param_, task_id);

View File

@ -163,7 +163,7 @@ int PoolingGradCPUKernel::Run() {
auto input_ptr = reinterpret_cast<float *>(inputs_.at(0)->Data()); auto input_ptr = reinterpret_cast<float *>(inputs_.at(0)->Data());
auto output_ptr = reinterpret_cast<float *>(outputs_.at(0)->Data()); auto output_ptr = reinterpret_cast<float *>(outputs_.at(0)->Data());
if (pool_param->max_pooling_) { if (pool_param->pool_mode_ == PoolMode_MaxPool) {
auto ind = reinterpret_cast<int *>(inputs_.at(1)->Data()); auto ind = reinterpret_cast<int *>(inputs_.at(1)->Data());
MaxPoolingGrad(input_ptr, ind, output_ptr, pool_param); MaxPoolingGrad(input_ptr, ind, output_ptr, pool_param);
} else { } else {

View File

@ -61,10 +61,14 @@ int PoolingInt8CPUKernel::ReSize() {
int PoolingInt8CPUKernel::RunImpl(int task_id) { int PoolingInt8CPUKernel::RunImpl(int task_id) {
auto input_data = reinterpret_cast<int8_t *>(in_tensors_.at(kInputIndex)->Data()); auto input_data = reinterpret_cast<int8_t *>(in_tensors_.at(kInputIndex)->Data());
auto output_data = reinterpret_cast<int8_t *>(out_tensors_.at(kOutputIndex)->Data()); auto output_data = reinterpret_cast<int8_t *>(out_tensors_.at(kOutputIndex)->Data());
if (pooling_param_->max_pooling_) { if (pooling_param_->pool_mode_ == PoolMode_MaxPool) {
MaxPoolingInt8(input_data, output_data, pooling_param_, task_id); if (pooling_param_->quantize_) {
MaxPoolingWithQuantInt8(input_data, output_data, pooling_param_, task_id);
} else { } else {
AvgPoolingInt8(input_data, output_data, pooling_param_, task_id); MaxPoolingOptInt8(input_data, output_data, pooling_param_, task_id);
}
} else {
AvgPoolingOptInt8(input_data, output_data, pooling_param_, task_id);
} }
return RET_OK; return RET_OK;
} }

View File

@ -43,13 +43,13 @@ int PoolingOpenCLKernel::Init() {
std::string source; std::string source;
std::string program_name; std::string program_name;
#endif #endif
if (parameter_->max_pooling_) { if (parameter_->pool_mode_ == PoolMode_MaxPool) {
kernel_name = "MaxPooling2d"; kernel_name = "MaxPooling2d";
#ifndef PROGRAM_WITH_IL #ifndef PROGRAM_WITH_IL
source = max_pool2d_source; source = max_pool2d_source;
program_name = "MaxPooling2d"; program_name = "MaxPooling2d";
#endif #endif
} else if (parameter_->avg_pooling_) { } else if (parameter_->pool_mode_ == PoolMode_AvgPool) {
kernel_name = "AvgPooling2d"; kernel_name = "AvgPooling2d";
#ifndef PROGRAM_WITH_IL #ifndef PROGRAM_WITH_IL
source = avg_pool2d_source; source = avg_pool2d_source;

View File

@ -161,8 +161,7 @@ TEST_F(TestPoolingGradFp32, MaxPoolingGradFp32) {
auto pooling_param = new PoolingParameter(); auto pooling_param = new PoolingParameter();
InitPoolingParamFP32(pooling_param); InitPoolingParamFP32(pooling_param);
pooling_param->output_channel_ = 3; pooling_param->output_channel_ = 3;
pooling_param->avg_pooling_ = false; pooling_param->pool_mode_ = PoolMode_MaxPool;
pooling_param->max_pooling_ = true;
// runtime part // runtime part
printf("Calculating runtime cost...\n"); printf("Calculating runtime cost...\n");
uint64_t time_avg = 0; uint64_t time_avg = 0;
@ -215,8 +214,7 @@ TEST_F(TestPoolingGradFp32, MaxPoolingKernelGradFp32) {
// prepare stage // prepare stage
auto maxpool = new PoolingParameter(); auto maxpool = new PoolingParameter();
InitPoolingParamFP32(maxpool); InitPoolingParamFP32(maxpool);
maxpool->avg_pooling_ = false; maxpool->pool_mode_ = PoolMode_MaxPool;
maxpool->max_pooling_ = true;
maxpool->input_h_ = 30; maxpool->input_h_ = 30;
maxpool->input_w_ = 30; maxpool->input_w_ = 30;
maxpool->input_channel_ = 3; maxpool->input_channel_ = 3;
@ -268,8 +266,7 @@ TEST_F(TestPoolingGradFp32, MaxPoolingKernelGradFp32) {
auto pooling_param = new PoolingParameter(); auto pooling_param = new PoolingParameter();
InitPoolingParamFP32(pooling_param); InitPoolingParamFP32(pooling_param);
pooling_param->avg_pooling_ = false; pooling_param->pool_mode_ = PoolMode_MaxPool;
pooling_param->max_pooling_ = true;
pooling_param->input_h_ = 10; pooling_param->input_h_ = 10;
pooling_param->input_w_ = 10; pooling_param->input_w_ = 10;
pooling_param->input_channel_ = 3; pooling_param->input_channel_ = 3;

View File

@ -48,8 +48,7 @@ void InitAvgPoolingParam(PoolingParameter *param) {
param->pad_l_ = 0; param->pad_l_ = 0;
param->pad_r_ = 0; param->pad_r_ = 0;
param->max_pooling_ = false; param->pool_mode_ = PoolMode_AvgPool;
param->avg_pooling_ = true;
} }
TEST_F(TestAvgPoolingOpenCL, AvgPoolFp32) { TEST_F(TestAvgPoolingOpenCL, AvgPoolFp32) {

View File

@ -35,8 +35,7 @@ void InitParameter(PoolingParameter *param) {
param->pad_d_ = 0; param->pad_d_ = 0;
param->pad_l_ = 0; param->pad_l_ = 0;
param->pad_r_ = 0; param->pad_r_ = 0;
param->avg_pooling_ = false; param->pool_mode_ = PoolMode_MaxPool;
param->max_pooling_ = true;
} }
TEST_F(TestMaxPoolingOpenCL, MaxPool_1_32_512_96) { TEST_F(TestMaxPoolingOpenCL, MaxPool_1_32_512_96) {