forked from mindspore-Ecosystem/mindspore

add activation fusion for fp16 pooling

commit 772adb84d7, parent 8d20d1b8e6
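Review note: this commit threads clamp bounds (min, max) through the fp16 pooling kernels so a trailing Relu or Relu6 can be fused into the pooling loop instead of running as a separate activation pass. Per the kernel hunk at the bottom, Relu maps to min = 0, Relu6 to min = 0 / max = 6, and "no activation" passes the widest range (±FLT_MAX, which rounds to ±inf in fp16). In miniature the fused epilogue is a divide (for average pooling) followed by a clamp; the following is a sketch assuming a toolchain-provided `float16_t`, not code from the patch:

```c
#include <math.h>

// Fused epilogue in miniature: average-pool step, then clamp to [min, max].
// With min = 0 this is Relu; with min = 0, max = 6 it is Relu6. float16_t is
// assumed to come from the toolchain (e.g. arm_neon.h on AArch64).
static inline float16_t PoolThenClamp(float16_t sum, int count,
                                      float16_t min, float16_t max) {
  float16_t v = sum / (float16_t)count;       // average-pool step
  return (float16_t)fmin(fmax(v, min), max);  // fused activation
}
```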
@@ -17,7 +17,8 @@
 #include <float.h>
 #include "nnacl/errorcode.h"
 
-int AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingParameter *pooling_param, int task_id) {
+int AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingParameter *pooling_param, int task_id,
+                   float16_t min, float16_t max) {
   int stride_w = pooling_param->stride_w_;
   int stride_h = pooling_param->stride_h_;
   int pad_w = pooling_param->pad_l_;
@@ -40,6 +41,12 @@ int AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPar
   int thread_num = pooling_param->thread_num_;
   // input channel is equal to output channel
 
+#ifdef ENABLE_NEON
+  float16x8_t min_value = vdupq_n_f16(min);
+  float16x8_t max_value = vdupq_n_f16(max);
+  float16x4_t min_value2 = vdup_n_f16(min);
+  float16x4_t max_value2 = vdup_n_f16(max);
+#endif
   for (int batch = 0; batch < output_batch; batch++) {
     int in_batch_offset = batch * in_h * in_w * channel;
     int out_batch_offset = batch * output_h * output_w * channel;
@@ -88,10 +95,16 @@ int AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPar
           return NNACL_ERR;
         }
 #ifdef ENABLE_NEON
-        vst1q_f16(output_ptr + out_channel_offset, tmp_avg / vdupq_n_f16(real_count));
+        tmp_avg = vdivq_f16(tmp_avg, vdupq_n_f16(real_count));
+        tmp_avg = vmaxq_f16(tmp_avg, min_value);
+        tmp_avg = vminq_f16(tmp_avg, max_value);
+        vst1q_f16(output_ptr + out_channel_offset, tmp_avg);
 #else
         for (int t = 0; t < C8NUM; ++t) {
-          *(output_ptr + out_channel_offset + t) = tmp_avg[t] / (float16_t)real_count;
+          float16_t tmp_value = tmp_avg[t] / (float16_t)real_count;
+          tmp_value = fmax(tmp_value, min);
+          tmp_value = fmin(tmp_value, max);
+          output_ptr[out_channel_offset + t] = tmp_value;
        }
 #endif
       } // c8 loop
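Note on the NEON path: the old code divided with the vector `/` operator (a GCC vector-extension shortcut) and stored in one statement; the fused version divides explicitly with `vdivq_f16`, then clamps with `vmaxq_f16`/`vminq_f16` against bounds broadcast once per call via `vdupq_n_f16`. A minimal sketch of that tail, assuming an ARMv8.2-A toolchain with FP16 arithmetic (`-march=armv8.2-a+fp16`); the helper name is ours, not the patch's:

```c
#include <arm_neon.h>

// Hypothetical helper mirroring the fused C8 tail above: average the window
// sum across 8 fp16 lanes, clamp to [min, max], and store.
static inline void AvgClampStoreC8(float16x8_t sum, int real_count,
                                   float16x8_t min_value, float16x8_t max_value,
                                   float16_t *dst) {
  float16x8_t avg = vdivq_f16(sum, vdupq_n_f16((float16_t)real_count));
  avg = vmaxq_f16(avg, min_value);  // lower bound: Relu / Relu6
  avg = vminq_f16(avg, max_value);  // upper bound: Relu6
  vst1q_f16(dst, avg);
}
```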
@@ -126,10 +139,16 @@ int AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPar
           return NNACL_ERR;
         }
 #ifdef ENABLE_NEON
-        vst1_f16(output_ptr + out_channel_offset, tmp_avg / vdup_n_f16(real_count));
+        tmp_avg = vdiv_f16(tmp_avg, vdup_n_f16(real_count));
+        tmp_avg = vmax_f16(tmp_avg, min_value2);
+        tmp_avg = vmin_f16(tmp_avg, max_value2);
+        vst1_f16(output_ptr + out_channel_offset, tmp_avg);
 #else
         for (int t = 0; t < C4NUM; ++t) {
-          *(output_ptr + out_channel_offset + t) = tmp_avg[t] / (float16_t)real_count;
+          float16_t tmp_value = tmp_avg[t] / (float16_t)real_count;
+          tmp_value = fmax(tmp_value, min);
+          tmp_value = fmin(tmp_value, max);
+          output_ptr[out_channel_offset + t] = tmp_value;
         }
 #endif
       } // c4 loop
@@ -150,7 +169,10 @@ int AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPar
         if (real_count == 0) {
           return NNACL_ERR;
         }
-        *(output_ptr + out_channel_offset) = tmp_avg / (float16_t)real_count;
+        float16_t tmp_value = tmp_avg / (float16_t)real_count;
+        tmp_value = fmax(tmp_value, min);
+        tmp_value = fmin(tmp_value, max);
+        output_ptr[out_channel_offset] = tmp_value;
       } // channel_res loop
     } // real_cal_num loop
   } // out_plane loop
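On the scalar fallback the clamp uses `fmax`/`fmin` from `<math.h>`, which compute in double: each `float16_t` operand is promoted and the result converts back on assignment. That is correct, just wider than necessary; a comparison-based clamp stays in fp16, at the cost of `fmax`/`fmin`'s defined NaN handling. A sketch, not part of the patch (assumes a toolchain providing `float16_t`, e.g. via `arm_neon.h`):

```c
// Comparison-based fp16 clamp. Unlike fmax/fmin, which return the non-NaN
// operand, this propagates a NaN input unchanged.
static inline float16_t ClampFp16(float16_t v, float16_t min, float16_t max) {
  return v < min ? min : (v > max ? max : v);
}
```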
@@ -158,7 +180,8 @@ int AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPar
   return NNACL_OK;
 }
 
-void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingParameter *pooling_param, int task_id) {
+void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingParameter *pooling_param, int task_id,
+                    float16_t min, float16_t max) {
   int stride_w = pooling_param->stride_w_;
   int stride_h = pooling_param->stride_h_;
   int pad_w = pooling_param->pad_l_;
@@ -177,6 +200,12 @@ void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa
   int c8 = channel / C8NUM;
   int c8_res = channel % C8NUM;
   int c4 = c8_res / C4NUM;
+#ifdef ENABLE_NEON
+  float16x8_t min_value = vdupq_n_f16(min);
+  float16x8_t max_value = vdupq_n_f16(max);
+  float16x4_t min_value2 = vdup_n_f16(min);
+  float16x4_t max_value2 = vdup_n_f16(max);
+#endif
   // input channel is equal to output channel
 
   for (int batch = 0; batch < output_batch; batch++) {
@@ -219,9 +248,13 @@ void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa
           } // win_w loop
         } // win_h loop
 #ifdef ENABLE_NEON
+        tmp_max = vmaxq_f16(tmp_max, min_value);
+        tmp_max = vminq_f16(tmp_max, max_value);
         vst1q_f16(output_ptr + out_channel_offset, tmp_max);
 #else
         for (int l = 0; l < C8NUM; ++l) {
+          tmp_max[l] = fmax(tmp_max[l], min);
+          tmp_max[l] = fmin(tmp_max[l], max);
           *(output_ptr + out_channel_offset + l) = tmp_max[l];
         }
 #endif
@@ -249,10 +282,14 @@ void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa
           } // win_w loop
         } // win_h loop
 #ifdef ENABLE_NEON
+        tmp_max = vmax_f16(tmp_max, min_value2);
+        tmp_max = vmin_f16(tmp_max, max_value2);
         vst1_f16(output_ptr + out_channel_offset, tmp_max);
 #else
         for (int l = 0; l < C4NUM; ++l) {
-          *(output_ptr + out_channel_offset + l) = tmp_max[l];
+          tmp_max[l] = fmax(tmp_max[l], min);
+          tmp_max[l] = fmin(tmp_max[l], max);
+          output_ptr[out_channel_offset + l] = tmp_max[l];
         }
 #endif
       } // c4 loop
@@ -268,7 +305,9 @@ void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa
             tmp_max = fmax(tmp_max, *(input_ptr + in_offset));
           } // win_w loop
         } // win_h loop
-        *(output_ptr + out_channel_offset) = tmp_max;
+        tmp_max = fmax(tmp_max, min);
+        tmp_max = fmin(tmp_max, max);
+        output_ptr[out_channel_offset] = tmp_max;
       } // channel_res loop
     } // real_cal_num loop
   } // out_plane loop
@@ -24,9 +24,11 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-int AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingParameter *pooling_param, int task_id);
+int AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingParameter *pooling_param, int task_id,
+                   float16_t min, float16_t max);
 
-void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingParameter *pooling_param, int task_id);
+void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingParameter *pooling_param, int task_id,
+                    float16_t min, float16_t max);
 #ifdef __cplusplus
 }
 #endif
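With this header change the clamp bounds become part of the public nnacl API: every caller now states its activation as a [min, max] range. A small self-contained check, not from the repository, that the clamp reproduces Relu and Relu6 (assumes an AArch64 toolchain where `float16_t` comes from `arm_neon.h`):

```c
#include <arm_neon.h>
#include <math.h>
#include <stdio.h>

// Same fmax/fmin clamp pattern the patch uses, lifted into a helper.
static float16_t Clamp(float16_t v, float16_t min, float16_t max) {
  return (float16_t)fmin(fmax(v, min), max);
}

int main(void) {
  const float16_t x[4] = {-2.0f, 0.5f, 3.0f, 8.0f};
  for (int i = 0; i < 4; ++i) {
    printf("x=%4g  relu=%g  relu6=%g\n", (double)x[i],
           (double)Clamp(x[i], 0.0f, INFINITY),  // Relu: [0, +inf)
           (double)Clamp(x[i], 0.0f, 6.0f));     // Relu6: [0, 6]
  }
  return 0;
}
```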
@@ -28,10 +28,7 @@ using mindspore::schema::PrimitiveType_Pad;
 
 namespace mindspore::kernel {
 int PadFp16CPUKernel::RunImpl(int task_id) {
-  auto input_data = reinterpret_cast<float16_t *>(in_tensors_.at(0)->MutableData());
-  auto output_data = reinterpret_cast<float16_t *>(out_tensors_.at(0)->MutableData());
-
-  PadFp16(input_data, output_data, in_, out_, pad_param_->paddings_, task_id, context_->thread_num_);
+  PadFp16(input_, output_, in_, out_, pad_param_->paddings_, task_id, context_->thread_num_);
   return RET_OK;
 }
 
@@ -54,7 +51,13 @@ int PadFp16CPUKernel::Run() {
     return RET_ERROR;
   }
 
-  memset(output_, 0, output_tensor->ElementsNum() * sizeof(float16_t));
+  if (pad_param_->constant_value_ - 0.0f < 1e-5) {
+    memset(output_, 0, output_tensor->ElementsNum() * sizeof(float16_t));
+  } else {
+    for (int i = 0; i < output_tensor->ElementsNum(); ++i) {
+      output_[i] = pad_param_->constant_value_;
+    }
+  }
   ret = ParallelLaunch(this->context_->thread_pool_, PadImpl, this, op_parameter_->thread_num_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "BatchnormRun error error_code[" << ret << "]";
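One nit on the committed check: `constant_value_ - 0.0f < 1e-5` is one-sided, so any negative fill value also takes the memset branch and pads with zeros. A symmetric near-zero test is the usual fix; a sketch, not what the commit does, with the tensor fields from the hunk above flattened into parameters (`float16_t` assumed toolchain-provided):

```c
#include <math.h>
#include <string.h>

// Symmetric variant of the near-zero test before filling the pad output.
static void FillConstantFp16(float16_t *output, int element_num, float value) {
  if (fabsf(value) < 1e-5f) {
    memset(output, 0, element_num * sizeof(float16_t));  // fast path for zero
  } else {
    for (int i = 0; i < element_num; ++i) {
      output[i] = value;
    }
  }
}
```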
@@ -53,10 +53,18 @@ int PoolingFp16CPUKernel::ReSize() {
 }
 
 int PoolingFp16CPUKernel::RunImpl(int task_id) {
+  float16_t minf = -FLT_MAX;
+  float16_t maxf = FLT_MAX;
+  if (pooling_param_->act_type_ == ActType_Relu) {
+    minf = 0.f;
+  } else if (pooling_param_->act_type_ == ActType_Relu6) {
+    minf = 0.f;
+    maxf = 6.f;
+  }
   if (pooling_param_->pool_mode_ == PoolMode_MaxPool) {
-    MaxPoolingFp16(fp16_input_, fp16_output_, pooling_param_, task_id);
+    MaxPoolingFp16(fp16_input_, fp16_output_, pooling_param_, task_id, minf, maxf);
   } else {
-    auto ret = AvgPoolingFp16(fp16_input_, fp16_output_, pooling_param_, task_id);
+    auto ret = AvgPoolingFp16(fp16_input_, fp16_output_, pooling_param_, task_id, minf, maxf);
     if (ret != RET_OK) {
       MS_LOG(ERROR) << "AvgPooling run failed.";
       return ret;
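The defaults `minf = -FLT_MAX` / `maxf = FLT_MAX` rely on float-to-fp16 conversion behavior: FLT_MAX (≈3.4e38) is far above fp16's largest finite value (65504), so both bounds round to ±infinity and the clamp becomes a no-op when no activation is fused. A quick standalone check of that assumption (AArch64 toolchain, `float16_t` from `arm_neon.h`):

```c
#include <arm_neon.h>
#include <float.h>
#include <math.h>
#include <stdio.h>

int main(void) {
  float16_t minf = -FLT_MAX;  // overflows fp16 range -> rounds to -inf
  float16_t maxf = FLT_MAX;   // overflows fp16 range -> rounds to +inf
  printf("minf inf? %d  maxf inf? %d\n", isinf((float)minf), isinf((float)maxf));
  // Clamping against +/-inf leaves every finite pooling result unchanged.
  return 0;
}
```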