commit 59f03e8e21

@@ -258,35 +258,36 @@ void PackNHWCToNXHWCXFp32(int kernel_h, int kernel_w, int output_channel, int oc
tmp_weight[oc_remainder + oc_remainder_step * ic] = src[ic + oc_remainder * input_channel];
}
}
} else {
for (; oc < oc_block8; oc += (oc_block / C8NUM)) {
oc_block = MSMIN(C4NUM, oc_block8 - oc) * C8NUM;  // max_tile = 32 ==> 24 ==> 16 ==> 8
for (int oc_tmp = 0; oc_tmp < oc_block; oc_tmp += C8NUM) {
for (int hw = 0; hw < plane; ++hw) {
int ic = 0;
for (; ic < ic8; ic += C8NUM) {
Transpose8X8Fp32Avx(src + hw * input_channel + ic,
tmp_weight + hw * oc_block * input_channel + ic * oc_block + oc_tmp,
input_channel * plane, oc_block);
}
for (; ic < input_channel; ++ic) {
for (int j = 0; j < C8NUM; ++j) {
tmp_weight[ic * oc_block + oc_tmp + j + hw * oc_block * input_channel] =
src[ic + input_channel * j * plane + hw * input_channel];
}
}
}
src += C8NUM * plane * input_channel;
}
tmp_weight += oc_block * input_channel * plane;
}
oc = output_channel - oc_block8 * C8NUM;
for (int oc_remainder = 0; oc_remainder < oc; ++oc_remainder) {
return;
}

for (; oc < oc_block8; oc += (oc_block / C8NUM)) {
oc_block = MSMIN(C4NUM, oc_block8 - oc) * C8NUM;  // max_tile = 32 ==> 24 ==> 16 ==> 8
for (int oc_tmp = 0; oc_tmp < oc_block; oc_tmp += C8NUM) {
for (int hw = 0; hw < plane; ++hw) {
for (int ic = 0; ic < input_channel; ++ic) {
tmp_weight[oc_remainder + oc_remainder_step * ic + hw * input_channel * oc_remainder_step] =
src[ic + (oc_remainder * plane + hw) * input_channel];
int ic = 0;
for (; ic < ic8; ic += C8NUM) {
Transpose8X8Fp32Avx(src + hw * input_channel + ic,
tmp_weight + hw * oc_block * input_channel + ic * oc_block + oc_tmp,
input_channel * plane, oc_block);
}
for (; ic < input_channel; ++ic) {
for (int j = 0; j < C8NUM; ++j) {
tmp_weight[ic * oc_block + oc_tmp + j + hw * oc_block * input_channel] =
src[ic + input_channel * j * plane + hw * input_channel];
}
}
}
src += C8NUM * plane * input_channel;
}
tmp_weight += oc_block * input_channel * plane;
}
oc = output_channel - oc_block8 * C8NUM;
for (int oc_remainder = 0; oc_remainder < oc; ++oc_remainder) {
for (int hw = 0; hw < plane; ++hw) {
for (int ic = 0; ic < input_channel; ++ic) {
tmp_weight[oc_remainder + oc_remainder_step * ic + hw * input_channel * oc_remainder_step] =
src[ic + (oc_remainder * plane + hw) * input_channel];
}
}
}

@@ -19,15 +19,12 @@
#include "nnacl/errorcode.h"
#include "nnacl/op_base.h"

int AvgPooling(const float *input_ptr, float *output_ptr, const PoolingParameter *pooling_param, int task_id,
float minf, float maxf) {
int win_w = pooling_param->window_w_;
int win_h = pooling_param->window_h_;
int AvgPoolingBatch(const float *src_b_ptr, float *dst_b_ptr, const PoolingParameter *pooling_param, int task_id,
float minf, float maxf) {
int win_w = pooling_param->window_w_, win_h = pooling_param->window_h_;
int in_w = pooling_param->input_w_, in_h = pooling_param->input_h_;
int output_w = pooling_param->output_w_, output_h = pooling_param->output_h_;
int channel = pooling_param->input_channel_;
int in_w = pooling_param->input_w_;
int in_h = pooling_param->input_h_;
int output_w = pooling_param->output_w_;
int output_h = pooling_param->output_h_;
int out_plane = output_w * output_h;
int out_tile_count = UP_DIV(out_plane, TILE_NUM);
NNACL_CHECK_ZERO_RETURN_ERR(output_w);
@@ -42,190 +39,218 @@ int AvgPooling(const float *input_ptr, float *output_ptr, const PoolingParameter
MS_FLOAT32X4 max_value = MS_MOVQ_F32(maxf);
#endif

for (int batch = 0; batch < pooling_param->output_batch_; batch++) {
const float *src_b_ptr = input_ptr + batch * in_h * in_w * channel;
float *dst_b_ptr = output_ptr + batch * output_h * output_w * channel;
for (int thread_id = task_id; thread_id < out_tile_count; thread_id += pooling_param->thread_num_) {
int cal_start_index = thread_id * TILE_NUM;
int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? TILE_NUM : (out_plane - cal_start_index);
for (int i = 0; i < real_cal_num; i++) {
int index = cal_start_index + i;
int out_w_index = index % output_w;
int out_h_index = index / output_w;
int in_w_index = out_w_index * pooling_param->stride_w_ - pooling_param->pad_l_;
int in_h_index = out_h_index * pooling_param->stride_h_ - pooling_param->pad_u_;
for (int thread_id = task_id; thread_id < out_tile_count; thread_id += pooling_param->thread_num_) {
int cal_start_index = thread_id * TILE_NUM;
int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? TILE_NUM : (out_plane - cal_start_index);
for (int i = 0; i < real_cal_num; i++) {
int index = cal_start_index + i;
int out_w_index = index % output_w;
int out_h_index = index / output_w;
int in_w_index = out_w_index * pooling_param->stride_w_ - pooling_param->pad_l_;
int in_h_index = out_h_index * pooling_param->stride_h_ - pooling_param->pad_u_;

const float *src_plane_ptr = src_b_ptr;
float *dst_plane_ptr = dst_b_ptr + index * channel;
const float *src_plane_ptr = src_b_ptr;
float *dst_plane_ptr = dst_b_ptr + index * channel;

int real_win_h_start = MSMAX(0, -in_h_index);
int real_win_h_end = MSMIN(win_h, in_h - in_h_index);
int real_win_w_start = MSMAX(0, -in_w_index);
int real_win_w_end = MSMIN(win_w, in_w - in_w_index);
int ci = 0;
int real_win_h_start = MSMAX(0, -in_h_index);
int real_win_h_end = MSMIN(win_h, in_h - in_h_index);
int real_win_w_start = MSMAX(0, -in_w_index);
int real_win_w_end = MSMIN(win_w, in_w - in_w_index);
int ci = 0;
#ifdef ENABLE_AVX
for (; ci < c8; ci += C8NUM) {
const float *src_c_ptr = src_plane_ptr + ci;
float *dst_c_ptr = dst_plane_ptr + ci;
MS_FLOAT32X8 tmp_avg = MS_MOV256_F32(0);
int real_count = 0;
for (int h = real_win_h_start; h < real_win_h_end; h++) {
for (int w = real_win_w_start; w < real_win_w_end; w++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel;
tmp_avg = MS_ADD256_F32(tmp_avg, MS_LD256_F32(src_win_ptr));
++real_count;
} // win_w loop
} // win_h loop
if (real_count == 0) {
return NNACL_ERR;
}
tmp_avg = MS_DIV256_F32(tmp_avg, MS_MOV256_F32(real_count));
tmp_avg = MS_MAX256_F32(tmp_avg, min_value_8);
tmp_avg = MS_MIN256_F32(tmp_avg, max_value_8);
MS_ST256_F32(dst_c_ptr, tmp_avg);
} // ic8-1 loop
for (; ci < c8; ci += C8NUM) {
const float *src_c_ptr = src_plane_ptr + ci;
float *dst_c_ptr = dst_plane_ptr + ci;
MS_FLOAT32X8 tmp_avg = MS_MOV256_F32(0);
int real_count = 0;
for (int h = real_win_h_start; h < real_win_h_end; h++) {
for (int w = real_win_w_start; w < real_win_w_end; w++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel;
tmp_avg = MS_ADD256_F32(tmp_avg, MS_LD256_F32(src_win_ptr));
++real_count;
} // win_w loop
} // win_h loop
if (real_count == 0) {
return NNACL_ERR;
}
tmp_avg = MS_DIV256_F32(tmp_avg, MS_MOV256_F32(real_count));
tmp_avg = MS_MAX256_F32(tmp_avg, min_value_8);
tmp_avg = MS_MIN256_F32(tmp_avg, max_value_8);
MS_ST256_F32(dst_c_ptr, tmp_avg);
} // ic8-1 loop
#endif
#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
for (; ci < c4; ci += C4NUM) {
const float *src_c_ptr = src_plane_ptr + ci;
float *dst_c_ptr = dst_plane_ptr + ci;
MS_FLOAT32X4 tmp_avg = MS_MOVQ_F32(0);
int real_count = 0;
for (int h = real_win_h_start; h < real_win_h_end; h++) {
for (int w = real_win_w_start; w < real_win_w_end; w++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel;
tmp_avg = MS_ADDQ_F32(tmp_avg, MS_LDQ_F32(src_win_ptr));
++real_count;
} // win_w loop
} // win_h loop
if (real_count == 0) {
return NNACL_ERR;
}
tmp_avg = MS_DIVQ_F32(tmp_avg, MS_MOVQ_F32(real_count));
tmp_avg = MS_MAXQ_F32(tmp_avg, min_value);
tmp_avg = MS_MINQ_F32(tmp_avg, max_value);
MS_STQ_F32(dst_c_ptr, tmp_avg);
} // ic4-1 loop
for (; ci < c4; ci += C4NUM) {
const float *src_c_ptr = src_plane_ptr + ci;
float *dst_c_ptr = dst_plane_ptr + ci;
MS_FLOAT32X4 tmp_avg = MS_MOVQ_F32(0);
int real_count = 0;
for (int h = real_win_h_start; h < real_win_h_end; h++) {
for (int w = real_win_w_start; w < real_win_w_end; w++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel;
tmp_avg = MS_ADDQ_F32(tmp_avg, MS_LDQ_F32(src_win_ptr));
++real_count;
} // win_w loop
} // win_h loop
if (real_count == 0) {
return NNACL_ERR;
}
tmp_avg = MS_DIVQ_F32(tmp_avg, MS_MOVQ_F32(real_count));
tmp_avg = MS_MAXQ_F32(tmp_avg, min_value);
tmp_avg = MS_MINQ_F32(tmp_avg, max_value);
MS_STQ_F32(dst_c_ptr, tmp_avg);
} // ic4-1 loop
#endif
for (; ci < channel; ci++) {
const float *src_c_ptr = src_plane_ptr + ci;
float *dst_c_ptr = dst_plane_ptr + ci;
float tmp_avg = 0;
int real_count = 0;
for (int h = real_win_h_start; h < real_win_h_end; h++) {
for (int w = real_win_w_start; w < real_win_w_end; w++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel;
tmp_avg += src_win_ptr[0];
++real_count;
} // win_w loop
} // win_h loop
if (real_count == 0) {
return NNACL_ERR;
}
tmp_avg = tmp_avg / (float)real_count;
tmp_avg = fmaxf(tmp_avg, minf);
tmp_avg = fminf(tmp_avg, maxf);
dst_c_ptr[0] = tmp_avg;
} // channel_res loop
} // real_cal_num loop
} // out_plane loop
} // out_batch loop
for (; ci < channel; ci++) {
const float *src_c_ptr = src_plane_ptr + ci;
float *dst_c_ptr = dst_plane_ptr + ci;
float tmp_avg = 0;
int real_count = 0;
for (int h = real_win_h_start; h < real_win_h_end; h++) {
for (int w = real_win_w_start; w < real_win_w_end; w++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel;
tmp_avg += src_win_ptr[0];
++real_count;
} // win_w loop
} // win_h loop
if (real_count == 0) {
return NNACL_ERR;
}
tmp_avg = tmp_avg / (float)real_count;
tmp_avg = fmaxf(tmp_avg, minf);
tmp_avg = fminf(tmp_avg, maxf);
dst_c_ptr[0] = tmp_avg;
} // channel_res loop
} // real_cal_num loop
} // out_plane loop
return NNACL_OK;
}

int AvgPooling(const float *input_ptr, float *output_ptr, const PoolingParameter *pooling_param, int task_id,
float minf, float maxf) {
int in_w = pooling_param->input_w_;
int in_h = pooling_param->input_h_;
int output_w = pooling_param->output_w_;
int output_h = pooling_param->output_h_;
int channel = pooling_param->input_channel_;
int output_batch = pooling_param->output_batch_;

for (int batch = 0; batch < output_batch; batch++) {
const float *src_b_ptr = input_ptr + batch * in_h * in_w * channel;
float *dst_b_ptr = output_ptr + batch * output_h * output_w * channel;
int ret = AvgPoolingBatch(src_b_ptr, dst_b_ptr, pooling_param, task_id, minf, maxf);
if (ret != NNACL_OK) {
return ret;
}
}
return NNACL_OK;
}

int MaxPoolingBatch(const float *src_b_ptr, float *dst_b_ptr, const PoolingParameter *pooling_param, int task_id,
float minf, float maxf) {
int in_w = pooling_param->input_w_, in_h = pooling_param->input_h_;
int win_w = pooling_param->window_w_, win_h = pooling_param->window_h_;
int output_w = pooling_param->output_w_, output_h = pooling_param->output_h_;
int channel = pooling_param->input_channel_;
int out_plane = output_w * output_h;
int out_tile_count = UP_DIV(out_plane, TILE_NUM);
NNACL_CHECK_ZERO_RETURN_ERR(output_w);
#ifdef ENABLE_AVX
int c8 = channel / C8NUM * C8NUM;
MS_FLOAT32X8 min_value_8 = MS_MOV256_F32(minf);
MS_FLOAT32X8 max_value_8 = MS_MOV256_F32(maxf);
#endif
#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
int c4 = channel / C4NUM * C4NUM;
MS_FLOAT32X4 min_value = MS_MOVQ_F32(minf);
MS_FLOAT32X4 max_value = MS_MOVQ_F32(maxf);
#endif

for (int thread_id = task_id; thread_id < out_tile_count; thread_id += pooling_param->thread_num_) {
int cal_start_index = thread_id * TILE_NUM;
int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? TILE_NUM : (out_plane - cal_start_index);
for (int i = 0; i < real_cal_num; i++) {
int index = cal_start_index + i;
int out_w_index = index % output_w;
int out_h_index = index / output_w;
int in_w_index = out_w_index * pooling_param->stride_w_ - pooling_param->pad_l_;
int in_h_index = out_h_index * pooling_param->stride_h_ - pooling_param->pad_u_;

const float *src_plane_ptr = src_b_ptr;
float *dst_plane_ptr = dst_b_ptr + index * channel;

int real_win_h_start = MSMAX(0, -in_h_index);
int real_win_h_end = MSMIN(win_h, in_h - in_h_index);
int real_win_w_start = MSMAX(0, -in_w_index);
int real_win_w_end = MSMIN(win_w, in_w - in_w_index);
int ci = 0;
#ifdef ENABLE_AVX
for (; ci < c8; ci += C8NUM) {
const float *src_c_ptr = src_plane_ptr + ci;
float *dst_c_ptr = dst_plane_ptr + ci;
MS_FLOAT32X8 tmp_max = MS_MOV256_F32(-FLT_MAX);
for (int kh = real_win_h_start; kh < real_win_h_end; kh++) {
for (int kw = real_win_w_start; kw < real_win_w_end; kw++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel;
tmp_max = MS_MAX256_F32(tmp_max, MS_LD256_F32(src_win_ptr));
} // win_w loop
} // win_h loop
tmp_max = MS_MAX256_F32(tmp_max, min_value_8);
tmp_max = MS_MIN256_F32(tmp_max, max_value_8);
MS_ST256_F32(dst_c_ptr, tmp_max);
} // ic8 loop
#endif
#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
for (; ci < c4; ci += C4NUM) {
const float *src_c_ptr = src_plane_ptr + ci;
float *dst_c_ptr = dst_plane_ptr + ci;
MS_FLOAT32X4 tmp_max = MS_MOVQ_F32(-FLT_MAX);
for (int kh = real_win_h_start; kh < real_win_h_end; kh++) {
for (int kw = real_win_w_start; kw < real_win_w_end; kw++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel;
tmp_max = MS_MAXQ_F32(tmp_max, MS_LDQ_F32(src_win_ptr));
} // win_w loop
} // win_h loop
tmp_max = MS_MAXQ_F32(tmp_max, min_value);
tmp_max = MS_MINQ_F32(tmp_max, max_value);
MS_STQ_F32(dst_c_ptr, tmp_max);
} // ic4 loop
#endif
for (; ci < channel; ci++) {
float *dst_c_ptr = dst_plane_ptr + ci;
const float *src_c_ptr = src_plane_ptr + ci;
float tmp_max = -FLT_MAX;
for (int kh = real_win_h_start; kh < real_win_h_end; kh++) {
for (int kw = real_win_w_start; kw < real_win_w_end; kw++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel;
tmp_max = fmaxf(tmp_max, src_win_ptr[0]);
} // win_w loop
} // win_h loop
tmp_max = fmaxf(tmp_max, minf);
tmp_max = fminf(tmp_max, maxf);
dst_c_ptr[0] = tmp_max;
} // channel_res loop
} // real_cal_num loop
} // out_plane loop
return NNACL_OK;
}

int MaxPooling(const float *input_ptr, float *output_ptr, const PoolingParameter *pooling_param, int task_id,
float minf, float maxf) {
int win_w = pooling_param->window_w_;
int win_h = pooling_param->window_h_;
int channel = pooling_param->input_channel_;
int in_w = pooling_param->input_w_;
int in_h = pooling_param->input_h_;
int output_w = pooling_param->output_w_;
int output_h = pooling_param->output_h_;
int channel = pooling_param->input_channel_;
int output_batch = pooling_param->output_batch_;
int out_plane = output_w * output_h;
int out_tile_count = UP_DIV(out_plane, TILE_NUM);
NNACL_CHECK_ZERO_RETURN_ERR(output_w);
#ifdef ENABLE_AVX
int c8 = channel / C8NUM * C8NUM;
MS_FLOAT32X8 min_value_8 = MS_MOV256_F32(minf);
MS_FLOAT32X8 max_value_8 = MS_MOV256_F32(maxf);
#endif
#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
int c4 = channel / C4NUM * C4NUM;
MS_FLOAT32X4 min_value = MS_MOVQ_F32(minf);
MS_FLOAT32X4 max_value = MS_MOVQ_F32(maxf);
#endif

for (int batch = 0; batch < output_batch; batch++) {
const float *src_b_ptr = input_ptr + batch * in_h * in_w * channel;
float *dst_b_ptr = output_ptr + batch * output_h * output_w * channel;
for (int thread_id = task_id; thread_id < out_tile_count; thread_id += pooling_param->thread_num_) {
int cal_start_index = thread_id * TILE_NUM;
int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? TILE_NUM : (out_plane - cal_start_index);
for (int i = 0; i < real_cal_num; i++) {
int index = cal_start_index + i;
int out_w_index = index % output_w;
int out_h_index = index / output_w;
int in_w_index = out_w_index * pooling_param->stride_w_ - pooling_param->pad_l_;
int in_h_index = out_h_index * pooling_param->stride_h_ - pooling_param->pad_u_;

const float *src_plane_ptr = src_b_ptr;
float *dst_plane_ptr = dst_b_ptr + index * channel;

int real_win_h_start = MSMAX(0, -in_h_index);
int real_win_h_end = MSMIN(win_h, in_h - in_h_index);
int real_win_w_start = MSMAX(0, -in_w_index);
int real_win_w_end = MSMIN(win_w, in_w - in_w_index);
int ci = 0;
#ifdef ENABLE_AVX
for (; ci < c8; ci += C8NUM) {
const float *src_c_ptr = src_plane_ptr + ci;
float *dst_c_ptr = dst_plane_ptr + ci;
MS_FLOAT32X8 tmp_max = MS_MOV256_F32(-FLT_MAX);
for (int kh = real_win_h_start; kh < real_win_h_end; kh++) {
for (int kw = real_win_w_start; kw < real_win_w_end; kw++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel;
tmp_max = MS_MAX256_F32(tmp_max, MS_LD256_F32(src_win_ptr));
} // win_w loop
} // win_h loop
tmp_max = MS_MAX256_F32(tmp_max, min_value_8);
tmp_max = MS_MIN256_F32(tmp_max, max_value_8);
MS_ST256_F32(dst_c_ptr, tmp_max);
} // ic8 loop
#endif
#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
for (; ci < c4; ci += C4NUM) {
const float *src_c_ptr = src_plane_ptr + ci;
float *dst_c_ptr = dst_plane_ptr + ci;
MS_FLOAT32X4 tmp_max = MS_MOVQ_F32(-FLT_MAX);
for (int kh = real_win_h_start; kh < real_win_h_end; kh++) {
for (int kw = real_win_w_start; kw < real_win_w_end; kw++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel;
tmp_max = MS_MAXQ_F32(tmp_max, MS_LDQ_F32(src_win_ptr));
} // win_w loop
} // win_h loop
tmp_max = MS_MAXQ_F32(tmp_max, min_value);
tmp_max = MS_MINQ_F32(tmp_max, max_value);
MS_STQ_F32(dst_c_ptr, tmp_max);
} // ic4 loop
#endif
for (; ci < channel; ci++) {
float *dst_c_ptr = dst_plane_ptr + ci;
const float *src_c_ptr = src_plane_ptr + ci;
float tmp_max = -FLT_MAX;
for (int kh = real_win_h_start; kh < real_win_h_end; kh++) {
for (int kw = real_win_w_start; kw < real_win_w_end; kw++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel;
tmp_max = fmaxf(tmp_max, src_win_ptr[0]);
} // win_w loop
} // win_h loop
tmp_max = fmaxf(tmp_max, minf);
tmp_max = fminf(tmp_max, maxf);
dst_c_ptr[0] = tmp_max;
} // channel_res loop
} // real_cal_num loop
} // out_plane loop
} // out_batch loop
int ret = MaxPoolingBatch(src_b_ptr, dst_b_ptr, pooling_param, task_id, minf, maxf);
if (ret != NNACL_OK) {
return ret;
}
}
return NNACL_OK;
}

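Note on the pooling rewrite above: the batch loop moves out of the workers, so AvgPooling/MaxPooling shrink to a per-batch walk that delegates to AvgPoolingBatch/MaxPoolingBatch and propagates their error codes. A stripped-down sketch of that structure follows; the Param struct and function names are illustrative stand-ins, not the real nnacl signatures.

#include <cstddef>

// Illustrative parameter block; the real PoolingParameter carries much more.
struct Param {
  int batch, in_h, in_w, out_h, out_w, channel;
};

// Per-batch worker: all tiling and SIMD logic lives one nesting level lower.
static int PoolBatch(const float *src, float *dst, const Param &p, int task_id) {
  (void)src; (void)dst; (void)task_id;
  return p.out_w == 0 ? -1 : 0;  // NNACL-style error code on a bad shape
}

// Thin wrapper: compute per-batch base pointers, delegate, propagate errors.
int Pool(const float *in, float *out, const Param &p, int task_id) {
  for (int b = 0; b < p.batch; b++) {
    const float *src = in + static_cast<size_t>(b) * p.in_h * p.in_w * p.channel;
    float *dst = out + static_cast<size_t>(b) * p.out_h * p.out_w * p.channel;
    int ret = PoolBatch(src, dst, p, task_id);
    if (ret != 0) return ret;
  }
  return 0;
}
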
@@ -76,6 +76,7 @@
#define THIRD_INPUT 2
#define FOURTH_INPUT 3
#define FIFTH_INPUT 4
#define SIXTH_INPUT 5

#define DIMENSION_1D 1
#define DIMENSION_2D 2

@@ -165,6 +165,10 @@ int LiteSession::ConvertTensorsData(const lite::Model *model, size_t tensor_inde

auto ret = DecompressTensor(*src_tensor, dst_tensor);
if (ret == RET_NO_CHANGE) {
if (src_tensor->data()->size() < dst_tensor->Size()) {
MS_LOG(ERROR) << "Tensor data shape invalid";
return RET_ERROR;
}
dst_tensor->set_data(const_cast<unsigned char *>(src_tensor->data()->data()));
dst_tensor->set_own_data(false);
} else if (ret != RET_OK) {

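Note: the added guard above only shares the model buffer's data pointer zero-copy after confirming the source buffer can cover the destination tensor. A minimal sketch of the same check, with hypothetical Buffer/Tensor types standing in for the lite classes:

#include <cstddef>
#include <cstdint>

struct Buffer { const uint8_t *data; size_t size; };
struct Tensor {
  size_t byte_size;                // expected size derived from shape * dtype
  const uint8_t *data = nullptr;
  bool owns_data = true;
};

// Share src's storage with dst only when it is provably large enough.
bool ShareTensorData(const Buffer &src, Tensor *dst) {
  if (src.size < dst->byte_size) {
    return false;  // size mismatch: refuse the zero-copy path
  }
  dst->data = src.data;    // zero-copy: point at the model buffer
  dst->owns_data = false;  // the tensor must never free memory it doesn't own
  return true;
}
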
@@ -28,12 +28,32 @@ namespace mindspore::kernel {
namespace {
constexpr size_t kPadCommonInputSize = 2;
} // namespace
int PadFp16CPUKernel::RunImpl(int task_id) {
int PadFp16CPUKernel::RunImpl(int task_id) const {
PadFp16(input_, output_, in_, out_, pad_param_->paddings_, task_id, op_parameter_->thread_num_);
return RET_OK;
}

int PadFp16CPUKernel::RunMirrorPadImpl(int task_id) {
void PadFp16CPUKernel::RunMirrorPadImplFast(const MirrorPadBlock &block, const float16_t *input_data,
float16_t *output_data) const {
for (int a = 0; a < block.size_[0]; a++) {
int out_a_index = block.out_offset_ + a * block.out_stride_[0];
for (int b = 0; b < block.size_[1]; b++) {
int out_b_index = out_a_index + b * block.out_stride_[1];
for (int c = 0; c < block.size_[2]; ++c) {
int out_c_index = out_b_index + c * block.out_stride_[2];
for (int d = 0; d < block.size_[3]; ++d) {
int out_d_index = out_c_index + d * block.out_stride_[3];
for (int e = 0; e < block.size_[4]; ++e) {
int output_index = out_d_index + e * block.out_stride_[4];
MirrorPadFp16(input_data, output_data, in_, pad_param_, output_index, output_index + block.size_[5]);
}
}
}
}
}
}

int PadFp16CPUKernel::RunMirrorPadImpl(int task_id) const {
auto input = in_tensors_.at(0);
CHECK_NULL_RETURN(input);
auto output = out_tensors_.at(0);

@@ -51,23 +71,7 @@ int PadFp16CPUKernel::RunMirrorPadImpl(int task_id) {
/* calculate region part */
for (size_t i = task_id; i < mirror_pad_block_.size(); i += op_parameter_->thread_num_) {
auto block = mirror_pad_block_[i];

for (int a = 0; a < block.size_[0]; a++) {
int out_a_index = block.out_offset_ + a * block.out_stride_[0];
for (int b = 0; b < block.size_[1]; b++) {
int out_b_index = out_a_index + b * block.out_stride_[1];
for (int c = 0; c < block.size_[2]; ++c) {
int out_c_index = out_b_index + c * block.out_stride_[2];
for (int d = 0; d < block.size_[3]; ++d) {
int out_d_index = out_c_index + d * block.out_stride_[3];
for (int e = 0; e < block.size_[4]; ++e) {
int output_index = out_d_index + e * block.out_stride_[4];
MirrorPadFp16(input_data, output_data, in_, pad_param_, output_index, output_index + block.size_[5]);
}
}
}
}
}
RunMirrorPadImplFast(block, input_data, output_data);
}
return RET_OK;
}

@@ -30,8 +30,11 @@ class PadFp16CPUKernel : public PadCPUKernel {
~PadFp16CPUKernel() {}

int Run() override;
int RunImpl(int task_id) override;
int RunMirrorPadImpl(int task_id) override;
int RunImpl(int task_id) const override;
int RunMirrorPadImpl(int task_id) const override;

private:
void RunMirrorPadImplFast(const MirrorPadBlock &block, const float16_t *input_data, float16_t *output_data) const;

private:
float16_t *input_ = nullptr;

@@ -25,7 +25,7 @@ using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_PReLUFusion;

namespace mindspore::kernel {
int PReluFp16CPUKernel::DoExcute(int task_id) {
int PReluFp16CPUKernel::DoExcute(int task_id) const {
int thread_num = param_->op_parameter_.thread_num_;
if (thread_num == 0) {
MS_LOG(ERROR) << "thread_num is 0!";

@@ -27,7 +27,7 @@ class PReluFp16CPUKernel : public PReluCPUKernel {
: PReluCPUKernel(parameter, inputs, outputs, ctx) {}
~PReluFp16CPUKernel() = default;

int DoExcute(int task_id) override;
int DoExcute(int task_id) const override;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_PRELU_FP16_H_

@@ -102,7 +102,7 @@ void GatherNdCPUKernel::InitOffset() {
}
}

int GatherNdCPUKernel::DoGatherNd(int task_id) {
int GatherNdCPUKernel::DoGatherNd(int task_id) const {
int count = MSMIN(thread_sz_stride_, count_ - task_id * thread_sz_stride_);
if (count <= 0) {
return RET_OK;

@@ -116,8 +116,8 @@ int GatherNdCPUKernel::DoGatherNd(int task_id) {
return RET_OK;
}

int GatherNdRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto g_kernel = reinterpret_cast<GatherNdCPUKernel *>(cdata);
int GatherNdRun(const void *cdata, int task_id, float, float) {
auto g_kernel = reinterpret_cast<const GatherNdCPUKernel *>(cdata);
auto ret = g_kernel->DoGatherNd(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "GatherNdRun error task_id[" << task_id << "] error_code[" << ret << "]";

@@ -37,7 +37,7 @@ class GatherNdCPUKernel : public InnerKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
int DoGatherNd(int task_id);
int DoGatherNd(int task_id) const;

private:
void InitOffset();

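The recurring change across these kernels: the parallel-launch callback now takes const void * and the per-task worker is const-qualified, so a worker can no longer silently mutate kernel state from a thread. A reduced sketch of the pattern; Kernel and KernelRun here are stand-ins, not the actual lite runtime types:

#include <iostream>

// Stand-in for a CPU kernel whose per-task worker only reads shared state.
class Kernel {
 public:
  int DoWork(int task_id) const {  // const: safe to call from many threads
    std::cout << "task " << task_id << "\n";
    return 0;
  }
};

// Runner signature mirrors the commit: const void *cdata, and the unused
// lhs/rhs scale parameters are left unnamed.
int KernelRun(const void *cdata, int task_id, float, float) {
  auto kernel = reinterpret_cast<const Kernel *>(cdata);  // const cast target
  return kernel->DoWork(task_id);
}

int main() {
  Kernel k;
  for (int t = 0; t < 2; ++t) {
    if (KernelRun(&k, t, 0.0f, 0.0f) != 0) return 1;
  }
  return 0;
}
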
@@ -42,7 +42,7 @@ int GatherCPUKernel::Prepare() {

int GatherCPUKernel::ReSize() { return RET_OK; }

int GatherCPUKernel::DoGather(int task_id) {
int GatherCPUKernel::DoGather(int task_id) const {
auto input_tensor = in_tensors_.at(0);
auto indices_tensor = in_tensors_.at(1);
auto out_tensor = out_tensors_.at(0);

@@ -81,8 +81,8 @@ int GatherCPUKernel::DoGather(int task_id) {
return error_code;
}

int GatherRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto gather_kernel = reinterpret_cast<GatherCPUKernel *>(cdata);
int GatherRun(const void *cdata, int task_id, float, float) {
auto gather_kernel = reinterpret_cast<const GatherCPUKernel *>(cdata);
auto error_code = gather_kernel->DoGather(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "GatherRun error task_id[" << task_id << "] error_code[" << error_code << "]";

@@ -34,7 +34,7 @@ class GatherCPUKernel : public InnerKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
int DoGather(int task_id);
int DoGather(int task_id) const;

private:
int *indices_data_ = nullptr;

@@ -35,7 +35,7 @@ int GluCPUKernel::MallocTmpBuffer() {
FreeTmpBuffer();
auto in_tensor = in_tensors_.front();
for (int i = 0; i < kSplitNum; i++) {
split_ptr_[i] = reinterpret_cast<int8_t *>(ms_context_->allocator->Malloc(in_tensor->Size() / kSplitNum));
split_ptr_[i] = ms_context_->allocator->Malloc(in_tensor->Size() / kSplitNum);
if (split_ptr_[i] == nullptr) {
MS_LOG(ERROR) << "GluCPUKernel malloc split ptr failed.";
return RET_ERROR;

@@ -96,8 +96,7 @@ int GluCPUKernel::ReSize() {
return RET_OK;
}

int GluCPUKernel::Split(int task_id) {
input_ptr_ = in_tensors_.front()->data();
int GluCPUKernel::Split(int task_id) const {
MS_CHECK_INT_MUL_NOT_OVERFLOW(task_id, thread_n_stride_, RET_ERROR);
int num_unit_thread = MSMIN(thread_n_stride_, num_unit_ - task_id * thread_n_stride_);
if (num_unit_thread <= 0) {

@@ -105,8 +104,8 @@ int GluCPUKernel::Split(int task_id) {
}
int thread_offset = task_id * thread_n_stride_;
auto ret =
DoSplit(input_ptr_, reinterpret_cast<void **>(split_ptr_.data()), in_tensors_.front()->shape().data(),
thread_offset, num_unit_thread, &split_param_, lite::DataTypeSize(in_tensors_.front()->data_type()));
DoSplit(input_ptr_, const_cast<void **>(split_ptr_.data()), in_tensors_.front()->shape().data(), thread_offset,
num_unit_thread, &split_param_, lite::DataTypeSize(in_tensors_.front()->data_type()));
if (ret != RET_OK) {
MS_LOG(ERROR) << "Split error task_id[" << task_id << "] error_code[" << ret << "]";
return RET_ERROR;

@@ -114,7 +113,7 @@ int GluCPUKernel::Split(int task_id) {
return RET_OK;
}

int GluCPUKernel::Sigmoid(int task_id) {
int GluCPUKernel::Sigmoid(int task_id) const {
auto input_addr = reinterpret_cast<float *>(split_ptr_.at(1));
auto output_addr = reinterpret_cast<float *>(sigmoid_ptr_);
auto length = in_tensors_.at(0)->ElementsNum() / kGluBranchNum;

@@ -128,7 +127,7 @@ int GluCPUKernel::Sigmoid(int task_id) {
return ::Sigmoid(input_addr + stride * task_id, count, output_addr + stride * task_id);
}

int GluCPUKernel::Mul(int task_id) {
int GluCPUKernel::Mul(int task_id) const {
auto input_addr0 = reinterpret_cast<float *>(split_ptr_.at(0));
auto input_addr1 = reinterpret_cast<float *>(sigmoid_ptr_);
auto output_addr = reinterpret_cast<float *>(out_tensors_.at(0)->data());

@@ -144,22 +143,24 @@ int GluCPUKernel::Mul(int task_id) {
return ElementMul(input_addr0 + offset, input_addr1 + offset, output_addr + offset, count);
}

static int SplitRun(void *cdata, int task_id, float, float) {
auto g_kernel = reinterpret_cast<GluCPUKernel *>(cdata);
static int SplitRun(const void *cdata, int task_id, float, float) {
auto g_kernel = reinterpret_cast<const GluCPUKernel *>(cdata);
return g_kernel->Split(task_id);
}

static int SigmoidRun(void *cdata, int task_id, float, float) {
auto activation_kernel = reinterpret_cast<GluCPUKernel *>(cdata);
static int SigmoidRun(const void *cdata, int task_id, float, float) {
auto activation_kernel = reinterpret_cast<const GluCPUKernel *>(cdata);
return activation_kernel->Sigmoid(task_id);
}

static int MulRun(void *cdata, int task_id, float, float) {
auto g_kernel = reinterpret_cast<GluCPUKernel *>(cdata);
static int MulRun(const void *cdata, int task_id, float, float) {
auto g_kernel = reinterpret_cast<const GluCPUKernel *>(cdata);
return g_kernel->Mul(task_id);
}

int GluCPUKernel::Run() {
input_ptr_ = in_tensors_.front()->data();

auto ret = MallocTmpBuffer();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Malloc tmp buffer failed";

@@ -43,9 +43,11 @@ class GluCPUKernel : public InnerKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
int Split(int task_id);
int Sigmoid(int task_id);
int Mul(int task_id);
int Split(int task_id) const;
int Sigmoid(int task_id) const;
int Mul(int task_id) const;

private:
void FreeTmpBuffer();
int MallocTmpBuffer();

@@ -54,8 +56,8 @@ class GluCPUKernel : public InnerKernel {
GluParameter *glu_param_ = nullptr;
void *input_ptr_ = nullptr;
int8_t *sigmoid_ptr_ = nullptr;
std::vector<int8_t *> split_ptr_;
int split_sizes_[kSplitNum];
std::vector<void *> split_ptr_;
int split_sizes_[kSplitNum] = {0};
int thread_n_stride_ = 0;
int usable_thread_num_ = 0;
int num_unit_ = 0;

@@ -50,7 +50,7 @@ class GruCPUKernel : public InnerKernel {
const int weight_r_index = 2;
const int bias_index = 3;

float *buffer_[4];
float *buffer_[4] = {nullptr};
const int gate_num = 3;
const int packed_input_index = 0;
const int input_gate_index = 1;

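Several headers in this commit (GruCPUKernel above, and LstmCPUKernel and GluCPUKernel elsewhere) switch raw member arrays to in-class initializers, so the kernels never read indeterminate pointers before Prepare() runs. A tiny sketch of the idea with an illustrative Kernel class; note that initializing the first element of an aggregate value-initializes the rest:

#include <cassert>

// In-class initializers guarantee a known state even before any setup call.
class Kernel {
 public:
  bool Ready() const { return buffer_[0] != nullptr && sizes_[0] > 0; }

 private:
  float *buffer_[4] = {nullptr};  // all four slots become nullptr
  int sizes_[4] = {0};            // all four counts become 0
};

int main() {
  Kernel k;            // no constructor body needed
  assert(!k.Ready());  // safe to query before any allocation
  return 0;
}
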
@@ -45,7 +45,7 @@ int InstanceNormCPUKernel::ReSize() {
return RET_OK;
}

int InstanceNormCPUKernel::DoInstanceNorm(int task_id) {
int InstanceNormCPUKernel::DoInstanceNorm(int task_id) const {
int ret = 0;
if (in_tensors_[0]->format() == NC4HW4) { // arm64 x86-avx x86-sse x86
#ifdef ENABLE_AVX

@@ -63,8 +63,8 @@ int InstanceNormCPUKernel::DoInstanceNorm(int task_id) {
return RET_OK;
}

int InstanceNormRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto kernel = reinterpret_cast<InstanceNormCPUKernel *>(cdata);
int InstanceNormRun(const void *cdata, int task_id, float, float) {
auto kernel = reinterpret_cast<const InstanceNormCPUKernel *>(cdata);
auto ret = kernel->DoInstanceNorm(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "InstanceNormRun error task_id[" << task_id << "] error_code[" << ret << "]";

@@ -35,7 +35,9 @@ class InstanceNormCPUKernel : public InnerKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
int DoInstanceNorm(int task_id);
int DoInstanceNorm(int task_id) const;

private:
void FreeTmpBuffer() {
if (tmp_src_data_ != nullptr) {
ms_context_->allocator->Free(tmp_src_data_);

@@ -89,7 +89,7 @@ int L2NormCPUKernel::ReSize() {
return RET_OK;
}

int L2NormCPUKernel::CalcSquareSum(int task_id) {
int L2NormCPUKernel::CalcSquareSum(int task_id) const {
int unit = UP_DIV(l2_norm_param_->data_num_, op_parameter_->thread_num_);
if (INT_MUL_OVERFLOW(task_id, unit)) {
MS_LOG(ERROR) << "int mul overflow.";

@@ -100,7 +100,7 @@ int L2NormCPUKernel::CalcSquareSum(int task_id) {
return CalcThreadSquareSum(input_ptr_, tmp_sum_ + task_id, begin, end);
}

int L2NormCPUKernel::DivSqrtSum(int task_id) {
int L2NormCPUKernel::DivSqrtSum(int task_id) const {
int unit = UP_DIV(l2_norm_param_->data_num_, op_parameter_->thread_num_);
if (INT_MUL_OVERFLOW(task_id, unit)) {
MS_LOG(ERROR) << "int mul overflow.";

@@ -111,7 +111,7 @@ int L2NormCPUKernel::DivSqrtSum(int task_id) {
return ThreadDivSqrtSum(input_ptr_, output_ptr_, l2_norm_param_, sqrt_sum_, begin, end);
}

int L2NormCPUKernel::CalcL2NormTrailingAxis(int task_id) {
int L2NormCPUKernel::CalcL2NormTrailingAxis(int task_id) const {
auto input = in_tensors_.at(0);
if (input->shape().back() == 0) {
MS_LOG(ERROR) << "input->shape().back() is 0";

@@ -128,8 +128,8 @@ int L2NormCPUKernel::CalcL2NormTrailingAxis(int task_id) {
return ThreadTrailingAxis(input_ptr_, output_ptr_, l2_norm_param_, begin, end);
}

int SquareSumRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto kernel = reinterpret_cast<L2NormCPUKernel *>(cdata);
int SquareSumRun(const void *cdata, int task_id, float, float) {
auto kernel = reinterpret_cast<const L2NormCPUKernel *>(cdata);
auto ret = kernel->CalcSquareSum(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "L2Norm SquareSumRun error task_id[" << task_id << "] error_code[" << ret << "]";

@@ -138,9 +138,9 @@ int SquareSumRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
return RET_OK;
}

int L2NormRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
int L2NormRun(const void *cdata, int task_id, float, float) {
CHECK_NULL_RETURN(cdata);
auto kernel = reinterpret_cast<L2NormCPUKernel *>(cdata);
auto kernel = reinterpret_cast<const L2NormCPUKernel *>(cdata);
auto ret = kernel->DivSqrtSum(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "L2Norm L2NormRun error task_id[" << task_id << "] error_code[" << ret << "]";

@@ -149,9 +149,9 @@ int L2NormRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
return RET_OK;
}

int L2NormTrailingAxisRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
int L2NormTrailingAxisRun(const void *cdata, int task_id, float, float) {
CHECK_NULL_RETURN(cdata);
auto kernel = reinterpret_cast<L2NormCPUKernel *>(cdata);
auto kernel = reinterpret_cast<const L2NormCPUKernel *>(cdata);
auto ret = kernel->CalcL2NormTrailingAxis(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "L2Norm TrailingAxisRun error task_id[" << task_id << "] error_code[" << ret << "]";

@@ -36,9 +36,9 @@ class L2NormCPUKernel : public InnerKernel {
}
~L2NormCPUKernel() { FreeTmpBuffer(); }

int CalcSquareSum(int task_id);
int DivSqrtSum(int task_id);
int CalcL2NormTrailingAxis(int task_id);
int CalcSquareSum(int task_id) const;
int DivSqrtSum(int task_id) const;
int CalcL2NormTrailingAxis(int task_id) const;

int Prepare() override;
int ReSize() override;

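The L2Norm workers above all slice the flat element range per thread with the same UP_DIV arithmetic, clamping the final slice. A worked sketch of that partitioning; UP_DIV is reimplemented here so the example is self-contained:

#include <algorithm>
#include <cstdio>

// Ceiling division, as nnacl's UP_DIV macro computes it.
static int UpDiv(int a, int b) { return (a + b - 1) / b; }

int main() {
  const int data_num = 10;
  const int thread_num = 4;
  const int unit = UpDiv(data_num, thread_num);  // 3 elements per task
  for (int task_id = 0; task_id < thread_num; ++task_id) {
    int begin = task_id * unit;
    int end = std::min(begin + unit, data_num);  // last task may be short
    if (begin >= end) continue;                  // task has no work at all
    std::printf("task %d: [%d, %d)\n", task_id, begin, end);
  }
  return 0;
}
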
@@ -65,7 +65,7 @@ int LayerNormCPUKernel::ReSize() {
return RET_OK;
}

int LayerNormCPUKernel::DoLayerNorm(int thread_id) {
int LayerNormCPUKernel::DoLayerNorm(int thread_id) const {
auto ret = LayerNorm(src_data_, gamma_data_, beta_data_, dst_data_, mean_data_, var_data_, param_, thread_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "DoLayerNorm error error_code[" << ret << "]";

@@ -74,8 +74,8 @@ int LayerNormCPUKernel::DoLayerNorm(int thread_id) {
return RET_OK;
}

int LayerNormRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto kernel = reinterpret_cast<LayerNormCPUKernel *>(cdata);
int LayerNormRun(const void *cdata, int task_id, float, float) {
auto kernel = reinterpret_cast<const LayerNormCPUKernel *>(cdata);
CHECK_NULL_RETURN(kernel);
auto ret = kernel->DoLayerNorm(task_id);
if (ret != RET_OK) {

@@ -35,7 +35,7 @@ class LayerNormCPUKernel : public InnerKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
int DoLayerNorm(int thread_id);
int DoLayerNorm(int thread_id) const;

private:
LayerNormParameter *param_ = nullptr;

@@ -35,7 +35,7 @@ int LocalResponseNormCPUKernel::Prepare() {

int LocalResponseNormCPUKernel::ReSize() { return RET_OK; }

int LocalResponseNormCPUKernel::DoLocalResponseNorm(int task_id) {
int LocalResponseNormCPUKernel::DoLocalResponseNorm(int task_id) const {
auto input_tensor = in_tensors_.front();
auto out_tensor = out_tensors_.front();
auto input_ptr = reinterpret_cast<float *>(input_tensor->MutableData());

@@ -67,8 +67,8 @@ int LocalResponseNormCPUKernel::DoLocalResponseNorm(int task_id) {
return RET_OK;
}

int LocalResponseNormRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto lrn = reinterpret_cast<LocalResponseNormCPUKernel *>(cdata);
int LocalResponseNormRun(const void *cdata, int task_id, float, float) {
auto lrn = reinterpret_cast<const LocalResponseNormCPUKernel *>(cdata);
auto error_code = lrn->DoLocalResponseNorm(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "LocalResponseNormRun error task_id[" << task_id << "] error_code[" << error_code << "]";

@@ -32,7 +32,7 @@ class LocalResponseNormCPUKernel : public InnerKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
int DoLocalResponseNorm(int task_id);
int DoLocalResponseNorm(int task_id) const;

private:
int thread_count_;

@@ -79,7 +79,7 @@ int LogSoftmaxCPUKernel::ReSize() {
return RET_OK;
}

int LogSoftmaxCPUKernel::DoLogSoftmaxLastAxis(int task_id) {
int LogSoftmaxCPUKernel::DoLogSoftmaxLastAxis(int task_id) const {
MS_CHECK_FALSE(op_parameter_->thread_num_ == 0, RET_ERROR);
int unit = UP_DIV(out_plane_size_, op_parameter_->thread_num_);
int begin = task_id * unit;

@@ -94,8 +94,8 @@ int LogSoftmaxCPUKernel::DoLogSoftmaxLastAxis(int task_id) {
return RET_OK;
}

int LogSoftmaxLastAxisRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto kernel = reinterpret_cast<LogSoftmaxCPUKernel *>(cdata);
int LogSoftmaxLastAxisRun(const void *cdata, int task_id, float, float) {
auto kernel = reinterpret_cast<const LogSoftmaxCPUKernel *>(cdata);
CHECK_NULL_RETURN(kernel);
auto ret = kernel->DoLogSoftmaxLastAxis(task_id);
if (ret != RET_OK) {

@@ -32,7 +32,7 @@ class LogSoftmaxCPUKernel : public SoftmaxBaseCPUKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
int DoLogSoftmaxLastAxis(int task_id);
int DoLogSoftmaxLastAxis(int task_id) const;

private:
float *tmp_data_ = nullptr;

@@ -322,7 +322,7 @@ int LstmCPUKernel::MallocRunBuffer() {
return RET_OK;
}

void LstmCPUKernel::InputWeightMatMul(int task_id) {
void LstmCPUKernel::InputWeightMatMul(int task_id) const {
int current_start_oc = task_id * input_thread_stride_ * col_tile_;
int current_rest_oc = 0;
current_rest_oc = lstm_param_->hidden_size_ - current_start_oc;

@@ -339,8 +339,8 @@ void LstmCPUKernel::InputWeightMatMul(int task_id) {
cur_oc, lstm_param_->hidden_size_, OutType_Nhwc);
}

int LstmInputMulWeightRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto kernel = reinterpret_cast<LstmCPUKernel *>(cdata);
int LstmInputMulWeightRun(const void *cdata, int task_id, float, float) {
auto kernel = reinterpret_cast<const LstmCPUKernel *>(cdata);
CHECK_NULL_RETURN(kernel);
kernel->InputWeightMatMul(task_id);
return RET_OK;

@@ -36,7 +36,7 @@ class LstmCPUKernel : public InnerKernel {
int ReSize() override;
int Run() override;

void InputWeightMatMul(int task_id);
void InputWeightMatMul(int task_id) const;

private:
void FreeTmpBuffer();

@@ -50,9 +50,9 @@ class LstmCPUKernel : public InnerKernel {
const float *state_bias, float *hidden_state, float *cell_state, bool is_backward);
int InnerExecute(float *output, const float *input, float *hidden_state, float *cell_state);
void RecordStates(const float *cell_state, int step);
const float *weight_loop_;
const float *bias_loop_;
float *gate_loop_;
const float *weight_loop_ = nullptr;
const float *bias_loop_ = nullptr;
float *gate_loop_ = nullptr;
int input_thread_count_ = 0;
int input_thread_stride_ = 0;

@@ -64,7 +64,7 @@ class LstmCPUKernel : public InnerKernel {
const int weight_h_index = 2;
const int bias_index = 3;

float *buffer_[7];
float *buffer_[7] = {nullptr};
const int gate_num = 4;
const int packed_input_index = 0;
const int input_gate_index = 1;

@@ -21,9 +21,9 @@
using mindspore::lite::RET_NULL_PTR;

namespace mindspore::kernel {
int MatmulBaseFloatRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
int MatmulBaseFloatRun(const void *cdata, int task_id, float, float) {
CHECK_NULL_RETURN(cdata);
auto op = reinterpret_cast<MatmulFp32BaseCPUKernel *>(cdata);
auto op = reinterpret_cast<const MatmulFp32BaseCPUKernel *>(cdata);
auto error_code = op->FloatRun(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "MatmulFp32Run error task_id[" << task_id << "] error_code[" << error_code << "]";

@@ -126,32 +126,44 @@ int MatmulFp32BaseCPUKernel::CalBroadCastBiasDataElements() {
}

int MatmulFp32BaseCPUKernel::InitBiasData() {
if (in_tensors_.size() == 3) {
auto bias_tensor = in_tensors_[2];
size_t max_bias_data = UP_ROUND(bias_tensor->ElementsNum(), col_tile_);
// malloc addr need to aligned to 32 bytes
if (in_tensors_.size() != FOURTH_INPUT) {
return RET_OK;
}
auto bias_tensor = in_tensors_[THIRD_INPUT];
if (bias_tensor == nullptr) {
MS_LOG(ERROR) << "bias_tensor invalid";
return RET_ERROR;
}

if (bias_tensor->ElementsNum() == 1) {
// broadcast bias data
size_t max_bias_data = CalBroadCastBiasDataElements();
bias_ptr_ = reinterpret_cast<float *>(malloc(max_bias_data * static_cast<int>(sizeof(float))));
if (bias_ptr_ == nullptr) {
MS_LOG(ERROR) << "malloc bias_ptr_ failed";
return RET_ERROR;
}
// whether to broadcast bias data
if (bias_tensor->ElementsNum() == 1) {
max_bias_data = CalBroadCastBiasDataElements();
float broadcast_data = (reinterpret_cast<float *>(bias_tensor->data()))[0];
// broadcast bias data
for (size_t i = 0; i < max_bias_data; ++i) {
bias_ptr_[i] = broadcast_data;
}
} else {
memset(bias_ptr_, 0, max_bias_data * static_cast<int>(sizeof(float)));
memcpy(bias_ptr_, bias_tensor->data(), bias_tensor->ElementsNum() * static_cast<int>(sizeof(float)));
float broadcast_data = (reinterpret_cast<float *>(bias_tensor->data()))[0];
// broadcast bias data
for (size_t i = 0; i < max_bias_data; ++i) {
bias_ptr_[i] = broadcast_data;
}
return RET_OK;
}

size_t max_bias_data = UP_ROUND(bias_tensor->ElementsNum(), col_tile_);
// malloc addr need to aligned to 32 bytes
bias_ptr_ = reinterpret_cast<float *>(malloc(max_bias_data * static_cast<int>(sizeof(float))));
if (bias_ptr_ == nullptr) {
MS_LOG(ERROR) << "malloc bias_ptr_ failed";
return RET_ERROR;
}
memset(bias_ptr_, 0, max_bias_data * static_cast<int>(sizeof(float)));
memcpy(bias_ptr_, bias_tensor->data(), bias_tensor->ElementsNum() * static_cast<int>(sizeof(float)));
return RET_OK;
}

int MatmulFp32BaseCPUKernel::InitMatrixA(const float *src_ptr) {
int MatmulFp32BaseCPUKernel::InitMatrixA(const float *src_ptr) const {
CHECK_NULL_RETURN(src_ptr);
#ifdef ENABLE_ARM64
if (vec_matmul_) {

@@ -175,7 +187,7 @@ int MatmulFp32BaseCPUKernel::InitMatrixA(const float *src_ptr) {
return RET_OK;
}

int MatmulFp32BaseCPUKernel::InitMatrixB(const float *src_ptr) {
int MatmulFp32BaseCPUKernel::InitMatrixB(const float *src_ptr) const {
CHECK_NULL_RETURN(src_ptr);
for (int i = 0; i < params_->batch; i++) {
const float *src = src_ptr + i * params_->deep_ * params_->col_;

@@ -47,8 +47,8 @@ class MatmulFp32BaseCPUKernel : public InnerKernel {
protected:
int InitBufferA();
int InitBufferB();
int InitMatrixA(const float *src_ptr);
int InitMatrixB(const float *src_ptr);
int InitMatrixA(const float *src_ptr) const;
int InitMatrixB(const float *src_ptr) const;
void FreeBiasBuf();
int InitBiasData();
void InitParameter();

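The reworked InitBiasData above handles two cases: a scalar bias broadcast across the broadcast element count, and a vector bias copied into a zeroed, tile-rounded buffer. A standalone sketch of that logic; UP_ROUND is reimplemented and the function shape is illustrative, not the kernel's actual interface:

#include <algorithm>
#include <cstring>
#include <vector>

// Round n up to a multiple of tile, as nnacl's UP_ROUND does.
static size_t UpRound(size_t n, size_t tile) { return (n + tile - 1) / tile * tile; }

// Returns a buffer of UpRound(col, col_tile) floats holding the prepared bias.
std::vector<float> InitBias(const std::vector<float> &bias, size_t col, size_t col_tile) {
  size_t padded = UpRound(col, col_tile);
  std::vector<float> out(padded, 0.0f);          // zero the tail padding up front
  if (bias.size() == 1) {
    std::fill(out.begin(), out.end(), bias[0]);  // scalar: broadcast everywhere
  } else {
    std::memcpy(out.data(), bias.data(), bias.size() * sizeof(float));
  }
  return out;
}
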
@@ -110,7 +110,7 @@ void ExpandDims(std::vector<int> *shape, size_t size) {
}

int NonMaxSuppressionCPUKernel::Run_Selecte(bool simple_out, int box_num, int batch_num, int class_num,
float *scores_data, float *box_data) {
const float *scores_data, const float *box_data) {
std::vector<NMSBox> selected_box_per_class;
selected_box_per_class.reserve(std::min(static_cast<int32_t>(box_num), max_output_per_class_));
std::vector<NMSIndex> selected_index;

@@ -119,8 +119,8 @@ int NonMaxSuppressionCPUKernel::Run_Selecte(bool simple_out, int box_num, int ba
int batch_offset = i * class_num * box_num;
for (auto j = 0; j < class_num; ++j) {
// per batch per class filter
float *per_class_scores = scores_data + batch_offset + j * box_num;
float *box = box_data + i * box_num * kBoxPointNum;
const float *per_class_scores = scores_data + batch_offset + j * box_num;
const float *box = box_data + i * box_num * kBoxPointNum;
std::vector<NMSBox> above_score_candidates;
above_score_candidates.reserve(box_num);
for (auto k = 0; k < box_num; ++k) {

@@ -41,7 +41,8 @@ class NonMaxSuppressionCPUKernel : public InnerKernel {

private:
int GetParams();
int Run_Selecte(bool simple_out, int box_num, int batch_num, int class_num, float *scores_data, float *box_data);
int Run_Selecte(bool simple_out, int box_num, int batch_num, int class_num, const float *scores_data,
const float *box_data);

private:
int center_point_box_ = 0;

@@ -206,8 +206,8 @@ int PadCPUKernel::ExtendPaddings(int *paddings, int length, const int *ori_paddi
return RET_OK;
}

int PadImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto padKernel = reinterpret_cast<PadCPUKernel *>(cdata);
int PadImpl(const void *cdata, int task_id, float, float) {
auto padKernel = reinterpret_cast<const PadCPUKernel *>(cdata);
int error_code = padKernel->RunImpl(task_id);
if (error_code != NNACL_OK) {
MS_LOG(ERROR) << "Pad Run error task_id[" << task_id << "] error_code[" << error_code << "]";

@@ -216,7 +216,7 @@ int PadImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
return RET_OK;
}

int PadCPUKernel::RunImpl(int task_id) {
int PadCPUKernel::RunImpl(int task_id) const {
auto input = in_tensors_.at(0);
auto output = out_tensors_.at(0);
auto input_data = reinterpret_cast<float *>(input->data());

@@ -228,8 +228,8 @@ int PadCPUKernel::RunImpl(int task_id) {
return RET_OK;
}

int MirrorPadImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto padKernel = reinterpret_cast<PadCPUKernel *>(cdata);
int MirrorPadImpl(const void *cdata, int task_id, float, float) {
auto padKernel = reinterpret_cast<const PadCPUKernel *>(cdata);
int error_code = padKernel->RunMirrorPadImpl(task_id);
if (error_code != NNACL_OK) {
MS_LOG(ERROR) << "Pad Run error task_id[" << task_id << "] error_code[" << error_code << "]";

@@ -238,7 +238,27 @@ int MirrorPadImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
return RET_OK;
}

int PadCPUKernel::RunMirrorPadImpl(int task_id) {
void PadCPUKernel::RunMirrorPadImplFast(const MirrorPadBlock &block, const float *input_data,
float *output_data) const {
for (int a = 0; a < block.size_[FIRST_INPUT]; a++) {
int out_a_index = block.out_offset_ + a * block.out_stride_[FIRST_INPUT];
for (int b = 0; b < block.size_[SECOND_INPUT]; b++) {
int out_b_index = out_a_index + b * block.out_stride_[SECOND_INPUT];
for (int c = 0; c < block.size_[THIRD_INPUT]; ++c) {
int out_c_index = out_b_index + c * block.out_stride_[THIRD_INPUT];
for (int d = 0; d < block.size_[FOURTH_INPUT]; ++d) {
int out_d_index = out_c_index + d * block.out_stride_[FOURTH_INPUT];
for (int e = 0; e < block.size_[FIFTH_INPUT]; ++e) {
int output_index = out_d_index + e * block.out_stride_[FIFTH_INPUT];
MirrorPad(input_data, output_data, in_, pad_param_, output_index, output_index + block.size_[SIXTH_INPUT]);
}
}
}
}
}
}

int PadCPUKernel::RunMirrorPadImpl(int task_id) const {
auto input = in_tensors_.at(0);
auto output = out_tensors_.at(0);
auto input_data = reinterpret_cast<float *>(input->data());

@@ -253,23 +273,7 @@ int PadCPUKernel::RunMirrorPadImpl(int task_id) {
/* calculate region part */
for (size_t i = task_id; i < mirror_pad_block_.size(); i += static_cast<size_t>(op_parameter_->thread_num_)) {
auto block = mirror_pad_block_[i];

for (int a = 0; a < block.size_[0]; a++) {
int out_a_index = block.out_offset_ + a * block.out_stride_[0];
for (int b = 0; b < block.size_[1]; b++) {
int out_b_index = out_a_index + b * block.out_stride_[1];
for (int c = 0; c < block.size_[2]; ++c) {
int out_c_index = out_b_index + c * block.out_stride_[2];
for (int d = 0; d < block.size_[3]; ++d) {
int out_d_index = out_c_index + d * block.out_stride_[3];
for (int e = 0; e < block.size_[4]; ++e) {
int output_index = out_d_index + e * block.out_stride_[4];
MirrorPad(input_data, output_data, in_, pad_param_, output_index, output_index + block.size_[5]);
}
}
}
}
}
RunMirrorPadImplFast(block, input_data, output_data);
}
return RET_OK;
}

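Both the fp32 and fp16 pad kernels apply the same refactor: the five-deep block walk moves into a RunMirrorPadImplFast helper so the threaded loop body stays flat. A reduced sketch of the shape of that extraction; Block and the two-level walk are illustrative stand-ins for MirrorPadBlock's five dimensions:

#include <cstdio>
#include <vector>

// Illustrative stand-in for MirrorPadBlock: per-dimension extents and strides.
struct Block {
  int out_offset_ = 0;
  int size_[2] = {2, 3};
  int out_stride_[2] = {3, 1};
};

// Extracted helper: walks one block and "pads" each output index.
static void RunBlockFast(const Block &block) {
  for (int a = 0; a < block.size_[0]; a++) {
    int out_a = block.out_offset_ + a * block.out_stride_[0];
    for (int b = 0; b < block.size_[1]; b++) {
      std::printf("pad index %d\n", out_a + b * block.out_stride_[1]);
    }
  }
}

int main() {
  std::vector<Block> blocks(2);
  // The caller's loop stays flat: iterate blocks, delegate the nest.
  for (const auto &block : blocks) RunBlockFast(block);
  return 0;
}
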
@@ -41,8 +41,8 @@ class PadCPUKernel : public InnerKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
virtual int RunImpl(int task_id);
virtual int RunMirrorPadImpl(int task_id);
virtual int RunImpl(int task_id) const;
virtual int RunMirrorPadImpl(int task_id) const;

private:
int CheckPaddings(const int *paddings, int length, const int *input_shape, int mode);

@@ -50,6 +50,7 @@ class PadCPUKernel : public InnerKernel {
int ExtendShape(int *shape, int length, const int *ori_shape, int rank) const;
int ExtendPaddings(int *paddings, int length, const int *ori_paddings, int ori_length) const;
void InitMirrorPadBlock();
void RunMirrorPadImplFast(const MirrorPadBlock &block, const float *input_data, float *output_data) const;

protected:
int HandleMirrorPad();

@@ -60,8 +61,8 @@ class PadCPUKernel : public InnerKernel {
std::vector<MirrorPadBlock> mirror_pad_block_;
};

int PadImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale);
int MirrorPadImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale);
int PadImpl(const void *cdata, int task_id, float, float);
int MirrorPadImpl(const void *cdata, int task_id, float, float);
} // namespace mindspore::kernel

#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_PAD_H_

@@ -50,7 +50,7 @@ int PoolingCPUKernel::ReSize() {
return RET_OK;
}

int PoolingCPUKernel::RunImpl(int task_id) {
int PoolingCPUKernel::RunImpl(int task_id) const {
auto input_ptr = reinterpret_cast<float *>(in_tensors_.at(kInputIndex)->MutableData());
CHECK_NULL_RETURN(input_ptr);
auto output_ptr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->MutableData());

@@ -76,8 +76,8 @@ int PoolingCPUKernel::RunImpl(int task_id) {
return RET_OK;
}

int PoolingImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto pooling = reinterpret_cast<PoolingCPUKernel *>(cdata);
int PoolingImpl(const void *cdata, int task_id, float, float) {
auto pooling = reinterpret_cast<const PoolingCPUKernel *>(cdata);
auto error_code = pooling->RunImpl(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "Pooling Run error task_id[" << task_id << "] error_code[" << error_code << "]";

@@ -32,7 +32,7 @@ class PoolingCPUKernel : public PoolingBaseCPUKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
int RunImpl(int task_id);
int RunImpl(int task_id) const;

private:
};

@@ -33,9 +33,8 @@ int PowerCPUKernel::Prepare() {

int PowerCPUKernel::ReSize() { return RET_OK; }

int PowerImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
CHECK_NULL_RETURN(cdata);
auto kernel = reinterpret_cast<PowerCPUKernel *>(cdata);
int PowerImpl(const void *cdata, int task_id, float, float) {
auto kernel = reinterpret_cast<const PowerCPUKernel *>(cdata);
CHECK_NULL_RETURN(kernel);
auto ret = kernel->RunImpl(task_id);
if (ret != RET_OK) {

@@ -54,7 +53,7 @@ int PowerCPUKernel::Run() {
return RET_OK;
}

int PowerCPUKernel::RunImpl(int task_id) {
int PowerCPUKernel::RunImpl(int task_id) const {
auto x_addr = reinterpret_cast<float *>(in_tensors_.at(0)->MutableData());
CHECK_NULL_RETURN(x_addr);
auto output_addr = reinterpret_cast<float *>(out_tensors_.at(0)->MutableData());

@@ -36,7 +36,7 @@ class PowerCPUKernel : public InnerKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
int RunImpl(int task_id);
int RunImpl(int task_id) const;

private:
int thread_count_;

@@ -27,8 +27,8 @@ using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_PReLUFusion;

namespace mindspore::kernel {
static int PReluRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto PRelu = reinterpret_cast<PReluCPUKernel *>(cdata);
static int PReluRun(const void *cdata, int task_id, float, float) {
auto PRelu = reinterpret_cast<const PReluCPUKernel *>(cdata);
auto ret = PRelu->DoExcute(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "PReluRun error task_id[" << task_id << "] error_code[" << ret << "]";

@@ -55,7 +55,7 @@ int PReluCPUKernel::Prepare() {
return ReSize();
}

int PReluCPUKernel::DoExcute(int task_id) {
int PReluCPUKernel::DoExcute(int task_id) const {
int thread_num = param_->op_parameter_.thread_num_;
if (thread_num == 0) {
MS_LOG(ERROR) << "thread_num is 0!";

@@ -34,7 +34,7 @@ class PReluCPUKernel : public InnerKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
virtual int DoExcute(int task_id);
virtual int DoExcute(int task_id) const;

protected:
PReluParameter *param_;

@@ -84,7 +84,7 @@ void Nc4hw4PassReplace(std::vector<kernel::LiteKernel *> *kernels, std::vector<T
return;
}

bool Nc4hw4PassMatch(std::vector<kernel::LiteKernel *> *kernels, size_t index) {
bool Nc4hw4PassMatch(const std::vector<kernel::LiteKernel *> *kernels, size_t index) {
kernel::LiteKernel *start_kernel = kernels->at(index);
if (IsContain(Nc4hw4FormatOutOpList, start_kernel->type()) == false) {
return false;

@@ -179,7 +179,7 @@ void Nc4hw4PassAct(std::vector<kernel::LiteKernel *> *kernels, std::vector<Tenso
return;
}

void ConvNormC4PassActReplace(kernel::LiteKernel *conv_op, kernel::LiteKernel *in_op) {
void ConvNormC4PassActReplace(const kernel::LiteKernel *conv_op, const kernel::LiteKernel *in_op) {
conv_op->out_tensors().front()->set_format(NC4HW4);
in_op->in_tensors().front()->set_format(NC4HW4);
}

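The graph-pass helpers above now take const pointers: a pass that only inspects kernels, or only mutates the tensors their getters return, can promise not to reseat the kernels themselves. A compact sketch of why that still compiles; Kernel and Tensor here are illustrative, not the lite classes:

#include <vector>

struct Tensor { int format = 0; };

class Kernel {
 public:
  // Getter is const, but the pointed-to tensors stay mutable.
  const std::vector<Tensor *> &out_tensors() const { return outs_; }
  std::vector<Tensor *> outs_;
};

// A const Kernel * still allows retagging its output tensor's format.
void RetagOutput(const Kernel *op, int fmt) { op->out_tensors().front()->format = fmt; }

int main() {
  Tensor t;
  Kernel k;
  k.outs_.push_back(&t);
  RetagOutput(&k, 4);  // e.g. the NC4HW4 tag
  return t.format == 4 ? 0 : 1;
}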