!25455 [MSLITE] code clean

Merge pull request !25455 from ling/pr
This commit is contained in:
i-robot 2021-10-28 01:48:46 +00:00 committed by Gitee
commit 59f03e8e21
40 changed files with 413 additions and 353 deletions

View File

@ -258,35 +258,36 @@ void PackNHWCToNXHWCXFp32(int kernel_h, int kernel_w, int output_channel, int oc
tmp_weight[oc_remainder + oc_remainder_step * ic] = src[ic + oc_remainder * input_channel];
}
}
} else {
for (; oc < oc_block8; oc += (oc_block / C8NUM)) {
oc_block = MSMIN(C4NUM, oc_block8 - oc) * C8NUM; // max_tile = 32 ==> 24 ==> 16 ==> 8
for (int oc_tmp = 0; oc_tmp < oc_block; oc_tmp += C8NUM) {
for (int hw = 0; hw < plane; ++hw) {
int ic = 0;
for (; ic < ic8; ic += C8NUM) {
Transpose8X8Fp32Avx(src + hw * input_channel + ic,
tmp_weight + hw * oc_block * input_channel + ic * oc_block + oc_tmp,
input_channel * plane, oc_block);
}
for (; ic < input_channel; ++ic) {
for (int j = 0; j < C8NUM; ++j) {
tmp_weight[ic * oc_block + oc_tmp + j + hw * oc_block * input_channel] =
src[ic + input_channel * j * plane + hw * input_channel];
}
}
}
src += C8NUM * plane * input_channel;
}
tmp_weight += oc_block * input_channel * plane;
}
oc = output_channel - oc_block8 * C8NUM;
for (int oc_remainder = 0; oc_remainder < oc; ++oc_remainder) {
return;
}
for (; oc < oc_block8; oc += (oc_block / C8NUM)) {
oc_block = MSMIN(C4NUM, oc_block8 - oc) * C8NUM; // max_tile = 32 ==> 24 ==> 16 ==> 8
for (int oc_tmp = 0; oc_tmp < oc_block; oc_tmp += C8NUM) {
for (int hw = 0; hw < plane; ++hw) {
for (int ic = 0; ic < input_channel; ++ic) {
tmp_weight[oc_remainder + oc_remainder_step * ic + hw * input_channel * oc_remainder_step] =
src[ic + (oc_remainder * plane + hw) * input_channel];
int ic = 0;
for (; ic < ic8; ic += C8NUM) {
Transpose8X8Fp32Avx(src + hw * input_channel + ic,
tmp_weight + hw * oc_block * input_channel + ic * oc_block + oc_tmp,
input_channel * plane, oc_block);
}
for (; ic < input_channel; ++ic) {
for (int j = 0; j < C8NUM; ++j) {
tmp_weight[ic * oc_block + oc_tmp + j + hw * oc_block * input_channel] =
src[ic + input_channel * j * plane + hw * input_channel];
}
}
}
src += C8NUM * plane * input_channel;
}
tmp_weight += oc_block * input_channel * plane;
}
oc = output_channel - oc_block8 * C8NUM;
for (int oc_remainder = 0; oc_remainder < oc; ++oc_remainder) {
for (int hw = 0; hw < plane; ++hw) {
for (int ic = 0; ic < input_channel; ++ic) {
tmp_weight[oc_remainder + oc_remainder_step * ic + hw * input_channel * oc_remainder_step] =
src[ic + (oc_remainder * plane + hw) * input_channel];
}
}
}

View File

@ -19,15 +19,12 @@
#include "nnacl/errorcode.h"
#include "nnacl/op_base.h"
int AvgPooling(const float *input_ptr, float *output_ptr, const PoolingParameter *pooling_param, int task_id,
float minf, float maxf) {
int win_w = pooling_param->window_w_;
int win_h = pooling_param->window_h_;
int AvgPoolingBatch(const float *src_b_ptr, float *dst_b_ptr, const PoolingParameter *pooling_param, int task_id,
float minf, float maxf) {
int win_w = pooling_param->window_w_, win_h = pooling_param->window_h_;
int in_w = pooling_param->input_w_, in_h = pooling_param->input_h_;
int output_w = pooling_param->output_w_, output_h = pooling_param->output_h_;
int channel = pooling_param->input_channel_;
int in_w = pooling_param->input_w_;
int in_h = pooling_param->input_h_;
int output_w = pooling_param->output_w_;
int output_h = pooling_param->output_h_;
int out_plane = output_w * output_h;
int out_tile_count = UP_DIV(out_plane, TILE_NUM);
NNACL_CHECK_ZERO_RETURN_ERR(output_w);
@ -42,190 +39,218 @@ int AvgPooling(const float *input_ptr, float *output_ptr, const PoolingParameter
MS_FLOAT32X4 max_value = MS_MOVQ_F32(maxf);
#endif
for (int batch = 0; batch < pooling_param->output_batch_; batch++) {
const float *src_b_ptr = input_ptr + batch * in_h * in_w * channel;
float *dst_b_ptr = output_ptr + batch * output_h * output_w * channel;
for (int thread_id = task_id; thread_id < out_tile_count; thread_id += pooling_param->thread_num_) {
int cal_start_index = thread_id * TILE_NUM;
int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? TILE_NUM : (out_plane - cal_start_index);
for (int i = 0; i < real_cal_num; i++) {
int index = cal_start_index + i;
int out_w_index = index % output_w;
int out_h_index = index / output_w;
int in_w_index = out_w_index * pooling_param->stride_w_ - pooling_param->pad_l_;
int in_h_index = out_h_index * pooling_param->stride_h_ - pooling_param->pad_u_;
for (int thread_id = task_id; thread_id < out_tile_count; thread_id += pooling_param->thread_num_) {
int cal_start_index = thread_id * TILE_NUM;
int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? TILE_NUM : (out_plane - cal_start_index);
for (int i = 0; i < real_cal_num; i++) {
int index = cal_start_index + i;
int out_w_index = index % output_w;
int out_h_index = index / output_w;
int in_w_index = out_w_index * pooling_param->stride_w_ - pooling_param->pad_l_;
int in_h_index = out_h_index * pooling_param->stride_h_ - pooling_param->pad_u_;
const float *src_plane_ptr = src_b_ptr;
float *dst_plane_ptr = dst_b_ptr + index * channel;
const float *src_plane_ptr = src_b_ptr;
float *dst_plane_ptr = dst_b_ptr + index * channel;
int real_win_h_start = MSMAX(0, -in_h_index);
int real_win_h_end = MSMIN(win_h, in_h - in_h_index);
int real_win_w_start = MSMAX(0, -in_w_index);
int real_win_w_end = MSMIN(win_w, in_w - in_w_index);
int ci = 0;
int real_win_h_start = MSMAX(0, -in_h_index);
int real_win_h_end = MSMIN(win_h, in_h - in_h_index);
int real_win_w_start = MSMAX(0, -in_w_index);
int real_win_w_end = MSMIN(win_w, in_w - in_w_index);
int ci = 0;
#ifdef ENABLE_AVX
for (; ci < c8; ci += C8NUM) {
const float *src_c_ptr = src_plane_ptr + ci;
float *dst_c_ptr = dst_plane_ptr + ci;
MS_FLOAT32X8 tmp_avg = MS_MOV256_F32(0);
int real_count = 0;
for (int h = real_win_h_start; h < real_win_h_end; h++) {
for (int w = real_win_w_start; w < real_win_w_end; w++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel;
tmp_avg = MS_ADD256_F32(tmp_avg, MS_LD256_F32(src_win_ptr));
++real_count;
} // win_w loop
} // win_h loop
if (real_count == 0) {
return NNACL_ERR;
}
tmp_avg = MS_DIV256_F32(tmp_avg, MS_MOV256_F32(real_count));
tmp_avg = MS_MAX256_F32(tmp_avg, min_value_8);
tmp_avg = MS_MIN256_F32(tmp_avg, max_value_8);
MS_ST256_F32(dst_c_ptr, tmp_avg);
} // ic8-1 loop
for (; ci < c8; ci += C8NUM) {
const float *src_c_ptr = src_plane_ptr + ci;
float *dst_c_ptr = dst_plane_ptr + ci;
MS_FLOAT32X8 tmp_avg = MS_MOV256_F32(0);
int real_count = 0;
for (int h = real_win_h_start; h < real_win_h_end; h++) {
for (int w = real_win_w_start; w < real_win_w_end; w++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel;
tmp_avg = MS_ADD256_F32(tmp_avg, MS_LD256_F32(src_win_ptr));
++real_count;
} // win_w loop
} // win_h loop
if (real_count == 0) {
return NNACL_ERR;
}
tmp_avg = MS_DIV256_F32(tmp_avg, MS_MOV256_F32(real_count));
tmp_avg = MS_MAX256_F32(tmp_avg, min_value_8);
tmp_avg = MS_MIN256_F32(tmp_avg, max_value_8);
MS_ST256_F32(dst_c_ptr, tmp_avg);
} // ic8-1 loop
#endif
#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
for (; ci < c4; ci += C4NUM) {
const float *src_c_ptr = src_plane_ptr + ci;
float *dst_c_ptr = dst_plane_ptr + ci;
MS_FLOAT32X4 tmp_avg = MS_MOVQ_F32(0);
int real_count = 0;
for (int h = real_win_h_start; h < real_win_h_end; h++) {
for (int w = real_win_w_start; w < real_win_w_end; w++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel;
tmp_avg = MS_ADDQ_F32(tmp_avg, MS_LDQ_F32(src_win_ptr));
++real_count;
} // win_w loop
} // win_h loop
if (real_count == 0) {
return NNACL_ERR;
}
tmp_avg = MS_DIVQ_F32(tmp_avg, MS_MOVQ_F32(real_count));
tmp_avg = MS_MAXQ_F32(tmp_avg, min_value);
tmp_avg = MS_MINQ_F32(tmp_avg, max_value);
MS_STQ_F32(dst_c_ptr, tmp_avg);
} // ic4-1 loop
for (; ci < c4; ci += C4NUM) {
const float *src_c_ptr = src_plane_ptr + ci;
float *dst_c_ptr = dst_plane_ptr + ci;
MS_FLOAT32X4 tmp_avg = MS_MOVQ_F32(0);
int real_count = 0;
for (int h = real_win_h_start; h < real_win_h_end; h++) {
for (int w = real_win_w_start; w < real_win_w_end; w++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel;
tmp_avg = MS_ADDQ_F32(tmp_avg, MS_LDQ_F32(src_win_ptr));
++real_count;
} // win_w loop
} // win_h loop
if (real_count == 0) {
return NNACL_ERR;
}
tmp_avg = MS_DIVQ_F32(tmp_avg, MS_MOVQ_F32(real_count));
tmp_avg = MS_MAXQ_F32(tmp_avg, min_value);
tmp_avg = MS_MINQ_F32(tmp_avg, max_value);
MS_STQ_F32(dst_c_ptr, tmp_avg);
} // ic4-1 loop
#endif
for (; ci < channel; ci++) {
const float *src_c_ptr = src_plane_ptr + ci;
float *dst_c_ptr = dst_plane_ptr + ci;
float tmp_avg = 0;
int real_count = 0;
for (int h = real_win_h_start; h < real_win_h_end; h++) {
for (int w = real_win_w_start; w < real_win_w_end; w++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel;
tmp_avg += src_win_ptr[0];
++real_count;
} // win_w loop
} // win_h loop
if (real_count == 0) {
return NNACL_ERR;
}
tmp_avg = tmp_avg / (float)real_count;
tmp_avg = fmaxf(tmp_avg, minf);
tmp_avg = fminf(tmp_avg, maxf);
dst_c_ptr[0] = tmp_avg;
} // channel_res loop
} // real_cal_num loop
} // out_plane loop
} // out_batch loop
for (; ci < channel; ci++) {
const float *src_c_ptr = src_plane_ptr + ci;
float *dst_c_ptr = dst_plane_ptr + ci;
float tmp_avg = 0;
int real_count = 0;
for (int h = real_win_h_start; h < real_win_h_end; h++) {
for (int w = real_win_w_start; w < real_win_w_end; w++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel;
tmp_avg += src_win_ptr[0];
++real_count;
} // win_w loop
} // win_h loop
if (real_count == 0) {
return NNACL_ERR;
}
tmp_avg = tmp_avg / (float)real_count;
tmp_avg = fmaxf(tmp_avg, minf);
tmp_avg = fminf(tmp_avg, maxf);
dst_c_ptr[0] = tmp_avg;
} // channel_res loop
} // real_cal_num loop
} // out_plane loop
return NNACL_OK;
}
int AvgPooling(const float *input_ptr, float *output_ptr, const PoolingParameter *pooling_param, int task_id,
               float minf, float maxf) {
  // Average pooling entry point: slices the NHWC input/output by batch and
  // delegates each batch to AvgPoolingBatch, propagating the first error.
  const int in_w = pooling_param->input_w_;
  const int in_h = pooling_param->input_h_;
  const int out_w = pooling_param->output_w_;
  const int out_h = pooling_param->output_h_;
  const int channel = pooling_param->input_channel_;
  const int batch_num = pooling_param->output_batch_;
  const int in_batch_stride = in_h * in_w * channel;    // elements per input batch
  const int out_batch_stride = out_h * out_w * channel;  // elements per output batch
  for (int b = 0; b < batch_num; ++b) {
    const float *src = input_ptr + b * in_batch_stride;
    float *dst = output_ptr + b * out_batch_stride;
    int ret = AvgPoolingBatch(src, dst, pooling_param, task_id, minf, maxf);
    if (ret != NNACL_OK) {
      return ret;
    }
  }
  return NNACL_OK;
}
// Max pooling over a single NHWC batch. Work is tiled over the output plane
// (output_h * output_w) in TILE_NUM-sized chunks; this thread handles every
// thread_num_-th tile starting at task_id. Channels are processed SIMD-first
// (8-wide under AVX, then 4-wide under NEON/SSE) with a scalar tail, and every
// result is clamped to [minf, maxf].
// Returns NNACL_OK, or NNACL_ERR if output_w is zero.
int MaxPoolingBatch(const float *src_b_ptr, float *dst_b_ptr, const PoolingParameter *pooling_param, int task_id,
float minf, float maxf) {
int in_w = pooling_param->input_w_, in_h = pooling_param->input_h_;
int win_w = pooling_param->window_w_, win_h = pooling_param->window_h_;
int output_w = pooling_param->output_w_, output_h = pooling_param->output_h_;
int channel = pooling_param->input_channel_;
int out_plane = output_w * output_h;
int out_tile_count = UP_DIV(out_plane, TILE_NUM);
NNACL_CHECK_ZERO_RETURN_ERR(output_w);
#ifdef ENABLE_AVX
// Largest multiple of 8 channels handled by the AVX path.
int c8 = channel / C8NUM * C8NUM;
MS_FLOAT32X8 min_value_8 = MS_MOV256_F32(minf);
MS_FLOAT32X8 max_value_8 = MS_MOV256_F32(maxf);
#endif
#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
// Largest multiple of 4 channels handled by the 128-bit SIMD path.
int c4 = channel / C4NUM * C4NUM;
MS_FLOAT32X4 min_value = MS_MOVQ_F32(minf);
MS_FLOAT32X4 max_value = MS_MOVQ_F32(maxf);
#endif
for (int thread_id = task_id; thread_id < out_tile_count; thread_id += pooling_param->thread_num_) {
int cal_start_index = thread_id * TILE_NUM;
// Last tile may be partial.
int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? TILE_NUM : (out_plane - cal_start_index);
for (int i = 0; i < real_cal_num; i++) {
int index = cal_start_index + i;
int out_w_index = index % output_w;
int out_h_index = index / output_w;
// Top-left corner of the pooling window in input coordinates (may be
// negative because of padding).
int in_w_index = out_w_index * pooling_param->stride_w_ - pooling_param->pad_l_;
int in_h_index = out_h_index * pooling_param->stride_h_ - pooling_param->pad_u_;
const float *src_plane_ptr = src_b_ptr;
float *dst_plane_ptr = dst_b_ptr + index * channel;
// Clip the window to the valid input region (drop padded rows/cols).
int real_win_h_start = MSMAX(0, -in_h_index);
int real_win_h_end = MSMIN(win_h, in_h - in_h_index);
int real_win_w_start = MSMAX(0, -in_w_index);
int real_win_w_end = MSMIN(win_w, in_w - in_w_index);
int ci = 0;
#ifdef ENABLE_AVX
// 8-channel-wide max over the clipped window.
for (; ci < c8; ci += C8NUM) {
const float *src_c_ptr = src_plane_ptr + ci;
float *dst_c_ptr = dst_plane_ptr + ci;
MS_FLOAT32X8 tmp_max = MS_MOV256_F32(-FLT_MAX);
for (int kh = real_win_h_start; kh < real_win_h_end; kh++) {
for (int kw = real_win_w_start; kw < real_win_w_end; kw++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel;
tmp_max = MS_MAX256_F32(tmp_max, MS_LD256_F32(src_win_ptr));
} // win_w loop
} // win_h loop
// Clamp to [minf, maxf] (fused activation bounds).
tmp_max = MS_MAX256_F32(tmp_max, min_value_8);
tmp_max = MS_MIN256_F32(tmp_max, max_value_8);
MS_ST256_F32(dst_c_ptr, tmp_max);
} // ic8 loop
#endif
#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
// 4-channel-wide max for the remaining channels.
for (; ci < c4; ci += C4NUM) {
const float *src_c_ptr = src_plane_ptr + ci;
float *dst_c_ptr = dst_plane_ptr + ci;
MS_FLOAT32X4 tmp_max = MS_MOVQ_F32(-FLT_MAX);
for (int kh = real_win_h_start; kh < real_win_h_end; kh++) {
for (int kw = real_win_w_start; kw < real_win_w_end; kw++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel;
tmp_max = MS_MAXQ_F32(tmp_max, MS_LDQ_F32(src_win_ptr));
} // win_w loop
} // win_h loop
tmp_max = MS_MAXQ_F32(tmp_max, min_value);
tmp_max = MS_MINQ_F32(tmp_max, max_value);
MS_STQ_F32(dst_c_ptr, tmp_max);
} // ic4 loop
#endif
// Scalar tail for channels not covered by SIMD.
for (; ci < channel; ci++) {
float *dst_c_ptr = dst_plane_ptr + ci;
const float *src_c_ptr = src_plane_ptr + ci;
float tmp_max = -FLT_MAX;
for (int kh = real_win_h_start; kh < real_win_h_end; kh++) {
for (int kw = real_win_w_start; kw < real_win_w_end; kw++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel;
tmp_max = fmaxf(tmp_max, src_win_ptr[0]);
} // win_w loop
} // win_h loop
tmp_max = fmaxf(tmp_max, minf);
tmp_max = fminf(tmp_max, maxf);
dst_c_ptr[0] = tmp_max;
} // channel_res loop
} // real_cal_num loop
} // out_plane loop
return NNACL_OK;
}
int MaxPooling(const float *input_ptr, float *output_ptr, const PoolingParameter *pooling_param, int task_id,
float minf, float maxf) {
int win_w = pooling_param->window_w_;
int win_h = pooling_param->window_h_;
int channel = pooling_param->input_channel_;
int in_w = pooling_param->input_w_;
int in_h = pooling_param->input_h_;
int output_w = pooling_param->output_w_;
int output_h = pooling_param->output_h_;
int channel = pooling_param->input_channel_;
int output_batch = pooling_param->output_batch_;
int out_plane = output_w * output_h;
int out_tile_count = UP_DIV(out_plane, TILE_NUM);
NNACL_CHECK_ZERO_RETURN_ERR(output_w);
#ifdef ENABLE_AVX
int c8 = channel / C8NUM * C8NUM;
MS_FLOAT32X8 min_value_8 = MS_MOV256_F32(minf);
MS_FLOAT32X8 max_value_8 = MS_MOV256_F32(maxf);
#endif
#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
int c4 = channel / C4NUM * C4NUM;
MS_FLOAT32X4 min_value = MS_MOVQ_F32(minf);
MS_FLOAT32X4 max_value = MS_MOVQ_F32(maxf);
#endif
for (int batch = 0; batch < output_batch; batch++) {
const float *src_b_ptr = input_ptr + batch * in_h * in_w * channel;
float *dst_b_ptr = output_ptr + batch * output_h * output_w * channel;
for (int thread_id = task_id; thread_id < out_tile_count; thread_id += pooling_param->thread_num_) {
int cal_start_index = thread_id * TILE_NUM;
int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? TILE_NUM : (out_plane - cal_start_index);
for (int i = 0; i < real_cal_num; i++) {
int index = cal_start_index + i;
int out_w_index = index % output_w;
int out_h_index = index / output_w;
int in_w_index = out_w_index * pooling_param->stride_w_ - pooling_param->pad_l_;
int in_h_index = out_h_index * pooling_param->stride_h_ - pooling_param->pad_u_;
const float *src_plane_ptr = src_b_ptr;
float *dst_plane_ptr = dst_b_ptr + index * channel;
int real_win_h_start = MSMAX(0, -in_h_index);
int real_win_h_end = MSMIN(win_h, in_h - in_h_index);
int real_win_w_start = MSMAX(0, -in_w_index);
int real_win_w_end = MSMIN(win_w, in_w - in_w_index);
int ci = 0;
#ifdef ENABLE_AVX
for (; ci < c8; ci += C8NUM) {
const float *src_c_ptr = src_plane_ptr + ci;
float *dst_c_ptr = dst_plane_ptr + ci;
MS_FLOAT32X8 tmp_max = MS_MOV256_F32(-FLT_MAX);
for (int kh = real_win_h_start; kh < real_win_h_end; kh++) {
for (int kw = real_win_w_start; kw < real_win_w_end; kw++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel;
tmp_max = MS_MAX256_F32(tmp_max, MS_LD256_F32(src_win_ptr));
} // win_w loop
} // win_h loop
tmp_max = MS_MAX256_F32(tmp_max, min_value_8);
tmp_max = MS_MIN256_F32(tmp_max, max_value_8);
MS_ST256_F32(dst_c_ptr, tmp_max);
} // ic8 loop
#endif
#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
for (; ci < c4; ci += C4NUM) {
const float *src_c_ptr = src_plane_ptr + ci;
float *dst_c_ptr = dst_plane_ptr + ci;
MS_FLOAT32X4 tmp_max = MS_MOVQ_F32(-FLT_MAX);
for (int kh = real_win_h_start; kh < real_win_h_end; kh++) {
for (int kw = real_win_w_start; kw < real_win_w_end; kw++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel;
tmp_max = MS_MAXQ_F32(tmp_max, MS_LDQ_F32(src_win_ptr));
} // win_w loop
} // win_h loop
tmp_max = MS_MAXQ_F32(tmp_max, min_value);
tmp_max = MS_MINQ_F32(tmp_max, max_value);
MS_STQ_F32(dst_c_ptr, tmp_max);
} // ic4 loop
#endif
for (; ci < channel; ci++) {
float *dst_c_ptr = dst_plane_ptr + ci;
const float *src_c_ptr = src_plane_ptr + ci;
float tmp_max = -FLT_MAX;
for (int kh = real_win_h_start; kh < real_win_h_end; kh++) {
for (int kw = real_win_w_start; kw < real_win_w_end; kw++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel;
tmp_max = fmaxf(tmp_max, src_win_ptr[0]);
} // win_w loop
} // win_h loop
tmp_max = fmaxf(tmp_max, minf);
tmp_max = fminf(tmp_max, maxf);
dst_c_ptr[0] = tmp_max;
} // channel_res loop
} // real_cal_num loop
} // out_plane loop
} // out_batch loop
int ret = MaxPoolingBatch(src_b_ptr, dst_b_ptr, pooling_param, task_id, minf, maxf);
if (ret != NNACL_OK) {
return ret;
}
}
return NNACL_OK;
}

View File

@ -76,6 +76,7 @@
#define THIRD_INPUT 2
#define FOURTH_INPUT 3
#define FIFTH_INPUT 4
#define SIXTH_INPUT 5
#define DIMENSION_1D 1
#define DIMENSION_2D 2

View File

@ -165,6 +165,10 @@ int LiteSession::ConvertTensorsData(const lite::Model *model, size_t tensor_inde
auto ret = DecompressTensor(*src_tensor, dst_tensor);
if (ret == RET_NO_CHANGE) {
if (src_tensor->data()->size() < dst_tensor->Size()) {
MS_LOG(ERROR) << "Tensor data shape invalid";
return RET_ERROR;
}
dst_tensor->set_data(const_cast<unsigned char *>(src_tensor->data()->data()));
dst_tensor->set_own_data(false);
} else if (ret != RET_OK) {

View File

@ -28,12 +28,32 @@ namespace mindspore::kernel {
namespace {
constexpr size_t kPadCommonInputSize = 2;
} // namespace
int PadFp16CPUKernel::RunImpl(int task_id) {
int PadFp16CPUKernel::RunImpl(int task_id) const {
PadFp16(input_, output_, in_, out_, pad_param_->paddings_, task_id, op_parameter_->thread_num_);
return RET_OK;
}
int PadFp16CPUKernel::RunMirrorPadImpl(int task_id) {
void PadFp16CPUKernel::RunMirrorPadImplFast(const MirrorPadBlock &block, const float16_t *input_data,
                                            float16_t *output_data) const {
  // Walk the 5-D output region described by `block`; at the innermost level,
  // mirror-pad one contiguous run of block.size_[5] elements starting at the
  // accumulated output offset.
  for (int i0 = 0; i0 < block.size_[0]; ++i0) {
    const int base0 = block.out_offset_ + i0 * block.out_stride_[0];
    for (int i1 = 0; i1 < block.size_[1]; ++i1) {
      const int base1 = base0 + i1 * block.out_stride_[1];
      for (int i2 = 0; i2 < block.size_[2]; ++i2) {
        const int base2 = base1 + i2 * block.out_stride_[2];
        for (int i3 = 0; i3 < block.size_[3]; ++i3) {
          const int base3 = base2 + i3 * block.out_stride_[3];
          for (int i4 = 0; i4 < block.size_[4]; ++i4) {
            const int out_index = base3 + i4 * block.out_stride_[4];
            MirrorPadFp16(input_data, output_data, in_, pad_param_, out_index, out_index + block.size_[5]);
          }
        }
      }
    }
  }
}
int PadFp16CPUKernel::RunMirrorPadImpl(int task_id) const {
auto input = in_tensors_.at(0);
CHECK_NULL_RETURN(input);
auto output = out_tensors_.at(0);
@ -51,23 +71,7 @@ int PadFp16CPUKernel::RunMirrorPadImpl(int task_id) {
/* calculate region part */
for (size_t i = task_id; i < mirror_pad_block_.size(); i += op_parameter_->thread_num_) {
auto block = mirror_pad_block_[i];
for (int a = 0; a < block.size_[0]; a++) {
int out_a_index = block.out_offset_ + a * block.out_stride_[0];
for (int b = 0; b < block.size_[1]; b++) {
int out_b_index = out_a_index + b * block.out_stride_[1];
for (int c = 0; c < block.size_[2]; ++c) {
int out_c_index = out_b_index + c * block.out_stride_[2];
for (int d = 0; d < block.size_[3]; ++d) {
int out_d_index = out_c_index + d * block.out_stride_[3];
for (int e = 0; e < block.size_[4]; ++e) {
int output_index = out_d_index + e * block.out_stride_[4];
MirrorPadFp16(input_data, output_data, in_, pad_param_, output_index, output_index + block.size_[5]);
}
}
}
}
}
RunMirrorPadImplFast(block, input_data, output_data);
}
return RET_OK;
}

View File

@ -30,8 +30,11 @@ class PadFp16CPUKernel : public PadCPUKernel {
~PadFp16CPUKernel() {}
int Run() override;
int RunImpl(int task_id) override;
int RunMirrorPadImpl(int task_id) override;
int RunImpl(int task_id) const override;
int RunMirrorPadImpl(int task_id) const override;
private:
void RunMirrorPadImplFast(const MirrorPadBlock &block, const float16_t *input_data, float16_t *output_data) const;
private:
float16_t *input_ = nullptr;

View File

@ -25,7 +25,7 @@ using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_PReLUFusion;
namespace mindspore::kernel {
int PReluFp16CPUKernel::DoExcute(int task_id) {
int PReluFp16CPUKernel::DoExcute(int task_id) const {
int thread_num = param_->op_parameter_.thread_num_;
if (thread_num == 0) {
MS_LOG(ERROR) << "thread_num is 0!";

View File

@ -27,7 +27,7 @@ class PReluFp16CPUKernel : public PReluCPUKernel {
: PReluCPUKernel(parameter, inputs, outputs, ctx) {}
~PReluFp16CPUKernel() = default;
int DoExcute(int task_id) override;
int DoExcute(int task_id) const override;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_PRELU_FP16_H_

View File

@ -102,7 +102,7 @@ void GatherNdCPUKernel::InitOffset() {
}
}
int GatherNdCPUKernel::DoGatherNd(int task_id) {
int GatherNdCPUKernel::DoGatherNd(int task_id) const {
int count = MSMIN(thread_sz_stride_, count_ - task_id * thread_sz_stride_);
if (count <= 0) {
return RET_OK;
@ -116,8 +116,8 @@ int GatherNdCPUKernel::DoGatherNd(int task_id) {
return RET_OK;
}
int GatherNdRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto g_kernel = reinterpret_cast<GatherNdCPUKernel *>(cdata);
int GatherNdRun(const void *cdata, int task_id, float, float) {
auto g_kernel = reinterpret_cast<const GatherNdCPUKernel *>(cdata);
auto ret = g_kernel->DoGatherNd(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "GatherNdRun error task_id[" << task_id << "] error_code[" << ret << "]";

View File

@ -37,7 +37,7 @@ class GatherNdCPUKernel : public InnerKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
int DoGatherNd(int task_id);
int DoGatherNd(int task_id) const;
private:
void InitOffset();

View File

@ -42,7 +42,7 @@ int GatherCPUKernel::Prepare() {
int GatherCPUKernel::ReSize() { return RET_OK; }
int GatherCPUKernel::DoGather(int task_id) {
int GatherCPUKernel::DoGather(int task_id) const {
auto input_tensor = in_tensors_.at(0);
auto indices_tensor = in_tensors_.at(1);
auto out_tensor = out_tensors_.at(0);
@ -81,8 +81,8 @@ int GatherCPUKernel::DoGather(int task_id) {
return error_code;
}
int GatherRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto gather_kernel = reinterpret_cast<GatherCPUKernel *>(cdata);
int GatherRun(const void *cdata, int task_id, float, float) {
auto gather_kernel = reinterpret_cast<const GatherCPUKernel *>(cdata);
auto error_code = gather_kernel->DoGather(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "GatherRun error task_id[" << task_id << "] error_code[" << error_code << "]";

View File

@ -34,7 +34,7 @@ class GatherCPUKernel : public InnerKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
int DoGather(int task_id);
int DoGather(int task_id) const;
private:
int *indices_data_ = nullptr;

View File

@ -35,7 +35,7 @@ int GluCPUKernel::MallocTmpBuffer() {
FreeTmpBuffer();
auto in_tensor = in_tensors_.front();
for (int i = 0; i < kSplitNum; i++) {
split_ptr_[i] = reinterpret_cast<int8_t *>(ms_context_->allocator->Malloc(in_tensor->Size() / kSplitNum));
split_ptr_[i] = ms_context_->allocator->Malloc(in_tensor->Size() / kSplitNum);
if (split_ptr_[i] == nullptr) {
MS_LOG(ERROR) << "GluCPUKernel malloc split ptr failed.";
return RET_ERROR;
@ -96,8 +96,7 @@ int GluCPUKernel::ReSize() {
return RET_OK;
}
int GluCPUKernel::Split(int task_id) {
input_ptr_ = in_tensors_.front()->data();
int GluCPUKernel::Split(int task_id) const {
MS_CHECK_INT_MUL_NOT_OVERFLOW(task_id, thread_n_stride_, RET_ERROR);
int num_unit_thread = MSMIN(thread_n_stride_, num_unit_ - task_id * thread_n_stride_);
if (num_unit_thread <= 0) {
@ -105,8 +104,8 @@ int GluCPUKernel::Split(int task_id) {
}
int thread_offset = task_id * thread_n_stride_;
auto ret =
DoSplit(input_ptr_, reinterpret_cast<void **>(split_ptr_.data()), in_tensors_.front()->shape().data(),
thread_offset, num_unit_thread, &split_param_, lite::DataTypeSize(in_tensors_.front()->data_type()));
DoSplit(input_ptr_, const_cast<void **>(split_ptr_.data()), in_tensors_.front()->shape().data(), thread_offset,
num_unit_thread, &split_param_, lite::DataTypeSize(in_tensors_.front()->data_type()));
if (ret != RET_OK) {
MS_LOG(ERROR) << "Split error task_id[" << task_id << "] error_code[" << ret << "]";
return RET_ERROR;
@ -114,7 +113,7 @@ int GluCPUKernel::Split(int task_id) {
return RET_OK;
}
int GluCPUKernel::Sigmoid(int task_id) {
int GluCPUKernel::Sigmoid(int task_id) const {
auto input_addr = reinterpret_cast<float *>(split_ptr_.at(1));
auto output_addr = reinterpret_cast<float *>(sigmoid_ptr_);
auto length = in_tensors_.at(0)->ElementsNum() / kGluBranchNum;
@ -128,7 +127,7 @@ int GluCPUKernel::Sigmoid(int task_id) {
return ::Sigmoid(input_addr + stride * task_id, count, output_addr + stride * task_id);
}
int GluCPUKernel::Mul(int task_id) {
int GluCPUKernel::Mul(int task_id) const {
auto input_addr0 = reinterpret_cast<float *>(split_ptr_.at(0));
auto input_addr1 = reinterpret_cast<float *>(sigmoid_ptr_);
auto output_addr = reinterpret_cast<float *>(out_tensors_.at(0)->data());
@ -144,22 +143,24 @@ int GluCPUKernel::Mul(int task_id) {
return ElementMul(input_addr0 + offset, input_addr1 + offset, output_addr + offset, count);
}
static int SplitRun(void *cdata, int task_id, float, float) {
auto g_kernel = reinterpret_cast<GluCPUKernel *>(cdata);
static int SplitRun(const void *cdata, int task_id, float, float) {
auto g_kernel = reinterpret_cast<const GluCPUKernel *>(cdata);
return g_kernel->Split(task_id);
}
static int SigmoidRun(void *cdata, int task_id, float, float) {
auto activation_kernel = reinterpret_cast<GluCPUKernel *>(cdata);
static int SigmoidRun(const void *cdata, int task_id, float, float) {
auto activation_kernel = reinterpret_cast<const GluCPUKernel *>(cdata);
return activation_kernel->Sigmoid(task_id);
}
static int MulRun(void *cdata, int task_id, float, float) {
auto g_kernel = reinterpret_cast<GluCPUKernel *>(cdata);
static int MulRun(const void *cdata, int task_id, float, float) {
auto g_kernel = reinterpret_cast<const GluCPUKernel *>(cdata);
return g_kernel->Mul(task_id);
}
int GluCPUKernel::Run() {
input_ptr_ = in_tensors_.front()->data();
auto ret = MallocTmpBuffer();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Malloc tmp buffer failed";

View File

@ -43,9 +43,11 @@ class GluCPUKernel : public InnerKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
int Split(int task_id);
int Sigmoid(int task_id);
int Mul(int task_id);
int Split(int task_id) const;
int Sigmoid(int task_id) const;
int Mul(int task_id) const;
private:
void FreeTmpBuffer();
int MallocTmpBuffer();
@ -54,8 +56,8 @@ class GluCPUKernel : public InnerKernel {
GluParameter *glu_param_ = nullptr;
void *input_ptr_ = nullptr;
int8_t *sigmoid_ptr_ = nullptr;
std::vector<int8_t *> split_ptr_;
int split_sizes_[kSplitNum];
std::vector<void *> split_ptr_;
int split_sizes_[kSplitNum] = {0};
int thread_n_stride_ = 0;
int usable_thread_num_ = 0;
int num_unit_ = 0;

View File

@ -50,7 +50,7 @@ class GruCPUKernel : public InnerKernel {
const int weight_r_index = 2;
const int bias_index = 3;
float *buffer_[4];
float *buffer_[4] = {nullptr};
const int gate_num = 3;
const int packed_input_index = 0;
const int input_gate_index = 1;

View File

@ -45,7 +45,7 @@ int InstanceNormCPUKernel::ReSize() {
return RET_OK;
}
int InstanceNormCPUKernel::DoInstanceNorm(int task_id) {
int InstanceNormCPUKernel::DoInstanceNorm(int task_id) const {
int ret = 0;
if (in_tensors_[0]->format() == NC4HW4) { // arm64 x86-avx x86-sse x86
#ifdef ENABLE_AVX
@ -63,8 +63,8 @@ int InstanceNormCPUKernel::DoInstanceNorm(int task_id) {
return RET_OK;
}
int InstanceNormRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto kernel = reinterpret_cast<InstanceNormCPUKernel *>(cdata);
int InstanceNormRun(const void *cdata, int task_id, float, float) {
auto kernel = reinterpret_cast<const InstanceNormCPUKernel *>(cdata);
auto ret = kernel->DoInstanceNorm(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "InstanceNormRun error task_id[" << task_id << "] error_code[" << ret << "]";

View File

@ -35,7 +35,9 @@ class InstanceNormCPUKernel : public InnerKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
int DoInstanceNorm(int task_id);
int DoInstanceNorm(int task_id) const;
private:
void FreeTmpBuffer() {
if (tmp_src_data_ != nullptr) {
ms_context_->allocator->Free(tmp_src_data_);

View File

@ -89,7 +89,7 @@ int L2NormCPUKernel::ReSize() {
return RET_OK;
}
int L2NormCPUKernel::CalcSquareSum(int task_id) {
int L2NormCPUKernel::CalcSquareSum(int task_id) const {
int unit = UP_DIV(l2_norm_param_->data_num_, op_parameter_->thread_num_);
if (INT_MUL_OVERFLOW(task_id, unit)) {
MS_LOG(ERROR) << "int mul overflow.";
@ -100,7 +100,7 @@ int L2NormCPUKernel::CalcSquareSum(int task_id) {
return CalcThreadSquareSum(input_ptr_, tmp_sum_ + task_id, begin, end);
}
int L2NormCPUKernel::DivSqrtSum(int task_id) {
int L2NormCPUKernel::DivSqrtSum(int task_id) const {
int unit = UP_DIV(l2_norm_param_->data_num_, op_parameter_->thread_num_);
if (INT_MUL_OVERFLOW(task_id, unit)) {
MS_LOG(ERROR) << "int mul overflow.";
@ -111,7 +111,7 @@ int L2NormCPUKernel::DivSqrtSum(int task_id) {
return ThreadDivSqrtSum(input_ptr_, output_ptr_, l2_norm_param_, sqrt_sum_, begin, end);
}
int L2NormCPUKernel::CalcL2NormTrailingAxis(int task_id) {
int L2NormCPUKernel::CalcL2NormTrailingAxis(int task_id) const {
auto input = in_tensors_.at(0);
if (input->shape().back() == 0) {
MS_LOG(ERROR) << "input->shape().back() is 0";
@ -128,8 +128,8 @@ int L2NormCPUKernel::CalcL2NormTrailingAxis(int task_id) {
return ThreadTrailingAxis(input_ptr_, output_ptr_, l2_norm_param_, begin, end);
}
int SquareSumRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto kernel = reinterpret_cast<L2NormCPUKernel *>(cdata);
int SquareSumRun(const void *cdata, int task_id, float, float) {
auto kernel = reinterpret_cast<const L2NormCPUKernel *>(cdata);
auto ret = kernel->CalcSquareSum(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "L2Norm SquareSumRun error task_id[" << task_id << "] error_code[" << ret << "]";
@ -138,9 +138,9 @@ int SquareSumRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
return RET_OK;
}
int L2NormRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
int L2NormRun(const void *cdata, int task_id, float, float) {
CHECK_NULL_RETURN(cdata);
auto kernel = reinterpret_cast<L2NormCPUKernel *>(cdata);
auto kernel = reinterpret_cast<const L2NormCPUKernel *>(cdata);
auto ret = kernel->DivSqrtSum(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "L2Norm L2NormRun error task_id[" << task_id << "] error_code[" << ret << "]";
@ -149,9 +149,9 @@ int L2NormRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
return RET_OK;
}
int L2NormTrailingAxisRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
int L2NormTrailingAxisRun(const void *cdata, int task_id, float, float) {
CHECK_NULL_RETURN(cdata);
auto kernel = reinterpret_cast<L2NormCPUKernel *>(cdata);
auto kernel = reinterpret_cast<const L2NormCPUKernel *>(cdata);
auto ret = kernel->CalcL2NormTrailingAxis(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "L2Norm TrailingAxisRun error task_id[" << task_id << "] error_code[" << ret << "]";

View File

@ -36,9 +36,9 @@ class L2NormCPUKernel : public InnerKernel {
}
~L2NormCPUKernel() { FreeTmpBuffer(); }
int CalcSquareSum(int task_id);
int DivSqrtSum(int task_id);
int CalcL2NormTrailingAxis(int task_id);
int CalcSquareSum(int task_id) const;
int DivSqrtSum(int task_id) const;
int CalcL2NormTrailingAxis(int task_id) const;
int Prepare() override;
int ReSize() override;

View File

@ -65,7 +65,7 @@ int LayerNormCPUKernel::ReSize() {
return RET_OK;
}
int LayerNormCPUKernel::DoLayerNorm(int thread_id) {
int LayerNormCPUKernel::DoLayerNorm(int thread_id) const {
auto ret = LayerNorm(src_data_, gamma_data_, beta_data_, dst_data_, mean_data_, var_data_, param_, thread_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "DoLayerNorm error error_code[" << ret << "]";
@ -74,8 +74,8 @@ int LayerNormCPUKernel::DoLayerNorm(int thread_id) {
return RET_OK;
}
int LayerNormRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto kernel = reinterpret_cast<LayerNormCPUKernel *>(cdata);
int LayerNormRun(const void *cdata, int task_id, float, float) {
auto kernel = reinterpret_cast<const LayerNormCPUKernel *>(cdata);
CHECK_NULL_RETURN(kernel);
auto ret = kernel->DoLayerNorm(task_id);
if (ret != RET_OK) {

View File

@ -35,7 +35,7 @@ class LayerNormCPUKernel : public InnerKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
int DoLayerNorm(int thread_id);
int DoLayerNorm(int thread_id) const;
private:
LayerNormParameter *param_ = nullptr;

View File

@ -35,7 +35,7 @@ int LocalResponseNormCPUKernel::Prepare() {
int LocalResponseNormCPUKernel::ReSize() { return RET_OK; }
int LocalResponseNormCPUKernel::DoLocalResponseNorm(int task_id) {
int LocalResponseNormCPUKernel::DoLocalResponseNorm(int task_id) const {
auto input_tensor = in_tensors_.front();
auto out_tensor = out_tensors_.front();
auto input_ptr = reinterpret_cast<float *>(input_tensor->MutableData());
@ -67,8 +67,8 @@ int LocalResponseNormCPUKernel::DoLocalResponseNorm(int task_id) {
return RET_OK;
}
int LocalResponseNormRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto lrn = reinterpret_cast<LocalResponseNormCPUKernel *>(cdata);
int LocalResponseNormRun(const void *cdata, int task_id, float, float) {
auto lrn = reinterpret_cast<const LocalResponseNormCPUKernel *>(cdata);
auto error_code = lrn->DoLocalResponseNorm(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "LocalResponseNormRun error task_id[" << task_id << "] error_code[" << error_code << "]";

View File

@ -32,7 +32,7 @@ class LocalResponseNormCPUKernel : public InnerKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
int DoLocalResponseNorm(int task_id);
int DoLocalResponseNorm(int task_id) const;
private:
int thread_count_;

View File

@ -79,7 +79,7 @@ int LogSoftmaxCPUKernel::ReSize() {
return RET_OK;
}
int LogSoftmaxCPUKernel::DoLogSoftmaxLastAxis(int task_id) {
int LogSoftmaxCPUKernel::DoLogSoftmaxLastAxis(int task_id) const {
MS_CHECK_FALSE(op_parameter_->thread_num_ == 0, RET_ERROR);
int unit = UP_DIV(out_plane_size_, op_parameter_->thread_num_);
int begin = task_id * unit;
@ -94,8 +94,8 @@ int LogSoftmaxCPUKernel::DoLogSoftmaxLastAxis(int task_id) {
return RET_OK;
}
int LogSoftmaxLastAxisRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto kernel = reinterpret_cast<LogSoftmaxCPUKernel *>(cdata);
int LogSoftmaxLastAxisRun(const void *cdata, int task_id, float, float) {
auto kernel = reinterpret_cast<const LogSoftmaxCPUKernel *>(cdata);
CHECK_NULL_RETURN(kernel);
auto ret = kernel->DoLogSoftmaxLastAxis(task_id);
if (ret != RET_OK) {

View File

@ -32,7 +32,7 @@ class LogSoftmaxCPUKernel : public SoftmaxBaseCPUKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
int DoLogSoftmaxLastAxis(int task_id);
int DoLogSoftmaxLastAxis(int task_id) const;
private:
float *tmp_data_ = nullptr;

View File

@ -322,7 +322,7 @@ int LstmCPUKernel::MallocRunBuffer() {
return RET_OK;
}
void LstmCPUKernel::InputWeightMatMul(int task_id) {
void LstmCPUKernel::InputWeightMatMul(int task_id) const {
int current_start_oc = task_id * input_thread_stride_ * col_tile_;
int current_rest_oc = 0;
current_rest_oc = lstm_param_->hidden_size_ - current_start_oc;
@ -339,8 +339,8 @@ void LstmCPUKernel::InputWeightMatMul(int task_id) {
cur_oc, lstm_param_->hidden_size_, OutType_Nhwc);
}
int LstmInputMulWeightRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto kernel = reinterpret_cast<LstmCPUKernel *>(cdata);
int LstmInputMulWeightRun(const void *cdata, int task_id, float, float) {
auto kernel = reinterpret_cast<const LstmCPUKernel *>(cdata);
CHECK_NULL_RETURN(kernel);
kernel->InputWeightMatMul(task_id);
return RET_OK;

View File

@ -36,7 +36,7 @@ class LstmCPUKernel : public InnerKernel {
int ReSize() override;
int Run() override;
void InputWeightMatMul(int task_id);
void InputWeightMatMul(int task_id) const;
private:
void FreeTmpBuffer();
@ -50,9 +50,9 @@ class LstmCPUKernel : public InnerKernel {
const float *state_bias, float *hidden_state, float *cell_state, bool is_backward);
int InnerExecute(float *output, const float *input, float *hidden_state, float *cell_state);
void RecordStates(const float *cell_state, int step);
const float *weight_loop_;
const float *bias_loop_;
float *gate_loop_;
const float *weight_loop_ = nullptr;
const float *bias_loop_ = nullptr;
float *gate_loop_ = nullptr;
int input_thread_count_ = 0;
int input_thread_stride_ = 0;
@ -64,7 +64,7 @@ class LstmCPUKernel : public InnerKernel {
const int weight_h_index = 2;
const int bias_index = 3;
float *buffer_[7];
float *buffer_[7] = {nullptr};
const int gate_num = 4;
const int packed_input_index = 0;
const int input_gate_index = 1;

View File

@ -21,9 +21,9 @@
using mindspore::lite::RET_NULL_PTR;
namespace mindspore::kernel {
int MatmulBaseFloatRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
int MatmulBaseFloatRun(const void *cdata, int task_id, float, float) {
CHECK_NULL_RETURN(cdata);
auto op = reinterpret_cast<MatmulFp32BaseCPUKernel *>(cdata);
auto op = reinterpret_cast<const MatmulFp32BaseCPUKernel *>(cdata);
auto error_code = op->FloatRun(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "MatmulFp32Run error task_id[" << task_id << "] error_code[" << error_code << "]";
@ -126,32 +126,44 @@ int MatmulFp32BaseCPUKernel::CalBroadCastBiasDataElements() {
}
int MatmulFp32BaseCPUKernel::InitBiasData() {
if (in_tensors_.size() == 3) {
auto bias_tensor = in_tensors_[2];
size_t max_bias_data = UP_ROUND(bias_tensor->ElementsNum(), col_tile_);
// malloc addr need to aligned to 32 bytes
if (in_tensors_.size() != FOURTH_INPUT) {
return RET_OK;
}
auto bias_tensor = in_tensors_[THIRD_INPUT];
if (bias_tensor == nullptr) {
MS_LOG(ERROR) << "bias_tensor invalid";
return RET_ERROR;
}
if (bias_tensor->ElementsNum() == 1) {
// broadcast bias data
size_t max_bias_data = CalBroadCastBiasDataElements();
bias_ptr_ = reinterpret_cast<float *>(malloc(max_bias_data * static_cast<int>(sizeof(float))));
if (bias_ptr_ == nullptr) {
MS_LOG(ERROR) << "malloc bias_ptr_ failed";
return RET_ERROR;
}
// whether to broadcast bias data
if (bias_tensor->ElementsNum() == 1) {
max_bias_data = CalBroadCastBiasDataElements();
float broadcast_data = (reinterpret_cast<float *>(bias_tensor->data()))[0];
// broadcast bias data
for (size_t i = 0; i < max_bias_data; ++i) {
bias_ptr_[i] = broadcast_data;
}
} else {
memset(bias_ptr_, 0, max_bias_data * static_cast<int>(sizeof(float)));
memcpy(bias_ptr_, bias_tensor->data(), bias_tensor->ElementsNum() * static_cast<int>(sizeof(float)));
float broadcast_data = (reinterpret_cast<float *>(bias_tensor->data()))[0];
// broadcast bias data
for (size_t i = 0; i < max_bias_data; ++i) {
bias_ptr_[i] = broadcast_data;
}
return RET_OK;
}
size_t max_bias_data = UP_ROUND(bias_tensor->ElementsNum(), col_tile_);
// malloc addr need to aligned to 32 bytes
bias_ptr_ = reinterpret_cast<float *>(malloc(max_bias_data * static_cast<int>(sizeof(float))));
if (bias_ptr_ == nullptr) {
MS_LOG(ERROR) << "malloc bias_ptr_ failed";
return RET_ERROR;
}
memset(bias_ptr_, 0, max_bias_data * static_cast<int>(sizeof(float)));
memcpy(bias_ptr_, bias_tensor->data(), bias_tensor->ElementsNum() * static_cast<int>(sizeof(float)));
return RET_OK;
}
int MatmulFp32BaseCPUKernel::InitMatrixA(const float *src_ptr) {
int MatmulFp32BaseCPUKernel::InitMatrixA(const float *src_ptr) const {
CHECK_NULL_RETURN(src_ptr);
#ifdef ENABLE_ARM64
if (vec_matmul_) {
@ -175,7 +187,7 @@ int MatmulFp32BaseCPUKernel::InitMatrixA(const float *src_ptr) {
return RET_OK;
}
int MatmulFp32BaseCPUKernel::InitMatrixB(const float *src_ptr) {
int MatmulFp32BaseCPUKernel::InitMatrixB(const float *src_ptr) const {
CHECK_NULL_RETURN(src_ptr);
for (int i = 0; i < params_->batch; i++) {
const float *src = src_ptr + i * params_->deep_ * params_->col_;

View File

@ -47,8 +47,8 @@ class MatmulFp32BaseCPUKernel : public InnerKernel {
protected:
int InitBufferA();
int InitBufferB();
int InitMatrixA(const float *src_ptr);
int InitMatrixB(const float *src_ptr);
int InitMatrixA(const float *src_ptr) const;
int InitMatrixB(const float *src_ptr) const;
void FreeBiasBuf();
int InitBiasData();
void InitParameter();

View File

@ -110,7 +110,7 @@ void ExpandDims(std::vector<int> *shape, size_t size) {
}
int NonMaxSuppressionCPUKernel::Run_Selecte(bool simple_out, int box_num, int batch_num, int class_num,
float *scores_data, float *box_data) {
const float *scores_data, const float *box_data) {
std::vector<NMSBox> selected_box_per_class;
selected_box_per_class.reserve(std::min(static_cast<int32_t>(box_num), max_output_per_class_));
std::vector<NMSIndex> selected_index;
@ -119,8 +119,8 @@ int NonMaxSuppressionCPUKernel::Run_Selecte(bool simple_out, int box_num, int ba
int batch_offset = i * class_num * box_num;
for (auto j = 0; j < class_num; ++j) {
// per batch per class filter
float *per_class_scores = scores_data + batch_offset + j * box_num;
float *box = box_data + i * box_num * kBoxPointNum;
const float *per_class_scores = scores_data + batch_offset + j * box_num;
const float *box = box_data + i * box_num * kBoxPointNum;
std::vector<NMSBox> above_score_candidates;
above_score_candidates.reserve(box_num);
for (auto k = 0; k < box_num; ++k) {

View File

@ -41,7 +41,8 @@ class NonMaxSuppressionCPUKernel : public InnerKernel {
private:
int GetParams();
int Run_Selecte(bool simple_out, int box_num, int batch_num, int class_num, float *scores_data, float *box_data);
int Run_Selecte(bool simple_out, int box_num, int batch_num, int class_num, const float *scores_data,
const float *box_data);
private:
int center_point_box_ = 0;

View File

@ -206,8 +206,8 @@ int PadCPUKernel::ExtendPaddings(int *paddings, int length, const int *ori_paddi
return RET_OK;
}
int PadImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto padKernel = reinterpret_cast<PadCPUKernel *>(cdata);
int PadImpl(const void *cdata, int task_id, float, float) {
auto padKernel = reinterpret_cast<const PadCPUKernel *>(cdata);
int error_code = padKernel->RunImpl(task_id);
if (error_code != NNACL_OK) {
MS_LOG(ERROR) << "Pad Run error task_id[" << task_id << "] error_code[" << error_code << "]";
@ -216,7 +216,7 @@ int PadImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
return RET_OK;
}
int PadCPUKernel::RunImpl(int task_id) {
int PadCPUKernel::RunImpl(int task_id) const {
auto input = in_tensors_.at(0);
auto output = out_tensors_.at(0);
auto input_data = reinterpret_cast<float *>(input->data());
@ -228,8 +228,8 @@ int PadCPUKernel::RunImpl(int task_id) {
return RET_OK;
}
int MirrorPadImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto padKernel = reinterpret_cast<PadCPUKernel *>(cdata);
int MirrorPadImpl(const void *cdata, int task_id, float, float) {
auto padKernel = reinterpret_cast<const PadCPUKernel *>(cdata);
int error_code = padKernel->RunMirrorPadImpl(task_id);
if (error_code != NNACL_OK) {
MS_LOG(ERROR) << "Pad Run error task_id[" << task_id << "] error_code[" << error_code << "]";
@ -238,7 +238,27 @@ int MirrorPadImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
return RET_OK;
}
int PadCPUKernel::RunMirrorPadImpl(int task_id) {
void PadCPUKernel::RunMirrorPadImplFast(const MirrorPadBlock &block, const float *input_data,
float *output_data) const {
for (int a = 0; a < block.size_[FIRST_INPUT]; a++) {
int out_a_index = block.out_offset_ + a * block.out_stride_[FIRST_INPUT];
for (int b = 0; b < block.size_[SECOND_INPUT]; b++) {
int out_b_index = out_a_index + b * block.out_stride_[SECOND_INPUT];
for (int c = 0; c < block.size_[THIRD_INPUT]; ++c) {
int out_c_index = out_b_index + c * block.out_stride_[THIRD_INPUT];
for (int d = 0; d < block.size_[FOURTH_INPUT]; ++d) {
int out_d_index = out_c_index + d * block.out_stride_[FOURTH_INPUT];
for (int e = 0; e < block.size_[FIFTH_INPUT]; ++e) {
int output_index = out_d_index + e * block.out_stride_[FIFTH_INPUT];
MirrorPad(input_data, output_data, in_, pad_param_, output_index, output_index + block.size_[SIXTH_INPUT]);
}
}
}
}
}
}
int PadCPUKernel::RunMirrorPadImpl(int task_id) const {
auto input = in_tensors_.at(0);
auto output = out_tensors_.at(0);
auto input_data = reinterpret_cast<float *>(input->data());
@ -253,23 +273,7 @@ int PadCPUKernel::RunMirrorPadImpl(int task_id) {
/* calculate region part */
for (size_t i = task_id; i < mirror_pad_block_.size(); i += static_cast<size_t>(op_parameter_->thread_num_)) {
auto block = mirror_pad_block_[i];
for (int a = 0; a < block.size_[0]; a++) {
int out_a_index = block.out_offset_ + a * block.out_stride_[0];
for (int b = 0; b < block.size_[1]; b++) {
int out_b_index = out_a_index + b * block.out_stride_[1];
for (int c = 0; c < block.size_[2]; ++c) {
int out_c_index = out_b_index + c * block.out_stride_[2];
for (int d = 0; d < block.size_[3]; ++d) {
int out_d_index = out_c_index + d * block.out_stride_[3];
for (int e = 0; e < block.size_[4]; ++e) {
int output_index = out_d_index + e * block.out_stride_[4];
MirrorPad(input_data, output_data, in_, pad_param_, output_index, output_index + block.size_[5]);
}
}
}
}
}
RunMirrorPadImplFast(block, input_data, output_data);
}
return RET_OK;
}

View File

@ -41,8 +41,8 @@ class PadCPUKernel : public InnerKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
virtual int RunImpl(int task_id);
virtual int RunMirrorPadImpl(int task_id);
virtual int RunImpl(int task_id) const;
virtual int RunMirrorPadImpl(int task_id) const;
private:
int CheckPaddings(const int *paddings, int length, const int *input_shape, int mode);
@ -50,6 +50,7 @@ class PadCPUKernel : public InnerKernel {
int ExtendShape(int *shape, int length, const int *ori_shape, int rank) const;
int ExtendPaddings(int *paddings, int length, const int *ori_paddings, int ori_length) const;
void InitMirrorPadBlock();
void RunMirrorPadImplFast(const MirrorPadBlock &block, const float *input_data, float *output_data) const;
protected:
int HandleMirrorPad();
@ -60,8 +61,8 @@ class PadCPUKernel : public InnerKernel {
std::vector<MirrorPadBlock> mirror_pad_block_;
};
int PadImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale);
int MirrorPadImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale);
int PadImpl(const void *cdata, int task_id, float, float);
int MirrorPadImpl(const void *cdata, int task_id, float, float);
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_PAD_H_

View File

@ -50,7 +50,7 @@ int PoolingCPUKernel::ReSize() {
return RET_OK;
}
int PoolingCPUKernel::RunImpl(int task_id) {
int PoolingCPUKernel::RunImpl(int task_id) const {
auto input_ptr = reinterpret_cast<float *>(in_tensors_.at(kInputIndex)->MutableData());
CHECK_NULL_RETURN(input_ptr);
auto output_ptr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->MutableData());
@ -76,8 +76,8 @@ int PoolingCPUKernel::RunImpl(int task_id) {
return RET_OK;
}
int PoolingImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto pooling = reinterpret_cast<PoolingCPUKernel *>(cdata);
int PoolingImpl(const void *cdata, int task_id, float, float) {
auto pooling = reinterpret_cast<const PoolingCPUKernel *>(cdata);
auto error_code = pooling->RunImpl(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "Pooling Run error task_id[" << task_id << "] error_code[" << error_code << "]";

View File

@ -32,7 +32,7 @@ class PoolingCPUKernel : public PoolingBaseCPUKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
int RunImpl(int task_id);
int RunImpl(int task_id) const;
private:
};

View File

@ -33,9 +33,8 @@ int PowerCPUKernel::Prepare() {
int PowerCPUKernel::ReSize() { return RET_OK; }
int PowerImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
CHECK_NULL_RETURN(cdata);
auto kernel = reinterpret_cast<PowerCPUKernel *>(cdata);
int PowerImpl(const void *cdata, int task_id, float, float) {
auto kernel = reinterpret_cast<const PowerCPUKernel *>(cdata);
CHECK_NULL_RETURN(kernel);
auto ret = kernel->RunImpl(task_id);
if (ret != RET_OK) {
@ -54,7 +53,7 @@ int PowerCPUKernel::Run() {
return RET_OK;
}
int PowerCPUKernel::RunImpl(int task_id) {
int PowerCPUKernel::RunImpl(int task_id) const {
auto x_addr = reinterpret_cast<float *>(in_tensors_.at(0)->MutableData());
CHECK_NULL_RETURN(x_addr);
auto output_addr = reinterpret_cast<float *>(out_tensors_.at(0)->MutableData());

View File

@ -36,7 +36,7 @@ class PowerCPUKernel : public InnerKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
int RunImpl(int task_id);
int RunImpl(int task_id) const;
private:
int thread_count_;

View File

@ -27,8 +27,8 @@ using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_PReLUFusion;
namespace mindspore::kernel {
static int PReluRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto PRelu = reinterpret_cast<PReluCPUKernel *>(cdata);
static int PReluRun(const void *cdata, int task_id, float, float) {
auto PRelu = reinterpret_cast<const PReluCPUKernel *>(cdata);
auto ret = PRelu->DoExcute(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "PReluRun error task_id[" << task_id << "] error_code[" << ret << "]";
@ -55,7 +55,7 @@ int PReluCPUKernel::Prepare() {
return ReSize();
}
int PReluCPUKernel::DoExcute(int task_id) {
int PReluCPUKernel::DoExcute(int task_id) const {
int thread_num = param_->op_parameter_.thread_num_;
if (thread_num == 0) {
MS_LOG(ERROR) << "thread_num is 0!";

View File

@ -34,7 +34,7 @@ class PReluCPUKernel : public InnerKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
virtual int DoExcute(int task_id);
virtual int DoExcute(int task_id) const;
protected:
PReluParameter *param_;

View File

@ -84,7 +84,7 @@ void Nc4hw4PassReplace(std::vector<kernel::LiteKernel *> *kernels, std::vector<T
return;
}
bool Nc4hw4PassMatch(std::vector<kernel::LiteKernel *> *kernels, size_t index) {
bool Nc4hw4PassMatch(const std::vector<kernel::LiteKernel *> *kernels, size_t index) {
kernel::LiteKernel *start_kernel = kernels->at(index);
if (IsContain(Nc4hw4FormatOutOpList, start_kernel->type()) == false) {
return false;
@ -179,7 +179,7 @@ void Nc4hw4PassAct(std::vector<kernel::LiteKernel *> *kernels, std::vector<Tenso
return;
}
void ConvNormC4PassActReplace(kernel::LiteKernel *conv_op, kernel::LiteKernel *in_op) {
void ConvNormC4PassActReplace(const kernel::LiteKernel *conv_op, const kernel::LiteKernel *in_op) {
conv_op->out_tensors().front()->set_format(NC4HW4);
in_op->in_tensors().front()->set_format(NC4HW4);
}