!20493 [MS][LITE][CPU] x86 init time and operator optimization

Merge pull request !20493 from liuzhongkai/init_time
i-robot 2021-07-20 10:49:53 +00:00 committed by Gitee
commit 62b39b5d7e
6 changed files with 92 additions and 14 deletions


@@ -27,15 +27,12 @@ int LayerNormMeanAndSquareFp16(const float16_t *src, int num, float16_t *mean, f
  float square_sum = 0.0f;
  for (; index <= num - C8NUM; index += C8NUM) {
    float16x8_t srcv = vld1q_f16(src + index);
    float16x8_t squarev = vmulq_f16(srcv, srcv);
    for (int i = 0; i < C8NUM; ++i) {
      square_sum += srcv[i] * srcv[i];
    }
    float16x4_t sum2 = vadd_f16(vget_low_f16(srcv), vget_high_f16(srcv));
    float32x4_t sum_f32 = vcvt_f32_f16(sum2);
    sum += MS_ADDVQ_F32(sum_f32);
    float16x4_t square2 = vadd_f16(vget_low_f16(squarev), vget_high_f16(squarev));
    float32x4_t square_f32 = vcvt_f32_f16(square2);
    square_sum += MS_ADDVQ_F32(square_f32);
  }
  for (; index < num; index++) {
    sum += src[index];
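
This hunk drops the scalar square accumulation in favor of the same pairwise reduction already used for the running sum: square in fp16, fold the eight fp16 lanes to four with vadd_f16, widen to fp32 with vcvt_f32_f16, then add across lanes. A minimal standalone sketch of that pattern, assuming ARMv8.2-A fp16 arithmetic and assuming MS_ADDVQ_F32 is the project's wrapper for vaddvq_f32:

#include <arm_neon.h>

// Accumulate the sum and the sum of squares of an fp16 array in fp32.
static void SumAndSquareSumFp16(const float16_t *src, int num, float *sum, float *square_sum) {
  int index = 0;
  for (; index <= num - 8; index += 8) {
    float16x8_t srcv = vld1q_f16(src + index);
    float16x8_t squarev = vmulq_f16(srcv, srcv);
    // Fold 8 fp16 lanes to 4, widen to fp32, then reduce the 4 lanes to a scalar.
    *sum += vaddvq_f32(vcvt_f32_f16(vadd_f16(vget_low_f16(srcv), vget_high_f16(srcv))));
    *square_sum += vaddvq_f32(vcvt_f32_f16(vadd_f16(vget_low_f16(squarev), vget_high_f16(squarev))));
  }
  for (; index < num; index++) {  // scalar tail for the last num % 8 elements
    *sum += (float)src[index];
    *square_sum += (float)src[index] * (float)src[index];
  }
}

Widening to fp32 before the horizontal add keeps the accumulators in fp32, which avoids the precision loss a long fp16 accumulation would suffer.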


@@ -166,9 +166,83 @@ void PackNHWCToNHWCXFp32(const void *src, void *dst, int batch, int plane, int c
  }
}
void PackNHWCTo1HWCNXFp32(int kernel_h, int kernel_w, int output_channel, int oc_block_num, int input_channel,
#ifdef ENABLE_AVX
// PackNHWCToNXHWCXFp32 is the assembly-optimized counterpart of SWPackNHWCToNXHWCXFp32
void PackNHWCToNXHWCXFp32(int kernel_h, int kernel_w, int output_channel, int oc_block_num, int input_channel,
                          float *tmp_weight, const float *src) {
  // pack weight NHWC to 1HWCxN32 1HWCxN24 1HWCxN16 1HWCxN8
  // pack weight NHWC to N32HWC32 N24HWC24 N16HWC16 N8HWC8
  // output_channel: batch
  int ic8 = DOWN_ROUND(input_channel, C8NUM);
  int oc_block8 = DOWN_DIV(output_channel, C8NUM);
  int oc_block = 0;
  int oc = 0;
  int oc_remainder_step = 0;
  if (oc_block8 != oc_block_num) {
    oc_block8 = oc_block8 / C4NUM * C4NUM;
    oc_remainder_step = (oc_block_num - oc_block8) * C8NUM;
  }
  int plane = kernel_w * kernel_h;
  if (plane == 1) {  // conv 1x1 weight pack
    for (; oc < oc_block8; oc += (oc_block / C8NUM)) {
      oc_block = MSMIN(C4NUM, oc_block8 - oc) * C8NUM;  // max_tile = 32 ==> 24 ==> 16 ==> 8
      for (int oc_tmp = 0; oc_tmp < oc_block; oc_tmp += C8NUM) {
        int ic = 0;
        for (; ic < ic8; ic += C8NUM) {
          Transpose8X8Fp32Avx(src + ic, tmp_weight + ic * oc_block + oc_tmp, input_channel, oc_block);
        }
        for (; ic < input_channel; ++ic) {
          for (int j = 0; j < C8NUM; ++j) {
            tmp_weight[ic * oc_block + oc_tmp + j] = src[ic + input_channel * j];
          }
        }
        src += C8NUM * input_channel;
      }
      tmp_weight += oc_block * input_channel;
    }
    oc = output_channel - oc_block8 * C8NUM;
    for (int oc_remainder = 0; oc_remainder < oc; ++oc_remainder) {
      for (int ic = 0; ic < input_channel; ++ic) {
        tmp_weight[oc_remainder + oc_remainder_step * ic] = src[ic + oc_remainder * input_channel];
      }
    }
  } else {
    for (; oc < oc_block8; oc += (oc_block / C8NUM)) {
      oc_block = MSMIN(C4NUM, oc_block8 - oc) * C8NUM;  // max_tile = 32 ==> 24 ==> 16 ==> 8
      for (int oc_tmp = 0; oc_tmp < oc_block; oc_tmp += C8NUM) {
        for (int hw = 0; hw < plane; ++hw) {
          int ic = 0;
          for (; ic < ic8; ic += C8NUM) {
            Transpose8X8Fp32Avx(src + hw * input_channel + ic,
                                tmp_weight + hw * oc_block * input_channel + ic * oc_block + oc_tmp,
                                input_channel * plane, oc_block);
          }
          for (; ic < input_channel; ++ic) {
            for (int j = 0; j < C8NUM; ++j) {
              tmp_weight[ic * oc_block + oc_tmp + j + hw * oc_block * input_channel] =
                src[ic + input_channel * j * plane + hw * input_channel];
            }
          }
        }
        src += C8NUM * plane * input_channel;
      }
      tmp_weight += oc_block * input_channel * plane;
    }
    oc = output_channel - oc_block8 * C8NUM;
    for (int oc_remainder = 0; oc_remainder < oc; ++oc_remainder) {
      for (int hw = 0; hw < plane; ++hw) {
        for (int ic = 0; ic < input_channel; ++ic) {
          tmp_weight[oc_remainder + oc_remainder_step * ic + hw * input_channel * oc_remainder_step] =
            src[ic + (oc_remainder * plane + hw) * input_channel];
        }
      }
    }
  }
}
#ifdef ENABLE_DEBUG
void SWPackNHWCToNXHWCXFp32(int kernel_h, int kernel_w, int output_channel, int oc_block_num, int input_channel,
                            float *tmp_weight, const float *src) {
  // pack weight NHWC to N32HWC32 N24HWC24 N16HWC16 N8HWC8
  int oc_block = 0;
  for (int i = 0; i < oc_block_num; i += oc_block) {
    oc_block = MSMIN(C4NUM, oc_block_num - i);  // max_tile = 4
@@ -189,6 +263,9 @@ void PackNHWCTo1HWCNXFp32(int kernel_h, int kernel_w, int output_channel, int oc
    }
  }
}
#endif
#endif
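
For the 1x1 (plane == 1) path above, the packed layout stores each block of output channels with the input channel as the outer dimension and the block's eight output channels innermost, so the AVX sliding-window kernel can load one 256-bit weight vector per input channel. A simplified scalar sketch of the N8HWC8 case, assuming output_channel is a multiple of 8 (the real routine picks 32/24/16/8-wide blocks and uses Transpose8X8Fp32Avx for the full 8x8 tiles):

// Pack 1x1-conv weights from OC x IC (NHWC with plane == 1) into blocks of
// 8 output channels laid out as [block][input_channel][8].
void PackOC8For1x1Sketch(const float *src, float *dst, int output_channel, int input_channel) {
  for (int ob = 0; ob < output_channel / 8; ++ob) {
    for (int ic = 0; ic < input_channel; ++ic) {
      for (int j = 0; j < 8; ++j) {  // 8 consecutive output channels, contiguous in dst
        dst[(ob * input_channel + ic) * 8 + j] = src[(ob * 8 + j) * input_channel + ic];
      }
    }
  }
}

The oc_remainder loops in the real routine handle the tail when output_channel is not a multiple of 8, writing the leftover channels with stride oc_remainder_step instead.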
void PackNHWCToNHWC8Fp32(const void *src, void *dst, int batch, int plane, int channel) {
  int c8 = UP_DIV(channel, C8NUM);
  int c8_channel = c8 * C8NUM;


@@ -30,8 +30,6 @@ void PackNHWCToNC4HW4Fp32(const void *src, void *dst, int batch, int plane, int
void PackNCHWToNC4HW4Fp32(const void *src, void *dst, int batch, int plane, int channel);
void PackNHWCToNHWC4Fp32(const void *src, void *dst, int batch, int plane, int channel);
void PackNHWCToNHWCXFp32(const void *src, void *dst, int batch, int plane, int channel, int oc_tile);
void PackNHWCTo1HWCNXFp32(int kernel_h, int kernel_w, int output_channel, int oc_block_num, int input_channel,
                          float *tmp_weight, const float *src);
void PackNHWCToNHWC8Fp32(const void *src, void *dst, int batch, int plane, int channel);
// Note: If not multithreaded, please set task_id = 0 and thread_count = 0;
void PackNHWCToNCHWFp32(const void *src, void *dst, int batch, int plane, int channel, int task_id, int thread_count);
@@ -61,6 +59,12 @@ void Transpose8X8Fp32Arm64(const float *src_ptr, float *dst_ptr, int src_stride,
void Transpose8X8Fp32Arm32(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride);
#endif
#ifdef ENABLE_AVX
void PackNHWCToNXHWCXFp32(int kernel_h, int kernel_w, int output_channel, int oc_block_num, int input_channel,
                          float *tmp_weight, const float *src);
#ifdef ENABLE_DEBUG
void SWPackNHWCToNXHWCXFp32(int kernel_h, int kernel_w, int output_channel, int oc_block_num, int input_channel,
                            float *tmp_weight, const float *src);
#endif
void Transpose8X8Fp32Avx(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride);
#endif
#if defined(ENABLE_SSE) && !defined(ENABLE_AVX)


@@ -3895,7 +3895,7 @@ int SelectOutputUnit(const ConvParameter *conv_param) {
bool CheckIfUseWinograd(int *output_unit, const ConvParameter *conv_param) {
  if (conv_param->kernel_w_ == conv_param->kernel_h_ && conv_param->dilation_h_ == 1 && conv_param->dilation_w_ == 1 &&
      conv_param->stride_h_ == 1 && conv_param->stride_w_ == 1) {
      conv_param->stride_h_ == 1 && conv_param->stride_w_ == 1 && conv_param->input_channel_ != 1) {
    *output_unit = SelectOutputUnit(conv_param);
    if (*output_unit > 1) {
      return true;
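
The newly added input_channel_ != 1 condition keeps single-input-channel convolutions off the Winograd path, where the input and weight transforms would likely cost more than the multiplications they save. A hedged sketch of how such a check is typically consumed at a call site (the kernel type and helper names here are hypothetical):

int output_unit = 0;
ConvKernel *kernel = NULL;  // hypothetical kernel handle
if (CheckIfUseWinograd(&output_unit, conv_param)) {
  kernel = CreateWinogradConvKernel(conv_param, output_unit);  // hypothetical helper
} else {
  kernel = CreateCommonConvKernel(conv_param);  // hypothetical helper, e.g. sliding window
}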


@@ -50,7 +50,7 @@ int ConvolutionDepthwiseSWCPUKernelX86::InitWeightBias() {
    MS_LOG(ERROR) << "Malloc packed_weight_ failed!";
    return RET_NULL_PTR;
  }
  PackNHWCTo1HWCNXFp32(weight_tensor->Height(), weight_tensor->Width(), weight_tensor->Batch(), oc_algin,
  PackNHWCToNXHWCXFp32(weight_tensor->Height(), weight_tensor->Width(), weight_tensor->Batch(), oc_algin,
                       weight_tensor->Channel(), packed_weight_, origin_weight_);
  if (in_tensors_.size() == kInputSize2) {
    auto bias_size = oc_algin * oc_tile_;
@@ -194,7 +194,7 @@ void ConvolutionDepthwiseSWCPUKernelX86::FreePackedInputOutput() {
void ConvolutionDepthwiseSWCPUKernelX86::PackWeight() {
  auto weight_tensor = in_tensors_.at(kWeightIndex);
  int oc_algin = UP_DIV(weight_tensor->Batch(), oc_tile_);
  PackNHWCTo1HWCNXFp32(weight_tensor->Height(), weight_tensor->Width(), weight_tensor->Batch(), oc_algin,
  PackNHWCToNXHWCXFp32(weight_tensor->Height(), weight_tensor->Width(), weight_tensor->Batch(), oc_algin,
                       weight_tensor->Channel(), packed_weight_, origin_weight_);
}
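
At both call sites oc_algin is UP_DIV(weight_tensor->Batch(), oc_tile_), i.e. the number of oc_tile_-wide output-channel blocks the packed buffer is sized for; a final partial block is only partially filled, which is why the sliding-window kernel below memsets the buffer before packing. A small worked sketch of that sizing arithmetic, assuming oc_tile_ is C8NUM (8) as on AVX:

#include <stdio.h>

#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))

int main(void) {
  int output_channel = 30;  // e.g. weight_tensor->Batch()
  int oc_tile = 8;          // assumed oc_tile_ for the AVX sliding-window kernel
  int oc_algin = UP_DIV(output_channel, oc_tile);
  // 4 blocks of 8 channels: 32 slots in the packed buffer, the last 2 are padding.
  printf("blocks=%d padded_oc=%d\n", oc_algin, oc_algin * oc_tile);
  return 0;
}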


@@ -45,7 +45,7 @@ int ConvolutionSWCPUKernel::InitWeightBias() {
    return RET_NULL_PTR;
  }
  memset(packed_weight_, 0, pack_weight_size * sizeof(float));
  PackNHWCTo1HWCNXFp32(kernel_h, kernel_w, output_channel, oc_block_num, input_channel, packed_weight_,
  PackNHWCToNXHWCXFp32(kernel_h, kernel_w, output_channel, oc_block_num, input_channel, packed_weight_,
                       ori_weight_data_);
  if (in_tensors_.size() == kInputSize2) {
    packed_bias_ = reinterpret_cast<float *>(malloc(oc_block_num * oc_tile_ * sizeof(float)));