!20493 [MS][LITE][CPU] x86 initialization time and operator optimization
Merge pull request !20493 from liuzhongkai/init_time
This commit is contained in: commit 62b39b5d7e
@@ -27,15 +27,12 @@ int LayerNormMeanAndSquareFp16(const float16_t *src, int num, float16_t *mean, f
    float square_sum = 0.0f;
    for (; index <= num - C8NUM; index += C8NUM) {
      float16x8_t srcv = vld1q_f16(src + index);
      float16x8_t squarev = vmulq_f16(srcv, srcv);
      for (int i = 0; i < C8NUM; ++i) {
        square_sum += srcv[i] * srcv[i];
      }
      float16x4_t sum2 = vadd_f16(vget_low_f16(srcv), vget_high_f16(srcv));
      float32x4_t sum_f32 = vcvt_f32_f16(sum2);
      sum += MS_ADDVQ_F32(sum_f32);
      float16x4_t square2 = vadd_f16(vget_low_f16(squarev), vget_high_f16(squarev));
      float32x4_t square_f32 = vcvt_f32_f16(square2);
      square_sum += MS_ADDVQ_F32(square_f32);
    }
    for (; index < num; index++) {
      sum += src[index];
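Note: the hunk above replaces the per-lane scalar accumulation with a pairwise NEON reduction: the low and high halves of the 8-lane fp16 vector are added, widened to fp32, and reduced with a horizontal add (MS_ADDVQ_F32 wraps that reduction in MindSpore). A minimal standalone sketch of the same pattern, assuming an AArch64 toolchain with fp16 vector arithmetic; the function name and the use of vaddvq_f32 in place of MS_ADDVQ_F32 are illustrative, not from the patch.

#include <arm_neon.h>

// Reduce an fp16 buffer to its sum and sum of squares, accumulating in fp32.
// Same reduction shape as LayerNormMeanAndSquareFp16: pairwise fp16 add of the
// vector halves, widen to fp32, horizontal add; the tail is handled scalar.
static void SumAndSquareSumFp16(const float16_t *src, int num, float *out_sum, float *out_square_sum) {
  float sum = 0.0f;
  float square_sum = 0.0f;
  int index = 0;
  for (; index <= num - 8; index += 8) {
    float16x8_t srcv = vld1q_f16(src + index);
    float16x8_t squarev = vmulq_f16(srcv, srcv);
    float16x4_t sum2 = vadd_f16(vget_low_f16(srcv), vget_high_f16(srcv));
    sum += vaddvq_f32(vcvt_f32_f16(sum2));  // horizontal add of the widened halves
    float16x4_t square2 = vadd_f16(vget_low_f16(squarev), vget_high_f16(squarev));
    square_sum += vaddvq_f32(vcvt_f32_f16(square2));
  }
  for (; index < num; index++) {
    sum += (float)src[index];
    square_sum += (float)src[index] * (float)src[index];
  }
  *out_sum = sum;
  *out_square_sum = square_sum;
}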
@@ -166,9 +166,83 @@ void PackNHWCToNHWCXFp32(const void *src, void *dst, int batch, int plane, int c
  }
}

void PackNHWCTo1HWCNXFp32(int kernel_h, int kernel_w, int output_channel, int oc_block_num, int input_channel,
#ifdef ENABLE_AVX
// PackNHWCToNXHWCXFp32 is SWPackNHWCToNXHWCXFp32 asm optimize
void PackNHWCToNXHWCXFp32(int kernel_h, int kernel_w, int output_channel, int oc_block_num, int input_channel,
                          float *tmp_weight, const float *src) {
  // pack weight NHWC to 1HWCxN32 1HWCxN24 1HWCxN16 1HWCxN8
  // pack weight NHWC to N32HWC32 N24HWC24 N16HWC16 N8HWC8
  // output_channel: batch
  int ic8 = DOWN_ROUND(input_channel, C8NUM);
  int oc_block8 = DOWN_DIV(output_channel, C8NUM);
  int oc_block = 0;
  int oc = 0;
  int oc_remainder_step = 0;
  if (oc_block8 != oc_block_num) {
    oc_block8 = oc_block8 / C4NUM * C4NUM;
    oc_remainder_step = (oc_block_num - oc_block8) * C8NUM;
  }
  int plane = kernel_w * kernel_h;
  if (plane == 1) {  // conv 1x1 weight pack
    for (; oc < oc_block8; oc += (oc_block / C8NUM)) {
      oc_block = MSMIN(C4NUM, oc_block8 - oc) * C8NUM;  // max_tile = 32 ==> 24 ==> 16 ==> 8
      for (int oc_tmp = 0; oc_tmp < oc_block; oc_tmp += C8NUM) {
        int ic = 0;
        for (; ic < ic8; ic += C8NUM) {
          Transpose8X8Fp32Avx(src + ic, tmp_weight + ic * oc_block + oc_tmp, input_channel, oc_block);
        }
        for (; ic < input_channel; ++ic) {
          for (int j = 0; j < C8NUM; ++j) {
            tmp_weight[ic * oc_block + oc_tmp + j] = src[ic + input_channel * j];
          }
        }
        src += C8NUM * input_channel;
      }
      tmp_weight += oc_block * input_channel;
    }
    oc = output_channel - oc_block8 * C8NUM;
    for (int oc_remainder = 0; oc_remainder < oc; ++oc_remainder) {
      for (int ic = 0; ic < input_channel; ++ic) {
        tmp_weight[oc_remainder + oc_remainder_step * ic] = src[ic + oc_remainder * input_channel];
      }
    }
  } else {
    for (; oc < oc_block8; oc += (oc_block / C8NUM)) {
      oc_block = MSMIN(C4NUM, oc_block8 - oc) * C8NUM;  // max_tile = 32 ==> 24 ==> 16 ==> 8
      for (int oc_tmp = 0; oc_tmp < oc_block; oc_tmp += C8NUM) {
        for (int hw = 0; hw < plane; ++hw) {
          int ic = 0;
          for (; ic < ic8; ic += C8NUM) {
            Transpose8X8Fp32Avx(src + hw * input_channel + ic,
                                tmp_weight + hw * oc_block * input_channel + ic * oc_block + oc_tmp,
                                input_channel * plane, oc_block);
          }
          for (; ic < input_channel; ++ic) {
            for (int j = 0; j < C8NUM; ++j) {
              tmp_weight[ic * oc_block + oc_tmp + j + hw * oc_block * input_channel] =
                src[ic + input_channel * j * plane + hw * input_channel];
            }
          }
        }
        src += C8NUM * plane * input_channel;
      }
      tmp_weight += oc_block * input_channel * plane;
    }
    oc = output_channel - oc_block8 * C8NUM;
    for (int oc_remainder = 0; oc_remainder < oc; ++oc_remainder) {
      for (int hw = 0; hw < plane; ++hw) {
        for (int ic = 0; ic < input_channel; ++ic) {
          tmp_weight[oc_remainder + oc_remainder_step * ic + hw * input_channel * oc_remainder_step] =
            src[ic + (oc_remainder * plane + hw) * input_channel];
        }
      }
    }
  }
}

#ifdef ENABLE_DEBUG
void SWPackNHWCToNXHWCXFp32(int kernel_h, int kernel_w, int output_channel, int oc_block_num, int input_channel,
                            float *tmp_weight, const float *src) {
  // pack weight NHWC to N32HWC32 N24HWC24 N16HWC16 N8HWC8
  int oc_block = 0;
  for (int i = 0; i < oc_block_num; i += oc_block) {
    oc_block = MSMIN(C4NUM, oc_block_num - i);  // max_tile = 4

@@ -189,6 +263,9 @@ void PackNHWCTo1HWCNXFp32(int kernel_h, int kernel_w, int output_channel, int oc
    }
  }
}
#endif
#endif

void PackNHWCToNHWC8Fp32(const void *src, void *dst, int batch, int plane, int channel) {
  int c8 = UP_DIV(channel, C8NUM);
  int c8_channel = c8 * C8NUM;
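Note: the packed layout produced above can be read off from the remainder loops: within one output-channel tile of width oc_block (32, 24, 16 or 8), the NHWC weight element at (oc, hw, ic) lands at offset hw * oc_block * input_channel + ic * oc_block + (oc within the tile). A plain scalar sketch of the same index mapping with a fixed 8-wide tile (N8HWC8), no AVX transpose and no remainder handling; the function name and the multiple-of-8 assumption are illustrative, not part of the patch.

// Scalar reference of the N8HWC8 weight layout (one fixed 8-wide oc tile; the
// real kernel also uses 32/24/16-wide tiles, with oc_block in place of 8).
// Assumes output_channel is a multiple of 8; dst must hold
// output_channel * kernel_h * kernel_w * input_channel floats.
static void PackNHWCToN8HWC8Ref(int kernel_h, int kernel_w, int output_channel, int input_channel,
                                float *dst, const float *src) {
  const int plane = kernel_h * kernel_w;
  for (int oc = 0; oc < output_channel; ++oc) {
    int tile = oc / 8;  // which 8-wide output-channel tile
    int lane = oc % 8;  // position inside the tile
    for (int hw = 0; hw < plane; ++hw) {
      for (int ic = 0; ic < input_channel; ++ic) {
        dst[((tile * plane + hw) * input_channel + ic) * 8 + lane] =
          src[(oc * plane + hw) * input_channel + ic];
      }
    }
  }
}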
@@ -30,8 +30,6 @@ void PackNHWCToNC4HW4Fp32(const void *src, void *dst, int batch, int plane, int
void PackNCHWToNC4HW4Fp32(const void *src, void *dst, int batch, int plane, int channel);
void PackNHWCToNHWC4Fp32(const void *src, void *dst, int batch, int plane, int channel);
void PackNHWCToNHWCXFp32(const void *src, void *dst, int batch, int plane, int channel, int oc_tile);
void PackNHWCTo1HWCNXFp32(int kernel_h, int kernel_w, int output_channel, int oc_block_num, int input_channel,
                          float *tmp_weight, const float *src);
void PackNHWCToNHWC8Fp32(const void *src, void *dst, int batch, int plane, int channel);
// Note: If not multithreaded, please set task_id = 0 and thread_count = 0;
void PackNHWCToNCHWFp32(const void *src, void *dst, int batch, int plane, int channel, int task_id, int thread_count);
@@ -61,6 +59,12 @@ void Transpose8X8Fp32Arm64(const float *src_ptr, float *dst_ptr, int src_stride,
void Transpose8X8Fp32Arm32(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride);
#endif
#ifdef ENABLE_AVX
void PackNHWCToNXHWCXFp32(int kernel_h, int kernel_w, int output_channel, int oc_block_num, int input_channel,
                          float *tmp_weight, const float *src);
#ifdef ENABLE_DEBUG
void SWPackNHWCToNXHWCXFp32(int kernel_h, int kernel_w, int output_channel, int oc_block_num, int input_channel,
                            float *tmp_weight, const float *src);
#endif
void Transpose8X8Fp32Avx(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride);
#endif
#if defined(ENABLE_SSE) && !defined(ENABLE_AVX)
@@ -3895,7 +3895,7 @@ int SelectOutputUnit(const ConvParameter *conv_param) {

bool CheckIfUseWinograd(int *output_unit, const ConvParameter *conv_param) {
  if (conv_param->kernel_w_ == conv_param->kernel_h_ && conv_param->dilation_h_ == 1 && conv_param->dilation_w_ == 1 &&
      conv_param->stride_h_ == 1 && conv_param->stride_w_ == 1) {
      conv_param->stride_h_ == 1 && conv_param->stride_w_ == 1 && conv_param->input_channel_ != 1) {
    *output_unit = SelectOutputUnit(conv_param);
    if (*output_unit > 1) {
      return true;
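Note: the behavioural change in this hunk is the extra input_channel_ != 1 guard, so single-input-channel convolutions no longer take the Winograd path. As a standalone predicate the updated check reads roughly as follows; the trimmed struct and function name are illustrative, the field names follow ConvParameter as shown in the diff.

#include <stdbool.h>

// Trimmed view of the fields the check consults (illustrative).
typedef struct {
  int kernel_h_, kernel_w_;
  int dilation_h_, dilation_w_;
  int stride_h_, stride_w_;
  int input_channel_;
} ConvParamView;

// Mirrors the updated guard: square kernel, unit stride and dilation,
// and (new in this patch) more than one input channel.
static bool WinogradCandidate(const ConvParamView *p) {
  return p->kernel_w_ == p->kernel_h_ && p->dilation_h_ == 1 && p->dilation_w_ == 1 &&
         p->stride_h_ == 1 && p->stride_w_ == 1 && p->input_channel_ != 1;
}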
@@ -50,7 +50,7 @@ int ConvolutionDepthwiseSWCPUKernelX86::InitWeightBias() {
    MS_LOG(ERROR) << "Malloc packed_weight_ is failed!";
    return RET_NULL_PTR;
  }
  PackNHWCTo1HWCNXFp32(weight_tensor->Height(), weight_tensor->Width(), weight_tensor->Batch(), oc_algin,
  PackNHWCToNXHWCXFp32(weight_tensor->Height(), weight_tensor->Width(), weight_tensor->Batch(), oc_algin,
                       weight_tensor->Channel(), packed_weight_, origin_weight_);
  if (in_tensors_.size() == kInputSize2) {
    auto bias_size = oc_algin * oc_tile_;
@@ -194,7 +194,7 @@ void ConvolutionDepthwiseSWCPUKernelX86::FreePackedInputOutput() {
void ConvolutionDepthwiseSWCPUKernelX86::PackWeight() {
  auto weight_tensor = in_tensors_.at(kWeightIndex);
  int oc_algin = UP_DIV(weight_tensor->Batch(), oc_tile_);
  PackNHWCTo1HWCNXFp32(weight_tensor->Height(), weight_tensor->Width(), weight_tensor->Batch(), oc_algin,
  PackNHWCToNXHWCXFp32(weight_tensor->Height(), weight_tensor->Width(), weight_tensor->Batch(), oc_algin,
                       weight_tensor->Channel(), packed_weight_, origin_weight_);
}

@@ -45,7 +45,7 @@ int ConvolutionSWCPUKernel::InitWeightBias() {
    return RET_NULL_PTR;
  }
  memset(packed_weight_, 0, pack_weight_size * sizeof(float));
  PackNHWCTo1HWCNXFp32(kernel_h, kernel_w, output_channel, oc_block_num, input_channel, packed_weight_,
  PackNHWCToNXHWCXFp32(kernel_h, kernel_w, output_channel, oc_block_num, input_channel, packed_weight_,
                       ori_weight_data_);
  if (in_tensors_.size() == kInputSize2) {
    packed_bias_ = reinterpret_cast<float *>(malloc(oc_block_num * oc_tile_ * sizeof(float)));
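Note: in all three kernels the only caller-side change is the switch from PackNHWCTo1HWCNXFp32 to the new PackNHWCToNXHWCXFp32; the block count is still derived from the output-channel tile. A hedged sketch of that call pattern, assuming an 8-wide AVX output-channel tile; the wrapper name, variable names, and the expanded UP_DIV are illustrative, not from the patch.

#include <stdlib.h>
#include <string.h>

// Declaration from the AVX build of nnacl's pack header (see the hunk above).
void PackNHWCToNXHWCXFp32(int kernel_h, int kernel_w, int output_channel, int oc_block_num, int input_channel,
                          float *tmp_weight, const float *src);

// Illustrative caller: allocate a zeroed packed buffer and invoke the packer.
static float *PackConvWeight(const float *origin_weight, int kernel_h, int kernel_w, int output_channel,
                             int input_channel) {
  const int oc_tile = 8;                                               // assumed AVX output-channel tile
  const int oc_block_num = (output_channel + oc_tile - 1) / oc_tile;   // UP_DIV(output_channel, oc_tile)
  size_t pack_size = (size_t)oc_block_num * oc_tile * kernel_h * kernel_w * input_channel * sizeof(float);
  float *packed_weight = (float *)malloc(pack_size);
  if (packed_weight == NULL) {
    return NULL;
  }
  memset(packed_weight, 0, pack_size);  // padded tail channels stay zero
  PackNHWCToNXHWCXFp32(kernel_h, kernel_w, output_channel, oc_block_num, input_channel, packed_weight,
                       origin_weight);
  return packed_weight;
}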