diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc index 8241f4afa4..e7337cefe8 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc @@ -213,7 +213,7 @@ kernel::LiteKernel *CpuConvFp16KernelCreator(const std::vector & kernel::LiteKernel *kernel = nullptr; if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { - kernel = new (std::nothrow) kernel::Convolution3x3FP16CPUKernel(opParameter, inputs, outputs, ctx, primitive); + kernel = new (std::nothrow) kernel::ConvolutionFP16CPUKernel(opParameter, inputs, outputs, ctx, primitive); } else if (kernel_h == 1 && kernel_w == 1) { kernel = new (std::nothrow) kernel::Convolution1x1FP16CPUKernel(opParameter, inputs, outputs, ctx, primitive); } else { diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc index fc9209ca43..4da5295f5a 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc @@ -95,12 +95,22 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() { } int Convolution1x1CPUKernel::InitConv1x1Param() { + int hw_tile = C12NUM; +#ifdef ENABLE_ARM32 + hw_tile = C4NUM; +#endif + if ((matmul_param_->row_ > (hw_tile * op_parameter_->thread_num_)) && (matmul_param_->row_ > matmul_param_->col_)) { + multi_thread_by_hw_ = true; + thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->row_, hw_tile)); + thread_stride_ = UP_DIV(UP_DIV(matmul_param_->row_, hw_tile), thread_count_) * hw_tile; + } else { + multi_thread_by_hw_ = false; + thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, C8NUM)); + thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, C8NUM), thread_count_) * C8NUM; + } + 
pre_trans_input_ = (conv_param_->pad_u_ != 0 || conv_param_->pad_l_ != 0 || conv_param_->stride_h_ != 1 || conv_param_->stride_w_ != 1); - - thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, C8NUM)); - thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, C8NUM), thread_count_) * C8NUM; - if (pre_trans_input_) { input_ptr_ = reinterpret_cast<float *>(malloc(matmul_param_->row_ * matmul_param_->deep_ * sizeof(float))); if (input_ptr_ == nullptr) { @@ -113,22 +123,6 @@ int Convolution1x1CPUKernel::InitConv1x1Param() { return RET_OK; } -void Convolution1x1CPUKernel::Pre1x1Trans(float *src_input, float *src_output) { - output_ptr_ = src_output; - - if (pre_trans_input_) { - Conv1x1InputPack(src_input, input_ptr_, conv_param_, sizeof(float)); - } else { - input_ptr_ = src_input; - } -#ifdef ENABLE_ARM32 - RowMajor2Col4Major(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_); -#else - RowMajor2Col12Major(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_); -#endif - return; -} - int Convolution1x1CPUKernel::Init() { int error_code = InitConv1x1BiasWeight(); if (error_code != RET_OK) { @@ -164,6 +158,40 @@ int Convolution1x1Run(void *cdata, int task_id) { return RET_OK; } +int Convolution1x1CPUKernel::DoConv1x1Hw(int task_id) { + int res_stride = matmul_param_->row_ - task_id * thread_stride_; + int cur_hw_ = MSMIN(thread_stride_, res_stride); + if (cur_hw_ <= 0) { + return RET_OK; + } + + float *thread_input_ptr = input_ptr_ + task_id * thread_stride_ * matmul_param_->deep_; + float *thread_pack_input = pack_input_ + task_id * thread_stride_ * matmul_param_->deep_; + +#ifdef ENABLE_ARM32 + RowMajor2Col4Major(thread_input_ptr, thread_pack_input, cur_hw_, matmul_param_->deep_); +#else + RowMajor2Col12Major(thread_input_ptr, thread_pack_input, cur_hw_, matmul_param_->deep_); +#endif + + float *thread_output_ptr = output_ptr_ + task_id * thread_stride_ * matmul_param_->col_; + MatMulOpt(thread_pack_input, weight_ptr_, 
thread_output_ptr, reinterpret_cast<float *>(bias_data_), + matmul_param_->act_type_, matmul_param_->deep_, cur_hw_, matmul_param_->col_, matmul_param_->col_, + OutType_Nhwc); + + return RET_OK; +} + +int Convolution1x1RunHw(void *cdata, int task_id) { + auto conv1x1 = reinterpret_cast<Convolution1x1CPUKernel *>(cdata); + auto error_code = conv1x1->DoConv1x1Hw(task_id); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "Convolution1x1RunHw error task_id[" << task_id << "] error_code[" << error_code << "]"; + return RET_ERROR; + } + return RET_OK; +} + int Convolution1x1CPUKernel::Run() { auto prepare_ret = Prepare(); if (prepare_ret != RET_OK) { MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret; return prepare_ret; } @@ -186,13 +214,23 @@ int Convolution1x1CPUKernel::Run() { } for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) { - Pre1x1Trans(src_in + batch_index * conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_channel_, - src_out + batch_index * matmul_param_->row_ * matmul_param_->col_); + output_ptr_ = src_out + batch_index * matmul_param_->row_ * matmul_param_->col_; + auto tmp_in = src_in + batch_index * conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_channel_; + if (pre_trans_input_) { + Conv1x1InputPack(tmp_in, input_ptr_, conv_param_, sizeof(float)); + } else { + input_ptr_ = tmp_in; + } - int error_code = ParallelLaunch(this->context_->thread_pool_, Convolution1x1Run, this, thread_count_); - if (error_code != RET_OK) { - MS_LOG(ERROR) << "conv1x1 strassen error error_code[" << error_code << "]"; - return RET_ERROR; + if (multi_thread_by_hw_) { + ParallelLaunch(this->context_->thread_pool_, Convolution1x1RunHw, this, thread_count_); + } else { +#ifdef ENABLE_ARM32 + RowMajor2Col4Major(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_); +#else + RowMajor2Col12Major(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_); +#endif + ParallelLaunch(this->context_->thread_pool_, Convolution1x1Run, this, thread_count_); } } diff --git 
a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.h index 9b32f6b9b7..1755bb1da4 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.h @@ -49,17 +49,18 @@ class Convolution1x1CPUKernel : public ConvolutionBaseCPUKernel { public: int DoConv1x1(int task_id); + int DoConv1x1Hw(int task_id); private: int InitConv1x1Param(); int InitConv1x1BiasWeight(); void InitConv1x1MatmulParam(); - void Pre1x1Trans(float *src_input, float *src_output); void FreeTmpBuffer(); private: MatMulParameter *matmul_param_ = nullptr; bool pre_trans_input_ = false; + bool multi_thread_by_hw_ = false; int thread_count_ = 0; int thread_stride_ = 0; float *weight_ptr_ = nullptr;