diff --git a/mindspore/lite/micro/cmake/file_list.cmake b/mindspore/lite/micro/cmake/file_list.cmake
index e5b5904e224..20f4c891c07 100644
--- a/mindspore/lite/micro/cmake/file_list.cmake
+++ b/mindspore/lite/micro/cmake/file_list.cmake
@@ -99,6 +99,7 @@ set(CODER_OPCODERS_SRC
     ${MICRO_DIR}/coder/opcoders/nnacl/fp32/transpose_fp32_coder.cc
     ${MICRO_DIR}/coder/opcoders/nnacl/fp32/splice_fp32_coder.cc
     ${MICRO_DIR}/coder/opcoders/nnacl/fp32/exp_fp32_coder.cc
+    ${MICRO_DIR}/coder/opcoders/nnacl/fp32/deconv2d_fp32_coder.cc
     #### nnacl int8 coder
     ${MICRO_DIR}/coder/opcoders/nnacl/int8/activation_int8_coder.cc
     ${MICRO_DIR}/coder/opcoders/nnacl/int8/add_int8_coder.cc
@@ -188,11 +189,13 @@ set(LITE_KERNEL_SRC
     ${NNACL_DIR}/fp32/winograd_utils.c
     ${NNACL_DIR}/fp32/pack_fp32.c
     ${NNACL_DIR}/fp32/arithmetic_fp32.c
+    ${NNACL_DIR}/fp32/deconv_fp32.c
+    ${NNACL_DIR}/fp32/matmul_fp32.c
+    ${NNACL_DIR}/fp32/common_func_fp32.c
     ${NNACL_DIR}/int8/quantize.c
     ${NNACL_DIR}/int8/pack_int8.c
     ${NNACL_DIR}/int8/matmul_int8.c
     ${NNACL_DIR}/int8/fixed_point.c
-    ${NNACL_DIR}/fp32/matmul_fp32.c
     ${NNACL_DIR}/int8/arithmetic_int8.c
     ${NNACL_DIR}/int8/add_int8.c
     ${NNACL_DIR}/int8/concat_int8.c
@@ -288,6 +291,8 @@ set(LITE_KERNEL_SRC
 if("${X86_64_SIMD}" STREQUAL "sse")
     set(SSE_SRC
         ${NNACL_DIR}/intrinsics/sse/MatMul_Sse.c
+        ${NNACL_DIR}/intrinsics/sse/PostFuncBiasReluC8.c
+        ${NNACL_DIR}/intrinsics/sse/PostFuncBiasReluC4.c
     )
     set_property(SOURCE ${SSE_SRC} PROPERTY LANGUAGE C)
 endif()
@@ -299,6 +304,8 @@ if("${X86_64_SIMD}" STREQUAL "avx")
     set(AVX_SRC
         ${NNACL_DIR}/intrinsics/avx/common_utils.c
         ${NNACL_DIR}/intrinsics/sse/MatMul_Sse.c
+        ${NNACL_DIR}/intrinsics/sse/PostFuncBiasReluC8.c
+        ${NNACL_DIR}/intrinsics/sse/PostFuncBiasReluC4.c
         ${NNACL_DIR}/assembly/avx/MatmulAvx.S
     )
     set_property(SOURCE ${AVX_SRC} PROPERTY LANGUAGE C)
diff --git a/mindspore/lite/micro/cmake/package_wrapper.cmake b/mindspore/lite/micro/cmake/package_wrapper.cmake
index 746d5fd460e..2b00fa96623 100644
--- a/mindspore/lite/micro/cmake/package_wrapper.cmake
+++ b/mindspore/lite/micro/cmake/package_wrapper.cmake
@@ -7,6 +7,7 @@ set(WRAPPER_SRC
     ${WRAPPER_DIR}/base/optimize_handler_wrapper.c
     ${WRAPPER_DIR}/fp32/matmul_fp32_wrapper.c
     ${WRAPPER_DIR}/fp32/arithmetic_fp32_wrapper.c
+    ${WRAPPER_DIR}/fp32/deconvolution_fp32_wrapper.c
     ${WRAPPER_DIR}/int8/matmul_int8_wrapper.c
     ${WRAPPER_DIR}/int8/add_int8_wrapper.c
     ${WRAPPER_DIR}/int8/concat_int8_wrapper.c
diff --git a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/deconv2d_fp32_coder.cc b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/deconv2d_fp32_coder.cc
new file mode 100644
index 00000000000..0b9d2e8f40d
--- /dev/null
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/deconv2d_fp32_coder.cc
@@ -0,0 +1,196 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "coder/opcoders/nnacl/fp32/deconv2d_fp32_coder.h"
+#include <string>
+#include <vector>
+#include "coder/opcoders/nnacl/fp32/convolution_winograd_fp32_coder.h"
+#include "nnacl/fp32/winograd_utils.h"
+#include "coder/opcoders/file_collector.h"
+#include "coder/log.h"
+#include "coder/opcoders/parallel.h"
+#include "src/common/version_manager.h"
+#include "coder/opcoders/nnacl/dequant/de_quant.h"
+
+using mindspore::schema::PrimitiveType_Conv2dTransposeFusion;
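+// Opcoder for fp32 2D transposed convolution (Conv2dTransposeFusion).
+// Weight/bias packing is emitted as one-time init code; the per-inference
+// code packs each input batch, runs a tiled GEMM and the deconv post step
+// via DeConvFp32Run (see wrapper/fp32/deconvolution_fp32_wrapper.c).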
+namespace mindspore::lite::micro::nnacl {
+int DeConvolutionFP32Coder::InitRunBuf() {
+  pack_output_size_ = UP_ROUND(conv_param_->output_channel_, C8NUM) * output_plane_ * sizeof(float);
+  packed_output_ = reinterpret_cast<float *>(allocator_->Malloc(kNumberTypeFloat32, pack_output_size_, kWorkspace));
+  MS_CHECK_PTR(packed_output_);
+
+  if (target_ == kARM32A) {
+    tmp_buffer_size_ = matmul_param_.row_4_ * matmul_param_.col_8_ * sizeof(float);
+  } else {
+    tmp_buffer_size_ = matmul_param_.row_12_ * matmul_param_.col_8_ * sizeof(float);
+  }
+  tmp_buffer_ = reinterpret_cast<float *>(allocator_->Malloc(kNumberTypeFloat32, tmp_buffer_size_, kWorkspace));
+  MS_CHECK_PTR(tmp_buffer_);
+
+  if (target_ == kARM32A) {
+    pack_input_size_ = matmul_param_.row_4_ * matmul_param_.deep_ * sizeof(float);
+  } else {
+    pack_input_size_ = matmul_param_.row_12_ * matmul_param_.deep_ * sizeof(float);
+  }
+  packed_input_ = reinterpret_cast<float *>(allocator_->Malloc(kNumberTypeFloat32, pack_input_size_, kWorkspace));
+  MS_CHECK_PTR(packed_input_);
+  return RET_OK;
+}
+
+// Deconv is lowered to a GEMM: the input forms an [input_plane, in_channel]
+// matrix and the packed weight an [in_channel, out_channel * kernel_plane]
+// matrix; rows are tiled by 12 (4 on ARM32) and output channels are rounded
+// up to blocks of C8NUM.
+int DeConvolutionFP32Coder::InitParam() {
+  input_plane_ = conv_param_->input_h_ * conv_param_->input_w_;
+  kernel_plane_ = conv_param_->kernel_w_ * conv_param_->kernel_h_;
+  output_plane_ = conv_param_->output_h_ * conv_param_->output_w_;
+
+  matmul_param_.row_ = input_plane_;
+  matmul_param_.deep_ = conv_param_->input_channel_;
+  matmul_param_.col_ = conv_param_->output_channel_ * kernel_plane_;
+  matmul_param_.row_12_ = UP_ROUND(matmul_param_.row_, C12NUM);
+  matmul_param_.row_4_ = UP_ROUND(matmul_param_.row_, C4NUM);
+  matmul_param_.col_8_ = UP_ROUND(conv_param_->output_channel_, C8NUM) * kernel_plane_;
+  return RET_OK;
+}
+
+int DeConvolutionFP32Coder::Prepare(CoderContext *const context) {
+  MS_CHECK_RET_CODE(Conv2DBaseCoder::Init(), "Conv2DBaseCoder::Init() failed.");
+  MS_CHECK_RET_CODE(InitWeightBias(context), "Init weight bias failed.");
+  return Resize();
+}
+
+int DeConvolutionFP32Coder::Resize() {
+  MS_CHECK_RET_CODE(Conv2DBaseCoder::Init(), "init failed.");
+  MS_CHECK_RET_CODE(InitParam(), "init param failed.");
+  MS_CHECK_RET_CODE(InitRunBuf(), "init run buffer failed.");
+  return RET_OK;
+}
+
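+// Weight and bias are packed at model-load time: the generated init code
+// allocates the buffers, copies the bias, and repacks the NHWC filter into
+// the C8HWN8 layout expected by the tiled GEMM.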
+int DeConvolutionFP32Coder::InitWeightBias(CoderContext *const context) {
+  int kernel_h = filter_tensor_->Height();
+  int kernel_w = filter_tensor_->Width();
+  int in_channel = filter_tensor_->Channel();
+  int out_channel = filter_tensor_->Batch();
+  conv_param_->input_channel_ = in_channel;
+  conv_param_->output_channel_ = out_channel;
+
+  if (input_tensors_.size() == kInputSize2) {
+    bias_data_size_ = UP_ROUND(out_channel, C4NUM) * sizeof(float);
+    packed_bias_ = reinterpret_cast<float *>(allocator_->Malloc(kNumberTypeFloat32, kOnlineSize, kOnlinePackWeight));
+    MS_CHECK_PTR(packed_bias_);
+  }
+
+  int kernel_plane = kernel_h * kernel_w;
+  int pack_weight_size = in_channel * kernel_plane;
+  pack_weight_size_ = pack_weight_size * UP_ROUND(out_channel, C8NUM) * sizeof(float);
+
+  packed_weight_ = reinterpret_cast<float *>(allocator_->Malloc(kNumberTypeFloat32, kOnlineSize, kOnlinePackWeight));
+  MS_CHECK_PTR(packed_weight_);
+
+  NNaclFp32Serializer init_code;
+  if (input_tensors_.size() == kInputSize2) {
+    init_code.CodeMallocExpression(packed_bias_, bias_data_size_);
+    init_code.CodeFunction("memset", packed_bias_, 0, bias_data_size_);
+    init_code.CodeFunction("memcpy", packed_bias_, bias_tensor_, out_channel * sizeof(float));
+  }
+
+  init_code.CodeMallocExpression(packed_weight_, pack_weight_size_);
+  init_code.CodeFunction("memset", packed_weight_, 0, pack_weight_size_);
+  init_code.CodeFunction("PackNHWCToC8HWN8Fp32", filter_tensor_, packed_weight_, in_channel, kernel_plane,
+                         out_channel);
+
+  context->AppendInitCode(init_code.str());
+  return RET_OK;
+}
+
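+// Emits the inference-path code: the work buffers are zeroed before the
+// batch loop, then each batch's input is packed into column-major tiles and
+// DeConvFp32Run is invoked, either directly (single thread) or through the
+// thread pool.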
+int DeConvolutionFP32Coder::DoCode(CoderContext *const context) {
+  Collect(context,
+          {
+            "wrapper/fp32/deconvolution_fp32_wrapper.h",
+            "nnacl/fp32/conv_common_fp32.h",
+            "nnacl/pack.h",
+            "nnacl/fp32/common_func_fp32.h",
+            "nnacl/base/minimal_filtering_generator.h",
+            "nnacl/fp32/matmul_fp32.h",
+            "nnacl/conv_parameter.h",
+            "nnacl/matmul_parameter.h",
+            "nnacl/op_base.h",
+          },
+          {
+            "deconvolution_fp32_wrapper.c",
+            "common_func_fp32.c",
+            "conv_common_fp32.c",
+            "matmul_fp32.c",
+            "pack_fp32.c",
+            "deconv_fp32.c",
+            "minimal_filtering_generator.c",
+          });
+  if (target_ == kARM32A) {
+    Collect(context, {}, {},
+            {
+              "MatmulFp32.S",
+              "MatmulFp32Opt.S",
+              "PreSum4x16Int8Peroc.S",
+              "PreSum4x16Int8Pert.S",
+              "IndirectGemmInt16to32_8x4.S",
+              "MatmulInt8.S",
+              "MatmulFp32Opt12x4.S",
+            });
+  } else if (target_ == kARM64) {
+    Collect(context, {}, {},
+            {
+              "MatmulFp32.S",
+              "MatmulFp32Opt.S",
+              "PreSum4x16Int8Peroc.S",
+              "MatVecMulFp32.S",
+              "PreSum4x16Int8Pert.S",
+              "IndirectGemmInt16to32_8x4.S",
+              "MatmulInt8.S",
+            });
+  }
+
+  NNaclFp32Serializer code;
+  // call the op function
+  code.CodeFunction("memset", packed_input_, "0", pack_input_size_);
+  code.CodeFunction("memset", packed_output_, "0", pack_output_size_);
+  code.CodeFunction("memset", tmp_buffer_, "0", tmp_buffer_size_);
+  code.CodeStruct("conv_parameter", *conv_param_);
+  code.CodeStruct("matmul_parameter", matmul_param_);
+
+  std::string src_in_ptr_str = allocator_->GetRuntimeAddr(input_tensor_);
+  std::string src_out_ptr_str = allocator_->GetRuntimeAddr(output_tensor_);
+
+  for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) {
+    input_ptr_ = src_in_ptr_str + std::to_string(batch_index * input_plane_ * conv_param_->input_channel_);
+    output_ptr_ = src_out_ptr_str + std::to_string(batch_index * output_plane_ * conv_param_->output_channel_);
+
+    if (target_ == kARM32A) {
+      code.CodeFunction("RowMajor2Col4Major", input_ptr_, packed_input_, matmul_param_.row_, matmul_param_.deep_);
+    } else {
+      code.CodeFunction("RowMajor2Col12Major", input_ptr_, packed_input_, matmul_param_.row_, matmul_param_.deep_);
+    }
+    code.CodeBaseStruct("DeConvFp32Args", kRunArgs, packed_input_, packed_weight_, packed_bias_, packed_output_,
+                        output_ptr_, tmp_buffer_, "&matmul_parameter", "&conv_parameter");
+    if (!support_parallel_) {
+      code.CodeFunction("DeConvFp32Run", kRunArgsAddr, kDefaultTaskId);
+    } else {
+      code.CodeFunction(kParallelLaunch, gThreadPool, "DeConvFp32Run", kRunArgsAddr, "conv_parameter.thread_num_");
+    }
+  }
+  context->AppendCode(code.str());
+  return RET_OK;
+}
+
+REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_Conv2dTransposeFusion,
+                   CPUOpCoderCreator<DeConvolutionFP32Coder>);
+}  // namespace mindspore::lite::micro::nnacl
diff --git a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/deconv2d_fp32_coder.h b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/deconv2d_fp32_coder.h
new file mode 100644
index 00000000000..3c3d7e442af
--- /dev/null
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/deconv2d_fp32_coder.h
@@ -0,0 +1,65 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_MICRO_CODER_OPCODERS_NNACL_FP32_DECONV2D_FP32_CODER_H_
+#define MINDSPORE_LITE_MICRO_CODER_OPCODERS_NNACL_FP32_DECONV2D_FP32_CODER_H_
+
+#include <vector>
+#include <string>
+#include "nnacl/conv_parameter.h"
+#include "coder/opcoders/base/conv2d_base_coder.h"
+#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
+#include "nnacl/fp32/deconv_fp32.h"
+#include "nnacl/fp32/matmul_fp32.h"
+
+namespace mindspore::lite::micro::nnacl {
+class DeConvolutionFP32Coder final : public Conv2DBaseCoder {
+ public:
+  DeConvolutionFP32Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
+                         const Model::Node *node, size_t node_index, Target target)
+      : Conv2DBaseCoder(in_tensors, out_tensors, node, node_index, target) {}
+
+  int Prepare(CoderContext *const context) override;
+
+  int DoCode(CoderContext *const context) override;
+
+  ~DeConvolutionFP32Coder() override = default;
+
+ private:
+  int InitWeightBias(CoderContext *const context);
+  int Resize();
+  int InitRunBuf();
+  int InitParam();
+
+  MatMulParameter matmul_param_{};
+  size_t pack_output_size_{0};
+  size_t tmp_buffer_size_{0};
+  size_t pack_input_size_{0};
+  size_t bias_data_size_{0};
+  size_t pack_weight_size_{0};
+  int input_plane_{0};
+  int kernel_plane_{0};
+  int output_plane_{0};
+  float *packed_bias_{nullptr};
+  float *packed_weight_{nullptr};
+  float *packed_input_{nullptr};
+  float *packed_output_{nullptr};
+  float *tmp_buffer_{nullptr};
+  std::string input_ptr_;
+  std::string output_ptr_;
+};
+}  // namespace mindspore::lite::micro::nnacl
+#endif  // MINDSPORE_LITE_MICRO_CODER_OPCODERS_NNACL_FP32_DECONV2D_FP32_CODER_H_
diff --git a/mindspore/lite/micro/coder/wrapper/fp32/deconvolution_fp32_wrapper.c b/mindspore/lite/micro/coder/wrapper/fp32/deconvolution_fp32_wrapper.c
new file mode 100644
index 00000000000..7789a1f631f
--- /dev/null
+++ b/mindspore/lite/micro/coder/wrapper/fp32/deconvolution_fp32_wrapper.c
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "wrapper/fp32/deconvolution_fp32_wrapper.h"
+#include "nnacl/fp32/deconv_fp32.h"
+#include "nnacl/fp32/matmul_fp32.h"
+
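+// Output channels are split across tasks in blocks of C8NUM: thread_stride
+// is the number of C8 blocks per task; oc is the number of blocks and
+// oc_res the number of channels actually left for this task, so trailing
+// tasks with no remaining work return early.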
+int DoDeconvFp32(const float *packed_input, const float *packed_weight, const float *packed_bias, float *packed_output,
+                 float *output, float *tmp_ori_buffer, const MatMulParameter *matmul_param,
+                 const ConvParameter *conv_param, int task_id) {
+  int thread_count = MSMIN(conv_param->thread_num_, UP_DIV(conv_param->output_channel_, C8NUM));
+  int thread_stride = UP_DIV(UP_DIV(conv_param->output_channel_, C8NUM), thread_count);
+  int res_stride = UP_DIV(conv_param->output_channel_, C8NUM) - task_id * thread_stride;
+  int oc = MSMIN(thread_stride, res_stride);
+  int cur_stride = thread_stride * C8NUM;
+  res_stride = conv_param->output_channel_ - task_id * thread_stride * C8NUM;
+  int oc_res = MSMIN(cur_stride, res_stride);
+  if (oc <= 0 || oc_res <= 0) {
+    return NNACL_OK;
+  }
+
+  int kernel_plane = conv_param->kernel_w_ * conv_param->kernel_h_;
+  int output_plane = conv_param->output_h_ * conv_param->output_w_;
+
+#if defined(ENABLE_ARM32)
+  float *tmp_buffer = tmp_ori_buffer + task_id * thread_stride * C8NUM * kernel_plane * matmul_param->row_4_;
+  MatMulOpt(packed_input, packed_weight + task_id * thread_stride * C8NUM * kernel_plane * matmul_param->deep_,
+            tmp_buffer, NULL, ActType_No, matmul_param->deep_, matmul_param->row_4_, oc * C8NUM * kernel_plane,
+            matmul_param->col_, OutType_C8);
+#else
+  float *tmp_buffer = tmp_ori_buffer + task_id * thread_stride * C8NUM * kernel_plane * matmul_param->row_12_;
+  MatMulOpt(packed_input, packed_weight + task_id * thread_stride * C8NUM * kernel_plane * matmul_param->deep_,
+            tmp_buffer, NULL, ActType_No, matmul_param->deep_, matmul_param->row_12_, oc * C8NUM * kernel_plane,
+            matmul_param->col_, OutType_C8);
+#endif
+
+  DeConvPostFp32C8(tmp_buffer, packed_output + task_id * thread_stride * C8NUM * output_plane,
+                   packed_bias + thread_stride * task_id * C8NUM, output + task_id * thread_stride * C8NUM, oc_res,
+                   conv_param);
+  return NNACL_OK;
+}
+
+int DeConvFp32Run(void *cdata, int task_id) {
+  DeConvFp32Args *args = (DeConvFp32Args *)cdata;
+  const MatMulParameter *matmul_param = args->matmul_param_;
+  const ConvParameter *conv_param = args->conv_param_;
+  const float *packed_input = args->packed_input_;
+  const float *packed_weight = args->packed_weight_;
+  const float *packed_bias = args->packed_bias_;
+  float *packed_output = args->packed_output_;
+  float *output = args->output_;
+  float *tmp_buffer = args->tmp_buffer_;
+  DoDeconvFp32(packed_input, packed_weight, packed_bias, packed_output, output, tmp_buffer, matmul_param, conv_param,
+               task_id);
+  return NNACL_OK;
+}
diff --git a/mindspore/lite/micro/coder/wrapper/fp32/deconvolution_fp32_wrapper.h b/mindspore/lite/micro/coder/wrapper/fp32/deconvolution_fp32_wrapper.h
new file mode 100644
index 00000000000..f8356a14d79
--- /dev/null
+++ b/mindspore/lite/micro/coder/wrapper/fp32/deconvolution_fp32_wrapper.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_MICRO_CODER_WRAPPER_DECONVOLUTION_FP32_WRAPPER_H_
+#define MINDSPORE_LITE_MICRO_CODER_WRAPPER_DECONVOLUTION_FP32_WRAPPER_H_
+
+#include "nnacl/errorcode.h"
+#include "nnacl/conv_parameter.h"
+#include "nnacl/matmul_parameter.h"
+
+typedef struct {
+  const float *packed_input_;
+  const float *packed_weight_;
+  const float *packed_bias_;
+  float *packed_output_;
+  float *output_;
+  float *tmp_buffer_;
+  const MatMulParameter *matmul_param_;
+  const ConvParameter *conv_param_;
+} DeConvFp32Args;
+
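+// DeConvFp32Run has the thread-pool callback signature used by the generated
+// code: cdata points at the DeConvFp32Args block above and task_id selects
+// the output-channel slice this invocation computes.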
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int DoDeconvFp32(const float *packed_input, const float *packed_weight, const float *packed_bias, float *packed_output,
+                 float *output, float *tmp_ori_buffer, const MatMulParameter *matmul_param,
+                 const ConvParameter *conv_param, int task_id);
+
+int DeConvFp32Run(void *cdata, int task_id);
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // MINDSPORE_LITE_MICRO_CODER_WRAPPER_DECONVOLUTION_FP32_WRAPPER_H_