diff --git a/mindspore/lite/schema/model.fbs b/mindspore/lite/schema/model.fbs index 060998e2463..b40135ebbaf 100644 --- a/mindspore/lite/schema/model.fbs +++ b/mindspore/lite/schema/model.fbs @@ -172,7 +172,8 @@ union PrimitiveType { TupleGetItem, Div, Where, - OneHot + OneHot, + Lstm } enum QuantType: int { diff --git a/mindspore/lite/schema/ops.fbs b/mindspore/lite/schema/ops.fbs index 5e993a61602..f0b50bfca03 100644 --- a/mindspore/lite/schema/ops.fbs +++ b/mindspore/lite/schema/ops.fbs @@ -718,3 +718,7 @@ table Where{ table OneHot { axis: int; } + +table Lstm{ + bidirection: bool = false; +} diff --git a/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc b/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc index 5f6db71d68c..815cccfc8dd 100644 --- a/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc +++ b/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc @@ -25,6 +25,8 @@ #ifdef ENABLE_FP16 #include "src/runtime/kernel/arm/fp16/convolution_fp16.h" #include "src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h" +#include "src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h" +#include "src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h" #endif #include "src/runtime/kernel/arm/int8/deconvolution_int8.h" #include "src/runtime/kernel/arm/int8/convolution_int8.h" @@ -347,6 +349,19 @@ kernel::LiteKernel *CpuConvDwFp32KernelCreator(const std::vector &inputs, + const std::vector &outputs, + OpParameter *opParameter, const Context *ctx) { + auto kernel = new (std::nothrow) ConvolutionDepthwiseFp16CPUKernel(opParameter, inputs, outputs, ctx); + if (kernel == nullptr) { + MS_LOG(ERROR) << "kernel is nullptr."; + return nullptr; + } + return kernel; +} +#endif + kernel::LiteKernel *CpuConvDwInt8KernelCreator(const std::vector &inputs, const std::vector &outputs, OpParameter *opParameter, const Context *ctx) { @@ -372,12 +387,12 @@ kernel::LiteKernel *CpuConvDwKernelCreator(const std::vector &inputs, + const std::vector &outputs, + OpParameter *opParameter, const lite::Context *ctx) { + auto kernel = new (std::nothrow) DeconvolutionDepthwiseFp16CPUKernel(opParameter, inputs, outputs, ctx); + if (kernel == nullptr) { + MS_LOG(ERROR) << "kernel is nullptr."; + return nullptr; + } + return kernel; +} +#endif + kernel::LiteKernel *CpuDeconvDwInt8KernelCreator(const std::vector &inputs, const std::vector &outputs, OpParameter *opParameter, const lite::Context *ctx) { @@ -432,7 +460,11 @@ kernel::LiteKernel *CpuDeconvDwKernelCreator(const std::vectorinput_channel_, C8NUM); + int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C8NUM * C8; + packed_input_ = reinterpret_cast(malloc(pack_input_size * sizeof(float16_t))); + if (packed_input_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } + memset(packed_input_, 0, pack_input_size * sizeof(float16_t)); + + // malloc pack output buffer + int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C8NUM * C8; + packed_output_ = reinterpret_cast(malloc(pack_output_size * sizeof(float16_t))); + if (packed_output_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } + memset(packed_output_, 0, pack_output_size * sizeof(float16_t)); + return RET_OK; +} + +int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() { + // init weight: o, h, w, i; o == group, i == 1 + int OC8 = UP_DIV(conv_param_->output_channel_, C8NUM); + auto weight_tensor = inputs_[kWeightIndex]; + auto origin_weight = reinterpret_cast(weight_tensor->Data()); + int pack_weight_size = C8NUM * OC8 * conv_param_->kernel_h_ * conv_param_->kernel_w_; + + packed_weight_ = reinterpret_cast(malloc(pack_weight_size * sizeof(float16_t))); + if (packed_weight_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } + memset(packed_weight_, 0, pack_weight_size * sizeof(float16_t)); + PackNCHWFp32ToNC8HW8Fp16(origin_weight, packed_weight_, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_, + conv_param_->output_channel_); + + // init bias + bias_data_ = reinterpret_cast(malloc(C8NUM * OC8 * sizeof(float16_t))); + if (bias_data_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } + memset(bias_data_, 0, C8NUM * OC8 * sizeof(float16_t)); + auto bias_fp16 = reinterpret_cast(bias_data_); + if (inputs_.size() == kInputSize2) { + auto ori_bias = reinterpret_cast(inputs_.at(kBiasIndex)->Data()); + for (int i = 0; i < conv_param_->output_channel_; i++) { + bias_fp16[i] = (float16_t)ori_bias[i]; + } + } + + conv_param_->thread_num_ = MSMIN(thread_count_, OC8); + return RET_OK; +} + +int ConvolutionDepthwiseFp16CPUKernel::Init() { + // conv base init + ConvolutionBaseCPUKernel::Init(); + + // init sliding_ window param + sliding_ = new SlidingWindowParam; + InitSlidingParam(sliding_, conv_param_, C8NUM); + + auto ret = InitWeightBias(); + if (ret != 0) { + MS_LOG(ERROR) << "Convolution depthwise fp16 InitWeightBias failed."; + return RET_ERROR; + } + + ret = InitBuffer(); + if (ret != 0) { + MS_LOG(ERROR) << "Convolution depthwise fp16 InitBuffer failed."; + return RET_ERROR; + } + return RET_OK; +} + +int ConvolutionDepthwiseFp16CPUKernel::ReSize() { + free(packed_input_); + free(packed_output_); + + ConvolutionBaseCPUKernel::Init(); + InitSlidingParam(sliding_, conv_param_, C8NUM); + + auto ret = InitBuffer(); + if (ret != 0) { + MS_LOG(ERROR) << "Convolution depthwise fp16 InitBuffer failed."; + return RET_ERROR; + } + return RET_OK; +} + +int ConvolutionDepthwiseFp16CPUKernel::Execute(int task_id) { + ConvDwC8Fp16(packed_output_, packed_input_, packed_weight_, reinterpret_cast(bias_data_), conv_param_, + sliding_, task_id); + return RET_OK; +} + +int ConvDwFp16Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { + auto conv_dw_fp16 = reinterpret_cast(cdata); + auto ret = conv_dw_fp16->Execute(task_id); + if (ret != RET_OK) { + MS_LOG(ERROR) << "ConvolutionDepthwiseFp16Run error task_id[" << task_id << "] error_code[" << ret << "]"; + return RET_ERROR; + } + return RET_OK; +} + +int ConvolutionDepthwiseFp16CPUKernel::Run() { + if (conv_param_->input_channel_ != conv_param_->output_channel_) { + MS_LOG(ERROR) << "Only support input channel equals output channel."; + return RET_ERROR; + } + + auto input_tensor = inputs_.at(kInputIndex); + auto input_addr = reinterpret_cast(input_tensor->Data()); + // pack input: to nhwc8 + PackNHWCFp32ToNHWC8Fp16(input_addr, packed_input_, conv_param_->input_batch_, + conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); + + auto ret = LiteBackendParallelLaunch(ConvDwFp16Run, this, conv_param_->thread_num_); + if (ret != RET_OK) { + MS_LOG(ERROR) << "ConvDwFp16Run error: error_code[" << ret << "]"; + return RET_ERROR; + } + + auto output_addr = reinterpret_cast(outputs_.at(kOutputIndex)->Data()); + PackNHWC8Fp16ToNHWCFp32(packed_output_, output_addr, conv_param_->output_batch_, + conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); + + return RET_OK; +} +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h new file mode 100644 index 00000000000..a66e31da31c --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h @@ -0,0 +1,54 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DEPTHWISE_FP16_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DEPTHWISE_FP16_H_ + +#include +#include "src/lite_kernel.h" +#include "src/runtime/kernel/arm/base/convolution_base.h" +#include "src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.h" + +namespace mindspore::kernel { +class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel { + public: + ConvolutionDepthwiseFp16CPUKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const Context *ctx) + : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx) {} + ~ConvolutionDepthwiseFp16CPUKernel() override { + delete sliding_; + free(packed_weight_); + free(packed_input_); + free(packed_output_); + } + + int Init() override; + int ReSize() override; + int Run() override; + + int InitBuffer(); + int InitWeightBias(); + int Execute(int task_id); + + private: + SlidingWindowParam *sliding_; + float16_t *packed_weight_; + float16_t *packed_input_; + float16_t *packed_output_; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DEPTHWISE_FP16_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc new file mode 100644 index 00000000000..f32de057814 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc @@ -0,0 +1,174 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h" +#include "schema/model_generated.h" +#include "src/kernel_registry.h" +#include "include/errorcode.h" +#include "src/runtime/runtime_api.h" + +using mindspore::kernel::KERNEL_ARCH::kCPU; +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_DepthwiseConv2D; + +namespace mindspore::kernel { +int DeconvolutionDepthwiseFp16CPUKernel::InitSlideParam() { + conv_param_->input_batch_ = outputs_.front()->shape().at(kNHWC_N); + conv_param_->input_h_ = outputs_.front()->shape().at(kNHWC_H); + conv_param_->input_w_ = outputs_.front()->shape().at(kNHWC_W); + conv_param_->input_channel_ = outputs_.front()->shape().at(kNHWC_C); + conv_param_->output_batch_ = inputs_.front()->shape().at(kNHWC_N); + conv_param_->output_h_ = inputs_.front()->shape().at(kNHWC_H); + conv_param_->output_w_ = inputs_.front()->shape().at(kNHWC_W); + conv_param_->output_channel_ = inputs_.front()->shape().at(kNHWC_C); + + // init sliding_ window param + InitSlidingParam(sliding_, conv_param_, C8NUM); + return RET_OK; +} + +int DeconvolutionDepthwiseFp16CPUKernel::InitBuffer() { + // malloc pack input buffer + int C8 = UP_DIV(conv_param_->input_channel_, C8NUM); + int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C8NUM * C8; + packed_input_ = reinterpret_cast(malloc(pack_input_size * sizeof(float16_t))); + if (packed_input_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } + memset(packed_input_, 0, pack_input_size * sizeof(float16_t)); + + int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C8NUM * C8; + packed_output_ = reinterpret_cast(malloc(pack_output_size * sizeof(float16_t))); + if (packed_output_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } + memset(packed_output_, 0, pack_output_size * sizeof(float16_t)); + return RET_OK; +} + +int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() { + // init weight: o, h, w, i; o == group, i == 1 + int OC8 = UP_DIV(conv_param_->output_channel_, C8NUM); + auto weight_tensor = inputs_[kWeightIndex]; + auto origin_weight = reinterpret_cast(weight_tensor->Data()); + int pack_weight_size = C8NUM * OC8 * conv_param_->kernel_h_ * conv_param_->kernel_w_; + + packed_weight_ = reinterpret_cast(malloc(pack_weight_size * sizeof(float16_t))); + if (packed_weight_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } + memset(packed_weight_, 0, pack_weight_size * sizeof(float16_t)); + PackNCHWFp32ToNC8HW8Fp16(origin_weight, packed_weight_, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_, + conv_param_->output_channel_); + + // init bias + bias_data_ = reinterpret_cast(malloc(C8NUM * OC8 * sizeof(float16_t))); + if (bias_data_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } + memset(bias_data_, 0, C8NUM * OC8 * sizeof(float16_t)); + if (inputs_.size() == kInputSize2) { + auto ori_bias = reinterpret_cast(inputs_.at(kBiasIndex)->Data()); + for (int i = 0; i < conv_param_->output_channel_; i++) { + reinterpret_cast(bias_data_)[i] = (float16_t)ori_bias[i]; + } + } + + conv_param_->thread_num_ = MSMIN(thread_count_, OC8); + return RET_OK; +} + +int DeconvolutionDepthwiseFp16CPUKernel::Init() { + sliding_ = new SlidingWindowParam; + InitSlideParam(); + // conv base init + ConvolutionBaseCPUKernel::Init(); + + auto ret = InitWeightBias(); + if (ret != 0) { + MS_LOG(ERROR) << "Deconvolution depthwise fp16 InitWeightBias failed."; + return RET_ERROR; + } + + ret = InitBuffer(); + if (ret != 0) { + MS_LOG(ERROR) << "Deconvolution depthwise fp16 InitBuffer failed."; + return RET_ERROR; + } + return RET_OK; +} + +int DeconvolutionDepthwiseFp16CPUKernel::ReSize() { + free(packed_input_); + free(packed_output_); + + InitSlideParam(); + ConvolutionBaseCPUKernel::Init(); + + auto ret = InitBuffer(); + if (ret != 0) { + MS_LOG(ERROR) << "Convolution depthwise fp16 InitBuffer failed."; + return RET_ERROR; + } + return RET_OK; +} + +int DeconvolutionDepthwiseFp16CPUKernel::Execute(int task_id) { + DeconvDwC8Fp16(packed_output_, packed_input_, packed_weight_, reinterpret_cast(bias_data_), conv_param_, + sliding_, task_id); + return RET_OK; +} + +int DeconvDwFp16Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { + auto deconv_dw_fp16 = reinterpret_cast(cdata); + auto ret = deconv_dw_fp16->Execute(task_id); + if (ret != RET_OK) { + MS_LOG(ERROR) << "DeconvolutionDepthwiseFp16Run error task_id[" << task_id << "] error_code[" << ret << "]"; + return RET_ERROR; + } + return RET_OK; +} + +int DeconvolutionDepthwiseFp16CPUKernel::Run() { + if (conv_param_->input_channel_ != conv_param_->output_channel_) { + MS_LOG(ERROR) << "Only support input channel equals output channel."; + return RET_ERROR; + } + + auto input_tensor = inputs_.at(kInputIndex); + auto input_addr = reinterpret_cast(input_tensor->Data()); + // pack input: to nhwc8 + PackNHWCFp32ToNHWC8Fp16(input_addr, packed_input_, conv_param_->input_batch_, + conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); + + auto ret = LiteBackendParallelLaunch(DeconvDwFp16Run, this, conv_param_->thread_num_); + if (ret != RET_OK) { + MS_LOG(ERROR) << "DeconvDwFp16Run error: error_code[" << ret << "]"; + return RET_ERROR; + } + + auto output_addr = reinterpret_cast(outputs_.at(kOutputIndex)->Data()); + PackNHWC8Fp16ToNHWCFp32(packed_output_, output_addr, conv_param_->output_batch_, + conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); + return RET_OK; +} +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h new file mode 100644 index 00000000000..51a50b5bc52 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h @@ -0,0 +1,58 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_DECONVOLUTION_DEPTHWISE_FP16_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_DECONVOLUTION_DEPTHWISE_FP16_H_ + +#include +#include "src/lite_kernel.h" +#include "src/runtime/kernel/arm/base/convolution_base.h" +#include "src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.h" + +namespace mindspore::kernel { +class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel { + public: + DeconvolutionDepthwiseFp16CPUKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const Context *ctx) + : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx) {} + ~DeconvolutionDepthwiseFp16CPUKernel() override { + delete sliding_; + free(packed_weight_); + if (need_align_) { + free(packed_input_); + free(packed_output_); + } + }; + + int Init() override; + int ReSize() override; + int Run() override; + + int InitBuffer(); + int InitWeightBias(); + int InitSlideParam(); + int Execute(int task_id); + + private: + SlidingWindowParam *sliding_; + float16_t *packed_weight_; + float16_t *packed_input_; + float16_t *packed_output_; + bool need_align_ = false; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_DECONVOLUTION_DEPTHWISE_FP16_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc index 6fcd243330d..27cb024a4bd 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc @@ -32,8 +32,8 @@ int ConvolutionDepthwiseCPUKernel::Init() { ConvolutionBaseCPUKernel::Init(); // init sliding window param - sliding = new SlidingWindowParam; - InitSlidingParam(sliding, conv_param_, C4NUM); + sliding_ = new SlidingWindowParam; + InitSlidingParam(sliding_, conv_param_, C4NUM); // pack input function: convert_func_ auto input_tensor = inputs_[kInputIndex]; @@ -97,7 +97,7 @@ int ConvolutionDepthwiseCPUKernel::ReSize() { int ConvolutionDepthwiseCPUKernel::Execute(int task_id) { ConvDwC4Fp32(packed_output_, packed_input_, packed_weight_, reinterpret_cast(bias_data_), conv_param_, - sliding, task_id); + sliding_, task_id); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.h index 88266b88dd5..0e326529f9c 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.h @@ -29,7 +29,7 @@ class ConvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel { const std::vector &outputs, const Context *ctx) : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx) {} ~ConvolutionDepthwiseCPUKernel() override { - delete sliding; + delete sliding_; free(packed_weight_); if (convert_func_ != nullptr) { free(packed_input_); @@ -46,7 +46,7 @@ class ConvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel { int Execute(int task_id); private: - SlidingWindowParam *sliding; + SlidingWindowParam *sliding_; float *packed_weight_; float *packed_input_; float *packed_output_; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc index e6963ee403c..82ca58b2a9c 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc @@ -38,8 +38,8 @@ int DeconvolutionDepthwiseCPUKernel::InitSlideParam() { conv_param_->output_channel_ = inputs_.front()->shape().at(kNHWC_C); // init sliding window param - sliding = new SlidingWindowParam; - InitSlidingParam(sliding, conv_param_, C4NUM); + sliding_ = new SlidingWindowParam; + InitSlidingParam(sliding_, conv_param_, C4NUM); return RET_OK; } @@ -110,7 +110,7 @@ int DeconvolutionDepthwiseCPUKernel::ReSize() { int DeconvolutionDepthwiseCPUKernel::DoExcute(int task_id) { DeconvDwC4Fp32(packed_output_, packed_input_, packed_weight_, reinterpret_cast(bias_data_), conv_param_, - sliding, task_id); + sliding_, task_id); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.h b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.h index ec612584fc2..f993d818c41 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.h @@ -29,7 +29,7 @@ class DeconvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel { const std::vector &outputs, const Context *ctx) : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx) {} ~DeconvolutionDepthwiseCPUKernel() override { - delete sliding; + delete sliding_; free(packed_weight_); free(packed_input_); free(packed_output_); @@ -43,7 +43,7 @@ class DeconvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel { int DoExcute(int task_id); private: - SlidingWindowParam *sliding; + SlidingWindowParam *sliding_; float *packed_weight_; float *packed_input_; float *packed_output_; diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.cc new file mode 100644 index 00000000000..9ecaa41b33b --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.cc @@ -0,0 +1,302 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.h" +#ifdef ENABLE_FP16 +#include + +/*conv depthwise fp16 begin*/ +void DepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, + int height, int width, int in_kh_step, int in_kw_step, int kernel_w, bool is_relu, + bool is_relu6) { + const float16_t *src_kh = src; + const float16_t *weight_kh = weight; + for (int kh = 0; kh < height; kh++) { + const float16_t *src_kw = src_kh; + const float16_t *weight_kw = weight_kh; + for (int kw = 0; kw < width; kw++) { + float16x8_t src_8 = vld1q_f16(src_kw); + float16x8_t weight_8 = vld1q_f16(weight_kw); + float16x8_t dst_8 = vld1q_f16(dst); + dst_8 = vfmaq_f16(dst_8, src_8, weight_8); + vst1q_f16(dst, dst_8); + + src_kw += in_kw_step; + weight_kw += C8NUM; + } // kernel_w loop + src_kh += in_kh_step; + weight_kh += kernel_w * C8NUM; + } // kernel_h loop + for (int c = 0; c < C8NUM; c++) { + dst[c] += bias[c]; + dst[c] = (is_relu) ? (MSMAX(0, dst[c])) : (dst[c]); + dst[c] = (is_relu6) ? (MSMIN(6, MSMAX(0, dst[c]))) : (dst[c]); + } +} + +void DepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, int top, + int bottom, int left, int right, const ConvParameter *conv_param, + const SlidingWindowParam *sliding) { + float16_t *dst_h = dst + top * sliding->out_h_step_; + for (int oh = top; oh < bottom; oh++) { + int ih = oh * conv_param->stride_h_ - conv_param->pad_h_; + int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_)); + int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_)); + const float16_t *src_h = src + ih * sliding->in_h_step_; + + float16_t *dst_kernel = dst_h + left * sliding->block_channel_; + for (int ow = left; ow < right; ow++) { + int iw = ow * conv_param->stride_w_ - conv_param->pad_w_; + int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_)); + int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_)); + const float16_t *src_w = src_h + iw * sliding->block_channel_; + + const float16_t *src_kernel = src_w + start_kh * sliding->in_kh_step_ + start_kw * sliding->in_kw_step_; + const float16_t *weight_kernel = weight + (start_kh * conv_param->kernel_w_ + start_kw) * C8NUM; + + DepthwiseBorderPixelFp16(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw, + sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_, conv_param->is_relu_, + conv_param->is_relu6_); + + dst_kernel += sliding->block_channel_; + } // width loop + dst_h += sliding->out_h_step_; + } // height loop +} + +void DepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, + int height, int width, int kernel_h, int kernel_w, int out_h_step, int block_channel, + int in_sh_step, int in_sw_step, int in_kh_step, int in_kw_step, bool is_relu, bool is_relu6) { + float16_t *dst_h = dst; + const float16_t *src_h = src; + for (int oh = 0; oh < height; oh++) { + float16_t *dst_w = dst_h; + const float16_t *src_w = src_h; + for (int ow = 0; ow < width; ow++) { + const float16_t *src_kh = src_w; + const float16_t *weight_kh = weight; + for (int kh = 0; kh < kernel_h; kh++) { + const float16_t *src_kw = src_kh; + const float16_t *weight_kw = weight_kh; + for (int kw = 0; kw < kernel_w; kw++) { + float16x8_t src_8 = vld1q_f16(src_kw); + float16x8_t weight_8 = vld1q_f16(weight_kw); + float16x8_t dst_8 = vld1q_f16(dst_w); + dst_8 = vfmaq_f16(dst_8, src_8, weight_8); + vst1q_f16(dst_w, dst_8); + + src_kw += in_kw_step; + weight_kw += C8NUM; + } // kernel_w loop + src_kh += in_kh_step; + weight_kh += kernel_w * C8NUM; + } // kernel_h loop + // add biad relu + for (int c = 0; c < C8NUM; c++) { + dst_w[c] += bias[c]; + dst_w[c] = (is_relu) ? (MSMAX(0, dst_w[c])) : (dst_w[c]); + dst_w[c] = (is_relu6) ? (MSMIN(6, MSMAX(0, dst_w[c]))) : (dst_w[c]); + } + dst_w += block_channel; + src_w += in_sw_step; + } // dst_width loop + dst_h += out_h_step; + src_h += in_sh_step; + } // dst_height loop +} + +// conv depthwise fp16: sliding window +void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data, + const float16_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding, + int task_id) { + const float16_t *src = input_data; + float16_t *dst = output_data; + for (int b = 0; b < conv_param->output_batch_; b++) { + for (int oc = task_id; oc < sliding->c_block_; oc += conv_param->thread_num_) { + const float16_t *src_data = src + oc * C8NUM; + float16_t *dst_data = dst + oc * C8NUM; + const float16_t *weight = weight_data + oc * sliding->kernel_step_; + const float16_t *bias = bias_data + oc * C8NUM; + DepthwiseBorderFp16(dst_data, src_data, weight, bias, 0, sliding->top_, 0, conv_param->output_w_, conv_param, + sliding); + DepthwiseBorderFp16(dst_data, src_data, weight, bias, sliding->bottom_, conv_param->output_h_, 0, + conv_param->output_w_, conv_param, sliding); + DepthwiseBorderFp16(dst_data, src_data, weight, bias, sliding->top_, sliding->bottom_, 0, sliding->left_, + conv_param, sliding); + DepthwiseBorderFp16(dst_data, src_data, weight, bias, sliding->top_, sliding->bottom_, sliding->right_, + conv_param->output_w_, conv_param, sliding); + + if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) { + int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_; + int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_; + const float16_t *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_; + float16_t *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; + + DepthwiseCenterFp16(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, + sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_, + sliding->out_h_step_, sliding->block_channel_, sliding->in_sh_step_, sliding->in_sw_step_, + sliding->in_kh_step_, sliding->in_kw_step_, conv_param->is_relu_, conv_param->is_relu6_); + } + } // output C8 loop + src += sliding->in_step_; + dst += sliding->out_step_; + } // batch loop + // output nchwc8 +} +/*conv depthwise fp16 end*/ + +/*deconv depthwise fp16 begin*/ +void DeconvDepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float16_t *weight, int height, + int width, int in_kh_step, int in_kw_step, int kernel_w) { + float16_t *dst_kh = dst; + const float16_t *weight_kh = weight; + for (int kh = 0; kh < height; kh++) { + float16_t *dst_kw = dst_kh; + const float16_t *weight_kw = weight_kh; + for (int kw = 0; kw < width; kw++) { + float16x8_t src_8 = vld1q_f16(src); + float16x8_t weight_8 = vld1q_f16(weight_kw); + float16x8_t dst_8 = vld1q_f16(dst_kw); + dst_8 = vfmaq_f16(dst_8, src_8, weight_8); + vst1q_f16(dst_kw, dst_8); + + dst_kw += in_kw_step; + weight_kw += C8NUM; + } // kernel_w loop + dst_kh += in_kh_step; + weight_kh += kernel_w * C8NUM; + } // kernel_h loop +} + +void DeconvDepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float16_t *weight, int top, int bottom, + int left, int right, const ConvParameter *conv_param, + const SlidingWindowParam *sliding) { + const float16_t *src_h = src + top * sliding->out_h_step_; + for (int ih = top; ih < bottom; ih++) { + int oh = ih * conv_param->stride_h_ - conv_param->pad_h_; + int start_kh = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_)); + int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_)); + float16_t *dst_h = dst + oh * sliding->in_h_step_; + + const float16_t *src_kernel = src_h + left * sliding->block_channel_; + for (int iw = left; iw < right; iw++) { + int ow = iw * conv_param->stride_w_ - conv_param->pad_w_; + int start_kw = MSMAX(0, UP_DIV(-ow, conv_param->dilation_w_)); + int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->output_w_ - ow, conv_param->dilation_w_)); + float16_t *dst_w = dst_h + ow * sliding->block_channel_; + + const float16_t *weight_kernel = weight + (start_kh * conv_param->kernel_w_ + start_kw) * C8NUM; + float16_t *dst_kernel = dst_w + start_kh * sliding->in_kh_step_ + start_kw * sliding->in_kw_step_; + + DeconvDepthwiseBorderPixelFp16(dst_kernel, src_kernel, weight_kernel, end_kh - start_kh, end_kw - start_kw, + sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_); + src_kernel += sliding->block_channel_; + } // width loop + src_h += sliding->out_h_step_; + } // height loop +} + +void DeconvDepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t *weight, int height, int width, + int kernel_h, int kernel_w, int out_h_step, int block_channel, int in_sh_step, + int in_sw_step, int in_kh_step, int in_kw_step) { + float16_t *dst_h = dst; + const float16_t *src_h = src; + for (int oh = 0; oh < height; oh++) { + float16_t *dst_w = dst_h; + const float16_t *src_w = src_h; + for (int ow = 0; ow < width; ow++) { + float16_t *dst_kh = dst_w; + const float16_t *weight_kh = weight; + for (int kh = 0; kh < kernel_h; kh++) { + float16_t *dst_kw = dst_kh; + const float16_t *weight_kw = weight_kh; + for (int kw = 0; kw < kernel_w; kw++) { + float16x8_t src_8 = vld1q_f16(src_w); + float16x8_t weight_8 = vld1q_f16(weight_kw); + float16x8_t dst_8 = vld1q_f16(dst_kw); + dst_8 = vfmaq_f16(dst_8, src_8, weight_8); + vst1q_f16(dst_kw, dst_8); + + dst_kw += in_kw_step; + weight_kw += C8NUM; + } // kernel_w loop + dst_kh += in_kh_step; + weight_kh += kernel_w * C8NUM; + } // kernel_h loop + dst_w += in_sw_step; + src_w += block_channel; + } // dst_width loop + dst_h += in_sh_step; + src_h += out_h_step; + } // dst_height loop +} + +void DeconvDepthwisePostFuncFp16(float16_t *dst, const float16_t *bias, int block_channel, + const ConvParameter *conv_param) { + float16_t *dst_k = dst; + for (int k = 0; k < conv_param->output_h_ * conv_param->output_w_; k++) { + for (int c = 0; c < C8NUM; c++) { + dst_k[c] += bias[c]; + dst_k[c] = (conv_param->is_relu_) ? (MSMAX(0, dst_k[c])) : (dst_k[c]); + dst_k[c] = (conv_param->is_relu6_) ? (MSMIN(6, MSMAX(0, dst_k[c]))) : (dst_k[c]); + } + dst_k += block_channel; + } +} + +// deconv depthwise fp16: sliding window +void DeconvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data, + const float16_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding, + int task_id) { + const float16_t *src = input_data; + float16_t *dst = output_data; + for (int b = 0; b < conv_param->output_batch_; b++) { + for (int oc = task_id; oc < sliding->c_block_; oc += conv_param->thread_num_) { + const float16_t *src_data = src + oc * C8NUM; + float16_t *dst_data = dst + oc * C8NUM; + const float16_t *weight = weight_data + oc * sliding->kernel_step_; + const float16_t *bias = bias_data + oc * C8NUM; + DeconvDepthwiseBorderFp16(dst_data, src_data, weight, 0, sliding->top_, 0, conv_param->input_w_, conv_param, + sliding); + DeconvDepthwiseBorderFp16(dst_data, src_data, weight, sliding->bottom_, conv_param->input_h_, 0, + conv_param->input_w_, conv_param, sliding); + DeconvDepthwiseBorderFp16(dst_data, src_data, weight, sliding->top_, sliding->bottom_, 0, sliding->left_, + conv_param, sliding); + DeconvDepthwiseBorderFp16(dst_data, src_data, weight, sliding->top_, sliding->bottom_, sliding->right_, + conv_param->input_w_, conv_param, sliding); + + if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) { + int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_; + int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_; + float16_t *out_t = dst_data + oh_h_start * sliding->in_h_step_ + oh_w_start * sliding->block_channel_; + const float16_t *in_t = + src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; + + DeconvDepthwiseCenterFp16(out_t, in_t, weight, sliding->bottom_ - sliding->top_, + sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_, + sliding->out_h_step_, sliding->block_channel_, sliding->in_sh_step_, + sliding->in_sw_step_, sliding->in_kh_step_, sliding->in_kw_step_); + } + DeconvDepthwisePostFuncFp16(dst_data, bias, sliding->block_channel_, conv_param); + } // output C8 loop + src += sliding->in_step_; + dst += sliding->out_step_; + } // batch loop + // output nchwc8 +} +/*deconv depthwise fp16 end*/ + +#endif diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.h b/mindspore/lite/src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.h new file mode 100644 index 00000000000..a0a8cac8ebe --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.h @@ -0,0 +1,33 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP16_CONV_DEPTHWISE_FP16_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP16_CONV_DEPTHWISE_FP16_H_ + +#include "src/runtime/kernel/arm/opclib/conv_parameter.h" +#include "src/runtime/kernel/arm/opclib/fp32/conv_depthwise.h" + +#ifdef ENABLE_FP16 +void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data, + const float16_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding, + int task_id); + +void DeconvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data, + const float16_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding, + int task_id); +#endif + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP16_CONV_DEPTHWISE_FP16_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv_depthwise.h b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv_depthwise.h index a952722a9d2..bdb8e2f0ac9 100644 --- a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv_depthwise.h +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv_depthwise.h @@ -14,8 +14,8 @@ * limitations under the License. */ -#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_P32_CONV_DEPTHWISE_H_ -#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_P32_CONV_DEPTHWISE_H_ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_CONV_DEPTHWISE_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_CONV_DEPTHWISE_H_ #include "src/runtime/kernel/arm/opclib/conv_parameter.h" @@ -45,5 +45,5 @@ void ConvDwC4Fp32(float *output_data, const float *input_data, const float *weig void DeconvDwC4Fp32(float *output_data, const float *input_data, const float *weight_data, const float *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id); -#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_P32_CONV_DEPTHWISE_H_ +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_CONV_DEPTHWISE_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/pack.cc b/mindspore/lite/src/runtime/kernel/arm/opclib/pack.cc index 453d9f8afe4..8ed96919703 100644 --- a/mindspore/lite/src/runtime/kernel/arm/opclib/pack.cc +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/pack.cc @@ -292,6 +292,55 @@ void PackNC4HW4ToNCHWFp16(const void *src, void *dst, int batch, int plane, int } } } + +void PackNCHWFp32ToNC8HW8Fp16(float *src, float16_t *dst, int batch, int plane, int channel) { + int c8 = UP_DIV(channel, C8NUM); + for (int b = 0; b < batch; b++) { + int src_offset = b * plane * channel; + int dst_offset = b * plane * c8 * C8NUM; + for (int c = 0; c < channel; c++) { + int c8_block_num = c / C8NUM; + int c8_block_rem = c % C8NUM; + int src_c_offset = src_offset + c * plane; + int dst_c_offset = dst_offset + c8_block_num * plane * C8NUM; + for (int k = 0; k < plane; k++) { + int src_kernel_offset = src_c_offset + k; + int dst_kernel_offset = dst_c_offset + C8NUM * k + c8_block_rem; + (dst + dst_kernel_offset)[0] = (float16_t)(src + src_kernel_offset)[0]; + } + } + } +} + +void PackNHWCFp32ToNHWC8Fp16(float *src, float16_t *dst, int batch, int plane, int channel) { + int c8 = UP_DIV(channel, C8NUM); + int nhwc8_batch_unit_offset = c8 * C8NUM * plane; + int nhwc8_batch_offset = 0; + for (int b = 0; b < batch; b++) { + int batch_offset = b * channel * plane; + for (int i = 0; i < plane; i++) { + for (int c = 0; c < channel; c++) { + (dst + nhwc8_batch_offset + i * c8 * C8NUM)[c] = (float16_t)(src + batch_offset + i * channel)[c]; + } + } + nhwc8_batch_offset += nhwc8_batch_unit_offset; + } +} + +void PackNHWC8Fp16ToNHWCFp32(float16_t *src, float *dst, int batch, int plane, int channel) { + int c8 = UP_DIV(channel, C8NUM); + int nhwc_batch_unit_offset = channel * plane; + int nhwc_batch_offset = 0; + for (int b = 0; b < batch; b++) { + int batch_offset = b * c8 * C8NUM * plane; + for (int i = 0; i < plane; i++) { + for (int c = 0; c < channel; c++) { + (dst + nhwc_batch_offset + i * channel)[c] = (float)(src + batch_offset + i * c8 * C8NUM)[c]; + } + } + nhwc_batch_offset += nhwc_batch_unit_offset; + } +} #endif void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed_weight) { @@ -1070,7 +1119,7 @@ void PackDepthwiseInt8Input(const int8_t *src, int16_t *dst, const ConvParameter auto src_k = src_b + k * conv_param->input_channel_; auto dst_k = dst_b + k * ic4 * C4NUM; for (int c = 0; c < conv_param->input_channel_; c++) { - dst_k[c] = (int16_t)((int32_t)(src_k[c]) - input_zp); + dst_k[c] = (int16_t)(src_k[c] - input_zp); } } } @@ -1087,7 +1136,7 @@ void PackDepthwiseInt8Weight(const int8_t *origin_weight, int16_t *packed_weight for (int k = 0; k < unit; k++) { auto src_kernel = src_c + k; auto dst_kernel = dst_c + C4NUM * k + c4_block_rem; - *dst_kernel = (int16_t)((int32_t)(src_kernel[0]) - weight_zp); + *dst_kernel = (int16_t)(src_kernel[0] - weight_zp); } } } diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/pack.h b/mindspore/lite/src/runtime/kernel/arm/opclib/pack.h index 5bc02976e40..95e711851e2 100644 --- a/mindspore/lite/src/runtime/kernel/arm/opclib/pack.h +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/pack.h @@ -46,6 +46,14 @@ void PackNC4HW4ToNHWC4Fp16(const void *src, void *dst, int batch, int plane, int void PackNC4HW4ToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel); void PackNC4HW4ToNCHWFp16(const void *src, void *dst, int batch, int plane, int channel); + +void PackNC8HW8ToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel); + +void PackNCHWFp32ToNC8HW8Fp16(float *src, float16_t *dst, int batch, int plane, int channel); + +void PackNHWCFp32ToNHWC8Fp16(float *src, float16_t *dst, int batch, int plane, int channel); + +void PackNHWC8Fp16ToNHWCFp32(float16_t *src, float *dst, int batch, int plane, int channel); #endif void Im2ColPackUnitFp32(const float *input_data, ConvParameter *conv_param, float *packed_input, int real_cal_num, int block_index); @@ -163,4 +171,3 @@ inline void C4UnpackToHwcInt8(int8_t *src_ptr, int8_t *dst_ptr, int channel, int } #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_PACK_H_ - diff --git a/mindspore/lite/test/CMakeLists.txt b/mindspore/lite/test/CMakeLists.txt index f8b197e2482..e2ec61d6db6 100644 --- a/mindspore/lite/test/CMakeLists.txt +++ b/mindspore/lite/test/CMakeLists.txt @@ -72,7 +72,7 @@ else() ) endif() ### cpu kernel -file(GLOB_RECURSE KERNEL_OP_SRC +file(GLOB KERNEL_OP_SRC ${LITE_DIR}/src/runtime/kernel/arm/base/*.cc ${LITE_DIR}/src/runtime/kernel/arm/fp32/*.cc ${LITE_DIR}/src/runtime/kernel/arm/int8/*.cc @@ -103,10 +103,13 @@ if (PLATFORM_ARM32) ) endif() if (ENABLE_FP16) + file(GLOB KERNEL_OP_FP16_SRC + ${LITE_DIR}/src/runtime/kernel/arm/fp16/*.cc + ${LITE_DIR}/src/runtime/kernel/arm/opclib/fp16/*.cc + ) set(KERNEL_OP_SRC ${KERNEL_OP_SRC} - ${LITE_DIR}/src/runtime/kernel/arm/fp16/convolution_fp16.cc - ${LITE_DIR}/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc + ${KERNEL_OP_FP16_SRC} ) endif () ### gpu kernel