add lite cpu op: conv_depthwise fp16, deconv_depthwise fp16
parent 7a2bb89b30
commit bf8d3f153f
@@ -172,7 +172,8 @@ union PrimitiveType {
    TupleGetItem,
    Div,
    Where,
    OneHot
    OneHot,
    Lstm
}

enum QuantType: int {

@@ -718,3 +718,7 @@ table Where{
table OneHot {
    axis: int;
}

table Lstm{
    bidirection: bool = false;
}
@@ -25,6 +25,8 @@
#ifdef ENABLE_FP16
#include "src/runtime/kernel/arm/fp16/convolution_fp16.h"
#include "src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h"
#include "src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h"
#include "src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h"
#endif
#include "src/runtime/kernel/arm/int8/deconvolution_int8.h"
#include "src/runtime/kernel/arm/int8/convolution_int8.h"

@@ -347,6 +349,19 @@ kernel::LiteKernel *CpuConvDwFp32KernelCreator(const std::vector<lite::tensor::T
  return kernel;
}

#ifdef ENABLE_FP16
kernel::LiteKernel *CpuConvDwFp16KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                               const std::vector<lite::tensor::Tensor *> &outputs,
                                               OpParameter *opParameter, const Context *ctx) {
  auto kernel = new (std::nothrow) ConvolutionDepthwiseFp16CPUKernel(opParameter, inputs, outputs, ctx);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << "kernel is nullptr.";
    return nullptr;
  }
  return kernel;
}
#endif

kernel::LiteKernel *CpuConvDwInt8KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                               const std::vector<lite::tensor::Tensor *> &outputs,
                                               OpParameter *opParameter, const Context *ctx) {

@@ -372,12 +387,12 @@ kernel::LiteKernel *CpuConvDwKernelCreator(const std::vector<lite::tensor::Tenso
      break;
    case kNumberTypeUInt8:
      break;
#ifdef ENABLE_FP16
    case kNumberTypeFloat16:
      break;
#endif
    case kNumberTypeFloat32:
#ifdef ENABLE_FP16
      kernel = CpuConvDwFp16KernelCreator(inputs, outputs, opParameter, ctx);
#else
      kernel = CpuConvDwFp32KernelCreator(inputs, outputs, opParameter, ctx);
#endif
      break;
    default:
      break;

@@ -407,6 +422,19 @@ kernel::LiteKernel *CpuDeconvDwFp32KernelCreator(const std::vector<lite::tensor:
  return kernel;
}

#ifdef ENABLE_FP16
kernel::LiteKernel *CpuDeconvDwFp16KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                                 const std::vector<lite::tensor::Tensor *> &outputs,
                                                 OpParameter *opParameter, const lite::Context *ctx) {
  auto kernel = new (std::nothrow) DeconvolutionDepthwiseFp16CPUKernel(opParameter, inputs, outputs, ctx);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << "kernel is nullptr.";
    return nullptr;
  }
  return kernel;
}
#endif

kernel::LiteKernel *CpuDeconvDwInt8KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                                 const std::vector<lite::tensor::Tensor *> &outputs,
                                                 OpParameter *opParameter, const lite::Context *ctx) {

@@ -432,7 +460,11 @@ kernel::LiteKernel *CpuDeconvDwKernelCreator(const std::vector<lite::tensor::Ten
      kernel = CpuDeconvDwInt8KernelCreator(inputs, outputs, opParameter, ctx);
      break;
    case kNumberTypeFloat32:
#ifdef ENABLE_FP16
      kernel = CpuDeconvDwFp16KernelCreator(inputs, outputs, opParameter, ctx);
#else
      kernel = CpuDeconvDwFp32KernelCreator(inputs, outputs, opParameter, ctx);
#endif
      break;
    default:
      break;
@@ -0,0 +1,164 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
#include "src/runtime/runtime_api.h"

using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_DepthwiseConv2D;

namespace mindspore::kernel {
int ConvolutionDepthwiseFp16CPUKernel::InitBuffer() {
  // malloc pack input buffer
  int C8 = UP_DIV(conv_param_->input_channel_, C8NUM);
  int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C8NUM * C8;
  packed_input_ = reinterpret_cast<float16_t *>(malloc(pack_input_size * sizeof(float16_t)));
  if (packed_input_ == nullptr) {
    MS_LOG(ERROR) << "Malloc buffer failed.";
    return RET_ERROR;
  }
  memset(packed_input_, 0, pack_input_size * sizeof(float16_t));

  // malloc pack output buffer
  int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C8NUM * C8;
  packed_output_ = reinterpret_cast<float16_t *>(malloc(pack_output_size * sizeof(float16_t)));
  if (packed_output_ == nullptr) {
    MS_LOG(ERROR) << "Malloc buffer failed.";
    return RET_ERROR;
  }
  memset(packed_output_, 0, pack_output_size * sizeof(float16_t));
  return RET_OK;
}

int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
  // init weight: o, h, w, i; o == group, i == 1
  int OC8 = UP_DIV(conv_param_->output_channel_, C8NUM);
  auto weight_tensor = inputs_[kWeightIndex];
  auto origin_weight = reinterpret_cast<float *>(weight_tensor->Data());
  int pack_weight_size = C8NUM * OC8 * conv_param_->kernel_h_ * conv_param_->kernel_w_;

  packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t)));
  if (packed_weight_ == nullptr) {
    MS_LOG(ERROR) << "Malloc buffer failed.";
    return RET_ERROR;
  }
  memset(packed_weight_, 0, pack_weight_size * sizeof(float16_t));
  PackNCHWFp32ToNC8HW8Fp16(origin_weight, packed_weight_, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_,
                           conv_param_->output_channel_);

  // init bias
  bias_data_ = reinterpret_cast<float16_t *>(malloc(C8NUM * OC8 * sizeof(float16_t)));
  if (bias_data_ == nullptr) {
    MS_LOG(ERROR) << "Malloc buffer failed.";
    return RET_ERROR;
  }
  memset(bias_data_, 0, C8NUM * OC8 * sizeof(float16_t));
  auto bias_fp16 = reinterpret_cast<float16_t *>(bias_data_);
  if (inputs_.size() == kInputSize2) {
    auto ori_bias = reinterpret_cast<float *>(inputs_.at(kBiasIndex)->Data());
    for (int i = 0; i < conv_param_->output_channel_; i++) {
      bias_fp16[i] = (float16_t)ori_bias[i];
    }
  }

  conv_param_->thread_num_ = MSMIN(thread_count_, OC8);
  return RET_OK;
}

int ConvolutionDepthwiseFp16CPUKernel::Init() {
  // conv base init
  ConvolutionBaseCPUKernel::Init();

  // init sliding_ window param
  sliding_ = new SlidingWindowParam;
  InitSlidingParam(sliding_, conv_param_, C8NUM);

  auto ret = InitWeightBias();
  if (ret != 0) {
    MS_LOG(ERROR) << "Convolution depthwise fp16 InitWeightBias failed.";
    return RET_ERROR;
  }

  ret = InitBuffer();
  if (ret != 0) {
    MS_LOG(ERROR) << "Convolution depthwise fp16 InitBuffer failed.";
    return RET_ERROR;
  }
  return RET_OK;
}

int ConvolutionDepthwiseFp16CPUKernel::ReSize() {
  free(packed_input_);
  free(packed_output_);

  ConvolutionBaseCPUKernel::Init();
  InitSlidingParam(sliding_, conv_param_, C8NUM);

  auto ret = InitBuffer();
  if (ret != 0) {
    MS_LOG(ERROR) << "Convolution depthwise fp16 InitBuffer failed.";
    return RET_ERROR;
  }
  return RET_OK;
}

int ConvolutionDepthwiseFp16CPUKernel::Execute(int task_id) {
  ConvDwC8Fp16(packed_output_, packed_input_, packed_weight_, reinterpret_cast<float16_t *>(bias_data_), conv_param_,
               sliding_, task_id);
  return RET_OK;
}

int ConvDwFp16Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
  auto conv_dw_fp16 = reinterpret_cast<ConvolutionDepthwiseFp16CPUKernel *>(cdata);
  auto ret = conv_dw_fp16->Execute(task_id);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvolutionDepthwiseFp16Run error task_id[" << task_id << "] error_code[" << ret << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

int ConvolutionDepthwiseFp16CPUKernel::Run() {
  if (conv_param_->input_channel_ != conv_param_->output_channel_) {
    MS_LOG(ERROR) << "Only support input channel equals output channel.";
    return RET_ERROR;
  }

  auto input_tensor = inputs_.at(kInputIndex);
  auto input_addr = reinterpret_cast<float *>(input_tensor->Data());
  // pack input: to nhwc8
  PackNHWCFp32ToNHWC8Fp16(input_addr, packed_input_, conv_param_->input_batch_,
                          conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);

  auto ret = LiteBackendParallelLaunch(ConvDwFp16Run, this, conv_param_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvDwFp16Run error: error_code[" << ret << "]";
    return RET_ERROR;
  }

  auto output_addr = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data());
  PackNHWC8Fp16ToNHWCFp32(packed_output_, output_addr, conv_param_->output_batch_,
                          conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);

  return RET_OK;
}
}  // namespace mindspore::kernel
@@ -0,0 +1,54 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DEPTHWISE_FP16_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DEPTHWISE_FP16_H_

#include <vector>
#include "src/lite_kernel.h"
#include "src/runtime/kernel/arm/base/convolution_base.h"
#include "src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.h"

namespace mindspore::kernel {
class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel {
 public:
  ConvolutionDepthwiseFp16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                                    const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx)
      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx) {}
  ~ConvolutionDepthwiseFp16CPUKernel() override {
    delete sliding_;
    free(packed_weight_);
    free(packed_input_);
    free(packed_output_);
  }

  int Init() override;
  int ReSize() override;
  int Run() override;

  int InitBuffer();
  int InitWeightBias();
  int Execute(int task_id);

 private:
  SlidingWindowParam *sliding_;
  float16_t *packed_weight_;
  float16_t *packed_input_;
  float16_t *packed_output_;
};
}  // namespace mindspore::kernel

#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DEPTHWISE_FP16_H_
@@ -0,0 +1,174 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
#include "src/runtime/runtime_api.h"

using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_DepthwiseConv2D;

namespace mindspore::kernel {
int DeconvolutionDepthwiseFp16CPUKernel::InitSlideParam() {
  conv_param_->input_batch_ = outputs_.front()->shape().at(kNHWC_N);
  conv_param_->input_h_ = outputs_.front()->shape().at(kNHWC_H);
  conv_param_->input_w_ = outputs_.front()->shape().at(kNHWC_W);
  conv_param_->input_channel_ = outputs_.front()->shape().at(kNHWC_C);
  conv_param_->output_batch_ = inputs_.front()->shape().at(kNHWC_N);
  conv_param_->output_h_ = inputs_.front()->shape().at(kNHWC_H);
  conv_param_->output_w_ = inputs_.front()->shape().at(kNHWC_W);
  conv_param_->output_channel_ = inputs_.front()->shape().at(kNHWC_C);

  // init sliding_ window param
  InitSlidingParam(sliding_, conv_param_, C8NUM);
  return RET_OK;
}

int DeconvolutionDepthwiseFp16CPUKernel::InitBuffer() {
  // malloc pack input buffer
  int C8 = UP_DIV(conv_param_->input_channel_, C8NUM);
  int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C8NUM * C8;
  packed_input_ = reinterpret_cast<float16_t *>(malloc(pack_input_size * sizeof(float16_t)));
  if (packed_input_ == nullptr) {
    MS_LOG(ERROR) << "Malloc buffer failed.";
    return RET_ERROR;
  }
  memset(packed_input_, 0, pack_input_size * sizeof(float16_t));

  int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C8NUM * C8;
  packed_output_ = reinterpret_cast<float16_t *>(malloc(pack_output_size * sizeof(float16_t)));
  if (packed_output_ == nullptr) {
    MS_LOG(ERROR) << "Malloc buffer failed.";
    return RET_ERROR;
  }
  memset(packed_output_, 0, pack_output_size * sizeof(float16_t));
  return RET_OK;
}

int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
  // init weight: o, h, w, i; o == group, i == 1
  int OC8 = UP_DIV(conv_param_->output_channel_, C8NUM);
  auto weight_tensor = inputs_[kWeightIndex];
  auto origin_weight = reinterpret_cast<float *>(weight_tensor->Data());
  int pack_weight_size = C8NUM * OC8 * conv_param_->kernel_h_ * conv_param_->kernel_w_;

  packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t)));
  if (packed_weight_ == nullptr) {
    MS_LOG(ERROR) << "Malloc buffer failed.";
    return RET_ERROR;
  }
  memset(packed_weight_, 0, pack_weight_size * sizeof(float16_t));
  PackNCHWFp32ToNC8HW8Fp16(origin_weight, packed_weight_, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_,
                           conv_param_->output_channel_);

  // init bias
  bias_data_ = reinterpret_cast<float16_t *>(malloc(C8NUM * OC8 * sizeof(float16_t)));
  if (bias_data_ == nullptr) {
    MS_LOG(ERROR) << "Malloc buffer failed.";
    return RET_ERROR;
  }
  memset(bias_data_, 0, C8NUM * OC8 * sizeof(float16_t));
  if (inputs_.size() == kInputSize2) {
    auto ori_bias = reinterpret_cast<float *>(inputs_.at(kBiasIndex)->Data());
    for (int i = 0; i < conv_param_->output_channel_; i++) {
      reinterpret_cast<float16_t *>(bias_data_)[i] = (float16_t)ori_bias[i];
    }
  }

  conv_param_->thread_num_ = MSMIN(thread_count_, OC8);
  return RET_OK;
}

int DeconvolutionDepthwiseFp16CPUKernel::Init() {
  sliding_ = new SlidingWindowParam;
  InitSlideParam();
  // conv base init
  ConvolutionBaseCPUKernel::Init();

  auto ret = InitWeightBias();
  if (ret != 0) {
    MS_LOG(ERROR) << "Deconvolution depthwise fp16 InitWeightBias failed.";
    return RET_ERROR;
  }

  ret = InitBuffer();
  if (ret != 0) {
    MS_LOG(ERROR) << "Deconvolution depthwise fp16 InitBuffer failed.";
    return RET_ERROR;
  }
  return RET_OK;
}

int DeconvolutionDepthwiseFp16CPUKernel::ReSize() {
  free(packed_input_);
  free(packed_output_);

  InitSlideParam();
  ConvolutionBaseCPUKernel::Init();

  auto ret = InitBuffer();
  if (ret != 0) {
    MS_LOG(ERROR) << "Deconvolution depthwise fp16 InitBuffer failed.";
    return RET_ERROR;
  }
  return RET_OK;
}

int DeconvolutionDepthwiseFp16CPUKernel::Execute(int task_id) {
  DeconvDwC8Fp16(packed_output_, packed_input_, packed_weight_, reinterpret_cast<float16_t *>(bias_data_), conv_param_,
                 sliding_, task_id);
  return RET_OK;
}

int DeconvDwFp16Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
  auto deconv_dw_fp16 = reinterpret_cast<DeconvolutionDepthwiseFp16CPUKernel *>(cdata);
  auto ret = deconv_dw_fp16->Execute(task_id);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "DeconvolutionDepthwiseFp16Run error task_id[" << task_id << "] error_code[" << ret << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

int DeconvolutionDepthwiseFp16CPUKernel::Run() {
  if (conv_param_->input_channel_ != conv_param_->output_channel_) {
    MS_LOG(ERROR) << "Only support input channel equals output channel.";
    return RET_ERROR;
  }

  auto input_tensor = inputs_.at(kInputIndex);
  auto input_addr = reinterpret_cast<float *>(input_tensor->Data());
  // pack input: to nhwc8
  PackNHWCFp32ToNHWC8Fp16(input_addr, packed_input_, conv_param_->input_batch_,
                          conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);

  auto ret = LiteBackendParallelLaunch(DeconvDwFp16Run, this, conv_param_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "DeconvDwFp16Run error: error_code[" << ret << "]";
    return RET_ERROR;
  }

  auto output_addr = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data());
  PackNHWC8Fp16ToNHWCFp32(packed_output_, output_addr, conv_param_->output_batch_,
                          conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
  return RET_OK;
}
}  // namespace mindspore::kernel
@@ -0,0 +1,58 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_DECONVOLUTION_DEPTHWISE_FP16_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_DECONVOLUTION_DEPTHWISE_FP16_H_

#include <vector>
#include "src/lite_kernel.h"
#include "src/runtime/kernel/arm/base/convolution_base.h"
#include "src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.h"

namespace mindspore::kernel {
class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel {
 public:
  DeconvolutionDepthwiseFp16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                                      const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx)
      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx) {}
  ~DeconvolutionDepthwiseFp16CPUKernel() override {
    delete sliding_;
    free(packed_weight_);
    if (need_align_) {
      free(packed_input_);
      free(packed_output_);
    }
  };

  int Init() override;
  int ReSize() override;
  int Run() override;

  int InitBuffer();
  int InitWeightBias();
  int InitSlideParam();
  int Execute(int task_id);

 private:
  SlidingWindowParam *sliding_;
  float16_t *packed_weight_;
  float16_t *packed_input_;
  float16_t *packed_output_;
  bool need_align_ = false;
};
}  // namespace mindspore::kernel

#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_DECONVOLUTION_DEPTHWISE_FP16_H_
@@ -32,8 +32,8 @@ int ConvolutionDepthwiseCPUKernel::Init() {
  ConvolutionBaseCPUKernel::Init();

  // init sliding window param
  sliding = new SlidingWindowParam;
  InitSlidingParam(sliding, conv_param_, C4NUM);
  sliding_ = new SlidingWindowParam;
  InitSlidingParam(sliding_, conv_param_, C4NUM);

  // pack input function: convert_func_
  auto input_tensor = inputs_[kInputIndex];

@@ -97,7 +97,7 @@ int ConvolutionDepthwiseCPUKernel::ReSize() {

int ConvolutionDepthwiseCPUKernel::Execute(int task_id) {
  ConvDwC4Fp32(packed_output_, packed_input_, packed_weight_, reinterpret_cast<float *>(bias_data_), conv_param_,
               sliding, task_id);
               sliding_, task_id);
  return RET_OK;
}

@@ -29,7 +29,7 @@ class ConvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel {
                                const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx)
      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx) {}
  ~ConvolutionDepthwiseCPUKernel() override {
    delete sliding;
    delete sliding_;
    free(packed_weight_);
    if (convert_func_ != nullptr) {
      free(packed_input_);

@@ -46,7 +46,7 @@ class ConvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel {
  int Execute(int task_id);

 private:
  SlidingWindowParam *sliding;
  SlidingWindowParam *sliding_;
  float *packed_weight_;
  float *packed_input_;
  float *packed_output_;
@@ -38,8 +38,8 @@ int DeconvolutionDepthwiseCPUKernel::InitSlideParam() {
  conv_param_->output_channel_ = inputs_.front()->shape().at(kNHWC_C);

  // init sliding window param
  sliding = new SlidingWindowParam;
  InitSlidingParam(sliding, conv_param_, C4NUM);
  sliding_ = new SlidingWindowParam;
  InitSlidingParam(sliding_, conv_param_, C4NUM);
  return RET_OK;
}

@@ -110,7 +110,7 @@ int DeconvolutionDepthwiseCPUKernel::ReSize() {

int DeconvolutionDepthwiseCPUKernel::DoExcute(int task_id) {
  DeconvDwC4Fp32(packed_output_, packed_input_, packed_weight_, reinterpret_cast<float *>(bias_data_), conv_param_,
                 sliding, task_id);
                 sliding_, task_id);
  return RET_OK;
}

@@ -29,7 +29,7 @@ class DeconvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel {
                                const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx)
      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx) {}
  ~DeconvolutionDepthwiseCPUKernel() override {
    delete sliding;
    delete sliding_;
    free(packed_weight_);
    free(packed_input_);
    free(packed_output_);

@@ -43,7 +43,7 @@ class DeconvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel {
  int DoExcute(int task_id);

 private:
  SlidingWindowParam *sliding;
  SlidingWindowParam *sliding_;
  float *packed_weight_;
  float *packed_input_;
  float *packed_output_;
@@ -0,0 +1,302 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.h"
#ifdef ENABLE_FP16
#include <arm_neon.h>

/*conv depthwise fp16 begin*/
void DepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
                              int height, int width, int in_kh_step, int in_kw_step, int kernel_w, bool is_relu,
                              bool is_relu6) {
  const float16_t *src_kh = src;
  const float16_t *weight_kh = weight;
  for (int kh = 0; kh < height; kh++) {
    const float16_t *src_kw = src_kh;
    const float16_t *weight_kw = weight_kh;
    for (int kw = 0; kw < width; kw++) {
      float16x8_t src_8 = vld1q_f16(src_kw);
      float16x8_t weight_8 = vld1q_f16(weight_kw);
      float16x8_t dst_8 = vld1q_f16(dst);
      dst_8 = vfmaq_f16(dst_8, src_8, weight_8);
      vst1q_f16(dst, dst_8);

      src_kw += in_kw_step;
      weight_kw += C8NUM;
    }  // kernel_w loop
    src_kh += in_kh_step;
    weight_kh += kernel_w * C8NUM;
  }  // kernel_h loop
  for (int c = 0; c < C8NUM; c++) {
    dst[c] += bias[c];
    dst[c] = (is_relu) ? (MSMAX(0, dst[c])) : (dst[c]);
    dst[c] = (is_relu6) ? (MSMIN(6, MSMAX(0, dst[c]))) : (dst[c]);
  }
}

void DepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, int top,
                         int bottom, int left, int right, const ConvParameter *conv_param,
                         const SlidingWindowParam *sliding) {
  float16_t *dst_h = dst + top * sliding->out_h_step_;
  for (int oh = top; oh < bottom; oh++) {
    int ih = oh * conv_param->stride_h_ - conv_param->pad_h_;
    int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_));
    int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_));
    const float16_t *src_h = src + ih * sliding->in_h_step_;

    float16_t *dst_kernel = dst_h + left * sliding->block_channel_;
    for (int ow = left; ow < right; ow++) {
      int iw = ow * conv_param->stride_w_ - conv_param->pad_w_;
      int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_));
      int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_));
      const float16_t *src_w = src_h + iw * sliding->block_channel_;

      const float16_t *src_kernel = src_w + start_kh * sliding->in_kh_step_ + start_kw * sliding->in_kw_step_;
      const float16_t *weight_kernel = weight + (start_kh * conv_param->kernel_w_ + start_kw) * C8NUM;

      DepthwiseBorderPixelFp16(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw,
                               sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_, conv_param->is_relu_,
                               conv_param->is_relu6_);

      dst_kernel += sliding->block_channel_;
    }  // width loop
    dst_h += sliding->out_h_step_;
  }  // height loop
}

void DepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
                         int height, int width, int kernel_h, int kernel_w, int out_h_step, int block_channel,
                         int in_sh_step, int in_sw_step, int in_kh_step, int in_kw_step, bool is_relu, bool is_relu6) {
  float16_t *dst_h = dst;
  const float16_t *src_h = src;
  for (int oh = 0; oh < height; oh++) {
    float16_t *dst_w = dst_h;
    const float16_t *src_w = src_h;
    for (int ow = 0; ow < width; ow++) {
      const float16_t *src_kh = src_w;
      const float16_t *weight_kh = weight;
      for (int kh = 0; kh < kernel_h; kh++) {
        const float16_t *src_kw = src_kh;
        const float16_t *weight_kw = weight_kh;
        for (int kw = 0; kw < kernel_w; kw++) {
          float16x8_t src_8 = vld1q_f16(src_kw);
          float16x8_t weight_8 = vld1q_f16(weight_kw);
          float16x8_t dst_8 = vld1q_f16(dst_w);
          dst_8 = vfmaq_f16(dst_8, src_8, weight_8);
          vst1q_f16(dst_w, dst_8);

          src_kw += in_kw_step;
          weight_kw += C8NUM;
        }  // kernel_w loop
        src_kh += in_kh_step;
        weight_kh += kernel_w * C8NUM;
      }  // kernel_h loop
      // add bias relu
      for (int c = 0; c < C8NUM; c++) {
        dst_w[c] += bias[c];
        dst_w[c] = (is_relu) ? (MSMAX(0, dst_w[c])) : (dst_w[c]);
        dst_w[c] = (is_relu6) ? (MSMIN(6, MSMAX(0, dst_w[c]))) : (dst_w[c]);
      }
      dst_w += block_channel;
      src_w += in_sw_step;
    }  // dst_width loop
    dst_h += out_h_step;
    src_h += in_sh_step;
  }  // dst_height loop
}

// conv depthwise fp16: sliding window
void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data,
                  const float16_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding,
                  int task_id) {
  const float16_t *src = input_data;
  float16_t *dst = output_data;
  for (int b = 0; b < conv_param->output_batch_; b++) {
    for (int oc = task_id; oc < sliding->c_block_; oc += conv_param->thread_num_) {
      const float16_t *src_data = src + oc * C8NUM;
      float16_t *dst_data = dst + oc * C8NUM;
      const float16_t *weight = weight_data + oc * sliding->kernel_step_;
      const float16_t *bias = bias_data + oc * C8NUM;
      DepthwiseBorderFp16(dst_data, src_data, weight, bias, 0, sliding->top_, 0, conv_param->output_w_, conv_param,
                          sliding);
      DepthwiseBorderFp16(dst_data, src_data, weight, bias, sliding->bottom_, conv_param->output_h_, 0,
                          conv_param->output_w_, conv_param, sliding);
      DepthwiseBorderFp16(dst_data, src_data, weight, bias, sliding->top_, sliding->bottom_, 0, sliding->left_,
                          conv_param, sliding);
      DepthwiseBorderFp16(dst_data, src_data, weight, bias, sliding->top_, sliding->bottom_, sliding->right_,
                          conv_param->output_w_, conv_param, sliding);

      if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) {
        int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_;
        int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_;
        const float16_t *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_;
        float16_t *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;

        DepthwiseCenterFp16(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_,
                            sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_,
                            sliding->out_h_step_, sliding->block_channel_, sliding->in_sh_step_, sliding->in_sw_step_,
                            sliding->in_kh_step_, sliding->in_kw_step_, conv_param->is_relu_, conv_param->is_relu6_);
      }
    }  // output C8 loop
    src += sliding->in_step_;
    dst += sliding->out_step_;
  }  // batch loop
  // output nchwc8
}
/*conv depthwise fp16 end*/

/*deconv depthwise fp16 begin*/
void DeconvDepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float16_t *weight, int height,
                                    int width, int in_kh_step, int in_kw_step, int kernel_w) {
  float16_t *dst_kh = dst;
  const float16_t *weight_kh = weight;
  for (int kh = 0; kh < height; kh++) {
    float16_t *dst_kw = dst_kh;
    const float16_t *weight_kw = weight_kh;
    for (int kw = 0; kw < width; kw++) {
      float16x8_t src_8 = vld1q_f16(src);
      float16x8_t weight_8 = vld1q_f16(weight_kw);
      float16x8_t dst_8 = vld1q_f16(dst_kw);
      dst_8 = vfmaq_f16(dst_8, src_8, weight_8);
      vst1q_f16(dst_kw, dst_8);

      dst_kw += in_kw_step;
      weight_kw += C8NUM;
    }  // kernel_w loop
    dst_kh += in_kh_step;
    weight_kh += kernel_w * C8NUM;
  }  // kernel_h loop
}

void DeconvDepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float16_t *weight, int top, int bottom,
                               int left, int right, const ConvParameter *conv_param,
                               const SlidingWindowParam *sliding) {
  const float16_t *src_h = src + top * sliding->out_h_step_;
  for (int ih = top; ih < bottom; ih++) {
    int oh = ih * conv_param->stride_h_ - conv_param->pad_h_;
    int start_kh = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_));
    int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_));
    float16_t *dst_h = dst + oh * sliding->in_h_step_;

    const float16_t *src_kernel = src_h + left * sliding->block_channel_;
    for (int iw = left; iw < right; iw++) {
      int ow = iw * conv_param->stride_w_ - conv_param->pad_w_;
      int start_kw = MSMAX(0, UP_DIV(-ow, conv_param->dilation_w_));
      int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->output_w_ - ow, conv_param->dilation_w_));
      float16_t *dst_w = dst_h + ow * sliding->block_channel_;

      const float16_t *weight_kernel = weight + (start_kh * conv_param->kernel_w_ + start_kw) * C8NUM;
      float16_t *dst_kernel = dst_w + start_kh * sliding->in_kh_step_ + start_kw * sliding->in_kw_step_;

      DeconvDepthwiseBorderPixelFp16(dst_kernel, src_kernel, weight_kernel, end_kh - start_kh, end_kw - start_kw,
                                     sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_);
      src_kernel += sliding->block_channel_;
    }  // width loop
    src_h += sliding->out_h_step_;
  }  // height loop
}

void DeconvDepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t *weight, int height, int width,
                               int kernel_h, int kernel_w, int out_h_step, int block_channel, int in_sh_step,
                               int in_sw_step, int in_kh_step, int in_kw_step) {
  float16_t *dst_h = dst;
  const float16_t *src_h = src;
  for (int oh = 0; oh < height; oh++) {
    float16_t *dst_w = dst_h;
    const float16_t *src_w = src_h;
    for (int ow = 0; ow < width; ow++) {
      float16_t *dst_kh = dst_w;
      const float16_t *weight_kh = weight;
      for (int kh = 0; kh < kernel_h; kh++) {
        float16_t *dst_kw = dst_kh;
        const float16_t *weight_kw = weight_kh;
        for (int kw = 0; kw < kernel_w; kw++) {
          float16x8_t src_8 = vld1q_f16(src_w);
          float16x8_t weight_8 = vld1q_f16(weight_kw);
          float16x8_t dst_8 = vld1q_f16(dst_kw);
          dst_8 = vfmaq_f16(dst_8, src_8, weight_8);
          vst1q_f16(dst_kw, dst_8);

          dst_kw += in_kw_step;
          weight_kw += C8NUM;
        }  // kernel_w loop
        dst_kh += in_kh_step;
        weight_kh += kernel_w * C8NUM;
      }  // kernel_h loop
      dst_w += in_sw_step;
      src_w += block_channel;
    }  // dst_width loop
    dst_h += in_sh_step;
    src_h += out_h_step;
  }  // dst_height loop
}

void DeconvDepthwisePostFuncFp16(float16_t *dst, const float16_t *bias, int block_channel,
                                 const ConvParameter *conv_param) {
  float16_t *dst_k = dst;
  for (int k = 0; k < conv_param->output_h_ * conv_param->output_w_; k++) {
    for (int c = 0; c < C8NUM; c++) {
      dst_k[c] += bias[c];
      dst_k[c] = (conv_param->is_relu_) ? (MSMAX(0, dst_k[c])) : (dst_k[c]);
      dst_k[c] = (conv_param->is_relu6_) ? (MSMIN(6, MSMAX(0, dst_k[c]))) : (dst_k[c]);
    }
    dst_k += block_channel;
  }
}

// deconv depthwise fp16: sliding window
void DeconvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data,
                    const float16_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding,
                    int task_id) {
  const float16_t *src = input_data;
  float16_t *dst = output_data;
  for (int b = 0; b < conv_param->output_batch_; b++) {
    for (int oc = task_id; oc < sliding->c_block_; oc += conv_param->thread_num_) {
      const float16_t *src_data = src + oc * C8NUM;
      float16_t *dst_data = dst + oc * C8NUM;
      const float16_t *weight = weight_data + oc * sliding->kernel_step_;
      const float16_t *bias = bias_data + oc * C8NUM;
      DeconvDepthwiseBorderFp16(dst_data, src_data, weight, 0, sliding->top_, 0, conv_param->input_w_, conv_param,
                                sliding);
      DeconvDepthwiseBorderFp16(dst_data, src_data, weight, sliding->bottom_, conv_param->input_h_, 0,
                                conv_param->input_w_, conv_param, sliding);
      DeconvDepthwiseBorderFp16(dst_data, src_data, weight, sliding->top_, sliding->bottom_, 0, sliding->left_,
                                conv_param, sliding);
      DeconvDepthwiseBorderFp16(dst_data, src_data, weight, sliding->top_, sliding->bottom_, sliding->right_,
                                conv_param->input_w_, conv_param, sliding);

      if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) {
        int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_;
        int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_;
        float16_t *out_t = dst_data + oh_h_start * sliding->in_h_step_ + oh_w_start * sliding->block_channel_;
        const float16_t *in_t =
            src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;

        DeconvDepthwiseCenterFp16(out_t, in_t, weight, sliding->bottom_ - sliding->top_,
                                  sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_,
                                  sliding->out_h_step_, sliding->block_channel_, sliding->in_sh_step_,
                                  sliding->in_sw_step_, sliding->in_kh_step_, sliding->in_kw_step_);
      }
      DeconvDepthwisePostFuncFp16(dst_data, bias, sliding->block_channel_, conv_param);
    }  // output C8 loop
    src += sliding->in_step_;
    dst += sliding->out_step_;
  }  // batch loop
  // output nchwc8
}
/*deconv depthwise fp16 end*/

#endif
@@ -0,0 +1,33 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP16_CONV_DEPTHWISE_FP16_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP16_CONV_DEPTHWISE_FP16_H_

#include "src/runtime/kernel/arm/opclib/conv_parameter.h"
#include "src/runtime/kernel/arm/opclib/fp32/conv_depthwise.h"

#ifdef ENABLE_FP16
void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data,
                  const float16_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding,
                  int task_id);

void DeconvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data,
                    const float16_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding,
                    int task_id);
#endif

#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP16_CONV_DEPTHWISE_FP16_H_
@@ -14,8 +14,8 @@
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_P32_CONV_DEPTHWISE_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_P32_CONV_DEPTHWISE_H_
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_CONV_DEPTHWISE_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_CONV_DEPTHWISE_H_

#include "src/runtime/kernel/arm/opclib/conv_parameter.h"

@@ -45,5 +45,5 @@ void ConvDwC4Fp32(float *output_data, const float *input_data, const float *weig
void DeconvDwC4Fp32(float *output_data, const float *input_data, const float *weight_data, const float *bias_data,
                    const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id);

#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_P32_CONV_DEPTHWISE_H_
#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_CONV_DEPTHWISE_H_
@@ -292,6 +292,55 @@ void PackNC4HW4ToNCHWFp16(const void *src, void *dst, int batch, int plane, int
    }
  }
}

void PackNCHWFp32ToNC8HW8Fp16(float *src, float16_t *dst, int batch, int plane, int channel) {
  int c8 = UP_DIV(channel, C8NUM);
  for (int b = 0; b < batch; b++) {
    int src_offset = b * plane * channel;
    int dst_offset = b * plane * c8 * C8NUM;
    for (int c = 0; c < channel; c++) {
      int c8_block_num = c / C8NUM;
      int c8_block_rem = c % C8NUM;
      int src_c_offset = src_offset + c * plane;
      int dst_c_offset = dst_offset + c8_block_num * plane * C8NUM;
      for (int k = 0; k < plane; k++) {
        int src_kernel_offset = src_c_offset + k;
        int dst_kernel_offset = dst_c_offset + C8NUM * k + c8_block_rem;
        (dst + dst_kernel_offset)[0] = (float16_t)(src + src_kernel_offset)[0];
      }
    }
  }
}

void PackNHWCFp32ToNHWC8Fp16(float *src, float16_t *dst, int batch, int plane, int channel) {
  int c8 = UP_DIV(channel, C8NUM);
  int nhwc8_batch_unit_offset = c8 * C8NUM * plane;
  int nhwc8_batch_offset = 0;
  for (int b = 0; b < batch; b++) {
    int batch_offset = b * channel * plane;
    for (int i = 0; i < plane; i++) {
      for (int c = 0; c < channel; c++) {
        (dst + nhwc8_batch_offset + i * c8 * C8NUM)[c] = (float16_t)(src + batch_offset + i * channel)[c];
      }
    }
    nhwc8_batch_offset += nhwc8_batch_unit_offset;
  }
}

void PackNHWC8Fp16ToNHWCFp32(float16_t *src, float *dst, int batch, int plane, int channel) {
  int c8 = UP_DIV(channel, C8NUM);
  int nhwc_batch_unit_offset = channel * plane;
  int nhwc_batch_offset = 0;
  for (int b = 0; b < batch; b++) {
    int batch_offset = b * c8 * C8NUM * plane;
    for (int i = 0; i < plane; i++) {
      for (int c = 0; c < channel; c++) {
        (dst + nhwc_batch_offset + i * channel)[c] = (float)(src + batch_offset + i * c8 * C8NUM)[c];
      }
    }
    nhwc_batch_offset += nhwc_batch_unit_offset;
  }
}
#endif

void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed_weight) {

@@ -1070,7 +1119,7 @@ void PackDepthwiseInt8Input(const int8_t *src, int16_t *dst, const ConvParameter
      auto src_k = src_b + k * conv_param->input_channel_;
      auto dst_k = dst_b + k * ic4 * C4NUM;
      for (int c = 0; c < conv_param->input_channel_; c++) {
        dst_k[c] = (int16_t)((int32_t)(src_k[c]) - input_zp);
        dst_k[c] = (int16_t)(src_k[c] - input_zp);
      }
    }
  }

@@ -1087,7 +1136,7 @@ void PackDepthwiseInt8Weight(const int8_t *origin_weight, int16_t *packed_weight
    for (int k = 0; k < unit; k++) {
      auto src_kernel = src_c + k;
      auto dst_kernel = dst_c + C4NUM * k + c4_block_rem;
      *dst_kernel = (int16_t)((int32_t)(src_kernel[0]) - weight_zp);
      *dst_kernel = (int16_t)(src_kernel[0] - weight_zp);
    }
  }
}
@@ -46,6 +46,14 @@ void PackNC4HW4ToNHWC4Fp16(const void *src, void *dst, int batch, int plane, int
void PackNC4HW4ToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel);

void PackNC4HW4ToNCHWFp16(const void *src, void *dst, int batch, int plane, int channel);

void PackNC8HW8ToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel);

void PackNCHWFp32ToNC8HW8Fp16(float *src, float16_t *dst, int batch, int plane, int channel);

void PackNHWCFp32ToNHWC8Fp16(float *src, float16_t *dst, int batch, int plane, int channel);

void PackNHWC8Fp16ToNHWCFp32(float16_t *src, float *dst, int batch, int plane, int channel);
#endif
void Im2ColPackUnitFp32(const float *input_data, ConvParameter *conv_param, float *packed_input, int real_cal_num,
                        int block_index);

@@ -163,4 +171,3 @@ inline void C4UnpackToHwcInt8(int8_t *src_ptr, int8_t *dst_ptr, int channel, int
}

#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_PACK_H_
@@ -72,7 +72,7 @@ else()
    )
endif()
### cpu kernel
file(GLOB_RECURSE KERNEL_OP_SRC
file(GLOB KERNEL_OP_SRC
        ${LITE_DIR}/src/runtime/kernel/arm/base/*.cc
        ${LITE_DIR}/src/runtime/kernel/arm/fp32/*.cc
        ${LITE_DIR}/src/runtime/kernel/arm/int8/*.cc

@@ -103,10 +103,13 @@ if (PLATFORM_ARM32)
    )
endif()
if (ENABLE_FP16)
    file(GLOB KERNEL_OP_FP16_SRC
            ${LITE_DIR}/src/runtime/kernel/arm/fp16/*.cc
            ${LITE_DIR}/src/runtime/kernel/arm/opclib/fp16/*.cc
    )
    set(KERNEL_OP_SRC
            ${KERNEL_OP_SRC}
            ${LITE_DIR}/src/runtime/kernel/arm/fp16/convolution_fp16.cc
            ${LITE_DIR}/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc
            ${KERNEL_OP_FP16_SRC}
    )
endif ()
### gpu kernel