forked from mindspore-Ecosystem/mindspore
deconv fp16
This commit is contained in:
parent baa6070ce0
commit e603486104
@@ -0,0 +1,210 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/runtime/kernel/arm/fp16/deconvolution_fp16.h"
#include "src/runtime/runtime_api.h"

using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_NULL_PTR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_DeConv2D;

namespace mindspore::kernel {
DeConvolutionFp16CPUKernel::~DeConvolutionFp16CPUKernel() {
  FreeParam();
  return;
}

int DeConvolutionFp16CPUKernel::ReSize() {
  FreeParam();
  InitParam();
  return RET_OK;
}

void DeConvolutionFp16CPUKernel::FreeParam() {
  if (tmp_buffer_ != nullptr) {
    free(tmp_buffer_);
    tmp_buffer_ = nullptr;
  }
  if (pack_input_ != nullptr) {
    free(pack_input_);
    pack_input_ = nullptr;
  }
  if (pack_output_ != nullptr) {
    free(pack_output_);
    pack_output_ = nullptr;
  }
  return;
}

int DeConvolutionFp16CPUKernel::InitWeightBias() {
  bias_data_ = malloc(UP_ROUND(conv_param_->output_channel_, C4NUM) * sizeof(float16_t));
  if (bias_data_ == nullptr) {
    MS_LOG(ERROR) << "deconv malloc bias_data_ error!";
    return RET_ERROR;
  }
  memset(bias_data_, 0, UP_ROUND(conv_param_->output_channel_, C4NUM) * sizeof(float16_t));
  if (in_tensors_.size() == 3) {
    Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[2]->Data()), reinterpret_cast<float16_t *>(bias_data_),
                     conv_param_->output_channel_);
  }

  size_t weight_pack_size = conv_param_->input_channel_ * conv_param_->kernel_w_ * conv_param_->kernel_h_ *
                            UP_ROUND(conv_param_->output_channel_, C8NUM) * sizeof(float16_t);
  execute_weight_ = reinterpret_cast<float16_t *>(malloc(weight_pack_size));
  if (execute_weight_ == nullptr) {
    MS_LOG(ERROR) << "deconv malloc execute_weight_ error!";
    return RET_ERROR;
  }
  memset(execute_weight_, 0, weight_pack_size);
  PackNHWCFp32ToC8HWN8Fp16(reinterpret_cast<float *>(in_tensors_[1]->Data()), execute_weight_,
                           conv_param_->input_channel_, kernel_plane_, conv_param_->output_channel_);
  return RET_OK;
}

int DeConvolutionFp16CPUKernel::InitParam() {
  input_plane_ = conv_param_->input_h_ * conv_param_->input_w_;
  kernel_plane_ = conv_param_->kernel_w_ * conv_param_->kernel_h_;
  output_plane_ = conv_param_->output_h_ * conv_param_->output_w_;

  matmul_param_->row_ = input_plane_;
  matmul_param_->deep_ = conv_param_->input_channel_;
  matmul_param_->col_ = conv_param_->output_channel_ * kernel_plane_;
  row16_ = UP_ROUND(matmul_param_->row_, 16);
  col8_ = UP_ROUND(matmul_param_->col_, 8);

  /* partition output channels into 8-channel blocks across threads */
  thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(conv_param_->output_channel_, C8NUM));
  thread_stride_ = UP_DIV(UP_DIV(conv_param_->output_channel_, C8NUM), thread_count_);

  pack_input_ = reinterpret_cast<float16_t *>(malloc(row16_ * matmul_param_->deep_ * sizeof(float16_t)));
  if (pack_input_ == nullptr) {
    MS_LOG(ERROR) << "deconv Malloc pack_input_ error!";
    return RET_ERROR;
  }

  pack_output_ = reinterpret_cast<float16_t *>(
    malloc(UP_ROUND(conv_param_->output_channel_, C8NUM) * output_plane_ * sizeof(float16_t)));
  if (pack_output_ == nullptr) {
    MS_LOG(ERROR) << "deconv Malloc pack_output_ error!";
    return RET_NULL_PTR;
  }

  tmp_buffer_ = reinterpret_cast<float16_t *>(malloc(row16_ * col8_ * sizeof(float16_t)));
  if (tmp_buffer_ == nullptr) {
    MS_LOG(ERROR) << "deconv Malloc tmp_buffer_ error!";
    return RET_ERROR;
  }

  return RET_OK;
}

int DeConvFp16Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
  auto deconv = reinterpret_cast<DeConvolutionFp16CPUKernel *>(cdata);
  auto error_code = deconv->DoDeconv(task_id);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "DeConvFp16Run error task_id[" << task_id << "] error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

int DeConvolutionFp16CPUKernel::DoDeconv(int task_id) {
  /* oc counts the 8-channel blocks handled by this task, oc_res the real channels left for it */
  int oc = MSMIN(thread_stride_, UP_DIV(conv_param_->output_channel_, C8NUM) - task_id * thread_stride_);
  int oc_res = MSMIN(thread_stride_ * C8NUM, conv_param_->output_channel_ - task_id * thread_stride_ * C8NUM);
  if (oc <= 0) {
    return RET_OK;
  }

  auto tmp_buf = tmp_buffer_ + task_id * thread_stride_ * C8NUM * kernel_plane_ * row16_;
  MatMulFp16(pack_input_, execute_weight_ + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->deep_,
             tmp_buf, nullptr, ActType_No, matmul_param_->deep_, matmul_param_->row_, oc * C8NUM * kernel_plane_, 0,
             false);
  DeConvPostFp16(tmp_buf, pack_output_ + task_id * thread_stride_ * C8NUM * output_plane_,
                 reinterpret_cast<float16_t *>(bias_data_) + task_id * thread_stride_ * C8NUM,
                 execute_output_ + task_id * thread_stride_ * C8NUM, oc_res, conv_param_);
  return RET_OK;
}

int DeConvolutionFp16CPUKernel::Init() {
  if (context_->infer_shape_interrupt_ && !context_->running_) {
    set_need_reinit();
    return RET_OK;
  }
  ConvolutionBaseCPUKernel::Init();

  int error_code = InitParam();
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "deconv InitParam error!";
    return error_code;
  }

  error_code = InitWeightBias();
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "deconv InitWeightBias error!";
    return error_code;
  }
  return RET_OK;
}

int DeConvolutionFp16CPUKernel::Run() {
  auto prepare_ret = Prepare();
  if (prepare_ret != RET_OK) {
    MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
    return prepare_ret;
  }
  ConvolutionBaseFP16CPUKernel::GetExecuteTensor();

  for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) {
    RowMajor2Col8MajorFp16(execute_input_, pack_input_, input_plane_, conv_param_->input_channel_);

    int error_code = LiteBackendParallelLaunch(DeConvFp16Run, this, thread_count_);
    if (error_code != RET_OK) {
      MS_LOG(ERROR) << "deconv fp16 run error! error_code[" << error_code << "]";
      return RET_ERROR;
    }
  }

  ConvolutionBaseFP16CPUKernel::IfCastOutput();
  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();

  return RET_OK;
}

kernel::LiteKernel *CpuDeConvFp16KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                               const std::vector<lite::tensor::Tensor *> &outputs,
                                               OpParameter *opParameter, const lite::Context *ctx,
                                               const kernel::KernelKey &desc, const lite::Primitive *primitive) {
  MS_ASSERT(opParameter != nullptr);
  MS_ASSERT(desc.type == schema::PrimitiveType_DeConv2D);
  auto kernel = new (std::nothrow) DeConvolutionFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << "kernel is nullptr.";
    return nullptr;
  }
  auto ret = kernel->Init();
  if (ret != RET_OK) {
    delete kernel;
    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
    return nullptr;
  }
  return kernel;
}

REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_DeConv2D, CpuDeConvFp16KernelCreator)
}  // namespace mindspore::kernel
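The buffer sizing and thread partition above lean on nnacl's rounding macros; the following is a small stand-alone C sketch of the assumed definitions and of how DoDeconv() splits output channels into 8-channel blocks per task (shapes and names here are illustrative, not taken from this commit):

#include <stdio.h>

/* assumed nnacl-style helpers */
#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))         /* ceil(x / y) */
#define UP_ROUND(x, y) (UP_DIV((x), (y)) * (y))        /* round x up to a multiple of y */
#define MSMIN(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
  int output_channel = 21, thread_num = 4;              /* example sizes only */
  int c8_blocks = UP_DIV(output_channel, 8);            /* 3 blocks of 8 channels, last one partial */
  int thread_count = MSMIN(thread_num, c8_blocks);      /* never more tasks than blocks */
  int thread_stride = UP_DIV(c8_blocks, thread_count);  /* blocks handled by each task */
  for (int task_id = 0; task_id < thread_count; task_id++) {
    int oc = MSMIN(thread_stride, c8_blocks - task_id * thread_stride);                   /* blocks for this task */
    int oc_res = MSMIN(thread_stride * 8, output_channel - task_id * thread_stride * 8);  /* real channels */
    printf("task %d: %d block(s), %d channel(s)\n", task_id, oc, oc_res);
  }
  return 0;
}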
@@ -0,0 +1,68 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_DECONVOLUTION_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_DECONVOLUTION_H_

#include <float.h>
#include <vector>
#include "src/lite_kernel.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
#include "schema/model_generated.h"
#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
#include "src/runtime/kernel/arm/nnacl/fp16/deconv_fp16.h"
#include "src/runtime/kernel/arm/nnacl/fp16/matmul_fp16.h"
#include "src/runtime/kernel/arm/nnacl/fp16/pack_fp16.h"
#include "src/runtime/kernel/arm/nnacl/fp16/cast_fp16.h"

namespace mindspore::kernel {
class DeConvolutionFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
 public:
  DeConvolutionFp16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                             const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
                             const lite::Primitive *primitive)
      : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {
    matmul_param_ = new MatMulParameter();
  }
  ~DeConvolutionFp16CPUKernel() override;
  int Init() override;
  int Run() override;
  int ReSize() override;

 public:
  int DoDeconv(int task_id);

 private:
  void FreeParam();
  int InitParam();
  int InitWeightBias();

 private:
  MatMulParameter *matmul_param_;
  int row16_;
  int col8_;
  int input_plane_;
  int kernel_plane_;
  int output_plane_;
  int thread_count_;
  int thread_stride_;
  float16_t *pack_input_ = nullptr;
  float16_t *pack_output_ = nullptr;
  float16_t *tmp_buffer_ = nullptr;
};
}  // namespace mindspore::kernel
#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_DECONVOLUTION_H_
@@ -24,10 +24,10 @@ MatmulFp16Neon64:
    st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
    st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64

    mov w18, #32 // sizeof(float) * 8
    mov w18, #16 // sizeof(float16_t) * 8
    mul w15, w5, w18 // block stride of lhs/rhs: sizeof(float) * 8 * depth
    mov x11, x3 // bias flag
    mov x18, #4
    mov x18, #2
    ldr x17, [sp]
    mul x17, x17, x18

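The immediates changed in this hunk are element sizes: the lhs/rhs block stride is 8 elements * depth * element width, and the per-element multiplier for the stride argument drops from 4 bytes (float) to 2 bytes (float16_t). A tiny stand-alone sketch of that arithmetic (values illustrative, not part of the commit):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  size_t depth = 96;                                  /* example depth only */
  size_t fp32_stride = 8 * sizeof(float) * depth;     /* 32 * depth -> old "mov w18, #32" */
  size_t fp16_stride = 8 * sizeof(uint16_t) * depth;  /* 16 * depth -> new "mov w18, #16" */
  printf("block stride: fp32 %zu bytes, fp16 %zu bytes\n", fp32_stride, fp16_stride);
  return 0;
}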
@@ -0,0 +1,459 @@
#ifdef __aarch64__

.text
.align 5
//.p2align 5,,15
.global PostFuncBiasReluC8Fp16
#ifndef __APPLE__
.type PostFuncBiasReluC8Fp16, %function
#endif

//void PostFuncBiasReluC8Fp16(float16_t *dst, const float16_t *src, const float16_t *bias, size_t oc8div, size_t oc8mod,
//                            size_t plane_size, size_t stride, size_t relu_type);
// x0 dst  x1 src  x2 bias
// x3 oc8div  x4 oc8mod  x5 plane_size
// x6 stride  x7 relu_type

// v0 ~ v7 value
// v16 bias data
// x24 x25 write loop tmp buf
// v26 relu6 #6; v27 relu #0
// w10 oc8 loop control
// w13 hw loop control

PostFuncBiasReluC8Fp16:
    movi v26.8h, #6
    scvtf v26.8h, v26.8h
    dup v27.8h, wzr
    mov w10, #0

Loop_C8:
    cmp w10, w3
    beq Loop_C1
    mov x25, #2 // sizeof(float16_t)
    mul x24, x10, x25
    add x25, x0, x24
    add w10, w10, #8
    mov w13, w5
    ld1 {v16.8h}, [x2], #16

Loop8x8:
    cmp w13, #8
    blt Loop_4x8
    sub w13, w13, #8
    ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64
    ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x1], #64

    fadd v0.8h, v0.8h, v16.8h
    fadd v1.8h, v1.8h, v16.8h
    fadd v2.8h, v2.8h, v16.8h
    fadd v3.8h, v3.8h, v16.8h
    fadd v4.8h, v4.8h, v16.8h
    fadd v5.8h, v5.8h, v16.8h
    fadd v6.8h, v6.8h, v16.8h
    fadd v7.8h, v7.8h, v16.8h

    cmp w7, #2
    beq Relu6_8x8
    cmp w7, #1
    beq Relu_8x8
    b Write_8x8
Relu6_8x8:
    fmin v0.8h, v0.8h, v26.8h
    fmin v1.8h, v1.8h, v26.8h
    fmin v2.8h, v2.8h, v26.8h
    fmin v3.8h, v3.8h, v26.8h
    fmin v4.8h, v4.8h, v26.8h
    fmin v5.8h, v5.8h, v26.8h
    fmin v6.8h, v6.8h, v26.8h
    fmin v7.8h, v7.8h, v26.8h
Relu_8x8:
    fmax v0.8h, v0.8h, v27.8h
    fmax v1.8h, v1.8h, v27.8h
    fmax v2.8h, v2.8h, v27.8h
    fmax v3.8h, v3.8h, v27.8h
    fmax v4.8h, v4.8h, v27.8h
    fmax v5.8h, v5.8h, v27.8h
    fmax v6.8h, v6.8h, v27.8h
    fmax v7.8h, v7.8h, v27.8h
Write_8x8:
    st1 {v0.8h}, [x25], x6
    st1 {v1.8h}, [x25], x6
    st1 {v2.8h}, [x25], x6
    st1 {v3.8h}, [x25], x6
    st1 {v4.8h}, [x25], x6
    st1 {v5.8h}, [x25], x6
    st1 {v6.8h}, [x25], x6
    st1 {v7.8h}, [x25], x6
    b Loop8x8

Loop_4x8:
    cmp w13, #4
    blt Loop_1x8
    sub w13, w13, #4
    ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64

    fadd v0.8h, v0.8h, v16.8h
    fadd v1.8h, v1.8h, v16.8h
    fadd v2.8h, v2.8h, v16.8h
    fadd v3.8h, v3.8h, v16.8h

    cmp w7, #2
    beq Relu6_4x8
    cmp w7, #1
    beq Relu_4x8
    b Write_4x8
Relu6_4x8:
    fmin v0.8h, v0.8h, v26.8h
    fmin v1.8h, v1.8h, v26.8h
    fmin v2.8h, v2.8h, v26.8h
    fmin v3.8h, v3.8h, v26.8h
Relu_4x8:
    fmax v0.8h, v0.8h, v27.8h
    fmax v1.8h, v1.8h, v27.8h
    fmax v2.8h, v2.8h, v27.8h
    fmax v3.8h, v3.8h, v27.8h
Write_4x8:
    st1 {v0.8h}, [x25], x6
    st1 {v1.8h}, [x25], x6
    st1 {v2.8h}, [x25], x6
    st1 {v3.8h}, [x25], x6

Loop_1x8:
    cmp w7, #2
    beq Relu6_1x8
    cmp w7, #1
    beq Relu_1x8
    b Write_1x8
Relu6_1x8:
    cmp w13, #0
    beq Loop_C8
    sub w13, w13, #1
    ld1 {v0.8h}, [x1], #16
    fadd v0.8h, v0.8h, v16.8h
    fmin v0.8h, v0.8h, v26.8h
    fmax v0.8h, v0.8h, v27.8h
    st1 {v0.8h}, [x25], x6
    b Relu6_1x8
Relu_1x8:
    cmp w13, #0
    beq Loop_C8
    sub w13, w13, #1
    ld1 {v0.8h}, [x1], #16
    fadd v0.8h, v0.8h, v16.8h
    fmax v0.8h, v0.8h, v27.8h
    st1 {v0.8h}, [x25], x6
    b Relu_1x8
Write_1x8:
    cmp w13, #0
    beq Loop_C8
    sub w13, w13, #1
    ld1 {v0.8h}, [x1], #16
    fadd v0.8h, v0.8h, v16.8h
    st1 {v0.8h}, [x25], x6
    b Write_1x8


Loop_C1:
    cmp x4, #0
    beq End
    mov w13, w5
    ld1 {v16.8h}, [x2], #16

    cmp x4, #1
    beq Loop_C1_1
    cmp x4, #2
    beq Loop_C1_2
    cmp x4, #3
    beq Loop_C1_3
    cmp x4, #4
    beq Loop_C1_4
    cmp x4, #5
    beq Loop_C1_5
    cmp x4, #6
    beq Loop_C1_6
    cmp x4, #7
    beq Loop_C1_7

Loop_C1_1:
    cmp w7, #2
    beq Loop_C1_1_Relu6
    cmp w7, #1
    beq Loop_C1_1_Relu
    b Loop_C1_1_Write
Loop_C1_1_Relu6:
    cmp w13, #0
    beq End
    sub w13, w13, #1
    ld1 {v0.8h}, [x1], #16
    fadd v0.8h, v0.8h, v16.8h
    fmin v0.8h, v0.8h, v26.8h
    fmax v0.8h, v0.8h, v27.8h
    st1 {v0.h}[0], [x0], x6
    b Loop_C1_1_Relu6
Loop_C1_1_Relu:
    cmp w13, #0
    beq End
    sub w13, w13, #1
    ld1 {v0.8h}, [x1], #16
    fadd v0.8h, v0.8h, v16.8h
    fmax v0.8h, v0.8h, v27.8h
    st1 {v0.h}[0], [x0], x6
    b Loop_C1_1_Relu
Loop_C1_1_Write:
    cmp w13, #0
    beq End
    sub w13, w13, #1
    ld1 {v0.8h}, [x1], #16
    fadd v0.8h, v0.8h, v16.8h
    st1 {v0.h}[0], [x0], x6
    b Loop_C1_1_Write

Loop_C1_2:
    add x24, x0, #2
    cmp w7, #2
    beq Loop_C1_2_Relu6
    cmp w7, #1
    beq Loop_C1_2_Relu
    b Loop_C1_2_Write
Loop_C1_2_Relu6:
    cmp w13, #0
    beq End
    sub w13, w13, #1
    ld1 {v0.8h}, [x1], #16
    fadd v0.8h, v0.8h, v16.8h
    fmin v0.8h, v0.8h, v26.8h
    fmax v0.8h, v0.8h, v27.8h
    st1 {v0.h}[0], [x0], x6
    st1 {v0.h}[1], [x24], x6
    b Loop_C1_2_Relu6
Loop_C1_2_Relu:
    cmp w13, #0
    beq End
    sub w13, w13, #1
    ld1 {v0.8h}, [x1], #16
    fadd v0.8h, v0.8h, v16.8h
    fmax v0.8h, v0.8h, v27.8h
    st1 {v0.h}[0], [x0], x6
    st1 {v0.h}[1], [x24], x6
    b Loop_C1_2_Relu
Loop_C1_2_Write:
    cmp w13, #0
    beq End
    sub w13, w13, #1
    ld1 {v0.8h}, [x1], #16
    fadd v0.8h, v0.8h, v16.8h
    st1 {v0.h}[0], [x0], x6
    st1 {v0.h}[1], [x24], x6
    b Loop_C1_2_Write


Loop_C1_3:
    add x24, x0, #2
    add x25, x0, #4
    cmp w7, #2
    beq Loop_C1_3_Relu6
    cmp w7, #1
    beq Loop_C1_3_Relu
    b Loop_C1_3_Write
Loop_C1_3_Relu6:
    cmp w13, #0
    beq End
    sub w13, w13, #1
    ld1 {v0.8h}, [x1], #16
    fadd v0.8h, v0.8h, v16.8h
    fmin v0.8h, v0.8h, v26.8h
    fmax v0.8h, v0.8h, v27.8h
    st1 {v0.h}[0], [x0], x6
    st1 {v0.h}[1], [x24], x6
    st1 {v0.h}[2], [x25], x6
    b Loop_C1_3_Relu6
Loop_C1_3_Relu:
    cmp w13, #0
    beq End
    sub w13, w13, #1
    ld1 {v0.8h}, [x1], #16
    fadd v0.8h, v0.8h, v16.8h
    fmax v0.8h, v0.8h, v27.8h
    st1 {v0.h}[0], [x0], x6
    st1 {v0.h}[1], [x24], x6
    st1 {v0.h}[2], [x25], x6
    b Loop_C1_3_Relu
Loop_C1_3_Write:
    cmp w13, #0
    beq End
    sub w13, w13, #1
    ld1 {v0.8h}, [x1], #16
    fadd v0.8h, v0.8h, v16.8h
    st1 {v0.h}[0], [x0], x6
    st1 {v0.h}[1], [x24], x6
    st1 {v0.h}[2], [x25], x6
    b Loop_C1_3_Write

Loop_C1_4:
    cmp w7, #2
    beq Loop_C1_4_Relu6
    cmp w7, #1
    beq Loop_C1_4_Relu
    b Loop_C1_4_Write
Loop_C1_4_Relu6:
    cmp w13, #0
    beq End
    sub w13, w13, #1
    ld1 {v0.8h}, [x1], #16
    fadd v0.8h, v0.8h, v16.8h
    fmin v0.8h, v0.8h, v26.8h
    fmax v0.8h, v0.8h, v27.8h
    st1 {v0.4h}, [x0], x6
    b Loop_C1_4_Relu6
Loop_C1_4_Relu:
    cmp w13, #0
    beq End
    sub w13, w13, #1
    ld1 {v0.8h}, [x1], #16
    fadd v0.8h, v0.8h, v16.8h
    fmax v0.8h, v0.8h, v27.8h
    st1 {v0.4h}, [x0], x6
    b Loop_C1_4_Relu
Loop_C1_4_Write:
    cmp w13, #0
    beq End
    sub w13, w13, #1
    ld1 {v0.8h}, [x1], #16
    fadd v0.8h, v0.8h, v16.8h
    st1 {v0.4h}, [x0], x6
    b Loop_C1_4_Write

Loop_C1_5:
    add x25, x0, #8
    cmp w7, #2
    beq Loop_C1_5_Relu6
    cmp w7, #1
    beq Loop_C1_5_Relu
    b Loop_C1_5_Write
Loop_C1_5_Relu6:
    cmp w13, #0
    beq End
    sub w13, w13, #1
    ld1 {v0.8h}, [x1], #16
    fadd v0.8h, v0.8h, v16.8h
    fmin v0.8h, v0.8h, v26.8h
    fmax v0.8h, v0.8h, v27.8h
    st1 {v0.4h}, [x0], x6
    st1 {v0.h}[4], [x25], x6
    b Loop_C1_5_Relu6
Loop_C1_5_Relu:
    cmp w13, #0
    beq End
    sub w13, w13, #1
    ld1 {v0.8h}, [x1], #16
    fadd v0.8h, v0.8h, v16.8h
    fmax v0.8h, v0.8h, v27.8h
    st1 {v0.4h}, [x0], x6
    st1 {v0.h}[4], [x25], x6
    b Loop_C1_5_Relu
Loop_C1_5_Write:
    cmp w13, #0
    beq End
    sub w13, w13, #1
    ld1 {v0.8h}, [x1], #16
    fadd v0.8h, v0.8h, v16.8h
    st1 {v0.4h}, [x0], x6
    st1 {v0.h}[4], [x25], x6
    b Loop_C1_5_Write

Loop_C1_6:
    add x23, x0, #8
    add x24, x0, #10
    cmp w7, #2
    beq Loop_C1_6_Relu6
    cmp w7, #1
    beq Loop_C1_6_Relu
    b Loop_C1_6_Write
Loop_C1_6_Relu6:
    cmp w13, #0
    beq End
    sub w13, w13, #1
    ld1 {v0.8h}, [x1], #16
    fadd v0.8h, v0.8h, v16.8h
    fmin v0.8h, v0.8h, v26.8h
    fmax v0.8h, v0.8h, v27.8h
    st1 {v0.4h}, [x0], x6
    st1 {v0.h}[4], [x23], x6
    st1 {v0.h}[5], [x24], x6
    b Loop_C1_6_Relu6
Loop_C1_6_Relu:
    cmp w13, #0
    beq End
    sub w13, w13, #1
    ld1 {v0.8h}, [x1], #16
    fadd v0.8h, v0.8h, v16.8h
    fmax v0.8h, v0.8h, v27.8h
    st1 {v0.4h}, [x0], x6
    st1 {v0.h}[4], [x23], x6
    st1 {v0.h}[5], [x24], x6
    b Loop_C1_6_Relu
Loop_C1_6_Write:
    cmp w13, #0
    beq End
    sub w13, w13, #1
    ld1 {v0.8h}, [x1], #16
    fadd v0.8h, v0.8h, v16.8h
    st1 {v0.4h}, [x0], x6
    st1 {v0.h}[4], [x23], x6
    st1 {v0.h}[5], [x24], x6
    b Loop_C1_6_Write

Loop_C1_7:
    add x23, x0, #8
    add x24, x0, #10
    add x25, x0, #12
    cmp w7, #2
    beq Loop_C1_7_Relu6
    cmp w7, #1
    beq Loop_C1_7_Relu
    b Loop_C1_7_Write
Loop_C1_7_Relu6:
    cmp w13, #0
    beq End
    sub w13, w13, #1
    ld1 {v0.8h}, [x1], #16
    fadd v0.8h, v0.8h, v16.8h
    fmin v0.8h, v0.8h, v26.8h
    fmax v0.8h, v0.8h, v27.8h
    st1 {v0.4h}, [x0], x6
    st1 {v0.h}[4], [x23], x6
    st1 {v0.h}[5], [x24], x6
    st1 {v0.h}[6], [x25], x6
    b Loop_C1_7_Relu6
Loop_C1_7_Relu:
    cmp w13, #0
    beq End
    sub w13, w13, #1
    ld1 {v0.8h}, [x1], #16
    fadd v0.8h, v0.8h, v16.8h
    fmax v0.8h, v0.8h, v27.8h
    st1 {v0.4h}, [x0], x6
    st1 {v0.h}[4], [x23], x6
    st1 {v0.h}[5], [x24], x6
    st1 {v0.h}[6], [x25], x6
    b Loop_C1_7_Relu
Loop_C1_7_Write:
    cmp w13, #0
    beq End
    sub w13, w13, #1
    ld1 {v0.8h}, [x1], #16
    fadd v0.8h, v0.8h, v16.8h
    st1 {v0.4h}, [x0], x6
    st1 {v0.h}[4], [x23], x6
    st1 {v0.h}[5], [x24], x6
    st1 {v0.h}[6], [x25], x6
    b Loop_C1_7_Write

End:
    ret
#endif
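For reference, a scalar mirror of what PostFuncBiasReluC8Fp16 is expected to compute, following the argument comments above; the C fallback PostConvFuncCommFp16 added in this commit is the authoritative version, and this sketch uses float instead of float16_t purely for portability:

#include <stddef.h>

/* src: C8-packed blocks, 8 values per pixel per block; dst: NHWC with `stride` bytes between pixels.
 * relu_type: 0 = none, 1 = relu, 2 = relu6. */
static void PostFuncBiasReluC8Ref(float *dst, const float *src, const float *bias, size_t oc8div, size_t oc8mod,
                                  size_t plane_size, size_t stride, size_t relu_type) {
  size_t channel = oc8div + oc8mod;
  size_t dst_pixel_stride = stride / sizeof(float); /* the caller passes the stride in bytes */
  for (size_t c = 0; c < channel; c++) {
    size_t c8div = c / 8, c8mod = c % 8;
    for (size_t hw = 0; hw < plane_size; hw++) {
      float value = src[c8div * plane_size * 8 + hw * 8 + c8mod] + bias[c];
      if (relu_type == 2 && value > 6.0f) value = 6.0f;
      if (relu_type >= 1 && value < 0.0f) value = 0.0f;
      dst[hw * dst_pixel_stride + c] = value;
    }
  }
}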
@@ -0,0 +1,119 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "nnacl/fp16/deconv_fp16.h"

void PostConvFuncCommFp16(float16_t *out_ptr, const float16_t *src_ptr_, const float16_t *bias_ptr,
                          size_t output_channel, size_t plane_size, size_t stride, bool is_relu, bool is_relu6,
                          int size) {
  for (int oc = 0; oc < output_channel; oc++) {
    int oc_div = oc / size, oc_mod = oc % size;
    for (int hw = 0; hw < plane_size; hw++) {
      int src_index = oc_div * size * plane_size + hw * size + oc_mod;
      int dst_index = hw * stride + oc;
      float16_t value = src_ptr_[src_index];
      if (bias_ptr != NULL) {
        value = value + bias_ptr[oc];
      }
      value = (is_relu || is_relu6) ? (MSMAX(0.f, value)) : (value);
      value = (is_relu6) ? (MSMIN(6.f, value)) : (value);
      out_ptr[dst_index] = value;
    }
  }
  return;
}

void PostConvFuncFp16C8(const float16_t *c8_out_ptr, float16_t *out_ptr, const float16_t *bias_ptr,
                        size_t output_channel, size_t plane_size, size_t stride, bool is_relu, bool is_relu6) {
#ifdef DEBUG_CODE
  PostConvFuncCommFp16(out_ptr, c8_out_ptr, bias_ptr, output_channel, plane_size, stride, is_relu, is_relu6, C8NUM);
#else
  size_t oc8mod = output_channel % C8NUM;
  size_t oc8div = output_channel - oc8mod;
  size_t stride_size = stride * sizeof(float16_t);
  size_t relu_type = is_relu ? 1 : 0;
  relu_type = is_relu6 ? 2 : relu_type;
  PostFuncBiasReluC8Fp16(out_ptr, c8_out_ptr, bias_ptr, oc8div, oc8mod, plane_size, stride_size, relu_type);
#endif
  return;
}

int DeConvPostFp16(const float16_t *src, float16_t *tmp, const float16_t *bias, float16_t *dst, int output_channel,
                   ConvParameter *conv_param) {
  /* row8x8-major(ih*iw x oc*kh*kw) -> row8-major(oh*ow x oc) */
  size_t input_plane = conv_param->input_w_ * conv_param->input_h_;
  size_t kernel_plane = conv_param->kernel_w_ * conv_param->kernel_h_;
  size_t output_plane = conv_param->output_w_ * conv_param->output_h_;
  int oc8 = UP_ROUND(output_channel, C8NUM);
  int in_plane16 = UP_ROUND(input_plane, 16);
  int src_iw_stride = C8NUM;
  int src_ih_stride = conv_param->input_w_ * C8NUM;
  int src_kw_stride = in_plane16 * C8NUM;
  int src_kh_stride = in_plane16 * conv_param->kernel_w_ * C8NUM;
  int dst_oh_stride = conv_param->output_w_ * C8NUM;
  int dst_ow_stride = C8NUM;
  int dst_kh_stride = conv_param->dilation_h_ * conv_param->output_w_ * C8NUM;
  int dst_kw_stride = conv_param->dilation_w_ * C8NUM;

  for (int c = 0; c < oc8; c += 8) {
    float16_t *dst_ptr = tmp + c * output_plane;
    const float16_t *src_ptr = src + c * in_plane16 * kernel_plane;
    memset(dst_ptr, 0, output_plane * C8NUM * sizeof(float16_t));

    for (int ih = 0; ih < conv_param->input_h_; ih++) {
      for (int iw = 0; iw < conv_param->input_w_; iw++) {
        int oh = ih * conv_param->stride_h_ - conv_param->pad_h_;
        int ow = iw * conv_param->stride_w_ - conv_param->pad_w_;

        int kh_start = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_));
        int kh_end = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_));
        int kw_start = MSMAX(0, UP_DIV(-ow, conv_param->dilation_w_));
        int kw_end = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->output_w_ - ow, conv_param->dilation_w_));
        for (int kh = kh_start; kh < kh_end; kh++) {
          for (int kw = kw_start; kw < kw_end; kw++) {
            int src_index = ih * src_ih_stride + iw * src_iw_stride + kh * src_kh_stride + kw * src_kw_stride;
            int dst_index = oh * dst_oh_stride + ow * dst_ow_stride + kh * dst_kh_stride + kw * dst_kw_stride;
            float16_t *tmp_dst = dst_ptr + dst_index;
            const float16_t *tmp_src = src_ptr + src_index;
#ifdef DEBUG_CODE
            for (int i = 0; i < C8NUM; i++) {
              tmp_dst[i] += tmp_src[i];
            }
#else
            asm volatile(
              "mov x0, %[tmp_src] \n"
              "mov x1, %[tmp_dst] \n"

              "ld1 {v0.8h}, [x0] \n"
              "ld1 {v1.8h}, [x1] \n"

              "fadd v0.8h, v0.8h, v1.8h \n"

              "st1 {v0.8h}, [x1] \n"

              :
              : [ tmp_src ] "r"(tmp_src), [ tmp_dst ] "r"(tmp_dst)
              : "x0", "x1", "v0", "v1");
#endif
          } /*kw*/
        }   /*kh*/
      }     /*iw*/
    }       /*ih*/
  }         /*oc8*/

  PostConvFuncFp16C8(tmp, dst, bias, output_channel, output_plane, conv_param->output_channel_, conv_param->is_relu_,
                     conv_param->is_relu6_);
  return NNACL_OK;
}
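The kh_start/kh_end and kw_start/kw_end bounds above come from requiring every tap to land inside the output: with oh = ih * stride_h - pad_h, the written row is oh + kh * dilation_h and must satisfy 0 <= oh + kh * dilation_h < output_h, which solves to the UP_DIV expressions used in DeConvPostFp16. A small stand-alone check (illustrative geometry, not from the commit):

#include <stdio.h>

#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
#define MSMAX(a, b) ((a) > (b) ? (a) : (b))
#define MSMIN(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
  int stride_h = 2, pad_h = 1, dilation_h = 1, kernel_h = 3, output_h = 8; /* example deconv geometry */
  for (int ih = 0; ih < 4; ih++) {
    int oh = ih * stride_h - pad_h;
    int kh_start = MSMAX(0, UP_DIV(-oh, dilation_h));                /* first kh with oh + kh*d >= 0 */
    int kh_end = MSMIN(kernel_h, UP_DIV(output_h - oh, dilation_h)); /* exclusive bound from oh + kh*d < output_h */
    printf("ih=%d -> oh=%d, kh in [%d, %d)\n", ih, oh, kh_start, kh_end);
  }
  return 0;
}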
@@ -0,0 +1,40 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FP16_DECONV_FP16_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FP16_DECONV_FP16_H_

#include <string.h>
#include <arm_neon.h>
#include "nnacl/conv_parameter.h"
#include "nnacl/matmul_parameter.h"
#include "nnacl/fp16/matmul_fp16.h"

#ifdef __cplusplus
extern "C" {
#endif
int DeConvPostFp16(const float16_t *src, float16_t *tmp, const float16_t *bias, float16_t *dst, int output_channel,
                   ConvParameter *conv_param);

void PostConvFuncFp16C8(const float16_t *c8_out_ptr, float16_t *out_ptr, const float16_t *bias_ptr,
                        size_t output_channel, size_t plane_size, size_t stride, bool is_relu, bool is_relu6);

void PostFuncBiasReluC8Fp16(float16_t *dst, const float16_t *src, const float16_t *bias, size_t oc8div, size_t oc8mod,
                            size_t plane_size, size_t stride, size_t relu_type);

#ifdef __cplusplus
}
#endif
#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FP16_DECONV_FP16_H_
@@ -15,10 +15,37 @@
 */

#include "nnacl/fp16/matmul_fp16.h"
void MatMul16x8(const float16_t *a, const float16_t *b, float16_t *dst, const float16_t *bias, ActType act_type,
                int deep, int row, int col, int stride, bool write_nhwc) {
  int row_16 = UP_ROUND(row, C16NUM);
  int col_8 = UP_ROUND(col, C8NUM);
  /* col16-major * row8-major => row16x8-major */
  if (write_nhwc) return;
  for (int r = 0; r < row_16; r++) {
    for (int c = 0; c < col_8; c++) {
      int r16div = r / C16NUM, r16mod = r % C16NUM;
      int c8div = c / C8NUM, c8mod = c % C8NUM;
      size_t ci = c8div * row_16 * C8NUM + r * C8NUM + c8mod;
      float16_t value = 0;
      for (int d = 0; d < deep; d++) {
        size_t ai = r16div * deep * C16NUM + d * C16NUM + r16mod;
        size_t bi = c8div * deep * C8NUM + d * C8NUM + c8mod;
        value = value + a[ai] * b[bi];
      }
      if (bias != NULL) value += bias[c];
      if (act_type == ActType_Relu6) value = MSMIN(6.0f, value);
      if (act_type != ActType_No) value = MSMAX(0.0f, value);
      dst[ci] = value;
    }
  }
  return;
}

void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type,
                int depth, int row, int col, int stride, bool write_nhwc) {
  MatmulFp16Neon64(a, b, c, bias, (int)act_type, depth, row, col, stride, write_nhwc);
  // MatmulFp16Neon64(a, b, c, bias, (int)act_type, depth, row, col, stride, write_nhwc);
  MatMul16x8(a, b, c, bias, (int)act_type, depth, row, col, stride, write_nhwc);
  return;
}

void RowMajor2Col8MajorFp16(float16_t *src_ptr, float16_t *dst_ptr, size_t row, size_t col) {
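MatMul16x8 above assumes the lhs is packed col16-major (16-row blocks, depth-major inside each block), the rhs row8-major, and the result row16x8-major; a stand-alone sketch of the index arithmetic for one (row, col, depth) coordinate, mirroring the loop above (shapes illustrative):

#include <stddef.h>
#include <stdio.h>

#define C16NUM 16
#define C8NUM 8

int main(void) {
  int deep = 32, row = 20;                    /* example shapes only */
  int row_16 = (row + 15) / 16 * 16;
  int r = 17, c = 9, d = 5;                   /* one (row, col, depth) coordinate */
  int r16div = r / C16NUM, r16mod = r % C16NUM;
  int c8div = c / C8NUM, c8mod = c % C8NUM;
  size_t ai = (size_t)r16div * deep * C16NUM + (size_t)d * C16NUM + r16mod; /* lhs: col16-major   */
  size_t bi = (size_t)c8div * deep * C8NUM + (size_t)d * C8NUM + c8mod;     /* rhs: row8-major    */
  size_t ci = (size_t)c8div * row_16 * C8NUM + (size_t)r * C8NUM + c8mod;   /* dst: row16x8-major */
  printf("a[%zu] * b[%zu] accumulates into dst[%zu]\n", ai, bi, ci);
  return 0;
}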
@@ -33,10 +33,10 @@ void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const floa
                int depth, int row, int col, int stride, bool write_nhwc);

void RowMajor2Col8MajorFp16(float16_t *src_ptr, float16_t *dst_ptr, size_t row, size_t col);
#ifdef __aarch64__
void MatmulFp16Neon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row,
                      int col, int stride, bool write_nhwc);
#endif

void MatmulFp16Neon64(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type,
                      size_t depth, size_t row, size_t col, size_t stride, bool write_nhwc);

#ifdef __cplusplus
}
#endif

@@ -369,6 +369,21 @@ void PackNHWCFp32ToNHWC8Fp16(float *src, float16_t *dst, int batch, int plane, i
  }
}

void PackNHWCFp32ToC8HWN8Fp16(float *src, float16_t *dst, int batch, int plane, int channel) {
  for (int n = 0; n < batch; n++) {
    for (int hw = 0; hw < plane; hw++) {
      for (int c = 0; c < channel; c++) {
        int c8div = c / C8NUM;
        int c8mod = c % C8NUM;
        int src_index = n * plane * channel + hw * channel + c;
        int dst_index = c8div * batch * plane * C8NUM + hw * batch * C8NUM + n * C8NUM + c8mod;
        dst[dst_index] = (float16_t)(src[src_index]);
      }
    }
  }
  return;
}

void PackNHWC8Fp16ToNHWCFp32(float16_t *src, float *dst, int batch, int plane, int channel) {
  int c8_channel = UP_DIV(channel, C8NUM) * C8NUM;
  for (int b = 0; b < batch; b++) {

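PackNHWCFp32ToC8HWN8Fp16 above regroups an NHWC tensor into 8-channel blocks laid out C8-HW-N-8, the packed-weight layout the deconv matmul consumes; a stand-alone sketch of where one element lands (shapes illustrative, not from the commit):

#include <stdio.h>

#define C8NUM 8

int main(void) {
  int batch = 4, plane = 9, channel = 12;  /* example weight shape only */
  int n = 2, hw = 5, c = 10;               /* one source element */
  int src_index = n * plane * channel + hw * channel + c;
  int c8div = c / C8NUM, c8mod = c % C8NUM;
  int dst_index = c8div * batch * plane * C8NUM + hw * batch * C8NUM + n * C8NUM + c8mod;
  printf("NHWC src[%d] -> C8HWN8 dst[%d]\n", src_index, dst_index);
  return 0;
}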
@@ -59,6 +59,8 @@ void PackNCHWFp32ToNC8HW8Fp16(float *src, float16_t *dst, int batch, int plane,

void PackNHWCFp32ToNHWC8Fp16(float *src, float16_t *dst, int batch, int plane, int channel);

void PackNHWCFp32ToC8HWN8Fp16(float *src, float16_t *dst, int batch, int plane, int channel);

void PackNHWC8Fp16ToNHWCFp32(float16_t *src, float *dst, int batch, int plane, int channel);

void PackNHWCToNHWC8Fp16(float16_t *src, float16_t *dst, int batch, int plane, int channel);

@@ -120,7 +120,7 @@ void RowMajor2Col8Major(float *src_ptr, float *dst_ptr, size_t row, size_t col)
}

void MatrixUnPackUnit(const void *src, void *dst, size_t row, size_t col, size_t src_stride, size_t dst_stride,
                      size_t data_lenth) {
                      size_t data_lenth) {
  size_t copy_size = col * data_lenth;
  size_t src_size = src_stride * data_lenth;
  size_t dst_size = dst_stride * data_lenth;
