diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
new file mode 100644
index 00000000000..529193ef5fa
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
@@ -0,0 +1,109 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h"
+#include "src/runtime/kernel/arm/nnacl/fp16/conv_fp16.h"
+#include "src/runtime/kernel/arm/nnacl/fp16/cast_fp16.h"
+#include "src/runtime/kernel/arm/nnacl/fp16/pack_fp16.h"
+#include "src/runtime/kernel/arm/fp16/layout_transform_fp16.h"
+#include "schema/model_generated.h"
+#include "src/kernel_registry.h"
+#include "include/errorcode.h"
+#include "src/runtime/runtime_api.h"
+
+using mindspore::kernel::KERNEL_ARCH::kCPU;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_Conv2D;
+
+namespace mindspore::kernel {
+// Runs the shared convolution-base initialisation and forwards its status.
+int Convolution1x1FP16CPUKernel::Init() {
+  auto ret = ConvolutionBaseCPUKernel::Init();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "ConvolutionBase init failed.";
+    return ret;
+  }
+  return RET_OK;
+}
+
+// Releases the buffers sized for the previous shape, then re-runs the convolution-base initialisation.
+int Convolution1x1FP16CPUKernel::ReSize() {
+  // Reset every pointer after free(): the destructor frees fp16_input_ / fp16_out_ whenever they are non-null, and
+  // a second ReSize() would free all three again, so a stale pointer left here ends in a double free.
+  if (fp16_out_ != nullptr) {
+    free(fp16_out_);
+    fp16_out_ = nullptr;
+  }
+  if (fp16_input_ != nullptr) {
+    free(fp16_input_);
+    fp16_input_ = nullptr;
+  }
+  if (nhwc4_input_ != nullptr) {
+    free(nhwc4_input_);
+    nhwc4_input_ = nullptr;
+  }
+
+  auto ret = ConvolutionBaseCPUKernel::Init();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "ConvolutionBase init failed.";
+    return ret;
+  }
+  return RET_OK;
+}
+
+// Per-task body. The 1x1 compute call below is still commented out, so every task currently reports success
+// without writing any output.
+int Convolution1x1FP16CPUKernel::RunImpl(int task_id) {
+  // Conv1x1Fp16(reinterpret_cast(nhwc4_input_), transformed_filter_addr_,
+  //             reinterpret_cast(bias_data_), fp16_out_, tile_buffer_, block_unit_buffer_,
+  //             tmp_dst_buffer_, tmp_out_, task_id, conv_param_);
+  return RET_OK;
+}
+
+// Task entry handed to LiteBackendParallelLaunch; cdata is the kernel instance that Run() passes as `this`.
+int Convolution1x1Fp16Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
+  auto conv = reinterpret_cast<Convolution1x1FP16CPUKernel *>(cdata);
+  auto error_code = conv->RunImpl(task_id);
+  if (error_code != RET_OK) {
+    MS_LOG(ERROR) << "Convolution1x1 Fp16 Run error task_id[" << task_id << "] error_code[" << error_code << "]";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+// Prepares the kernel, selects the fp16 execute buffers, launches RunImpl on thread_count_ tasks and finally lets
+// IfCastOutput convert the result when the output tensor is fp32.
+int Convolution1x1FP16CPUKernel::Run() {
+  auto ret = Prepare();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Prepare failed.";
+    return RET_ERROR;
+  }
+
+  ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
+
+  int error_code = LiteBackendParallelLaunch(Convolution1x1Fp16Impl, this, thread_count_);
+  if (error_code != RET_OK) {
+    MS_LOG(ERROR) << "conv1x1 fp16 error error_code[" << error_code << "]";
+    return RET_ERROR;
+  }
+
+  ConvolutionBaseFP16CPUKernel::IfCastOutput();
+  return RET_OK;
+}
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h
new file mode 100644
index 00000000000..989e1e2bc7a
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h
@@ -0,0 +1,54 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_1x1_FP16_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_1x1_FP16_H_ + +#include +#include +#include "src/lite_kernel.h" +#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h" +#include "src/runtime/kernel/arm/nnacl/optimized_kernel.h" + +namespace mindspore::kernel { +class Convolution1x1FP16CPUKernel : public ConvolutionBaseFP16CPUKernel { + public: + Convolution1x1FP16CPUKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const Context *ctx, + const lite::Primitive *primitive) + : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {} + ~Convolution1x1FP16CPUKernel() override { + if (fp16_input_ != nullptr) { + free(fp16_input_); + } + if (fp16_weight_ != nullptr) { + free(fp16_weight_); + } + if (fp16_out_ != nullptr) { + free(fp16_out_); + } + } + + int Init() override; + int ReSize() override; + int Run() override; + int RunImpl(int task_id); + + private: +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_1x1_FP16_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc index f294c96ddb0..46c443fb0ae 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc @@ -52,8 +52,6 @@ void ProcessFilterFp16(float16_t *origin_weight, float16_t *dst_weight, 
ConvPara int Convolution3x3FP16CPUKernel::InitWeightBias() { auto input_channel = conv_param_->input_channel_; int output_channel = conv_param_->output_channel_; - int kernel_h = conv_param_->kernel_h_; - int kernel_w = conv_param_->kernel_w_; int iC4 = UP_DIV(input_channel, C4NUM); int oC8 = UP_DIV(output_channel, C8NUM); // init weight @@ -64,18 +62,8 @@ int Convolution3x3FP16CPUKernel::InitWeightBias() { return RET_ERROR; } memset(transformed_filter_addr_, 0, transformed_size); - float *origin_weight = reinterpret_cast(in_tensors_.at(kWeightIndex)->Data()); - size_t fp16_weight_size = input_channel * output_channel * kernel_h * kernel_w * sizeof(float16_t); - fp16_weight_ = reinterpret_cast(malloc(fp16_weight_size)); - if (fp16_weight_ == nullptr) { - MS_LOG(ERROR) << "malloc fp16_weight_ failed."; - return RET_ERROR; - } - memset(fp16_weight_, 0, fp16_weight_size); - for (int i = 0; i < fp16_weight_size / sizeof(float16_t); ++i) { - fp16_weight_[i] = (float16_t)origin_weight[i]; - } - ProcessFilterFp16(fp16_weight_, transformed_filter_addr_, conv_param_); + ConvolutionBaseFP16CPUKernel::GetExecuteFilter(); + ProcessFilterFp16(execute_weight_, transformed_filter_addr_, conv_param_); // init bias size_t new_bias_size = oC8 * C8NUM * sizeof(float16_t); @@ -183,10 +171,6 @@ void Convolution3x3FP16CPUKernel::ConfigInputOutput() { } int Convolution3x3FP16CPUKernel::Init() { - if (context_->infer_shape_interrupt_ && !context_->running_) { - set_need_reinit(); - return RET_OK; - } auto ret = ConvolutionBaseCPUKernel::Init(); if (ret != RET_OK) { MS_LOG(ERROR) << "ConvolutionBase init failed."; @@ -244,8 +228,8 @@ int Convolution3x3FP16CPUKernel::ReSize() { int Convolution3x3FP16CPUKernel::RunImpl(int task_id) { Conv3x3Fp16(reinterpret_cast(nhwc4_input_), transformed_filter_addr_, - reinterpret_cast(bias_data_), fp16_out_, tile_buffer_, block_unit_buffer_, tmp_dst_buffer_, - tmp_out_, task_id, conv_param_); + reinterpret_cast(bias_data_), execute_output_, tile_buffer_, 
block_unit_buffer_, + tmp_dst_buffer_, tmp_out_, task_id, conv_param_); return RET_OK; } @@ -265,16 +249,13 @@ int Convolution3x3FP16CPUKernel::Run() { MS_LOG(ERROR) << "Prepare failed."; return RET_ERROR; } - auto input_tensor = in_tensors_.at(kInputIndex); - auto input_ele_num = input_tensor->ElementsNum(); - auto ori_input_data = reinterpret_cast(input_tensor->Data()); - Float32ToFloat16(ori_input_data, fp16_input_, input_ele_num); + ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); int in_batch = conv_param_->input_batch_; int in_h = conv_param_->input_h_; int in_w = conv_param_->input_w_; int in_channel = conv_param_->input_channel_; - convert_func_(reinterpret_cast(fp16_input_), nhwc4_input_, in_batch, in_h * in_w, in_channel); + convert_func_(reinterpret_cast(execute_input_), nhwc4_input_, in_batch, in_h * in_w, in_channel); int error_code = LiteBackendParallelLaunch(Convolution3x3Fp16Impl, this, thread_count_); if (error_code != RET_OK) { @@ -294,7 +275,7 @@ int Convolution3x3FP16CPUKernel::Run() { batch * oc8 * C8NUM * out_w_block * out_h_block * conv_param_->output_unit_ * conv_param_->output_unit_; int ro_batch_size = batch * conv_param_->output_channel_ * conv_param_->output_h_ * conv_param_->output_w_; const float16_t *batch_tmp_out = tmp_out_ + tmp_out_batch_offset; - float16_t *batch_out = fp16_out_ + ro_batch_size; + float16_t *batch_out = execute_output_ + ro_batch_size; for (int h = 0; h < conv_param_->output_h_; h++) { for (int w = 0; w < conv_param_->output_w_; w++) { for (int c = 0; c < conv_param_->output_channel_; c++) { @@ -315,11 +296,7 @@ int Convolution3x3FP16CPUKernel::Run() { } } - // cast fp16 out to fp32 data - auto out_tensor = out_tensors_.at(kOutputIndex); - auto out_ele_num = out_tensor->ElementsNum(); - auto output_addr = reinterpret_cast(out_tensor->Data()); - Float16ToFloat32(fp16_out_, output_addr, out_ele_num); + ConvolutionBaseFP16CPUKernel::IfCastOutput(); return RET_OK; } } // namespace mindspore::kernel diff --git 
a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h index 80cde56287d..e0e7e516adf 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h @@ -20,16 +20,16 @@ #include #include #include "src/lite_kernel.h" -#include "src/runtime/kernel/arm/base/convolution_base.h" +#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h" #include "src/runtime/kernel/arm/nnacl/optimized_kernel.h" namespace mindspore::kernel { -class Convolution3x3FP16CPUKernel : public ConvolutionBaseCPUKernel { +class Convolution3x3FP16CPUKernel : public ConvolutionBaseFP16CPUKernel { public: Convolution3x3FP16CPUKernel(OpParameter *parameter, const std::vector &inputs, const std::vector &outputs, const Context *ctx, const lite::Primitive *primitive) - : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {} + : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {} ~Convolution3x3FP16CPUKernel() override { if (fp16_input_ != nullptr) { free(fp16_input_); @@ -66,9 +66,6 @@ class Convolution3x3FP16CPUKernel : public ConvolutionBaseCPUKernel { void ConfigInputOutput(); private: - float16_t *fp16_input_; - float16_t *fp16_weight_; - float16_t *fp16_out_; float16_t *transformed_filter_addr_; float16_t *tile_buffer_; float16_t *block_unit_buffer_; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc new file mode 100644 index 00000000000..a56c4d06f49 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc @@ -0,0 +1,86 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
+#include "src/runtime/kernel/arm/nnacl/fp16/cast_fp16.h"
+#include "schema/model_generated.h"
+#include "src/kernel_factory.h"
+#include "include/errorcode.h"
+#include "src/runtime/runtime_api.h"
+
+namespace mindspore::kernel {
+// Selects the fp16 buffers the kernel computes on (execute_input_ / execute_output_) from the tensor data types.
+int ConvolutionBaseFP16CPUKernel::GetExecuteTensor() {
+  // Input: an fp32 tensor is converted into fp16_input_; an fp16 tensor is used in place.
+  auto input_tensor = in_tensors_.at(kInputIndex);
+  auto input_data_type = input_tensor->data_type();
+  MS_ASSERT(input_data_type == kNumberTypeFloat32 || input_data_type == kNumberTypeFloat16);
+  if (input_data_type == kNumberTypeFloat32) {
+    auto input_ele_num = input_tensor->ElementsNum();
+    auto ori_input_data = reinterpret_cast<float *>(input_tensor->Data());
+    Float32ToFloat16(ori_input_data, fp16_input_, input_ele_num);
+    execute_input_ = fp16_input_;
+  } else {
+    execute_input_ = reinterpret_cast<float16_t *>(input_tensor->Data());
+  }
+  // Output: an fp32 tensor is computed into fp16_out_ (cast back by IfCastOutput); an fp16 tensor is written directly.
+  auto out_tensor = out_tensors_.at(kOutputIndex);
+  auto out_data_type = out_tensor->data_type();
+  MS_ASSERT(out_data_type == kNumberTypeFloat32 || out_data_type == kNumberTypeFloat16);
+  out_data_type_ = out_data_type;
+  if (out_data_type == kNumberTypeFloat32) {
+    execute_output_ = fp16_out_;
+  } else {
+    execute_output_ = reinterpret_cast<float16_t *>(out_tensor->Data());
+  }
+  return RET_OK;
+}
+
+// Points execute_weight_ at fp16 filter data: fp32 weights are cast into a newly malloc'ed fp16_weight_, fp16 used as is.
+int ConvolutionBaseFP16CPUKernel::GetExecuteFilter() {
+  auto weight_tensor = in_tensors_.at(kWeightIndex);
+  auto weight_data_type = weight_tensor->data_type();
+  MS_ASSERT(weight_data_type == kNumberTypeFloat32 || weight_data_type == kNumberTypeFloat16);
+  if (weight_data_type == kNumberTypeFloat32) {
+    float *origin_weight = reinterpret_cast<float *>(in_tensors_.at(kWeightIndex)->Data());
+    // Filter size is ic * oc * kernel_h * kernel_w; input_w_ here would mis-size the buffer and the cast loop below.
+    size_t fp16_weight_size = conv_param_->input_channel_ * conv_param_->output_channel_ * conv_param_->kernel_h_ *
+                              conv_param_->kernel_w_ * sizeof(float16_t);
+    fp16_weight_ = reinterpret_cast<float16_t *>(malloc(fp16_weight_size));
+    if (fp16_weight_ == nullptr) {
+      MS_LOG(ERROR) << "malloc fp16_weight_ failed.";
+      return RET_ERROR;
+    }
+    for (size_t i = 0; i < fp16_weight_size / sizeof(float16_t); ++i) {
+      fp16_weight_[i] = static_cast<float16_t>(origin_weight[i]);
+    }
+    execute_weight_ = fp16_weight_;
+  } else {
+    execute_weight_ = reinterpret_cast<float16_t *>(in_tensors_.at(kWeightIndex)->Data());
+  }
+  return RET_OK;
+}
+
+// Converts fp16_out_ into the fp32 output tensor; does nothing when GetExecuteTensor recorded a non-fp32 output.
+void ConvolutionBaseFP16CPUKernel::IfCastOutput() {
+  if (out_data_type_ == kNumberTypeFloat32) {
+    auto out_tensor = out_tensors_.at(kOutputIndex);
+    auto output_addr = reinterpret_cast<float *>(out_tensor->Data());
+    Float16ToFloat32(fp16_out_, output_addr, out_tensor->ElementsNum());
+  }
+}
+
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h
new file mode 100644
index 00000000000..c4845a762c5
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h
@@ -0,0 +1,54 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_BASE_FP16_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_BASE_FP16_H_
+
+#include
+#include
+#include "src/lite_kernel.h"
+#include "src/runtime/kernel/arm/base/convolution_base.h"
+#include "src/runtime/kernel/arm/nnacl/optimized_kernel.h"
+
+namespace mindspore::kernel {
+class ConvolutionBaseFP16CPUKernel : public ConvolutionBaseCPUKernel {
+ public:
+  ConvolutionBaseFP16CPUKernel(OpParameter *parameter, const std::vector &inputs,
+                               const std::vector &outputs, const Context *ctx,
+                               const lite::Primitive *primitive)
+      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
+  ~ConvolutionBaseFP16CPUKernel() override = default;
+
+  int Init() override { return RET_OK; }       // no-op default
+  int ReSize() override { return RET_OK; }     // no-op default
+  int Run() override { return RET_OK; }        // no-op default
+  int RunImpl(int task_id) { return RET_OK; }  // no-op default; not virtual
+  virtual int GetExecuteTensor();              // sets execute_input_, execute_output_ and out_data_type_
+  virtual int GetExecuteFilter();              // sets execute_weight_ (allocates fp16_weight_ for fp32 weights)
+  virtual void IfCastOutput();                 // casts fp16_out_ into the output tensor when it is fp32
+
+ protected:
+  float16_t *fp16_input_ = nullptr;            // fp16 staging copy of an fp32 input tensor
+  float16_t *fp16_weight_ = nullptr;           // fp16 copy of fp32 weights, malloc'ed by GetExecuteFilter
+  float16_t *fp16_out_ = nullptr;              // fp16 result buffer used when the output tensor is fp32
+  float16_t *execute_input_ = nullptr;         // input actually consumed: fp16_input_ or the tensor data
+  float16_t *execute_weight_ = nullptr;        // weights actually consumed: fp16_weight_ or the tensor data
+  float16_t *execute_output_ = nullptr;        // output actually written: fp16_out_ or the tensor data
+  TypeId out_data_type_ = kNumberTypeFloat32;  // read by IfCastOutput; overwritten by every GetExecuteTensor call
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_BASE_FP16_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc index af4283c60fd..ed38b6b82e1 100644 --- 
a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc @@ -102,10 +102,6 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() { } int ConvolutionDepthwiseFp16CPUKernel::Init() { - if (context_->infer_shape_interrupt_ && !context_->running_) { - set_need_reinit(); - return RET_OK; - } // conv base init auto ret = ConvolutionBaseCPUKernel::Init(); if (ret != RET_OK) { diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc index 65116d77f9b..235b7e48ff2 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc @@ -46,24 +46,14 @@ int ConvolutionFP16CPUKernel::InitWeightBias() { int pack_weight_size = oc8 * ic4 * C8NUM * C4NUM * kernel_plane; // init weight - float *origin_weight = reinterpret_cast(in_tensors_.at(kWeightIndex)->Data()); - size_t fp16_weight_size = in_channel * out_channel * kernel_h * kernel_w * sizeof(float16_t); - fp16_weight_ = reinterpret_cast(malloc(fp16_weight_size)); - if (fp16_weight_ == nullptr) { - MS_LOG(ERROR) << "malloc fp16_weight_ failed."; - return RET_ERROR; - } - for (int i = 0; i < fp16_weight_size / sizeof(float16_t); ++i) { - fp16_weight_[i] = (float16_t)origin_weight[i]; - } - + ConvolutionBaseFP16CPUKernel::GetExecuteFilter(); packed_weight_ = reinterpret_cast(malloc(pack_weight_size * sizeof(float16_t))); if (packed_weight_ == nullptr) { MS_LOG(ERROR) << "malloc packed_weight_ failed."; return RET_ERROR; } memset(packed_weight_, 0, pack_weight_size * sizeof(float16_t)); - PackWeightFp16(fp16_weight_, conv_param_, packed_weight_); + PackWeightFp16(execute_weight_, conv_param_, packed_weight_); // init bias bias_data_ = malloc(oc8 * C8NUM * sizeof(float16_t)); @@ -157,10 +147,6 @@ void ConvolutionFP16CPUKernel::ConfigInputOutput() { } int 
ConvolutionFP16CPUKernel::Init() { - if (context_->infer_shape_interrupt_ && !context_->running_) { - set_need_reinit(); - return RET_OK; - } auto ret = ConvolutionBaseCPUKernel::Init(); if (ret != RET_OK) { MS_LOG(ERROR) << "ConvolutionBase init fail!ret: " << ret; @@ -212,7 +198,7 @@ int ConvolutionFP16CPUKernel::ReSize() { int ConvolutionFP16CPUKernel::RunImpl(int task_id) { ConvFp16(reinterpret_cast(nhwc4_input_), packed_input_, packed_weight_, - reinterpret_cast(bias_data_), tmp_output_block_, fp16_out_, task_id, conv_param_); + reinterpret_cast(bias_data_), tmp_output_block_, execute_output_, task_id, conv_param_); return RET_OK; } @@ -232,16 +218,13 @@ int ConvolutionFP16CPUKernel::Run() { MS_LOG(ERROR) << "Prepare failed."; return RET_ERROR; } - auto input_tensor = in_tensors_.at(kInputIndex); - auto ori_input_data = reinterpret_cast(input_tensor->Data()); - auto input_ele_num = input_tensor->ElementsNum(); - Float32ToFloat16(ori_input_data, fp16_input_, input_ele_num); + ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); int in_batch = conv_param_->input_batch_; int in_h = conv_param_->input_h_; int in_w = conv_param_->input_w_; int in_channel = conv_param_->input_channel_; - convert_func_(reinterpret_cast(fp16_input_), nhwc4_input_, in_batch, in_h * in_w, in_channel); + convert_func_(reinterpret_cast(execute_input_), nhwc4_input_, in_batch, in_h * in_w, in_channel); int error_code = LiteBackendParallelLaunch(ConvolutionFp16Impl, this, thread_count_); if (error_code != RET_OK) { @@ -249,11 +232,7 @@ int ConvolutionFP16CPUKernel::Run() { return RET_ERROR; } - // cast fp16 out to fp32 data - auto out_tensor = out_tensors_.at(kOutputIndex); - auto output_addr = reinterpret_cast(out_tensor->Data()); - auto out_ele_num = out_tensor->ElementsNum(); - Float16ToFloat32(fp16_out_, output_addr, out_ele_num); + ConvolutionBaseFP16CPUKernel::IfCastOutput(); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h 
b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h index 27528656501..ad53277c88c 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h @@ -20,15 +20,15 @@ #include #include #include "src/lite_kernel.h" -#include "src/runtime/kernel/arm/base/convolution_base.h" +#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h" namespace mindspore::kernel { -class ConvolutionFP16CPUKernel : public ConvolutionBaseCPUKernel { +class ConvolutionFP16CPUKernel : public ConvolutionBaseFP16CPUKernel { public: ConvolutionFP16CPUKernel(OpParameter *parameter, const std::vector &inputs, const std::vector &outputs, const Context *ctx, const lite::Primitive *primitive) - : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {} + : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {} ~ConvolutionFP16CPUKernel() override { if (fp16_input_ != nullptr) { free(fp16_input_); @@ -59,9 +59,6 @@ class ConvolutionFP16CPUKernel : public ConvolutionBaseCPUKernel { void ConfigInputOutput(); private: - float16_t *fp16_input_; - float16_t *fp16_weight_; - float16_t *fp16_out_; float16_t *packed_input_; float16_t *packed_weight_; float16_t *tmp_output_block_; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc index 67fd9932ef0..31931e80b84 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc @@ -39,23 +39,13 @@ int ConvolutionSWFP16CPUKernel::ProcessFilter() { int out_channel = conv_param_->output_channel_; int ic4 = UP_DIV(in_channel, C4NUM); - auto *origin_weight = reinterpret_cast(in_tensors_.at(kWeightIndex)->Data()); - size_t fp16_weight_size = in_channel * out_channel * kernel_h * kernel_w * sizeof(float16_t); - fp16_weight_ = 
reinterpret_cast(malloc(fp16_weight_size)); - if (fp16_weight_ == nullptr) { - MS_LOG(ERROR) << "malloc fp16_weight_ failed."; - return RET_ERROR; - } - // cast origin fp32 weight data to fp16 data - for (int i = 0; i < fp16_weight_size / sizeof(float16_t); ++i) { - fp16_weight_[i] = (float16_t)origin_weight[i]; - } + ConvolutionBaseFP16CPUKernel::GetExecuteFilter(); for (int oc = 0; oc < out_channel; ++oc) { int src_oc_offset = oc * kernel_h * kernel_w * in_channel; int dst_oc_offset = oc * kernel_h * kernel_w * ic4 * C4NUM; for (int i = 0; i < kernel_h * kernel_w; ++i) { - const float16_t *src = fp16_weight_ + src_oc_offset + i * in_channel; + const float16_t *src = execute_weight_ + src_oc_offset + i * in_channel; float16_t *dst = packed_weight_ + dst_oc_offset + i * ic4 * C4NUM; memcpy(dst, src, in_channel * sizeof(float16_t)); } @@ -162,10 +152,6 @@ void ConvolutionSWFP16CPUKernel::ConfigInputOutput() { } int ConvolutionSWFP16CPUKernel::Init() { - if (context_->infer_shape_interrupt_ && !context_->running_) { - set_need_reinit(); - return RET_OK; - } auto ret = ConvolutionBaseCPUKernel::Init(); if (ret != RET_OK) { MS_LOG(ERROR) << "ConvolutionBase init fail!ret: " << ret; @@ -222,7 +208,7 @@ int ConvolutionSWFP16CPUKernel::ReSize() { int ConvolutionSWFP16CPUKernel::RunImpl(int task_id) { ConvSWFp16(reinterpret_cast(nhwc4_input_), packed_weight_, reinterpret_cast(bias_data_), - tmp_output_block_, fp16_out_, task_id, conv_param_, slidingWindow_param_); + tmp_output_block_, execute_output_, task_id, conv_param_, slidingWindow_param_); return RET_OK; } @@ -242,16 +228,13 @@ int ConvolutionSWFP16CPUKernel::Run() { MS_LOG(ERROR) << "Prepare failed."; return RET_ERROR; } - auto input_tensor = in_tensors_.at(kInputIndex); - auto input_ele_num = input_tensor->ElementsNum(); - auto ori_input_data = reinterpret_cast(input_tensor->Data()); - Float32ToFloat16(ori_input_data, fp16_input_, input_ele_num); + ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); int in_batch = 
conv_param_->input_batch_; int in_h = conv_param_->input_h_; int in_w = conv_param_->input_w_; int in_channel = conv_param_->input_channel_; - convert_func_(reinterpret_cast(fp16_input_), nhwc4_input_, in_batch, in_h * in_w, in_channel); + convert_func_(reinterpret_cast(execute_input_), nhwc4_input_, in_batch, in_h * in_w, in_channel); int error_code = LiteBackendParallelLaunch(ConvolutionSWFp16Impl, this, thread_count_); if (error_code != RET_OK) { @@ -259,18 +242,14 @@ int ConvolutionSWFP16CPUKernel::Run() { return RET_ERROR; } - // cast fp16 out to fp32 data - auto out_tensor = out_tensors_.at(kOutputIndex); - auto out_ele_num = out_tensor->ElementsNum(); - auto output_addr = reinterpret_cast(out_tensor->Data()); // output nhwc4 int oc4_res = conv_param_->output_channel_ % C4NUM; if (oc4_res != 0) { - PackNHWC4ToNHWCFp16(reinterpret_cast(tmp_output_block_), reinterpret_cast(fp16_out_), + PackNHWC4ToNHWCFp16(reinterpret_cast(tmp_output_block_), reinterpret_cast(execute_output_), conv_param_->output_batch_, conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); } - Float16ToFloat32(fp16_out_, output_addr, out_ele_num); + ConvolutionBaseFP16CPUKernel::IfCastOutput(); return RET_OK; } } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.h index 08239853a61..ce9aa0b6747 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.h @@ -19,15 +19,15 @@ #include #include #include "src/lite_kernel.h" -#include "src/runtime/kernel/arm/base/convolution_base.h" +#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h" namespace mindspore::kernel { -class ConvolutionSWFP16CPUKernel : public ConvolutionBaseCPUKernel { +class ConvolutionSWFP16CPUKernel : public ConvolutionBaseFP16CPUKernel { public: ConvolutionSWFP16CPUKernel(OpParameter 
*parameter, const std::vector &inputs, const std::vector &outputs, const Context *ctx, const lite::Primitive *primitive) - : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {} + : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {} ~ConvolutionSWFP16CPUKernel() override { if (fp16_input_ != nullptr) { free(fp16_input_); @@ -57,9 +57,6 @@ class ConvolutionSWFP16CPUKernel : public ConvolutionBaseCPUKernel { int ProcessFilter(); private: - float16_t *fp16_input_; - float16_t *fp16_weight_; - float16_t *fp16_out_; float16_t *packed_weight_; float16_t *tmp_output_block_; SlidingWindowParam *slidingWindow_param_; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc new file mode 100644 index 00000000000..070f5bed6cb --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc @@ -0,0 +1,409 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h"
+#include "src/runtime/kernel/arm/fp16/matrix_fp16.h"
+#include "src/runtime/kernel/arm/nnacl/fp16/conv_fp16.h"
+#include "src/runtime/kernel/arm/nnacl/fp16/common_func.h"
+#include "src/runtime/kernel/arm/nnacl/fp16/cast_fp16.h"
+#include "src/runtime/kernel/arm/nnacl/fp16/pack_fp16.h"
+#include "src/runtime/kernel/arm/nnacl/fp16/winograd_transform_fp16.h"
+#include "src/runtime/kernel/arm/nnacl/fp16/winograd_utils_fp16.h"
+#include "src/runtime/kernel/arm/fp16/layout_transform_fp16.h"
+#include "schema/model_generated.h"
+#include "src/kernel_registry.h"
+#include "include/errorcode.h"
+#include "src/runtime/runtime_api.h"
+
+using mindspore::kernel::KERNEL_ARCH::kCPU;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_Conv2D;
+
+namespace mindspore::kernel {
+// Transforms an OHWI fp16 filter into the Winograd domain (G * g * GT for every output/input channel pair) and
+// scatters the result into trans_weight using that matrix's strides. Logs and returns early when oc_block is 0.
+void WinogradFilterTransformFp16(const float16_t *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit,
+                                 ConvParameter *conv_param, int oc_block) {
+  // Reject a zero block size before anything is allocated, so this early return cannot leak the buffers below.
+  if (oc_block == 0) {
+    MS_LOG(ERROR) << "Divide by zero";
+    return;
+  }
+  auto channel_in = conv_param->input_channel_;
+  auto channel_out = conv_param->output_channel_;
+  int input_unit_square = input_unit * input_unit;
+
+  // generate matrix_G && matrix_GT
+  auto matrix_g = TransformMatrixGenerator(input_unit, kernel_unit);
+  auto matrix_gt = TransformMatrixGenerator(kernel_unit, input_unit);
+  ChooseMatrixG(matrix_g, matrix_gt);
+  auto matrix_g_data = reinterpret_cast<float *>(matrix_g->GetData());
+  auto matrix_gt_data = reinterpret_cast<float *>(matrix_gt->GetData());
+  auto matrix_g_data_fp16 = reinterpret_cast<float16_t *>(malloc(input_unit * kernel_unit * sizeof(float16_t)));
+  auto matrix_gt_data_fp16 = reinterpret_cast<float16_t *>(malloc(input_unit * kernel_unit * sizeof(float16_t)));
+  Float32ToFloat16(matrix_g_data, matrix_g_data_fp16, input_unit * kernel_unit);
+  Float32ToFloat16(matrix_gt_data, matrix_gt_data_fp16, input_unit * kernel_unit);
+
+  // trans_filter = G * g * GT (g is weight_data), computed in two steps: tmp = G * g, then out = tmp * GT
+  auto tmp_weight_data = reinterpret_cast<float16_t *>(malloc(kernel_unit * kernel_unit * sizeof(float16_t)));
+  auto tmp_data = reinterpret_cast<float16_t *>(malloc(input_unit * kernel_unit * sizeof(float16_t)));
+  auto trans_out_data = reinterpret_cast<float16_t *>(malloc(input_unit * input_unit * sizeof(float16_t)));
+  bool row = true;
+  auto trans_weight_data = reinterpret_cast<float16_t *>(trans_weight->GetData());
+  std::vector<int> strides = trans_weight->GetStride();
+
+  int kernel_plane_stride = channel_in;
+  for (int i = 0; i < channel_out; i++) {
+    int out_c_block = i / oc_block;
+    int out_c_res = i % oc_block;
+    int input_oz_offset = i * kernel_unit * kernel_unit * channel_in;
+    int output_oz_offset = out_c_block * strides[1] * input_unit * input_unit + out_c_res;
+    for (int j = 0; j < channel_in; j++) {
+      int ic4_block = j / C4NUM;
+      int ic4_res = j % C4NUM;
+      int input_iz_offset = input_oz_offset + j;
+      int output_iz_offset = output_oz_offset + ic4_block * strides[2] + ic4_res * strides[3];
+      for (int k = 0; k < kernel_unit * kernel_unit; k++) {
+        int input_xy_offset = input_iz_offset + k * kernel_plane_stride;
+        tmp_weight_data[k] = *(weight_data + input_xy_offset);
+      }
+      // only row-major matrix-multiply is supported for now: tmp = G * g
+      MatrixMultiplyFp16(matrix_g_data_fp16, tmp_weight_data, tmp_data, input_unit, kernel_unit, kernel_unit, row);
+      // out = tmp * GT
+      MatrixMultiplyFp16(tmp_data, matrix_gt_data_fp16, trans_out_data, input_unit, kernel_unit, input_unit, row);
+
+      for (int z = 0; z < input_unit_square; z++) {
+        int output_xy_offset = output_iz_offset + z * strides[1];
+        *(trans_weight_data + output_xy_offset) = trans_out_data[z];
+      }
+    }
+  }
+  free(tmp_weight_data);
+  free(tmp_data);
+  free(trans_out_data);
+  free(matrix_g_data_fp16);
+  free(matrix_gt_data_fp16);
+  delete matrix_g;
+  delete matrix_gt;
+}
+
+// Allocates the transformed-filter matrix, fills it from the fp16 filter and builds a zero-padded fp16 bias.
+int ConvolutionWinogradFP16CPUKernel::InitWeightBias() {
+  int output_channel = conv_param_->output_channel_;
+  int oc_block = C8NUM;
+  int oc_block_num = UP_DIV(output_channel, C8NUM);
+
+  // init weight
+  auto ret = MallocFilterMatrix(oc_block, oc_block_num);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Malloc filter matrix failed.";
+    return RET_ERROR;
+  }
+
+  ConvolutionBaseFP16CPUKernel::GetExecuteFilter();
+  WinogradFilterTransformFp16(execute_weight_, trans_weight_, kernel_unit_, input_unit_, conv_param_, oc_block);
+
+  // init bias
+  bias_data_ = malloc(oc_block_num * oc_block * sizeof(float16_t));
+  if (bias_data_ == nullptr) {
+    MS_LOG(ERROR) << "malloc bias_data_ failed.";
+    return RET_ERROR;
+  }
+  memset(bias_data_, 0, oc_block_num * oc_block * sizeof(float16_t));
+  auto fp16_bias_data = reinterpret_cast<float16_t *>(bias_data_);
+  if (in_tensors_.size() == kInputSize2) {
+    auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->Data());
+    for (int i = 0; i < output_channel; ++i) {
+      fp16_bias_data[i] = (float16_t)ori_bias[i];
+    }
+  } else {
+    MS_ASSERT(in_tensors_.size() == kInputSize1);
+  }
+  return RET_OK;
+}
+
+int ConvolutionWinogradFP16CPUKernel::MallocFilterMatrix(int oc_block, int oc_block_num) {
+  int channel_in = conv_param_->input_channel_;
+  int ic4 = UP_DIV(channel_in, BLOCK);
+
+  // set data
+  auto trans_matrix_data_size = input_unit_ * input_unit_ * ic4 * C4NUM * oc_block_num * oc_block * sizeof(float);
+  auto matrix_buffer = malloc(trans_matrix_data_size);
+  if (matrix_buffer == nullptr) {
+    MS_LOG(ERROR) << "malloc matrix_buffer failed.";
+    return RET_ERROR;
+  }
+  memset(matrix_buffer, 0, trans_matrix_data_size);
+  trans_weight_ = new Matrix();
+  trans_weight_->SetData(matrix_buffer);
+  trans_weight_->SetNDim(5);
+
+  std::vector<int> shapes;
+  std::vector<int> strides;
+  // set shape
+  shapes.push_back(input_unit_ * input_unit_);
+  shapes.push_back(oc_block_num);
+  shapes.push_back(ic4);
+  shapes.push_back(C4NUM);
+ 
shapes.push_back(oc_block); + // set stride + for (int i = 0; i < 4; i++) { + int stride = 1; + for (int j = i + 1; j < 5; j++) { + stride *= shapes[j]; + } + strides.push_back(stride); + } + trans_weight_->SetShape(shapes); + trans_weight_->SetStride(strides); + return RET_OK; +} + +int ConvolutionWinogradFP16CPUKernel::InitTmpBuffer() { + int cal_num = 16; + int channel_in = conv_param_->input_channel_; + int channel_out = conv_param_->output_channel_; + int output_h = conv_param_->output_h_; + int output_w = conv_param_->output_w_; + int ic4 = UP_DIV(channel_in, C4NUM); + int oc8 = UP_DIV(channel_out, C8NUM); + + /*=============================fp16_input_============================*/ + size_t fp16_input_size = conv_param_->input_channel_ * conv_param_->input_batch_ * conv_param_->input_h_ * + conv_param_->input_w_ * sizeof(float16_t); + fp16_input_ = reinterpret_cast(malloc(fp16_input_size)); + if (fp16_input_ == nullptr) { + MS_LOG(ERROR) << "malloc fp16_input_ failed."; + return RET_ERROR; + } + + /*=============================trans_input_============================*/ + size_t tile_buffer_size = thread_count_ * cal_num * input_unit_ * input_unit_ * ic4 * C4NUM * sizeof(float16_t); + trans_input_ = reinterpret_cast(malloc(tile_buffer_size)); + if (trans_input_ == nullptr) { + MS_LOG(ERROR) << "malloc trans_input_ failed."; + return RET_ERROR; + } + memset(trans_input_, 0, tile_buffer_size); + + /*=============================gemm_out_============================*/ + gemm_out_ = reinterpret_cast( + malloc(thread_count_ * cal_num * input_unit_ * input_unit_ * oc8 * C8NUM * sizeof(float16_t))); + if (gemm_out_ == nullptr) { + MS_LOG(ERROR) << "malloc gemm_out_ failed."; + return RET_ERROR; + } + + /*=============================tmp_out_data_============================*/ + int out_w_block = UP_DIV(output_w, output_unit_); + int out_h_block = UP_DIV(output_h, output_unit_); + tmp_out_data_ = reinterpret_cast(malloc(conv_param_->output_batch_ * out_w_block * 
out_h_block * + output_unit_ * output_unit_ * oc8 * C8NUM * sizeof(float16_t))); + if (tmp_out_data_ == nullptr) { + MS_LOG(ERROR) << "malloc tmp_out_data_ failed."; + return RET_ERROR; + } + /*=============================fp16_out_============================*/ + size_t fp16_output_size = conv_param_->output_channel_ * conv_param_->output_batch_ * conv_param_->output_h_ * + conv_param_->output_w_ * sizeof(float16_t); + fp16_out_ = reinterpret_cast(malloc(fp16_output_size)); + if (fp16_out_ == nullptr) { + MS_LOG(ERROR) << "malloc fp16_out_ failed."; + return RET_ERROR; + } + + /*=============================tmp_data_============================*/ + tmp_data_ = + reinterpret_cast(malloc(thread_count_ * C4NUM * input_unit_ * input_unit_ * sizeof(float16_t))); + if (tmp_data_ == nullptr) { + MS_LOG(ERROR) << "malloc tmp_data_ failed."; + return RET_ERROR; + } + memset(tmp_data_, 0, C4NUM * input_unit_ * input_unit_ * sizeof(float16_t)); + + tmp_buffer_address_list_[0] = trans_input_; + tmp_buffer_address_list_[1] = gemm_out_; + tmp_buffer_address_list_[2] = tmp_out_data_; + tmp_buffer_address_list_[3] = tmp_data_; + + /*=============================nhwc4_input_============================*/ + size_t nhwc4_input_size = + ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t); + nhwc4_input_ = malloc(nhwc4_input_size); + if (nhwc4_input_ == nullptr) { + MS_LOG(ERROR) << "malloc nhwc4_input_ failed."; + return RET_ERROR; + } + memset(nhwc4_input_, 0, nhwc4_input_size); + return RET_OK; +} + +int ConvolutionWinogradFP16CPUKernel::ConfigInputOutput() { + auto input_tensor = in_tensors_.at(kInputIndex); + auto ret = CheckLayout(input_tensor); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Check layout failed."; + return RET_ERROR; + } + auto output_tensor = out_tensors_.at(kOutputIndex); + output_tensor->SetFormat(schema::Format_NHWC); + + // choose input transformer function (4x4 unit or 8x8 unit) + input_trans_func_ = 
GetInputTransFuncFp16(input_unit_); + if (input_trans_func_ == nullptr) { + MS_LOG(ERROR) << "Get input_trans_func failed."; + return RET_ERROR; + } + output_trans_func_ = GetOutputTransFuncFp16(input_unit_, output_unit_); + if (output_trans_func_ == nullptr) { + MS_LOG(ERROR) << "Get output_trans_func_ failed."; + return RET_ERROR; + } + return RET_OK; +} + +int ConvolutionWinogradFP16CPUKernel::Init() { + auto ret = ConvolutionBaseCPUKernel::Init(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "ConvolutionBase init failed."; + return RET_ERROR; + } + kernel_unit_ = conv_param_->kernel_h_; + input_unit_ = output_unit_ + kernel_unit_ - 1; + conv_param_->input_unit_ = input_unit_; + conv_param_->output_unit_ = output_unit_; + + ret = InitWeightBias(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Init weight bias failed."; + return RET_ERROR; + } + // malloc tmp buffer + ret = InitTmpBuffer(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Init tmp buffer failed."; + return RET_ERROR; + } + ret = ConfigInputOutput(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "ConfigInputOutput failed."; + return RET_ERROR; + } + return RET_OK; +} + +int ConvolutionWinogradFP16CPUKernel::ReSize() { + if (tmp_data_ != nullptr) { + free(tmp_data_); + } + if (trans_input_ != nullptr) { + free(trans_input_); + } + if (gemm_out_ != nullptr) { + free(gemm_out_); + } + if (tmp_out_data_ != nullptr) { + free(tmp_out_data_); + } + if (nhwc4_input_ != nullptr) { + free(nhwc4_input_); + } + if (fp16_input_ != nullptr) { + free(fp16_input_); + } + if (fp16_out_ != nullptr) { + free(fp16_out_); + } + + auto ret = ConvolutionBaseCPUKernel::Init(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "ConvolutionBase init failed."; + return RET_ERROR; + } + kernel_unit_ = conv_param_->kernel_h_; + input_unit_ = output_unit_ + kernel_unit_ - 1; + conv_param_->input_unit_ = input_unit_; + conv_param_->output_unit_ = output_unit_; + + ret = InitTmpBuffer(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Init tmp buffer 
failed."; + return RET_ERROR; + } + ret = ConfigInputOutput(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "ConfigInputOutput failed."; + return RET_ERROR; + } + return RET_OK; +} + +int ConvolutionWinogradFP16CPUKernel::RunImpl(int task_id) { + ConvWinogardFp16(reinterpret_cast(nhwc4_input_), reinterpret_cast(trans_weight_->GetData()), + reinterpret_cast(bias_data_), tmp_buffer_address_list_, task_id, conv_param_, + input_trans_func_, output_trans_func_); + return RET_OK; +} + +int ConvolutionWinogradFp16Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { + auto conv = reinterpret_cast(cdata); + auto error_code = conv->RunImpl(task_id); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "ConvolutionWinograd Fp16 Run error task_id[" << task_id << "] error_code[" << error_code << "]"; + return RET_ERROR; + } + return RET_OK; +} + +int ConvolutionWinogradFP16CPUKernel::Run() { + auto prepare_ret = Prepare(); + if (prepare_ret != RET_OK) { + MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret; + return prepare_ret; + } + + ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); + + int in_batch = conv_param_->input_batch_; + int in_h = conv_param_->input_h_; + int in_w = conv_param_->input_w_; + int in_channel = conv_param_->input_channel_; + convert_func_(execute_input_, nhwc4_input_, in_batch, in_h * in_w, in_channel); + + int error_code = LiteBackendParallelLaunch(ConvolutionWinogradFp16Impl, this, thread_count_); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "conv winograd error error_code[" << error_code << "]"; + return RET_ERROR; + } + + // get real output + UnPackWinogradOutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_, + conv_param_->output_w_, conv_param_->output_channel_, output_unit_); + int output_num = + conv_param_->output_channel_ * conv_param_->output_h_ * conv_param_->output_w_ * conv_param_->output_batch_; + if (conv_param_->is_relu_) { + ReluFp16(execute_output_, execute_output_, output_num); + } else 
if (conv_param_->is_relu6_) { + Relu6Fp16(execute_output_, execute_output_, output_num); + } else { + // do nothing + } + ConvolutionBaseFP16CPUKernel::IfCastOutput(); + return RET_OK; +} +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h new file mode 100644 index 00000000000..c5d2f0e3a00 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h @@ -0,0 +1,87 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_WINOGRAD_FP16_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_WINOGRAD_FP16_H_ + +#include +#include +#include "src/lite_kernel.h" +#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h" +#include "src/runtime/kernel/arm/nnacl/fp16/conv_fp16.h" +#include "src/runtime/kernel/arm/fp16/matrix_fp16.h" +#include "src/runtime/kernel/arm/nnacl/fp16/winograd_utils_fp16.h" +#include "src/runtime/kernel/arm/nnacl/optimized_kernel.h" + +namespace mindspore::kernel { +class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel { + public: + ConvolutionWinogradFP16CPUKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const Context *ctx, + const lite::Primitive *primitive) + : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {} + ~ConvolutionWinogradFP16CPUKernel() override { + if (fp16_input_ != nullptr) { + free(fp16_input_); + } + if (fp16_weight_ != nullptr) { + free(fp16_weight_); + } + if (fp16_out_ != nullptr) { + free(fp16_out_); + } + if (tmp_data_ != nullptr) { + free(tmp_data_); + } + if (trans_input_ != nullptr) { + free(trans_input_); + } + if (gemm_out_ != nullptr) { + free(gemm_out_); + } + if (tmp_out_data_ != nullptr) { + free(tmp_out_data_); + } + delete trans_weight_; + } + + int Init() override; + int ReSize() override; + int Run() override; + int RunImpl(int task_id); + int InitWeightBias(); + int MallocFilterMatrix(int oc_block, int oc_block_num); + int InitTmpBuffer(); + int ConfigInputOutput(); + + private: + int kernel_unit_; + int input_unit_; + int output_unit_; + float16_t *tmp_data_; + float16_t *trans_input_; + float16_t *gemm_out_; + float16_t *tmp_out_data_; + Matrix *trans_weight_; + InputTransformUnitFp16Func input_trans_func_; + OutputTransformUnitFp16Func output_trans_func_; + TmpBufferAddressFp16 tmp_buffer_address_list_[4]; +}; +void WinogradFilterTransformFp16(const 
float16_t *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit, + ConvParameter *conv_param, int oc_block); +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_WINOGRAD_FP16_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc index d5570e621e3..366232034de 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc @@ -115,10 +115,6 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() { } int DeconvolutionDepthwiseFp16CPUKernel::Init() { - if (context_->infer_shape_interrupt_ && !context_->running_) { - set_need_reinit(); - return RET_OK; - } sliding_ = new SlidingWindowParam; InitSlideParam(); // conv base init diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/matrix_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/matrix_fp16.cc new file mode 100644 index 00000000000..ce7814f285e --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/matrix_fp16.cc @@ -0,0 +1,39 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#include "src/runtime/kernel/arm/fp16/matrix_fp16.h"
+
+namespace mindspore::kernel {
+
+// Computes matrix_c = matrix_a * matrix_b for dense row-major fp16 buffers, where matrix_a is m x k,
+// matrix_b is k x n and matrix_c is m x n. Each output element is accumulated in float16_t.
+// `row` is accepted for interface compatibility but is never read: only the row-major path exists.
+void MatrixMultiplyFp16(const float16_t *matrix_a, const float16_t *matrix_b, float16_t *matrix_c, int m, int k, int n,
+                        bool row) {
+  float16_t *out = matrix_c;
+  for (int r = 0; r < m; ++r) {
+    const float16_t *a_row = matrix_a + r * k;
+    for (int c = 0; c < n; ++c) {
+      float16_t acc = 0;
+      for (int t = 0; t < k; ++t) {
+        acc += a_row[t] * matrix_b[c + t * n];
+      }
+      *out++ = acc;
+    }
+  }
+}
+
+
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/matrix_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/matrix_fp16.h
new file mode 100644
index 00000000000..de25029aff6
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/matrix_fp16.h
@@ -0,0 +1,27 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATRIX_FP16_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATRIX_FP16_H_ + +#include "src/runtime/kernel/arm/base/matrix.h" + +namespace mindspore::kernel { +void MatrixMultiplyFp16(const float16_t *matrix_a, const float16_t *matrix_b, float16_t *matrix_c, int m, int k, int n, + bool row); +} + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATRIX_FP16_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.cc index 365b254d6a0..db45056ddfc 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.cc @@ -53,10 +53,6 @@ int PoolingFp16CPUKernel::InitBuffer() { } int PoolingFp16CPUKernel::Init() { - if (context_->infer_shape_interrupt_ && !context_->running_) { - set_need_reinit(); - return RET_OK; - } auto ret = PoolingBaseCPUKernel::Init(); if (ret != RET_OK) { MS_LOG(ERROR) << "PoolingBase Init failed."; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc index 2a3be987430..d803e99f234 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc @@ -329,10 +329,9 @@ int ConvolutionWinogradCPUKernel::RunImpl(int task_id) { MS_LOG(ERROR) << "gemm_func is nullptr."; return RET_ERROR; } - auto output_addr = reinterpret_cast(out_tensors_.at(kOutputIndex)->Data()); ConvWinogardFp32(reinterpret_cast(nhwc4_input_), reinterpret_cast(trans_weight_->GetData()), - reinterpret_cast(bias_data_), output_addr, tmp_buffer_address_list_, task_id, - conv_param_, input_trans_func_, output_trans_func_, gemm_func_); + reinterpret_cast(bias_data_), tmp_buffer_address_list_, task_id, conv_param_, + input_trans_func_, output_trans_func_, gemm_func_); return RET_OK; } diff --git 
a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/cast_fp16.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/cast_fp16.h
index ebe5c10d260..785935d7f3e 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/cast_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/cast_fp16.h
@@ -16,9 +16,7 @@
 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_CAST_FP16_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_CAST_FP16_H_

-#ifdef ENABLE_NEON
 #include
-#endif
 #include "nnacl/op_base.h"
 #include "nnacl/fp32/cast.h"
 #ifdef __cplusplus
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/common_func.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/common_func.c
new file mode 100644
index 00000000000..84ddcd8e4b1
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/common_func.c
@@ -0,0 +1,57 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "nnacl/fp16/common_func.h"
+
+// ReLU over fp16 data: dst[i] = data[i] < 0 ? 0 : data[i] for every i in [0, ele_num).
+// dst may be the same buffer as data; each element is read before it is stored.
+void ReluFp16(float16_t *data, float16_t *dst, int ele_num) {
+  int index = 0;
+#ifdef ENABLE_NEON
+  // Vector path: whole groups of C8NUM elements. A non-positive ele_num never enters the loop.
+  float16x8_t zero_data = vdupq_n_f16(0);
+  for (; index + C8NUM <= ele_num; index += C8NUM) {
+    float16x8_t relu_data = vld1q_f16(data + index);
+    relu_data = vmaxq_f16(relu_data, zero_data);
+    vst1q_f16(dst + index, relu_data);
+  }
+#endif
+  // Scalar path: the tail left by the vector loop, or every element when NEON is not enabled.
+  for (; index < ele_num; ++index) {
+    dst[index] = data[index] < 0 ? 0 : data[index];
+  }
+}
+
+// ReLU6 over fp16 data: dst[i] = data[i] clamped to [0, 6] for every i in [0, ele_num).
+// dst may be the same buffer as data; each element is read before it is stored.
+void Relu6Fp16(float16_t *data, float16_t *dst, int ele_num) {
+  int index = 0;
+#ifdef ENABLE_NEON
+  // Vector path: whole groups of C8NUM elements. A non-positive ele_num never enters the loop.
+  float16x8_t zero_data = vdupq_n_f16(0);
+  float16x8_t six_data = vdupq_n_f16(6);
+  for (; index + C8NUM <= ele_num; index += C8NUM) {
+    float16x8_t relu6_data = vld1q_f16(data + index);
+    relu6_data = vmaxq_f16(relu6_data, zero_data);
+    relu6_data = vminq_f16(relu6_data, six_data);
+    vst1q_f16(dst + index, relu6_data);
+  }
+#endif
+  // Scalar path: the tail left by the vector loop, or every element when NEON is not enabled.
+  for (; index < ele_num; ++index) {
+    float16_t value = data[index] < 0 ? 0 : data[index];
+    dst[index] = value > 6 ? 6 : value;
+  }
+}
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/common_func.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/common_func.h
index 00395343c61..96ad7bde095 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/common_func.h
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/common_func.h
@@ -39,6 +39,8 @@ void DeconvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *w
                         size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel,
                         size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
 #endif
+void ReluFp16(float16_t *data, float16_t *dst, int ele_num);
+void Relu6Fp16(float16_t *data, float16_t *dst, int ele_num);

 #ifdef __cplusplus
 }
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/conv_fp16.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/conv_fp16.c
index 9db5662b546..cf034be09aa 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/conv_fp16.c
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/conv_fp16.c
@@ -32,12 +32,21 @@ void IndirectGemmFp16_16x8(float16_t *output, float16_t *input, float16_t *weigh
 #endif
 #ifndef ENABLE_NEON
 void IndirectGemmFp16_16x8(float16_t *output, float16_t *input, float16_t *weight, float16_t *bias, size_t step,
-                           size_t ic4, size_t out_channel, size_t offset, size_t mode, size_t writeC4, size_t relu,
+                           size_t ic4, size_t out_channel, size_t offset, size_t mode, size_t writeC8, size_t relu,
                            size_t relu6) {
+  if (!(mode && writeC8)) {
+    IndirectGemmFp16_16x8_common(output, input, weight, bias, step, ic4, out_channel, offset, relu, relu6);
+  } else {
+    IndirectGemmFp16_16x8_c8(output, input, weight, bias, step, ic4, out_channel, offset, mode, writeC8, relu, relu6);
+  }
+}
+
+void IndirectGemmFp16_16x8_common(float16_t *output, float16_t *input, float16_t *weight, float16_t *bias, size_t step,
+                                  size_t ic4, size_t out_channel, size_t offset, size_t relu, size_t relu6) {
   const int tile_n = 16;
   for (int i = 0; i < out_channel; 
i++) { - int oc8_block = i / 8; - int oc8_res = i % 8; + int oc8_block = i / C8NUM; + int oc8_res = i % C8NUM; int weight_oc_offset = oc8_block * step * ic4 * C4NUM * C8NUM + oc8_res; for (int k = 0; k < tile_n; k++) { int input_tile_offset = k * C4NUM; @@ -72,32 +81,32 @@ void IndirectGemmFp16_16x8(float16_t *output, float16_t *input, float16_t *weigh } } -void IndirectGemmFp16_16x8_tmp(float16_t *output, float16_t *input, float16_t *weight, const float16_t *bias, - size_t step, size_t ic4, size_t output_channel, size_t offset, size_t mode, - size_t writeC4, size_t relu, size_t relu6) { +void IndirectGemmFp16_16x8_c8(float16_t *output, float16_t *input, float16_t *weight, float16_t *bias, size_t step, + size_t ic4, size_t output_channel, size_t offset, size_t mode, size_t writeC8, + size_t relu, size_t relu6) { const int tile_num = 16; - if (mode) { + if (mode && writeC8) { for (int i = 0; i < tile_num; i++) { int input_tile_offset = i * C4NUM; - int output_tile_offset = i * output_channel * 36; + int output_tile_offset = i * output_channel * step; for (int j = 0; j < output_channel; j++) { - int oc8_block = j / 8; - int oc8_res = j % 8; - int weight_oc_offset = oc8_block * 36 * ic4 * C4NUM * 8 + oc8_res; - int out_oc_offset = output_tile_offset + oc8_block * 36 * C8NUM + oc8_res; + int oc8_block = j / C8NUM; + int oc8_res = j % C8NUM; + int weight_oc_offset = oc8_block * step * ic4 * C4NUM * C8NUM + oc8_res; + int out_oc_offset = output_tile_offset + oc8_block * step * C8NUM + oc8_res; for (int n = 0; n < step; n++) { int input_kw_offset = input_tile_offset + n * ic4 * C4NUM * tile_num; - int weight_kw_offset = weight_oc_offset + n * ic4 * C4NUM * 8; + int weight_kw_offset = weight_oc_offset + n * ic4 * C4NUM * C8NUM; int output_kw_offset = out_oc_offset + n * C8NUM; float16_t acc = 0; for (int k = 0; k < ic4; k++) { int input_ic4_offset = input_kw_offset + k * tile_num * C4NUM; - int weight_ic4_offset = weight_kw_offset + k * C4NUM * 8; - for (int m = 0; m < 4; 
m++) { + int weight_ic4_offset = weight_kw_offset + k * C4NUM * C8NUM; + for (int m = 0; m < C4NUM; m++) { int input_ic_offset = input_ic4_offset + m; - int weight_ic_offset = weight_ic4_offset + m * 8; + int weight_ic_offset = weight_ic4_offset + m * C8NUM; acc += (weight + weight_ic_offset)[0] * (input + input_ic_offset)[0]; } } @@ -405,3 +414,91 @@ void Conv3x3Fp16(float16_t *input_data, float16_t *transed_weight, const float16 } } } + +// fp16 convolution winograd +void ConvWinogardFp16(float16_t *input_data, float16_t *trans_weight, const float16_t *bias_data, + TmpBufferAddressFp16 *buffer_list, int task_id, ConvParameter *conv_param, + InputTransformUnitFp16Func input_trans_func, OutputTransformUnitFp16Func output_trans_func) { + int thread_num = conv_param->thread_num_; + int input_unit = conv_param->input_unit_; + int in_batch = conv_param->input_batch_; + int in_channel = conv_param->input_channel_; + int ic4 = UP_DIV(in_channel, C4NUM); + int out_unit = conv_param->output_unit_; + int out_w_block = UP_DIV(conv_param->output_w_, out_unit); + int out_h_block = UP_DIV(conv_param->output_h_, out_unit); + int tile_num = 16; + int output_count = out_w_block * out_h_block; + int output_tile_count = UP_DIV(output_count, tile_num); + int out_channel = conv_param->output_channel_; + int oc8 = UP_DIV(out_channel, C8NUM); + int input_unit_square = input_unit * input_unit; + size_t output_offset = oc8 * C8NUM * input_unit_square * sizeof(float16_t); + + float16_t *trans_input = buffer_list[0]; + float16_t *gemm_out = buffer_list[1]; + float16_t *tmp_out_data = buffer_list[2]; + float16_t *tmp_data = buffer_list[3]; + int trans_input_offset = tile_num * input_unit_square * ic4 * C4NUM; + int gemm_out_offset = tile_num * input_unit_square * oc8 * C8NUM; + int tmp_data_offset = input_unit_square * C4NUM; + // step 1 : filter transform (pre-processed offline) + // step 2 : input transform (online) + for (int b = 0; b < in_batch; b++) { + int in_batch_offset = b * ic4 * 
C4NUM * conv_param->input_h_ * conv_param->input_w_; + int tmp_out_batch_offset = b * out_w_block * out_h_block * out_unit * out_unit * oc8 * C8NUM; + for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_num) { + int out_tile_index = thread_id * TILE_NUM; + int cal_num = output_count - thread_id * tile_num; + cal_num = cal_num > tile_num ? tile_num : cal_num; + WinogradInputTransformFp16(input_data + in_batch_offset, trans_input + task_id * trans_input_offset, + tmp_data + task_id * tmp_data_offset, cal_num, out_tile_index, out_w_block, conv_param, + input_trans_func); + // step 3 : gemm + IndirectGemmFp16_16x8(gemm_out + task_id * gemm_out_offset, trans_input + task_id * trans_input_offset, + trans_weight, NULL, input_unit_square, ic4, oc8 * C8NUM, output_offset, 1, 1, 0, 0); + + // step 4 : output transform + WinogradOutputTransformFp16(gemm_out + task_id * gemm_out_offset, tmp_out_data + tmp_out_batch_offset, bias_data, + cal_num, out_tile_index, out_w_block, conv_param, output_trans_func); + } + } +} + +void UnPackWinogradOutputFp16(const float16_t *src, float16_t *dst, int batch, int height, int width, int channel, + int output_unit) { + int out_h_block_num = UP_DIV(height, output_unit); + int out_w_block_num = UP_DIV(width, output_unit); + int c8 = UP_DIV(channel, C8NUM); + for (int b = 0; b < batch; b++) { + int src_batch_offset = b * c8 * C8NUM * out_h_block_num * output_unit * out_w_block_num * output_unit; + int dst_batch_offset = b * height * width * channel; + for (int h = 0; h < height; h++) { + int src_h_offset = src_batch_offset + C8NUM * (h * out_w_block_num * output_unit); + int dst_h_offset = dst_batch_offset + h * width * channel; + for (int w = 0; w < width; w++) { + int src_w_offset = src_h_offset + w * C8NUM; + int dst_w_offset = dst_h_offset + w * channel; + for (int c = 0; c < c8 - 1; c++) { + int src_c8_offset = src_w_offset + c * C8NUM * out_w_block_num * out_h_block_num * output_unit * output_unit; + int 
dst_c8_offset = dst_w_offset + c * C8NUM; +#ifdef ENABLE_NEON + vst1q_f16(dst + dst_c8_offset, vld1q_f16(src + src_c8_offset)); +#else + for (int i = 0; i < C8NUM; ++i) { + dst[dst_c8_offset + i] = src[src_c8_offset + i]; + } +#endif + } + int c_res = channel - (c8 - 1) * C8NUM; + int src_c_res_offset = (c8 - 1) * C8NUM * out_w_block_num * out_h_block_num * output_unit * output_unit; + int dst_c_res_offset = (c8 - 1) * C8NUM; + for (int c = 0; c < c_res; c++) { + int src_c8_res_offset = src_w_offset + src_c_res_offset + c; + int dst_c8_res_offset = dst_w_offset + dst_c_res_offset + c; + dst[dst_c8_res_offset] = src[src_c8_res_offset]; + } + } + } + } +} diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/conv_fp16.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/conv_fp16.h index a6a6e5674e2..918e82c0207 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/conv_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/conv_fp16.h @@ -18,11 +18,22 @@ #include #include "nnacl/conv_parameter.h" +#include "nnacl/fp16/winograd_utils_fp16.h" +#include "nnacl/fp16/winograd_transform_fp16.h" + +typedef float16_t *TmpBufferAddressFp16; #ifndef ENABLE_NEON void IndirectGemmFp16_16x8(float16_t *output, float16_t *input, float16_t *weight, float16_t *bias, size_t step, - size_t ic4, size_t oc8, size_t offset, size_t mode, size_t writeC4, size_t relu, + size_t ic4, size_t oc8, size_t offset, size_t mode, size_t writeC8, size_t relu, size_t relu6); + +void IndirectGemmFp16_16x8_common(float16_t *output, float16_t *input, float16_t *weight, float16_t *bias, size_t step, + size_t ic4, size_t oc8, size_t offset, size_t relu, size_t relu6); + +void IndirectGemmFp16_16x8_c8(float16_t *output, float16_t *input, float16_t *weight, float16_t *bias, size_t step, + size_t ic4, size_t oc8, size_t offset, size_t mode, size_t writeC8, size_t relu, + size_t relu6); #endif #ifdef __cplusplus @@ -48,6 +59,14 @@ void ConvFp16(float16_t *input_data, float16_t 
*packed_input, float16_t *packed_
 void Conv3x3Fp16(float16_t *input_data, float16_t *transed_weight, const float16_t *bias_data, float16_t *output_data,
                  float16_t *tile_buffer, float16_t *block_unit_buffer, float16_t *tmp_dst_buffer, float16_t *tmp_out,
                  int task_id, ConvParameter *conv_param);
+
+// fp16 convolution winograd
+void ConvWinogardFp16(float16_t *input_data, float16_t *trans_weight, const float16_t *bias_data,
+                      TmpBufferAddressFp16 *buffer_list, int task_id, ConvParameter *conv_param,
+                      InputTransformUnitFp16Func input_trans_func, OutputTransformUnitFp16Func output_trans_func);
+
+void UnPackWinogradOutputFp16(const float16_t *src, float16_t *dst, int batch, int height, int width, int channel,
+                              int output_unit);
 #ifdef __cplusplus
 }
 #endif
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/winograd_transform_fp16.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/winograd_transform_fp16.c
index 29a111103ea..c7d90a70264 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/winograd_transform_fp16.c
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/winograd_transform_fp16.c
@@ -534,3 +534,108 @@ void Conv3x3Fp16OutputTransform(const float16_t *gemm_out, float16_t *out_data,
     }
   }
 }
+
+// fp16 common winograd
+// Gathers cal_num consecutive tiles, starting at tile index out_tile_index, from the channel-blocked input and
+// transforms them. For each tile and each group of C4NUM channels an input_unit x input_unit patch is copied into
+// tmp_data (positions outside the input stay zero), then input_trans_func writes the transformed patch into
+// trans_input. Returns without doing anything when out_w_block_num is 0.
+void WinogradInputTransformFp16(const float16_t *input_data, float16_t *trans_input, float16_t *tmp_data, int cal_num,
+                                int out_tile_index, int out_w_block_num, ConvParameter *conv_param,
+                                InputTransformUnitFp16Func input_trans_func) {
+  int tile_num = 16;
+  int input_unit = conv_param->input_unit_;
+  int output_unit = conv_param->output_unit_;
+  int in_channel = conv_param->input_channel_;
+  int ic4 = UP_DIV(in_channel, C4NUM);
+  int pad_h = conv_param->pad_h_;
+  int pad_w = conv_param->pad_w_;
+  int input_h = conv_param->input_h_;
+  int input_w = conv_param->input_w_;
+  if (out_w_block_num == 0) {
+    return;
+  }
+  for (int c = 0; c < cal_num; c++) {  // actual tiled number
+    int src_x_s = (out_tile_index % out_w_block_num) * output_unit - pad_w;
+    int src_y_s = (out_tile_index / out_w_block_num) * output_unit - pad_h;
+    // Clip the patch to the input: [interval_*_s, interval_*_e) is the part of the patch that lies inside it.
+    int interval_x_s = src_x_s > 0 ? 0 : -src_x_s;
+    int interval_y_s = src_y_s > 0 ? 0 : -src_y_s;
+    int src_x_e = src_x_s + input_unit;
+    int src_y_e = src_y_s + input_unit;
+    int interval_x_e = src_x_e < input_w ? input_unit : (input_w - src_x_s);
+    int interval_y_e = src_y_e < input_h ? input_unit : (input_h - src_y_s);
+
+    int src_plane_offset = ic4 * C4NUM * (src_y_s * input_w + src_x_s);
+    int dst_plane_offset = c * C4NUM;
+    for (int ic = 0; ic < ic4; ic++) {
+      // clear tmp buffer
+      memset(tmp_data, 0, input_unit * input_unit * C4NUM * sizeof(float16_t));
+
+      // get real input block with padding
+      int src_ic4_offset = src_plane_offset + ic * C4NUM;
+      for (int interval = interval_y_s; interval < interval_y_e; interval++) {
+        int src_y_offset = src_ic4_offset + (interval * input_w + interval_x_s) * ic4 * C4NUM;
+        int dst_y_offset = interval * input_unit * C4NUM + interval_x_s * C4NUM;
+        for (int j = 0; j < (interval_x_e - interval_x_s); j++) {
+          int src_x_offset = src_y_offset + j * ic4 * C4NUM;
+          int dst_x_offset = dst_y_offset + j * C4NUM;
+          const float16_t *src_addr = input_data + src_x_offset;
+          float16_t *dst_addr = tmp_data + dst_x_offset;
+#ifdef ENABLE_NEON
+          vst1_f16(dst_addr, vld1_f16(src_addr));
+#else
+          for (int k = 0; k < C4NUM; k++) {
+            dst_addr[k] = src_addr[k];
+          }
+#endif
+        }
+      }
+      // input transform
+      int dst_ic4_offset = dst_plane_offset + ic * tile_num * C4NUM;
+      size_t dst_step = ic4 * C4NUM * tile_num;
+      float16_t *trans_input_ptr = trans_input + dst_ic4_offset;
+      input_trans_func(tmp_data, trans_input_ptr, C4NUM, dst_step);
+    }
+    out_tile_index++;
+  }  // cal_tile_num loop
+}
+
+// Runs the output transform for cal_num consecutive tiles starting at tile index out_tile_index (output_unit_num
+// tiles per row). Tile i is read from gemm_out one C8NUM channel group at a time and written into tmp_out_data,
+// which is addressed as [oc8][padded_h][padded_w][C8NUM] with padded_h/w = unit blocks * output_unit, the same
+// addressing UnPackWinogradOutputFp16 uses to read it. Returns without doing anything when output_unit_num is 0.
+void WinogradOutputTransformFp16(const float16_t *gemm_out, float16_t *tmp_out_data, const float16_t *bias_data,
+                                 int cal_num, int out_tile_index, int output_unit_num, ConvParameter *conv_param,
+                                 OutputTransformUnitFp16Func output_trans_func) {
+  int output_unit = conv_param->output_unit_;
+  int output_w_unit_block = UP_DIV(conv_param->output_w_, output_unit);
+  int output_h_unit_block = UP_DIV(conv_param->output_h_, output_unit);
+  int output_channel = conv_param->output_channel_;
+  int oc8 = UP_DIV(output_channel, C8NUM);
+  int input_unit = conv_param->input_unit_;
+  // Pixels per padded output row, and pixels per padded output plane (one C8NUM channel group).
+  int padded_w = output_w_unit_block * output_unit;
+  int padded_plane = output_h_unit_block * output_unit * padded_w;
+  if (output_unit_num == 0) {
+    return;
+  }
+  for (int i = 0; i < cal_num; i++) {
+    int dst_x_s = out_tile_index % output_unit_num;
+    int dst_y_s = out_tile_index / output_unit_num;
+    int src_tile_offset = i * oc8 * C8NUM * input_unit * input_unit;
+    // Each pixel holds C8NUM values, so the pixel index is scaled by C8NUM (not C4NUM).
+    int dst_tile_offset = C8NUM * output_unit * (dst_x_s + dst_y_s * padded_w);
+
+    for (int j = 0; j < oc8; j++) {
+      int src_oc8_offset = src_tile_offset + j * input_unit * input_unit * C8NUM;
+      // Channel groups are a full padded plane apart: height blocks times width blocks, not width blocks squared.
+      int dst_oc8_offset = dst_tile_offset + j * C8NUM * padded_plane;
+      const float16_t *src_ptr = gemm_out + src_oc8_offset;
+      const float16_t *bias_ptr = bias_data + j * C8NUM;
+      float16_t *dst_ptr = tmp_out_data + dst_oc8_offset;
+      output_trans_func(src_ptr, dst_ptr, bias_ptr, C8NUM, padded_w);
+    }
+    out_tile_index++;
+  }
+}
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/winograd_transform_fp16.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/winograd_transform_fp16.h
index 8a3850ab67b..16bbcd5f1af 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/winograd_transform_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/winograd_transform_fp16.h
@@ -21,6 +21,7 @@
 #include
 #include "nnacl/fp16/pack_fp16.h"
 #include "nnacl/fp16/conv_fp16.h"
+#include "nnacl/fp16/winograd_utils_fp16.h"

 #ifdef __cplusplus
 extern "C" {
@@ -39,6 +40,15 @@ void Conv3x3Fp16OutputUnit(const float16_t *gemm_out, const float16_t *bias_data

 void Conv3x3Fp16OutputTransform(const float16_t *gemm_out, float16_t *out_data, const float16_t *bias_data,
                                 int start_index, int real_cal_num, int out_w_block, ConvParameter *conv_param);
+
+// fp16 common winograd
+void 
WinogradInputTransformFp16(const float16_t *input_data, float16_t *trans_input, float16_t *tmp_data, int cal_num, + int out_tile_index, int out_w_block_num, ConvParameter *conv_param, + InputTransformUnitFp16Func input_trans_func); + +void WinogradOutputTransformFp16(const float16_t *gemm_out, float16_t *tmp_out_data, const float16_t *bias_data, + int cal_num, int out_tile_index, int output_unit_num, ConvParameter *conv_param, + OutputTransformUnitFp16Func output_trans_func); #ifdef __cplusplus } #endif diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/winograd_utils_fp16.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/winograd_utils_fp16.c new file mode 100644 index 00000000000..0ea259fe3c4 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/winograd_utils_fp16.c @@ -0,0 +1,4669 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "nnacl/fp16/winograd_utils_fp16.h" + +#define MIN_UNIT 2 +#define MAX_UNIT 8 + +/* Table of fp16 output transform kernels; slots 0 and 1 are NULL. Presumably indexed by output unit size (slot k holds the 8xk kernel) - confirm against the lookup code. */ static OutputTransformUnitFp16Func outputTransformUnitFp16[] = { + NULL, // 0 + NULL, // 1 + OutputTransform8x2UnitFp16, + OutputTransform8x3UnitFp16, + OutputTransform8x4UnitFp16, + OutputTransform8x5UnitFp16, + OutputTransform8x6UnitFp16, + OutputTransform8x7UnitFp16, +}; + +/* Two-pass 4x4 input transform applied to 8 fp16 lanes at once. Element (r, c) of the source tile is read from src_data + (4 * r + c) * src_step and element (r, c) of the result is written to dst_data + (4 * r + c) * dst_step. Pass one combines rows: t0 = s0 - 4 * s2, t1 = s1 + 2 * s2, t2 = 2 * s2 - s1, t3 = s3 - 0.25 * s1. Pass two applies the same combination across the columns of t to produce m. The ENABLE_ARM branch works on float16x8_t vectors; the fallback branch evaluates the same formulas one lane at a time for C8NUM lanes. */ void InputTransform4x4UnitFp16(const float16_t *src_data, float16_t *dst_data, int src_step, int dst_step) { +#ifdef ENABLE_ARM + float16x8_t src_data_00 = vld1q_f16(src_data + 0 * src_step); + float16x8_t src_data_01 = vld1q_f16(src_data + 1 * src_step); + float16x8_t src_data_02 = vld1q_f16(src_data + 2 * src_step); + float16x8_t src_data_03 = vld1q_f16(src_data + 3 * src_step); + float16x8_t src_data_10 = vld1q_f16(src_data + 4 * src_step); + float16x8_t src_data_11 = vld1q_f16(src_data + 5 * src_step); + float16x8_t src_data_12 = vld1q_f16(src_data + 6 * src_step); + float16x8_t src_data_13 = vld1q_f16(src_data + 7 * src_step); + float16x8_t src_data_20 = vld1q_f16(src_data + 8 * src_step); + float16x8_t src_data_21 = vld1q_f16(src_data + 9 * src_step); + float16x8_t src_data_22 = vld1q_f16(src_data + 10 * src_step); + float16x8_t src_data_23 = vld1q_f16(src_data + 11 * src_step); + float16x8_t src_data_30 = vld1q_f16(src_data + 12 * src_step); + float16x8_t src_data_31 = vld1q_f16(src_data + 13 * src_step); + float16x8_t src_data_32 = vld1q_f16(src_data + 14 * src_step); + float16x8_t src_data_33 = vld1q_f16(src_data + 15 * src_step); + + float16x8_t t00 = vsubq_f16(src_data_00, vmulq_n_f16(src_data_20, 4)); + float16x8_t t01 = vsubq_f16(src_data_01, vmulq_n_f16(src_data_21, 4)); + float16x8_t t02 = vsubq_f16(src_data_02, vmulq_n_f16(src_data_22, 4)); + float16x8_t t03 = vsubq_f16(src_data_03, vmulq_n_f16(src_data_23, 4)); + + float16x8_t t10 = vaddq_f16(src_data_10, vmulq_n_f16(src_data_20, 2)); + float16x8_t t11 = vaddq_f16(src_data_11, vmulq_n_f16(src_data_21, 2)); + float16x8_t t12 = 
vaddq_f16(src_data_12, vmulq_n_f16(src_data_22, 2)); + float16x8_t t13 = vaddq_f16(src_data_13, vmulq_n_f16(src_data_23, 2)); + + float16x8_t t20 = vsubq_f16(vmulq_n_f16(src_data_20, 2), src_data_10); + float16x8_t t21 = vsubq_f16(vmulq_n_f16(src_data_21, 2), src_data_11); + float16x8_t t22 = vsubq_f16(vmulq_n_f16(src_data_22, 2), src_data_12); + float16x8_t t23 = vsubq_f16(vmulq_n_f16(src_data_23, 2), src_data_13); + + float16x8_t t30 = vsubq_f16(src_data_30, vmulq_n_f16(src_data_10, 0.25)); + float16x8_t t31 = vsubq_f16(src_data_31, vmulq_n_f16(src_data_11, 0.25)); + float16x8_t t32 = vsubq_f16(src_data_32, vmulq_n_f16(src_data_12, 0.25)); + float16x8_t t33 = vsubq_f16(src_data_33, vmulq_n_f16(src_data_13, 0.25)); + + float16x8_t m00 = vsubq_f16(t00, vmulq_n_f16(t02, 4)); + float16x8_t m01 = vaddq_f16(t01, vmulq_n_f16(t02, 2)); + float16x8_t m02 = vsubq_f16(vmulq_n_f16(t02, 2), t01); + float16x8_t m03 = vsubq_f16(t03, vmulq_n_f16(t01, 0.25)); + + float16x8_t m10 = vsubq_f16(t10, vmulq_n_f16(t12, 4)); + float16x8_t m11 = vaddq_f16(t11, vmulq_n_f16(t12, 2)); + float16x8_t m12 = vsubq_f16(vmulq_n_f16(t12, 2), t11); + float16x8_t m13 = vsubq_f16(t13, vmulq_n_f16(t11, 0.25)); + + float16x8_t m20 = vsubq_f16(t20, vmulq_n_f16(t22, 4)); + float16x8_t m21 = vaddq_f16(t21, vmulq_n_f16(t22, 2)); + float16x8_t m22 = vsubq_f16(vmulq_n_f16(t22, 2), t21); + float16x8_t m23 = vsubq_f16(t23, vmulq_n_f16(t21, 0.25)); + + float16x8_t m30 = vsubq_f16(t30, vmulq_n_f16(t32, 4)); + float16x8_t m31 = vaddq_f16(t31, vmulq_n_f16(t32, 2)); + float16x8_t m32 = vsubq_f16(vmulq_n_f16(t32, 2), t31); + float16x8_t m33 = vsubq_f16(t33, vmulq_n_f16(t31, 0.25)); + + vst1q_f16(dst_data + 0 * dst_step, m00); + vst1q_f16(dst_data + 1 * dst_step, m01); + vst1q_f16(dst_data + 2 * dst_step, m02); + vst1q_f16(dst_data + 3 * dst_step, m03); + vst1q_f16(dst_data + 4 * dst_step, m10); + vst1q_f16(dst_data + 5 * dst_step, m11); + vst1q_f16(dst_data + 6 * dst_step, m12); + vst1q_f16(dst_data + 7 * dst_step, 
m13); + vst1q_f16(dst_data + 8 * dst_step, m20); + vst1q_f16(dst_data + 9 * dst_step, m21); + vst1q_f16(dst_data + 10 * dst_step, m22); + vst1q_f16(dst_data + 11 * dst_step, m23); + vst1q_f16(dst_data + 12 * dst_step, m30); + vst1q_f16(dst_data + 13 * dst_step, m31); + vst1q_f16(dst_data + 14 * dst_step, m32); + vst1q_f16(dst_data + 15 * dst_step, m33); +#else + for (int i = 0; i < C8NUM; i++) { + float16_t src_data_00 = src_data[i]; + float16_t src_data_01 = src_data[i + src_step]; + float16_t src_data_02 = src_data[i + 2 * src_step]; + float16_t src_data_03 = src_data[i + 3 * src_step]; + float16_t src_data_10 = src_data[i + 4 * src_step]; + float16_t src_data_11 = src_data[i + 5 * src_step]; + float16_t src_data_12 = src_data[i + 6 * src_step]; + float16_t src_data_13 = src_data[i + 7 * src_step]; + float16_t src_data_20 = src_data[i + 8 * src_step]; + float16_t src_data_21 = src_data[i + 9 * src_step]; + float16_t src_data_22 = src_data[i + 10 * src_step]; + float16_t src_data_23 = src_data[i + 11 * src_step]; + float16_t src_data_30 = src_data[i + 12 * src_step]; + float16_t src_data_31 = src_data[i + 13 * src_step]; + float16_t src_data_32 = src_data[i + 14 * src_step]; + float16_t src_data_33 = src_data[i + 15 * src_step]; + + float16_t t00 = src_data_00 - 4 * src_data_20; + float16_t t01 = src_data_01 - 4 * src_data_21; + float16_t t02 = src_data_02 - 4 * src_data_22; + float16_t t03 = src_data_03 - 4 * src_data_23; + + float16_t t10 = src_data_10 + 2 * src_data_20; + float16_t t11 = src_data_11 + 2 * src_data_21; + float16_t t12 = src_data_12 + 2 * src_data_22; + float16_t t13 = src_data_13 + 2 * src_data_23; + + const float16_t t20 = 2 * src_data_20 - src_data_10; + const float16_t t21 = 2 * src_data_21 - src_data_11; + const float16_t t22 = 2 * src_data_22 - src_data_12; + const float16_t t23 = 2 * src_data_23 - src_data_13; + + float16_t t30 = src_data_30 - 0.25f * src_data_10; + float16_t t31 = src_data_31 - 0.25f * src_data_11; + float16_t t32 = 
src_data_32 - 0.25f * src_data_12; + float16_t t33 = src_data_33 - 0.25f * src_data_13; + + float16_t m00 = t00 - 4 * t02; + float16_t m01 = t01 + 2 * t02; + const float16_t m02 = 2 * t02 - t01; + float16_t m03 = t03 - 0.25f * t01; + + float16_t m10 = t10 - 4 * t12; + float16_t m11 = t11 + 2 * t12; + const float16_t m12 = 2 * t12 - t11; + float16_t m13 = t13 - 0.25f * t11; + + float16_t m20 = t20 - 4 * t22; + float16_t m21 = t21 + 2 * t22; + const float16_t m22 = 2 * t22 - t21; + float16_t m23 = t23 - 0.25f * t21; + + float16_t m30 = t30 - 4 * t32; + float16_t m31 = t31 + 2 * t32; + float16_t m32 = 2 * t32 - t31; + float16_t m33 = t33 - 0.25f * t31; + + (dst_data + i)[0] = m00; + (dst_data + i + dst_step)[0] = m01; + (dst_data + i + 2 * dst_step)[0] = m02; + (dst_data + i + 3 * dst_step)[0] = m03; + + (dst_data + i + 4 * dst_step)[0] = m10; + (dst_data + i + 5 * dst_step)[0] = m11; + (dst_data + i + 6 * dst_step)[0] = m12; + (dst_data + i + 7 * dst_step)[0] = m13; + + (dst_data + i + 8 * dst_step)[0] = m20; + (dst_data + i + 9 * dst_step)[0] = m21; + (dst_data + i + 10 * dst_step)[0] = m22; + (dst_data + i + 11 * dst_step)[0] = m23; + + (dst_data + i + 12 * dst_step)[0] = m30; + (dst_data + i + 13 * dst_step)[0] = m31; + (dst_data + i + 14 * dst_step)[0] = m32; + (dst_data + i + 15 * dst_step)[0] = m33; + } +#endif +} + +void InputTransform8x8UnitFp16(const float16_t *src_data, float16_t *dst_data, int src_step, int dst_step) { +#ifdef ENABLE_ARM + float16x8_t src_data_00 = vld1q_f16(src_data + 0 * src_step); + float16x8_t src_data_01 = vld1q_f16(src_data + 1 * src_step); + float16x8_t src_data_02 = vld1q_f16(src_data + 2 * src_step); + float16x8_t src_data_03 = vld1q_f16(src_data + 3 * src_step); + float16x8_t src_data_04 = vld1q_f16(src_data + 4 * src_step); + float16x8_t src_data_05 = vld1q_f16(src_data + 5 * src_step); + float16x8_t src_data_06 = vld1q_f16(src_data + 6 * src_step); + float16x8_t src_data_07 = vld1q_f16(src_data + 7 * src_step); + float16x8_t 
src_data_10 = vld1q_f16(src_data + 8 * src_step); + float16x8_t src_data_11 = vld1q_f16(src_data + 9 * src_step); + float16x8_t src_data_12 = vld1q_f16(src_data + 10 * src_step); + float16x8_t src_data_13 = vld1q_f16(src_data + 11 * src_step); + float16x8_t src_data_14 = vld1q_f16(src_data + 12 * src_step); + float16x8_t src_data_15 = vld1q_f16(src_data + 13 * src_step); + float16x8_t src_data_16 = vld1q_f16(src_data + 14 * src_step); + float16x8_t src_data_17 = vld1q_f16(src_data + 15 * src_step); + float16x8_t src_data_20 = vld1q_f16(src_data + 16 * src_step); + float16x8_t src_data_21 = vld1q_f16(src_data + 17 * src_step); + float16x8_t src_data_22 = vld1q_f16(src_data + 18 * src_step); + float16x8_t src_data_23 = vld1q_f16(src_data + 19 * src_step); + float16x8_t src_data_24 = vld1q_f16(src_data + 20 * src_step); + float16x8_t src_data_25 = vld1q_f16(src_data + 21 * src_step); + float16x8_t src_data_26 = vld1q_f16(src_data + 22 * src_step); + float16x8_t src_data_27 = vld1q_f16(src_data + 23 * src_step); + float16x8_t src_data_30 = vld1q_f16(src_data + 24 * src_step); + float16x8_t src_data_31 = vld1q_f16(src_data + 25 * src_step); + float16x8_t src_data_32 = vld1q_f16(src_data + 26 * src_step); + float16x8_t src_data_33 = vld1q_f16(src_data + 27 * src_step); + float16x8_t src_data_34 = vld1q_f16(src_data + 28 * src_step); + float16x8_t src_data_35 = vld1q_f16(src_data + 29 * src_step); + float16x8_t src_data_36 = vld1q_f16(src_data + 30 * src_step); + float16x8_t src_data_37 = vld1q_f16(src_data + 31 * src_step); + float16x8_t src_data_40 = vld1q_f16(src_data + 32 * src_step); + float16x8_t src_data_41 = vld1q_f16(src_data + 33 * src_step); + float16x8_t src_data_42 = vld1q_f16(src_data + 34 * src_step); + float16x8_t src_data_43 = vld1q_f16(src_data + 35 * src_step); + float16x8_t src_data_44 = vld1q_f16(src_data + 36 * src_step); + float16x8_t src_data_45 = vld1q_f16(src_data + 37 * src_step); + float16x8_t src_data_46 = vld1q_f16(src_data + 38 * src_step); 
+ float16x8_t src_data_47 = vld1q_f16(src_data + 39 * src_step); + float16x8_t src_data_50 = vld1q_f16(src_data + 40 * src_step); + float16x8_t src_data_51 = vld1q_f16(src_data + 41 * src_step); + float16x8_t src_data_52 = vld1q_f16(src_data + 42 * src_step); + float16x8_t src_data_53 = vld1q_f16(src_data + 43 * src_step); + float16x8_t src_data_54 = vld1q_f16(src_data + 44 * src_step); + float16x8_t src_data_55 = vld1q_f16(src_data + 45 * src_step); + float16x8_t src_data_56 = vld1q_f16(src_data + 46 * src_step); + float16x8_t src_data_57 = vld1q_f16(src_data + 47 * src_step); + float16x8_t src_data_60 = vld1q_f16(src_data + 48 * src_step); + float16x8_t src_data_61 = vld1q_f16(src_data + 49 * src_step); + float16x8_t src_data_62 = vld1q_f16(src_data + 50 * src_step); + float16x8_t src_data_63 = vld1q_f16(src_data + 51 * src_step); + float16x8_t src_data_64 = vld1q_f16(src_data + 52 * src_step); + float16x8_t src_data_65 = vld1q_f16(src_data + 53 * src_step); + float16x8_t src_data_66 = vld1q_f16(src_data + 54 * src_step); + float16x8_t src_data_67 = vld1q_f16(src_data + 55 * src_step); + float16x8_t src_data_70 = vld1q_f16(src_data + 56 * src_step); + float16x8_t src_data_71 = vld1q_f16(src_data + 57 * src_step); + float16x8_t src_data_72 = vld1q_f16(src_data + 58 * src_step); + float16x8_t src_data_73 = vld1q_f16(src_data + 59 * src_step); + float16x8_t src_data_74 = vld1q_f16(src_data + 60 * src_step); + float16x8_t src_data_75 = vld1q_f16(src_data + 61 * src_step); + float16x8_t src_data_76 = vld1q_f16(src_data + 62 * src_step); + float16x8_t src_data_77 = vld1q_f16(src_data + 63 * src_step); + + float16x8_t t00 = vsubq_f16(vaddq_f16(vsubq_f16(src_data_00, vmulq_n_f16(src_data_20, 5.44444444444444444444444445)), + vmulq_n_f16(src_data_40, 6.222222222222)), + vmulq_n_f16(src_data_60, 1.7777777777777)); + float16x8_t t01 = vsubq_f16(vaddq_f16(vsubq_f16(src_data_01, vmulq_n_f16(src_data_21, 5.44444444444444444444444445)), + vmulq_n_f16(src_data_41, 
6.222222222222)), + vmulq_n_f16(src_data_61, 1.7777777777777)); + float16x8_t t02 = vsubq_f16(vaddq_f16(vsubq_f16(src_data_02, vmulq_n_f16(src_data_22, 5.44444444444444444444444445)), + vmulq_n_f16(src_data_42, 6.222222222222)), + vmulq_n_f16(src_data_62, 1.7777777777777)); + float16x8_t t03 = vsubq_f16(vaddq_f16(vsubq_f16(src_data_03, vmulq_n_f16(src_data_23, 5.44444444444444444444444445)), + vmulq_n_f16(src_data_43, 6.222222222222)), + vmulq_n_f16(src_data_63, 1.7777777777777)); + float16x8_t t04 = vsubq_f16(vaddq_f16(vsubq_f16(src_data_04, vmulq_n_f16(src_data_24, 5.44444444444444444444444445)), + vmulq_n_f16(src_data_44, 6.222222222222)), + vmulq_n_f16(src_data_64, 1.7777777777777)); + float16x8_t t05 = vsubq_f16(vaddq_f16(vsubq_f16(src_data_05, vmulq_n_f16(src_data_25, 5.44444444444444444444444445)), + vmulq_n_f16(src_data_45, 6.222222222222)), + vmulq_n_f16(src_data_65, 1.7777777777777)); + float16x8_t t06 = vsubq_f16(vaddq_f16(vsubq_f16(src_data_06, vmulq_n_f16(src_data_26, 5.44444444444444444444444445)), + vmulq_n_f16(src_data_46, 6.222222222222)), + vmulq_n_f16(src_data_66, 1.7777777777777)); + float16x8_t t07 = vsubq_f16(vaddq_f16(vsubq_f16(src_data_07, vmulq_n_f16(src_data_27, 5.44444444444444444444444445)), + vmulq_n_f16(src_data_47, 6.222222222222)), + vmulq_n_f16(src_data_67, 1.7777777777777)); + + float16x8_t t10 = + vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_10, 1.5), vmulq_n_f16(src_data_20, 3)), + vmulq_n_f16(src_data_30, 2.166666666666666667)), + vmulq_n_f16(src_data_40, 4.333333333333)), + vmulq_n_f16(src_data_50, 0.66666666666)), + vmulq_n_f16(src_data_60, 1.333333333333)); + float16x8_t t11 = + vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_11, 1.5), vmulq_n_f16(src_data_21, 3)), + vmulq_n_f16(src_data_31, 2.166666666666666667)), + vmulq_n_f16(src_data_41, 4.333333333333)), + vmulq_n_f16(src_data_51, 0.66666666666)), + vmulq_n_f16(src_data_61, 1.333333333333)); + float16x8_t t12 = + 
vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_12, 1.5), vmulq_n_f16(src_data_22, 3)), + vmulq_n_f16(src_data_32, 2.166666666666666667)), + vmulq_n_f16(src_data_42, 4.333333333333)), + vmulq_n_f16(src_data_52, 0.66666666666)), + vmulq_n_f16(src_data_62, 1.333333333333)); + float16x8_t t13 = + vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_13, 1.5), vmulq_n_f16(src_data_23, 3)), + vmulq_n_f16(src_data_33, 2.166666666666666667)), + vmulq_n_f16(src_data_43, 4.333333333333)), + vmulq_n_f16(src_data_53, 0.66666666666)), + vmulq_n_f16(src_data_63, 1.333333333333)); + float16x8_t t14 = + vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_14, 1.5), vmulq_n_f16(src_data_24, 3)), + vmulq_n_f16(src_data_34, 2.166666666666666667)), + vmulq_n_f16(src_data_44, 4.333333333333)), + vmulq_n_f16(src_data_54, 0.66666666666)), + vmulq_n_f16(src_data_64, 1.333333333333)); + float16x8_t t15 = + vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_15, 1.5), vmulq_n_f16(src_data_25, 3)), + vmulq_n_f16(src_data_35, 2.166666666666666667)), + vmulq_n_f16(src_data_45, 4.333333333333)), + vmulq_n_f16(src_data_55, 0.66666666666)), + vmulq_n_f16(src_data_65, 1.333333333333)); + float16x8_t t16 = + vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_16, 1.5), vmulq_n_f16(src_data_26, 3)), + vmulq_n_f16(src_data_36, 2.166666666666666667)), + vmulq_n_f16(src_data_46, 4.333333333333)), + vmulq_n_f16(src_data_56, 0.66666666666)), + vmulq_n_f16(src_data_66, 1.333333333333)); + float16x8_t t17 = + vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_17, 1.5), vmulq_n_f16(src_data_27, 3)), + vmulq_n_f16(src_data_37, 2.166666666666666667)), + vmulq_n_f16(src_data_47, 4.333333333333)), + vmulq_n_f16(src_data_57, 0.66666666666)), + vmulq_n_f16(src_data_67, 1.333333333333)); + + float16x8_t t20 = + vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(src_data_10, -1.5), 
vmulq_n_f16(src_data_20, 3)), + vmulq_n_f16(src_data_30, 2.166666666666666667)), + vmulq_n_f16(src_data_40, 4.333333333333)), + vmulq_n_f16(src_data_50, 0.66666666666)), + vmulq_n_f16(src_data_60, 1.333333333333)); + float16x8_t t21 = + vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(src_data_11, -1.5), vmulq_n_f16(src_data_21, 3)), + vmulq_n_f16(src_data_31, 2.166666666666666667)), + vmulq_n_f16(src_data_41, 4.333333333333)), + vmulq_n_f16(src_data_51, 0.66666666666)), + vmulq_n_f16(src_data_61, 1.333333333333)); + float16x8_t t22 = + vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(src_data_12, -1.5), vmulq_n_f16(src_data_22, 3)), + vmulq_n_f16(src_data_32, 2.166666666666666667)), + vmulq_n_f16(src_data_42, 4.333333333333)), + vmulq_n_f16(src_data_52, 0.66666666666)), + vmulq_n_f16(src_data_62, 1.333333333333)); + float16x8_t t23 = + vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(src_data_13, -1.5), vmulq_n_f16(src_data_23, 3)), + vmulq_n_f16(src_data_33, 2.166666666666666667)), + vmulq_n_f16(src_data_43, 4.333333333333)), + vmulq_n_f16(src_data_53, 0.66666666666)), + vmulq_n_f16(src_data_63, 1.333333333333)); + float16x8_t t24 = + vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(src_data_14, -1.5), vmulq_n_f16(src_data_24, 3)), + vmulq_n_f16(src_data_34, 2.166666666666666667)), + vmulq_n_f16(src_data_44, 4.333333333333)), + vmulq_n_f16(src_data_54, 0.66666666666)), + vmulq_n_f16(src_data_64, 1.333333333333)); + float16x8_t t25 = + vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(src_data_15, -1.5), vmulq_n_f16(src_data_25, 3)), + vmulq_n_f16(src_data_35, 2.166666666666666667)), + vmulq_n_f16(src_data_45, 4.333333333333)), + vmulq_n_f16(src_data_55, 0.66666666666)), + vmulq_n_f16(src_data_65, 1.333333333333)); + float16x8_t t26 = + vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(src_data_16, -1.5), vmulq_n_f16(src_data_26, 3)), + vmulq_n_f16(src_data_36, 2.166666666666666667)), + 
vmulq_n_f16(src_data_46, 4.333333333333)), + vmulq_n_f16(src_data_56, 0.66666666666)), + vmulq_n_f16(src_data_66, 1.333333333333)); + float16x8_t t27 = + vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(src_data_17, -1.5), vmulq_n_f16(src_data_27, 3)), + vmulq_n_f16(src_data_37, 2.166666666666666667)), + vmulq_n_f16(src_data_47, 4.333333333333)), + vmulq_n_f16(src_data_57, 0.66666666666)), + vmulq_n_f16(src_data_67, 1.333333333333)); + + float16x8_t t30 = vsubq_f16(vaddq_f16(vmulq_n_f16(vaddq_f16(src_data_30, src_data_40), 1.3333333333333), + vmulq_n_f16(vaddq_f16(src_data_10, src_data_20), -0.3)), + vmulq_n_f16(vaddq_f16(src_data_50, src_data_60), 0.53333333333)); + float16x8_t t31 = vsubq_f16(vaddq_f16(vmulq_n_f16(vaddq_f16(src_data_31, src_data_41), 1.3333333333333), + vmulq_n_f16(vaddq_f16(src_data_11, src_data_21), -0.3)), + vmulq_n_f16(vaddq_f16(src_data_51, src_data_61), 0.53333333333)); + float16x8_t t32 = vsubq_f16(vaddq_f16(vmulq_n_f16(vaddq_f16(src_data_32, src_data_42), 1.3333333333333), + vmulq_n_f16(vaddq_f16(src_data_12, src_data_22), -0.3)), + vmulq_n_f16(vaddq_f16(src_data_52, src_data_62), 0.53333333333)); + float16x8_t t33 = vsubq_f16(vaddq_f16(vmulq_n_f16(vaddq_f16(src_data_33, src_data_43), 1.3333333333333), + vmulq_n_f16(vaddq_f16(src_data_13, src_data_23), -0.3)), + vmulq_n_f16(vaddq_f16(src_data_53, src_data_63), 0.53333333333)); + float16x8_t t34 = vsubq_f16(vaddq_f16(vmulq_n_f16(vaddq_f16(src_data_34, src_data_44), 1.3333333333333), + vmulq_n_f16(vaddq_f16(src_data_14, src_data_24), -0.3)), + vmulq_n_f16(vaddq_f16(src_data_54, src_data_64), 0.53333333333)); + float16x8_t t35 = vsubq_f16(vaddq_f16(vmulq_n_f16(vaddq_f16(src_data_35, src_data_45), 1.3333333333333), + vmulq_n_f16(vaddq_f16(src_data_15, src_data_25), -0.3)), + vmulq_n_f16(vaddq_f16(src_data_55, src_data_65), 0.53333333333)); + float16x8_t t36 = vsubq_f16(vaddq_f16(vmulq_n_f16(vaddq_f16(src_data_36, src_data_46), 1.3333333333333), + 
vmulq_n_f16(vaddq_f16(src_data_16, src_data_26), -0.3)), + vmulq_n_f16(vaddq_f16(src_data_56, src_data_66), 0.53333333333)); + float16x8_t t37 = vsubq_f16(vaddq_f16(vmulq_n_f16(vaddq_f16(src_data_37, src_data_47), 1.3333333333333), + vmulq_n_f16(vaddq_f16(src_data_17, src_data_27), -0.3)), + vmulq_n_f16(vaddq_f16(src_data_57, src_data_67), 0.53333333333)); + + float16x8_t t40 = vaddq_f16(vaddq_f16(vmulq_n_f16(vsubq_f16(src_data_40, src_data_30), 1.3333333333333), + vmulq_n_f16(vsubq_f16(src_data_10, src_data_20), 0.3)), + vmulq_n_f16(vsubq_f16(src_data_50, src_data_60), 0.53333333333)); + float16x8_t t41 = vaddq_f16(vaddq_f16(vmulq_n_f16(vsubq_f16(src_data_41, src_data_31), 1.3333333333333), + vmulq_n_f16(vsubq_f16(src_data_11, src_data_21), 0.3)), + vmulq_n_f16(vsubq_f16(src_data_51, src_data_61), 0.53333333333)); + float16x8_t t42 = vaddq_f16(vaddq_f16(vmulq_n_f16(vsubq_f16(src_data_42, src_data_32), 1.3333333333333), + vmulq_n_f16(vsubq_f16(src_data_12, src_data_22), 0.3)), + vmulq_n_f16(vsubq_f16(src_data_52, src_data_62), 0.53333333333)); + float16x8_t t43 = vaddq_f16(vaddq_f16(vmulq_n_f16(vsubq_f16(src_data_43, src_data_33), 1.3333333333333), + vmulq_n_f16(vsubq_f16(src_data_13, src_data_23), 0.3)), + vmulq_n_f16(vsubq_f16(src_data_53, src_data_63), 0.53333333333)); + float16x8_t t44 = vaddq_f16(vaddq_f16(vmulq_n_f16(vsubq_f16(src_data_44, src_data_34), 1.3333333333333), + vmulq_n_f16(vsubq_f16(src_data_14, src_data_24), 0.3)), + vmulq_n_f16(vsubq_f16(src_data_54, src_data_64), 0.53333333333)); + float16x8_t t45 = vaddq_f16(vaddq_f16(vmulq_n_f16(vsubq_f16(src_data_45, src_data_35), 1.3333333333333), + vmulq_n_f16(vsubq_f16(src_data_15, src_data_25), 0.3)), + vmulq_n_f16(vsubq_f16(src_data_55, src_data_65), 0.53333333333)); + float16x8_t t46 = vaddq_f16(vaddq_f16(vmulq_n_f16(vsubq_f16(src_data_46, src_data_36), 1.3333333333333), + vmulq_n_f16(vsubq_f16(src_data_16, src_data_26), 0.3)), + vmulq_n_f16(vsubq_f16(src_data_56, src_data_66), 0.53333333333)); + 
float16x8_t t47 = vaddq_f16(vaddq_f16(vmulq_n_f16(vsubq_f16(src_data_47, src_data_37), 1.3333333333333), + vmulq_n_f16(vsubq_f16(src_data_17, src_data_27), 0.3)), + vmulq_n_f16(vsubq_f16(src_data_57, src_data_67), 0.53333333333)); + + float16x8_t t50 = vaddq_f16( + vaddq_f16( + vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_10, 0.03333333), vmulq_n_f16(src_data_20, 0.022222222)), + vmulq_n_f16(src_data_30, 0.1666666666)), + vmulq_n_f16(src_data_40, 0.11111111111)), + vmulq_n_f16(src_data_50, 0.133333333)), + vmulq_n_f16(src_data_60, 0.088888888)); + float16x8_t t51 = vaddq_f16( + vaddq_f16( + vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_11, 0.03333333), vmulq_n_f16(src_data_21, 0.022222222)), + vmulq_n_f16(src_data_31, 0.1666666666)), + vmulq_n_f16(src_data_41, 0.11111111111)), + vmulq_n_f16(src_data_51, 0.133333333)), + vmulq_n_f16(src_data_61, 0.088888888)); + float16x8_t t52 = vaddq_f16( + vaddq_f16( + vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_12, 0.03333333), vmulq_n_f16(src_data_22, 0.022222222)), + vmulq_n_f16(src_data_32, 0.1666666666)), + vmulq_n_f16(src_data_42, 0.11111111111)), + vmulq_n_f16(src_data_52, 0.133333333)), + vmulq_n_f16(src_data_62, 0.088888888)); + float16x8_t t53 = vaddq_f16( + vaddq_f16( + vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_13, 0.03333333), vmulq_n_f16(src_data_23, 0.022222222)), + vmulq_n_f16(src_data_33, 0.1666666666)), + vmulq_n_f16(src_data_43, 0.11111111111)), + vmulq_n_f16(src_data_53, 0.133333333)), + vmulq_n_f16(src_data_63, 0.088888888)); + float16x8_t t54 = vaddq_f16( + vaddq_f16( + vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_14, 0.03333333), vmulq_n_f16(src_data_24, 0.022222222)), + vmulq_n_f16(src_data_34, 0.1666666666)), + vmulq_n_f16(src_data_44, 0.11111111111)), + vmulq_n_f16(src_data_54, 0.133333333)), + vmulq_n_f16(src_data_64, 0.088888888)); + float16x8_t t55 = vaddq_f16( + vaddq_f16( + vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_15, 0.03333333), vmulq_n_f16(src_data_25, 
0.022222222)), + vmulq_n_f16(src_data_35, 0.1666666666)), + vmulq_n_f16(src_data_45, 0.11111111111)), + vmulq_n_f16(src_data_55, 0.133333333)), + vmulq_n_f16(src_data_65, 0.088888888)); + float16x8_t t56 = vaddq_f16( + vaddq_f16( + vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_16, 0.03333333), vmulq_n_f16(src_data_26, 0.022222222)), + vmulq_n_f16(src_data_36, 0.1666666666)), + vmulq_n_f16(src_data_46, 0.11111111111)), + vmulq_n_f16(src_data_56, 0.133333333)), + vmulq_n_f16(src_data_66, 0.088888888)); + float16x8_t t57 = vaddq_f16( + vaddq_f16( + vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_17, 0.03333333), vmulq_n_f16(src_data_27, 0.022222222)), + vmulq_n_f16(src_data_37, 0.1666666666)), + vmulq_n_f16(src_data_47, 0.11111111111)), + vmulq_n_f16(src_data_57, 0.133333333)), + vmulq_n_f16(src_data_67, 0.088888888)); + + float16x8_t t60 = vaddq_f16( + vaddq_f16( + vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(src_data_10, -0.03333333), vmulq_n_f16(src_data_20, 0.022222222)), + vmulq_n_f16(src_data_30, 0.1666666666)), + vmulq_n_f16(src_data_40, 0.11111111111)), + vmulq_n_f16(src_data_50, -0.133333333)), + vmulq_n_f16(src_data_60, 0.088888888)); + float16x8_t t61 = vaddq_f16( + vaddq_f16( + vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(src_data_11, -0.03333333), vmulq_n_f16(src_data_21, 0.022222222)), + vmulq_n_f16(src_data_31, 0.1666666666)), + vmulq_n_f16(src_data_41, 0.11111111111)), + vmulq_n_f16(src_data_51, -0.133333333)), + vmulq_n_f16(src_data_61, 0.088888888)); + float16x8_t t62 = vaddq_f16( + vaddq_f16( + vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(src_data_12, -0.03333333), vmulq_n_f16(src_data_22, 0.022222222)), + vmulq_n_f16(src_data_32, 0.1666666666)), + vmulq_n_f16(src_data_42, 0.11111111111)), + vmulq_n_f16(src_data_52, -0.133333333)), + vmulq_n_f16(src_data_62, 0.088888888)); + float16x8_t t63 = vaddq_f16( + vaddq_f16( + vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(src_data_13, -0.03333333), vmulq_n_f16(src_data_23, 0.022222222)), + 
vmulq_n_f16(src_data_33, 0.1666666666)), + vmulq_n_f16(src_data_43, 0.11111111111)), + vmulq_n_f16(src_data_53, -0.133333333)), + vmulq_n_f16(src_data_63, 0.088888888)); + float16x8_t t64 = vaddq_f16( + vaddq_f16( + vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(src_data_14, -0.03333333), vmulq_n_f16(src_data_24, 0.022222222)), + vmulq_n_f16(src_data_34, 0.1666666666)), + vmulq_n_f16(src_data_44, 0.11111111111)), + vmulq_n_f16(src_data_54, -0.133333333)), + vmulq_n_f16(src_data_64, 0.088888888)); + float16x8_t t65 = vaddq_f16( + vaddq_f16( + vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(src_data_15, -0.03333333), vmulq_n_f16(src_data_25, 0.022222222)), + vmulq_n_f16(src_data_35, 0.1666666666)), + vmulq_n_f16(src_data_45, 0.11111111111)), + vmulq_n_f16(src_data_55, -0.133333333)), + vmulq_n_f16(src_data_65, 0.088888888)); + float16x8_t t66 = vaddq_f16( + vaddq_f16( + vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(src_data_16, -0.03333333), vmulq_n_f16(src_data_26, 0.022222222)), + vmulq_n_f16(src_data_36, 0.1666666666)), + vmulq_n_f16(src_data_46, 0.11111111111)), + vmulq_n_f16(src_data_56, -0.133333333)), + vmulq_n_f16(src_data_66, 0.088888888)); + float16x8_t t67 = vaddq_f16( + vaddq_f16( + vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(src_data_17, -0.03333333), vmulq_n_f16(src_data_27, 0.022222222)), + vmulq_n_f16(src_data_37, 0.1666666666)), + vmulq_n_f16(src_data_47, 0.11111111111)), + vmulq_n_f16(src_data_57, -0.133333333)), + vmulq_n_f16(src_data_67, 0.088888888)); + + float16x8_t t70 = vaddq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_30, 3.0625), vmulq_n_f16(src_data_10, -0.5625)), + vmulq_n_f16(src_data_50, 3.5)), + src_data_70); + float16x8_t t71 = vaddq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_31, 3.0625), vmulq_n_f16(src_data_11, -0.5625)), + vmulq_n_f16(src_data_51, 3.5)), + src_data_71); + float16x8_t t72 = vaddq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_32, 3.0625), vmulq_n_f16(src_data_12, -0.5625)), + vmulq_n_f16(src_data_52, 3.5)), + src_data_72); + 
float16x8_t t73 = vaddq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_33, 3.0625), vmulq_n_f16(src_data_13, -0.5625)), + vmulq_n_f16(src_data_53, 3.5)), + src_data_73); + float16x8_t t74 = vaddq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_34, 3.0625), vmulq_n_f16(src_data_14, -0.5625)), + vmulq_n_f16(src_data_54, 3.5)), + src_data_74); + float16x8_t t75 = vaddq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_35, 3.0625), vmulq_n_f16(src_data_15, -0.5625)), + vmulq_n_f16(src_data_55, 3.5)), + src_data_75); + float16x8_t t76 = vaddq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_36, 3.0625), vmulq_n_f16(src_data_16, -0.5625)), + vmulq_n_f16(src_data_56, 3.5)), + src_data_76); + float16x8_t t77 = vaddq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_37, 3.0625), vmulq_n_f16(src_data_17, -0.5625)), + vmulq_n_f16(src_data_57, 3.5)), + src_data_77); + + float16x8_t m00 = + vsubq_f16(vaddq_f16(vsubq_f16(t00, vmulq_n_f16(t02, 5.444444444444444)), vmulq_n_f16(t04, 6.22222222222)), + vmulq_n_f16(t06, 1.77777777777777777778)); + float16x8_t m01 = vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t01, 1.5), vmulq_n_f16(t02, 3)), + vmulq_n_f16(t03, 2.16666666666666667)), + vmulq_n_f16(t04, 4.3333333333)), + vmulq_n_f16(t05, 0.66666666667)), + vmulq_n_f16(t06, 1.333333333333)); + float16x8_t m02 = vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(t01, -1.5), vmulq_n_f16(t02, 3)), + vmulq_n_f16(t03, 2.16666666666666667)), + vmulq_n_f16(t04, 4.3333333333)), + vmulq_n_f16(t05, 0.66666666667)), + vmulq_n_f16(t06, 1.333333333333)); + float16x8_t m03 = + vaddq_f16(vaddq_f16(vmulq_n_f16(vaddq_f16(t01, t02), -0.3), vmulq_n_f16(vaddq_f16(t03, t04), 1.33333333333)), + vmulq_n_f16(vaddq_f16(t05, t06), -0.533333333333)); + float16x8_t m04 = + vaddq_f16(vaddq_f16(vmulq_n_f16(vsubq_f16(t01, t02), 0.3), vmulq_n_f16(vsubq_f16(t04, t03), 1.33333333333)), + vmulq_n_f16(vsubq_f16(t05, t06), 0.533333333333)); + float16x8_t m05 = + 
vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t01, 0.03333333), vmulq_n_f16(t02, 0.0222222)), + vmulq_n_f16(t03, 0.16666666666666667)), + vmulq_n_f16(t04, 0.11111111111)), + vmulq_n_f16(t05, 0.1333333333)), + vmulq_n_f16(t06, 0.08888888888)); + float16x8_t m06 = + vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(t01, -0.03333333), vmulq_n_f16(t02, 0.0222222)), + vmulq_n_f16(t03, 0.16666666666666667)), + vmulq_n_f16(t04, 0.11111111111)), + vmulq_n_f16(t05, 0.1333333333)), + vmulq_n_f16(t06, 0.08888888888)); + float16x8_t m07 = + vaddq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t01, -0.5625), vmulq_n_f16(t03, 3.0625)), vmulq_n_f16(t05, 3.5)), t07); + + float16x8_t m10 = + vsubq_f16(vaddq_f16(vsubq_f16(t10, vmulq_n_f16(t12, 5.444444444444444)), vmulq_n_f16(t14, 6.22222222222)), + vmulq_n_f16(t16, 1.77777777777777777778)); + float16x8_t m11 = vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t11, 1.5), vmulq_n_f16(t12, 3)), + vmulq_n_f16(t13, 2.16666666666666667)), + vmulq_n_f16(t14, 4.3333333333)), + vmulq_n_f16(t15, 0.66666666667)), + vmulq_n_f16(t16, 1.333333333333)); + float16x8_t m12 = vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(t11, -1.5), vmulq_n_f16(t12, 3)), + vmulq_n_f16(t13, 2.16666666666666667)), + vmulq_n_f16(t14, 4.3333333333)), + vmulq_n_f16(t15, 0.66666666667)), + vmulq_n_f16(t16, 1.333333333333)); + float16x8_t m13 = + vaddq_f16(vaddq_f16(vmulq_n_f16(vaddq_f16(t11, t12), -0.3), vmulq_n_f16(vaddq_f16(t13, t14), 1.33333333333)), + vmulq_n_f16(vaddq_f16(t15, t16), -0.533333333333)); + float16x8_t m14 = + vaddq_f16(vaddq_f16(vmulq_n_f16(vsubq_f16(t11, t12), 0.3), vmulq_n_f16(vsubq_f16(t14, t13), 1.33333333333)), + vmulq_n_f16(vsubq_f16(t15, t16), 0.533333333333)); + float16x8_t m15 = + vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t11, 0.03333333), vmulq_n_f16(t12, 0.0222222)), + vmulq_n_f16(t13, 0.16666666666666667)), + vmulq_n_f16(t14, 0.11111111111)), + vmulq_n_f16(t15, 0.1333333333)), 
+ vmulq_n_f16(t16, 0.08888888888)); + float16x8_t m16 = + vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(t11, -0.03333333), vmulq_n_f16(t12, 0.0222222)), + vmulq_n_f16(t13, 0.16666666666666667)), + vmulq_n_f16(t14, 0.11111111111)), + vmulq_n_f16(t15, 0.1333333333)), + vmulq_n_f16(t16, 0.08888888888)); + float16x8_t m17 = + vaddq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t11, -0.5625), vmulq_n_f16(t13, 3.0625)), vmulq_n_f16(t15, 3.5)), t17); + + float16x8_t m20 = + vsubq_f16(vaddq_f16(vsubq_f16(t20, vmulq_n_f16(t22, 5.444444444444444)), vmulq_n_f16(t24, 6.22222222222)), + vmulq_n_f16(t26, 1.77777777777777777778)); + float16x8_t m21 = vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t21, 1.5), vmulq_n_f16(t22, 3)), + vmulq_n_f16(t23, 2.16666666666666667)), + vmulq_n_f16(t24, 4.3333333333)), + vmulq_n_f16(t25, 0.66666666667)), + vmulq_n_f16(t26, 1.333333333333)); + float16x8_t m22 = vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(t21, -1.5), vmulq_n_f16(t22, 3)), + vmulq_n_f16(t23, 2.16666666666666667)), + vmulq_n_f16(t24, 4.3333333333)), + vmulq_n_f16(t25, 0.66666666667)), + vmulq_n_f16(t26, 1.333333333333)); + float16x8_t m23 = + vaddq_f16(vaddq_f16(vmulq_n_f16(vaddq_f16(t21, t22), -0.3), vmulq_n_f16(vaddq_f16(t23, t24), 1.33333333333)), + vmulq_n_f16(vaddq_f16(t25, t26), -0.533333333333)); + float16x8_t m24 = + vaddq_f16(vaddq_f16(vmulq_n_f16(vsubq_f16(t21, t22), 0.3), vmulq_n_f16(vsubq_f16(t24, t23), 1.33333333333)), + vmulq_n_f16(vsubq_f16(t25, t26), 0.533333333333)); + float16x8_t m25 = + vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t21, 0.03333333), vmulq_n_f16(t22, 0.0222222)), + vmulq_n_f16(t23, 0.16666666666666667)), + vmulq_n_f16(t24, 0.11111111111)), + vmulq_n_f16(t25, 0.1333333333)), + vmulq_n_f16(t26, 0.08888888888)); + float16x8_t m26 = + vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(t21, -0.03333333), vmulq_n_f16(t22, 0.0222222)), + vmulq_n_f16(t23, 0.16666666666666667)), + 
vmulq_n_f16(t24, 0.11111111111)), + vmulq_n_f16(t25, 0.1333333333)), + vmulq_n_f16(t26, 0.08888888888)); + float16x8_t m27 = + vaddq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t21, -0.5625), vmulq_n_f16(t23, 3.0625)), vmulq_n_f16(t25, 3.5)), t27); + + float16x8_t m30 = + vsubq_f16(vaddq_f16(vsubq_f16(t30, vmulq_n_f16(t32, 5.444444444444444)), vmulq_n_f16(t34, 6.22222222222)), + vmulq_n_f16(t36, 1.77777777777777777778)); + float16x8_t m31 = vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t31, 1.5), vmulq_n_f16(t32, 3)), + vmulq_n_f16(t33, 2.16666666666666667)), + vmulq_n_f16(t34, 4.3333333333)), + vmulq_n_f16(t35, 0.66666666667)), + vmulq_n_f16(t36, 1.333333333333)); + float16x8_t m32 = vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(t31, -1.5), vmulq_n_f16(t32, 3)), + vmulq_n_f16(t33, 2.16666666666666667)), + vmulq_n_f16(t34, 4.3333333333)), + vmulq_n_f16(t35, 0.66666666667)), + vmulq_n_f16(t36, 1.333333333333)); + float16x8_t m33 = + vaddq_f16(vaddq_f16(vmulq_n_f16(vaddq_f16(t31, t32), -0.3), vmulq_n_f16(vaddq_f16(t33, t34), 1.33333333333)), + vmulq_n_f16(vaddq_f16(t35, t36), -0.533333333333)); + float16x8_t m34 = + vaddq_f16(vaddq_f16(vmulq_n_f16(vsubq_f16(t31, t32), 0.3), vmulq_n_f16(vsubq_f16(t34, t33), 1.33333333333)), + vmulq_n_f16(vsubq_f16(t35, t36), 0.533333333333)); + float16x8_t m35 = + vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t31, 0.03333333), vmulq_n_f16(t32, 0.0222222)), + vmulq_n_f16(t33, 0.16666666666666667)), + vmulq_n_f16(t34, 0.11111111111)), + vmulq_n_f16(t35, 0.1333333333)), + vmulq_n_f16(t36, 0.08888888888)); + float16x8_t m36 = + vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(t31, -0.03333333), vmulq_n_f16(t32, 0.0222222)), + vmulq_n_f16(t33, 0.16666666666666667)), + vmulq_n_f16(t34, 0.11111111111)), + vmulq_n_f16(t35, 0.1333333333)), + vmulq_n_f16(t36, 0.08888888888)); + float16x8_t m37 = + vaddq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t31, -0.5625), vmulq_n_f16(t33, 3.0625)), 
vmulq_n_f16(t35, 3.5)), t37); + + float16x8_t m40 = + vsubq_f16(vaddq_f16(vsubq_f16(t40, vmulq_n_f16(t42, 5.444444444444444)), vmulq_n_f16(t44, 6.22222222222)), + vmulq_n_f16(t46, 1.77777777777777777778)); + float16x8_t m41 = vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t41, 1.5), vmulq_n_f16(t42, 3)), + vmulq_n_f16(t43, 2.16666666666666667)), + vmulq_n_f16(t44, 4.3333333333)), + vmulq_n_f16(t45, 0.66666666667)), + vmulq_n_f16(t46, 1.333333333333)); + float16x8_t m42 = vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(t41, -1.5), vmulq_n_f16(t42, 3)), + vmulq_n_f16(t43, 2.16666666666666667)), + vmulq_n_f16(t44, 4.3333333333)), + vmulq_n_f16(t45, 0.66666666667)), + vmulq_n_f16(t46, 1.333333333333)); + float16x8_t m43 = + vaddq_f16(vaddq_f16(vmulq_n_f16(vaddq_f16(t41, t42), -0.3), vmulq_n_f16(vaddq_f16(t43, t44), 1.33333333333)), + vmulq_n_f16(vaddq_f16(t45, t46), -0.533333333333)); + float16x8_t m44 = + vaddq_f16(vaddq_f16(vmulq_n_f16(vsubq_f16(t41, t42), 0.3), vmulq_n_f16(vsubq_f16(t44, t43), 1.33333333333)), + vmulq_n_f16(vsubq_f16(t45, t46), 0.533333333333)); + float16x8_t m45 = + vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t41, 0.03333333), vmulq_n_f16(t42, 0.0222222)), + vmulq_n_f16(t43, 0.16666666666666667)), + vmulq_n_f16(t44, 0.11111111111)), + vmulq_n_f16(t45, 0.1333333333)), + vmulq_n_f16(t46, 0.08888888888)); + float16x8_t m46 = + vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(t41, -0.03333333), vmulq_n_f16(t42, 0.0222222)), + vmulq_n_f16(t43, 0.16666666666666667)), + vmulq_n_f16(t44, 0.11111111111)), + vmulq_n_f16(t45, 0.1333333333)), + vmulq_n_f16(t46, 0.08888888888)); + float16x8_t m47 = + vaddq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t41, -0.5625), vmulq_n_f16(t43, 3.0625)), vmulq_n_f16(t45, 3.5)), t47); + + float16x8_t m50 = + vsubq_f16(vaddq_f16(vsubq_f16(t50, vmulq_n_f16(t52, 5.444444444444444)), vmulq_n_f16(t54, 6.22222222222)), + vmulq_n_f16(t56, 1.77777777777777777778)); + float16x8_t m51 
= vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t51, 1.5), vmulq_n_f16(t52, 3)), + vmulq_n_f16(t53, 2.16666666666666667)), + vmulq_n_f16(t54, 4.3333333333)), + vmulq_n_f16(t55, 0.66666666667)), + vmulq_n_f16(t56, 1.333333333333)); + float16x8_t m52 = vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(t51, -1.5), vmulq_n_f16(t52, 3)), + vmulq_n_f16(t53, 2.16666666666666667)), + vmulq_n_f16(t54, 4.3333333333)), + vmulq_n_f16(t55, 0.66666666667)), + vmulq_n_f16(t56, 1.333333333333)); + float16x8_t m53 = + vaddq_f16(vaddq_f16(vmulq_n_f16(vaddq_f16(t51, t52), -0.3), vmulq_n_f16(vaddq_f16(t53, t54), 1.33333333333)), + vmulq_n_f16(vaddq_f16(t55, t56), -0.533333333333)); + float16x8_t m54 = + vaddq_f16(vaddq_f16(vmulq_n_f16(vsubq_f16(t51, t52), 0.3), vmulq_n_f16(vsubq_f16(t54, t53), 1.33333333333)), + vmulq_n_f16(vsubq_f16(t55, t56), 0.533333333333)); + float16x8_t m55 = + vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t51, 0.03333333), vmulq_n_f16(t52, 0.0222222)), + vmulq_n_f16(t53, 0.16666666666666667)), + vmulq_n_f16(t54, 0.11111111111)), + vmulq_n_f16(t55, 0.1333333333)), + vmulq_n_f16(t56, 0.08888888888)); + float16x8_t m56 = + vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(t51, -0.03333333), vmulq_n_f16(t52, 0.0222222)), + vmulq_n_f16(t53, 0.16666666666666667)), + vmulq_n_f16(t54, 0.11111111111)), + vmulq_n_f16(t55, 0.1333333333)), + vmulq_n_f16(t56, 0.08888888888)); + float16x8_t m57 = + vaddq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t51, -0.5625), vmulq_n_f16(t53, 3.0625)), vmulq_n_f16(t55, 3.5)), t57); + + float16x8_t m60 = + vsubq_f16(vaddq_f16(vsubq_f16(t60, vmulq_n_f16(t62, 5.444444444444444)), vmulq_n_f16(t64, 6.22222222222)), + vmulq_n_f16(t66, 1.77777777777777777778)); + float16x8_t m61 = vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t61, 1.5), vmulq_n_f16(t62, 3)), + vmulq_n_f16(t63, 2.16666666666666667)), + vmulq_n_f16(t64, 4.3333333333)), + vmulq_n_f16(t65, 0.66666666667)), + 
vmulq_n_f16(t66, 1.333333333333)); + float16x8_t m62 = vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(t61, -1.5), vmulq_n_f16(t62, 3)), + vmulq_n_f16(t63, 2.16666666666666667)), + vmulq_n_f16(t64, 4.3333333333)), + vmulq_n_f16(t65, 0.66666666667)), + vmulq_n_f16(t66, 1.333333333333)); + float16x8_t m63 = + vaddq_f16(vaddq_f16(vmulq_n_f16(vaddq_f16(t61, t62), -0.3), vmulq_n_f16(vaddq_f16(t63, t64), 1.33333333333)), + vmulq_n_f16(vaddq_f16(t65, t66), -0.533333333333)); + float16x8_t m64 = + vaddq_f16(vaddq_f16(vmulq_n_f16(vsubq_f16(t61, t62), 0.3), vmulq_n_f16(vsubq_f16(t64, t63), 1.33333333333)), + vmulq_n_f16(vsubq_f16(t65, t66), 0.533333333333)); + float16x8_t m65 = + vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t61, 0.03333333), vmulq_n_f16(t62, 0.0222222)), + vmulq_n_f16(t63, 0.16666666666666667)), + vmulq_n_f16(t64, 0.11111111111)), + vmulq_n_f16(t65, 0.1333333333)), + vmulq_n_f16(t66, 0.08888888888)); + float16x8_t m66 = + vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(t61, -0.03333333), vmulq_n_f16(t62, 0.0222222)), + vmulq_n_f16(t63, 0.16666666666666667)), + vmulq_n_f16(t64, 0.11111111111)), + vmulq_n_f16(t65, 0.1333333333)), + vmulq_n_f16(t66, 0.08888888888)); + float16x8_t m67 = + vaddq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t61, -0.5625), vmulq_n_f16(t63, 3.0625)), vmulq_n_f16(t65, 3.5)), t67); + + float16x8_t m70 = + vsubq_f16(vaddq_f16(vsubq_f16(t70, vmulq_n_f16(t72, 5.444444444444444)), vmulq_n_f16(t74, 6.22222222222)), + vmulq_n_f16(t76, 1.77777777777777777778)); + float16x8_t m71 = vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t71, 1.5), vmulq_n_f16(t72, 3)), + vmulq_n_f16(t73, 2.16666666666666667)), + vmulq_n_f16(t74, 4.3333333333)), + vmulq_n_f16(t75, 0.66666666667)), + vmulq_n_f16(t76, 1.333333333333)); + float16x8_t m72 = vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(t71, -1.5), vmulq_n_f16(t72, 3)), + vmulq_n_f16(t73, 2.16666666666666667)), + vmulq_n_f16(t74, 
4.3333333333)), + vmulq_n_f16(t75, 0.66666666667)), + vmulq_n_f16(t76, 1.333333333333)); + float16x8_t m73 = + vaddq_f16(vaddq_f16(vmulq_n_f16(vaddq_f16(t71, t72), -0.3), vmulq_n_f16(vaddq_f16(t73, t74), 1.33333333333)), + vmulq_n_f16(vaddq_f16(t75, t76), -0.533333333333)); + float16x8_t m74 = + vaddq_f16(vaddq_f16(vmulq_n_f16(vsubq_f16(t71, t72), 0.3), vmulq_n_f16(vsubq_f16(t74, t73), 1.33333333333)), + vmulq_n_f16(vsubq_f16(t75, t76), 0.533333333333)); + float16x8_t m75 = + vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t71, 0.03333333), vmulq_n_f16(t72, 0.0222222)), + vmulq_n_f16(t73, 0.16666666666666667)), + vmulq_n_f16(t74, 0.11111111111)), + vmulq_n_f16(t75, 0.1333333333)), + vmulq_n_f16(t76, 0.08888888888)); + float16x8_t m76 = + vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(t71, -0.03333333), vmulq_n_f16(t72, 0.0222222)), + vmulq_n_f16(t73, 0.16666666666666667)), + vmulq_n_f16(t74, 0.11111111111)), + vmulq_n_f16(t75, 0.1333333333)), + vmulq_n_f16(t76, 0.08888888888)); + float16x8_t m77 = + vaddq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t71, -0.5625), vmulq_n_f16(t73, 3.0625)), vmulq_n_f16(t75, 3.5)), t77); + + vst1q_f16(dst_data + 0 * dst_step, m00); + vst1q_f16(dst_data + 1 * dst_step, m01); + vst1q_f16(dst_data + 2 * dst_step, m02); + vst1q_f16(dst_data + 3 * dst_step, m03); + vst1q_f16(dst_data + 4 * dst_step, m04); + vst1q_f16(dst_data + 5 * dst_step, m05); + vst1q_f16(dst_data + 6 * dst_step, m06); + vst1q_f16(dst_data + 7 * dst_step, m07); + vst1q_f16(dst_data + 8 * dst_step, m10); + vst1q_f16(dst_data + 9 * dst_step, m11); + vst1q_f16(dst_data + 10 * dst_step, m12); + vst1q_f16(dst_data + 11 * dst_step, m13); + vst1q_f16(dst_data + 12 * dst_step, m14); + vst1q_f16(dst_data + 13 * dst_step, m15); + vst1q_f16(dst_data + 14 * dst_step, m16); + vst1q_f16(dst_data + 15 * dst_step, m17); + vst1q_f16(dst_data + 16 * dst_step, m20); + vst1q_f16(dst_data + 17 * dst_step, m21); + vst1q_f16(dst_data + 18 * dst_step, m22); + 
vst1q_f16(dst_data + 19 * dst_step, m23); + vst1q_f16(dst_data + 20 * dst_step, m24); + vst1q_f16(dst_data + 21 * dst_step, m25); + vst1q_f16(dst_data + 22 * dst_step, m26); + vst1q_f16(dst_data + 23 * dst_step, m27); + vst1q_f16(dst_data + 24 * dst_step, m30); + vst1q_f16(dst_data + 25 * dst_step, m31); + vst1q_f16(dst_data + 26 * dst_step, m32); + vst1q_f16(dst_data + 27 * dst_step, m33); + vst1q_f16(dst_data + 28 * dst_step, m34); + vst1q_f16(dst_data + 29 * dst_step, m35); + vst1q_f16(dst_data + 30 * dst_step, m36); + vst1q_f16(dst_data + 31 * dst_step, m37); + vst1q_f16(dst_data + 32 * dst_step, m40); + vst1q_f16(dst_data + 33 * dst_step, m41); + vst1q_f16(dst_data + 34 * dst_step, m42); + vst1q_f16(dst_data + 35 * dst_step, m43); + vst1q_f16(dst_data + 36 * dst_step, m44); + vst1q_f16(dst_data + 37 * dst_step, m45); + vst1q_f16(dst_data + 38 * dst_step, m46); + vst1q_f16(dst_data + 39 * dst_step, m47); + vst1q_f16(dst_data + 40 * dst_step, m50); + vst1q_f16(dst_data + 41 * dst_step, m51); + vst1q_f16(dst_data + 42 * dst_step, m52); + vst1q_f16(dst_data + 43 * dst_step, m53); + vst1q_f16(dst_data + 44 * dst_step, m54); + vst1q_f16(dst_data + 45 * dst_step, m55); + vst1q_f16(dst_data + 46 * dst_step, m56); + vst1q_f16(dst_data + 47 * dst_step, m57); + vst1q_f16(dst_data + 48 * dst_step, m60); + vst1q_f16(dst_data + 49 * dst_step, m61); + vst1q_f16(dst_data + 50 * dst_step, m62); + vst1q_f16(dst_data + 51 * dst_step, m63); + vst1q_f16(dst_data + 52 * dst_step, m64); + vst1q_f16(dst_data + 53 * dst_step, m65); + vst1q_f16(dst_data + 54 * dst_step, m66); + vst1q_f16(dst_data + 55 * dst_step, m67); + vst1q_f16(dst_data + 56 * dst_step, m70); + vst1q_f16(dst_data + 57 * dst_step, m71); + vst1q_f16(dst_data + 58 * dst_step, m72); + vst1q_f16(dst_data + 59 * dst_step, m73); + vst1q_f16(dst_data + 60 * dst_step, m74); + vst1q_f16(dst_data + 61 * dst_step, m75); + vst1q_f16(dst_data + 62 * dst_step, m76); + vst1q_f16(dst_data + 63 * dst_step, m77); +#else + for (int i = 
0; i < C8NUM; i++) { + float16_t src_data_00 = src_data[i]; + float16_t src_data_01 = src_data[i + src_step]; + float16_t src_data_02 = src_data[i + 2 * src_step]; + float16_t src_data_03 = src_data[i + 3 * src_step]; + float16_t src_data_04 = src_data[i + 4 * src_step]; + float16_t src_data_05 = src_data[i + 5 * src_step]; + float16_t src_data_06 = src_data[i + 6 * src_step]; + float16_t src_data_07 = src_data[i + 7 * src_step]; + float16_t src_data_10 = src_data[i + 8 * src_step]; + float16_t src_data_11 = src_data[i + 9 * src_step]; + float16_t src_data_12 = src_data[i + 10 * src_step]; + float16_t src_data_13 = src_data[i + 11 * src_step]; + float16_t src_data_14 = src_data[i + 12 * src_step]; + float16_t src_data_15 = src_data[i + 13 * src_step]; + float16_t src_data_16 = src_data[i + 14 * src_step]; + float16_t src_data_17 = src_data[i + 15 * src_step]; + float16_t src_data_20 = src_data[i + 16 * src_step]; + float16_t src_data_21 = src_data[i + 17 * src_step]; + float16_t src_data_22 = src_data[i + 18 * src_step]; + float16_t src_data_23 = src_data[i + 19 * src_step]; + float16_t src_data_24 = src_data[i + 20 * src_step]; + float16_t src_data_25 = src_data[i + 21 * src_step]; + float16_t src_data_26 = src_data[i + 22 * src_step]; + float16_t src_data_27 = src_data[i + 23 * src_step]; + float16_t src_data_30 = src_data[i + 24 * src_step]; + float16_t src_data_31 = src_data[i + 25 * src_step]; + float16_t src_data_32 = src_data[i + 26 * src_step]; + float16_t src_data_33 = src_data[i + 27 * src_step]; + float16_t src_data_34 = src_data[i + 28 * src_step]; + float16_t src_data_35 = src_data[i + 29 * src_step]; + float16_t src_data_36 = src_data[i + 30 * src_step]; + float16_t src_data_37 = src_data[i + 31 * src_step]; + float16_t src_data_40 = src_data[i + 32 * src_step]; + float16_t src_data_41 = src_data[i + 33 * src_step]; + float16_t src_data_42 = src_data[i + 34 * src_step]; + float16_t src_data_43 = src_data[i + 35 * src_step]; + float16_t src_data_44 = 
src_data[i + 36 * src_step]; + float16_t src_data_45 = src_data[i + 37 * src_step]; + float16_t src_data_46 = src_data[i + 38 * src_step]; + float16_t src_data_47 = src_data[i + 39 * src_step]; + float16_t src_data_50 = src_data[i + 40 * src_step]; + float16_t src_data_51 = src_data[i + 41 * src_step]; + float16_t src_data_52 = src_data[i + 42 * src_step]; + float16_t src_data_53 = src_data[i + 43 * src_step]; + float16_t src_data_54 = src_data[i + 44 * src_step]; + float16_t src_data_55 = src_data[i + 45 * src_step]; + float16_t src_data_56 = src_data[i + 46 * src_step]; + float16_t src_data_57 = src_data[i + 47 * src_step]; + float16_t src_data_60 = src_data[i + 48 * src_step]; + float16_t src_data_61 = src_data[i + 49 * src_step]; + float16_t src_data_62 = src_data[i + 50 * src_step]; + float16_t src_data_63 = src_data[i + 51 * src_step]; + float16_t src_data_64 = src_data[i + 52 * src_step]; + float16_t src_data_65 = src_data[i + 53 * src_step]; + float16_t src_data_66 = src_data[i + 54 * src_step]; + float16_t src_data_67 = src_data[i + 55 * src_step]; + float16_t src_data_70 = src_data[i + 56 * src_step]; + float16_t src_data_71 = src_data[i + 57 * src_step]; + float16_t src_data_72 = src_data[i + 58 * src_step]; + float16_t src_data_73 = src_data[i + 59 * src_step]; + float16_t src_data_74 = src_data[i + 60 * src_step]; + float16_t src_data_75 = src_data[i + 61 * src_step]; + float16_t src_data_76 = src_data[i + 62 * src_step]; + float16_t src_data_77 = src_data[i + 63 * src_step]; + + float16_t t00 = src_data_00 - 5.444444444444444445125f * src_data_20 + 6.222222222222222222223f * src_data_40 - + 1.77777777777777778f * src_data_60; + float16_t t01 = src_data_01 - 5.444444444444444445125f * src_data_21 + 6.222222222222222222223f * src_data_41 - + 1.77777777777777778f * src_data_61; + float16_t t02 = src_data_02 - 5.444444444444444445125f * src_data_22 + 6.222222222222222222223f * src_data_42 - + 1.77777777777777778f * src_data_62; + float16_t t03 = 
src_data_03 - 5.444444444444444445125f * src_data_23 + 6.222222222222222222223f * src_data_43 - + 1.77777777777777778f * src_data_63; + float16_t t04 = src_data_04 - 5.444444444444444445125f * src_data_24 + 6.222222222222222222223f * src_data_44 - + 1.77777777777777778f * src_data_64; + float16_t t05 = src_data_05 - 5.444444444444444445125f * src_data_25 + 6.222222222222222222223f * src_data_45 - + 1.77777777777777778f * src_data_65; + float16_t t06 = src_data_06 - 5.444444444444444445125f * src_data_26 + 6.222222222222222222223f * src_data_46 - + 1.77777777777777778f * src_data_66; + float16_t t07 = src_data_07 - 5.444444444444444445125f * src_data_27 + 6.222222222222222222223f * src_data_47 - + 1.77777777777777778f * src_data_67; + + const float16_t t10 = 1.5f * src_data_10 + 3.0f * src_data_20 - 2.1666666666666667f * src_data_30 - + 4.333333333333333333f * src_data_40 + 0.66666666666666667f * src_data_50 + + 1.333333333333333f * src_data_60; + const float16_t t11 = 1.5f * src_data_11 + 3.0f * src_data_21 - 2.1666666666666667f * src_data_31 - + 4.333333333333333333f * src_data_41 + 0.66666666666666667f * src_data_51 + + 1.333333333333333f * src_data_61; + const float16_t t12 = 1.5f * src_data_12 + 3.0f * src_data_22 - 2.1666666666666667f * src_data_32 - + 4.333333333333333333f * src_data_42 + 0.66666666666666667f * src_data_52 + + 1.333333333333333f * src_data_62; + const float16_t t13 = 1.5f * src_data_13 + 3.0f * src_data_23 - 2.1666666666666667f * src_data_33 - + 4.333333333333333333f * src_data_43 + 0.66666666666666667f * src_data_53 + + 1.333333333333333f * src_data_63; + const float16_t t14 = 1.5f * src_data_14 + 3.0f * src_data_24 - 2.1666666666666667f * src_data_34 - + 4.333333333333333333f * src_data_44 + 0.66666666666666667f * src_data_54 + + 1.333333333333333f * src_data_64; + const float16_t t15 = 1.5f * src_data_15 + 3.0f * src_data_25 - 2.1666666666666667f * src_data_35 - + 4.333333333333333333f * src_data_45 + 0.66666666666666667f * src_data_55 + + 
1.333333333333333f * src_data_65; + const float16_t t16 = 1.5f * src_data_16 + 3.0f * src_data_26 - 2.1666666666666667f * src_data_36 - + 4.333333333333333333f * src_data_46 + 0.66666666666666667f * src_data_56 + + 1.333333333333333f * src_data_66; + const float16_t t17 = 1.5f * src_data_17 + 3.0f * src_data_27 - 2.1666666666666667f * src_data_37 - + 4.333333333333333333f * src_data_47 + 0.66666666666666667f * src_data_57 + + 1.333333333333333f * src_data_67; + + const float16_t t20 = -1.5f * src_data_10 + 3.0f * src_data_20 + 2.1666666666666667f * src_data_30 - + 4.333333333333333333f * src_data_40 - 0.66666666666666667f * src_data_50 + + 1.333333333333333f * src_data_60; + const float16_t t21 = -1.5f * src_data_11 + 3.0f * src_data_21 + 2.1666666666666667f * src_data_31 - + 4.333333333333333333f * src_data_41 - 0.66666666666666667f * src_data_51 + + 1.333333333333333f * src_data_61; + const float16_t t22 = -1.5f * src_data_12 + 3.0f * src_data_22 + 2.1666666666666667f * src_data_32 - + 4.333333333333333333f * src_data_42 - 0.66666666666666667f * src_data_52 + + 1.333333333333333f * src_data_62; + const float16_t t23 = -1.5f * src_data_13 + 3.0f * src_data_23 + 2.1666666666666667f * src_data_33 - + 4.333333333333333333f * src_data_43 - 0.66666666666666667f * src_data_53 + + 1.333333333333333f * src_data_63; + const float16_t t24 = -1.5f * src_data_14 + 3.0f * src_data_24 + 2.1666666666666667f * src_data_34 - + 4.333333333333333333f * src_data_44 - 0.66666666666666667f * src_data_54 + + 1.333333333333333f * src_data_64; + const float16_t t25 = -1.5f * src_data_15 + 3.0f * src_data_25 + 2.1666666666666667f * src_data_35 - + 4.333333333333333333f * src_data_45 - 0.66666666666666667f * src_data_55 + + 1.333333333333333f * src_data_65; + const float16_t t26 = -1.5f * src_data_16 + 3.0f * src_data_26 + 2.1666666666666667f * src_data_36 - + 4.333333333333333333f * src_data_46 - 0.66666666666666667f * src_data_56 + + 1.333333333333333f * src_data_66; + const float16_t t27 
= -1.5f * src_data_17 + 3.0f * src_data_27 + 2.1666666666666667f * src_data_37 - + 4.333333333333333333f * src_data_47 - 0.66666666666666667f * src_data_57 + + 1.333333333333333f * src_data_67; + + const float16_t t30 = -0.3f * (src_data_10 + src_data_20) + 1.33333333333333f * (src_data_30 + src_data_40) - + 0.53333333333f * (src_data_50 + src_data_60); + const float16_t t31 = -0.3f * (src_data_11 + src_data_21) + 1.33333333333333f * (src_data_31 + src_data_41) - + 0.53333333333f * (src_data_51 + src_data_61); + const float16_t t32 = -0.3f * (src_data_12 + src_data_22) + 1.33333333333333f * (src_data_32 + src_data_42) - + 0.53333333333f * (src_data_52 + src_data_62); + const float16_t t33 = -0.3f * (src_data_13 + src_data_23) + 1.33333333333333f * (src_data_33 + src_data_43) - + 0.53333333333f * (src_data_53 + src_data_63); + const float16_t t34 = -0.3f * (src_data_14 + src_data_24) + 1.33333333333333f * (src_data_34 + src_data_44) - + 0.53333333333f * (src_data_54 + src_data_64); + const float16_t t35 = -0.3f * (src_data_15 + src_data_25) + 1.33333333333333f * (src_data_35 + src_data_45) - + 0.53333333333f * (src_data_55 + src_data_65); + const const float16_t t36 = -0.3f * (src_data_16 + src_data_26) + 1.33333333333333f * (src_data_36 + src_data_46) - + 0.53333333333f * (src_data_56 + src_data_66); + const const float16_t t37 = -0.3f * (src_data_17 + src_data_27) + 1.33333333333333f * (src_data_37 + src_data_47) - + 0.53333333333f * (src_data_57 + src_data_67); + + const float16_t t40 = 0.3f * (src_data_10 - src_data_20) + 1.33333333333333f * (src_data_40 - src_data_30) + + 0.53333333333f * (src_data_50 - src_data_60); + const float16_t t41 = 0.3f * (src_data_11 - src_data_21) + 1.33333333333333f * (src_data_41 - src_data_31) + + 0.53333333333f * (src_data_51 - src_data_61); + const float16_t t42 = 0.3f * (src_data_12 - src_data_22) + 1.33333333333333f * (src_data_42 - src_data_32) + + 0.53333333333f * (src_data_52 - src_data_62); + const float16_t t43 = 0.3f * 
(src_data_13 - src_data_23) + 1.33333333333333f * (src_data_43 - src_data_33) + + 0.53333333333f * (src_data_53 - src_data_63); + const float16_t t44 = 0.3f * (src_data_14 - src_data_24) + 1.33333333333333f * (src_data_44 - src_data_34) + + 0.53333333333f * (src_data_54 - src_data_64); + const float16_t t45 = 0.3f * (src_data_15 - src_data_25) + 1.33333333333333f * (src_data_45 - src_data_35) + + 0.53333333333f * (src_data_55 - src_data_65); + const float16_t t46 = 0.3f * (src_data_16 - src_data_26) + 1.33333333333333f * (src_data_46 - src_data_36) + + 0.53333333333f * (src_data_56 - src_data_66); + const float16_t t47 = 0.3f * (src_data_17 - src_data_27) + 1.33333333333333f * (src_data_47 - src_data_37) + + 0.53333333333f * (src_data_57 - src_data_67); + + const float16_t t50 = 0.0333333333f * src_data_10 + 0.02222222f * src_data_20 - 0.1666666666f * src_data_30 - + 0.1111111111f * src_data_40 + 0.1333333f * src_data_50 + 0.0888888f * src_data_60; + const float16_t t51 = 0.0333333333f * src_data_11 + 0.02222222f * src_data_21 - 0.1666666666f * src_data_31 - + 0.1111111111f * src_data_41 + 0.1333333f * src_data_51 + 0.0888888f * src_data_61; + const float16_t t52 = 0.0333333333f * src_data_12 + 0.02222222f * src_data_22 - 0.1666666666f * src_data_32 - + 0.1111111111f * src_data_42 + 0.1333333f * src_data_52 + 0.0888888f * src_data_62; + const float16_t t53 = 0.0333333333f * src_data_13 + 0.02222222f * src_data_23 - 0.1666666666f * src_data_33 - + 0.1111111111f * src_data_43 + 0.1333333f * src_data_53 + 0.0888888f * src_data_63; + const float16_t t54 = 0.0333333333f * src_data_14 + 0.02222222f * src_data_24 - 0.1666666666f * src_data_34 - + 0.1111111111f * src_data_44 + 0.1333333f * src_data_54 + 0.0888888f * src_data_64; + const float16_t t55 = 0.0333333333f * src_data_15 + 0.02222222f * src_data_25 - 0.1666666666f * src_data_35 - + 0.1111111111f * src_data_45 + 0.1333333f * src_data_55 + 0.0888888f * src_data_65; + const float16_t t56 = 0.0333333333f * src_data_16 
+ 0.02222222f * src_data_26 - 0.1666666666f * src_data_36 - + 0.1111111111f * src_data_46 + 0.1333333f * src_data_56 + 0.0888888f * src_data_66; + const float16_t t57 = 0.0333333333f * src_data_17 + 0.02222222f * src_data_27 - 0.1666666666f * src_data_37 - + 0.1111111111f * src_data_47 + 0.1333333f * src_data_57 + 0.0888888f * src_data_67; + + const float16_t t60 = -0.0333333333f * src_data_10 + 0.02222222f * src_data_20 + 0.1666666666f * src_data_30 - + 0.1111111111f * src_data_40 - 0.1333333f * src_data_50 + 0.0888888f * src_data_60; + const float16_t t61 = -0.0333333333f * src_data_11 + 0.02222222f * src_data_21 + 0.1666666666f * src_data_31 - + 0.1111111111f * src_data_41 - 0.1333333f * src_data_51 + 0.0888888f * src_data_61; + const float16_t t62 = -0.0333333333f * src_data_12 + 0.02222222f * src_data_22 + 0.1666666666f * src_data_32 - + 0.1111111111f * src_data_42 - 0.1333333f * src_data_52 + 0.0888888f * src_data_62; + const float16_t t63 = -0.0333333333f * src_data_13 + 0.02222222f * src_data_23 + 0.1666666666f * src_data_33 - + 0.1111111111f * src_data_43 - 0.1333333f * src_data_53 + 0.0888888f * src_data_63; + const float16_t t64 = -0.0333333333f * src_data_14 + 0.02222222f * src_data_24 + 0.1666666666f * src_data_34 - + 0.1111111111f * src_data_44 - 0.1333333f * src_data_54 + 0.0888888f * src_data_64; + const float16_t t65 = -0.0333333333f * src_data_15 + 0.02222222f * src_data_25 + 0.1666666666f * src_data_35 - + 0.1111111111f * src_data_45 - 0.1333333f * src_data_55 + 0.0888888f * src_data_65; + const float16_t t66 = -0.0333333333f * src_data_16 + 0.02222222f * src_data_26 + 0.1666666666f * src_data_36 - + 0.1111111111f * src_data_46 - 0.1333333f * src_data_56 + 0.0888888f * src_data_66; + const float16_t t67 = -0.0333333333f * src_data_17 + 0.02222222f * src_data_27 + 0.1666666666f * src_data_37 - + 0.1111111111f * src_data_47 - 0.1333333f * src_data_57 + 0.0888888f * src_data_67; + + const float16_t t70 = -0.5625f * src_data_10 + 3.0625f * 
src_data_30 - 3.5f * src_data_50 + src_data_70; + const float16_t t71 = -0.5625f * src_data_11 + 3.0625f * src_data_31 - 3.5f * src_data_51 + src_data_71; + const float16_t t72 = -0.5625f * src_data_12 + 3.0625f * src_data_32 - 3.5f * src_data_52 + src_data_72; + const float16_t t73 = -0.5625f * src_data_13 + 3.0625f * src_data_33 - 3.5f * src_data_53 + src_data_73; + const float16_t t74 = -0.5625f * src_data_14 + 3.0625f * src_data_34 - 3.5f * src_data_54 + src_data_74; + const float16_t t75 = -0.5625f * src_data_15 + 3.0625f * src_data_35 - 3.5f * src_data_55 + src_data_75; + const float16_t t76 = -0.5625f * src_data_16 + 3.0625f * src_data_36 - 3.5f * src_data_56 + src_data_76; + const float16_t t77 = -0.5625f * src_data_17 + 3.0625f * src_data_37 - 3.5f * src_data_57 + src_data_77; + + const float16_t m00 = + t00 - 5.444444444444444445125f * t02 + 6.222222222222222222223f * t04 - 1.77777777777777778f * t06; + const float16_t m01 = 1.5f * t01 + 3.0f * t02 - 2.1666666666666667f * t03 - 4.333333333333333333f * t04 + + 0.66666666666666667f * t05 + 1.333333333333333f * t06; + const float16_t m02 = -1.5f * t01 + 3.0f * t02 + 2.1666666666666667f * t03 - 4.333333333333333333f * t04 - + 0.66666666666666667f * t05 + 1.333333333333333f * t06; + const float16_t m03 = -0.3f * (t01 + t02) + 1.33333333333333f * (t03 + t04) - 0.53333333333f * (t05 + t06); + const float16_t m04 = 0.3f * (t01 - t02) + 1.33333333333333f * (t04 - t03) + 0.53333333333f * (t05 - t06); + const float16_t m05 = 0.0333333333f * t01 + 0.02222222f * t02 - 0.1666666666f * t03 - 0.1111111111f * t04 + + 0.1333333f * t05 + 0.0888888f * t06; + const float16_t m06 = -0.0333333333f * t01 + 0.02222222f * t02 + 0.1666666666f * t03 - 0.1111111111f * t04 - + 0.1333333f * t05 + 0.0888888f * t06; + const float16_t m07 = -0.5625f * t01 + 3.0625f * t03 - 3.5f * t05 + t07; + + float16_t m10 = t10 - 5.444444444444444445125f * t12 + 6.222222222222222222223f * t14 - 1.77777777777777778f * t16; + const float16_t m11 = 1.5f * 
t11 + 3.0f * t12 - 2.1666666666666667f * t13 - 4.333333333333333333f * t14 + + 0.66666666666666667f * t15 + 1.333333333333333f * t16; + const float16_t m12 = -1.5f * t11 + 3.0f * t12 + 2.1666666666666667f * t13 - 4.333333333333333333f * t14 - + 0.66666666666666667f * t15 + 1.333333333333333f * t16; + const float16_t m13 = -0.3f * (t11 + t12) + 1.33333333333333f * (t13 + t14) - 0.53333333333f * (t15 + t16); + const float16_t m14 = 0.3f * (t11 - t12) + 1.33333333333333f * (t14 - t13) + 0.53333333333f * (t15 - t16); + const float16_t m15 = 0.0333333333f * t11 + 0.02222222f * t12 - 0.1666666666f * t13 - 0.1111111111f * t14 + + 0.1333333f * t15 + 0.0888888f * t16; + const float16_t m16 = -0.0333333333f * t11 + 0.02222222f * t12 + 0.1666666666f * t13 - 0.1111111111f * t14 - + 0.1333333f * t15 + 0.0888888f * t16; + const float16_t m17 = -0.5625f * t11 + 3.0625f * t13 - 3.5f * t15 + t17; + + const float16_t m20 = + t20 - 5.444444444444444445125f * t22 + 6.222222222222222222223f * t24 - 1.77777777777777778f * t26; + const float16_t m21 = 1.5f * t21 + 3.0f * t22 - 2.1666666666666667f * t23 - 4.333333333333333333f * t24 + + 0.66666666666666667f * t25 + 1.333333333333333f * t26; + const float16_t m22 = -1.5f * t21 + 3.0f * t22 + 2.1666666666666667f * t23 - 4.333333333333333333f * t24 - + 0.66666666666666667f * t25 + 1.333333333333333f * t26; + const float16_t m23 = -0.3f * (t21 + t22) + 1.33333333333333f * (t23 + t24) - 0.53333333333f * (t25 + t26); + const float16_t m24 = 0.3f * (t21 - t22) + 1.33333333333333f * (t24 - t23) + 0.53333333333f * (t25 - t26); + const float16_t m25 = 0.0333333333f * t21 + 0.02222222f * t22 - 0.1666666666f * t23 - 0.1111111111f * t24 + + 0.1333333f * t25 + 0.0888888f * t26; + const float16_t m26 = -0.0333333333f * t21 + 0.02222222f * t22 + 0.1666666666f * t23 - 0.1111111111f * t24 - + 0.1333333f * t25 + 0.0888888f * t26; + const float16_t m27 = -0.5625f * t21 + 3.0625f * t23 - 3.5f * t25 + t27; + + float16_t m30 = t30 - 5.444444444444444445125f * 
t32 + 6.222222222222222222223f * t34 - 1.77777777777777778f * t36; + const float16_t m31 = 1.5f * t31 + 3.0f * t32 - 2.1666666666666667f * t33 - 4.333333333333333333f * t34 + + 0.66666666666666667f * t35 + 1.333333333333333f * t36; + const float16_t m32 = -1.5f * t31 + 3.0f * t32 + 2.1666666666666667f * t33 - 4.333333333333333333f * t34 - + 0.66666666666666667f * t35 + 1.333333333333333f * t36; + const float16_t m33 = -0.3f * (t31 + t32) + 1.33333333333333f * (t33 + t34) - 0.53333333333f * (t35 + t36); + const float16_t m34 = 0.3f * (t31 - t32) + 1.33333333333333f * (t34 - t33) + 0.53333333333f * (t35 - t36); + const float16_t m35 = 0.0333333333f * t31 + 0.02222222f * t32 - 0.1666666666f * t33 - 0.1111111111f * t34 + + 0.1333333f * t35 + 0.0888888f * t36; + const float16_t m36 = -0.0333333333f * t31 + 0.02222222f * t32 + 0.1666666666f * t33 - 0.1111111111f * t34 - + 0.1333333f * t35 + 0.0888888f * t36; + const float16_t m37 = -0.5625f * t31 + 3.0625f * t33 - 3.5f * t35 + t37; + + const float16_t m40 = + t40 - 5.444444444444444445125f * t42 + 6.222222222222222222223f * t44 - 1.77777777777777778f * t46; + const float16_t m41 = 1.5f * t41 + 3.0f * t42 - 2.1666666666666667f * t43 - 4.333333333333333333f * t44 + + 0.66666666666666667f * t45 + 1.333333333333333f * t46; + const float16_t m42 = -1.5f * t41 + 3.0f * t42 + 2.1666666666666667f * t43 - 4.333333333333333333f * t44 - + 0.66666666666666667f * t45 + 1.333333333333333f * t46; + const float16_t m43 = -0.3f * (t41 + t42) + 1.33333333333333f * (t43 + t44) - 0.53333333333f * (t45 + t46); + const float16_t m44 = 0.3f * (t41 - t42) + 1.33333333333333f * (t44 - t43) + 0.53333333333f * (t45 - t46); + const float16_t m45 = 0.0333333333f * t41 + 0.02222222f * t42 - 0.1666666666f * t43 - 0.1111111111f * t44 + + 0.1333333f * t45 + 0.0888888f * t46; + const float16_t m46 = -0.0333333333f * t41 + 0.02222222f * t42 + 0.1666666666f * t43 - 0.1111111111f * t44 - + 0.1333333f * t45 + 0.0888888f * t46; + const float16_t m47 = 
-0.5625f * t41 + 3.0625f * t43 - 3.5f * t45 + t47; + + float16_t m50 = t50 - 5.444444444444444445125f * t52 + 6.222222222222222222223f * t54 - 1.77777777777777778f * t56; + const float16_t m51 = 1.5f * t51 + 3.0f * t52 - 2.1666666666666667f * t53 - 4.333333333333333333f * t54 + + 0.66666666666666667f * t55 + 1.333333333333333f * t56; + const float16_t m52 = -1.5f * t51 + 3.0f * t52 + 2.1666666666666667f * t53 - 4.333333333333333333f * t54 - + 0.66666666666666667f * t55 + 1.333333333333333f * t56; + const float16_t m53 = -0.3f * (t51 + t52) + 1.33333333333333f * (t53 + t54) - 0.53333333333f * (t55 + t56); + const float16_t m54 = 0.3f * (t51 - t52) + 1.33333333333333f * (t54 - t53) + 0.53333333333f * (t55 - t56); + const float16_t m55 = 0.0333333333f * t51 + 0.02222222f * t52 - 0.1666666666f * t53 - 0.1111111111f * t54 + + 0.1333333f * t55 + 0.0888888f * t56; + const float16_t m56 = -0.0333333333f * t51 + 0.02222222f * t52 + 0.1666666666f * t53 - 0.1111111111f * t54 - + 0.1333333f * t55 + 0.0888888f * t56; + const float16_t m57 = -0.5625f * t51 + 3.0625f * t53 - 3.5f * t55 + t57; + + float16_t m60 = t60 - 5.444444444444444445125f * t62 + 6.222222222222222222223f * t64 - 1.77777777777777778f * t66; + const float16_t m61 = 1.5f * t61 + 3.0f * t62 - 2.1666666666666667f * t63 - 4.333333333333333333f * t64 + + 0.66666666666666667f * t65 + 1.333333333333333f * t66; + const float16_t m62 = -1.5f * t61 + 3.0f * t62 + 2.1666666666666667f * t63 - 4.333333333333333333f * t64 - + 0.66666666666666667f * t65 + 1.333333333333333f * t66; + const float16_t m63 = -0.3f * (t61 + t62) + 1.33333333333333f * (t63 + t64) - 0.53333333333f * (t65 + t66); + const float16_t m64 = 0.3f * (t61 - t62) + 1.33333333333333f * (t64 - t63) + 0.53333333333f * (t65 - t66); + const float16_t m65 = 0.0333333333f * t61 + 0.02222222f * t62 - 0.1666666666f * t63 - 0.1111111111f * t64 + + 0.1333333f * t65 + 0.0888888f * t66; + const float16_t m66 = -0.0333333333f * t61 + 0.02222222f * t62 + 0.1666666666f * 
t63 - 0.1111111111f * t64 - + 0.1333333f * t65 + 0.0888888f * t66; + const float16_t m67 = -0.5625f * t61 + 3.0625f * t63 - 3.5f * t65 + t67; + + float16_t m70 = t70 - 5.444444444444444445125f * t72 + 6.222222222222222222223f * t74 - 1.77777777777777778f * t76; + const float16_t m71 = 1.5f * t71 + 3.0f * t72 - 2.1666666666666667f * t73 - 4.333333333333333333f * t74 + + 0.66666666666666667f * t75 + 1.333333333333333f * t76; + const float16_t m72 = -1.5f * t71 + 3.0f * t72 + 2.1666666666666667f * t73 - 4.333333333333333333f * t74 - + 0.66666666666666667f * t75 + 1.333333333333333f * t76; + const float16_t m73 = -0.3f * (t71 + t72) + 1.33333333333333f * (t73 + t74) - 0.53333333333f * (t75 + t76); + const float16_t m74 = 0.3f * (t71 - t72) + 1.33333333333333f * (t74 - t73) + 0.53333333333f * (t75 - t76); + const float16_t m75 = 0.0333333333f * t71 + 0.02222222f * t72 - 0.1666666666f * t73 - 0.1111111111f * t74 + + 0.1333333f * t75 + 0.0888888f * t76; + const float16_t m76 = -0.0333333333f * t71 + 0.02222222f * t72 + 0.1666666666f * t73 - 0.1111111111f * t74 - + 0.1333333f * t75 + 0.0888888f * t76; + const float16_t m77 = -0.5625f * t71 + 3.0625f * t73 - 3.5f * t75 + t77; + + (dst_data + i)[0] = m00; + (dst_data + i + dst_step)[0] = m01; + (dst_data + i + 2 * dst_step)[0] = m02; + (dst_data + i + 3 * dst_step)[0] = m03; + (dst_data + i + 4 * dst_step)[0] = m04; + (dst_data + i + 5 * dst_step)[0] = m05; + (dst_data + i + 6 * dst_step)[0] = m06; + (dst_data + i + 7 * dst_step)[0] = m07; + + (dst_data + i + 8 * dst_step)[0] = m10; + (dst_data + i + 9 * dst_step)[0] = m11; + (dst_data + i + 10 * dst_step)[0] = m12; + (dst_data + i + 11 * dst_step)[0] = m13; + (dst_data + i + 12 * dst_step)[0] = m14; + (dst_data + i + 13 * dst_step)[0] = m15; + (dst_data + i + 14 * dst_step)[0] = m16; + (dst_data + i + 15 * dst_step)[0] = m17; + + (dst_data + i + 16 * dst_step)[0] = m20; + (dst_data + i + 17 * dst_step)[0] = m21; + (dst_data + i + 18 * dst_step)[0] = m22; + (dst_data + i + 
19 * dst_step)[0] = m23; + (dst_data + i + 20 * dst_step)[0] = m24; + (dst_data + i + 21 * dst_step)[0] = m25; + (dst_data + i + 22 * dst_step)[0] = m26; + (dst_data + i + 23 * dst_step)[0] = m27; + + (dst_data + i + 24 * dst_step)[0] = m30; + (dst_data + i + 25 * dst_step)[0] = m31; + (dst_data + i + 26 * dst_step)[0] = m32; + (dst_data + i + 27 * dst_step)[0] = m33; + (dst_data + i + 28 * dst_step)[0] = m34; + (dst_data + i + 29 * dst_step)[0] = m35; + (dst_data + i + 30 * dst_step)[0] = m36; + (dst_data + i + 31 * dst_step)[0] = m37; + + (dst_data + i + 32 * dst_step)[0] = m40; + (dst_data + i + 33 * dst_step)[0] = m41; + (dst_data + i + 34 * dst_step)[0] = m42; + (dst_data + i + 35 * dst_step)[0] = m43; + (dst_data + i + 36 * dst_step)[0] = m44; + (dst_data + i + 37 * dst_step)[0] = m45; + (dst_data + i + 38 * dst_step)[0] = m46; + (dst_data + i + 39 * dst_step)[0] = m47; + + (dst_data + i + 40 * dst_step)[0] = m50; + (dst_data + i + 41 * dst_step)[0] = m51; + (dst_data + i + 42 * dst_step)[0] = m52; + (dst_data + i + 43 * dst_step)[0] = m53; + (dst_data + i + 44 * dst_step)[0] = m54; + (dst_data + i + 45 * dst_step)[0] = m55; + (dst_data + i + 46 * dst_step)[0] = m56; + (dst_data + i + 47 * dst_step)[0] = m57; + + (dst_data + i + 48 * dst_step)[0] = m60; + (dst_data + i + 49 * dst_step)[0] = m61; + (dst_data + i + 50 * dst_step)[0] = m62; + (dst_data + i + 51 * dst_step)[0] = m63; + (dst_data + i + 52 * dst_step)[0] = m64; + (dst_data + i + 53 * dst_step)[0] = m65; + (dst_data + i + 54 * dst_step)[0] = m66; + (dst_data + i + 55 * dst_step)[0] = m67; + + (dst_data + i + 56 * dst_step)[0] = m70; + (dst_data + i + 57 * dst_step)[0] = m71; + (dst_data + i + 58 * dst_step)[0] = m72; + (dst_data + i + 59 * dst_step)[0] = m73; + (dst_data + i + 60 * dst_step)[0] = m74; + (dst_data + i + 61 * dst_step)[0] = m75; + (dst_data + i + 62 * dst_step)[0] = m76; + (dst_data + i + 63 * dst_step)[0] = m77; + } +#endif +} + +void OutputTransform4x2UnitFp16(const float16_t 
*src_data, float16_t *dst_data, const float16_t *bias_data, + int src_step, int dst_step) { +#ifdef ENABLE_ARM + float16x8_t bias_ptr = vld1q_f16(bias_data); + float16x8_t src_data_00 = vld1q_f16(src_data + 0 * src_step); + float16x8_t src_data_01 = vld1q_f16(src_data + 1 * src_step); + float16x8_t src_data_02 = vld1q_f16(src_data + 2 * src_step); + float16x8_t src_data_03 = vld1q_f16(src_data + 3 * src_step); + float16x8_t src_data_10 = vld1q_f16(src_data + 4 * src_step); + float16x8_t src_data_11 = vld1q_f16(src_data + 5 * src_step); + float16x8_t src_data_12 = vld1q_f16(src_data + 6 * src_step); + float16x8_t src_data_13 = vld1q_f16(src_data + 7 * src_step); + float16x8_t src_data_20 = vld1q_f16(src_data + 8 * src_step); + float16x8_t src_data_21 = vld1q_f16(src_data + 9 * src_step); + float16x8_t src_data_22 = vld1q_f16(src_data + 10 * src_step); + float16x8_t src_data_23 = vld1q_f16(src_data + 11 * src_step); + float16x8_t src_data_30 = vld1q_f16(src_data + 12 * src_step); + float16x8_t src_data_31 = vld1q_f16(src_data + 13 * src_step); + float16x8_t src_data_32 = vld1q_f16(src_data + 14 * src_step); + float16x8_t src_data_33 = vld1q_f16(src_data + 15 * src_step); + + float16x8_t t00 = vaddq_f16(src_data_00, vaddq_f16(src_data_10, src_data_20)); + float16x8_t t01 = vaddq_f16(src_data_01, vaddq_f16(src_data_11, src_data_21)); + float16x8_t t02 = vaddq_f16(src_data_02, vaddq_f16(src_data_12, src_data_22)); + float16x8_t t03 = vaddq_f16(src_data_03, vaddq_f16(src_data_13, src_data_23)); + + float16x8_t t10 = vsubq_f16(src_data_30, vmulq_n_f16(vsubq_f16(src_data_10, src_data_20), 0.5)); + float16x8_t t11 = vsubq_f16(src_data_31, vmulq_n_f16(vsubq_f16(src_data_11, src_data_21), 0.5)); + float16x8_t t12 = vsubq_f16(src_data_32, vmulq_n_f16(vsubq_f16(src_data_12, src_data_22), 0.5)); + float16x8_t t13 = vsubq_f16(src_data_33, vmulq_n_f16(vsubq_f16(src_data_13, src_data_23), 0.5)); + + float16x8_t m00 = vaddq_f16(vaddq_f16(t00, vaddq_f16(t01, t02)), bias_ptr); + 
float16x8_t m01 = vaddq_f16(vaddq_f16(t03, vmulq_n_f16(vsubq_f16(t01, t02), 0.5)), bias_ptr); + float16x8_t m10 = vaddq_f16(vaddq_f16(t10, vaddq_f16(t11, t12)), bias_ptr); + float16x8_t m11 = vaddq_f16(vaddq_f16(t13, vmulq_n_f16(vsubq_f16(t11, t12), 0.5)), bias_ptr); + + vst1q_f16(dst_data, m00); + vst1q_f16(dst_data + C8NUM, m01); + vst1q_f16(dst_data + dst_step * C8NUM, m10); + vst1q_f16(dst_data + dst_step * C8NUM + C8NUM, m11); +#else + for (int i = 0; i < C8NUM; i++) { + float16_t src_data_00 = src_data[i]; + float16_t src_data_01 = src_data[i + src_step]; + float16_t src_data_02 = src_data[i + 2 * src_step]; + float16_t src_data_03 = src_data[i + 3 * src_step]; + float16_t src_data_10 = src_data[i + 4 * src_step]; + float16_t src_data_11 = src_data[i + 5 * src_step]; + float16_t src_data_12 = src_data[i + 6 * src_step]; + float16_t src_data_13 = src_data[i + 7 * src_step]; + float16_t src_data_20 = src_data[i + 8 * src_step]; + float16_t src_data_21 = src_data[i + 9 * src_step]; + float16_t src_data_22 = src_data[i + 10 * src_step]; + float16_t src_data_23 = src_data[i + 11 * src_step]; + float16_t src_data_30 = src_data[i + 12 * src_step]; + float16_t src_data_31 = src_data[i + 13 * src_step]; + float16_t src_data_32 = src_data[i + 14 * src_step]; + float16_t src_data_33 = src_data[i + 15 * src_step]; + + float16_t t00 = src_data_00 + src_data_10 + src_data_20; + float16_t t01 = src_data_01 + src_data_11 + src_data_21; + float16_t t02 = src_data_02 + src_data_12 + src_data_22; + float16_t t03 = src_data_03 + src_data_13 + src_data_23; + + const float16_t t10 = 0.5f * (src_data_10 - src_data_20) + src_data_30; + const float16_t t11 = 0.5f * (src_data_11 - src_data_21) + src_data_31; + const float16_t t12 = 0.5f * (src_data_12 - src_data_22) + src_data_32; + const float16_t t13 = 0.5f * (src_data_13 - src_data_23) + src_data_33; + + float16_t m00 = t00 + t01 + t02 + bias_data[i]; + const float16_t m01 = 0.5f * (t01 - t02) + t03 + bias_data[i]; + float16_t m10 
= t10 + t11 + t12 + bias_data[i]; + const float16_t m11 = 0.5f * (t11 - t12) + t13 + bias_data[i]; + + (dst_data + i)[0] = m00; + (dst_data + i + C8NUM)[0] = m01; + (dst_data + i + dst_step * C8NUM)[0] = m10; + (dst_data + i + dst_step * C8NUM + C8NUM)[0] = m11; + } +#endif +} + +void OutputTransform4x3UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data, + int src_step, int dst_step) { +#ifdef ENABLE_ARM + float16x8_t bias_ptr = vld1q_f16(bias_data); + float16x8_t src_data_00 = vld1q_f16(src_data + 0 * src_step); + float16x8_t src_data_01 = vld1q_f16(src_data + 1 * src_step); + float16x8_t src_data_02 = vld1q_f16(src_data + 2 * src_step); + float16x8_t src_data_03 = vld1q_f16(src_data + 3 * src_step); + float16x8_t src_data_10 = vld1q_f16(src_data + 4 * src_step); + float16x8_t src_data_11 = vld1q_f16(src_data + 5 * src_step); + float16x8_t src_data_12 = vld1q_f16(src_data + 6 * src_step); + float16x8_t src_data_13 = vld1q_f16(src_data + 7 * src_step); + float16x8_t src_data_20 = vld1q_f16(src_data + 8 * src_step); + float16x8_t src_data_21 = vld1q_f16(src_data + 9 * src_step); + float16x8_t src_data_22 = vld1q_f16(src_data + 10 * src_step); + float16x8_t src_data_23 = vld1q_f16(src_data + 11 * src_step); + float16x8_t src_data_30 = vld1q_f16(src_data + 12 * src_step); + float16x8_t src_data_31 = vld1q_f16(src_data + 13 * src_step); + float16x8_t src_data_32 = vld1q_f16(src_data + 14 * src_step); + float16x8_t src_data_33 = vld1q_f16(src_data + 15 * src_step); + + float16x8_t t00 = vaddq_f16(src_data_00, vaddq_f16(src_data_10, src_data_20)); + float16x8_t t01 = vaddq_f16(src_data_01, vaddq_f16(src_data_11, src_data_21)); + float16x8_t t02 = vaddq_f16(src_data_02, vaddq_f16(src_data_12, src_data_22)); + float16x8_t t03 = vaddq_f16(src_data_03, vaddq_f16(src_data_13, src_data_23)); + + float16x8_t t10 = vmulq_n_f16(vsubq_f16(src_data_10, src_data_20), 0.5); + float16x8_t t11 = vmulq_n_f16(vsubq_f16(src_data_11, src_data_21), 0.5); + 
float16x8_t t12 = vmulq_n_f16(vsubq_f16(src_data_12, src_data_22), 0.5); + float16x8_t t13 = vmulq_n_f16(vsubq_f16(src_data_13, src_data_23), 0.5); + + float16x8_t t20 = vaddq_f16(src_data_30, vmulq_n_f16(vaddq_f16(src_data_10, src_data_20), 0.25)); + float16x8_t t21 = vaddq_f16(src_data_31, vmulq_n_f16(vaddq_f16(src_data_11, src_data_21), 0.25)); + float16x8_t t22 = vaddq_f16(src_data_32, vmulq_n_f16(vaddq_f16(src_data_12, src_data_22), 0.25)); + float16x8_t t23 = vaddq_f16(src_data_33, vmulq_n_f16(vaddq_f16(src_data_13, src_data_23), 0.25)); + + float16x8_t m00 = vaddq_f16(vaddq_f16(t00, vaddq_f16(t01, t02)), bias_ptr); + float16x8_t m01 = vaddq_f16(vmulq_n_f16(vsubq_f16(t01, t02), 0.5), bias_ptr); + float16x8_t m02 = vaddq_f16(vaddq_f16(t03, vmulq_n_f16(vaddq_f16(t01, t02), 0.25)), bias_ptr); + float16x8_t m10 = vaddq_f16(vaddq_f16(t10, vaddq_f16(t11, t12)), bias_ptr); + float16x8_t m11 = vaddq_f16(vmulq_n_f16(vsubq_f16(t11, t12), 0.5), bias_ptr); + float16x8_t m12 = vaddq_f16(vaddq_f16(t13, vmulq_n_f16(vaddq_f16(t11, t12), 0.25)), bias_ptr); + float16x8_t m20 = vaddq_f16(vaddq_f16(t20, vaddq_f16(t21, t22)), bias_ptr); + float16x8_t m21 = vaddq_f16(vmulq_n_f16(vsubq_f16(t21, t22), 0.5), bias_ptr); + float16x8_t m22 = vaddq_f16(vaddq_f16(t23, vmulq_n_f16(vaddq_f16(t21, t22), 0.25)), bias_ptr); + + vst1q_f16(dst_data, m00); + vst1q_f16(dst_data + C8NUM, m01); + vst1q_f16(dst_data + 2 * C8NUM, m02); + vst1q_f16(dst_data + dst_step * C8NUM, m10); + vst1q_f16(dst_data + dst_step * C8NUM + C8NUM, m11); + vst1q_f16(dst_data + dst_step * C8NUM + 2 * C8NUM, m12); + vst1q_f16(dst_data + 2 * dst_step * C8NUM, m20); + vst1q_f16(dst_data + 2 * dst_step * C8NUM + C8NUM, m21); + vst1q_f16(dst_data + 2 * dst_step * C8NUM + 2 * C8NUM, m22); +#else + for (int i = 0; i < C8NUM; i++) { + float16_t src_data_00 = src_data[i]; + float16_t src_data_01 = src_data[i + src_step]; + float16_t src_data_02 = src_data[i + 2 * src_step]; + float16_t src_data_03 = src_data[i + 3 * src_step]; + 
float16_t src_data_10 = src_data[i + 4 * src_step]; + float16_t src_data_11 = src_data[i + 5 * src_step]; + float16_t src_data_12 = src_data[i + 6 * src_step]; + float16_t src_data_13 = src_data[i + 7 * src_step]; + float16_t src_data_20 = src_data[i + 8 * src_step]; + float16_t src_data_21 = src_data[i + 9 * src_step]; + float16_t src_data_22 = src_data[i + 10 * src_step]; + float16_t src_data_23 = src_data[i + 11 * src_step]; + float16_t src_data_30 = src_data[i + 12 * src_step]; + float16_t src_data_31 = src_data[i + 13 * src_step]; + float16_t src_data_32 = src_data[i + 14 * src_step]; + float16_t src_data_33 = src_data[i + 15 * src_step]; + + float16_t t00 = src_data_00 + src_data_10 + src_data_20; + float16_t t01 = src_data_01 + src_data_11 + src_data_21; + float16_t t02 = src_data_02 + src_data_12 + src_data_22; + float16_t t03 = src_data_03 + src_data_13 + src_data_23; + + const float16_t t10 = 0.5f * (src_data_10 - src_data_20); + const float16_t t11 = 0.5f * (src_data_11 - src_data_21); + const float16_t t12 = 0.5f * (src_data_12 - src_data_22); + const const float16_t t13 = 0.5f * (src_data_13 - src_data_23); + + const float16_t t20 = 0.25f * (src_data_10 + src_data_20) + src_data_30; + const float16_t t21 = 0.25f * (src_data_11 + src_data_21) + src_data_31; + const float16_t t22 = 0.25f * (src_data_12 + src_data_22) + src_data_32; + const float16_t t23 = 0.25f * (src_data_13 + src_data_23) + src_data_33; + + float16_t m00 = t00 + t01 + t02 + bias_data[i]; + const float16_t m01 = 0.5f * (t01 - t02) + bias_data[i]; + const float16_t m02 = 0.25f * (t01 + t02) + t03 + bias_data[i]; + + float16_t m10 = t10 + t11 + t12 + bias_data[i]; + const float16_t m11 = 0.5f * (t11 - t12) + bias_data[i]; + const float16_t m12 = 0.25f * (t11 + t12) + t13 + bias_data[i]; + + float16_t m20 = t20 + t21 + t22 + bias_data[i]; + const float16_t m21 = 0.5f * (t21 - t22) + bias_data[i]; + const float16_t m22 = 0.25f * (t21 + t22) + t23 + bias_data[i]; + + (dst_data + i)[0] = m00; 
+ (dst_data + i + C8NUM)[0] = m01; + (dst_data + i + 2 * C8NUM)[0] = m02; + + (dst_data + i + dst_step * C8NUM)[0] = m10; + (dst_data + i + dst_step * C8NUM + C8NUM)[0] = m11; + (dst_data + i + dst_step * C8NUM + 2 * C8NUM)[0] = m12; + + (dst_data + i + 2 * dst_step * C8NUM)[0] = m20; + (dst_data + i + 2 * dst_step * C8NUM + C8NUM)[0] = m21; + (dst_data + i + 2 * dst_step * C8NUM + 2 * C8NUM)[0] = m22; + } +#endif +} + +void OutputTransform8x2UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data, + int src_step, int dst_step) { +#ifdef ENABLE_ARM + float16x8_t src_data_00 = vld1q_f16(src_data + 0 * src_step); + float16x8_t src_data_01 = vld1q_f16(src_data + 1 * src_step); + float16x8_t src_data_02 = vld1q_f16(src_data + 2 * src_step); + float16x8_t src_data_03 = vld1q_f16(src_data + 3 * src_step); + float16x8_t src_data_04 = vld1q_f16(src_data + 4 * src_step); + float16x8_t src_data_05 = vld1q_f16(src_data + 5 * src_step); + float16x8_t src_data_06 = vld1q_f16(src_data + 6 * src_step); + float16x8_t src_data_07 = vld1q_f16(src_data + 7 * src_step); + float16x8_t src_data_10 = vld1q_f16(src_data + 8 * src_step); + float16x8_t src_data_11 = vld1q_f16(src_data + 9 * src_step); + float16x8_t src_data_12 = vld1q_f16(src_data + 10 * src_step); + float16x8_t src_data_13 = vld1q_f16(src_data + 11 * src_step); + float16x8_t src_data_14 = vld1q_f16(src_data + 12 * src_step); + float16x8_t src_data_15 = vld1q_f16(src_data + 13 * src_step); + float16x8_t src_data_16 = vld1q_f16(src_data + 14 * src_step); + float16x8_t src_data_17 = vld1q_f16(src_data + 15 * src_step); + float16x8_t src_data_20 = vld1q_f16(src_data + 16 * src_step); + float16x8_t src_data_21 = vld1q_f16(src_data + 17 * src_step); + float16x8_t src_data_22 = vld1q_f16(src_data + 18 * src_step); + float16x8_t src_data_23 = vld1q_f16(src_data + 19 * src_step); + float16x8_t src_data_24 = vld1q_f16(src_data + 20 * src_step); + float16x8_t src_data_25 = vld1q_f16(src_data + 21 * 
src_step); + float16x8_t src_data_26 = vld1q_f16(src_data + 22 * src_step); + float16x8_t src_data_27 = vld1q_f16(src_data + 23 * src_step); + float16x8_t src_data_30 = vld1q_f16(src_data + 24 * src_step); + float16x8_t src_data_31 = vld1q_f16(src_data + 25 * src_step); + float16x8_t src_data_32 = vld1q_f16(src_data + 26 * src_step); + float16x8_t src_data_33 = vld1q_f16(src_data + 27 * src_step); + float16x8_t src_data_34 = vld1q_f16(src_data + 28 * src_step); + float16x8_t src_data_35 = vld1q_f16(src_data + 29 * src_step); + float16x8_t src_data_36 = vld1q_f16(src_data + 30 * src_step); + float16x8_t src_data_37 = vld1q_f16(src_data + 31 * src_step); + float16x8_t src_data_40 = vld1q_f16(src_data + 32 * src_step); + float16x8_t src_data_41 = vld1q_f16(src_data + 33 * src_step); + float16x8_t src_data_42 = vld1q_f16(src_data + 34 * src_step); + float16x8_t src_data_43 = vld1q_f16(src_data + 35 * src_step); + float16x8_t src_data_44 = vld1q_f16(src_data + 36 * src_step); + float16x8_t src_data_45 = vld1q_f16(src_data + 37 * src_step); + float16x8_t src_data_46 = vld1q_f16(src_data + 38 * src_step); + float16x8_t src_data_47 = vld1q_f16(src_data + 39 * src_step); + float16x8_t src_data_50 = vld1q_f16(src_data + 40 * src_step); + float16x8_t src_data_51 = vld1q_f16(src_data + 41 * src_step); + float16x8_t src_data_52 = vld1q_f16(src_data + 42 * src_step); + float16x8_t src_data_53 = vld1q_f16(src_data + 43 * src_step); + float16x8_t src_data_54 = vld1q_f16(src_data + 44 * src_step); + float16x8_t src_data_55 = vld1q_f16(src_data + 45 * src_step); + float16x8_t src_data_56 = vld1q_f16(src_data + 46 * src_step); + float16x8_t src_data_57 = vld1q_f16(src_data + 47 * src_step); + float16x8_t src_data_60 = vld1q_f16(src_data + 48 * src_step); + float16x8_t src_data_61 = vld1q_f16(src_data + 49 * src_step); + float16x8_t src_data_62 = vld1q_f16(src_data + 50 * src_step); + float16x8_t src_data_63 = vld1q_f16(src_data + 51 * src_step); + float16x8_t src_data_64 = 
vld1q_f16(src_data + 52 * src_step); + float16x8_t src_data_65 = vld1q_f16(src_data + 53 * src_step); + float16x8_t src_data_66 = vld1q_f16(src_data + 54 * src_step); + float16x8_t src_data_67 = vld1q_f16(src_data + 55 * src_step); + float16x8_t src_data_70 = vld1q_f16(src_data + 56 * src_step); + float16x8_t src_data_71 = vld1q_f16(src_data + 57 * src_step); + float16x8_t src_data_72 = vld1q_f16(src_data + 58 * src_step); + float16x8_t src_data_73 = vld1q_f16(src_data + 59 * src_step); + float16x8_t src_data_74 = vld1q_f16(src_data + 60 * src_step); + float16x8_t src_data_75 = vld1q_f16(src_data + 61 * src_step); + float16x8_t src_data_76 = vld1q_f16(src_data + 62 * src_step); + float16x8_t src_data_77 = vld1q_f16(src_data + 63 * src_step); + + float16x8_t d01 = vsubq_f16(src_data_10, src_data_20); + float16x8_t d02 = vsubq_f16(src_data_11, src_data_21); + float16x8_t d03 = vsubq_f16(src_data_12, src_data_22); + float16x8_t d04 = vsubq_f16(src_data_13, src_data_23); + float16x8_t d05 = vsubq_f16(src_data_14, src_data_24); + float16x8_t d06 = vsubq_f16(src_data_15, src_data_25); + float16x8_t d07 = vsubq_f16(src_data_16, src_data_26); + float16x8_t d08 = vsubq_f16(src_data_17, src_data_27); + + float16x8_t d11 = vsubq_f16(src_data_30, src_data_40); + float16x8_t d12 = vsubq_f16(src_data_31, src_data_41); + float16x8_t d13 = vsubq_f16(src_data_32, src_data_42); + float16x8_t d14 = vsubq_f16(src_data_33, src_data_43); + float16x8_t d15 = vsubq_f16(src_data_34, src_data_44); + float16x8_t d16 = vsubq_f16(src_data_35, src_data_45); + float16x8_t d17 = vsubq_f16(src_data_36, src_data_46); + float16x8_t d18 = vsubq_f16(src_data_37, src_data_47); + + float16x8_t d21 = vsubq_f16(src_data_50, src_data_60); + float16x8_t d22 = vsubq_f16(src_data_51, src_data_61); + float16x8_t d23 = vsubq_f16(src_data_52, src_data_62); + float16x8_t d24 = vsubq_f16(src_data_53, src_data_63); + float16x8_t d25 = vsubq_f16(src_data_54, src_data_64); + float16x8_t d26 = vsubq_f16(src_data_55, 
src_data_65); + float16x8_t d27 = vsubq_f16(src_data_56, src_data_66); + float16x8_t d28 = vsubq_f16(src_data_57, src_data_67); + + float16x8_t t00 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_00, src_data_10), src_data_20), src_data_30), src_data_40), + src_data_50), + src_data_60); + float16x8_t t01 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_01, src_data_11), src_data_21), src_data_31), src_data_41), + src_data_51), + src_data_61); + float16x8_t t02 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_02, src_data_12), src_data_22), src_data_32), src_data_42), + src_data_52), + src_data_62); + float16x8_t t03 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_03, src_data_13), src_data_23), src_data_33), src_data_43), + src_data_53), + src_data_63); + float16x8_t t04 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_04, src_data_14), src_data_24), src_data_34), src_data_44), + src_data_54), + src_data_64); + float16x8_t t05 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_05, src_data_15), src_data_25), src_data_35), src_data_45), + src_data_55), + src_data_65); + float16x8_t t06 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_06, src_data_16), src_data_26), src_data_36), src_data_46), + src_data_56), + src_data_66); + float16x8_t t07 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_07, src_data_17), src_data_27), src_data_37), src_data_47), + src_data_57), + src_data_67); + + float16x8_t t10 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d01, 0.5), d11), vmulq_n_f16(d21, 1.5)), src_data_70); + float16x8_t t11 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d02, 0.5), d12), vmulq_n_f16(d22, 1.5)), src_data_71); + float16x8_t t12 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d03, 0.5), d13), vmulq_n_f16(d23, 1.5)), src_data_72); + float16x8_t t13 = 
vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d04, 0.5), d14), vmulq_n_f16(d24, 1.5)), src_data_73); + float16x8_t t14 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d05, 0.5), d15), vmulq_n_f16(d25, 1.5)), src_data_74); + float16x8_t t15 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d06, 0.5), d16), vmulq_n_f16(d26, 1.5)), src_data_75); + float16x8_t t16 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d07, 0.5), d17), vmulq_n_f16(d27, 1.5)), src_data_76); + float16x8_t t17 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d08, 0.5), d18), vmulq_n_f16(d28, 1.5)), src_data_77); + + float16x8_t s11 = vsubq_f16(t01, t02); + float16x8_t s12 = vsubq_f16(t11, t12); + + float16x8_t s21 = vsubq_f16(t03, t04); + float16x8_t s22 = vsubq_f16(t13, t14); + + float16x8_t s31 = vsubq_f16(t05, t06); + float16x8_t s32 = vsubq_f16(t15, t16); + + float16x8_t m00 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t00, t01), t02), t03), t04), t05), t06); + float16x8_t m01 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s11, 0.5), s21), vmulq_n_f16(s31, 1.5)), t07); + + float16x8_t m10 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t10, t11), t12), t13), t14), t15), t16); + float16x8_t m11 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s12, 0.5), s22), vmulq_n_f16(s32, 1.5)), t17); + + float16x8_t bias_ptr = vld1q_f16(bias_data); + vst1q_f16(dst_data, vaddq_f16(m00, bias_ptr)); + vst1q_f16(dst_data + C8NUM, vaddq_f16(m01, bias_ptr)); + + vst1q_f16(dst_data + dst_step * C8NUM, vaddq_f16(m10, bias_ptr)); + vst1q_f16(dst_data + dst_step * C8NUM + C8NUM, vaddq_f16(m11, bias_ptr)); +#else + for (int i = 0; i < C8NUM; i++) { + float16_t src_data_00 = src_data[i]; + float16_t src_data_01 = src_data[i + src_step]; + float16_t src_data_02 = src_data[i + 2 * src_step]; + float16_t src_data_03 = src_data[i + 3 * src_step]; + float16_t src_data_04 = src_data[i + 4 * src_step]; + float16_t src_data_05 = src_data[i + 5 * src_step]; + float16_t src_data_06 = src_data[i + 6 * src_step]; + float16_t 
src_data_07 = src_data[i + 7 * src_step]; + float16_t src_data_10 = src_data[i + 8 * src_step]; + float16_t src_data_11 = src_data[i + 9 * src_step]; + float16_t src_data_12 = src_data[i + 10 * src_step]; + float16_t src_data_13 = src_data[i + 11 * src_step]; + float16_t src_data_14 = src_data[i + 12 * src_step]; + float16_t src_data_15 = src_data[i + 13 * src_step]; + float16_t src_data_16 = src_data[i + 14 * src_step]; + float16_t src_data_17 = src_data[i + 15 * src_step]; + float16_t src_data_20 = src_data[i + 16 * src_step]; + float16_t src_data_21 = src_data[i + 17 * src_step]; + float16_t src_data_22 = src_data[i + 18 * src_step]; + float16_t src_data_23 = src_data[i + 19 * src_step]; + float16_t src_data_24 = src_data[i + 20 * src_step]; + float16_t src_data_25 = src_data[i + 21 * src_step]; + float16_t src_data_26 = src_data[i + 22 * src_step]; + float16_t src_data_27 = src_data[i + 23 * src_step]; + float16_t src_data_30 = src_data[i + 24 * src_step]; + float16_t src_data_31 = src_data[i + 25 * src_step]; + float16_t src_data_32 = src_data[i + 26 * src_step]; + float16_t src_data_33 = src_data[i + 27 * src_step]; + float16_t src_data_34 = src_data[i + 28 * src_step]; + float16_t src_data_35 = src_data[i + 29 * src_step]; + float16_t src_data_36 = src_data[i + 30 * src_step]; + float16_t src_data_37 = src_data[i + 31 * src_step]; + float16_t src_data_40 = src_data[i + 32 * src_step]; + float16_t src_data_41 = src_data[i + 33 * src_step]; + float16_t src_data_42 = src_data[i + 34 * src_step]; + float16_t src_data_43 = src_data[i + 35 * src_step]; + float16_t src_data_44 = src_data[i + 36 * src_step]; + float16_t src_data_45 = src_data[i + 37 * src_step]; + float16_t src_data_46 = src_data[i + 38 * src_step]; + float16_t src_data_47 = src_data[i + 39 * src_step]; + float16_t src_data_50 = src_data[i + 40 * src_step]; + float16_t src_data_51 = src_data[i + 41 * src_step]; + float16_t src_data_52 = src_data[i + 42 * src_step]; + float16_t src_data_53 = 
src_data[i + 43 * src_step]; + float16_t src_data_54 = src_data[i + 44 * src_step]; + float16_t src_data_55 = src_data[i + 45 * src_step]; + float16_t src_data_56 = src_data[i + 46 * src_step]; + float16_t src_data_57 = src_data[i + 47 * src_step]; + float16_t src_data_60 = src_data[i + 48 * src_step]; + float16_t src_data_61 = src_data[i + 49 * src_step]; + float16_t src_data_62 = src_data[i + 50 * src_step]; + float16_t src_data_63 = src_data[i + 51 * src_step]; + float16_t src_data_64 = src_data[i + 52 * src_step]; + float16_t src_data_65 = src_data[i + 53 * src_step]; + float16_t src_data_66 = src_data[i + 54 * src_step]; + float16_t src_data_67 = src_data[i + 55 * src_step]; + float16_t src_data_70 = src_data[i + 56 * src_step]; + float16_t src_data_71 = src_data[i + 57 * src_step]; + float16_t src_data_72 = src_data[i + 58 * src_step]; + float16_t src_data_73 = src_data[i + 59 * src_step]; + float16_t src_data_74 = src_data[i + 60 * src_step]; + float16_t src_data_75 = src_data[i + 61 * src_step]; + float16_t src_data_76 = src_data[i + 62 * src_step]; + float16_t src_data_77 = src_data[i + 63 * src_step]; + + float16_t d01 = src_data_10 - src_data_20; + float16_t d02 = src_data_11 - src_data_21; + float16_t d03 = src_data_12 - src_data_22; + float16_t d04 = src_data_13 - src_data_23; + float16_t d05 = src_data_14 - src_data_24; + float16_t d06 = src_data_15 - src_data_25; + float16_t d07 = src_data_16 - src_data_26; + float16_t d08 = src_data_17 - src_data_27; + + float16_t d11 = src_data_30 - src_data_40; + float16_t d12 = src_data_31 - src_data_41; + float16_t d13 = src_data_32 - src_data_42; + float16_t d14 = src_data_33 - src_data_43; + float16_t d15 = src_data_34 - src_data_44; + float16_t d16 = src_data_35 - src_data_45; + float16_t d17 = src_data_36 - src_data_46; + float16_t d18 = src_data_37 - src_data_47; + + float16_t d21 = src_data_50 - src_data_60; + float16_t d22 = src_data_51 - src_data_61; + float16_t d23 = src_data_52 - src_data_62; + 
float16_t d24 = src_data_53 - src_data_63; + float16_t d25 = src_data_54 - src_data_64; + float16_t d26 = src_data_55 - src_data_65; + float16_t d27 = src_data_56 - src_data_66; + float16_t d28 = src_data_57 - src_data_67; + + float16_t t00 = src_data_00 + src_data_10 + src_data_20 + src_data_30 + src_data_40 + src_data_50 + src_data_60; + float16_t t01 = src_data_01 + src_data_11 + src_data_21 + src_data_31 + src_data_41 + src_data_51 + src_data_61; + float16_t t02 = src_data_02 + src_data_12 + src_data_22 + src_data_32 + src_data_42 + src_data_52 + src_data_62; + float16_t t03 = src_data_03 + src_data_13 + src_data_23 + src_data_33 + src_data_43 + src_data_53 + src_data_63; + float16_t t04 = src_data_04 + src_data_14 + src_data_24 + src_data_34 + src_data_44 + src_data_54 + src_data_64; + float16_t t05 = src_data_05 + src_data_15 + src_data_25 + src_data_35 + src_data_45 + src_data_55 + src_data_65; + float16_t t06 = src_data_06 + src_data_16 + src_data_26 + src_data_36 + src_data_46 + src_data_56 + src_data_66; + float16_t t07 = src_data_07 + src_data_17 + src_data_27 + src_data_37 + src_data_47 + src_data_57 + src_data_67; + + const float16_t t10 = 0.5f * d01 + d11 + 1.5f * d21 + src_data_70; + const float16_t t11 = 0.5f * d02 + d12 + 1.5f * d22 + src_data_71; + const float16_t t12 = 0.5f * d03 + d13 + 1.5f * d23 + src_data_72; + const float16_t t13 = 0.5f * d04 + d14 + 1.5f * d24 + src_data_73; + const float16_t t14 = 0.5f * d05 + d15 + 1.5f * d25 + src_data_74; + const float16_t t15 = 0.5f * d06 + d16 + 1.5f * d26 + src_data_75; + const float16_t t16 = 0.5f * d07 + d17 + 1.5f * d27 + src_data_76; + const float16_t t17 = 0.5f * d08 + d18 + 1.5f * d28 + src_data_77; + + float16_t s11 = t01 - t02; + float16_t s12 = t11 - t12; + float16_t s21 = t03 - t04; + float16_t s22 = t13 - t14; + float16_t s31 = t05 - t06; + float16_t s32 = t15 - t16; + + float16_t m00 = t00 + t01 + t02 + t03 + t04 + t05 + t06; + const float16_t m01 = 0.5f * s11 + s21 + 1.5f * s31 + t07; + 
float16_t m10 = t10 + t11 + t12 + t13 + t14 + t15 + t16; + const float16_t m11 = 0.5f * s12 + s22 + 1.5f * s32 + t17; + + (dst_data + i)[0] = m00 + bias_data[i]; + (dst_data + i + C8NUM)[0] = m01 + bias_data[i]; + (dst_data + i + dst_step * C8NUM)[0] = m10 + bias_data[i]; + (dst_data + i + dst_step * C8NUM + C8NUM)[0] = m11 + bias_data[i]; + } +#endif +} + +void OutputTransform8x3UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data, + int src_step, int dst_step) { +#ifdef ENABLE_ARM + float16x8_t src_data_00 = vld1q_f16(src_data + 0 * src_step); + float16x8_t src_data_01 = vld1q_f16(src_data + 1 * src_step); + float16x8_t src_data_02 = vld1q_f16(src_data + 2 * src_step); + float16x8_t src_data_03 = vld1q_f16(src_data + 3 * src_step); + float16x8_t src_data_04 = vld1q_f16(src_data + 4 * src_step); + float16x8_t src_data_05 = vld1q_f16(src_data + 5 * src_step); + float16x8_t src_data_06 = vld1q_f16(src_data + 6 * src_step); + float16x8_t src_data_07 = vld1q_f16(src_data + 7 * src_step); + float16x8_t src_data_10 = vld1q_f16(src_data + 8 * src_step); + float16x8_t src_data_11 = vld1q_f16(src_data + 9 * src_step); + float16x8_t src_data_12 = vld1q_f16(src_data + 10 * src_step); + float16x8_t src_data_13 = vld1q_f16(src_data + 11 * src_step); + float16x8_t src_data_14 = vld1q_f16(src_data + 12 * src_step); + float16x8_t src_data_15 = vld1q_f16(src_data + 13 * src_step); + float16x8_t src_data_16 = vld1q_f16(src_data + 14 * src_step); + float16x8_t src_data_17 = vld1q_f16(src_data + 15 * src_step); + float16x8_t src_data_20 = vld1q_f16(src_data + 16 * src_step); + float16x8_t src_data_21 = vld1q_f16(src_data + 17 * src_step); + float16x8_t src_data_22 = vld1q_f16(src_data + 18 * src_step); + float16x8_t src_data_23 = vld1q_f16(src_data + 19 * src_step); + float16x8_t src_data_24 = vld1q_f16(src_data + 20 * src_step); + float16x8_t src_data_25 = vld1q_f16(src_data + 21 * src_step); + float16x8_t src_data_26 = vld1q_f16(src_data + 22 * 
src_step); + float16x8_t src_data_27 = vld1q_f16(src_data + 23 * src_step); + float16x8_t src_data_30 = vld1q_f16(src_data + 24 * src_step); + float16x8_t src_data_31 = vld1q_f16(src_data + 25 * src_step); + float16x8_t src_data_32 = vld1q_f16(src_data + 26 * src_step); + float16x8_t src_data_33 = vld1q_f16(src_data + 27 * src_step); + float16x8_t src_data_34 = vld1q_f16(src_data + 28 * src_step); + float16x8_t src_data_35 = vld1q_f16(src_data + 29 * src_step); + float16x8_t src_data_36 = vld1q_f16(src_data + 30 * src_step); + float16x8_t src_data_37 = vld1q_f16(src_data + 31 * src_step); + float16x8_t src_data_40 = vld1q_f16(src_data + 32 * src_step); + float16x8_t src_data_41 = vld1q_f16(src_data + 33 * src_step); + float16x8_t src_data_42 = vld1q_f16(src_data + 34 * src_step); + float16x8_t src_data_43 = vld1q_f16(src_data + 35 * src_step); + float16x8_t src_data_44 = vld1q_f16(src_data + 36 * src_step); + float16x8_t src_data_45 = vld1q_f16(src_data + 37 * src_step); + float16x8_t src_data_46 = vld1q_f16(src_data + 38 * src_step); + float16x8_t src_data_47 = vld1q_f16(src_data + 39 * src_step); + float16x8_t src_data_50 = vld1q_f16(src_data + 40 * src_step); + float16x8_t src_data_51 = vld1q_f16(src_data + 41 * src_step); + float16x8_t src_data_52 = vld1q_f16(src_data + 42 * src_step); + float16x8_t src_data_53 = vld1q_f16(src_data + 43 * src_step); + float16x8_t src_data_54 = vld1q_f16(src_data + 44 * src_step); + float16x8_t src_data_55 = vld1q_f16(src_data + 45 * src_step); + float16x8_t src_data_56 = vld1q_f16(src_data + 46 * src_step); + float16x8_t src_data_57 = vld1q_f16(src_data + 47 * src_step); + float16x8_t src_data_60 = vld1q_f16(src_data + 48 * src_step); + float16x8_t src_data_61 = vld1q_f16(src_data + 49 * src_step); + float16x8_t src_data_62 = vld1q_f16(src_data + 50 * src_step); + float16x8_t src_data_63 = vld1q_f16(src_data + 51 * src_step); + float16x8_t src_data_64 = vld1q_f16(src_data + 52 * src_step); + float16x8_t src_data_65 = 
vld1q_f16(src_data + 53 * src_step); + float16x8_t src_data_66 = vld1q_f16(src_data + 54 * src_step); + float16x8_t src_data_67 = vld1q_f16(src_data + 55 * src_step); + float16x8_t src_data_70 = vld1q_f16(src_data + 56 * src_step); + float16x8_t src_data_71 = vld1q_f16(src_data + 57 * src_step); + float16x8_t src_data_72 = vld1q_f16(src_data + 58 * src_step); + float16x8_t src_data_73 = vld1q_f16(src_data + 59 * src_step); + float16x8_t src_data_74 = vld1q_f16(src_data + 60 * src_step); + float16x8_t src_data_75 = vld1q_f16(src_data + 61 * src_step); + float16x8_t src_data_76 = vld1q_f16(src_data + 62 * src_step); + float16x8_t src_data_77 = vld1q_f16(src_data + 63 * src_step); + + float16x8_t d01 = vsubq_f16(src_data_10, src_data_20); + float16x8_t d02 = vsubq_f16(src_data_11, src_data_21); + float16x8_t d03 = vsubq_f16(src_data_12, src_data_22); + float16x8_t d04 = vsubq_f16(src_data_13, src_data_23); + float16x8_t d05 = vsubq_f16(src_data_14, src_data_24); + float16x8_t d06 = vsubq_f16(src_data_15, src_data_25); + float16x8_t d07 = vsubq_f16(src_data_16, src_data_26); + float16x8_t d08 = vsubq_f16(src_data_17, src_data_27); + + float16x8_t d11 = vsubq_f16(src_data_30, src_data_40); + float16x8_t d12 = vsubq_f16(src_data_31, src_data_41); + float16x8_t d13 = vsubq_f16(src_data_32, src_data_42); + float16x8_t d14 = vsubq_f16(src_data_33, src_data_43); + float16x8_t d15 = vsubq_f16(src_data_34, src_data_44); + float16x8_t d16 = vsubq_f16(src_data_35, src_data_45); + float16x8_t d17 = vsubq_f16(src_data_36, src_data_46); + float16x8_t d18 = vsubq_f16(src_data_37, src_data_47); + + float16x8_t d21 = vsubq_f16(src_data_50, src_data_60); + float16x8_t d22 = vsubq_f16(src_data_51, src_data_61); + float16x8_t d23 = vsubq_f16(src_data_52, src_data_62); + float16x8_t d24 = vsubq_f16(src_data_53, src_data_63); + float16x8_t d25 = vsubq_f16(src_data_54, src_data_64); + float16x8_t d26 = vsubq_f16(src_data_55, src_data_65); + float16x8_t d27 = vsubq_f16(src_data_56, 
src_data_66); + float16x8_t d28 = vsubq_f16(src_data_57, src_data_67); + + float16x8_t d31 = vaddq_f16(src_data_10, src_data_20); + float16x8_t d32 = vaddq_f16(src_data_11, src_data_21); + float16x8_t d33 = vaddq_f16(src_data_12, src_data_22); + float16x8_t d34 = vaddq_f16(src_data_13, src_data_23); + float16x8_t d35 = vaddq_f16(src_data_14, src_data_24); + float16x8_t d36 = vaddq_f16(src_data_15, src_data_25); + float16x8_t d37 = vaddq_f16(src_data_16, src_data_26); + float16x8_t d38 = vaddq_f16(src_data_17, src_data_27); + + float16x8_t d41 = vaddq_f16(src_data_30, src_data_40); + float16x8_t d42 = vaddq_f16(src_data_31, src_data_41); + float16x8_t d43 = vaddq_f16(src_data_32, src_data_42); + float16x8_t d44 = vaddq_f16(src_data_33, src_data_43); + float16x8_t d45 = vaddq_f16(src_data_34, src_data_44); + float16x8_t d46 = vaddq_f16(src_data_35, src_data_45); + float16x8_t d47 = vaddq_f16(src_data_36, src_data_46); + float16x8_t d48 = vaddq_f16(src_data_37, src_data_47); + + float16x8_t d51 = vaddq_f16(src_data_50, src_data_60); + float16x8_t d52 = vaddq_f16(src_data_51, src_data_61); + float16x8_t d53 = vaddq_f16(src_data_52, src_data_62); + float16x8_t d54 = vaddq_f16(src_data_53, src_data_63); + float16x8_t d55 = vaddq_f16(src_data_54, src_data_64); + float16x8_t d56 = vaddq_f16(src_data_55, src_data_65); + float16x8_t d57 = vaddq_f16(src_data_56, src_data_66); + float16x8_t d58 = vaddq_f16(src_data_57, src_data_67); + + float16x8_t t00 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_00, src_data_10), src_data_20), src_data_30), src_data_40), + src_data_50), + src_data_60); + float16x8_t t01 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_01, src_data_11), src_data_21), src_data_31), src_data_41), + src_data_51), + src_data_61); + float16x8_t t02 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_02, src_data_12), src_data_22), src_data_32), src_data_42), + src_data_52), + 
src_data_62); + float16x8_t t03 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_03, src_data_13), src_data_23), src_data_33), src_data_43), + src_data_53), + src_data_63); + float16x8_t t04 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_04, src_data_14), src_data_24), src_data_34), src_data_44), + src_data_54), + src_data_64); + float16x8_t t05 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_05, src_data_15), src_data_25), src_data_35), src_data_45), + src_data_55), + src_data_65); + float16x8_t t06 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_06, src_data_16), src_data_26), src_data_36), src_data_46), + src_data_56), + src_data_66); + float16x8_t t07 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_07, src_data_17), src_data_27), src_data_37), src_data_47), + src_data_57), + src_data_67); + + float16x8_t t10 = vaddq_f16(vaddq_f16(vmulq_n_f16(d01, 0.5), d11), vmulq_n_f16(d21, 1.5)); + float16x8_t t11 = vaddq_f16(vaddq_f16(vmulq_n_f16(d02, 0.5), d12), vmulq_n_f16(d22, 1.5)); + float16x8_t t12 = vaddq_f16(vaddq_f16(vmulq_n_f16(d03, 0.5), d13), vmulq_n_f16(d23, 1.5)); + float16x8_t t13 = vaddq_f16(vaddq_f16(vmulq_n_f16(d04, 0.5), d14), vmulq_n_f16(d24, 1.5)); + float16x8_t t14 = vaddq_f16(vaddq_f16(vmulq_n_f16(d05, 0.5), d15), vmulq_n_f16(d25, 1.5)); + float16x8_t t15 = vaddq_f16(vaddq_f16(vmulq_n_f16(d06, 0.5), d16), vmulq_n_f16(d26, 1.5)); + float16x8_t t16 = vaddq_f16(vaddq_f16(vmulq_n_f16(d07, 0.5), d17), vmulq_n_f16(d27, 1.5)); + float16x8_t t17 = vaddq_f16(vaddq_f16(vmulq_n_f16(d08, 0.5), d18), vmulq_n_f16(d28, 1.5)); + + float16x8_t t20 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d31, 0.25), d41), vmulq_n_f16(d51, 2.25)), src_data_70); + float16x8_t t21 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d32, 0.25), d42), vmulq_n_f16(d52, 2.25)), src_data_71); + float16x8_t t22 = 
vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d33, 0.25), d43), vmulq_n_f16(d53, 2.25)), src_data_72); + float16x8_t t23 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d34, 0.25), d44), vmulq_n_f16(d54, 2.25)), src_data_73); + float16x8_t t24 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d35, 0.25), d45), vmulq_n_f16(d55, 2.25)), src_data_74); + float16x8_t t25 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d36, 0.25), d46), vmulq_n_f16(d56, 2.25)), src_data_75); + float16x8_t t26 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d37, 0.25), d47), vmulq_n_f16(d57, 2.25)), src_data_76); + float16x8_t t27 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d38, 0.25), d48), vmulq_n_f16(d58, 2.25)), src_data_77); + + float16x8_t s11 = vsubq_f16(t01, t02); + float16x8_t s12 = vsubq_f16(t11, t12); + float16x8_t s13 = vsubq_f16(t21, t22); + + float16x8_t s21 = vsubq_f16(t03, t04); + float16x8_t s22 = vsubq_f16(t13, t14); + float16x8_t s23 = vsubq_f16(t23, t24); + + float16x8_t s31 = vsubq_f16(t05, t06); + float16x8_t s32 = vsubq_f16(t15, t16); + float16x8_t s33 = vsubq_f16(t25, t26); + + float16x8_t s41 = vaddq_f16(t01, t02); + float16x8_t s42 = vaddq_f16(t11, t12); + float16x8_t s43 = vaddq_f16(t21, t22); + + float16x8_t s51 = vaddq_f16(t03, t04); + float16x8_t s52 = vaddq_f16(t13, t14); + float16x8_t s53 = vaddq_f16(t23, t24); + + float16x8_t s61 = vaddq_f16(t05, t06); + float16x8_t s62 = vaddq_f16(t15, t16); + float16x8_t s63 = vaddq_f16(t25, t26); + + float16x8_t m00 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t00, t01), t02), t03), t04), t05), t06); + float16x8_t m01 = vaddq_f16(vaddq_f16(vmulq_n_f16(s11, 0.5), s21), vmulq_n_f16(s31, 1.5)); + float16x8_t m02 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s41, 0.25), s51), vmulq_n_f16(s61, 2.25)), t07); + + float16x8_t m10 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t10, t11), t12), t13), t14), t15), t16); + float16x8_t m11 = vaddq_f16(vaddq_f16(vmulq_n_f16(s12, 0.5), s22), vmulq_n_f16(s32, 1.5)); + 
float16x8_t m12 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s42, 0.25), s52), vmulq_n_f16(s62, 2.25)), t17); + + float16x8_t m20 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t20, t21), t22), t23), t24), t25), t26); + float16x8_t m21 = vaddq_f16(vaddq_f16(vmulq_n_f16(s13, 0.5), s23), vmulq_n_f16(s33, 1.5)); + float16x8_t m22 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s43, 0.25), s53), vmulq_n_f16(s63, 2.25)), t27); + + float16x8_t bias_ptr = vld1q_f16(bias_data); + vst1q_f16(dst_data, vaddq_f16(m00, bias_ptr)); + vst1q_f16(dst_data + C8NUM, vaddq_f16(m01, bias_ptr)); + vst1q_f16(dst_data + 2 * C8NUM, vaddq_f16(m02, bias_ptr)); + + vst1q_f16(dst_data + dst_step * C8NUM, vaddq_f16(m10, bias_ptr)); + vst1q_f16(dst_data + dst_step * C8NUM + C8NUM, vaddq_f16(m11, bias_ptr)); + vst1q_f16(dst_data + dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m12, bias_ptr)); + + vst1q_f16(dst_data + 2 * dst_step * C8NUM, vaddq_f16(m20, bias_ptr)); + vst1q_f16(dst_data + 2 * dst_step * C8NUM + C8NUM, vaddq_f16(m21, bias_ptr)); + vst1q_f16(dst_data + 2 * dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m22, bias_ptr)); +#else + for (int i = 0; i < C8NUM; i++) { + float16_t src_data_00 = src_data[i]; + float16_t src_data_01 = src_data[i + src_step]; + float16_t src_data_02 = src_data[i + 2 * src_step]; + float16_t src_data_03 = src_data[i + 3 * src_step]; + float16_t src_data_04 = src_data[i + 4 * src_step]; + float16_t src_data_05 = src_data[i + 5 * src_step]; + float16_t src_data_06 = src_data[i + 6 * src_step]; + float16_t src_data_07 = src_data[i + 7 * src_step]; + float16_t src_data_10 = src_data[i + 8 * src_step]; + float16_t src_data_11 = src_data[i + 9 * src_step]; + float16_t src_data_12 = src_data[i + 10 * src_step]; + float16_t src_data_13 = src_data[i + 11 * src_step]; + float16_t src_data_14 = src_data[i + 12 * src_step]; + float16_t src_data_15 = src_data[i + 13 * src_step]; + float16_t src_data_16 = src_data[i + 14 * src_step]; + float16_t src_data_17 = src_data[i + 15 * 
src_step]; + float16_t src_data_20 = src_data[i + 16 * src_step]; + float16_t src_data_21 = src_data[i + 17 * src_step]; + float16_t src_data_22 = src_data[i + 18 * src_step]; + float16_t src_data_23 = src_data[i + 19 * src_step]; + float16_t src_data_24 = src_data[i + 20 * src_step]; + float16_t src_data_25 = src_data[i + 21 * src_step]; + float16_t src_data_26 = src_data[i + 22 * src_step]; + float16_t src_data_27 = src_data[i + 23 * src_step]; + float16_t src_data_30 = src_data[i + 24 * src_step]; + float16_t src_data_31 = src_data[i + 25 * src_step]; + float16_t src_data_32 = src_data[i + 26 * src_step]; + float16_t src_data_33 = src_data[i + 27 * src_step]; + float16_t src_data_34 = src_data[i + 28 * src_step]; + float16_t src_data_35 = src_data[i + 29 * src_step]; + float16_t src_data_36 = src_data[i + 30 * src_step]; + float16_t src_data_37 = src_data[i + 31 * src_step]; + float16_t src_data_40 = src_data[i + 32 * src_step]; + float16_t src_data_41 = src_data[i + 33 * src_step]; + float16_t src_data_42 = src_data[i + 34 * src_step]; + float16_t src_data_43 = src_data[i + 35 * src_step]; + float16_t src_data_44 = src_data[i + 36 * src_step]; + float16_t src_data_45 = src_data[i + 37 * src_step]; + float16_t src_data_46 = src_data[i + 38 * src_step]; + float16_t src_data_47 = src_data[i + 39 * src_step]; + float16_t src_data_50 = src_data[i + 40 * src_step]; + float16_t src_data_51 = src_data[i + 41 * src_step]; + float16_t src_data_52 = src_data[i + 42 * src_step]; + float16_t src_data_53 = src_data[i + 43 * src_step]; + float16_t src_data_54 = src_data[i + 44 * src_step]; + float16_t src_data_55 = src_data[i + 45 * src_step]; + float16_t src_data_56 = src_data[i + 46 * src_step]; + float16_t src_data_57 = src_data[i + 47 * src_step]; + float16_t src_data_60 = src_data[i + 48 * src_step]; + float16_t src_data_61 = src_data[i + 49 * src_step]; + float16_t src_data_62 = src_data[i + 50 * src_step]; + float16_t src_data_63 = src_data[i + 51 * src_step]; + 
float16_t src_data_64 = src_data[i + 52 * src_step]; + float16_t src_data_65 = src_data[i + 53 * src_step]; + float16_t src_data_66 = src_data[i + 54 * src_step]; + float16_t src_data_67 = src_data[i + 55 * src_step]; + float16_t src_data_70 = src_data[i + 56 * src_step]; + float16_t src_data_71 = src_data[i + 57 * src_step]; + float16_t src_data_72 = src_data[i + 58 * src_step]; + float16_t src_data_73 = src_data[i + 59 * src_step]; + float16_t src_data_74 = src_data[i + 60 * src_step]; + float16_t src_data_75 = src_data[i + 61 * src_step]; + float16_t src_data_76 = src_data[i + 62 * src_step]; + float16_t src_data_77 = src_data[i + 63 * src_step]; + + float16_t d01 = src_data_10 - src_data_20; + float16_t d02 = src_data_11 - src_data_21; + float16_t d03 = src_data_12 - src_data_22; + float16_t d04 = src_data_13 - src_data_23; + float16_t d05 = src_data_14 - src_data_24; + float16_t d06 = src_data_15 - src_data_25; + float16_t d07 = src_data_16 - src_data_26; + float16_t d08 = src_data_17 - src_data_27; + + float16_t d11 = src_data_30 - src_data_40; + float16_t d12 = src_data_31 - src_data_41; + float16_t d13 = src_data_32 - src_data_42; + float16_t d14 = src_data_33 - src_data_43; + float16_t d15 = src_data_34 - src_data_44; + float16_t d16 = src_data_35 - src_data_45; + float16_t d17 = src_data_36 - src_data_46; + float16_t d18 = src_data_37 - src_data_47; + + float16_t d21 = src_data_50 - src_data_60; + float16_t d22 = src_data_51 - src_data_61; + float16_t d23 = src_data_52 - src_data_62; + float16_t d24 = src_data_53 - src_data_63; + float16_t d25 = src_data_54 - src_data_64; + float16_t d26 = src_data_55 - src_data_65; + float16_t d27 = src_data_56 - src_data_66; + float16_t d28 = src_data_57 - src_data_67; + + float16_t d31 = src_data_10 + src_data_20; + float16_t d32 = src_data_11 + src_data_21; + float16_t d33 = src_data_12 + src_data_22; + float16_t d34 = src_data_13 + src_data_23; + float16_t d35 = src_data_14 + src_data_24; + float16_t d36 = 
src_data_15 + src_data_25; + float16_t d37 = src_data_16 + src_data_26; + float16_t d38 = src_data_17 + src_data_27; + + float16_t d41 = src_data_30 + src_data_40; + float16_t d42 = src_data_31 + src_data_41; + float16_t d43 = src_data_32 + src_data_42; + float16_t d44 = src_data_33 + src_data_43; + float16_t d45 = src_data_34 + src_data_44; + float16_t d46 = src_data_35 + src_data_45; + float16_t d47 = src_data_36 + src_data_46; + float16_t d48 = src_data_37 + src_data_47; + + float16_t d51 = src_data_50 + src_data_60; + float16_t d52 = src_data_51 + src_data_61; + float16_t d53 = src_data_52 + src_data_62; + float16_t d54 = src_data_53 + src_data_63; + float16_t d55 = src_data_54 + src_data_64; + float16_t d56 = src_data_55 + src_data_65; + float16_t d57 = src_data_56 + src_data_66; + float16_t d58 = src_data_57 + src_data_67; + + float16_t t00 = src_data_00 + src_data_10 + src_data_20 + src_data_30 + src_data_40 + src_data_50 + src_data_60; + float16_t t01 = src_data_01 + src_data_11 + src_data_21 + src_data_31 + src_data_41 + src_data_51 + src_data_61; + float16_t t02 = src_data_02 + src_data_12 + src_data_22 + src_data_32 + src_data_42 + src_data_52 + src_data_62; + float16_t t03 = src_data_03 + src_data_13 + src_data_23 + src_data_33 + src_data_43 + src_data_53 + src_data_63; + float16_t t04 = src_data_04 + src_data_14 + src_data_24 + src_data_34 + src_data_44 + src_data_54 + src_data_64; + float16_t t05 = src_data_05 + src_data_15 + src_data_25 + src_data_35 + src_data_45 + src_data_55 + src_data_65; + float16_t t06 = src_data_06 + src_data_16 + src_data_26 + src_data_36 + src_data_46 + src_data_56 + src_data_66; + float16_t t07 = src_data_07 + src_data_17 + src_data_27 + src_data_37 + src_data_47 + src_data_57 + src_data_67; + + const float16_t t10 = 0.5f * d01 + d11 + 1.5f * d21; + const float16_t t11 = 0.5f * d02 + d12 + 1.5f * d22; + const float16_t t12 = 0.5f * d03 + d13 + 1.5f * d23; + const float16_t t13 = 0.5f * d04 + d14 + 1.5f * d24; + const 
float16_t t14 = 0.5f * d05 + d15 + 1.5f * d25; + const float16_t t15 = 0.5f * d06 + d16 + 1.5f * d26; + const float16_t t16 = 0.5f * d07 + d17 + 1.5f * d27; + const float16_t t17 = 0.5f * d08 + d18 + 1.5f * d28; + + const float16_t t20 = 0.25f * d31 + d41 + 2.25f * d51 + src_data_70; + const float16_t t21 = 0.25f * d32 + d42 + 2.25f * d52 + src_data_71; + const float16_t t22 = 0.25f * d33 + d43 + 2.25f * d53 + src_data_72; + const float16_t t23 = 0.25f * d34 + d44 + 2.25f * d54 + src_data_73; + const float16_t t24 = 0.25f * d35 + d45 + 2.25f * d55 + src_data_74; + const float16_t t25 = 0.25f * d36 + d46 + 2.25f * d56 + src_data_75; + const float16_t t26 = 0.25f * d37 + d47 + 2.25f * d57 + src_data_76; + const float16_t t27 = 0.25f * d38 + d48 + 2.25f * d58 + src_data_77; + + float16_t s11 = t01 - t02; + float16_t s12 = t11 - t12; + float16_t s13 = t21 - t22; + + float16_t s21 = t03 - t04; + float16_t s22 = t13 - t14; + float16_t s23 = t23 - t24; + + float16_t s31 = t05 - t06; + float16_t s32 = t15 - t16; + float16_t s33 = t25 - t26; + + float16_t s41 = t01 + t02; + float16_t s42 = t11 + t12; + float16_t s43 = t21 + t22; + + float16_t s51 = t03 + t04; + float16_t s52 = t13 + t14; + float16_t s53 = t23 + t24; + + float16_t s61 = t05 + t06; + float16_t s62 = t15 + t16; + float16_t s63 = t25 + t26; + + float16_t m00 = t00 + t01 + t02 + t03 + t04 + t05 + t06; + const float16_t m01 = 0.5f * s11 + s21 + 1.5f * s31; + const float16_t m02 = 0.25f * s41 + s51 + 2.25f * s61 + t07; + + float16_t m10 = t10 + t11 + t12 + t13 + t14 + t15 + t16; + const float16_t m11 = 0.5f * s12 + s22 + 1.5f * s32; + const float16_t m12 = 0.25f * s42 + s52 + 2.25f * s62 + t17; + + float16_t m20 = t20 + t21 + t22 + t23 + t24 + t25 + t26; + const float16_t m21 = 0.5f * s13 + s23 + 1.5f * s33; + const float16_t m22 = 0.25f * s43 + s53 + 2.25f * s63 + t27; + + (dst_data + i)[0] = m00 + bias_data[i]; + (dst_data + i + C8NUM)[0] = m01 + bias_data[i]; + (dst_data + i + 2 * C8NUM)[0] = m02 + 
bias_data[i]; + + (dst_data + i + dst_step * C8NUM)[0] = m10 + bias_data[i]; + (dst_data + i + dst_step * C8NUM + C8NUM)[0] = m11 + bias_data[i]; + (dst_data + i + dst_step * C8NUM + 2 * C8NUM)[0] = m12 + bias_data[i]; + + (dst_data + i + 2 * dst_step * C8NUM)[0] = m20 + bias_data[i]; + (dst_data + i + 2 * dst_step * C8NUM + C8NUM)[0] = m21 + bias_data[i]; + (dst_data + i + 2 * dst_step * C8NUM + 2 * C8NUM)[0] = m22 + bias_data[i]; + } +#endif +} + +void OutputTransform8x4UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data, + int src_step, int dst_step) { +#ifdef ENABLE_ARM + float16x8_t src_data_00 = vld1q_f16(src_data + 0 * src_step); + float16x8_t src_data_01 = vld1q_f16(src_data + 1 * src_step); + float16x8_t src_data_02 = vld1q_f16(src_data + 2 * src_step); + float16x8_t src_data_03 = vld1q_f16(src_data + 3 * src_step); + float16x8_t src_data_04 = vld1q_f16(src_data + 4 * src_step); + float16x8_t src_data_05 = vld1q_f16(src_data + 5 * src_step); + float16x8_t src_data_06 = vld1q_f16(src_data + 6 * src_step); + float16x8_t src_data_07 = vld1q_f16(src_data + 7 * src_step); + float16x8_t src_data_10 = vld1q_f16(src_data + 8 * src_step); + float16x8_t src_data_11 = vld1q_f16(src_data + 9 * src_step); + float16x8_t src_data_12 = vld1q_f16(src_data + 10 * src_step); + float16x8_t src_data_13 = vld1q_f16(src_data + 11 * src_step); + float16x8_t src_data_14 = vld1q_f16(src_data + 12 * src_step); + float16x8_t src_data_15 = vld1q_f16(src_data + 13 * src_step); + float16x8_t src_data_16 = vld1q_f16(src_data + 14 * src_step); + float16x8_t src_data_17 = vld1q_f16(src_data + 15 * src_step); + float16x8_t src_data_20 = vld1q_f16(src_data + 16 * src_step); + float16x8_t src_data_21 = vld1q_f16(src_data + 17 * src_step); + float16x8_t src_data_22 = vld1q_f16(src_data + 18 * src_step); + float16x8_t src_data_23 = vld1q_f16(src_data + 19 * src_step); + float16x8_t src_data_24 = vld1q_f16(src_data + 20 * src_step); + float16x8_t src_data_25 = 
vld1q_f16(src_data + 21 * src_step); + float16x8_t src_data_26 = vld1q_f16(src_data + 22 * src_step); + float16x8_t src_data_27 = vld1q_f16(src_data + 23 * src_step); + float16x8_t src_data_30 = vld1q_f16(src_data + 24 * src_step); + float16x8_t src_data_31 = vld1q_f16(src_data + 25 * src_step); + float16x8_t src_data_32 = vld1q_f16(src_data + 26 * src_step); + float16x8_t src_data_33 = vld1q_f16(src_data + 27 * src_step); + float16x8_t src_data_34 = vld1q_f16(src_data + 28 * src_step); + float16x8_t src_data_35 = vld1q_f16(src_data + 29 * src_step); + float16x8_t src_data_36 = vld1q_f16(src_data + 30 * src_step); + float16x8_t src_data_37 = vld1q_f16(src_data + 31 * src_step); + float16x8_t src_data_40 = vld1q_f16(src_data + 32 * src_step); + float16x8_t src_data_41 = vld1q_f16(src_data + 33 * src_step); + float16x8_t src_data_42 = vld1q_f16(src_data + 34 * src_step); + float16x8_t src_data_43 = vld1q_f16(src_data + 35 * src_step); + float16x8_t src_data_44 = vld1q_f16(src_data + 36 * src_step); + float16x8_t src_data_45 = vld1q_f16(src_data + 37 * src_step); + float16x8_t src_data_46 = vld1q_f16(src_data + 38 * src_step); + float16x8_t src_data_47 = vld1q_f16(src_data + 39 * src_step); + float16x8_t src_data_50 = vld1q_f16(src_data + 40 * src_step); + float16x8_t src_data_51 = vld1q_f16(src_data + 41 * src_step); + float16x8_t src_data_52 = vld1q_f16(src_data + 42 * src_step); + float16x8_t src_data_53 = vld1q_f16(src_data + 43 * src_step); + float16x8_t src_data_54 = vld1q_f16(src_data + 44 * src_step); + float16x8_t src_data_55 = vld1q_f16(src_data + 45 * src_step); + float16x8_t src_data_56 = vld1q_f16(src_data + 46 * src_step); + float16x8_t src_data_57 = vld1q_f16(src_data + 47 * src_step); + float16x8_t src_data_60 = vld1q_f16(src_data + 48 * src_step); + float16x8_t src_data_61 = vld1q_f16(src_data + 49 * src_step); + float16x8_t src_data_62 = vld1q_f16(src_data + 50 * src_step); + float16x8_t src_data_63 = vld1q_f16(src_data + 51 * src_step); + 
float16x8_t src_data_64 = vld1q_f16(src_data + 52 * src_step); + float16x8_t src_data_65 = vld1q_f16(src_data + 53 * src_step); + float16x8_t src_data_66 = vld1q_f16(src_data + 54 * src_step); + float16x8_t src_data_67 = vld1q_f16(src_data + 55 * src_step); + float16x8_t src_data_70 = vld1q_f16(src_data + 56 * src_step); + float16x8_t src_data_71 = vld1q_f16(src_data + 57 * src_step); + float16x8_t src_data_72 = vld1q_f16(src_data + 58 * src_step); + float16x8_t src_data_73 = vld1q_f16(src_data + 59 * src_step); + float16x8_t src_data_74 = vld1q_f16(src_data + 60 * src_step); + float16x8_t src_data_75 = vld1q_f16(src_data + 61 * src_step); + float16x8_t src_data_76 = vld1q_f16(src_data + 62 * src_step); + float16x8_t src_data_77 = vld1q_f16(src_data + 63 * src_step); + + float16x8_t d01 = vsubq_f16(src_data_10, src_data_20); + float16x8_t d02 = vsubq_f16(src_data_11, src_data_21); + float16x8_t d03 = vsubq_f16(src_data_12, src_data_22); + float16x8_t d04 = vsubq_f16(src_data_13, src_data_23); + float16x8_t d05 = vsubq_f16(src_data_14, src_data_24); + float16x8_t d06 = vsubq_f16(src_data_15, src_data_25); + float16x8_t d07 = vsubq_f16(src_data_16, src_data_26); + float16x8_t d08 = vsubq_f16(src_data_17, src_data_27); + + float16x8_t d11 = vsubq_f16(src_data_30, src_data_40); + float16x8_t d12 = vsubq_f16(src_data_31, src_data_41); + float16x8_t d13 = vsubq_f16(src_data_32, src_data_42); + float16x8_t d14 = vsubq_f16(src_data_33, src_data_43); + float16x8_t d15 = vsubq_f16(src_data_34, src_data_44); + float16x8_t d16 = vsubq_f16(src_data_35, src_data_45); + float16x8_t d17 = vsubq_f16(src_data_36, src_data_46); + float16x8_t d18 = vsubq_f16(src_data_37, src_data_47); + + float16x8_t d21 = vsubq_f16(src_data_50, src_data_60); + float16x8_t d22 = vsubq_f16(src_data_51, src_data_61); + float16x8_t d23 = vsubq_f16(src_data_52, src_data_62); + float16x8_t d24 = vsubq_f16(src_data_53, src_data_63); + float16x8_t d25 = vsubq_f16(src_data_54, src_data_64); + float16x8_t d26 
= vsubq_f16(src_data_55, src_data_65); + float16x8_t d27 = vsubq_f16(src_data_56, src_data_66); + float16x8_t d28 = vsubq_f16(src_data_57, src_data_67); + + float16x8_t d31 = vaddq_f16(src_data_10, src_data_20); + float16x8_t d32 = vaddq_f16(src_data_11, src_data_21); + float16x8_t d33 = vaddq_f16(src_data_12, src_data_22); + float16x8_t d34 = vaddq_f16(src_data_13, src_data_23); + float16x8_t d35 = vaddq_f16(src_data_14, src_data_24); + float16x8_t d36 = vaddq_f16(src_data_15, src_data_25); + float16x8_t d37 = vaddq_f16(src_data_16, src_data_26); + float16x8_t d38 = vaddq_f16(src_data_17, src_data_27); + + float16x8_t d41 = vaddq_f16(src_data_30, src_data_40); + float16x8_t d42 = vaddq_f16(src_data_31, src_data_41); + float16x8_t d43 = vaddq_f16(src_data_32, src_data_42); + float16x8_t d44 = vaddq_f16(src_data_33, src_data_43); + float16x8_t d45 = vaddq_f16(src_data_34, src_data_44); + float16x8_t d46 = vaddq_f16(src_data_35, src_data_45); + float16x8_t d47 = vaddq_f16(src_data_36, src_data_46); + float16x8_t d48 = vaddq_f16(src_data_37, src_data_47); + + float16x8_t d51 = vaddq_f16(src_data_50, src_data_60); + float16x8_t d52 = vaddq_f16(src_data_51, src_data_61); + float16x8_t d53 = vaddq_f16(src_data_52, src_data_62); + float16x8_t d54 = vaddq_f16(src_data_53, src_data_63); + float16x8_t d55 = vaddq_f16(src_data_54, src_data_64); + float16x8_t d56 = vaddq_f16(src_data_55, src_data_65); + float16x8_t d57 = vaddq_f16(src_data_56, src_data_66); + float16x8_t d58 = vaddq_f16(src_data_57, src_data_67); + + float16x8_t t00 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_00, src_data_10), src_data_20), src_data_30), src_data_40), + src_data_50), + src_data_60); + float16x8_t t01 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_01, src_data_11), src_data_21), src_data_31), src_data_41), + src_data_51), + src_data_61); + float16x8_t t02 = vaddq_f16( + vaddq_f16( + 
vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_02, src_data_12), src_data_22), src_data_32), src_data_42), + src_data_52), + src_data_62); + float16x8_t t03 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_03, src_data_13), src_data_23), src_data_33), src_data_43), + src_data_53), + src_data_63); + float16x8_t t04 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_04, src_data_14), src_data_24), src_data_34), src_data_44), + src_data_54), + src_data_64); + float16x8_t t05 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_05, src_data_15), src_data_25), src_data_35), src_data_45), + src_data_55), + src_data_65); + float16x8_t t06 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_06, src_data_16), src_data_26), src_data_36), src_data_46), + src_data_56), + src_data_66); + float16x8_t t07 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_07, src_data_17), src_data_27), src_data_37), src_data_47), + src_data_57), + src_data_67); + + float16x8_t t10 = vaddq_f16(vaddq_f16(vmulq_n_f16(d01, 0.5), d11), vmulq_n_f16(d21, 1.5)); + float16x8_t t11 = vaddq_f16(vaddq_f16(vmulq_n_f16(d02, 0.5), d12), vmulq_n_f16(d22, 1.5)); + float16x8_t t12 = vaddq_f16(vaddq_f16(vmulq_n_f16(d03, 0.5), d13), vmulq_n_f16(d23, 1.5)); + float16x8_t t13 = vaddq_f16(vaddq_f16(vmulq_n_f16(d04, 0.5), d14), vmulq_n_f16(d24, 1.5)); + float16x8_t t14 = vaddq_f16(vaddq_f16(vmulq_n_f16(d05, 0.5), d15), vmulq_n_f16(d25, 1.5)); + float16x8_t t15 = vaddq_f16(vaddq_f16(vmulq_n_f16(d06, 0.5), d16), vmulq_n_f16(d26, 1.5)); + float16x8_t t16 = vaddq_f16(vaddq_f16(vmulq_n_f16(d07, 0.5), d17), vmulq_n_f16(d27, 1.5)); + float16x8_t t17 = vaddq_f16(vaddq_f16(vmulq_n_f16(d08, 0.5), d18), vmulq_n_f16(d28, 1.5)); + + float16x8_t t20 = vaddq_f16(vaddq_f16(vmulq_n_f16(d31, 0.25), d41), vmulq_n_f16(d51, 2.25)); + float16x8_t t21 = vaddq_f16(vaddq_f16(vmulq_n_f16(d32, 0.25), d42), 
vmulq_n_f16(d52, 2.25)); + float16x8_t t22 = vaddq_f16(vaddq_f16(vmulq_n_f16(d33, 0.25), d43), vmulq_n_f16(d53, 2.25)); + float16x8_t t23 = vaddq_f16(vaddq_f16(vmulq_n_f16(d34, 0.25), d44), vmulq_n_f16(d54, 2.25)); + float16x8_t t24 = vaddq_f16(vaddq_f16(vmulq_n_f16(d35, 0.25), d45), vmulq_n_f16(d55, 2.25)); + float16x8_t t25 = vaddq_f16(vaddq_f16(vmulq_n_f16(d36, 0.25), d46), vmulq_n_f16(d56, 2.25)); + float16x8_t t26 = vaddq_f16(vaddq_f16(vmulq_n_f16(d37, 0.25), d47), vmulq_n_f16(d57, 2.25)); + float16x8_t t27 = vaddq_f16(vaddq_f16(vmulq_n_f16(d38, 0.25), d48), vmulq_n_f16(d58, 2.25)); + + float16x8_t t30 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d01, 0.125), d11), vmulq_n_f16(d21, 3.375)), src_data_70); + float16x8_t t31 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d02, 0.125), d12), vmulq_n_f16(d22, 3.375)), src_data_71); + float16x8_t t32 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d03, 0.125), d13), vmulq_n_f16(d23, 3.375)), src_data_72); + float16x8_t t33 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d04, 0.125), d14), vmulq_n_f16(d24, 3.375)), src_data_73); + float16x8_t t34 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d05, 0.125), d15), vmulq_n_f16(d25, 3.375)), src_data_74); + float16x8_t t35 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d06, 0.125), d16), vmulq_n_f16(d26, 3.375)), src_data_75); + float16x8_t t36 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d07, 0.125), d17), vmulq_n_f16(d27, 3.375)), src_data_76); + float16x8_t t37 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d08, 0.125), d18), vmulq_n_f16(d28, 3.375)), src_data_77); + + float16x8_t s11 = vsubq_f16(t01, t02); + float16x8_t s12 = vsubq_f16(t11, t12); + float16x8_t s13 = vsubq_f16(t21, t22); + float16x8_t s14 = vsubq_f16(t31, t32); + + float16x8_t s21 = vsubq_f16(t03, t04); + float16x8_t s22 = vsubq_f16(t13, t14); + float16x8_t s23 = vsubq_f16(t23, t24); + float16x8_t s24 = vsubq_f16(t33, t34); + + float16x8_t s31 = vsubq_f16(t05, t06); + float16x8_t s32 = vsubq_f16(t15, t16); + float16x8_t s33 
= vsubq_f16(t25, t26); + float16x8_t s34 = vsubq_f16(t35, t36); + + float16x8_t s41 = vaddq_f16(t01, t02); + float16x8_t s42 = vaddq_f16(t11, t12); + float16x8_t s43 = vaddq_f16(t21, t22); + float16x8_t s44 = vaddq_f16(t31, t32); + + float16x8_t s51 = vaddq_f16(t03, t04); + float16x8_t s52 = vaddq_f16(t13, t14); + float16x8_t s53 = vaddq_f16(t23, t24); + float16x8_t s54 = vaddq_f16(t33, t34); + + float16x8_t s61 = vaddq_f16(t05, t06); + float16x8_t s62 = vaddq_f16(t15, t16); + float16x8_t s63 = vaddq_f16(t25, t26); + float16x8_t s64 = vaddq_f16(t35, t36); + + float16x8_t m00 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t00, t01), t02), t03), t04), t05), t06); + float16x8_t m01 = vaddq_f16(vaddq_f16(vmulq_n_f16(s11, 0.5), s21), vmulq_n_f16(s31, 1.5)); + float16x8_t m02 = vaddq_f16(vaddq_f16(vmulq_n_f16(s41, 0.25), s51), vmulq_n_f16(s61, 2.25)); + float16x8_t m03 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s11, 0.125), s21), vmulq_n_f16(s31, 3.375)), t07); + + float16x8_t m10 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t10, t11), t12), t13), t14), t15), t16); + float16x8_t m11 = vaddq_f16(vaddq_f16(vmulq_n_f16(s12, 0.5), s22), vmulq_n_f16(s32, 1.5)); + float16x8_t m12 = vaddq_f16(vaddq_f16(vmulq_n_f16(s42, 0.25), s52), vmulq_n_f16(s62, 2.25)); + float16x8_t m13 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s12, 0.125), s22), vmulq_n_f16(s32, 3.375)), t17); + + float16x8_t m20 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t20, t21), t22), t23), t24), t25), t26); + float16x8_t m21 = vaddq_f16(vaddq_f16(vmulq_n_f16(s13, 0.5), s23), vmulq_n_f16(s33, 1.5)); + float16x8_t m22 = vaddq_f16(vaddq_f16(vmulq_n_f16(s43, 0.25), s53), vmulq_n_f16(s63, 2.25)); + float16x8_t m23 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s13, 0.125), s23), vmulq_n_f16(s33, 3.375)), t27); + + float16x8_t m30 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t30, t31), t32), t33), t34), t35), t36); + float16x8_t m31 = 
vaddq_f16(vaddq_f16(vmulq_n_f16(s14, 0.5), s24), vmulq_n_f16(s34, 1.5)); + float16x8_t m32 = vaddq_f16(vaddq_f16(vmulq_n_f16(s44, 0.25), s54), vmulq_n_f16(s64, 2.25)); + float16x8_t m33 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s14, 0.125), s24), vmulq_n_f16(s34, 3.375)), t37); + + float16x8_t bias_ptr = vld1q_f16(bias_data); + vst1q_f16(dst_data, vaddq_f16(m00, bias_ptr)); + vst1q_f16(dst_data + C8NUM, vaddq_f16(m01, bias_ptr)); + vst1q_f16(dst_data + 2 * C8NUM, vaddq_f16(m02, bias_ptr)); + vst1q_f16(dst_data + 3 * C8NUM, vaddq_f16(m03, bias_ptr)); + + vst1q_f16(dst_data + dst_step * C8NUM, vaddq_f16(m10, bias_ptr)); + vst1q_f16(dst_data + dst_step * C8NUM + C8NUM, vaddq_f16(m11, bias_ptr)); + vst1q_f16(dst_data + dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m12, bias_ptr)); + vst1q_f16(dst_data + dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m13, bias_ptr)); + + vst1q_f16(dst_data + 2 * dst_step * C8NUM, vaddq_f16(m20, bias_ptr)); + vst1q_f16(dst_data + 2 * dst_step * C8NUM + C8NUM, vaddq_f16(m21, bias_ptr)); + vst1q_f16(dst_data + 2 * dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m22, bias_ptr)); + vst1q_f16(dst_data + 2 * dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m23, bias_ptr)); + + vst1q_f16(dst_data + 3 * dst_step * C8NUM, vaddq_f16(m30, bias_ptr)); + vst1q_f16(dst_data + 3 * dst_step * C8NUM + C8NUM, vaddq_f16(m31, bias_ptr)); + vst1q_f16(dst_data + 3 * dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m32, bias_ptr)); + vst1q_f16(dst_data + 3 * dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m33, bias_ptr)); +#else + for (int i = 0; i < C8NUM; i++) { + float16_t src_data_00 = src_data[i]; + float16_t src_data_01 = src_data[i + src_step]; + float16_t src_data_02 = src_data[i + 2 * src_step]; + float16_t src_data_03 = src_data[i + 3 * src_step]; + float16_t src_data_04 = src_data[i + 4 * src_step]; + float16_t src_data_05 = src_data[i + 5 * src_step]; + float16_t src_data_06 = src_data[i + 6 * src_step]; + float16_t src_data_07 = src_data[i + 7 * src_step]; + float16_t src_data_10 = 
src_data[i + 8 * src_step]; + float16_t src_data_11 = src_data[i + 9 * src_step]; + float16_t src_data_12 = src_data[i + 10 * src_step]; + float16_t src_data_13 = src_data[i + 11 * src_step]; + float16_t src_data_14 = src_data[i + 12 * src_step]; + float16_t src_data_15 = src_data[i + 13 * src_step]; + float16_t src_data_16 = src_data[i + 14 * src_step]; + float16_t src_data_17 = src_data[i + 15 * src_step]; + float16_t src_data_20 = src_data[i + 16 * src_step]; + float16_t src_data_21 = src_data[i + 17 * src_step]; + float16_t src_data_22 = src_data[i + 18 * src_step]; + float16_t src_data_23 = src_data[i + 19 * src_step]; + float16_t src_data_24 = src_data[i + 20 * src_step]; + float16_t src_data_25 = src_data[i + 21 * src_step]; + float16_t src_data_26 = src_data[i + 22 * src_step]; + float16_t src_data_27 = src_data[i + 23 * src_step]; + float16_t src_data_30 = src_data[i + 24 * src_step]; + float16_t src_data_31 = src_data[i + 25 * src_step]; + float16_t src_data_32 = src_data[i + 26 * src_step]; + float16_t src_data_33 = src_data[i + 27 * src_step]; + float16_t src_data_34 = src_data[i + 28 * src_step]; + float16_t src_data_35 = src_data[i + 29 * src_step]; + float16_t src_data_36 = src_data[i + 30 * src_step]; + float16_t src_data_37 = src_data[i + 31 * src_step]; + float16_t src_data_40 = src_data[i + 32 * src_step]; + float16_t src_data_41 = src_data[i + 33 * src_step]; + float16_t src_data_42 = src_data[i + 34 * src_step]; + float16_t src_data_43 = src_data[i + 35 * src_step]; + float16_t src_data_44 = src_data[i + 36 * src_step]; + float16_t src_data_45 = src_data[i + 37 * src_step]; + float16_t src_data_46 = src_data[i + 38 * src_step]; + float16_t src_data_47 = src_data[i + 39 * src_step]; + float16_t src_data_50 = src_data[i + 40 * src_step]; + float16_t src_data_51 = src_data[i + 41 * src_step]; + float16_t src_data_52 = src_data[i + 42 * src_step]; + float16_t src_data_53 = src_data[i + 43 * src_step]; + float16_t src_data_54 = src_data[i + 44 * 
src_step]; + float16_t src_data_55 = src_data[i + 45 * src_step]; + float16_t src_data_56 = src_data[i + 46 * src_step]; + float16_t src_data_57 = src_data[i + 47 * src_step]; + float16_t src_data_60 = src_data[i + 48 * src_step]; + float16_t src_data_61 = src_data[i + 49 * src_step]; + float16_t src_data_62 = src_data[i + 50 * src_step]; + float16_t src_data_63 = src_data[i + 51 * src_step]; + float16_t src_data_64 = src_data[i + 52 * src_step]; + float16_t src_data_65 = src_data[i + 53 * src_step]; + float16_t src_data_66 = src_data[i + 54 * src_step]; + float16_t src_data_67 = src_data[i + 55 * src_step]; + float16_t src_data_70 = src_data[i + 56 * src_step]; + float16_t src_data_71 = src_data[i + 57 * src_step]; + float16_t src_data_72 = src_data[i + 58 * src_step]; + float16_t src_data_73 = src_data[i + 59 * src_step]; + float16_t src_data_74 = src_data[i + 60 * src_step]; + float16_t src_data_75 = src_data[i + 61 * src_step]; + float16_t src_data_76 = src_data[i + 62 * src_step]; + float16_t src_data_77 = src_data[i + 63 * src_step]; + + float16_t d01 = src_data_10 - src_data_20; + float16_t d02 = src_data_11 - src_data_21; + float16_t d03 = src_data_12 - src_data_22; + float16_t d04 = src_data_13 - src_data_23; + float16_t d05 = src_data_14 - src_data_24; + float16_t d06 = src_data_15 - src_data_25; + float16_t d07 = src_data_16 - src_data_26; + float16_t d08 = src_data_17 - src_data_27; + + float16_t d11 = src_data_30 - src_data_40; + float16_t d12 = src_data_31 - src_data_41; + float16_t d13 = src_data_32 - src_data_42; + float16_t d14 = src_data_33 - src_data_43; + float16_t d15 = src_data_34 - src_data_44; + float16_t d16 = src_data_35 - src_data_45; + float16_t d17 = src_data_36 - src_data_46; + float16_t d18 = src_data_37 - src_data_47; + + float16_t d21 = src_data_50 - src_data_60; + float16_t d22 = src_data_51 - src_data_61; + float16_t d23 = src_data_52 - src_data_62; + float16_t d24 = src_data_53 - src_data_63; + float16_t d25 = src_data_54 - 
src_data_64; + float16_t d26 = src_data_55 - src_data_65; + float16_t d27 = src_data_56 - src_data_66; + float16_t d28 = src_data_57 - src_data_67; + + float16_t d31 = src_data_10 + src_data_20; + float16_t d32 = src_data_11 + src_data_21; + float16_t d33 = src_data_12 + src_data_22; + float16_t d34 = src_data_13 + src_data_23; + float16_t d35 = src_data_14 + src_data_24; + float16_t d36 = src_data_15 + src_data_25; + float16_t d37 = src_data_16 + src_data_26; + float16_t d38 = src_data_17 + src_data_27; + + float16_t d41 = src_data_30 + src_data_40; + float16_t d42 = src_data_31 + src_data_41; + float16_t d43 = src_data_32 + src_data_42; + float16_t d44 = src_data_33 + src_data_43; + float16_t d45 = src_data_34 + src_data_44; + float16_t d46 = src_data_35 + src_data_45; + float16_t d47 = src_data_36 + src_data_46; + float16_t d48 = src_data_37 + src_data_47; + + float16_t d51 = src_data_50 + src_data_60; + float16_t d52 = src_data_51 + src_data_61; + float16_t d53 = src_data_52 + src_data_62; + float16_t d54 = src_data_53 + src_data_63; + float16_t d55 = src_data_54 + src_data_64; + float16_t d56 = src_data_55 + src_data_65; + float16_t d57 = src_data_56 + src_data_66; + float16_t d58 = src_data_57 + src_data_67; + + float16_t t00 = src_data_00 + src_data_10 + src_data_20 + src_data_30 + src_data_40 + src_data_50 + src_data_60; + float16_t t01 = src_data_01 + src_data_11 + src_data_21 + src_data_31 + src_data_41 + src_data_51 + src_data_61; + float16_t t02 = src_data_02 + src_data_12 + src_data_22 + src_data_32 + src_data_42 + src_data_52 + src_data_62; + float16_t t03 = src_data_03 + src_data_13 + src_data_23 + src_data_33 + src_data_43 + src_data_53 + src_data_63; + float16_t t04 = src_data_04 + src_data_14 + src_data_24 + src_data_34 + src_data_44 + src_data_54 + src_data_64; + float16_t t05 = src_data_05 + src_data_15 + src_data_25 + src_data_35 + src_data_45 + src_data_55 + src_data_65; + float16_t t06 = src_data_06 + src_data_16 + src_data_26 + src_data_36 + 
src_data_46 + src_data_56 + src_data_66; + float16_t t07 = src_data_07 + src_data_17 + src_data_27 + src_data_37 + src_data_47 + src_data_57 + src_data_67; + + const float16_t t10 = 0.5f * d01 + d11 + 1.5f * d21; + const float16_t t11 = 0.5f * d02 + d12 + 1.5f * d22; + const float16_t t12 = 0.5f * d03 + d13 + 1.5f * d23; + const float16_t t13 = 0.5f * d04 + d14 + 1.5f * d24; + const float16_t t14 = 0.5f * d05 + d15 + 1.5f * d25; + const float16_t t15 = 0.5f * d06 + d16 + 1.5f * d26; + const float16_t t16 = 0.5f * d07 + d17 + 1.5f * d27; + const float16_t t17 = 0.5f * d08 + d18 + 1.5f * d28; + + const float16_t t20 = 0.25f * d31 + d41 + 2.25f * d51; + const float16_t t21 = 0.25f * d32 + d42 + 2.25f * d52; + const float16_t t22 = 0.25f * d33 + d43 + 2.25f * d53; + const float16_t t23 = 0.25f * d34 + d44 + 2.25f * d54; + const float16_t t24 = 0.25f * d35 + d45 + 2.25f * d55; + const float16_t t25 = 0.25f * d36 + d46 + 2.25f * d56; + const float16_t t26 = 0.25f * d37 + d47 + 2.25f * d57; + const const float16_t t27 = 0.25f * d38 + d48 + 2.25f * d58; + + const float16_t t30 = 0.125f * d01 + d11 + 3.375f * d21 + src_data_70; + const float16_t t31 = 0.125f * d02 + d12 + 3.375f * d22 + src_data_71; + const float16_t t32 = 0.125f * d03 + d13 + 3.375f * d23 + src_data_72; + const float16_t t33 = 0.125f * d04 + d14 + 3.375f * d24 + src_data_73; + const float16_t t34 = 0.125f * d05 + d15 + 3.375f * d25 + src_data_74; + const float16_t t35 = 0.125f * d06 + d16 + 3.375f * d26 + src_data_75; + const float16_t t36 = 0.125f * d07 + d17 + 3.375f * d27 + src_data_76; + const float16_t t37 = 0.125f * d08 + d18 + 3.375f * d28 + src_data_77; + + float16_t s11 = t01 - t02; + float16_t s12 = t11 - t12; + float16_t s13 = t21 - t22; + float16_t s14 = t31 - t32; + + float16_t s21 = t03 - t04; + float16_t s22 = t13 - t14; + float16_t s23 = t23 - t24; + float16_t s24 = t33 - t34; + + float16_t s31 = t05 - t06; + float16_t s32 = t15 - t16; + float16_t s33 = t25 - t26; + float16_t s34 = t35 - 
t36; + + float16_t s41 = t01 + t02; + float16_t s42 = t11 + t12; + float16_t s43 = t21 + t22; + float16_t s44 = t31 + t32; + + float16_t s51 = t03 + t04; + float16_t s52 = t13 + t14; + float16_t s53 = t23 + t24; + float16_t s54 = t33 + t34; + + float16_t s61 = t05 + t06; + float16_t s62 = t15 + t16; + float16_t s63 = t25 + t26; + float16_t s64 = t35 + t36; + + float16_t m00 = t00 + t01 + t02 + t03 + t04 + t05 + t06; + const float16_t m01 = 0.5f * s11 + s21 + 1.5f * s31; + const float16_t m02 = 0.25f * s41 + s51 + 2.25f * s61; + const float16_t m03 = 0.125f * s11 + s21 + 3.375f * s31 + t07; + + float16_t m10 = t10 + t11 + t12 + t13 + t14 + t15 + t16; + const float16_t m11 = 0.5f * s12 + s22 + 1.5f * s32; + const float16_t m12 = 0.25f * s42 + s52 + 2.25f * s62; + const float16_t m13 = 0.125f * s12 + s22 + 3.375f * s32 + t17; + + float16_t m20 = t20 + t21 + t22 + t23 + t24 + t25 + t26; + const float16_t m21 = 0.5f * s13 + s23 + 1.5f * s33; + const float16_t m22 = 0.25f * s43 + s53 + 2.25f * s63; + const float16_t m23 = 0.125f * s13 + s23 + 3.375f * s33 + t27; + + float16_t m30 = t30 + t31 + t32 + t33 + t34 + t35 + t36; + const float16_t m31 = 0.5f * s14 + s24 + 1.5f * s34; + const float16_t m32 = 0.25f * s44 + s54 + 2.25f * s64; + const float16_t m33 = 0.125f * s14 + s24 + 3.375f * s34 + t37; + + (dst_data + i)[0] = m00 + bias_data[i]; + (dst_data + i + C8NUM)[0] = m01 + bias_data[i]; + (dst_data + i + 2 * C8NUM)[0] = m02 + bias_data[i]; + (dst_data + i + 3 * C8NUM)[0] = m03 + bias_data[i]; + + (dst_data + i + dst_step * C8NUM)[0] = m10 + bias_data[i]; + (dst_data + i + dst_step * C8NUM + C8NUM)[0] = m11 + bias_data[i]; + (dst_data + i + dst_step * C8NUM + 2 * C8NUM)[0] = m12 + bias_data[i]; + (dst_data + i + dst_step * C8NUM + 3 * C8NUM)[0] = m13 + bias_data[i]; + + (dst_data + i + 2 * dst_step * C8NUM)[0] = m20 + bias_data[i]; + (dst_data + i + 2 * dst_step * C8NUM + C8NUM)[0] = m21 + bias_data[i]; + (dst_data + i + 2 * dst_step * C8NUM + 2 * C8NUM)[0] = m22 + 
bias_data[i]; + (dst_data + i + 2 * dst_step * C8NUM + 3 * C8NUM)[0] = m23 + bias_data[i]; + + (dst_data + i + 3 * dst_step * C8NUM)[0] = m30 + bias_data[i]; + (dst_data + i + 3 * dst_step * C8NUM + C8NUM)[0] = m31 + bias_data[i]; + (dst_data + i + 3 * dst_step * C8NUM + 2 * C8NUM)[0] = m32 + bias_data[i]; + (dst_data + i + 3 * dst_step * C8NUM + 3 * C8NUM)[0] = m33 + bias_data[i]; + } +#endif +} + +void OutputTransform8x5UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data, + int src_step, int dst_step) { +#ifdef ENABLE_ARM + float16x8_t src_data_00 = vld1q_f16(src_data + 0 * src_step); + float16x8_t src_data_01 = vld1q_f16(src_data + 1 * src_step); + float16x8_t src_data_02 = vld1q_f16(src_data + 2 * src_step); + float16x8_t src_data_03 = vld1q_f16(src_data + 3 * src_step); + float16x8_t src_data_04 = vld1q_f16(src_data + 4 * src_step); + float16x8_t src_data_05 = vld1q_f16(src_data + 5 * src_step); + float16x8_t src_data_06 = vld1q_f16(src_data + 6 * src_step); + float16x8_t src_data_07 = vld1q_f16(src_data + 7 * src_step); + float16x8_t src_data_10 = vld1q_f16(src_data + 8 * src_step); + float16x8_t src_data_11 = vld1q_f16(src_data + 9 * src_step); + float16x8_t src_data_12 = vld1q_f16(src_data + 10 * src_step); + float16x8_t src_data_13 = vld1q_f16(src_data + 11 * src_step); + float16x8_t src_data_14 = vld1q_f16(src_data + 12 * src_step); + float16x8_t src_data_15 = vld1q_f16(src_data + 13 * src_step); + float16x8_t src_data_16 = vld1q_f16(src_data + 14 * src_step); + float16x8_t src_data_17 = vld1q_f16(src_data + 15 * src_step); + float16x8_t src_data_20 = vld1q_f16(src_data + 16 * src_step); + float16x8_t src_data_21 = vld1q_f16(src_data + 17 * src_step); + float16x8_t src_data_22 = vld1q_f16(src_data + 18 * src_step); + float16x8_t src_data_23 = vld1q_f16(src_data + 19 * src_step); + float16x8_t src_data_24 = vld1q_f16(src_data + 20 * src_step); + float16x8_t src_data_25 = vld1q_f16(src_data + 21 * src_step); + float16x8_t 
src_data_26 = vld1q_f16(src_data + 22 * src_step); + float16x8_t src_data_27 = vld1q_f16(src_data + 23 * src_step); + float16x8_t src_data_30 = vld1q_f16(src_data + 24 * src_step); + float16x8_t src_data_31 = vld1q_f16(src_data + 25 * src_step); + float16x8_t src_data_32 = vld1q_f16(src_data + 26 * src_step); + float16x8_t src_data_33 = vld1q_f16(src_data + 27 * src_step); + float16x8_t src_data_34 = vld1q_f16(src_data + 28 * src_step); + float16x8_t src_data_35 = vld1q_f16(src_data + 29 * src_step); + float16x8_t src_data_36 = vld1q_f16(src_data + 30 * src_step); + float16x8_t src_data_37 = vld1q_f16(src_data + 31 * src_step); + float16x8_t src_data_40 = vld1q_f16(src_data + 32 * src_step); + float16x8_t src_data_41 = vld1q_f16(src_data + 33 * src_step); + float16x8_t src_data_42 = vld1q_f16(src_data + 34 * src_step); + float16x8_t src_data_43 = vld1q_f16(src_data + 35 * src_step); + float16x8_t src_data_44 = vld1q_f16(src_data + 36 * src_step); + float16x8_t src_data_45 = vld1q_f16(src_data + 37 * src_step); + float16x8_t src_data_46 = vld1q_f16(src_data + 38 * src_step); + float16x8_t src_data_47 = vld1q_f16(src_data + 39 * src_step); + float16x8_t src_data_50 = vld1q_f16(src_data + 40 * src_step); + float16x8_t src_data_51 = vld1q_f16(src_data + 41 * src_step); + float16x8_t src_data_52 = vld1q_f16(src_data + 42 * src_step); + float16x8_t src_data_53 = vld1q_f16(src_data + 43 * src_step); + float16x8_t src_data_54 = vld1q_f16(src_data + 44 * src_step); + float16x8_t src_data_55 = vld1q_f16(src_data + 45 * src_step); + float16x8_t src_data_56 = vld1q_f16(src_data + 46 * src_step); + float16x8_t src_data_57 = vld1q_f16(src_data + 47 * src_step); + float16x8_t src_data_60 = vld1q_f16(src_data + 48 * src_step); + float16x8_t src_data_61 = vld1q_f16(src_data + 49 * src_step); + float16x8_t src_data_62 = vld1q_f16(src_data + 50 * src_step); + float16x8_t src_data_63 = vld1q_f16(src_data + 51 * src_step); + float16x8_t src_data_64 = vld1q_f16(src_data + 52 * 
src_step); + float16x8_t src_data_65 = vld1q_f16(src_data + 53 * src_step); + float16x8_t src_data_66 = vld1q_f16(src_data + 54 * src_step); + float16x8_t src_data_67 = vld1q_f16(src_data + 55 * src_step); + float16x8_t src_data_70 = vld1q_f16(src_data + 56 * src_step); + float16x8_t src_data_71 = vld1q_f16(src_data + 57 * src_step); + float16x8_t src_data_72 = vld1q_f16(src_data + 58 * src_step); + float16x8_t src_data_73 = vld1q_f16(src_data + 59 * src_step); + float16x8_t src_data_74 = vld1q_f16(src_data + 60 * src_step); + float16x8_t src_data_75 = vld1q_f16(src_data + 61 * src_step); + float16x8_t src_data_76 = vld1q_f16(src_data + 62 * src_step); + float16x8_t src_data_77 = vld1q_f16(src_data + 63 * src_step); + + float16x8_t d01 = vsubq_f16(src_data_10, src_data_20); + float16x8_t d02 = vsubq_f16(src_data_11, src_data_21); + float16x8_t d03 = vsubq_f16(src_data_12, src_data_22); + float16x8_t d04 = vsubq_f16(src_data_13, src_data_23); + float16x8_t d05 = vsubq_f16(src_data_14, src_data_24); + float16x8_t d06 = vsubq_f16(src_data_15, src_data_25); + float16x8_t d07 = vsubq_f16(src_data_16, src_data_26); + float16x8_t d08 = vsubq_f16(src_data_17, src_data_27); + + float16x8_t d11 = vsubq_f16(src_data_30, src_data_40); + float16x8_t d12 = vsubq_f16(src_data_31, src_data_41); + float16x8_t d13 = vsubq_f16(src_data_32, src_data_42); + float16x8_t d14 = vsubq_f16(src_data_33, src_data_43); + float16x8_t d15 = vsubq_f16(src_data_34, src_data_44); + float16x8_t d16 = vsubq_f16(src_data_35, src_data_45); + float16x8_t d17 = vsubq_f16(src_data_36, src_data_46); + float16x8_t d18 = vsubq_f16(src_data_37, src_data_47); + + float16x8_t d21 = vsubq_f16(src_data_50, src_data_60); + float16x8_t d22 = vsubq_f16(src_data_51, src_data_61); + float16x8_t d23 = vsubq_f16(src_data_52, src_data_62); + float16x8_t d24 = vsubq_f16(src_data_53, src_data_63); + float16x8_t d25 = vsubq_f16(src_data_54, src_data_64); + float16x8_t d26 = vsubq_f16(src_data_55, src_data_65); + float16x8_t 
d27 = vsubq_f16(src_data_56, src_data_66); + float16x8_t d28 = vsubq_f16(src_data_57, src_data_67); + + float16x8_t d31 = vaddq_f16(src_data_10, src_data_20); + float16x8_t d32 = vaddq_f16(src_data_11, src_data_21); + float16x8_t d33 = vaddq_f16(src_data_12, src_data_22); + float16x8_t d34 = vaddq_f16(src_data_13, src_data_23); + float16x8_t d35 = vaddq_f16(src_data_14, src_data_24); + float16x8_t d36 = vaddq_f16(src_data_15, src_data_25); + float16x8_t d37 = vaddq_f16(src_data_16, src_data_26); + float16x8_t d38 = vaddq_f16(src_data_17, src_data_27); + + float16x8_t d41 = vaddq_f16(src_data_30, src_data_40); + float16x8_t d42 = vaddq_f16(src_data_31, src_data_41); + float16x8_t d43 = vaddq_f16(src_data_32, src_data_42); + float16x8_t d44 = vaddq_f16(src_data_33, src_data_43); + float16x8_t d45 = vaddq_f16(src_data_34, src_data_44); + float16x8_t d46 = vaddq_f16(src_data_35, src_data_45); + float16x8_t d47 = vaddq_f16(src_data_36, src_data_46); + float16x8_t d48 = vaddq_f16(src_data_37, src_data_47); + + float16x8_t d51 = vaddq_f16(src_data_50, src_data_60); + float16x8_t d52 = vaddq_f16(src_data_51, src_data_61); + float16x8_t d53 = vaddq_f16(src_data_52, src_data_62); + float16x8_t d54 = vaddq_f16(src_data_53, src_data_63); + float16x8_t d55 = vaddq_f16(src_data_54, src_data_64); + float16x8_t d56 = vaddq_f16(src_data_55, src_data_65); + float16x8_t d57 = vaddq_f16(src_data_56, src_data_66); + float16x8_t d58 = vaddq_f16(src_data_57, src_data_67); + + float16x8_t t00 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_00, src_data_10), src_data_20), src_data_30), src_data_40), + src_data_50), + src_data_60); + float16x8_t t01 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_01, src_data_11), src_data_21), src_data_31), src_data_41), + src_data_51), + src_data_61); + float16x8_t t02 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_02, src_data_12), src_data_22), src_data_32), 
src_data_42), + src_data_52), + src_data_62); + float16x8_t t03 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_03, src_data_13), src_data_23), src_data_33), src_data_43), + src_data_53), + src_data_63); + float16x8_t t04 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_04, src_data_14), src_data_24), src_data_34), src_data_44), + src_data_54), + src_data_64); + float16x8_t t05 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_05, src_data_15), src_data_25), src_data_35), src_data_45), + src_data_55), + src_data_65); + float16x8_t t06 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_06, src_data_16), src_data_26), src_data_36), src_data_46), + src_data_56), + src_data_66); + float16x8_t t07 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_07, src_data_17), src_data_27), src_data_37), src_data_47), + src_data_57), + src_data_67); + + float16x8_t t10 = vaddq_f16(vaddq_f16(vmulq_n_f16(d01, 0.5), d11), vmulq_n_f16(d21, 1.5)); + float16x8_t t11 = vaddq_f16(vaddq_f16(vmulq_n_f16(d02, 0.5), d12), vmulq_n_f16(d22, 1.5)); + float16x8_t t12 = vaddq_f16(vaddq_f16(vmulq_n_f16(d03, 0.5), d13), vmulq_n_f16(d23, 1.5)); + float16x8_t t13 = vaddq_f16(vaddq_f16(vmulq_n_f16(d04, 0.5), d14), vmulq_n_f16(d24, 1.5)); + float16x8_t t14 = vaddq_f16(vaddq_f16(vmulq_n_f16(d05, 0.5), d15), vmulq_n_f16(d25, 1.5)); + float16x8_t t15 = vaddq_f16(vaddq_f16(vmulq_n_f16(d06, 0.5), d16), vmulq_n_f16(d26, 1.5)); + float16x8_t t16 = vaddq_f16(vaddq_f16(vmulq_n_f16(d07, 0.5), d17), vmulq_n_f16(d27, 1.5)); + float16x8_t t17 = vaddq_f16(vaddq_f16(vmulq_n_f16(d08, 0.5), d18), vmulq_n_f16(d28, 1.5)); + + float16x8_t t20 = vaddq_f16(vaddq_f16(vmulq_n_f16(d31, 0.25), d41), vmulq_n_f16(d51, 2.25)); + float16x8_t t21 = vaddq_f16(vaddq_f16(vmulq_n_f16(d32, 0.25), d42), vmulq_n_f16(d52, 2.25)); + float16x8_t t22 = vaddq_f16(vaddq_f16(vmulq_n_f16(d33, 0.25), d43), 
vmulq_n_f16(d53, 2.25)); + float16x8_t t23 = vaddq_f16(vaddq_f16(vmulq_n_f16(d34, 0.25), d44), vmulq_n_f16(d54, 2.25)); + float16x8_t t24 = vaddq_f16(vaddq_f16(vmulq_n_f16(d35, 0.25), d45), vmulq_n_f16(d55, 2.25)); + float16x8_t t25 = vaddq_f16(vaddq_f16(vmulq_n_f16(d36, 0.25), d46), vmulq_n_f16(d56, 2.25)); + float16x8_t t26 = vaddq_f16(vaddq_f16(vmulq_n_f16(d37, 0.25), d47), vmulq_n_f16(d57, 2.25)); + float16x8_t t27 = vaddq_f16(vaddq_f16(vmulq_n_f16(d38, 0.25), d48), vmulq_n_f16(d58, 2.25)); + + float16x8_t t30 = vaddq_f16(vaddq_f16(vmulq_n_f16(d01, 0.125), d11), vmulq_n_f16(d21, 3.375)); + float16x8_t t31 = vaddq_f16(vaddq_f16(vmulq_n_f16(d02, 0.125), d12), vmulq_n_f16(d22, 3.375)); + float16x8_t t32 = vaddq_f16(vaddq_f16(vmulq_n_f16(d03, 0.125), d13), vmulq_n_f16(d23, 3.375)); + float16x8_t t33 = vaddq_f16(vaddq_f16(vmulq_n_f16(d04, 0.125), d14), vmulq_n_f16(d24, 3.375)); + float16x8_t t34 = vaddq_f16(vaddq_f16(vmulq_n_f16(d05, 0.125), d15), vmulq_n_f16(d25, 3.375)); + float16x8_t t35 = vaddq_f16(vaddq_f16(vmulq_n_f16(d06, 0.125), d16), vmulq_n_f16(d26, 3.375)); + float16x8_t t36 = vaddq_f16(vaddq_f16(vmulq_n_f16(d07, 0.125), d17), vmulq_n_f16(d27, 3.375)); + float16x8_t t37 = vaddq_f16(vaddq_f16(vmulq_n_f16(d08, 0.125), d18), vmulq_n_f16(d28, 3.375)); + + float16x8_t t40 = + vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d31, 0.0625), d41), vmulq_n_f16(d51, 5.0625)), src_data_70); + float16x8_t t41 = + vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d32, 0.0625), d42), vmulq_n_f16(d52, 5.0625)), src_data_71); + float16x8_t t42 = + vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d33, 0.0625), d43), vmulq_n_f16(d53, 5.0625)), src_data_72); + float16x8_t t43 = + vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d34, 0.0625), d44), vmulq_n_f16(d54, 5.0625)), src_data_73); + float16x8_t t44 = + vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d35, 0.0625), d45), vmulq_n_f16(d55, 5.0625)), src_data_74); + float16x8_t t45 = + vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d36, 0.0625), d46), 
vmulq_n_f16(d56, 5.0625)), src_data_75); + float16x8_t t46 = + vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d37, 0.0625), d47), vmulq_n_f16(d57, 5.0625)), src_data_76); + float16x8_t t47 = + vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d38, 0.0625), d48), vmulq_n_f16(d58, 5.0625)), src_data_77); + + float16x8_t s11 = vsubq_f16(t01, t02); + float16x8_t s12 = vsubq_f16(t11, t12); + float16x8_t s13 = vsubq_f16(t21, t22); + float16x8_t s14 = vsubq_f16(t31, t32); + float16x8_t s15 = vsubq_f16(t41, t42); + + float16x8_t s21 = vsubq_f16(t03, t04); + float16x8_t s22 = vsubq_f16(t13, t14); + float16x8_t s23 = vsubq_f16(t23, t24); + float16x8_t s24 = vsubq_f16(t33, t34); + float16x8_t s25 = vsubq_f16(t43, t44); + + float16x8_t s31 = vsubq_f16(t05, t06); + float16x8_t s32 = vsubq_f16(t15, t16); + float16x8_t s33 = vsubq_f16(t25, t26); + float16x8_t s34 = vsubq_f16(t35, t36); + float16x8_t s35 = vsubq_f16(t45, t46); + + float16x8_t s41 = vaddq_f16(t01, t02); + float16x8_t s42 = vaddq_f16(t11, t12); + float16x8_t s43 = vaddq_f16(t21, t22); + float16x8_t s44 = vaddq_f16(t31, t32); + float16x8_t s45 = vaddq_f16(t41, t42); + + float16x8_t s51 = vaddq_f16(t03, t04); + float16x8_t s52 = vaddq_f16(t13, t14); + float16x8_t s53 = vaddq_f16(t23, t24); + float16x8_t s54 = vaddq_f16(t33, t34); + float16x8_t s55 = vaddq_f16(t43, t44); + + float16x8_t s61 = vaddq_f16(t05, t06); + float16x8_t s62 = vaddq_f16(t15, t16); + float16x8_t s63 = vaddq_f16(t25, t26); + float16x8_t s64 = vaddq_f16(t35, t36); + float16x8_t s65 = vaddq_f16(t45, t46); + + float16x8_t m00 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t00, t01), t02), t03), t04), t05), t06); + float16x8_t m01 = vaddq_f16(vaddq_f16(vmulq_n_f16(s11, 0.5), s21), vmulq_n_f16(s31, 1.5)); + float16x8_t m02 = vaddq_f16(vaddq_f16(vmulq_n_f16(s41, 0.25), s51), vmulq_n_f16(s61, 2.25)); + float16x8_t m03 = vaddq_f16(vaddq_f16(vmulq_n_f16(s11, 0.125), s21), vmulq_n_f16(s31, 3.375)); + float16x8_t m04 = 
vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s41, 0.0625), s51), vmulq_n_f16(s61, 5.0625)), t07); + + float16x8_t m10 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t10, t11), t12), t13), t14), t15), t16); + float16x8_t m11 = vaddq_f16(vaddq_f16(vmulq_n_f16(s12, 0.5), s22), vmulq_n_f16(s32, 1.5)); + float16x8_t m12 = vaddq_f16(vaddq_f16(vmulq_n_f16(s42, 0.25), s52), vmulq_n_f16(s62, 2.25)); + float16x8_t m13 = vaddq_f16(vaddq_f16(vmulq_n_f16(s12, 0.125), s22), vmulq_n_f16(s32, 3.375)); + float16x8_t m14 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s42, 0.0625), s52), vmulq_n_f16(s62, 5.0625)), t17); + + float16x8_t m20 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t20, t21), t22), t23), t24), t25), t26); + float16x8_t m21 = vaddq_f16(vaddq_f16(vmulq_n_f16(s13, 0.5), s23), vmulq_n_f16(s33, 1.5)); + float16x8_t m22 = vaddq_f16(vaddq_f16(vmulq_n_f16(s43, 0.25), s53), vmulq_n_f16(s63, 2.25)); + float16x8_t m23 = vaddq_f16(vaddq_f16(vmulq_n_f16(s13, 0.125), s23), vmulq_n_f16(s33, 3.375)); + float16x8_t m24 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s43, 0.0625), s53), vmulq_n_f16(s63, 5.0625)), t27); + + float16x8_t m30 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t30, t31), t32), t33), t34), t35), t36); + float16x8_t m31 = vaddq_f16(vaddq_f16(vmulq_n_f16(s14, 0.5), s24), vmulq_n_f16(s34, 1.5)); + float16x8_t m32 = vaddq_f16(vaddq_f16(vmulq_n_f16(s44, 0.25), s54), vmulq_n_f16(s64, 2.25)); + float16x8_t m33 = vaddq_f16(vaddq_f16(vmulq_n_f16(s14, 0.125), s24), vmulq_n_f16(s34, 3.375)); + float16x8_t m34 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s44, 0.0625), s54), vmulq_n_f16(s64, 5.0625)), t37); + + float16x8_t m40 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t40, t41), t42), t43), t44), t45), t46); + float16x8_t m41 = vaddq_f16(vaddq_f16(vmulq_n_f16(s15, 0.5), s25), vmulq_n_f16(s35, 1.5)); + float16x8_t m42 = vaddq_f16(vaddq_f16(vmulq_n_f16(s45, 0.25), s55), vmulq_n_f16(s65, 2.25)); + float16x8_t m43 = 
vaddq_f16(vaddq_f16(vmulq_n_f16(s15, 0.125), s25), vmulq_n_f16(s35, 3.375)); + float16x8_t m44 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s45, 0.0625), s55), vmulq_n_f16(s65, 5.0625)), t47); + + float16x8_t bias_ptr = vld1q_f16(bias_data); + vst1q_f16(dst_data, vaddq_f16(m00, bias_ptr)); + vst1q_f16(dst_data + C8NUM, vaddq_f16(m01, bias_ptr)); + vst1q_f16(dst_data + 2 * C8NUM, vaddq_f16(m02, bias_ptr)); + vst1q_f16(dst_data + 3 * C8NUM, vaddq_f16(m03, bias_ptr)); + vst1q_f16(dst_data + 4 * C8NUM, vaddq_f16(m04, bias_ptr)); + + vst1q_f16(dst_data + dst_step * C8NUM, vaddq_f16(m10, bias_ptr)); + vst1q_f16(dst_data + dst_step * C8NUM + C8NUM, vaddq_f16(m11, bias_ptr)); + vst1q_f16(dst_data + dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m12, bias_ptr)); + vst1q_f16(dst_data + dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m13, bias_ptr)); + vst1q_f16(dst_data + dst_step * C8NUM + 4 * C8NUM, vaddq_f16(m14, bias_ptr)); + + vst1q_f16(dst_data + 2 * dst_step * C8NUM, vaddq_f16(m20, bias_ptr)); + vst1q_f16(dst_data + 2 * dst_step * C8NUM + C8NUM, vaddq_f16(m21, bias_ptr)); + vst1q_f16(dst_data + 2 * dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m22, bias_ptr)); + vst1q_f16(dst_data + 2 * dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m23, bias_ptr)); + vst1q_f16(dst_data + 2 * dst_step * C8NUM + 4 * C8NUM, vaddq_f16(m24, bias_ptr)); + + vst1q_f16(dst_data + 3 * dst_step * C8NUM, vaddq_f16(m30, bias_ptr)); + vst1q_f16(dst_data + 3 * dst_step * C8NUM + C8NUM, vaddq_f16(m31, bias_ptr)); + vst1q_f16(dst_data + 3 * dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m32, bias_ptr)); + vst1q_f16(dst_data + 3 * dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m33, bias_ptr)); + vst1q_f16(dst_data + 3 * dst_step * C8NUM + 4 * C8NUM, vaddq_f16(m34, bias_ptr)); + + vst1q_f16(dst_data + 4 * dst_step * C8NUM, vaddq_f16(m40, bias_ptr)); + vst1q_f16(dst_data + 4 * dst_step * C8NUM + C8NUM, vaddq_f16(m41, bias_ptr)); + vst1q_f16(dst_data + 4 * dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m42, bias_ptr)); + vst1q_f16(dst_data + 4 * 
dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m43, bias_ptr)); + vst1q_f16(dst_data + 4 * dst_step * C8NUM + 4 * C8NUM, vaddq_f16(m44, bias_ptr)); +#else + for (int i = 0; i < C8NUM; i++) { + float16_t src_data_00 = src_data[i]; + float16_t src_data_01 = src_data[i + src_step]; + float16_t src_data_02 = src_data[i + 2 * src_step]; + float16_t src_data_03 = src_data[i + 3 * src_step]; + float16_t src_data_04 = src_data[i + 4 * src_step]; + float16_t src_data_05 = src_data[i + 5 * src_step]; + float16_t src_data_06 = src_data[i + 6 * src_step]; + float16_t src_data_07 = src_data[i + 7 * src_step]; + float16_t src_data_10 = src_data[i + 8 * src_step]; + float16_t src_data_11 = src_data[i + 9 * src_step]; + float16_t src_data_12 = src_data[i + 10 * src_step]; + float16_t src_data_13 = src_data[i + 11 * src_step]; + float16_t src_data_14 = src_data[i + 12 * src_step]; + float16_t src_data_15 = src_data[i + 13 * src_step]; + float16_t src_data_16 = src_data[i + 14 * src_step]; + float16_t src_data_17 = src_data[i + 15 * src_step]; + float16_t src_data_20 = src_data[i + 16 * src_step]; + float16_t src_data_21 = src_data[i + 17 * src_step]; + float16_t src_data_22 = src_data[i + 18 * src_step]; + float16_t src_data_23 = src_data[i + 19 * src_step]; + float16_t src_data_24 = src_data[i + 20 * src_step]; + float16_t src_data_25 = src_data[i + 21 * src_step]; + float16_t src_data_26 = src_data[i + 22 * src_step]; + float16_t src_data_27 = src_data[i + 23 * src_step]; + float16_t src_data_30 = src_data[i + 24 * src_step]; + float16_t src_data_31 = src_data[i + 25 * src_step]; + float16_t src_data_32 = src_data[i + 26 * src_step]; + float16_t src_data_33 = src_data[i + 27 * src_step]; + float16_t src_data_34 = src_data[i + 28 * src_step]; + float16_t src_data_35 = src_data[i + 29 * src_step]; + float16_t src_data_36 = src_data[i + 30 * src_step]; + float16_t src_data_37 = src_data[i + 31 * src_step]; + float16_t src_data_40 = src_data[i + 32 * src_step]; + float16_t src_data_41 = 
src_data[i + 33 * src_step]; + float16_t src_data_42 = src_data[i + 34 * src_step]; + float16_t src_data_43 = src_data[i + 35 * src_step]; + float16_t src_data_44 = src_data[i + 36 * src_step]; + float16_t src_data_45 = src_data[i + 37 * src_step]; + float16_t src_data_46 = src_data[i + 38 * src_step]; + float16_t src_data_47 = src_data[i + 39 * src_step]; + float16_t src_data_50 = src_data[i + 40 * src_step]; + float16_t src_data_51 = src_data[i + 41 * src_step]; + float16_t src_data_52 = src_data[i + 42 * src_step]; + float16_t src_data_53 = src_data[i + 43 * src_step]; + float16_t src_data_54 = src_data[i + 44 * src_step]; + float16_t src_data_55 = src_data[i + 45 * src_step]; + float16_t src_data_56 = src_data[i + 46 * src_step]; + float16_t src_data_57 = src_data[i + 47 * src_step]; + float16_t src_data_60 = src_data[i + 48 * src_step]; + float16_t src_data_61 = src_data[i + 49 * src_step]; + float16_t src_data_62 = src_data[i + 50 * src_step]; + float16_t src_data_63 = src_data[i + 51 * src_step]; + float16_t src_data_64 = src_data[i + 52 * src_step]; + float16_t src_data_65 = src_data[i + 53 * src_step]; + float16_t src_data_66 = src_data[i + 54 * src_step]; + float16_t src_data_67 = src_data[i + 55 * src_step]; + float16_t src_data_70 = src_data[i + 56 * src_step]; + float16_t src_data_71 = src_data[i + 57 * src_step]; + float16_t src_data_72 = src_data[i + 58 * src_step]; + float16_t src_data_73 = src_data[i + 59 * src_step]; + float16_t src_data_74 = src_data[i + 60 * src_step]; + float16_t src_data_75 = src_data[i + 61 * src_step]; + float16_t src_data_76 = src_data[i + 62 * src_step]; + float16_t src_data_77 = src_data[i + 63 * src_step]; + + float16_t d01 = src_data_10 - src_data_20; + float16_t d02 = src_data_11 - src_data_21; + float16_t d03 = src_data_12 - src_data_22; + float16_t d04 = src_data_13 - src_data_23; + float16_t d05 = src_data_14 - src_data_24; + float16_t d06 = src_data_15 - src_data_25; + float16_t d07 = src_data_16 - src_data_26; + 
float16_t d08 = src_data_17 - src_data_27; + + float16_t d11 = src_data_30 - src_data_40; + float16_t d12 = src_data_31 - src_data_41; + float16_t d13 = src_data_32 - src_data_42; + float16_t d14 = src_data_33 - src_data_43; + float16_t d15 = src_data_34 - src_data_44; + float16_t d16 = src_data_35 - src_data_45; + float16_t d17 = src_data_36 - src_data_46; + float16_t d18 = src_data_37 - src_data_47; + + float16_t d21 = src_data_50 - src_data_60; + float16_t d22 = src_data_51 - src_data_61; + float16_t d23 = src_data_52 - src_data_62; + float16_t d24 = src_data_53 - src_data_63; + float16_t d25 = src_data_54 - src_data_64; + float16_t d26 = src_data_55 - src_data_65; + float16_t d27 = src_data_56 - src_data_66; + float16_t d28 = src_data_57 - src_data_67; + + float16_t d31 = src_data_10 + src_data_20; + float16_t d32 = src_data_11 + src_data_21; + float16_t d33 = src_data_12 + src_data_22; + float16_t d34 = src_data_13 + src_data_23; + float16_t d35 = src_data_14 + src_data_24; + float16_t d36 = src_data_15 + src_data_25; + float16_t d37 = src_data_16 + src_data_26; + float16_t d38 = src_data_17 + src_data_27; + + float16_t d41 = src_data_30 + src_data_40; + float16_t d42 = src_data_31 + src_data_41; + float16_t d43 = src_data_32 + src_data_42; + float16_t d44 = src_data_33 + src_data_43; + float16_t d45 = src_data_34 + src_data_44; + float16_t d46 = src_data_35 + src_data_45; + float16_t d47 = src_data_36 + src_data_46; + float16_t d48 = src_data_37 + src_data_47; + + float16_t d51 = src_data_50 + src_data_60; + float16_t d52 = src_data_51 + src_data_61; + float16_t d53 = src_data_52 + src_data_62; + float16_t d54 = src_data_53 + src_data_63; + float16_t d55 = src_data_54 + src_data_64; + float16_t d56 = src_data_55 + src_data_65; + float16_t d57 = src_data_56 + src_data_66; + float16_t d58 = src_data_57 + src_data_67; + + float16_t t00 = src_data_00 + src_data_10 + src_data_20 + src_data_30 + src_data_40 + src_data_50 + src_data_60; + float16_t t01 = src_data_01 
+ src_data_11 + src_data_21 + src_data_31 + src_data_41 + src_data_51 + src_data_61; + float16_t t02 = src_data_02 + src_data_12 + src_data_22 + src_data_32 + src_data_42 + src_data_52 + src_data_62; + float16_t t03 = src_data_03 + src_data_13 + src_data_23 + src_data_33 + src_data_43 + src_data_53 + src_data_63; + float16_t t04 = src_data_04 + src_data_14 + src_data_24 + src_data_34 + src_data_44 + src_data_54 + src_data_64; + float16_t t05 = src_data_05 + src_data_15 + src_data_25 + src_data_35 + src_data_45 + src_data_55 + src_data_65; + float16_t t06 = src_data_06 + src_data_16 + src_data_26 + src_data_36 + src_data_46 + src_data_56 + src_data_66; + float16_t t07 = src_data_07 + src_data_17 + src_data_27 + src_data_37 + src_data_47 + src_data_57 + src_data_67; + + const float16_t t10 = 0.5f * d01 + d11 + 1.5f * d21; + const float16_t t11 = 0.5f * d02 + d12 + 1.5f * d22; + const float16_t t12 = 0.5f * d03 + d13 + 1.5f * d23; + const float16_t t13 = 0.5f * d04 + d14 + 1.5f * d24; + const float16_t t14 = 0.5f * d05 + d15 + 1.5f * d25; + const float16_t t15 = 0.5f * d06 + d16 + 1.5f * d26; + const float16_t t16 = 0.5f * d07 + d17 + 1.5f * d27; + const float16_t t17 = 0.5f * d08 + d18 + 1.5f * d28; + + const float16_t t20 = 0.25f * d31 + d41 + 2.25f * d51; + const float16_t t21 = 0.25f * d32 + d42 + 2.25f * d52; + const float16_t t22 = 0.25f * d33 + d43 + 2.25f * d53; + const float16_t t23 = 0.25f * d34 + d44 + 2.25f * d54; + const float16_t t24 = 0.25f * d35 + d45 + 2.25f * d55; + const float16_t t25 = 0.25f * d36 + d46 + 2.25f * d56; + const float16_t t26 = 0.25f * d37 + d47 + 2.25f * d57; + const float16_t t27 = 0.25f * d38 + d48 + 2.25f * d58; + + const float16_t t30 = 0.125f * d01 + d11 + 3.375f * d21; + const float16_t t31 = 0.125f * d02 + d12 + 3.375f * d22; + const float16_t t32 = 0.125f * d03 + d13 + 3.375f * d23; + const float16_t t33 = 0.125f * d04 + d14 + 3.375f * d24; + const float16_t t34 = 0.125f * d05 + d15 + 3.375f * d25; + const float16_t t35 = 
0.125f * d06 + d16 + 3.375f * d26; + const float16_t t36 = 0.125f * d07 + d17 + 3.375f * d27; + const float16_t t37 = 0.125f * d08 + d18 + 3.375f * d28; + + const float16_t t40 = 0.0625f * d31 + d41 + 5.0625f * d51 + src_data_70; + const float16_t t41 = 0.0625f * d32 + d42 + 5.0625f * d52 + src_data_71; + const float16_t t42 = 0.0625f * d33 + d43 + 5.0625f * d53 + src_data_72; + const float16_t t43 = 0.0625f * d34 + d44 + 5.0625f * d54 + src_data_73; + const float16_t t44 = 0.0625f * d35 + d45 + 5.0625f * d55 + src_data_74; + const float16_t t45 = 0.0625f * d36 + d46 + 5.0625f * d56 + src_data_75; + const float16_t t46 = 0.0625f * d37 + d47 + 5.0625f * d57 + src_data_76; + const float16_t t47 = 0.0625f * d38 + d48 + 5.0625f * d58 + src_data_77; + + float16_t s11 = t01 - t02; + float16_t s12 = t11 - t12; + float16_t s13 = t21 - t22; + float16_t s14 = t31 - t32; + float16_t s15 = t41 - t42; + + float16_t s21 = t03 - t04; + float16_t s22 = t13 - t14; + float16_t s23 = t23 - t24; + float16_t s24 = t33 - t34; + float16_t s25 = t43 - t44; + + float16_t s31 = t05 - t06; + float16_t s32 = t15 - t16; + float16_t s33 = t25 - t26; + float16_t s34 = t35 - t36; + float16_t s35 = t45 - t46; + + float16_t s41 = t01 + t02; + float16_t s42 = t11 + t12; + float16_t s43 = t21 + t22; + float16_t s44 = t31 + t32; + float16_t s45 = t41 + t42; + + float16_t s51 = t03 + t04; + float16_t s52 = t13 + t14; + float16_t s53 = t23 + t24; + float16_t s54 = t33 + t34; + float16_t s55 = t43 + t44; + + float16_t s61 = t05 + t06; + float16_t s62 = t15 + t16; + float16_t s63 = t25 + t26; + float16_t s64 = t35 + t36; + float16_t s65 = t45 + t46; + + float16_t m00 = t00 + t01 + t02 + t03 + t04 + t05 + t06; + const float16_t m01 = 0.5f * s11 + s21 + 1.5f * s31; + const float16_t m02 = 0.25f * s41 + s51 + 2.25f * s61; + const float16_t m03 = 0.125f * s11 + s21 + 3.375f * s31; + const float16_t m04 = 0.0625f * s41 + s51 + 5.0625f * s61 + t07; + + float16_t m10 = t10 + t11 + t12 + t13 + t14 + t15 + t16; + 
const float16_t m11 = 0.5f * s12 + s22 + 1.5f * s32; + const float16_t m12 = 0.25f * s42 + s52 + 2.25f * s62; + const float16_t m13 = 0.125f * s12 + s22 + 3.375f * s32; + const float16_t m14 = 0.0625f * s42 + s52 + 5.0625f * s62 + t17; + + float16_t m20 = t20 + t21 + t22 + t23 + t24 + t25 + t26; + const float16_t m21 = 0.5f * s13 + s23 + 1.5f * s33; + const float16_t m22 = 0.25f * s43 + s53 + 2.25f * s63; + const float16_t m23 = 0.125f * s13 + s23 + 3.375f * s33; + const float16_t m24 = 0.0625f * s43 + s53 + 5.0625f * s63 + t27; + + float16_t m30 = t30 + t31 + t32 + t33 + t34 + t35 + t36; + const float16_t m31 = 0.5f * s14 + s24 + 1.5f * s34; + const float16_t m32 = 0.25f * s44 + s54 + 2.25f * s64; + const float16_t m33 = 0.125f * s14 + s24 + 3.375f * s34; + const float16_t m34 = 0.0625f * s44 + s54 + 5.0625f * s64 + t37; + + float16_t m40 = t40 + t41 + t42 + t43 + t44 + t45 + t46; + const float16_t m41 = 0.5f * s15 + s25 + 1.5f * s35; + const float16_t m42 = 0.25f * s45 + s55 + 2.25f * s65; + const float16_t m43 = 0.125f * s15 + s25 + 3.375f * s35; + const float16_t m44 = 0.0625f * s45 + s55 + 5.0625f * s65 + t47; + + (dst_data + i)[0] = m00 + bias_data[i]; + (dst_data + i + C8NUM)[0] = m01 + bias_data[i]; + (dst_data + i + 2 * C8NUM)[0] = m02 + bias_data[i]; + (dst_data + i + 3 * C8NUM)[0] = m03 + bias_data[i]; + (dst_data + i + 4 * C8NUM)[0] = m04 + bias_data[i]; + + (dst_data + i + dst_step * C8NUM)[0] = m10 + bias_data[i]; + (dst_data + i + dst_step * C8NUM + C8NUM)[0] = m11 + bias_data[i]; + (dst_data + i + dst_step * C8NUM + 2 * C8NUM)[0] = m12 + bias_data[i]; + (dst_data + i + dst_step * C8NUM + 3 * C8NUM)[0] = m13 + bias_data[i]; + (dst_data + i + dst_step * C8NUM + 4 * C8NUM)[0] = m14 + bias_data[i]; + + (dst_data + i + 2 * dst_step * C8NUM)[0] = m20 + bias_data[i]; + (dst_data + i + 2 * dst_step * C8NUM + C8NUM)[0] = m21 + bias_data[i]; + (dst_data + i + 2 * dst_step * C8NUM + 2 * C8NUM)[0] = m22 + bias_data[i]; + (dst_data + i + 2 * dst_step * C8NUM + 3 
* C8NUM)[0] = m23 + bias_data[i]; + (dst_data + i + 2 * dst_step * C8NUM + 4 * C8NUM)[0] = m24 + bias_data[i]; + + (dst_data + i + 3 * dst_step * C8NUM)[0] = m30 + bias_data[i]; + (dst_data + i + 3 * dst_step * C8NUM + C8NUM)[0] = m31 + bias_data[i]; + (dst_data + i + 3 * dst_step * C8NUM + 2 * C8NUM)[0] = m32 + bias_data[i]; + (dst_data + i + 3 * dst_step * C8NUM + 3 * C8NUM)[0] = m33 + bias_data[i]; + (dst_data + i + 3 * dst_step * C8NUM + 4 * C8NUM)[0] = m34 + bias_data[i]; + + (dst_data + i + 4 * dst_step * C8NUM)[0] = m40 + bias_data[i]; + (dst_data + i + 4 * dst_step * C8NUM + C8NUM)[0] = m41 + bias_data[i]; + (dst_data + i + 4 * dst_step * C8NUM + 2 * C8NUM)[0] = m42 + bias_data[i]; + (dst_data + i + 4 * dst_step * C8NUM + 3 * C8NUM)[0] = m43 + bias_data[i]; + (dst_data + i + 4 * dst_step * C8NUM + 4 * C8NUM)[0] = m44 + bias_data[i]; + } +#endif +} + +void OutputTransform8x6UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data, + int src_step, int dst_step) { +#ifdef ENABLE_ARM + float16x8_t src_data_00 = vld1q_f16(src_data + 0 * src_step); + float16x8_t src_data_01 = vld1q_f16(src_data + 1 * src_step); + float16x8_t src_data_02 = vld1q_f16(src_data + 2 * src_step); + float16x8_t src_data_03 = vld1q_f16(src_data + 3 * src_step); + float16x8_t src_data_04 = vld1q_f16(src_data + 4 * src_step); + float16x8_t src_data_05 = vld1q_f16(src_data + 5 * src_step); + float16x8_t src_data_06 = vld1q_f16(src_data + 6 * src_step); + float16x8_t src_data_07 = vld1q_f16(src_data + 7 * src_step); + float16x8_t src_data_10 = vld1q_f16(src_data + 8 * src_step); + float16x8_t src_data_11 = vld1q_f16(src_data + 9 * src_step); + float16x8_t src_data_12 = vld1q_f16(src_data + 10 * src_step); + float16x8_t src_data_13 = vld1q_f16(src_data + 11 * src_step); + float16x8_t src_data_14 = vld1q_f16(src_data + 12 * src_step); + float16x8_t src_data_15 = vld1q_f16(src_data + 13 * src_step); + float16x8_t src_data_16 = vld1q_f16(src_data + 14 * src_step); + 
float16x8_t src_data_17 = vld1q_f16(src_data + 15 * src_step); + float16x8_t src_data_20 = vld1q_f16(src_data + 16 * src_step); + float16x8_t src_data_21 = vld1q_f16(src_data + 17 * src_step); + float16x8_t src_data_22 = vld1q_f16(src_data + 18 * src_step); + float16x8_t src_data_23 = vld1q_f16(src_data + 19 * src_step); + float16x8_t src_data_24 = vld1q_f16(src_data + 20 * src_step); + float16x8_t src_data_25 = vld1q_f16(src_data + 21 * src_step); + float16x8_t src_data_26 = vld1q_f16(src_data + 22 * src_step); + float16x8_t src_data_27 = vld1q_f16(src_data + 23 * src_step); + float16x8_t src_data_30 = vld1q_f16(src_data + 24 * src_step); + float16x8_t src_data_31 = vld1q_f16(src_data + 25 * src_step); + float16x8_t src_data_32 = vld1q_f16(src_data + 26 * src_step); + float16x8_t src_data_33 = vld1q_f16(src_data + 27 * src_step); + float16x8_t src_data_34 = vld1q_f16(src_data + 28 * src_step); + float16x8_t src_data_35 = vld1q_f16(src_data + 29 * src_step); + float16x8_t src_data_36 = vld1q_f16(src_data + 30 * src_step); + float16x8_t src_data_37 = vld1q_f16(src_data + 31 * src_step); + float16x8_t src_data_40 = vld1q_f16(src_data + 32 * src_step); + float16x8_t src_data_41 = vld1q_f16(src_data + 33 * src_step); + float16x8_t src_data_42 = vld1q_f16(src_data + 34 * src_step); + float16x8_t src_data_43 = vld1q_f16(src_data + 35 * src_step); + float16x8_t src_data_44 = vld1q_f16(src_data + 36 * src_step); + float16x8_t src_data_45 = vld1q_f16(src_data + 37 * src_step); + float16x8_t src_data_46 = vld1q_f16(src_data + 38 * src_step); + float16x8_t src_data_47 = vld1q_f16(src_data + 39 * src_step); + float16x8_t src_data_50 = vld1q_f16(src_data + 40 * src_step); + float16x8_t src_data_51 = vld1q_f16(src_data + 41 * src_step); + float16x8_t src_data_52 = vld1q_f16(src_data + 42 * src_step); + float16x8_t src_data_53 = vld1q_f16(src_data + 43 * src_step); + float16x8_t src_data_54 = vld1q_f16(src_data + 44 * src_step); + float16x8_t src_data_55 = vld1q_f16(src_data + 45 
* src_step); + float16x8_t src_data_56 = vld1q_f16(src_data + 46 * src_step); + float16x8_t src_data_57 = vld1q_f16(src_data + 47 * src_step); + float16x8_t src_data_60 = vld1q_f16(src_data + 48 * src_step); + float16x8_t src_data_61 = vld1q_f16(src_data + 49 * src_step); + float16x8_t src_data_62 = vld1q_f16(src_data + 50 * src_step); + float16x8_t src_data_63 = vld1q_f16(src_data + 51 * src_step); + float16x8_t src_data_64 = vld1q_f16(src_data + 52 * src_step); + float16x8_t src_data_65 = vld1q_f16(src_data + 53 * src_step); + float16x8_t src_data_66 = vld1q_f16(src_data + 54 * src_step); + float16x8_t src_data_67 = vld1q_f16(src_data + 55 * src_step); + float16x8_t src_data_70 = vld1q_f16(src_data + 56 * src_step); + float16x8_t src_data_71 = vld1q_f16(src_data + 57 * src_step); + float16x8_t src_data_72 = vld1q_f16(src_data + 58 * src_step); + float16x8_t src_data_73 = vld1q_f16(src_data + 59 * src_step); + float16x8_t src_data_74 = vld1q_f16(src_data + 60 * src_step); + float16x8_t src_data_75 = vld1q_f16(src_data + 61 * src_step); + float16x8_t src_data_76 = vld1q_f16(src_data + 62 * src_step); + float16x8_t src_data_77 = vld1q_f16(src_data + 63 * src_step); + + float16x8_t d01 = vsubq_f16(src_data_10, src_data_20); + float16x8_t d02 = vsubq_f16(src_data_11, src_data_21); + float16x8_t d03 = vsubq_f16(src_data_12, src_data_22); + float16x8_t d04 = vsubq_f16(src_data_13, src_data_23); + float16x8_t d05 = vsubq_f16(src_data_14, src_data_24); + float16x8_t d06 = vsubq_f16(src_data_15, src_data_25); + float16x8_t d07 = vsubq_f16(src_data_16, src_data_26); + float16x8_t d08 = vsubq_f16(src_data_17, src_data_27); + + float16x8_t d11 = vsubq_f16(src_data_30, src_data_40); + float16x8_t d12 = vsubq_f16(src_data_31, src_data_41); + float16x8_t d13 = vsubq_f16(src_data_32, src_data_42); + float16x8_t d14 = vsubq_f16(src_data_33, src_data_43); + float16x8_t d15 = vsubq_f16(src_data_34, src_data_44); + float16x8_t d16 = vsubq_f16(src_data_35, src_data_45); + float16x8_t 
d17 = vsubq_f16(src_data_36, src_data_46); + float16x8_t d18 = vsubq_f16(src_data_37, src_data_47); + + float16x8_t d21 = vsubq_f16(src_data_50, src_data_60); + float16x8_t d22 = vsubq_f16(src_data_51, src_data_61); + float16x8_t d23 = vsubq_f16(src_data_52, src_data_62); + float16x8_t d24 = vsubq_f16(src_data_53, src_data_63); + float16x8_t d25 = vsubq_f16(src_data_54, src_data_64); + float16x8_t d26 = vsubq_f16(src_data_55, src_data_65); + float16x8_t d27 = vsubq_f16(src_data_56, src_data_66); + float16x8_t d28 = vsubq_f16(src_data_57, src_data_67); + + float16x8_t d31 = vaddq_f16(src_data_10, src_data_20); + float16x8_t d32 = vaddq_f16(src_data_11, src_data_21); + float16x8_t d33 = vaddq_f16(src_data_12, src_data_22); + float16x8_t d34 = vaddq_f16(src_data_13, src_data_23); + float16x8_t d35 = vaddq_f16(src_data_14, src_data_24); + float16x8_t d36 = vaddq_f16(src_data_15, src_data_25); + float16x8_t d37 = vaddq_f16(src_data_16, src_data_26); + float16x8_t d38 = vaddq_f16(src_data_17, src_data_27); + + float16x8_t d41 = vaddq_f16(src_data_30, src_data_40); + float16x8_t d42 = vaddq_f16(src_data_31, src_data_41); + float16x8_t d43 = vaddq_f16(src_data_32, src_data_42); + float16x8_t d44 = vaddq_f16(src_data_33, src_data_43); + float16x8_t d45 = vaddq_f16(src_data_34, src_data_44); + float16x8_t d46 = vaddq_f16(src_data_35, src_data_45); + float16x8_t d47 = vaddq_f16(src_data_36, src_data_46); + float16x8_t d48 = vaddq_f16(src_data_37, src_data_47); + + float16x8_t d51 = vaddq_f16(src_data_50, src_data_60); + float16x8_t d52 = vaddq_f16(src_data_51, src_data_61); + float16x8_t d53 = vaddq_f16(src_data_52, src_data_62); + float16x8_t d54 = vaddq_f16(src_data_53, src_data_63); + float16x8_t d55 = vaddq_f16(src_data_54, src_data_64); + float16x8_t d56 = vaddq_f16(src_data_55, src_data_65); + float16x8_t d57 = vaddq_f16(src_data_56, src_data_66); + float16x8_t d58 = vaddq_f16(src_data_57, src_data_67); + + float16x8_t t00 = vaddq_f16( + vaddq_f16( + 
vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_00, src_data_10), src_data_20), src_data_30), src_data_40), + src_data_50), + src_data_60); + float16x8_t t01 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_01, src_data_11), src_data_21), src_data_31), src_data_41), + src_data_51), + src_data_61); + float16x8_t t02 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_02, src_data_12), src_data_22), src_data_32), src_data_42), + src_data_52), + src_data_62); + float16x8_t t03 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_03, src_data_13), src_data_23), src_data_33), src_data_43), + src_data_53), + src_data_63); + float16x8_t t04 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_04, src_data_14), src_data_24), src_data_34), src_data_44), + src_data_54), + src_data_64); + float16x8_t t05 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_05, src_data_15), src_data_25), src_data_35), src_data_45), + src_data_55), + src_data_65); + float16x8_t t06 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_06, src_data_16), src_data_26), src_data_36), src_data_46), + src_data_56), + src_data_66); + float16x8_t t07 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_07, src_data_17), src_data_27), src_data_37), src_data_47), + src_data_57), + src_data_67); + + float16x8_t t10 = vaddq_f16(vaddq_f16(vmulq_n_f16(d01, 0.5), d11), vmulq_n_f16(d21, 1.5)); + float16x8_t t11 = vaddq_f16(vaddq_f16(vmulq_n_f16(d02, 0.5), d12), vmulq_n_f16(d22, 1.5)); + float16x8_t t12 = vaddq_f16(vaddq_f16(vmulq_n_f16(d03, 0.5), d13), vmulq_n_f16(d23, 1.5)); + float16x8_t t13 = vaddq_f16(vaddq_f16(vmulq_n_f16(d04, 0.5), d14), vmulq_n_f16(d24, 1.5)); + float16x8_t t14 = vaddq_f16(vaddq_f16(vmulq_n_f16(d05, 0.5), d15), vmulq_n_f16(d25, 1.5)); + float16x8_t t15 = vaddq_f16(vaddq_f16(vmulq_n_f16(d06, 0.5), d16), 
vmulq_n_f16(d26, 1.5)); + float16x8_t t16 = vaddq_f16(vaddq_f16(vmulq_n_f16(d07, 0.5), d17), vmulq_n_f16(d27, 1.5)); + float16x8_t t17 = vaddq_f16(vaddq_f16(vmulq_n_f16(d08, 0.5), d18), vmulq_n_f16(d28, 1.5)); + + float16x8_t t20 = vaddq_f16(vaddq_f16(vmulq_n_f16(d31, 0.25), d41), vmulq_n_f16(d51, 2.25)); + float16x8_t t21 = vaddq_f16(vaddq_f16(vmulq_n_f16(d32, 0.25), d42), vmulq_n_f16(d52, 2.25)); + float16x8_t t22 = vaddq_f16(vaddq_f16(vmulq_n_f16(d33, 0.25), d43), vmulq_n_f16(d53, 2.25)); + float16x8_t t23 = vaddq_f16(vaddq_f16(vmulq_n_f16(d34, 0.25), d44), vmulq_n_f16(d54, 2.25)); + float16x8_t t24 = vaddq_f16(vaddq_f16(vmulq_n_f16(d35, 0.25), d45), vmulq_n_f16(d55, 2.25)); + float16x8_t t25 = vaddq_f16(vaddq_f16(vmulq_n_f16(d36, 0.25), d46), vmulq_n_f16(d56, 2.25)); + float16x8_t t26 = vaddq_f16(vaddq_f16(vmulq_n_f16(d37, 0.25), d47), vmulq_n_f16(d57, 2.25)); + float16x8_t t27 = vaddq_f16(vaddq_f16(vmulq_n_f16(d38, 0.25), d48), vmulq_n_f16(d58, 2.25)); + + float16x8_t t30 = vaddq_f16(vaddq_f16(vmulq_n_f16(d01, 0.125), d11), vmulq_n_f16(d21, 3.375)); + float16x8_t t31 = vaddq_f16(vaddq_f16(vmulq_n_f16(d02, 0.125), d12), vmulq_n_f16(d22, 3.375)); + float16x8_t t32 = vaddq_f16(vaddq_f16(vmulq_n_f16(d03, 0.125), d13), vmulq_n_f16(d23, 3.375)); + float16x8_t t33 = vaddq_f16(vaddq_f16(vmulq_n_f16(d04, 0.125), d14), vmulq_n_f16(d24, 3.375)); + float16x8_t t34 = vaddq_f16(vaddq_f16(vmulq_n_f16(d05, 0.125), d15), vmulq_n_f16(d25, 3.375)); + float16x8_t t35 = vaddq_f16(vaddq_f16(vmulq_n_f16(d06, 0.125), d16), vmulq_n_f16(d26, 3.375)); + float16x8_t t36 = vaddq_f16(vaddq_f16(vmulq_n_f16(d07, 0.125), d17), vmulq_n_f16(d27, 3.375)); + float16x8_t t37 = vaddq_f16(vaddq_f16(vmulq_n_f16(d08, 0.125), d18), vmulq_n_f16(d28, 3.375)); + + float16x8_t t40 = vaddq_f16(vaddq_f16(vmulq_n_f16(d31, 0.0625), d41), vmulq_n_f16(d51, 5.0625)); + float16x8_t t41 = vaddq_f16(vaddq_f16(vmulq_n_f16(d32, 0.0625), d42), vmulq_n_f16(d52, 5.0625)); + float16x8_t t42 = 
vaddq_f16(vaddq_f16(vmulq_n_f16(d33, 0.0625), d43), vmulq_n_f16(d53, 5.0625)); + float16x8_t t43 = vaddq_f16(vaddq_f16(vmulq_n_f16(d34, 0.0625), d44), vmulq_n_f16(d54, 5.0625)); + float16x8_t t44 = vaddq_f16(vaddq_f16(vmulq_n_f16(d35, 0.0625), d45), vmulq_n_f16(d55, 5.0625)); + float16x8_t t45 = vaddq_f16(vaddq_f16(vmulq_n_f16(d36, 0.0625), d46), vmulq_n_f16(d56, 5.0625)); + float16x8_t t46 = vaddq_f16(vaddq_f16(vmulq_n_f16(d37, 0.0625), d47), vmulq_n_f16(d57, 5.0625)); + float16x8_t t47 = vaddq_f16(vaddq_f16(vmulq_n_f16(d38, 0.0625), d48), vmulq_n_f16(d58, 5.0625)); + + float16x8_t t50 = + vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d01, 0.03125), d11), vmulq_n_f16(d21, 7.59375)), src_data_70); + float16x8_t t51 = + vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d02, 0.03125), d12), vmulq_n_f16(d22, 7.59375)), src_data_71); + float16x8_t t52 = + vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d03, 0.03125), d13), vmulq_n_f16(d23, 7.59375)), src_data_72); + float16x8_t t53 = + vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d04, 0.03125), d14), vmulq_n_f16(d24, 7.59375)), src_data_73); + float16x8_t t54 = + vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d05, 0.03125), d15), vmulq_n_f16(d25, 7.59375)), src_data_74); + float16x8_t t55 = + vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d06, 0.03125), d16), vmulq_n_f16(d26, 7.59375)), src_data_75); + float16x8_t t56 = + vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d07, 0.03125), d17), vmulq_n_f16(d27, 7.59375)), src_data_76); + float16x8_t t57 = + vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d08, 0.03125), d18), vmulq_n_f16(d28, 7.59375)), src_data_77); + + float16x8_t s11 = vsubq_f16(t01, t02); + float16x8_t s12 = vsubq_f16(t11, t12); + float16x8_t s13 = vsubq_f16(t21, t22); + float16x8_t s14 = vsubq_f16(t31, t32); + float16x8_t s15 = vsubq_f16(t41, t42); + float16x8_t s16 = vsubq_f16(t51, t52); + + float16x8_t s21 = vsubq_f16(t03, t04); + float16x8_t s22 = vsubq_f16(t13, t14); + float16x8_t s23 = vsubq_f16(t23, t24); + float16x8_t s24 = vsubq_f16(t33, 
t34); + float16x8_t s25 = vsubq_f16(t43, t44); + float16x8_t s26 = vsubq_f16(t53, t54); + + float16x8_t s31 = vsubq_f16(t05, t06); + float16x8_t s32 = vsubq_f16(t15, t16); + float16x8_t s33 = vsubq_f16(t25, t26); + float16x8_t s34 = vsubq_f16(t35, t36); + float16x8_t s35 = vsubq_f16(t45, t46); + float16x8_t s36 = vsubq_f16(t55, t56); + + float16x8_t s41 = vaddq_f16(t01, t02); + float16x8_t s42 = vaddq_f16(t11, t12); + float16x8_t s43 = vaddq_f16(t21, t22); + float16x8_t s44 = vaddq_f16(t31, t32); + float16x8_t s45 = vaddq_f16(t41, t42); + float16x8_t s46 = vaddq_f16(t51, t52); + + float16x8_t s51 = vaddq_f16(t03, t04); + float16x8_t s52 = vaddq_f16(t13, t14); + float16x8_t s53 = vaddq_f16(t23, t24); + float16x8_t s54 = vaddq_f16(t33, t34); + float16x8_t s55 = vaddq_f16(t43, t44); + float16x8_t s56 = vaddq_f16(t53, t54); + + float16x8_t s61 = vaddq_f16(t05, t06); + float16x8_t s62 = vaddq_f16(t15, t16); + float16x8_t s63 = vaddq_f16(t25, t26); + float16x8_t s64 = vaddq_f16(t35, t36); + float16x8_t s65 = vaddq_f16(t45, t46); + float16x8_t s66 = vaddq_f16(t55, t56); + + float16x8_t m00 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t00, t01), t02), t03), t04), t05), t06); + float16x8_t m01 = vaddq_f16(vaddq_f16(vmulq_n_f16(s11, 0.5), s21), vmulq_n_f16(s31, 1.5)); + float16x8_t m02 = vaddq_f16(vaddq_f16(vmulq_n_f16(s41, 0.25), s51), vmulq_n_f16(s61, 2.25)); + float16x8_t m03 = vaddq_f16(vaddq_f16(vmulq_n_f16(s11, 0.125), s21), vmulq_n_f16(s31, 3.375)); + float16x8_t m04 = vaddq_f16(vaddq_f16(vmulq_n_f16(s41, 0.0625), s51), vmulq_n_f16(s61, 5.0625)); + float16x8_t m05 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s11, 0.03125), s21), vmulq_n_f16(s31, 7.59375)), t07); + + float16x8_t m10 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t10, t11), t12), t13), t14), t15), t16); + float16x8_t m11 = vaddq_f16(vaddq_f16(vmulq_n_f16(s12, 0.5), s22), vmulq_n_f16(s32, 1.5)); + float16x8_t m12 = vaddq_f16(vaddq_f16(vmulq_n_f16(s42, 0.25), s52), 
vmulq_n_f16(s62, 2.25)); + float16x8_t m13 = vaddq_f16(vaddq_f16(vmulq_n_f16(s12, 0.125), s22), vmulq_n_f16(s32, 3.375)); + float16x8_t m14 = vaddq_f16(vaddq_f16(vmulq_n_f16(s42, 0.0625), s52), vmulq_n_f16(s62, 5.0625)); + float16x8_t m15 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s12, 0.03125), s22), vmulq_n_f16(s32, 7.59375)), t17); + + float16x8_t m20 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t20, t21), t22), t23), t24), t25), t26); + float16x8_t m21 = vaddq_f16(vaddq_f16(vmulq_n_f16(s13, 0.5), s23), vmulq_n_f16(s33, 1.5)); + float16x8_t m22 = vaddq_f16(vaddq_f16(vmulq_n_f16(s43, 0.25), s53), vmulq_n_f16(s63, 2.25)); + float16x8_t m23 = vaddq_f16(vaddq_f16(vmulq_n_f16(s13, 0.125), s23), vmulq_n_f16(s33, 3.375)); + float16x8_t m24 = vaddq_f16(vaddq_f16(vmulq_n_f16(s43, 0.0625), s53), vmulq_n_f16(s63, 5.0625)); + float16x8_t m25 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s13, 0.03125), s23), vmulq_n_f16(s33, 7.59375)), t27); + + float16x8_t m30 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t30, t31), t32), t33), t34), t35), t36); + float16x8_t m31 = vaddq_f16(vaddq_f16(vmulq_n_f16(s14, 0.5), s24), vmulq_n_f16(s34, 1.5)); + float16x8_t m32 = vaddq_f16(vaddq_f16(vmulq_n_f16(s44, 0.25), s54), vmulq_n_f16(s64, 2.25)); + float16x8_t m33 = vaddq_f16(vaddq_f16(vmulq_n_f16(s14, 0.125), s24), vmulq_n_f16(s34, 3.375)); + float16x8_t m34 = vaddq_f16(vaddq_f16(vmulq_n_f16(s44, 0.0625), s54), vmulq_n_f16(s64, 5.0625)); + float16x8_t m35 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s14, 0.03125), s24), vmulq_n_f16(s34, 7.59375)), t37); + + float16x8_t m40 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t40, t41), t42), t43), t44), t45), t46); + float16x8_t m41 = vaddq_f16(vaddq_f16(vmulq_n_f16(s15, 0.5), s25), vmulq_n_f16(s35, 1.5)); + float16x8_t m42 = vaddq_f16(vaddq_f16(vmulq_n_f16(s45, 0.25), s55), vmulq_n_f16(s65, 2.25)); + float16x8_t m43 = vaddq_f16(vaddq_f16(vmulq_n_f16(s15, 0.125), s25), vmulq_n_f16(s35, 3.375)); + 
float16x8_t m44 = vaddq_f16(vaddq_f16(vmulq_n_f16(s45, 0.0625), s55), vmulq_n_f16(s65, 5.0625)); + float16x8_t m45 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s15, 0.03125), s25), vmulq_n_f16(s35, 7.59375)), t47); + + float16x8_t m50 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t50, t51), t52), t53), t54), t55), t56); + float16x8_t m51 = vaddq_f16(vaddq_f16(vmulq_n_f16(s16, 0.5), s26), vmulq_n_f16(s36, 1.5)); + float16x8_t m52 = vaddq_f16(vaddq_f16(vmulq_n_f16(s46, 0.25), s56), vmulq_n_f16(s66, 2.25)); + float16x8_t m53 = vaddq_f16(vaddq_f16(vmulq_n_f16(s16, 0.125), s26), vmulq_n_f16(s36, 3.375)); + float16x8_t m54 = vaddq_f16(vaddq_f16(vmulq_n_f16(s46, 0.0625), s56), vmulq_n_f16(s66, 5.0625)); + float16x8_t m55 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s16, 0.03125), s26), vmulq_n_f16(s36, 7.59375)), t57); + + float16x8_t bias_ptr = vld1q_f16(bias_data); + vst1q_f16(dst_data, vaddq_f16(m00, bias_ptr)); + vst1q_f16(dst_data + C8NUM, vaddq_f16(m01, bias_ptr)); + vst1q_f16(dst_data + 2 * C8NUM, vaddq_f16(m02, bias_ptr)); + vst1q_f16(dst_data + 3 * C8NUM, vaddq_f16(m03, bias_ptr)); + vst1q_f16(dst_data + 4 * C8NUM, vaddq_f16(m04, bias_ptr)); + vst1q_f16(dst_data + 5 * C8NUM, vaddq_f16(m05, bias_ptr)); + vst1q_f16(dst_data + dst_step * C8NUM, vaddq_f16(m10, bias_ptr)); + vst1q_f16(dst_data + dst_step * C8NUM + C8NUM, vaddq_f16(m11, bias_ptr)); + vst1q_f16(dst_data + dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m12, bias_ptr)); + vst1q_f16(dst_data + dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m13, bias_ptr)); + vst1q_f16(dst_data + dst_step * C8NUM + 4 * C8NUM, vaddq_f16(m14, bias_ptr)); + vst1q_f16(dst_data + dst_step * C8NUM + 5 * C8NUM, vaddq_f16(m15, bias_ptr)); + vst1q_f16(dst_data + 2 * dst_step * C8NUM, vaddq_f16(m20, bias_ptr)); + vst1q_f16(dst_data + 2 * dst_step * C8NUM + C8NUM, vaddq_f16(m21, bias_ptr)); + vst1q_f16(dst_data + 2 * dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m22, bias_ptr)); + vst1q_f16(dst_data + 2 * dst_step * C8NUM + 3 * C8NUM, 
vaddq_f16(m23, bias_ptr)); + vst1q_f16(dst_data + 2 * dst_step * C8NUM + 4 * C8NUM, vaddq_f16(m24, bias_ptr)); + vst1q_f16(dst_data + 2 * dst_step * C8NUM + 5 * C8NUM, vaddq_f16(m25, bias_ptr)); + vst1q_f16(dst_data + 3 * dst_step * C8NUM, vaddq_f16(m30, bias_ptr)); + vst1q_f16(dst_data + 3 * dst_step * C8NUM + C8NUM, vaddq_f16(m31, bias_ptr)); + vst1q_f16(dst_data + 3 * dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m32, bias_ptr)); + vst1q_f16(dst_data + 3 * dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m33, bias_ptr)); + vst1q_f16(dst_data + 3 * dst_step * C8NUM + 4 * C8NUM, vaddq_f16(m34, bias_ptr)); + vst1q_f16(dst_data + 3 * dst_step * C8NUM + 5 * C8NUM, vaddq_f16(m35, bias_ptr)); + vst1q_f16(dst_data + 4 * dst_step * C8NUM, vaddq_f16(m40, bias_ptr)); + vst1q_f16(dst_data + 4 * dst_step * C8NUM + C8NUM, vaddq_f16(m41, bias_ptr)); + vst1q_f16(dst_data + 4 * dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m42, bias_ptr)); + vst1q_f16(dst_data + 4 * dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m43, bias_ptr)); + vst1q_f16(dst_data + 4 * dst_step * C8NUM + 4 * C8NUM, vaddq_f16(m44, bias_ptr)); + vst1q_f16(dst_data + 4 * dst_step * C8NUM + 5 * C8NUM, vaddq_f16(m45, bias_ptr)); + vst1q_f16(dst_data + 5 * dst_step * C8NUM, vaddq_f16(m50, bias_ptr)); + vst1q_f16(dst_data + 5 * dst_step * C8NUM + C8NUM, vaddq_f16(m51, bias_ptr)); + vst1q_f16(dst_data + 5 * dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m52, bias_ptr)); + vst1q_f16(dst_data + 5 * dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m53, bias_ptr)); + vst1q_f16(dst_data + 5 * dst_step * C8NUM + 4 * C8NUM, vaddq_f16(m54, bias_ptr)); + vst1q_f16(dst_data + 5 * dst_step * C8NUM + 5 * C8NUM, vaddq_f16(m55, bias_ptr)); +#else + for (int i = 0; i < C8NUM; i++) { + float16_t src_data_00 = src_data[i]; + float16_t src_data_01 = src_data[i + src_step]; + float16_t src_data_02 = src_data[i + 2 * src_step]; + float16_t src_data_03 = src_data[i + 3 * src_step]; + float16_t src_data_04 = src_data[i + 4 * src_step]; + float16_t src_data_05 = src_data[i + 5 * 
src_step]; + float16_t src_data_06 = src_data[i + 6 * src_step]; + float16_t src_data_07 = src_data[i + 7 * src_step]; + float16_t src_data_10 = src_data[i + 8 * src_step]; + float16_t src_data_11 = src_data[i + 9 * src_step]; + float16_t src_data_12 = src_data[i + 10 * src_step]; + float16_t src_data_13 = src_data[i + 11 * src_step]; + float16_t src_data_14 = src_data[i + 12 * src_step]; + float16_t src_data_15 = src_data[i + 13 * src_step]; + float16_t src_data_16 = src_data[i + 14 * src_step]; + float16_t src_data_17 = src_data[i + 15 * src_step]; + float16_t src_data_20 = src_data[i + 16 * src_step]; + float16_t src_data_21 = src_data[i + 17 * src_step]; + float16_t src_data_22 = src_data[i + 18 * src_step]; + float16_t src_data_23 = src_data[i + 19 * src_step]; + float16_t src_data_24 = src_data[i + 20 * src_step]; + float16_t src_data_25 = src_data[i + 21 * src_step]; + float16_t src_data_26 = src_data[i + 22 * src_step]; + float16_t src_data_27 = src_data[i + 23 * src_step]; + float16_t src_data_30 = src_data[i + 24 * src_step]; + float16_t src_data_31 = src_data[i + 25 * src_step]; + float16_t src_data_32 = src_data[i + 26 * src_step]; + float16_t src_data_33 = src_data[i + 27 * src_step]; + float16_t src_data_34 = src_data[i + 28 * src_step]; + float16_t src_data_35 = src_data[i + 29 * src_step]; + float16_t src_data_36 = src_data[i + 30 * src_step]; + float16_t src_data_37 = src_data[i + 31 * src_step]; + float16_t src_data_40 = src_data[i + 32 * src_step]; + float16_t src_data_41 = src_data[i + 33 * src_step]; + float16_t src_data_42 = src_data[i + 34 * src_step]; + float16_t src_data_43 = src_data[i + 35 * src_step]; + float16_t src_data_44 = src_data[i + 36 * src_step]; + float16_t src_data_45 = src_data[i + 37 * src_step]; + float16_t src_data_46 = src_data[i + 38 * src_step]; + float16_t src_data_47 = src_data[i + 39 * src_step]; + float16_t src_data_50 = src_data[i + 40 * src_step]; + float16_t src_data_51 = src_data[i + 41 * src_step]; + float16_t 
src_data_52 = src_data[i + 42 * src_step]; + float16_t src_data_53 = src_data[i + 43 * src_step]; + float16_t src_data_54 = src_data[i + 44 * src_step]; + float16_t src_data_55 = src_data[i + 45 * src_step]; + float16_t src_data_56 = src_data[i + 46 * src_step]; + float16_t src_data_57 = src_data[i + 47 * src_step]; + float16_t src_data_60 = src_data[i + 48 * src_step]; + float16_t src_data_61 = src_data[i + 49 * src_step]; + float16_t src_data_62 = src_data[i + 50 * src_step]; + float16_t src_data_63 = src_data[i + 51 * src_step]; + float16_t src_data_64 = src_data[i + 52 * src_step]; + float16_t src_data_65 = src_data[i + 53 * src_step]; + float16_t src_data_66 = src_data[i + 54 * src_step]; + float16_t src_data_67 = src_data[i + 55 * src_step]; + float16_t src_data_70 = src_data[i + 56 * src_step]; + float16_t src_data_71 = src_data[i + 57 * src_step]; + float16_t src_data_72 = src_data[i + 58 * src_step]; + float16_t src_data_73 = src_data[i + 59 * src_step]; + float16_t src_data_74 = src_data[i + 60 * src_step]; + float16_t src_data_75 = src_data[i + 61 * src_step]; + float16_t src_data_76 = src_data[i + 62 * src_step]; + float16_t src_data_77 = src_data[i + 63 * src_step]; + + float16_t d01 = src_data_10 - src_data_20; + float16_t d02 = src_data_11 - src_data_21; + float16_t d03 = src_data_12 - src_data_22; + float16_t d04 = src_data_13 - src_data_23; + float16_t d05 = src_data_14 - src_data_24; + float16_t d06 = src_data_15 - src_data_25; + float16_t d07 = src_data_16 - src_data_26; + float16_t d08 = src_data_17 - src_data_27; + + float16_t d11 = src_data_30 - src_data_40; + float16_t d12 = src_data_31 - src_data_41; + float16_t d13 = src_data_32 - src_data_42; + float16_t d14 = src_data_33 - src_data_43; + float16_t d15 = src_data_34 - src_data_44; + float16_t d16 = src_data_35 - src_data_45; + float16_t d17 = src_data_36 - src_data_46; + float16_t d18 = src_data_37 - src_data_47; + + float16_t d21 = src_data_50 - src_data_60; + float16_t d22 = src_data_51 
- src_data_61; + float16_t d23 = src_data_52 - src_data_62; + float16_t d24 = src_data_53 - src_data_63; + float16_t d25 = src_data_54 - src_data_64; + float16_t d26 = src_data_55 - src_data_65; + float16_t d27 = src_data_56 - src_data_66; + float16_t d28 = src_data_57 - src_data_67; + + float16_t d31 = src_data_10 + src_data_20; + float16_t d32 = src_data_11 + src_data_21; + float16_t d33 = src_data_12 + src_data_22; + float16_t d34 = src_data_13 + src_data_23; + float16_t d35 = src_data_14 + src_data_24; + float16_t d36 = src_data_15 + src_data_25; + float16_t d37 = src_data_16 + src_data_26; + float16_t d38 = src_data_17 + src_data_27; + + float16_t d41 = src_data_30 + src_data_40; + float16_t d42 = src_data_31 + src_data_41; + float16_t d43 = src_data_32 + src_data_42; + float16_t d44 = src_data_33 + src_data_43; + float16_t d45 = src_data_34 + src_data_44; + float16_t d46 = src_data_35 + src_data_45; + float16_t d47 = src_data_36 + src_data_46; + float16_t d48 = src_data_37 + src_data_47; + + float16_t d51 = src_data_50 + src_data_60; + float16_t d52 = src_data_51 + src_data_61; + float16_t d53 = src_data_52 + src_data_62; + float16_t d54 = src_data_53 + src_data_63; + float16_t d55 = src_data_54 + src_data_64; + float16_t d56 = src_data_55 + src_data_65; + float16_t d57 = src_data_56 + src_data_66; + float16_t d58 = src_data_57 + src_data_67; + + float16_t t00 = src_data_00 + src_data_10 + src_data_20 + src_data_30 + src_data_40 + src_data_50 + src_data_60; + float16_t t01 = src_data_01 + src_data_11 + src_data_21 + src_data_31 + src_data_41 + src_data_51 + src_data_61; + float16_t t02 = src_data_02 + src_data_12 + src_data_22 + src_data_32 + src_data_42 + src_data_52 + src_data_62; + float16_t t03 = src_data_03 + src_data_13 + src_data_23 + src_data_33 + src_data_43 + src_data_53 + src_data_63; + float16_t t04 = src_data_04 + src_data_14 + src_data_24 + src_data_34 + src_data_44 + src_data_54 + src_data_64; + float16_t t05 = src_data_05 + src_data_15 + 
src_data_25 + src_data_35 + src_data_45 + src_data_55 + src_data_65; + float16_t t06 = src_data_06 + src_data_16 + src_data_26 + src_data_36 + src_data_46 + src_data_56 + src_data_66; + float16_t t07 = src_data_07 + src_data_17 + src_data_27 + src_data_37 + src_data_47 + src_data_57 + src_data_67; + + const float16_t t10 = 0.5f * d01 + d11 + 1.5f * d21; + const float16_t t11 = 0.5f * d02 + d12 + 1.5f * d22; + const float16_t t12 = 0.5f * d03 + d13 + 1.5f * d23; + const float16_t t13 = 0.5f * d04 + d14 + 1.5f * d24; + const float16_t t14 = 0.5f * d05 + d15 + 1.5f * d25; + const float16_t t15 = 0.5f * d06 + d16 + 1.5f * d26; + const float16_t t16 = 0.5f * d07 + d17 + 1.5f * d27; + const float16_t t17 = 0.5f * d08 + d18 + 1.5f * d28; + + const float16_t t20 = 0.25f * d31 + d41 + 2.25f * d51; + const float16_t t21 = 0.25f * d32 + d42 + 2.25f * d52; + const float16_t t22 = 0.25f * d33 + d43 + 2.25f * d53; + const float16_t t23 = 0.25f * d34 + d44 + 2.25f * d54; + const float16_t t24 = 0.25f * d35 + d45 + 2.25f * d55; + const float16_t t25 = 0.25f * d36 + d46 + 2.25f * d56; + const float16_t t26 = 0.25f * d37 + d47 + 2.25f * d57; + const float16_t t27 = 0.25f * d38 + d48 + 2.25f * d58; + + const float16_t t30 = 0.125f * d01 + d11 + 3.375f * d21; + const float16_t t31 = 0.125f * d02 + d12 + 3.375f * d22; + const float16_t t32 = 0.125f * d03 + d13 + 3.375f * d23; + const float16_t t33 = 0.125f * d04 + d14 + 3.375f * d24; + const float16_t t34 = 0.125f * d05 + d15 + 3.375f * d25; + const float16_t t35 = 0.125f * d06 + d16 + 3.375f * d26; + const float16_t t36 = 0.125f * d07 + d17 + 3.375f * d27; + const float16_t t37 = 0.125f * d08 + d18 + 3.375f * d28; + + const float16_t t40 = 0.0625f * d31 + d41 + 5.0625f * d51; + const float16_t t41 = 0.0625f * d32 + d42 + 5.0625f * d52; + const float16_t t42 = 0.0625f * d33 + d43 + 5.0625f * d53; + const float16_t t43 = 0.0625f * d34 + d44 + 5.0625f * d54; + const float16_t t44 = 0.0625f * d35 + d45 + 5.0625f * d55; + const float16_t 
t45 = 0.0625f * d36 + d46 + 5.0625f * d56; + const float16_t t46 = 0.0625f * d37 + d47 + 5.0625f * d57; + const float16_t t47 = 0.0625f * d38 + d48 + 5.0625f * d58; + + const float16_t t50 = 0.03125f * d01 + d11 + 7.59375f * d21 + src_data_70; + const float16_t t51 = 0.03125f * d02 + d12 + 7.59375f * d22 + src_data_71; + const float16_t t52 = 0.03125f * d03 + d13 + 7.59375f * d23 + src_data_72; + const float16_t t53 = 0.03125f * d04 + d14 + 7.59375f * d24 + src_data_73; + const float16_t t54 = 0.03125f * d05 + d15 + 7.59375f * d25 + src_data_74; + const const float16_t t55 = 0.03125f * d06 + d16 + 7.59375f * d26 + src_data_75; + const float16_t t56 = 0.03125f * d07 + d17 + 7.59375f * d27 + src_data_76; + const float16_t t57 = 0.03125f * d08 + d18 + 7.59375f * d28 + src_data_77; + + float16_t s11 = t01 - t02; + float16_t s12 = t11 - t12; + float16_t s13 = t21 - t22; + float16_t s14 = t31 - t32; + float16_t s15 = t41 - t42; + float16_t s16 = t51 - t52; + + float16_t s21 = t03 - t04; + float16_t s22 = t13 - t14; + float16_t s23 = t23 - t24; + float16_t s24 = t33 - t34; + float16_t s25 = t43 - t44; + float16_t s26 = t53 - t54; + + float16_t s31 = t05 - t06; + float16_t s32 = t15 - t16; + float16_t s33 = t25 - t26; + float16_t s34 = t35 - t36; + float16_t s35 = t45 - t46; + float16_t s36 = t55 - t56; + + float16_t s41 = t01 + t02; + float16_t s42 = t11 + t12; + float16_t s43 = t21 + t22; + float16_t s44 = t31 + t32; + float16_t s45 = t41 + t42; + float16_t s46 = t51 + t52; + + float16_t s51 = t03 + t04; + float16_t s52 = t13 + t14; + float16_t s53 = t23 + t24; + float16_t s54 = t33 + t34; + float16_t s55 = t43 + t44; + float16_t s56 = t53 + t54; + + float16_t s61 = t05 + t06; + float16_t s62 = t15 + t16; + float16_t s63 = t25 + t26; + float16_t s64 = t35 + t36; + float16_t s65 = t45 + t46; + float16_t s66 = t55 + t56; + + float16_t m00 = t00 + t01 + t02 + t03 + t04 + t05 + t06; + const float16_t m01 = 0.5f * s11 + s21 + 1.5f * s31; + const float16_t m02 = 0.25f * s41 + 
s51 + 2.25f * s61; + const float16_t m03 = 0.125f * s11 + s21 + 3.375f * s31; + const float16_t m04 = 0.0625f * s41 + s51 + 5.0625f * s61; + const float16_t m05 = 0.03125f * s11 + s21 + 7.59375f * s31 + t07; + + float16_t m10 = t10 + t11 + t12 + t13 + t14 + t15 + t16; + const float16_t m11 = 0.5f * s12 + s22 + 1.5f * s32; + const float16_t m12 = 0.25f * s42 + s52 + 2.25f * s62; + const float16_t m13 = 0.125f * s12 + s22 + 3.375f * s32; + const float16_t m14 = 0.0625f * s42 + s52 + 5.0625f * s62; + const float16_t m15 = 0.03125f * s12 + s22 + 7.59375f * s32 + t17; + + float16_t m20 = t20 + t21 + t22 + t23 + t24 + t25 + t26; + const float16_t m21 = 0.5f * s13 + s23 + 1.5f * s33; + const float16_t m22 = 0.25f * s43 + s53 + 2.25f * s63; + const float16_t m23 = 0.125f * s13 + s23 + 3.375f * s33; + const float16_t m24 = 0.0625f * s43 + s53 + 5.0625f * s63; + const float16_t m25 = 0.03125f * s13 + s23 + 7.59375f * s33 + t27; + + float16_t m30 = t30 + t31 + t32 + t33 + t34 + t35 + t36; + const float16_t m31 = 0.5f * s14 + s24 + 1.5f * s34; + const float16_t m32 = 0.25f * s44 + s54 + 2.25f * s64; + const float16_t m33 = 0.125f * s14 + s24 + 3.375f * s34; + const float16_t m34 = 0.0625f * s44 + s54 + 5.0625f * s64; + const float16_t m35 = 0.03125f * s14 + s24 + 7.59375f * s34 + t37; + + float16_t m40 = t40 + t41 + t42 + t43 + t44 + t45 + t46; + const float16_t m41 = 0.5f * s15 + s25 + 1.5f * s35; + const float16_t m42 = 0.25f * s45 + s55 + 2.25f * s65; + const float16_t m43 = 0.125f * s15 + s25 + 3.375f * s35; + const float16_t m44 = 0.0625f * s45 + s55 + 5.0625f * s65; + const float16_t m45 = 0.03125f * s15 + s25 + 7.59375f * s35 + t47; + + float16_t m50 = t50 + t51 + t52 + t53 + t54 + t55 + t56; + const float16_t m51 = 0.5f * s16 + s26 + 1.5f * s36; + const float16_t m52 = 0.25f * s46 + s56 + 2.25f * s66; + const float16_t m53 = 0.125f * s16 + s26 + 3.375f * s36; + const float16_t m54 = 0.0625f * s46 + s56 + 5.0625f * s66; + const float16_t m55 = 0.03125f * s16 + s26 + 
7.59375f * s36 + t57; + + (dst_data + i)[0] = m00 + bias_data[i]; + (dst_data + i + C8NUM)[0] = m01 + bias_data[i]; + (dst_data + i + 2 * C8NUM)[0] = m02 + bias_data[i]; + (dst_data + i + 3 * C8NUM)[0] = m03 + bias_data[i]; + (dst_data + i + 4 * C8NUM)[0] = m04 + bias_data[i]; + (dst_data + i + 5 * C8NUM)[0] = m05 + bias_data[i]; + + (dst_data + i + dst_step * C8NUM)[0] = m10 + bias_data[i]; + (dst_data + i + dst_step * C8NUM + C8NUM)[0] = m11 + bias_data[i]; + (dst_data + i + dst_step * C8NUM + 2 * C8NUM)[0] = m12 + bias_data[i]; + (dst_data + i + dst_step * C8NUM + 3 * C8NUM)[0] = m13 + bias_data[i]; + (dst_data + i + dst_step * C8NUM + 4 * C8NUM)[0] = m14 + bias_data[i]; + (dst_data + i + dst_step * C8NUM + 5 * C8NUM)[0] = m15 + bias_data[i]; + + (dst_data + i + 2 * dst_step * C8NUM)[0] = m20 + bias_data[i]; + (dst_data + i + 2 * dst_step * C8NUM + C8NUM)[0] = m21 + bias_data[i]; + (dst_data + i + 2 * dst_step * C8NUM + 2 * C8NUM)[0] = m22 + bias_data[i]; + (dst_data + i + 2 * dst_step * C8NUM + 3 * C8NUM)[0] = m23 + bias_data[i]; + (dst_data + i + 2 * dst_step * C8NUM + 4 * C8NUM)[0] = m24 + bias_data[i]; + (dst_data + i + 2 * dst_step * C8NUM + 5 * C8NUM)[0] = m25 + bias_data[i]; + + (dst_data + i + 3 * dst_step * C8NUM)[0] = m30 + bias_data[i]; + (dst_data + i + 3 * dst_step * C8NUM + C8NUM)[0] = m31 + bias_data[i]; + (dst_data + i + 3 * dst_step * C8NUM + 2 * C8NUM)[0] = m32 + bias_data[i]; + (dst_data + i + 3 * dst_step * C8NUM + 3 * C8NUM)[0] = m33 + bias_data[i]; + (dst_data + i + 3 * dst_step * C8NUM + 4 * C8NUM)[0] = m34 + bias_data[i]; + (dst_data + i + 3 * dst_step * C8NUM + 5 * C8NUM)[0] = m35 + bias_data[i]; + + (dst_data + i + 4 * dst_step * C8NUM)[0] = m40 + bias_data[i]; + (dst_data + i + 4 * dst_step * C8NUM + C8NUM)[0] = m41 + bias_data[i]; + (dst_data + i + 4 * dst_step * C8NUM + 2 * C8NUM)[0] = m42 + bias_data[i]; + (dst_data + i + 4 * dst_step * C8NUM + 3 * C8NUM)[0] = m43 + bias_data[i]; + (dst_data + i + 4 * dst_step * C8NUM + 4 * 
C8NUM)[0] = m44 + bias_data[i]; + (dst_data + i + 4 * dst_step * C8NUM + 5 * C8NUM)[0] = m45 + bias_data[i]; + + (dst_data + i + 5 * dst_step * C8NUM)[0] = m50 + bias_data[i]; + (dst_data + i + 5 * dst_step * C8NUM + C8NUM)[0] = m51 + bias_data[i]; + (dst_data + i + 5 * dst_step * C8NUM + 2 * C8NUM)[0] = m52 + bias_data[i]; + (dst_data + i + 5 * dst_step * C8NUM + 3 * C8NUM)[0] = m53 + bias_data[i]; + (dst_data + i + 5 * dst_step * C8NUM + 4 * C8NUM)[0] = m54 + bias_data[i]; + (dst_data + i + 5 * dst_step * C8NUM + 5 * C8NUM)[0] = m55 + bias_data[i]; + } +#endif +} + +void OutputTransform8x7UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data, + int src_step, int dst_step) { +#ifdef ENABLE_ARM + float16x8_t src_data_00 = vld1q_f16(src_data + 0 * src_step); + float16x8_t src_data_01 = vld1q_f16(src_data + 1 * src_step); + float16x8_t src_data_02 = vld1q_f16(src_data + 2 * src_step); + float16x8_t src_data_03 = vld1q_f16(src_data + 3 * src_step); + float16x8_t src_data_04 = vld1q_f16(src_data + 4 * src_step); + float16x8_t src_data_05 = vld1q_f16(src_data + 5 * src_step); + float16x8_t src_data_06 = vld1q_f16(src_data + 6 * src_step); + float16x8_t src_data_07 = vld1q_f16(src_data + 7 * src_step); + float16x8_t src_data_10 = vld1q_f16(src_data + 8 * src_step); + float16x8_t src_data_11 = vld1q_f16(src_data + 9 * src_step); + float16x8_t src_data_12 = vld1q_f16(src_data + 10 * src_step); + float16x8_t src_data_13 = vld1q_f16(src_data + 11 * src_step); + float16x8_t src_data_14 = vld1q_f16(src_data + 12 * src_step); + float16x8_t src_data_15 = vld1q_f16(src_data + 13 * src_step); + float16x8_t src_data_16 = vld1q_f16(src_data + 14 * src_step); + float16x8_t src_data_17 = vld1q_f16(src_data + 15 * src_step); + float16x8_t src_data_20 = vld1q_f16(src_data + 16 * src_step); + float16x8_t src_data_21 = vld1q_f16(src_data + 17 * src_step); + float16x8_t src_data_22 = vld1q_f16(src_data + 18 * src_step); + float16x8_t src_data_23 = 
vld1q_f16(src_data + 19 * src_step); + float16x8_t src_data_24 = vld1q_f16(src_data + 20 * src_step); + float16x8_t src_data_25 = vld1q_f16(src_data + 21 * src_step); + float16x8_t src_data_26 = vld1q_f16(src_data + 22 * src_step); + float16x8_t src_data_27 = vld1q_f16(src_data + 23 * src_step); + float16x8_t src_data_30 = vld1q_f16(src_data + 24 * src_step); + float16x8_t src_data_31 = vld1q_f16(src_data + 25 * src_step); + float16x8_t src_data_32 = vld1q_f16(src_data + 26 * src_step); + float16x8_t src_data_33 = vld1q_f16(src_data + 27 * src_step); + float16x8_t src_data_34 = vld1q_f16(src_data + 28 * src_step); + float16x8_t src_data_35 = vld1q_f16(src_data + 29 * src_step); + float16x8_t src_data_36 = vld1q_f16(src_data + 30 * src_step); + float16x8_t src_data_37 = vld1q_f16(src_data + 31 * src_step); + float16x8_t src_data_40 = vld1q_f16(src_data + 32 * src_step); + float16x8_t src_data_41 = vld1q_f16(src_data + 33 * src_step); + float16x8_t src_data_42 = vld1q_f16(src_data + 34 * src_step); + float16x8_t src_data_43 = vld1q_f16(src_data + 35 * src_step); + float16x8_t src_data_44 = vld1q_f16(src_data + 36 * src_step); + float16x8_t src_data_45 = vld1q_f16(src_data + 37 * src_step); + float16x8_t src_data_46 = vld1q_f16(src_data + 38 * src_step); + float16x8_t src_data_47 = vld1q_f16(src_data + 39 * src_step); + float16x8_t src_data_50 = vld1q_f16(src_data + 40 * src_step); + float16x8_t src_data_51 = vld1q_f16(src_data + 41 * src_step); + float16x8_t src_data_52 = vld1q_f16(src_data + 42 * src_step); + float16x8_t src_data_53 = vld1q_f16(src_data + 43 * src_step); + float16x8_t src_data_54 = vld1q_f16(src_data + 44 * src_step); + float16x8_t src_data_55 = vld1q_f16(src_data + 45 * src_step); + float16x8_t src_data_56 = vld1q_f16(src_data + 46 * src_step); + float16x8_t src_data_57 = vld1q_f16(src_data + 47 * src_step); + float16x8_t src_data_60 = vld1q_f16(src_data + 48 * src_step); + float16x8_t src_data_61 = vld1q_f16(src_data + 49 * src_step); + 
float16x8_t src_data_62 = vld1q_f16(src_data + 50 * src_step); + float16x8_t src_data_63 = vld1q_f16(src_data + 51 * src_step); + float16x8_t src_data_64 = vld1q_f16(src_data + 52 * src_step); + float16x8_t src_data_65 = vld1q_f16(src_data + 53 * src_step); + float16x8_t src_data_66 = vld1q_f16(src_data + 54 * src_step); + float16x8_t src_data_67 = vld1q_f16(src_data + 55 * src_step); + float16x8_t src_data_70 = vld1q_f16(src_data + 56 * src_step); + float16x8_t src_data_71 = vld1q_f16(src_data + 57 * src_step); + float16x8_t src_data_72 = vld1q_f16(src_data + 58 * src_step); + float16x8_t src_data_73 = vld1q_f16(src_data + 59 * src_step); + float16x8_t src_data_74 = vld1q_f16(src_data + 60 * src_step); + float16x8_t src_data_75 = vld1q_f16(src_data + 61 * src_step); + float16x8_t src_data_76 = vld1q_f16(src_data + 62 * src_step); + float16x8_t src_data_77 = vld1q_f16(src_data + 63 * src_step); + + float16x8_t d01 = vsubq_f16(src_data_10, src_data_20); + float16x8_t d02 = vsubq_f16(src_data_11, src_data_21); + float16x8_t d03 = vsubq_f16(src_data_12, src_data_22); + float16x8_t d04 = vsubq_f16(src_data_13, src_data_23); + float16x8_t d05 = vsubq_f16(src_data_14, src_data_24); + float16x8_t d06 = vsubq_f16(src_data_15, src_data_25); + float16x8_t d07 = vsubq_f16(src_data_16, src_data_26); + float16x8_t d08 = vsubq_f16(src_data_17, src_data_27); + + float16x8_t d11 = vsubq_f16(src_data_30, src_data_40); + float16x8_t d12 = vsubq_f16(src_data_31, src_data_41); + float16x8_t d13 = vsubq_f16(src_data_32, src_data_42); + float16x8_t d14 = vsubq_f16(src_data_33, src_data_43); + float16x8_t d15 = vsubq_f16(src_data_34, src_data_44); + float16x8_t d16 = vsubq_f16(src_data_35, src_data_45); + float16x8_t d17 = vsubq_f16(src_data_36, src_data_46); + float16x8_t d18 = vsubq_f16(src_data_37, src_data_47); + + float16x8_t d21 = vsubq_f16(src_data_50, src_data_60); + float16x8_t d22 = vsubq_f16(src_data_51, src_data_61); + float16x8_t d23 = vsubq_f16(src_data_52, src_data_62); + 
float16x8_t d24 = vsubq_f16(src_data_53, src_data_63); + float16x8_t d25 = vsubq_f16(src_data_54, src_data_64); + float16x8_t d26 = vsubq_f16(src_data_55, src_data_65); + float16x8_t d27 = vsubq_f16(src_data_56, src_data_66); + float16x8_t d28 = vsubq_f16(src_data_57, src_data_67); + + float16x8_t d31 = vaddq_f16(src_data_10, src_data_20); + float16x8_t d32 = vaddq_f16(src_data_11, src_data_21); + float16x8_t d33 = vaddq_f16(src_data_12, src_data_22); + float16x8_t d34 = vaddq_f16(src_data_13, src_data_23); + float16x8_t d35 = vaddq_f16(src_data_14, src_data_24); + float16x8_t d36 = vaddq_f16(src_data_15, src_data_25); + float16x8_t d37 = vaddq_f16(src_data_16, src_data_26); + float16x8_t d38 = vaddq_f16(src_data_17, src_data_27); + + float16x8_t d41 = vaddq_f16(src_data_30, src_data_40); + float16x8_t d42 = vaddq_f16(src_data_31, src_data_41); + float16x8_t d43 = vaddq_f16(src_data_32, src_data_42); + float16x8_t d44 = vaddq_f16(src_data_33, src_data_43); + float16x8_t d45 = vaddq_f16(src_data_34, src_data_44); + float16x8_t d46 = vaddq_f16(src_data_35, src_data_45); + float16x8_t d47 = vaddq_f16(src_data_36, src_data_46); + float16x8_t d48 = vaddq_f16(src_data_37, src_data_47); + + float16x8_t d51 = vaddq_f16(src_data_50, src_data_60); + float16x8_t d52 = vaddq_f16(src_data_51, src_data_61); + float16x8_t d53 = vaddq_f16(src_data_52, src_data_62); + float16x8_t d54 = vaddq_f16(src_data_53, src_data_63); + float16x8_t d55 = vaddq_f16(src_data_54, src_data_64); + float16x8_t d56 = vaddq_f16(src_data_55, src_data_65); + float16x8_t d57 = vaddq_f16(src_data_56, src_data_66); + float16x8_t d58 = vaddq_f16(src_data_57, src_data_67); + + float16x8_t t00 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_00, src_data_10), src_data_20), src_data_30), src_data_40), + src_data_50), + src_data_60); + float16x8_t t01 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_01, src_data_11), src_data_21), src_data_31), 
src_data_41), + src_data_51), + src_data_61); + float16x8_t t02 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_02, src_data_12), src_data_22), src_data_32), src_data_42), + src_data_52), + src_data_62); + float16x8_t t03 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_03, src_data_13), src_data_23), src_data_33), src_data_43), + src_data_53), + src_data_63); + float16x8_t t04 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_04, src_data_14), src_data_24), src_data_34), src_data_44), + src_data_54), + src_data_64); + float16x8_t t05 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_05, src_data_15), src_data_25), src_data_35), src_data_45), + src_data_55), + src_data_65); + float16x8_t t06 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_06, src_data_16), src_data_26), src_data_36), src_data_46), + src_data_56), + src_data_66); + float16x8_t t07 = vaddq_f16( + vaddq_f16( + vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_07, src_data_17), src_data_27), src_data_37), src_data_47), + src_data_57), + src_data_67); + + float16x8_t t10 = vaddq_f16(vaddq_f16(vmulq_n_f16(d01, 0.5), d11), vmulq_n_f16(d21, 1.5)); + float16x8_t t11 = vaddq_f16(vaddq_f16(vmulq_n_f16(d02, 0.5), d12), vmulq_n_f16(d22, 1.5)); + float16x8_t t12 = vaddq_f16(vaddq_f16(vmulq_n_f16(d03, 0.5), d13), vmulq_n_f16(d23, 1.5)); + float16x8_t t13 = vaddq_f16(vaddq_f16(vmulq_n_f16(d04, 0.5), d14), vmulq_n_f16(d24, 1.5)); + float16x8_t t14 = vaddq_f16(vaddq_f16(vmulq_n_f16(d05, 0.5), d15), vmulq_n_f16(d25, 1.5)); + float16x8_t t15 = vaddq_f16(vaddq_f16(vmulq_n_f16(d06, 0.5), d16), vmulq_n_f16(d26, 1.5)); + float16x8_t t16 = vaddq_f16(vaddq_f16(vmulq_n_f16(d07, 0.5), d17), vmulq_n_f16(d27, 1.5)); + float16x8_t t17 = vaddq_f16(vaddq_f16(vmulq_n_f16(d08, 0.5), d18), vmulq_n_f16(d28, 1.5)); + + float16x8_t t20 = vaddq_f16(vaddq_f16(vmulq_n_f16(d31, 0.25), d41), 
vmulq_n_f16(d51, 2.25)); + float16x8_t t21 = vaddq_f16(vaddq_f16(vmulq_n_f16(d32, 0.25), d42), vmulq_n_f16(d52, 2.25)); + float16x8_t t22 = vaddq_f16(vaddq_f16(vmulq_n_f16(d33, 0.25), d43), vmulq_n_f16(d53, 2.25)); + float16x8_t t23 = vaddq_f16(vaddq_f16(vmulq_n_f16(d34, 0.25), d44), vmulq_n_f16(d54, 2.25)); + float16x8_t t24 = vaddq_f16(vaddq_f16(vmulq_n_f16(d35, 0.25), d45), vmulq_n_f16(d55, 2.25)); + float16x8_t t25 = vaddq_f16(vaddq_f16(vmulq_n_f16(d36, 0.25), d46), vmulq_n_f16(d56, 2.25)); + float16x8_t t26 = vaddq_f16(vaddq_f16(vmulq_n_f16(d37, 0.25), d47), vmulq_n_f16(d57, 2.25)); + float16x8_t t27 = vaddq_f16(vaddq_f16(vmulq_n_f16(d38, 0.25), d48), vmulq_n_f16(d58, 2.25)); + + float16x8_t t30 = vaddq_f16(vaddq_f16(vmulq_n_f16(d01, 0.125), d11), vmulq_n_f16(d21, 3.375)); + float16x8_t t31 = vaddq_f16(vaddq_f16(vmulq_n_f16(d02, 0.125), d12), vmulq_n_f16(d22, 3.375)); + float16x8_t t32 = vaddq_f16(vaddq_f16(vmulq_n_f16(d03, 0.125), d13), vmulq_n_f16(d23, 3.375)); + float16x8_t t33 = vaddq_f16(vaddq_f16(vmulq_n_f16(d04, 0.125), d14), vmulq_n_f16(d24, 3.375)); + float16x8_t t34 = vaddq_f16(vaddq_f16(vmulq_n_f16(d05, 0.125), d15), vmulq_n_f16(d25, 3.375)); + float16x8_t t35 = vaddq_f16(vaddq_f16(vmulq_n_f16(d06, 0.125), d16), vmulq_n_f16(d26, 3.375)); + float16x8_t t36 = vaddq_f16(vaddq_f16(vmulq_n_f16(d07, 0.125), d17), vmulq_n_f16(d27, 3.375)); + float16x8_t t37 = vaddq_f16(vaddq_f16(vmulq_n_f16(d08, 0.125), d18), vmulq_n_f16(d28, 3.375)); + + float16x8_t t40 = vaddq_f16(vaddq_f16(vmulq_n_f16(d31, 0.0625), d41), vmulq_n_f16(d51, 5.0625)); + float16x8_t t41 = vaddq_f16(vaddq_f16(vmulq_n_f16(d32, 0.0625), d42), vmulq_n_f16(d52, 5.0625)); + float16x8_t t42 = vaddq_f16(vaddq_f16(vmulq_n_f16(d33, 0.0625), d43), vmulq_n_f16(d53, 5.0625)); + float16x8_t t43 = vaddq_f16(vaddq_f16(vmulq_n_f16(d34, 0.0625), d44), vmulq_n_f16(d54, 5.0625)); + float16x8_t t44 = vaddq_f16(vaddq_f16(vmulq_n_f16(d35, 0.0625), d45), vmulq_n_f16(d55, 5.0625)); + float16x8_t t45 = 
vaddq_f16(vaddq_f16(vmulq_n_f16(d36, 0.0625), d46), vmulq_n_f16(d56, 5.0625)); + float16x8_t t46 = vaddq_f16(vaddq_f16(vmulq_n_f16(d37, 0.0625), d47), vmulq_n_f16(d57, 5.0625)); + float16x8_t t47 = vaddq_f16(vaddq_f16(vmulq_n_f16(d38, 0.0625), d48), vmulq_n_f16(d58, 5.0625)); + + float16x8_t t50 = vaddq_f16(vaddq_f16(vmulq_n_f16(d01, 0.03125), d11), vmulq_n_f16(d21, 7.59375)); + float16x8_t t51 = vaddq_f16(vaddq_f16(vmulq_n_f16(d02, 0.03125), d12), vmulq_n_f16(d22, 7.59375)); + float16x8_t t52 = vaddq_f16(vaddq_f16(vmulq_n_f16(d03, 0.03125), d13), vmulq_n_f16(d23, 7.59375)); + float16x8_t t53 = vaddq_f16(vaddq_f16(vmulq_n_f16(d04, 0.03125), d14), vmulq_n_f16(d24, 7.59375)); + float16x8_t t54 = vaddq_f16(vaddq_f16(vmulq_n_f16(d05, 0.03125), d15), vmulq_n_f16(d25, 7.59375)); + float16x8_t t55 = vaddq_f16(vaddq_f16(vmulq_n_f16(d06, 0.03125), d16), vmulq_n_f16(d26, 7.59375)); + float16x8_t t56 = vaddq_f16(vaddq_f16(vmulq_n_f16(d07, 0.03125), d17), vmulq_n_f16(d27, 7.59375)); + float16x8_t t57 = vaddq_f16(vaddq_f16(vmulq_n_f16(d08, 0.03125), d18), vmulq_n_f16(d28, 7.59375)); + + float16x8_t t60 = + vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d31, 0.015625), d41), vmulq_n_f16(d51, 11.390625)), src_data_70); + float16x8_t t61 = + vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d32, 0.015625), d42), vmulq_n_f16(d52, 11.390625)), src_data_71); + float16x8_t t62 = + vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d33, 0.015625), d43), vmulq_n_f16(d53, 11.390625)), src_data_72); + float16x8_t t63 = + vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d34, 0.015625), d44), vmulq_n_f16(d54, 11.390625)), src_data_73); + float16x8_t t64 = + vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d35, 0.015625), d45), vmulq_n_f16(d55, 11.390625)), src_data_74); + float16x8_t t65 = + vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d36, 0.015625), d46), vmulq_n_f16(d56, 11.390625)), src_data_75); + float16x8_t t66 = + vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d37, 0.015625), d47), vmulq_n_f16(d57, 11.390625)), src_data_76); 
+ float16x8_t t67 = + vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d38, 0.015625), d48), vmulq_n_f16(d58, 11.390625)), src_data_77); + + float16x8_t s11 = vsubq_f16(t01, t02); + float16x8_t s12 = vsubq_f16(t11, t12); + float16x8_t s13 = vsubq_f16(t21, t22); + float16x8_t s14 = vsubq_f16(t31, t32); + float16x8_t s15 = vsubq_f16(t41, t42); + float16x8_t s16 = vsubq_f16(t51, t52); + float16x8_t s17 = vsubq_f16(t61, t62); + + float16x8_t s21 = vsubq_f16(t03, t04); + float16x8_t s22 = vsubq_f16(t13, t14); + float16x8_t s23 = vsubq_f16(t23, t24); + float16x8_t s24 = vsubq_f16(t33, t34); + float16x8_t s25 = vsubq_f16(t43, t44); + float16x8_t s26 = vsubq_f16(t53, t54); + float16x8_t s27 = vsubq_f16(t63, t64); + + float16x8_t s31 = vsubq_f16(t05, t06); + float16x8_t s32 = vsubq_f16(t15, t16); + float16x8_t s33 = vsubq_f16(t25, t26); + float16x8_t s34 = vsubq_f16(t35, t36); + float16x8_t s35 = vsubq_f16(t45, t46); + float16x8_t s36 = vsubq_f16(t55, t56); + float16x8_t s37 = vsubq_f16(t65, t66); + + float16x8_t s41 = vaddq_f16(t01, t02); + float16x8_t s42 = vaddq_f16(t11, t12); + float16x8_t s43 = vaddq_f16(t21, t22); + float16x8_t s44 = vaddq_f16(t31, t32); + float16x8_t s45 = vaddq_f16(t41, t42); + float16x8_t s46 = vaddq_f16(t51, t52); + float16x8_t s47 = vaddq_f16(t61, t62); + + float16x8_t s51 = vaddq_f16(t03, t04); + float16x8_t s52 = vaddq_f16(t13, t14); + float16x8_t s53 = vaddq_f16(t23, t24); + float16x8_t s54 = vaddq_f16(t33, t34); + float16x8_t s55 = vaddq_f16(t43, t44); + float16x8_t s56 = vaddq_f16(t53, t54); + float16x8_t s57 = vaddq_f16(t63, t64); + + float16x8_t s61 = vaddq_f16(t05, t06); + float16x8_t s62 = vaddq_f16(t15, t16); + float16x8_t s63 = vaddq_f16(t25, t26); + float16x8_t s64 = vaddq_f16(t35, t36); + float16x8_t s65 = vaddq_f16(t45, t46); + float16x8_t s66 = vaddq_f16(t55, t56); + float16x8_t s67 = vaddq_f16(t65, t66); + + float16x8_t m00 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t00, t01), t02), t03), t04), t05), t06); + 
float16x8_t m01 = vaddq_f16(vaddq_f16(vmulq_n_f16(s11, 0.5), s21), vmulq_n_f16(s31, 1.5)); + float16x8_t m02 = vaddq_f16(vaddq_f16(vmulq_n_f16(s41, 0.25), s51), vmulq_n_f16(s61, 2.25)); + float16x8_t m03 = vaddq_f16(vaddq_f16(vmulq_n_f16(s11, 0.125), s21), vmulq_n_f16(s31, 3.375)); + float16x8_t m04 = vaddq_f16(vaddq_f16(vmulq_n_f16(s41, 0.0625), s51), vmulq_n_f16(s61, 5.0625)); + float16x8_t m05 = vaddq_f16(vaddq_f16(vmulq_n_f16(s11, 0.03125), s21), vmulq_n_f16(s31, 7.59375)); + float16x8_t m06 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s41, 0.015625), s51), vmulq_n_f16(s61, 11.390625)), t07); + + float16x8_t m10 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t10, t11), t12), t13), t14), t15), t16); + float16x8_t m11 = vaddq_f16(vaddq_f16(vmulq_n_f16(s12, 0.5), s22), vmulq_n_f16(s32, 1.5)); + float16x8_t m12 = vaddq_f16(vaddq_f16(vmulq_n_f16(s42, 0.25), s52), vmulq_n_f16(s62, 2.25)); + float16x8_t m13 = vaddq_f16(vaddq_f16(vmulq_n_f16(s12, 0.125), s22), vmulq_n_f16(s32, 3.375)); + float16x8_t m14 = vaddq_f16(vaddq_f16(vmulq_n_f16(s42, 0.0625), s52), vmulq_n_f16(s62, 5.0625)); + float16x8_t m15 = vaddq_f16(vaddq_f16(vmulq_n_f16(s12, 0.03125), s22), vmulq_n_f16(s32, 7.59375)); + float16x8_t m16 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s42, 0.015625), s52), vmulq_n_f16(s62, 11.390625)), t17); + + float16x8_t m20 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t20, t21), t22), t23), t24), t25), t26); + float16x8_t m21 = vaddq_f16(vaddq_f16(vmulq_n_f16(s13, 0.5), s23), vmulq_n_f16(s33, 1.5)); + float16x8_t m22 = vaddq_f16(vaddq_f16(vmulq_n_f16(s43, 0.25), s53), vmulq_n_f16(s63, 2.25)); + float16x8_t m23 = vaddq_f16(vaddq_f16(vmulq_n_f16(s13, 0.125), s23), vmulq_n_f16(s33, 3.375)); + float16x8_t m24 = vaddq_f16(vaddq_f16(vmulq_n_f16(s43, 0.0625), s53), vmulq_n_f16(s63, 5.0625)); + float16x8_t m25 = vaddq_f16(vaddq_f16(vmulq_n_f16(s13, 0.03125), s23), vmulq_n_f16(s33, 7.59375)); + float16x8_t m26 = 
vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s43, 0.015625), s53), vmulq_n_f16(s63, 11.390625)), t27); + + float16x8_t m30 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t30, t31), t32), t33), t34), t35), t36); + float16x8_t m31 = vaddq_f16(vaddq_f16(vmulq_n_f16(s14, 0.5), s24), vmulq_n_f16(s34, 1.5)); + float16x8_t m32 = vaddq_f16(vaddq_f16(vmulq_n_f16(s44, 0.25), s54), vmulq_n_f16(s64, 2.25)); + float16x8_t m33 = vaddq_f16(vaddq_f16(vmulq_n_f16(s14, 0.125), s24), vmulq_n_f16(s34, 3.375)); + float16x8_t m34 = vaddq_f16(vaddq_f16(vmulq_n_f16(s44, 0.0625), s54), vmulq_n_f16(s64, 5.0625)); + float16x8_t m35 = vaddq_f16(vaddq_f16(vmulq_n_f16(s14, 0.03125), s24), vmulq_n_f16(s34, 7.59375)); + float16x8_t m36 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s44, 0.015625), s54), vmulq_n_f16(s64, 11.390625)), t37); + + float16x8_t m40 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t40, t41), t42), t43), t44), t45), t46); + float16x8_t m41 = vaddq_f16(vaddq_f16(vmulq_n_f16(s15, 0.5), s25), vmulq_n_f16(s35, 1.5)); + float16x8_t m42 = vaddq_f16(vaddq_f16(vmulq_n_f16(s45, 0.25), s55), vmulq_n_f16(s65, 2.25)); + float16x8_t m43 = vaddq_f16(vaddq_f16(vmulq_n_f16(s15, 0.125), s25), vmulq_n_f16(s35, 3.375)); + float16x8_t m44 = vaddq_f16(vaddq_f16(vmulq_n_f16(s45, 0.0625), s55), vmulq_n_f16(s65, 5.0625)); + float16x8_t m45 = vaddq_f16(vaddq_f16(vmulq_n_f16(s15, 0.03125), s25), vmulq_n_f16(s35, 7.59375)); + float16x8_t m46 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s45, 0.015625), s55), vmulq_n_f16(s65, 11.390625)), t47); + + float16x8_t m50 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t50, t51), t52), t53), t54), t55), t56); + float16x8_t m51 = vaddq_f16(vaddq_f16(vmulq_n_f16(s16, 0.5), s26), vmulq_n_f16(s36, 1.5)); + float16x8_t m52 = vaddq_f16(vaddq_f16(vmulq_n_f16(s46, 0.25), s56), vmulq_n_f16(s66, 2.25)); + float16x8_t m53 = vaddq_f16(vaddq_f16(vmulq_n_f16(s16, 0.125), s26), vmulq_n_f16(s36, 3.375)); + float16x8_t m54 = 
vaddq_f16(vaddq_f16(vmulq_n_f16(s46, 0.0625), s56), vmulq_n_f16(s66, 5.0625)); + float16x8_t m55 = vaddq_f16(vaddq_f16(vmulq_n_f16(s16, 0.03125), s26), vmulq_n_f16(s36, 7.59375)); + float16x8_t m56 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s46, 0.015625), s56), vmulq_n_f16(s66, 11.390625)), t57); + + float16x8_t m60 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t60, t61), t62), t63), t64), t65), t66); + float16x8_t m61 = vaddq_f16(vaddq_f16(vmulq_n_f16(s17, 0.5), s27), vmulq_n_f16(s37, 1.5)); + float16x8_t m62 = vaddq_f16(vaddq_f16(vmulq_n_f16(s47, 0.25), s57), vmulq_n_f16(s67, 2.25)); + float16x8_t m63 = vaddq_f16(vaddq_f16(vmulq_n_f16(s17, 0.125), s27), vmulq_n_f16(s37, 3.375)); + float16x8_t m64 = vaddq_f16(vaddq_f16(vmulq_n_f16(s47, 0.0625), s57), vmulq_n_f16(s67, 5.0625)); + float16x8_t m65 = vaddq_f16(vaddq_f16(vmulq_n_f16(s17, 0.03125), s27), vmulq_n_f16(s37, 7.59375)); + float16x8_t m66 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s47, 0.015625), s57), vmulq_n_f16(s67, 11.390625)), t67); + + float16x8_t bias_ptr = vld1q_f16(bias_data); + vst1q_f16(dst_data, vaddq_f16(m00, bias_ptr)); + vst1q_f16(dst_data + C8NUM, vaddq_f16(m01, bias_ptr)); + vst1q_f16(dst_data + 2 * C8NUM, vaddq_f16(m02, bias_ptr)); + vst1q_f16(dst_data + 3 * C8NUM, vaddq_f16(m03, bias_ptr)); + vst1q_f16(dst_data + 4 * C8NUM, vaddq_f16(m04, bias_ptr)); + vst1q_f16(dst_data + 5 * C8NUM, vaddq_f16(m05, bias_ptr)); + vst1q_f16(dst_data + 6 * C8NUM, vaddq_f16(m06, bias_ptr)); + vst1q_f16(dst_data + dst_step * C8NUM, vaddq_f16(m10, bias_ptr)); + vst1q_f16(dst_data + dst_step * C8NUM + C8NUM, vaddq_f16(m11, bias_ptr)); + vst1q_f16(dst_data + dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m12, bias_ptr)); + vst1q_f16(dst_data + dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m13, bias_ptr)); + vst1q_f16(dst_data + dst_step * C8NUM + 4 * C8NUM, vaddq_f16(m14, bias_ptr)); + vst1q_f16(dst_data + dst_step * C8NUM + 5 * C8NUM, vaddq_f16(m15, bias_ptr)); + vst1q_f16(dst_data + dst_step * C8NUM + 6 * 
C8NUM, vaddq_f16(m16, bias_ptr)); + vst1q_f16(dst_data + 2 * dst_step * C8NUM, vaddq_f16(m20, bias_ptr)); + vst1q_f16(dst_data + 2 * dst_step * C8NUM + C8NUM, vaddq_f16(m21, bias_ptr)); + vst1q_f16(dst_data + 2 * dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m22, bias_ptr)); + vst1q_f16(dst_data + 2 * dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m23, bias_ptr)); + vst1q_f16(dst_data + 2 * dst_step * C8NUM + 4 * C8NUM, vaddq_f16(m24, bias_ptr)); + vst1q_f16(dst_data + 2 * dst_step * C8NUM + 5 * C8NUM, vaddq_f16(m25, bias_ptr)); + vst1q_f16(dst_data + 2 * dst_step * C8NUM + 6 * C8NUM, vaddq_f16(m26, bias_ptr)); + vst1q_f16(dst_data + 3 * dst_step * C8NUM, vaddq_f16(m30, bias_ptr)); + vst1q_f16(dst_data + 3 * dst_step * C8NUM + C8NUM, vaddq_f16(m31, bias_ptr)); + vst1q_f16(dst_data + 3 * dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m32, bias_ptr)); + vst1q_f16(dst_data + 3 * dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m33, bias_ptr)); + vst1q_f16(dst_data + 3 * dst_step * C8NUM + 4 * C8NUM, vaddq_f16(m34, bias_ptr)); + vst1q_f16(dst_data + 3 * dst_step * C8NUM + 5 * C8NUM, vaddq_f16(m35, bias_ptr)); + vst1q_f16(dst_data + 3 * dst_step * C8NUM + 6 * C8NUM, vaddq_f16(m36, bias_ptr)); + vst1q_f16(dst_data + 4 * dst_step * C8NUM, vaddq_f16(m40, bias_ptr)); + vst1q_f16(dst_data + 4 * dst_step * C8NUM + C8NUM, vaddq_f16(m41, bias_ptr)); + vst1q_f16(dst_data + 4 * dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m42, bias_ptr)); + vst1q_f16(dst_data + 4 * dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m43, bias_ptr)); + vst1q_f16(dst_data + 4 * dst_step * C8NUM + 4 * C8NUM, vaddq_f16(m44, bias_ptr)); + vst1q_f16(dst_data + 4 * dst_step * C8NUM + 5 * C8NUM, vaddq_f16(m45, bias_ptr)); + vst1q_f16(dst_data + 4 * dst_step * C8NUM + 6 * C8NUM, vaddq_f16(m46, bias_ptr)); + vst1q_f16(dst_data + 5 * dst_step * C8NUM, vaddq_f16(m50, bias_ptr)); + vst1q_f16(dst_data + 5 * dst_step * C8NUM + C8NUM, vaddq_f16(m51, bias_ptr)); + vst1q_f16(dst_data + 5 * dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m52, bias_ptr)); + 
vst1q_f16(dst_data + 5 * dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m53, bias_ptr)); + vst1q_f16(dst_data + 5 * dst_step * C8NUM + 4 * C8NUM, vaddq_f16(m54, bias_ptr)); + vst1q_f16(dst_data + 5 * dst_step * C8NUM + 5 * C8NUM, vaddq_f16(m55, bias_ptr)); + vst1q_f16(dst_data + 5 * dst_step * C8NUM + 6 * C8NUM, vaddq_f16(m56, bias_ptr)); + vst1q_f16(dst_data + 6 * dst_step * C8NUM, vaddq_f16(m60, bias_ptr)); + vst1q_f16(dst_data + 6 * dst_step * C8NUM + C8NUM, vaddq_f16(m61, bias_ptr)); + vst1q_f16(dst_data + 6 * dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m62, bias_ptr)); + vst1q_f16(dst_data + 6 * dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m63, bias_ptr)); + vst1q_f16(dst_data + 6 * dst_step * C8NUM + 4 * C8NUM, vaddq_f16(m64, bias_ptr)); + vst1q_f16(dst_data + 6 * dst_step * C8NUM + 5 * C8NUM, vaddq_f16(m65, bias_ptr)); + vst1q_f16(dst_data + 6 * dst_step * C8NUM + 6 * C8NUM, vaddq_f16(m66, bias_ptr)); +#else + for (int i = 0; i < C8NUM; i++) { + float16_t src_data_00 = src_data[i]; + float16_t src_data_01 = src_data[i + src_step]; + float16_t src_data_02 = src_data[i + 2 * src_step]; + float16_t src_data_03 = src_data[i + 3 * src_step]; + float16_t src_data_04 = src_data[i + 4 * src_step]; + float16_t src_data_05 = src_data[i + 5 * src_step]; + float16_t src_data_06 = src_data[i + 6 * src_step]; + float16_t src_data_07 = src_data[i + 7 * src_step]; + float16_t src_data_10 = src_data[i + 8 * src_step]; + float16_t src_data_11 = src_data[i + 9 * src_step]; + float16_t src_data_12 = src_data[i + 10 * src_step]; + float16_t src_data_13 = src_data[i + 11 * src_step]; + float16_t src_data_14 = src_data[i + 12 * src_step]; + float16_t src_data_15 = src_data[i + 13 * src_step]; + float16_t src_data_16 = src_data[i + 14 * src_step]; + float16_t src_data_17 = src_data[i + 15 * src_step]; + float16_t src_data_20 = src_data[i + 16 * src_step]; + float16_t src_data_21 = src_data[i + 17 * src_step]; + float16_t src_data_22 = src_data[i + 18 * src_step]; + float16_t src_data_23 = 
src_data[i + 19 * src_step]; + float16_t src_data_24 = src_data[i + 20 * src_step]; + float16_t src_data_25 = src_data[i + 21 * src_step]; + float16_t src_data_26 = src_data[i + 22 * src_step]; + float16_t src_data_27 = src_data[i + 23 * src_step]; + float16_t src_data_30 = src_data[i + 24 * src_step]; + float16_t src_data_31 = src_data[i + 25 * src_step]; + float16_t src_data_32 = src_data[i + 26 * src_step]; + float16_t src_data_33 = src_data[i + 27 * src_step]; + float16_t src_data_34 = src_data[i + 28 * src_step]; + float16_t src_data_35 = src_data[i + 29 * src_step]; + float16_t src_data_36 = src_data[i + 30 * src_step]; + float16_t src_data_37 = src_data[i + 31 * src_step]; + float16_t src_data_40 = src_data[i + 32 * src_step]; + float16_t src_data_41 = src_data[i + 33 * src_step]; + float16_t src_data_42 = src_data[i + 34 * src_step]; + float16_t src_data_43 = src_data[i + 35 * src_step]; + float16_t src_data_44 = src_data[i + 36 * src_step]; + float16_t src_data_45 = src_data[i + 37 * src_step]; + float16_t src_data_46 = src_data[i + 38 * src_step]; + float16_t src_data_47 = src_data[i + 39 * src_step]; + float16_t src_data_50 = src_data[i + 40 * src_step]; + float16_t src_data_51 = src_data[i + 41 * src_step]; + float16_t src_data_52 = src_data[i + 42 * src_step]; + float16_t src_data_53 = src_data[i + 43 * src_step]; + float16_t src_data_54 = src_data[i + 44 * src_step]; + float16_t src_data_55 = src_data[i + 45 * src_step]; + float16_t src_data_56 = src_data[i + 46 * src_step]; + float16_t src_data_57 = src_data[i + 47 * src_step]; + float16_t src_data_60 = src_data[i + 48 * src_step]; + float16_t src_data_61 = src_data[i + 49 * src_step]; + float16_t src_data_62 = src_data[i + 50 * src_step]; + float16_t src_data_63 = src_data[i + 51 * src_step]; + float16_t src_data_64 = src_data[i + 52 * src_step]; + float16_t src_data_65 = src_data[i + 53 * src_step]; + float16_t src_data_66 = src_data[i + 54 * src_step]; + float16_t src_data_67 = src_data[i + 55 * 
src_step]; + float16_t src_data_70 = src_data[i + 56 * src_step]; + float16_t src_data_71 = src_data[i + 57 * src_step]; + float16_t src_data_72 = src_data[i + 58 * src_step]; + float16_t src_data_73 = src_data[i + 59 * src_step]; + float16_t src_data_74 = src_data[i + 60 * src_step]; + float16_t src_data_75 = src_data[i + 61 * src_step]; + float16_t src_data_76 = src_data[i + 62 * src_step]; + float16_t src_data_77 = src_data[i + 63 * src_step]; + + float16_t d01 = src_data_10 - src_data_20; + float16_t d02 = src_data_11 - src_data_21; + float16_t d03 = src_data_12 - src_data_22; + float16_t d04 = src_data_13 - src_data_23; + float16_t d05 = src_data_14 - src_data_24; + float16_t d06 = src_data_15 - src_data_25; + float16_t d07 = src_data_16 - src_data_26; + float16_t d08 = src_data_17 - src_data_27; + + float16_t d11 = src_data_30 - src_data_40; + float16_t d12 = src_data_31 - src_data_41; + float16_t d13 = src_data_32 - src_data_42; + float16_t d14 = src_data_33 - src_data_43; + float16_t d15 = src_data_34 - src_data_44; + float16_t d16 = src_data_35 - src_data_45; + float16_t d17 = src_data_36 - src_data_46; + float16_t d18 = src_data_37 - src_data_47; + + float16_t d21 = src_data_50 - src_data_60; + float16_t d22 = src_data_51 - src_data_61; + float16_t d23 = src_data_52 - src_data_62; + float16_t d24 = src_data_53 - src_data_63; + float16_t d25 = src_data_54 - src_data_64; + float16_t d26 = src_data_55 - src_data_65; + float16_t d27 = src_data_56 - src_data_66; + float16_t d28 = src_data_57 - src_data_67; + + float16_t d31 = src_data_10 + src_data_20; + float16_t d32 = src_data_11 + src_data_21; + float16_t d33 = src_data_12 + src_data_22; + float16_t d34 = src_data_13 + src_data_23; + float16_t d35 = src_data_14 + src_data_24; + float16_t d36 = src_data_15 + src_data_25; + float16_t d37 = src_data_16 + src_data_26; + float16_t d38 = src_data_17 + src_data_27; + + float16_t d41 = src_data_30 + src_data_40; + float16_t d42 = src_data_31 + src_data_41; + 
float16_t d43 = src_data_32 + src_data_42; + float16_t d44 = src_data_33 + src_data_43; + float16_t d45 = src_data_34 + src_data_44; + float16_t d46 = src_data_35 + src_data_45; + float16_t d47 = src_data_36 + src_data_46; + float16_t d48 = src_data_37 + src_data_47; + + float16_t d51 = src_data_50 + src_data_60; + float16_t d52 = src_data_51 + src_data_61; + float16_t d53 = src_data_52 + src_data_62; + float16_t d54 = src_data_53 + src_data_63; + float16_t d55 = src_data_54 + src_data_64; + float16_t d56 = src_data_55 + src_data_65; + float16_t d57 = src_data_56 + src_data_66; + float16_t d58 = src_data_57 + src_data_67; + + float16_t t00 = src_data_00 + src_data_10 + src_data_20 + src_data_30 + src_data_40 + src_data_50 + src_data_60; + float16_t t01 = src_data_01 + src_data_11 + src_data_21 + src_data_31 + src_data_41 + src_data_51 + src_data_61; + float16_t t02 = src_data_02 + src_data_12 + src_data_22 + src_data_32 + src_data_42 + src_data_52 + src_data_62; + float16_t t03 = src_data_03 + src_data_13 + src_data_23 + src_data_33 + src_data_43 + src_data_53 + src_data_63; + float16_t t04 = src_data_04 + src_data_14 + src_data_24 + src_data_34 + src_data_44 + src_data_54 + src_data_64; + float16_t t05 = src_data_05 + src_data_15 + src_data_25 + src_data_35 + src_data_45 + src_data_55 + src_data_65; + float16_t t06 = src_data_06 + src_data_16 + src_data_26 + src_data_36 + src_data_46 + src_data_56 + src_data_66; + float16_t t07 = src_data_07 + src_data_17 + src_data_27 + src_data_37 + src_data_47 + src_data_57 + src_data_67; + + const float16_t t10 = 0.5f * d01 + d11 + 1.5f * d21; + const float16_t t11 = 0.5f * d02 + d12 + 1.5f * d22; + const float16_t t12 = 0.5f * d03 + d13 + 1.5f * d23; + const float16_t t13 = 0.5f * d04 + d14 + 1.5f * d24; + const float16_t t14 = 0.5f * d05 + d15 + 1.5f * d25; + const float16_t t15 = 0.5f * d06 + d16 + 1.5f * d26; + const float16_t t16 = 0.5f * d07 + d17 + 1.5f * d27; + const float16_t t17 = 0.5f * d08 + d18 + 1.5f * d28; + + 
const float16_t t20 = 0.25f * d31 + d41 + 2.25f * d51; + const float16_t t21 = 0.25f * d32 + d42 + 2.25f * d52; + const float16_t t22 = 0.25f * d33 + d43 + 2.25f * d53; + const float16_t t23 = 0.25f * d34 + d44 + 2.25f * d54; + const float16_t t24 = 0.25f * d35 + d45 + 2.25f * d55; + const float16_t t25 = 0.25f * d36 + d46 + 2.25f * d56; + const float16_t t26 = 0.25f * d37 + d47 + 2.25f * d57; + const float16_t t27 = 0.25f * d38 + d48 + 2.25f * d58; + + const float16_t t30 = 0.125f * d01 + d11 + 3.375f * d21; + const float16_t t31 = 0.125f * d02 + d12 + 3.375f * d22; + const float16_t t32 = 0.125f * d03 + d13 + 3.375f * d23; + const float16_t t33 = 0.125f * d04 + d14 + 3.375f * d24; + const float16_t t34 = 0.125f * d05 + d15 + 3.375f * d25; + const float16_t t35 = 0.125f * d06 + d16 + 3.375f * d26; + const float16_t t36 = 0.125f * d07 + d17 + 3.375f * d27; + const float16_t t37 = 0.125f * d08 + d18 + 3.375f * d28; + + const float16_t t40 = 0.0625f * d31 + d41 + 5.0625f * d51; + const float16_t t41 = 0.0625f * d32 + d42 + 5.0625f * d52; + const float16_t t42 = 0.0625f * d33 + d43 + 5.0625f * d53; + const float16_t t43 = 0.0625f * d34 + d44 + 5.0625f * d54; + const float16_t t44 = 0.0625f * d35 + d45 + 5.0625f * d55; + const float16_t t45 = 0.0625f * d36 + d46 + 5.0625f * d56; + const float16_t t46 = 0.0625f * d37 + d47 + 5.0625f * d57; + const float16_t t47 = 0.0625f * d38 + d48 + 5.0625f * d58; + + const float16_t t50 = 0.03125f * d01 + d11 + 7.59375f * d21; + const float16_t t51 = 0.03125f * d02 + d12 + 7.59375f * d22; + const float16_t t52 = 0.03125f * d03 + d13 + 7.59375f * d23; + const float16_t t53 = 0.03125f * d04 + d14 + 7.59375f * d24; + const float16_t t54 = 0.03125f * d05 + d15 + 7.59375f * d25; + const float16_t t55 = 0.03125f * d06 + d16 + 7.59375f * d26; + const float16_t t56 = 0.03125f * d07 + d17 + 7.59375f * d27; + const float16_t t57 = 0.03125f * d08 + d18 + 7.59375f * d28; + + const float16_t t60 = 0.015625f * d31 + d41 + 11.390625f * d51 + 
src_data_70; + const float16_t t61 = 0.015625f * d32 + d42 + 11.390625f * d52 + src_data_71; + const float16_t t62 = 0.015625f * d33 + d43 + 11.390625f * d53 + src_data_72; + const float16_t t63 = 0.015625f * d34 + d44 + 11.390625f * d54 + src_data_73; + const float16_t t64 = 0.015625f * d35 + d45 + 11.390625f * d55 + src_data_74; + const float16_t t65 = 0.015625f * d36 + d46 + 11.390625f * d56 + src_data_75; + const float16_t t66 = 0.015625f * d37 + d47 + 11.390625f * d57 + src_data_76; + const float16_t t67 = 0.015625f * d38 + d48 + 11.390625f * d58 + src_data_77; + + float16_t s11 = t01 - t02; + float16_t s12 = t11 - t12; + float16_t s13 = t21 - t22; + float16_t s14 = t31 - t32; + float16_t s15 = t41 - t42; + float16_t s16 = t51 - t52; + float16_t s17 = t61 - t62; + + float16_t s21 = t03 - t04; + float16_t s22 = t13 - t14; + float16_t s23 = t23 - t24; + float16_t s24 = t33 - t34; + float16_t s25 = t43 - t44; + float16_t s26 = t53 - t54; + float16_t s27 = t63 - t64; + + float16_t s31 = t05 - t06; + float16_t s32 = t15 - t16; + float16_t s33 = t25 - t26; + float16_t s34 = t35 - t36; + float16_t s35 = t45 - t46; + float16_t s36 = t55 - t56; + float16_t s37 = t56 - t66; + + float16_t s41 = t01 + t02; + float16_t s42 = t11 + t12; + float16_t s43 = t21 + t22; + float16_t s44 = t31 + t32; + float16_t s45 = t41 + t42; + float16_t s46 = t51 + t52; + float16_t s47 = t61 + t62; + + float16_t s51 = t03 + t04; + float16_t s52 = t13 + t14; + float16_t s53 = t23 + t24; + float16_t s54 = t33 + t34; + float16_t s55 = t43 + t44; + float16_t s56 = t53 + t54; + float16_t s57 = t63 + t64; + + float16_t s61 = t05 + t06; + float16_t s62 = t15 + t16; + float16_t s63 = t25 + t26; + float16_t s64 = t35 + t36; + float16_t s65 = t45 + t46; + float16_t s66 = t55 + t56; + float16_t s67 = t65 + t66; + + float16_t m00 = t00 + t01 + t02 + t03 + t04 + t05 + t06; + const float16_t m01 = 0.5f * s11 + s21 + 1.5f * s31; + const float16_t m02 = 0.25f * s41 + s51 + 2.25f * s61; + const float16_t m03 = 
0.125f * s11 + s21 + 3.375f * s31; + const float16_t m04 = 0.0625f * s41 + s51 + 5.0625f * s61; + const float16_t m05 = 0.03125f * s11 + s21 + 7.59375f * s31; + const float16_t m06 = 0.015625f * s41 + s51 + 11.390625f * s61 + t07; + + float16_t m10 = t10 + t11 + t12 + t13 + t14 + t15 + t16; + const float16_t m11 = 0.5f * s12 + s22 + 1.5f * s32; + const float16_t m12 = 0.25f * s42 + s52 + 2.25f * s62; + const float16_t m13 = 0.125f * s12 + s22 + 3.375f * s32; + const float16_t m14 = 0.0625f * s42 + s52 + 5.0625f * s62; + const float16_t m15 = 0.03125f * s12 + s22 + 7.59375f * s32; + const float16_t m16 = 0.015625f * s42 + s52 + 11.390625f * s62 + t17; + + float16_t m20 = t20 + t21 + t22 + t23 + t24 + t25 + t26; + const float16_t m21 = 0.5f * s13 + s23 + 1.5f * s33; + const float16_t m22 = 0.25f * s43 + s53 + 2.25f * s63; + const float16_t m23 = 0.125f * s13 + s23 + 3.375f * s33; + const float16_t m24 = 0.0625f * s43 + s53 + 5.0625f * s63; + const float16_t m25 = 0.03125f * s13 + s23 + 7.59375f * s33; + const float16_t m26 = 0.015625f * s43 + s53 + 11.390625f * s63 + t27; + + float16_t m30 = t30 + t31 + t32 + t33 + t34 + t35 + t36; + const float16_t m31 = 0.5f * s14 + s24 + 1.5f * s34; + const float16_t m32 = 0.25f * s44 + s54 + 2.25f * s64; + const float16_t m33 = 0.125f * s14 + s24 + 3.375f * s34; + const float16_t m34 = 0.0625f * s44 + s54 + 5.0625f * s64; + const float16_t m35 = 0.03125f * s14 + s24 + 7.59375f * s34; + const float16_t m36 = 0.015625f * s44 + s54 + 11.390625f * s64 + t37; + + float16_t m40 = t40 + t41 + t42 + t43 + t44 + t45 + t46; + const float16_t m41 = 0.5f * s15 + s25 + 1.5f * s35; + const float16_t m42 = 0.25f * s45 + s55 + 2.25f * s65; + const float16_t m43 = 0.125f * s15 + s25 + 3.375f * s35; + const float16_t m44 = 0.0625f * s45 + s55 + 5.0625f * s65; + const float16_t m45 = 0.03125f * s15 + s25 + 7.59375f * s35; + const float16_t m46 = 0.015625f * s45 + s55 + 11.390625f * s65 + t47; + + float16_t m50 = t50 + t51 + t52 + t53 + t54 + t55 + 
t56; + const float16_t m51 = 0.5f * s16 + s26 + 1.5f * s36; + const float16_t m52 = 0.25f * s46 + s56 + 2.25f * s66; + const float16_t m53 = 0.125f * s16 + s26 + 3.375f * s36; + const float16_t m54 = 0.0625f * s46 + s56 + 5.0625f * s66; + const float16_t m55 = 0.03125f * s16 + s26 + 7.59375f * s36; + const float16_t m56 = 0.015625f * s46 + s56 + 11.390625f * s66 + t57; + + float16_t m60 = t60 + t61 + t62 + t63 + t64 + t65 + t66; + const float16_t m61 = 0.5f * s17 + s27 + 1.5f * s37; + const float16_t m62 = 0.25f * s47 + s57 + 2.25f * s67; + const float16_t m63 = 0.125f * s17 + s27 + 3.375f * s37; + const float16_t m64 = 0.0625f * s47 + s57 + 5.0625f * s67; + const float16_t m65 = 0.03125f * s17 + s27 + 7.59375f * s37; + const float16_t m66 = 0.015625f * s47 + s57 + 11.390625f * s67 + t67; + + (dst_data + i)[0] = m00 + bias_data[i]; + (dst_data + i + C8NUM)[0] = m01 + bias_data[i]; + (dst_data + i + 2 * C8NUM)[0] = m02 + bias_data[i]; + (dst_data + i + 3 * C8NUM)[0] = m03 + bias_data[i]; + (dst_data + i + 4 * C8NUM)[0] = m04 + bias_data[i]; + (dst_data + i + 5 * C8NUM)[0] = m05 + bias_data[i]; + (dst_data + i + 6 * C8NUM)[0] = m06 + bias_data[i]; + + (dst_data + i + dst_step * C8NUM)[0] = m10 + bias_data[i]; + (dst_data + i + dst_step * C8NUM + C8NUM)[0] = m11 + bias_data[i]; + (dst_data + i + dst_step * C8NUM + 2 * C8NUM)[0] = m12 + bias_data[i]; + (dst_data + i + dst_step * C8NUM + 3 * C8NUM)[0] = m13 + bias_data[i]; + (dst_data + i + dst_step * C8NUM + 4 * C8NUM)[0] = m14 + bias_data[i]; + (dst_data + i + dst_step * C8NUM + 5 * C8NUM)[0] = m15 + bias_data[i]; + (dst_data + i + dst_step * C8NUM + 6 * C8NUM)[0] = m16 + bias_data[i]; + + (dst_data + i + 2 * dst_step * C8NUM)[0] = m20 + bias_data[i]; + (dst_data + i + 2 * dst_step * C8NUM + C8NUM)[0] = m21 + bias_data[i]; + (dst_data + i + 2 * dst_step * C8NUM + 2 * C8NUM)[0] = m22 + bias_data[i]; + (dst_data + i + 2 * dst_step * C8NUM + 3 * C8NUM)[0] = m23 + bias_data[i]; + (dst_data + i + 2 * dst_step * C8NUM + 4 * 
C8NUM)[0] = m24 + bias_data[i]; + (dst_data + i + 2 * dst_step * C8NUM + 5 * C8NUM)[0] = m25 + bias_data[i]; + (dst_data + i + 2 * dst_step * C8NUM + 6 * C8NUM)[0] = m26 + bias_data[i]; + + (dst_data + i + 3 * dst_step * C8NUM)[0] = m30 + bias_data[i]; + (dst_data + i + 3 * dst_step * C8NUM + C8NUM)[0] = m31 + bias_data[i]; + (dst_data + i + 3 * dst_step * C8NUM + 2 * C8NUM)[0] = m32 + bias_data[i]; + (dst_data + i + 3 * dst_step * C8NUM + 3 * C8NUM)[0] = m33 + bias_data[i]; + (dst_data + i + 3 * dst_step * C8NUM + 4 * C8NUM)[0] = m34 + bias_data[i]; + (dst_data + i + 3 * dst_step * C8NUM + 5 * C8NUM)[0] = m35 + bias_data[i]; + (dst_data + i + 3 * dst_step * C8NUM + 6 * C8NUM)[0] = m36 + bias_data[i]; + + (dst_data + i + 4 * dst_step * C8NUM)[0] = m40 + bias_data[i]; + (dst_data + i + 4 * dst_step * C8NUM + C8NUM)[0] = m41 + bias_data[i]; + (dst_data + i + 4 * dst_step * C8NUM + 2 * C8NUM)[0] = m42 + bias_data[i]; + (dst_data + i + 4 * dst_step * C8NUM + 3 * C8NUM)[0] = m43 + bias_data[i]; + (dst_data + i + 4 * dst_step * C8NUM + 4 * C8NUM)[0] = m44 + bias_data[i]; + (dst_data + i + 4 * dst_step * C8NUM + 5 * C8NUM)[0] = m45 + bias_data[i]; + (dst_data + i + 4 * dst_step * C8NUM + 6 * C8NUM)[0] = m46 + bias_data[i]; + + (dst_data + i + 5 * dst_step * C8NUM)[0] = m50 + bias_data[i]; + (dst_data + i + 5 * dst_step * C8NUM + C8NUM)[0] = m51 + bias_data[i]; + (dst_data + i + 5 * dst_step * C8NUM + 2 * C8NUM)[0] = m52 + bias_data[i]; + (dst_data + i + 5 * dst_step * C8NUM + 3 * C8NUM)[0] = m53 + bias_data[i]; + (dst_data + i + 5 * dst_step * C8NUM + 4 * C8NUM)[0] = m54 + bias_data[i]; + (dst_data + i + 5 * dst_step * C8NUM + 5 * C8NUM)[0] = m55 + bias_data[i]; + (dst_data + i + 5 * dst_step * C8NUM + 6 * C8NUM)[0] = m56 + bias_data[i]; + + (dst_data + i + 6 * dst_step * C8NUM)[0] = m60 + bias_data[i]; + (dst_data + i + 6 * dst_step * C8NUM + C8NUM)[0] = m61 + bias_data[i]; + (dst_data + i + 6 * dst_step * C8NUM + 2 * C8NUM)[0] = m62 + bias_data[i]; + (dst_data + i + 6 * 
dst_step * C8NUM + 3 * C8NUM)[0] = m63 + bias_data[i];
+    (dst_data + i + 6 * dst_step * C8NUM + 4 * C8NUM)[0] = m64 + bias_data[i];
+    (dst_data + i + 6 * dst_step * C8NUM + 5 * C8NUM)[0] = m65 + bias_data[i];
+    (dst_data + i + 6 * dst_step * C8NUM + 6 * C8NUM)[0] = m66 + bias_data[i];
+  }
+#endif
+}
+
+// Selects the fp16 input transform routine for the given input unit size.
+//   input_unit: 4 selects InputTransform4x4UnitFp16, 8 selects InputTransform8x8UnitFp16.
+// Returns NULL (after printing a diagnostic) for any other value, so callers must check the result
+// before calling through it.
+InputTransformUnitFp16Func GetInputTransFuncFp16(int input_unit) {
+  if (input_unit == 4) {
+    return InputTransform4x4UnitFp16;
+  } else if (input_unit == 8) {
+    return InputTransform8x8UnitFp16;
+  } else {
+    // Report the rejected value and terminate the line so the message is not glued to later output.
+    printf("Only support 4 or 8 for input unit, got %d.\n", input_unit);
+    return NULL;
+  }
+}
+
+// Selects the fp16 output transform routine for an (input_unit, output_unit) pair.
+//   (4, 2) and (4, 3) map to OutputTransform4x2UnitFp16 / OutputTransform4x3UnitFp16.
+//   (8, n) is looked up in the outputTransformUnitFp16 table, indexed by output_unit.
+// Returns NULL (after printing a diagnostic) for every other combination, so callers must check the
+// result before calling through it.
+OutputTransformUnitFp16Func GetOutputTransFuncFp16(int input_unit, int output_unit) {
+  if (input_unit == 4 && output_unit == 2) {
+    return OutputTransform4x2UnitFp16;
+  } else if (input_unit == 4 && output_unit == 3) {
+    return OutputTransform4x3UnitFp16;
+  } else if (input_unit == 8 && output_unit >= 2 && output_unit < input_unit) {
+    // Validate output_unit before indexing: an unchecked caller value would read outside the table.
+    // NOTE(review): the 2..7 range assumes the table is indexed directly by output_unit and covers the
+    // OutputTransform8x2UnitFp16..OutputTransform8x7UnitFp16 routines declared in winograd_utils_fp16.h;
+    // confirm against the table definition.
+    return outputTransformUnitFp16[output_unit];
+  } else {
+    printf("Unsupported fp16 output transform: input unit %d, output unit %d.\n", input_unit, output_unit);
+    return NULL;
+  }
+}
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/winograd_utils_fp16.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/winograd_utils_fp16.h
new file mode 100644
index 00000000000..b86f7d23714
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/winograd_utils_fp16.h
@@ -0,0 +1,67 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FP16_WINOGRAD_UTILS_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FP16_WINOGRAD_UTILS_H_
+
+// NOTE(review): this directive had no header name, which is ill-formed. <arm_neon.h> is assumed because the
+// declarations below use float16_t; confirm against the target toolchain.
+#include <arm_neon.h>
+#include "nnacl/conv_parameter.h"
+#include "nnacl/op_base.h"
+
+// Pointer to an fp16 input transform routine; this is the type GetInputTransFuncFp16 returns.
+typedef void (*InputTransformUnitFp16Func)(const float16_t *src_data, float16_t *dst_data, int src_step, int dst_step);
+// Pointer to an fp16 output transform routine; same parameters as the input variant plus a bias_data pointer.
+typedef void (*OutputTransformUnitFp16Func)(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data,
+                                            int src_step, int dst_step);
+
+// C linkage when this header is included from a C++ translation unit.
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Input transform routines; GetInputTransFuncFp16 hands these out for input_unit 4 and 8 respectively.
+void InputTransform4x4UnitFp16(const float16_t *src_data, float16_t *dst_data, int src_step, int dst_step);
+
+void InputTransform8x8UnitFp16(const float16_t *src_data, float16_t *dst_data, int src_step, int dst_step);
+
+// Output transform routines. GetOutputTransFuncFp16 maps (input_unit, output_unit) = (4, 2) and (4, 3) to the
+// 4x2 and 4x3 routines.
+// NOTE(review): the NxM suffix presumably means input unit N / output unit M for the 8xM family too - confirm.
+void OutputTransform4x2UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data,
+                                int src_step, int dst_step);
+
+void OutputTransform4x3UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data,
+                                int src_step, int dst_step);
+
+void OutputTransform8x2UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data,
+                                int src_step, int dst_step);
+
+void OutputTransform8x3UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data,
+                                int src_step, int dst_step);
+
+void OutputTransform8x4UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data,
+                                int src_step, int dst_step);
+
+void OutputTransform8x5UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data,
+                                int src_step, int dst_step);
+
+void OutputTransform8x6UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data,
+                                int src_step, int dst_step);
+
+void OutputTransform8x7UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data,
+                                int src_step, int dst_step);
+
+// Returns the input transform routine for input_unit 4 or 8, and NULL for any other value.
+InputTransformUnitFp16Func GetInputTransFuncFp16(int input_unit);
+
+OutputTransformUnitFp16Func GetOutputTransFuncFp16(int input_unit, int output_unit); + +#ifdef __cplusplus +} +#endif + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FP16_WINOGRAD_UTILS_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv.c index d9f51a8585c..0215e876070 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv.c +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv.c @@ -243,10 +243,9 @@ int Conv1x1Fp32(const float *input_data, const float *weight_data, float *output } // fp32 conv winograd -void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_data, float *output_data, - TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param, - InputTransformUnitFunc input_trans_func, OutputTransformUnitFunc output_trans_func, - GEMM_FUNC_FP32 gemm_func) { +void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_data, TmpBufferAddress *buffer_list, + int task_id, ConvParameter *conv_param, InputTransformUnitFunc input_trans_func, + OutputTransformUnitFunc output_trans_func, GEMM_FUNC_FP32 gemm_func) { int thread_num = conv_param->thread_num_; int input_unit = conv_param->input_unit_; int in_batch = conv_param->input_batch_; diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv.h index d4550ef9f3f..c27854cad8a 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv.h +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv.h @@ -57,10 +57,9 @@ int Conv1x1Fp32(const float *input_data, const float *weight_data, float *output StrassenMatMulParameter matmul_param); // fp32 convolution winograd -void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_data, float *output_data, - TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param, - InputTransformUnitFunc input_trans_func, 
OutputTransformUnitFunc output_trans_func, - GEMM_FUNC_FP32 gemm_func); +void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_data, TmpBufferAddress *buffer_list, + int task_id, ConvParameter *conv_param, InputTransformUnitFunc input_trans_func, + OutputTransformUnitFunc output_trans_func, GEMM_FUNC_FP32 gemm_func); void UnPackWinogradOutput(const float *src, float *dst, int batch, int height, int width, int channel, int output_unit);