From 6a9eead4828a4bf671e76aa04baf48396cb52fb3 Mon Sep 17 00:00:00 2001 From: fuzhiye Date: Fri, 4 Sep 2020 10:59:58 +0800 Subject: [PATCH] 1.change pack && rewrite winograd 2.rewrite fp16 winograd 3.remove useless code --- mindspore/lite/nnacl/fp16/conv_fp16.c | 6 +- mindspore/lite/nnacl/fp16/conv_fp16.h | 3 +- mindspore/lite/nnacl/fp16/matrix_fp16.c | 65 + .../kernel/arm => nnacl}/fp16/matrix_fp16.h | 20 +- .../lite/nnacl/fp16/winograd_transform_fp16.c | 11 +- .../lite/nnacl/fp16/winograd_transform_fp16.h | 6 +- .../lite/nnacl/fp16/winograd_utils_fp16.c | 4300 +------------ .../lite/nnacl/fp16/winograd_utils_fp16.h | 40 +- mindspore/lite/nnacl/fp32/conv.c | 8 +- mindspore/lite/nnacl/fp32/conv.h | 5 +- mindspore/lite/nnacl/matrix_table.c | 507 -- mindspore/lite/nnacl/matrix_table.h | 54 - .../lite/nnacl/minimal_filtering_generator.c | 233 + .../lite/nnacl/minimal_filtering_generator.h | 56 + mindspore/lite/nnacl/pack.c | 7 - mindspore/lite/nnacl/winograd_transform.c | 22 +- mindspore/lite/nnacl/winograd_transform.h | 6 +- mindspore/lite/nnacl/winograd_utils.c | 5703 ++++------------- mindspore/lite/nnacl/winograd_utils.h | 224 +- .../src/runtime/kernel/arm/base/matrix.cc | 86 - .../lite/src/runtime/kernel/arm/base/matrix.h | 77 - .../kernel/arm/fp16/convolution_fp16.cc | 4 +- .../arm/fp16/convolution_winograd_fp16.cc | 222 +- .../arm/fp16/convolution_winograd_fp16.h | 20 +- .../runtime/kernel/arm/fp16/matrix_fp16.cc | 36 - .../runtime/kernel/arm/fp32/convolution.cc | 4 +- .../kernel/arm/fp32/convolution_winograd.cc | 146 +- .../kernel/arm/fp32/convolution_winograd.h | 16 +- mindspore/lite/tools/benchmark/benchmark.h | 15 +- 29 files changed, 1997 insertions(+), 9905 deletions(-) create mode 100644 mindspore/lite/nnacl/fp16/matrix_fp16.c rename mindspore/lite/{src/runtime/kernel/arm => nnacl}/fp16/matrix_fp16.h (58%) delete mode 100644 mindspore/lite/nnacl/matrix_table.c delete mode 100644 mindspore/lite/nnacl/matrix_table.h create mode 100644 
mindspore/lite/nnacl/minimal_filtering_generator.c create mode 100644 mindspore/lite/nnacl/minimal_filtering_generator.h delete mode 100644 mindspore/lite/src/runtime/kernel/arm/base/matrix.cc delete mode 100644 mindspore/lite/src/runtime/kernel/arm/base/matrix.h delete mode 100644 mindspore/lite/src/runtime/kernel/arm/fp16/matrix_fp16.cc diff --git a/mindspore/lite/nnacl/fp16/conv_fp16.c b/mindspore/lite/nnacl/fp16/conv_fp16.c index 04c69c7fb22..693e41020a0 100644 --- a/mindspore/lite/nnacl/fp16/conv_fp16.c +++ b/mindspore/lite/nnacl/fp16/conv_fp16.c @@ -540,7 +540,7 @@ void UnPack3x3Relu6OutputFp16(const float16_t *src, float16_t *dst, int batch, i // fp16 convolution winograd void ConvWinogardFp16(float16_t *input_data, float16_t *trans_weight, const float16_t *bias_data, TmpBufferAddressFp16 *buffer_list, int task_id, ConvParameter *conv_param, - InputTransformUnitFp16Func input_trans_func, OutputTransformUnitFp16Func output_trans_func) { + MatricesFp16 *matrices) { int thread_num = conv_param->thread_num_; int input_unit = conv_param->input_unit_; int in_batch = conv_param->input_batch_; @@ -575,14 +575,14 @@ void ConvWinogardFp16(float16_t *input_data, float16_t *trans_weight, const floa cal_num = cal_num > tile_num ? 
tile_num : cal_num; WinogradInputTransformFp16(input_data + in_batch_offset, trans_input + task_id * trans_input_offset, tmp_data + task_id * tmp_data_offset, cal_num, out_tile_index, out_w_block, conv_param, - input_trans_func); + matrices[2], matrices[3]); // step 3 : gemm IndirectGemmFp16_16x8(gemm_out + task_id * gemm_out_offset, trans_input + task_id * trans_input_offset, trans_weight, NULL, input_unit_square, ic8 * 2, oc8 * C8NUM, output_offset, 1, 1, 0, 0); // step 4 : output transform WinogradOutputTransformFp16(gemm_out + task_id * gemm_out_offset, tmp_out_data + tmp_out_batch_offset, bias_data, - cal_num, out_tile_index, out_w_block, conv_param, output_trans_func); + cal_num, out_tile_index, out_w_block, conv_param, matrices[0], matrices[1]); } } } diff --git a/mindspore/lite/nnacl/fp16/conv_fp16.h b/mindspore/lite/nnacl/fp16/conv_fp16.h index b54624c5ab1..3d9ab6bb2ad 100644 --- a/mindspore/lite/nnacl/fp16/conv_fp16.h +++ b/mindspore/lite/nnacl/fp16/conv_fp16.h @@ -22,6 +22,7 @@ #include "nnacl/fp16/winograd_transform_fp16.h" typedef float16_t *TmpBufferAddressFp16; +typedef float16_t *MatricesFp16; #ifndef ENABLE_NEON void IndirectGemmFp16_16x8(float16_t *output, float16_t *input, float16_t *weight, float16_t *bias, size_t step, @@ -69,7 +70,7 @@ void UnPack3x3Relu6OutputFp16(const float16_t *src, float16_t *dst, int batch, i // fp16 convolution winograd void ConvWinogardFp16(float16_t *input_data, float16_t *trans_weight, const float16_t *bias_data, TmpBufferAddressFp16 *buffer_list, int task_id, ConvParameter *conv_param, - InputTransformUnitFp16Func input_trans_func, OutputTransformUnitFp16Func output_trans_func); + MatricesFp16 *matrices); void UnPackWinogradOutputFp16(const float16_t *src, float16_t *dst, int batch, int height, int width, int channel, int output_unit); diff --git a/mindspore/lite/nnacl/fp16/matrix_fp16.c b/mindspore/lite/nnacl/fp16/matrix_fp16.c new file mode 100644 index 00000000000..feeb76ce49f --- /dev/null +++ 
b/mindspore/lite/nnacl/fp16/matrix_fp16.c @@ -0,0 +1,65 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "nnacl/fp16/matrix_fp16.h" +/* MatrixMultiplyFp16: scalar half-precision matrix product, matrix_c = matrix_a x matrix_b. matrix_a is read as m rows of k elements, matrix_b as k rows of n elements, and matrix_c is written as m rows of n elements; all three are flat row-major arrays. Each output element is accumulated in a float16_t, so the running sum itself is half precision. */ +void MatrixMultiplyFp16(const float16_t *matrix_a, const float16_t *matrix_b, float16_t *matrix_c, int m, int k, + int n) { + int count = 0; /* linear write index into matrix_c */ + for (int h = 0; h < m; h++) { + int h_offset = h * k; /* start of row h inside matrix_a */ + for (int w = 0; w < n; w++) { + float16_t res = 0; + for (int i = 0; i < k; i++) { + res += *(matrix_a + h_offset + i) * *(matrix_b + w + i * n); /* row h of matrix_a times column w of matrix_b (column walked with stride n) */ + } + *(matrix_c + count) = res; + count++; /* outputs are produced in row-major order, so the index just advances by one */ + } + } +} +/* MatrixMultiplyVecFp16: same m x k by k x n product as MatrixMultiplyFp16, but every matrix element is a float16x8_t and the multiply/add are the lane-wise NEON operations vmulq_f16 / vaddq_f16, i.e. eight independent products are computed at once. bias may be NULL; otherwise eight float16_t values are loaded from it once and that vector is added to every output element. */ +void MatrixMultiplyVecFp16(const float16x8_t *matrix_a, const float16x8_t *matrix_b, float16x8_t *matrix_c, + const float16_t *bias, int m, int k, int n) { + if (bias == NULL) { /* no bias: store the plain product */ + int count = 0; + for (int h = 0; h < m; h++) { + int h_offset = h * k; /* start of row h inside matrix_a */ + for (int w = 0; w < n; w++) { + float16x8_t res = vmovq_n_f16(0); /* accumulator with all eight lanes zeroed */ + for (int i = 0; i < k; i++) { + res = vaddq_f16(res, vmulq_f16(matrix_a[h_offset + i], matrix_b[w + i * n])); + } + matrix_c[count] = res; + count++; + } + } + } else { /* bias given: identical loops, with the bias vector added before each store */ + int count = 0; + float16x8_t bias_ptr = vld1q_f16(bias); /* despite the name this is a vector value: the eight halves read from bias */ + for (int h = 0; h < m; h++) { + int h_offset = h * k; + for (int w = 0; w < n; w++) { + float16x8_t res = vmovq_n_f16(0); + for (int i = 0; i < k; i++) { + res = vaddq_f16(res, vmulq_f16(matrix_a[h_offset + i], matrix_b[w + i * n])); + } + matrix_c[count] = vaddq_f16(res, bias_ptr); /* the same bias vector is applied to every output element */ + count++; + } + } + } +} diff --git 
a/mindspore/lite/src/runtime/kernel/arm/fp16/matrix_fp16.h b/mindspore/lite/nnacl/fp16/matrix_fp16.h similarity index 58% rename from mindspore/lite/src/runtime/kernel/arm/fp16/matrix_fp16.h rename to mindspore/lite/nnacl/fp16/matrix_fp16.h index de25029aff6..6834fc8d6c3 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/matrix_fp16.h +++ b/mindspore/lite/nnacl/fp16/matrix_fp16.h @@ -14,14 +14,20 @@ * limitations under the License. */ -#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATRIX_FP16_H_ -#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATRIX_FP16_H_ +#ifndef MINDSPORE_LITE_NNACL_FP16_MATRIX_FP16_H_ +#define MINDSPORE_LITE_NNACL_FP16_MATRIX_FP16_H_ -#include "src/runtime/kernel/arm/base/matrix.h" +#include <arm_neon.h> -namespace mindspore::kernel { -void MatrixMultiplyFp16(const float16_t *matrix_a, const float16_t *matrix_b, float16_t *matrix_c, int m, int k, int n, - bool row); +#ifdef __cplusplus +extern "C" { +#endif +void MatrixMultiplyFp16(const float16_t *matrix_a, const float16_t *matrix_b, float16_t *matrix_c, int m, int k, int n); + +void MatrixMultiplyVecFp16(const float16x8_t *matrix_a, const float16x8_t *matrix_b, float16x8_t *matrix_c, + const float16_t *bias, int m, int k, int n); +#ifdef __cplusplus } +#endif -#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATRIX_FP16_H_ +#endif // MINDSPORE_LITE_NNACL_FP16_MATRIX_FP16_H_ diff --git a/mindspore/lite/nnacl/fp16/winograd_transform_fp16.c b/mindspore/lite/nnacl/fp16/winograd_transform_fp16.c index d683f07ab0e..5ced95a3165 100644 --- a/mindspore/lite/nnacl/fp16/winograd_transform_fp16.c +++ b/mindspore/lite/nnacl/fp16/winograd_transform_fp16.c @@ -569,8 +569,8 @@ void Conv3x3Fp16OutputTransform(const float16_t *gemm_out, float16_t *out_data, // fp16 common winograd void WinogradInputTransformFp16(const float16_t *input_data, float16_t *trans_input, float16_t *tmp_data, int cal_num, - int out_tile_index, int out_w_block_num, ConvParameter *conv_param, - InputTransformUnitFp16Func 
input_trans_func) { + int out_tile_index, int out_w_block_num, ConvParameter *conv_param, float16_t *matrix_b, + float16_t *matrix_bt) { const int tile_num = 16; int input_unit = conv_param->input_unit_; int output_unit = conv_param->output_unit_; @@ -622,7 +622,7 @@ void WinogradInputTransformFp16(const float16_t *input_data, float16_t *trans_in int dst_ic8_offset = dst_plane_offset + ic * tile_num * C8NUM; size_t dst_step = ic8 * C8NUM * tile_num; float16_t *trans_input_ptr = trans_input + dst_ic8_offset; - input_trans_func(tmp_data, trans_input_ptr, C8NUM, dst_step); + GeneralInputTransformUnitFp16(tmp_data, trans_input_ptr, matrix_b, matrix_bt, C8NUM, dst_step, input_unit); } out_tile_index++; } // cal_tile_num loop @@ -630,7 +630,7 @@ void WinogradInputTransformFp16(const float16_t *input_data, float16_t *trans_in void WinogradOutputTransformFp16(const float16_t *gemm_out, float16_t *tmp_out_data, const float16_t *bias_data, int cal_num, int out_tile_index, int output_unit_num, ConvParameter *conv_param, - OutputTransformUnitFp16Func output_trans_func) { + float16_t *matrix_a, float16_t *matrix_at) { int output_unit = conv_param->output_unit_; int output_w = conv_param->output_w_; int output_h = conv_param->output_h_; @@ -655,7 +655,8 @@ void WinogradOutputTransformFp16(const float16_t *gemm_out, float16_t *tmp_out_d const float16_t *src_ptr = gemm_out + src_oc8_offset; const float16_t *bias_ptr = bias_data + j * C8NUM; float16_t *dst_ptr = tmp_out_data + dst_oc8_offset; - output_trans_func(src_ptr, dst_ptr, bias_ptr, C8NUM, output_w_unit_block * output_unit); + GeneralOutputTransformUnitFp16(src_ptr, dst_ptr, bias_ptr, matrix_a, matrix_at, C8NUM, + output_w_unit_block * output_unit, input_unit, output_unit); } out_tile_index++; } diff --git a/mindspore/lite/nnacl/fp16/winograd_transform_fp16.h b/mindspore/lite/nnacl/fp16/winograd_transform_fp16.h index 52f6ae2c823..0ae23b22648 100644 --- a/mindspore/lite/nnacl/fp16/winograd_transform_fp16.h +++ 
b/mindspore/lite/nnacl/fp16/winograd_transform_fp16.h @@ -43,12 +43,12 @@ void Conv3x3Fp16OutputTransform(const float16_t *gemm_out, float16_t *out_data, // fp16 common winograd void WinogradInputTransformFp16(const float16_t *input_data, float16_t *trans_input, float16_t *tmp_data, int cal_num, - int out_tile_index, int out_w_block_num, ConvParameter *conv_param, - InputTransformUnitFp16Func input_trans_func); + int out_tile_index, int out_w_block_num, ConvParameter *conv_param, float16_t *matrix_b, + float16_t *matrix_bt); void WinogradOutputTransformFp16(const float16_t *gemm_out, float16_t *tmp_out_data, const float16_t *bias_data, int cal_num, int out_tile_index, int output_unit_num, ConvParameter *conv_param, - OutputTransformUnitFp16Func output_trans_func); + float16_t *matrix_a, float16_t *matrix_at); #ifdef __cplusplus } #endif diff --git a/mindspore/lite/nnacl/fp16/winograd_utils_fp16.c b/mindspore/lite/nnacl/fp16/winograd_utils_fp16.c index 3414d769ef9..d70789cfac6 100644 --- a/mindspore/lite/nnacl/fp16/winograd_utils_fp16.c +++ b/mindspore/lite/nnacl/fp16/winograd_utils_fp16.c @@ -15,4262 +15,64 @@ */ #include "nnacl/fp16/winograd_utils_fp16.h" +#include "nnacl/fp16/matrix_fp16.h" #define MIN_UNIT 2 #define MAX_UNIT 8 -static OutputTransformUnitFp16Func outputTransformUnitFp16[] = { - NULL, // 0 - NULL, // 1 - OutputTransform8x2UnitFp16, - OutputTransform8x3UnitFp16, - OutputTransform8x4UnitFp16, - OutputTransform8x5UnitFp16, - OutputTransform8x6UnitFp16, - OutputTransform8x7UnitFp16, -}; - -void InputTransform4x4UnitFp16(const float16_t *src_data, float16_t *dst_data, int src_step, int dst_step) { - float16x8_t src_data_00 = vld1q_f16(src_data + 0 * src_step); - float16x8_t src_data_01 = vld1q_f16(src_data + 1 * src_step); - float16x8_t src_data_02 = vld1q_f16(src_data + 2 * src_step); - float16x8_t src_data_03 = vld1q_f16(src_data + 3 * src_step); - float16x8_t src_data_10 = vld1q_f16(src_data + 4 * src_step); - float16x8_t src_data_11 = 
vld1q_f16(src_data + 5 * src_step); - float16x8_t src_data_12 = vld1q_f16(src_data + 6 * src_step); - float16x8_t src_data_13 = vld1q_f16(src_data + 7 * src_step); - float16x8_t src_data_20 = vld1q_f16(src_data + 8 * src_step); - float16x8_t src_data_21 = vld1q_f16(src_data + 9 * src_step); - float16x8_t src_data_22 = vld1q_f16(src_data + 10 * src_step); - float16x8_t src_data_23 = vld1q_f16(src_data + 11 * src_step); - float16x8_t src_data_30 = vld1q_f16(src_data + 12 * src_step); - float16x8_t src_data_31 = vld1q_f16(src_data + 13 * src_step); - float16x8_t src_data_32 = vld1q_f16(src_data + 14 * src_step); - float16x8_t src_data_33 = vld1q_f16(src_data + 15 * src_step); - - float16x8_t t00 = vsubq_f16(src_data_00, vmulq_n_f16(src_data_20, 4)); - float16x8_t t01 = vsubq_f16(src_data_01, vmulq_n_f16(src_data_21, 4)); - float16x8_t t02 = vsubq_f16(src_data_02, vmulq_n_f16(src_data_22, 4)); - float16x8_t t03 = vsubq_f16(src_data_03, vmulq_n_f16(src_data_23, 4)); - - float16x8_t t10 = vaddq_f16(src_data_10, vmulq_n_f16(src_data_20, 2)); - float16x8_t t11 = vaddq_f16(src_data_11, vmulq_n_f16(src_data_21, 2)); - float16x8_t t12 = vaddq_f16(src_data_12, vmulq_n_f16(src_data_22, 2)); - float16x8_t t13 = vaddq_f16(src_data_13, vmulq_n_f16(src_data_23, 2)); - - float16x8_t t20 = vsubq_f16(vmulq_n_f16(src_data_20, 2), src_data_10); - float16x8_t t21 = vsubq_f16(vmulq_n_f16(src_data_21, 2), src_data_11); - float16x8_t t22 = vsubq_f16(vmulq_n_f16(src_data_22, 2), src_data_12); - float16x8_t t23 = vsubq_f16(vmulq_n_f16(src_data_23, 2), src_data_13); - - float16x8_t t30 = vsubq_f16(src_data_30, vmulq_n_f16(src_data_10, 0.25)); - float16x8_t t31 = vsubq_f16(src_data_31, vmulq_n_f16(src_data_11, 0.25)); - float16x8_t t32 = vsubq_f16(src_data_32, vmulq_n_f16(src_data_12, 0.25)); - float16x8_t t33 = vsubq_f16(src_data_33, vmulq_n_f16(src_data_13, 0.25)); - - float16x8_t m00 = vsubq_f16(t00, vmulq_n_f16(t02, 4)); - float16x8_t m01 = vaddq_f16(t01, vmulq_n_f16(t02, 2)); - float16x8_t 
m02 = vsubq_f16(vmulq_n_f16(t02, 2), t01); - float16x8_t m03 = vsubq_f16(t03, vmulq_n_f16(t01, 0.25)); - - float16x8_t m10 = vsubq_f16(t10, vmulq_n_f16(t12, 4)); - float16x8_t m11 = vaddq_f16(t11, vmulq_n_f16(t12, 2)); - float16x8_t m12 = vsubq_f16(vmulq_n_f16(t12, 2), t11); - float16x8_t m13 = vsubq_f16(t13, vmulq_n_f16(t11, 0.25)); - - float16x8_t m20 = vsubq_f16(t20, vmulq_n_f16(t22, 4)); - float16x8_t m21 = vaddq_f16(t21, vmulq_n_f16(t22, 2)); - float16x8_t m22 = vsubq_f16(vmulq_n_f16(t22, 2), t21); - float16x8_t m23 = vsubq_f16(t23, vmulq_n_f16(t21, 0.25)); - - float16x8_t m30 = vsubq_f16(t30, vmulq_n_f16(t32, 4)); - float16x8_t m31 = vaddq_f16(t31, vmulq_n_f16(t32, 2)); - float16x8_t m32 = vsubq_f16(vmulq_n_f16(t32, 2), t31); - float16x8_t m33 = vsubq_f16(t33, vmulq_n_f16(t31, 0.25)); - - vst1_f16(dst_data, vget_low_f16(m00)); - vst1_f16(dst_data + 64, vget_high_f16(m00)); - vst1_f16(dst_data + 1 * dst_step, vget_low_f16(m01)); - vst1_f16(dst_data + 1 * dst_step + 64, vget_high_f16(m01)); - vst1_f16(dst_data + 2 * dst_step, vget_low_f16(m02)); - vst1_f16(dst_data + 2 * dst_step + 64, vget_high_f16(m02)); - vst1_f16(dst_data + 3 * dst_step, vget_low_f16(m03)); - vst1_f16(dst_data + 3 * dst_step + 64, vget_high_f16(m03)); - vst1_f16(dst_data + 4 * dst_step, vget_low_f16(m10)); - vst1_f16(dst_data + 4 * dst_step + 64, vget_high_f16(m10)); - vst1_f16(dst_data + 5 * dst_step, vget_low_f16(m11)); - vst1_f16(dst_data + 5 * dst_step + 64, vget_high_f16(m11)); - vst1_f16(dst_data + 6 * dst_step, vget_low_f16(m12)); - vst1_f16(dst_data + 6 * dst_step + 64, vget_high_f16(m12)); - vst1_f16(dst_data + 7 * dst_step, vget_low_f16(m13)); - vst1_f16(dst_data + 7 * dst_step + 64, vget_high_f16(m13)); - vst1_f16(dst_data + 8 * dst_step, vget_low_f16(m20)); - vst1_f16(dst_data + 8 * dst_step + 64, vget_high_f16(m20)); - vst1_f16(dst_data + 9 * dst_step, vget_low_f16(m21)); - vst1_f16(dst_data + 9 * dst_step + 64, vget_high_f16(m21)); - vst1_f16(dst_data + 10 * dst_step, 
vget_low_f16(m22)); - vst1_f16(dst_data + 10 * dst_step + 64, vget_high_f16(m22)); - vst1_f16(dst_data + 11 * dst_step, vget_low_f16(m23)); - vst1_f16(dst_data + 11 * dst_step + 64, vget_high_f16(m23)); - vst1_f16(dst_data + 12 * dst_step, vget_low_f16(m30)); - vst1_f16(dst_data + 12 * dst_step + 64, vget_high_f16(m30)); - vst1_f16(dst_data + 13 * dst_step, vget_low_f16(m31)); - vst1_f16(dst_data + 13 * dst_step + 64, vget_high_f16(m31)); - vst1_f16(dst_data + 14 * dst_step, vget_low_f16(m32)); - vst1_f16(dst_data + 14 * dst_step + 64, vget_high_f16(m32)); - vst1_f16(dst_data + 15 * dst_step, vget_low_f16(m33)); - vst1_f16(dst_data + 15 * dst_step + 64, vget_high_f16(m33)); -} - -void InputTransform8x8UnitFp16(const float16_t *src_data, float16_t *dst_data, int src_step, int dst_step) { - float16x8_t src_data_00 = vld1q_f16(src_data + 0 * src_step); - float16x8_t src_data_01 = vld1q_f16(src_data + 1 * src_step); - float16x8_t src_data_02 = vld1q_f16(src_data + 2 * src_step); - float16x8_t src_data_03 = vld1q_f16(src_data + 3 * src_step); - float16x8_t src_data_04 = vld1q_f16(src_data + 4 * src_step); - float16x8_t src_data_05 = vld1q_f16(src_data + 5 * src_step); - float16x8_t src_data_06 = vld1q_f16(src_data + 6 * src_step); - float16x8_t src_data_07 = vld1q_f16(src_data + 7 * src_step); - float16x8_t src_data_10 = vld1q_f16(src_data + 8 * src_step); - float16x8_t src_data_11 = vld1q_f16(src_data + 9 * src_step); - float16x8_t src_data_12 = vld1q_f16(src_data + 10 * src_step); - float16x8_t src_data_13 = vld1q_f16(src_data + 11 * src_step); - float16x8_t src_data_14 = vld1q_f16(src_data + 12 * src_step); - float16x8_t src_data_15 = vld1q_f16(src_data + 13 * src_step); - float16x8_t src_data_16 = vld1q_f16(src_data + 14 * src_step); - float16x8_t src_data_17 = vld1q_f16(src_data + 15 * src_step); - float16x8_t src_data_20 = vld1q_f16(src_data + 16 * src_step); - float16x8_t src_data_21 = vld1q_f16(src_data + 17 * src_step); - float16x8_t src_data_22 = 
vld1q_f16(src_data + 18 * src_step); - float16x8_t src_data_23 = vld1q_f16(src_data + 19 * src_step); - float16x8_t src_data_24 = vld1q_f16(src_data + 20 * src_step); - float16x8_t src_data_25 = vld1q_f16(src_data + 21 * src_step); - float16x8_t src_data_26 = vld1q_f16(src_data + 22 * src_step); - float16x8_t src_data_27 = vld1q_f16(src_data + 23 * src_step); - float16x8_t src_data_30 = vld1q_f16(src_data + 24 * src_step); - float16x8_t src_data_31 = vld1q_f16(src_data + 25 * src_step); - float16x8_t src_data_32 = vld1q_f16(src_data + 26 * src_step); - float16x8_t src_data_33 = vld1q_f16(src_data + 27 * src_step); - float16x8_t src_data_34 = vld1q_f16(src_data + 28 * src_step); - float16x8_t src_data_35 = vld1q_f16(src_data + 29 * src_step); - float16x8_t src_data_36 = vld1q_f16(src_data + 30 * src_step); - float16x8_t src_data_37 = vld1q_f16(src_data + 31 * src_step); - float16x8_t src_data_40 = vld1q_f16(src_data + 32 * src_step); - float16x8_t src_data_41 = vld1q_f16(src_data + 33 * src_step); - float16x8_t src_data_42 = vld1q_f16(src_data + 34 * src_step); - float16x8_t src_data_43 = vld1q_f16(src_data + 35 * src_step); - float16x8_t src_data_44 = vld1q_f16(src_data + 36 * src_step); - float16x8_t src_data_45 = vld1q_f16(src_data + 37 * src_step); - float16x8_t src_data_46 = vld1q_f16(src_data + 38 * src_step); - float16x8_t src_data_47 = vld1q_f16(src_data + 39 * src_step); - float16x8_t src_data_50 = vld1q_f16(src_data + 40 * src_step); - float16x8_t src_data_51 = vld1q_f16(src_data + 41 * src_step); - float16x8_t src_data_52 = vld1q_f16(src_data + 42 * src_step); - float16x8_t src_data_53 = vld1q_f16(src_data + 43 * src_step); - float16x8_t src_data_54 = vld1q_f16(src_data + 44 * src_step); - float16x8_t src_data_55 = vld1q_f16(src_data + 45 * src_step); - float16x8_t src_data_56 = vld1q_f16(src_data + 46 * src_step); - float16x8_t src_data_57 = vld1q_f16(src_data + 47 * src_step); - float16x8_t src_data_60 = vld1q_f16(src_data + 48 * src_step); - 
float16x8_t src_data_61 = vld1q_f16(src_data + 49 * src_step); - float16x8_t src_data_62 = vld1q_f16(src_data + 50 * src_step); - float16x8_t src_data_63 = vld1q_f16(src_data + 51 * src_step); - float16x8_t src_data_64 = vld1q_f16(src_data + 52 * src_step); - float16x8_t src_data_65 = vld1q_f16(src_data + 53 * src_step); - float16x8_t src_data_66 = vld1q_f16(src_data + 54 * src_step); - float16x8_t src_data_67 = vld1q_f16(src_data + 55 * src_step); - float16x8_t src_data_70 = vld1q_f16(src_data + 56 * src_step); - float16x8_t src_data_71 = vld1q_f16(src_data + 57 * src_step); - float16x8_t src_data_72 = vld1q_f16(src_data + 58 * src_step); - float16x8_t src_data_73 = vld1q_f16(src_data + 59 * src_step); - float16x8_t src_data_74 = vld1q_f16(src_data + 60 * src_step); - float16x8_t src_data_75 = vld1q_f16(src_data + 61 * src_step); - float16x8_t src_data_76 = vld1q_f16(src_data + 62 * src_step); - float16x8_t src_data_77 = vld1q_f16(src_data + 63 * src_step); - - float16x8_t t00 = vsubq_f16(vaddq_f16(vsubq_f16(src_data_00, vmulq_n_f16(src_data_20, 5.44444444444444444444444445)), - vmulq_n_f16(src_data_40, 6.222222222222)), - vmulq_n_f16(src_data_60, 1.7777777777777)); - float16x8_t t01 = vsubq_f16(vaddq_f16(vsubq_f16(src_data_01, vmulq_n_f16(src_data_21, 5.44444444444444444444444445)), - vmulq_n_f16(src_data_41, 6.222222222222)), - vmulq_n_f16(src_data_61, 1.7777777777777)); - float16x8_t t02 = vsubq_f16(vaddq_f16(vsubq_f16(src_data_02, vmulq_n_f16(src_data_22, 5.44444444444444444444444445)), - vmulq_n_f16(src_data_42, 6.222222222222)), - vmulq_n_f16(src_data_62, 1.7777777777777)); - float16x8_t t03 = vsubq_f16(vaddq_f16(vsubq_f16(src_data_03, vmulq_n_f16(src_data_23, 5.44444444444444444444444445)), - vmulq_n_f16(src_data_43, 6.222222222222)), - vmulq_n_f16(src_data_63, 1.7777777777777)); - float16x8_t t04 = vsubq_f16(vaddq_f16(vsubq_f16(src_data_04, vmulq_n_f16(src_data_24, 5.44444444444444444444444445)), - vmulq_n_f16(src_data_44, 6.222222222222)), - 
vmulq_n_f16(src_data_64, 1.7777777777777)); - float16x8_t t05 = vsubq_f16(vaddq_f16(vsubq_f16(src_data_05, vmulq_n_f16(src_data_25, 5.44444444444444444444444445)), - vmulq_n_f16(src_data_45, 6.222222222222)), - vmulq_n_f16(src_data_65, 1.7777777777777)); - float16x8_t t06 = vsubq_f16(vaddq_f16(vsubq_f16(src_data_06, vmulq_n_f16(src_data_26, 5.44444444444444444444444445)), - vmulq_n_f16(src_data_46, 6.222222222222)), - vmulq_n_f16(src_data_66, 1.7777777777777)); - float16x8_t t07 = vsubq_f16(vaddq_f16(vsubq_f16(src_data_07, vmulq_n_f16(src_data_27, 5.44444444444444444444444445)), - vmulq_n_f16(src_data_47, 6.222222222222)), - vmulq_n_f16(src_data_67, 1.7777777777777)); - - float16x8_t t10 = - vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_10, 1.5), vmulq_n_f16(src_data_20, 3)), - vmulq_n_f16(src_data_30, 2.166666666666666667)), - vmulq_n_f16(src_data_40, 4.333333333333)), - vmulq_n_f16(src_data_50, 0.66666666666)), - vmulq_n_f16(src_data_60, 1.333333333333)); - float16x8_t t11 = - vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_11, 1.5), vmulq_n_f16(src_data_21, 3)), - vmulq_n_f16(src_data_31, 2.166666666666666667)), - vmulq_n_f16(src_data_41, 4.333333333333)), - vmulq_n_f16(src_data_51, 0.66666666666)), - vmulq_n_f16(src_data_61, 1.333333333333)); - float16x8_t t12 = - vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_12, 1.5), vmulq_n_f16(src_data_22, 3)), - vmulq_n_f16(src_data_32, 2.166666666666666667)), - vmulq_n_f16(src_data_42, 4.333333333333)), - vmulq_n_f16(src_data_52, 0.66666666666)), - vmulq_n_f16(src_data_62, 1.333333333333)); - float16x8_t t13 = - vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_13, 1.5), vmulq_n_f16(src_data_23, 3)), - vmulq_n_f16(src_data_33, 2.166666666666666667)), - vmulq_n_f16(src_data_43, 4.333333333333)), - vmulq_n_f16(src_data_53, 0.66666666666)), - vmulq_n_f16(src_data_63, 1.333333333333)); - float16x8_t t14 = - 
vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_14, 1.5), vmulq_n_f16(src_data_24, 3)), - vmulq_n_f16(src_data_34, 2.166666666666666667)), - vmulq_n_f16(src_data_44, 4.333333333333)), - vmulq_n_f16(src_data_54, 0.66666666666)), - vmulq_n_f16(src_data_64, 1.333333333333)); - float16x8_t t15 = - vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_15, 1.5), vmulq_n_f16(src_data_25, 3)), - vmulq_n_f16(src_data_35, 2.166666666666666667)), - vmulq_n_f16(src_data_45, 4.333333333333)), - vmulq_n_f16(src_data_55, 0.66666666666)), - vmulq_n_f16(src_data_65, 1.333333333333)); - float16x8_t t16 = - vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_16, 1.5), vmulq_n_f16(src_data_26, 3)), - vmulq_n_f16(src_data_36, 2.166666666666666667)), - vmulq_n_f16(src_data_46, 4.333333333333)), - vmulq_n_f16(src_data_56, 0.66666666666)), - vmulq_n_f16(src_data_66, 1.333333333333)); - float16x8_t t17 = - vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_17, 1.5), vmulq_n_f16(src_data_27, 3)), - vmulq_n_f16(src_data_37, 2.166666666666666667)), - vmulq_n_f16(src_data_47, 4.333333333333)), - vmulq_n_f16(src_data_57, 0.66666666666)), - vmulq_n_f16(src_data_67, 1.333333333333)); - - float16x8_t t20 = - vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(src_data_10, -1.5), vmulq_n_f16(src_data_20, 3)), - vmulq_n_f16(src_data_30, 2.166666666666666667)), - vmulq_n_f16(src_data_40, 4.333333333333)), - vmulq_n_f16(src_data_50, 0.66666666666)), - vmulq_n_f16(src_data_60, 1.333333333333)); - float16x8_t t21 = - vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(src_data_11, -1.5), vmulq_n_f16(src_data_21, 3)), - vmulq_n_f16(src_data_31, 2.166666666666666667)), - vmulq_n_f16(src_data_41, 4.333333333333)), - vmulq_n_f16(src_data_51, 0.66666666666)), - vmulq_n_f16(src_data_61, 1.333333333333)); - float16x8_t t22 = - vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(src_data_12, -1.5), 
vmulq_n_f16(src_data_22, 3)), - vmulq_n_f16(src_data_32, 2.166666666666666667)), - vmulq_n_f16(src_data_42, 4.333333333333)), - vmulq_n_f16(src_data_52, 0.66666666666)), - vmulq_n_f16(src_data_62, 1.333333333333)); - float16x8_t t23 = - vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(src_data_13, -1.5), vmulq_n_f16(src_data_23, 3)), - vmulq_n_f16(src_data_33, 2.166666666666666667)), - vmulq_n_f16(src_data_43, 4.333333333333)), - vmulq_n_f16(src_data_53, 0.66666666666)), - vmulq_n_f16(src_data_63, 1.333333333333)); - float16x8_t t24 = - vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(src_data_14, -1.5), vmulq_n_f16(src_data_24, 3)), - vmulq_n_f16(src_data_34, 2.166666666666666667)), - vmulq_n_f16(src_data_44, 4.333333333333)), - vmulq_n_f16(src_data_54, 0.66666666666)), - vmulq_n_f16(src_data_64, 1.333333333333)); - float16x8_t t25 = - vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(src_data_15, -1.5), vmulq_n_f16(src_data_25, 3)), - vmulq_n_f16(src_data_35, 2.166666666666666667)), - vmulq_n_f16(src_data_45, 4.333333333333)), - vmulq_n_f16(src_data_55, 0.66666666666)), - vmulq_n_f16(src_data_65, 1.333333333333)); - float16x8_t t26 = - vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(src_data_16, -1.5), vmulq_n_f16(src_data_26, 3)), - vmulq_n_f16(src_data_36, 2.166666666666666667)), - vmulq_n_f16(src_data_46, 4.333333333333)), - vmulq_n_f16(src_data_56, 0.66666666666)), - vmulq_n_f16(src_data_66, 1.333333333333)); - float16x8_t t27 = - vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(src_data_17, -1.5), vmulq_n_f16(src_data_27, 3)), - vmulq_n_f16(src_data_37, 2.166666666666666667)), - vmulq_n_f16(src_data_47, 4.333333333333)), - vmulq_n_f16(src_data_57, 0.66666666666)), - vmulq_n_f16(src_data_67, 1.333333333333)); - - float16x8_t t30 = vsubq_f16(vaddq_f16(vmulq_n_f16(vaddq_f16(src_data_30, src_data_40), 1.3333333333333), - vmulq_n_f16(vaddq_f16(src_data_10, src_data_20), -0.3)), - 
vmulq_n_f16(vaddq_f16(src_data_50, src_data_60), 0.53333333333)); - float16x8_t t31 = vsubq_f16(vaddq_f16(vmulq_n_f16(vaddq_f16(src_data_31, src_data_41), 1.3333333333333), - vmulq_n_f16(vaddq_f16(src_data_11, src_data_21), -0.3)), - vmulq_n_f16(vaddq_f16(src_data_51, src_data_61), 0.53333333333)); - float16x8_t t32 = vsubq_f16(vaddq_f16(vmulq_n_f16(vaddq_f16(src_data_32, src_data_42), 1.3333333333333), - vmulq_n_f16(vaddq_f16(src_data_12, src_data_22), -0.3)), - vmulq_n_f16(vaddq_f16(src_data_52, src_data_62), 0.53333333333)); - float16x8_t t33 = vsubq_f16(vaddq_f16(vmulq_n_f16(vaddq_f16(src_data_33, src_data_43), 1.3333333333333), - vmulq_n_f16(vaddq_f16(src_data_13, src_data_23), -0.3)), - vmulq_n_f16(vaddq_f16(src_data_53, src_data_63), 0.53333333333)); - float16x8_t t34 = vsubq_f16(vaddq_f16(vmulq_n_f16(vaddq_f16(src_data_34, src_data_44), 1.3333333333333), - vmulq_n_f16(vaddq_f16(src_data_14, src_data_24), -0.3)), - vmulq_n_f16(vaddq_f16(src_data_54, src_data_64), 0.53333333333)); - float16x8_t t35 = vsubq_f16(vaddq_f16(vmulq_n_f16(vaddq_f16(src_data_35, src_data_45), 1.3333333333333), - vmulq_n_f16(vaddq_f16(src_data_15, src_data_25), -0.3)), - vmulq_n_f16(vaddq_f16(src_data_55, src_data_65), 0.53333333333)); - float16x8_t t36 = vsubq_f16(vaddq_f16(vmulq_n_f16(vaddq_f16(src_data_36, src_data_46), 1.3333333333333), - vmulq_n_f16(vaddq_f16(src_data_16, src_data_26), -0.3)), - vmulq_n_f16(vaddq_f16(src_data_56, src_data_66), 0.53333333333)); - float16x8_t t37 = vsubq_f16(vaddq_f16(vmulq_n_f16(vaddq_f16(src_data_37, src_data_47), 1.3333333333333), - vmulq_n_f16(vaddq_f16(src_data_17, src_data_27), -0.3)), - vmulq_n_f16(vaddq_f16(src_data_57, src_data_67), 0.53333333333)); - - float16x8_t t40 = vaddq_f16(vaddq_f16(vmulq_n_f16(vsubq_f16(src_data_40, src_data_30), 1.3333333333333), - vmulq_n_f16(vsubq_f16(src_data_10, src_data_20), 0.3)), - vmulq_n_f16(vsubq_f16(src_data_50, src_data_60), 0.53333333333)); - float16x8_t t41 = 
vaddq_f16(vaddq_f16(vmulq_n_f16(vsubq_f16(src_data_41, src_data_31), 1.3333333333333), - vmulq_n_f16(vsubq_f16(src_data_11, src_data_21), 0.3)), - vmulq_n_f16(vsubq_f16(src_data_51, src_data_61), 0.53333333333)); - float16x8_t t42 = vaddq_f16(vaddq_f16(vmulq_n_f16(vsubq_f16(src_data_42, src_data_32), 1.3333333333333), - vmulq_n_f16(vsubq_f16(src_data_12, src_data_22), 0.3)), - vmulq_n_f16(vsubq_f16(src_data_52, src_data_62), 0.53333333333)); - float16x8_t t43 = vaddq_f16(vaddq_f16(vmulq_n_f16(vsubq_f16(src_data_43, src_data_33), 1.3333333333333), - vmulq_n_f16(vsubq_f16(src_data_13, src_data_23), 0.3)), - vmulq_n_f16(vsubq_f16(src_data_53, src_data_63), 0.53333333333)); - float16x8_t t44 = vaddq_f16(vaddq_f16(vmulq_n_f16(vsubq_f16(src_data_44, src_data_34), 1.3333333333333), - vmulq_n_f16(vsubq_f16(src_data_14, src_data_24), 0.3)), - vmulq_n_f16(vsubq_f16(src_data_54, src_data_64), 0.53333333333)); - float16x8_t t45 = vaddq_f16(vaddq_f16(vmulq_n_f16(vsubq_f16(src_data_45, src_data_35), 1.3333333333333), - vmulq_n_f16(vsubq_f16(src_data_15, src_data_25), 0.3)), - vmulq_n_f16(vsubq_f16(src_data_55, src_data_65), 0.53333333333)); - float16x8_t t46 = vaddq_f16(vaddq_f16(vmulq_n_f16(vsubq_f16(src_data_46, src_data_36), 1.3333333333333), - vmulq_n_f16(vsubq_f16(src_data_16, src_data_26), 0.3)), - vmulq_n_f16(vsubq_f16(src_data_56, src_data_66), 0.53333333333)); - float16x8_t t47 = vaddq_f16(vaddq_f16(vmulq_n_f16(vsubq_f16(src_data_47, src_data_37), 1.3333333333333), - vmulq_n_f16(vsubq_f16(src_data_17, src_data_27), 0.3)), - vmulq_n_f16(vsubq_f16(src_data_57, src_data_67), 0.53333333333)); - - float16x8_t t50 = vaddq_f16( - vaddq_f16( - vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_10, 0.03333333), vmulq_n_f16(src_data_20, 0.022222222)), - vmulq_n_f16(src_data_30, 0.1666666666)), - vmulq_n_f16(src_data_40, 0.11111111111)), - vmulq_n_f16(src_data_50, 0.133333333)), - vmulq_n_f16(src_data_60, 0.088888888)); - float16x8_t t51 = vaddq_f16( - vaddq_f16( - 
vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_11, 0.03333333), vmulq_n_f16(src_data_21, 0.022222222)), - vmulq_n_f16(src_data_31, 0.1666666666)), - vmulq_n_f16(src_data_41, 0.11111111111)), - vmulq_n_f16(src_data_51, 0.133333333)), - vmulq_n_f16(src_data_61, 0.088888888)); - float16x8_t t52 = vaddq_f16( - vaddq_f16( - vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_12, 0.03333333), vmulq_n_f16(src_data_22, 0.022222222)), - vmulq_n_f16(src_data_32, 0.1666666666)), - vmulq_n_f16(src_data_42, 0.11111111111)), - vmulq_n_f16(src_data_52, 0.133333333)), - vmulq_n_f16(src_data_62, 0.088888888)); - float16x8_t t53 = vaddq_f16( - vaddq_f16( - vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_13, 0.03333333), vmulq_n_f16(src_data_23, 0.022222222)), - vmulq_n_f16(src_data_33, 0.1666666666)), - vmulq_n_f16(src_data_43, 0.11111111111)), - vmulq_n_f16(src_data_53, 0.133333333)), - vmulq_n_f16(src_data_63, 0.088888888)); - float16x8_t t54 = vaddq_f16( - vaddq_f16( - vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_14, 0.03333333), vmulq_n_f16(src_data_24, 0.022222222)), - vmulq_n_f16(src_data_34, 0.1666666666)), - vmulq_n_f16(src_data_44, 0.11111111111)), - vmulq_n_f16(src_data_54, 0.133333333)), - vmulq_n_f16(src_data_64, 0.088888888)); - float16x8_t t55 = vaddq_f16( - vaddq_f16( - vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_15, 0.03333333), vmulq_n_f16(src_data_25, 0.022222222)), - vmulq_n_f16(src_data_35, 0.1666666666)), - vmulq_n_f16(src_data_45, 0.11111111111)), - vmulq_n_f16(src_data_55, 0.133333333)), - vmulq_n_f16(src_data_65, 0.088888888)); - float16x8_t t56 = vaddq_f16( - vaddq_f16( - vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_16, 0.03333333), vmulq_n_f16(src_data_26, 0.022222222)), - vmulq_n_f16(src_data_36, 0.1666666666)), - vmulq_n_f16(src_data_46, 0.11111111111)), - vmulq_n_f16(src_data_56, 0.133333333)), - vmulq_n_f16(src_data_66, 0.088888888)); - float16x8_t t57 = vaddq_f16( - vaddq_f16( - 
vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_17, 0.03333333), vmulq_n_f16(src_data_27, 0.022222222)), - vmulq_n_f16(src_data_37, 0.1666666666)), - vmulq_n_f16(src_data_47, 0.11111111111)), - vmulq_n_f16(src_data_57, 0.133333333)), - vmulq_n_f16(src_data_67, 0.088888888)); - - float16x8_t t60 = vaddq_f16( - vaddq_f16( - vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(src_data_10, -0.03333333), vmulq_n_f16(src_data_20, 0.022222222)), - vmulq_n_f16(src_data_30, 0.1666666666)), - vmulq_n_f16(src_data_40, 0.11111111111)), - vmulq_n_f16(src_data_50, -0.133333333)), - vmulq_n_f16(src_data_60, 0.088888888)); - float16x8_t t61 = vaddq_f16( - vaddq_f16( - vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(src_data_11, -0.03333333), vmulq_n_f16(src_data_21, 0.022222222)), - vmulq_n_f16(src_data_31, 0.1666666666)), - vmulq_n_f16(src_data_41, 0.11111111111)), - vmulq_n_f16(src_data_51, -0.133333333)), - vmulq_n_f16(src_data_61, 0.088888888)); - float16x8_t t62 = vaddq_f16( - vaddq_f16( - vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(src_data_12, -0.03333333), vmulq_n_f16(src_data_22, 0.022222222)), - vmulq_n_f16(src_data_32, 0.1666666666)), - vmulq_n_f16(src_data_42, 0.11111111111)), - vmulq_n_f16(src_data_52, -0.133333333)), - vmulq_n_f16(src_data_62, 0.088888888)); - float16x8_t t63 = vaddq_f16( - vaddq_f16( - vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(src_data_13, -0.03333333), vmulq_n_f16(src_data_23, 0.022222222)), - vmulq_n_f16(src_data_33, 0.1666666666)), - vmulq_n_f16(src_data_43, 0.11111111111)), - vmulq_n_f16(src_data_53, -0.133333333)), - vmulq_n_f16(src_data_63, 0.088888888)); - float16x8_t t64 = vaddq_f16( - vaddq_f16( - vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(src_data_14, -0.03333333), vmulq_n_f16(src_data_24, 0.022222222)), - vmulq_n_f16(src_data_34, 0.1666666666)), - vmulq_n_f16(src_data_44, 0.11111111111)), - vmulq_n_f16(src_data_54, -0.133333333)), - vmulq_n_f16(src_data_64, 0.088888888)); - float16x8_t t65 = vaddq_f16( - vaddq_f16( - 
vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(src_data_15, -0.03333333), vmulq_n_f16(src_data_25, 0.022222222)), - vmulq_n_f16(src_data_35, 0.1666666666)), - vmulq_n_f16(src_data_45, 0.11111111111)), - vmulq_n_f16(src_data_55, -0.133333333)), - vmulq_n_f16(src_data_65, 0.088888888)); - float16x8_t t66 = vaddq_f16( - vaddq_f16( - vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(src_data_16, -0.03333333), vmulq_n_f16(src_data_26, 0.022222222)), - vmulq_n_f16(src_data_36, 0.1666666666)), - vmulq_n_f16(src_data_46, 0.11111111111)), - vmulq_n_f16(src_data_56, -0.133333333)), - vmulq_n_f16(src_data_66, 0.088888888)); - float16x8_t t67 = vaddq_f16( - vaddq_f16( - vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(src_data_17, -0.03333333), vmulq_n_f16(src_data_27, 0.022222222)), - vmulq_n_f16(src_data_37, 0.1666666666)), - vmulq_n_f16(src_data_47, 0.11111111111)), - vmulq_n_f16(src_data_57, -0.133333333)), - vmulq_n_f16(src_data_67, 0.088888888)); - - float16x8_t t70 = vaddq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_30, 3.0625), vmulq_n_f16(src_data_10, -0.5625)), - vmulq_n_f16(src_data_50, 3.5)), - src_data_70); - float16x8_t t71 = vaddq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_31, 3.0625), vmulq_n_f16(src_data_11, -0.5625)), - vmulq_n_f16(src_data_51, 3.5)), - src_data_71); - float16x8_t t72 = vaddq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_32, 3.0625), vmulq_n_f16(src_data_12, -0.5625)), - vmulq_n_f16(src_data_52, 3.5)), - src_data_72); - float16x8_t t73 = vaddq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_33, 3.0625), vmulq_n_f16(src_data_13, -0.5625)), - vmulq_n_f16(src_data_53, 3.5)), - src_data_73); - float16x8_t t74 = vaddq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_34, 3.0625), vmulq_n_f16(src_data_14, -0.5625)), - vmulq_n_f16(src_data_54, 3.5)), - src_data_74); - float16x8_t t75 = vaddq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_35, 3.0625), vmulq_n_f16(src_data_15, -0.5625)), - vmulq_n_f16(src_data_55, 3.5)), - src_data_75); - float16x8_t t76 = 
vaddq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_36, 3.0625), vmulq_n_f16(src_data_16, -0.5625)), - vmulq_n_f16(src_data_56, 3.5)), - src_data_76); - float16x8_t t77 = vaddq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(src_data_37, 3.0625), vmulq_n_f16(src_data_17, -0.5625)), - vmulq_n_f16(src_data_57, 3.5)), - src_data_77); - - float16x8_t m00 = - vsubq_f16(vaddq_f16(vsubq_f16(t00, vmulq_n_f16(t02, 5.444444444444444)), vmulq_n_f16(t04, 6.22222222222)), - vmulq_n_f16(t06, 1.77777777777777777778)); - float16x8_t m01 = vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t01, 1.5), vmulq_n_f16(t02, 3)), - vmulq_n_f16(t03, 2.16666666666666667)), - vmulq_n_f16(t04, 4.3333333333)), - vmulq_n_f16(t05, 0.66666666667)), - vmulq_n_f16(t06, 1.333333333333)); - float16x8_t m02 = vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(t01, -1.5), vmulq_n_f16(t02, 3)), - vmulq_n_f16(t03, 2.16666666666666667)), - vmulq_n_f16(t04, 4.3333333333)), - vmulq_n_f16(t05, 0.66666666667)), - vmulq_n_f16(t06, 1.333333333333)); - float16x8_t m03 = - vaddq_f16(vaddq_f16(vmulq_n_f16(vaddq_f16(t01, t02), -0.3), vmulq_n_f16(vaddq_f16(t03, t04), 1.33333333333)), - vmulq_n_f16(vaddq_f16(t05, t06), -0.533333333333)); - float16x8_t m04 = - vaddq_f16(vaddq_f16(vmulq_n_f16(vsubq_f16(t01, t02), 0.3), vmulq_n_f16(vsubq_f16(t04, t03), 1.33333333333)), - vmulq_n_f16(vsubq_f16(t05, t06), 0.533333333333)); - float16x8_t m05 = - vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t01, 0.03333333), vmulq_n_f16(t02, 0.0222222)), - vmulq_n_f16(t03, 0.16666666666666667)), - vmulq_n_f16(t04, 0.11111111111)), - vmulq_n_f16(t05, 0.1333333333)), - vmulq_n_f16(t06, 0.08888888888)); - float16x8_t m06 = - vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(t01, -0.03333333), vmulq_n_f16(t02, 0.0222222)), - vmulq_n_f16(t03, 0.16666666666666667)), - vmulq_n_f16(t04, 0.11111111111)), - vmulq_n_f16(t05, 0.1333333333)), - vmulq_n_f16(t06, 0.08888888888)); - float16x8_t m07 = - 
vaddq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t01, -0.5625), vmulq_n_f16(t03, 3.0625)), vmulq_n_f16(t05, 3.5)), t07); - - float16x8_t m10 = - vsubq_f16(vaddq_f16(vsubq_f16(t10, vmulq_n_f16(t12, 5.444444444444444)), vmulq_n_f16(t14, 6.22222222222)), - vmulq_n_f16(t16, 1.77777777777777777778)); - float16x8_t m11 = vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t11, 1.5), vmulq_n_f16(t12, 3)), - vmulq_n_f16(t13, 2.16666666666666667)), - vmulq_n_f16(t14, 4.3333333333)), - vmulq_n_f16(t15, 0.66666666667)), - vmulq_n_f16(t16, 1.333333333333)); - float16x8_t m12 = vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(t11, -1.5), vmulq_n_f16(t12, 3)), - vmulq_n_f16(t13, 2.16666666666666667)), - vmulq_n_f16(t14, 4.3333333333)), - vmulq_n_f16(t15, 0.66666666667)), - vmulq_n_f16(t16, 1.333333333333)); - float16x8_t m13 = - vaddq_f16(vaddq_f16(vmulq_n_f16(vaddq_f16(t11, t12), -0.3), vmulq_n_f16(vaddq_f16(t13, t14), 1.33333333333)), - vmulq_n_f16(vaddq_f16(t15, t16), -0.533333333333)); - float16x8_t m14 = - vaddq_f16(vaddq_f16(vmulq_n_f16(vsubq_f16(t11, t12), 0.3), vmulq_n_f16(vsubq_f16(t14, t13), 1.33333333333)), - vmulq_n_f16(vsubq_f16(t15, t16), 0.533333333333)); - float16x8_t m15 = - vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t11, 0.03333333), vmulq_n_f16(t12, 0.0222222)), - vmulq_n_f16(t13, 0.16666666666666667)), - vmulq_n_f16(t14, 0.11111111111)), - vmulq_n_f16(t15, 0.1333333333)), - vmulq_n_f16(t16, 0.08888888888)); - float16x8_t m16 = - vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(t11, -0.03333333), vmulq_n_f16(t12, 0.0222222)), - vmulq_n_f16(t13, 0.16666666666666667)), - vmulq_n_f16(t14, 0.11111111111)), - vmulq_n_f16(t15, 0.1333333333)), - vmulq_n_f16(t16, 0.08888888888)); - float16x8_t m17 = - vaddq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t11, -0.5625), vmulq_n_f16(t13, 3.0625)), vmulq_n_f16(t15, 3.5)), t17); - - float16x8_t m20 = - vsubq_f16(vaddq_f16(vsubq_f16(t20, vmulq_n_f16(t22, 5.444444444444444)), 
vmulq_n_f16(t24, 6.22222222222)), - vmulq_n_f16(t26, 1.77777777777777777778)); - float16x8_t m21 = vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t21, 1.5), vmulq_n_f16(t22, 3)), - vmulq_n_f16(t23, 2.16666666666666667)), - vmulq_n_f16(t24, 4.3333333333)), - vmulq_n_f16(t25, 0.66666666667)), - vmulq_n_f16(t26, 1.333333333333)); - float16x8_t m22 = vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(t21, -1.5), vmulq_n_f16(t22, 3)), - vmulq_n_f16(t23, 2.16666666666666667)), - vmulq_n_f16(t24, 4.3333333333)), - vmulq_n_f16(t25, 0.66666666667)), - vmulq_n_f16(t26, 1.333333333333)); - float16x8_t m23 = - vaddq_f16(vaddq_f16(vmulq_n_f16(vaddq_f16(t21, t22), -0.3), vmulq_n_f16(vaddq_f16(t23, t24), 1.33333333333)), - vmulq_n_f16(vaddq_f16(t25, t26), -0.533333333333)); - float16x8_t m24 = - vaddq_f16(vaddq_f16(vmulq_n_f16(vsubq_f16(t21, t22), 0.3), vmulq_n_f16(vsubq_f16(t24, t23), 1.33333333333)), - vmulq_n_f16(vsubq_f16(t25, t26), 0.533333333333)); - float16x8_t m25 = - vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t21, 0.03333333), vmulq_n_f16(t22, 0.0222222)), - vmulq_n_f16(t23, 0.16666666666666667)), - vmulq_n_f16(t24, 0.11111111111)), - vmulq_n_f16(t25, 0.1333333333)), - vmulq_n_f16(t26, 0.08888888888)); - float16x8_t m26 = - vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(t21, -0.03333333), vmulq_n_f16(t22, 0.0222222)), - vmulq_n_f16(t23, 0.16666666666666667)), - vmulq_n_f16(t24, 0.11111111111)), - vmulq_n_f16(t25, 0.1333333333)), - vmulq_n_f16(t26, 0.08888888888)); - float16x8_t m27 = - vaddq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t21, -0.5625), vmulq_n_f16(t23, 3.0625)), vmulq_n_f16(t25, 3.5)), t27); - - float16x8_t m30 = - vsubq_f16(vaddq_f16(vsubq_f16(t30, vmulq_n_f16(t32, 5.444444444444444)), vmulq_n_f16(t34, 6.22222222222)), - vmulq_n_f16(t36, 1.77777777777777777778)); - float16x8_t m31 = vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t31, 1.5), vmulq_n_f16(t32, 3)), - vmulq_n_f16(t33, 
2.16666666666666667)), - vmulq_n_f16(t34, 4.3333333333)), - vmulq_n_f16(t35, 0.66666666667)), - vmulq_n_f16(t36, 1.333333333333)); - float16x8_t m32 = vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(t31, -1.5), vmulq_n_f16(t32, 3)), - vmulq_n_f16(t33, 2.16666666666666667)), - vmulq_n_f16(t34, 4.3333333333)), - vmulq_n_f16(t35, 0.66666666667)), - vmulq_n_f16(t36, 1.333333333333)); - float16x8_t m33 = - vaddq_f16(vaddq_f16(vmulq_n_f16(vaddq_f16(t31, t32), -0.3), vmulq_n_f16(vaddq_f16(t33, t34), 1.33333333333)), - vmulq_n_f16(vaddq_f16(t35, t36), -0.533333333333)); - float16x8_t m34 = - vaddq_f16(vaddq_f16(vmulq_n_f16(vsubq_f16(t31, t32), 0.3), vmulq_n_f16(vsubq_f16(t34, t33), 1.33333333333)), - vmulq_n_f16(vsubq_f16(t35, t36), 0.533333333333)); - float16x8_t m35 = - vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t31, 0.03333333), vmulq_n_f16(t32, 0.0222222)), - vmulq_n_f16(t33, 0.16666666666666667)), - vmulq_n_f16(t34, 0.11111111111)), - vmulq_n_f16(t35, 0.1333333333)), - vmulq_n_f16(t36, 0.08888888888)); - float16x8_t m36 = - vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(t31, -0.03333333), vmulq_n_f16(t32, 0.0222222)), - vmulq_n_f16(t33, 0.16666666666666667)), - vmulq_n_f16(t34, 0.11111111111)), - vmulq_n_f16(t35, 0.1333333333)), - vmulq_n_f16(t36, 0.08888888888)); - float16x8_t m37 = - vaddq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t31, -0.5625), vmulq_n_f16(t33, 3.0625)), vmulq_n_f16(t35, 3.5)), t37); - - float16x8_t m40 = - vsubq_f16(vaddq_f16(vsubq_f16(t40, vmulq_n_f16(t42, 5.444444444444444)), vmulq_n_f16(t44, 6.22222222222)), - vmulq_n_f16(t46, 1.77777777777777777778)); - float16x8_t m41 = vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t41, 1.5), vmulq_n_f16(t42, 3)), - vmulq_n_f16(t43, 2.16666666666666667)), - vmulq_n_f16(t44, 4.3333333333)), - vmulq_n_f16(t45, 0.66666666667)), - vmulq_n_f16(t46, 1.333333333333)); - float16x8_t m42 = vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(t41, 
-1.5), vmulq_n_f16(t42, 3)), - vmulq_n_f16(t43, 2.16666666666666667)), - vmulq_n_f16(t44, 4.3333333333)), - vmulq_n_f16(t45, 0.66666666667)), - vmulq_n_f16(t46, 1.333333333333)); - float16x8_t m43 = - vaddq_f16(vaddq_f16(vmulq_n_f16(vaddq_f16(t41, t42), -0.3), vmulq_n_f16(vaddq_f16(t43, t44), 1.33333333333)), - vmulq_n_f16(vaddq_f16(t45, t46), -0.533333333333)); - float16x8_t m44 = - vaddq_f16(vaddq_f16(vmulq_n_f16(vsubq_f16(t41, t42), 0.3), vmulq_n_f16(vsubq_f16(t44, t43), 1.33333333333)), - vmulq_n_f16(vsubq_f16(t45, t46), 0.533333333333)); - float16x8_t m45 = - vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t41, 0.03333333), vmulq_n_f16(t42, 0.0222222)), - vmulq_n_f16(t43, 0.16666666666666667)), - vmulq_n_f16(t44, 0.11111111111)), - vmulq_n_f16(t45, 0.1333333333)), - vmulq_n_f16(t46, 0.08888888888)); - float16x8_t m46 = - vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(t41, -0.03333333), vmulq_n_f16(t42, 0.0222222)), - vmulq_n_f16(t43, 0.16666666666666667)), - vmulq_n_f16(t44, 0.11111111111)), - vmulq_n_f16(t45, 0.1333333333)), - vmulq_n_f16(t46, 0.08888888888)); - float16x8_t m47 = - vaddq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t41, -0.5625), vmulq_n_f16(t43, 3.0625)), vmulq_n_f16(t45, 3.5)), t47); - - float16x8_t m50 = - vsubq_f16(vaddq_f16(vsubq_f16(t50, vmulq_n_f16(t52, 5.444444444444444)), vmulq_n_f16(t54, 6.22222222222)), - vmulq_n_f16(t56, 1.77777777777777777778)); - float16x8_t m51 = vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t51, 1.5), vmulq_n_f16(t52, 3)), - vmulq_n_f16(t53, 2.16666666666666667)), - vmulq_n_f16(t54, 4.3333333333)), - vmulq_n_f16(t55, 0.66666666667)), - vmulq_n_f16(t56, 1.333333333333)); - float16x8_t m52 = vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(t51, -1.5), vmulq_n_f16(t52, 3)), - vmulq_n_f16(t53, 2.16666666666666667)), - vmulq_n_f16(t54, 4.3333333333)), - vmulq_n_f16(t55, 0.66666666667)), - vmulq_n_f16(t56, 1.333333333333)); - float16x8_t m53 = - 
vaddq_f16(vaddq_f16(vmulq_n_f16(vaddq_f16(t51, t52), -0.3), vmulq_n_f16(vaddq_f16(t53, t54), 1.33333333333)), - vmulq_n_f16(vaddq_f16(t55, t56), -0.533333333333)); - float16x8_t m54 = - vaddq_f16(vaddq_f16(vmulq_n_f16(vsubq_f16(t51, t52), 0.3), vmulq_n_f16(vsubq_f16(t54, t53), 1.33333333333)), - vmulq_n_f16(vsubq_f16(t55, t56), 0.533333333333)); - float16x8_t m55 = - vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t51, 0.03333333), vmulq_n_f16(t52, 0.0222222)), - vmulq_n_f16(t53, 0.16666666666666667)), - vmulq_n_f16(t54, 0.11111111111)), - vmulq_n_f16(t55, 0.1333333333)), - vmulq_n_f16(t56, 0.08888888888)); - float16x8_t m56 = - vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(t51, -0.03333333), vmulq_n_f16(t52, 0.0222222)), - vmulq_n_f16(t53, 0.16666666666666667)), - vmulq_n_f16(t54, 0.11111111111)), - vmulq_n_f16(t55, 0.1333333333)), - vmulq_n_f16(t56, 0.08888888888)); - float16x8_t m57 = - vaddq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t51, -0.5625), vmulq_n_f16(t53, 3.0625)), vmulq_n_f16(t55, 3.5)), t57); - - float16x8_t m60 = - vsubq_f16(vaddq_f16(vsubq_f16(t60, vmulq_n_f16(t62, 5.444444444444444)), vmulq_n_f16(t64, 6.22222222222)), - vmulq_n_f16(t66, 1.77777777777777777778)); - float16x8_t m61 = vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t61, 1.5), vmulq_n_f16(t62, 3)), - vmulq_n_f16(t63, 2.16666666666666667)), - vmulq_n_f16(t64, 4.3333333333)), - vmulq_n_f16(t65, 0.66666666667)), - vmulq_n_f16(t66, 1.333333333333)); - float16x8_t m62 = vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(t61, -1.5), vmulq_n_f16(t62, 3)), - vmulq_n_f16(t63, 2.16666666666666667)), - vmulq_n_f16(t64, 4.3333333333)), - vmulq_n_f16(t65, 0.66666666667)), - vmulq_n_f16(t66, 1.333333333333)); - float16x8_t m63 = - vaddq_f16(vaddq_f16(vmulq_n_f16(vaddq_f16(t61, t62), -0.3), vmulq_n_f16(vaddq_f16(t63, t64), 1.33333333333)), - vmulq_n_f16(vaddq_f16(t65, t66), -0.533333333333)); - float16x8_t m64 = - 
vaddq_f16(vaddq_f16(vmulq_n_f16(vsubq_f16(t61, t62), 0.3), vmulq_n_f16(vsubq_f16(t64, t63), 1.33333333333)), - vmulq_n_f16(vsubq_f16(t65, t66), 0.533333333333)); - float16x8_t m65 = - vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t61, 0.03333333), vmulq_n_f16(t62, 0.0222222)), - vmulq_n_f16(t63, 0.16666666666666667)), - vmulq_n_f16(t64, 0.11111111111)), - vmulq_n_f16(t65, 0.1333333333)), - vmulq_n_f16(t66, 0.08888888888)); - float16x8_t m66 = - vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(t61, -0.03333333), vmulq_n_f16(t62, 0.0222222)), - vmulq_n_f16(t63, 0.16666666666666667)), - vmulq_n_f16(t64, 0.11111111111)), - vmulq_n_f16(t65, 0.1333333333)), - vmulq_n_f16(t66, 0.08888888888)); - float16x8_t m67 = - vaddq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t61, -0.5625), vmulq_n_f16(t63, 3.0625)), vmulq_n_f16(t65, 3.5)), t67); - - float16x8_t m70 = - vsubq_f16(vaddq_f16(vsubq_f16(t70, vmulq_n_f16(t72, 5.444444444444444)), vmulq_n_f16(t74, 6.22222222222)), - vmulq_n_f16(t76, 1.77777777777777777778)); - float16x8_t m71 = vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t71, 1.5), vmulq_n_f16(t72, 3)), - vmulq_n_f16(t73, 2.16666666666666667)), - vmulq_n_f16(t74, 4.3333333333)), - vmulq_n_f16(t75, 0.66666666667)), - vmulq_n_f16(t76, 1.333333333333)); - float16x8_t m72 = vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(t71, -1.5), vmulq_n_f16(t72, 3)), - vmulq_n_f16(t73, 2.16666666666666667)), - vmulq_n_f16(t74, 4.3333333333)), - vmulq_n_f16(t75, 0.66666666667)), - vmulq_n_f16(t76, 1.333333333333)); - float16x8_t m73 = - vaddq_f16(vaddq_f16(vmulq_n_f16(vaddq_f16(t71, t72), -0.3), vmulq_n_f16(vaddq_f16(t73, t74), 1.33333333333)), - vmulq_n_f16(vaddq_f16(t75, t76), -0.533333333333)); - float16x8_t m74 = - vaddq_f16(vaddq_f16(vmulq_n_f16(vsubq_f16(t71, t72), 0.3), vmulq_n_f16(vsubq_f16(t74, t73), 1.33333333333)), - vmulq_n_f16(vsubq_f16(t75, t76), 0.533333333333)); - float16x8_t m75 = - 
vaddq_f16(vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t71, 0.03333333), vmulq_n_f16(t72, 0.0222222)), - vmulq_n_f16(t73, 0.16666666666666667)), - vmulq_n_f16(t74, 0.11111111111)), - vmulq_n_f16(t75, 0.1333333333)), - vmulq_n_f16(t76, 0.08888888888)); - float16x8_t m76 = - vaddq_f16(vsubq_f16(vsubq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(t71, -0.03333333), vmulq_n_f16(t72, 0.0222222)), - vmulq_n_f16(t73, 0.16666666666666667)), - vmulq_n_f16(t74, 0.11111111111)), - vmulq_n_f16(t75, 0.1333333333)), - vmulq_n_f16(t76, 0.08888888888)); - float16x8_t m77 = - vaddq_f16(vsubq_f16(vaddq_f16(vmulq_n_f16(t71, -0.5625), vmulq_n_f16(t73, 3.0625)), vmulq_n_f16(t75, 3.5)), t77); - - vst1_f16(dst_data, vget_low_f16(m00)); - vst1_f16(dst_data + 64, vget_high_f16(m00)); - vst1_f16(dst_data + 1 * dst_step, vget_low_f16(m01)); - vst1_f16(dst_data + 1 * dst_step + 64, vget_high_f16(m01)); - vst1_f16(dst_data + 2 * dst_step, vget_low_f16(m02)); - vst1_f16(dst_data + 2 * dst_step + 64, vget_high_f16(m02)); - vst1_f16(dst_data + 3 * dst_step, vget_low_f16(m03)); - vst1_f16(dst_data + 3 * dst_step + 64, vget_high_f16(m03)); - vst1_f16(dst_data + 4 * dst_step, vget_low_f16(m04)); - vst1_f16(dst_data + 4 * dst_step + 64, vget_high_f16(m04)); - vst1_f16(dst_data + 5 * dst_step, vget_low_f16(m05)); - vst1_f16(dst_data + 5 * dst_step + 64, vget_high_f16(m05)); - vst1_f16(dst_data + 6 * dst_step, vget_low_f16(m06)); - vst1_f16(dst_data + 6 * dst_step + 64, vget_high_f16(m06)); - vst1_f16(dst_data + 7 * dst_step, vget_low_f16(m07)); - vst1_f16(dst_data + 7 * dst_step + 64, vget_high_f16(m07)); - vst1_f16(dst_data + 8 * dst_step, vget_low_f16(m10)); - vst1_f16(dst_data + 8 * dst_step + 64, vget_high_f16(m10)); - vst1_f16(dst_data + 9 * dst_step, vget_low_f16(m11)); - vst1_f16(dst_data + 9 * dst_step + 64, vget_high_f16(m11)); - vst1_f16(dst_data + 10 * dst_step, vget_low_f16(m12)); - vst1_f16(dst_data + 10 * dst_step + 64, vget_high_f16(m12)); - vst1_f16(dst_data + 11 * dst_step, 
vget_low_f16(m13)); - vst1_f16(dst_data + 11 * dst_step + 64, vget_high_f16(m13)); - vst1_f16(dst_data + 12 * dst_step, vget_low_f16(m14)); - vst1_f16(dst_data + 12 * dst_step + 64, vget_high_f16(m14)); - vst1_f16(dst_data + 13 * dst_step, vget_low_f16(m15)); - vst1_f16(dst_data + 13 * dst_step + 64, vget_high_f16(m15)); - vst1_f16(dst_data + 14 * dst_step, vget_low_f16(m16)); - vst1_f16(dst_data + 14 * dst_step + 64, vget_high_f16(m16)); - vst1_f16(dst_data + 15 * dst_step, vget_low_f16(m17)); - vst1_f16(dst_data + 15 * dst_step + 64, vget_high_f16(m17)); - vst1_f16(dst_data + 16 * dst_step, vget_low_f16(m20)); - vst1_f16(dst_data + 16 * dst_step + 64, vget_high_f16(m20)); - vst1_f16(dst_data + 17 * dst_step, vget_low_f16(m21)); - vst1_f16(dst_data + 17 * dst_step + 64, vget_high_f16(m21)); - vst1_f16(dst_data + 18 * dst_step, vget_low_f16(m22)); - vst1_f16(dst_data + 18 * dst_step + 64, vget_high_f16(m22)); - vst1_f16(dst_data + 19 * dst_step, vget_low_f16(m23)); - vst1_f16(dst_data + 19 * dst_step + 64, vget_high_f16(m23)); - vst1_f16(dst_data + 20 * dst_step, vget_low_f16(m24)); - vst1_f16(dst_data + 20 * dst_step + 64, vget_high_f16(m24)); - vst1_f16(dst_data + 21 * dst_step, vget_low_f16(m25)); - vst1_f16(dst_data + 21 * dst_step + 64, vget_high_f16(m25)); - vst1_f16(dst_data + 22 * dst_step, vget_low_f16(m26)); - vst1_f16(dst_data + 22 * dst_step + 64, vget_high_f16(m26)); - vst1_f16(dst_data + 23 * dst_step, vget_low_f16(m27)); - vst1_f16(dst_data + 23 * dst_step + 64, vget_high_f16(m27)); - vst1_f16(dst_data + 24 * dst_step, vget_low_f16(m30)); - vst1_f16(dst_data + 24 * dst_step + 64, vget_high_f16(m30)); - vst1_f16(dst_data + 25 * dst_step, vget_low_f16(m31)); - vst1_f16(dst_data + 25 * dst_step + 64, vget_high_f16(m31)); - vst1_f16(dst_data + 26 * dst_step, vget_low_f16(m32)); - vst1_f16(dst_data + 26 * dst_step + 64, vget_high_f16(m32)); - vst1_f16(dst_data + 27 * dst_step, vget_low_f16(m33)); - vst1_f16(dst_data + 27 * dst_step + 64, 
vget_high_f16(m33)); - vst1_f16(dst_data + 28 * dst_step, vget_low_f16(m34)); - vst1_f16(dst_data + 28 * dst_step + 64, vget_high_f16(m34)); - vst1_f16(dst_data + 29 * dst_step, vget_low_f16(m35)); - vst1_f16(dst_data + 29 * dst_step + 64, vget_high_f16(m35)); - vst1_f16(dst_data + 30 * dst_step, vget_low_f16(m36)); - vst1_f16(dst_data + 30 * dst_step + 64, vget_high_f16(m36)); - vst1_f16(dst_data + 31 * dst_step, vget_low_f16(m37)); - vst1_f16(dst_data + 31 * dst_step + 64, vget_high_f16(m37)); - vst1_f16(dst_data + 32 * dst_step, vget_low_f16(m40)); - vst1_f16(dst_data + 32 * dst_step + 64, vget_high_f16(m40)); - vst1_f16(dst_data + 33 * dst_step, vget_low_f16(m41)); - vst1_f16(dst_data + 33 * dst_step + 64, vget_high_f16(m41)); - vst1_f16(dst_data + 34 * dst_step, vget_low_f16(m42)); - vst1_f16(dst_data + 34 * dst_step + 64, vget_high_f16(m42)); - vst1_f16(dst_data + 35 * dst_step, vget_low_f16(m43)); - vst1_f16(dst_data + 35 * dst_step + 64, vget_high_f16(m43)); - vst1_f16(dst_data + 36 * dst_step, vget_low_f16(m44)); - vst1_f16(dst_data + 36 * dst_step + 64, vget_high_f16(m44)); - vst1_f16(dst_data + 37 * dst_step, vget_low_f16(m45)); - vst1_f16(dst_data + 37 * dst_step + 64, vget_high_f16(m45)); - vst1_f16(dst_data + 38 * dst_step, vget_low_f16(m46)); - vst1_f16(dst_data + 38 * dst_step + 64, vget_high_f16(m46)); - vst1_f16(dst_data + 39 * dst_step, vget_low_f16(m47)); - vst1_f16(dst_data + 39 * dst_step + 64, vget_high_f16(m47)); - vst1_f16(dst_data + 40 * dst_step, vget_low_f16(m50)); - vst1_f16(dst_data + 40 * dst_step + 64, vget_high_f16(m50)); - vst1_f16(dst_data + 41 * dst_step, vget_low_f16(m51)); - vst1_f16(dst_data + 41 * dst_step + 64, vget_high_f16(m51)); - vst1_f16(dst_data + 42 * dst_step, vget_low_f16(m52)); - vst1_f16(dst_data + 42 * dst_step + 64, vget_high_f16(m52)); - vst1_f16(dst_data + 43 * dst_step, vget_low_f16(m53)); - vst1_f16(dst_data + 43 * dst_step + 64, vget_high_f16(m53)); - vst1_f16(dst_data + 44 * dst_step, vget_low_f16(m54)); - 
vst1_f16(dst_data + 44 * dst_step + 64, vget_high_f16(m54)); - vst1_f16(dst_data + 45 * dst_step, vget_low_f16(m55)); - vst1_f16(dst_data + 45 * dst_step + 64, vget_high_f16(m55)); - vst1_f16(dst_data + 46 * dst_step, vget_low_f16(m56)); - vst1_f16(dst_data + 46 * dst_step + 64, vget_high_f16(m56)); - vst1_f16(dst_data + 47 * dst_step, vget_low_f16(m57)); - vst1_f16(dst_data + 47 * dst_step + 64, vget_high_f16(m57)); - vst1_f16(dst_data + 48 * dst_step, vget_low_f16(m60)); - vst1_f16(dst_data + 48 * dst_step + 64, vget_high_f16(m60)); - vst1_f16(dst_data + 49 * dst_step, vget_low_f16(m61)); - vst1_f16(dst_data + 49 * dst_step + 64, vget_high_f16(m61)); - vst1_f16(dst_data + 50 * dst_step, vget_low_f16(m62)); - vst1_f16(dst_data + 50 * dst_step + 64, vget_high_f16(m62)); - vst1_f16(dst_data + 51 * dst_step, vget_low_f16(m63)); - vst1_f16(dst_data + 51 * dst_step + 64, vget_high_f16(m63)); - vst1_f16(dst_data + 52 * dst_step, vget_low_f16(m64)); - vst1_f16(dst_data + 52 * dst_step + 64, vget_high_f16(m64)); - vst1_f16(dst_data + 53 * dst_step, vget_low_f16(m65)); - vst1_f16(dst_data + 53 * dst_step + 64, vget_high_f16(m65)); - vst1_f16(dst_data + 54 * dst_step, vget_low_f16(m66)); - vst1_f16(dst_data + 54 * dst_step + 64, vget_high_f16(m66)); - vst1_f16(dst_data + 55 * dst_step, vget_low_f16(m67)); - vst1_f16(dst_data + 55 * dst_step + 64, vget_high_f16(m67)); - vst1_f16(dst_data + 56 * dst_step, vget_low_f16(m70)); - vst1_f16(dst_data + 56 * dst_step + 64, vget_high_f16(m70)); - vst1_f16(dst_data + 57 * dst_step, vget_low_f16(m71)); - vst1_f16(dst_data + 57 * dst_step + 64, vget_high_f16(m71)); - vst1_f16(dst_data + 58 * dst_step, vget_low_f16(m72)); - vst1_f16(dst_data + 58 * dst_step + 64, vget_high_f16(m72)); - vst1_f16(dst_data + 59 * dst_step, vget_low_f16(m73)); - vst1_f16(dst_data + 59 * dst_step + 64, vget_high_f16(m73)); - vst1_f16(dst_data + 60 * dst_step, vget_low_f16(m74)); - vst1_f16(dst_data + 60 * dst_step + 64, vget_high_f16(m74)); - 
vst1_f16(dst_data + 61 * dst_step, vget_low_f16(m75)); - vst1_f16(dst_data + 61 * dst_step + 64, vget_high_f16(m75)); - vst1_f16(dst_data + 62 * dst_step, vget_low_f16(m76)); - vst1_f16(dst_data + 62 * dst_step + 64, vget_high_f16(m76)); - vst1_f16(dst_data + 63 * dst_step, vget_low_f16(m77)); - vst1_f16(dst_data + 63 * dst_step + 64, vget_high_f16(m77)); -} - -void OutputTransform4x2UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data, - int src_step, int dst_step) { -#ifdef ENABLE_ARM - float16x8_t bias_ptr = vld1q_f16(bias_data); - float16x8_t src_data_00 = vld1q_f16(src_data + 0 * src_step); - float16x8_t src_data_01 = vld1q_f16(src_data + 1 * src_step); - float16x8_t src_data_02 = vld1q_f16(src_data + 2 * src_step); - float16x8_t src_data_03 = vld1q_f16(src_data + 3 * src_step); - float16x8_t src_data_10 = vld1q_f16(src_data + 4 * src_step); - float16x8_t src_data_11 = vld1q_f16(src_data + 5 * src_step); - float16x8_t src_data_12 = vld1q_f16(src_data + 6 * src_step); - float16x8_t src_data_13 = vld1q_f16(src_data + 7 * src_step); - float16x8_t src_data_20 = vld1q_f16(src_data + 8 * src_step); - float16x8_t src_data_21 = vld1q_f16(src_data + 9 * src_step); - float16x8_t src_data_22 = vld1q_f16(src_data + 10 * src_step); - float16x8_t src_data_23 = vld1q_f16(src_data + 11 * src_step); - float16x8_t src_data_30 = vld1q_f16(src_data + 12 * src_step); - float16x8_t src_data_31 = vld1q_f16(src_data + 13 * src_step); - float16x8_t src_data_32 = vld1q_f16(src_data + 14 * src_step); - float16x8_t src_data_33 = vld1q_f16(src_data + 15 * src_step); - - float16x8_t t00 = vaddq_f16(src_data_00, vaddq_f16(src_data_10, src_data_20)); - float16x8_t t01 = vaddq_f16(src_data_01, vaddq_f16(src_data_11, src_data_21)); - float16x8_t t02 = vaddq_f16(src_data_02, vaddq_f16(src_data_12, src_data_22)); - float16x8_t t03 = vaddq_f16(src_data_03, vaddq_f16(src_data_13, src_data_23)); - - float16x8_t t10 = vsubq_f16(src_data_30, 
vmulq_n_f16(vsubq_f16(src_data_10, src_data_20), 0.5)); - float16x8_t t11 = vsubq_f16(src_data_31, vmulq_n_f16(vsubq_f16(src_data_11, src_data_21), 0.5)); - float16x8_t t12 = vsubq_f16(src_data_32, vmulq_n_f16(vsubq_f16(src_data_12, src_data_22), 0.5)); - float16x8_t t13 = vsubq_f16(src_data_33, vmulq_n_f16(vsubq_f16(src_data_13, src_data_23), 0.5)); - - float16x8_t m00 = vaddq_f16(vaddq_f16(t00, vaddq_f16(t01, t02)), bias_ptr); - float16x8_t m01 = vaddq_f16(vaddq_f16(t03, vmulq_n_f16(vsubq_f16(t01, t02), 0.5)), bias_ptr); - float16x8_t m10 = vaddq_f16(vaddq_f16(t10, vaddq_f16(t11, t12)), bias_ptr); - float16x8_t m11 = vaddq_f16(vaddq_f16(t13, vmulq_n_f16(vsubq_f16(t11, t12), 0.5)), bias_ptr); - - vst1q_f16(dst_data, m00); - vst1q_f16(dst_data + C8NUM, m01); - vst1q_f16(dst_data + dst_step * C8NUM, m10); - vst1q_f16(dst_data + dst_step * C8NUM + C8NUM, m11); -#else - for (int i = 0; i < C8NUM; i++) { - float16_t src_data_00 = src_data[i]; - float16_t src_data_01 = src_data[i + src_step]; - float16_t src_data_02 = src_data[i + 2 * src_step]; - float16_t src_data_03 = src_data[i + 3 * src_step]; - float16_t src_data_10 = src_data[i + 4 * src_step]; - float16_t src_data_11 = src_data[i + 5 * src_step]; - float16_t src_data_12 = src_data[i + 6 * src_step]; - float16_t src_data_13 = src_data[i + 7 * src_step]; - float16_t src_data_20 = src_data[i + 8 * src_step]; - float16_t src_data_21 = src_data[i + 9 * src_step]; - float16_t src_data_22 = src_data[i + 10 * src_step]; - float16_t src_data_23 = src_data[i + 11 * src_step]; - float16_t src_data_30 = src_data[i + 12 * src_step]; - float16_t src_data_31 = src_data[i + 13 * src_step]; - float16_t src_data_32 = src_data[i + 14 * src_step]; - float16_t src_data_33 = src_data[i + 15 * src_step]; - - float16_t t00 = src_data_00 + src_data_10 + src_data_20; - float16_t t01 = src_data_01 + src_data_11 + src_data_21; - float16_t t02 = src_data_02 + src_data_12 + src_data_22; - float16_t t03 = src_data_03 + src_data_13 + 
src_data_23; - - const float16_t t10 = 0.5f * (src_data_10 - src_data_20) + src_data_30; - const float16_t t11 = 0.5f * (src_data_11 - src_data_21) + src_data_31; - const float16_t t12 = 0.5f * (src_data_12 - src_data_22) + src_data_32; - const float16_t t13 = 0.5f * (src_data_13 - src_data_23) + src_data_33; - - float16_t m00 = t00 + t01 + t02 + bias_data[i]; - const float16_t m01 = 0.5f * (t01 - t02) + t03 + bias_data[i]; - float16_t m10 = t10 + t11 + t12 + bias_data[i]; - const float16_t m11 = 0.5f * (t11 - t12) + t13 + bias_data[i]; - - (dst_data + i)[0] = m00; - (dst_data + i + C8NUM)[0] = m01; - (dst_data + i + dst_step * C8NUM)[0] = m10; - (dst_data + i + dst_step * C8NUM + C8NUM)[0] = m11; +void GeneralInputTransformUnitFp16(const float16_t *src_data, float16_t *dst_data, float16_t *matrix_b, + float16_t *matrix_bt, int src_step, int dst_step, int in_unit) { + int len = in_unit * in_unit; + if (len > MAX_LEN) return; + float16x8_t src[MAX_LEN]; + float16x8_t t[MAX_LEN]; + float16x8_t m[MAX_LEN]; + float16x8_t vec_b[MAX_LEN]; + float16x8_t vec_bt[MAX_LEN]; + for (int i = 0; i < len; i++) { + src[i] = vld1q_f16(src_data + i * src_step); + vec_b[i] = vdupq_n_f16(matrix_b[i]); + vec_bt[i] = vdupq_n_f16(matrix_bt[i]); } -#endif -} - -void OutputTransform4x3UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data, - int src_step, int dst_step) { -#ifdef ENABLE_ARM - float16x8_t bias_ptr = vld1q_f16(bias_data); - float16x8_t src_data_00 = vld1q_f16(src_data + 0 * src_step); - float16x8_t src_data_01 = vld1q_f16(src_data + 1 * src_step); - float16x8_t src_data_02 = vld1q_f16(src_data + 2 * src_step); - float16x8_t src_data_03 = vld1q_f16(src_data + 3 * src_step); - float16x8_t src_data_10 = vld1q_f16(src_data + 4 * src_step); - float16x8_t src_data_11 = vld1q_f16(src_data + 5 * src_step); - float16x8_t src_data_12 = vld1q_f16(src_data + 6 * src_step); - float16x8_t src_data_13 = vld1q_f16(src_data + 7 * src_step); - float16x8_t 
src_data_20 = vld1q_f16(src_data + 8 * src_step); - float16x8_t src_data_21 = vld1q_f16(src_data + 9 * src_step); - float16x8_t src_data_22 = vld1q_f16(src_data + 10 * src_step); - float16x8_t src_data_23 = vld1q_f16(src_data + 11 * src_step); - float16x8_t src_data_30 = vld1q_f16(src_data + 12 * src_step); - float16x8_t src_data_31 = vld1q_f16(src_data + 13 * src_step); - float16x8_t src_data_32 = vld1q_f16(src_data + 14 * src_step); - float16x8_t src_data_33 = vld1q_f16(src_data + 15 * src_step); - - float16x8_t t00 = vaddq_f16(src_data_00, vaddq_f16(src_data_10, src_data_20)); - float16x8_t t01 = vaddq_f16(src_data_01, vaddq_f16(src_data_11, src_data_21)); - float16x8_t t02 = vaddq_f16(src_data_02, vaddq_f16(src_data_12, src_data_22)); - float16x8_t t03 = vaddq_f16(src_data_03, vaddq_f16(src_data_13, src_data_23)); - - float16x8_t t10 = vmulq_n_f16(vsubq_f16(src_data_10, src_data_20), 0.5); - float16x8_t t11 = vmulq_n_f16(vsubq_f16(src_data_11, src_data_21), 0.5); - float16x8_t t12 = vmulq_n_f16(vsubq_f16(src_data_12, src_data_22), 0.5); - float16x8_t t13 = vmulq_n_f16(vsubq_f16(src_data_13, src_data_23), 0.5); - - float16x8_t t20 = vaddq_f16(src_data_30, vmulq_n_f16(vaddq_f16(src_data_10, src_data_20), 0.25)); - float16x8_t t21 = vaddq_f16(src_data_31, vmulq_n_f16(vaddq_f16(src_data_11, src_data_21), 0.25)); - float16x8_t t22 = vaddq_f16(src_data_32, vmulq_n_f16(vaddq_f16(src_data_12, src_data_22), 0.25)); - float16x8_t t23 = vaddq_f16(src_data_33, vmulq_n_f16(vaddq_f16(src_data_13, src_data_23), 0.25)); - - float16x8_t m00 = vaddq_f16(vaddq_f16(t00, vaddq_f16(t01, t02)), bias_ptr); - float16x8_t m01 = vaddq_f16(vmulq_n_f16(vsubq_f16(t01, t02), 0.5), bias_ptr); - float16x8_t m02 = vaddq_f16(vaddq_f16(t03, vmulq_n_f16(vaddq_f16(t01, t02), 0.25)), bias_ptr); - float16x8_t m10 = vaddq_f16(vaddq_f16(t10, vaddq_f16(t11, t12)), bias_ptr); - float16x8_t m11 = vaddq_f16(vmulq_n_f16(vsubq_f16(t11, t12), 0.5), bias_ptr); - float16x8_t m12 = vaddq_f16(vaddq_f16(t13, 
vmulq_n_f16(vaddq_f16(t11, t12), 0.25)), bias_ptr); - float16x8_t m20 = vaddq_f16(vaddq_f16(t20, vaddq_f16(t21, t22)), bias_ptr); - float16x8_t m21 = vaddq_f16(vmulq_n_f16(vsubq_f16(t21, t22), 0.5), bias_ptr); - float16x8_t m22 = vaddq_f16(vaddq_f16(t23, vmulq_n_f16(vaddq_f16(t21, t22), 0.25)), bias_ptr); - - vst1q_f16(dst_data, m00); - vst1q_f16(dst_data + C8NUM, m01); - vst1q_f16(dst_data + 2 * C8NUM, m02); - vst1q_f16(dst_data + dst_step * C8NUM, m10); - vst1q_f16(dst_data + dst_step * C8NUM + C8NUM, m11); - vst1q_f16(dst_data + dst_step * C8NUM + 2 * C8NUM, m12); - vst1q_f16(dst_data + 2 * dst_step * C8NUM, m20); - vst1q_f16(dst_data + 2 * dst_step * C8NUM + C8NUM, m21); - vst1q_f16(dst_data + 2 * dst_step * C8NUM + 2 * C8NUM, m22); -#else - for (int i = 0; i < C8NUM; i++) { - float16_t src_data_00 = src_data[i]; - float16_t src_data_01 = src_data[i + src_step]; - float16_t src_data_02 = src_data[i + 2 * src_step]; - float16_t src_data_03 = src_data[i + 3 * src_step]; - float16_t src_data_10 = src_data[i + 4 * src_step]; - float16_t src_data_11 = src_data[i + 5 * src_step]; - float16_t src_data_12 = src_data[i + 6 * src_step]; - float16_t src_data_13 = src_data[i + 7 * src_step]; - float16_t src_data_20 = src_data[i + 8 * src_step]; - float16_t src_data_21 = src_data[i + 9 * src_step]; - float16_t src_data_22 = src_data[i + 10 * src_step]; - float16_t src_data_23 = src_data[i + 11 * src_step]; - float16_t src_data_30 = src_data[i + 12 * src_step]; - float16_t src_data_31 = src_data[i + 13 * src_step]; - float16_t src_data_32 = src_data[i + 14 * src_step]; - float16_t src_data_33 = src_data[i + 15 * src_step]; - - float16_t t00 = src_data_00 + src_data_10 + src_data_20; - float16_t t01 = src_data_01 + src_data_11 + src_data_21; - float16_t t02 = src_data_02 + src_data_12 + src_data_22; - float16_t t03 = src_data_03 + src_data_13 + src_data_23; - - const float16_t t10 = 0.5f * (src_data_10 - src_data_20); - const float16_t t11 = 0.5f * (src_data_11 - 
src_data_21); - const float16_t t12 = 0.5f * (src_data_12 - src_data_22); - const float16_t t13 = 0.5f * (src_data_13 - src_data_23); - - const float16_t t20 = 0.25f * (src_data_10 + src_data_20) + src_data_30; - const float16_t t21 = 0.25f * (src_data_11 + src_data_21) + src_data_31; - const float16_t t22 = 0.25f * (src_data_12 + src_data_22) + src_data_32; - const float16_t t23 = 0.25f * (src_data_13 + src_data_23) + src_data_33; - - float16_t m00 = t00 + t01 + t02 + bias_data[i]; - const float16_t m01 = 0.5f * (t01 - t02) + bias_data[i]; - const float16_t m02 = 0.25f * (t01 + t02) + t03 + bias_data[i]; - - float16_t m10 = t10 + t11 + t12 + bias_data[i]; - const float16_t m11 = 0.5f * (t11 - t12) + bias_data[i]; - const float16_t m12 = 0.25f * (t11 + t12) + t13 + bias_data[i]; - - float16_t m20 = t20 + t21 + t22 + bias_data[i]; - const float16_t m21 = 0.5f * (t21 - t22) + bias_data[i]; - const float16_t m22 = 0.25f * (t21 + t22) + t23 + bias_data[i]; - - (dst_data + i)[0] = m00; - (dst_data + i + C8NUM)[0] = m01; - (dst_data + i + 2 * C8NUM)[0] = m02; - - (dst_data + i + dst_step * C8NUM)[0] = m10; - (dst_data + i + dst_step * C8NUM + C8NUM)[0] = m11; - (dst_data + i + dst_step * C8NUM + 2 * C8NUM)[0] = m12; - - (dst_data + i + 2 * dst_step * C8NUM)[0] = m20; - (dst_data + i + 2 * dst_step * C8NUM + C8NUM)[0] = m21; - (dst_data + i + 2 * dst_step * C8NUM + 2 * C8NUM)[0] = m22; - } -#endif -} - -void OutputTransform8x2UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data, - int src_step, int dst_step) { -#ifdef ENABLE_ARM - float16x8_t src_data_00 = vld1q_f16(src_data + 0 * src_step); - float16x8_t src_data_01 = vld1q_f16(src_data + 1 * src_step); - float16x8_t src_data_02 = vld1q_f16(src_data + 2 * src_step); - float16x8_t src_data_03 = vld1q_f16(src_data + 3 * src_step); - float16x8_t src_data_04 = vld1q_f16(src_data + 4 * src_step); - float16x8_t src_data_05 = vld1q_f16(src_data + 5 * src_step); - float16x8_t src_data_06 = 
vld1q_f16(src_data + 6 * src_step); - float16x8_t src_data_07 = vld1q_f16(src_data + 7 * src_step); - float16x8_t src_data_10 = vld1q_f16(src_data + 8 * src_step); - float16x8_t src_data_11 = vld1q_f16(src_data + 9 * src_step); - float16x8_t src_data_12 = vld1q_f16(src_data + 10 * src_step); - float16x8_t src_data_13 = vld1q_f16(src_data + 11 * src_step); - float16x8_t src_data_14 = vld1q_f16(src_data + 12 * src_step); - float16x8_t src_data_15 = vld1q_f16(src_data + 13 * src_step); - float16x8_t src_data_16 = vld1q_f16(src_data + 14 * src_step); - float16x8_t src_data_17 = vld1q_f16(src_data + 15 * src_step); - float16x8_t src_data_20 = vld1q_f16(src_data + 16 * src_step); - float16x8_t src_data_21 = vld1q_f16(src_data + 17 * src_step); - float16x8_t src_data_22 = vld1q_f16(src_data + 18 * src_step); - float16x8_t src_data_23 = vld1q_f16(src_data + 19 * src_step); - float16x8_t src_data_24 = vld1q_f16(src_data + 20 * src_step); - float16x8_t src_data_25 = vld1q_f16(src_data + 21 * src_step); - float16x8_t src_data_26 = vld1q_f16(src_data + 22 * src_step); - float16x8_t src_data_27 = vld1q_f16(src_data + 23 * src_step); - float16x8_t src_data_30 = vld1q_f16(src_data + 24 * src_step); - float16x8_t src_data_31 = vld1q_f16(src_data + 25 * src_step); - float16x8_t src_data_32 = vld1q_f16(src_data + 26 * src_step); - float16x8_t src_data_33 = vld1q_f16(src_data + 27 * src_step); - float16x8_t src_data_34 = vld1q_f16(src_data + 28 * src_step); - float16x8_t src_data_35 = vld1q_f16(src_data + 29 * src_step); - float16x8_t src_data_36 = vld1q_f16(src_data + 30 * src_step); - float16x8_t src_data_37 = vld1q_f16(src_data + 31 * src_step); - float16x8_t src_data_40 = vld1q_f16(src_data + 32 * src_step); - float16x8_t src_data_41 = vld1q_f16(src_data + 33 * src_step); - float16x8_t src_data_42 = vld1q_f16(src_data + 34 * src_step); - float16x8_t src_data_43 = vld1q_f16(src_data + 35 * src_step); - float16x8_t src_data_44 = vld1q_f16(src_data + 36 * src_step); - float16x8_t 
src_data_45 = vld1q_f16(src_data + 37 * src_step); - float16x8_t src_data_46 = vld1q_f16(src_data + 38 * src_step); - float16x8_t src_data_47 = vld1q_f16(src_data + 39 * src_step); - float16x8_t src_data_50 = vld1q_f16(src_data + 40 * src_step); - float16x8_t src_data_51 = vld1q_f16(src_data + 41 * src_step); - float16x8_t src_data_52 = vld1q_f16(src_data + 42 * src_step); - float16x8_t src_data_53 = vld1q_f16(src_data + 43 * src_step); - float16x8_t src_data_54 = vld1q_f16(src_data + 44 * src_step); - float16x8_t src_data_55 = vld1q_f16(src_data + 45 * src_step); - float16x8_t src_data_56 = vld1q_f16(src_data + 46 * src_step); - float16x8_t src_data_57 = vld1q_f16(src_data + 47 * src_step); - float16x8_t src_data_60 = vld1q_f16(src_data + 48 * src_step); - float16x8_t src_data_61 = vld1q_f16(src_data + 49 * src_step); - float16x8_t src_data_62 = vld1q_f16(src_data + 50 * src_step); - float16x8_t src_data_63 = vld1q_f16(src_data + 51 * src_step); - float16x8_t src_data_64 = vld1q_f16(src_data + 52 * src_step); - float16x8_t src_data_65 = vld1q_f16(src_data + 53 * src_step); - float16x8_t src_data_66 = vld1q_f16(src_data + 54 * src_step); - float16x8_t src_data_67 = vld1q_f16(src_data + 55 * src_step); - float16x8_t src_data_70 = vld1q_f16(src_data + 56 * src_step); - float16x8_t src_data_71 = vld1q_f16(src_data + 57 * src_step); - float16x8_t src_data_72 = vld1q_f16(src_data + 58 * src_step); - float16x8_t src_data_73 = vld1q_f16(src_data + 59 * src_step); - float16x8_t src_data_74 = vld1q_f16(src_data + 60 * src_step); - float16x8_t src_data_75 = vld1q_f16(src_data + 61 * src_step); - float16x8_t src_data_76 = vld1q_f16(src_data + 62 * src_step); - float16x8_t src_data_77 = vld1q_f16(src_data + 63 * src_step); - - float16x8_t d01 = vsubq_f16(src_data_10, src_data_20); - float16x8_t d02 = vsubq_f16(src_data_11, src_data_21); - float16x8_t d03 = vsubq_f16(src_data_12, src_data_22); - float16x8_t d04 = vsubq_f16(src_data_13, src_data_23); - float16x8_t d05 = 
vsubq_f16(src_data_14, src_data_24); - float16x8_t d06 = vsubq_f16(src_data_15, src_data_25); - float16x8_t d07 = vsubq_f16(src_data_16, src_data_26); - float16x8_t d08 = vsubq_f16(src_data_17, src_data_27); - - float16x8_t d11 = vsubq_f16(src_data_30, src_data_40); - float16x8_t d12 = vsubq_f16(src_data_31, src_data_41); - float16x8_t d13 = vsubq_f16(src_data_32, src_data_42); - float16x8_t d14 = vsubq_f16(src_data_33, src_data_43); - float16x8_t d15 = vsubq_f16(src_data_34, src_data_44); - float16x8_t d16 = vsubq_f16(src_data_35, src_data_45); - float16x8_t d17 = vsubq_f16(src_data_36, src_data_46); - float16x8_t d18 = vsubq_f16(src_data_37, src_data_47); - - float16x8_t d21 = vsubq_f16(src_data_50, src_data_60); - float16x8_t d22 = vsubq_f16(src_data_51, src_data_61); - float16x8_t d23 = vsubq_f16(src_data_52, src_data_62); - float16x8_t d24 = vsubq_f16(src_data_53, src_data_63); - float16x8_t d25 = vsubq_f16(src_data_54, src_data_64); - float16x8_t d26 = vsubq_f16(src_data_55, src_data_65); - float16x8_t d27 = vsubq_f16(src_data_56, src_data_66); - float16x8_t d28 = vsubq_f16(src_data_57, src_data_67); - - float16x8_t t00 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_00, src_data_10), src_data_20), src_data_30), src_data_40), - src_data_50), - src_data_60); - float16x8_t t01 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_01, src_data_11), src_data_21), src_data_31), src_data_41), - src_data_51), - src_data_61); - float16x8_t t02 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_02, src_data_12), src_data_22), src_data_32), src_data_42), - src_data_52), - src_data_62); - float16x8_t t03 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_03, src_data_13), src_data_23), src_data_33), src_data_43), - src_data_53), - src_data_63); - float16x8_t t04 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_04, src_data_14), 
src_data_24), src_data_34), src_data_44), - src_data_54), - src_data_64); - float16x8_t t05 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_05, src_data_15), src_data_25), src_data_35), src_data_45), - src_data_55), - src_data_65); - float16x8_t t06 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_06, src_data_16), src_data_26), src_data_36), src_data_46), - src_data_56), - src_data_66); - float16x8_t t07 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_07, src_data_17), src_data_27), src_data_37), src_data_47), - src_data_57), - src_data_67); - - float16x8_t t10 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d01, 0.5), d11), vmulq_n_f16(d21, 1.5)), src_data_70); - float16x8_t t11 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d02, 0.5), d12), vmulq_n_f16(d22, 1.5)), src_data_71); - float16x8_t t12 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d03, 0.5), d13), vmulq_n_f16(d23, 1.5)), src_data_72); - float16x8_t t13 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d04, 0.5), d14), vmulq_n_f16(d24, 1.5)), src_data_73); - float16x8_t t14 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d05, 0.5), d15), vmulq_n_f16(d25, 1.5)), src_data_74); - float16x8_t t15 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d06, 0.5), d16), vmulq_n_f16(d26, 1.5)), src_data_75); - float16x8_t t16 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d07, 0.5), d17), vmulq_n_f16(d27, 1.5)), src_data_76); - float16x8_t t17 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d08, 0.5), d18), vmulq_n_f16(d28, 1.5)), src_data_77); - - float16x8_t s11 = vsubq_f16(t01, t02); - float16x8_t s12 = vsubq_f16(t11, t12); - - float16x8_t s21 = vsubq_f16(t03, t04); - float16x8_t s22 = vsubq_f16(t13, t14); - - float16x8_t s31 = vsubq_f16(t05, t06); - float16x8_t s32 = vsubq_f16(t15, t16); - - float16x8_t m00 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t00, t01), t02), t03), t04), t05), t06); - float16x8_t m01 = 
vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s11, 0.5), s21), vmulq_n_f16(s31, 1.5)), t07); - - float16x8_t m10 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t10, t11), t12), t13), t14), t15), t16); - float16x8_t m11 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s12, 0.5), s22), vmulq_n_f16(s32, 1.5)), t17); - - float16x8_t bias_ptr = vld1q_f16(bias_data); - vst1q_f16(dst_data, vaddq_f16(m00, bias_ptr)); - vst1q_f16(dst_data + C8NUM, vaddq_f16(m01, bias_ptr)); - - vst1q_f16(dst_data + dst_step * C8NUM, vaddq_f16(m10, bias_ptr)); - vst1q_f16(dst_data + dst_step * C8NUM + C8NUM, vaddq_f16(m11, bias_ptr)); -#else - for (int i = 0; i < C8NUM; i++) { - float16_t src_data_00 = src_data[i]; - float16_t src_data_01 = src_data[i + src_step]; - float16_t src_data_02 = src_data[i + 2 * src_step]; - float16_t src_data_03 = src_data[i + 3 * src_step]; - float16_t src_data_04 = src_data[i + 4 * src_step]; - float16_t src_data_05 = src_data[i + 5 * src_step]; - float16_t src_data_06 = src_data[i + 6 * src_step]; - float16_t src_data_07 = src_data[i + 7 * src_step]; - float16_t src_data_10 = src_data[i + 8 * src_step]; - float16_t src_data_11 = src_data[i + 9 * src_step]; - float16_t src_data_12 = src_data[i + 10 * src_step]; - float16_t src_data_13 = src_data[i + 11 * src_step]; - float16_t src_data_14 = src_data[i + 12 * src_step]; - float16_t src_data_15 = src_data[i + 13 * src_step]; - float16_t src_data_16 = src_data[i + 14 * src_step]; - float16_t src_data_17 = src_data[i + 15 * src_step]; - float16_t src_data_20 = src_data[i + 16 * src_step]; - float16_t src_data_21 = src_data[i + 17 * src_step]; - float16_t src_data_22 = src_data[i + 18 * src_step]; - float16_t src_data_23 = src_data[i + 19 * src_step]; - float16_t src_data_24 = src_data[i + 20 * src_step]; - float16_t src_data_25 = src_data[i + 21 * src_step]; - float16_t src_data_26 = src_data[i + 22 * src_step]; - float16_t src_data_27 = src_data[i + 23 * src_step]; - float16_t src_data_30 = src_data[i + 24 * 
src_step]; - float16_t src_data_31 = src_data[i + 25 * src_step]; - float16_t src_data_32 = src_data[i + 26 * src_step]; - float16_t src_data_33 = src_data[i + 27 * src_step]; - float16_t src_data_34 = src_data[i + 28 * src_step]; - float16_t src_data_35 = src_data[i + 29 * src_step]; - float16_t src_data_36 = src_data[i + 30 * src_step]; - float16_t src_data_37 = src_data[i + 31 * src_step]; - float16_t src_data_40 = src_data[i + 32 * src_step]; - float16_t src_data_41 = src_data[i + 33 * src_step]; - float16_t src_data_42 = src_data[i + 34 * src_step]; - float16_t src_data_43 = src_data[i + 35 * src_step]; - float16_t src_data_44 = src_data[i + 36 * src_step]; - float16_t src_data_45 = src_data[i + 37 * src_step]; - float16_t src_data_46 = src_data[i + 38 * src_step]; - float16_t src_data_47 = src_data[i + 39 * src_step]; - float16_t src_data_50 = src_data[i + 40 * src_step]; - float16_t src_data_51 = src_data[i + 41 * src_step]; - float16_t src_data_52 = src_data[i + 42 * src_step]; - float16_t src_data_53 = src_data[i + 43 * src_step]; - float16_t src_data_54 = src_data[i + 44 * src_step]; - float16_t src_data_55 = src_data[i + 45 * src_step]; - float16_t src_data_56 = src_data[i + 46 * src_step]; - float16_t src_data_57 = src_data[i + 47 * src_step]; - float16_t src_data_60 = src_data[i + 48 * src_step]; - float16_t src_data_61 = src_data[i + 49 * src_step]; - float16_t src_data_62 = src_data[i + 50 * src_step]; - float16_t src_data_63 = src_data[i + 51 * src_step]; - float16_t src_data_64 = src_data[i + 52 * src_step]; - float16_t src_data_65 = src_data[i + 53 * src_step]; - float16_t src_data_66 = src_data[i + 54 * src_step]; - float16_t src_data_67 = src_data[i + 55 * src_step]; - float16_t src_data_70 = src_data[i + 56 * src_step]; - float16_t src_data_71 = src_data[i + 57 * src_step]; - float16_t src_data_72 = src_data[i + 58 * src_step]; - float16_t src_data_73 = src_data[i + 59 * src_step]; - float16_t src_data_74 = src_data[i + 60 * src_step]; - 
float16_t src_data_75 = src_data[i + 61 * src_step]; - float16_t src_data_76 = src_data[i + 62 * src_step]; - float16_t src_data_77 = src_data[i + 63 * src_step]; - - float16_t d01 = src_data_10 - src_data_20; - float16_t d02 = src_data_11 - src_data_21; - float16_t d03 = src_data_12 - src_data_22; - float16_t d04 = src_data_13 - src_data_23; - float16_t d05 = src_data_14 - src_data_24; - float16_t d06 = src_data_15 - src_data_25; - float16_t d07 = src_data_16 - src_data_26; - float16_t d08 = src_data_17 - src_data_27; - - float16_t d11 = src_data_30 - src_data_40; - float16_t d12 = src_data_31 - src_data_41; - float16_t d13 = src_data_32 - src_data_42; - float16_t d14 = src_data_33 - src_data_43; - float16_t d15 = src_data_34 - src_data_44; - float16_t d16 = src_data_35 - src_data_45; - float16_t d17 = src_data_36 - src_data_46; - float16_t d18 = src_data_37 - src_data_47; - - float16_t d21 = src_data_50 - src_data_60; - float16_t d22 = src_data_51 - src_data_61; - float16_t d23 = src_data_52 - src_data_62; - float16_t d24 = src_data_53 - src_data_63; - float16_t d25 = src_data_54 - src_data_64; - float16_t d26 = src_data_55 - src_data_65; - float16_t d27 = src_data_56 - src_data_66; - float16_t d28 = src_data_57 - src_data_67; - - float16_t t00 = src_data_00 + src_data_10 + src_data_20 + src_data_30 + src_data_40 + src_data_50 + src_data_60; - float16_t t01 = src_data_01 + src_data_11 + src_data_21 + src_data_31 + src_data_41 + src_data_51 + src_data_61; - float16_t t02 = src_data_02 + src_data_12 + src_data_22 + src_data_32 + src_data_42 + src_data_52 + src_data_62; - float16_t t03 = src_data_03 + src_data_13 + src_data_23 + src_data_33 + src_data_43 + src_data_53 + src_data_63; - float16_t t04 = src_data_04 + src_data_14 + src_data_24 + src_data_34 + src_data_44 + src_data_54 + src_data_64; - float16_t t05 = src_data_05 + src_data_15 + src_data_25 + src_data_35 + src_data_45 + src_data_55 + src_data_65; - float16_t t06 = src_data_06 + src_data_16 + src_data_26 
+ src_data_36 + src_data_46 + src_data_56 + src_data_66; - float16_t t07 = src_data_07 + src_data_17 + src_data_27 + src_data_37 + src_data_47 + src_data_57 + src_data_67; - - const float16_t t10 = 0.5f * d01 + d11 + 1.5f * d21 + src_data_70; - const float16_t t11 = 0.5f * d02 + d12 + 1.5f * d22 + src_data_71; - const float16_t t12 = 0.5f * d03 + d13 + 1.5f * d23 + src_data_72; - const float16_t t13 = 0.5f * d04 + d14 + 1.5f * d24 + src_data_73; - const float16_t t14 = 0.5f * d05 + d15 + 1.5f * d25 + src_data_74; - const float16_t t15 = 0.5f * d06 + d16 + 1.5f * d26 + src_data_75; - const float16_t t16 = 0.5f * d07 + d17 + 1.5f * d27 + src_data_76; - const float16_t t17 = 0.5f * d08 + d18 + 1.5f * d28 + src_data_77; - - float16_t s11 = t01 - t02; - float16_t s12 = t11 - t12; - float16_t s21 = t03 - t04; - float16_t s22 = t13 - t14; - float16_t s31 = t05 - t06; - float16_t s32 = t15 - t16; - - float16_t m00 = t00 + t01 + t02 + t03 + t04 + t05 + t06; - const float16_t m01 = 0.5f * s11 + s21 + 1.5f * s31 + t07; - float16_t m10 = t10 + t11 + t12 + t13 + t14 + t15 + t16; - const float16_t m11 = 0.5f * s12 + s22 + 1.5f * s32 + t17; - - (dst_data + i)[0] = m00 + bias_data[i]; - (dst_data + i + C8NUM)[0] = m01 + bias_data[i]; - (dst_data + i + dst_step * C8NUM)[0] = m10 + bias_data[i]; - (dst_data + i + dst_step * C8NUM + C8NUM)[0] = m11 + bias_data[i]; - } -#endif -} - -void OutputTransform8x3UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data, - int src_step, int dst_step) { -#ifdef ENABLE_ARM - float16x8_t src_data_00 = vld1q_f16(src_data + 0 * src_step); - float16x8_t src_data_01 = vld1q_f16(src_data + 1 * src_step); - float16x8_t src_data_02 = vld1q_f16(src_data + 2 * src_step); - float16x8_t src_data_03 = vld1q_f16(src_data + 3 * src_step); - float16x8_t src_data_04 = vld1q_f16(src_data + 4 * src_step); - float16x8_t src_data_05 = vld1q_f16(src_data + 5 * src_step); - float16x8_t src_data_06 = vld1q_f16(src_data + 6 * src_step); - 
float16x8_t src_data_07 = vld1q_f16(src_data + 7 * src_step); - float16x8_t src_data_10 = vld1q_f16(src_data + 8 * src_step); - float16x8_t src_data_11 = vld1q_f16(src_data + 9 * src_step); - float16x8_t src_data_12 = vld1q_f16(src_data + 10 * src_step); - float16x8_t src_data_13 = vld1q_f16(src_data + 11 * src_step); - float16x8_t src_data_14 = vld1q_f16(src_data + 12 * src_step); - float16x8_t src_data_15 = vld1q_f16(src_data + 13 * src_step); - float16x8_t src_data_16 = vld1q_f16(src_data + 14 * src_step); - float16x8_t src_data_17 = vld1q_f16(src_data + 15 * src_step); - float16x8_t src_data_20 = vld1q_f16(src_data + 16 * src_step); - float16x8_t src_data_21 = vld1q_f16(src_data + 17 * src_step); - float16x8_t src_data_22 = vld1q_f16(src_data + 18 * src_step); - float16x8_t src_data_23 = vld1q_f16(src_data + 19 * src_step); - float16x8_t src_data_24 = vld1q_f16(src_data + 20 * src_step); - float16x8_t src_data_25 = vld1q_f16(src_data + 21 * src_step); - float16x8_t src_data_26 = vld1q_f16(src_data + 22 * src_step); - float16x8_t src_data_27 = vld1q_f16(src_data + 23 * src_step); - float16x8_t src_data_30 = vld1q_f16(src_data + 24 * src_step); - float16x8_t src_data_31 = vld1q_f16(src_data + 25 * src_step); - float16x8_t src_data_32 = vld1q_f16(src_data + 26 * src_step); - float16x8_t src_data_33 = vld1q_f16(src_data + 27 * src_step); - float16x8_t src_data_34 = vld1q_f16(src_data + 28 * src_step); - float16x8_t src_data_35 = vld1q_f16(src_data + 29 * src_step); - float16x8_t src_data_36 = vld1q_f16(src_data + 30 * src_step); - float16x8_t src_data_37 = vld1q_f16(src_data + 31 * src_step); - float16x8_t src_data_40 = vld1q_f16(src_data + 32 * src_step); - float16x8_t src_data_41 = vld1q_f16(src_data + 33 * src_step); - float16x8_t src_data_42 = vld1q_f16(src_data + 34 * src_step); - float16x8_t src_data_43 = vld1q_f16(src_data + 35 * src_step); - float16x8_t src_data_44 = vld1q_f16(src_data + 36 * src_step); - float16x8_t src_data_45 = vld1q_f16(src_data + 37 * 
src_step); - float16x8_t src_data_46 = vld1q_f16(src_data + 38 * src_step); - float16x8_t src_data_47 = vld1q_f16(src_data + 39 * src_step); - float16x8_t src_data_50 = vld1q_f16(src_data + 40 * src_step); - float16x8_t src_data_51 = vld1q_f16(src_data + 41 * src_step); - float16x8_t src_data_52 = vld1q_f16(src_data + 42 * src_step); - float16x8_t src_data_53 = vld1q_f16(src_data + 43 * src_step); - float16x8_t src_data_54 = vld1q_f16(src_data + 44 * src_step); - float16x8_t src_data_55 = vld1q_f16(src_data + 45 * src_step); - float16x8_t src_data_56 = vld1q_f16(src_data + 46 * src_step); - float16x8_t src_data_57 = vld1q_f16(src_data + 47 * src_step); - float16x8_t src_data_60 = vld1q_f16(src_data + 48 * src_step); - float16x8_t src_data_61 = vld1q_f16(src_data + 49 * src_step); - float16x8_t src_data_62 = vld1q_f16(src_data + 50 * src_step); - float16x8_t src_data_63 = vld1q_f16(src_data + 51 * src_step); - float16x8_t src_data_64 = vld1q_f16(src_data + 52 * src_step); - float16x8_t src_data_65 = vld1q_f16(src_data + 53 * src_step); - float16x8_t src_data_66 = vld1q_f16(src_data + 54 * src_step); - float16x8_t src_data_67 = vld1q_f16(src_data + 55 * src_step); - float16x8_t src_data_70 = vld1q_f16(src_data + 56 * src_step); - float16x8_t src_data_71 = vld1q_f16(src_data + 57 * src_step); - float16x8_t src_data_72 = vld1q_f16(src_data + 58 * src_step); - float16x8_t src_data_73 = vld1q_f16(src_data + 59 * src_step); - float16x8_t src_data_74 = vld1q_f16(src_data + 60 * src_step); - float16x8_t src_data_75 = vld1q_f16(src_data + 61 * src_step); - float16x8_t src_data_76 = vld1q_f16(src_data + 62 * src_step); - float16x8_t src_data_77 = vld1q_f16(src_data + 63 * src_step); - - float16x8_t d01 = vsubq_f16(src_data_10, src_data_20); - float16x8_t d02 = vsubq_f16(src_data_11, src_data_21); - float16x8_t d03 = vsubq_f16(src_data_12, src_data_22); - float16x8_t d04 = vsubq_f16(src_data_13, src_data_23); - float16x8_t d05 = vsubq_f16(src_data_14, src_data_24); - 
float16x8_t d06 = vsubq_f16(src_data_15, src_data_25); - float16x8_t d07 = vsubq_f16(src_data_16, src_data_26); - float16x8_t d08 = vsubq_f16(src_data_17, src_data_27); - - float16x8_t d11 = vsubq_f16(src_data_30, src_data_40); - float16x8_t d12 = vsubq_f16(src_data_31, src_data_41); - float16x8_t d13 = vsubq_f16(src_data_32, src_data_42); - float16x8_t d14 = vsubq_f16(src_data_33, src_data_43); - float16x8_t d15 = vsubq_f16(src_data_34, src_data_44); - float16x8_t d16 = vsubq_f16(src_data_35, src_data_45); - float16x8_t d17 = vsubq_f16(src_data_36, src_data_46); - float16x8_t d18 = vsubq_f16(src_data_37, src_data_47); - - float16x8_t d21 = vsubq_f16(src_data_50, src_data_60); - float16x8_t d22 = vsubq_f16(src_data_51, src_data_61); - float16x8_t d23 = vsubq_f16(src_data_52, src_data_62); - float16x8_t d24 = vsubq_f16(src_data_53, src_data_63); - float16x8_t d25 = vsubq_f16(src_data_54, src_data_64); - float16x8_t d26 = vsubq_f16(src_data_55, src_data_65); - float16x8_t d27 = vsubq_f16(src_data_56, src_data_66); - float16x8_t d28 = vsubq_f16(src_data_57, src_data_67); - - float16x8_t d31 = vaddq_f16(src_data_10, src_data_20); - float16x8_t d32 = vaddq_f16(src_data_11, src_data_21); - float16x8_t d33 = vaddq_f16(src_data_12, src_data_22); - float16x8_t d34 = vaddq_f16(src_data_13, src_data_23); - float16x8_t d35 = vaddq_f16(src_data_14, src_data_24); - float16x8_t d36 = vaddq_f16(src_data_15, src_data_25); - float16x8_t d37 = vaddq_f16(src_data_16, src_data_26); - float16x8_t d38 = vaddq_f16(src_data_17, src_data_27); - - float16x8_t d41 = vaddq_f16(src_data_30, src_data_40); - float16x8_t d42 = vaddq_f16(src_data_31, src_data_41); - float16x8_t d43 = vaddq_f16(src_data_32, src_data_42); - float16x8_t d44 = vaddq_f16(src_data_33, src_data_43); - float16x8_t d45 = vaddq_f16(src_data_34, src_data_44); - float16x8_t d46 = vaddq_f16(src_data_35, src_data_45); - float16x8_t d47 = vaddq_f16(src_data_36, src_data_46); - float16x8_t d48 = vaddq_f16(src_data_37, 
src_data_47); - - float16x8_t d51 = vaddq_f16(src_data_50, src_data_60); - float16x8_t d52 = vaddq_f16(src_data_51, src_data_61); - float16x8_t d53 = vaddq_f16(src_data_52, src_data_62); - float16x8_t d54 = vaddq_f16(src_data_53, src_data_63); - float16x8_t d55 = vaddq_f16(src_data_54, src_data_64); - float16x8_t d56 = vaddq_f16(src_data_55, src_data_65); - float16x8_t d57 = vaddq_f16(src_data_56, src_data_66); - float16x8_t d58 = vaddq_f16(src_data_57, src_data_67); - - float16x8_t t00 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_00, src_data_10), src_data_20), src_data_30), src_data_40), - src_data_50), - src_data_60); - float16x8_t t01 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_01, src_data_11), src_data_21), src_data_31), src_data_41), - src_data_51), - src_data_61); - float16x8_t t02 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_02, src_data_12), src_data_22), src_data_32), src_data_42), - src_data_52), - src_data_62); - float16x8_t t03 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_03, src_data_13), src_data_23), src_data_33), src_data_43), - src_data_53), - src_data_63); - float16x8_t t04 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_04, src_data_14), src_data_24), src_data_34), src_data_44), - src_data_54), - src_data_64); - float16x8_t t05 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_05, src_data_15), src_data_25), src_data_35), src_data_45), - src_data_55), - src_data_65); - float16x8_t t06 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_06, src_data_16), src_data_26), src_data_36), src_data_46), - src_data_56), - src_data_66); - float16x8_t t07 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_07, src_data_17), src_data_27), src_data_37), src_data_47), - src_data_57), - src_data_67); - - float16x8_t t10 = 
vaddq_f16(vaddq_f16(vmulq_n_f16(d01, 0.5), d11), vmulq_n_f16(d21, 1.5)); - float16x8_t t11 = vaddq_f16(vaddq_f16(vmulq_n_f16(d02, 0.5), d12), vmulq_n_f16(d22, 1.5)); - float16x8_t t12 = vaddq_f16(vaddq_f16(vmulq_n_f16(d03, 0.5), d13), vmulq_n_f16(d23, 1.5)); - float16x8_t t13 = vaddq_f16(vaddq_f16(vmulq_n_f16(d04, 0.5), d14), vmulq_n_f16(d24, 1.5)); - float16x8_t t14 = vaddq_f16(vaddq_f16(vmulq_n_f16(d05, 0.5), d15), vmulq_n_f16(d25, 1.5)); - float16x8_t t15 = vaddq_f16(vaddq_f16(vmulq_n_f16(d06, 0.5), d16), vmulq_n_f16(d26, 1.5)); - float16x8_t t16 = vaddq_f16(vaddq_f16(vmulq_n_f16(d07, 0.5), d17), vmulq_n_f16(d27, 1.5)); - float16x8_t t17 = vaddq_f16(vaddq_f16(vmulq_n_f16(d08, 0.5), d18), vmulq_n_f16(d28, 1.5)); - - float16x8_t t20 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d31, 0.25), d41), vmulq_n_f16(d51, 2.25)), src_data_70); - float16x8_t t21 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d32, 0.25), d42), vmulq_n_f16(d52, 2.25)), src_data_71); - float16x8_t t22 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d33, 0.25), d43), vmulq_n_f16(d53, 2.25)), src_data_72); - float16x8_t t23 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d34, 0.25), d44), vmulq_n_f16(d54, 2.25)), src_data_73); - float16x8_t t24 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d35, 0.25), d45), vmulq_n_f16(d55, 2.25)), src_data_74); - float16x8_t t25 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d36, 0.25), d46), vmulq_n_f16(d56, 2.25)), src_data_75); - float16x8_t t26 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d37, 0.25), d47), vmulq_n_f16(d57, 2.25)), src_data_76); - float16x8_t t27 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d38, 0.25), d48), vmulq_n_f16(d58, 2.25)), src_data_77); - - float16x8_t s11 = vsubq_f16(t01, t02); - float16x8_t s12 = vsubq_f16(t11, t12); - float16x8_t s13 = vsubq_f16(t21, t22); - - float16x8_t s21 = vsubq_f16(t03, t04); - float16x8_t s22 = vsubq_f16(t13, t14); - float16x8_t s23 = vsubq_f16(t23, t24); - - float16x8_t s31 = vsubq_f16(t05, t06); - float16x8_t s32 = 
vsubq_f16(t15, t16); - float16x8_t s33 = vsubq_f16(t25, t26); - - float16x8_t s41 = vaddq_f16(t01, t02); - float16x8_t s42 = vaddq_f16(t11, t12); - float16x8_t s43 = vaddq_f16(t21, t22); - - float16x8_t s51 = vaddq_f16(t03, t04); - float16x8_t s52 = vaddq_f16(t13, t14); - float16x8_t s53 = vaddq_f16(t23, t24); - - float16x8_t s61 = vaddq_f16(t05, t06); - float16x8_t s62 = vaddq_f16(t15, t16); - float16x8_t s63 = vaddq_f16(t25, t26); - - float16x8_t m00 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t00, t01), t02), t03), t04), t05), t06); - float16x8_t m01 = vaddq_f16(vaddq_f16(vmulq_n_f16(s11, 0.5), s21), vmulq_n_f16(s31, 1.5)); - float16x8_t m02 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s41, 0.25), s51), vmulq_n_f16(s61, 2.25)), t07); - - float16x8_t m10 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t10, t11), t12), t13), t14), t15), t16); - float16x8_t m11 = vaddq_f16(vaddq_f16(vmulq_n_f16(s12, 0.5), s22), vmulq_n_f16(s32, 1.5)); - float16x8_t m12 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s42, 0.25), s52), vmulq_n_f16(s62, 2.25)), t17); - - float16x8_t m20 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t20, t21), t22), t23), t24), t25), t26); - float16x8_t m21 = vaddq_f16(vaddq_f16(vmulq_n_f16(s13, 0.5), s23), vmulq_n_f16(s33, 1.5)); - float16x8_t m22 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s43, 0.25), s53), vmulq_n_f16(s63, 2.25)), t27); - - float16x8_t bias_ptr = vld1q_f16(bias_data); - vst1q_f16(dst_data, vaddq_f16(m00, bias_ptr)); - vst1q_f16(dst_data + C8NUM, vaddq_f16(m01, bias_ptr)); - vst1q_f16(dst_data + 2 * C8NUM, vaddq_f16(m02, bias_ptr)); - - vst1q_f16(dst_data + dst_step * C8NUM, vaddq_f16(m10, bias_ptr)); - vst1q_f16(dst_data + dst_step * C8NUM + C8NUM, vaddq_f16(m11, bias_ptr)); - vst1q_f16(dst_data + dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m12, bias_ptr)); - - vst1q_f16(dst_data + 2 * dst_step * C8NUM, vaddq_f16(m20, bias_ptr)); - vst1q_f16(dst_data + 2 * dst_step * C8NUM + C8NUM, 
vaddq_f16(m21, bias_ptr)); - vst1q_f16(dst_data + 2 * dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m22, bias_ptr)); -#else - for (int i = 0; i < C8NUM; i++) { - float16_t src_data_00 = src_data[i]; - float16_t src_data_01 = src_data[i + src_step]; - float16_t src_data_02 = src_data[i + 2 * src_step]; - float16_t src_data_03 = src_data[i + 3 * src_step]; - float16_t src_data_04 = src_data[i + 4 * src_step]; - float16_t src_data_05 = src_data[i + 5 * src_step]; - float16_t src_data_06 = src_data[i + 6 * src_step]; - float16_t src_data_07 = src_data[i + 7 * src_step]; - float16_t src_data_10 = src_data[i + 8 * src_step]; - float16_t src_data_11 = src_data[i + 9 * src_step]; - float16_t src_data_12 = src_data[i + 10 * src_step]; - float16_t src_data_13 = src_data[i + 11 * src_step]; - float16_t src_data_14 = src_data[i + 12 * src_step]; - float16_t src_data_15 = src_data[i + 13 * src_step]; - float16_t src_data_16 = src_data[i + 14 * src_step]; - float16_t src_data_17 = src_data[i + 15 * src_step]; - float16_t src_data_20 = src_data[i + 16 * src_step]; - float16_t src_data_21 = src_data[i + 17 * src_step]; - float16_t src_data_22 = src_data[i + 18 * src_step]; - float16_t src_data_23 = src_data[i + 19 * src_step]; - float16_t src_data_24 = src_data[i + 20 * src_step]; - float16_t src_data_25 = src_data[i + 21 * src_step]; - float16_t src_data_26 = src_data[i + 22 * src_step]; - float16_t src_data_27 = src_data[i + 23 * src_step]; - float16_t src_data_30 = src_data[i + 24 * src_step]; - float16_t src_data_31 = src_data[i + 25 * src_step]; - float16_t src_data_32 = src_data[i + 26 * src_step]; - float16_t src_data_33 = src_data[i + 27 * src_step]; - float16_t src_data_34 = src_data[i + 28 * src_step]; - float16_t src_data_35 = src_data[i + 29 * src_step]; - float16_t src_data_36 = src_data[i + 30 * src_step]; - float16_t src_data_37 = src_data[i + 31 * src_step]; - float16_t src_data_40 = src_data[i + 32 * src_step]; - float16_t src_data_41 = src_data[i + 33 * src_step]; - 
float16_t src_data_42 = src_data[i + 34 * src_step]; - float16_t src_data_43 = src_data[i + 35 * src_step]; - float16_t src_data_44 = src_data[i + 36 * src_step]; - float16_t src_data_45 = src_data[i + 37 * src_step]; - float16_t src_data_46 = src_data[i + 38 * src_step]; - float16_t src_data_47 = src_data[i + 39 * src_step]; - float16_t src_data_50 = src_data[i + 40 * src_step]; - float16_t src_data_51 = src_data[i + 41 * src_step]; - float16_t src_data_52 = src_data[i + 42 * src_step]; - float16_t src_data_53 = src_data[i + 43 * src_step]; - float16_t src_data_54 = src_data[i + 44 * src_step]; - float16_t src_data_55 = src_data[i + 45 * src_step]; - float16_t src_data_56 = src_data[i + 46 * src_step]; - float16_t src_data_57 = src_data[i + 47 * src_step]; - float16_t src_data_60 = src_data[i + 48 * src_step]; - float16_t src_data_61 = src_data[i + 49 * src_step]; - float16_t src_data_62 = src_data[i + 50 * src_step]; - float16_t src_data_63 = src_data[i + 51 * src_step]; - float16_t src_data_64 = src_data[i + 52 * src_step]; - float16_t src_data_65 = src_data[i + 53 * src_step]; - float16_t src_data_66 = src_data[i + 54 * src_step]; - float16_t src_data_67 = src_data[i + 55 * src_step]; - float16_t src_data_70 = src_data[i + 56 * src_step]; - float16_t src_data_71 = src_data[i + 57 * src_step]; - float16_t src_data_72 = src_data[i + 58 * src_step]; - float16_t src_data_73 = src_data[i + 59 * src_step]; - float16_t src_data_74 = src_data[i + 60 * src_step]; - float16_t src_data_75 = src_data[i + 61 * src_step]; - float16_t src_data_76 = src_data[i + 62 * src_step]; - float16_t src_data_77 = src_data[i + 63 * src_step]; - - float16_t d01 = src_data_10 - src_data_20; - float16_t d02 = src_data_11 - src_data_21; - float16_t d03 = src_data_12 - src_data_22; - float16_t d04 = src_data_13 - src_data_23; - float16_t d05 = src_data_14 - src_data_24; - float16_t d06 = src_data_15 - src_data_25; - float16_t d07 = src_data_16 - src_data_26; - float16_t d08 = src_data_17 - 
src_data_27; - - float16_t d11 = src_data_30 - src_data_40; - float16_t d12 = src_data_31 - src_data_41; - float16_t d13 = src_data_32 - src_data_42; - float16_t d14 = src_data_33 - src_data_43; - float16_t d15 = src_data_34 - src_data_44; - float16_t d16 = src_data_35 - src_data_45; - float16_t d17 = src_data_36 - src_data_46; - float16_t d18 = src_data_37 - src_data_47; - - float16_t d21 = src_data_50 - src_data_60; - float16_t d22 = src_data_51 - src_data_61; - float16_t d23 = src_data_52 - src_data_62; - float16_t d24 = src_data_53 - src_data_63; - float16_t d25 = src_data_54 - src_data_64; - float16_t d26 = src_data_55 - src_data_65; - float16_t d27 = src_data_56 - src_data_66; - float16_t d28 = src_data_57 - src_data_67; - - float16_t d31 = src_data_10 + src_data_20; - float16_t d32 = src_data_11 + src_data_21; - float16_t d33 = src_data_12 + src_data_22; - float16_t d34 = src_data_13 + src_data_23; - float16_t d35 = src_data_14 + src_data_24; - float16_t d36 = src_data_15 + src_data_25; - float16_t d37 = src_data_16 + src_data_26; - float16_t d38 = src_data_17 + src_data_27; - - float16_t d41 = src_data_30 + src_data_40; - float16_t d42 = src_data_31 + src_data_41; - float16_t d43 = src_data_32 + src_data_42; - float16_t d44 = src_data_33 + src_data_43; - float16_t d45 = src_data_34 + src_data_44; - float16_t d46 = src_data_35 + src_data_45; - float16_t d47 = src_data_36 + src_data_46; - float16_t d48 = src_data_37 + src_data_47; - - float16_t d51 = src_data_50 + src_data_60; - float16_t d52 = src_data_51 + src_data_61; - float16_t d53 = src_data_52 + src_data_62; - float16_t d54 = src_data_53 + src_data_63; - float16_t d55 = src_data_54 + src_data_64; - float16_t d56 = src_data_55 + src_data_65; - float16_t d57 = src_data_56 + src_data_66; - float16_t d58 = src_data_57 + src_data_67; - - float16_t t00 = src_data_00 + src_data_10 + src_data_20 + src_data_30 + src_data_40 + src_data_50 + src_data_60; - float16_t t01 = src_data_01 + src_data_11 + src_data_21 + 
src_data_31 + src_data_41 + src_data_51 + src_data_61; - float16_t t02 = src_data_02 + src_data_12 + src_data_22 + src_data_32 + src_data_42 + src_data_52 + src_data_62; - float16_t t03 = src_data_03 + src_data_13 + src_data_23 + src_data_33 + src_data_43 + src_data_53 + src_data_63; - float16_t t04 = src_data_04 + src_data_14 + src_data_24 + src_data_34 + src_data_44 + src_data_54 + src_data_64; - float16_t t05 = src_data_05 + src_data_15 + src_data_25 + src_data_35 + src_data_45 + src_data_55 + src_data_65; - float16_t t06 = src_data_06 + src_data_16 + src_data_26 + src_data_36 + src_data_46 + src_data_56 + src_data_66; - float16_t t07 = src_data_07 + src_data_17 + src_data_27 + src_data_37 + src_data_47 + src_data_57 + src_data_67; - - const float16_t t10 = 0.5f * d01 + d11 + 1.5f * d21; - const float16_t t11 = 0.5f * d02 + d12 + 1.5f * d22; - const float16_t t12 = 0.5f * d03 + d13 + 1.5f * d23; - const float16_t t13 = 0.5f * d04 + d14 + 1.5f * d24; - const float16_t t14 = 0.5f * d05 + d15 + 1.5f * d25; - const float16_t t15 = 0.5f * d06 + d16 + 1.5f * d26; - const float16_t t16 = 0.5f * d07 + d17 + 1.5f * d27; - const float16_t t17 = 0.5f * d08 + d18 + 1.5f * d28; - - const float16_t t20 = 0.25f * d31 + d41 + 2.25f * d51 + src_data_70; - const float16_t t21 = 0.25f * d32 + d42 + 2.25f * d52 + src_data_71; - const float16_t t22 = 0.25f * d33 + d43 + 2.25f * d53 + src_data_72; - const float16_t t23 = 0.25f * d34 + d44 + 2.25f * d54 + src_data_73; - const float16_t t24 = 0.25f * d35 + d45 + 2.25f * d55 + src_data_74; - const float16_t t25 = 0.25f * d36 + d46 + 2.25f * d56 + src_data_75; - const float16_t t26 = 0.25f * d37 + d47 + 2.25f * d57 + src_data_76; - const float16_t t27 = 0.25f * d38 + d48 + 2.25f * d58 + src_data_77; - - float16_t s11 = t01 - t02; - float16_t s12 = t11 - t12; - float16_t s13 = t21 - t22; - - float16_t s21 = t03 - t04; - float16_t s22 = t13 - t14; - float16_t s23 = t23 - t24; - - float16_t s31 = t05 - t06; - float16_t s32 = t15 - t16; - 
float16_t s33 = t25 - t26; - - float16_t s41 = t01 + t02; - float16_t s42 = t11 + t12; - float16_t s43 = t21 + t22; - - float16_t s51 = t03 + t04; - float16_t s52 = t13 + t14; - float16_t s53 = t23 + t24; - - float16_t s61 = t05 + t06; - float16_t s62 = t15 + t16; - float16_t s63 = t25 + t26; - - float16_t m00 = t00 + t01 + t02 + t03 + t04 + t05 + t06; - const float16_t m01 = 0.5f * s11 + s21 + 1.5f * s31; - const float16_t m02 = 0.25f * s41 + s51 + 2.25f * s61 + t07; - - float16_t m10 = t10 + t11 + t12 + t13 + t14 + t15 + t16; - const float16_t m11 = 0.5f * s12 + s22 + 1.5f * s32; - const float16_t m12 = 0.25f * s42 + s52 + 2.25f * s62 + t17; - - float16_t m20 = t20 + t21 + t22 + t23 + t24 + t25 + t26; - const float16_t m21 = 0.5f * s13 + s23 + 1.5f * s33; - const float16_t m22 = 0.25f * s43 + s53 + 2.25f * s63 + t27; - - (dst_data + i)[0] = m00 + bias_data[i]; - (dst_data + i + C8NUM)[0] = m01 + bias_data[i]; - (dst_data + i + 2 * C8NUM)[0] = m02 + bias_data[i]; - - (dst_data + i + dst_step * C8NUM)[0] = m10 + bias_data[i]; - (dst_data + i + dst_step * C8NUM + C8NUM)[0] = m11 + bias_data[i]; - (dst_data + i + dst_step * C8NUM + 2 * C8NUM)[0] = m12 + bias_data[i]; - - (dst_data + i + 2 * dst_step * C8NUM)[0] = m20 + bias_data[i]; - (dst_data + i + 2 * dst_step * C8NUM + C8NUM)[0] = m21 + bias_data[i]; - (dst_data + i + 2 * dst_step * C8NUM + 2 * C8NUM)[0] = m22 + bias_data[i]; - } -#endif -} - -void OutputTransform8x4UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data, - int src_step, int dst_step) { -#ifdef ENABLE_ARM - float16x8_t src_data_00 = vld1q_f16(src_data + 0 * src_step); - float16x8_t src_data_01 = vld1q_f16(src_data + 1 * src_step); - float16x8_t src_data_02 = vld1q_f16(src_data + 2 * src_step); - float16x8_t src_data_03 = vld1q_f16(src_data + 3 * src_step); - float16x8_t src_data_04 = vld1q_f16(src_data + 4 * src_step); - float16x8_t src_data_05 = vld1q_f16(src_data + 5 * src_step); - float16x8_t src_data_06 = 
vld1q_f16(src_data + 6 * src_step); - float16x8_t src_data_07 = vld1q_f16(src_data + 7 * src_step); - float16x8_t src_data_10 = vld1q_f16(src_data + 8 * src_step); - float16x8_t src_data_11 = vld1q_f16(src_data + 9 * src_step); - float16x8_t src_data_12 = vld1q_f16(src_data + 10 * src_step); - float16x8_t src_data_13 = vld1q_f16(src_data + 11 * src_step); - float16x8_t src_data_14 = vld1q_f16(src_data + 12 * src_step); - float16x8_t src_data_15 = vld1q_f16(src_data + 13 * src_step); - float16x8_t src_data_16 = vld1q_f16(src_data + 14 * src_step); - float16x8_t src_data_17 = vld1q_f16(src_data + 15 * src_step); - float16x8_t src_data_20 = vld1q_f16(src_data + 16 * src_step); - float16x8_t src_data_21 = vld1q_f16(src_data + 17 * src_step); - float16x8_t src_data_22 = vld1q_f16(src_data + 18 * src_step); - float16x8_t src_data_23 = vld1q_f16(src_data + 19 * src_step); - float16x8_t src_data_24 = vld1q_f16(src_data + 20 * src_step); - float16x8_t src_data_25 = vld1q_f16(src_data + 21 * src_step); - float16x8_t src_data_26 = vld1q_f16(src_data + 22 * src_step); - float16x8_t src_data_27 = vld1q_f16(src_data + 23 * src_step); - float16x8_t src_data_30 = vld1q_f16(src_data + 24 * src_step); - float16x8_t src_data_31 = vld1q_f16(src_data + 25 * src_step); - float16x8_t src_data_32 = vld1q_f16(src_data + 26 * src_step); - float16x8_t src_data_33 = vld1q_f16(src_data + 27 * src_step); - float16x8_t src_data_34 = vld1q_f16(src_data + 28 * src_step); - float16x8_t src_data_35 = vld1q_f16(src_data + 29 * src_step); - float16x8_t src_data_36 = vld1q_f16(src_data + 30 * src_step); - float16x8_t src_data_37 = vld1q_f16(src_data + 31 * src_step); - float16x8_t src_data_40 = vld1q_f16(src_data + 32 * src_step); - float16x8_t src_data_41 = vld1q_f16(src_data + 33 * src_step); - float16x8_t src_data_42 = vld1q_f16(src_data + 34 * src_step); - float16x8_t src_data_43 = vld1q_f16(src_data + 35 * src_step); - float16x8_t src_data_44 = vld1q_f16(src_data + 36 * src_step); - float16x8_t 
src_data_45 = vld1q_f16(src_data + 37 * src_step); - float16x8_t src_data_46 = vld1q_f16(src_data + 38 * src_step); - float16x8_t src_data_47 = vld1q_f16(src_data + 39 * src_step); - float16x8_t src_data_50 = vld1q_f16(src_data + 40 * src_step); - float16x8_t src_data_51 = vld1q_f16(src_data + 41 * src_step); - float16x8_t src_data_52 = vld1q_f16(src_data + 42 * src_step); - float16x8_t src_data_53 = vld1q_f16(src_data + 43 * src_step); - float16x8_t src_data_54 = vld1q_f16(src_data + 44 * src_step); - float16x8_t src_data_55 = vld1q_f16(src_data + 45 * src_step); - float16x8_t src_data_56 = vld1q_f16(src_data + 46 * src_step); - float16x8_t src_data_57 = vld1q_f16(src_data + 47 * src_step); - float16x8_t src_data_60 = vld1q_f16(src_data + 48 * src_step); - float16x8_t src_data_61 = vld1q_f16(src_data + 49 * src_step); - float16x8_t src_data_62 = vld1q_f16(src_data + 50 * src_step); - float16x8_t src_data_63 = vld1q_f16(src_data + 51 * src_step); - float16x8_t src_data_64 = vld1q_f16(src_data + 52 * src_step); - float16x8_t src_data_65 = vld1q_f16(src_data + 53 * src_step); - float16x8_t src_data_66 = vld1q_f16(src_data + 54 * src_step); - float16x8_t src_data_67 = vld1q_f16(src_data + 55 * src_step); - float16x8_t src_data_70 = vld1q_f16(src_data + 56 * src_step); - float16x8_t src_data_71 = vld1q_f16(src_data + 57 * src_step); - float16x8_t src_data_72 = vld1q_f16(src_data + 58 * src_step); - float16x8_t src_data_73 = vld1q_f16(src_data + 59 * src_step); - float16x8_t src_data_74 = vld1q_f16(src_data + 60 * src_step); - float16x8_t src_data_75 = vld1q_f16(src_data + 61 * src_step); - float16x8_t src_data_76 = vld1q_f16(src_data + 62 * src_step); - float16x8_t src_data_77 = vld1q_f16(src_data + 63 * src_step); - - float16x8_t d01 = vsubq_f16(src_data_10, src_data_20); - float16x8_t d02 = vsubq_f16(src_data_11, src_data_21); - float16x8_t d03 = vsubq_f16(src_data_12, src_data_22); - float16x8_t d04 = vsubq_f16(src_data_13, src_data_23); - float16x8_t d05 = 
vsubq_f16(src_data_14, src_data_24); - float16x8_t d06 = vsubq_f16(src_data_15, src_data_25); - float16x8_t d07 = vsubq_f16(src_data_16, src_data_26); - float16x8_t d08 = vsubq_f16(src_data_17, src_data_27); - - float16x8_t d11 = vsubq_f16(src_data_30, src_data_40); - float16x8_t d12 = vsubq_f16(src_data_31, src_data_41); - float16x8_t d13 = vsubq_f16(src_data_32, src_data_42); - float16x8_t d14 = vsubq_f16(src_data_33, src_data_43); - float16x8_t d15 = vsubq_f16(src_data_34, src_data_44); - float16x8_t d16 = vsubq_f16(src_data_35, src_data_45); - float16x8_t d17 = vsubq_f16(src_data_36, src_data_46); - float16x8_t d18 = vsubq_f16(src_data_37, src_data_47); - - float16x8_t d21 = vsubq_f16(src_data_50, src_data_60); - float16x8_t d22 = vsubq_f16(src_data_51, src_data_61); - float16x8_t d23 = vsubq_f16(src_data_52, src_data_62); - float16x8_t d24 = vsubq_f16(src_data_53, src_data_63); - float16x8_t d25 = vsubq_f16(src_data_54, src_data_64); - float16x8_t d26 = vsubq_f16(src_data_55, src_data_65); - float16x8_t d27 = vsubq_f16(src_data_56, src_data_66); - float16x8_t d28 = vsubq_f16(src_data_57, src_data_67); - - float16x8_t d31 = vaddq_f16(src_data_10, src_data_20); - float16x8_t d32 = vaddq_f16(src_data_11, src_data_21); - float16x8_t d33 = vaddq_f16(src_data_12, src_data_22); - float16x8_t d34 = vaddq_f16(src_data_13, src_data_23); - float16x8_t d35 = vaddq_f16(src_data_14, src_data_24); - float16x8_t d36 = vaddq_f16(src_data_15, src_data_25); - float16x8_t d37 = vaddq_f16(src_data_16, src_data_26); - float16x8_t d38 = vaddq_f16(src_data_17, src_data_27); - - float16x8_t d41 = vaddq_f16(src_data_30, src_data_40); - float16x8_t d42 = vaddq_f16(src_data_31, src_data_41); - float16x8_t d43 = vaddq_f16(src_data_32, src_data_42); - float16x8_t d44 = vaddq_f16(src_data_33, src_data_43); - float16x8_t d45 = vaddq_f16(src_data_34, src_data_44); - float16x8_t d46 = vaddq_f16(src_data_35, src_data_45); - float16x8_t d47 = vaddq_f16(src_data_36, src_data_46); - float16x8_t 
d48 = vaddq_f16(src_data_37, src_data_47); - - float16x8_t d51 = vaddq_f16(src_data_50, src_data_60); - float16x8_t d52 = vaddq_f16(src_data_51, src_data_61); - float16x8_t d53 = vaddq_f16(src_data_52, src_data_62); - float16x8_t d54 = vaddq_f16(src_data_53, src_data_63); - float16x8_t d55 = vaddq_f16(src_data_54, src_data_64); - float16x8_t d56 = vaddq_f16(src_data_55, src_data_65); - float16x8_t d57 = vaddq_f16(src_data_56, src_data_66); - float16x8_t d58 = vaddq_f16(src_data_57, src_data_67); - - float16x8_t t00 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_00, src_data_10), src_data_20), src_data_30), src_data_40), - src_data_50), - src_data_60); - float16x8_t t01 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_01, src_data_11), src_data_21), src_data_31), src_data_41), - src_data_51), - src_data_61); - float16x8_t t02 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_02, src_data_12), src_data_22), src_data_32), src_data_42), - src_data_52), - src_data_62); - float16x8_t t03 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_03, src_data_13), src_data_23), src_data_33), src_data_43), - src_data_53), - src_data_63); - float16x8_t t04 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_04, src_data_14), src_data_24), src_data_34), src_data_44), - src_data_54), - src_data_64); - float16x8_t t05 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_05, src_data_15), src_data_25), src_data_35), src_data_45), - src_data_55), - src_data_65); - float16x8_t t06 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_06, src_data_16), src_data_26), src_data_36), src_data_46), - src_data_56), - src_data_66); - float16x8_t t07 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_07, src_data_17), src_data_27), src_data_37), src_data_47), - src_data_57), - src_data_67); 
- - float16x8_t t10 = vaddq_f16(vaddq_f16(vmulq_n_f16(d01, 0.5), d11), vmulq_n_f16(d21, 1.5)); - float16x8_t t11 = vaddq_f16(vaddq_f16(vmulq_n_f16(d02, 0.5), d12), vmulq_n_f16(d22, 1.5)); - float16x8_t t12 = vaddq_f16(vaddq_f16(vmulq_n_f16(d03, 0.5), d13), vmulq_n_f16(d23, 1.5)); - float16x8_t t13 = vaddq_f16(vaddq_f16(vmulq_n_f16(d04, 0.5), d14), vmulq_n_f16(d24, 1.5)); - float16x8_t t14 = vaddq_f16(vaddq_f16(vmulq_n_f16(d05, 0.5), d15), vmulq_n_f16(d25, 1.5)); - float16x8_t t15 = vaddq_f16(vaddq_f16(vmulq_n_f16(d06, 0.5), d16), vmulq_n_f16(d26, 1.5)); - float16x8_t t16 = vaddq_f16(vaddq_f16(vmulq_n_f16(d07, 0.5), d17), vmulq_n_f16(d27, 1.5)); - float16x8_t t17 = vaddq_f16(vaddq_f16(vmulq_n_f16(d08, 0.5), d18), vmulq_n_f16(d28, 1.5)); - - float16x8_t t20 = vaddq_f16(vaddq_f16(vmulq_n_f16(d31, 0.25), d41), vmulq_n_f16(d51, 2.25)); - float16x8_t t21 = vaddq_f16(vaddq_f16(vmulq_n_f16(d32, 0.25), d42), vmulq_n_f16(d52, 2.25)); - float16x8_t t22 = vaddq_f16(vaddq_f16(vmulq_n_f16(d33, 0.25), d43), vmulq_n_f16(d53, 2.25)); - float16x8_t t23 = vaddq_f16(vaddq_f16(vmulq_n_f16(d34, 0.25), d44), vmulq_n_f16(d54, 2.25)); - float16x8_t t24 = vaddq_f16(vaddq_f16(vmulq_n_f16(d35, 0.25), d45), vmulq_n_f16(d55, 2.25)); - float16x8_t t25 = vaddq_f16(vaddq_f16(vmulq_n_f16(d36, 0.25), d46), vmulq_n_f16(d56, 2.25)); - float16x8_t t26 = vaddq_f16(vaddq_f16(vmulq_n_f16(d37, 0.25), d47), vmulq_n_f16(d57, 2.25)); - float16x8_t t27 = vaddq_f16(vaddq_f16(vmulq_n_f16(d38, 0.25), d48), vmulq_n_f16(d58, 2.25)); - - float16x8_t t30 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d01, 0.125), d11), vmulq_n_f16(d21, 3.375)), src_data_70); - float16x8_t t31 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d02, 0.125), d12), vmulq_n_f16(d22, 3.375)), src_data_71); - float16x8_t t32 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d03, 0.125), d13), vmulq_n_f16(d23, 3.375)), src_data_72); - float16x8_t t33 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d04, 0.125), d14), vmulq_n_f16(d24, 3.375)), src_data_73); - 
float16x8_t t34 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d05, 0.125), d15), vmulq_n_f16(d25, 3.375)), src_data_74); - float16x8_t t35 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d06, 0.125), d16), vmulq_n_f16(d26, 3.375)), src_data_75); - float16x8_t t36 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d07, 0.125), d17), vmulq_n_f16(d27, 3.375)), src_data_76); - float16x8_t t37 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d08, 0.125), d18), vmulq_n_f16(d28, 3.375)), src_data_77); - - float16x8_t s11 = vsubq_f16(t01, t02); - float16x8_t s12 = vsubq_f16(t11, t12); - float16x8_t s13 = vsubq_f16(t21, t22); - float16x8_t s14 = vsubq_f16(t31, t32); - - float16x8_t s21 = vsubq_f16(t03, t04); - float16x8_t s22 = vsubq_f16(t13, t14); - float16x8_t s23 = vsubq_f16(t23, t24); - float16x8_t s24 = vsubq_f16(t33, t34); - - float16x8_t s31 = vsubq_f16(t05, t06); - float16x8_t s32 = vsubq_f16(t15, t16); - float16x8_t s33 = vsubq_f16(t25, t26); - float16x8_t s34 = vsubq_f16(t35, t36); - - float16x8_t s41 = vaddq_f16(t01, t02); - float16x8_t s42 = vaddq_f16(t11, t12); - float16x8_t s43 = vaddq_f16(t21, t22); - float16x8_t s44 = vaddq_f16(t31, t32); - - float16x8_t s51 = vaddq_f16(t03, t04); - float16x8_t s52 = vaddq_f16(t13, t14); - float16x8_t s53 = vaddq_f16(t23, t24); - float16x8_t s54 = vaddq_f16(t33, t34); - - float16x8_t s61 = vaddq_f16(t05, t06); - float16x8_t s62 = vaddq_f16(t15, t16); - float16x8_t s63 = vaddq_f16(t25, t26); - float16x8_t s64 = vaddq_f16(t35, t36); - - float16x8_t m00 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t00, t01), t02), t03), t04), t05), t06); - float16x8_t m01 = vaddq_f16(vaddq_f16(vmulq_n_f16(s11, 0.5), s21), vmulq_n_f16(s31, 1.5)); - float16x8_t m02 = vaddq_f16(vaddq_f16(vmulq_n_f16(s41, 0.25), s51), vmulq_n_f16(s61, 2.25)); - float16x8_t m03 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s11, 0.125), s21), vmulq_n_f16(s31, 3.375)), t07); - - float16x8_t m10 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t10, t11), 
t12), t13), t14), t15), t16); - float16x8_t m11 = vaddq_f16(vaddq_f16(vmulq_n_f16(s12, 0.5), s22), vmulq_n_f16(s32, 1.5)); - float16x8_t m12 = vaddq_f16(vaddq_f16(vmulq_n_f16(s42, 0.25), s52), vmulq_n_f16(s62, 2.25)); - float16x8_t m13 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s12, 0.125), s22), vmulq_n_f16(s32, 3.375)), t17); - - float16x8_t m20 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t20, t21), t22), t23), t24), t25), t26); - float16x8_t m21 = vaddq_f16(vaddq_f16(vmulq_n_f16(s13, 0.5), s23), vmulq_n_f16(s33, 1.5)); - float16x8_t m22 = vaddq_f16(vaddq_f16(vmulq_n_f16(s43, 0.25), s53), vmulq_n_f16(s63, 2.25)); - float16x8_t m23 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s13, 0.125), s23), vmulq_n_f16(s33, 3.375)), t27); - - float16x8_t m30 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t30, t31), t32), t33), t34), t35), t36); - float16x8_t m31 = vaddq_f16(vaddq_f16(vmulq_n_f16(s14, 0.5), s24), vmulq_n_f16(s34, 1.5)); - float16x8_t m32 = vaddq_f16(vaddq_f16(vmulq_n_f16(s44, 0.25), s54), vmulq_n_f16(s64, 2.25)); - float16x8_t m33 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s14, 0.125), s24), vmulq_n_f16(s34, 3.375)), t37); - - float16x8_t bias_ptr = vld1q_f16(bias_data); - vst1q_f16(dst_data, vaddq_f16(m00, bias_ptr)); - vst1q_f16(dst_data + C8NUM, vaddq_f16(m01, bias_ptr)); - vst1q_f16(dst_data + 2 * C8NUM, vaddq_f16(m02, bias_ptr)); - vst1q_f16(dst_data + 3 * C8NUM, vaddq_f16(m03, bias_ptr)); - - vst1q_f16(dst_data + dst_step * C8NUM, vaddq_f16(m10, bias_ptr)); - vst1q_f16(dst_data + dst_step * C8NUM + C8NUM, vaddq_f16(m11, bias_ptr)); - vst1q_f16(dst_data + dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m12, bias_ptr)); - vst1q_f16(dst_data + dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m13, bias_ptr)); - - vst1q_f16(dst_data + 2 * dst_step * C8NUM, vaddq_f16(m20, bias_ptr)); - vst1q_f16(dst_data + 2 * dst_step * C8NUM + C8NUM, vaddq_f16(m21, bias_ptr)); - vst1q_f16(dst_data + 2 * dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m22, bias_ptr)); 
- vst1q_f16(dst_data + 2 * dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m23, bias_ptr)); - - vst1q_f16(dst_data + 3 * dst_step * C8NUM, vaddq_f16(m30, bias_ptr)); - vst1q_f16(dst_data + 3 * dst_step * C8NUM + C8NUM, vaddq_f16(m31, bias_ptr)); - vst1q_f16(dst_data + 3 * dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m32, bias_ptr)); - vst1q_f16(dst_data + 3 * dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m33, bias_ptr)); -#else - for (int i = 0; i < C8NUM; i++) { - float16_t src_data_00 = src_data[i]; - float16_t src_data_01 = src_data[i + src_step]; - float16_t src_data_02 = src_data[i + 2 * src_step]; - float16_t src_data_03 = src_data[i + 3 * src_step]; - float16_t src_data_04 = src_data[i + 4 * src_step]; - float16_t src_data_05 = src_data[i + 5 * src_step]; - float16_t src_data_06 = src_data[i + 6 * src_step]; - float16_t src_data_07 = src_data[i + 7 * src_step]; - float16_t src_data_10 = src_data[i + 8 * src_step]; - float16_t src_data_11 = src_data[i + 9 * src_step]; - float16_t src_data_12 = src_data[i + 10 * src_step]; - float16_t src_data_13 = src_data[i + 11 * src_step]; - float16_t src_data_14 = src_data[i + 12 * src_step]; - float16_t src_data_15 = src_data[i + 13 * src_step]; - float16_t src_data_16 = src_data[i + 14 * src_step]; - float16_t src_data_17 = src_data[i + 15 * src_step]; - float16_t src_data_20 = src_data[i + 16 * src_step]; - float16_t src_data_21 = src_data[i + 17 * src_step]; - float16_t src_data_22 = src_data[i + 18 * src_step]; - float16_t src_data_23 = src_data[i + 19 * src_step]; - float16_t src_data_24 = src_data[i + 20 * src_step]; - float16_t src_data_25 = src_data[i + 21 * src_step]; - float16_t src_data_26 = src_data[i + 22 * src_step]; - float16_t src_data_27 = src_data[i + 23 * src_step]; - float16_t src_data_30 = src_data[i + 24 * src_step]; - float16_t src_data_31 = src_data[i + 25 * src_step]; - float16_t src_data_32 = src_data[i + 26 * src_step]; - float16_t src_data_33 = src_data[i + 27 * src_step]; - float16_t src_data_34 = src_data[i + 
28 * src_step]; - float16_t src_data_35 = src_data[i + 29 * src_step]; - float16_t src_data_36 = src_data[i + 30 * src_step]; - float16_t src_data_37 = src_data[i + 31 * src_step]; - float16_t src_data_40 = src_data[i + 32 * src_step]; - float16_t src_data_41 = src_data[i + 33 * src_step]; - float16_t src_data_42 = src_data[i + 34 * src_step]; - float16_t src_data_43 = src_data[i + 35 * src_step]; - float16_t src_data_44 = src_data[i + 36 * src_step]; - float16_t src_data_45 = src_data[i + 37 * src_step]; - float16_t src_data_46 = src_data[i + 38 * src_step]; - float16_t src_data_47 = src_data[i + 39 * src_step]; - float16_t src_data_50 = src_data[i + 40 * src_step]; - float16_t src_data_51 = src_data[i + 41 * src_step]; - float16_t src_data_52 = src_data[i + 42 * src_step]; - float16_t src_data_53 = src_data[i + 43 * src_step]; - float16_t src_data_54 = src_data[i + 44 * src_step]; - float16_t src_data_55 = src_data[i + 45 * src_step]; - float16_t src_data_56 = src_data[i + 46 * src_step]; - float16_t src_data_57 = src_data[i + 47 * src_step]; - float16_t src_data_60 = src_data[i + 48 * src_step]; - float16_t src_data_61 = src_data[i + 49 * src_step]; - float16_t src_data_62 = src_data[i + 50 * src_step]; - float16_t src_data_63 = src_data[i + 51 * src_step]; - float16_t src_data_64 = src_data[i + 52 * src_step]; - float16_t src_data_65 = src_data[i + 53 * src_step]; - float16_t src_data_66 = src_data[i + 54 * src_step]; - float16_t src_data_67 = src_data[i + 55 * src_step]; - float16_t src_data_70 = src_data[i + 56 * src_step]; - float16_t src_data_71 = src_data[i + 57 * src_step]; - float16_t src_data_72 = src_data[i + 58 * src_step]; - float16_t src_data_73 = src_data[i + 59 * src_step]; - float16_t src_data_74 = src_data[i + 60 * src_step]; - float16_t src_data_75 = src_data[i + 61 * src_step]; - float16_t src_data_76 = src_data[i + 62 * src_step]; - float16_t src_data_77 = src_data[i + 63 * src_step]; - - float16_t d01 = src_data_10 - src_data_20; - float16_t 
d02 = src_data_11 - src_data_21; - float16_t d03 = src_data_12 - src_data_22; - float16_t d04 = src_data_13 - src_data_23; - float16_t d05 = src_data_14 - src_data_24; - float16_t d06 = src_data_15 - src_data_25; - float16_t d07 = src_data_16 - src_data_26; - float16_t d08 = src_data_17 - src_data_27; - - float16_t d11 = src_data_30 - src_data_40; - float16_t d12 = src_data_31 - src_data_41; - float16_t d13 = src_data_32 - src_data_42; - float16_t d14 = src_data_33 - src_data_43; - float16_t d15 = src_data_34 - src_data_44; - float16_t d16 = src_data_35 - src_data_45; - float16_t d17 = src_data_36 - src_data_46; - float16_t d18 = src_data_37 - src_data_47; - - float16_t d21 = src_data_50 - src_data_60; - float16_t d22 = src_data_51 - src_data_61; - float16_t d23 = src_data_52 - src_data_62; - float16_t d24 = src_data_53 - src_data_63; - float16_t d25 = src_data_54 - src_data_64; - float16_t d26 = src_data_55 - src_data_65; - float16_t d27 = src_data_56 - src_data_66; - float16_t d28 = src_data_57 - src_data_67; - - float16_t d31 = src_data_10 + src_data_20; - float16_t d32 = src_data_11 + src_data_21; - float16_t d33 = src_data_12 + src_data_22; - float16_t d34 = src_data_13 + src_data_23; - float16_t d35 = src_data_14 + src_data_24; - float16_t d36 = src_data_15 + src_data_25; - float16_t d37 = src_data_16 + src_data_26; - float16_t d38 = src_data_17 + src_data_27; - - float16_t d41 = src_data_30 + src_data_40; - float16_t d42 = src_data_31 + src_data_41; - float16_t d43 = src_data_32 + src_data_42; - float16_t d44 = src_data_33 + src_data_43; - float16_t d45 = src_data_34 + src_data_44; - float16_t d46 = src_data_35 + src_data_45; - float16_t d47 = src_data_36 + src_data_46; - float16_t d48 = src_data_37 + src_data_47; - - float16_t d51 = src_data_50 + src_data_60; - float16_t d52 = src_data_51 + src_data_61; - float16_t d53 = src_data_52 + src_data_62; - float16_t d54 = src_data_53 + src_data_63; - float16_t d55 = src_data_54 + src_data_64; - float16_t d56 = 
src_data_55 + src_data_65; - float16_t d57 = src_data_56 + src_data_66; - float16_t d58 = src_data_57 + src_data_67; - - float16_t t00 = src_data_00 + src_data_10 + src_data_20 + src_data_30 + src_data_40 + src_data_50 + src_data_60; - float16_t t01 = src_data_01 + src_data_11 + src_data_21 + src_data_31 + src_data_41 + src_data_51 + src_data_61; - float16_t t02 = src_data_02 + src_data_12 + src_data_22 + src_data_32 + src_data_42 + src_data_52 + src_data_62; - float16_t t03 = src_data_03 + src_data_13 + src_data_23 + src_data_33 + src_data_43 + src_data_53 + src_data_63; - float16_t t04 = src_data_04 + src_data_14 + src_data_24 + src_data_34 + src_data_44 + src_data_54 + src_data_64; - float16_t t05 = src_data_05 + src_data_15 + src_data_25 + src_data_35 + src_data_45 + src_data_55 + src_data_65; - float16_t t06 = src_data_06 + src_data_16 + src_data_26 + src_data_36 + src_data_46 + src_data_56 + src_data_66; - float16_t t07 = src_data_07 + src_data_17 + src_data_27 + src_data_37 + src_data_47 + src_data_57 + src_data_67; - - const float16_t t10 = 0.5f * d01 + d11 + 1.5f * d21; - const float16_t t11 = 0.5f * d02 + d12 + 1.5f * d22; - const float16_t t12 = 0.5f * d03 + d13 + 1.5f * d23; - const float16_t t13 = 0.5f * d04 + d14 + 1.5f * d24; - const float16_t t14 = 0.5f * d05 + d15 + 1.5f * d25; - const float16_t t15 = 0.5f * d06 + d16 + 1.5f * d26; - const float16_t t16 = 0.5f * d07 + d17 + 1.5f * d27; - const float16_t t17 = 0.5f * d08 + d18 + 1.5f * d28; - - const float16_t t20 = 0.25f * d31 + d41 + 2.25f * d51; - const float16_t t21 = 0.25f * d32 + d42 + 2.25f * d52; - const float16_t t22 = 0.25f * d33 + d43 + 2.25f * d53; - const float16_t t23 = 0.25f * d34 + d44 + 2.25f * d54; - const float16_t t24 = 0.25f * d35 + d45 + 2.25f * d55; - const float16_t t25 = 0.25f * d36 + d46 + 2.25f * d56; - const float16_t t26 = 0.25f * d37 + d47 + 2.25f * d57; - const float16_t t27 = 0.25f * d38 + d48 + 2.25f * d58; - - const float16_t t30 = 0.125f * d01 + d11 + 3.375f * d21 
+ src_data_70; - const float16_t t31 = 0.125f * d02 + d12 + 3.375f * d22 + src_data_71; - const float16_t t32 = 0.125f * d03 + d13 + 3.375f * d23 + src_data_72; - const float16_t t33 = 0.125f * d04 + d14 + 3.375f * d24 + src_data_73; - const float16_t t34 = 0.125f * d05 + d15 + 3.375f * d25 + src_data_74; - const float16_t t35 = 0.125f * d06 + d16 + 3.375f * d26 + src_data_75; - const float16_t t36 = 0.125f * d07 + d17 + 3.375f * d27 + src_data_76; - const float16_t t37 = 0.125f * d08 + d18 + 3.375f * d28 + src_data_77; - - float16_t s11 = t01 - t02; - float16_t s12 = t11 - t12; - float16_t s13 = t21 - t22; - float16_t s14 = t31 - t32; - - float16_t s21 = t03 - t04; - float16_t s22 = t13 - t14; - float16_t s23 = t23 - t24; - float16_t s24 = t33 - t34; - - float16_t s31 = t05 - t06; - float16_t s32 = t15 - t16; - float16_t s33 = t25 - t26; - float16_t s34 = t35 - t36; - - float16_t s41 = t01 + t02; - float16_t s42 = t11 + t12; - float16_t s43 = t21 + t22; - float16_t s44 = t31 + t32; - - float16_t s51 = t03 + t04; - float16_t s52 = t13 + t14; - float16_t s53 = t23 + t24; - float16_t s54 = t33 + t34; - - float16_t s61 = t05 + t06; - float16_t s62 = t15 + t16; - float16_t s63 = t25 + t26; - float16_t s64 = t35 + t36; - - float16_t m00 = t00 + t01 + t02 + t03 + t04 + t05 + t06; - const float16_t m01 = 0.5f * s11 + s21 + 1.5f * s31; - const float16_t m02 = 0.25f * s41 + s51 + 2.25f * s61; - const float16_t m03 = 0.125f * s11 + s21 + 3.375f * s31 + t07; - - float16_t m10 = t10 + t11 + t12 + t13 + t14 + t15 + t16; - const float16_t m11 = 0.5f * s12 + s22 + 1.5f * s32; - const float16_t m12 = 0.25f * s42 + s52 + 2.25f * s62; - const float16_t m13 = 0.125f * s12 + s22 + 3.375f * s32 + t17; - - float16_t m20 = t20 + t21 + t22 + t23 + t24 + t25 + t26; - const float16_t m21 = 0.5f * s13 + s23 + 1.5f * s33; - const float16_t m22 = 0.25f * s43 + s53 + 2.25f * s63; - const float16_t m23 = 0.125f * s13 + s23 + 3.375f * s33 + t27; - - float16_t m30 = t30 + t31 + t32 + t33 + t34 + 
t35 + t36; - const float16_t m31 = 0.5f * s14 + s24 + 1.5f * s34; - const float16_t m32 = 0.25f * s44 + s54 + 2.25f * s64; - const float16_t m33 = 0.125f * s14 + s24 + 3.375f * s34 + t37; - - (dst_data + i)[0] = m00 + bias_data[i]; - (dst_data + i + C8NUM)[0] = m01 + bias_data[i]; - (dst_data + i + 2 * C8NUM)[0] = m02 + bias_data[i]; - (dst_data + i + 3 * C8NUM)[0] = m03 + bias_data[i]; - - (dst_data + i + dst_step * C8NUM)[0] = m10 + bias_data[i]; - (dst_data + i + dst_step * C8NUM + C8NUM)[0] = m11 + bias_data[i]; - (dst_data + i + dst_step * C8NUM + 2 * C8NUM)[0] = m12 + bias_data[i]; - (dst_data + i + dst_step * C8NUM + 3 * C8NUM)[0] = m13 + bias_data[i]; - - (dst_data + i + 2 * dst_step * C8NUM)[0] = m20 + bias_data[i]; - (dst_data + i + 2 * dst_step * C8NUM + C8NUM)[0] = m21 + bias_data[i]; - (dst_data + i + 2 * dst_step * C8NUM + 2 * C8NUM)[0] = m22 + bias_data[i]; - (dst_data + i + 2 * dst_step * C8NUM + 3 * C8NUM)[0] = m23 + bias_data[i]; - - (dst_data + i + 3 * dst_step * C8NUM)[0] = m30 + bias_data[i]; - (dst_data + i + 3 * dst_step * C8NUM + C8NUM)[0] = m31 + bias_data[i]; - (dst_data + i + 3 * dst_step * C8NUM + 2 * C8NUM)[0] = m32 + bias_data[i]; - (dst_data + i + 3 * dst_step * C8NUM + 3 * C8NUM)[0] = m33 + bias_data[i]; - } -#endif -} - -void OutputTransform8x5UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data, - int src_step, int dst_step) { -#ifdef ENABLE_ARM - float16x8_t src_data_00 = vld1q_f16(src_data + 0 * src_step); - float16x8_t src_data_01 = vld1q_f16(src_data + 1 * src_step); - float16x8_t src_data_02 = vld1q_f16(src_data + 2 * src_step); - float16x8_t src_data_03 = vld1q_f16(src_data + 3 * src_step); - float16x8_t src_data_04 = vld1q_f16(src_data + 4 * src_step); - float16x8_t src_data_05 = vld1q_f16(src_data + 5 * src_step); - float16x8_t src_data_06 = vld1q_f16(src_data + 6 * src_step); - float16x8_t src_data_07 = vld1q_f16(src_data + 7 * src_step); - float16x8_t src_data_10 = vld1q_f16(src_data + 8 * 
src_step); - float16x8_t src_data_11 = vld1q_f16(src_data + 9 * src_step); - float16x8_t src_data_12 = vld1q_f16(src_data + 10 * src_step); - float16x8_t src_data_13 = vld1q_f16(src_data + 11 * src_step); - float16x8_t src_data_14 = vld1q_f16(src_data + 12 * src_step); - float16x8_t src_data_15 = vld1q_f16(src_data + 13 * src_step); - float16x8_t src_data_16 = vld1q_f16(src_data + 14 * src_step); - float16x8_t src_data_17 = vld1q_f16(src_data + 15 * src_step); - float16x8_t src_data_20 = vld1q_f16(src_data + 16 * src_step); - float16x8_t src_data_21 = vld1q_f16(src_data + 17 * src_step); - float16x8_t src_data_22 = vld1q_f16(src_data + 18 * src_step); - float16x8_t src_data_23 = vld1q_f16(src_data + 19 * src_step); - float16x8_t src_data_24 = vld1q_f16(src_data + 20 * src_step); - float16x8_t src_data_25 = vld1q_f16(src_data + 21 * src_step); - float16x8_t src_data_26 = vld1q_f16(src_data + 22 * src_step); - float16x8_t src_data_27 = vld1q_f16(src_data + 23 * src_step); - float16x8_t src_data_30 = vld1q_f16(src_data + 24 * src_step); - float16x8_t src_data_31 = vld1q_f16(src_data + 25 * src_step); - float16x8_t src_data_32 = vld1q_f16(src_data + 26 * src_step); - float16x8_t src_data_33 = vld1q_f16(src_data + 27 * src_step); - float16x8_t src_data_34 = vld1q_f16(src_data + 28 * src_step); - float16x8_t src_data_35 = vld1q_f16(src_data + 29 * src_step); - float16x8_t src_data_36 = vld1q_f16(src_data + 30 * src_step); - float16x8_t src_data_37 = vld1q_f16(src_data + 31 * src_step); - float16x8_t src_data_40 = vld1q_f16(src_data + 32 * src_step); - float16x8_t src_data_41 = vld1q_f16(src_data + 33 * src_step); - float16x8_t src_data_42 = vld1q_f16(src_data + 34 * src_step); - float16x8_t src_data_43 = vld1q_f16(src_data + 35 * src_step); - float16x8_t src_data_44 = vld1q_f16(src_data + 36 * src_step); - float16x8_t src_data_45 = vld1q_f16(src_data + 37 * src_step); - float16x8_t src_data_46 = vld1q_f16(src_data + 38 * src_step); - float16x8_t src_data_47 = 
vld1q_f16(src_data + 39 * src_step); - float16x8_t src_data_50 = vld1q_f16(src_data + 40 * src_step); - float16x8_t src_data_51 = vld1q_f16(src_data + 41 * src_step); - float16x8_t src_data_52 = vld1q_f16(src_data + 42 * src_step); - float16x8_t src_data_53 = vld1q_f16(src_data + 43 * src_step); - float16x8_t src_data_54 = vld1q_f16(src_data + 44 * src_step); - float16x8_t src_data_55 = vld1q_f16(src_data + 45 * src_step); - float16x8_t src_data_56 = vld1q_f16(src_data + 46 * src_step); - float16x8_t src_data_57 = vld1q_f16(src_data + 47 * src_step); - float16x8_t src_data_60 = vld1q_f16(src_data + 48 * src_step); - float16x8_t src_data_61 = vld1q_f16(src_data + 49 * src_step); - float16x8_t src_data_62 = vld1q_f16(src_data + 50 * src_step); - float16x8_t src_data_63 = vld1q_f16(src_data + 51 * src_step); - float16x8_t src_data_64 = vld1q_f16(src_data + 52 * src_step); - float16x8_t src_data_65 = vld1q_f16(src_data + 53 * src_step); - float16x8_t src_data_66 = vld1q_f16(src_data + 54 * src_step); - float16x8_t src_data_67 = vld1q_f16(src_data + 55 * src_step); - float16x8_t src_data_70 = vld1q_f16(src_data + 56 * src_step); - float16x8_t src_data_71 = vld1q_f16(src_data + 57 * src_step); - float16x8_t src_data_72 = vld1q_f16(src_data + 58 * src_step); - float16x8_t src_data_73 = vld1q_f16(src_data + 59 * src_step); - float16x8_t src_data_74 = vld1q_f16(src_data + 60 * src_step); - float16x8_t src_data_75 = vld1q_f16(src_data + 61 * src_step); - float16x8_t src_data_76 = vld1q_f16(src_data + 62 * src_step); - float16x8_t src_data_77 = vld1q_f16(src_data + 63 * src_step); - - float16x8_t d01 = vsubq_f16(src_data_10, src_data_20); - float16x8_t d02 = vsubq_f16(src_data_11, src_data_21); - float16x8_t d03 = vsubq_f16(src_data_12, src_data_22); - float16x8_t d04 = vsubq_f16(src_data_13, src_data_23); - float16x8_t d05 = vsubq_f16(src_data_14, src_data_24); - float16x8_t d06 = vsubq_f16(src_data_15, src_data_25); - float16x8_t d07 = vsubq_f16(src_data_16, src_data_26); - 
float16x8_t d08 = vsubq_f16(src_data_17, src_data_27); - - float16x8_t d11 = vsubq_f16(src_data_30, src_data_40); - float16x8_t d12 = vsubq_f16(src_data_31, src_data_41); - float16x8_t d13 = vsubq_f16(src_data_32, src_data_42); - float16x8_t d14 = vsubq_f16(src_data_33, src_data_43); - float16x8_t d15 = vsubq_f16(src_data_34, src_data_44); - float16x8_t d16 = vsubq_f16(src_data_35, src_data_45); - float16x8_t d17 = vsubq_f16(src_data_36, src_data_46); - float16x8_t d18 = vsubq_f16(src_data_37, src_data_47); - - float16x8_t d21 = vsubq_f16(src_data_50, src_data_60); - float16x8_t d22 = vsubq_f16(src_data_51, src_data_61); - float16x8_t d23 = vsubq_f16(src_data_52, src_data_62); - float16x8_t d24 = vsubq_f16(src_data_53, src_data_63); - float16x8_t d25 = vsubq_f16(src_data_54, src_data_64); - float16x8_t d26 = vsubq_f16(src_data_55, src_data_65); - float16x8_t d27 = vsubq_f16(src_data_56, src_data_66); - float16x8_t d28 = vsubq_f16(src_data_57, src_data_67); - - float16x8_t d31 = vaddq_f16(src_data_10, src_data_20); - float16x8_t d32 = vaddq_f16(src_data_11, src_data_21); - float16x8_t d33 = vaddq_f16(src_data_12, src_data_22); - float16x8_t d34 = vaddq_f16(src_data_13, src_data_23); - float16x8_t d35 = vaddq_f16(src_data_14, src_data_24); - float16x8_t d36 = vaddq_f16(src_data_15, src_data_25); - float16x8_t d37 = vaddq_f16(src_data_16, src_data_26); - float16x8_t d38 = vaddq_f16(src_data_17, src_data_27); - - float16x8_t d41 = vaddq_f16(src_data_30, src_data_40); - float16x8_t d42 = vaddq_f16(src_data_31, src_data_41); - float16x8_t d43 = vaddq_f16(src_data_32, src_data_42); - float16x8_t d44 = vaddq_f16(src_data_33, src_data_43); - float16x8_t d45 = vaddq_f16(src_data_34, src_data_44); - float16x8_t d46 = vaddq_f16(src_data_35, src_data_45); - float16x8_t d47 = vaddq_f16(src_data_36, src_data_46); - float16x8_t d48 = vaddq_f16(src_data_37, src_data_47); - - float16x8_t d51 = vaddq_f16(src_data_50, src_data_60); - float16x8_t d52 = vaddq_f16(src_data_51, 
src_data_61); - float16x8_t d53 = vaddq_f16(src_data_52, src_data_62); - float16x8_t d54 = vaddq_f16(src_data_53, src_data_63); - float16x8_t d55 = vaddq_f16(src_data_54, src_data_64); - float16x8_t d56 = vaddq_f16(src_data_55, src_data_65); - float16x8_t d57 = vaddq_f16(src_data_56, src_data_66); - float16x8_t d58 = vaddq_f16(src_data_57, src_data_67); - - float16x8_t t00 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_00, src_data_10), src_data_20), src_data_30), src_data_40), - src_data_50), - src_data_60); - float16x8_t t01 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_01, src_data_11), src_data_21), src_data_31), src_data_41), - src_data_51), - src_data_61); - float16x8_t t02 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_02, src_data_12), src_data_22), src_data_32), src_data_42), - src_data_52), - src_data_62); - float16x8_t t03 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_03, src_data_13), src_data_23), src_data_33), src_data_43), - src_data_53), - src_data_63); - float16x8_t t04 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_04, src_data_14), src_data_24), src_data_34), src_data_44), - src_data_54), - src_data_64); - float16x8_t t05 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_05, src_data_15), src_data_25), src_data_35), src_data_45), - src_data_55), - src_data_65); - float16x8_t t06 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_06, src_data_16), src_data_26), src_data_36), src_data_46), - src_data_56), - src_data_66); - float16x8_t t07 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_07, src_data_17), src_data_27), src_data_37), src_data_47), - src_data_57), - src_data_67); - - float16x8_t t10 = vaddq_f16(vaddq_f16(vmulq_n_f16(d01, 0.5), d11), vmulq_n_f16(d21, 1.5)); - float16x8_t t11 = 
vaddq_f16(vaddq_f16(vmulq_n_f16(d02, 0.5), d12), vmulq_n_f16(d22, 1.5)); - float16x8_t t12 = vaddq_f16(vaddq_f16(vmulq_n_f16(d03, 0.5), d13), vmulq_n_f16(d23, 1.5)); - float16x8_t t13 = vaddq_f16(vaddq_f16(vmulq_n_f16(d04, 0.5), d14), vmulq_n_f16(d24, 1.5)); - float16x8_t t14 = vaddq_f16(vaddq_f16(vmulq_n_f16(d05, 0.5), d15), vmulq_n_f16(d25, 1.5)); - float16x8_t t15 = vaddq_f16(vaddq_f16(vmulq_n_f16(d06, 0.5), d16), vmulq_n_f16(d26, 1.5)); - float16x8_t t16 = vaddq_f16(vaddq_f16(vmulq_n_f16(d07, 0.5), d17), vmulq_n_f16(d27, 1.5)); - float16x8_t t17 = vaddq_f16(vaddq_f16(vmulq_n_f16(d08, 0.5), d18), vmulq_n_f16(d28, 1.5)); - - float16x8_t t20 = vaddq_f16(vaddq_f16(vmulq_n_f16(d31, 0.25), d41), vmulq_n_f16(d51, 2.25)); - float16x8_t t21 = vaddq_f16(vaddq_f16(vmulq_n_f16(d32, 0.25), d42), vmulq_n_f16(d52, 2.25)); - float16x8_t t22 = vaddq_f16(vaddq_f16(vmulq_n_f16(d33, 0.25), d43), vmulq_n_f16(d53, 2.25)); - float16x8_t t23 = vaddq_f16(vaddq_f16(vmulq_n_f16(d34, 0.25), d44), vmulq_n_f16(d54, 2.25)); - float16x8_t t24 = vaddq_f16(vaddq_f16(vmulq_n_f16(d35, 0.25), d45), vmulq_n_f16(d55, 2.25)); - float16x8_t t25 = vaddq_f16(vaddq_f16(vmulq_n_f16(d36, 0.25), d46), vmulq_n_f16(d56, 2.25)); - float16x8_t t26 = vaddq_f16(vaddq_f16(vmulq_n_f16(d37, 0.25), d47), vmulq_n_f16(d57, 2.25)); - float16x8_t t27 = vaddq_f16(vaddq_f16(vmulq_n_f16(d38, 0.25), d48), vmulq_n_f16(d58, 2.25)); - - float16x8_t t30 = vaddq_f16(vaddq_f16(vmulq_n_f16(d01, 0.125), d11), vmulq_n_f16(d21, 3.375)); - float16x8_t t31 = vaddq_f16(vaddq_f16(vmulq_n_f16(d02, 0.125), d12), vmulq_n_f16(d22, 3.375)); - float16x8_t t32 = vaddq_f16(vaddq_f16(vmulq_n_f16(d03, 0.125), d13), vmulq_n_f16(d23, 3.375)); - float16x8_t t33 = vaddq_f16(vaddq_f16(vmulq_n_f16(d04, 0.125), d14), vmulq_n_f16(d24, 3.375)); - float16x8_t t34 = vaddq_f16(vaddq_f16(vmulq_n_f16(d05, 0.125), d15), vmulq_n_f16(d25, 3.375)); - float16x8_t t35 = vaddq_f16(vaddq_f16(vmulq_n_f16(d06, 0.125), d16), vmulq_n_f16(d26, 3.375)); - float16x8_t t36 = 
vaddq_f16(vaddq_f16(vmulq_n_f16(d07, 0.125), d17), vmulq_n_f16(d27, 3.375)); - float16x8_t t37 = vaddq_f16(vaddq_f16(vmulq_n_f16(d08, 0.125), d18), vmulq_n_f16(d28, 3.375)); - - float16x8_t t40 = - vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d31, 0.0625), d41), vmulq_n_f16(d51, 5.0625)), src_data_70); - float16x8_t t41 = - vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d32, 0.0625), d42), vmulq_n_f16(d52, 5.0625)), src_data_71); - float16x8_t t42 = - vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d33, 0.0625), d43), vmulq_n_f16(d53, 5.0625)), src_data_72); - float16x8_t t43 = - vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d34, 0.0625), d44), vmulq_n_f16(d54, 5.0625)), src_data_73); - float16x8_t t44 = - vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d35, 0.0625), d45), vmulq_n_f16(d55, 5.0625)), src_data_74); - float16x8_t t45 = - vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d36, 0.0625), d46), vmulq_n_f16(d56, 5.0625)), src_data_75); - float16x8_t t46 = - vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d37, 0.0625), d47), vmulq_n_f16(d57, 5.0625)), src_data_76); - float16x8_t t47 = - vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d38, 0.0625), d48), vmulq_n_f16(d58, 5.0625)), src_data_77); - - float16x8_t s11 = vsubq_f16(t01, t02); - float16x8_t s12 = vsubq_f16(t11, t12); - float16x8_t s13 = vsubq_f16(t21, t22); - float16x8_t s14 = vsubq_f16(t31, t32); - float16x8_t s15 = vsubq_f16(t41, t42); - - float16x8_t s21 = vsubq_f16(t03, t04); - float16x8_t s22 = vsubq_f16(t13, t14); - float16x8_t s23 = vsubq_f16(t23, t24); - float16x8_t s24 = vsubq_f16(t33, t34); - float16x8_t s25 = vsubq_f16(t43, t44); - - float16x8_t s31 = vsubq_f16(t05, t06); - float16x8_t s32 = vsubq_f16(t15, t16); - float16x8_t s33 = vsubq_f16(t25, t26); - float16x8_t s34 = vsubq_f16(t35, t36); - float16x8_t s35 = vsubq_f16(t45, t46); - - float16x8_t s41 = vaddq_f16(t01, t02); - float16x8_t s42 = vaddq_f16(t11, t12); - float16x8_t s43 = vaddq_f16(t21, t22); - float16x8_t s44 = vaddq_f16(t31, t32); - float16x8_t s45 = vaddq_f16(t41, 
t42); - - float16x8_t s51 = vaddq_f16(t03, t04); - float16x8_t s52 = vaddq_f16(t13, t14); - float16x8_t s53 = vaddq_f16(t23, t24); - float16x8_t s54 = vaddq_f16(t33, t34); - float16x8_t s55 = vaddq_f16(t43, t44); - - float16x8_t s61 = vaddq_f16(t05, t06); - float16x8_t s62 = vaddq_f16(t15, t16); - float16x8_t s63 = vaddq_f16(t25, t26); - float16x8_t s64 = vaddq_f16(t35, t36); - float16x8_t s65 = vaddq_f16(t45, t46); - - float16x8_t m00 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t00, t01), t02), t03), t04), t05), t06); - float16x8_t m01 = vaddq_f16(vaddq_f16(vmulq_n_f16(s11, 0.5), s21), vmulq_n_f16(s31, 1.5)); - float16x8_t m02 = vaddq_f16(vaddq_f16(vmulq_n_f16(s41, 0.25), s51), vmulq_n_f16(s61, 2.25)); - float16x8_t m03 = vaddq_f16(vaddq_f16(vmulq_n_f16(s11, 0.125), s21), vmulq_n_f16(s31, 3.375)); - float16x8_t m04 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s41, 0.0625), s51), vmulq_n_f16(s61, 5.0625)), t07); - - float16x8_t m10 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t10, t11), t12), t13), t14), t15), t16); - float16x8_t m11 = vaddq_f16(vaddq_f16(vmulq_n_f16(s12, 0.5), s22), vmulq_n_f16(s32, 1.5)); - float16x8_t m12 = vaddq_f16(vaddq_f16(vmulq_n_f16(s42, 0.25), s52), vmulq_n_f16(s62, 2.25)); - float16x8_t m13 = vaddq_f16(vaddq_f16(vmulq_n_f16(s12, 0.125), s22), vmulq_n_f16(s32, 3.375)); - float16x8_t m14 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s42, 0.0625), s52), vmulq_n_f16(s62, 5.0625)), t17); - - float16x8_t m20 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t20, t21), t22), t23), t24), t25), t26); - float16x8_t m21 = vaddq_f16(vaddq_f16(vmulq_n_f16(s13, 0.5), s23), vmulq_n_f16(s33, 1.5)); - float16x8_t m22 = vaddq_f16(vaddq_f16(vmulq_n_f16(s43, 0.25), s53), vmulq_n_f16(s63, 2.25)); - float16x8_t m23 = vaddq_f16(vaddq_f16(vmulq_n_f16(s13, 0.125), s23), vmulq_n_f16(s33, 3.375)); - float16x8_t m24 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s43, 0.0625), s53), vmulq_n_f16(s63, 5.0625)), t27); - - 
float16x8_t m30 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t30, t31), t32), t33), t34), t35), t36); - float16x8_t m31 = vaddq_f16(vaddq_f16(vmulq_n_f16(s14, 0.5), s24), vmulq_n_f16(s34, 1.5)); - float16x8_t m32 = vaddq_f16(vaddq_f16(vmulq_n_f16(s44, 0.25), s54), vmulq_n_f16(s64, 2.25)); - float16x8_t m33 = vaddq_f16(vaddq_f16(vmulq_n_f16(s14, 0.125), s24), vmulq_n_f16(s34, 3.375)); - float16x8_t m34 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s44, 0.0625), s54), vmulq_n_f16(s64, 5.0625)), t37); - - float16x8_t m40 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t40, t41), t42), t43), t44), t45), t46); - float16x8_t m41 = vaddq_f16(vaddq_f16(vmulq_n_f16(s15, 0.5), s25), vmulq_n_f16(s35, 1.5)); - float16x8_t m42 = vaddq_f16(vaddq_f16(vmulq_n_f16(s45, 0.25), s55), vmulq_n_f16(s65, 2.25)); - float16x8_t m43 = vaddq_f16(vaddq_f16(vmulq_n_f16(s15, 0.125), s25), vmulq_n_f16(s35, 3.375)); - float16x8_t m44 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s45, 0.0625), s55), vmulq_n_f16(s65, 5.0625)), t47); - - float16x8_t bias_ptr = vld1q_f16(bias_data); - vst1q_f16(dst_data, vaddq_f16(m00, bias_ptr)); - vst1q_f16(dst_data + C8NUM, vaddq_f16(m01, bias_ptr)); - vst1q_f16(dst_data + 2 * C8NUM, vaddq_f16(m02, bias_ptr)); - vst1q_f16(dst_data + 3 * C8NUM, vaddq_f16(m03, bias_ptr)); - vst1q_f16(dst_data + 4 * C8NUM, vaddq_f16(m04, bias_ptr)); - - vst1q_f16(dst_data + dst_step * C8NUM, vaddq_f16(m10, bias_ptr)); - vst1q_f16(dst_data + dst_step * C8NUM + C8NUM, vaddq_f16(m11, bias_ptr)); - vst1q_f16(dst_data + dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m12, bias_ptr)); - vst1q_f16(dst_data + dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m13, bias_ptr)); - vst1q_f16(dst_data + dst_step * C8NUM + 4 * C8NUM, vaddq_f16(m14, bias_ptr)); - - vst1q_f16(dst_data + 2 * dst_step * C8NUM, vaddq_f16(m20, bias_ptr)); - vst1q_f16(dst_data + 2 * dst_step * C8NUM + C8NUM, vaddq_f16(m21, bias_ptr)); - vst1q_f16(dst_data + 2 * dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m22, 
bias_ptr)); - vst1q_f16(dst_data + 2 * dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m23, bias_ptr)); - vst1q_f16(dst_data + 2 * dst_step * C8NUM + 4 * C8NUM, vaddq_f16(m24, bias_ptr)); - - vst1q_f16(dst_data + 3 * dst_step * C8NUM, vaddq_f16(m30, bias_ptr)); - vst1q_f16(dst_data + 3 * dst_step * C8NUM + C8NUM, vaddq_f16(m31, bias_ptr)); - vst1q_f16(dst_data + 3 * dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m32, bias_ptr)); - vst1q_f16(dst_data + 3 * dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m33, bias_ptr)); - vst1q_f16(dst_data + 3 * dst_step * C8NUM + 4 * C8NUM, vaddq_f16(m34, bias_ptr)); - - vst1q_f16(dst_data + 4 * dst_step * C8NUM, vaddq_f16(m40, bias_ptr)); - vst1q_f16(dst_data + 4 * dst_step * C8NUM + C8NUM, vaddq_f16(m41, bias_ptr)); - vst1q_f16(dst_data + 4 * dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m42, bias_ptr)); - vst1q_f16(dst_data + 4 * dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m43, bias_ptr)); - vst1q_f16(dst_data + 4 * dst_step * C8NUM + 4 * C8NUM, vaddq_f16(m44, bias_ptr)); -#else - for (int i = 0; i < C8NUM; i++) { - float16_t src_data_00 = src_data[i]; - float16_t src_data_01 = src_data[i + src_step]; - float16_t src_data_02 = src_data[i + 2 * src_step]; - float16_t src_data_03 = src_data[i + 3 * src_step]; - float16_t src_data_04 = src_data[i + 4 * src_step]; - float16_t src_data_05 = src_data[i + 5 * src_step]; - float16_t src_data_06 = src_data[i + 6 * src_step]; - float16_t src_data_07 = src_data[i + 7 * src_step]; - float16_t src_data_10 = src_data[i + 8 * src_step]; - float16_t src_data_11 = src_data[i + 9 * src_step]; - float16_t src_data_12 = src_data[i + 10 * src_step]; - float16_t src_data_13 = src_data[i + 11 * src_step]; - float16_t src_data_14 = src_data[i + 12 * src_step]; - float16_t src_data_15 = src_data[i + 13 * src_step]; - float16_t src_data_16 = src_data[i + 14 * src_step]; - float16_t src_data_17 = src_data[i + 15 * src_step]; - float16_t src_data_20 = src_data[i + 16 * src_step]; - float16_t src_data_21 = src_data[i + 17 * src_step]; - 
float16_t src_data_22 = src_data[i + 18 * src_step]; - float16_t src_data_23 = src_data[i + 19 * src_step]; - float16_t src_data_24 = src_data[i + 20 * src_step]; - float16_t src_data_25 = src_data[i + 21 * src_step]; - float16_t src_data_26 = src_data[i + 22 * src_step]; - float16_t src_data_27 = src_data[i + 23 * src_step]; - float16_t src_data_30 = src_data[i + 24 * src_step]; - float16_t src_data_31 = src_data[i + 25 * src_step]; - float16_t src_data_32 = src_data[i + 26 * src_step]; - float16_t src_data_33 = src_data[i + 27 * src_step]; - float16_t src_data_34 = src_data[i + 28 * src_step]; - float16_t src_data_35 = src_data[i + 29 * src_step]; - float16_t src_data_36 = src_data[i + 30 * src_step]; - float16_t src_data_37 = src_data[i + 31 * src_step]; - float16_t src_data_40 = src_data[i + 32 * src_step]; - float16_t src_data_41 = src_data[i + 33 * src_step]; - float16_t src_data_42 = src_data[i + 34 * src_step]; - float16_t src_data_43 = src_data[i + 35 * src_step]; - float16_t src_data_44 = src_data[i + 36 * src_step]; - float16_t src_data_45 = src_data[i + 37 * src_step]; - float16_t src_data_46 = src_data[i + 38 * src_step]; - float16_t src_data_47 = src_data[i + 39 * src_step]; - float16_t src_data_50 = src_data[i + 40 * src_step]; - float16_t src_data_51 = src_data[i + 41 * src_step]; - float16_t src_data_52 = src_data[i + 42 * src_step]; - float16_t src_data_53 = src_data[i + 43 * src_step]; - float16_t src_data_54 = src_data[i + 44 * src_step]; - float16_t src_data_55 = src_data[i + 45 * src_step]; - float16_t src_data_56 = src_data[i + 46 * src_step]; - float16_t src_data_57 = src_data[i + 47 * src_step]; - float16_t src_data_60 = src_data[i + 48 * src_step]; - float16_t src_data_61 = src_data[i + 49 * src_step]; - float16_t src_data_62 = src_data[i + 50 * src_step]; - float16_t src_data_63 = src_data[i + 51 * src_step]; - float16_t src_data_64 = src_data[i + 52 * src_step]; - float16_t src_data_65 = src_data[i + 53 * src_step]; - float16_t 
src_data_66 = src_data[i + 54 * src_step]; - float16_t src_data_67 = src_data[i + 55 * src_step]; - float16_t src_data_70 = src_data[i + 56 * src_step]; - float16_t src_data_71 = src_data[i + 57 * src_step]; - float16_t src_data_72 = src_data[i + 58 * src_step]; - float16_t src_data_73 = src_data[i + 59 * src_step]; - float16_t src_data_74 = src_data[i + 60 * src_step]; - float16_t src_data_75 = src_data[i + 61 * src_step]; - float16_t src_data_76 = src_data[i + 62 * src_step]; - float16_t src_data_77 = src_data[i + 63 * src_step]; - - float16_t d01 = src_data_10 - src_data_20; - float16_t d02 = src_data_11 - src_data_21; - float16_t d03 = src_data_12 - src_data_22; - float16_t d04 = src_data_13 - src_data_23; - float16_t d05 = src_data_14 - src_data_24; - float16_t d06 = src_data_15 - src_data_25; - float16_t d07 = src_data_16 - src_data_26; - float16_t d08 = src_data_17 - src_data_27; - - float16_t d11 = src_data_30 - src_data_40; - float16_t d12 = src_data_31 - src_data_41; - float16_t d13 = src_data_32 - src_data_42; - float16_t d14 = src_data_33 - src_data_43; - float16_t d15 = src_data_34 - src_data_44; - float16_t d16 = src_data_35 - src_data_45; - float16_t d17 = src_data_36 - src_data_46; - float16_t d18 = src_data_37 - src_data_47; - - float16_t d21 = src_data_50 - src_data_60; - float16_t d22 = src_data_51 - src_data_61; - float16_t d23 = src_data_52 - src_data_62; - float16_t d24 = src_data_53 - src_data_63; - float16_t d25 = src_data_54 - src_data_64; - float16_t d26 = src_data_55 - src_data_65; - float16_t d27 = src_data_56 - src_data_66; - float16_t d28 = src_data_57 - src_data_67; - - float16_t d31 = src_data_10 + src_data_20; - float16_t d32 = src_data_11 + src_data_21; - float16_t d33 = src_data_12 + src_data_22; - float16_t d34 = src_data_13 + src_data_23; - float16_t d35 = src_data_14 + src_data_24; - float16_t d36 = src_data_15 + src_data_25; - float16_t d37 = src_data_16 + src_data_26; - float16_t d38 = src_data_17 + src_data_27; - - float16_t 
d41 = src_data_30 + src_data_40; - float16_t d42 = src_data_31 + src_data_41; - float16_t d43 = src_data_32 + src_data_42; - float16_t d44 = src_data_33 + src_data_43; - float16_t d45 = src_data_34 + src_data_44; - float16_t d46 = src_data_35 + src_data_45; - float16_t d47 = src_data_36 + src_data_46; - float16_t d48 = src_data_37 + src_data_47; - - float16_t d51 = src_data_50 + src_data_60; - float16_t d52 = src_data_51 + src_data_61; - float16_t d53 = src_data_52 + src_data_62; - float16_t d54 = src_data_53 + src_data_63; - float16_t d55 = src_data_54 + src_data_64; - float16_t d56 = src_data_55 + src_data_65; - float16_t d57 = src_data_56 + src_data_66; - float16_t d58 = src_data_57 + src_data_67; - - float16_t t00 = src_data_00 + src_data_10 + src_data_20 + src_data_30 + src_data_40 + src_data_50 + src_data_60; - float16_t t01 = src_data_01 + src_data_11 + src_data_21 + src_data_31 + src_data_41 + src_data_51 + src_data_61; - float16_t t02 = src_data_02 + src_data_12 + src_data_22 + src_data_32 + src_data_42 + src_data_52 + src_data_62; - float16_t t03 = src_data_03 + src_data_13 + src_data_23 + src_data_33 + src_data_43 + src_data_53 + src_data_63; - float16_t t04 = src_data_04 + src_data_14 + src_data_24 + src_data_34 + src_data_44 + src_data_54 + src_data_64; - float16_t t05 = src_data_05 + src_data_15 + src_data_25 + src_data_35 + src_data_45 + src_data_55 + src_data_65; - float16_t t06 = src_data_06 + src_data_16 + src_data_26 + src_data_36 + src_data_46 + src_data_56 + src_data_66; - float16_t t07 = src_data_07 + src_data_17 + src_data_27 + src_data_37 + src_data_47 + src_data_57 + src_data_67; - - const float16_t t10 = 0.5f * d01 + d11 + 1.5f * d21; - const float16_t t11 = 0.5f * d02 + d12 + 1.5f * d22; - const float16_t t12 = 0.5f * d03 + d13 + 1.5f * d23; - const float16_t t13 = 0.5f * d04 + d14 + 1.5f * d24; - const float16_t t14 = 0.5f * d05 + d15 + 1.5f * d25; - const float16_t t15 = 0.5f * d06 + d16 + 1.5f * d26; - const float16_t t16 = 0.5f * d07 
+ d17 + 1.5f * d27; - const float16_t t17 = 0.5f * d08 + d18 + 1.5f * d28; - - const float16_t t20 = 0.25f * d31 + d41 + 2.25f * d51; - const float16_t t21 = 0.25f * d32 + d42 + 2.25f * d52; - const float16_t t22 = 0.25f * d33 + d43 + 2.25f * d53; - const float16_t t23 = 0.25f * d34 + d44 + 2.25f * d54; - const float16_t t24 = 0.25f * d35 + d45 + 2.25f * d55; - const float16_t t25 = 0.25f * d36 + d46 + 2.25f * d56; - const float16_t t26 = 0.25f * d37 + d47 + 2.25f * d57; - const float16_t t27 = 0.25f * d38 + d48 + 2.25f * d58; - - const float16_t t30 = 0.125f * d01 + d11 + 3.375f * d21; - const float16_t t31 = 0.125f * d02 + d12 + 3.375f * d22; - const float16_t t32 = 0.125f * d03 + d13 + 3.375f * d23; - const float16_t t33 = 0.125f * d04 + d14 + 3.375f * d24; - const float16_t t34 = 0.125f * d05 + d15 + 3.375f * d25; - const float16_t t35 = 0.125f * d06 + d16 + 3.375f * d26; - const float16_t t36 = 0.125f * d07 + d17 + 3.375f * d27; - const float16_t t37 = 0.125f * d08 + d18 + 3.375f * d28; - - const float16_t t40 = 0.0625f * d31 + d41 + 5.0625f * d51 + src_data_70; - const float16_t t41 = 0.0625f * d32 + d42 + 5.0625f * d52 + src_data_71; - const float16_t t42 = 0.0625f * d33 + d43 + 5.0625f * d53 + src_data_72; - const float16_t t43 = 0.0625f * d34 + d44 + 5.0625f * d54 + src_data_73; - const float16_t t44 = 0.0625f * d35 + d45 + 5.0625f * d55 + src_data_74; - const float16_t t45 = 0.0625f * d36 + d46 + 5.0625f * d56 + src_data_75; - const float16_t t46 = 0.0625f * d37 + d47 + 5.0625f * d57 + src_data_76; - const float16_t t47 = 0.0625f * d38 + d48 + 5.0625f * d58 + src_data_77; - - float16_t s11 = t01 - t02; - float16_t s12 = t11 - t12; - float16_t s13 = t21 - t22; - float16_t s14 = t31 - t32; - float16_t s15 = t41 - t42; - - float16_t s21 = t03 - t04; - float16_t s22 = t13 - t14; - float16_t s23 = t23 - t24; - float16_t s24 = t33 - t34; - float16_t s25 = t43 - t44; - - float16_t s31 = t05 - t06; - float16_t s32 = t15 - t16; - float16_t s33 = t25 - t26; - 
float16_t s34 = t35 - t36; - float16_t s35 = t45 - t46; - - float16_t s41 = t01 + t02; - float16_t s42 = t11 + t12; - float16_t s43 = t21 + t22; - float16_t s44 = t31 + t32; - float16_t s45 = t41 + t42; - - float16_t s51 = t03 + t04; - float16_t s52 = t13 + t14; - float16_t s53 = t23 + t24; - float16_t s54 = t33 + t34; - float16_t s55 = t43 + t44; - - float16_t s61 = t05 + t06; - float16_t s62 = t15 + t16; - float16_t s63 = t25 + t26; - float16_t s64 = t35 + t36; - float16_t s65 = t45 + t46; - - float16_t m00 = t00 + t01 + t02 + t03 + t04 + t05 + t06; - const float16_t m01 = 0.5f * s11 + s21 + 1.5f * s31; - const float16_t m02 = 0.25f * s41 + s51 + 2.25f * s61; - const float16_t m03 = 0.125f * s11 + s21 + 3.375f * s31; - const float16_t m04 = 0.0625f * s41 + s51 + 5.0625f * s61 + t07; - - float16_t m10 = t10 + t11 + t12 + t13 + t14 + t15 + t16; - const float16_t m11 = 0.5f * s12 + s22 + 1.5f * s32; - const float16_t m12 = 0.25f * s42 + s52 + 2.25f * s62; - const float16_t m13 = 0.125f * s12 + s22 + 3.375f * s32; - const float16_t m14 = 0.0625f * s42 + s52 + 5.0625f * s62 + t17; - - float16_t m20 = t20 + t21 + t22 + t23 + t24 + t25 + t26; - const float16_t m21 = 0.5f * s13 + s23 + 1.5f * s33; - const float16_t m22 = 0.25f * s43 + s53 + 2.25f * s63; - const float16_t m23 = 0.125f * s13 + s23 + 3.375f * s33; - const float16_t m24 = 0.0625f * s43 + s53 + 5.0625f * s63 + t27; - - float16_t m30 = t30 + t31 + t32 + t33 + t34 + t35 + t36; - const float16_t m31 = 0.5f * s14 + s24 + 1.5f * s34; - const float16_t m32 = 0.25f * s44 + s54 + 2.25f * s64; - const float16_t m33 = 0.125f * s14 + s24 + 3.375f * s34; - const float16_t m34 = 0.0625f * s44 + s54 + 5.0625f * s64 + t37; - - float16_t m40 = t40 + t41 + t42 + t43 + t44 + t45 + t46; - const float16_t m41 = 0.5f * s15 + s25 + 1.5f * s35; - const float16_t m42 = 0.25f * s45 + s55 + 2.25f * s65; - const float16_t m43 = 0.125f * s15 + s25 + 3.375f * s35; - const float16_t m44 = 0.0625f * s45 + s55 + 5.0625f * s65 + t47; - - 
(dst_data + i)[0] = m00 + bias_data[i]; - (dst_data + i + C8NUM)[0] = m01 + bias_data[i]; - (dst_data + i + 2 * C8NUM)[0] = m02 + bias_data[i]; - (dst_data + i + 3 * C8NUM)[0] = m03 + bias_data[i]; - (dst_data + i + 4 * C8NUM)[0] = m04 + bias_data[i]; - - (dst_data + i + dst_step * C8NUM)[0] = m10 + bias_data[i]; - (dst_data + i + dst_step * C8NUM + C8NUM)[0] = m11 + bias_data[i]; - (dst_data + i + dst_step * C8NUM + 2 * C8NUM)[0] = m12 + bias_data[i]; - (dst_data + i + dst_step * C8NUM + 3 * C8NUM)[0] = m13 + bias_data[i]; - (dst_data + i + dst_step * C8NUM + 4 * C8NUM)[0] = m14 + bias_data[i]; - - (dst_data + i + 2 * dst_step * C8NUM)[0] = m20 + bias_data[i]; - (dst_data + i + 2 * dst_step * C8NUM + C8NUM)[0] = m21 + bias_data[i]; - (dst_data + i + 2 * dst_step * C8NUM + 2 * C8NUM)[0] = m22 + bias_data[i]; - (dst_data + i + 2 * dst_step * C8NUM + 3 * C8NUM)[0] = m23 + bias_data[i]; - (dst_data + i + 2 * dst_step * C8NUM + 4 * C8NUM)[0] = m24 + bias_data[i]; - - (dst_data + i + 3 * dst_step * C8NUM)[0] = m30 + bias_data[i]; - (dst_data + i + 3 * dst_step * C8NUM + C8NUM)[0] = m31 + bias_data[i]; - (dst_data + i + 3 * dst_step * C8NUM + 2 * C8NUM)[0] = m32 + bias_data[i]; - (dst_data + i + 3 * dst_step * C8NUM + 3 * C8NUM)[0] = m33 + bias_data[i]; - (dst_data + i + 3 * dst_step * C8NUM + 4 * C8NUM)[0] = m34 + bias_data[i]; - - (dst_data + i + 4 * dst_step * C8NUM)[0] = m40 + bias_data[i]; - (dst_data + i + 4 * dst_step * C8NUM + C8NUM)[0] = m41 + bias_data[i]; - (dst_data + i + 4 * dst_step * C8NUM + 2 * C8NUM)[0] = m42 + bias_data[i]; - (dst_data + i + 4 * dst_step * C8NUM + 3 * C8NUM)[0] = m43 + bias_data[i]; - (dst_data + i + 4 * dst_step * C8NUM + 4 * C8NUM)[0] = m44 + bias_data[i]; - } -#endif -} - -void OutputTransform8x6UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data, - int src_step, int dst_step) { -#ifdef ENABLE_ARM - float16x8_t src_data_00 = vld1q_f16(src_data + 0 * src_step); - float16x8_t src_data_01 = 
vld1q_f16(src_data + 1 * src_step); - float16x8_t src_data_02 = vld1q_f16(src_data + 2 * src_step); - float16x8_t src_data_03 = vld1q_f16(src_data + 3 * src_step); - float16x8_t src_data_04 = vld1q_f16(src_data + 4 * src_step); - float16x8_t src_data_05 = vld1q_f16(src_data + 5 * src_step); - float16x8_t src_data_06 = vld1q_f16(src_data + 6 * src_step); - float16x8_t src_data_07 = vld1q_f16(src_data + 7 * src_step); - float16x8_t src_data_10 = vld1q_f16(src_data + 8 * src_step); - float16x8_t src_data_11 = vld1q_f16(src_data + 9 * src_step); - float16x8_t src_data_12 = vld1q_f16(src_data + 10 * src_step); - float16x8_t src_data_13 = vld1q_f16(src_data + 11 * src_step); - float16x8_t src_data_14 = vld1q_f16(src_data + 12 * src_step); - float16x8_t src_data_15 = vld1q_f16(src_data + 13 * src_step); - float16x8_t src_data_16 = vld1q_f16(src_data + 14 * src_step); - float16x8_t src_data_17 = vld1q_f16(src_data + 15 * src_step); - float16x8_t src_data_20 = vld1q_f16(src_data + 16 * src_step); - float16x8_t src_data_21 = vld1q_f16(src_data + 17 * src_step); - float16x8_t src_data_22 = vld1q_f16(src_data + 18 * src_step); - float16x8_t src_data_23 = vld1q_f16(src_data + 19 * src_step); - float16x8_t src_data_24 = vld1q_f16(src_data + 20 * src_step); - float16x8_t src_data_25 = vld1q_f16(src_data + 21 * src_step); - float16x8_t src_data_26 = vld1q_f16(src_data + 22 * src_step); - float16x8_t src_data_27 = vld1q_f16(src_data + 23 * src_step); - float16x8_t src_data_30 = vld1q_f16(src_data + 24 * src_step); - float16x8_t src_data_31 = vld1q_f16(src_data + 25 * src_step); - float16x8_t src_data_32 = vld1q_f16(src_data + 26 * src_step); - float16x8_t src_data_33 = vld1q_f16(src_data + 27 * src_step); - float16x8_t src_data_34 = vld1q_f16(src_data + 28 * src_step); - float16x8_t src_data_35 = vld1q_f16(src_data + 29 * src_step); - float16x8_t src_data_36 = vld1q_f16(src_data + 30 * src_step); - float16x8_t src_data_37 = vld1q_f16(src_data + 31 * src_step); - float16x8_t 
src_data_40 = vld1q_f16(src_data + 32 * src_step); - float16x8_t src_data_41 = vld1q_f16(src_data + 33 * src_step); - float16x8_t src_data_42 = vld1q_f16(src_data + 34 * src_step); - float16x8_t src_data_43 = vld1q_f16(src_data + 35 * src_step); - float16x8_t src_data_44 = vld1q_f16(src_data + 36 * src_step); - float16x8_t src_data_45 = vld1q_f16(src_data + 37 * src_step); - float16x8_t src_data_46 = vld1q_f16(src_data + 38 * src_step); - float16x8_t src_data_47 = vld1q_f16(src_data + 39 * src_step); - float16x8_t src_data_50 = vld1q_f16(src_data + 40 * src_step); - float16x8_t src_data_51 = vld1q_f16(src_data + 41 * src_step); - float16x8_t src_data_52 = vld1q_f16(src_data + 42 * src_step); - float16x8_t src_data_53 = vld1q_f16(src_data + 43 * src_step); - float16x8_t src_data_54 = vld1q_f16(src_data + 44 * src_step); - float16x8_t src_data_55 = vld1q_f16(src_data + 45 * src_step); - float16x8_t src_data_56 = vld1q_f16(src_data + 46 * src_step); - float16x8_t src_data_57 = vld1q_f16(src_data + 47 * src_step); - float16x8_t src_data_60 = vld1q_f16(src_data + 48 * src_step); - float16x8_t src_data_61 = vld1q_f16(src_data + 49 * src_step); - float16x8_t src_data_62 = vld1q_f16(src_data + 50 * src_step); - float16x8_t src_data_63 = vld1q_f16(src_data + 51 * src_step); - float16x8_t src_data_64 = vld1q_f16(src_data + 52 * src_step); - float16x8_t src_data_65 = vld1q_f16(src_data + 53 * src_step); - float16x8_t src_data_66 = vld1q_f16(src_data + 54 * src_step); - float16x8_t src_data_67 = vld1q_f16(src_data + 55 * src_step); - float16x8_t src_data_70 = vld1q_f16(src_data + 56 * src_step); - float16x8_t src_data_71 = vld1q_f16(src_data + 57 * src_step); - float16x8_t src_data_72 = vld1q_f16(src_data + 58 * src_step); - float16x8_t src_data_73 = vld1q_f16(src_data + 59 * src_step); - float16x8_t src_data_74 = vld1q_f16(src_data + 60 * src_step); - float16x8_t src_data_75 = vld1q_f16(src_data + 61 * src_step); - float16x8_t src_data_76 = vld1q_f16(src_data + 62 * 
src_step); - float16x8_t src_data_77 = vld1q_f16(src_data + 63 * src_step); - - float16x8_t d01 = vsubq_f16(src_data_10, src_data_20); - float16x8_t d02 = vsubq_f16(src_data_11, src_data_21); - float16x8_t d03 = vsubq_f16(src_data_12, src_data_22); - float16x8_t d04 = vsubq_f16(src_data_13, src_data_23); - float16x8_t d05 = vsubq_f16(src_data_14, src_data_24); - float16x8_t d06 = vsubq_f16(src_data_15, src_data_25); - float16x8_t d07 = vsubq_f16(src_data_16, src_data_26); - float16x8_t d08 = vsubq_f16(src_data_17, src_data_27); - - float16x8_t d11 = vsubq_f16(src_data_30, src_data_40); - float16x8_t d12 = vsubq_f16(src_data_31, src_data_41); - float16x8_t d13 = vsubq_f16(src_data_32, src_data_42); - float16x8_t d14 = vsubq_f16(src_data_33, src_data_43); - float16x8_t d15 = vsubq_f16(src_data_34, src_data_44); - float16x8_t d16 = vsubq_f16(src_data_35, src_data_45); - float16x8_t d17 = vsubq_f16(src_data_36, src_data_46); - float16x8_t d18 = vsubq_f16(src_data_37, src_data_47); - - float16x8_t d21 = vsubq_f16(src_data_50, src_data_60); - float16x8_t d22 = vsubq_f16(src_data_51, src_data_61); - float16x8_t d23 = vsubq_f16(src_data_52, src_data_62); - float16x8_t d24 = vsubq_f16(src_data_53, src_data_63); - float16x8_t d25 = vsubq_f16(src_data_54, src_data_64); - float16x8_t d26 = vsubq_f16(src_data_55, src_data_65); - float16x8_t d27 = vsubq_f16(src_data_56, src_data_66); - float16x8_t d28 = vsubq_f16(src_data_57, src_data_67); - - float16x8_t d31 = vaddq_f16(src_data_10, src_data_20); - float16x8_t d32 = vaddq_f16(src_data_11, src_data_21); - float16x8_t d33 = vaddq_f16(src_data_12, src_data_22); - float16x8_t d34 = vaddq_f16(src_data_13, src_data_23); - float16x8_t d35 = vaddq_f16(src_data_14, src_data_24); - float16x8_t d36 = vaddq_f16(src_data_15, src_data_25); - float16x8_t d37 = vaddq_f16(src_data_16, src_data_26); - float16x8_t d38 = vaddq_f16(src_data_17, src_data_27); - - float16x8_t d41 = vaddq_f16(src_data_30, src_data_40); - float16x8_t d42 = 
vaddq_f16(src_data_31, src_data_41); - float16x8_t d43 = vaddq_f16(src_data_32, src_data_42); - float16x8_t d44 = vaddq_f16(src_data_33, src_data_43); - float16x8_t d45 = vaddq_f16(src_data_34, src_data_44); - float16x8_t d46 = vaddq_f16(src_data_35, src_data_45); - float16x8_t d47 = vaddq_f16(src_data_36, src_data_46); - float16x8_t d48 = vaddq_f16(src_data_37, src_data_47); - - float16x8_t d51 = vaddq_f16(src_data_50, src_data_60); - float16x8_t d52 = vaddq_f16(src_data_51, src_data_61); - float16x8_t d53 = vaddq_f16(src_data_52, src_data_62); - float16x8_t d54 = vaddq_f16(src_data_53, src_data_63); - float16x8_t d55 = vaddq_f16(src_data_54, src_data_64); - float16x8_t d56 = vaddq_f16(src_data_55, src_data_65); - float16x8_t d57 = vaddq_f16(src_data_56, src_data_66); - float16x8_t d58 = vaddq_f16(src_data_57, src_data_67); - - float16x8_t t00 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_00, src_data_10), src_data_20), src_data_30), src_data_40), - src_data_50), - src_data_60); - float16x8_t t01 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_01, src_data_11), src_data_21), src_data_31), src_data_41), - src_data_51), - src_data_61); - float16x8_t t02 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_02, src_data_12), src_data_22), src_data_32), src_data_42), - src_data_52), - src_data_62); - float16x8_t t03 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_03, src_data_13), src_data_23), src_data_33), src_data_43), - src_data_53), - src_data_63); - float16x8_t t04 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_04, src_data_14), src_data_24), src_data_34), src_data_44), - src_data_54), - src_data_64); - float16x8_t t05 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_05, src_data_15), src_data_25), src_data_35), src_data_45), - src_data_55), - src_data_65); - float16x8_t t06 = vaddq_f16( - 
vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_06, src_data_16), src_data_26), src_data_36), src_data_46), - src_data_56), - src_data_66); - float16x8_t t07 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_07, src_data_17), src_data_27), src_data_37), src_data_47), - src_data_57), - src_data_67); - - float16x8_t t10 = vaddq_f16(vaddq_f16(vmulq_n_f16(d01, 0.5), d11), vmulq_n_f16(d21, 1.5)); - float16x8_t t11 = vaddq_f16(vaddq_f16(vmulq_n_f16(d02, 0.5), d12), vmulq_n_f16(d22, 1.5)); - float16x8_t t12 = vaddq_f16(vaddq_f16(vmulq_n_f16(d03, 0.5), d13), vmulq_n_f16(d23, 1.5)); - float16x8_t t13 = vaddq_f16(vaddq_f16(vmulq_n_f16(d04, 0.5), d14), vmulq_n_f16(d24, 1.5)); - float16x8_t t14 = vaddq_f16(vaddq_f16(vmulq_n_f16(d05, 0.5), d15), vmulq_n_f16(d25, 1.5)); - float16x8_t t15 = vaddq_f16(vaddq_f16(vmulq_n_f16(d06, 0.5), d16), vmulq_n_f16(d26, 1.5)); - float16x8_t t16 = vaddq_f16(vaddq_f16(vmulq_n_f16(d07, 0.5), d17), vmulq_n_f16(d27, 1.5)); - float16x8_t t17 = vaddq_f16(vaddq_f16(vmulq_n_f16(d08, 0.5), d18), vmulq_n_f16(d28, 1.5)); - - float16x8_t t20 = vaddq_f16(vaddq_f16(vmulq_n_f16(d31, 0.25), d41), vmulq_n_f16(d51, 2.25)); - float16x8_t t21 = vaddq_f16(vaddq_f16(vmulq_n_f16(d32, 0.25), d42), vmulq_n_f16(d52, 2.25)); - float16x8_t t22 = vaddq_f16(vaddq_f16(vmulq_n_f16(d33, 0.25), d43), vmulq_n_f16(d53, 2.25)); - float16x8_t t23 = vaddq_f16(vaddq_f16(vmulq_n_f16(d34, 0.25), d44), vmulq_n_f16(d54, 2.25)); - float16x8_t t24 = vaddq_f16(vaddq_f16(vmulq_n_f16(d35, 0.25), d45), vmulq_n_f16(d55, 2.25)); - float16x8_t t25 = vaddq_f16(vaddq_f16(vmulq_n_f16(d36, 0.25), d46), vmulq_n_f16(d56, 2.25)); - float16x8_t t26 = vaddq_f16(vaddq_f16(vmulq_n_f16(d37, 0.25), d47), vmulq_n_f16(d57, 2.25)); - float16x8_t t27 = vaddq_f16(vaddq_f16(vmulq_n_f16(d38, 0.25), d48), vmulq_n_f16(d58, 2.25)); - - float16x8_t t30 = vaddq_f16(vaddq_f16(vmulq_n_f16(d01, 0.125), d11), vmulq_n_f16(d21, 3.375)); - float16x8_t t31 = 
vaddq_f16(vaddq_f16(vmulq_n_f16(d02, 0.125), d12), vmulq_n_f16(d22, 3.375)); - float16x8_t t32 = vaddq_f16(vaddq_f16(vmulq_n_f16(d03, 0.125), d13), vmulq_n_f16(d23, 3.375)); - float16x8_t t33 = vaddq_f16(vaddq_f16(vmulq_n_f16(d04, 0.125), d14), vmulq_n_f16(d24, 3.375)); - float16x8_t t34 = vaddq_f16(vaddq_f16(vmulq_n_f16(d05, 0.125), d15), vmulq_n_f16(d25, 3.375)); - float16x8_t t35 = vaddq_f16(vaddq_f16(vmulq_n_f16(d06, 0.125), d16), vmulq_n_f16(d26, 3.375)); - float16x8_t t36 = vaddq_f16(vaddq_f16(vmulq_n_f16(d07, 0.125), d17), vmulq_n_f16(d27, 3.375)); - float16x8_t t37 = vaddq_f16(vaddq_f16(vmulq_n_f16(d08, 0.125), d18), vmulq_n_f16(d28, 3.375)); - - float16x8_t t40 = vaddq_f16(vaddq_f16(vmulq_n_f16(d31, 0.0625), d41), vmulq_n_f16(d51, 5.0625)); - float16x8_t t41 = vaddq_f16(vaddq_f16(vmulq_n_f16(d32, 0.0625), d42), vmulq_n_f16(d52, 5.0625)); - float16x8_t t42 = vaddq_f16(vaddq_f16(vmulq_n_f16(d33, 0.0625), d43), vmulq_n_f16(d53, 5.0625)); - float16x8_t t43 = vaddq_f16(vaddq_f16(vmulq_n_f16(d34, 0.0625), d44), vmulq_n_f16(d54, 5.0625)); - float16x8_t t44 = vaddq_f16(vaddq_f16(vmulq_n_f16(d35, 0.0625), d45), vmulq_n_f16(d55, 5.0625)); - float16x8_t t45 = vaddq_f16(vaddq_f16(vmulq_n_f16(d36, 0.0625), d46), vmulq_n_f16(d56, 5.0625)); - float16x8_t t46 = vaddq_f16(vaddq_f16(vmulq_n_f16(d37, 0.0625), d47), vmulq_n_f16(d57, 5.0625)); - float16x8_t t47 = vaddq_f16(vaddq_f16(vmulq_n_f16(d38, 0.0625), d48), vmulq_n_f16(d58, 5.0625)); - - float16x8_t t50 = - vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d01, 0.03125), d11), vmulq_n_f16(d21, 7.59375)), src_data_70); - float16x8_t t51 = - vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d02, 0.03125), d12), vmulq_n_f16(d22, 7.59375)), src_data_71); - float16x8_t t52 = - vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d03, 0.03125), d13), vmulq_n_f16(d23, 7.59375)), src_data_72); - float16x8_t t53 = - vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d04, 0.03125), d14), vmulq_n_f16(d24, 7.59375)), src_data_73); - float16x8_t t54 = - 
vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d05, 0.03125), d15), vmulq_n_f16(d25, 7.59375)), src_data_74); - float16x8_t t55 = - vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d06, 0.03125), d16), vmulq_n_f16(d26, 7.59375)), src_data_75); - float16x8_t t56 = - vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d07, 0.03125), d17), vmulq_n_f16(d27, 7.59375)), src_data_76); - float16x8_t t57 = - vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d08, 0.03125), d18), vmulq_n_f16(d28, 7.59375)), src_data_77); - - float16x8_t s11 = vsubq_f16(t01, t02); - float16x8_t s12 = vsubq_f16(t11, t12); - float16x8_t s13 = vsubq_f16(t21, t22); - float16x8_t s14 = vsubq_f16(t31, t32); - float16x8_t s15 = vsubq_f16(t41, t42); - float16x8_t s16 = vsubq_f16(t51, t52); - - float16x8_t s21 = vsubq_f16(t03, t04); - float16x8_t s22 = vsubq_f16(t13, t14); - float16x8_t s23 = vsubq_f16(t23, t24); - float16x8_t s24 = vsubq_f16(t33, t34); - float16x8_t s25 = vsubq_f16(t43, t44); - float16x8_t s26 = vsubq_f16(t53, t54); - - float16x8_t s31 = vsubq_f16(t05, t06); - float16x8_t s32 = vsubq_f16(t15, t16); - float16x8_t s33 = vsubq_f16(t25, t26); - float16x8_t s34 = vsubq_f16(t35, t36); - float16x8_t s35 = vsubq_f16(t45, t46); - float16x8_t s36 = vsubq_f16(t55, t56); - - float16x8_t s41 = vaddq_f16(t01, t02); - float16x8_t s42 = vaddq_f16(t11, t12); - float16x8_t s43 = vaddq_f16(t21, t22); - float16x8_t s44 = vaddq_f16(t31, t32); - float16x8_t s45 = vaddq_f16(t41, t42); - float16x8_t s46 = vaddq_f16(t51, t52); - - float16x8_t s51 = vaddq_f16(t03, t04); - float16x8_t s52 = vaddq_f16(t13, t14); - float16x8_t s53 = vaddq_f16(t23, t24); - float16x8_t s54 = vaddq_f16(t33, t34); - float16x8_t s55 = vaddq_f16(t43, t44); - float16x8_t s56 = vaddq_f16(t53, t54); - - float16x8_t s61 = vaddq_f16(t05, t06); - float16x8_t s62 = vaddq_f16(t15, t16); - float16x8_t s63 = vaddq_f16(t25, t26); - float16x8_t s64 = vaddq_f16(t35, t36); - float16x8_t s65 = vaddq_f16(t45, t46); - float16x8_t s66 = vaddq_f16(t55, t56); - - float16x8_t m00 = 
vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t00, t01), t02), t03), t04), t05), t06); - float16x8_t m01 = vaddq_f16(vaddq_f16(vmulq_n_f16(s11, 0.5), s21), vmulq_n_f16(s31, 1.5)); - float16x8_t m02 = vaddq_f16(vaddq_f16(vmulq_n_f16(s41, 0.25), s51), vmulq_n_f16(s61, 2.25)); - float16x8_t m03 = vaddq_f16(vaddq_f16(vmulq_n_f16(s11, 0.125), s21), vmulq_n_f16(s31, 3.375)); - float16x8_t m04 = vaddq_f16(vaddq_f16(vmulq_n_f16(s41, 0.0625), s51), vmulq_n_f16(s61, 5.0625)); - float16x8_t m05 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s11, 0.03125), s21), vmulq_n_f16(s31, 7.59375)), t07); - - float16x8_t m10 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t10, t11), t12), t13), t14), t15), t16); - float16x8_t m11 = vaddq_f16(vaddq_f16(vmulq_n_f16(s12, 0.5), s22), vmulq_n_f16(s32, 1.5)); - float16x8_t m12 = vaddq_f16(vaddq_f16(vmulq_n_f16(s42, 0.25), s52), vmulq_n_f16(s62, 2.25)); - float16x8_t m13 = vaddq_f16(vaddq_f16(vmulq_n_f16(s12, 0.125), s22), vmulq_n_f16(s32, 3.375)); - float16x8_t m14 = vaddq_f16(vaddq_f16(vmulq_n_f16(s42, 0.0625), s52), vmulq_n_f16(s62, 5.0625)); - float16x8_t m15 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s12, 0.03125), s22), vmulq_n_f16(s32, 7.59375)), t17); - - float16x8_t m20 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t20, t21), t22), t23), t24), t25), t26); - float16x8_t m21 = vaddq_f16(vaddq_f16(vmulq_n_f16(s13, 0.5), s23), vmulq_n_f16(s33, 1.5)); - float16x8_t m22 = vaddq_f16(vaddq_f16(vmulq_n_f16(s43, 0.25), s53), vmulq_n_f16(s63, 2.25)); - float16x8_t m23 = vaddq_f16(vaddq_f16(vmulq_n_f16(s13, 0.125), s23), vmulq_n_f16(s33, 3.375)); - float16x8_t m24 = vaddq_f16(vaddq_f16(vmulq_n_f16(s43, 0.0625), s53), vmulq_n_f16(s63, 5.0625)); - float16x8_t m25 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s13, 0.03125), s23), vmulq_n_f16(s33, 7.59375)), t27); - - float16x8_t m30 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t30, t31), t32), t33), t34), t35), t36); - float16x8_t m31 = 
vaddq_f16(vaddq_f16(vmulq_n_f16(s14, 0.5), s24), vmulq_n_f16(s34, 1.5)); - float16x8_t m32 = vaddq_f16(vaddq_f16(vmulq_n_f16(s44, 0.25), s54), vmulq_n_f16(s64, 2.25)); - float16x8_t m33 = vaddq_f16(vaddq_f16(vmulq_n_f16(s14, 0.125), s24), vmulq_n_f16(s34, 3.375)); - float16x8_t m34 = vaddq_f16(vaddq_f16(vmulq_n_f16(s44, 0.0625), s54), vmulq_n_f16(s64, 5.0625)); - float16x8_t m35 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s14, 0.03125), s24), vmulq_n_f16(s34, 7.59375)), t37); - - float16x8_t m40 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t40, t41), t42), t43), t44), t45), t46); - float16x8_t m41 = vaddq_f16(vaddq_f16(vmulq_n_f16(s15, 0.5), s25), vmulq_n_f16(s35, 1.5)); - float16x8_t m42 = vaddq_f16(vaddq_f16(vmulq_n_f16(s45, 0.25), s55), vmulq_n_f16(s65, 2.25)); - float16x8_t m43 = vaddq_f16(vaddq_f16(vmulq_n_f16(s15, 0.125), s25), vmulq_n_f16(s35, 3.375)); - float16x8_t m44 = vaddq_f16(vaddq_f16(vmulq_n_f16(s45, 0.0625), s55), vmulq_n_f16(s65, 5.0625)); - float16x8_t m45 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s15, 0.03125), s25), vmulq_n_f16(s35, 7.59375)), t47); - - float16x8_t m50 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t50, t51), t52), t53), t54), t55), t56); - float16x8_t m51 = vaddq_f16(vaddq_f16(vmulq_n_f16(s16, 0.5), s26), vmulq_n_f16(s36, 1.5)); - float16x8_t m52 = vaddq_f16(vaddq_f16(vmulq_n_f16(s46, 0.25), s56), vmulq_n_f16(s66, 2.25)); - float16x8_t m53 = vaddq_f16(vaddq_f16(vmulq_n_f16(s16, 0.125), s26), vmulq_n_f16(s36, 3.375)); - float16x8_t m54 = vaddq_f16(vaddq_f16(vmulq_n_f16(s46, 0.0625), s56), vmulq_n_f16(s66, 5.0625)); - float16x8_t m55 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s16, 0.03125), s26), vmulq_n_f16(s36, 7.59375)), t57); - - float16x8_t bias_ptr = vld1q_f16(bias_data); - vst1q_f16(dst_data, vaddq_f16(m00, bias_ptr)); - vst1q_f16(dst_data + C8NUM, vaddq_f16(m01, bias_ptr)); - vst1q_f16(dst_data + 2 * C8NUM, vaddq_f16(m02, bias_ptr)); - vst1q_f16(dst_data + 3 * C8NUM, vaddq_f16(m03, 
bias_ptr)); - vst1q_f16(dst_data + 4 * C8NUM, vaddq_f16(m04, bias_ptr)); - vst1q_f16(dst_data + 5 * C8NUM, vaddq_f16(m05, bias_ptr)); - vst1q_f16(dst_data + dst_step * C8NUM, vaddq_f16(m10, bias_ptr)); - vst1q_f16(dst_data + dst_step * C8NUM + C8NUM, vaddq_f16(m11, bias_ptr)); - vst1q_f16(dst_data + dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m12, bias_ptr)); - vst1q_f16(dst_data + dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m13, bias_ptr)); - vst1q_f16(dst_data + dst_step * C8NUM + 4 * C8NUM, vaddq_f16(m14, bias_ptr)); - vst1q_f16(dst_data + dst_step * C8NUM + 5 * C8NUM, vaddq_f16(m15, bias_ptr)); - vst1q_f16(dst_data + 2 * dst_step * C8NUM, vaddq_f16(m20, bias_ptr)); - vst1q_f16(dst_data + 2 * dst_step * C8NUM + C8NUM, vaddq_f16(m21, bias_ptr)); - vst1q_f16(dst_data + 2 * dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m22, bias_ptr)); - vst1q_f16(dst_data + 2 * dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m23, bias_ptr)); - vst1q_f16(dst_data + 2 * dst_step * C8NUM + 4 * C8NUM, vaddq_f16(m24, bias_ptr)); - vst1q_f16(dst_data + 2 * dst_step * C8NUM + 5 * C8NUM, vaddq_f16(m25, bias_ptr)); - vst1q_f16(dst_data + 3 * dst_step * C8NUM, vaddq_f16(m30, bias_ptr)); - vst1q_f16(dst_data + 3 * dst_step * C8NUM + C8NUM, vaddq_f16(m31, bias_ptr)); - vst1q_f16(dst_data + 3 * dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m32, bias_ptr)); - vst1q_f16(dst_data + 3 * dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m33, bias_ptr)); - vst1q_f16(dst_data + 3 * dst_step * C8NUM + 4 * C8NUM, vaddq_f16(m34, bias_ptr)); - vst1q_f16(dst_data + 3 * dst_step * C8NUM + 5 * C8NUM, vaddq_f16(m35, bias_ptr)); - vst1q_f16(dst_data + 4 * dst_step * C8NUM, vaddq_f16(m40, bias_ptr)); - vst1q_f16(dst_data + 4 * dst_step * C8NUM + C8NUM, vaddq_f16(m41, bias_ptr)); - vst1q_f16(dst_data + 4 * dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m42, bias_ptr)); - vst1q_f16(dst_data + 4 * dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m43, bias_ptr)); - vst1q_f16(dst_data + 4 * dst_step * C8NUM + 4 * C8NUM, vaddq_f16(m44, bias_ptr)); - vst1q_f16(dst_data 
+ 4 * dst_step * C8NUM + 5 * C8NUM, vaddq_f16(m45, bias_ptr)); - vst1q_f16(dst_data + 5 * dst_step * C8NUM, vaddq_f16(m50, bias_ptr)); - vst1q_f16(dst_data + 5 * dst_step * C8NUM + C8NUM, vaddq_f16(m51, bias_ptr)); - vst1q_f16(dst_data + 5 * dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m52, bias_ptr)); - vst1q_f16(dst_data + 5 * dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m53, bias_ptr)); - vst1q_f16(dst_data + 5 * dst_step * C8NUM + 4 * C8NUM, vaddq_f16(m54, bias_ptr)); - vst1q_f16(dst_data + 5 * dst_step * C8NUM + 5 * C8NUM, vaddq_f16(m55, bias_ptr)); -#else - for (int i = 0; i < C8NUM; i++) { - float16_t src_data_00 = src_data[i]; - float16_t src_data_01 = src_data[i + src_step]; - float16_t src_data_02 = src_data[i + 2 * src_step]; - float16_t src_data_03 = src_data[i + 3 * src_step]; - float16_t src_data_04 = src_data[i + 4 * src_step]; - float16_t src_data_05 = src_data[i + 5 * src_step]; - float16_t src_data_06 = src_data[i + 6 * src_step]; - float16_t src_data_07 = src_data[i + 7 * src_step]; - float16_t src_data_10 = src_data[i + 8 * src_step]; - float16_t src_data_11 = src_data[i + 9 * src_step]; - float16_t src_data_12 = src_data[i + 10 * src_step]; - float16_t src_data_13 = src_data[i + 11 * src_step]; - float16_t src_data_14 = src_data[i + 12 * src_step]; - float16_t src_data_15 = src_data[i + 13 * src_step]; - float16_t src_data_16 = src_data[i + 14 * src_step]; - float16_t src_data_17 = src_data[i + 15 * src_step]; - float16_t src_data_20 = src_data[i + 16 * src_step]; - float16_t src_data_21 = src_data[i + 17 * src_step]; - float16_t src_data_22 = src_data[i + 18 * src_step]; - float16_t src_data_23 = src_data[i + 19 * src_step]; - float16_t src_data_24 = src_data[i + 20 * src_step]; - float16_t src_data_25 = src_data[i + 21 * src_step]; - float16_t src_data_26 = src_data[i + 22 * src_step]; - float16_t src_data_27 = src_data[i + 23 * src_step]; - float16_t src_data_30 = src_data[i + 24 * src_step]; - float16_t src_data_31 = src_data[i + 25 * src_step]; - 
float16_t src_data_32 = src_data[i + 26 * src_step]; - float16_t src_data_33 = src_data[i + 27 * src_step]; - float16_t src_data_34 = src_data[i + 28 * src_step]; - float16_t src_data_35 = src_data[i + 29 * src_step]; - float16_t src_data_36 = src_data[i + 30 * src_step]; - float16_t src_data_37 = src_data[i + 31 * src_step]; - float16_t src_data_40 = src_data[i + 32 * src_step]; - float16_t src_data_41 = src_data[i + 33 * src_step]; - float16_t src_data_42 = src_data[i + 34 * src_step]; - float16_t src_data_43 = src_data[i + 35 * src_step]; - float16_t src_data_44 = src_data[i + 36 * src_step]; - float16_t src_data_45 = src_data[i + 37 * src_step]; - float16_t src_data_46 = src_data[i + 38 * src_step]; - float16_t src_data_47 = src_data[i + 39 * src_step]; - float16_t src_data_50 = src_data[i + 40 * src_step]; - float16_t src_data_51 = src_data[i + 41 * src_step]; - float16_t src_data_52 = src_data[i + 42 * src_step]; - float16_t src_data_53 = src_data[i + 43 * src_step]; - float16_t src_data_54 = src_data[i + 44 * src_step]; - float16_t src_data_55 = src_data[i + 45 * src_step]; - float16_t src_data_56 = src_data[i + 46 * src_step]; - float16_t src_data_57 = src_data[i + 47 * src_step]; - float16_t src_data_60 = src_data[i + 48 * src_step]; - float16_t src_data_61 = src_data[i + 49 * src_step]; - float16_t src_data_62 = src_data[i + 50 * src_step]; - float16_t src_data_63 = src_data[i + 51 * src_step]; - float16_t src_data_64 = src_data[i + 52 * src_step]; - float16_t src_data_65 = src_data[i + 53 * src_step]; - float16_t src_data_66 = src_data[i + 54 * src_step]; - float16_t src_data_67 = src_data[i + 55 * src_step]; - float16_t src_data_70 = src_data[i + 56 * src_step]; - float16_t src_data_71 = src_data[i + 57 * src_step]; - float16_t src_data_72 = src_data[i + 58 * src_step]; - float16_t src_data_73 = src_data[i + 59 * src_step]; - float16_t src_data_74 = src_data[i + 60 * src_step]; - float16_t src_data_75 = src_data[i + 61 * src_step]; - float16_t 
src_data_76 = src_data[i + 62 * src_step]; - float16_t src_data_77 = src_data[i + 63 * src_step]; - - float16_t d01 = src_data_10 - src_data_20; - float16_t d02 = src_data_11 - src_data_21; - float16_t d03 = src_data_12 - src_data_22; - float16_t d04 = src_data_13 - src_data_23; - float16_t d05 = src_data_14 - src_data_24; - float16_t d06 = src_data_15 - src_data_25; - float16_t d07 = src_data_16 - src_data_26; - float16_t d08 = src_data_17 - src_data_27; - - float16_t d11 = src_data_30 - src_data_40; - float16_t d12 = src_data_31 - src_data_41; - float16_t d13 = src_data_32 - src_data_42; - float16_t d14 = src_data_33 - src_data_43; - float16_t d15 = src_data_34 - src_data_44; - float16_t d16 = src_data_35 - src_data_45; - float16_t d17 = src_data_36 - src_data_46; - float16_t d18 = src_data_37 - src_data_47; - - float16_t d21 = src_data_50 - src_data_60; - float16_t d22 = src_data_51 - src_data_61; - float16_t d23 = src_data_52 - src_data_62; - float16_t d24 = src_data_53 - src_data_63; - float16_t d25 = src_data_54 - src_data_64; - float16_t d26 = src_data_55 - src_data_65; - float16_t d27 = src_data_56 - src_data_66; - float16_t d28 = src_data_57 - src_data_67; - - float16_t d31 = src_data_10 + src_data_20; - float16_t d32 = src_data_11 + src_data_21; - float16_t d33 = src_data_12 + src_data_22; - float16_t d34 = src_data_13 + src_data_23; - float16_t d35 = src_data_14 + src_data_24; - float16_t d36 = src_data_15 + src_data_25; - float16_t d37 = src_data_16 + src_data_26; - float16_t d38 = src_data_17 + src_data_27; - - float16_t d41 = src_data_30 + src_data_40; - float16_t d42 = src_data_31 + src_data_41; - float16_t d43 = src_data_32 + src_data_42; - float16_t d44 = src_data_33 + src_data_43; - float16_t d45 = src_data_34 + src_data_44; - float16_t d46 = src_data_35 + src_data_45; - float16_t d47 = src_data_36 + src_data_46; - float16_t d48 = src_data_37 + src_data_47; - - float16_t d51 = src_data_50 + src_data_60; - float16_t d52 = src_data_51 + src_data_61; 
- float16_t d53 = src_data_52 + src_data_62; - float16_t d54 = src_data_53 + src_data_63; - float16_t d55 = src_data_54 + src_data_64; - float16_t d56 = src_data_55 + src_data_65; - float16_t d57 = src_data_56 + src_data_66; - float16_t d58 = src_data_57 + src_data_67; - - float16_t t00 = src_data_00 + src_data_10 + src_data_20 + src_data_30 + src_data_40 + src_data_50 + src_data_60; - float16_t t01 = src_data_01 + src_data_11 + src_data_21 + src_data_31 + src_data_41 + src_data_51 + src_data_61; - float16_t t02 = src_data_02 + src_data_12 + src_data_22 + src_data_32 + src_data_42 + src_data_52 + src_data_62; - float16_t t03 = src_data_03 + src_data_13 + src_data_23 + src_data_33 + src_data_43 + src_data_53 + src_data_63; - float16_t t04 = src_data_04 + src_data_14 + src_data_24 + src_data_34 + src_data_44 + src_data_54 + src_data_64; - float16_t t05 = src_data_05 + src_data_15 + src_data_25 + src_data_35 + src_data_45 + src_data_55 + src_data_65; - float16_t t06 = src_data_06 + src_data_16 + src_data_26 + src_data_36 + src_data_46 + src_data_56 + src_data_66; - float16_t t07 = src_data_07 + src_data_17 + src_data_27 + src_data_37 + src_data_47 + src_data_57 + src_data_67; - - const float16_t t10 = 0.5f * d01 + d11 + 1.5f * d21; - const float16_t t11 = 0.5f * d02 + d12 + 1.5f * d22; - const float16_t t12 = 0.5f * d03 + d13 + 1.5f * d23; - const float16_t t13 = 0.5f * d04 + d14 + 1.5f * d24; - const float16_t t14 = 0.5f * d05 + d15 + 1.5f * d25; - const float16_t t15 = 0.5f * d06 + d16 + 1.5f * d26; - const float16_t t16 = 0.5f * d07 + d17 + 1.5f * d27; - const float16_t t17 = 0.5f * d08 + d18 + 1.5f * d28; - - const float16_t t20 = 0.25f * d31 + d41 + 2.25f * d51; - const float16_t t21 = 0.25f * d32 + d42 + 2.25f * d52; - const float16_t t22 = 0.25f * d33 + d43 + 2.25f * d53; - const float16_t t23 = 0.25f * d34 + d44 + 2.25f * d54; - const float16_t t24 = 0.25f * d35 + d45 + 2.25f * d55; - const float16_t t25 = 0.25f * d36 + d46 + 2.25f * d56; - const float16_t t26 
= 0.25f * d37 + d47 + 2.25f * d57; - const float16_t t27 = 0.25f * d38 + d48 + 2.25f * d58; - - const float16_t t30 = 0.125f * d01 + d11 + 3.375f * d21; - const float16_t t31 = 0.125f * d02 + d12 + 3.375f * d22; - const float16_t t32 = 0.125f * d03 + d13 + 3.375f * d23; - const float16_t t33 = 0.125f * d04 + d14 + 3.375f * d24; - const float16_t t34 = 0.125f * d05 + d15 + 3.375f * d25; - const float16_t t35 = 0.125f * d06 + d16 + 3.375f * d26; - const float16_t t36 = 0.125f * d07 + d17 + 3.375f * d27; - const float16_t t37 = 0.125f * d08 + d18 + 3.375f * d28; - - const float16_t t40 = 0.0625f * d31 + d41 + 5.0625f * d51; - const float16_t t41 = 0.0625f * d32 + d42 + 5.0625f * d52; - const float16_t t42 = 0.0625f * d33 + d43 + 5.0625f * d53; - const float16_t t43 = 0.0625f * d34 + d44 + 5.0625f * d54; - const float16_t t44 = 0.0625f * d35 + d45 + 5.0625f * d55; - const float16_t t45 = 0.0625f * d36 + d46 + 5.0625f * d56; - const float16_t t46 = 0.0625f * d37 + d47 + 5.0625f * d57; - const float16_t t47 = 0.0625f * d38 + d48 + 5.0625f * d58; - - const float16_t t50 = 0.03125f * d01 + d11 + 7.59375f * d21 + src_data_70; - const float16_t t51 = 0.03125f * d02 + d12 + 7.59375f * d22 + src_data_71; - const float16_t t52 = 0.03125f * d03 + d13 + 7.59375f * d23 + src_data_72; - const float16_t t53 = 0.03125f * d04 + d14 + 7.59375f * d24 + src_data_73; - const float16_t t54 = 0.03125f * d05 + d15 + 7.59375f * d25 + src_data_74; - const float16_t t55 = 0.03125f * d06 + d16 + 7.59375f * d26 + src_data_75; - const float16_t t56 = 0.03125f * d07 + d17 + 7.59375f * d27 + src_data_76; - const float16_t t57 = 0.03125f * d08 + d18 + 7.59375f * d28 + src_data_77; - - float16_t s11 = t01 - t02; - float16_t s12 = t11 - t12; - float16_t s13 = t21 - t22; - float16_t s14 = t31 - t32; - float16_t s15 = t41 - t42; - float16_t s16 = t51 - t52; - - float16_t s21 = t03 - t04; - float16_t s22 = t13 - t14; - float16_t s23 = t23 - t24; - float16_t s24 = t33 - t34; - float16_t s25 = t43 - t44; - 
float16_t s26 = t53 - t54; - - float16_t s31 = t05 - t06; - float16_t s32 = t15 - t16; - float16_t s33 = t25 - t26; - float16_t s34 = t35 - t36; - float16_t s35 = t45 - t46; - float16_t s36 = t55 - t56; - - float16_t s41 = t01 + t02; - float16_t s42 = t11 + t12; - float16_t s43 = t21 + t22; - float16_t s44 = t31 + t32; - float16_t s45 = t41 + t42; - float16_t s46 = t51 + t52; - - float16_t s51 = t03 + t04; - float16_t s52 = t13 + t14; - float16_t s53 = t23 + t24; - float16_t s54 = t33 + t34; - float16_t s55 = t43 + t44; - float16_t s56 = t53 + t54; - - float16_t s61 = t05 + t06; - float16_t s62 = t15 + t16; - float16_t s63 = t25 + t26; - float16_t s64 = t35 + t36; - float16_t s65 = t45 + t46; - float16_t s66 = t55 + t56; - - float16_t m00 = t00 + t01 + t02 + t03 + t04 + t05 + t06; - const float16_t m01 = 0.5f * s11 + s21 + 1.5f * s31; - const float16_t m02 = 0.25f * s41 + s51 + 2.25f * s61; - const float16_t m03 = 0.125f * s11 + s21 + 3.375f * s31; - const float16_t m04 = 0.0625f * s41 + s51 + 5.0625f * s61; - const float16_t m05 = 0.03125f * s11 + s21 + 7.59375f * s31 + t07; - - float16_t m10 = t10 + t11 + t12 + t13 + t14 + t15 + t16; - const float16_t m11 = 0.5f * s12 + s22 + 1.5f * s32; - const float16_t m12 = 0.25f * s42 + s52 + 2.25f * s62; - const float16_t m13 = 0.125f * s12 + s22 + 3.375f * s32; - const float16_t m14 = 0.0625f * s42 + s52 + 5.0625f * s62; - const float16_t m15 = 0.03125f * s12 + s22 + 7.59375f * s32 + t17; - - float16_t m20 = t20 + t21 + t22 + t23 + t24 + t25 + t26; - const float16_t m21 = 0.5f * s13 + s23 + 1.5f * s33; - const float16_t m22 = 0.25f * s43 + s53 + 2.25f * s63; - const float16_t m23 = 0.125f * s13 + s23 + 3.375f * s33; - const float16_t m24 = 0.0625f * s43 + s53 + 5.0625f * s63; - const float16_t m25 = 0.03125f * s13 + s23 + 7.59375f * s33 + t27; - - float16_t m30 = t30 + t31 + t32 + t33 + t34 + t35 + t36; - const float16_t m31 = 0.5f * s14 + s24 + 1.5f * s34; - const float16_t m32 = 0.25f * s44 + s54 + 2.25f * s64; - const 
float16_t m33 = 0.125f * s14 + s24 + 3.375f * s34; - const float16_t m34 = 0.0625f * s44 + s54 + 5.0625f * s64; - const float16_t m35 = 0.03125f * s14 + s24 + 7.59375f * s34 + t37; - - float16_t m40 = t40 + t41 + t42 + t43 + t44 + t45 + t46; - const float16_t m41 = 0.5f * s15 + s25 + 1.5f * s35; - const float16_t m42 = 0.25f * s45 + s55 + 2.25f * s65; - const float16_t m43 = 0.125f * s15 + s25 + 3.375f * s35; - const float16_t m44 = 0.0625f * s45 + s55 + 5.0625f * s65; - const float16_t m45 = 0.03125f * s15 + s25 + 7.59375f * s35 + t47; - - float16_t m50 = t50 + t51 + t52 + t53 + t54 + t55 + t56; - const float16_t m51 = 0.5f * s16 + s26 + 1.5f * s36; - const float16_t m52 = 0.25f * s46 + s56 + 2.25f * s66; - const float16_t m53 = 0.125f * s16 + s26 + 3.375f * s36; - const float16_t m54 = 0.0625f * s46 + s56 + 5.0625f * s66; - const float16_t m55 = 0.03125f * s16 + s26 + 7.59375f * s36 + t57; - - (dst_data + i)[0] = m00 + bias_data[i]; - (dst_data + i + C8NUM)[0] = m01 + bias_data[i]; - (dst_data + i + 2 * C8NUM)[0] = m02 + bias_data[i]; - (dst_data + i + 3 * C8NUM)[0] = m03 + bias_data[i]; - (dst_data + i + 4 * C8NUM)[0] = m04 + bias_data[i]; - (dst_data + i + 5 * C8NUM)[0] = m05 + bias_data[i]; - - (dst_data + i + dst_step * C8NUM)[0] = m10 + bias_data[i]; - (dst_data + i + dst_step * C8NUM + C8NUM)[0] = m11 + bias_data[i]; - (dst_data + i + dst_step * C8NUM + 2 * C8NUM)[0] = m12 + bias_data[i]; - (dst_data + i + dst_step * C8NUM + 3 * C8NUM)[0] = m13 + bias_data[i]; - (dst_data + i + dst_step * C8NUM + 4 * C8NUM)[0] = m14 + bias_data[i]; - (dst_data + i + dst_step * C8NUM + 5 * C8NUM)[0] = m15 + bias_data[i]; - - (dst_data + i + 2 * dst_step * C8NUM)[0] = m20 + bias_data[i]; - (dst_data + i + 2 * dst_step * C8NUM + C8NUM)[0] = m21 + bias_data[i]; - (dst_data + i + 2 * dst_step * C8NUM + 2 * C8NUM)[0] = m22 + bias_data[i]; - (dst_data + i + 2 * dst_step * C8NUM + 3 * C8NUM)[0] = m23 + bias_data[i]; - (dst_data + i + 2 * dst_step * C8NUM + 4 * C8NUM)[0] = m24 + 
bias_data[i]; - (dst_data + i + 2 * dst_step * C8NUM + 5 * C8NUM)[0] = m25 + bias_data[i]; - - (dst_data + i + 3 * dst_step * C8NUM)[0] = m30 + bias_data[i]; - (dst_data + i + 3 * dst_step * C8NUM + C8NUM)[0] = m31 + bias_data[i]; - (dst_data + i + 3 * dst_step * C8NUM + 2 * C8NUM)[0] = m32 + bias_data[i]; - (dst_data + i + 3 * dst_step * C8NUM + 3 * C8NUM)[0] = m33 + bias_data[i]; - (dst_data + i + 3 * dst_step * C8NUM + 4 * C8NUM)[0] = m34 + bias_data[i]; - (dst_data + i + 3 * dst_step * C8NUM + 5 * C8NUM)[0] = m35 + bias_data[i]; - - (dst_data + i + 4 * dst_step * C8NUM)[0] = m40 + bias_data[i]; - (dst_data + i + 4 * dst_step * C8NUM + C8NUM)[0] = m41 + bias_data[i]; - (dst_data + i + 4 * dst_step * C8NUM + 2 * C8NUM)[0] = m42 + bias_data[i]; - (dst_data + i + 4 * dst_step * C8NUM + 3 * C8NUM)[0] = m43 + bias_data[i]; - (dst_data + i + 4 * dst_step * C8NUM + 4 * C8NUM)[0] = m44 + bias_data[i]; - (dst_data + i + 4 * dst_step * C8NUM + 5 * C8NUM)[0] = m45 + bias_data[i]; - - (dst_data + i + 5 * dst_step * C8NUM)[0] = m50 + bias_data[i]; - (dst_data + i + 5 * dst_step * C8NUM + C8NUM)[0] = m51 + bias_data[i]; - (dst_data + i + 5 * dst_step * C8NUM + 2 * C8NUM)[0] = m52 + bias_data[i]; - (dst_data + i + 5 * dst_step * C8NUM + 3 * C8NUM)[0] = m53 + bias_data[i]; - (dst_data + i + 5 * dst_step * C8NUM + 4 * C8NUM)[0] = m54 + bias_data[i]; - (dst_data + i + 5 * dst_step * C8NUM + 5 * C8NUM)[0] = m55 + bias_data[i]; - } -#endif -} - -void OutputTransform8x7UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data, - int src_step, int dst_step) { -#ifdef ENABLE_ARM - float16x8_t src_data_00 = vld1q_f16(src_data + 0 * src_step); - float16x8_t src_data_01 = vld1q_f16(src_data + 1 * src_step); - float16x8_t src_data_02 = vld1q_f16(src_data + 2 * src_step); - float16x8_t src_data_03 = vld1q_f16(src_data + 3 * src_step); - float16x8_t src_data_04 = vld1q_f16(src_data + 4 * src_step); - float16x8_t src_data_05 = vld1q_f16(src_data + 5 * src_step); - 
float16x8_t src_data_06 = vld1q_f16(src_data + 6 * src_step); - float16x8_t src_data_07 = vld1q_f16(src_data + 7 * src_step); - float16x8_t src_data_10 = vld1q_f16(src_data + 8 * src_step); - float16x8_t src_data_11 = vld1q_f16(src_data + 9 * src_step); - float16x8_t src_data_12 = vld1q_f16(src_data + 10 * src_step); - float16x8_t src_data_13 = vld1q_f16(src_data + 11 * src_step); - float16x8_t src_data_14 = vld1q_f16(src_data + 12 * src_step); - float16x8_t src_data_15 = vld1q_f16(src_data + 13 * src_step); - float16x8_t src_data_16 = vld1q_f16(src_data + 14 * src_step); - float16x8_t src_data_17 = vld1q_f16(src_data + 15 * src_step); - float16x8_t src_data_20 = vld1q_f16(src_data + 16 * src_step); - float16x8_t src_data_21 = vld1q_f16(src_data + 17 * src_step); - float16x8_t src_data_22 = vld1q_f16(src_data + 18 * src_step); - float16x8_t src_data_23 = vld1q_f16(src_data + 19 * src_step); - float16x8_t src_data_24 = vld1q_f16(src_data + 20 * src_step); - float16x8_t src_data_25 = vld1q_f16(src_data + 21 * src_step); - float16x8_t src_data_26 = vld1q_f16(src_data + 22 * src_step); - float16x8_t src_data_27 = vld1q_f16(src_data + 23 * src_step); - float16x8_t src_data_30 = vld1q_f16(src_data + 24 * src_step); - float16x8_t src_data_31 = vld1q_f16(src_data + 25 * src_step); - float16x8_t src_data_32 = vld1q_f16(src_data + 26 * src_step); - float16x8_t src_data_33 = vld1q_f16(src_data + 27 * src_step); - float16x8_t src_data_34 = vld1q_f16(src_data + 28 * src_step); - float16x8_t src_data_35 = vld1q_f16(src_data + 29 * src_step); - float16x8_t src_data_36 = vld1q_f16(src_data + 30 * src_step); - float16x8_t src_data_37 = vld1q_f16(src_data + 31 * src_step); - float16x8_t src_data_40 = vld1q_f16(src_data + 32 * src_step); - float16x8_t src_data_41 = vld1q_f16(src_data + 33 * src_step); - float16x8_t src_data_42 = vld1q_f16(src_data + 34 * src_step); - float16x8_t src_data_43 = vld1q_f16(src_data + 35 * src_step); - float16x8_t src_data_44 = vld1q_f16(src_data + 36 * 
src_step); - float16x8_t src_data_45 = vld1q_f16(src_data + 37 * src_step); - float16x8_t src_data_46 = vld1q_f16(src_data + 38 * src_step); - float16x8_t src_data_47 = vld1q_f16(src_data + 39 * src_step); - float16x8_t src_data_50 = vld1q_f16(src_data + 40 * src_step); - float16x8_t src_data_51 = vld1q_f16(src_data + 41 * src_step); - float16x8_t src_data_52 = vld1q_f16(src_data + 42 * src_step); - float16x8_t src_data_53 = vld1q_f16(src_data + 43 * src_step); - float16x8_t src_data_54 = vld1q_f16(src_data + 44 * src_step); - float16x8_t src_data_55 = vld1q_f16(src_data + 45 * src_step); - float16x8_t src_data_56 = vld1q_f16(src_data + 46 * src_step); - float16x8_t src_data_57 = vld1q_f16(src_data + 47 * src_step); - float16x8_t src_data_60 = vld1q_f16(src_data + 48 * src_step); - float16x8_t src_data_61 = vld1q_f16(src_data + 49 * src_step); - float16x8_t src_data_62 = vld1q_f16(src_data + 50 * src_step); - float16x8_t src_data_63 = vld1q_f16(src_data + 51 * src_step); - float16x8_t src_data_64 = vld1q_f16(src_data + 52 * src_step); - float16x8_t src_data_65 = vld1q_f16(src_data + 53 * src_step); - float16x8_t src_data_66 = vld1q_f16(src_data + 54 * src_step); - float16x8_t src_data_67 = vld1q_f16(src_data + 55 * src_step); - float16x8_t src_data_70 = vld1q_f16(src_data + 56 * src_step); - float16x8_t src_data_71 = vld1q_f16(src_data + 57 * src_step); - float16x8_t src_data_72 = vld1q_f16(src_data + 58 * src_step); - float16x8_t src_data_73 = vld1q_f16(src_data + 59 * src_step); - float16x8_t src_data_74 = vld1q_f16(src_data + 60 * src_step); - float16x8_t src_data_75 = vld1q_f16(src_data + 61 * src_step); - float16x8_t src_data_76 = vld1q_f16(src_data + 62 * src_step); - float16x8_t src_data_77 = vld1q_f16(src_data + 63 * src_step); - - float16x8_t d01 = vsubq_f16(src_data_10, src_data_20); - float16x8_t d02 = vsubq_f16(src_data_11, src_data_21); - float16x8_t d03 = vsubq_f16(src_data_12, src_data_22); - float16x8_t d04 = vsubq_f16(src_data_13, src_data_23); - 
float16x8_t d05 = vsubq_f16(src_data_14, src_data_24); - float16x8_t d06 = vsubq_f16(src_data_15, src_data_25); - float16x8_t d07 = vsubq_f16(src_data_16, src_data_26); - float16x8_t d08 = vsubq_f16(src_data_17, src_data_27); - - float16x8_t d11 = vsubq_f16(src_data_30, src_data_40); - float16x8_t d12 = vsubq_f16(src_data_31, src_data_41); - float16x8_t d13 = vsubq_f16(src_data_32, src_data_42); - float16x8_t d14 = vsubq_f16(src_data_33, src_data_43); - float16x8_t d15 = vsubq_f16(src_data_34, src_data_44); - float16x8_t d16 = vsubq_f16(src_data_35, src_data_45); - float16x8_t d17 = vsubq_f16(src_data_36, src_data_46); - float16x8_t d18 = vsubq_f16(src_data_37, src_data_47); - - float16x8_t d21 = vsubq_f16(src_data_50, src_data_60); - float16x8_t d22 = vsubq_f16(src_data_51, src_data_61); - float16x8_t d23 = vsubq_f16(src_data_52, src_data_62); - float16x8_t d24 = vsubq_f16(src_data_53, src_data_63); - float16x8_t d25 = vsubq_f16(src_data_54, src_data_64); - float16x8_t d26 = vsubq_f16(src_data_55, src_data_65); - float16x8_t d27 = vsubq_f16(src_data_56, src_data_66); - float16x8_t d28 = vsubq_f16(src_data_57, src_data_67); - - float16x8_t d31 = vaddq_f16(src_data_10, src_data_20); - float16x8_t d32 = vaddq_f16(src_data_11, src_data_21); - float16x8_t d33 = vaddq_f16(src_data_12, src_data_22); - float16x8_t d34 = vaddq_f16(src_data_13, src_data_23); - float16x8_t d35 = vaddq_f16(src_data_14, src_data_24); - float16x8_t d36 = vaddq_f16(src_data_15, src_data_25); - float16x8_t d37 = vaddq_f16(src_data_16, src_data_26); - float16x8_t d38 = vaddq_f16(src_data_17, src_data_27); - - float16x8_t d41 = vaddq_f16(src_data_30, src_data_40); - float16x8_t d42 = vaddq_f16(src_data_31, src_data_41); - float16x8_t d43 = vaddq_f16(src_data_32, src_data_42); - float16x8_t d44 = vaddq_f16(src_data_33, src_data_43); - float16x8_t d45 = vaddq_f16(src_data_34, src_data_44); - float16x8_t d46 = vaddq_f16(src_data_35, src_data_45); - float16x8_t d47 = vaddq_f16(src_data_36, 
src_data_46); - float16x8_t d48 = vaddq_f16(src_data_37, src_data_47); - - float16x8_t d51 = vaddq_f16(src_data_50, src_data_60); - float16x8_t d52 = vaddq_f16(src_data_51, src_data_61); - float16x8_t d53 = vaddq_f16(src_data_52, src_data_62); - float16x8_t d54 = vaddq_f16(src_data_53, src_data_63); - float16x8_t d55 = vaddq_f16(src_data_54, src_data_64); - float16x8_t d56 = vaddq_f16(src_data_55, src_data_65); - float16x8_t d57 = vaddq_f16(src_data_56, src_data_66); - float16x8_t d58 = vaddq_f16(src_data_57, src_data_67); - - float16x8_t t00 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_00, src_data_10), src_data_20), src_data_30), src_data_40), - src_data_50), - src_data_60); - float16x8_t t01 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_01, src_data_11), src_data_21), src_data_31), src_data_41), - src_data_51), - src_data_61); - float16x8_t t02 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_02, src_data_12), src_data_22), src_data_32), src_data_42), - src_data_52), - src_data_62); - float16x8_t t03 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_03, src_data_13), src_data_23), src_data_33), src_data_43), - src_data_53), - src_data_63); - float16x8_t t04 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_04, src_data_14), src_data_24), src_data_34), src_data_44), - src_data_54), - src_data_64); - float16x8_t t05 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_05, src_data_15), src_data_25), src_data_35), src_data_45), - src_data_55), - src_data_65); - float16x8_t t06 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_06, src_data_16), src_data_26), src_data_36), src_data_46), - src_data_56), - src_data_66); - float16x8_t t07 = vaddq_f16( - vaddq_f16( - vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(src_data_07, src_data_17), src_data_27), src_data_37), src_data_47), - 
src_data_57), - src_data_67); - - float16x8_t t10 = vaddq_f16(vaddq_f16(vmulq_n_f16(d01, 0.5), d11), vmulq_n_f16(d21, 1.5)); - float16x8_t t11 = vaddq_f16(vaddq_f16(vmulq_n_f16(d02, 0.5), d12), vmulq_n_f16(d22, 1.5)); - float16x8_t t12 = vaddq_f16(vaddq_f16(vmulq_n_f16(d03, 0.5), d13), vmulq_n_f16(d23, 1.5)); - float16x8_t t13 = vaddq_f16(vaddq_f16(vmulq_n_f16(d04, 0.5), d14), vmulq_n_f16(d24, 1.5)); - float16x8_t t14 = vaddq_f16(vaddq_f16(vmulq_n_f16(d05, 0.5), d15), vmulq_n_f16(d25, 1.5)); - float16x8_t t15 = vaddq_f16(vaddq_f16(vmulq_n_f16(d06, 0.5), d16), vmulq_n_f16(d26, 1.5)); - float16x8_t t16 = vaddq_f16(vaddq_f16(vmulq_n_f16(d07, 0.5), d17), vmulq_n_f16(d27, 1.5)); - float16x8_t t17 = vaddq_f16(vaddq_f16(vmulq_n_f16(d08, 0.5), d18), vmulq_n_f16(d28, 1.5)); - - float16x8_t t20 = vaddq_f16(vaddq_f16(vmulq_n_f16(d31, 0.25), d41), vmulq_n_f16(d51, 2.25)); - float16x8_t t21 = vaddq_f16(vaddq_f16(vmulq_n_f16(d32, 0.25), d42), vmulq_n_f16(d52, 2.25)); - float16x8_t t22 = vaddq_f16(vaddq_f16(vmulq_n_f16(d33, 0.25), d43), vmulq_n_f16(d53, 2.25)); - float16x8_t t23 = vaddq_f16(vaddq_f16(vmulq_n_f16(d34, 0.25), d44), vmulq_n_f16(d54, 2.25)); - float16x8_t t24 = vaddq_f16(vaddq_f16(vmulq_n_f16(d35, 0.25), d45), vmulq_n_f16(d55, 2.25)); - float16x8_t t25 = vaddq_f16(vaddq_f16(vmulq_n_f16(d36, 0.25), d46), vmulq_n_f16(d56, 2.25)); - float16x8_t t26 = vaddq_f16(vaddq_f16(vmulq_n_f16(d37, 0.25), d47), vmulq_n_f16(d57, 2.25)); - float16x8_t t27 = vaddq_f16(vaddq_f16(vmulq_n_f16(d38, 0.25), d48), vmulq_n_f16(d58, 2.25)); - - float16x8_t t30 = vaddq_f16(vaddq_f16(vmulq_n_f16(d01, 0.125), d11), vmulq_n_f16(d21, 3.375)); - float16x8_t t31 = vaddq_f16(vaddq_f16(vmulq_n_f16(d02, 0.125), d12), vmulq_n_f16(d22, 3.375)); - float16x8_t t32 = vaddq_f16(vaddq_f16(vmulq_n_f16(d03, 0.125), d13), vmulq_n_f16(d23, 3.375)); - float16x8_t t33 = vaddq_f16(vaddq_f16(vmulq_n_f16(d04, 0.125), d14), vmulq_n_f16(d24, 3.375)); - float16x8_t t34 = vaddq_f16(vaddq_f16(vmulq_n_f16(d05, 0.125), d15), 
vmulq_n_f16(d25, 3.375)); - float16x8_t t35 = vaddq_f16(vaddq_f16(vmulq_n_f16(d06, 0.125), d16), vmulq_n_f16(d26, 3.375)); - float16x8_t t36 = vaddq_f16(vaddq_f16(vmulq_n_f16(d07, 0.125), d17), vmulq_n_f16(d27, 3.375)); - float16x8_t t37 = vaddq_f16(vaddq_f16(vmulq_n_f16(d08, 0.125), d18), vmulq_n_f16(d28, 3.375)); - - float16x8_t t40 = vaddq_f16(vaddq_f16(vmulq_n_f16(d31, 0.0625), d41), vmulq_n_f16(d51, 5.0625)); - float16x8_t t41 = vaddq_f16(vaddq_f16(vmulq_n_f16(d32, 0.0625), d42), vmulq_n_f16(d52, 5.0625)); - float16x8_t t42 = vaddq_f16(vaddq_f16(vmulq_n_f16(d33, 0.0625), d43), vmulq_n_f16(d53, 5.0625)); - float16x8_t t43 = vaddq_f16(vaddq_f16(vmulq_n_f16(d34, 0.0625), d44), vmulq_n_f16(d54, 5.0625)); - float16x8_t t44 = vaddq_f16(vaddq_f16(vmulq_n_f16(d35, 0.0625), d45), vmulq_n_f16(d55, 5.0625)); - float16x8_t t45 = vaddq_f16(vaddq_f16(vmulq_n_f16(d36, 0.0625), d46), vmulq_n_f16(d56, 5.0625)); - float16x8_t t46 = vaddq_f16(vaddq_f16(vmulq_n_f16(d37, 0.0625), d47), vmulq_n_f16(d57, 5.0625)); - float16x8_t t47 = vaddq_f16(vaddq_f16(vmulq_n_f16(d38, 0.0625), d48), vmulq_n_f16(d58, 5.0625)); - - float16x8_t t50 = vaddq_f16(vaddq_f16(vmulq_n_f16(d01, 0.03125), d11), vmulq_n_f16(d21, 7.59375)); - float16x8_t t51 = vaddq_f16(vaddq_f16(vmulq_n_f16(d02, 0.03125), d12), vmulq_n_f16(d22, 7.59375)); - float16x8_t t52 = vaddq_f16(vaddq_f16(vmulq_n_f16(d03, 0.03125), d13), vmulq_n_f16(d23, 7.59375)); - float16x8_t t53 = vaddq_f16(vaddq_f16(vmulq_n_f16(d04, 0.03125), d14), vmulq_n_f16(d24, 7.59375)); - float16x8_t t54 = vaddq_f16(vaddq_f16(vmulq_n_f16(d05, 0.03125), d15), vmulq_n_f16(d25, 7.59375)); - float16x8_t t55 = vaddq_f16(vaddq_f16(vmulq_n_f16(d06, 0.03125), d16), vmulq_n_f16(d26, 7.59375)); - float16x8_t t56 = vaddq_f16(vaddq_f16(vmulq_n_f16(d07, 0.03125), d17), vmulq_n_f16(d27, 7.59375)); - float16x8_t t57 = vaddq_f16(vaddq_f16(vmulq_n_f16(d08, 0.03125), d18), vmulq_n_f16(d28, 7.59375)); - - float16x8_t t60 = - vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d31, 
0.015625), d41), vmulq_n_f16(d51, 11.390625)), src_data_70); - float16x8_t t61 = - vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d32, 0.015625), d42), vmulq_n_f16(d52, 11.390625)), src_data_71); - float16x8_t t62 = - vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d33, 0.015625), d43), vmulq_n_f16(d53, 11.390625)), src_data_72); - float16x8_t t63 = - vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d34, 0.015625), d44), vmulq_n_f16(d54, 11.390625)), src_data_73); - float16x8_t t64 = - vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d35, 0.015625), d45), vmulq_n_f16(d55, 11.390625)), src_data_74); - float16x8_t t65 = - vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d36, 0.015625), d46), vmulq_n_f16(d56, 11.390625)), src_data_75); - float16x8_t t66 = - vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d37, 0.015625), d47), vmulq_n_f16(d57, 11.390625)), src_data_76); - float16x8_t t67 = - vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(d38, 0.015625), d48), vmulq_n_f16(d58, 11.390625)), src_data_77); - - float16x8_t s11 = vsubq_f16(t01, t02); - float16x8_t s12 = vsubq_f16(t11, t12); - float16x8_t s13 = vsubq_f16(t21, t22); - float16x8_t s14 = vsubq_f16(t31, t32); - float16x8_t s15 = vsubq_f16(t41, t42); - float16x8_t s16 = vsubq_f16(t51, t52); - float16x8_t s17 = vsubq_f16(t61, t62); - - float16x8_t s21 = vsubq_f16(t03, t04); - float16x8_t s22 = vsubq_f16(t13, t14); - float16x8_t s23 = vsubq_f16(t23, t24); - float16x8_t s24 = vsubq_f16(t33, t34); - float16x8_t s25 = vsubq_f16(t43, t44); - float16x8_t s26 = vsubq_f16(t53, t54); - float16x8_t s27 = vsubq_f16(t63, t64); - - float16x8_t s31 = vsubq_f16(t05, t06); - float16x8_t s32 = vsubq_f16(t15, t16); - float16x8_t s33 = vsubq_f16(t25, t26); - float16x8_t s34 = vsubq_f16(t35, t36); - float16x8_t s35 = vsubq_f16(t45, t46); - float16x8_t s36 = vsubq_f16(t55, t56); - float16x8_t s37 = vsubq_f16(t65, t66); - - float16x8_t s41 = vaddq_f16(t01, t02); - float16x8_t s42 = vaddq_f16(t11, t12); - float16x8_t s43 = vaddq_f16(t21, t22); - float16x8_t s44 = vaddq_f16(t31, 
t32); - float16x8_t s45 = vaddq_f16(t41, t42); - float16x8_t s46 = vaddq_f16(t51, t52); - float16x8_t s47 = vaddq_f16(t61, t62); - - float16x8_t s51 = vaddq_f16(t03, t04); - float16x8_t s52 = vaddq_f16(t13, t14); - float16x8_t s53 = vaddq_f16(t23, t24); - float16x8_t s54 = vaddq_f16(t33, t34); - float16x8_t s55 = vaddq_f16(t43, t44); - float16x8_t s56 = vaddq_f16(t53, t54); - float16x8_t s57 = vaddq_f16(t63, t64); - - float16x8_t s61 = vaddq_f16(t05, t06); - float16x8_t s62 = vaddq_f16(t15, t16); - float16x8_t s63 = vaddq_f16(t25, t26); - float16x8_t s64 = vaddq_f16(t35, t36); - float16x8_t s65 = vaddq_f16(t45, t46); - float16x8_t s66 = vaddq_f16(t55, t56); - float16x8_t s67 = vaddq_f16(t65, t66); - - float16x8_t m00 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t00, t01), t02), t03), t04), t05), t06); - float16x8_t m01 = vaddq_f16(vaddq_f16(vmulq_n_f16(s11, 0.5), s21), vmulq_n_f16(s31, 1.5)); - float16x8_t m02 = vaddq_f16(vaddq_f16(vmulq_n_f16(s41, 0.25), s51), vmulq_n_f16(s61, 2.25)); - float16x8_t m03 = vaddq_f16(vaddq_f16(vmulq_n_f16(s11, 0.125), s21), vmulq_n_f16(s31, 3.375)); - float16x8_t m04 = vaddq_f16(vaddq_f16(vmulq_n_f16(s41, 0.0625), s51), vmulq_n_f16(s61, 5.0625)); - float16x8_t m05 = vaddq_f16(vaddq_f16(vmulq_n_f16(s11, 0.03125), s21), vmulq_n_f16(s31, 7.59375)); - float16x8_t m06 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s41, 0.015625), s51), vmulq_n_f16(s61, 11.390625)), t07); - - float16x8_t m10 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t10, t11), t12), t13), t14), t15), t16); - float16x8_t m11 = vaddq_f16(vaddq_f16(vmulq_n_f16(s12, 0.5), s22), vmulq_n_f16(s32, 1.5)); - float16x8_t m12 = vaddq_f16(vaddq_f16(vmulq_n_f16(s42, 0.25), s52), vmulq_n_f16(s62, 2.25)); - float16x8_t m13 = vaddq_f16(vaddq_f16(vmulq_n_f16(s12, 0.125), s22), vmulq_n_f16(s32, 3.375)); - float16x8_t m14 = vaddq_f16(vaddq_f16(vmulq_n_f16(s42, 0.0625), s52), vmulq_n_f16(s62, 5.0625)); - float16x8_t m15 = vaddq_f16(vaddq_f16(vmulq_n_f16(s12, 
0.03125), s22), vmulq_n_f16(s32, 7.59375)); - float16x8_t m16 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s42, 0.015625), s52), vmulq_n_f16(s62, 11.390625)), t17); - - float16x8_t m20 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t20, t21), t22), t23), t24), t25), t26); - float16x8_t m21 = vaddq_f16(vaddq_f16(vmulq_n_f16(s13, 0.5), s23), vmulq_n_f16(s33, 1.5)); - float16x8_t m22 = vaddq_f16(vaddq_f16(vmulq_n_f16(s43, 0.25), s53), vmulq_n_f16(s63, 2.25)); - float16x8_t m23 = vaddq_f16(vaddq_f16(vmulq_n_f16(s13, 0.125), s23), vmulq_n_f16(s33, 3.375)); - float16x8_t m24 = vaddq_f16(vaddq_f16(vmulq_n_f16(s43, 0.0625), s53), vmulq_n_f16(s63, 5.0625)); - float16x8_t m25 = vaddq_f16(vaddq_f16(vmulq_n_f16(s13, 0.03125), s23), vmulq_n_f16(s33, 7.59375)); - float16x8_t m26 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s43, 0.015625), s53), vmulq_n_f16(s63, 11.390625)), t27); - - float16x8_t m30 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t30, t31), t32), t33), t34), t35), t36); - float16x8_t m31 = vaddq_f16(vaddq_f16(vmulq_n_f16(s14, 0.5), s24), vmulq_n_f16(s34, 1.5)); - float16x8_t m32 = vaddq_f16(vaddq_f16(vmulq_n_f16(s44, 0.25), s54), vmulq_n_f16(s64, 2.25)); - float16x8_t m33 = vaddq_f16(vaddq_f16(vmulq_n_f16(s14, 0.125), s24), vmulq_n_f16(s34, 3.375)); - float16x8_t m34 = vaddq_f16(vaddq_f16(vmulq_n_f16(s44, 0.0625), s54), vmulq_n_f16(s64, 5.0625)); - float16x8_t m35 = vaddq_f16(vaddq_f16(vmulq_n_f16(s14, 0.03125), s24), vmulq_n_f16(s34, 7.59375)); - float16x8_t m36 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s44, 0.015625), s54), vmulq_n_f16(s64, 11.390625)), t37); - - float16x8_t m40 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t40, t41), t42), t43), t44), t45), t46); - float16x8_t m41 = vaddq_f16(vaddq_f16(vmulq_n_f16(s15, 0.5), s25), vmulq_n_f16(s35, 1.5)); - float16x8_t m42 = vaddq_f16(vaddq_f16(vmulq_n_f16(s45, 0.25), s55), vmulq_n_f16(s65, 2.25)); - float16x8_t m43 = vaddq_f16(vaddq_f16(vmulq_n_f16(s15, 0.125), 
s25), vmulq_n_f16(s35, 3.375)); - float16x8_t m44 = vaddq_f16(vaddq_f16(vmulq_n_f16(s45, 0.0625), s55), vmulq_n_f16(s65, 5.0625)); - float16x8_t m45 = vaddq_f16(vaddq_f16(vmulq_n_f16(s15, 0.03125), s25), vmulq_n_f16(s35, 7.59375)); - float16x8_t m46 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s45, 0.015625), s55), vmulq_n_f16(s65, 11.390625)), t47); - - float16x8_t m50 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t50, t51), t52), t53), t54), t55), t56); - float16x8_t m51 = vaddq_f16(vaddq_f16(vmulq_n_f16(s16, 0.5), s26), vmulq_n_f16(s36, 1.5)); - float16x8_t m52 = vaddq_f16(vaddq_f16(vmulq_n_f16(s46, 0.25), s56), vmulq_n_f16(s66, 2.25)); - float16x8_t m53 = vaddq_f16(vaddq_f16(vmulq_n_f16(s16, 0.125), s26), vmulq_n_f16(s36, 3.375)); - float16x8_t m54 = vaddq_f16(vaddq_f16(vmulq_n_f16(s46, 0.0625), s56), vmulq_n_f16(s66, 5.0625)); - float16x8_t m55 = vaddq_f16(vaddq_f16(vmulq_n_f16(s16, 0.03125), s26), vmulq_n_f16(s36, 7.59375)); - float16x8_t m56 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s46, 0.015625), s56), vmulq_n_f16(s66, 11.390625)), t57); - - float16x8_t m60 = vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(vaddq_f16(t60, t61), t62), t63), t64), t65), t66); - float16x8_t m61 = vaddq_f16(vaddq_f16(vmulq_n_f16(s17, 0.5), s27), vmulq_n_f16(s37, 1.5)); - float16x8_t m62 = vaddq_f16(vaddq_f16(vmulq_n_f16(s47, 0.25), s57), vmulq_n_f16(s67, 2.25)); - float16x8_t m63 = vaddq_f16(vaddq_f16(vmulq_n_f16(s17, 0.125), s27), vmulq_n_f16(s37, 3.375)); - float16x8_t m64 = vaddq_f16(vaddq_f16(vmulq_n_f16(s47, 0.0625), s57), vmulq_n_f16(s67, 5.0625)); - float16x8_t m65 = vaddq_f16(vaddq_f16(vmulq_n_f16(s17, 0.03125), s27), vmulq_n_f16(s37, 7.59375)); - float16x8_t m66 = vaddq_f16(vaddq_f16(vaddq_f16(vmulq_n_f16(s47, 0.015625), s57), vmulq_n_f16(s67, 11.390625)), t67); - - float16x8_t bias_ptr = vld1q_f16(bias_data); - vst1q_f16(dst_data, vaddq_f16(m00, bias_ptr)); - vst1q_f16(dst_data + C8NUM, vaddq_f16(m01, bias_ptr)); - vst1q_f16(dst_data + 2 * C8NUM, 
vaddq_f16(m02, bias_ptr)); - vst1q_f16(dst_data + 3 * C8NUM, vaddq_f16(m03, bias_ptr)); - vst1q_f16(dst_data + 4 * C8NUM, vaddq_f16(m04, bias_ptr)); - vst1q_f16(dst_data + 5 * C8NUM, vaddq_f16(m05, bias_ptr)); - vst1q_f16(dst_data + 6 * C8NUM, vaddq_f16(m06, bias_ptr)); - vst1q_f16(dst_data + dst_step * C8NUM, vaddq_f16(m10, bias_ptr)); - vst1q_f16(dst_data + dst_step * C8NUM + C8NUM, vaddq_f16(m11, bias_ptr)); - vst1q_f16(dst_data + dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m12, bias_ptr)); - vst1q_f16(dst_data + dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m13, bias_ptr)); - vst1q_f16(dst_data + dst_step * C8NUM + 4 * C8NUM, vaddq_f16(m14, bias_ptr)); - vst1q_f16(dst_data + dst_step * C8NUM + 5 * C8NUM, vaddq_f16(m15, bias_ptr)); - vst1q_f16(dst_data + dst_step * C8NUM + 6 * C8NUM, vaddq_f16(m16, bias_ptr)); - vst1q_f16(dst_data + 2 * dst_step * C8NUM, vaddq_f16(m20, bias_ptr)); - vst1q_f16(dst_data + 2 * dst_step * C8NUM + C8NUM, vaddq_f16(m21, bias_ptr)); - vst1q_f16(dst_data + 2 * dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m22, bias_ptr)); - vst1q_f16(dst_data + 2 * dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m23, bias_ptr)); - vst1q_f16(dst_data + 2 * dst_step * C8NUM + 4 * C8NUM, vaddq_f16(m24, bias_ptr)); - vst1q_f16(dst_data + 2 * dst_step * C8NUM + 5 * C8NUM, vaddq_f16(m25, bias_ptr)); - vst1q_f16(dst_data + 2 * dst_step * C8NUM + 6 * C8NUM, vaddq_f16(m26, bias_ptr)); - vst1q_f16(dst_data + 3 * dst_step * C8NUM, vaddq_f16(m30, bias_ptr)); - vst1q_f16(dst_data + 3 * dst_step * C8NUM + C8NUM, vaddq_f16(m31, bias_ptr)); - vst1q_f16(dst_data + 3 * dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m32, bias_ptr)); - vst1q_f16(dst_data + 3 * dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m33, bias_ptr)); - vst1q_f16(dst_data + 3 * dst_step * C8NUM + 4 * C8NUM, vaddq_f16(m34, bias_ptr)); - vst1q_f16(dst_data + 3 * dst_step * C8NUM + 5 * C8NUM, vaddq_f16(m35, bias_ptr)); - vst1q_f16(dst_data + 3 * dst_step * C8NUM + 6 * C8NUM, vaddq_f16(m36, bias_ptr)); - vst1q_f16(dst_data + 4 * dst_step * 
C8NUM, vaddq_f16(m40, bias_ptr)); - vst1q_f16(dst_data + 4 * dst_step * C8NUM + C8NUM, vaddq_f16(m41, bias_ptr)); - vst1q_f16(dst_data + 4 * dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m42, bias_ptr)); - vst1q_f16(dst_data + 4 * dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m43, bias_ptr)); - vst1q_f16(dst_data + 4 * dst_step * C8NUM + 4 * C8NUM, vaddq_f16(m44, bias_ptr)); - vst1q_f16(dst_data + 4 * dst_step * C8NUM + 5 * C8NUM, vaddq_f16(m45, bias_ptr)); - vst1q_f16(dst_data + 4 * dst_step * C8NUM + 6 * C8NUM, vaddq_f16(m46, bias_ptr)); - vst1q_f16(dst_data + 5 * dst_step * C8NUM, vaddq_f16(m50, bias_ptr)); - vst1q_f16(dst_data + 5 * dst_step * C8NUM + C8NUM, vaddq_f16(m51, bias_ptr)); - vst1q_f16(dst_data + 5 * dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m52, bias_ptr)); - vst1q_f16(dst_data + 5 * dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m53, bias_ptr)); - vst1q_f16(dst_data + 5 * dst_step * C8NUM + 4 * C8NUM, vaddq_f16(m54, bias_ptr)); - vst1q_f16(dst_data + 5 * dst_step * C8NUM + 5 * C8NUM, vaddq_f16(m55, bias_ptr)); - vst1q_f16(dst_data + 5 * dst_step * C8NUM + 6 * C8NUM, vaddq_f16(m56, bias_ptr)); - vst1q_f16(dst_data + 6 * dst_step * C8NUM, vaddq_f16(m60, bias_ptr)); - vst1q_f16(dst_data + 6 * dst_step * C8NUM + C8NUM, vaddq_f16(m61, bias_ptr)); - vst1q_f16(dst_data + 6 * dst_step * C8NUM + 2 * C8NUM, vaddq_f16(m62, bias_ptr)); - vst1q_f16(dst_data + 6 * dst_step * C8NUM + 3 * C8NUM, vaddq_f16(m63, bias_ptr)); - vst1q_f16(dst_data + 6 * dst_step * C8NUM + 4 * C8NUM, vaddq_f16(m64, bias_ptr)); - vst1q_f16(dst_data + 6 * dst_step * C8NUM + 5 * C8NUM, vaddq_f16(m65, bias_ptr)); - vst1q_f16(dst_data + 6 * dst_step * C8NUM + 6 * C8NUM, vaddq_f16(m66, bias_ptr)); -#else - for (int i = 0; i < C8NUM; i++) { - float16_t src_data_00 = src_data[i]; - float16_t src_data_01 = src_data[i + src_step]; - float16_t src_data_02 = src_data[i + 2 * src_step]; - float16_t src_data_03 = src_data[i + 3 * src_step]; - float16_t src_data_04 = src_data[i + 4 * src_step]; - float16_t src_data_05 = 
src_data[i + 5 * src_step]; - float16_t src_data_06 = src_data[i + 6 * src_step]; - float16_t src_data_07 = src_data[i + 7 * src_step]; - float16_t src_data_10 = src_data[i + 8 * src_step]; - float16_t src_data_11 = src_data[i + 9 * src_step]; - float16_t src_data_12 = src_data[i + 10 * src_step]; - float16_t src_data_13 = src_data[i + 11 * src_step]; - float16_t src_data_14 = src_data[i + 12 * src_step]; - float16_t src_data_15 = src_data[i + 13 * src_step]; - float16_t src_data_16 = src_data[i + 14 * src_step]; - float16_t src_data_17 = src_data[i + 15 * src_step]; - float16_t src_data_20 = src_data[i + 16 * src_step]; - float16_t src_data_21 = src_data[i + 17 * src_step]; - float16_t src_data_22 = src_data[i + 18 * src_step]; - float16_t src_data_23 = src_data[i + 19 * src_step]; - float16_t src_data_24 = src_data[i + 20 * src_step]; - float16_t src_data_25 = src_data[i + 21 * src_step]; - float16_t src_data_26 = src_data[i + 22 * src_step]; - float16_t src_data_27 = src_data[i + 23 * src_step]; - float16_t src_data_30 = src_data[i + 24 * src_step]; - float16_t src_data_31 = src_data[i + 25 * src_step]; - float16_t src_data_32 = src_data[i + 26 * src_step]; - float16_t src_data_33 = src_data[i + 27 * src_step]; - float16_t src_data_34 = src_data[i + 28 * src_step]; - float16_t src_data_35 = src_data[i + 29 * src_step]; - float16_t src_data_36 = src_data[i + 30 * src_step]; - float16_t src_data_37 = src_data[i + 31 * src_step]; - float16_t src_data_40 = src_data[i + 32 * src_step]; - float16_t src_data_41 = src_data[i + 33 * src_step]; - float16_t src_data_42 = src_data[i + 34 * src_step]; - float16_t src_data_43 = src_data[i + 35 * src_step]; - float16_t src_data_44 = src_data[i + 36 * src_step]; - float16_t src_data_45 = src_data[i + 37 * src_step]; - float16_t src_data_46 = src_data[i + 38 * src_step]; - float16_t src_data_47 = src_data[i + 39 * src_step]; - float16_t src_data_50 = src_data[i + 40 * src_step]; - float16_t src_data_51 = src_data[i + 41 * 
src_step]; - float16_t src_data_52 = src_data[i + 42 * src_step]; - float16_t src_data_53 = src_data[i + 43 * src_step]; - float16_t src_data_54 = src_data[i + 44 * src_step]; - float16_t src_data_55 = src_data[i + 45 * src_step]; - float16_t src_data_56 = src_data[i + 46 * src_step]; - float16_t src_data_57 = src_data[i + 47 * src_step]; - float16_t src_data_60 = src_data[i + 48 * src_step]; - float16_t src_data_61 = src_data[i + 49 * src_step]; - float16_t src_data_62 = src_data[i + 50 * src_step]; - float16_t src_data_63 = src_data[i + 51 * src_step]; - float16_t src_data_64 = src_data[i + 52 * src_step]; - float16_t src_data_65 = src_data[i + 53 * src_step]; - float16_t src_data_66 = src_data[i + 54 * src_step]; - float16_t src_data_67 = src_data[i + 55 * src_step]; - float16_t src_data_70 = src_data[i + 56 * src_step]; - float16_t src_data_71 = src_data[i + 57 * src_step]; - float16_t src_data_72 = src_data[i + 58 * src_step]; - float16_t src_data_73 = src_data[i + 59 * src_step]; - float16_t src_data_74 = src_data[i + 60 * src_step]; - float16_t src_data_75 = src_data[i + 61 * src_step]; - float16_t src_data_76 = src_data[i + 62 * src_step]; - float16_t src_data_77 = src_data[i + 63 * src_step]; - - float16_t d01 = src_data_10 - src_data_20; - float16_t d02 = src_data_11 - src_data_21; - float16_t d03 = src_data_12 - src_data_22; - float16_t d04 = src_data_13 - src_data_23; - float16_t d05 = src_data_14 - src_data_24; - float16_t d06 = src_data_15 - src_data_25; - float16_t d07 = src_data_16 - src_data_26; - float16_t d08 = src_data_17 - src_data_27; - - float16_t d11 = src_data_30 - src_data_40; - float16_t d12 = src_data_31 - src_data_41; - float16_t d13 = src_data_32 - src_data_42; - float16_t d14 = src_data_33 - src_data_43; - float16_t d15 = src_data_34 - src_data_44; - float16_t d16 = src_data_35 - src_data_45; - float16_t d17 = src_data_36 - src_data_46; - float16_t d18 = src_data_37 - src_data_47; - - float16_t d21 = src_data_50 - src_data_60; - 
float16_t d22 = src_data_51 - src_data_61; - float16_t d23 = src_data_52 - src_data_62; - float16_t d24 = src_data_53 - src_data_63; - float16_t d25 = src_data_54 - src_data_64; - float16_t d26 = src_data_55 - src_data_65; - float16_t d27 = src_data_56 - src_data_66; - float16_t d28 = src_data_57 - src_data_67; - - float16_t d31 = src_data_10 + src_data_20; - float16_t d32 = src_data_11 + src_data_21; - float16_t d33 = src_data_12 + src_data_22; - float16_t d34 = src_data_13 + src_data_23; - float16_t d35 = src_data_14 + src_data_24; - float16_t d36 = src_data_15 + src_data_25; - float16_t d37 = src_data_16 + src_data_26; - float16_t d38 = src_data_17 + src_data_27; - - float16_t d41 = src_data_30 + src_data_40; - float16_t d42 = src_data_31 + src_data_41; - float16_t d43 = src_data_32 + src_data_42; - float16_t d44 = src_data_33 + src_data_43; - float16_t d45 = src_data_34 + src_data_44; - float16_t d46 = src_data_35 + src_data_45; - float16_t d47 = src_data_36 + src_data_46; - float16_t d48 = src_data_37 + src_data_47; - - float16_t d51 = src_data_50 + src_data_60; - float16_t d52 = src_data_51 + src_data_61; - float16_t d53 = src_data_52 + src_data_62; - float16_t d54 = src_data_53 + src_data_63; - float16_t d55 = src_data_54 + src_data_64; - float16_t d56 = src_data_55 + src_data_65; - float16_t d57 = src_data_56 + src_data_66; - float16_t d58 = src_data_57 + src_data_67; - - float16_t t00 = src_data_00 + src_data_10 + src_data_20 + src_data_30 + src_data_40 + src_data_50 + src_data_60; - float16_t t01 = src_data_01 + src_data_11 + src_data_21 + src_data_31 + src_data_41 + src_data_51 + src_data_61; - float16_t t02 = src_data_02 + src_data_12 + src_data_22 + src_data_32 + src_data_42 + src_data_52 + src_data_62; - float16_t t03 = src_data_03 + src_data_13 + src_data_23 + src_data_33 + src_data_43 + src_data_53 + src_data_63; - float16_t t04 = src_data_04 + src_data_14 + src_data_24 + src_data_34 + src_data_44 + src_data_54 + src_data_64; - float16_t t05 = 
src_data_05 + src_data_15 + src_data_25 + src_data_35 + src_data_45 + src_data_55 + src_data_65; - float16_t t06 = src_data_06 + src_data_16 + src_data_26 + src_data_36 + src_data_46 + src_data_56 + src_data_66; - float16_t t07 = src_data_07 + src_data_17 + src_data_27 + src_data_37 + src_data_47 + src_data_57 + src_data_67; - - const float16_t t10 = 0.5f * d01 + d11 + 1.5f * d21; - const float16_t t11 = 0.5f * d02 + d12 + 1.5f * d22; - const float16_t t12 = 0.5f * d03 + d13 + 1.5f * d23; - const float16_t t13 = 0.5f * d04 + d14 + 1.5f * d24; - const float16_t t14 = 0.5f * d05 + d15 + 1.5f * d25; - const float16_t t15 = 0.5f * d06 + d16 + 1.5f * d26; - const float16_t t16 = 0.5f * d07 + d17 + 1.5f * d27; - const float16_t t17 = 0.5f * d08 + d18 + 1.5f * d28; - - const float16_t t20 = 0.25f * d31 + d41 + 2.25f * d51; - const float16_t t21 = 0.25f * d32 + d42 + 2.25f * d52; - const float16_t t22 = 0.25f * d33 + d43 + 2.25f * d53; - const float16_t t23 = 0.25f * d34 + d44 + 2.25f * d54; - const float16_t t24 = 0.25f * d35 + d45 + 2.25f * d55; - const float16_t t25 = 0.25f * d36 + d46 + 2.25f * d56; - const float16_t t26 = 0.25f * d37 + d47 + 2.25f * d57; - const float16_t t27 = 0.25f * d38 + d48 + 2.25f * d58; - - const float16_t t30 = 0.125f * d01 + d11 + 3.375f * d21; - const float16_t t31 = 0.125f * d02 + d12 + 3.375f * d22; - const float16_t t32 = 0.125f * d03 + d13 + 3.375f * d23; - const float16_t t33 = 0.125f * d04 + d14 + 3.375f * d24; - const float16_t t34 = 0.125f * d05 + d15 + 3.375f * d25; - const float16_t t35 = 0.125f * d06 + d16 + 3.375f * d26; - const float16_t t36 = 0.125f * d07 + d17 + 3.375f * d27; - const float16_t t37 = 0.125f * d08 + d18 + 3.375f * d28; - - const float16_t t40 = 0.0625f * d31 + d41 + 5.0625f * d51; - const float16_t t41 = 0.0625f * d32 + d42 + 5.0625f * d52; - const float16_t t42 = 0.0625f * d33 + d43 + 5.0625f * d53; - const float16_t t43 = 0.0625f * d34 + d44 + 5.0625f * d54; - const float16_t t44 = 0.0625f * d35 + d45 + 
5.0625f * d55; - const float16_t t45 = 0.0625f * d36 + d46 + 5.0625f * d56; - const float16_t t46 = 0.0625f * d37 + d47 + 5.0625f * d57; - const float16_t t47 = 0.0625f * d38 + d48 + 5.0625f * d58; - - const float16_t t50 = 0.03125f * d01 + d11 + 7.59375f * d21; - const float16_t t51 = 0.03125f * d02 + d12 + 7.59375f * d22; - const float16_t t52 = 0.03125f * d03 + d13 + 7.59375f * d23; - const float16_t t53 = 0.03125f * d04 + d14 + 7.59375f * d24; - const float16_t t54 = 0.03125f * d05 + d15 + 7.59375f * d25; - const float16_t t55 = 0.03125f * d06 + d16 + 7.59375f * d26; - const float16_t t56 = 0.03125f * d07 + d17 + 7.59375f * d27; - const float16_t t57 = 0.03125f * d08 + d18 + 7.59375f * d28; - - const float16_t t60 = 0.015625f * d31 + d41 + 11.390625f * d51 + src_data_70; - const float16_t t61 = 0.015625f * d32 + d42 + 11.390625f * d52 + src_data_71; - const float16_t t62 = 0.015625f * d33 + d43 + 11.390625f * d53 + src_data_72; - const float16_t t63 = 0.015625f * d34 + d44 + 11.390625f * d54 + src_data_73; - const float16_t t64 = 0.015625f * d35 + d45 + 11.390625f * d55 + src_data_74; - const float16_t t65 = 0.015625f * d36 + d46 + 11.390625f * d56 + src_data_75; - const float16_t t66 = 0.015625f * d37 + d47 + 11.390625f * d57 + src_data_76; - const float16_t t67 = 0.015625f * d38 + d48 + 11.390625f * d58 + src_data_77; - - float16_t s11 = t01 - t02; - float16_t s12 = t11 - t12; - float16_t s13 = t21 - t22; - float16_t s14 = t31 - t32; - float16_t s15 = t41 - t42; - float16_t s16 = t51 - t52; - float16_t s17 = t61 - t62; - - float16_t s21 = t03 - t04; - float16_t s22 = t13 - t14; - float16_t s23 = t23 - t24; - float16_t s24 = t33 - t34; - float16_t s25 = t43 - t44; - float16_t s26 = t53 - t54; - float16_t s27 = t63 - t64; - - float16_t s31 = t05 - t06; - float16_t s32 = t15 - t16; - float16_t s33 = t25 - t26; - float16_t s34 = t35 - t36; - float16_t s35 = t45 - t46; - float16_t s36 = t55 - t56; - float16_t s37 = t56 - t66; - - float16_t s41 = t01 + t02; - 
float16_t s42 = t11 + t12; - float16_t s43 = t21 + t22; - float16_t s44 = t31 + t32; - float16_t s45 = t41 + t42; - float16_t s46 = t51 + t52; - float16_t s47 = t61 + t62; - - float16_t s51 = t03 + t04; - float16_t s52 = t13 + t14; - float16_t s53 = t23 + t24; - float16_t s54 = t33 + t34; - float16_t s55 = t43 + t44; - float16_t s56 = t53 + t54; - float16_t s57 = t63 + t64; - - float16_t s61 = t05 + t06; - float16_t s62 = t15 + t16; - float16_t s63 = t25 + t26; - float16_t s64 = t35 + t36; - float16_t s65 = t45 + t46; - float16_t s66 = t55 + t56; - float16_t s67 = t65 + t66; - - float16_t m00 = t00 + t01 + t02 + t03 + t04 + t05 + t06; - const float16_t m01 = 0.5f * s11 + s21 + 1.5f * s31; - const float16_t m02 = 0.25f * s41 + s51 + 2.25f * s61; - const float16_t m03 = 0.125f * s11 + s21 + 3.375f * s31; - const float16_t m04 = 0.0625f * s41 + s51 + 5.0625f * s61; - const float16_t m05 = 0.03125f * s11 + s21 + 7.59375f * s31; - const float16_t m06 = 0.015625f * s41 + s51 + 11.390625f * s61 + t07; - - float16_t m10 = t10 + t11 + t12 + t13 + t14 + t15 + t16; - const float16_t m11 = 0.5f * s12 + s22 + 1.5f * s32; - const float16_t m12 = 0.25f * s42 + s52 + 2.25f * s62; - const float16_t m13 = 0.125f * s12 + s22 + 3.375f * s32; - const float16_t m14 = 0.0625f * s42 + s52 + 5.0625f * s62; - const float16_t m15 = 0.03125f * s12 + s22 + 7.59375f * s32; - const float16_t m16 = 0.015625f * s42 + s52 + 11.390625f * s62 + t17; - - float16_t m20 = t20 + t21 + t22 + t23 + t24 + t25 + t26; - const float16_t m21 = 0.5f * s13 + s23 + 1.5f * s33; - const float16_t m22 = 0.25f * s43 + s53 + 2.25f * s63; - const float16_t m23 = 0.125f * s13 + s23 + 3.375f * s33; - const float16_t m24 = 0.0625f * s43 + s53 + 5.0625f * s63; - const float16_t m25 = 0.03125f * s13 + s23 + 7.59375f * s33; - const float16_t m26 = 0.015625f * s43 + s53 + 11.390625f * s63 + t27; - - float16_t m30 = t30 + t31 + t32 + t33 + t34 + t35 + t36; - const float16_t m31 = 0.5f * s14 + s24 + 1.5f * s34; - const float16_t 
m32 = 0.25f * s44 + s54 + 2.25f * s64; - const float16_t m33 = 0.125f * s14 + s24 + 3.375f * s34; - const float16_t m34 = 0.0625f * s44 + s54 + 5.0625f * s64; - const float16_t m35 = 0.03125f * s14 + s24 + 7.59375f * s34; - const float16_t m36 = 0.015625f * s44 + s54 + 11.390625f * s64 + t37; - - float16_t m40 = t40 + t41 + t42 + t43 + t44 + t45 + t46; - const float16_t m41 = 0.5f * s15 + s25 + 1.5f * s35; - const float16_t m42 = 0.25f * s45 + s55 + 2.25f * s65; - const float16_t m43 = 0.125f * s15 + s25 + 3.375f * s35; - const float16_t m44 = 0.0625f * s45 + s55 + 5.0625f * s65; - const float16_t m45 = 0.03125f * s15 + s25 + 7.59375f * s35; - const float16_t m46 = 0.015625f * s45 + s55 + 11.390625f * s65 + t47; - - float16_t m50 = t50 + t51 + t52 + t53 + t54 + t55 + t56; - const float16_t m51 = 0.5f * s16 + s26 + 1.5f * s36; - const float16_t m52 = 0.25f * s46 + s56 + 2.25f * s66; - const float16_t m53 = 0.125f * s16 + s26 + 3.375f * s36; - const float16_t m54 = 0.0625f * s46 + s56 + 5.0625f * s66; - const float16_t m55 = 0.03125f * s16 + s26 + 7.59375f * s36; - const float16_t m56 = 0.015625f * s46 + s56 + 11.390625f * s66 + t57; - - float16_t m60 = t60 + t61 + t62 + t63 + t64 + t65 + t66; - const float16_t m61 = 0.5f * s17 + s27 + 1.5f * s37; - const float16_t m62 = 0.25f * s47 + s57 + 2.25f * s67; - const float16_t m63 = 0.125f * s17 + s27 + 3.375f * s37; - const float16_t m64 = 0.0625f * s47 + s57 + 5.0625f * s67; - const float16_t m65 = 0.03125f * s17 + s27 + 7.59375f * s37; - const float16_t m66 = 0.015625f * s47 + s57 + 11.390625f * s67 + t67; - - (dst_data + i)[0] = m00 + bias_data[i]; - (dst_data + i + C8NUM)[0] = m01 + bias_data[i]; - (dst_data + i + 2 * C8NUM)[0] = m02 + bias_data[i]; - (dst_data + i + 3 * C8NUM)[0] = m03 + bias_data[i]; - (dst_data + i + 4 * C8NUM)[0] = m04 + bias_data[i]; - (dst_data + i + 5 * C8NUM)[0] = m05 + bias_data[i]; - (dst_data + i + 6 * C8NUM)[0] = m06 + bias_data[i]; - - (dst_data + i + dst_step * C8NUM)[0] = m10 + 
bias_data[i]; - (dst_data + i + dst_step * C8NUM + C8NUM)[0] = m11 + bias_data[i]; - (dst_data + i + dst_step * C8NUM + 2 * C8NUM)[0] = m12 + bias_data[i]; - (dst_data + i + dst_step * C8NUM + 3 * C8NUM)[0] = m13 + bias_data[i]; - (dst_data + i + dst_step * C8NUM + 4 * C8NUM)[0] = m14 + bias_data[i]; - (dst_data + i + dst_step * C8NUM + 5 * C8NUM)[0] = m15 + bias_data[i]; - (dst_data + i + dst_step * C8NUM + 6 * C8NUM)[0] = m16 + bias_data[i]; - - (dst_data + i + 2 * dst_step * C8NUM)[0] = m20 + bias_data[i]; - (dst_data + i + 2 * dst_step * C8NUM + C8NUM)[0] = m21 + bias_data[i]; - (dst_data + i + 2 * dst_step * C8NUM + 2 * C8NUM)[0] = m22 + bias_data[i]; - (dst_data + i + 2 * dst_step * C8NUM + 3 * C8NUM)[0] = m23 + bias_data[i]; - (dst_data + i + 2 * dst_step * C8NUM + 4 * C8NUM)[0] = m24 + bias_data[i]; - (dst_data + i + 2 * dst_step * C8NUM + 5 * C8NUM)[0] = m25 + bias_data[i]; - (dst_data + i + 2 * dst_step * C8NUM + 6 * C8NUM)[0] = m26 + bias_data[i]; - - (dst_data + i + 3 * dst_step * C8NUM)[0] = m30 + bias_data[i]; - (dst_data + i + 3 * dst_step * C8NUM + C8NUM)[0] = m31 + bias_data[i]; - (dst_data + i + 3 * dst_step * C8NUM + 2 * C8NUM)[0] = m32 + bias_data[i]; - (dst_data + i + 3 * dst_step * C8NUM + 3 * C8NUM)[0] = m33 + bias_data[i]; - (dst_data + i + 3 * dst_step * C8NUM + 4 * C8NUM)[0] = m34 + bias_data[i]; - (dst_data + i + 3 * dst_step * C8NUM + 5 * C8NUM)[0] = m35 + bias_data[i]; - (dst_data + i + 3 * dst_step * C8NUM + 6 * C8NUM)[0] = m36 + bias_data[i]; - - (dst_data + i + 4 * dst_step * C8NUM)[0] = m40 + bias_data[i]; - (dst_data + i + 4 * dst_step * C8NUM + C8NUM)[0] = m41 + bias_data[i]; - (dst_data + i + 4 * dst_step * C8NUM + 2 * C8NUM)[0] = m42 + bias_data[i]; - (dst_data + i + 4 * dst_step * C8NUM + 3 * C8NUM)[0] = m43 + bias_data[i]; - (dst_data + i + 4 * dst_step * C8NUM + 4 * C8NUM)[0] = m44 + bias_data[i]; - (dst_data + i + 4 * dst_step * C8NUM + 5 * C8NUM)[0] = m45 + bias_data[i]; - (dst_data + i + 4 * dst_step * C8NUM + 6 * 
C8NUM)[0] = m46 + bias_data[i]; - - (dst_data + i + 5 * dst_step * C8NUM)[0] = m50 + bias_data[i]; - (dst_data + i + 5 * dst_step * C8NUM + C8NUM)[0] = m51 + bias_data[i]; - (dst_data + i + 5 * dst_step * C8NUM + 2 * C8NUM)[0] = m52 + bias_data[i]; - (dst_data + i + 5 * dst_step * C8NUM + 3 * C8NUM)[0] = m53 + bias_data[i]; - (dst_data + i + 5 * dst_step * C8NUM + 4 * C8NUM)[0] = m54 + bias_data[i]; - (dst_data + i + 5 * dst_step * C8NUM + 5 * C8NUM)[0] = m55 + bias_data[i]; - (dst_data + i + 5 * dst_step * C8NUM + 6 * C8NUM)[0] = m56 + bias_data[i]; - - (dst_data + i + 6 * dst_step * C8NUM)[0] = m60 + bias_data[i]; - (dst_data + i + 6 * dst_step * C8NUM + C8NUM)[0] = m61 + bias_data[i]; - (dst_data + i + 6 * dst_step * C8NUM + 2 * C8NUM)[0] = m62 + bias_data[i]; - (dst_data + i + 6 * dst_step * C8NUM + 3 * C8NUM)[0] = m63 + bias_data[i]; - (dst_data + i + 6 * dst_step * C8NUM + 4 * C8NUM)[0] = m64 + bias_data[i]; - (dst_data + i + 6 * dst_step * C8NUM + 5 * C8NUM)[0] = m65 + bias_data[i]; - (dst_data + i + 6 * dst_step * C8NUM + 6 * C8NUM)[0] = m66 + bias_data[i]; - } -#endif -} - -InputTransformUnitFp16Func GetInputTransFuncFp16(int input_unit) { - if (input_unit == 4) { - return InputTransform4x4UnitFp16; - } else if (input_unit == 8) { - return InputTransform8x8UnitFp16; - } else { - printf("Only support 4 or 8 for input unit."); - return NULL; + MatrixMultiplyVecFp16(vec_bt, src, t, NULL, in_unit, in_unit, in_unit); + MatrixMultiplyVecFp16(t, vec_b, m, NULL, in_unit, in_unit, in_unit); + for (int i = 0; i < len; i++) { + int dst_step_offset = i * dst_step; + vst1_f16(dst_data + dst_step_offset, vget_low_f16(m[i])); + vst1_f16(dst_data + dst_step_offset + 64, vget_high_f16(m[i])); } } -OutputTransformUnitFp16Func GetOutputTransFuncFp16(int input_unit, int output_unit) { - if (input_unit == 4 && output_unit == 2) { - return OutputTransform4x2UnitFp16; - } else if (input_unit == 4 && output_unit == 3) { - return OutputTransform4x3UnitFp16; - } else if (input_unit 
== 8) { - return outputTransformUnitFp16[output_unit]; - } else { - printf("."); - return NULL; +void GeneralOutputTransformUnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data, + float16_t *matrix_a, float16_t *matrix_at, int src_step, int dst_step, int in_unit, + int out_unit) { + int src_len = in_unit * in_unit; + if (src_len > MAX_LEN) { + return; + } + float16x8_t src[MAX_LEN]; + float16x8_t t[MAX_LEN]; + float16x8_t m[MAX_LEN]; + float16x8_t vec_a[MAX_LEN]; + float16x8_t vec_at[MAX_LEN]; + int tmp_len = in_unit * out_unit; + if (tmp_len > MAX_LEN) return; + + for (int i = 0; i < tmp_len; i++) { + vec_a[i] = vdupq_n_f16(matrix_a[i]); + vec_at[i] = vdupq_n_f16(matrix_at[i]); + } + for (int i = 0; i < src_len; i++) { + src[i] = vld1q_f16(src_data + i * src_step); + } + MatrixMultiplyVecFp16(vec_at, src, t, NULL, out_unit, in_unit, in_unit); + MatrixMultiplyVecFp16(t, vec_a, m, bias_data, out_unit, in_unit, out_unit); + + for (int i = 0; i < out_unit; i++) { + int dst_k_offset = i * dst_step * C8NUM; + int m_k_offset = i * out_unit; + for (int j = 0; j < out_unit; j++) { + vst1q_f16(dst_data + dst_k_offset + j * C8NUM, m[m_k_offset + j]); + } } } diff --git a/mindspore/lite/nnacl/fp16/winograd_utils_fp16.h b/mindspore/lite/nnacl/fp16/winograd_utils_fp16.h index 99728ca0e2f..b961f6a2daa 100644 --- a/mindspore/lite/nnacl/fp16/winograd_utils_fp16.h +++ b/mindspore/lite/nnacl/fp16/winograd_utils_fp16.h @@ -21,45 +21,17 @@ #include "nnacl/conv_parameter.h" #include "nnacl/op_base.h" -typedef void (*InputTransformUnitFp16Func)(const float16_t *src_data, float16_t *dst_data, int src_step, int dst_step); -typedef void (*OutputTransformUnitFp16Func)(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data, - int src_step, int dst_step); +#define MAX_LEN 256 #ifdef __cplusplus extern "C" { #endif -void InputTransform4x4UnitFp16(const float16_t *src_data, float16_t *dst_data, int src_step, int dst_step); - -void 
InputTransform8x8UnitFp16(const float16_t *src_data, float16_t *dst_data, int src_step, int dst_step); - -void OutputTransform4x2UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data, - int src_step, int dst_step); - -void OutputTransform4x3UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data, - int src_step, int dst_step); - -void OutputTransform8x2UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data, - int src_step, int dst_step); - -void OutputTransform8x3UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data, - int src_step, int dst_step); - -void OutputTransform8x4UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data, - int src_step, int dst_step); - -void OutputTransform8x5UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data, - int src_step, int dst_step); - -void OutputTransform8x6UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data, - int src_step, int dst_step); - -void OutputTransform8x7UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data, - int src_step, int dst_step); - -InputTransformUnitFp16Func GetInputTransFuncFp16(int input_unit); - -OutputTransformUnitFp16Func GetOutputTransFuncFp16(int input_unit, int output_unit); +void GeneralInputTransformUnitFp16(const float16_t *src_data, float16_t *dst_data, float16_t *matrix_b, + float16_t *matrix_bt, int src_step, int dst_step, int in_unit); +void GeneralOutputTransformUnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data, + float16_t *matrix_a, float16_t *matrix_at, int src_step, int dst_step, int in_unit, + int out_unit); #ifdef __cplusplus } #endif diff --git a/mindspore/lite/nnacl/fp32/conv.c b/mindspore/lite/nnacl/fp32/conv.c index 3b390858be6..e52f10f8607 100644 --- a/mindspore/lite/nnacl/fp32/conv.c +++ 
b/mindspore/lite/nnacl/fp32/conv.c @@ -259,8 +259,8 @@ void ConvFp32(float *input_data, float *packed_input, float *packed_weight, cons // fp32 conv winograd void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_data, TmpBufferAddress *buffer_list, - int task_id, ConvParameter *conv_param, InputTransformUnitFunc input_trans_func, - OutputTransformUnitFunc output_trans_func, GEMM_FUNC_FP32 gemm_func) { + int task_id, ConvParameter *conv_param, InputTransFunc in_func, OutputTransFunc out_func, + GEMM_FUNC_FP32 gemm_func) { int thread_num = conv_param->thread_num_; int input_unit = conv_param->input_unit_; int in_batch = conv_param->input_batch_; @@ -296,7 +296,7 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_ cal_num = cal_num > C12NUM ? C12NUM : cal_num; WinogradInputTransform(input_data + in_batch_offset, trans_input + task_id * trans_input_offset, tmp_data + task_id * tmp_data_offset, cal_num, out_tile_index, out_w_block, conv_param, - input_trans_func); + in_func); // step 3 : gemm float *src_ptr = trans_input + task_id * trans_input_offset; float *dst_ptr = gemm_out + task_id * gemm_out_offset; @@ -309,7 +309,7 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_ // step 4 : output transform WinogradOutputTransform(dst_ptr, tmp_out_data + tmp_out_batch_offset, bias_data, cal_num, out_tile_index, - out_w_block, conv_param, output_trans_func); + out_w_block, conv_param, out_func); } } } diff --git a/mindspore/lite/nnacl/fp32/conv.h b/mindspore/lite/nnacl/fp32/conv.h index 3ea865d6b24..7baa37a26a2 100644 --- a/mindspore/lite/nnacl/fp32/conv.h +++ b/mindspore/lite/nnacl/fp32/conv.h @@ -28,6 +28,7 @@ #include "nnacl/fp32/conv_depthwise.h" typedef float *TmpBufferAddress; +typedef float *Matrices; typedef void (*GEMM_FUNC_FP32)(float *output, const float *input, const float *weight, const float *bias, size_t step, size_t ic4, size_t output_channel, size_t offset, size_t mode, 
size_t writeC4, size_t relu, size_t relu6); @@ -53,8 +54,8 @@ void ConvFp32(float *input_data, float *packed_input, float *packed_weight, cons // fp32 convolution winograd void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_data, TmpBufferAddress *buffer_list, - int task_id, ConvParameter *conv_param, InputTransformUnitFunc input_trans_func, - OutputTransformUnitFunc output_trans_func, GEMM_FUNC_FP32 gemm_func); + int task_id, ConvParameter *conv_param, InputTransFunc in_func, OutputTransFunc out_func, + GEMM_FUNC_FP32 gemm_func); void UnPackWinogradOutput(const float *src, float *dst, int batch, int height, int width, int channel, int output_unit); diff --git a/mindspore/lite/nnacl/matrix_table.c b/mindspore/lite/nnacl/matrix_table.c deleted file mode 100644 index 3f4329bb6a9..00000000000 --- a/mindspore/lite/nnacl/matrix_table.c +++ /dev/null @@ -1,507 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "nnacl/matrix_table.h" - -void MatrixG4x2(float *matrix_data) { - matrix_data[0] = 1.0f; - matrix_data[1] = 0.0f; - matrix_data[2] = 1.0f; - matrix_data[3] = 0.5f; - matrix_data[4] = 1.0f; - matrix_data[5] = -0.5f; - matrix_data[6] = 0.0f; - matrix_data[7] = 1.0f; -} - -void MatrixGT2x4(float *matrix_data) { - matrix_data[0] = 1.0f; - matrix_data[1] = 1.0f; - matrix_data[2] = 1.0f; - matrix_data[3] = 0.0f; - matrix_data[4] = 0.0f; - matrix_data[5] = 0.5f; - matrix_data[6] = -0.5f; - matrix_data[7] = 1.0f; -} - -void MatrixG8x2(float *matrix_data) { - matrix_data[0] = 1.0f; - matrix_data[1] = 0.0f; - matrix_data[2] = 1.0f; - matrix_data[3] = 0.5f; - matrix_data[4] = 1.0f; - matrix_data[5] = -0.5f; - matrix_data[6] = 1.0f; - matrix_data[7] = 1.0f; - matrix_data[8] = 1.0f; - matrix_data[9] = -1.0f; - matrix_data[10] = 1.0f; - matrix_data[11] = 1.5f; - matrix_data[12] = 1.0f; - matrix_data[13] = -1.5f; - matrix_data[14] = 0.0f; - matrix_data[15] = 1.0f; -} - -void MatrixGT2x8(float *matrix_data) { - matrix_data[0] = 1.0f; - matrix_data[1] = 1.0f; - matrix_data[2] = 1.0f; - matrix_data[3] = 1.5f; - matrix_data[4] = 1.0f; - matrix_data[5] = 1.0f; - matrix_data[6] = 1.0f; - matrix_data[7] = 0.0f; - matrix_data[8] = 0.0f; - matrix_data[9] = 0.5f; - matrix_data[10] = -0.5f; - matrix_data[11] = 1.0f; - matrix_data[12] = -1.0f; - matrix_data[13] = 1.5f; - matrix_data[14] = -1.5f; - matrix_data[15] = 1.0f; -} - -void MatrixG8x3(float *matrix_data) { - matrix_data[0] = 1.0f; - matrix_data[1] = 0.0f; - matrix_data[2] = 0.0f; - matrix_data[3] = 1.0f; - matrix_data[4] = 0.5f; - matrix_data[5] = 0.25f; - matrix_data[6] = 1.0f; - matrix_data[7] = -0.5f; - matrix_data[8] = 0.25f; - matrix_data[9] = 1.0f; - matrix_data[10] = 1.0f; - matrix_data[11] = 1.0f; - matrix_data[12] = 1.0f; - matrix_data[13] = -1.0f; - matrix_data[14] = 1.0f; - matrix_data[15] = 1.0f; - matrix_data[16] = 1.5f; - matrix_data[17] = 2.25f; - matrix_data[18] = 1.0f; - matrix_data[19] = -1.5f; - 
matrix_data[20] = 2.25f; - matrix_data[21] = 0.0f; - matrix_data[22] = 0.0f; - matrix_data[23] = 1.0f; -} - -void MatrixGT3x8(float *matrix_data) { - matrix_data[0] = 1.0f; - matrix_data[1] = 1.0f; - matrix_data[2] = 1.0f; - matrix_data[3] = 1.0f; - matrix_data[4] = 1.0f; - matrix_data[5] = 1.0f; - matrix_data[6] = 1.0f; - matrix_data[7] = 0.0f; - matrix_data[8] = 0.0f; - matrix_data[9] = 0.5f; - matrix_data[10] = -0.5f; - matrix_data[11] = 1.0f; - matrix_data[12] = -1.0f; - matrix_data[13] = 1.5f; - matrix_data[14] = -1.5f; - matrix_data[15] = 0.0f; - matrix_data[16] = 0.0f; - matrix_data[17] = 0.25f; - matrix_data[18] = 0.25f; - matrix_data[19] = 1.0f; - matrix_data[20] = 1.0f; - matrix_data[21] = 2.25f; - matrix_data[22] = 2.25f; - matrix_data[23] = 1.0f; -} - -void MatrixG8x4(float *matrix_data) { - matrix_data[0] = 1.0f; - matrix_data[1] = 0.0f; - matrix_data[2] = 0.0f; - matrix_data[3] = 0.0f; - matrix_data[4] = 1.0f; - matrix_data[5] = 0.5f; - matrix_data[6] = 0.25f; - matrix_data[7] = 0.125f; - matrix_data[8] = 1.0f; - matrix_data[9] = -0.5f; - matrix_data[10] = 0.25f; - matrix_data[11] = -0.125f; - matrix_data[12] = 1.0f; - matrix_data[13] = 1.0f; - matrix_data[14] = 1.0f; - matrix_data[15] = 1.0f; - matrix_data[16] = 1.0f; - matrix_data[17] = -1.0f; - matrix_data[18] = 1.0f; - matrix_data[19] = -1.0f; - matrix_data[20] = 1.0f; - matrix_data[21] = 1.5f; - matrix_data[22] = 2.25f; - matrix_data[23] = 3.375f; - matrix_data[24] = 1.0f; - matrix_data[25] = -1.5f; - matrix_data[26] = 2.25f; - matrix_data[27] = -3.375f; - matrix_data[28] = 0.0f; - matrix_data[29] = 0.0f; - matrix_data[30] = 0.0f; - matrix_data[31] = 1.0f; -} - -void MatrixGT4x8(float *matrix_data) { - matrix_data[0] = 1.0f; - matrix_data[1] = 1.0f; - matrix_data[2] = 1.0f; - matrix_data[3] = 1.0f; - matrix_data[4] = 1.0f; - matrix_data[5] = 1.0f; - matrix_data[6] = 1.0f; - matrix_data[7] = 0.0f; - matrix_data[8] = 0.0f; - matrix_data[9] = 0.5f; - matrix_data[10] = -0.5f; - matrix_data[11] = 
1.0f; - matrix_data[12] = -1.0f; - matrix_data[13] = 1.5f; - matrix_data[14] = -1.5f; - matrix_data[15] = 0.0f; - matrix_data[16] = 0.0f; - matrix_data[17] = 0.25f; - matrix_data[18] = 0.25f; - matrix_data[19] = 1.0f; - matrix_data[20] = 1.0f; - matrix_data[21] = 2.25f; - matrix_data[22] = 2.25f; - matrix_data[23] = 0.0f; - matrix_data[24] = 0.0f; - matrix_data[25] = 0.125f; - matrix_data[26] = -0.125f; - matrix_data[27] = 1.0f; - matrix_data[28] = -1.0f; - matrix_data[29] = 3.375f; - matrix_data[30] = -3.375f; - matrix_data[31] = 1.0f; -} - -void MatrixG8x5(float *matrix_data) { - matrix_data[0] = 1.0f; - matrix_data[1] = 0.0f; - matrix_data[2] = 0.0f; - matrix_data[3] = 0.0f; - matrix_data[4] = 0.0f; - matrix_data[5] = 1.0f; - matrix_data[6] = 0.5f; - matrix_data[7] = 0.25f; - matrix_data[8] = 0.125f; - matrix_data[9] = 0.0625f; - matrix_data[10] = 1.0f; - matrix_data[11] = -0.5f; - matrix_data[12] = 0.25f; - matrix_data[13] = -0.125f; - matrix_data[14] = 0.0625f; - matrix_data[15] = 1.0f; - matrix_data[16] = 1.0f; - matrix_data[17] = 1.0f; - matrix_data[18] = 1.0f; - matrix_data[19] = 1.0f; - matrix_data[20] = 1.0f; - matrix_data[21] = -1.0f; - matrix_data[22] = 1.0f; - matrix_data[23] = -1.0f; - matrix_data[24] = 1.0f; - matrix_data[25] = 1.0f; - matrix_data[26] = 1.5f; - matrix_data[27] = 2.25f; - matrix_data[28] = 3.375f; - matrix_data[29] = 5.0625f; - matrix_data[30] = 1.0f; - matrix_data[31] = -1.5f; - matrix_data[32] = 2.25f; - matrix_data[33] = -3.375f; - matrix_data[34] = 5.0625f; - matrix_data[35] = 0.0f; - matrix_data[36] = 0.0f; - matrix_data[37] = 0.0f; - matrix_data[38] = 0.0f; - matrix_data[39] = 1.0f; -} - -void MatrixGT5x8(float *matrix_data) { - matrix_data[0] = 1.0f; - matrix_data[1] = 1.0f; - matrix_data[2] = 1.0f; - matrix_data[3] = 1.0f; - matrix_data[4] = 1.0f; - matrix_data[5] = 1.0f; - matrix_data[6] = 1.0f; - matrix_data[7] = 0.0f; - matrix_data[8] = 0.0f; - matrix_data[9] = 0.5f; - matrix_data[10] = -0.5f; - matrix_data[11] = 1.0f; - 
matrix_data[12] = -1.0f; - matrix_data[13] = 1.5f; - matrix_data[14] = -1.5f; - matrix_data[15] = 0.0f; - matrix_data[16] = 0.0f; - matrix_data[17] = 0.25f; - matrix_data[18] = 0.25f; - matrix_data[19] = 1.0f; - matrix_data[20] = 1.0f; - matrix_data[21] = 2.25f; - matrix_data[22] = 2.25f; - matrix_data[23] = 0.0f; - matrix_data[24] = 0.0f; - matrix_data[25] = 0.125f; - matrix_data[26] = -0.125f; - matrix_data[27] = 1.0f; - matrix_data[28] = -1.0f; - matrix_data[29] = 3.375f; - matrix_data[30] = -3.375f; - matrix_data[31] = 0.0f; - matrix_data[32] = 0.0f; - matrix_data[33] = 0.0625f; - matrix_data[34] = 0.0625f; - matrix_data[35] = 1.0f; - matrix_data[36] = 1.0f; - matrix_data[37] = 5.0625f; - matrix_data[38] = 5.0625f; - matrix_data[39] = 1.0f; -} - -void MatrixG8x6(float *matrix_data) { - matrix_data[0] = 1.0f; - matrix_data[1] = 0.0f; - matrix_data[2] = 0.0f; - matrix_data[3] = 0.0f; - matrix_data[4] = 0.0f; - matrix_data[5] = 0.0f; - matrix_data[6] = 1.0f; - matrix_data[7] = 0.5f; - matrix_data[8] = 0.25f; - matrix_data[9] = 0.125f; - matrix_data[10] = 0.0625f; - matrix_data[11] = 0.03125f; - matrix_data[12] = 1.0f; - matrix_data[13] = -0.5f; - matrix_data[14] = 0.25f; - matrix_data[15] = -0.125f; - matrix_data[16] = 0.0625f; - matrix_data[17] = -0.03125f; - matrix_data[18] = 1.0f; - matrix_data[19] = 1.0f; - matrix_data[20] = 1.0f; - matrix_data[21] = 1.0f; - matrix_data[22] = 1.0f; - matrix_data[23] = 1.0f; - matrix_data[24] = 1.0f; - matrix_data[25] = -1.0f; - matrix_data[26] = 1.0f; - matrix_data[27] = -1.0f; - matrix_data[28] = 1.0f; - matrix_data[29] = -1.0f; - matrix_data[30] = 1.0f; - matrix_data[31] = 1.5f; - matrix_data[32] = 2.25f; - matrix_data[33] = 3.375f; - matrix_data[34] = 5.0625f; - matrix_data[35] = 7.59375f; - matrix_data[36] = 1.0f; - matrix_data[37] = -1.5f; - matrix_data[38] = 2.25f; - matrix_data[39] = -3.375f; - matrix_data[40] = 5.0625f; - matrix_data[41] = -7.59375f; - matrix_data[42] = 0.0f; - matrix_data[43] = 0.0f; - matrix_data[44] 
= 0.0f; - matrix_data[45] = 0.0f; - matrix_data[46] = 0.0f; - matrix_data[47] = 1.0f; -} - -void MatrixGT6x8(float *matrix_data) { - matrix_data[0] = 1.0f; - matrix_data[1] = 1.0f; - matrix_data[2] = 1.0f; - matrix_data[3] = 1.0f; - matrix_data[4] = 1.0f; - matrix_data[5] = 1.0f; - matrix_data[6] = 1.0f; - matrix_data[7] = 0.0f; - matrix_data[8] = 0.0f; - matrix_data[9] = 0.5f; - matrix_data[10] = -0.5f; - matrix_data[11] = 1.0f; - matrix_data[12] = -1.0f; - matrix_data[13] = 1.5f; - matrix_data[14] = -1.5f; - matrix_data[15] = 0.0f; - matrix_data[16] = 0.0f; - matrix_data[17] = 0.25f; - matrix_data[18] = 0.25f; - matrix_data[19] = 1.0f; - matrix_data[20] = 1.0f; - matrix_data[21] = 2.25f; - matrix_data[22] = 2.25f; - matrix_data[23] = 0.0f; - matrix_data[24] = 0.0f; - matrix_data[25] = 0.125f; - matrix_data[26] = -0.125f; - matrix_data[27] = 1.0f; - matrix_data[28] = -1.0f; - matrix_data[29] = 3.375f; - matrix_data[30] = -3.375f; - matrix_data[31] = 0.0f; - matrix_data[32] = 0.0f; - matrix_data[33] = 0.0625f; - matrix_data[34] = 0.0625f; - matrix_data[35] = 1.0f; - matrix_data[36] = 1.0f; - matrix_data[37] = 5.0625f; - matrix_data[38] = 5.0625f; - matrix_data[39] = 0.0f; - matrix_data[40] = 0.0; - matrix_data[41] = 0.03125f; - matrix_data[42] = -0.03125f; - matrix_data[43] = 1.0f; - matrix_data[44] = -1.0f; - matrix_data[45] = 7.59375f; - matrix_data[46] = -7.59375f; - matrix_data[47] = 0.0f; - matrix_data[48] = 1.0f; -} - -void MatrixG8x7(float *matrix_data) { - matrix_data[0] = 1.0f; - matrix_data[1] = 0.0f; - matrix_data[2] = 0.0f; - matrix_data[3] = 0.0f; - matrix_data[4] = 0.0f; - matrix_data[5] = 0.0f; - matrix_data[6] = 0.0f; - matrix_data[7] = 1.0f; - matrix_data[8] = 0.5f; - matrix_data[9] = 0.25f; - matrix_data[10] = 0.125f; - matrix_data[11] = 0.0625f; - matrix_data[12] = 0.03125f; - matrix_data[13] = 0.015625f; - matrix_data[14] = 1.0f; - matrix_data[15] = -0.5f; - matrix_data[16] = 0.25f; - matrix_data[17] = -0.125f; - matrix_data[18] = 0.0625f; - 
matrix_data[19] = -0.03125f; - matrix_data[20] = 0.015625f; - matrix_data[21] = 1.0f; - matrix_data[22] = 1.0f; - matrix_data[23] = 1.0f; - matrix_data[24] = 1.0f; - matrix_data[25] = 1.0f; - matrix_data[26] = 1.0f; - matrix_data[27] = 1.0f; - matrix_data[28] = 1.0f; - matrix_data[29] = -1.0f; - matrix_data[30] = 1.0f; - matrix_data[31] = -1.0f; - matrix_data[32] = 1.0f; - matrix_data[33] = -1.0f; - matrix_data[34] = 1.0f; - matrix_data[35] = 1.0f; - matrix_data[36] = 1.5f; - matrix_data[37] = 2.25f; - matrix_data[38] = 3.375f; - matrix_data[39] = 5.0625f; - matrix_data[40] = 7.59375f; - matrix_data[41] = 11.390625f; - matrix_data[42] = 1.0f; - matrix_data[43] = -1.5f; - matrix_data[44] = 2.25f; - matrix_data[45] = -3.375f; - matrix_data[46] = 5.0625f; - matrix_data[47] = -7.59375f; - matrix_data[48] = 11.390625f; - matrix_data[49] = 0.0f; - matrix_data[50] = 0.0f; - matrix_data[51] = 0.0f; - matrix_data[52] = 0.0f; - matrix_data[53] = 0.0f; - matrix_data[54] = 0.0f; - matrix_data[55] = 1.0f; -} - -void MatrixGT7x8(float *matrix_data) { - matrix_data[0] = 1.0f; - matrix_data[1] = 1.0f; - matrix_data[2] = 1.0f; - matrix_data[3] = 1.0f; - matrix_data[4] = 1.0f; - matrix_data[5] = 1.0f; - matrix_data[6] = 1.0f; - matrix_data[7] = 0.0f; - matrix_data[8] = 0.0f; - matrix_data[9] = 0.5f; - matrix_data[10] = -0.5f; - matrix_data[11] = 1.0f; - matrix_data[12] = -1.0f; - matrix_data[13] = 1.5f; - matrix_data[14] = -1.5f; - matrix_data[15] = 0.0f; - matrix_data[16] = 0.0f; - matrix_data[17] = 0.25f; - matrix_data[18] = 0.25f; - matrix_data[19] = 1.0f; - matrix_data[20] = 1.0f; - matrix_data[21] = 2.25f; - matrix_data[22] = 2.25f; - matrix_data[23] = 0.0f; - matrix_data[24] = 0.0f; - matrix_data[25] = 0.125f; - matrix_data[26] = -0.125f; - matrix_data[27] = 1.0f; - matrix_data[28] = -1.0f; - matrix_data[29] = 3.375f; - matrix_data[30] = -3.375f; - matrix_data[31] = 0.0f; - matrix_data[32] = 0.0f; - matrix_data[33] = 0.0625f; - matrix_data[34] = 0.0625f; - matrix_data[35] = 
1.0f; - matrix_data[36] = 1.0f; - matrix_data[37] = 5.0625f; - matrix_data[38] = 5.0625f; - matrix_data[39] = 0.0f; - matrix_data[40] = 0.0; - matrix_data[41] = 0.03125f; - matrix_data[42] = -0.03125f; - matrix_data[43] = 1.0f; - matrix_data[44] = -1.0f; - matrix_data[45] = 7.59375f; - matrix_data[46] = -7.59375f; - matrix_data[47] = 0.0f; - matrix_data[48] = 0.0f; - matrix_data[49] = 0.015625f; - matrix_data[50] = 0.015625f; - matrix_data[51] = 1.0f; - matrix_data[52] = 1.0f; - matrix_data[53] = 11.390625f; - matrix_data[54] = 11.390625f; - matrix_data[55] = 1.0f; -} diff --git a/mindspore/lite/nnacl/matrix_table.h b/mindspore/lite/nnacl/matrix_table.h deleted file mode 100644 index b2c7f5404c6..00000000000 --- a/mindspore/lite/nnacl/matrix_table.h +++ /dev/null @@ -1,54 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_LITE_NNACL_MATRIX_TABLE_H_ -#define MINDSPORE_LITE_NNACL_MATRIX_TABLE_H_ - -#ifdef __cplusplus -extern "C" { -#endif -void MatrixG4x2(float *matrix_data); - -void MatrixGT2x4(float *matrix_data); - -void MatrixG8x2(float *matrix_data); - -void MatrixGT2x8(float *matrix_data); - -void MatrixG8x3(float *matrix_data); - -void MatrixGT3x8(float *matrix_data); - -void MatrixG8x4(float *matrix_data); - -void MatrixGT4x8(float *matrix_data); - -void MatrixG8x5(float *matrix_data); - -void MatrixGT5x8(float *matrix_data); - -void MatrixG8x6(float *matrix_data); - -void MatrixGT6x8(float *matrix_data); - -void MatrixG8x7(float *matrix_data); - -void MatrixGT7x8(float *matrix_data); -#ifdef __cplusplus -} -#endif - -#endif // MINDSPORE_LITE_NNACL_MATRIX_TABLE_H_ diff --git a/mindspore/lite/nnacl/minimal_filtering_generator.c b/mindspore/lite/nnacl/minimal_filtering_generator.c new file mode 100644 index 00000000000..00e9901dcf2 --- /dev/null +++ b/mindspore/lite/nnacl/minimal_filtering_generator.c @@ -0,0 +1,233 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "nnacl/minimal_filtering_generator.h" +#include <string.h> +#include <stdlib.h> +#include <math.h> + +void Polynomial(float *interval, float *m, int degree) { + for (int i = 0; i < degree; ++i) { + float mul = 1; + for (int j = 0; j < degree; ++j) { + if (i == j) continue; + mul *= (interval[i] - interval[j]); + } + m[i] = mul; + } +} + +void DiagonalPlusMatrix(float *matrix, float *diagonal_matrix, int degree) { + int data_num = (degree + 1) * (degree + 1); + memset(diagonal_matrix, 0, data_num * sizeof(float)); + for (int i = 0; i < (degree + 1); ++i) { + for (int j = 0; j < (degree + 1); ++j) { + if (j == i) diagonal_matrix[i * (degree + 1) + j] = matrix[i]; + } + } + diagonal_matrix[data_num - 1] = 1; +} + +void ResidueMatrix(float *interval, float *b, int row, int col) { + // row : input unit, col : output_unit + // result : matrix b + int len = row * col; + memset(b, 0, len * sizeof(float)); + for (int i = 0; i < row - 1; ++i) { + for (int j = 0; j < col; ++j) { + b[i * col + j] = pow(interval[i], j); + } + } + b[len - 1] = 1; +} + +void LT(float *poly_array, float *matrix_lt, int n) { + float *coefficient_array = (float *)malloc(n * sizeof(float)); + float *poly = (float *)malloc(n * sizeof(float)); + Polynomial(poly_array, poly, n); + for (int i = 0; i < n; ++i) { + // get coefficient + int index = 1; + memset(coefficient_array, 0, n * sizeof(float)); + coefficient_array[0] = 1; + for (int j = 0; j < n; ++j) { + if (j == i) continue; + float poly_coe = poly_array[j] == 0 ? 
0 : -poly_array[j]; + coefficient_array[index] = 1; + for (int k = index - 1; k > 0; --k) { + coefficient_array[k] = coefficient_array[k] * poly_coe + coefficient_array[k - 1]; + } + coefficient_array[0] *= poly_coe; + index++; + } + + // lx[i, 0].nth(j) / f[i] + int setp = i * n; + for (int l = 0; l < n; ++l) { + matrix_lt[setp + l] = coefficient_array[l] / poly[i]; + } + } // matrix L row loop + free(coefficient_array); + free(poly); +} + +void T(float *poly_array, float *matrix_t, int n) { + memset(matrix_t, 0, n * (n + 1) * sizeof(float)); + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n + 1; ++j) { + if (j == i) matrix_t[i * (n + 1) + j] = 1; + if (j == n) { + if (poly_array[i] == 0) { + matrix_t[i * (n + 1) + j] = 0; + } else { + matrix_t[i * (n + 1) + j] = -pow(poly_array[i], n); + } + } + } + } +} + +void B(float *poly_array, float *matrix_b, int in_unit) { + memset(matrix_b, 0, in_unit * in_unit * sizeof(float)); + int n = in_unit - 1; + float *matrix_l = (float *)malloc(n * n * sizeof(float)); + float *matrix_lt = (float *)malloc(n * n * sizeof(float)); + float *matrix_t = (float *)malloc(n * in_unit * sizeof(float)); + T(poly_array, matrix_t, n); + LT(poly_array, matrix_lt, n); + MatrixTranspose(matrix_lt, matrix_l, n, n); + MatrixMultiply(matrix_l, matrix_t, matrix_b, n, n, in_unit); + matrix_b[in_unit * in_unit - 1] = 1; + free(matrix_l); + free(matrix_lt); + free(matrix_t); +} + +void GenerateIntervalArray(float *array, float interval, int degree) { + array[0] = 0; + for (int i = 1; i < degree; ++i) { + int coefficient = pow(-1, i - 1); + array[i] = array[i - 1] + interval * i * coefficient; + } +} + +void MatrixTranspose(float *matrix, float *trans_matrix, int row, int col) { + for (int i = 0; i < col; ++i) { + for (int j = 0; j < row; ++j) { + trans_matrix[i * row + j] = matrix[j * col + i]; + } + } +} + +void MatrixMultiply(const float *matrix_a, const float *matrix_b, float *matrix_c, int m, int k, int n) { + int count = 0; + for (int h = 
0; h < m; h++) { + int h_offset = h * k; + for (int w = 0; w < n; w++) { + float res = 0; + for (int i = 0; i < k; i++) { + res += *(matrix_a + h_offset + i) * *(matrix_b + w + i * n); + } + *(matrix_c + count) = res; + count++; + } + } +} + +void CookToomFilter(float *matrix_a, float *matrix_at, float *matrix_b, float *matrix_bt, float *matrix_g, + float *matrix_gt, float coefficient, int out_unit, int filter_size) { + int in_unit = out_unit + filter_size - 1; + int degree = in_unit - 1; + float *polynomial_m = malloc(degree * sizeof(float)); + float *diagonal_matrix = malloc(in_unit * in_unit * sizeof(float)); + float *inverse_diagonal_matrix = malloc(in_unit * in_unit * sizeof(float)); + + // get diagonal matrix + float *interval = malloc(degree * sizeof(float)); + GenerateIntervalArray(interval, coefficient, degree); + Polynomial(interval, polynomial_m, degree); + DiagonalPlusMatrix(polynomial_m, diagonal_matrix, degree); + if (diagonal_matrix[0] < 0) { + for (int i = 0; i < in_unit; ++i) { + if (diagonal_matrix[i] != 0) diagonal_matrix[i] *= -1; + } + } + + // inverse diagonal matrix + for (int j = 0; j < in_unit * in_unit; ++j) { + if (diagonal_matrix[j] != 0) { + inverse_diagonal_matrix[j] = 1.0 / diagonal_matrix[j]; + } else { + inverse_diagonal_matrix[j] = 0; + } + } + + // get matrix A && AT + ResidueMatrix(interval, matrix_a, in_unit, out_unit); + MatrixTranspose(matrix_a, matrix_at, in_unit, out_unit); + + // get matrix B + B(interval, matrix_bt, in_unit); + MatrixTranspose(matrix_bt, matrix_b, in_unit, in_unit); + MatrixMultiply(diagonal_matrix, matrix_b, matrix_bt, in_unit, in_unit, in_unit); + MatrixTranspose(matrix_bt, matrix_b, in_unit, in_unit); + + // get matrix G && GT + float *tmp_g = malloc(in_unit * filter_size * sizeof(float)); + ResidueMatrix(interval, matrix_g, in_unit, filter_size); + MatrixTranspose(matrix_g, tmp_g, in_unit, filter_size); + MatrixMultiply(tmp_g, inverse_diagonal_matrix, matrix_gt, filter_size, in_unit, in_unit); + 
MatrixTranspose(matrix_gt, matrix_g, filter_size, in_unit); + + free(interval); + free(polynomial_m); + free(diagonal_matrix); + free(inverse_diagonal_matrix); + free(tmp_g); +} + +#ifdef ENABLE_ARM +void MatrixMultiplyVec(const float32x4_t *matrix_a, const float32x4_t *matrix_b, float32x4_t *matrix_c, + const float *bias, int m, int k, int n) { + if (bias == NULL) { + int count = 0; + for (int h = 0; h < m; h++) { + int h_offset = h * k; + for (int w = 0; w < n; w++) { + float32x4_t res = vmovq_n_f32(0); + for (int i = 0; i < k; i++) { + res = vmlaq_f32(res, matrix_a[h_offset + i], matrix_b[w + i * n]); + } + matrix_c[count] = res; + count++; + } + } + } else { + int count = 0; + float32x4_t bias_ptr = vld1q_f32(bias); + for (int h = 0; h < m; h++) { + int h_offset = h * k; + for (int w = 0; w < n; w++) { + float32x4_t res = vmovq_n_f32(0); + for (int i = 0; i < k; i++) { + res = vmlaq_f32(res, matrix_a[h_offset + i], matrix_b[w + i * n]); + } + matrix_c[count] = vaddq_f32(res, bias_ptr); + count++; + } + } + } +} +#endif diff --git a/mindspore/lite/nnacl/minimal_filtering_generator.h b/mindspore/lite/nnacl/minimal_filtering_generator.h new file mode 100644 index 00000000000..5a629bb2beb --- /dev/null +++ b/mindspore/lite/nnacl/minimal_filtering_generator.h @@ -0,0 +1,56 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef MINDSPORE_LITE_NNACL_MINIMAL_FILTERING_GENERATOR_H_
+#define MINDSPORE_LITE_NNACL_MINIMAL_FILTERING_GENERATOR_H_
+
+#ifdef ENABLE_ARM
+#include <arm_neon.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// m[i] = product of (interval[i] - interval[j]) over every j != i, for i in [0, degree).
+void Polynomial(float *interval, float *m, int degree);
+
+// (degree + 1) x (degree + 1) matrix: `matrix` values on the diagonal, 1 in the last diagonal slot, 0 elsewhere.
+void DiagonalPlusMatrix(float *matrix, float *diagonal_matrix, int degree);
+
+// row x col matrix b with b[i][j] = interval[i]^j for the first row - 1 rows; last row is 0 ... 0 1.
+void ResidueMatrix(float *interval, float *b, int row, int col);
+
+// n x n matrix whose row i holds the coefficients (constant term first) of the product of
+// (x - poly_array[j]) over j != i, divided by the matching Polynomial() value.
+void LT(float *poly_array, float *matrix_lt, int n);
+
+// n x (n + 1) matrix: identity plus a last column holding -(poly_array[i]^n).
+void T(float *poly_array, float *matrix_t, int n);
+
+// in_unit x in_unit matrix built from transpose(LT) * T, with a 1 in the bottom-right corner.
+void B(float *poly_array, float *matrix_b, int in_unit);
+
+// array = 0, +interval, -interval, +2 * interval, -2 * interval, ... (degree entries).
+void GenerateIntervalArray(float *array, float interval, int degree);
+
+// trans_matrix (col x row) = transpose of matrix (row x col); row-major.
+void MatrixTranspose(float *matrix, float *trans_matrix, int row, int col);
+
+// matrix_c (m x n) = matrix_a (m x k) * matrix_b (k x n); row-major.
+void MatrixMultiply(const float *matrix_a, const float *matrix_b, float *matrix_c, int m, int k, int n);
+
+// Generates A/AT, B/BT and G/GT for in_unit = out_unit + filter_size - 1; `coefficient` is the point spacing.
+void CookToomFilter(float *matrix_a, float *matrix_at, float *matrix_b, float *matrix_bt, float *matrix_g,
+                    float *matrix_gt, float coefficient, int out_unit, int filter_size);
+
+#ifdef ENABLE_ARM
+// Lane-wise float32x4_t version of MatrixMultiply; a non-NULL bias (4 floats) is added to every output element.
+void MatrixMultiplyVec(const float32x4_t *matrix_a, const float32x4_t *matrix_b, float32x4_t *matrix_c,
+                       const float *bias, int m, int k, int n);
+#endif
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // MINDSPORE_LITE_NNACL_MINIMAL_FILTERING_GENERATOR_H_
diff --git a/mindspore/lite/nnacl/pack.c b/mindspore/lite/nnacl/pack.c
index 89dbc78788c..83d7caa9286 100644
--- a/mindspore/lite/nnacl/pack.c
+++ b/mindspore/lite/nnacl/pack.c
@@ -101,10 +101,6 @@ void PackWeightInt8(int8_t *weight_data, ConvParameter *conv_param, int8_t *pack
             int8_t *origin_data_ptr = weight_data + kernel_block_stride + k * kernel_plane * in_channel;
             int8_t *packed_data_ptr = packed_weight + packed_kernel_block_size + k * C4NUM * C4NUM;
             *packed_data_ptr = origin_data_ptr[0];
-            // value of weight must between [-127, 127]
-            if (packed_data_ptr[0] == -128) {
-              packed_data_ptr[0] = -127;
-            }
             weight_sum[j * C4NUM + k] += (int32_t)packed_data_ptr[0];
           }
         }  // kernel block loop
@@ 
-146,9 +142,6 @@ void PackWeightInt8Opt(int8_t *weight_data, ConvParameter *conv_param, int8_t *p int8_t *origin_data_ptr = weight_data + kernel_block_stride + k * kernel_plane * in_channel; int8_t *packed_data_ptr = packed_weight + packed_kernel_block_size + k * C4NUM; *packed_data_ptr = origin_data_ptr[0]; - if (packed_data_ptr[0] == -128) { - packed_data_ptr[0] = -127; - } weight_sum[j * C4NUM + k] += (int32_t)(packed_data_ptr[0]); } } // kernel block loop diff --git a/mindspore/lite/nnacl/winograd_transform.c b/mindspore/lite/nnacl/winograd_transform.c index cc8f814d70d..e74da4fcf41 100644 --- a/mindspore/lite/nnacl/winograd_transform.c +++ b/mindspore/lite/nnacl/winograd_transform.c @@ -18,8 +18,7 @@ // fp32 conv winograd void WinogradInputTransform(const float *input_data, float *trans_input, float *tmp_data, int cal_num, - int out_tile_index, int out_w_block_num, ConvParameter *conv_param, - InputTransformUnitFunc input_trans_func) { + int out_tile_index, int out_w_block_num, ConvParameter *conv_param, InputTransFunc func) { int input_unit = conv_param->input_unit_; int output_unit = conv_param->output_unit_; int in_channel = conv_param->input_channel_; @@ -31,6 +30,7 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float * if (out_w_block_num == 0) { return; } + for (int c = 0; c < cal_num; c++) { // actual tiled number int src_x_s = (out_tile_index % out_w_block_num) * output_unit - pad_w; int src_y_s = (out_tile_index / out_w_block_num) * output_unit - pad_h; @@ -70,15 +70,15 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float * int dst_ic4_offset = dst_plane_offset + ic * C4NUM; size_t dst_step = C12NUM * ic4 * C4NUM; float *trans_input_ptr = trans_input + dst_ic4_offset; - input_trans_func(tmp_data, trans_input_ptr, C4NUM, dst_step); + func(tmp_data, trans_input_ptr, C4NUM, dst_step); + // GeneralInputTransformUnit(tmp_data, trans_input_ptr, matrix_b, matrix_bt, C4NUM, dst_step, input_unit); } 
out_tile_index++; } // cal_tile_num loop } void WinogradOutputTransform(const float *gemm_out, float *tmp_out_data, const float *bias_data, int cal_num, - int out_tile_index, int output_unit_num, ConvParameter *conv_param, - OutputTransformUnitFunc output_trans_func) { + int out_tile_index, int output_unit_num, ConvParameter *conv_param, OutputTransFunc func) { int output_unit = conv_param->output_unit_; int output_w = conv_param->output_w_; int output_h = conv_param->output_h_; @@ -106,7 +106,9 @@ void WinogradOutputTransform(const float *gemm_out, float *tmp_out_data, const f const float *src_ptr = gemm_out + src_oc4_offset; const float *bias_ptr = bias_data + j * C4NUM; float *dst_ptr = tmp_out_data + dst_oc4_offset; - output_trans_func(src_ptr, dst_ptr, bias_ptr, C8NUM, output_w_unit_block * output_unit); + func(src_ptr, dst_ptr, bias_ptr, C8NUM, output_w_unit_block * output_unit); + // GeneralOutputTransformUnit(src_ptr, dst_ptr, bias_ptr, matrix_a, matrix_at, C8NUM, + // output_w_unit_block * output_unit, input_unit, output_unit); } out_tile_index++; } @@ -865,7 +867,7 @@ void Conv3x3Int8InputUnit(int16_t *tmp_data, int16_t *trans_input_data, size_t s } void Conv3x3Int8InputTransform(const int16_t *input_data, int16_t *trans_input, int16_t *tmp_data, int start_index, - int real_cal_num, int out_w_block, ConvParameter *conv_param) { + int real_cal_num, int out_w_block, ConvParameter *conv_param) { // input data format : nhwc int input_channel = conv_param->input_channel_; int input_width = conv_param->input_w_; @@ -1176,7 +1178,7 @@ void Conv3x3Int8FilterTransform(const int16_t *weight_data, int16_t *trans_weigh } void Conv3x3Int8OutputUnit(const int32_t *gemm_out, const int32_t *bias_data, int8_t *output_data, bool h_not_bound, - bool w_not_bound, int output_w, int real_num, int oc_start, ConvParameter *conv_param) { + bool w_not_bound, int output_w, int real_num, int oc_start, ConvParameter *conv_param) { int32_t *left_shift = 
conv_param->conv_quant_arg_.left_shift_; int32_t *right_shift = conv_param->conv_quant_arg_.right_shift_; int32_t *quant_multiplier = conv_param->conv_quant_arg_.quant_multiplier_; @@ -1457,7 +1459,7 @@ void Conv3x3Int8OutputUnit(const int32_t *gemm_out, const int32_t *bias_data, in } void Conv3x3Int8OutputTransform(const int32_t *gemm_out, int8_t *out_data, const int32_t *bias_data, int start_index, - int real_cal_num, int out_w_block, ConvParameter *conv_param) { + int real_cal_num, int out_w_block, ConvParameter *conv_param) { int output_channel = conv_param->output_channel_; int output_w = conv_param->output_w_; int output_h = conv_param->output_h_; @@ -1484,7 +1486,7 @@ void Conv3x3Int8OutputTransform(const int32_t *gemm_out, int8_t *out_data, const bool w_not_bound = out_w_index * OUPUT_UNIT + 1 < output_w; bool h_not_bound = out_h_index * OUPUT_UNIT + 1 < output_h; Conv3x3Int8OutputUnit(src_ptr, bias_ptr, dst_ptr, h_not_bound, w_not_bound, output_w, real_num, j * C4NUM, - conv_param); + conv_param); } } } diff --git a/mindspore/lite/nnacl/winograd_transform.h b/mindspore/lite/nnacl/winograd_transform.h index 0b53e97ff19..d8b8914b000 100644 --- a/mindspore/lite/nnacl/winograd_transform.h +++ b/mindspore/lite/nnacl/winograd_transform.h @@ -33,12 +33,10 @@ extern "C" { #endif // for fp32 winograd input/output transform void WinogradInputTransform(const float *input_data, float *trans_input, float *tmp_data, int cal_num, - int out_tile_index, int out_w_block_num, ConvParameter *conv_param, - InputTransformUnitFunc input_trans_func); + int out_tile_index, int out_w_block_num, ConvParameter *conv_param, InputTransFunc func); void WinogradOutputTransform(const float *gemm_out, float *tmp_out_data, const float *bias_data, int cal_num, - int out_tile_index, int output_unit_num, ConvParameter *conv_param, - OutputTransformUnitFunc output_trans_func); + int out_tile_index, int output_unit_num, ConvParameter *conv_param, OutputTransFunc func); // for fp32 convolution 
3x3 filter/input/output transform void Conv3x3Fp32InputUnit(const float *tmp_data, float *trans_input_data, size_t step); diff --git a/mindspore/lite/nnacl/winograd_utils.c b/mindspore/lite/nnacl/winograd_utils.c index f53c9197151..b79b323e215 100644 --- a/mindspore/lite/nnacl/winograd_utils.c +++ b/mindspore/lite/nnacl/winograd_utils.c @@ -16,1364 +16,408 @@ #include "nnacl/winograd_utils.h" #include +#include "nnacl/minimal_filtering_generator.h" #define MIN_UNIT 2 #define MAX_UNIT 8 -static OutputTransformUnitFunc outputTransformUnit[] = { - NULL, // 0 - NULL, // 1 - OutputTransform8x2Unit, - OutputTransform8x3Unit, - OutputTransform8x4Unit, - OutputTransform8x5Unit, - OutputTransform8x6Unit, - OutputTransform8x7Unit, -}; +static InputTransFunc InputTransFuncList[] = { + NULL, NULL, NULL, NULL, InputTransform4x4Unit, NULL, InputTransform6x6Unit, NULL, InputTransform8x8Unit}; + +static OutputTransFunc OutputTransFuncList4[] = {NULL, NULL, OutputTransform4x2Unit, OutputTransform4x3Unit}; + +static OutputTransFunc OutputTransFuncList6[] = { + NULL, NULL, OutputTransform6x2Unit, OutputTransform6x3Unit, OutputTransform6x4Unit, OutputTransform6x5Unit}; + +static OutputTransFunc OutputTransFuncList8[] = {NULL, + NULL, + OutputTransform8x2Unit, + OutputTransform8x3Unit, + OutputTransform8x4Unit, + OutputTransform8x5Unit, + OutputTransform8x6Unit, + OutputTransform8x7Unit}; +// +// static bool InputUnitList[] = {false, false, false, false, true, false, true, false, true}; + +void GeneralInputTransformUnit(const float *src_data, float *dst_data, float *matrix_b, float *matrix_bt, int src_step, + int dst_step, int in_unit) { + int len = in_unit * in_unit; + if (len > MAX_LEN) return; +#ifdef ENABLE_ARM + float32x4_t src[MAX_LEN]; + float32x4_t t[MAX_LEN]; + float32x4_t m[MAX_LEN]; + float32x4_t vec_b[MAX_LEN]; + float32x4_t vec_bt[MAX_LEN]; + for (int i = 0; i < len; i++) { + src[i] = vld1q_f32(src_data + i * src_step); + vec_b[i] = vdupq_n_f32(matrix_b[i]); + vec_bt[i] = 
vdupq_n_f32(matrix_bt[i]); + } + MatrixMultiplyVec(vec_bt, src, t, NULL, in_unit, in_unit, in_unit); + MatrixMultiplyVec(t, vec_b, m, NULL, in_unit, in_unit, in_unit); + for (int i = 0; i < len; i++) { + vst1q_f32(dst_data + i * dst_step, m[i]); + } +#else + float src[MAX_LEN]; + float t[MAX_LEN]; + float m[MAX_LEN]; + for (int i = 0; i < C4NUM; ++i) { + for (int j = 0; j < len; ++j) { + src[j] = src_data[i + j * src_step]; + } + MatrixMultiply(matrix_bt, src, t, in_unit, in_unit, in_unit); + MatrixMultiply(t, matrix_b, m, in_unit, in_unit, in_unit); + for (int k = 0; k < len; ++k) { + dst_data[i + k * dst_step] = m[k]; + } + } +#endif +} + +void GeneralOutputTransformUnit(const float *src_data, float *dst_data, const float *bias_data, float *matrix_a, + float *matrix_at, int src_step, int dst_step, int in_unit, int out_unit) { + int src_len = in_unit * in_unit; + if (src_len > MAX_LEN) { + return; + } +#ifdef ENABLE_ARM + float32x4_t src[MAX_LEN]; + float32x4_t t[MAX_LEN]; + float32x4_t m[MAX_LEN]; + float32x4_t vec_a[MAX_LEN]; + float32x4_t vec_at[MAX_LEN]; + int tmp_len = in_unit * out_unit; + if (tmp_len > MAX_LEN) return; + + for (int i = 0; i < tmp_len; i++) { + vec_a[i] = vdupq_n_f32(matrix_a[i]); + vec_at[i] = vdupq_n_f32(matrix_at[i]); + } + for (int i = 0; i < src_len; i++) { + src[i] = vld1q_f32(src_data + i * src_step); + } + MatrixMultiplyVec(vec_at, src, t, NULL, out_unit, in_unit, in_unit); + MatrixMultiplyVec(t, vec_a, m, bias_data, out_unit, in_unit, out_unit); + + for (int i = 0; i < out_unit; i++) { + int dst_k_offset = i * dst_step * C4NUM; + int m_k_offset = i * out_unit; + for (int j = 0; j < out_unit; j++) { + vst1q_f32(dst_data + dst_k_offset + j * C4NUM, m[m_k_offset + j]); + } + } +#else + float src[MAX_LEN]; + float t[MAX_LEN]; + float m[MAX_LEN]; + for (int i = 0; i < C4NUM; ++i) { + // load source data + for (int j = 0; j < src_len; ++j) { + src[j] = src_data[i + j * src_step]; + } + // AT * x * A + MatrixMultiply(matrix_at, src, t, 
out_unit, in_unit, in_unit); + MatrixMultiply(t, matrix_a, m, out_unit, in_unit, out_unit); + + // store output + for (int k = 0; k < out_unit; ++k) { + int dst_k_offset = k * dst_step * C4NUM; + int m_k_offset = k * out_unit; + for (int j = 0; j < out_unit; ++j) { + dst_data[i + dst_k_offset + j * C4NUM] = m[j + m_k_offset] + bias_data[i]; + } + } + } +#endif +} + +InputTransFunc GetInputTransFunc(int input_unit) { return InputTransFuncList[input_unit]; } void InputTransform4x4Unit(const float *src_data, float *dst_data, int src_step, int dst_step) { #ifdef ENABLE_ARM - float32x4_t src_data_00 = vld1q_f32(src_data + 0 * src_step); - float32x4_t src_data_01 = vld1q_f32(src_data + 1 * src_step); - float32x4_t src_data_02 = vld1q_f32(src_data + 2 * src_step); - float32x4_t src_data_03 = vld1q_f32(src_data + 3 * src_step); - float32x4_t src_data_10 = vld1q_f32(src_data + 4 * src_step); - float32x4_t src_data_11 = vld1q_f32(src_data + 5 * src_step); - float32x4_t src_data_12 = vld1q_f32(src_data + 6 * src_step); - float32x4_t src_data_13 = vld1q_f32(src_data + 7 * src_step); - float32x4_t src_data_20 = vld1q_f32(src_data + 8 * src_step); - float32x4_t src_data_21 = vld1q_f32(src_data + 9 * src_step); - float32x4_t src_data_22 = vld1q_f32(src_data + 10 * src_step); - float32x4_t src_data_23 = vld1q_f32(src_data + 11 * src_step); - float32x4_t src_data_30 = vld1q_f32(src_data + 12 * src_step); - float32x4_t src_data_31 = vld1q_f32(src_data + 13 * src_step); - float32x4_t src_data_32 = vld1q_f32(src_data + 14 * src_step); - float32x4_t src_data_33 = vld1q_f32(src_data + 15 * src_step); - - float32x4_t t00 = vsubq_f32(src_data_00, vmulq_n_f32(src_data_20, 4)); - float32x4_t t01 = vsubq_f32(src_data_01, vmulq_n_f32(src_data_21, 4)); - float32x4_t t02 = vsubq_f32(src_data_02, vmulq_n_f32(src_data_22, 4)); - float32x4_t t03 = vsubq_f32(src_data_03, vmulq_n_f32(src_data_23, 4)); - - float32x4_t t10 = vaddq_f32(src_data_10, vmulq_n_f32(src_data_20, 2)); - float32x4_t t11 = 
vaddq_f32(src_data_11, vmulq_n_f32(src_data_21, 2)); - float32x4_t t12 = vaddq_f32(src_data_12, vmulq_n_f32(src_data_22, 2)); - float32x4_t t13 = vaddq_f32(src_data_13, vmulq_n_f32(src_data_23, 2)); - - float32x4_t t20 = vsubq_f32(vmulq_n_f32(src_data_20, 2), src_data_10); - float32x4_t t21 = vsubq_f32(vmulq_n_f32(src_data_21, 2), src_data_11); - float32x4_t t22 = vsubq_f32(vmulq_n_f32(src_data_22, 2), src_data_12); - float32x4_t t23 = vsubq_f32(vmulq_n_f32(src_data_23, 2), src_data_13); - - float32x4_t t30 = vsubq_f32(src_data_30, vmulq_n_f32(src_data_10, 0.25)); - float32x4_t t31 = vsubq_f32(src_data_31, vmulq_n_f32(src_data_11, 0.25)); - float32x4_t t32 = vsubq_f32(src_data_32, vmulq_n_f32(src_data_12, 0.25)); - float32x4_t t33 = vsubq_f32(src_data_33, vmulq_n_f32(src_data_13, 0.25)); - - float32x4_t m00 = vsubq_f32(t00, vmulq_n_f32(t02, 4)); - float32x4_t m01 = vaddq_f32(t01, vmulq_n_f32(t02, 2)); - float32x4_t m02 = vsubq_f32(vmulq_n_f32(t02, 2), t01); - float32x4_t m03 = vsubq_f32(t03, vmulq_n_f32(t01, 0.25)); - - float32x4_t m10 = vsubq_f32(t10, vmulq_n_f32(t12, 4)); - float32x4_t m11 = vaddq_f32(t11, vmulq_n_f32(t12, 2)); - float32x4_t m12 = vsubq_f32(vmulq_n_f32(t12, 2), t11); - float32x4_t m13 = vsubq_f32(t13, vmulq_n_f32(t11, 0.25)); - - float32x4_t m20 = vsubq_f32(t20, vmulq_n_f32(t22, 4)); - float32x4_t m21 = vaddq_f32(t21, vmulq_n_f32(t22, 2)); - float32x4_t m22 = vsubq_f32(vmulq_n_f32(t22, 2), t21); - float32x4_t m23 = vsubq_f32(t23, vmulq_n_f32(t21, 0.25)); - - float32x4_t m30 = vsubq_f32(t30, vmulq_n_f32(t32, 4)); - float32x4_t m31 = vaddq_f32(t31, vmulq_n_f32(t32, 2)); - float32x4_t m32 = vsubq_f32(vmulq_n_f32(t32, 2), t31); - float32x4_t m33 = vsubq_f32(t33, vmulq_n_f32(t31, 0.25)); - - vst1q_f32(dst_data + 0 * dst_step, m00); - vst1q_f32(dst_data + 1 * dst_step, m01); - vst1q_f32(dst_data + 2 * dst_step, m02); - vst1q_f32(dst_data + 3 * dst_step, m03); - vst1q_f32(dst_data + 4 * dst_step, m10); - vst1q_f32(dst_data + 5 * dst_step, m11); - 
vst1q_f32(dst_data + 6 * dst_step, m12); - vst1q_f32(dst_data + 7 * dst_step, m13); - vst1q_f32(dst_data + 8 * dst_step, m20); - vst1q_f32(dst_data + 9 * dst_step, m21); - vst1q_f32(dst_data + 10 * dst_step, m22); - vst1q_f32(dst_data + 11 * dst_step, m23); - vst1q_f32(dst_data + 12 * dst_step, m30); - vst1q_f32(dst_data + 13 * dst_step, m31); - vst1q_f32(dst_data + 14 * dst_step, m32); - vst1q_f32(dst_data + 15 * dst_step, m33); + float32x4_t src[16]; + float32x4_t t[16]; + float32x4_t m[16]; + Load16Data; + for (int l = 0; l < 4; ++l) { + int offset = l * 4; + t[l] = vsubq_f32(src[offset], src[2 + offset]); + t[4 + l] = vaddq_f32(src[1 + offset], src[2 + offset]); + t[8 + l] = vsubq_f32(src[2 + offset], src[1 + offset]); + t[12 + l] = vsubq_f32(src[3 + offset], src[1 + offset]); + } + for (int l = 0; l < 4; ++l) { + int offset = l * 4; + m[l] = vsubq_f32(t[offset], t[2 + offset]); + m[4 + l] = vaddq_f32(t[1 + offset], t[2 + offset]); + m[8 + l] = vsubq_f32(t[2 + offset], t[1 + offset]); + m[12 + l] = vsubq_f32(t[3 + offset], t[1 + offset]); + } + for (int i = 0; i < 16; i++) { + vst1q_f32(dst_data + i * dst_step, m[i]); + } #else - for (int i = 0; i < C4NUM; i++) { - float src_data_00 = src_data[i]; - float src_data_01 = src_data[i + src_step]; - float src_data_02 = src_data[i + 2 * src_step]; - float src_data_03 = src_data[i + 3 * src_step]; - float src_data_10 = src_data[i + 4 * src_step]; - float src_data_11 = src_data[i + 5 * src_step]; - float src_data_12 = src_data[i + 6 * src_step]; - float src_data_13 = src_data[i + 7 * src_step]; - float src_data_20 = src_data[i + 8 * src_step]; - float src_data_21 = src_data[i + 9 * src_step]; - float src_data_22 = src_data[i + 10 * src_step]; - float src_data_23 = src_data[i + 11 * src_step]; - float src_data_30 = src_data[i + 12 * src_step]; - float src_data_31 = src_data[i + 13 * src_step]; - float src_data_32 = src_data[i + 14 * src_step]; - float src_data_33 = src_data[i + 15 * src_step]; + float src[16]; + float 
t[16]; + float m[16]; + for (int i = 0; i < C4NUM; ++i) { + for (int j = 0; j < 16; ++j) { + src[j] = src_data[i + j * src_step]; + } + for (int l = 0; l < 4; ++l) { + int offset = l * 4; + t[l] = src[offset] - src[2 + offset]; + t[4 + l] = src[1 + offset] + src[2 + offset]; + t[8 + l] = src[2 + offset] - src[1 + offset]; + t[12 + l] = src[3 + offset] - src[1 + offset]; + } + for (int l = 0; l < 4; ++l) { + int offset = l * 4; + m[l] = t[offset] - t[2 + offset]; + m[4 + l] = t[1 + offset] + t[2 + offset]; + m[8 + l] = t[2 + offset] - t[1 + offset]; + m[12 + l] = t[3 + offset] - t[1 + offset]; + } + for (int k = 0; k < 16; ++k) { + dst_data[i + k * dst_step] = m[k]; + } + } +#endif +} - float t00 = src_data_00 - 4 * src_data_20; - float t01 = src_data_01 - 4 * src_data_21; - float t02 = src_data_02 - 4 * src_data_22; - float t03 = src_data_03 - 4 * src_data_23; - - float t10 = src_data_10 + 2 * src_data_20; - float t11 = src_data_11 + 2 * src_data_21; - float t12 = src_data_12 + 2 * src_data_22; - float t13 = src_data_13 + 2 * src_data_23; - - const float t20 = 2 * src_data_20 - src_data_10; - const float t21 = 2 * src_data_21 - src_data_11; - const float t22 = 2 * src_data_22 - src_data_12; - const float t23 = 2 * src_data_23 - src_data_13; - - float t30 = src_data_30 - 0.25f * src_data_10; - float t31 = src_data_31 - 0.25f * src_data_11; - float t32 = src_data_32 - 0.25f * src_data_12; - float t33 = src_data_33 - 0.25f * src_data_13; - - float m00 = t00 - 4 * t02; - float m01 = t01 + 2 * t02; - const float m02 = 2 * t02 - t01; - float m03 = t03 - 0.25f * t01; - - float m10 = t10 - 4 * t12; - float m11 = t11 + 2 * t12; - const float m12 = 2 * t12 - t11; - float m13 = t13 - 0.25f * t11; - - float m20 = t20 - 4 * t22; - float m21 = t21 + 2 * t22; - const float m22 = 2 * t22 - t21; - float m23 = t23 - 0.25f * t21; - - float m30 = t30 - 4 * t32; - float m31 = t31 + 2 * t32; - const float m32 = 2 * t32 - t31; - float m33 = t33 - 0.25f * t31; - - (dst_data + i)[0] = m00; 
- (dst_data + i + dst_step)[0] = m01; - (dst_data + i + 2 * dst_step)[0] = m02; - (dst_data + i + 3 * dst_step)[0] = m03; - - (dst_data + i + 4 * dst_step)[0] = m10; - (dst_data + i + 5 * dst_step)[0] = m11; - (dst_data + i + 6 * dst_step)[0] = m12; - (dst_data + i + 7 * dst_step)[0] = m13; - - (dst_data + i + 8 * dst_step)[0] = m20; - (dst_data + i + 9 * dst_step)[0] = m21; - (dst_data + i + 10 * dst_step)[0] = m22; - (dst_data + i + 11 * dst_step)[0] = m23; - - (dst_data + i + 12 * dst_step)[0] = m30; - (dst_data + i + 13 * dst_step)[0] = m31; - (dst_data + i + 14 * dst_step)[0] = m32; - (dst_data + i + 15 * dst_step)[0] = m33; +void InputTransform6x6Unit(const float *src_data, float *dst_data, int src_step, int dst_step) { +#ifdef ENABLE_ARM + float32x4_t src[36]; + float32x4_t t[36]; + float32x4_t m[36]; + Load36Data; + for (int l = 0; l < 6; ++l) { + int offset = l * 6; + float32x4_t tmp1 = vsubq_f32(src[3 + offset], src[1 + offset]); + float32x4_t tmp2 = vsubq_f32(src[4 + offset], src[2 + offset]); + t[l] = vaddq_f32(vsubq_f32(vmulq_n_f32(src[offset], 4), vmulq_n_f32(src[2 + offset], 5)), src[4 + offset]); + t[6 + l] = vaddq_f32(vmulq_n_f32(vaddq_f32(src[1 + offset], src[2 + offset]), -4), + vaddq_f32(src[3 + offset], src[4 + offset])); + t[12 + l] = vaddq_f32(vmulq_n_f32(vsubq_f32(src[1 + offset], src[2 + offset]), 4), + vsubq_f32(src[4 + offset], src[3 + offset])); + t[18 + l] = vaddq_f32(vmulq_n_f32(tmp1, 2), tmp2); + t[24 + l] = vaddq_f32(vmulq_n_f32(tmp1, -2), tmp2); + t[30 + l] = vaddq_f32(vsubq_f32(vmulq_n_f32(src[1 + offset], 4), vmulq_n_f32(src[3 + offset], 5)), src[5 + offset]); + } + for (int l = 0; l < 6; ++l) { + int offset = l * 6; + float32x4_t tmp1 = vsubq_f32(t[3 + offset], t[1 + offset]); + float32x4_t tmp2 = vsubq_f32(t[4 + offset], t[2 + offset]); + m[l] = vaddq_f32(vsubq_f32(vmulq_n_f32(t[offset], 4), vmulq_n_f32(t[2 + offset], 5)), t[4 + offset]); + m[6 + l] = + vaddq_f32(vmulq_n_f32(vaddq_f32(t[1 + offset], t[2 + offset]), -4), 
vaddq_f32(t[3 + offset], t[4 + offset])); + m[12 + l] = + vaddq_f32(vmulq_n_f32(vsubq_f32(t[1 + offset], t[2 + offset]), 4), vsubq_f32(t[4 + offset], t[3 + offset])); + m[18 + l] = vaddq_f32(vmulq_n_f32(tmp1, 2), tmp2); + m[24 + l] = vaddq_f32(vmulq_n_f32(tmp1, -2), tmp2); + m[30 + l] = vaddq_f32(vsubq_f32(vmulq_n_f32(t[1 + offset], 4), vmulq_n_f32(t[3 + offset], 5)), t[5 + offset]); + } + for (int i = 0; i < 36; i++) { + vst1q_f32(dst_data + i * dst_step, m[i]); + } +#else + float src[36]; + float t[36]; + float m[36]; + for (int i = 0; i < C4NUM; ++i) { + for (int j = 0; j < 36; ++j) { + src[j] = src_data[i + j * src_step]; + } + for (int l = 0; l < 6; ++l) { + int offset = l * 6; + float tmp1 = src[3 + offset] - src[1 + offset]; + float tmp2 = src[4 + offset] - src[2 + offset]; + t[l] = 4 * src[offset] - 5 * src[2 + offset] + src[4 + offset]; + t[6 + l] = -4 * (src[1 + offset] + src[2 + offset]) + (src[3 + offset] + src[4 + offset]); + t[12 + l] = 4 * (src[1 + offset] - src[2 + offset]) + (src[4 + offset] - src[3 + offset]); + t[18 + l] = 2 * tmp1 + tmp2; + t[24 + l] = -2 * tmp1 + tmp2; + t[30 + l] = 4 * src[1 + offset] - 5 * src[3 + offset] + src[5 + offset]; + } + for (int l = 0; l < 6; ++l) { + int offset = l * 6; + float tmp1 = t[3 + offset] - t[1 + offset]; + float tmp2 = t[4 + offset] - t[2 + offset]; + m[l] = 4 * t[offset] - 5 * t[2 + offset] + t[4 + offset]; + m[6 + l] = -4 * (t[1 + offset] + t[2 + offset]) + (t[3 + offset] + t[4 + offset]); + m[12 + l] = 4 * (t[1 + offset] - t[2 + offset]) + (t[4 + offset] - t[3 + offset]); + m[18 + l] = 2 * tmp1 + tmp2; + m[24 + l] = -2 * tmp1 + tmp2; + m[30 + l] = 4 * t[1 + offset] - 5 * t[3 + offset] + t[5 + offset]; + } + for (int k = 0; k < 36; ++k) { + dst_data[i + k * dst_step] = m[k]; + } } #endif } void InputTransform8x8Unit(const float *src_data, float *dst_data, int src_step, int dst_step) { #ifdef ENABLE_ARM - float32x4_t src_data_00 = vld1q_f32(src_data + 0 * src_step); - float32x4_t src_data_01 = 
vld1q_f32(src_data + 1 * src_step); - float32x4_t src_data_02 = vld1q_f32(src_data + 2 * src_step); - float32x4_t src_data_03 = vld1q_f32(src_data + 3 * src_step); - float32x4_t src_data_04 = vld1q_f32(src_data + 4 * src_step); - float32x4_t src_data_05 = vld1q_f32(src_data + 5 * src_step); - float32x4_t src_data_06 = vld1q_f32(src_data + 6 * src_step); - float32x4_t src_data_07 = vld1q_f32(src_data + 7 * src_step); - float32x4_t src_data_10 = vld1q_f32(src_data + 8 * src_step); - float32x4_t src_data_11 = vld1q_f32(src_data + 9 * src_step); - float32x4_t src_data_12 = vld1q_f32(src_data + 10 * src_step); - float32x4_t src_data_13 = vld1q_f32(src_data + 11 * src_step); - float32x4_t src_data_14 = vld1q_f32(src_data + 12 * src_step); - float32x4_t src_data_15 = vld1q_f32(src_data + 13 * src_step); - float32x4_t src_data_16 = vld1q_f32(src_data + 14 * src_step); - float32x4_t src_data_17 = vld1q_f32(src_data + 15 * src_step); - float32x4_t src_data_20 = vld1q_f32(src_data + 16 * src_step); - float32x4_t src_data_21 = vld1q_f32(src_data + 17 * src_step); - float32x4_t src_data_22 = vld1q_f32(src_data + 18 * src_step); - float32x4_t src_data_23 = vld1q_f32(src_data + 19 * src_step); - float32x4_t src_data_24 = vld1q_f32(src_data + 20 * src_step); - float32x4_t src_data_25 = vld1q_f32(src_data + 21 * src_step); - float32x4_t src_data_26 = vld1q_f32(src_data + 22 * src_step); - float32x4_t src_data_27 = vld1q_f32(src_data + 23 * src_step); - float32x4_t src_data_30 = vld1q_f32(src_data + 24 * src_step); - float32x4_t src_data_31 = vld1q_f32(src_data + 25 * src_step); - float32x4_t src_data_32 = vld1q_f32(src_data + 26 * src_step); - float32x4_t src_data_33 = vld1q_f32(src_data + 27 * src_step); - float32x4_t src_data_34 = vld1q_f32(src_data + 28 * src_step); - float32x4_t src_data_35 = vld1q_f32(src_data + 29 * src_step); - float32x4_t src_data_36 = vld1q_f32(src_data + 30 * src_step); - float32x4_t src_data_37 = vld1q_f32(src_data + 31 * src_step); - float32x4_t 
src_data_40 = vld1q_f32(src_data + 32 * src_step); - float32x4_t src_data_41 = vld1q_f32(src_data + 33 * src_step); - float32x4_t src_data_42 = vld1q_f32(src_data + 34 * src_step); - float32x4_t src_data_43 = vld1q_f32(src_data + 35 * src_step); - float32x4_t src_data_44 = vld1q_f32(src_data + 36 * src_step); - float32x4_t src_data_45 = vld1q_f32(src_data + 37 * src_step); - float32x4_t src_data_46 = vld1q_f32(src_data + 38 * src_step); - float32x4_t src_data_47 = vld1q_f32(src_data + 39 * src_step); - float32x4_t src_data_50 = vld1q_f32(src_data + 40 * src_step); - float32x4_t src_data_51 = vld1q_f32(src_data + 41 * src_step); - float32x4_t src_data_52 = vld1q_f32(src_data + 42 * src_step); - float32x4_t src_data_53 = vld1q_f32(src_data + 43 * src_step); - float32x4_t src_data_54 = vld1q_f32(src_data + 44 * src_step); - float32x4_t src_data_55 = vld1q_f32(src_data + 45 * src_step); - float32x4_t src_data_56 = vld1q_f32(src_data + 46 * src_step); - float32x4_t src_data_57 = vld1q_f32(src_data + 47 * src_step); - float32x4_t src_data_60 = vld1q_f32(src_data + 48 * src_step); - float32x4_t src_data_61 = vld1q_f32(src_data + 49 * src_step); - float32x4_t src_data_62 = vld1q_f32(src_data + 50 * src_step); - float32x4_t src_data_63 = vld1q_f32(src_data + 51 * src_step); - float32x4_t src_data_64 = vld1q_f32(src_data + 52 * src_step); - float32x4_t src_data_65 = vld1q_f32(src_data + 53 * src_step); - float32x4_t src_data_66 = vld1q_f32(src_data + 54 * src_step); - float32x4_t src_data_67 = vld1q_f32(src_data + 55 * src_step); - float32x4_t src_data_70 = vld1q_f32(src_data + 56 * src_step); - float32x4_t src_data_71 = vld1q_f32(src_data + 57 * src_step); - float32x4_t src_data_72 = vld1q_f32(src_data + 58 * src_step); - float32x4_t src_data_73 = vld1q_f32(src_data + 59 * src_step); - float32x4_t src_data_74 = vld1q_f32(src_data + 60 * src_step); - float32x4_t src_data_75 = vld1q_f32(src_data + 61 * src_step); - float32x4_t src_data_76 = vld1q_f32(src_data + 62 * 
src_step); - float32x4_t src_data_77 = vld1q_f32(src_data + 63 * src_step); - - float32x4_t t00 = vsubq_f32(vaddq_f32(vsubq_f32(src_data_00, vmulq_n_f32(src_data_20, 5.44444444444444444444444445)), - vmulq_n_f32(src_data_40, 6.222222222222)), - vmulq_n_f32(src_data_60, 1.7777777777777)); - float32x4_t t01 = vsubq_f32(vaddq_f32(vsubq_f32(src_data_01, vmulq_n_f32(src_data_21, 5.44444444444444444444444445)), - vmulq_n_f32(src_data_41, 6.222222222222)), - vmulq_n_f32(src_data_61, 1.7777777777777)); - float32x4_t t02 = vsubq_f32(vaddq_f32(vsubq_f32(src_data_02, vmulq_n_f32(src_data_22, 5.44444444444444444444444445)), - vmulq_n_f32(src_data_42, 6.222222222222)), - vmulq_n_f32(src_data_62, 1.7777777777777)); - float32x4_t t03 = vsubq_f32(vaddq_f32(vsubq_f32(src_data_03, vmulq_n_f32(src_data_23, 5.44444444444444444444444445)), - vmulq_n_f32(src_data_43, 6.222222222222)), - vmulq_n_f32(src_data_63, 1.7777777777777)); - float32x4_t t04 = vsubq_f32(vaddq_f32(vsubq_f32(src_data_04, vmulq_n_f32(src_data_24, 5.44444444444444444444444445)), - vmulq_n_f32(src_data_44, 6.222222222222)), - vmulq_n_f32(src_data_64, 1.7777777777777)); - float32x4_t t05 = vsubq_f32(vaddq_f32(vsubq_f32(src_data_05, vmulq_n_f32(src_data_25, 5.44444444444444444444444445)), - vmulq_n_f32(src_data_45, 6.222222222222)), - vmulq_n_f32(src_data_65, 1.7777777777777)); - float32x4_t t06 = vsubq_f32(vaddq_f32(vsubq_f32(src_data_06, vmulq_n_f32(src_data_26, 5.44444444444444444444444445)), - vmulq_n_f32(src_data_46, 6.222222222222)), - vmulq_n_f32(src_data_66, 1.7777777777777)); - float32x4_t t07 = vsubq_f32(vaddq_f32(vsubq_f32(src_data_07, vmulq_n_f32(src_data_27, 5.44444444444444444444444445)), - vmulq_n_f32(src_data_47, 6.222222222222)), - vmulq_n_f32(src_data_67, 1.7777777777777)); - - float32x4_t t10 = - vaddq_f32(vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(src_data_10, 1.5), vmulq_n_f32(src_data_20, 3)), - vmulq_n_f32(src_data_30, 2.166666666666666667)), - vmulq_n_f32(src_data_40, 4.333333333333)), - 
vmulq_n_f32(src_data_50, 0.66666666666)), - vmulq_n_f32(src_data_60, 1.333333333333)); - float32x4_t t11 = - vaddq_f32(vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(src_data_11, 1.5), vmulq_n_f32(src_data_21, 3)), - vmulq_n_f32(src_data_31, 2.166666666666666667)), - vmulq_n_f32(src_data_41, 4.333333333333)), - vmulq_n_f32(src_data_51, 0.66666666666)), - vmulq_n_f32(src_data_61, 1.333333333333)); - float32x4_t t12 = - vaddq_f32(vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(src_data_12, 1.5), vmulq_n_f32(src_data_22, 3)), - vmulq_n_f32(src_data_32, 2.166666666666666667)), - vmulq_n_f32(src_data_42, 4.333333333333)), - vmulq_n_f32(src_data_52, 0.66666666666)), - vmulq_n_f32(src_data_62, 1.333333333333)); - float32x4_t t13 = - vaddq_f32(vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(src_data_13, 1.5), vmulq_n_f32(src_data_23, 3)), - vmulq_n_f32(src_data_33, 2.166666666666666667)), - vmulq_n_f32(src_data_43, 4.333333333333)), - vmulq_n_f32(src_data_53, 0.66666666666)), - vmulq_n_f32(src_data_63, 1.333333333333)); - float32x4_t t14 = - vaddq_f32(vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(src_data_14, 1.5), vmulq_n_f32(src_data_24, 3)), - vmulq_n_f32(src_data_34, 2.166666666666666667)), - vmulq_n_f32(src_data_44, 4.333333333333)), - vmulq_n_f32(src_data_54, 0.66666666666)), - vmulq_n_f32(src_data_64, 1.333333333333)); - float32x4_t t15 = - vaddq_f32(vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(src_data_15, 1.5), vmulq_n_f32(src_data_25, 3)), - vmulq_n_f32(src_data_35, 2.166666666666666667)), - vmulq_n_f32(src_data_45, 4.333333333333)), - vmulq_n_f32(src_data_55, 0.66666666666)), - vmulq_n_f32(src_data_65, 1.333333333333)); - float32x4_t t16 = - vaddq_f32(vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(src_data_16, 1.5), vmulq_n_f32(src_data_26, 3)), - vmulq_n_f32(src_data_36, 2.166666666666666667)), - vmulq_n_f32(src_data_46, 4.333333333333)), - vmulq_n_f32(src_data_56, 0.66666666666)), - vmulq_n_f32(src_data_66, 1.333333333333)); - 
float32x4_t t17 = - vaddq_f32(vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(src_data_17, 1.5), vmulq_n_f32(src_data_27, 3)), - vmulq_n_f32(src_data_37, 2.166666666666666667)), - vmulq_n_f32(src_data_47, 4.333333333333)), - vmulq_n_f32(src_data_57, 0.66666666666)), - vmulq_n_f32(src_data_67, 1.333333333333)); - - float32x4_t t20 = - vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(src_data_10, -1.5), vmulq_n_f32(src_data_20, 3)), - vmulq_n_f32(src_data_30, 2.166666666666666667)), - vmulq_n_f32(src_data_40, 4.333333333333)), - vmulq_n_f32(src_data_50, 0.66666666666)), - vmulq_n_f32(src_data_60, 1.333333333333)); - float32x4_t t21 = - vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(src_data_11, -1.5), vmulq_n_f32(src_data_21, 3)), - vmulq_n_f32(src_data_31, 2.166666666666666667)), - vmulq_n_f32(src_data_41, 4.333333333333)), - vmulq_n_f32(src_data_51, 0.66666666666)), - vmulq_n_f32(src_data_61, 1.333333333333)); - float32x4_t t22 = - vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(src_data_12, -1.5), vmulq_n_f32(src_data_22, 3)), - vmulq_n_f32(src_data_32, 2.166666666666666667)), - vmulq_n_f32(src_data_42, 4.333333333333)), - vmulq_n_f32(src_data_52, 0.66666666666)), - vmulq_n_f32(src_data_62, 1.333333333333)); - float32x4_t t23 = - vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(src_data_13, -1.5), vmulq_n_f32(src_data_23, 3)), - vmulq_n_f32(src_data_33, 2.166666666666666667)), - vmulq_n_f32(src_data_43, 4.333333333333)), - vmulq_n_f32(src_data_53, 0.66666666666)), - vmulq_n_f32(src_data_63, 1.333333333333)); - float32x4_t t24 = - vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(src_data_14, -1.5), vmulq_n_f32(src_data_24, 3)), - vmulq_n_f32(src_data_34, 2.166666666666666667)), - vmulq_n_f32(src_data_44, 4.333333333333)), - vmulq_n_f32(src_data_54, 0.66666666666)), - vmulq_n_f32(src_data_64, 1.333333333333)); - float32x4_t t25 = - 
vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(src_data_15, -1.5), vmulq_n_f32(src_data_25, 3)), - vmulq_n_f32(src_data_35, 2.166666666666666667)), - vmulq_n_f32(src_data_45, 4.333333333333)), - vmulq_n_f32(src_data_55, 0.66666666666)), - vmulq_n_f32(src_data_65, 1.333333333333)); - float32x4_t t26 = - vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(src_data_16, -1.5), vmulq_n_f32(src_data_26, 3)), - vmulq_n_f32(src_data_36, 2.166666666666666667)), - vmulq_n_f32(src_data_46, 4.333333333333)), - vmulq_n_f32(src_data_56, 0.66666666666)), - vmulq_n_f32(src_data_66, 1.333333333333)); - float32x4_t t27 = - vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(src_data_17, -1.5), vmulq_n_f32(src_data_27, 3)), - vmulq_n_f32(src_data_37, 2.166666666666666667)), - vmulq_n_f32(src_data_47, 4.333333333333)), - vmulq_n_f32(src_data_57, 0.66666666666)), - vmulq_n_f32(src_data_67, 1.333333333333)); - - float32x4_t t30 = vsubq_f32(vaddq_f32(vmulq_n_f32(vaddq_f32(src_data_30, src_data_40), 1.3333333333333), - vmulq_n_f32(vaddq_f32(src_data_10, src_data_20), -0.3)), - vmulq_n_f32(vaddq_f32(src_data_50, src_data_60), 0.53333333333)); - float32x4_t t31 = vsubq_f32(vaddq_f32(vmulq_n_f32(vaddq_f32(src_data_31, src_data_41), 1.3333333333333), - vmulq_n_f32(vaddq_f32(src_data_11, src_data_21), -0.3)), - vmulq_n_f32(vaddq_f32(src_data_51, src_data_61), 0.53333333333)); - float32x4_t t32 = vsubq_f32(vaddq_f32(vmulq_n_f32(vaddq_f32(src_data_32, src_data_42), 1.3333333333333), - vmulq_n_f32(vaddq_f32(src_data_12, src_data_22), -0.3)), - vmulq_n_f32(vaddq_f32(src_data_52, src_data_62), 0.53333333333)); - float32x4_t t33 = vsubq_f32(vaddq_f32(vmulq_n_f32(vaddq_f32(src_data_33, src_data_43), 1.3333333333333), - vmulq_n_f32(vaddq_f32(src_data_13, src_data_23), -0.3)), - vmulq_n_f32(vaddq_f32(src_data_53, src_data_63), 0.53333333333)); - float32x4_t t34 = vsubq_f32(vaddq_f32(vmulq_n_f32(vaddq_f32(src_data_34, src_data_44), 1.3333333333333), - 
vmulq_n_f32(vaddq_f32(src_data_14, src_data_24), -0.3)), - vmulq_n_f32(vaddq_f32(src_data_54, src_data_64), 0.53333333333)); - float32x4_t t35 = vsubq_f32(vaddq_f32(vmulq_n_f32(vaddq_f32(src_data_35, src_data_45), 1.3333333333333), - vmulq_n_f32(vaddq_f32(src_data_15, src_data_25), -0.3)), - vmulq_n_f32(vaddq_f32(src_data_55, src_data_65), 0.53333333333)); - float32x4_t t36 = vsubq_f32(vaddq_f32(vmulq_n_f32(vaddq_f32(src_data_36, src_data_46), 1.3333333333333), - vmulq_n_f32(vaddq_f32(src_data_16, src_data_26), -0.3)), - vmulq_n_f32(vaddq_f32(src_data_56, src_data_66), 0.53333333333)); - float32x4_t t37 = vsubq_f32(vaddq_f32(vmulq_n_f32(vaddq_f32(src_data_37, src_data_47), 1.3333333333333), - vmulq_n_f32(vaddq_f32(src_data_17, src_data_27), -0.3)), - vmulq_n_f32(vaddq_f32(src_data_57, src_data_67), 0.53333333333)); - - float32x4_t t40 = vaddq_f32(vaddq_f32(vmulq_n_f32(vsubq_f32(src_data_40, src_data_30), 1.3333333333333), - vmulq_n_f32(vsubq_f32(src_data_10, src_data_20), 0.3)), - vmulq_n_f32(vsubq_f32(src_data_50, src_data_60), 0.53333333333)); - float32x4_t t41 = vaddq_f32(vaddq_f32(vmulq_n_f32(vsubq_f32(src_data_41, src_data_31), 1.3333333333333), - vmulq_n_f32(vsubq_f32(src_data_11, src_data_21), 0.3)), - vmulq_n_f32(vsubq_f32(src_data_51, src_data_61), 0.53333333333)); - float32x4_t t42 = vaddq_f32(vaddq_f32(vmulq_n_f32(vsubq_f32(src_data_42, src_data_32), 1.3333333333333), - vmulq_n_f32(vsubq_f32(src_data_12, src_data_22), 0.3)), - vmulq_n_f32(vsubq_f32(src_data_52, src_data_62), 0.53333333333)); - float32x4_t t43 = vaddq_f32(vaddq_f32(vmulq_n_f32(vsubq_f32(src_data_43, src_data_33), 1.3333333333333), - vmulq_n_f32(vsubq_f32(src_data_13, src_data_23), 0.3)), - vmulq_n_f32(vsubq_f32(src_data_53, src_data_63), 0.53333333333)); - float32x4_t t44 = vaddq_f32(vaddq_f32(vmulq_n_f32(vsubq_f32(src_data_44, src_data_34), 1.3333333333333), - vmulq_n_f32(vsubq_f32(src_data_14, src_data_24), 0.3)), - vmulq_n_f32(vsubq_f32(src_data_54, src_data_64), 0.53333333333)); - 
float32x4_t t45 = vaddq_f32(vaddq_f32(vmulq_n_f32(vsubq_f32(src_data_45, src_data_35), 1.3333333333333), - vmulq_n_f32(vsubq_f32(src_data_15, src_data_25), 0.3)), - vmulq_n_f32(vsubq_f32(src_data_55, src_data_65), 0.53333333333)); - float32x4_t t46 = vaddq_f32(vaddq_f32(vmulq_n_f32(vsubq_f32(src_data_46, src_data_36), 1.3333333333333), - vmulq_n_f32(vsubq_f32(src_data_16, src_data_26), 0.3)), - vmulq_n_f32(vsubq_f32(src_data_56, src_data_66), 0.53333333333)); - float32x4_t t47 = vaddq_f32(vaddq_f32(vmulq_n_f32(vsubq_f32(src_data_47, src_data_37), 1.3333333333333), - vmulq_n_f32(vsubq_f32(src_data_17, src_data_27), 0.3)), - vmulq_n_f32(vsubq_f32(src_data_57, src_data_67), 0.53333333333)); - - float32x4_t t50 = vaddq_f32( - vaddq_f32( - vsubq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(src_data_10, 0.03333333), vmulq_n_f32(src_data_20, 0.022222222)), - vmulq_n_f32(src_data_30, 0.1666666666)), - vmulq_n_f32(src_data_40, 0.11111111111)), - vmulq_n_f32(src_data_50, 0.133333333)), - vmulq_n_f32(src_data_60, 0.088888888)); - float32x4_t t51 = vaddq_f32( - vaddq_f32( - vsubq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(src_data_11, 0.03333333), vmulq_n_f32(src_data_21, 0.022222222)), - vmulq_n_f32(src_data_31, 0.1666666666)), - vmulq_n_f32(src_data_41, 0.11111111111)), - vmulq_n_f32(src_data_51, 0.133333333)), - vmulq_n_f32(src_data_61, 0.088888888)); - float32x4_t t52 = vaddq_f32( - vaddq_f32( - vsubq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(src_data_12, 0.03333333), vmulq_n_f32(src_data_22, 0.022222222)), - vmulq_n_f32(src_data_32, 0.1666666666)), - vmulq_n_f32(src_data_42, 0.11111111111)), - vmulq_n_f32(src_data_52, 0.133333333)), - vmulq_n_f32(src_data_62, 0.088888888)); - float32x4_t t53 = vaddq_f32( - vaddq_f32( - vsubq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(src_data_13, 0.03333333), vmulq_n_f32(src_data_23, 0.022222222)), - vmulq_n_f32(src_data_33, 0.1666666666)), - vmulq_n_f32(src_data_43, 0.11111111111)), - vmulq_n_f32(src_data_53, 0.133333333)), - vmulq_n_f32(src_data_63, 0.088888888)); 
- float32x4_t t54 = vaddq_f32( - vaddq_f32( - vsubq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(src_data_14, 0.03333333), vmulq_n_f32(src_data_24, 0.022222222)), - vmulq_n_f32(src_data_34, 0.1666666666)), - vmulq_n_f32(src_data_44, 0.11111111111)), - vmulq_n_f32(src_data_54, 0.133333333)), - vmulq_n_f32(src_data_64, 0.088888888)); - float32x4_t t55 = vaddq_f32( - vaddq_f32( - vsubq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(src_data_15, 0.03333333), vmulq_n_f32(src_data_25, 0.022222222)), - vmulq_n_f32(src_data_35, 0.1666666666)), - vmulq_n_f32(src_data_45, 0.11111111111)), - vmulq_n_f32(src_data_55, 0.133333333)), - vmulq_n_f32(src_data_65, 0.088888888)); - float32x4_t t56 = vaddq_f32( - vaddq_f32( - vsubq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(src_data_16, 0.03333333), vmulq_n_f32(src_data_26, 0.022222222)), - vmulq_n_f32(src_data_36, 0.1666666666)), - vmulq_n_f32(src_data_46, 0.11111111111)), - vmulq_n_f32(src_data_56, 0.133333333)), - vmulq_n_f32(src_data_66, 0.088888888)); - float32x4_t t57 = vaddq_f32( - vaddq_f32( - vsubq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(src_data_17, 0.03333333), vmulq_n_f32(src_data_27, 0.022222222)), - vmulq_n_f32(src_data_37, 0.1666666666)), - vmulq_n_f32(src_data_47, 0.11111111111)), - vmulq_n_f32(src_data_57, 0.133333333)), - vmulq_n_f32(src_data_67, 0.088888888)); - - float32x4_t t60 = vaddq_f32( - vaddq_f32( - vsubq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(src_data_10, -0.03333333), vmulq_n_f32(src_data_20, 0.022222222)), - vmulq_n_f32(src_data_30, 0.1666666666)), - vmulq_n_f32(src_data_40, 0.11111111111)), - vmulq_n_f32(src_data_50, -0.133333333)), - vmulq_n_f32(src_data_60, 0.088888888)); - float32x4_t t61 = vaddq_f32( - vaddq_f32( - vsubq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(src_data_11, -0.03333333), vmulq_n_f32(src_data_21, 0.022222222)), - vmulq_n_f32(src_data_31, 0.1666666666)), - vmulq_n_f32(src_data_41, 0.11111111111)), - vmulq_n_f32(src_data_51, -0.133333333)), - vmulq_n_f32(src_data_61, 0.088888888)); - float32x4_t t62 = vaddq_f32( - vaddq_f32( 
- vsubq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(src_data_12, -0.03333333), vmulq_n_f32(src_data_22, 0.022222222)), - vmulq_n_f32(src_data_32, 0.1666666666)), - vmulq_n_f32(src_data_42, 0.11111111111)), - vmulq_n_f32(src_data_52, -0.133333333)), - vmulq_n_f32(src_data_62, 0.088888888)); - float32x4_t t63 = vaddq_f32( - vaddq_f32( - vsubq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(src_data_13, -0.03333333), vmulq_n_f32(src_data_23, 0.022222222)), - vmulq_n_f32(src_data_33, 0.1666666666)), - vmulq_n_f32(src_data_43, 0.11111111111)), - vmulq_n_f32(src_data_53, -0.133333333)), - vmulq_n_f32(src_data_63, 0.088888888)); - float32x4_t t64 = vaddq_f32( - vaddq_f32( - vsubq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(src_data_14, -0.03333333), vmulq_n_f32(src_data_24, 0.022222222)), - vmulq_n_f32(src_data_34, 0.1666666666)), - vmulq_n_f32(src_data_44, 0.11111111111)), - vmulq_n_f32(src_data_54, -0.133333333)), - vmulq_n_f32(src_data_64, 0.088888888)); - float32x4_t t65 = vaddq_f32( - vaddq_f32( - vsubq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(src_data_15, -0.03333333), vmulq_n_f32(src_data_25, 0.022222222)), - vmulq_n_f32(src_data_35, 0.1666666666)), - vmulq_n_f32(src_data_45, 0.11111111111)), - vmulq_n_f32(src_data_55, -0.133333333)), - vmulq_n_f32(src_data_65, 0.088888888)); - float32x4_t t66 = vaddq_f32( - vaddq_f32( - vsubq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(src_data_16, -0.03333333), vmulq_n_f32(src_data_26, 0.022222222)), - vmulq_n_f32(src_data_36, 0.1666666666)), - vmulq_n_f32(src_data_46, 0.11111111111)), - vmulq_n_f32(src_data_56, -0.133333333)), - vmulq_n_f32(src_data_66, 0.088888888)); - float32x4_t t67 = vaddq_f32( - vaddq_f32( - vsubq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(src_data_17, -0.03333333), vmulq_n_f32(src_data_27, 0.022222222)), - vmulq_n_f32(src_data_37, 0.1666666666)), - vmulq_n_f32(src_data_47, 0.11111111111)), - vmulq_n_f32(src_data_57, -0.133333333)), - vmulq_n_f32(src_data_67, 0.088888888)); - - float32x4_t t70 = vaddq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(src_data_30, 
3.0625), vmulq_n_f32(src_data_10, -0.5625)), - vmulq_n_f32(src_data_50, 3.5)), - src_data_70); - float32x4_t t71 = vaddq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(src_data_31, 3.0625), vmulq_n_f32(src_data_11, -0.5625)), - vmulq_n_f32(src_data_51, 3.5)), - src_data_71); - float32x4_t t72 = vaddq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(src_data_32, 3.0625), vmulq_n_f32(src_data_12, -0.5625)), - vmulq_n_f32(src_data_52, 3.5)), - src_data_72); - float32x4_t t73 = vaddq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(src_data_33, 3.0625), vmulq_n_f32(src_data_13, -0.5625)), - vmulq_n_f32(src_data_53, 3.5)), - src_data_73); - float32x4_t t74 = vaddq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(src_data_34, 3.0625), vmulq_n_f32(src_data_14, -0.5625)), - vmulq_n_f32(src_data_54, 3.5)), - src_data_74); - float32x4_t t75 = vaddq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(src_data_35, 3.0625), vmulq_n_f32(src_data_15, -0.5625)), - vmulq_n_f32(src_data_55, 3.5)), - src_data_75); - float32x4_t t76 = vaddq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(src_data_36, 3.0625), vmulq_n_f32(src_data_16, -0.5625)), - vmulq_n_f32(src_data_56, 3.5)), - src_data_76); - float32x4_t t77 = vaddq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(src_data_37, 3.0625), vmulq_n_f32(src_data_17, -0.5625)), - vmulq_n_f32(src_data_57, 3.5)), - src_data_77); - - float32x4_t m00 = - vsubq_f32(vaddq_f32(vsubq_f32(t00, vmulq_n_f32(t02, 5.444444444444444)), vmulq_n_f32(t04, 6.22222222222)), - vmulq_n_f32(t06, 1.77777777777777777778)); - float32x4_t m01 = vaddq_f32(vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(t01, 1.5), vmulq_n_f32(t02, 3)), - vmulq_n_f32(t03, 2.16666666666666667)), - vmulq_n_f32(t04, 4.3333333333)), - vmulq_n_f32(t05, 0.66666666667)), - vmulq_n_f32(t06, 1.333333333333)); - float32x4_t m02 = vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(t01, -1.5), vmulq_n_f32(t02, 3)), - vmulq_n_f32(t03, 2.16666666666666667)), - vmulq_n_f32(t04, 4.3333333333)), - vmulq_n_f32(t05, 0.66666666667)), - vmulq_n_f32(t06, 1.333333333333)); - 
float32x4_t m03 = - vaddq_f32(vaddq_f32(vmulq_n_f32(vaddq_f32(t01, t02), -0.3), vmulq_n_f32(vaddq_f32(t03, t04), 1.33333333333)), - vmulq_n_f32(vaddq_f32(t05, t06), -0.533333333333)); - float32x4_t m04 = - vaddq_f32(vaddq_f32(vmulq_n_f32(vsubq_f32(t01, t02), 0.3), vmulq_n_f32(vsubq_f32(t04, t03), 1.33333333333)), - vmulq_n_f32(vsubq_f32(t05, t06), 0.533333333333)); - float32x4_t m05 = - vaddq_f32(vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(t01, 0.03333333), vmulq_n_f32(t02, 0.0222222)), - vmulq_n_f32(t03, 0.16666666666666667)), - vmulq_n_f32(t04, 0.11111111111)), - vmulq_n_f32(t05, 0.1333333333)), - vmulq_n_f32(t06, 0.08888888888)); - float32x4_t m06 = - vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(t01, -0.03333333), vmulq_n_f32(t02, 0.0222222)), - vmulq_n_f32(t03, 0.16666666666666667)), - vmulq_n_f32(t04, 0.11111111111)), - vmulq_n_f32(t05, 0.1333333333)), - vmulq_n_f32(t06, 0.08888888888)); - float32x4_t m07 = - vaddq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(t01, -0.5625), vmulq_n_f32(t03, 3.0625)), vmulq_n_f32(t05, 3.5)), t07); - - float32x4_t m10 = - vsubq_f32(vaddq_f32(vsubq_f32(t10, vmulq_n_f32(t12, 5.444444444444444)), vmulq_n_f32(t14, 6.22222222222)), - vmulq_n_f32(t16, 1.77777777777777777778)); - float32x4_t m11 = vaddq_f32(vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(t11, 1.5), vmulq_n_f32(t12, 3)), - vmulq_n_f32(t13, 2.16666666666666667)), - vmulq_n_f32(t14, 4.3333333333)), - vmulq_n_f32(t15, 0.66666666667)), - vmulq_n_f32(t16, 1.333333333333)); - float32x4_t m12 = vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(t11, -1.5), vmulq_n_f32(t12, 3)), - vmulq_n_f32(t13, 2.16666666666666667)), - vmulq_n_f32(t14, 4.3333333333)), - vmulq_n_f32(t15, 0.66666666667)), - vmulq_n_f32(t16, 1.333333333333)); - float32x4_t m13 = - vaddq_f32(vaddq_f32(vmulq_n_f32(vaddq_f32(t11, t12), -0.3), vmulq_n_f32(vaddq_f32(t13, t14), 1.33333333333)), - vmulq_n_f32(vaddq_f32(t15, t16), -0.533333333333)); - float32x4_t m14 = - 
vaddq_f32(vaddq_f32(vmulq_n_f32(vsubq_f32(t11, t12), 0.3), vmulq_n_f32(vsubq_f32(t14, t13), 1.33333333333)), - vmulq_n_f32(vsubq_f32(t15, t16), 0.533333333333)); - float32x4_t m15 = - vaddq_f32(vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(t11, 0.03333333), vmulq_n_f32(t12, 0.0222222)), - vmulq_n_f32(t13, 0.16666666666666667)), - vmulq_n_f32(t14, 0.11111111111)), - vmulq_n_f32(t15, 0.1333333333)), - vmulq_n_f32(t16, 0.08888888888)); - float32x4_t m16 = - vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(t11, -0.03333333), vmulq_n_f32(t12, 0.0222222)), - vmulq_n_f32(t13, 0.16666666666666667)), - vmulq_n_f32(t14, 0.11111111111)), - vmulq_n_f32(t15, 0.1333333333)), - vmulq_n_f32(t16, 0.08888888888)); - float32x4_t m17 = - vaddq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(t11, -0.5625), vmulq_n_f32(t13, 3.0625)), vmulq_n_f32(t15, 3.5)), t17); - - float32x4_t m20 = - vsubq_f32(vaddq_f32(vsubq_f32(t20, vmulq_n_f32(t22, 5.444444444444444)), vmulq_n_f32(t24, 6.22222222222)), - vmulq_n_f32(t26, 1.77777777777777777778)); - float32x4_t m21 = vaddq_f32(vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(t21, 1.5), vmulq_n_f32(t22, 3)), - vmulq_n_f32(t23, 2.16666666666666667)), - vmulq_n_f32(t24, 4.3333333333)), - vmulq_n_f32(t25, 0.66666666667)), - vmulq_n_f32(t26, 1.333333333333)); - float32x4_t m22 = vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(t21, -1.5), vmulq_n_f32(t22, 3)), - vmulq_n_f32(t23, 2.16666666666666667)), - vmulq_n_f32(t24, 4.3333333333)), - vmulq_n_f32(t25, 0.66666666667)), - vmulq_n_f32(t26, 1.333333333333)); - float32x4_t m23 = - vaddq_f32(vaddq_f32(vmulq_n_f32(vaddq_f32(t21, t22), -0.3), vmulq_n_f32(vaddq_f32(t23, t24), 1.33333333333)), - vmulq_n_f32(vaddq_f32(t25, t26), -0.533333333333)); - float32x4_t m24 = - vaddq_f32(vaddq_f32(vmulq_n_f32(vsubq_f32(t21, t22), 0.3), vmulq_n_f32(vsubq_f32(t24, t23), 1.33333333333)), - vmulq_n_f32(vsubq_f32(t25, t26), 0.533333333333)); - float32x4_t m25 = - 
vaddq_f32(vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(t21, 0.03333333), vmulq_n_f32(t22, 0.0222222)), - vmulq_n_f32(t23, 0.16666666666666667)), - vmulq_n_f32(t24, 0.11111111111)), - vmulq_n_f32(t25, 0.1333333333)), - vmulq_n_f32(t26, 0.08888888888)); - float32x4_t m26 = - vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(t21, -0.03333333), vmulq_n_f32(t22, 0.0222222)), - vmulq_n_f32(t23, 0.16666666666666667)), - vmulq_n_f32(t24, 0.11111111111)), - vmulq_n_f32(t25, 0.1333333333)), - vmulq_n_f32(t26, 0.08888888888)); - float32x4_t m27 = - vaddq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(t21, -0.5625), vmulq_n_f32(t23, 3.0625)), vmulq_n_f32(t25, 3.5)), t27); - - float32x4_t m30 = - vsubq_f32(vaddq_f32(vsubq_f32(t30, vmulq_n_f32(t32, 5.444444444444444)), vmulq_n_f32(t34, 6.22222222222)), - vmulq_n_f32(t36, 1.77777777777777777778)); - float32x4_t m31 = vaddq_f32(vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(t31, 1.5), vmulq_n_f32(t32, 3)), - vmulq_n_f32(t33, 2.16666666666666667)), - vmulq_n_f32(t34, 4.3333333333)), - vmulq_n_f32(t35, 0.66666666667)), - vmulq_n_f32(t36, 1.333333333333)); - float32x4_t m32 = vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(t31, -1.5), vmulq_n_f32(t32, 3)), - vmulq_n_f32(t33, 2.16666666666666667)), - vmulq_n_f32(t34, 4.3333333333)), - vmulq_n_f32(t35, 0.66666666667)), - vmulq_n_f32(t36, 1.333333333333)); - float32x4_t m33 = - vaddq_f32(vaddq_f32(vmulq_n_f32(vaddq_f32(t31, t32), -0.3), vmulq_n_f32(vaddq_f32(t33, t34), 1.33333333333)), - vmulq_n_f32(vaddq_f32(t35, t36), -0.533333333333)); - float32x4_t m34 = - vaddq_f32(vaddq_f32(vmulq_n_f32(vsubq_f32(t31, t32), 0.3), vmulq_n_f32(vsubq_f32(t34, t33), 1.33333333333)), - vmulq_n_f32(vsubq_f32(t35, t36), 0.533333333333)); - float32x4_t m35 = - vaddq_f32(vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(t31, 0.03333333), vmulq_n_f32(t32, 0.0222222)), - vmulq_n_f32(t33, 0.16666666666666667)), - vmulq_n_f32(t34, 0.11111111111)), - vmulq_n_f32(t35, 0.1333333333)), 
- vmulq_n_f32(t36, 0.08888888888)); - float32x4_t m36 = - vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(t31, -0.03333333), vmulq_n_f32(t32, 0.0222222)), - vmulq_n_f32(t33, 0.16666666666666667)), - vmulq_n_f32(t34, 0.11111111111)), - vmulq_n_f32(t35, 0.1333333333)), - vmulq_n_f32(t36, 0.08888888888)); - float32x4_t m37 = - vaddq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(t31, -0.5625), vmulq_n_f32(t33, 3.0625)), vmulq_n_f32(t35, 3.5)), t37); - - float32x4_t m40 = - vsubq_f32(vaddq_f32(vsubq_f32(t40, vmulq_n_f32(t42, 5.444444444444444)), vmulq_n_f32(t44, 6.22222222222)), - vmulq_n_f32(t46, 1.77777777777777777778)); - float32x4_t m41 = vaddq_f32(vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(t41, 1.5), vmulq_n_f32(t42, 3)), - vmulq_n_f32(t43, 2.16666666666666667)), - vmulq_n_f32(t44, 4.3333333333)), - vmulq_n_f32(t45, 0.66666666667)), - vmulq_n_f32(t46, 1.333333333333)); - float32x4_t m42 = vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(t41, -1.5), vmulq_n_f32(t42, 3)), - vmulq_n_f32(t43, 2.16666666666666667)), - vmulq_n_f32(t44, 4.3333333333)), - vmulq_n_f32(t45, 0.66666666667)), - vmulq_n_f32(t46, 1.333333333333)); - float32x4_t m43 = - vaddq_f32(vaddq_f32(vmulq_n_f32(vaddq_f32(t41, t42), -0.3), vmulq_n_f32(vaddq_f32(t43, t44), 1.33333333333)), - vmulq_n_f32(vaddq_f32(t45, t46), -0.533333333333)); - float32x4_t m44 = - vaddq_f32(vaddq_f32(vmulq_n_f32(vsubq_f32(t41, t42), 0.3), vmulq_n_f32(vsubq_f32(t44, t43), 1.33333333333)), - vmulq_n_f32(vsubq_f32(t45, t46), 0.533333333333)); - float32x4_t m45 = - vaddq_f32(vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(t41, 0.03333333), vmulq_n_f32(t42, 0.0222222)), - vmulq_n_f32(t43, 0.16666666666666667)), - vmulq_n_f32(t44, 0.11111111111)), - vmulq_n_f32(t45, 0.1333333333)), - vmulq_n_f32(t46, 0.08888888888)); - float32x4_t m46 = - vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(t41, -0.03333333), vmulq_n_f32(t42, 0.0222222)), - vmulq_n_f32(t43, 0.16666666666666667)), - 
vmulq_n_f32(t44, 0.11111111111)), - vmulq_n_f32(t45, 0.1333333333)), - vmulq_n_f32(t46, 0.08888888888)); - float32x4_t m47 = - vaddq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(t41, -0.5625), vmulq_n_f32(t43, 3.0625)), vmulq_n_f32(t45, 3.5)), t47); - - float32x4_t m50 = - vsubq_f32(vaddq_f32(vsubq_f32(t50, vmulq_n_f32(t52, 5.444444444444444)), vmulq_n_f32(t54, 6.22222222222)), - vmulq_n_f32(t56, 1.77777777777777777778)); - float32x4_t m51 = vaddq_f32(vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(t51, 1.5), vmulq_n_f32(t52, 3)), - vmulq_n_f32(t53, 2.16666666666666667)), - vmulq_n_f32(t54, 4.3333333333)), - vmulq_n_f32(t55, 0.66666666667)), - vmulq_n_f32(t56, 1.333333333333)); - float32x4_t m52 = vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(t51, -1.5), vmulq_n_f32(t52, 3)), - vmulq_n_f32(t53, 2.16666666666666667)), - vmulq_n_f32(t54, 4.3333333333)), - vmulq_n_f32(t55, 0.66666666667)), - vmulq_n_f32(t56, 1.333333333333)); - float32x4_t m53 = - vaddq_f32(vaddq_f32(vmulq_n_f32(vaddq_f32(t51, t52), -0.3), vmulq_n_f32(vaddq_f32(t53, t54), 1.33333333333)), - vmulq_n_f32(vaddq_f32(t55, t56), -0.533333333333)); - float32x4_t m54 = - vaddq_f32(vaddq_f32(vmulq_n_f32(vsubq_f32(t51, t52), 0.3), vmulq_n_f32(vsubq_f32(t54, t53), 1.33333333333)), - vmulq_n_f32(vsubq_f32(t55, t56), 0.533333333333)); - float32x4_t m55 = - vaddq_f32(vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(t51, 0.03333333), vmulq_n_f32(t52, 0.0222222)), - vmulq_n_f32(t53, 0.16666666666666667)), - vmulq_n_f32(t54, 0.11111111111)), - vmulq_n_f32(t55, 0.1333333333)), - vmulq_n_f32(t56, 0.08888888888)); - float32x4_t m56 = - vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(t51, -0.03333333), vmulq_n_f32(t52, 0.0222222)), - vmulq_n_f32(t53, 0.16666666666666667)), - vmulq_n_f32(t54, 0.11111111111)), - vmulq_n_f32(t55, 0.1333333333)), - vmulq_n_f32(t56, 0.08888888888)); - float32x4_t m57 = - vaddq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(t51, -0.5625), vmulq_n_f32(t53, 3.0625)), 
vmulq_n_f32(t55, 3.5)), t57); - - float32x4_t m60 = - vsubq_f32(vaddq_f32(vsubq_f32(t60, vmulq_n_f32(t62, 5.444444444444444)), vmulq_n_f32(t64, 6.22222222222)), - vmulq_n_f32(t66, 1.77777777777777777778)); - float32x4_t m61 = vaddq_f32(vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(t61, 1.5), vmulq_n_f32(t62, 3)), - vmulq_n_f32(t63, 2.16666666666666667)), - vmulq_n_f32(t64, 4.3333333333)), - vmulq_n_f32(t65, 0.66666666667)), - vmulq_n_f32(t66, 1.333333333333)); - float32x4_t m62 = vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(t61, -1.5), vmulq_n_f32(t62, 3)), - vmulq_n_f32(t63, 2.16666666666666667)), - vmulq_n_f32(t64, 4.3333333333)), - vmulq_n_f32(t65, 0.66666666667)), - vmulq_n_f32(t66, 1.333333333333)); - float32x4_t m63 = - vaddq_f32(vaddq_f32(vmulq_n_f32(vaddq_f32(t61, t62), -0.3), vmulq_n_f32(vaddq_f32(t63, t64), 1.33333333333)), - vmulq_n_f32(vaddq_f32(t65, t66), -0.533333333333)); - float32x4_t m64 = - vaddq_f32(vaddq_f32(vmulq_n_f32(vsubq_f32(t61, t62), 0.3), vmulq_n_f32(vsubq_f32(t64, t63), 1.33333333333)), - vmulq_n_f32(vsubq_f32(t65, t66), 0.533333333333)); - float32x4_t m65 = - vaddq_f32(vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(t61, 0.03333333), vmulq_n_f32(t62, 0.0222222)), - vmulq_n_f32(t63, 0.16666666666666667)), - vmulq_n_f32(t64, 0.11111111111)), - vmulq_n_f32(t65, 0.1333333333)), - vmulq_n_f32(t66, 0.08888888888)); - float32x4_t m66 = - vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(t61, -0.03333333), vmulq_n_f32(t62, 0.0222222)), - vmulq_n_f32(t63, 0.16666666666666667)), - vmulq_n_f32(t64, 0.11111111111)), - vmulq_n_f32(t65, 0.1333333333)), - vmulq_n_f32(t66, 0.08888888888)); - float32x4_t m67 = - vaddq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(t61, -0.5625), vmulq_n_f32(t63, 3.0625)), vmulq_n_f32(t65, 3.5)), t67); - - float32x4_t m70 = - vsubq_f32(vaddq_f32(vsubq_f32(t70, vmulq_n_f32(t72, 5.444444444444444)), vmulq_n_f32(t74, 6.22222222222)), - vmulq_n_f32(t76, 1.77777777777777777778)); - float32x4_t m71 
= vaddq_f32(vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(t71, 1.5), vmulq_n_f32(t72, 3)), - vmulq_n_f32(t73, 2.16666666666666667)), - vmulq_n_f32(t74, 4.3333333333)), - vmulq_n_f32(t75, 0.66666666667)), - vmulq_n_f32(t76, 1.333333333333)); - float32x4_t m72 = vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(t71, -1.5), vmulq_n_f32(t72, 3)), - vmulq_n_f32(t73, 2.16666666666666667)), - vmulq_n_f32(t74, 4.3333333333)), - vmulq_n_f32(t75, 0.66666666667)), - vmulq_n_f32(t76, 1.333333333333)); - float32x4_t m73 = - vaddq_f32(vaddq_f32(vmulq_n_f32(vaddq_f32(t71, t72), -0.3), vmulq_n_f32(vaddq_f32(t73, t74), 1.33333333333)), - vmulq_n_f32(vaddq_f32(t75, t76), -0.533333333333)); - float32x4_t m74 = - vaddq_f32(vaddq_f32(vmulq_n_f32(vsubq_f32(t71, t72), 0.3), vmulq_n_f32(vsubq_f32(t74, t73), 1.33333333333)), - vmulq_n_f32(vsubq_f32(t75, t76), 0.533333333333)); - float32x4_t m75 = - vaddq_f32(vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(t71, 0.03333333), vmulq_n_f32(t72, 0.0222222)), - vmulq_n_f32(t73, 0.16666666666666667)), - vmulq_n_f32(t74, 0.11111111111)), - vmulq_n_f32(t75, 0.1333333333)), - vmulq_n_f32(t76, 0.08888888888)); - float32x4_t m76 = - vaddq_f32(vsubq_f32(vsubq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(t71, -0.03333333), vmulq_n_f32(t72, 0.0222222)), - vmulq_n_f32(t73, 0.16666666666666667)), - vmulq_n_f32(t74, 0.11111111111)), - vmulq_n_f32(t75, 0.1333333333)), - vmulq_n_f32(t76, 0.08888888888)); - float32x4_t m77 = - vaddq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(t71, -0.5625), vmulq_n_f32(t73, 3.0625)), vmulq_n_f32(t75, 3.5)), t77); - - vst1q_f32(dst_data + 0 * dst_step, m00); - vst1q_f32(dst_data + 1 * dst_step, m01); - vst1q_f32(dst_data + 2 * dst_step, m02); - vst1q_f32(dst_data + 3 * dst_step, m03); - vst1q_f32(dst_data + 4 * dst_step, m04); - vst1q_f32(dst_data + 5 * dst_step, m05); - vst1q_f32(dst_data + 6 * dst_step, m06); - vst1q_f32(dst_data + 7 * dst_step, m07); - vst1q_f32(dst_data + 8 * dst_step, m10); - vst1q_f32(dst_data + 9 * 
dst_step, m11); - vst1q_f32(dst_data + 10 * dst_step, m12); - vst1q_f32(dst_data + 11 * dst_step, m13); - vst1q_f32(dst_data + 12 * dst_step, m14); - vst1q_f32(dst_data + 13 * dst_step, m15); - vst1q_f32(dst_data + 14 * dst_step, m16); - vst1q_f32(dst_data + 15 * dst_step, m17); - vst1q_f32(dst_data + 16 * dst_step, m20); - vst1q_f32(dst_data + 17 * dst_step, m21); - vst1q_f32(dst_data + 18 * dst_step, m22); - vst1q_f32(dst_data + 19 * dst_step, m23); - vst1q_f32(dst_data + 20 * dst_step, m24); - vst1q_f32(dst_data + 21 * dst_step, m25); - vst1q_f32(dst_data + 22 * dst_step, m26); - vst1q_f32(dst_data + 23 * dst_step, m27); - vst1q_f32(dst_data + 24 * dst_step, m30); - vst1q_f32(dst_data + 25 * dst_step, m31); - vst1q_f32(dst_data + 26 * dst_step, m32); - vst1q_f32(dst_data + 27 * dst_step, m33); - vst1q_f32(dst_data + 28 * dst_step, m34); - vst1q_f32(dst_data + 29 * dst_step, m35); - vst1q_f32(dst_data + 30 * dst_step, m36); - vst1q_f32(dst_data + 31 * dst_step, m37); - vst1q_f32(dst_data + 32 * dst_step, m40); - vst1q_f32(dst_data + 33 * dst_step, m41); - vst1q_f32(dst_data + 34 * dst_step, m42); - vst1q_f32(dst_data + 35 * dst_step, m43); - vst1q_f32(dst_data + 36 * dst_step, m44); - vst1q_f32(dst_data + 37 * dst_step, m45); - vst1q_f32(dst_data + 38 * dst_step, m46); - vst1q_f32(dst_data + 39 * dst_step, m47); - vst1q_f32(dst_data + 40 * dst_step, m50); - vst1q_f32(dst_data + 41 * dst_step, m51); - vst1q_f32(dst_data + 42 * dst_step, m52); - vst1q_f32(dst_data + 43 * dst_step, m53); - vst1q_f32(dst_data + 44 * dst_step, m54); - vst1q_f32(dst_data + 45 * dst_step, m55); - vst1q_f32(dst_data + 46 * dst_step, m56); - vst1q_f32(dst_data + 47 * dst_step, m57); - vst1q_f32(dst_data + 48 * dst_step, m60); - vst1q_f32(dst_data + 49 * dst_step, m61); - vst1q_f32(dst_data + 50 * dst_step, m62); - vst1q_f32(dst_data + 51 * dst_step, m63); - vst1q_f32(dst_data + 52 * dst_step, m64); - vst1q_f32(dst_data + 53 * dst_step, m65); - vst1q_f32(dst_data + 54 * dst_step, m66); - 
vst1q_f32(dst_data + 55 * dst_step, m67); - vst1q_f32(dst_data + 56 * dst_step, m70); - vst1q_f32(dst_data + 57 * dst_step, m71); - vst1q_f32(dst_data + 58 * dst_step, m72); - vst1q_f32(dst_data + 59 * dst_step, m73); - vst1q_f32(dst_data + 60 * dst_step, m74); - vst1q_f32(dst_data + 61 * dst_step, m75); - vst1q_f32(dst_data + 62 * dst_step, m76); - vst1q_f32(dst_data + 63 * dst_step, m77); + float32x4_t src[64]; + float32x4_t t[64]; + float32x4_t m[64]; + Load64Data; + for (int l = 0; l < 8; ++l) { + int offset = l * 8; + t[l] = vsubq_f32(vaddq_f32(vsubq_f32(vmulq_n_f32(src[offset], 36), vmulq_n_f32(src[2 + offset], 49)), + vmulq_n_f32(src[4 + offset], 14)), + src[6 + offset]); + float32x4_t tmp1 = vaddq_f32(vmulq_n_f32(src[1 + offset], 36), src[5 + offset]); + float32x4_t tmp2 = vsubq_f32(vmulq_n_f32(src[2 + offset], 36), vmulq_n_f32(src[4 + offset], 13)); + t[8 + l] = vaddq_f32(vsubq_f32(vaddq_f32(tmp1, tmp2), vmulq_n_f32(src[3 + offset], 13)), src[6 + offset]); + t[16 + l] = vaddq_f32(vaddq_f32(vsubq_f32(tmp2, tmp1), vmulq_n_f32(src[3 + offset], 13)), src[6 + offset]); + tmp1 = vaddq_f32(vmulq_n_f32(src[1 + offset], 18), vmulq_n_f32(src[5 + offset], 2)); + tmp2 = vsubq_f32(vmulq_n_f32(src[2 + offset], 9), vmulq_n_f32(src[4 + offset], 10)); + t[24 + l] = vaddq_f32(vsubq_f32(vaddq_f32(tmp1, tmp2), vmulq_n_f32(src[3 + offset], 20)), src[6 + offset]); + t[32 + l] = vaddq_f32(vaddq_f32(vsubq_f32(tmp2, tmp1), vmulq_n_f32(src[3 + offset], 20)), src[6 + offset]); + tmp1 = vaddq_f32(vmulq_n_f32(src[1 + offset], 12), vmulq_n_f32(src[5 + offset], 3)); + tmp2 = vsubq_f32(vmulq_n_f32(src[2 + offset], 4), vmulq_n_f32(src[4 + offset], 5)); + t[40 + l] = vaddq_f32(vsubq_f32(vaddq_f32(tmp1, tmp2), vmulq_n_f32(src[3 + offset], 15)), src[6 + offset]); + t[48 + l] = vaddq_f32(vaddq_f32(vsubq_f32(tmp2, tmp1), vmulq_n_f32(src[3 + offset], 15)), src[6 + offset]); + t[56 + l] = vaddq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(src[1 + offset], -36), vmulq_n_f32(src[3 + offset], 49)), + 
vmulq_n_f32(src[5 + offset], 14)), + src[7 + offset]); + } + for (int l = 0; l < 8; ++l) { + int offset = l * 8; + m[l] = vsubq_f32( + vaddq_f32(vsubq_f32(vmulq_n_f32(t[offset], 36), vmulq_n_f32(t[2 + offset], 49)), vmulq_n_f32(t[4 + offset], 14)), + t[6 + offset]); + float32x4_t tmp1 = vaddq_f32(vmulq_n_f32(t[1 + offset], 36), t[5 + offset]); + float32x4_t tmp2 = vsubq_f32(vmulq_n_f32(t[2 + offset], 36), vmulq_n_f32(t[4 + offset], 13)); + m[8 + l] = vaddq_f32(vsubq_f32(vaddq_f32(tmp1, tmp2), vmulq_n_f32(t[3 + offset], 13)), t[6 + offset]); + m[16 + l] = vaddq_f32(vaddq_f32(vsubq_f32(tmp2, tmp1), vmulq_n_f32(t[3 + offset], 13)), t[6 + offset]); + tmp1 = vaddq_f32(vmulq_n_f32(t[1 + offset], 18), vmulq_n_f32(t[5 + offset], 2)); + tmp2 = vsubq_f32(vmulq_n_f32(t[2 + offset], 9), vmulq_n_f32(t[4 + offset], 10)); + m[24 + l] = vaddq_f32(vsubq_f32(vaddq_f32(tmp1, tmp2), vmulq_n_f32(t[3 + offset], 20)), t[6 + offset]); + m[32 + l] = vaddq_f32(vaddq_f32(vsubq_f32(tmp2, tmp1), vmulq_n_f32(t[3 + offset], 20)), t[6 + offset]); + tmp1 = vaddq_f32(vmulq_n_f32(t[1 + offset], 12), vmulq_n_f32(t[5 + offset], 3)); + tmp2 = vsubq_f32(vmulq_n_f32(t[2 + offset], 4), vmulq_n_f32(t[4 + offset], 5)); + m[40 + l] = vaddq_f32(vsubq_f32(vaddq_f32(tmp1, tmp2), vmulq_n_f32(t[3 + offset], 15)), t[6 + offset]); + m[48 + l] = vaddq_f32(vaddq_f32(vsubq_f32(tmp2, tmp1), vmulq_n_f32(t[3 + offset], 15)), t[6 + offset]); + m[56 + l] = vaddq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(t[1 + offset], -36), vmulq_n_f32(t[3 + offset], 49)), + vmulq_n_f32(t[5 + offset], 14)), + t[7 + offset]); + } + for (int i = 0; i < 64; i++) { + vst1q_f32(dst_data + i * dst_step, m[i]); + } #else - for (int i = 0; i < C4NUM; i++) { - float src_data_00 = src_data[i]; - float src_data_01 = src_data[i + src_step]; - float src_data_02 = src_data[i + 2 * src_step]; - float src_data_03 = src_data[i + 3 * src_step]; - float src_data_04 = src_data[i + 4 * src_step]; - float src_data_05 = src_data[i + 5 * src_step]; - float src_data_06 
= src_data[i + 6 * src_step]; - float src_data_07 = src_data[i + 7 * src_step]; - float src_data_10 = src_data[i + 8 * src_step]; - float src_data_11 = src_data[i + 9 * src_step]; - float src_data_12 = src_data[i + 10 * src_step]; - float src_data_13 = src_data[i + 11 * src_step]; - float src_data_14 = src_data[i + 12 * src_step]; - float src_data_15 = src_data[i + 13 * src_step]; - float src_data_16 = src_data[i + 14 * src_step]; - float src_data_17 = src_data[i + 15 * src_step]; - float src_data_20 = src_data[i + 16 * src_step]; - float src_data_21 = src_data[i + 17 * src_step]; - float src_data_22 = src_data[i + 18 * src_step]; - float src_data_23 = src_data[i + 19 * src_step]; - float src_data_24 = src_data[i + 20 * src_step]; - float src_data_25 = src_data[i + 21 * src_step]; - float src_data_26 = src_data[i + 22 * src_step]; - float src_data_27 = src_data[i + 23 * src_step]; - float src_data_30 = src_data[i + 24 * src_step]; - float src_data_31 = src_data[i + 25 * src_step]; - float src_data_32 = src_data[i + 26 * src_step]; - float src_data_33 = src_data[i + 27 * src_step]; - float src_data_34 = src_data[i + 28 * src_step]; - float src_data_35 = src_data[i + 29 * src_step]; - float src_data_36 = src_data[i + 30 * src_step]; - float src_data_37 = src_data[i + 31 * src_step]; - float src_data_40 = src_data[i + 32 * src_step]; - float src_data_41 = src_data[i + 33 * src_step]; - float src_data_42 = src_data[i + 34 * src_step]; - float src_data_43 = src_data[i + 35 * src_step]; - float src_data_44 = src_data[i + 36 * src_step]; - float src_data_45 = src_data[i + 37 * src_step]; - float src_data_46 = src_data[i + 38 * src_step]; - float src_data_47 = src_data[i + 39 * src_step]; - float src_data_50 = src_data[i + 40 * src_step]; - float src_data_51 = src_data[i + 41 * src_step]; - float src_data_52 = src_data[i + 42 * src_step]; - float src_data_53 = src_data[i + 43 * src_step]; - float src_data_54 = src_data[i + 44 * src_step]; - float src_data_55 = src_data[i + 
45 * src_step]; - float src_data_56 = src_data[i + 46 * src_step]; - float src_data_57 = src_data[i + 47 * src_step]; - float src_data_60 = src_data[i + 48 * src_step]; - float src_data_61 = src_data[i + 49 * src_step]; - float src_data_62 = src_data[i + 50 * src_step]; - float src_data_63 = src_data[i + 51 * src_step]; - float src_data_64 = src_data[i + 52 * src_step]; - float src_data_65 = src_data[i + 53 * src_step]; - float src_data_66 = src_data[i + 54 * src_step]; - float src_data_67 = src_data[i + 55 * src_step]; - float src_data_70 = src_data[i + 56 * src_step]; - float src_data_71 = src_data[i + 57 * src_step]; - float src_data_72 = src_data[i + 58 * src_step]; - float src_data_73 = src_data[i + 59 * src_step]; - float src_data_74 = src_data[i + 60 * src_step]; - float src_data_75 = src_data[i + 61 * src_step]; - float src_data_76 = src_data[i + 62 * src_step]; - float src_data_77 = src_data[i + 63 * src_step]; - - float t00 = src_data_00 - 5.444444444444444445125f * src_data_20 + 6.222222222222222222223f * src_data_40 - - 1.77777777777777778f * src_data_60; - float t01 = src_data_01 - 5.444444444444444445125f * src_data_21 + 6.222222222222222222223f * src_data_41 - - 1.77777777777777778f * src_data_61; - float t02 = src_data_02 - 5.444444444444444445125f * src_data_22 + 6.222222222222222222223f * src_data_42 - - 1.77777777777777778f * src_data_62; - float t03 = src_data_03 - 5.444444444444444445125f * src_data_23 + 6.222222222222222222223f * src_data_43 - - 1.77777777777777778f * src_data_63; - float t04 = src_data_04 - 5.444444444444444445125f * src_data_24 + 6.222222222222222222223f * src_data_44 - - 1.77777777777777778f * src_data_64; - float t05 = src_data_05 - 5.444444444444444445125f * src_data_25 + 6.222222222222222222223f * src_data_45 - - 1.77777777777777778f * src_data_65; - float t06 = src_data_06 - 5.444444444444444445125f * src_data_26 + 6.222222222222222222223f * src_data_46 - - 1.77777777777777778f * src_data_66; - float t07 = src_data_07 - 
5.444444444444444445125f * src_data_27 + 6.222222222222222222223f * src_data_47 - - 1.77777777777777778f * src_data_67; - - const float t10 = 1.5f * src_data_10 + 3.0f * src_data_20 - 2.1666666666666667f * src_data_30 - - 4.333333333333333333f * src_data_40 + 0.66666666666666667f * src_data_50 + - 1.333333333333333f * src_data_60; - const float t11 = 1.5f * src_data_11 + 3.0f * src_data_21 - 2.1666666666666667f * src_data_31 - - 4.333333333333333333f * src_data_41 + 0.66666666666666667f * src_data_51 + - 1.333333333333333f * src_data_61; - const float t12 = 1.5f * src_data_12 + 3.0f * src_data_22 - 2.1666666666666667f * src_data_32 - - 4.333333333333333333f * src_data_42 + 0.66666666666666667f * src_data_52 + - 1.333333333333333f * src_data_62; - const float t13 = 1.5f * src_data_13 + 3.0f * src_data_23 - 2.1666666666666667f * src_data_33 - - 4.333333333333333333f * src_data_43 + 0.66666666666666667f * src_data_53 + - 1.333333333333333f * src_data_63; - const float t14 = 1.5f * src_data_14 + 3.0f * src_data_24 - 2.1666666666666667f * src_data_34 - - 4.333333333333333333f * src_data_44 + 0.66666666666666667f * src_data_54 + - 1.333333333333333f * src_data_64; - const float t15 = 1.5f * src_data_15 + 3.0f * src_data_25 - 2.1666666666666667f * src_data_35 - - 4.333333333333333333f * src_data_45 + 0.66666666666666667f * src_data_55 + - 1.333333333333333f * src_data_65; - const float t16 = 1.5f * src_data_16 + 3.0f * src_data_26 - 2.1666666666666667f * src_data_36 - - 4.333333333333333333f * src_data_46 + 0.66666666666666667f * src_data_56 + - 1.333333333333333f * src_data_66; - const float t17 = 1.5f * src_data_17 + 3.0f * src_data_27 - 2.1666666666666667f * src_data_37 - - 4.333333333333333333f * src_data_47 + 0.66666666666666667f * src_data_57 + - 1.333333333333333f * src_data_67; - - const float t20 = -1.5f * src_data_10 + 3.0f * src_data_20 + 2.1666666666666667f * src_data_30 - - 4.333333333333333333f * src_data_40 - 0.66666666666666667f * src_data_50 + - 
1.333333333333333f * src_data_60; - const float t21 = -1.5f * src_data_11 + 3.0f * src_data_21 + 2.1666666666666667f * src_data_31 - - 4.333333333333333333f * src_data_41 - 0.66666666666666667f * src_data_51 + - 1.333333333333333f * src_data_61; - const float t22 = -1.5f * src_data_12 + 3.0f * src_data_22 + 2.1666666666666667f * src_data_32 - - 4.333333333333333333f * src_data_42 - 0.66666666666666667f * src_data_52 + - 1.333333333333333f * src_data_62; - const float t23 = -1.5f * src_data_13 + 3.0f * src_data_23 + 2.1666666666666667f * src_data_33 - - 4.333333333333333333f * src_data_43 - 0.66666666666666667f * src_data_53 + - 1.333333333333333f * src_data_63; - const float t24 = -1.5f * src_data_14 + 3.0f * src_data_24 + 2.1666666666666667f * src_data_34 - - 4.333333333333333333f * src_data_44 - 0.66666666666666667f * src_data_54 + - 1.333333333333333f * src_data_64; - const float t25 = -1.5f * src_data_15 + 3.0f * src_data_25 + 2.1666666666666667f * src_data_35 - - 4.333333333333333333f * src_data_45 - 0.66666666666666667f * src_data_55 + - 1.333333333333333f * src_data_65; - const float t26 = -1.5f * src_data_16 + 3.0f * src_data_26 + 2.1666666666666667f * src_data_36 - - 4.333333333333333333f * src_data_46 - 0.66666666666666667f * src_data_56 + - 1.333333333333333f * src_data_66; - const float t27 = -1.5f * src_data_17 + 3.0f * src_data_27 + 2.1666666666666667f * src_data_37 - - 4.333333333333333333f * src_data_47 - 0.66666666666666667f * src_data_57 + - 1.333333333333333f * src_data_67; - - const float t30 = -0.3f * (src_data_10 + src_data_20) + 1.33333333333333f * (src_data_30 + src_data_40) - - 0.53333333333f * (src_data_50 + src_data_60); - const float t31 = -0.3f * (src_data_11 + src_data_21) + 1.33333333333333f * (src_data_31 + src_data_41) - - 0.53333333333f * (src_data_51 + src_data_61); - const float t32 = -0.3f * (src_data_12 + src_data_22) + 1.33333333333333f * (src_data_32 + src_data_42) - - 0.53333333333f * (src_data_52 + src_data_62); - const 
float t33 = -0.3f * (src_data_13 + src_data_23) + 1.33333333333333f * (src_data_33 + src_data_43) - - 0.53333333333f * (src_data_53 + src_data_63); - const float t34 = -0.3f * (src_data_14 + src_data_24) + 1.33333333333333f * (src_data_34 + src_data_44) - - 0.53333333333f * (src_data_54 + src_data_64); - const float t35 = -0.3f * (src_data_15 + src_data_25) + 1.33333333333333f * (src_data_35 + src_data_45) - - 0.53333333333f * (src_data_55 + src_data_65); - const float t36 = -0.3f * (src_data_16 + src_data_26) + 1.33333333333333f * (src_data_36 + src_data_46) - - 0.53333333333f * (src_data_56 + src_data_66); - const float t37 = -0.3f * (src_data_17 + src_data_27) + 1.33333333333333f * (src_data_37 + src_data_47) - - 0.53333333333f * (src_data_57 + src_data_67); - - const float t40 = 0.3f * (src_data_10 - src_data_20) + 1.33333333333333f * (src_data_40 - src_data_30) + - 0.53333333333f * (src_data_50 - src_data_60); - const float t41 = 0.3f * (src_data_11 - src_data_21) + 1.33333333333333f * (src_data_41 - src_data_31) + - 0.53333333333f * (src_data_51 - src_data_61); - const float t42 = 0.3f * (src_data_12 - src_data_22) + 1.33333333333333f * (src_data_42 - src_data_32) + - 0.53333333333f * (src_data_52 - src_data_62); - const float t43 = 0.3f * (src_data_13 - src_data_23) + 1.33333333333333f * (src_data_43 - src_data_33) + - 0.53333333333f * (src_data_53 - src_data_63); - const float t44 = 0.3f * (src_data_14 - src_data_24) + 1.33333333333333f * (src_data_44 - src_data_34) + - 0.53333333333f * (src_data_54 - src_data_64); - const float t45 = 0.3f * (src_data_15 - src_data_25) + 1.33333333333333f * (src_data_45 - src_data_35) + - 0.53333333333f * (src_data_55 - src_data_65); - const float t46 = 0.3f * (src_data_16 - src_data_26) + 1.33333333333333f * (src_data_46 - src_data_36) + - 0.53333333333f * (src_data_56 - src_data_66); - const float t47 = 0.3f * (src_data_17 - src_data_27) + 1.33333333333333f * (src_data_47 - src_data_37) + - 0.53333333333f * (src_data_57 - 
src_data_67); - - const float t50 = 0.0333333333f * src_data_10 + 0.02222222f * src_data_20 - 0.1666666666f * src_data_30 - - 0.1111111111f * src_data_40 + 0.1333333f * src_data_50 + 0.0888888f * src_data_60; - const float t51 = 0.0333333333f * src_data_11 + 0.02222222f * src_data_21 - 0.1666666666f * src_data_31 - - 0.1111111111f * src_data_41 + 0.1333333f * src_data_51 + 0.0888888f * src_data_61; - const float t52 = 0.0333333333f * src_data_12 + 0.02222222f * src_data_22 - 0.1666666666f * src_data_32 - - 0.1111111111f * src_data_42 + 0.1333333f * src_data_52 + 0.0888888f * src_data_62; - const float t53 = 0.0333333333f * src_data_13 + 0.02222222f * src_data_23 - 0.1666666666f * src_data_33 - - 0.1111111111f * src_data_43 + 0.1333333f * src_data_53 + 0.0888888f * src_data_63; - const float t54 = 0.0333333333f * src_data_14 + 0.02222222f * src_data_24 - 0.1666666666f * src_data_34 - - 0.1111111111f * src_data_44 + 0.1333333f * src_data_54 + 0.0888888f * src_data_64; - const float t55 = 0.0333333333f * src_data_15 + 0.02222222f * src_data_25 - 0.1666666666f * src_data_35 - - 0.1111111111f * src_data_45 + 0.1333333f * src_data_55 + 0.0888888f * src_data_65; - const float t56 = 0.0333333333f * src_data_16 + 0.02222222f * src_data_26 - 0.1666666666f * src_data_36 - - 0.1111111111f * src_data_46 + 0.1333333f * src_data_56 + 0.0888888f * src_data_66; - const float t57 = 0.0333333333f * src_data_17 + 0.02222222f * src_data_27 - 0.1666666666f * src_data_37 - - 0.1111111111f * src_data_47 + 0.1333333f * src_data_57 + 0.0888888f * src_data_67; - - const float t60 = -0.0333333333f * src_data_10 + 0.02222222f * src_data_20 + 0.1666666666f * src_data_30 - - 0.1111111111f * src_data_40 - 0.1333333f * src_data_50 + 0.0888888f * src_data_60; - const float t61 = -0.0333333333f * src_data_11 + 0.02222222f * src_data_21 + 0.1666666666f * src_data_31 - - 0.1111111111f * src_data_41 - 0.1333333f * src_data_51 + 0.0888888f * src_data_61; - const float t62 = -0.0333333333f * src_data_12 
+ 0.02222222f * src_data_22 + 0.1666666666f * src_data_32 - - 0.1111111111f * src_data_42 - 0.1333333f * src_data_52 + 0.0888888f * src_data_62; - const float t63 = -0.0333333333f * src_data_13 + 0.02222222f * src_data_23 + 0.1666666666f * src_data_33 - - 0.1111111111f * src_data_43 - 0.1333333f * src_data_53 + 0.0888888f * src_data_63; - const float t64 = -0.0333333333f * src_data_14 + 0.02222222f * src_data_24 + 0.1666666666f * src_data_34 - - 0.1111111111f * src_data_44 - 0.1333333f * src_data_54 + 0.0888888f * src_data_64; - const float t65 = -0.0333333333f * src_data_15 + 0.02222222f * src_data_25 + 0.1666666666f * src_data_35 - - 0.1111111111f * src_data_45 - 0.1333333f * src_data_55 + 0.0888888f * src_data_65; - const float t66 = -0.0333333333f * src_data_16 + 0.02222222f * src_data_26 + 0.1666666666f * src_data_36 - - 0.1111111111f * src_data_46 - 0.1333333f * src_data_56 + 0.0888888f * src_data_66; - const float t67 = -0.0333333333f * src_data_17 + 0.02222222f * src_data_27 + 0.1666666666f * src_data_37 - - 0.1111111111f * src_data_47 - 0.1333333f * src_data_57 + 0.0888888f * src_data_67; - - const float t70 = -0.5625f * src_data_10 + 3.0625f * src_data_30 - 3.5f * src_data_50 + src_data_70; - const float t71 = -0.5625f * src_data_11 + 3.0625f * src_data_31 - 3.5f * src_data_51 + src_data_71; - const float t72 = -0.5625f * src_data_12 + 3.0625f * src_data_32 - 3.5f * src_data_52 + src_data_72; - const float t73 = -0.5625f * src_data_13 + 3.0625f * src_data_33 - 3.5f * src_data_53 + src_data_73; - const float t74 = -0.5625f * src_data_14 + 3.0625f * src_data_34 - 3.5f * src_data_54 + src_data_74; - const float t75 = -0.5625f * src_data_15 + 3.0625f * src_data_35 - 3.5f * src_data_55 + src_data_75; - const float t76 = -0.5625f * src_data_16 + 3.0625f * src_data_36 - 3.5f * src_data_56 + src_data_76; - const float t77 = -0.5625f * src_data_17 + 3.0625f * src_data_37 - 3.5f * src_data_57 + src_data_77; - - const float m00 = - t00 - 5.444444444444444445125f * 
t02 + 6.222222222222222222223f * t04 - 1.77777777777777778f * t06; - const float m01 = 1.5f * t01 + 3.0f * t02 - 2.1666666666666667f * t03 - 4.333333333333333333f * t04 + - 0.66666666666666667f * t05 + 1.333333333333333f * t06; - const float m02 = -1.5f * t01 + 3.0f * t02 + 2.1666666666666667f * t03 - 4.333333333333333333f * t04 - - 0.66666666666666667f * t05 + 1.333333333333333f * t06; - const float m03 = -0.3f * (t01 + t02) + 1.33333333333333f * (t03 + t04) - 0.53333333333f * (t05 + t06); - const float m04 = 0.3f * (t01 - t02) + 1.33333333333333f * (t04 - t03) + 0.53333333333f * (t05 - t06); - const float m05 = 0.0333333333f * t01 + 0.02222222f * t02 - 0.1666666666f * t03 - 0.1111111111f * t04 + - 0.1333333f * t05 + 0.0888888f * t06; - const float m06 = -0.0333333333f * t01 + 0.02222222f * t02 + 0.1666666666f * t03 - 0.1111111111f * t04 - - 0.1333333f * t05 + 0.0888888f * t06; - const float m07 = -0.5625f * t01 + 3.0625f * t03 - 3.5f * t05 + t07; - - float m10 = t10 - 5.444444444444444445125f * t12 + 6.222222222222222222223f * t14 - 1.77777777777777778f * t16; - const float m11 = 1.5f * t11 + 3.0f * t12 - 2.1666666666666667f * t13 - 4.333333333333333333f * t14 + - 0.66666666666666667f * t15 + 1.333333333333333f * t16; - const float m12 = -1.5f * t11 + 3.0f * t12 + 2.1666666666666667f * t13 - 4.333333333333333333f * t14 - - 0.66666666666666667f * t15 + 1.333333333333333f * t16; - const float m13 = -0.3f * (t11 + t12) + 1.33333333333333f * (t13 + t14) - 0.53333333333f * (t15 + t16); - const float m14 = 0.3f * (t11 - t12) + 1.33333333333333f * (t14 - t13) + 0.53333333333f * (t15 - t16); - const float m15 = 0.0333333333f * t11 + 0.02222222f * t12 - 0.1666666666f * t13 - 0.1111111111f * t14 + - 0.1333333f * t15 + 0.0888888f * t16; - const float m16 = -0.0333333333f * t11 + 0.02222222f * t12 + 0.1666666666f * t13 - 0.1111111111f * t14 - - 0.1333333f * t15 + 0.0888888f * t16; - const float m17 = -0.5625f * t11 + 3.0625f * t13 - 3.5f * t15 + t17; - - const float m20 = - 
t20 - 5.444444444444444445125f * t22 + 6.222222222222222222223f * t24 - 1.77777777777777778f * t26; - const float m21 = 1.5f * t21 + 3.0f * t22 - 2.1666666666666667f * t23 - 4.333333333333333333f * t24 + - 0.66666666666666667f * t25 + 1.333333333333333f * t26; - const float m22 = -1.5f * t21 + 3.0f * t22 + 2.1666666666666667f * t23 - 4.333333333333333333f * t24 - - 0.66666666666666667f * t25 + 1.333333333333333f * t26; - const float m23 = -0.3f * (t21 + t22) + 1.33333333333333f * (t23 + t24) - 0.53333333333f * (t25 + t26); - const float m24 = 0.3f * (t21 - t22) + 1.33333333333333f * (t24 - t23) + 0.53333333333f * (t25 - t26); - const float m25 = 0.0333333333f * t21 + 0.02222222f * t22 - 0.1666666666f * t23 - 0.1111111111f * t24 + - 0.1333333f * t25 + 0.0888888f * t26; - const float m26 = -0.0333333333f * t21 + 0.02222222f * t22 + 0.1666666666f * t23 - 0.1111111111f * t24 - - 0.1333333f * t25 + 0.0888888f * t26; - const float m27 = -0.5625f * t21 + 3.0625f * t23 - 3.5f * t25 + t27; - - float m30 = t30 - 5.444444444444444445125f * t32 + 6.222222222222222222223f * t34 - 1.77777777777777778f * t36; - const float m31 = 1.5f * t31 + 3.0f * t32 - 2.1666666666666667f * t33 - 4.333333333333333333f * t34 + - 0.66666666666666667f * t35 + 1.333333333333333f * t36; - const float m32 = -1.5f * t31 + 3.0f * t32 + 2.1666666666666667f * t33 - 4.333333333333333333f * t34 - - 0.66666666666666667f * t35 + 1.333333333333333f * t36; - const float m33 = -0.3f * (t31 + t32) + 1.33333333333333f * (t33 + t34) - 0.53333333333f * (t35 + t36); - const float m34 = 0.3f * (t31 - t32) + 1.33333333333333f * (t34 - t33) + 0.53333333333f * (t35 - t36); - const float m35 = 0.0333333333f * t31 + 0.02222222f * t32 - 0.1666666666f * t33 - 0.1111111111f * t34 + - 0.1333333f * t35 + 0.0888888f * t36; - const float m36 = -0.0333333333f * t31 + 0.02222222f * t32 + 0.1666666666f * t33 - 0.1111111111f * t34 - - 0.1333333f * t35 + 0.0888888f * t36; - const float m37 = -0.5625f * t31 + 3.0625f * t33 - 3.5f * 
t35 + t37; - - const float m40 = - t40 - 5.444444444444444445125f * t42 + 6.222222222222222222223f * t44 - 1.77777777777777778f * t46; - const float m41 = 1.5f * t41 + 3.0f * t42 - 2.1666666666666667f * t43 - 4.333333333333333333f * t44 + - 0.66666666666666667f * t45 + 1.333333333333333f * t46; - const float m42 = -1.5f * t41 + 3.0f * t42 + 2.1666666666666667f * t43 - 4.333333333333333333f * t44 - - 0.66666666666666667f * t45 + 1.333333333333333f * t46; - const float m43 = -0.3f * (t41 + t42) + 1.33333333333333f * (t43 + t44) - 0.53333333333f * (t45 + t46); - const float m44 = 0.3f * (t41 - t42) + 1.33333333333333f * (t44 - t43) + 0.53333333333f * (t45 - t46); - const float m45 = 0.0333333333f * t41 + 0.02222222f * t42 - 0.1666666666f * t43 - 0.1111111111f * t44 + - 0.1333333f * t45 + 0.0888888f * t46; - const float m46 = -0.0333333333f * t41 + 0.02222222f * t42 + 0.1666666666f * t43 - 0.1111111111f * t44 - - 0.1333333f * t45 + 0.0888888f * t46; - const float m47 = -0.5625f * t41 + 3.0625f * t43 - 3.5f * t45 + t47; - - float m50 = t50 - 5.444444444444444445125f * t52 + 6.222222222222222222223f * t54 - 1.77777777777777778f * t56; - const float m51 = 1.5f * t51 + 3.0f * t52 - 2.1666666666666667f * t53 - 4.333333333333333333f * t54 + - 0.66666666666666667f * t55 + 1.333333333333333f * t56; - const float m52 = -1.5f * t51 + 3.0f * t52 + 2.1666666666666667f * t53 - 4.333333333333333333f * t54 - - 0.66666666666666667f * t55 + 1.333333333333333f * t56; - const float m53 = -0.3f * (t51 + t52) + 1.33333333333333f * (t53 + t54) - 0.53333333333f * (t55 + t56); - const float m54 = 0.3f * (t51 - t52) + 1.33333333333333f * (t54 - t53) + 0.53333333333f * (t55 - t56); - const float m55 = 0.0333333333f * t51 + 0.02222222f * t52 - 0.1666666666f * t53 - 0.1111111111f * t54 + - 0.1333333f * t55 + 0.0888888f * t56; - const float m56 = -0.0333333333f * t51 + 0.02222222f * t52 + 0.1666666666f * t53 - 0.1111111111f * t54 - - 0.1333333f * t55 + 0.0888888f * t56; - const float m57 = 
-0.5625f * t51 + 3.0625f * t53 - 3.5f * t55 + t57; - - float m60 = t60 - 5.444444444444444445125f * t62 + 6.222222222222222222223f * t64 - 1.77777777777777778f * t66; - const float m61 = 1.5f * t61 + 3.0f * t62 - 2.1666666666666667f * t63 - 4.333333333333333333f * t64 + - 0.66666666666666667f * t65 + 1.333333333333333f * t66; - const float m62 = -1.5f * t61 + 3.0f * t62 + 2.1666666666666667f * t63 - 4.333333333333333333f * t64 - - 0.66666666666666667f * t65 + 1.333333333333333f * t66; - const float m63 = -0.3f * (t61 + t62) + 1.33333333333333f * (t63 + t64) - 0.53333333333f * (t65 + t66); - const float m64 = 0.3f * (t61 - t62) + 1.33333333333333f * (t64 - t63) + 0.53333333333f * (t65 - t66); - const float m65 = 0.0333333333f * t61 + 0.02222222f * t62 - 0.1666666666f * t63 - 0.1111111111f * t64 + - 0.1333333f * t65 + 0.0888888f * t66; - const float m66 = -0.0333333333f * t61 + 0.02222222f * t62 + 0.1666666666f * t63 - 0.1111111111f * t64 - - 0.1333333f * t65 + 0.0888888f * t66; - const float m67 = -0.5625f * t61 + 3.0625f * t63 - 3.5f * t65 + t67; - - float m70 = t70 - 5.444444444444444445125f * t72 + 6.222222222222222222223f * t74 - 1.77777777777777778f * t76; - const float m71 = 1.5f * t71 + 3.0f * t72 - 2.1666666666666667f * t73 - 4.333333333333333333f * t74 + - 0.66666666666666667f * t75 + 1.333333333333333f * t76; - const float m72 = -1.5f * t71 + 3.0f * t72 + 2.1666666666666667f * t73 - 4.333333333333333333f * t74 - - 0.66666666666666667f * t75 + 1.333333333333333f * t76; - const float m73 = -0.3f * (t71 + t72) + 1.33333333333333f * (t73 + t74) - 0.53333333333f * (t75 + t76); - const float m74 = 0.3f * (t71 - t72) + 1.33333333333333f * (t74 - t73) + 0.53333333333f * (t75 - t76); - const float m75 = 0.0333333333f * t71 + 0.02222222f * t72 - 0.1666666666f * t73 - 0.1111111111f * t74 + - 0.1333333f * t75 + 0.0888888f * t76; - const float m76 = -0.0333333333f * t71 + 0.02222222f * t72 + 0.1666666666f * t73 - 0.1111111111f * t74 - - 0.1333333f * t75 + 0.0888888f * 
t76; - const float m77 = -0.5625f * t71 + 3.0625f * t73 - 3.5f * t75 + t77; - - (dst_data + i)[0] = m00; - (dst_data + i + dst_step)[0] = m01; - (dst_data + i + 2 * dst_step)[0] = m02; - (dst_data + i + 3 * dst_step)[0] = m03; - (dst_data + i + 4 * dst_step)[0] = m04; - (dst_data + i + 5 * dst_step)[0] = m05; - (dst_data + i + 6 * dst_step)[0] = m06; - (dst_data + i + 7 * dst_step)[0] = m07; - - (dst_data + i + 8 * dst_step)[0] = m10; - (dst_data + i + 9 * dst_step)[0] = m11; - (dst_data + i + 10 * dst_step)[0] = m12; - (dst_data + i + 11 * dst_step)[0] = m13; - (dst_data + i + 12 * dst_step)[0] = m14; - (dst_data + i + 13 * dst_step)[0] = m15; - (dst_data + i + 14 * dst_step)[0] = m16; - (dst_data + i + 15 * dst_step)[0] = m17; - - (dst_data + i + 16 * dst_step)[0] = m20; - (dst_data + i + 17 * dst_step)[0] = m21; - (dst_data + i + 18 * dst_step)[0] = m22; - (dst_data + i + 19 * dst_step)[0] = m23; - (dst_data + i + 20 * dst_step)[0] = m24; - (dst_data + i + 21 * dst_step)[0] = m25; - (dst_data + i + 22 * dst_step)[0] = m26; - (dst_data + i + 23 * dst_step)[0] = m27; - - (dst_data + i + 24 * dst_step)[0] = m30; - (dst_data + i + 25 * dst_step)[0] = m31; - (dst_data + i + 26 * dst_step)[0] = m32; - (dst_data + i + 27 * dst_step)[0] = m33; - (dst_data + i + 28 * dst_step)[0] = m34; - (dst_data + i + 29 * dst_step)[0] = m35; - (dst_data + i + 30 * dst_step)[0] = m36; - (dst_data + i + 31 * dst_step)[0] = m37; - - (dst_data + i + 32 * dst_step)[0] = m40; - (dst_data + i + 33 * dst_step)[0] = m41; - (dst_data + i + 34 * dst_step)[0] = m42; - (dst_data + i + 35 * dst_step)[0] = m43; - (dst_data + i + 36 * dst_step)[0] = m44; - (dst_data + i + 37 * dst_step)[0] = m45; - (dst_data + i + 38 * dst_step)[0] = m46; - (dst_data + i + 39 * dst_step)[0] = m47; - - (dst_data + i + 40 * dst_step)[0] = m50; - (dst_data + i + 41 * dst_step)[0] = m51; - (dst_data + i + 42 * dst_step)[0] = m52; - (dst_data + i + 43 * dst_step)[0] = m53; - (dst_data + i + 44 * dst_step)[0] = m54; - 
(dst_data + i + 45 * dst_step)[0] = m55; - (dst_data + i + 46 * dst_step)[0] = m56; - (dst_data + i + 47 * dst_step)[0] = m57; - - (dst_data + i + 48 * dst_step)[0] = m60; - (dst_data + i + 49 * dst_step)[0] = m61; - (dst_data + i + 50 * dst_step)[0] = m62; - (dst_data + i + 51 * dst_step)[0] = m63; - (dst_data + i + 52 * dst_step)[0] = m64; - (dst_data + i + 53 * dst_step)[0] = m65; - (dst_data + i + 54 * dst_step)[0] = m66; - (dst_data + i + 55 * dst_step)[0] = m67; - - (dst_data + i + 56 * dst_step)[0] = m70; - (dst_data + i + 57 * dst_step)[0] = m71; - (dst_data + i + 58 * dst_step)[0] = m72; - (dst_data + i + 59 * dst_step)[0] = m73; - (dst_data + i + 60 * dst_step)[0] = m74; - (dst_data + i + 61 * dst_step)[0] = m75; - (dst_data + i + 62 * dst_step)[0] = m76; - (dst_data + i + 63 * dst_step)[0] = m77; + float src[64]; + float t[64]; + float m[64]; + for (int i = 0; i < C4NUM; ++i) { + for (int j = 0; j < 64; ++j) { + src[j] = src_data[i + j * src_step]; + } + for (int l = 0; l < 8; ++l) { + int offset = l * 8; + t[l] = 36 * src[offset] - 49 * src[2 + offset] + 14 * src[4 + offset] - src[6 + offset]; + float tmp1 = 36 * src[1 + offset] + src[5 + offset]; + float tmp2 = 36 * src[2 + offset] - 13 * src[4 + offset]; + t[8 + l] = tmp1 + tmp2 - 13 * src[3 + offset] + src[6 + offset]; + t[16 + l] = tmp2 - tmp1 + 13 * src[3 + offset] + src[6 + offset]; + tmp1 = 18 * src[1 + offset] + 2 * src[5 + offset]; + tmp2 = 9 * src[2 + offset] - 10 * src[4 + offset]; + t[24 + l] = tmp1 + tmp2 - 20 * src[3 + offset] + src[6 + offset]; + t[32 + l] = tmp2 - tmp1 + 20 * src[3 + offset] + src[6 + offset]; + tmp1 = 12 * src[1 + offset] + 3 * src[5 + offset]; + tmp2 = 4 * src[2 + offset] - 5 * src[4 + offset]; + t[40 + l] = tmp1 + tmp2 - 15 * src[3 + offset] + src[6 + offset]; + t[48 + l] = tmp2 - tmp1 + 15 * src[3 + offset] + src[6 + offset]; + t[56 + l] = -36 * src[1 + offset] + 49 * src[3 + offset] - 14 * src[5 + offset] + src[7 + offset]; + } + for (int l = 0; l < 8; ++l) { + int 
offset = l * 8; + m[l] = 36 * t[offset] - 49 * t[2 + offset] + 14 * t[4 + offset] - t[6 + offset]; + float tmp1 = 36 * t[1 + offset] + t[5 + offset]; + float tmp2 = 36 * t[2 + offset] - 13 * t[4 + offset]; + m[8 + l] = tmp1 + tmp2 - 13 * t[3 + offset] + t[6 + offset]; + m[16 + l] = tmp2 - tmp1 + 13 * t[3 + offset] + t[6 + offset]; + tmp1 = 18 * t[1 + offset] + 2 * t[5 + offset]; + tmp2 = 9 * t[2 + offset] - 10 * t[4 + offset]; + m[24 + l] = tmp1 + tmp2 - 20 * t[3 + offset] + t[6 + offset]; + m[32 + l] = tmp2 - tmp1 + 20 * t[3 + offset] + t[6 + offset]; + tmp1 = 12 * t[1 + offset] + 3 * t[5 + offset]; + tmp2 = 4 * t[2 + offset] - 5 * t[4 + offset]; + m[40 + l] = tmp1 + tmp2 - 15 * t[3 + offset] + t[6 + offset]; + m[48 + l] = tmp2 - tmp1 + 15 * t[3 + offset] + t[6 + offset]; + m[56 + l] = -36 * t[1 + offset] + 49 * t[3 + offset] - 14 * t[5 + offset] + t[7 + offset]; + } + for (int k = 0; k < 64; ++k) { + dst_data[i + k * dst_step] = m[k]; + } } #endif } +OutputTransFunc GetOutputTransFunc(int input_unit, int output_unit) { + if (input_unit == 4) { + return OutputTransFuncList4[output_unit]; + } else if (input_unit == 6) { + return OutputTransFuncList6[output_unit]; + } else if (input_unit == 8) { + return OutputTransFuncList8[output_unit]; + } else { + return NULL; + } +} + void OutputTransform4x2Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step) { #ifdef ENABLE_ARM + float32x4_t src[16]; + float32x4_t t[8]; + float32x4_t m[4]; + Load16Data; float32x4_t bias_ptr = vld1q_f32(bias_data); - float32x4_t src_data_00 = vld1q_f32(src_data + 0 * src_step); - float32x4_t src_data_01 = vld1q_f32(src_data + 1 * src_step); - float32x4_t src_data_02 = vld1q_f32(src_data + 2 * src_step); - float32x4_t src_data_03 = vld1q_f32(src_data + 3 * src_step); - float32x4_t src_data_10 = vld1q_f32(src_data + 4 * src_step); - float32x4_t src_data_11 = vld1q_f32(src_data + 5 * src_step); - float32x4_t src_data_12 = vld1q_f32(src_data + 6 * 
src_step); - float32x4_t src_data_13 = vld1q_f32(src_data + 7 * src_step); - float32x4_t src_data_20 = vld1q_f32(src_data + 8 * src_step); - float32x4_t src_data_21 = vld1q_f32(src_data + 9 * src_step); - float32x4_t src_data_22 = vld1q_f32(src_data + 10 * src_step); - float32x4_t src_data_23 = vld1q_f32(src_data + 11 * src_step); - float32x4_t src_data_30 = vld1q_f32(src_data + 12 * src_step); - float32x4_t src_data_31 = vld1q_f32(src_data + 13 * src_step); - float32x4_t src_data_32 = vld1q_f32(src_data + 14 * src_step); - float32x4_t src_data_33 = vld1q_f32(src_data + 15 * src_step); - - float32x4_t t00 = vaddq_f32(src_data_00, vaddq_f32(src_data_10, src_data_20)); - float32x4_t t01 = vaddq_f32(src_data_01, vaddq_f32(src_data_11, src_data_21)); - float32x4_t t02 = vaddq_f32(src_data_02, vaddq_f32(src_data_12, src_data_22)); - float32x4_t t03 = vaddq_f32(src_data_03, vaddq_f32(src_data_13, src_data_23)); - - float32x4_t t10 = vsubq_f32(src_data_30, vmulq_n_f32(vsubq_f32(src_data_10, src_data_20), 0.5)); - float32x4_t t11 = vsubq_f32(src_data_31, vmulq_n_f32(vsubq_f32(src_data_11, src_data_21), 0.5)); - float32x4_t t12 = vsubq_f32(src_data_32, vmulq_n_f32(vsubq_f32(src_data_12, src_data_22), 0.5)); - float32x4_t t13 = vsubq_f32(src_data_33, vmulq_n_f32(vsubq_f32(src_data_13, src_data_23), 0.5)); - - float32x4_t m00 = vaddq_f32(vaddq_f32(t00, vaddq_f32(t01, t02)), bias_ptr); - float32x4_t m01 = vaddq_f32(vaddq_f32(t03, vmulq_n_f32(vsubq_f32(t01, t02), 0.5)), bias_ptr); - float32x4_t m10 = vaddq_f32(vaddq_f32(t10, vaddq_f32(t11, t12)), bias_ptr); - float32x4_t m11 = vaddq_f32(vaddq_f32(t13, vmulq_n_f32(vsubq_f32(t11, t12), 0.5)), bias_ptr); - - vst1q_f32(dst_data, m00); - vst1q_f32(dst_data + C4NUM, m01); - vst1q_f32(dst_data + dst_step * C4NUM, m10); - vst1q_f32(dst_data + dst_step * C4NUM + C4NUM, m11); + for (int l = 0; l < 4; ++l) { + int offset = l * 4; + t[l] = vaddq_f32(vaddq_f32(src[offset], src[1 + offset]), src[2 + offset]); + t[l + 4] = 
vaddq_f32(vsubq_f32(src[1 + offset], src[2 + offset]), src[3 + offset]); + } + for (int l = 0; l < 2; ++l) { + int offset = l * 4; + m[l] = vaddq_f32(vaddq_f32(vaddq_f32(t[offset], t[1 + offset]), t[2 + offset]), bias_ptr); + m[l + 2] = vaddq_f32(vaddq_f32(vsubq_f32(t[1 + offset], t[2 + offset]), t[3 + offset]), bias_ptr); + } + Store4Data; #else - for (int i = 0; i < C4NUM; i++) { - float src_data_00 = src_data[i]; - float src_data_01 = src_data[i + src_step]; - float src_data_02 = src_data[i + 2 * src_step]; - float src_data_03 = src_data[i + 3 * src_step]; - float src_data_10 = src_data[i + 4 * src_step]; - float src_data_11 = src_data[i + 5 * src_step]; - float src_data_12 = src_data[i + 6 * src_step]; - float src_data_13 = src_data[i + 7 * src_step]; - float src_data_20 = src_data[i + 8 * src_step]; - float src_data_21 = src_data[i + 9 * src_step]; - float src_data_22 = src_data[i + 10 * src_step]; - float src_data_23 = src_data[i + 11 * src_step]; - float src_data_30 = src_data[i + 12 * src_step]; - float src_data_31 = src_data[i + 13 * src_step]; - float src_data_32 = src_data[i + 14 * src_step]; - float src_data_33 = src_data[i + 15 * src_step]; - - float t00 = src_data_00 + src_data_10 + src_data_20; - float t01 = src_data_01 + src_data_11 + src_data_21; - float t02 = src_data_02 + src_data_12 + src_data_22; - float t03 = src_data_03 + src_data_13 + src_data_23; - - const float t10 = 0.5f * (src_data_10 - src_data_20) + src_data_30; - const float t11 = 0.5f * (src_data_11 - src_data_21) + src_data_31; - const float t12 = 0.5f * (src_data_12 - src_data_22) + src_data_32; - const float t13 = 0.5f * (src_data_13 - src_data_23) + src_data_33; - - float m00 = t00 + t01 + t02 + bias_data[i]; - const float m01 = 0.5f * (t01 - t02) + t03 + bias_data[i]; - float m10 = t10 + t11 + t12 + bias_data[i]; - const float m11 = 0.5f * (t11 - t12) + t13 + bias_data[i]; - - (dst_data + i)[0] = m00; - (dst_data + i + C4NUM)[0] = m01; - (dst_data + i + dst_step * C4NUM)[0] = 
m10; - (dst_data + i + dst_step * C4NUM + C4NUM)[0] = m11; + float src[16]; + float t[8]; + float m[4]; + for (int i = 0; i < C4NUM; ++i) { + // load source data + for (int j = 0; j < 16; ++j) { + src[j] = src_data[i + j * src_step]; + } + for (int l = 0; l < 4; ++l) { + int offset = l * 4; + t[l] = src[offset] + src[1 + offset] + src[2 + offset]; + t[l + 4] = src[1 + offset] - src[2 + offset] + src[3 + offset]; + } + for (int l = 0; l < 2; ++l) { + int offset = l * 4; + m[l] = t[offset] + t[1 + offset] + t[2 + offset]; + m[l + 2] = t[1 + offset] - t[2 + offset] + t[3 + offset]; + } + // store output + for (int k = 0; k < 2; ++k) { + int dst_k_offset = k * dst_step * C4NUM; + int m_k_offset = k * 2; + for (int j = 0; j < 2; ++j) { + dst_data[i + dst_k_offset + j * C4NUM] = m[j + m_k_offset] + bias_data[i]; + } + } } #endif } @@ -1381,115 +425,307 @@ void OutputTransform4x2Unit(const float *src_data, float *dst_data, const float void OutputTransform4x3Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step) { #ifdef ENABLE_ARM + float32x4_t src[16]; + float32x4_t t[12]; + float32x4_t m[9]; + Load16Data; float32x4_t bias_ptr = vld1q_f32(bias_data); - float32x4_t src_data_00 = vld1q_f32(src_data + 0 * src_step); - float32x4_t src_data_01 = vld1q_f32(src_data + 1 * src_step); - float32x4_t src_data_02 = vld1q_f32(src_data + 2 * src_step); - float32x4_t src_data_03 = vld1q_f32(src_data + 3 * src_step); - float32x4_t src_data_10 = vld1q_f32(src_data + 4 * src_step); - float32x4_t src_data_11 = vld1q_f32(src_data + 5 * src_step); - float32x4_t src_data_12 = vld1q_f32(src_data + 6 * src_step); - float32x4_t src_data_13 = vld1q_f32(src_data + 7 * src_step); - float32x4_t src_data_20 = vld1q_f32(src_data + 8 * src_step); - float32x4_t src_data_21 = vld1q_f32(src_data + 9 * src_step); - float32x4_t src_data_22 = vld1q_f32(src_data + 10 * src_step); - float32x4_t src_data_23 = vld1q_f32(src_data + 11 * src_step); - float32x4_t 
src_data_30 = vld1q_f32(src_data + 12 * src_step); - float32x4_t src_data_31 = vld1q_f32(src_data + 13 * src_step); - float32x4_t src_data_32 = vld1q_f32(src_data + 14 * src_step); - float32x4_t src_data_33 = vld1q_f32(src_data + 15 * src_step); - - float32x4_t t00 = vaddq_f32(src_data_00, vaddq_f32(src_data_10, src_data_20)); - float32x4_t t01 = vaddq_f32(src_data_01, vaddq_f32(src_data_11, src_data_21)); - float32x4_t t02 = vaddq_f32(src_data_02, vaddq_f32(src_data_12, src_data_22)); - float32x4_t t03 = vaddq_f32(src_data_03, vaddq_f32(src_data_13, src_data_23)); - - float32x4_t t10 = vmulq_n_f32(vsubq_f32(src_data_10, src_data_20), 0.5); - float32x4_t t11 = vmulq_n_f32(vsubq_f32(src_data_11, src_data_21), 0.5); - float32x4_t t12 = vmulq_n_f32(vsubq_f32(src_data_12, src_data_22), 0.5); - float32x4_t t13 = vmulq_n_f32(vsubq_f32(src_data_13, src_data_23), 0.5); - - float32x4_t t20 = vaddq_f32(src_data_30, vmulq_n_f32(vaddq_f32(src_data_10, src_data_20), 0.25)); - float32x4_t t21 = vaddq_f32(src_data_31, vmulq_n_f32(vaddq_f32(src_data_11, src_data_21), 0.25)); - float32x4_t t22 = vaddq_f32(src_data_32, vmulq_n_f32(vaddq_f32(src_data_12, src_data_22), 0.25)); - float32x4_t t23 = vaddq_f32(src_data_33, vmulq_n_f32(vaddq_f32(src_data_13, src_data_23), 0.25)); - - float32x4_t m00 = vaddq_f32(vaddq_f32(t00, vaddq_f32(t01, t02)), bias_ptr); - float32x4_t m01 = vaddq_f32(vmulq_n_f32(vsubq_f32(t01, t02), 0.5), bias_ptr); - float32x4_t m02 = vaddq_f32(vaddq_f32(t03, vmulq_n_f32(vaddq_f32(t01, t02), 0.25)), bias_ptr); - float32x4_t m10 = vaddq_f32(vaddq_f32(t10, vaddq_f32(t11, t12)), bias_ptr); - float32x4_t m11 = vaddq_f32(vmulq_n_f32(vsubq_f32(t11, t12), 0.5), bias_ptr); - float32x4_t m12 = vaddq_f32(vaddq_f32(t13, vmulq_n_f32(vaddq_f32(t11, t12), 0.25)), bias_ptr); - float32x4_t m20 = vaddq_f32(vaddq_f32(t20, vaddq_f32(t21, t22)), bias_ptr); - float32x4_t m21 = vaddq_f32(vmulq_n_f32(vsubq_f32(t21, t22), 0.5), bias_ptr); - float32x4_t m22 = vaddq_f32(vaddq_f32(t23, 
vmulq_n_f32(vaddq_f32(t21, t22), 0.25)), bias_ptr); - - vst1q_f32(dst_data, m00); - vst1q_f32(dst_data + C4NUM, m01); - vst1q_f32(dst_data + 2 * C4NUM, m02); - vst1q_f32(dst_data + dst_step * C4NUM, m10); - vst1q_f32(dst_data + dst_step * C4NUM + C4NUM, m11); - vst1q_f32(dst_data + dst_step * C4NUM + 2 * C4NUM, m12); - vst1q_f32(dst_data + 2 * dst_step * C4NUM, m20); - vst1q_f32(dst_data + 2 * dst_step * C4NUM + C4NUM, m21); - vst1q_f32(dst_data + 2 * dst_step * C4NUM + 2 * C4NUM, m22); + for (int l = 0; l < 4; ++l) { + int offset = l * 4; + float32x4_t tmp = vaddq_f32(src[1 + offset], src[2 + offset]); + t[l] = vaddq_f32(src[offset], tmp); + t[l + 4] = vsubq_f32(src[1 + offset], src[2 + offset]); + t[l + 8] = vaddq_f32(tmp, src[3 + offset]); + } + for (int l = 0; l < 3; ++l) { + int offset = l * 4; + float32x4_t tmp = vaddq_f32(t[1 + offset], t[2 + offset]); + m[l] = vaddq_f32(vaddq_f32(t[offset], tmp), bias_ptr); + m[l + 3] = vaddq_f32(vsubq_f32(t[1 + offset], t[2 + offset]), bias_ptr); + m[l + 6] = vaddq_f32(vaddq_f32(tmp, t[3 + offset]), bias_ptr); + } + Store9Data; #else - for (int i = 0; i < C4NUM; i++) { - float src_data_00 = src_data[i]; - float src_data_01 = src_data[i + src_step]; - float src_data_02 = src_data[i + 2 * src_step]; - float src_data_03 = src_data[i + 3 * src_step]; - float src_data_10 = src_data[i + 4 * src_step]; - float src_data_11 = src_data[i + 5 * src_step]; - float src_data_12 = src_data[i + 6 * src_step]; - float src_data_13 = src_data[i + 7 * src_step]; - float src_data_20 = src_data[i + 8 * src_step]; - float src_data_21 = src_data[i + 9 * src_step]; - float src_data_22 = src_data[i + 10 * src_step]; - float src_data_23 = src_data[i + 11 * src_step]; - float src_data_30 = src_data[i + 12 * src_step]; - float src_data_31 = src_data[i + 13 * src_step]; - float src_data_32 = src_data[i + 14 * src_step]; - float src_data_33 = src_data[i + 15 * src_step]; + float src[16]; + float t[12]; + float m[9]; + for (int i = 0; i < C4NUM; ++i) { + 
// load source data + for (int j = 0; j < 16; ++j) { + src[j] = src_data[i + j * src_step]; + } + for (int l = 0; l < 4; ++l) { + int offset = l * 4; + t[l] = src[0 + offset] + src[1 + offset] + src[2 + offset]; + t[l + 4] = src[1 + offset] - src[2 + offset]; + t[l + 8] = src[1 + offset] + src[2 + offset] + src[3 + offset]; + } + for (int l = 0; l < 3; ++l) { + int offset = l * 4; + m[l] = t[offset] + t[1 + offset] + t[2 + offset]; + m[l + 3] = t[1 + offset] - t[2 + offset]; + m[l + 6] = t[1 + offset] + t[2 + offset] + t[3 + offset]; + } + // store output + for (int k = 0; k < 3; ++k) { + int dst_k_offset = k * dst_step * C4NUM; + int m_k_offset = k * 3; + for (int j = 0; j < 3; ++j) { + dst_data[i + dst_k_offset + j * C4NUM] = m[j + m_k_offset] + bias_data[i]; + } + } + } +#endif +} - float t00 = src_data_00 + src_data_10 + src_data_20; - float t01 = src_data_01 + src_data_11 + src_data_21; - float t02 = src_data_02 + src_data_12 + src_data_22; - float t03 = src_data_03 + src_data_13 + src_data_23; - - const float t10 = 0.5f * (src_data_10 - src_data_20); - const float t11 = 0.5f * (src_data_11 - src_data_21); - const float t12 = 0.5f * (src_data_12 - src_data_22); - const float t13 = 0.5f * (src_data_13 - src_data_23); - - const float t20 = 0.25f * (src_data_10 + src_data_20) + src_data_30; - const float t21 = 0.25f * (src_data_11 + src_data_21) + src_data_31; - const float t22 = 0.25f * (src_data_12 + src_data_22) + src_data_32; - const float t23 = 0.25f * (src_data_13 + src_data_23) + src_data_33; - - float m00 = t00 + t01 + t02 + bias_data[i]; - const float m01 = 0.5f * (t01 - t02) + bias_data[i]; - const float m02 = 0.25f * (t01 + t02) + t03 + bias_data[i]; - - float m10 = t10 + t11 + t12 + bias_data[i]; - const float m11 = 0.5f * (t11 - t12) + bias_data[i]; - const float m12 = 0.25f * (t11 + t12) + t13 + bias_data[i]; - - float m20 = t20 + t21 + t22 + bias_data[i]; - const float m21 = 0.5f * (t21 - t22) + bias_data[i]; - const float m22 = 0.25f * (t21 + t22) 
+ t23 + bias_data[i]; - - (dst_data + i)[0] = m00; - (dst_data + i + C4NUM)[0] = m01; - (dst_data + i + 2 * C4NUM)[0] = m02; - - (dst_data + i + dst_step * C4NUM)[0] = m10; - (dst_data + i + dst_step * C4NUM + C4NUM)[0] = m11; - (dst_data + i + dst_step * C4NUM + 2 * C4NUM)[0] = m12; - - (dst_data + i + 2 * dst_step * C4NUM)[0] = m20; - (dst_data + i + 2 * dst_step * C4NUM + C4NUM)[0] = m21; - (dst_data + i + 2 * dst_step * C4NUM + 2 * C4NUM)[0] = m22; +void OutputTransform6x2Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, + int dst_step) { +#ifdef ENABLE_ARM + float32x4_t src[36]; + float32x4_t t[12]; + float32x4_t m[4]; + Load36Data; + float32x4_t bias_ptr = vld1q_f32(bias_data); + for (int l = 0; l < 6; ++l) { + int offset = l * 6; + t[l] = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src[offset], src[1 + offset]), src[2 + offset]), src[3 + offset]), + src[4 + offset]); + t[l + 6] = vaddq_f32(vaddq_f32(vsubq_f32(src[1 + offset], src[2 + offset]), + vmulq_n_f32(vsubq_f32(src[3 + offset], src[4 + offset]), 2)), + src[5 + offset]); + } + for (int l = 0; l < 2; ++l) { + int offset = l * 6; + m[l] = vaddq_f32( + vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t[offset], t[1 + offset]), t[2 + offset]), t[3 + offset]), t[4 + offset]), + bias_ptr); + m[l + 2] = vaddq_f32(vaddq_f32(vaddq_f32(vsubq_f32(t[1 + offset], t[2 + offset]), + vmulq_n_f32(vsubq_f32(t[3 + offset], t[4 + offset]), 2)), + t[5 + offset]), + bias_ptr); + } + Store4Data; +#else + float src[36]; + float t[12]; + float m[4]; + for (int i = 0; i < C4NUM; ++i) { + // load source data + for (int j = 0; j < 36; ++j) { + src[j] = src_data[i + j * src_step]; + } + for (int l = 0; l < 6; ++l) { + int offset = l * 6; + t[l] = src[offset] + src[1 + offset] + src[2 + offset] + src[3 + offset] + src[4 + offset]; + t[l + 6] = src[1 + offset] - src[2 + offset] + 2 * (src[3 + offset] - src[4 + offset]) + src[5 + offset]; + } + for (int l = 0; l < 2; ++l) { + int offset = l * 6; + m[l] = t[offset] 
+ t[1 + offset] + t[2 + offset] + t[3 + offset] + t[4 + offset]; + m[l + 2] = t[1 + offset] - t[2 + offset] + 2 * (t[3 + offset] - t[4 + offset]) + t[5 + offset]; + } + // store output + for (int k = 0; k < 2; ++k) { + int dst_k_offset = k * dst_step * C4NUM; + int m_k_offset = k * 2; + for (int j = 0; j < 2; ++j) { + dst_data[i + dst_k_offset + j * C4NUM] = m[j + m_k_offset] + bias_data[i]; + } + } + } +#endif +} +void OutputTransform6x3Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, + int dst_step) { +#ifdef ENABLE_ARM + float32x4_t src[36]; + float32x4_t t[18]; + float32x4_t m[9]; + Load36Data; + float32x4_t bias_ptr = vld1q_f32(bias_data); + for (int l = 0; l < 6; ++l) { + int offset = l * 6; + float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); + float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); + t[l] = vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2); + t[l + 6] = vaddq_f32(vsubq_f32(src[1 + offset], src[2 + offset]), + vmulq_n_f32(vsubq_f32(src[3 + offset], src[4 + offset]), 2)); + t[l + 12] = vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)), src[5 + offset]); + } + for (int l = 0; l < 3; ++l) { + int offset = l * 6; + float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); + float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); + m[l] = vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), bias_ptr); + m[l + 3] = vaddq_f32( + vaddq_f32(vsubq_f32(t[1 + offset], t[2 + offset]), vmulq_n_f32(vsubq_f32(t[3 + offset], t[4 + offset]), 2)), + bias_ptr); + m[l + 6] = vaddq_f32(vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)), t[5 + offset]), bias_ptr); + } + Store9Data; +#else + float src[36]; + float t[18]; + float m[9]; + for (int i = 0; i < C4NUM; ++i) { + // load source data + for (int j = 0; j < 36; ++j) { + src[j] = src_data[i + j * src_step]; + } + for (int l = 0; l < 6; ++l) { + int offset = l * 6; + t[l] = src[offset] + src[1 + offset] + src[2 + offset] + src[3 + offset] + src[4 + offset]; 
+ t[l + 6] = src[1 + offset] - src[2 + offset] + 2 * (src[3 + offset] - src[4 + offset]); + t[l + 12] = src[1 + offset] + src[2 + offset] + 4 * (src[3 + offset] + src[4 + offset]) + src[5 + offset]; + } + for (int l = 0; l < 3; ++l) { + int offset = l * 6; + m[l] = t[offset] + t[1 + offset] + t[2 + offset] + t[3 + offset] + t[4 + offset]; + m[l + 3] = t[1 + offset] - t[2 + offset] + 2 * (t[3 + offset] - t[4 + offset]); + m[l + 6] = t[1 + offset] + t[2 + offset] + 4 * (t[3 + offset] + t[4 + offset]) + t[5 + offset]; + } + // store output + for (int k = 0; k < 3; ++k) { + int dst_k_offset = k * dst_step * C4NUM; + int m_k_offset = k * 3; + for (int j = 0; j < 3; ++j) { + dst_data[i + dst_k_offset + j * C4NUM] = m[j + m_k_offset] + bias_data[i]; + } + } + } +#endif +} +void OutputTransform6x4Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, + int dst_step) { +#ifdef ENABLE_ARM + float32x4_t src[36]; + float32x4_t t[24]; + float32x4_t m[16]; + Load36Data; + float32x4_t bias_ptr = vld1q_f32(bias_data); + for (int l = 0; l < 6; ++l) { + int offset = l * 6; + float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); + float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); + float32x4_t tmp3 = vsubq_f32(src[1 + offset], src[2 + offset]); + float32x4_t tmp4 = vsubq_f32(src[3 + offset], src[4 + offset]); + t[l] = vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2); + t[l + 6] = vaddq_f32(tmp3, vmulq_n_f32(tmp4, 2)); + t[l + 12] = vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)); + t[l + 18] = vaddq_f32(vaddq_f32(tmp3, vmulq_n_f32(tmp4, 8)), src[5 + offset]); + } + for (int l = 0; l < 4; ++l) { + int offset = l * 6; + float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); + float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); + float32x4_t tmp3 = vsubq_f32(t[1 + offset], t[2 + offset]); + float32x4_t tmp4 = vsubq_f32(t[3 + offset], t[4 + offset]); + m[l] = vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), bias_ptr); + m[l + 4] = 
vaddq_f32(vaddq_f32(tmp3, vmulq_n_f32(tmp4, 2)), bias_ptr); + m[l + 8] = vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)), bias_ptr); + m[l + 12] = vaddq_f32(vaddq_f32(vaddq_f32(tmp3, vmulq_n_f32(tmp4, 8)), t[5 + offset]), bias_ptr); + } + Store16Data; +#else + float src[36]; + float t[24]; + float m[16]; + for (int i = 0; i < C4NUM; ++i) { + // load source data + for (int j = 0; j < 36; ++j) { + src[j] = src_data[i + j * src_step]; + } + for (int l = 0; l < 6; ++l) { + int offset = l * 6; + t[l] = src[offset] + src[1 + offset] + src[2 + offset] + src[3 + offset] + src[4 + offset]; + t[l + 6] = src[1 + offset] - src[2 + offset] + 2 * (src[3 + offset] - src[4 + offset]); + t[l + 12] = src[1 + offset] + src[2 + offset] + 4 * (src[3 + offset] + src[4 + offset]); + t[l + 18] = src[1 + offset] - src[2 + offset] + 8 * (src[3 + offset] - src[4 + offset]) + src[5 + offset]; + } + for (int l = 0; l < 4; ++l) { + int offset = l * 6; + m[l] = t[offset] + t[1 + offset] + t[2 + offset] + t[3 + offset] + t[4 + offset]; + m[l + 4] = t[1 + offset] - t[2 + offset] + 2 * (t[3 + offset] - t[4 + offset]); + m[l + 8] = t[1 + offset] + t[2 + offset] + 4 * (t[3 + offset] + t[4 + offset]); + m[l + 12] = t[1 + offset] - t[2 + offset] + 8 * (t[3 + offset] - t[4 + offset]) + t[5 + offset]; + } + // store output + for (int k = 0; k < 4; ++k) { + int dst_k_offset = k * dst_step * C4NUM; + int m_k_offset = k * 4; + for (int j = 0; j < 4; ++j) { + dst_data[i + dst_k_offset + j * C4NUM] = m[j + m_k_offset] + bias_data[i]; + } + } + } +#endif +} +void OutputTransform6x5Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, + int dst_step) { +#ifdef ENABLE_ARM + float32x4_t src[36]; + float32x4_t t[30]; + float32x4_t m[25]; + Load36Data; + float32x4_t bias_ptr = vld1q_f32(bias_data); + for (int l = 0; l < 6; ++l) { + int offset = l * 6; + float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); + float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); + 
float32x4_t tmp3 = vsubq_f32(src[1 + offset], src[2 + offset]); + float32x4_t tmp4 = vsubq_f32(src[3 + offset], src[4 + offset]); + t[l] = vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2); + t[l + 6] = vaddq_f32(tmp3, vmulq_n_f32(tmp4, 2)); + t[l + 12] = vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)); + t[l + 18] = vaddq_f32(tmp3, vmulq_n_f32(tmp4, 8)); + t[l + 24] = vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 16)), src[5 + offset]); + } + for (int l = 0; l < 5; ++l) { + int offset = l * 6; + float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); + float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); + float32x4_t tmp3 = vsubq_f32(t[1 + offset], t[2 + offset]); + float32x4_t tmp4 = vsubq_f32(t[3 + offset], t[4 + offset]); + m[l] = vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), bias_ptr); + m[l + 5] = vaddq_f32(vaddq_f32(tmp3, vmulq_n_f32(tmp4, 2)), bias_ptr); + m[l + 10] = vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)), bias_ptr); + m[l + 15] = vaddq_f32(vaddq_f32(tmp3, vmulq_n_f32(tmp4, 8)), bias_ptr); + m[l + 20] = vaddq_f32(vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 16)), t[5 + offset]), bias_ptr); + } + Store25Data; +#else + float src[36]; + float t[30]; + float m[25]; + for (int i = 0; i < C4NUM; ++i) { + // load source data + for (int j = 0; j < 36; ++j) { + src[j] = src_data[i + j * src_step]; + } + for (int l = 0; l < 6; ++l) { + int offset = l * 6; + t[l] = src[offset] + src[1 + offset] + src[2 + offset] + src[3 + offset] + src[4 + offset]; + t[l + 6] = src[1 + offset] - src[2 + offset] + 2 * (src[3 + offset] - src[4 + offset]); + t[l + 12] = src[1 + offset] + src[2 + offset] + 4 * (src[3 + offset] + src[4 + offset]); + t[l + 18] = src[1 + offset] - src[2 + offset] + 8 * (src[3 + offset] - src[4 + offset]); + t[l + 24] = src[1 + offset] + src[2 + offset] + 16 * (src[3 + offset] + src[4 + offset]) + src[5 + offset]; + } + for (int l = 0; l < 5; ++l) { + int offset = l * 6; + m[l] = t[offset] + t[1 + offset] + t[2 + offset] + t[3 + offset] + t[4 + 
offset]; + m[l + 5] = t[1 + offset] - t[2 + offset] + 2 * (t[3 + offset] - t[4 + offset]); + m[l + 10] = t[1 + offset] + t[2 + offset] + 4 * (t[3 + offset] + t[4 + offset]); + m[l + 15] = t[1 + offset] - t[2 + offset] + 8 * (t[3 + offset] - t[4 + offset]); + m[l + 20] = t[1 + offset] + t[2 + offset] + 16 * (t[3 + offset] + t[4 + offset]) + t[5 + offset]; + } + // store output + for (int k = 0; k < 5; ++k) { + int dst_k_offset = k * dst_step * C4NUM; + int m_k_offset = k * 5; + for (int j = 0; j < 5; ++j) { + dst_data[i + dst_k_offset + j * C4NUM] = m[j + m_k_offset] + bias_data[i]; + } + } } #endif } @@ -1497,3150 +733,503 @@ void OutputTransform4x3Unit(const float *src_data, float *dst_data, const float void OutputTransform8x2Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step) { #ifdef ENABLE_ARM - float32x4_t src_data_00 = vld1q_f32(src_data + 0 * src_step); - float32x4_t src_data_01 = vld1q_f32(src_data + 1 * src_step); - float32x4_t src_data_02 = vld1q_f32(src_data + 2 * src_step); - float32x4_t src_data_03 = vld1q_f32(src_data + 3 * src_step); - float32x4_t src_data_04 = vld1q_f32(src_data + 4 * src_step); - float32x4_t src_data_05 = vld1q_f32(src_data + 5 * src_step); - float32x4_t src_data_06 = vld1q_f32(src_data + 6 * src_step); - float32x4_t src_data_07 = vld1q_f32(src_data + 7 * src_step); - float32x4_t src_data_10 = vld1q_f32(src_data + 8 * src_step); - float32x4_t src_data_11 = vld1q_f32(src_data + 9 * src_step); - float32x4_t src_data_12 = vld1q_f32(src_data + 10 * src_step); - float32x4_t src_data_13 = vld1q_f32(src_data + 11 * src_step); - float32x4_t src_data_14 = vld1q_f32(src_data + 12 * src_step); - float32x4_t src_data_15 = vld1q_f32(src_data + 13 * src_step); - float32x4_t src_data_16 = vld1q_f32(src_data + 14 * src_step); - float32x4_t src_data_17 = vld1q_f32(src_data + 15 * src_step); - float32x4_t src_data_20 = vld1q_f32(src_data + 16 * src_step); - float32x4_t src_data_21 = vld1q_f32(src_data + 
17 * src_step); - float32x4_t src_data_22 = vld1q_f32(src_data + 18 * src_step); - float32x4_t src_data_23 = vld1q_f32(src_data + 19 * src_step); - float32x4_t src_data_24 = vld1q_f32(src_data + 20 * src_step); - float32x4_t src_data_25 = vld1q_f32(src_data + 21 * src_step); - float32x4_t src_data_26 = vld1q_f32(src_data + 22 * src_step); - float32x4_t src_data_27 = vld1q_f32(src_data + 23 * src_step); - float32x4_t src_data_30 = vld1q_f32(src_data + 24 * src_step); - float32x4_t src_data_31 = vld1q_f32(src_data + 25 * src_step); - float32x4_t src_data_32 = vld1q_f32(src_data + 26 * src_step); - float32x4_t src_data_33 = vld1q_f32(src_data + 27 * src_step); - float32x4_t src_data_34 = vld1q_f32(src_data + 28 * src_step); - float32x4_t src_data_35 = vld1q_f32(src_data + 29 * src_step); - float32x4_t src_data_36 = vld1q_f32(src_data + 30 * src_step); - float32x4_t src_data_37 = vld1q_f32(src_data + 31 * src_step); - float32x4_t src_data_40 = vld1q_f32(src_data + 32 * src_step); - float32x4_t src_data_41 = vld1q_f32(src_data + 33 * src_step); - float32x4_t src_data_42 = vld1q_f32(src_data + 34 * src_step); - float32x4_t src_data_43 = vld1q_f32(src_data + 35 * src_step); - float32x4_t src_data_44 = vld1q_f32(src_data + 36 * src_step); - float32x4_t src_data_45 = vld1q_f32(src_data + 37 * src_step); - float32x4_t src_data_46 = vld1q_f32(src_data + 38 * src_step); - float32x4_t src_data_47 = vld1q_f32(src_data + 39 * src_step); - float32x4_t src_data_50 = vld1q_f32(src_data + 40 * src_step); - float32x4_t src_data_51 = vld1q_f32(src_data + 41 * src_step); - float32x4_t src_data_52 = vld1q_f32(src_data + 42 * src_step); - float32x4_t src_data_53 = vld1q_f32(src_data + 43 * src_step); - float32x4_t src_data_54 = vld1q_f32(src_data + 44 * src_step); - float32x4_t src_data_55 = vld1q_f32(src_data + 45 * src_step); - float32x4_t src_data_56 = vld1q_f32(src_data + 46 * src_step); - float32x4_t src_data_57 = vld1q_f32(src_data + 47 * src_step); - float32x4_t src_data_60 = 
vld1q_f32(src_data + 48 * src_step); - float32x4_t src_data_61 = vld1q_f32(src_data + 49 * src_step); - float32x4_t src_data_62 = vld1q_f32(src_data + 50 * src_step); - float32x4_t src_data_63 = vld1q_f32(src_data + 51 * src_step); - float32x4_t src_data_64 = vld1q_f32(src_data + 52 * src_step); - float32x4_t src_data_65 = vld1q_f32(src_data + 53 * src_step); - float32x4_t src_data_66 = vld1q_f32(src_data + 54 * src_step); - float32x4_t src_data_67 = vld1q_f32(src_data + 55 * src_step); - float32x4_t src_data_70 = vld1q_f32(src_data + 56 * src_step); - float32x4_t src_data_71 = vld1q_f32(src_data + 57 * src_step); - float32x4_t src_data_72 = vld1q_f32(src_data + 58 * src_step); - float32x4_t src_data_73 = vld1q_f32(src_data + 59 * src_step); - float32x4_t src_data_74 = vld1q_f32(src_data + 60 * src_step); - float32x4_t src_data_75 = vld1q_f32(src_data + 61 * src_step); - float32x4_t src_data_76 = vld1q_f32(src_data + 62 * src_step); - float32x4_t src_data_77 = vld1q_f32(src_data + 63 * src_step); - - float32x4_t d01 = vsubq_f32(src_data_10, src_data_20); - float32x4_t d02 = vsubq_f32(src_data_11, src_data_21); - float32x4_t d03 = vsubq_f32(src_data_12, src_data_22); - float32x4_t d04 = vsubq_f32(src_data_13, src_data_23); - float32x4_t d05 = vsubq_f32(src_data_14, src_data_24); - float32x4_t d06 = vsubq_f32(src_data_15, src_data_25); - float32x4_t d07 = vsubq_f32(src_data_16, src_data_26); - float32x4_t d08 = vsubq_f32(src_data_17, src_data_27); - - float32x4_t d11 = vsubq_f32(src_data_30, src_data_40); - float32x4_t d12 = vsubq_f32(src_data_31, src_data_41); - float32x4_t d13 = vsubq_f32(src_data_32, src_data_42); - float32x4_t d14 = vsubq_f32(src_data_33, src_data_43); - float32x4_t d15 = vsubq_f32(src_data_34, src_data_44); - float32x4_t d16 = vsubq_f32(src_data_35, src_data_45); - float32x4_t d17 = vsubq_f32(src_data_36, src_data_46); - float32x4_t d18 = vsubq_f32(src_data_37, src_data_47); - - float32x4_t d21 = vsubq_f32(src_data_50, src_data_60); - 
float32x4_t d22 = vsubq_f32(src_data_51, src_data_61); - float32x4_t d23 = vsubq_f32(src_data_52, src_data_62); - float32x4_t d24 = vsubq_f32(src_data_53, src_data_63); - float32x4_t d25 = vsubq_f32(src_data_54, src_data_64); - float32x4_t d26 = vsubq_f32(src_data_55, src_data_65); - float32x4_t d27 = vsubq_f32(src_data_56, src_data_66); - float32x4_t d28 = vsubq_f32(src_data_57, src_data_67); - - float32x4_t t00 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_00, src_data_10), src_data_20), src_data_30), src_data_40), - src_data_50), - src_data_60); - float32x4_t t01 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_01, src_data_11), src_data_21), src_data_31), src_data_41), - src_data_51), - src_data_61); - float32x4_t t02 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_02, src_data_12), src_data_22), src_data_32), src_data_42), - src_data_52), - src_data_62); - float32x4_t t03 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_03, src_data_13), src_data_23), src_data_33), src_data_43), - src_data_53), - src_data_63); - float32x4_t t04 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_04, src_data_14), src_data_24), src_data_34), src_data_44), - src_data_54), - src_data_64); - float32x4_t t05 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_05, src_data_15), src_data_25), src_data_35), src_data_45), - src_data_55), - src_data_65); - float32x4_t t06 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_06, src_data_16), src_data_26), src_data_36), src_data_46), - src_data_56), - src_data_66); - float32x4_t t07 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_07, src_data_17), src_data_27), src_data_37), src_data_47), - src_data_57), - src_data_67); - - float32x4_t t10 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d01, 0.5), d11), vmulq_n_f32(d21, 1.5)), 
src_data_70); - float32x4_t t11 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d02, 0.5), d12), vmulq_n_f32(d22, 1.5)), src_data_71); - float32x4_t t12 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d03, 0.5), d13), vmulq_n_f32(d23, 1.5)), src_data_72); - float32x4_t t13 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d04, 0.5), d14), vmulq_n_f32(d24, 1.5)), src_data_73); - float32x4_t t14 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d05, 0.5), d15), vmulq_n_f32(d25, 1.5)), src_data_74); - float32x4_t t15 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d06, 0.5), d16), vmulq_n_f32(d26, 1.5)), src_data_75); - float32x4_t t16 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d07, 0.5), d17), vmulq_n_f32(d27, 1.5)), src_data_76); - float32x4_t t17 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d08, 0.5), d18), vmulq_n_f32(d28, 1.5)), src_data_77); - - float32x4_t s11 = vsubq_f32(t01, t02); - float32x4_t s12 = vsubq_f32(t11, t12); - - float32x4_t s21 = vsubq_f32(t03, t04); - float32x4_t s22 = vsubq_f32(t13, t14); - - float32x4_t s31 = vsubq_f32(t05, t06); - float32x4_t s32 = vsubq_f32(t15, t16); - - float32x4_t m00 = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t00, t01), t02), t03), t04), t05), t06); - float32x4_t m01 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(s11, 0.5), s21), vmulq_n_f32(s31, 1.5)), t07); - - float32x4_t m10 = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t10, t11), t12), t13), t14), t15), t16); - float32x4_t m11 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(s12, 0.5), s22), vmulq_n_f32(s32, 1.5)), t17); - + float32x4_t src[64]; + float32x4_t t[16]; + float32x4_t m[4]; + Load64Data; float32x4_t bias_ptr = vld1q_f32(bias_data); - vst1q_f32(dst_data, vaddq_f32(m00, bias_ptr)); - vst1q_f32(dst_data + C4NUM, vaddq_f32(m01, bias_ptr)); - - vst1q_f32(dst_data + dst_step * C4NUM, vaddq_f32(m10, bias_ptr)); - vst1q_f32(dst_data + dst_step * C4NUM + C4NUM, vaddq_f32(m11, bias_ptr)); + for (int l = 0; l < 8; ++l) { + int offset = l * 8; + t[l] = 
vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src[offset], src[1 + offset]), src[2 + offset]), + src[3 + offset]), + src[4 + offset]), + src[5 + offset]), + src[6 + offset]); + t[l + 8] = vaddq_f32(vaddq_f32(vaddq_f32(vsubq_f32(src[1 + offset], src[2 + offset]), + vmulq_n_f32(vsubq_f32(src[3 + offset], src[4 + offset]), 2)), + vmulq_n_f32(vsubq_f32(src[5 + offset], src[6 + offset]), 3)), + src[7 + offset]); + } + for (int l = 0; l < 2; ++l) { + int offset = l * 8; + m[l] = vaddq_f32( + vaddq_f32( + vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t[offset], t[1 + offset]), t[2 + offset]), t[3 + offset]), + t[4 + offset]), + t[5 + offset]), + t[6 + offset]), + bias_ptr); + m[l + 2] = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vsubq_f32(t[1 + offset], t[2 + offset]), + vmulq_n_f32(vsubq_f32(t[3 + offset], t[4 + offset]), 2)), + vmulq_n_f32(vsubq_f32(t[5 + offset], t[6 + offset]), 3)), + t[7 + offset]), + bias_ptr); + } + Store4Data; #else - for (int i = 0; i < C4NUM; i++) { - float src_data_00 = src_data[i]; - float src_data_01 = src_data[i + src_step]; - float src_data_02 = src_data[i + 2 * src_step]; - float src_data_03 = src_data[i + 3 * src_step]; - float src_data_04 = src_data[i + 4 * src_step]; - float src_data_05 = src_data[i + 5 * src_step]; - float src_data_06 = src_data[i + 6 * src_step]; - float src_data_07 = src_data[i + 7 * src_step]; - float src_data_10 = src_data[i + 8 * src_step]; - float src_data_11 = src_data[i + 9 * src_step]; - float src_data_12 = src_data[i + 10 * src_step]; - float src_data_13 = src_data[i + 11 * src_step]; - float src_data_14 = src_data[i + 12 * src_step]; - float src_data_15 = src_data[i + 13 * src_step]; - float src_data_16 = src_data[i + 14 * src_step]; - float src_data_17 = src_data[i + 15 * src_step]; - float src_data_20 = src_data[i + 16 * src_step]; - float src_data_21 = src_data[i + 17 * src_step]; - float src_data_22 = src_data[i + 18 * src_step]; - float src_data_23 = src_data[i + 19 * src_step]; - float 
src_data_24 = src_data[i + 20 * src_step]; - float src_data_25 = src_data[i + 21 * src_step]; - float src_data_26 = src_data[i + 22 * src_step]; - float src_data_27 = src_data[i + 23 * src_step]; - float src_data_30 = src_data[i + 24 * src_step]; - float src_data_31 = src_data[i + 25 * src_step]; - float src_data_32 = src_data[i + 26 * src_step]; - float src_data_33 = src_data[i + 27 * src_step]; - float src_data_34 = src_data[i + 28 * src_step]; - float src_data_35 = src_data[i + 29 * src_step]; - float src_data_36 = src_data[i + 30 * src_step]; - float src_data_37 = src_data[i + 31 * src_step]; - float src_data_40 = src_data[i + 32 * src_step]; - float src_data_41 = src_data[i + 33 * src_step]; - float src_data_42 = src_data[i + 34 * src_step]; - float src_data_43 = src_data[i + 35 * src_step]; - float src_data_44 = src_data[i + 36 * src_step]; - float src_data_45 = src_data[i + 37 * src_step]; - float src_data_46 = src_data[i + 38 * src_step]; - float src_data_47 = src_data[i + 39 * src_step]; - float src_data_50 = src_data[i + 40 * src_step]; - float src_data_51 = src_data[i + 41 * src_step]; - float src_data_52 = src_data[i + 42 * src_step]; - float src_data_53 = src_data[i + 43 * src_step]; - float src_data_54 = src_data[i + 44 * src_step]; - float src_data_55 = src_data[i + 45 * src_step]; - float src_data_56 = src_data[i + 46 * src_step]; - float src_data_57 = src_data[i + 47 * src_step]; - float src_data_60 = src_data[i + 48 * src_step]; - float src_data_61 = src_data[i + 49 * src_step]; - float src_data_62 = src_data[i + 50 * src_step]; - float src_data_63 = src_data[i + 51 * src_step]; - float src_data_64 = src_data[i + 52 * src_step]; - float src_data_65 = src_data[i + 53 * src_step]; - float src_data_66 = src_data[i + 54 * src_step]; - float src_data_67 = src_data[i + 55 * src_step]; - float src_data_70 = src_data[i + 56 * src_step]; - float src_data_71 = src_data[i + 57 * src_step]; - float src_data_72 = src_data[i + 58 * src_step]; - float 
src_data_73 = src_data[i + 59 * src_step]; - float src_data_74 = src_data[i + 60 * src_step]; - float src_data_75 = src_data[i + 61 * src_step]; - float src_data_76 = src_data[i + 62 * src_step]; - float src_data_77 = src_data[i + 63 * src_step]; - - float d01 = src_data_10 - src_data_20; - float d02 = src_data_11 - src_data_21; - float d03 = src_data_12 - src_data_22; - float d04 = src_data_13 - src_data_23; - float d05 = src_data_14 - src_data_24; - float d06 = src_data_15 - src_data_25; - float d07 = src_data_16 - src_data_26; - float d08 = src_data_17 - src_data_27; - - float d11 = src_data_30 - src_data_40; - float d12 = src_data_31 - src_data_41; - float d13 = src_data_32 - src_data_42; - float d14 = src_data_33 - src_data_43; - float d15 = src_data_34 - src_data_44; - float d16 = src_data_35 - src_data_45; - float d17 = src_data_36 - src_data_46; - float d18 = src_data_37 - src_data_47; - - float d21 = src_data_50 - src_data_60; - float d22 = src_data_51 - src_data_61; - float d23 = src_data_52 - src_data_62; - float d24 = src_data_53 - src_data_63; - float d25 = src_data_54 - src_data_64; - float d26 = src_data_55 - src_data_65; - float d27 = src_data_56 - src_data_66; - float d28 = src_data_57 - src_data_67; - - float t00 = src_data_00 + src_data_10 + src_data_20 + src_data_30 + src_data_40 + src_data_50 + src_data_60; - float t01 = src_data_01 + src_data_11 + src_data_21 + src_data_31 + src_data_41 + src_data_51 + src_data_61; - float t02 = src_data_02 + src_data_12 + src_data_22 + src_data_32 + src_data_42 + src_data_52 + src_data_62; - float t03 = src_data_03 + src_data_13 + src_data_23 + src_data_33 + src_data_43 + src_data_53 + src_data_63; - float t04 = src_data_04 + src_data_14 + src_data_24 + src_data_34 + src_data_44 + src_data_54 + src_data_64; - float t05 = src_data_05 + src_data_15 + src_data_25 + src_data_35 + src_data_45 + src_data_55 + src_data_65; - float t06 = src_data_06 + src_data_16 + src_data_26 + src_data_36 + src_data_46 + 
src_data_56 + src_data_66; - float t07 = src_data_07 + src_data_17 + src_data_27 + src_data_37 + src_data_47 + src_data_57 + src_data_67; - - const float t10 = 0.5f * d01 + d11 + 1.5f * d21 + src_data_70; - const float t11 = 0.5f * d02 + d12 + 1.5f * d22 + src_data_71; - const float t12 = 0.5f * d03 + d13 + 1.5f * d23 + src_data_72; - const float t13 = 0.5f * d04 + d14 + 1.5f * d24 + src_data_73; - const float t14 = 0.5f * d05 + d15 + 1.5f * d25 + src_data_74; - const float t15 = 0.5f * d06 + d16 + 1.5f * d26 + src_data_75; - const float t16 = 0.5f * d07 + d17 + 1.5f * d27 + src_data_76; - const float t17 = 0.5f * d08 + d18 + 1.5f * d28 + src_data_77; - - float s11 = t01 - t02; - float s12 = t11 - t12; - float s21 = t03 - t04; - float s22 = t13 - t14; - float s31 = t05 - t06; - float s32 = t15 - t16; - - float m00 = t00 + t01 + t02 + t03 + t04 + t05 + t06; - const float m01 = 0.5f * s11 + s21 + 1.5f * s31 + t07; - float m10 = t10 + t11 + t12 + t13 + t14 + t15 + t16; - const float m11 = 0.5f * s12 + s22 + 1.5f * s32 + t17; - - (dst_data + i)[0] = m00 + bias_data[i]; - (dst_data + i + C4NUM)[0] = m01 + bias_data[i]; - (dst_data + i + dst_step * C4NUM)[0] = m10 + bias_data[i]; - (dst_data + i + dst_step * C4NUM + C4NUM)[0] = m11 + bias_data[i]; + float src[64]; + float t[16]; + float m[4]; + for (int i = 0; i < C4NUM; ++i) { + // load source data + for (int j = 0; j < 64; ++j) { + src[j] = src_data[i + j * src_step]; + } + for (int l = 0; l < 8; ++l) { + int offset = l * 8; + t[l] = src[offset] + src[1 + offset] + src[2 + offset] + src[3 + offset] + src[4 + offset] + src[5 + offset] + + src[6 + offset]; + t[l + 8] = src[1 + offset] - src[2 + offset] + 2 * (src[3 + offset] - src[4 + offset]) + + 3 * (src[5 + offset] - src[6 + offset]) + src[7 + offset]; + } + for (int l = 0; l < 2; ++l) { + int offset = l * 8; + m[l] = t[offset] + t[1 + offset] + t[2 + offset] + t[3 + offset] + t[4 + offset] + t[5 + offset] + t[6 + offset]; + m[l + 2] = t[1 + offset] - t[2 + offset] + 
2 * (t[3 + offset] - t[4 + offset]) + + 3 * (t[5 + offset] - t[6 + offset]) + t[7 + offset]; + } + // store output + for (int k = 0; k < 2; ++k) { + int dst_k_offset = k * dst_step * C4NUM; + int m_k_offset = k * 2; + for (int j = 0; j < 2; ++j) { + dst_data[i + dst_k_offset + j * C4NUM] = m[j + m_k_offset] + bias_data[i]; + } + } } #endif } - void OutputTransform8x3Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step) { #ifdef ENABLE_ARM - float32x4_t src_data_00 = vld1q_f32(src_data + 0 * src_step); - float32x4_t src_data_01 = vld1q_f32(src_data + 1 * src_step); - float32x4_t src_data_02 = vld1q_f32(src_data + 2 * src_step); - float32x4_t src_data_03 = vld1q_f32(src_data + 3 * src_step); - float32x4_t src_data_04 = vld1q_f32(src_data + 4 * src_step); - float32x4_t src_data_05 = vld1q_f32(src_data + 5 * src_step); - float32x4_t src_data_06 = vld1q_f32(src_data + 6 * src_step); - float32x4_t src_data_07 = vld1q_f32(src_data + 7 * src_step); - float32x4_t src_data_10 = vld1q_f32(src_data + 8 * src_step); - float32x4_t src_data_11 = vld1q_f32(src_data + 9 * src_step); - float32x4_t src_data_12 = vld1q_f32(src_data + 10 * src_step); - float32x4_t src_data_13 = vld1q_f32(src_data + 11 * src_step); - float32x4_t src_data_14 = vld1q_f32(src_data + 12 * src_step); - float32x4_t src_data_15 = vld1q_f32(src_data + 13 * src_step); - float32x4_t src_data_16 = vld1q_f32(src_data + 14 * src_step); - float32x4_t src_data_17 = vld1q_f32(src_data + 15 * src_step); - float32x4_t src_data_20 = vld1q_f32(src_data + 16 * src_step); - float32x4_t src_data_21 = vld1q_f32(src_data + 17 * src_step); - float32x4_t src_data_22 = vld1q_f32(src_data + 18 * src_step); - float32x4_t src_data_23 = vld1q_f32(src_data + 19 * src_step); - float32x4_t src_data_24 = vld1q_f32(src_data + 20 * src_step); - float32x4_t src_data_25 = vld1q_f32(src_data + 21 * src_step); - float32x4_t src_data_26 = vld1q_f32(src_data + 22 * src_step); - float32x4_t src_data_27 = 
vld1q_f32(src_data + 23 * src_step); - float32x4_t src_data_30 = vld1q_f32(src_data + 24 * src_step); - float32x4_t src_data_31 = vld1q_f32(src_data + 25 * src_step); - float32x4_t src_data_32 = vld1q_f32(src_data + 26 * src_step); - float32x4_t src_data_33 = vld1q_f32(src_data + 27 * src_step); - float32x4_t src_data_34 = vld1q_f32(src_data + 28 * src_step); - float32x4_t src_data_35 = vld1q_f32(src_data + 29 * src_step); - float32x4_t src_data_36 = vld1q_f32(src_data + 30 * src_step); - float32x4_t src_data_37 = vld1q_f32(src_data + 31 * src_step); - float32x4_t src_data_40 = vld1q_f32(src_data + 32 * src_step); - float32x4_t src_data_41 = vld1q_f32(src_data + 33 * src_step); - float32x4_t src_data_42 = vld1q_f32(src_data + 34 * src_step); - float32x4_t src_data_43 = vld1q_f32(src_data + 35 * src_step); - float32x4_t src_data_44 = vld1q_f32(src_data + 36 * src_step); - float32x4_t src_data_45 = vld1q_f32(src_data + 37 * src_step); - float32x4_t src_data_46 = vld1q_f32(src_data + 38 * src_step); - float32x4_t src_data_47 = vld1q_f32(src_data + 39 * src_step); - float32x4_t src_data_50 = vld1q_f32(src_data + 40 * src_step); - float32x4_t src_data_51 = vld1q_f32(src_data + 41 * src_step); - float32x4_t src_data_52 = vld1q_f32(src_data + 42 * src_step); - float32x4_t src_data_53 = vld1q_f32(src_data + 43 * src_step); - float32x4_t src_data_54 = vld1q_f32(src_data + 44 * src_step); - float32x4_t src_data_55 = vld1q_f32(src_data + 45 * src_step); - float32x4_t src_data_56 = vld1q_f32(src_data + 46 * src_step); - float32x4_t src_data_57 = vld1q_f32(src_data + 47 * src_step); - float32x4_t src_data_60 = vld1q_f32(src_data + 48 * src_step); - float32x4_t src_data_61 = vld1q_f32(src_data + 49 * src_step); - float32x4_t src_data_62 = vld1q_f32(src_data + 50 * src_step); - float32x4_t src_data_63 = vld1q_f32(src_data + 51 * src_step); - float32x4_t src_data_64 = vld1q_f32(src_data + 52 * src_step); - float32x4_t src_data_65 = vld1q_f32(src_data + 53 * src_step); - 
float32x4_t src_data_66 = vld1q_f32(src_data + 54 * src_step); - float32x4_t src_data_67 = vld1q_f32(src_data + 55 * src_step); - float32x4_t src_data_70 = vld1q_f32(src_data + 56 * src_step); - float32x4_t src_data_71 = vld1q_f32(src_data + 57 * src_step); - float32x4_t src_data_72 = vld1q_f32(src_data + 58 * src_step); - float32x4_t src_data_73 = vld1q_f32(src_data + 59 * src_step); - float32x4_t src_data_74 = vld1q_f32(src_data + 60 * src_step); - float32x4_t src_data_75 = vld1q_f32(src_data + 61 * src_step); - float32x4_t src_data_76 = vld1q_f32(src_data + 62 * src_step); - float32x4_t src_data_77 = vld1q_f32(src_data + 63 * src_step); - - float32x4_t d01 = vsubq_f32(src_data_10, src_data_20); - float32x4_t d02 = vsubq_f32(src_data_11, src_data_21); - float32x4_t d03 = vsubq_f32(src_data_12, src_data_22); - float32x4_t d04 = vsubq_f32(src_data_13, src_data_23); - float32x4_t d05 = vsubq_f32(src_data_14, src_data_24); - float32x4_t d06 = vsubq_f32(src_data_15, src_data_25); - float32x4_t d07 = vsubq_f32(src_data_16, src_data_26); - float32x4_t d08 = vsubq_f32(src_data_17, src_data_27); - - float32x4_t d11 = vsubq_f32(src_data_30, src_data_40); - float32x4_t d12 = vsubq_f32(src_data_31, src_data_41); - float32x4_t d13 = vsubq_f32(src_data_32, src_data_42); - float32x4_t d14 = vsubq_f32(src_data_33, src_data_43); - float32x4_t d15 = vsubq_f32(src_data_34, src_data_44); - float32x4_t d16 = vsubq_f32(src_data_35, src_data_45); - float32x4_t d17 = vsubq_f32(src_data_36, src_data_46); - float32x4_t d18 = vsubq_f32(src_data_37, src_data_47); - - float32x4_t d21 = vsubq_f32(src_data_50, src_data_60); - float32x4_t d22 = vsubq_f32(src_data_51, src_data_61); - float32x4_t d23 = vsubq_f32(src_data_52, src_data_62); - float32x4_t d24 = vsubq_f32(src_data_53, src_data_63); - float32x4_t d25 = vsubq_f32(src_data_54, src_data_64); - float32x4_t d26 = vsubq_f32(src_data_55, src_data_65); - float32x4_t d27 = vsubq_f32(src_data_56, src_data_66); - float32x4_t d28 = 
vsubq_f32(src_data_57, src_data_67); - - float32x4_t d31 = vaddq_f32(src_data_10, src_data_20); - float32x4_t d32 = vaddq_f32(src_data_11, src_data_21); - float32x4_t d33 = vaddq_f32(src_data_12, src_data_22); - float32x4_t d34 = vaddq_f32(src_data_13, src_data_23); - float32x4_t d35 = vaddq_f32(src_data_14, src_data_24); - float32x4_t d36 = vaddq_f32(src_data_15, src_data_25); - float32x4_t d37 = vaddq_f32(src_data_16, src_data_26); - float32x4_t d38 = vaddq_f32(src_data_17, src_data_27); - - float32x4_t d41 = vaddq_f32(src_data_30, src_data_40); - float32x4_t d42 = vaddq_f32(src_data_31, src_data_41); - float32x4_t d43 = vaddq_f32(src_data_32, src_data_42); - float32x4_t d44 = vaddq_f32(src_data_33, src_data_43); - float32x4_t d45 = vaddq_f32(src_data_34, src_data_44); - float32x4_t d46 = vaddq_f32(src_data_35, src_data_45); - float32x4_t d47 = vaddq_f32(src_data_36, src_data_46); - float32x4_t d48 = vaddq_f32(src_data_37, src_data_47); - - float32x4_t d51 = vaddq_f32(src_data_50, src_data_60); - float32x4_t d52 = vaddq_f32(src_data_51, src_data_61); - float32x4_t d53 = vaddq_f32(src_data_52, src_data_62); - float32x4_t d54 = vaddq_f32(src_data_53, src_data_63); - float32x4_t d55 = vaddq_f32(src_data_54, src_data_64); - float32x4_t d56 = vaddq_f32(src_data_55, src_data_65); - float32x4_t d57 = vaddq_f32(src_data_56, src_data_66); - float32x4_t d58 = vaddq_f32(src_data_57, src_data_67); - - float32x4_t t00 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_00, src_data_10), src_data_20), src_data_30), src_data_40), - src_data_50), - src_data_60); - float32x4_t t01 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_01, src_data_11), src_data_21), src_data_31), src_data_41), - src_data_51), - src_data_61); - float32x4_t t02 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_02, src_data_12), src_data_22), src_data_32), src_data_42), - src_data_52), - src_data_62); - float32x4_t t03 = 
vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_03, src_data_13), src_data_23), src_data_33), src_data_43), - src_data_53), - src_data_63); - float32x4_t t04 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_04, src_data_14), src_data_24), src_data_34), src_data_44), - src_data_54), - src_data_64); - float32x4_t t05 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_05, src_data_15), src_data_25), src_data_35), src_data_45), - src_data_55), - src_data_65); - float32x4_t t06 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_06, src_data_16), src_data_26), src_data_36), src_data_46), - src_data_56), - src_data_66); - float32x4_t t07 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_07, src_data_17), src_data_27), src_data_37), src_data_47), - src_data_57), - src_data_67); - - float32x4_t t10 = vaddq_f32(vaddq_f32(vmulq_n_f32(d01, 0.5), d11), vmulq_n_f32(d21, 1.5)); - float32x4_t t11 = vaddq_f32(vaddq_f32(vmulq_n_f32(d02, 0.5), d12), vmulq_n_f32(d22, 1.5)); - float32x4_t t12 = vaddq_f32(vaddq_f32(vmulq_n_f32(d03, 0.5), d13), vmulq_n_f32(d23, 1.5)); - float32x4_t t13 = vaddq_f32(vaddq_f32(vmulq_n_f32(d04, 0.5), d14), vmulq_n_f32(d24, 1.5)); - float32x4_t t14 = vaddq_f32(vaddq_f32(vmulq_n_f32(d05, 0.5), d15), vmulq_n_f32(d25, 1.5)); - float32x4_t t15 = vaddq_f32(vaddq_f32(vmulq_n_f32(d06, 0.5), d16), vmulq_n_f32(d26, 1.5)); - float32x4_t t16 = vaddq_f32(vaddq_f32(vmulq_n_f32(d07, 0.5), d17), vmulq_n_f32(d27, 1.5)); - float32x4_t t17 = vaddq_f32(vaddq_f32(vmulq_n_f32(d08, 0.5), d18), vmulq_n_f32(d28, 1.5)); - - float32x4_t t20 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d31, 0.25), d41), vmulq_n_f32(d51, 2.25)), src_data_70); - float32x4_t t21 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d32, 0.25), d42), vmulq_n_f32(d52, 2.25)), src_data_71); - float32x4_t t22 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d33, 0.25), d43), vmulq_n_f32(d53, 
2.25)), src_data_72); - float32x4_t t23 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d34, 0.25), d44), vmulq_n_f32(d54, 2.25)), src_data_73); - float32x4_t t24 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d35, 0.25), d45), vmulq_n_f32(d55, 2.25)), src_data_74); - float32x4_t t25 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d36, 0.25), d46), vmulq_n_f32(d56, 2.25)), src_data_75); - float32x4_t t26 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d37, 0.25), d47), vmulq_n_f32(d57, 2.25)), src_data_76); - float32x4_t t27 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d38, 0.25), d48), vmulq_n_f32(d58, 2.25)), src_data_77); - - float32x4_t s11 = vsubq_f32(t01, t02); - float32x4_t s12 = vsubq_f32(t11, t12); - float32x4_t s13 = vsubq_f32(t21, t22); - - float32x4_t s21 = vsubq_f32(t03, t04); - float32x4_t s22 = vsubq_f32(t13, t14); - float32x4_t s23 = vsubq_f32(t23, t24); - - float32x4_t s31 = vsubq_f32(t05, t06); - float32x4_t s32 = vsubq_f32(t15, t16); - float32x4_t s33 = vsubq_f32(t25, t26); - - float32x4_t s41 = vaddq_f32(t01, t02); - float32x4_t s42 = vaddq_f32(t11, t12); - float32x4_t s43 = vaddq_f32(t21, t22); - - float32x4_t s51 = vaddq_f32(t03, t04); - float32x4_t s52 = vaddq_f32(t13, t14); - float32x4_t s53 = vaddq_f32(t23, t24); - - float32x4_t s61 = vaddq_f32(t05, t06); - float32x4_t s62 = vaddq_f32(t15, t16); - float32x4_t s63 = vaddq_f32(t25, t26); - - float32x4_t m00 = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t00, t01), t02), t03), t04), t05), t06); - float32x4_t m01 = vaddq_f32(vaddq_f32(vmulq_n_f32(s11, 0.5), s21), vmulq_n_f32(s31, 1.5)); - float32x4_t m02 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(s41, 0.25), s51), vmulq_n_f32(s61, 2.25)), t07); - - float32x4_t m10 = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t10, t11), t12), t13), t14), t15), t16); - float32x4_t m11 = vaddq_f32(vaddq_f32(vmulq_n_f32(s12, 0.5), s22), vmulq_n_f32(s32, 1.5)); - float32x4_t m12 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(s42, 0.25), s52), 
vmulq_n_f32(s62, 2.25)), t17); - - float32x4_t m20 = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t20, t21), t22), t23), t24), t25), t26); - float32x4_t m21 = vaddq_f32(vaddq_f32(vmulq_n_f32(s13, 0.5), s23), vmulq_n_f32(s33, 1.5)); - float32x4_t m22 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(s43, 0.25), s53), vmulq_n_f32(s63, 2.25)), t27); - + float32x4_t src[64]; + float32x4_t t[24]; + float32x4_t m[9]; + Load64Data; float32x4_t bias_ptr = vld1q_f32(bias_data); - vst1q_f32(dst_data, vaddq_f32(m00, bias_ptr)); - vst1q_f32(dst_data + C4NUM, vaddq_f32(m01, bias_ptr)); - vst1q_f32(dst_data + 2 * C4NUM, vaddq_f32(m02, bias_ptr)); - - vst1q_f32(dst_data + dst_step * C4NUM, vaddq_f32(m10, bias_ptr)); - vst1q_f32(dst_data + dst_step * C4NUM + C4NUM, vaddq_f32(m11, bias_ptr)); - vst1q_f32(dst_data + dst_step * C4NUM + 2 * C4NUM, vaddq_f32(m12, bias_ptr)); - - vst1q_f32(dst_data + 2 * dst_step * C4NUM, vaddq_f32(m20, bias_ptr)); - vst1q_f32(dst_data + 2 * dst_step * C4NUM + C4NUM, vaddq_f32(m21, bias_ptr)); - vst1q_f32(dst_data + 2 * dst_step * C4NUM + 2 * C4NUM, vaddq_f32(m22, bias_ptr)); + for (int l = 0; l < 8; ++l) { + int offset = l * 8; + float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); + float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); + float32x4_t tmp3 = vaddq_f32(src[5 + offset], src[6 + offset]); + float32x4_t tmp4 = vsubq_f32(src[1 + offset], src[2 + offset]); + float32x4_t tmp5 = vsubq_f32(src[3 + offset], src[4 + offset]); + float32x4_t tmp6 = vsubq_f32(src[5 + offset], src[6 + offset]); + t[l] = vaddq_f32(vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2), tmp3); + t[l + 8] = vaddq_f32(vaddq_f32(tmp4, vmulq_n_f32(tmp5, 2)), vmulq_n_f32(tmp6, 3)); + t[l + 16] = vaddq_f32(vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)), vmulq_n_f32(tmp3, 9)), src[7 + offset]); + } + for (int l = 0; l < 3; ++l) { + int offset = l * 8; + float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); + float32x4_t tmp2 = vaddq_f32(t[3 + 
offset], t[4 + offset]); + float32x4_t tmp3 = vaddq_f32(t[5 + offset], t[6 + offset]); + float32x4_t tmp4 = vsubq_f32(t[1 + offset], t[2 + offset]); + float32x4_t tmp5 = vsubq_f32(t[3 + offset], t[4 + offset]); + float32x4_t tmp6 = vsubq_f32(t[5 + offset], t[6 + offset]); + m[l] = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), tmp3), bias_ptr); + m[l + 3] = vaddq_f32(vaddq_f32(vaddq_f32(tmp4, vmulq_n_f32(tmp5, 2)), vmulq_n_f32(tmp6, 3)), bias_ptr); + m[l + 6] = vaddq_f32( + vaddq_f32(vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)), vmulq_n_f32(tmp3, 9)), t[7 + offset]), bias_ptr); + } + Store9Data; #else - for (int i = 0; i < C4NUM; i++) { - float src_data_00 = src_data[i]; - float src_data_01 = src_data[i + src_step]; - float src_data_02 = src_data[i + 2 * src_step]; - float src_data_03 = src_data[i + 3 * src_step]; - float src_data_04 = src_data[i + 4 * src_step]; - float src_data_05 = src_data[i + 5 * src_step]; - float src_data_06 = src_data[i + 6 * src_step]; - float src_data_07 = src_data[i + 7 * src_step]; - float src_data_10 = src_data[i + 8 * src_step]; - float src_data_11 = src_data[i + 9 * src_step]; - float src_data_12 = src_data[i + 10 * src_step]; - float src_data_13 = src_data[i + 11 * src_step]; - float src_data_14 = src_data[i + 12 * src_step]; - float src_data_15 = src_data[i + 13 * src_step]; - float src_data_16 = src_data[i + 14 * src_step]; - float src_data_17 = src_data[i + 15 * src_step]; - float src_data_20 = src_data[i + 16 * src_step]; - float src_data_21 = src_data[i + 17 * src_step]; - float src_data_22 = src_data[i + 18 * src_step]; - float src_data_23 = src_data[i + 19 * src_step]; - float src_data_24 = src_data[i + 20 * src_step]; - float src_data_25 = src_data[i + 21 * src_step]; - float src_data_26 = src_data[i + 22 * src_step]; - float src_data_27 = src_data[i + 23 * src_step]; - float src_data_30 = src_data[i + 24 * src_step]; - float src_data_31 = src_data[i + 25 * src_step]; - float src_data_32 = src_data[i + 
26 * src_step]; - float src_data_33 = src_data[i + 27 * src_step]; - float src_data_34 = src_data[i + 28 * src_step]; - float src_data_35 = src_data[i + 29 * src_step]; - float src_data_36 = src_data[i + 30 * src_step]; - float src_data_37 = src_data[i + 31 * src_step]; - float src_data_40 = src_data[i + 32 * src_step]; - float src_data_41 = src_data[i + 33 * src_step]; - float src_data_42 = src_data[i + 34 * src_step]; - float src_data_43 = src_data[i + 35 * src_step]; - float src_data_44 = src_data[i + 36 * src_step]; - float src_data_45 = src_data[i + 37 * src_step]; - float src_data_46 = src_data[i + 38 * src_step]; - float src_data_47 = src_data[i + 39 * src_step]; - float src_data_50 = src_data[i + 40 * src_step]; - float src_data_51 = src_data[i + 41 * src_step]; - float src_data_52 = src_data[i + 42 * src_step]; - float src_data_53 = src_data[i + 43 * src_step]; - float src_data_54 = src_data[i + 44 * src_step]; - float src_data_55 = src_data[i + 45 * src_step]; - float src_data_56 = src_data[i + 46 * src_step]; - float src_data_57 = src_data[i + 47 * src_step]; - float src_data_60 = src_data[i + 48 * src_step]; - float src_data_61 = src_data[i + 49 * src_step]; - float src_data_62 = src_data[i + 50 * src_step]; - float src_data_63 = src_data[i + 51 * src_step]; - float src_data_64 = src_data[i + 52 * src_step]; - float src_data_65 = src_data[i + 53 * src_step]; - float src_data_66 = src_data[i + 54 * src_step]; - float src_data_67 = src_data[i + 55 * src_step]; - float src_data_70 = src_data[i + 56 * src_step]; - float src_data_71 = src_data[i + 57 * src_step]; - float src_data_72 = src_data[i + 58 * src_step]; - float src_data_73 = src_data[i + 59 * src_step]; - float src_data_74 = src_data[i + 60 * src_step]; - float src_data_75 = src_data[i + 61 * src_step]; - float src_data_76 = src_data[i + 62 * src_step]; - float src_data_77 = src_data[i + 63 * src_step]; - - float d01 = src_data_10 - src_data_20; - float d02 = src_data_11 - src_data_21; - float d03 
= src_data_12 - src_data_22; - float d04 = src_data_13 - src_data_23; - float d05 = src_data_14 - src_data_24; - float d06 = src_data_15 - src_data_25; - float d07 = src_data_16 - src_data_26; - float d08 = src_data_17 - src_data_27; - - float d11 = src_data_30 - src_data_40; - float d12 = src_data_31 - src_data_41; - float d13 = src_data_32 - src_data_42; - float d14 = src_data_33 - src_data_43; - float d15 = src_data_34 - src_data_44; - float d16 = src_data_35 - src_data_45; - float d17 = src_data_36 - src_data_46; - float d18 = src_data_37 - src_data_47; - - float d21 = src_data_50 - src_data_60; - float d22 = src_data_51 - src_data_61; - float d23 = src_data_52 - src_data_62; - float d24 = src_data_53 - src_data_63; - float d25 = src_data_54 - src_data_64; - float d26 = src_data_55 - src_data_65; - float d27 = src_data_56 - src_data_66; - float d28 = src_data_57 - src_data_67; - - float d31 = src_data_10 + src_data_20; - float d32 = src_data_11 + src_data_21; - float d33 = src_data_12 + src_data_22; - float d34 = src_data_13 + src_data_23; - float d35 = src_data_14 + src_data_24; - float d36 = src_data_15 + src_data_25; - float d37 = src_data_16 + src_data_26; - float d38 = src_data_17 + src_data_27; - - float d41 = src_data_30 + src_data_40; - float d42 = src_data_31 + src_data_41; - float d43 = src_data_32 + src_data_42; - float d44 = src_data_33 + src_data_43; - float d45 = src_data_34 + src_data_44; - float d46 = src_data_35 + src_data_45; - float d47 = src_data_36 + src_data_46; - float d48 = src_data_37 + src_data_47; - - float d51 = src_data_50 + src_data_60; - float d52 = src_data_51 + src_data_61; - float d53 = src_data_52 + src_data_62; - float d54 = src_data_53 + src_data_63; - float d55 = src_data_54 + src_data_64; - float d56 = src_data_55 + src_data_65; - float d57 = src_data_56 + src_data_66; - float d58 = src_data_57 + src_data_67; - - float t00 = src_data_00 + src_data_10 + src_data_20 + src_data_30 + src_data_40 + src_data_50 + src_data_60; - 
float t01 = src_data_01 + src_data_11 + src_data_21 + src_data_31 + src_data_41 + src_data_51 + src_data_61; - float t02 = src_data_02 + src_data_12 + src_data_22 + src_data_32 + src_data_42 + src_data_52 + src_data_62; - float t03 = src_data_03 + src_data_13 + src_data_23 + src_data_33 + src_data_43 + src_data_53 + src_data_63; - float t04 = src_data_04 + src_data_14 + src_data_24 + src_data_34 + src_data_44 + src_data_54 + src_data_64; - float t05 = src_data_05 + src_data_15 + src_data_25 + src_data_35 + src_data_45 + src_data_55 + src_data_65; - float t06 = src_data_06 + src_data_16 + src_data_26 + src_data_36 + src_data_46 + src_data_56 + src_data_66; - float t07 = src_data_07 + src_data_17 + src_data_27 + src_data_37 + src_data_47 + src_data_57 + src_data_67; - - const float t10 = 0.5f * d01 + d11 + 1.5f * d21; - const float t11 = 0.5f * d02 + d12 + 1.5f * d22; - const float t12 = 0.5f * d03 + d13 + 1.5f * d23; - const float t13 = 0.5f * d04 + d14 + 1.5f * d24; - const float t14 = 0.5f * d05 + d15 + 1.5f * d25; - const float t15 = 0.5f * d06 + d16 + 1.5f * d26; - const float t16 = 0.5f * d07 + d17 + 1.5f * d27; - const float t17 = 0.5f * d08 + d18 + 1.5f * d28; - - const float t20 = 0.25f * d31 + d41 + 2.25f * d51 + src_data_70; - const float t21 = 0.25f * d32 + d42 + 2.25f * d52 + src_data_71; - const float t22 = 0.25f * d33 + d43 + 2.25f * d53 + src_data_72; - const float t23 = 0.25f * d34 + d44 + 2.25f * d54 + src_data_73; - const float t24 = 0.25f * d35 + d45 + 2.25f * d55 + src_data_74; - const float t25 = 0.25f * d36 + d46 + 2.25f * d56 + src_data_75; - const float t26 = 0.25f * d37 + d47 + 2.25f * d57 + src_data_76; - const float t27 = 0.25f * d38 + d48 + 2.25f * d58 + src_data_77; - - float s11 = t01 - t02; - float s12 = t11 - t12; - float s13 = t21 - t22; - - float s21 = t03 - t04; - float s22 = t13 - t14; - float s23 = t23 - t24; - - float s31 = t05 - t06; - float s32 = t15 - t16; - float s33 = t25 - t26; - - float s41 = t01 + t02; - float s42 = t11 
+ t12; - float s43 = t21 + t22; - - float s51 = t03 + t04; - float s52 = t13 + t14; - float s53 = t23 + t24; - - float s61 = t05 + t06; - float s62 = t15 + t16; - float s63 = t25 + t26; - - float m00 = t00 + t01 + t02 + t03 + t04 + t05 + t06; - const float m01 = 0.5f * s11 + s21 + 1.5f * s31; - const float m02 = 0.25f * s41 + s51 + 2.25f * s61 + t07; - - float m10 = t10 + t11 + t12 + t13 + t14 + t15 + t16; - const float m11 = 0.5f * s12 + s22 + 1.5f * s32; - const float m12 = 0.25f * s42 + s52 + 2.25f * s62 + t17; - - float m20 = t20 + t21 + t22 + t23 + t24 + t25 + t26; - const float m21 = 0.5f * s13 + s23 + 1.5f * s33; - const float m22 = 0.25f * s43 + s53 + 2.25f * s63 + t27; - - (dst_data + i)[0] = m00 + bias_data[i]; - (dst_data + i + C4NUM)[0] = m01 + bias_data[i]; - (dst_data + i + 2 * C4NUM)[0] = m02 + bias_data[i]; - - (dst_data + i + dst_step * C4NUM)[0] = m10 + bias_data[i]; - (dst_data + i + dst_step * C4NUM + C4NUM)[0] = m11 + bias_data[i]; - (dst_data + i + dst_step * C4NUM + 2 * C4NUM)[0] = m12 + bias_data[i]; - - (dst_data + i + 2 * dst_step * C4NUM)[0] = m20 + bias_data[i]; - (dst_data + i + 2 * dst_step * C4NUM + C4NUM)[0] = m21 + bias_data[i]; - (dst_data + i + 2 * dst_step * C4NUM + 2 * C4NUM)[0] = m22 + bias_data[i]; + float src[64]; + float t[24]; + float m[9]; + for (int i = 0; i < C4NUM; ++i) { + // load source data + for (int j = 0; j < 64; ++j) { + src[j] = src_data[i + j * src_step]; + } + for (int l = 0; l < 8; ++l) { + int offset = l * 8; + t[l] = src[offset] + src[1 + offset] + src[2 + offset] + src[3 + offset] + src[4 + offset] + src[5 + offset] + + src[6 + offset]; + t[l + 8] = src[1 + offset] - src[2 + offset] + 2 * (src[3 + offset] - src[4 + offset]) + + 3 * (src[5 + offset] - src[6 + offset]); + t[l + 16] = src[1 + offset] + src[2 + offset] + 4 * (src[3 + offset] + src[4 + offset]) + + 9 * (src[5 + offset] + src[6 + offset]) + src[7 + offset]; + } + for (int l = 0; l < 3; ++l) { + int offset = l * 8; + m[l] = t[offset] + t[1 + 
offset] + t[2 + offset] + t[3 + offset] + t[4 + offset] + t[5 + offset] + t[6 + offset]; + m[l + 3] = + t[1 + offset] - t[2 + offset] + 2 * (t[3 + offset] - t[4 + offset]) + 3 * (t[5 + offset] - t[6 + offset]); + m[l + 6] = t[1 + offset] + t[2 + offset] + 4 * (t[3 + offset] + t[4 + offset]) + + 9 * (t[5 + offset] + t[6 + offset]) + t[7 + offset]; + } + // store output + for (int k = 0; k < 3; ++k) { + int dst_k_offset = k * dst_step * C4NUM; + int m_k_offset = k * 3; + for (int j = 0; j < 3; ++j) { + dst_data[i + dst_k_offset + j * C4NUM] = m[j + m_k_offset] + bias_data[i]; + } + } } #endif } - void OutputTransform8x4Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step) { #ifdef ENABLE_ARM - float32x4_t src_data_00 = vld1q_f32(src_data + 0 * src_step); - float32x4_t src_data_01 = vld1q_f32(src_data + 1 * src_step); - float32x4_t src_data_02 = vld1q_f32(src_data + 2 * src_step); - float32x4_t src_data_03 = vld1q_f32(src_data + 3 * src_step); - float32x4_t src_data_04 = vld1q_f32(src_data + 4 * src_step); - float32x4_t src_data_05 = vld1q_f32(src_data + 5 * src_step); - float32x4_t src_data_06 = vld1q_f32(src_data + 6 * src_step); - float32x4_t src_data_07 = vld1q_f32(src_data + 7 * src_step); - float32x4_t src_data_10 = vld1q_f32(src_data + 8 * src_step); - float32x4_t src_data_11 = vld1q_f32(src_data + 9 * src_step); - float32x4_t src_data_12 = vld1q_f32(src_data + 10 * src_step); - float32x4_t src_data_13 = vld1q_f32(src_data + 11 * src_step); - float32x4_t src_data_14 = vld1q_f32(src_data + 12 * src_step); - float32x4_t src_data_15 = vld1q_f32(src_data + 13 * src_step); - float32x4_t src_data_16 = vld1q_f32(src_data + 14 * src_step); - float32x4_t src_data_17 = vld1q_f32(src_data + 15 * src_step); - float32x4_t src_data_20 = vld1q_f32(src_data + 16 * src_step); - float32x4_t src_data_21 = vld1q_f32(src_data + 17 * src_step); - float32x4_t src_data_22 = vld1q_f32(src_data + 18 * src_step); - float32x4_t src_data_23 = 
vld1q_f32(src_data + 19 * src_step); - float32x4_t src_data_24 = vld1q_f32(src_data + 20 * src_step); - float32x4_t src_data_25 = vld1q_f32(src_data + 21 * src_step); - float32x4_t src_data_26 = vld1q_f32(src_data + 22 * src_step); - float32x4_t src_data_27 = vld1q_f32(src_data + 23 * src_step); - float32x4_t src_data_30 = vld1q_f32(src_data + 24 * src_step); - float32x4_t src_data_31 = vld1q_f32(src_data + 25 * src_step); - float32x4_t src_data_32 = vld1q_f32(src_data + 26 * src_step); - float32x4_t src_data_33 = vld1q_f32(src_data + 27 * src_step); - float32x4_t src_data_34 = vld1q_f32(src_data + 28 * src_step); - float32x4_t src_data_35 = vld1q_f32(src_data + 29 * src_step); - float32x4_t src_data_36 = vld1q_f32(src_data + 30 * src_step); - float32x4_t src_data_37 = vld1q_f32(src_data + 31 * src_step); - float32x4_t src_data_40 = vld1q_f32(src_data + 32 * src_step); - float32x4_t src_data_41 = vld1q_f32(src_data + 33 * src_step); - float32x4_t src_data_42 = vld1q_f32(src_data + 34 * src_step); - float32x4_t src_data_43 = vld1q_f32(src_data + 35 * src_step); - float32x4_t src_data_44 = vld1q_f32(src_data + 36 * src_step); - float32x4_t src_data_45 = vld1q_f32(src_data + 37 * src_step); - float32x4_t src_data_46 = vld1q_f32(src_data + 38 * src_step); - float32x4_t src_data_47 = vld1q_f32(src_data + 39 * src_step); - float32x4_t src_data_50 = vld1q_f32(src_data + 40 * src_step); - float32x4_t src_data_51 = vld1q_f32(src_data + 41 * src_step); - float32x4_t src_data_52 = vld1q_f32(src_data + 42 * src_step); - float32x4_t src_data_53 = vld1q_f32(src_data + 43 * src_step); - float32x4_t src_data_54 = vld1q_f32(src_data + 44 * src_step); - float32x4_t src_data_55 = vld1q_f32(src_data + 45 * src_step); - float32x4_t src_data_56 = vld1q_f32(src_data + 46 * src_step); - float32x4_t src_data_57 = vld1q_f32(src_data + 47 * src_step); - float32x4_t src_data_60 = vld1q_f32(src_data + 48 * src_step); - float32x4_t src_data_61 = vld1q_f32(src_data + 49 * src_step); - 
float32x4_t src_data_62 = vld1q_f32(src_data + 50 * src_step); - float32x4_t src_data_63 = vld1q_f32(src_data + 51 * src_step); - float32x4_t src_data_64 = vld1q_f32(src_data + 52 * src_step); - float32x4_t src_data_65 = vld1q_f32(src_data + 53 * src_step); - float32x4_t src_data_66 = vld1q_f32(src_data + 54 * src_step); - float32x4_t src_data_67 = vld1q_f32(src_data + 55 * src_step); - float32x4_t src_data_70 = vld1q_f32(src_data + 56 * src_step); - float32x4_t src_data_71 = vld1q_f32(src_data + 57 * src_step); - float32x4_t src_data_72 = vld1q_f32(src_data + 58 * src_step); - float32x4_t src_data_73 = vld1q_f32(src_data + 59 * src_step); - float32x4_t src_data_74 = vld1q_f32(src_data + 60 * src_step); - float32x4_t src_data_75 = vld1q_f32(src_data + 61 * src_step); - float32x4_t src_data_76 = vld1q_f32(src_data + 62 * src_step); - float32x4_t src_data_77 = vld1q_f32(src_data + 63 * src_step); - - float32x4_t d01 = vsubq_f32(src_data_10, src_data_20); - float32x4_t d02 = vsubq_f32(src_data_11, src_data_21); - float32x4_t d03 = vsubq_f32(src_data_12, src_data_22); - float32x4_t d04 = vsubq_f32(src_data_13, src_data_23); - float32x4_t d05 = vsubq_f32(src_data_14, src_data_24); - float32x4_t d06 = vsubq_f32(src_data_15, src_data_25); - float32x4_t d07 = vsubq_f32(src_data_16, src_data_26); - float32x4_t d08 = vsubq_f32(src_data_17, src_data_27); - - float32x4_t d11 = vsubq_f32(src_data_30, src_data_40); - float32x4_t d12 = vsubq_f32(src_data_31, src_data_41); - float32x4_t d13 = vsubq_f32(src_data_32, src_data_42); - float32x4_t d14 = vsubq_f32(src_data_33, src_data_43); - float32x4_t d15 = vsubq_f32(src_data_34, src_data_44); - float32x4_t d16 = vsubq_f32(src_data_35, src_data_45); - float32x4_t d17 = vsubq_f32(src_data_36, src_data_46); - float32x4_t d18 = vsubq_f32(src_data_37, src_data_47); - - float32x4_t d21 = vsubq_f32(src_data_50, src_data_60); - float32x4_t d22 = vsubq_f32(src_data_51, src_data_61); - float32x4_t d23 = vsubq_f32(src_data_52, src_data_62); - 
float32x4_t d24 = vsubq_f32(src_data_53, src_data_63); - float32x4_t d25 = vsubq_f32(src_data_54, src_data_64); - float32x4_t d26 = vsubq_f32(src_data_55, src_data_65); - float32x4_t d27 = vsubq_f32(src_data_56, src_data_66); - float32x4_t d28 = vsubq_f32(src_data_57, src_data_67); - - float32x4_t d31 = vaddq_f32(src_data_10, src_data_20); - float32x4_t d32 = vaddq_f32(src_data_11, src_data_21); - float32x4_t d33 = vaddq_f32(src_data_12, src_data_22); - float32x4_t d34 = vaddq_f32(src_data_13, src_data_23); - float32x4_t d35 = vaddq_f32(src_data_14, src_data_24); - float32x4_t d36 = vaddq_f32(src_data_15, src_data_25); - float32x4_t d37 = vaddq_f32(src_data_16, src_data_26); - float32x4_t d38 = vaddq_f32(src_data_17, src_data_27); - - float32x4_t d41 = vaddq_f32(src_data_30, src_data_40); - float32x4_t d42 = vaddq_f32(src_data_31, src_data_41); - float32x4_t d43 = vaddq_f32(src_data_32, src_data_42); - float32x4_t d44 = vaddq_f32(src_data_33, src_data_43); - float32x4_t d45 = vaddq_f32(src_data_34, src_data_44); - float32x4_t d46 = vaddq_f32(src_data_35, src_data_45); - float32x4_t d47 = vaddq_f32(src_data_36, src_data_46); - float32x4_t d48 = vaddq_f32(src_data_37, src_data_47); - - float32x4_t d51 = vaddq_f32(src_data_50, src_data_60); - float32x4_t d52 = vaddq_f32(src_data_51, src_data_61); - float32x4_t d53 = vaddq_f32(src_data_52, src_data_62); - float32x4_t d54 = vaddq_f32(src_data_53, src_data_63); - float32x4_t d55 = vaddq_f32(src_data_54, src_data_64); - float32x4_t d56 = vaddq_f32(src_data_55, src_data_65); - float32x4_t d57 = vaddq_f32(src_data_56, src_data_66); - float32x4_t d58 = vaddq_f32(src_data_57, src_data_67); - - float32x4_t t00 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_00, src_data_10), src_data_20), src_data_30), src_data_40), - src_data_50), - src_data_60); - float32x4_t t01 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_01, src_data_11), src_data_21), src_data_31), 
src_data_41), - src_data_51), - src_data_61); - float32x4_t t02 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_02, src_data_12), src_data_22), src_data_32), src_data_42), - src_data_52), - src_data_62); - float32x4_t t03 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_03, src_data_13), src_data_23), src_data_33), src_data_43), - src_data_53), - src_data_63); - float32x4_t t04 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_04, src_data_14), src_data_24), src_data_34), src_data_44), - src_data_54), - src_data_64); - float32x4_t t05 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_05, src_data_15), src_data_25), src_data_35), src_data_45), - src_data_55), - src_data_65); - float32x4_t t06 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_06, src_data_16), src_data_26), src_data_36), src_data_46), - src_data_56), - src_data_66); - float32x4_t t07 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_07, src_data_17), src_data_27), src_data_37), src_data_47), - src_data_57), - src_data_67); - - float32x4_t t10 = vaddq_f32(vaddq_f32(vmulq_n_f32(d01, 0.5), d11), vmulq_n_f32(d21, 1.5)); - float32x4_t t11 = vaddq_f32(vaddq_f32(vmulq_n_f32(d02, 0.5), d12), vmulq_n_f32(d22, 1.5)); - float32x4_t t12 = vaddq_f32(vaddq_f32(vmulq_n_f32(d03, 0.5), d13), vmulq_n_f32(d23, 1.5)); - float32x4_t t13 = vaddq_f32(vaddq_f32(vmulq_n_f32(d04, 0.5), d14), vmulq_n_f32(d24, 1.5)); - float32x4_t t14 = vaddq_f32(vaddq_f32(vmulq_n_f32(d05, 0.5), d15), vmulq_n_f32(d25, 1.5)); - float32x4_t t15 = vaddq_f32(vaddq_f32(vmulq_n_f32(d06, 0.5), d16), vmulq_n_f32(d26, 1.5)); - float32x4_t t16 = vaddq_f32(vaddq_f32(vmulq_n_f32(d07, 0.5), d17), vmulq_n_f32(d27, 1.5)); - float32x4_t t17 = vaddq_f32(vaddq_f32(vmulq_n_f32(d08, 0.5), d18), vmulq_n_f32(d28, 1.5)); - - float32x4_t t20 = vaddq_f32(vaddq_f32(vmulq_n_f32(d31, 0.25), d41), 
vmulq_n_f32(d51, 2.25)); - float32x4_t t21 = vaddq_f32(vaddq_f32(vmulq_n_f32(d32, 0.25), d42), vmulq_n_f32(d52, 2.25)); - float32x4_t t22 = vaddq_f32(vaddq_f32(vmulq_n_f32(d33, 0.25), d43), vmulq_n_f32(d53, 2.25)); - float32x4_t t23 = vaddq_f32(vaddq_f32(vmulq_n_f32(d34, 0.25), d44), vmulq_n_f32(d54, 2.25)); - float32x4_t t24 = vaddq_f32(vaddq_f32(vmulq_n_f32(d35, 0.25), d45), vmulq_n_f32(d55, 2.25)); - float32x4_t t25 = vaddq_f32(vaddq_f32(vmulq_n_f32(d36, 0.25), d46), vmulq_n_f32(d56, 2.25)); - float32x4_t t26 = vaddq_f32(vaddq_f32(vmulq_n_f32(d37, 0.25), d47), vmulq_n_f32(d57, 2.25)); - float32x4_t t27 = vaddq_f32(vaddq_f32(vmulq_n_f32(d38, 0.25), d48), vmulq_n_f32(d58, 2.25)); - - float32x4_t t30 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d01, 0.125), d11), vmulq_n_f32(d21, 3.375)), src_data_70); - float32x4_t t31 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d02, 0.125), d12), vmulq_n_f32(d22, 3.375)), src_data_71); - float32x4_t t32 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d03, 0.125), d13), vmulq_n_f32(d23, 3.375)), src_data_72); - float32x4_t t33 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d04, 0.125), d14), vmulq_n_f32(d24, 3.375)), src_data_73); - float32x4_t t34 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d05, 0.125), d15), vmulq_n_f32(d25, 3.375)), src_data_74); - float32x4_t t35 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d06, 0.125), d16), vmulq_n_f32(d26, 3.375)), src_data_75); - float32x4_t t36 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d07, 0.125), d17), vmulq_n_f32(d27, 3.375)), src_data_76); - float32x4_t t37 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d08, 0.125), d18), vmulq_n_f32(d28, 3.375)), src_data_77); - - float32x4_t s11 = vsubq_f32(t01, t02); - float32x4_t s12 = vsubq_f32(t11, t12); - float32x4_t s13 = vsubq_f32(t21, t22); - float32x4_t s14 = vsubq_f32(t31, t32); - - float32x4_t s21 = vsubq_f32(t03, t04); - float32x4_t s22 = vsubq_f32(t13, t14); - float32x4_t s23 = vsubq_f32(t23, t24); - float32x4_t s24 = vsubq_f32(t33, t34); - - 
float32x4_t s31 = vsubq_f32(t05, t06); - float32x4_t s32 = vsubq_f32(t15, t16); - float32x4_t s33 = vsubq_f32(t25, t26); - float32x4_t s34 = vsubq_f32(t35, t36); - - float32x4_t s41 = vaddq_f32(t01, t02); - float32x4_t s42 = vaddq_f32(t11, t12); - float32x4_t s43 = vaddq_f32(t21, t22); - float32x4_t s44 = vaddq_f32(t31, t32); - - float32x4_t s51 = vaddq_f32(t03, t04); - float32x4_t s52 = vaddq_f32(t13, t14); - float32x4_t s53 = vaddq_f32(t23, t24); - float32x4_t s54 = vaddq_f32(t33, t34); - - float32x4_t s61 = vaddq_f32(t05, t06); - float32x4_t s62 = vaddq_f32(t15, t16); - float32x4_t s63 = vaddq_f32(t25, t26); - float32x4_t s64 = vaddq_f32(t35, t36); - - float32x4_t m00 = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t00, t01), t02), t03), t04), t05), t06); - float32x4_t m01 = vaddq_f32(vaddq_f32(vmulq_n_f32(s11, 0.5), s21), vmulq_n_f32(s31, 1.5)); - float32x4_t m02 = vaddq_f32(vaddq_f32(vmulq_n_f32(s41, 0.25), s51), vmulq_n_f32(s61, 2.25)); - float32x4_t m03 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(s11, 0.125), s21), vmulq_n_f32(s31, 3.375)), t07); - - float32x4_t m10 = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t10, t11), t12), t13), t14), t15), t16); - float32x4_t m11 = vaddq_f32(vaddq_f32(vmulq_n_f32(s12, 0.5), s22), vmulq_n_f32(s32, 1.5)); - float32x4_t m12 = vaddq_f32(vaddq_f32(vmulq_n_f32(s42, 0.25), s52), vmulq_n_f32(s62, 2.25)); - float32x4_t m13 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(s12, 0.125), s22), vmulq_n_f32(s32, 3.375)), t17); - - float32x4_t m20 = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t20, t21), t22), t23), t24), t25), t26); - float32x4_t m21 = vaddq_f32(vaddq_f32(vmulq_n_f32(s13, 0.5), s23), vmulq_n_f32(s33, 1.5)); - float32x4_t m22 = vaddq_f32(vaddq_f32(vmulq_n_f32(s43, 0.25), s53), vmulq_n_f32(s63, 2.25)); - float32x4_t m23 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(s13, 0.125), s23), vmulq_n_f32(s33, 3.375)), t27); - - float32x4_t m30 = 
vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t30, t31), t32), t33), t34), t35), t36); - float32x4_t m31 = vaddq_f32(vaddq_f32(vmulq_n_f32(s14, 0.5), s24), vmulq_n_f32(s34, 1.5)); - float32x4_t m32 = vaddq_f32(vaddq_f32(vmulq_n_f32(s44, 0.25), s54), vmulq_n_f32(s64, 2.25)); - float32x4_t m33 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(s14, 0.125), s24), vmulq_n_f32(s34, 3.375)), t37); - + float32x4_t src[64]; + float32x4_t t[32]; + float32x4_t m[16]; + Load64Data; float32x4_t bias_ptr = vld1q_f32(bias_data); - vst1q_f32(dst_data, vaddq_f32(m00, bias_ptr)); - vst1q_f32(dst_data + C4NUM, vaddq_f32(m01, bias_ptr)); - vst1q_f32(dst_data + 2 * C4NUM, vaddq_f32(m02, bias_ptr)); - vst1q_f32(dst_data + 3 * C4NUM, vaddq_f32(m03, bias_ptr)); - - vst1q_f32(dst_data + dst_step * C4NUM, vaddq_f32(m10, bias_ptr)); - vst1q_f32(dst_data + dst_step * C4NUM + C4NUM, vaddq_f32(m11, bias_ptr)); - vst1q_f32(dst_data + dst_step * C4NUM + 2 * C4NUM, vaddq_f32(m12, bias_ptr)); - vst1q_f32(dst_data + dst_step * C4NUM + 3 * C4NUM, vaddq_f32(m13, bias_ptr)); - - vst1q_f32(dst_data + 2 * dst_step * C4NUM, vaddq_f32(m20, bias_ptr)); - vst1q_f32(dst_data + 2 * dst_step * C4NUM + C4NUM, vaddq_f32(m21, bias_ptr)); - vst1q_f32(dst_data + 2 * dst_step * C4NUM + 2 * C4NUM, vaddq_f32(m22, bias_ptr)); - vst1q_f32(dst_data + 2 * dst_step * C4NUM + 3 * C4NUM, vaddq_f32(m23, bias_ptr)); - - vst1q_f32(dst_data + 3 * dst_step * C4NUM, vaddq_f32(m30, bias_ptr)); - vst1q_f32(dst_data + 3 * dst_step * C4NUM + C4NUM, vaddq_f32(m31, bias_ptr)); - vst1q_f32(dst_data + 3 * dst_step * C4NUM + 2 * C4NUM, vaddq_f32(m32, bias_ptr)); - vst1q_f32(dst_data + 3 * dst_step * C4NUM + 3 * C4NUM, vaddq_f32(m33, bias_ptr)); + for (int l = 0; l < 8; ++l) { + int offset = l * 8; + float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); + float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); + float32x4_t tmp3 = vaddq_f32(src[5 + offset], src[6 + offset]); + float32x4_t tmp4 = vsubq_f32(src[1 + 
offset], src[2 + offset]); + float32x4_t tmp5 = vsubq_f32(src[3 + offset], src[4 + offset]); + float32x4_t tmp6 = vsubq_f32(src[5 + offset], src[6 + offset]); + t[l] = vaddq_f32(vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2), tmp3); + t[l + 8] = vaddq_f32(vaddq_f32(tmp4, vmulq_n_f32(tmp5, 2)), vmulq_n_f32(tmp6, 3)); + t[l + 16] = vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)), vmulq_n_f32(tmp3, 9)); + t[l + 24] = vaddq_f32(vaddq_f32(vaddq_f32(tmp4, vmulq_n_f32(tmp5, 8)), vmulq_n_f32(tmp6, 27)), src[7 + offset]); + } + for (int l = 0; l < 4; ++l) { + int offset = l * 8; + float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); + float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); + float32x4_t tmp3 = vaddq_f32(t[5 + offset], t[6 + offset]); + float32x4_t tmp4 = vsubq_f32(t[1 + offset], t[2 + offset]); + float32x4_t tmp5 = vsubq_f32(t[3 + offset], t[4 + offset]); + float32x4_t tmp6 = vsubq_f32(t[5 + offset], t[6 + offset]); + m[l] = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), tmp3), bias_ptr); + m[l + 4] = vaddq_f32(vaddq_f32(vaddq_f32(tmp4, vmulq_n_f32(tmp5, 2)), vmulq_n_f32(tmp6, 3)), bias_ptr); + m[l + 8] = vaddq_f32(vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)), vmulq_n_f32(tmp3, 9)), bias_ptr); + m[l + 12] = vaddq_f32( + vaddq_f32(vaddq_f32(vaddq_f32(tmp4, vmulq_n_f32(tmp5, 8)), vmulq_n_f32(tmp6, 27)), t[7 + offset]), bias_ptr); + } + Store16Data; #else - for (int i = 0; i < C4NUM; i++) { - float src_data_00 = src_data[i]; - float src_data_01 = src_data[i + src_step]; - float src_data_02 = src_data[i + 2 * src_step]; - float src_data_03 = src_data[i + 3 * src_step]; - float src_data_04 = src_data[i + 4 * src_step]; - float src_data_05 = src_data[i + 5 * src_step]; - float src_data_06 = src_data[i + 6 * src_step]; - float src_data_07 = src_data[i + 7 * src_step]; - float src_data_10 = src_data[i + 8 * src_step]; - float src_data_11 = src_data[i + 9 * src_step]; - float src_data_12 = src_data[i + 10 * src_step]; - float 
src_data_13 = src_data[i + 11 * src_step]; - float src_data_14 = src_data[i + 12 * src_step]; - float src_data_15 = src_data[i + 13 * src_step]; - float src_data_16 = src_data[i + 14 * src_step]; - float src_data_17 = src_data[i + 15 * src_step]; - float src_data_20 = src_data[i + 16 * src_step]; - float src_data_21 = src_data[i + 17 * src_step]; - float src_data_22 = src_data[i + 18 * src_step]; - float src_data_23 = src_data[i + 19 * src_step]; - float src_data_24 = src_data[i + 20 * src_step]; - float src_data_25 = src_data[i + 21 * src_step]; - float src_data_26 = src_data[i + 22 * src_step]; - float src_data_27 = src_data[i + 23 * src_step]; - float src_data_30 = src_data[i + 24 * src_step]; - float src_data_31 = src_data[i + 25 * src_step]; - float src_data_32 = src_data[i + 26 * src_step]; - float src_data_33 = src_data[i + 27 * src_step]; - float src_data_34 = src_data[i + 28 * src_step]; - float src_data_35 = src_data[i + 29 * src_step]; - float src_data_36 = src_data[i + 30 * src_step]; - float src_data_37 = src_data[i + 31 * src_step]; - float src_data_40 = src_data[i + 32 * src_step]; - float src_data_41 = src_data[i + 33 * src_step]; - float src_data_42 = src_data[i + 34 * src_step]; - float src_data_43 = src_data[i + 35 * src_step]; - float src_data_44 = src_data[i + 36 * src_step]; - float src_data_45 = src_data[i + 37 * src_step]; - float src_data_46 = src_data[i + 38 * src_step]; - float src_data_47 = src_data[i + 39 * src_step]; - float src_data_50 = src_data[i + 40 * src_step]; - float src_data_51 = src_data[i + 41 * src_step]; - float src_data_52 = src_data[i + 42 * src_step]; - float src_data_53 = src_data[i + 43 * src_step]; - float src_data_54 = src_data[i + 44 * src_step]; - float src_data_55 = src_data[i + 45 * src_step]; - float src_data_56 = src_data[i + 46 * src_step]; - float src_data_57 = src_data[i + 47 * src_step]; - float src_data_60 = src_data[i + 48 * src_step]; - float src_data_61 = src_data[i + 49 * src_step]; - float 
src_data_62 = src_data[i + 50 * src_step]; - float src_data_63 = src_data[i + 51 * src_step]; - float src_data_64 = src_data[i + 52 * src_step]; - float src_data_65 = src_data[i + 53 * src_step]; - float src_data_66 = src_data[i + 54 * src_step]; - float src_data_67 = src_data[i + 55 * src_step]; - float src_data_70 = src_data[i + 56 * src_step]; - float src_data_71 = src_data[i + 57 * src_step]; - float src_data_72 = src_data[i + 58 * src_step]; - float src_data_73 = src_data[i + 59 * src_step]; - float src_data_74 = src_data[i + 60 * src_step]; - float src_data_75 = src_data[i + 61 * src_step]; - float src_data_76 = src_data[i + 62 * src_step]; - float src_data_77 = src_data[i + 63 * src_step]; - - float d01 = src_data_10 - src_data_20; - float d02 = src_data_11 - src_data_21; - float d03 = src_data_12 - src_data_22; - float d04 = src_data_13 - src_data_23; - float d05 = src_data_14 - src_data_24; - float d06 = src_data_15 - src_data_25; - float d07 = src_data_16 - src_data_26; - float d08 = src_data_17 - src_data_27; - - float d11 = src_data_30 - src_data_40; - float d12 = src_data_31 - src_data_41; - float d13 = src_data_32 - src_data_42; - float d14 = src_data_33 - src_data_43; - float d15 = src_data_34 - src_data_44; - float d16 = src_data_35 - src_data_45; - float d17 = src_data_36 - src_data_46; - float d18 = src_data_37 - src_data_47; - - float d21 = src_data_50 - src_data_60; - float d22 = src_data_51 - src_data_61; - float d23 = src_data_52 - src_data_62; - float d24 = src_data_53 - src_data_63; - float d25 = src_data_54 - src_data_64; - float d26 = src_data_55 - src_data_65; - float d27 = src_data_56 - src_data_66; - float d28 = src_data_57 - src_data_67; - - float d31 = src_data_10 + src_data_20; - float d32 = src_data_11 + src_data_21; - float d33 = src_data_12 + src_data_22; - float d34 = src_data_13 + src_data_23; - float d35 = src_data_14 + src_data_24; - float d36 = src_data_15 + src_data_25; - float d37 = src_data_16 + src_data_26; - float d38 = 
src_data_17 + src_data_27; - - float d41 = src_data_30 + src_data_40; - float d42 = src_data_31 + src_data_41; - float d43 = src_data_32 + src_data_42; - float d44 = src_data_33 + src_data_43; - float d45 = src_data_34 + src_data_44; - float d46 = src_data_35 + src_data_45; - float d47 = src_data_36 + src_data_46; - float d48 = src_data_37 + src_data_47; - - float d51 = src_data_50 + src_data_60; - float d52 = src_data_51 + src_data_61; - float d53 = src_data_52 + src_data_62; - float d54 = src_data_53 + src_data_63; - float d55 = src_data_54 + src_data_64; - float d56 = src_data_55 + src_data_65; - float d57 = src_data_56 + src_data_66; - float d58 = src_data_57 + src_data_67; - - float t00 = src_data_00 + src_data_10 + src_data_20 + src_data_30 + src_data_40 + src_data_50 + src_data_60; - float t01 = src_data_01 + src_data_11 + src_data_21 + src_data_31 + src_data_41 + src_data_51 + src_data_61; - float t02 = src_data_02 + src_data_12 + src_data_22 + src_data_32 + src_data_42 + src_data_52 + src_data_62; - float t03 = src_data_03 + src_data_13 + src_data_23 + src_data_33 + src_data_43 + src_data_53 + src_data_63; - float t04 = src_data_04 + src_data_14 + src_data_24 + src_data_34 + src_data_44 + src_data_54 + src_data_64; - float t05 = src_data_05 + src_data_15 + src_data_25 + src_data_35 + src_data_45 + src_data_55 + src_data_65; - float t06 = src_data_06 + src_data_16 + src_data_26 + src_data_36 + src_data_46 + src_data_56 + src_data_66; - float t07 = src_data_07 + src_data_17 + src_data_27 + src_data_37 + src_data_47 + src_data_57 + src_data_67; - - const float t10 = 0.5f * d01 + d11 + 1.5f * d21; - const float t11 = 0.5f * d02 + d12 + 1.5f * d22; - const float t12 = 0.5f * d03 + d13 + 1.5f * d23; - const float t13 = 0.5f * d04 + d14 + 1.5f * d24; - const float t14 = 0.5f * d05 + d15 + 1.5f * d25; - const float t15 = 0.5f * d06 + d16 + 1.5f * d26; - const float t16 = 0.5f * d07 + d17 + 1.5f * d27; - const float t17 = 0.5f * d08 + d18 + 1.5f * d28; - - const 
float t20 = 0.25f * d31 + d41 + 2.25f * d51; - const float t21 = 0.25f * d32 + d42 + 2.25f * d52; - const float t22 = 0.25f * d33 + d43 + 2.25f * d53; - const float t23 = 0.25f * d34 + d44 + 2.25f * d54; - const float t24 = 0.25f * d35 + d45 + 2.25f * d55; - const float t25 = 0.25f * d36 + d46 + 2.25f * d56; - const float t26 = 0.25f * d37 + d47 + 2.25f * d57; - const float t27 = 0.25f * d38 + d48 + 2.25f * d58; - - const float t30 = 0.125f * d01 + d11 + 3.375f * d21 + src_data_70; - const float t31 = 0.125f * d02 + d12 + 3.375f * d22 + src_data_71; - const float t32 = 0.125f * d03 + d13 + 3.375f * d23 + src_data_72; - const float t33 = 0.125f * d04 + d14 + 3.375f * d24 + src_data_73; - const float t34 = 0.125f * d05 + d15 + 3.375f * d25 + src_data_74; - const float t35 = 0.125f * d06 + d16 + 3.375f * d26 + src_data_75; - const float t36 = 0.125f * d07 + d17 + 3.375f * d27 + src_data_76; - const float t37 = 0.125f * d08 + d18 + 3.375f * d28 + src_data_77; - - float s11 = t01 - t02; - float s12 = t11 - t12; - float s13 = t21 - t22; - float s14 = t31 - t32; - - float s21 = t03 - t04; - float s22 = t13 - t14; - float s23 = t23 - t24; - float s24 = t33 - t34; - - float s31 = t05 - t06; - float s32 = t15 - t16; - float s33 = t25 - t26; - float s34 = t35 - t36; - - float s41 = t01 + t02; - float s42 = t11 + t12; - float s43 = t21 + t22; - float s44 = t31 + t32; - - float s51 = t03 + t04; - float s52 = t13 + t14; - float s53 = t23 + t24; - float s54 = t33 + t34; - - float s61 = t05 + t06; - float s62 = t15 + t16; - float s63 = t25 + t26; - float s64 = t35 + t36; - - float m00 = t00 + t01 + t02 + t03 + t04 + t05 + t06; - const float m01 = 0.5f * s11 + s21 + 1.5f * s31; - const float m02 = 0.25f * s41 + s51 + 2.25f * s61; - const float m03 = 0.125f * s11 + s21 + 3.375f * s31 + t07; - - float m10 = t10 + t11 + t12 + t13 + t14 + t15 + t16; - const float m11 = 0.5f * s12 + s22 + 1.5f * s32; - const float m12 = 0.25f * s42 + s52 + 2.25f * s62; - const float m13 = 0.125f * s12 + 
s22 + 3.375f * s32 + t17; - - float m20 = t20 + t21 + t22 + t23 + t24 + t25 + t26; - const float m21 = 0.5f * s13 + s23 + 1.5f * s33; - const float m22 = 0.25f * s43 + s53 + 2.25f * s63; - const float m23 = 0.125f * s13 + s23 + 3.375f * s33 + t27; - - float m30 = t30 + t31 + t32 + t33 + t34 + t35 + t36; - const float m31 = 0.5f * s14 + s24 + 1.5f * s34; - const float m32 = 0.25f * s44 + s54 + 2.25f * s64; - const float m33 = 0.125f * s14 + s24 + 3.375f * s34 + t37; - - (dst_data + i)[0] = m00 + bias_data[i]; - (dst_data + i + C4NUM)[0] = m01 + bias_data[i]; - (dst_data + i + 2 * C4NUM)[0] = m02 + bias_data[i]; - (dst_data + i + 3 * C4NUM)[0] = m03 + bias_data[i]; - - (dst_data + i + dst_step * C4NUM)[0] = m10 + bias_data[i]; - (dst_data + i + dst_step * C4NUM + C4NUM)[0] = m11 + bias_data[i]; - (dst_data + i + dst_step * C4NUM + 2 * C4NUM)[0] = m12 + bias_data[i]; - (dst_data + i + dst_step * C4NUM + 3 * C4NUM)[0] = m13 + bias_data[i]; - - (dst_data + i + 2 * dst_step * C4NUM)[0] = m20 + bias_data[i]; - (dst_data + i + 2 * dst_step * C4NUM + C4NUM)[0] = m21 + bias_data[i]; - (dst_data + i + 2 * dst_step * C4NUM + 2 * C4NUM)[0] = m22 + bias_data[i]; - (dst_data + i + 2 * dst_step * C4NUM + 3 * C4NUM)[0] = m23 + bias_data[i]; - - (dst_data + i + 3 * dst_step * C4NUM)[0] = m30 + bias_data[i]; - (dst_data + i + 3 * dst_step * C4NUM + C4NUM)[0] = m31 + bias_data[i]; - (dst_data + i + 3 * dst_step * C4NUM + 2 * C4NUM)[0] = m32 + bias_data[i]; - (dst_data + i + 3 * dst_step * C4NUM + 3 * C4NUM)[0] = m33 + bias_data[i]; + float src[64]; + float t[32]; + float m[16]; + for (int i = 0; i < C4NUM; ++i) { + // load source data + for (int j = 0; j < 64; ++j) { + src[j] = src_data[i + j * src_step]; + } + for (int l = 0; l < 8; ++l) { + int offset = l * 8; + t[l] = src[offset] + src[1 + offset] + src[2 + offset] + src[3 + offset] + src[4 + offset] + src[5 + offset] + + src[6 + offset]; + t[l + 8] = src[1 + offset] - src[2 + offset] + 2 * (src[3 + offset] - src[4 + offset]) + + 3 
* (src[5 + offset] - src[6 + offset]); + t[l + 16] = src[1 + offset] + src[2 + offset] + 4 * (src[3 + offset] + src[4 + offset]) + + 9 * (src[5 + offset] + src[6 + offset]); + t[l + 24] = src[1 + offset] - src[2 + offset] + 8 * (src[3 + offset] - src[4 + offset]) + + 27 * (src[5 + offset] - src[6 + offset]) + src[7 + offset]; + } + for (int l = 0; l < 4; ++l) { + int offset = l * 8; + m[l] = t[offset] + t[1 + offset] + t[2 + offset] + t[3 + offset] + t[4 + offset] + t[5 + offset] + t[6 + offset]; + m[l + 4] = + t[1 + offset] - t[2 + offset] + 2 * (t[3 + offset] - t[4 + offset]) + 3 * (t[5 + offset] - t[6 + offset]); + m[l + 8] = + t[1 + offset] + t[2 + offset] + 4 * (t[3 + offset] + t[4 + offset]) + 9 * (t[5 + offset] + t[6 + offset]); + m[l + 12] = t[1 + offset] - t[2 + offset] + 8 * (t[3 + offset] - t[4 + offset]) + + 27 * (t[5 + offset] - t[6 + offset]) + t[7 + offset]; + } + // store output + for (int k = 0; k < 4; ++k) { + int dst_k_offset = k * dst_step * C4NUM; + int m_k_offset = k * 4; + for (int j = 0; j < 4; ++j) { + dst_data[i + dst_k_offset + j * C4NUM] = m[j + m_k_offset] + bias_data[i]; + } + } } #endif } - void OutputTransform8x5Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step) { #ifdef ENABLE_ARM - float32x4_t src_data_00 = vld1q_f32(src_data + 0 * src_step); - float32x4_t src_data_01 = vld1q_f32(src_data + 1 * src_step); - float32x4_t src_data_02 = vld1q_f32(src_data + 2 * src_step); - float32x4_t src_data_03 = vld1q_f32(src_data + 3 * src_step); - float32x4_t src_data_04 = vld1q_f32(src_data + 4 * src_step); - float32x4_t src_data_05 = vld1q_f32(src_data + 5 * src_step); - float32x4_t src_data_06 = vld1q_f32(src_data + 6 * src_step); - float32x4_t src_data_07 = vld1q_f32(src_data + 7 * src_step); - float32x4_t src_data_10 = vld1q_f32(src_data + 8 * src_step); - float32x4_t src_data_11 = vld1q_f32(src_data + 9 * src_step); - float32x4_t src_data_12 = vld1q_f32(src_data + 10 * src_step); - float32x4_t 
src_data_13 = vld1q_f32(src_data + 11 * src_step); - float32x4_t src_data_14 = vld1q_f32(src_data + 12 * src_step); - float32x4_t src_data_15 = vld1q_f32(src_data + 13 * src_step); - float32x4_t src_data_16 = vld1q_f32(src_data + 14 * src_step); - float32x4_t src_data_17 = vld1q_f32(src_data + 15 * src_step); - float32x4_t src_data_20 = vld1q_f32(src_data + 16 * src_step); - float32x4_t src_data_21 = vld1q_f32(src_data + 17 * src_step); - float32x4_t src_data_22 = vld1q_f32(src_data + 18 * src_step); - float32x4_t src_data_23 = vld1q_f32(src_data + 19 * src_step); - float32x4_t src_data_24 = vld1q_f32(src_data + 20 * src_step); - float32x4_t src_data_25 = vld1q_f32(src_data + 21 * src_step); - float32x4_t src_data_26 = vld1q_f32(src_data + 22 * src_step); - float32x4_t src_data_27 = vld1q_f32(src_data + 23 * src_step); - float32x4_t src_data_30 = vld1q_f32(src_data + 24 * src_step); - float32x4_t src_data_31 = vld1q_f32(src_data + 25 * src_step); - float32x4_t src_data_32 = vld1q_f32(src_data + 26 * src_step); - float32x4_t src_data_33 = vld1q_f32(src_data + 27 * src_step); - float32x4_t src_data_34 = vld1q_f32(src_data + 28 * src_step); - float32x4_t src_data_35 = vld1q_f32(src_data + 29 * src_step); - float32x4_t src_data_36 = vld1q_f32(src_data + 30 * src_step); - float32x4_t src_data_37 = vld1q_f32(src_data + 31 * src_step); - float32x4_t src_data_40 = vld1q_f32(src_data + 32 * src_step); - float32x4_t src_data_41 = vld1q_f32(src_data + 33 * src_step); - float32x4_t src_data_42 = vld1q_f32(src_data + 34 * src_step); - float32x4_t src_data_43 = vld1q_f32(src_data + 35 * src_step); - float32x4_t src_data_44 = vld1q_f32(src_data + 36 * src_step); - float32x4_t src_data_45 = vld1q_f32(src_data + 37 * src_step); - float32x4_t src_data_46 = vld1q_f32(src_data + 38 * src_step); - float32x4_t src_data_47 = vld1q_f32(src_data + 39 * src_step); - float32x4_t src_data_50 = vld1q_f32(src_data + 40 * src_step); - float32x4_t src_data_51 = vld1q_f32(src_data + 41 * 
src_step); - float32x4_t src_data_52 = vld1q_f32(src_data + 42 * src_step); - float32x4_t src_data_53 = vld1q_f32(src_data + 43 * src_step); - float32x4_t src_data_54 = vld1q_f32(src_data + 44 * src_step); - float32x4_t src_data_55 = vld1q_f32(src_data + 45 * src_step); - float32x4_t src_data_56 = vld1q_f32(src_data + 46 * src_step); - float32x4_t src_data_57 = vld1q_f32(src_data + 47 * src_step); - float32x4_t src_data_60 = vld1q_f32(src_data + 48 * src_step); - float32x4_t src_data_61 = vld1q_f32(src_data + 49 * src_step); - float32x4_t src_data_62 = vld1q_f32(src_data + 50 * src_step); - float32x4_t src_data_63 = vld1q_f32(src_data + 51 * src_step); - float32x4_t src_data_64 = vld1q_f32(src_data + 52 * src_step); - float32x4_t src_data_65 = vld1q_f32(src_data + 53 * src_step); - float32x4_t src_data_66 = vld1q_f32(src_data + 54 * src_step); - float32x4_t src_data_67 = vld1q_f32(src_data + 55 * src_step); - float32x4_t src_data_70 = vld1q_f32(src_data + 56 * src_step); - float32x4_t src_data_71 = vld1q_f32(src_data + 57 * src_step); - float32x4_t src_data_72 = vld1q_f32(src_data + 58 * src_step); - float32x4_t src_data_73 = vld1q_f32(src_data + 59 * src_step); - float32x4_t src_data_74 = vld1q_f32(src_data + 60 * src_step); - float32x4_t src_data_75 = vld1q_f32(src_data + 61 * src_step); - float32x4_t src_data_76 = vld1q_f32(src_data + 62 * src_step); - float32x4_t src_data_77 = vld1q_f32(src_data + 63 * src_step); - - float32x4_t d01 = vsubq_f32(src_data_10, src_data_20); - float32x4_t d02 = vsubq_f32(src_data_11, src_data_21); - float32x4_t d03 = vsubq_f32(src_data_12, src_data_22); - float32x4_t d04 = vsubq_f32(src_data_13, src_data_23); - float32x4_t d05 = vsubq_f32(src_data_14, src_data_24); - float32x4_t d06 = vsubq_f32(src_data_15, src_data_25); - float32x4_t d07 = vsubq_f32(src_data_16, src_data_26); - float32x4_t d08 = vsubq_f32(src_data_17, src_data_27); - - float32x4_t d11 = vsubq_f32(src_data_30, src_data_40); - float32x4_t d12 = 
vsubq_f32(src_data_31, src_data_41); - float32x4_t d13 = vsubq_f32(src_data_32, src_data_42); - float32x4_t d14 = vsubq_f32(src_data_33, src_data_43); - float32x4_t d15 = vsubq_f32(src_data_34, src_data_44); - float32x4_t d16 = vsubq_f32(src_data_35, src_data_45); - float32x4_t d17 = vsubq_f32(src_data_36, src_data_46); - float32x4_t d18 = vsubq_f32(src_data_37, src_data_47); - - float32x4_t d21 = vsubq_f32(src_data_50, src_data_60); - float32x4_t d22 = vsubq_f32(src_data_51, src_data_61); - float32x4_t d23 = vsubq_f32(src_data_52, src_data_62); - float32x4_t d24 = vsubq_f32(src_data_53, src_data_63); - float32x4_t d25 = vsubq_f32(src_data_54, src_data_64); - float32x4_t d26 = vsubq_f32(src_data_55, src_data_65); - float32x4_t d27 = vsubq_f32(src_data_56, src_data_66); - float32x4_t d28 = vsubq_f32(src_data_57, src_data_67); - - float32x4_t d31 = vaddq_f32(src_data_10, src_data_20); - float32x4_t d32 = vaddq_f32(src_data_11, src_data_21); - float32x4_t d33 = vaddq_f32(src_data_12, src_data_22); - float32x4_t d34 = vaddq_f32(src_data_13, src_data_23); - float32x4_t d35 = vaddq_f32(src_data_14, src_data_24); - float32x4_t d36 = vaddq_f32(src_data_15, src_data_25); - float32x4_t d37 = vaddq_f32(src_data_16, src_data_26); - float32x4_t d38 = vaddq_f32(src_data_17, src_data_27); - - float32x4_t d41 = vaddq_f32(src_data_30, src_data_40); - float32x4_t d42 = vaddq_f32(src_data_31, src_data_41); - float32x4_t d43 = vaddq_f32(src_data_32, src_data_42); - float32x4_t d44 = vaddq_f32(src_data_33, src_data_43); - float32x4_t d45 = vaddq_f32(src_data_34, src_data_44); - float32x4_t d46 = vaddq_f32(src_data_35, src_data_45); - float32x4_t d47 = vaddq_f32(src_data_36, src_data_46); - float32x4_t d48 = vaddq_f32(src_data_37, src_data_47); - - float32x4_t d51 = vaddq_f32(src_data_50, src_data_60); - float32x4_t d52 = vaddq_f32(src_data_51, src_data_61); - float32x4_t d53 = vaddq_f32(src_data_52, src_data_62); - float32x4_t d54 = vaddq_f32(src_data_53, src_data_63); - float32x4_t 
d55 = vaddq_f32(src_data_54, src_data_64); - float32x4_t d56 = vaddq_f32(src_data_55, src_data_65); - float32x4_t d57 = vaddq_f32(src_data_56, src_data_66); - float32x4_t d58 = vaddq_f32(src_data_57, src_data_67); - - float32x4_t t00 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_00, src_data_10), src_data_20), src_data_30), src_data_40), - src_data_50), - src_data_60); - float32x4_t t01 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_01, src_data_11), src_data_21), src_data_31), src_data_41), - src_data_51), - src_data_61); - float32x4_t t02 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_02, src_data_12), src_data_22), src_data_32), src_data_42), - src_data_52), - src_data_62); - float32x4_t t03 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_03, src_data_13), src_data_23), src_data_33), src_data_43), - src_data_53), - src_data_63); - float32x4_t t04 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_04, src_data_14), src_data_24), src_data_34), src_data_44), - src_data_54), - src_data_64); - float32x4_t t05 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_05, src_data_15), src_data_25), src_data_35), src_data_45), - src_data_55), - src_data_65); - float32x4_t t06 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_06, src_data_16), src_data_26), src_data_36), src_data_46), - src_data_56), - src_data_66); - float32x4_t t07 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_07, src_data_17), src_data_27), src_data_37), src_data_47), - src_data_57), - src_data_67); - - float32x4_t t10 = vaddq_f32(vaddq_f32(vmulq_n_f32(d01, 0.5), d11), vmulq_n_f32(d21, 1.5)); - float32x4_t t11 = vaddq_f32(vaddq_f32(vmulq_n_f32(d02, 0.5), d12), vmulq_n_f32(d22, 1.5)); - float32x4_t t12 = vaddq_f32(vaddq_f32(vmulq_n_f32(d03, 0.5), d13), vmulq_n_f32(d23, 1.5)); - 
float32x4_t t13 = vaddq_f32(vaddq_f32(vmulq_n_f32(d04, 0.5), d14), vmulq_n_f32(d24, 1.5)); - float32x4_t t14 = vaddq_f32(vaddq_f32(vmulq_n_f32(d05, 0.5), d15), vmulq_n_f32(d25, 1.5)); - float32x4_t t15 = vaddq_f32(vaddq_f32(vmulq_n_f32(d06, 0.5), d16), vmulq_n_f32(d26, 1.5)); - float32x4_t t16 = vaddq_f32(vaddq_f32(vmulq_n_f32(d07, 0.5), d17), vmulq_n_f32(d27, 1.5)); - float32x4_t t17 = vaddq_f32(vaddq_f32(vmulq_n_f32(d08, 0.5), d18), vmulq_n_f32(d28, 1.5)); - - float32x4_t t20 = vaddq_f32(vaddq_f32(vmulq_n_f32(d31, 0.25), d41), vmulq_n_f32(d51, 2.25)); - float32x4_t t21 = vaddq_f32(vaddq_f32(vmulq_n_f32(d32, 0.25), d42), vmulq_n_f32(d52, 2.25)); - float32x4_t t22 = vaddq_f32(vaddq_f32(vmulq_n_f32(d33, 0.25), d43), vmulq_n_f32(d53, 2.25)); - float32x4_t t23 = vaddq_f32(vaddq_f32(vmulq_n_f32(d34, 0.25), d44), vmulq_n_f32(d54, 2.25)); - float32x4_t t24 = vaddq_f32(vaddq_f32(vmulq_n_f32(d35, 0.25), d45), vmulq_n_f32(d55, 2.25)); - float32x4_t t25 = vaddq_f32(vaddq_f32(vmulq_n_f32(d36, 0.25), d46), vmulq_n_f32(d56, 2.25)); - float32x4_t t26 = vaddq_f32(vaddq_f32(vmulq_n_f32(d37, 0.25), d47), vmulq_n_f32(d57, 2.25)); - float32x4_t t27 = vaddq_f32(vaddq_f32(vmulq_n_f32(d38, 0.25), d48), vmulq_n_f32(d58, 2.25)); - - float32x4_t t30 = vaddq_f32(vaddq_f32(vmulq_n_f32(d01, 0.125), d11), vmulq_n_f32(d21, 3.375)); - float32x4_t t31 = vaddq_f32(vaddq_f32(vmulq_n_f32(d02, 0.125), d12), vmulq_n_f32(d22, 3.375)); - float32x4_t t32 = vaddq_f32(vaddq_f32(vmulq_n_f32(d03, 0.125), d13), vmulq_n_f32(d23, 3.375)); - float32x4_t t33 = vaddq_f32(vaddq_f32(vmulq_n_f32(d04, 0.125), d14), vmulq_n_f32(d24, 3.375)); - float32x4_t t34 = vaddq_f32(vaddq_f32(vmulq_n_f32(d05, 0.125), d15), vmulq_n_f32(d25, 3.375)); - float32x4_t t35 = vaddq_f32(vaddq_f32(vmulq_n_f32(d06, 0.125), d16), vmulq_n_f32(d26, 3.375)); - float32x4_t t36 = vaddq_f32(vaddq_f32(vmulq_n_f32(d07, 0.125), d17), vmulq_n_f32(d27, 3.375)); - float32x4_t t37 = vaddq_f32(vaddq_f32(vmulq_n_f32(d08, 0.125), d18), vmulq_n_f32(d28, 
3.375)); - - float32x4_t t40 = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d31, 0.0625), d41), vmulq_n_f32(d51, 5.0625)), src_data_70); - float32x4_t t41 = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d32, 0.0625), d42), vmulq_n_f32(d52, 5.0625)), src_data_71); - float32x4_t t42 = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d33, 0.0625), d43), vmulq_n_f32(d53, 5.0625)), src_data_72); - float32x4_t t43 = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d34, 0.0625), d44), vmulq_n_f32(d54, 5.0625)), src_data_73); - float32x4_t t44 = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d35, 0.0625), d45), vmulq_n_f32(d55, 5.0625)), src_data_74); - float32x4_t t45 = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d36, 0.0625), d46), vmulq_n_f32(d56, 5.0625)), src_data_75); - float32x4_t t46 = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d37, 0.0625), d47), vmulq_n_f32(d57, 5.0625)), src_data_76); - float32x4_t t47 = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d38, 0.0625), d48), vmulq_n_f32(d58, 5.0625)), src_data_77); - - float32x4_t s11 = vsubq_f32(t01, t02); - float32x4_t s12 = vsubq_f32(t11, t12); - float32x4_t s13 = vsubq_f32(t21, t22); - float32x4_t s14 = vsubq_f32(t31, t32); - float32x4_t s15 = vsubq_f32(t41, t42); - - float32x4_t s21 = vsubq_f32(t03, t04); - float32x4_t s22 = vsubq_f32(t13, t14); - float32x4_t s23 = vsubq_f32(t23, t24); - float32x4_t s24 = vsubq_f32(t33, t34); - float32x4_t s25 = vsubq_f32(t43, t44); - - float32x4_t s31 = vsubq_f32(t05, t06); - float32x4_t s32 = vsubq_f32(t15, t16); - float32x4_t s33 = vsubq_f32(t25, t26); - float32x4_t s34 = vsubq_f32(t35, t36); - float32x4_t s35 = vsubq_f32(t45, t46); - - float32x4_t s41 = vaddq_f32(t01, t02); - float32x4_t s42 = vaddq_f32(t11, t12); - float32x4_t s43 = vaddq_f32(t21, t22); - float32x4_t s44 = vaddq_f32(t31, t32); - float32x4_t s45 = vaddq_f32(t41, t42); - - float32x4_t s51 = vaddq_f32(t03, t04); - float32x4_t s52 = vaddq_f32(t13, t14); - float32x4_t s53 = vaddq_f32(t23, t24); - float32x4_t s54 = vaddq_f32(t33, 
t34); - float32x4_t s55 = vaddq_f32(t43, t44); - - float32x4_t s61 = vaddq_f32(t05, t06); - float32x4_t s62 = vaddq_f32(t15, t16); - float32x4_t s63 = vaddq_f32(t25, t26); - float32x4_t s64 = vaddq_f32(t35, t36); - float32x4_t s65 = vaddq_f32(t45, t46); - - float32x4_t m00 = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t00, t01), t02), t03), t04), t05), t06); - float32x4_t m01 = vaddq_f32(vaddq_f32(vmulq_n_f32(s11, 0.5), s21), vmulq_n_f32(s31, 1.5)); - float32x4_t m02 = vaddq_f32(vaddq_f32(vmulq_n_f32(s41, 0.25), s51), vmulq_n_f32(s61, 2.25)); - float32x4_t m03 = vaddq_f32(vaddq_f32(vmulq_n_f32(s11, 0.125), s21), vmulq_n_f32(s31, 3.375)); - float32x4_t m04 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(s41, 0.0625), s51), vmulq_n_f32(s61, 5.0625)), t07); - - float32x4_t m10 = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t10, t11), t12), t13), t14), t15), t16); - float32x4_t m11 = vaddq_f32(vaddq_f32(vmulq_n_f32(s12, 0.5), s22), vmulq_n_f32(s32, 1.5)); - float32x4_t m12 = vaddq_f32(vaddq_f32(vmulq_n_f32(s42, 0.25), s52), vmulq_n_f32(s62, 2.25)); - float32x4_t m13 = vaddq_f32(vaddq_f32(vmulq_n_f32(s12, 0.125), s22), vmulq_n_f32(s32, 3.375)); - float32x4_t m14 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(s42, 0.0625), s52), vmulq_n_f32(s62, 5.0625)), t17); - - float32x4_t m20 = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t20, t21), t22), t23), t24), t25), t26); - float32x4_t m21 = vaddq_f32(vaddq_f32(vmulq_n_f32(s13, 0.5), s23), vmulq_n_f32(s33, 1.5)); - float32x4_t m22 = vaddq_f32(vaddq_f32(vmulq_n_f32(s43, 0.25), s53), vmulq_n_f32(s63, 2.25)); - float32x4_t m23 = vaddq_f32(vaddq_f32(vmulq_n_f32(s13, 0.125), s23), vmulq_n_f32(s33, 3.375)); - float32x4_t m24 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(s43, 0.0625), s53), vmulq_n_f32(s63, 5.0625)), t27); - - float32x4_t m30 = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t30, t31), t32), t33), t34), t35), t36); - float32x4_t m31 = 
vaddq_f32(vaddq_f32(vmulq_n_f32(s14, 0.5), s24), vmulq_n_f32(s34, 1.5)); - float32x4_t m32 = vaddq_f32(vaddq_f32(vmulq_n_f32(s44, 0.25), s54), vmulq_n_f32(s64, 2.25)); - float32x4_t m33 = vaddq_f32(vaddq_f32(vmulq_n_f32(s14, 0.125), s24), vmulq_n_f32(s34, 3.375)); - float32x4_t m34 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(s44, 0.0625), s54), vmulq_n_f32(s64, 5.0625)), t37); - - float32x4_t m40 = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t40, t41), t42), t43), t44), t45), t46); - float32x4_t m41 = vaddq_f32(vaddq_f32(vmulq_n_f32(s15, 0.5), s25), vmulq_n_f32(s35, 1.5)); - float32x4_t m42 = vaddq_f32(vaddq_f32(vmulq_n_f32(s45, 0.25), s55), vmulq_n_f32(s65, 2.25)); - float32x4_t m43 = vaddq_f32(vaddq_f32(vmulq_n_f32(s15, 0.125), s25), vmulq_n_f32(s35, 3.375)); - float32x4_t m44 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(s45, 0.0625), s55), vmulq_n_f32(s65, 5.0625)), t47); - + float32x4_t src[64]; + float32x4_t t[40]; + float32x4_t m[25]; + Load64Data; float32x4_t bias_ptr = vld1q_f32(bias_data); - vst1q_f32(dst_data, vaddq_f32(m00, bias_ptr)); - vst1q_f32(dst_data + C4NUM, vaddq_f32(m01, bias_ptr)); - vst1q_f32(dst_data + 2 * C4NUM, vaddq_f32(m02, bias_ptr)); - vst1q_f32(dst_data + 3 * C4NUM, vaddq_f32(m03, bias_ptr)); - vst1q_f32(dst_data + 4 * C4NUM, vaddq_f32(m04, bias_ptr)); - - vst1q_f32(dst_data + dst_step * C4NUM, vaddq_f32(m10, bias_ptr)); - vst1q_f32(dst_data + dst_step * C4NUM + C4NUM, vaddq_f32(m11, bias_ptr)); - vst1q_f32(dst_data + dst_step * C4NUM + 2 * C4NUM, vaddq_f32(m12, bias_ptr)); - vst1q_f32(dst_data + dst_step * C4NUM + 3 * C4NUM, vaddq_f32(m13, bias_ptr)); - vst1q_f32(dst_data + dst_step * C4NUM + 4 * C4NUM, vaddq_f32(m14, bias_ptr)); - - vst1q_f32(dst_data + 2 * dst_step * C4NUM, vaddq_f32(m20, bias_ptr)); - vst1q_f32(dst_data + 2 * dst_step * C4NUM + C4NUM, vaddq_f32(m21, bias_ptr)); - vst1q_f32(dst_data + 2 * dst_step * C4NUM + 2 * C4NUM, vaddq_f32(m22, bias_ptr)); - vst1q_f32(dst_data + 2 * dst_step * C4NUM + 3 * C4NUM, 
vaddq_f32(m23, bias_ptr)); - vst1q_f32(dst_data + 2 * dst_step * C4NUM + 4 * C4NUM, vaddq_f32(m24, bias_ptr)); - - vst1q_f32(dst_data + 3 * dst_step * C4NUM, vaddq_f32(m30, bias_ptr)); - vst1q_f32(dst_data + 3 * dst_step * C4NUM + C4NUM, vaddq_f32(m31, bias_ptr)); - vst1q_f32(dst_data + 3 * dst_step * C4NUM + 2 * C4NUM, vaddq_f32(m32, bias_ptr)); - vst1q_f32(dst_data + 3 * dst_step * C4NUM + 3 * C4NUM, vaddq_f32(m33, bias_ptr)); - vst1q_f32(dst_data + 3 * dst_step * C4NUM + 4 * C4NUM, vaddq_f32(m34, bias_ptr)); - - vst1q_f32(dst_data + 4 * dst_step * C4NUM, vaddq_f32(m40, bias_ptr)); - vst1q_f32(dst_data + 4 * dst_step * C4NUM + C4NUM, vaddq_f32(m41, bias_ptr)); - vst1q_f32(dst_data + 4 * dst_step * C4NUM + 2 * C4NUM, vaddq_f32(m42, bias_ptr)); - vst1q_f32(dst_data + 4 * dst_step * C4NUM + 3 * C4NUM, vaddq_f32(m43, bias_ptr)); - vst1q_f32(dst_data + 4 * dst_step * C4NUM + 4 * C4NUM, vaddq_f32(m44, bias_ptr)); + for (int l = 0; l < 8; ++l) { + int offset = l * 8; + float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); + float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); + float32x4_t tmp3 = vaddq_f32(src[5 + offset], src[6 + offset]); + float32x4_t tmp4 = vsubq_f32(src[1 + offset], src[2 + offset]); + float32x4_t tmp5 = vsubq_f32(src[3 + offset], src[4 + offset]); + float32x4_t tmp6 = vsubq_f32(src[5 + offset], src[6 + offset]); + t[l] = vaddq_f32(vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2), tmp3); + t[l + 8] = vaddq_f32(vaddq_f32(tmp4, vmulq_n_f32(tmp5, 2)), vmulq_n_f32(tmp6, 3)); + t[l + 16] = vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)), vmulq_n_f32(tmp3, 9)); + t[l + 24] = vaddq_f32(vaddq_f32(tmp4, vmulq_n_f32(tmp5, 8)), vmulq_n_f32(tmp6, 27)); + t[l + 32] = vaddq_f32(vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 16)), vmulq_n_f32(tmp3, 81)), src[7 + offset]); + } + for (int l = 0; l < 5; ++l) { + int offset = l * 8; + float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); + float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + 
offset]); + float32x4_t tmp3 = vaddq_f32(t[5 + offset], t[6 + offset]); + float32x4_t tmp4 = vsubq_f32(t[1 + offset], t[2 + offset]); + float32x4_t tmp5 = vsubq_f32(t[3 + offset], t[4 + offset]); + float32x4_t tmp6 = vsubq_f32(t[5 + offset], t[6 + offset]); + m[l] = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), tmp3), bias_ptr); + m[l + 5] = vaddq_f32(vaddq_f32(vaddq_f32(tmp4, vmulq_n_f32(tmp5, 2)), vmulq_n_f32(tmp6, 3)), bias_ptr); + m[l + 10] = vaddq_f32(vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)), vmulq_n_f32(tmp3, 9)), bias_ptr); + m[l + 15] = vaddq_f32(vaddq_f32(vaddq_f32(tmp4, vmulq_n_f32(tmp5, 8)), vmulq_n_f32(tmp6, 27)), bias_ptr); + m[l + 20] = vaddq_f32( + vaddq_f32(vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 16)), vmulq_n_f32(tmp3, 81)), t[7 + offset]), bias_ptr); + } + Store25Data; #else - for (int i = 0; i < C4NUM; i++) { - float src_data_00 = src_data[i]; - float src_data_01 = src_data[i + src_step]; - float src_data_02 = src_data[i + 2 * src_step]; - float src_data_03 = src_data[i + 3 * src_step]; - float src_data_04 = src_data[i + 4 * src_step]; - float src_data_05 = src_data[i + 5 * src_step]; - float src_data_06 = src_data[i + 6 * src_step]; - float src_data_07 = src_data[i + 7 * src_step]; - float src_data_10 = src_data[i + 8 * src_step]; - float src_data_11 = src_data[i + 9 * src_step]; - float src_data_12 = src_data[i + 10 * src_step]; - float src_data_13 = src_data[i + 11 * src_step]; - float src_data_14 = src_data[i + 12 * src_step]; - float src_data_15 = src_data[i + 13 * src_step]; - float src_data_16 = src_data[i + 14 * src_step]; - float src_data_17 = src_data[i + 15 * src_step]; - float src_data_20 = src_data[i + 16 * src_step]; - float src_data_21 = src_data[i + 17 * src_step]; - float src_data_22 = src_data[i + 18 * src_step]; - float src_data_23 = src_data[i + 19 * src_step]; - float src_data_24 = src_data[i + 20 * src_step]; - float src_data_25 = src_data[i + 21 * src_step]; - float src_data_26 = src_data[i + 
22 * src_step]; - float src_data_27 = src_data[i + 23 * src_step]; - float src_data_30 = src_data[i + 24 * src_step]; - float src_data_31 = src_data[i + 25 * src_step]; - float src_data_32 = src_data[i + 26 * src_step]; - float src_data_33 = src_data[i + 27 * src_step]; - float src_data_34 = src_data[i + 28 * src_step]; - float src_data_35 = src_data[i + 29 * src_step]; - float src_data_36 = src_data[i + 30 * src_step]; - float src_data_37 = src_data[i + 31 * src_step]; - float src_data_40 = src_data[i + 32 * src_step]; - float src_data_41 = src_data[i + 33 * src_step]; - float src_data_42 = src_data[i + 34 * src_step]; - float src_data_43 = src_data[i + 35 * src_step]; - float src_data_44 = src_data[i + 36 * src_step]; - float src_data_45 = src_data[i + 37 * src_step]; - float src_data_46 = src_data[i + 38 * src_step]; - float src_data_47 = src_data[i + 39 * src_step]; - float src_data_50 = src_data[i + 40 * src_step]; - float src_data_51 = src_data[i + 41 * src_step]; - float src_data_52 = src_data[i + 42 * src_step]; - float src_data_53 = src_data[i + 43 * src_step]; - float src_data_54 = src_data[i + 44 * src_step]; - float src_data_55 = src_data[i + 45 * src_step]; - float src_data_56 = src_data[i + 46 * src_step]; - float src_data_57 = src_data[i + 47 * src_step]; - float src_data_60 = src_data[i + 48 * src_step]; - float src_data_61 = src_data[i + 49 * src_step]; - float src_data_62 = src_data[i + 50 * src_step]; - float src_data_63 = src_data[i + 51 * src_step]; - float src_data_64 = src_data[i + 52 * src_step]; - float src_data_65 = src_data[i + 53 * src_step]; - float src_data_66 = src_data[i + 54 * src_step]; - float src_data_67 = src_data[i + 55 * src_step]; - float src_data_70 = src_data[i + 56 * src_step]; - float src_data_71 = src_data[i + 57 * src_step]; - float src_data_72 = src_data[i + 58 * src_step]; - float src_data_73 = src_data[i + 59 * src_step]; - float src_data_74 = src_data[i + 60 * src_step]; - float src_data_75 = src_data[i + 61 * 
src_step]; - float src_data_76 = src_data[i + 62 * src_step]; - float src_data_77 = src_data[i + 63 * src_step]; - - float d01 = src_data_10 - src_data_20; - float d02 = src_data_11 - src_data_21; - float d03 = src_data_12 - src_data_22; - float d04 = src_data_13 - src_data_23; - float d05 = src_data_14 - src_data_24; - float d06 = src_data_15 - src_data_25; - float d07 = src_data_16 - src_data_26; - float d08 = src_data_17 - src_data_27; - - float d11 = src_data_30 - src_data_40; - float d12 = src_data_31 - src_data_41; - float d13 = src_data_32 - src_data_42; - float d14 = src_data_33 - src_data_43; - float d15 = src_data_34 - src_data_44; - float d16 = src_data_35 - src_data_45; - float d17 = src_data_36 - src_data_46; - float d18 = src_data_37 - src_data_47; - - float d21 = src_data_50 - src_data_60; - float d22 = src_data_51 - src_data_61; - float d23 = src_data_52 - src_data_62; - float d24 = src_data_53 - src_data_63; - float d25 = src_data_54 - src_data_64; - float d26 = src_data_55 - src_data_65; - float d27 = src_data_56 - src_data_66; - float d28 = src_data_57 - src_data_67; - - float d31 = src_data_10 + src_data_20; - float d32 = src_data_11 + src_data_21; - float d33 = src_data_12 + src_data_22; - float d34 = src_data_13 + src_data_23; - float d35 = src_data_14 + src_data_24; - float d36 = src_data_15 + src_data_25; - float d37 = src_data_16 + src_data_26; - float d38 = src_data_17 + src_data_27; - - float d41 = src_data_30 + src_data_40; - float d42 = src_data_31 + src_data_41; - float d43 = src_data_32 + src_data_42; - float d44 = src_data_33 + src_data_43; - float d45 = src_data_34 + src_data_44; - float d46 = src_data_35 + src_data_45; - float d47 = src_data_36 + src_data_46; - float d48 = src_data_37 + src_data_47; - - float d51 = src_data_50 + src_data_60; - float d52 = src_data_51 + src_data_61; - float d53 = src_data_52 + src_data_62; - float d54 = src_data_53 + src_data_63; - float d55 = src_data_54 + src_data_64; - float d56 = src_data_55 + 
src_data_65; - float d57 = src_data_56 + src_data_66; - float d58 = src_data_57 + src_data_67; - - float t00 = src_data_00 + src_data_10 + src_data_20 + src_data_30 + src_data_40 + src_data_50 + src_data_60; - float t01 = src_data_01 + src_data_11 + src_data_21 + src_data_31 + src_data_41 + src_data_51 + src_data_61; - float t02 = src_data_02 + src_data_12 + src_data_22 + src_data_32 + src_data_42 + src_data_52 + src_data_62; - float t03 = src_data_03 + src_data_13 + src_data_23 + src_data_33 + src_data_43 + src_data_53 + src_data_63; - float t04 = src_data_04 + src_data_14 + src_data_24 + src_data_34 + src_data_44 + src_data_54 + src_data_64; - float t05 = src_data_05 + src_data_15 + src_data_25 + src_data_35 + src_data_45 + src_data_55 + src_data_65; - float t06 = src_data_06 + src_data_16 + src_data_26 + src_data_36 + src_data_46 + src_data_56 + src_data_66; - float t07 = src_data_07 + src_data_17 + src_data_27 + src_data_37 + src_data_47 + src_data_57 + src_data_67; - - const float t10 = 0.5f * d01 + d11 + 1.5f * d21; - const float t11 = 0.5f * d02 + d12 + 1.5f * d22; - const float t12 = 0.5f * d03 + d13 + 1.5f * d23; - const float t13 = 0.5f * d04 + d14 + 1.5f * d24; - const float t14 = 0.5f * d05 + d15 + 1.5f * d25; - const float t15 = 0.5f * d06 + d16 + 1.5f * d26; - const float t16 = 0.5f * d07 + d17 + 1.5f * d27; - const float t17 = 0.5f * d08 + d18 + 1.5f * d28; - - const float t20 = 0.25f * d31 + d41 + 2.25f * d51; - const float t21 = 0.25f * d32 + d42 + 2.25f * d52; - const float t22 = 0.25f * d33 + d43 + 2.25f * d53; - const float t23 = 0.25f * d34 + d44 + 2.25f * d54; - const float t24 = 0.25f * d35 + d45 + 2.25f * d55; - const float t25 = 0.25f * d36 + d46 + 2.25f * d56; - const float t26 = 0.25f * d37 + d47 + 2.25f * d57; - const float t27 = 0.25f * d38 + d48 + 2.25f * d58; - - const float t30 = 0.125f * d01 + d11 + 3.375f * d21; - const float t31 = 0.125f * d02 + d12 + 3.375f * d22; - const float t32 = 0.125f * d03 + d13 + 3.375f * d23; - const 
float t33 = 0.125f * d04 + d14 + 3.375f * d24; - const float t34 = 0.125f * d05 + d15 + 3.375f * d25; - const float t35 = 0.125f * d06 + d16 + 3.375f * d26; - const float t36 = 0.125f * d07 + d17 + 3.375f * d27; - const float t37 = 0.125f * d08 + d18 + 3.375f * d28; - - const float t40 = 0.0625f * d31 + d41 + 5.0625f * d51 + src_data_70; - const float t41 = 0.0625f * d32 + d42 + 5.0625f * d52 + src_data_71; - const float t42 = 0.0625f * d33 + d43 + 5.0625f * d53 + src_data_72; - const float t43 = 0.0625f * d34 + d44 + 5.0625f * d54 + src_data_73; - const float t44 = 0.0625f * d35 + d45 + 5.0625f * d55 + src_data_74; - const float t45 = 0.0625f * d36 + d46 + 5.0625f * d56 + src_data_75; - const float t46 = 0.0625f * d37 + d47 + 5.0625f * d57 + src_data_76; - const float t47 = 0.0625f * d38 + d48 + 5.0625f * d58 + src_data_77; - - float s11 = t01 - t02; - float s12 = t11 - t12; - float s13 = t21 - t22; - float s14 = t31 - t32; - float s15 = t41 - t42; - - float s21 = t03 - t04; - float s22 = t13 - t14; - float s23 = t23 - t24; - float s24 = t33 - t34; - float s25 = t43 - t44; - - float s31 = t05 - t06; - float s32 = t15 - t16; - float s33 = t25 - t26; - float s34 = t35 - t36; - float s35 = t45 - t46; - - float s41 = t01 + t02; - float s42 = t11 + t12; - float s43 = t21 + t22; - float s44 = t31 + t32; - float s45 = t41 + t42; - - float s51 = t03 + t04; - float s52 = t13 + t14; - float s53 = t23 + t24; - float s54 = t33 + t34; - float s55 = t43 + t44; - - float s61 = t05 + t06; - float s62 = t15 + t16; - float s63 = t25 + t26; - float s64 = t35 + t36; - float s65 = t45 + t46; - - float m00 = t00 + t01 + t02 + t03 + t04 + t05 + t06; - const float m01 = 0.5f * s11 + s21 + 1.5f * s31; - const float m02 = 0.25f * s41 + s51 + 2.25f * s61; - const float m03 = 0.125f * s11 + s21 + 3.375f * s31; - const float m04 = 0.0625f * s41 + s51 + 5.0625f * s61 + t07; - - float m10 = t10 + t11 + t12 + t13 + t14 + t15 + t16; - const float m11 = 0.5f * s12 + s22 + 1.5f * s32; - const float 
m12 = 0.25f * s42 + s52 + 2.25f * s62; - const float m13 = 0.125f * s12 + s22 + 3.375f * s32; - const float m14 = 0.0625f * s42 + s52 + 5.0625f * s62 + t17; - - float m20 = t20 + t21 + t22 + t23 + t24 + t25 + t26; - const float m21 = 0.5f * s13 + s23 + 1.5f * s33; - const float m22 = 0.25f * s43 + s53 + 2.25f * s63; - const float m23 = 0.125f * s13 + s23 + 3.375f * s33; - const float m24 = 0.0625f * s43 + s53 + 5.0625f * s63 + t27; - - float m30 = t30 + t31 + t32 + t33 + t34 + t35 + t36; - const float m31 = 0.5f * s14 + s24 + 1.5f * s34; - const float m32 = 0.25f * s44 + s54 + 2.25f * s64; - const float m33 = 0.125f * s14 + s24 + 3.375f * s34; - const float m34 = 0.0625f * s44 + s54 + 5.0625f * s64 + t37; - - float m40 = t40 + t41 + t42 + t43 + t44 + t45 + t46; - const float m41 = 0.5f * s15 + s25 + 1.5f * s35; - const float m42 = 0.25f * s45 + s55 + 2.25f * s65; - const float m43 = 0.125f * s15 + s25 + 3.375f * s35; - const float m44 = 0.0625f * s45 + s55 + 5.0625f * s65 + t47; - - (dst_data + i)[0] = m00 + bias_data[i]; - (dst_data + i + C4NUM)[0] = m01 + bias_data[i]; - (dst_data + i + 2 * C4NUM)[0] = m02 + bias_data[i]; - (dst_data + i + 3 * C4NUM)[0] = m03 + bias_data[i]; - (dst_data + i + 4 * C4NUM)[0] = m04 + bias_data[i]; - - (dst_data + i + dst_step * C4NUM)[0] = m10 + bias_data[i]; - (dst_data + i + dst_step * C4NUM + C4NUM)[0] = m11 + bias_data[i]; - (dst_data + i + dst_step * C4NUM + 2 * C4NUM)[0] = m12 + bias_data[i]; - (dst_data + i + dst_step * C4NUM + 3 * C4NUM)[0] = m13 + bias_data[i]; - (dst_data + i + dst_step * C4NUM + 4 * C4NUM)[0] = m14 + bias_data[i]; - - (dst_data + i + 2 * dst_step * C4NUM)[0] = m20 + bias_data[i]; - (dst_data + i + 2 * dst_step * C4NUM + C4NUM)[0] = m21 + bias_data[i]; - (dst_data + i + 2 * dst_step * C4NUM + 2 * C4NUM)[0] = m22 + bias_data[i]; - (dst_data + i + 2 * dst_step * C4NUM + 3 * C4NUM)[0] = m23 + bias_data[i]; - (dst_data + i + 2 * dst_step * C4NUM + 4 * C4NUM)[0] = m24 + bias_data[i]; - - (dst_data + i + 3 * 
dst_step * C4NUM)[0] = m30 + bias_data[i]; - (dst_data + i + 3 * dst_step * C4NUM + C4NUM)[0] = m31 + bias_data[i]; - (dst_data + i + 3 * dst_step * C4NUM + 2 * C4NUM)[0] = m32 + bias_data[i]; - (dst_data + i + 3 * dst_step * C4NUM + 3 * C4NUM)[0] = m33 + bias_data[i]; - (dst_data + i + 3 * dst_step * C4NUM + 4 * C4NUM)[0] = m34 + bias_data[i]; - - (dst_data + i + 4 * dst_step * C4NUM)[0] = m40 + bias_data[i]; - (dst_data + i + 4 * dst_step * C4NUM + C4NUM)[0] = m41 + bias_data[i]; - (dst_data + i + 4 * dst_step * C4NUM + 2 * C4NUM)[0] = m42 + bias_data[i]; - (dst_data + i + 4 * dst_step * C4NUM + 3 * C4NUM)[0] = m43 + bias_data[i]; - (dst_data + i + 4 * dst_step * C4NUM + 4 * C4NUM)[0] = m44 + bias_data[i]; + float src[64]; + float t[40]; + float m[25]; + for (int i = 0; i < C4NUM; ++i) { + // load source data + for (int j = 0; j < 64; ++j) { + src[j] = src_data[i + j * src_step]; + } + for (int l = 0; l < 8; ++l) { + int offset = l * 8; + t[l] = src[offset] + src[1 + offset] + src[2 + offset] + src[3 + offset] + src[4 + offset] + src[5 + offset] + + src[6 + offset]; + t[l + 8] = src[1 + offset] - src[2 + offset] + 2 * (src[3 + offset] - src[4 + offset]) + + 3 * (src[5 + offset] - src[6 + offset]); + t[l + 16] = src[1 + offset] + src[2 + offset] + 4 * (src[3 + offset] + src[4 + offset]) + + 9 * (src[5 + offset] + src[6 + offset]); + t[l + 24] = src[1 + offset] - src[2 + offset] + 8 * (src[3 + offset] - src[4 + offset]) + + 27 * (src[5 + offset] - src[6 + offset]); + t[l + 32] = src[1 + offset] + src[2 + offset] + 16 * (src[3 + offset] + src[4 + offset]) + + 81 * (src[5 + offset] + src[6 + offset]) + src[7 + offset]; + } + for (int l = 0; l < 5; ++l) { + int offset = l * 8; + m[l] = t[offset] + t[1 + offset] + t[2 + offset] + t[3 + offset] + t[4 + offset] + t[5 + offset] + t[6 + offset]; + m[l + 5] = + t[1 + offset] - t[2 + offset] + 2 * (t[3 + offset] - t[4 + offset]) + 3 * (t[5 + offset] - t[6 + offset]); + m[l + 10] = + t[1 + offset] + t[2 + offset] + 4 * (t[3 + 
offset] + t[4 + offset]) + 9 * (t[5 + offset] + t[6 + offset]); + m[l + 15] = + t[1 + offset] - t[2 + offset] + 8 * (t[3 + offset] - t[4 + offset]) + 27 * (t[5 + offset] - t[6 + offset]); + m[l + 20] = t[1 + offset] + t[2 + offset] + 16 * (t[3 + offset] + t[4 + offset]) + + 81 * (t[5 + offset] + t[6 + offset]) + t[7 + offset]; + } + // store output + for (int k = 0; k < 5; ++k) { + int dst_k_offset = k * dst_step * C4NUM; + int m_k_offset = k * 5; + for (int j = 0; j < 5; ++j) { + dst_data[i + dst_k_offset + j * C4NUM] = m[j + m_k_offset] + bias_data[i]; + } + } } #endif } - void OutputTransform8x6Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step) { #ifdef ENABLE_ARM - float32x4_t src_data_00 = vld1q_f32(src_data + 0 * src_step); - float32x4_t src_data_01 = vld1q_f32(src_data + 1 * src_step); - float32x4_t src_data_02 = vld1q_f32(src_data + 2 * src_step); - float32x4_t src_data_03 = vld1q_f32(src_data + 3 * src_step); - float32x4_t src_data_04 = vld1q_f32(src_data + 4 * src_step); - float32x4_t src_data_05 = vld1q_f32(src_data + 5 * src_step); - float32x4_t src_data_06 = vld1q_f32(src_data + 6 * src_step); - float32x4_t src_data_07 = vld1q_f32(src_data + 7 * src_step); - float32x4_t src_data_10 = vld1q_f32(src_data + 8 * src_step); - float32x4_t src_data_11 = vld1q_f32(src_data + 9 * src_step); - float32x4_t src_data_12 = vld1q_f32(src_data + 10 * src_step); - float32x4_t src_data_13 = vld1q_f32(src_data + 11 * src_step); - float32x4_t src_data_14 = vld1q_f32(src_data + 12 * src_step); - float32x4_t src_data_15 = vld1q_f32(src_data + 13 * src_step); - float32x4_t src_data_16 = vld1q_f32(src_data + 14 * src_step); - float32x4_t src_data_17 = vld1q_f32(src_data + 15 * src_step); - float32x4_t src_data_20 = vld1q_f32(src_data + 16 * src_step); - float32x4_t src_data_21 = vld1q_f32(src_data + 17 * src_step); - float32x4_t src_data_22 = vld1q_f32(src_data + 18 * src_step); - float32x4_t src_data_23 = vld1q_f32(src_data + 19 
* src_step); - float32x4_t src_data_24 = vld1q_f32(src_data + 20 * src_step); - float32x4_t src_data_25 = vld1q_f32(src_data + 21 * src_step); - float32x4_t src_data_26 = vld1q_f32(src_data + 22 * src_step); - float32x4_t src_data_27 = vld1q_f32(src_data + 23 * src_step); - float32x4_t src_data_30 = vld1q_f32(src_data + 24 * src_step); - float32x4_t src_data_31 = vld1q_f32(src_data + 25 * src_step); - float32x4_t src_data_32 = vld1q_f32(src_data + 26 * src_step); - float32x4_t src_data_33 = vld1q_f32(src_data + 27 * src_step); - float32x4_t src_data_34 = vld1q_f32(src_data + 28 * src_step); - float32x4_t src_data_35 = vld1q_f32(src_data + 29 * src_step); - float32x4_t src_data_36 = vld1q_f32(src_data + 30 * src_step); - float32x4_t src_data_37 = vld1q_f32(src_data + 31 * src_step); - float32x4_t src_data_40 = vld1q_f32(src_data + 32 * src_step); - float32x4_t src_data_41 = vld1q_f32(src_data + 33 * src_step); - float32x4_t src_data_42 = vld1q_f32(src_data + 34 * src_step); - float32x4_t src_data_43 = vld1q_f32(src_data + 35 * src_step); - float32x4_t src_data_44 = vld1q_f32(src_data + 36 * src_step); - float32x4_t src_data_45 = vld1q_f32(src_data + 37 * src_step); - float32x4_t src_data_46 = vld1q_f32(src_data + 38 * src_step); - float32x4_t src_data_47 = vld1q_f32(src_data + 39 * src_step); - float32x4_t src_data_50 = vld1q_f32(src_data + 40 * src_step); - float32x4_t src_data_51 = vld1q_f32(src_data + 41 * src_step); - float32x4_t src_data_52 = vld1q_f32(src_data + 42 * src_step); - float32x4_t src_data_53 = vld1q_f32(src_data + 43 * src_step); - float32x4_t src_data_54 = vld1q_f32(src_data + 44 * src_step); - float32x4_t src_data_55 = vld1q_f32(src_data + 45 * src_step); - float32x4_t src_data_56 = vld1q_f32(src_data + 46 * src_step); - float32x4_t src_data_57 = vld1q_f32(src_data + 47 * src_step); - float32x4_t src_data_60 = vld1q_f32(src_data + 48 * src_step); - float32x4_t src_data_61 = vld1q_f32(src_data + 49 * src_step); - float32x4_t src_data_62 = 
vld1q_f32(src_data + 50 * src_step); - float32x4_t src_data_63 = vld1q_f32(src_data + 51 * src_step); - float32x4_t src_data_64 = vld1q_f32(src_data + 52 * src_step); - float32x4_t src_data_65 = vld1q_f32(src_data + 53 * src_step); - float32x4_t src_data_66 = vld1q_f32(src_data + 54 * src_step); - float32x4_t src_data_67 = vld1q_f32(src_data + 55 * src_step); - float32x4_t src_data_70 = vld1q_f32(src_data + 56 * src_step); - float32x4_t src_data_71 = vld1q_f32(src_data + 57 * src_step); - float32x4_t src_data_72 = vld1q_f32(src_data + 58 * src_step); - float32x4_t src_data_73 = vld1q_f32(src_data + 59 * src_step); - float32x4_t src_data_74 = vld1q_f32(src_data + 60 * src_step); - float32x4_t src_data_75 = vld1q_f32(src_data + 61 * src_step); - float32x4_t src_data_76 = vld1q_f32(src_data + 62 * src_step); - float32x4_t src_data_77 = vld1q_f32(src_data + 63 * src_step); - - float32x4_t d01 = vsubq_f32(src_data_10, src_data_20); - float32x4_t d02 = vsubq_f32(src_data_11, src_data_21); - float32x4_t d03 = vsubq_f32(src_data_12, src_data_22); - float32x4_t d04 = vsubq_f32(src_data_13, src_data_23); - float32x4_t d05 = vsubq_f32(src_data_14, src_data_24); - float32x4_t d06 = vsubq_f32(src_data_15, src_data_25); - float32x4_t d07 = vsubq_f32(src_data_16, src_data_26); - float32x4_t d08 = vsubq_f32(src_data_17, src_data_27); - - float32x4_t d11 = vsubq_f32(src_data_30, src_data_40); - float32x4_t d12 = vsubq_f32(src_data_31, src_data_41); - float32x4_t d13 = vsubq_f32(src_data_32, src_data_42); - float32x4_t d14 = vsubq_f32(src_data_33, src_data_43); - float32x4_t d15 = vsubq_f32(src_data_34, src_data_44); - float32x4_t d16 = vsubq_f32(src_data_35, src_data_45); - float32x4_t d17 = vsubq_f32(src_data_36, src_data_46); - float32x4_t d18 = vsubq_f32(src_data_37, src_data_47); - - float32x4_t d21 = vsubq_f32(src_data_50, src_data_60); - float32x4_t d22 = vsubq_f32(src_data_51, src_data_61); - float32x4_t d23 = vsubq_f32(src_data_52, src_data_62); - float32x4_t d24 = 
vsubq_f32(src_data_53, src_data_63); - float32x4_t d25 = vsubq_f32(src_data_54, src_data_64); - float32x4_t d26 = vsubq_f32(src_data_55, src_data_65); - float32x4_t d27 = vsubq_f32(src_data_56, src_data_66); - float32x4_t d28 = vsubq_f32(src_data_57, src_data_67); - - float32x4_t d31 = vaddq_f32(src_data_10, src_data_20); - float32x4_t d32 = vaddq_f32(src_data_11, src_data_21); - float32x4_t d33 = vaddq_f32(src_data_12, src_data_22); - float32x4_t d34 = vaddq_f32(src_data_13, src_data_23); - float32x4_t d35 = vaddq_f32(src_data_14, src_data_24); - float32x4_t d36 = vaddq_f32(src_data_15, src_data_25); - float32x4_t d37 = vaddq_f32(src_data_16, src_data_26); - float32x4_t d38 = vaddq_f32(src_data_17, src_data_27); - - float32x4_t d41 = vaddq_f32(src_data_30, src_data_40); - float32x4_t d42 = vaddq_f32(src_data_31, src_data_41); - float32x4_t d43 = vaddq_f32(src_data_32, src_data_42); - float32x4_t d44 = vaddq_f32(src_data_33, src_data_43); - float32x4_t d45 = vaddq_f32(src_data_34, src_data_44); - float32x4_t d46 = vaddq_f32(src_data_35, src_data_45); - float32x4_t d47 = vaddq_f32(src_data_36, src_data_46); - float32x4_t d48 = vaddq_f32(src_data_37, src_data_47); - - float32x4_t d51 = vaddq_f32(src_data_50, src_data_60); - float32x4_t d52 = vaddq_f32(src_data_51, src_data_61); - float32x4_t d53 = vaddq_f32(src_data_52, src_data_62); - float32x4_t d54 = vaddq_f32(src_data_53, src_data_63); - float32x4_t d55 = vaddq_f32(src_data_54, src_data_64); - float32x4_t d56 = vaddq_f32(src_data_55, src_data_65); - float32x4_t d57 = vaddq_f32(src_data_56, src_data_66); - float32x4_t d58 = vaddq_f32(src_data_57, src_data_67); - - float32x4_t t00 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_00, src_data_10), src_data_20), src_data_30), src_data_40), - src_data_50), - src_data_60); - float32x4_t t01 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_01, src_data_11), src_data_21), src_data_31), src_data_41), - src_data_51), 
- src_data_61); - float32x4_t t02 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_02, src_data_12), src_data_22), src_data_32), src_data_42), - src_data_52), - src_data_62); - float32x4_t t03 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_03, src_data_13), src_data_23), src_data_33), src_data_43), - src_data_53), - src_data_63); - float32x4_t t04 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_04, src_data_14), src_data_24), src_data_34), src_data_44), - src_data_54), - src_data_64); - float32x4_t t05 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_05, src_data_15), src_data_25), src_data_35), src_data_45), - src_data_55), - src_data_65); - float32x4_t t06 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_06, src_data_16), src_data_26), src_data_36), src_data_46), - src_data_56), - src_data_66); - float32x4_t t07 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_07, src_data_17), src_data_27), src_data_37), src_data_47), - src_data_57), - src_data_67); - - float32x4_t t10 = vaddq_f32(vaddq_f32(vmulq_n_f32(d01, 0.5), d11), vmulq_n_f32(d21, 1.5)); - float32x4_t t11 = vaddq_f32(vaddq_f32(vmulq_n_f32(d02, 0.5), d12), vmulq_n_f32(d22, 1.5)); - float32x4_t t12 = vaddq_f32(vaddq_f32(vmulq_n_f32(d03, 0.5), d13), vmulq_n_f32(d23, 1.5)); - float32x4_t t13 = vaddq_f32(vaddq_f32(vmulq_n_f32(d04, 0.5), d14), vmulq_n_f32(d24, 1.5)); - float32x4_t t14 = vaddq_f32(vaddq_f32(vmulq_n_f32(d05, 0.5), d15), vmulq_n_f32(d25, 1.5)); - float32x4_t t15 = vaddq_f32(vaddq_f32(vmulq_n_f32(d06, 0.5), d16), vmulq_n_f32(d26, 1.5)); - float32x4_t t16 = vaddq_f32(vaddq_f32(vmulq_n_f32(d07, 0.5), d17), vmulq_n_f32(d27, 1.5)); - float32x4_t t17 = vaddq_f32(vaddq_f32(vmulq_n_f32(d08, 0.5), d18), vmulq_n_f32(d28, 1.5)); - - float32x4_t t20 = vaddq_f32(vaddq_f32(vmulq_n_f32(d31, 0.25), d41), vmulq_n_f32(d51, 2.25)); - float32x4_t t21 
= vaddq_f32(vaddq_f32(vmulq_n_f32(d32, 0.25), d42), vmulq_n_f32(d52, 2.25)); - float32x4_t t22 = vaddq_f32(vaddq_f32(vmulq_n_f32(d33, 0.25), d43), vmulq_n_f32(d53, 2.25)); - float32x4_t t23 = vaddq_f32(vaddq_f32(vmulq_n_f32(d34, 0.25), d44), vmulq_n_f32(d54, 2.25)); - float32x4_t t24 = vaddq_f32(vaddq_f32(vmulq_n_f32(d35, 0.25), d45), vmulq_n_f32(d55, 2.25)); - float32x4_t t25 = vaddq_f32(vaddq_f32(vmulq_n_f32(d36, 0.25), d46), vmulq_n_f32(d56, 2.25)); - float32x4_t t26 = vaddq_f32(vaddq_f32(vmulq_n_f32(d37, 0.25), d47), vmulq_n_f32(d57, 2.25)); - float32x4_t t27 = vaddq_f32(vaddq_f32(vmulq_n_f32(d38, 0.25), d48), vmulq_n_f32(d58, 2.25)); - - float32x4_t t30 = vaddq_f32(vaddq_f32(vmulq_n_f32(d01, 0.125), d11), vmulq_n_f32(d21, 3.375)); - float32x4_t t31 = vaddq_f32(vaddq_f32(vmulq_n_f32(d02, 0.125), d12), vmulq_n_f32(d22, 3.375)); - float32x4_t t32 = vaddq_f32(vaddq_f32(vmulq_n_f32(d03, 0.125), d13), vmulq_n_f32(d23, 3.375)); - float32x4_t t33 = vaddq_f32(vaddq_f32(vmulq_n_f32(d04, 0.125), d14), vmulq_n_f32(d24, 3.375)); - float32x4_t t34 = vaddq_f32(vaddq_f32(vmulq_n_f32(d05, 0.125), d15), vmulq_n_f32(d25, 3.375)); - float32x4_t t35 = vaddq_f32(vaddq_f32(vmulq_n_f32(d06, 0.125), d16), vmulq_n_f32(d26, 3.375)); - float32x4_t t36 = vaddq_f32(vaddq_f32(vmulq_n_f32(d07, 0.125), d17), vmulq_n_f32(d27, 3.375)); - float32x4_t t37 = vaddq_f32(vaddq_f32(vmulq_n_f32(d08, 0.125), d18), vmulq_n_f32(d28, 3.375)); - - float32x4_t t40 = vaddq_f32(vaddq_f32(vmulq_n_f32(d31, 0.0625), d41), vmulq_n_f32(d51, 5.0625)); - float32x4_t t41 = vaddq_f32(vaddq_f32(vmulq_n_f32(d32, 0.0625), d42), vmulq_n_f32(d52, 5.0625)); - float32x4_t t42 = vaddq_f32(vaddq_f32(vmulq_n_f32(d33, 0.0625), d43), vmulq_n_f32(d53, 5.0625)); - float32x4_t t43 = vaddq_f32(vaddq_f32(vmulq_n_f32(d34, 0.0625), d44), vmulq_n_f32(d54, 5.0625)); - float32x4_t t44 = vaddq_f32(vaddq_f32(vmulq_n_f32(d35, 0.0625), d45), vmulq_n_f32(d55, 5.0625)); - float32x4_t t45 = vaddq_f32(vaddq_f32(vmulq_n_f32(d36, 0.0625), d46), 
vmulq_n_f32(d56, 5.0625)); - float32x4_t t46 = vaddq_f32(vaddq_f32(vmulq_n_f32(d37, 0.0625), d47), vmulq_n_f32(d57, 5.0625)); - float32x4_t t47 = vaddq_f32(vaddq_f32(vmulq_n_f32(d38, 0.0625), d48), vmulq_n_f32(d58, 5.0625)); - - float32x4_t t50 = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d01, 0.03125), d11), vmulq_n_f32(d21, 7.59375)), src_data_70); - float32x4_t t51 = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d02, 0.03125), d12), vmulq_n_f32(d22, 7.59375)), src_data_71); - float32x4_t t52 = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d03, 0.03125), d13), vmulq_n_f32(d23, 7.59375)), src_data_72); - float32x4_t t53 = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d04, 0.03125), d14), vmulq_n_f32(d24, 7.59375)), src_data_73); - float32x4_t t54 = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d05, 0.03125), d15), vmulq_n_f32(d25, 7.59375)), src_data_74); - float32x4_t t55 = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d06, 0.03125), d16), vmulq_n_f32(d26, 7.59375)), src_data_75); - float32x4_t t56 = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d07, 0.03125), d17), vmulq_n_f32(d27, 7.59375)), src_data_76); - float32x4_t t57 = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d08, 0.03125), d18), vmulq_n_f32(d28, 7.59375)), src_data_77); - - float32x4_t s11 = vsubq_f32(t01, t02); - float32x4_t s12 = vsubq_f32(t11, t12); - float32x4_t s13 = vsubq_f32(t21, t22); - float32x4_t s14 = vsubq_f32(t31, t32); - float32x4_t s15 = vsubq_f32(t41, t42); - float32x4_t s16 = vsubq_f32(t51, t52); - - float32x4_t s21 = vsubq_f32(t03, t04); - float32x4_t s22 = vsubq_f32(t13, t14); - float32x4_t s23 = vsubq_f32(t23, t24); - float32x4_t s24 = vsubq_f32(t33, t34); - float32x4_t s25 = vsubq_f32(t43, t44); - float32x4_t s26 = vsubq_f32(t53, t54); - - float32x4_t s31 = vsubq_f32(t05, t06); - float32x4_t s32 = vsubq_f32(t15, t16); - float32x4_t s33 = vsubq_f32(t25, t26); - float32x4_t s34 = vsubq_f32(t35, t36); - float32x4_t s35 = vsubq_f32(t45, t46); - float32x4_t s36 = vsubq_f32(t55, t56); - - 
float32x4_t s41 = vaddq_f32(t01, t02); - float32x4_t s42 = vaddq_f32(t11, t12); - float32x4_t s43 = vaddq_f32(t21, t22); - float32x4_t s44 = vaddq_f32(t31, t32); - float32x4_t s45 = vaddq_f32(t41, t42); - float32x4_t s46 = vaddq_f32(t51, t52); - - float32x4_t s51 = vaddq_f32(t03, t04); - float32x4_t s52 = vaddq_f32(t13, t14); - float32x4_t s53 = vaddq_f32(t23, t24); - float32x4_t s54 = vaddq_f32(t33, t34); - float32x4_t s55 = vaddq_f32(t43, t44); - float32x4_t s56 = vaddq_f32(t53, t54); - - float32x4_t s61 = vaddq_f32(t05, t06); - float32x4_t s62 = vaddq_f32(t15, t16); - float32x4_t s63 = vaddq_f32(t25, t26); - float32x4_t s64 = vaddq_f32(t35, t36); - float32x4_t s65 = vaddq_f32(t45, t46); - float32x4_t s66 = vaddq_f32(t55, t56); - - float32x4_t m00 = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t00, t01), t02), t03), t04), t05), t06); - float32x4_t m01 = vaddq_f32(vaddq_f32(vmulq_n_f32(s11, 0.5), s21), vmulq_n_f32(s31, 1.5)); - float32x4_t m02 = vaddq_f32(vaddq_f32(vmulq_n_f32(s41, 0.25), s51), vmulq_n_f32(s61, 2.25)); - float32x4_t m03 = vaddq_f32(vaddq_f32(vmulq_n_f32(s11, 0.125), s21), vmulq_n_f32(s31, 3.375)); - float32x4_t m04 = vaddq_f32(vaddq_f32(vmulq_n_f32(s41, 0.0625), s51), vmulq_n_f32(s61, 5.0625)); - float32x4_t m05 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(s11, 0.03125), s21), vmulq_n_f32(s31, 7.59375)), t07); - - float32x4_t m10 = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t10, t11), t12), t13), t14), t15), t16); - float32x4_t m11 = vaddq_f32(vaddq_f32(vmulq_n_f32(s12, 0.5), s22), vmulq_n_f32(s32, 1.5)); - float32x4_t m12 = vaddq_f32(vaddq_f32(vmulq_n_f32(s42, 0.25), s52), vmulq_n_f32(s62, 2.25)); - float32x4_t m13 = vaddq_f32(vaddq_f32(vmulq_n_f32(s12, 0.125), s22), vmulq_n_f32(s32, 3.375)); - float32x4_t m14 = vaddq_f32(vaddq_f32(vmulq_n_f32(s42, 0.0625), s52), vmulq_n_f32(s62, 5.0625)); - float32x4_t m15 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(s12, 0.03125), s22), vmulq_n_f32(s32, 7.59375)), t17); - - 
float32x4_t m20 = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t20, t21), t22), t23), t24), t25), t26); - float32x4_t m21 = vaddq_f32(vaddq_f32(vmulq_n_f32(s13, 0.5), s23), vmulq_n_f32(s33, 1.5)); - float32x4_t m22 = vaddq_f32(vaddq_f32(vmulq_n_f32(s43, 0.25), s53), vmulq_n_f32(s63, 2.25)); - float32x4_t m23 = vaddq_f32(vaddq_f32(vmulq_n_f32(s13, 0.125), s23), vmulq_n_f32(s33, 3.375)); - float32x4_t m24 = vaddq_f32(vaddq_f32(vmulq_n_f32(s43, 0.0625), s53), vmulq_n_f32(s63, 5.0625)); - float32x4_t m25 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(s13, 0.03125), s23), vmulq_n_f32(s33, 7.59375)), t27); - - float32x4_t m30 = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t30, t31), t32), t33), t34), t35), t36); - float32x4_t m31 = vaddq_f32(vaddq_f32(vmulq_n_f32(s14, 0.5), s24), vmulq_n_f32(s34, 1.5)); - float32x4_t m32 = vaddq_f32(vaddq_f32(vmulq_n_f32(s44, 0.25), s54), vmulq_n_f32(s64, 2.25)); - float32x4_t m33 = vaddq_f32(vaddq_f32(vmulq_n_f32(s14, 0.125), s24), vmulq_n_f32(s34, 3.375)); - float32x4_t m34 = vaddq_f32(vaddq_f32(vmulq_n_f32(s44, 0.0625), s54), vmulq_n_f32(s64, 5.0625)); - float32x4_t m35 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(s14, 0.03125), s24), vmulq_n_f32(s34, 7.59375)), t37); - - float32x4_t m40 = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t40, t41), t42), t43), t44), t45), t46); - float32x4_t m41 = vaddq_f32(vaddq_f32(vmulq_n_f32(s15, 0.5), s25), vmulq_n_f32(s35, 1.5)); - float32x4_t m42 = vaddq_f32(vaddq_f32(vmulq_n_f32(s45, 0.25), s55), vmulq_n_f32(s65, 2.25)); - float32x4_t m43 = vaddq_f32(vaddq_f32(vmulq_n_f32(s15, 0.125), s25), vmulq_n_f32(s35, 3.375)); - float32x4_t m44 = vaddq_f32(vaddq_f32(vmulq_n_f32(s45, 0.0625), s55), vmulq_n_f32(s65, 5.0625)); - float32x4_t m45 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(s15, 0.03125), s25), vmulq_n_f32(s35, 7.59375)), t47); - - float32x4_t m50 = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t50, t51), t52), t53), t54), t55), t56); - 
float32x4_t m51 = vaddq_f32(vaddq_f32(vmulq_n_f32(s16, 0.5), s26), vmulq_n_f32(s36, 1.5)); - float32x4_t m52 = vaddq_f32(vaddq_f32(vmulq_n_f32(s46, 0.25), s56), vmulq_n_f32(s66, 2.25)); - float32x4_t m53 = vaddq_f32(vaddq_f32(vmulq_n_f32(s16, 0.125), s26), vmulq_n_f32(s36, 3.375)); - float32x4_t m54 = vaddq_f32(vaddq_f32(vmulq_n_f32(s46, 0.0625), s56), vmulq_n_f32(s66, 5.0625)); - float32x4_t m55 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(s16, 0.03125), s26), vmulq_n_f32(s36, 7.59375)), t57); - + float32x4_t src[64]; + float32x4_t t[48]; + float32x4_t m[36]; + Load64Data; float32x4_t bias_ptr = vld1q_f32(bias_data); - vst1q_f32(dst_data, vaddq_f32(m00, bias_ptr)); - vst1q_f32(dst_data + C4NUM, vaddq_f32(m01, bias_ptr)); - vst1q_f32(dst_data + 2 * C4NUM, vaddq_f32(m02, bias_ptr)); - vst1q_f32(dst_data + 3 * C4NUM, vaddq_f32(m03, bias_ptr)); - vst1q_f32(dst_data + 4 * C4NUM, vaddq_f32(m04, bias_ptr)); - vst1q_f32(dst_data + 5 * C4NUM, vaddq_f32(m05, bias_ptr)); - vst1q_f32(dst_data + dst_step * C4NUM, vaddq_f32(m10, bias_ptr)); - vst1q_f32(dst_data + dst_step * C4NUM + C4NUM, vaddq_f32(m11, bias_ptr)); - vst1q_f32(dst_data + dst_step * C4NUM + 2 * C4NUM, vaddq_f32(m12, bias_ptr)); - vst1q_f32(dst_data + dst_step * C4NUM + 3 * C4NUM, vaddq_f32(m13, bias_ptr)); - vst1q_f32(dst_data + dst_step * C4NUM + 4 * C4NUM, vaddq_f32(m14, bias_ptr)); - vst1q_f32(dst_data + dst_step * C4NUM + 5 * C4NUM, vaddq_f32(m15, bias_ptr)); - vst1q_f32(dst_data + 2 * dst_step * C4NUM, vaddq_f32(m20, bias_ptr)); - vst1q_f32(dst_data + 2 * dst_step * C4NUM + C4NUM, vaddq_f32(m21, bias_ptr)); - vst1q_f32(dst_data + 2 * dst_step * C4NUM + 2 * C4NUM, vaddq_f32(m22, bias_ptr)); - vst1q_f32(dst_data + 2 * dst_step * C4NUM + 3 * C4NUM, vaddq_f32(m23, bias_ptr)); - vst1q_f32(dst_data + 2 * dst_step * C4NUM + 4 * C4NUM, vaddq_f32(m24, bias_ptr)); - vst1q_f32(dst_data + 2 * dst_step * C4NUM + 5 * C4NUM, vaddq_f32(m25, bias_ptr)); - vst1q_f32(dst_data + 3 * dst_step * C4NUM, vaddq_f32(m30, bias_ptr)); 
- vst1q_f32(dst_data + 3 * dst_step * C4NUM + C4NUM, vaddq_f32(m31, bias_ptr)); - vst1q_f32(dst_data + 3 * dst_step * C4NUM + 2 * C4NUM, vaddq_f32(m32, bias_ptr)); - vst1q_f32(dst_data + 3 * dst_step * C4NUM + 3 * C4NUM, vaddq_f32(m33, bias_ptr)); - vst1q_f32(dst_data + 3 * dst_step * C4NUM + 4 * C4NUM, vaddq_f32(m34, bias_ptr)); - vst1q_f32(dst_data + 3 * dst_step * C4NUM + 5 * C4NUM, vaddq_f32(m35, bias_ptr)); - vst1q_f32(dst_data + 4 * dst_step * C4NUM, vaddq_f32(m40, bias_ptr)); - vst1q_f32(dst_data + 4 * dst_step * C4NUM + C4NUM, vaddq_f32(m41, bias_ptr)); - vst1q_f32(dst_data + 4 * dst_step * C4NUM + 2 * C4NUM, vaddq_f32(m42, bias_ptr)); - vst1q_f32(dst_data + 4 * dst_step * C4NUM + 3 * C4NUM, vaddq_f32(m43, bias_ptr)); - vst1q_f32(dst_data + 4 * dst_step * C4NUM + 4 * C4NUM, vaddq_f32(m44, bias_ptr)); - vst1q_f32(dst_data + 4 * dst_step * C4NUM + 5 * C4NUM, vaddq_f32(m45, bias_ptr)); - vst1q_f32(dst_data + 5 * dst_step * C4NUM, vaddq_f32(m50, bias_ptr)); - vst1q_f32(dst_data + 5 * dst_step * C4NUM + C4NUM, vaddq_f32(m51, bias_ptr)); - vst1q_f32(dst_data + 5 * dst_step * C4NUM + 2 * C4NUM, vaddq_f32(m52, bias_ptr)); - vst1q_f32(dst_data + 5 * dst_step * C4NUM + 3 * C4NUM, vaddq_f32(m53, bias_ptr)); - vst1q_f32(dst_data + 5 * dst_step * C4NUM + 4 * C4NUM, vaddq_f32(m54, bias_ptr)); - vst1q_f32(dst_data + 5 * dst_step * C4NUM + 5 * C4NUM, vaddq_f32(m55, bias_ptr)); + for (int l = 0; l < 8; ++l) { + int offset = l * 8; + float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); + float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); + float32x4_t tmp3 = vaddq_f32(src[5 + offset], src[6 + offset]); + float32x4_t tmp4 = vsubq_f32(src[1 + offset], src[2 + offset]); + float32x4_t tmp5 = vsubq_f32(src[3 + offset], src[4 + offset]); + float32x4_t tmp6 = vsubq_f32(src[5 + offset], src[6 + offset]); + t[l] = vaddq_f32(vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2), tmp3); + t[l + 8] = vaddq_f32(vaddq_f32(tmp4, vmulq_n_f32(tmp5, 2)), vmulq_n_f32(tmp6, 3)); 
+ t[l + 16] = vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)), vmulq_n_f32(tmp3, 9)); + t[l + 24] = vaddq_f32(vaddq_f32(tmp4, vmulq_n_f32(tmp5, 8)), vmulq_n_f32(tmp6, 27)); + t[l + 32] = vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 16)), vmulq_n_f32(tmp3, 81)); + t[l + 40] = vaddq_f32(vaddq_f32(vaddq_f32(tmp4, vmulq_n_f32(tmp5, 32)), vmulq_n_f32(tmp6, 243)), src[7 + offset]); + } + for (int l = 0; l < 6; ++l) { + int offset = l * 8; + float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); + float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); + float32x4_t tmp3 = vaddq_f32(t[5 + offset], t[6 + offset]); + float32x4_t tmp4 = vsubq_f32(t[1 + offset], t[2 + offset]); + float32x4_t tmp5 = vsubq_f32(t[3 + offset], t[4 + offset]); + float32x4_t tmp6 = vsubq_f32(t[5 + offset], t[6 + offset]); + m[l] = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), tmp3), bias_ptr); + m[l + 6] = vaddq_f32(vaddq_f32(vaddq_f32(tmp4, vmulq_n_f32(tmp5, 2)), vmulq_n_f32(tmp6, 3)), bias_ptr); + m[l + 12] = vaddq_f32(vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)), vmulq_n_f32(tmp3, 9)), bias_ptr); + m[l + 18] = vaddq_f32(vaddq_f32(vaddq_f32(tmp4, vmulq_n_f32(tmp5, 8)), vmulq_n_f32(tmp6, 27)), bias_ptr); + m[l + 24] = vaddq_f32(vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 16)), vmulq_n_f32(tmp3, 81)), bias_ptr); + m[l + 30] = vaddq_f32( + vaddq_f32(vaddq_f32(vaddq_f32(tmp4, vmulq_n_f32(tmp5, 32)), vmulq_n_f32(tmp6, 243)), t[7 + offset]), bias_ptr); + } + for (int i = 0; i < 6; i++) { + int dst_k_offset = i * dst_step * C4NUM; + int m_k_offset = i * 6; + vst1q_f32(dst_data + dst_k_offset + 0 * C4NUM, m[m_k_offset]); + vst1q_f32(dst_data + dst_k_offset + 1 * C4NUM, m[m_k_offset + 1]); + vst1q_f32(dst_data + dst_k_offset + 2 * C4NUM, m[m_k_offset + 2]); + vst1q_f32(dst_data + dst_k_offset + 3 * C4NUM, m[m_k_offset + 3]); + vst1q_f32(dst_data + dst_k_offset + 4 * C4NUM, m[m_k_offset + 4]); + vst1q_f32(dst_data + dst_k_offset + 5 * C4NUM, m[m_k_offset + 5]); + } #else - for 
(int i = 0; i < C4NUM; i++) { - float src_data_00 = src_data[i]; - float src_data_01 = src_data[i + src_step]; - float src_data_02 = src_data[i + 2 * src_step]; - float src_data_03 = src_data[i + 3 * src_step]; - float src_data_04 = src_data[i + 4 * src_step]; - float src_data_05 = src_data[i + 5 * src_step]; - float src_data_06 = src_data[i + 6 * src_step]; - float src_data_07 = src_data[i + 7 * src_step]; - float src_data_10 = src_data[i + 8 * src_step]; - float src_data_11 = src_data[i + 9 * src_step]; - float src_data_12 = src_data[i + 10 * src_step]; - float src_data_13 = src_data[i + 11 * src_step]; - float src_data_14 = src_data[i + 12 * src_step]; - float src_data_15 = src_data[i + 13 * src_step]; - float src_data_16 = src_data[i + 14 * src_step]; - float src_data_17 = src_data[i + 15 * src_step]; - float src_data_20 = src_data[i + 16 * src_step]; - float src_data_21 = src_data[i + 17 * src_step]; - float src_data_22 = src_data[i + 18 * src_step]; - float src_data_23 = src_data[i + 19 * src_step]; - float src_data_24 = src_data[i + 20 * src_step]; - float src_data_25 = src_data[i + 21 * src_step]; - float src_data_26 = src_data[i + 22 * src_step]; - float src_data_27 = src_data[i + 23 * src_step]; - float src_data_30 = src_data[i + 24 * src_step]; - float src_data_31 = src_data[i + 25 * src_step]; - float src_data_32 = src_data[i + 26 * src_step]; - float src_data_33 = src_data[i + 27 * src_step]; - float src_data_34 = src_data[i + 28 * src_step]; - float src_data_35 = src_data[i + 29 * src_step]; - float src_data_36 = src_data[i + 30 * src_step]; - float src_data_37 = src_data[i + 31 * src_step]; - float src_data_40 = src_data[i + 32 * src_step]; - float src_data_41 = src_data[i + 33 * src_step]; - float src_data_42 = src_data[i + 34 * src_step]; - float src_data_43 = src_data[i + 35 * src_step]; - float src_data_44 = src_data[i + 36 * src_step]; - float src_data_45 = src_data[i + 37 * src_step]; - float src_data_46 = src_data[i + 38 * src_step]; - float 
src_data_47 = src_data[i + 39 * src_step]; - float src_data_50 = src_data[i + 40 * src_step]; - float src_data_51 = src_data[i + 41 * src_step]; - float src_data_52 = src_data[i + 42 * src_step]; - float src_data_53 = src_data[i + 43 * src_step]; - float src_data_54 = src_data[i + 44 * src_step]; - float src_data_55 = src_data[i + 45 * src_step]; - float src_data_56 = src_data[i + 46 * src_step]; - float src_data_57 = src_data[i + 47 * src_step]; - float src_data_60 = src_data[i + 48 * src_step]; - float src_data_61 = src_data[i + 49 * src_step]; - float src_data_62 = src_data[i + 50 * src_step]; - float src_data_63 = src_data[i + 51 * src_step]; - float src_data_64 = src_data[i + 52 * src_step]; - float src_data_65 = src_data[i + 53 * src_step]; - float src_data_66 = src_data[i + 54 * src_step]; - float src_data_67 = src_data[i + 55 * src_step]; - float src_data_70 = src_data[i + 56 * src_step]; - float src_data_71 = src_data[i + 57 * src_step]; - float src_data_72 = src_data[i + 58 * src_step]; - float src_data_73 = src_data[i + 59 * src_step]; - float src_data_74 = src_data[i + 60 * src_step]; - float src_data_75 = src_data[i + 61 * src_step]; - float src_data_76 = src_data[i + 62 * src_step]; - float src_data_77 = src_data[i + 63 * src_step]; - - float d01 = src_data_10 - src_data_20; - float d02 = src_data_11 - src_data_21; - float d03 = src_data_12 - src_data_22; - float d04 = src_data_13 - src_data_23; - float d05 = src_data_14 - src_data_24; - float d06 = src_data_15 - src_data_25; - float d07 = src_data_16 - src_data_26; - float d08 = src_data_17 - src_data_27; - - float d11 = src_data_30 - src_data_40; - float d12 = src_data_31 - src_data_41; - float d13 = src_data_32 - src_data_42; - float d14 = src_data_33 - src_data_43; - float d15 = src_data_34 - src_data_44; - float d16 = src_data_35 - src_data_45; - float d17 = src_data_36 - src_data_46; - float d18 = src_data_37 - src_data_47; - - float d21 = src_data_50 - src_data_60; - float d22 = src_data_51 - 
src_data_61; - float d23 = src_data_52 - src_data_62; - float d24 = src_data_53 - src_data_63; - float d25 = src_data_54 - src_data_64; - float d26 = src_data_55 - src_data_65; - float d27 = src_data_56 - src_data_66; - float d28 = src_data_57 - src_data_67; - - float d31 = src_data_10 + src_data_20; - float d32 = src_data_11 + src_data_21; - float d33 = src_data_12 + src_data_22; - float d34 = src_data_13 + src_data_23; - float d35 = src_data_14 + src_data_24; - float d36 = src_data_15 + src_data_25; - float d37 = src_data_16 + src_data_26; - float d38 = src_data_17 + src_data_27; - - float d41 = src_data_30 + src_data_40; - float d42 = src_data_31 + src_data_41; - float d43 = src_data_32 + src_data_42; - float d44 = src_data_33 + src_data_43; - float d45 = src_data_34 + src_data_44; - float d46 = src_data_35 + src_data_45; - float d47 = src_data_36 + src_data_46; - float d48 = src_data_37 + src_data_47; - - float d51 = src_data_50 + src_data_60; - float d52 = src_data_51 + src_data_61; - float d53 = src_data_52 + src_data_62; - float d54 = src_data_53 + src_data_63; - float d55 = src_data_54 + src_data_64; - float d56 = src_data_55 + src_data_65; - float d57 = src_data_56 + src_data_66; - float d58 = src_data_57 + src_data_67; - - float t00 = src_data_00 + src_data_10 + src_data_20 + src_data_30 + src_data_40 + src_data_50 + src_data_60; - float t01 = src_data_01 + src_data_11 + src_data_21 + src_data_31 + src_data_41 + src_data_51 + src_data_61; - float t02 = src_data_02 + src_data_12 + src_data_22 + src_data_32 + src_data_42 + src_data_52 + src_data_62; - float t03 = src_data_03 + src_data_13 + src_data_23 + src_data_33 + src_data_43 + src_data_53 + src_data_63; - float t04 = src_data_04 + src_data_14 + src_data_24 + src_data_34 + src_data_44 + src_data_54 + src_data_64; - float t05 = src_data_05 + src_data_15 + src_data_25 + src_data_35 + src_data_45 + src_data_55 + src_data_65; - float t06 = src_data_06 + src_data_16 + src_data_26 + src_data_36 + src_data_46 
+ src_data_56 + src_data_66; - float t07 = src_data_07 + src_data_17 + src_data_27 + src_data_37 + src_data_47 + src_data_57 + src_data_67; - - const float t10 = 0.5f * d01 + d11 + 1.5f * d21; - const float t11 = 0.5f * d02 + d12 + 1.5f * d22; - const float t12 = 0.5f * d03 + d13 + 1.5f * d23; - const float t13 = 0.5f * d04 + d14 + 1.5f * d24; - const float t14 = 0.5f * d05 + d15 + 1.5f * d25; - const float t15 = 0.5f * d06 + d16 + 1.5f * d26; - const float t16 = 0.5f * d07 + d17 + 1.5f * d27; - const float t17 = 0.5f * d08 + d18 + 1.5f * d28; - - const float t20 = 0.25f * d31 + d41 + 2.25f * d51; - const float t21 = 0.25f * d32 + d42 + 2.25f * d52; - const float t22 = 0.25f * d33 + d43 + 2.25f * d53; - const float t23 = 0.25f * d34 + d44 + 2.25f * d54; - const float t24 = 0.25f * d35 + d45 + 2.25f * d55; - const float t25 = 0.25f * d36 + d46 + 2.25f * d56; - const float t26 = 0.25f * d37 + d47 + 2.25f * d57; - const float t27 = 0.25f * d38 + d48 + 2.25f * d58; - - const float t30 = 0.125f * d01 + d11 + 3.375f * d21; - const float t31 = 0.125f * d02 + d12 + 3.375f * d22; - const float t32 = 0.125f * d03 + d13 + 3.375f * d23; - const float t33 = 0.125f * d04 + d14 + 3.375f * d24; - const float t34 = 0.125f * d05 + d15 + 3.375f * d25; - const float t35 = 0.125f * d06 + d16 + 3.375f * d26; - const float t36 = 0.125f * d07 + d17 + 3.375f * d27; - const float t37 = 0.125f * d08 + d18 + 3.375f * d28; - - const float t40 = 0.0625f * d31 + d41 + 5.0625f * d51; - const float t41 = 0.0625f * d32 + d42 + 5.0625f * d52; - const float t42 = 0.0625f * d33 + d43 + 5.0625f * d53; - const float t43 = 0.0625f * d34 + d44 + 5.0625f * d54; - const float t44 = 0.0625f * d35 + d45 + 5.0625f * d55; - const float t45 = 0.0625f * d36 + d46 + 5.0625f * d56; - const float t46 = 0.0625f * d37 + d47 + 5.0625f * d57; - const float t47 = 0.0625f * d38 + d48 + 5.0625f * d58; - - const float t50 = 0.03125f * d01 + d11 + 7.59375f * d21 + src_data_70; - const float t51 = 0.03125f * d02 + d12 + 
7.59375f * d22 + src_data_71; - const float t52 = 0.03125f * d03 + d13 + 7.59375f * d23 + src_data_72; - const float t53 = 0.03125f * d04 + d14 + 7.59375f * d24 + src_data_73; - const float t54 = 0.03125f * d05 + d15 + 7.59375f * d25 + src_data_74; - const float t55 = 0.03125f * d06 + d16 + 7.59375f * d26 + src_data_75; - const float t56 = 0.03125f * d07 + d17 + 7.59375f * d27 + src_data_76; - const float t57 = 0.03125f * d08 + d18 + 7.59375f * d28 + src_data_77; - - float s11 = t01 - t02; - float s12 = t11 - t12; - float s13 = t21 - t22; - float s14 = t31 - t32; - float s15 = t41 - t42; - float s16 = t51 - t52; - - float s21 = t03 - t04; - float s22 = t13 - t14; - float s23 = t23 - t24; - float s24 = t33 - t34; - float s25 = t43 - t44; - float s26 = t53 - t54; - - float s31 = t05 - t06; - float s32 = t15 - t16; - float s33 = t25 - t26; - float s34 = t35 - t36; - float s35 = t45 - t46; - float s36 = t55 - t56; - - float s41 = t01 + t02; - float s42 = t11 + t12; - float s43 = t21 + t22; - float s44 = t31 + t32; - float s45 = t41 + t42; - float s46 = t51 + t52; - - float s51 = t03 + t04; - float s52 = t13 + t14; - float s53 = t23 + t24; - float s54 = t33 + t34; - float s55 = t43 + t44; - float s56 = t53 + t54; - - float s61 = t05 + t06; - float s62 = t15 + t16; - float s63 = t25 + t26; - float s64 = t35 + t36; - float s65 = t45 + t46; - float s66 = t55 + t56; - - float m00 = t00 + t01 + t02 + t03 + t04 + t05 + t06; - const float m01 = 0.5f * s11 + s21 + 1.5f * s31; - const float m02 = 0.25f * s41 + s51 + 2.25f * s61; - const float m03 = 0.125f * s11 + s21 + 3.375f * s31; - const float m04 = 0.0625f * s41 + s51 + 5.0625f * s61; - const float m05 = 0.03125f * s11 + s21 + 7.59375f * s31 + t07; - - float m10 = t10 + t11 + t12 + t13 + t14 + t15 + t16; - const float m11 = 0.5f * s12 + s22 + 1.5f * s32; - const float m12 = 0.25f * s42 + s52 + 2.25f * s62; - const float m13 = 0.125f * s12 + s22 + 3.375f * s32; - const float m14 = 0.0625f * s42 + s52 + 5.0625f * s62; - const 
float m15 = 0.03125f * s12 + s22 + 7.59375f * s32 + t17; - - float m20 = t20 + t21 + t22 + t23 + t24 + t25 + t26; - const float m21 = 0.5f * s13 + s23 + 1.5f * s33; - const float m22 = 0.25f * s43 + s53 + 2.25f * s63; - const float m23 = 0.125f * s13 + s23 + 3.375f * s33; - const float m24 = 0.0625f * s43 + s53 + 5.0625f * s63; - const float m25 = 0.03125f * s13 + s23 + 7.59375f * s33 + t27; - - float m30 = t30 + t31 + t32 + t33 + t34 + t35 + t36; - const float m31 = 0.5f * s14 + s24 + 1.5f * s34; - const float m32 = 0.25f * s44 + s54 + 2.25f * s64; - const float m33 = 0.125f * s14 + s24 + 3.375f * s34; - const float m34 = 0.0625f * s44 + s54 + 5.0625f * s64; - const float m35 = 0.03125f * s14 + s24 + 7.59375f * s34 + t37; - - float m40 = t40 + t41 + t42 + t43 + t44 + t45 + t46; - const float m41 = 0.5f * s15 + s25 + 1.5f * s35; - const float m42 = 0.25f * s45 + s55 + 2.25f * s65; - const float m43 = 0.125f * s15 + s25 + 3.375f * s35; - const float m44 = 0.0625f * s45 + s55 + 5.0625f * s65; - const float m45 = 0.03125f * s15 + s25 + 7.59375f * s35 + t47; - - float m50 = t50 + t51 + t52 + t53 + t54 + t55 + t56; - const float m51 = 0.5f * s16 + s26 + 1.5f * s36; - const float m52 = 0.25f * s46 + s56 + 2.25f * s66; - const float m53 = 0.125f * s16 + s26 + 3.375f * s36; - const float m54 = 0.0625f * s46 + s56 + 5.0625f * s66; - const float m55 = 0.03125f * s16 + s26 + 7.59375f * s36 + t57; - - (dst_data + i)[0] = m00 + bias_data[i]; - (dst_data + i + C4NUM)[0] = m01 + bias_data[i]; - (dst_data + i + 2 * C4NUM)[0] = m02 + bias_data[i]; - (dst_data + i + 3 * C4NUM)[0] = m03 + bias_data[i]; - (dst_data + i + 4 * C4NUM)[0] = m04 + bias_data[i]; - (dst_data + i + 5 * C4NUM)[0] = m05 + bias_data[i]; - - (dst_data + i + dst_step * C4NUM)[0] = m10 + bias_data[i]; - (dst_data + i + dst_step * C4NUM + C4NUM)[0] = m11 + bias_data[i]; - (dst_data + i + dst_step * C4NUM + 2 * C4NUM)[0] = m12 + bias_data[i]; - (dst_data + i + dst_step * C4NUM + 3 * C4NUM)[0] = m13 + bias_data[i]; - 
(dst_data + i + dst_step * C4NUM + 4 * C4NUM)[0] = m14 + bias_data[i]; - (dst_data + i + dst_step * C4NUM + 5 * C4NUM)[0] = m15 + bias_data[i]; - - (dst_data + i + 2 * dst_step * C4NUM)[0] = m20 + bias_data[i]; - (dst_data + i + 2 * dst_step * C4NUM + C4NUM)[0] = m21 + bias_data[i]; - (dst_data + i + 2 * dst_step * C4NUM + 2 * C4NUM)[0] = m22 + bias_data[i]; - (dst_data + i + 2 * dst_step * C4NUM + 3 * C4NUM)[0] = m23 + bias_data[i]; - (dst_data + i + 2 * dst_step * C4NUM + 4 * C4NUM)[0] = m24 + bias_data[i]; - (dst_data + i + 2 * dst_step * C4NUM + 5 * C4NUM)[0] = m25 + bias_data[i]; - - (dst_data + i + 3 * dst_step * C4NUM)[0] = m30 + bias_data[i]; - (dst_data + i + 3 * dst_step * C4NUM + C4NUM)[0] = m31 + bias_data[i]; - (dst_data + i + 3 * dst_step * C4NUM + 2 * C4NUM)[0] = m32 + bias_data[i]; - (dst_data + i + 3 * dst_step * C4NUM + 3 * C4NUM)[0] = m33 + bias_data[i]; - (dst_data + i + 3 * dst_step * C4NUM + 4 * C4NUM)[0] = m34 + bias_data[i]; - (dst_data + i + 3 * dst_step * C4NUM + 5 * C4NUM)[0] = m35 + bias_data[i]; - - (dst_data + i + 4 * dst_step * C4NUM)[0] = m40 + bias_data[i]; - (dst_data + i + 4 * dst_step * C4NUM + C4NUM)[0] = m41 + bias_data[i]; - (dst_data + i + 4 * dst_step * C4NUM + 2 * C4NUM)[0] = m42 + bias_data[i]; - (dst_data + i + 4 * dst_step * C4NUM + 3 * C4NUM)[0] = m43 + bias_data[i]; - (dst_data + i + 4 * dst_step * C4NUM + 4 * C4NUM)[0] = m44 + bias_data[i]; - (dst_data + i + 4 * dst_step * C4NUM + 5 * C4NUM)[0] = m45 + bias_data[i]; - - (dst_data + i + 5 * dst_step * C4NUM)[0] = m50 + bias_data[i]; - (dst_data + i + 5 * dst_step * C4NUM + C4NUM)[0] = m51 + bias_data[i]; - (dst_data + i + 5 * dst_step * C4NUM + 2 * C4NUM)[0] = m52 + bias_data[i]; - (dst_data + i + 5 * dst_step * C4NUM + 3 * C4NUM)[0] = m53 + bias_data[i]; - (dst_data + i + 5 * dst_step * C4NUM + 4 * C4NUM)[0] = m54 + bias_data[i]; - (dst_data + i + 5 * dst_step * C4NUM + 5 * C4NUM)[0] = m55 + bias_data[i]; + float src[64]; + float t[48]; + float m[36]; + for (int i = 
0; i < C4NUM; ++i) { + // load source data + for (int j = 0; j < 64; ++j) { + src[j] = src_data[i + j * src_step]; + } + for (int l = 0; l < 8; ++l) { + int offset = l * 8; + t[l] = src[offset] + src[1 + offset] + src[2 + offset] + src[3 + offset] + src[4 + offset] + src[5 + offset] + + src[6 + offset]; + t[l + 8] = src[1 + offset] - src[2 + offset] + 2 * (src[3 + offset] - src[4 + offset]) + + 3 * (src[5 + offset] - src[6 + offset]); + t[l + 16] = src[1 + offset] + src[2 + offset] + 4 * (src[3 + offset] + src[4 + offset]) + + 9 * (src[5 + offset] + src[6 + offset]); + t[l + 24] = src[1 + offset] - src[2 + offset] + 8 * (src[3 + offset] - src[4 + offset]) + + 27 * (src[5 + offset] - src[6 + offset]); + t[l + 32] = src[1 + offset] + src[2 + offset] + 16 * (src[3 + offset] + src[4 + offset]) + + 81 * (src[5 + offset] + src[6 + offset]); + t[l + 40] = src[1 + offset] - src[2 + offset] + 32 * (src[3 + offset] - src[4 + offset]) + + 243 * (src[5 + offset] - src[6 + offset]) + src[7 + offset]; + } + for (int l = 0; l < 6; ++l) { + int offset = l * 8; + m[l] = t[offset] + t[1 + offset] + t[2 + offset] + t[3 + offset] + t[4 + offset] + t[5 + offset] + t[6 + offset]; + m[l + 6] = + t[1 + offset] - t[2 + offset] + 2 * (t[3 + offset] - t[4 + offset]) + 3 * (t[5 + offset] - t[6 + offset]); + m[l + 12] = + t[1 + offset] + t[2 + offset] + 4 * (t[3 + offset] + t[4 + offset]) + 9 * (t[5 + offset] + t[6 + offset]); + m[l + 18] = + t[1 + offset] - t[2 + offset] + 8 * (t[3 + offset] - t[4 + offset]) + 27 * (t[5 + offset] - t[6 + offset]); + m[l + 24] = + t[1 + offset] + t[2 + offset] + 16 * (t[3 + offset] + t[4 + offset]) + 81 * (t[5 + offset] + t[6 + offset]); + m[l + 30] = t[1 + offset] - t[2 + offset] + 32 * (t[3 + offset] - t[4 + offset]) + + 243 * (t[5 + offset] - t[6 + offset]) + t[7 + offset]; + } + // store output + for (int k = 0; k < 6; ++k) { + int dst_k_offset = k * dst_step * C4NUM; + int m_k_offset = k * 6; + for (int j = 0; j < 6; ++j) { + dst_data[i + dst_k_offset + j 
* C4NUM] = m[j + m_k_offset] + bias_data[i]; + } + } } #endif } - void OutputTransform8x7Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step) { #ifdef ENABLE_ARM - float32x4_t src_data_00 = vld1q_f32(src_data + 0 * src_step); - float32x4_t src_data_01 = vld1q_f32(src_data + 1 * src_step); - float32x4_t src_data_02 = vld1q_f32(src_data + 2 * src_step); - float32x4_t src_data_03 = vld1q_f32(src_data + 3 * src_step); - float32x4_t src_data_04 = vld1q_f32(src_data + 4 * src_step); - float32x4_t src_data_05 = vld1q_f32(src_data + 5 * src_step); - float32x4_t src_data_06 = vld1q_f32(src_data + 6 * src_step); - float32x4_t src_data_07 = vld1q_f32(src_data + 7 * src_step); - float32x4_t src_data_10 = vld1q_f32(src_data + 8 * src_step); - float32x4_t src_data_11 = vld1q_f32(src_data + 9 * src_step); - float32x4_t src_data_12 = vld1q_f32(src_data + 10 * src_step); - float32x4_t src_data_13 = vld1q_f32(src_data + 11 * src_step); - float32x4_t src_data_14 = vld1q_f32(src_data + 12 * src_step); - float32x4_t src_data_15 = vld1q_f32(src_data + 13 * src_step); - float32x4_t src_data_16 = vld1q_f32(src_data + 14 * src_step); - float32x4_t src_data_17 = vld1q_f32(src_data + 15 * src_step); - float32x4_t src_data_20 = vld1q_f32(src_data + 16 * src_step); - float32x4_t src_data_21 = vld1q_f32(src_data + 17 * src_step); - float32x4_t src_data_22 = vld1q_f32(src_data + 18 * src_step); - float32x4_t src_data_23 = vld1q_f32(src_data + 19 * src_step); - float32x4_t src_data_24 = vld1q_f32(src_data + 20 * src_step); - float32x4_t src_data_25 = vld1q_f32(src_data + 21 * src_step); - float32x4_t src_data_26 = vld1q_f32(src_data + 22 * src_step); - float32x4_t src_data_27 = vld1q_f32(src_data + 23 * src_step); - float32x4_t src_data_30 = vld1q_f32(src_data + 24 * src_step); - float32x4_t src_data_31 = vld1q_f32(src_data + 25 * src_step); - float32x4_t src_data_32 = vld1q_f32(src_data + 26 * src_step); - float32x4_t src_data_33 = vld1q_f32(src_data + 
27 * src_step); - float32x4_t src_data_34 = vld1q_f32(src_data + 28 * src_step); - float32x4_t src_data_35 = vld1q_f32(src_data + 29 * src_step); - float32x4_t src_data_36 = vld1q_f32(src_data + 30 * src_step); - float32x4_t src_data_37 = vld1q_f32(src_data + 31 * src_step); - float32x4_t src_data_40 = vld1q_f32(src_data + 32 * src_step); - float32x4_t src_data_41 = vld1q_f32(src_data + 33 * src_step); - float32x4_t src_data_42 = vld1q_f32(src_data + 34 * src_step); - float32x4_t src_data_43 = vld1q_f32(src_data + 35 * src_step); - float32x4_t src_data_44 = vld1q_f32(src_data + 36 * src_step); - float32x4_t src_data_45 = vld1q_f32(src_data + 37 * src_step); - float32x4_t src_data_46 = vld1q_f32(src_data + 38 * src_step); - float32x4_t src_data_47 = vld1q_f32(src_data + 39 * src_step); - float32x4_t src_data_50 = vld1q_f32(src_data + 40 * src_step); - float32x4_t src_data_51 = vld1q_f32(src_data + 41 * src_step); - float32x4_t src_data_52 = vld1q_f32(src_data + 42 * src_step); - float32x4_t src_data_53 = vld1q_f32(src_data + 43 * src_step); - float32x4_t src_data_54 = vld1q_f32(src_data + 44 * src_step); - float32x4_t src_data_55 = vld1q_f32(src_data + 45 * src_step); - float32x4_t src_data_56 = vld1q_f32(src_data + 46 * src_step); - float32x4_t src_data_57 = vld1q_f32(src_data + 47 * src_step); - float32x4_t src_data_60 = vld1q_f32(src_data + 48 * src_step); - float32x4_t src_data_61 = vld1q_f32(src_data + 49 * src_step); - float32x4_t src_data_62 = vld1q_f32(src_data + 50 * src_step); - float32x4_t src_data_63 = vld1q_f32(src_data + 51 * src_step); - float32x4_t src_data_64 = vld1q_f32(src_data + 52 * src_step); - float32x4_t src_data_65 = vld1q_f32(src_data + 53 * src_step); - float32x4_t src_data_66 = vld1q_f32(src_data + 54 * src_step); - float32x4_t src_data_67 = vld1q_f32(src_data + 55 * src_step); - float32x4_t src_data_70 = vld1q_f32(src_data + 56 * src_step); - float32x4_t src_data_71 = vld1q_f32(src_data + 57 * src_step); - float32x4_t src_data_72 = 
vld1q_f32(src_data + 58 * src_step); - float32x4_t src_data_73 = vld1q_f32(src_data + 59 * src_step); - float32x4_t src_data_74 = vld1q_f32(src_data + 60 * src_step); - float32x4_t src_data_75 = vld1q_f32(src_data + 61 * src_step); - float32x4_t src_data_76 = vld1q_f32(src_data + 62 * src_step); - float32x4_t src_data_77 = vld1q_f32(src_data + 63 * src_step); - - float32x4_t d01 = vsubq_f32(src_data_10, src_data_20); - float32x4_t d02 = vsubq_f32(src_data_11, src_data_21); - float32x4_t d03 = vsubq_f32(src_data_12, src_data_22); - float32x4_t d04 = vsubq_f32(src_data_13, src_data_23); - float32x4_t d05 = vsubq_f32(src_data_14, src_data_24); - float32x4_t d06 = vsubq_f32(src_data_15, src_data_25); - float32x4_t d07 = vsubq_f32(src_data_16, src_data_26); - float32x4_t d08 = vsubq_f32(src_data_17, src_data_27); - - float32x4_t d11 = vsubq_f32(src_data_30, src_data_40); - float32x4_t d12 = vsubq_f32(src_data_31, src_data_41); - float32x4_t d13 = vsubq_f32(src_data_32, src_data_42); - float32x4_t d14 = vsubq_f32(src_data_33, src_data_43); - float32x4_t d15 = vsubq_f32(src_data_34, src_data_44); - float32x4_t d16 = vsubq_f32(src_data_35, src_data_45); - float32x4_t d17 = vsubq_f32(src_data_36, src_data_46); - float32x4_t d18 = vsubq_f32(src_data_37, src_data_47); - - float32x4_t d21 = vsubq_f32(src_data_50, src_data_60); - float32x4_t d22 = vsubq_f32(src_data_51, src_data_61); - float32x4_t d23 = vsubq_f32(src_data_52, src_data_62); - float32x4_t d24 = vsubq_f32(src_data_53, src_data_63); - float32x4_t d25 = vsubq_f32(src_data_54, src_data_64); - float32x4_t d26 = vsubq_f32(src_data_55, src_data_65); - float32x4_t d27 = vsubq_f32(src_data_56, src_data_66); - float32x4_t d28 = vsubq_f32(src_data_57, src_data_67); - - float32x4_t d31 = vaddq_f32(src_data_10, src_data_20); - float32x4_t d32 = vaddq_f32(src_data_11, src_data_21); - float32x4_t d33 = vaddq_f32(src_data_12, src_data_22); - float32x4_t d34 = vaddq_f32(src_data_13, src_data_23); - float32x4_t d35 = 
vaddq_f32(src_data_14, src_data_24); - float32x4_t d36 = vaddq_f32(src_data_15, src_data_25); - float32x4_t d37 = vaddq_f32(src_data_16, src_data_26); - float32x4_t d38 = vaddq_f32(src_data_17, src_data_27); - - float32x4_t d41 = vaddq_f32(src_data_30, src_data_40); - float32x4_t d42 = vaddq_f32(src_data_31, src_data_41); - float32x4_t d43 = vaddq_f32(src_data_32, src_data_42); - float32x4_t d44 = vaddq_f32(src_data_33, src_data_43); - float32x4_t d45 = vaddq_f32(src_data_34, src_data_44); - float32x4_t d46 = vaddq_f32(src_data_35, src_data_45); - float32x4_t d47 = vaddq_f32(src_data_36, src_data_46); - float32x4_t d48 = vaddq_f32(src_data_37, src_data_47); - - float32x4_t d51 = vaddq_f32(src_data_50, src_data_60); - float32x4_t d52 = vaddq_f32(src_data_51, src_data_61); - float32x4_t d53 = vaddq_f32(src_data_52, src_data_62); - float32x4_t d54 = vaddq_f32(src_data_53, src_data_63); - float32x4_t d55 = vaddq_f32(src_data_54, src_data_64); - float32x4_t d56 = vaddq_f32(src_data_55, src_data_65); - float32x4_t d57 = vaddq_f32(src_data_56, src_data_66); - float32x4_t d58 = vaddq_f32(src_data_57, src_data_67); - - float32x4_t t00 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_00, src_data_10), src_data_20), src_data_30), src_data_40), - src_data_50), - src_data_60); - float32x4_t t01 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_01, src_data_11), src_data_21), src_data_31), src_data_41), - src_data_51), - src_data_61); - float32x4_t t02 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_02, src_data_12), src_data_22), src_data_32), src_data_42), - src_data_52), - src_data_62); - float32x4_t t03 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_03, src_data_13), src_data_23), src_data_33), src_data_43), - src_data_53), - src_data_63); - float32x4_t t04 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_04, src_data_14), 
src_data_24), src_data_34), src_data_44), - src_data_54), - src_data_64); - float32x4_t t05 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_05, src_data_15), src_data_25), src_data_35), src_data_45), - src_data_55), - src_data_65); - float32x4_t t06 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_06, src_data_16), src_data_26), src_data_36), src_data_46), - src_data_56), - src_data_66); - float32x4_t t07 = vaddq_f32( - vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src_data_07, src_data_17), src_data_27), src_data_37), src_data_47), - src_data_57), - src_data_67); - - float32x4_t t10 = vaddq_f32(vaddq_f32(vmulq_n_f32(d01, 0.5), d11), vmulq_n_f32(d21, 1.5)); - float32x4_t t11 = vaddq_f32(vaddq_f32(vmulq_n_f32(d02, 0.5), d12), vmulq_n_f32(d22, 1.5)); - float32x4_t t12 = vaddq_f32(vaddq_f32(vmulq_n_f32(d03, 0.5), d13), vmulq_n_f32(d23, 1.5)); - float32x4_t t13 = vaddq_f32(vaddq_f32(vmulq_n_f32(d04, 0.5), d14), vmulq_n_f32(d24, 1.5)); - float32x4_t t14 = vaddq_f32(vaddq_f32(vmulq_n_f32(d05, 0.5), d15), vmulq_n_f32(d25, 1.5)); - float32x4_t t15 = vaddq_f32(vaddq_f32(vmulq_n_f32(d06, 0.5), d16), vmulq_n_f32(d26, 1.5)); - float32x4_t t16 = vaddq_f32(vaddq_f32(vmulq_n_f32(d07, 0.5), d17), vmulq_n_f32(d27, 1.5)); - float32x4_t t17 = vaddq_f32(vaddq_f32(vmulq_n_f32(d08, 0.5), d18), vmulq_n_f32(d28, 1.5)); - - float32x4_t t20 = vaddq_f32(vaddq_f32(vmulq_n_f32(d31, 0.25), d41), vmulq_n_f32(d51, 2.25)); - float32x4_t t21 = vaddq_f32(vaddq_f32(vmulq_n_f32(d32, 0.25), d42), vmulq_n_f32(d52, 2.25)); - float32x4_t t22 = vaddq_f32(vaddq_f32(vmulq_n_f32(d33, 0.25), d43), vmulq_n_f32(d53, 2.25)); - float32x4_t t23 = vaddq_f32(vaddq_f32(vmulq_n_f32(d34, 0.25), d44), vmulq_n_f32(d54, 2.25)); - float32x4_t t24 = vaddq_f32(vaddq_f32(vmulq_n_f32(d35, 0.25), d45), vmulq_n_f32(d55, 2.25)); - float32x4_t t25 = vaddq_f32(vaddq_f32(vmulq_n_f32(d36, 0.25), d46), vmulq_n_f32(d56, 2.25)); - float32x4_t t26 = 
vaddq_f32(vaddq_f32(vmulq_n_f32(d37, 0.25), d47), vmulq_n_f32(d57, 2.25)); - float32x4_t t27 = vaddq_f32(vaddq_f32(vmulq_n_f32(d38, 0.25), d48), vmulq_n_f32(d58, 2.25)); - - float32x4_t t30 = vaddq_f32(vaddq_f32(vmulq_n_f32(d01, 0.125), d11), vmulq_n_f32(d21, 3.375)); - float32x4_t t31 = vaddq_f32(vaddq_f32(vmulq_n_f32(d02, 0.125), d12), vmulq_n_f32(d22, 3.375)); - float32x4_t t32 = vaddq_f32(vaddq_f32(vmulq_n_f32(d03, 0.125), d13), vmulq_n_f32(d23, 3.375)); - float32x4_t t33 = vaddq_f32(vaddq_f32(vmulq_n_f32(d04, 0.125), d14), vmulq_n_f32(d24, 3.375)); - float32x4_t t34 = vaddq_f32(vaddq_f32(vmulq_n_f32(d05, 0.125), d15), vmulq_n_f32(d25, 3.375)); - float32x4_t t35 = vaddq_f32(vaddq_f32(vmulq_n_f32(d06, 0.125), d16), vmulq_n_f32(d26, 3.375)); - float32x4_t t36 = vaddq_f32(vaddq_f32(vmulq_n_f32(d07, 0.125), d17), vmulq_n_f32(d27, 3.375)); - float32x4_t t37 = vaddq_f32(vaddq_f32(vmulq_n_f32(d08, 0.125), d18), vmulq_n_f32(d28, 3.375)); - - float32x4_t t40 = vaddq_f32(vaddq_f32(vmulq_n_f32(d31, 0.0625), d41), vmulq_n_f32(d51, 5.0625)); - float32x4_t t41 = vaddq_f32(vaddq_f32(vmulq_n_f32(d32, 0.0625), d42), vmulq_n_f32(d52, 5.0625)); - float32x4_t t42 = vaddq_f32(vaddq_f32(vmulq_n_f32(d33, 0.0625), d43), vmulq_n_f32(d53, 5.0625)); - float32x4_t t43 = vaddq_f32(vaddq_f32(vmulq_n_f32(d34, 0.0625), d44), vmulq_n_f32(d54, 5.0625)); - float32x4_t t44 = vaddq_f32(vaddq_f32(vmulq_n_f32(d35, 0.0625), d45), vmulq_n_f32(d55, 5.0625)); - float32x4_t t45 = vaddq_f32(vaddq_f32(vmulq_n_f32(d36, 0.0625), d46), vmulq_n_f32(d56, 5.0625)); - float32x4_t t46 = vaddq_f32(vaddq_f32(vmulq_n_f32(d37, 0.0625), d47), vmulq_n_f32(d57, 5.0625)); - float32x4_t t47 = vaddq_f32(vaddq_f32(vmulq_n_f32(d38, 0.0625), d48), vmulq_n_f32(d58, 5.0625)); - - float32x4_t t50 = vaddq_f32(vaddq_f32(vmulq_n_f32(d01, 0.03125), d11), vmulq_n_f32(d21, 7.59375)); - float32x4_t t51 = vaddq_f32(vaddq_f32(vmulq_n_f32(d02, 0.03125), d12), vmulq_n_f32(d22, 7.59375)); - float32x4_t t52 = 
vaddq_f32(vaddq_f32(vmulq_n_f32(d03, 0.03125), d13), vmulq_n_f32(d23, 7.59375)); - float32x4_t t53 = vaddq_f32(vaddq_f32(vmulq_n_f32(d04, 0.03125), d14), vmulq_n_f32(d24, 7.59375)); - float32x4_t t54 = vaddq_f32(vaddq_f32(vmulq_n_f32(d05, 0.03125), d15), vmulq_n_f32(d25, 7.59375)); - float32x4_t t55 = vaddq_f32(vaddq_f32(vmulq_n_f32(d06, 0.03125), d16), vmulq_n_f32(d26, 7.59375)); - float32x4_t t56 = vaddq_f32(vaddq_f32(vmulq_n_f32(d07, 0.03125), d17), vmulq_n_f32(d27, 7.59375)); - float32x4_t t57 = vaddq_f32(vaddq_f32(vmulq_n_f32(d08, 0.03125), d18), vmulq_n_f32(d28, 7.59375)); - - float32x4_t t60 = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d31, 0.015625), d41), vmulq_n_f32(d51, 11.390625)), src_data_70); - float32x4_t t61 = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d32, 0.015625), d42), vmulq_n_f32(d52, 11.390625)), src_data_71); - float32x4_t t62 = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d33, 0.015625), d43), vmulq_n_f32(d53, 11.390625)), src_data_72); - float32x4_t t63 = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d34, 0.015625), d44), vmulq_n_f32(d54, 11.390625)), src_data_73); - float32x4_t t64 = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d35, 0.015625), d45), vmulq_n_f32(d55, 11.390625)), src_data_74); - float32x4_t t65 = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d36, 0.015625), d46), vmulq_n_f32(d56, 11.390625)), src_data_75); - float32x4_t t66 = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d37, 0.015625), d47), vmulq_n_f32(d57, 11.390625)), src_data_76); - float32x4_t t67 = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(d38, 0.015625), d48), vmulq_n_f32(d58, 11.390625)), src_data_77); - - float32x4_t s11 = vsubq_f32(t01, t02); - float32x4_t s12 = vsubq_f32(t11, t12); - float32x4_t s13 = vsubq_f32(t21, t22); - float32x4_t s14 = vsubq_f32(t31, t32); - float32x4_t s15 = vsubq_f32(t41, t42); - float32x4_t s16 = vsubq_f32(t51, t52); - float32x4_t s17 = vsubq_f32(t61, t62); - - float32x4_t s21 = vsubq_f32(t03, t04); - float32x4_t s22 = vsubq_f32(t13, 
t14); - float32x4_t s23 = vsubq_f32(t23, t24); - float32x4_t s24 = vsubq_f32(t33, t34); - float32x4_t s25 = vsubq_f32(t43, t44); - float32x4_t s26 = vsubq_f32(t53, t54); - float32x4_t s27 = vsubq_f32(t63, t64); - - float32x4_t s31 = vsubq_f32(t05, t06); - float32x4_t s32 = vsubq_f32(t15, t16); - float32x4_t s33 = vsubq_f32(t25, t26); - float32x4_t s34 = vsubq_f32(t35, t36); - float32x4_t s35 = vsubq_f32(t45, t46); - float32x4_t s36 = vsubq_f32(t55, t56); - float32x4_t s37 = vsubq_f32(t65, t66); - - float32x4_t s41 = vaddq_f32(t01, t02); - float32x4_t s42 = vaddq_f32(t11, t12); - float32x4_t s43 = vaddq_f32(t21, t22); - float32x4_t s44 = vaddq_f32(t31, t32); - float32x4_t s45 = vaddq_f32(t41, t42); - float32x4_t s46 = vaddq_f32(t51, t52); - float32x4_t s47 = vaddq_f32(t61, t62); - - float32x4_t s51 = vaddq_f32(t03, t04); - float32x4_t s52 = vaddq_f32(t13, t14); - float32x4_t s53 = vaddq_f32(t23, t24); - float32x4_t s54 = vaddq_f32(t33, t34); - float32x4_t s55 = vaddq_f32(t43, t44); - float32x4_t s56 = vaddq_f32(t53, t54); - float32x4_t s57 = vaddq_f32(t63, t64); - - float32x4_t s61 = vaddq_f32(t05, t06); - float32x4_t s62 = vaddq_f32(t15, t16); - float32x4_t s63 = vaddq_f32(t25, t26); - float32x4_t s64 = vaddq_f32(t35, t36); - float32x4_t s65 = vaddq_f32(t45, t46); - float32x4_t s66 = vaddq_f32(t55, t56); - float32x4_t s67 = vaddq_f32(t65, t66); - - float32x4_t m00 = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t00, t01), t02), t03), t04), t05), t06); - float32x4_t m01 = vaddq_f32(vaddq_f32(vmulq_n_f32(s11, 0.5), s21), vmulq_n_f32(s31, 1.5)); - float32x4_t m02 = vaddq_f32(vaddq_f32(vmulq_n_f32(s41, 0.25), s51), vmulq_n_f32(s61, 2.25)); - float32x4_t m03 = vaddq_f32(vaddq_f32(vmulq_n_f32(s11, 0.125), s21), vmulq_n_f32(s31, 3.375)); - float32x4_t m04 = vaddq_f32(vaddq_f32(vmulq_n_f32(s41, 0.0625), s51), vmulq_n_f32(s61, 5.0625)); - float32x4_t m05 = vaddq_f32(vaddq_f32(vmulq_n_f32(s11, 0.03125), s21), vmulq_n_f32(s31, 7.59375)); - float32x4_t m06 = 
vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(s41, 0.015625), s51), vmulq_n_f32(s61, 11.390625)), t07); - - float32x4_t m10 = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t10, t11), t12), t13), t14), t15), t16); - float32x4_t m11 = vaddq_f32(vaddq_f32(vmulq_n_f32(s12, 0.5), s22), vmulq_n_f32(s32, 1.5)); - float32x4_t m12 = vaddq_f32(vaddq_f32(vmulq_n_f32(s42, 0.25), s52), vmulq_n_f32(s62, 2.25)); - float32x4_t m13 = vaddq_f32(vaddq_f32(vmulq_n_f32(s12, 0.125), s22), vmulq_n_f32(s32, 3.375)); - float32x4_t m14 = vaddq_f32(vaddq_f32(vmulq_n_f32(s42, 0.0625), s52), vmulq_n_f32(s62, 5.0625)); - float32x4_t m15 = vaddq_f32(vaddq_f32(vmulq_n_f32(s12, 0.03125), s22), vmulq_n_f32(s32, 7.59375)); - float32x4_t m16 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(s42, 0.015625), s52), vmulq_n_f32(s62, 11.390625)), t17); - - float32x4_t m20 = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t20, t21), t22), t23), t24), t25), t26); - float32x4_t m21 = vaddq_f32(vaddq_f32(vmulq_n_f32(s13, 0.5), s23), vmulq_n_f32(s33, 1.5)); - float32x4_t m22 = vaddq_f32(vaddq_f32(vmulq_n_f32(s43, 0.25), s53), vmulq_n_f32(s63, 2.25)); - float32x4_t m23 = vaddq_f32(vaddq_f32(vmulq_n_f32(s13, 0.125), s23), vmulq_n_f32(s33, 3.375)); - float32x4_t m24 = vaddq_f32(vaddq_f32(vmulq_n_f32(s43, 0.0625), s53), vmulq_n_f32(s63, 5.0625)); - float32x4_t m25 = vaddq_f32(vaddq_f32(vmulq_n_f32(s13, 0.03125), s23), vmulq_n_f32(s33, 7.59375)); - float32x4_t m26 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(s43, 0.015625), s53), vmulq_n_f32(s63, 11.390625)), t27); - - float32x4_t m30 = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t30, t31), t32), t33), t34), t35), t36); - float32x4_t m31 = vaddq_f32(vaddq_f32(vmulq_n_f32(s14, 0.5), s24), vmulq_n_f32(s34, 1.5)); - float32x4_t m32 = vaddq_f32(vaddq_f32(vmulq_n_f32(s44, 0.25), s54), vmulq_n_f32(s64, 2.25)); - float32x4_t m33 = vaddq_f32(vaddq_f32(vmulq_n_f32(s14, 0.125), s24), vmulq_n_f32(s34, 3.375)); - float32x4_t m34 = 
vaddq_f32(vaddq_f32(vmulq_n_f32(s44, 0.0625), s54), vmulq_n_f32(s64, 5.0625)); - float32x4_t m35 = vaddq_f32(vaddq_f32(vmulq_n_f32(s14, 0.03125), s24), vmulq_n_f32(s34, 7.59375)); - float32x4_t m36 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(s44, 0.015625), s54), vmulq_n_f32(s64, 11.390625)), t37); - - float32x4_t m40 = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t40, t41), t42), t43), t44), t45), t46); - float32x4_t m41 = vaddq_f32(vaddq_f32(vmulq_n_f32(s15, 0.5), s25), vmulq_n_f32(s35, 1.5)); - float32x4_t m42 = vaddq_f32(vaddq_f32(vmulq_n_f32(s45, 0.25), s55), vmulq_n_f32(s65, 2.25)); - float32x4_t m43 = vaddq_f32(vaddq_f32(vmulq_n_f32(s15, 0.125), s25), vmulq_n_f32(s35, 3.375)); - float32x4_t m44 = vaddq_f32(vaddq_f32(vmulq_n_f32(s45, 0.0625), s55), vmulq_n_f32(s65, 5.0625)); - float32x4_t m45 = vaddq_f32(vaddq_f32(vmulq_n_f32(s15, 0.03125), s25), vmulq_n_f32(s35, 7.59375)); - float32x4_t m46 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(s45, 0.015625), s55), vmulq_n_f32(s65, 11.390625)), t47); - - float32x4_t m50 = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t50, t51), t52), t53), t54), t55), t56); - float32x4_t m51 = vaddq_f32(vaddq_f32(vmulq_n_f32(s16, 0.5), s26), vmulq_n_f32(s36, 1.5)); - float32x4_t m52 = vaddq_f32(vaddq_f32(vmulq_n_f32(s46, 0.25), s56), vmulq_n_f32(s66, 2.25)); - float32x4_t m53 = vaddq_f32(vaddq_f32(vmulq_n_f32(s16, 0.125), s26), vmulq_n_f32(s36, 3.375)); - float32x4_t m54 = vaddq_f32(vaddq_f32(vmulq_n_f32(s46, 0.0625), s56), vmulq_n_f32(s66, 5.0625)); - float32x4_t m55 = vaddq_f32(vaddq_f32(vmulq_n_f32(s16, 0.03125), s26), vmulq_n_f32(s36, 7.59375)); - float32x4_t m56 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(s46, 0.015625), s56), vmulq_n_f32(s66, 11.390625)), t57); - - float32x4_t m60 = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t60, t61), t62), t63), t64), t65), t66); - float32x4_t m61 = vaddq_f32(vaddq_f32(vmulq_n_f32(s17, 0.5), s27), vmulq_n_f32(s37, 1.5)); - float32x4_t m62 = 
vaddq_f32(vaddq_f32(vmulq_n_f32(s47, 0.25), s57), vmulq_n_f32(s67, 2.25)); - float32x4_t m63 = vaddq_f32(vaddq_f32(vmulq_n_f32(s17, 0.125), s27), vmulq_n_f32(s37, 3.375)); - float32x4_t m64 = vaddq_f32(vaddq_f32(vmulq_n_f32(s47, 0.0625), s57), vmulq_n_f32(s67, 5.0625)); - float32x4_t m65 = vaddq_f32(vaddq_f32(vmulq_n_f32(s17, 0.03125), s27), vmulq_n_f32(s37, 7.59375)); - float32x4_t m66 = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(s47, 0.015625), s57), vmulq_n_f32(s67, 11.390625)), t67); - + float32x4_t src[64]; + float32x4_t t[56]; + float32x4_t m[49]; + Load64Data; float32x4_t bias_ptr = vld1q_f32(bias_data); - vst1q_f32(dst_data, vaddq_f32(m00, bias_ptr)); - vst1q_f32(dst_data + C4NUM, vaddq_f32(m01, bias_ptr)); - vst1q_f32(dst_data + 2 * C4NUM, vaddq_f32(m02, bias_ptr)); - vst1q_f32(dst_data + 3 * C4NUM, vaddq_f32(m03, bias_ptr)); - vst1q_f32(dst_data + 4 * C4NUM, vaddq_f32(m04, bias_ptr)); - vst1q_f32(dst_data + 5 * C4NUM, vaddq_f32(m05, bias_ptr)); - vst1q_f32(dst_data + 6 * C4NUM, vaddq_f32(m06, bias_ptr)); - vst1q_f32(dst_data + dst_step * C4NUM, vaddq_f32(m10, bias_ptr)); - vst1q_f32(dst_data + dst_step * C4NUM + C4NUM, vaddq_f32(m11, bias_ptr)); - vst1q_f32(dst_data + dst_step * C4NUM + 2 * C4NUM, vaddq_f32(m12, bias_ptr)); - vst1q_f32(dst_data + dst_step * C4NUM + 3 * C4NUM, vaddq_f32(m13, bias_ptr)); - vst1q_f32(dst_data + dst_step * C4NUM + 4 * C4NUM, vaddq_f32(m14, bias_ptr)); - vst1q_f32(dst_data + dst_step * C4NUM + 5 * C4NUM, vaddq_f32(m15, bias_ptr)); - vst1q_f32(dst_data + dst_step * C4NUM + 6 * C4NUM, vaddq_f32(m16, bias_ptr)); - vst1q_f32(dst_data + 2 * dst_step * C4NUM, vaddq_f32(m20, bias_ptr)); - vst1q_f32(dst_data + 2 * dst_step * C4NUM + C4NUM, vaddq_f32(m21, bias_ptr)); - vst1q_f32(dst_data + 2 * dst_step * C4NUM + 2 * C4NUM, vaddq_f32(m22, bias_ptr)); - vst1q_f32(dst_data + 2 * dst_step * C4NUM + 3 * C4NUM, vaddq_f32(m23, bias_ptr)); - vst1q_f32(dst_data + 2 * dst_step * C4NUM + 4 * C4NUM, vaddq_f32(m24, bias_ptr)); - vst1q_f32(dst_data + 
2 * dst_step * C4NUM + 5 * C4NUM, vaddq_f32(m25, bias_ptr)); - vst1q_f32(dst_data + 2 * dst_step * C4NUM + 6 * C4NUM, vaddq_f32(m26, bias_ptr)); - vst1q_f32(dst_data + 3 * dst_step * C4NUM, vaddq_f32(m30, bias_ptr)); - vst1q_f32(dst_data + 3 * dst_step * C4NUM + C4NUM, vaddq_f32(m31, bias_ptr)); - vst1q_f32(dst_data + 3 * dst_step * C4NUM + 2 * C4NUM, vaddq_f32(m32, bias_ptr)); - vst1q_f32(dst_data + 3 * dst_step * C4NUM + 3 * C4NUM, vaddq_f32(m33, bias_ptr)); - vst1q_f32(dst_data + 3 * dst_step * C4NUM + 4 * C4NUM, vaddq_f32(m34, bias_ptr)); - vst1q_f32(dst_data + 3 * dst_step * C4NUM + 5 * C4NUM, vaddq_f32(m35, bias_ptr)); - vst1q_f32(dst_data + 3 * dst_step * C4NUM + 6 * C4NUM, vaddq_f32(m36, bias_ptr)); - vst1q_f32(dst_data + 4 * dst_step * C4NUM, vaddq_f32(m40, bias_ptr)); - vst1q_f32(dst_data + 4 * dst_step * C4NUM + C4NUM, vaddq_f32(m41, bias_ptr)); - vst1q_f32(dst_data + 4 * dst_step * C4NUM + 2 * C4NUM, vaddq_f32(m42, bias_ptr)); - vst1q_f32(dst_data + 4 * dst_step * C4NUM + 3 * C4NUM, vaddq_f32(m43, bias_ptr)); - vst1q_f32(dst_data + 4 * dst_step * C4NUM + 4 * C4NUM, vaddq_f32(m44, bias_ptr)); - vst1q_f32(dst_data + 4 * dst_step * C4NUM + 5 * C4NUM, vaddq_f32(m45, bias_ptr)); - vst1q_f32(dst_data + 4 * dst_step * C4NUM + 6 * C4NUM, vaddq_f32(m46, bias_ptr)); - vst1q_f32(dst_data + 5 * dst_step * C4NUM, vaddq_f32(m50, bias_ptr)); - vst1q_f32(dst_data + 5 * dst_step * C4NUM + C4NUM, vaddq_f32(m51, bias_ptr)); - vst1q_f32(dst_data + 5 * dst_step * C4NUM + 2 * C4NUM, vaddq_f32(m52, bias_ptr)); - vst1q_f32(dst_data + 5 * dst_step * C4NUM + 3 * C4NUM, vaddq_f32(m53, bias_ptr)); - vst1q_f32(dst_data + 5 * dst_step * C4NUM + 4 * C4NUM, vaddq_f32(m54, bias_ptr)); - vst1q_f32(dst_data + 5 * dst_step * C4NUM + 5 * C4NUM, vaddq_f32(m55, bias_ptr)); - vst1q_f32(dst_data + 5 * dst_step * C4NUM + 6 * C4NUM, vaddq_f32(m56, bias_ptr)); - vst1q_f32(dst_data + 6 * dst_step * C4NUM, vaddq_f32(m60, bias_ptr)); - vst1q_f32(dst_data + 6 * dst_step * C4NUM + C4NUM, 
vaddq_f32(m61, bias_ptr)); - vst1q_f32(dst_data + 6 * dst_step * C4NUM + 2 * C4NUM, vaddq_f32(m62, bias_ptr)); - vst1q_f32(dst_data + 6 * dst_step * C4NUM + 3 * C4NUM, vaddq_f32(m63, bias_ptr)); - vst1q_f32(dst_data + 6 * dst_step * C4NUM + 4 * C4NUM, vaddq_f32(m64, bias_ptr)); - vst1q_f32(dst_data + 6 * dst_step * C4NUM + 5 * C4NUM, vaddq_f32(m65, bias_ptr)); - vst1q_f32(dst_data + 6 * dst_step * C4NUM + 6 * C4NUM, vaddq_f32(m66, bias_ptr)); + for (int l = 0; l < 8; ++l) { + int offset = l * 8; + float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); + float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); + float32x4_t tmp3 = vaddq_f32(src[5 + offset], src[6 + offset]); + float32x4_t tmp4 = vsubq_f32(src[1 + offset], src[2 + offset]); + float32x4_t tmp5 = vsubq_f32(src[3 + offset], src[4 + offset]); + float32x4_t tmp6 = vsubq_f32(src[5 + offset], src[6 + offset]); + t[l] = vaddq_f32(vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2), tmp3); + t[l + 8] = vaddq_f32(vaddq_f32(tmp4, vmulq_n_f32(tmp5, 2)), vmulq_n_f32(tmp6, 3)); + t[l + 16] = vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)), vmulq_n_f32(tmp3, 9)); + t[l + 24] = vaddq_f32(vaddq_f32(tmp4, vmulq_n_f32(tmp5, 8)), vmulq_n_f32(tmp6, 27)); + t[l + 32] = vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 16)), vmulq_n_f32(tmp3, 81)); + t[l + 40] = vaddq_f32(vaddq_f32(tmp4, vmulq_n_f32(tmp5, 32)), vmulq_n_f32(tmp6, 243)); + t[l + 48] = vaddq_f32(vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 64)), vmulq_n_f32(tmp3, 729)), src[7 + offset]); + } + for (int l = 0; l < 7; ++l) { + int offset = l * 8; + float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); + float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); + float32x4_t tmp3 = vaddq_f32(t[5 + offset], t[6 + offset]); + float32x4_t tmp4 = vsubq_f32(t[1 + offset], t[2 + offset]); + float32x4_t tmp5 = vsubq_f32(t[3 + offset], t[4 + offset]); + float32x4_t tmp6 = vsubq_f32(t[5 + offset], t[6 + offset]); + m[l] = 
vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), tmp3), bias_ptr); + m[l + 7] = vaddq_f32(vaddq_f32(vaddq_f32(tmp4, vmulq_n_f32(tmp5, 2)), vmulq_n_f32(tmp6, 3)), bias_ptr); + m[l + 14] = vaddq_f32(vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)), vmulq_n_f32(tmp3, 9)), bias_ptr); + m[l + 21] = vaddq_f32(vaddq_f32(vaddq_f32(tmp4, vmulq_n_f32(tmp5, 8)), vmulq_n_f32(tmp6, 27)), bias_ptr); + m[l + 28] = vaddq_f32(vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 16)), vmulq_n_f32(tmp3, 81)), bias_ptr); + m[l + 35] = vaddq_f32(vaddq_f32(vaddq_f32(tmp4, vmulq_n_f32(tmp5, 32)), vmulq_n_f32(tmp6, 243)), bias_ptr); + m[l + 42] = vaddq_f32( + vaddq_f32(vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 64)), vmulq_n_f32(tmp3, 729)), t[7 + offset]), bias_ptr); + } + for (int i = 0; i < 7; i++) { + int dst_k_offset = i * dst_step * C4NUM; + int m_k_offset = i * 7; + vst1q_f32(dst_data + dst_k_offset + 0 * C4NUM, m[m_k_offset]); + vst1q_f32(dst_data + dst_k_offset + 1 * C4NUM, m[m_k_offset + 1]); + vst1q_f32(dst_data + dst_k_offset + 2 * C4NUM, m[m_k_offset + 2]); + vst1q_f32(dst_data + dst_k_offset + 3 * C4NUM, m[m_k_offset + 3]); + vst1q_f32(dst_data + dst_k_offset + 4 * C4NUM, m[m_k_offset + 4]); + vst1q_f32(dst_data + dst_k_offset + 5 * C4NUM, m[m_k_offset + 5]); + vst1q_f32(dst_data + dst_k_offset + 6 * C4NUM, m[m_k_offset + 6]); + } #else - for (int i = 0; i < C4NUM; i++) { - float src_data_00 = src_data[i]; - float src_data_01 = src_data[i + src_step]; - float src_data_02 = src_data[i + 2 * src_step]; - float src_data_03 = src_data[i + 3 * src_step]; - float src_data_04 = src_data[i + 4 * src_step]; - float src_data_05 = src_data[i + 5 * src_step]; - float src_data_06 = src_data[i + 6 * src_step]; - float src_data_07 = src_data[i + 7 * src_step]; - float src_data_10 = src_data[i + 8 * src_step]; - float src_data_11 = src_data[i + 9 * src_step]; - float src_data_12 = src_data[i + 10 * src_step]; - float src_data_13 = src_data[i + 11 * src_step]; - float src_data_14 = 
src_data[i + 12 * src_step]; - float src_data_15 = src_data[i + 13 * src_step]; - float src_data_16 = src_data[i + 14 * src_step]; - float src_data_17 = src_data[i + 15 * src_step]; - float src_data_20 = src_data[i + 16 * src_step]; - float src_data_21 = src_data[i + 17 * src_step]; - float src_data_22 = src_data[i + 18 * src_step]; - float src_data_23 = src_data[i + 19 * src_step]; - float src_data_24 = src_data[i + 20 * src_step]; - float src_data_25 = src_data[i + 21 * src_step]; - float src_data_26 = src_data[i + 22 * src_step]; - float src_data_27 = src_data[i + 23 * src_step]; - float src_data_30 = src_data[i + 24 * src_step]; - float src_data_31 = src_data[i + 25 * src_step]; - float src_data_32 = src_data[i + 26 * src_step]; - float src_data_33 = src_data[i + 27 * src_step]; - float src_data_34 = src_data[i + 28 * src_step]; - float src_data_35 = src_data[i + 29 * src_step]; - float src_data_36 = src_data[i + 30 * src_step]; - float src_data_37 = src_data[i + 31 * src_step]; - float src_data_40 = src_data[i + 32 * src_step]; - float src_data_41 = src_data[i + 33 * src_step]; - float src_data_42 = src_data[i + 34 * src_step]; - float src_data_43 = src_data[i + 35 * src_step]; - float src_data_44 = src_data[i + 36 * src_step]; - float src_data_45 = src_data[i + 37 * src_step]; - float src_data_46 = src_data[i + 38 * src_step]; - float src_data_47 = src_data[i + 39 * src_step]; - float src_data_50 = src_data[i + 40 * src_step]; - float src_data_51 = src_data[i + 41 * src_step]; - float src_data_52 = src_data[i + 42 * src_step]; - float src_data_53 = src_data[i + 43 * src_step]; - float src_data_54 = src_data[i + 44 * src_step]; - float src_data_55 = src_data[i + 45 * src_step]; - float src_data_56 = src_data[i + 46 * src_step]; - float src_data_57 = src_data[i + 47 * src_step]; - float src_data_60 = src_data[i + 48 * src_step]; - float src_data_61 = src_data[i + 49 * src_step]; - float src_data_62 = src_data[i + 50 * src_step]; - float src_data_63 = src_data[i 
+ 51 * src_step]; - float src_data_64 = src_data[i + 52 * src_step]; - float src_data_65 = src_data[i + 53 * src_step]; - float src_data_66 = src_data[i + 54 * src_step]; - float src_data_67 = src_data[i + 55 * src_step]; - float src_data_70 = src_data[i + 56 * src_step]; - float src_data_71 = src_data[i + 57 * src_step]; - float src_data_72 = src_data[i + 58 * src_step]; - float src_data_73 = src_data[i + 59 * src_step]; - float src_data_74 = src_data[i + 60 * src_step]; - float src_data_75 = src_data[i + 61 * src_step]; - float src_data_76 = src_data[i + 62 * src_step]; - float src_data_77 = src_data[i + 63 * src_step]; - - float d01 = src_data_10 - src_data_20; - float d02 = src_data_11 - src_data_21; - float d03 = src_data_12 - src_data_22; - float d04 = src_data_13 - src_data_23; - float d05 = src_data_14 - src_data_24; - float d06 = src_data_15 - src_data_25; - float d07 = src_data_16 - src_data_26; - float d08 = src_data_17 - src_data_27; - - float d11 = src_data_30 - src_data_40; - float d12 = src_data_31 - src_data_41; - float d13 = src_data_32 - src_data_42; - float d14 = src_data_33 - src_data_43; - float d15 = src_data_34 - src_data_44; - float d16 = src_data_35 - src_data_45; - float d17 = src_data_36 - src_data_46; - float d18 = src_data_37 - src_data_47; - - float d21 = src_data_50 - src_data_60; - float d22 = src_data_51 - src_data_61; - float d23 = src_data_52 - src_data_62; - float d24 = src_data_53 - src_data_63; - float d25 = src_data_54 - src_data_64; - float d26 = src_data_55 - src_data_65; - float d27 = src_data_56 - src_data_66; - float d28 = src_data_57 - src_data_67; - - float d31 = src_data_10 + src_data_20; - float d32 = src_data_11 + src_data_21; - float d33 = src_data_12 + src_data_22; - float d34 = src_data_13 + src_data_23; - float d35 = src_data_14 + src_data_24; - float d36 = src_data_15 + src_data_25; - float d37 = src_data_16 + src_data_26; - float d38 = src_data_17 + src_data_27; - - float d41 = src_data_30 + src_data_40; - 
float d42 = src_data_31 + src_data_41; - float d43 = src_data_32 + src_data_42; - float d44 = src_data_33 + src_data_43; - float d45 = src_data_34 + src_data_44; - float d46 = src_data_35 + src_data_45; - float d47 = src_data_36 + src_data_46; - float d48 = src_data_37 + src_data_47; - - float d51 = src_data_50 + src_data_60; - float d52 = src_data_51 + src_data_61; - float d53 = src_data_52 + src_data_62; - float d54 = src_data_53 + src_data_63; - float d55 = src_data_54 + src_data_64; - float d56 = src_data_55 + src_data_65; - float d57 = src_data_56 + src_data_66; - float d58 = src_data_57 + src_data_67; - - float t00 = src_data_00 + src_data_10 + src_data_20 + src_data_30 + src_data_40 + src_data_50 + src_data_60; - float t01 = src_data_01 + src_data_11 + src_data_21 + src_data_31 + src_data_41 + src_data_51 + src_data_61; - float t02 = src_data_02 + src_data_12 + src_data_22 + src_data_32 + src_data_42 + src_data_52 + src_data_62; - float t03 = src_data_03 + src_data_13 + src_data_23 + src_data_33 + src_data_43 + src_data_53 + src_data_63; - float t04 = src_data_04 + src_data_14 + src_data_24 + src_data_34 + src_data_44 + src_data_54 + src_data_64; - float t05 = src_data_05 + src_data_15 + src_data_25 + src_data_35 + src_data_45 + src_data_55 + src_data_65; - float t06 = src_data_06 + src_data_16 + src_data_26 + src_data_36 + src_data_46 + src_data_56 + src_data_66; - float t07 = src_data_07 + src_data_17 + src_data_27 + src_data_37 + src_data_47 + src_data_57 + src_data_67; - - const float t10 = 0.5f * d01 + d11 + 1.5f * d21; - const float t11 = 0.5f * d02 + d12 + 1.5f * d22; - const float t12 = 0.5f * d03 + d13 + 1.5f * d23; - const float t13 = 0.5f * d04 + d14 + 1.5f * d24; - const float t14 = 0.5f * d05 + d15 + 1.5f * d25; - const float t15 = 0.5f * d06 + d16 + 1.5f * d26; - const float t16 = 0.5f * d07 + d17 + 1.5f * d27; - const float t17 = 0.5f * d08 + d18 + 1.5f * d28; - - const float t20 = 0.25f * d31 + d41 + 2.25f * d51; - const float t21 = 0.25f * 
d32 + d42 + 2.25f * d52; - const float t22 = 0.25f * d33 + d43 + 2.25f * d53; - const float t23 = 0.25f * d34 + d44 + 2.25f * d54; - const float t24 = 0.25f * d35 + d45 + 2.25f * d55; - const float t25 = 0.25f * d36 + d46 + 2.25f * d56; - const float t26 = 0.25f * d37 + d47 + 2.25f * d57; - const float t27 = 0.25f * d38 + d48 + 2.25f * d58; - - const float t30 = 0.125f * d01 + d11 + 3.375f * d21; - const float t31 = 0.125f * d02 + d12 + 3.375f * d22; - const float t32 = 0.125f * d03 + d13 + 3.375f * d23; - const float t33 = 0.125f * d04 + d14 + 3.375f * d24; - const float t34 = 0.125f * d05 + d15 + 3.375f * d25; - const float t35 = 0.125f * d06 + d16 + 3.375f * d26; - const float t36 = 0.125f * d07 + d17 + 3.375f * d27; - const float t37 = 0.125f * d08 + d18 + 3.375f * d28; - - const float t40 = 0.0625f * d31 + d41 + 5.0625f * d51; - const float t41 = 0.0625f * d32 + d42 + 5.0625f * d52; - const float t42 = 0.0625f * d33 + d43 + 5.0625f * d53; - const float t43 = 0.0625f * d34 + d44 + 5.0625f * d54; - const float t44 = 0.0625f * d35 + d45 + 5.0625f * d55; - const float t45 = 0.0625f * d36 + d46 + 5.0625f * d56; - const float t46 = 0.0625f * d37 + d47 + 5.0625f * d57; - const float t47 = 0.0625f * d38 + d48 + 5.0625f * d58; - - const float t50 = 0.03125f * d01 + d11 + 7.59375f * d21; - const float t51 = 0.03125f * d02 + d12 + 7.59375f * d22; - const float t52 = 0.03125f * d03 + d13 + 7.59375f * d23; - const float t53 = 0.03125f * d04 + d14 + 7.59375f * d24; - const float t54 = 0.03125f * d05 + d15 + 7.59375f * d25; - const float t55 = 0.03125f * d06 + d16 + 7.59375f * d26; - const float t56 = 0.03125f * d07 + d17 + 7.59375f * d27; - const float t57 = 0.03125f * d08 + d18 + 7.59375f * d28; - - const float t60 = 0.015625f * d31 + d41 + 11.390625f * d51 + src_data_70; - const float t61 = 0.015625f * d32 + d42 + 11.390625f * d52 + src_data_71; - const float t62 = 0.015625f * d33 + d43 + 11.390625f * d53 + src_data_72; - const float t63 = 0.015625f * d34 + d44 + 
11.390625f * d54 + src_data_73; - const float t64 = 0.015625f * d35 + d45 + 11.390625f * d55 + src_data_74; - const float t65 = 0.015625f * d36 + d46 + 11.390625f * d56 + src_data_75; - const float t66 = 0.015625f * d37 + d47 + 11.390625f * d57 + src_data_76; - const float t67 = 0.015625f * d38 + d48 + 11.390625f * d58 + src_data_77; - - float s11 = t01 - t02; - float s12 = t11 - t12; - float s13 = t21 - t22; - float s14 = t31 - t32; - float s15 = t41 - t42; - float s16 = t51 - t52; - float s17 = t61 - t62; - - float s21 = t03 - t04; - float s22 = t13 - t14; - float s23 = t23 - t24; - float s24 = t33 - t34; - float s25 = t43 - t44; - float s26 = t53 - t54; - float s27 = t63 - t64; - - float s31 = t05 - t06; - float s32 = t15 - t16; - float s33 = t25 - t26; - float s34 = t35 - t36; - float s35 = t45 - t46; - float s36 = t55 - t56; - float s37 = t56 - t66; - - float s41 = t01 + t02; - float s42 = t11 + t12; - float s43 = t21 + t22; - float s44 = t31 + t32; - float s45 = t41 + t42; - float s46 = t51 + t52; - float s47 = t61 + t62; - - float s51 = t03 + t04; - float s52 = t13 + t14; - float s53 = t23 + t24; - float s54 = t33 + t34; - float s55 = t43 + t44; - float s56 = t53 + t54; - float s57 = t63 + t64; - - float s61 = t05 + t06; - float s62 = t15 + t16; - float s63 = t25 + t26; - float s64 = t35 + t36; - float s65 = t45 + t46; - float s66 = t55 + t56; - float s67 = t65 + t66; - - float m00 = t00 + t01 + t02 + t03 + t04 + t05 + t06; - const float m01 = 0.5f * s11 + s21 + 1.5f * s31; - const float m02 = 0.25f * s41 + s51 + 2.25f * s61; - const float m03 = 0.125f * s11 + s21 + 3.375f * s31; - const float m04 = 0.0625f * s41 + s51 + 5.0625f * s61; - const float m05 = 0.03125f * s11 + s21 + 7.59375f * s31; - const float m06 = 0.015625f * s41 + s51 + 11.390625f * s61 + t07; - - float m10 = t10 + t11 + t12 + t13 + t14 + t15 + t16; - const float m11 = 0.5f * s12 + s22 + 1.5f * s32; - const float m12 = 0.25f * s42 + s52 + 2.25f * s62; - const float m13 = 0.125f * s12 + s22 + 
3.375f * s32; - const float m14 = 0.0625f * s42 + s52 + 5.0625f * s62; - const float m15 = 0.03125f * s12 + s22 + 7.59375f * s32; - const float m16 = 0.015625f * s42 + s52 + 11.390625f * s62 + t17; - - float m20 = t20 + t21 + t22 + t23 + t24 + t25 + t26; - const float m21 = 0.5f * s13 + s23 + 1.5f * s33; - const float m22 = 0.25f * s43 + s53 + 2.25f * s63; - const float m23 = 0.125f * s13 + s23 + 3.375f * s33; - const float m24 = 0.0625f * s43 + s53 + 5.0625f * s63; - const float m25 = 0.03125f * s13 + s23 + 7.59375f * s33; - const float m26 = 0.015625f * s43 + s53 + 11.390625f * s63 + t27; - - float m30 = t30 + t31 + t32 + t33 + t34 + t35 + t36; - const float m31 = 0.5f * s14 + s24 + 1.5f * s34; - const float m32 = 0.25f * s44 + s54 + 2.25f * s64; - const float m33 = 0.125f * s14 + s24 + 3.375f * s34; - const float m34 = 0.0625f * s44 + s54 + 5.0625f * s64; - const float m35 = 0.03125f * s14 + s24 + 7.59375f * s34; - const float m36 = 0.015625f * s44 + s54 + 11.390625f * s64 + t37; - - float m40 = t40 + t41 + t42 + t43 + t44 + t45 + t46; - const float m41 = 0.5f * s15 + s25 + 1.5f * s35; - const float m42 = 0.25f * s45 + s55 + 2.25f * s65; - const float m43 = 0.125f * s15 + s25 + 3.375f * s35; - const float m44 = 0.0625f * s45 + s55 + 5.0625f * s65; - const float m45 = 0.03125f * s15 + s25 + 7.59375f * s35; - const float m46 = 0.015625f * s45 + s55 + 11.390625f * s65 + t47; - - float m50 = t50 + t51 + t52 + t53 + t54 + t55 + t56; - const float m51 = 0.5f * s16 + s26 + 1.5f * s36; - const float m52 = 0.25f * s46 + s56 + 2.25f * s66; - const float m53 = 0.125f * s16 + s26 + 3.375f * s36; - const float m54 = 0.0625f * s46 + s56 + 5.0625f * s66; - const float m55 = 0.03125f * s16 + s26 + 7.59375f * s36; - const float m56 = 0.015625f * s46 + s56 + 11.390625f * s66 + t57; - - float m60 = t60 + t61 + t62 + t63 + t64 + t65 + t66; - const float m61 = 0.5f * s17 + s27 + 1.5f * s37; - const float m62 = 0.25f * s47 + s57 + 2.25f * s67; - const float m63 = 0.125f * s17 + s27 + 
3.375f * s37; - const float m64 = 0.0625f * s47 + s57 + 5.0625f * s67; - const float m65 = 0.03125f * s17 + s27 + 7.59375f * s37; - const float m66 = 0.015625f * s47 + s57 + 11.390625f * s67 + t67; - - (dst_data + i)[0] = m00 + bias_data[i]; - (dst_data + i + C4NUM)[0] = m01 + bias_data[i]; - (dst_data + i + 2 * C4NUM)[0] = m02 + bias_data[i]; - (dst_data + i + 3 * C4NUM)[0] = m03 + bias_data[i]; - (dst_data + i + 4 * C4NUM)[0] = m04 + bias_data[i]; - (dst_data + i + 5 * C4NUM)[0] = m05 + bias_data[i]; - (dst_data + i + 6 * C4NUM)[0] = m06 + bias_data[i]; - - (dst_data + i + dst_step * C4NUM)[0] = m10 + bias_data[i]; - (dst_data + i + dst_step * C4NUM + C4NUM)[0] = m11 + bias_data[i]; - (dst_data + i + dst_step * C4NUM + 2 * C4NUM)[0] = m12 + bias_data[i]; - (dst_data + i + dst_step * C4NUM + 3 * C4NUM)[0] = m13 + bias_data[i]; - (dst_data + i + dst_step * C4NUM + 4 * C4NUM)[0] = m14 + bias_data[i]; - (dst_data + i + dst_step * C4NUM + 5 * C4NUM)[0] = m15 + bias_data[i]; - (dst_data + i + dst_step * C4NUM + 6 * C4NUM)[0] = m16 + bias_data[i]; - - (dst_data + i + 2 * dst_step * C4NUM)[0] = m20 + bias_data[i]; - (dst_data + i + 2 * dst_step * C4NUM + C4NUM)[0] = m21 + bias_data[i]; - (dst_data + i + 2 * dst_step * C4NUM + 2 * C4NUM)[0] = m22 + bias_data[i]; - (dst_data + i + 2 * dst_step * C4NUM + 3 * C4NUM)[0] = m23 + bias_data[i]; - (dst_data + i + 2 * dst_step * C4NUM + 4 * C4NUM)[0] = m24 + bias_data[i]; - (dst_data + i + 2 * dst_step * C4NUM + 5 * C4NUM)[0] = m25 + bias_data[i]; - (dst_data + i + 2 * dst_step * C4NUM + 6 * C4NUM)[0] = m26 + bias_data[i]; - - (dst_data + i + 3 * dst_step * C4NUM)[0] = m30 + bias_data[i]; - (dst_data + i + 3 * dst_step * C4NUM + C4NUM)[0] = m31 + bias_data[i]; - (dst_data + i + 3 * dst_step * C4NUM + 2 * C4NUM)[0] = m32 + bias_data[i]; - (dst_data + i + 3 * dst_step * C4NUM + 3 * C4NUM)[0] = m33 + bias_data[i]; - (dst_data + i + 3 * dst_step * C4NUM + 4 * C4NUM)[0] = m34 + bias_data[i]; - (dst_data + i + 3 * dst_step * C4NUM + 5 * 
C4NUM)[0] = m35 + bias_data[i]; - (dst_data + i + 3 * dst_step * C4NUM + 6 * C4NUM)[0] = m36 + bias_data[i]; - - (dst_data + i + 4 * dst_step * C4NUM)[0] = m40 + bias_data[i]; - (dst_data + i + 4 * dst_step * C4NUM + C4NUM)[0] = m41 + bias_data[i]; - (dst_data + i + 4 * dst_step * C4NUM + 2 * C4NUM)[0] = m42 + bias_data[i]; - (dst_data + i + 4 * dst_step * C4NUM + 3 * C4NUM)[0] = m43 + bias_data[i]; - (dst_data + i + 4 * dst_step * C4NUM + 4 * C4NUM)[0] = m44 + bias_data[i]; - (dst_data + i + 4 * dst_step * C4NUM + 5 * C4NUM)[0] = m45 + bias_data[i]; - (dst_data + i + 4 * dst_step * C4NUM + 6 * C4NUM)[0] = m46 + bias_data[i]; - - (dst_data + i + 5 * dst_step * C4NUM)[0] = m50 + bias_data[i]; - (dst_data + i + 5 * dst_step * C4NUM + C4NUM)[0] = m51 + bias_data[i]; - (dst_data + i + 5 * dst_step * C4NUM + 2 * C4NUM)[0] = m52 + bias_data[i]; - (dst_data + i + 5 * dst_step * C4NUM + 3 * C4NUM)[0] = m53 + bias_data[i]; - (dst_data + i + 5 * dst_step * C4NUM + 4 * C4NUM)[0] = m54 + bias_data[i]; - (dst_data + i + 5 * dst_step * C4NUM + 5 * C4NUM)[0] = m55 + bias_data[i]; - (dst_data + i + 5 * dst_step * C4NUM + 6 * C4NUM)[0] = m56 + bias_data[i]; - - (dst_data + i + 6 * dst_step * C4NUM)[0] = m60 + bias_data[i]; - (dst_data + i + 6 * dst_step * C4NUM + C4NUM)[0] = m61 + bias_data[i]; - (dst_data + i + 6 * dst_step * C4NUM + 2 * C4NUM)[0] = m62 + bias_data[i]; - (dst_data + i + 6 * dst_step * C4NUM + 3 * C4NUM)[0] = m63 + bias_data[i]; - (dst_data + i + 6 * dst_step * C4NUM + 4 * C4NUM)[0] = m64 + bias_data[i]; - (dst_data + i + 6 * dst_step * C4NUM + 5 * C4NUM)[0] = m65 + bias_data[i]; - (dst_data + i + 6 * dst_step * C4NUM + 6 * C4NUM)[0] = m66 + bias_data[i]; + float src[64]; + float t[56]; + float m[49]; + for (int i = 0; i < C4NUM; ++i) { + // load source data + for (int j = 0; j < 64; ++j) { + src[j] = src_data[i + j * src_step]; + } + for (int l = 0; l < 8; ++l) { + int offset = l * 8; + t[l] = src[offset] + src[1 + offset] + src[2 + offset] + src[3 + offset] + 
src[4 + offset] + src[5 + offset] + + src[6 + offset]; + t[l + 8] = src[1 + offset] - src[2 + offset] + 2 * (src[3 + offset] - src[4 + offset]) + + 3 * (src[5 + offset] - src[6 + offset]); + t[l + 16] = src[1 + offset] + src[2 + offset] + 4 * (src[3 + offset] + src[4 + offset]) + + 9 * (src[5 + offset] + src[6 + offset]); + t[l + 24] = src[1 + offset] - src[2 + offset] + 8 * (src[3 + offset] - src[4 + offset]) + + 27 * (src[5 + offset] - src[6 + offset]); + t[l + 32] = src[1 + offset] + src[2 + offset] + 16 * (src[3 + offset] + src[4 + offset]) + + 81 * (src[5 + offset] + src[6 + offset]); + t[l + 40] = src[1 + offset] - src[2 + offset] + 32 * (src[3 + offset] - src[4 + offset]) + + 243 * (src[5 + offset] - src[6 + offset]); + t[l + 48] = src[1 + offset] + src[2 + offset] + 64 * (src[3 + offset] + src[4 + offset]) + + 729 * (src[5 + offset] + src[6 + offset]) + src[7 + offset]; + } + for (int l = 0; l < 7; ++l) { + int offset = l * 8; + m[l] = t[offset] + t[1 + offset] + t[2 + offset] + t[3 + offset] + t[4 + offset] + t[5 + offset] + t[6 + offset]; + m[l + 7] = + t[1 + offset] - t[2 + offset] + 2 * (t[3 + offset] - t[4 + offset]) + 3 * (t[5 + offset] - t[6 + offset]); + m[l + 14] = + t[1 + offset] + t[2 + offset] + 4 * (t[3 + offset] + t[4 + offset]) + 9 * (t[5 + offset] + t[6 + offset]); + m[l + 21] = + t[1 + offset] - t[2 + offset] + 8 * (t[3 + offset] - t[4 + offset]) + 27 * (t[5 + offset] - t[6 + offset]); + m[l + 28] = + t[1 + offset] + t[2 + offset] + 16 * (t[3 + offset] + t[4 + offset]) + 81 * (t[5 + offset] + t[6 + offset]); + m[l + 35] = + t[1 + offset] - t[2 + offset] + 32 * (t[3 + offset] - t[4 + offset]) + 243 * (t[5 + offset] - t[6 + offset]); + m[l + 42] = t[1 + offset] + t[2 + offset] + 64 * (t[3 + offset] + t[4 + offset]) + + 729 * (t[5 + offset] + t[6 + offset]) + t[7 + offset]; + } + // store output + for (int k = 0; k < 7; ++k) { + int dst_k_offset = k * dst_step * C4NUM; + int m_k_offset = k * 7; + for (int j = 0; j < 7; ++j) { + dst_data[i + 
dst_k_offset + j * C4NUM] = m[j + m_k_offset] + bias_data[i]; + } + } } #endif } @@ -4657,7 +1246,7 @@ int SelectOutputUnit(ConvParameter *conv_param) { int out_c = conv_param->output_channel_; int unit2 = UP_DIV(out_w * out_h, C12NUM * conv_param->op_parameter_.thread_num_); int max_out_unit = (int)(sqrtf((float)unit2)); - max_out_unit = max_out_unit < MAX_UNIT ? MAX_UNIT : max_out_unit; + max_out_unit = max_out_unit < MAX_UNIT ? max_out_unit : MAX_UNIT; max_out_unit = max_out_unit > MIN_UNIT ? max_out_unit : MIN_UNIT; int unit = 0; @@ -4666,8 +1255,7 @@ int SelectOutputUnit(ConvParameter *conv_param) { for (int i = MIN_UNIT; i <= max_out_unit; ++i) { int input_unit = i + kernel_w - 1; - OutputTransformUnitFunc output_trans_func = GetOutputTransFunc(input_unit, i); - if (output_trans_func == NULL) { + if (!GetOutputTransFunc(input_unit, i)) { continue; } float penalty = ((float)input_unit * input_unit) / ((float)kernel_h * kernel_w) * 0.12f; @@ -4686,32 +1274,7 @@ int SelectOutputUnit(ConvParameter *conv_param) { return unit; } -InputTransformUnitFunc GetInputTransFunc(int input_unit) { - if (input_unit == 4) { - return InputTransform4x4Unit; - } else if (input_unit == 8) { - return InputTransform8x8Unit; - } else { - printf("Only support 4 or 8 for input unit."); - return NULL; - } -} - -OutputTransformUnitFunc GetOutputTransFunc(int input_unit, int output_unit) { - if (input_unit == 4 && output_unit == 2) { - return OutputTransform4x2Unit; - } else if (input_unit == 4 && output_unit == 3) { - return OutputTransform4x3Unit; - } else if (input_unit == 8) { - return outputTransformUnit[output_unit]; - } else { - printf("."); - return NULL; - } -} - -void CheckIfUseWinograd(bool *use_winograd, int *output_unit, ConvParameter *conv_param, - InputTransformUnitFunc input_trans_func, OutputTransformUnitFunc output_trans_func) { +void CheckIfUseWinograd(bool *use_winograd, int *output_unit, ConvParameter *conv_param) { if (conv_param->kernel_w_ == conv_param->kernel_h_ 
&& conv_param->dilation_h_ == 1 && conv_param->dilation_w_ == 1 && conv_param->stride_h_ == 1 && conv_param->stride_w_ == 1) { *output_unit = SelectOutputUnit(conv_param); diff --git a/mindspore/lite/nnacl/winograd_utils.h b/mindspore/lite/nnacl/winograd_utils.h index e7afd28328a..4fb06563f20 100644 --- a/mindspore/lite/nnacl/winograd_utils.h +++ b/mindspore/lite/nnacl/winograd_utils.h @@ -20,45 +20,237 @@ #ifdef ENABLE_ARM #include #endif -#include "nnacl/matrix_table.h" #include "nnacl/conv_parameter.h" #include "nnacl/op_base.h" -typedef void (*InputTransformUnitFunc)(const float *src_data, float *dst_data, int src_step, int dst_step); -typedef void (*OutputTransformUnitFunc)(const float *src_data, float *dst_data, const float *bias_data, int src_step, - int dst_step); +#define MAX_LEN 256 #ifdef __cplusplus extern "C" { #endif +typedef void (*InputTransFunc)(const float *src_data, float *dst_data, int src_step, int dst_step); + +typedef void (*OutputTransFunc)(const float *src_data, float *dst_data, const float *bias_data, int src_step, + int dst_step); + +void GeneralInputTransformUnit(const float *src_data, float *dst_data, float *matrix_b, float *matrix_bt, int src_step, + int dst_step, int in_unit); + +void GeneralOutputTransformUnit(const float *src_data, float *dst_data, const float *bias_data, float *matrix_a, + float *matrix_at, int src_step, int dst_step, int in_unit, int out_unit); + +#define Load16Data \ + src[0] = vld1q_f32(src_data + 0 * src_step); \ + src[1] = vld1q_f32(src_data + 1 * src_step); \ + src[2] = vld1q_f32(src_data + 2 * src_step); \ + src[3] = vld1q_f32(src_data + 3 * src_step); \ + src[4] = vld1q_f32(src_data + 4 * src_step); \ + src[5] = vld1q_f32(src_data + 5 * src_step); \ + src[6] = vld1q_f32(src_data + 6 * src_step); \ + src[7] = vld1q_f32(src_data + 7 * src_step); \ + src[8] = vld1q_f32(src_data + 8 * src_step); \ + src[9] = vld1q_f32(src_data + 9 * src_step); \ + src[10] = vld1q_f32(src_data + 10 * src_step); \ + src[11] = 
vld1q_f32(src_data + 11 * src_step); \ + src[12] = vld1q_f32(src_data + 12 * src_step); \ + src[13] = vld1q_f32(src_data + 13 * src_step); \ + src[14] = vld1q_f32(src_data + 14 * src_step); \ + src[15] = vld1q_f32(src_data + 15 * src_step); + +#define Load36Data \ + src[0] = vld1q_f32(src_data + 0 * src_step); \ + src[1] = vld1q_f32(src_data + 1 * src_step); \ + src[2] = vld1q_f32(src_data + 2 * src_step); \ + src[3] = vld1q_f32(src_data + 3 * src_step); \ + src[4] = vld1q_f32(src_data + 4 * src_step); \ + src[5] = vld1q_f32(src_data + 5 * src_step); \ + src[6] = vld1q_f32(src_data + 6 * src_step); \ + src[7] = vld1q_f32(src_data + 7 * src_step); \ + src[8] = vld1q_f32(src_data + 8 * src_step); \ + src[9] = vld1q_f32(src_data + 9 * src_step); \ + src[10] = vld1q_f32(src_data + 10 * src_step); \ + src[11] = vld1q_f32(src_data + 11 * src_step); \ + src[12] = vld1q_f32(src_data + 12 * src_step); \ + src[13] = vld1q_f32(src_data + 13 * src_step); \ + src[14] = vld1q_f32(src_data + 14 * src_step); \ + src[15] = vld1q_f32(src_data + 15 * src_step); \ + src[16] = vld1q_f32(src_data + 16 * src_step); \ + src[17] = vld1q_f32(src_data + 17 * src_step); \ + src[18] = vld1q_f32(src_data + 18 * src_step); \ + src[19] = vld1q_f32(src_data + 19 * src_step); \ + src[20] = vld1q_f32(src_data + 20 * src_step); \ + src[21] = vld1q_f32(src_data + 21 * src_step); \ + src[22] = vld1q_f32(src_data + 22 * src_step); \ + src[23] = vld1q_f32(src_data + 23 * src_step); \ + src[24] = vld1q_f32(src_data + 24 * src_step); \ + src[25] = vld1q_f32(src_data + 25 * src_step); \ + src[26] = vld1q_f32(src_data + 26 * src_step); \ + src[27] = vld1q_f32(src_data + 27 * src_step); \ + src[28] = vld1q_f32(src_data + 28 * src_step); \ + src[29] = vld1q_f32(src_data + 29 * src_step); \ + src[30] = vld1q_f32(src_data + 30 * src_step); \ + src[31] = vld1q_f32(src_data + 31 * src_step); \ + src[32] = vld1q_f32(src_data + 32 * src_step); \ + src[33] = vld1q_f32(src_data + 33 * src_step); \ + src[34] = 
vld1q_f32(src_data + 34 * src_step); \ + src[35] = vld1q_f32(src_data + 35 * src_step); + +#define Load64Data \ + src[0] = vld1q_f32(src_data + 0 * src_step); \ + src[1] = vld1q_f32(src_data + 1 * src_step); \ + src[2] = vld1q_f32(src_data + 2 * src_step); \ + src[3] = vld1q_f32(src_data + 3 * src_step); \ + src[4] = vld1q_f32(src_data + 4 * src_step); \ + src[5] = vld1q_f32(src_data + 5 * src_step); \ + src[6] = vld1q_f32(src_data + 6 * src_step); \ + src[7] = vld1q_f32(src_data + 7 * src_step); \ + src[8] = vld1q_f32(src_data + 8 * src_step); \ + src[9] = vld1q_f32(src_data + 9 * src_step); \ + src[10] = vld1q_f32(src_data + 10 * src_step); \ + src[11] = vld1q_f32(src_data + 11 * src_step); \ + src[12] = vld1q_f32(src_data + 12 * src_step); \ + src[13] = vld1q_f32(src_data + 13 * src_step); \ + src[14] = vld1q_f32(src_data + 14 * src_step); \ + src[15] = vld1q_f32(src_data + 15 * src_step); \ + src[16] = vld1q_f32(src_data + 16 * src_step); \ + src[17] = vld1q_f32(src_data + 17 * src_step); \ + src[18] = vld1q_f32(src_data + 18 * src_step); \ + src[19] = vld1q_f32(src_data + 19 * src_step); \ + src[20] = vld1q_f32(src_data + 20 * src_step); \ + src[21] = vld1q_f32(src_data + 21 * src_step); \ + src[22] = vld1q_f32(src_data + 22 * src_step); \ + src[23] = vld1q_f32(src_data + 23 * src_step); \ + src[24] = vld1q_f32(src_data + 24 * src_step); \ + src[25] = vld1q_f32(src_data + 25 * src_step); \ + src[26] = vld1q_f32(src_data + 26 * src_step); \ + src[27] = vld1q_f32(src_data + 27 * src_step); \ + src[28] = vld1q_f32(src_data + 28 * src_step); \ + src[29] = vld1q_f32(src_data + 29 * src_step); \ + src[30] = vld1q_f32(src_data + 30 * src_step); \ + src[31] = vld1q_f32(src_data + 31 * src_step); \ + src[32] = vld1q_f32(src_data + 32 * src_step); \ + src[33] = vld1q_f32(src_data + 33 * src_step); \ + src[34] = vld1q_f32(src_data + 34 * src_step); \ + src[35] = vld1q_f32(src_data + 35 * src_step); \ + src[36] = vld1q_f32(src_data + 36 * src_step); \ + src[37] = 
vld1q_f32(src_data + 37 * src_step); \ + src[38] = vld1q_f32(src_data + 38 * src_step); \ + src[39] = vld1q_f32(src_data + 39 * src_step); \ + src[40] = vld1q_f32(src_data + 40 * src_step); \ + src[41] = vld1q_f32(src_data + 41 * src_step); \ + src[42] = vld1q_f32(src_data + 42 * src_step); \ + src[43] = vld1q_f32(src_data + 43 * src_step); \ + src[44] = vld1q_f32(src_data + 44 * src_step); \ + src[45] = vld1q_f32(src_data + 45 * src_step); \ + src[46] = vld1q_f32(src_data + 46 * src_step); \ + src[47] = vld1q_f32(src_data + 47 * src_step); \ + src[48] = vld1q_f32(src_data + 48 * src_step); \ + src[49] = vld1q_f32(src_data + 49 * src_step); \ + src[50] = vld1q_f32(src_data + 50 * src_step); \ + src[51] = vld1q_f32(src_data + 51 * src_step); \ + src[52] = vld1q_f32(src_data + 52 * src_step); \ + src[53] = vld1q_f32(src_data + 53 * src_step); \ + src[54] = vld1q_f32(src_data + 54 * src_step); \ + src[55] = vld1q_f32(src_data + 55 * src_step); \ + src[56] = vld1q_f32(src_data + 56 * src_step); \ + src[57] = vld1q_f32(src_data + 57 * src_step); \ + src[58] = vld1q_f32(src_data + 58 * src_step); \ + src[59] = vld1q_f32(src_data + 59 * src_step); \ + src[60] = vld1q_f32(src_data + 60 * src_step); \ + src[61] = vld1q_f32(src_data + 61 * src_step); \ + src[62] = vld1q_f32(src_data + 62 * src_step); \ + src[63] = vld1q_f32(src_data + 63 * src_step); + +InputTransFunc GetInputTransFunc(int input_unit); + void InputTransform4x4Unit(const float *src_data, float *dst_data, int src_step, int dst_step); +void InputTransform6x6Unit(const float *src_data, float *dst_data, int src_step, int dst_step); + void InputTransform8x8Unit(const float *src_data, float *dst_data, int src_step, int dst_step); -void OutputTransform4x2Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step); +OutputTransFunc GetOutputTransFunc(int input_unit, int output_unit); +#define Store4Data \ + vst1q_f32(dst_data, m[0]); \ + vst1q_f32(dst_data + C4NUM, m[1]); \ + 
vst1q_f32(dst_data + dst_step * C4NUM, m[2]); \ + vst1q_f32(dst_data + dst_step * C4NUM + C4NUM, m[3]); + +#define Store9Data \ + vst1q_f32(dst_data, m[0]); \ + vst1q_f32(dst_data + C4NUM, m[1]); \ + vst1q_f32(dst_data + 2 * C4NUM, m[2]); \ + vst1q_f32(dst_data + dst_step * C4NUM, m[3]); \ + vst1q_f32(dst_data + dst_step * C4NUM + C4NUM, m[4]); \ + vst1q_f32(dst_data + dst_step * C4NUM + 2 * C4NUM, m[5]); \ + vst1q_f32(dst_data + 2 * dst_step * C4NUM, m[6]); \ + vst1q_f32(dst_data + 2 * dst_step * C4NUM + C4NUM, m[7]); \ + vst1q_f32(dst_data + 2 * dst_step * C4NUM + 2 * C4NUM, m[8]); + +#define Store16Data \ + vst1q_f32(dst_data, m[0]); \ + vst1q_f32(dst_data + C4NUM, m[1]); \ + vst1q_f32(dst_data + 2 * C4NUM, m[2]); \ + vst1q_f32(dst_data + 3 * C4NUM, m[3]); \ + vst1q_f32(dst_data + dst_step * C4NUM, m[4]); \ + vst1q_f32(dst_data + dst_step * C4NUM + C4NUM, m[5]); \ + vst1q_f32(dst_data + dst_step * C4NUM + 2 * C4NUM, m[6]); \ + vst1q_f32(dst_data + dst_step * C4NUM + 3 * C4NUM, m[7]); \ + vst1q_f32(dst_data + 2 * dst_step * C4NUM, m[8]); \ + vst1q_f32(dst_data + 2 * dst_step * C4NUM + C4NUM, m[9]); \ + vst1q_f32(dst_data + 2 * dst_step * C4NUM + 2 * C4NUM, m[10]); \ + vst1q_f32(dst_data + 2 * dst_step * C4NUM + 3 * C4NUM, m[11]); \ + vst1q_f32(dst_data + 3 * dst_step * C4NUM, m[12]); \ + vst1q_f32(dst_data + 3 * dst_step * C4NUM + C4NUM, m[13]); \ + vst1q_f32(dst_data + 3 * dst_step * C4NUM + 2 * C4NUM, m[14]); \ + vst1q_f32(dst_data + 3 * dst_step * C4NUM + 3 * C4NUM, m[15]); + +#define Store25Data \ + vst1q_f32(dst_data, m[0]); \ + vst1q_f32(dst_data + C4NUM, m[1]); \ + vst1q_f32(dst_data + 2 * C4NUM, m[2]); \ + vst1q_f32(dst_data + 3 * C4NUM, m[3]); \ + vst1q_f32(dst_data + 4 * C4NUM, m[4]); \ + vst1q_f32(dst_data + dst_step * C4NUM, m[5]); \ + vst1q_f32(dst_data + dst_step * C4NUM + C4NUM, m[6]); \ + vst1q_f32(dst_data + dst_step * C4NUM + 2 * C4NUM, m[7]); \ + vst1q_f32(dst_data + dst_step * C4NUM + 3 * C4NUM, m[8]); \ + vst1q_f32(dst_data + dst_step * C4NUM 
+ 4 * C4NUM, m[9]); \ + vst1q_f32(dst_data + 2 * dst_step * C4NUM, m[10]); \ + vst1q_f32(dst_data + 2 * dst_step * C4NUM + C4NUM, m[11]); \ + vst1q_f32(dst_data + 2 * dst_step * C4NUM + 2 * C4NUM, m[12]); \ + vst1q_f32(dst_data + 2 * dst_step * C4NUM + 3 * C4NUM, m[13]); \ + vst1q_f32(dst_data + 2 * dst_step * C4NUM + 4 * C4NUM, m[14]); \ + vst1q_f32(dst_data + 3 * dst_step * C4NUM, m[15]); \ + vst1q_f32(dst_data + 3 * dst_step * C4NUM + C4NUM, m[16]); \ + vst1q_f32(dst_data + 3 * dst_step * C4NUM + 2 * C4NUM, m[17]); \ + vst1q_f32(dst_data + 3 * dst_step * C4NUM + 3 * C4NUM, m[18]); \ + vst1q_f32(dst_data + 3 * dst_step * C4NUM + 4 * C4NUM, m[19]); \ + vst1q_f32(dst_data + 4 * dst_step * C4NUM, m[20]); \ + vst1q_f32(dst_data + 4 * dst_step * C4NUM + C4NUM, m[21]); \ + vst1q_f32(dst_data + 4 * dst_step * C4NUM + 2 * C4NUM, m[22]); \ + vst1q_f32(dst_data + 4 * dst_step * C4NUM + 3 * C4NUM, m[23]); \ + vst1q_f32(dst_data + 4 * dst_step * C4NUM + 4 * C4NUM, m[24]); + +void OutputTransform4x2Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step); void OutputTransform4x3Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step); +void OutputTransform6x2Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step); +void OutputTransform6x3Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step); +void OutputTransform6x4Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step); +void OutputTransform6x5Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step); + void OutputTransform8x2Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step); - void OutputTransform8x3Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step); - void OutputTransform8x4Unit(const float 
*src_data, float *dst_data, const float *bias_data, int src_step, int dst_step); - void OutputTransform8x5Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step); - void OutputTransform8x6Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step); - void OutputTransform8x7Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step); int SelectOutputUnit(ConvParameter *conv_param); -InputTransformUnitFunc GetInputTransFunc(int input_unit); - -OutputTransformUnitFunc GetOutputTransFunc(int input_unit, int output_unit); - -void CheckIfUseWinograd(bool *use_winograd, int *output_unit, ConvParameter *conv_param, - InputTransformUnitFunc input_trans_func, OutputTransformUnitFunc output_trans_func); +void CheckIfUseWinograd(bool *use_winograd, int *output_unit, ConvParameter *conv_param); #ifdef __cplusplus } #endif diff --git a/mindspore/lite/src/runtime/kernel/arm/base/matrix.cc b/mindspore/lite/src/runtime/kernel/arm/base/matrix.cc deleted file mode 100644 index aed7e0ebb8b..00000000000 --- a/mindspore/lite/src/runtime/kernel/arm/base/matrix.cc +++ /dev/null @@ -1,86 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "src/runtime/kernel/arm/base/matrix.h" -#include "utils/log_adapter.h" - -namespace mindspore::kernel { -Matrix *TransformMatrixGenerator(int m, int k) { - auto matrix = new (std::nothrow) Matrix; - if (matrix == nullptr) { - MS_LOG(ERROR) << "matrix is nullptr."; - return nullptr; - } - auto data = malloc(m * k * sizeof(float)); - if (data == nullptr) { - MS_LOG(ERROR) << "Malloc matrix data failed."; - return nullptr; - } - matrix->SetData(data); - matrix->SetNum(m, k); - return matrix; -} - -void ChooseMatrixG(Matrix *matrix_g, Matrix *matrix_gt) { - int m = matrix_g->GetM(); - int k = matrix_g->GetK(); - auto matrix_g_data = reinterpret_cast(matrix_g->GetData()); - auto matrix_gt_data = reinterpret_cast(matrix_gt->GetData()); - // m represents input unit, only 4 or 8 can be accepted for input unit. - // k represents kernel unit, varies from 2 to 7. - if (m == 4 && k == 2) { - MatrixG4x2(matrix_g_data); - MatrixGT2x4(matrix_gt_data); - } else if (m == 8 && k == 2) { - MatrixG8x2(matrix_g_data); - MatrixGT2x8(matrix_gt_data); - } else if (m == 8 && k == 3) { - MatrixG8x3(matrix_g_data); - MatrixGT3x8(matrix_gt_data); - } else if (m == 8 && k == 4) { - MatrixG8x4(matrix_g_data); - MatrixGT4x8(matrix_gt_data); - } else if (m == 8 && k == 5) { - MatrixG8x5(matrix_g_data); - MatrixGT5x8(matrix_gt_data); - } else if (m == 8 && k == 6) { - MatrixG8x6(matrix_g_data); - MatrixGT6x8(matrix_gt_data); - } else if (m == 8 && k == 7) { - MatrixG8x7(matrix_g_data); - MatrixGT7x8(matrix_gt_data); - } else { - MS_LOG(ERROR) << "Unsupported input unit or kernel unit."; - return; - } -} - -void MatrixMultiply(const float *matrix_a, const float *matrix_b, float *matrix_c, int m, int k, int n, bool row) { - // row-major implementation - int count = 0; - for (int h = 0; h < m; h++) { - int h_offset = h * k; - for (int w = 0; w < n; w++) { - float res = 0; - for (int i = 0; i < k; i++) { - res += *(matrix_a + h_offset + i) * *(matrix_b + w + i * n); - } - *(matrix_c + 
count) = res; - count++; - } - } -} -} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/base/matrix.h b/mindspore/lite/src/runtime/kernel/arm/base/matrix.h deleted file mode 100644 index 4dbdd59dd59..00000000000 --- a/mindspore/lite/src/runtime/kernel/arm/base/matrix.h +++ /dev/null @@ -1,77 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_MATRIX_H_ -#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_MATRIX_H_ - -#include -#include -#include "nnacl/winograd_utils.h" - -namespace mindspore::kernel { -class Matrix { - public: - Matrix() = default; - ~Matrix() { - if (data_ != nullptr) { - free(data_); - } - } - - void SetData(void *data) { this->data_ = data; } - - void *GetData() { return this->data_; } - - void SetNDim(int dim) { this->n_dim_ = dim; } - - int GetNDim() { return this->n_dim_; } - - void SetShape(std::vector shape) { this->shape_ = shape; } - - std::vector GetShape() { return this->shape_; } - - void SetStride(std::vector stride) { this->stride_ = stride; } - - std::vector GetStride() { return this->stride_; } - - void SetNum(int m, int k) { - this->m_ = m; - this->k_ = k; - } - - int GetM() { return this->m_; } - - int GetK() { return this->k_; } - - protected: - void *data_ = nullptr; - std::vector shape_; - std::vector stride_; - int m_; - int k_; - int n_dim_; - bool row_major_; -}; - 
-Matrix *TransformMatrixGenerator(int m, int k); - -// Chinese Remainder Theorem interp: 0.5 -void ChooseMatrixG(Matrix *matrix_g, Matrix *matrix_gt); - -void MatrixMultiply(const float *matrix_a, const float *matrix_b, float *matrix_c, int m, int k, int n, bool row); -} // namespace mindspore::kernel - -#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_MATRIX_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc index c3bcff2847d..5870b75933d 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc @@ -238,9 +238,7 @@ kernel::LiteKernel *CpuConvFp16KernelCreator(const std::vector & } else { bool use_winograd = false; int out_unit; - InputTransformUnitFunc input_trans_func = nullptr; - OutputTransformUnitFunc output_trans_func = nullptr; - CheckIfUseWinograd(&use_winograd, &out_unit, conv_param, input_trans_func, output_trans_func); + CheckIfUseWinograd(&use_winograd, &out_unit, conv_param); if (use_winograd) { kernel = new (std::nothrow) kernel::ConvolutionWinogradFP16CPUKernel(opParameter, inputs, outputs, ctx, primitive, out_unit); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc index a61063508c4..b15296b9c75 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc @@ -15,7 +15,7 @@ */ #include "src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h" -#include "src/runtime/kernel/arm/fp16/matrix_fp16.h" +#include "nnacl/fp16/matrix_fp16.h" #include "nnacl/fp16/conv_fp16.h" #include "nnacl/fp16/cast_fp16.h" #include "nnacl/fp16/pack_fp16.h" @@ -34,43 +34,35 @@ using mindspore::lite::RET_OK; using mindspore::schema::PrimitiveType_Conv2D; namespace mindspore::kernel { -int 
WinogradFilterTransformFp16(const float16_t *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit, - ConvParameter *conv_param, int oc_block) { +int ConvolutionWinogradFP16CPUKernel::WinogradFilterTransformFp16(const float16_t *weight_data, float *matrix_g, + float *matrix_gt, int oc_block) { // original weight format : ohwi - auto channel_in = conv_param->input_channel_; - auto channel_out = conv_param->output_channel_; - int input_unit_square = input_unit * input_unit; + auto channel_in = conv_param_->input_channel_; + auto channel_out = conv_param_->output_channel_; + int ic8 = UP_DIV(channel_in, C8NUM); + int ic4 = ic8 * 2; + int input_unit_square = input_unit_ * input_unit_; + int oc_block_num = UP_DIV(channel_out, oc_block); - // generate matrix_G && matrix_GT - auto matrix_g = TransformMatrixGenerator(input_unit, kernel_unit); - if (matrix_g == nullptr) { - MS_LOG(ERROR) << "matrix_g is null."; - delete matrix_g; - return RET_ERROR; - } - auto matrix_gt = TransformMatrixGenerator(kernel_unit, input_unit); - if (matrix_gt == nullptr) { - MS_LOG(ERROR) << "matrix_gt is null."; - delete matrix_g; - delete matrix_gt; - return RET_ERROR; - } - ChooseMatrixG(matrix_g, matrix_gt); - auto matrix_g_data = reinterpret_cast(matrix_g->GetData()); - auto matrix_gt_data = reinterpret_cast(matrix_gt->GetData()); - auto matrix_g_data_fp16 = reinterpret_cast(malloc(input_unit * kernel_unit * sizeof(float16_t))); - auto matrix_gt_data_fp16 = reinterpret_cast(malloc(input_unit * kernel_unit * sizeof(float16_t))); - Float32ToFloat16(matrix_g_data, matrix_g_data_fp16, input_unit * kernel_unit); - Float32ToFloat16(matrix_gt_data, matrix_gt_data_fp16, input_unit * kernel_unit); + auto matrix_g_data_fp16 = reinterpret_cast(malloc(input_unit_ * kernel_unit_ * sizeof(float16_t))); + auto matrix_gt_data_fp16 = reinterpret_cast(malloc(input_unit_ * kernel_unit_ * sizeof(float16_t))); + Float32ToFloat16(matrix_g, matrix_g_data_fp16, input_unit_ * kernel_unit_); + 
Float32ToFloat16(matrix_gt, matrix_gt_data_fp16, input_unit_ * kernel_unit_); // trans_filter = G*g*GT (g represents weight_data) // separate into two steps ===> tmp = G*g ===> out = tmp * GT - auto tmp_weight_data = reinterpret_cast(malloc(kernel_unit * kernel_unit * sizeof(float16_t))); - auto tmp_data = reinterpret_cast(malloc(input_unit * kernel_unit * sizeof(float16_t))); - auto trans_out_data = reinterpret_cast(malloc(input_unit * input_unit * sizeof(float16_t))); - bool row = true; - auto trans_weight_data = reinterpret_cast(trans_weight->GetData()); - std::vector strides = trans_weight->GetStride(); + auto tmp_weight_data = reinterpret_cast(malloc(kernel_unit_ * kernel_unit_ * sizeof(float16_t))); + auto tmp_data = reinterpret_cast(malloc(input_unit_ * kernel_unit_ * sizeof(float16_t))); + auto trans_out_data = reinterpret_cast(malloc(input_unit_ * input_unit_ * sizeof(float16_t))); + std::vector shape{input_unit_ * input_unit_, oc_block_num, ic4, C4NUM, oc_block}; + std::vector strides; + for (int i = 0; i < 4; i++) { + int stride = 1; + for (int j = i + 1; j < 5; j++) { + stride *= shape[j]; + } + strides.push_back(stride); + } int kernel_plane_stride = channel_in; if (oc_block == 0) { @@ -80,33 +72,31 @@ int WinogradFilterTransformFp16(const float16_t *weight_data, Matrix *trans_weig free(trans_out_data); free(matrix_g_data_fp16); free(matrix_gt_data_fp16); - delete matrix_g; - delete matrix_gt; return RET_ERROR; } for (int i = 0; i < channel_out; i++) { int out_c_block = i / oc_block; int out_c_res = i % oc_block; - int input_oz_offset = i * kernel_unit * kernel_unit * channel_in; - int output_oz_offset = out_c_block * strides[1] * input_unit * input_unit + out_c_res; + int input_oz_offset = i * kernel_unit_ * kernel_unit_ * channel_in; + int output_oz_offset = out_c_block * strides[1] * input_unit_ * input_unit_ + out_c_res; for (int j = 0; j < channel_in; j++) { int ic4_block = j / C4NUM; int ic4_res = j % C4NUM; int input_iz_offset = input_oz_offset 
+ j; int output_iz_offset = output_oz_offset + ic4_block * strides[2] + ic4_res * strides[3]; - for (int k = 0; k < kernel_unit * kernel_unit; k++) { + for (int k = 0; k < kernel_unit_ * kernel_unit_; k++) { int input_xy_offset = input_iz_offset + k * kernel_plane_stride; tmp_weight_data[k] = *(weight_data + input_xy_offset); } // now we only support row-major matrix-multiply // tmp = G * g - MatrixMultiplyFp16(matrix_g_data_fp16, tmp_weight_data, tmp_data, input_unit, kernel_unit, kernel_unit, row); + MatrixMultiplyFp16(matrix_g_data_fp16, tmp_weight_data, tmp_data, input_unit_, kernel_unit_, kernel_unit_); // out = tmp * GT - MatrixMultiplyFp16(tmp_data, matrix_gt_data_fp16, trans_out_data, input_unit, kernel_unit, input_unit, row); + MatrixMultiplyFp16(tmp_data, matrix_gt_data_fp16, trans_out_data, input_unit_, kernel_unit_, input_unit_); for (int z = 0; z < input_unit_square; z++) { int output_xy_offset = output_iz_offset + z * strides[1]; - *(trans_weight_data + output_xy_offset) = trans_out_data[z]; + trans_weight_[output_xy_offset] = trans_out_data[z]; } } } @@ -115,15 +105,58 @@ int WinogradFilterTransformFp16(const float16_t *weight_data, Matrix *trans_weig free(trans_out_data); free(matrix_g_data_fp16); free(matrix_gt_data_fp16); - delete matrix_g; - delete matrix_gt; return RET_OK; } +int ConvolutionWinogradFP16CPUKernel::MallocTransformMatrices() { + matrix_a_ = reinterpret_cast(malloc(input_unit_ * output_unit_ * sizeof(float16_t))); + if (matrix_a_ == nullptr) { + MS_LOG(ERROR) << "malloc matrix_a_ failed."; + return RET_ERROR; + } + matrix_at_ = reinterpret_cast(malloc(input_unit_ * output_unit_ * sizeof(float16_t))); + if (matrix_at_ == nullptr) { + MS_LOG(ERROR) << "malloc matrix_at_ failed."; + return RET_ERROR; + } + matrix_b_ = reinterpret_cast(malloc(input_unit_ * input_unit_ * sizeof(float16_t))); + if (matrix_b_ == nullptr) { + MS_LOG(ERROR) << "malloc matrix_b_ failed."; + return RET_ERROR; + } + matrix_bt_ = 
reinterpret_cast(malloc(input_unit_ * input_unit_ * sizeof(float16_t))); + if (matrix_bt_ == nullptr) { + MS_LOG(ERROR) << "malloc matrix_bt_ failed."; + return RET_ERROR; + } + return RET_OK; +} + +void ConvolutionWinogradFP16CPUKernel::FreeTransformMatrices() { + if (matrix_a_ != nullptr) { + free(matrix_a_); + matrix_a_ = nullptr; + } + if (matrix_at_ != nullptr) { + free(matrix_at_); + matrix_at_ = nullptr; + } + if (matrix_b_ != nullptr) { + free(matrix_b_); + matrix_b_ = nullptr; + } + if (matrix_bt_ != nullptr) { + free(matrix_bt_); + matrix_bt_ = nullptr; + } + return; +} + int ConvolutionWinogradFP16CPUKernel::InitWeightBias() { auto filter_tensor = in_tensors_.at(kWeightIndex); int in_channel = filter_tensor->Channel(); int out_channel = filter_tensor->Batch(); + int ic8 = UP_DIV(in_channel, C8NUM); conv_param_->input_channel_ = in_channel; conv_param_->output_channel_ = out_channel; @@ -132,19 +165,43 @@ int ConvolutionWinogradFP16CPUKernel::InitWeightBias() { oc_block_num = UP_DIV(out_channel, C8NUM); // init weight - auto ret = MallocFilterMatrix(oc_block, oc_block_num); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Malloc filter matrix failed."; - return RET_ERROR; - } - - ret = ConvolutionBaseFP16CPUKernel::GetExecuteFilter(); + auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteFilter(); if (ret != RET_OK) { MS_LOG(ERROR) << "Get Execute filter failed."; return ret; } - ret = WinogradFilterTransformFp16(execute_weight_, trans_weight_, kernel_unit_, input_unit_, conv_param_, oc_block); + // set data + auto trans_matrix_data_size = input_unit_ * input_unit_ * ic8 * C8NUM * oc_block_num * oc_block * sizeof(float16_t); + trans_weight_ = reinterpret_cast(malloc(trans_matrix_data_size)); + if (trans_weight_ == nullptr) { + MS_LOG(ERROR) << "malloc trans_weight_ failed."; + return RET_ERROR; + } + memset(trans_weight_, 0, trans_matrix_data_size); + auto *matrix_g = reinterpret_cast(malloc(input_unit_ * kernel_unit_ * sizeof(float))); + auto matrix_gt = 
reinterpret_cast(malloc(input_unit_ * kernel_unit_ * sizeof(float))); + ret = MallocTransformMatrices(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Malloc transform matrices failed."; + return ret; + } + + float matrix_a[MAX_LEN]; + float matrix_at[MAX_LEN]; + float matrix_b[MAX_LEN]; + float matrix_bt[MAX_LEN]; + CookToomFilter(matrix_a, matrix_at, matrix_b, matrix_bt, matrix_g, matrix_gt, 0.5f, output_unit_, kernel_unit_); + Float32ToFloat16(matrix_a, matrix_a_, input_unit_ * output_unit_); + Float32ToFloat16(matrix_at, matrix_at_, input_unit_ * output_unit_); + Float32ToFloat16(matrix_b, matrix_b_, input_unit_ * input_unit_); + Float32ToFloat16(matrix_bt, matrix_bt_, input_unit_ * input_unit_); + matrices_[0] = matrix_a_; + matrices_[1] = matrix_at_; + matrices_[2] = matrix_b_; + matrices_[3] = matrix_bt_; + + ret = WinogradFilterTransformFp16(execute_weight_, matrix_g, matrix_gt, oc_block); if (ret != RET_OK) { MS_LOG(ERROR) << "winograd filter transfrom failed."; return ret; @@ -166,49 +223,8 @@ int ConvolutionWinogradFP16CPUKernel::InitWeightBias() { } else { MS_ASSERT(inputs_.size() == kInputSize1); } - return RET_OK; -} - -int ConvolutionWinogradFP16CPUKernel::MallocFilterMatrix(int oc_block, int oc_block_num) { - int channel_in = conv_param_->input_channel_; - int ic8 = UP_DIV(channel_in, C8NUM); - int ic4 = ic8 * 2; - - // set data - auto trans_matrix_data_size = input_unit_ * input_unit_ * ic8 * C8NUM * oc_block_num * oc_block * sizeof(float); - auto matrix_buffer = malloc(trans_matrix_data_size); - if (matrix_buffer == nullptr) { - MS_LOG(ERROR) << "malloc matrix_buffer failed."; - return RET_ERROR; - } - memset(matrix_buffer, 0, trans_matrix_data_size); - trans_weight_ = new (std::nothrow) Matrix(); - if (trans_weight_ == nullptr) { - MS_LOG(ERROR) << "new Matrix fail!"; - free(matrix_buffer); - return RET_ERROR; - } - trans_weight_->SetData(matrix_buffer); - trans_weight_->SetNDim(5); - - std::vector shapes; - std::vector strides; - // set shape - 
shapes.push_back(input_unit_ * input_unit_); - shapes.push_back(oc_block_num); - shapes.push_back(ic4); - shapes.push_back(C4NUM); - shapes.push_back(oc_block); - // set stride - for (int i = 0; i < 4; i++) { - int stride = 1; - for (int j = i + 1; j < 5; j++) { - stride *= shapes[j]; - } - strides.push_back(stride); - } - trans_weight_->SetShape(shapes); - trans_weight_->SetStride(strides); + free(matrix_g); + free(matrix_gt); return RET_OK; } @@ -260,19 +276,7 @@ int ConvolutionWinogradFP16CPUKernel::InitTmpBuffer() { int ConvolutionWinogradFP16CPUKernel::ConfigInputOutput() { auto output_tensor = out_tensors_.at(kOutputIndex); - output_tensor->SetFormat(schema::Format::Format_NHWC); - - // choose input transformer function (4x4 unit or 8x8 unit) - input_trans_func_ = GetInputTransFuncFp16(input_unit_); - if (input_trans_func_ == nullptr) { - MS_LOG(ERROR) << "Get input_trans_func failed."; - return RET_ERROR; - } - output_trans_func_ = GetOutputTransFuncFp16(input_unit_, output_unit_); - if (output_trans_func_ == nullptr) { - MS_LOG(ERROR) << "Get output_trans_func_ failed."; - return RET_ERROR; - } + output_tensor->SetFormat(schema::Format_NHWC); return RET_OK; } @@ -334,9 +338,9 @@ int ConvolutionWinogradFP16CPUKernel::ReSize() { } int ConvolutionWinogradFP16CPUKernel::RunImpl(int task_id) { - ConvWinogardFp16(reinterpret_cast(nhwc4_input_), reinterpret_cast(trans_weight_->GetData()), + ConvWinogardFp16(reinterpret_cast(nhwc4_input_), trans_weight_, reinterpret_cast(bias_data_), tmp_buffer_address_list_, task_id, conv_param_, - input_trans_func_, output_trans_func_); + matrices_); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h index 2440062b438..db576525d97 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h @@ -22,9 +22,9 @@ 
#include "src/lite_kernel.h" #include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h" #include "nnacl/fp16/conv_fp16.h" -#include "src/runtime/kernel/arm/fp16/matrix_fp16.h" #include "nnacl/fp16/winograd_utils_fp16.h" #include "nnacl/optimized_kernel.h" +#include "nnacl/minimal_filtering_generator.h" namespace mindspore::kernel { class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel { @@ -39,9 +39,10 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel { fp16_weight_ = nullptr; } if (trans_weight_ != nullptr) { - delete trans_weight_; + free(trans_weight_); trans_weight_ = nullptr; } + FreeTransformMatrices(); } int Init() override; @@ -49,10 +50,12 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel { int Run() override; int RunImpl(int task_id); int InitWeightBias(); - int MallocFilterMatrix(int oc_block, int oc_block_num); + int MallocTransformMatrices(); + void FreeTransformMatrices(); int InitTmpBuffer(); int ConfigInputOutput(); int PostProcess(); + int WinogradFilterTransformFp16(const float16_t *weight_data, float *matrix_g, float *matrix_gt, int oc_block); private: void FreeTmpBuffer() { @@ -80,13 +83,14 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel { float16_t *trans_input_ = nullptr; float16_t *gemm_out_ = nullptr; float16_t *tmp_out_data_ = nullptr; - Matrix *trans_weight_ = nullptr; - InputTransformUnitFp16Func input_trans_func_; - OutputTransformUnitFp16Func output_trans_func_; + float16_t *matrix_a_ = nullptr; + float16_t *matrix_at_ = nullptr; + float16_t *matrix_b_ = nullptr; + float16_t *matrix_bt_ = nullptr; + float16_t *trans_weight_ = nullptr; TmpBufferAddressFp16 tmp_buffer_address_list_[4]; + MatricesFp16 matrices_[4]; }; -int WinogradFilterTransformFp16(const float16_t *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit, - ConvParameter *conv_param, int oc_block); } // namespace mindspore::kernel #endif // 
MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_WINOGRAD_FP16_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/matrix_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/matrix_fp16.cc deleted file mode 100644 index 3048b74527a..00000000000 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/matrix_fp16.cc +++ /dev/null @@ -1,36 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "src/runtime/kernel/arm/fp16/matrix_fp16.h" - -namespace mindspore::kernel { - -void MatrixMultiplyFp16(const float16_t *matrix_a, const float16_t *matrix_b, float16_t *matrix_c, int m, int k, int n, - bool row) { - // row-major implementation - int count = 0; - for (int h = 0; h < m; h++) { - int h_offset = h * k; - for (int w = 0; w < n; w++) { - float16_t res = 0; - for (int i = 0; i < k; i++) { - res += *(matrix_a + h_offset + i) * *(matrix_b + w + i * n); - } - *(matrix_c + count) = res; - count++; - } - } -} -} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc index b9f3a5896a4..4c1198013a9 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc @@ -228,10 +228,8 @@ kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector & conv_param->op_parameter_.thread_num_ = ctx->thread_num_; bool 
use_winograd = false; int out_unit; - InputTransformUnitFunc input_trans_func = nullptr; - OutputTransformUnitFunc output_trans_func = nullptr; if (primitive != nullptr && primitive->GetInferFlag()) { - CheckIfUseWinograd(&use_winograd, &out_unit, conv_param, input_trans_func, output_trans_func); + CheckIfUseWinograd(&use_winograd, &out_unit, conv_param); } auto *weight_tensor = inputs.at(kWeightIndex); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc index 51170850114..4398486996e 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc @@ -28,39 +28,29 @@ using mindspore::lite::RET_OK; using mindspore::schema::PrimitiveType_Conv2D; namespace mindspore::kernel { -int WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit, - ConvParameter *conv_param, int oc_block) { +int ConvolutionWinogradCPUKernel::WinogradFilterTransform(const float *weight_data, float *matrix_g, float *matrix_gt, + int oc_block) { // original weight format : ohwi - auto channel_in = conv_param->input_channel_; - auto channel_out = conv_param->output_channel_; - int input_unit_square = input_unit * input_unit; - - // generate matrix_G && matrix_GT - auto matrix_g = TransformMatrixGenerator(input_unit, kernel_unit); - if (matrix_g == nullptr) { - MS_LOG(ERROR) << "matrix_g is null."; - delete matrix_g; - return RET_ERROR; - } - auto matrix_gt = TransformMatrixGenerator(kernel_unit, input_unit); - if (matrix_gt == nullptr) { - MS_LOG(ERROR) << "matrix_gt is null."; - delete matrix_g; - delete matrix_gt; - return RET_ERROR; - } - ChooseMatrixG(matrix_g, matrix_gt); - auto matrix_g_data = reinterpret_cast(matrix_g->GetData()); - auto matrix_gt_data = reinterpret_cast(matrix_gt->GetData()); + auto channel_in = conv_param_->input_channel_; + auto channel_out = 
conv_param_->output_channel_; + int input_unit_square = input_unit_ * input_unit_; + int ic4 = UP_DIV(channel_in, C4NUM); + int oc_block_num = UP_DIV(channel_out, oc_block); // trans_filter = G*g*GT (g represents weight_data) // separate into two steps ===> tmp = G*g ===> out = tmp * GT - auto tmp_weight_data = reinterpret_cast(malloc(kernel_unit * kernel_unit * sizeof(float))); - auto tmp_data = reinterpret_cast(malloc(input_unit * kernel_unit * sizeof(float))); - auto trans_out_data = reinterpret_cast(malloc(input_unit * input_unit * sizeof(float))); - bool row = true; - auto trans_weight_data = reinterpret_cast(trans_weight->GetData()); - std::vector strides = trans_weight->GetStride(); + auto tmp_weight_data = reinterpret_cast(malloc(kernel_unit_ * kernel_unit_ * sizeof(float))); + auto tmp_data = reinterpret_cast(malloc(input_unit_ * kernel_unit_ * sizeof(float))); + auto trans_out_data = reinterpret_cast(malloc(input_unit_ * input_unit_ * sizeof(float))); + std::vector shape{input_unit_ * input_unit_, oc_block_num, ic4, C4NUM, oc_block}; + std::vector strides; + for (int i = 0; i < 4; i++) { + int stride = 1; + for (int j = i + 1; j < 5; j++) { + stride *= shape[j]; + } + strides.push_back(stride); + } int kernel_plane_stride = channel_in; if (oc_block == 0) { @@ -68,41 +58,37 @@ int WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int free(tmp_weight_data); free(tmp_data); free(trans_out_data); - delete matrix_g; - delete matrix_gt; return RET_ERROR; } for (int i = 0; i < channel_out; i++) { int out_c_block = i / oc_block; int out_c_res = i % oc_block; - int input_oz_offset = i * kernel_unit * kernel_unit * channel_in; + int input_oz_offset = i * kernel_unit_ * kernel_unit_ * channel_in; int output_oz_offset = out_c_block * strides[1] + out_c_res; for (int j = 0; j < channel_in; j++) { int ic4_block = j / C4NUM; int ic4_res = j % C4NUM; int input_iz_offset = input_oz_offset + j; int output_iz_offset = output_oz_offset + ic4_block * 
strides[2] + ic4_res * strides[3]; - for (int k = 0; k < kernel_unit * kernel_unit; k++) { + for (int k = 0; k < kernel_unit_ * kernel_unit_; k++) { int input_xy_offset = input_iz_offset + k * kernel_plane_stride; tmp_weight_data[k] = *(weight_data + input_xy_offset); } // now we only support row-major matrix-multiply // tmp = G * g - MatrixMultiply(matrix_g_data, tmp_weight_data, tmp_data, input_unit, kernel_unit, kernel_unit, row); + MatrixMultiply(matrix_g, tmp_weight_data, tmp_data, input_unit_, kernel_unit_, kernel_unit_); // out = tmp * GT - MatrixMultiply(tmp_data, matrix_gt_data, trans_out_data, input_unit, kernel_unit, input_unit, row); + MatrixMultiply(tmp_data, matrix_gt, trans_out_data, input_unit_, kernel_unit_, input_unit_); for (int z = 0; z < input_unit_square; z++) { int output_xy_offset = output_iz_offset + z * strides[0]; - *(trans_weight_data + output_xy_offset) = trans_out_data[z]; + *(trans_weight_ + output_xy_offset) = trans_out_data[z]; } } } free(tmp_weight_data); free(tmp_data); free(trans_out_data); - delete matrix_g; - delete matrix_gt; return RET_OK; } @@ -110,6 +96,7 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() { auto filter_tensor = in_tensors_.at(kWeightIndex); int in_channel = filter_tensor->Channel(); int out_channel = filter_tensor->Batch(); + int ic4 = UP_DIV(in_channel, C4NUM); conv_param_->input_channel_ = in_channel; conv_param_->output_channel_ = out_channel; @@ -118,14 +105,26 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() { oc_block = C8NUM; oc_block_num = UP_DIV(out_channel, C8NUM); - // init weight - auto ret = MallocFilterMatrix(oc_block, oc_block_num); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Malloc filter matrix failed."; + // set data + auto trans_matrix_data_size = input_unit_ * input_unit_ * ic4 * C4NUM * oc_block_num * oc_block * sizeof(float); + trans_weight_ = reinterpret_cast(malloc(trans_matrix_data_size)); + if (trans_weight_ == nullptr) { + MS_LOG(ERROR) << "malloc matrix_buffer failed."; 
return RET_ERROR; } + + memset(trans_weight_, 0, trans_matrix_data_size); + + float matrix_g[64]; + float matrix_gt[64]; + float matrix_a[64]; + float matrix_at[64]; + float matrix_b[64]; + float matrix_bt[64]; + CookToomFilter(matrix_a, matrix_at, matrix_b, matrix_bt, matrix_g, matrix_gt, 1.0f, output_unit_, kernel_unit_); + auto weight_data = reinterpret_cast(filter_tensor->MutableData()); - ret = WinogradFilterTransform(weight_data, trans_weight_, kernel_unit_, input_unit_, conv_param_, oc_block); + auto ret = WinogradFilterTransform(weight_data, matrix_g, matrix_gt, oc_block); if (ret != RET_OK) { MS_LOG(ERROR) << "winograd filter transfrom failed."; return ret; @@ -144,48 +143,6 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() { return RET_OK; } -int ConvolutionWinogradCPUKernel::MallocFilterMatrix(int oc_block, int oc_block_num) { - int channel_in = conv_param_->input_channel_; - int ic4 = UP_DIV(channel_in, C4NUM); - - // set data - auto trans_matrix_data_size = input_unit_ * input_unit_ * ic4 * C4NUM * oc_block_num * oc_block * sizeof(float); - auto matrix_buffer = malloc(trans_matrix_data_size); - if (matrix_buffer == nullptr) { - MS_LOG(ERROR) << "malloc matrix_buffer failed."; - return RET_ERROR; - } - memset(matrix_buffer, 0, trans_matrix_data_size); - trans_weight_ = new (std::nothrow) Matrix(); - if (trans_weight_ == nullptr) { - MS_LOG(ERROR) << "new Matrix fail!"; - free(matrix_buffer); - return RET_ERROR; - } - trans_weight_->SetData(matrix_buffer); - trans_weight_->SetNDim(5); - - std::vector shapes; - std::vector strides; - // set shape - shapes.push_back(input_unit_ * input_unit_); - shapes.push_back(oc_block_num); - shapes.push_back(ic4); - shapes.push_back(C4NUM); - shapes.push_back(oc_block); - // set stride - for (int i = 0; i < 4; i++) { - int stride = 1; - for (int j = i + 1; j < 5; j++) { - stride *= shapes[j]; - } - strides.push_back(stride); - } - trans_weight_->SetShape(shapes); - trans_weight_->SetStride(strides); - return 
RET_OK; -} - int ConvolutionWinogradCPUKernel::InitTmpBuffer() { int channel_out = conv_param_->output_channel_; int output_h = conv_param_->output_h_; @@ -245,17 +202,17 @@ int ConvolutionWinogradCPUKernel::ConfigInputOutput() { auto output_tensor = out_tensors_.at(kOutputIndex); output_tensor->SetFormat(schema::Format::Format_NHWC); - // choose input transformer function (4x4 unit or 8x8 unit) - input_trans_func_ = GetInputTransFunc(input_unit_); - if (input_trans_func_ == nullptr) { - MS_LOG(ERROR) << "Get input_trans_func failed."; + in_func_ = GetInputTransFunc(input_unit_); + if (in_func_ == nullptr) { + MS_LOG(ERROR) << "in_func_ is null."; return RET_ERROR; } - output_trans_func_ = GetOutputTransFunc(input_unit_, output_unit_); - if (output_trans_func_ == nullptr) { - MS_LOG(ERROR) << "Get output_trans_func_ failed."; + out_func_ = GetOutputTransFunc(input_unit_, output_unit_); + if (out_func_ == nullptr) { + MS_LOG(ERROR) << "out_func_ is null."; return RET_ERROR; } + // #ifdef ENABLE_ARM32 // gemm_func_ = IndirectGemmFp32_8x4; // #else @@ -326,9 +283,8 @@ int ConvolutionWinogradCPUKernel::RunImpl(int task_id) { MS_LOG(ERROR) << "gemm_func is nullptr."; return RET_ERROR; } - ConvWinogardFp32(reinterpret_cast(nhwc4_input_), reinterpret_cast(trans_weight_->GetData()), - reinterpret_cast(bias_data_), tmp_buffer_address_list_, task_id, conv_param_, - input_trans_func_, output_trans_func_, gemm_func_); + ConvWinogardFp32(reinterpret_cast(nhwc4_input_), trans_weight_, reinterpret_cast(bias_data_), + tmp_buffer_address_list_, task_id, conv_param_, in_func_, out_func_, gemm_func_); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h index 73ea6b0b8f0..3c164cb43bd 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h @@ -19,10 +19,9 @@ #include #include 
"src/lite_kernel.h" - #include "nnacl/winograd_transform.h" +#include "nnacl/minimal_filtering_generator.h" #include "src/runtime/kernel/arm/base/convolution_base.h" -#include "src/runtime/kernel/arm/base/matrix.h" namespace mindspore::kernel { class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel { @@ -35,7 +34,7 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel { trans_weight_(nullptr) {} ~ConvolutionWinogradCPUKernel() override { if (trans_weight_ != nullptr) { - delete trans_weight_; + free(trans_weight_); trans_weight_ = nullptr; } }; @@ -44,10 +43,10 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel { int Run() override; int RunImpl(int task_id); int InitWeightBias(); - int MallocFilterMatrix(int oc_block, int oc_block_num); int InitTmpBuffer(); int ConfigInputOutput(); int PostProcess(); + int WinogradFilterTransform(const float *weight_data, float *matrix_g, float *matrix_gt, int oc_block); private: void FreeTmpBuffer() { @@ -80,13 +79,12 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel { float *gemm_out_ = nullptr; float *tmp_out_data_ = nullptr; float *col_buffer_ = nullptr; - Matrix *trans_weight_ = nullptr; - InputTransformUnitFunc input_trans_func_; - OutputTransformUnitFunc output_trans_func_; + float *trans_weight_ = nullptr; TmpBufferAddress tmp_buffer_address_list_[5]; + InputTransFunc in_func_; + OutputTransFunc out_func_; GEMM_FUNC_FP32 gemm_func_ = nullptr; }; -int WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit, - ConvParameter *conv_param, int oc_block); + } // namespace mindspore::kernel #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_WINOGRAD_H_ diff --git a/mindspore/lite/tools/benchmark/benchmark.h b/mindspore/lite/tools/benchmark/benchmark.h index fc8ac3ec0d1..b69a6a9bf95 100644 --- a/mindspore/lite/tools/benchmark/benchmark.h +++ b/mindspore/lite/tools/benchmark/benchmark.h @@ -181,9 
+181,18 @@ class MS_API Benchmark { auto tolerance = absoluteTolerance + relativeTolerance * fabs(calibTensor->data.at(j)); auto absoluteError = std::fabs(msTensorData[j] - calibTensor->data.at(j)); if (absoluteError > tolerance) { - // just assume that atol = rtol - meanError += absoluteError / (fabs(calibTensor->data.at(j)) + FLT_MIN); - errorCount++; + if (fabs(calibTensor->data.at(j)) == 0) { + if (absoluteError > 1e-5) { + meanError += absoluteError; + errorCount++; + } else { + continue; + } + } else { + // just assume that atol = rtol + meanError += absoluteError / (fabs(calibTensor->data.at(j)) + FLT_MIN); + errorCount++; + } } } std::cout << std::endl;