From cc16a5fe81cb1437f6ac5d32ac42432d21f39681 Mon Sep 17 00:00:00 2001 From: songhonglei413 Date: Thu, 13 Aug 2020 21:13:44 +0800 Subject: [PATCH] fix -d compile --- .../src/runtime/kernel/arm/fp32/activation.cc | 4 +- .../activation_grad.h => activation_grad.c} | 52 +- .../kernel/arm/nnacl/activation_grad.h | 65 +-- .../runtime/kernel/arm/nnacl/common_func.c | 10 + .../runtime/kernel/arm/nnacl/common_func.h | 13 +- .../kernel/arm/nnacl/fp32/activation.c | 66 +++ .../kernel/arm/nnacl/fp32/activation.h | 59 +- .../runtime/kernel/arm/nnacl/fp32/matmul.c | 2 +- .../runtime/kernel/arm/nnacl/int8/div_int8.c | 1 + .../kernel/arm/nnacl/int8/relux_int8.c | 30 ++ .../kernel/arm/nnacl/int8/relux_int8.h | 14 +- .../runtime/kernel/arm/nnacl/matrix_table.c | 507 ++++++++++++++++++ .../runtime/kernel/arm/nnacl/matrix_table.h | 491 +---------------- .../arm/nnacl/quantization/fixed_point.c | 171 ++++++ .../arm/nnacl/quantization/fixed_point.h | 149 +---- .../kernel/arm/nnacl/quantization/quantize.c | 64 +++ .../kernel/arm/nnacl/quantization/quantize.h | 62 +-- .../kernel/arm/fp32/activation_fp32_test.cc | 4 +- 18 files changed, 935 insertions(+), 829 deletions(-) rename mindspore/lite/src/runtime/kernel/arm/nnacl/{fp32_grad/activation_grad.h => activation_grad.c} (56%) create mode 100644 mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/activation.c create mode 100644 mindspore/lite/src/runtime/kernel/arm/nnacl/int8/relux_int8.c create mode 100644 mindspore/lite/src/runtime/kernel/arm/nnacl/matrix_table.c create mode 100644 mindspore/lite/src/runtime/kernel/arm/nnacl/quantization/fixed_point.c diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/activation.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/activation.cc index 9d614303ae0..eca22277636 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/activation.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/activation.cc @@ -47,9 +47,9 @@ int ActivationCPUKernel::DoActivation(int task_id) { auto error_code = RET_OK; if (type_ == schema::ActivationType_RELU) { - error_code = Relu(input_addr + stride * task_id, count, output_addr + stride * task_id); + error_code = Fp32Relu(input_addr + stride * task_id, count, output_addr + stride * task_id); } else if (type_ == schema::ActivationType_RELU6) { - error_code = Relu6(input_addr + stride * task_id, count, output_addr + stride * task_id); + error_code = Fp32Relu6(input_addr + stride * task_id, count, output_addr + stride * task_id); } else if (type_ == schema::ActivationType_LEAKY_RELU) { error_code = LRelu(input_addr + stride * task_id, count, output_addr + stride * task_id, alpha_); } else if (type_ == schema::ActivationType_SIGMOID) { diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32_grad/activation_grad.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/activation_grad.c similarity index 56% rename from mindspore/lite/src/runtime/kernel/arm/nnacl/fp32_grad/activation_grad.h rename to mindspore/lite/src/runtime/kernel/arm/nnacl/activation_grad.c index c85a6e42bef..7dcc11b5580 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32_grad/activation_grad.h +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/activation_grad.c @@ -13,33 +13,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FP32_GRAD_ACTIVATION_GRAD_H_ -#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FP32_GRAD_ACTIVATION_GRAD_H_ +#include "nnacl/activation_grad.h" -#include -#include "nnacl/op_base.h" -#include "nnacl/fp32/arithmetic.h" -#include "nnacl/errorcode.h" - -typedef struct ActivationGradParameter { - OpParameter op_parameter{}; - int type_; - float alpha_{0.01}; -} ActivationGradParameter; - -#ifdef __cplusplus -extern "C" { -#endif - -inline int ReluGrad(float *src0, float *src1, int length, float *dst) { +int ReluGrad(float *src0, float *src1, int length, float *dst) { for (int i = 0; i < length; ++i) { dst[i] = src1[i] > 0 ? 1.0f : 0.0f; } ElementMul(src0, dst, dst, length); - return OPCLIB_OK; + return NNACL_OK; } -inline int Relu6Grad(float *src0, float *src1, int length, float *dst) { +int Relu6Grad(float *src0, float *src1, int length, float *dst) { for (int i = 0; i < length; ++i) { if (src1[i] < 0) { dst[i] = 0; @@ -48,49 +32,43 @@ inline int Relu6Grad(float *src0, float *src1, int length, float *dst) { } } ElementMul(src0, dst, dst, length); - return OPCLIB_OK; + return NNACL_OK; } -inline int LReluGrad(float *src0, float *src1, int length, float *dst, float alpha) { +int LReluGrad(float *src0, float *src1, int length, float *dst, float alpha) { for (int i = 0; i < length; ++i) { dst[i] = src1[i] > 0.0f ? 1.0f : alpha; } ElementMul(src0, dst, dst, length); - return OPCLIB_OK; + return NNACL_OK; } -inline int SigmoidGrad(float *src0, float *src1, int length, float *dst) { +int SigmoidGrad(float *src0, float *src1, int length, float *dst) { for (int i = 0; i < length; ++i) { dst[i] = src0[i] * (src1[i] * (1.0f - src1[i])); } - return OPCLIB_OK; + return NNACL_OK; } -inline int TanhGrad(float *src0, float *src1, int length, float *dst) { +int TanhGrad(float *src0, float *src1, int length, float *dst) { for (int i = 0; i < length; ++i) { dst[i] = (1.0f - (src1[i] * src1[i])) * src0[i]; } - return OPCLIB_OK; + return NNACL_OK; } -inline int HSwishGrad(float *src0, float *src1, int length, float *dst) { +int HSwishGrad(float *src0, float *src1, int length, float *dst) { for (int i = 0; i < length; ++i) { float tmp = (src1[i] > 3.0f ? 1.0f : (src1[i] < -3.0f ? 0.0f : (2.0f * src1[i] + 3.0f) / 6.0f)); dst[i] = tmp * src0[i]; } - return OPCLIB_OK; + return NNACL_OK; } -inline int HSigmoidGrad(float *src0, float *src1, int length, float *dst) { +int HSigmoidGrad(float *src0, float *src1, int length, float *dst) { for (int i = 0; i < length; ++i) { float tmp = (src1[i] > 3.0f ? 1.0f : (src1[i] < -3.0f ? 0.0f : 1.0f / 6.0f)); dst[i] = tmp * src0[i]; } - return OPCLIB_OK; + return NNACL_OK; } - -#ifdef __cplusplus -} -#endif - -#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FP32_GRAD_ACTIVATION_GRAD_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/activation_grad.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/activation_grad.h index eb11039dca5..de20bc95e6a 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/activation_grad.h +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/activation_grad.h @@ -22,7 +22,7 @@ #include "nnacl/errorcode.h" typedef struct ActivationGradParameter { - OpParameter op_parameter{}; + OpParameter op_parameter; int type_; float alpha_; } ActivationGradParameter; @@ -30,63 +30,14 @@ typedef struct ActivationGradParameter { extern "C" { #endif -inline int ReluGrad(float *src0, float *src1, int length, float *dst) { - for (int i = 0; i < length; ++i) { - dst[i] = src1[i] > 0 ? 1.0f : 0.0f; - } - ElementMul(src0, dst, dst, length); - return NNACL_OK; -} +int ReluGrad(float *src0, float *src1, int length, float *dst); +int Relu6Grad(float *src0, float *src1, int length, float *dst); +int LReluGrad(float *src0, float *src1, int length, float *dst, float alpha); +int SigmoidGrad(float *src0, float *src1, int length, float *dst); +int TanhGrad(float *src0, float *src1, int length, float *dst); +int HSwishGrad(float *src0, float *src1, int length, float *dst); +int HSigmoidGrad(float *src0, float *src1, int length, float *dst); -inline int Relu6Grad(float *src0, float *src1, int length, float *dst) { - for (int i = 0; i < length; ++i) { - if (src1[i] < 0) { - dst[i] = 0; - } else { - dst[i] = src1[i] > 6.0f ? 0.0f : 1.0f; - } - } - ElementMul(src0, dst, dst, length); - return NNACL_OK; -} - -inline int LReluGrad(float *src0, float *src1, int length, float *dst, float alpha) { - for (int i = 0; i < length; ++i) { - dst[i] = src1[i] > 0.0f ? 1.0f : alpha; - } - ElementMul(src0, dst, dst, length); - return NNACL_OK; -} - -inline int SigmoidGrad(float *src0, float *src1, int length, float *dst) { - for (int i = 0; i < length; ++i) { - dst[i] = src0[i] * (src1[i] * (1.0f - src1[i])); - } - return NNACL_OK; -} - -inline int TanhGrad(float *src0, float *src1, int length, float *dst) { - for (int i = 0; i < length; ++i) { - dst[i] = (1.0f - (src1[i] * src1[i])) * src0[i]; - } - return NNACL_OK; -} - -inline int HSwishGrad(float *src0, float *src1, int length, float *dst) { - for (int i = 0; i < length; ++i) { - float tmp = (src1[i] > 3.0f ? 1.0f : (src1[i] < -3.0f ? 0.0f : (2.0f * src1[i] + 3.0f) / 6.0f)); - dst[i] = tmp * src0[i]; - } - return NNACL_OK; -} - -inline int HSigmoidGrad(float *src0, float *src1, int length, float *dst) { - for (int i = 0; i < length; ++i) { - float tmp = (src1[i] > 3.0f ? 1.0f : (src1[i] < -3.0f ? 0.0f : 1.0f / 6.0f)); - dst[i] = tmp * src0[i]; - } - return NNACL_OK; -} #ifdef __cplusplus } #endif diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/common_func.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/common_func.c index 46024c510a1..43774cad828 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/common_func.c +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/common_func.c @@ -17,6 +17,16 @@ #include "nnacl/common_func.h" #include "nnacl/quantization/fixed_point.h" +int offset(const int *shape, const int dim0, const int dim1, const int dim2, const int dim3) { + return ((dim0 * shape[1] + dim1) * shape[2] + dim2) * shape[3] + dim3; +} + +int offsetComm(const int *shape, const int dim0, const int dim1, const int dim2) { + return ((dim0 * shape[1] + dim1) * shape[2] + dim2) * shape[3]; +} + +int offset4d(const int *shape, const int *dims) { return offset(shape, dims[0], dims[1], dims[2], dims[3]); } + #ifndef ENABLE_ARM64 void IndirectGemmFp32(float *output, const float *input, const float *weight, const float *bias, size_t step, int ic4, int output_channel, size_t offset, size_t relu, size_t relu6) { diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/common_func.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/common_func.h index 33f6256646f..e72966bd2a1 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/common_func.h +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/common_func.h @@ -45,16 +45,9 @@ void IndirectGemmFp32_Comm(float *output, const float *input, const float *weigh size_t offset); void IndirectGemmFp32(float *output, const float *input, const float *weight, const float *bias, size_t step, int ic4, int output_channel, size_t offset, size_t relu, size_t relu6); - -inline int offset(const int *shape, const int dim0, const int dim1, const int dim2, const int dim3) { - return ((dim0 * shape[1] + dim1) * shape[2] + dim2) * shape[3] + dim3; -} - -inline int offsetComm(const int *shape, const int dim0, const int dim1, const int dim2) { - return ((dim0 * shape[1] + dim1) * shape[2] + dim2) * shape[3]; -} - -inline int offset4d(const int *shape, const int *dims) { return offset(shape, dims[0], dims[1], dims[2], dims[3]); } +int offset(const int *shape, const int dim0, const int dim1, const int dim2, const int dim3); +int offsetComm(const int *shape, const int dim0, const int dim1, const int dim2); +int offset4d(const int *shape, const int *dims); #ifdef ENABLE_ARM64 void BiasAdd(const float *bias, float *data, size_t oc4, size_t plan_size); diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/activation.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/activation.c new file mode 100644 index 00000000000..17e340751ea --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/activation.c @@ -0,0 +1,66 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "nnacl/fp32/activation.h" +#include "nnacl/errorcode.h" + +int Fp32Relu(const float *src, int length, float *dst) { + for (int i = 0; i < length; ++i) { + dst[i] = src[i] > 0 ? src[i] : 0; + } + return NNACL_OK; +} + +int Fp32Relu6(const float *src, int length, float *dst) { + for (int i = 0; i < length; ++i) { + if (src[i] < 0) { + dst[i] = 0; + } else { + dst[i] = src[i] > 6.0f ? 6.0f : src[i]; + } + } + return NNACL_OK; +} + +int LRelu(const float *src, int length, float *dst, float alpha) { + for (int i = 0; i < length; ++i) { + dst[i] = src[i] > 0 ? src[i] : (src[i] * alpha); + } + return NNACL_OK; +} + +int Sigmoid(const float *src, int length, float *dst) { + for (int i = 0; i < length; ++i) { + dst[i] = 1.0f / (1.0f + exp(-src[i])); + } + return NNACL_OK; +} + +int Tanh(const float *src, int length, float *dst) { + for (int i = 0; i < length; ++i) { + dst[i] = 1.0f - 2.0f / (exp(2 * src[i]) + 1); + } + return NNACL_OK; +} + +int HSwish(const float *src, int length, float *dst) { + for (int i = 0; i < length; ++i) { + float in = src[i]; + float relu6 = MSMIN(MSMAX(in + 3, 0), 6); + dst[i] = in * relu6 / 6; + } + return NNACL_OK; +} diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/activation.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/activation.h index bc047335cf1..89ce7ec8e83 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/activation.h +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/activation.h @@ -18,7 +18,6 @@ #include #include "nnacl/op_base.h" -#include "nnacl/errorcode.h" #include "nnacl/quantization/fixed_point.h" typedef struct ActivationParameter { @@ -27,52 +26,16 @@ typedef struct ActivationParameter { float alpha_; } ActivationParameter; -inline int Relu(const float *src, int length, float *dst) { - for (int i = 0; i < length; ++i) { - dst[i] = src[i] > 0 ? src[i] : 0; - } - return NNACL_OK; +#ifdef __cplusplus +extern "C" { +#endif +int Fp32Relu(const float *src, int length, float *dst); +int Fp32Relu6(const float *src, int length, float *dst); +int LRelu(const float *src, int length, float *dst, float alpha); +int Sigmoid(const float *src, int length, float *dst); +int Tanh(const float *src, int length, float *dst); +int HSwish(const float *src, int length, float *dst); +#ifdef __cplusplus } - -inline int Relu6(const float *src, int length, float *dst) { - for (int i = 0; i < length; ++i) { - if (src[i] < 0) { - dst[i] = 0; - } else { - dst[i] = src[i] > 6.0f ? 6.0f : src[i]; - } - } - return NNACL_OK; -} - -inline int LRelu(const float *src, int length, float *dst, float alpha) { - for (int i = 0; i < length; ++i) { - dst[i] = src[i] > 0 ? src[i] : (src[i] * alpha); - } - return NNACL_OK; -} - -inline int Sigmoid(const float *src, int length, float *dst) { - for (int i = 0; i < length; ++i) { - dst[i] = 1.0f / (1.0f + exp(-src[i])); - } - return NNACL_OK; -} - -inline int Tanh(const float *src, int length, float *dst) { - for (int i = 0; i < length; ++i) { - dst[i] = 1.0f - 2.0f / (exp(2 * src[i]) + 1); - } - return NNACL_OK; -} - -inline int HSwish(const float *src, int length, float *dst) { - for (int i = 0; i < length; ++i) { - float in = src[i]; - float relu6 = MSMIN(MSMAX(in + 3, 0), 6); - dst[i] = in * relu6 / 6; - } - return NNACL_OK; -} - +#endif #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_ACTIVATION_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/matmul.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/matmul.c index 055d1fe3f8b..bae8dacdbd3 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/matmul.c +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/matmul.c @@ -119,7 +119,7 @@ void RowMajor2Col8Major(float *src_ptr, float *dst_ptr, size_t row, size_t col) return; } -inline void MatrixUnPackUnit(const void *src, void *dst, size_t row, size_t col, size_t src_stride, size_t dst_stride, +void MatrixUnPackUnit(const void *src, void *dst, size_t row, size_t col, size_t src_stride, size_t dst_stride, size_t data_lenth) { size_t copy_size = col * data_lenth; size_t src_size = src_stride * data_lenth; diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/div_int8.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/div_int8.c index 72e3705960a..f3b8d86b667 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/div_int8.c +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/div_int8.c @@ -17,6 +17,7 @@ #include "nnacl/int8/div_int8.h" #include "nnacl/quantization/fixed_point.h" #include "nnacl/errorcode.h" +#include "nnacl/quantization/quantize.h" int DivInt8(int8_t *input0_data, int8_t *input1_data, int8_t *output_data, int64_t real_dst_count, DivQuantArg *para) { int index = 0; diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/relux_int8.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/relux_int8.c new file mode 100644 index 00000000000..b2a1c0525f0 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/relux_int8.c @@ -0,0 +1,30 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "nnacl/int8/relux_int8.h" + +void ReluXInt8(const int8_t *src, int length, int8_t *dst, ReluXQuantArg *arg) { + for (int i = 0; i < length; ++i) { + if (src[i] <= arg->input_arg.zp_) { + dst[i] = arg->output_arg.zp_; + continue; + } + const int32_t input_val = src[i] - arg->input_arg.zp_; + const int32_t scaled_input = SaturatingRoundingDoublingHighMul(input_val, arg->input_multiplier_); + const int32_t shifted_input = RoundingDivideByPOT(scaled_input * (1 << arg->left_shift_), -arg->right_shift_); + const int32_t output = shifted_input + arg->output_arg.zp_; + dst[i] = (int8_t)MSMIN(output, arg->quantized_output_max); + } +} diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/relux_int8.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/relux_int8.h index 6ea33ce8576..94a425b8372 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/relux_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/relux_int8.h @@ -35,19 +35,7 @@ typedef struct ReluXQuantArg { #ifdef __cplusplus extern "C" { #endif -inline void ReluXInt8(const int8_t *src, int length, int8_t *dst, ReluXQuantArg *arg) { - for (int i = 0; i < length; ++i) { - if (src[i] <= arg->input_arg.zp_) { - dst[i] = arg->output_arg.zp_; - continue; - } - const int32_t input_val = src[i] - arg->input_arg.zp_; - const int32_t scaled_input = SaturatingRoundingDoublingHighMul(input_val, arg->input_multiplier_); - const int32_t shifted_input = RoundingDivideByPOT(scaled_input * (1 << arg->left_shift_), -arg->right_shift_); - const int32_t output = shifted_input + arg->output_arg.zp_; - dst[i] = (int8_t)MSMIN(output, arg->quantized_output_max); - } -} +void ReluXInt8(const int8_t *src, int length, int8_t *dst, ReluXQuantArg *arg); #ifdef __cplusplus } #endif diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/matrix_table.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/matrix_table.c new file mode 100644 index 00000000000..3f4329bb6a9 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/matrix_table.c @@ -0,0 +1,507 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "nnacl/matrix_table.h" + +void MatrixG4x2(float *matrix_data) { + matrix_data[0] = 1.0f; + matrix_data[1] = 0.0f; + matrix_data[2] = 1.0f; + matrix_data[3] = 0.5f; + matrix_data[4] = 1.0f; + matrix_data[5] = -0.5f; + matrix_data[6] = 0.0f; + matrix_data[7] = 1.0f; +} + +void MatrixGT2x4(float *matrix_data) { + matrix_data[0] = 1.0f; + matrix_data[1] = 1.0f; + matrix_data[2] = 1.0f; + matrix_data[3] = 0.0f; + matrix_data[4] = 0.0f; + matrix_data[5] = 0.5f; + matrix_data[6] = -0.5f; + matrix_data[7] = 1.0f; +} + +void MatrixG8x2(float *matrix_data) { + matrix_data[0] = 1.0f; + matrix_data[1] = 0.0f; + matrix_data[2] = 1.0f; + matrix_data[3] = 0.5f; + matrix_data[4] = 1.0f; + matrix_data[5] = -0.5f; + matrix_data[6] = 1.0f; + matrix_data[7] = 1.0f; + matrix_data[8] = 1.0f; + matrix_data[9] = -1.0f; + matrix_data[10] = 1.0f; + matrix_data[11] = 1.5f; + matrix_data[12] = 1.0f; + matrix_data[13] = -1.5f; + matrix_data[14] = 0.0f; + matrix_data[15] = 1.0f; +} + +void MatrixGT2x8(float *matrix_data) { + matrix_data[0] = 1.0f; + matrix_data[1] = 1.0f; + matrix_data[2] = 1.0f; + matrix_data[3] = 1.5f; + matrix_data[4] = 1.0f; + matrix_data[5] = 1.0f; + matrix_data[6] = 1.0f; + matrix_data[7] = 0.0f; + matrix_data[8] = 0.0f; + matrix_data[9] = 0.5f; + matrix_data[10] = -0.5f; + matrix_data[11] = 1.0f; + matrix_data[12] = -1.0f; + matrix_data[13] = 1.5f; + matrix_data[14] = -1.5f; + matrix_data[15] = 1.0f; +} + +void MatrixG8x3(float *matrix_data) { + matrix_data[0] = 1.0f; + matrix_data[1] = 0.0f; + matrix_data[2] = 0.0f; + matrix_data[3] = 1.0f; + matrix_data[4] = 0.5f; + matrix_data[5] = 0.25f; + matrix_data[6] = 1.0f; + matrix_data[7] = -0.5f; + matrix_data[8] = 0.25f; + matrix_data[9] = 1.0f; + matrix_data[10] = 1.0f; + matrix_data[11] = 1.0f; + matrix_data[12] = 1.0f; + matrix_data[13] = -1.0f; + matrix_data[14] = 1.0f; + matrix_data[15] = 1.0f; + matrix_data[16] = 1.5f; + matrix_data[17] = 2.25f; + matrix_data[18] = 1.0f; + matrix_data[19] = -1.5f; + matrix_data[20] = 2.25f; + matrix_data[21] = 0.0f; + matrix_data[22] = 0.0f; + matrix_data[23] = 1.0f; +} + +void MatrixGT3x8(float *matrix_data) { + matrix_data[0] = 1.0f; + matrix_data[1] = 1.0f; + matrix_data[2] = 1.0f; + matrix_data[3] = 1.0f; + matrix_data[4] = 1.0f; + matrix_data[5] = 1.0f; + matrix_data[6] = 1.0f; + matrix_data[7] = 0.0f; + matrix_data[8] = 0.0f; + matrix_data[9] = 0.5f; + matrix_data[10] = -0.5f; + matrix_data[11] = 1.0f; + matrix_data[12] = -1.0f; + matrix_data[13] = 1.5f; + matrix_data[14] = -1.5f; + matrix_data[15] = 0.0f; + matrix_data[16] = 0.0f; + matrix_data[17] = 0.25f; + matrix_data[18] = 0.25f; + matrix_data[19] = 1.0f; + matrix_data[20] = 1.0f; + matrix_data[21] = 2.25f; + matrix_data[22] = 2.25f; + matrix_data[23] = 1.0f; +} + +void MatrixG8x4(float *matrix_data) { + matrix_data[0] = 1.0f; + matrix_data[1] = 0.0f; + matrix_data[2] = 0.0f; + matrix_data[3] = 0.0f; + matrix_data[4] = 1.0f; + matrix_data[5] = 0.5f; + matrix_data[6] = 0.25f; + matrix_data[7] = 0.125f; + matrix_data[8] = 1.0f; + matrix_data[9] = -0.5f; + matrix_data[10] = 0.25f; + matrix_data[11] = -0.125f; + matrix_data[12] = 1.0f; + matrix_data[13] = 1.0f; + matrix_data[14] = 1.0f; + matrix_data[15] = 1.0f; + matrix_data[16] = 1.0f; + matrix_data[17] = -1.0f; + matrix_data[18] = 1.0f; + matrix_data[19] = -1.0f; + matrix_data[20] = 1.0f; + matrix_data[21] = 1.5f; + matrix_data[22] = 2.25f; + matrix_data[23] = 3.375f; + matrix_data[24] = 1.0f; + matrix_data[25] = -1.5f; + matrix_data[26] = 2.25f; + matrix_data[27] = -3.375f; + matrix_data[28] = 0.0f; + matrix_data[29] = 0.0f; + matrix_data[30] = 0.0f; + matrix_data[31] = 1.0f; +} + +void MatrixGT4x8(float *matrix_data) { + matrix_data[0] = 1.0f; + matrix_data[1] = 1.0f; + matrix_data[2] = 1.0f; + matrix_data[3] = 1.0f; + matrix_data[4] = 1.0f; + matrix_data[5] = 1.0f; + matrix_data[6] = 1.0f; + matrix_data[7] = 0.0f; + matrix_data[8] = 0.0f; + matrix_data[9] = 0.5f; + matrix_data[10] = -0.5f; + matrix_data[11] = 1.0f; + matrix_data[12] = -1.0f; + matrix_data[13] = 1.5f; + matrix_data[14] = -1.5f; + matrix_data[15] = 0.0f; + matrix_data[16] = 0.0f; + matrix_data[17] = 0.25f; + matrix_data[18] = 0.25f; + matrix_data[19] = 1.0f; + matrix_data[20] = 1.0f; + matrix_data[21] = 2.25f; + matrix_data[22] = 2.25f; + matrix_data[23] = 0.0f; + matrix_data[24] = 0.0f; + matrix_data[25] = 0.125f; + matrix_data[26] = -0.125f; + matrix_data[27] = 1.0f; + matrix_data[28] = -1.0f; + matrix_data[29] = 3.375f; + matrix_data[30] = -3.375f; + matrix_data[31] = 1.0f; +} + +void MatrixG8x5(float *matrix_data) { + matrix_data[0] = 1.0f; + matrix_data[1] = 0.0f; + matrix_data[2] = 0.0f; + matrix_data[3] = 0.0f; + matrix_data[4] = 0.0f; + matrix_data[5] = 1.0f; + matrix_data[6] = 0.5f; + matrix_data[7] = 0.25f; + matrix_data[8] = 0.125f; + matrix_data[9] = 0.0625f; + matrix_data[10] = 1.0f; + matrix_data[11] = -0.5f; + matrix_data[12] = 0.25f; + matrix_data[13] = -0.125f; + matrix_data[14] = 0.0625f; + matrix_data[15] = 1.0f; + matrix_data[16] = 1.0f; + matrix_data[17] = 1.0f; + matrix_data[18] = 1.0f; + matrix_data[19] = 1.0f; + matrix_data[20] = 1.0f; + matrix_data[21] = -1.0f; + matrix_data[22] = 1.0f; + matrix_data[23] = -1.0f; + matrix_data[24] = 1.0f; + matrix_data[25] = 1.0f; + matrix_data[26] = 1.5f; + matrix_data[27] = 2.25f; + matrix_data[28] = 3.375f; + matrix_data[29] = 5.0625f; + matrix_data[30] = 1.0f; + matrix_data[31] = -1.5f; + matrix_data[32] = 2.25f; + matrix_data[33] = -3.375f; + matrix_data[34] = 5.0625f; + matrix_data[35] = 0.0f; + matrix_data[36] = 0.0f; + matrix_data[37] = 0.0f; + matrix_data[38] = 0.0f; + matrix_data[39] = 1.0f; +} + +void MatrixGT5x8(float *matrix_data) { + matrix_data[0] = 1.0f; + matrix_data[1] = 1.0f; + matrix_data[2] = 1.0f; + matrix_data[3] = 1.0f; + matrix_data[4] = 1.0f; + matrix_data[5] = 1.0f; + matrix_data[6] = 1.0f; + matrix_data[7] = 0.0f; + matrix_data[8] = 0.0f; + matrix_data[9] = 0.5f; + matrix_data[10] = -0.5f; + matrix_data[11] = 1.0f; + matrix_data[12] = -1.0f; + matrix_data[13] = 1.5f; + matrix_data[14] = -1.5f; + matrix_data[15] = 0.0f; + matrix_data[16] = 0.0f; + matrix_data[17] = 0.25f; + matrix_data[18] = 0.25f; + matrix_data[19] = 1.0f; + matrix_data[20] = 1.0f; + matrix_data[21] = 2.25f; + matrix_data[22] = 2.25f; + matrix_data[23] = 0.0f; + matrix_data[24] = 0.0f; + matrix_data[25] = 0.125f; + matrix_data[26] = -0.125f; + matrix_data[27] = 1.0f; + matrix_data[28] = -1.0f; + matrix_data[29] = 3.375f; + matrix_data[30] = -3.375f; + matrix_data[31] = 0.0f; + matrix_data[32] = 0.0f; + matrix_data[33] = 0.0625f; + matrix_data[34] = 0.0625f; + matrix_data[35] = 1.0f; + matrix_data[36] = 1.0f; + matrix_data[37] = 5.0625f; + matrix_data[38] = 5.0625f; + matrix_data[39] = 1.0f; +} + +void MatrixG8x6(float *matrix_data) { + matrix_data[0] = 1.0f; + matrix_data[1] = 0.0f; + matrix_data[2] = 0.0f; + matrix_data[3] = 0.0f; + matrix_data[4] = 0.0f; + matrix_data[5] = 0.0f; + matrix_data[6] = 1.0f; + matrix_data[7] = 0.5f; + matrix_data[8] = 0.25f; + matrix_data[9] = 0.125f; + matrix_data[10] = 0.0625f; + matrix_data[11] = 0.03125f; + matrix_data[12] = 1.0f; + matrix_data[13] = -0.5f; + matrix_data[14] = 0.25f; + matrix_data[15] = -0.125f; + matrix_data[16] = 0.0625f; + matrix_data[17] = -0.03125f; + matrix_data[18] = 1.0f; + matrix_data[19] = 1.0f; + matrix_data[20] = 1.0f; + matrix_data[21] = 1.0f; + matrix_data[22] = 1.0f; + matrix_data[23] = 1.0f; + matrix_data[24] = 1.0f; + matrix_data[25] = -1.0f; + matrix_data[26] = 1.0f; + matrix_data[27] = -1.0f; + matrix_data[28] = 1.0f; + matrix_data[29] = -1.0f; + matrix_data[30] = 1.0f; + matrix_data[31] = 1.5f; + matrix_data[32] = 2.25f; + matrix_data[33] = 3.375f; + matrix_data[34] = 5.0625f; + matrix_data[35] = 7.59375f; + matrix_data[36] = 1.0f; + matrix_data[37] = -1.5f; + matrix_data[38] = 2.25f; + matrix_data[39] = -3.375f; + matrix_data[40] = 5.0625f; + matrix_data[41] = -7.59375f; + matrix_data[42] = 0.0f; + matrix_data[43] = 0.0f; + matrix_data[44] = 0.0f; + matrix_data[45] = 0.0f; + matrix_data[46] = 0.0f; + matrix_data[47] = 1.0f; +} + +void MatrixGT6x8(float *matrix_data) { + matrix_data[0] = 1.0f; + matrix_data[1] = 1.0f; + matrix_data[2] = 1.0f; + matrix_data[3] = 1.0f; + matrix_data[4] = 1.0f; + matrix_data[5] = 1.0f; + matrix_data[6] = 1.0f; + matrix_data[7] = 0.0f; + matrix_data[8] = 0.0f; + matrix_data[9] = 0.5f; + matrix_data[10] = -0.5f; + matrix_data[11] = 1.0f; + matrix_data[12] = -1.0f; + matrix_data[13] = 1.5f; + matrix_data[14] = -1.5f; + matrix_data[15] = 0.0f; + matrix_data[16] = 0.0f; + matrix_data[17] = 0.25f; + matrix_data[18] = 0.25f; + matrix_data[19] = 1.0f; + matrix_data[20] = 1.0f; + matrix_data[21] = 2.25f; + matrix_data[22] = 2.25f; + matrix_data[23] = 0.0f; + matrix_data[24] = 0.0f; + matrix_data[25] = 0.125f; + matrix_data[26] = -0.125f; + matrix_data[27] = 1.0f; + matrix_data[28] = -1.0f; + matrix_data[29] = 3.375f; + matrix_data[30] = -3.375f; + matrix_data[31] = 0.0f; + matrix_data[32] = 0.0f; + matrix_data[33] = 0.0625f; + matrix_data[34] = 0.0625f; + matrix_data[35] = 1.0f; + matrix_data[36] = 1.0f; + matrix_data[37] = 5.0625f; + matrix_data[38] = 5.0625f; + matrix_data[39] = 0.0f; + matrix_data[40] = 0.0; + matrix_data[41] = 0.03125f; + matrix_data[42] = -0.03125f; + matrix_data[43] = 1.0f; + matrix_data[44] = -1.0f; + matrix_data[45] = 7.59375f; + matrix_data[46] = -7.59375f; + matrix_data[47] = 0.0f; + matrix_data[48] = 1.0f; +} + +void MatrixG8x7(float *matrix_data) { + matrix_data[0] = 1.0f; + matrix_data[1] = 0.0f; + matrix_data[2] = 0.0f; + matrix_data[3] = 0.0f; + matrix_data[4] = 0.0f; + matrix_data[5] = 0.0f; + matrix_data[6] = 0.0f; + matrix_data[7] = 1.0f; + matrix_data[8] = 0.5f; + matrix_data[9] = 0.25f; + matrix_data[10] = 0.125f; + matrix_data[11] = 0.0625f; + matrix_data[12] = 0.03125f; + matrix_data[13] = 0.015625f; + matrix_data[14] = 1.0f; + matrix_data[15] = -0.5f; + matrix_data[16] = 0.25f; + matrix_data[17] = -0.125f; + matrix_data[18] = 0.0625f; + matrix_data[19] = -0.03125f; + matrix_data[20] = 0.015625f; + matrix_data[21] = 1.0f; + matrix_data[22] = 1.0f; + matrix_data[23] = 1.0f; + matrix_data[24] = 1.0f; + matrix_data[25] = 1.0f; + matrix_data[26] = 1.0f; + matrix_data[27] = 1.0f; + matrix_data[28] = 1.0f; + matrix_data[29] = -1.0f; + matrix_data[30] = 1.0f; + matrix_data[31] = -1.0f; + matrix_data[32] = 1.0f; + matrix_data[33] = -1.0f; + matrix_data[34] = 1.0f; + matrix_data[35] = 1.0f; + matrix_data[36] = 1.5f; + matrix_data[37] = 2.25f; + matrix_data[38] = 3.375f; + matrix_data[39] = 5.0625f; + matrix_data[40] = 7.59375f; + matrix_data[41] = 11.390625f; + matrix_data[42] = 1.0f; + matrix_data[43] = -1.5f; + matrix_data[44] = 2.25f; + matrix_data[45] = -3.375f; + matrix_data[46] = 5.0625f; + matrix_data[47] = -7.59375f; + matrix_data[48] = 11.390625f; + matrix_data[49] = 0.0f; + matrix_data[50] = 0.0f; + matrix_data[51] = 0.0f; + matrix_data[52] = 0.0f; + matrix_data[53] = 0.0f; + matrix_data[54] = 0.0f; + matrix_data[55] = 1.0f; +} + +void MatrixGT7x8(float *matrix_data) { + matrix_data[0] = 1.0f; + matrix_data[1] = 1.0f; + matrix_data[2] = 1.0f; + matrix_data[3] = 1.0f; + matrix_data[4] = 1.0f; + matrix_data[5] = 1.0f; + matrix_data[6] = 1.0f; + matrix_data[7] = 0.0f; + matrix_data[8] = 0.0f; + matrix_data[9] = 0.5f; + matrix_data[10] = -0.5f; + matrix_data[11] = 1.0f; + matrix_data[12] = -1.0f; + matrix_data[13] = 1.5f; + matrix_data[14] = -1.5f; + matrix_data[15] = 0.0f; + matrix_data[16] = 0.0f; + matrix_data[17] = 0.25f; + matrix_data[18] = 0.25f; + matrix_data[19] = 1.0f; + matrix_data[20] = 1.0f; + matrix_data[21] = 2.25f; + matrix_data[22] = 2.25f; + matrix_data[23] = 0.0f; + matrix_data[24] = 0.0f; + matrix_data[25] = 0.125f; + matrix_data[26] = -0.125f; + matrix_data[27] = 1.0f; + matrix_data[28] = -1.0f; + matrix_data[29] = 3.375f; + matrix_data[30] = -3.375f; + matrix_data[31] = 0.0f; + matrix_data[32] = 0.0f; + matrix_data[33] = 0.0625f; + matrix_data[34] = 0.0625f; + matrix_data[35] = 1.0f; + matrix_data[36] = 1.0f; + matrix_data[37] = 5.0625f; + matrix_data[38] = 5.0625f; + matrix_data[39] = 0.0f; + matrix_data[40] = 0.0; + matrix_data[41] = 0.03125f; + matrix_data[42] = -0.03125f; + matrix_data[43] = 1.0f; + matrix_data[44] = -1.0f; + matrix_data[45] = 7.59375f; + matrix_data[46] = -7.59375f; + matrix_data[47] = 0.0f; + matrix_data[48] = 0.0f; + matrix_data[49] = 0.015625f; + matrix_data[50] = 0.015625f; + matrix_data[51] = 1.0f; + matrix_data[52] = 1.0f; + matrix_data[53] = 11.390625f; + matrix_data[54] = 11.390625f; + matrix_data[55] = 1.0f; +} diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/matrix_table.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/matrix_table.h index e7a9df2d113..ad99d79f957 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/matrix_table.h +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/matrix_table.h @@ -20,496 +20,33 @@ #ifdef __cplusplus extern "C" { #endif -inline void MatrixG4x2(float *matrix_data) { - matrix_data[0] = 1.0f; - matrix_data[1] = 0.0f; - matrix_data[2] = 1.0f; - matrix_data[3] = 0.5f; - matrix_data[4] = 1.0f; - matrix_data[5] = -0.5f; - matrix_data[6] = 0.0f; - matrix_data[7] = 1.0f; -} +void MatrixG4x2(float *matrix_data); -inline void MatrixGT2x4(float *matrix_data) { - matrix_data[0] = 1.0f; - matrix_data[1] = 1.0f; - matrix_data[2] = 1.0f; - matrix_data[3] = 0.0f; - matrix_data[4] = 0.0f; - matrix_data[5] = 0.5f; - matrix_data[6] = -0.5f; - matrix_data[7] = 1.0f; -} +void MatrixGT2x4(float *matrix_data); -inline void MatrixG8x2(float *matrix_data) { - matrix_data[0] = 1.0f; - matrix_data[1] = 0.0f; - matrix_data[2] = 1.0f; - matrix_data[3] = 0.5f; - matrix_data[4] = 1.0f; - matrix_data[5] = -0.5f; - matrix_data[6] = 1.0f; - matrix_data[7] = 1.0f; - matrix_data[8] = 1.0f; - matrix_data[9] = -1.0f; - matrix_data[10] = 1.0f; - matrix_data[11] = 1.5f; - matrix_data[12] = 1.0f; - matrix_data[13] = -1.5f; - matrix_data[14] = 0.0f; - matrix_data[15] = 1.0f; -} +void MatrixG8x2(float *matrix_data); -inline void MatrixGT2x8(float *matrix_data) { - matrix_data[0] = 1.0f; - matrix_data[1] = 1.0f; - matrix_data[2] = 1.0f; - matrix_data[3] = 1.5f; - matrix_data[4] = 1.0f; - matrix_data[5] = 1.0f; - matrix_data[6] = 1.0f; - matrix_data[7] = 0.0f; - matrix_data[8] = 0.0f; - matrix_data[9] = 0.5f; - matrix_data[10] = -0.5f; - matrix_data[11] = 1.0f; - matrix_data[12] = -1.0f; - matrix_data[13] = 1.5f; - matrix_data[14] = -1.5f; - matrix_data[15] = 1.0f; -} +void MatrixGT2x8(float *matrix_data); -inline void MatrixG8x3(float *matrix_data) { - matrix_data[0] = 1.0f; - matrix_data[1] = 0.0f; - matrix_data[2] = 0.0f; - matrix_data[3] = 1.0f; - matrix_data[4] = 0.5f; - matrix_data[5] = 0.25f; - matrix_data[6] = 1.0f; - matrix_data[7] = -0.5f; - matrix_data[8] = 0.25f; - matrix_data[9] = 1.0f; - matrix_data[10] = 1.0f; - matrix_data[11] = 1.0f; - matrix_data[12] = 1.0f; - matrix_data[13] = -1.0f; - matrix_data[14] = 1.0f; - matrix_data[15] = 1.0f; - matrix_data[16] = 1.5f; - matrix_data[17] = 2.25f; - matrix_data[18] = 1.0f; - matrix_data[19] = -1.5f; - matrix_data[20] = 2.25f; - matrix_data[21] = 0.0f; - matrix_data[22] = 0.0f; - matrix_data[23] = 1.0f; -} +void MatrixG8x3(float *matrix_data); -inline void MatrixGT3x8(float *matrix_data) { - matrix_data[0] = 1.0f; - matrix_data[1] = 1.0f; - matrix_data[2] = 1.0f; - matrix_data[3] = 1.0f; - matrix_data[4] = 1.0f; - matrix_data[5] = 1.0f; - matrix_data[6] = 1.0f; - matrix_data[7] = 0.0f; - matrix_data[8] = 0.0f; - matrix_data[9] = 0.5f; - matrix_data[10] = -0.5f; - matrix_data[11] = 1.0f; - matrix_data[12] = -1.0f; - matrix_data[13] = 1.5f; - matrix_data[14] = -1.5f; - matrix_data[15] = 0.0f; - matrix_data[16] = 0.0f; - matrix_data[17] = 0.25f; - matrix_data[18] = 0.25f; - matrix_data[19] = 1.0f; - matrix_data[20] = 1.0f; - matrix_data[21] = 2.25f; - matrix_data[22] = 2.25f; - matrix_data[23] = 1.0f; -} +void MatrixGT3x8(float *matrix_data); -inline void MatrixG8x4(float *matrix_data) { - matrix_data[0] = 1.0f; - matrix_data[1] = 0.0f; - matrix_data[2] = 0.0f; - matrix_data[3] = 0.0f; - matrix_data[4] = 1.0f; - matrix_data[5] = 0.5f; - matrix_data[6] = 0.25f; - matrix_data[7] = 0.125f; - matrix_data[8] = 1.0f; - matrix_data[9] = -0.5f; - matrix_data[10] = 0.25f; - matrix_data[11] = -0.125f; - matrix_data[12] = 1.0f; - matrix_data[13] = 1.0f; - matrix_data[14] = 1.0f; - matrix_data[15] = 1.0f; - matrix_data[16] = 1.0f; - matrix_data[17] = -1.0f; - matrix_data[18] = 1.0f; - matrix_data[19] = -1.0f; - matrix_data[20] = 1.0f; - matrix_data[21] = 1.5f; - matrix_data[22] = 2.25f; - matrix_data[23] = 3.375f; - matrix_data[24] = 1.0f; - matrix_data[25] = -1.5f; - matrix_data[26] = 2.25f; - matrix_data[27] = -3.375f; - matrix_data[28] = 0.0f; - matrix_data[29] = 0.0f; - matrix_data[30] = 0.0f; - matrix_data[31] = 1.0f; -} +void MatrixG8x4(float *matrix_data); -inline void MatrixGT4x8(float *matrix_data) { - matrix_data[0] = 1.0f; - matrix_data[1] = 1.0f; - matrix_data[2] = 1.0f; - matrix_data[3] = 1.0f; - matrix_data[4] = 1.0f; - matrix_data[5] = 1.0f; - matrix_data[6] = 1.0f; - matrix_data[7] = 0.0f; - matrix_data[8] = 0.0f; - matrix_data[9] = 0.5f; - matrix_data[10] = -0.5f; - matrix_data[11] = 1.0f; - matrix_data[12] = -1.0f; - matrix_data[13] = 1.5f; - matrix_data[14] = -1.5f; - matrix_data[15] = 0.0f; - matrix_data[16] = 0.0f; - matrix_data[17] = 0.25f; - matrix_data[18] = 0.25f; - matrix_data[19] = 1.0f; - matrix_data[20] = 1.0f; - matrix_data[21] = 2.25f; - matrix_data[22] = 2.25f; - matrix_data[23] = 0.0f; - matrix_data[24] = 0.0f; - matrix_data[25] = 0.125f; - matrix_data[26] = -0.125f; - matrix_data[27] = 1.0f; - matrix_data[28] = -1.0f; - matrix_data[29] = 3.375f; - matrix_data[30] = -3.375f; - matrix_data[31] = 1.0f; -} +void MatrixGT4x8(float *matrix_data); -inline void MatrixG8x5(float *matrix_data) { - matrix_data[0] = 1.0f; - matrix_data[1] = 0.0f; - matrix_data[2] = 0.0f; - matrix_data[3] = 0.0f; - matrix_data[4] = 0.0f; - matrix_data[5] = 1.0f; - matrix_data[6] = 0.5f; - matrix_data[7] = 0.25f; - matrix_data[8] = 0.125f; - matrix_data[9] = 0.0625f; - matrix_data[10] = 1.0f; - matrix_data[11] = -0.5f; - matrix_data[12] = 0.25f; - matrix_data[13] = -0.125f; - matrix_data[14] = 0.0625f; - matrix_data[15] = 1.0f; - matrix_data[16] = 1.0f; - matrix_data[17] = 1.0f; - matrix_data[18] = 1.0f; - matrix_data[19] = 1.0f; - matrix_data[20] = 1.0f; - matrix_data[21] = -1.0f; - matrix_data[22] = 1.0f; - matrix_data[23] = -1.0f; - matrix_data[24] = 1.0f; - matrix_data[25] = 1.0f; - matrix_data[26] = 1.5f; - matrix_data[27] = 2.25f; - matrix_data[28] = 3.375f; - matrix_data[29] = 5.0625f; - matrix_data[30] = 1.0f; - matrix_data[31] = -1.5f; - matrix_data[32] = 2.25f; - matrix_data[33] = -3.375f; - matrix_data[34] = 5.0625f; - matrix_data[35] = 0.0f; - matrix_data[36] = 0.0f; - matrix_data[37] = 0.0f; - matrix_data[38] = 0.0f; - matrix_data[39] = 1.0f; -} +void MatrixG8x5(float *matrix_data); -inline void MatrixGT5x8(float *matrix_data) { - matrix_data[0] = 1.0f; - matrix_data[1] = 1.0f; - matrix_data[2] = 1.0f; - matrix_data[3] = 1.0f; - matrix_data[4] = 1.0f; - matrix_data[5] = 1.0f; - matrix_data[6] = 1.0f; - matrix_data[7] = 0.0f; - matrix_data[8] = 0.0f; - matrix_data[9] = 0.5f; - matrix_data[10] = -0.5f; - matrix_data[11] = 1.0f; - matrix_data[12] = -1.0f; - matrix_data[13] = 1.5f; - matrix_data[14] = -1.5f; - matrix_data[15] = 0.0f; - matrix_data[16] = 0.0f; - matrix_data[17] = 0.25f; - matrix_data[18] = 0.25f; - matrix_data[19] = 1.0f; - matrix_data[20] = 1.0f; - matrix_data[21] = 2.25f; - matrix_data[22] = 2.25f; - matrix_data[23] = 0.0f; - matrix_data[24] = 0.0f; - matrix_data[25] = 0.125f; - matrix_data[26] = -0.125f; - matrix_data[27] = 1.0f; - matrix_data[28] = -1.0f; - matrix_data[29] = 3.375f; - matrix_data[30] = -3.375f; - matrix_data[31] = 0.0f; - matrix_data[32] = 0.0f; - matrix_data[33] = 0.0625f; - matrix_data[34] = 0.0625f; - matrix_data[35] = 1.0f; - matrix_data[36] = 1.0f; - matrix_data[37] = 5.0625f; - matrix_data[38] = 5.0625f; - matrix_data[39] = 1.0f; -} +void MatrixGT5x8(float *matrix_data); -inline void MatrixG8x6(float *matrix_data) { - matrix_data[0] = 1.0f; - matrix_data[1] = 0.0f; - matrix_data[2] = 0.0f; - matrix_data[3] = 0.0f; - matrix_data[4] = 0.0f; - matrix_data[5] = 0.0f; - matrix_data[6] = 1.0f; - matrix_data[7] = 0.5f; - matrix_data[8] = 0.25f; - matrix_data[9] = 0.125f; - matrix_data[10] = 0.0625f; - matrix_data[11] = 0.03125f; - matrix_data[12] = 1.0f; - matrix_data[13] = -0.5f; - matrix_data[14] = 0.25f; - matrix_data[15] = -0.125f; - matrix_data[16] = 0.0625f; - matrix_data[17] = -0.03125f; - matrix_data[18] = 1.0f; - matrix_data[19] = 1.0f; - matrix_data[20] = 1.0f; - matrix_data[21] = 1.0f; - matrix_data[22] = 1.0f; - matrix_data[23] = 1.0f; - matrix_data[24] = 1.0f; - matrix_data[25] = -1.0f; - matrix_data[26] = 1.0f; - matrix_data[27] = -1.0f; - matrix_data[28] = 1.0f; - matrix_data[29] = -1.0f; - matrix_data[30] = 1.0f; - matrix_data[31] = 1.5f; - matrix_data[32] = 2.25f; - matrix_data[33] = 3.375f; - matrix_data[34] = 5.0625f; - matrix_data[35] = 7.59375f; - matrix_data[36] = 1.0f; - matrix_data[37] = -1.5f; - matrix_data[38] = 2.25f; - matrix_data[39] = -3.375f; - matrix_data[40] = 5.0625f; - matrix_data[41] = -7.59375f; - matrix_data[42] = 0.0f; - matrix_data[43] = 0.0f; - matrix_data[44] = 0.0f; - matrix_data[45] = 0.0f; - matrix_data[46] = 0.0f; - matrix_data[47] = 1.0f; -} +void MatrixG8x6(float *matrix_data); -inline void MatrixGT6x8(float *matrix_data) { - matrix_data[0] = 1.0f; - matrix_data[1] = 1.0f; - matrix_data[2] = 1.0f; - matrix_data[3] = 1.0f; - matrix_data[4] = 1.0f; - matrix_data[5] = 1.0f; - matrix_data[6] = 1.0f; - matrix_data[7] = 0.0f; - matrix_data[8] = 0.0f; - matrix_data[9] = 0.5f; - matrix_data[10] = -0.5f; - matrix_data[11] = 1.0f; - matrix_data[12] = -1.0f; - matrix_data[13] = 1.5f; - matrix_data[14] = -1.5f; - matrix_data[15] = 0.0f; - matrix_data[16] = 0.0f; - matrix_data[17] = 0.25f; - matrix_data[18] = 0.25f; - matrix_data[19] = 1.0f; - matrix_data[20] = 1.0f; - matrix_data[21] = 2.25f; - matrix_data[22] = 2.25f; - matrix_data[23] = 0.0f; - matrix_data[24] = 0.0f; - matrix_data[25] = 0.125f; - matrix_data[26] = -0.125f; - matrix_data[27] = 1.0f; - matrix_data[28] = -1.0f; - matrix_data[29] = 3.375f; - matrix_data[30] = -3.375f; - matrix_data[31] = 0.0f; - matrix_data[32] = 0.0f; - matrix_data[33] = 0.0625f; - matrix_data[34] = 0.0625f; - matrix_data[35] = 1.0f; - matrix_data[36] = 1.0f; - matrix_data[37] = 5.0625f; - matrix_data[38] = 5.0625f; - matrix_data[39] = 0.0f; - matrix_data[40] = 0.0; - matrix_data[41] = 0.03125f; - matrix_data[42] = -0.03125f; - matrix_data[43] = 1.0f; - matrix_data[44] = -1.0f; - matrix_data[45] = 7.59375f; - matrix_data[46] = -7.59375f; - matrix_data[47] = 0.0f; - matrix_data[48] = 1.0f; -} +void MatrixGT6x8(float *matrix_data); -inline void MatrixG8x7(float *matrix_data) { - matrix_data[0] = 1.0f; - matrix_data[1] = 0.0f; - matrix_data[2] = 0.0f; - matrix_data[3] = 0.0f; - matrix_data[4] = 0.0f; - matrix_data[5] = 0.0f; - matrix_data[6] = 0.0f; - matrix_data[7] = 1.0f; - matrix_data[8] = 0.5f; - matrix_data[9] = 0.25f; - matrix_data[10] = 0.125f; - matrix_data[11] = 0.0625f; - matrix_data[12] = 0.03125f; - matrix_data[13] = 0.015625f; - matrix_data[14] = 1.0f; - matrix_data[15] = -0.5f; - matrix_data[16] = 0.25f; - matrix_data[17] = -0.125f; - matrix_data[18] = 0.0625f; - matrix_data[19] = -0.03125f; - matrix_data[20] = 0.015625f; - matrix_data[21] = 1.0f; - matrix_data[22] = 1.0f; - matrix_data[23] = 1.0f; - matrix_data[24] = 1.0f; - matrix_data[25] = 1.0f; - matrix_data[26] = 1.0f; - matrix_data[27] = 1.0f; - matrix_data[28] = 1.0f; - matrix_data[29] = -1.0f; - matrix_data[30] = 1.0f; - matrix_data[31] = -1.0f; - matrix_data[32] = 1.0f; - matrix_data[33] = -1.0f; - matrix_data[34] = 1.0f; - matrix_data[35] = 1.0f; - matrix_data[36] = 1.5f; - matrix_data[37] = 2.25f; - matrix_data[38] = 3.375f; - matrix_data[39] = 5.0625f; - matrix_data[40] = 7.59375f; - matrix_data[41] = 11.390625f; - matrix_data[42] = 1.0f; - matrix_data[43] = -1.5f; - matrix_data[44] = 2.25f; - matrix_data[45] = -3.375f; - matrix_data[46] = 5.0625f; - matrix_data[47] = -7.59375f; - matrix_data[48] = 11.390625f; - matrix_data[49] = 0.0f; - matrix_data[50] = 0.0f; - matrix_data[51] = 0.0f; - matrix_data[52] = 0.0f; - matrix_data[53] = 0.0f; - matrix_data[54] = 0.0f; - matrix_data[55] = 1.0f; -} +void MatrixG8x7(float *matrix_data); -inline void MatrixGT7x8(float *matrix_data) { - matrix_data[0] = 1.0f; - matrix_data[1] = 1.0f; - matrix_data[2] = 1.0f; - matrix_data[3] = 1.0f; - matrix_data[4] = 1.0f; - matrix_data[5] = 1.0f; - matrix_data[6] = 1.0f; - matrix_data[7] = 0.0f; - matrix_data[8] = 0.0f; - matrix_data[9] = 0.5f; - matrix_data[10] = -0.5f; - matrix_data[11] = 1.0f; - matrix_data[12] = -1.0f; - matrix_data[13] = 1.5f; - matrix_data[14] = -1.5f; - matrix_data[15] = 0.0f; - matrix_data[16] = 0.0f; - matrix_data[17] = 0.25f; - matrix_data[18] = 0.25f; - matrix_data[19] = 1.0f; - matrix_data[20] = 1.0f; - matrix_data[21] = 2.25f; - matrix_data[22] = 2.25f; - matrix_data[23] = 0.0f; - matrix_data[24] = 0.0f; - matrix_data[25] = 0.125f; - matrix_data[26] = -0.125f; - matrix_data[27] = 1.0f; - matrix_data[28] = -1.0f; - matrix_data[29] = 3.375f; - matrix_data[30] = -3.375f; - matrix_data[31] = 0.0f; - matrix_data[32] = 0.0f; - matrix_data[33] = 0.0625f; - matrix_data[34] = 0.0625f; - matrix_data[35] = 1.0f; - matrix_data[36] = 1.0f; - matrix_data[37] = 5.0625f; - matrix_data[38] = 5.0625f; - matrix_data[39] = 0.0f; - matrix_data[40] = 0.0; - matrix_data[41] = 0.03125f; - matrix_data[42] = -0.03125f; - matrix_data[43] = 1.0f; - matrix_data[44] = -1.0f; - matrix_data[45] = 7.59375f; - matrix_data[46] = -7.59375f; - matrix_data[47] = 0.0f; - matrix_data[48] = 0.0f; - matrix_data[49] = 0.015625f; - matrix_data[50] = 0.015625f; - matrix_data[51] = 1.0f; - matrix_data[52] = 1.0f; - matrix_data[53] = 11.390625f; - matrix_data[54] = 11.390625f; - matrix_data[55] = 1.0f; -} +void MatrixGT7x8(float *matrix_data); #ifdef __cplusplus } #endif diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/quantization/fixed_point.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/quantization/fixed_point.c new file mode 100644 index 00000000000..1f0d156156d --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/quantization/fixed_point.c @@ -0,0 +1,171 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "nnacl/quantization/fixed_point.h" + +// returns the high-32 bits of a * b with rounding +// assume that a and b is divided by 2^31, who fall into [-1, 1] +// so the mantissa of a * b is (a / 2^31) * (b / 2^31) * 2^31= (a * b) / 2^31 +// actually we compute 2 * a * b / 2^32 +// and take 32 bits of mantissa for rounding +int SaturatingRoundingDoublingHighMul(int a, int b) { + if (a == INT_MIN && b == INT_MIN) { + return INT_MAX; + } + int64_t ab = ((int64_t)a) * ((int64_t)b); + int64_t rounding = ab >= 0 ? (1ll << 30) : (1ll - (1ll << 30)); + // do not apply right shift to potential negetive values + int ab_mantissa = (int)((ab + rounding) / (1ll << 31)); + return ab_mantissa; +} + +int16_t SaturatingRoundingDoublingHighMulInt16(int16_t a, int16_t b) { + if (a == SHRT_MIN && b == SHRT_MIN) { + return SHRT_MAX; + } + int32_t ab = ((int32_t)a) * ((int32_t)b); + int16_t rounding = ab >= 0 ? (1ll << 14) : (1ll - (1ll << 14)); + return (int16_t)((ab + rounding) / (1ll << 15)); +} + +// division by a 2^exponent with rounding +// or arithmetic right shift with rouding +int RoundingDivideByPOT(int x, int exponent) { + const int mask = (1ll << exponent) - 1; + const int remainder = x & mask; + const int threshold = (mask >> 1) + (x < 0 ? 1 : 0); + return (x >> exponent) + (remainder > threshold ? 1 : 0); +} + +int MultiplyByQuantizedMultiplier(int32_t value, int32_t multiplier, int32_t left_shift, int32_t right_shift) { + return RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(value * (1 << left_shift), multiplier), -right_shift); +} + +int FractionsBits(int kIntegerBits) { + int totalBits = 8 * sizeof(int32_t) - 1; + return totalBits - kIntegerBits; +} + +int FixedPoint_One(int kIntegerBits, int kFractionsBits) { + return (kIntegerBits == 0 ? INT32_MAX : ((1) << (uint32_t)(kIntegerBits == 0 ? 0 : kFractionsBits))); +} + +int RoundingHalfSum(int a, int b) { + int64_t a64 = a; + int64_t b64 = b; + int64_t sum = a64 + b64; + int64_t sign = sum > 0 ? 1 : -1; + return (int32_t)((sum + sign) / 2); +} + +int32_t BitAnd(int32_t a, int32_t b) { return (uint32_t)a & (uint32_t)b; } + +int32_t BitOr(int32_t a, int32_t b) { return (uint32_t)a | (uint32_t)b; } + +int32_t BitXor(int32_t a, int32_t b) { return (uint32_t)a ^ (uint32_t)b; } + +int32_t BitNot(int32_t a) { return ~(uint32_t)a; } + +int SelectUsingMask(int mask, int bound, int val) { return BitXor(BitAnd(mask, bound), BitAnd(BitNot(mask), val)); } + +int32_t MaskNonZero(int32_t a) { + int32_t zreo = 0; + return a ? BitNot(zreo) : zreo; +} + +int SaturatingRoundingMultiplyByPOT(int32_t x, int Exponent) { + int ExponentSign = (Exponent > 0 ? 1 : Exponent < 0 ? -1 : 0); + if (ExponentSign == 0) { + return x; + } else if (ExponentSign == 1) { + const int min = INT32_MIN; + const int max = INT32_MAX; + const int thresold = ((1 << (uint32_t)(31 - Exponent)) - 1); + const int postive_mask = MaskNonZero(x > thresold); + const int negative_mask = MaskNonZero(x < -thresold); + int result = x << Exponent; + result = SelectUsingMask(postive_mask, max, result); + result = SelectUsingMask(negative_mask, min, result); + return result; + } else if (ExponentSign == -1) { + return RoundingDivideByPOT(x, -Exponent); + } else { + return 0; + } +} + +int32_t Rescale(int x, int kIntegerBitsSrc, int kIntegerBitsDst) { + int kExponent = kIntegerBitsSrc - kIntegerBitsDst; + int result = SaturatingRoundingMultiplyByPOT(x, kExponent); + return result; +} + +static int32_t one_over_one_plus_x_for_x_in_0_1(int32_t a) { + int one = FixedPoint_One(0, FractionsBits(0)); + int half_denominator = RoundingHalfSum(a, one); + const int constant_48_over_17 = 1515870810; + const int constant_neg_32_over_17 = -1010580540; + int x = constant_48_over_17 + SaturatingRoundingDoublingHighMul(half_denominator, constant_neg_32_over_17); + for (int i = 0; i < 3; i++) { + int half_denominator_times_x = SaturatingRoundingDoublingHighMul(half_denominator, x); + int one_minus_half_denominator_times_x = FixedPoint_One(2, FractionsBits(2)) - half_denominator_times_x; + x = x + Rescale(SaturatingRoundingDoublingHighMul(x, one_minus_half_denominator_times_x), 2 + 2, 2); + } + return Rescale(x, 2 - 1, 0); +} + +int CountLeadingZeroBits(uint32_t x) { +#if defined(__GUNC__) + return x ? __builtin_clz(x) : 8 * sizeof(uint32_t); +#else + if (x == 0) { + return 8 * sizeof(uint32_t); + } + const int32_t leading_positive = (int32_t)(1) << (8 * sizeof(uint32_t) - 1); + int leading_zeros = 0; + while (x < leading_positive) { + x <<= 1; + leading_zeros++; + } + return leading_zeros; +#endif +} + +int CountLeadingSignBits(int32_t x) { +#if defined(__GUNC__) && !defined(__clang__) + return x ? __builtin_clrsb(x) : 8 * sizeof(int32_t); +#else + return x >= 0 ? CountLeadingZeroBits((uint32_t)x) - 1 : x != INT32_MIN ? CountLeadingZeroBits(2 * (uint32_t)(-x)) : 0; +#endif +} + +int32_t ComputerReciproal(int32_t x, int x_digits, int *recip_shift) { + int leading_zreos_plus_one = CountLeadingZeroBits((uint32_t)x); + *recip_shift = x_digits - leading_zreos_plus_one; + const int32_t shifted_minus_one = (int32_t)(((uint32_t)x << leading_zreos_plus_one) - ((uint32_t)(1) << 31)); + const int32_t shifted_scaled = one_over_one_plus_x_for_x_in_0_1(shifted_minus_one); + return shifted_scaled; +} +#ifdef ENABLE_NEON +int32x4_t RoundingDivideByPOTInt32x4(int32x4_t x, int exponent) { + const int32x4_t shift_vec = vdupq_n_s32(-exponent); + const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift_vec), 31); + const int32x4_t fixed_up_x = vqaddq_s32(x, fixup); + return vrshlq_s32(fixed_up_x, shift_vec); +} + +int32x4_t SaturatingRoundingDoublingHighMulInt32x4(int32x4_t a, int32x4_t b) { return vqrdmulhq_s32(a, b); } +#endif diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/quantization/fixed_point.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/quantization/fixed_point.h index 67b453fd3c7..3cd47a3f2b7 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/quantization/fixed_point.h +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/quantization/fixed_point.h @@ -17,6 +17,7 @@ #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_QUANTIZATION_FIXED_POINT_H_ #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_QUANTIZATION_FIXED_POINT_H_ +#include #include #ifdef ENABLE_NEON #include @@ -31,160 +32,54 @@ extern "C" { // so the mantissa of a * b is (a / 2^31) * (b / 2^31) * 2^31= (a * b) / 2^31 // actually we compute 2 * a * b / 2^32 // and take 32 bits of mantissa for rounding -inline int SaturatingRoundingDoublingHighMul(int a, int b) { - if (a == INT_MIN && b == INT_MIN) { - return INT_MAX; - } - int64_t ab = ((int64_t)a) * ((int64_t)b); - int64_t rounding = ab >= 0 ? (1ll << 30) : (1ll - (1ll << 30)); - // do not apply right shift to potential negetive values - int ab_mantissa = (int)((ab + rounding) / (1ll << 31)); - return ab_mantissa; -} +int SaturatingRoundingDoublingHighMul(int a, int b); -inline int16_t SaturatingRoundingDoublingHighMulInt16(int16_t a, int16_t b) { - if (a == SHRT_MIN && b == SHRT_MIN) { - return SHRT_MAX; - } - int32_t ab = ((int32_t)a) * ((int32_t)b); - int16_t rounding = ab >= 0 ? (1ll << 14) : (1ll - (1ll << 14)); - return (int16_t)((ab + rounding) / (1ll << 15)); -} +int16_t SaturatingRoundingDoublingHighMulInt16(int16_t a, int16_t b); // division by a 2^exponent with rounding // or arithmetic right shift with rouding -inline int RoundingDivideByPOT(int x, int exponent) { - const int mask = (1ll << exponent) - 1; - const int remainder = x & mask; - const int threshold = (mask >> 1) + (x < 0 ? 1 : 0); - return (x >> exponent) + (remainder > threshold ? 1 : 0); -} +int RoundingDivideByPOT(int x, int exponent); -inline int MultiplyByQuantizedMultiplier(int32_t value, int32_t multiplier, int32_t left_shift, int32_t right_shift) { - return RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(value * (1 << left_shift), multiplier), -right_shift); -} +int MultiplyByQuantizedMultiplier(int32_t value, int32_t multiplier, int32_t left_shift, int32_t right_shift); -inline int FractionsBits(int kIntegerBits) { - int totalBits = 8 * sizeof(int32_t) - 1; - return totalBits - kIntegerBits; -} +int FractionsBits(int kIntegerBits); -inline int FixedPoint_One(int kIntegerBits, int kFractionsBits) { - return (kIntegerBits == 0 ? INT32_MAX : ((1) << (uint32_t)(kIntegerBits == 0 ? 0 : kFractionsBits))); -} +int FixedPoint_One(int kIntegerBits, int kFractionsBits); -inline int RoundingHalfSum(int a, int b) { - int64_t a64 = a; - int64_t b64 = b; - int64_t sum = a64 + b64; - int64_t sign = sum > 0 ? 1 : -1; - return (int32_t)((sum + sign) / 2); -} +int RoundingHalfSum(int a, int b); -inline int32_t BitAnd(int32_t a, int32_t b) { return (uint32_t)a & (uint32_t)b; } +int32_t BitAnd(int32_t a, int32_t b); -inline int32_t BitOr(int32_t a, int32_t b) { return (uint32_t)a | (uint32_t)b; } +int32_t BitOr(int32_t a, int32_t b); -inline int32_t BitXor(int32_t a, int32_t b) { return (uint32_t)a ^ (uint32_t)b; } +int32_t BitXor(int32_t a, int32_t b); -inline int32_t BitNot(int32_t a) { return ~(uint32_t)a; } +int32_t BitNot(int32_t a); -inline int SelectUsingMask(int mask, int bound, int val) { - return BitXor(BitAnd(mask, bound), BitAnd(BitNot(mask), val)); -} +int SelectUsingMask(int mask, int bound, int val); -inline int32_t MaskNonZero(int32_t a) { - int32_t zreo = 0; - return a ? BitNot(zreo) : zreo; -} +int32_t MaskNonZero(int32_t a); -inline int SaturatingRoundingMultiplyByPOT(int32_t x, int Exponent) { - int ExponentSign = (Exponent > 0 ? 1 : Exponent < 0 ? -1 : 0); - if (ExponentSign == 0) { - return x; - } else if (ExponentSign == 1) { - const int min = INT32_MIN; - const int max = INT32_MAX; - const int thresold = ((1 << (uint32_t)(31 - Exponent)) - 1); - const int postive_mask = MaskNonZero(x > thresold); - const int negative_mask = MaskNonZero(x < -thresold); - int result = x << Exponent; - result = SelectUsingMask(postive_mask, max, result); - result = SelectUsingMask(negative_mask, min, result); - return result; - } else if (ExponentSign == -1) { - return RoundingDivideByPOT(x, -Exponent); - } else { - return 0; - } -} +int SaturatingRoundingMultiplyByPOT(int32_t x, int Exponent); -inline int32_t Rescale(int x, int kIntegerBitsSrc, int kIntegerBitsDst) { - int kExponent = kIntegerBitsSrc - kIntegerBitsDst; - int result = SaturatingRoundingMultiplyByPOT(x, kExponent); - return result; -} +int32_t Rescale(int x, int kIntegerBitsSrc, int kIntegerBitsDst); -static inline int32_t one_over_one_plus_x_for_x_in_0_1(int32_t a) { - int one = FixedPoint_One(0, FractionsBits(0)); - int half_denominator = RoundingHalfSum(a, one); - const int constant_48_over_17 = 1515870810; - const int constant_neg_32_over_17 = -1010580540; - int x = constant_48_over_17 + SaturatingRoundingDoublingHighMul(half_denominator, constant_neg_32_over_17); - for (int i = 0; i < 3; i++) { - int half_denominator_times_x = SaturatingRoundingDoublingHighMul(half_denominator, x); - int one_minus_half_denominator_times_x = FixedPoint_One(2, FractionsBits(2)) - half_denominator_times_x; - x = x + Rescale(SaturatingRoundingDoublingHighMul(x, one_minus_half_denominator_times_x), 2 + 2, 2); - } - return Rescale(x, 2 - 1, 0); -} +static int32_t one_over_one_plus_x_for_x_in_0_1(int32_t a); -inline int CountLeadingZeroBits(uint32_t x) { -#if defined(__GUNC__) - return x ? __builtin_clz(x) : 8 * sizeof(uint32_t); -#else - if (x == 0) { - return 8 * sizeof(uint32_t); - } - const int32_t leading_positive = (int32_t)(1) << (8 * sizeof(uint32_t) - 1); - int leading_zeros = 0; - while (x < leading_positive) { - x <<= 1; - leading_zeros++; - } - return leading_zeros; -#endif -} +int CountLeadingZeroBits(uint32_t x); -inline int CountLeadingSignBits(int32_t x) { -#if defined(__GUNC__) && !defined(__clang__) - return x ? __builtin_clrsb(x) : 8 * sizeof(int32_t); -#else - return x >= 0 ? CountLeadingZeroBits((uint32_t)x) - 1 : x != INT32_MIN ? CountLeadingZeroBits(2 * (uint32_t)(-x)) : 0; -#endif -} +int CountLeadingSignBits(int32_t x); -static inline int32_t ComputerReciproal(int32_t x, int x_digits, int *recip_shift) { - int leading_zreos_plus_one = CountLeadingZeroBits((uint32_t)x); - *recip_shift = x_digits - leading_zreos_plus_one; - const int32_t shifted_minus_one = (int32_t)(((uint32_t)x << leading_zreos_plus_one) - ((uint32_t)(1) << 31)); - const int32_t shifted_scaled = one_over_one_plus_x_for_x_in_0_1(shifted_minus_one); - return shifted_scaled; -} +int32_t ComputerReciproal(int32_t x, int x_digits, int *recip_shift); #ifdef __cplusplus } #endif #ifdef ENABLE_NEON -inline int32x4_t RoundingDivideByPOTInt32x4(int32x4_t x, int exponent) { - const int32x4_t shift_vec = vdupq_n_s32(-exponent); - const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift_vec), 31); - const int32x4_t fixed_up_x = vqaddq_s32(x, fixup); - return vrshlq_s32(fixed_up_x, shift_vec); -} +int32x4_t RoundingDivideByPOTInt32x4(int32x4_t x, int exponent); -inline int32x4_t SaturatingRoundingDoublingHighMulInt32x4(int32x4_t a, int32x4_t b) { return vqrdmulhq_s32(a, b); } +int32x4_t SaturatingRoundingDoublingHighMulInt32x4(int32x4_t a, int32x4_t b); #endif #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_QUANTIZATION_FIXED_POINT_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/quantization/quantize.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/quantization/quantize.c index 19375cb91f5..3602d40c614 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/quantization/quantize.c +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/quantization/quantize.c @@ -26,6 +26,70 @@ const double dNormalizer = 0x1p54; const int dNormalizerBias = 54; const int iMantissaBits = 31; + +void QuantizeMultiplierSmallerThanOne(double double_multiplier, int32_t *quantized_multiplier, + int *right_shift) { + if (quantized_multiplier == NULL || right_shift == NULL) { + return; + } + int shift; + QuantizeMultiplier(double_multiplier, quantized_multiplier, &shift); + *right_shift = -shift; +} + +void QuantizeRoundParameter(double double_multiplier, int32_t *quantized_multiplier, int *left_shift, + int *right_shift) { + int shift; + QuantizeMultiplierSmallerThanOne(double_multiplier, quantized_multiplier, &shift); + shift = -shift; + if (shift < 0) { + *left_shift = 0; + *right_shift = shift; + } else { + *left_shift = shift; + *right_shift = 0; + } +} + +uint8_t QuantizeToUint8(float real_value, float scale, int32_t zp) { return round(real_value / scale + zp); } + +int32_t QuantizeToInt8(float real_value, float scale, int32_t zp) { return round(real_value / scale + zp); } + +void CalculateActivationRangeQuantized(bool is_relu, bool is_relu6, int32_t zp, float scale, int *mini, + int *maxi) { + int32_t min = CHAR_MIN; + int32_t max = CHAR_MAX; + int32_t quantized_zero = QuantizeToInt8(0, scale, zp); + int32_t quantized_six = QuantizeToInt8(6, scale, zp); + if (is_relu) { + min = min > quantized_zero ? min : quantized_zero; + } else if (is_relu6) { + min = min > quantized_zero ? min : quantized_zero; + max = max < quantized_six ? max : quantized_six; + } else { + // do nothing + } + *mini = min; + *maxi = max; +} + +// quantize from float to int8 +void Quantize(float *input_data, int length, float scale, int zero_point, int8_t *output_data) { + for (int i = 0; i < length; ++i) { + int q = (int)round(input_data[i] / scale + zero_point); + q = q > CHAR_MAX ? CHAR_MAX : q; + q = q < CHAR_MIN ? CHAR_MIN : q; + output_data[i] = (int8_t)q; + } +} + +// dequantize from int8 to float +void Dequantize(int8_t *input_data, int length, float scale, int zero_point, float *output_data) { + for (int i = 0; i < length; ++i) { + output_data[i] = scale * (input_data[i] - zero_point); + } +} + void QuantizeMultiplier(double double_multiplier, int32_t *quantized_multiplier, int *shift) { if (quantized_multiplier == NULL || shift == NULL) { return; diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/quantization/quantize.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/quantization/quantize.h index f6a561a8db2..b4ac17b4cf9 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/quantization/quantize.h +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/quantization/quantize.h @@ -213,68 +213,20 @@ extern "C" { void QuantizeMultiplier(double double_multiplier, int32_t *quantized_multiplier, int *shift); -inline void QuantizeMultiplierSmallerThanOne(double double_multiplier, int32_t *quantized_multiplier, - int *right_shift) { - if (quantized_multiplier == NULL || right_shift == NULL) { - return; - } - int shift; - QuantizeMultiplier(double_multiplier, quantized_multiplier, &shift); - *right_shift = -shift; -} +void QuantizeMultiplierSmallerThanOne(double double_multiplier, int32_t *quantized_multiplier, int *right_shift); -inline void QuantizeRoundParameter(double double_multiplier, int32_t *quantized_multiplier, int *left_shift, - int *right_shift) { - int shift; - QuantizeMultiplierSmallerThanOne(double_multiplier, quantized_multiplier, &shift); - shift = -shift; - if (shift < 0) { - *left_shift = 0; - *right_shift = shift; - } else { - *left_shift = shift; - *right_shift = 0; - } -} +void QuantizeRoundParameter(double double_multiplier, int32_t *quantized_multiplier, int *left_shift, int *right_shift); -inline uint8_t QuantizeToUint8(float real_value, float scale, int32_t zp) { return round(real_value / scale + zp); } +uint8_t QuantizeToUint8(float real_value, float scale, int32_t zp); -inline int32_t QuantizeToInt8(float real_value, float scale, int32_t zp) { return round(real_value / scale + zp); } - -inline void CalculateActivationRangeQuantized(bool is_relu, bool is_relu6, int32_t zp, float scale, int *mini, - int *maxi) { - int32_t min = CHAR_MIN; - int32_t max = CHAR_MAX; - int32_t quantized_zero = QuantizeToInt8(0, scale, zp); - int32_t quantized_six = QuantizeToInt8(6, scale, zp); - if (is_relu) { - min = min > quantized_zero ? min : quantized_zero; - } else if (is_relu6) { - min = min > quantized_zero ? min : quantized_zero; - max = max < quantized_six ? max : quantized_six; - } else { - // do nothing - } - *mini = min; - *maxi = max; -} +int32_t QuantizeToInt8(float real_value, float scale, int32_t zp); +void CalculateActivationRangeQuantized(bool is_relu, bool is_relu6, int32_t zp, float scale, int *mini, int *maxi); // quantize from float to int8 -inline void Quantize(float *input_data, int length, float scale, int zero_point, int8_t *output_data) { - for (int i = 0; i < length; ++i) { - int q = (int)round(input_data[i] / scale + zero_point); - q = q > CHAR_MAX ? CHAR_MAX : q; - q = q < CHAR_MIN ? CHAR_MIN : q; - output_data[i] = (int8_t)q; - } -} +void Quantize(float *input_data, int length, float scale, int zero_point, int8_t *output_data); // dequantize from int8 to float -inline void Dequantize(int8_t *input_data, int length, float scale, int zero_point, float *output_data) { - for (int i = 0; i < length; ++i) { - output_data[i] = scale * (input_data[i] - zero_point); - } -} +void Dequantize(int8_t *input_data, int length, float scale, int zero_point, float *output_data); #ifdef __cplusplus } diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/activation_fp32_test.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/activation_fp32_test.cc index 19c0257b9b1..9a9f84d31f8 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/activation_fp32_test.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/activation_fp32_test.cc @@ -30,7 +30,7 @@ class TestActivationFp32 : public mindspore::CommonTest { TEST_F(TestActivationFp32, ReluFp32) { float input[8] = {-3, -2, -1, 0, 1, 5, 6, 7}; float output[8] = {0}; - Relu(input, 8, output); + Fp32Relu(input, 8, output); float expect[8] = {0, 0, 0, 0, 1, 5, 6, 7}; for (int i = 0; i < 8; ++i) { ASSERT_EQ(output[i], expect[i]); @@ -40,7 +40,7 @@ TEST_F(TestActivationFp32, ReluFp32) { TEST_F(TestActivationFp32, Relu6Fp32) { float input[8] = {-3, -2, -1, 0, 1, 5, 6, 7}; float output[8] = {0}; - Relu6(input, 8, output); + Fp32Relu6(input, 8, output); float expect[8] = {0, 0, 0, 0, 1, 5, 6, 6}; for (int i = 0; i < 8; ++i) { ASSERT_EQ(output[i], expect[i]);