From 415c8e21b90a593d389a9592886c9135674f2c1f Mon Sep 17 00:00:00 2001 From: greatpan Date: Mon, 6 Mar 2023 17:02:19 +0800 Subject: [PATCH] add arthmetic fp32 and calc --- .../nnacl/fp32/arithmetic_compare_fp32.c | 28 +++++++-------- .../cpu/kernel/nnacl/fp32/arithmetic_fp32.c | 15 ++------ .../nnacl/fp32/arithmetic_fp32_simd.h.in | 34 +++++++++++++++++-- 3 files changed, 49 insertions(+), 28 deletions(-) diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/arithmetic_compare_fp32.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/arithmetic_compare_fp32.c index 17b8bd1dcac..6d06844f20a 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/arithmetic_compare_fp32.c +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/arithmetic_compare_fp32.c @@ -1,5 +1,5 @@ /** - * Copyright 2020 Huawei Technologies Co., Ltd + * Copyright 2020-2023 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,20 +16,20 @@ #include "nnacl/fp32/arithmetic_compare_fp32.h" -bool EqualFp32(float x, float y) { return x == y; } -bool NotEqualFp32(float x, float y) { return x != y; } -bool LessFp32(float x, float y) { return x < y; } -bool LessEqualFp32(float x, float y) { return x <= y; } -bool GreaterFp32(float x, float y) { return x > y; } -bool GreaterEqualFp32(float x, float y) { return x >= y; } +inline bool EqualFp32(float x, float y) { return x == y; } +inline bool NotEqualFp32(float x, float y) { return x != y; } +inline bool LessFp32(float x, float y) { return x < y; } +inline bool LessEqualFp32(float x, float y) { return x <= y; } +inline bool GreaterFp32(float x, float y) { return x > y; } +inline bool GreaterEqualFp32(float x, float y) { return x >= y; } -bool EqualInt32(int x, int y) { return x == y; } -bool NotEqualInt32(int x, int y) { return x != y; } -bool NotEqualInt64(int64_t x, int64_t y) { return x != y; } -bool LessInt32(int x, int y) { return x < y; } -bool LessEqualInt32(int x, int y) { return x <= y; } -bool GreaterInt32(int x, int y) { return x > y; } -bool GreaterEqualInt32(int x, int y) { return x >= y; } +inline bool EqualInt32(int x, int y) { return x == y; } +inline bool NotEqualInt32(int x, int y) { return x != y; } +inline bool NotEqualInt64(int64_t x, int64_t y) { return x != y; } +inline bool LessInt32(int x, int y) { return x < y; } +inline bool LessEqualInt32(int x, int y) { return x <= y; } +inline bool GreaterInt32(int x, int y) { return x > y; } +inline bool GreaterEqualInt32(int x, int y) { return x >= y; } #define ELEMENT_COMPARE(input0, input1, output, element_size, compare_func) \ do { \ diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/arithmetic_fp32.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/arithmetic_fp32.c index a2eb743754f..30787edd365 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/arithmetic_fp32.c +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/arithmetic_fp32.c @@ -188,18 +188,8 @@ int ElementOptFloorDivInt(const int *in0, const int *in1, int *out, int size, co int ElementLogicalAnd(const float *in0, const float *in1, float *out, int size) { int index = 0; -#ifdef ENABLE_NEON - float32x4_t vtrue = vdupq_n_f32(1); - float32x4_t vfalse = vdupq_n_f32(0); - uint32x4_t mask = vmovq_n_u32(((uint32_t)(1u << 31) - 1)); - uint32x4_t zeros = vdupq_n_u32(0); - for (; index <= size - 4; index += C4NUM) { - uint32x4_t vin0 = vandq_u32(vreinterpretq_u32_f32(vld1q_f32(in0 + index)), mask); - uint32x4_t vin1 = vandq_u32(vreinterpretq_u32_f32(vld1q_f32(in1 + index)), mask); - float32x4_t vout = vbslq_f32(vceqq_u32(vandq_u32(vin0, vin1), zeros), vfalse, vtrue); - vst1q_f32(out + index, vout); - } -#endif + + SIMD_RUN_NO_SCALAR(ElementLogicalAnd, index, in0, in1, out, size); for (; index < size; index++) { out[index] = (float)((bool)(in0[index]) & (bool)(in1[index])); } @@ -208,6 +198,7 @@ int ElementLogicalAnd(const float *in0, const float *in1, float *out, int size) int ElementOptLogicalAnd(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { int index = 0; + SIMD_RUN_NO_SCALAR(ElementOptLogicalAnd, index, in0, in1, out, size, param); if (param->in_elements_num0_ == 1) { for (; index < size; index++) { out[index] = (float)((bool)(in0[0]) & (bool)(in1[index])); diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/arithmetic_fp32_simd.h.in b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/arithmetic_fp32_simd.h.in index 0b47466d9e7..d97dbf68f24 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/arithmetic_fp32_simd.h.in +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/arithmetic_fp32_simd.h.in @@ -1,5 +1,5 @@ /** - * Copyright 2022 Huawei Technologies Co., Ltd + * Copyright 2022-2023 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - +// clang-format off #ifndef MINDSPORE_NNACL_ARITHMETIC_@SIMD_INSTRUCTION@_H_ #define MINDSPORE_NNACL_ARITHMETIC_@SIMD_INSTRUCTION@_H_ @@ -250,6 +250,36 @@ static inline size_t AssignSubOpt@SIMD_INSTRUCTION@(int index, float *in0, const return index; } +int ElementLogicalAnd@SIMD_INSTRUCTION@(int index, const float *in0, const float *in1, float *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); + SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); + SIMD_F32 out_tmp = SIMD_AND_F32(SIMD_GETSIGN_F32(in0_tmp), SIMD_GETSIGN_F32(in1_tmp)); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +int ElementOptLogicalAnd@SIMD_INSTRUCTION@(int index, const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { + if (param->in_elements_num0_ == 1) { + SIMD_F32 in0_tmp = SIMD_MOV_F32(*in0); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); + SIMD_F32 out_tmp = SIMD_AND_F32(SIMD_GETSIGN_F32(in0_tmp), SIMD_GETSIGN_F32(in1_tmp)); + SIMD_ST_F32(out + index, out_tmp); + } + } else { + SIMD_F32 in1_tmp = SIMD_MOV_F32(*in1); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); + SIMD_F32 out_tmp = SIMD_AND_F32(SIMD_GETSIGN_F32(in0_tmp), SIMD_GETSIGN_F32(in1_tmp)); + SIMD_ST_F32(out + index, out_tmp); + } + } + + return index; +} + @SIMD_INSTRUCTION_END@ #ifdef __cplusplus }