!49848 [MS][LITE] logical and neon/sse/avx/avx512 optimation

Merge pull request !49848 from Greatpan/compare_avx512
2023-03-08 03:56:05 +00:00 · 2023-03-08 03:56:05 +00:00 · 96a7881414
parent 7e8d55b533 415c8e21b9
commit 96a7881414
3 changed files with 49 additions and 28 deletions
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/arithmetic_compare_fp32.c
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/arithmetic_compare_fp32.c
@ -1,5 +1,5 @@
 /**
- * Copyright 2020 Huawei Technologies Co., Ltd
+ * Copyright 2020-2023 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@ -16,20 +16,20 @@

 #include "nnacl/fp32/arithmetic_compare_fp32.h"

-bool EqualFp32(float x, float y) { return x == y; }
-bool NotEqualFp32(float x, float y) { return x != y; }
-bool LessFp32(float x, float y) { return x < y; }
-bool LessEqualFp32(float x, float y) { return x <= y; }
-bool GreaterFp32(float x, float y) { return x > y; }
-bool GreaterEqualFp32(float x, float y) { return x >= y; }
+inline bool EqualFp32(float x, float y) { return x == y; }
+inline bool NotEqualFp32(float x, float y) { return x != y; }
+inline bool LessFp32(float x, float y) { return x < y; }
+inline bool LessEqualFp32(float x, float y) { return x <= y; }
+inline bool GreaterFp32(float x, float y) { return x > y; }
+inline bool GreaterEqualFp32(float x, float y) { return x >= y; }

-bool EqualInt32(int x, int y) { return x == y; }
-bool NotEqualInt32(int x, int y) { return x != y; }
-bool NotEqualInt64(int64_t x, int64_t y) { return x != y; }
-bool LessInt32(int x, int y) { return x < y; }
-bool LessEqualInt32(int x, int y) { return x <= y; }
-bool GreaterInt32(int x, int y) { return x > y; }
-bool GreaterEqualInt32(int x, int y) { return x >= y; }
+inline bool EqualInt32(int x, int y) { return x == y; }
+inline bool NotEqualInt32(int x, int y) { return x != y; }
+inline bool NotEqualInt64(int64_t x, int64_t y) { return x != y; }
+inline bool LessInt32(int x, int y) { return x < y; }
+inline bool LessEqualInt32(int x, int y) { return x <= y; }
+inline bool GreaterInt32(int x, int y) { return x > y; }
+inline bool GreaterEqualInt32(int x, int y) { return x >= y; }

 #define ELEMENT_COMPARE(input0, input1, output, element_size, compare_func) \
  do {                                                                      \
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/arithmetic_fp32.c
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/arithmetic_fp32.c
@ -188,18 +188,8 @@ int ElementOptFloorDivInt(const int *in0, const int *in1, int *out, int size, co

 int ElementLogicalAnd(const float *in0, const float *in1, float *out, int size) {
  int index = 0;
-#ifdef ENABLE_NEON
-  float32x4_t vtrue = vdupq_n_f32(1);
-  float32x4_t vfalse = vdupq_n_f32(0);
-  uint32x4_t mask = vmovq_n_u32(((uint32_t)(1u << 31) - 1));
-  uint32x4_t zeros = vdupq_n_u32(0);
-  for (; index <= size - 4; index += C4NUM) {
-    uint32x4_t vin0 = vandq_u32(vreinterpretq_u32_f32(vld1q_f32(in0 + index)), mask);
-    uint32x4_t vin1 = vandq_u32(vreinterpretq_u32_f32(vld1q_f32(in1 + index)), mask);
-    float32x4_t vout = vbslq_f32(vceqq_u32(vandq_u32(vin0, vin1), zeros), vfalse, vtrue);
-    vst1q_f32(out + index, vout);
-  }
-#endif
+
+  SIMD_RUN_NO_SCALAR(ElementLogicalAnd, index, in0, in1, out, size);
  for (; index < size; index++) {
    out[index] = (float)((bool)(in0[index]) & (bool)(in1[index]));
  }
@ -208,6 +198,7 @@ int ElementLogicalAnd(const float *in0, const float *in1, float *out, int size)

 int ElementOptLogicalAnd(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
  int index = 0;
+  SIMD_RUN_NO_SCALAR(ElementOptLogicalAnd, index, in0, in1, out, size, param);
  if (param->in_elements_num0_ == 1) {
    for (; index < size; index++) {
      out[index] = (float)((bool)(in0[0]) & (bool)(in1[index]));
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/arithmetic_fp32_simd.h.in
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/arithmetic_fp32_simd.h.in
@ -1,5 +1,5 @@
 /**
- * Copyright 2022 Huawei Technologies Co., Ltd
+ * Copyright 2022-2023 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@ -13,7 +13,7 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-
+// clang-format off
 #ifndef MINDSPORE_NNACL_ARITHMETIC_@SIMD_INSTRUCTION@_H_
 #define MINDSPORE_NNACL_ARITHMETIC_@SIMD_INSTRUCTION@_H_

@ -250,6 +250,36 @@ static inline size_t AssignSubOpt@SIMD_INSTRUCTION@(int index, float *in0, const
  return index;
 }

+int ElementLogicalAnd@SIMD_INSTRUCTION@(int index, const float *in0, const float *in1, float *out, int size) {
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
+    SIMD_F32 out_tmp = SIMD_AND_F32(SIMD_GETSIGN_F32(in0_tmp), SIMD_GETSIGN_F32(in1_tmp));
+    SIMD_ST_F32(out + index, out_tmp);
+  }
+  return index;
+}
+
+int ElementOptLogicalAnd@SIMD_INSTRUCTION@(int index, const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
+  if (param->in_elements_num0_ == 1) {
+    SIMD_F32 in0_tmp = SIMD_MOV_F32(*in0);
+    for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+      SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
+      SIMD_F32 out_tmp = SIMD_AND_F32(SIMD_GETSIGN_F32(in0_tmp), SIMD_GETSIGN_F32(in1_tmp));
+      SIMD_ST_F32(out + index, out_tmp);
+    }
+  } else {
+    SIMD_F32 in1_tmp = SIMD_MOV_F32(*in1);
+    for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+      SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
+      SIMD_F32 out_tmp = SIMD_AND_F32(SIMD_GETSIGN_F32(in0_tmp), SIMD_GETSIGN_F32(in1_tmp));
+      SIMD_ST_F32(out + index, out_tmp);
+    }
+  }
+
+  return index;
+}
+
@SIMD_INSTRUCTION_END@
 #ifdef __cplusplus
 }