!49848 [MS][LITE] logical and neon/sse/avx/avx512 optimation

Merge pull request !49848 from Greatpan/compare_avx512
This commit is contained in:
i-robot 2023-03-08 03:56:05 +00:00 committed by Gitee
commit 96a7881414
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
3 changed files with 49 additions and 28 deletions

View File

@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2023 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -16,20 +16,20 @@
#include "nnacl/fp32/arithmetic_compare_fp32.h"
bool EqualFp32(float x, float y) { return x == y; }
bool NotEqualFp32(float x, float y) { return x != y; }
bool LessFp32(float x, float y) { return x < y; }
bool LessEqualFp32(float x, float y) { return x <= y; }
bool GreaterFp32(float x, float y) { return x > y; }
bool GreaterEqualFp32(float x, float y) { return x >= y; }
inline bool EqualFp32(float x, float y) { return x == y; }
inline bool NotEqualFp32(float x, float y) { return x != y; }
inline bool LessFp32(float x, float y) { return x < y; }
inline bool LessEqualFp32(float x, float y) { return x <= y; }
inline bool GreaterFp32(float x, float y) { return x > y; }
inline bool GreaterEqualFp32(float x, float y) { return x >= y; }
bool EqualInt32(int x, int y) { return x == y; }
bool NotEqualInt32(int x, int y) { return x != y; }
bool NotEqualInt64(int64_t x, int64_t y) { return x != y; }
bool LessInt32(int x, int y) { return x < y; }
bool LessEqualInt32(int x, int y) { return x <= y; }
bool GreaterInt32(int x, int y) { return x > y; }
bool GreaterEqualInt32(int x, int y) { return x >= y; }
inline bool EqualInt32(int x, int y) { return x == y; }
inline bool NotEqualInt32(int x, int y) { return x != y; }
inline bool NotEqualInt64(int64_t x, int64_t y) { return x != y; }
inline bool LessInt32(int x, int y) { return x < y; }
inline bool LessEqualInt32(int x, int y) { return x <= y; }
inline bool GreaterInt32(int x, int y) { return x > y; }
inline bool GreaterEqualInt32(int x, int y) { return x >= y; }
#define ELEMENT_COMPARE(input0, input1, output, element_size, compare_func) \
do { \

View File

@ -188,18 +188,8 @@ int ElementOptFloorDivInt(const int *in0, const int *in1, int *out, int size, co
int ElementLogicalAnd(const float *in0, const float *in1, float *out, int size) {
int index = 0;
#ifdef ENABLE_NEON
float32x4_t vtrue = vdupq_n_f32(1);
float32x4_t vfalse = vdupq_n_f32(0);
uint32x4_t mask = vmovq_n_u32(((uint32_t)(1u << 31) - 1));
uint32x4_t zeros = vdupq_n_u32(0);
for (; index <= size - 4; index += C4NUM) {
uint32x4_t vin0 = vandq_u32(vreinterpretq_u32_f32(vld1q_f32(in0 + index)), mask);
uint32x4_t vin1 = vandq_u32(vreinterpretq_u32_f32(vld1q_f32(in1 + index)), mask);
float32x4_t vout = vbslq_f32(vceqq_u32(vandq_u32(vin0, vin1), zeros), vfalse, vtrue);
vst1q_f32(out + index, vout);
}
#endif
SIMD_RUN_NO_SCALAR(ElementLogicalAnd, index, in0, in1, out, size);
for (; index < size; index++) {
out[index] = (float)((bool)(in0[index]) & (bool)(in1[index]));
}
@ -208,6 +198,7 @@ int ElementLogicalAnd(const float *in0, const float *in1, float *out, int size)
int ElementOptLogicalAnd(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
int index = 0;
SIMD_RUN_NO_SCALAR(ElementOptLogicalAnd, index, in0, in1, out, size, param);
if (param->in_elements_num0_ == 1) {
for (; index < size; index++) {
out[index] = (float)((bool)(in0[0]) & (bool)(in1[index]));

View File

@ -1,5 +1,5 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
* Copyright 2022-2023 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// clang-format off
#ifndef MINDSPORE_NNACL_ARITHMETIC_@SIMD_INSTRUCTION@_H_
#define MINDSPORE_NNACL_ARITHMETIC_@SIMD_INSTRUCTION@_H_
@ -250,6 +250,36 @@ static inline size_t AssignSubOpt@SIMD_INSTRUCTION@(int index, float *in0, const
return index;
}
int ElementLogicalAnd@SIMD_INSTRUCTION@(int index, const float *in0, const float *in1, float *out, int size) {
for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
SIMD_F32 out_tmp = SIMD_AND_F32(SIMD_GETSIGN_F32(in0_tmp), SIMD_GETSIGN_F32(in1_tmp));
SIMD_ST_F32(out + index, out_tmp);
}
return index;
}
int ElementOptLogicalAnd@SIMD_INSTRUCTION@(int index, const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
if (param->in_elements_num0_ == 1) {
SIMD_F32 in0_tmp = SIMD_MOV_F32(*in0);
for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
SIMD_F32 out_tmp = SIMD_AND_F32(SIMD_GETSIGN_F32(in0_tmp), SIMD_GETSIGN_F32(in1_tmp));
SIMD_ST_F32(out + index, out_tmp);
}
} else {
SIMD_F32 in1_tmp = SIMD_MOV_F32(*in1);
for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
SIMD_F32 out_tmp = SIMD_AND_F32(SIMD_GETSIGN_F32(in0_tmp), SIMD_GETSIGN_F32(in1_tmp));
SIMD_ST_F32(out + index, out_tmp);
}
}
return index;
}
@SIMD_INSTRUCTION_END@
#ifdef __cplusplus
}