forked from mindspore-Ecosystem/mindspore
!31536 [MSLITE][CPU] AVX512/256/SSE/NENO Advanced packaging, and Arithmetic Op (total 14) Refactoring and optimization
Merge pull request !31536 from Greatpan/avx512_arithmetic_self
This commit is contained in:
commit
f301c9e40f
|
@ -1,5 +1,5 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
* Copyright 2020-2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
@ -19,20 +19,62 @@
|
|||
|
||||
#define ACCURACY_DATA 0.00000001
|
||||
|
||||
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
|
||||
#define SimdElementFloorModCoreCalc(block_size, block_num, in0, in1, out, size, i) \
|
||||
for (int block_max_size = size - block_num + 1; i < block_max_size; i += block_num) { \
|
||||
MS_FLOAT_32xN(block_num) in0_tmp = MS_LD_F32(block_size, in0 + i); \
|
||||
MS_FLOAT_32xN(block_num) in1_tmp = MS_LD_F32(block_size, in1 + i); \
|
||||
MS_FLOAT_32xN(block_num) floor_tmp = MS_FLOOR_F32(block_size, MS_DIV_F32(block_size, in0_tmp, in1_tmp)); \
|
||||
MS_FLOAT_32xN(block_num) out_tmp = MS_SUB_F32(block_size, in0_tmp, MS_MUL_F32(block_size, floor_tmp, in1_tmp)); \
|
||||
MS_ST_F32(block_size, out + i, out_tmp); \
|
||||
}
|
||||
|
||||
int ElementFloorMod(const float *in0, const float *in1, float *out, int size) {
|
||||
for (int i = 0; i < size; i++) {
|
||||
int i = 0;
|
||||
|
||||
MS_SIMD_RUN_X86_NO_SCALAR(SimdElementFloorModCoreCalc, in0, in1, out, size, i); // neon no floor instruction
|
||||
|
||||
for (; i < size; i++) {
|
||||
out[i] = in0[i] - floorf(in0[i] / in1[i]) * in1[i];
|
||||
}
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
||||
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
|
||||
#define SimdElementOptFloorModCoreCalc1(block_size, block_num, in0, in1, out, size, i) \
|
||||
do { \
|
||||
MS_FLOAT_32xN(block_num) in0_tmp = MS_MOVN_F32(block_size, in0[0]); \
|
||||
for (int block_max_size = size - block_num + 1; i < block_max_size; i += block_num) { \
|
||||
MS_FLOAT_32xN(block_num) in1_tmp = MS_LD_F32(block_size, in1 + i); \
|
||||
MS_FLOAT_32xN(block_num) floor_tmp = MS_FLOOR_F32(block_size, MS_DIV_F32(block_size, in0_tmp, in1_tmp)); \
|
||||
MS_FLOAT_32xN(block_num) out_tmp = MS_SUB_F32(block_size, in0_tmp, MS_MUL_F32(block_size, floor_tmp, in1_tmp)); \
|
||||
MS_ST_F32(block_size, out + i, out_tmp); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define SimdElementOptFloorModCoreCalc2(block_size, block_num, in0, in1, out, size, i) \
|
||||
do { \
|
||||
MS_FLOAT_32xN(block_num) in1_tmp = MS_MOVN_F32(block_size, in1[0]); \
|
||||
for (int block_max_size = size - block_num + 1; i < block_max_size; i += block_num) { \
|
||||
MS_FLOAT_32xN(block_num) in0_tmp = MS_LD_F32(block_size, in0 + i); \
|
||||
MS_FLOAT_32xN(block_num) floor_tmp = MS_FLOOR_F32(block_size, MS_DIV_F32(block_size, in0_tmp, in1_tmp)); \
|
||||
MS_FLOAT_32xN(block_num) out_tmp = MS_SUB_F32(block_size, in0_tmp, MS_MUL_F32(block_size, floor_tmp, in1_tmp)); \
|
||||
MS_ST_F32(block_size, out + i, out_tmp); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
int ElementOptFloorMod(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
|
||||
int i = 0;
|
||||
|
||||
if (param->in_elements_num0_ == 1) {
|
||||
MS_SIMD_RUN_X86_NO_SCALAR(SimdElementOptFloorModCoreCalc1, in0, in1, out, size, i); // neon no floor instruction
|
||||
|
||||
for (; i < size; i++) {
|
||||
out[i] = in0[0] - floorf(in0[0] / in1[i]) * in1[i];
|
||||
}
|
||||
} else {
|
||||
MS_SIMD_RUN_X86_NO_SCALAR(SimdElementOptFloorModCoreCalc2, in0, in1, out, size, i); // neon no floor instruction
|
||||
|
||||
for (; i < size; i++) {
|
||||
out[i] = in0[i] - floorf(in0[i] / in1[0]) * in1[0];
|
||||
}
|
||||
|
@ -113,20 +155,59 @@ int ElementOptModInt(const int *in0, const int *in1, int *out, int size, const A
|
|||
return NNACL_OK;
|
||||
}
|
||||
|
||||
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
|
||||
#define SimdElementFloorDivCoreCalc(block_size, block_num, in0, in1, out, size, i) \
|
||||
for (int block_max_size = size - block_num + 1; i < block_max_size; i += block_num) { \
|
||||
MS_FLOAT_32xN(block_num) in0_tmp = MS_LD_F32(block_size, in0 + i); \
|
||||
MS_FLOAT_32xN(block_num) in1_tmp = MS_LD_F32(block_size, in1 + i); \
|
||||
MS_FLOAT_32xN(block_num) floor_tmp = MS_FLOOR_F32(block_size, MS_DIV_F32(block_size, in0_tmp, in1_tmp)); \
|
||||
MS_ST_F32(block_size, out + i, floor_tmp); \
|
||||
}
|
||||
int ElementFloorDiv(const float *in0, const float *in1, float *out, int size) {
|
||||
for (int i = 0; i < size; i++) {
|
||||
int i = 0;
|
||||
|
||||
MS_SIMD_RUN_X86_NO_SCALAR(SimdElementFloorDivCoreCalc, in0, in1, out, size, i); // neon no floor instruction
|
||||
|
||||
for (; i < size; i++) {
|
||||
out[i] = floorf(in0[i] / in1[i]);
|
||||
}
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
||||
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
|
||||
#define SimdElementOptFloorDivCoreCalc1(block_size, block_num, in0, in1, out, size, i) \
|
||||
do { \
|
||||
MS_FLOAT_32xN(block_num) in0_tmp = MS_MOVN_F32(block_size, in0[0]); \
|
||||
for (int block_max_size = size - block_num + 1; i < block_max_size; i += block_num) { \
|
||||
MS_FLOAT_32xN(block_num) in1_tmp = MS_LD_F32(block_size, in1 + i); \
|
||||
MS_FLOAT_32xN(block_num) out_tmp = MS_FLOOR_F32(block_size, MS_DIV_F32(block_size, in0_tmp, in1_tmp)); \
|
||||
MS_ST_F32(block_size, out + i, out_tmp); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
|
||||
#define SimdElementOptFloorDivCoreCalc2(block_size, block_num, in0, in1, out, size, i) \
|
||||
do { \
|
||||
MS_FLOAT_32xN(block_num) in1_tmp = MS_MOVN_F32(block_size, in1[0]); \
|
||||
for (int block_max_size = size - block_num + 1; i < block_max_size; i += block_num) { \
|
||||
MS_FLOAT_32xN(block_num) in0_tmp = MS_LD_F32(block_size, in0 + i); \
|
||||
MS_FLOAT_32xN(block_num) out_tmp = MS_FLOOR_F32(block_size, MS_DIV_F32(block_size, in0_tmp, in1_tmp)); \
|
||||
MS_ST_F32(block_size, out + i, out_tmp); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
int ElementOptFloorDiv(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
|
||||
int i = 0;
|
||||
|
||||
if (param->in_elements_num0_ == 1) {
|
||||
MS_SIMD_RUN_X86_NO_SCALAR(SimdElementOptFloorDivCoreCalc1, in0, in1, out, size, i); // neon no floor instruction
|
||||
|
||||
for (; i < size; i++) {
|
||||
out[i] = floorf(in0[0] / in1[i]);
|
||||
}
|
||||
} else {
|
||||
MS_SIMD_RUN_X86_NO_SCALAR(SimdElementOptFloorDivCoreCalc2, in0, in1, out, size, i); // neon no floor instruction
|
||||
|
||||
for (; i < size; i++) {
|
||||
out[i] = floorf(in0[i] / in1[0]);
|
||||
}
|
||||
|
@ -135,23 +216,61 @@ int ElementOptFloorDiv(const float *in0, const float *in1, float *out, int size,
|
|||
return NNACL_OK;
|
||||
}
|
||||
|
||||
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
|
||||
#define SimdElementFloorDivIntCoreCalc(block_size, block_num, in0, in1, out, size, i) \
|
||||
for (int block_max_size = size - block_num + 1; i < block_max_size; i += block_num) { \
|
||||
MS_INT_32xN(block_num) in0_tmp = MS_LD_EPI32(block_size, in0 + i); \
|
||||
MS_INT_32xN(block_num) in1_tmp = MS_LD_EPI32(block_size, in1 + i); \
|
||||
MS_INT_32xN(block_num) out_tmp = MS_DIV_EPI32(block_size, in0_tmp, in1_tmp); \
|
||||
MS_ST_EPI32(block_size, out + i, out_tmp); \
|
||||
}
|
||||
int ElementFloorDivInt(const int *in0, const int *in1, int *out, int size) {
|
||||
for (int i = 0; i < size; i++) {
|
||||
int i = 0;
|
||||
|
||||
MS_SIMD_RUN_NO_SCALAR(SimdElementFloorDivIntCoreCalc, in0, in1, out, size, i);
|
||||
|
||||
for (; i < size; i++) {
|
||||
NNACL_CHECK_ZERO_RETURN_ERR(in1[i]);
|
||||
out[i] = in0[i] / in1[i];
|
||||
}
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
||||
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
|
||||
#define SimdElementOptFloorDivIntCoreCalc1(block_size, block_num, in0, in1, out, size, i) \
|
||||
do { \
|
||||
MS_INT_32xN(block_num) in0_tmp = MS_MOVN_EPI32(block_size, in0[0]); \
|
||||
for (int block_max_size = size - block_num + 1; i < block_max_size; i += block_num) { \
|
||||
MS_INT_32xN(block_num) in1_tmp = MS_LD_EPI32(block_size, in1 + i); \
|
||||
MS_INT_32xN(block_num) out_tmp = MS_DIV_EPI32(block_size, in0_tmp, in1_tmp); \
|
||||
MS_ST_EPI32(block_size, out + i, out_tmp); \
|
||||
} \
|
||||
} while (0)
|
||||
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
|
||||
#define SimdElementOptFloorDivIntCoreCalc2(block_size, block_num, in0, in1, out, size, i) \
|
||||
do { \
|
||||
MS_INT_32xN(block_num) in1_tmp = MS_MOVN_EPI32(block_size, in1[0]); \
|
||||
for (int block_max_size = size - block_num + 1; i < block_max_size; i += block_num) { \
|
||||
MS_INT_32xN(block_num) in0_tmp = MS_LD_EPI32(block_size, in0 + i); \
|
||||
MS_INT_32xN(block_num) out_tmp = MS_DIV_EPI32(block_size, in0_tmp, in1_tmp); \
|
||||
MS_ST_EPI32(block_size, out + i, out_tmp); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
int ElementOptFloorDivInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
|
||||
int i = 0;
|
||||
if (param->in_elements_num0_ == 1) {
|
||||
MS_SIMD_RUN_NO_SCALAR(SimdElementOptFloorDivIntCoreCalc1, in0, in1, out, size, i);
|
||||
|
||||
for (; i < size; i++) {
|
||||
NNACL_CHECK_ZERO_RETURN_ERR(in1[i]);
|
||||
out[i] = in0[0] / in1[i];
|
||||
}
|
||||
} else {
|
||||
NNACL_CHECK_ZERO_RETURN_ERR(in1[0]);
|
||||
|
||||
MS_SIMD_RUN_NO_SCALAR(SimdElementOptFloorDivIntCoreCalc2, in0, in1, out, size, i);
|
||||
|
||||
for (; i < size; i++) {
|
||||
out[i] = in0[i] / in1[0];
|
||||
}
|
||||
|
@ -300,30 +419,58 @@ int ElementOptLogicalOrBool(const bool *in0, const bool *in1, bool *out, int siz
|
|||
return NNACL_OK;
|
||||
}
|
||||
|
||||
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
|
||||
#define SimdElementMaximumCoreCalc(block_size, block_num, in0, in1, out, size, index) \
|
||||
for (int block_max_size = size - block_num + 1; index < block_max_size; index += block_num) { \
|
||||
MS_FLOAT_32xN(block_num) in0_tmp = MS_LD_F32(block_size, in0 + index); \
|
||||
MS_FLOAT_32xN(block_num) in1_tmp = MS_LD_F32(block_size, in1 + index); \
|
||||
MS_FLOAT_32xN(block_num) out_tmp = MS_MAX_F32(block_size, in0_tmp, in1_tmp); \
|
||||
MS_ST_F32(block_size, out + index, out_tmp); \
|
||||
}
|
||||
int ElementMaximum(const float *in0, const float *in1, float *out, int size) {
|
||||
int index = 0;
|
||||
#ifdef ENABLE_NEON
|
||||
for (; index <= size - 4; index += C4NUM) {
|
||||
float32x4_t vin0 = vld1q_f32(in0 + index);
|
||||
float32x4_t vin1 = vld1q_f32(in1 + index);
|
||||
float32x4_t vout = vmaxq_f32(vin0, vin1);
|
||||
vst1q_f32(out + index, vout);
|
||||
}
|
||||
#endif
|
||||
|
||||
MS_SIMD_RUN_NO_SCALAR(SimdElementMaximumCoreCalc, in0, in1, out, size, index);
|
||||
|
||||
for (; index < size; index++) {
|
||||
out[index] = in0[index] > in1[index] ? in0[index] : in1[index];
|
||||
}
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
||||
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
|
||||
#define SimdElementOptMaximumCoreCalc1(block_size, block_num, in0, in1, out, size, index) \
|
||||
do { \
|
||||
MS_FLOAT_32xN(block_num) in0_tmp = MS_MOVN_F32(block_size, in0[0]); \
|
||||
for (int block_max_size = size - block_num + 1; index < block_max_size; index += block_num) { \
|
||||
MS_FLOAT_32xN(block_num) in1_tmp = MS_LD_F32(block_size, in1 + index); \
|
||||
MS_FLOAT_32xN(block_num) out_tmp = MS_MAX_F32(block_size, in0_tmp, in1_tmp); \
|
||||
MS_ST_F32(block_size, out + index, out_tmp); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define SimdElementOptMaximumCoreCalc2(block_size, block_num, in0, in1, out, size, index) \
|
||||
do { \
|
||||
MS_FLOAT_32xN(block_num) in1_tmp = MS_MOVN_F32(block_size, in1[0]); \
|
||||
for (int block_max_size = size - block_num + 1; index < block_max_size; index += block_num) { \
|
||||
MS_FLOAT_32xN(block_num) in0_tmp = MS_LD_F32(block_size, in0 + index); \
|
||||
MS_FLOAT_32xN(block_num) out_tmp = MS_MAX_F32(block_size, in0_tmp, in1_tmp); \
|
||||
MS_ST_F32(block_size, out + index, out_tmp); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
int ElementOptMaximum(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
|
||||
int index = 0;
|
||||
|
||||
if (param->in_elements_num0_ == 1) {
|
||||
MS_SIMD_RUN_NO_SCALAR(SimdElementOptMaximumCoreCalc1, in0, in1, out, size, index);
|
||||
|
||||
for (; index < size; index++) {
|
||||
out[index] = in0[0] > in1[index] ? in0[0] : in1[index];
|
||||
}
|
||||
} else {
|
||||
MS_SIMD_RUN_NO_SCALAR(SimdElementOptMaximumCoreCalc2, in0, in1, out, size, index);
|
||||
|
||||
for (; index < size; index++) {
|
||||
out[index] = in0[index] > in1[0] ? in0[index] : in1[0];
|
||||
}
|
||||
|
@ -332,29 +479,57 @@ int ElementOptMaximum(const float *in0, const float *in1, float *out, int size,
|
|||
return NNACL_OK;
|
||||
}
|
||||
|
||||
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
|
||||
#define SimdElementMaximumIntCoreCalc(block_size, block_num, in0, in1, out, size, index) \
|
||||
for (int block_max_size = size - block_num + 1; index < block_max_size; index += block_num) { \
|
||||
MS_INT_32xN(block_num) in0_tmp = MS_LD_EPI32(block_size, in0 + index); \
|
||||
MS_INT_32xN(block_num) in1_tmp = MS_LD_EPI32(block_size, in1 + index); \
|
||||
MS_INT_32xN(block_num) out_tmp = MS_MAX_EPI32(block_size, in0_tmp, in1_tmp); \
|
||||
MS_ST_EPI32(block_size, out + index, out_tmp); \
|
||||
}
|
||||
int ElementMaximumInt(const int *in0, const int *in1, int *out, int size) {
|
||||
int index = 0;
|
||||
#ifdef ENABLE_NEON
|
||||
for (; index <= size - 4; index += C4NUM) {
|
||||
int32x4_t vin0 = vld1q_s32(in0 + index);
|
||||
int32x4_t vin1 = vld1q_s32(in1 + index);
|
||||
int32x4_t vout = vmaxq_s32(vin0, vin1);
|
||||
vst1q_s32(out + index, vout);
|
||||
}
|
||||
#endif
|
||||
|
||||
MS_SIMD_RUN_NO_SCALAR(SimdElementMaximumIntCoreCalc, in0, in1, out, size, index);
|
||||
|
||||
for (; index < size; index++) {
|
||||
out[index] = in0[index] > in1[index] ? in0[index] : in1[index];
|
||||
}
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
||||
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
|
||||
#define SimdElementOptMaximumIntCoreCalc1(block_size, block_num, in0, in1, out, size, index) \
|
||||
do { \
|
||||
MS_INT_32xN(block_num) in0_tmp = MS_MOVN_EPI32(block_size, in0[0]); \
|
||||
for (int block_max_size = size - block_num + 1; index < block_max_size; index += block_num) { \
|
||||
MS_INT_32xN(block_num) in1_tmp = MS_LD_EPI32(block_size, in1 + index); \
|
||||
MS_INT_32xN(block_num) out_tmp = MS_MAX_EPI32(block_size, in0_tmp, in1_tmp); \
|
||||
MS_ST_EPI32(block_size, out + index, out_tmp); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define SimdElementOptMaximumIntCoreCalc2(block_size, block_num, in0, in1, out, size, index) \
|
||||
do { \
|
||||
MS_INT_32xN(block_num) in1_tmp = MS_MOVN_EPI32(block_size, in1[0]); \
|
||||
for (int block_max_size = size - block_num + 1; index < block_max_size; index += block_num) { \
|
||||
MS_INT_32xN(block_num) in0_tmp = MS_LD_EPI32(block_size, in0 + index); \
|
||||
MS_INT_32xN(block_num) out_tmp = MS_MAX_EPI32(block_size, in0_tmp, in1_tmp); \
|
||||
MS_ST_EPI32(block_size, out + index, out_tmp); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
int ElementOptMaximumInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
|
||||
int index = 0;
|
||||
if (param->in_elements_num0_ == 1) {
|
||||
MS_SIMD_RUN_NO_SCALAR(SimdElementOptMaximumIntCoreCalc1, in0, in1, out, size, index);
|
||||
|
||||
for (; index < size; index++) {
|
||||
out[index] = in0[0] > in1[index] ? in0[0] : in1[index];
|
||||
}
|
||||
} else {
|
||||
MS_SIMD_RUN_NO_SCALAR(SimdElementOptMaximumIntCoreCalc2, in0, in1, out, size, index);
|
||||
|
||||
for (; index < size; index++) {
|
||||
out[index] = in0[index] > in1[0] ? in0[index] : in1[0];
|
||||
}
|
||||
|
@ -363,30 +538,58 @@ int ElementOptMaximumInt(const int *in0, const int *in1, int *out, int size, con
|
|||
return NNACL_OK;
|
||||
}
|
||||
|
||||
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
|
||||
#define SimdElementMinimumIntCoreCalc(block_size, block_num, in0, in1, out, size, index) \
|
||||
for (int block_max_size = size - block_num + 1; index < block_max_size; index += block_num) { \
|
||||
MS_INT_32xN(block_num) in0_tmp = MS_LD_EPI32(block_size, in0 + index); \
|
||||
MS_INT_32xN(block_num) in1_tmp = MS_LD_EPI32(block_size, in1 + index); \
|
||||
MS_INT_32xN(block_num) out_tmp = MS_MIN_EPI32(block_size, in0_tmp, in1_tmp); \
|
||||
MS_ST_EPI32(block_size, out + index, out_tmp); \
|
||||
}
|
||||
int ElementMinimumInt(const int *input0, const int *input1, int *output, int size) {
|
||||
int index = 0;
|
||||
#ifdef ENABLE_NEON
|
||||
for (; index <= size - 4; index += C4NUM) {
|
||||
int32x4_t vin0 = vld1q_s32(input0 + index);
|
||||
int32x4_t vin1 = vld1q_s32(input1 + index);
|
||||
int32x4_t vout = vminq_s32(vin0, vin1);
|
||||
vst1q_s32(output + index, vout);
|
||||
}
|
||||
#endif
|
||||
|
||||
MS_SIMD_RUN_NO_SCALAR(SimdElementMinimumIntCoreCalc, input0, input1, output, size, index);
|
||||
|
||||
for (; index < size; index++) {
|
||||
output[index] = input0[index] > input1[index] ? input1[index] : input0[index];
|
||||
}
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
||||
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
|
||||
#define SimdElementOptMinimumIntCoreCalc1(block_size, block_num, in0, in1, out, size, index) \
|
||||
do { \
|
||||
MS_INT_32xN(block_num) in0_tmp = MS_MOVN_EPI32(block_size, in0[0]); \
|
||||
for (int block_max_size = size - block_num + 1; index < block_max_size; index += block_num) { \
|
||||
MS_INT_32xN(block_num) in1_tmp = MS_LD_EPI32(block_size, in1 + index); \
|
||||
MS_INT_32xN(block_num) out_tmp = MS_MIN_EPI32(block_size, in0_tmp, in1_tmp); \
|
||||
MS_ST_EPI32(block_size, out + index, out_tmp); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define SimdElementOptMinimumIntCoreCalc2(block_size, block_num, in0, in1, out, size, index) \
|
||||
do { \
|
||||
MS_INT_32xN(block_num) in1_tmp = MS_MOVN_EPI32(block_size, in1[0]); \
|
||||
for (int block_max_size = size - block_num + 1; index < block_max_size; index += block_num) { \
|
||||
MS_INT_32xN(block_num) in0_tmp = MS_LD_EPI32(block_size, in0 + index); \
|
||||
MS_INT_32xN(block_num) out_tmp = MS_MIN_EPI32(block_size, in0_tmp, in1_tmp); \
|
||||
MS_ST_EPI32(block_size, out + index, out_tmp); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
int ElementOptMinimumInt(const int *input0, const int *input1, int *output, int size,
|
||||
const ArithmeticParameter *param) {
|
||||
int index = 0;
|
||||
if (param->in_elements_num0_ == 1) {
|
||||
MS_SIMD_RUN_NO_SCALAR(SimdElementOptMinimumIntCoreCalc1, input0, input1, output, size, index);
|
||||
|
||||
for (; index < size; index++) {
|
||||
output[index] = input0[0] > input1[index] ? input1[index] : input0[0];
|
||||
}
|
||||
} else {
|
||||
MS_SIMD_RUN_NO_SCALAR(SimdElementOptMinimumIntCoreCalc2, input0, input1, output, size, index);
|
||||
|
||||
for (; index < size; index++) {
|
||||
output[index] = input0[index] > input1[0] ? input1[0] : input0[index];
|
||||
}
|
||||
|
@ -395,29 +598,58 @@ int ElementOptMinimumInt(const int *input0, const int *input1, int *output, int
|
|||
return NNACL_OK;
|
||||
}
|
||||
|
||||
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
|
||||
#define SimdElementMinimumCoreCalc(block_size, block_num, in0, in1, out, size, index) \
|
||||
for (int block_max_size = size - block_num + 1; index < block_max_size; index += block_num) { \
|
||||
MS_FLOAT_32xN(block_num) in0_tmp = MS_LD_F32(block_size, in0 + index); \
|
||||
MS_FLOAT_32xN(block_num) in1_tmp = MS_LD_F32(block_size, in1 + index); \
|
||||
MS_FLOAT_32xN(block_num) out_tmp = MS_MIN_F32(block_size, in0_tmp, in1_tmp); \
|
||||
MS_ST_F32(block_size, out + index, out_tmp); \
|
||||
}
|
||||
|
||||
int ElementMinimum(const float *in0, const float *in1, float *out, int size) {
|
||||
int index = 0;
|
||||
#ifdef ENABLE_NEON
|
||||
for (; index <= size - 4; index += C4NUM) {
|
||||
float32x4_t vin0 = vld1q_f32(in0 + index);
|
||||
float32x4_t vin1 = vld1q_f32(in1 + index);
|
||||
float32x4_t vout = vminq_f32(vin0, vin1);
|
||||
vst1q_f32(out + index, vout);
|
||||
}
|
||||
#endif
|
||||
|
||||
MS_SIMD_RUN_NO_SCALAR(SimdElementMinimumCoreCalc, in0, in1, out, size, index);
|
||||
|
||||
for (; index < size; index++) {
|
||||
out[index] = in0[index] > in1[index] ? in1[index] : in0[index];
|
||||
}
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
||||
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
|
||||
#define SimdElementOptMinimumCoreCalc1(block_size, block_num, in0, in1, out, size, index) \
|
||||
do { \
|
||||
MS_FLOAT_32xN(block_num) in0_tmp = MS_MOVN_F32(block_size, in0[0]); \
|
||||
for (int block_max_size = size - block_num + 1; index < block_max_size; index += block_num) { \
|
||||
MS_FLOAT_32xN(block_num) in1_tmp = MS_LD_F32(block_size, in1 + index); \
|
||||
MS_FLOAT_32xN(block_num) out_tmp = MS_MIN_F32(block_size, in0_tmp, in1_tmp); \
|
||||
MS_ST_F32(block_size, out + index, out_tmp); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define SimdElementOptMinimumCoreCalc2(block_size, block_num, in0, in1, out, size, index) \
|
||||
do { \
|
||||
MS_FLOAT_32xN(block_num) in1_tmp = MS_MOVN_F32(block_size, in1[0]); \
|
||||
for (int block_max_size = size - block_num + 1; index < block_max_size; index += block_num) { \
|
||||
MS_FLOAT_32xN(block_num) in0_tmp = MS_LD_F32(block_size, in0 + index); \
|
||||
MS_FLOAT_32xN(block_num) out_tmp = MS_MIN_F32(block_size, in0_tmp, in1_tmp); \
|
||||
MS_ST_F32(block_size, out + index, out_tmp); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
int ElementOptMinimum(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
|
||||
int index = 0;
|
||||
if (param->in_elements_num0_ == 1) {
|
||||
MS_SIMD_RUN_NO_SCALAR(SimdElementOptMinimumCoreCalc1, in0, in1, out, size, index);
|
||||
|
||||
for (; index < size; index++) {
|
||||
out[index] = in0[0] > in1[index] ? in1[index] : in0[0];
|
||||
}
|
||||
} else {
|
||||
MS_SIMD_RUN_NO_SCALAR(SimdElementOptMinimumCoreCalc2, in0, in1, out, size, index);
|
||||
|
||||
for (; index < size; index++) {
|
||||
out[index] = in0[index] > in1[0] ? in1[0] : in0[index];
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue