forked from mindspore-Ecosystem/mindspore
!4567 modify arm cpu fp16 op: arithmetic
Merge pull request !4567 from 陶云浩/lite
This commit is contained in:
commit
95fa3c5202
mindspore/lite/src/runtime/kernel/arm
|
@ -28,7 +28,21 @@ using mindspore::lite::RET_ERROR;
|
|||
using mindspore::lite::RET_OK;
|
||||
|
||||
using mindspore::schema::PrimitiveType_Add;
|
||||
using mindspore::schema::PrimitiveType_Div;
|
||||
using mindspore::schema::PrimitiveType_Equal;
|
||||
using mindspore::schema::PrimitiveType_FloorDiv;
|
||||
using mindspore::schema::PrimitiveType_FloorMod;
|
||||
using mindspore::schema::PrimitiveType_Greater;
|
||||
using mindspore::schema::PrimitiveType_GreaterEqual;
|
||||
using mindspore::schema::PrimitiveType_Less;
|
||||
using mindspore::schema::PrimitiveType_LessEqual;
|
||||
using mindspore::schema::PrimitiveType_LogicalAnd;
|
||||
using mindspore::schema::PrimitiveType_LogicalOr;
|
||||
using mindspore::schema::PrimitiveType_Maximum;
|
||||
using mindspore::schema::PrimitiveType_Minimum;
|
||||
using mindspore::schema::PrimitiveType_Mul;
|
||||
using mindspore::schema::PrimitiveType_NotEqual;
|
||||
using mindspore::schema::PrimitiveType_SquaredDifference;
|
||||
using mindspore::schema::PrimitiveType_Sub;
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
@ -97,7 +111,44 @@ int ArithmeticFP16CPUKernel::Init() {
|
|||
arithmetic_run_ = ElementSubFp16;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case PrimitiveType_Div:
|
||||
switch (arithmeticParameter_->activation_type_) {
|
||||
case schema::ActivationType_RELU:
|
||||
arithmetic_run_ = ElementDivReluFp16;
|
||||
break;
|
||||
case schema::ActivationType_RELU6:
|
||||
arithmetic_run_ = ElementDivRelu6Fp16;
|
||||
break;
|
||||
default:
|
||||
arithmetic_run_ = ElementDivFp16;
|
||||
break;
|
||||
}
|
||||
case PrimitiveType_FloorMod:
|
||||
arithmetic_run_ = ElementFloorModFp16;
|
||||
case PrimitiveType_FloorDiv:
|
||||
arithmetic_run_ = ElementFloorDivFp16;
|
||||
case PrimitiveType_LogicalAnd:
|
||||
arithmetic_run_ = ElementLogicalAndFp16;
|
||||
case PrimitiveType_LogicalOr:
|
||||
arithmetic_run_ = ElementLogicalOrFp16;
|
||||
case PrimitiveType_SquaredDifference:
|
||||
arithmetic_run_ = ElementSquaredDifferenceFp16;
|
||||
case PrimitiveType_Maximum:
|
||||
arithmetic_run_ = ElementMaximumFp16;
|
||||
case PrimitiveType_Minimum:
|
||||
arithmetic_run_ = ElementMinimumFp16;
|
||||
case PrimitiveType_NotEqual:
|
||||
arithmetic_run_ = ElementNotEqualFp16;
|
||||
case PrimitiveType_Equal:
|
||||
arithmetic_run_ = ElementEqualFp16;
|
||||
case PrimitiveType_Less:
|
||||
arithmetic_run_ = ElementLessFp16;
|
||||
case PrimitiveType_LessEqual:
|
||||
arithmetic_run_ = ElementLessEqual;
|
||||
case PrimitiveType_Greater:
|
||||
arithmetic_run_ = ElementGreaterFp16;
|
||||
case PrimitiveType_GreaterEqual:
|
||||
arithmetic_run_ = ElementGreaterEqualFp16;
|
||||
default:
|
||||
MS_LOG(ERROR) << "Error Operator type " << op_parameter_->type_;
|
||||
arithmetic_run_ = nullptr;
|
||||
|
@ -115,8 +166,8 @@ int ArithmeticFP16CPUKernel::ReSize() {
|
|||
arithmeticParameter_->in_elements_num1_ = in_tensors_[1]->ElementsNum();
|
||||
arithmeticParameter_->out_elements_num_ = out_tensors_[0]->ElementsNum();
|
||||
if (in_tensors_[0]->data_type() == kNumberTypeFloat32 || in_tensors_[0]->data_type() == kNumberTypeFloat) {
|
||||
input0_fp16_ = reinterpret_cast<float16_t *>(context_->allocator->Malloc(
|
||||
arithmeticParameter_->in_elements_num0_ * sizeof(float16_t)));
|
||||
input0_fp16_ = reinterpret_cast<float16_t *>(
|
||||
context_->allocator->Malloc(arithmeticParameter_->in_elements_num0_ * sizeof(float16_t)));
|
||||
if (input0_fp16_ == nullptr) {
|
||||
MS_LOG(ERROR) << "malloc data fail!";
|
||||
return RET_ERROR;
|
||||
|
@ -125,8 +176,8 @@ int ArithmeticFP16CPUKernel::ReSize() {
|
|||
arithmeticParameter_->in_elements_num0_);
|
||||
}
|
||||
if (in_tensors_[1]->data_type() == kNumberTypeFloat32 || in_tensors_[1]->data_type() == kNumberTypeFloat) {
|
||||
input1_fp16_ = reinterpret_cast<float16_t *>(context_->allocator->Malloc(
|
||||
arithmeticParameter_->in_elements_num1_ * sizeof(float16_t)));
|
||||
input1_fp16_ = reinterpret_cast<float16_t *>(
|
||||
context_->allocator->Malloc(arithmeticParameter_->in_elements_num1_ * sizeof(float16_t)));
|
||||
if (input0_fp16_ == nullptr) {
|
||||
MS_LOG(ERROR) << "malloc data fail!";
|
||||
return RET_ERROR;
|
||||
|
@ -135,8 +186,8 @@ int ArithmeticFP16CPUKernel::ReSize() {
|
|||
arithmeticParameter_->in_elements_num1_);
|
||||
}
|
||||
if (out_tensors_[0]->data_type() == kNumberTypeFloat32 || out_tensors_[0]->data_type() == kNumberTypeFloat) {
|
||||
output_fp16_ = reinterpret_cast<float16_t *>(context_->allocator->Malloc(
|
||||
arithmeticParameter_->out_elements_num_ * sizeof(float16_t)));
|
||||
output_fp16_ = reinterpret_cast<float16_t *>(
|
||||
context_->allocator->Malloc(arithmeticParameter_->out_elements_num_ * sizeof(float16_t)));
|
||||
if (output_fp16_ == nullptr) {
|
||||
MS_LOG(ERROR) << "malloc data fail!";
|
||||
return RET_ERROR;
|
||||
|
@ -197,22 +248,22 @@ int ArithmeticFP16CPUKernel::DoArithmetic(int task_id) {
|
|||
|
||||
int error_code = RET_OK;
|
||||
if (arithmeticParameter_->broadcasting_) {
|
||||
error_code = arithmetic_run_(tile_data0_ + thread_stride, tile_data1_ + thread_stride,
|
||||
output_data + thread_stride, count);
|
||||
error_code =
|
||||
arithmetic_run_(tile_data0_ + thread_stride, tile_data1_ + thread_stride, output_data + thread_stride, count);
|
||||
} else if (arithmetic_opt_run_ != nullptr) {
|
||||
if (arithmeticParameter_->in_elements_num0_ == 1) {
|
||||
error_code = arithmetic_opt_run_(input0_data, input1_data1 + thread_stride, output_data + thread_stride,
|
||||
count, arithmeticParameter_);
|
||||
error_code = arithmetic_opt_run_(input0_data, input1_data1 + thread_stride, output_data + thread_stride, count,
|
||||
arithmeticParameter_);
|
||||
} else if (arithmeticParameter_->in_elements_num1_ == 1) {
|
||||
error_code = arithmetic_opt_run_(input0_data + thread_stride, input1_data1, output_data + thread_stride,
|
||||
count, arithmeticParameter_);
|
||||
error_code = arithmetic_opt_run_(input0_data + thread_stride, input1_data1, output_data + thread_stride, count,
|
||||
arithmeticParameter_);
|
||||
} else {
|
||||
error_code = arithmetic_opt_run_(input0_data + thread_stride, input1_data1 + thread_stride,
|
||||
output_data + thread_stride, count, arithmeticParameter_);
|
||||
}
|
||||
} else {
|
||||
error_code = arithmetic_run_(input0_data + thread_stride, input1_data1 + thread_stride,
|
||||
output_data + thread_stride, count);
|
||||
error_code =
|
||||
arithmetic_run_(input0_data + thread_stride, input1_data1 + thread_stride, output_data + thread_stride, count);
|
||||
}
|
||||
if (error_code != RET_OK) {
|
||||
return RET_ERROR;
|
||||
|
|
|
@ -104,10 +104,16 @@ int ElementMulFp16(float16_t *input0, float16_t *input1, float16_t *output, int
|
|||
int block_c8 = element_size - block_mod;
|
||||
|
||||
for (int index = 0; index < block_c8; index += C8NUM) {
|
||||
output[0] = input0[0] * input1[0];
|
||||
output[1] = input0[1] * input1[1];
|
||||
output[2] = input0[2] * input1[2];
|
||||
output[3] = input0[3] * input1[3];
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t vin0 = vld1q_f16(input0);
|
||||
float16x8_t vin1 = vld1q_f16(input1);
|
||||
float16x8_t vout = vmulq_f16(vin0, vin1);
|
||||
vst1q_f16(output, vout);
|
||||
#else
|
||||
for (int i = 0; i < C8NUM; ++i) {
|
||||
output[i] = input0[i] * input1[i];
|
||||
}
|
||||
#endif
|
||||
input0 += C8NUM;
|
||||
input1 += C8NUM;
|
||||
output += C8NUM;
|
||||
|
@ -123,15 +129,24 @@ int ElementMulReluFp16(float16_t *input0, float16_t *input1, float16_t *output,
|
|||
int block_mod = element_size % C8NUM;
|
||||
int block_c8 = element_size - block_mod;
|
||||
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0};
|
||||
#endif
|
||||
|
||||
for (int index = 0; index < block_c8; index += C8NUM) {
|
||||
float16_t res = input0[0] * input1[0];
|
||||
output[0] = res > 0 ? res : 0;
|
||||
res = input0[1] * input1[1];
|
||||
output[1] = res > 0 ? res : 0;
|
||||
res = input0[2] * input1[2];
|
||||
output[2] = res > 0 ? res : 0;
|
||||
res = input0[3] * input1[3];
|
||||
output[3] = res > 0 ? res : 0;
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t vin0 = vld1q_f16(input0);
|
||||
float16x8_t vin1 = vld1q_f16(input1);
|
||||
float16x8_t vout = vmulq_f16(vin0, vin1);
|
||||
vout = vmaxq_f16(vout, zeros);
|
||||
vst1q_f16(output, vout);
|
||||
#else
|
||||
float16_t res;
|
||||
for (int i = 0; i < C8NUM; ++i) {
|
||||
res = input[i] * input1[i];
|
||||
output[i] = res > 0 ? res : 0;
|
||||
}
|
||||
#endif
|
||||
input0 += C8NUM;
|
||||
input1 += C8NUM;
|
||||
output += C8NUM;
|
||||
|
@ -148,11 +163,23 @@ int ElementMulRelu6Fp16(float16_t *input0, float16_t *input1, float16_t *output,
|
|||
int block_mod = element_size % C8NUM;
|
||||
int block_c8 = element_size - block_mod;
|
||||
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0};
|
||||
float16x8_t bounds = {6, 6, 6, 6, 6, 6, 6, 6};
|
||||
#endif
|
||||
|
||||
for (int index = 0; index < block_c8; index += C8NUM) {
|
||||
output[0] = MSMIN(MSMAX(input0[0] * input1[0], 0), 6);
|
||||
output[1] = MSMIN(MSMAX(input0[1] * input1[1], 0), 6);
|
||||
output[2] = MSMIN(MSMAX(input0[2] * input1[2], 0), 6);
|
||||
output[3] = MSMIN(MSMAX(input0[3] * input1[3], 0), 6);
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t vin0 = vld1q_f16(input0);
|
||||
float16x8_t vin1 = vld1q_f16(input1);
|
||||
float16x8_t vout = vmulq_f16(vin0, vin1);
|
||||
vout = vminq_f16(vmaxq_f16(vout, zeros), bounds);
|
||||
vst1q_f16(output, vout);
|
||||
#else
|
||||
for (int i = 0; i < C8NUM; ++i) {
|
||||
output[i] = MSMIN(MSMAX(input0[i] * input1[i], 0), 6);
|
||||
}
|
||||
#endif
|
||||
input0 += C8NUM;
|
||||
input1 += C8NUM;
|
||||
output += C8NUM;
|
||||
|
@ -169,10 +196,16 @@ int ElementAddFp16(float16_t *input0, float16_t *input1, float16_t *output, int
|
|||
int block_c8 = element_size - block_mod;
|
||||
|
||||
for (int index = 0; index < block_c8; index += C8NUM) {
|
||||
output[0] = input0[0] + input1[0];
|
||||
output[1] = input0[1] + input1[1];
|
||||
output[2] = input0[2] + input1[2];
|
||||
output[3] = input0[3] + input1[3];
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t vin0 = vld1q_f16(input0);
|
||||
float16x8_t vin1 = vld1q_f16(input1);
|
||||
float16x8_t vout = vaddq_f16(vin0, vin1);
|
||||
vst1q_f16(output, vout);
|
||||
#else
|
||||
for (int i = 0; i < C8NUM; ++i) {
|
||||
output[i] = input0[i] + input1[i];
|
||||
}
|
||||
#endif
|
||||
input0 += C8NUM;
|
||||
input1 += C8NUM;
|
||||
output += C8NUM;
|
||||
|
@ -187,15 +220,22 @@ int ElementAddReluFp16(float16_t *input0, float16_t *input1, float16_t *output,
|
|||
int block_mod = element_size % C8NUM;
|
||||
int block_c8 = element_size - block_mod;
|
||||
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0};
|
||||
#endif
|
||||
|
||||
for (int index = 0; index < block_c8; index += C8NUM) {
|
||||
float16_t res = input0[0] + input1[0];
|
||||
output[0] = res > 0 ? res : 0;
|
||||
res = input0[1] + input1[1];
|
||||
output[1] = res > 0 ? res : 0;
|
||||
res = input0[2] + input1[2];
|
||||
output[2] = res > 0 ? res : 0;
|
||||
res = input0[3] + input1[3];
|
||||
output[3] = res > 0 ? res : 0;
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t vin0 = vld1q_f16(input0);
|
||||
float16x8_t vin1 = vld1q_f16(input1);
|
||||
float16x8_t vout = vaddq_f16(vin0, vin1);
|
||||
vout = vmaxq_f16(vout, zeros);
|
||||
vst1q_f16(output, vout);
|
||||
#else
|
||||
for (int i = 0; i < C8NUM; ++i) {
|
||||
output[i] = MSMAX(input0[i] + input1[i], 0);
|
||||
}
|
||||
#endif
|
||||
input0 += C8NUM;
|
||||
input1 += C8NUM;
|
||||
output += C8NUM;
|
||||
|
@ -211,11 +251,23 @@ int ElementAddRelu6Fp16(float16_t *input0, float16_t *input1, float16_t *output,
|
|||
int block_mod = element_size % C8NUM;
|
||||
int block_c8 = element_size - block_mod;
|
||||
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0};
|
||||
float16x8_t bounds = {6, 6, 6, 6, 6, 6, 6, 6};
|
||||
#endif
|
||||
|
||||
for (int index = 0; index < block_c8; index += C8NUM) {
|
||||
output[0] = MSMIN(MSMAX(input0[0] + input1[0], 0), 6);
|
||||
output[1] = MSMIN(MSMAX(input0[1] + input1[1], 0), 6);
|
||||
output[2] = MSMIN(MSMAX(input0[2] + input1[2], 0), 6);
|
||||
output[3] = MSMIN(MSMAX(input0[3] + input1[3], 0), 6);
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t vin0 = vld1q_f16(input0);
|
||||
float16x8_t vin1 = vld1q_f16(input1);
|
||||
float16x8_t vout = vaddq_f16(vin0, vin1);
|
||||
vout = vminq_f16(vmaxq_f16(vout, zeros), bounds);
|
||||
vst1q_f16(output, vout);
|
||||
#else
|
||||
for (int i = 0; i < C8NUM; ++i) {
|
||||
output[i] = MSMIN(MSMAX(input0[i] + input1[i], 0), 6);
|
||||
}
|
||||
#endif
|
||||
input0 += C8NUM;
|
||||
input1 += C8NUM;
|
||||
output += C8NUM;
|
||||
|
@ -232,10 +284,16 @@ int ElementSubFp16(float16_t *input0, float16_t *input1, float16_t *output, int
|
|||
int block_c8 = element_size - block_mod;
|
||||
|
||||
for (int index = 0; index < block_c8; index += C8NUM) {
|
||||
output[0] = input0[0] - input1[0];
|
||||
output[1] = input0[1] - input1[1];
|
||||
output[2] = input0[2] - input1[2];
|
||||
output[3] = input0[3] - input1[3];
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t vin0 = vld1q_f16(input0);
|
||||
float16x8_t vin1 = vld1q_f16(input1);
|
||||
float16x8_t vout = vsubq_f16(vin0, vin1);
|
||||
vst1q_f16(output, vout);
|
||||
#else
|
||||
for (int i = 0; i < C8NUM; ++i) {
|
||||
output[i] = input0[i] - input1[i];
|
||||
}
|
||||
#endif
|
||||
input0 += C8NUM;
|
||||
input1 += C8NUM;
|
||||
output += C8NUM;
|
||||
|
@ -249,16 +307,21 @@ int ElementSubFp16(float16_t *input0, float16_t *input1, float16_t *output, int
|
|||
int ElementSubReluFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
|
||||
int block_mod = element_size % C8NUM;
|
||||
int block_c8 = element_size - block_mod;
|
||||
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0};
|
||||
#endif
|
||||
for (int index = 0; index < block_c8; index += C8NUM) {
|
||||
float16_t res = input0[0] - input1[0];
|
||||
output[0] = res > 0 ? res : 0;
|
||||
res = input0[1] - input1[1];
|
||||
output[1] = res > 0 ? res : 0;
|
||||
res = input0[2] - input1[2];
|
||||
output[2] = res > 0 ? res : 0;
|
||||
res = input0[3] - input1[3];
|
||||
output[3] = res > 0 ? res : 0;
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t vin0 = vld1q_f16(input0);
|
||||
float16x8_t vin1 = vld1q_f16(input1);
|
||||
float16x8_t vout = vsubq_f16(vin0, vin1);
|
||||
vout = vmaxq_f16(vout, zeros);
|
||||
vst1q_f16(output, vout);
|
||||
#else
|
||||
for (int i = 0; i < C8NUM; ++i) {
|
||||
output[i] = MSMAX(input0[i] - input1[i], 0);
|
||||
}
|
||||
#endif
|
||||
input0 += C8NUM;
|
||||
input1 += C8NUM;
|
||||
output += C8NUM;
|
||||
|
@ -273,12 +336,22 @@ int ElementSubReluFp16(float16_t *input0, float16_t *input1, float16_t *output,
|
|||
int ElementSubRelu6Fp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
|
||||
int block_mod = element_size % C8NUM;
|
||||
int block_c8 = element_size - block_mod;
|
||||
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0};
|
||||
float16x8_t bounds = {6, 6, 6, 6, 6, 6, 6, 6};
|
||||
#endif
|
||||
for (int index = 0; index < block_c8; index += C8NUM) {
|
||||
output[0] = MSMIN(MSMAX(input0[0] - input1[0], 0), 6);
|
||||
output[1] = MSMIN(MSMAX(input0[1] - input1[1], 0), 6);
|
||||
output[2] = MSMIN(MSMAX(input0[2] - input1[2], 0), 6);
|
||||
output[3] = MSMIN(MSMAX(input0[3] - input1[3], 0), 6);
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t vin0 = vld1q_f16(input0);
|
||||
float16x8_t vin1 = vld1q_f16(input1);
|
||||
float16x8_t vout = vsubq_f16(vin0, vin1);
|
||||
vout = vminq_f16(vmaxq_f16(vout, zeros), bounds);
|
||||
vst1q_f16(output, vout);
|
||||
#else
|
||||
for (int i = 0; i < C8NUM; ++i) {
|
||||
output[i] = MSMIN(MSMAX(input0[i] - input1[i], 0), 6);
|
||||
}
|
||||
#endif
|
||||
input0 += C8NUM;
|
||||
input1 += C8NUM;
|
||||
output += C8NUM;
|
||||
|
@ -289,3 +362,413 @@ int ElementSubRelu6Fp16(float16_t *input0, float16_t *input1, float16_t *output,
|
|||
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
||||
int ElementDivFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
|
||||
int block_mod = element_size % C8NUM;
|
||||
int block_c8 = element_size - block_mod;
|
||||
|
||||
for (int index = 0; index < block_c8; index += C8NUM) {
|
||||
for (int i = 0; i < C8NUM; ++i) {
|
||||
if (input1[i] == 0) {
|
||||
return NNACL_ERRCODE_DIVISOR_ZERO;
|
||||
}
|
||||
}
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t vin0 = vld1q_f16(input0);
|
||||
float16x8_t vin1 = vld1q_f16(input1);
|
||||
float16x8_t vout = vsubq_f16(vin0, vin1);
|
||||
vst1q_f16(output, vout);
|
||||
#else
|
||||
for (int i = 0; i < C8NUM; ++i) {
|
||||
output[i] = input0[i] / input1[i];
|
||||
}
|
||||
#endif
|
||||
input0 += C8NUM;
|
||||
input1 += C8NUM;
|
||||
output += C8NUM;
|
||||
}
|
||||
for (int index = 0; index < block_mod; ++index) {
|
||||
if (input1[index] == 0) {
|
||||
return NNACL_ERRCODE_DIVISOR_ZERO;
|
||||
}
|
||||
output[index] = input0[index] / input1[index];
|
||||
}
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
||||
int ElementDivReluFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
|
||||
int block_mod = element_size % C8NUM;
|
||||
int block_c8 = element_size - block_mod;
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0};
|
||||
#endif
|
||||
for (int index = 0; index < block_c8; index += C8NUM) {
|
||||
for (int i = 0; i < C8NUM; ++i) {
|
||||
if (input1[i] == 0) {
|
||||
return NNACL_ERRCODE_DIVISOR_ZERO;
|
||||
}
|
||||
}
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t vin0 = vld1q_f16(input0);
|
||||
float16x8_t vin1 = vld1q_f16(input1);
|
||||
float16x8_t vout = vsubq_f16(vin0, vin1);
|
||||
vout = vmaxq_f16(vout, zeros);
|
||||
vst1q_f16(output, vout);
|
||||
#else
|
||||
for (int i = 0; i < C8NUM; ++i) {
|
||||
output[i] = MSMAX(input0[i] - input1[i], 0);
|
||||
}
|
||||
#endif
|
||||
input0 += C8NUM;
|
||||
input1 += C8NUM;
|
||||
output += C8NUM;
|
||||
}
|
||||
for (int index = 0; index < block_mod; ++index) {
|
||||
if (input1[index] == 0) {
|
||||
return NNACL_ERRCODE_DIVISOR_ZERO;
|
||||
}
|
||||
float16_t res = input0[index] - input1[index];
|
||||
output[index] = res > 0 ? res : 0;
|
||||
}
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
||||
int ElementDivRelu6Fp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
|
||||
int block_mod = element_size % C8NUM;
|
||||
int block_c8 = element_size - block_mod;
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0};
|
||||
float16x8_t bounds = {6, 6, 6, 6, 6, 6, 6, 6};
|
||||
#endif
|
||||
for (int index = 0; index < block_c8; index += C8NUM) {
|
||||
for (int i = 0; i < C8NUM; ++i) {
|
||||
if (input1[i] == 0) {
|
||||
return NNACL_ERRCODE_DIVISOR_ZERO;
|
||||
}
|
||||
}
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t vin0 = vld1q_f16(input0);
|
||||
float16x8_t vin1 = vld1q_f16(input1);
|
||||
float16x8_t vout = vsubq_f16(vin0, vin1);
|
||||
vout = vminq_f16(vmaxq_f16(vout, zeros), bounds);
|
||||
vst1q_f16(output, vout);
|
||||
#else
|
||||
for (int i = 0; i < C8NUM; ++i) {
|
||||
output[i] = MSMIN(MSMAX(input0[i] - input1[i], 0), 6);
|
||||
}
|
||||
#endif
|
||||
input0 += C8NUM;
|
||||
input1 += C8NUM;
|
||||
output += C8NUM;
|
||||
}
|
||||
for (int index = 0; index < block_mod; ++index) {
|
||||
if (input1[index] == 0) {
|
||||
return NNACL_ERRCODE_DIVISOR_ZERO;
|
||||
}
|
||||
output[index] = MSMIN(MSMAX(input0[index] - input1[index], 0), 6);
|
||||
}
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
||||
int ElementFloorModFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
|
||||
for (int i = 0; i < element_size; ++i) {
|
||||
if (input1[i] == 0) {
|
||||
return NNACL_ERRCODE_DIVISOR_ZERO;
|
||||
}
|
||||
output[i] = input0[i] - floorf(input0[i] / input1[i]) * input1[i];
|
||||
}
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
||||
int ElementFloorDivFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
|
||||
for (int i = 0; i < element_size; ++i) {
|
||||
if (input1[i] == 0) {
|
||||
return NNACL_ERRCODE_DIVISOR_ZERO;
|
||||
}
|
||||
output[i] = floorf(input0[i] / input1[i]);
|
||||
}
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
||||
int ElementLogicalAndFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
|
||||
int block_mod = element_size % C8NUM;
|
||||
int block_c8 = element_size - block_mod;
|
||||
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t vtrue = {1, 1, 1, 1, 1, 1, 1, 1};
|
||||
float16x8_t vfalse = {0, 0, 0, 0, 0, 0, 0, 0};
|
||||
uint16x8_t mask = vmovq_n_u16(((uint16_t)(1u << 15) - 1));
|
||||
uint16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0};
|
||||
#endif
|
||||
for (int index = 0; index < block_c8; index += C8NUM) {
|
||||
#ifdef ENABLE_NEON
|
||||
uint16x8_t vin0 = vandq_u16(vreinterpretq_s16_f16(vld1q_f16(input0)), mask);
|
||||
uint16x8_t vin1 = vandq_u16(vreinterpretq_s16_f16(vld1q_f16(input1)), mask);
|
||||
float16x8_t vout = vbslq_f16(vceqq_u16(vandq_u16(vin0, vin1), zeros), vfalse, vtrue);
|
||||
vst1q_f16(output, vout);
|
||||
#else
|
||||
for (int i = 0; i < C8NUM; ++i) {
|
||||
output[i] = (float16_t)((bool)(input0[i]) & (bool)(input1[i]));
|
||||
}
|
||||
#endif
|
||||
input0 += C8NUM;
|
||||
input1 += C8NUM;
|
||||
output += C8NUM;
|
||||
}
|
||||
for (int index = 0; index < block_mod; ++index) {
|
||||
output[index] = (float16_t)((bool)(input0[index]) & (bool)(input1[index]));
|
||||
}
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
||||
int ElementLogicalOrFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
|
||||
int block_mod = element_size % C8NUM;
|
||||
int block_c8 = element_size - block_mod;
|
||||
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t vtrue = {1, 1, 1, 1, 1, 1, 1, 1};
|
||||
float16x8_t vfalse = {0, 0, 0, 0, 0, 0, 0, 0};
|
||||
uint16x8_t mask = vmovq_n_u16(((uint16_t)(1u << 15) - 1));
|
||||
uint16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0};
|
||||
#endif
|
||||
for (int index = 0; index < block_c8; index += C8NUM) {
|
||||
#ifdef ENABLE_NEON
|
||||
uint16x8_t vin0 = vandq_u16(vreinterpretq_s16_f16(vld1q_f16(input0)), mask);
|
||||
uint16x8_t vin1 = vandq_u16(vreinterpretq_s16_f16(vld1q_f16(input1)), mask);
|
||||
float16x8_t vout = vbslq_f16(vceqq_u16(vorrq_u16(vin0, vin1), zeros), vfalse, vtrue);
|
||||
vst1q_f16(output, vout);
|
||||
#else
|
||||
for (int i = 0; i < C8NUM; ++i) {
|
||||
output[i] = (float16_t)((bool)(input0[i]) | (bool)(input1[i]));
|
||||
}
|
||||
#endif
|
||||
input0 += C8NUM;
|
||||
input1 += C8NUM;
|
||||
output += C8NUM;
|
||||
}
|
||||
for (int index = 0; index < block_mod; ++index) {
|
||||
output[index] = (float16_t)((bool)(input0[index]) | (bool)(input1[index]));
|
||||
}
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
||||
int ElementSquaredDifferenceFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
|
||||
ElementSubFp16(input0, input1, output, element_size);
|
||||
return ElementMulFp16(output, output, output, element_size);
|
||||
}
|
||||
|
||||
int ElementMaximumFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
|
||||
int block_mod = element_size % C8NUM;
|
||||
int block_c8 = element_size - block_mod;
|
||||
for (int index = 0; index < block_c8; index += C8NUM) {
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t vin0 = vld1q_f16(input0);
|
||||
float16x8_t vin1 = vld1q_f16(input1);
|
||||
float16x8_t vout = vmaxq_f16(vin0, vin1);
|
||||
vst1q_f16(output, vout);
|
||||
#else
|
||||
for (int i = 0; i < C8NUM; ++i) {
|
||||
output[i] = MSMAX(input0[i], input1[i]);
|
||||
}
|
||||
#endif
|
||||
input0 += C8NUM;
|
||||
input1 += C8NUM;
|
||||
output += C8NUM;
|
||||
}
|
||||
for (int index = 0; index < block_mod; ++index) {
|
||||
output[index] = MSMAX(input0[index], input1[index]);
|
||||
}
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
||||
int ElementMinimumFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
|
||||
int block_mod = element_size % C8NUM;
|
||||
int block_c8 = element_size - block_mod;
|
||||
for (int index = 0; index < block_c8; index += C8NUM) {
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t vin0 = vld1q_f16(input0);
|
||||
float16x8_t vin1 = vld1q_f16(input1);
|
||||
float16x8_t vout = vminq_f16(vin0, vin1);
|
||||
vst1q_f16(output, vout);
|
||||
#else
|
||||
for (int i = 0; i < C8NUM; ++i) {
|
||||
output[i] = MSMIN(input0[i], input1[i]);
|
||||
}
|
||||
#endif
|
||||
input0 += C8NUM;
|
||||
input1 += C8NUM;
|
||||
output += C8NUM;
|
||||
}
|
||||
for (int index = 0; index < block_mod; ++index) {
|
||||
output[index] = MSMIN(input0[index], input1[index]);
|
||||
}
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
||||
int ElementNotEqualFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
|
||||
int block_mod = element_size % C8NUM;
|
||||
int block_c8 = element_size - block_mod;
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t vtrue = {1, 1, 1, 1, 1, 1, 1, 1};
|
||||
float16x8_t vfalse = {0, 0, 0, 0, 0, 0, 0, 0};
|
||||
#endif
|
||||
for (int index = 0; index < block_c8; index += C8NUM) {
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t vin0 = vld1q_f16(input0);
|
||||
float16x8_t vin1 = vld1q_f16(input1);
|
||||
float16x8_t vout = vbslq_f16(vceqq_f16(vin0, vin1), vfalse, vtrue);
|
||||
vst1q_f16(output, vout);
|
||||
#else
|
||||
for (int i = 0; i < C8NUM; ++i) {
|
||||
output[i] = (float16_t)(input0[i] != input1[i]);
|
||||
}
|
||||
#endif
|
||||
input0 += C8NUM;
|
||||
input1 += C8NUM;
|
||||
output += C8NUM;
|
||||
}
|
||||
for (int index = 0; index < block_mod; ++index) {
|
||||
output[index] = (float16_t)(input0[index] != input1[index]);
|
||||
}
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
||||
int ElementEqualFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
|
||||
int block_mod = element_size % C8NUM;
|
||||
int block_c8 = element_size - block_mod;
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t vtrue = {1, 1, 1, 1, 1, 1, 1, 1};
|
||||
float16x8_t vfalse = {0, 0, 0, 0, 0, 0, 0, 0};
|
||||
#endif
|
||||
for (int index = 0; index < block_c8; index += C8NUM) {
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t vin0 = vld1q_f16(input0);
|
||||
float16x8_t vin1 = vld1q_f16(input1);
|
||||
float16x8_t vout = vbslq_f16(vceqq_f16(vin0, vin1), vtrue, vfalse);
|
||||
vst1q_f16(output, vout);
|
||||
#else
|
||||
for (int i = 0; i < C8NUM; ++i) {
|
||||
output[i] = (float16_t)(input0[i] == input1[i]);
|
||||
}
|
||||
#endif
|
||||
input0 += C8NUM;
|
||||
input1 += C8NUM;
|
||||
output += C8NUM;
|
||||
}
|
||||
for (int index = 0; index < block_mod; ++index) {
|
||||
output[index] = (float16_t)(input0[index] == input1[index]);
|
||||
}
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
||||
int ElementLessFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
|
||||
int block_mod = element_size % C8NUM;
|
||||
int block_c8 = element_size - block_mod;
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t vtrue = {1, 1, 1, 1, 1, 1, 1, 1};
|
||||
float16x8_t vfalse = {0, 0, 0, 0, 0, 0, 0, 0};
|
||||
#endif
|
||||
for (int index = 0; index < block_c8; index += C8NUM) {
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t vin0 = vld1q_f16(input0);
|
||||
float16x8_t vin1 = vld1q_f16(input1);
|
||||
float16x8_t vout = vbslq_f16(vcltq_f16(vin0, vin1), vtrue, vfalse);
|
||||
vst1q_f16(output, vout);
|
||||
#else
|
||||
for (int i = 0; i < C8NUM; ++i) {
|
||||
output[i] = (float16_t)(input0[i] < input1[i]);
|
||||
}
|
||||
#endif
|
||||
input0 += C8NUM;
|
||||
input1 += C8NUM;
|
||||
output += C8NUM;
|
||||
}
|
||||
for (int index = 0; index < block_mod; ++index) {
|
||||
output[index] = (float16_t)(input0[index] < input1[index]);
|
||||
}
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
||||
int ElementLessEqualFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
|
||||
int block_mod = element_size % C8NUM;
|
||||
int block_c8 = element_size - block_mod;
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t vtrue = {1, 1, 1, 1, 1, 1, 1, 1};
|
||||
float16x8_t vfalse = {0, 0, 0, 0, 0, 0, 0, 0};
|
||||
#endif
|
||||
for (int index = 0; index < block_c8; index += C8NUM) {
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t vin0 = vld1q_f16(input0);
|
||||
float16x8_t vin1 = vld1q_f16(input1);
|
||||
float16x8_t vout = vbslq_f16(vcleq_f16(vin0, vin1), vtrue, vfalse);
|
||||
vst1q_f16(output, vout);
|
||||
#else
|
||||
for (int i = 0; i < C8NUM; ++i) {
|
||||
output[i] = (float16_t)(input0[i] <= input1[i]);
|
||||
}
|
||||
#endif
|
||||
input0 += C8NUM;
|
||||
input1 += C8NUM;
|
||||
output += C8NUM;
|
||||
}
|
||||
for (int index = 0; index < block_mod; ++index) {
|
||||
output[index] = (float16_t)(input0[index] <= input1[index]);
|
||||
}
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
||||
int ElementGreaterFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
|
||||
int block_mod = element_size % C8NUM;
|
||||
int block_c8 = element_size - block_mod;
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t vtrue = {1, 1, 1, 1, 1, 1, 1, 1};
|
||||
float16x8_t vfalse = {0, 0, 0, 0, 0, 0, 0, 0};
|
||||
#endif
|
||||
for (int index = 0; index < block_c8; index += C8NUM) {
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t vin0 = vld1q_f16(input0);
|
||||
float16x8_t vin1 = vld1q_f16(input1);
|
||||
float16x8_t vout = vbslq_f16(vcgtq_f16(vin0, vin1), vtrue, vfalse);
|
||||
vst1q_f16(output, vout);
|
||||
#else
|
||||
for (int i = 0; i < C8NUM; ++i) {
|
||||
output[i] = (float16_t)(input0[i] > input1[i]);
|
||||
}
|
||||
#endif
|
||||
input0 += C8NUM;
|
||||
input1 += C8NUM;
|
||||
output += C8NUM;
|
||||
}
|
||||
for (int index = 0; index < block_mod; ++index) {
|
||||
output[index] = (float16_t)(input0[index] > input1[index]);
|
||||
}
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
||||
int ElementGreaterEqualFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
|
||||
int block_mod = element_size % C8NUM;
|
||||
int block_c8 = element_size - block_mod;
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t vtrue = {1, 1, 1, 1, 1, 1, 1, 1};
|
||||
float16x8_t vfalse = {0, 0, 0, 0, 0, 0, 0, 0};
|
||||
#endif
|
||||
for (int index = 0; index < block_c8; index += C8NUM) {
|
||||
#ifdef ENABLE_NEON
|
||||
float16x8_t vin0 = vld1q_f16(input0);
|
||||
float16x8_t vin1 = vld1q_f16(input1);
|
||||
float16x8_t vout = vbslq_f16(vcgeq_f16(vin0, vin1), vtrue, vfalse);
|
||||
vst1q_f16(output, vout);
|
||||
#else
|
||||
for (int i = 0; i < C8NUM; ++i) {
|
||||
output[i] = (float16_t)(input0[i] >= input1[i]);
|
||||
}
|
||||
#endif
|
||||
input0 += C8NUM;
|
||||
input1 += C8NUM;
|
||||
output += C8NUM;
|
||||
}
|
||||
for (int index = 0; index < block_mod; ++index) {
|
||||
output[index] = (float16_t)(input0[index] >= input1[index]);
|
||||
}
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
|
|
@ -44,6 +44,28 @@ int ElementSubFp16(float16_t *input0, float16_t *input1, float16_t *output, int
|
|||
int ElementSubReluFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size);
|
||||
int ElementSubRelu6Fp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size);
|
||||
|
||||
int ElementDivFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size);
|
||||
int ElementDivReluFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size);
|
||||
int ElementDivRelu6Fp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size);
|
||||
|
||||
int ElementFloorModFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size);
|
||||
int ElementFloorDivFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size);
|
||||
|
||||
int ElementLogicalAndFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size);
|
||||
int ElementLogicalOrFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size);
|
||||
|
||||
int ElementSquaredDifferenceFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size);
|
||||
|
||||
int ElementMaximumFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size);
|
||||
int ElementMinimumFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size);
|
||||
|
||||
int ElementNotEqualFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size);
|
||||
int ElementEqualFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size);
|
||||
int ElementLessFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size);
|
||||
int ElementLessEqual(float16_t *input0, float16_t *input1, float16_t *output, int element_size);
|
||||
int ElementGreaterFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size);
|
||||
int ElementGreaterEqualFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size);
|
||||
|
||||
void TileDimensionsFp16(float16_t *data0, float16_t *data1, float16_t *tile_data0, float16_t *tile_data1,
|
||||
ArithmeticParameter *param);
|
||||
#ifdef __cplusplus
|
||||
|
|
Loading…
Reference in New Issue