optimize arithmetic
This commit is contained in:
parent
1002ee4887
commit
9f51871c66
1
build.sh
1
build.sh
|
@ -706,6 +706,7 @@ build_lite()
|
||||||
mkdir -p ${OUTPUT_DIR}/include/schema/
|
mkdir -p ${OUTPUT_DIR}/include/schema/
|
||||||
cp ${BASEPATH}/mindspore/lite/schema/*.h ${OUTPUT_DIR}/include/schema/
|
cp ${BASEPATH}/mindspore/lite/schema/*.h ${OUTPUT_DIR}/include/schema/
|
||||||
cp ${BASEPATH}/mindspore/lite/build/src/libmindspore-lite.so ${OUTPUT_DIR}/lib/
|
cp ${BASEPATH}/mindspore/lite/build/src/libmindspore-lite.so ${OUTPUT_DIR}/lib/
|
||||||
|
cp ${BASEPATH}/mindspore/lite/build/src/runtime/kernel/arm/nnacl/liboptimize.so ${OUTPUT_DIR}/lib/
|
||||||
mkdir -p ${OUTPUT_DIR}/third_party/flatbuffers
|
mkdir -p ${OUTPUT_DIR}/third_party/flatbuffers
|
||||||
cp -r ${BASEPATH}/third_party/flatbuffers/include/ ${OUTPUT_DIR}/third_party/flatbuffers/
|
cp -r ${BASEPATH}/third_party/flatbuffers/include/ ${OUTPUT_DIR}/third_party/flatbuffers/
|
||||||
cd ..
|
cd ..
|
||||||
|
|
|
@ -266,7 +266,8 @@ int Convolution3x3FP16CPUKernel::Run() {
|
||||||
}
|
}
|
||||||
auto input_tensor = in_tensors_.at(kInputIndex);
|
auto input_tensor = in_tensors_.at(kInputIndex);
|
||||||
auto ori_input_data = reinterpret_cast<float *>(input_tensor->Data());
|
auto ori_input_data = reinterpret_cast<float *>(input_tensor->Data());
|
||||||
for (int i = 0; i < input_tensor->ElementsNum(); ++i) {
|
auto input_element_num = input_tensor->ElementsNum();
|
||||||
|
for (int i = 0; i < input_element_num; ++i) {
|
||||||
fp16_input_[i] = (float16_t)ori_input_data[i];
|
fp16_input_[i] = (float16_t)ori_input_data[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -285,7 +286,9 @@ int Convolution3x3FP16CPUKernel::Run() {
|
||||||
// cast fp16 out to fp32 data
|
// cast fp16 out to fp32 data
|
||||||
auto out_tensor = out_tensors_.at(kOutputIndex);
|
auto out_tensor = out_tensors_.at(kOutputIndex);
|
||||||
auto output_addr = reinterpret_cast<float *>(out_tensor->Data());
|
auto output_addr = reinterpret_cast<float *>(out_tensor->Data());
|
||||||
for (int j = 0; j < out_tensor->ElementsNum(); ++j) {
|
auto output_element_num = out_tensor->ElementsNum();
|
||||||
|
|
||||||
|
for (int j = 0; j < output_element_num; ++j) {
|
||||||
output_addr[j] = static_cast<float>(fp16_out_[j]);
|
output_addr[j] = static_cast<float>(fp16_out_[j]);
|
||||||
}
|
}
|
||||||
return RET_OK;
|
return RET_OK;
|
||||||
|
|
|
@ -231,7 +231,8 @@ int ConvolutionFP16CPUKernel::Run() {
|
||||||
}
|
}
|
||||||
auto input_tensor = in_tensors_.at(kInputIndex);
|
auto input_tensor = in_tensors_.at(kInputIndex);
|
||||||
auto ori_input_data = reinterpret_cast<float *>(input_tensor->Data());
|
auto ori_input_data = reinterpret_cast<float *>(input_tensor->Data());
|
||||||
for (int i = 0; i < input_tensor->ElementsNum(); ++i) {
|
auto input_element_num = input_tensor->ElementsNum();
|
||||||
|
for (int i = 0; i < input_element_num; ++i) {
|
||||||
fp16_input_[i] = (float16_t)ori_input_data[i];
|
fp16_input_[i] = (float16_t)ori_input_data[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -250,7 +251,8 @@ int ConvolutionFP16CPUKernel::Run() {
|
||||||
// cast fp16 out to fp32 data
|
// cast fp16 out to fp32 data
|
||||||
auto out_tensor = out_tensors_.at(kOutputIndex);
|
auto out_tensor = out_tensors_.at(kOutputIndex);
|
||||||
auto output_addr = reinterpret_cast<float *>(out_tensor->Data());
|
auto output_addr = reinterpret_cast<float *>(out_tensor->Data());
|
||||||
for (int j = 0; j < out_tensor->ElementsNum(); ++j) {
|
auto output_element_num = out_tensor->ElementsNum();
|
||||||
|
for (int j = 0; j < output_element_num; ++j) {
|
||||||
output_addr[j] = static_cast<float>(fp16_out_[j]);
|
output_addr[j] = static_cast<float>(fp16_out_[j]);
|
||||||
}
|
}
|
||||||
return RET_OK;
|
return RET_OK;
|
||||||
|
|
|
@ -51,10 +51,36 @@ int ArithmeticCPUKernel::Init() {
|
||||||
|
|
||||||
int ArithmeticCPUKernel::ReSize() {
|
int ArithmeticCPUKernel::ReSize() {
|
||||||
FreeTileData();
|
FreeTileData();
|
||||||
auto element_num = out_tensors_[0]->ElementsNum();
|
arithmeticParameter_->in_elements_num0_ = in_tensors_[0]->ElementsNum();
|
||||||
|
arithmeticParameter_->in_elements_num1_ = in_tensors_[1]->ElementsNum();
|
||||||
|
arithmeticParameter_->out_elements_num_ = out_tensors_[0]->ElementsNum();
|
||||||
|
|
||||||
|
if (arithmeticParameter_->in_elements_num0_ == 1 || arithmeticParameter_->in_elements_num1_ == 1) {
|
||||||
|
if (arithmeticParameter_->activation_type_ == schema::ActivationType_NO_ACTIVATION) {
|
||||||
|
switch (arithmeticParameter_->op_parameter_.type_) {
|
||||||
|
case PrimitiveType_Mul:
|
||||||
|
arithmeticParameter_->broadcasting_ = false;
|
||||||
|
arithmetic_opt_run_ = ElementOptMul;
|
||||||
|
break;
|
||||||
|
case PrimitiveType_Add:
|
||||||
|
arithmeticParameter_->broadcasting_ = false;
|
||||||
|
arithmetic_opt_run_ = ElementOptAdd;
|
||||||
|
break;
|
||||||
|
case PrimitiveType_Sub:
|
||||||
|
arithmeticParameter_->broadcasting_ = false;
|
||||||
|
arithmetic_opt_run_ = ElementOptSub;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (arithmeticParameter_->broadcasting_) {
|
||||||
|
tile_data0_ = new float[arithmeticParameter_->out_elements_num_];
|
||||||
|
tile_data1_ = new float[arithmeticParameter_->out_elements_num_];
|
||||||
|
}
|
||||||
|
|
||||||
tile_data0_ = new float[element_num];
|
|
||||||
tile_data1_ = new float[element_num];
|
|
||||||
return RET_OK;
|
return RET_OK;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -77,7 +103,17 @@ int ArithmeticCPUKernel::DoArithmetic(int task_id) {
|
||||||
if (arithmeticParameter_->broadcasting_) {
|
if (arithmeticParameter_->broadcasting_) {
|
||||||
error_code = arithmetic_run_(tile_data0_ + stride * task_id, tile_data1_ + stride * task_id,
|
error_code = arithmetic_run_(tile_data0_ + stride * task_id, tile_data1_ + stride * task_id,
|
||||||
output_data + stride * task_id, count);
|
output_data + stride * task_id, count);
|
||||||
|
} else if (arithmetic_opt_run_ != nullptr) {
|
||||||
|
if (arithmeticParameter_->in_elements_num0_ == 1) {
|
||||||
|
error_code = arithmetic_opt_run_(input0_data, input1_data1 + stride * task_id, output_data + stride * task_id,
|
||||||
|
count, arithmeticParameter_);
|
||||||
|
} else if (arithmeticParameter_->in_elements_num1_ == 1) {
|
||||||
|
error_code = arithmetic_opt_run_(input0_data + stride * task_id, input1_data1, output_data + stride * task_id,
|
||||||
|
count, arithmeticParameter_);
|
||||||
|
} else {
|
||||||
|
error_code = arithmetic_opt_run_(input0_data + stride * task_id, input1_data1 + stride * task_id,
|
||||||
|
output_data + stride * task_id, count, arithmeticParameter_);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
error_code = arithmetic_run_(input0_data + stride * task_id, input1_data1 + stride * task_id,
|
error_code = arithmetic_run_(input0_data + stride * task_id, input1_data1 + stride * task_id,
|
||||||
output_data + stride * task_id, count);
|
output_data + stride * task_id, count);
|
||||||
|
@ -104,6 +140,7 @@ int ArithmeticCPUKernel::Run() {
|
||||||
MS_LOG(ERROR) << "Prepare fail!ret: " << ret;
|
MS_LOG(ERROR) << "Prepare fail!ret: " << ret;
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (arithmeticParameter_->broadcasting_) {
|
if (arithmeticParameter_->broadcasting_) {
|
||||||
auto input_data0 = reinterpret_cast<float *>(in_tensors_[0]->Data());
|
auto input_data0 = reinterpret_cast<float *>(in_tensors_[0]->Data());
|
||||||
auto input_data1 = reinterpret_cast<float *>(in_tensors_[1]->Data());
|
auto input_data1 = reinterpret_cast<float *>(in_tensors_[1]->Data());
|
||||||
|
|
|
@ -43,6 +43,8 @@ using mindspore::schema::PrimitiveType_Sub;
|
||||||
namespace mindspore::kernel {
|
namespace mindspore::kernel {
|
||||||
class ArithmeticCPUKernel : public LiteKernel {
|
class ArithmeticCPUKernel : public LiteKernel {
|
||||||
typedef int (*ArithmeticRun)(float *input0, float *input1, float *output, int element_size);
|
typedef int (*ArithmeticRun)(float *input0, float *input1, float *output, int element_size);
|
||||||
|
typedef int (*ArithmeticOptRun)(float *input0, float *input1, float *output, int element_size,
|
||||||
|
ArithmeticParameter *param);
|
||||||
typedef int (*ArithmeticBroadcastRun)(float *input0, float *input1, float *tile_input0, float *tile_input1,
|
typedef int (*ArithmeticBroadcastRun)(float *input0, float *input1, float *tile_input0, float *tile_input1,
|
||||||
float *output, int element_size, ArithmeticParameter *param);
|
float *output, int element_size, ArithmeticParameter *param);
|
||||||
|
|
||||||
|
@ -177,8 +179,9 @@ class ArithmeticCPUKernel : public LiteKernel {
|
||||||
float *tile_data0_ = nullptr;
|
float *tile_data0_ = nullptr;
|
||||||
float *tile_data1_ = nullptr;
|
float *tile_data1_ = nullptr;
|
||||||
ArithmeticParameter *arithmeticParameter_;
|
ArithmeticParameter *arithmeticParameter_;
|
||||||
ArithmeticRun arithmetic_run_;
|
ArithmeticRun arithmetic_run_ = nullptr;
|
||||||
ArithmeticBroadcastRun arithmetic_broadcast_run_;
|
ArithmeticBroadcastRun arithmetic_broadcast_run_ = nullptr;
|
||||||
|
ArithmeticOptRun arithmetic_opt_run_ = nullptr;
|
||||||
};
|
};
|
||||||
} // namespace mindspore::kernel
|
} // namespace mindspore::kernel
|
||||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_ARITHMETIC_H_
|
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_ARITHMETIC_H_
|
||||||
|
|
|
@ -277,7 +277,8 @@ kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector<lite::tensor::Ten
|
||||||
kernel =
|
kernel =
|
||||||
new (std::nothrow) kernel::ConvolutionWinogradCPUKernel(opParameter, inputs, outputs, ctx, primitive, out_unit);
|
new (std::nothrow) kernel::ConvolutionWinogradCPUKernel(opParameter, inputs, outputs, ctx, primitive, out_unit);
|
||||||
} else if (use_sw) {
|
} else if (use_sw) {
|
||||||
kernel = new (std::nothrow) kernel::ConvolutionSWCPUKernel(opParameter, inputs, outputs, ctx, primitive);
|
// kernel = new (std::nothrow) kernel::ConvolutionSWCPUKernel(opParameter, inputs, outputs, ctx, primitive);
|
||||||
|
kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(opParameter, inputs, outputs, ctx, primitive);
|
||||||
} else {
|
} else {
|
||||||
kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(opParameter, inputs, outputs, ctx, primitive);
|
kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(opParameter, inputs, outputs, ctx, primitive);
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,8 +29,12 @@ typedef struct ArithmeticParameter {
|
||||||
size_t ndim_;
|
size_t ndim_;
|
||||||
int activation_type_;
|
int activation_type_;
|
||||||
int in_shape0_[5];
|
int in_shape0_[5];
|
||||||
|
int in_elements_num0_;
|
||||||
int in_shape1_[5];
|
int in_shape1_[5];
|
||||||
|
int in_elements_num1_;
|
||||||
|
|
||||||
int out_shape_[5];
|
int out_shape_[5];
|
||||||
|
int out_elements_num_;
|
||||||
|
|
||||||
int in_strides0_[5];
|
int in_strides0_[5];
|
||||||
int in_strides1_[5];
|
int in_strides1_[5];
|
||||||
|
|
|
@ -19,6 +19,57 @@
|
||||||
|
|
||||||
#define ACCURACY_DATA 0.00000001
|
#define ACCURACY_DATA 0.00000001
|
||||||
|
|
||||||
|
// Multiplies two float buffers and stores element_size products in output.
// If param->in_elements_num0_ is 1, input0[0] is multiplied with every element
// of input1; otherwise, if param->in_elements_num1_ is 1, every element of
// input0 is multiplied with input1[0]; otherwise both inputs are read index by
// index. Always returns NNACL_OK.
int ElementOptMul(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param) {
  int k = 0;
  if (param->in_elements_num0_ == 1) {
    // Left operand is a single value: only input0[0] is ever read.
    while (k < element_size) {
      output[k] = input0[0] * input1[k];
      ++k;
    }
    return NNACL_OK;
  }
  if (param->in_elements_num1_ == 1) {
    // Right operand is a single value: only input1[0] is ever read.
    while (k < element_size) {
      output[k] = input0[k] * input1[0];
      ++k;
    }
    return NNACL_OK;
  }
  // Neither operand is a single value: plain element-wise product.
  while (k < element_size) {
    output[k] = input0[k] * input1[k];
    ++k;
  }
  return NNACL_OK;
}
|
||||||
|
|
||||||
|
// Subtracts input1 from input0 and stores element_size differences in output.
// If param->in_elements_num0_ is 1, input0[0] is the minuend for every element
// of input1; otherwise, if param->in_elements_num1_ is 1, input1[0] is
// subtracted from every element of input0; otherwise both inputs are read index
// by index. Always returns NNACL_OK.
int ElementOptSub(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param) {
  int k = 0;
  if (param->in_elements_num0_ == 1) {
    // Left operand is a single value: only input0[0] is ever read.
    while (k < element_size) {
      output[k] = input0[0] - input1[k];
      ++k;
    }
    return NNACL_OK;
  }
  if (param->in_elements_num1_ == 1) {
    // Right operand is a single value: only input1[0] is ever read.
    while (k < element_size) {
      output[k] = input0[k] - input1[0];
      ++k;
    }
    return NNACL_OK;
  }
  // Neither operand is a single value: plain element-wise difference.
  while (k < element_size) {
    output[k] = input0[k] - input1[k];
    ++k;
  }
  return NNACL_OK;
}
|
||||||
|
|
||||||
|
// Adds two float buffers and stores element_size sums in output.
// If param->in_elements_num0_ is 1, input0[0] is added to every element of
// input1; otherwise, if param->in_elements_num1_ is 1, input1[0] is added to
// every element of input0; otherwise both inputs are read index by index.
// Always returns NNACL_OK.
int ElementOptAdd(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param) {
  int k = 0;
  if (param->in_elements_num0_ == 1) {
    // Left operand is a single value: only input0[0] is ever read.
    while (k < element_size) {
      output[k] = input0[0] + input1[k];
      ++k;
    }
    return NNACL_OK;
  }
  if (param->in_elements_num1_ == 1) {
    // Right operand is a single value: only input1[0] is ever read.
    while (k < element_size) {
      output[k] = input0[k] + input1[0];
      ++k;
    }
    return NNACL_OK;
  }
  // Neither operand is a single value: plain element-wise sum.
  while (k < element_size) {
    output[k] = input0[k] + input1[k];
    ++k;
  }
  return NNACL_OK;
}
|
||||||
|
|
||||||
int ElementMul(float *input0, float *input1, float *output, int element_size) {
|
int ElementMul(float *input0, float *input1, float *output, int element_size) {
|
||||||
int block_mod = element_size % C4NUM;
|
int block_mod = element_size % C4NUM;
|
||||||
int block_c4 = element_size - block_mod;
|
int block_c4 = element_size - block_mod;
|
||||||
|
|
|
@ -26,6 +26,9 @@
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
// Add / Sub / Mul over float buffers with a path for a one-element operand.
// Each writes element_size results to output and returns NNACL_OK. When
// param->in_elements_num0_ is 1, input0[0] is combined with every element of
// input1; otherwise, when param->in_elements_num1_ is 1, every element of input0
// is combined with input1[0]; otherwise the inputs are combined index by index.
int ElementOptAdd(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param);
|
||||||
|
int ElementOptSub(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param);
|
||||||
|
int ElementOptMul(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param);
|
||||||
int ElementMul(float *input0, float *input1, float *output, int element_size);
|
int ElementMul(float *input0, float *input1, float *output, int element_size);
|
||||||
int ElementMulRelu(float *input0, float *input1, float *output, int element_size);
|
int ElementMulRelu(float *input0, float *input1, float *output, int element_size);
|
||||||
int ElementMulRelu6(float *input0, float *input1, float *output, int element_size);
|
int ElementMulRelu6(float *input0, float *input1, float *output, int element_size);
|
||||||
|
|
Loading…
Reference in New Issue