add exp fp16 kernel

sunsuodong 2021-07-13 16:06:17 +08:00
parent d24c508bf0
commit ce62249782
11 changed files with 239 additions and 114 deletions

View File

@@ -0,0 +1,33 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_NNACL_EXP_PARAMETER_H_
#define MINDSPORE_NNACL_EXP_PARAMETER_H_
#include "nnacl/op_base.h"
typedef struct ExpParameter {
// Primitive parameter
OpParameter op_parameter_;
float base_;
float scale_;
float shift_;
// other parameter
float in_scale_;
float out_scale_;
int element_num_;
} ExpParameter;
#endif // MINDSPORE_NNACL_EXP_PARAMETER_H_
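
For context, ExpFusion evaluates y = base^(scale * x + shift). The two derived fields above fold that into a single exponential, matching how the kernel's Init() computes them later in this diff (base_ == -1 is the sentinel for the natural base e):

$$y=\mathrm{base}^{\mathrm{scale}\cdot x+\mathrm{shift}}=\underbrace{\mathrm{base}^{\mathrm{shift}}}_{out\_scale\_}\cdot\exp\big(\underbrace{\mathrm{scale}\cdot\ln(\mathrm{base})}_{in\_scale\_}\cdot x\big)$$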

View File

@@ -13,12 +13,22 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nnacl/fp16/exp_fp16.h"
#include <math.h>
#include <string.h>
#include "nnacl/errorcode.h"
#if defined(ENABLE_NEON)
static inline void simd_exp_fp16(float16x8_t input, float16_t *dst) {
static float16x8_t maxv = {88.0f, 88.0f, 88.0f, 88.0f, 88.0f, 88.0f, 88.0f, 88.0f};
static float16x8_t minv = {-88.0f, -88.0f, -88.0f, -88.0f, -88.0f, -88.0f, -88.0f, -88.0f};
input = vmaxq_f16(minv, vminq_f16(input, maxv));
float32x4_t input_low = vcvt_f32_f16(vget_low_f16(input));
float32x4_t input_high = vcvt_f32_f16(vget_high_f16(input));
vst1q_f16(dst, vcombine_f16(vcvt_f16_f32(VexpFp32(input_low)), vcvt_f16_f32(VexpFp32(input_high))));
}
#endif
void ExpFp16(const float16_t *src, float16_t *dst, int num) {
int i = 0;
#ifdef ENABLE_NEON
@@ -31,3 +41,42 @@ void ExpFp16(const float16_t *src, float16_t *dst, int num) {
single_exp_fp16(src[i], dst + i);
}
}
int ExpFusionFp16(const float16_t *src, float16_t *dst, const ExpParameter *param, int task_id) {
if (param->op_parameter_.thread_num_ == 0) {
return NNACL_PARAM_INVALID;
}
int stride = UP_DIV(param->element_num_, param->op_parameter_.thread_num_);
int start = stride * task_id;
int end = MSMIN(param->element_num_, start + stride);
int num = end - start;
if (param->scale_ == 1) {
ExpFp16(src + start, dst + start, num);
} else {
int i = 0;
#ifdef ENABLE_ARM64
MS_FLOAT16X8 scale = MS_MOVQ_F16(param->in_scale_);
int count = (num / C8NUM) * C8NUM;
for (; i < count; i += C8NUM) {
simd_exp_fp16(MS_MULQ_F16(MS_LDQ_F16(src + start + i), scale), dst + start + i);
}
#endif
for (; i < num; ++i) {
single_exp_fp16(src[start + i] * param->in_scale_, dst + start + i);
}
}
if (param->out_scale_ != 1) {
// dst already holds the exp result, so only the output scaling remains.
int i = 0;
#ifdef ENABLE_ARM64
MS_FLOAT16X8 scale = MS_MOVQ_F16(param->out_scale_);
int count = (num / C8NUM) * C8NUM;
for (; i < count; i += C8NUM) {
MS_STQ_F16(dst + start + i, MS_MULQ_F16(MS_LDQ_F16(dst + start + i), scale));
}
#endif
for (; i < num; ++i) {
dst[start + i] *= param->out_scale_;
}
}
return NNACL_OK;
}
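
ExpFusionFp16 carves the flat element range into per-thread slices with a ceiling-divide stride. A minimal standalone sketch of that partitioning (UP_DIV and MSMIN are redefined locally for illustration; nnacl provides them via op_base.h):

#include <stdio.h>

#define UP_DIV(x, y) (((x) + (y) - 1) / (y)) /* ceiling division */
#define MSMIN(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
  int element_num = 100, thread_num = 4;
  int stride = UP_DIV(element_num, thread_num); /* 25 */
  for (int task_id = 0; task_id < thread_num; ++task_id) {
    int start = stride * task_id;
    int end = MSMIN(element_num, start + stride);
    /* prints task 0: [0, 25) ... task 3: [75, 100); with 101 elements the
       stride becomes 26 and the last task gets the short tail [78, 101). */
    printf("task %d handles [%d, %d)\n", task_id, start, end);
  }
  return 0;
}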

View File

@@ -13,46 +13,19 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_NNACL_FP16_EXP_H_
#define MINDSPORE_NNACL_FP16_EXP_H_
#ifndef MINDSPORE_NNACL_FP16_EXP_FP16_H_
#define MINDSPORE_NNACL_FP16_EXP_FP16_H_
#include "nnacl/op_base.h"
#include "nnacl/exp_parameter.h"
#include "nnacl/fp32/exp_fp32.h"
#include "nnacl/intrinsics/ms_simd_instructions_fp16.h"
#ifdef __cplusplus
extern "C" {
#endif
void ExpFp16(const float16_t *src, float16_t *dst, int num);
#if defined(ENABLE_NEON)
static inline float32x4_t exp_fp32(float32x4_t input) {
static float32x4_t param[] = {{0.693147f, 0.693147f, 0.693147f, 0.693147f},
{1.0f / 120, 1.0f / 120, 1.0f / 120, 1.0f / 120},
{1.0f / 24, 1.0f / 24, 1.0f / 24, 1.0f / 24},
{1.0f / 6, 1.0f / 6, 1.0f / 6, 1.0f / 6},
{0.5f, 0.5f, 0.5f, 0.5f},
{1.0f, 1.0f, 1.0f, 1.0f}};
int32x4_t integer = vcvtq_s32_f32(input / param[0]);
float32x4_t decimal = input - vcvtq_f32_s32(integer) * param[0];
int32x4_t int_exp = vshlq_s32((integer + vmovq_n_s32(127)), vmovq_n_s32(23));
float32x4_t decimal_exp =
param[5] +
decimal * (param[5] + decimal * (param[4] + decimal * (param[3] + decimal * (param[2] + decimal * param[1]))));
decimal_exp = decimal_exp * vld1q_f32((float *)(&int_exp));
return decimal_exp;
}
static inline void simd_exp_fp16(float16x8_t input, float16_t *dst) {
static float16x8_t maxv = {88.0f, 88.0f, 88.0f, 88.0f, 88.0f, 88.0f, 88.0f, 88.0f};
static float16x8_t minv = {-88.0f, -88.0f, -88.0f, -88.0f, -88.0f, -88.0f, -88.0f, -88.0f};
input = vmaxq_f16(minv, vminq_f16(input, maxv));
float32x4_t input_low = vcvt_f32_f16(vget_low_f16(input));
float32x4_t input_high = vcvt_f32_f16(vget_high_f16(input));
vst1q_f16(dst, vcombine_f16(vcvt_f16_f32(exp_fp32(input_low)), vcvt_f16_f32(exp_fp32(input_high))));
}
#endif
int ExpFusionFp16(const float16_t *src, float16_t *dst, const ExpParameter *param, int task_id);
static inline void single_exp_fp16(float16_t src, float16_t *dst) {
static float param[] = {0.693147f, 1.0f / 120, 1.0f / 24, 1.0f / 6, 1.0f / 2, 1.0f};
@@ -64,8 +37,9 @@ static inline void single_exp_fp16(float16_t src, float16_t *dst) {
1.0f + decimal * (1.0f + decimal * (0.5f + decimal * (param[3] + decimal * (param[2] + decimal * param[1]))));
*dst = (float16_t)(*((float *)&int_exp) * decimal_exp);
}
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_NNACL_FP16_EXP_H_
#endif // MINDSPORE_NNACL_FP16_EXP_FP16_H_
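
The scalar single_exp_fp16 above and the SIMD kernels implement one shared approximation; as a sketch of the math behind the constants (0.693147 ≈ ln 2, the reciprocal-factorial coefficients, the (n + 127) << 23 exponent trick, and the ±88 clamp):

$$x=n\ln 2+r,\quad n=\mathrm{trunc}(x/\ln 2),\qquad e^{x}=2^{n}e^{r}\approx 2^{n}\Big(1+r+\tfrac{r^{2}}{2!}+\tfrac{r^{3}}{3!}+\tfrac{r^{4}}{4!}+\tfrac{r^{5}}{5!}\Big)$$

Here 2^n is materialized directly as IEEE-754 bits by writing (n + 127) << 23 into the exponent field, and inputs are clamped to [-88, 88] so the result stays finite in fp32.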

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -19,27 +19,6 @@
#include <string.h>
#include "nnacl/errorcode.h"
int Exp(const float *input_data, float *output_data, const ExpParameter *parameter, int task_id) {
if (parameter->thread_num_ == 0) {
return NNACL_PARAM_INVALID;
}
if (parameter->scale_ == 1) {
for (size_t i = task_id; i < parameter->element_num_; i += parameter->thread_num_) {
output_data[i] = expf(input_data[i]);
}
} else {
for (size_t i = task_id; i < parameter->element_num_; i += parameter->thread_num_) {
output_data[i] = expf(input_data[i] * parameter->in_scale_);
}
}
if (parameter->out_scale_ != 1) {
for (size_t i = task_id; i < parameter->element_num_; i += parameter->thread_num_) {
output_data[i] = output_data[i] * parameter->out_scale_;
}
}
return NNACL_OK;
}
void ExpFp32(const float *src, float *dst, int num) {
int i = 0;
#ifdef ENABLE_ARM64
@@ -52,3 +31,42 @@ void ExpFp32(const float *src, float *dst, int num) {
single_exp(src[i], dst + i);
}
}
int ExpFusionFp32(const float *src, float *dst, const ExpParameter *param, int task_id) {
if (param->op_parameter_.thread_num_ == 0) {
return NNACL_PARAM_INVALID;
}
int stride = UP_DIV(param->element_num_, param->op_parameter_.thread_num_);
int start = stride * task_id;
int end = MSMIN(param->element_num_, start + stride);
int num = end - start;
if (param->scale_ == 1) {
ExpFp32(src + start, dst + start, num);
} else {
int i = 0;
#ifdef ENABLE_ARM64
MS_FLOAT32X4 scale = MS_MOVQ_F32(param->in_scale_);
int count = (num / C4NUM) * C4NUM;
for (; i < count; i += C4NUM) {
simd_exp(MS_MULQ_F32(MS_LDQ_F32(src + start + i), scale), dst + start + i);
}
#endif
for (; i < num; ++i) {
single_exp(src[start + i] * param->in_scale_, dst + start + i);
}
}
if (param->out_scale_ != 1) {
// dst already holds the exp result, so only the output scaling remains.
int i = 0;
#ifdef ENABLE_ARM64
MS_FLOAT32X4 scale = MS_MOVQ_F32(param->out_scale_);
int count = (num / C4NUM) * C4NUM;
for (; i < count; i += C4NUM) {
MS_STQ_F32(dst + start + i, MS_MULQ_F32(MS_LDQ_F32(dst + start + i), scale));
}
#endif
for (; i < num; ++i) {
dst[start + i] *= param->out_scale_;
}
}
return NNACL_OK;
}
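
A hedged usage sketch (not part of this commit) of the fp32 entry point for a plain single-threaded exp; the field names match the ExpParameter struct introduced above:

#include "nnacl/fp32/exp_fp32.h"

/* Plain exp over n elements on one thread: scale_ == 1 routes through
   ExpFp32, and out_scale_ == 1 skips the output-scaling pass. */
void exp_once(const float *in, float *out, int n) {
  ExpParameter param = {0};
  param.op_parameter_.thread_num_ = 1;
  param.element_num_ = n;
  param.scale_ = 1.0f;
  param.in_scale_ = 1.0f;
  param.out_scale_ = 1.0f;
  ExpFusionFp32(in, out, &param, 0 /* task_id */);
}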

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -18,48 +18,40 @@
#define MINDSPORE_NNACL_FP32_EXP_H_
#include "nnacl/op_base.h"
#include "nnacl/exp_parameter.h"
#if defined(ENABLE_ARM) || defined(ENABLE_SSE)
#include "nnacl/intrinsics/ms_simd_instructions.h"
#endif
typedef struct ExpParameter {
// Primitive parameter
OpParameter op_parameter_;
float base_;
float scale_;
float shift_;
// other parameter
int thread_num_;
float in_scale_;
float out_scale_;
int element_num_;
} ExpParameter;
#ifdef __cplusplus
extern "C" {
#endif
int Exp(const float *input_data, float *output_data, const ExpParameter *parameter, int task_id);
void ExpFp32(const float *src, float *dst, int num);
int ExpFusionFp32(const float *src, float *dst, const ExpParameter *param, int task_id);
#if defined(ENABLE_ARM) || defined(ENABLE_SSE)
static inline void simd_exp(MS_FLOAT32X4 input, float *dst) {
static MS_FLOAT32X4 maxv = {88.0f, 88.0f, 88.0f, 88.0f};
static MS_FLOAT32X4 minv = {-88.0f, -88.0f, -88.0f, -88.0f};
static inline MS_FLOAT32X4 VexpFp32(MS_FLOAT32X4 input) {
static MS_FLOAT32X4 param[] = {{0.693147f, 0.693147f, 0.693147f, 0.693147f},
{1.0f / 120, 1.0f / 120, 1.0f / 120, 1.0f / 120},
{1.0f / 24, 1.0f / 24, 1.0f / 24, 1.0f / 24},
{1.0f / 6, 1.0f / 6, 1.0f / 6, 1.0f / 6},
{0.5f, 0.5f, 0.5f, 0.5f},
{1.0f, 1.0f, 1.0f, 1.0f}};
input = MS_MAXQ_F32(minv, MS_MINQ_F32(input, maxv));
MS_INT32X4 integer = MS_CVTQPS_EPI32(MS_DIVQ_F32(input, param[0]));
MS_FLOAT32X4 decimal = MS_SUBQ_F32(input, MS_MULQ_F32(MS_CVTQEPI32_PS(integer), param[0]));
MS_INT32X4 int_exp = MS_SLLIQ_EPI32(MS_ADDQ_EPI32(integer, MS_MOVQ_EPI32(127)), 23);
MS_FLOAT32X4 tmp = MS_MULQ_F32(decimal, (MS_ADDQ_F32(param[2], MS_MULQ_F32(decimal, param[1]))));
tmp = MS_MULQ_F32(decimal, MS_ADDQ_F32(param[4], MS_MULQ_F32(decimal, MS_ADDQ_F32(param[3], tmp))));
MS_FLOAT32X4 decimal_exp = MS_ADDQ_F32(param[5], MS_MULQ_F32(decimal, MS_ADDQ_F32(param[5], tmp)));
MS_STQ_F32(dst, MS_MULQ_F32(decimal_exp, MS_CAST_F32_S32(int_exp)));
return MS_MULQ_F32(decimal_exp, MS_CAST_F32_S32(int_exp));
}
static inline void simd_exp(MS_FLOAT32X4 input, float *dst) {
static MS_FLOAT32X4 maxv = {88.0f, 88.0f, 88.0f, 88.0f};
static MS_FLOAT32X4 minv = {-88.0f, -88.0f, -88.0f, -88.0f};
input = MS_MAXQ_F32(minv, MS_MINQ_F32(input, maxv));
MS_STQ_F32(dst, VexpFp32(input));
}
#endif
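
Since VexpFp32 is now the shared core for both the fp32 and fp16 paths (the fp16 kernel converts each half-register through float32x4), a scalar mirror is handy for spot-checking the intrinsics. A sketch, not part of the commit; compare against expf from math.h:

#include <math.h>
#include <stdint.h>
#include <string.h>

static float vexp_scalar(float x) {
  x = fmaxf(-88.0f, fminf(x, 88.0f));    /* clamp, as simd_exp does        */
  int n = (int)(x / 0.693147f);          /* truncation, as in the vectors  */
  float r = x - (float)n * 0.693147f;    /* r = x - n*ln2                  */
  int32_t bits = (n + 127) << 23;        /* 2^n via the exponent field     */
  float pow2n;
  memcpy(&pow2n, &bits, sizeof(pow2n));  /* safer than the pointer cast    */
  float p = 1.0f + r * (1.0f + r * (0.5f + r * (1.0f / 6 + r * (1.0f / 24 + r * (1.0f / 120)))));
  return pow2n * p;                      /* e^x = 2^n * e^r                */
}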

View File

@@ -24,7 +24,6 @@ using mindspore::schema::PrimitiveType_ExpFusion;
namespace mindspore::lite::micro::nnacl {
int ExpFP32Coder::Prepare(CoderContext *context) {
exp_parameter_ = reinterpret_cast<ExpParameter *>(parameter_);
exp_parameter_->thread_num_ = exp_parameter_->op_parameter_.thread_num_;
float log_ = (exp_parameter_->base_ == -1) ? 1 : logf(exp_parameter_->base_);
exp_parameter_->in_scale_ = exp_parameter_->scale_ * log_;
if (exp_parameter_->shift_ == 0) {

View File

@@ -129,8 +129,8 @@ void NNaclFp32Serializer::CodeStruct(const std::string &name, const SpliceParameter
void NNaclFp32Serializer::CodeStruct(const std::string &name, const ExpParameter &exp_parameter) {
CodeBaseStruct("ExpParameter", name, exp_parameter.op_parameter_, exp_parameter.base_, exp_parameter.scale_,
exp_parameter.shift_, exp_parameter.thread_num_, exp_parameter.in_scale_, exp_parameter.out_scale_,
exp_parameter.element_num_);
exp_parameter.shift_, exp_parameter.op_parameter_.thread_num_, exp_parameter.in_scale_,
exp_parameter.out_scale_, exp_parameter.element_num_);
}
void NNaclFp32Serializer::CodeStruct(const std::string &name, const StridedSliceParameter &strided_slice_parameter) {

View File

@@ -0,0 +1,34 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp16/exp_fp16.h"
#include "nnacl/fp16/exp_fp16.h"
#include "include/errorcode.h"
#include "src/kernel_registry.h"
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_ExpFusion;
namespace mindspore::kernel {
int ExpFp16CPUKernel::DoExcute(int task_id) {
ExpFusionFp16(reinterpret_cast<float16_t *>(input_addr_), reinterpret_cast<float16_t *>(output_addr_), param_,
task_id);
return RET_OK;
}
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_ExpFusion, LiteKernelCreator<ExpFp16CPUKernel>)
} // namespace mindspore::kernel

View File

@@ -0,0 +1,34 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_EXP_FP16_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_EXP_FP16_H_
#include <vector>
#include "src/runtime/kernel/arm/fp32/exp_fp32.h"
namespace mindspore::kernel {
class ExpFp16CPUKernel : public ExpCPUKernel {
public:
explicit ExpFp16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: ExpCPUKernel(parameter, inputs, outputs, ctx) {}
~ExpFp16CPUKernel() override{};
int DoExcute(int task_id) override;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_EXP_FP16_H_

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,7 +13,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp32/exp_fp32.h"
#include <cmath>
#include "include/errorcode.h"
@@ -26,34 +25,31 @@ using mindspore::schema::PrimitiveType_ExpFusion;
namespace mindspore::kernel {
int ExpCPUKernel::Init() {
exp_parameter_ = reinterpret_cast<ExpParameter *>(op_parameter_);
exp_parameter_->thread_num_ = thread_count_;
float log_base = (param_->base_ == -1) ? 1 : logf(param_->base_);
param_->in_scale_ = param_->scale_ * log_base;
if (param_->shift_ == 0) {
param_->out_scale_ = 1;
} else {
if (log_base == 1) {
param_->out_scale_ = expf(param_->shift_);
} else {
param_->out_scale_ = powf(param_->base_, param_->shift_);
}
}
param_->op_parameter_.thread_num_ = ms_context_->thread_num_;
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
}
int ExpCPUKernel::ReSize() {
exp_parameter_->thread_num_ = thread_count_;
float log_ = (exp_parameter_->base_ == -1) ? 1 : logf(exp_parameter_->base_);
exp_parameter_->in_scale_ = exp_parameter_->scale_ * log_;
if (exp_parameter_->shift_ == 0) {
exp_parameter_->out_scale_ = 1;
} else {
if (log_ == 1) {
exp_parameter_->out_scale_ = expf(exp_parameter_->shift_);
} else {
exp_parameter_->out_scale_ = powf(exp_parameter_->base_, exp_parameter_->shift_);
}
}
param_->element_num_ = in_tensors_.front()->ElementsNum();
return RET_OK;
}
int ExpCPUKernel::DoExcute(int task_id) {
Exp(input_addr_, output_addr_, exp_parameter_, task_id);
ExpFusionFp32(reinterpret_cast<float *>(input_addr_), reinterpret_cast<float *>(output_addr_), param_, task_id);
return RET_OK;
}
@@ -68,11 +64,10 @@ int ExpRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
}
int ExpCPUKernel::Run() {
input_addr_ = reinterpret_cast<float *>(in_tensors_.front()->MutableData());
output_addr_ = reinterpret_cast<float *>(out_tensors_.front()->MutableData());
exp_parameter_->element_num_ = in_tensors_.front()->ElementsNum();
input_addr_ = in_tensors_.front()->MutableData();
output_addr_ = out_tensors_.front()->MutableData();
auto ret = ParallelLaunch(this->ms_context_, ExpRun, this, exp_parameter_->thread_num_);
auto ret = ParallelLaunch(this->ms_context_, ExpRun, this, ms_context_->thread_num_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "Exp error: error_code[" << ret << "]";
return RET_ERROR;

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,7 +13,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_EXP_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_EXP_H_
@@ -26,22 +25,20 @@ class ExpCPUKernel : public InnerKernel {
public:
explicit ExpCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: InnerKernel(parameter, inputs, outputs, ctx), ctx_(ctx), thread_count_(ctx->thread_num_) {}
: InnerKernel(parameter, inputs, outputs, ctx) {
param_ = reinterpret_cast<ExpParameter *>(parameter);
}
~ExpCPUKernel() override{};
int Init() override;
int ReSize() override;
int Run() override;
int DoExcute(int task_id);
virtual int DoExcute(int task_id);
protected:
const lite::InnerContext *ctx_ = nullptr;
int thread_count_ = 1;
ExpParameter *exp_parameter_ = nullptr;
private:
float *input_addr_ = nullptr;
float *output_addr_ = nullptr;
ExpParameter *param_ = nullptr;
void *input_addr_ = nullptr;
void *output_addr_ = nullptr;
};
} // namespace mindspore::kernel