!6029 l2norm trailing axis support and multithread

Merge pull request !6029 from zhaozhenlong/lite/issue/l2norm_multithread_trailing_axis
mindspore-ci-bot 2020-09-11 16:22:06 +08:00 committed by Gitee
commit 9562d2d5cc
10 changed files with 399 additions and 63 deletions


@@ -16,36 +16,60 @@
#include "nnacl/l2_norm.h"
#include <math.h>
#include "nnacl/errorcode.h"
int L2NormFp32(const float *input_ptr, float *output_ptr,
L2NormParameter *param) {
int *axis = param->axis_;
size_t axis_num = param->axis_num_;
float epsilon = param->epsilon_;
int shape_num = param->shape_num_;
// default case, axis is set default
if (shape_num == axis_num) {
bool default_case_flag = true;
for (int i = 0; i < axis_num; i++) {
if (axis[i] != i) {
default_case_flag = false;
}
}
if (default_case_flag) {
int data_num = param->data_num_;
float sum = 0;
for (int i = 0; i < data_num; i++) {
sum = sum + input_ptr[i] * input_ptr[i];
}
float res = sqrt(sum > epsilon ? sum : epsilon);
for (int i = 0; i < data_num; i++) {
output_ptr[i] = input_ptr[i] / res;
}
return 0;
}
} else {
return -1;
}
return 0;
}
int CalcThreadSquareSum(const float *input_ptr, float *sum, int begin, int end) {
*sum = 0.0f;
int i;
for (i = begin; i < end; ++i) {
*sum += input_ptr[i] * input_ptr[i];
}
return NNACL_OK;
}
int ThreadDivSqrtSum(const float *input_ptr, float *output_ptr, const L2NormParameter *param, const float sqrt_sum,
const int begin, const int end) {
bool is_relu = param->act_type_ == ActType_Relu;
bool is_relu6 = param->act_type_ == ActType_Relu6;
int i;
for (i = begin; i < end; i++) {
float tmp = input_ptr[i] / sqrt_sum;
if (is_relu) {
output_ptr[i] = MSMAX(0, tmp);
} else if (is_relu6) {
output_ptr[i] = MSMIN(6, MSMAX(0, tmp));
} else {
output_ptr[i] = tmp;
}
}
return NNACL_OK;
}
int ThreadTrailingAxis(const float *input_ptr, float *output_ptr, const L2NormParameter *param, const int begin,
const int end) {
bool is_relu = param->act_type_ == ActType_Relu;
bool is_relu6 = param->act_type_ == ActType_Relu6;
const int c = param->shape_[param->shape_num_ - 1];
int i = 0;
for (i = begin; i < end; ++i) {
float square_sum = 0.0f;
int j = 0;
for (j = 0; j < c; ++j) {
const float val = input_ptr[i * c + j];
square_sum += val * val;
}
float sqrt_sum = sqrt(square_sum > param->epsilon_ ? square_sum : param->epsilon_);
for (j = 0; j < c; ++j) {
float tmp = input_ptr[i * c + j] / sqrt_sum;
if (is_relu) {
output_ptr[i * c + j] = MSMAX(0, tmp);
} else if (is_relu6) {
output_ptr[i * c + j] = MSMIN(6, MSMAX(0, tmp));
} else {
output_ptr[i * c + j] = tmp;
}
}
}
return NNACL_OK;
}
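For orientation: ThreadTrailingAxis normalizes each slice along the last dimension independently, i.e. out[i][j] = in[i][j] / sqrt(max(sum_j in[i][j]^2, epsilon)). Below is a minimal single-threaded sketch of the same arithmetic, with a hypothetical helper name and the relu/relu6 handling omitted; its two output rows match the third and fourth rows of Test3 further down.

#include <cmath>
#include <cstdio>

// Reference version of the trailing-axis path: one L2 norm per row of length c.
void L2NormTrailingAxisRef(const float *in, float *out, int outer_size, int c, float epsilon) {
  for (int i = 0; i < outer_size; ++i) {
    float square_sum = 0.0f;
    for (int j = 0; j < c; ++j) {
      square_sum += in[i * c + j] * in[i * c + j];
    }
    // clamp by epsilon so an all-zero row does not divide by zero
    float denom = std::sqrt(square_sum > epsilon ? square_sum : epsilon);
    for (int j = 0; j < c; ++j) {
      out[i * c + j] = in[i * c + j] / denom;
    }
  }
}

int main() {
  const float in[6] = {-3.0f, -2.0f, -1.0f, 0.0f, 1.0f, 2.0f};
  float out[6];
  L2NormTrailingAxisRef(in, out, 2, 3, 1e-6f);
  // prints -0.801784 -0.534522 -0.267261 (row norm sqrt(14)),
  // then 0.000000 0.447214 0.894427 (row norm sqrt(5))
  for (int k = 0; k < 6; ++k) std::printf("%f\n", out[k]);
  return 0;
}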


@@ -21,7 +21,11 @@
#ifdef __cplusplus
extern "C" {
#endif
int L2NormFp32(const float *input_ptr, float *output_ptr, L2NormParameter *param);
int CalcThreadSquareSum(const float *input_ptr, float *sum, int begin, int end);
int ThreadDivSqrtSum(const float *input_ptr, float *output_ptr, const L2NormParameter *param, const float sqrt_sum,
const int begin, const int end);
int ThreadTrailingAxis(const float *input_ptr, float *output_ptr, const L2NormParameter *param, const int begin,
const int end);
#ifdef __cplusplus
}
#endif


@@ -24,9 +24,10 @@ typedef struct L2NormParameter {
int *axis_;
size_t axis_num_;
float epsilon_;
float data_num_;
int data_num_;
int *shape_;
int shape_num_;
size_t shape_num_;
ActType act_type_;
} L2NormParameter;
#endif // MINDSPORE_LITE_NNACL_L2NORM_PARAMETER_H_


@@ -785,6 +785,7 @@ table FloorMod {
table L2Norm {
axis: [int];
epsilon: float;
activationType: ActivationType;
}
table LogicalAnd {


@ -21,9 +21,13 @@ namespace lite {
#ifdef PRIMITIVE_WRITEABLE
std::vector<int> L2Norm::GetAxis() const { return this->primitive_->value.AsL2Norm()->axis; }
float L2Norm::GetEpsilon() const { return this->primitive_->value.AsL2Norm()->epsilon; }
int L2Norm::GetActivationType() const { return this->primitive_->value.AsL2Norm()->activationType; }
void L2Norm::SetAxis(const std::vector<int> &axis) { this->primitive_->value.AsL2Norm()->axis = axis; }
void L2Norm::SetEpsilon(float epsilon) { this->primitive_->value.AsL2Norm()->epsilon = epsilon; }
void L2Norm::SetActivationType(int activationType) {
this->primitive_->value.AsL2Norm()->activationType = (schema::ActivationType)activationType;
}
#else
int L2Norm::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) {
@@ -51,6 +55,7 @@ std::vector<int> L2Norm::GetAxis() const {
return std::vector<int>(fb_vector->begin(), fb_vector->end());
}
float L2Norm::GetEpsilon() const { return this->primitive_->value_as_L2Norm()->epsilon(); }
int L2Norm::GetActivationType() const { return this->primitive_->value_as_L2Norm()->activationType(); }
#endif
} // namespace lite


@@ -34,6 +34,7 @@ class L2Norm : public PrimitiveC {
explicit L2Norm(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {}
void SetAxis(const std::vector<int> &axis);
void SetEpsilon(float epsilon);
void SetActivationType(int activationType);
#else
L2Norm() = default;
@@ -41,6 +42,7 @@ class L2Norm : public PrimitiveC {
#endif
std::vector<int> GetAxis() const;
float GetEpsilon() const;
int GetActivationType() const;
};
} // namespace lite
} // namespace mindspore


@@ -1536,11 +1536,18 @@ OpParameter *PopulateL2NormParameter(const mindspore::lite::PrimitiveC *primitiv
for (size_t i = 0; i < axis_vec.size(); i++) {
l2_norm_parameter->axis_[i] = axis_vec[i];
}
if (param->GetEpsilon() < 1e-12) {
l2_norm_parameter->epsilon_ = 1e-12;
if (param->GetEpsilon() < 1e-6) {
l2_norm_parameter->epsilon_ = 1e-6;
} else {
l2_norm_parameter->epsilon_ = param->GetEpsilon();
}
if (param->GetActivationType() == static_cast<int>(schema::ActivationType_RELU)) {
l2_norm_parameter->act_type_ = ActType_Relu;
} else if (param->GetActivationType() == static_cast<int>(schema::ActivationType_RELU6)) {
l2_norm_parameter->act_type_ = ActType_Relu6;
} else {
l2_norm_parameter->act_type_ = ActType_No;
}
return reinterpret_cast<OpParameter *>(l2_norm_parameter);
}
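The populate step above clamps epsilon to a floor of 1e-6 (previously 1e-12) and maps the schema activation type onto nnacl's ActType. The floor is easiest to see with numbers: for an all-zero input the square sum is 0, so the denominator becomes sqrt(1e-6) = 1e-3 and every output stays 0 instead of dividing by zero. A small standalone check:

#include <cmath>
#include <cstdio>

int main() {
  float sum = 0.0f;       // square sum of an all-zero tensor
  float epsilon = 1e-6f;  // floor applied by PopulateL2NormParameter
  float denom = std::sqrt(sum > epsilon ? sum : epsilon);
  std::printf("denom = %g, normalized zero = %g\n", denom, 0.0f / denom);  // denom = 0.001
  return 0;
}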


@@ -15,9 +15,11 @@
*/
#include <vector>
#include <cmath>
#include "src/runtime/kernel/arm/fp32/l2_norm.h"
#include "include/errorcode.h"
#include "nnacl/l2_norm.h"
#include "src/runtime/runtime_api.h"
using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
@@ -26,14 +28,154 @@ using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_L2Norm;
namespace mindspore::kernel {
namespace {
const int kMaxThreadNum = 8;
}
int L2NormCPUKernel::Init() {
l2_norm_param_->data_num_ = in_tensors_.at(kInputIndex)->ElementsNum();
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
}
int L2NormCPUKernel::MallocTmpBuffer() {
auto shape = in_tensors_.at(kInputIndex)->shape();
l2_norm_param_->shape_ = reinterpret_cast<int *>(malloc(shape.size() * sizeof(int)));
if (l2_norm_param_->shape_ == nullptr) {
MS_LOG(ERROR) << "Malloc data failed";
return RET_ERROR;
}
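// one partial square sum slot per task; assumes thread_num_ never exceeds kMaxThreadNum (8)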
tmp_sum_ = reinterpret_cast<float *>(malloc(kMaxThreadNum * sizeof(float)));
if (tmp_sum_ == nullptr) {
MS_LOG(ERROR) << "Malloc data failed";
return RET_ERROR;
}
return RET_OK;
}
void L2NormCPUKernel::FreeTmpBuffer() {
if (l2_norm_param_->shape_ != nullptr) {
free(l2_norm_param_->shape_);
l2_norm_param_->shape_ = nullptr;
}
if (tmp_sum_ != nullptr) {
free(tmp_sum_);
tmp_sum_ = nullptr;
}
}
int L2NormCPUKernel::ReSize() {
FreeTmpBuffer();
auto ret = MallocTmpBuffer();
if (ret != RET_OK) {
FreeTmpBuffer();
return ret;
}
l2_norm_param_->data_num_ = in_tensors_.at(kInputIndex)->ElementsNum();
auto shape = in_tensors_.at(kInputIndex)->shape();
l2_norm_param_->shape_num_ = shape.size();
for (size_t i = 0; i < shape.size(); i++) {
l2_norm_param_->shape_[i] = shape[i];
}
for (size_t i = 0; i < l2_norm_param_->axis_num_; ++i) {
if (l2_norm_param_->axis_[i] < 0) {
l2_norm_param_->axis_[i] += static_cast<int>(shape.size());
}
}
return RET_OK;
}
int L2NormCPUKernel::CalcSquareSum(int task_id) {
int unit = UP_DIV(l2_norm_param_->data_num_, context_->thread_num_);
int begin = task_id * unit;
int end = MSMIN(begin + unit, l2_norm_param_->data_num_);
return CalcThreadSquareSum(input_ptr_, tmp_sum_ + task_id, begin, end);
}
int L2NormCPUKernel::DivSqrtSum(int task_id) {
int unit = UP_DIV(l2_norm_param_->data_num_, context_->thread_num_);
int begin = task_id * unit;
int end = MSMIN(begin + unit, l2_norm_param_->data_num_);
return ThreadDivSqrtSum(input_ptr_, output_ptr_, l2_norm_param_, sqrt_sum_, begin, end);
}
int L2NormCPUKernel::CalcL2NormTrailingAxis(int task_id) {
auto input = in_tensors_.at(0);
int outer_size = input->ElementsNum() / input->shape().back();
int unit = UP_DIV(outer_size, context_->thread_num_);
int begin = task_id * unit;
int end = MSMIN(begin + unit, outer_size);
return ThreadTrailingAxis(input_ptr_, output_ptr_, l2_norm_param_, begin, end);
}
int SquareSumRun(void *cdata, int task_id) {
auto kernel = reinterpret_cast<L2NormCPUKernel *>(cdata);
auto ret = kernel->CalcSquareSum(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "L2Norm SquareSumRun error task_id[" << task_id << "] error_code[" << ret << "]";
return RET_ERROR;
}
return RET_OK;
}
int L2NormRun(void *cdata, int task_id) {
auto kernel = reinterpret_cast<L2NormCPUKernel *>(cdata);
auto ret = kernel->DivSqrtSum(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "L2Norm L2NormRun error task_id[" << task_id << "] error_code[" << ret << "]";
return RET_ERROR;
}
return RET_OK;
}
int L2NormTrailingAxisRun(void *cdata, int task_id) {
auto kernel = reinterpret_cast<L2NormCPUKernel *>(cdata);
auto ret = kernel->CalcL2NormTrailingAxis(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "L2Norm TrailingAxisRun error task_id[" << task_id << "] error_code[" << ret << "]";
return RET_ERROR;
}
return RET_OK;
}
int L2NormCPUKernel::Run() {
auto ret = Prepare();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Prepare fail! ret: " << ret;
return ret;
}
auto input_shape = in_tensors().at(kInputIndex)->shape();
input_ptr_ = reinterpret_cast<float *>(in_tensors_.at(kInputIndex)->MutableData());
output_ptr_ = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->MutableData());
if (l2_norm_param_->axis_num_ == 0 || l2_norm_param_->axis_num_ == input_shape.size()) {
// all axis
ret = ParallelLaunch(THREAD_POOL_DEFAULT, SquareSumRun, this, context_->thread_num_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "L2Norm error: error_code[" << ret << "]";
return RET_ERROR;
}
float sum = 0.0f;
for (int i = 0; i < context_->thread_num_; ++i) {
sum += tmp_sum_[i];
}
sqrt_sum_ = sqrt(sum > l2_norm_param_->epsilon_ ? sum : l2_norm_param_->epsilon_);
ret = ParallelLaunch(THREAD_POOL_DEFAULT, L2NormRun, this, context_->thread_num_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "L2Norm error: error_code[" << ret << "]";
return RET_ERROR;
}
} else if (l2_norm_param_->axis_num_ == 1 && l2_norm_param_->axis_[0] == static_cast<int>(input_shape.size()) - 1) {
ret = ParallelLaunch(THREAD_POOL_DEFAULT, L2NormTrailingAxisRun, this, context_->thread_num_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "L2Norm error: error_code[" << ret << "]";
return RET_ERROR;
}
} else {
MS_LOG(ERROR) << "L2Norm only support reduce on all axis and trailing axis with trailing axis";
return RET_ERROR;
}
return RET_OK;
}
@@ -61,30 +203,5 @@ kernel::LiteKernel *CpuL2NormFp32KernelCreator(const std::vector<lite::Tensor *>
return kernel;
}
int L2NormCPUKernel::Run() {
auto ret = Prepare();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Prepare fail!ret: " << ret;
return ret;
}
auto input_ptr = reinterpret_cast<float *>(in_tensors_.at(kInputIndex)->MutableData());
auto output_ptr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->MutableData());
ret = L2NormFp32(input_ptr, output_ptr, l2_norm_param_);
if (ret != 0) {
MS_LOG_ERROR << "unsupported axis setting, more work will be done";
return ret;
}
return RET_OK;
}
L2NormCPUKernel::~L2NormCPUKernel() {
if (l2_norm_param_->shape_ != nullptr) {
free(l2_norm_param_->shape_);
}
if (l2_norm_param_->axis_ != nullptr) {
free(l2_norm_param_->axis_);
}
}
REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_L2Norm, CpuL2NormFp32KernelCreator)
} // namespace mindspore::kernel
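Each parallel entry point above splits its iteration space the same way: UP_DIV rounds up, so every task except possibly the last gets the same unit size, and MSMIN clips the final range. For the all-axes path, each task writes its partial square sum into tmp_sum_[task_id]; Run() then adds the partials and takes a single sqrt before the second pass divides. A standalone sketch of the partitioning, with UP_DIV redefined so the example compiles on its own, using data_num_ = 18 and thread_num_ = 4 as in the tests below:

#include <algorithm>
#include <cstdio>

#define UP_DIV(x, y) (((x) + (y) - 1) / (y))  // ceiling division, as the kernel uses

int main() {
  int data_num = 18, thread_num = 4;
  int unit = UP_DIV(data_num, thread_num);  // (18 + 3) / 4 = 5
  for (int task_id = 0; task_id < thread_num; ++task_id) {
    int begin = task_id * unit;
    int end = std::min(begin + unit, data_num);
    std::printf("task %d: [%d, %d)\n", task_id, begin, end);  // [0,5) [5,10) [10,15) [15,18)
  }
  return 0;
}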


@@ -35,14 +35,29 @@ class L2NormCPUKernel : public LiteKernel {
: LiteKernel(parameter, inputs, outputs, ctx, primitive) {
l2_norm_param_ = reinterpret_cast<L2NormParameter *>(op_parameter_);
}
~L2NormCPUKernel();
~L2NormCPUKernel() {
FreeTmpBuffer();
if (l2_norm_param_->axis_ != nullptr) {
free(l2_norm_param_->axis_);
}
}
int CalcSquareSum(int task_id);
int DivSqrtSum(int task_id);
int CalcL2NormTrailingAxis(int task_id);
int Init() override;
int ReSize() override { return 0; }
int ReSize() override;
int Run() override;
private:
int MallocTmpBuffer();
void FreeTmpBuffer();
L2NormParameter *l2_norm_param_;
float sqrt_sum_;
float *input_ptr_;
float *output_ptr_;
float *tmp_sum_ = nullptr;
};
} // namespace mindspore::kernel


@@ -0,0 +1,160 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "mindspore/core/utils/log_adapter.h"
#include "common/common_test.h"
#include "mindspore/lite/src/runtime/kernel/arm/fp32/l2_norm.h"
#include "src/kernel_registry.h"
#include "src/lite_kernel.h"
using mindspore::schema::Format_NHWC;
namespace mindspore {
class TestL2NormFp32 : public mindspore::CommonTest {
public:
TestL2NormFp32() = default;
void Init(const std::vector<int> &input_shape, const std::vector<int> &output_shape, float *input_data,
float *output_data, const int axis_num, ActType activation_type, const int thread_num);
void TearDown() override;
public:
float err_tol_ = 1e-5;
lite::Tensor in_tensor_;
lite::Tensor out_tensor_;
std::vector<lite::Tensor *> inputs_{&in_tensor_};
std::vector<lite::Tensor *> outputs_{&out_tensor_};
L2NormParameter param_;
kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, schema::PrimitiveType_L2Norm};
lite::Context ctx_ = lite::Context();
kernel::KernelCreator creator_ = nullptr;
kernel::LiteKernel *kernel_ = nullptr;
};
void TestL2NormFp32::TearDown() {
in_tensor_.SetData(nullptr);
out_tensor_.SetData(nullptr);
}
void TestL2NormFp32::Init(const std::vector<int> &input_shape, const std::vector<int> &output_shape, float *input_data,
float *output_data, const int axis_num, ActType activation_type, const int thread_num) {
in_tensor_.set_data_type(kNumberTypeFloat32);
in_tensor_.SetFormat(Format_NHWC);
in_tensor_.set_shape(input_shape);
out_tensor_.set_data_type(kNumberTypeFloat32);
out_tensor_.set_shape(output_shape);
in_tensor_.SetData(input_data);
out_tensor_.SetData(output_data);
param_.axis_num_ = axis_num;
if (axis_num == 1) {
param_.axis_ = reinterpret_cast<int *>(malloc(sizeof(int)));
param_.axis_[0] = -1;
}
param_.epsilon_ = 1e-6;
param_.act_type_ = activation_type;
desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, schema::PrimitiveType_L2Norm};
ctx_ = lite::Context();
ctx_.thread_num_ = thread_num;
creator_ = lite::KernelRegistry::GetInstance()->GetCreator(desc);
ASSERT_NE(creator_, nullptr);
kernel_ = creator_(inputs_, outputs_, reinterpret_cast<OpParameter *>(&param_), &ctx_, desc, nullptr);
ASSERT_NE(kernel_, nullptr);
}
// 2 thread all axis no_activation
TEST_F(TestL2NormFp32, Test1) {
float input_data[18] = {-9.0, -8.0, -7.0, -6.0, -5.0, -4.0, -3.0, -2.0, -1.0,
0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0};
float output_data[18] = {0};
std::vector<int> input_shape = {1, 3, 2, 3};
std::vector<int> output_shape = {1, 3, 2, 3};
std::vector<float> expect = {-0.40699407, -0.3617725, -0.31655094, -0.27132937, -0.22610782, -0.18088625,
-0.13566469, -0.09044313, -0.045221563, 0.0, 0.045221563, 0.09044313,
0.13566469, 0.18088625, 0.22610782, 0.27132937, 0.31655094, 0.3617725};
auto output_size = 18;
int axis_num = 0;
ActType act_type = ActType_No;
int thread_num = 2;
Init(input_shape, output_shape, input_data, output_data, axis_num, act_type, thread_num);
auto ret = kernel_->Run();
EXPECT_EQ(0, ret);
CompareOutputData(output_data, expect.data(), output_size, err_tol_);
}
// 2 thread all axis relu
TEST_F(TestL2NormFp32, Test2) {
float input_data[18] = {-9.0, -8.0, -7.0, -6.0, -5.0, -4.0, -3.0, -2.0, -1.0,
0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0};
float output_data[18] = {0};
std::vector<int> input_shape = {1, 3, 2, 3};
std::vector<int> output_shape = {1, 3, 2, 3};
std::vector<float> expect = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0, 0.045221563, 0.09044313,
0.13566469, 0.18088625, 0.22610782, 0.27132937, 0.31655094, 0.3617725};
auto output_size = 18;
int axis_num = 0;
ActType act_type = ActType_Relu;
int thread_num = 2;
Init(input_shape, output_shape, input_data, output_data, axis_num, act_type, thread_num);
auto ret = kernel_->Run();
EXPECT_EQ(0, ret);
CompareOutputData(output_data, expect.data(), output_size, err_tol_);
}
// 4 thread trailing axis no activation
TEST_F(TestL2NormFp32, Test3) {
float input_data[18] = {-9.0, -8.0, -7.0, -6.0, -5.0, -4.0, -3.0, -2.0, -1.0,
0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0};
float output_data[18] = {0};
std::vector<int> input_shape = {1, 3, 2, 3};
std::vector<int> output_shape = {1, 3, 2, 3};
std::vector<float> expect = {-0.6461623, -0.57436645, -0.5025706, -0.6837635, -0.5698029, -0.45584232,
-0.8017837, -0.5345225, -0.26726124, 0.0, 0.4472136, 0.8944272,
0.42426407, 0.56568545, 0.7071068, 0.49153918, 0.57346237, 0.65538555};
auto output_size = 18;
int axis_num = 1;
ActType act_type = ActType_No;
int thread_num = 4;
Init(input_shape, output_shape, input_data, output_data, axis_num, act_type, thread_num);
auto ret = kernel_->Run();
EXPECT_EQ(0, ret);
CompareOutputData(output_data, expect.data(), output_size, err_tol_);
}
// 1 thread trailing axis relu6
TEST_F(TestL2NormFp32, Test4) {
float input_data[18] = {-9.0, -8.0, -7.0, -6.0, -5.0, -4.0, -3.0, -2.0, -1.0,
0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0};
float output_data[18] = {0};
std::vector<int> input_shape = {1, 3, 2, 3};
std::vector<int> output_shape = {1, 3, 2, 3};
std::vector<float> expect = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0, 0.4472136, 0.8944272,
0.42426407, 0.56568545, 0.7071068, 0.49153918, 0.57346237, 0.65538555};
auto output_size = 18;
int axis_num = 1;
ActType act_type = ActType_Relu6;
int thread_num = 1;
Init(input_shape, output_shape, input_data, output_data, axis_num, act_type, thread_num);
auto ret = kernel_->Run();
EXPECT_EQ(0, ret);
CompareOutputData(output_data, expect.data(), output_size, err_tol_);
}
} // namespace mindspore
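For the record, the expected values above are easy to reproduce by hand. In Test1 the reduction covers all axes, so one global norm divides every element: the sum of squares of -9..8 is 489, and -9 / sqrt(489) = -0.40699407, the first expect entry. A quick standalone check:

#include <cmath>
#include <cstdio>

int main() {
  float sum = 0.0f;
  for (int v = -9; v <= 8; ++v) sum += static_cast<float>(v * v);
  std::printf("sum = %g, first = %f\n", sum, -9.0f / std::sqrt(sum));  // sum = 489, first = -0.406994
  return 0;
}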