forked from mindspore-Ecosystem/mindspore
!4961 Optimize the performance of BatchNorm and FusedBatchNorm, add Fp16 kernel
Merge pull request !4961 from sunsuodong/batch_norm_fp16
This commit is contained in:
commit
e3c053c4ff
|
@ -0,0 +1,52 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "nnacl/fp16/batchnorm_fp16.h"
|
||||
#include <math.h>
|
||||
|
||||
void BatchNormFp16(const void *input, const void *mean, const void *variance,
|
||||
BatchNormParameter *param, int task_id, void *output) {
|
||||
int units_per_thread = UP_DIV(param->unit_, param->op_parameter_.thread_num_);
|
||||
int completed_units = task_id * units_per_thread;
|
||||
int cur_unit = MSMIN(units_per_thread, param->unit_ - completed_units);
|
||||
int cur_offset = completed_units * param->channel_;
|
||||
|
||||
for (int i = 0; i < cur_unit; i++) {
|
||||
for (int c = 0; c < param->channel_; c++) {
|
||||
float16_t variance_sqrt = sqrt(((const float16_t *)variance)[c] + param->epsilon_);
|
||||
((float16_t *)output)[cur_offset + c] =
|
||||
(((const float16_t *)input)[cur_offset + c] - ((const float16_t *)mean)[c]) / variance_sqrt;
|
||||
}
|
||||
cur_offset += param->channel_;
|
||||
}
|
||||
}
|
||||
|
||||
void FusedBatchNormFp16(const void *input, const void *scale, const void *offset, const void *mean,
|
||||
const void *variance, BatchNormParameter *param, int task_id, void *output) {
|
||||
int units_per_thread = UP_DIV(param->unit_, param->op_parameter_.thread_num_);
|
||||
int completed_units = task_id * units_per_thread;
|
||||
int cur_unit = MSMIN(units_per_thread, param->unit_ - completed_units);
|
||||
int cur_offset = completed_units * param->channel_;
|
||||
|
||||
for (int i = 0; i < cur_unit; i++) {
|
||||
for (int c = 0; c < param->channel_; c++) {
|
||||
float16_t variance_sqrt = sqrt(((const float16_t *)variance)[c] + param->epsilon_);
|
||||
float16_t norm_val = (((const float16_t *)input)[cur_offset + c] - ((const float16_t *)mean)[c]) / variance_sqrt;
|
||||
((float16_t *)output)[cur_offset + c] = norm_val * ((const float16_t *)scale)[c] + ((const float16_t *)offset)[c];
|
||||
}
|
||||
cur_offset += param->channel_;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,37 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FP16_BATCHNORM_FP16_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FP16_BATCHNORM_FP16_H_
|
||||
|
||||
#ifdef ENABLE_NEON
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
#include "nnacl/batchnorm_parameter.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void BatchNormFp16(const void *input, const void *mean, const void *variance, BatchNormParameter *param, int task_id,
|
||||
void *output);
|
||||
void FusedBatchNormFp16(const void *input, const void *scale, const void *offset, const void *mean,
|
||||
const void *variance, BatchNormParameter *param, int task_id, void *output);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FP16_BATCHNORM_FP16_H_
|
|
@ -15,26 +15,42 @@
|
|||
*/
|
||||
|
||||
#include "nnacl/fp32/batchnorm.h"
|
||||
#include "nnacl/fp16/batchnorm_fp16.h"
|
||||
#include <math.h>
|
||||
#include "nnacl/batchnorm_parameter.h"
|
||||
#include "nnacl/op_base.h"
|
||||
#include "nnacl/errorcode.h"
|
||||
|
||||
void BatchNorm(float *output_ptr, const float *input_ptr, const float *mean_ptr, const float *variance_ptr, int task_id,
|
||||
BatchNormParameter *param) {
|
||||
for (int c = task_id; c < param->channel_; c += param->op_parameter_.thread_num_) {
|
||||
float variance_sqrt = sqrt(variance_ptr[c] + param->epsilon_);
|
||||
for (int u = 0; u < param->unit_; u++) {
|
||||
output_ptr[u * param->channel_ + c] = (input_ptr[u * param->channel_ + c] - mean_ptr[c]) / variance_sqrt;
|
||||
void BatchNormFp32(const void *input, const void *mean, const void *variance,
|
||||
BatchNormParameter *param, int task_id, void *output) {
|
||||
int units_per_thread = UP_DIV(param->unit_, param->op_parameter_.thread_num_);
|
||||
int completed_units = task_id * units_per_thread;
|
||||
int cur_unit = MSMIN(units_per_thread, param->unit_ - completed_units);
|
||||
int cur_offset = completed_units * param->channel_;
|
||||
|
||||
for (int i = 0; i < cur_unit; i++) {
|
||||
for (int c = 0; c < param->channel_; c++) {
|
||||
float variance_sqrt = sqrt(((const float *)variance)[c] + param->epsilon_);
|
||||
((float *)output)[cur_offset + c] =
|
||||
(((const float *)input)[cur_offset + c] - ((const float *)mean)[c]) / variance_sqrt;
|
||||
}
|
||||
cur_offset += param->channel_;
|
||||
}
|
||||
}
|
||||
|
||||
void FusedBatchNorm(float *output_ptr, const float *input_ptr, const float *scale_ptr, const float *offest_ptr,
|
||||
const float *mean_ptr, const float *variance_ptr, int task_id, BatchNormParameter *param) {
|
||||
for (int c = task_id; c < param->channel_; c += param->op_parameter_.thread_num_) {
|
||||
float variance_sqrt = sqrt(variance_ptr[c] + param->epsilon_);
|
||||
for (int u = 0; u < param->unit_; u++) {
|
||||
output_ptr[u * param->channel_ + c] =
|
||||
(input_ptr[u * param->channel_ + c] - mean_ptr[c]) / variance_sqrt * scale_ptr[c] + offest_ptr[c];
|
||||
void FusedBatchNormFp32(const void *input, const void *scale, const void *offset, const void *mean,
|
||||
const void *variance, BatchNormParameter *param, int task_id, void *output) {
|
||||
int units_per_thread = UP_DIV(param->unit_, param->op_parameter_.thread_num_);
|
||||
int completed_units = task_id * units_per_thread;
|
||||
int cur_unit = MSMIN(units_per_thread, param->unit_ - completed_units);
|
||||
int cur_offset = completed_units * param->channel_;
|
||||
|
||||
for (int i = 0; i < cur_unit; i++) {
|
||||
for (int c = 0; c < param->channel_; c++) {
|
||||
float variance_sqrt = sqrt(((const float *)variance)[c] + param->epsilon_);
|
||||
float norm_val = (((const float *)input)[cur_offset + c] - ((const float *)mean)[c]) / variance_sqrt;
|
||||
((float *)output)[cur_offset + c] = norm_val * ((const float *)scale)[c] + ((const float *)offset)[c];
|
||||
}
|
||||
cur_offset += param->channel_;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,18 +17,16 @@
|
|||
#ifndef MINDSPORE_LITE_NNACL_FP32_BATCHNORM_H_
|
||||
#define MINDSPORE_LITE_NNACL_FP32_BATCHNORM_H_
|
||||
|
||||
#include "nnacl/op_base.h"
|
||||
#include "nnacl/batchnorm_parameter.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void BatchNorm(float *output_ptr, const float *input_ptr, const float *mean_ptr, const float *variance_ptr, int task_id,
|
||||
BatchNormParameter *param);
|
||||
|
||||
void FusedBatchNorm(float *output_ptr, const float *input_ptr, const float *scale_ptr, const float *offest_ptr,
|
||||
const float *mean_ptr, const float *variance_ptr, int task_id, BatchNormParameter *param);
|
||||
void BatchNormFp32(const void *input, const void *mean, const void *variance, BatchNormParameter *param, int task_id,
|
||||
void *output);
|
||||
void FusedBatchNormFp32(const void *input, const void *scale, const void *offset, const void *mean,
|
||||
const void *variance, BatchNormParameter *param, int task_id, void *output);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
@ -0,0 +1,87 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/runtime/kernel/arm/fp16/batchnorm_fp16.h"
|
||||
#include "nnacl/fp16/batchnorm_fp16.h"
|
||||
#include "nnacl/fp16/cast_fp16.h"
|
||||
#include "src/kernel_registry.h"
|
||||
|
||||
using mindspore::lite::KernelRegistrar;
|
||||
using mindspore::schema::PrimitiveType_BatchNorm;
|
||||
|
||||
namespace mindspore::kernel {
|
||||
int BatchnormFp16CPUKernel::DoExecute(int task_id) {
|
||||
auto param = reinterpret_cast<BatchNormParameter *>(op_parameter_);
|
||||
|
||||
if (in_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
|
||||
auto input = in_tensors_.at(0);
|
||||
auto mean = in_tensors_.at(1);
|
||||
auto variance = in_tensors_.at(2);
|
||||
auto output = out_tensors_.at(0);
|
||||
|
||||
auto input_fp16 = context_->allocator->Malloc(input->ElementsNum() * sizeof(float16_t));
|
||||
auto mean_fp16 = context_->allocator->Malloc(mean->ElementsNum() * sizeof(float16_t));
|
||||
auto variance_fp16 = context_->allocator->Malloc(variance->ElementsNum() * sizeof(float16_t));
|
||||
auto output_fp16 = context_->allocator->Malloc(output->ElementsNum() * sizeof(float16_t));
|
||||
if (input_fp16 == nullptr || mean_fp16 == nullptr || variance_fp16 == nullptr || output_fp16 == nullptr) {
|
||||
context_->allocator->Free(input_fp16);
|
||||
context_->allocator->Free(mean_fp16);
|
||||
context_->allocator->Free(variance_fp16);
|
||||
context_->allocator->Free(output_fp16);
|
||||
}
|
||||
Float32ToFloat16(reinterpret_cast<float *>(input->Data()),
|
||||
reinterpret_cast<float16_t *>(input_fp16), input->ElementsNum());
|
||||
Float32ToFloat16(reinterpret_cast<float *>(mean->Data()),
|
||||
reinterpret_cast<float16_t *>(mean_fp16), mean->ElementsNum());
|
||||
Float32ToFloat16(reinterpret_cast<float *>(variance->Data()),
|
||||
reinterpret_cast<float16_t *>(variance_fp16), variance->ElementsNum());
|
||||
|
||||
BatchNormFp16(input_fp16, mean_fp16, variance_fp16, param, task_id, output_fp16);
|
||||
|
||||
Float16ToFloat32(reinterpret_cast<float16_t *>(output_fp16), reinterpret_cast<float *>(output),
|
||||
output->ElementsNum());
|
||||
context_->allocator->Free(input_fp16);
|
||||
context_->allocator->Free(mean_fp16);
|
||||
context_->allocator->Free(variance_fp16);
|
||||
context_->allocator->Free(output_fp16);
|
||||
return mindspore::lite::RET_OK;
|
||||
}
|
||||
BatchNormFp16(in_tensors_.at(0)->Data(), mean_, variance_, param, task_id, out_tensors_.at(0)->Data());
|
||||
return mindspore::lite::RET_OK;
|
||||
}
|
||||
|
||||
kernel::LiteKernel *CpuBatchnormFp16KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
|
||||
const std::vector<lite::tensor::Tensor *> &outputs,
|
||||
OpParameter *opParameter, const lite::Context *ctx,
|
||||
const kernel::KernelKey &desc,
|
||||
const mindspore::lite::PrimitiveC *primitive) {
|
||||
auto *kernel = new (std::nothrow) BatchnormFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
|
||||
if (kernel == nullptr) {
|
||||
MS_LOG(ERROR) << "new BatchnormFp16CPUKernel fail!";
|
||||
return nullptr;
|
||||
}
|
||||
auto ret = kernel->Init();
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
|
||||
<< schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
|
||||
delete kernel;
|
||||
return nullptr;
|
||||
}
|
||||
return kernel;
|
||||
}
|
||||
|
||||
// REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_BatchNorm, CpuBatchnormFp16KernelCreator)
|
||||
} // namespace mindspore::kernel
|
|
@ -0,0 +1,36 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_BATCHNORM_FP16_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_BATCHNORM_FP16_H_
|
||||
|
||||
#include <vector>
|
||||
#include "src/runtime/kernel/arm/fp32/batchnorm.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
class BatchnormFp16CPUKernel : public BatchnormCPUKernel {
|
||||
public:
|
||||
BatchnormFp16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
|
||||
const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
|
||||
const mindspore::lite::PrimitiveC *primitive)
|
||||
: BatchnormCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
|
||||
virtual ~BatchnormFp16CPUKernel() {}
|
||||
|
||||
virtual int DoExecute(int task_id);
|
||||
};
|
||||
} // namespace mindspore::kernel
|
||||
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_BATCHNORM_FP16_H_
|
|
@ -0,0 +1,103 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/runtime/kernel/arm/fp16/fused_batchnorm_fp16.h"
|
||||
#include "nnacl/fp16/batchnorm_fp16.h"
|
||||
#include "nnacl/fp16/cast_fp16.h"
|
||||
#include "src/kernel_registry.h"
|
||||
|
||||
using mindspore::lite::KernelRegistrar;
|
||||
using mindspore::schema::PrimitiveType_FusedBatchNorm;
|
||||
|
||||
namespace mindspore::kernel {
|
||||
int FusedBatchnormFp16CPUKernel::DoExecute(int task_id) {
|
||||
auto param = reinterpret_cast<BatchNormParameter *>(op_parameter_);
|
||||
|
||||
if (in_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
|
||||
auto input = in_tensors_.at(0);
|
||||
auto scale = in_tensors_.at(1);
|
||||
auto offset = in_tensors_.at(2);
|
||||
auto mean = in_tensors_.at(3);
|
||||
auto variance = in_tensors_.at(4);
|
||||
auto output = out_tensors_.at(0);
|
||||
|
||||
auto input_fp16 = context_->allocator->Malloc(input->ElementsNum() * sizeof(float16_t));
|
||||
auto scale_fp16 = context_->allocator->Malloc(scale->ElementsNum() * sizeof(float16_t));
|
||||
auto offset_fp16 = context_->allocator->Malloc(offset->ElementsNum() * sizeof(float16_t));
|
||||
auto mean_fp16 = context_->allocator->Malloc(mean->ElementsNum() * sizeof(float16_t));
|
||||
auto variance_fp16 = context_->allocator->Malloc(variance->ElementsNum() * sizeof(float16_t));
|
||||
auto output_fp16 = context_->allocator->Malloc(output->ElementsNum() * sizeof(float16_t));
|
||||
if (input_fp16 == nullptr || scale_fp16 == nullptr || offset_fp16 == nullptr ||
|
||||
mean_fp16 == nullptr || variance_fp16 == nullptr || output_fp16 == nullptr) {
|
||||
context_->allocator->Free(input_fp16);
|
||||
context_->allocator->Free(scale_fp16);
|
||||
context_->allocator->Free(offset_fp16);
|
||||
context_->allocator->Free(mean_fp16);
|
||||
context_->allocator->Free(variance_fp16);
|
||||
context_->allocator->Free(output_fp16);
|
||||
}
|
||||
Float32ToFloat16(reinterpret_cast<float *>(input->Data()),
|
||||
reinterpret_cast<float16_t *>(input_fp16), input->ElementsNum());
|
||||
Float32ToFloat16(reinterpret_cast<float *>(scale->Data()),
|
||||
reinterpret_cast<float16_t *>(scale_fp16), scale->ElementsNum());
|
||||
Float32ToFloat16(reinterpret_cast<float *>(offset->Data()),
|
||||
reinterpret_cast<float16_t *>(offset_fp16), offset->ElementsNum());
|
||||
Float32ToFloat16(reinterpret_cast<float *>(mean->Data()),
|
||||
reinterpret_cast<float16_t *>(mean_fp16), mean->ElementsNum());
|
||||
Float32ToFloat16(reinterpret_cast<float *>(variance->Data()),
|
||||
reinterpret_cast<float16_t *>(variance_fp16), variance->ElementsNum());
|
||||
|
||||
FusedBatchNormFp16(input_fp16, scale_fp16, offset_fp16, mean_fp16, variance_fp16, param, task_id,
|
||||
output_fp16);
|
||||
|
||||
Float16ToFloat32(reinterpret_cast<float16_t *>(output_fp16), reinterpret_cast<float *>(output),
|
||||
output->ElementsNum());
|
||||
context_->allocator->Free(input_fp16);
|
||||
context_->allocator->Free(scale_fp16);
|
||||
context_->allocator->Free(offset_fp16);
|
||||
context_->allocator->Free(mean_fp16);
|
||||
context_->allocator->Free(variance_fp16);
|
||||
context_->allocator->Free(output_fp16);
|
||||
return mindspore::lite::RET_OK;
|
||||
}
|
||||
FusedBatchNormFp16(in_tensors_.at(0)->Data(), scale_, offset_, mean_, variance_, param, task_id,
|
||||
out_tensors_.at(0)->Data());
|
||||
return mindspore::lite::RET_OK;
|
||||
}
|
||||
|
||||
kernel::LiteKernel *CpuFusedBatchnormFp16KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
|
||||
const std::vector<lite::tensor::Tensor *> &outputs,
|
||||
OpParameter *op_parameter, const lite::Context *ctx,
|
||||
const kernel::KernelKey &desc,
|
||||
const mindspore::lite::PrimitiveC *primitive) {
|
||||
FusedBatchnormFp16CPUKernel *kernel =
|
||||
new (std::nothrow) FusedBatchnormFp16CPUKernel(op_parameter, inputs, outputs, ctx, primitive);
|
||||
if (kernel == nullptr) {
|
||||
MS_LOG(ERROR) << "new FusedBatchnormFp16CPUKernel fail!";
|
||||
return nullptr;
|
||||
}
|
||||
auto ret = kernel->Init();
|
||||
if (ret != RET_OK) {
|
||||
delete kernel;
|
||||
MS_LOG(ERROR) << "Init kernel failed, name: " << op_parameter->name_ << ", type: "
|
||||
<< schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(op_parameter->type_));
|
||||
return nullptr;
|
||||
}
|
||||
return kernel;
|
||||
}
|
||||
|
||||
// REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_FusedBatchNorm, CpuFusedBatchnormFp16KernelCreator)
|
||||
} // namespace mindspore::kernel
|
|
@ -0,0 +1,36 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_FUSED_BATCHNORM_FP16_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_FUSED_BATCHNORM_FP16_H_
|
||||
|
||||
#include <vector>
|
||||
#include "src/runtime/kernel/arm/fp32/fused_batchnorm.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
class FusedBatchnormFp16CPUKernel : public FusedBatchnormCPUKernel {
|
||||
public:
|
||||
FusedBatchnormFp16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
|
||||
const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
|
||||
const mindspore::lite::PrimitiveC *primitive)
|
||||
: FusedBatchnormCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
|
||||
virtual ~FusedBatchnormFp16CPUKernel() {}
|
||||
|
||||
virtual int DoExecute(int task_id);
|
||||
};
|
||||
} // namespace mindspore::kernel
|
||||
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_FUSED_BATCHNORM_FP16_H_
|
|
@ -15,50 +15,12 @@
|
|||
*/
|
||||
|
||||
#include "src/runtime/kernel/arm/fp32/batchnorm.h"
|
||||
#include "schema/model_generated.h"
|
||||
#include "src/kernel_registry.h"
|
||||
#include "include/errorcode.h"
|
||||
#include "src/runtime/runtime_api.h"
|
||||
#include "nnacl/batchnorm_parameter.h"
|
||||
#include "nnacl/fp32/batchnorm.h"
|
||||
|
||||
using mindspore::kernel::KERNEL_ARCH::kCPU;
|
||||
using mindspore::lite::KernelRegistrar;
|
||||
using mindspore::lite::RET_ERROR;
|
||||
using mindspore::lite::RET_OK;
|
||||
using mindspore::schema::PrimitiveType_BatchNorm;
|
||||
|
||||
namespace mindspore::kernel {
|
||||
BatchnormCPUKernel::~BatchnormCPUKernel() {
|
||||
if (mean_addr_ != nullptr) {
|
||||
free(mean_addr_);
|
||||
mean_addr_ = nullptr;
|
||||
}
|
||||
if (var_addr_ != nullptr) {
|
||||
free(var_addr_);
|
||||
var_addr_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
int BatchnormCPUKernel::InitConstTensor() {
|
||||
auto mean = in_tensors_[1];
|
||||
mean_addr_ = reinterpret_cast<float *>(malloc(mean->ElementsNum() * sizeof(float)));
|
||||
if (mean_addr_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc buffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
memcpy(mean_addr_, mean->Data(), mean->ElementsNum() * sizeof(float));
|
||||
|
||||
auto variance = in_tensors_[2];
|
||||
var_addr_ = reinterpret_cast<float *>(malloc(variance->ElementsNum() * sizeof(float)));
|
||||
if (var_addr_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc buffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
memcpy(var_addr_, variance->Data(), variance->ElementsNum() * sizeof(float));
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int BatchnormCPUKernel::Init() {
|
||||
if (!InferShapeDone()) {
|
||||
return RET_OK;
|
||||
|
@ -67,62 +29,72 @@ int BatchnormCPUKernel::Init() {
|
|||
}
|
||||
|
||||
int BatchnormCPUKernel::ReSize() {
|
||||
if (mean_addr_ != nullptr) {
|
||||
free(mean_addr_);
|
||||
mean_addr_ = nullptr;
|
||||
FreeMeanAndVariance();
|
||||
FillParam();
|
||||
return InitConstTensor();
|
||||
}
|
||||
|
||||
void BatchnormCPUKernel::FreeMeanAndVariance() {
|
||||
if (mean_ != nullptr) {
|
||||
free(mean_);
|
||||
mean_ = nullptr;
|
||||
}
|
||||
if (var_addr_ != nullptr) {
|
||||
free(var_addr_);
|
||||
var_addr_ = nullptr;
|
||||
if (variance_ != nullptr) {
|
||||
free(variance_);
|
||||
variance_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
void BatchnormCPUKernel::FillParam() {
|
||||
auto input_shapes = in_tensors_[0]->shape();
|
||||
auto n_dim = input_shapes.size();
|
||||
batchnorm_param_->channel_ = input_shapes[n_dim - 1];
|
||||
batchnorm_param_->unit_ = 1;
|
||||
auto param = reinterpret_cast<BatchNormParameter *>(op_parameter_);
|
||||
param->channel_ = input_shapes[n_dim - 1];
|
||||
param->unit_ = 1;
|
||||
for (size_t i = 0; i < n_dim - 1; i++) {
|
||||
batchnorm_param_->unit_ *= input_shapes[i];
|
||||
param->unit_ *= input_shapes[i];
|
||||
}
|
||||
batchnorm_param_->op_parameter_.thread_num_ =
|
||||
MSMIN(batchnorm_param_->op_parameter_.thread_num_, batchnorm_param_->channel_);
|
||||
}
|
||||
|
||||
auto ret = InitConstTensor();
|
||||
if (ret != 0) {
|
||||
MS_LOG(ERROR) << "Batchnorm fp32 InitConstTensor failed.";
|
||||
int BatchnormCPUKernel::InitConstTensor() {
|
||||
mean_ = malloc(in_tensors_[1]->Size());
|
||||
variance_ = malloc(in_tensors_[2]->Size());
|
||||
if (mean_ == nullptr || variance_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Memory allocation failed";
|
||||
FreeMeanAndVariance();
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int BatchnormCPUKernel::DoExecute(int task_id) {
|
||||
BatchNorm(out_addr_, in_addr_, mean_addr_, var_addr_, task_id, batchnorm_param_);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int BatchNormRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
|
||||
auto g_kernel = reinterpret_cast<BatchnormCPUKernel *>(cdata);
|
||||
auto ret = g_kernel->DoExecute(task_id);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "BatchnormRun error task_id[" << task_id << "] error_code[" << ret << "]";
|
||||
return ret;
|
||||
}
|
||||
memcpy(mean_, in_tensors_[1]->Data(), in_tensors_[1]->Size());
|
||||
memcpy(variance_, in_tensors_[2]->Data(), in_tensors_[2]->Size());
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int BatchnormCPUKernel::Run() {
|
||||
auto prepare_ret = Prepare();
|
||||
if (prepare_ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Prepare fail! Ret error code: " << prepare_ret;
|
||||
return prepare_ret;
|
||||
}
|
||||
in_addr_ = reinterpret_cast<float *>(in_tensors_.at(0)->Data());
|
||||
out_addr_ = reinterpret_cast<float *>(out_tensors_.at(0)->Data());
|
||||
|
||||
int ret = LiteBackendParallelLaunch(BatchNormRun, this, batchnorm_param_->op_parameter_.thread_num_);
|
||||
auto ret = Prepare();
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "BatchnormRun error error_code[" << ret << "]";
|
||||
MS_LOG(ERROR) << "Prepare fail! Ret error code: " << ret;
|
||||
return ret;
|
||||
}
|
||||
return RET_OK;
|
||||
ret = LiteBackendParallelLaunch(BatchNormRun, this, op_parameter_->thread_num_);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "BatchnormRun error error_code[" << ret << "]";
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
int BatchnormCPUKernel::DoExecute(int task_id) {
|
||||
auto param = reinterpret_cast<BatchNormParameter *>(op_parameter_);
|
||||
BatchNormFp32(in_tensors_.at(0)->Data(), mean_, variance_, param, task_id, out_tensors_.at(0)->Data());
|
||||
return mindspore::lite::RET_OK;
|
||||
}
|
||||
|
||||
int BatchNormRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
|
||||
auto kernel = reinterpret_cast<BatchnormCPUKernel *>(cdata);
|
||||
auto ret = kernel->DoExecute(task_id);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "BatchnormRun error task_id[" << task_id << "] error_code[" << ret << "]";
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
kernel::LiteKernel *CpuBatchnormKernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
|
||||
|
@ -131,7 +103,6 @@ kernel::LiteKernel *CpuBatchnormKernelCreator(const std::vector<lite::tensor::Te
|
|||
const kernel::KernelKey &desc,
|
||||
const mindspore::lite::PrimitiveC *primitive) {
|
||||
MS_ASSERT(opParameter != nullptr);
|
||||
MS_ASSERT(desc.type == schema::PrimitiveType_BatchNorm);
|
||||
auto *kernel = new (std::nothrow) BatchnormCPUKernel(opParameter, inputs, outputs, ctx, primitive);
|
||||
if (kernel == nullptr) {
|
||||
MS_LOG(ERROR) << "new BatchNormCPUKernel fail!";
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
#include "include/context.h"
|
||||
#include "nnacl/fp32/batchnorm.h"
|
||||
#include "nnacl/batchnorm_parameter.h"
|
||||
#include "src/runtime/runtime_api.h"
|
||||
|
||||
using mindspore::lite::Context;
|
||||
|
||||
|
@ -31,24 +32,23 @@ class BatchnormCPUKernel : public LiteKernel {
|
|||
BatchnormCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
|
||||
const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
|
||||
const mindspore::lite::PrimitiveC *primitive)
|
||||
: LiteKernel(parameter, inputs, outputs, ctx, primitive) {
|
||||
batchnorm_param_ = reinterpret_cast<BatchNormParameter *>(parameter);
|
||||
}
|
||||
~BatchnormCPUKernel() override;
|
||||
: LiteKernel(parameter, inputs, outputs, ctx, primitive) {}
|
||||
virtual ~BatchnormCPUKernel() { FreeMeanAndVariance(); }
|
||||
|
||||
int Init() override;
|
||||
int ReSize() override;
|
||||
int Run() override;
|
||||
int InitConstTensor();
|
||||
int DoExecute(int tid);
|
||||
virtual int InitConstTensor();
|
||||
virtual int DoExecute(int task_id);
|
||||
|
||||
private:
|
||||
float *in_addr_ = nullptr;
|
||||
float *mean_addr_ = nullptr;
|
||||
float *var_addr_ = nullptr;
|
||||
float *out_addr_ = nullptr;
|
||||
BatchNormParameter *batchnorm_param_;
|
||||
protected:
|
||||
void FillParam();
|
||||
void FreeMeanAndVariance();
|
||||
void *mean_ = nullptr;
|
||||
void *variance_ = nullptr;
|
||||
};
|
||||
|
||||
int BatchNormRun(int task_id, LiteParallelGroupEnv *penv, void *cdata);
|
||||
} // namespace mindspore::kernel
|
||||
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_BATCHNORM_H_
|
||||
|
|
|
@ -15,133 +15,59 @@
|
|||
*/
|
||||
|
||||
#include "src/runtime/kernel/arm/fp32/fused_batchnorm.h"
|
||||
#include "schema/model_generated.h"
|
||||
#include "src/kernel_registry.h"
|
||||
#include "include/errorcode.h"
|
||||
#include "src/runtime/runtime_api.h"
|
||||
#include "nnacl/batchnorm_parameter.h"
|
||||
#include "nnacl/fp32/batchnorm.h"
|
||||
|
||||
using mindspore::kernel::KERNEL_ARCH::kCPU;
|
||||
using mindspore::lite::KernelRegistrar;
|
||||
using mindspore::lite::RET_ERROR;
|
||||
using mindspore::lite::RET_OK;
|
||||
using mindspore::schema::PrimitiveType_FusedBatchNorm;
|
||||
|
||||
namespace mindspore::kernel {
|
||||
FusedBatchnormCPUKernel::~FusedBatchnormCPUKernel() { FreeTmpBuffer(); }
|
||||
int FusedBatchnormCPUKernel::ReSize() {
|
||||
FreeMeanAndVariance();
|
||||
FreeScaleAndOffset();
|
||||
FillParam();
|
||||
return InitConstTensor();
|
||||
}
|
||||
|
||||
void FusedBatchnormCPUKernel::FreeTmpBuffer() {
|
||||
if (scale_addr_ != nullptr) {
|
||||
free(scale_addr_);
|
||||
scale_addr_ = nullptr;
|
||||
void FusedBatchnormCPUKernel::FreeScaleAndOffset() {
|
||||
if (scale_ != nullptr) {
|
||||
free(scale_);
|
||||
scale_ = nullptr;
|
||||
}
|
||||
if (offset_addr_ != nullptr) {
|
||||
free(offset_addr_);
|
||||
offset_addr_ = nullptr;
|
||||
}
|
||||
if (mean_addr_ != nullptr) {
|
||||
free(mean_addr_);
|
||||
mean_addr_ = nullptr;
|
||||
}
|
||||
if (var_addr_ != nullptr) {
|
||||
free(var_addr_);
|
||||
var_addr_ = nullptr;
|
||||
if (offset_ != nullptr) {
|
||||
free(offset_);
|
||||
offset_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
int FusedBatchnormCPUKernel::InitConstTensor() {
|
||||
auto scale = in_tensors_[1];
|
||||
scale_addr_ = reinterpret_cast<float *>(malloc(scale->ElementsNum() * sizeof(float)));
|
||||
if (scale_addr_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc buffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
memcpy(scale_addr_, scale->Data(), scale->ElementsNum() * sizeof(float));
|
||||
|
||||
auto offset = in_tensors_[2];
|
||||
offset_addr_ = reinterpret_cast<float *>(malloc(offset->ElementsNum() * sizeof(float)));
|
||||
if (offset_addr_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc buffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
memcpy(offset_addr_, offset->Data(), offset->ElementsNum() * sizeof(float));
|
||||
|
||||
auto mean = in_tensors_[3];
|
||||
mean_addr_ = reinterpret_cast<float *>(malloc(mean->ElementsNum() * sizeof(float)));
|
||||
if (mean_addr_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc buffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
memcpy(mean_addr_, mean->Data(), mean->ElementsNum() * sizeof(float));
|
||||
|
||||
auto variance = in_tensors_[4];
|
||||
var_addr_ = reinterpret_cast<float *>(malloc(variance->ElementsNum() * sizeof(float)));
|
||||
if (var_addr_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc buffer failed.";
|
||||
|
||||
scale_ = malloc(scale->Size());
|
||||
offset_ = malloc(offset->Size());
|
||||
mean_ = malloc(mean->Size());
|
||||
variance_ = malloc(variance->Size());
|
||||
|
||||
if (scale_ == nullptr || offset_ == nullptr || mean_ == nullptr || variance_ == nullptr) {
|
||||
FreeMeanAndVariance();
|
||||
FreeScaleAndOffset();
|
||||
MS_LOG(ERROR) << "Memory allocation failed";
|
||||
return RET_ERROR;
|
||||
}
|
||||
memcpy(var_addr_, variance->Data(), variance->ElementsNum() * sizeof(float));
|
||||
memcpy(scale_, scale->Data(), scale->Size());
|
||||
memcpy(offset_, offset->Data(), offset->Size());
|
||||
memcpy(mean_, mean->Data(), mean->Size());
|
||||
memcpy(variance_, variance->Data(), variance->Size());
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int FusedBatchnormCPUKernel::Init() {
|
||||
if (!InferShapeDone()) {
|
||||
return RET_OK;
|
||||
}
|
||||
return ReSize();
|
||||
}
|
||||
|
||||
int FusedBatchnormCPUKernel::ReSize() {
|
||||
FreeTmpBuffer();
|
||||
auto input_shapes = in_tensors_[0]->shape();
|
||||
auto n_dim = input_shapes.size();
|
||||
batchnorm_param_->channel_ = input_shapes[n_dim - 1];
|
||||
batchnorm_param_->unit_ = 1;
|
||||
for (size_t i = 0; i < n_dim - 1; i++) {
|
||||
batchnorm_param_->unit_ *= input_shapes[i];
|
||||
}
|
||||
batchnorm_param_->op_parameter_.thread_num_ =
|
||||
MSMIN(batchnorm_param_->op_parameter_.thread_num_, batchnorm_param_->channel_);
|
||||
|
||||
auto ret = InitConstTensor();
|
||||
if (ret != 0) {
|
||||
MS_LOG(ERROR) << "FusedBatchnorm fp32 InitConstTensor failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int FusedBatchnormCPUKernel::Execute(int task_id) {
|
||||
FusedBatchNorm(out_addr_, in_addr_, scale_addr_, offset_addr_, mean_addr_, var_addr_, task_id, batchnorm_param_);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int FusedBatchNormRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
|
||||
auto g_kernel = reinterpret_cast<FusedBatchnormCPUKernel *>(cdata);
|
||||
auto ret = g_kernel->Execute(task_id);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "FusedBatchnormRun error task_id[" << task_id << "] error_code[" << ret << "]";
|
||||
return ret;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int FusedBatchnormCPUKernel::Run() {
|
||||
auto prepare_ret = Prepare();
|
||||
if (prepare_ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Prepare fail! Ret error code: " << prepare_ret;
|
||||
return prepare_ret;
|
||||
}
|
||||
in_addr_ = reinterpret_cast<float *>(in_tensors_.at(0)->Data());
|
||||
out_addr_ = reinterpret_cast<float *>(out_tensors_.at(0)->Data());
|
||||
|
||||
int ret = LiteBackendParallelLaunch(FusedBatchNormRun, this, batchnorm_param_->op_parameter_.thread_num_);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "FusedBatchnormRun error error_code[" << ret << "]";
|
||||
return ret;
|
||||
}
|
||||
return RET_OK;
|
||||
int FusedBatchnormCPUKernel::DoExecute(int task_id) {
|
||||
auto param = reinterpret_cast<BatchNormParameter *>(op_parameter_);
|
||||
FusedBatchNormFp32(in_tensors_.at(0)->Data(), scale_, offset_, mean_, variance_, param, task_id,
|
||||
out_tensors_.at(0)->Data());
|
||||
return mindspore::lite::RET_OK;
|
||||
}
|
||||
|
||||
kernel::LiteKernel *CpuFusedBatchnormKernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
|
||||
|
@ -149,11 +75,6 @@ kernel::LiteKernel *CpuFusedBatchnormKernelCreator(const std::vector<lite::tenso
|
|||
OpParameter *op_parameter, const lite::Context *ctx,
|
||||
const kernel::KernelKey &desc,
|
||||
const mindspore::lite::PrimitiveC *primitive) {
|
||||
if (op_parameter == nullptr) {
|
||||
MS_LOG(ERROR) << "Input parameter is nullptr!";
|
||||
return nullptr;
|
||||
}
|
||||
MS_ASSERT(desc.type == schema::PrimitiveType_FusedBatchNorm);
|
||||
FusedBatchnormCPUKernel *kernel =
|
||||
new (std::nothrow) FusedBatchnormCPUKernel(op_parameter, inputs, outputs, ctx, primitive);
|
||||
if (kernel == nullptr) {
|
||||
|
|
|
@ -18,37 +18,26 @@
|
|||
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_FUSED_BATCHNORM_H_
|
||||
|
||||
#include <vector>
|
||||
#include "src/lite_kernel.h"
|
||||
#include "nnacl/batchnorm_parameter.h"
|
||||
#include "src/runtime/kernel/arm/fp32/batchnorm.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
class FusedBatchnormCPUKernel : public LiteKernel {
|
||||
class FusedBatchnormCPUKernel : public BatchnormCPUKernel {
|
||||
public:
|
||||
FusedBatchnormCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
|
||||
const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
|
||||
const mindspore::lite::PrimitiveC *primitive)
|
||||
: LiteKernel(parameter, inputs, outputs, ctx, primitive) {
|
||||
batchnorm_param_ = reinterpret_cast<BatchNormParameter *>(parameter);
|
||||
}
|
||||
~FusedBatchnormCPUKernel() override;
|
||||
: BatchnormCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
|
||||
~FusedBatchnormCPUKernel() { FreeScaleAndOffset(); }
|
||||
|
||||
int Init() override;
|
||||
int ReSize() override;
|
||||
int Run() override;
|
||||
|
||||
int InitConstTensor();
|
||||
int Execute(int task_id);
|
||||
int InitConstTensor() override;
|
||||
int DoExecute(int task_id) override;
|
||||
|
||||
private:
|
||||
void FreeTmpBuffer();
|
||||
float *in_addr_ = nullptr;
|
||||
float *mean_addr_ = nullptr;
|
||||
float *var_addr_ = nullptr;
|
||||
float *scale_addr_ = nullptr;
|
||||
float *offset_addr_ = nullptr;
|
||||
float *out_addr_ = nullptr;
|
||||
|
||||
BatchNormParameter *batchnorm_param_;
|
||||
protected:
|
||||
void FreeScaleAndOffset();
|
||||
void *scale_ = nullptr;
|
||||
void *offset_ = nullptr;
|
||||
};
|
||||
} // namespace mindspore::kernel
|
||||
|
||||
|
|
|
@ -31,40 +31,32 @@ TEST_F(TestBatchnormFp32, BNTest) {
|
|||
-1.1983503, -6.6790967, 6.383416, -13.3213005, -8.693595, 9.476344};
|
||||
std::vector<float> in_data1 = {12.352293, 5.122387, 14.249514};
|
||||
std::vector<float> in_data2 = {14.632595, 0.70900035, 11.179003};
|
||||
std::vector<lite::tensor::Tensor *> inputs_tensor;
|
||||
std::vector<lite::tensor::Tensor *> outputs_tensor;
|
||||
|
||||
BatchNormParameter op_param;
|
||||
op_param.op_parameter_.type_ = schema::PrimitiveType_BatchNorm;
|
||||
op_param.epsilon_ = 0.001f;
|
||||
|
||||
std::vector<int> shape = {1, 2, 2, 3};
|
||||
lite::tensor::Tensor input0_tensor;
|
||||
lite::tensor::Tensor input1_tensor;
|
||||
lite::tensor::Tensor input2_tensor;
|
||||
inputs_tensor.push_back(&input0_tensor);
|
||||
inputs_tensor.push_back(&input1_tensor);
|
||||
inputs_tensor.push_back(&input2_tensor);
|
||||
lite::tensor::Tensor input0_tensor(kNumberTypeFloat32, {1, 2, 2, 3});
|
||||
lite::tensor::Tensor input1_tensor(kNumberTypeFloat32, {3});
|
||||
lite::tensor::Tensor input2_tensor(kNumberTypeFloat32, {3});
|
||||
input0_tensor.SetData(in_data.data());
|
||||
input1_tensor.SetData(in_data1.data());
|
||||
input2_tensor.SetData(in_data2.data());
|
||||
input0_tensor.set_shape(shape);
|
||||
input1_tensor.set_shape({3});
|
||||
input2_tensor.set_shape({3});
|
||||
std::vector<lite::tensor::Tensor *> inputs_tensor = {&input0_tensor, &input1_tensor, &input2_tensor};
|
||||
|
||||
std::vector<float> output(12);
|
||||
std::vector<float> corr_out = {-6.1533737, 7.4904885, -0.8563998, -0.289212, -9.356432, 0.13245535,
|
||||
-3.5422924, -14.005781, -2.3525476, -6.7113695, -16.396551, -1.4275324};
|
||||
|
||||
lite::tensor::Tensor output0_tensor;
|
||||
outputs_tensor.push_back(&output0_tensor);
|
||||
lite::tensor::Tensor output0_tensor(kNumberTypeFloat32, {1, 2, 2, 3});
|
||||
output0_tensor.SetData(output.data());
|
||||
output0_tensor.set_shape(shape);
|
||||
std::vector<lite::tensor::Tensor *> outputs_tensor = {&output0_tensor};
|
||||
|
||||
kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, schema::PrimitiveType_BatchNorm};
|
||||
auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
|
||||
ASSERT_NE(creator, nullptr);
|
||||
lite::Context ctx;
|
||||
ctx.thread_num_ = 1;
|
||||
ctx.thread_num_ = 2;
|
||||
kernel::LiteKernel *kernel =
|
||||
creator(inputs_tensor, outputs_tensor, reinterpret_cast<OpParameter *>(&op_param), &ctx, desc, nullptr);
|
||||
ASSERT_NE(kernel, nullptr);
|
||||
|
@ -82,7 +74,6 @@ TEST_F(TestBatchnormFp32, BNTest) {
|
|||
input1_tensor.SetData(nullptr);
|
||||
input2_tensor.SetData(nullptr);
|
||||
output0_tensor.SetData(nullptr);
|
||||
MS_LOG(INFO) << "TestBathNormFp32 accuracy passed";
|
||||
}
|
||||
|
||||
TEST_F(TestBatchnormFp32, FusedBNTest) {
|
||||
|
@ -92,118 +83,102 @@ TEST_F(TestBatchnormFp32, FusedBNTest) {
|
|||
std::vector<float> offset = {27.888096, 24.533648, 15.335093};
|
||||
std::vector<float> mean = {11.5127125, 0.47681615, 5.851508};
|
||||
std::vector<float> var = {1.270583, 13.005714, 6.089223};
|
||||
std::vector<lite::tensor::Tensor *> inputs_tensor;
|
||||
std::vector<lite::tensor::Tensor *> outputs_tensor;
|
||||
|
||||
BatchNormParameter op_param;
|
||||
op_param.op_parameter_.type_ = schema::PrimitiveType_BatchNorm;
|
||||
op_param.epsilon_ = 0.001f;
|
||||
|
||||
std::vector<int> shape = {1, 2, 2, 3};
|
||||
lite::tensor::Tensor input[5];
|
||||
input[0].SetData(in_data.data());
|
||||
input[1].SetData(scale.data());
|
||||
input[2].SetData(offset.data());
|
||||
input[3].SetData(mean.data());
|
||||
input[4].SetData(var.data());
|
||||
|
||||
input[0].set_shape(shape);
|
||||
for (int i = 1; i < 5; i++) {
|
||||
input[i].set_shape({3});
|
||||
}
|
||||
for (int i = 0; i < 5; i++) {
|
||||
inputs_tensor.push_back(&input[i]);
|
||||
}
|
||||
lite::tensor::Tensor input0(kNumberTypeFloat32, {1, 2, 2, 3});
|
||||
lite::tensor::Tensor input1(kNumberTypeFloat32, {3});
|
||||
lite::tensor::Tensor input2(kNumberTypeFloat32, {3});
|
||||
lite::tensor::Tensor input3(kNumberTypeFloat32, {3});
|
||||
lite::tensor::Tensor input4(kNumberTypeFloat32, {3});
|
||||
input0.SetData(in_data.data());
|
||||
input1.SetData(scale.data());
|
||||
input2.SetData(offset.data());
|
||||
input3.SetData(mean.data());
|
||||
input4.SetData(var.data());
|
||||
std::vector<lite::tensor::Tensor *> inputs_tensor = {&input0, &input1, &input2, &input3, &input4};
|
||||
|
||||
std::vector<float> output(12);
|
||||
std::vector<float> corr_out = {-195.5765, 67.03745, -4.243883, -42.028015, 74.37044, 9.075897,
|
||||
5.1857452, 56.60399, -77.215096, -181.18402, 49.81066, -59.204563};
|
||||
|
||||
lite::tensor::Tensor output0_tensor;
|
||||
outputs_tensor.push_back(&output0_tensor);
|
||||
output0_tensor.SetData(output.data());
|
||||
output0_tensor.set_shape(shape);
|
||||
lite::tensor::Tensor output0(kNumberTypeFloat32, {1, 2, 2, 3});
|
||||
output0.SetData(output.data());
|
||||
std::vector<lite::tensor::Tensor *> outputs_tensor = {&output0};
|
||||
|
||||
kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, schema::PrimitiveType_FusedBatchNorm};
|
||||
auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
|
||||
ASSERT_NE(creator, nullptr);
|
||||
lite::Context ctx;
|
||||
ctx.thread_num_ = 1;
|
||||
ctx.thread_num_ = 2;
|
||||
kernel::LiteKernel *kernel =
|
||||
creator(inputs_tensor, outputs_tensor, reinterpret_cast<OpParameter *>(&op_param), &ctx, desc, nullptr);
|
||||
ASSERT_NE(kernel, nullptr);
|
||||
auto output_tensor_shape = output0_tensor.shape();
|
||||
kernel->Run();
|
||||
|
||||
printf("==================output data=================\n");
|
||||
for (int i = 0; i < output0_tensor.ElementsNum(); i++) {
|
||||
for (int i = 0; i < output0.ElementsNum(); i++) {
|
||||
std::cout << output[i] << " ,";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
CompareOutputData(output.data(), corr_out.data(), output0_tensor.ElementsNum(), 0.001);
|
||||
CompareOutputData(output.data(), corr_out.data(), output0.ElementsNum(), 0.001);
|
||||
|
||||
for (int i = 1; i < 5; i++) {
|
||||
input[i].SetData(nullptr);
|
||||
}
|
||||
output0_tensor.SetData(nullptr);
|
||||
MS_LOG(INFO) << "TestFusedBathNormFp32 accuracy passed";
|
||||
input0.SetData(nullptr);
|
||||
input1.SetData(nullptr);
|
||||
input2.SetData(nullptr);
|
||||
input3.SetData(nullptr);
|
||||
input4.SetData(nullptr);
|
||||
output0.SetData(nullptr);
|
||||
}
|
||||
|
||||
TEST_F(TestBatchnormFp32, easyTest) {
|
||||
std::vector<float> in_data = {1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6};
|
||||
std::vector<float> in_data1 = {0.1, 0.6};
|
||||
std::vector<float> in_data2 = {3, 4};
|
||||
std::vector<lite::tensor::Tensor *> inputs_tensor;
|
||||
std::vector<lite::tensor::Tensor *> outputs_tensor;
|
||||
|
||||
BatchNormParameter op_param;
|
||||
op_param.op_parameter_.type_ = schema::PrimitiveType_BatchNorm;
|
||||
op_param.epsilon_ = 0.001f;
|
||||
|
||||
std::vector<int> shape = {1, 1, 6, 2};
|
||||
lite::tensor::Tensor input0_tensor;
|
||||
lite::tensor::Tensor input1_tensor;
|
||||
lite::tensor::Tensor input2_tensor;
|
||||
inputs_tensor.push_back(&input0_tensor);
|
||||
inputs_tensor.push_back(&input1_tensor);
|
||||
inputs_tensor.push_back(&input2_tensor);
|
||||
input0_tensor.SetData(in_data.data());
|
||||
input1_tensor.SetData(in_data1.data());
|
||||
input2_tensor.SetData(in_data2.data());
|
||||
input0_tensor.set_shape(shape);
|
||||
input1_tensor.set_shape({2});
|
||||
input2_tensor.set_shape({2});
|
||||
lite::tensor::Tensor input0(kNumberTypeFloat32, {1, 1, 6, 2});
|
||||
lite::tensor::Tensor input1(kNumberTypeFloat32, {2});
|
||||
lite::tensor::Tensor input2(kNumberTypeFloat32, {2});
|
||||
input0.SetData(in_data.data());
|
||||
input1.SetData(in_data1.data());
|
||||
input2.SetData(in_data2.data());
|
||||
std::vector<lite::tensor::Tensor *> inputs_tensor = {&input0, &input1, &input2};
|
||||
|
||||
std::vector<float> output(12);
|
||||
std::vector<float> corr_out = {0.519529, 1.69979, 1.09678, 2.19973, 1.67404, 2.69966,
|
||||
-0.63498, -2.29971, -1.21223, -2.79965, -1.78949, -3.29959};
|
||||
|
||||
lite::tensor::Tensor output0_tensor;
|
||||
outputs_tensor.push_back(&output0_tensor);
|
||||
output0_tensor.SetData(output.data());
|
||||
output0_tensor.set_shape(shape);
|
||||
lite::tensor::Tensor output0(kNumberTypeFloat32, {1, 1, 6, 2});
|
||||
output0.SetData(output.data());
|
||||
std::vector<lite::tensor::Tensor *> outputs_tensor = {&output0};
|
||||
|
||||
kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, schema::PrimitiveType_BatchNorm};
|
||||
auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
|
||||
ASSERT_NE(creator, nullptr);
|
||||
lite::Context ctx;
|
||||
ctx.thread_num_ = 1;
|
||||
ctx.thread_num_ = 2;
|
||||
kernel::LiteKernel *kernel =
|
||||
creator(inputs_tensor, outputs_tensor, reinterpret_cast<OpParameter *>(&op_param), &ctx, desc, nullptr);
|
||||
ASSERT_NE(kernel, nullptr);
|
||||
auto output_tensor_shape = output0_tensor.shape();
|
||||
kernel->Run();
|
||||
|
||||
printf("==================output data=================\n");
|
||||
for (int i = 0; i < output0_tensor.ElementsNum(); i++) {
|
||||
for (int i = 0; i < output0.ElementsNum(); i++) {
|
||||
std::cout << output[i] << " ,";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
CompareOutputData(output.data(), corr_out.data(), output0_tensor.ElementsNum(), 0.001);
|
||||
CompareOutputData(output.data(), corr_out.data(), output0.ElementsNum(), 0.001);
|
||||
|
||||
input0_tensor.SetData(nullptr);
|
||||
input1_tensor.SetData(nullptr);
|
||||
input2_tensor.SetData(nullptr);
|
||||
output0_tensor.SetData(nullptr);
|
||||
MS_LOG(INFO) << "TestBathNormFp32 accuracy passed";
|
||||
input0.SetData(nullptr);
|
||||
input1.SetData(nullptr);
|
||||
input2.SetData(nullptr);
|
||||
output0.SetData(nullptr);
|
||||
}
|
||||
|
||||
} // namespace mindspore
|
||||
|
|
Loading…
Reference in New Issue