forked from mindspore-Ecosystem/mindspore
!7018 scale and stack fp16
Merge pull request !7018 from zhaozhenlong/lite/op/scale_stack_fp16
This commit is contained in:
commit
6c3cfe3036
|
@ -0,0 +1,223 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "nnacl/fp16/scale_fp16.h"
|
||||
|
||||
void ScaleInner(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int outer_start,
|
||||
int outer_end, int axis_size, int inner_size) {
|
||||
for (int out = outer_start; out < outer_end; out++) {
|
||||
int out_offset = out * axis_size * inner_size;
|
||||
for (int i = 0; i < axis_size; i++) {
|
||||
int axis_offset = out_offset + i * inner_size;
|
||||
int in_index = 0;
|
||||
#ifdef ENABLE_ARM64
|
||||
for (; in_index < inner_size - 8; in_index += 8) {
|
||||
int in_offset = axis_offset + in_index;
|
||||
float16x8_t data = vld1q_f16(in_data + in_offset);
|
||||
float16x8_t scale_8 = vdupq_n_f16(scale[i]);
|
||||
float16x8_t offset_8 = vdupq_n_f16(offset[i]);
|
||||
float16x8_t reslut = vfmaq_f16(offset_8, data, scale_8);
|
||||
|
||||
vst1q_f16(out_data + in_offset, reslut);
|
||||
}
|
||||
#endif
|
||||
for (; in_index < inner_size; in_index++) {
|
||||
int in_offset = axis_offset + in_index;
|
||||
out_data[in_offset] = in_data[in_offset] * scale[i] + offset[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ScaleAxis(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int outer_start,
|
||||
int outer_end, int axis_size) {
|
||||
for (int out = outer_start; out < outer_end; out++) {
|
||||
int out_offset = out * axis_size;
|
||||
int index = 0;
|
||||
#ifdef ENABLE_ARM64
|
||||
for (; index < axis_size - 8; index += 8) {
|
||||
int in_offset = out_offset + index;
|
||||
float16x8_t data = vld1q_f16(in_data + in_offset);
|
||||
float16x8_t scale_8 = vld1q_f16(scale + index);
|
||||
float16x8_t offset_8 = vld1q_f16(offset + index);
|
||||
float16x8_t reslut = vfmaq_f16(offset_8, data, scale_8);
|
||||
vst1q_f16(out_data + in_offset, reslut);
|
||||
}
|
||||
#endif
|
||||
for (; index < axis_size; index++) {
|
||||
int in_offset = out_offset + index;
|
||||
out_data[in_offset] = in_data[in_offset] * scale[index] + offset[index];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void DoScaleFp16(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int task_id,
|
||||
ScaleParameter *scale_param) {
|
||||
int outer_step = UP_DIV(scale_param->outer_size_, scale_param->op_parameter_.thread_num_);
|
||||
int outer_start = task_id * outer_step;
|
||||
int outer_end = MSMIN(outer_start + outer_step, scale_param->outer_size_);
|
||||
|
||||
if (scale_param->inner_size_ == 1) {
|
||||
ScaleAxis(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_);
|
||||
} else {
|
||||
ScaleInner(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_,
|
||||
scale_param->inner_size_);
|
||||
}
|
||||
}
|
||||
|
||||
void ScaleInnerRelu(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int outer_start,
|
||||
int outer_end, int axis_size, int inner_size) {
|
||||
#ifdef ENABLE_ARM64
|
||||
float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0};
|
||||
#endif
|
||||
for (int out = outer_start; out < outer_end; out++) {
|
||||
int out_offset = out * axis_size * inner_size;
|
||||
for (int i = 0; i < axis_size; i++) {
|
||||
int axis_offset = out_offset + i * inner_size;
|
||||
int in_index = 0;
|
||||
#ifdef ENABLE_ARM64
|
||||
for (; in_index < inner_size - 8; in_index += 8) {
|
||||
int in_offset = axis_offset + in_index;
|
||||
float16x8_t data = vld1q_f16(in_data + in_offset);
|
||||
float16x8_t scale_8 = vdupq_n_f16(scale[i]);
|
||||
float16x8_t offset_8 = vdupq_n_f16(offset[i]);
|
||||
float16x8_t tmp = vfmaq_f16(offset_8, data, scale_8);
|
||||
float16x8_t result = vmaxq_f16(tmp, zeros);
|
||||
vst1q_f16(out_data + in_offset, result);
|
||||
}
|
||||
#endif
|
||||
for (; in_index < inner_size; in_index++) {
|
||||
int in_offset = axis_offset + in_index;
|
||||
float tmp = in_data[in_offset] * scale[i] + offset[i];
|
||||
out_data[in_offset] = tmp > 0.0f ? tmp : 0.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ScaleAxisRelu(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int outer_start,
|
||||
int outer_end, int axis_size) {
|
||||
#ifdef ENABLE_ARM64
|
||||
float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0};
|
||||
#endif
|
||||
for (int out = outer_start; out < outer_end; out++) {
|
||||
int out_offset = out * axis_size;
|
||||
int index = 0;
|
||||
#ifdef ENABLE_ARM64
|
||||
for (; index < axis_size - 8; index += 8) {
|
||||
int in_offset = out_offset + index;
|
||||
float16x8_t data = vld1q_f16(in_data + in_offset);
|
||||
float16x8_t scale_8 = vld1q_f16(scale + index);
|
||||
float16x8_t offset_8 = vld1q_f16(offset + index);
|
||||
float16x8_t tmp = vfmaq_f16(offset_8, data, scale_8);
|
||||
float16x8_t result = vmaxq_f16(tmp, zeros);
|
||||
vst1q_f16(out_data + in_offset, result);
|
||||
}
|
||||
#endif
|
||||
for (; index < axis_size; index++) {
|
||||
int in_offset = out_offset + index;
|
||||
float tmp = in_data[in_offset] * scale[index] + offset[index];
|
||||
out_data[in_offset] = tmp > 0.0f ? tmp : 0.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void DoScaleReluFp16(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int task_id,
|
||||
ScaleParameter *scale_param) {
|
||||
int outer_step = UP_DIV(scale_param->outer_size_, scale_param->op_parameter_.thread_num_);
|
||||
int outer_start = task_id * outer_step;
|
||||
int outer_end = MSMIN(outer_start + outer_step, scale_param->outer_size_);
|
||||
|
||||
if (scale_param->inner_size_ == 1) {
|
||||
ScaleAxisRelu(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_);
|
||||
} else {
|
||||
ScaleInnerRelu(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_,
|
||||
scale_param->inner_size_);
|
||||
}
|
||||
}
|
||||
|
||||
void ScaleInnerRelu6(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int outer_start,
|
||||
int outer_end, int axis_size, int inner_size) {
|
||||
#ifdef ENABLE_ARM64
|
||||
float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0};
|
||||
float16x8_t bounds = {6, 6, 6, 6, 6, 6, 6, 6};
|
||||
#endif
|
||||
for (int out = outer_start; out < outer_end; out++) {
|
||||
int out_offset = out * axis_size * inner_size;
|
||||
for (int i = 0; i < axis_size; i++) {
|
||||
int axis_offset = out_offset + i * inner_size;
|
||||
int in_index = 0;
|
||||
#ifdef ENABLE_ARM64
|
||||
for (; in_index < inner_size - 8; in_index += 8) {
|
||||
int in_offset = axis_offset + in_index;
|
||||
float16x8_t data = vld1q_f16(in_data + in_offset);
|
||||
float16x8_t scale_8 = vdupq_n_f16(scale[i]);
|
||||
float16x8_t offset_8 = vdupq_n_f16(offset[i]);
|
||||
float16x8_t tmp = vfmaq_f16(offset_8, data, scale_8);
|
||||
float16x8_t result = vminq_f16(vmaxq_f16(tmp, zeros), bounds);
|
||||
vst1q_f16(out_data + in_offset, result);
|
||||
}
|
||||
#endif
|
||||
for (; in_index < inner_size; in_index++) {
|
||||
int in_offset = axis_offset + in_index;
|
||||
float tmp = in_data[in_offset] * scale[i] + offset[i];
|
||||
out_data[in_offset] = MSMIN(MSMAX(tmp, 0.0f), 6.0f);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ScaleAxisRelu6(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int outer_start,
|
||||
int outer_end, int axis_size) {
|
||||
#ifdef ENABLE_ARM64
|
||||
float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0};
|
||||
float16x8_t bounds = {6, 6, 6, 6, 6, 6, 6, 6};
|
||||
#endif
|
||||
for (int out = outer_start; out < outer_end; out++) {
|
||||
int out_offset = out * axis_size;
|
||||
int index = 0;
|
||||
#ifdef ENABLE_ARM64
|
||||
for (; index < axis_size - 8; index += 8) {
|
||||
int in_offset = out_offset + index;
|
||||
float16x8_t data = vld1q_f16(in_data + in_offset);
|
||||
float16x8_t scale_8 = vld1q_f16(scale + index);
|
||||
float16x8_t offset_8 = vld1q_f16(offset + index);
|
||||
float16x8_t tmp = vfmaq_f16(offset_8, data, scale_8);
|
||||
float16x8_t result = vminq_f16(vmaxq_f16(tmp, zeros), bounds);
|
||||
vst1q_f16(out_data + in_offset, result);
|
||||
}
|
||||
#endif
|
||||
for (; index < axis_size; index++) {
|
||||
int in_offset = out_offset + index;
|
||||
float tmp = in_data[in_offset] * scale[index] + offset[index];
|
||||
out_data[in_offset] = MSMIN(MSMAX(tmp, 0.0f), 6.0f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void DoScaleRelu6Fp16(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int task_id,
|
||||
ScaleParameter *scale_param) {
|
||||
int outer_step = UP_DIV(scale_param->outer_size_, scale_param->op_parameter_.thread_num_);
|
||||
int outer_start = task_id * outer_step;
|
||||
int outer_end = MSMIN(outer_start + outer_step, scale_param->outer_size_);
|
||||
|
||||
if (scale_param->inner_size_ == 1) {
|
||||
ScaleAxisRelu6(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_);
|
||||
} else {
|
||||
ScaleInnerRelu6(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_,
|
||||
scale_param->inner_size_);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,38 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_NNACL_SCALE_FP16_H_
|
||||
#define MINDSPORE_LITE_NNACL_SCALE_FP16_H_
|
||||
|
||||
#include "nnacl/op_base.h"
|
||||
#include "nnacl/scale.h"
|
||||
#ifdef ENABLE_NEON
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
void DoScaleFp16(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int task_id,
|
||||
ScaleParameter *scale_param);
|
||||
void DoScaleReluFp16(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int task_id,
|
||||
ScaleParameter *scale_param);
|
||||
void DoScaleRelu6Fp16(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int task_id,
|
||||
ScaleParameter *scale_param);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // MINDSPORE_LITE_NNACL_SCALE_FP16_H_
|
|
@ -0,0 +1,54 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "nnacl/fp16/stack_fp16.h"
|
||||
#include "nnacl/arithmetic_common.h"
|
||||
|
||||
size_t GetStackCopyNum(int axis, int *in_shape, size_t shape_size) {
|
||||
size_t one_input_size = 1;
|
||||
for (size_t i = 0; i < shape_size; ++i) {
|
||||
one_input_size *= in_shape[i];
|
||||
}
|
||||
int in_strides[4];
|
||||
ComputeStrides(in_shape, in_strides, shape_size);
|
||||
|
||||
size_t copy_num = axis > 0 ? in_strides[axis - 1] : one_input_size;
|
||||
return copy_num;
|
||||
}
|
||||
|
||||
size_t GetStackPreAxisCount(const int *in_shape, int axis) {
|
||||
size_t pre_axis_count = 1;
|
||||
for (size_t i = 0; i < axis; ++i) {
|
||||
pre_axis_count *= in_shape[i];
|
||||
}
|
||||
return pre_axis_count;
|
||||
}
|
||||
|
||||
void DoStackFp16(const float16_t *const *inputs, size_t input_num, int *in_shape, size_t shape_size, int axis,
|
||||
float16_t *output) {
|
||||
size_t copy_num = GetStackCopyNum(axis, in_shape, shape_size);
|
||||
size_t copy_size = copy_num * sizeof(float16_t);
|
||||
size_t pre_axis_count = GetStackPreAxisCount(in_shape, axis);
|
||||
size_t in_offset = 0;
|
||||
size_t out_offset = 0;
|
||||
for (size_t i = 0; i < pre_axis_count; ++i) {
|
||||
for (size_t j = 0; j < input_num; ++j) {
|
||||
memcpy(output + out_offset, inputs[j] + in_offset, copy_size);
|
||||
out_offset += copy_num;
|
||||
}
|
||||
in_offset += copy_num;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,33 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_LITE_NNACL_FP16_STACK_FP16_H_
|
||||
#define MINDSPORE_LITE_NNACL_FP16_STACK_FP16_H_
|
||||
|
||||
#include "nnacl/op_base.h"
|
||||
#ifdef ENABLE_NEON
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
void DoStackFp16(const float16_t *const *inputs, size_t input_num, int *in_shape, size_t shape_size, int axis,
|
||||
float16_t *output);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // MINDSPORE_LITE_NNACL_FP16_STACK_FP16_H_
|
|
@ -18,11 +18,6 @@
|
|||
|
||||
#include "nnacl/op_base.h"
|
||||
|
||||
typedef struct StackParameter {
|
||||
OpParameter op_parameter_;
|
||||
int32_t axis_;
|
||||
} StackParameter;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
|
|
@ -0,0 +1,26 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_NNACL_STACK_PARAMETER_H_
|
||||
#define MINDSPORE_LITE_NNACL_STACK_PARAMETER_H_
|
||||
|
||||
#include "nnacl/op_base.h"
|
||||
typedef struct StackParameter {
|
||||
OpParameter op_parameter_;
|
||||
int32_t axis_;
|
||||
} StackParameter;
|
||||
|
||||
#endif // MINDSPORE_LITE_NNACL_STACK_PARAMETER_H_
|
|
@ -126,7 +126,7 @@
|
|||
#include "nnacl/prelu_parameter.h"
|
||||
#include "nnacl/shape.h"
|
||||
#include "nnacl/fp32/constant_of_shape.h"
|
||||
#include "nnacl/fp32/stack.h"
|
||||
#include "nnacl/stack_parameter.h"
|
||||
#include "nnacl/unstack.h"
|
||||
#include "nnacl/depth_to_space.h"
|
||||
#include "nnacl/conv_parameter.h"
|
||||
|
|
|
@ -0,0 +1,214 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/runtime/kernel/arm/fp16/scale_fp16.h"
|
||||
#include <string.h>
|
||||
#include <vector>
|
||||
#include "schema/model_generated.h"
|
||||
#include "src/kernel_registry.h"
|
||||
#include "include/errorcode.h"
|
||||
#include "src/runtime/runtime_api.h"
|
||||
#include "src/runtime/kernel/arm/fp16/common_fp16.h"
|
||||
#include "nnacl/fp16/scale_fp16.h"
|
||||
#include "nnacl/fp16/cast_fp16.h"
|
||||
|
||||
using mindspore::lite::KernelRegistrar;
|
||||
using mindspore::lite::RET_ERROR;
|
||||
using mindspore::lite::RET_OK;
|
||||
using mindspore::schema::PrimitiveType_Scale;
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
||||
int ScaleFp16CPUKernel::InitScaleOffset() {
|
||||
auto input_tensor = in_tensors_.at(0);
|
||||
malloc_input_ = input_tensor->data_type() == kNumberTypeFloat32;
|
||||
|
||||
auto scale_tensor = in_tensors_.at(1);
|
||||
malloc_scale_ = scale_tensor->data_type() == kNumberTypeFloat32;
|
||||
|
||||
if (in_tensors_.size() == 2) {
|
||||
malloc_offset_ = true;
|
||||
} else {
|
||||
auto offset_tensor = in_tensors_.at(2);
|
||||
malloc_offset_ = offset_tensor->data_type() == kNumberTypeFloat32;
|
||||
}
|
||||
|
||||
auto output_tensor = out_tensors_.at(0);
|
||||
malloc_output_ = output_tensor->data_type() == kNumberTypeFloat32;
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int ScaleFp16CPUKernel::Init() {
|
||||
if (in_tensors_.size() < 2 || in_tensors_.size() > 3) {
|
||||
MS_LOG(ERROR) << "inputs to Scale operator should be 2 or 3, but " << in_tensors_.size() << " is given.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
if (!InferShapeDone()) {
|
||||
return RET_OK;
|
||||
}
|
||||
ReSize();
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int ScaleFp16CPUKernel::ReSize() {
|
||||
auto ret = CalculateParameter();
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Scale fp16 CalculateParameter failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int ScaleFp16CPUKernel::Scale(int task_id) {
|
||||
switch (scale_param_->activation_type_) {
|
||||
case schema::ActivationType_RELU6:
|
||||
DoScaleRelu6Fp16(input_, output_, scale_, offset_, task_id, scale_param_);
|
||||
break;
|
||||
case schema::ActivationType_RELU:
|
||||
DoScaleReluFp16(input_, output_, scale_, offset_, task_id, scale_param_);
|
||||
break;
|
||||
case schema::ActivationType_NO_ACTIVATION:
|
||||
DoScaleFp16(input_, output_, scale_, offset_, task_id, scale_param_);
|
||||
break;
|
||||
default:
|
||||
MS_LOG(ERROR) << "ScaleFp16 does not support activation type " << scale_param_->activation_type_;
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int ScaleRun(void *cdata, int task_id) {
|
||||
auto scale = reinterpret_cast<ScaleFp16CPUKernel *>(cdata);
|
||||
auto ret = scale->Scale(task_id);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "ScaleRun error task_id[" << task_id << "] error_code[" << ret << "]";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int ScaleFp16CPUKernel::Run() {
|
||||
auto ret = Prepare();
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Prepare fail!ret: " << ret;
|
||||
return ret;
|
||||
}
|
||||
ret = InitScaleOffset();
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Scale fp16 InitScaleOffset failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
ret = MallocAssignTmpBuffer();
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Scale Fp16 malloc tmp buffer failed";
|
||||
FreeTmpBuffer();
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = ParallelLaunch(this->context_->thread_pool_, ScaleRun, this, op_parameter_->thread_num_);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Scale error error_code[" << ret << "]";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
// if output tensor is fp32, we need to transform
|
||||
if (malloc_output_) {
|
||||
auto out_tensor = out_tensors_.at(0);
|
||||
Float16ToFloat32(output_, reinterpret_cast<float *>(out_tensor->MutableData()), out_tensor->ElementsNum());
|
||||
}
|
||||
FreeTmpBuffer();
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int ScaleFp16CPUKernel::MallocAssignTmpBuffer() {
|
||||
input_ = ConvertInputFp32toFp16(in_tensors_.at(0), context_);
|
||||
if (input_ == nullptr) {
|
||||
return RET_ERROR;
|
||||
}
|
||||
scale_ = ConvertInputFp32toFp16(in_tensors_.at(1), context_);
|
||||
if (scale_ == nullptr) {
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (in_tensors_.size() == 3) {
|
||||
offset_ = ConvertInputFp32toFp16(in_tensors_.at(2), context_);
|
||||
if (offset_ == nullptr) {
|
||||
return RET_ERROR;
|
||||
}
|
||||
} else {
|
||||
offset_ =
|
||||
reinterpret_cast<float16_t *>(context_->allocator->Malloc(in_tensors_.at(1)->ElementsNum() * sizeof(float16_t)));
|
||||
if (offset_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc data failed";
|
||||
return RET_ERROR;
|
||||
}
|
||||
memset(offset_, 0, in_tensors_.at(1)->ElementsNum() * sizeof(float16_t));
|
||||
}
|
||||
output_ = MallocOutputFp16(out_tensors_.at(0), context_);
|
||||
if (output_ == nullptr) {
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void ScaleFp16CPUKernel::FreeTmpBuffer() {
|
||||
if (malloc_input_ && input_ != nullptr) {
|
||||
context_->allocator->Free(input_);
|
||||
input_ = nullptr;
|
||||
}
|
||||
if (malloc_scale_ && scale_ != nullptr) {
|
||||
context_->allocator->Free(scale_);
|
||||
scale_ = nullptr;
|
||||
}
|
||||
if (malloc_offset_ && offset_ != nullptr) {
|
||||
context_->allocator->Free(offset_);
|
||||
offset_ = nullptr;
|
||||
}
|
||||
if (malloc_output_ && output_ != nullptr) {
|
||||
context_->allocator->Free(output_);
|
||||
output_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
kernel::LiteKernel *CpuScaleFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
|
||||
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
|
||||
const lite::InnerContext *ctx, const kernel::KernelKey &desc,
|
||||
const mindspore::lite::PrimitiveC *primitive) {
|
||||
MS_ASSERT(desc.type == schema::PrimitiveType_Scale);
|
||||
if (opParameter == nullptr) {
|
||||
MS_LOG(ERROR) << "opParameter is nullptr";
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto *kernel = new (std::nothrow) ScaleFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
|
||||
if (kernel == nullptr) {
|
||||
MS_LOG(ERROR) << "New kernel fails.";
|
||||
return nullptr;
|
||||
}
|
||||
auto ret = kernel->Init();
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
|
||||
<< schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
|
||||
delete kernel;
|
||||
return nullptr;
|
||||
}
|
||||
return kernel;
|
||||
}
|
||||
|
||||
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Scale, CpuScaleFp16KernelCreator)
|
||||
} // namespace mindspore::kernel
|
|
@ -0,0 +1,58 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_SCALE_FP16_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_SCALE_FP16_H_
|
||||
|
||||
#include <vector>
|
||||
#include "src/lite_kernel.h"
|
||||
#include "src/runtime/kernel/arm/fp32/scale.h"
|
||||
#include "nnacl/scale.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
||||
class ScaleFp16CPUKernel : public ScaleCPUKernel {
|
||||
public:
|
||||
ScaleFp16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
|
||||
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
|
||||
const mindspore::lite::PrimitiveC *primitive)
|
||||
: ScaleCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
|
||||
~ScaleFp16CPUKernel() = default;
|
||||
|
||||
int Init() override;
|
||||
int ReSize() override;
|
||||
int Run() override;
|
||||
int InitScaleOffset() override;
|
||||
int Scale(int task_id);
|
||||
|
||||
private:
|
||||
int MallocAssignTmpBuffer();
|
||||
void FreeTmpBuffer();
|
||||
|
||||
private:
|
||||
bool malloc_input_ = false;
|
||||
bool malloc_scale_ = false;
|
||||
bool malloc_offset_ = false;
|
||||
bool malloc_output_ = false;
|
||||
|
||||
float16_t *input_ = nullptr;
|
||||
float16_t *scale_ = nullptr;
|
||||
float16_t *offset_ = nullptr;
|
||||
float16_t *output_ = nullptr;
|
||||
};
|
||||
} // namespace mindspore::kernel
|
||||
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_SCALE_FP16_H_
|
|
@ -0,0 +1,134 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "src/runtime/kernel/arm/fp16/stack_fp16.h"
|
||||
#include <vector>
|
||||
#include "schema/model_generated.h"
|
||||
#include "src/kernel_registry.h"
|
||||
#include "nnacl/stack_parameter.h"
|
||||
#include "include/errorcode.h"
|
||||
#include "src/runtime/kernel/arm/fp16/common_fp16.h"
|
||||
#include "nnacl/fp16/cast_fp16.h"
|
||||
#include "nnacl/fp16/stack_fp16.h"
|
||||
|
||||
using mindspore::lite::KernelRegistrar;
|
||||
using mindspore::lite::RET_ERROR;
|
||||
using mindspore::lite::RET_OK;
|
||||
using mindspore::schema::PrimitiveType_Stack;
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
||||
int StackFp16CPUKernel::Init() {
|
||||
if (!InferShapeDone()) {
|
||||
return RET_OK;
|
||||
}
|
||||
return ReSize();
|
||||
}
|
||||
|
||||
void StackFp16CPUKernel::InitMallocFlags() {
|
||||
malloc_buffers_.resize(in_tensors_.size());
|
||||
for (size_t i = 0; i < in_tensors_.size(); ++i) {
|
||||
malloc_buffers_[i] = in_tensors_[i]->data_type() == kNumberTypeFloat32;
|
||||
}
|
||||
malloc_out = out_tensors_[0]->data_type() == kNumberTypeFloat32;
|
||||
}
|
||||
|
||||
int StackFp16CPUKernel::MallocAssignBuffer() {
|
||||
buffers_.resize(in_tensors_.size(), nullptr);
|
||||
for (size_t i = 0; i < in_tensors_.size(); ++i) {
|
||||
buffers_[i] = ConvertInputFp32toFp16(in_tensors_[i], context_);
|
||||
if (buffers_[i] == nullptr) {
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
out_buffer_ = nullptr;
|
||||
out_buffer_ = MallocOutputFp16(out_tensors_[0], context_);
|
||||
if (out_buffer_ == nullptr) {
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void StackFp16CPUKernel::FreeBuffer() {
|
||||
for (size_t i = 0; i < buffers_.size(); ++i) {
|
||||
if (malloc_buffers_[i] && buffers_[i] != nullptr) {
|
||||
context_->allocator->Free(buffers_[i]);
|
||||
buffers_[i] = nullptr;
|
||||
}
|
||||
}
|
||||
if (malloc_out && out_buffer_ != nullptr) {
|
||||
context_->allocator->Free(out_buffer_);
|
||||
out_buffer_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
int StackFp16CPUKernel::Run() {
|
||||
auto ret = Prepare();
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Prepare fail!ret: " << ret;
|
||||
return ret;
|
||||
}
|
||||
size_t inputs_num = in_tensors_.size();
|
||||
auto input0 = in_tensors_[0];
|
||||
if (inputs_num == 1) {
|
||||
memcpy(out_tensors_[0]->MutableData(), input0->MutableData(), input0->Size());
|
||||
return RET_OK;
|
||||
}
|
||||
InitMallocFlags();
|
||||
ret = MallocAssignBuffer();
|
||||
if (ret != RET_OK) {
|
||||
FreeBuffer();
|
||||
return ret;
|
||||
}
|
||||
auto input0_shape = input0->shape();
|
||||
DoStackFp16(buffers_.data(), inputs_num, input0_shape.data(), input0_shape.size(), axis_, out_buffer_);
|
||||
// if output tensor is fp32, we need to transform
|
||||
if (malloc_out) {
|
||||
auto out_tensor = out_tensors_.at(0);
|
||||
Float16ToFloat32(out_buffer_, reinterpret_cast<float *>(out_tensor->MutableData()), out_tensor->ElementsNum());
|
||||
}
|
||||
|
||||
FreeBuffer();
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
kernel::LiteKernel *CpuStackFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
|
||||
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
|
||||
const lite::InnerContext *ctx, const kernel::KernelKey &desc,
|
||||
const mindspore::lite::PrimitiveC *primitive) {
|
||||
if (op_parameter == nullptr) {
|
||||
MS_LOG(ERROR) << "Input op_parameter is nullptr!";
|
||||
return nullptr;
|
||||
}
|
||||
MS_ASSERT(desc.type == schema::PrimitiveType_Stack);
|
||||
auto *kernel = new (std::nothrow) StackFp16CPUKernel(op_parameter, inputs, outputs, ctx, primitive);
|
||||
if (kernel == nullptr) {
|
||||
MS_LOG(ERROR) << "new StackFp16CPUKernel fail!";
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto ret = kernel->Init();
|
||||
if (ret != RET_OK) {
|
||||
delete kernel;
|
||||
MS_LOG(ERROR) << "Init kernel failed, name: " << op_parameter->name_ << ", type: "
|
||||
<< schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(op_parameter->type_));
|
||||
return nullptr;
|
||||
}
|
||||
return kernel;
|
||||
}
|
||||
|
||||
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Stack, CpuStackFp16KernelCreator)
|
||||
} // namespace mindspore::kernel
|
|
@ -0,0 +1,49 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_STACK_FP16_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_STACK_FP16_H_
|
||||
|
||||
#include <vector>
|
||||
#include "src/lite_kernel.h"
|
||||
#include "src/runtime/kernel/arm/fp32/stack.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
class StackFp16CPUKernel : public StackCPUKernel {
|
||||
public:
|
||||
StackFp16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
|
||||
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
|
||||
const mindspore::lite::PrimitiveC *primitive)
|
||||
: StackCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
|
||||
|
||||
~StackFp16CPUKernel() = default;
|
||||
|
||||
int Init() override;
|
||||
int Run() override;
|
||||
|
||||
private:
|
||||
void InitMallocFlags();
|
||||
int MallocAssignBuffer();
|
||||
void FreeBuffer();
|
||||
|
||||
private:
|
||||
std::vector<bool> malloc_buffers_;
|
||||
std::vector<float16_t *> buffers_;
|
||||
float16_t *out_buffer_;
|
||||
bool malloc_out;
|
||||
};
|
||||
} // namespace mindspore::kernel
|
||||
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_STACK_FP16_H_
|
|
@ -138,7 +138,7 @@ int ScaleCPUKernel::Init() {
|
|||
int ScaleCPUKernel::ReSize() {
|
||||
auto ret = CalculateParameter();
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Scale fp32 InitParameter failed.";
|
||||
MS_LOG(ERROR) << "Scale fp32 CalculateParameter failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
|
|
|
@ -37,15 +37,17 @@ class ScaleCPUKernel : public LiteKernel {
|
|||
int ReSize() override;
|
||||
int Run() override;
|
||||
int CalculateParameter();
|
||||
int InitScaleOffset();
|
||||
virtual int InitScaleOffset();
|
||||
int Scale(int task_id);
|
||||
|
||||
protected:
|
||||
ScaleParameter *scale_param_;
|
||||
|
||||
private:
|
||||
float *input_ptr_ = nullptr;
|
||||
float *scale_ = nullptr;
|
||||
float *offset_ = nullptr;
|
||||
float *output_ptr_ = nullptr;
|
||||
ScaleParameter *scale_param_;
|
||||
};
|
||||
} // namespace mindspore::kernel
|
||||
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#include "schema/model_generated.h"
|
||||
#include "src/kernel_registry.h"
|
||||
#include "nnacl/fp32/stack.h"
|
||||
#include "nnacl/stack_parameter.h"
|
||||
#include "include/errorcode.h"
|
||||
|
||||
using mindspore::lite::KernelRegistrar;
|
||||
|
|
|
@ -33,7 +33,7 @@ class StackCPUKernel : public LiteKernel {
|
|||
int ReSize() override;
|
||||
int Run() override;
|
||||
|
||||
private:
|
||||
protected:
|
||||
int axis_;
|
||||
};
|
||||
} // namespace mindspore::kernel
|
||||
|
|
Loading…
Reference in New Issue