scale stack fp16

2020-09-29 09:48:41 +08:00 · 2020-09-29 09:48:41 +08:00 · 39c0ef10cb
parent 40918bc46a
commit 39c0ef10cb
15 changed files with 837 additions and 10 deletions
--- a/mindspore/lite/nnacl/fp16/scale_fp16.c
+++ b/mindspore/lite/nnacl/fp16/scale_fp16.c
@ -0,0 +1,223 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nnacl/fp16/scale_fp16.h"
+
+void ScaleInner(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int outer_start,
+                int outer_end, int axis_size, int inner_size) {
+  for (int out = outer_start; out < outer_end; out++) {
+    int out_offset = out * axis_size * inner_size;
+    for (int i = 0; i < axis_size; i++) {
+      int axis_offset = out_offset + i * inner_size;
+      int in_index = 0;
+#ifdef ENABLE_ARM64
+      for (; in_index < inner_size - 8; in_index += 8) {
+        int in_offset = axis_offset + in_index;
+        float16x8_t data = vld1q_f16(in_data + in_offset);
+        float16x8_t scale_8 = vdupq_n_f16(scale[i]);
+        float16x8_t offset_8 = vdupq_n_f16(offset[i]);
+        float16x8_t reslut = vfmaq_f16(offset_8, data, scale_8);
+
+        vst1q_f16(out_data + in_offset, reslut);
+      }
+#endif
+      for (; in_index < inner_size; in_index++) {
+        int in_offset = axis_offset + in_index;
+        out_data[in_offset] = in_data[in_offset] * scale[i] + offset[i];
+      }
+    }
+  }
+}
+
+void ScaleAxis(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int outer_start,
+               int outer_end, int axis_size) {
+  for (int out = outer_start; out < outer_end; out++) {
+    int out_offset = out * axis_size;
+    int index = 0;
+#ifdef ENABLE_ARM64
+    for (; index < axis_size - 8; index += 8) {
+      int in_offset = out_offset + index;
+      float16x8_t data = vld1q_f16(in_data + in_offset);
+      float16x8_t scale_8 = vld1q_f16(scale + index);
+      float16x8_t offset_8 = vld1q_f16(offset + index);
+      float16x8_t reslut = vfmaq_f16(offset_8, data, scale_8);
+      vst1q_f16(out_data + in_offset, reslut);
+    }
+#endif
+    for (; index < axis_size; index++) {
+      int in_offset = out_offset + index;
+      out_data[in_offset] = in_data[in_offset] * scale[index] + offset[index];
+    }
+  }
+}
+
+void DoScaleFp16(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int task_id,
+                 ScaleParameter *scale_param) {
+  int outer_step = UP_DIV(scale_param->outer_size_, scale_param->op_parameter_.thread_num_);
+  int outer_start = task_id * outer_step;
+  int outer_end = MSMIN(outer_start + outer_step, scale_param->outer_size_);
+
+  if (scale_param->inner_size_ == 1) {
+    ScaleAxis(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_);
+  } else {
+    ScaleInner(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_,
+               scale_param->inner_size_);
+  }
+}
+
+void ScaleInnerRelu(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int outer_start,
+                    int outer_end, int axis_size, int inner_size) {
+#ifdef ENABLE_ARM64
+  float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0};
+#endif
+  for (int out = outer_start; out < outer_end; out++) {
+    int out_offset = out * axis_size * inner_size;
+    for (int i = 0; i < axis_size; i++) {
+      int axis_offset = out_offset + i * inner_size;
+      int in_index = 0;
+#ifdef ENABLE_ARM64
+      for (; in_index < inner_size - 8; in_index += 8) {
+        int in_offset = axis_offset + in_index;
+        float16x8_t data = vld1q_f16(in_data + in_offset);
+        float16x8_t scale_8 = vdupq_n_f16(scale[i]);
+        float16x8_t offset_8 = vdupq_n_f16(offset[i]);
+        float16x8_t tmp = vfmaq_f16(offset_8, data, scale_8);
+        float16x8_t result = vmaxq_f16(tmp, zeros);
+        vst1q_f16(out_data + in_offset, result);
+      }
+#endif
+      for (; in_index < inner_size; in_index++) {
+        int in_offset = axis_offset + in_index;
+        float tmp = in_data[in_offset] * scale[i] + offset[i];
+        out_data[in_offset] = tmp > 0.0f ? tmp : 0.0f;
+      }
+    }
+  }
+}
+
+void ScaleAxisRelu(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int outer_start,
+                   int outer_end, int axis_size) {
+#ifdef ENABLE_ARM64
+  float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0};
+#endif
+  for (int out = outer_start; out < outer_end; out++) {
+    int out_offset = out * axis_size;
+    int index = 0;
+#ifdef ENABLE_ARM64
+    for (; index < axis_size - 8; index += 8) {
+      int in_offset = out_offset + index;
+      float16x8_t data = vld1q_f16(in_data + in_offset);
+      float16x8_t scale_8 = vld1q_f16(scale + index);
+      float16x8_t offset_8 = vld1q_f16(offset + index);
+      float16x8_t tmp = vfmaq_f16(offset_8, data, scale_8);
+      float16x8_t result = vmaxq_f16(tmp, zeros);
+      vst1q_f16(out_data + in_offset, result);
+    }
+#endif
+    for (; index < axis_size; index++) {
+      int in_offset = out_offset + index;
+      float tmp = in_data[in_offset] * scale[index] + offset[index];
+      out_data[in_offset] = tmp > 0.0f ? tmp : 0.0f;
+    }
+  }
+}
+
+void DoScaleReluFp16(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int task_id,
+                     ScaleParameter *scale_param) {
+  int outer_step = UP_DIV(scale_param->outer_size_, scale_param->op_parameter_.thread_num_);
+  int outer_start = task_id * outer_step;
+  int outer_end = MSMIN(outer_start + outer_step, scale_param->outer_size_);
+
+  if (scale_param->inner_size_ == 1) {
+    ScaleAxisRelu(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_);
+  } else {
+    ScaleInnerRelu(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_,
+                   scale_param->inner_size_);
+  }
+}
+
+void ScaleInnerRelu6(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int outer_start,
+                     int outer_end, int axis_size, int inner_size) {
+#ifdef ENABLE_ARM64
+  float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0};
+  float16x8_t bounds = {6, 6, 6, 6, 6, 6, 6, 6};
+#endif
+  for (int out = outer_start; out < outer_end; out++) {
+    int out_offset = out * axis_size * inner_size;
+    for (int i = 0; i < axis_size; i++) {
+      int axis_offset = out_offset + i * inner_size;
+      int in_index = 0;
+#ifdef ENABLE_ARM64
+      for (; in_index < inner_size - 8; in_index += 8) {
+        int in_offset = axis_offset + in_index;
+        float16x8_t data = vld1q_f16(in_data + in_offset);
+        float16x8_t scale_8 = vdupq_n_f16(scale[i]);
+        float16x8_t offset_8 = vdupq_n_f16(offset[i]);
+        float16x8_t tmp = vfmaq_f16(offset_8, data, scale_8);
+        float16x8_t result = vminq_f16(vmaxq_f16(tmp, zeros), bounds);
+        vst1q_f16(out_data + in_offset, result);
+      }
+#endif
+      for (; in_index < inner_size; in_index++) {
+        int in_offset = axis_offset + in_index;
+        float tmp = in_data[in_offset] * scale[i] + offset[i];
+        out_data[in_offset] = MSMIN(MSMAX(tmp, 0.0f), 6.0f);
+      }
+    }
+  }
+}
+
+void ScaleAxisRelu6(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int outer_start,
+                    int outer_end, int axis_size) {
+#ifdef ENABLE_ARM64
+  float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0};
+  float16x8_t bounds = {6, 6, 6, 6, 6, 6, 6, 6};
+#endif
+  for (int out = outer_start; out < outer_end; out++) {
+    int out_offset = out * axis_size;
+    int index = 0;
+#ifdef ENABLE_ARM64
+    for (; index < axis_size - 8; index += 8) {
+      int in_offset = out_offset + index;
+      float16x8_t data = vld1q_f16(in_data + in_offset);
+      float16x8_t scale_8 = vld1q_f16(scale + index);
+      float16x8_t offset_8 = vld1q_f16(offset + index);
+      float16x8_t tmp = vfmaq_f16(offset_8, data, scale_8);
+      float16x8_t result = vminq_f16(vmaxq_f16(tmp, zeros), bounds);
+      vst1q_f16(out_data + in_offset, result);
+    }
+#endif
+    for (; index < axis_size; index++) {
+      int in_offset = out_offset + index;
+      float tmp = in_data[in_offset] * scale[index] + offset[index];
+      out_data[in_offset] = MSMIN(MSMAX(tmp, 0.0f), 6.0f);
+    }
+  }
+}
+
+void DoScaleRelu6Fp16(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int task_id,
+                      ScaleParameter *scale_param) {
+  int outer_step = UP_DIV(scale_param->outer_size_, scale_param->op_parameter_.thread_num_);
+  int outer_start = task_id * outer_step;
+  int outer_end = MSMIN(outer_start + outer_step, scale_param->outer_size_);
+
+  if (scale_param->inner_size_ == 1) {
+    ScaleAxisRelu6(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_);
+  } else {
+    ScaleInnerRelu6(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_,
+                    scale_param->inner_size_);
+  }
+}
--- a/mindspore/lite/nnacl/fp16/scale_fp16.h
+++ b/mindspore/lite/nnacl/fp16/scale_fp16.h
@ -0,0 +1,38 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_NNACL_SCALE_FP16_H_
+#define MINDSPORE_LITE_NNACL_SCALE_FP16_H_
+
+#include "nnacl/op_base.h"
+#include "nnacl/scale.h"
+#ifdef ENABLE_NEON
+#include <arm_neon.h>
+#endif
+#ifdef __cplusplus
+extern "C" {
+#endif
+void DoScaleFp16(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int task_id,
+                 ScaleParameter *scale_param);
+void DoScaleReluFp16(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int task_id,
+                     ScaleParameter *scale_param);
+void DoScaleRelu6Fp16(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int task_id,
+                      ScaleParameter *scale_param);
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // MINDSPORE_LITE_NNACL_SCALE_FP16_H_
--- a/mindspore/lite/nnacl/fp16/stack_fp16.c
+++ b/mindspore/lite/nnacl/fp16/stack_fp16.c
@ -0,0 +1,54 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nnacl/fp16/stack_fp16.h"
+#include "nnacl/arithmetic_common.h"
+
+size_t GetStackCopyNum(int axis, int *in_shape, size_t shape_size) {
+  size_t one_input_size = 1;
+  for (size_t i = 0; i < shape_size; ++i) {
+    one_input_size *= in_shape[i];
+  }
+  int in_strides[4];
+  ComputeStrides(in_shape, in_strides, shape_size);
+
+  size_t copy_num = axis > 0 ? in_strides[axis - 1] : one_input_size;
+  return copy_num;
+}
+
+size_t GetStackPreAxisCount(const int *in_shape, int axis) {
+  size_t pre_axis_count = 1;
+  for (size_t i = 0; i < axis; ++i) {
+    pre_axis_count *= in_shape[i];
+  }
+  return pre_axis_count;
+}
+
+void DoStackFp16(const float16_t *const *inputs, size_t input_num, int *in_shape, size_t shape_size, int axis,
+                 float16_t *output) {
+  size_t copy_num = GetStackCopyNum(axis, in_shape, shape_size);
+  size_t copy_size = copy_num * sizeof(float16_t);
+  size_t pre_axis_count = GetStackPreAxisCount(in_shape, axis);
+  size_t in_offset = 0;
+  size_t out_offset = 0;
+  for (size_t i = 0; i < pre_axis_count; ++i) {
+    for (size_t j = 0; j < input_num; ++j) {
+      memcpy(output + out_offset, inputs[j] + in_offset, copy_size);
+      out_offset += copy_num;
+    }
+    in_offset += copy_num;
+  }
+}
--- a/mindspore/lite/nnacl/fp16/stack_fp16.h
+++ b/mindspore/lite/nnacl/fp16/stack_fp16.h
@ -0,0 +1,33 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_NNACL_FP16_STACK_FP16_H_
+#define MINDSPORE_LITE_NNACL_FP16_STACK_FP16_H_
+
+#include "nnacl/op_base.h"
+#ifdef ENABLE_NEON
+#include <arm_neon.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+void DoStackFp16(const float16_t *const *inputs, size_t input_num, int *in_shape, size_t shape_size, int axis,
+                 float16_t *output);
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // MINDSPORE_LITE_NNACL_FP16_STACK_FP16_H_
--- a/mindspore/lite/nnacl/fp32/stack.h
+++ b/mindspore/lite/nnacl/fp32/stack.h
@ -18,11 +18,6 @@

 #include "nnacl/op_base.h"

-typedef struct StackParameter {
-  OpParameter op_parameter_;
-  int32_t axis_;
-} StackParameter;
-
 #ifdef __cplusplus
 extern "C" {
 #endif
--- a/mindspore/lite/nnacl/stack_parameter.h
+++ b/mindspore/lite/nnacl/stack_parameter.h
@ -0,0 +1,26 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_NNACL_STACK_PARAMETER_H_
+#define MINDSPORE_LITE_NNACL_STACK_PARAMETER_H_
+
+#include "nnacl/op_base.h"
+typedef struct StackParameter {
+  OpParameter op_parameter_;
+  int32_t axis_;
+} StackParameter;
+
+#endif  // MINDSPORE_LITE_NNACL_STACK_PARAMETER_H_
--- a/mindspore/lite/src/populate_parameter.cc
+++ b/mindspore/lite/src/populate_parameter.cc
@ -125,7 +125,7 @@
 #include "nnacl/prelu_parameter.h"
 #include "nnacl/shape.h"
 #include "nnacl/fp32/constant_of_shape.h"
-#include "nnacl/fp32/stack.h"
+#include "nnacl/stack_parameter.h"
 #include "nnacl/unstack.h"
 #include "nnacl/depth_to_space.h"
 #include "nnacl/conv_parameter.h"
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/scale_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/scale_fp16.cc
@ -0,0 +1,214 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/arm/fp16/scale_fp16.h"
+#include <string.h>
+#include <vector>
+#include "schema/model_generated.h"
+#include "src/kernel_registry.h"
+#include "include/errorcode.h"
+#include "src/runtime/runtime_api.h"
+#include "src/runtime/kernel/arm/fp16/common_fp16.h"
+#include "nnacl/fp16/scale_fp16.h"
+#include "nnacl/fp16/cast_fp16.h"
+
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_Scale;
+
+namespace mindspore::kernel {
+
+int ScaleFp16CPUKernel::InitScaleOffset() {
+  auto input_tensor = in_tensors_.at(0);
+  malloc_input_ = input_tensor->data_type() == kNumberTypeFloat32;
+
+  auto scale_tensor = in_tensors_.at(1);
+  malloc_scale_ = scale_tensor->data_type() == kNumberTypeFloat32;
+
+  if (in_tensors_.size() == 2) {
+    malloc_offset_ = true;
+  } else {
+    auto offset_tensor = in_tensors_.at(2);
+    malloc_offset_ = offset_tensor->data_type() == kNumberTypeFloat32;
+  }
+
+  auto output_tensor = out_tensors_.at(0);
+  malloc_output_ = output_tensor->data_type() == kNumberTypeFloat32;
+  return RET_OK;
+}
+
+int ScaleFp16CPUKernel::Init() {
+  if (in_tensors_.size() < 2 || in_tensors_.size() > 3) {
+    MS_LOG(ERROR) << "inputs to Scale operator should be 2 or 3, but " << in_tensors_.size() << " is given.";
+    return RET_ERROR;
+  }
+
+  if (!InferShapeDone()) {
+    return RET_OK;
+  }
+  ReSize();
+  return RET_OK;
+}
+
+int ScaleFp16CPUKernel::ReSize() {
+  auto ret = CalculateParameter();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Scale fp16 CalculateParameter failed.";
+    return RET_ERROR;
+  }
+
+  return RET_OK;
+}
+
+int ScaleFp16CPUKernel::Scale(int task_id) {
+  switch (scale_param_->activation_type_) {
+    case schema::ActivationType_RELU6:
+      DoScaleRelu6Fp16(input_, output_, scale_, offset_, task_id, scale_param_);
+      break;
+    case schema::ActivationType_RELU:
+      DoScaleReluFp16(input_, output_, scale_, offset_, task_id, scale_param_);
+      break;
+    case schema::ActivationType_NO_ACTIVATION:
+      DoScaleFp16(input_, output_, scale_, offset_, task_id, scale_param_);
+      break;
+    default:
+      MS_LOG(ERROR) << "ScaleFp16 does not support activation type " << scale_param_->activation_type_;
+      return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int ScaleRun(void *cdata, int task_id) {
+  auto scale = reinterpret_cast<ScaleFp16CPUKernel *>(cdata);
+  auto ret = scale->Scale(task_id);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "ScaleRun error task_id[" << task_id << "] error_code[" << ret << "]";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int ScaleFp16CPUKernel::Run() {
+  auto ret = Prepare();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Prepare fail!ret: " << ret;
+    return ret;
+  }
+  ret = InitScaleOffset();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Scale fp16 InitScaleOffset failed.";
+    return RET_ERROR;
+  }
+
+  ret = MallocAssignTmpBuffer();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Scale Fp16 malloc tmp buffer failed";
+    FreeTmpBuffer();
+    return ret;
+  }
+
+  ret = ParallelLaunch(this->context_->thread_pool_, ScaleRun, this, op_parameter_->thread_num_);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Scale error error_code[" << ret << "]";
+    return RET_ERROR;
+  }
+
+  // if output tensor is fp32, we need to transform
+  if (malloc_output_) {
+    auto out_tensor = out_tensors_.at(0);
+    Float16ToFloat32(output_, reinterpret_cast<float *>(out_tensor->MutableData()), out_tensor->ElementsNum());
+  }
+  FreeTmpBuffer();
+  return RET_OK;
+}
+
+int ScaleFp16CPUKernel::MallocAssignTmpBuffer() {
+  input_ = ConvertInputFp32toFp16(in_tensors_.at(0), context_);
+  if (input_ == nullptr) {
+    return RET_ERROR;
+  }
+  scale_ = ConvertInputFp32toFp16(in_tensors_.at(1), context_);
+  if (scale_ == nullptr) {
+    return RET_ERROR;
+  }
+  if (in_tensors_.size() == 3) {
+    offset_ = ConvertInputFp32toFp16(in_tensors_.at(2), context_);
+    if (offset_ == nullptr) {
+      return RET_ERROR;
+    }
+  } else {
+    offset_ =
+      reinterpret_cast<float16_t *>(context_->allocator->Malloc(in_tensors_.at(1)->ElementsNum() * sizeof(float16_t)));
+    if (offset_ == nullptr) {
+      MS_LOG(ERROR) << "Malloc data failed";
+      return RET_ERROR;
+    }
+    memset(offset_, 0, in_tensors_.at(1)->ElementsNum() * sizeof(float16_t));
+  }
+  output_ = MallocOutputFp16(out_tensors_.at(0), context_);
+  if (output_ == nullptr) {
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+void ScaleFp16CPUKernel::FreeTmpBuffer() {
+  if (malloc_input_ && input_ != nullptr) {
+    context_->allocator->Free(input_);
+    input_ = nullptr;
+  }
+  if (malloc_scale_ && scale_ != nullptr) {
+    context_->allocator->Free(scale_);
+    scale_ = nullptr;
+  }
+  if (malloc_offset_ && offset_ != nullptr) {
+    context_->allocator->Free(offset_);
+    offset_ = nullptr;
+  }
+  if (malloc_output_ && output_ != nullptr) {
+    context_->allocator->Free(output_);
+    output_ = nullptr;
+  }
+}
+
+kernel::LiteKernel *CpuScaleFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
+                                              const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
+                                              const lite::InnerContext *ctx, const kernel::KernelKey &desc,
+                                              const mindspore::lite::PrimitiveC *primitive) {
+  MS_ASSERT(desc.type == schema::PrimitiveType_Scale);
+  if (opParameter == nullptr) {
+    MS_LOG(ERROR) << "opParameter is nullptr";
+    return nullptr;
+  }
+
+  auto *kernel = new (std::nothrow) ScaleFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
+  if (kernel == nullptr) {
+    MS_LOG(ERROR) << "New kernel fails.";
+    return nullptr;
+  }
+  auto ret = kernel->Init();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
+                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
+    delete kernel;
+    return nullptr;
+  }
+  return kernel;
+}
+
+REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Scale, CpuScaleFp16KernelCreator)
+}  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/scale_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/scale_fp16.h
@ -0,0 +1,58 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_SCALE_FP16_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_SCALE_FP16_H_
+
+#include <vector>
+#include "src/lite_kernel.h"
+#include "src/runtime/kernel/arm/fp32/scale.h"
+#include "nnacl/scale.h"
+
+namespace mindspore::kernel {
+
+class ScaleFp16CPUKernel : public ScaleCPUKernel {
+ public:
+  ScaleFp16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
+                     const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
+                     const mindspore::lite::PrimitiveC *primitive)
+      : ScaleCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
+  ~ScaleFp16CPUKernel() = default;
+
+  int Init() override;
+  int ReSize() override;
+  int Run() override;
+  int InitScaleOffset() override;
+  int Scale(int task_id);
+
+ private:
+  int MallocAssignTmpBuffer();
+  void FreeTmpBuffer();
+
+ private:
+  bool malloc_input_ = false;
+  bool malloc_scale_ = false;
+  bool malloc_offset_ = false;
+  bool malloc_output_ = false;
+
+  float16_t *input_ = nullptr;
+  float16_t *scale_ = nullptr;
+  float16_t *offset_ = nullptr;
+  float16_t *output_ = nullptr;
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_SCALE_FP16_H_
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.cc
@ -0,0 +1,134 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "src/runtime/kernel/arm/fp16/stack_fp16.h"
+#include <vector>
+#include "schema/model_generated.h"
+#include "src/kernel_registry.h"
+#include "nnacl/stack_parameter.h"
+#include "include/errorcode.h"
+#include "src/runtime/kernel/arm/fp16/common_fp16.h"
+#include "nnacl/fp16/cast_fp16.h"
+#include "nnacl/fp16/stack_fp16.h"
+
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_Stack;
+
+namespace mindspore::kernel {
+
+int StackFp16CPUKernel::Init() {
+  if (!InferShapeDone()) {
+    return RET_OK;
+  }
+  return ReSize();
+}
+
+void StackFp16CPUKernel::InitMallocFlags() {
+  malloc_buffers_.resize(in_tensors_.size());
+  for (size_t i = 0; i < in_tensors_.size(); ++i) {
+    malloc_buffers_[i] = in_tensors_[i]->data_type() == kNumberTypeFloat32;
+  }
+  malloc_out = out_tensors_[0]->data_type() == kNumberTypeFloat32;
+}
+
+int StackFp16CPUKernel::MallocAssignBuffer() {
+  buffers_.resize(in_tensors_.size(), nullptr);
+  for (size_t i = 0; i < in_tensors_.size(); ++i) {
+    buffers_[i] = ConvertInputFp32toFp16(in_tensors_[i], context_);
+    if (buffers_[i] == nullptr) {
+      return RET_ERROR;
+    }
+  }
+
+  out_buffer_ = nullptr;
+  out_buffer_ = MallocOutputFp16(out_tensors_[0], context_);
+  if (out_buffer_ == nullptr) {
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+void StackFp16CPUKernel::FreeBuffer() {
+  for (size_t i = 0; i < buffers_.size(); ++i) {
+    if (malloc_buffers_[i] && buffers_[i] != nullptr) {
+      context_->allocator->Free(buffers_[i]);
+      buffers_[i] = nullptr;
+    }
+  }
+  if (malloc_out && out_buffer_ != nullptr) {
+    context_->allocator->Free(out_buffer_);
+    out_buffer_ = nullptr;
+  }
+}
+
+int StackFp16CPUKernel::Run() {
+  auto ret = Prepare();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Prepare fail!ret: " << ret;
+    return ret;
+  }
+  size_t inputs_num = in_tensors_.size();
+  auto input0 = in_tensors_[0];
+  if (inputs_num == 1) {
+    memcpy(out_tensors_[0]->MutableData(), input0->MutableData(), input0->Size());
+    return RET_OK;
+  }
+  InitMallocFlags();
+  ret = MallocAssignBuffer();
+  if (ret != RET_OK) {
+    FreeBuffer();
+    return ret;
+  }
+  auto input0_shape = input0->shape();
+  DoStackFp16(buffers_.data(), inputs_num, input0_shape.data(), input0_shape.size(), axis_, out_buffer_);
+  // if output tensor is fp32, we need to transform
+  if (malloc_out) {
+    auto out_tensor = out_tensors_.at(0);
+    Float16ToFloat32(out_buffer_, reinterpret_cast<float *>(out_tensor->MutableData()), out_tensor->ElementsNum());
+  }
+
+  FreeBuffer();
+  return RET_OK;
+}
+
+kernel::LiteKernel *CpuStackFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
+                                              const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
+                                              const lite::InnerContext *ctx, const kernel::KernelKey &desc,
+                                              const mindspore::lite::PrimitiveC *primitive) {
+  if (op_parameter == nullptr) {
+    MS_LOG(ERROR) << "Input op_parameter is nullptr!";
+    return nullptr;
+  }
+  MS_ASSERT(desc.type == schema::PrimitiveType_Stack);
+  auto *kernel = new (std::nothrow) StackFp16CPUKernel(op_parameter, inputs, outputs, ctx, primitive);
+  if (kernel == nullptr) {
+    MS_LOG(ERROR) << "new StackFp16CPUKernel fail!";
+    return nullptr;
+  }
+
+  auto ret = kernel->Init();
+  if (ret != RET_OK) {
+    delete kernel;
+    MS_LOG(ERROR) << "Init kernel failed, name: " << op_parameter->name_ << ", type: "
+                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(op_parameter->type_));
+    return nullptr;
+  }
+  return kernel;
+}
+
+REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Stack, CpuStackFp16KernelCreator)
+}  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.h
@ -0,0 +1,49 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_STACK_FP16_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_STACK_FP16_H_
+
+#include <vector>
+#include "src/lite_kernel.h"
+#include "src/runtime/kernel/arm/fp32/stack.h"
+
+namespace mindspore::kernel {
+class StackFp16CPUKernel : public StackCPUKernel {
+ public:
+  StackFp16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
+                     const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
+                     const mindspore::lite::PrimitiveC *primitive)
+      : StackCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
+
+  ~StackFp16CPUKernel() = default;
+
+  int Init() override;
+  int Run() override;
+
+ private:
+  void InitMallocFlags();
+  int MallocAssignBuffer();
+  void FreeBuffer();
+
+ private:
+  std::vector<bool> malloc_buffers_;
+  std::vector<float16_t *> buffers_;
+  float16_t *out_buffer_;
+  bool malloc_out;
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_STACK_FP16_H_
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/scale.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/scale.cc
@ -138,7 +138,7 @@ int ScaleCPUKernel::Init() {
 int ScaleCPUKernel::ReSize() {
  auto ret = CalculateParameter();
  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Scale fp32 InitParameter failed.";
+    MS_LOG(ERROR) << "Scale fp32 CalculateParameter failed.";
    return RET_ERROR;
  }

--- a/mindspore/lite/src/runtime/kernel/arm/fp32/scale.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/scale.h
@ -37,15 +37,17 @@ class ScaleCPUKernel : public LiteKernel {
  int ReSize() override;
  int Run() override;
  int CalculateParameter();
-  int InitScaleOffset();
+  virtual int InitScaleOffset();
  int Scale(int task_id);

+ protected:
+  ScaleParameter *scale_param_;
+
 private:
  float *input_ptr_ = nullptr;
  float *scale_ = nullptr;
  float *offset_ = nullptr;
  float *output_ptr_ = nullptr;
-  ScaleParameter *scale_param_;
 };
 }  // namespace mindspore::kernel

--- a/mindspore/lite/src/runtime/kernel/arm/fp32/stack.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/stack.cc
@ -18,6 +18,7 @@
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "nnacl/fp32/stack.h"
+#include "nnacl/stack_parameter.h"
 #include "include/errorcode.h"

 using mindspore::lite::KernelRegistrar;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/stack.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/stack.h
@ -33,7 +33,7 @@ class StackCPUKernel : public LiteKernel {
  int ReSize() override;
  int Run() override;

- private:
+ protected:
  int axis_;
 };
 }  // namespace mindspore::kernel