float16 kernels

yoni 2021-06-09 10:41:14 +03:00
parent 346a35ba7d
commit 394a8398e2
56 changed files with 4276 additions and 40 deletions

View File

@@ -14,21 +14,26 @@
* limitations under the License.
*/
#include <math.h>
#include "nnacl/op_base.h"
#include "nnacl/fp16_grad/activation_grad.h"
#include <math.h>
#include <float.h>
#ifdef ENABLE_NEON
#include <arm_neon.h>
#include "nnacl/fp32/exp_fp32.h"
#endif
#include "nnacl/op_base.h"
#include "nnacl/errorcode.h"
int Fp16ReluGrad(const float16_t *src0, const float16_t *src1, size_t length, float16_t *dst) {
int ReluFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst) {
int i = 0;
#ifdef ENABLE_NEON
float16x8_t zero_4 = vdupq_n_f16(0);
for (; i < length - 4; i += 4) {
float16x8_t src0_4 = vld1q_f16(src0 + i);
float16x8_t src1_4 = vld1q_f16(src1 + i);
uint16x8_t mask_4 = vcgtq_f16(src1_4, zero_4);
float16x8_t dst_4 = vbslq_f16(mask_4, src0_4, zero_4);
vst1q_f16(dst + i, dst_4);
float16x8_t zero_v = vdupq_n_f16(0);
for (; i <= length - C8NUM; i += C8NUM) {
float16x8_t src0_v = vld1q_f16(src0 + i);
float16x8_t src1_v = vld1q_f16(src1 + i);
uint16x8_t mask_v = vcgtq_f16(src1_v, zero_v);
float16x8_t dst_v = vbslq_f16(mask_v, src0_v, zero_v);
vst1q_f16(dst + i, dst_v);
}
#endif
for (; i < length; i++) {
@@ -37,15 +42,43 @@ int Fp16ReluGrad(const float16_t *src0, const float16_t *src1, size_t length, fl
return NNACL_OK;
}
int Fp16SigmoidGrad(const float16_t *src0, const float16_t *src1, size_t length, float16_t *dst) {
int Relu6Fp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst) {
int i = 0;
#ifdef ENABLE_NEON
float16x8_t one_4 = vdupq_n_f16(1);
for (; i < length - 4; i += 4) {
float16x8_t src0_4 = vld1q_f16(src0 + i);
float16x8_t src1_4 = vld1q_f16(src1 + i);
float16x8_t dst_4 = vmulq_f16(src0_4, vmulq_f16(src1_4, vsubq_f16(one_4, src1_4)));
vst1q_f16(dst + i, dst_4);
float16x8_t zero_8 = vdupq_n_f16(0);
float16x8_t six_8 = vdupq_n_f16(6);
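// relu6 passes the gradient exactly where clamp(x, 0, 6) == x, so compare the clamped value against the input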
for (; i <= length - C8NUM; i += C8NUM) {
float16x8_t src1_8 = vld1q_f16(src1 + i);
float16x8_t src0_8 = vld1q_f16(src0 + i);
float16x8_t max_8 = vmaxq_f16(src1_8, zero_8);
float16x8_t min_max_8 = vminq_f16(max_8, six_8);
uint16x8_t mask_8 = vceqq_f16(min_max_8, src1_8);
float16x8_t dst_8 = vbslq_f16(mask_8, src0_8, zero_8);
vst1q_f16(dst + i, dst_8);
}
#endif
for (; i < length; ++i) {
dst[i] = (src1[i] > 0.0f && src1[i] <= 6.0f) ? src0[i] : 0.0f;
}
return NNACL_OK;
}
int LReluFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst, float16_t alpha) {
// leaky relu: dy passes through where the input was positive and is scaled by alpha elsewhere
for (int i = 0; i < length; ++i) {
dst[i] = src1[i] > 0.0f ? src0[i] : src0[i] * alpha;
}
return NNACL_OK;
}
int SigmoidFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst) {
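// sigmoid'(x) = s * (1 - s); src1 holds s = sigmoid(x) and src0 holds the incoming gradient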
int i = 0;
#ifdef ENABLE_NEON
float16x8_t one_8 = vdupq_n_f16(1);
for (; i <= length - C8NUM; i += C8NUM) {
float16x8_t src0_8 = vld1q_f16(src0 + i);
float16x8_t src1_8 = vld1q_f16(src1 + i);
float16x8_t dst_8 = vmulq_f16(src0_8, vmulq_f16(src1_8, vsubq_f16(one_8, src1_8)));
vst1q_f16(dst + i, dst_8);
}
#endif
for (; i < length; i++) {
@@ -53,3 +86,56 @@ int Fp16SigmoidGrad(const float16_t *src0, const float16_t *src1, size_t length,
}
return NNACL_OK;
}
int TanhFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst) {
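// tanh'(x) = 1 - tanh(x)^2; src1 holds tanh(x), computed in fp32 to limit rounding error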
for (int i = 0; i < length; ++i) {
dst[i] = (float16_t)((1.0f - ((float)src1[i] * (float)src1[i])) * (float)src0[i]);
}
return NNACL_OK;
}
int HSwishFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst) {
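// hswish(x) = x * relu6(x + 3) / 6, so the derivative is 0 below -3, 1 above 3 and (2x + 3) / 6 in between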
for (int i = 0; i < length; ++i) {
float16_t tmp = (src1[i] > 3.0f ? 1.0f : (src1[i] < -3.0f ? 0.0f : (2.0f * src1[i] + 3.0f) / 6.0f));
dst[i] = tmp * src0[i];
}
return NNACL_OK;
}
int HSigmoidFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst) {
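// hsigmoid'(x) = 1/6 inside (-3, 3) and 0 outside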
for (int i = 0; i < length; ++i) {
float16_t tmp = (src1[i] > 3.0f ? 0.0f : (src1[i] < -3.0f ? 0.0f : 1.0f / 6.0f));
dst[i] = tmp * src0[i];
}
return NNACL_OK;
}
int EluFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst, float16_t alpha) {
int i = 0;
#ifdef ENABLE_NEON
float16x4_t zero_4 = vdup_n_f16(0);
float16x4_t one_4 = vdup_n_f16(1);
float16x4_t alpha_4 = vdup_n_f16(alpha);
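// x > 0 passes dy through; otherwise dy * alpha * expm1(x), with exp evaluated in fp32 via simd_exp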
for (; i <= length - C4NUM; i += C4NUM) {
float16x4_t src0_4 = vld1_f16(src0 + i);
float16x4_t src1_4 = vld1_f16(src1 + i);
uint16x4_t mask_4 = vcgt_f16(src1_4, zero_4);
float32x4_t tmp;
simd_exp(vcvt_f32_f16(src1_4), (float *)&tmp);
float16x4_t expm1_4 = vsub_f16(vcvt_f16_f32(tmp), one_4);
float16x4_t dst_4 = vbsl_f16(mask_4, src0_4, vmul_f16(alpha_4, vmul_f16(expm1_4, src0_4)));
vst1_f16(dst + i, dst_4);
}
#endif
for (; i < length; ++i) {
dst[i] = (src1[i] > 0.0f ? src0[i] : alpha * expm1(src1[i]) * src0[i]);
}
return NNACL_OK;
}
int GeluFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst) {
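// dGELU/dx = Phi(x) + x * phi(x); 1.4142135623730951 is sqrt(2) and 2.5066282746 approximates sqrt(2 * pi)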
for (int i = 0; i < length; ++i) {
dst[i] = src0[i] * ((0.5 * (1.0 + erf(src1[i] / 1.4142135623730951))) +
(src1[i] * exp(-0.5 * src1[i] * src1[i]) / 2.5066282746));
}
return NNACL_OK;
}

View File

@@ -13,8 +13,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_NNACL_FP16_GRAD_ACTIVATION_GRAD_H_
#define MINDSPORE_NNACL_FP16_GRAD_ACTIVATION_GRAD_H_
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_ACTIVATION_GRAD_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_ACTIVATION_GRAD_H_
#ifdef ENABLE_NEON
#include <arm_neon.h>
@@ -23,20 +23,22 @@
#include "nnacl/op_base.h"
#include "nnacl/int8/fixed_point.h"
typedef struct ActivationGradParameterFp16 {
OpParameter op_parameter;
int type_;
float alpha_;
} ActivationGradParameterFp16;
#ifdef __cplusplus
extern "C" {
#endif
int Fp16ReluGrad(const float16_t *src0, const float16_t *src1, size_t length, float16_t *dst);
int Fp16SigmoidGrad(const float16_t *src0, const float16_t *src1, size_t length, float16_t *dst);
int ReluFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst);
int Relu6Fp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst);
int LReluFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst, float16_t alpha);
int SigmoidFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst);
int TanhFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst);
int HSwishFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst);
int HSigmoidFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst);
int EluFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst, float16_t alpha);
int GeluFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_NNACL_FP16_GRAD_ACTIVATION_GRAD_H_
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_ACTIVATION_GRAD_H_

View File

@@ -0,0 +1,157 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nnacl/fp16_grad/arithmetic_grad.h"
#include <string.h>
#include <math.h>
#include "nnacl/fp32_grad/utils.h"
#include "nnacl/errorcode.h"
void ElementDivNegSquareFp16(const float16_t *nom, const float16_t *denom, float16_t *output, int element_size) {
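// gradient of a quotient w.r.t. its denominator: d(nom / denom)/d(denom) = -nom / denom^2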
for (int i = 0; i < element_size; i++) {
output[i] = -nom[i] / (denom[i] * denom[i]);
}
}
void ElementMulAndDivNegSquareFp16(const float16_t *a, const float16_t *b, const float16_t *denom, float16_t *output,
int element_size) {
for (int i = 0; i < element_size; i++) {
output[i] = -a[i] * b[i] / (denom[i] * denom[i]);
}
}
int ElementAbsGradFp16(const float16_t *in1, const float16_t *in2, float16_t *out, int element_size) {
for (int i = 0; i < element_size; i++) {
out[i] = (in1[i] < 0.f) ? -in2[i] : ((in1[i] > 0.f) ? in2[i] : 0);
}
return NNACL_OK;
}
void MaximumByAxesFp16(const float16_t *input0, const float16_t *input1, const float16_t *dy, const int *input0_dims,
const int *input1_dims, const int *dy_dims, float16_t *output0, float16_t *output1,
int num_dims) {
int num_output0 = 1;
int num_output1 = 1;
bool same_shape = true;
for (int idx = 0; idx < num_dims; ++idx) {
num_output0 *= input0_dims[idx];
num_output1 *= input1_dims[idx];
if (input0_dims[idx] != input1_dims[idx]) {
same_shape = false;
}
}
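// equal shapes: route dy elementwise to whichever input won; broadcast shapes: accumulate dy into the size-1 axes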
if (same_shape) {
int input_iter[8] = {0};
// Iterate through input_data.
do {
size_t offset = GetInputOffset(num_dims, input0_dims, input_iter);
output0[offset] = input0[offset] > input1[offset] ? dy[offset] : 0.;
output1[offset] = input1[offset] >= input0[offset] ? dy[offset] : 0.;
} while (NextIndex(num_dims, input0_dims, input_iter));
} else {
memset(output0, 0, num_output0 * sizeof(float16_t)); // zero output
memset(output1, 0, num_output1 * sizeof(float16_t)); // zero output
int input_iter[8] = {0};
int axes0[5] = {0};
int axes1[5] = {0};
int num_axes0 = 0;
int num_axes1 = 0;
for (int i = 0; i < num_dims; i++) {
if (input0_dims[i] == 1) {
axes0[num_axes0++] = i;
}
if (input1_dims[i] == 1) {
axes1[num_axes1++] = i;
}
}
do {
size_t offset0 = GetOutputOffset(num_dims, input0_dims, input_iter, num_axes0, axes0);
size_t offset1 = GetOutputOffset(num_dims, input1_dims, input_iter, num_axes1, axes1);
size_t yt_offset = GetInputOffset(num_dims, input0_dims, input_iter);
output0[offset0] += input0[offset0] > input1[offset1] ? dy[yt_offset] : 0.;
output1[offset1] += input1[offset1] >= input0[offset0] ? dy[yt_offset] : 0.;
} while (NextIndex(num_dims, dy_dims, input_iter));
}
}
void MinimumByAxesFp16(const float16_t *input0, const float16_t *input1, const float16_t *dy, const int *input0_dims,
const int *input1_dims, const int *dy_dims, float16_t *output0, float16_t *output1,
int num_dims) {
int num_output0 = 1;
int num_output1 = 1;
bool same_shape = true;
for (int idx = 0; idx < num_dims; ++idx) {
num_output0 *= input0_dims[idx];
num_output1 *= input1_dims[idx];
if (input0_dims[idx] != input1_dims[idx]) {
same_shape = false;
}
}
if (same_shape) {
int input_iter[8] = {0};
// Iterate through input_data.
do {
size_t offset = GetInputOffset(num_dims, input0_dims, input_iter);
output0[offset] = input0[offset] < input1[offset] ? dy[offset] : 0.;
output1[offset] = input1[offset] <= input0[offset] ? dy[offset] : 0.;
} while (NextIndex(num_dims, input0_dims, input_iter));
} else {
memset(output0, 0, num_output0 * sizeof(float16_t)); // zero output
memset(output1, 0, num_output1 * sizeof(float16_t)); // zero output
int input_iter[8] = {0};
int axes0[5] = {0};
int axes1[5] = {0};
int num_axes0 = 0;
int num_axes1 = 0;
for (int i = 0; i < num_dims; i++) {
if (input0_dims[i] == 1) {
axes0[num_axes0++] = i;
}
if (input1_dims[i] == 1) {
axes1[num_axes1++] = i;
}
}
do {
size_t offset0 = GetOutputOffset(num_dims, input0_dims, input_iter, num_axes0, axes0);
size_t offset1 = GetOutputOffset(num_dims, input1_dims, input_iter, num_axes1, axes1);
size_t yt_offset = GetInputOffset(num_dims, input0_dims, input_iter);
output0[offset0] += input0[offset0] < input1[offset1] ? dy[yt_offset] : 0.;
output1[offset1] += input1[offset1] <= input0[offset0] ? dy[yt_offset] : 0.;
} while (NextIndex(num_dims, dy_dims, input_iter));
}
}
int ElementSqrtGradFp16(const float16_t *in1, const float16_t *in2, float16_t *out, const int element_size) {
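// in1 holds sqrt(x); d sqrt(x)/dx = 1 / (2 * sqrt(x))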
for (int i = 0; i < element_size; i++) {
out[i] = 0.5f * in2[i] / in1[i];
}
return NNACL_OK;
}
int ElementRsqrtGradFp16(const float16_t *in1, const float16_t *in2, float16_t *out, const int element_size) {
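// in1 holds y = x^(-1/2); dy/dx = -0.5 * x^(-3/2) = -0.5 * y^3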
for (int i = 0; i < element_size; i++) {
out[i] = -0.5f * in2[i] * in1[i] * in1[i] * in1[i];
}
return NNACL_OK;
}

View File

@@ -0,0 +1,41 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_ARITHMETIC_GRAD_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_ARITHMETIC_GRAD_H_
#include "nnacl/op_base.h"
#ifdef __cplusplus
extern "C" {
#endif
void ElementDivNegSquareFp16(const float16_t *nom, const float16_t *denom, float16_t *output, int element_size);
void ElementMulAndDivNegSquareFp16(const float16_t *a, const float16_t *b, const float16_t *denom, float16_t *output,
int element_size);
int ElementAbsGradFp16(const float16_t *in1, const float16_t *in2, float16_t *out, int element_size);
void MaximumByAxesFp16(const float16_t *input0, const float16_t *input1, const float16_t *dy, const int *input0_dims,
const int *input1_dims, const int *dy_dims, float16_t *output0, float16_t *output1,
int num_dims);
void MinimumByAxesFp16(const float16_t *input0, const float16_t *input1, const float16_t *dy, const int *input0_dims,
const int *input1_dims, const int *dy_dims, float16_t *output0, float16_t *output1,
int num_dims);
int ElementSqrtGradFp16(const float16_t *in1, const float16_t *in2, float16_t *out, const int element_size);
int ElementRsqrtGradFp16(const float16_t *in1, const float16_t *in2, float16_t *out, const int element_size);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_ARITHMETIC_GRAD_H_

View File

@@ -0,0 +1,86 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <math.h>
#include <string.h>
#include "nnacl/fp16_grad/batch_norm.h"
void var2InvarFp16(float16_t *save_var, int size, float eps) {
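// converts the saved variance to 1 / sqrt(var + eps) in place, as consumed by the backward kernels below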
for (int i = 0; i < size; i++) {
save_var[i] = (float16_t)(1.0f / sqrtf((float)save_var[i] + eps));
}
}
void backwardAllFp16(const float16_t *restrict in, const float16_t *restrict yt, const float16_t *restrict mean,
const float16_t *restrict invar, const float16_t *restrict scale, int size, int ch,
float *restrict dxhat_sum, float *restrict dxhathat_sum, float16_t *restrict dbias,
float16_t *restrict dscale, float16_t *restrict dx) {
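// standard batchnorm backward: the first pass accumulates dbias, dscale and the two x_hat reductions,
// the second forms dx = invar / N * (N * dx_hat - sum(dx_hat) - x_hat * sum(dx_hat * x_hat))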
float N = (float)size;
for (int i = 0; i < size; i++) {
for (int c = 0; c < ch; c++) {
int ix = i * ch + c;
dbias[c] += yt[ix];
// dscale
float16_t x_hat = (in[ix] - mean[c]) * invar[c];
dscale[c] += (yt[ix] * x_hat);
// dx_1
float dx_hat = (float)(yt[ix] * scale[c]);
dxhat_sum[c] += dx_hat;
dxhathat_sum[c] += (float)(dx_hat * x_hat);
}
}
for (int i = 0; i < size; i++) {
for (int c = 0; c < ch; c++) {
// dx_2
int ix = i * ch + c;
float16_t x_hat = (in[ix] - mean[c]) * invar[c];
float16_t dx_hat = yt[ix] * scale[c];
dx[ix] = 1.0f / N * (float)((invar[c]) * (N * dx_hat - dxhat_sum[c] - x_hat * dxhathat_sum[c]));
}
}
}
void backwardP1Fp16(const float16_t *restrict in, const float16_t *restrict yt, const float16_t *restrict mean,
const float16_t *restrict invar, const float16_t *restrict scale, int size, int ch,
float *restrict dxhat_sum, float *restrict dxhathat_sum, float16_t *restrict dbias,
float16_t *restrict dscale) {
for (int i = 0; i < size; i++) {
for (int c = 0; c < ch; c++) {
int ix = i * ch + c;
dbias[c] += yt[ix];
// dscale
float x_hat = (float)((in[ix] - mean[c]) * invar[c]);
dscale[c] += (yt[ix] * x_hat);
// dx_1
float dx_hat = (float)(yt[ix] * scale[c]);
dxhat_sum[c] += dx_hat;
dxhathat_sum[c] += dx_hat * x_hat;
}
}
}
void backwardP2Fp16(const float16_t *restrict in, const float16_t *restrict yt, const float16_t *restrict mean,
const float16_t *restrict invar, const float16_t *restrict scale, int size, int total_size, int ch,
const float *dxhat_sum, const float *dxhathat_sum, float16_t *restrict dx) {
const float N = (float)total_size;
for (int i = 0; i < size; i++) {
for (int c = 0; c < ch; c++) {
// dx_2
int ix = i * ch + c;
float x_hat = (float)((in[ix] - mean[c]) * invar[c]);
float dx_hat = (float)(yt[ix] * scale[c]);
dx[ix] = (float16_t)(1.0f / N * (float)(invar[c]) * (N * dx_hat - dxhat_sum[c] - x_hat * dxhathat_sum[c]));
}
}
}

View File

@@ -0,0 +1,40 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_BATCH_NORM_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_BATCH_NORM_H_
#include "nnacl/op_base.h"
#ifdef __cplusplus
extern "C" {
#endif
void var2InvarFp16(float16_t *save_var, int size, float eps);
void backwardAllFp16(const float16_t *in, const float16_t *yt, const float16_t *mean, const float16_t *invar,
const float16_t *scale, int size, int ch, float *dxhat_sum, float *dxhathat_sum, float16_t *dbias,
float16_t *dscale, float16_t *dx);
void backwardP1Fp16(const float16_t *in, const float16_t *yt, const float16_t *mean, const float16_t *invar,
const float16_t *scale, int size, int ch, float *dxhat_sum, float *dxhathat_sum, float16_t *dbias,
float16_t *dscale);
void backwardP2Fp16(const float16_t *in, const float16_t *yt, const float16_t *mean, const float16_t *invar,
const float16_t *scale, int size, int total_size, int ch, const float *dxhat_sum,
const float *dxhathat_sum, float16_t *dx);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_BATCH_NORM_H_

View File

@@ -0,0 +1,360 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nnacl/fp16_grad/convolution_grad_filter.h"
#include "nnacl/intrinsics/ms_simd_instructions_fp16.h"
#ifdef ENABLE_NEON
#include <arm_neon.h>
#endif
#ifdef ENABLE_NEON
static int FilterGrad32Arm(const float16_t *x, const float16_t *dy, int i_c, int k_idx, float16_t *dw,
const ConvParameter *conv_param) {
int in_h = conv_param->input_h_;
int in_w = conv_param->input_w_;
int k_h = conv_param->kernel_h_;
int k_w = conv_param->kernel_w_;
int batch = conv_param->output_batch_;
int out_ch = conv_param->output_channel_;
int in_ch = conv_param->input_channel_;
int out_h = conv_param->output_h_;
int out_w = conv_param->output_w_;
int m = out_h * out_w;
int x_size = in_h * in_w * in_ch;
int y_size = out_ch * out_h * out_w;
int k_spatial = k_w * k_h;
int i_kh = k_idx / k_w;
int i_kw = k_idx % k_w;
for (; i_c < (out_ch & ~31); i_c += 32) {
float32x4_t sum_0 = vdupq_n_f32(0.0f);
float32x4_t sum_1 = vdupq_n_f32(0.0f);
float32x4_t sum_2 = vdupq_n_f32(0.0f);
float32x4_t sum_3 = vdupq_n_f32(0.0f);
float32x4_t sum_4 = vdupq_n_f32(0.0f);
float32x4_t sum_5 = vdupq_n_f32(0.0f);
float32x4_t sum_6 = vdupq_n_f32(0.0f);
float32x4_t sum_7 = vdupq_n_f32(0.0f);
for (int b = 0; b < batch; ++b) {
const float16_t *x_addr = &x[b * x_size];
const float16_t *dy_addr = &dy[b * y_size];
for (int i = 0; i < m; i++) {
int idx = i;
int input_h = idx / out_w * conv_param->stride_h_;
int input_w = idx % out_w * conv_param->stride_w_;
int input_row = -conv_param->pad_u_ + i_kh + input_h;
int input_col = -conv_param->pad_l_ + i_kw + input_w;
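// the unsigned casts fold the two bounds checks (row >= 0 && row < in_h) into a single compare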
if (((unsigned)(input_row) < (unsigned)(in_h)) && ((unsigned)(input_col) < (unsigned)(in_w))) {
int offset_x = (input_row * in_w + input_col) * out_ch + i_c;
int offset_dy = idx * out_ch + i_c;
float16x8_t x_0 = vld1q_f16(x_addr + offset_x);
float16x8_t dy_0 = vld1q_f16(dy_addr + offset_dy);
sum_0 = MS_VMLAL_F16(vget_low_f16(x_0), vget_low_f16(dy_0), sum_0);
sum_1 = MS_VMLAL_F16(vget_high_f16(x_0), vget_high_f16(dy_0), sum_1);
float16x8_t x_1 = vld1q_f16(x_addr + offset_x + 8);
float16x8_t dy_1 = vld1q_f16(dy_addr + offset_dy + 8);
sum_2 = MS_VMLAL_F16(vget_low_f16(x_1), vget_low_f16(dy_1), sum_2);
sum_3 = MS_VMLAL_F16(vget_high_f16(x_1), vget_high_f16(dy_1), sum_3);
float16x8_t x_2 = vld1q_f16(x_addr + offset_x + 16);
float16x8_t dy_2 = vld1q_f16(dy_addr + offset_dy + 16);
sum_4 = MS_VMLAL_F16(vget_low_f16(x_2), vget_low_f16(dy_2), sum_4);
sum_5 = MS_VMLAL_F16(vget_high_f16(x_2), vget_high_f16(dy_2), sum_5);
float16x8_t x_3 = vld1q_f16(x_addr + offset_x + 24);
float16x8_t dy_3 = vld1q_f16(dy_addr + offset_dy + 24);
sum_6 = MS_VMLAL_F16(vget_low_f16(x_3), vget_low_f16(dy_3), sum_6);
sum_7 = MS_VMLAL_F16(vget_high_f16(x_3), vget_high_f16(dy_3), sum_7);
}
}
}
// store into memory
for (int l = 0; l < 4; l++) {
dw[(i_c + l) * k_spatial + k_idx] = sum_0[l];
dw[(i_c + 4 + l) * k_spatial + k_idx] = sum_1[l];
dw[(i_c + 8 + l) * k_spatial + k_idx] = sum_2[l];
dw[(i_c + 12 + l) * k_spatial + k_idx] = sum_3[l];
dw[(i_c + 16 + l) * k_spatial + k_idx] = sum_4[l];
dw[(i_c + 20 + l) * k_spatial + k_idx] = sum_5[l];
dw[(i_c + 24 + l) * k_spatial + k_idx] = sum_6[l];
dw[(i_c + 28 + l) * k_spatial + k_idx] = sum_7[l];
}
}
return i_c;
}
static int FilterGrad16Arm(const float16_t *x, const float16_t *dy, int i_c, int k_idx, float16_t *dw,
const ConvParameter *conv_param) {
int in_h = conv_param->input_h_;
int in_w = conv_param->input_w_;
int k_h = conv_param->kernel_h_;
int k_w = conv_param->kernel_w_;
int batch = conv_param->output_batch_;
int out_ch = conv_param->output_channel_;
int in_ch = conv_param->input_channel_;
int out_h = conv_param->output_h_;
int out_w = conv_param->output_w_;
int m = out_h * out_w;
int x_size = in_h * in_w * in_ch;
int y_size = out_ch * out_h * out_w;
int k_spatial = k_w * k_h;
int i_kh = k_idx / k_w;
int i_kw = k_idx % k_w;
for (; i_c < (out_ch & ~15); i_c += 16) {
float32x4_t sum_0 = vdupq_n_f32(0.0f);
float32x4_t sum_1 = vdupq_n_f32(0.0f);
float32x4_t sum_2 = vdupq_n_f32(0.0f);
float32x4_t sum_3 = vdupq_n_f32(0.0f);
for (int b = 0; b < batch; ++b) {
const float16_t *x_addr = &x[b * x_size];
const float16_t *dy_addr = &dy[b * y_size];
for (int i = 0; i < m; i++) {
int idx = i;
int input_h = idx / out_w * conv_param->stride_h_;
int input_w = idx % out_w * conv_param->stride_w_;
int input_row = -conv_param->pad_u_ + i_kh + input_h;
int input_col = -conv_param->pad_l_ + i_kw + input_w;
if (((unsigned)(input_row) < (unsigned)(in_h)) && ((unsigned)(input_col) < (unsigned)(in_w))) {
int offset_x = (input_row * in_w + input_col) * out_ch + i_c;
int offset_dy = idx * out_ch + i_c;
float16x8_t x_0 = vld1q_f16(x_addr + offset_x);
float16x8_t dy_0 = vld1q_f16(dy_addr + offset_dy);
sum_0 = MS_VMLAL_F16(vget_low_f16(x_0), vget_low_f16(dy_0), sum_0);
sum_1 = MS_VMLAL_F16(vget_high_f16(x_0), vget_high_f16(dy_0), sum_1);
float16x8_t x_1 = vld1q_f16(x_addr + offset_x + 8);
float16x8_t dy_1 = vld1q_f16(dy_addr + offset_dy + 8);
sum_2 = MS_VMLAL_F16(vget_low_f16(x_1), vget_low_f16(dy_1), sum_2);
sum_3 = MS_VMLAL_F16(vget_high_f16(x_1), vget_high_f16(dy_1), sum_3);
}
}
}
for (int l = 0; l < 4; l++) {
dw[(i_c + l) * k_spatial + k_idx] = sum_0[l];
dw[(i_c + l + 4) * k_spatial + k_idx] = sum_1[l];
dw[(i_c + l + 8) * k_spatial + k_idx] = sum_2[l];
dw[(i_c + l + 12) * k_spatial + k_idx] = sum_3[l];
}
}
return i_c;
}
static int FilterGrad8Arm(const float16_t *x, const float16_t *dy, int i_c, int k_idx, float16_t *dw,
const ConvParameter *conv_param) {
int in_h = conv_param->input_h_;
int in_w = conv_param->input_w_;
int k_h = conv_param->kernel_h_;
int k_w = conv_param->kernel_w_;
int batch = conv_param->output_batch_;
int out_ch = conv_param->output_channel_;
int in_ch = conv_param->input_channel_;
int out_h = conv_param->output_h_;
int out_w = conv_param->output_w_;
int m = out_h * out_w;
int x_size = in_h * in_w * in_ch;
int y_size = out_ch * out_h * out_w;
int k_spatial = k_w * k_h;
int i_kh = k_idx / k_w;
int i_kw = k_idx % k_w;
for (; i_c < (out_ch & ~7); i_c += 8) {
float32x4_t sum_0 = vdupq_n_f32(0.0f);
float32x4_t sum_1 = vdupq_n_f32(0.0f);
for (int b = 0; b < batch; ++b) {
const float16_t *x_addr = &x[b * x_size];
const float16_t *dy_addr = &dy[b * y_size];
for (int i = 0; i < m; i++) {
int idx = i;
int input_h = idx / out_w * conv_param->stride_h_;
int input_w = idx % out_w * conv_param->stride_w_;
int input_row = -conv_param->pad_u_ + i_kh + input_h;
int input_col = -conv_param->pad_l_ + i_kw + input_w;
if (((unsigned)(input_row) < (unsigned)(in_h)) && ((unsigned)(input_col) < (unsigned)(in_w))) {
int offset_x = (input_row * in_w + input_col) * out_ch + i_c;
int offset_dy = idx * out_ch + i_c;
float16x8_t x_0 = vld1q_f16(x_addr + offset_x);
float16x8_t dy_0 = vld1q_f16(dy_addr + offset_dy);
sum_0 = MS_VMLAL_F16(vget_low_f16(x_0), vget_low_f16(dy_0), sum_0);
sum_1 = MS_VMLAL_F16(vget_high_f16(x_0), vget_high_f16(dy_0), sum_1);
}
}
}
for (int l = 0; l < 4; l++) {
dw[(i_c + l) * k_spatial + k_idx] = sum_0[l];
dw[(i_c + 4 + l) * k_spatial + k_idx] = sum_1[l];
}
}
return i_c;
}
static int FilterGrad4Arm(const float16_t *x, const float16_t *dy, int i_c, int k_idx, float16_t *dw,
const ConvParameter *conv_param) {
int in_h = conv_param->input_h_;
int in_w = conv_param->input_w_;
int k_h = conv_param->kernel_h_;
int k_w = conv_param->kernel_w_;
int batch = conv_param->output_batch_;
int out_ch = conv_param->output_channel_;
int in_ch = conv_param->input_channel_;
int out_h = conv_param->output_h_;
int out_w = conv_param->output_w_;
int m = out_h * out_w;
int x_size = in_h * in_w * in_ch;
int y_size = out_ch * out_h * out_w;
int k_spatial = k_w * k_h;
int i_kh = k_idx / k_w;
int i_kw = k_idx % k_w;
for (; i_c < (out_ch & ~3); i_c += 4) {
float32x4_t sum_0 = vdupq_n_f32(0.0f);
for (int b = 0; b < batch; ++b) {
const float16_t *x_addr = &x[b * x_size];
const float16_t *dy_addr = &dy[b * y_size];
for (int i = 0; i < m; i++) {
int idx = i;
int input_h = idx / out_w * conv_param->stride_h_;
int input_w = idx % out_w * conv_param->stride_w_;
int input_row = -conv_param->pad_u_ + i_kh + input_h;
int input_col = -conv_param->pad_l_ + i_kw + input_w;
if (((unsigned)(input_row) < (unsigned)(in_h)) && ((unsigned)(input_col) < (unsigned)(in_w))) {
int offset_x = (input_row * in_w + input_col) * out_ch + i_c;
int offset_dy = idx * out_ch + i_c;
float16x4_t x_0 = vld1_f16(x_addr + offset_x);
float16x4_t dy_0 = vld1_f16(dy_addr + offset_dy);
sum_0 = MS_VMLAL_F16(x_0, dy_0, sum_0);
}
}
}
dw[(i_c + 0) * k_spatial + k_idx] = sum_0[0];
dw[(i_c + 1) * k_spatial + k_idx] = sum_0[1];
dw[(i_c + 2) * k_spatial + k_idx] = sum_0[2];
dw[(i_c + 3) * k_spatial + k_idx] = sum_0[3];
}
return i_c;
}
static int FilterGradLeftoverArm(const float16_t *x, const float16_t *dy, int i_c, int k_idx, float16_t *dw,
const ConvParameter *conv_param) {
int in_h = conv_param->input_h_;
int in_w = conv_param->input_w_;
int k_h = conv_param->kernel_h_;
int k_w = conv_param->kernel_w_;
int batch = conv_param->output_batch_;
int out_ch = conv_param->output_channel_;
int in_ch = conv_param->input_channel_;
int out_h = conv_param->output_h_;
int out_w = conv_param->output_w_;
int m = out_h * out_w;
int x_size = in_h * in_w * in_ch;
int y_size = out_ch * out_h * out_w;
int k_spatial = k_w * k_h;
int i_kh = k_idx / k_w;
int i_kw = k_idx % k_w;
int leftover = out_ch - i_c;
if (leftover > 0) {
float32x4_t sum_0 = vdupq_n_f32(0.0f);
for (int b = 0; b < batch; ++b) {
const float16_t *x_addr = &x[b * x_size];
const float16_t *dy_addr = &dy[b * y_size];
for (int i = 0; i < m; i++) {
int idx = i;
int input_h = idx / out_w * conv_param->stride_h_;
int input_w = idx % out_w * conv_param->stride_w_;
int input_row = -conv_param->pad_u_ + i_kh + input_h;
int input_col = -conv_param->pad_l_ + i_kw + input_w;
if (((unsigned)(input_row) < (unsigned)(in_h)) && ((unsigned)(input_col) < (unsigned)(in_w))) {
int offset_x = (input_row * in_w + input_col) * out_ch + i_c;
int offset_dy = idx * out_ch + i_c;
float16x4_t x_0 = vld1_f16(x_addr + offset_x);
float16x4_t dy_0 = vld1_f16(dy_addr + offset_dy);
sum_0 = MS_VMLAL_F16(x_0, dy_0, sum_0);
}
}
}
for (int l = 0; l < leftover; l++) {
dw[(i_c + l) * k_spatial + k_idx] = sum_0[l];
}
}
return out_ch;
}
#endif
int ConvDwFilterFp16Grad(const float16_t *x, const float16_t *dy, float16_t *dw, int start, int count,
const ConvParameter *conv_param) {
int in_h = conv_param->input_h_;
int in_w = conv_param->input_w_;
int k_h = conv_param->kernel_h_;
int k_w = conv_param->kernel_w_;
int batch = conv_param->output_batch_;
int out_ch = conv_param->output_channel_;
int in_ch = conv_param->input_channel_;
int out_h = conv_param->output_h_;
int out_w = conv_param->output_w_;
int m = out_h * out_w;
int x_size = in_h * in_w * in_ch;
int y_size = out_ch * out_h * out_w;
int k_spatial = k_w * k_h;
for (int i_k = 0; i_k < count; i_k++) {
int k_idx = start + i_k;
int i_kh = k_idx / k_w;
int i_kw = k_idx % k_w;
int i_c = 0;
#ifdef ENABLE_NEON
i_c = FilterGrad32Arm(x, dy, i_c, k_idx, dw, conv_param);
i_c = FilterGrad16Arm(x, dy, i_c, k_idx, dw, conv_param);
i_c = FilterGrad8Arm(x, dy, i_c, k_idx, dw, conv_param);
i_c = FilterGrad4Arm(x, dy, i_c, k_idx, dw, conv_param);
i_c = FilterGradLeftoverArm(x, dy, i_c, k_idx, dw, conv_param);
#endif
for (; i_c < out_ch; i_c++) {
float sum = 0;
for (int b = 0; b < batch; ++b) {
const float16_t *x_addr = &x[b * x_size];
const float16_t *dy_addr = &dy[b * y_size];
for (int i = 0; i < m; i++) {
int idx = i;
int input_h = idx / out_w * conv_param->stride_h_;
int input_w = idx % out_w * conv_param->stride_w_;
int input_row = -conv_param->pad_u_ + i_kh + input_h;
int input_col = -conv_param->pad_l_ + i_kw + input_w;
if (((unsigned)(input_row) < (unsigned)(in_h)) && ((unsigned)(input_col) < (unsigned)(in_w))) {
int offset_x = (input_row * in_w + input_col) * out_ch + i_c;
int offset_dy = idx * out_ch + i_c;
sum += x_addr[offset_x] * dy_addr[offset_dy];
}
}
}
dw[i_c * k_spatial + k_idx] = sum;
}
}
return 0;
}

View File

@@ -0,0 +1,33 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_CONVOLUTION_GRAD_FILTER_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_CONVOLUTION_GRAD_FILTER_H_
#include <stddef.h>
#include "nnacl/conv_parameter.h"
#ifdef __cplusplus
extern "C" {
#endif
int ConvDwFilterFp16Grad(const float16_t *x, const float16_t *dy, float16_t *dw, int start, int count,
const ConvParameter *conv_param);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_CONVOLUTION_GRAD_FILTER_H_

View File

@@ -0,0 +1,24 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nnacl/fp16_grad/dropout_grad.h"
void DropoutFp16Grad(const float16_t *yt_ptr, const float16_t *mask, float16_t *output_ptr, int length,
float16_t scale) {
for (int i = 0; i < length; i++) {
output_ptr[i] = yt_ptr[i] * mask[i] * scale;
}
}

View File

@@ -0,0 +1,32 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_DROPOUT_GRAD_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_DROPOUT_GRAD_H_
#include "nnacl/op_base.h"
#ifdef __cplusplus
extern "C" {
#endif
void DropoutFp16Grad(const float16_t *yt_ptr, const float16_t *mask, float16_t *output_ptr, int length,
float16_t scale);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_DROPOUT_GRAD_H_

View File

@@ -0,0 +1,373 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nnacl/fp16_grad/gemm_fp16.h"
#include <string.h>
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif
#include "nnacl/fp16/matmul_fp16.h"
#include "nnacl/fp16/pack_fp16.h"
#ifdef ENABLE_ARM64
static void Row2Col16Block16(const float16_t *src_ptr, float16_t *dst_ptr, size_t col) {
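// transposes a 16x8 fp16 tile into packed 16-wide column blocks with zip/trn; the stride below is in bytes (col * sizeof(float16_t))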
size_t stride = col * 2;
asm volatile(
"mov x10, %[src_c]\n"
"mov x11, %[dst_c]\n"
"ld1 {v0.8h}, [x10], %[stride]\n"
"ld1 {v1.8h}, [x10], %[stride]\n"
"ld1 {v2.8h}, [x10], %[stride]\n"
"ld1 {v3.8h}, [x10], %[stride]\n"
"ld1 {v4.8h}, [x10], %[stride]\n"
"ld1 {v5.8h}, [x10], %[stride]\n"
"ld1 {v6.8h}, [x10], %[stride]\n"
"ld1 {v7.8h}, [x10], %[stride]\n"
"zip1 v16.8h, v0.8h, v1.8h\n"
"zip1 v17.8h, v2.8h, v3.8h\n"
"zip1 v18.8h, v4.8h, v5.8h\n"
"zip1 v19.8h, v6.8h, v7.8h\n"
"ld1 {v8.8h}, [x10], %[stride]\n"
"ld1 {v9.8h}, [x10], %[stride]\n"
"ld1 {v10.8h}, [x10], %[stride]\n"
"ld1 {v11.8h}, [x10], %[stride]\n"
"ld1 {v12.8h}, [x10], %[stride]\n"
"ld1 {v13.8h}, [x10], %[stride]\n"
"ld1 {v14.8h}, [x10], %[stride]\n"
"ld1 {v15.8h}, [x10], %[stride]\n"
"trn1 v20.4s, v16.4s, v17.4s\n"
"trn2 v21.4s, v16.4s, v17.4s\n"
"trn1 v22.4s, v18.4s, v19.4s\n"
"trn2 v23.4s, v18.4s, v19.4s\n"
"trn1 v24.2d, v20.2d, v22.2d\n"
"trn2 v25.2d, v20.2d, v22.2d\n"
"trn1 v26.2d, v21.2d, v23.2d\n"
"trn2 v27.2d, v21.2d, v23.2d\n"
"zip1 v16.8h, v8.8h, v9.8h\n"
"zip1 v17.8h, v10.8h, v11.8h\n"
"zip1 v18.8h, v12.8h, v13.8h\n"
"zip1 v19.8h, v14.8h, v15.8h\n"
"trn1 v20.4s, v16.4s, v17.4s\n"
"trn2 v21.4s, v16.4s, v17.4s\n"
"trn1 v22.4s, v18.4s, v19.4s\n"
"trn2 v23.4s, v18.4s, v19.4s\n"
"trn1 v28.2d, v20.2d, v22.2d\n"
"trn2 v29.2d, v20.2d, v22.2d\n"
"trn1 v30.2d, v21.2d, v23.2d\n"
"trn2 v31.2d, v21.2d, v23.2d\n"
"st1 {v24.8h}, [x11], #16\n"
"st1 {v28.8h}, [x11], #16\n"
"st1 {v26.8h}, [x11], #16\n"
"st1 {v30.8h}, [x11], #16\n"
"st1 {v25.8h}, [x11], #16\n"
"st1 {v29.8h}, [x11], #16\n"
"st1 {v27.8h}, [x11], #16\n"
"st1 {v31.8h}, [x11], #16\n"
"zip2 v16.8h, v0.8h, v1.8h\n"
"zip2 v17.8h, v2.8h, v3.8h\n"
"zip2 v18.8h, v4.8h, v5.8h\n"
"zip2 v19.8h, v6.8h, v7.8h\n"
"trn1 v20.4s, v16.4s, v17.4s\n"
"trn2 v21.4s, v16.4s, v17.4s\n"
"trn1 v22.4s, v18.4s, v19.4s\n"
"trn2 v23.4s, v18.4s, v19.4s\n"
"trn1 v24.2d, v20.2d, v22.2d\n"
"trn2 v25.2d, v20.2d, v22.2d\n"
"trn1 v26.2d, v21.2d, v23.2d\n"
"trn2 v27.2d, v21.2d, v23.2d\n"
"zip2 v16.8h, v8.8h, v9.8h\n"
"zip2 v17.8h, v10.8h, v11.8h\n"
"zip2 v18.8h, v12.8h, v13.8h\n"
"zip2 v19.8h, v14.8h, v15.8h\n"
"trn1 v20.4s, v16.4s, v17.4s\n"
"trn2 v21.4s, v16.4s, v17.4s\n"
"trn1 v22.4s, v18.4s, v19.4s\n"
"trn2 v23.4s, v18.4s, v19.4s\n"
"trn1 v28.2d, v20.2d, v22.2d\n"
"trn2 v29.2d, v20.2d, v22.2d\n"
"trn1 v30.2d, v21.2d, v23.2d\n"
"trn2 v31.2d, v21.2d, v23.2d\n"
"st1 {v24.8h}, [x11], #16\n"
"st1 {v28.8h}, [x11], #16\n"
"st1 {v26.8h}, [x11], #16\n"
"st1 {v30.8h}, [x11], #16\n"
"st1 {v25.8h}, [x11], #16\n"
"st1 {v29.8h}, [x11], #16\n"
"st1 {v27.8h}, [x11], #16\n"
"st1 {v31.8h}, [x11], #16\n"
:
: [ dst_c ] "r"(dst_ptr), [ src_c ] "r"(src_ptr), [ stride ] "r"(stride)
: "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
}
#endif
void AddMatrixFp16(const float16_t *restrict v1, float16_t *restrict v2, float16_t beta, int row, int col, int stride) {
const float16_t *src_ptr = v1;
float16_t *dst_ptr = v2;
for (int r = 0; r < row; r++) {
for (int c = 0; c < col; c++) {
dst_ptr[c] += beta * src_ptr[c];
}
src_ptr += stride;
dst_ptr += stride;
}
}
int MatSizeFp16(int row, int col, int round) {
int res = UP_ROUND(row, round) * col;
return res;
}
int MatSizeTotalFp16(int row, int col, int deep, int stride) {
#ifdef ENABLE_ARM64
const int num = C16NUM;
#else
const int num = C12NUM;
#endif
int res = MatSizeFp16(row, deep, num) + MatSizeFp16(col, deep, C8NUM);
if (stride > 0) res += row * stride;
return res;
}
#ifdef ENABLE_ARM64
static void RowMajor2Col16MajorStrideFp16(const float16_t *src, float16_t *dst, int row, int col, int stride) {
size_t row_up_16 = UP_ROUND(row, C16NUM);
size_t row16 = row / C16NUM * C16NUM;
size_t col8 = col / C8NUM * C8NUM;
const float16_t *src_r = src;
float16_t *dst_r = dst;
size_t ri = 0;
// pack full blocks of 16 rows
for (; ri < row16; ri += C16NUM) {
size_t ci = 0;
for (; ci < col8; ci += C8NUM) {
const float16_t *src_c = src_r + ci;
float16_t *dst_c = dst_r + ci * C16NUM;
#ifdef ENABLE_ARM64
Row2Col16Block16(src_c, dst_c, stride);
#else
for (int tr = 0; tr < C16NUM; tr++) {
for (int tc = 0; tc < C8NUM; tc++) {
dst_c[tc * C16NUM + tr] = src_c[tr * stride + tc];
}
}
#endif
}
for (; ci < col; ci++) {
const float16_t *src_c = src_r + ci;
float16_t *dst_c = dst_r + ci * C16NUM;
for (size_t i = 0; i < C16NUM; i++) {
dst_c[i] = src_c[i * stride];
}
}
src_r += C16NUM * stride;
dst_r += C16NUM * col;
}
for (; ri < row; ri++) {
for (size_t i = 0; i < col; ++i) {
dst_r[i * C16NUM] = src_r[i];
}
src_r += stride;
dst_r += 1;
}
for (; ri < row_up_16; ri++) {
for (size_t i = 0; i < col; i++) {
dst_r[i * C16NUM] = 0;
}
dst_r += 1;
}
return;
}
#endif
void RowMajor2Row16MajorStrideFp16(const float16_t *src, float16_t *dst, int row, int col, int stride) {
for (int r = 0; r < row; r++) {
for (int c = 0; c < col; c++) {
int c_div16 = c / 16;
int c_mod16 = c % 16;
dst[c_div16 * 16 * row + r * 16 + c_mod16] = src[r * stride + c];
}
}
}
void RowMajor2Col12MajorStrideFp16(const float16_t *src, float16_t *dst, size_t row, size_t col, int stride) {
size_t row_up_12 = UP_ROUND(row, C12NUM);
size_t row12 = row / C12NUM * C12NUM;
size_t col8 = col / C8NUM * C8NUM;
const float16_t *src_r = src;
float16_t *dst_r = dst;
size_t ri = 0;
// transpose 12x8
for (; ri < row12; ri += C12NUM) {
size_t ci = 0;
for (; ci < col8; ci += C8NUM) {
const float16_t *src_c = src_r + ci;
float16_t *dst_c = dst_r + ci * C12NUM;
#ifdef ENABLE_ARM82_A32
Transpose12x8A32Fp16(src_c, dst_c, stride * sizeof(float16_t), 24);
#else
for (int tr = 0; tr < C12NUM; tr++) {
for (int tc = 0; tc < C8NUM; tc++) {
dst_c[tc * C12NUM + tr] = src_c[tr * stride + tc];
}
}
#endif
}
for (; ci < col; ci++) {
const float16_t *src_c = src_r + ci;
float16_t *dst_c = dst_r + ci * C12NUM;
for (size_t i = 0; i < C12NUM; i++) {
dst_c[i] = src_c[i * stride];
}
}
src_r += C12NUM * stride;
dst_r += C12NUM * col;
}
for (; ri < row; ri++) {
for (size_t i = 0; i < col; ++i) {
dst_r[i * C12NUM] = src_r[i];
}
src_r += stride;
dst_r += 1;
}
for (; ri < row_up_12; ri++) {
for (size_t i = 0; i < col; i++) {
dst_r[i * C12NUM] = 0;
}
dst_r += 1;
}
}
void RowMajor2Row12MajorStrideFp16(const float16_t *src, float16_t *dst, int row, int col, int stride) {
for (int r = 0; r < row; r++) {
for (int c = 0; c < col; c++) {
int c_div12 = c / 12;
int c_mod12 = c % 12;
dst[c_div12 * 12 * row + r * 12 + c_mod12] = src[r * stride + c];
}
}
}
static void RowMajor2Col8MajorStrideFp16(const float16_t *src, float16_t *dst, int row, int col, int stride) {
for (int r = 0; r < row; r++) {
for (int c = 0; c < col; c++) {
int r_div8 = r / 8;
int r_mod8 = r % 8;
dst[r_div8 * 8 * col + c * 8 + r_mod8] = src[r * stride + c];
}
}
}
static void RowMajor2Row8MajorStrideFp16(const float16_t *src, float16_t *dst, int row, int col, int stride) {
for (int r = 0; r < row; r++) {
const float16_t *src_ptr = src + r * stride;
int c = 0;
for (; c < col; c++) {
int cd8 = c / C8NUM;
int cm8 = c % C8NUM;
dst[cd8 * C8NUM * row + r * C8NUM + cm8] = src_ptr[c];
}
for (; c < UP_ROUND(col, C8NUM); c++) {
int cd8 = c / C8NUM;
int cm8 = c % C8NUM;
dst[cd8 * C8NUM * row + r * C8NUM + cm8] = 0;
}
}
return;
}
static void RowMajor2ColXMajorStrideFp16(const float16_t *src, float16_t *dst, int row, int col, int stride) {
#ifdef ENABLE_ARM64
RowMajor2Col16MajorStrideFp16(src, dst, row, col, stride);
#else
RowMajor2Col12MajorStrideFp16(src, dst, row, col, stride);
#endif
}
static void RowMajor2RowXMajorStrideFp16(const float16_t *src, float16_t *dst, int row, int col, int stride) {
#ifdef ENABLE_ARM64
RowMajor2Row16MajorStrideFp16(src, dst, row, col, stride);
#else
RowMajor2Row12MajorStrideFp16(src, dst, row, col, stride);
#endif
}
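// thin wrapper over GemmMatmulPlusFp16: no bias, no activation, both inputs treated as unpacked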
void GemmMatmulFp16(int ta, int tb, int M, int N, int K, float16_t alpha, const float16_t *mat_a, int lda,
const float16_t *mat_b, int ldb, float16_t beta, float16_t *mat_c, int ldc, float16_t *workspace) {
GemmCbFp16 gcb;
gcb.atype = ActType_No;
gcb.ca = 0;
gcb.cb = 0;
gcb.bias = NULL;
GemmMatmulPlusFp16(ta, tb, M, N, K, alpha, mat_a, lda, mat_b, ldb, beta, mat_c, ldc, workspace, &gcb);
}
void GemmMatmulPlusFp16(int ta, int tb, int M, int N, int K, float16_t alpha, const float16_t *mat_a, int lda,
const float16_t *mat_b, int ldb, float16_t beta, float16_t *mat_c, int ldc,
float16_t *workspace, GemmCbFp16 *gcb) {
#ifdef ENABLE_ARM64
const int num = C16NUM;
#else
const int num = C12NUM;
#endif
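// pack A into Col16 (arm64) or Col12 tiles and B into 8-wide tiles unless the caller marks them pre-packed
// via gcb->ca / gcb->cb; a nonzero beta computes into workspace, then accumulates beta * product into mat_c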
float16_t *output = mat_c;
float16_t *fworkspace = workspace;
int incremental = (beta < 0.f) || (beta > 0.f);
float16_t *mat_a_input = (float16_t *)mat_a;
float16_t *mat_b_input = (float16_t *)mat_b;
if (!gcb->ca) {
mat_a_input = fworkspace;
fworkspace += MatSizeFp16(M, K, num);
if (ta) {
RowMajor2RowXMajorStrideFp16(mat_a, mat_a_input, K, M, lda);
} else {
RowMajor2ColXMajorStrideFp16(mat_a, mat_a_input, M, K, lda);
}
}
if (!gcb->cb) {
mat_b_input = fworkspace;
fworkspace += MatSizeFp16(N, K, C8NUM);
if (tb) {
RowMajor2Col8MajorStrideFp16(mat_b, mat_b_input, N, K, ldb);
} else {
RowMajor2Row8MajorStrideFp16(mat_b, mat_b_input, K, N, ldb);
}
}
if (incremental) output = fworkspace;
MatMulFp16(mat_a_input, mat_b_input, output, gcb->bias, gcb->atype, K, M, N, ldc, OutType_Nhwc);
if (incremental) AddMatrixFp16(output, mat_c, beta, M, N, ldc);
gcb->mat_a = mat_a_input;
gcb->mat_b = mat_b_input;
}

View File

@@ -0,0 +1,46 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_GEMM_FP16_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_GEMM_FP16_H_
#include <stdlib.h>
#include "nnacl/op_base.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct {
int ca;
int cb;
ActType atype;
float16_t *bias;
float16_t *mat_a;
float16_t *mat_b;
} GemmCbFp16;
void GemmMatmulFp16(int ta, int tb, int M, int N, int K, float16_t alpha, const float16_t *mat_a, int lda,
const float16_t *mat_b, int ldb, float16_t beta, float16_t *mat_c, int ldc, float16_t *workspace);
void GemmMatmulPlusFp16(int ta, int tb, int M, int N, int K, float16_t alpha, const float16_t *mat_a, int lda,
const float16_t *mat_b, int ldb, float16_t beta, float16_t *mat_c, int ldc,
float16_t *workspace, GemmCbFp16 *gcb);
int MatSizeFp16(int row, int col, int round);
int MatSizeTotalFp16(int row, int col, int deep, int stride);
void AddMatrixFp16(const float16_t *v1, float16_t *v2, float16_t beta, int row, int col, int stride);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_GEMM_FP16_H_

View File

@@ -0,0 +1,59 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nnacl/fp16_grad/layernorm_grad.h"
#include <stddef.h>
#include <math.h>
void LayerNormFp16Grad(const float16_t *x, const float16_t *dy, const float16_t *var, const float16_t *mean,
const float16_t *gamma, int param_num, int param_size, int block_num, int block_size,
float16_t *dx, float16_t *dg, float16_t *db) {
// var actually holds 1/sqrt(var), i.e. var^-0.5
const float16_t *var_sqrt_rev = var;
for (size_t i = 0; i < param_num; ++i) {
float dgamma = 0.0f;
float dbeta = 0.0f;
for (size_t j = i; j < param_size * param_num; j += param_num) {
int norm_shift = (int)(j / block_size);
dgamma += dy[j] * var_sqrt_rev[norm_shift] * (x[j] - mean[norm_shift]);
dbeta += dy[j];
}
dg[i] = (float16_t)dgamma;
db[i] = (float16_t)dbeta;
}
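// second stage: per block, sum1 carries the variance gradient and sum2/sum3 the mean gradient; dx combines all three terms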
for (size_t i = 0; i < block_num; ++i) {
float sum1 = 0.0f;
float sum2 = 0.0f;
float sum3 = 0.0f;
for (size_t j = i * block_size; j < (i + 1) * block_size; ++j) {
int param_shift = j % param_num;
int norm_shift = (int)(j / block_size);
float16_t dxm = x[j] - mean[norm_shift];
float16_t dyg = dy[j] * gamma[param_shift];
sum1 += -0.5f * dyg * dxm * var_sqrt_rev[norm_shift] * var_sqrt_rev[norm_shift] * var_sqrt_rev[norm_shift];
sum2 += dyg;
sum3 += -2.0f * dxm;
}
for (size_t j = i * block_size; j < (i + 1) * block_size; ++j) {
int param_shift = j % param_num;
int norm_shift = (int)(j / block_size);
float16_t var_sqrt = var_sqrt_rev[norm_shift];
float dx1 = dy[j] * gamma[param_shift] * var_sqrt;
float dx2 = sum1 * 2.0f / block_size * (x[j] - mean[norm_shift]);
float dx3 = (-1.0f * var_sqrt * sum2 + (1.0f / block_size) * sum1 * sum3) * (1.0f / block_size);
dx[j] = (float16_t)(dx1 + dx2 + dx3);
}
}
}

View File

@@ -0,0 +1,32 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_LAYERNORM_GRAD_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_LAYERNORM_GRAD_H_
#include "nnacl/op_base.h"
#ifdef __cplusplus
extern "C" {
#endif
void LayerNormFp16Grad(const float16_t *x, const float16_t *dy, const float16_t *var, const float16_t *mean,
const float16_t *gamma, int param_num, int param_size, int block_num, int block_size,
float16_t *dx, float16_t *dg, float16_t *db);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_LAYERNORM_GRAD_H_

View File

@@ -0,0 +1,201 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <string.h>
#include "nnacl/fp16_grad/pack_fp16_ext.h"
void RollingIm2ColPackDwUnitFp16(const float16_t *in_data, const ConvParameter *conv_param, float16_t *data_col_orig,
int real_cal_num, int start) {
const int pad_left = conv_param->pad_l_;
const int pad_up = conv_param->pad_u_;
const int stride_h = conv_param->stride_h_;
const int stride_w = conv_param->stride_w_;
const int dilation_h = conv_param->dilation_h_;
const int dilation_w = conv_param->dilation_w_;
const int kernel_h = conv_param->kernel_h_;
const int kernel_w = conv_param->kernel_w_;
const int in_height = conv_param->input_h_;
const int in_width = conv_param->input_w_;
const int output_w = conv_param->output_w_;
const int channels = conv_param->input_channel_;
const int stride = kernel_h * kernel_w;
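// depthwise layout: for each output pixel, channel c owns a contiguous kernel_h * kernel_w patch at data_col[c * stride]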
int kernel_row, kernel_col;
for (int i = 0; i < real_cal_num; i++) {
int block_start = start + i;
int input_h = block_start / output_w * stride_h;
int input_w = block_start % output_w * stride_w;
float16_t *data_col = data_col_orig + i * channels * stride;
for (kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
int input_row = -pad_up + kernel_row * dilation_h + input_h;
for (kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
int input_col = -pad_left + kernel_col * dilation_w + input_w;
if (((unsigned)(input_row) < (unsigned)(in_height)) && ((unsigned)(input_col) < (unsigned)(in_width))) {
const int offset = (input_row * in_width + input_col) * channels;
for (int c = 0; c < channels; c++) {
data_col[c * stride] = in_data[offset + c];
}
data_col++;
} else {
for (int c = 0; c < channels; c++) {
data_col[c * stride] = 0;
}
data_col++;
}
}
}
}
}
void RollingIm2ColPackUnitFp16(const float16_t *input_data, const ConvParameter *conv_param, float16_t *packed_input,
int real_cal_num, int block_index) {
const int pad_left = conv_param->pad_l_;
const int pad_up = conv_param->pad_u_;
const int stride_h = conv_param->stride_h_;
const int stride_w = conv_param->stride_w_;
const int dilation_h = conv_param->dilation_h_;
const int dilation_w = conv_param->dilation_w_;
const int kernel_h = conv_param->kernel_h_;
const int kernel_w = conv_param->kernel_w_;
const int in_height = conv_param->input_h_;
const int in_width = conv_param->input_w_;
const int output_w = conv_param->output_w_;
const int channels = conv_param->input_channel_ / conv_param->group_;
const int tot_channels = conv_param->input_channel_;
int kernel_row, kernel_col;
if (channels == 1) {
for (int i = 0; i < real_cal_num; i++) {
int block_start = block_index + i;
int input_h = block_start / output_w * stride_h;
int input_w = block_start % output_w * stride_w;
for (kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
int input_row = -pad_up + kernel_row * dilation_h + input_h;
for (kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
int input_col = -pad_left + kernel_col * dilation_w + input_w;
if (((unsigned)(input_row) < (unsigned)(in_height)) && ((unsigned)(input_col) < (unsigned)(in_width))) {
const int offset = (input_row * in_width + input_col) * tot_channels;
*packed_input = input_data[offset];
packed_input++;
} else {
*packed_input = 0;
packed_input++;
}
}
}
}
} else {
for (int i = 0; i < real_cal_num; i++) {
int block_start = block_index + i;
int input_h = block_start / output_w * stride_h;
int input_w = block_start % output_w * stride_w;
for (kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
int input_row = -pad_up + kernel_row * dilation_h + input_h;
for (kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
int input_col = -pad_left + kernel_col * dilation_w + input_w;
if (((unsigned)(input_row) < (unsigned)(in_height)) && ((unsigned)(input_col) < (unsigned)(in_width))) {
const int offset = (input_row * in_width + input_col) * tot_channels;
memcpy(packed_input, input_data + offset, sizeof(float16_t) * channels);
packed_input += channels;
} else {
memset(packed_input, 0, sizeof(float16_t) * channels);
packed_input += channels;
}
}
}
}
}
}
void RollingCol2ImPackUnitFp16(const float16_t *data_col, float16_t *data_im, const ConvParameter *conv_param,
int real_cal_num, int block_index) {
const int pad_left = conv_param->pad_l_;
const int pad_up = conv_param->pad_u_;
const int stride_h = conv_param->stride_h_;
const int stride_w = conv_param->stride_w_;
const int dilation_h = conv_param->dilation_h_;
const int dilation_w = conv_param->dilation_w_;
const int kernel_h = conv_param->kernel_h_;
const int kernel_w = conv_param->kernel_w_;
const int in_height = conv_param->input_h_;
const int in_width = conv_param->input_w_;
const int output_w = conv_param->output_w_;
const int channels = conv_param->input_channel_ / conv_param->group_;
const int tot_channels = conv_param->input_channel_;
int kernel_row, kernel_col;
if (channels == 1) {
for (int r = 0; r < real_cal_num; r++) {
int output_col = (block_index + r) % output_w;
int output_row = (block_index + r) / output_w;
int row_stride_offset = output_row * stride_h;
int col_stride_offset = output_col * stride_w;
for (kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
int input_row = -pad_up + kernel_row * dilation_h + row_stride_offset;
for (kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
int input_col = -pad_left + kernel_col * dilation_w + col_stride_offset;
if (((unsigned)(input_row) < (unsigned)(in_height)) && ((unsigned)(input_col) < (unsigned)(in_width))) {
int offset = (input_row * in_width + input_col) * tot_channels;
float16_t *data_im_ptr = data_im + offset;
*data_im_ptr += *data_col;
}
data_col++;
}
}
}
} else {
for (int r = 0; r < real_cal_num; r++) {
int output_col = (block_index + r) % output_w;
int output_row = (block_index + r) / output_w;
int row_stride_offset = output_row * stride_h;
int col_stride_offset = output_col * stride_w;
for (kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
int input_row = -pad_up + kernel_row * dilation_h + row_stride_offset;
for (kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
int input_col = -pad_left + kernel_col * dilation_w + col_stride_offset;
if (((unsigned)(input_row) < (unsigned)(in_height)) && ((unsigned)(input_col) < (unsigned)(in_width))) {
int offset = (input_row * in_width + input_col) * tot_channels;
float16_t *data_im_ptr = &data_im[offset];
for (int i = 0; i < channels; i++) {
data_im_ptr[i] += data_col[i];
}
}
data_col += channels;
}
}
}
}
}

View File

@ -0,0 +1,37 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_PACK_FP16_EXT_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_PACK_FP16_EXT_H_
#include <stddef.h>
#include "nnacl/conv_parameter.h"
#ifdef __cplusplus
extern "C" {
#endif
void RollingIm2ColPackUnitFp16(const float16_t *input_data, const ConvParameter *conv_param, float16_t *packed_input,
int real_cal_num, int block_index);
void RollingIm2ColPackDwUnitFp16(const float16_t *input_data, const ConvParameter *conv_param, float16_t *packed_input,
int real_cal_num, int block_index);
void RollingCol2ImPackUnitFp16(const float16_t *data_col, float16_t *data_im, const ConvParameter *conv_param,
int real_cal_num, int block_index);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_PACK_FP16_EXT_H_

View File

@ -0,0 +1,189 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <stdint.h>
#include <string.h>
#include <float.h>
#include "nnacl/fp16_grad/pooling_grad.h"
void AvgPoolingFp16Grad(const float16_t *input_ptr, float16_t *output_ptr, int count, PoolingParameter *pooling_param) {
int stride_w = pooling_param->stride_w_;
int stride_h = pooling_param->stride_h_;
int pad_w = pooling_param->pad_l_;
int pad_h = pooling_param->pad_u_;
int win_w = pooling_param->window_w_;
int win_h = pooling_param->window_h_;
int channel = pooling_param->input_channel_;
int in_w = pooling_param->input_w_;
int in_h = pooling_param->input_h_;
int output_w = pooling_param->output_w_;
int output_h = pooling_param->output_h_;
const float16_t kk = 1.0f / (float16_t)(win_h * win_w);
#ifdef ENABLE_NEON
const float16x4_t factor = vdup_n_f16(kk);
#endif
for (int ib = 0; ib < count; ib++) {
float16_t *out = &output_ptr[(ib * in_h * in_w * channel)];
const float16_t *inPtr = &input_ptr[(ib * output_h * output_w * channel)];
// iterate over the dy (forward-output) spatial grid
for (int yh = 0; yh < output_h; yh++) {
int over_h = pad_h - yh * stride_h;
int kh_s = MSMAX(0, over_h);
int kh_e = MSMIN(win_h, in_h + over_h);
for (int yw = 0; yw < output_w; yw++) {
int over_w = pad_w - yw * stride_w;
int kw_s = MSMAX(0, over_w);
int kw_e = MSMIN(win_w, in_w + over_w);
int ic = 0;
for (; ic < (channel & ~3); ic += 4) {
int idx = (yw + yh * output_w) * channel + ic;
#ifdef ENABLE_NEON
float16x4_t in = vld1_f16(inPtr + idx);
float16x4_t delta = vmul_f16(in, factor);
#else
float16_t delta[4] = {inPtr[idx], inPtr[idx + 1], inPtr[idx + 2], inPtr[idx + 3]};
for (int i = 0; i < 4; i++) delta[i] *= kk;
#endif
for (int kh = kh_s; kh < kh_e; kh++) {
int xh = yh * stride_h + kh - pad_h;
for (int kw = kw_s; kw < kw_e; kw++) {
int xw = yw * stride_w + kw - pad_w;
#ifdef ENABLE_NEON
float16_t *out_vec = out + (xw + in_w * xh) * channel + ic;
float16x4_t outr = vld1_f16(out_vec);
float16x4_t outs = vadd_f16(outr, delta);
vst1_f16(out_vec, outs);
#else
for (int i = 0; i < 4; i++) {
out[(xw + in_w * xh) * channel + ic + i] += delta[i];
}
#endif
}
}
}
for (; ic < channel; ic++) {
int idx = (yw + yh * output_w) * channel + ic;
float16_t delta = inPtr[idx] * kk;
for (int kh = kh_s; kh < kh_e; kh++) {
int xh = yh * stride_h + kh - pad_h;
for (int kw = kw_s; kw < kw_e; kw++) {
int xw = yw * stride_w + kw - pad_w;
out[(xw + in_w * xh) * channel + ic] += delta;
}
}
}
}
}
}
}
#ifdef ENABLE_NEON
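// Lane-wise argmax helper: the 16-bit compare mask is sign-extended to
// 32-bit lanes (all-ones stays all-ones) so it can steer a 32-bit bitwise
// select between the candidate indices and the previous best indices.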
static uint32x4_t MaxIndex(float16x4_t in, float16x4_t *max, uint32x4_t index, uint32x4_t prev_index) {
uint16x4_t res = vcgt_f16(in, *max);
int16x4_t tmp = vreinterpret_s16_u16(res);
uint32x4_t res_tmp = vreinterpretq_u32_s32(vmovl_s16(tmp));
uint32x4_t m_index = vbslq_u32(res_tmp, index, prev_index);
*max = vbsl_f16(res, in, *max);
return m_index;
}
#endif
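// Max-pooling backward: each dy element is routed to the single input
// position that held the window maximum in the forward pass.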
void MaxPoolingFp16Grad(const float16_t *input_ptr, const float16_t *dy_ptr, float16_t *output_ptr, int output_batch,
PoolingParameter *pooling_param) {
int stride_w = pooling_param->stride_w_;
int stride_h = pooling_param->stride_h_;
int pad_w = pooling_param->pad_l_;
int pad_h = pooling_param->pad_u_;
int win_w = pooling_param->window_w_;
int win_h = pooling_param->window_h_;
int channel = pooling_param->input_channel_;
int in_w = pooling_param->input_w_;
int in_h = pooling_param->input_h_;
int output_w = pooling_param->output_w_;
int output_h = pooling_param->output_h_;
for (int ib = 0; ib < output_batch; ib++) {
float16_t *out = &output_ptr[(ib * in_h * in_w * channel)];
const float16_t *inPtr = &input_ptr[(ib * in_h * in_w * channel)];
const float16_t *dyPtr = &dy_ptr[(ib * output_h * output_w * channel)];
for (int yh = 0; yh < output_h; yh++) {
int over_h = pad_h - yh * stride_h;
int kh_s = MSMAX(0, over_h);
int kh_e = MSMIN(win_h, in_h + over_h);
for (int yw = 0; yw < output_w; yw++) {
int over_w = pad_w - yw * stride_w;
int kw_s = MSMAX(0, over_w);
int kw_e = MSMIN(win_w, in_w + over_w);
int ic = 0;
for (; ic < (channel & ~3); ic += 4) {
int idx = (yw + yh * output_w) * channel + ic;
#ifdef ENABLE_NEON
uint32x4_t max_idx = vdupq_n_u32(0);
float16x4_t max_val = vdup_n_f16(-FLT16_MAX);
float16x4_t delta = vld1_f16(dyPtr + idx);
#else
float16_t delta[4] = {dyPtr[idx], dyPtr[idx + 1], dyPtr[idx + 2], dyPtr[idx + 3]};
float16_t max_val[4] = {-FLT16_MAX, -FLT16_MAX, -FLT16_MAX, -FLT16_MAX};
uint32_t max_idx[4] = {0};
#endif
for (int kh = kh_s; kh < kh_e; kh++) {
int xh = yh * stride_h + kh - pad_h;
for (int kw = kw_s; kw < kw_e; kw++) {
int xw = yw * stride_w + kw - pad_w;
int val_idx = (xw + in_w * xh) * channel + ic;
#ifdef ENABLE_NEON
uint32x4_t index = {val_idx, val_idx + 1, val_idx + 2, val_idx + 3};
float16x4_t in = vld1_f16(inPtr + val_idx);
max_idx = MaxIndex(in, &max_val, index, max_idx);
#else
float16_t val[4] = {inPtr[val_idx], inPtr[val_idx + 1], inPtr[val_idx + 2], inPtr[val_idx + 3]};
for (int i = 0; i < 4; i++) {
if (val[i] > max_val[i]) {
max_val[i] = val[i];
max_idx[i] = val_idx + i;
}
}
#endif
}
}
for (int i = 0; i < 4; i++) {
out[((int *)&max_idx)[i]] += ((float16_t *)&delta)[i];
}
}
for (; ic < channel; ic++) {
float16_t max_val = -FLT16_MAX;
int max_idx = 0;
int idx = (yw + yh * output_w) * channel + ic;
float16_t delta = dyPtr[idx];
for (int kh = kh_s; kh < kh_e; kh++) {
int xh = yh * stride_h + kh - pad_h;
for (int kw = kw_s; kw < kw_e; kw++) {
int xw = yw * stride_w + kw - pad_w;
int val_idx = (xw + in_w * xh) * channel + ic;
float16_t val = inPtr[val_idx];
if (val > max_val) {
max_val = val;
max_idx = val_idx;
}
}
}
out[max_idx] += delta;
}
}
}
}
}

View File

@ -0,0 +1,32 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_POOLING_GRAD_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_POOLING_GRAD_H_
#include "nnacl/fp16/pooling_fp16.h"
#ifdef __cplusplus
extern "C" {
#endif
void AvgPoolingFp16Grad(const float16_t *input_ptr, float16_t *output_ptr, int count, PoolingParameter *pooling_param);
void MaxPoolingFp16Grad(const float16_t *input_ptr, const float16_t *dy_ptr, float16_t *output_ptr, int output_batch,
PoolingParameter *pooling_param);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_POOLING_GRAD_H_

View File

@ -0,0 +1,142 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nnacl/fp16_grad/resize_grad.h"
#include <math.h>
#include "nnacl/infer/common_infer.h"
void ResizeNearestNeighborFp16Grad(float16_t *in_addr, float16_t *out_addr, int batch_size, int channel, int format,
ResizeFp16GradParameter *param) {
bool align_corners = param->align_corners_;
size_t in_hw_size = param->in_width_ * param->in_height_;
size_t out_hw_size = param->out_width_ * param->out_height_;
if (format == Format_NHWC) {
for (int32_t b = 0; b < batch_size; ++b) {
for (size_t i = 0; i < in_hw_size; ++i) {
size_t in_y = i / param->in_width_;
size_t in_x = i % param->in_width_;
size_t out_y = MSMIN(
(align_corners) ? (size_t)roundf(in_y * param->height_scale_) : (size_t)floorf(in_y * param->height_scale_),
param->out_height_ - 1);
size_t out_x = MSMIN(
(align_corners) ? (size_t)roundf(in_x * param->width_scale_) : (size_t)floorf(in_x * param->width_scale_),
param->out_width_ - 1);
// out_y / out_x depend only on the pixel, not on c.
for (int32_t c = 0; c < channel; ++c) {
size_t out_offset = out_y * (param->out_width_ * channel) + (out_x * channel) + c;
size_t in_offset = in_y * (param->in_width_ * channel) + (in_x * channel) + c;
out_addr[out_offset] += in_addr[in_offset];
}
}
out_addr += out_hw_size * channel;
in_addr += in_hw_size * channel;
}
} else if (format == Format_NCHW) {
for (int32_t b = 0; b < batch_size; ++b) {
for (int32_t c = 0; c < channel; ++c) {
for (size_t h = 0; h < param->in_height_; ++h) {
size_t out_y =
MSMIN((align_corners) ? (size_t)roundf(h * param->height_scale_) : (size_t)floorf(h * param->height_scale_),
param->out_height_ - 1);
for (size_t w = 0; w < param->in_width_; ++w) {
size_t out_x =
MSMIN((align_corners) ? (size_t)roundf(w * param->width_scale_) : (size_t)floorf(w * param->width_scale_),
param->out_width_ - 1);
out_addr[out_y * param->out_width_ + out_x] += in_addr[h * param->in_width_ + w];
}
}
out_addr += out_hw_size;
in_addr += in_hw_size;
}
}
}
}
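// Bilinear resize backward: each gradient element is split across the four
// neighboring source pixels using the same lerp weights as the forward
// interpolation (x_lerp / y_lerp and their complements).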
void ResizeBiLinearFp16Grad(float16_t *in_addr, float16_t *out_addr, int batch_size, int channel, int format,
ResizeFp16GradParameter *param) {
size_t in_hw_size = param->in_width_ * param->in_height_;
size_t out_hw_size = param->out_width_ * param->out_height_;
if (format == Format_NHWC) {
for (int32_t b = 0; b < batch_size; ++b) {
for (size_t i = 0; i < in_hw_size; ++i) {
size_t h = i / param->in_width_;
size_t w = i % param->in_width_;
for (int32_t c = 0; c < channel; ++c) {
float16_t in_y = (float16_t)h * param->height_scale_;
size_t top_y_index = MSMAX((size_t)(floorf(in_y)), (size_t)(0));
size_t bottom_y_index = MSMIN((size_t)(ceilf(in_y)), param->out_height_ - 1);
float16_t y_lerp = in_y - floorf(in_y);
float16_t inverse_y_lerp = 1.0 - y_lerp;
float16_t in_x = (float16_t)w * param->width_scale_;
size_t left_x_index = MSMAX((size_t)(floorf(in_x)), (size_t)(0));
size_t right_x_index = MSMIN((size_t)(ceilf(in_x)), param->out_width_ - 1);
float16_t x_lerp = in_x - floorf(in_x);
float16_t inverse_x_lerp = 1.0 - x_lerp;
size_t in_offset = h * (param->in_width_ * channel) + (w * channel) + c;
size_t out_offset_top_y_left_x = top_y_index * (param->out_width_ * channel) + (left_x_index * channel) + c;
size_t out_offset_top_y_right_x = top_y_index * (param->out_width_ * channel) + (right_x_index * channel) + c;
size_t out_offset_bottom_y_left_x =
bottom_y_index * (param->out_width_ * channel) + (left_x_index * channel) + c;
size_t out_offset_bottom_y_right_x =
bottom_y_index * (param->out_width_ * channel) + (right_x_index * channel) + c;
out_addr[out_offset_top_y_left_x] += in_addr[in_offset] * (float16_t)(inverse_y_lerp * inverse_x_lerp);
out_addr[out_offset_top_y_right_x] += in_addr[in_offset] * (float16_t)(inverse_y_lerp * x_lerp);
out_addr[out_offset_bottom_y_left_x] += in_addr[in_offset] * (float16_t)(y_lerp * inverse_x_lerp);
out_addr[out_offset_bottom_y_right_x] += in_addr[in_offset] * (float16_t)(y_lerp * x_lerp);
}
}
out_addr += out_hw_size * channel;
in_addr += in_hw_size * channel;
}
} else if (format == Format_NCHW) {
size_t in_height = param->in_height_;
size_t in_width = param->in_width_;
size_t out_height = param->out_height_;
size_t out_width = param->out_width_;
for (int32_t b = 0; b < batch_size; ++b) {
for (int32_t c = 0; c < channel; ++c) {
for (size_t h = 0; h < in_height; ++h) {
const float16_t in_y = (float16_t)h * param->height_scale_;
const size_t top_y_index = MSMAX((size_t)floorf(in_y), 0);
const size_t bottom_y_index = MSMIN((size_t)ceilf(in_y), out_height - 1);
const float16_t y_lerp = in_y - floorf(in_y);
const float16_t inverse_y_lerp = 1.0 - y_lerp;
for (size_t w = 0; w < in_width; ++w) {
const float16_t in_x = (float16_t)w * param->width_scale_;
const size_t left_x_index = MSMAX((size_t)floorf(in_x), 0);
const size_t right_x_index = MSMIN((size_t)ceilf(in_x), out_width - 1);
const float16_t x_lerp = in_x - floorf(in_x);
const float16_t inverse_x_lerp = 1.0 - x_lerp;
out_addr[top_y_index * out_width + left_x_index] +=
in_addr[h * in_width + w] * (float16_t)(inverse_y_lerp * inverse_x_lerp);
out_addr[top_y_index * out_width + right_x_index] +=
in_addr[h * in_width + w] * (float16_t)(inverse_y_lerp * x_lerp);
out_addr[bottom_y_index * out_width + left_x_index] +=
in_addr[h * in_width + w] * (float16_t)(y_lerp * inverse_x_lerp);
out_addr[bottom_y_index * out_width + right_x_index] +=
in_addr[h * in_width + w] * (float16_t)(y_lerp * x_lerp);
}
}
out_addr += out_hw_size;
in_addr += in_hw_size;
}
}
}
}

View File

@ -0,0 +1,45 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_RESIZE_GRAD_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_RESIZE_GRAD_H_
#include "nnacl/op_base.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct ResizeFp16GradParameter {
OpParameter op_parameter_;
bool align_corners_;
int method;
size_t in_height_;
size_t in_width_;
size_t out_height_;
size_t out_width_;
float16_t height_scale_;
float16_t width_scale_;
} ResizeFp16GradParameter;
void ResizeNearestNeighborFp16Grad(float16_t *in_addr, float16_t *out_addr, int batch_size, int channel, int format,
ResizeFp16GradParameter *param);
void ResizeBiLinearFp16Grad(float16_t *in_addr, float16_t *out_addr, int batch_size, int channel, int format,
ResizeFp16GradParameter *param);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_RESIZE_GRAD_H_

View File

@ -0,0 +1,62 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nnacl/fp16_grad/strided_slice_grad.h"
#include "nnacl/errorcode.h"
static size_t CalcIndex(const int *shape, size_t size, int i, size_t pos) {
size_t res = 1;
for (size_t j = 0; j < size; j++) {
res *= shape[(i + 1) + j];
}
return (pos / res % shape[i]);
}
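// StridedSlice backward: every element of the incoming (7D-padded) gradient
// is scattered to its source position in dx via begins/strides. Positions
// outside the slice are left untouched, so the output buffer is assumed to
// be zero-initialized by the caller.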
int DoStridedSliceFp16Grad(const float16_t *inputs, float16_t *output, const int *dx_shape,
StridedSliceParameter *param) {
if (inputs == NULL || output == NULL || param == NULL) {
return NNACL_NULL_PTR;
}
if (param->num_axes_ > DIMENSION_7D) {
return NNACL_PARAM_INVALID;
}
size_t size = 1;
int *s = param->strides_;
int *b = param->begins_;
for (int i = 0; i < DIMENSION_7D; i++) {
size *= param->in_shape_[i];
}
for (size_t pos = 0; pos < size; pos++) {
size_t i = CalcIndex(param->in_shape_, 6, 0, pos);
size_t j = CalcIndex(param->in_shape_, 5, 1, pos);
size_t k = CalcIndex(param->in_shape_, 4, 2, pos);
size_t l = CalcIndex(param->in_shape_, 3, 3, pos);
size_t m = CalcIndex(param->in_shape_, 2, 4, pos);
size_t n = CalcIndex(param->in_shape_, 1, 5, pos);
size_t o = CalcIndex(param->in_shape_, 0, 6, pos);
size_t input_idx =
(i * s[0] + b[0]) * dx_shape[1] * dx_shape[2] * dx_shape[3] * dx_shape[4] * dx_shape[5] * dx_shape[6] +
(j * s[1] + b[1]) * dx_shape[2] * dx_shape[3] * dx_shape[4] * dx_shape[5] * dx_shape[6] +
(k * s[2] + b[2]) * dx_shape[3] * dx_shape[4] * dx_shape[5] * dx_shape[6] +
(l * s[3] + b[3]) * dx_shape[4] * dx_shape[5] * dx_shape[6] + (m * s[4] + b[4]) * dx_shape[5] * dx_shape[6] +
(n * s[5] + b[5]) * dx_shape[6] + (o * s[6] + b[6]);
output[input_idx] = inputs[pos];
}
return NNACL_OK;
}

View File

@ -0,0 +1,31 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_STRIDED_SLICE_GRAD_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_STRIDED_SLICE_GRAD_H_
#include "nnacl/op_base.h"
#include "nnacl/strided_slice_parameter.h"
#ifdef __cplusplus
extern "C" {
#endif
int DoStridedSliceFp16Grad(const float16_t *inputs, float16_t *output, const int *dx_shape,
StridedSliceParameter *param);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_STRIDED_SLICE_GRAD_H_

View File

@ -0,0 +1,33 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nnacl/fp16_grad/unsorted_segment_sum.h"
#include "nnacl/errorcode.h"
int UnsortedSegmentSumFp16(const float16_t *input, int unit_num, int input_dim1, const int *indices, float16_t *output,
int output_dim0, int output_dim1) {
for (int i = 0; i < unit_num; ++i) {
int j = i / input_dim1;
int k = i % input_dim1;
int index = indices[j];
if (index < 0 || index >= output_dim0) {
continue;
}
int output_index = index * output_dim1 + k;
output[output_index] += input[i];
}
return NNACL_OK;
}

View File

@ -0,0 +1,31 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_UNSORTED_SEGMENT_SUM_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_UNSORTED_SEGMENT_SUM_H_
#include "nnacl/op_base.h"
#ifdef __cplusplus
extern "C" {
#endif
int UnsortedSegmentSumFp16(const float16_t *input, int unit_num, int input_dim1, const int *indices, float16_t *output,
int output_dim0, int output_dim1);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_UNSORTED_SEGMENT_SUM_H_

View File

@ -321,7 +321,7 @@ static int Filtergrad2Arm(const float *x, const float *dy, int i_c, int k_idx, f
dw[(i_c + 1) * k_spatial + k_idx] = sum_2[1];
i_c += 2;
}
return i_c += 2;
return i_c;
}
#endif
int ConvDwFilterGrad(const float *x, const float *dy, float *dw, int start, int count,

View File

@ -118,4 +118,11 @@ static inline float16x8_t MS_ERFX8_F16(float16x8_t src) {
dst[7] = erff(src[7]);
return dst;
}
static inline float32x4_t MS_VMLAL_F16(float16x4_t x, float16x4_t dy, float32x4_t sum) {
float32x4_t x_fp32 = MS_CVT_F32_F16(x);
float32x4_t dy_fp32 = MS_CVT_F32_F16(dy);
return vmlaq_f32(sum, x_fp32, dy_fp32);
}
#endif // MINDSPORE_NNACL_INTRINSICS_MS_SIMD_INSTRUCTIONS_FP16_H_

View File

@ -39,7 +39,7 @@ OpParameter *PopulatePartialParameter(const void *prim) {
MS_LOG(ERROR) << "malloc partial parameter failed.";
return nullptr;
}
memset(partial_parameter, 0, sizeof(PartialParameter));
memset(reinterpret_cast<void *>(partial_parameter), 0, sizeof(PartialParameter));
partial_parameter->op_parameter_.type_ = schema::PrimitiveType_PartialFusion;
partial_parameter->sub_graph_index_ = partial_prim->subGraphIndex();

View File

@ -39,7 +39,7 @@ OpParameter *PopulateWhileParameter(const void *prim) {
MS_LOG(ERROR) << "malloc WhileParemeter failed.";
return nullptr;
}
memset(while_paremeter, 0, sizeof(WhileParemeter));
memset(reinterpret_cast<void *>(while_paremeter), 0, sizeof(WhileParemeter));
while_paremeter->op_parameter_.type_ = schema::PrimitiveType_While;
while_paremeter->body_subgraph_index = while_prim->bodySubgraphIndex();

View File

@ -38,7 +38,7 @@ OpParameter *PopulateWhileParemeter(const void *prim) {
MS_LOG(ERROR) << "malloc WhileParemeter failed.";
return nullptr;
}
memset(param, 0, sizeof(WhileParemeter));
memset(reinterpret_cast<void *>(param), 0, sizeof(WhileParemeter));
param->op_parameter_.type_ = primitive->value_type();
param->body_subgraph_index = value->body_subgraph_index();

View File

@ -52,10 +52,26 @@ int ActivationGradCPUKernelFp16::DoActivation(int task_id) {
auto error_code = RET_OK;
if (param_act_grad_->type_ == schema::ActivationType_RELU) {
error_code = Fp16ReluGrad(yt_addr + start, input_addr + start, count, output_addr + start);
error_code = ReluFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start);
} else if (param_act_grad_->type_ == schema::ActivationType_RELU6) {
error_code = Relu6Fp16Grad(yt_addr + start, input_addr + start, count, output_addr + start);
} else if (param_act_grad_->type_ == schema::ActivationType_LEAKY_RELU) {
error_code = LReluFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start,
(float16_t)param_act_grad_->alpha_);
} else if (param_act_grad_->type_ == schema::ActivationType_SIGMOID) {
// Sigmoid gets the input tensors in reverse order!
error_code = Fp16SigmoidGrad(input_addr + start, yt_addr + start, count, output_addr + start);
error_code = SigmoidFp16Grad(input_addr + start, yt_addr + start, count, output_addr + start);
} else if (param_act_grad_->type_ == schema::ActivationType_TANH) {
error_code = TanhFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start);
} else if (param_act_grad_->type_ == schema::ActivationType_HSWISH) {
error_code = HSwishFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start);
} else if (param_act_grad_->type_ == schema::ActivationType_HSIGMOID) {
error_code = HSigmoidFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start);
} else if (param_act_grad_->type_ == schema::ActivationType_ELU) {
error_code =
EluFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start, (float16_t)param_act_grad_->alpha_);
} else if (param_act_grad_->type_ == schema::ActivationType_GELU) {
error_code = GeluFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start);
} else {
MS_LOG(ERROR) << "Activation type error";
return RET_ERROR;

View File

@ -14,12 +14,13 @@
* limitations under the License.
*/
#ifndef MINDSPORE_ACTIVATION_FP16_GRAD_H
#define MINDSPORE_ACTIVATION_FP16_GRAD_H
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_ACTIVATION_FP16_GRAD_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_ACTIVATION_FP16_GRAD_H_
#include <vector>
#include "src/inner_kernel.h"
#include "nnacl/fp16_grad/activation_grad.h"
#include "nnacl/fp32_grad/activation_grad.h"
namespace mindspore::kernel {
class ActivationGradCPUKernelFp16 : public InnerKernel {
@ -27,7 +28,7 @@ class ActivationGradCPUKernelFp16 : public InnerKernel {
explicit ActivationGradCPUKernelFp16(OpParameter *param, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: InnerKernel(param, inputs, outputs, ctx), thread_count_(ctx->thread_num_) {
param_act_grad_ = reinterpret_cast<ActivationGradParameterFp16 *>(param);
param_act_grad_ = reinterpret_cast<ActivationGradParameter *>(param);
}
~ActivationGradCPUKernelFp16() override = default;
@ -37,9 +38,9 @@ class ActivationGradCPUKernelFp16 : public InnerKernel {
int DoActivation(int task_id);
private:
ActivationGradParameterFp16 *param_act_grad_;
ActivationGradParameter *param_act_grad_;
int thread_count_;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_ACTIVATION_FP16_GRAD_H
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_ACTIVATION_FP16_GRAD_H_

View File

@ -0,0 +1,89 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp16_grad/arithmetic_fp16_grad.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "nnacl/fp16_grad/arithmetic_grad.h"
#include "include/errorcode.h"
using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
namespace mindspore::kernel {
int ArithmeticGradCPUKernelFp16::Init() { return RET_OK; }
void ArithmeticGradCPUKernelFp16::ArithmeticGradMaximum(float16_t *dy, int dy_size, float16_t *dx1, int dx1_size,
float16_t *dx2, int dx2_size) {
auto x1 = reinterpret_cast<float16_t *>(in_tensors_[0]->data_c());
auto x2 = reinterpret_cast<float16_t *>(in_tensors_[1]->data_c());
dy = reinterpret_cast<float16_t *>(in_tensors_[2]->data_c());
MaximumByAxesFp16(x1, x2, dy, arithmeticParameter_->in_shape0_, arithmeticParameter_->in_shape1_,
arithmeticParameter_->out_shape_, dx1, dx2, arithmeticParameter_->ndim_);
}
void ArithmeticGradCPUKernelFp16::ArithmeticGradMinimum(float16_t *dy, int dy_size, float16_t *dx1, int dx1_size,
float16_t *dx2, int dx2_size) {
auto x1 = reinterpret_cast<float16_t *>(in_tensors_[0]->data_c());
auto x2 = reinterpret_cast<float16_t *>(in_tensors_[1]->data_c());
dy = reinterpret_cast<float16_t *>(in_tensors_[2]->data_c());
MinimumByAxesFp16(x1, x2, dy, arithmeticParameter_->in_shape0_, arithmeticParameter_->in_shape1_,
arithmeticParameter_->out_shape_, dx1, dx2, arithmeticParameter_->ndim_);
}
int ArithmeticGradCPUKernelFp16::ReSize() { return RET_OK; }
int ArithmeticGradCPUKernelFp16::Execute(int task_id) {
auto dy = reinterpret_cast<float16_t *>(in_tensors_[0]->data_c());
auto dx1 = reinterpret_cast<float16_t *>(out_tensors_[0]->data_c());
auto dx2 = reinterpret_cast<float16_t *>(out_tensors_[1]->data_c());
size_t dy_size = in_tensors_.at(0)->ElementsNum();
size_t dx1_size = out_tensors_.at(0)->ElementsNum();
size_t dx2_size = out_tensors_.at(1)->ElementsNum();
(this->*arithmetic_grad_)(dy, dy_size, dx1, dx1_size, dx2, dx2_size);
return RET_OK;
}
int ArithmeticGradRunFp16(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
MS_ASSERT(cdata != nullptr);
auto Arithmetic_kernel = reinterpret_cast<ArithmeticGradCPUKernelFp16 *>(cdata);
auto error_code = Arithmetic_kernel->Execute(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "ArithmeticGradRunFp16 error task_id[" << task_id << "] error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
int ArithmeticGradCPUKernelFp16::Run() {
int error_code = static_cast<const lite::InnerContext *>(this->context_)
->thread_pool_->ParallelLaunch(ArithmeticGradRunFp16, this, 1);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "Arithmetic Grad function error error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_MaximumGrad, LiteKernelCreator<ArithmeticGradCPUKernelFp16>)
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_MinimumGrad, LiteKernelCreator<ArithmeticGradCPUKernelFp16>)
} // namespace mindspore::kernel

View File

@ -0,0 +1,80 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_ARITHMETIC_FP16_GRAD_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_ARITHMETIC_FP16_GRAD_H_
#include <vector>
#include "src/inner_kernel.h"
#include "nnacl/fp16/arithmetic_fp16.h"
#include "schema/model_generated.h"
using mindspore::schema::PrimitiveType_AddGrad;
using mindspore::schema::PrimitiveType_DivGrad;
using mindspore::schema::PrimitiveType_MaximumGrad;
using mindspore::schema::PrimitiveType_MinimumGrad;
using mindspore::schema::PrimitiveType_MulGrad;
using mindspore::schema::PrimitiveType_SubGrad;
namespace mindspore::kernel {
class ArithmeticGradCPUKernelFp16;
class ArithmeticGradCPUKernelFp16 : public InnerKernel {
typedef void (ArithmeticGradCPUKernelFp16::*ArithmeticGradOperation)(float16_t *, int, float16_t *, int, float16_t *,
int);
public:
explicit ArithmeticGradCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: InnerKernel(parameter, inputs, outputs, ctx), tile_data0(NULL), tile_data1(NULL), tile_data2(NULL) {
switch (type()) {
case PrimitiveType_MaximumGrad:
arithmetic_grad_ = &ArithmeticGradCPUKernelFp16::ArithmeticGradMaximum;
break;
case PrimitiveType_MinimumGrad:
arithmetic_grad_ = &ArithmeticGradCPUKernelFp16::ArithmeticGradMinimum;
break;
default:
MS_LOG(ERROR) << "Error Operator type " << parameter->type_;
break;
}
arithmeticParameter_ = reinterpret_cast<ArithmeticParameter *>(parameter);
}
~ArithmeticGradCPUKernelFp16() override {
if (tile_data0) delete[] tile_data0;
if (tile_data1) delete[] tile_data1;
if (tile_data2) delete[] tile_data2;
}
int Init() override;
int InferShape();
int ReSize() override;
int Run() override;
int Execute(int task_id);
private:
void ArithmeticGradMaximum(float16_t *dy, int dy_size, float16_t *dx1, int dx1_size, float16_t *dx2, int dx2_size);
void ArithmeticGradMinimum(float16_t *dy, int dy_size, float16_t *dx1, int dx1_size, float16_t *dx2, int dx2_size);
ArithmeticParameter *arithmeticParameter_;
ArithmeticGradOperation arithmetic_grad_;
float16_t *tile_data0;
float16_t *tile_data1;
float16_t *tile_data2;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_ARITHMETIC_FP16_GRAD_H_

View File

@ -0,0 +1,96 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <vector>
#include "src/runtime/kernel/arm/fp16_grad/bias_fp16_grad.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_BiasAddGrad;
namespace mindspore::kernel {
int BiasGradCPUKernelFp16::ReSize() {
auto dims = in_tensors_[0]->shape();
bias_param->ndim_ = dims.size();
for (unsigned int i = 0; i < bias_param->ndim_; i++) {
bias_param->in_shape0_[i] = dims[i];
bias_param->out_shape_[i] = 1;  // N, H, W reduce to 1; only C is kept below
}
bias_param->out_shape_[bias_param->ndim_ - 1] = dims[bias_param->ndim_ - 1];
for (auto i = bias_param->ndim_; i < 4; i++) {
bias_param->in_shape0_[i] = 0;
bias_param->out_shape_[i] = 0;
}
return RET_OK;
}
int BiasGradCPUKernelFp16::Init() {
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
}
int BiasGradCPUKernelFp16::Execute(int task_id) {
auto in = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
auto out = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
size_t nhw_size = 1;
size_t channels = bias_param->in_shape0_[bias_param->ndim_ - 1]; // C in NHWC
for (unsigned int i = 0; i < bias_param->ndim_ - 1; i++) {
nhw_size *= bias_param->in_shape0_[i];
}
size_t total_size = channels * nhw_size;
for (size_t c = 0; c < channels; ++c) {
float sum = 0;
for (size_t offset = 0; offset < total_size; offset += channels) {
sum += in[offset + c];
}
out[c] = (float16_t)sum;
}
return RET_OK;
}
int BiasGradFp16Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto bias_kernel = reinterpret_cast<BiasGradCPUKernelFp16 *>(cdata);
auto error_code = bias_kernel->Execute(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "bias error task_id[" << task_id << "] error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
int BiasGradCPUKernelFp16::Run() {
int error_code =
static_cast<const lite::InnerContext *>(this->context_)->thread_pool_->ParallelLaunch(BiasGradFp16Run, this, 1);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "bias function error error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_BiasAddGrad, LiteKernelCreator<BiasGradCPUKernelFp16>)
} // namespace mindspore::kernel

View File

@ -0,0 +1,44 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_BIAS_FP16_GRAD_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_BIAS_FP16_GRAD_H_
#include <vector>
#include "src/lite_kernel.h"
#include "nnacl/fp16/arithmetic_fp16.h"
namespace mindspore::kernel {
class BiasGradCPUKernelFp16 : public InnerKernel {
public:
explicit BiasGradCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: InnerKernel(parameter, inputs, outputs, ctx) {
bias_param = reinterpret_cast<ArithmeticParameter *>(parameter);
}
~BiasGradCPUKernelFp16() override = default;
int Init() override;
int ReSize() override;
int Run() override;
int Execute(int task_id);
private:
ArithmeticParameter *bias_param;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_BIAS_FP16_GRAD_H_

View File

@ -0,0 +1,166 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp16_grad/bn_fp16_grad.h"
#include <math.h>
#include <algorithm>
#include <vector>
#include <string>
#include <thread>
#include <fstream>
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "nnacl/fp16_grad/batch_norm.h"
#include "include/errorcode.h"
using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_BatchNormGrad;
namespace mindspore::kernel {
int BNGradCPUKernelFp16::ReSize() {
auto *input_x = in_tensors_.at(1);
int channels = input_x->shape().at(kNHWC_C);
ws_size_ = 2 * channels;
// dxhat_sum / dxhathat_sum are accumulated in fp32, so the workspace must
// hold ws_size_ floats, not float16s.
set_workspace_size(ws_size_ * sizeof(float));
return RET_OK;
}
int BNGradCPUKernelFp16::Init() {
for (size_t i = 0; i < in_tensors_.size(); i++) {
if (in_tensors_.at(i)->data_type() != kNumberTypeFloat16) {
MS_LOG(ERROR) << "BNGradCPUKernelFp16 type error in_tensor_[" << i << "]";
return RET_ERROR;
}
}
return ReSize();
}
int BNGradCPUKernelFp16::Execute(int task_id) {
auto *input_yt = in_tensors_.at(0);
auto *input_x = in_tensors_.at(1);
auto *input_scale = in_tensors_.at(2);
auto *input_mean = in_tensors_.at(3);
auto *input_var = in_tensors_.at(4);
auto kernel_name = this->name();
if (kernel_name.find("FusedBatchNormGradCPU") != std::string::npos) {
input_mean = in_tensors_.at(4);
input_var = in_tensors_.at(5);
}
auto bn_param = reinterpret_cast<BNGradParameter *>(op_parameter_);
int stage = stage_;
int thread_num = thread_num_;
float16_t *save_mean = reinterpret_cast<float16_t *>(input_mean->data_c());
float16_t *save_var = reinterpret_cast<float16_t *>(input_var->data_c());
auto *output_dx = out_tensors_.at(0);
auto *output_scale = out_tensors_.at(1);
auto *output_bias = out_tensors_.at(2);
int32_t batch = input_x->Batch();
int32_t channels = input_x->Channel();
int32_t spatial = input_x->Height() * input_x->Width();
float *workspace_temp = static_cast<float *>(workspace());
float *dxhat_sum = workspace_temp;
float *dxhathat_sum = dxhat_sum + channels;
float16_t *x = reinterpret_cast<float16_t *>(input_x->data_c());
float16_t *yt = reinterpret_cast<float16_t *>(input_yt->data_c());
float16_t *scale = reinterpret_cast<float16_t *>(input_scale->data_c());
float16_t *dx = reinterpret_cast<float16_t *>(output_dx->data_c());
float16_t *dbias = reinterpret_cast<float16_t *>(output_bias->data_c());
float16_t *dscale = reinterpret_cast<float16_t *>(output_scale->data_c());
int total = spatial * batch;
int stride = UP_DIV(total, thread_num);
int count = MSMIN(stride, total - stride * task_id);
count = (count < 0) ? 0 : count;
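// Staged backward pass: stage 0 prepares buffers (and, when single-threaded,
// runs the whole backward); stage 1 reduces the dxhat sums plus dbias and
// dscale; stage 2 computes dx over parallel batch/spatial slices.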
switch (stage) {
case 0: {
for (int job = task_id; job < 4; job += thread_num) {
switch (job) {
case 0:
var2InvarFp16(save_var, input_var->ElementsNum(), bn_param->epsilon_);
break;
case 1:
std::fill(workspace_temp, workspace_temp + ws_size_, 0.f);
break;
case 2:
std::fill(dbias, dbias + channels, 0.f);
break;
case 3:
std::fill(dscale, dscale + channels, 0.f);
break;
}
}
if (thread_num == 1) {
backwardAllFp16(x, yt, save_mean, save_var, scale, total, channels, dxhat_sum, dxhathat_sum, dbias, dscale, dx);
}
break;
}
case 1: {
backwardP1Fp16(x, yt, save_mean, save_var, scale, total, channels, dxhat_sum, dxhathat_sum, dbias, dscale);
break;
}
case 2: {
backwardP2Fp16(x + task_id * stride * channels, yt + task_id * stride * channels, save_mean, save_var, scale,
count, total, channels, dxhat_sum, dxhathat_sum, dx + task_id * stride * channels);
break;
}
}
return RET_OK;
}
int BNGradFp16Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
MS_ASSERT(cdata != nullptr);
auto bn_kernel = reinterpret_cast<BNGradCPUKernelFp16 *>(cdata);
auto error_code = bn_kernel->Execute(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "BNGradRun error task_id[" << task_id << "] error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
int BNGradCPUKernelFp16::Run() {
stage_ = 0;
thread_num_ = context_->thread_num_;
if (thread_num_ == 1) {
int error_code = static_cast<const lite::InnerContext *>(this->context_)
->thread_pool_->ParallelLaunch(BNGradFp16Run, this, thread_num_);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "BN function error error_code[" << error_code << "]";
return RET_ERROR;
}
} else {
const std::vector<int> threads = {thread_num_, 1, thread_num_};
for (size_t stage = 0; stage < threads.size(); stage++) {
stage_ = static_cast<int>(stage);
int error_code = static_cast<const lite::InnerContext *>(this->context_)
->thread_pool_->ParallelLaunch(BNGradFp16Run, this, threads.at(stage));
if (error_code != RET_OK) {
MS_LOG(ERROR) << "BN function error error_code[" << error_code << "]";
return RET_ERROR;
}
}
}
return RET_OK;
}
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_BatchNormGrad, LiteKernelCreator<BNGradCPUKernelFp16>)
} // namespace mindspore::kernel

View File

@ -0,0 +1,43 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_BN_FP16_GRAD_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_BN_FP16_GRAD_H_
#include <vector>
#include "src/lite_kernel.h"
#include "nnacl/fp32_grad/batch_norm.h"
namespace mindspore::kernel {
class BNGradCPUKernelFp16 : public InnerKernel {
public:
explicit BNGradCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: InnerKernel(parameter, inputs, outputs, ctx) {}
~BNGradCPUKernelFp16() override {}
int Init() override;
int ReSize() override;
int Run() override;
int Execute(int task_id);
private:
int thread_num_ = 1;
int stage_ = 0;
size_t ws_size_ = 0;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_BN_FP16_GRAD_H_

View File

@ -0,0 +1,205 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp16_grad/convolution_fp16_grad_filter.h"
#include "src/kernel_registry.h"
#include "nnacl/pack.h"
#include "nnacl/fp16_grad/convolution_grad_filter.h"
#include "nnacl/fp16_grad/pack_fp16_ext.h"
#include "nnacl/fp16_grad/gemm_fp16.h"
#include "include/errorcode.h"
using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_Conv2DBackpropFilterFusion;
namespace mindspore::kernel {
int ConvolutionGradFilterCPUKernelFp16::ReSize() {
// dy is in input 0
// x is in input 1
// dw is output 0
auto *x_tensor = in_tensors_.at(1);
MS_ASSERT(x_tensor != nullptr);
auto *dy_tensor = in_tensors_.at(0);
MS_ASSERT(dy_tensor != nullptr);
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter_);
conv_param->output_batch_ = dy_tensor->shape().at(kNHWC_N);
conv_param->input_batch_ = x_tensor->shape().at(kNHWC_N);
conv_param->input_h_ = x_tensor->shape().at(kNHWC_H);
conv_param->input_w_ = x_tensor->shape().at(kNHWC_W);
// assume OutCh|kh|kw|InCh
conv_param->input_channel_ = x_tensor->shape().at(kNHWC_C);
conv_param->output_channel_ = dy_tensor->shape().at(kNHWC_C);
conv_param->output_h_ = dy_tensor->shape()[kNHWC_H];
conv_param->output_w_ = dy_tensor->shape()[kNHWC_W];
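// 1x1, stride-1, pad-free, non-grouped convolutions can feed the raw input
// to GEMM directly; everything else goes through rolling im2col. Depthwise
// shapes (group == in_ch == out_ch, dilation 1) take a dedicated path.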
do_img2col_ = (conv_param->kernel_h_ == 1) && (conv_param->kernel_w_ == 1) && (conv_param->pad_d_ == 0) &&
(conv_param->pad_u_ == 0) && (conv_param->pad_l_ == 0) && (conv_param->pad_r_ == 0) &&
(conv_param->dilation_h_ == 1) && (conv_param->dilation_w_ == 1) && (conv_param->stride_h_ == 1) &&
(conv_param->stride_w_ == 1) && (conv_param->group_ == 1)
? false
: true;
do_dw_ = (conv_param->output_channel_ == conv_param->group_) &&
(conv_param->input_channel_ == conv_param->output_channel_) && (conv_param->dilation_h_ == 1) &&
(conv_param->dilation_w_ == 1)
? true
: false;
ws_size_ = chunk_ * conv_param->kernel_h_ * conv_param->kernel_w_ * conv_param->input_channel_;
ws_size_ = do_dw_ ? ws_size_ : ws_size_ / conv_param->group_;
int n = conv_param->kernel_h_ * conv_param->kernel_w_ * conv_param->input_channel_ / conv_param->group_;
int k = conv_param->output_channel_ / conv_param->group_;
int thread_num = context_->thread_num_;
mat_alloc_ = MatSizeTotalFp16(k, n, chunk_, 0);
set_workspace_size((ws_size_ + mat_alloc_ + (k * n)) * thread_num * sizeof(float16_t));
return RET_OK;
}
int ConvolutionGradFilterCPUKernelFp16::Init() { return ReSize(); }
int ConvolutionGradFilterCPUKernelFp16::Execute(int task_id) {
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter_);
auto *input_dy = in_tensors_.at(0);
auto *input_x = in_tensors_.at(1);
auto *out_dw = out_tensors_.at(0);
auto x_addr = reinterpret_cast<float16_t *>(input_x->data_c());
auto dy_addr = reinterpret_cast<float16_t *>(input_dy->data_c());
auto dw_addr = reinterpret_cast<float16_t *>(out_dw->data_c());
int nweights = out_dw->ElementsNum();
int in_ch = conv_param->input_channel_;
int in_h = conv_param->input_h_;
int in_w = conv_param->input_w_;
int k_h = conv_param->kernel_h_;
int k_w = conv_param->kernel_w_;
int batch = conv_param->output_batch_;
int out_ch = conv_param->output_channel_;
int groups = conv_param->group_;
int out_h = conv_param->output_h_;
int out_w = conv_param->output_w_;
int m = out_h * out_w;
int n = k_h * k_w * in_ch / groups;
int k = out_ch / groups;
int thread_num = context_->thread_num_;
float16_t *workspace_temp = reinterpret_cast<float16_t *>(workspace());
float16_t *mat_workspace = workspace_temp + ws_size_ * thread_num + task_id * (mat_alloc_ + k * n);
float16_t *mat_tmp = mat_workspace + mat_alloc_;
int stride = UP_DIV(batch, thread_num);
int count = MSMIN(stride, batch - stride * task_id);
count = (count < 0) ? 0 : count;
int start = stride * task_id;
int end = start + count;
if (do_dw_) {
#ifdef ENABLE_ARM
stride = UP_DIV(k_h * k_w, thread_num);
count = MSMIN(stride, k_h * k_w - stride * task_id);
count = (count < 0) ? 0 : count;
start = stride * task_id;
ConvDwFilterFp16Grad(x_addr, dy_addr, dw_addr, start, count, conv_param);
#else
stride = UP_DIV(groups, thread_num);
count = MSMIN(stride, groups - stride * task_id);
start = stride * task_id;
end = start + count;
const int kernel_spatial = k_h * k_w;
for (int i = 0; i < batch; ++i) {
for (int ci = 0; ci < m; ci += chunk_) {
int real_chunk = MSMIN(m - ci, chunk_);
float16_t *mat_b = workspace_temp + task_id * ws_size_;
float16_t *im = x_addr + (i * in_ch * in_h * in_w);
RollingIm2ColPackDwUnitFp16(im, conv_param, mat_b, real_chunk, ci);
for (int j = start; j < end; ++j) {
float16_t *mat_a = dy_addr + (i * groups) * m * k + j * (out_ch / groups) + ci * out_ch;
float16_t *mat_c = dw_addr + j * nweights / groups;
GemmMatmulFp16(1, 0, k, n, real_chunk, 1, mat_a, out_ch, mat_b + (j * kernel_spatial), n * groups, 1, mat_c,
n, mat_workspace);
}
}
}
#endif
} else if (do_img2col_) {
for (int i = start; i < end; ++i) {
for (int ci = 0; ci < m; ci += chunk_) {
for (int j = 0; j < groups; ++j) {
int real_chunk = MSMIN(m - ci, chunk_);
float16_t *mat_a = dy_addr + (i * groups) * m * k + j * (out_ch / groups) + ci * out_ch;
float16_t *mat_b = workspace_temp + task_id * ws_size_;
float16_t *mat_c = dw_addr + j * nweights / groups;
float16_t *im = x_addr + (i * in_ch * in_h * in_w) + j * (in_ch / groups);
RollingIm2ColPackUnitFp16(im, conv_param, mat_b, real_chunk, ci);
GemmMatmulFp16(1, 0, k, n, real_chunk, 1, mat_a, out_ch, mat_b, n, 0, mat_tmp, n, mat_workspace);
std::unique_lock<std::mutex> merge_lock(lock_);
AddMatrixFp16(mat_tmp, mat_c, 1, k, n, n);
}
}
}
} else {
float16_t *mat_c = dw_addr;
const size_t in_plane_size = in_ch * in_h * in_w;
for (int i = start; i < end; ++i) {
for (int ci = 0; ci < m; ci += chunk_) {
int real_chunk = MSMIN(m - ci, chunk_);
float16_t *mat_a = dy_addr + i * m * k + ci * out_ch;
float16_t *im = x_addr + i * in_plane_size;
int input_h = ci / out_w * conv_param->stride_h_;
int input_w = ci % out_w * conv_param->stride_w_;
int offset = (input_h * in_w + input_w) * in_ch;
GemmMatmulFp16(1, 0, k, n, real_chunk, 1, mat_a, out_ch, im + offset, n, 0, mat_tmp, n, mat_workspace);
std::unique_lock<std::mutex> merge_lock(lock_);
AddMatrixFp16(mat_tmp, mat_c, 1, k, n, n);
}
}
}
return RET_OK;
}
int ConvolutionGradFilterFp16Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
MS_ASSERT(cdata != nullptr);
auto convfilter_kernel = reinterpret_cast<ConvolutionGradFilterCPUKernelFp16 *>(cdata);
auto error_code = convfilter_kernel->Execute(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "ConvolutionGradFilterRun error task_id[" << task_id << "] error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
int ConvolutionGradFilterCPUKernelFp16::Run() {
auto *out_dw = out_tensors_.at(0);
auto dw_addr = reinterpret_cast<float16_t *>(out_dw->data_c());
memset(dw_addr, 0, out_dw->Size());
int error_code = static_cast<const lite::InnerContext *>(this->context_)
->thread_pool_->ParallelLaunch(ConvolutionGradFilterFp16Run, this, context_->thread_num_);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "conv filter function error error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Conv2DBackpropFilterFusion,
LiteKernelCreator<ConvolutionGradFilterCPUKernelFp16>)
} // namespace mindspore::kernel

View File

@ -0,0 +1,51 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_CONVOLUTION_FP16_GRAD_FILTER_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_CONVOLUTION_FP16_GRAD_FILTER_H_
#include <vector>
#include "src/inner_kernel.h"
namespace mindspore::kernel {
class ConvolutionGradFilterCPUKernelFp16 : public InnerKernel {
public:
explicit ConvolutionGradFilterCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: InnerKernel(parameter, inputs, outputs, ctx) {}
~ConvolutionGradFilterCPUKernelFp16() override {}
int Init() override;
int ReSize() override;
int Run() override;
int Execute(int task_id);
private:
size_t ws_size_ = 0;
bool do_img2col_ = true;
bool do_dw_ = false;
std::mutex lock_;
size_t mat_alloc_ = 0;
#ifdef ENABLE_ARM32
const int chunk_ = C4NUM * 2;
#else
const int chunk_ = C12NUM * 2;
#endif
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_CONVOLUTION_FP16_GRAD_FILTER_H_

View File

@ -0,0 +1,172 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp16_grad/convolution_fp16_grad_input.h"
#include "src/kernel_registry.h"
#include "nnacl/pack.h"
#include "nnacl/fp16_grad/pack_fp16_ext.h"
#include "nnacl/fp16_grad/gemm_fp16.h"
#include "include/errorcode.h"
using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_Conv2DBackpropInputFusion;
namespace mindspore::kernel {
int ConvolutionGradInputCPUKernelFp16::ReSize() {
auto *dy_tensor = in_tensors_.at(kInputIndex);
MS_ASSERT(dy_tensor != nullptr);
auto *weight_tensor = in_tensors_.at(kWeightIndex);
MS_ASSERT(weight_tensor != nullptr);
auto *dx_tensor = out_tensors_.at(kOutputIndex);
MS_ASSERT(dx_tensor != nullptr);
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter_);
conv_param->output_batch_ = dx_tensor->shape()[(kNHWC_N)];
conv_param->input_batch_ = dy_tensor->shape()[(kNHWC_N)];
conv_param->input_h_ = dx_tensor->shape()[(kNHWC_H)];
conv_param->input_w_ = dx_tensor->shape()[(kNHWC_W)];
// assume OutCh|kh|kw|In
conv_param->input_channel_ = dx_tensor->shape()[(kNHWC_C)];
conv_param->output_channel_ = weight_tensor->shape()[(kNHWC_N)];
conv_param->output_h_ = dy_tensor->shape()[kNHWC_H];
conv_param->output_w_ = dy_tensor->shape()[kNHWC_W];
ws_size_ = chunk_ * conv_param->kernel_h_ * conv_param->kernel_w_ * conv_param->input_channel_ / conv_param->group_;
int n = conv_param->kernel_w_ * conv_param->kernel_h_ * conv_param->input_channel_ / conv_param->group_;
int k = conv_param->output_channel_ / conv_param->group_;
int thread_num = context_->thread_num_;
mat_alloc_ = MatSizeTotalFp16(chunk_, n, k, 0);
set_workspace_size((ws_size_ + mat_alloc_) * sizeof(float16_t) * thread_num);
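// im2col can be skipped only for a 1x1, unpadded, unit-stride, unit-dilation, single-group convolution.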
do_img2col_ = !((conv_param->kernel_h_ == 1) && (conv_param->kernel_w_ == 1) && (conv_param->pad_d_ == 0) &&
(conv_param->pad_u_ == 0) && (conv_param->pad_l_ == 0) && (conv_param->pad_r_ == 0) &&
(conv_param->dilation_h_ == 1) && (conv_param->dilation_w_ == 1) && (conv_param->stride_h_ == 1) &&
(conv_param->stride_w_ == 1) && (conv_param->group_ == 1));
return RET_OK;
}
int ConvolutionGradInputCPUKernelFp16::Init() { return ReSize(); }
int ConvolutionGradInputCPUKernelFp16::Execute(int task_id) {
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter_);
auto *input_dy = in_tensors_.at(0);
auto *input_w = in_tensors_.at(1);
auto *out_dx = out_tensors_.at(0);
auto dy_addr = reinterpret_cast<float16_t *>(input_dy->data_c());
auto w_addr = reinterpret_cast<float16_t *>(input_w->data_c());
auto dx_addr = reinterpret_cast<float16_t *>(out_dx->data_c());
int nweights = input_w->ElementsNum();
int in_ch = conv_param->input_channel_;
int in_h = conv_param->input_h_;
int in_w = conv_param->input_w_;
int k_h = conv_param->kernel_h_; // out_dw->shape()[1];
int k_w = conv_param->kernel_w_; // out_dw->shape()[2];
int batch = conv_param->output_batch_;
int out_ch = conv_param->output_channel_;
int groups = conv_param->group_;
int out_h = conv_param->output_h_;
int out_w = conv_param->output_w_;
int thread_num = context_->thread_num_;
int m = out_h * out_w;
int n = k_w * k_h * in_ch / groups;
int k = out_ch / groups;
float16_t *workspace_temp = reinterpret_cast<float16_t *>(workspace()) + task_id * (mat_alloc_ + ws_size_);
float16_t *mat_workspace = workspace_temp + ws_size_;
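// Split the batch dimension across threads; this task handles batches [start, end).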
int stride = UP_DIV(batch, thread_num);
int count = MSMIN(stride, batch - stride * task_id);
count = (count < 0) ? 0 : count;
int start = stride * task_id;
int end = start + count;
for (int i = start; i < end; ++i) {
for (int j = 0; j < groups; ++j) {
GemmCbFp16 gcb;
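// Walk the output rows in chunks of chunk_; after the first chunk the GEMM callback reuses its cached packed copy of the weight matrix (gcb.mat_b).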
for (int ci = 0; ci < m; ci += chunk_) {
float16_t *mat_b = nullptr;
if (ci == 0) {
mat_b = w_addr + j * nweights / groups;
gcb.ca = 0;
gcb.cb = 0;
gcb.bias = nullptr;
gcb.atype = ActType_No;
} else {
mat_b = gcb.mat_b;
gcb.cb = 1;
}
int real_chunk = MSMIN(m - ci, chunk_);
float16_t *mat_a = dy_addr + (i * groups) * m * k + j * (out_ch / groups) + ci * out_ch;
if (do_img2col_) {
float16_t *mat_c = workspace_temp;
GemmMatmulPlusFp16(0, 0, real_chunk, n, k, 1, mat_a, out_ch, mat_b, n, 0, mat_c, n, mat_workspace, &gcb);
RollingCol2ImPackUnitFp16(mat_c,
dx_addr + (i * groups) * (in_ch / groups) * in_h * in_w + j * (in_ch / groups),
conv_param, real_chunk, ci);
} else {
float16_t *mat_c =
dx_addr + (i * groups) * (in_ch / groups) * in_h * in_w + j * (in_ch / groups) + ci * (in_ch / groups);
GemmMatmulPlusFp16(0, 0, real_chunk, n, k, 1, mat_a, out_ch, mat_b, n, 0, mat_c, n, mat_workspace, &gcb);
}
}
}
}
return RET_OK;
}
int ConvolutionGradInputFp16Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
MS_ASSERT(cdata != nullptr);
auto convinput_kernel = reinterpret_cast<ConvolutionGradInputCPUKernelFp16 *>(cdata);
auto error_code = convinput_kernel->Execute(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "conv input error task_id[" << task_id << "] error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
int ConvolutionGradInputCPUKernelFp16::Run() {
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter_);
int batch = conv_param->output_batch_;
int in_ch = conv_param->input_channel_;
int in_h = conv_param->input_h_;
int in_w = conv_param->input_w_;
auto *out_dx = out_tensors_.at(0);
auto dx_addr = reinterpret_cast<float16_t *>(out_dx->data_c());
memset(dx_addr, 0, sizeof(float16_t) * batch * in_ch * in_h * in_w);
int error_code = static_cast<const lite::InnerContext *>(this->context_)
->thread_pool_->ParallelLaunch(ConvolutionGradInputFp16Run, this, context_->thread_num_);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "bias function error error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Conv2DBackpropInputFusion,
LiteKernelCreator<ConvolutionGradInputCPUKernelFp16>)
} // namespace mindspore::kernel

View File

@ -0,0 +1,44 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_CONVOLUTION_FP16_GRAD_INPUT_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_CONVOLUTION_FP16_GRAD_INPUT_H_
#include <vector>
#include "src/inner_kernel.h"
namespace mindspore::kernel {
class ConvolutionGradInputCPUKernelFp16 : public InnerKernel {
public:
explicit ConvolutionGradInputCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: InnerKernel(parameter, inputs, outputs, ctx) {}
~ConvolutionGradInputCPUKernelFp16() override {}
int Init() override;
int ReSize() override;
int Run() override;
int Execute(int task_id);
private:
size_t ws_size_ = 0;
size_t mat_alloc_ = 0;
bool do_img2col_ = true;
const int chunk_ = C12NUM;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_CONVOLUTION_FP16_GRAD_INPUT_H_

View File

@ -0,0 +1,95 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <random>
#include "src/runtime/kernel/arm/fp16_grad/dropout_fp16_grad.h"
#include "nnacl/fp16_grad/dropout_grad.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
#include "nnacl/fp32_grad/dropout_parameter.h"
using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_NULL_PTR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_DropoutGrad;
namespace mindspore::kernel {
int DropoutGradCPUKernelFp16::Init() {
auto param = reinterpret_cast<DropoutParameter *>(op_parameter_);
if (param == nullptr) {
MS_LOG(ERROR) << "Dropout op_parameter_ nullptr";
return RET_NULL_PTR;
}
auto ratio = param->ratio_;
if ((ratio > 1.0f) || (ratio < 0.0f)) {
MS_LOG(ERROR) << "unsupported ratio value - Dropout ratio should be between zero to one";
return RET_ERROR;
}
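// Inverted dropout: the forward pass scales kept units by 1/(1-ratio), so the gradient applies the same scale.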
if (ratio >= 1.0f) {
scale_ = 1.0f;
} else {
scale_ = 1.0f / (1.0f - ratio);
}
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
}
int DropoutGradCPUKernelFp16::ReSize() { return RET_OK; }
int DropoutGradCPUKernelFp16::Execute(int task_id) {
auto yt_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(kInputIndex)->data_c());
auto mask_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c());
auto output_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(kOutputIndex)->data_c());
auto length = in_tensors_.at(kInputIndex)->ElementsNum();
int stride = UP_DIV(length, thread_count_);
int count = MSMIN(stride, length - stride * task_id);
if (count > 0) {
int start = stride * task_id;
DropoutFp16Grad(&(yt_ptr[start]), &(mask_ptr[start]), &(output_ptr[start]), count, (float16_t)scale_);
}
return RET_OK;
}
int RunDropoutFp16Grad(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto dropout = reinterpret_cast<DropoutGradCPUKernelFp16 *>(cdata);
auto error_code = dropout->Execute(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "Dropout Grad Run error task_id[" << task_id << "] error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
int DropoutGradCPUKernelFp16::Run() {
int error_code = static_cast<const lite::InnerContext *>(this->context_)
->thread_pool_->ParallelLaunch(RunDropoutFp16Grad, this, thread_count_);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "Dropout Grad function error error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_DropoutGrad, LiteKernelCreator<DropoutGradCPUKernelFp16>)
} // namespace mindspore::kernel

View File

@ -0,0 +1,43 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_DROPOUT_FP16_GRAD_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_DROPOUT_FP16_GRAD_H_
#include <vector>
#include "src/inner_kernel.h"
namespace mindspore::kernel {
class DropoutGradCPUKernelFp16 : public InnerKernel {
public:
DropoutGradCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: InnerKernel(parameter, inputs, outputs, ctx), thread_count_(ctx->thread_num_) {}
~DropoutGradCPUKernelFp16() override = default;
int Init() override;
int ReSize() override;
int Run() override;
int Execute(int task_id);
private:
float scale_ = 1.0f;
int thread_count_ = 1;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_DROPOUT_FP16_GRAD_H_

View File

@ -0,0 +1,109 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp16_grad/layernorm_fp16_grad.h"
#include <vector>
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "nnacl/fp16_grad/layernorm_grad.h"
#include "nnacl/fp32_grad/layernormgrad_parameter.h"
#include "include/errorcode.h"
using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_LayerNormGrad;
namespace mindspore::kernel {
int LayerNormGradCPUKernelFp16::ReSize() { return RET_OK; }
int LayerNormGradCPUKernelFp16::Init() {
auto lngrad_param = reinterpret_cast<LayerNormGradParameter *>(op_parameter_);
auto *input_x = in_tensors_.at(0);
std::vector<int> x_shape = input_x->shape();
int begin_norm_axis = lngrad_param->begin_norm_axis_;
if (begin_norm_axis < 0) {
begin_norm_axis += x_shape.size();
}
auto begin_params_axis = lngrad_param->begin_params_axis_;
if (begin_params_axis < 0) {
begin_params_axis += x_shape.size();
}
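// Axes before begin_norm_axis form independent normalization blocks; axes from begin_params_axis on hold gamma/beta.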
for (size_t i = 0; i < static_cast<size_t>(begin_norm_axis); i++) {
block_num_ *= x_shape[i];
}
for (size_t i = static_cast<size_t>(begin_norm_axis); i < x_shape.size(); i++) {
block_size_ *= x_shape[i];
}
for (size_t i = 0; i < static_cast<size_t>(begin_params_axis); i++) {
param_size_ *= x_shape[i];
}
for (size_t i = static_cast<size_t>(begin_params_axis); i < x_shape.size(); i++) {
param_num_ *= x_shape[i];
}
if (block_num_ <= 0 || block_size_ <= 0) {
MS_LOG(ERROR) << "LayerNormGradCPUKernelFp16 input shape error, input shape: " << x_shape;
return RET_ERROR;
}
return RET_OK;
}
int LayerNormGradCPUKernelFp16::Execute(int task_id) {
auto input_x = in_tensors_.at(0);
auto input_dy = in_tensors_.at(1);
auto input_var = in_tensors_.at(2);
auto input_mean = in_tensors_.at(3);
auto input_gamma = in_tensors_.at(4);
auto output_dx = out_tensors_.at(0);
auto output_dg = out_tensors_.at(1);
auto output_db = out_tensors_.at(2);
float16_t *x = reinterpret_cast<float16_t *>(input_x->data_c());
float16_t *dy = reinterpret_cast<float16_t *>(input_dy->data_c());
float16_t *var = reinterpret_cast<float16_t *>(input_var->data_c());
float16_t *mean = reinterpret_cast<float16_t *>(input_mean->data_c());
float16_t *gamma = reinterpret_cast<float16_t *>(input_gamma->data_c());
float16_t *dx = reinterpret_cast<float16_t *>(output_dx->data_c());
float16_t *dg = reinterpret_cast<float16_t *>(output_dg->data_c());
float16_t *db = reinterpret_cast<float16_t *>(output_db->data_c());
LayerNormFp16Grad(x, dy, var, mean, gamma, param_num_, param_size_, block_num_, block_size_, dx, dg, db);
return RET_OK;
}
int LayerNormF16GradRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
MS_ASSERT(cdata != nullptr);
auto ln_kernel = reinterpret_cast<LayerNormGradCPUKernelFp16 *>(cdata);
auto error_code = ln_kernel->Execute(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "LayerNormGradRun error task_id[" << task_id << "] error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
int LayerNormGradCPUKernelFp16::Run() {
int error_code =
static_cast<const lite::InnerContext *>(this->context_)->thread_pool_->ParallelLaunch(LayerNormF16GradRun, this, 1);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "LayerNorm function error error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_LayerNormGrad, LiteKernelCreator<LayerNormGradCPUKernelFp16>)
} // namespace mindspore::kernel

View File

@ -0,0 +1,43 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_LAYERNORM_FP16_GRAD_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_LAYERNORM_FP16_GRAD_H_
#include <vector>
#include "src/inner_kernel.h"
namespace mindspore::kernel {
class LayerNormGradCPUKernelFp16 : public InnerKernel {
public:
explicit LayerNormGradCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: InnerKernel(parameter, inputs, outputs, ctx) {}
~LayerNormGradCPUKernelFp16() override {}
int Init() override;
int ReSize() override;
int Run() override;
int Execute(int task_id);
private:
int block_num_ = 1;
int block_size_ = 1;
int param_num_ = 1;
int param_size_ = 1;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_LAYERNORM_FP16_GRAD_H_

View File

@ -0,0 +1,69 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp16_grad/neg_fp16_grad.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
#include "nnacl/fp16/arithmetic_self_fp16.h"
using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_NegGrad;
namespace mindspore::kernel {
namespace {
int NegGradRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
MS_ASSERT(cdata != nullptr);
auto kernel = reinterpret_cast<NegGradCPUKernelFp16 *>(cdata);
MS_ASSERT(kernel != nullptr);
return kernel->DoNegGrad(task_id);
}
} // namespace
int NegGradCPUKernelFp16::Init() { return RET_OK; }
int NegGradCPUKernelFp16::DoNegGrad(int task_id) {
auto dy = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
auto dx = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
int length = in_tensors_.at(0)->ElementsNum();
int stride = UP_DIV(length, thread_count_);
int count = MSMIN(stride, length - stride * task_id);
count = (count < 0) ? 0 : count;
int start = stride * task_id;
ElementNegativeFp16(dy + start, dx + start, count);
return RET_OK;
}
int NegGradCPUKernelFp16::ReSize() { return RET_OK; }
int NegGradCPUKernelFp16::Run() {
int ret = static_cast<const lite::InnerContext *>(this->context_)
->thread_pool_->ParallelLaunch(NegGradRun, this, thread_count_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "parallel launch fail!ret: " << ret;
return ret;
}
return RET_OK;
}
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_NegGrad, LiteKernelCreator<NegGradCPUKernelFp16>)
} // namespace mindspore::kernel

View File

@ -0,0 +1,42 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_NEG_FP16_GRAD_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_NEG_FP16_GRAD_H_
#include <vector>
#include "src/inner_kernel.h"
#include "schema/model_generated.h"
namespace mindspore::kernel {
class NegGradCPUKernelFp16 : public InnerKernel {
public:
explicit NegGradCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: InnerKernel(parameter, inputs, outputs, ctx), thread_count_(ctx->thread_num_) {}
~NegGradCPUKernelFp16() override {}
int Init() override;
int ReSize() override;
int Run() override;
int DoNegGrad(int thread_id);
private:
int thread_count_;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_NEG_FP16_GRAD_H_

View File

@ -0,0 +1,113 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp16_grad/pooling_fp16_grad.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "nnacl/fp16/pooling_fp16.h"
#include "nnacl/fp16_grad/pooling_grad.h"
#include "include/errorcode.h"
using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_AvgPoolGrad;
using mindspore::schema::PrimitiveType_MaxPoolGrad;
namespace mindspore::kernel {
int PoolingGradCPUKernelFp16::ReSize() {
PoolingParameter *pool_param = reinterpret_cast<PoolingParameter *>(op_parameter_);
auto in_shape = in_tensors_.at(0)->shape();
auto out_shape = in_tensors_.at(1)->shape();
if (pool_param->pool_mode_ == PoolMode_AvgPool) {
out_shape = in_tensors_.at(2)->shape();
}
int input_h = in_shape.at(1);
int input_w = in_shape.at(2);
if (pool_param->global_) {
pool_param->window_w_ = input_w;
pool_param->window_h_ = input_h;
}
pool_param->input_h_ = in_shape[kNHWC_H];
pool_param->input_w_ = in_shape[kNHWC_W];
pool_param->input_batch_ = in_shape[kNHWC_N];
pool_param->input_channel_ = in_shape[kNHWC_C];
pool_param->output_h_ = out_shape[kNHWC_H];
pool_param->output_w_ = out_shape[kNHWC_W];
pool_param->output_batch_ = out_shape[kNHWC_N];
pool_param->output_channel_ = out_shape[kNHWC_C];
return RET_OK;
}
int PoolingGradCPUKernelFp16::Init() { return ReSize(); }
int PoolingGradCPUKernelFp16::Execute(int task_id) {
PoolingParameter *pool_param = reinterpret_cast<PoolingParameter *>(op_parameter_);
auto input_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
auto output_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
int stride = UP_DIV(pool_param->output_batch_, thread_num_);
int count = MSMIN(stride, pool_param->output_batch_ - stride * task_id);
if (count > 0) {
int in_batch_size = pool_param->input_h_ * pool_param->input_w_ * pool_param->input_channel_;
int out_batch_size = pool_param->output_h_ * pool_param->output_w_ * pool_param->input_channel_;
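// The gradient is scattered/accumulated into dx, so this task's slice must be zeroed first.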
std::fill(output_ptr + task_id * stride * in_batch_size, output_ptr + ((task_id * stride) + count) * in_batch_size,
0.f);
if (pool_param->pool_mode_ == PoolMode_MaxPool) {
auto dy_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(2)->data_c());
MaxPoolingFp16Grad(input_ptr + task_id * stride * in_batch_size, dy_ptr + task_id * stride * out_batch_size,
output_ptr + task_id * stride * in_batch_size, count, pool_param);
} else {
input_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(2)->data_c());
AvgPoolingFp16Grad(input_ptr + task_id * stride * out_batch_size, output_ptr + task_id * stride * in_batch_size,
count, pool_param);
}
}
return RET_OK;
}
int PoolingFp16GradImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
MS_ASSERT(cdata != nullptr);
auto pooling = reinterpret_cast<PoolingGradCPUKernelFp16 *>(cdata);
auto error_code = pooling->Execute(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "Pooling Run error task_id[" << task_id << "] error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
int PoolingGradCPUKernelFp16::Run() {
thread_num_ = context_->thread_num_;
int error_code = static_cast<const lite::InnerContext *>(this->context_)
->thread_pool_->ParallelLaunch(PoolingFp16GradImpl, this, thread_num_);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "pooling error error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_AvgPoolGrad, LiteKernelCreator<PoolingGradCPUKernelFp16>)
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_MaxPoolGrad, LiteKernelCreator<PoolingGradCPUKernelFp16>)
} // namespace mindspore::kernel

View File

@ -0,0 +1,47 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_POOLING_FP16_GRAD_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_POOLING_FP16_GRAD_H_
#include <vector>
#include "src/inner_kernel.h"
namespace mindspore::kernel {
using mindspore::schema::PadMode;
using mindspore::schema::PoolMode;
using mindspore::schema::QuantType;
using mindspore::schema::RoundMode;
class PoolingGradCPUKernelFp16 : public InnerKernel {
public:
explicit PoolingGradCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: InnerKernel(parameter, inputs, outputs, ctx) {}
~PoolingGradCPUKernelFp16() override = default;
int Init() override;
int ReSize() override;
int Run() override;
int Execute(int task_id);
private:
int thread_num_ = 1;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_POOLING_FP16_GRAD_H_

View File

@ -0,0 +1,104 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <vector>
#include "src/runtime/kernel/arm/fp16_grad/resize_fp16_grad.h"
#include "nnacl/fp16_grad/resize_grad.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_ResizeGrad;
namespace mindspore::kernel {
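// Maps output coordinates back to input coordinates; with align_corners the corner pixels of both grids coincide.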
float16_t ScalingFp16(size_t in_size, size_t out_size, bool align_corners) {
return (align_corners && out_size > 1) ? (in_size - 1) / static_cast<float16_t>(out_size - 1)
: in_size / static_cast<float16_t>(out_size);
}
int ResizeGradCPUKernelFp16::ReSize() {
auto param = reinterpret_cast<ResizeFp16GradParameter *>(op_parameter_);
if (param == nullptr) {
MS_LOG(ERROR) << "ResizeGradCPUKernelFp16 op_parameter_ is nullptr";
return RET_ERROR;
}
bool align_corners = param->align_corners_;
param->in_height_ = static_cast<size_t>(in_tensors_.at(0)->Height());
param->in_width_ = static_cast<size_t>(in_tensors_.at(0)->Width());
param->out_height_ = static_cast<size_t>(out_tensors_.at(0)->Height());
param->out_width_ = static_cast<size_t>(out_tensors_.at(0)->Width());
param->height_scale_ = ScalingFp16(param->out_height_, param->in_height_, align_corners);
param->width_scale_ = ScalingFp16(param->out_width_, param->in_width_, align_corners);
return RET_OK;
}
int ResizeGradCPUKernelFp16::Init() {
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
}
int ResizeGradCPUKernelFp16::Execute(int task_id) {
auto in_addr = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
auto out_addr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
auto param = reinterpret_cast<ResizeFp16GradParameter *>(op_parameter_);
if (param == nullptr) {
MS_LOG(ERROR) << "ResizeGradCPUKernelFp16 op_parameter_ is nullptr";
return RET_ERROR;
}
auto batch_size = in_tensors_.at(0)->Batch();
auto channel = in_tensors_.at(0)->Channel();
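// Dispatch on the resize method recorded by the forward op: nearest-neighbor or bilinear.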
if (param->method == static_cast<int>(schema::ResizeMethod_NEAREST)) {
ResizeNearestNeighborFp16Grad(in_addr, out_addr, batch_size, channel, in_tensors_.at(0)->format(), param);
} else {
ResizeBiLinearFp16Grad(in_addr, out_addr, batch_size, channel, in_tensors_.at(0)->format(), param);
}
return RET_OK;
}
int ResizeFp16GradRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto resize_grad_kernel = reinterpret_cast<ResizeGradCPUKernelFp16 *>(cdata);
auto error_code = resize_grad_kernel->Execute(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "resize grad error task_id[" << task_id << "] error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
int ResizeGradCPUKernelFp16::Run() {
auto out_addr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
size_t elem_number = out_tensors_.at(0)->ElementsNum();
std::fill(out_addr, out_addr + elem_number, 0.f);
int error_code =
static_cast<const lite::InnerContext *>(this->context_)->thread_pool_->ParallelLaunch(ResizeFp16GradRun, this, 1);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "ResizeGradCPUKernelFp16 function error error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_ResizeGrad, LiteKernelCreator<ResizeGradCPUKernelFp16>)
} // namespace mindspore::kernel

View File

@ -0,0 +1,38 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_RESIZE_FP16_GRAD_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_RESIZE_FP16_GRAD_H_
#include <vector>
#include "src/inner_kernel.h"
namespace mindspore::kernel {
class ResizeGradCPUKernelFp16 : public InnerKernel {
public:
explicit ResizeGradCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: InnerKernel(parameter, inputs, outputs, ctx) {}
~ResizeGradCPUKernelFp16() override = default;
int Init() override;
int ReSize() override;
int Run() override;
int ExecuteInit(int task_id);
int Execute(int task_id);
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_RESIZE_FP16_GRAD_H_

View File

@ -0,0 +1,154 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp16_grad/strided_slice_fp16_grad.h"
#include <vector>
#include <algorithm>
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "nnacl/fp16_grad/strided_slice_grad.h"
#include "src/ops/populate/strided_slice_populate.h"
#include "include/errorcode.h"
using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_StridedSliceGrad;
namespace mindspore::kernel {
int StridedSliceGradCPUKernelFp16::Init() {
if (!InferShapeDone()) {
return RET_OK;
}
param_ = reinterpret_cast<StridedSliceParameter *>(op_parameter_);
auto input = in_tensors_.at(0);
MS_ASSERT(input);
switch (input->data_type()) {
case kNumberTypeFloat16:
param_->data_type = kDataTypeFloat16;
break;
default:
MS_LOG(ERROR) << "Not supported data type: " << input->data_type();
return RET_ERROR;
}
FillEmptyDims();
FillOutputDim();
return ReSize();
}
void StridedSliceGradCPUKernelFp16::FillEmptyDims() {
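// Expand the slice description to a full 7-D form so a single kernel handles every input rank.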
int32_t begins[DIMENSION_7D];
int32_t ends[DIMENSION_7D];
int32_t strides[DIMENSION_7D];
int32_t input_shape[DIMENSION_7D];
for (int32_t i = 0; i < param_->num_axes_; ++i) {
begins[i] = param_->begins_[i];
ends[i] = MSMIN(param_->ends_[i], param_->in_shape_[i]);
strides[i] = param_->strides_[i];
input_shape[i] = param_->in_shape_[i];
}
for (int32_t i = param_->num_axes_; i < param_->in_shape_length_; ++i) {
input_shape[i] = param_->in_shape_[i];
begins[i] = 0;
ends[i] = param_->in_shape_[i];
strides[i] = 1;
}
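// Right-align the real axes inside the 7-D arrays and pad the leading axes with size-1 dims.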
int32_t real_index = param_->in_shape_length_ - 1;
for (int32_t i = DIMENSION_7D - 1; i >= 0; --i) {
if (real_index >= 0) {
param_->begins_[i] = begins[real_index];
param_->ends_[i] = ends[real_index];
param_->strides_[i] = strides[real_index];
param_->in_shape_[i] = input_shape[real_index--];
} else {
param_->begins_[i] = 0;
param_->ends_[i] = 1;
param_->strides_[i] = 1;
param_->in_shape_[i] = 1;
}
}
param_->num_axes_ = DIMENSION_7D;
param_->in_shape_length_ = DIMENSION_7D;
for (int32_t i = 0; i < DIMENSION_7D; ++i) {
if (param_->begins_[i] < 0) {
param_->begins_[i] += param_->in_shape_[i];
}
if (param_->ends_[i] < 0) {
param_->ends_[i] += param_->in_shape_[i];
}
}
}
void StridedSliceGradCPUKernelFp16::FillOutputDim() {
auto output = out_tensors_.at(0);
size_t out_size = output->shape().size();
for (size_t i = 0; i < DIMENSION_7D; i++) {
if (i < out_size) {
output_shape_.push_back(output->shape()[i]);
} else {
output_shape_.insert(output_shape_.begin(), 1);
}
}
}
int StridedSliceGradCPUKernelFp16::ReSize() { return RET_OK; }
int StridedSliceFp16GradImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
MS_ASSERT(cdata != nullptr);
auto slice = reinterpret_cast<StridedSliceGradCPUKernelFp16 *>(cdata);
auto error_code = slice->Execute(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "StridedSliceGrad Run error task_id[" << task_id << "] error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
int StridedSliceGradCPUKernelFp16::Run() {
int error_code = static_cast<const lite::InnerContext *>(this->context_)
->thread_pool_->ParallelLaunch(StridedSliceFp16GradImpl, this, 1);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "Strided slice error error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
int StridedSliceGradCPUKernelFp16::Execute(int task_id) {
auto input = in_tensors_.at(0);
auto output = out_tensors_.at(0);
MS_ASSERT(output);
int *po = output_shape_.data();
auto dx = reinterpret_cast<float16_t *>(output->data_c());
auto dy = reinterpret_cast<float16_t *>(input->data_c());
std::fill(dx, dx + output->ElementsNum(), 0.f);
auto ret = DoStridedSliceFp16Grad(dy, dx, po, param_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "StridedSliceGrad error error_code[" << ret << "]";
return RET_ERROR;
}
return RET_OK;
}
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_StridedSliceGrad, LiteKernelCreator<StridedSliceGradCPUKernelFp16>)
} // namespace mindspore::kernel

View File

@ -0,0 +1,49 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_STRIDED_SLICE_FP16_GRAD_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_STRIDED_SLICE_FP16_GRAD_H_
#include <vector>
#include "nnacl/fp16_grad/strided_slice_grad.h"
#include "src/lite_kernel.h"
namespace mindspore::kernel {
class StridedSliceGradCPUKernelFp16 : public InnerKernel {
public:
StridedSliceGradCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: InnerKernel(parameter, inputs, outputs, ctx) {
param_ = reinterpret_cast<StridedSliceParameter *>(parameter);
}
~StridedSliceGradCPUKernelFp16() override = default;
int Init() override;
int ReSize() override;
int Run() override;
int Execute(int task_id);
private:
void FillEmptyDims();
void FillOutputDim();
void ParseMasks();
StridedSliceParameter *param_;
std::vector<int> output_shape_;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_STRIDED_SLICE_FP16_GRAD_H_

View File

@ -0,0 +1,98 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp16_grad/unsorted_segment_sum_fp16.h"
#include <vector>
#include <algorithm>
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "nnacl/fp16_grad/unsorted_segment_sum.h"
#include "include/errorcode.h"
using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_UnsortedSegmentSum;
namespace mindspore::kernel {
int UnsortedSegmentSumCPUKernelFp16::Init() {
if (!InferShapeDone()) {
return RET_OK;
}
auto input_shape = in_tensors_.at(0)->shape();
auto segment_ids_shape = in_tensors_.at(1)->shape();
auto output_shape = out_tensors_.at(0)->shape();
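// unit_num_: total input elements; input_dim1_: elements per segment id (inner dims not covered by segment_ids).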
unit_num_ = 1;
input_dim1_ = 1;
for (size_t i = 0; i < input_shape.size(); ++i) {
unit_num_ *= input_shape[i];
if (i >= segment_ids_shape.size()) {
input_dim1_ *= input_shape[i];
}
}
output_dim0_ = output_shape[0];
output_dim1_ = 1;
for (size_t j = 1; j < output_shape.size(); j++) {
output_dim1_ *= output_shape[j];
}
return RET_OK;
}
int UnsortedSegmentSumCPUKernelFp16::ReSize() { return RET_OK; }
int UnsortedSegmentSumFp16Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
MS_ASSERT(cdata != nullptr);
auto kernel = reinterpret_cast<UnsortedSegmentSumCPUKernelFp16 *>(cdata);
auto error_code = kernel->Execute(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "UnsortedSegmentSum Run error task_id[" << task_id << "] error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
int UnsortedSegmentSumCPUKernelFp16::Run() {
int error_code = static_cast<const lite::InnerContext *>(this->context_)
->thread_pool_->ParallelLaunch(UnsortedSegmentSumFp16Run, this, 1);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "Strided slice error error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
int UnsortedSegmentSumCPUKernelFp16::Execute(int task_id) {
auto input_tensor = in_tensors_.at(0);
auto indices_tensor = in_tensors_.at(1);
auto output_tensor = out_tensors_.at(0);
float16_t *input = reinterpret_cast<float16_t *>(input_tensor->data_c());
int *indices = reinterpret_cast<int *>(indices_tensor->data_c());
float16_t *output = reinterpret_cast<float16_t *>(output_tensor->data_c());
std::fill(output, output + output_tensor->ElementsNum(), 0.f);
auto ret = UnsortedSegmentSumFp16(input, unit_num_, input_dim1_, indices, output, output_dim0_, output_dim1_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "UnsortedSegmentSumFp16 error error_code[" << ret << "]";
return RET_ERROR;
}
return RET_OK;
}
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_UnsortedSegmentSum,
LiteKernelCreator<UnsortedSegmentSumCPUKernelFp16>)
} // namespace mindspore::kernel

View File

@ -0,0 +1,44 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_UNSORTED_SEGMENT_SUM_FP16_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_UNSORTED_SEGMENT_SUM_FP16_H_
#include <vector>
#include "src/lite_kernel.h"
namespace mindspore::kernel {
class UnsortedSegmentSumCPUKernelFp16 : public InnerKernel {
public:
UnsortedSegmentSumCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: InnerKernel(parameter, inputs, outputs, ctx) {}
~UnsortedSegmentSumCPUKernelFp16() override = default;
int Init() override;
int ReSize() override;
int Run() override;
int Execute(int task_id);
private:
size_t unit_num_ = 0;
size_t input_dim1_ = 0;
size_t output_dim0_ = 0;
size_t output_dim1_ = 0;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_UNSORTED_SEGMENT_SUM_FP16_H_

View File

@ -58,7 +58,7 @@ TEST_F(TestActGradFp16, ReluGradFp16) {
input_buf[i] = (float16_t)input_data[i];
}
Fp16ReluGrad(yt_buf, input_buf, 50, output_buf);
ReluFp16Grad(yt_buf, input_buf, 50, output_buf);
int res = 0;
float error = 0;
@ -113,7 +113,7 @@ TEST_F(TestActGradFp16, SigmoidGradFp16) {
input_buf[i] = (float16_t)input_data[i];
}
Fp16SigmoidGrad(yt_buf, input_buf, 50, output_buf);
SigmoidFp16Grad(yt_buf, input_buf, 50, output_buf);
int res = 0;
float error = 0;