From 3499218aa16bceee06fa1a8945dc56aebe9d42af Mon Sep 17 00:00:00 2001
From: zhaozhenlong <zhaozhenlong1@huawei.com>
Date: Mon, 19 Apr 2021 11:11:39 +0800
Subject: [PATCH] add op softplus cumsum

---
 .../cpu/nnacl/cumsum_parameter.h              |  29 ++
 .../cpu/nnacl/fp32/activation_fp32.c          |   9 +
 .../cpu/nnacl/fp32/activation_fp32.h          |   1 +
 .../cpu/nnacl/fp32/cumsum_fp32.c              | 245 +++++++++++
 .../cpu/nnacl/fp32/cumsum_fp32.h              |  32 ++
 .../cpu/nnacl/infer/cumsum_infer.c            |  40 ++
 .../cpu/nnacl/infer/cumsum_infer.h            |  31 ++
 .../cpu/nnacl/infer/infer_register.h          |   3 +-
 mindspore/core/ops/cumsum.cc                  |  59 +++
 mindspore/core/ops/cumsum.h                   |  46 +++
 mindspore/core/ops/op_utils.h                 |   2 +
 mindspore/lite/schema/ops.fbs                 |   6 +
 mindspore/lite/src/ops/ops_def.cc             |   6 +
 mindspore/lite/src/ops/ops_func_declare.h     |   2 +
 mindspore/lite/src/ops/ops_utils.cc           |   6 +
 .../lite/src/ops/populate/cumsum_populate.cc  |  41 ++
 .../kernel/arm/fp32/activation_fp32.cc        |   6 +-
 .../runtime/kernel/arm/fp32/cumsum_fp32.cc    | 152 +++++++
 .../src/runtime/kernel/arm/fp32/cumsum_fp32.h |  48 +++
 .../test/ut/nnacl/infer/cumsum_infer_test.cc  |  63 +++
 .../kernel/arm/fp32/activation_fp32_test.cc   |  46 ++-
 .../runtime/kernel/arm/fp32/cumsum_tests.cc   | 384 ++++++++++++++++++
 22 files changed, 1254 insertions(+), 3 deletions(-)
 create mode 100644 mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/cumsum_parameter.h
 create mode 100644 mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/cumsum_fp32.c
 create mode 100644 mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/cumsum_fp32.h
 create mode 100644 mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/cumsum_infer.c
 create mode 100644 mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/cumsum_infer.h
 create mode 100644 mindspore/core/ops/cumsum.cc
 create mode 100644 mindspore/core/ops/cumsum.h
 create mode 100644 mindspore/lite/src/ops/populate/cumsum_populate.cc
 create mode 100644 mindspore/lite/src/runtime/kernel/arm/fp32/cumsum_fp32.cc
 create mode 100644 mindspore/lite/src/runtime/kernel/arm/fp32/cumsum_fp32.h
 create mode 100644 mindspore/lite/test/ut/nnacl/infer/cumsum_infer_test.cc
 create mode 100644 mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/cumsum_tests.cc

diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/cumsum_parameter.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/cumsum_parameter.h
new file mode 100644
index 00000000000..767979f7239
--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/cumsum_parameter.h
@@ -0,0 +1,29 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_NNACL_CUMSUM_PARAMETER_H_
+#define MINDSPORE_NNACL_CUMSUM_PARAMETER_H_
+
+#include "nnacl/op_base.h"
+
+typedef struct CumSumParameter {  // NOTE: struct tag is CumSumParameter, the typedef alias is CumsumParameter
+  OpParameter op_parameter_;  // common operator header; the populate code writes op_parameter_.type_
+  bool reverse_;              // copied from the CumSum primitive's `reverse` attribute
+  bool exclusive_;            // copied from the CumSum primitive's `exclusive` attribute
+  int axis_;                  // left 0 by the populate function's memset; TODO confirm who sets it
+} CumsumParameter;
+
+#endif  // MINDSPORE_NNACL_CUMSUM_PARAMETER_H_
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/activation_fp32.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/activation_fp32.c
index 6df6935e22e..cb1be6d2209 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/activation_fp32.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/activation_fp32.c
@@ -261,3 +261,12 @@ int Gelu(const float *src, int length, float *dst, bool approximate) {
   }
   return NNACL_OK;
 }
+
+int Softplus(const float *src, int length, float *dst) {
+  // softplus(x) = log(1 + exp(x)); single_exp presumably stores exp(x) in dst, which log1p then overwrites.
+  for (int idx = 0; idx < length; ++idx) {
+    single_exp(src[idx], &dst[idx]);
+    dst[idx] = log1p(dst[idx]);
+  }
+  return NNACL_OK;
+}
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/activation_fp32.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/activation_fp32.h
index 3a3b4183d3c..f3e32687de0 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/activation_fp32.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/activation_fp32.h
@@ -41,6 +41,7 @@ int Swish(const float *src, int length, float *dst);
 int HSwish(const float *src, int length, float *dst);
 int HardTanh(const float *src, int length, float *dst, float min_val, float max_val);
 int Gelu(const float *src, int length, float *dst, bool approximate);
+int Softplus(const float *src, int length, float *dst);
 
 float TanhOpt(float src);
 #ifdef __cplusplus
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/cumsum_fp32.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/cumsum_fp32.c
new file mode 100644
index 00000000000..55c2e8a8b74
--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/cumsum_fp32.c
@@ -0,0 +1,245 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nnacl/fp32/cumsum_fp32.h"
+#include "nnacl/op_base.h"
+
+// Forward cumulative sum over the middle axis of a buffer indexed as [out_dim][axis_dim][inner_dim].
+// (a, b, c) -> (a, a+b, a+b+c) when exclusive == false; (a, b, c) -> (0, a, a+b) when exclusive == true.
+void Cumsum(const float *input, float *output, int out_dim, int axis_dim, int inner_dim, bool exclusive) {
+  // Pass 1: seed the first slice along the axis of every outer block.
+  // Non-exclusive copies the matching input slice; exclusive fills it with 0.0f.
+  if (!exclusive) {
+    for (int i = 0; i < out_dim; ++i) {
+      const float *layer_input = input + i * axis_dim * inner_dim;
+      float *layer_output = output + i * axis_dim * inner_dim;
+      int j = 0;
+#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
+      for (; j <= inner_dim - C4NUM; j += C4NUM) {  // C4NUM elements per vector load/store
+        MS_FLOAT32X4 val = MS_LDQ_F32(layer_input + j);
+        MS_STQ_F32(layer_output + j, val);
+      }
+#endif
+      for (; j < inner_dim; ++j) {  // scalar tail, or the whole slice when SIMD is disabled
+        *(layer_output + j) = *(layer_input + j);
+      }
+    }
+  } else {
+    for (int i = 0; i < out_dim; ++i) {
+      float *layer_output = output + i * axis_dim * inner_dim;
+      int j = 0;
+#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
+      for (; j <= inner_dim - C4NUM; j += C4NUM) {
+        MS_FLOAT32X4 zero_val = MS_MOVQ_F32(0.0f);
+        MS_STQ_F32(layer_output + j, zero_val);
+      }
+#endif
+      for (; j < inner_dim; ++j) {  // scalar tail, or the whole slice when SIMD is disabled
+        *(layer_output + j) = 0.0f;
+      }
+    }
+  }
+  int input_offset = exclusive ? 0 : 1;  // exclusive adds input slice j-1, non-exclusive adds input slice j
+  for (int i = 0; i < out_dim; ++i) {    // Pass 2: output slice j = input slice + output slice j-1
+    const float *layer_input = input + i * axis_dim * inner_dim + inner_dim * input_offset;
+    float *layer_last_output = output + i * axis_dim * inner_dim;  // slice written in the previous step
+    float *layer_output = layer_last_output + inner_dim;           // slice being written now
+
+    for (int j = 1; j < axis_dim; ++j) {  // slice 0 was seeded in pass 1
+      int k = 0;
+#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
+      for (; k <= inner_dim - C4NUM; k += C4NUM) {
+        MS_FLOAT32X4 input_val = MS_LDQ_F32(layer_input + k);
+        MS_FLOAT32X4 last_output_val = MS_LDQ_F32(layer_last_output + k);
+        MS_FLOAT32X4 out_val = MS_ADDQ_F32(input_val, last_output_val);
+        MS_STQ_F32(layer_output + k, out_val);
+      }
+#endif
+      for (; k < inner_dim; ++k) {
+        // layer_output(i, j, k) = layer_input(at the offset slice, k) + layer_last_output(i, j-1, k)
+        *(layer_output + k) = *(layer_input + k) + *(layer_last_output + k);
+      }
+      layer_input += inner_dim;  // advance all three cursors by one slice
+      layer_last_output += inner_dim;
+      layer_output += inner_dim;
+    }
+  }
+}
+
+// Reverse cumulative sum over the middle axis of a buffer indexed as [out_dim][axis_dim][inner_dim].
+// (a, b, c) -> (c+b+a, c+b, c) when exclusive == false; (a, b, c) -> (c+b, c, 0) when exclusive == true.
+void CumsumReverse(const float *input, float *output, int out_dim, int axis_dim, int inner_dim, bool exclusive) {
+  if (!exclusive) {  // Pass 1: seed the LAST slice along the axis (copy of input, or zeros when exclusive)
+    for (int i = 0; i < out_dim; ++i) {
+      const float *layer_input = input + i * axis_dim * inner_dim + (axis_dim - 1) * inner_dim;
+      float *layer_output = output + i * axis_dim * inner_dim + (axis_dim - 1) * inner_dim;
+      int j = 0;
+#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
+      for (; j <= inner_dim - C4NUM; j += C4NUM) {  // C4NUM elements per vector load/store
+        MS_FLOAT32X4 val = MS_LDQ_F32(layer_input + j);
+        MS_STQ_F32(layer_output + j, val);
+      }
+#endif
+      for (; j < inner_dim; ++j) {  // scalar tail, or the whole slice when SIMD is disabled
+        *(layer_output + j) = *(layer_input + j);
+      }
+    }
+  } else {
+    for (int i = 0; i < out_dim; ++i) {
+      float *layer_output = output + i * axis_dim * inner_dim + (axis_dim - 1) * inner_dim;
+      int j = 0;
+#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
+      for (; j <= inner_dim - C4NUM; j += C4NUM) {
+        MS_FLOAT32X4 zero_val = MS_MOVQ_F32(0.0f);
+        MS_STQ_F32(layer_output + j, zero_val);
+      }
+#endif
+      for (; j < inner_dim; ++j) {  // scalar tail, or the whole slice when SIMD is disabled
+        *(layer_output + j) = 0.0f;
+      }
+    }
+  }
+  int input_offset = exclusive ? 0 : 1;  // non-exclusive starts reading one slice before the last one
+  for (int i = 0; i < out_dim; ++i) {    // Pass 2: walk backwards; every cursor addresses a slice's LAST element
+    const float *layer_input = input + (i + 1) * axis_dim * inner_dim - 1 - input_offset * inner_dim;
+    float *layer_last_output = output + (i + 1) * axis_dim * inner_dim - 1;  // last element of block i
+    float *layer_output = layer_last_output - inner_dim;                     // last element of the slice before
+
+    for (int j = 1; j < axis_dim; ++j) {  // the last slice was seeded in pass 1
+      int k = 0;
+#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
+      for (; k <= inner_dim - C4NUM; k += C4NUM) {  // vector covers [p-k-3, p-k]; the 3 presumably is C4NUM - 1
+        MS_FLOAT32X4 input_val = MS_LDQ_F32(layer_input - k - 3);
+        MS_FLOAT32X4 last_output_val = MS_LDQ_F32(layer_last_output - k - 3);
+        MS_FLOAT32X4 out_val = MS_ADDQ_F32(input_val, last_output_val);
+        MS_STQ_F32(layer_output - k - 3, out_val);
+      }
+#endif
+      for (; k < inner_dim; ++k) {  // scalar tail, indexing downwards from the slice end
+        *(layer_output - k) = *(layer_input - k) + *(layer_last_output - k);
+      }
+      layer_input -= inner_dim;  // step all three cursors back by one slice
+      layer_last_output -= inner_dim;
+      layer_output -= inner_dim;
+    }
+  }
+}
+
+// Forward cumulative sum (int32) over the middle axis of a buffer indexed as [out_dim][axis_dim][inner_dim].
+// (a, b, c) -> (a, a+b, a+b+c) when exclusive == false; (a, b, c) -> (0, a, a+b) when exclusive == true.
+void CumsumInt(const int *input, int *output, int out_dim, int axis_dim, int inner_dim, bool exclusive) {
+  // Pass 1: seed the first slice along the axis of every outer block.
+  // Non-exclusive copies the matching input slice; exclusive fills it with 0.
+  if (!exclusive) {
+    for (int i = 0; i < out_dim; ++i) {
+      const int *layer_input = input + i * axis_dim * inner_dim;
+      int *layer_output = output + i * axis_dim * inner_dim;
+      int j = 0;
+#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
+      for (; j <= inner_dim - C4NUM; j += C4NUM) {  // C4NUM elements per vector load/store
+        MS_INT32X4 val = MS_LDQ_EPI32(layer_input + j);
+        MS_STQ_EPI32(layer_output + j, val);
+      }
+#endif
+      for (; j < inner_dim; ++j) {  // scalar tail, or the whole slice when SIMD is disabled
+        *(layer_output + j) = *(layer_input + j);
+      }
+    }
+  } else {
+    for (int i = 0; i < out_dim; ++i) {
+      int *layer_output = output + i * axis_dim * inner_dim;
+      int j = 0;
+#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
+      for (; j <= inner_dim - C4NUM; j += C4NUM) {
+        MS_INT32X4 zero_val = MS_MOVQ_EPI32(0);
+        MS_STQ_EPI32(layer_output + j, zero_val);
+      }
+#endif
+      for (; j < inner_dim; ++j) {  // index by j: the SIMD loop above may already have advanced it
+        *(layer_output + j) = 0;
+      }
+    }
+  }
+  int input_offset = exclusive ? 0 : 1;  // exclusive adds input slice j-1, non-exclusive adds input slice j
+  for (int i = 0; i < out_dim; ++i) {    // Pass 2: output slice j = input slice + output slice j-1
+    const int *layer_input = input + i * axis_dim * inner_dim + inner_dim * input_offset;
+    int *layer_last_output = output + i * axis_dim * inner_dim;  // slice written in the previous step
+    int *layer_output = layer_last_output + inner_dim;           // slice being written now
+
+    for (int j = 1; j < axis_dim; ++j) {  // slice 0 was seeded in pass 1
+      int k = 0;
+#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
+      for (; k <= inner_dim - C4NUM; k += C4NUM) {
+        MS_INT32X4 input_val = MS_LDQ_EPI32(layer_input + k);
+        MS_INT32X4 last_output_val = MS_LDQ_EPI32(layer_last_output + k);
+        MS_INT32X4 out_val = MS_ADDQ_EPI32(input_val, last_output_val);
+        MS_STQ_EPI32(layer_output + k, out_val);
+      }
+#endif
+      for (; k < inner_dim; ++k) {
+        *(layer_output + k) = *(layer_input + k) + *(layer_last_output + k);
+      }
+      layer_input += inner_dim;  // advance all three cursors by one slice
+      layer_last_output += inner_dim;
+      layer_output += inner_dim;
+    }
+  }
+}
+
+// Reverse cumulative sum (int32) over the middle axis of a buffer indexed as [out_dim][axis_dim][inner_dim].
+// (a, b, c) -> (c+b+a, c+b, c) when exclusive == false; (a, b, c) -> (c+b, c, 0) when exclusive == true.
+void CumsumReverseInt(const int *input, int *output, int out_dim, int axis_dim, int inner_dim, bool exclusive) {
+  if (!exclusive) {  // Pass 1: seed the LAST slice along the axis (copy of input, or zeros when exclusive)
+    for (int i = 0; i < out_dim; ++i) {
+      const int *layer_input = input + i * axis_dim * inner_dim + (axis_dim - 1) * inner_dim;
+      int *layer_output = output + i * axis_dim * inner_dim + (axis_dim - 1) * inner_dim;
+      for (int j = 0; j < inner_dim; ++j) {
+        *(layer_output++) = *(layer_input++);
+      }
+    }
+  } else {
+    for (int i = 0; i < out_dim; ++i) {
+      int *layer_output = output + i * axis_dim * inner_dim + (axis_dim - 1) * inner_dim;
+      for (int j = 0; j < inner_dim; ++j) {
+        *(layer_output++) = 0;  // integer zero: the buffer holds int, not float
+      }
+    }
+  }
+  int input_offset = exclusive ? 0 : 1;  // non-exclusive starts reading one slice before the last one
+  for (int i = 0; i < out_dim; ++i) {    // Pass 2: walk backwards; every cursor addresses a slice's LAST element
+    const int *layer_input = input + (i + 1) * axis_dim * inner_dim - 1 - input_offset * inner_dim;
+    int *layer_last_output = output + (i + 1) * axis_dim * inner_dim - 1;  // last element of block i
+    int *layer_output = layer_last_output - inner_dim;                     // last element of the slice before
+
+    for (int j = 1; j < axis_dim; ++j) {  // the last slice was seeded in pass 1
+      int k = 0;
+#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
+      for (; k <= inner_dim - C4NUM; k += C4NUM) {  // vector covers [p-k-3, p-k]; the 3 presumably is C4NUM - 1
+        MS_INT32X4 input_val = MS_LDQ_EPI32(layer_input - k - 3);
+        MS_INT32X4 last_output_val = MS_LDQ_EPI32(layer_last_output - k - 3);
+        MS_INT32X4 out_val = MS_ADDQ_EPI32(input_val, last_output_val);
+        MS_STQ_EPI32(layer_output - k - 3, out_val);
+      }
+#endif
+      for (; k < inner_dim; ++k) {  // scalar tail, indexing downwards from the slice end
+        *(layer_output - k) = *(layer_input - k) + *(layer_last_output - k);
+      }
+      layer_input -= inner_dim;  // step all three cursors back by one slice
+      layer_last_output -= inner_dim;
+      layer_output -= inner_dim;
+    }
+  }
+}
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/cumsum_fp32.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/cumsum_fp32.h
new file mode 100644
index 00000000000..0f1842b2616
--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/cumsum_fp32.h
@@ -0,0 +1,32 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_NNACL_FP32_CUMSUM_H_
+#define MINDSPORE_NNACL_FP32_CUMSUM_H_
+#include "nnacl/op_base.h"
+#include "nnacl/cumsum_parameter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+void Cumsum(const float *input, float *output, int out_dim, int axis_dim, int inner_dim, bool exclusive);
+void CumsumReverse(const float *input, float *output, int out_dim, int axis_dim, int inner_dim, bool exclusive);
+void CumsumInt(const int *input, int *output, int out_dim, int axis_dim, int inner_dim, bool exclusive);
+void CumsumReverseInt(const int *input, int *output, int out_dim, int axis_dim, int inner_dim, bool exclusive);
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // MINDSPORE_NNACL_FP32_CUMSUM_H_
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/cumsum_infer.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/cumsum_infer.c
new file mode 100644
index 00000000000..ff4d1c61a8f
--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/cumsum_infer.c
@@ -0,0 +1,40 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nnacl/infer/cumsum_infer.h"
+#include "nnacl/infer/infer_register.h"
+
+int CumsumInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
+                     OpParameter *parameter) {  // shape inference for CumSum: output 0 is derived from input 0
+#ifdef Debug
+  int check_ret = CheckAugmentNullOutputSize(inputs, inputs_size, outputs, outputs_size, parameter, 1);
+  if (check_ret != NNACL_OK) {
+    return check_ret;
+  }
+#endif
+  // NOTE(review): without Debug nothing validates the arguments before inputs[0]/outputs[0] are read.
+  const TensorC *input = inputs[0];
+  TensorC *output = outputs[0];
+  SetDataTypeFormat(output, input);  // presumably copies data type and format from input - TODO confirm
+  if (!parameter->infer_flag_) {
+    return NNACL_INFER_INVALID;  // shape is not filled in while infer_flag_ is unset
+  }
+  // SetShapeTensor presumably gives the output the same shape as the input - TODO confirm.
+  SetShapeTensor(output, input);
+  return NNACL_OK;
+}
+
+REG_INFER(Cumsum, PrimType_CumSum, CumsumInferShape)
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/cumsum_infer.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/cumsum_infer.h
new file mode 100644
index 00000000000..7680f3e438b
--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/cumsum_infer.h
@@ -0,0 +1,31 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_NNACL_CUMSUM_INFER_H
+#define MINDSPORE_NNACL_CUMSUM_INFER_H
+
+#include "nnacl/infer/common_infer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int CumsumInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
+                     OpParameter *parameter);
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // MINDSPORE_NNACL_CUMSUM_INFER_H
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/infer_register.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/infer_register.h
index 7000c15c370..c57a02dfa7a 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/infer_register.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/infer_register.h
@@ -216,8 +216,9 @@ enum PrimType {
   PrimType_LogSoftmax = 189,
   PrimType_Call = 190,
   PrimType_Custom = 191,
+  PrimType_CumSum = 192,
   PrimType_MIN = PrimType_NONE,
-  PrimType_MAX = PrimType_Custom + 1
+  PrimType_MAX = PrimType_CumSum + 1
 };
 
 void RegInfer(int prim_type, InferShape func);
diff --git a/mindspore/core/ops/cumsum.cc b/mindspore/core/ops/cumsum.cc
new file mode 100644
index 00000000000..8dc7db24ac9
--- /dev/null
+++ b/mindspore/core/ops/cumsum.cc
@@ -0,0 +1,59 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <memory>
+#include "ops/cumsum.h"
+#include "utils/check_convert_utils.h"
+#include "abstract/primitive_infer_map.h"
+#include "ops/op_utils.h"
+
+namespace mindspore {
+namespace ops {
+void CumSum::Init(const bool exclusive, const bool reverse) {  // stores both flags as primitive attributes
+  this->set_exclusive(exclusive);
+  this->set_reverse(reverse);
+}
+
+void CumSum::set_exclusive(const bool exclusive) { this->AddAttr(kExclusive, MakeValue(exclusive)); }
+
+bool CumSum::get_exclusive() const {  // reads back the attribute written by set_exclusive
+  auto value_ptr = this->GetAttr(kExclusive);
+  return GetValue<bool>(value_ptr);
+}
+
+void CumSum::set_reverse(const bool reverse) { this->AddAttr(kReverse, MakeValue(reverse)); }
+
+bool CumSum::get_reverse() const {  // reads back the attribute written by set_reverse
+  auto value_ptr = this->GetAttr(kReverse);
+  return GetValue<bool>(value_ptr);
+}
+AbstractBasePtr CumSumInfer(const abstract::AnalysisEnginePtr &, const PrimitivePtr &primitive,
+                            const std::vector<AbstractBasePtr> &input_args) {
+  MS_EXCEPTION_IF_NULL(primitive);
+  auto prim_name = primitive->name();
+  CheckAndConvertUtils::CheckInteger("input number", input_args.size(), kEqual, 2, prim_name);
+  for (const auto &item : input_args) {
+    MS_EXCEPTION_IF_NULL(item);
+  }
+  // The result is a tensor with the shape and element type of input 0.
+  auto out_shape = CheckAndConvertUtils::ConvertShapePtrToShapeMap(input_args[0]->BuildShape())[kShape];
+  auto tensor_type = input_args[0]->BuildType()->cast<TensorTypePtr>();
+  MS_EXCEPTION_IF_NULL(tensor_type);  // guard: cast<> is expected to yield nullptr for a non-tensor input
+  return std::make_shared<abstract::AbstractTensor>(tensor_type->element(), out_shape);
+}
+REGISTER_PRIMITIVE_C(kNameCumSum, CumSum);
+}  // namespace ops
+}  // namespace mindspore
diff --git a/mindspore/core/ops/cumsum.h b/mindspore/core/ops/cumsum.h
new file mode 100644
index 00000000000..d458187e3d3
--- /dev/null
+++ b/mindspore/core/ops/cumsum.h
@@ -0,0 +1,46 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CORE_OPS_CUMSUM_H_
+#define MINDSPORE_CORE_OPS_CUMSUM_H_
+#include <vector>
+#include <memory>
+
+#include "ops/primitive_c.h"
+#include "abstract/abstract_value.h"
+#include "utils/check_convert_utils.h"
+
+namespace mindspore {
+namespace ops {
+constexpr auto kNameCumSum = "CumSum";  // name handed to the PrimitiveC base constructor
+class CumSum : public PrimitiveC {      // CumSum primitive exposing `exclusive` and `reverse` flag accessors
+ public:
+  CumSum() : PrimitiveC(kNameCumSum) {}
+  ~CumSum() = default;
+  MS_DECLARE_PARENT(CumSum, PrimitiveC);
+  void Init(const bool exclusive, const bool reverse);  // sets both flags via the setters below
+  void set_exclusive(const bool exclusive);             // stores the flag under the kExclusive attribute key
+  void set_reverse(const bool reverse);                 // stores the flag under the kReverse attribute key
+  bool get_exclusive() const;                           // reads the kExclusive attribute back as bool
+  bool get_reverse() const;                             // reads the kReverse attribute back as bool
+};
+AbstractBasePtr CumSumInfer(const abstract::AnalysisEnginePtr &, const PrimitivePtr &primitive,
+                            const std::vector<AbstractBasePtr> &input_args);  // expects exactly two inputs
+using PrimCumSum = std::shared_ptr<CumSum>;  // shared-pointer alias for the primitive
+}  // namespace ops
+}  // namespace mindspore
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CORE_OPS_CUMSUM_H_
diff --git a/mindspore/core/ops/op_utils.h b/mindspore/core/ops/op_utils.h
index 69efa1433f2..703e1a720da 100644
--- a/mindspore/core/ops/op_utils.h
+++ b/mindspore/core/ops/op_utils.h
@@ -232,6 +232,8 @@ constexpr auto kSpliceForwardIndexes = "forward_indexes";
 constexpr auto kSpliceOutputDims = "output_dim";
 constexpr auto kSideEffectIO = "side_effect_io";
 constexpr auto kDeviceType = "device_type";
+constexpr auto kExclusive = "exclusive";
+constexpr auto kReverse = "reverse";
 const std::set<TypePtr> common_valid_types = {kInt8,   kInt16,  kInt32,   kInt64,   kUInt8,  kUInt16,
                                               kUInt32, kUInt64, kFloat16, kFloat32, kFloat64};
 
diff --git a/mindspore/lite/schema/ops.fbs b/mindspore/lite/schema/ops.fbs
index 62f1ad4141c..da0bd88273c 100644
--- a/mindspore/lite/schema/ops.fbs
+++ b/mindspore/lite/schema/ops.fbs
@@ -209,6 +209,7 @@ union PrimitiveType {
     LogSoftmax,
     Call,
     Custom,
+    CumSum,
 }
 
 table Abs {
@@ -442,6 +443,11 @@ table Crop {
     offsets: [long];
 }
 
+table CumSum {
+    exclusive: bool = false;
+    reverse: bool = false;
+}
+
 table CustomExtractFeatures {
 }
 
diff --git a/mindspore/lite/src/ops/ops_def.cc b/mindspore/lite/src/ops/ops_def.cc
index 5b17ecef5bd..43888a54060 100644
--- a/mindspore/lite/src/ops/ops_def.cc
+++ b/mindspore/lite/src/ops/ops_def.cc
@@ -208,6 +208,7 @@ OP_TYPE(Splice)
 OP_TYPE(LogSoftmax)
 OP_TYPE(Call)
 OP_TYPE(Custom)
+OP_TYPE(CumSum)
 OP_TYPE_DEF_END(PrimitiveType)
 
 OP_SCHEMA_DEF(Abs)
@@ -1104,6 +1105,11 @@ OP_SCHEMA_DEF_END(LogSoftmax)
 OP_SCHEMA_DEF(Call)
 OP_SCHEMA_DEF_END(Call)
 
+OP_SCHEMA_DEF(CumSum)
+OP_ATTR(exclusive, bool)
+OP_ATTR(reverse, bool)
+OP_SCHEMA_DEF_END(CumSum)
+
 OP_SCHEMA_DEF_ONLY(Custom)
 OP_ATTR_ONLY(type, string)
 OP_ATTR_ONLY(attr, [Attribute])
diff --git a/mindspore/lite/src/ops/ops_func_declare.h b/mindspore/lite/src/ops/ops_func_declare.h
index f768e0b43f7..6ebeb025419 100644
--- a/mindspore/lite/src/ops/ops_func_declare.h
+++ b/mindspore/lite/src/ops/ops_func_declare.h
@@ -245,6 +245,7 @@
 #include "ops/splice.h"
 #include "ops/log_softmax.h"
 #include "ops/call.h"
+#include "ops/cumsum.h"
 
 #define FUNC_MSOP2SCHEMAOP_DECLARE(OP)                                        \
   namespace mindspore::lite::ops {                                            \
@@ -459,5 +460,6 @@ FUNC_MSOP2SCHEMAOP_DECLARE(ResizeGrad);
 FUNC_MSOP2SCHEMAOP_DECLARE(Splice);
 FUNC_MSOP2SCHEMAOP_DECLARE(LogSoftmax);
 FUNC_MSOP2SCHEMAOP_DECLARE(Call);
+FUNC_MSOP2SCHEMAOP_DECLARE(CumSum);
 #endif
 #endif  // MINDSPORE_LITE_SRC_OPS_OPS_FUNC_DECLARE_H_
diff --git a/mindspore/lite/src/ops/ops_utils.cc b/mindspore/lite/src/ops/ops_utils.cc
index eaddf862329..2d6ef9d2019 100644
--- a/mindspore/lite/src/ops/ops_utils.cc
+++ b/mindspore/lite/src/ops/ops_utils.cc
@@ -760,6 +760,11 @@ schema::PrimitiveT *CallPrimitiveCreator(const AnfNodePtr &node) {
   return ms_primc != nullptr ? ops::MSOp2SchemaOp(ms_primc.get()) : nullptr;
 }
 
+schema::PrimitiveT *CumSumPrimitiveCreator(const AnfNodePtr &node) {
+  const auto cumsum_op = GetValueNode<std::shared_ptr<mindspore::ops::CumSum>>(node);  // null if node holds none
+  return cumsum_op == nullptr ? nullptr : ops::MSOp2SchemaOp(cumsum_op.get());
+}
+
 RegistryMSOps g_absPrimitiveCreatorRegistry("Abs", AbsPrimitiveCreator);
 RegistryMSOps g_absGradPrimitiveCreatorRegistry("AbsGrad", AbsGradPrimitiveCreator);
 RegistryMSOps g_activationPrimitiveCreatorRegistry("Activation", ActivationPrimitiveCreator);
@@ -975,6 +980,7 @@ RegistryMSOps g_erfPrimitiveCreatorRegistry("Erf", ErfPrimitiveCreator);
 RegistryMSOps g_SplicePrimitiveCreatorRegistry("Splice", SplicePrimitiveCreator);
 RegistryMSOps g_LogSoftmaxPrimitiveCreatorRegistry("LogSoftmax", LogSoftmaxPrimitiveCreator);
 RegistryMSOps g_CallPrimitiveCreatorRegistry("call", CallPrimitiveCreator);
+RegistryMSOps g_CumSumPrimitiveCreatorRegistry("CumSum", CumSumPrimitiveCreator);
 
 schema::PrimitiveT *CustomPrimitiveCreator(const AnfNodePtr &node) {
   auto ms_primc = GetValueNode<std::shared_ptr<mindspore::ops::Custom>>(node);
diff --git a/mindspore/lite/src/ops/populate/cumsum_populate.cc b/mindspore/lite/src/ops/populate/cumsum_populate.cc
new file mode 100644
index 00000000000..b5a925829fa
--- /dev/null
+++ b/mindspore/lite/src/ops/populate/cumsum_populate.cc
@@ -0,0 +1,41 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "src/ops/populate/populate_register.h"
+#include "nnacl/cumsum_parameter.h"
+using mindspore::schema::PrimitiveType_CumSum;
+
+namespace mindspore {
+namespace lite {
+namespace {
+OpParameter *PopulateCumSumParameter(const void *prim) {  // builds the nnacl parameter; nullptr on failure
+  auto primitive = static_cast<const schema::Primitive *>(prim);
+  auto cumsum_prim = primitive == nullptr ? nullptr : primitive->value_as_CumSum();
+  auto *cumsum_param = static_cast<CumSumParameter *>(calloc(1, sizeof(CumSumParameter)));  // zero-filled
+  if (cumsum_prim == nullptr || cumsum_param == nullptr) {
+    MS_LOG(ERROR) << "CumSum primitive is nullptr or malloc CumsumParameter failed.";
+    free(cumsum_param);  // free(nullptr) is a no-op, so this is safe on either failure
+    return nullptr;
+  }
+  cumsum_param->op_parameter_.type_ = primitive->value_type();
+  cumsum_param->exclusive_ = cumsum_prim->exclusive();
+  cumsum_param->reverse_ = cumsum_prim->reverse();
+  // axis_ is not assigned here and keeps the zero written by calloc.
+  return reinterpret_cast<OpParameter *>(cumsum_param);
+}
+}  // namespace
+
+REG_POPULATE(PrimitiveType_CumSum, PopulateCumSumParameter, SCHEMA_CUR)
+}  // namespace lite
+}  // namespace mindspore
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/activation_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/activation_fp32.cc
index 130ebb60f6b..286be906c8d 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/activation_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/activation_fp32.cc
@@ -29,6 +29,7 @@ using mindspore::schema::ActivationType_HSWISH;
 using mindspore::schema::ActivationType_LEAKY_RELU;
 using mindspore::schema::ActivationType_RELU;
 using mindspore::schema::ActivationType_RELU6;
+using mindspore::schema::ActivationType_SOFTPLUS;
 using mindspore::schema::ActivationType_SWISH;
 using mindspore::schema::PrimitiveType_Activation;
 
@@ -38,7 +39,8 @@ int ActivationCPUKernel::Init() {
       type_ != schema::ActivationType_LEAKY_RELU && type_ != schema::ActivationType_SIGMOID &&
       type_ != schema::ActivationType_TANH && type_ != schema::ActivationType_HSWISH &&
       type_ != schema::ActivationType_SWISH && type_ != schema::ActivationType_HSIGMOID &&
-      type_ != schema::ActivationType_HARD_TANH && type_ != schema::ActivationType_GELU) {
+      type_ != schema::ActivationType_HARD_TANH && type_ != schema::ActivationType_GELU &&
+      type_ != schema::ActivationType_SOFTPLUS) {
     MS_LOG(ERROR) << "Activation fp32 not support type: " << type_;
     return RET_ERROR;
   }
@@ -80,6 +82,8 @@ int ActivationCPUKernel::DoActivation(int task_id) {
     ret = HardTanh(input_addr + stride * task_id, count, output_addr + stride * task_id, min_val_, max_val_);
   } else if (type_ == schema::ActivationType_GELU) {
     ret = Gelu(input_addr + stride * task_id, count, output_addr + stride * task_id, true);
+  } else if (type_ == schema::ActivationType_SOFTPLUS) {
+    ret = Softplus(input_addr + stride * task_id, count, output_addr + stride * task_id);
   } else {
     MS_LOG(ERROR) << "Activation type error";
     return RET_ERROR;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/cumsum_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/cumsum_fp32.cc
new file mode 100644
index 00000000000..6762d7b8b55
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/cumsum_fp32.cc
@@ -0,0 +1,152 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "src/runtime/kernel/arm/fp32/cumsum_fp32.h"
+#include "nnacl/fp32/cumsum_fp32.h"
+#include "schema/model_generated.h"
+#include "src/kernel_registry.h"
+#include "src/runtime/runtime_api.h"
+
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_NULL_PTR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_CumSum;
+
+namespace mindspore::kernel {
+namespace {
+int CumsumLaunch(void *cdata, int task_id) {
+  // Per-task entry point handed to ParallelLaunch: cdata carries the CumSumCPUKernel instance.
+  if (cdata == nullptr) {
+    MS_LOG(ERROR) << "cdata is nullptr!";
+    return RET_NULL_PTR;
+  }
+  auto *cumsum_kernel = reinterpret_cast<CumSumCPUKernel *>(cdata);
+  // Route to the worker that matches the element type of the first input tensor.
+  switch (cumsum_kernel->in_tensors().at(0)->data_type()) {
+    case kNumberTypeFloat32:
+      return cumsum_kernel->DoCumsum(task_id);
+    case kNumberTypeInt32:
+      return cumsum_kernel->DoCumsumInt(task_id);
+    default:
+      MS_LOG(ERROR) << "Cumsum support data type int32 or float32";
+      return RET_ERROR;
+  }
+}
+}  // namespace
+
+int CumSumCPUKernel::Init() {
+  if (InferShapeDone()) {  // shapes are known: derive the cumsum dimensions now
+    return ReSize();
+  }
+  return RET_OK;  // nothing to set up until shape inference has run
+}
+
+int CumSumCPUKernel::ReSize() {
+  // Reads the axis from input 1 and splits the input shape into [out_dim_, axis_dim_, in_dim_] around it.
+  MS_ASSERT(in_tensors_.size() == 2);
+  auto input_tensor = in_tensors_.at(0);
+  int *axis_data = reinterpret_cast<int *>(in_tensors_.at(1)->data_c());
+  if (axis_data == nullptr) {
+    MS_LOG(ERROR) << "Cumsum axis nullptr";
+    return RET_ERROR;
+  }
+  const auto in_shape = input_tensor->shape();
+  const int rank = static_cast<int>(in_shape.size());
+  param_->axis_ = *axis_data < 0 ? *axis_data + rank : *axis_data;  // negative axis counts from the last dim
+  // An axis below -rank is still negative here and must be rejected before it is used as an index.
+  if (param_->axis_ < 0 || param_->axis_ >= rank) {
+    MS_LOG(ERROR) << "axis " << param_->axis_ << " out of range for in tensor rank " << rank;
+    return RET_ERROR;
+  }
+  out_dim_ = 1;
+  for (int i = 0; i < param_->axis_; ++i) {
+    out_dim_ *= in_shape.at(i);
+  }
+  axis_dim_ = in_shape.at(param_->axis_);
+  in_dim_ = 1;
+  for (int i = param_->axis_ + 1; i < rank; ++i) {
+    in_dim_ *= in_shape.at(i);
+  }
+  unit_ = UP_DIV(out_dim_, op_parameter_->thread_num_);
+  return RET_OK;
+}
+
+int CumSumCPUKernel::DoCumsum(int task_id) {  // float worker for one ParallelLaunch task
+  MS_ASSERT(in_tensors_.at(0) != nullptr);
+  float *input_data = reinterpret_cast<float *>(in_tensors_.at(0)->data_c());
+  if (input_data == nullptr) {
+    MS_LOG(ERROR) << "input data nullptr";
+    return RET_ERROR;
+  }
+  MS_ASSERT(out_tensors_.at(0) != nullptr);
+  float *output_data = reinterpret_cast<float *>(out_tensors_.at(0)->data_c());
+  if (output_data == nullptr) {
+    MS_LOG(ERROR) << "output data nullptr";
+    return RET_ERROR;
+  }
+  int out_dim = MSMIN(out_dim_ - unit_ * task_id, unit_);  // outer slices owned by this task
+  if (out_dim <= 0) {  // surplus task: skip before an out-of-range pointer offset is formed
+    return RET_OK;
+  }
+  int offset = task_id * unit_ * axis_dim_ * in_dim_;  // first element of this task's slice range
+  if (!param_->reverse_) {
+    Cumsum(input_data + offset, output_data + offset, out_dim, axis_dim_, in_dim_, param_->exclusive_);
+  } else {
+    CumsumReverse(input_data + offset, output_data + offset, out_dim, axis_dim_, in_dim_, param_->exclusive_);
+  }
+  return RET_OK;
+}
+
+int CumSumCPUKernel::DoCumsumInt(int task_id) {  // int worker for one ParallelLaunch task
+  MS_ASSERT(in_tensors_.at(0) != nullptr);
+  int *input_data = reinterpret_cast<int *>(in_tensors_.at(0)->data_c());
+  if (input_data == nullptr) {
+    MS_LOG(ERROR) << "input data nullptr";
+    return RET_ERROR;
+  }
+  MS_ASSERT(out_tensors_.at(0) != nullptr);
+  int *output_data = reinterpret_cast<int *>(out_tensors_.at(0)->data_c());
+  if (output_data == nullptr) {
+    MS_LOG(ERROR) << "output data nullptr";
+    return RET_ERROR;
+  }
+  int out_dim = MSMIN(out_dim_ - unit_ * task_id, unit_);  // outer slices owned by this task
+  if (out_dim <= 0) {  // surplus task: skip before an out-of-range pointer offset is formed
+    return RET_OK;
+  }
+  int offset = task_id * unit_ * axis_dim_ * in_dim_;  // first element of this task's slice range
+  if (!param_->reverse_) {
+    CumsumInt(input_data + offset, output_data + offset, out_dim, axis_dim_, in_dim_, param_->exclusive_);
+  } else {
+    CumsumReverseInt(input_data + offset, output_data + offset, out_dim, axis_dim_, in_dim_, param_->exclusive_);
+  }
+  return RET_OK;
+}
+
+int CumSumCPUKernel::Run() {
+  // Fans the work out over thread_num_ tasks; each task runs CumsumLaunch with this kernel as its context.
+  int ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, CumsumLaunch, this,
+                           op_parameter_->thread_num_);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Cumsum launch fail! ret: " << ret;
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_CumSum, LiteKernelCreator<CumSumCPUKernel>)  // int32 key
+REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_CumSum, LiteKernelCreator<CumSumCPUKernel>)  // float32 key
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/cumsum_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/cumsum_fp32.h
new file mode 100644
index 00000000000..0e914e841d5
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/cumsum_fp32.h
@@ -0,0 +1,48 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CUMSUM_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CUMSUM_H_
+
+#include <vector>
+#include "include/errorcode.h"
+#include "nnacl/cumsum_parameter.h"
+#include "src/lite_kernel.h"
+
+namespace mindspore::kernel {
+class CumSumCPUKernel : public LiteKernel {  // CPU CumSum kernel; one class serves the int32 and float32 paths
+ public:
+  CumSumCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
+                  const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
+      : LiteKernel(parameter, inputs, outputs, ctx) {
+    param_ = reinterpret_cast<CumSumParameter *>(op_parameter_);  // typed view of the same parameter object
+  }
+  ~CumSumCPUKernel() = default;
+
+  int Init() override;
+  int ReSize() override;
+  int Run() override;
+  int DoCumsum(int task_id);     // float worker for one task
+  int DoCumsumInt(int task_id);  // int worker for one task
+
+ private:
+  int out_dim_ = 1;   // product of the input dims before the axis (computed in ReSize)
+  int axis_dim_ = 1;  // input dim at the axis
+  int in_dim_ = 1;    // product of the input dims after the axis
+  int unit_ = 1;      // outer slices per task: UP_DIV(out_dim_, thread_num_)
+  CumSumParameter *param_ = nullptr;
+};
+}  // namespace mindspore::kernel
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CUMSUM_H_
diff --git a/mindspore/lite/test/ut/nnacl/infer/cumsum_infer_test.cc b/mindspore/lite/test/ut/nnacl/infer/cumsum_infer_test.cc
new file mode 100644
index 00000000000..ae61313f7b2
--- /dev/null
+++ b/mindspore/lite/test/ut/nnacl/infer/cumsum_infer_test.cc
@@ -0,0 +1,63 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "common/common_test.h"
+#include "nnacl/infer/cumsum_infer.h"
+#include "nnacl/cumsum_parameter.h"
+
+namespace mindspore {
+
+class CumSumInferTest : public mindspore::CommonTest {  // fixture for the CumSum shape-inference tests
+ public:
+  CumSumInferTest() = default;
+};
+
+TEST_F(CumSumInferTest, Test0) {  // output must take shape, data type and format from input 0
+  size_t inputs_size = 2;
+  std::vector<TensorC *> inputs(inputs_size, nullptr);
+  inputs[0] = new TensorC();  // value-initialised so fields not set below are zero, not indeterminate
+  inputs[0]->shape_size_ = 3;
+  inputs[0]->shape_[0] = 4;
+  inputs[0]->shape_[1] = 3;
+  inputs[0]->shape_[2] = 2;
+  inputs[0]->data_type_ = kNumberTypeInt32;
+  inputs[0]->format_ = Format_NHWC;
+  inputs[1] = new TensorC();
+  inputs[1]->shape_size_ = 1;
+  inputs[1]->shape_[0] = 1;
+
+  std::vector<TensorC *> outputs(1, nullptr);
+  outputs[0] = new TensorC();
+  CumSumParameter *parameter = new CumSumParameter();
+  parameter->op_parameter_.infer_flag_ = true;
+  int ret = CumsumInferShape((const TensorC **)inputs.data(), inputs.size(), outputs.data(), outputs.size(),
+                             reinterpret_cast<OpParameter *>(parameter));
+  ASSERT_EQ(ret, NNACL_OK);
+  ASSERT_EQ(outputs[0]->shape_size_, 3);
+  ASSERT_EQ(outputs[0]->shape_[0], 4);
+  ASSERT_EQ(outputs[0]->shape_[1], 3);
+  ASSERT_EQ(outputs[0]->shape_[2], 2);
+  ASSERT_EQ(outputs[0]->data_type_, kNumberTypeInt32);
+  ASSERT_EQ(outputs[0]->format_, Format_NHWC);
+  delete parameter;
+  for (size_t i = 0; i < inputs_size; i++) {
+    delete inputs[i];
+  }
+  for (size_t i = 0; i < outputs.size(); i++) {
+    delete outputs[i];
+  }
+}
+
+}  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/activation_fp32_test.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/activation_fp32_test.cc
index ab3ae8e667a..80450ec255e 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/activation_fp32_test.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/activation_fp32_test.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020 Huawei Technologies Co., Ltd
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -226,4 +226,48 @@ TEST_F(TestActivationFp32, HardTanh2) {
   output0_tensor.set_data(nullptr);
 }
 
+TEST_F(TestActivationFp32, Softplus) {
+  std::vector<lite::Tensor *> inputs_tensor;
+  std::vector<lite::Tensor *> outputs_tensor;
+  // Runs the fp32 Activation kernel with type SOFTPLUS on 14 values and compares against precomputed results.
+  ActivationParameter op_param;  // stack-allocated; handed to the kernel creator by pointer below
+  op_param.op_parameter_.type_ = schema::PrimitiveType_Activation;
+  op_param.type_ = schema::ActivationType_SOFTPLUS;
+
+  std::vector<float> input = {1, 2, 3, 4, 5, -1, 6, 7, -10, -20, 20, 30, 14, 0};
+  std::vector<int> in_shape = {14};
+
+  lite::Tensor input0_tensor;
+  inputs_tensor.push_back(&input0_tensor);
+  input0_tensor.set_data(input.data());
+  input0_tensor.set_shape(in_shape);
+
+  std::vector<float> output(14);
+  std::vector<int> output_shape = {14};  // NOTE(review): declared but never applied to output0_tensor
+
+  lite::Tensor output0_tensor;
+  outputs_tensor.push_back(&output0_tensor);
+  output0_tensor.set_data(output.data());
+
+  kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, schema::PrimitiveType_Activation};
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  ASSERT_NE(creator, nullptr);
+  lite::InnerContext ctx;
+  ctx.thread_num_ = 2;
+  ASSERT_EQ(lite::RET_OK, ctx.Init());
+  kernel::LiteKernel *kernel =
+    creator(inputs_tensor, outputs_tensor, reinterpret_cast<OpParameter *>(&op_param), &ctx, desc);
+  ASSERT_NE(kernel, nullptr);
+  auto output_tensor_shape = output0_tensor.shape();  // NOTE(review): not used below
+  auto ret = kernel->Run();
+  ASSERT_EQ(0, ret);
+  std::vector<float> expect_output = {1.3132616,   2.1269281,   3.0485871,  4.0181499,    5.0067153,
+                                      0.31326169,  6.0024757,   7.0009117,  0.0000453989, 0.0000000002,
+                                      20.00000000, 30.00000000, 14.0000000, 0.69314718};
+  ASSERT_EQ(0, CompareOutputData(output.data(), expect_output.data(), 14, 0.00001));
+  // Detach the borrowed buffers; presumably so the tensors do not free memory owned by the std::vectors.
+  input0_tensor.set_data(nullptr);
+  output0_tensor.set_data(nullptr);
+}
+
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/cumsum_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/cumsum_tests.cc
new file mode 100644
index 00000000000..08502bc92ae
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/cumsum_tests.cc
@@ -0,0 +1,384 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include "common/common_test.h"
+#include "nnacl/cumsum_parameter.h"
+#include "mindspore/lite/src/kernel_registry.h"
+
+namespace mindspore {
+class TestCumsum : public mindspore::CommonTest {  // fixture for the CumSum CPU kernel tests
+ public:
+  TestCumsum() = default;
+};
+
+TEST_F(TestCumsum, TestThread1) {  // inclusive forward cumsum along axis 1 of a 2x3x2 float tensor, one thread
+  lite::Tensor in_tensor0(kNumberTypeFloat32, {2, 3, 2});
+  float input_data0[12] = {1, 1, 2, 2, 3, 3, 10, 10, 20, 20, 30, 30};
+  in_tensor0.set_data(input_data0);
+  lite::Tensor in_tensor1(kNumberTypeInt32, {1});
+  int input_data1[1] = {1};  // axis 1
+  in_tensor1.set_data(input_data1);
+  std::vector<lite::Tensor *> inputs = {&in_tensor0, &in_tensor1};
+
+  lite::Tensor out_tensor0(kNumberTypeFloat32, {2, 3, 2});
+  float output_data0[12] = {0};
+  out_tensor0.set_data(output_data0);
+  std::vector<lite::Tensor *> outputs = {&out_tensor0};
+
+  CumSumParameter *parameter = reinterpret_cast<CumSumParameter *>(calloc(1, sizeof(CumSumParameter)));
+  parameter->op_parameter_.type_ = schema::PrimitiveType_CumSum;
+  parameter->op_parameter_.infer_flag_ = true;
+  parameter->exclusive_ = false;
+  parameter->reverse_ = false;
+  kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, schema::PrimitiveType_CumSum};
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  ASSERT_NE(creator, nullptr);  // fatal: creator is called right below
+
+  auto ctx = std::make_shared<lite::InnerContext>();
+  ctx->thread_num_ = 1;
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+  auto kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(parameter), ctx.get(), desc);
+  ASSERT_NE(kernel, nullptr);  // fatal: kernel is dereferenced right below
+
+  auto ret = kernel->Run();
+  EXPECT_EQ(0, ret);
+  EXPECT_NEAR(1.0f, output_data0[0], 0.000001);
+  EXPECT_NEAR(1.0f, output_data0[1], 0.000001);
+  EXPECT_NEAR(3.0f, output_data0[2], 0.000001);
+  EXPECT_NEAR(3.0f, output_data0[3], 0.000001);
+  EXPECT_NEAR(6.0f, output_data0[4], 0.000001);
+  EXPECT_NEAR(6.0f, output_data0[5], 0.000001);
+  EXPECT_NEAR(10.0f, output_data0[6], 0.000001);
+  EXPECT_NEAR(10.0f, output_data0[7], 0.000001);
+  EXPECT_NEAR(30.0f, output_data0[8], 0.000001);
+  EXPECT_NEAR(30.0f, output_data0[9], 0.000001);
+  EXPECT_NEAR(60.0f, output_data0[10], 0.000001);
+  EXPECT_NEAR(60.0f, output_data0[11], 0.000001);
+
+  for (int i = 0; i < 12; ++i) {
+    std::cout << output_data0[i] << " ";
+  }
+  std::cout << std::endl;
+  out_tensor0.set_data(nullptr);
+  in_tensor0.set_data(nullptr);
+  in_tensor1.set_data(nullptr);
+  delete kernel;  // was leaked here, unlike in the other tests of this fixture
+}
+
+TEST_F(TestCumsum, TestExclusive) {  // exclusive forward cumsum along axis 1 of a 2x3x2 float tensor
+  lite::Tensor in_tensor0(kNumberTypeFloat32, {2, 3, 2});
+  float input_data0[12] = {1, 1, 2, 2, 3, 3, 10, 10, 20, 20, 30, 30};
+  in_tensor0.set_data(input_data0);
+  lite::Tensor in_tensor1(kNumberTypeInt32, {1});
+  int input_data1[1] = {1};  // axis 1
+  in_tensor1.set_data(input_data1);
+  std::vector<lite::Tensor *> inputs = {&in_tensor0, &in_tensor1};
+
+  lite::Tensor out_tensor0(kNumberTypeFloat32, {2, 3, 2});
+  float output_data0[12] = {0};
+  out_tensor0.set_data(output_data0);
+  std::vector<lite::Tensor *> outputs = {&out_tensor0};
+
+  CumSumParameter *parameter = reinterpret_cast<CumSumParameter *>(calloc(1, sizeof(CumSumParameter)));
+  parameter->op_parameter_.type_ = schema::PrimitiveType_CumSum;
+  parameter->op_parameter_.infer_flag_ = true;
+  parameter->exclusive_ = true;
+  parameter->reverse_ = false;
+  kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, schema::PrimitiveType_CumSum};
+
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  ASSERT_NE(creator, nullptr);  // fatal: creator is called right below
+
+  auto ctx = std::make_shared<lite::InnerContext>();
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+  auto kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(parameter), ctx.get(), desc);
+  ASSERT_NE(kernel, nullptr);  // fatal: kernel is dereferenced right below
+
+  auto ret = kernel->Run();
+  EXPECT_EQ(0, ret);
+  EXPECT_NEAR(0.0f, output_data0[0], 0.000001);
+  EXPECT_NEAR(0.0f, output_data0[1], 0.000001);
+  EXPECT_NEAR(1.0f, output_data0[2], 0.000001);
+  EXPECT_NEAR(1.0f, output_data0[3], 0.000001);
+  EXPECT_NEAR(3.0f, output_data0[4], 0.000001);
+  EXPECT_NEAR(3.0f, output_data0[5], 0.000001);
+  EXPECT_NEAR(0.0f, output_data0[6], 0.000001);
+  EXPECT_NEAR(0.0f, output_data0[7], 0.000001);
+  EXPECT_NEAR(10.0f, output_data0[8], 0.000001);
+  EXPECT_NEAR(10.0f, output_data0[9], 0.000001);
+  EXPECT_NEAR(30.0f, output_data0[10], 0.000001);
+  EXPECT_NEAR(30.0f, output_data0[11], 0.000001);
+
+  for (int i = 0; i < 12; ++i) {
+    std::cout << output_data0[i] << " ";
+  }
+  out_tensor0.set_data(nullptr);
+  in_tensor0.set_data(nullptr);
+  in_tensor1.set_data(nullptr);
+  delete kernel;
+}
+
+TEST_F(TestCumsum, TestReverse) {  // inclusive reverse cumsum along axis 1 of a 2x3x2 float tensor
+  lite::Tensor in_tensor0(kNumberTypeFloat32, {2, 3, 2});
+  float input_data0[12] = {1, 1, 2, 2, 3, 3, 10, 10, 20, 20, 30, 30};
+  in_tensor0.set_data(input_data0);
+  lite::Tensor in_tensor1(kNumberTypeInt32, {1});
+  int input_data1[1] = {1};  // axis 1
+  in_tensor1.set_data(input_data1);
+  std::vector<lite::Tensor *> inputs = {&in_tensor0, &in_tensor1};
+
+  lite::Tensor out_tensor0(kNumberTypeFloat32, {2, 3, 2});
+  float output_data0[12] = {0};
+  out_tensor0.set_data(output_data0);
+  std::vector<lite::Tensor *> outputs = {&out_tensor0};
+
+  CumSumParameter *parameter = reinterpret_cast<CumSumParameter *>(calloc(1, sizeof(CumSumParameter)));
+  parameter->op_parameter_.type_ = schema::PrimitiveType_CumSum;
+  parameter->op_parameter_.infer_flag_ = true;
+  parameter->exclusive_ = false;
+  parameter->reverse_ = true;
+  kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, schema::PrimitiveType_CumSum};
+
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  ASSERT_NE(creator, nullptr);  // fatal: creator is called right below
+
+  auto ctx = std::make_shared<lite::InnerContext>();
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+  auto kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(parameter), ctx.get(), desc);
+  ASSERT_NE(kernel, nullptr);  // fatal: kernel is dereferenced right below
+
+  auto ret = kernel->Run();
+  EXPECT_EQ(0, ret);
+  EXPECT_NEAR(6.0f, output_data0[0], 0.000001);
+  EXPECT_NEAR(6.0f, output_data0[1], 0.000001);
+  EXPECT_NEAR(5.0f, output_data0[2], 0.000001);
+  EXPECT_NEAR(5.0f, output_data0[3], 0.000001);
+  EXPECT_NEAR(3.0f, output_data0[4], 0.000001);
+  EXPECT_NEAR(3.0f, output_data0[5], 0.000001);
+  EXPECT_NEAR(60.0f, output_data0[6], 0.000001);
+  EXPECT_NEAR(60.0f, output_data0[7], 0.000001);
+  EXPECT_NEAR(50.0f, output_data0[8], 0.000001);
+  EXPECT_NEAR(50.0f, output_data0[9], 0.000001);
+  EXPECT_NEAR(30.0f, output_data0[10], 0.000001);
+  EXPECT_NEAR(30.0f, output_data0[11], 0.000001);
+
+  for (int i = 0; i < 12; ++i) {
+    std::cout << output_data0[i] << " ";
+  }
+  out_tensor0.set_data(nullptr);
+  in_tensor0.set_data(nullptr);
+  in_tensor1.set_data(nullptr);
+  delete kernel;
+}
+
+TEST_F(TestCumsum, TestReverseExclusive) {  // exclusive reverse cumsum along axis 1 of a 2x3x2 float tensor
+  lite::Tensor in_tensor0(kNumberTypeFloat32, {2, 3, 2});
+  float input_data0[12] = {1, 1, 2, 2, 3, 3, 10, 10, 20, 20, 30, 30};
+  in_tensor0.set_data(input_data0);
+  lite::Tensor in_tensor1(kNumberTypeInt32, {1});
+  int input_data1[1] = {1};  // axis 1
+  in_tensor1.set_data(input_data1);
+  std::vector<lite::Tensor *> inputs = {&in_tensor0, &in_tensor1};
+
+  lite::Tensor out_tensor0(kNumberTypeFloat32, {2, 3, 2});
+  float output_data0[12] = {0};
+  out_tensor0.set_data(output_data0);
+  std::vector<lite::Tensor *> outputs = {&out_tensor0};
+
+  CumSumParameter *parameter = reinterpret_cast<CumSumParameter *>(calloc(1, sizeof(CumSumParameter)));
+  parameter->op_parameter_.type_ = schema::PrimitiveType_CumSum;
+  parameter->op_parameter_.infer_flag_ = true;
+  parameter->exclusive_ = true;
+  parameter->reverse_ = true;
+  kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, schema::PrimitiveType_CumSum};
+
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  ASSERT_NE(creator, nullptr);  // fatal: creator is called right below
+
+  auto ctx = std::make_shared<lite::InnerContext>();
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+  auto kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(parameter), ctx.get(), desc);
+  ASSERT_NE(kernel, nullptr);  // fatal: kernel is dereferenced right below
+
+  auto ret = kernel->Run();
+  EXPECT_EQ(0, ret);
+  EXPECT_NEAR(5.0f, output_data0[0], 0.000001);
+  EXPECT_NEAR(5.0f, output_data0[1], 0.000001);
+  EXPECT_NEAR(3.0f, output_data0[2], 0.000001);
+  EXPECT_NEAR(3.0f, output_data0[3], 0.000001);
+  EXPECT_NEAR(0.0f, output_data0[4], 0.000001);
+  EXPECT_NEAR(0.0f, output_data0[5], 0.000001);
+  EXPECT_NEAR(50.0f, output_data0[6], 0.000001);
+  EXPECT_NEAR(50.0f, output_data0[7], 0.000001);
+  EXPECT_NEAR(30.0f, output_data0[8], 0.000001);
+  EXPECT_NEAR(30.0f, output_data0[9], 0.000001);
+  EXPECT_NEAR(0.0f, output_data0[10], 0.000001);
+  EXPECT_NEAR(0.0f, output_data0[11], 0.000001);
+
+  for (int i = 0; i < 12; ++i) {
+    std::cout << output_data0[i] << " ";
+  }
+  out_tensor0.set_data(nullptr);
+  in_tensor0.set_data(nullptr);
+  in_tensor1.set_data(nullptr);
+  delete kernel;
+}
+
+TEST_F(TestCumsum, TestIntRank2) {  // inclusive forward cumsum along axis 1 of a 1x6 int32 tensor, one thread
+  lite::Tensor in_tensor0(kNumberTypeInt32, {1, 6});
+  int input_data0[6] = {1, 2, 3, 4, 5, 6};
+  in_tensor0.set_data(input_data0);
+  lite::Tensor in_tensor1(kNumberTypeInt32, {1});
+  int input_data1[1] = {1};  // axis 1
+  in_tensor1.set_data(input_data1);
+  std::vector<lite::Tensor *> inputs = {&in_tensor0, &in_tensor1};
+
+  lite::Tensor out_tensor0(kNumberTypeInt32, {1, 6});
+  int output_data0[6] = {0};
+  out_tensor0.set_data(output_data0);
+  std::vector<lite::Tensor *> outputs = {&out_tensor0};
+
+  CumSumParameter *parameter = reinterpret_cast<CumSumParameter *>(calloc(1, sizeof(CumSumParameter)));
+  parameter->op_parameter_.type_ = schema::PrimitiveType_CumSum;
+  parameter->op_parameter_.infer_flag_ = true;
+  parameter->exclusive_ = false;
+  parameter->reverse_ = false;
+  // Look the kernel up under the int32 key so the int32 registration is the one being exercised.
+  kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeInt32, schema::PrimitiveType_CumSum};
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  ASSERT_NE(creator, nullptr);  // fatal: creator is called right below
+
+  auto ctx = std::make_shared<lite::InnerContext>();
+  ctx->thread_num_ = 1;
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+  auto kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(parameter), ctx.get(), desc);
+  ASSERT_NE(kernel, nullptr);  // fatal: kernel is dereferenced right below
+
+  auto ret = kernel->Run();
+  EXPECT_EQ(0, ret);
+  EXPECT_EQ(1, output_data0[0]);
+  EXPECT_EQ(3, output_data0[1]);
+  EXPECT_EQ(6, output_data0[2]);
+  EXPECT_EQ(10, output_data0[3]);
+  EXPECT_EQ(15, output_data0[4]);
+  EXPECT_EQ(21, output_data0[5]);
+
+  for (int i = 0; i < 6; ++i) {
+    std::cout << output_data0[i] << " ";
+  }
+  out_tensor0.set_data(nullptr);
+  in_tensor0.set_data(nullptr);
+  in_tensor1.set_data(nullptr);
+  delete kernel;
+}
+
+TEST_F(TestCumsum, TestIntRank2Thread2) {  // same int32 case as TestIntRank2, run with two threads
+  lite::Tensor in_tensor0(kNumberTypeInt32, {1, 6});
+  int input_data0[6] = {1, 2, 3, 4, 5, 6};
+  in_tensor0.set_data(input_data0);
+  lite::Tensor in_tensor1(kNumberTypeInt32, {1});
+  int input_data1[1] = {1};  // axis 1
+  in_tensor1.set_data(input_data1);
+  std::vector<lite::Tensor *> inputs = {&in_tensor0, &in_tensor1};
+
+  lite::Tensor out_tensor0(kNumberTypeInt32, {1, 6});
+  int output_data0[6] = {0};
+  out_tensor0.set_data(output_data0);
+  std::vector<lite::Tensor *> outputs = {&out_tensor0};
+
+  CumSumParameter *parameter = reinterpret_cast<CumSumParameter *>(calloc(1, sizeof(CumSumParameter)));
+  parameter->op_parameter_.type_ = schema::PrimitiveType_CumSum;
+  parameter->op_parameter_.infer_flag_ = true;
+  parameter->exclusive_ = false;
+  parameter->reverse_ = false;
+  kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeInt32, schema::PrimitiveType_CumSum};  // int32 key
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  ASSERT_NE(creator, nullptr);  // fatal: creator is called right below
+
+  auto ctx = std::make_shared<lite::InnerContext>();
+  ctx->thread_num_ = 2;  // pin the thread count the test name promises instead of relying on the context default
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+  auto kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(parameter), ctx.get(), desc);
+  ASSERT_NE(kernel, nullptr);  // fatal: kernel is dereferenced right below
+
+  auto ret = kernel->Run();
+  EXPECT_EQ(0, ret);
+  EXPECT_EQ(1, output_data0[0]);
+  EXPECT_EQ(3, output_data0[1]);
+  EXPECT_EQ(6, output_data0[2]);
+  EXPECT_EQ(10, output_data0[3]);
+  EXPECT_EQ(15, output_data0[4]);
+  EXPECT_EQ(21, output_data0[5]);
+
+  for (int i = 0; i < 6; ++i) {
+    std::cout << output_data0[i] << " ";
+  }
+  out_tensor0.set_data(nullptr);
+  in_tensor0.set_data(nullptr);
+  in_tensor1.set_data(nullptr);
+  delete kernel;
+}
+
+TEST_F(TestCumsum, TestIntRank2Thread4) {  // same int32 case with four threads: more tasks than outer slices
+  lite::Tensor in_tensor0(kNumberTypeInt32, {1, 6});
+  int input_data0[6] = {1, 2, 3, 4, 5, 6};
+  in_tensor0.set_data(input_data0);
+  lite::Tensor in_tensor1(kNumberTypeInt32, {1});
+  int input_data1[1] = {1};  // axis 1
+  in_tensor1.set_data(input_data1);
+  std::vector<lite::Tensor *> inputs = {&in_tensor0, &in_tensor1};
+
+  lite::Tensor out_tensor0(kNumberTypeInt32, {1, 6});
+  int output_data0[6] = {0};
+  out_tensor0.set_data(output_data0);
+  std::vector<lite::Tensor *> outputs = {&out_tensor0};
+
+  CumSumParameter *parameter = reinterpret_cast<CumSumParameter *>(calloc(1, sizeof(CumSumParameter)));
+  parameter->op_parameter_.type_ = schema::PrimitiveType_CumSum;
+  parameter->op_parameter_.infer_flag_ = true;
+  parameter->exclusive_ = false;
+  parameter->reverse_ = false;
+  // Look the kernel up under the int32 key so the int32 registration is the one being exercised.
+  kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeInt32, schema::PrimitiveType_CumSum};
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  ASSERT_NE(creator, nullptr);  // fatal: creator is called right below
+
+  auto ctx = std::make_shared<lite::InnerContext>();
+  ctx->thread_num_ = 4;
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+  auto kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(parameter), ctx.get(), desc);
+  ASSERT_NE(kernel, nullptr);  // fatal: kernel is dereferenced right below
+
+  auto ret = kernel->Run();
+  EXPECT_EQ(0, ret);
+  EXPECT_EQ(1, output_data0[0]);
+  EXPECT_EQ(3, output_data0[1]);
+  EXPECT_EQ(6, output_data0[2]);
+  EXPECT_EQ(10, output_data0[3]);
+  EXPECT_EQ(15, output_data0[4]);
+  EXPECT_EQ(21, output_data0[5]);
+
+  for (int i = 0; i < 6; ++i) {
+    std::cout << output_data0[i] << " ";
+  }
+  out_tensor0.set_data(nullptr);
+  in_tensor0.set_data(nullptr);
+  in_tensor1.set_data(nullptr);
+  delete kernel;
+}
+
+}  // namespace mindspore