!20476 Adding 9 object-detection operators in CPU

Merge pull request !20476 from huangbo/object_detection_2
This commit is contained in:
i-robot 2021-07-22 11:43:18 +00:00 committed by Gitee
commit 66f4756555
38 changed files with 3987 additions and 9 deletions

View File

@ -0,0 +1,105 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/argmax_with_value_cpu_kernel.h"
#include <functional>
#include <numeric>
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
namespace {
// Total number of elements described by a tensor shape: the product of all
// dimensions. An empty shape (scalar) yields 1, the multiplicative identity.
size_t get_element_num(const std::vector<size_t> &shape) {
  return std::accumulate(shape.begin(), shape.end(), static_cast<size_t>(1), std::multiplies<size_t>());
}
template <typename T>
bool check_validation(const std::vector<size_t> &shape, const size_t num_before_axis, const size_t num_after_axis,
                      const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &outputs) {
  // Verify address-list arity and exact byte sizes before any raw buffer access.
  const bool arity_ok = (inputs.size() == 1) && (outputs.size() == 2);
  if (!arity_ok) {
    MS_LOG(EXCEPTION) << "Wrong number of inputs or outputs!";
    return false;
  }
  const size_t output_num = num_before_axis * num_after_axis;
  const size_t expect_input = get_element_num(shape) * sizeof(T);
  const size_t expect_index = output_num * sizeof(int);  // output 0: argmax indices
  const size_t expect_value = output_num * sizeof(T);    // output 1: max values
  const bool sizes_ok =
    inputs[0]->size == expect_input && outputs[0]->size == expect_index && outputs[1]->size == expect_value;
  if (!sizes_ok) {
    MS_LOG(EXCEPTION) << "Invalid input or output data size!";
    return false;
  }
  return true;
}
} // namespace
template <typename T>
void ArgMaxWithValueCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
  // Reads the input device shape and the "axis" attribute, then pre-computes the
  // element counts before/after the reduction axis that Launch iterates over.
  MS_EXCEPTION_IF_NULL(kernel_node);
  shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
  const int64_t rank = static_cast<int64_t>(shape_.size());
  int64_t axis = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS);
  // A valid axis lies in [-rank, rank - 1]. The previous check only rejected
  // values below -rank and the trailing modulo silently wrapped axis >= rank
  // back into range (e.g. axis == rank became 0); it also hit '% 0' for rank 0.
  if (axis < -rank || axis >= rank) {
    MS_LOG(EXCEPTION) << "Invalid axis:" << axis << ", should be in range [" << -rank << ", " << (rank - 1) << "]";
  }
  if (axis < 0) {
    axis += rank;  // normalize a negative axis to its positive equivalent
  }
  num_before_axis_ = 1;
  num_after_axis_ = 1;
  for (size_t i = 0; i < shape_.size(); i++) {
    if (static_cast<int64_t>(i) < axis) {
      num_before_axis_ *= shape_[i];
    } else if (static_cast<int64_t>(i) > axis) {
      num_after_axis_ *= shape_[i];
    }
  }
  dim_axis_ = shape_[axis];
}
template <typename T>
bool ArgMaxWithValueCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                         const std::vector<kernel::AddressPtr> &,
                                         const std::vector<kernel::AddressPtr> &outputs) {
  // For every (before-axis, after-axis) pair, writes the index (output 0, int32)
  // and the value (output 1, T) of the maximum element along the reduction axis.
  if (!check_validation<T>(shape_, num_before_axis_, num_after_axis_, inputs, outputs)) {
    return false;
  }
  const auto *input = reinterpret_cast<T *>(inputs[0]->addr);
  auto *index_out = reinterpret_cast<int32_t *>(outputs[0]->addr);
  auto *value_out = reinterpret_cast<T *>(outputs[1]->addr);
  for (size_t i = 0; i < num_before_axis_; i++) {
    const size_t base = i * dim_axis_ * num_after_axis_;
    for (size_t j = 0; j < num_after_axis_; j++) {
      const size_t src_j = base + j;
      // Scan the axis directly instead of materializing a temporary
      // std::vector<float> per output element (the original allocated on the
      // heap for every (i, j) pair). Comparison stays in float, as before, so
      // float16 inputs behave identically.
      size_t best_k = 0;
      float best_val = static_cast<float>(input[src_j]);
      for (size_t k = 1; k < dim_axis_; k++) {
        const float candidate = static_cast<float>(input[k * num_after_axis_ + src_j]);
        // Strict '>' keeps the first maximum, matching std::max_element.
        if (candidate > best_val) {
          best_val = candidate;
          best_k = k;
        }
      }
      const size_t dst = i * num_after_axis_ + j;
      index_out[dst] = static_cast<int32_t>(best_k);
      value_out[dst] = input[best_k * num_after_axis_ + src_j];
    }
  }
  return true;
}
} // namespace kernel
} // namespace mindspore

View File

@ -0,0 +1,50 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMAXWITHVALUE_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMAXWITHVALUE_CPU_KERNEL_H_
#include <vector>
#include <map>
#include <memory>
#include <algorithm>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
// CPU kernel for the ArgMaxWithValue op: along a given axis it produces both
// the index (int32) and the value (T) of the maximum element.
template <typename T>
class ArgMaxWithValueCPUKernel : public CPUKernel {
 public:
  ArgMaxWithValueCPUKernel() = default;
  ~ArgMaxWithValueCPUKernel() override = default;

  // Caches the input device shape and pre-computes the before/after-axis
  // element counts from the "axis" attribute.
  void InitKernel(const CNodePtr &kernel_node) override;

  // inputs: [x]; outputs: [index (int32), value (T)]. The workspace list is unused.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
              const std::vector<AddressPtr> &outputs) override;

 private:
  std::vector<size_t> shape_;  // device shape of input 0
  size_t num_before_axis_;     // product of dimensions before the reduction axis
  size_t num_after_axis_;      // product of dimensions after the reduction axis
  size_t dim_axis_;            // extent of the reduction axis
};
MS_REG_CPU_KERNEL_T(ArgMaxWithValue, KernelAttr(), ArgMaxWithValueCPUKernel, float);
MS_REG_CPU_KERNEL_T(ArgMaxWithValue, KernelAttr(), ArgMaxWithValueCPUKernel, float16);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMAXWITHVALUE_CPU_KERNEL_H_

View File

@ -0,0 +1,142 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/boundingbox_decode_cpu_kernel.h"
#include <cmath>
#include <string>
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
template <typename T>
void BoundingBoxDecodeCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
  // Parses the op attributes: per-coordinate "means"/"stds" (a tuple/list taken
  // verbatim, or a scalar broadcast to all 4 coordinates), the "max_shape"
  // clipping bounds, and "wh_ratio_clip".
  MS_EXCEPTION_IF_NULL(kernel_node);
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  if (input_num != 2) {
    // NOTE(review): logs and falls through, matching the sibling kernels;
    // Launch re-validates buffer sizes before use.
    MS_LOG(ERROR) << "Input num is " << input_num << ", but BoundingBoxDecode needs 2 inputs.";
  }
  const size_t coordinate_size = 4;
  auto prim = AnfAlgo::GetCNodePrimitive(kernel_node);
  MS_EXCEPTION_IF_NULL(prim);
  // Shared parser for "means"/"stds". Fetches the primitive once instead of the
  // repeated GetCNodePrimitive()->GetAttr() chains, and checks the attribute for
  // null before use (the original dereferenced a possibly-null GetAttr result).
  auto read_coordinate_attr = [&prim, &kernel_node, coordinate_size](const std::string &name) -> std::vector<float> {
    auto value = prim->GetAttr(name);
    MS_EXCEPTION_IF_NULL(value);
    if (value->isa<ValueTuple>() || value->isa<ValueList>()) {
      return AnfAlgo::GetNodeAttr<std::vector<float>>(kernel_node, name);
    }
    if (value->isa<FloatImm>()) {
      float scalar = AnfAlgo::GetNodeAttr<float>(kernel_node, name);
      return std::vector<float>(coordinate_size, scalar);  // broadcast scalar to 4 coords
    }
    MS_LOG(EXCEPTION) << "Attribute " << name << " type is invalid.";
    return {};
  };
  means_ = read_coordinate_attr("means");
  stds_ = read_coordinate_attr("stds");
  if (means_.size() < coordinate_size || stds_.size() < coordinate_size) {
    MS_LOG(EXCEPTION) << "The size of means or stds is less than 4.";
  }
  std::vector<int64_t> max_shape_me = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, "max_shape");
  (void)std::transform(max_shape_me.begin(), max_shape_me.end(), std::back_inserter(max_shape_),
                       [](const int64_t &value) { return static_cast<int>(value); });
  wh_ratio_clip_ = AnfAlgo::GetNodeAttr<float>(kernel_node, "wh_ratio_clip");
  if (max_shape_.size() < 2) {
    MS_LOG(EXCEPTION) << "The size of max_shape is less than 2.";
  }
}
template <typename T>
bool BoundingBoxDecodeCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                           const std::vector<kernel::AddressPtr> &,
                                           const std::vector<kernel::AddressPtr> &outputs) {
  // Decodes regression deltas against anchor boxes: un-whiten the deltas with
  // means/stds, clamp the log-size deltas, shift/scale the anchor center+size,
  // then clip the resulting (x1, y1, x2, y2) box to the image bounds.
  if (inputs[0]->size != inputs[1]->size) {
    MS_LOG(ERROR) << "Anchor box size must be equal to deltas box size: " << inputs[1]->size << ", but got"
                  << inputs[0]->size;
    return false;
  }
  const size_t coordinate = 4;  // each box is (x1, y1, x2, y2)
  const size_t block_size = inputs[0]->size / sizeof(T);
  if ((block_size % coordinate) != 0) {
    MS_LOG(ERROR) << "The size of the box must be a multiple of 4.";
    return false;
  }
  // Cast buffers only after the size checks above succeed.
  auto anchor_box = reinterpret_cast<T *>(inputs[0]->addr);
  auto deltas = reinterpret_cast<T *>(inputs[1]->addr);
  auto bboxes = reinterpret_cast<T *>(outputs[0]->addr);
  T ms1 = static_cast<T>(max_shape_[0]);  // y (height) clipping bound
  T ms2 = static_cast<T>(max_shape_[1]);  // x (width) clipping bound
  // Loop-invariant clamp bound, hoisted out of the parallel body. std::abs and
  // std::log keep the computation in floating point; the original unqualified
  // abs() could resolve to the C int overload and truncate the bound.
  const T max_ratio = static_cast<T>(std::abs(std::log(wh_ratio_clip_)));
  size_t elem_num = block_size / coordinate;
  auto task = [&](size_t start, size_t end) {
    for (size_t i = start; i < end; i++) {
      const size_t left_x = i * 4;
      const size_t left_y = i * 4 + 1;
      const size_t right_x = i * 4 + 2;
      const size_t right_y = i * 4 + 3;
      // Un-whiten the deltas.
      T dx = deltas[left_x] * static_cast<T>(stds_[0]) + static_cast<T>(means_[0]);
      T dy = deltas[left_y] * static_cast<T>(stds_[1]) + static_cast<T>(means_[1]);
      T dw = deltas[right_x] * static_cast<T>(stds_[2]) + static_cast<T>(means_[2]);
      T dh = deltas[right_y] * static_cast<T>(stds_[3]) + static_cast<T>(means_[3]);
      // Clamp the size deltas so exp() below stays bounded.
      dw = dw > max_ratio ? max_ratio : (dw < (-max_ratio) ? (-max_ratio) : dw);
      dh = dh > max_ratio ? max_ratio : (dh < (-max_ratio) ? (-max_ratio) : dh);
      // Anchor center and size.
      T px = (anchor_box[left_x] + anchor_box[right_x]) * static_cast<T>(0.5);
      T py = (anchor_box[left_y] + anchor_box[right_y]) * static_cast<T>(0.5);
      T pw = anchor_box[right_x] - anchor_box[left_x] + static_cast<T>(1.0);
      T ph = anchor_box[right_y] - anchor_box[left_y] + static_cast<T>(1.0);
      // Decoded center and size.
      T gx = px + pw * dx;
      T gy = py + ph * dy;
      T gw = pw * exp(dw);
      T gh = ph * exp(dh);
      // Back to corner coordinates, clipped to [0, max_shape).
      T x1 = gx - gw * static_cast<T>(0.5) + static_cast<T>(0.5);
      T y1 = gy - gh * static_cast<T>(0.5) + static_cast<T>(0.5);
      T x2 = gx + gw * static_cast<T>(0.5) - static_cast<T>(0.5);
      T y2 = gy + gh * static_cast<T>(0.5) - static_cast<T>(0.5);
      x1 = x1 > ms2 ? ms2 : (x1 < static_cast<T>(0) ? static_cast<T>(0) : x1);
      y1 = y1 > ms1 ? ms1 : (y1 < static_cast<T>(0) ? static_cast<T>(0) : y1);
      x2 = x2 > ms2 ? ms2 : (x2 < static_cast<T>(0) ? static_cast<T>(0) : x2);
      y2 = y2 > ms1 ? ms1 : (y2 < static_cast<T>(0) ? static_cast<T>(0) : y2);
      bboxes[left_x] = x1;
      bboxes[left_y] = y1;
      bboxes[right_x] = x2;
      bboxes[right_y] = y2;
    }
  };
  CPUKernelUtils::ParallelFor(task, elem_num);
  return true;
}
} // namespace kernel
} // namespace mindspore

View File

@ -0,0 +1,56 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BOUNDINGBOX_DECODE_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BOUNDINGBOX_DECODE_CPU_KERNEL_H_
#include <vector>
#include <algorithm>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
// CPU kernel for BoundingBoxDecode: converts anchor boxes plus regression
// deltas into absolute (x1, y1, x2, y2) boxes clipped to "max_shape".
template <typename T>
class BoundingBoxDecodeCPUKernel : public CPUKernel {
 public:
  BoundingBoxDecodeCPUKernel() = default;
  ~BoundingBoxDecodeCPUKernel() override = default;

  // Reads the "means"/"stds"/"max_shape"/"wh_ratio_clip" attributes.
  void InitKernel(const CNodePtr &kernel_node) override;

  // inputs: [anchor_box, deltas] (both N x 4); output: [bboxes] (N x 4).
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
              const std::vector<AddressPtr> &outputs) override;

 private:
  std::vector<float> means_;    // per-coordinate delta means (scalar attr is broadcast)
  std::vector<float> stds_;     // per-coordinate delta stds (scalar attr is broadcast)
  std::vector<int> max_shape_;  // [0] clips y coordinates, [1] clips x coordinates
  float wh_ratio_clip_;         // bound for the width/height log-delta clamp
};
MS_REG_CPU_KERNEL_T(
BoundingBoxDecode,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
BoundingBoxDecodeCPUKernel, float);
MS_REG_CPU_KERNEL_T(
BoundingBoxDecode,
KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
BoundingBoxDecodeCPUKernel, float16);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BOUNDINGBOX_DECODE_CPU_KERNEL_H_

View File

@ -0,0 +1,115 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/boundingbox_encode_cpu_kernel.h"
#include <string>
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
template <typename T>
void BoundingBoxEncodeCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
  // Parses the "means"/"stds" attributes: a tuple/list is taken verbatim, a
  // scalar is broadcast to all 4 box coordinates.
  MS_EXCEPTION_IF_NULL(kernel_node);
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  if (input_num != 2) {
    // NOTE(review): logs and falls through, matching the sibling kernels;
    // Launch re-validates buffer sizes before use.
    MS_LOG(ERROR) << "Input num is " << input_num << ", but BoundingBoxEncode needs 2 inputs.";
  }
  const size_t coordinate_size = 4;
  auto prim = AnfAlgo::GetCNodePrimitive(kernel_node);
  MS_EXCEPTION_IF_NULL(prim);
  // Shared parser for "means"/"stds". Fetches the primitive once instead of the
  // repeated GetCNodePrimitive()->GetAttr() chains, and checks the attribute for
  // null before use (the original dereferenced a possibly-null GetAttr result).
  auto read_coordinate_attr = [&prim, &kernel_node, coordinate_size](const std::string &name) -> std::vector<float> {
    auto value = prim->GetAttr(name);
    MS_EXCEPTION_IF_NULL(value);
    if (value->isa<ValueTuple>() || value->isa<ValueList>()) {
      return AnfAlgo::GetNodeAttr<std::vector<float>>(kernel_node, name);
    }
    if (value->isa<FloatImm>()) {
      float scalar = AnfAlgo::GetNodeAttr<float>(kernel_node, name);
      return std::vector<float>(coordinate_size, scalar);  // broadcast scalar to 4 coords
    }
    MS_LOG(EXCEPTION) << "Attribute " << name << " type is invalid.";
    return {};
  };
  means_ = read_coordinate_attr("means");
  stds_ = read_coordinate_attr("stds");
  if (means_.size() < coordinate_size || stds_.size() < coordinate_size) {
    MS_LOG(EXCEPTION) << "The size of means or stds is less than 4.";
  }
}
template <typename T>
bool BoundingBoxEncodeCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                           const std::vector<kernel::AddressPtr> &,
                                           const std::vector<kernel::AddressPtr> &outputs) {
  // Encodes groundtruth boxes relative to anchor boxes as whitened deltas:
  // center offsets normalized by anchor size, log-ratios of sizes, then
  // (delta - mean) / std per coordinate.
  auto anchor_box = reinterpret_cast<T *>(inputs[0]->addr);
  auto groundtruth_box = reinterpret_cast<T *>(inputs[1]->addr);
  auto deltas = reinterpret_cast<T *>(outputs[0]->addr);
  if (inputs[0]->size != inputs[1]->size) {
    MS_LOG(ERROR) << "Anchor box size must be equal to groundtruth box size: " << inputs[1]->size << ", but got"
                  << inputs[0]->size;
    return false;
  }
  const size_t coordinate = 4;  // each box is (x1, y1, x2, y2)
  const size_t block_size = inputs[0]->size / sizeof(T);
  if ((block_size % coordinate) != 0) {
    MS_LOG(ERROR) << "The size of the box must be a multiple of 4.";
    return false;
  }
  size_t elem_num = block_size / coordinate;
  auto task = [&](size_t start, size_t end) {
    const T half = static_cast<T>(0.5);
    const T one = static_cast<T>(1.0);
    for (size_t box = start; box < end; box++) {
      const size_t x1 = box * 4;      // left x
      const size_t y1 = box * 4 + 1;  // left y
      const size_t x2 = box * 4 + 2;  // right x
      const size_t y2 = box * 4 + 3;  // right y
      // Anchor center and size.
      T px = (anchor_box[x1] + anchor_box[x2]) * half;
      T py = (anchor_box[y1] + anchor_box[y2]) * half;
      T pw = anchor_box[x2] - anchor_box[x1] + one;
      T ph = anchor_box[y2] - anchor_box[y1] + one;
      // Groundtruth center and size.
      T gx = (groundtruth_box[x1] + groundtruth_box[x2]) * half;
      T gy = (groundtruth_box[y1] + groundtruth_box[y2]) * half;
      T gw = groundtruth_box[x2] - groundtruth_box[x1] + one;
      T gh = groundtruth_box[y2] - groundtruth_box[y1] + one;
      // Raw deltas.
      T dx = (gx - px) / pw;
      T dy = (gy - py) / ph;
      T dw = log(gw / pw);
      T dh = log(gh / ph);
      // Whiten with the configured means/stds.
      deltas[x1] = (dx - static_cast<T>(means_[0])) / static_cast<T>(stds_[0]);
      deltas[y1] = (dy - static_cast<T>(means_[1])) / static_cast<T>(stds_[1]);
      deltas[x2] = (dw - static_cast<T>(means_[2])) / static_cast<T>(stds_[2]);
      deltas[y2] = (dh - static_cast<T>(means_[3])) / static_cast<T>(stds_[3]);
    }
  };
  CPUKernelUtils::ParallelFor(task, elem_num);
  return true;
}
} // namespace kernel
} // namespace mindspore

View File

@ -0,0 +1,54 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BOUNDINGBOX_ENCODE_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BOUNDINGBOX_ENCODE_CPU_KERNEL_H_
#include <vector>
#include <algorithm>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
// CPU kernel for BoundingBoxEncode: converts (anchor, groundtruth) box pairs
// into whitened regression deltas (center offsets and log-size ratios).
template <typename T>
class BoundingBoxEncodeCPUKernel : public CPUKernel {
 public:
  BoundingBoxEncodeCPUKernel() = default;
  ~BoundingBoxEncodeCPUKernel() override = default;

  // Reads the "means"/"stds" attributes (tuple/list, or scalar broadcast to 4).
  void InitKernel(const CNodePtr &kernel_node) override;

  // inputs: [anchor_box, groundtruth_box] (both N x 4); output: [deltas] (N x 4).
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
              const std::vector<AddressPtr> &outputs) override;

 private:
  std::vector<float> means_;  // per-coordinate delta means
  std::vector<float> stds_;   // per-coordinate delta stds
};
MS_REG_CPU_KERNEL_T(
BoundingBoxEncode,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
BoundingBoxEncodeCPUKernel, float);
MS_REG_CPU_KERNEL_T(
BoundingBoxEncode,
KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
BoundingBoxEncodeCPUKernel, float16);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BOUNDINGBOX_ENCODE_CPU_KERNEL_H_

View File

@ -0,0 +1,84 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/check_valid_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kInputSize = 2;
constexpr size_t kOutputSize = 1;
} // namespace
template <typename T>
void CheckValidCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
  // Cache the device shapes of both inputs (anchor boxes and image metadata).
  MS_EXCEPTION_IF_NULL(kernel_node);
  img_metas_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
  anchor_box_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
}
template <typename T>
bool CheckValidCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                    const std::vector<kernel::AddressPtr> &,
                                    const std::vector<kernel::AddressPtr> &outputs) {
  // Writes one bool per box: true iff the box lies fully inside the image
  // bounds derived from img_metas (presumably [height, width, scale] — the
  // x bound is metas[1] * metas[2] - 1, the y bound metas[0] * metas[2] - 1;
  // confirm against the op's Python definition).
  CheckParams(inputs, outputs);
  auto boxes = reinterpret_cast<T *>(inputs[0]->addr);
  auto metas = reinterpret_cast<T *>(inputs[1]->addr);
  auto out = reinterpret_cast<bool *>(outputs[0]->addr);
  const size_t coordinate = 4;
  const size_t elem_num = inputs[0]->size / sizeof(T) / coordinate;
  auto task = [&](size_t start, size_t end) {
    for (size_t i = start; i < end; i++) {
      const size_t offset = i * 4;  // (x1, y1, x2, y2) of box i
      const T zero = static_cast<T>(0.0);
      const T max_x = metas[1] * metas[2] - static_cast<T>(1.0);
      const T max_y = metas[0] * metas[2] - static_cast<T>(1.0);
      // De Morgan of the original "any constraint violated" accumulation:
      // identical comparisons, expressed as a direct conjunction.
      out[i] = (boxes[offset] >= zero) && (boxes[offset + 1] >= zero) && (max_x >= boxes[offset + 2]) &&
               (max_y >= boxes[offset + 3]);
    }
  };
  CPUKernelUtils::ParallelFor(task, elem_num);
  return true;
}
template <typename T>
void CheckValidCPUKernel<T>::CheckParams(const std::vector<AddressPtr> &inputs,
                                         const std::vector<AddressPtr> &outputs) {
  // inputs: anchor_box, img_metas
  if (inputs.size() != kInputSize) {
    MS_LOG(EXCEPTION) << "Input number is: " << inputs.size() << ", but CheckValid needs " << kInputSize << " inputs.";
  }
  // outputs: valid
  if (outputs.size() != kOutputSize) {
    // Fixed the missing space before "outputs." (previously printed "1outputs.").
    MS_LOG(EXCEPTION) << "Output number is: " << outputs.size() << ", but CheckValid needs " << kOutputSize
                      << " outputs.";
  }
  // One bool per box: the output element count must equal the number of
  // 4-coordinate boxes in anchor_box (input 0). The old message incorrectly
  // blamed img_metas, which is not part of this comparison.
  if (outputs[0]->size / sizeof(bool) != inputs[0]->size / sizeof(T) / 4) {
    MS_LOG(EXCEPTION) << "The output size must equal the number of anchor boxes (anchor_box elements / 4).";
  }
}
} // namespace kernel
} // namespace mindspore

View File

@ -0,0 +1,61 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CHECK_VALID_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CHECK_VALID_CPU_KERNEL_H_
#include <vector>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
// CPU kernel for CheckValid: for each anchor box, outputs a bool that is true
// iff the box lies fully inside the image described by img_metas.
template <typename T>
class CheckValidCPUKernel : public CPUKernel {
 public:
  CheckValidCPUKernel() = default;
  ~CheckValidCPUKernel() override = default;

  // Caches the device shapes of the two inputs.
  void InitKernel(const CNodePtr &kernel_node) override;

  // inputs: [anchor_box (N x 4), img_metas]; output: [valid (N bools)].
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
              const std::vector<AddressPtr> &outputs) override;

 private:
  // Validates input/output counts and that the output size matches the box count.
  void CheckParams(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
  std::vector<size_t> anchor_box_shape_;  // cached shape of input 0 (not read by Launch)
  std::vector<size_t> img_metas_shape_;   // cached shape of input 1 (not read by Launch)
};
MS_REG_CPU_KERNEL_T(
CheckValid,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeBool),
CheckValidCPUKernel, float);
MS_REG_CPU_KERNEL_T(
CheckValid,
KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeBool),
CheckValidCPUKernel, float16);
MS_REG_CPU_KERNEL_T(
CheckValid, KernelAttr().AddInputAttr(kNumberTypeInt16).AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeBool),
CheckValidCPUKernel, int16_t);
MS_REG_CPU_KERNEL_T(
CheckValid, KernelAttr().AddInputAttr(kNumberTypeUInt8).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeBool),
CheckValidCPUKernel, uint8_t);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CHECK_VALID_CPU_KERNEL_H_

View File

@ -0,0 +1,219 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/crop_and_resize_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
// Parses and validates the static op info for CropAndResize: the ranks of the
// image/boxes/box_index/crop_size inputs, the output geometry, and the
// "method"/"extrapolation_value" attributes consumed by Launch.
template <typename T>
void CropAndResizeCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  if (input_num != 4) {
    // NOTE(review): MS_LOG(ERROR) does not abort — execution continues below.
    MS_LOG(ERROR) << "Input num is " << input_num << ", but CropAndResize needs 4 inputs.";
  }
  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  if (output_num != 1) {
    MS_LOG(ERROR) << "Output num is " << output_num << ", but CropAndResize needs 1 output.";
  }
  // input image
  auto input_image_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
  size_t input_image_shape_len = input_image_shape.size();
  if (input_image_shape_len != 4) {
    MS_LOG(ERROR) << "Image tensor is " << input_image_shape_len << "-D, but CropAndResize supports only " << 4
                  << "-D image tensor.";
  }
  // NOTE(review): the indexing below assumes the rank-4 check passed; if it
  // failed we only logged, so this may read past the end of the shape vector.
  // Dimensions [1]/[2] are treated as height/width (NHWC layout — TODO confirm).
  input_height_ = input_image_shape[1];
  input_width_ = input_image_shape[2];
  // input boxes
  auto input_boxes_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
  size_t input_boxes_shape_len = input_boxes_shape.size();
  if (input_boxes_shape_len != 2) {
    MS_LOG(ERROR) << "Box is rank " << input_boxes_shape_len << ", but CropAndResize supports only rank " << 2
                  << "for boxes.";
  }
  // input box_index
  auto input_box_index_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
  size_t input_box_index_shape_len = input_box_index_shape.size();
  if (input_box_index_shape_len != 1) {
    MS_LOG(ERROR) << "Box index is rank " << input_box_index_shape_len << ", but CropAndResize supports only rank " << 1
                  << "for box_index.";
  }
  // input crop_size
  auto input_crop_size_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3);
  size_t input_crop_size_shape_len = input_crop_size_shape.size();
  if (input_crop_size_shape_len != 1) {
    MS_LOG(ERROR) << "Crop_size is rank " << input_crop_size_shape_len << "-D, but CropAndResize supports only rank "
                  << 1 << "for Crop_size.";
  }
  if (input_crop_size_shape[0] != 2) {
    MS_LOG(ERROR) << "Crop_size is size " << input_crop_size_shape[0] << "-D, but CropAndResize supports only size "
                  << 2 << "for Crop_size.";
  }
  // output
  auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
  auto output_shape_len = output_shape.size();
  // Total number of output elements; drives the ParallelFor range in Launch.
  output_size_ = 1;
  for (size_t i = 0; i < output_shape_len; i++) {
    output_size_ *= output_shape[i];
  }
  // set expected output params
  // NOTE(review): again assumes a rank-4 output shape; only logged above if not.
  final_height_ = output_shape[1];
  final_width_ = output_shape[2];
  channel_ = output_shape[3];
  // get op parameters
  // method_ encoding used by Launch: 1 = bilinear, 2 = nearest,
  // 3 = anything else (treated as bilinear-v2).
  string method = AnfAlgo::GetNodeAttr<string>(kernel_node, "method");
  if (method == "bilinear") {
    method_ = 1;
  } else if (method == "nearest") {
    method_ = 2;
  } else {  // bilinear-v2
    method_ = 3;
  }
  // Value written for samples that fall outside the source image (methods 1 and 2).
  extrapolation_value_ = AnfAlgo::GetNodeAttr<float>(kernel_node, "extrapolation_value");
}
// Crops each box out of its source image and resizes it to
// (final_height_, final_width_) using the configured interpolation method.
// inputs: [image (T), boxes (float, N x 4 as y1,x1,y2,x2 normalized coords),
//          box_index (int)]; output: [crops (float)].
template <typename T>
bool CropAndResizeCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                       const std::vector<kernel::AddressPtr> &,
                                       const std::vector<kernel::AddressPtr> &outputs) {
  auto *input_image = reinterpret_cast<T *>(inputs[0]->addr);
  auto *input_boxes = reinterpret_cast<float *>(inputs[1]->addr);
  auto *input_box_index = reinterpret_cast<int *>(inputs[2]->addr);
  auto *output = reinterpret_cast<float *>(outputs[0]->addr);
  auto task = [&](size_t start, size_t end) {
    for (size_t pos = start; pos < end; pos++) {
      // Decompose the flat output offset into (image, y, x, channel) — the
      // output is laid out with channel fastest, i.e. NHWC order.
      size_t pos_temp = pos;
      const int pos_channel = pos_temp % channel_;
      pos_temp = pos_temp / channel_;
      const int pos_x = pos_temp % final_width_;
      pos_temp = pos_temp / final_width_;
      const int pos_y = pos_temp % final_height_;
      const int pos_image_idx = pos_temp / final_height_;
      const int box_index = input_box_index[pos_image_idx];
      // crop values
      const float y1 = input_boxes[4 * pos_image_idx + 0];
      const float x1 = input_boxes[4 * pos_image_idx + 1];
      const float y2 = input_boxes[4 * pos_image_idx + 2];
      const float x2 = input_boxes[4 * pos_image_idx + 3];
      // set scale and target pixels
      // Map output pixel (pos_y, pos_x) to fractional source coordinates.
      float scale_height = final_height_ > 1 ? (y2 - y1) * (input_height_ - 1) / (final_height_ - 1) : 0;
      float scale_width = final_width_ > 1 ? (x2 - x1) * (input_width_ - 1) / (final_width_ - 1) : 0;
      // NOTE(review): in the size-1 branch, "0.5 * (y1 + y2) + (input_height_ - 1)"
      // looks like it should be "* (input_height_ - 1)" (cf. TensorFlow's
      // CropAndResize, which scales the box center into image coordinates) —
      // confirm before relying on 1x1 crops.
      float target_y =
        final_height_ > 1 ? y1 * (input_height_ - 1) + pos_y * scale_height : 0.5 * (y1 + y2) + (input_height_ - 1);
      float target_x =
        final_width_ > 1 ? x1 * (input_width_ - 1) + pos_x * scale_width : 0.5 * (x1 + x2) + (input_width_ - 1);
      // use extrapolation value if out of range
      // Only the bilinear (1) and nearest (2) methods extrapolate; bilinear-v2
      // falls through and samples with clamped indices below.
      if (((target_x < 0) || (target_x > input_width_ - 1)) || ((target_y < 0) || (target_y > input_height_ - 1))) {
        if ((method_ == 1) || (method_ == 2)) {
          output[pos] = extrapolation_value_;
          continue;
        }
      }
      if (method_ == 1) {
        // Bilinear
        // Interpolate between the four neighbouring source pixels.
        const int top_y_index = floorf(target_y);
        const int bottom_y_index = ceilf(target_y);
        const int left_x_index = floorf(target_x);
        const int right_x_index = ceilf(target_x);
        const float y_lerp = target_y - top_y_index;
        const float x_lerp = target_x - left_x_index;
        const float top_left = static_cast<float>(
          input_image[((box_index * input_height_ + top_y_index) * input_width_ + left_x_index) * channel_ +
                      pos_channel]);
        const float top_right = static_cast<float>(
          input_image[((box_index * input_height_ + top_y_index) * input_width_ + right_x_index) * channel_ +
                      pos_channel]);
        const float bottom_left = static_cast<float>(
          input_image[((box_index * input_height_ + bottom_y_index) * input_width_ + left_x_index) * channel_ +
                      pos_channel]);
        const float bottom_right = static_cast<float>(
          input_image[((box_index * input_height_ + bottom_y_index) * input_width_ + right_x_index) * channel_ +
                      pos_channel]);
        // Lerp horizontally on both rows, then vertically between the rows.
        const float top = top_left + (top_right - top_left) * x_lerp;
        const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
        output[pos] = top + (bottom - top) * y_lerp;
      } else if (method_ == 3) {
        // Bilinear-v2: work in the sub-image defined by the box corners
        // (in absolute pixels), clamping sample indices to its extent.
        int y1h = static_cast<int>(y1 * input_height_);
        int x1w = static_cast<int>(x1 * input_width_);
        int y2h = static_cast<int>(y2 * input_height_);
        int x2w = static_cast<int>(x2 * input_width_);
        int w = ((x2w - x1w + 1) > 1) ? x2w - x1w + 1 : 1;  // box width, at least 1
        int h = ((y2h - y1h + 1) > 1) ? y2h - y1h + 1 : 1;  // box height, at least 1
        // Half-pixel-centered sample position inside the box.
        float y_point = (pos_y + 0.5) * (h / static_cast<float>(final_height_)) - 0.5;
        int top_y_index = floorf(y_point);
        top_y_index = std::min(std::max(0, top_y_index), h - 1);
        int bottom_y_index = ceilf(y_point);
        bottom_y_index = std::min(std::max(0, bottom_y_index), h - 1);
        float x_point = (pos_x + 0.5) * (w / static_cast<float>(final_width_)) - 0.5;
        int left_x_index = floorf(x_point);
        left_x_index = std::min(std::max(0, left_x_index), w - 1);
        int right_x_index = ceilf(x_point);
        right_x_index = std::min(std::max(0, right_x_index), w - 1);
        const float y_lerp = y_point - top_y_index;
        const float x_lerp = x_point - left_x_index;
        const int y_top_index = box_index * input_height_ + y1h + top_y_index;
        const int y_bottom_index = box_index * input_height_ + y1h + bottom_y_index;
        const float top_left =
          static_cast<float>(input_image[(y_top_index * input_width_ + x1w + left_x_index) * channel_ + pos_channel]);
        const float top_right =
          static_cast<float>(input_image[(y_top_index * input_width_ + x1w + right_x_index) * channel_ + pos_channel]);
        const float bottom_left = static_cast<float>(
          input_image[(y_bottom_index * input_width_ + x1w + left_x_index) * channel_ + pos_channel]);
        const float bottom_right = static_cast<float>(
          input_image[(y_bottom_index * input_width_ + x1w + right_x_index) * channel_ + pos_channel]);
        // Standard bilinear blend of the four clamped samples.
        float ret = top_left * (1 - y_lerp) * (1 - x_lerp) + bottom_right * y_lerp * x_lerp +
                    top_right * (1 - y_lerp) * x_lerp + bottom_left * y_lerp * (1 - x_lerp);
        output[pos] = ret;
      } else {
        // Nearest Neighbour
        const int closest_x_index = roundf(target_x);
        const int closest_y_index = roundf(target_y);
        const float val = static_cast<float>(
          input_image[((box_index * input_height_ + closest_y_index) * input_width_ + closest_x_index) * channel_ +
                      pos_channel]);
        output[pos] = val;
      }
    }
  };
  CPUKernelUtils::ParallelFor(task, output_size_);
  return true;
}
} // namespace kernel
} // namespace mindspore

View File

@ -0,0 +1,213 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CROP_AND_RESIZE_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CROP_AND_RESIZE_CPU_KERNEL_H_
#include <vector>
#include <string>
#include <algorithm>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
// CPU kernel for the CropAndResize operator: crops boxes out of a batch of
// images and resizes each crop to a fixed (final_height_, final_width_) size.
template <typename T>
class CropAndResizeCPUKernel : public CPUKernel {
 public:
  CropAndResizeCPUKernel() = default;
  ~CropAndResizeCPUKernel() override = default;
  // Reads op attributes and input/output shapes from the graph node.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Computes the cropped-and-resized output; the workspace argument is unused.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
              const std::vector<AddressPtr> &outputs) override;
 private:
  int method_;                 // resize method: 1 = bilinear, 3 = bilinear-v2, otherwise nearest neighbour
  float extrapolation_value_;  // value emitted for sample points falling outside the input image
  int input_crop_size_;        // assumed: number of crop boxes — TODO confirm against InitKernel (not in view)
  int output_size_;            // total number of output elements (parallel-for bound in Launch)
  int input_height_;           // source image height
  int input_width_;            // source image width
  int final_height_;           // resized crop height
  int final_width_;            // resized crop width
  int channel_;                // number of channels
};
// Each supported image dtype is registered twice so the crop_size input may be
// either int32 or int64. Fixed: the int32-image + int32-crop_size entry was a
// copy-paste duplicate of the int8 registration, leaving int32/int32 unhandled.
MS_REG_CPU_KERNEL_T(CropAndResize,
                    KernelAttr()
                      .AddInputAttr(kNumberTypeFloat16)
                      .AddInputAttr(kNumberTypeFloat32)
                      .AddInputAttr(kNumberTypeInt32)
                      .AddInputAttr(kNumberTypeInt32)
                      .AddOutputAttr(kNumberTypeFloat32),
                    CropAndResizeCPUKernel, float16);
MS_REG_CPU_KERNEL_T(CropAndResize,
                    KernelAttr()
                      .AddInputAttr(kNumberTypeFloat16)
                      .AddInputAttr(kNumberTypeFloat32)
                      .AddInputAttr(kNumberTypeInt32)
                      .AddInputAttr(kNumberTypeInt64)
                      .AddOutputAttr(kNumberTypeFloat32),
                    CropAndResizeCPUKernel, float16);
MS_REG_CPU_KERNEL_T(CropAndResize,
                    KernelAttr()
                      .AddInputAttr(kNumberTypeFloat32)
                      .AddInputAttr(kNumberTypeFloat32)
                      .AddInputAttr(kNumberTypeInt32)
                      .AddInputAttr(kNumberTypeInt32)
                      .AddOutputAttr(kNumberTypeFloat32),
                    CropAndResizeCPUKernel, float);
MS_REG_CPU_KERNEL_T(CropAndResize,
                    KernelAttr()
                      .AddInputAttr(kNumberTypeFloat32)
                      .AddInputAttr(kNumberTypeFloat32)
                      .AddInputAttr(kNumberTypeInt32)
                      .AddInputAttr(kNumberTypeInt64)
                      .AddOutputAttr(kNumberTypeFloat32),
                    CropAndResizeCPUKernel, float);
MS_REG_CPU_KERNEL_T(CropAndResize,
                    KernelAttr()
                      .AddInputAttr(kNumberTypeFloat64)
                      .AddInputAttr(kNumberTypeFloat32)
                      .AddInputAttr(kNumberTypeInt32)
                      .AddInputAttr(kNumberTypeInt32)
                      .AddOutputAttr(kNumberTypeFloat32),
                    CropAndResizeCPUKernel, double);
MS_REG_CPU_KERNEL_T(CropAndResize,
                    KernelAttr()
                      .AddInputAttr(kNumberTypeFloat64)
                      .AddInputAttr(kNumberTypeFloat32)
                      .AddInputAttr(kNumberTypeInt32)
                      .AddInputAttr(kNumberTypeInt64)
                      .AddOutputAttr(kNumberTypeFloat32),
                    CropAndResizeCPUKernel, double);
MS_REG_CPU_KERNEL_T(CropAndResize,
                    KernelAttr()
                      .AddInputAttr(kNumberTypeInt8)
                      .AddInputAttr(kNumberTypeFloat32)
                      .AddInputAttr(kNumberTypeInt32)
                      .AddInputAttr(kNumberTypeInt32)
                      .AddOutputAttr(kNumberTypeFloat32),
                    CropAndResizeCPUKernel, int8_t);
MS_REG_CPU_KERNEL_T(CropAndResize,
                    KernelAttr()
                      .AddInputAttr(kNumberTypeInt8)
                      .AddInputAttr(kNumberTypeFloat32)
                      .AddInputAttr(kNumberTypeInt32)
                      .AddInputAttr(kNumberTypeInt64)
                      .AddOutputAttr(kNumberTypeFloat32),
                    CropAndResizeCPUKernel, int8_t);
MS_REG_CPU_KERNEL_T(CropAndResize,
                    KernelAttr()
                      .AddInputAttr(kNumberTypeInt16)
                      .AddInputAttr(kNumberTypeFloat32)
                      .AddInputAttr(kNumberTypeInt32)
                      .AddInputAttr(kNumberTypeInt32)
                      .AddOutputAttr(kNumberTypeFloat32),
                    CropAndResizeCPUKernel, int16_t);
MS_REG_CPU_KERNEL_T(CropAndResize,
                    KernelAttr()
                      .AddInputAttr(kNumberTypeInt16)
                      .AddInputAttr(kNumberTypeFloat32)
                      .AddInputAttr(kNumberTypeInt32)
                      .AddInputAttr(kNumberTypeInt64)
                      .AddOutputAttr(kNumberTypeFloat32),
                    CropAndResizeCPUKernel, int16_t);
MS_REG_CPU_KERNEL_T(CropAndResize,
                    KernelAttr()
                      .AddInputAttr(kNumberTypeInt32)
                      .AddInputAttr(kNumberTypeFloat32)
                      .AddInputAttr(kNumberTypeInt32)
                      .AddInputAttr(kNumberTypeInt32)
                      .AddOutputAttr(kNumberTypeFloat32),
                    CropAndResizeCPUKernel, int32_t);
MS_REG_CPU_KERNEL_T(CropAndResize,
                    KernelAttr()
                      .AddInputAttr(kNumberTypeInt32)
                      .AddInputAttr(kNumberTypeFloat32)
                      .AddInputAttr(kNumberTypeInt32)
                      .AddInputAttr(kNumberTypeInt64)
                      .AddOutputAttr(kNumberTypeFloat32),
                    CropAndResizeCPUKernel, int32_t);
MS_REG_CPU_KERNEL_T(CropAndResize,
                    KernelAttr()
                      .AddInputAttr(kNumberTypeInt64)
                      .AddInputAttr(kNumberTypeFloat32)
                      .AddInputAttr(kNumberTypeInt32)
                      .AddInputAttr(kNumberTypeInt32)
                      .AddOutputAttr(kNumberTypeFloat32),
                    CropAndResizeCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(CropAndResize,
                    KernelAttr()
                      .AddInputAttr(kNumberTypeInt64)
                      .AddInputAttr(kNumberTypeFloat32)
                      .AddInputAttr(kNumberTypeInt32)
                      .AddInputAttr(kNumberTypeInt64)
                      .AddOutputAttr(kNumberTypeFloat32),
                    CropAndResizeCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(CropAndResize,
                    KernelAttr()
                      .AddInputAttr(kNumberTypeUInt8)
                      .AddInputAttr(kNumberTypeFloat32)
                      .AddInputAttr(kNumberTypeInt32)
                      .AddInputAttr(kNumberTypeInt32)
                      .AddOutputAttr(kNumberTypeFloat32),
                    CropAndResizeCPUKernel, uint8_t);
MS_REG_CPU_KERNEL_T(CropAndResize,
                    KernelAttr()
                      .AddInputAttr(kNumberTypeUInt8)
                      .AddInputAttr(kNumberTypeFloat32)
                      .AddInputAttr(kNumberTypeInt32)
                      .AddInputAttr(kNumberTypeInt64)
                      .AddOutputAttr(kNumberTypeFloat32),
                    CropAndResizeCPUKernel, uint8_t);
MS_REG_CPU_KERNEL_T(CropAndResize,
                    KernelAttr()
                      .AddInputAttr(kNumberTypeUInt16)
                      .AddInputAttr(kNumberTypeFloat32)
                      .AddInputAttr(kNumberTypeInt32)
                      .AddInputAttr(kNumberTypeInt32)
                      .AddOutputAttr(kNumberTypeFloat32),
                    CropAndResizeCPUKernel, uint16_t);
MS_REG_CPU_KERNEL_T(CropAndResize,
                    KernelAttr()
                      .AddInputAttr(kNumberTypeUInt16)
                      .AddInputAttr(kNumberTypeFloat32)
                      .AddInputAttr(kNumberTypeInt32)
                      .AddInputAttr(kNumberTypeInt64)
                      .AddOutputAttr(kNumberTypeFloat32),
                    CropAndResizeCPUKernel, uint16_t);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CROP_AND_RESIZE_CPU_KERNEL_H_

View File

@ -0,0 +1,243 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/nms_with_mask_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
// Returns the smallest power of two that is >= v (0 stays 0).
int NmsRoundUpPower2(int v) {
  --v;
  // Smear the highest set bit into every lower position, then add one.
  for (int shift = 1; shift <= 16; shift <<= 1) {
    v |= v >> shift;
  }
  return ++v;
}
// Exchanges the values pointed to by lhs and rhs.
template <typename T>
void Swap(T *lhs, T *rhs) {
  T saved = *lhs;
  *lhs = *rhs;
  *rhs = saved;
}
// Sorting function based on BitonicSort from TopK kernel.
// Sorts box scores (column 4 of each 5-wide box row) into data_buff, carrying
// the original box indices along in index_buff. Both buffers are padded from
// `inner` up to `ceil_power2` with max-value sentinels so the bitonic network
// operates on a power-of-two size. `outer` is unused here — presumably kept
// for signature parity with the GPU kernel; TODO confirm.
template <typename T>
void NMSWithMaskCPUKernel<T>::NmsBitonicSortByKeyKernel(const int outer, const int inner, const int ceil_power2,
                                                        T *input, T *data_buff, int *index_buff, int box_size) {
  // Load keys: real scores for i < inner, +max sentinels for the padding tail
  // so padded slots sort to the end.
  auto task1 = [&](int start, int end) {
    for (int i = start; i < end; i++) {
      data_buff[i] = (i < inner) ? input[(i * box_size) + 4] : std::numeric_limits<T>::max();
      index_buff[i] = i;
    }
  };
  CPUKernelUtils::ParallelFor(task1, ceil_power2);
  // Classic bitonic sort: i is the size of the bitonic sequences being merged,
  // j the compare-exchange distance within each merge step.
  for (size_t i = 2; i <= static_cast<size_t>(ceil_power2); i <<= 1) {
    for (size_t j = (i >> 1); j > 0; j >>= 1) {
      auto task2 = [&](size_t start, size_t end) {
        for (size_t tid = start; tid < end; tid++) {
          size_t tid_comp = tid ^ j;
          if (tid_comp > tid) {  // each pair is handled once, by its lower index
            if ((tid & i) == 0) {
              // ascending half of the bitonic sequence
              if (data_buff[tid] > data_buff[tid_comp]) {
                Swap(&data_buff[tid], &data_buff[tid_comp]);
                Swap(&index_buff[tid], &index_buff[tid_comp]);
              }
            } else {
              // descending half
              if (data_buff[tid] < data_buff[tid_comp]) {
                Swap(&data_buff[tid], &data_buff[tid_comp]);
                Swap(&index_buff[tid], &index_buff[tid_comp]);
              }
            }
          }
        }
      };
      CPUKernelUtils::ParallelFor(task2, ceil_power2);
    }
  }
}
// Set every entry of the numSq-element pairwise mask to true before NmsPass
// overwrites the upper triangle with real IoU decisions.
template <typename T>
void NMSWithMaskCPUKernel<T>::MaskInit(int numSq, bool *row_mask) {
  auto fill_true = [row_mask](int begin, int stop) {
    for (int idx = begin; idx < stop; ++idx) {
      row_mask[idx] = true;
    }
  };
  CPUKernelUtils::ParallelFor(fill_true, numSq);
}
// copy data from input to output array sorted by indices returned from bitonic sort
// flips boxes if asked to, default - false -> if (x1/y1 > x2/y2)
template <typename T>
void NMSWithMaskCPUKernel<T>::PopulateOutput(T *data_in, T *data_out, int *index_buff, const int num, int box_size,
                                             bool flip_mode) {
  auto task = [&](int start, int end) {
    for (int box_num = start; box_num < end; box_num++) {
      // index_buff holds the ascending-score permutation; reading it back to
      // front emits boxes in descending-score order.
      int correct_index = index_buff[(num - 1) - box_num];  // flip the array around
      int correct_arr_start = correct_index * box_size;
      int current_arr_start = box_num * box_size;
      if (flip_mode) {  // flip boxes
        // check x: write (min, max) into (x1, x2)
        if (data_in[correct_arr_start + 0] > data_in[correct_arr_start + 2]) {
          data_out[current_arr_start + 0] = data_in[correct_arr_start + 2];
          data_out[current_arr_start + 2] = data_in[correct_arr_start + 0];
        } else {
          data_out[current_arr_start + 0] = data_in[correct_arr_start + 0];
          data_out[current_arr_start + 2] = data_in[correct_arr_start + 2];
        }
        // check y: write (min, max) into (y1, y2)
        if (data_in[correct_arr_start + 1] > data_in[correct_arr_start + 3]) {
          data_out[current_arr_start + 1] = data_in[correct_arr_start + 3];
          data_out[current_arr_start + 3] = data_in[correct_arr_start + 1];
        } else {
          data_out[current_arr_start + 1] = data_in[correct_arr_start + 1];
          data_out[current_arr_start + 3] = data_in[correct_arr_start + 3];
        }
        // the score (column 4) travels with the box unchanged
        data_out[current_arr_start + 4] = data_in[correct_arr_start + 4];
      } else {  // default behaviour, don't flip
        for (int x = 0; x < 5; x++) {
          data_out[current_arr_start + x] = data_in[correct_arr_start + x];
        }
      }
    }
  };
  CPUKernelUtils::ParallelFor(task, num);
}
// Initialize the returned index array (identity) and the selection mask (all
// true); NmsPass/ReducePass later clear suppressed entries.
// `output` and `box_size` are accepted but unused here.
template <typename T>
void NMSWithMaskCPUKernel<T>::Preprocess(const int num, int *sel_idx, bool *sel_boxes, T *output, int box_size) {
  auto seed_outputs = [sel_idx, sel_boxes](int begin, int stop) {
    for (int idx = begin; idx < stop; ++idx) {
      sel_idx[idx] = idx;
      sel_boxes[idx] = true;
    }
  };
  CPUKernelUtils::ParallelFor(seed_outputs, num);
}
// Returns true when BOTH boxes may be kept, i.e. their intersection-over-union
// does not exceed IOU_value. box_A_ix / box_B_ix are unused; the boxes are
// addressed via their flat start offsets into `output`.
template <typename T>
bool NMSWithMaskCPUKernel<T>::IouDecision(T *output, int box_A_ix, int box_B_ix, int box_A_start, int box_B_start,
                                          float IOU_value) {
  // Corners of the intersection rectangle.
  const T inter_x1 = std::max(output[box_A_start + 0], output[box_B_start + 0]);
  const T inter_y1 = std::max(output[box_A_start + 1], output[box_B_start + 1]);
  const T inter_x2 = std::min(output[box_A_start + 2], output[box_B_start + 2]);
  const T inter_y2 = std::min(output[box_A_start + 3], output[box_B_start + 3]);
  // Clamp to zero so disjoint boxes yield an empty intersection.
  const T inter_w = std::max(inter_x2 - inter_x1, T(0));
  const T inter_h = std::max(inter_y2 - inter_y1, T(0));
  const T inter_area = inter_w * inter_h;
  const T area_a =
    (output[box_A_start + 2] - output[box_A_start + 0]) * (output[box_A_start + 3] - output[box_A_start + 1]);
  const T area_b =
    (output[box_B_start + 2] - output[box_B_start + 0]) * (output[box_B_start + 3] - output[box_B_start + 1]);
  // IoU = intersection / (areaA + areaB - intersection).
  return !(inter_area / (area_a + area_b - inter_area) > static_cast<T>(IOU_value));
}
// Run the parallel NMS pass: every cell of the num x num row_mask matrix is
// updated with the correct IoU decision after having been initialized to true.
// Only the upper triangle (col > row) is evaluated; the rest keeps its initial
// true value. `sel_boxes` is accepted but unused here.
template <typename T>
void NMSWithMaskCPUKernel<T>::NmsPass(const int num, const float IOU_value, T *output, bool *sel_boxes, int box_size,
                                      bool *row_mask) {
  auto task = [&](int begin, int stop) {
    for (int cell = begin; cell < stop; ++cell) {
      const int row_box = cell / num;  // row in the 2-D row_mask matrix
      const int col_box = cell % num;  // column in the 2-D row_mask matrix
      if (col_box <= row_box) {
        continue;  // lower triangle / diagonal stays true
      }
      row_mask[cell] = IouDecision(output, row_box, col_box, row_box * box_size, col_box * box_size, IOU_value);
    }
  };
  CPUKernelUtils::ParallelFor(task, num * num);
}
// Reduce pass: fold the pairwise decisions into per-box selection flags.
// Boxes are walked from highest to lowest confidence; each still-selected box
// vetoes every later box its row of row_mask marks as overlapping too much.
template <typename T>
void NMSWithMaskCPUKernel<T>::ReducePass(const int num, bool *sel_boxes, bool *row_mask) {
  for (int keeper = 0; keeper < num - 1; ++keeper) {
    if (!sel_boxes[keeper]) {
      continue;  // already suppressed boxes veto nothing
    }
    const bool *mask_row = row_mask + keeper * num;
    auto apply_veto = [sel_boxes, mask_row](int begin, int stop) {
      for (int candidate = begin; candidate < stop; ++candidate) {
        sel_boxes[candidate] = sel_boxes[candidate] && mask_row[candidate];
      }
    };
    CPUKernelUtils::ParallelFor(apply_veto, num);
  }
}
// Reads the iou_threshold attribute and sanity-checks the node's operand counts.
template <typename T>
void NMSWithMaskCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  // IoU above this threshold suppresses the lower-confidence box.
  iou_value_ = AnfAlgo::GetNodeAttr<float>(kernel_node, "iou_threshold");
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  if (input_num != 1) {
    // NOTE(review): only logs — execution continues on a malformed graph;
    // consider MS_LOG(EXCEPTION) instead.
    MS_LOG(ERROR) << "Input num is " << input_num << ", but NMSWithMask needs 1 input.";
  }
  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  if (output_num != 3) {
    MS_LOG(ERROR) << "Output num is " << output_num << ", but NMSWithMask needs 3 outputs.";
  }
}
// Registers the three workspace buffers used by Launch: bitonic-sort keys,
// sort index permutation, and the N x N pairwise IoU mask.
template <typename T>
void NMSWithMaskCPUKernel<T>::InitInputOutputSize(const CNodePtr &kernel_node) {
  CPUKernel::InitInputOutputSize(kernel_node);
  auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
  num_input_ = input_shape[0];  // Get N values in [N, 5] data.
  // The bitonic sort operates on power-of-two sized buffers.
  ceil_power_2 = NmsRoundUpPower2(num_input_);
  workspace_size_list_.push_back(ceil_power_2 * sizeof(T));    // data buff
  workspace_size_list_.push_back(ceil_power_2 * sizeof(int));  // index buff
  workspace_size_list_.push_back(num_input_ * num_input_ * sizeof(bool));  // mask list
}
// Runs NMS: sort boxes by score, compute pairwise IoU decisions, then reduce
// them into the per-box selection mask.
template <typename T>
bool NMSWithMaskCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                     const std::vector<kernel::AddressPtr> &workspace,
                                     const std::vector<kernel::AddressPtr> &outputs) {
  auto input = reinterpret_cast<T *>(inputs[0]->addr);             // (N, 5) boxes: x1, y1, x2, y2, score
  auto data_buff = reinterpret_cast<T *>(workspace[0]->addr);      // sort keys, padded to ceil_power_2
  auto index_buff = reinterpret_cast<int *>(workspace[1]->addr);   // sort index permutation
  auto row_mask = reinterpret_cast<bool *>(workspace[2]->addr);    // N x N pairwise keep/suppress decisions
  auto output = reinterpret_cast<T *>(outputs[0]->addr);           // boxes, descending-score order
  auto sel_idx = reinterpret_cast<int *>(outputs[1]->addr);        // identity indices 0..N-1
  auto sel_boxes = reinterpret_cast<bool *>(outputs[2]->addr);     // true if the box survives NMS
  // Pipeline: sort -> init mask -> emit sorted boxes -> init outputs ->
  // pairwise IoU pass -> reduction into sel_boxes.
  NmsBitonicSortByKeyKernel(1, num_input_, ceil_power_2, input, data_buff, index_buff, box_size_);
  int total_val = num_input_ * num_input_;
  MaskInit(total_val, row_mask);
  PopulateOutput(input, output, index_buff, num_input_, box_size_, false);
  Preprocess(num_input_, sel_idx, sel_boxes, output, box_size_);
  NmsPass(num_input_, iou_value_, output, sel_boxes, box_size_, row_mask);
  ReducePass(num_input_, sel_boxes, row_mask);
  return true;
}
} // namespace kernel
} // namespace mindspore

View File

@ -0,0 +1,80 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NMS_WITH_MASK_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NMS_WITH_MASK_CPU_KERNEL_H_
#include <vector>
#include <algorithm>
#include <limits>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
// CPU kernel for NMSWithMask: sorts (N, 5) boxes by confidence score and
// suppresses overlapping boxes whose IoU exceeds the iou_threshold attribute.
template <typename T>
class NMSWithMaskCPUKernel : public CPUKernel {
 public:
  NMSWithMaskCPUKernel() = default;
  ~NMSWithMaskCPUKernel() override = default;
  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
  // Registers workspace buffers: sort keys, sort indices, pairwise IoU mask.
  void InitInputOutputSize(const CNodePtr &kernel_node) override;
 private:
  // Bitonic sort of box scores with index permutation, padded to ceil_power2.
  void NmsBitonicSortByKeyKernel(const int outer, const int inner, const int ceil_power2, T *input, T *data_buff,
                                 int *index_buff, int box_size);
  // Fill the numSq-element pairwise mask with true.
  void MaskInit(int numSq, bool *row_mask);
  // Copy boxes into the output in sorted order, optionally normalizing flipped corners.
  void PopulateOutput(T *data_in, T *data_out, int *index_buff, const int num, int box_size, bool flip_mode);
  // Initialize the output index array and selection mask.
  void Preprocess(const int num, int *sel_idx, bool *sel_boxes, T *output, int box_size);
  // True when both boxes may be kept (IoU not above IOU_value).
  bool IouDecision(T *output, int box_A_ix, int box_B_ix, int box_A_start, int box_B_start, float IOU_value);
  // Compute pairwise keep/suppress decisions into row_mask.
  void NmsPass(const int num, const float IOU_value, T *output, bool *sel_boxes, int box_size, bool *row_mask);
  // Fold row_mask into the final per-box selection flags.
  void ReducePass(const int num, bool *sel_boxes, bool *row_mask);
  int num_input_;       // N, number of input boxes
  float iou_value_;     // iou_threshold attribute
  size_t ceil_power_2;  // num_input_ rounded up to a power of two
  static const int box_size_ = 5;  // pre_defined box width
};
// Float32 and float16 variants. Outputs: sorted boxes, selected box indices
// (int32), and the post-NMS selection mask (bool).
MS_REG_CPU_KERNEL_T(NMSWithMask,
                    KernelAttr()
                      .AddInputAttr(kNumberTypeFloat32)
                      .AddOutputAttr(kNumberTypeFloat32)
                      .AddOutputAttr(kNumberTypeInt32)
                      .AddOutputAttr(kNumberTypeBool),
                    NMSWithMaskCPUKernel, float);
MS_REG_CPU_KERNEL_T(NMSWithMask,
                    KernelAttr()
                      .AddInputAttr(kNumberTypeFloat16)
                      .AddOutputAttr(kNumberTypeFloat16)
                      .AddOutputAttr(kNumberTypeInt32)
                      .AddOutputAttr(kNumberTypeBool),
                    NMSWithMaskCPUKernel, float16);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NMS_WITH_MASK_CPU_KERNEL_H_

View File

@ -0,0 +1,225 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/random_choice_with_mask_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
#define BLOCKSIZE 256
#define MAX_DIMENSION 5
namespace mindspore {
namespace kernel {
// Converts the flat (linearized) indices in `tmp_output` into multi-dimensional
// coordinates, written consecutively into `output` (input_dim_size ints each).
// dims: input tensor shape; output_length: number of indices to convert;
// input_total_count: product of dims.
void ParseOutputCoordinate(std::vector<int64_t> dims, int32_t output_length, int32_t input_dim_size,
                           int32_t input_total_count, const int *tmp_output, int *output) {
  int it = 0;
  int column = input_total_count / dims[0];  // stride of the outermost axis
  for (int i = 0; i < output_length; i++) {
    int32_t tmp_output_number = tmp_output[i];
    int tmp_column = column;  // stride of the axis currently being extracted
    for (int j = 0; j < input_dim_size; j++) {
      if (j == input_dim_size - 1) {
        // Innermost axis: the remaining value is the final coordinate.
        output[it++] = tmp_output_number;
        continue;
      }
      // Bug fix: divide by the CURRENT axis stride (tmp_column), not by the
      // outermost stride `column`, so tensors of rank >= 3 decompose correctly.
      output[it++] = tmp_output_number / tmp_column;
      tmp_output_number = tmp_output_number % tmp_column;
      tmp_column = tmp_column / dims[j + 1];
    }
  }
}
// Decides how many output slots are used and whether zero-padding is needed:
//   count == 0             -> emit every non-zero element, no padding;
//   0 < count <= non_zero  -> emit exactly `count` elements, no padding;
//   count > non_zero       -> emit `count` slots, padding past the real ones;
//   count < 0              -> invalid, raises.
void GetOutputLength(bool *padding_flag, int32_t *output_length, int32_t *output_non_zero_length, int32_t count,
                     int32_t non_zero_num) {
  if (count < 0) {
    MS_LOG(EXCEPTION) << "Input count must be greater than or equal to 0, but is " << count;
  } else if (count == 0) {
    *padding_flag = false;
    *output_length = non_zero_num;
    *output_non_zero_length = non_zero_num;
  } else if (count <= non_zero_num) {
    *padding_flag = false;
    *output_length = count;
    *output_non_zero_length = count;
  } else {
    *padding_flag = true;
    *output_length = count;
    *output_non_zero_length = non_zero_num;
  }
}
// Accumulates the product of the first `input_dim_size` dimensions into
// *input_total_count (the caller initializes it, normally to 1).
void GetInputTotalCount(const std::vector<int64_t> &dims_, int32_t *input_total_count, const int32_t &input_dim_size) {
  int32_t axis = 0;
  while (axis < input_dim_size) {
    *input_total_count *= dims_[axis];
    ++axis;
  }
}
// Zero-pads the coordinate slots beyond the real samples and builds the
// boolean validity mask: true for sampled positions, false for padded ones.
void UpdateOutput(const std::vector<int64_t> &dims_, const int32_t &non_zero_num, const int32_t &count_,
                  const int32_t &output_length, const int *mask_dim, int32_t *output_coordinate, bool *mask) {
  const int32_t rank = static_cast<int32_t>(dims_.size());
  // Clear coordinate slots that were not filled with real samples.
  for (int32_t pos = non_zero_num * rank; pos < count_ * rank; ++pos) {
    output_coordinate[pos] = 0;
  }
  // Copy the per-sample validity flags ...
  for (int32_t pos = 0; pos < output_length; ++pos) {
    mask[pos] = (mask_dim[pos] != 0);
  }
  // ... and force the padded tail to invalid.
  for (int32_t pos = non_zero_num; pos < count_; ++pos) {
    mask[pos] = false;
  }
}
// Validates operand counts, reads the seed/seed2/count attributes, and records
// the input tensor shape.
void RandomChoiceWithMaskCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  if (input_num != 1) {
    // NOTE(review): only logs — execution continues; consider MS_LOG(EXCEPTION).
    MS_LOG(ERROR) << "Input num is " << input_num << ", but RandomChoiceWithMask needs 1 input.";
  }
  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  if (output_num != 2) {
    MS_LOG(ERROR) << "Output num is " << output_num << ", but RandomChoiceWithMask needs 2 outputs.";
  }
  auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
  input_shape_size_ = input_shape.size();
  if (input_shape_size_ < 1 || input_shape_size_ > MAX_DIMENSION) {
    MS_LOG(ERROR) << "Input is " << input_shape_size_
                  << "-D, but RandomChoiceWithMask supports only 1-D to 5-D inputs.";
  }
  // RNG seeds and the number of samples to draw, from the op attributes.
  seed_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "seed"));
  seed2_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "seed2"));
  count_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "count"));
  MS_LOG(INFO) << "This op attr count is " << count_;
  // Record the input shape (input_num is 1, so this is just the first input's dims).
  for (size_t i = 0; i < input_num; i++) {
    auto input_i_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, i);
    for (size_t j = 0; j < input_i_shape.size(); j++) {
      dims_.emplace_back(input_i_shape[j]);
    }
  }
}
// Samples (with replacement) the flat indices of true entries of the boolean
// input, converts them to multi-dimensional coordinates, and emits a validity
// mask. Fixes over the original implementation:
//  - memset_s was called with ELEMENT counts as the byte-count arguments, so
//    only a quarter of each int buffer was actually cleared;
//  - MS_LOG(EXCEPTION) throws, so every delete[] placed after it was dead code
//    and each error path leaked all prior allocations;
//  - raw new[]/delete[] replaced with RAII std::vector buffers.
bool RandomChoiceWithMaskCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                           const std::vector<kernel::AddressPtr> &,
                                           const std::vector<kernel::AddressPtr> &outputs) {
  auto *input = reinterpret_cast<bool *>(inputs[0]->addr);
  auto *output_coordinate = reinterpret_cast<int32_t *>(outputs[0]->addr);
  auto *mask = reinterpret_cast<bool *>(outputs[1]->addr);
  int32_t input_dim_size = dims_.size();
  if (input_dim_size < 1 || input_dim_size > 5) {
    MS_LOG(EXCEPTION) << "Input dim size is " << input_dim_size << ", which is not supported.";
  }
  // seed2 has priority over seed; fall back to the generator when both are 0.
  int seedc = seed2_ != 0 ? seed2_ : (seed_ != 0 ? seed_ : generator_());
  int32_t input_total_count = 1;
  GetInputTotalCount(dims_, &input_total_count, input_dim_size);
  // Flat indices of every true element of the input mask.
  std::vector<int> input_dim;
  input_dim.reserve(input_total_count);
  for (int32_t i = 0; i < input_total_count; i++) {
    if (input[i] != 0) {
      input_dim.push_back(i);
    }
  }
  int32_t non_zero_num = static_cast<int32_t>(input_dim.size());
  bool padding_flag = false;
  int32_t output_length = 0;
  int32_t output_non_zero_length = 0;
  GetOutputLength(&padding_flag, &output_length, &output_non_zero_length, count_, non_zero_num);
  if (output_length * input_dim_size >= INT_MAX || output_length * input_dim_size < 0) {
    MS_LOG(EXCEPTION) << "Output size exceed INT_MAX";
  }
  // Zero-initialized vectors: the padding tail needs no separate clearing pass.
  std::vector<int> tmp_output(output_length, 0);
  std::vector<int> mask_dim(output_length, 0);
  if (output_non_zero_length > 0) {
    // Sample (with replacement) from the collected non-zero indices.
    std::mt19937 gen(seedc);
    std::uniform_int_distribution<> dis(0, non_zero_num - 1);
    for (int32_t i = 0; i < output_non_zero_length; i++) {
      tmp_output[i] = input_dim[dis(gen)];
      mask_dim[i] = 1;
    }
  }
  int32_t copy_output_length = output_length * input_dim_size;
  std::vector<int> output(copy_output_length, 0);
  ParseOutputCoordinate(dims_, output_length, input_dim_size, input_total_count, tmp_output.data(), output.data());
  // The output tensor holds exactly count_ * rank coordinates; never copy more.
  int32_t actual_output_length = count_ * dims_.size();
  copy_output_length = std::min(actual_output_length, copy_output_length);
  if (INT_MAX / static_cast<int>(sizeof(int32_t)) < copy_output_length) {
    MS_LOG(EXCEPTION) << "The output length is out of range!";
  }
  int32_t copy_output_bytes = copy_output_length * static_cast<int32_t>(sizeof(int32_t));
  if (copy_output_bytes > 0) {
    (void)memcpy_s(output_coordinate, copy_output_bytes, output.data(), copy_output_bytes);
  }
  UpdateOutput(dims_, non_zero_num, count_, output_length, mask_dim.data(), output_coordinate, mask);
  return true;
}
} // namespace kernel
} // namespace mindspore

View File

@ -0,0 +1,55 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_RANDOM_CHOICE_WITH_MASK_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_RANDOM_CHOICE_WITH_MASK_CPU_KERNEL_H_
#include <vector>
#include <random>
#include <algorithm>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
// CPU kernel for RandomChoiceWithMask: randomly samples coordinates of true
// entries from a boolean tensor and returns them with a validity mask.
class RandomChoiceWithMaskCPUKernel : public CPUKernel {
 public:
  RandomChoiceWithMaskCPUKernel() = default;
  ~RandomChoiceWithMaskCPUKernel() override = default;
  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
              const std::vector<AddressPtr> &outputs) override;
 private:
  int32_t count_{0};           // number of samples to emit (0 = every non-zero entry)
  std::vector<int64_t> dims_;  // input tensor shape
  int input_shape_size_{0};    // input rank (1..5 supported)
  int seed_{0};                // fallback RNG seed
  int seed2_{0};               // preferred RNG seed (used when non-zero)
  int input_size_{1};          // NOTE(review): never written in the visible sources — possibly unused
  std::mt19937 generator_;     // derives a seed when both seed attrs are 0
};
// Input: boolean tensor. Outputs: sampled coordinates (int32) and their validity mask (bool).
MS_REG_CPU_KERNEL(
  RandomChoiceWithMask,
  KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeBool),
  RandomChoiceWithMaskCPUKernel);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_RANDOM_CHOICE_WITH_MASK_CPU_KERNEL_H_

View File

@ -0,0 +1,223 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/roi_align_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
namespace {
// Expected operand counts for ROIAlign: inputs are (features, rois), one output.
constexpr size_t kInputSize = 2;
constexpr size_t kOutputSize = 1;
}  // namespace
// Caches the feature/ROI shapes and the pooling attributes from the graph node.
template <typename T>
void ROIAlignCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  // Get the input shapes
  auto x_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);     // features, indexed as (N, C, H, W)
  auto rois_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);  // rois: (roi_rows, roi_cols)
  auto x_shape_size = x_shape.size();
  if (x_shape_size != 4) {
    // NOTE(review): only logs — execution continues and the indexing below would
    // read out of bounds on a non-4D shape; consider MS_LOG(EXCEPTION).
    MS_LOG(ERROR) << "x shape size is " << x_shape_size << ", but should be 4.";
  }
  channels_ = x_shape[1];
  height_ = x_shape[2];
  width_ = x_shape[3];
  roi_rows_ = rois_shape[0];
  roi_cols_ = rois_shape[1];
  // Pooling attributes from the front-end op definition.
  pooled_height_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "pooled_height"));
  pooled_width_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "pooled_width"));
  spatial_scale_ = static_cast<T>(AnfAlgo::GetNodeAttr<float>(kernel_node, "spatial_scale"));
  sample_num_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "sample_num"));
  roi_end_mode_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "roi_end_mode"));
}
// Computes ROIAlign: for each output element, averages bilinear samples taken
// on a roi_bin_grid_h x roi_bin_grid_w grid inside the corresponding bin.
template <typename T>
bool ROIAlignCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                  const std::vector<kernel::AddressPtr> &,
                                  const std::vector<kernel::AddressPtr> &outputs) {
  const T *input = reinterpret_cast<T *>(inputs[0]->addr);
  const T *rois = reinterpret_cast<T *>(inputs[1]->addr);
  auto out_data = reinterpret_cast<T *>(outputs[0]->addr);
  // One work item per output element, flattened over (n, c, ph, pw).
  size_t elem_num = roi_rows_ * channels_ * pooled_height_ * pooled_width_;
  auto task = [&](size_t start, size_t end) {
    for (size_t thread_idx = start; thread_idx < end; thread_idx++) {
      int n = thread_idx / pooled_width_ / pooled_height_ / channels_;  // ROI row index
      const T *roi_box = rois + n * roi_cols_;
      // Skip ROI rows whose roi_box[1] and roi_box[3] are within +/-0.001 of
      // zero (padding rows). NOTE(review): the output element is then left
      // unwritten, which assumes the output buffer is pre-zeroed — confirm.
      if (roi_box[1] < static_cast<T>(0.001) && roi_box[3] < static_cast<T>(0.001) &&
          roi_box[1] > static_cast<T>(-0.001) && roi_box[3] > static_cast<T>(-0.001)) {
        continue;
      }
      int offset = -1;
      int c, ph, pw, roi_bin_grid_h, roi_bin_grid_w;
      T bin_size_h, bin_size_w, roi_start_h, roi_start_w;
      // Decode (n, c, ph, pw), the bin geometry and the base input offset for
      // this output element (helper defined elsewhere in this file).
      bin_box(thread_idx, rois, roi_cols_, spatial_scale_, sample_num_, roi_end_mode_, channels_, height_, width_,
              pooled_height_, pooled_width_, &offset, &n, &c, &ph, &pw, &roi_bin_grid_h, &roi_bin_grid_w, &bin_size_h,
              &bin_size_w, &roi_start_h, &roi_start_w);
      // (n, c, ph, pw) is the base param of pooled map
      const T count_points_in_grid_cell = static_cast<T>(roi_bin_grid_h * roi_bin_grid_w);
      T accumulate_val = static_cast<T>(0.);
      for (int iy = 0; iy < roi_bin_grid_h; iy++) {
        // Shift half point RIGHT for y / x, while previous scaled roi shift half point LEFT
        const T y = roi_start_h + static_cast<T>(ph) * bin_size_h +
                    static_cast<T>(iy + .5f) * bin_size_h / static_cast<T>(roi_bin_grid_h);
        for (int ix = 0; ix < roi_bin_grid_w; ix++) {
          const T x = roi_start_w + static_cast<T>(pw) * bin_size_w +
                      static_cast<T>(ix + .5f) * bin_size_w / static_cast<T>(roi_bin_grid_w);
          // bilinear interpolate by shifted y / x
          // calculate bilinear interpolation
          int x_low = 0, y_low = 0, x_high = 0, y_high = 0;
          T w1, w2, w3, w4;
          bilinear_interpolate(height_, width_, y, x, &x_low, &y_low, &x_high, &y_high, &w1, &w2, &w3, &w4);
          // Out-of-range samples come back with corner indices of -1 and are skipped.
          if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0 && y_low < height_ && y_high < height_ &&
              x_low < width_ && x_high < width_) {
            T v1 = input[offset + y_low * width_ + x_low];
            T v2 = input[offset + y_low * width_ + x_high];
            T v3 = input[offset + y_high * width_ + x_low];
            T v4 = input[offset + y_high * width_ + x_high];
            T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
            accumulate_val += val;
          }
        }
      }
      // Average over the sampling grid.
      accumulate_val /= count_points_in_grid_cell;
      out_data[thread_idx] = accumulate_val;
    }
  };
  CPUKernelUtils::ParallelFor(task, elem_num);
  return true;
}
// Validates the number of input/output addresses handed to Launch.
// @param inputs  must contain exactly kInputSize entries (features, rois).
// @param outputs must contain exactly kOutputSize entries.
// Aborts via MS_LOG(EXCEPTION) on a mismatch.
template <typename T>
void ROIAlignCPUKernel<T>::CheckParam(const std::vector<kernel::AddressPtr> &inputs,
                                      const std::vector<kernel::AddressPtr> &outputs) {
  if (inputs.size() != kInputSize) {
    MS_LOG(EXCEPTION) << "Input number is: " << inputs.size() << ", but ROIAlign needs " << kInputSize << " inputs.";
  }
  if (outputs.size() != kOutputSize) {
    // Fix: the original message lacked the space before "outputs.", producing e.g. "needs 1outputs.".
    MS_LOG(EXCEPTION) << "Output number is: " << outputs.size() << ", but ROIAlign needs " << kOutputSize
                      << " outputs.";
  }
}
// Computes the four corner indices (x_low/x_high, y_low/y_high) and bilinear
// weights (w1..w4) for sampling the feature map at fractional point (y, x).
// Points more than one pixel outside the image get zero weights and -1
// indices so the caller can skip them.
template <typename T>
void ROIAlignCPUKernel<T>::bilinear_interpolate(const int height, const int width, T y, T x, int *x_low, int *y_low,
                                                int *x_high, int *y_high, T *w1, T *w2, T *w3, T *w4) {
  constexpr float eps = 0.00007;  // tolerance for low-precision (e.g. float16) rounding near zero
  if (y < static_cast<T>(-1.0) || y > static_cast<T>(height) || x < static_cast<T>(-1.0) || x > static_cast<T>(width)) {
    *w1 = *w2 = *w3 = *w4 = static_cast<T>(0);
    *x_low = *x_high = *y_low = *y_high = -1;
    return;
  }
  // low bounder is at least zero
  y = y <= static_cast<T>(.0) ? static_cast<T>(.0) : y;
  x = x <= static_cast<T>(.0) ? static_cast<T>(.0) : x;
  // top left point
  *y_low = (y <= static_cast<T>(eps) ? 0 : static_cast<int>(floor(y)));
  *x_low = (x <= static_cast<T>(eps) ? 0 : static_cast<int>(floor(x)));
  // bottom right point
  if (*y_low >= height - 1) {
    *y_high = *y_low = height - 1;  // clamp to last row; interpolation degenerates along y
    y = static_cast<T>(*y_low);
  } else {
    *y_high = *y_low + 1;
  }
  if (*x_low >= width - 1) {
    *x_high = *x_low = width - 1;  // clamp to last column; interpolation degenerates along x
    x = static_cast<T>(*x_low);
  } else {
    *x_high = *x_low + 1;
  }
  // distance to nearest points
  T lx, ly, hx, hy;
  ly = y - static_cast<T>(*y_low), lx = x - static_cast<T>(*x_low);
  hy = static_cast<T>(1.) - ly, hx = static_cast<T>(1.) - lx;
  // weight is evaluated by the distance to point away.
  // the closer to point home, the more weight, the farther to point away.
  *w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx;
  return;
}
// Decodes flat output index `thread_idx` into pooled coordinates (n, c, ph, pw)
// and derives this ROI bin's sampling geometry: the feature-map offset of the
// (batch, channel) plane, the bin size, the ROI origin, and the number of
// sampling points per bin side.
template <typename T>
void ROIAlignCPUKernel<T>::bin_box(int thread_idx, const T *roi_boxes, int roi_cols, const T spatial_scale,
                                   const int sample_num, int roi_end_mode, const int channels, const int height,
                                   const int width, const int pooled_height, const int pooled_width, int *offset,
                                   int *n, int *c, int *ph, int *pw, int *roi_bin_grid_h, int *roi_bin_grid_w,
                                   T *bin_size_h, T *bin_size_w, T *roi_start_h, T *roi_start_w) {
  // (n, c, ph, pw) is the base param of pooled map
  *pw = thread_idx % pooled_width;
  *ph = (thread_idx / pooled_width) % pooled_height;
  *c = (thread_idx / pooled_width / pooled_height) % channels;
  *n = thread_idx / pooled_width / pooled_height / channels;
  // Roi has
  // 1. 4 points, or
  // 2. indicator + 4 points (1 + 4)
  const T *roi_box = roi_boxes + (*n) * roi_cols;
  int roi_batch_ind = 0;
  if (roi_cols == 5) {
    // First column carries the batch index; rint plus a small eps guards
    // against downward rounding when T is a low-precision float.
    roi_batch_ind = static_cast<int>(rint(static_cast<float>(roi_box[0]) + static_cast<float>(0.00007)));
    roi_box++;
  }
  // Scale and shift ROI
  *roi_start_w = roi_box[0] * spatial_scale;
  *roi_start_h = roi_box[1] * spatial_scale;
  T roi_end_w = (roi_box[2] + static_cast<T>(roi_end_mode)) * spatial_scale;
  T roi_end_h = (roi_box[3] + static_cast<T>(roi_end_mode)) * spatial_scale;
  // New ROI height/width
  T roi_width = roi_end_w - (*roi_start_w);
  T roi_height = roi_end_h - (*roi_start_h);
  if (roi_end_mode == 0) {  // backward compatibility
    // Force malformed ROIs to be 1x1
    roi_width = roi_width > static_cast<T>(1.0) ? roi_width : static_cast<T>(1.0);
    roi_height = roi_height > static_cast<T>(1.0) ? roi_height : static_cast<T>(1.0);
  }
  // ratio of roi / pooled
  *bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
  *bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
  *offset = (roi_batch_ind * channels + (*c)) * height * width;  // start of the (batch, channel) plane
  // grid (int) by Sample ratio if defined, otherwise by pooled H/W
  *roi_bin_grid_h = (sample_num > 0) ? sample_num : static_cast<int>(floor(roi_height / static_cast<T>(pooled_height)));
  *roi_bin_grid_w = (sample_num > 0) ? sample_num : static_cast<int>(floor(roi_width / static_cast<T>(pooled_width)));
  return;
}
} // namespace kernel
} // namespace mindspore

View File

@ -0,0 +1,72 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ROI_ALIGN_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ROI_ALIGN_CPU_KERNEL_H_
#include <vector>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
// CPU implementation of the ROIAlign operator: averages bilinearly sampled
// feature-map values inside each pooled ROI bin.
template <typename T>
class ROIAlignCPUKernel : public CPUKernel {
 public:
  ROIAlignCPUKernel() = default;
  ~ROIAlignCPUKernel() override = default;
  // Reads shapes and node attributes (pooled size, spatial scale, sample num, end mode).
  void InitKernel(const CNodePtr &kernel_node) override;
  // Computes the pooled output; the unnamed middle parameter (workspace) is unused.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
              const std::vector<AddressPtr> &outputs) override;
 private:
  // Validates input/output counts against kInputSize/kOutputSize.
  void CheckParam(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &outputs);
  // Corner indices and weights for bilinear sampling at fractional (y, x).
  void bilinear_interpolate(const int height, const int width, T y, T x, int *x_low, int *y_low, int *x_high,
                            int *y_high, T *w1, T *w2, T *w3, T *w4);
  // Decodes one flat output index into ROI bin geometry and the sampling grid.
  void bin_box(int thread_idx, const T *roi_boxes, int roi_cols, const T spatial_scale, const int sample_num,
               int roi_end_mode, const int channels, const int height, const int width, const int pooled_height,
               const int pooled_width, int *offset, int *n, int *c, int *ph, int *pw, int *roi_bin_grid_h,
               int *roi_bin_grid_w, T *bin_size_h, T *bin_size_w, T *roi_start_h, T *roi_start_w);
  int pooled_height_;  // pooled output rows
  int pooled_width_;   // pooled output cols
  T spatial_scale_;    // scale multiplied onto ROI coordinates in bin_box
  int sample_num_;     // samples per bin side; <= 0 means derive from bin size
  int roi_end_mode_;   // added to the ROI end coordinates; 0 enables legacy 1x1 clamping
  int roi_rows_;       // number of ROI boxes
  int roi_cols_;       // values per ROI row: 4 (box) or 5 (batch indicator + box)
  int channels_;       // feature-map channels
  int height_;         // feature-map height
  int width_;          // feature-map width
};
// Register the float32 and float16 variants of the ROIAlign CPU kernel.
MS_REG_CPU_KERNEL_T(
  ROIAlign,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  ROIAlignCPUKernel, float);
MS_REG_CPU_KERNEL_T(
  ROIAlign,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  ROIAlignCPUKernel, float16);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ROI_ALIGN_CPU_KERNEL_H_

View File

@ -0,0 +1,280 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/roi_align_grad_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
// Helper for AtomicAdd: performs `*address += val` lock-free. The payload is
// reinterpreted as an unsigned integer U of identical width and a
// compare-and-swap is retried until no other thread intervened.
template <typename T, typename U>
void AtomicAddTask(T *address, T val) {
  U *target_bits = reinterpret_cast<U *>(address);
  U observed = *target_bits;
  while (true) {
    U expected = observed;
    T next_value = *reinterpret_cast<T *>(&expected) + static_cast<T>(val);
    U next_bits = *reinterpret_cast<U *>(&next_value);
    observed = __sync_val_compare_and_swap(target_bits, expected, next_bits);
    if (observed == expected) {
      break;  // no concurrent writer raced us; the addition landed
    }
  }
}
// Atomically adds `val` to `*address`, dispatching on the byte width of T to
// the unsigned carrier type of the same size. Widths other than 1/2/4/8 are
// silently ignored, matching the switch-based dispatch this replaces.
template <typename T>
void AtomicAdd(T *address, T val) {
  const auto width = sizeof(T);
  if (width == 1) {
    AtomicAddTask<T, uint8_t>(address, val);
  } else if (width == 2) {
    AtomicAddTask<T, uint16_t>(address, val);
  } else if (width == 4) {
    AtomicAddTask<T, uint32_t>(address, val);
  } else if (width == 8) {
    AtomicAddTask<T, uint64_t>(address, val);
  }
}
// Validates input/output counts and the rank of the dy input.
// Fix: failures previously used MS_LOG(ERROR), which only logs and lets
// InitKernel continue with invalid shapes; the sibling CPU kernels in this
// change abort with MS_LOG(EXCEPTION), so do the same here.
template <typename T>
void ROIAlignGradCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) {
  // Get the number of the input args
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  if (input_num != 2) {
    MS_LOG(EXCEPTION) << "Input number is: " << input_num << ", but ROIAlignGrad needs 2 inputs.";
  }
  // Get the number of the output args
  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  if (output_num != 1) {
    MS_LOG(EXCEPTION) << "Output number is: " << output_num << ", but ROIAlignGrad needs 1 output.";
  }
  // Get the input shapes
  auto dy_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
  auto dy_shape_size = dy_shape.size();
  if (dy_shape_size != 4) {
    MS_LOG(EXCEPTION) << "dy shape size is " << dy_shape_size << ", but should be 4.";
  }
}
// Reads node attributes and caches the shape bookkeeping used by Launch.
// Fix: validate the ranks of rois and xdiff_shape before indexing into them;
// the original dereferenced rois_shape[0..1] and xdiff_shape_[0..3] unchecked.
template <typename T>
void ROIAlignGradCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  CheckParam(kernel_node);
  auto rois_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
  if (rois_shape.size() < 2) {
    MS_LOG(EXCEPTION) << "rois shape size is " << rois_shape.size() << ", but should be 2.";
  }
  roi_rows_ = rois_shape[0];  // number of ROI boxes
  roi_cols_ = rois_shape[1];  // 4 (box) or 5 (batch indicator + box)
  std::vector<int64_t> xdiff_shape_me = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, "xdiff_shape");
  (void)std::transform(xdiff_shape_me.begin(), xdiff_shape_me.end(), std::back_inserter(xdiff_shape_),
                       [](const int64_t &value) { return static_cast<int>(value); });
  if (xdiff_shape_.size() < 4) {
    MS_LOG(EXCEPTION) << "xdiff_shape size is " << xdiff_shape_.size() << ", but should be 4.";
  }
  pooled_height_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "pooled_height"));
  pooled_width_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "pooled_width"));
  spatial_scale_ = static_cast<T>(AnfAlgo::GetNodeAttr<float>(kernel_node, "spatial_scale"));
  sample_num_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "sample_num"));
  roi_end_mode_ = 1;  // the gradient kernel always uses the add-1 end mode
  // xdiff_shape is the NCHW shape of the dx output.
  batch_size_ = xdiff_shape_[0];
  channels_ = xdiff_shape_[1];
  height_ = xdiff_shape_[2];
  width_ = xdiff_shape_[3];
}
// Backward ROIAlign: zero-fills dx, then redistributes each dy element to the
// four feature-map corners of every sampled point, using the same bilinear
// weights as the forward pass. AtomicAdd is needed because overlapping ROIs
// make many threads write to the same dx locations.
template <typename T>
bool ROIAlignGradCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                      const std::vector<kernel::AddressPtr> &,
                                      const std::vector<kernel::AddressPtr> &outputs) {
  const T *dy = reinterpret_cast<T *>(inputs[0]->addr);    // incoming gradient, shape (rois, C, ph, pw)
  const T *rois = reinterpret_cast<T *>(inputs[1]->addr);  // ROI boxes, one row per ROI
  T *dx = reinterpret_cast<T *>(outputs[0]->addr);         // feature-map gradient, NCHW
  // Pass 1: zero-initialize the whole dx buffer in parallel.
  size_t size_init = batch_size_ * channels_ * height_ * width_;
  auto task1 = [&](size_t start, size_t end) {
    for (size_t thread_idx = start; thread_idx < end; thread_idx++) {
      dx[thread_idx] = static_cast<T>(0.);
    }
  };
  CPUKernelUtils::ParallelFor(task1, size_init);
  // Pass 2: one work item per dy element.
  size_t elem_num = roi_rows_ * channels_ * pooled_height_ * pooled_width_;
  auto task2 = [&](size_t start, size_t end) {
    for (size_t thread_idx = start; thread_idx < end; thread_idx++) {
      int n = thread_idx / pooled_width_ / pooled_height_ / channels_;  // ROI index of this element
      const T *roi_box = rois + n * roi_cols_;
      // Skip boxes whose 2nd and 4th fields are numerically zero (within 1e-3);
      // mirrors the forward kernel's degenerate-box test.
      if (roi_box[1] < static_cast<T>(0.001) && roi_box[3] < static_cast<T>(0.001) &&
          roi_box[1] > static_cast<T>(-0.001) && roi_box[3] > static_cast<T>(-0.001)) {
        continue;
      }
      int offset = -1;
      int c, ph, pw, roi_bin_grid_h, roi_bin_grid_w;
      T bin_size_h, bin_size_w, roi_start_h, roi_start_w;
      // Decode (n, c, ph, pw), the channel-plane offset, bin geometry and sampling grid.
      bin_box(thread_idx, rois, roi_cols_, spatial_scale_, sample_num_, roi_end_mode_, channels_, height_, width_,
              pooled_height_, pooled_width_, &offset, &n, &c, &ph, &pw, &roi_bin_grid_h, &roi_bin_grid_w, &bin_size_h,
              &bin_size_w, &roi_start_h, &roi_start_w);
      // (n, c, ph, pw) is the base param of pooled map
      const T count_points_in_grid_cell = static_cast<T>(roi_bin_grid_h * roi_bin_grid_w);
      int top_offset = (n * channels_ + c) * pooled_height_ * pooled_width_;
      const T *offset_top_diff = dy + top_offset;
      const T top_diff_this_bin = offset_top_diff[ph * pooled_width_ + pw];  // gradient of this pooled bin
      for (int iy = 0; iy < roi_bin_grid_h; iy++) {
        // Shift half point RIGHT for y / x, while previous scaled roi shift half point LEFT
        const T y = roi_start_h + static_cast<T>(ph) * bin_size_h +
                    static_cast<T>(iy + .5f) * bin_size_h / static_cast<T>(roi_bin_grid_h);
        for (int ix = 0; ix < roi_bin_grid_w; ix++) {
          const T x = roi_start_w + static_cast<T>(pw) * bin_size_w +
                      static_cast<T>(ix + .5f) * bin_size_w / static_cast<T>(roi_bin_grid_w);
          // bilinear interpolate by shifted y / x
          // calculate bilinear interpolation
          int x_low = 0, y_low = 0, x_high = 0, y_high = 0;
          T w1, w2, w3, w4;
          bilinear_interpolate(height_, width_, y, x, &x_low, &y_low, &x_high, &y_high, &w1, &w2, &w3, &w4);
          // Out-of-image samples come back with indices -1 and zero weights; skip them.
          if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0 && y_low < height_ && y_high < height_ &&
              x_low < width_ && x_high < width_) {
            // Split the bin gradient across the four corners; divide by the
            // number of sampled points to invert the forward averaging.
            T g1 = top_diff_this_bin * w1 / count_points_in_grid_cell;
            T g2 = top_diff_this_bin * w2 / count_points_in_grid_cell;
            T g3 = top_diff_this_bin * w3 / count_points_in_grid_cell;
            T g4 = top_diff_this_bin * w4 / count_points_in_grid_cell;
            T *dx_1 = dx + offset + y_low * width_ + x_low;
            T *dx_2 = dx + offset + y_low * width_ + x_high;
            T *dx_3 = dx + offset + y_high * width_ + x_low;
            T *dx_4 = dx + offset + y_high * width_ + x_high;
            AtomicAdd(dx_1, g1);
            AtomicAdd(dx_2, g2);
            AtomicAdd(dx_3, g3);
            AtomicAdd(dx_4, g4);
          }
        }
      }
    }
  };
  CPUKernelUtils::ParallelFor(task2, elem_num);
  return true;
}
// Computes the four corner indices (x_low/x_high, y_low/y_high) and bilinear
// weights (w1..w4) for sampling at fractional point (y, x). Points more than
// one pixel outside the image get zero weights and -1 indices so the caller
// can skip them.
template <typename T>
void ROIAlignGradCPUKernel<T>::bilinear_interpolate(const int height, const int width, T y, T x, int *x_low, int *y_low,
                                                    int *x_high, int *y_high, T *w1, T *w2, T *w3, T *w4) {
  constexpr float eps = 0.00007;  // tolerance for low-precision (e.g. float16) rounding near zero
  if (y < static_cast<T>(-1.0) || y > static_cast<T>(height) || x < static_cast<T>(-1.0) || x > static_cast<T>(width)) {
    *w1 = *w2 = *w3 = *w4 = static_cast<T>(0);
    *x_low = *x_high = *y_low = *y_high = -1;
    return;
  }
  // low bounder is at least zero
  y = y <= static_cast<T>(.0) ? static_cast<T>(.0) : y;
  x = x <= static_cast<T>(.0) ? static_cast<T>(.0) : x;
  // top left point
  *y_low = (y <= static_cast<T>(eps) ? 0 : static_cast<int>(floor(y)));
  *x_low = (x <= static_cast<T>(eps) ? 0 : static_cast<int>(floor(x)));
  // bottom right point
  if (*y_low >= height - 1) {
    *y_high = *y_low = height - 1;  // clamp to last row; interpolation degenerates along y
    y = static_cast<T>(*y_low);
  } else {
    *y_high = *y_low + 1;
  }
  if (*x_low >= width - 1) {
    *x_high = *x_low = width - 1;  // clamp to last column; interpolation degenerates along x
    x = static_cast<T>(*x_low);
  } else {
    *x_high = *x_low + 1;
  }
  // distance to nearest points
  T lx, ly, hx, hy;
  ly = y - static_cast<T>(*y_low), lx = x - static_cast<T>(*x_low);
  hy = static_cast<T>(1.) - ly, hx = static_cast<T>(1.) - lx;
  // weight is evaluated by the distance to point away.
  // the closer to point home, the more weight, the farther to point away.
  *w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx;
  return;
}
// Decodes flat index `thread_idx` into pooled coordinates (n, c, ph, pw) and
// derives this ROI bin's sampling geometry: the feature-map offset of the
// (batch, channel) plane, the bin size, the ROI origin, and the number of
// sampling points per bin side.
template <typename T>
void ROIAlignGradCPUKernel<T>::bin_box(int thread_idx, const T *roi_boxes, int roi_cols, const T spatial_scale,
                                       const int sample_num, int roi_end_mode, const int channels, const int height,
                                       const int width, const int pooled_height, const int pooled_width, int *offset,
                                       int *n, int *c, int *ph, int *pw, int *roi_bin_grid_h, int *roi_bin_grid_w,
                                       T *bin_size_h, T *bin_size_w, T *roi_start_h, T *roi_start_w) {
  // (n, c, ph, pw) is the base param of pooled map
  *pw = thread_idx % pooled_width;
  *ph = (thread_idx / pooled_width) % pooled_height;
  *c = (thread_idx / pooled_width / pooled_height) % channels;
  *n = thread_idx / pooled_width / pooled_height / channels;
  // Roi has
  // 1. 4 points, or
  // 2. indicator + 4 points (1 + 4)
  const T *roi_box = roi_boxes + (*n) * roi_cols;
  int roi_batch_ind = 0;
  if (roi_cols == 5) {
    // First column carries the batch index; rint plus a small eps guards
    // against downward rounding when T is a low-precision float.
    roi_batch_ind = static_cast<int>(rint(static_cast<float>(roi_box[0]) + static_cast<float>(0.00007)));
    roi_box++;
  }
  // Scale and shift ROI
  *roi_start_w = roi_box[0] * spatial_scale;
  *roi_start_h = roi_box[1] * spatial_scale;
  T roi_end_w = (roi_box[2] + static_cast<T>(roi_end_mode)) * spatial_scale;
  T roi_end_h = (roi_box[3] + static_cast<T>(roi_end_mode)) * spatial_scale;
  // New ROI height/width
  T roi_width = roi_end_w - (*roi_start_w);
  T roi_height = roi_end_h - (*roi_start_h);
  if (roi_end_mode == 0) {  // backward compatibility
    // Force malformed ROIs to be 1x1
    roi_width = roi_width > static_cast<T>(1.0) ? roi_width : static_cast<T>(1.0);
    roi_height = roi_height > static_cast<T>(1.0) ? roi_height : static_cast<T>(1.0);
  }
  // ratio of roi / pooled
  *bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
  *bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
  *offset = (roi_batch_ind * channels + (*c)) * height * width;  // start of the (batch, channel) plane
  // grid (int) by Sample ratio if defined, otherwise by pooled H/W
  *roi_bin_grid_h = (sample_num > 0) ? sample_num : static_cast<int>(floor(roi_height / static_cast<T>(pooled_height)));
  *roi_bin_grid_w = (sample_num > 0) ? sample_num : static_cast<int>(floor(roi_width / static_cast<T>(pooled_width)));
  return;
}
} // namespace kernel
} // namespace mindspore

View File

@ -0,0 +1,75 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Fix: the include guard previously reused
// MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ROI_ALIGN_CPU_KERNEL_H_, the
// same symbol as roi_align_cpu_kernel.h, so including both headers in one
// translation unit would silently drop this one's contents. The closing
// #endif comment already named the GRAD symbol; the guard now matches it.
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ROI_ALIGN_GRAD_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ROI_ALIGN_GRAD_CPU_KERNEL_H_
#include <vector>
#include <algorithm>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
// CPU kernel for the gradient of ROIAlign: scatters the incoming gradient dy
// back onto the feature-map gradient dx using bilinear weights.
template <typename T>
class ROIAlignGradCPUKernel : public CPUKernel {
 public:
  ROIAlignGradCPUKernel() = default;
  ~ROIAlignGradCPUKernel() override = default;
  // Reads shapes and node attributes (xdiff_shape, pooled size, scale, sample num).
  void InitKernel(const CNodePtr &kernel_node) override;
  // Computes dx from dy and rois; the unnamed middle parameter (workspace) is unused.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
              const std::vector<AddressPtr> &outputs) override;
 private:
  // Validates input/output counts and the rank of dy.
  void CheckParam(const CNodePtr &kernel_node);
  // Corner indices and weights for bilinear sampling at fractional (y, x).
  void bilinear_interpolate(const int height, const int width, T y, T x, int *x_low, int *y_low, int *x_high,
                            int *y_high, T *w1, T *w2, T *w3, T *w4);
  // Decodes one flat dy index into ROI bin geometry and the sampling grid.
  void bin_box(int thread_idx, const T *roi_boxes, int roi_cols, const T spatial_scale, const int sample_num,
               int roi_end_mode, const int channels, const int height, const int width, const int pooled_height,
               const int pooled_width, int *offset, int *n, int *c, int *ph, int *pw, int *roi_bin_grid_h,
               int *roi_bin_grid_w, T *bin_size_h, T *bin_size_w, T *roi_start_h, T *roi_start_w);
  std::vector<int> xdiff_shape_;  // NCHW shape of the dx output
  int pooled_height_;             // pooled rows of dy
  int pooled_width_;              // pooled cols of dy
  T spatial_scale_;               // scale multiplied onto ROI coordinates in bin_box
  int sample_num_;                // samples per bin side; <= 0 means derive from bin size
  int roi_end_mode_;              // fixed to 1 in InitKernel (add-1 end mode)
  int roi_rows_;                  // number of ROI boxes
  int roi_cols_;                  // 4 (box) or 5 (batch indicator + box)
  int batch_size_;                // xdiff_shape_[0]
  int channels_;                  // xdiff_shape_[1]
  int height_;                    // xdiff_shape_[2]
  int width_;                     // xdiff_shape_[3]
};
// Register the float32 and float16 variants of the ROIAlignGrad CPU kernel.
MS_REG_CPU_KERNEL_T(
  ROIAlignGrad,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  ROIAlignGradCPUKernel, float);
MS_REG_CPU_KERNEL_T(
  ROIAlignGrad,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  ROIAlignGradCPUKernel, float16);
}  // namespace kernel
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ROI_ALIGN_GRAD_CPU_KERNEL_H_

View File

@ -0,0 +1,127 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/scatter_nd_cpu_kernel.h"
#include <string>
#include "runtime/device/cpu/cpu_device_address.h"
#include "common/thread_pool.h"
namespace mindspore {
namespace kernel {
namespace {
// Scatters the update units with row indices in [start, end) into `target`.
// Each indices row of indices_unit_rank_ integers selects a slice of the
// output; the whole corresponding update unit (unit_size_ scalars) is
// accumulated into it, so duplicate indices add up (ScatterNd semantics).
// Fix: the original wrote only `target[offset] += updates[unit_size_ * i]`,
// dropping every scalar but the first whenever unit_size_ > 1 (i.e. whenever
// the updates carry inner dimensions).
template <typename S, typename T>
void Compute(const ComputeParams<S, T> *params, const size_t start, const size_t end) {
  MS_EXCEPTION_IF_NULL(params);
  T *target = params->target_;
  S *indices = params->indices_;
  T *updates = params->updates_;
  std::vector<int> *out_strides = params->out_strides_;
  MS_EXCEPTION_IF_NULL(out_strides);
  for (size_t i = start; i < end; ++i) {
    // Decode the flat element offset of this unit from its index row.
    int offset = 0;
    for (int j = 0; j < params->indices_unit_rank_; ++j) {
      auto index = indices[i * params->indices_unit_rank_ + j];
      if (index < 0) {
        MS_LOG(EXCEPTION) << "Indices contains element " << index << " less than 0.";
      }
      offset += index * out_strides->at(j) * params->unit_size_;
    }
    // Accumulate the entire inner unit, not just its first scalar.
    for (int k = 0; k < params->unit_size_; ++k) {
      target[offset + k] += updates[params->unit_size_ * i + k];
    }
  }
}
} // namespace
// Derives the scatter geometry from the node's inferred shapes:
//   indices: [d0, ..., dk, indices_unit_rank]  (last dim indexes output dims)
//   updates: [d0, ..., dk, <inner output dims>]
// and caches unit_size_ (scalars per update unit), num_units_ (number of
// index rows) and out_strides_ (row-major strides over the indexed dims).
template <typename S, typename T>
void ScatterNdCPUKernel<S, T>::InitKernel(const CNodePtr &kernel_node) {
  Check(kernel_node);
  auto shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
  auto indices_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
  auto updates_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
  auto indices_unit_rank = indices_shape.back();
  if (indices_unit_rank > shape.size()) {
    MS_LOG(EXCEPTION) << "Value of last dimension of indices is greater than shape rank";
  }
  if (indices_shape.size() < 2) {
    MS_LOG(EXCEPTION) << "Indices has dimension less than 2";
  }
  // Required rank relation: updates.rank == indices.rank - 1 + shape.rank - indices_unit_rank.
  if (updates_shape.size() != indices_shape.size() - 1 + shape.size() - indices_unit_rank) {
    MS_LOG(EXCEPTION) << "The ranks of update and indices are inconsistent";
  }
  // Leading dims of updates must match the leading dims of indices.
  for (size_t i = 0; i < indices_shape.size() - 1; ++i) {
    if (updates_shape[i] != indices_shape[i]) {
      MS_LOG(EXCEPTION) << "The shape of updates and indices are different in dimension " << i << " .";
    }
  }
  indices_unit_rank_ = SizeToInt(indices_unit_rank);
  // unit_size_: scalars per update unit (product of the trailing updates dims).
  unit_size_ = 1;
  for (size_t i = indices_shape.size() - 1; i < updates_shape.size(); ++i) {
    unit_size_ *= SizeToInt(updates_shape[i]);
  }
  // num_units_: number of index rows (product of the leading updates dims).
  num_units_ = 1;
  num_units_ *= updates_shape[indices_shape.size() - 2];
  for (int i = SizeToInt(indices_shape.size()) - 3; i >= 0; i--) {
    num_units_ *= updates_shape[i];
  }
  // Row-major strides over the first indices_unit_rank_ output dimensions,
  // built innermost-first and then reversed into index order.
  int out_stride = 1;
  out_strides_.push_back(out_stride);
  for (int i = indices_unit_rank_ - 2; i >= 0; i--) {
    out_stride *= shape[i + 1];
    out_strides_.push_back(out_stride);
  }
  reverse(out_strides_.begin(), out_strides_.end());
}
// Zero-fills the output, then scatters every update unit in parallel.
// Fix: memset_s takes its destination size and fill count in BYTES; the
// original passed the element count (size / sizeof(T)) for both, so only
// 1/sizeof(T) of the output buffer was actually cleared and the rest kept
// stale memory contents.
template <typename S, typename T>
bool ScatterNdCPUKernel<S, T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                      const std::vector<kernel::AddressPtr> &,
                                      const std::vector<kernel::AddressPtr> &outputs) {
  auto target = reinterpret_cast<T *>(outputs[0]->addr);
  auto target_init = memset_s(target, outputs[0]->size, 0, outputs[0]->size);
  if (target_init != EOK) {
    MS_LOG(EXCEPTION) << "ScatterNdCPUKernel Launch task memset failed.";
  }
  // Bundle the non-owning views the element-wise Compute helper needs.
  ComputeParams<S, T> params;
  params.target_ = target;
  params.indices_ = reinterpret_cast<S *>(inputs[0]->addr);
  params.updates_ = reinterpret_cast<T *>(inputs[1]->addr);
  params.target_mem_size_ = outputs[0]->size;
  params.unit_size_ = unit_size_;
  params.indices_unit_rank_ = indices_unit_rank_;
  params.out_strides_ = &out_strides_;
  // One Compute call per index row; rows may collide, but += on the same
  // element is the documented accumulate behavior of ScatterNd.
  auto task = [&](size_t start, size_t end) {
    for (size_t idx = start; idx < end; idx++) {
      Compute<S, T>(&params, idx, idx + 1);
    }
  };
  CPUKernelUtils::ParallelFor(task, num_units_);
  return true;
}
// Verifies that the ScatterNd node carries exactly two inputs and one output.
template <typename S, typename T>
void ScatterNdCPUKernel<S, T>::Check(const CNodePtr &kernel_node) {
  const size_t in_count = AnfAlgo::GetInputTensorNum(kernel_node);
  if (in_count != 2) {
    MS_LOG(EXCEPTION) << "Input number is " << in_count << ", but ScatterNd needs 2 input.";
  }
  const size_t out_count = AnfAlgo::GetOutputTensorNum(kernel_node);
  if (out_count != 1) {
    MS_LOG(EXCEPTION) << "Output number is " << out_count << ", but ScatterNd needs 1 output.";
  }
}
} // namespace kernel
} // namespace mindspore

View File

@ -0,0 +1,150 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SCATTER_ND_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SCATTER_ND_CPU_KERNEL_H_
#include <vector>
#include <unordered_map>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
// Plain-data bundle handed to the element-wise Compute helper. All pointers
// are non-owning views into kernel launch buffers owned by the runtime.
template <typename S, typename T>
struct ComputeParams {
  T *target_{nullptr};                      // output buffer (accumulated into)
  S *indices_{nullptr};                     // index rows, indices_unit_rank_ values per row
  T *updates_{nullptr};                     // update units, unit_size_ scalars per row
  int unit_size_{0};                        // scalars per update unit
  int indices_unit_rank_{0};                // integers per index row
  std::vector<int> *out_strides_{nullptr};  // strides over the indexed output dims
  size_t target_mem_size_{0};               // output buffer size in bytes
};
// CPU implementation of the ScatterNd operator: scatters `updates` into a
// zero-initialized output at the positions given by `indices`.
// S: index element type; T: data element type.
template <typename S, typename T>
class ScatterNdCPUKernel : public CPUKernel {
 public:
  ScatterNdCPUKernel() = default;
  ~ScatterNdCPUKernel() override = default;
  // Derives unit_size_/num_units_/out_strides_ from the inferred shapes.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Clears the output and performs the scatter; the unnamed middle parameter (workspace) is unused.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
              const std::vector<AddressPtr> &outputs) override;
 private:
  // Validates input/output arity of the node.
  void Check(const CNodePtr &kernel_node);
  int unit_size_{0};              // scalars per update unit
  size_t num_units_{0};           // number of index rows
  int indices_unit_rank_{0};      // integers per index row
  std::vector<int> out_strides_;  // strides over the indexed output dims
};
// Register ScatterNd for every supported (index type, data type) pair:
// int64/int32 indices combined with floating, signed and unsigned data types.
MS_REG_CPU_KERNEL_T_S(
  ScatterNd,
  KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
  ScatterNdCPUKernel, int64_t, double);
MS_REG_CPU_KERNEL_T_S(
  ScatterNd,
  KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  ScatterNdCPUKernel, int64_t, float);
MS_REG_CPU_KERNEL_T_S(
  ScatterNd, KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
  ScatterNdCPUKernel, int64_t, int64_t);
MS_REG_CPU_KERNEL_T_S(
  ScatterNd, KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
  ScatterNdCPUKernel, int64_t, int32_t);
MS_REG_CPU_KERNEL_T_S(
  ScatterNd, KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16),
  ScatterNdCPUKernel, int64_t, int16_t);
MS_REG_CPU_KERNEL_T_S(
  ScatterNd, KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8),
  ScatterNdCPUKernel, int64_t, int8_t);
MS_REG_CPU_KERNEL_T_S(
  ScatterNd,
  KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64),
  ScatterNdCPUKernel, int64_t, uint64_t);
MS_REG_CPU_KERNEL_T_S(
  ScatterNd,
  KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeUInt32).AddOutputAttr(kNumberTypeUInt32),
  ScatterNdCPUKernel, int64_t, uint32_t);
MS_REG_CPU_KERNEL_T_S(
  ScatterNd,
  KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeUInt16).AddOutputAttr(kNumberTypeUInt16),
  ScatterNdCPUKernel, int64_t, uint16_t);
MS_REG_CPU_KERNEL_T_S(
  ScatterNd, KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8),
  ScatterNdCPUKernel, int64_t, uint8_t);
MS_REG_CPU_KERNEL_T_S(
  ScatterNd,
  KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
  ScatterNdCPUKernel, int32_t, double);
MS_REG_CPU_KERNEL_T_S(
  ScatterNd,
  KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  ScatterNdCPUKernel, int32_t, float);
MS_REG_CPU_KERNEL_T_S(
  ScatterNd, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
  ScatterNdCPUKernel, int32_t, int64_t);
MS_REG_CPU_KERNEL_T_S(
  ScatterNd, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
  ScatterNdCPUKernel, int32_t, int32_t);
MS_REG_CPU_KERNEL_T_S(
  ScatterNd, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16),
  ScatterNdCPUKernel, int32_t, int16_t);
MS_REG_CPU_KERNEL_T_S(
  ScatterNd, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8),
  ScatterNdCPUKernel, int32_t, int8_t);
MS_REG_CPU_KERNEL_T_S(
  ScatterNd,
  KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64),
  ScatterNdCPUKernel, int32_t, uint64_t);
MS_REG_CPU_KERNEL_T_S(
  ScatterNd,
  KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeUInt32).AddOutputAttr(kNumberTypeUInt32),
  ScatterNdCPUKernel, int32_t, uint32_t);
MS_REG_CPU_KERNEL_T_S(
  ScatterNd,
  KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeUInt16).AddOutputAttr(kNumberTypeUInt16),
  ScatterNdCPUKernel, int32_t, uint16_t);
MS_REG_CPU_KERNEL_T_S(
  ScatterNd, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8),
  ScatterNdCPUKernel, int32_t, uint8_t);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SCATTER_ND_CPU_KERNEL_H_

View File

@ -24,6 +24,7 @@ from .split import _split_cpu
from .adam import _adam_cpu
from .arg_max import _arg_max_cpu
from .arg_min_with_value import _arg_min_with_value_cpu
from .arg_max_with_value import _arg_max_with_value_cpu
from .bias_add import _bias_add_cpu
from .bias_add_grad import _bias_add_grad_cpu
from .dropout import _dropout_cpu

View File

@ -0,0 +1,31 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""ArgMaxWithValue op"""
from mindspore.ops.op_info_register import op_info_register, CpuRegOp, DataType
# CPU registration metadata for ArgMaxWithValue: one required input `x` and
# two required outputs — `indice` (argmax positions, int32) and `values`
# (max values, same dtype as `x`) — for float32 and float16 inputs.
arg_max_with_value_op_info = CpuRegOp("ArgMaxWithValue") \
    .input(0, "x", "required") \
    .output(0, "indice", "required") \
    .output(1, "values", "required") \
    .dtype_format(DataType.F32_Default, DataType.I32_Default, DataType.F32_Default) \
    .dtype_format(DataType.F16_Default, DataType.I32_Default, DataType.F16_Default) \
    .get_op_info()
@op_info_register(arg_max_with_value_op_info)
def _arg_max_with_value_cpu():
    """ArgMaxWithValue cpu register"""
    return

View File

@ -1785,7 +1785,7 @@ class ArgMaxWithValue(PrimitiveWithInfer):
TypeError: If `axis` is not an int.
Supported Platforms:
``Ascend`` ``GPU``
``Ascend`` ``GPU`` ``CPU``
Examples:
>>> input_x = Tensor(np.array([0.0, 0.4, 0.6, 0.7, 0.1]), mindspore.float32)
@ -3484,7 +3484,7 @@ class ScatterNd(PrimitiveWithInfer):
ValueError: If any element of `shape` is less than 1.
Supported Platforms:
``Ascend`` ``GPU``
``Ascend`` ``GPU`` ``CPU``
Examples:
>>> op = ops.ScatterNd()

View File

@ -59,7 +59,7 @@ class CropAndResize(PrimitiveWithInfer):
ValueError: If `method` is not one of 'bilinear', 'nearest', 'bilinear_v2'.
Supported Platforms:
``Ascend`` ``GPU``
``Ascend`` ``GPU`` ``CPU``
Examples:
>>> class CropAndResizeNet(nn.Cell):

View File

@ -4257,7 +4257,7 @@ class NMSWithMask(PrimitiveWithInfer):
Tensor is not float16 or float32.
Supported Platforms:
``Ascend`` ``GPU``
``Ascend`` ``GPU`` ``CPU``
Examples:
>>> bbox = np.array([[100.0, 100.0, 50.0, 68.0, 0.63], [150.0, 75.0, 165.0, 115.0, 0.55],

View File

@ -4262,7 +4262,7 @@ class ROIAlign(PrimitiveWithInfer):
TypeError: If `features` or `rois` is not a Tensor.
Supported Platforms:
``Ascend`` ``GPU``
``Ascend`` ``GPU`` ``CPU``
Examples:
>>> input_tensor = Tensor(np.array([[[[1., 2.], [3., 4.]]]]), mindspore.float32)

View File

@ -168,7 +168,7 @@ class BoundingBoxEncode(PrimitiveWithInfer):
TypeError: If `anchor_box` or `groundtruth_box` is not a Tensor.
Supported Platforms:
``Ascend`` ``GPU``
``Ascend`` ``GPU`` ``CPU``
Examples:
>>> anchor_box = Tensor([[2, 2, 2, 3], [2, 2, 2, 3]], mindspore.float32)
@ -230,7 +230,7 @@ class BoundingBoxDecode(PrimitiveWithInfer):
TypeError: If `anchor_box` or `deltas` is not a Tensor.
Supported Platforms:
``Ascend`` ``GPU``
``Ascend`` ``GPU`` ``CPU``
Examples:
>>> anchor_box = Tensor([[4, 1, 2, 1], [2, 2, 2, 3]], mindspore.float32)
@ -293,7 +293,7 @@ class CheckValid(PrimitiveWithInfer):
TypeError: If dtype of `bboxes` or `img_metas` is neither float16 nor float32.
Supported Platforms:
``Ascend`` ``GPU``
``Ascend`` ``GPU`` ``CPU``
Examples:
>>> import mindspore

View File

@ -404,7 +404,7 @@ class RandomChoiceWithMask(PrimitiveWithInfer):
TypeError: If `input_x` is not a Tensor.
Supported Platforms:
``Ascend`` ``GPU``
``Ascend`` ``GPU`` ``CPU``
Examples:
>>> rnd_choice_mask = ops.RandomChoiceWithMask()

View File

@ -0,0 +1,146 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.ops import operations as P
class NetArgmaxWithValue(nn.Cell):
    """Cell running ArgMaxWithValue three ways: axis 0, axis -1, default axis."""

    def __init__(self):
        super(NetArgmaxWithValue, self).__init__()
        self.argmax1 = P.ArgMaxWithValue(0)
        self.argmax2 = P.ArgMaxWithValue(-1)
        self.argmax3 = P.ArgMaxWithValue()

    def construct(self, x):
        # Each op returns an (indices, values) pair.
        return (self.argmax1(x), self.argmax2(x), self.argmax3(x))
class NetArgmaxWithValueBig(nn.Cell):
    """Cell running a single ArgMaxWithValue over a configurable axis."""

    def __init__(self, axis=0):
        super(NetArgmaxWithValueBig, self).__init__()
        self.argmax = P.ArgMaxWithValue(axis)

    def construct(self, x):
        return self.argmax(x)
def argmaxwithvalue_base(data_type):
    """Check ArgMaxWithValue on a fixed 4x3 matrix in PyNative then graph mode.

    Verifies indices and values for axis 0 (outputs 0 and 2, since the
    default axis is also 0 here) and axis -1 (output 1).
    """
    x = Tensor(np.array([[1., 20., 5.],
                         [67., 8., 9.],
                         [130., 24., 15.],
                         [0.3, -0.4, -15.]]).astype(data_type))
    expect_idx_ax0 = np.array([2, 2, 2]).astype(data_type)
    expect_idx_ax1 = np.array([1, 0, 0, 0]).astype(data_type)
    expect_val_ax0 = np.array([130, 24, 15]).astype(data_type)
    expect_val_ax1 = np.array([20, 67, 130, 0.3]).astype(data_type)
    # Same checks in both execution modes, PyNative first as in the original run order.
    for mode in (context.PYNATIVE_MODE, context.GRAPH_MODE):
        context.set_context(mode=mode, device_target="CPU")
        output = NetArgmaxWithValue()(x)
        assert (output[0][0].asnumpy() == expect_idx_ax0).all()
        assert (output[0][1].asnumpy() == expect_val_ax0).all()
        assert (output[1][0].asnumpy() == expect_idx_ax1).all()
        assert (output[1][1].asnumpy() == expect_val_ax1).all()
        assert (output[2][0].asnumpy() == expect_idx_ax0).all()
        assert (output[2][1].asnumpy() == expect_val_ax0).all()
def argmaxwithvalue_3d(data_type, shape_x):
    """Compare ArgMaxWithValue with numpy argmax/maximum.reduce on a random 3-D tensor.

    Uses a fixed seed so the input is reproducible; checks every axis in turn.
    """
    np.random.seed(2)
    x_np = np.random.random(shape_x).astype(data_type)
    x = Tensor(x_np)
    for axis in (0, 1, 2):
        indices, values = NetArgmaxWithValueBig(axis)(x)
        assert (indices.asnumpy() == np.argmax(x_np, axis=axis)).all()
        assert (values.asnumpy() == np.maximum.reduce(x_np, axis)).all()
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_argmaxwithvalue_base_float32():
    """ArgMaxWithValue base case with float32 input."""
    argmaxwithvalue_base(np.float32)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_argmaxwithvalue_base_float16():
    """ArgMaxWithValue base case with float16 input."""
    argmaxwithvalue_base(np.float16)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_argmaxwithvalue_3d_float32():
    """3-D ArgMaxWithValue, float32, in both PyNative and graph mode."""
    shape_x = (2, 32, 256)
    context.set_context(mode=context.PYNATIVE_MODE, device_target="CPU")
    argmaxwithvalue_3d(np.float32, shape_x)
    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
    argmaxwithvalue_3d(np.float32, shape_x)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_argmaxwithvalue_3d_float16():
    """3-D ArgMaxWithValue, float16, graph mode only."""
    shape_x = (2, 64, 128)
    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
    argmaxwithvalue_3d(np.float16, shape_x)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_argmaxwithvalue_3d_big_float32():
    """Larger 3-D ArgMaxWithValue (with a size-1 trailing axis), both modes."""
    shape_x = (128, 1024, 1)
    context.set_context(mode=context.PYNATIVE_MODE, device_target="CPU")
    argmaxwithvalue_3d(np.float32, shape_x)
    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
    argmaxwithvalue_3d(np.float32, shape_x)

View File

@ -0,0 +1,60 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
import mindspore
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.ops import operations as P
class NetBoundingBoxDecode(nn.Cell):
    """Cell wrapping BoundingBoxDecode with fixed max_shape and wh_ratio_clip."""

    def __init__(self, means=(0.0, 0.0, 0.0, 0.0), stds=(1.0, 1.0, 1.0, 1.0)):
        super(NetBoundingBoxDecode, self).__init__()
        self.decode = P.BoundingBoxDecode(max_shape=(768, 1280),
                                          means=means,
                                          stds=stds,
                                          wh_ratio_clip=0.016)

    def construct(self, anchor, groundtruth):
        return self.decode(anchor, groundtruth)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_boundingbox_decode():
    """BoundingBoxDecode matches precomputed deltas in graph and PyNative mode."""
    means = (0.1, 0.1, 0.2, 0.2)
    stds = (2.0, 2.0, 3.0, 3.0)
    anchor_box = Tensor(np.array([[4, 1, 2, 1], [2, 2, 2, 3]], np.float32),
                        mindspore.float32)
    deltas_box = Tensor(np.array([[3, 1, 2, 2], [1, 2, 1, 4]], np.float32),
                        mindspore.float32)
    # Precomputed expected boxes for these means/stds and the cell's max_shape.
    expect_deltas = np.array([[28.6500, 0.0000, 0.0000, 33.8500],
                              [0.0000, 0.0000, 15.8663, 72.7000]], np.float32)
    tolerance = np.ones(shape=[2, 4]) * 1.0e-4
    # Graph mode first, then PyNative, preserving the original run order.
    for mode in (context.GRAPH_MODE, context.PYNATIVE_MODE):
        context.set_context(mode=mode, device_target='CPU')
        net = NetBoundingBoxDecode(means, stds)
        diff = net(anchor_box, deltas_box).asnumpy() - expect_deltas
        assert np.all(abs(diff) < tolerance)

View File

@ -0,0 +1,80 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
import mindspore
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.ops import operations as P
class NetBoundingBoxEncode(nn.Cell):
    """Cell wrapping BoundingBoxEncode with configurable means and stds."""

    def __init__(self, means=(0.0, 0.0, 0.0, 0.0), stds=(1.0, 1.0, 1.0, 1.0)):
        super(NetBoundingBoxEncode, self).__init__()
        self.encode = P.BoundingBoxEncode(means=means, stds=stds)

    def construct(self, anchor, groundtruth):
        return self.encode(anchor, groundtruth)
def bbox2delta(proposals, gt, means, stds):
    """Numpy reference for box encoding: regression deltas from proposals to gt.

    Boxes are (x1, y1, x2, y2) corner format along the last axis; sizes are
    inclusive (x2 - x1 + 1). Returns (dx, dy, dw, dh) stacked on the last
    axis, normalized by the given means and stds.
    """
    # Centers and inclusive sizes of the proposal boxes.
    pcx = (proposals[..., 0] + proposals[..., 2]) * 0.5
    pcy = (proposals[..., 1] + proposals[..., 3]) * 0.5
    pw = proposals[..., 2] - proposals[..., 0] + 1.0
    ph = proposals[..., 3] - proposals[..., 1] + 1.0
    # Centers and inclusive sizes of the ground-truth boxes.
    gcx = (gt[..., 0] + gt[..., 2]) * 0.5
    gcy = (gt[..., 1] + gt[..., 3]) * 0.5
    gw = gt[..., 2] - gt[..., 0] + 1.0
    gh = gt[..., 3] - gt[..., 1] + 1.0
    means = np.array(means, np.float32)
    stds = np.array(stds, np.float32)
    # Raw deltas: normalized center offsets and log size ratios.
    raw = [(gcx - pcx) / pw, (gcy - pcy) / ph, np.log(gw / pw), np.log(gh / ph)]
    return np.stack([(d - m) / s for d, m, s in zip(raw, means, stds)], axis=-1)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_boundingbox_encode():
    """BoundingBoxEncode agrees with the bbox2delta numpy reference."""
    anchor = np.array([[4, 1, 6, 9], [2, 5, 5, 9]]).astype(np.float32)
    gt = np.array([[3, 2, 7, 7], [1, 5, 5, 8]]).astype(np.float32)
    means = (0.1, 0.1, 0.2, 0.2)
    stds = (2.0, 2.0, 3.0, 3.0)
    anchor_box = Tensor(anchor, mindspore.float32)
    groundtruth_box = Tensor(gt, mindspore.float32)
    expect_deltas = bbox2delta(anchor, gt, means, stds)
    tolerance = np.ones(shape=[2, 4]) * 1.0e-6
    # Graph mode first, then PyNative, preserving the original run order.
    for mode in (context.GRAPH_MODE, context.PYNATIVE_MODE):
        context.set_context(mode=mode, device_target='CPU')
        net = NetBoundingBoxEncode(means, stds)
        diff = net(anchor_box, groundtruth_box).asnumpy() - expect_deltas
        assert np.all(abs(diff) < tolerance)

View File

@ -0,0 +1,86 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.ops import operations as P
class NetCheckValid(nn.Cell):
    """Cell wrapping the CheckValid operator."""

    def __init__(self):
        super(NetCheckValid, self).__init__()
        self.check_valid = P.CheckValid()

    def construct(self, anchor, image_metas):
        return self.check_valid(anchor, image_metas)
def check_valid(nptype):
    """Run CheckValid on a fixed set of boxes for the given numpy dtype.

    Three anchors are tested against image metas (768, 1280, ratio 1); only
    the first box is expected valid, the second has a negative coordinate and
    the third exceeds the image bounds. Checked in graph then PyNative mode.
    """
    anchor = np.array([[50, 0, 100, 700], [-2, 2, 8, 100], [10, 20, 300, 2000]], nptype)
    image_metas = np.array([768, 1280, 1], nptype)
    anchor_box = Tensor(anchor)
    image_metas_box = Tensor(image_metas)
    # np.bool was deprecated in NumPy 1.20 and removed in 1.24; use np.bool_.
    expect = np.array([True, False, False], np.bool_)
    for mode in (context.GRAPH_MODE, context.PYNATIVE_MODE):
        context.set_context(mode=mode, device_target='CPU')
        net = NetCheckValid()
        output = net(anchor_box, image_metas_box)
        assert np.array_equal(output.asnumpy(), expect)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_check_valid_float32():
    """CheckValid with float32 boxes."""
    check_valid(np.float32)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_check_valid_float16():
    """CheckValid with float16 boxes."""
    check_valid(np.float16)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_check_valid_int16():
    """CheckValid with int16 boxes."""
    check_valid(np.int16)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_check_valid_uint8():
    """CheckValid with uint8 boxes against (76, 128, 1) image metas.

    Kept separate from check_valid() because uint8 cannot represent the
    negative coordinate used there; here the third box exceeds the image
    bounds instead. Checked in graph then PyNative mode.
    """
    anchor = np.array([[5, 0, 10, 70], [2, 2, 8, 10], [1, 2, 30, 200]], np.uint8)
    image_metas = np.array([76, 128, 1], np.uint8)
    anchor_box = Tensor(anchor)
    image_metas_box = Tensor(image_metas)
    # np.bool was deprecated in NumPy 1.20 and removed in 1.24; use np.bool_.
    expect = np.array([True, True, False], np.bool_)
    for mode in (context.GRAPH_MODE, context.PYNATIVE_MODE):
        context.set_context(mode=mode, device_target='CPU')
        net = NetCheckValid()
        output = net(anchor_box, image_metas_box)
        assert np.array_equal(output.asnumpy(), expect)

View File

@ -0,0 +1,423 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
from mindspore import context, Tensor
from mindspore.ops import operations as P
from mindspore import nn
class NetCropAndResize(nn.Cell):
    """Cell wrapping CropAndResize with a chosen method and extrapolation value."""

    def __init__(self, method_="bilinear", extrapolation_value_=0.0):
        super(NetCropAndResize, self).__init__()
        self.crop_and_resize = P.CropAndResize(method=method_,
                                               extrapolation_value=extrapolation_value_)

    def construct(self, image, boxes, box_index, channel):
        return self.crop_and_resize(image, boxes, box_index, channel)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_crop_and_resize_int8_bilinear(datatype=np.int8):
    """CropAndResize, int8 ramp input, bilinear method, graph mode.

    The expected tensor is precomputed for this exact input ramp; the
    negative entries come from casting the 0..N ramp to int8.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
    batch_size = 2
    image_height = 32
    image_width = 18
    channels = 2
    crop_size = (5, 3)
    total_values = batch_size * image_height * image_width * channels
    input_data = np.arange(0, total_values).reshape(
        (batch_size, image_height, image_width, channels))
    input_boxes = np.array(
        [[0, 0.5, 0.5, 0.0], [0, 0, 0.75, 1.75]]).astype(np.float32)
    input_box_index = np.array([1, 0]).astype(np.int32)
    input_data_tensor = Tensor(input_data.astype(datatype))
    input_boxes_tensor = Tensor(input_boxes)
    input_box_index_tensor = Tensor(input_box_index)
    # Extrapolation value 0.5 fills sample points outside the source image.
    net = NetCropAndResize("bilinear", 0.5)
    output = net(input_data_tensor, input_boxes_tensor,
                 input_box_index_tensor, crop_size)
    output_ms = output.asnumpy()
    expected_output = np.array([[[[-111.0, -110.0], [-119.5, -118.5], [-128.0, -127.0]],
                                 [[28.5, 29.5], [20.0, 21.0], [11.5, 12.5]],
                                 [[-88.0, -87.0], [-96.5, -95.5], [-41.0, -40.0]],
                                 [[51.5, 52.5], [43.0, 44.0], [34.5, 35.5]],
                                 [[-65.0, -64.0], [-73.5, -72.5], [-82.0, -81.0]]],
                                [[[0.0, 1.0], [29.75, 30.75], [0.5, 0.5]],
                                 [[-46.75, -45.75], [-17.0, -16.0], [0.5, 0.5]],
                                 [[-93.5, -92.5], [-63.75, -62.75], [0.5, 0.5]],
                                 [[3.75, 4.75], [-110.5, -109.5], [0.5, 0.5]],
                                 [[69.0, 70.0], [98.75, 99.75], [0.5, 0.5]]]]).astype(np.float32)
    error = np.ones(shape=[2, *crop_size, channels]) * 1.0e-6
    diff = output_ms - expected_output
    assert np.all(abs(diff) < error)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_crop_and_resize_int16_nearest(datatype=np.int16):
    """CropAndResize, int16 ramp input, nearest-neighbor method, PyNative mode.

    Expected values precomputed for this fixed input ramp and box set.
    """
    context.set_context(mode=context.PYNATIVE_MODE, device_target="CPU")
    batch_size = 2
    image_height = 32
    image_width = 18
    channels = 2
    crop_size = (5, 3)
    total_values = batch_size * image_height * image_width * channels
    input_data = np.arange(0, total_values).reshape(
        (batch_size, image_height, image_width, channels))
    input_boxes = np.array(
        [[0, 0.5, 0.5, 0.0], [0, 0, 0.75, 1.75]]).astype(np.float32)
    input_box_index = np.array([1, 0]).astype(np.int32)
    input_data_tensor = Tensor(input_data.astype(datatype))
    input_boxes_tensor = Tensor(input_boxes)
    input_box_index_tensor = Tensor(input_box_index)
    # Extrapolation value 0.5 fills sample points outside the source image.
    net = NetCropAndResize("nearest", 0.5)
    output = net(input_data_tensor, input_boxes_tensor,
                 input_box_index_tensor, crop_size)
    output_ms = output.asnumpy()
    expected_output = np.array([[[[1170.0, 1171.0], [1160.0, 1161.0], [1152.0, 1153.0]],
                                 [[1314.0, 1315.0], [1304.0, 1305.0], [1296.0, 1297.0]],
                                 [[1458.0, 1459.0], [1448.0, 1449.0], [1440.0, 1441.0]],
                                 [[1602.0, 1603.0], [1592.0, 1593.0], [1584.0, 1585.0]],
                                 [[1746.0, 1747.0], [1736.0, 1737.0], [1728.0, 1729.0]]],
                                [[[0.0, 1.0], [30.0, 31.0], [0.5, 0.5]],
                                 [[216.0, 217.0], [246.0, 247.0], [0.5, 0.5]],
                                 [[432.0, 433.0], [462.0, 463.0], [0.5, 0.5]],
                                 [[612.0, 613.0], [642.0, 643.0], [0.5, 0.5]],
                                 [[828.0, 829.0], [858.0, 859.0], [0.5, 0.5]]]]).astype(np.float32)
    error = np.ones(shape=[2, *crop_size, channels]) * 1.0e-6
    diff = output_ms - expected_output
    assert np.all(abs(diff) < error)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_crop_and_resize_int32_bilinear_v2(datatype=np.int32):
    """CropAndResize, int32 offset ramp input, bilinear_v2 method, graph mode.

    Expected values precomputed for this fixed input ramp and box set.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
    batch_size = 2
    image_height = 32
    image_width = 18
    channels = 2
    crop_size = (5, 3)
    # Offset shifts the ramp so values are distinct from the other tests.
    offset = 8795
    total_values = batch_size * image_height * image_width * channels
    input_data = np.arange(0 + offset, total_values + offset).reshape(
        (batch_size, image_height, image_width, channels))
    input_boxes = np.array(
        [[0, 0.5, 0.5, 0.0], [0, 0, 0.75, 1.75]]).astype(np.float32)
    input_box_index = np.array([1, 0]).astype(np.int32)
    input_data_tensor = Tensor(input_data.astype(datatype))
    input_boxes_tensor = Tensor(input_boxes)
    input_box_index_tensor = Tensor(input_box_index)
    net = NetCropAndResize("bilinear_v2", 0.369)
    output = net(input_data_tensor, input_boxes_tensor,
                 input_box_index_tensor, crop_size)
    output_ms = output.asnumpy()
    expected_output = np.array([[[[10008.199, 10009.199], [10008.2, 10009.2], [10008.199, 10009.2]],
                                 [[10130.6, 10131.6], [10130.6, 10131.6], [10130.601, 10131.6]],
                                 [[10253, 10253.999], [10253, 10254], [10253, 10254]],
                                 [[10375.4, 10376.398], [10375.4, 10376.4], [10375.4, 10376.399]],
                                 [[10497.799, 10498.799], [10497.801, 10498.8], [10497.8, 10498.8]]],
                                [[[8876.667, 8877.667], [8898, 8899], [8919.334, 8920.333]],
                                 [[9056.667, 9057.667], [9078, 9079], [9099.333, 9100.333]],
                                 [[9236.667, 9237.667], [9258, 9259], [9279.333, 9280.333]],
                                 [[9416.667, 9417.667], [9438, 9439], [9459.333, 9460.333]],
                                 [[9596.667, 9597.667], [9618, 9619], [9639.333, 9640.334]]]]).astype(
                                     np.float32)
    error = np.ones(shape=[2, *crop_size, channels]) * 1.0e-6
    diff = output_ms - expected_output
    assert np.all(abs(diff) < error)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_crop_and_resize_float16_nearest(datatype=np.float16):
    """CropAndResize, float16 ramp input, nearest-neighbor method, PyNative mode.

    Expected values precomputed for this fixed input ramp and box set.
    """
    context.set_context(mode=context.PYNATIVE_MODE, device_target="CPU")
    batch_size = 2
    image_height = 50
    image_width = 40
    channels = 3
    crop_size = (5, 3)
    offset = 0
    total_values = batch_size * image_height * image_width * channels
    input_data = np.arange(0 + offset, total_values + offset).reshape(
        (batch_size, image_height, image_width, channels))
    input_boxes = np.array(
        [[0.23, 0.5, 0.75, 0.0], [0, 0.1, 0.75, 1.75]]).astype(np.float32)
    input_box_index = np.array([1, 0]).astype(np.int32)
    input_data_tensor = Tensor(input_data.astype(datatype))
    input_boxes_tensor = Tensor(input_boxes)
    input_box_index_tensor = Tensor(input_box_index)
    net = NetCropAndResize("nearest", 0.0)
    output = net(input_data_tensor, input_boxes_tensor,
                 input_box_index_tensor, crop_size)
    output_ms = output.asnumpy()
    expected_output = np.array([[[[7380.0, 7380.0, 7384.0], [7352.0, 7352.0, 7352.0],
                                  [7320.0, 7320.0, 7320.0]],
                                 [[8224.0, 8224.0, 8224.0], [8192.0, 8192.0, 8192.0],
                                  [8160.0, 8160.0, 8160.0]],
                                 [[8944.0, 8944.0, 8944.0], [8912.0, 8912.0, 8912.0],
                                  [8880.0, 8880.0, 8880.0]],
                                 [[9664.0, 9664.0, 9664.0], [9632.0, 9632.0, 9632.0],
                                  [9600.0, 9600.0, 9600.0]],
                                 [[10496.0, 10504.0, 10504.0], [10472.0, 10472.0, 10472.0],
                                  [10440.0, 10440.0, 10440.0]]],
                                [[[12.0, 13.0, 14.0], [108.0, 109.0, 110.0], [0.0, 0.0, 0.0]],
                                 [[1092.0, 1093.0, 1094.0], [1188.0, 1189.0, 1190.0], [0.0, 0.0, 0.0]],
                                 [[2172.0, 2172.0, 2174.0], [2268.0, 2268.0, 2270.0], [0.0, 0.0, 0.0]],
                                 [[3372.0, 3372.0, 3374.0], [3468.0, 3468.0, 3470.0], [0.0, 0.0, 0.0]],
                                 [[4452.0, 4452.0, 4456.0], [4548.0, 4548.0, 4552.0],
                                  [0.0, 0.0, 0.0]]]]).astype(np.float32)
    error = np.ones(shape=[2, *crop_size, channels]) * 1.0e-6
    diff = output_ms - expected_output
    assert np.all(abs(diff) < error)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_crop_and_resize_float32_bilinear(datatype=np.float32):
    """CropAndResize, float32 offset ramp on a larger image, bilinear, graph mode.

    Expected values precomputed for this fixed input ramp and box set.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
    batch_size = 2
    image_height = 512
    image_width = 256
    channels = 3
    crop_size = (5, 3)
    offset = 5000
    total_values = batch_size * image_height * image_width * channels
    input_data = np.arange(0 + offset, total_values + offset).reshape(
        (batch_size, image_height, image_width, channels))
    input_boxes = np.array(
        [[0.23, 0.5, 0.75, 0.0], [0, 0.1, 0.75, 1.75]]).astype(np.float32)
    input_box_index = np.array([1, 0]).astype(np.int32)
    input_data_tensor = Tensor(input_data.astype(datatype))
    input_boxes_tensor = Tensor(input_boxes)
    input_box_index_tensor = Tensor(input_box_index)
    net = NetCropAndResize("bilinear", 0.0)
    output = net(input_data_tensor, input_boxes_tensor,
                 input_box_index_tensor, crop_size)
    output_ms = output.asnumpy()
    expected_output = np.array([[[[488861.53, 488862.53, 488863.53],
                                  [488670.28, 488671.28, 488672.28],
                                  [488479.03, 488480.03, 488481.03]],
                                 [[539879.75, 539880.75, 539881.75],
                                  [539688.5, 539689.5, 539690.5],
                                  [539497.25, 539498.25, 539499.25]],
                                 [[590898.0, 590899.0, 590900.0], [590706.75, 590707.75, 590708.75],
                                  [590515.5, 590516.5, 590517.5]],
                                 [[641916.25, 641917.25, 641918.25], [641725.0, 641726.0, 641727.0],
                                  [641533.75, 641534.75, 641535.75]],
                                 [[692934.5, 692935.5, 692936.5], [692743.25, 692744.25, 692745.25],
                                  [692552.0, 692553.0, 692554.0]]],
                                [[[5076.5, 5077.5, 5078.5], [5707.625, 5708.625, 5709.625], [0.0, 0.0, 0.0]],
                                 [[78660.5, 78661.5, 78662.5], [79291.625, 79292.625, 79293.625], [0.0, 0.0, 0.0]],
                                 [[152244.5, 152245.5, 152246.5], [152875.625, 152876.625, 152877.625],
                                  [0.0, 0.0, 0.0]],
                                 [[225828.5, 225829.5, 225830.5], [226459.625, 226460.625, 226461.625],
                                  [0.0, 0.0, 0.0]],
                                 [[299412.5, 299413.5, 299414.5], [300043.625, 300044.625, 300045.625],
                                  [0.0, 0.0, 0.0]]]]).astype(np.float32)
    error = np.ones(shape=[2, *crop_size, channels]) * 1.0e-6
    diff = output_ms - expected_output
    assert np.all(abs(diff) < error)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_crop_and_resize_float64_nearest(datatype=np.float64):
    """CropAndResize, float64 offset ramp input, nearest-neighbor, PyNative mode.

    Expected values precomputed for this fixed input ramp and box set.
    """
    context.set_context(mode=context.PYNATIVE_MODE, device_target="CPU")
    batch_size = 2
    image_height = 50
    image_width = 25
    channels = 3
    crop_size = (5, 3)
    offset = 7549
    total_values = batch_size * image_height * image_width * channels
    input_data = np.arange(0 + offset, total_values + offset).reshape(
        (batch_size, image_height, image_width, channels))
    input_boxes = np.array(
        [[0.23, 0.5, 0.75, 0.0], [0, 0.1, 0.75, 1.75]]).astype(np.float32)
    input_box_index = np.array([1, 0]).astype(np.int32)
    input_data_tensor = Tensor(input_data.astype(datatype))
    input_boxes_tensor = Tensor(input_boxes)
    input_box_index_tensor = Tensor(input_box_index)
    net = NetCropAndResize("nearest", 0.0)
    output = net(input_data_tensor, input_boxes_tensor,
                 input_box_index_tensor, crop_size)
    output_ms = output.asnumpy()
    expected_output = np.array([[[[12160.0, 12161.0, 12162.0], [12142.0, 12143.0, 12144.0],
                                  [12124.0, 12125.0, 12126.0]],
                                 [[12685.0, 12686.0, 12687.0], [12667.0, 12668.0, 12669.0],
                                  [12649.0, 12650.0, 12651.0]],
                                 [[13135.0, 13136.0, 13137.0], [13117.0, 13118.0, 13119.0],
                                  [13099.0, 13100.0, 13101.0]],
                                 [[13585.0, 13586.0, 13587.0], [13567.0, 13568.0, 13569.0],
                                  [13549.0, 13550.0, 13551.0]],
                                 [[14110.0, 14111.0, 14112.0], [14092.0, 14093.0, 14094.0],
                                  [14074.0, 14075.0, 14076.0]]],
                                [[[7555.0, 7556.0, 7557.0], [7615.0, 7616.0, 7617.0], [0.0, 0.0, 0.0]],
                                 [[8230.0, 8231.0, 8232.0], [8290.0, 8291.0, 8292.0], [0.0, 0.0, 0.0]],
                                 [[8905.0, 8906.0, 8907.0], [8965.0, 8966.0, 8967.0], [0.0, 0.0, 0.0]],
                                 [[9655.0, 9656.0, 9657.0], [9715.0, 9716.0, 9717.0], [0.0, 0.0, 0.0]],
                                 [[10330.0, 10331.0, 10332.0], [10390.0, 10391.0, 10392.0],
                                  [0.0, 0.0, 0.0]]]]).astype(np.float32)
    error = np.ones(shape=[2, *crop_size, channels]) * 1.0e-6
    diff = output_ms - expected_output
    assert np.all(abs(diff) < error)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_crop_and_resize_int64_bilinearv2(datatype=np.int64):
    """CropAndResize, int64 offset ramp input, bilinear_v2 method, graph mode.

    Expected values precomputed for this fixed input ramp and box set.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
    batch_size = 2
    image_height = 50
    image_width = 25
    channels = 3
    crop_size = (5, 3)
    offset = 7549
    total_values = batch_size * image_height * image_width * channels
    input_data = np.arange(0 + offset, total_values + offset).reshape(
        (batch_size, image_height, image_width, channels))
    input_boxes = np.array(
        [[0.23, 0.5, 0.75, 0.0], [0, 0.1, 0.75, 1.75]]).astype(np.float32)
    input_box_index = np.array([1, 0]).astype(np.int32)
    input_data_tensor = Tensor(input_data.astype(datatype))
    input_boxes_tensor = Tensor(input_boxes)
    input_box_index_tensor = Tensor(input_box_index)
    net = NetCropAndResize("bilinear_v2", 0.0)
    output = net(input_data_tensor, input_boxes_tensor,
                 input_box_index_tensor, crop_size)
    output_ms = output.asnumpy()
    expected_output = np.array([[[[12324.999, 12326, 12327], [12325, 12326, 12327],
                                  [12325, 12326, 12327.001]],
                                 [[12730, 12730.999, 12732], [12730, 12731, 12732],
                                  [12730, 12731, 12732]],
                                 [[13134.999, 13136, 13136.998], [13135, 13136, 13137],
                                  [13135, 13136, 13137]],
                                 [[13540, 13540.999, 13541.999], [13540, 13541, 13542],
                                  [13540, 13541, 13542]],
                                 [[13944.999, 13945.999, 13946.999], [13945, 13946.001, 13947],
                                  [13945, 13946, 13947]]],
                                [[[7822, 7823, 7824], [7864, 7865, 7866], [7906, 7907, 7908]],
                                 [[8392, 8393, 8394], [8434, 8435, 8436], [8476, 8477, 8478]],
                                 [[8962, 8963, 8964], [9004, 9005, 9006], [9046, 9047, 9048]],
                                 [[9531.999, 9533.001, 9534], [9574, 9575, 9576], [9616, 9617, 9618.001]],
                                 [[10102, 10103, 10104], [10144, 10145, 10146],
                                  [10186, 10187, 10188]]]]).astype(np.float32)
    error = np.ones(shape=[2, *crop_size, channels]) * 1.0e-6
    diff = output_ms - expected_output
    assert np.all(abs(diff) < error)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_crop_and_resize_uint8_nearest(datatype=np.uint8):
    """CropAndResize, uint8 ramp on a tiny image, nearest-neighbor, PyNative mode.

    Expected values precomputed for this fixed input ramp and box set.
    """
    context.set_context(mode=context.PYNATIVE_MODE, device_target="CPU")
    batch_size = 2
    image_height = 7
    image_width = 5
    channels = 2
    crop_size = (5, 3)
    offset = 0
    total_values = batch_size * image_height * image_width * channels
    input_data = np.arange(0 + offset, total_values + offset).reshape(
        (batch_size, image_height, image_width, channels))
    input_boxes = np.array(
        [[0.23, 0.5, 0.75, 0.0], [0, 0.1, 0.75, 1.75]]).astype(np.float32)
    input_box_index = np.array([1, 0]).astype(np.int32)
    input_data_tensor = Tensor(input_data.astype(datatype))
    input_boxes_tensor = Tensor(input_boxes)
    input_box_index_tensor = Tensor(input_box_index)
    net = NetCropAndResize("nearest", 0.0)
    output = net(input_data_tensor, input_boxes_tensor,
                 input_box_index_tensor, crop_size)
    output_ms = output.asnumpy()
    expected_output = np.array([[[[84.0, 85.0], [82.0, 83.0], [80.0, 81.0]],
                                 [[94.0, 95.0], [92.0, 93.0], [90.0, 91.0]],
                                 [[104.0, 105.0], [102.0, 103.0], [100.0, 101.0]],
                                 [[114.0, 115.0], [112.0, 113.0], [110.0, 111.0]],
                                 [[124.0, 125.0], [122.0, 123.0], [120.0, 121.0]]],
                                [[[0.0, 1.0], [8.0, 9.0], [0.0, 0.0]],
                                 [[10.0, 11.0], [18.0, 19.0], [0.0, 0.0]],
                                 [[20.0, 21.0], [28.0, 29.0], [0.0, 0.0]],
                                 [[30.0, 31.0], [38.0, 39.0], [0.0, 0.0]],
                                 [[50.0, 51.0], [58.0, 59.0], [0.0, 0.0]]]]).astype(np.float32)
    error = np.ones(shape=[2, *crop_size, channels]) * 1.0e-6
    diff = output_ms - expected_output
    assert np.all(abs(diff) < error)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_crop_and_resize_uint16_bilinear(datatype=np.uint16):
    """CropAndResize, uint16 ramp input, bilinear method, graph mode.

    Expected values precomputed for this fixed input ramp and box set.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
    batch_size = 2
    image_height = 50
    image_width = 30
    channels = 3
    crop_size = (5, 3)
    offset = 0
    total_values = batch_size * image_height * image_width * channels
    input_data = np.arange(0 + offset, total_values + offset).reshape(
        (batch_size, image_height, image_width, channels))
    input_boxes = np.array(
        [[0.23, 0.5, 0.75, 0.0], [0, 0.1, 0.75, 1.75]]).astype(np.float32)
    input_box_index = np.array([1, 0]).astype(np.int32)
    input_data_tensor = Tensor(input_data.astype(datatype))
    input_boxes_tensor = Tensor(input_boxes)
    input_box_index_tensor = Tensor(input_box_index)
    net = NetCropAndResize("bilinear", 0.0)
    output = net(input_data_tensor, input_boxes_tensor,
                 input_box_index_tensor, crop_size)
    output_ms = output.asnumpy()
    expected_output = np.array([[[[5557.7998046875, 5558.7998046875, 5559.7998046875],
                                  [5536.0498046875, 5537.0498046875, 5538.0498046875],
                                  [5514.2998046875, 5515.2998046875, 5516.2998046875]],
                                 [[6131.10009765625, 6132.10009765625, 6133.10009765625],
                                  [6109.35009765625, 6110.35009765625, 6111.35009765625],
                                  [6087.60009765625, 6088.60009765625, 6089.60009765625]],
                                 [[6704.39990234375, 6705.39990234375, 6706.39990234375],
                                  [6682.64990234375, 6683.64990234375, 6684.64990234375],
                                  [6660.89990234375, 6661.89990234375, 6662.89990234375]],
                                 [[7277.7001953125, 7278.7001953125, 7279.7001953125],
                                  [7255.9501953125, 7256.9501953125, 7257.9501953125],
                                  [7234.2001953125, 7235.2001953125, 7236.2001953125]],
                                 [[7851.0, 7852.0, 7853.0], [7829.25, 7830.25, 7831.25],
                                  [7807.5, 7808.5, 7809.5]]],
                                [[[8.700000762939453, 9.700000762939453, 10.700000762939453],
                                  [80.4749984741211, 81.4749984741211, 82.4749984741211],
                                  [0.0, 0.0, 0.0]],
                                 [[835.5750122070312, 836.5750122070312, 837.5750122070312],
                                  [907.3499755859375, 908.3499755859375, 909.3499755859375], [0.0, 0.0, 0.0]],
                                 [[1662.449951171875, 1663.449951171875, 1664.449951171875],
                                  [1734.2249755859375, 1735.2249755859375, 1736.2249755859375],
                                  [0.0, 0.0, 0.0]],
                                 [[2489.324951171875, 2490.324951171875, 2491.324951171875],
                                  [2561.10009765625, 2562.10009765625, 2563.10009765625], [0.0, 0.0, 0.0]],
                                 [[3316.199951171875, 3317.199951171875, 3318.199951171875],
                                  [3387.97509765625, 3388.97509765625, 3389.97509765625],
                                  [0.0, 0.0, 0.0]]]]).astype(np.float32)
    error = np.ones(shape=[2, *crop_size, channels]) * 1.0e-6
    diff = output_ms - expected_output
    assert np.all(abs(diff) < error)

View File

@ -0,0 +1,109 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
import mindspore.context as context
import mindspore
from mindspore import Tensor
from mindspore.ops import operations as P
def runMSRun(op, bbox):
    """Run an NMSWithMask op on raw bbox rows; return surviving boxes and their scores.

    Args:
        op: an instantiated P.NMSWithMask operator.
        bbox: nested list of rows [x1, y1, x2, y2, score].

    Returns:
        (kept_rows, kept_scores): numpy arrays filtered by the op's keep-mask.
    """
    out_boxes, _, out_mask = op(Tensor(bbox, mindspore.float32))
    boxes_np = out_boxes.asnumpy()
    # The mask marks which sorted proposals survived suppression.
    keep = np.where(out_mask.asnumpy())
    kept = boxes_np[keep]
    return kept[:, 0:4], kept[:, -1]
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_nms_with_mask_check_order():
    """NMSWithMask must emit its proposals sorted by score, descending."""
    context.set_context(mode=context.PYNATIVE_MODE, device_target="CPU")
    nms_op = P.NMSWithMask(0.5)
    for _ in range(10):
        count = 4000
        corners = np.random.randint(1, 100, size=(count, 4))
        # Turn (x, y, w, h) offsets into (x1, y1, x2, y2) corners.
        corners[:, 2] += corners[:, 0]
        corners[:, 3] += corners[:, 1]
        raw_scores = np.random.rand(count, 1)
        stacked = Tensor(np.hstack((corners, raw_scores)), dtype=mindspore.float32)
        prop, _, _ = nms_op(stacked)
        scores_from_op = prop.asnumpy()[:, -1]  # last column holds the score
        scores_expected = np.sort(raw_scores, axis=0)[::-1][:, 0]  # manual descending sort
        np.testing.assert_array_almost_equal(scores_from_op, scores_expected)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_nms_with_mask_edge_case_1():
    """CASE 1 - fully overlapping boxes: each box is duplicated with a different
    score, so only the highest-scoring copy of each must survive."""
    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
    boxes = [[12, 4, 33, 17, 0.6], [20, 11, 38, 23, 0.1], [20, 10, 45, 26, 0.9], [15, 17, 35, 38, 0.5],
             [10, 20, 30, 40, 0.4], [35, 35, 89, 90, 0.8], [12, 4, 33, 17, 0.3], [20, 11, 38, 23, 0.2],
             [20, 10, 45, 26, 0.1], [15, 17, 35, 38, 0.8], [10, 20, 30, 40, 0.41], [35, 35, 89, 90, 0.82]]
    want_rows = np.array([[20., 10., 45., 26.],
                          [35., 35., 89., 90.],
                          [15., 17., 35., 38.],
                          [12., 4., 33., 17.]])
    want_scores = np.array([0.9, 0.82, 0.8, 0.6])
    got_rows, got_scores = runMSRun(P.NMSWithMask(0.3), boxes)
    np.testing.assert_almost_equal(got_rows, want_rows)
    np.testing.assert_almost_equal(got_scores, want_scores)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_nms_with_mask_edge_case_2():
    """CASE 2 - degenerate zero-area boxes with valid scores: nothing overlaps,
    so every box is kept, ordered by score."""
    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
    boxes = [[0, 0, 0, 0, 0.6], [0, 0, 0, 0, 0.1]]
    want_rows = np.array([[0., 0., 0., 0.],
                          [0., 0., 0., 0.]])
    want_scores = np.array([0.6, 0.1])
    got_rows, got_scores = runMSRun(P.NMSWithMask(0.5), boxes)
    np.testing.assert_almost_equal(got_rows, want_rows)
    np.testing.assert_almost_equal(got_scores, want_scores)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_nms_with_mask_edge_case_3():
    """CASE 3 - coordinates out of canonical order (x2 < x1 / y2 < y1): the op
    must still keep both boxes, ordered by score."""
    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
    boxes = [[70, 70, 45, 75, 0.6], [30, 33, 43, 29, 0.1]]
    want_rows = np.array([[70., 70., 45., 75.],
                          [30., 33., 43., 29.]])
    want_scores = np.array([0.6, 0.1])
    got_rows, got_scores = runMSRun(P.NMSWithMask(0.7), boxes)
    np.testing.assert_almost_equal(got_rows, want_rows)
    np.testing.assert_almost_equal(got_scores, want_scores)

View File

@ -0,0 +1,121 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.ops import operations as P
class RCWM_count_in(nn.Cell):
    """RandomChoiceWithMask whose count (4) is below the number of valid entries."""

    def __init__(self):
        super(RCWM_count_in, self).__init__()
        self.op = P.RandomChoiceWithMask(count=4, seed=1)

    def construct(self, x):
        return self.op(x)
class RCWM_count_out(nn.Cell):
    """RandomChoiceWithMask whose count (10) exceeds the number of valid entries."""

    def __init__(self):
        super(RCWM_count_out, self).__init__()
        self.op = P.RandomChoiceWithMask(count=10, seed=1)

    def construct(self, x):
        return self.op(x)
class RCWM_3D(nn.Cell):
    """RandomChoiceWithMask wrapper used with a 3-D boolean input mask."""

    def __init__(self):
        super(RCWM_3D, self).__init__()
        self.op = P.RandomChoiceWithMask(count=10, seed=1)

    def construct(self, x):
        return self.op(x)
class RCWM_1D(nn.Cell):
    """RandomChoiceWithMask wrapper used with a 1-D boolean input mask."""

    def __init__(self):
        super(RCWM_1D, self).__init__()
        self.op = P.RandomChoiceWithMask(count=10, seed=9)

    def construct(self, x):
        return self.op(x)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_RCWM_3D():
    """RandomChoiceWithMask on a 3-D mask: outputs are (count, rank) indices and a (count,) mask."""
    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
    # np.bool was deprecated in NumPy 1.20 and removed in 1.24; np.bool_ is the
    # supported spelling and behaves identically here.
    input_tensor = Tensor(np.ones([3, 4, 5]).astype(np.bool_))
    expect1 = (10, 3)
    expect2 = (10,)
    rcwm = RCWM_3D()
    output1, output2 = rcwm(input_tensor)
    assert output1.shape == expect1
    assert output2.shape == expect2
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_RCWM_count_out():
    """count (10) larger than valid entries: output shapes still honor count."""
    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
    # np.bool was deprecated in NumPy 1.20 and removed in 1.24; use np.bool_.
    input_tensor = Tensor(np.array([[1, 0, 1, 0], [0, 0, 0, 1], [1, 1, 1, 1],
                                    [0, 0, 0, 1]]).astype(np.bool_))
    expect1 = (10, 2)
    expect2 = (10,)
    rcwm = RCWM_count_out()
    output1, output2 = rcwm(input_tensor)
    assert output1.shape == expect1
    assert output2.shape == expect2
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_RCWM_count_in():
    """count (4) smaller than valid entries: output shapes match count exactly."""
    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
    # np.bool was deprecated in NumPy 1.20 and removed in 1.24; use np.bool_.
    input_tensor = Tensor(np.array([[1, 0, 1, 0], [0, 0, 0, 1], [1, 1, 1, 1],
                                    [0, 0, 0, 1]]).astype(np.bool_))
    expect1 = (4, 2)
    expect2 = (4,)
    rcwm = RCWM_count_in()
    output1, output2 = rcwm(input_tensor)
    assert output1.shape == expect1
    assert output2.shape == expect2
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_RCWM_1D():
    """1-D mask with a fixed seed (9): checks the exact sampled indices and mask."""
    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
    # np.bool was deprecated in NumPy 1.20 and removed in 1.24; use np.bool_.
    input_tensor = Tensor(
        np.array([1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1]).astype(np.bool_))
    expect_index = np.array([[0], [7], [9], [8], [8], [0],
                             [2], [7], [0], [0]]).astype(np.int32)
    expect_mask = np.array(
        [True, True, True, True, True, True, True, True, False, False])
    rcwm = RCWM_1D()
    output1, output2 = rcwm(input_tensor)
    # Debug prints removed: assertions below already pin the full outputs.
    assert np.array_equal(output1.asnumpy(), expect_index)
    assert np.array_equal(output2.asnumpy(), expect_mask)

View File

@ -0,0 +1,75 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.ops.operations import _grad_ops as G
context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
class NetROIAlignGrad(nn.Cell):
    """Thin Cell wrapper around the ROIAlignGrad gradient operator."""

    def __init__(self, xdiff_shape, pooled_height, pooled_width, spatial_scale, sample_num):
        super(NetROIAlignGrad, self).__init__()
        self.roi_align_grad = G.ROIAlignGrad(xdiff_shape, pooled_height,
                                             pooled_width, spatial_scale, sample_num)

    def construct(self, dy, rois):
        return self.roi_align_grad(dy, rois)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_roi_align_grad():
    """ROIAlignGrad backward on CPU: the incoming gradient is spread uniformly
    over each pooled bin of the 6x6 feature map, for float32 and float16."""
    def roi_align_grad_case(data_type):
        context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
        rois = Tensor(np.array([[0, -2.0, -2.0, 21.0, 21.0]], data_type))
        # dy is a (1, 1, 3, 3) gradient whose rows are all [.1, .2, .3].
        dy = Tensor(np.tile(np.array([.1, .2, .3], data_type), (1, 1, 3, 1)))
        grad_net = NetROIAlignGrad(
            (1, 1, 6, 6),  # xdiff_shape
            3,             # pooled_height
            3,             # pooled_width
            0.25,          # spatial_scale
            2)             # sample_num
        xdiff = grad_net(dy, rois)
        # Every output row is identical: each pooled column's gradient is split
        # evenly across the two feature columns it covers.
        row = [0.025, 0.025, 0.05, 0.05, 0.075, 0.075]
        expect = np.tile(np.array(row), (1, 1, 6, 1))
        np.testing.assert_almost_equal(xdiff.asnumpy(), expect, decimal=4)

    roi_align_grad_case(np.float32)
    roi_align_grad_case(np.float16)

View File

@ -0,0 +1,75 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
import mindspore.context as context
from mindspore import Tensor
from mindspore.ops import operations as P
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_roi_align():
    """ROIAlign forward on CPU across three pooling configurations, for
    float32 and float16 inputs."""
    def roi_align_case(data_type):
        context.set_context(mode=context.PYNATIVE_MODE, device_target="CPU")
        # (1, 1, 6, 6) feature map holding 1..36 row-major.
        features = Tensor(np.arange(1, 37, dtype=data_type).reshape(1, 1, 6, 6))

        # case 1: 3x3 pooling, scale 0.25, 2 samples, roi_end_mode = 1
        rois = Tensor(np.array([[0, -2.0, -2.0, 21.0, 21.0]], data_type))
        op = P.ROIAlign(3, 3, 0.25, 2, 1)
        expect = [[[[4.5, 6.5, 8.5],
                    [16.5, 18.5, 20.5],
                    [28.5, 30.5, 32.5]]]]
        assert (op(features, rois).asnumpy() == expect).all()

        # case 2: slightly larger ROI with roi_end_mode = 0 gives the same bins
        rois = Tensor(np.array([[0, -2.0, -2.0, 22.0, 22.0]], data_type))
        op = P.ROIAlign(3, 3, 0.25, 2, 0)
        assert (op(features, rois).asnumpy() == expect).all()

        # case 3: 2x2 pooling at full scale with adaptive sampling (sample_num = -1)
        op = P.ROIAlign(2, 2, 1.0, -1, 0)
        expect3 = [[[[6.295, 0.],
                     [0., 0.]]]]
        np.testing.assert_almost_equal(op(features, rois).asnumpy(), expect3, decimal=2)

    roi_align_case(np.float32)
    roi_align_case(np.float16)

View File

@ -0,0 +1,142 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.ops import operations as P
class Net(nn.Cell):
    """Applies ScatterNd with a fixed output shape supplied at construction."""

    def __init__(self, _shape):
        super(Net, self).__init__()
        self.out_shape = _shape
        self.scatter_nd = P.ScatterNd()

    def construct(self, indices, update):
        return self.scatter_nd(indices, update, self.out_shape)
def scatternd_net(indices, update, _shape, expect):
    """Run ScatterNd through Net and compare the result to expect within 1e-6."""
    net = Net(_shape)
    result = net(Tensor(indices), Tensor(update)).asnumpy()
    tolerance = np.ones(shape=result.shape) * 1.0e-6
    delta = result - expect
    # Both checks together enforce |delta| < tolerance elementwise.
    assert np.all(delta < tolerance)
    assert np.all(-delta < tolerance)
def scatternd_positive(nptype):
    """ScatterNd accumulates duplicate indices: positive/mixed updates.

    The original repeated the whole case verbatim for int32 and int64 indices;
    the loop keeps both checks while removing the duplication.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
    shape = (2, 2)
    # [0, 1] receives 3.2 + 5.3 - 2.2 - 1.0 = 5.3; [1, 1] receives 1.1.
    expect = np.array([[0., 5.3],
                       [0., 1.1]]).astype(nptype)
    for index_type in (np.int32, np.int64):
        arr_indices = np.array([[0, 1], [1, 1], [0, 1], [0, 1], [0, 1]]).astype(index_type)
        arr_update = np.array([3.2, 1.1, 5.3, -2.2, -1.0]).astype(nptype)
        scatternd_net(arr_indices, arr_update, shape, expect)
def scatternd_negative(nptype):
    """ScatterNd accumulates duplicate indices: negative updates.

    Deduplicated: the int32 and int64 index cases were identical copy-pastes.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
    shape = (2, 2)
    # [1, 0] receives -13.4 + 5.1 - 12.1 - 1.0 = -21.4; [1, 1] receives -3.1.
    expect = np.array([[0., 0.],
                       [-21.4, -3.1]]).astype(nptype)
    for index_type in (np.int32, np.int64):
        arr_indices = np.array([[1, 0], [1, 1], [1, 0], [1, 0], [1, 0]]).astype(index_type)
        arr_update = np.array([-13.4, -3.1, 5.1, -12.1, -1.0]).astype(nptype)
        scatternd_net(arr_indices, arr_update, shape, expect)
def scatternd_positive_uint(nptype):
    """ScatterNd with unsigned dtypes: updates truncate to 3, 1, 5, 3, 1 before
    accumulation, so [0, 1] sums to 12 and [1, 1] to 1.

    Deduplicated: the int32 and int64 index cases were identical copy-pastes.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
    shape = (2, 2)
    expect = np.array([[0., 12.],
                       [0., 1.]]).astype(nptype)
    for index_type in (np.int32, np.int64):
        arr_indices = np.array([[0, 1], [1, 1], [0, 1], [0, 1], [0, 1]]).astype(index_type)
        arr_update = np.array([3.2, 1.1, 5.3, 3.8, 1.2]).astype(nptype)
        scatternd_net(arr_indices, arr_update, shape, expect)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_scatternd_float64():
    """ScatterNd CPU: float64 data, positive and negative accumulation."""
    scatternd_positive(np.float64)
    scatternd_negative(np.float64)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_scatternd_float32():
    """ScatterNd CPU: float32 data, positive and negative accumulation."""
    scatternd_positive(np.float32)
    scatternd_negative(np.float32)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_scatternd_int64():
    """ScatterNd CPU: int64 data, positive and negative accumulation."""
    scatternd_positive(np.int64)
    scatternd_negative(np.int64)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_scatternd_int16():
    """ScatterNd CPU: int16 data, positive and negative accumulation."""
    scatternd_positive(np.int16)
    scatternd_negative(np.int16)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_scatternd_uint64():
    """ScatterNd CPU: uint64 data (positive updates only)."""
    scatternd_positive_uint(np.uint64)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_scatternd_uint32():
    """ScatterNd CPU: uint32 data (positive updates only)."""
    scatternd_positive_uint(np.uint32)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_scatternd_uint16():
    """ScatterNd CPU: uint16 data (positive updates only)."""
    scatternd_positive_uint(np.uint16)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_scatternd_uint8():
    """ScatterNd CPU: uint8 data (positive updates only)."""
    scatternd_positive_uint(np.uint8)