!41060 [assistant][ops] New operator implementation, including CombinedNonMaxSuppression
Merge pull request !41060 from Wangsong95/combinednonmaxsuppression
This commit is contained in:
commit b1a32c6c89
@ -0,0 +1,458 @@
/**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "plugin/device/cpu/kernel/combined_non_max_suppression_cpu_kernel.h"
#include "plugin/device/cpu/hal/device/cpu_device_address.h"

namespace mindspore {
namespace kernel {
namespace {
constexpr char kKernelName[] = "CombinedNonMaxSuppression";
constexpr size_t kCombinedNonMaxSuppressionInputsNum = 6;
constexpr size_t kCombinedNonMaxSuppressionOutputsNum = 4;
constexpr size_t KIndex0 = 0;
constexpr size_t KIndex1 = 1;
constexpr size_t KIndex2 = 2;
constexpr size_t KIndex3 = 3;
constexpr size_t KIndex4 = 4;
constexpr size_t KIndex5 = 5;
constexpr size_t KIndex6 = 6;
constexpr size_t KIndex7 = 7;
constexpr size_t KIndex8 = 8;
constexpr size_t KIndex9 = 9;
constexpr size_t KIndex10 = 10;
constexpr int64_t DimSize4 = 4;
constexpr float k_5 = 0.5;
constexpr int multiplier = 4;
}  // namespace

void CombinedNonMaxSuppressionCpuKernelMod::regular_input2buffer(std::vector<std::vector<float>> *boxes_buffer,
                                                                 float *box_src, const int class_idx) {
  /**
   * Shape of box_src: box_src[num_boxes_ * q_ * 4].
   * box_src[i][class_idx][k] is stored in one dimension and is visited as
   * box_src[i][class_idx][k] = box_src[i * q_ * 4 + class_idx * 4 + k].
   */
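  // Illustrative reading of the formula above (the values here are hypothetical, chosen only for the example):
  // with q_ = 2 and class_idx = 1, the four coordinates of box i start at flat offset i * 8 + 4,
  // so box 3 occupies box_src[28..31].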
  int sub_box_len1 = q_ * multiplier;
  int box_len2 = (class_idx << KIndex2);
  for (size_t i = 0; i < IntToSize(num_boxes_); i++) {
    size_t box_len1 = IntToSize(i * sub_box_len1 + box_len2);
    if (box_src[box_len1] > box_src[box_len1 + KIndex2]) {
      (*boxes_buffer)[i][0] = box_src[box_len1 + KIndex2];
      (*boxes_buffer)[i][KIndex2] = box_src[box_len1 + 0];
    } else {
      (*boxes_buffer)[i][0] = box_src[box_len1 + 0];
      (*boxes_buffer)[i][KIndex2] = box_src[box_len1 + KIndex2];
    }
    if (box_src[box_len1 + KIndex1] > box_src[box_len1 + KIndex3]) {
      (*boxes_buffer)[i][KIndex1] = box_src[box_len1 + KIndex3];
      (*boxes_buffer)[i][KIndex3] = box_src[box_len1 + KIndex1];
    } else {
      (*boxes_buffer)[i][KIndex1] = box_src[box_len1 + KIndex1];
      (*boxes_buffer)[i][KIndex3] = box_src[box_len1 + KIndex3];
    }
  }
}

// Calculate the intersection over union (IoU) of two boxes.
float CombinedNonMaxSuppressionCpuKernelMod::IOU(std::vector<std::vector<float>> *boxes_buffer, int i, int j) {
  std::vector<float> box_a = (*boxes_buffer)[i];
  std::vector<float> box_b = (*boxes_buffer)[j];
  float lx, ly, rx, ry;
  float w, h;
  float area;
  float area_a = (box_a[KIndex2] - box_a[0]) * (box_a[KIndex3] - box_a[KIndex1]);
  float area_b = (box_b[KIndex2] - box_b[0]) * (box_b[KIndex3] - box_b[KIndex1]);
  if (area_a <= 0 || area_b <= 0) {
    return 0.0;
  }
  lx = box_a[0] > box_b[0] ? box_a[0] : box_b[0];
  ly = box_a[KIndex1] > box_b[KIndex1] ? box_a[KIndex1] : box_b[KIndex1];
  rx = box_a[KIndex2] < box_b[KIndex2] ? box_a[KIndex2] : box_b[KIndex2];
  ry = box_a[KIndex3] < box_b[KIndex3] ? box_a[KIndex3] : box_b[KIndex3];
  w = rx > lx ? (rx - lx) : 0;
  h = ry > ly ? (ry - ly) : 0;
  area = w * h;
  return area / (area_a + area_b - area);
}

/**
 * If soft_nms_sigma_ > 0.0, soft-NMS is used: the score is updated by score = score * exp(scale * iou^2).
 * If soft_nms_sigma_ <= 0.0, hard NMS is used: a box is discarded when iou > iou_threshold_.
 * Runs non-max suppression per batch per class.
 */
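// Illustrative decay step for the soft-NMS branch (the sigma value is chosen for this example only):
// with soft_nms_sigma_ = 0.5, scale = -0.5 / 0.5 = -1, so a candidate overlapping an already selected
// box with iou = 0.6 <= iou_threshold_ keeps exp(-0.36) ~= 0.70 of its score, while an iou above
// iou_threshold_ zeroes the score.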
void CombinedNonMaxSuppressionCpuKernelMod::non_max_suppression(std::vector<std::vector<float>> *boxes_buffer,
                                                                std::vector<float> *scores_buffer,
                                                                std::vector<int> &selected) {
  std::priority_queue<non_max_suppression_local::score_index> pq;
  for (size_t i = 0; i < IntToSize(num_boxes_); i++) {
    if ((*scores_buffer)[i] > score_threshold_) {
      pq.push(non_max_suppression_local::score_index(static_cast<int>(i), (*scores_buffer)[i], 0));
    }
  }

  float scale = static_cast<float>(0.0);
  bool is_soft_nms = soft_nms_sigma_ > static_cast<float>(0.0);
  if (is_soft_nms) {
    scale = static_cast<float>(-k_5) / soft_nms_sigma_;
  }

  float similarity;
  non_max_suppression_local::score_index next_si;
  while (static_cast<int>(selected.size()) < size_per_class_ && !pq.empty()) {
    next_si = pq.top();
    float original_score = next_si.score;
    pq.pop();
    bool should_hard_suppress = false;
    for (int j = selected.size() - 1; j >= next_si.suppress_begin_index; j--) {
      similarity = IOU(boxes_buffer, next_si.box_index, selected[IntToSize(j)]);
      if (is_soft_nms) {
        next_si.score *=
          similarity <= iou_threshold_ ? std::exp(scale * similarity * similarity) : static_cast<float>(0.0);
      }
      if (!is_soft_nms && similarity > iou_threshold_) {
        should_hard_suppress = true;
        break;
      }
      if (next_si.score <= score_threshold_) break;
    }

    next_si.suppress_begin_index = static_cast<int>(selected.size());
    if (!should_hard_suppress) {
      if (next_si.score == original_score) {
        selected.push_back(next_si.box_index);
        continue;
      }
      if (next_si.score > score_threshold_) {
        pq.push(next_si);
      }
    }
  }
}

void CombinedNonMaxSuppressionCpuKernelMod::nms_perclass(
  float *boxes, float *scores, std::vector<non_max_suppression_local::result_para> &sub_result_vec, int &result_size) {
  size_t k = 0;
  int box_idx;
  size_t boxe_len1;
  int sub_box_len1 = q_ * multiplier;
  int box_len2 = 0;
  std::vector<std::vector<float>> boxes_buffer(num_boxes_, std::vector<float>(KIndex4));
  std::vector<float> scores_buffer(num_boxes_);
  /**
   * Shapes of scores and boxes:
   * scores[num_boxes_ * num_class_]
   * boxes[num_boxes_ * q_ * 4]
   */
  if (q_ == 1) {
    regular_input2buffer(&boxes_buffer, boxes, 0);
  }
  for (int j = 0; j < num_class_; j++) {
    for (int i = 0; i < num_boxes_; i++) {
      scores_buffer[IntToSize(i)] = scores[IntToSize(i * num_class_ + j)];
    }
    if (q_ > 1) {
      regular_input2buffer(&boxes_buffer, boxes, j);
      box_len2 = j * multiplier;
    }
    std::vector<int> selected;
    non_max_suppression(&boxes_buffer, &scores_buffer, selected);
    for (size_t i = 0; i < selected.size(); i++) {
      box_idx = selected[i];
      boxe_len1 = IntToSize(box_idx * sub_box_len1 + box_len2);
      sub_result_vec[k++] = {
        box_idx,
        scores_buffer[IntToSize(box_idx)],
        j,
        {boxes[boxe_len1 + 0], boxes[boxe_len1 + 1], boxes[boxe_len1 + KIndex2], boxes[boxe_len1 + KIndex3]}};
    }
    result_size += selected.size();
  }
}

size_t CombinedNonMaxSuppressionCpuKernelMod::nms_perbath(float *boxes, float *scores, float *nmsed_boxes,
                                                          float *nmsed_scores, float *nmsed_class,
                                                          int *valid_detection) {
  int box_size = num_bath_ * num_detection_ * sizeof(float) * multiplier;
  int score_size = num_bath_ * num_detection_ * sizeof(float);
  void(memset_s(nmsed_boxes, box_size, 0, box_size));
  void(memset_s(nmsed_scores, score_size, 0, score_size));
  void(memset_s(nmsed_class, score_size, 0, score_size));
  void(memset_s(valid_detection, sizeof(int) * num_bath_, 0, sizeof(int) * num_bath_));
  const float box_min = 0.0;
  const float box_max = 1.0;
  /**
   * Shapes of scores and boxes:
   * scores[num_bath_ * num_boxes_ * num_class_]
   * boxes[num_bath_ * num_boxes_ * q_ * 4]
   */
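  /**
   * Output layout note (reading aid, mirroring the memset sizes above):
   * nmsed_boxes[num_bath_ * num_detection_ * 4], nmsed_scores / nmsed_class[num_bath_ * num_detection_];
   * coordinate c of detection k in batch i is written at nmsed_boxes[(i * num_detection_ + k) * 4 + c].
   */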
  int score_len2 = num_boxes_ * num_class_;
  int boxes_len2 = num_boxes_ * q_ * multiplier;
  auto shard_nms = [this, &boxes, &scores, score_len2, boxes_len2, &nmsed_boxes, &nmsed_scores, &nmsed_class,
                    &valid_detection, box_max, box_min](size_t start, size_t end) {
    for (size_t i = start; i < end; i++) {
      int tmp_i = static_cast<int>(i);
      int per_detections = 0;
      size_t scores_KIndex = 0;
      int result_size = 0;
      std::vector<non_max_suppression_local::result_para> result_vec(size_per_class_ * num_class_,
                                                                     {0, 0.0, 0, {0.0, 0.0, 0.0, 0.0}});
      nms_perclass(boxes + tmp_i * boxes_len2, scores + tmp_i * score_len2, result_vec, result_size);
      if (!pad_per_class_) {
        per_detections = std::min(result_size, max_total_size_);
      } else {
        per_detections = std::min(result_size, num_detection_);
      }
      std::sort(result_vec.begin(), result_vec.begin() + result_size, non_max_suppression_local::result_cmp);
      scores_KIndex = IntToSize(tmp_i * num_detection_);
      for (size_t k = 0; k < IntToSize(per_detections); k++) {
        if (clip_boxes_) {
          nmsed_boxes[(scores_KIndex << KIndex2) + 0] =
            std::max(std::min(result_vec[k].box_coord[0], box_max), box_min);
          nmsed_boxes[(scores_KIndex << KIndex2) + KIndex1] =
            std::max(std::min(result_vec[k].box_coord[KIndex1], box_max), box_min);
          nmsed_boxes[(scores_KIndex << KIndex2) + KIndex2] =
            std::max(std::min(result_vec[k].box_coord[KIndex2], box_max), box_min);
          nmsed_boxes[(scores_KIndex << KIndex2) + KIndex3] =
            std::max(std::min(result_vec[k].box_coord[KIndex3], box_max), box_min);
          nmsed_scores[scores_KIndex] = result_vec[k].score;
          nmsed_class[scores_KIndex] = static_cast<float>(result_vec[k].class_idx);
        } else {
          nmsed_boxes[(scores_KIndex << KIndex2) + 0] = result_vec[k].box_coord[0];
          nmsed_boxes[(scores_KIndex << KIndex2) + KIndex1] = result_vec[k].box_coord[KIndex1];
          nmsed_boxes[(scores_KIndex << KIndex2) + KIndex2] = result_vec[k].box_coord[KIndex2];
          nmsed_boxes[(scores_KIndex << KIndex2) + KIndex3] = result_vec[k].box_coord[KIndex3];
          nmsed_scores[scores_KIndex] = result_vec[k].score;
          nmsed_class[scores_KIndex] = static_cast<float>(result_vec[k].class_idx);
        }
        scores_KIndex++;
      }
      valid_detection[i] = per_detections;
    }
  };
  ParallelLaunchAutoSearch(shard_nms, num_bath_, this, &parallel_search_info_);
  return true;
}

void CombinedNonMaxSuppressionCpuKernelMod::CheckInput() {
  constexpr int kInputDimension0 = 4;
  if (input0_shape_.size() != kInputDimension0) {
    MS_LOG(EXCEPTION) << "For " << kKernelName << ", the boxes's dims must be 4, but got " << input0_shape_.size()
                      << ".";
  }
  constexpr int kInputDimension1 = 3;
  if (input1_shape_.size() != kInputDimension1) {
    MS_LOG(EXCEPTION) << "For " << kKernelName << ", the scores's dims must be 3, but got " << input1_shape_.size()
                      << ".";
  }
  if (input2_shape_.size() != 0) {
    MS_LOG(EXCEPTION) << "For " << kKernelName << ", the max_output_size_per_class's dims must be 0, but got "
                      << input2_shape_.size() << ".";
  }
  if (input3_shape_.size() != 0) {
    MS_LOG(EXCEPTION) << "For " << kKernelName << ", the max_total_size's dims must be 0, but got "
                      << input3_shape_.size() << ".";
  }
  if (input4_shape_.size() != 0) {
    MS_LOG(EXCEPTION) << "For " << kKernelName << ", the iou_threshold's dims must be 0, but got "
                      << input4_shape_.size() << ".";
  }
  if (input5_shape_.size() != 0) {
    MS_LOG(EXCEPTION) << "For " << kKernelName << ", the score_threshold's dims must be 0, but got "
                      << input5_shape_.size() << ".";
  }
  if (input0_shape_[0] != input1_shape_[0]) {
    MS_LOG(EXCEPTION) << "For " << kKernelName << ", the boxes's 1st dim must be the same as the scores's 1st dim,"
                      << " but got " << input0_shape_[0] << " and " << input1_shape_[0] << ".";
  }
  if (input0_shape_[KIndex1] != input1_shape_[KIndex1]) {
    MS_LOG(EXCEPTION) << "For " << kKernelName << ", the boxes's 2nd dim must be the same as the scores's 2nd dim,"
                      << " but got " << input0_shape_[KIndex1] << " and " << input1_shape_[KIndex1] << ".";
  }
  if (input0_shape_[KIndex2] != input1_shape_[KIndex2] && input0_shape_[KIndex2] != 1) {
    MS_LOG(EXCEPTION) << "For " << kKernelName << ", the boxes's 3rd dim must be the same as the scores's 3rd dim or 1"
                      << ", but got " << input0_shape_[KIndex2] << ".";
  }
  if (input0_shape_[KIndex3] != DimSize4) {
    MS_LOG(EXCEPTION) << "For " << kKernelName << ", the boxes's 4th dim must be equal to 4, but got "
                      << input0_shape_[KIndex3] << ".";
  }
}

void CombinedNonMaxSuppressionCpuKernelMod::CheckOutput() {
  constexpr size_t kOutputDimension0 = 3;
  constexpr size_t kOutputDimension1 = 2;
  constexpr size_t kOutputDimension2 = 2;
  constexpr size_t kOutputDimension3 = 1;
  if (output0_shape_.size() != kOutputDimension0) {
    MS_LOG(EXCEPTION) << "For " << kKernelName << ", the nmsed_boxes's dims must be 3, but got "
                      << output0_shape_.size() << ".";
  }
  if (output1_shape_.size() != kOutputDimension1) {
    MS_LOG(EXCEPTION) << "For " << kKernelName << ", the nmsed_scores's dims must be 2, but got "
                      << output1_shape_.size() << ".";
  }
  if (output2_shape_.size() != kOutputDimension2) {
    MS_LOG(EXCEPTION) << "For " << kKernelName << ", the nmsed_classes's dims must be 2, but got "
                      << output2_shape_.size() << ".";
  }
  if (output3_shape_.size() != kOutputDimension3) {
    MS_LOG(EXCEPTION) << "For " << kKernelName << ", the valid_detection's dims must be 1, but got "
                      << output3_shape_.size() << ".";
  }
  if ((output0_shape_[0] != output1_shape_[0] || output0_shape_[0] != output2_shape_[0]) ||
      output0_shape_[0] != output3_shape_[0]) {
    MS_LOG(EXCEPTION) << "For " << kKernelName << ", the nmsed_boxes's 1st dim, nmsed_scores's 1st dim,"
                      << " nmsed_classes's 1st dim and valid_detection's 1st dim must be the same, but got"
                      << " the four as follows: " << output0_shape_[0] << " and " << output1_shape_[0] << " and "
                      << output2_shape_[0] << " and " << output3_shape_[0] << ".";
  }
  if (output0_shape_[1] != output1_shape_[1] || output0_shape_[1] != output2_shape_[1]) {
    MS_LOG(EXCEPTION) << "For " << kKernelName << ", the nmsed_boxes's 2nd dim, nmsed_scores's 2nd dim and"
                      << " nmsed_classes's 2nd dim must be the same, but got the three as follows: "
                      << output0_shape_[1] << " and " << output1_shape_[1] << " and " << output2_shape_[1] << ".";
  }
  if (static_cast<int>(output0_shape_[0]) != num_bath_) {
    MS_LOG(EXCEPTION) << "For " << kKernelName << ", the nmsed_boxes's 1st dim must be the same as the boxes's 1st"
                      << " dim, but got " << output0_shape_[0] << ".";
  }
  if (static_cast<int>(output1_shape_[0]) != num_bath_) {
    MS_LOG(EXCEPTION) << "For " << kKernelName << ", the nmsed_scores's 1st dim must be the same as the boxes's 1st"
                      << " dim, but got " << output1_shape_[0] << ".";
  }
  if (static_cast<int>(output2_shape_[0]) != num_bath_) {
    MS_LOG(EXCEPTION) << "For " << kKernelName << ", the nmsed_classes's 1st dim must be the same as the boxes's 1st"
                      << " dim, but got " << output2_shape_[0] << ".";
  }
  if (static_cast<int>(output3_shape_[0]) != num_bath_) {
    MS_LOG(EXCEPTION) << "For " << kKernelName << ", the valid_detection's 1st dim must be the same as the boxes's 1st"
                      << " dim, but got " << output3_shape_[0] << ".";
  }
}

void CombinedNonMaxSuppressionCpuKernelMod::InitKernel(const CNodePtr &kernel_node) {
  size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
  size_t output_num = common::AnfAlgo::GetOutputTensorNum(kernel_node);
  node_wpt_ = kernel_node;
  input0_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
  input1_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, KIndex1);
  input2_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, KIndex2);
  input3_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, KIndex3);
  input4_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, KIndex4);
  input5_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, KIndex5);
  soft_nms_sigma_ = 0.0;
  num_bath_ = static_cast<int>(input0_shape_[0]);
  num_boxes_ = static_cast<int>(input0_shape_[KIndex1]);
  q_ = static_cast<int>(input0_shape_[KIndex2]);
  num_class_ = static_cast<int>(input1_shape_[KIndex2]);
  kernel_name_ = common::AnfAlgo::GetCNodeName(kernel_node);

  pad_per_class_ = false;
  clip_boxes_ = true;
  auto prim = common::AnfAlgo::GetCNodePrimitive(kernel_node);
  auto pad_per_class = prim->GetAttr("pad_per_class");
  auto clip_boxes = prim->GetAttr("clip_boxes");
  if (pad_per_class != nullptr) {
    pad_per_class_ = GetValue<bool>(pad_per_class);
  }
  if (clip_boxes != nullptr) {
    clip_boxes_ = GetValue<bool>(clip_boxes);
  }
  CHECK_KERNEL_INPUTS_NUM(input_num, kCombinedNonMaxSuppressionInputsNum, kernel_name_);
  CHECK_KERNEL_OUTPUTS_NUM(output_num, kCombinedNonMaxSuppressionOutputsNum, kernel_name_);
}

bool CombinedNonMaxSuppressionCpuKernelMod::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                                   const std::vector<kernel::AddressPtr> &,
                                                   const std::vector<kernel::AddressPtr> &outputs) {
  float *boxes = reinterpret_cast<float *>(inputs[0]->addr);
  float *scores = reinterpret_cast<float *>(inputs[KIndex1]->addr);
  max_output_size_per_class_ = *(reinterpret_cast<int *>(inputs[KIndex2]->addr));
  max_total_size_ = *(reinterpret_cast<int *>(inputs[KIndex3]->addr));
  iou_threshold_ = *(reinterpret_cast<float *>(inputs[KIndex4]->addr));
  score_threshold_ = *(reinterpret_cast<float *>(inputs[KIndex5]->addr));
  float *nmsed_boxes = reinterpret_cast<float *>(outputs[KIndex0]->addr);
  float *nmsed_scores = reinterpret_cast<float *>(outputs[KIndex1]->addr);
  float *nmsed_class = reinterpret_cast<float *>(outputs[KIndex2]->addr);
  int *valid_detection = reinterpret_cast<int *>(outputs[KIndex3]->addr);
  if (pad_per_class_) {
    num_detection_ = std::min(max_total_size_, max_output_size_per_class_ * num_class_);
  } else {
    num_detection_ = max_total_size_;
  }
  auto node_ = node_wpt_.lock();
  if (!node_) {
    MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', node_wpt_(kernel_node) is expired.";
  }
  ShapeVector shape0 = {input0_shape_[0], static_cast<int64_t>(num_detection_), DimSize4};
  ShapeVector shape1 = {input0_shape_[0], static_cast<int64_t>(num_detection_)};
  ShapeVector shape2 = {input0_shape_[0], static_cast<int64_t>(num_detection_)};
  ShapeVector shape3 = {input0_shape_[0]};
  common::AnfAlgo::SetOutputInferTypeAndShape(
    {kNumberTypeFloat32, kNumberTypeFloat32, kNumberTypeFloat32, kNumberTypeInt32}, {shape0, shape1, shape2, shape3},
    node_.get());
  output0_shape_ = AnfAlgo::GetOutputDeviceShape(node_, KIndex0);
  output1_shape_ = AnfAlgo::GetOutputDeviceShape(node_, KIndex1);
  output2_shape_ = AnfAlgo::GetOutputDeviceShape(node_, KIndex2);
  output3_shape_ = AnfAlgo::GetOutputDeviceShape(node_, KIndex3);
  size_per_class_ = max_output_size_per_class_ < num_boxes_ ? max_output_size_per_class_ : num_boxes_;
  CheckInput();
  CheckOutput();
  if (max_total_size_ <= 0) {
    MS_LOG(EXCEPTION) << "For " << kernel_name_ << ", max_total_size must be > 0, but got " << max_total_size_ << ".";
  }
  if (max_output_size_per_class_ <= 0) {
    MS_LOG(EXCEPTION) << "For " << kernel_name_ << ", max_output_size_per_class must be > 0, but got "
                      << max_output_size_per_class_ << ".";
  }
  if (iou_threshold_ < 0 || iou_threshold_ > 1) {
    MS_LOG(EXCEPTION) << "For " << kernel_name_ << ", iou_threshold must be in [0, 1], but got " << iou_threshold_
                      << ".";
  }
  if (static_cast<int>(output0_shape_[KIndex1]) != num_detection_) {
    MS_LOG(EXCEPTION) << "For " << kernel_name_ << ", the nmsed_boxes's 2nd dim must be " << num_detection_
                      << ", but got " << output0_shape_[KIndex1] << ".";
  }
  if (static_cast<int>(output1_shape_[KIndex1]) != num_detection_) {
    MS_LOG(EXCEPTION) << "For " << kernel_name_ << ", the nmsed_scores's 2nd dim must be " << num_detection_
                      << ", but got " << output1_shape_[KIndex1] << ".";
  }
  if (static_cast<int>(output2_shape_[KIndex1]) != num_detection_) {
    MS_LOG(EXCEPTION) << "For " << kernel_name_ << ", the nmsed_classes's 2nd dim must be " << num_detection_
                      << ", but got " << output2_shape_[KIndex1] << ".";
  }
  nms_perbath(boxes, scores, nmsed_boxes, nmsed_scores, nmsed_class, valid_detection);
  return true;
}

std::vector<KernelAttr> CombinedNonMaxSuppressionCpuKernelMod::GetOpSupport() {
  static std::vector<KernelAttr> kernel_attr_list = {
    KernelAttr()
      .AddInputAttr(kNumberTypeFloat32)
      .AddInputAttr(kNumberTypeFloat32)
      .AddInputAttr(kNumberTypeInt32)
      .AddInputAttr(kNumberTypeInt32)
      .AddInputAttr(kNumberTypeFloat32)
      .AddInputAttr(kNumberTypeFloat32)
      .AddOutputAttr(kNumberTypeFloat32)
      .AddOutputAttr(kNumberTypeFloat32)
      .AddOutputAttr(kNumberTypeFloat32)
      .AddOutputAttr(kNumberTypeInt32),
  };

  return kernel_attr_list;
}
MS_KERNEL_FACTORY_REG(NativeCpuKernelMod, CombinedNonMaxSuppression, CombinedNonMaxSuppressionCpuKernelMod);
}  // namespace kernel
}  // namespace mindspore
@ -0,0 +1,104 @@
/**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_COMBINED_NON_MAX_SUPPRESSION_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_COMBINED_NON_MAX_SUPPRESSION_CPU_KERNEL_H_
#include <algorithm>
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include <queue>
#include "plugin/device/cpu/kernel/cpu_kernel.h"
#include "plugin/factory/ms_factory.h"

namespace non_max_suppression_local {
struct score_index {
  int box_index;
  float score;
  int suppress_begin_index;
  score_index() {}
  score_index(int bi, float s, int sbi) : box_index(bi), score(s), suppress_begin_index(sbi) {}
  bool operator<(const score_index &b) const {
    return (score < b.score) || ((score == b.score) && (box_index > b.box_index));
  }
};
struct result_para {
  int box_index;
  float score;
  int class_idx;
  float box_coord[4];
};

inline bool result_cmp(const result_para &a, const result_para &b) { return a.score > b.score; }
}  // namespace non_max_suppression_local

namespace mindspore {
namespace kernel {
class CombinedNonMaxSuppressionCpuKernelMod : public DeprecatedNativeCpuKernelMod {
 public:
  CombinedNonMaxSuppressionCpuKernelMod() = default;
  ~CombinedNonMaxSuppressionCpuKernelMod() override = default;
  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 protected:
  std::vector<KernelAttr> GetOpSupport() override;

 private:
  size_t nms_perbath(float *, float *, float *, float *, float *, int *);
  void regular_input2buffer(std::vector<std::vector<float>> *, float *, const int);
  float IOU(std::vector<std::vector<float>> *, int, int);
  void non_max_suppression(std::vector<std::vector<float>> *, std::vector<float> *, std::vector<int> &);
  void nms_perclass(float *, float *, std::vector<non_max_suppression_local::result_para> &, int &);
  void CheckInput();
  void CheckOutput();
  int num_bath_ = 0;
  int num_boxes_ = 0;
  int q_ = 0;
  int num_class_ = 0;
  // Number of detections kept per batch.
  int num_detection_ = 0;
  int max_total_size_ = 0;
  // User-specified maximum number of boxes selected per class.
  int max_output_size_per_class_ = 0;
  // Per-class selection size actually used (the smaller of max_output_size_per_class_ and num_boxes_).
  int size_per_class_ = 0;
  // Boxes whose score is lower than score_threshold_ are dropped.
  float score_threshold_ = 0.0;
  // Above this IoU, soft_nms_sigma_ decides whether a box is deleted or its score decayed.
  float iou_threshold_ = 0.0;
  float soft_nms_sigma_ = 0.0;
  bool pad_per_class_ = false;
  bool clip_boxes_ = true;
  CNodeWeakPtr node_wpt_;
  std::vector<int64_t> input0_shape_;
  std::vector<int64_t> input1_shape_;
  std::vector<int64_t> input2_shape_;
  std::vector<int64_t> input3_shape_;
  std::vector<int64_t> input4_shape_;
  std::vector<int64_t> input5_shape_;
  std::vector<int64_t> output0_shape_;
  std::vector<int64_t> output1_shape_;
  std::vector<int64_t> output2_shape_;
  std::vector<int64_t> output3_shape_;
};
}  // namespace kernel
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_COMBINED_NON_MAX_SUPPRESSION_CPU_KERNEL_H_
@ -0,0 +1,178 @@
/**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "ops/combined_non_max_suppression.h"
#define IsValue(value_ptr) (!value_ptr->isa<AnyValue>() && !value_ptr->isa<None>())
#include <algorithm>
#include <set>
#include "ops/op_utils.h"
#include "utils/check_convert_utils.h"
#include "abstract/ops/primitive_infer_map.h"
#include "mindapi/src/helper.h"

namespace mindspore {
namespace ops {
namespace {
const int64_t kInputDimension0 = 4;
const int64_t kInputDimension1 = 3;
const int64_t kDimsize = 4;
const int64_t kInputs = 6;
const size_t ksecond = 2;
tensor::TensorPtr Get_Value(const std::vector<AbstractBasePtr> &input_args, size_t index) {
  auto input = input_args[index]->cast<abstract::AbstractTensorPtr>();
  MS_EXCEPTION_IF_NULL(input);
  auto input_shape_value_ptr = input->BuildValue();
  MS_EXCEPTION_IF_NULL(input_shape_value_ptr);
  return input_shape_value_ptr->cast<tensor::TensorPtr>();
}
abstract::TupleShapePtr CombinedNonMaxSuppressionInferShape(const PrimitivePtr &primitive,
                                                            const std::vector<AbstractBasePtr> &input_args) {
  auto prim_name = primitive->name();
  auto input0_shape = CheckAndConvertUtils::ConvertShapePtrToShapeMap(input_args[kInputIndex0]->BuildShape())[kShape];
  auto input1_shape = CheckAndConvertUtils::ConvertShapePtrToShapeMap(input_args[kInputIndex1]->BuildShape())[kShape];
  auto input2_shape = CheckAndConvertUtils::ConvertShapePtrToShapeMap(input_args[kInputIndex2]->BuildShape())[kShape];
  auto input3_shape = CheckAndConvertUtils::ConvertShapePtrToShapeMap(input_args[kInputIndex3]->BuildShape())[kShape];
  auto input4_shape = CheckAndConvertUtils::ConvertShapePtrToShapeMap(input_args[kInputIndex4]->BuildShape())[kShape];
  auto input5_shape = CheckAndConvertUtils::ConvertShapePtrToShapeMap(input_args[kInputIndex5]->BuildShape())[kShape];
  (void)CheckAndConvertUtils::CheckInteger("boxes dim", input0_shape.size(), kEqual, kInputDimension0, prim_name);
  (void)CheckAndConvertUtils::CheckInteger("scores dim", input1_shape.size(), kEqual, kInputDimension1, prim_name);
  (void)CheckAndConvertUtils::CheckInteger("max_output_size_per_class dim", input2_shape.size(), kEqual, 0, prim_name);
  (void)CheckAndConvertUtils::CheckInteger("max_total_size dim", input3_shape.size(), kEqual, 0, prim_name);
  (void)CheckAndConvertUtils::CheckInteger("iou_threshold dim", input4_shape.size(), kEqual, 0, prim_name);
  (void)CheckAndConvertUtils::CheckInteger("score_threshold dim", input5_shape.size(), kEqual, 0, prim_name);
  if (input0_shape[0] != input1_shape[0]) {
    MS_EXCEPTION(ValueError) << "For " << prim_name << ", the boxes's 1st dim must be the same as the scores's"
                             << " 1st dim, but got " << input0_shape[0] << " and " << input1_shape[0] << ".";
  }
  if (input0_shape[1] != input1_shape[1]) {
    MS_EXCEPTION(ValueError) << "For " << prim_name << ", the boxes's 2nd dim must be the same as the scores's"
                             << " 2nd dim, but got " << input0_shape[1] << " and " << input1_shape[1] << ".";
  }
  if (input0_shape[kInputIndex2] != input1_shape[kInputIndex2] && input0_shape[kInputIndex2] != 1) {
    MS_EXCEPTION(ValueError) << "For " << prim_name
                             << ", the boxes's 3rd dim must be the same as the scores's 3rd dim or 1, but got "
                             << input0_shape[kInputIndex2] << ".";
  }
  if (input0_shape[kInputIndex3] != kDimsize) {
    MS_EXCEPTION(ValueError) << "For " << prim_name << ", the boxes's 4th dim must be equal to 4, but got "
                             << input0_shape[kInputIndex3] << ".";
  }
  for (int i = 0; i < kInputs; i++) {
    if (!input_args[i]->isa<abstract::AbstractTensor>()) {
      MS_EXCEPTION(TypeError) << "For " << prim_name << ", input" << i << " only supports tensor!";
    }
  }
  auto pad_per_class_ptr = primitive->GetAttr("pad_per_class");
  MS_EXCEPTION_IF_NULL(pad_per_class_ptr);
  bool pad_per_class = GetValue<bool>(pad_per_class_ptr);
  auto input2_tensor = Get_Value(input_args, kInputIndex2);
  auto input3_tensor = Get_Value(input_args, kInputIndex3);
  auto input4_tensor = Get_Value(input_args, kInputIndex4);
  auto input5_tensor = Get_Value(input_args, kInputIndex5);
  if (IsValue(input_args[kInputIndex2]->BuildValue()) && IsValue(input_args[kInputIndex3]->BuildValue())) {
    if (IsValue(input_args[kInputIndex4]->BuildValue()) && IsValue(input_args[kInputIndex5]->BuildValue())) {
      auto iou_threshold = *(reinterpret_cast<float *>(input4_tensor->data_c()));
      auto score_threshold = *(reinterpret_cast<float *>(input5_tensor->data_c()));
      if (iou_threshold < 0 || iou_threshold > 1) {
        MS_EXCEPTION(ValueError) << "For " << prim_name << ", iou_threshold must be in [0,1], but got " << iou_threshold
                                 << ".";
      }
      if (score_threshold < 0 && input0_shape[kInputIndex2] == input1_shape[kInputIndex2]) {
        MS_EXCEPTION(ValueError) << "For " << prim_name << ", it is temporarily unsupported when the boxes's 3rd dim "
                                 << "is not 1 and score_threshold is less than 0.";
      }
    }
    auto max_output_size_per_class = *(reinterpret_cast<int32_t *>(input2_tensor->data_c()));
    auto max_total_size = *(reinterpret_cast<int32_t *>(input3_tensor->data_c()));
    if (max_total_size <= 0) {
      MS_EXCEPTION(ValueError) << "For " << prim_name << ", max_total_size must be > 0, but got " << max_total_size
                               << ".";
    }
    if (max_output_size_per_class <= 0) {
      MS_EXCEPTION(ValueError) << "For " << prim_name << ", max_output_size_per_class must be > 0, but got "
                               << max_output_size_per_class << ".";
    }
    auto num_detection = max_total_size;
    if (pad_per_class) {
      num_detection = std::min(max_total_size, max_output_size_per_class * static_cast<int32_t>(input1_shape[ksecond]));
    }
    int64_t bs = input0_shape[0];
    ShapeVector shape1 = {bs, num_detection, 4};
    ShapeVector shape2 = {bs, num_detection};
    ShapeVector shape3 = {bs, num_detection};
    ShapeVector shape4 = {bs};
    auto out1 = std::make_shared<abstract::Shape>(shape1);
    auto out2 = std::make_shared<abstract::Shape>(shape2);
    auto out3 = std::make_shared<abstract::Shape>(shape3);
    auto out4 = std::make_shared<abstract::Shape>(shape4);
    return std::make_shared<abstract::TupleShape>(std::vector<abstract::BaseShapePtr>{out1, out2, out3, out4});
  } else {
    ShapeVector nmsed_boxes_shape = {-2, -2, -2};
    ShapeVector max_nmsed_boxes_shape = {1, 1, 1};
    ShapeVector min_nmsed_boxes_shape = {1, 1, 1};
    auto shape1 = std::make_shared<abstract::Shape>(nmsed_boxes_shape, min_nmsed_boxes_shape, max_nmsed_boxes_shape);
    ShapeVector nmsed_scores_shape = {-2, -2};
    ShapeVector max_nmsed_scores_shape = {1, 1};
    ShapeVector min_nmsed_scores_shape = {1, 1};
    auto shape2 = std::make_shared<abstract::Shape>(nmsed_scores_shape, min_nmsed_scores_shape, max_nmsed_scores_shape);
    ShapeVector nmsed_class_shape = {-2, -2};
    ShapeVector max_nmsed_class_shape = {1, 1};
    ShapeVector min_nmsed_class_shape = {1, 1};
    auto shape3 = std::make_shared<abstract::Shape>(nmsed_class_shape, min_nmsed_class_shape, max_nmsed_class_shape);
    ShapeVector valid_detection = {-2};
    auto shape4 = std::make_shared<abstract::Shape>(valid_detection);
    return std::make_shared<abstract::TupleShape>(std::vector<abstract::BaseShapePtr>{shape1, shape2, shape3, shape4});
  }
}

TuplePtr CombinedNonMaxSuppressionInferType(const PrimitivePtr &primitive,
                                            const std::vector<AbstractBasePtr> &input_args) {
  auto prim_name = primitive->name();
  auto input0_type = input_args[kInputIndex0]->BuildType();
  auto input1_type = input_args[kInputIndex1]->BuildType();
  auto input2_type = input_args[kInputIndex2]->BuildType();
  auto input3_type = input_args[kInputIndex3]->BuildType();
  auto input4_type = input_args[kInputIndex4]->BuildType();
  auto input5_type = input_args[kInputIndex5]->BuildType();
  const std::set valid_type_float32 = {kFloat32};
  const std::set valid_type_int = {kInt32};
  (void)CheckAndConvertUtils::CheckTensorTypeValid("boxes", input0_type, valid_type_float32, prim_name);
  (void)CheckAndConvertUtils::CheckTensorTypeValid("scores", input1_type, valid_type_float32, prim_name);
  (void)CheckAndConvertUtils::CheckTensorTypeValid("max_output_size_per_class", input2_type, valid_type_int, prim_name);
  (void)CheckAndConvertUtils::CheckTensorTypeValid("max_total_size", input3_type, valid_type_int, prim_name);
  (void)CheckAndConvertUtils::CheckTensorTypeValid("iou_threshold", input4_type, valid_type_float32, prim_name);
  (void)CheckAndConvertUtils::CheckTensorTypeValid("score_threshold", input5_type, valid_type_float32, prim_name);
  return std::make_shared<Tuple>(
    std::vector<TypePtr>{std::make_shared<TensorType>(kFloat32), std::make_shared<TensorType>(kFloat32),
                         std::make_shared<TensorType>(kFloat32), std::make_shared<TensorType>(kInt32)});
}
}  // namespace
MIND_API_OPERATOR_IMPL(CombinedNonMaxSuppression, BaseOperator);
AbstractBasePtr CombinedNonMaxSuppressionInfer(const abstract::AnalysisEnginePtr &, const PrimitivePtr &primitive,
                                               const std::vector<AbstractBasePtr> &input_args) {
  auto prim_name = primitive->name();
  const int64_t kInputNum = 6;
  (void)CheckAndConvertUtils::CheckInputArgs(input_args, kGreaterEqual, kInputNum, prim_name);
  auto infer_shape = CombinedNonMaxSuppressionInferShape(primitive, input_args);
  auto infer_type = CombinedNonMaxSuppressionInferType(primitive, input_args);
  return abstract::MakeAbstract(infer_shape, infer_type);
}

REGISTER_PRIMITIVE_EVAL_IMPL(CombinedNonMaxSuppression, prim::kPrimCombinedNonMaxSuppression,
                             CombinedNonMaxSuppressionInfer, nullptr, true);
REGISTER_HOST_DEPENDS(kNameCombinedNonMaxSuppression, {2, 3, 4, 5});
}  // namespace ops
}  // namespace mindspore
@ -0,0 +1,48 @@
/**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CORE_OPS_COMBINED_NON_MAX_SUPPRESSION_H_
#define MINDSPORE_CORE_OPS_COMBINED_NON_MAX_SUPPRESSION_H_
#include <map>
#include <vector>
#include <string>
#include <memory>
#include "ops/base_operator.h"
#include "mindapi/base/types.h"

namespace mindspore {
namespace ops {
constexpr auto kNameCombinedNonMaxSuppression = "CombinedNonMaxSuppression";
/// \brief Greedily selects a subset of bounding boxes in descending order of score.
/// Refer to Python API @ref mindspore.ops.CombinedNonMaxSuppression for more details.
class MIND_API CombinedNonMaxSuppression : public BaseOperator {
 public:
  MIND_API_BASE_MEMBER(CombinedNonMaxSuppression);
  /// \brief Constructor.
  CombinedNonMaxSuppression() : BaseOperator(kNameCombinedNonMaxSuppression) {
    InitIOName({"boxes", "scores", "max_output_size_per_class", "max_total_size", "iou_threshold", "score_threshold"},
               {"nmsed_box", "nmsed_scores", "nmsed_classes", "valid_detections"});
  }
};
abstract::AbstractBasePtr CombinedNonMaxSuppressionInfer(const abstract::AnalysisEnginePtr &,
                                                         const PrimitivePtr &primitive,
                                                         const std::vector<abstract::AbstractBasePtr> &input_args);

using kPrimCombinedNonMaxSuppressionPtr = std::shared_ptr<CombinedNonMaxSuppression>;
}  // namespace ops
}  // namespace mindspore

#endif  // MINDSPORE_CORE_OPS_COMBINED_NON_MAX_SUPPRESSION_H_
@ -1244,6 +1244,7 @@ GVAR_DEF(PrimitivePtr, kPrimAdjustSaturation, std::make_shared<Primitive>(kAdjus
GVAR_DEF(PrimitivePtr, kPrimCompareAndBitpack, std::make_shared<Primitive>(kCompareAndBitpack));
GVAR_DEF(PrimitivePtr, kPrimScaleAndTranslate, std::make_shared<Primitive>("ScaleAndTranslate"));
GVAR_DEF(PrimitivePtr, kPrimScaleAndTranslateGrad, std::make_shared<Primitive>("ScaleAndTranslateGrad"));
GVAR_DEF(PrimitivePtr, kPrimCombinedNonMaxSuppression, std::make_shared<Primitive>("CombinedNonMaxSuppression"));

// Statements
GVAR_DEF(PrimitivePtr, kPrimReturn, std::make_shared<Primitive>(kReturn));
@ -168,3 +168,9 @@ from .reservoir_replay_buffer import _rrb_create_op_cpu
from .reservoir_replay_buffer import _rrb_push_op_cpu
from .reservoir_replay_buffer import _rrb_sample_op_cpu
from .reservoir_replay_buffer import _rrb_destroy_op_cpu
from .sparse_reshape import _sparse_reshape_aicpu
from .unsorted_segment_sum import _unsorted_segment_sum_aicpu
from .unsorted_segment_prod import _unsorted_segment_prod_aicpu
from .hard_sigmoid import _hard_sigmoid_aicpu
from .hard_sigmoid_grad import _hard_sigmoid_grad_aicpu
from .sparse_reorder import _sparse_reorder_aicpu
@ -0,0 +1,42 @@
# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

"""CombinedNonMaxSuppression op"""
from mindspore.ops.op_info_register import op_info_register, AiCPURegOp, DataType

combined_non_max_suppression_op_info = AiCPURegOp("CombinedNonMaxSuppression")\
    .fusion_type("OPAQUE")\
    .attr("pad_per_class", "bool")\
    .attr("clip_boxes", "bool")\
    .input(0, "boxes", "required")\
    .input(1, "scores", "required")\
    .input(2, "max_output_size_per_class", "required")\
    .input(3, "max_total_size", "required")\
    .input(4, "iou_threshold", "required")\
    .input(5, "score_threshold", "required")\
    .output(0, "nmsed_box", "required")\
    .output(1, "nmsed_scores", "required")\
    .output(2, "nmsed_classes", "required")\
    .output(3, "valid_detections", "required")\
    .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.I32_Default, DataType.I32_Default,
                  DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default,
                  DataType.F32_Default, DataType.I32_Default)\
    .get_op_info()


@op_info_register(combined_non_max_suppression_op_info)
def _combined_non_max_suppression_aicpu():
    """CombinedNonMaxSuppression AiCPU register"""
    return
@ -1045,3 +1045,88 @@ class ScaleAndTranslate(Primitive):
        validator.check_string(kernel_type, ["lanczos1", "lanczos3", "lanczos5", "gaussian", "box", "triangle",
                                             "keyscubic", "mitchellcubic"], "kernel_type", self.name)
        validator.check_value_type("antialias", antialias, [bool], self.name)


class CombinedNonMaxSuppression(Primitive):
    r"""
    Greedily selects a subset of bounding boxes in descending order of score.

    Args:
        clip_boxes (bool): If True, assume the box coordinates are between [0, 1] and clip the output boxes
            if they fall beyond [0, 1]. If False, do not do clipping and output the box coordinates as they are.
            Defaults to True.
        pad_per_class (bool): If False, the output nmsed boxes, scores and classes are padded/clipped to
            max_total_size. If True, the output nmsed boxes, scores and classes are padded to be of length
            max_output_size_per_class * num_classes, unless it exceeds max_total_size, in which case it is
            clipped to max_total_size. Defaults to False.

    Inputs:
        - **boxes** (Tensor) - A Tensor of type float32 and shape (batch_size, num_boxes, q, 4).
          If q is 1, the same boxes are used for all classes; otherwise,
          if q is equal to the number of classes, class-specific boxes are used.
        - **scores** (Tensor) - A Tensor of type float32 and shape (batch_size, num_boxes, num_classes)
          representing a single score corresponding to each box (each row of boxes).
        - **max_output_size_per_class** - A 0D Tensor of type int32, representing the max number of boxes to be
          selected by non max suppression per class.
        - **max_total_size** - A 0D Tensor of type int32, representing the maximum number of boxes retained over
          all classes.
        - **iou_threshold** - A 0D float32 tensor representing the threshold for deciding whether boxes overlap
          too much with respect to IOU; iou_threshold must be in the range [0, 1].
        - **score_threshold** - A 0D float32 tensor representing the threshold for deciding when to remove
          boxes based on score.

    Outputs:
        - **nmsed_boxes** - A Tensor of float32 with shape (batch_size, num_detection, 4), which contains
          the non-max suppressed boxes (num_detection is described in the note below).
        - **nmsed_scores** - A Tensor of float32 with shape (batch_size, num_detection), which contains the scores
          of the boxes.
        - **nmsed_classes** - A Tensor of float32 with shape (batch_size, num_detection), which contains the classes
          of the boxes.
        - **valid_detections** - A Tensor of int32 with shape (batch_size,), which indicates the number of valid
          detections in each batch.
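
    Note:
        num_detection in the output shapes is max_total_size when pad_per_class is False, and
        min(max_output_size_per_class * num_classes, max_total_size) when pad_per_class is True.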

    Raises:
        TypeError: If the dtype of `boxes`, `scores`, `iou_threshold`, `score_threshold` is not float32.
        TypeError: If the dtype of `max_output_size_per_class` and `max_total_size` is not int32.
        ValueError: If `boxes` is not 4D.
        ValueError: If `max_output_size_per_class`, `max_total_size`, `iou_threshold` and `score_threshold` are not 0D.
        ValueError: If shape[0] of `boxes` is not the same as shape[0] of `scores`.
        ValueError: If `scores` is not 3D.
        ValueError: If shape[1] of `boxes` is not the same as shape[1] of `scores`.
        ValueError: If shape[2] of `boxes` is not the same as shape[2] of `scores` or 1.
        ValueError: If `max_total_size` is less than or equal to 0.
        ValueError: If `max_output_size_per_class` is less than or equal to 0.
        ValueError: If `iou_threshold` is not in [0, 1].

    Supported Platforms:
        ``Ascend`` ``CPU``

    Examples:
        >>> boxes = Tensor(np.array([[[[200, 100, 150, 100]],
        ...                           [[220, 120, 150, 100]],
        ...                           [[190, 110, 150, 100]],
        ...                           [[210, 112, 150, 100]]]])).astype('float32')
        >>> scores = Tensor(np.array([[[0.2000, 0.7000, 0.1000], [0.1000, 0.8000, 0.1000], [0.3000, 0.6000, 0.1000],
        ...                            [0.0500, 0.9000, 0.0500]]])).astype('float32')
        >>> max_output_size_per_class = Tensor(4, mstype.int32)
        >>> max_total_size = Tensor(1, mstype.int32)
        >>> iou_threshold = Tensor(0, mstype.float32)
        >>> score_threshold = Tensor(0, mstype.float32)
        >>> net = P.CombinedNonMaxSuppression()
        >>> out = net(boxes, scores, max_output_size_per_class, max_total_size, iou_threshold, score_threshold)
        >>> print(out)
        (Tensor(shape=[1, 1, 4], dtype=Float32, value= [[[1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        1.00000000e+00]]]),
        Tensor(shape=[1, 1], dtype=Float32, value= [[ 8.99999976e-01]]),
        Tensor(shape=[1, 1], dtype=Float32, value= [[ 1.00000000e+00]]),
        Tensor(shape=[1], dtype=Int32, value= [1]))
    """

    @prim_attr_register
    def __init__(self, pad_per_class=False, clip_boxes=True):
        """Initialize CombinedNonMaxSuppression"""
        self.pad_per_class = validator.check_value_type("pad_per_class", pad_per_class, [bool], self.name)
        self.add_prim_attr('pad_per_class', self.pad_per_class)
        self.clip_boxes = validator.check_value_type("clip_boxes", clip_boxes, [bool], self.name)
        self.add_prim_attr('clip_boxes', self.clip_boxes)
@ -29,7 +29,7 @@ from mindspore.ops import operations as P
from mindspore.ops.function.math_func import matrix_exp
from mindspore.ops.function.math_func import sinc
from mindspore.ops.operations.image_ops import CropAndResizeGradBoxes, AdjustHue, AdjustContrastv2, \
-    AdjustSaturation
+    AdjustSaturation, CombinedNonMaxSuppression
from mindspore.ops.operations.image_ops import ExtractGlimpse
from mindspore.ops.operations import _grad_ops as G
from mindspore.ops.operations import _inner_ops as inner
@ -4030,6 +4030,17 @@ test_case_image_ops = [
                          Tensor([3, 4], mstype.int32), Tensor([5, 6], mstype.float32),
                          Tensor([0.1, 0.8], mstype.float32)],
        'desc_bprop': [Tensor(np.random.rand(2, 2, 2, 4), mstype.float32)]}),
    ('CombinedNonMaxSuppression', {
        'block': CombinedNonMaxSuppression(),
        'desc_inputs': [Tensor(np.array([[[[200, 100, 150, 100]], [[220, 120, 150, 100]], [[190, 110, 150, 100]],
                                          [[210, 112, 150, 100]]]]).astype(np.float32)),
                        Tensor(np.array([[[0.2000, 0.7000, 0.1000], [0.1000, 0.8000, 0.1000],
                                          [0.3000, 0.6000, 0.1000], [0.0500, 0.9000, 0.0500]]]).astype(np.float32)),
                        Tensor(4, mstype.int32),
                        Tensor(1, mstype.int32),
                        Tensor(0, mstype.float32),
                        Tensor(0, mstype.float32)],
        'skip': ['backward']}),
]

test_case_other_ops = [