!48406 fix api migration issues and some doc issues

Merge pull request !48406 from 李林杰/0204_fix_aicpu_migration_issues_master
2023-02-06 01:41:21 +00:00 · 2023-02-06 01:41:21 +00:00 · 9171b5d329
parent 7cddb2c437 2b78a9ecbd
commit 9171b5d329
23 changed files with 815 additions and 350 deletions
--- a/.jenkins/check/config/whitelizard.txt
+++ b/.jenkins/check/config/whitelizard.txt
@ -348,3 +348,5 @@ mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel
 mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparse_dense_cwise_utils.cc:aicpu::SparseDenseCwiseOpKernel<Op>::SparseDenseCwiseOpSpecialComputeComplex
 mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparse_dense_cwise_utils.cc:aicpu::SparseDenseCwiseOpKernel<Op>::SparseDenseCwiseOpBcastComputeComplex
 mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/resize_bicubic_grad.cc:aicpu::ResizeBicubicGrad
+mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_max.cc:aicpu::SegmentMaxCpuKernel::SegmentMaxCompute
+mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/extract_glimpse.cc:aicpu::ExtractGlimpseCpuKernel::Compute
--- a/mindspore/ccsrc/include/common/utils/utils.h
+++ b/mindspore/ccsrc/include/common/utils/utils.h
@ -312,6 +312,7 @@ constexpr auto kExpandDOpName = "ExpandD";
 constexpr auto kExpandDimsOpName = "ExpandDims";
 constexpr auto kExpOpName = "Exp";
 constexpr auto kExtractGlimpse = "ExtractGlimpse";
+constexpr auto kExtractGlimpseOpName = "ExtractGlimpse";
 constexpr auto kExtractImagePatchesOpName = "ExtractImagePatches";
 constexpr auto kEyeOpName = "Eye";
 constexpr auto kFastGeLUOpName = "FastGeLU";
@ -468,6 +469,7 @@ constexpr auto kLSTMGradOpName = "LSTMGrad";
 constexpr auto kLSTMInputGradOpName = "LSTMInputGrad";
 constexpr auto kLSTMOpName = "LSTM";
 constexpr auto kLstsqOpName = "Lstsq";
+constexpr auto kLuSolveOpName = "LuSolve";
 constexpr auto kLuUnpackOpName = "LuUnpack";
 constexpr auto kLuUnpackGradOpName = "LuUnpackGrad";
 constexpr auto kMaskedFillOpName = "MaskedFill";
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/common/runtime_tensor_desc.h
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/common/runtime_tensor_desc.h
@ -19,6 +19,7 @@

 namespace ge {
 constexpr int64_t kMaxDimSize = 32;
+constexpr int64_t DIM_SIZE2 = 2;

 #pragma pack(push, 1)
 struct RuntimeTensorDesc {
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/cache_swap_table.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/cache_swap_table.cc
@ -1,154 +0,0 @@
-/**
- * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "cache_swap_table.h"
-#include <securec.h>
-#include <map>
-#include "cpu_types.h"
-#include "kernel_log.h"
-#include "status.h"
-#include "utils/sparse_tensor.h"
-#include "utils/kernel_util.h"
-
-namespace {
-const char *const kCacheSwapTable = "CacheSwapTable";
-}
-
-namespace aicpu {
-template <typename T>
-uint32_t CacheSwapTableTask(std::vector<Tensor *> &inputs, std::vector<Tensor *> &outputs, int64_t batch_size,
-                            int64_t output_size, int64_t one_line_col, int type_size) {
-  if (inputs.size() == 0 || outputs.size() == 0) {
-    KERNEL_LOG_ERROR("CacheSwapTable input or output is empty.");
-    return KERNEL_STATUS_PARAM_INVALID;
-  }
-
-  char *cache_table = reinterpret_cast<char *>(inputs[0]->GetData());
-  T *swap_cache_idx = reinterpret_cast<T *>(inputs[1]->GetData());
-  uint64_t swap_cache_idx_size = inputs[1]->GetDataSize();
-  char *miss_value = reinterpret_cast<char *>(inputs[2]->GetData());
-
-  char *old_value = reinterpret_cast<char *>(outputs[0]->GetData());
-
-  errno_t ret = memset_s(old_value, static_cast<size_t>(output_size * type_size), 0x00,
-                         static_cast<size_t>(output_size * type_size));
-  if (ret != EOK) {
-    KERNEL_LOG_ERROR("Memset failed, result[%d]", ret);
-    return KERNEL_STATUS_INNER_ERROR;
-  }
-
-  uint64_t single_copy_size = static_cast<uint64_t>(type_size * one_line_col);
-
-  if (swap_cache_idx_size < static_cast<uint64_t>(batch_size)) {
-    KERNEL_LOG_ERROR(
-      "The value of swap_cache_idx_size:[%llu] must be less than "
-      "batch_size:[%lld]",
-      swap_cache_idx_size, batch_size);
-    return KERNEL_STATUS_INNER_ERROR;
-  }
-
-  uint64_t old_value_size = outputs[0]->GetDataSize();
-  uint64_t cache_table_size = inputs[0]->GetDataSize();
-  for (int64_t i = 0; i < batch_size; ++i) {
-    if (swap_cache_idx[i] < 0) {
-      continue;
-    }
-    ret = memcpy_s(old_value + i * single_copy_size, old_value_size, cache_table + swap_cache_idx[i] * single_copy_size,
-                   single_copy_size);
-    old_value_size -= single_copy_size;
-    if (ret != EOK) {
-      KERNEL_LOG_ERROR("CacheSwapTable memcpy failed, result [%d].", ret);
-      return KERNEL_STATUS_INNER_ERROR;
-    }
-    ret = memcpy_s(cache_table + swap_cache_idx[i] * single_copy_size, cache_table_size,
-                   miss_value + i * single_copy_size, single_copy_size);
-    cache_table_size -= single_copy_size;
-    if (ret != EOK) {
-      KERNEL_LOG_ERROR("CacheSwapTable memcpy failed, result [%d].", ret);
-      return KERNEL_STATUS_INNER_ERROR;
-    }
-  }
-  return KERNEL_STATUS_OK;
-}
-
-uint32_t CacheSwapTableMsCpuKernel::DoCompute() {
-  std::map<int, std::function<uint32_t(std::vector<Tensor *> &, std::vector<Tensor *> &, int64_t &, int64_t &,
-                                       int64_t &, int &)>>
-    calls;
-  calls[DT_INT32] = CacheSwapTableTask<int32_t>;
-  calls[DT_INT64] = CacheSwapTableTask<int64_t>;
-
-  if (calls.find(indices_type_) == calls.end()) {
-    KERNEL_LOG_ERROR(
-      "CacheSwapTableMsCpuKernel op doesn't support indices tensor types: "
-      "[%s]",
-      DTypeStr(indices_type_).c_str());
-    return KERNEL_STATUS_PARAM_INVALID;
-  }
-
-  int type_size = GetSizeByDataType(param_type_);
-  return calls[indices_type_](inputs_, outputs_, batch_size_, output_size_, one_line_col_, type_size);
-}
-
-uint32_t CacheSwapTableMsCpuKernel::GetInputAndCheck(CpuKernelContext &ctx) {
-  KERNEL_LOG_INFO("GetInputAndCheck start!");
-  // get input Tensors
-  const uint32_t kNumInput = 3;
-  for (uint32_t i = 0; i < kNumInput; ++i) {
-    Tensor *tensor = ctx.Input(i);
-    KERNEL_CHECK_NULLPTR(tensor, KERNEL_STATUS_PARAM_INVALID, "Get input tensor[%d] failed", i)
-    inputs_.push_back(tensor);
-  }
-  // get output Tensors
-  const uint32_t kNumOutput = 1;
-  for (uint32_t i = 0; i < kNumOutput; ++i) {
-    Tensor *tensor = ctx.Output(i);
-    KERNEL_CHECK_NULLPTR(tensor, KERNEL_STATUS_PARAM_INVALID, "Get output tensor[%d] failed", i)
-    outputs_.push_back(tensor);
-  }
-  // get param type
-  param_type_ = static_cast<DataType>(inputs_[0]->GetDataType());
-  indices_type_ = static_cast<DataType>(inputs_[1]->GetDataType());
-  KERNEL_LOG_INFO("GetInputAndCheck success!");
-
-  std::shared_ptr<TensorShape> cache_table_shape = ctx.Input(0)->GetTensorShape();
-  std::shared_ptr<TensorShape> indices_shape = ctx.Input(1)->GetTensorShape();
-
-  for (int32_t i = 1; i < cache_table_shape->GetDims(); ++i) {
-    KERNEL_CHECK_ASSIGN_64S_MULTI(one_line_col_, cache_table_shape->GetDimSize(i), one_line_col_,
-                                  KERNEL_STATUS_PARAM_INVALID);
-  }
-  for (int32_t i = 0; i < indices_shape->GetDims(); ++i) {
-    KERNEL_CHECK_ASSIGN_64S_MULTI(batch_size_, indices_shape->GetDimSize(i), batch_size_, KERNEL_STATUS_PARAM_INVALID);
-  }
-  output_size_ = batch_size_ * one_line_col_;
-  return KERNEL_STATUS_OK;
-}
-
-uint32_t CacheSwapTableMsCpuKernel::Compute(CpuKernelContext &ctx) {
-  uint32_t res = GetInputAndCheck(ctx);
-  if (res != KERNEL_STATUS_OK) {
-    return res;
-  }
-
-  res = DoCompute();
-  if (res != KERNEL_STATUS_OK) {
-    KERNEL_LOG_ERROR("Compute failed");
-    return res;
-  }
-  return KERNEL_STATUS_OK;
-}
-REGISTER_CPU_KERNEL(kCacheSwapTable, CacheSwapTableMsCpuKernel);
-}  // namespace aicpu
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/extract_glimpse.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/extract_glimpse.cc
@ -0,0 +1,213 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "extract_glimpse.h"
+#include <iostream>
+#include <random>
+#include "cpu_kernel_utils.h"
+#include "utils/eigen_tensor.h"
+#include "utils/kernel_util.h"
+using namespace std;
+random_device rd;
+mt19937 gen(rd());
+uniform_real_distribution<float> dis_uniform(0.0f, 255.0f);
+normal_distribution<float> dis_normal(10, 0.5);
+#define SHED 2048
+namespace {
+const uint32_t kOutputNum = 1;
+const uint32_t kInputNum = 3;
+const char *kExtractGlimpse = "ExtractGlimpse";
+}  // namespace
+namespace aicpu {
+uint32_t ExtractGlimpseCpuKernel::Compute(CpuKernelContext &ctx) {
+  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "ExtractGlimpse check input and output number failed.");
+  KERNEL_HANDLE_ERROR(ExtractGlimpseCheck(ctx), "ExtractGlimpse check params failed.");
+  Tensor *x = ctx.Input(0);
+  Tensor *ss = ctx.Input(1);
+  Tensor *offsets = ctx.Input(2);
+  Tensor *y = ctx.Output(0);
+  AttrValue *centered = ctx.GetAttr("centered");
+  AttrValue *normalized = ctx.GetAttr("normalized");
+  AttrValue *uniform_noise = ctx.GetAttr("uniform_noise");
+  AttrValue *noise = ctx.GetAttr("noise");
+  float *x_data = (float *)x->GetData();
+  int32_t *ss_data = (int32_t *)ss->GetData();
+  float *offsets_data = (float *)offsets->GetData();
+  float *y_data = (float *)y->GetData();
+  uint64_t offsets_cnt = offsets->GetTensorShape()->GetDimSize(0);
+  uint64_t batch_cnt = x->GetTensorShape()->GetDimSize(0);
+  KERNEL_CHECK_FALSE(offsets_cnt == batch_cnt, KERNEL_STATUS_PARAM_INVALID, "offsets should equal to batches")
+  int64_t image_height = x->GetTensorShape()->GetDimSize(1);
+  int64_t image_width = x->GetTensorShape()->GetDimSize(2);
+  int64_t channels = x->GetTensorShape()->GetDimSize(3);
+  uint64_t g_height = ss_data[0], g_width = ss_data[1];
+  uint64_t size1 = image_width * image_height * channels;
+  uint64_t size2 = image_width * channels;
+  uint64_t size3 = g_height * g_width * channels;
+  uint64_t size4 = size3 / g_height;
+  int64_t g_size = g_width * g_height;
+  if (batch_cnt > SHED) {
+    uint32_t min_core = 1;
+    uint64_t max_core = std::max(min_core, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
+    max_core = min(max_core, (uint64_t)batch_cnt);
+    auto fun = [&](size_t st, size_t ed) {
+      for (auto i = st; i < ed; i++) {
+        float x = offsets_data[i << 1], y = offsets_data[1 + (i << 1)];
+        if (normalized->GetBool()) {
+          x *= image_height;
+          y *= image_width;
+        }
+        if (centered->GetBool()) {
+          x /= 2.0f;
+          y /= 2.0f;
+          x += image_height / 2.0f;
+          y += image_width / 2.0f;
+        }
+        x -= g_height / 2.0f;
+        y -= g_width / 2.0f;
+        for (int64_t v = 0; v < g_size; v++) {
+          int64_t j = v / g_width, k = v % g_width;
+          int64_t a = (int64_t)x + j, b = (int64_t)y + k;
+          uint64_t pos_y = i * size3 + j * size4 + k * channels;
+          if (a < 0 || a >= image_height || b < 0 || b >= image_width) {
+            for (int u = 0; u < channels; u++) {
+              if (uniform_noise->GetBool())
+                y_data[pos_y + u] = dis_uniform(gen);
+              else if (noise->GetString() == "zero")
+                y_data[pos_y + u] = 0.0f;
+              else if (noise->GetString() == "gaussian")
+                y_data[pos_y + u] = max(0.0f, dis_normal(gen));
+              else {
+                KERNEL_LOG_ERROR("noise type [%s] unsupported.", noise->GetString().c_str());
+                return KERNEL_STATUS_PARAM_INVALID;
+              }
+            }
+            continue;
+          }
+          uint64_t pos_x = i * size1 + a * size2 + b * channels;
+          for (int u = 0; u < channels; u++) {
+            y_data[pos_y + u] = x_data[pos_x + u];
+          }
+        }
+      }
+      return KERNEL_STATUS_OK;
+    };
+    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, batch_cnt, batch_cnt / max_core, fun),
+                        "ExtractGlimpse Compute failed.");
+  } else {
+    for (uint64_t i = 0; i < batch_cnt; i++) {
+      float x = offsets_data[i << 1], y = offsets_data[1 + (i << 1)];
+      if (normalized->GetBool()) {
+        x *= image_height;
+        y *= image_width;
+      }
+      if (centered->GetBool()) {
+        x /= 2.0f;
+        y /= 2.0f;
+        x += image_height / 2.0f;
+        y += image_width / 2.0f;
+      }
+      x -= g_height / 2.0f;
+      y -= g_width / 2.0f;
+      if (g_size < SHED) {
+        for (int64_t v = 0; v < g_size; v++) {
+          int64_t j = v / g_width, k = v % g_width;
+          int64_t a = (int64_t)x + j, b = (int64_t)y + k;
+          uint64_t pos_y = i * size3 + j * size4 + k * channels;
+          if (a < 0 || a >= image_height || b < 0 || b >= image_width) {
+            for (int u = 0; u < channels; u++) {
+              if (uniform_noise->GetBool())
+                y_data[pos_y + u] = dis_uniform(gen);
+              else if (noise->GetString() == "zero")
+                y_data[pos_y + u] = 0.0f;
+              else if (noise->GetString() == "gaussian")
+                y_data[pos_y + u] = max(0.0f, dis_normal(gen));
+              else {
+                KERNEL_LOG_ERROR("noise type [%s] unsupported.", noise->GetString().c_str());
+                return KERNEL_STATUS_PARAM_INVALID;
+              }
+            }
+            continue;
+          }
+          uint64_t pos_x = i * size1 + a * size2 + b * channels;
+          for (int u = 0; u < channels; u++) {
+            y_data[pos_y + u] = x_data[pos_x + u];
+          }
+        }
+      } else {
+        uint32_t min_core = 1;
+        uint64_t max_core = std::max(min_core, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
+        max_core = min(max_core, (uint64_t)g_size);
+        auto fun = [&](size_t st, size_t ed) {
+          for (auto v = st; v < ed; v++) {
+            int64_t j = v / g_width, k = v % g_width;
+            int64_t a = (int64_t)x + j, b = (int64_t)y + k;
+            uint64_t pos_y = i * size3 + j * size4 + k * channels;
+            if (a < 0 || a >= image_height || b < 0 || b >= image_width) {
+              for (int u = 0; u < channels; u++)
+                if (uniform_noise->GetBool())
+                  y_data[pos_y + u] = dis_uniform(gen);
+                else if (noise->GetString() == "zero")
+                  y_data[pos_y + u] = 0.0f;
+                else if (noise->GetString() == "gaussian")
+                  y_data[pos_y + u] = max(0.0f, dis_normal(gen));
+                else {
+                  KERNEL_LOG_ERROR("noise type [%s] unsupported.", noise->GetString().c_str());
+                  return KERNEL_STATUS_PARAM_INVALID;
+                }
+              continue;
+            }
+            uint64_t pos_x = i * size1 + a * size2 + b * channels;
+            for (int u = 0; u < channels; u++) {
+              y_data[pos_y + u] = x_data[pos_x + u];
+            }
+          }
+          return KERNEL_STATUS_OK;
+        };
+        KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, g_size, g_size / max_core, fun),
+                            "ExtractGlimpse Compute failed.");
+      }
+    }
+  }
+  return KERNEL_STATUS_OK;
+}
+uint32_t ExtractGlimpseCpuKernel::ExtractGlimpseCheck(CpuKernelContext &ctx) {
+  Tensor *x = ctx.Input(0);
+  Tensor *ss = ctx.Input(1);
+  Tensor *offsets = ctx.Input(2);
+  Tensor *y = ctx.Output(0);
+  AttrValue *centered = ctx.GetAttr("centered");
+  AttrValue *normalized = ctx.GetAttr("normalized");
+  AttrValue *uniform_noise = ctx.GetAttr("uniform_noise");
+  AttrValue *noise = ctx.GetAttr("noise");
+  KERNEL_CHECK_NULLPTR(x, KERNEL_STATUS_PARAM_INVALID, "Get input 0 failed.")
+  KERNEL_CHECK_NULLPTR(ss, KERNEL_STATUS_PARAM_INVALID, "Get input 1 failed.")
+  KERNEL_CHECK_NULLPTR(offsets, KERNEL_STATUS_PARAM_INVALID, "Get input 2 failed.")
+  KERNEL_CHECK_NULLPTR(y, KERNEL_STATUS_PARAM_INVALID, "Get output 0 failed.")
+  KERNEL_CHECK_NULLPTR(centered, KERNEL_STATUS_PARAM_INVALID, "Get attribute centered failed.")
+  KERNEL_CHECK_NULLPTR(normalized, KERNEL_STATUS_PARAM_INVALID, "Get attribute normalized failed.")
+  KERNEL_CHECK_NULLPTR(uniform_noise, KERNEL_STATUS_PARAM_INVALID, "Get attribute uniform_noise failed.")
+  KERNEL_CHECK_NULLPTR(noise, KERNEL_STATUS_PARAM_INVALID, "Get attribute noise failed.")
+  KERNEL_CHECK_NULLPTR(x->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 0 data failed.")
+  KERNEL_CHECK_NULLPTR(ss->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 1 data failed.")
+  KERNEL_CHECK_NULLPTR(offsets->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 2 data failed.")
+  KERNEL_CHECK_NULLPTR(y->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output 0 data failed.")
+  KERNEL_CHECK_FALSE(x->GetDataType() == DT_FLOAT && ss->GetDataType() == DT_INT32 &&
+                       offsets->GetDataType() == DT_FLOAT && y->GetDataType() == DT_FLOAT,
+                     KERNEL_STATUS_PARAM_INVALID, "data type error.")
+  return KERNEL_STATUS_OK;
+}
+REGISTER_CPU_KERNEL(kExtractGlimpse, ExtractGlimpseCpuKernel);
+}  // namespace aicpu
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/extract_glimpse.h
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/extract_glimpse.h
@ -0,0 +1,35 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef AICPU_IMPL_EXTRACT_GLIMPSE_H_
+#define AICPU_IMPL_EXTRACT_GLIMPSE_H_
+
+#include "cpu_ops_kernel.h"
+
+namespace aicpu {
+class ExtractGlimpseCpuKernel : public CpuKernel {
+ public:
+  ExtractGlimpseCpuKernel() = default;
+  ~ExtractGlimpseCpuKernel() override = default;
+
+ protected:
+  uint32_t Compute(CpuKernelContext &ctx) override;
+
+ private:
+  static uint32_t ExtractGlimpseCheck(CpuKernelContext &ctx);
+};
+}  // namespace aicpu
+#endif
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/fft_with_size.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/fft_with_size.cc
@ -124,27 +124,6 @@ double FFTWithSizeCpuKernel::Getnormalized(int64_t n, std::string normalized, bo
    if (normalized == "backward") result = 1.0 / n;
    if (normalized == "ortho") result = 1.0 / sqrt((double)n);
  }
-  // if (signal_ndim == 1) {
-  //   result = sqrt((double)out_shape[out_shape.size() - 1]);
-  // } else if (signal_ndim == 2) {
-  //   result = sqrt((double)(out_shape[out_shape.size() - 1] *
-  //                          out_shape[out_shape.size() - 2]));
-  // } else {
-  //   result = sqrt((double)(out_shape[out_shape.size() - 1] *
-  //                          out_shape[out_shape.size() - 2] *
-  //                          out_shape[out_shape.size() - 3]));
-  // }
-  // if (is_reverse) {
-  //   if (result == 0) {
-  //       KERNEL_LOG_ERROR("DivideByZeroExcepiton");
-  //   }
-  //   result = 1.0 / result;
-  // }
-  // KERNEL_LOG_DEBUG(
-  //     "FFTWithSizeCpuKernel[GetNormalized], "
-  //     "input_shape[%s]  normalize[%s]. "
-  //     "is_reverse: [%d]. norm_:[%lf]",
-  //     VectorToString(out_shape).c_str(), normalized, is_reverse, result);
  std::cout << "result = " << result << std::endl;
  return result;
 }
@ -350,14 +329,7 @@ uint32_t FFTWithSizeCpuKernel::FFTWithSizeCompute(CpuKernelContext &ctx, bool on
  if (is_real) {
    inverse = real_inverse;
  }
-  std::cout << out;
-  std::cout << "===========";
-  // if
-  //  std::vector<int64_t> out_shape(out.dimensions().begin(),
-  //                                out.dimensions().end());
-  //  if (is_real && !inverse) {
-  //    out_shape.back() = x_shape.back();
-  //  }
+
  std::cout << out;
  auto cout = x_shape_ptr->NumElements();
  auto norm = Getnormalized(cout, normalized, inverse);
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/fft_with_size.h
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/fft_with_size.h
@ -1,18 +1,3 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
 #ifndef AICPU_KERNELS_NORMALIZED_FFTWITHSIZE_H_
 #define AICPU_KERNELS_NORMALIZED_FFTWITHSIZE_H_

--- a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/fill.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/fill.cc
@ -15,129 +15,160 @@
 */

 #include "fill.h"
+#include "cpu_kernel_utils.h"
 #include "utils/eigen_tensor.h"
 #include "utils/kernel_util.h"

 namespace {
-const char *const kFill = "Fill";
-}
+const uint32_t kOutputNum = 1;
+const uint32_t kInputNum = 2;
+const char *kFill = "Fill";
+const char *kFillV2 = "FillV2";
+const int64_t kParallelDataNumCriticalPoint1 = 128 * 1024;
+const int64_t kParallelDataNumCriticalPoint2 = 2 * 1024 * 1024;
+
+#define CALCULATE_DIMS_DTYPE_CASE(DTYPE, TYPE)                        \
+  case (DTYPE): {                                                     \
+    if (CalculateDims<TYPE>(dims_tensor, dims) != KERNEL_STATUS_OK) { \
+      KERNEL_LOG_ERROR("Fill kernel calculate dims failed.");         \
+      return KERNEL_STATUS_PARAM_INVALID;                             \
+    }                                                                 \
+    break;                                                            \
+  }
+
+#define FILL_GENERATE_DTYPE_CASE(DTYPE, TYPE)    \
+  case (DTYPE): {                                \
+    FillOutput<TYPE>(ctx, value_tensor, output); \
+    break;                                       \
+  }
+}  // namespace

 namespace aicpu {
-template <typename T>
-void FillGenerateCase(Tensor *&value_tensor, Tensor *&output) {
-  auto value = *(reinterpret_cast<T *>(value_tensor->GetData()));
-  if (AddrAlignedCheck(output->GetData())) {
-    Eigen::TensorMap<Eigen::Tensor<T, 1>, Eigen::Aligned> eigen_output(static_cast<T *>(output->GetData()),
-                                                                       output->GetTensorShape()->NumElements());
-    eigen_output.setConstant(value);
-  } else {
-    Eigen::TensorMap<Eigen::Tensor<T, 1>, Eigen::Unaligned> eigen_output(static_cast<T *>(output->GetData()),
-                                                                         output->GetTensorShape()->NumElements());
-    eigen_output.setConstant(value);
-  }
-}
+uint32_t FillCpuKernel::Compute(CpuKernelContext &ctx) {
+  // 校验输入个数和输出个数，以及输入和输入tensor的属性是否为空
+  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Check input and output number failed.");

-uint32_t FillCpuKernel::GetDimsByType(CpuKernelContext &ctx) {
-  dims.clear();
+  std::vector<int64_t> dims;
  Tensor *dims_tensor = ctx.Input(0);
-  KERNEL_CHECK_NULLPTR(dims_tensor, KERNEL_STATUS_PARAM_INVALID, "Get dims input failed")
-  uint32_t ret;
  auto dims_dtype = dims_tensor->GetDataType();
  switch (dims_dtype) {
-    case (DT_INT32):
-      ret = CalcDims<int32_t>(dims_tensor, dims);
-      break;
-    case (DT_INT64):
-      ret = CalcDims<int64_t>(dims_tensor, dims);
-      break;
+    CALCULATE_DIMS_DTYPE_CASE(DT_INT32, int32_t)
+    CALCULATE_DIMS_DTYPE_CASE(DT_INT64, int64_t)
    default:
-      KERNEL_LOG_ERROR(
-        "Fill kernel dims data_type [%u] not support, support data_types: "
-        "DT_INT32, DT_INT64",
-        dims_dtype);
+      KERNEL_LOG_ERROR("Fill kernel dims data_type [%u] not support, support data_types: DT_INT32, DT_INT64.",
+                       dims_dtype);
      return KERNEL_STATUS_PARAM_INVALID;
  }
-  if (ret != KERNEL_STATUS_OK) {
-    KERNEL_LOG_ERROR("Fill kernel calculate dims failed");
-  }
-  return ret;
-}

-uint32_t FillCpuKernel::Compute(CpuKernelContext &ctx) {
-  uint32_t check = GetDimsByType(ctx);
-  if (check != KERNEL_STATUS_OK) {
-    return check;
-  }
  Tensor *value_tensor = ctx.Input(1);
-  KERNEL_CHECK_NULLPTR(value_tensor, KERNEL_STATUS_PARAM_INVALID, "Get value input failed")
-  KERNEL_CHECK_NULLPTR(value_tensor->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get value input data failed")
-  KERNEL_CHECK_NULLPTR(value_tensor->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get value input shape failed")
-  if (!value_tensor->GetTensorShape()->GetDimSizes().empty()) {
+  if (value_tensor->NumElements() != 1) {
    KERNEL_LOG_ERROR("Fill kernel value input is not a scalar.");
    return KERNEL_STATUS_PARAM_INVALID;
  }
+
  Tensor *output = ctx.Output(0);
-  KERNEL_CHECK_NULLPTR(output, KERNEL_STATUS_PARAM_INVALID, "Get output failed")
-  KERNEL_CHECK_NULLPTR(output->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output data failed")
-  KERNEL_CHECK_NULLPTR(output->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get output shape failed")
-  if (output->GetTensorShape()->GetDimSizes() != dims) {
+  if (output->GetTensorShape()->GetDims() != static_cast<int64_t>(dims.size())) {
    KERNEL_LOG_ERROR("Fill kernel output shape not matched.");
    return KERNEL_STATUS_PARAM_INVALID;
  }
+  if (output->GetTensorShape()->GetDimSizes() != dims) {
+    output->GetTensorShape()->SetDimSizes(dims);
+  }
+
  auto input_dtype = value_tensor->GetDataType();
  auto output_dtype = output->GetDataType();
  if (input_dtype != output_dtype) {
-    KERNEL_LOG_ERROR("Fill kernel data type not matched, value input dtype [%u], output dtype [%u].", input_dtype,
-                     output_dtype);
+    KERNEL_LOG_ERROR(
+      "Fill kernel data type not matched, value input dtype [%u], output dtype [%u], support data_types: "
+      "DT_COMPLEX128, DT_COMPLEX64, DT_DOUBLE, DT_FLOAT, DT_FLOAT16, DT_INT16, DT_INT32, DT_INT64, DT_INT8, DT_UINT16, "
+      "DT_UINT32, DT_UINT64, DT_UINT8, DT_BOOL.",
+      input_dtype, output_dtype);
    return KERNEL_STATUS_PARAM_INVALID;
  }

-  std::map<int, std::function<void(Tensor *&, Tensor *&)>> calls;
-  calls[DT_INT8] = FillGenerateCase<int8_t>;
-  calls[DT_UINT8] = FillGenerateCase<uint8_t>;
-  calls[DT_INT16] = FillGenerateCase<int16_t>;
-  calls[DT_UINT16] = FillGenerateCase<uint16_t>;
-  calls[DT_INT32] = FillGenerateCase<int32_t>;
-  calls[DT_UINT32] = FillGenerateCase<uint32_t>;
-  calls[DT_INT64] = FillGenerateCase<int64_t>;
-  calls[DT_UINT64] = FillGenerateCase<uint64_t>;
-  calls[DT_BOOL] = FillGenerateCase<bool>;
-  calls[DT_FLOAT16] = FillGenerateCase<Eigen::half>;
-  calls[DT_FLOAT] = FillGenerateCase<float>;
-  calls[DT_DOUBLE] = FillGenerateCase<double>;
-
-  if (calls.find(output_dtype) == calls.end()) {
-    KERNEL_LOG_ERROR("Fill kernel data type [%u] not support", output_dtype);
-    return KERNEL_STATUS_PARAM_INVALID;
+  switch (output_dtype) {
+    FILL_GENERATE_DTYPE_CASE(DT_INT8, int8_t)
+    FILL_GENERATE_DTYPE_CASE(DT_UINT8, uint8_t)
+    FILL_GENERATE_DTYPE_CASE(DT_INT16, int16_t)
+    FILL_GENERATE_DTYPE_CASE(DT_UINT16, uint16_t)
+    FILL_GENERATE_DTYPE_CASE(DT_INT32, int32_t)
+    FILL_GENERATE_DTYPE_CASE(DT_UINT32, uint32_t)
+    FILL_GENERATE_DTYPE_CASE(DT_INT64, int64_t)
+    FILL_GENERATE_DTYPE_CASE(DT_UINT64, uint64_t)
+    FILL_GENERATE_DTYPE_CASE(DT_BOOL, bool)
+    FILL_GENERATE_DTYPE_CASE(DT_FLOAT16, Eigen::half)
+    FILL_GENERATE_DTYPE_CASE(DT_FLOAT, float)
+    FILL_GENERATE_DTYPE_CASE(DT_DOUBLE, double)
+    FILL_GENERATE_DTYPE_CASE(DT_COMPLEX64, std::complex<float>)
+    FILL_GENERATE_DTYPE_CASE(DT_COMPLEX128, std::complex<double>)
+    default:
+      KERNEL_LOG_ERROR(
+        "Fill kernel data type [%u] not support, not support data_types: DT_STRING, DT_DUAL_SUB_INT8, "
+        "DT_DUAL_SUB_UINT8, DT_QUINT8, DT_QINT8, DT_QINT32, DT_QINT16, DT_QUINT16, DT_RESOURCE, DT_STRING_REF, "
+        "DT_DUAL, DT_UNDEFINED.",
+        output_dtype);
+      return KERNEL_STATUS_PARAM_INVALID;
  }
-  calls[output_dtype](value_tensor, output);
+
  return KERNEL_STATUS_OK;
 }

 template <typename T>
-uint32_t FillCpuKernel::CalcDims(const Tensor *dims_tensor, std::vector<int64_t> &dim_vec) {
+uint32_t FillCpuKernel::CalculateDims(const Tensor *dims_tensor, std::vector<int64_t> &dims) {
+  // 获取第一个输入tensor中的元素个数，第一个输入是一个一维的tensor(dims_tensor)
  uint64_t data_num = dims_tensor->GetDataSize() / sizeof(T);
-  if (data_num == 0) {
-    KERNEL_LOG_INFO("Fill kernel: dims is empty, fill scalar output.");
-    return KERNEL_STATUS_OK;
-  }
+  auto dims_data = reinterpret_cast<const T *>(dims_tensor->GetData());

-  KERNEL_CHECK_NULLPTR(dims_tensor->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get dims data failed")
  for (uint64_t i = 0; i < data_num; i++) {
-    auto dim = *(reinterpret_cast<const T *>(dims_tensor->GetData()) + i);
+    auto dim = *(dims_data + i);
    if (dim < 0) {
-      KERNEL_LOG_ERROR("Fill kernel: input dim [%llu] is negative, value=[%lld]", i, static_cast<int64_t>(dim));
+      KERNEL_LOG_ERROR("dims input dim [%llu] is negative, value=[%lld].", i, static_cast<int64_t>(dim));
      return KERNEL_STATUS_PARAM_INVALID;
    }
-    // zero dim is different from empty dim.
    if (dim == 0) {
-      KERNEL_LOG_INFO("Fill kernel: input dim [%llu] is zero", i);
+      KERNEL_LOG_INFO("dims input dim [%llu] is zero.", i);
+      dims.clear();
+      break;
    }
-    dim_vec.emplace_back(dim);
+    dims.emplace_back(dim);
  }

  return KERNEL_STATUS_OK;
 }

+template <typename T>
+void FillCpuKernel::FillOutput(CpuKernelContext &ctx, const Tensor *value_tensor, Tensor *output) {
+  auto value = reinterpret_cast<T *>(value_tensor->GetData());
+  auto output_data = reinterpret_cast<T *>(output->GetData());
+  int64_t data_num = output->NumElements();
+
+  if (data_num >= kParallelDataNumCriticalPoint1) {
+    uint32_t min_core_num = 1;
+    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
+
+    if (data_num <= kParallelDataNumCriticalPoint2) {
+      max_core_num = std::min(max_core_num, 4U);
+    }
+
+    if (max_core_num > data_num) {
+      max_core_num = data_num;
+    }
+
+    auto shared_fill = [&](int64_t start, int64_t end) { SpecialFillOutput<T>(start, end, output_data, value); };
+
+    CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_fill);
+  } else {
+    SpecialFillOutput<T>(0, data_num, output_data, value);
+  }
+}
+
+template <typename T>
+void FillCpuKernel::SpecialFillOutput(int64_t start, int64_t end, T *output_data, const T *value) {
+  for (int64_t i = start; i < end; i++) {
+    *(output_data + i) = *(value);
+  }
+}
+
 REGISTER_CPU_KERNEL(kFill, FillCpuKernel);
+REGISTER_CPU_KERNEL(kFillV2, FillCpuKernel);
 }  // namespace aicpu
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/fill.h
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/fill.h
@ -1,5 +1,5 @@
 /**
- * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
+ * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@ -14,8 +14,8 @@
 * limitations under the License.
 */

-#ifndef AICPU_KERNELS_NORMALIZED_FILL_H
-#define AICPU_KERNELS_NORMALIZED_FILL_H
+#ifndef AICPU_KERNELS_NORMALIZED_FILL_H_
+#define AICPU_KERNELS_NORMALIZED_FILL_H_

 #include "cpu_ops_kernel.h"

@ -23,21 +23,18 @@ namespace aicpu {
 class FillCpuKernel : public CpuKernel {
 public:
  FillCpuKernel() = default;
-  ~FillCpuKernel() override = default;
+  ~FillCpuKernel() = default;
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
-  uint32_t GetDimsByType(CpuKernelContext &ctx);
-  /**
-   * @brief calc dims from input dims tensor
-   * @param dims_tensor input dims tensor
-   * @param dims output shape dims
-   * @return status if success
-   */
  template <typename T>
-  uint32_t CalcDims(const Tensor *dims_tensor, std::vector<int64_t> &dims);
+  uint32_t CalculateDims(const Tensor *dims_tensor, std::vector<int64_t> &dims);

-  std::vector<int64_t> dims;
+  template <typename T>
+  void FillOutput(CpuKernelContext &ctx, const Tensor *value_tensor, Tensor *output);
+
+  template <typename T>
+  void SpecialFillOutput(int64_t start, int64_t end, T *output_data, const T *value);
 };
 }  // namespace aicpu
-#endif  // AICPU_KERNELS_NORMALIZED_FILL_H_
+#endif
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/log_normal_reverse.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/log_normal_reverse.cc
@ -0,0 +1,132 @@
+/**
+ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "log_normal_reverse.h"
+#include <random>
+#include <set>
+#include "cpu_kernel_utils.h"
+#include "cpu_ops_kernel.h"
+#include "utils/eigen_tensor.h"
+#include "utils/kernel_util.h"
+#include <ctime>
+#include <iostream>
+
+#include "Eigen/Core"
+using namespace std;
+using namespace Eigen;
+
+namespace {
+const uint32_t kNumInput = 1;
+const uint32_t kNumOutput = 1;
+
+const char *kLogNormalReverse = "LogNormalReverse";
+const int64_t kParallelDataNumSameShape = 16 * 1024;
+const int64_t kParallelDataNumMid = 128 * 1024;
+}  // namespace
+namespace aicpu {
+uint32_t LogNormalReverseCpuKernel::GetInputAndCheck(CpuKernelContext &ctx) {
+  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kNumInput, kNumOutput), "LogNormalReverse check input and output failed.");
+  // get and check input
+  Tensor *input = ctx.Input(0);
+  inputs_.push_back(input);
+
+  // get output Tensors
+  Tensor *output = ctx.Output(0);
+  outputs_.push_back(output);
+
+  return KERNEL_STATUS_OK;
+}
+
+template <typename T>
+uint32_t LogNormalReverseCpuKernel::DoCompute(CpuKernelContext &ctx) {
+  float input_mean = 1.0;
+  float input_std = 2.0;
+
+  auto mean_value = ctx.GetAttr("mean");
+  auto std_value = ctx.GetAttr("std");
+
+  if (mean_value != nullptr) {
+    input_mean = mean_value->GetFloat();
+  }
+  if (std_value != nullptr) {
+    input_std = std_value->GetFloat();
+  }
+
+  T *output_y = reinterpret_cast<T *>(outputs_[0]->GetData());
+
+  static default_random_engine random_engine(time(0));
+  static std::normal_distribution<float> normal_value(input_mean, input_std);
+
+  int64_t Nums = inputs_[0]->GetTensorShape()->NumElements();
+
+  int64_t data_num = Nums;
+  if (data_num >= kParallelDataNumSameShape) {
+    uint32_t max_core_num = std::max(1U, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
+
+    if (data_num <= kParallelDataNumMid) {
+      max_core_num = std::min(max_core_num, 4U);
+    }
+    if (max_core_num > data_num) {
+      max_core_num = data_num;
+    }
+
+    auto shared_lognormalreverse = [&](size_t start, size_t end) {
+      for (size_t i = start; i < end; i++) {
+        output_y[i] = static_cast<T>(std::exp(normal_value(random_engine)));
+      }
+    };
+
+    if (max_core_num == 0) {
+      max_core_num = 1;
+    }
+    CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_lognormalreverse);
+  } else {
+    for (int64_t i = 0; i < Nums; i++) {
+      output_y[i] = static_cast<T>(std::exp(normal_value(random_engine)));
+    }
+  }
+  return KERNEL_STATUS_OK;
+}
+
+uint32_t LogNormalReverseCpuKernel::Compute(CpuKernelContext &ctx) {
+  uint32_t res = GetInputAndCheck(ctx);
+  if (res != KERNEL_STATUS_OK) {
+    return res;
+  }
+
+  DataType input_type{ctx.Input(0)->GetDataType()};
+  switch (input_type) {
+    case (DT_FLOAT16): {
+      DoCompute<Eigen::half>(ctx);
+      break;
+    }
+    case (DT_FLOAT): {
+      DoCompute<float>(ctx);
+      break;
+    }
+    default:
+      KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(),
+                       DTypeStr(input_type).c_str());
+      res = KERNEL_STATUS_PARAM_INVALID;
+  }
+  if (res != KERNEL_STATUS_OK) {
+    KERNEL_LOG_ERROR("log normal reverse failed");
+    return res;
+  }
+  return KERNEL_STATUS_OK;
+}
+REGISTER_CPU_KERNEL(kLogNormalReverse, LogNormalReverseCpuKernel);
+}  // namespace aicpu
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/log_normal_reverse.h
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/log_normal_reverse.h
@ -1,5 +1,5 @@
 /**
- * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
+ * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@ -13,32 +13,26 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-#ifndef AICPU_KERNELS_NORMALIZED_CACHE_SWAP_TABLE_H
-#define AICPU_KERNELS_NORMALIZED_CACHE_SWAP_TABLE_H

-#include <cmath>
-#include <vector>
+#ifndef AICPU_KERNELS_NORMALIZED_LOGNORMALREVERSE_H_
+#define AICPU_KERNELS_NORMALIZED_LOGNORMALREVERSE_H_
+
 #include "cpu_ops_kernel.h"

 namespace aicpu {
-class CacheSwapTableMsCpuKernel : public CpuKernel {
+class LogNormalReverseCpuKernel : public CpuKernel {
 public:
-  ~CacheSwapTableMsCpuKernel() = default;
+  LogNormalReverseCpuKernel() = default;
+  ~LogNormalReverseCpuKernel() override = default;
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
-  uint32_t DoCompute();
-
+  template <typename T>
+  uint32_t DoCompute(CpuKernelContext &ctx);
  uint32_t GetInputAndCheck(CpuKernelContext &ctx);

-  int64_t batch_size_ = 1;
-  int64_t one_line_col_ = 1;
-  int64_t output_size_ = 1;
-
  std::vector<Tensor *> inputs_;
  std::vector<Tensor *> outputs_;
-  DataType param_type_ = DT_FLOAT;
-  DataType indices_type_ = DT_INT32;
 };
 }  // namespace aicpu
-#endif
+#endif  // AICPU_KERNELS_NORMALIZED_LOGNORMALREVERSE_H_
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/nuclear_norm.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/nuclear_norm.cc
@ -21,7 +21,6 @@
 #include <iomanip>
 #include <iostream>
 #include <unsupported/Eigen/CXX11/Tensor>
-#include "kernel_util.h"
 #include "utils/kernel_util.h"
 #define NoneN 1000
 using namespace Eigen;
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_max.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_max.cc
@ -0,0 +1,217 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "segment_max.h"
+
+#include "cpu_kernel_utils.h"
+#include "utils/eigen_tensor.h"
+#include "utils/kernel_util.h"
+#include "cpu_kernel/common/runtime_tensor_desc.h"
+
+namespace {
+const uint32_t kInputNum = 2;
+const uint32_t kOutputNum = 1;
+const char *kSegmentMax = "SegmentMax";
+const int64_t kDataSize = 2 * 1024;
+
+#define SEGMENTMAX_COMPUTE_CASE(DTYPE, TYPE1, TYPE2, CTX)    \
+  case (DTYPE): {                                            \
+    uint32_t result = SegmentMaxCompute<TYPE1, TYPE2>(CTX);  \
+    if (result != KERNEL_STATUS_OK) {                        \
+      KERNEL_LOG_ERROR("SegmentMax kernel compute failed."); \
+      return result;                                         \
+    }                                                        \
+    break;                                                   \
+  }
+
+#define SEGMENTMAX_COMPUTE_CASE_ALL(TYPE, CTX)                \
+  SEGMENTMAX_COMPUTE_CASE(DT_INT8, int8_t, TYPE, CTX)         \
+  SEGMENTMAX_COMPUTE_CASE(DT_INT16, int16_t, TYPE, CTX)       \
+  SEGMENTMAX_COMPUTE_CASE(DT_INT32, int32_t, TYPE, CTX)       \
+  SEGMENTMAX_COMPUTE_CASE(DT_INT64, int64_t, TYPE, CTX)       \
+  SEGMENTMAX_COMPUTE_CASE(DT_UINT8, uint8_t, TYPE, CTX)       \
+  SEGMENTMAX_COMPUTE_CASE(DT_UINT16, uint16_t, TYPE, CTX)     \
+  SEGMENTMAX_COMPUTE_CASE(DT_UINT32, uint32_t, TYPE, CTX)     \
+  SEGMENTMAX_COMPUTE_CASE(DT_UINT64, uint64_t, TYPE, CTX)     \
+  SEGMENTMAX_COMPUTE_CASE(DT_FLOAT16, Eigen::half, TYPE, CTX) \
+  SEGMENTMAX_COMPUTE_CASE(DT_FLOAT, float, TYPE, CTX)         \
+  SEGMENTMAX_COMPUTE_CASE(DT_DOUBLE, double, TYPE, CTX)
+}  // namespace
+
+namespace aicpu {
+uint32_t SegmentMaxCpuKernel::Compute(CpuKernelContext &ctx) {
+  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "SegmentMax check input and output number failed.");
+  auto data_type = ctx.Input(0)->GetDataType();
+  auto segment_ids_type = ctx.Input(1)->GetDataType();
+  switch (segment_ids_type) {
+    case DT_INT32: {
+      switch (data_type) {
+        SEGMENTMAX_COMPUTE_CASE_ALL(int32_t, ctx)
+        default:
+          KERNEL_LOG_ERROR("Input[0] data type[%s] not supported.", DTypeStr(data_type).c_str());
+          return KERNEL_STATUS_PARAM_INVALID;
+      }
+      break;
+    }
+    case DT_INT64: {
+      switch (data_type) {
+        SEGMENTMAX_COMPUTE_CASE_ALL(int64_t, ctx)
+        default:
+          KERNEL_LOG_ERROR("Input[0] data type[%s] not supported.", DTypeStr(data_type).c_str());
+          return KERNEL_STATUS_PARAM_INVALID;
+      }
+      break;
+    }
+    default: {
+      KERNEL_LOG_ERROR("Input[1] data type[%s] not supported.", DTypeStr(segment_ids_type).c_str());
+      return KERNEL_STATUS_PARAM_INVALID;
+    }
+  }
+  return KERNEL_STATUS_OK;
+}
+
+template <typename T1, typename T2>
+uint32_t SegmentMaxCpuKernel::SegmentMaxCompute(CpuKernelContext &ctx) {
+  Tensor *input_x_data = ctx.Input(0);
+  auto input_x_addr = reinterpret_cast<T1 *>(input_x_data->GetData());
+  auto input_x_shape = input_x_data->GetTensorShape();
+  auto input_x_dims = input_x_shape->GetDimSizes();
+  int64_t input_x_num = input_x_data->NumElements();
+  Tensor *segment_ids_data = ctx.Input(1);
+  auto segment_ids_data_addr = reinterpret_cast<T2 *>(segment_ids_data->GetData());
+  int64_t segment_ids_data_num = segment_ids_data->NumElements();
+  input_x_dims[0] = segment_ids_data_addr[segment_ids_data_num - 1] + 1;
+  Tensor *output_data = ctx.Output(0);
+  auto output_data_addr = reinterpret_cast<T1 *>(output_data->GetData());
+  auto output_data_shape = output_data->GetTensorShape();
+  if (output_data_shape->GetDimSize(0) < input_x_dims[0]) {
+    KERNEL_LOG_ERROR("The number of segments of the segmentation result of segment_ids is too large.");
+    return KERNEL_STATUS_PARAM_INVALID;
+  }
+  output_data_shape->SetDimSizes(input_x_dims);
+  if (!output_data->SetTensorShape(output_data_shape.get())) {
+    KERNEL_LOG_ERROR("Set output shape failed.");
+    return KERNEL_STATUS_INNER_ERROR;
+  }
+  int64_t output_data_num = output_data->NumElements();
+  for (int64_t i = 0; i < output_data_num; i++) {
+    output_data_addr[i] = static_cast<T1>(0);
+  }
+  std::vector<int64_t> segments_segment_ids;
+  if (segment_ids_data_num != (input_x_data->GetTensorShape()->GetDimSize(0))) {
+    KERNEL_LOG_ERROR("The amount of data for input[1] must be equal to the first dimension of input[0].");
+    return KERNEL_STATUS_PARAM_INVALID;
+  }
+  if (segment_ids_data_addr[0] < 0) {
+    KERNEL_LOG_ERROR("Input[1] must be nonnegative data.");
+    return KERNEL_STATUS_PARAM_INVALID;
+  }
+  int64_t seg_tmp = 1;
+  for (int64_t i = 0; i < segment_ids_data_num - 1; i++) {
+    if (segment_ids_data_addr[i] > segment_ids_data_addr[i + 1]) {
+      KERNEL_LOG_ERROR("Input[1] must be an ascending ordered sequence.");
+      return KERNEL_STATUS_PARAM_INVALID;
+    }
+    if (segment_ids_data_addr[i] == segment_ids_data_addr[i + 1]) {
+      seg_tmp++;
+    } else {
+      segments_segment_ids.push_back(seg_tmp);
+      seg_tmp = 1;
+    }
+    if (i == segment_ids_data_num - ge::DIM_SIZE2) {
+      segments_segment_ids.push_back(seg_tmp);
+    }
+  }
+  const int64_t num_compare_per = input_x_num / (input_x_shape->GetDimSize(0));
+  const int64_t num_segments_segment_ids = segments_segment_ids.size();
+  if (num_segments_segment_ids < kDataSize) {
+    for (int64_t i = 0; i < num_segments_segment_ids; i++) {
+      int64_t count = segments_segment_ids[i];
+      int64_t count_no = 0;
+      for (int64_t j = 0; j < i; j++) {
+        count_no += segments_segment_ids[j];
+      }
+      int64_t input_addr_base = count_no * num_compare_per;
+      if (num_compare_per < kDataSize) {
+        for (int64_t j = 0; j < num_compare_per; j++) {
+          int64_t max_init_addr = input_addr_base + j;
+          T1 max_value = input_x_addr[max_init_addr];
+          for (int64_t k = 1; k < count; k++) {
+            int cmp_addr = max_init_addr + k * num_compare_per;
+            if (max_value < input_x_addr[cmp_addr]) {
+              max_value = input_x_addr[cmp_addr];
+            }
+          }
+          output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = max_value;
+        }
+      } else {
+        uint32_t min_core_num = 1;
+        int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
+        if (max_core_num > num_compare_per) {
+          max_core_num = num_compare_per;
+        }
+        auto shard_compute = [&](size_t start, size_t end) {
+          for (size_t j = start; j < end; j++) {
+            int64_t max_init_addr = input_addr_base + j;
+            T1 max_value = input_x_addr[max_init_addr];
+            for (int64_t k = 1; k < count; k++) {
+              int cmp_addr = max_init_addr + k * num_compare_per;
+              if (max_value < input_x_addr[cmp_addr]) {
+                max_value = input_x_addr[cmp_addr];
+              }
+            }
+            output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = max_value;
+          }
+        };
+        KERNEL_HANDLE_ERROR(
+          CpuKernelUtils::ParallelFor(ctx, num_compare_per, num_compare_per / max_core_num, shard_compute),
+          "SegmentMax Compute failed.");
+      }
+    }
+  } else {
+    uint32_t min_core_num_seg = 1;
+    int64_t max_core_num_seg = std::max(min_core_num_seg, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
+    if (max_core_num_seg > num_segments_segment_ids) {
+      max_core_num_seg = num_segments_segment_ids;
+    }
+    auto shard_compute_seg = [&](size_t start_seg, size_t end_seg) {
+      for (size_t i = start_seg; i < end_seg; i++) {
+        int64_t count = segments_segment_ids[i];
+        int64_t count_no = 0;
+        for (size_t j = 0; j < i; j++) {
+          count_no += segments_segment_ids[j];
+        }
+        int64_t input_addr_base = count_no * num_compare_per;
+        for (int64_t j = 0; j < num_compare_per; j++) {
+          int64_t max_init_addr = input_addr_base + j;
+          T1 max_value = input_x_addr[max_init_addr];
+          for (int64_t k = 1; k < count; k++) {
+            int cmp_addr = max_init_addr + k * num_compare_per;
+            if (max_value < input_x_addr[cmp_addr]) {
+              max_value = input_x_addr[cmp_addr];
+            }
+          }
+          output_data_addr[segment_ids_data_addr[count_no] * num_compare_per + j] = max_value;
+        }
+      }
+    };
+    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, num_segments_segment_ids,
+                                                    num_segments_segment_ids / max_core_num_seg, shard_compute_seg),
+                        "SegmentMax Compute failed.");
+  }
+  return KERNEL_STATUS_OK;
+}
+REGISTER_CPU_KERNEL(kSegmentMax, SegmentMaxCpuKernel);
+}  // namespace aicpu
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_max.h
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_max.h
@ -0,0 +1,35 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef AICPU_KERNELS_NORMALIZED_SEGMENTMAX_H_
+#define AICPU_KERNELS_NORMALIZED_SEGMENTMAX_H_
+
+#include "cpu_ops_kernel.h"
+
+namespace aicpu {
+class SegmentMaxCpuKernel : public CpuKernel {
+ public:
+  SegmentMaxCpuKernel() = default;
+  ~SegmentMaxCpuKernel() override = default;
+
+ protected:
+  uint32_t Compute(CpuKernelContext &ctx) override;
+
+ private:
+  template <typename T1, typename T2>
+  static uint32_t SegmentMaxCompute(CpuKernelContext &ctx);
+};
+}  // namespace aicpu
+#endif
--- a/mindspore/ccsrc/plugin/device/ascend/optimizer/mindir/aicpu_lib_select.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/mindir/aicpu_lib_select.cc
@ -54,7 +54,6 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
  static const std::set<std::string> kMigrateAicpuKernelOps = {mindspore::kAdaptiveAvgPool2DV1OpName,
                                                               mindspore::kAdaptiveAvgPool2DGradV1OpName,
                                                               mindspore::kBucketizeOpName,
-                                                               mindspore::kCacheSwapTableOpName,
                                                               mindspore::kCauchyOpName,
                                                               mindspore::kChannelShuffleOpName,
                                                               mindspore::kCholeskyOpName,
@ -252,7 +251,10 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
                                                               mindspore::kLogicalXorOpName,
                                                               mindspore::kLogNormalReverseOpName,
                                                               mindspore::kBetaincOpName,
-                                                               mindspore::kLessEqualOpName};
+                                                               mindspore::kLessEqualOpName,
+                                                               mindspore::kHSVToRGBOpName,
+                                                               mindspore::kLuSolveOpName,
+                                                               mindspore::kExtractGlimpseOpName};

  static const std::string kEnvOpSoNames = "mindspore_aicpu_kernels";
  static const std::string kCpuKernelSoName = "mindspore_cpu_kernels";
--- a/mindspore/python/mindspore/nn/loss/loss.py
+++ b/mindspore/python/mindspore/nn/loss/loss.py
@ -1160,7 +1160,7 @@ class PoissonNLLLoss(LossBase):
    Args:
        log_input (bool, optional): Whether use log input. Default: True.
        full (bool, optional): Whether include the Stirling approximation term in the loss calculation. Default: False.
-        eps (float, optional): Lower bound of `x` when calculating logarithms. Default: 1e-8.
+        eps (float, optional): Lower bound of `x` when calculating logarithms. Default: 1e-08.
        reduction (str, optional): Apply specific reduction method to the output:
            'none', 'mean', 'sum'. Default: 'mean'.

--- a/mindspore/python/mindspore/ops/_op_impl/aicpu/init.py
+++ b/mindspore/python/mindspore/ops/_op_impl/aicpu/init.py
@ -350,3 +350,5 @@ from .hsv_to_rgb import _hsv_to_rgb_aicpu
 from .im2col import _im2col_aicpu
 from .lu_solve import _lu_solve_aicpu
 from .relu_grad_v3 import _relu_grad_v3_aicpu
+from .resize_bicubic import _resize_bicubic_aicpu
+from .extract_glimpse import _extract_glimpse_aicpu
--- a/mindspore/python/mindspore/ops/operations/init.py
+++ b/mindspore/python/mindspore/ops/operations/init.py
@ -86,7 +86,7 @@ from .math_ops import (Abs, ACos, Asin, Asinh, AddN, AccumulateNV2, AssignAdd, A
                       MatrixLogarithm, MatrixPower, MatrixSolve, MatrixTriangularSolve, ReduceStd, STFT,
                       NextAfter, Orgqr, Qr, RaggedRange, Digamma, Eig, EuclideanNorm, CompareAndBitpack, ComplexAbs,
                       CumulativeLogsumexp, Gcd, Trace, TridiagonalMatMul, TrilIndices, TriuIndices, Zeta,
-                       Roll, Lgamma, Logit)
+                       Roll, Lgamma, Logit, MatrixSolveLs)
 from .nn_ops import (LSTM, SGD, Adam, AdamWeightDecay, FusedSparseAdam, FusedSparseLazyAdam, AdamNoUpdateParam,
                     ApplyMomentum, BatchNorm, BiasAdd, Conv2D, Conv3D, Conv2DTranspose, Conv3DTranspose,
                     DepthwiseConv2dNative,
@ -647,7 +647,8 @@ __all__ = [
    "SparseSlice",
    "ResizeLinear1D",
    "ResizeBicubic",
-    "Logit"
+    "Logit",
+    "MatrixSolveLs"
 ]

 __custom__ = [
--- a/mindspore/python/mindspore/ops/operations/image_ops.py
+++ b/mindspore/python/mindspore/ops/operations/image_ops.py
@ -465,10 +465,10 @@ class NonMaxSuppressionWithOverlaps(Primitive):

    Examples:
        >>> overlaps = Tensor(np.array([[0.6964692, 0.28613934, 0.22685145, 0.5513148],
-                                [0.71946895, 0.42310646, 0.9807642, 0.6848297],
-                                [0.4809319, 0.39211753, 0.343178, 0.7290497],
-                                [0.43857226, 0.059677895, 0.39804426, 0.7379954]
-                                ]), mstype.float32)
+        ...                     [0.71946895, 0.42310646, 0.9807642, 0.6848297],
+        ...                     [0.4809319, 0.39211753, 0.343178, 0.7290497],
+        ...                     [0.43857226, 0.059677895, 0.39804426, 0.7379954]
+        ...                     ]), mstype.float32)
        >>> scores = Tensor(np.array([0.18249173, 0.17545176, 0.53155136, 0.53182757]), mstype.float32)
        >>> max_output_size = Tensor(4, mstype.int32)
        >>> overlap_threshold = Tensor(0.1, mstype.float32)
--- a/mindspore/python/mindspore/ops/operations/math_ops.py
+++ b/mindspore/python/mindspore/ops/operations/math_ops.py
@ -260,6 +260,7 @@ class Addcdiv(Primitive):

    Raises:
        TypeError: If dtype of `x1`, `x2`, `value`, `input_data` is not tensor.
+        TypeError: If dtype of `x1`, `x2`, `value`, `input_data` are not the same.
        ValueError: If `x1` could not be broadcast to `x2`.
        ValueError: If `value` could not be broadcast to `x1/x2`.
        ValueError: If `input_data` could not be broadcast to `value*(x1/x2)`.
@ -303,9 +304,7 @@ class Addcmul(Primitive):

    Raises:
        TypeError: If dtype of `x1`, `x2`, `value`, `input_data` is not tensor.
-        TypeError: If dtype of `input_data` is not one of: float32, float16, int32.
-        TypeError: If dtype of `x1` or `x2` is not one of: float32, float16, int32.
-        TypeError: If dtype of `value` is not one of: float32, float16, int32.
+        TypeError: If dtype of `x1`, `x2`, `value`, `input_data` are not the same.
        ValueError: If `x1` could not be broadcast to `x2`.
        ValueError: If `value` could not be broadcast to `x1` * `x2`.
        ValueError: If `input_data` could not be broadcast to `value*(x1*x2)`.