!49948 NPU V2 overflow check

Merge pull request !49948 from archer2049/master_npu3_merge
This commit is contained in:
i-robot 2023-03-08 06:47:14 +00:00 committed by Gitee
commit 522d61880a
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
37 changed files with 1280 additions and 240 deletions

View File

@ -12,7 +12,6 @@ mindspore.amp.all_finite
参数:
- **inputs** (Union(tuple(Tensor), list(Tensor))) - 可迭代的Tensor。
- **status** (Tensor) - 溢出检测时所需要的初始状态,仅在Ascend需要。默认值:None。
返回:
Tensor,布尔类型的标量Tensor。

View File

@ -1,12 +0,0 @@
mindspore.amp.init_status
===========================
.. py:function:: mindspore.amp.init_status()
初始化溢出状态检测变量。
.. note::
该接口仅在Ascend后端有效,在GPU、CPU上调用的返回值没有作用。
返回:
Tensor,shape为 (8,) 。

View File

@ -35,5 +35,4 @@ mindspore.amp
:nosignatures:
:template: classtemplate.rst
mindspore.amp.init_status
mindspore.amp.all_finite

View File

@ -35,5 +35,4 @@ Overflow Detection
:nosignatures:
:template: classtemplate.rst
mindspore.amp.init_status
mindspore.amp.all_finite

View File

@ -572,6 +572,8 @@ constexpr auto kNonZeroOpName = "NonZero";
constexpr auto kNPUAllocFloatStatusOpName = "NPUAllocFloatStatus";
constexpr auto kNPUClearFloatStatusOpName = "NPUClearFloatStatus";
constexpr auto kNPUGetFloatStatusOpName = "NPUGetFloatStatus";
constexpr auto kNPUClearFloatStatusV2OpName = "NPUClearFloatStatusV2";
constexpr auto kNPUGetFloatStatusV2OpName = "NPUGetFloatStatusV2";
constexpr auto kNthElementOpName = "NthElement";
constexpr auto kOneHotOpName = "OneHot";
constexpr auto kOneHotDOpName = "OneHotD";

View File

@ -24,14 +24,31 @@
namespace mindspore {
namespace kernel {
constexpr size_t kJsonSuffixLength = 5;
constexpr char kMagic[] = "magic";
constexpr char kBlockDim[] = "blockDim";
constexpr char kKernelName[] = "kernelName";
constexpr char kBinFileName[] = "binFileName";
constexpr char kBinFileSuffix[] = "binFileSuffix";
constexpr char kCoreType[] = "core_type";
constexpr char kTaskRation[] = "taskRation";
constexpr char kWorkspace[] = "workspace";
constexpr char kParameters[] = "parameters";
constexpr char kOpParaSize[] = "opParaSize";
constexpr char kSHA256[] = "sha256";
constexpr char kKBHit[] = "KBHit";
constexpr char kKernelList[] = "kernelList";
constexpr char kModeInArgsFirstField[] = "modeInArgsFirstField";
constexpr char kBatchBindOnly[] = "batchBindOnly";
constexpr char kArgsRemap[] = "args_remap";
constexpr char kSize[] = "size";
constexpr char kGlobalWorkspaceSpecWorkspace[] = "globalworkspace_spec_workspace";
namespace {
bool CheckHash(const std::string &json_file, const std::string &bin_file, const nlohmann::json &js) {
if (js.find("sha256") == js.end()) {
MS_LOG(ERROR) << "No sha256 found in " << json_file;
if (js.find(kSHA256) == js.end()) {
return false;
}
std::string sha256_cal = system::sha256::GetHashFromFile(bin_file);
std::string sha256_str = js["sha256"];
std::string sha256_str = js[kSHA256];
if (sha256_cal.empty() || sha256_cal != sha256_str) {
MS_LOG(WARNING) << "Check sha256 for [" << bin_file << "] failed, it will try to rebuild the op.";
return false;
@ -154,9 +171,9 @@ bool KernelPack::ReadFromJsonFile(const std::string &json_f, const std::string &
}
// cuda json file may have workspace information
if (js.find("workspace") != js.end()) {
auto workspace = js.at("workspace");
std::vector<size_t> sizes = workspace.at("size");
if (js.find(kWorkspace) != js.end()) {
auto workspace = js.at(kWorkspace);
std::vector<size_t> sizes = workspace.at(kSize);
for (auto size : sizes) {
kernel_json_info_.workspaces.push_back(size);
}
@ -165,7 +182,7 @@ bool KernelPack::ReadFromJsonFile(const std::string &json_f, const std::string &
return true;
}
std::string binfile_suffix = js["binFileSuffix"];
std::string binfile_suffix = js[kBinFileSuffix];
std::string bin_f = json_f.substr(0, json_f.length() - kJsonSuffixLength) + binfile_suffix;
if (binfile_suffix == ".so") {
// change "xx/xx.so" -> "xx/libxx.so"
@ -282,18 +299,18 @@ void KernelPack::ParseWorkSpace(const std::string &key, const nlohmann::json &js
}
try {
auto workspace = js.at(key);
if (workspace.find("num") == workspace.end() || workspace.find("size") == workspace.end()) {
if (workspace.find("num") == workspace.end() || workspace.find(kSize) == workspace.end()) {
MS_LOG(WARNING) << "'num' and 'size' ars necessary in workspace, but not found. " << js.dump(indent);
return;
}
size_t num = workspace.at("num");
std::vector<size_t> sizes = workspace.at("size");
std::vector<size_t> sizes = workspace.at(kSize);
if (num != sizes.size()) {
MS_LOG(WARNING) << "'num' and length of 'size' must be same. " << js.dump(indent);
return;
}
if (workspace.find("type") != workspace.end()) {
std::vector<size_t> type = workspace.at("type");
if (workspace.find(kType) != workspace.end()) {
std::vector<size_t> type = workspace.at(kType);
if (num != type.size()) {
MS_LOG(WARNING) << "'num' and length of 'type' must be same. " << js.dump(indent);
return;
@ -383,24 +400,47 @@ void KernelPack::ParseArgsRemap(const std::string &key, const nlohmann::json &js
}
}
// Parses the "globalworkspace_spec_workspace" entry of the kernel json and
// stores its "size"/"type" fields into kernel_json_info->global_workspace.
// Either field being present marks the kernel as using the global overflow
// workspace (is_overflow = true).
// NOTE(review): "Glogble" is a typo for "Global"; the name is kept unchanged
// because the declaration and the parser map registration reference it.
void KernelPack::ParseGlogbleWorkSpace(const std::string &key, const nlohmann::json &js,
                                       KernelJsonInfo *kernel_json_info) {
  MS_EXCEPTION_IF_NULL(kernel_json_info);
  if (js.find(key) == js.end()) {
    // Entry is optional; nothing to parse.
    return;
  }
  try {
    auto global_workspace = js.at(key);
    if (global_workspace.find(kSize) != global_workspace.end()) {
      kernel_json_info->global_workspace.size = global_workspace.at(kSize);
      kernel_json_info->global_workspace.is_overflow = true;
    }
    if (global_workspace.find(kType) != global_workspace.end()) {
      kernel_json_info->global_workspace.type = global_workspace.at(kType);
      kernel_json_info->global_workspace.is_overflow = true;
    }
  } catch (const std::exception &e) {
    // Fixed log-message typo ("jsong" -> "json"); catch by const reference.
    MS_LOG(ERROR) << "Parse json value failed, json is:" + js.dump() + ", error info: " << e.what();
  }
}
void KernelPack::ParseKernelJson(const nlohmann::json &js) {
using KernelJsonParser = std::function<void(const std::string &, const nlohmann::json &, KernelJsonInfo *)>;
const std::map<std::string, KernelJsonParser> kernel_json_map = {{"magic", ParseMagic},
{"blockDim", ParseBlockDim},
{"kernelName", ParseKernelName},
{"binFileName", ParseBinFileName},
{"binFileSuffix", ParseBinFileSuffix},
{"core_type", ParseCoreType},
{"taskRation", ParseTaskRatio},
{"workspace", ParseWorkSpace},
{"parameters", ParseParameters},
{"opParaSize", ParseOpParaSize},
{"sha256", ParseSHA256},
{"KBHit", ParseKBHit},
{"kernelList", ParseKernelList},
{"modeInArgsFirstField", ParseModeInArgsFirstField},
{"batchBindOnly", ParseBatchBindOnly},
{"args_remap", ParseArgsRemap}};
const std::map<std::string, KernelJsonParser> kernel_json_map = {
{kMagic, ParseMagic},
{kBlockDim, ParseBlockDim},
{kKernelName, ParseKernelName},
{kBinFileName, ParseBinFileName},
{kBinFileSuffix, ParseBinFileSuffix},
{kCoreType, ParseCoreType},
{kTaskRation, ParseTaskRatio},
{kWorkspace, ParseWorkSpace},
{kParameters, ParseParameters},
{kOpParaSize, ParseOpParaSize},
{kSHA256, ParseSHA256},
{kKBHit, ParseKBHit},
{kKernelList, ParseKernelList},
{kModeInArgsFirstField, ParseModeInArgsFirstField},
{kBatchBindOnly, ParseBatchBindOnly},
{kArgsRemap, ParseArgsRemap},
{kGlobalWorkspaceSpecWorkspace, ParseGlogbleWorkSpace}};
auto iter = kernel_json_map.begin();
while (iter != kernel_json_map.end()) {
iter->second(iter->first, js, &kernel_json_info_);

View File

@ -123,6 +123,12 @@ struct FlexArray {
char contents[];
};
// Holds the global-workspace spec parsed from a kernel's json
// ("globalworkspace_spec_workspace" entry). Fields are zero-initialized so a
// KernelJsonInfo whose json carried no such entry reads deterministically.
struct GlobalWorkspace {
  size_t size{0};            // byte size of the global workspace, from json "size"
  size_t type{0};            // workspace type id, from json "type"
  bool is_overflow = false;  // true once either field was found in the json
};
struct KernelJsonInfo {
std::string bin_file_name;
std::string bin_file_suffix;
@ -133,6 +139,7 @@ struct KernelJsonInfo {
std::string sha256;
std::vector<size_t> workspaces_type;
std::vector<size_t> workspaces;
GlobalWorkspace global_workspace;
bool has_kernel_list = false;
uint32_t op_para_size;
int32_t KBHit;
@ -185,6 +192,7 @@ class BACKEND_EXPORT KernelPack {
static void ParseModeInArgsFirstField(const std::string &key, const nlohmann::json &js,
KernelJsonInfo *kernel_json_info);
static void ParseArgsRemap(const std::string &key, const nlohmann::json &js, KernelJsonInfo *kernel_json_info);
static void ParseGlogbleWorkSpace(const std::string &key, const nlohmann::json &js, KernelJsonInfo *kernel_json_info);
KernelJsonInfo kernel_json_info_;
FlexArray *json_;

View File

@ -19,6 +19,7 @@
#include <algorithm>
#include "ir/func_graph.h"
#include "runtime/mem.h"
#include "acl/acl_rt.h"
#include "utils/ms_context.h"
#include "utils/convert_utils_base.h"
#include "graphengine/inc/external/runtime/rt_error_codes.h"
@ -35,6 +36,7 @@ constexpr double kHalfRatio = 0.5;
// The Ascend max available device memory is 32GB.
constexpr float kAscendMaxDeviceMemory = 32;
constexpr uint64_t kOverflowAddrSize = 512;
constexpr char kGlobalOverflowWorkspace[] = "GLOBAL_OVERFLOW_WORKSPACE";
size_t AscendMemAdapter::GetRoundDownAlignSize(size_t input_size) {
return (input_size / kAscendMemAlignSize) * kAscendMemAlignSize;
@ -180,16 +182,16 @@ uint8_t *AscendMemAdapter::MallocDynamicDevMem(size_t size, const std::string &t
return memory_block_ptr;
}
uint8_t *AscendMemAdapter::MallocOverflowMem(const CNodePtr &kernel) {
uint8_t *AscendMemAdapter::MallocOverflowMem() {
std::lock_guard<std::mutex> locker(overflow_mutex_);
auto funcGraph = kernel->func_graph();
MS_EXCEPTION_IF_NULL(funcGraph);
if (overflow_memory_info_map_.find(funcGraph->ToString()) != overflow_memory_info_map_.cend()) {
return overflow_memory_info_map_.find(funcGraph->ToString())->second;
if (overflow_memory_info_map_.find(kGlobalOverflowWorkspace) != overflow_memory_info_map_.cend()) {
auto addr = overflow_memory_info_map_.find(kGlobalOverflowWorkspace);
return addr->second;
} else {
auto overflow_memory_ptr = MallocStaticDevMem(kOverflowAddrSize, "overflow memory ptr");
auto overflow_memory_ptr = MallocStaticDevMem(kOverflowAddrSize, "global overflow memory ptr");
MS_EXCEPTION_IF_NULL(overflow_memory_ptr);
(void)overflow_memory_info_map_.emplace(funcGraph->ToString(), overflow_memory_ptr);
(void)aclrtMemset(overflow_memory_ptr, kOverflowAddrSize, 0, kOverflowAddrSize);
(void)overflow_memory_info_map_.emplace(kGlobalOverflowWorkspace, overflow_memory_ptr);
return overflow_memory_ptr;
}
}

View File

@ -39,7 +39,7 @@ class AscendMemAdapter {
uint8_t *MallocStaticDevMem(size_t size, const std::string &tag = "");
uint8_t *MallocDynamicDevMem(size_t size, const std::string &tag = "");
uint8_t *MallocOverflowMem(const CNodePtr &kernel);
uint8_t *MallocOverflowMem();
bool FreeStaticDevMem(void *) const { return true; }
void ResetDynamicMemory();

View File

@ -735,74 +735,56 @@ void KernelAdjust::InsertProfilingKernel(const ProfilingTraceInfo &profiling_tra
}
#endif
CNodePtr KernelAdjust::CreateNPUGetFloatStatus(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
const CNodePtr &npu_alloc_cnode) const {
CNodePtr KernelAdjust::CreateNPUGetFloatStatusV2(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
const AnfNodePtr &status_value_node) const {
MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
MS_EXCEPTION_IF_NULL(npu_alloc_cnode);
auto npu_get_primitive = std::make_shared<Primitive>(kNPUGetFloatStatusOpName);
std::vector<AnfNodePtr> npu_get_inputs = {NewValueNode(npu_get_primitive), npu_alloc_cnode};
MS_EXCEPTION_IF_NULL(status_value_node);
auto npu_get_primitive = std::make_shared<Primitive>(kNPUGetFloatStatusV2OpName);
std::vector<AnfNodePtr> npu_get_inputs = {NewValueNode(npu_get_primitive), status_value_node};
auto npu_get_cnode = kernel_graph_ptr->NewCNode(npu_get_inputs);
MS_EXCEPTION_IF_NULL(npu_get_cnode);
npu_alloc_cnode->set_scope(kDefaultScope);
npu_get_cnode->set_abstract(npu_alloc_cnode->abstract());
status_value_node->set_scope(kDefaultScope);
ShapeVector npu_output_shape = {kNPUShape};
common::AnfAlgo::SetOutputInferTypeAndShape({kNumberTypeInt32}, {npu_output_shape}, npu_get_cnode.get());
kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder;
selected_kernel_builder.SetInputsFormat({kOpFormat_DEFAULT});
selected_kernel_builder.SetInputsDeviceType({kNumberTypeFloat32});
selected_kernel_builder.SetInputsDeviceType({kNumberTypeInt32});
selected_kernel_builder.SetFusionType(kernel::kPatternOpaque);
selected_kernel_builder.SetProcessor(kernel::Processor::AICORE);
selected_kernel_builder.SetKernelType(KernelType::TBE_KERNEL);
selected_kernel_builder.SetOutputsFormat({kOpFormat_DEFAULT});
selected_kernel_builder.SetOutputsDeviceType({kNumberTypeFloat32});
selected_kernel_builder.SetOutputsDeviceType({kNumberTypeInt32});
AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), npu_get_cnode.get());
return npu_get_cnode;
}
CNodePtr KernelAdjust::CreateNPUClearStatus(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
const CNodePtr &npu_alloc_cnode) const {
CNodePtr KernelAdjust::CreateNPUClearStatusV2(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
const AnfNodePtr &status_value_node) const {
MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
MS_EXCEPTION_IF_NULL(npu_alloc_cnode);
auto npu_clear_primitive = std::make_shared<Primitive>(kNPUClearFloatStatusOpName);
std::vector<AnfNodePtr> npu_clear_inputs = {NewValueNode(npu_clear_primitive), npu_alloc_cnode};
MS_EXCEPTION_IF_NULL(status_value_node);
auto npu_clear_primitive = std::make_shared<Primitive>(kNPUClearFloatStatusV2OpName);
std::vector<AnfNodePtr> npu_clear_inputs = {NewValueNode(npu_clear_primitive), status_value_node};
auto npu_clear_cnode = kernel_graph_ptr->NewCNode(npu_clear_inputs);
MS_EXCEPTION_IF_NULL(npu_clear_cnode);
npu_alloc_cnode->set_scope(kDefaultScope);
npu_clear_cnode->set_abstract(npu_alloc_cnode->abstract());
status_value_node->set_scope(kDefaultScope);
npu_clear_cnode->set_abstract(status_value_node->abstract());
ShapeVector npu_output_shape = {kNPUShape};
common::AnfAlgo::SetOutputInferTypeAndShape({kNumberTypeInt32}, {npu_output_shape}, npu_clear_cnode.get());
kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder;
selected_kernel_builder.SetInputsFormat({kOpFormat_DEFAULT});
selected_kernel_builder.SetInputsDeviceType({kNumberTypeFloat32});
selected_kernel_builder.SetInputsDeviceType({kNumberTypeInt32});
selected_kernel_builder.SetFusionType(kernel::kPatternOpaque);
selected_kernel_builder.SetProcessor(kernel::Processor::AICORE);
selected_kernel_builder.SetKernelType(KernelType::TBE_KERNEL);
selected_kernel_builder.SetOutputsFormat({kOpFormat_DEFAULT});
selected_kernel_builder.SetOutputsDeviceType({kNumberTypeFloat32});
selected_kernel_builder.SetOutputsDeviceType({kNumberTypeInt32});
AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), npu_clear_cnode.get());
return npu_clear_cnode;
}
CNodePtr KernelAdjust::CreateNPUAllocStatus(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) const {
MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
// create npu_alloc_cnode
auto npu_alloc_primitive = std::make_shared<Primitive>(kNPUAllocFloatStatusOpName);
std::vector<AnfNodePtr> npu_alloc_inputs = {NewValueNode(npu_alloc_primitive)};
auto npu_alloc_cnode = kernel_graph_ptr->NewCNode(npu_alloc_inputs);
MS_EXCEPTION_IF_NULL(npu_alloc_cnode);
npu_alloc_cnode->set_scope(kDefaultScope);
ShapeVector npu_output_shape = {kNPUShape};
common::AnfAlgo::SetOutputInferTypeAndShape({kNumberTypeFloat32}, {npu_output_shape}, npu_alloc_cnode.get());
kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder;
selected_kernel_builder.SetFusionType(kernel::kPatternOpaque);
selected_kernel_builder.SetProcessor(kernel::Processor::AICORE);
selected_kernel_builder.SetKernelType(KernelType::TBE_KERNEL);
selected_kernel_builder.SetOutputsFormat({kOpFormat_DEFAULT});
selected_kernel_builder.SetOutputsDeviceType({kNumberTypeFloat32});
AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), npu_alloc_cnode.get());
return npu_alloc_cnode;
}
CNodePtr KernelAdjust::CreateAssignAdd(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
const CNodePtr &npu_alloc_cnode, const AnfNodePtr &specify_para) const {
MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
@ -836,39 +818,41 @@ CNodePtr KernelAdjust::CreateAssignAdd(const std::shared_ptr<session::KernelGrap
return assign_add_cnode;
}
CNodePtr KernelAdjust::CreateAssign(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
const AnfNodePtr &specify_para) const {
MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
MS_EXCEPTION_IF_NULL(specify_para);
std::vector<float> reset(kNPUShape, 0.0);
ShapeVector reset_shape({kNPUShape});
auto shp_buf_size = sizeof(float) * reset.size();
auto reset_tensor = std::make_shared<tensor::Tensor>(kNumberTypeFloat32, reset_shape, reset.data(), shp_buf_size);
auto reset_value_node = std::make_shared<ValueNode>(reset_tensor);
MS_EXCEPTION_IF_NULL(reset_value_node);
reset_value_node->set_abstract(specify_para->abstract());
kernel_graph_ptr->AddValueNodeToGraph(reset_value_node);
AnfNodePtr KernelAdjust::CreateZerosValueNode(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) const {
std::vector<int32_t> zeros(kNPUShape, 0);
ShapeVector zeros_shape({kNPUShape});
auto shp_buf_size = sizeof(float) * zeros.size();
auto zeros_tensor = std::make_shared<tensor::Tensor>(kNumberTypeInt32, zeros_shape, zeros.data(), shp_buf_size);
auto zeros_value_node = std::make_shared<ValueNode>(zeros_tensor);
MS_EXCEPTION_IF_NULL(zeros_value_node);
kernel_graph_ptr->AddValueNodeToGraph(zeros_value_node);
auto kernel_info = std::make_shared<device::KernelInfo>();
MS_EXCEPTION_IF_NULL(kernel_info);
reset_value_node->set_kernel_info(kernel_info);
zeros_value_node->set_kernel_info(kernel_info);
kernel::KernelBuildInfo::KernelBuildInfoBuilder builder1;
builder1.SetOutputsFormat({kOpFormat_DEFAULT});
builder1.SetOutputsDeviceType({kNumberTypeFloat32});
AnfAlgo::SetSelectKernelBuildInfo(builder1.Build(), reset_value_node.get());
builder1.SetOutputsDeviceType({kNumberTypeInt32});
AnfAlgo::SetSelectKernelBuildInfo(builder1.Build(), zeros_value_node.get());
return zeros_value_node;
}
CNodePtr KernelAdjust::CreateAssign(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
const AnfNodePtr &specify_para, const AnfNodePtr &data) const {
MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
MS_EXCEPTION_IF_NULL(specify_para);
auto assign_primitive = std::make_shared<Primitive>(kAssignOpName);
std::vector<AnfNodePtr> assign_inputs = {NewValueNode(assign_primitive), specify_para, reset_value_node};
std::vector<AnfNodePtr> assign_inputs = {NewValueNode(assign_primitive), specify_para, data};
auto assign_cnode = kernel_graph_ptr->NewCNode(assign_inputs);
MS_EXCEPTION_IF_NULL(assign_cnode);
assign_cnode->set_scope(kDefaultScope);
assign_cnode->set_abstract(specify_para->abstract());
kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder = CreateMngKernelBuilder(
{kOpFormat_DEFAULT, kOpFormat_DEFAULT}, {TypeId::kNumberTypeFloat32, TypeId::kNumberTypeFloat32});
{kOpFormat_DEFAULT, kOpFormat_DEFAULT}, {TypeId::kNumberTypeInt32, TypeId::kNumberTypeInt32});
selected_kernel_builder.SetOutputsFormat({kOpFormat_DEFAULT});
selected_kernel_builder.SetOutputsDeviceType({kNumberTypeFloat32});
selected_kernel_builder.SetOutputsDeviceType({kNumberTypeInt32});
AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), assign_cnode.get());
std::vector<std::string> input_names = {"ref", "value"};
@ -944,7 +928,8 @@ void KernelAdjust::InsertGradientOverflowCheckOperations(
MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
bool first_grad_op = true;
CNodePtr npu_alloc_cnode;
auto status_value_node = CreateZerosValueNode(kernel_graph_ptr);
auto reset_value_node = CreateZerosValueNode(kernel_graph_ptr);
std::vector<CNodePtr> new_execution_order;
auto execution_order = kernel_graph_ptr->execution_order();
for (size_t i = 0; i < execution_order.size() - 1; i++) {
@ -956,39 +941,37 @@ void KernelAdjust::InsertGradientOverflowCheckOperations(
if (cur_full_name.find(kGradients) == std::string::npos && next_full_name.find(kGradients) != std::string::npos) {
if (first_grad_op) {
npu_alloc_cnode = CreateNPUAllocStatus(kernel_graph_ptr);
auto npu_clear_cnode = CreateNPUClearStatus(kernel_graph_ptr, npu_alloc_cnode);
auto assign_cnode = CreateAssign(kernel_graph_ptr, specify_para);
AnfAlgo::SetStreamId(next_stream_id, npu_alloc_cnode.get());
auto npu_clear_cnode = CreateNPUClearStatusV2(kernel_graph_ptr, status_value_node);
auto assign_cnode = CreateAssign(kernel_graph_ptr, specify_para, reset_value_node);
AnfAlgo::SetStreamId(next_stream_id, status_value_node.get());
AnfAlgo::SetStreamId(next_stream_id, npu_clear_cnode.get());
AnfAlgo::SetStreamId(next_stream_id, assign_cnode.get());
new_execution_order.push_back(npu_alloc_cnode);
new_execution_order.push_back(npu_clear_cnode);
new_execution_order.push_back(assign_cnode);
first_grad_op = false;
} else {
auto npu_clear_cnode = CreateNPUClearStatus(kernel_graph_ptr, npu_alloc_cnode);
auto npu_clear_cnode = CreateNPUClearStatusV2(kernel_graph_ptr, status_value_node);
AnfAlgo::SetStreamId(next_stream_id, npu_clear_cnode.get());
new_execution_order.push_back(npu_clear_cnode);
}
}
if (cur_full_name.find(kGradients) != std::string::npos && next_full_name.find(kGradients) == std::string::npos) {
auto npu_get_cnode = CreateNPUGetFloatStatus(kernel_graph_ptr, npu_alloc_cnode);
auto assign_add_cnode = CreateAssignAdd(kernel_graph_ptr, npu_alloc_cnode, specify_para);
auto npu_get_cnode = CreateNPUGetFloatStatusV2(kernel_graph_ptr, status_value_node);
auto assign_status_node = CreateAssign(kernel_graph_ptr, specify_para, npu_get_cnode);
AnfAlgo::SetStreamId(cur_stream_id, npu_get_cnode.get());
AnfAlgo::SetStreamId(cur_stream_id, npu_get_cnode.get());
AnfAlgo::SetStreamId(cur_stream_id, assign_add_cnode.get());
new_execution_order.push_back(npu_get_cnode);
new_execution_order.push_back(assign_add_cnode);
new_execution_order.push_back(assign_status_node);
}
if (i == execution_order.size() - kLastHandleDiff) {
new_execution_order.push_back(execution_order[i + 1]);
if (next_full_name.find(kGradients) != std::string::npos) {
auto npu_get_cnode = CreateNPUGetFloatStatus(kernel_graph_ptr, npu_alloc_cnode);
auto assign_add_cnode = CreateAssignAdd(kernel_graph_ptr, npu_alloc_cnode, specify_para);
auto npu_get_cnode = CreateNPUGetFloatStatusV2(kernel_graph_ptr, status_value_node);
auto assign_status_node = CreateAssign(kernel_graph_ptr, specify_para, npu_get_cnode);
AnfAlgo::SetStreamId(cur_stream_id, npu_get_cnode.get());
AnfAlgo::SetStreamId(cur_stream_id, assign_add_cnode.get());
AnfAlgo::SetStreamId(cur_stream_id, assign_status_node.get());
new_execution_order.push_back(npu_get_cnode);
new_execution_order.push_back(assign_add_cnode);
new_execution_order.push_back(assign_status_node);
}
}
}
@ -1030,18 +1013,16 @@ void KernelAdjust::InsertDynamicLossScaleCheckOperations(const std::shared_ptr<s
bool first_layer_op = true;
std::vector<CNodePtr> new_execution_order;
int64_t cur_param = static_cast<int64_t>(dynamic_loss_scale_param_list->size()) - 1;
CNodePtr npu_alloc_cnode;
auto status_value_node = CreateZerosValueNode(kernel_graph_ptr);
auto reset_value_node = CreateZerosValueNode(kernel_graph_ptr);
std::set<int64_t> viewed_id;
for (size_t i = 0; i < execution_order.size(); ++i) {
auto cur_node = execution_order[i];
auto cur_stream_id = AnfAlgo::GetStreamId(cur_node);
if (common::AnfAlgo::HasNodeAttr(kSplitOverFlow, cur_node) || (i == end_gradient_index)) {
if (first_layer_op) {
npu_alloc_cnode = CreateNPUAllocStatus(kernel_graph_ptr);
AnfAlgo::SetStreamId(cur_stream_id, npu_alloc_cnode.get());
(void)new_execution_order.emplace_back(npu_alloc_cnode);
for (const auto &param : *dynamic_loss_scale_param_list) {
auto assign_cnode = CreateAssign(kernel_graph_ptr, param);
auto assign_cnode = CreateAssign(kernel_graph_ptr, param, reset_value_node);
AnfAlgo::SetStreamId(cur_stream_id, assign_cnode.get());
(void)new_execution_order.emplace_back(assign_cnode);
}
@ -1055,22 +1036,19 @@ void KernelAdjust::InsertDynamicLossScaleCheckOperations(const std::shared_ptr<s
(void)new_execution_order.emplace_back(cur_node);
continue;
}
if (viewed_id.count(cur_param) != 0) {
auto assign_cnode = CreateAssign(kernel_graph_ptr, dynamic_loss_scale_param_list->at(cur_param));
AnfAlgo::SetStreamId(cur_stream_id, assign_cnode.get());
(void)new_execution_order.emplace_back(assign_cnode);
}
auto npu_get_cnode = CreateNPUGetFloatStatus(kernel_graph_ptr, npu_alloc_cnode);
auto npu_get_cnode = CreateNPUGetFloatStatusV2(kernel_graph_ptr, status_value_node);
AnfAlgo::SetStreamId(cur_stream_id, npu_get_cnode.get());
(void)new_execution_order.emplace_back(npu_get_cnode);
auto assign_add_cnode =
CreateAssignAdd(kernel_graph_ptr, npu_alloc_cnode, dynamic_loss_scale_param_list->at(cur_param));
AnfAlgo::SetStreamId(cur_stream_id, assign_add_cnode.get());
(void)new_execution_order.emplace_back(assign_add_cnode);
auto assign_status_node =
CreateAssign(kernel_graph_ptr, dynamic_loss_scale_param_list->at(cur_param), npu_get_cnode);
AnfAlgo::SetStreamId(cur_stream_id, assign_status_node.get());
(void)new_execution_order.emplace_back(assign_status_node);
(void)viewed_id.insert(cur_param);
cur_param--;
}
auto npu_clear_cnode = CreateNPUClearStatus(kernel_graph_ptr, npu_alloc_cnode);
auto npu_clear_cnode = CreateNPUClearStatusV2(kernel_graph_ptr, status_value_node);
AnfAlgo::SetStreamId(cur_stream_id, npu_clear_cnode.get());
(void)new_execution_order.emplace_back(npu_clear_cnode);
}

View File

@ -80,15 +80,15 @@ class KernelAdjust {
KernelAdjust() = default;
~KernelAdjust() = default;
CNodePtr CreateNPUGetFloatStatus(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
const CNodePtr &npu_alloc_cnode) const;
CNodePtr CreateNPUClearStatus(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
const CNodePtr &npu_alloc_cnode) const;
CNodePtr CreateNPUAllocStatus(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) const;
AnfNodePtr CreateZerosValueNode(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) const;
CNodePtr CreateNPUGetFloatStatusV2(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
const AnfNodePtr &status_value_node) const;
CNodePtr CreateNPUClearStatusV2(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
const AnfNodePtr &status_value_node) const;
CNodePtr CreateAssignAdd(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
const CNodePtr &npu_alloc_cnode, const AnfNodePtr &specify_para) const;
CNodePtr CreateAssign(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
const AnfNodePtr &specify_para) const;
CNodePtr CreateAssign(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr, const AnfNodePtr &specify_para,
const AnfNodePtr &data) const;
void ReorderGetNext(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) const;
CNodePtr CreateStreamSwitchOp(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
const std::map<std::string, mindspore::ParameterPtr> &switch_loop_input,

View File

@ -305,10 +305,11 @@ std::vector<TaskInfoPtr> HcclKernel::GenTask(const std::vector<AddressPtr> &inpu
}
std::vector<void *> global_workspace_addr;
auto overflow_memory_ptr =
device::ascend::AscendMemAdapter::GetInstance().MallocOverflowMem(anf_node_.lock()->cast<CNodePtr>());
auto overflow_memory_ptr = device::ascend::AscendMemAdapter::GetInstance().MallocOverflowMem();
MS_EXCEPTION_IF_NULL(overflow_memory_ptr);
global_workspace_addr.push_back(reinterpret_cast<void *>(overflow_memory_ptr));
MS_LOG(DEBUG) << "Assign overflow memory for node " << anf_node->fullname_with_scope() << ", addr is "
<< reinterpret_cast<void *>(overflow_memory_ptr);
HcclTaskInfoPtr hcclTaskInfo =
std::make_shared<HcclTaskInfo>(unique_name_, stream_id, hccl::HcclAdapter::GetHcclType(anf_node), input_data_addr,

View File

@ -289,6 +289,13 @@ bool DynamicTbeKernelMod::Launch(const std::vector<AddressPtr> &inputs, const st
runtimeargs.push_back(tiling_data_ptr_);
}
AddressPtr overflow_address_ptr = GetOverflowAddress();
if (overflow_address_ptr != nullptr) {
runtimeargs.emplace_back(overflow_address_ptr->addr);
MS_LOG(DEBUG) << "Assign overflow memory for node " << node->fullname_with_scope() << ", addr is "
<< overflow_address_ptr->addr;
}
rtL2Ctrl_t *l2ctrl = nullptr;
auto args_size = static_cast<uint32_t>(UlongToUint(sizeof(void *)) * runtimeargs.size());
auto node_info = cnode->fullname_with_scope();

View File

@ -58,6 +58,43 @@ bool SingleTbeJsonCreator::GenJson(const AnfNodePtr &anf_node, nlohmann::json *k
return true;
}
// Rewrites the op list json for NPUClearFloatStatusV2: empties the compute
// op's input/output descriptors and keeps only the compute json in the list.
// NOTE(review): anf_node is currently unused; kept for a uniform
// post-processing signature with the dispatcher.
void NpuClearV2PostProcessing(const AnfNodePtr &anf_node, std::vector<nlohmann::json> *op_list_json) {
  if (op_list_json->size() != 2) {
    MS_LOG(ERROR) << "Op list json's size is not equal to 2, abort post processing.";
    // Bug fix: previously execution fell through and indexed
    // (*op_list_json)[1], which is undefined behavior when the list holds
    // fewer than two entries. Now we actually abort as the message states.
    return;
  }
  auto compute_json = (*op_list_json)[1];
  std::vector<nlohmann::json> empty_vector_json;
  compute_json[kJInputDesc] = empty_vector_json;
  compute_json[kJOutputDataDesc] = empty_vector_json;
  compute_json[kJOutputDesc] = empty_vector_json;
  op_list_json->clear();
  (*op_list_json).emplace_back(compute_json);
  MS_LOG(DEBUG) << "Op list json after post processing:" << compute_json.dump();
}
// Rewrites the op list json for NPUGetFloatStatusV2: empties the compute op's
// input descriptors (outputs are kept) and keeps only the compute json.
// NOTE(review): anf_node is currently unused; kept for a uniform
// post-processing signature with the dispatcher.
void NpuGetV2PostProcessing(const AnfNodePtr &anf_node, std::vector<nlohmann::json> *op_list_json) {
  if (op_list_json->size() != 2) {
    MS_LOG(ERROR) << "Op list json's size is not equal to 2, abort post processing.";
    // Bug fix: previously execution fell through and indexed
    // (*op_list_json)[1], which is undefined behavior when the list holds
    // fewer than two entries. Now we actually abort as the message states.
    return;
  }
  auto compute_json = (*op_list_json)[1];
  std::vector<nlohmann::json> empty_vector_json;
  compute_json[kJInputDesc] = empty_vector_json;
  op_list_json->clear();
  (*op_list_json).emplace_back(compute_json);
  MS_LOG(DEBUG) << "Op list json after post processing:" << compute_json.dump();
}
// Dispatches NPU float-status V2 ops to their dedicated op-list json
// rewriters; every other op type is left untouched.
void SingleTbeJsonCreator::OpListPostProcessing(const AnfNodePtr &anf_node, std::vector<nlohmann::json> *op_list_json) {
  const auto op_type = common::AnfAlgo::GetCNodeName(anf_node);
  if (op_type == kNPUClearFloatStatusV2OpName) {
    NpuClearV2PostProcessing(anf_node, op_list_json);
    return;
  }
  if (op_type == kNPUGetFloatStatusV2OpName) {
    NpuGetV2PostProcessing(anf_node, op_list_json);
  }
}
bool SingleTbeJsonCreator::GenOpListJson(const AnfNodePtr &anf_node, std::vector<nlohmann::json> *op_list_json) {
MS_EXCEPTION_IF_NULL(anf_node);
MS_EXCEPTION_IF_NULL(op_list_json);
@ -69,6 +106,7 @@ bool SingleTbeJsonCreator::GenOpListJson(const AnfNodePtr &anf_node, std::vector
}
GenDataJson(anf_node, compute_json, op_list_json);
(*op_list_json).push_back(compute_json);
OpListPostProcessing(anf_node, op_list_json);
MS_LOG(DEBUG) << "End.";
return true;
}

View File

@ -29,6 +29,7 @@ class SingleTbeJsonCreator : public TbeJsonCreator {
protected:
bool GenOpListJson(const AnfNodePtr &anf_node, std::vector<nlohmann::json> *op_list_json);
void OpListPostProcessing(const AnfNodePtr &anf_node, std::vector<nlohmann::json> *op_list_json);
void GenDataJson(const AnfNodePtr &anf_node, const nlohmann::json &compute_json,
std::vector<nlohmann::json> *op_list_json) const;
virtual void GenInputDescJson(const AnfNodePtr &anf_node, size_t real_input_index, nlohmann::json *input_desc);

View File

@ -571,15 +571,17 @@ void TbeKernelCompileManager::Query(const std::string &type) {
std::pair<std::vector<CNodePtr>, std::vector<CNodePtr>> TbeKernelCompileManager::GenKernelMod(
const std::vector<CNodePtr> &node_list) {
MS_LOG(INFO) << "Gen kernel mod start!";
std::vector<CNodePtr> success_node;
std::vector<CNodePtr> failed_node;
std::vector<CNodePtr> success_nodes;
std::vector<CNodePtr> failed_nodes;
for (auto &node : node_list) {
MS_EXCEPTION_IF_NULL(node);
if (AnfAlgo::GetKernelMod(node) != nullptr) {
(void)success_node.emplace_back(node);
(void)success_nodes.emplace_back(node);
continue; // kernel mod already exist, continue;
}
auto op_name = common::AnfAlgo::GetCNodeName(node);
auto full_name = node->fullname_with_scope();
if (common::AnfAlgo::HasNodeAttr(kAttrOriFusionName, node)) {
full_name = common::AnfAlgo::GetNodeAttr<std::string>(node, kAttrOriFusionName);
@ -592,7 +594,7 @@ std::pair<std::vector<CNodePtr>, std::vector<CNodePtr>> TbeKernelCompileManager:
kernel_pack = bin_map->SearchInFile(json_name);
if (kernel_pack == nullptr) {
MS_LOG(INFO) << "Can not find .json file or the .o file for op:" << json_name << trace::DumpSourceLines(node);
(void)failed_node.emplace_back(node);
(void)failed_nodes.emplace_back(node);
continue;
}
}
@ -612,11 +614,17 @@ std::pair<std::vector<CNodePtr>, std::vector<CNodePtr>> TbeKernelCompileManager:
kernel_mod_ptr->SetInputSizeList(iter->second.input_size_list);
kernel_mod_ptr->SetOutputSizeList(iter->second.output_size_list);
kernel_mod_ptr->SetWorkspaceSizeList(kernel_info_json.workspaces);
if (op_name == kNPUClearFloatStatusV2OpName || op_name == kNPUGetFloatStatusV2OpName) {
constexpr size_t io_byte_size = 32;
const std::vector<size_t> size_list = {io_byte_size};
kernel_mod_ptr->SetInputSizeList(size_list);
kernel_mod_ptr->SetOutputSizeList(size_list);
}
AnfAlgo::SetKernelMod(kernel_mod_ptr, node.get());
(void)success_node.emplace_back(node);
(void)success_nodes.emplace_back(node);
}
MS_LOG(INFO) << "Gen kernel mod end!";
return std::make_pair(success_node, failed_node);
return std::make_pair(success_nodes, failed_nodes);
}
void TbeKernelCompileManager::UpdateFusionTypeAndOutputDataDesc(const std::vector<CNodePtr> &nodes) {

View File

@ -21,12 +21,12 @@
#include "utils/ms_context.h"
#include "plugin/device/ascend/hal/device/ge_runtime/task_info.h"
#include "runtime/device/kernel_runtime.h"
#include "plugin/device/ascend/hal/device/ascend_memory_adapter.h"
namespace mindspore {
namespace kernel {
using TbeTaskInfoPtr = std::shared_ptr<mindspore::ge::model_runner::TbeTaskInfo>;
using tbe::KernelManager;
using AddressPtrList = std::vector<mindspore::kernel::AddressPtr>;
bool TbeKernelMod::Launch(const std::vector<mindspore::kernel::AddressPtr> &inputs,
const std::vector<mindspore::kernel::AddressPtr> &workspace,
const std::vector<mindspore::kernel::AddressPtr> &outputs, void *stream_ptr) {
@ -60,14 +60,23 @@ bool TbeKernelMod::Launch(const std::vector<mindspore::kernel::AddressPtr> &inpu
return false;
}
auto node = anf_node_.lock();
MS_EXCEPTION_IF_NULL(node);
auto cnode = node->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(cnode);
std::vector<mindspore::kernel::AddressPtr> real_inputs;
std::vector<mindspore::kernel::AddressPtr> real_outputs;
GetRealIOAddress(cnode, inputs, outputs, &real_inputs, &real_outputs);
// pack all addresses into a vector.
std::vector<void *> runtimeargs;
(void)std::transform(std::begin(inputs), std::end(inputs), std::back_inserter(runtimeargs),
(void)std::transform(std::begin(real_inputs), std::end(real_inputs), std::back_inserter(runtimeargs),
[](const AddressPtr &input) -> void * {
MS_EXCEPTION_IF_NULL(input);
return input->addr;
});
(void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(runtimeargs),
(void)std::transform(std::begin(real_outputs), std::end(real_outputs), std::back_inserter(runtimeargs),
[](const AddressPtr &output) -> void * {
MS_EXCEPTION_IF_NULL(output);
return output->addr;
@ -79,6 +88,14 @@ bool TbeKernelMod::Launch(const std::vector<mindspore::kernel::AddressPtr> &inpu
return addr->addr;
});
}
AddressPtr overflow_address_ptr = GetOverflowAddress();
if (overflow_address_ptr != nullptr) {
runtimeargs.emplace_back(overflow_address_ptr->addr);
MS_LOG(DEBUG) << "Assign overflow memory for node " << cnode->fullname_with_scope() << ", addr is "
<< overflow_address_ptr->addr;
}
rtL2Ctrl_t *l2ctrl = nullptr;
const void *stubFunc = reinterpret_cast<void *>(func_stub);
auto argsSize = static_cast<uint32_t>(UlongToUint(sizeof(void *)) * runtimeargs.size());
@ -106,13 +123,22 @@ std::vector<TaskInfoPtr> TbeKernelMod::GenTask(const std::vector<AddressPtr> &in
std::vector<void *> output_data_addrs;
std::vector<void *> workspace_addrs;
auto node = anf_node_.lock();
MS_EXCEPTION_IF_NULL(node);
auto cnode = node->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(cnode);
std::vector<mindspore::kernel::AddressPtr> real_inputs;
std::vector<mindspore::kernel::AddressPtr> real_outputs;
GetRealIOAddress(cnode, inputs, outputs, &real_inputs, &real_outputs);
// pack all addresses into a vector.
(void)std::transform(std::begin(inputs), std::end(inputs), std::back_inserter(input_data_addrs),
(void)std::transform(std::begin(real_inputs), std::end(real_inputs), std::back_inserter(input_data_addrs),
[](const AddressPtr &input) -> void * {
MS_EXCEPTION_IF_NULL(input);
return input->addr;
});
(void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(output_data_addrs),
(void)std::transform(std::begin(real_outputs), std::end(real_outputs), std::back_inserter(output_data_addrs),
[](const AddressPtr &output) -> void * {
MS_EXCEPTION_IF_NULL(output);
return output->addr;
@ -125,6 +151,13 @@ std::vector<TaskInfoPtr> TbeKernelMod::GenTask(const std::vector<AddressPtr> &in
});
}
AddressPtr overflow_address_ptr = GetOverflowAddress();
if (overflow_address_ptr != nullptr) {
workspace_addrs.emplace_back(overflow_address_ptr->addr);
MS_LOG(DEBUG) << "Assign overflow memory for node " << cnode->fullname_with_scope() << ", addr is "
<< overflow_address_ptr->addr;
}
stream_id_ = stream_id;
auto funcstub = KernelManager::GenFuncStub(*kernel_pack_, false, &block_dim_, nullptr);
if (funcstub == 0) {
@ -146,5 +179,40 @@ vector<size_t> TbeKernelMod::GenParameters() {
auto kernel_json_info = kernel_pack_->kernel_json_info();
return kernel_json_info.parameters;
}
AddressPtr TbeKernelMod::GetOverflowAddress() {
  // Returns the shared overflow-status buffer for this kernel when it was
  // compiled with overflow detection enabled (global_workspace.is_overflow),
  // otherwise nullptr. The buffer is device memory obtained from the Ascend
  // memory adapter; the returned Address does not own it.
  AddressPtr overflow_address_ptr = nullptr;
  // Fix: the original dereferenced kernel_pack_.get() without a null check;
  // use the smart pointer directly and fail loudly if it is absent.
  MS_EXCEPTION_IF_NULL(kernel_pack_);
  const auto &global_workspace = kernel_pack_->kernel_json_info().global_workspace;
  if (global_workspace.is_overflow) {
    // The NPU overflow-status area is a fixed 32-byte block.
    constexpr size_t kOverflowStatusSize = 32;
    auto overflow_memory_ptr = device::ascend::AscendMemAdapter::GetInstance().MallocOverflowMem();
    MS_EXCEPTION_IF_NULL(overflow_memory_ptr);
    overflow_address_ptr = std::make_shared<kernel::Address>();
    overflow_address_ptr->addr = reinterpret_cast<void *>(overflow_memory_ptr);
    overflow_address_ptr->size = kOverflowStatusSize;
  }
  return overflow_address_ptr;
}
void TbeKernelMod::GetRealIOAddress(const AnfNodePtr &cnode, const vector<AddressPtr> &inputs,
                                    const vector<AddressPtr> &outputs,
                                    vector<mindspore::kernel::AddressPtr> *real_inputs,
                                    vector<mindspore::kernel::AddressPtr> *real_outputs) const {
  // Copy the launch addresses, then drop the ones the V2 overflow-check ops
  // do not actually consume at kernel level (their IO exists only in the graph).
  MS_EXCEPTION_IF_NULL(real_inputs);
  MS_EXCEPTION_IF_NULL(real_outputs);
  real_inputs->assign(inputs.begin(), inputs.end());
  real_outputs->assign(outputs.begin(), outputs.end());
  const auto op_name = common::AnfAlgo::GetCNodeName(cnode);
  if (op_name == kNPUGetFloatStatusV2OpName) {
    // NPUGetFloatStatusV2 has no input
    real_inputs->clear();
    MS_LOG(INFO) << "Clear Node " << cnode->fullname_with_scope() << "'s inputs";
  } else if (op_name == kNPUClearFloatStatusV2OpName) {
    // NPUClearFloatStatusV2 has no input output.
    real_inputs->clear();
    real_outputs->clear();
    MS_LOG(INFO) << "Clear Node " << cnode->fullname_with_scope() << "'s inputs and outputs";
  }
}
} // namespace kernel
} // namespace mindspore

View File

@ -42,6 +42,10 @@ class TbeKernelMod : public AscendKernelMod {
std::vector<TaskInfoPtr> GenTask(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspaces,
const std::vector<AddressPtr> &outputs, uint32_t stream_id) override;
std::vector<size_t> GenParameters() override;
AddressPtr GetOverflowAddress();
void GetRealIOAddress(const AnfNodePtr &cnode, const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &outputs, std::vector<AddressPtr> *real_inputs,
std::vector<AddressPtr> *real_outputs) const;
protected:
KernelPackPtr kernel_pack_;

View File

@ -208,6 +208,8 @@ nlohmann::json TbeUtils::GenSocInfo() {
soc_info_json["op_debug_config"] = GetOpDebugConfig();
soc_info_json["autoTilingMode"] = context_ptr->get_param<std::string>(MS_CTX_TUNE_MODE);
soc_info_json["deviceId"] = std::to_string(context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID));
soc_info_json["status_check"] = "true";
std::string config_path;
if (!Common::CommonFuncForConfigPath("", common::GetEnv("OP_BANK_PATH"), &config_path)) {
MS_LOG(EXCEPTION) << "Invalid environment variable 'OP_BANK_PATH', the path is " << common::GetEnv("OP_BANK_PATH")

View File

@ -1608,6 +1608,8 @@ GVAR_DEF(PrimitivePtr, kPrimPush, std::make_shared<Primitive>("Push"));
GVAR_DEF(PrimitivePtr, kPrimNPUGetFloatStatus, std::make_shared<Primitive>("NPUGetFloatStatus"));
GVAR_DEF(PrimitivePtr, kPrimNPUAllocFloatStatus, std::make_shared<Primitive>("NPUAllocFloatStatus"));
GVAR_DEF(PrimitivePtr, kPrimNPUClearFloatStatus, std::make_shared<Primitive>("NPUClearFloatStatus"));
GVAR_DEF(PrimitivePtr, kPrimNPUGetFloatStatusV2, std::make_shared<Primitive>("NPUGetFloatStatusV2"));
GVAR_DEF(PrimitivePtr, kPrimNPUClearFloatStatusV2, std::make_shared<Primitive>("NPUClearFloatStatusV2"));
GVAR_DEF(PrimitivePtr, kPrimPyFunc, std::make_shared<Primitive>("PyFunc"));
GVAR_DEF(PrimitivePtr, kPrimDynamicLossScale, std::make_shared<Primitive>("_DynamicLossScale"));
GVAR_DEF(PrimitivePtr, kPrimScaleGrad, std::make_shared<Primitive>("ScaleGrad"));

View File

@ -0,0 +1,99 @@
/**
* Copyright 2023 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <map>
#include <set>
#include <string>
#include "ops/npu_clear_float_status_v2.h"
#include "ops/op_utils.h"
#include "abstract/param_validator.h"
#include "utils/check_convert_utils.h"
#include "abstract/ops/primitive_infer_map.h"
#include "mindapi/src/helper.h"
namespace mindspore {
namespace ops {
namespace {
abstract::ShapePtr NPUClearFloatStatusV2InferShape(const PrimitivePtr &,
                                                   const std::vector<AbstractBasePtr> &input_args) {
  // Validates that the status tensor is 1-D with exactly 8 elements and
  // returns that same (8,) shape. Dynamic rank/shape inputs pass through
  // as unknown shapes of the appropriate rank.
  const auto input_shape = CheckAndConvertUtils::ConvertShapePtrToShapeMap(input_args[0]->BuildShape())[kShape];
  if (IsDynamicRank(input_shape)) {
    // dynamic rank
    return std::make_shared<abstract::Shape>(ShapeVector{abstract::Shape::kShapeRankAny});
  }
  if (IsDynamic(input_shape)) {
    // dynamic shape: keep the rank, mark every dimension as unknown
    const ShapeVector out_shape_dyn(input_shape.size(), abstract::Shape::kShapeDimAny);
    return std::make_shared<abstract::Shape>(out_shape_dyn);
  }
  constexpr int64_t normal_shape_size = 1;
  constexpr int64_t normal_shape_len = 8;
  if (input_shape.size() != normal_shape_size) {
    MS_EXCEPTION(ValueError) << "Input_x must be a 1-dimensional tensor, but got " << std::to_string(input_shape.size())
                             << "-dimensional tensor.";
  }
  if (input_shape[0] != normal_shape_len) {
    MS_EXCEPTION(ValueError) << "The first dimension of input_x must be 8, but got " << std::to_string(input_shape[0]);
  }
  return std::make_shared<abstract::Shape>(ShapeVector{normal_shape_len});
}
TypePtr NPUClearFloatStatusV2InferType(const PrimitivePtr &primitive, const std::vector<AbstractBasePtr> &input_args) {
  // The status tensor must be int32; the output dtype is always int32.
  const std::set<TypePtr> valid_types = {kInt32};
  std::map<std::string, TypePtr> types{{"input_x", input_args[0]->BuildType()}};
  (void)CheckAndConvertUtils::CheckTensorTypeSame(types, valid_types, primitive->name());
  return kInt32;
}
} // namespace
MIND_API_OPERATOR_IMPL(NPUClearFloatStatusV2, BaseOperator);
// Full infer entry for NPUClearFloatStatusV2: checks the argument count,
// then combines the shape and type inference results into one abstract.
AbstractBasePtr NPUClearFloatStatusV2Infer(const abstract::AnalysisEnginePtr &, const PrimitivePtr &primitive,
                                           const std::vector<AbstractBasePtr> &input_args) {
  MS_EXCEPTION_IF_NULL(primitive);
  // The op takes exactly one tensor argument (the status buffer).
  const int64_t input_num = 1;
  CheckAndConvertUtils::CheckInputArgs(input_args, kEqual, input_num, primitive->name());
  auto infer_type = NPUClearFloatStatusV2InferType(primitive, input_args);
  auto infer_shape = NPUClearFloatStatusV2InferShape(primitive, input_args);
  return abstract::MakeAbstract(infer_shape, infer_type);
}
// AG means auto generated
// OpInferBase adapter that exposes the NPUClearFloatStatusV2 shape/type
// inference functions above to the primitive infer registry.
class MIND_API AGNPUClearFloatStatusV2Infer : public abstract::OpInferBase {
 public:
  BaseShapePtr InferShape(const PrimitivePtr &primitive,
                          const std::vector<AbstractBasePtr> &input_args) const override {
    return NPUClearFloatStatusV2InferShape(primitive, input_args);
  }
  TypePtr InferType(const PrimitivePtr &primitive, const std::vector<AbstractBasePtr> &input_args) const override {
    return NPUClearFloatStatusV2InferType(primitive, input_args);
  }
  AbstractBasePtr InferShapeAndType(const abstract::AnalysisEnginePtr &engine, const PrimitivePtr &primitive,
                                    const std::vector<AbstractBasePtr> &input_args) const override {
    return NPUClearFloatStatusV2Infer(engine, primitive, input_args);
  }
};
REGISTER_PRIMITIVE_OP_INFER_IMPL(NPUClearFloatStatusV2, prim::kPrimNPUClearFloatStatusV2, AGNPUClearFloatStatusV2Infer,
false);
} // namespace ops
} // namespace mindspore

View File

@ -0,0 +1,39 @@
/**
* Copyright 2023 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CORE_OPS_NPU_CLEAR_FLOAT_STATUS_V2_H_
#define MINDSPORE_CORE_OPS_NPU_CLEAR_FLOAT_STATUS_V2_H_
#include <vector>
#include <memory>
#include "ops/base_operator.h"
#include "mindapi/base/types.h"
namespace mindspore {
namespace ops {
constexpr auto kNameNPUClearFloatStatusV2 = "NPUClearFloatStatusV2";
/// \brief NPUClearFloatStatusV2 operator. Part of the V2 overflow-check flow
/// on Ascend; see the matching infer implementation and the Python primitive
/// of the same name for the user-facing contract.
class MIND_API NPUClearFloatStatusV2 : public BaseOperator {
 public:
  MIND_API_BASE_MEMBER(NPUClearFloatStatusV2);
  /// \brief Constructor. Registers IO names: input "addr", output "data".
  NPUClearFloatStatusV2() : BaseOperator(kNameNPUClearFloatStatusV2) { InitIOName({"addr"}, {"data"}); }
  /// \brief Init. The operator has no attributes to initialize.
  void Init() const {}
};
MIND_API abstract::AbstractBasePtr NPUClearFloatStatusV2Infer(const abstract::AnalysisEnginePtr &,
const PrimitivePtr &primitive,
const std::vector<abstract::AbstractBasePtr> &input_args);
} // namespace ops
} // namespace mindspore
#endif // MINDSPORE_CORE_OPS_NPU_CLEAR_FLOAT_STATUS_V2_H_

View File

@ -0,0 +1,99 @@
/**
* Copyright 2023 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <map>
#include <set>
#include <string>
#include "ops/npu_get_float_status_v2.h"
#include "ops/op_utils.h"
#include "abstract/param_validator.h"
#include "utils/check_convert_utils.h"
#include "abstract/ops/primitive_infer_map.h"
#include "mindapi/src/helper.h"
namespace mindspore {
namespace ops {
namespace {
// Infers the output shape of NPUGetFloatStatusV2.
// Both input and output are expected to be 1-D int32 tensors of length 8
// (the status buffer); dynamic rank/shape inputs pass through as unknown.
abstract::ShapePtr NPUGetFloatStatusV2InferShape(const PrimitivePtr &, const std::vector<AbstractBasePtr> &input_args) {
  auto input_shape = CheckAndConvertUtils::ConvertShapePtrToShapeMap(input_args[0]->BuildShape())[kShape];
  // dynamic rank
  if (IsDynamicRank(input_shape)) {
    return std::make_shared<abstract::Shape>(ShapeVector{abstract::Shape::kShapeRankAny});
  }
  // dynamic shape: keep the rank, mark every dimension as unknown
  if (IsDynamic(input_shape)) {
    ShapeVector out_shape_dyn;
    for (size_t i = 0; i < input_shape.size(); ++i) {
      out_shape_dyn.push_back(abstract::Shape::kShapeDimAny);
    }
    return std::make_shared<abstract::Shape>(out_shape_dyn);
  }
  // static shape: must be exactly (8,)
  const int64_t normal_shape_size = 1;
  const int64_t normal_shape_len = 8;
  if (input_shape.size() != normal_shape_size) {
    MS_EXCEPTION(ValueError) << "Input_x must be a 1-dimensional tensor, but got " << std::to_string(input_shape.size())
                             << "-dimensional tensor.";
  }
  if (input_shape[0] != normal_shape_len) {
    MS_EXCEPTION(ValueError) << "The first dimension of input_x must be 8, but got " << std::to_string(input_shape[0]);
  }
  std::vector<int64_t> output_shape = {normal_shape_len};
  return std::make_shared<abstract::Shape>(output_shape);
}
// Infers the output dtype of NPUGetFloatStatusV2: the input must be int32
// and the output dtype is always int32.
TypePtr NPUGetFloatStatusV2InferType(const PrimitivePtr &primitive, const std::vector<AbstractBasePtr> &input_args) {
  std::map<std::string, TypePtr> types;
  std::set<TypePtr> valid_types = {kInt32};
  TypePtr input_x_type = input_args[0]->BuildType();
  (void)types.emplace("input_x", input_x_type);
  (void)CheckAndConvertUtils::CheckTensorTypeSame(types, valid_types, primitive->name());
  return kInt32;
}
} // namespace
MIND_API_OPERATOR_IMPL(NPUGetFloatStatusV2, BaseOperator);
// Full infer entry for NPUGetFloatStatusV2: checks the argument count, then
// combines the shape and type inference results into one abstract.
AbstractBasePtr NPUGetFloatStatusV2Infer(const abstract::AnalysisEnginePtr &, const PrimitivePtr &primitive,
                                         const std::vector<AbstractBasePtr> &input_args) {
  MS_EXCEPTION_IF_NULL(primitive);
  // The op takes exactly one tensor argument (the status buffer).
  const int64_t input_num = 1;
  CheckAndConvertUtils::CheckInputArgs(input_args, kEqual, input_num, primitive->name());
  auto infer_type = NPUGetFloatStatusV2InferType(primitive, input_args);
  auto infer_shape = NPUGetFloatStatusV2InferShape(primitive, input_args);
  return abstract::MakeAbstract(infer_shape, infer_type);
}
// AG means auto generated
// OpInferBase adapter that exposes the NPUGetFloatStatusV2 shape/type
// inference functions above to the primitive infer registry.
class MIND_API AGNPUGetFloatStatusV2Infer : public abstract::OpInferBase {
 public:
  BaseShapePtr InferShape(const PrimitivePtr &primitive,
                          const std::vector<AbstractBasePtr> &input_args) const override {
    return NPUGetFloatStatusV2InferShape(primitive, input_args);
  }
  TypePtr InferType(const PrimitivePtr &primitive, const std::vector<AbstractBasePtr> &input_args) const override {
    return NPUGetFloatStatusV2InferType(primitive, input_args);
  }
  AbstractBasePtr InferShapeAndType(const abstract::AnalysisEnginePtr &engine, const PrimitivePtr &primitive,
                                    const std::vector<AbstractBasePtr> &input_args) const override {
    return NPUGetFloatStatusV2Infer(engine, primitive, input_args);
  }
};
REGISTER_PRIMITIVE_OP_INFER_IMPL(NPUGetFloatStatusV2, prim::kPrimNPUGetFloatStatusV2, AGNPUGetFloatStatusV2Infer,
false);
} // namespace ops
} // namespace mindspore

View File

@ -0,0 +1,39 @@
/**
* Copyright 2023 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CORE_OPS_NPU_GET_FLOAT_STATUS_V2_H_
#define MINDSPORE_CORE_OPS_NPU_GET_FLOAT_STATUS_V2_H_
#include <vector>
#include <memory>
#include "ops/base_operator.h"
#include "mindapi/base/types.h"
namespace mindspore {
namespace ops {
constexpr auto kNameNPUGetFloatStatusV2 = "NPUGetFloatStatusV2";
/// \brief NPUGetFloatStatusV2 operator. Part of the V2 overflow-check flow
/// on Ascend; see the matching infer implementation and the Python primitive
/// of the same name for the user-facing contract.
class MIND_API NPUGetFloatStatusV2 : public BaseOperator {
 public:
  MIND_API_BASE_MEMBER(NPUGetFloatStatusV2);
  /// \brief Constructor. Registers IO names: input "addr", output "data".
  NPUGetFloatStatusV2() : BaseOperator(kNameNPUGetFloatStatusV2) { InitIOName({"addr"}, {"data"}); }
  /// \brief Init. The operator has no attributes to initialize.
  void Init() const {}
};
MIND_API abstract::AbstractBasePtr NPUGetFloatStatusV2Infer(const abstract::AnalysisEnginePtr &,
const PrimitivePtr &primitive,
const std::vector<abstract::AbstractBasePtr> &input_args);
} // namespace ops
} // namespace mindspore
#endif // MINDSPORE_CORE_OPS_NPU_GET_FLOAT_STATUS_V2_H_

View File

@ -298,7 +298,7 @@ def get_options_info(job_content):
options["op_impl_mode_list"] = job_content["SocInfo"]["op_impl_mode_list"]
options["kernel_meta_temp_dir"] = job_content["SocInfo"]["kernel_meta_temp_dir"]
options["deterministic"] = job_content["SocInfo"]["deterministic"]
options["status_check"] = "false"
options["status_check"] = job_content["SocInfo"]["status_check"]
return options

View File

@ -1,4 +1,4 @@
# Copyright 2020 Huawei Technologies Co., Ltd
# Copyright 2023 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -16,7 +16,8 @@
from __future__ import absolute_import
from abc import ABC, abstractmethod
from mindspore.ops._primitive_cache import _get_cache_prim
from mindspore.ops.operations.math_ops import NPUGetFloatStatusV2, NPUClearFloatStatusV2
from ._checkparam import Validator as validator
from .common import dtype as mstype
from . import context
@ -58,34 +59,7 @@ def _overflow(inputs):
return 1 - status.all()
def init_status():
r"""
Returns a Tensor indicating initialized status for overflow detection.
Note:
Only Ascend need status to capture overflow status, you can also call
this function on GPU or CPU, but the return value is useless.
Returns:
Tensor, has the shape of `(8,)`.
Supported Platforms:
``Ascend`` ``GPU`` ``CPU``
Examples:
>>> status = amp.init_status()
"""
if _ascend_target():
status = ops.NPUAllocFloatStatus()()
clear_status = ops.NPUClearFloatStatus()(status)
status = ops.depend(status, clear_status)
else:
status = Tensor([0, 0, 0, 0, 0, 0, 0, 0], mstype.float32)
return status
def all_finite(inputs, status=None):
def all_finite(inputs):
r"""
Returns a scalar Tensor indicating whether the inputs are finite.
@ -98,8 +72,6 @@ def all_finite(inputs, status=None):
Args:
inputs (Union(tuple(Tensor), list(Tensor))): a iterable Tensor.
status (Tensor): the status Tensor for overflow detection, only required on
Ascend. Default: None.
Returns:
Tensor, a scalar Tensor and the dtype is bool.
@ -112,13 +84,13 @@ def all_finite(inputs, status=None):
>>> output = amp.all_finite(x)
"""
if _ascend_target():
if status is None:
raise ValueError("The status must be initialized on Ascend, but get 'None'.")
status = Tensor([0] * 8, mstype.int32)
status = ops.depend(status, inputs)
get_status = ops.NPUGetFloatStatus()(status)
get_status = _get_cache_prim(NPUGetFloatStatusV2)()(status)
status = ops.depend(status, get_status)
status_finite = status.sum() == 0
_ = ops.NPUClearFloatStatus()(status)
clear_status = _get_cache_prim(NPUClearFloatStatusV2)()(status)
get_status = ops.depend(get_status, clear_status)
status_finite = get_status.equal(Tensor(0, mstype.int32)).all()
return status_finite
outputs = _hypermap(_partial(_overflow), inputs)
flag_sum = ops.addn(outputs).reshape(())
@ -329,5 +301,5 @@ class DynamicLossScaler(LossScaler):
__all__ = [
"DynamicLossScaleManager", "LossScaleManager", "FixedLossScaleManager",
"build_train_network", "DynamicLossScaler", "StaticLossScaler", "LossScaler",
"auto_mixed_precision", "init_status", "all_finite"
"auto_mixed_precision", "all_finite"
]

View File

@ -27,6 +27,7 @@ from mindspore.common import Tensor
from mindspore.common.sparse_tensor import RowTensorInner
from mindspore.common.parameter import Parameter, ParameterTuple
from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
from mindspore.ops.operations.math_ops import NPUGetFloatStatusV2, NPUClearFloatStatusV2
from mindspore.ops import functional as F
from mindspore.ops import composite as C
from mindspore.ops import operations as P
@ -460,6 +461,9 @@ class BoostTrainOneStepWithLossScaleCell(BoostTrainOneStepCell):
self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
self.gpu_target = (context.get_context("device_target") == "GPU")
self.loss_scaling_manager = None
self.base0 = Tensor(0, mstype.int32)
self.reduce_all = P.ReduceAll(keep_dims=False)
self.equal = P.Equal()
if self.auto_boost.boost_config.get("loss_scale_group", False):
self.enable_enhanced_amp = True
@ -535,12 +539,13 @@ class BoostTrainOneStepWithLossScaleCell(BoostTrainOneStepCell):
bool, overflow value.
float, update ratio.
"""
flag_sum = self.reduce_sum(param, (0,))
flag_sum = self.equal(self.base0, param)
if self.reducer_flag:
flag_reduce = self.allreduce(flag_sum)
overflow = self.less_equal(self.base, flag_reduce)
overflow = not self.reduce_all(flag_reduce)
else:
overflow = self.less_equal(self.base, flag_sum)
overflow = not self.reduce_all(flag_sum)
if overflow:
update_ratio = self.reduce_ratio
else:
@ -609,13 +614,11 @@ class BoostTrainOneStepWithLossScaleCell(BoostTrainOneStepCell):
The second value is the same as the input of `compute_input`, but contains some information about the
execution order.
"""
status = False
status = Tensor([0] * 8, mstype.int32)
if not self.gpu_target:
# init overflow buffer
status = P.NPUAllocFloatStatus()()
status = F.depend(status, pre_cond)
# clear overflow buffer
clear_status = P.NPUClearFloatStatus()(status)
clear_status = NPUClearFloatStatusV2()(status)
compute_input = F.depend(compute_input, clear_status)
return status, compute_input
@ -636,22 +639,36 @@ class BoostTrainOneStepWithLossScaleCell(BoostTrainOneStepCell):
"""
if not self.gpu_target:
status = F.depend(status, compute_output)
get_status = P.NPUGetFloatStatus()(status)
status = F.depend(status, get_status)
# sum overflow buffer elements, 0:not overflow , >0:overflow
flag_sum = self.reduce_sum(status, (0,))
get_status = NPUGetFloatStatusV2()(status)
if self.is_distributed:
# sum overflow flag over devices
flag_reduce = self.allreduce(get_status)
# get_status not equal to [0]*8 means overflow
flag = self.equal(self.base0, flag_reduce)
status = F.depend(status, flag)
clear_status = NPUClearFloatStatusV2()(status)
flag = F.depend(flag, clear_status)
overall_finite = self.reduce_all(flag)
else:
status = F.depend(status, get_status)
clear_status = NPUClearFloatStatusV2()(status)
get_status = F.depend(get_status, clear_status)
flag = self.equal(self.base0, get_status)
overall_finite = self.reduce_all(flag)
overflow = not overall_finite
else:
flag_sum = self.hyper_map(F.partial(_grad_overflow), compute_output)
flag_sum = P.AddN()(flag_sum)
# convert flag_sum to scalar
flag_sum = P.Reshape()(flag_sum, (()))
if self.is_distributed:
# sum overflow flag over devices
flag_reduce = self.allreduce(flag_sum)
overflow = self.less_equal(self.base, flag_reduce)
else:
overflow = self.less_equal(self.base, flag_sum)
if self.is_distributed:
# sum overflow flag over devices
flag_reduce = self.allreduce(flag_sum)
overflow = self.less_equal(self.base, flag_reduce)
else:
overflow = self.less_equal(self.base, flag_sum)
return overflow
def _process_loss_scale(self, overflow):
@ -688,7 +705,7 @@ class BoostTrainOneStepWithLossScaleCell(BoostTrainOneStepCell):
self.optimizer_loss_scale = [self.parent.count(x) for x in parent_set]
self.reduce_ratio = Tensor(1.0 / (2 ** 0.5), mstype.float32)
self.growth_ratio = Tensor(2 ** (1.0 / 1000.0), mstype.float32)
self.overflow_status_list = ParameterTuple(Parameter(Tensor(np.zeros(shape=[8]), mstype.float32),
self.overflow_status_list = ParameterTuple(Parameter(Tensor(np.zeros(shape=[8]), mstype.int32),
name='mix_layer_status_{}'.format(x), requires_grad=False)
for x in range(loss_scale_number))
self.loss_scaling_manager.set_loss_scale_status(loss_scale_number, self.loss_scaling_manager.get_loss_scale())

View File

@ -23,6 +23,7 @@ from mindspore.nn.cell import Cell
from mindspore.common import Tensor
from mindspore.common.sparse_tensor import RowTensorInner
from mindspore.common.parameter import Parameter
from mindspore.ops.operations.math_ops import NPUGetFloatStatusV2, NPUClearFloatStatusV2
from mindspore.ops import functional as F
from mindspore.ops import composite as C
from mindspore.ops import operations as P
@ -309,8 +310,11 @@ class TrainOneStepWithLossScaleCell(TrainOneStepCell):
super(TrainOneStepWithLossScaleCell, self).__init__(network, optimizer, sens=None)
self.hyper_map = C.HyperMap()
self.base = Tensor(1, mstype.float32)
self.base0 = Tensor(0, mstype.int32)
self.reduce_sum = P.ReduceSum(keep_dims=False)
self.reduce_all = P.ReduceAll(keep_dims=False)
self.less_equal = P.LessEqual()
self.equal = P.Equal()
self.allreduce = P.AllReduce()
self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
self.gpu_target = (context.get_context("device_target") == "GPU")
@ -390,13 +394,11 @@ class TrainOneStepWithLossScaleCell(TrainOneStepCell):
The second value is the same as the input of `compute_input`, but contains some information about the
execution order.
"""
status = False
status = Tensor([0] * 8, mstype.int32)
if not self.gpu_target:
# init overflow buffer
status = P.NPUAllocFloatStatus()()
status = F.depend(status, pre_cond)
# clear overflow buffer
clear_status = P.NPUClearFloatStatus()(status)
clear_status = NPUClearFloatStatusV2()(status)
compute_input = F.depend(compute_input, clear_status)
return status, compute_input
@ -419,22 +421,36 @@ class TrainOneStepWithLossScaleCell(TrainOneStepCell):
"""
if not self.gpu_target:
status = F.depend(status, compute_output)
get_status = P.NPUGetFloatStatus()(status)
status = F.depend(status, get_status)
# sum overflow buffer elements, 0:not overflow , >0:overflow
flag_sum = self.reduce_sum(status, (0,))
get_status = NPUGetFloatStatusV2()(status)
if self.is_distributed:
# sum overflow flag over devices
flag_reduce = self.allreduce(get_status)
# get_status not equal to [0]*8 means overflow
flag = self.equal(self.base0, flag_reduce)
status = F.depend(status, flag)
clear_status = NPUClearFloatStatusV2()(status)
flag = F.depend(flag, clear_status)
overall_finite = self.reduce_all(flag)
else:
status = F.depend(status, get_status)
clear_status = NPUClearFloatStatusV2()(status)
get_status = F.depend(get_status, clear_status)
flag = self.equal(self.base0, get_status)
overall_finite = self.reduce_all(flag)
overflow = not overall_finite
else:
flag_sum = self.hyper_map(F.partial(_grad_overflow), compute_output)
flag_sum = P.AddN()(flag_sum)
# convert flag_sum to scalar
flag_sum = P.Reshape()(flag_sum, (()))
if self.is_distributed:
# sum overflow flag over devices
flag_reduce = self.allreduce(flag_sum)
overflow = self.less_equal(self.base, flag_reduce)
else:
overflow = self.less_equal(self.base, flag_sum)
if self.is_distributed:
# sum overflow flag over devices
flag_reduce = self.allreduce(flag_sum)
overflow = self.less_equal(self.base, flag_reduce)
else:
overflow = self.less_equal(self.base, flag_sum)
return overflow
def process_loss_scale(self, overflow):

View File

@ -37,3 +37,5 @@ from .scatter_nd_d import _scatter_nd_d_tbe # in python no check supported
from .assign_add_ds import _assign_add_ds_tbe # "Frac_nz in pangu not support"
from .atomic_addr_clean import _atomic_addr_clean_tbe # need to clean addr larger than 2G, int32 is not enough
from .assign import _assign_tbe # Different formats of assign inputs cause memory to increase
from .npu_clear_float_status_v2 import _npu_clear_float_status_v2_tbe # io mismatch
from .npu_get_float_status_v2 import _npu_get_float_status_v2_tbe # io mismatch

View File

@ -0,0 +1,35 @@
# Copyright 2023 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""NPUClearFloatStatusV2 op"""
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
# Operator information used to register the NPUClearFloatStatusV2 TBE kernel.
# The op takes one int32 input ("addr") and yields one int32 output ("data");
# per the primitive's docstring both tensors are meaningless placeholders whose
# only job is to give the scheduler explicit data dependencies — the real
# overflow flag lives in a fixed device register on Ascend.
npu_clear_float_status_v2_op_info = TBERegOp("NPUClearFloatStatusV2") \
    .fusion_type("OPAQUE") \
    .async_flag(False) \
    .binfile_name("n_p_u_clear_float_status_v2.so") \
    .compute_cost(10) \
    .kernel_name("n_p_u_clear_float_status_v2") \
    .partial_flag(True) \
    .input(0, "addr", False, "required", "all") \
    .output(0, "data", False, "required", "all") \
    .dtype_format(DataType.I32_Default, DataType.I32_Default) \
    .get_op_info()


@op_info_register(npu_clear_float_status_v2_op_info)
def _npu_clear_float_status_v2_tbe():
    """NPUClearFloatStatusV2 TBE register.

    Registration happens as a side effect of the decorator; the function body
    is intentionally empty.
    """
    return

View File

@ -0,0 +1,35 @@
# Copyright 2023 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""NPUGetFloatStatusV2 op"""
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
# Operator information used to register the NPUGetFloatStatusV2 TBE kernel.
# Mirrors the clear-op registration but is marked ELEMWISE rather than OPAQUE
# (fusion category differs between the two kernels). Input "addr" and output
# "data" are int32 placeholder tensors; the overflow flag itself is read from
# a fixed device register on Ascend.
npu_get_float_status_v2_op_info = TBERegOp("NPUGetFloatStatusV2") \
    .fusion_type("ELEMWISE") \
    .async_flag(False) \
    .binfile_name("n_p_u_get_float_status_v2.so") \
    .compute_cost(10) \
    .kernel_name("n_p_u_get_float_status_v2") \
    .partial_flag(True) \
    .input(0, "addr", False, "required", "all") \
    .output(0, "data", False, "required", "all") \
    .dtype_format(DataType.I32_Default, DataType.I32_Default) \
    .get_op_info()


@op_info_register(npu_get_float_status_v2_op_info)
def _npu_get_float_status_v2_tbe():
    """NPUGetFloatStatusV2 TBE register.

    Registration happens as a side effect of the decorator; the function body
    is intentionally empty.
    """
    return

View File

@ -20,6 +20,7 @@ from __future__ import division
import numpy as np
from mindspore import context
from mindspore import log as logger
from mindspore.ops import signature as sig
from mindspore._checkparam import Validator as validator
from mindspore._checkparam import Rel
@ -4339,6 +4340,7 @@ class NPUAllocFloatStatus(Primitive):
@prim_attr_register
def __init__(self):
    """Initialize NPUAllocFloatStatus"""
    # The V1 float-status primitives are being phased out in favour of
    # NPUGetFloatStatusV2 / NPUClearFloatStatusV2; warn at construction time
    # so callers migrate before removal.
    logger.warning("The 'NPUAllocFloatStatus' operator will be deprecated in the future. Please don't use it.")
class NPUGetFloatStatus(Primitive):
@ -4408,6 +4410,7 @@ class NPUGetFloatStatus(Primitive):
@prim_attr_register
def __init__(self):
    """Initialize NPUGetFloatStatus"""
    # Deprecated V1 op — NPUGetFloatStatusV2 is the replacement; warn at
    # construction time so callers migrate before removal.
    logger.warning("The 'NPUGetFloatStatus' operator will be deprecated in the future. Please don't use it.")
class NPUClearFloatStatus(Primitive):
@ -4471,6 +4474,173 @@ class NPUClearFloatStatus(Primitive):
@prim_attr_register
def __init__(self):
    """Initialize NPUClearFloatStatus"""
    # Deprecated V1 op — NPUClearFloatStatusV2 is the replacement; warn at
    # construction time so callers migrate before removal.
    logger.warning("The 'NPUClearFloatStatus' operator will be deprecated in the future. Please don't use it.")
class NPUGetFloatStatusV2(Primitive):
    """
    Get the flag for storage overflow status. This flag is located in a register at a
    fixed address on the `Ascend` device, and overflow information is automatically
    written to this register.
    The flag is a one-dimensional Tensor with shape :math:`(8,)` and data type `mindspore.dtype.int32`.
    If the value of flag is zero, no overflow has occurred, otherwise, overflow.
    When performing overflow detection on the network, you should first call `NPUClearFloatStatusV2` to
    reset the register before the detection, and then call `NPUGetFloatStatusV2` to get the register
    status after the network execution is completed.

    Note:
        - In order to avoid mis-optimization by the compiler, additional input is added to
          this operator. The input is defined as a shape of :math:`(8,)` and data type of
          `mindspore.dtype.int32` Tensor, meaningless.
        - Since this op lacks contextual dependencies with parameters in the network,
          :class:`mindspore.ops.Depend` needs to be used to ensure order of execution.

    Inputs:
        Tensor, an additional input created to avoid compiler optimization, is specified as shape :math:`(8,)`,
        data type is `mindspore.dtype.int32`, and has no actual meaning.
        Usually use the output of `NPUClearFloatStatusV2`.

    Outputs:
        Tensor, shape and data type are the same as input. If all are zero, it means no overflow, otherwise, overflow.

    Raises:
        TypeError: If `x` is not a Tensor.
        TypeError: If dtype of `x` is not int32.
        ValueError: If shape of `x` is not equal to :math:`(8,)`.

    Supported Platforms:
        ``Ascend``

    Examples:
        >>> import mindspore as ms
        >>> import numpy as np
        >>> from mindspore import ops, nn, Tensor
        >>> from mindspore.ops.operations.math_ops import NPUGetFloatStatusV2, NPUClearFloatStatusV2
        >>> class Net(nn.Cell):
        ...     def __init__(self):
        ...         super().__init__()
        ...         self.clear_status = NPUClearFloatStatusV2()
        ...         self.get_status = NPUGetFloatStatusV2()
        ...         self.sub = ops.Sub()
        ...         self.neg = ops.Neg()
        ...         self.equal = ops.Equal()
        ...         self.reduce_all = ops.ReduceAll(keep_dims=False)
        ...         self.base = Tensor([0], dtype=ms.int32)
        ...
        ...     def construct(self, x):
        ...         init = Tensor([0]*8, dtype=ms.int32)
        ...         clear_status = self.clear_status(init)
        ...         x = ops.depend(x, clear_status)
        ...         res = self.sub(x, self.neg(x))
        ...         init = ops.depend(init, res)
        ...         get_status = self.get_status(init)
        ...         flag = self.equal(self.base, get_status)
        ...         overall_finite = self.reduce_all(flag)
        ...         overflow = not overall_finite
        ...         return overflow
        ...
        >>> value = 65504
        >>> data = np.full((2, 3), value, dtype=np.float16)
        >>> x = Tensor(data, dtype=ms.float16)
        >>> net = Net()
        >>> res = net(x)
        >>> print(res)
        True
        >>> value = 10
        >>> data = np.full((2, 3), value, dtype=np.float16)
        >>> x = Tensor(data, dtype=ms.float16)
        >>> net = Net()
        >>> res = net(x)
        >>> print(res)
        False
    """

    @prim_attr_register
    def __init__(self):
        """Initialize NPUGetFloatStatusV2"""
class NPUClearFloatStatusV2(Primitive):
    """
    Clear the flag for storage overflow status. This flag is located in a register at a
    fixed address on the `Ascend` device, and overflow information is automatically
    written to this register.
    The flag is a one-dimensional Tensor with shape :math:`(8,)` and data type `mindspore.dtype.int32`.
    If the value of flag is zero, no overflow has occurred, otherwise, overflow.
    When performing overflow detection on the network, you should first call `NPUClearFloatStatusV2` to
    reset the register before the detection, and then call `NPUGetFloatStatusV2` to get the register
    status after the network execution is completed.

    Note:
        - In order to avoid mis-optimization by the compiler, additional input and output are added to
          this operator. The input and output are defined as a shape of :math:`(8,)` and data type of
          `mindspore.dtype.int32` Tensor, meaningless.
        - Since this op lacks contextual dependencies with parameters in the network,
          :class:`mindspore.ops.Depend` needs to be used to ensure order of execution.

    Inputs:
        Tensor, an additional input created to avoid compiler optimization, is specified as shape :math:`(8,)`,
        data type is `mindspore.dtype.int32`, and has no actual meaning.

    Outputs:
        Tensor, shape and data type are the same as input, meaningless.

    Raises:
        TypeError: If `x` is not a Tensor.
        TypeError: If dtype of `x` is not int32.
        ValueError: If shape of `x` is not equal to :math:`(8,)`.

    Supported Platforms:
        ``Ascend``

    Examples:
        >>> import mindspore as ms
        >>> import numpy as np
        >>> from mindspore import ops, nn, Tensor
        >>> from mindspore.ops.operations.math_ops import NPUGetFloatStatusV2, NPUClearFloatStatusV2
        >>> class Net(nn.Cell):
        ...     def __init__(self):
        ...         super().__init__()
        ...         self.clear_status = NPUClearFloatStatusV2()
        ...         self.get_status = NPUGetFloatStatusV2()
        ...         self.sub = ops.Sub()
        ...         self.neg = ops.Neg()
        ...         self.equal = ops.Equal()
        ...         self.reduce_all = ops.ReduceAll(keep_dims=False)
        ...         self.base = Tensor([0], dtype=ms.int32)
        ...
        ...     def construct(self, x):
        ...         init = Tensor([0]*8, dtype=ms.int32)
        ...         clear_status = self.clear_status(init)
        ...         x = ops.depend(x, clear_status)
        ...         res = self.sub(x, self.neg(x))
        ...         init = ops.depend(init, res)
        ...         get_status = self.get_status(init)
        ...         flag = self.equal(self.base, get_status)
        ...         overall_finite = self.reduce_all(flag)
        ...         overflow = not overall_finite
        ...         return overflow
        ...
        >>> value = 65504
        >>> data = np.full((2, 3), value, dtype=np.float16)
        >>> x = Tensor(data, dtype=ms.float16)
        >>> net = Net()
        >>> res = net(x)
        >>> print(res)
        True
        >>> value = 10
        >>> data = np.full((2, 3), value, dtype=np.float16)
        >>> x = Tensor(data, dtype=ms.float16)
        >>> net = Net()
        >>> res = net(x)
        >>> print(res)
        False
    """

    @prim_attr_register
    def __init__(self):
        """Initialize NPUClearFloatStatusV2"""
class Cos(Primitive):

View File

@ -15,6 +15,7 @@
import numpy as np
import pytest
import mindspore
from mindspore import Tensor, Parameter
from mindspore.common import dtype as mstype
@ -60,22 +61,20 @@ def test_dynamic_loss_scaler(mode):
Expectation: the `scale_value` can be adjusted correctly.
"""
context.set_context(mode=mode)
status = amp.init_status()
loss_scaler = amp.DynamicLossScaler(scale_value=2**10, scale_factor=2, scale_window=50)
grads = (Tensor(np.array([0.5, 1.0]), mindspore.float16),
Tensor(np.array([0.2]), mindspore.float16))
unscaled_grads = loss_scaler.unscale(grads)
grads_finite = amp.all_finite(unscaled_grads, status)
grads_finite = amp.all_finite(unscaled_grads)
loss_scaler.counter = Parameter(Tensor(49, dtype=mstype.int32))
loss_scaler.adjust(grads_finite)
assert loss_scaler.scale_value.asnumpy() == np.array(2048.)
status = amp.init_status()
grads = (Tensor(np.array([2., 1.0]), mindspore.float16),
Tensor(np.array([0.2]), mindspore.float16))
unscaled_grads = loss_scaler.unscale(grads)
grads_finite = amp.all_finite(unscaled_grads, status)
grads_finite = amp.all_finite(unscaled_grads)
loss_scaler.scale_value = Parameter(Tensor(2**10, dtype=mstype.float32))
loss_scaler.adjust(grads_finite)
assert loss_scaler.scale_value.asnumpy() == np.array(1024.)

View File

@ -0,0 +1,175 @@
# Copyright 2023 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import pytest
import numpy as np
import mindspore as ms
from mindspore import Tensor, nn, ops
from mindspore import dtype as mstype
from mindspore.ops._primitive_cache import _get_cache_prim
from mindspore.ops.operations.math_ops import NPUGetFloatStatusV2, NPUClearFloatStatusV2
class OverflowCheckNet(nn.Cell):
    """Base cell bundling both overflow-detection protocols so subclasses can
    wrap an arbitrary computation with either one:

    - V1: NPUAllocFloatStatus / NPUClearFloatStatus / NPUGetFloatStatus
      (float32 status buffer, overflow iff its element sum >= 1).
    - V2: NPUClearFloatStatusV2 / NPUGetFloatStatusV2
      (int32 (8,) status tensor, finite iff every element == 0).

    NOTE: every ops.depend call below is load-bearing — the float-status ops
    have no data dependency on the guarded computation, so execution order must
    be pinned explicitly. Do not reorder these statements.
    """

    def __init__(self):
        super(OverflowCheckNet, self).__init__()
        # V1 threshold: overflow iff reduce_sum(status) >= base1 (= 1).
        self.base1 = Tensor(1, mstype.float32)
        # V2 reference: finite iff every status element equals base2 (= 0).
        self.base2 = Tensor(0, mstype.int32)
        self.reduce_sum = ops.ReduceSum(keep_dims=False)
        self.less_equal = ops.LessEqual()
        self.reduce_all = ops.ReduceAll(keep_dims=False)
        self.equal = ops.Equal()

    def start_overflow_check_v1(self, pre_cond, compute_input):
        """Allocate and clear the V1 status buffer after `pre_cond` has run.

        Returns (status, compute_input) with `compute_input` re-anchored on the
        clear so the guarded computation cannot be scheduled before it.
        """
        status = False
        # init overflow buffer
        status = ops.NPUAllocFloatStatus()()
        status = ops.depend(status, pre_cond)
        # clear overflow buffer
        clear_status = ops.NPUClearFloatStatus()(status)
        compute_input = ops.depend(compute_input, clear_status)
        return status, compute_input

    def get_overflow_status_v1(self, status, compute_output):
        """Read the V1 status buffer after `compute_output` and reduce it to a
        boolean overflow flag."""
        status = ops.depend(status, compute_output)
        get_status = ops.NPUGetFloatStatus()(status)
        status = ops.depend(status, get_status)
        # sum overflow buffer elements, 0:not overflow , >0:overflow
        flag_sum = self.reduce_sum(status, (0,))
        overflow = self.less_equal(self.base1, flag_sum)
        return overflow

    def start_overflow_check_v2(self, pre_cond, compute_input):
        """V2 variant of start_overflow_check: the host-side status is a plain
        int32 (8,) tensor; the clear op resets the device overflow register."""
        status = Tensor([0] * 8, mstype.int32)
        status = ops.depend(status, pre_cond)
        # clear overflow buffer
        clear_status = _get_cache_prim(NPUClearFloatStatusV2)()(status)
        compute_input = ops.depend(compute_input, clear_status)
        return status, compute_input

    def get_overflow_status_v2(self, status, compute_output):
        """Read the V2 overflow register after `compute_output`.

        Any non-zero element in the result means overflow. The register is
        cleared again after the read so a later check starts from clean state.
        """
        status = ops.depend(status, compute_output)
        get_status = _get_cache_prim(NPUGetFloatStatusV2)()(status)
        status = ops.depend(status, get_status)
        clear_status = _get_cache_prim(NPUClearFloatStatusV2)()(status)
        get_status = ops.depend(get_status, clear_status)
        flag = self.equal(self.base2, get_status)
        overall_finite = self.reduce_all(flag)
        return not overall_finite
class OverFlowNetV2GetStatusAfterClear(OverflowCheckNet):
    """Squares x1, then runs a subtraction under the V2 overflow check and
    returns the resulting overflow flag (register was cleared beforehand)."""

    def __init__(self):
        super(OverFlowNetV2GetStatusAfterClear, self).__init__()
        self.mul = ops.Mul()
        self.sub = ops.Sub()

    def construct(self, x1, x2):
        squared = self.mul(x1, x1)
        status, guarded_x2 = self.start_overflow_check_v2(squared, x2)
        diff = self.sub(squared, guarded_x2)
        return self.get_overflow_status_v2(status, diff)
class OverFlowNetV2GetStatus(OverflowCheckNet):
    """Doubles x1, then multiplies by x2 under the V2 overflow check and
    returns the resulting overflow flag."""

    def __init__(self):
        super(OverFlowNetV2GetStatus, self).__init__()
        self.add = ops.Add()
        self.mul = ops.Mul()

    def construct(self, x1, x2):
        doubled = self.add(x1, x1)
        status, guarded_x2 = self.start_overflow_check_v2(doubled, x2)
        product = self.mul(doubled, guarded_x2)
        return self.get_overflow_status_v2(status, product)
class OverflowCheckV1vsV2(OverflowCheckNet):
    """Runs atan2 under either the V1 or V2 overflow protocol, selected by the
    `version` argument, so their verdicts can be compared on the same inputs."""

    def __init__(self):
        super(OverflowCheckV1vsV2, self).__init__()
        self.add = ops.Add()
        self.atan2 = ops.Atan2()

    def construct(self, x1, x2, version):
        doubled = self.add(x1, x1)
        if version == 1:
            status, guarded_x2 = self.start_overflow_check_v1(doubled, x2)
            result = self.atan2(doubled, guarded_x2)
            cond = self.get_overflow_status_v1(status, result)
        else:
            status, guarded_x2 = self.start_overflow_check_v2(doubled, x2)
            result = self.atan2(doubled, guarded_x2)
            cond = self.get_overflow_status_v2(status, result)
        return cond
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@pytest.mark.parametrize('mode', [ms.GRAPH_MODE, ms.PYNATIVE_MODE])
def test_v2_overflow_get_after_clear(mode):
    """
    Feature: overflow check v2
    Description: Verify the result of get_status after clear
    Expectation: success
    """
    ms.set_context(mode=mode)
    check_net = OverFlowNetV2GetStatusAfterClear()
    overflowed = check_net(Tensor(65504, mstype.float16), Tensor(1, mstype.float16))
    assert not overflowed
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@pytest.mark.parametrize('mode', [ms.GRAPH_MODE, ms.PYNATIVE_MODE])
def test_v2_clear_overflow_get(mode):
    """
    Feature: overflow check v2
    Description: Verify the result of get_status when overflow
    Expectation: success
    """
    ms.set_context(mode=mode)
    check_net = OverFlowNetV2GetStatus()
    overflowed = check_net(Tensor(1, mstype.float16), Tensor(65504, mstype.float16))
    assert overflowed
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@pytest.mark.parametrize('mode', [ms.GRAPH_MODE, ms.PYNATIVE_MODE])
def test_v1_vs_v2_overflow_check(mode):
    """
    Feature: overflow check v1 vs v2
    Description: Verify the result of atan2 when inputs include 0
    Expectation: success
    """
    ms.set_context(mode=mode)
    lhs = np.random.random((2, 4)).astype(np.float32)
    rhs = np.random.random((2, 4)).astype(np.float32)
    # Zero rows make atan2 hit the (0, 0) corner case that V1 reports as
    # overflow while V2 does not.
    lhs[0] = 0
    rhs[1] = 0
    compare_net = OverflowCheckV1vsV2()
    assert compare_net(Tensor(lhs), Tensor(rhs), 1)
    assert not compare_net(Tensor(lhs), Tensor(rhs), 2)

View File

@ -0,0 +1,83 @@
# Copyright 2023 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
'''test overflow'''
import pytest
import numpy as np
from mindspore import Tensor, Parameter, nn, ops
import mindspore.amp as amp
import mindspore as ms
class Net(nn.Cell):
    """Minimal fp16 linear layer: construct(x) = x @ weight, weight all 2s."""

    def __init__(self, in_features, out_features):
        super(Net, self).__init__()
        weight_init = np.full([in_features, out_features], 2, np.float16)
        self.weight = Parameter(Tensor(weight_init), name='weight')
        self.matmul = ops.MatMul()

    def construct(self, x):
        return self.matmul(x, self.weight)
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.platform_x86_gpu_training
@pytest.mark.platform_x86_cpu_training
@pytest.mark.env_onecard
@pytest.mark.parametrize('mode', [ms.GRAPH_MODE, ms.PYNATIVE_MODE])
def test_functional_amp_overflow(mode):
    """
    Feature: mindspore.amp.overflow
    Description: test amp overflow
    Expectation: Success.
    """
    ms.set_context(mode=mode)
    batch, in_features, out_features = 1, 2, 2
    net = Net(in_features, out_features)
    loss_fn = nn.MSELoss()

    def forward_fn(data, label):
        logits = net(data)
        return loss_fn(logits, label), logits

    grad_fn = ops.value_and_grad(forward_fn, grad_position=None, weights=net.trainable_params())

    @ms.jit
    def train_step(data, label):
        (loss, _), grads = grad_fn(data, label)
        return loss, amp.all_finite(grads)

    shape = (batch, in_features)
    # -inf, saturating 40000 (inf in fp16) and +inf must report non-finite
    # gradients; 0 and 10 must report all-finite.
    fill_values = [-np.inf, 0, 40000, 10, np.inf]
    inputs = [Tensor(np.full(shape, v, np.float16)) for v in fill_values]
    label = Tensor(np.full([out_features,], 0, np.float16))
    expect_results = [False, True, False, True, False]
    outputs = []
    for data in inputs:
        _, is_finite = train_step(data, label)
        outputs.append(is_finite.asnumpy().tolist())
    assert outputs == expect_results

View File

@ -0,0 +1,114 @@
# Copyright 2023 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
'''test overflow'''
import pytest
import numpy as np
import mindspore as ms
from mindspore import Tensor, Parameter, nn, ops, boost
from mindspore import dtype as mstype
class Net(nn.Cell):
    """Single fp16 matmul cell used to provoke (or avoid) gradient overflow."""

    def __init__(self, in_features, out_features):
        super(Net, self).__init__()
        init_array = np.full([in_features, out_features], 2, np.float16)
        self.weight = Parameter(Tensor(init_array), name='weight')
        self.matmul = ops.MatMul()

    def construct(self, x):
        return self.matmul(x, self.weight)
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
@pytest.mark.parametrize('mode', [ms.GRAPH_MODE, ms.PYNATIVE_MODE])
def test_train_one_step_with_loss_scale_cell_overflow(mode):
    """
    Feature: mindspore.TrainOneStepWithLossScaleCell.overflow
    Description: test TrainOneStepWithLossScaleCell overflow
    Expectation: Success.
    """
    ms.set_context(mode=mode)
    batch, in_features, out_features = 1, 2, 2
    net = Net(in_features, out_features)
    loss = nn.MSELoss()
    optimizer = nn.Momentum(net.trainable_params(),
                            learning_rate=0.1, momentum=0.9)
    net_with_loss = nn.WithLossCell(net, loss)
    shape = (batch, in_features)
    # -inf, 40000 (saturates to inf in fp16) and +inf must trip the overflow
    # flag; 0 and 10 must not.
    fill_values = [-np.inf, 0, 40000, 10, np.inf]
    inputs = [Tensor(np.full(shape, v, np.float16)) for v in fill_values]
    label = Tensor(np.full([out_features,], 0, np.float16))
    scaling_sens = Tensor([8], dtype=mstype.float16)
    train_network = nn.TrainOneStepWithLossScaleCell(
        net_with_loss, optimizer, scale_sense=scaling_sens)
    expect_results = [True, False, True, False, True]
    outputs = []
    for x in inputs:
        _, overflow, _ = train_network(x, label)
        outputs.append(overflow.asnumpy().tolist())
    assert outputs == expect_results
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
@pytest.mark.parametrize('mode', [ms.PYNATIVE_MODE])
def test_boost_train_one_step_with_loss_scale_cell_overflow(mode):
    """
    Feature: mindspore.BoostTrainOneStepWithLossScaleCell.overflow
    Description: test BoostTrainOneStepWithLossScaleCell overflow
    Expectation: Success.
    """
    ms.set_context(mode=mode)
    batch, in_features, out_features = 1, 2, 2
    net = Net(in_features, out_features)
    loss = nn.MSELoss()
    optimizer = nn.Momentum(net.trainable_params(),
                            learning_rate=0.1, momentum=0.9)
    net_with_loss = nn.WithLossCell(net, loss)
    shape = (batch, in_features)
    # Same fixture as the non-boost test: non-finite (and fp16-saturating)
    # batches must flag overflow, finite ones must not.
    fill_values = [-np.inf, 0, 40000, 10, np.inf]
    inputs = [Tensor(np.full(shape, v, np.float16)) for v in fill_values]
    label = Tensor(np.full([out_features,], 0, np.float16))
    scaling_sens = Tensor([8], dtype=mstype.float16)
    train_network = boost.BoostTrainOneStepWithLossScaleCell(
        net_with_loss, optimizer, scale_sense=scaling_sens)
    expect_results = [True, False, True, False, True]
    outputs = []
    for x in inputs:
        _, overflow, _ = train_network(x, label)
        outputs.append(overflow)
    assert outputs == expect_results

View File

@ -76,7 +76,7 @@ TEST_F(TestHWTBEJsonCreator, DISABLED_test_tbe_single_common) {
auto tbe_json_creator_build = std::make_shared<BuildTbeJsonCreator>();
nlohmann::json kernel_json;
EXPECT_TRUE(tbe_json_creator_select->GenJson(relu1, &kernel_json));
EXPECT_EQ(tbe_json_creator_select->GetJsonHash(), 10654173078034037040U)
EXPECT_EQ(tbe_json_creator_select->GetJsonHash(), 12207851473833394607U)
<< "Error json is:" << kernel_json << ", for expected json, see file: tbe_single_common_select.json";
EXPECT_TRUE(tbe_json_creator_build->GenJson(relu1, &kernel_json));
EXPECT_EQ(tbe_json_creator_build->GetJsonHash(), 2389029245513168162U)
@ -118,7 +118,7 @@ TEST_F(TestHWTBEJsonCreator, DISABLED_test_tbe_single_conv2d_backprop_filter) {
auto tbe_json_creator_build = std::make_shared<BuildTbeJsonCreator>();
nlohmann::json kernel_json;
EXPECT_TRUE(tbe_json_creator_select->GenJson(conv2d_backprop_filter, &kernel_json));
EXPECT_EQ(tbe_json_creator_select->GetJsonHash(), 16416634683849134630U)
EXPECT_EQ(tbe_json_creator_select->GetJsonHash(), 14683931476519216146U)
<< "Error json is:" << kernel_json
<< ", for expected json, see file: tbe_single_conv2d_backprop_filter_select.json";
EXPECT_TRUE(tbe_json_creator_build->GenJson(conv2d_backprop_filter, &kernel_json));
@ -177,7 +177,7 @@ TEST_F(TestHWTBEJsonCreator, DISABLED_test_tbe_single_dynamic_rnn) {
auto tbe_json_creator_build = std::make_shared<BuildTbeJsonCreator>();
nlohmann::json kernel_json;
EXPECT_TRUE(tbe_json_creator_select->GenJson(dynamic_rnn, &kernel_json));
EXPECT_EQ(tbe_json_creator_select->GetJsonHash(), 3107761065269367419U)
EXPECT_EQ(tbe_json_creator_select->GetJsonHash(), 16143536111232395651U)
<< "Error json is:" << kernel_json << ", for expected json, see file: tbe_single_dynamic_rnn_select.json";
EXPECT_TRUE(tbe_json_creator_build->GenJson(dynamic_rnn, &kernel_json));
EXPECT_EQ(tbe_json_creator_build->GetJsonHash(), 14916511955212123861U)
@ -230,7 +230,7 @@ TEST_F(TestHWTBEJsonCreator, DISABLED_test_tbe_single_layer_norm) {
auto tbe_json_creator_build = std::make_shared<BuildTbeJsonCreator>();
nlohmann::json kernel_json;
EXPECT_TRUE(tbe_json_creator_select->GenJson(layer_norm, &kernel_json));
EXPECT_EQ(tbe_json_creator_select->GetJsonHash(), 6592146268336877821U)
EXPECT_EQ(tbe_json_creator_select->GetJsonHash(), 1161191001728520611U)
<< "Error json is:" << kernel_json << ", for expected json, see file: tbe_single_layer_norm_select.json";
EXPECT_TRUE(tbe_json_creator_build->GenJson(layer_norm, &kernel_json));
EXPECT_EQ(tbe_json_creator_build->GetJsonHash(), 2848618249728529296U)
@ -306,7 +306,7 @@ TEST_F(TestHWTBEJsonCreator, test_tbe_fusion_common) {
nlohmann::json fusion_json;
auto tbe_json_creator = std::make_shared<FusionBuildTbeJsonCreator>();
EXPECT_TRUE(tbe_json_creator->GenJson(fusion_scope_info, &fusion_json));
EXPECT_EQ(tbe_json_creator->GetJsonHash(), 9482071119130243510U)
EXPECT_EQ(tbe_json_creator->GetJsonHash(), 18379117451241093022U)
<< "Error json is:" << fusion_json << ", for expected json, see file: tbe_fusion_common.json";
}
@ -367,7 +367,7 @@ TEST_F(TestHWTBEJsonCreator, test_fusion_add_conv2d) {
nlohmann::json fusion_json;
auto tbe_json_creator = std::make_shared<FusionBuildTbeJsonCreator>();
EXPECT_TRUE(tbe_json_creator->GenJson(fusion_scope_info, &fusion_json));
EXPECT_EQ(tbe_json_creator->GetJsonHash(), 1515571995667332418U)
EXPECT_EQ(tbe_json_creator->GetJsonHash(), 16132617067967162574U)
<< "Error json is:" << fusion_json << ", for expected json, see file: test_fusion_add_conv2d.json";
}