!28310 dynamic_kernel_mod

Merge pull request !28310 from TuDouNi/dynamic_shape_stage1
2022-01-04 12:48:53 +00:00 · 2022-01-04 12:48:53 +00:00 · e4438f3028
parent b846976494 9373679c04
commit e4438f3028
38 changed files with 1430 additions and 91 deletions
--- a/mindspore/ccsrc/backend/kernel_compiler/CMakeLists.txt
+++ b/mindspore/ccsrc/backend/kernel_compiler/CMakeLists.txt
@ -1,5 +1,6 @@
 file(GLOB_RECURSE KERNEL_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
    "kernel_build_info.cc"
+    "kernel.cc"
    "kash/*.cc"
    "common_utils.cc"
    "oplib/*.cc"
@ -12,6 +13,7 @@ endif()

 if(ENABLE_D)
    file(GLOB_RECURSE D_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
+        "ascend_kernel_mod.cc"
        "kernel_query.cc"
        "tbe/*.cc"
        "host/*.cc"
--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_mod.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_mod.cc
@ -36,13 +36,12 @@ using HostDynamicKernel = mindspore::device::ascend::HostDynamicKernel;

 namespace mindspore {
 namespace kernel {
-AicpuOpKernelMod::AicpuOpKernelMod() : anf_node_(nullptr) {}
+AicpuOpKernelMod::AicpuOpKernelMod() {}

 AicpuOpKernelMod::~AicpuOpKernelMod() {
  args_.clear();
-  inputList_.clear();
-  outputList_.clear();
-  anf_node_ = nullptr;
+  input_list_.clear();
+  output_list_.clear();
  input_size_list_.clear();
  output_size_list_.clear();
  workspace_size_list_.clear();
@ -55,9 +54,9 @@ void AicpuOpKernelMod::SetOutputSizeList(const std::vector<size_t> &size_list) {
 const std::vector<size_t> &AicpuOpKernelMod::GetOutputSizeList() const { return output_size_list_; }
 void AicpuOpKernelMod::SetWorkspaceSizeList(const std::vector<size_t> &size_list) { workspace_size_list_ = size_list; }
 const std::vector<size_t> &AicpuOpKernelMod::GetWorkspaceSizeList() const { return workspace_size_list_; }
-void AicpuOpKernelMod::SetInputList(const std::vector<int64_t> &inputList) { inputList_ = inputList; }
-void AicpuOpKernelMod::SetOutputList(const std::vector<int64_t> &outputList) { outputList_ = outputList; }
-void AicpuOpKernelMod::SetNodeDef(const std::string &nodeDef) { (void)node_def_str_.assign(nodeDef); }
+void AicpuOpKernelMod::SetInputList(const std::vector<int64_t> &input_list) { input_list_ = input_list; }
+void AicpuOpKernelMod::SetOutputList(const std::vector<int64_t> &output_list) { output_list_ = output_list; }
+void AicpuOpKernelMod::SetNodeDef(const std::string &node_def) { (void)node_def_str_.assign(node_def); }
 void AicpuOpKernelMod::SetExtInfo(const std::string &ext_info) { ext_info_ = ext_info; }
 void AicpuOpKernelMod::SetNodeName(const std::string &node_name) { node_name_ = node_name; }
 void AicpuOpKernelMod::SetCustSo(const std::string &cust_so) {
@ -85,11 +84,18 @@ void AicpuOpKernelMod::CreateCpuKernelInfo(const std::vector<AddressPtr> &inputs
        node_so_ = kLibAicpuKernelSoName;
      }
    }
-  } else {
-    if (kCpuKernelBaseOps.find(node_name_) == kCpuKernelBaseOps.end()) {
-      node_name_ = kCpuRunApi;
-    }
+  } else if (kCpuKernelBaseOps.find(node_name_) == kCpuKernelBaseOps.end()) {
+    node_name_ = kCpuRunApi;
  }
+
+  if (node_name_ == kTopK) {
+    node_name_ = kTopKV2;
+  }
+
+  if (node_name_ == kStack) {
+    node_name_ = kPack;
+  }
+
  // InputOutputAddr
  vector<void *> io_addrs;
  (void)std::transform(std::begin(inputs), std::end(inputs), std::back_inserter(io_addrs),
@ -120,6 +126,8 @@ void AicpuOpKernelMod::CreateCpuKernelInfo(const std::vector<AddressPtr> &inputs
    aicpu_param_head.extInfoAddr = 0;
  } else {
    MS_LOG(INFO) << "Dynamic Kernel Ext Info size:" << ext_info_.size();
+    aicpu_param_head.extInfoLength = SizeToUint(ext_info_.size());
+    aicpu_param_head.extInfoAddr = reinterpret_cast<uint64_t>(ext_info_addr_dev_);
  }

  args_.clear();
@ -162,6 +170,8 @@ bool AicpuOpKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::
  }
  MS_LOG(INFO) << "Aicpu launch, node_so_:" << node_so_ << ", node name:" << node_name_
               << ", args_size:" << args_.length();
+  // cppcheck-suppress unreadVariable
+  auto lock = AscendKernelMod::LockRuntime();
  if (rtCpuKernelLaunchWithFlag(reinterpret_cast<const void *>(node_so_.c_str()),
                                reinterpret_cast<const void *>(node_name_.c_str()), 1,
                                reinterpret_cast<const void *>(args_.data()), static_cast<uint32_t>(args_.length()),
--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_mod.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_mod.h
@ -25,6 +25,8 @@ namespace kernel {
 class AicpuOpKernelMod : public AscendKernelMod {
 public:
  AicpuOpKernelMod();
+  explicit AicpuOpKernelMod(const AnfNodePtr &anf_node_ptr) : AscendKernelMod(anf_node_ptr) {}
+
  ~AicpuOpKernelMod() override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override;
@ -33,10 +35,10 @@ class AicpuOpKernelMod : public AscendKernelMod {
                                   const std::vector<AddressPtr> &outputs, uint32_t stream_id) override;
  device::DynamicKernelPtr GenDynamicKernel(const CNodePtr &cnode_ptr, void *stream_ptr) override;

-  void SetInputList(const std::vector<int64_t> &inputList);
-  void SetOutputList(const std::vector<int64_t> &outputList);
+  void SetInputList(const std::vector<int64_t> &input_list);
+  void SetOutputList(const std::vector<int64_t> &output_list);
  void SetAnfNode(const AnfNodePtr &anf_node);
-  void SetNodeDef(const std::string &nodeDef);
+  void SetNodeDef(const std::string &node_def);
  void SetExtInfo(const std::string &ext_info);
  void SetNodeName(const std::string &node_name);
  void SetCustSo(const std::string &cust_so);
@ -56,16 +58,18 @@ class AicpuOpKernelMod : public AscendKernelMod {
  const std::vector<size_t> &GetOutputSizeList() const override;
  const std::vector<size_t> &GetWorkspaceSizeList() const override;

- private:
-  bool cust_kernel_{false};
+ protected:
  std::string args_;
-  std::string node_def_str_;
+  std::string ext_info_;
  std::string node_name_;
  std::string node_so_;
-  std::string ext_info_;
-  std::vector<int64_t> inputList_;
-  std::vector<int64_t> outputList_;
-  AnfNodePtr anf_node_;
+  bool cust_kernel_{false};
+  std::string node_def_str_;
+  void *ext_info_addr_dev_ = nullptr;
+
+ private:
+  std::vector<int64_t> input_list_;
+  std::vector<int64_t> output_list_;

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/dynamic_aicpu_kernel_mod.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/dynamic_aicpu_kernel_mod.cc
@ -0,0 +1,231 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "backend/kernel_compiler/aicpu/dynamic_aicpu_kernel_mod.h"
+
+#include <memory>
+#include <vector>
+#include <string>
+#include <algorithm>
+#include "runtime/mem.h"
+#include "acl/acl_rt.h"
+#include "utils/convert_utils.h"
+#include "backend/kernel_compiler/aicpu/aicpu_util.h"
+#include "utils/ms_context.h"
+#include "runtime/device/kernel_runtime.h"
+#include "runtime/kernel.h"
+#include "utils/utils.h"
+#include "backend/session/anf_runtime_algorithm.h"
+
+namespace mindspore {
+namespace kernel {
+// Builds the kernel mod for `anf_node_ptr` and decides how its unknown output shapes are resolved:
+// DEPEND_COMPUTE when the node is a CNode whose op name is listed in kComputeDepend, DEPEND_IN_SHAPE otherwise.
+// A null node is tolerated (the class also has a node-less default constructor) and keeps DEPEND_IN_SHAPE.
+DynamicAicpuOpKernelMod::DynamicAicpuOpKernelMod(const AnfNodePtr &anf_node_ptr) : AicpuOpKernelMod(anf_node_ptr) {
+  unknow_type_ = device::ascend::UnknowShapeOpType::DEPEND_IN_SHAPE;
+  // Guard the dereference below: cast<CNodePtr>() is called through the pointer itself.
+  if (anf_node_ptr == nullptr) {
+    return;
+  }
+  auto cnode = anf_node_ptr->cast<CNodePtr>();
+  if (cnode != nullptr) {
+    auto op_name = AnfAlgo::GetCNodeName(cnode);
+    if (kComputeDepend.find(op_name) != kComputeDepend.end()) {
+      unknow_type_ = device::ascend::UnknowShapeOpType::DEPEND_COMPUTE;
+    }
+  }
+}
+
+// Releases the device-side ext info buffer when one has been allocated; a failed rtFree is only logged.
+DynamicAicpuOpKernelMod::~DynamicAicpuOpKernelMod() {
+  if (ext_info_addr_dev_ != nullptr) {
+    const auto free_status = rtFree(ext_info_addr_dev_);
+    if (free_status != RT_ERROR_NONE) {
+      MS_LOG(ERROR) << "rtFree failed";
+    }
+  }
+}
+
+// Re-runs shape inference for the bound node. Raises if the node has expired or is not dynamic shape.
+void DynamicAicpuOpKernelMod::InferOp() {
+  auto anf_node = anf_node_.lock();
+  MS_EXCEPTION_IF_NULL(anf_node);
+  const bool is_dynamic = AnfAlgo::IsDynamicShape(anf_node);
+  if (!is_dynamic) {
+    MS_LOG(EXCEPTION) << "The node is not dynamic shape.";
+  }
+  KernelMod::InferShape();
+}
+
+// Rebuilds the host-side AICPU ext info handler for the bound node.
+// Creates a new AicpuExtInfoHandler, has it parse ext_info_, then asks it to update every input's
+// shape/type and, unless the op is DEPEND_COMPUTE, every output's shape/type.
+// Raises via MS_LOG(EXCEPTION) when the node has expired, is not a dynamic-shape CNode, or a handler call
+// reports failure.
+void DynamicAicpuOpKernelMod::InitOp() {
+  auto node = anf_node_.lock();
+  MS_EXCEPTION_IF_NULL(node);
+  auto cnode = node->cast<CNodePtr>();
+  MS_EXCEPTION_IF_NULL(cnode);
+  if (!AnfAlgo::IsDynamicShape(cnode)) {
+    MS_LOG(EXCEPTION) << "The node is not dynamic shape: " << cnode->fullname_with_scope();
+  }
+
+  MS_LOG(INFO) << "UpdateExtInfo of " << cnode->fullname_with_scope() << " start";
+  auto input_num = AnfAlgo::GetInputTensorNum(cnode);
+  auto output_num = AnfAlgo::GetOutputTensorNum(cnode);
+  // NOTE(review): this early return leaves ext_info_handler_ untouched; Launch() requires it to be non-null.
+  if (input_num == 0 && output_num == 0) {
+    MS_LOG(INFO) << "Node:" << cnode->fullname_with_scope() << " no need to update output shape";
+    return;
+  }
+
+  // Parse aicpu ext info
+  ext_info_handler_ = std::make_shared<device::ascend::AicpuExtInfoHandler>(
+    cnode->fullname_with_scope(), static_cast<uint32_t>(input_num), static_cast<uint32_t>(output_num), unknow_type_);
+  MS_EXCEPTION_IF_NULL(ext_info_handler_);
+  if (!ext_info_handler_->Parse(ext_info_)) {
+    MS_LOG(EXCEPTION) << "Parse AiCpu ext_info_handler failed";
+  }
+
+  // NOTE(review): the emptiness check runs after Parse(); confirm Parse() accepts an empty ext_info_,
+  // otherwise this branch can never be reached.
+  if (ext_info_.empty()) {
+    MS_LOG(INFO) << "No need to copy to device, ext_info_ is empty. ";
+    return;
+  }
+
+  for (size_t i = 0; i < input_num; ++i) {
+    if (!ext_info_handler_->UpdateInputShapeAndType(i, NOT_NULL(cnode))) {
+      MS_LOG(EXCEPTION) << "Update input shape failed, cnode:" << cnode->fullname_with_scope() << " input:" << i;
+    }
+  }
+
+  // For DEPEND_COMPUTE ops the output update is skipped here; UpdateOp() reads the shapes back after launch.
+  if (unknow_type_ != device::ascend::UnknowShapeOpType::DEPEND_COMPUTE) {
+    for (size_t i = 0; i < output_num; ++i) {
+      if (!ext_info_handler_->UpdateOutputShapeAndType(i, NOT_NULL(cnode))) {
+        MS_LOG(EXCEPTION) << "Update output shape failed, cnode:" << cnode->fullname_with_scope() << " output:" << i;
+      }
+    }
+  }
+}
+
+// Allocates the device buffer that mirrors ext_info_ (at most once) and records its size in ext_info_size_.
+// `cnode` is only used to name the op in the error message. Raises if rtMalloc fails.
+void DynamicAicpuOpKernelMod::AllocateExtInfoDeviceAddr(const CNodePtr &cnode) {
+  MS_EXCEPTION_IF_NULL(cnode);
+  // A buffer from an earlier call is reused as is.
+  if (ext_info_addr_dev_ != nullptr) {
+    return;
+  }
+  // Nothing is allocated for an empty ext info; only the recorded size is refreshed.
+  if (!ext_info_.empty()) {
+    const auto malloc_status = rtMalloc(&ext_info_addr_dev_, ext_info_.size(), RT_MEMORY_HBM);
+    if (malloc_status != RT_ERROR_NONE) {
+      MS_LOG(EXCEPTION) << "Call rtMalloc ext_info_addr_dev_ failed. Op name: " << cnode->fullname_with_scope();
+    }
+  }
+  ext_info_size_ = ext_info_.size();
+}
+
+// Launches the AICPU kernel for a dynamic-shape node.
+// Steps: copy the host ext info to the device buffer, rebuild the kernel args from `inputs`/`outputs`,
+// launch on stream_, and for DEPEND_COMPUTE ops queue an async copy of the ext info back to the host.
+// The workspace argument is unused.
+// Returns false when `stream_ptr` is null or a copy/launch call reports an error; raises via
+// MS_LOG(EXCEPTION)/MS_EXCEPTION_IF_NULL when the node has expired, is not a dynamic-shape CNode, or
+// ext_info_handler_ has not been created.
+bool DynamicAicpuOpKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
+                                     const std::vector<AddressPtr> &outputs, void *stream_ptr) {
+  if (stream_ptr == nullptr) {
+    MS_LOG(ERROR) << "stream_ptr should not be nullptr.";
+    return false;
+  }
+  // Only the first stream seen is remembered; later calls keep launching on that stream_.
+  if (stream_ == nullptr) {
+    stream_ = stream_ptr;
+  }
+  auto node = anf_node_.lock();
+  MS_EXCEPTION_IF_NULL(node);
+  auto cnode = node->cast<CNodePtr>();
+  MS_EXCEPTION_IF_NULL(cnode);
+  MS_LOG(INFO) << "Start launch of node: " << cnode->fullname_with_scope();
+
+  // is dynamic shape
+  if (!AnfAlgo::IsDynamicShape(cnode)) {
+    MS_LOG(EXCEPTION) << "The cnode is not dynamic shape:" << cnode->fullname_with_scope();
+  }
+
+  // copy extinfo to device
+  AllocateExtInfoDeviceAddr(cnode);
+  MS_EXCEPTION_IF_NULL(ext_info_handler_);
+  // NOTE(review): ext_info_addr_dev_ stays null when ext_info_ is empty; confirm the copy tolerates that.
+  auto ret = aclrtMemcpy(ext_info_addr_dev_, ext_info_size_, ext_info_handler_->GetExtInfo(),
+                         ext_info_handler_->GetExtInfoLen(), ACL_MEMCPY_HOST_TO_DEVICE);
+  if (ret != RT_ERROR_NONE) {
+    MS_LOG(ERROR) << "UpdateExtInfo aclrtMemcpy failed. Node info: " << cnode->fullname_with_scope();
+    return false;
+  }
+
+  AicpuOpKernelMod::CreateCpuKernelInfo(inputs, outputs);
+  MS_LOG(INFO) << "Aicpu launch, node_so_:" << node_so_ << ", node name:" << node_name_
+               << ", args_size:" << args_.length();
+  // The runtime lock is held from here until the function returns, covering the launch and the async copy.
+  // cppcheck-suppress unreadVariable
+  auto lock = AscendKernelMod::LockRuntime();
+  ret = rtCpuKernelLaunchWithFlag(reinterpret_cast<const void *>(node_so_.c_str()),
+                                  reinterpret_cast<const void *>(node_name_.c_str()), 1,
+                                  reinterpret_cast<const void *>(args_.data()), static_cast<uint32_t>(args_.length()),
+                                  nullptr, stream_, RT_KERNEL_DEFAULT);
+  if (ret != RT_ERROR_NONE) {
+    MS_LOG(ERROR) << "Aicpu op launch failed!";
+    return false;
+  }
+
+  // The copy back is only queued here; UpdateOp() synchronizes stream_ before reading the result.
+  if (unknow_type_ == device::ascend::UnknowShapeOpType::DEPEND_COMPUTE) {
+    ret = aclrtMemcpyAsync(ext_info_handler_->GetExtInfo(), ext_info_handler_->GetExtInfoLen(), ext_info_addr_dev_,
+                           ext_info_size_, ACL_MEMCPY_DEVICE_TO_HOST, stream_);
+    if (ret != RT_ERROR_NONE) {
+      MS_LOG(ERROR) << "aclrtMemcpyAsync output shape failed. Op name: " << cnode->fullname_with_scope();
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Post-launch step: for DEPEND_COMPUTE ops, synchronizes stream_ under the runtime lock and then refreshes
+// the node's output shapes from the ext info. Other ops are skipped.
+// Raises if the node has expired, is not a dynamic-shape CNode, or the stream synchronization fails.
+void DynamicAicpuOpKernelMod::UpdateOp() {
+  auto anf_node = anf_node_.lock();
+  MS_EXCEPTION_IF_NULL(anf_node);
+  auto kernel_node = anf_node->cast<CNodePtr>();
+  MS_EXCEPTION_IF_NULL(kernel_node);
+  MS_LOG(INFO) << "Aicpu " << kernel_node->fullname_with_scope() << " PostExecute";
+  // is dynamic shape
+  if (!AnfAlgo::IsDynamicShape(kernel_node)) {
+    MS_LOG(EXCEPTION) << "The cnode is not dynamic shape:" << kernel_node->fullname_with_scope();
+  }
+
+  const bool depends_on_compute = (unknow_type_ == device::ascend::UnknowShapeOpType::DEPEND_COMPUTE);
+  if (!depends_on_compute) {
+    MS_LOG(INFO) << "Node " << anf_node->fullname_with_scope() << " update op skip.";
+    return;
+  }
+  // cppcheck-suppress unreadVariable
+  auto lock = AscendKernelMod::LockRuntime();
+  if (rtStreamSynchronize(stream_) != RT_ERROR_NONE) {
+    MS_LOG(EXCEPTION) << "Call runtime rtStreamSynchronize failed. Op name: " << kernel_node->fullname_with_scope();
+  }
+
+  MS_LOG(INFO) << "Update aicpu kernel output shape from ext_info. Op name: " << kernel_node->fullname_with_scope();
+  UpdateOutputShapeFromExtInfo(kernel_node);
+}
+
+// Reads every output's shape and type from ext_info_handler_ and stores them on `cnode` as its inferred
+// output types/shapes. Always returns true; raises if `cnode` or ext_info_handler_ is null.
+bool DynamicAicpuOpKernelMod::UpdateOutputShapeFromExtInfo(const CNodePtr &cnode) {
+  MS_EXCEPTION_IF_NULL(cnode);
+  MS_LOG(INFO) << "UpdateOutputShapeFromExtInfo start. Op name " << cnode->fullname_with_scope();
+  MS_EXCEPTION_IF_NULL(ext_info_handler_);
+
+  std::vector<TypeId> type_ids;
+  std::vector<std::vector<size_t>> shapes;
+  auto output_num = AnfAlgo::GetOutputTensorNum(cnode);
+  for (size_t i = 0; i < output_num; ++i) {
+    // Log the index of the output being read, not the total output count.
+    MS_LOG(INFO) << "Get output:" << i << " Shape";
+    std::vector<int64_t> shape;
+    TypeId type_id;
+    // NOTE(review): the handler's return value is ignored; on failure type_id may be left unset — confirm.
+    (void)ext_info_handler_->GetOutputShapeAndType(SizeToUint(i), NOT_NULL(&shape), NOT_NULL(&type_id));
+    type_ids.emplace_back(type_id);
+    // The handler reports dimensions as int64_t, while the node stores them as size_t.
+    std::vector<size_t> size_t_shape;
+    (void)std::transform(shape.begin(), shape.end(), std::back_inserter(size_t_shape), LongToSize);
+    shapes.emplace_back(size_t_shape);
+  }
+
+  AnfAlgo::SetOutputInferTypeAndShape(type_ids, shapes, cnode.get());
+  return true;
+}
+}  // namespace kernel
+}  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/dynamic_aicpu_kernel_mod.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/dynamic_aicpu_kernel_mod.h
@ -0,0 +1,54 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AICPU_DYNAMIC_AICPU_KERNEL_MOD_H_
+#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AICPU_DYNAMIC_AICPU_KERNEL_MOD_H_
+#include <vector>
+#include <memory>
+#include <string>
+#include "backend/kernel_compiler/aicpu/aicpu_kernel_mod.h"
+#include "backend/kernel_compiler/aicpu/aicpu_util.h"
+#include "runtime/device/ascend/executor/aicpu_ext_info_handle.h"
+namespace mindspore {
+namespace kernel {
+// AICPU kernel mod for dynamic-shape nodes. Extends AicpuOpKernelMod with a per-launch ext info update
+// (InitOp/Launch) and, for DEPEND_COMPUTE ops, a read-back of output shapes after execution (UpdateOp).
+class DynamicAicpuOpKernelMod : public AicpuOpKernelMod {
+ public:
+  // Node-less construction; output shapes are treated as DEPEND_IN_SHAPE.
+  DynamicAicpuOpKernelMod() : unknow_type_(device::ascend::UnknowShapeOpType::DEPEND_IN_SHAPE) {}
+  // Binds the kernel mod to `anf_node_ptr` and picks the unknown-shape type from the node's op name.
+  explicit DynamicAicpuOpKernelMod(const AnfNodePtr &anf_node_ptr);
+
+  // Frees the device ext info buffer if one was allocated.
+  ~DynamicAicpuOpKernelMod() override;
+
+  // Copies the ext info to the device and launches the kernel on the stream; `workspace` is unused.
+  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+              const std::vector<AddressPtr> &outputs, void *stream_ptr) override;
+
+  // Runs shape inference for the node.
+  void InferOp() override;
+  // Rebuilds the ext info handler from the node's current shapes.
+  void InitOp() override;
+  // For DEPEND_COMPUTE ops: synchronizes the stream and updates the node's output shapes from the ext info.
+  void UpdateOp() override;
+
+ private:
+  // Allocates the device ext info buffer on first use and records its size.
+  void AllocateExtInfoDeviceAddr(const CNodePtr &cnode);
+  // Writes the output shapes/types held by ext_info_handler_ back to `cnode`.
+  bool UpdateOutputShapeFromExtInfo(const CNodePtr &cnode);
+
+  // Host-side ext info handler; created in InitOp().
+  std::shared_ptr<device::ascend::AicpuExtInfoHandler> ext_info_handler_ = nullptr;
+  // Byte size recorded by AllocateExtInfoDeviceAddr() for the device ext info buffer.
+  size_t ext_info_size_ = 0;
+  // How the op's unknown output shapes are resolved (DEPEND_IN_SHAPE or DEPEND_COMPUTE).
+  device::ascend::UnknowShapeOpType unknow_type_;
+};
+
+using DynamicAicpuOpKernelModPtr = std::shared_ptr<DynamicAicpuOpKernelMod>;
+using DynamicAicpuOpKernelModPtrList = std::vector<DynamicAicpuOpKernelModPtr>;
+// Misspelled name ("Aicput") kept so existing users still compile; prefer DynamicAicpuOpKernelModPtrList.
+using DynamicAicputOpKernelModPtrList = DynamicAicpuOpKernelModPtrList;
+}  // namespace kernel
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AICPU_DYNAMIC_AICPU_KERNEL_MOD_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/ascend_kernel_mod.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/ascend_kernel_mod.cc
@ -0,0 +1,35 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "backend/kernel_compiler/ascend_kernel_mod.h"
+#include "runtime/rt.h"
+namespace mindspore {
+namespace kernel {
+// Default post-launch step: synchronizes stream_ via rtStreamSynchronize while holding the runtime lock.
+// Raises if stream_ is null or the synchronization reports an error.
+void AscendKernelMod::UpdateOp() {
+  MS_EXCEPTION_IF_NULL(stream_);
+  // cppcheck-suppress unreadVariable
+  auto lock = LockRuntime();
+  const auto sync_status = rtStreamSynchronize(stream_);
+  if (sync_status != RT_ERROR_NONE) {
+    MS_LOG(EXCEPTION) << "Call runtime rtStreamSynchronize failed.";
+  }
+}
+
+// Locks the single function-local static mutex shared by all callers and returns the guard; the mutex is
+// released when the caller's guard object goes out of scope.
+// std::lock_guard is neither copyable nor movable, so returning it by value relies on C++17 guaranteed
+// copy elision of the prvalue in the return statement.
+std::lock_guard<std::mutex> AscendKernelMod::LockRuntime() {
+  static std::mutex mutex;
+  return std::lock_guard<std::mutex>(mutex);
+}
+}  // namespace kernel
+}  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/ascend_kernel_mod.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/ascend_kernel_mod.h
@ -31,6 +31,8 @@ namespace mindspore {
 namespace kernel {
 class AscendKernelMod : public KernelMod {
 public:
+  AscendKernelMod() {}
+  explicit AscendKernelMod(const AnfNodePtr &anf_node_ptr) : KernelMod(anf_node_ptr) {}
  virtual std::vector<TaskInfoPtr> GenTask(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &,
                                           const std::vector<AddressPtr> &, uint32_t) = 0;
  uint32_t block_dim() { return block_dim_; }
@ -44,6 +46,7 @@ class AscendKernelMod : public KernelMod {
    return false;
 #endif
  }
+  void UpdateOp() override;

  void InitDynamicKernel(const CNodePtr &cnode_ptr, void *stream) {
    if (dynamic_kernel_ == nullptr) {
@ -54,6 +57,8 @@ class AscendKernelMod : public KernelMod {
  }
  device::DynamicKernelPtr DynamicKernel() const { return dynamic_kernel_; }

+  static std::lock_guard<std::mutex> LockRuntime();
+
 protected:
  uint32_t block_dim_{1};
  uint32_t stream_id_{0};
--- a/mindspore/ccsrc/backend/kernel_compiler/hccl/hccl_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/hccl/hccl_kernel.cc
@ -66,7 +66,13 @@ HcclKernelFactory &HcclKernelFactory::Get() {

 HcclKernel::HcclKernel()
    : hccl_count_(0), op_type_(::HcclReduceOp::HCCL_REDUCE_SUM), root_id_(0), src_rank_(0), dest_rank_(0) {}
-
+// Constructs the kernel for `anf_node`. The node is forwarded to the AscendKernelMod(const AnfNodePtr &)
+// constructor instead of being dropped; all other fields start from the same defaults as HcclKernel().
+HcclKernel::HcclKernel(const AnfNodePtr &anf_node)
+    : AscendKernelMod(anf_node),
+      hccl_count_(0),
+      op_type_(::HcclReduceOp::HCCL_REDUCE_SUM),
+      root_id_(0),
+      src_rank_(0),
+      dest_rank_(0) {}
 HcclKernel::~HcclKernel() {
  hccl_kernel_input_shape_list_.clear();
  hccl_kernel_output_shape_list_.clear();
@ -294,5 +300,99 @@ device::DynamicKernelPtr HcclKernel::GenDynamicKernel(const CNodePtr &cnode_ptr,
    hccl_type, input_data_addr, output_data_addr, hccl_count_, data_type, op_type_, root_id_, stream_ptr, cnode_ptr);
  return executor;
 }
+
+// Runs shape inference only when the bound node is dynamic shape; otherwise does nothing.
+void HcclKernel::InferOp() {
+  auto node = anf_node_.lock();
+  if (!AnfAlgo::IsDynamicShape(node)) {
+    return;
+  }
+  KernelMod::InferShape();
+}
+
+// Enqueues the HCCL operation for the bound node and blocks until its completion callback has run.
+// Uses inputs[0] / outputs[0] as the data buffers and hccl_data_type_list_[0] as the element type;
+// the workspace argument is unused.
+// Returns false when inputs or outputs is empty or no data type is recorded; raises via
+// MS_LOG(EXCEPTION)/MS_EXCEPTION_IF_NULL on an expired or non-CNode node, a null stream or buffer, or a
+// failed enqueue.
+bool HcclKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
+                        const std::vector<AddressPtr> &outputs, void *stream_ptr) {
+  auto node = anf_node_.lock();
+  MS_EXCEPTION_IF_NULL(node);
+  if (!node->isa<CNode>()) {
+    MS_LOG(EXCEPTION) << "anfnode is not a cnode";
+  }
+  auto cnode = node->cast<CNodePtr>();
+  MS_EXCEPTION_IF_NULL(cnode);
+
+  // Both inputs[0] and outputs[0] are dereferenced below, so either list being empty is an error.
+  if (inputs.empty() || outputs.empty()) {
+    MS_LOG(ERROR) << "Hccl kernel input or output is empty";
+    return false;
+  }
+  if (hccl_data_type_list_.empty()) {
+    MS_LOG(ERROR) << "Hccl data type list is empty";
+    return false;
+  }
+
+  MS_EXCEPTION_IF_NULL(stream_ptr);
+  MS_EXCEPTION_IF_NULL(inputs[0]);
+  MS_EXCEPTION_IF_NULL(outputs[0]);
+
+  MS_LOG(INFO) << "Start Execute: " << cnode->DebugString();
+  std::string hccl_type = MsOpNameToHcomOpType(AnfAlgo::GetCNodeName(node));
+  HcclDataType data_type = hccl_data_type_list_[0];
+
+  ::HcomOperation op_info;
+  op_info.hcclType = hccl_type;
+  op_info.inputPtr = inputs[0]->addr;
+  op_info.outputPtr = outputs[0]->addr;
+  op_info.dataType = static_cast<HcclDataType>(data_type);
+  op_info.opType = static_cast<HcclReduceOp>(op_type_);
+  op_info.root = IntToUint(root_id_);
+  op_info.count = hccl_count_;
+
+  // Completion flag shared with the callback and guarded by hccl_mutex_. Without it, a callback that fires
+  // before the wait below starts would be missed (hang), and a spurious wakeup would end the wait early.
+  // It is heap-allocated so the callback never refers to this stack frame.
+  auto done = std::make_shared<bool>(false);
+  auto callback = [this, done](HcclResult status) {
+    if (status != HCCL_SUCCESS) {
+      MS_LOG(ERROR) << "HcomExcutorInitialize failed, ret:" << status;
+    }
+    std::lock_guard<std::mutex> lock(this->hccl_mutex_);
+    *done = true;
+    this->cond_.notify_all();
+    MS_LOG(INFO) << "hccl callback success.";
+  };
+
+  auto hccl_ret = hccl::HcclAdapter::GetInstance().HcclExecEnqueueOp(op_info, callback);
+  if (hccl_ret != HCCL_SUCCESS) {
+    MS_LOG(EXCEPTION) << "Call EnqueueHcomOperation failed, node info: " << cnode->DebugString();
+    return false;
+  }
+
+  std::unique_lock<std::mutex> ulock(hccl_mutex_);
+  cond_.wait(ulock, [&done] { return *done; });
+  MS_LOG(INFO) << "Execute " << cnode->DebugString() << " success";
+  return true;
+}
+
+// Recomputes hccl_count_ for a dynamic-shape node from its current input shapes and data types.
+// Does nothing (beyond a debug log) for non-dynamic nodes.
+// Raises via MS_LOG(EXCEPTION) when the node has expired, is not a CNode, or any HcomUtil query fails.
+void HcclKernel::InitOp() {
+  auto node = anf_node_.lock();
+  MS_EXCEPTION_IF_NULL(node);
+  if (!node->isa<CNode>()) {
+    MS_LOG(EXCEPTION) << "anfnode is not a cnode";
+  }
+  auto cnode = node->cast<CNodePtr>();
+  MS_EXCEPTION_IF_NULL(cnode);
+
+  if (!AnfAlgo::IsDynamicShape(cnode)) {
+    MS_LOG(DEBUG) << "The node is not dynamic shape: " << cnode->fullname_with_scope();
+    return;
+  }
+
+  MS_LOG(INFO) << "Start to InitOp. Node info: " << cnode->DebugString();
+
+  // Local list: the hccl_kernel_input_shape_list_ member is not modified here.
+  std::vector<std::vector<size_t>> hccl_kernel_input_shape_list;
+  if (!HcomUtil::GetKernelInputShape(cnode, &hccl_kernel_input_shape_list)) {
+    MS_LOG(EXCEPTION) << "GetKernelInputShape fail! Node info: " << cnode->DebugString();
+  }
+
+  // Local list as well; Launch() reads the hccl_data_type_list_ member, which this function leaves as is.
+  std::vector<HcclDataType> hccl_data_type_list;
+  if (!HcomUtil::GetHcomDataType(cnode, &hccl_data_type_list)) {
+    MS_LOG(EXCEPTION) << "GetHcomDataType fail! Node info: " << cnode->DebugString();
+  }
+
+  // Update Hccl count
+  if (!HcomUtil::GetHcomCount(cnode, hccl_data_type_list, hccl_kernel_input_shape_list, &hccl_count_)) {
+    MS_LOG(EXCEPTION) << "GetHcomCount fail! Node info: " << cnode->DebugString();
+  }
+  MS_LOG(INFO) << "Update Hccl count:" << hccl_count_;
+}
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/hccl/hccl_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/hccl/hccl_kernel.h
@ -34,6 +34,7 @@ namespace kernel {
 class HcclKernel : public AscendKernelMod {
 public:
  HcclKernel();
+  explicit HcclKernel(const AnfNodePtr &anf_node);
  ~HcclKernel() override;
  virtual bool Init(const AnfNodePtr &anf_node);
  const std::vector<size_t> &GetInputSizeList() const override;
@ -43,6 +44,12 @@ class HcclKernel : public AscendKernelMod {
                                   const std::vector<AddressPtr> &outputs, uint32_t stream_id) override;
  device::DynamicKernelPtr GenDynamicKernel(const CNodePtr &cnode_ptr, void *stream_ptr) override;

+  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+              const std::vector<AddressPtr> &outputs, void *stream_ptr) override;
+
+  void InferOp() override;
+  void InitOp() override;
+
 protected:
  std::vector<std::vector<size_t>> hccl_kernel_input_shape_list_;
  std::vector<std::vector<size_t>> hccl_kernel_output_shape_list_;
@ -56,9 +63,10 @@ class HcclKernel : public AscendKernelMod {
  mutable std::vector<size_t> input_size_list_;
  mutable std::vector<size_t> output_size_list_;
  mutable std::vector<size_t> workspace_size_list_;
-  AnfNodeWeakPtr anf_node_;
  std::string op_name_;
  std::string group_;
+  std::mutex hccl_mutex_;
+  std::condition_variable cond_;
 };

 using HcclKernelCreater = std::function<std::shared_ptr<HcclKernel>()>;
--- a/mindspore/ccsrc/backend/kernel_compiler/host/dynamic_broadcast_gradient_args_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/host/dynamic_broadcast_gradient_args_kernel.cc
@ -16,6 +16,7 @@

 #include "backend/kernel_compiler/host/dynamic_broadcast_gradient_args_kernel.h"
 #include "backend/session/anf_runtime_algorithm.h"
+#include "runtime/device/ascend/ascend_kernel_runtime.h"
 #include "utils/trace_base.h"

 namespace mindspore {
@ -195,6 +196,15 @@ void DynamicBroadcastGradientArgsKernel::Execute() {
  input_shapes[1] = GetInputShape(cnode, 1);
  auto grad_reduce_idx = CalculateOutput(input_shapes);

+  auto runtime_instance = device::KernelRuntimeManager::Instance().GetCurrentKernelRuntime();
+  MS_EXCEPTION_IF_NULL(runtime_instance);
+  // cppcheck-suppress unreadVariable
+  auto lock = AscendKernelMod::LockRuntime();
+  auto ret = runtime_instance->SyncStream();
+  if (!ret) {
+    MS_LOG(EXCEPTION) << "Sync stream error!";
+  }
+
  auto r0_size = SetOutputValue(cnode, grad_reduce_idx, 0, input_shapes[0].size());
  auto r1_size = SetOutputValue(cnode, grad_reduce_idx, 1, input_shapes[1].size());

@ -209,5 +219,26 @@ device::DynamicKernelPtr DynamicBroadcastGradientArgsKernelMod::GenDynamicKernel
                                                                                 void *stream_ptr) {
  return std::make_shared<DynamicBroadcastGradientArgsKernel>(stream_ptr, cnode_ptr);
 }
+
+// Runs a freshly built DynamicBroadcastGradientArgsKernel for the bound node. The address lists are unused.
+// Returns false when Execute() throws a std::exception (the message is logged); raises when the node has
+// expired or is not a CNode.
+bool DynamicBroadcastGradientArgsKernelMod::Launch(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &,
+                                                   const std::vector<AddressPtr> &, void *stream_ptr) {
+  auto anf_node = anf_node_.lock();
+  MS_EXCEPTION_IF_NULL(anf_node);
+  const bool is_cnode = anf_node->isa<CNode>();
+  if (!is_cnode) {
+    MS_LOG(EXCEPTION) << "anfnode is not a cnode";
+  }
+  auto kernel_node = anf_node->cast<CNodePtr>();
+  MS_EXCEPTION_IF_NULL(kernel_node);
+  // Remember the stream on the kernel mod before executing.
+  stream_ = stream_ptr;
+  auto host_kernel = std::make_shared<DynamicBroadcastGradientArgsKernel>(stream_ptr, kernel_node);
+  try {
+    host_kernel->Execute();
+  } catch (const std::exception &e) {
+    MS_LOG(ERROR) << "DynamicBroadcastGradientArgsKernel Launch failed. node: " << kernel_node->fullname_with_scope()
+                  << ", Error message is " << e.what();
+    return false;
+  }
+  return true;
+}
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/host/dynamic_broadcast_gradient_args_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/host/dynamic_broadcast_gradient_args_kernel.h
@ -36,6 +36,8 @@ class DynamicBroadcastGradientArgsKernelMod : public HostKernelMod {
  DynamicBroadcastGradientArgsKernelMod() = default;
  ~DynamicBroadcastGradientArgsKernelMod() override = default;
  device::DynamicKernelPtr GenDynamicKernel(const CNodePtr &cnode_ptr, void *stream_ptr) override;
+  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+              const std::vector<AddressPtr> &outputs, void *stream_ptr) override;
 };
 MS_HOST_REG_KERNEL(DynamicBroadcastGradientArgs, DynamicBroadcastGradientArgsKernelMod);
 }  // namespace kernel
--- a/mindspore/ccsrc/backend/kernel_compiler/host/dynamic_reshape_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/host/dynamic_reshape_kernel.cc
@ -114,5 +114,26 @@ void DynamicReshapeKernel::Execute() {
 device::DynamicKernelPtr DynamicReshapeKernelMod::GenDynamicKernel(const CNodePtr &cnode_ptr, void *stream_ptr) {
  return std::make_shared<DynamicReshapeKernel>(stream_ptr, cnode_ptr);
 }
+
+// Runs a freshly built DynamicReshapeKernel for the bound node. The address lists are unused.
+// Returns false when Execute() throws a std::exception (the message is logged); raises when the node has
+// expired or is not a CNode.
+bool DynamicReshapeKernelMod::Launch(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &,
+                                     const std::vector<AddressPtr> &, void *stream_ptr) {
+  auto anf_node = anf_node_.lock();
+  MS_EXCEPTION_IF_NULL(anf_node);
+  const bool is_cnode = anf_node->isa<CNode>();
+  if (!is_cnode) {
+    MS_LOG(EXCEPTION) << "anfnode is not a cnode";
+  }
+  auto kernel_node = anf_node->cast<CNodePtr>();
+  MS_EXCEPTION_IF_NULL(kernel_node);
+  // Remember the stream on the kernel mod before executing.
+  stream_ = stream_ptr;
+  auto host_kernel = std::make_shared<DynamicReshapeKernel>(stream_ptr, kernel_node);
+  try {
+    host_kernel->Execute();
+  } catch (const std::exception &e) {
+    MS_LOG(ERROR) << "DynamicReshapeKernel Launch failed. node: " << kernel_node->fullname_with_scope()
+                  << ", Error message is " << e.what();
+    return false;
+  }
+  return true;
+}
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/host/dynamic_reshape_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/host/dynamic_reshape_kernel.h
@ -35,6 +35,9 @@ class DynamicReshapeKernelMod : public HostKernelMod {
  DynamicReshapeKernelMod() = default;
  ~DynamicReshapeKernelMod() override = default;
  device::DynamicKernelPtr GenDynamicKernel(const CNodePtr &cnode_ptr, void *stream_ptr) override;
+  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+              const std::vector<AddressPtr> &outputs, void *stream_ptr) override;
+  void UpdateOp() override { AscendKernelMod::UpdateOp(); }
 };
 MS_HOST_REG_KERNEL(DynamicReshape, DynamicReshapeKernelMod);
 }  // namespace kernel
--- a/mindspore/ccsrc/backend/kernel_compiler/host/dynamic_shape_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/host/dynamic_shape_kernel.cc
@ -57,6 +57,8 @@ void DynamicShapeKernel::Execute() {
  } else {
    auto runtime_instance = device::KernelRuntimeManager::Instance().GetCurrentKernelRuntime();
    MS_EXCEPTION_IF_NULL(runtime_instance);
+    // cppcheck-suppress unreadVariable
+    auto lock = AscendKernelMod::LockRuntime();
    auto ret = runtime_instance->SyncStream();
    if (!ret) {
      MS_LOG(EXCEPTION) << "Sync stream error!";
@ -106,5 +108,23 @@ void DynamicShapeKernel::Execute(const std::vector<AddressPtr> &inputs, const st
 device::DynamicKernelPtr DynamicShapeKernelMod::GenDynamicKernel(const CNodePtr &cnode_ptr, void *stream_ptr) {
  return std::make_shared<DynamicShapeKernel>(stream_ptr, cnode_ptr);
 }
+
+/**
+ * Runs the host-side DynamicShape kernel for the node bound to this kernel mod.
+ *
+ * The three address lists are unused. A new DynamicShapeKernel is created from the bound CNode on every
+ * call and executed; stream_ptr is also stored in stream_.
+ *
+ * @param stream_ptr Stream handed to the created DynamicShapeKernel.
+ * @return true on success; false when Execute() throws a std::exception (the error is logged).
+ * Raises an exception (outside the try block) when the bound node has expired or is not a CNode.
+ */
+bool DynamicShapeKernelMod::Launch(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &,
+                                   const std::vector<AddressPtr> &, void *stream_ptr) {
+  auto node = anf_node_.lock();
+  MS_EXCEPTION_IF_NULL(node);
+  // Report a wrong node type explicitly instead of as a null result of the cast below.
+  if (!node->isa<CNode>()) {
+    MS_LOG(EXCEPTION) << "anfnode is not a cnode";
+  }
+  auto cnode = node->cast<CNodePtr>();
+  MS_EXCEPTION_IF_NULL(cnode);
+  stream_ = stream_ptr;
+  auto shape_kernel = std::make_shared<DynamicShapeKernel>(stream_ptr, cnode);
+  try {
+    shape_kernel->Execute();
+  } catch (const std::exception &e) {
+    MS_LOG(ERROR) << "DynamicShapeKernelMod Launch failed. node: " << cnode->fullname_with_scope()
+                  << ", Error message is " << e.what();
+    return false;
+  }
+  return true;
+}
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/host/dynamic_shape_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/host/dynamic_shape_kernel.h
@ -38,18 +38,7 @@ class DynamicShapeKernelMod : public HostKernelMod {
  ~DynamicShapeKernelMod() override = default;
  device::DynamicKernelPtr GenDynamicKernel(const CNodePtr &cnode_ptr, void *stream_ptr) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
-              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
-    if (kernel_ == nullptr) {
-      kernel_ =
-        std::dynamic_pointer_cast<DynamicShapeKernel>(GenDynamicKernel(anf_node_->cast<CNodePtr>(), stream_ptr));
-      kernel_->Initialize();
-    }
-    kernel_->Execute(inputs, outputs);
-    return true;
-  }
-
- private:
-  std::shared_ptr<DynamicShapeKernel> kernel_;
+              const std::vector<AddressPtr> &outputs, void *stream_ptr) override;
 };
 MS_HOST_REG_KERNEL(DynamicShape, DynamicShapeKernelMod);
 }  // namespace kernel
--- a/mindspore/ccsrc/backend/kernel_compiler/host/host_kernel_mod.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/host/host_kernel_mod.cc
@ -77,6 +77,16 @@ bool HostKernelMod::Launch(const std::vector<AddressPtr> &, const std::vector<Ad
                           const std::vector<AddressPtr> &, void *) {
  return true;
 }
+
+// Re-infers the shape of the bound node through KernelMod::InferShape().
+// Raises an exception when the node has expired or is not a dynamic-shape node.
+void HostKernelMod::InferOp() {
+  const auto anf_node = anf_node_.lock();
+  MS_EXCEPTION_IF_NULL(anf_node);
+  const bool is_dynamic = AnfAlgo::IsDynamicShape(anf_node);
+  if (!is_dynamic) {
+    MS_LOG(EXCEPTION) << "The node is not dynamic shape.";
+  }
+  KernelMod::InferShape();
+}
+
 std::vector<TaskInfoPtr> HostKernelMod::GenTask(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &,
                                                const std::vector<AddressPtr> &, uint32_t) {
  return {};
--- a/mindspore/ccsrc/backend/kernel_compiler/host/host_kernel_mod.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/host/host_kernel_mod.h
@ -36,9 +36,10 @@ class HostKernelMod : public AscendKernelMod {
                                   const std::vector<AddressPtr> &, uint32_t) override;
  device::DynamicKernelPtr GenDynamicKernel(const CNodePtr &cnode_ptr, void *stream_ptr) override = 0;
  bool Init(const AnfNodePtr &anf_node);
+  void InferOp() override;
+  void UpdateOp() override {}

 protected:
-  AnfNodePtr anf_node_;
  std::string op_name_;
  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
--- a/mindspore/ccsrc/backend/kernel_compiler/kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/kernel.cc
@ -0,0 +1,184 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "backend/kernel_compiler/kernel.h"
+
+#include <algorithm>
+#include <stack>
+#include <utility>
+#include "utils/ms_context.h"
+#include "utils/anf_utils.h"
+#include "utils/ms_device_shape_transfer.h"
+#include "backend/session/anf_runtime_algorithm.h"
+#include "backend/optimizer/common/helper.h"
+
+namespace mindspore {
+namespace kernel {
+constexpr int64_t kInvalidShape = -2;
+
+/**
+ * Re-infers the output abstract of the CNode bound to this kernel mod.
+ *
+ * Steps:
+ *  1. Collect the value-dependent input indexes once (GetDepndLists).
+ *  2. Try the shortcut for Shape nodes (InferShapeForDefiniteOutputNode) and stop if it applied.
+ *  3. For every input: infer nop-node chains first; for value-dependent inputs wrap the producer's device
+ *     address in a host tensor, sync it to host and attach it as the value of a cloned input abstract.
+ *  4. Run opt::CppInferShape on the collected abstracts and store the result on the CNode.
+ *
+ * depend_tensor_map_ is cleared and rebuilt on every full inference.
+ * Raises an exception when the node has expired, is not a CNode, has no inputs, a depend tensor cannot be
+ * inserted, or a tuple element index is out of range.
+ */
+void KernelMod::InferShape() {
+  auto node = anf_node_.lock();
+  MS_EXCEPTION_IF_NULL(node);
+  if (!node->isa<CNode>()) {
+    MS_LOG(EXCEPTION) << "anfnode is not a cnode";
+  }
+  auto cnode = node->cast<CNodePtr>();
+  MS_EXCEPTION_IF_NULL(cnode);
+  MS_LOG(INFO) << "InferShape start, node:" << cnode->fullname_with_scope();
+  GetDepndLists(cnode);
+  auto ret = InferShapeForDefiniteOutputNode(cnode);
+  if (ret) {
+    return;
+  }
+  depend_tensor_map_.clear();
+  // Only inputs[0] is read, so bind by reference instead of copying the whole input vector.
+  const auto &inputs = cnode->inputs();
+  if (inputs.empty()) {
+    MS_LOG(EXCEPTION) << "Invalid inputs";
+  }
+  auto context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(context);
+  AbstractBasePtrList args_spec_list;
+  auto primitive = GetValueNode<PrimitivePtr>(inputs[0]);
+  auto input_size = AnfAlgo::GetInputTensorNum(cnode);
+  std::vector<AnfNodePtr> input_nodes;
+  for (size_t i = 0; i < input_size; i++) {
+    auto input_node_with_index = AnfAlgo::GetPrevNodeOutput(cnode, i);
+    auto real_input = input_node_with_index.first;
+    MS_EXCEPTION_IF_NULL(real_input);
+    auto cnode_input = cnode->input(i + 1);
+    MS_EXCEPTION_IF_NULL(cnode_input);
+    // May re-point real_input to another nop node of the chain; input_node_with_index is left untouched.
+    InferShapeForNopNode(&real_input);
+    if (depend_list_.find(i) != depend_list_.end()) {
+      // Input i is value-dependent: its data is needed on host to infer the output shape.
+      bool skip_nop_node = !context->get_param<bool>(MS_CTX_ENABLE_MINDRT);
+      auto output_addr = AnfAlgo::GetPrevNodeMutableOutputAddr(cnode, i, skip_nop_node);
+      // Reuse the producer looked up at the top of the loop instead of querying it a second time.
+      std::vector<int64_t> shapes =
+        trans::GetRuntimePaddingShape(input_node_with_index.first, input_node_with_index.second);
+      auto host_type = AnfAlgo::GetOutputInferDataType(input_node_with_index.first, input_node_with_index.second);
+      auto out_tensor = std::make_shared<tensor::Tensor>(host_type, shapes);
+      MS_EXCEPTION_IF_NULL(out_tensor);
+      // The second parameter must be false, otherwise the device address cannot be released and allocated, and the
+      // address size will be wrong in the dynamic shape scenario.
+      out_tensor->set_device_address(output_addr, false);
+      auto ret2 = depend_tensor_map_.try_emplace(i, out_tensor);
+      if (!ret2.second) {
+        MS_LOG(EXCEPTION) << "Insert map failed";
+      }
+      out_tensor->data_sync();
+      // cppcheck-suppress unreadVariable
+      auto lock = AnfUtils::GetAbstractLock(real_input.get());
+      MS_EXCEPTION_IF_NULL(real_input->abstract());
+      // Work on a clone and install it afterwards, so the shared abstract is never modified in place.
+      auto real_abs = real_input->abstract()->Clone();
+      if (real_abs->isa<abstract::AbstractTensor>()) {
+        real_abs->set_value(out_tensor);
+      } else if (real_abs->isa<abstract::AbstractTuple>()) {
+        auto tuple_get_item_index = AnfAlgo::GetTupleGetItemOutIndex(cnode_input->cast<CNodePtr>());
+        auto abstract_tuple = real_abs->cast<abstract::AbstractTuplePtr>();
+        MS_EXCEPTION_IF_NULL(abstract_tuple);
+        const auto &elements = abstract_tuple->elements();
+        // Guard the element access: an out-of-range index would otherwise read past the end of the list.
+        if (tuple_get_item_index >= elements.size()) {
+          MS_LOG(EXCEPTION) << "Tuple element index " << tuple_get_item_index << " is out of range, tuple size: "
+                            << elements.size() << ", node: " << cnode->fullname_with_scope();
+        }
+        auto tuple_elements = elements[tuple_get_item_index];
+        MS_EXCEPTION_IF_NULL(tuple_elements);
+        tuple_elements->set_value(out_tensor);
+      }
+      real_input->set_abstract(real_abs);
+    }
+    bool is_cnode_input = AnfAlgo::AddArgList(&args_spec_list, cnode_input, real_input, i);
+    if (is_cnode_input) {
+      input_nodes.push_back(cnode_input);
+    } else {
+      input_nodes.push_back(real_input);
+    }
+  }
+  // Hold the abstract locks of all inputs while the infer function reads their abstracts.
+  std::vector<AbstractScope> locks;
+  (void)std::transform(input_nodes.begin(), input_nodes.end(), std::back_inserter(locks),
+                       [](const AnfNodePtr &input) { return AnfUtils::GetAbstractLock(input.get()); });
+  auto eval_result = opt::CppInferShape(primitive, args_spec_list);
+  // Release the input locks before taking the lock of the node itself.
+  locks.clear();
+  // cppcheck-suppress unreadVariable
+  auto lock = AnfUtils::GetAbstractLock(cnode.get());
+  cnode->set_abstract(eval_result);
+}
+
+/**
+ * Shortcut inference for Shape nodes.
+ *
+ * Returns false without touching the node when cnode is not a Shape primitive, or when its current shape
+ * contains kInvalidShape (-2). Otherwise replaces the node's abstract with a clone whose shape is the 1-D
+ * shape {number of dimensions of the current shape} and returns true.
+ *
+ * Raises an exception when the node does not have exactly one input, when its shape is missing or is not
+ * an abstract::Shape, or when it has no abstract.
+ */
+bool KernelMod::InferShapeForDefiniteOutputNode(const CNodePtr &cnode) {
+  MS_EXCEPTION_IF_NULL(cnode);
+  if (!AnfAlgo::CheckPrimitiveType(cnode, prim::kPrimShape)) {
+    return false;
+  }
+  auto input_size = AnfAlgo::GetInputTensorNum(cnode);
+  if (input_size != 1) {
+    MS_LOG(EXCEPTION) << "Node should have exactly one input, but got " << input_size << ": "
+                      << cnode->fullname_with_scope();
+  }
+  // Keep the shape object alive in a local and validate the down-cast before dereferencing it.
+  auto base_shape = cnode->Shape();
+  MS_EXCEPTION_IF_NULL(base_shape);
+  auto shape_ptr = dynamic_cast<mindspore::abstract::Shape *>(base_shape.get());
+  if (shape_ptr == nullptr) {
+    MS_LOG(EXCEPTION) << "The shape of node is not an abstract::Shape: " << cnode->fullname_with_scope();
+  }
+  const auto &cur_shape = shape_ptr->shape();
+  if (std::any_of(cur_shape.begin(), cur_shape.end(), [](int64_t x) { return x == kInvalidShape; })) {
+    return false;
+  }
+  std::vector<int64_t> output_shape = {static_cast<int64_t>(cur_shape.size())};
+  mindspore::abstract::BaseShapePtr shape = std::make_shared<mindspore::abstract::Shape>(output_shape);
+
+  // cppcheck-suppress unreadVariable
+  auto lock = AnfUtils::GetAbstractLock(cnode.get());
+  // Check the abstract itself before cloning it; the clone is checked afterwards as before.
+  MS_EXCEPTION_IF_NULL(cnode->abstract());
+  auto abstract = cnode->abstract()->Clone();
+  MS_EXCEPTION_IF_NULL(abstract);
+  abstract->set_shape(shape);
+  cnode->set_abstract(abstract);
+  return true;
+}
+
+/**
+ * Infers the shapes of a chain of nop nodes that feeds an input.
+ *
+ * Does nothing unless *input_node is a nop node with dynamic shape. Otherwise it follows input 0 of each
+ * nop node until a non-nop producer is reached, then runs AnfAlgo::InferShape on the collected nop nodes in
+ * reverse order of discovery, i.e. the one nearest the real producer first.
+ *
+ * @param input_node In/out: on return *input_node refers to the last nop node found while walking the
+ *                   chain, which may differ from the node passed in.
+ */
+void KernelMod::InferShapeForNopNode(AnfNodePtr *input_node) {
+  // Validate the out-parameter itself before dereferencing it.
+  MS_EXCEPTION_IF_NULL(input_node);
+  MS_EXCEPTION_IF_NULL(*input_node);
+  if (!opt::IsNopNode(*input_node) || !AnfAlgo::IsDynamicShape(*input_node)) {
+    MS_LOG(INFO) << "Input node is not a nop node, no need infer.";
+    return;
+  }
+  MS_LOG(INFO) << "Infer shape for nop node.";
+  std::stack<AnfNodePtr> nop_road;
+  nop_road.push(*input_node);
+
+  // Walk towards the producer, remembering every nop node on the way.
+  /*lint -e716*/
+  while (true) {
+    auto input_node_with_idx = AnfAlgo::GetPrevNodeOutput(*input_node, 0);
+    auto in_node = input_node_with_idx.first;
+    MS_EXCEPTION_IF_NULL(in_node);
+    if (opt::IsNopNode(in_node)) {
+      nop_road.push(in_node);
+      *input_node = in_node;
+    } else {
+      break;
+    }
+  }
+
+  // Unwind the stack so each nop node is inferred after the node that feeds it.
+  /*lint +e716*/
+  while (!nop_road.empty()) {
+    auto nop_node = nop_road.top();
+    MS_EXCEPTION_IF_NULL(nop_node);
+    AnfAlgo::InferShape(nop_node->cast<CNodePtr>());
+    nop_road.pop();
+  }
+}
+
+/**
+ * Fills depend_list_ with the value-dependent input indexes reported by abstract::GetDependsFormMap.
+ *
+ * The lookup is skipped when depend_list_ already has entries. A node without dependencies leaves the
+ * set empty, so the lookup is repeated on the next call for such nodes.
+ */
+void KernelMod::GetDepndLists(const CNodePtr &cnode) {
+  MS_EXCEPTION_IF_NULL(cnode);
+  if (!depend_list_.empty()) {
+    return;
+  }
+  auto ret = abstract::GetDependsFormMap(cnode);
+  if (ret.empty()) {
+    MS_LOG(DEBUG) << "No dynamic_shape_depends found";
+    return;
+  }
+  MS_LOG(INFO) << "Have depends";
+  // Convert straight to the element type of depend_list_ (uint32_t) rather than going through int.
+  (void)std::transform(ret.begin(), ret.end(), std::inserter(depend_list_, depend_list_.begin()),
+                       [](const int64_t &value) { return static_cast<uint32_t>(value); });
+  MS_LOG(INFO) << "Init End";
+}
+}  // namespace kernel
+}  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/kernel.h
@ -18,6 +18,8 @@
 #include <vector>
 #include <string>
 #include <memory>
+#include <map>
+#include <set>
 #include "nlohmann/json.hpp"
 #include "ir/anf.h"
 #include "ir/dtype.h"
@ -180,6 +182,8 @@ struct KernelLaunchInfo {

 class KernelMod {
 public:
+  KernelMod() {}
+  explicit KernelMod(const AnfNodePtr &anf_node_ptr) : anf_node_(anf_node_ptr) {}
  virtual const std::vector<size_t> &GetInputSizeList() const = 0;
  virtual const std::vector<size_t> &GetOutputSizeList() const = 0;
  virtual const std::vector<size_t> &GetWorkspaceSizeList() const = 0;
@ -193,6 +197,10 @@ class KernelMod {
  virtual std::vector<size_t> GenParameters() { return {}; }
  virtual void ReleaseResource() {}

+  virtual void InferOp() {}
+  virtual void InitOp() {}
+  virtual void UpdateOp() {}
+
  virtual ~KernelMod() = default;
  void set_unique_name(const std::string &unique_name) { unique_name_ = unique_name; }
  void set_fullname(const std::string &fullname) { fullname_ = fullname; }
@ -205,18 +213,29 @@ class KernelMod {
  const std::vector<AddressPtr> &GetOutputsAddr() { return outputs_addr_; }
  void SetStream(void *stream) { stream_ = stream; }
  void *GetStream() const { return stream_; }
+  void SetAtomicCleanNodes(const std::vector<CNodePtr> &atomic_clean_node) { atomic_clean_nodes_ = atomic_clean_node; }

 protected:
+  void InferShape();
+
  std::string kernel_name_;
  std::string unique_name_;
  std::string fullname_;
  bool is_monad_{false};
  void *stream_{nullptr};
+  AnfNodeWeakPtr anf_node_;
+  std::map<uint32_t, tensor::TensorPtr> depend_tensor_map_;
+  std::vector<CNodePtr> atomic_clean_nodes_;

 private:
+  void InferShapeForNopNode(AnfNodePtr *input_node);
+  void GetDepndLists(const CNodePtr &cnode);
+  bool InferShapeForDefiniteOutputNode(const CNodePtr &cnode);
+
  std::vector<AddressPtr> inputs_addr_;
  std::vector<AddressPtr> workspaces_addr_;
  std::vector<AddressPtr> outputs_addr_;
+  std::set<uint32_t> depend_list_;
 };
 using KernelModPtr = std::shared_ptr<KernelMod>;
 }  // namespace kernel
--- a/mindspore/ccsrc/backend/kernel_compiler/tbe/dynamic_tbe_kernel_mod.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/dynamic_tbe_kernel_mod.cc
@ -0,0 +1,298 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "backend/kernel_compiler/tbe/dynamic_tbe_kernel_mod.h"
+
+#include <algorithm>
+#include <stack>
+#include "acl/acl_rt.h"
+#include "utils/ms_context.h"
+#include "backend/session/anf_runtime_algorithm.h"
+#include "runtime/device/kernel_runtime.h"
+#include "backend/optimizer/common/helper.h"
+#include "framework/common/debug/log.h"
+#include "utils/log_adapter.h"
+#include "utils/convert_utils_base.h"
+#include "runtime/device/kernel_runtime_manager.h"
+#include "runtime/kernel.h"
+#include "runtime/mem.h"
+#include "pipeline/jit/static_analysis/static_analysis.h"
+#include "runtime/device/ascend/executor/tiling/op_tiling_adapter.h"
+#include "utils/ms_device_shape_transfer.h"
+#include "utils/utils.h"
+#include "register/op_tiling.h"
+#include "nlohmann/json.hpp"
+
+namespace mindspore {
+namespace kernel {
+using TbeTaskInfoPtr = std::shared_ptr<mindspore::ge::model_runner::TbeTaskInfo>;
+using tbe::KernelManager;
+using AddressPtrList = std::vector<mindspore::kernel::AddressPtr>;
+
+// Binds the kernel pack and node to the base class, then caches the compile info of the node.
+// The compile info is only parsed when the node is a CNode; a null node raises an exception.
+DynamicTbeKernelMod::DynamicTbeKernelMod(KernelPackPtr kernel_pack, const AnfNodePtr &anf_node_ptr)
+    : TbeKernelMod(std::move(kernel_pack), anf_node_ptr) {
+  MS_EXCEPTION_IF_NULL(anf_node_ptr);
+  const auto kernel_cnode = anf_node_ptr->cast<CNodePtr>();
+  if (kernel_cnode == nullptr) {
+    return;
+  }
+  op_compile_info_ = ParseCompileJson(kernel_cnode);
+}
+
+// Frees the tiling buffer obtained with rtMalloc, if one was allocated. The rtFree result is ignored.
+DynamicTbeKernelMod::~DynamicTbeKernelMod() {
+  if (tiling_data_ptr_ == nullptr) {
+    return;
+  }
+  (void)rtFree(tiling_data_ptr_);
+}
+
+/**
+ * Re-infers the output shape of the bound node; does nothing for nodes that are not dynamic shape.
+ *
+ * Also refreshes need_skip_execute_. When the node must be skipped (see NeedSkipExecute) the output takes
+ * the device shape of input 0 and keeps its inferred data type; otherwise the generic
+ * KernelMod::InferShape() is used.
+ */
+void DynamicTbeKernelMod::InferOp() {
+  // Lock the weak pointer once and validate it before it is handed to any helper.
+  auto node = anf_node_.lock();
+  MS_EXCEPTION_IF_NULL(node);
+  if (!AnfAlgo::IsDynamicShape(node)) {
+    return;
+  }
+  auto cnode = node->cast<CNodePtr>();
+  MS_EXCEPTION_IF_NULL(cnode);
+  need_skip_execute_ = NeedSkipExecute(cnode);
+  if (need_skip_execute_) {
+    std::vector<TypeId> dtypes{AnfAlgo::GetOutputInferDataType(cnode, 0)};
+    AnfAlgo::SetOutputInferTypeAndShape(dtypes, {AnfAlgo::GetInputDeviceShape(cnode, 0)}, cnode.get());
+  } else {
+    KernelMod::InferShape();
+  }
+}
+
+/**
+ * Prepares the kernel for the current shapes.
+ *
+ * Initializes the attached atomic-clean kernels, generates the function stub/handle on first use and
+ * recomputes tiling, which updates block_dim_, tiling_data_, tiling_key_ and workspace_size_list_.
+ * Returns early after the atomic-clean step when need_skip_execute_ is set.
+ *
+ * Raises an exception when the node has expired, is not a dynamic-shape CNode, an atomic-clean node has no
+ * kernel mod, or the function stub cannot be generated.
+ */
+void DynamicTbeKernelMod::InitOp() {
+  auto node = anf_node_.lock();
+  MS_EXCEPTION_IF_NULL(node);
+  auto cnode = node->cast<CNodePtr>();
+  MS_EXCEPTION_IF_NULL(cnode);
+
+  if (!AnfAlgo::IsDynamicShape(cnode)) {
+    MS_LOG(EXCEPTION) << "The node is not dynamic shape: " << cnode->fullname_with_scope();
+  }
+
+  for (const auto &atomic_clean_node : atomic_clean_nodes_) {
+    // Launch() checks this pointer as well; a missing kernel mod must not be dereferenced.
+    auto atomic_kernel_mod = AnfAlgo::GetKernelMod(atomic_clean_node);
+    MS_EXCEPTION_IF_NULL(atomic_kernel_mod);
+    atomic_kernel_mod->InitOp();
+  }
+
+  if (need_skip_execute_) {
+    return;
+  }
+
+  // gen FuncStub
+  if (handle_ == nullptr) {
+    auto func_stub = KernelManager::GenFuncStub(*kernel_pack_, false, &block_dim_, true, &handle_, &origin_key_);
+    if (func_stub != 1) {
+      MS_LOG(EXCEPTION) << "GenFuncStub failed.";
+    }
+  }
+
+  // start compute tiling
+  MS_LOG(INFO) << "Start compute tiling of: " << cnode->fullname_with_scope();
+  optiling::utils::OpRunInfo op_run_info_v2(-1, true, 0);
+  device::tiling::OpTilingCalculateAdapter converter;
+  ::ge::ComputeGraphPtr ge_graph = std::make_shared<::ge::ComputeGraph>("default");
+  auto ge_node = converter.AnfNodeToGeNodeAdapter(cnode, &ge_graph, depend_tensor_map_, op_compile_info_);
+  // NOTE(review): the tiling status is discarded on purpose here; confirm that a failed tiling cannot
+  // leave stale values in op_run_info_v2.
+  (void)optiling::OpParaCalculateV2(ge_node, op_run_info_v2);
+
+  block_dim_ = op_run_info_v2.GetBlockDim();
+  std::vector<int64_t> workspace_size_list;
+  op_run_info_v2.GetAllWorkspaces(workspace_size_list);
+  tiling_data_ = op_run_info_v2.GetAllTilingData().str();
+  tiling_key_ = op_run_info_v2.GetTilingKey();
+
+  // Publish the workspace sizes reported by tiling as size_t values.
+  workspace_size_list_.clear();
+  workspace_size_list_.resize(workspace_size_list.size());
+  std::transform(workspace_size_list.begin(), workspace_size_list.end(), workspace_size_list_.begin(),
+                 [](int64_t size) { return static_cast<size_t>(size); });
+}
+
+// Fetches the compile info of cnode through TbeUtils::GetCompileInfo and returns it.
+// Raises an exception when cnode is null or the compile info could not be obtained.
+std::string DynamicTbeKernelMod::ParseCompileJson(const CNodePtr &cnode) {
+  MS_EXCEPTION_IF_NULL(cnode);
+
+  std::string compile_info;
+  bool found = true;
+  TbeUtils::GetCompileInfo(cnode, &compile_info, &found);
+  if (!found) {
+    MS_LOG(EXCEPTION) << "Get compile_info failed. The compile result of [" << cnode->fullname_with_scope()
+                      << "] maybe not in the json file(kernel_meta/) or the file had been deleted.";
+  }
+  MS_LOG(INFO) << "Node: " << cnode->fullname_with_scope() << " get compile_info: " << compile_info;
+  return compile_info;
+}
+
+/**
+ * Allocates the tiling buffer once, sized by op_para_size from the kernel json info.
+ *
+ * Does nothing when the buffer already exists or op_para_size is 0. Raises an exception when kernel_pack_
+ * is null or rtMalloc fails.
+ */
+void DynamicTbeKernelMod::InitTilingDataPtr() {
+  if (tiling_data_ptr_ != nullptr) {
+    return;
+  }
+  // kernel_pack_ is dereferenced below, so reject a null pack here.
+  MS_EXCEPTION_IF_NULL(kernel_pack_);
+  auto kernel_json_info = kernel_pack_->kernel_json_info();
+  auto op_para_size = kernel_json_info.op_para_size;
+  if (op_para_size > 0) {
+    auto ret = rtMalloc(&tiling_data_ptr_, op_para_size, RT_MEMORY_HBM);
+    if (ret != RT_ERROR_NONE) {
+      MS_LOG(EXCEPTION) << "rtMalloc tiling data failed";
+    }
+  }
+}
+
+/**
+ * Copies the tiling data held in tiling_data_ into the tiling buffer with aclrtMemcpyAsync on stream_ptr.
+ *
+ * The copy is skipped when there is no tiling data or no tiling buffer.
+ *
+ * @return Always true; every failure is reported by raising an exception (null kernel pack, tiling data
+ *         larger than op_para_size, or a failed copy).
+ */
+bool DynamicTbeKernelMod::CopyTilingToDevice(void *stream_ptr) {
+  // Check the pack first: InitTilingDataPtr() reads it too.
+  MS_EXCEPTION_IF_NULL(kernel_pack_);
+  InitTilingDataPtr();
+  auto kernel_json_info = kernel_pack_->kernel_json_info();
+
+  auto op_para_size = kernel_json_info.op_para_size;
+  if (tiling_data_.size() > op_para_size) {
+    MS_LOG(EXCEPTION) << "Compute tiling size:" << tiling_data_.size()
+                      << " larger than tbe build op_para_size:" << op_para_size;
+  }
+
+  if (tiling_data_.empty() || tiling_data_ptr_ == nullptr) {
+    MS_LOG(INFO) << "Tiling size is 0, skip aclrtMemcpyAsync";
+    return true;
+  }
+  // cppcheck-suppress unreadVariable
+  auto lock = AscendKernelMod::LockRuntime();
+  auto ret = aclrtMemcpyAsync(tiling_data_ptr_, op_para_size, tiling_data_.c_str(), tiling_data_.size(),
+                              ACL_MEMCPY_HOST_TO_DEVICE, stream_ptr);
+  if (ret != RT_ERROR_NONE) {
+    MS_LOG(EXCEPTION) << "Tiling aclrtMemcpyAsync failed, ret:" << ret;
+  }
+  return true;
+}
+
+/**
+ * Tells whether execution of cnode must be skipped.
+ *
+ * @return true only for a ReduceSum node whose axes input (input index 1) has a tensor abstract and a
+ *         device shape with at least one dimension equal to 0; false in every other case.
+ * Raises an exception when cnode, the axes input or its abstract is null.
+ */
+bool DynamicTbeKernelMod::NeedSkipExecute(const CNodePtr &cnode) {
+  // Skip run ReduceSum when axis is a Empty Tensor
+  MS_EXCEPTION_IF_NULL(cnode);
+  auto op_name = AnfAlgo::GetCNodeName(cnode);
+  if (op_name != kReduceSumOpName) {
+    return false;
+  }
+
+  const size_t axes_index = 1;
+  // inputs()[0] is the primitive, so the axes tensor sits at axes_index + 1.
+  if (cnode->inputs().size() <= axes_index + 1) {
+    return false;
+  }
+  auto input_axes = cnode->input(axes_index + 1);
+  MS_EXCEPTION_IF_NULL(input_axes);
+  // cppcheck-suppress unreadVariable
+  auto lock = AnfUtils::GetAbstractLock(input_axes.get());
+  // Validate the abstract before cloning it instead of only checking the clone.
+  MS_EXCEPTION_IF_NULL(input_axes->abstract());
+  auto axes_abs = input_axes->abstract()->Clone();
+  MS_EXCEPTION_IF_NULL(axes_abs);
+  auto axes_shape = AnfAlgo::GetInputDeviceShape(cnode, axes_index);
+  if (axes_abs->isa<abstract::AbstractTensor>()) {
+    if (std::any_of(axes_shape.begin(), axes_shape.end(), [](auto dim) { return dim == 0; })) {
+      return true;
+    }
+  }
+  return false;
+}
+
+/**
+ * Launches the dynamic-shape TBE kernel on stream_ptr.
+ *
+ * Order of work:
+ *  1. Launch every attached atomic-clean kernel.
+ *  2. If need_skip_execute_ is set, copy input 0 to output 0 device-to-device and return.
+ *  3. Copy the tiling data to the device and launch the kernel through its handle, passing input, output
+ *     and workspace addresses followed by the tiling buffer (when there is one).
+ *
+ * @param inputs     Device addresses of the kernel inputs.
+ * @param workspace  Device addresses of the workspaces.
+ * @param outputs    Device addresses of the kernel outputs.
+ * @param stream_ptr Stream to launch on; must not be null.
+ * @return true on success; false when stream_ptr or the kernel pack is null, an atomic-clean kernel fails
+ *         to launch, or rtKernelLaunchWithHandle reports an error.
+ * Raises an exception when the bound node is invalid or not dynamic shape, or when a memory copy fails.
+ */
+bool DynamicTbeKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+                                 const std::vector<AddressPtr> &outputs, void *stream_ptr) {
+  if (stream_ptr == nullptr) {
+    MS_LOG(ERROR) << "stream_ptr should not be nullptr.";
+    return false;
+  }
+
+  if (kernel_pack_ == nullptr) {
+    MS_LOG(ERROR) << "kernel pack should not be nullptr.";
+    return false;
+  }
+  if (stream_ == nullptr) {
+    stream_ = stream_ptr;
+  }
+
+  auto node = anf_node_.lock();
+  MS_EXCEPTION_IF_NULL(node);
+  if (!node->isa<CNode>()) {
+    MS_LOG(EXCEPTION) << "anfnode is not a cnode";
+  }
+  auto cnode = node->cast<CNodePtr>();
+  MS_EXCEPTION_IF_NULL(cnode);
+
+  // is dynamic shape
+  if (!AnfAlgo::IsDynamicShape(cnode)) {
+    MS_LOG(EXCEPTION) << "The cnode is not dynamic shape:" << cnode->fullname_with_scope();
+  }
+
+  // launch atomic_cleans first
+  for (const auto &atomic_clean_node : atomic_clean_nodes_) {
+    KernelLaunchInfo kernel_launch_info;
+    auto kernel_mod = AnfAlgo::GetKernelMod(atomic_clean_node);
+    MS_EXCEPTION_IF_NULL(kernel_mod);
+    device::KernelRuntime::GenLaunchArgs(*kernel_mod, atomic_clean_node, &kernel_launch_info);
+    auto atomic_inputs = kernel_launch_info.inputs_;
+    std::vector<AddressPtr> atomic_outputs;
+    std::vector<AddressPtr> atomic_workspace;
+    // Do not run the kernel itself when one of its atomic-clean kernels failed to launch.
+    if (!kernel_mod->Launch(atomic_inputs, atomic_workspace, atomic_outputs, stream_ptr)) {
+      MS_LOG(ERROR) << "Launch atomic clean kernel failed. Node info: " << cnode->fullname_with_scope();
+      return false;
+    }
+  }
+
+  // need skip, for reducesum empty input axis
+  if (need_skip_execute_) {
+    // Skip reduce if axis is a empty Tensor (shape = 0)
+    MS_LOG(INFO) << "The node " << cnode->fullname_with_scope() << " Need Skip.";
+    // The copy below reads inputs[0] and outputs[0]; make sure both exist.
+    if (inputs.empty() || outputs.empty()) {
+      MS_LOG(EXCEPTION) << "The inputs or outputs is empty for " << cnode->fullname_with_scope();
+    }
+    MS_EXCEPTION_IF_NULL(inputs[0]);
+    MS_EXCEPTION_IF_NULL(outputs[0]);
+    // cppcheck-suppress unreadVariable
+    auto lock = AscendKernelMod::LockRuntime();
+    // NOTE(review): the destination capacity is given as inputs[0]->size rather than outputs[0]->size, so
+    // the runtime cannot detect an output buffer that is too small - confirm this is intended.
+    rtError_t status = aclrtMemcpyAsync(outputs[0]->addr, inputs[0]->size, inputs[0]->addr, inputs[0]->size,
+                                        ACL_MEMCPY_DEVICE_TO_DEVICE, stream_ptr);
+    if (status != RT_ERROR_NONE) {
+      MS_LOG(EXCEPTION) << "aclrtMemcpyAsync failed for " << cnode->fullname_with_scope();
+    }
+
+    MS_LOG(INFO) << "Execute node:" << cnode->fullname_with_scope() << " success.";
+    return true;
+  }
+
+  // copy tiling to device
+  if (!CopyTilingToDevice(stream_ptr)) {
+    MS_LOG(EXCEPTION) << "Copy tiling to device failed. op name: " << cnode->fullname_with_scope();
+  }
+
+  // pack all addresses into a vector.
+  std::vector<void *> runtimeargs;
+  (void)std::transform(std::begin(inputs), std::end(inputs), std::back_inserter(runtimeargs),
+                       [](const AddressPtr &input) -> void * { return input->addr; });
+  (void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(runtimeargs),
+                       [](const AddressPtr &output) -> void * { return output->addr; });
+  if (!workspace.empty()) {
+    (void)std::transform(std::begin(workspace), std::end(workspace), std::back_inserter(runtimeargs),
+                         [](const AddressPtr &addr) -> void * { return addr->addr; });
+  }
+
+  // The tiling buffer, when present, is passed as the last kernel argument.
+  if (!tiling_data_.empty() && tiling_data_ptr_ != nullptr) {
+    runtimeargs.push_back(tiling_data_ptr_);
+  }
+
+  rtL2Ctrl_t *l2ctrl = nullptr;
+  auto args_size = static_cast<uint32_t>(UlongToUint(sizeof(void *)) * runtimeargs.size());
+  auto node_info = cnode->fullname_with_scope();
+  // Keys that contain "kernel0" are used as they are; all others get "_<tiling_key_>" appended.
+  const auto dev_func =
+    origin_key_.find("kernel0") != origin_key_.npos ? origin_key_ : origin_key_ + "_" + std::to_string(tiling_key_);
+  const auto kernel_info = node_info + "/" + std::to_string(tiling_key_);
+  // cppcheck-suppress unreadVariable
+  auto lock = AscendKernelMod::LockRuntime();
+  auto ret = rtKernelLaunchWithHandle(handle_, dev_func.c_str(), block_dim_, runtimeargs.data(), args_size, l2ctrl,
+                                      stream_ptr, kernel_info.c_str());
+  if (ret != RT_ERROR_NONE) {
+    MS_LOG(ERROR) << "Call runtime rtKernelLaunchWithHandle error. Node info: " << node_info;
+    return false;
+  }
+
+  return true;
+}
+}  // namespace kernel
+}  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/tbe/dynamic_tbe_kernel_mod.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/dynamic_tbe_kernel_mod.h
@ -0,0 +1,65 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_TBE_DYNAMIC_TBE_KERNEL_MOD_H_
+#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_TBE_DYNAMIC_TBE_KERNEL_MOD_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+#include <utility>
+#include <map>
+#include "backend/kernel_compiler/tbe/tbe_kernel_mod.h"
+#include "backend/kernel_compiler/tbe/tbe_utils.h"
+#include "runtime/device/device_address.h"
+#include "ir/tensor.h"
+
+namespace mindspore {
+namespace kernel {
+/**
+ * TBE kernel mod for dynamic-shape operators.
+ *
+ * InferOp() re-infers the output shape, InitOp() creates the kernel handle and recomputes tiling, and
+ * Launch() copies the tiling data to the device and starts the kernel.
+ * NOTE(review): the InferOp -> InitOp -> Launch call order is assumed from the data each step produces
+ * for the next one; confirm against the runtime that drives these calls.
+ */
+class DynamicTbeKernelMod : public TbeKernelMod {
+ public:
+  // maybe delete later
+  explicit DynamicTbeKernelMod(KernelPackPtr kernel_pack) : TbeKernelMod(std::move(kernel_pack)) {}
+  DynamicTbeKernelMod(KernelPackPtr kernel_pack, const AnfNodePtr &anf_node_ptr);
+  ~DynamicTbeKernelMod() override;
+
+  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+              const std::vector<AddressPtr> &outputs, void *stream_ptr) override;
+
+  void InferOp() override;
+  void InitOp() override;
+
+ private:
+  // Returns the compile info of cnode; raises an exception when it cannot be obtained.
+  std::string ParseCompileJson(const CNodePtr &cnode);
+  // Allocates tiling_data_ptr_ on first use.
+  void InitTilingDataPtr();
+  // Copies tiling_data_ into tiling_data_ptr_ on the given stream.
+  bool CopyTilingToDevice(void *stream_ptr);
+  // True for a ReduceSum node whose axes input is an empty tensor.
+  bool NeedSkipExecute(const CNodePtr &cnode);
+
+  uint32_t block_dim_ = 1;            // block dim passed to the kernel launch, updated by tiling
+  std::string tiling_data_;           // host copy of the tiling data computed in InitOp
+  void *tiling_data_ptr_ = nullptr;   // buffer from rtMalloc that receives the tiling data
+  uint32_t tiling_key_{0};            // tiling key reported by the tiling computation
+  void *handle_ = nullptr;            // kernel handle filled in by KernelManager::GenFuncStub
+  std::string origin_key_{""};        // kernel key filled in by GenFuncStub, base of the device function name
+  std::string op_compile_info_{};     // compile info parsed in the constructor
+  bool need_skip_execute_ = false;    // result of NeedSkipExecute from the last InferOp
+};
+
+using DynamicTbeKernelModPtr = std::shared_ptr<DynamicTbeKernelMod>;
+}  // namespace kernel
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_TBE_DYNAMIC_TBE_KERNEL_MOD_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_mod.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_mod.cc
@ -15,6 +15,8 @@
 */

 #include "backend/kernel_compiler/tbe/tbe_kernel_mod.h"
+
+#include <algorithm>
 #include "runtime/rt.h"
 #include "utils/ms_context.h"
 #include "runtime/device/ascend/ge_runtime/task_info.h"
@ -41,6 +43,20 @@ bool TbeKernelMod::Launch(const std::vector<mindspore::kernel::AddressPtr> &inpu
  if (stream_ == nullptr) {
    stream_ = stream_ptr;
  }
+  // launch atomic_cleans first
+  if (!atomic_clean_nodes_.empty()) {
+    for (const auto &atomic_clean_node : atomic_clean_nodes_) {
+      KernelLaunchInfo kernel_launch_info;
+      auto kernel_mod = AnfAlgo::GetKernelMod(atomic_clean_node);
+      MS_EXCEPTION_IF_NULL(kernel_mod);
+      device::KernelRuntime::GenLaunchArgs(*kernel_mod, atomic_clean_node, &kernel_launch_info);
+      auto atomic_inputs = kernel_launch_info.inputs_;
+      std::vector<AddressPtr> atomic_outputs;
+      std::vector<AddressPtr> atomic_workspace;
+      kernel_mod->Launch(atomic_inputs, atomic_workspace, atomic_outputs, stream_ptr);
+    }
+  }
+
  uint32_t blockdim = 1;  // default blockdim equal to 1.
  auto func_stub = KernelManager::GenFuncStub(*kernel_pack_, false, &blockdim);
  if (func_stub == 0) {
@ -61,6 +77,7 @@ bool TbeKernelMod::Launch(const std::vector<mindspore::kernel::AddressPtr> &inpu
  rtL2Ctrl_t *l2ctrl = nullptr;
  const void *stubFunc = reinterpret_cast<void *>(func_stub);
  auto argsSize = static_cast<uint32_t>(UlongToUint(sizeof(void *)) * runtimeargs.size());
+  auto lock = AscendKernelMod::LockRuntime();
  auto ret = rtKernelLaunch(stubFunc, blockdim, runtimeargs.data(), argsSize, l2ctrl, stream_);
  if (ret != RT_ERROR_NONE) {
    MS_LOG(ERROR) << "Call runtime rtKernelLaunch error.";
--- a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_mod.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_mod.h
@ -29,6 +29,8 @@ namespace kernel {
 class TbeKernelMod : public AscendKernelMod {
 public:
  explicit TbeKernelMod(KernelPackPtr kernel_pack) : kernel_pack_(std::move(kernel_pack)) {}
+  TbeKernelMod(KernelPackPtr kernel_pack, const AnfNodePtr &anf_node_ptr)
+      : AscendKernelMod(anf_node_ptr), kernel_pack_(std::move(kernel_pack)) {}
  ~TbeKernelMod() override = default;

  void SetInputSizeList(const std::vector<size_t> &size_list) { input_size_list_ = size_list; }
@ -45,7 +47,7 @@ class TbeKernelMod : public AscendKernelMod {
  device::DynamicKernelPtr GenDynamicKernel(const CNodePtr &cnode_ptr, void *stream_ptr) override;
  std::vector<size_t> GenParameters() override;

- private:
+ protected:
  KernelPackPtr kernel_pack_;
  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
--- a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc
+++ b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc
@ -729,9 +729,8 @@ KernelWithIndex AnfRuntimeAlgorithm::GetPrevNodeOutput(const AnfNodePtr &anf_nod
  auto kernel_info = anf_node->kernel_info();
  if (kernel_info) {
    auto runtime_cache = kernel_info->runtime_cache();
-    MS_EXCEPTION_IF_NULL(runtime_cache);
-    if (runtime_cache->is_valid()) {
-      auto output = runtime_cache->get_prev_node_output(input_idx);
+    if (runtime_cache.runtime_cache().is_valid()) {
+      auto output = runtime_cache.runtime_cache().get_prev_node_output(input_idx);
      if (output.first != nullptr) {
        return output;
      }
@ -747,9 +746,8 @@ KernelWithIndex AnfRuntimeAlgorithm::GetPrevNodeOutput(const AnfNodePtr &anf_nod
  }
  if (kernel_info) {
    auto runtime_cache = kernel_info->runtime_cache();
-    MS_EXCEPTION_IF_NULL(runtime_cache);
-    if (runtime_cache->is_valid()) {
-      runtime_cache->set_prev_node_output(input_idx, res);
+    if (runtime_cache.runtime_cache().is_valid()) {
+      runtime_cache.runtime_cache().set_prev_node_output(input_idx, res);
    }
  }
  return res;
@ -2065,7 +2063,7 @@ std::vector<int64_t> AnfRuntimeAlgorithm::GetOutputMinShape(const AnfNodePtr &an
  }
 }

-bool IsNodeInputDynamicShape(const CNodePtr &anf_node_ptr) {
+bool AnfRuntimeAlgorithm::IsNodeInputDynamicShape(const CNodePtr &anf_node_ptr) {
  MS_EXCEPTION_IF_NULL(anf_node_ptr);
  auto input_num = AnfAlgo::GetInputTensorNum(anf_node_ptr);
  for (size_t i = 0; i < input_num; ++i) {
@ -2274,6 +2272,7 @@ void AnfRuntimeAlgorithm::InferShape(const CNodePtr &node, std::map<uint32_t, te
  AbstractBasePtrList args_spec_list;
  auto primitive = GetValueNode<PrimitivePtr>(inputs[0]);
  auto input_size = AnfAlgo::GetInputTensorNum(node);
+  std::vector<AnfNodePtr> input_nodes;
  for (size_t i = 0; i < input_size; ++i) {
    auto input_with_index = AnfAlgo::GetPrevNodeOutput(node, i);
    auto real_input = input_with_index.first;
@ -2289,9 +2288,12 @@ void AnfRuntimeAlgorithm::InferShape(const CNodePtr &node, std::map<uint32_t, te
          // sync data from device to host
          tensor_ptr->data_sync();
        }
-        auto real_abs = real_input->abstract();
+        // cppcheck-suppress unreadVariable
+        auto lock = AnfUtils::GetAbstractLock(real_input.get());
+        MS_EXCEPTION_IF_NULL(real_input->abstract());
+        auto real_abs = real_input->abstract()->Clone();
        if (real_abs->isa<abstract::AbstractTensor>()) {
-          real_input->abstract()->set_value(tensor_ptr);
+          real_abs->set_value(tensor_ptr);
        } else if (real_abs->isa<abstract::AbstractTuple>()) {
          auto tuple_get_item_index = AnfAlgo::GetTupleGetItemOutIndex(cnode_input->cast<CNodePtr>());
          auto abstract_tuple = real_abs->cast<abstract::AbstractTuplePtr>();
@ -2299,15 +2301,27 @@ void AnfRuntimeAlgorithm::InferShape(const CNodePtr &node, std::map<uint32_t, te
          auto tuple_elements = abstract_tuple->elements()[tuple_get_item_index];
          tuple_elements->set_value(tensor_ptr);
        }
+        real_input->set_abstract(real_abs);
      }
    }
-    AddArgList(&args_spec_list, cnode_input, real_input, i);
+    bool is_cnode_input = AddArgList(&args_spec_list, cnode_input, real_input, i);
+    if (is_cnode_input) {
+      input_nodes.push_back(cnode_input);
+    } else {
+      input_nodes.push_back(real_input);
+    }
  }
+  std::vector<AbstractScope> locks;
+  std::transform(input_nodes.begin(), input_nodes.end(), std::back_inserter(locks),
+                 [](const AnfNodePtr &input) { return AnfUtils::GetAbstractLock(input.get()); });
  auto eval_result = opt::CppInferShape(primitive, args_spec_list);
+  locks.clear();
+  // cppcheck-suppress unreadVariable
+  auto lock = AnfUtils::GetAbstractLock(node.get());
  node->set_abstract(eval_result);
 }

-void AnfRuntimeAlgorithm::AddArgList(AbstractBasePtrList *args_spec_list, const AnfNodePtr &cnode_input,
+bool AnfRuntimeAlgorithm::AddArgList(AbstractBasePtrList *args_spec_list, const AnfNodePtr &cnode_input,
                                     const AnfNodePtr &real_input, size_t index) {
  if (AnfAlgo::CheckPrimitiveType(cnode_input, prim::kPrimTupleGetItem)) {
    auto base_shape = real_input->Shape();
@ -2315,15 +2329,24 @@ void AnfRuntimeAlgorithm::AddArgList(AbstractBasePtrList *args_spec_list, const
      MS_LOG(EXCEPTION) << "Node input is a tuple_get_item but real input node shape is not a TupleShape. trace: "
                        << trace::DumpSourceLines(real_input);
    }
+    // cppcheck-suppress unreadVariable
+    auto lock = AnfUtils::GetAbstractLock(real_input.get());
    auto abs = real_input->abstract()->cast<abstract::AbstractTuplePtr>();
    MS_EXCEPTION_IF_NULL(abs);
    auto tuple_get_item_indexk = AnfAlgo::GetTupleGetItemOutIndex(cnode_input->cast<CNodePtr>());
    auto abs_i = abs->elements()[tuple_get_item_indexk];
    (void)args_spec_list->emplace_back(abs_i);
+    return false;
  } else if (cnode_input->isa<CNode>() && AnfAlgo::GetCNodeName(cnode_input) == prim::kPrimReshape->name()) {
+    // cppcheck-suppress unreadVariable
+    auto lock = AnfUtils::GetAbstractLock(cnode_input.get());
    (void)args_spec_list->emplace_back(cnode_input->abstract());
+    return true;
  } else {
+    // cppcheck-suppress unreadVariable
+    auto lock = AnfUtils::GetAbstractLock(real_input.get());
    (void)args_spec_list->emplace_back(real_input->abstract());
+    return false;
  }
 }

--- a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h
+++ b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h
@ -288,6 +288,7 @@ class AnfRuntimeAlgorithm {
  static TypeId GetCNodeOutputPrecision(const AnfNodePtr &node);
  // get fix output precision from prev node, input_idx is the input index of current node related to prev node.
  static TypeId GetPrevNodeOutputPrecision(const AnfNodePtr &node, size_t input_idx);
+  static bool IsNodeInputDynamicShape(const CNodePtr &anf_node_ptr);
  static bool IsDynamicShape(const AnfNodePtr &node);
  static bool HasDynamicShapeFlag(const PrimitivePtr &prim);
  static bool IsCondControlKernel(const CNodePtr &node);
@ -302,7 +303,8 @@ class AnfRuntimeAlgorithm {
  static bool IsNodeDynamicShape(const AnfNodePtr &node);
  static bool IsHostKernel(const CNodePtr &node);
  static void InferShape(const CNodePtr &node, std::map<uint32_t, tensor::TensorPtr> *depend_tensors = nullptr);
-  static void AddArgList(AbstractBasePtrList *args_spec_list, const AnfNodePtr &cnode_input,
+  // Returns true if cnode_input's abstract is used, false if real_input's abstract is used.
+  static bool AddArgList(AbstractBasePtrList *args_spec_list, const AnfNodePtr &cnode_input,
                         const AnfNodePtr &real_input, size_t index);
  static std::vector<size_t> GetInputRealDeviceShapeIfExist(const AnfNodePtr &anf_node, size_t index);
  static std::vector<size_t> GetOutputRealDeviceShapeIfExist(const AnfNodePtr &anf_node, size_t index);
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
@ -123,8 +123,7 @@ void AscendEnableDynamicRuntimeCache(const KernelGraph *graph) {
    }
    MS_EXCEPTION_IF_NULL(kernel_info);
    auto runtime_cache = kernel_info->runtime_cache();
-    MS_EXCEPTION_IF_NULL(runtime_cache);
-    runtime_cache->set_valid();
+    runtime_cache.runtime_cache().set_valid();
  }
 }
 }  // namespace
--- a/mindspore/ccsrc/runtime/device/ascend/executor/tiling/op_tiling_adapter.h
+++ b/mindspore/ccsrc/runtime/device/ascend/executor/tiling/op_tiling_adapter.h
@ -37,21 +37,21 @@ class OpTilingCalculateAdapter {
  OpTilingCalculateAdapter() = default;
  ~OpTilingCalculateAdapter() = default;

-  ge::Operator AnfNodeToGeNodeAdapter(const CNodePtr &node, ge::ComputeGraphPtr *ge_graph,
-                                      const std::map<uint32_t, tensor::TensorPtr> &depend_tensor_map,
-                                      const std::string &op_compile_info);
+  ::ge::Operator AnfNodeToGeNodeAdapter(const CNodePtr &node, ::ge::ComputeGraphPtr *ge_graph,
+                                        const std::map<uint32_t, tensor::TensorPtr> &depend_tensor_map,
+                                        const std::string &op_compile_info);

 private:
-  void ConvertInputShapeAndType(const CNodePtr &node, ge::OpDescPtr *op_desc);
-  void ConvertOutputShapeAndType(const CNodePtr &node, ge::OpDescPtr *op_desc);
-  void ConvertCompileInfo(const CNodePtr &node, ge::OpDescPtr *op_desc);
-  void ConvertAttrs(const CNodePtr &node, ge::OpDescPtr *op_desc);
-  std::vector<std::tuple<std::size_t, ge::NodePtr>> ConvertDepends(
-    const CNodePtr &node, const std::map<uint32_t, tensor::TensorPtr> &depend_tensor_map, ge::OpDescPtr *op_desc,
-    ge::ComputeGraphPtr *ge_graph);
-  ge::NodePtr NewConstantOp(const CNodePtr &node, const std::string &name, const tensor::TensorPtr &tensor_data,
-                            ge::ComputeGraphPtr *ge_graph, size_t index);
-  void AddEdge(const ge::NodePtr &ge_node, const std::vector<std::tuple<std::size_t, ge::NodePtr>> &constant_ops);
+  void ConvertInputShapeAndType(const CNodePtr &node, ::ge::OpDescPtr *op_desc);
+  void ConvertOutputShapeAndType(const CNodePtr &node, ::ge::OpDescPtr *op_desc);
+  void ConvertCompileInfo(const CNodePtr &node, ::ge::OpDescPtr *op_desc);
+  void ConvertAttrs(const CNodePtr &node, ::ge::OpDescPtr *op_desc);
+  std::vector<std::tuple<std::size_t, ::ge::NodePtr>> ConvertDepends(
+    const CNodePtr &node, const std::map<uint32_t, tensor::TensorPtr> &depend_tensor_map, ::ge::OpDescPtr *op_desc,
+    ::ge::ComputeGraphPtr *ge_graph);
+  ::ge::NodePtr NewConstantOp(const CNodePtr &node, const std::string &name, const tensor::TensorPtr &tensor_data,
+                              ::ge::ComputeGraphPtr *ge_graph, size_t index);
+  void AddEdge(const ::ge::NodePtr &ge_node, const std::vector<std::tuple<std::size_t, ::ge::NodePtr>> &constant_ops);
  std::string GetRealOpType(const std::string &op_type);
  std::string GetInputName(const CNodePtr &node, size_t index);
  std::string GetOutputName(const CNodePtr &node, size_t index);
--- a/mindspore/ccsrc/runtime/device/executor/dynamic_kernel.cc
+++ b/mindspore/ccsrc/runtime/device/executor/dynamic_kernel.cc
@ -103,7 +103,7 @@ void DynamicKernel::InferShape() {
        tuple_elements->set_value(out_tensor);
      }
    }
-    AnfAlgo::AddArgList(&args_spec_list, cnode_input, real_input, i);
+    (void)AnfAlgo::AddArgList(&args_spec_list, cnode_input, real_input, i);
  }
  auto eval_result = opt::CppInferShape(primitive, args_spec_list);
  cnode->set_abstract(eval_result);
--- a/mindspore/ccsrc/runtime/hardware/device_context.h
+++ b/mindspore/ccsrc/runtime/hardware/device_context.h
@ -164,8 +164,7 @@ class DeviceContext {
      }
      MS_EXCEPTION_IF_NULL(kernel_info);
      auto runtime_cache = kernel_info->runtime_cache();
-      MS_EXCEPTION_IF_NULL(runtime_cache);
-      runtime_cache->set_valid();
+      runtime_cache.runtime_cache().set_valid();
    }
  }

--- a/mindspore/core/ir/anf.cc
+++ b/mindspore/core/ir/anf.cc
@ -28,8 +28,21 @@
 #include "ir/func_graph.h"
 #include "ir/primitive.h"
 #include "utils/ms_context.h"
+#include "utils/anf_utils.h"

 namespace mindspore {
+// Returns a reference to this node's abstract value, reading it while the node's abstract lock is held.
+// NOTE(review): the scoped lock is destroyed when this function returns, so the returned reference itself is
+// no longer guarded afterwards; confirm that callers needing a stable view hold AnfUtils::GetAbstractLock.
+const AbstractBasePtr &AnfNode::abstract() const {
+  // cppcheck-suppress unreadVariable
+  auto lock = AnfUtils::GetAbstractLock(this);
+  return abstract_;
+}
+
+// Replaces this node's abstract value with `abs` while the node's abstract lock is held.
+void AnfNode::set_abstract(const AbstractBasePtr &abs) {
+  // cppcheck-suppress unreadVariable
+  auto lock = AnfUtils::GetAbstractLock(this);
+  abstract_ = abs;
+}
+
 // namespace to support intermediate representation definition
 CNode::CNode(const std::vector<AnfNodePtr> &inputs, const FuncGraphPtr &func_graph)
    : AnfNode(func_graph),
@ -574,9 +587,8 @@ std::string GetCNodeTarget(const AnfNodePtr &node) {
  auto kernel_info = node->kernel_info();
  if (kernel_info != nullptr) {
    auto runtime_cache = kernel_info->runtime_cache();
-    MS_EXCEPTION_IF_NULL(runtime_cache);
-    if (runtime_cache->is_valid()) {
-      auto tmp_target = runtime_cache->device_target();
+    if (runtime_cache.runtime_cache().is_valid()) {
+      auto tmp_target = runtime_cache.runtime_cache().device_target();
      if (!tmp_target.empty()) {
        return tmp_target;
      }
@ -595,9 +607,8 @@ std::string GetCNodeTarget(const AnfNodePtr &node) {

  if (kernel_info != nullptr) {
    auto runtime_cache = kernel_info->runtime_cache();
-    MS_EXCEPTION_IF_NULL(runtime_cache);
-    if (runtime_cache->is_valid()) {
-      runtime_cache->set_device_target(target);
+    if (runtime_cache.runtime_cache().is_valid()) {
+      runtime_cache.runtime_cache().set_device_target(target);
    }
  }
  return target;
--- a/mindspore/core/ir/anf.h
+++ b/mindspore/core/ir/anf.h
@ -178,12 +178,12 @@ class MS_CORE_API AnfNode : public Base {
  /// \brief Obtain the inferred abstract value of this AnfNode.
  ///
  /// \return The inferred abstract value.
-  const AbstractBasePtr &abstract() const { return abstract_; }
+  const AbstractBasePtr &abstract() const;

  /// \brief Set the abstract value of this AnfNode.
  ///
  /// \param[in] abs New abstract value.
-  void set_abstract(const AbstractBasePtr &abs) { abstract_ = abs; }
+  void set_abstract(const AbstractBasePtr &abs);

  /// \brief Obtain the intermediate abstract value of this AnfNode.
  ///
--- a/mindspore/core/ir/anf_extends.cc
+++ b/mindspore/core/ir/anf_extends.cc
@ -24,12 +24,21 @@
 #include "ir/visitor.h"
 #include "ir/func_graph.h"
 #include "base/core_ops.h"
+#include "utils/anf_utils.h"

 namespace mindspore {
 // namespace to support intermediate representation definition
 // Methods of AnfNode
-TypePtr AnfNode::Type() const { return (abstract_ == nullptr) ? nullptr : abstract_->BuildType(); }
-BaseShapePtr AnfNode::Shape() const { return (abstract_ == nullptr) ? nullptr : abstract_->BuildShape(); }
+// Builds the type from the node's abstract under the abstract lock; yields nullptr when no abstract is set.
+TypePtr AnfNode::Type() const {
+  // cppcheck-suppress unreadVariable
+  auto lock = AnfUtils::GetAbstractLock(this);
+  if (abstract_ == nullptr) {
+    return nullptr;
+  }
+  return abstract_->BuildType();
+}
+// Builds the shape from the node's abstract under the abstract lock; yields nullptr when no abstract is set.
+BaseShapePtr AnfNode::Shape() const {
+  // cppcheck-suppress unreadVariable
+  auto lock = AnfUtils::GetAbstractLock(this);
+  if (abstract_ == nullptr) {
+    return nullptr;
+  }
+  return abstract_->BuildShape();
+}

 std::string AnfNode::ToString() const {
  return mindspore::label_manage::Label(const_cast<AnfNode *>(this)->shared_from_base<AnfNode>()->debug_info());
--- a/mindspore/core/ir/kernel_info_dev.h
+++ b/mindspore/core/ir/kernel_info_dev.h
@ -68,13 +68,26 @@ class RuntimeCache {
 // Interface for device kernel program information.
 class KernelInfoDevice {
 public:
+  // Scoped accessor for the runtime cache: locks the given mutex on construction and unlocks it on destruction.
+  // The type is neither copyable nor assignable, so each lock acquisition has exactly one matching release.
+  class RuntimeCacheScope {
+   public:
+    RuntimeCacheScope(RuntimeCache &base, std::mutex &mu) : runtime_cache_(base), mu_(mu) { mu_.lock(); }
+    RuntimeCacheScope(const RuntimeCacheScope &other) = delete;
+    RuntimeCacheScope &operator=(const RuntimeCacheScope &other) = delete;
+    ~RuntimeCacheScope() { mu_.unlock(); }
+    // Access to the cache; it is only guarded by the mutex while this scope object is alive.
+    RuntimeCache &runtime_cache() { return runtime_cache_; }
+
+   private:
+    RuntimeCache &runtime_cache_;
+    std::mutex &mu_;
+  };
  // If kernel program was built and build info is set.
  virtual bool has_build_info() const = 0;

-  RuntimeCache *runtime_cache() { return &runtime_cache_; }
+  RuntimeCacheScope runtime_cache() { return RuntimeCacheScope(runtime_cache_, mu_); }

 private:
  RuntimeCache runtime_cache_;
+  std::mutex mu_;
 };
 using KernelInfoDevicePtr = std::shared_ptr<KernelInfoDevice>;
 }  // namespace mindspore
--- a/mindspore/core/utils/anf_utils.cc
+++ b/mindspore/core/utils/anf_utils.cc
@ -15,6 +15,7 @@
 */

 #include "utils/anf_utils.h"
+#include <map>
 #include <string>
 #include "base/core_ops.h"
 #include "utils/trace_base.h"
@ -23,8 +24,52 @@
 namespace mindspore {
 namespace {
 const PrimitiveSet follow_first_input_prims = {prim::kPrimDepend, prim::kPrimLoad};
+
+// Singleton registry that hands out one recursive mutex per AnfNode address.
+// NOTE(review): entries are never erased from the map in this class, so it grows with every node that was ever
+// locked; confirm whether node destruction should remove its entry.
+class AbstractMutexManager {
+ public:
+  static AbstractMutexManager &GetInstance() {
+    static AbstractMutexManager instance;
+    return instance;
+  }
+
+  // Returns a scope object that holds the recursive mutex associated with `node` (created on first use).
+  AbstractScope GetAbstractLock(const AnfNode *node) {
+    std::recursive_mutex *node_mu = nullptr;
+    {
+      // Guard only the map lookup/insertion. Blocking on the per-node mutex while still holding mu_ would stall
+      // every other caller, including a thread that already owns that node mutex and needs a second node's lock.
+      std::lock_guard<std::recursive_mutex> lock(mu_);
+      node_mu = &mu_for_nodes_[node];
+    }
+    // std::map does not relocate its elements on insertion, so the pointer stays valid after mu_ is released.
+    return AbstractScope(node_mu);
+  }
+
+ private:
+  std::map<const AnfNode *, std::recursive_mutex> mu_for_nodes_;
+  std::recursive_mutex mu_;
+};
 }  // namespace

+// Acquires `mu` and remembers it so that the destructor can release it; a null `mu` is rejected.
+AbstractScope::AbstractScope(std::recursive_mutex *mu) : mu_(mu) {
+  MS_EXCEPTION_IF_NULL(mu);
+  mu_->lock();
+}
+
+// Takes over the mutex held by `other`; `other` is left empty so its destructor will not unlock.
+AbstractScope::AbstractScope(AbstractScope &&other) : mu_(other.mu_) { other.mu_ = nullptr; }
+
+// Move assignment: releases the mutex currently held by this scope (if any), then takes over the one held by
+// `other`. `other` is left empty so its destructor will not unlock.
+AbstractScope &AbstractScope::operator=(AbstractScope &&other) {
+  // Self-assignment must not drop the pointer, otherwise the held mutex would never be unlocked.
+  if (this == &other) {
+    return *this;
+  }
+  // Release what we already hold before overwriting it; otherwise that mutex would stay locked forever.
+  if (mu_ != nullptr) {
+    mu_->unlock();
+  }
+  mu_ = other.mu_;
+  other.mu_ = nullptr;
+  return *this;
+}
+
+// Releases the held mutex; a scope that has been moved from holds nothing and does nothing here.
+AbstractScope::~AbstractScope() {
+  if (mu_ == nullptr) {
+    return;
+  }
+  mu_->unlock();
+}
+
+// Returns a scope object holding the abstract lock of `node`, obtained from the file-local mutex manager.
+AbstractScope AnfUtils::GetAbstractLock(const AnfNode *node) {
+  auto &manager = AbstractMutexManager::GetInstance();
+  return manager.GetAbstractLock(node);
+}
+
 bool AnfUtils::IsDimUnknown(const abstract::ShapePtr &shape) {
  MS_EXCEPTION_IF_NULL(shape);
  return std::any_of(shape->shape().begin(), shape->shape().end(), [](int64_t s) { return s < -1; });
@ -112,20 +157,18 @@ bool AnfUtils::IsRealKernel(const AnfNodePtr &node) {
  auto kernel_info = cnode->kernel_info();
  if (kernel_info) {
    auto runtime_cache = kernel_info->runtime_cache();
-    MS_EXCEPTION_IF_NULL(runtime_cache);
-    if (runtime_cache->is_real_kernel() != CacheBool::UNCACHED) {
-      return (runtime_cache->is_real_kernel() == CacheBool::TRUE);
+    if (runtime_cache.runtime_cache().is_real_kernel() != CacheBool::UNCACHED) {
+      return (runtime_cache.runtime_cache().is_real_kernel() == CacheBool::TRUE);
    }
  }
  bool res = !IsOneOfPrimitive(cnode->input(kAnfPrimitiveIndex), virtual_prims);

  if (kernel_info) {
    auto runtime_cache = kernel_info->runtime_cache();
-    MS_EXCEPTION_IF_NULL(runtime_cache);
    if (res) {
-      runtime_cache->set_real_kernel(CacheBool::TRUE);
+      runtime_cache.runtime_cache().set_real_kernel(CacheBool::TRUE);
    } else {
-      runtime_cache->set_real_kernel(CacheBool::FALSE);
+      runtime_cache.runtime_cache().set_real_kernel(CacheBool::FALSE);
    }
  }

@ -175,10 +218,15 @@ size_t AnfUtils::GetInputTensorNum(const AnfNodePtr &node) {
    MS_LOG(EXCEPTION) << "Only cnode has real input, but this anf is " << node->DebugString()
                      << trace::DumpSourceLines(node);
  }
-  ssize_t input_tensor_num = cnode->input_tensor_num();
-  if (input_tensor_num >= 0) {
-    return static_cast<size_t>(input_tensor_num);
+  {
+    // cppcheck-suppress unreadVariable
+    auto lock = AnfUtils::GetAbstractLock(node.get());
+    ssize_t input_tensor_num = cnode->input_tensor_num();
+    if (input_tensor_num >= 0) {
+      return static_cast<size_t>(input_tensor_num);
+    }
  }
+
  size_t input_num = cnode->inputs().size();
  if (input_num == 0) {
    MS_LOG(EXCEPTION) << "Cnode inputs size can't be zero" << trace::DumpSourceLines(node);
@ -191,6 +239,8 @@ size_t AnfUtils::GetInputTensorNum(const AnfNodePtr &node) {
    auto &inputs = cnode->inputs();
    // Search monad inputs, backward.
    for (auto iter = inputs.rbegin(); iter != inputs.rend(); ++iter) {
+      // cppcheck-suppress unreadVariable
+      auto lock = AnfUtils::GetAbstractLock(node.get());
      if (!HasAbstractMonad(*iter)) {
        // Stop count if we encounter a non-monad input.
        break;
@ -198,6 +248,8 @@ size_t AnfUtils::GetInputTensorNum(const AnfNodePtr &node) {
      --input_num;
    }
  }
+  // cppcheck-suppress unreadVariable
+  auto lock = AnfUtils::GetAbstractLock(node.get());
  cnode->set_input_tensor_num(static_cast<ssize_t>(input_num));
  return input_num;
 }
@ -207,8 +259,8 @@ size_t AnfUtils::GetOutputTensorNum(const AnfNodePtr &node) {
  auto kernel_info = node->kernel_info();
  if (kernel_info) {
    auto runtime_cache = kernel_info->runtime_cache();
-    if (runtime_cache->is_valid()) {
-      ssize_t output_tensor_num = runtime_cache->output_tensor_num();
+    if (runtime_cache.runtime_cache().is_valid()) {
+      ssize_t output_tensor_num = runtime_cache.runtime_cache().output_tensor_num();
      if (output_tensor_num >= 0) {
        return static_cast<size_t>(output_tensor_num);
      }
@ -231,8 +283,8 @@ size_t AnfUtils::GetOutputTensorNum(const AnfNodePtr &node) {

  if (kernel_info) {
    auto runtime_cache = kernel_info->runtime_cache();
-    if (runtime_cache->is_valid()) {
-      runtime_cache->set_output_tensor_num(static_cast<ssize_t>(res));
+    if (runtime_cache.runtime_cache().is_valid()) {
+      runtime_cache.runtime_cache().set_output_tensor_num(static_cast<ssize_t>(res));
    }
  }
  return res;
--- a/mindspore/core/utils/anf_utils.h
+++ b/mindspore/core/utils/anf_utils.h
@ -25,6 +25,19 @@
 #include "ir/primitive.h"

 namespace mindspore {
+// Move-only scope object wrapping a recursive mutex pointer; the matching lock/unlock logic lives in the
+// out-of-line member definitions.
+class AbstractScope {
+ public:
+  explicit AbstractScope(std::recursive_mutex *mu);
+  // Copying is disabled so that a held mutex has exactly one owner.
+  AbstractScope(const AbstractScope &other) = delete;
+  AbstractScope &operator=(const AbstractScope &other) = delete;
+  // Move operations transfer the held mutex from `other`, which is left empty.
+  AbstractScope(AbstractScope &&other);
+  AbstractScope &operator=(AbstractScope &&other);
+  ~AbstractScope();
+
+ private:
+  std::recursive_mutex *mu_;
+};
+
 class AnfUtils {
 public:
  static bool IsDimUnknown(const abstract::ShapePtr &shape);
@ -52,6 +65,7 @@ class AnfUtils {
  static void SetDumpFlag(const AnfNodePtr &node);
  // Get dump flag from CNode's primitive.
  static bool GetDumpFlag(const AnfNodePtr &node);
+  static AbstractScope GetAbstractLock(const AnfNode *node);
 };
 }  // namespace mindspore
 #endif  // MINDSPORE_CORE_UTILS_ANF_UTILS_H_
--- a/tests/ut/cpp/CMakeLists.txt
+++ b/tests/ut/cpp/CMakeLists.txt
@ -182,6 +182,13 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
        "../../../mindspore/ccsrc/profiler/device/ascend/*.cc"
        "../../../mindspore/ccsrc/profiler/device/profiling.cc"
        "../../../mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/adam_fp32.c"
+        "../../../mindspore/ccsrc/backend/kernel_compiler/kernel.cc"
+        "../../../mindspore/ccsrc/backend/kernel_compiler/ascend_kernel_mod.cc"
+        "../../../mindspore/ccsrc/backend/optimizer/common/helper.cc"
+        "../../../mindspore/ccsrc/runtime/device/ascend/executor/tiling/op_tiling_adapter.cc"
+        "../../../mindspore/ccsrc/runtime/device/ascend/executor/aicpu_ext_info_handle.cc"
+        "../../../mindspore/ccsrc/runtime/device/ascend/ge_types_convert.cc"
+        "../../../mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_util.cc"
        )

 if(ENABLE_SECURITY)
@ -230,6 +237,24 @@ add_dependencies(_ut_ut_obj engine-cache-server graph)
 add_executable(ut_tests $<TARGET_OBJECTS:_ut_ut_obj>
        $<TARGET_OBJECTS:_ut_mindspore_obj>)

+# Collect the aicpu and ascend dump .proto files, pass them to ms_protobuf_generate, and bundle the results
+# into a static library for the unit tests.
+include_directories("${CMAKE_BINARY_DIR}/backend/kernel_compiler/aicpu")
+file(GLOB_RECURSE PROTO_IN RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
+    "../../../mindspore/ccsrc/backend/kernel_compiler/aicpu/proto/*.proto")
+ms_protobuf_generate(PROTOSRCS PROTOHDRS ${PROTO_IN})
+
+file(GLOB_RECURSE PROTO_DUMP RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
+    "../../../mindspore/ccsrc/runtime/device/ascend/dump/proto/*.proto")
+# NOTE(review): this call reuses the PROTOHDRS variable name from the call above - presumably the header list
+# is not needed afterwards; confirm.
+ms_protobuf_generate(DUMP_PROTOSRCS PROTOHDRS ${PROTO_DUMP})
+
+list(APPEND MINDSPORE_PROTO_LIST ${PROTOSRCS})
+# NOTE(review): PREDICT_PROTOSRCS is not set in this section; verify it is defined elsewhere or drop the line.
+list(APPEND MINDSPORE_PROTO_LIST ${PREDICT_PROTOSRCS})
+list(APPEND MINDSPORE_PROTO_LIST ${DUMP_PROTOSRCS})
+
+# Only create the library when at least one source ended up in the list.
+if(MINDSPORE_PROTO_LIST)
+    add_library(proto_input_ut STATIC ${MINDSPORE_PROTO_LIST})
+    set_target_properties(proto_input_ut PROPERTIES COMPILE_FLAGS "-Wno-unused-variable")
+endif()
+
 if(ENABLE_GE)
    if(ENABLE_TRAIN)
        target_link_libraries(ut_tests PRIVATE graph ge_runner)
--- a/tests/ut/cpp/stub/ge/ge_op_tiling_info.cc
+++ b/tests/ut/cpp/stub/ge/ge_op_tiling_info.cc
@ -0,0 +1,75 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "register/op_tiling_info.h"
+#include "register/op_tiling.h"
+
+// Unit-test stubs for the GE op-tiling interface: every function below ignores its arguments and either does
+// nothing or returns a fixed value.
+namespace optiling {
+using std::make_shared;
+// Stub: always reports success without touching `run_info`.
+extern "C" ge::graphStatus OpParaCalculateV2(const ge::Operator &op, OpRunInfoV2 &run_info) {
+  return ge::GRAPH_SUCCESS;
+}
+
+namespace utils {
+OpRunInfo::OpRunInfo() {}
+
+OpRunInfo::OpRunInfo(const uint32_t &block_dim, const bool &clear_atomic, const uint64_t &tiling_key) {}
+
+OpRunInfo::OpRunInfo(const OpRunInfo &runinfo) {}
+
+OpRunInfo::OpRunInfo(OpRunInfo &&runinfo) {}
+
+OpRunInfo &OpRunInfo::operator=(const OpRunInfo &runinfo) { return *this; }
+
+OpRunInfo &OpRunInfo::operator=(OpRunInfo &&runinfo) { return *this; }
+
+void OpRunInfo::SetBlockDim(const uint32_t &block_dim) { return; }
+
+uint32_t OpRunInfo::GetBlockDim() const { return 0; }
+
+void OpRunInfo::AddWorkspace(const int64_t &workspace) { return; }
+
+size_t OpRunInfo::GetWorkspaceNum() const { return 0; }
+
+ge::graphStatus OpRunInfo::GetWorkspace(const size_t &idx, int64_t &workspace) const { return ge::GRAPH_SUCCESS; }
+
+void OpRunInfo::GetAllWorkspaces(std::vector<int64_t> &workspaces) const { return; }
+
+void OpRunInfo::SetWorkspaces(const std::vector<int64_t> &workspaces) { return; }
+
+void OpRunInfo::InternelSetTiling(const ByteBuffer &value) { return; }
+
+void OpRunInfo::AddTilingData(const char *_value, size_t _size) { return; }
+
+// Stub: returns a buffer with static storage duration. A reference into an object owned by a function-local
+// shared_ptr would dangle as soon as the function returns.
+ByteBuffer &OpRunInfo::GetAllTilingData() {
+  static ByteBuffer tiling_data;
+  return tiling_data;
+}
+
+// Stub: const overload, backed by its own static buffer for the same lifetime reason.
+const ByteBuffer &OpRunInfo::GetAllTilingData() const {
+  static ByteBuffer tiling_data;
+  return tiling_data;
+}
+
+void OpRunInfo::SetClearAtomic(bool clear_atomic_input) { return; }
+
+bool OpRunInfo::GetClearAtomic() const { return true; }
+
+void OpRunInfo::SetTilingKey(const uint64_t &new_tiling_key) { return; }
+
+uint64_t OpRunInfo::GetTilingKey() const { return 0; }
+}  // namespace utils
+}  // namespace optiling
--- a/tests/ut/cpp/stub/runtime/runtime_stub.cc
+++ b/tests/ut/cpp/stub/runtime/runtime_stub.cc
@ -211,3 +211,9 @@ RTS_API rtError_t rtMemGetInfoEx(rtMemInfoType_t memInfoType, size_t *free, size
 RTS_API rtError_t rtProfRegisterCtrlCallback(uint32_t moduleId, rtProfCtrlHandle callback) { return RT_ERROR_NONE; }

 RTS_API rtError_t rtGetRtCapability(rtFeatureType_t, int32_t, int64_t *) { return RT_ERROR_NONE; }
+
+// Stub: ignores all arguments and always returns RT_ERROR_NONE.
+RTS_API rtError_t rtKernelLaunchWithHandle(void *handle, const void *devFunc, uint32_t blockDim, void *args,
+                                           uint32_t argsSize, rtSmDesc_t *smDesc, rtStream_t stream,
+                                           const void *kernelInfo) {
+  return RT_ERROR_NONE;
+}