ascend device context

LaiYongqiang 2021-10-28 14:57:29 +08:00 committed by hwjiaorui
parent fc4f8812a5
commit 07e5ed9f16
17 changed files with 975 additions and 59 deletions

View File

@ -192,8 +192,11 @@ void GenOpOutputStubTensor(const KernelGraphPtr &single_op_graph, const CNodePtr
device_info.format_ = output_format;
device_info.data_type_ = TypeIdToType(output_type);
stub_output_tensor->set_device_info(device_info);
device::DeviceAddressPtr device_address =
std::make_shared<device::ascend::AscendDeviceAddress>(nullptr, 0, output_format, output_type);
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
device::DeviceAddressPtr device_address = std::make_shared<device::ascend::AscendDeviceAddress>(
nullptr, 0, output_format, output_type, kAscendDevice, device_id);
stub_output_tensor->set_device_address(device_address);
output_tensor_info.output_stub_tensor = stub_output_tensor;
auto kernel_info = dynamic_cast<const device::KernelInfo *>(output_node->kernel_info());
@ -721,7 +724,7 @@ void AscendSession::BatchBuildKernel(const std::vector<std::shared_ptr<SessionTa
std::vector<CNodePtr> atomic_node_to_build;
for (auto &graph : graphs) {
device::ascend::InsertAtomicCleanOp(graph);
device::ascend::InsertAtomicCleanOps(graph);
const auto &nodes = graph->execution_order();
std::copy(nodes.begin(), nodes.end(), std::back_inserter(atomic_node_to_build));
}
@ -998,10 +1001,10 @@ void AscendSession::BuildOpsInGraph(const GraphId &graph_id, const std::map<AnfN
InitRuntimeResource();
// Compile all kernels parallel
BuildKernel(kernels);
// Some new kernel may be added after InsertAtomicCleanOp, so collect and build kernels again
// Some new kernel may be added after InsertAtomicCleanOps, so collect and build kernels again
kernels.clear();
for (const auto &graph_item : single_op_graphs) {
device::ascend::InsertAtomicCleanOp(graph_item.first);
device::ascend::InsertAtomicCleanOps(graph_item.first);
const auto &execution_order = graph_item.first->execution_order();
std::copy(execution_order.begin(), execution_order.end(), std::back_inserter(kernels));
}
@ -1078,7 +1081,7 @@ void AscendSession::AdjustKernel(const std::shared_ptr<KernelGraph> &kernel_grap
// Insert ClearZero op
// prepare for next step: get atomic info from json
BuildKernel(kernel_graph);
device::ascend::InsertAtomicCleanOp(kernel_graph);
device::ascend::InsertAtomicCleanOps(kernel_graph);
device::KernelAdjust::GetInstance().InsertDeviceLoopCtrl(kernel_graph);
device::KernelAdjust::GetInstance().ProcessLoopSink(kernel_graph);
#ifdef ENABLE_DUMP_IR
@ -1098,7 +1101,7 @@ void AscendSession::RunOpAdjustKernel(const std::shared_ptr<KernelGraph> &kernel
// Insert ClearZero op
// prepare for next step: get atomic info from json
BuildKernel(kernel_graph);
device::ascend::InsertAtomicCleanOp(kernel_graph);
device::ascend::InsertAtomicCleanOps(kernel_graph);
MS_LOG(INFO) << "Finish!";
}
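The pattern in the hunk above, reading the device id from MsContext and passing kAscendDevice along with it into the AscendDeviceAddress constructor, recurs at several call sites in this commit. A minimal sketch of that pattern factored into a helper; the name CreateStubAscendAddress is hypothetical and not part of this change:

device::DeviceAddressPtr CreateStubAscendAddress(const std::string &output_format, TypeId output_type) {
  // Hypothetical helper, not part of this commit: shows the repeated construction
  // pattern for device-aware addresses introduced by this change.
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  // The device id comes from the global context; the device name is always kAscendDevice here.
  auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
  return std::make_shared<device::ascend::AscendDeviceAddress>(nullptr, 0, output_format, output_type,
                                                               kAscendDevice, device_id);
}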

View File

@ -26,6 +26,8 @@
#include "runtime/device/memory_manager.h"
#include "runtime/device/convert_tensor_utils.h"
#include "runtime/device/ascend/ascend_launch_transdata.h"
#include "runtime/hardware/device_context_manager.h"
#include "runtime/hardware/ascend/ascend_device_context.h"
#include "ir/dtype/type.h"
#include "ir/tensor.h"
#include "abstract/utils.h"
@ -162,6 +164,25 @@ bool SyncDeviceToHostAndFloatToFloat64(void *dst, size_t dst_size, const void *s
return true;
}
void AscendDeviceAddress::BindDevice() const {
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
if (!MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
return;
}
// Bind device by device name and device id on the current thread.
if (device_name_ != "") {
auto device_context =
device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_name_, device_id_});
auto ascend_device_context = dynamic_cast<AscendDeviceContext *>(device_context);
MS_EXCEPTION_IF_NULL(ascend_device_context);
if (!ascend_device_context->BindDeviceToCurrentThread()) {
MS_LOG(EXCEPTION) << "BindDeviceToCurrentThread failed.";
}
}
}
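Under MindRT, actor threads may touch a device address without ever having set an Ascend context, so the sync entry points below call BindDevice() before copying. A hedged caller-side sketch of the host read path; the ReadBack helper is hypothetical:

// Hypothetical caller-side view: addr was constructed with {kAscendDevice, device_id}.
std::vector<uint8_t> ReadBack(const device::ascend::AscendDeviceAddress &addr, size_t size) {
  std::vector<uint8_t> host(size);
  // Internally this now runs BindDevice(), then SyncStream(), then the device-to-host copy.
  if (!addr.SyncDeviceToHost(size, host.data())) {
    MS_LOG(EXCEPTION) << "SyncDeviceToHost failed.";
  }
  return host;
}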
void AscendDeviceAddress::SyncStream() const {
MS_LOG(DEBUG) << "SyncStream Start!";
auto ms_context = MsContext::GetInstance();
@ -183,6 +204,7 @@ void AscendDeviceAddress::SyncStream() const {
bool AscendDeviceAddress::SyncDeviceToHost(size_t size, void *const host_ptr) const {
MS_EXCEPTION_IF_NULL(host_ptr);
BindDevice();
SyncStream();
SyncMemory(host_ptr, ptr_, size, RT_MEMCPY_DEVICE_TO_HOST);
return true;
@ -190,6 +212,7 @@ bool AscendDeviceAddress::SyncDeviceToHost(size_t size, void *const host_ptr) co
bool AscendDeviceAddress::SyncHostToDevice(size_t size, const void *host_ptr) const {
MS_EXCEPTION_IF_NULL(host_ptr);
BindDevice();
SyncMemory(ptr_, host_ptr, size, RT_MEMCPY_HOST_TO_DEVICE);
return true;
}
@ -201,6 +224,7 @@ bool AscendDeviceAddress::SyncDeviceToHost(const ShapeVector &shape, size_t size
if (type_id_ > kMonadTypeBegin && type_id_ < kMonadTypeEnd) {
return true;
}
BindDevice();
SyncStream();
bool sync_ok = false;
std::vector<size_t> host_shape;
@ -368,7 +392,7 @@ bool AscendDeviceAddress::SyncHostToDevice(const ShapeVector &shape, size_t size
if (type_id_ > kMonadTypeBegin && type_id_ < kMonadTypeEnd) {
return true;
}
BindDevice();
bool sync_ok = false;
std::vector<size_t> host_shape;
(void)std::transform(shape.begin(), shape.end(), std::back_inserter(host_shape), LongToSize);
@ -416,6 +440,7 @@ bool AscendDeviceAddress::SyncDeviceToDevice(const ShapeVector &, size_t size, T
if (type_id_ > kMonadTypeBegin && type_id_ < kMonadTypeEnd) {
return true;
}
BindDevice();
bool sync_ok = false;
if (format_ == format && type_id_ == type) {
if (!DataSync(ptr_, src_ptr, size)) {

View File

@ -36,11 +36,14 @@ namespace ascend {
class AscendDeviceAddress : public DeviceAddress {
public:
explicit AscendDeviceAddress(void *ptr, size_t size) : DeviceAddress(ptr, size) {}
explicit AscendDeviceAddress(void *ptr, size_t size, const std::string &format, TypeId type_id)
: DeviceAddress(ptr, size, format, type_id) {}
explicit AscendDeviceAddress(void *ptr, size_t size, const std::string &device_name, uint32_t device_id)
: DeviceAddress(ptr, size, device_name, device_id) {}
explicit AscendDeviceAddress(void *ptr, size_t size, const std::string &format, TypeId type_id,
const KernelWithIndex &node_index)
: DeviceAddress(ptr, size, format, type_id, node_index) {}
const std::string &device_name, uint32_t device_id)
: DeviceAddress(ptr, size, format, type_id, device_name, device_id) {}
explicit AscendDeviceAddress(void *ptr, size_t size, const std::string &format, TypeId type_id,
const KernelWithIndex &node_index, const std::string &device_name, uint32_t device_id)
: DeviceAddress(ptr, size, format, type_id, node_index, device_name, device_id) {}
~AscendDeviceAddress() override;
bool SyncDeviceToHost(size_t size, void *const host_ptr) const override;
bool SyncHostToDevice(size_t size, const void *host_ptr) const override;
@ -71,6 +74,7 @@ class AscendDeviceAddress : public DeviceAddress {
const std::string &ori_format,
const std::string &dst_format) const;
mutable std::shared_ptr<LaunchKernel> launch_transdata_{nullptr};
void BindDevice() const;
};
using AscendDeviceAddressPtr = std::shared_ptr<AscendDeviceAddress>;
} // namespace ascend

View File

@ -404,12 +404,19 @@ bool AscendKernelRuntime::KernelMemNotReuse(const AnfNodePtr &node) {
DeviceAddressPtr AscendKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
TypeId type_id) const {
return std::make_shared<AscendDeviceAddress>(device_ptr, device_size, format, type_id);
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
return std::make_shared<AscendDeviceAddress>(device_ptr, device_size, format, type_id, kAscendDevice, device_id);
}
DeviceAddressPtr AscendKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
TypeId type_id, const KernelWithIndex &node_index) const {
return std::make_shared<AscendDeviceAddress>(device_ptr, device_size, format, type_id, node_index);
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
return std::make_shared<AscendDeviceAddress>(device_ptr, device_size, format, type_id, node_index, kAscendDevice,
device_id);
}
bool AscendKernelRuntime::Load(const session::KernelGraph &graph, bool is_task_sink) {

View File

@ -71,6 +71,9 @@ class AscendKernelRuntime : public KernelRuntime {
void *compute_stream() const override { return stream_; }
void *communication_stream() const override { return communication_stream_; }
void *GetModelStream(uint32_t graph_id) const override;
// add for MindRT
void ReleaseDeviceRes() override;
void SetCurrentContext();
protected:
DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
@ -87,10 +90,8 @@ class AscendKernelRuntime : public KernelRuntime {
static bool HcclInit();
static bool NeedDestroyHccl();
static bool DestroyHccl();
void SetCurrentContext();
void ClearGraphModelMap();
void ReleaseDeviceRes() override;
bool GraphWithEmptyTaskList(const session::KernelGraph &graph) const;
bool CheckGraphIdValid(GraphId graph_id) const;
#ifndef ENABLE_SECURITY

View File

@ -147,7 +147,7 @@ void AiCoreDynamicKernel::AllocateWorkspace() {
workspace_addr_.clear();
for (auto size : workspaces_size_) {
auto device_address_ptr = std::make_shared<AscendDeviceAddress>(nullptr, size);
auto device_address_ptr = std::make_shared<AscendDeviceAddress>(nullptr, size, kAscendDevice, device_id);
auto device_ptr = runtime_instance->MallocMem(MemType::kDynamicMem, size, device_address_ptr);
if (device_ptr == nullptr) {
MS_LOG(EXCEPTION) << "MallocMem from memory pool failed. Node info :" << cnode->fullname_with_scope();

View File

@ -338,7 +338,6 @@ void ProcessAtomicFusion(const std::vector<CNodePtr> &kernels, CleanOpsMap *clea
InsertFusionAtomicOp(first_node, fusion_clear_inputs, clean_size_list, clean_ops);
}
}
} // namespace
void InsertAtomicOps(const std::vector<CNodePtr> &kernels, CleanOpsMap *clean_ops) {
// fusion
@ -358,9 +357,9 @@ void InsertAtomicOps(const std::vector<CNodePtr> &kernels, CleanOpsMap *clean_op
}
}
std::map<AnfNodePtr, std::vector<size_t>> GetCommunicationOpInputInfo(const std::vector<CNodePtr> &exe_orders) {
std::map<AnfNodePtr, std::vector<size_t>> GetCommunicationOpInputInfo(const std::vector<CNodePtr> &kernels) {
std::map<AnfNodePtr, std::vector<size_t>> comm_input_info_map;
for (auto &kernel : exe_orders) {
for (auto &kernel : kernels) {
MS_EXCEPTION_IF_NULL(kernel);
auto input_num = AnfAlgo::GetInputTensorNum(kernel);
if (mindspore::session::AnfRuntimeAlgorithm::IsCommunicationOp(kernel)) {
@ -401,12 +400,12 @@ std::map<AnfNodePtr, std::vector<size_t>> GetCommunicationOpInputInfo(const std:
return comm_input_info_map;
}
void AddNeedInsertAtomicAttrForAllOps(const std::vector<CNodePtr> &exe_orders) {
if (exe_orders.empty()) {
void TagNeedInsertAtomicAttr(const std::vector<CNodePtr> &nodes) {
if (nodes.empty()) {
return;
}
std::map<AnfNodePtr, std::vector<size_t>> comm_input_info_map = GetCommunicationOpInputInfo(exe_orders);
for (const auto &anf_node : exe_orders) {
std::map<AnfNodePtr, std::vector<size_t>> comm_input_info_map = GetCommunicationOpInputInfo(nodes);
for (const auto &anf_node : nodes) {
if (comm_input_info_map.find(anf_node) != comm_input_info_map.end()) {
auto indexes = comm_input_info_map[anf_node];
if (AnfAlgo::HasNodeAttr(kAttrAtomicOutputIndexs, anf_node)) {
@ -433,23 +432,24 @@ std::vector<CNodePtr> GatherAllAtomicOps(const CleanOpsMap &node_maps) {
}
return all_atomics;
}
} // namespace
void InsertAtomicCleanOpForMindRT(const std::vector<CNodePtr> &exe_orders, CleanOpsMap *maps) {
void InsertAtomicCleanOps(const std::vector<CNodePtr> &nodes, CleanOpsMap *maps) {
MS_EXCEPTION_IF_NULL(maps);
// assign attr
AddNeedInsertAtomicAttrForAllOps(exe_orders);
TagNeedInsertAtomicAttr(nodes);
// insert atomic
InsertAtomicOps(exe_orders, maps);
InsertAtomicOps(nodes, maps);
std::vector<CNodePtr> all_atomics = GatherAllAtomicOps(*maps);
// build atomic
KernelBuild(all_atomics);
}
void InsertAtomicCleanOp(const KernelGraphPtr &kernel_graph) {
void InsertAtomicCleanOps(const KernelGraphPtr &kernel_graph) {
MS_EXCEPTION_IF_NULL(kernel_graph);
const auto &exe_orders = kernel_graph->execution_order();
// assign attr
AddNeedInsertAtomicAttrForAllOps(exe_orders);
TagNeedInsertAtomicAttr(exe_orders);
// insert atomic
CleanOpsMap node_to_cleans;
InsertAtomicOps(exe_orders, &node_to_cleans);

View File

@ -32,36 +32,15 @@ using CleanOpsMap = std::map<CNodePtr, std::vector<CNodePtr>>;
*/
bool KernelBuild(const std::vector<CNodePtr> &kernels);
/**
* @brief Preprocess of kernel build for Ascend, e.g. inserting clear_zero nodes for max_pool and bn.
* These changes must be made just before kernel build, after all other optimizations on the AnfGraph.
*/
void InsertAtomicCleanOp(const KernelGraphPtr &kernel_graph);
/**
* @brief preprocess for mind rt
* */
void InsertAtomicCleanOpForMindRT(const std::vector<CNodePtr> &exe_orders, CleanOpsMap *maps);
/**
* @brief communication op input info.
* */
CommOpInputInfo GetCommunicationOpInputInfo(const std::vector<CNodePtr> &exe_orders);
/**
* @brief insert atomic
* */
void InsertAtomicOps(const std::vector<CNodePtr> &exe_orders, CleanOpsMap *clean_ops);
*/
void InsertAtomicCleanOps(const KernelGraphPtr &kernel_graph);
/**
* @brief gather all atomics
* @brief insert atomic for mind rt
* */
std::vector<CNodePtr> GatherAllAtomicOps(const CleanOpsMap &node_maps);
/**
* @brief add attr for op if need insert atomic
* */
void AddNeedInsertAtomicAttrForAllOps(const std::vector<CNodePtr> &exe_orders);
void InsertAtomicCleanOps(const std::vector<CNodePtr> &nodes, CleanOpsMap *maps);
} // namespace ascend
} // namespace device
} // namespace mindspore
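Both renamed entry points are exercised elsewhere in this commit: the session path passes the whole graph, while the MindRT device context collects the atomic ops into a CleanOpsMap and splices them into the execution order itself (see AscendDeviceContext::UpdateExecOrder later in this diff). A sketch of the two call patterns; the wrapper names are hypothetical:

// Task-sink / session path: the helper rewrites the graph's execution order directly.
void BuildAtomicsForSession(const KernelGraphPtr &kernel_graph) {
  device::ascend::InsertAtomicCleanOps(kernel_graph);
}

// MindRT path: keep the atomic ops in a map keyed by the owning node, then interleave
// them in front of their owners when rebuilding the execution order.
void BuildAtomicsForMindRT(const KernelGraphPtr &graph) {
  device::ascend::CleanOpsMap node_atomics;
  device::ascend::InsertAtomicCleanOps(graph->execution_order(), &node_atomics);
  std::vector<CNodePtr> new_order;
  for (const auto &node : graph->execution_order()) {
    auto iter = node_atomics.find(node);
    if (iter != node_atomics.end()) {
      (void)std::copy(iter->second.begin(), iter->second.end(), std::back_inserter(new_order));
    }
    new_order.push_back(node);
  }
  graph->set_execution_order(new_order);
}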

View File

@ -38,6 +38,7 @@ class CPUDeviceContext;
namespace ascend {
class AscendKernelRuntime;
class AscendMemoryManager;
class AscendDeviceContext;
#ifndef ENABLE_SECURITY
class DataDumper;
#endif
@ -71,9 +72,21 @@ class DeviceAddress : public mindspore::DeviceSync {
explicit DeviceAddress(void *ptr, size_t size, const std::string &format, TypeId type_id,
const KernelWithIndex &node_index)
: ptr_(ptr), size_(size), format_(format), type_id_(type_id), node_index_(node_index) {}
explicit DeviceAddress(void *ptr, size_t size, const std::string &format, TypeId type_id,
const std::string &device_name, uint32_t device_id)
explicit DeviceAddress(void *ptr, size_t size, const std::string &device_name, uint32_t device_id)
: ptr_(ptr), size_(size), device_name_(device_name), device_id_(device_id) {}
explicit DeviceAddress(void *ptr, size_t size, const string &format, TypeId type_id, const std::string &device_name,
uint32_t device_id)
: ptr_(ptr), size_(size), format_(format), type_id_(type_id), device_name_(device_name), device_id_(device_id) {}
explicit DeviceAddress(void *ptr, size_t size, const std::string &format, TypeId type_id,
const KernelWithIndex &node_index, const std::string &device_name, uint32_t device_id)
: ptr_(ptr),
size_(size),
format_(format),
type_id_(type_id),
node_index_(node_index),
device_name_(device_name),
device_id_(device_id) {}
virtual ~DeviceAddress() { ptr_ = nullptr; }
const void *GetPtr() const { return ptr_; }
@ -133,6 +146,7 @@ class DeviceAddress : public mindspore::DeviceSync {
friend class mindspore::device::gpu::GPUDeviceContext;
friend class mindspore::device::ascend::AscendKernelRuntime;
friend class mindspore::device::ascend::AscendMemoryManager;
friend class mindspore::device::ascend::AscendDeviceContext;
#ifndef ENABLE_SECURITY
friend class mindspore::device::ascend::DataDumper;
#endif

View File

@ -989,7 +989,12 @@ void KernelAdjust::AssignLoopCtrlTensorMem(const session::KernelGraph &kernel_gr
auto format = AnfAlgo::GetOutputFormat(param, 0);
auto type_id = AnfAlgo::GetOutputDeviceDataType(param, 0);
device_address = std::make_shared<device::ascend::AscendDeviceAddress>(nullptr, size, format, type_id);
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
device_address =
std::make_shared<device::ascend::AscendDeviceAddress>(nullptr, size, format, type_id, kAscendDevice, device_id);
if (runtime_instance->MallocMem(kStaticMem, size, device_address) == nullptr) {
MS_LOG(EXCEPTION) << "Cannot alloc static memory for device loop control parameter " << name
<< " , tensor size is : " << size;

View File

@ -119,6 +119,11 @@ class KernelRuntime {
virtual DeviceAddressPtr AssignExtraStaticMem(const TensorPtr &tensor, const AnfNodePtr &node, size_t index);
virtual void *GetModelStream(uint32_t graph_id) const { return nullptr; }
// add for MindRT
std::shared_ptr<MemoryManager> GetMemoryManager() { return mem_manager_; }
void AssignStaticMemoryOutput(const session::KernelGraph &graph);
void AssignDynamicMemory(const session::KernelGraph &graph);
protected:
virtual DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
TypeId type_id) const = 0;
@ -128,7 +133,6 @@ class KernelRuntime {
virtual bool KernelMemNotReuse(const AnfNodePtr &node);
void AssignStaticMemory(const session::KernelGraph &graph);
void AssignDynamicMemory(const session::KernelGraph &graph);
void AssignNodeOutputMem(MemType type, const AnfNodePtr &node, int index);
void AssignWorkSpaceMem(MemType type, const AnfNodePtr &node);
@ -154,7 +158,6 @@ class KernelRuntime {
const AnfNodePtr &kernel, bool mock);
void AssignCommunicationMem(const session::KernelGraph &graph);
void AssignStaticMemoryOutput(const session::KernelGraph &graph);
bool LaunchKernelMod(const session::KernelGraph &graph, bool mock = false);
void LaunchKernelEvent(const std::vector<std::vector<std::function<void()>>> &run_events, size_t index) const;
void DebugStreamSync(const CNodePtr &kernel);

View File

@ -1,6 +1,10 @@
file(GLOB_RECURSE HARDWARE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
"device_context_manager.cc")
if(ENABLE_D)
file(GLOB_RECURSE HARDWARE_D_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "ascend/*.cc")
endif()
if(ENABLE_GPU)
file(GLOB_RECURSE HARDWARE_GPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "gpu/*.cc")
endif()

View File

@ -0,0 +1,346 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "runtime/hardware/ascend/ascend_device_context.h"
#include <algorithm>
#include <set>
#include "backend/optimizer/ascend/ascend_backend_optimization.h"
#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h"
#include "backend/session/ascend_auto_monad.h"
#include "utils/context/graph_kernel_flags.h"
#include "runtime/device/ascend/kernel_select_ascend.h"
#include "runtime/device/kernel_adjust.h"
#include "runtime/device/ascend/ascend_stream_assign.h"
#include "runtime/device/ascend/kernel_build_ascend.h"
#include "runtime/hardware/ascend/ascend_graph_optimization.h"
#ifndef ENABLE_SECURITY
#include "debug/data_dump/dump_json_parser.h"
#include "toolchain/adx_datadump_server.h"
#include "debug/anf_ir_dump.h"
#include "debug/dump_proto.h"
#include "debug/data_dump/e2e_dump.h"
#endif
namespace mindspore {
namespace device {
namespace ascend {
using KernelGraph = mindspore::session::KernelGraph;
#ifndef ENABLE_SECURITY
void DumpInit(uint32_t device_id) {
auto &json_parser = DumpJsonParser::GetInstance();
json_parser.Parse();
json_parser.CopyDumpJsonToDir(device_id);
json_parser.CopyHcclJsonToDir(device_id);
json_parser.CopyMSCfgJsonToDir(device_id);
if (json_parser.async_dump_enabled()) {
if (AdxDataDumpServerInit() != 0) {
MS_LOG(EXCEPTION) << "Adx data dump server init failed";
}
}
}
void DumpSetup(const KernelGraphPtr &graph) {
MS_LOG(DEBUG) << "Start!";
MS_EXCEPTION_IF_NULL(graph);
E2eDump::DumpSetup(graph.get());
MS_LOG(DEBUG) << "Finish!";
}
void Dump(const KernelGraphPtr &graph, uint32_t rank_id) {
MS_LOG(DEBUG) << "Start!";
MS_EXCEPTION_IF_NULL(graph);
E2eDump::DumpData(graph.get(), rank_id);
MS_LOG(DEBUG) << "Finish!";
}
#endif
void AscendDeviceContext::Initialize() {
MS_LOG(INFO) << "Status record: Enter Initialize...";
if (initialized_) {
MS_EXCEPTION_IF_NULL(runtime_instance_);
runtime_instance_->SetCurrentContext();
return;
}
MS_LOG(INFO) << "Status record: Initialize start...";
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
runtime_instance_ = dynamic_cast<AscendKernelRuntime *>(
device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id));
MS_EXCEPTION_IF_NULL(runtime_instance_);
if (!runtime_instance_->Init()) {
MS_LOG(EXCEPTION) << "Kernel runtime init error.";
}
mem_manager_ = runtime_instance_->GetMemoryManager();
MS_EXCEPTION_IF_NULL(mem_manager_);
auto env_rank_id = common::GetEnv("RANK_ID");
if (ms_context->get_param<bool>(MS_CTX_ENABLE_HCCL) && !env_rank_id.empty()) {
// get actual rank id if it's distribution training case.
rank_id_ = GetRankId();
}
#ifndef ENABLE_SECURITY
DumpInit(rank_id_);
#endif
initialized_ = true;
MS_LOG(INFO) << "Status record: Initialize success.";
}
void AscendDeviceContext::Destroy() {
MS_LOG(INFO) << "Status record: Enter Destroy...";
if (!initialized_) {
return;
}
MS_LOG(INFO) << "Status record: Destroy start...";
rank_id_ = 0;
if (runtime_instance_ != nullptr) {
runtime_instance_->ReleaseDeviceRes();
runtime_instance_ = nullptr;
}
initialized_ = false;
MS_LOG(INFO) << "Status record: Destroy success.";
}
std::vector<GraphSegmentPtr> AscendDeviceContext::PartitionGraph(
const FuncGraphPtr &func_graph, const std::vector<GraphSegmentPtr> &default_partition_segments) {
return std::vector<GraphSegmentPtr>();
}
void AscendDeviceContext::UnifyMindIR(const KernelGraphPtr &graph) const {
MS_EXCEPTION_IF_NULL(graph);
AscendGraphOptimization::GetInstance().UnifyMindIR(graph);
}
void AscendDeviceContext::OptimizeGraph(const KernelGraphPtr &graph) const {
MS_EXCEPTION_IF_NULL(graph);
AscendGraphOptimization::GetInstance().OptimizeGraph(graph);
}
void AscendDeviceContext::SetOperatorInfo(const std::vector<CNodePtr> &nodes) const {
AscendGraphOptimization::GetInstance().SetOperatorInfo(nodes);
}
void AscendDeviceContext::CreateKernel(const std::vector<CNodePtr> &nodes) const {
MS_LOG(INFO) << "CreateKernel Start...";
struct timeval start_time, end_time;
(void)gettimeofday(&start_time, nullptr);
auto ret = device::ascend::KernelBuild(nodes);
if (!ret) {
MS_LOG(EXCEPTION) << "Kernel build error.";
}
(void)gettimeofday(&end_time, nullptr);
const uint64_t kUSecondInSecond = 1000000;
uint64_t cost = kUSecondInSecond * static_cast<uint64_t>(end_time.tv_sec - start_time.tv_sec);
cost += static_cast<uint64_t>(end_time.tv_usec - start_time.tv_usec);
MS_LOG(INFO) << "CreateKernel finish run in " << PRIu64 << " us " << cost;
}
void AscendDeviceContext::UpdateExecOrder(const KernelGraphPtr &graph) const {
MS_EXCEPTION_IF_NULL(graph);
std::vector<CNodePtr> new_orders;
auto nodes = graph->execution_order();
for (const auto &node : nodes) {
if (node_atomics_.find(node) != node_atomics_.end()) {
auto atomics = node_atomics_[node];
(void)std::copy(atomics.begin(), atomics.end(), std::back_inserter(new_orders));
}
new_orders.push_back(node);
}
graph->set_execution_order(new_orders);
node_atomics_.clear();
}
void AscendDeviceContext::PreprocessBeforeRunGraph(const KernelGraphPtr &graph) const {
MS_EXCEPTION_IF_NULL(graph);
MS_LOG(INFO) << "PreprocessBeforeRunGraph Start for graph " << graph->graph_id();
device::ascend::InsertAtomicCleanOps(graph->execution_order(), &node_atomics_);
if (graph->is_executing_sink()) {
UpdateExecOrder(graph);
device::KernelAdjust::GetInstance().InsertDeviceLoopCtrl(graph);
device::KernelAdjust::GetInstance().ProcessLoopSink(graph);
AscendStreamAssign::GetInstance().AssignStream(NOT_NULL(graph));
CreateKernel(graph->execution_order());
AllocateGraphMemory(NOT_NULL(graph));
LoadModel(NOT_NULL(graph));
MS_LOG(INFO) << "PreprocessBeforeRunGraph success.";
return;
}
MS_LOG(INFO) << "PreprocessBeforeRunGraph success.";
}
void AscendDeviceContext::AllocateGraphMemory(const NotNull<KernelGraphPtr> &root_graph) const {
MS_EXCEPTION_IF_NULL(runtime_instance_);
runtime_instance_->ClearGlobalIdleMem();
memo_.clear();
AssignInputMemory(root_graph, NOT_NULL(&memo_));
device::KernelAdjust::GetInstance().AssignLoopCtrlMemory(*root_graph.get());
runtime_instance_->AssignStaticMemoryOutput(*root_graph.get());
mem_manager_->ResetDynamicMemory();
runtime_instance_->AssignDynamicMemory(*root_graph.get());
runtime_instance_->UpdateRefNodeOutputMem(*root_graph.get());
}
void AscendDeviceContext::AssignInputMemory(const NotNull<KernelGraphPtr> &graph,
NotNull<std::set<KernelGraphPtr> *> const memo) const {
if (memo->find(graph) != memo->end()) {
return;
}
memo->insert(graph.get());
MS_LOG(INFO) << "Start to assign static memory for Parameter and Value node in graph: " << graph->graph_id();
runtime_instance_->AssignStaticMemoryInput(*graph.get());
runtime_instance_->AssignStaticMemoryValueNode(*graph.get());
for (auto &child_graph : graph->child_graph_order()) {
AssignInputMemory(NOT_NULL(child_graph.lock()), memo);
}
MS_LOG(INFO) << "Finish assigning static memory for Parameter and Value node in graph: " << graph->graph_id();
}
void AscendDeviceContext::LoadModel(const NotNull<KernelGraphPtr> &root_graph) const {
MS_LOG(INFO) << "Start LoadModel for graph " << root_graph->graph_id();
MS_EXCEPTION_IF_NULL(runtime_instance_);
bool ret_ok = runtime_instance_->Load(*root_graph.get(), true);
if (!ret_ok) {
MS_LOG(EXCEPTION) << "Load task error!";
}
MS_LOG(INFO) << "Finish!";
}
bool AscendDeviceContext::AllocateMemory(DeviceAddress *const &address, size_t size) const {
MS_EXCEPTION_IF_NULL(address);
MS_EXCEPTION_IF_NULL(runtime_instance_);
runtime_instance_->SetCurrentContext();
auto device_ptr = mem_manager_->MallocMemFromMemPool(size);
if (!device_ptr) {
return false;
}
address->ptr_ = device_ptr;
address->size_ = size;
address->from_mem_pool_ = true;
return true;
}
void AscendDeviceContext::FreeMemory(DeviceAddress *const &address) const {
MS_EXCEPTION_IF_NULL(address);
MS_EXCEPTION_IF_NULL(address->ptr_);
if (!address->from_mem_pool()) {
return;
}
mem_manager_->FreeMemFromMemPool(address->ptr_);
address->ptr_ = nullptr;
}
bool AscendDeviceContext::AllocateContinuousMemory(const std::vector<DeviceAddressPtr> &addr_list, size_t total_size,
const std::vector<size_t> &size_list) const {
MS_EXCEPTION_IF_NULL(runtime_instance_);
runtime_instance_->SetCurrentContext();
return mem_manager_->MallocContinuousMemFromMemPool(addr_list, total_size, size_list);
}
bool AscendDeviceContext::ExecuteGraph(const KernelGraphPtr &graph) const {
MS_EXCEPTION_IF_NULL(graph);
const uint64_t kUSecondInSecond = 1000000;
bool ret = false;
if (graph->is_executing_sink()) {
#if defined(_WIN32) || defined(_WIN64)
auto start_time = std::chrono::steady_clock::now();
#else
struct timeval start_time {};
struct timeval end_time {};
(void)gettimeofday(&start_time, nullptr);
#endif
MS_EXCEPTION_IF_NULL(runtime_instance_);
#ifndef ENABLE_SECURITY
DumpSetup(graph);
#endif
{
std::lock_guard<std::mutex> locker(launch_mutex_);
ret = runtime_instance_->RunTask(*graph);
}
#ifndef ENABLE_SECURITY
Dump(graph, GetRankID());
#endif
#if defined(_WIN32) || defined(_WIN64)
auto end_time = std::chrono::steady_clock::now();
std::chrono::duration<double, std::ratio<1, kUSecondInSecond>> cost = end_time - start_time;
MS_LOG(INFO) << "Call MS Run Success in " << cost.count() << " us";
#else
(void)gettimeofday(&end_time, nullptr);
uint64_t cost = kUSecondInSecond * static_cast<uint64_t>(end_time.tv_sec - start_time.tv_sec);
cost += static_cast<uint64_t>(end_time.tv_usec - start_time.tv_usec);
MS_LOG(INFO) << "Call MS Run Success in " << cost << " us";
#endif
} else {
MS_LOG(EXCEPTION) << graph->ToString() << " does not sink, should launch kernels";
}
return ret;
}
bool AscendDeviceContext::LaunchGraph(const KernelGraphPtr &graph) const {
MS_LOG(INFO) << "Status record: start launch graph. graph id: " << graph->graph_id();
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(runtime_instance_);
runtime_instance_->SetCurrentContext();
device::KernelAdjust::GetInstance().LoadDeviceLoopCtrlParameters(graph);
auto ret = ExecuteGraph(graph);
MS_LOG(INFO) << "Status record: end launch graph. graph id: " << graph->graph_id();
return ret;
}
bool AscendDeviceContext::SyncStream(size_t stream_id) const {
MS_EXCEPTION_IF_NULL(runtime_instance_);
return runtime_instance_->SyncStream();
}
bool AscendDeviceContext::IsExecutingSink(const KernelGraphPtr &graph) const { return true; }
bool AscendDeviceContext::IsLoopCountSink(const KernelGraphPtr &graph) const { return true; }
// kernel by kernel mode interface
void AscendDeviceContext::OptimizeSingleOpGraph(const KernelGraphPtr &graph) const {
MS_LOG(ERROR) << "!!! Ascend with MindRT not support kernel by kernel mode. !!! ";
}
void AscendDeviceContext::PreprocessBeforeRunSingleOpGraph(const KernelGraphPtr &graph) const {
MS_LOG(ERROR) << "!!! Ascend with MindRT not support kernel by kernel mode. !!! ";
}
void AscendDeviceContext::UpdateDynamicShape(const CNodePtr &kernel) const {
MS_LOG(ERROR) << "!!! Ascend with MindRT not support function UpdateDynamicShape. !!! ";
}
std::shared_ptr<Bucket> AscendDeviceContext::CreateBucket(uint32_t bucket_id, uint32_t bucket_size) const {
MS_LOG(ERROR) << "!!! Ascend with MindRT not support function CreateBucket. !!! ";
return DeviceContext::CreateBucket(bucket_id, bucket_size);
}
bool AscendDeviceContext::LaunchKernel(const CNodePtr &kernel, const vector<AddressPtr> &inputs,
const vector<AddressPtr> &workspace, const vector<AddressPtr> &outputs,
bool is_dynamic_shape) const {
MS_LOG(ERROR) << "!!! Ascend with MindRT not support kernel by kernel mode. !!! ";
return true;
}
bool AscendDeviceContext::BindDeviceToCurrentThread() const {
runtime_instance_->SetCurrentContext();
return true;
}
MS_REGISTER_DEVICE(kAscendDevice, AscendDeviceContext);
} // namespace ascend
} // namespace device
} // namespace mindspore
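Taken together, the graph-sink flow this new device context supports is roughly the following; the RunGraphSink driver is hypothetical and omits the actor-runtime glue:

// Hypothetical driver showing the order in which the overrides above are used.
void RunGraphSink(device::ascend::AscendDeviceContext *context, const KernelGraphPtr &graph) {
  context->Initialize();                     // binds AscendKernelRuntime and the memory manager
  context->UnifyMindIR(graph);               // common + Ascend MindIR unification
  context->OptimizeGraph(graph);             // delegated to AscendGraphOptimization, includes kernel selection
  context->PreprocessBeforeRunGraph(graph);  // atomic ops, loop ctrl, streams, kernel build, memory, LoadModel
  if (!context->LaunchGraph(graph)) {        // task-sink execution via AscendKernelRuntime::RunTask
    MS_LOG(EXCEPTION) << "Launch graph failed. graph id: " << graph->graph_id();
  }
  context->Destroy();
}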

View File

@ -0,0 +1,158 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_RUNTIME_HARDWARE_ASCEND_ASCEND_DEVICE_CONTEXT_H_
#define MINDSPORE_CCSRC_RUNTIME_HARDWARE_ASCEND_ASCEND_DEVICE_CONTEXT_H_
#include <vector>
#include <memory>
#include <string>
#include <set>
#include <map>
#include "runtime/hardware/device_context.h"
#include "runtime/hardware/device_context_manager.h"
#include "runtime/device/memory_manager.h"
#include "runtime/device/ascend/ascend_kernel_runtime.h"
#include "runtime/device/ascend/ascend_device_address.h"
namespace mindspore {
namespace device {
namespace ascend {
class AscendDeviceContext : public DeviceContext {
public:
explicit AscendDeviceContext(const DeviceContextKey &device_context_key)
: DeviceContext(device_context_key), mem_manager_(nullptr), initialized_(false) {}
~AscendDeviceContext() override = default;
// Initialize the device context.
void Initialize() override;
// Destroy device context and release device resource.
void Destroy() override;
// Get rank id for distributed training.
uint32_t GetRankID() const override { return rank_id_; }
// Partition the function graph through the device capability and return the partition segments.
// The second parameter is the default partition segments which are provided by the framework.
// Device can reprocess the default partition segments to new segments, also can partition the function graph again.
// If Device can launch the whole graph and not expect partitioning the function graph, then return the empty
// segments. The default behavior is return the default partition segments.
std::vector<GraphSegmentPtr> PartitionGraph(const FuncGraphPtr &func_graph,
const std::vector<GraphSegmentPtr> &default_partition_segments) override;
// Optimize the kernel graph for graph mode.
void OptimizeGraph(const KernelGraphPtr &graph) const override;
// Optimize the single operator graph for PyNative mode.
void OptimizeSingleOpGraph(const KernelGraphPtr &graph) const override;
// Select the matching backend kernels according to the data type and format of input and output for all
// execution operators, and set final device data type and format information for backend kernels, device
// data type and format which replace original data type and format will use for executing kernels.
void SetOperatorInfo(const std::vector<CNodePtr> &nodes) const override;
// Generate 'KernelMod' for all kernels and set 'KernelMod' into kernel,
// 'KernelMod' is real executive object of kernel.
void CreateKernel(const std::vector<CNodePtr> &nodes) const override;
// Adjust kernel graph before run graph, used in Graph Mode.
void PreprocessBeforeRunGraph(const KernelGraphPtr &graph) const override;
// Adjust single op kernel graph before run graph, used in PyNative Mode.
void PreprocessBeforeRunSingleOpGraph(const KernelGraphPtr &graph) const override;
// Infer kernel shape and update abstract info for dynamic shape kernel.
void UpdateDynamicShape(const CNodePtr &kernel) const override;
// Relevant function to allocate and free device memory.
bool AllocateMemory(DeviceAddress *const &address, size_t size) const override;
void FreeMemory(DeviceAddress *const &address) const override;
// Allocate continuous device memory end to end into 'addr_list'.
// Communication operators may need continuous memory for input and output
// to optimize the communication performance.
bool AllocateContinuousMemory(const std::vector<DeviceAddressPtr> &addr_list, size_t total_size,
const std::vector<size_t> &size_list) const override;
// Create concrete device address according different device type.
DeviceAddressPtr CreateDeviceAddress(void *const device_ptr, size_t device_size, const string &format,
TypeId type_id) const override {
return std::make_shared<AscendDeviceAddress>(device_ptr, device_size, format, type_id,
device_context_key_.device_name_, device_context_key_.device_id_);
}
// Get device address type according different device type, such GPU, Ascend.
DeviceAddressType GetDeviceAddressType() const override { return DeviceAddressType::kAscend; }
// Launch graph, device such as Ascend support the whole graph sink to the device executing.
bool LaunchGraph(const KernelGraphPtr &graph) const override;
// Launch a kernel via 'KernelMod' of the kernel.
bool LaunchKernel(const CNodePtr &kernel, const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &workspace, const std::vector<AddressPtr> &outputs,
bool is_dynamic_shape = false) const override;
// Synchronize stream, device such as GPU and Ascend need stream to launch kernel asynchronously,
// using 'SyncStream' to block thread and wait for completing all tasks in stream.
// Devices that do not need stream could ignore the implementation of this function.
bool SyncStream(size_t stream_id = 0) const override;
// Create and initialize bucket for every allreduce operator. Bucket is used in PyNative distributed training mode,
// one bucket handles all resource to launch and sync allreduce operator.
std::shared_ptr<Bucket> CreateBucket(uint32_t bucket_id, uint32_t bucket_size) const override;
// Unify the MindIR, the default behavior uses the common unified MindIR.
void UnifyMindIR(const KernelGraphPtr &graph) const override;
// Whether the graph is executed through device graph sink; the default behavior is not to sink and to return false.
bool IsExecutingSink(const KernelGraphPtr &graph) const override;
// Whether the graph is executed with loop sink through the device capability; the default behavior is not to
// loop sink and to return false.
bool IsLoopCountSink(const KernelGraphPtr &graph) const override;
// set rt_context_ to this thread to control device
bool BindDeviceToCurrentThread() const;
private:
// Graph loader interface
void AllocateGraphMemory(const NotNull<KernelGraphPtr> &root_graph) const;
void AssignInputMemory(const NotNull<KernelGraphPtr> &graph, NotNull<std::set<KernelGraphPtr> *> memo) const;
void LoadModel(const NotNull<KernelGraphPtr> &root_graph) const;
void UpdateExecOrder(const KernelGraphPtr &graph) const;
// Kernel Runtime --- only for task sink
AscendKernelRuntime *runtime_instance_{nullptr};
std::shared_ptr<MemoryManager> mem_manager_{nullptr};
// rank id of physical device
uint32_t rank_id_{0};
bool initialized_{false};
// LaunchGraph interface
bool ExecuteGraph(const KernelGraphPtr &graph) const;
// ExecuteGraph itself is not thread-safe; multiple threads should not call it on the same graph at the same time,
// so a launch mutex is needed when multiple threads launch graphs.
mutable std::mutex launch_mutex_;
// Graphs that have already been visited during the recursive traversal by graph id.
// Note: please clear the set before each use.
mutable std::set<KernelGraphPtr> memo_;
// Maps a node to its atomic clean ops.
mutable std::map<CNodePtr, std::vector<CNodePtr>> node_atomics_;
};
} // namespace ascend
} // namespace device
} // namespace mindspore
#endif // MINDSPORE_CCSRC_RUNTIME_HARDWARE_ASCEND_ASCEND_DEVICE_CONTEXT_H_
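Given the MS_REGISTER_DEVICE(kAscendDevice, AscendDeviceContext) registration in the source file above, other components reach this context through the device context manager, the same lookup AscendDeviceAddress::BindDevice uses earlier in this commit. A minimal sketch; the BindAscendThread wrapper is hypothetical:

// Hypothetical wrapper: fetch the registered Ascend context and bind the calling thread to its device.
void BindAscendThread(uint32_t device_id) {
  auto device_context =
      device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({kAscendDevice, device_id});
  auto ascend_context = dynamic_cast<device::ascend::AscendDeviceContext *>(device_context);
  MS_EXCEPTION_IF_NULL(ascend_context);
  if (!ascend_context->BindDeviceToCurrentThread()) {
    MS_LOG(EXCEPTION) << "BindDeviceToCurrentThread failed.";
  }
}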

View File

@ -0,0 +1,288 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "runtime/hardware/ascend/ascend_graph_optimization.h"
#include <set>
#include "backend/optimizer/common/common_backend_optimization.h"
#include "backend/optimizer/ascend/ascend_backend_optimization.h"
#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h"
#include "backend/session/ascend_auto_monad.h"
#include "utils/context/graph_kernel_flags.h"
#include "runtime/device/ascend/kernel_select_ascend.h"
#include "runtime/device/kernel_adjust.h"
#ifndef ENABLE_SECURITY
#include "debug/anf_ir_dump.h"
#include "debug/dump_proto.h"
#endif
namespace mindspore {
namespace device {
namespace ascend {
using AscendAutoMonad = mindspore::session::AscendAutoMonad;
void AscendGraphOptimization::OptimizeGraph(const KernelGraphPtr &graph) {
MS_EXCEPTION_IF_NULL(graph);
MS_LOG(INFO) << "Status record: start optimize graph. graph id: " << graph->graph_id();
// An empty graph does not enter the backend.
if (graph->execution_order().empty()) {
MS_LOG(INFO) << graph->ToString() << " is empty graph.";
AnfAlgo::InsertMakeTupleForOutput(NOT_NULL(graph));
graph->set_executable(false);
MS_LOG(INFO) << "Status record: end optimize graph. graph id: " << graph->graph_id();
}
OptimizeGraphWithoutDeviceInfo(graph);
SelectKernel(graph);
OptimizeGraphWithDeviceInfo(graph);
OptimizeExecutionOrder(graph);
PostOptimization(graph);
MS_LOG(INFO) << "Status record: end optimize graph. graph id: " << graph->graph_id();
}
void AscendGraphOptimization::OptimizeGraphWithoutDeviceInfo(const KernelGraphPtr &graph) {
MS_EXCEPTION_IF_NULL(graph);
HandleControlFlow(NOT_NULL(graph));
// Add all graphs to the manager first, so that new managers do not have to be created in later passes.
auto manager = Manage(graph, true);
memo_.clear();
AddGraphToManager(NOT_NULL(graph), NOT_NULL(manager));
memo_.clear();
IRFusionOptimization(graph);
}
void AscendGraphOptimization::OptimizeGraphWithDeviceInfo(const KernelGraphPtr &graph) {
MS_EXCEPTION_IF_NULL(graph);
memo_.clear();
HardWareOptimization(graph);
AnfAlgo::InsertMakeTupleForOutput(NOT_NULL(graph));
}
void AscendGraphOptimization::OptimizeExecutionOrder(const KernelGraphPtr &graph) {
MS_EXCEPTION_IF_NULL(graph);
MS_LOG(INFO) << "Status record: start optimize execution order. graph id: " << graph->graph_id();
// Validate the root graph, including generating the execution order and so on.
RootGraphExecutorValidate(NOT_NULL(graph));
#ifdef ENABLE_DUMP_IR
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
if (save_graphs) {
DumpIRProto(graph, "before_removeNop_" + std::to_string(graph->graph_id()));
}
#endif
opt::HideNopNode(graph.get());
auto execution_order = graph->execution_order();
AnfAlgo::ReorderExecList(NOT_NULL(&execution_order));
graph->set_execution_order(execution_order);
#ifndef ENABLE_SECURITY
// insert profiling point
device::KernelAdjust::GetInstance().Profiling(NOT_NULL(graph.get()));
#endif
device::KernelAdjust::GetInstance().InsertOverflowCheckOperations(NOT_NULL(graph));
#ifdef ENABLE_DUMP_IR
if (save_graphs) {
DumpIR("after_adjust_kernel.ir", graph);
}
#endif
MS_LOG(INFO) << "Status record: end optimize execution order. graph id: " << graph->graph_id();
}
void AscendGraphOptimization::PostOptimization(const KernelGraphPtr &graph) {
MS_LOG(INFO) << "Status record: start post optimization. graph id: " << graph->graph_id();
// copy child graph ref output map to father graph ref output map
memo_.clear();
UpdateRefOutputMap(graph);
graph->SetInputNodes();
graph->SetOptimizerFlag();
MS_LOG(INFO) << "Status record: end post optimization. graph id: " << graph->graph_id();
}
void AscendGraphOptimization::HardWareOptimization(const KernelGraphPtr &graph) {
MS_EXCEPTION_IF_NULL(graph);
MS_LOG(INFO) << "Status record: start hardware optimize. graph id: " << graph->graph_id();
if (memo_.find(graph) != memo_.end()) {
return;
}
memo_.insert(graph);
opt::AscendBackendOptimization(graph);
opt::CommonFinalOptimization(graph);
if (graphkernel::GraphKernelFlags::GetInstance().IsEnableGraphKernel()) {
graphkernel::GraphKernelOptimize(graph);
graph->SetExecOrderByDefault();
}
MS_LOG(INFO) << "Status record: end hardware optimize. graph id: " << graph->graph_id();
for (auto &child_graph : graph->child_graph_order()) {
HardWareOptimization(child_graph.lock());
}
}
void AscendGraphOptimization::AddGraphToManager(const NotNull<KernelGraphPtr> graph,
NotNull<FuncGraphManagerPtr> manager) {
if (memo_.find(graph) != memo_.end()) {
return;
}
memo_.insert(graph.get());
manager->AddFuncGraph(graph.get(), false);
for (auto &child_graph : graph->child_graph_order()) {
AddGraphToManager(NOT_NULL(child_graph.lock()), manager);
}
}
void AscendGraphOptimization::IRFusionOptimization(const KernelGraphPtr &graph) {
MS_EXCEPTION_IF_NULL(graph);
if (memo_.find(graph) != memo_.end()) {
return;
}
memo_.insert(graph);
opt::AscendBackendIRFusionOptimization(graph);
#ifdef ENABLE_DUMP_IR
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
if (save_graphs) {
std::string file_name = "select_kernel_before_graph_" + std::to_string(graph->graph_id()) + ".ir";
DumpIR(file_name, graph);
}
#endif
for (auto &child_graph : graph->child_graph_order()) {
IRFusionOptimization(NOT_NULL(child_graph.lock()));
}
}
void AscendGraphOptimization::HandleControlFlow(const NotNull<KernelGraphPtr> graph) {
MS_LOG(INFO) << "Status record: start handle control flow. graph id: " << graph->graph_id();
AscendAutoMonad auto_monad(graph);
auto_monad.Run();
MS_LOG(INFO) << "Status record: end handle control flow. graph id: " << graph->graph_id();
}
void AscendGraphOptimization::RootGraphExecutorValidate(NotNull<KernelGraphPtr> graph) {
MS_LOG(INFO) << "Status record: start graph executor validate. graph id: " << graph->graph_id();
AscendAutoMonad auto_monad(graph);
auto_monad.GenerateExecuteOrder();
MS_LOG(INFO) << "Status record: end graph executor validate. graph id: " << graph->graph_id();
}
void AscendGraphOptimization::RecurseSelectKernelInfo(const KernelGraphPtr &graph) {
MS_EXCEPTION_IF_NULL(graph);
if (memo_.find(graph) != memo_.end()) {
return;
}
memo_.insert(graph);
MS_LOG(INFO) << "Status record: start select kernel info. graph id: " << graph->graph_id();
SetOperatorInfo(graph->execution_order());
MS_LOG(INFO) << "Status record: end select kernel info. graph id: " << graph->graph_id();
#ifdef ENABLE_DUMP_IR
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
if (save_graphs) {
std::string file_name = "select_kernel_after_graph_" + std::to_string(graph->graph_id()) + ".ir";
DumpIR(file_name, graph);
}
#endif
for (auto &child_graph : graph->child_graph_order()) {
RecurseSelectKernelInfo(child_graph.lock());
}
}
void AscendGraphOptimization::SelectKernel(const KernelGraphPtr &graph) {
MS_LOG(INFO) << "Status record: start select kernel info. graph id: " << graph->graph_id();
raise_precision_count_ = 0;
reduce_precision_count_ = 0;
memo_.clear();
RecurseSelectKernelInfo(graph);
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kGraphMode) {
if (raise_precision_count_ > 0) {
MS_LOG(WARNING) << "There are " << raise_precision_count_
<< " node/nodes used raise precision to selected the kernel!";
}
if (reduce_precision_count_ > 0) {
MS_LOG(WARNING) << "There are " << reduce_precision_count_
<< " node/nodes used reduce precision to selected the kernel!";
}
}
MS_LOG(INFO) << "Status record: end select kernel info. graph id: " << graph->graph_id();
}
void AscendGraphOptimization::UpdateRefOutputMap(const KernelGraphPtr &graph) {
MS_EXCEPTION_IF_NULL(graph);
if (memo_.find(graph) != memo_.end()) {
return;
}
memo_.insert(graph);
for (auto &child_graph : graph->child_graph_order()) {
auto child_graph_ptr = child_graph.lock();
MS_EXCEPTION_IF_NULL(child_graph_ptr);
UpdateRefOutputMap(NOT_NULL(child_graph_ptr));
// copy ref map to final graph
auto child_ref_map = child_graph_ptr->GetRefMap();
for (auto &item : child_ref_map) {
if (graph->IsInRefOutputMap(item.first)) {
MS_LOG(WARNING) << "The ref pair <" << item.first.first->DebugString() << ", " << item.first.second
<< "> is already in " << graph->ToString();
continue;
}
graph->AddRefCorrespondPairs(item.first, item.second);
}
}
}
void AscendGraphOptimization::UnifyMindIR(const KernelGraphPtr &graph) {
MS_EXCEPTION_IF_NULL(graph);
MS_LOG(INFO) << "Status record: start unify mindir. graph id: " << graph->graph_id();
opt::CommonUnifyMindIR(graph);
opt::AscendUnifyMindIR(graph);
MS_LOG(INFO) << "Status record: end unify mindir. graph id: " << graph->graph_id();
}
void AscendGraphOptimization::SetOperatorInfo(const std::vector<CNodePtr> &nodes) {
for (const auto &node : nodes) {
auto status = device::ascend::SelectKernelInfo(node);
AnfAlgo::EraseNodeAttr(kAttrPynativeNextOpName, node);
AnfAlgo::EraseNodeAttr(kAttrPynativeNextIndex, node);
if (status == device::ascend::kStatusRaisePrecision) {
raise_precision_count_++;
} else if (status == device::ascend::kStatusReducePrecision) {
reduce_precision_count_++;
}
MS_LOG(DEBUG) << "Select ApplyKernel: " << node->DebugString();
}
}
} // namespace ascend
} // namespace device
} // namespace mindspore

View File

@ -0,0 +1,77 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_RUNTIME_HARDWARE_ASCEND_MINDSPORE_ASCEND_GRAPH_OPTIMIZATION_H
#define MINDSPORE_CCSRC_RUNTIME_HARDWARE_ASCEND_MINDSPORE_ASCEND_GRAPH_OPTIMIZATION_H
#include <vector>
#include <memory>
#include <string>
#include <set>
#include <map>
#include "runtime/hardware/device_context.h"
#include "runtime/hardware/device_context_manager.h"
#include "runtime/device/memory_manager.h"
#include "runtime/device/ascend/ascend_kernel_runtime.h"
#include "runtime/device/ascend/ascend_device_address.h"
namespace mindspore {
namespace device {
namespace ascend {
class AscendGraphOptimization {
public:
static AscendGraphOptimization &GetInstance() {
static AscendGraphOptimization instance;
return instance;
}
AscendGraphOptimization() = default;
~AscendGraphOptimization() = default;
AscendGraphOptimization(const AscendGraphOptimization &) = delete;
AscendGraphOptimization &operator=(const AscendGraphOptimization &) = delete;
void OptimizeGraph(const KernelGraphPtr &graph);
void SetOperatorInfo(const std::vector<CNodePtr> &nodes);
void UnifyMindIR(const KernelGraphPtr &graph);
private:
// Graph Optimized level-2 interface
void OptimizeGraphWithoutDeviceInfo(const KernelGraphPtr &graph);
void OptimizeGraphWithDeviceInfo(const KernelGraphPtr &graph);
void OptimizeExecutionOrder(const KernelGraphPtr &graph);
void PostOptimization(const KernelGraphPtr &graph);
// Graph Optimized level-3 interface
void IRFusionOptimization(const KernelGraphPtr &graph);
void UpdateRefOutputMap(const KernelGraphPtr &graph);
void AddGraphToManager(const NotNull<KernelGraphPtr> graph, NotNull<FuncGraphManagerPtr> manager);
void SelectKernel(const KernelGraphPtr &graph);
void RecurseSelectKernelInfo(const KernelGraphPtr &graph);
void HardWareOptimization(const KernelGraphPtr &graph);
void HandleControlFlow(const NotNull<KernelGraphPtr> graph);
void RootGraphExecutorValidate(NotNull<KernelGraphPtr> graph);
// Number of operators whose precision changes after select kernel
size_t raise_precision_count_{0};
size_t reduce_precision_count_{0};
// Graphs that have already been visited during the recursive traversal by graph id.
// Note: please clear the set before each use.
std::set<KernelGraphPtr> memo_;
};
} // namespace ascend
} // namespace device
} // namespace mindspore
#endif // MINDSPORE_CCSRC_RUNTIME_HARDWARE_ASCEND_MINDSPORE_ASCEND_GRAPH_OPTIMIZATION_H
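As used from AscendDeviceContext earlier in this commit, the optimizer is a process-wide singleton; a minimal usage sketch with a hypothetical OptimizeForAscend wrapper:

// Mirrors the calls made from AscendDeviceContext::UnifyMindIR and AscendDeviceContext::OptimizeGraph.
void OptimizeForAscend(const KernelGraphPtr &graph) {
  auto &graph_opt = device::ascend::AscendGraphOptimization::GetInstance();
  graph_opt.UnifyMindIR(graph);    // common + Ascend-specific unify-MindIR passes
  graph_opt.OptimizeGraph(graph);  // IR fusion, kernel selection, hardware optimization, execution order
}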

View File

@ -136,6 +136,8 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
"../../../mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc"
"../../../mindspore/ccsrc/runtime/device/ascend/ascend_memory_pool.cc"
"../../../mindspore/ccsrc/runtime/device/ascend/lic_manager.cc"
"../../../mindspore/ccsrc/runtime/hardware/ascend/ascend_device_context.cc"
"../../../mindspore/ccsrc/runtime/hardware/ascend/ascend_graph_optimization.cc"
"../../../mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.cc"
"../../../mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel_factory.cc"
"../../../mindspore/ccsrc/backend/kernel_compiler/cpu/sparse_apply_adam_cpu_kernel.cc"