Optimize PyNative device memory use

Add gradient outputs to the PyNative unique (persistent) memory pool

Signed-off-by: zjun <zhangjun0@huawei.com>
zjun 2021-10-15 09:21:31 +08:00
parent 4711194524
commit eb450dd31f
41 changed files with 574 additions and 274 deletions

View File

@ -24,6 +24,8 @@ The MindSpore context is used to configure the current execution environment, including the execution mode, ...
| | max_device_memory | GPU |
| +------------------------------+----------------------------+
| | variable_memory_max_size | Ascend |
| +------------------------------+----------------------------+
| | mempool_block_size | GPU/Ascend |
+-------------------------+------------------------------+----------------------------+
| Debug configuration | save_graphs | CPU/GPU/Ascend |
| +------------------------------+----------------------------+
@ -76,6 +78,7 @@ The MindSpore context is used to configure the current execution environment, including the execution mode, ...
- **device_target** (str) - The target device to run on, supporting Ascend, GPU, and CPU. If the device target is not set, the version of the MindSpore package is used.
- **max_device_memory** (str) - Sets the maximum memory available to the device. Currently only supported on GPU. The format is "xxGB". Default: 1024GB. The actual memory used is the minimum of the device's available memory and the `max_device_memory` value.
- **variable_memory_max_size** (str) - Sets the maximum size of variable memory. Default: 30GB. After this parameter is set, the maximum memory used by the framework is limited by the configured value.
- **mempool_block_size** (str) - Sets the block size of the device memory pool in PyNative mode. The format is "xxGB". Default: 1GB. The minimum value is 1GB. The actual memory pool block size used is the minimum of the device's available memory and the `mempool_block_size` value.
- **save_graphs** (bool) - Whether to save graphs. Default: False. When the `save_graphs` attribute is set to True, the `save_graphs_path` attribute is used to set the storage path of intermediate compilation graphs. By default, graphs are saved in the current directory.
- **save_graphs_path** (str) - The path where graphs are saved. Default: ".". If the specified directory does not exist, the system creates it automatically. In distributed training, graphs are saved to the `save_graphs_path/rank_${rank_id}/` directory, where `rank_id` is the ID of the current device in the cluster.
- **enable_dump** (bool) - This parameter is deprecated and will be removed in the next version.
@ -155,6 +158,7 @@ The MindSpore context is used to configure the current execution environment, including the execution mode, ...
... profiling_options='{"output":"/home/data/output","training_trace":"on"}')
>>> context.set_context(check_bprop=True)
>>> context.set_context(max_device_memory="3.5GB")
>>> context.set_context(mempool_block_size="1GB")
>>> context.set_context(print_file_path="print.pb")
>>> context.set_context(enable_sparse=True)
>>> context.set_context(max_call_depth=80)

View File

@ -15,39 +15,51 @@
*/
#include "backend/optimizer/mem_reuse/mem_dynamic_allocator.h"
#include <string>
#include "utils/ms_utils.h"
#include "utils/convert_utils.h"
#include "utils/log_adapter.h"
#include "utils/ms_context.h"
namespace mindspore {
namespace device {
DynamicMemPoolBestFit::~DynamicMemPoolBestFit() {
global_mem_block_list_.clear();
global_idle_mem_buf_map_.clear();
static const char kPynativeParamMem[] = "Pynative unique mem";
static const char kCommonMem[] = "Common mem";
const size_t kGBToByte = 1024 << 20;
static bool IsPynativeMode() {
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
return ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode;
}
DeviceMemPtr DynamicMemPoolBestFit::AllocTensorMem(size_t size) {
DynamicMemPoolBestFit::~DynamicMemPoolBestFit() {
persistent_mem_->clear();
common_mem_->clear();
}
DeviceMemPtr DynamicMemPoolBestFit::AllocTensorMem(size_t size, bool from_persistent_mem) {
size_t align_size = AlignMemorySize(size);
std::lock_guard<std::mutex> locker(mutex_);
// Find the idle memory buf by tensor size; if none is found, add a new memory block and memory buf.
DeviceMemPtr device_addr = FindIdleMemBuf(align_size);
DeviceMemPtr device_addr = FindIdleMemBuf(align_size, from_persistent_mem);
if (!device_addr) {
device_addr = AddMemBlockAndMemBuf(align_size);
device_addr = AddMemBlockAndMemBuf(align_size, from_persistent_mem);
}
return device_addr;
}
std::vector<DeviceMemPtr> DynamicMemPoolBestFit::AllocContinuousTensorMem(size_t total_size,
std::vector<size_t> size_list) {
const std::vector<size_t> &size_list) {
std::vector<DeviceMemPtr> device_addr_list;
// Pre-alloc the one whole piece memory.
auto device_addr = AllocTensorMem(total_size);
auto device_addr = AllocTensorMem(total_size, false);
if (!device_addr) {
return device_addr_list;
}
std::lock_guard<std::mutex> locker(mutex_);
// Remove the pre-alloc memory.
const auto &mem_block = FindMemBlock(device_addr);
const auto &mem_block = FindMemBlock(device_addr, common_mem_);
MS_EXCEPTION_IF_NULL(mem_block);
const auto &iter = mem_block->block_all_mem_buf_map_.find(device_addr);
if (iter == mem_block->block_all_mem_buf_map_.end()) {
@ -63,11 +75,11 @@ std::vector<DeviceMemPtr> DynamicMemPoolBestFit::AllocContinuousTensorMem(size_t
// Split the pre-alloc memory into continuous memory by the size list.
DynamicMemBufPtr continuous_mem_buf;
auto buf_addr = device_addr;
for (size_t i = 0; i < size_list.size(); i++) {
continuous_mem_buf = std::make_shared<DynamicMemBuf>(buf_addr, kMemBufUsed, size_list[i]);
for (size_t i : size_list) {
continuous_mem_buf = std::make_shared<DynamicMemBuf>(buf_addr, kMemBufUsed, i);
(void)mem_block->block_all_mem_buf_map_.emplace(buf_addr, continuous_mem_buf);
device_addr_list.emplace_back(buf_addr);
buf_addr = AddressOffset(buf_addr, size_list[i]);
buf_addr = AddressOffset(buf_addr, i);
}
// Update the size of the last memory buf.
continuous_mem_buf->size_ += rest_size;
@ -81,9 +93,14 @@ size_t DynamicMemPoolBestFit::AlignMemorySize(size_t size) const {
return ((size + DYNAMIC_MEM_ALIGN_SIZE - 1) / DYNAMIC_MEM_ALIGN_SIZE) * DYNAMIC_MEM_ALIGN_SIZE;
}
DeviceMemPtr DynamicMemPoolBestFit::FindIdleMemBuf(size_t size) {
const auto &iter = global_idle_mem_buf_map_.lower_bound(size);
if (iter != global_idle_mem_buf_map_.end()) {
DeviceMemPtr DynamicMemPoolBestFit::FindIdleMemBuf(size_t size, bool from_persistent_mem) {
auto mem_mng = common_mem_;
if (from_persistent_mem) {
mem_mng = persistent_mem_;
}
MS_EXCEPTION_IF_NULL(mem_mng);
const auto &iter = mem_mng->idle_mem_buf_map_.lower_bound(size);
if (iter != mem_mng->idle_mem_buf_map_.end()) {
auto mem_buf = iter->second;
MS_EXCEPTION_IF_NULL(mem_buf);
if (mem_buf->status_ != kMemBufIdle) {
@ -92,24 +109,69 @@ DeviceMemPtr DynamicMemPoolBestFit::FindIdleMemBuf(size_t size) {
}
mem_buf->status_ = kMemBufUsed;
// Remove map of old idle memory buf
(void)global_idle_mem_buf_map_.erase(iter);
(void)mem_mng->idle_mem_buf_map_.erase(iter);
// Divide memory buf
if (IsDivide(size, mem_buf->size_)) {
DivideMemBuf(size, mem_buf);
if (IsSplit(size, mem_buf->size_)) {
SplitMemBuf(size, mem_buf, mem_mng);
}
// Memory statistics
total_used_mem_statistics_ += mem_buf->size_;
if (total_used_mem_statistics_ > used_mem_peak_statistics_) {
used_mem_peak_statistics_ = total_used_mem_statistics_;
mem_mng->mps_.total_used_mem_size_ += mem_buf->size_;
if (mem_mng->mps_.total_used_mem_size_ > mem_mng->mps_.used_mem_peak_size_) {
mem_mng->mps_.used_mem_peak_size_ = mem_mng->mps_.total_used_mem_size_;
}
return mem_buf->device_addr_;
}
return nullptr;
}
DeviceMemPtr DynamicMemPoolBestFit::AddMemBlockAndMemBuf(size_t size) {
size_t alloc_mem_size = CalMemBlockAllocSize(size);
size_t DynamicMemPoolBestFit::MemAllocUnitSize(bool from_persistent_mem) const {
return from_persistent_mem ? persistent_mem_->unit_size_ : common_mem_->unit_size_;
}
void DynamicMemPoolBestFit::SetMemAllocUintSize(size_t size) {
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
persistent_mem_->unit_size_ = DYNAMIC_MEM_ALLOC_UNIT_SIZE;
common_mem_->unit_size_ = size - persistent_mem_->unit_size_;
MS_LOG(INFO) << "Set mem alloc unit size " << size;
}
void DynamicMemPoolBestFit::SetMempoolBlockSize(size_t device_mem_size) {
if (!IsPynativeMode()) {
return;
}
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
float mem_block_size = ms_context->get_param<float>(MS_CTX_MEMPOOL_BLOCK_SIZE);
if (mem_block_size == kDefaultMempoolBlockSize) {
return;
}
size_t config_size = FloatToSize(mem_block_size * kGBToByte);
size_t real_block_size = std::min(config_size, device_mem_size);
if (config_size > device_mem_size) {
MS_LOG(WARNING) << "Memory pool block size " << config_size << " is bigger than currently available maximum memory "
<< device_mem_size << ", and the actual effective value will be " << device_mem_size;
}
SetMemAllocUintSize(real_block_size);
}
DeviceMemPtr DynamicMemPoolBestFit::AddMemBlockAndMemBuf(size_t size, bool from_persistent_mem) {
// PyNative unique mem is not enough, find from the common pool
if (from_persistent_mem && !persistent_mem_->mem_block_list_.empty()) {
auto mem_addr = FindIdleMemBuf(size, false);
if (mem_addr != nullptr) {
return mem_addr;
}
from_persistent_mem = false;
}
size_t alloc_mem_size = CalMemBlockAllocSize(size, from_persistent_mem);
if (alloc_mem_size == 0) {
MS_LOG(DEBUG) << "Try to find in other mem";
auto mem_addr = FindIdleMemBuf(size, !from_persistent_mem);
if (mem_addr != nullptr) {
return mem_addr;
}
DumpDynamicMemPoolInfo();
return nullptr;
}
// Add new memory block
@ -118,40 +180,50 @@ DeviceMemPtr DynamicMemPoolBestFit::AddMemBlockAndMemBuf(size_t size) {
if (real_alloc_size < size) {
MS_LOG(WARNING) << "Memory not enough: alloc size[" << real_alloc_size << "] is smaller than required size[" << size
<< "].";
DumpDynamicMemPoolInfo();
return nullptr;
}
mem_alloc_unit_size_ = DYNAMIC_MEM_ALLOC_UNIT_SIZE;
// In graph mode, unit_size is set once using an estimated memory size, and subsequent memory requests use the
// default size
if (!IsPynativeMode()) {
common_mem_->unit_size_ = DYNAMIC_MEM_ALLOC_UNIT_SIZE;
}
auto mem_mng = common_mem_;
if (from_persistent_mem) {
mem_mng = persistent_mem_;
}
MS_EXCEPTION_IF_NULL(mem_mng);
auto mem_block = std::make_shared<DynamicMemBlock>(device_addr, real_alloc_size);
MS_EXCEPTION_IF_NULL(mem_block);
const auto &iter =
std::upper_bound(global_mem_block_list_.begin(), global_mem_block_list_.end(), device_addr, CmpMemBlock);
(void)global_mem_block_list_.insert(iter, mem_block);
std::upper_bound(mem_mng->mem_block_list_.begin(), mem_mng->mem_block_list_.end(), device_addr, CmpMemBlock);
(void)mem_mng->mem_block_list_.insert(iter, mem_block);
// Add new memory buf
auto mem_buf = std::make_shared<DynamicMemBuf>(device_addr, kMemBufUsed, real_alloc_size);
MS_EXCEPTION_IF_NULL(mem_buf);
// Add map of new memory buf in the block
(void)mem_block->block_all_mem_buf_map_.emplace(device_addr, mem_buf);
// Divide memory buf
if (IsDivide(size, mem_buf->size_)) {
DivideMemBuf(size, mem_buf);
// Split memory buf
if (IsSplit(size, mem_buf->size_)) {
SplitMemBuf(size, mem_buf, mem_mng);
}
// Memory statistics
total_mem_statistics_ += real_alloc_size;
total_used_mem_statistics_ += mem_buf->size_;
if (total_used_mem_statistics_ > used_mem_peak_statistics_) {
used_mem_peak_statistics_ = total_used_mem_statistics_;
mem_mng->mps_.total_mem_size_ += real_alloc_size;
mem_mng->mps_.total_used_mem_size_ += mem_buf->size_;
if (mem_mng->mps_.total_used_mem_size_ > mem_mng->mps_.used_mem_peak_size_) {
mem_mng->mps_.used_mem_peak_size_ = mem_mng->mps_.total_used_mem_size_;
}
return mem_buf->device_addr_;
}
size_t DynamicMemPoolBestFit::CalMemBlockAllocSize(size_t size) {
size_t DynamicMemPoolBestFit::CalMemBlockAllocSize(size_t size, bool from_persistent_mem) {
auto device_free_mem_size = free_mem_size();
if (device_free_mem_size < size) {
MS_LOG(WARNING) << "Memory not enough: current free memory size[" << device_free_mem_size
<< "] is smaller than required size[" << size << "].";
return 0;
}
auto alloc_mem_size = mem_alloc_unit_size();
auto alloc_mem_size = MemAllocUnitSize(from_persistent_mem);
// Grow by doubling the alloc size
constexpr size_t kDouble = 2;
while (alloc_mem_size < size) {
@ -161,13 +233,14 @@ size_t DynamicMemPoolBestFit::CalMemBlockAllocSize(size_t size) {
return alloc_mem_size;
}
bool DynamicMemPoolBestFit::IsDivide(size_t tensor_size, size_t mem_buf_size) const {
bool DynamicMemPoolBestFit::IsSplit(size_t tensor_size, size_t mem_buf_size) const {
return mem_buf_size - tensor_size >= DYNAMIC_MEM_ALIGN_SIZE;
}
void DynamicMemPoolBestFit::DivideMemBuf(size_t size, const DynamicMemBufPtr &mem_buf) {
void DynamicMemPoolBestFit::SplitMemBuf(size_t size, const DynamicMemBufPtr &mem_buf,
const MemStatusManagerPtr &mem_mng) {
MS_EXCEPTION_IF_NULL(mem_buf);
const auto &mem_block = FindMemBlock(mem_buf->device_addr_);
const auto &mem_block = FindMemBlock(mem_buf->device_addr_, mem_mng);
MS_EXCEPTION_IF_NULL(mem_block);
// Split the new memory buf
if (mem_buf->size_ < size) {
@ -180,7 +253,7 @@ void DynamicMemPoolBestFit::DivideMemBuf(size_t size, const DynamicMemBufPtr &me
// Add map of new memory buf in the block
(void)mem_block->block_all_mem_buf_map_.emplace(newbuf_addr, new_mem_buf);
// Add map of new idle memory buf
(void)global_idle_mem_buf_map_.emplace(newbuf_size, new_mem_buf);
(void)mem_mng->idle_mem_buf_map_.emplace(newbuf_size, new_mem_buf);
}
bool DynamicMemPoolBestFit::CmpMemBlock(const DeviceMemPtr &device_addr, const DynamicMemBlockPtr &mem_block) {
@ -189,11 +262,12 @@ bool DynamicMemPoolBestFit::CmpMemBlock(const DeviceMemPtr &device_addr, const D
return device_addr < mem_block->device_addr();
}
DynamicMemBlockPtr DynamicMemPoolBestFit::FindMemBlock(const DeviceMemPtr &device_addr) {
DynamicMemBlockPtr DynamicMemPoolBestFit::FindMemBlock(const DeviceMemPtr &device_addr,
const MemStatusManagerPtr &mem_mng) {
MS_EXCEPTION_IF_NULL(device_addr);
auto &&iter =
std::upper_bound(global_mem_block_list_.begin(), global_mem_block_list_.end(), device_addr, CmpMemBlock);
if (iter != global_mem_block_list_.begin()) {
std::upper_bound(mem_mng->mem_block_list_.begin(), mem_mng->mem_block_list_.end(), device_addr, CmpMemBlock);
if (iter != mem_mng->mem_block_list_.begin()) {
return *(--iter);
}
return nullptr;
@ -202,16 +276,32 @@ DynamicMemBlockPtr DynamicMemPoolBestFit::FindMemBlock(const DeviceMemPtr &devic
void DynamicMemPoolBestFit::FreeTensorMem(const DeviceMemPtr &device_addr) {
MS_EXCEPTION_IF_NULL(device_addr);
std::lock_guard<std::mutex> locker(mutex_);
const auto &mem_block = FindMemBlock(device_addr);
auto fn = [this](const MemStatusManagerPtr &mem_mng, const DeviceMemPtr &device_addr) -> DynamicMemBlockPtr {
auto mem_block = FindMemBlock(device_addr, mem_mng);
if (mem_block != nullptr) {
const auto &iter = mem_block->block_all_mem_buf_map_.find(device_addr);
if (iter != mem_block->block_all_mem_buf_map_.end()) {
return mem_block;
}
}
return nullptr;
};
auto mem_block = fn(common_mem_, device_addr);
if (mem_block == nullptr) {
// The memory pool may be destroyed before the address, so this is a normal case.
MS_LOG(DEBUG) << "Can't find the mem_block of the device address[" << device_addr << "].";
return;
mem_block = fn(persistent_mem_, device_addr);
if (mem_block == nullptr) {
// The memory pool may be destroyed before the address, so this is a normal case.
MS_LOG(DEBUG) << "Can't find the mem_block of the device address[" << device_addr << "].";
return;
}
CombineMemBuf(mem_block, device_addr, persistent_mem_);
} else {
CombineMemBuf(mem_block, device_addr, common_mem_);
}
CombineMemBuf(mem_block, device_addr);
}
void DynamicMemPoolBestFit::CombineMemBuf(const DynamicMemBlockPtr &mem_block, const DeviceMemPtr &device_addr) {
void DynamicMemPoolBestFit::CombineMemBuf(const DynamicMemBlockPtr &mem_block, const DeviceMemPtr &device_addr,
const MemStatusManagerPtr &mem_mng) {
MS_EXCEPTION_IF_NULL(mem_block);
MS_EXCEPTION_IF_NULL(device_addr);
const auto &iter = mem_block->block_all_mem_buf_map_.find(device_addr);
@ -224,10 +314,10 @@ void DynamicMemPoolBestFit::CombineMemBuf(const DynamicMemBlockPtr &mem_block, c
MS_LOG(EXCEPTION) << "Find the mem_buf is not used, mem_buf_address[" << mem_buf->device_addr_ << "].";
}
mem_buf->status_ = kMemBufIdle;
if (total_used_mem_statistics_ < mem_buf->size_) {
if (mem_mng->mps_.total_used_mem_size_ < mem_buf->size_) {
MS_LOG(EXCEPTION) << "The total used mem size is less than the size of membuf.";
}
total_used_mem_statistics_ -= mem_buf->size_;
mem_mng->mps_.total_used_mem_size_ -= mem_buf->size_;
// Combine backward(combine the next_mem_buf to mem_buf)
auto next_iter = iter;
(void)next_iter++;
@ -236,7 +326,7 @@ void DynamicMemPoolBestFit::CombineMemBuf(const DynamicMemBlockPtr &mem_block, c
MS_EXCEPTION_IF_NULL(next_mem_buf);
if (next_mem_buf->status_ == kMemBufIdle) {
mem_buf->size_ += next_mem_buf->size_;
EraseIdleMemBuf(next_mem_buf->size_, next_mem_buf->device_addr_);
EraseIdleMemBuf(next_mem_buf->size_, next_mem_buf->device_addr_, mem_mng);
(void)mem_block->block_all_mem_buf_map_.erase(next_iter);
}
}
@ -249,7 +339,7 @@ void DynamicMemPoolBestFit::CombineMemBuf(const DynamicMemBlockPtr &mem_block, c
prev_mem_buf = prev_iter->second;
MS_EXCEPTION_IF_NULL(prev_mem_buf);
if (prev_mem_buf->status_ == kMemBufIdle) {
EraseIdleMemBuf(prev_mem_buf->size_, prev_mem_buf->device_addr_);
EraseIdleMemBuf(prev_mem_buf->size_, prev_mem_buf->device_addr_, mem_mng);
prev_mem_buf->size_ += mem_buf->size_;
(void)mem_block->block_all_mem_buf_map_.erase(iter);
forward_combine = true;
@ -257,20 +347,21 @@ void DynamicMemPoolBestFit::CombineMemBuf(const DynamicMemBlockPtr &mem_block, c
}
// Add map of new idle memory
if (forward_combine) {
(void)global_idle_mem_buf_map_.emplace(prev_mem_buf->size_, prev_mem_buf);
(void)mem_mng->idle_mem_buf_map_.emplace(prev_mem_buf->size_, prev_mem_buf);
} else {
(void)global_idle_mem_buf_map_.emplace(mem_buf->size_, mem_buf);
(void)mem_mng->idle_mem_buf_map_.emplace(mem_buf->size_, mem_buf);
}
}
void DynamicMemPoolBestFit::EraseIdleMemBuf(size_t size, const DeviceMemPtr &device_addr) {
void DynamicMemPoolBestFit::EraseIdleMemBuf(size_t size, const DeviceMemPtr &device_addr,
const MemStatusManagerPtr &mem_mng) {
MS_EXCEPTION_IF_NULL(device_addr);
auto &&iter = global_idle_mem_buf_map_.equal_range(size);
auto &&iter = mem_mng->idle_mem_buf_map_.equal_range(size);
while (iter.first != iter.second) {
MS_EXCEPTION_IF_NULL(iter.first->second);
// Remove map of the idle memory buf by size and device address
if (iter.first->second->device_addr_ == device_addr) {
(void)global_idle_mem_buf_map_.erase(iter.first);
(void)mem_mng->idle_mem_buf_map_.erase(iter.first);
return;
}
(void)iter.first++;
@ -280,70 +371,48 @@ void DynamicMemPoolBestFit::EraseIdleMemBuf(size_t size, const DeviceMemPtr &dev
void DynamicMemPoolBestFit::ReleaseDeviceRes() {
std::lock_guard<std::mutex> locker(mutex_);
MS_LOG(INFO) << "The dynamic memory pool total size is " << total_mem_statistics_ << ", total used size is "
<< total_used_mem_statistics_ << ", used peak size is " << used_mem_peak_statistics_ << ".";
for (auto iter = global_mem_block_list_.begin(); iter != global_mem_block_list_.end(); ++iter) {
auto &device_addr = (*iter)->device_addr_base_;
if (device_addr != nullptr) {
if (!FreeDeviceMem(device_addr)) {
MS_LOG(EXCEPTION) << "Free device memory[" << device_addr << "] error.";
auto fn = [this](const MemStatusManagerPtr &mem_mng) {
for (auto &iter : mem_mng->mem_block_list_) {
auto &device_addr = iter->device_addr_base_;
if (device_addr != nullptr) {
if (!FreeDeviceMem(device_addr)) {
MS_LOG(EXCEPTION) << "Free device memory[" << device_addr << "] error.";
}
device_addr = nullptr;
}
device_addr = nullptr;
}
}
global_mem_block_list_.clear();
global_idle_mem_buf_map_.clear();
mem_mng->mem_block_list_.clear();
mem_mng->idle_mem_buf_map_.clear();
};
fn(common_mem_);
fn(persistent_mem_);
}
void DynamicMemPoolBestFit::DumpDynamicMemPoolInfo() {
std::lock_guard<std::mutex> locker(mutex_);
MS_LOG(INFO) << "Start dump dynamic memory pool info.";
DeviceAddrMapMemBuf mem_block_map;
DynamicMemBufPtr mem_buf;
size_t total_mem = 0;
size_t total_used_mem = 0;
size_t total_idle_mem1 = 0;
size_t total_idle_mem2 = 0;
// Dump the memory block info and memory buf info
MS_LOG(INFO) << "Dump all mem_block info: counts[" << global_mem_block_list_.size() << "].";
for (auto iter = global_mem_block_list_.begin(); iter != global_mem_block_list_.end(); ++iter) {
total_mem += (*iter)->size();
mem_block_map = (*iter)->block_all_mem_buf_map_;
MS_LOG(INFO) << "MemBlock info: number[" << iter - global_mem_block_list_.begin() << "] mem_buf_counts["
<< mem_block_map.size() << "] base_address[" << (*iter)->device_addr() << "] block_size["
<< (*iter)->size() << "].";
for (auto iter_mem_buf = mem_block_map.begin(); iter_mem_buf != mem_block_map.end(); ++iter_mem_buf) {
mem_buf = iter_mem_buf->second;
MS_EXCEPTION_IF_NULL(mem_buf);
if (mem_buf->status_ == kMemBufIdle) {
total_idle_mem1 += mem_buf->size_;
} else {
total_used_mem += mem_buf->size_;
}
MS_LOG(INFO) << "MemBuf info: address[" << mem_buf->device_addr_ << "] size[" << mem_buf->size_ << "] status["
<< mem_buf->status_ << "].";
auto fn = [](const MemStatusManagerPtr &mem_mng, const std::string &mem_type) {
if (mem_mng->mem_block_list_.empty()) {
return;
}
}
// Dump all the idle memory buf info
MS_LOG(INFO) << "Dump all idle mem_buf info: counts[" << global_idle_mem_buf_map_.size() << "].";
for (auto iter_idle = global_idle_mem_buf_map_.begin(); iter_idle != global_idle_mem_buf_map_.end(); ++iter_idle) {
mem_buf = iter_idle->second;
MS_EXCEPTION_IF_NULL(mem_buf);
total_idle_mem2 += mem_buf->size_;
MS_LOG(INFO) << "Idle mem_buf info: size[" << mem_buf->size_ << "] address[" << mem_buf->device_addr_ << "] status["
<< mem_buf->status_ << "].";
}
// Dump the memory statistical info
MS_LOG(INFO) << "Total allocated memory[" << total_mem << "], used memory[" << total_used_mem << "], idle memory["
<< total_idle_mem1 << "].";
if (total_idle_mem1 != total_idle_mem2) {
MS_LOG(ERROR) << "Check error: the idle memory in the mem_block is not equal to the global idle memory.";
}
if (total_mem != total_used_mem + total_idle_mem1) {
MS_LOG(ERROR) << "Check error: the total memory is not equal to the sum of used memory and idle memory.";
}
MS_LOG(INFO) << "Finish dump dynamic memory pool info.";
std::ostringstream buf;
for (size_t i = 0; i < mem_mng->mem_block_list_.size(); ++i) {
size_t idle_size = 0;
for (auto mb = mem_mng->mem_block_list_[i]->block_all_mem_buf_map_.begin();
mb != mem_mng->mem_block_list_[i]->block_all_mem_buf_map_.end(); ++mb) {
if (mb->second->status_ == kMemBufIdle) {
idle_size += mb->second->size_;
}
}
buf << ", block[" << i << "] idle size " << idle_size;
}
// Dump all the memory buf info
MS_LOG(WARNING) << mem_type << "pool info: block size " << mem_mng->unit_size_ << ", block counts "
<< mem_mng->mem_block_list_.size() << buf.str() << ". Total allocated mem "
<< mem_mng->mps_.total_mem_size_ << ", peak used mem " << mem_mng->mps_.used_mem_peak_size_
<< ", in used mem " << mem_mng->mps_.total_used_mem_size_ << ", total idle mem "
<< mem_mng->mps_.total_mem_size_ - mem_mng->mps_.total_used_mem_size_;
};
fn(common_mem_, std::string(kCommonMem));
fn(persistent_mem_, std::string(kPynativeParamMem));
}
} // namespace device
} // namespace mindspore
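
To make the new configuration path easier to follow, here is a minimal standalone sketch (not part of this commit and not MindSpore code) of how a mempool_block_size given in GB can be clamped to the available device memory and then split between the persistent (PyNative parameter/gradient) unit and the common unit, in the spirit of SetMempoolBlockSize and SetMemAllocUintSize above. The names SplitMempoolBlockSize and UnitSizes, and the guard for block sizes at or below 1 GiB, are my own additions.

#include <algorithm>
#include <cstddef>
#include <iostream>

namespace {
// 1 GiB in bytes, matching kGBToByte (1024 << 20) in the file above.
constexpr size_t kGBToByte = static_cast<size_t>(1024) << 20;
// Default minimum allocation unit (1 GiB), matching DYNAMIC_MEM_ALLOC_UNIT_SIZE.
constexpr size_t kDefaultUnitSize = static_cast<size_t>(1024) << 20;

struct UnitSizes {
  size_t persistent_unit;  // block size for the PyNative persistent pool
  size_t common_unit;      // block size for the common pool
};

// Clamp the configured block size to the device memory, then split it:
// the persistent pool keeps the default unit and the common pool gets the rest.
UnitSizes SplitMempoolBlockSize(float config_gb, size_t device_mem_size) {
  const size_t config_size = static_cast<size_t>(config_gb * kGBToByte);
  const size_t real_block_size = std::min(config_size, device_mem_size);
  if (config_size > device_mem_size) {
    std::cerr << "Configured block size " << config_size << " exceeds available device memory "
              << device_mem_size << "; using " << device_mem_size << " instead.\n";
  }
  UnitSizes units{kDefaultUnitSize, kDefaultUnitSize};
  if (real_block_size > kDefaultUnitSize) {  // guard added here; the real code expects >= 1GB
    units.common_unit = real_block_size - kDefaultUnitSize;
  }
  return units;
}
}  // namespace

int main() {
  // Example: mempool_block_size="2GB" on a device with 30 GiB of free memory.
  const UnitSizes units = SplitMempoolBlockSize(2.0f, static_cast<size_t>(30) * kGBToByte);
  std::cout << "persistent unit: " << units.persistent_unit << " bytes, common unit: " << units.common_unit
            << " bytes\n";
  return 0;
}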

View File

@ -36,7 +36,7 @@ enum DynamicMemBufStatus : int { kMemBufIdle, kMemBufUsed };
static const size_t DYNAMIC_MEM_ALIGN_SIZE = 512;
// The minimum unit size (1G) of memory block used for dynamic extend.
static const size_t DYNAMIC_MEM_ALLOC_UNIT_SIZE = 1024 << 20;
static const size_t DYNAMIC_MEM_ALLOC_UNIT_SIZE = 1073741824;
// The Comparator of device address from small to large.
struct DeviceAddrCmp {
@ -77,16 +77,40 @@ class DynamicMemBlock {
};
using DynamicMemBlockPtr = std::shared_ptr<DynamicMemBlock>;
struct DeviceState {
// Memory allocated from device
size_t total_mem_size_{0};
// Memory in use
size_t total_used_mem_size_{0};
// Maximum peak memory usage
size_t used_mem_peak_size_{0};
};
struct MemStatusManager {
size_t unit_size_{DYNAMIC_MEM_ALLOC_UNIT_SIZE};
// Mempool state
DeviceState mps_;
std::vector<DynamicMemBlockPtr> mem_block_list_;
// The map of all idle memory buf by size.
SizeMapMemBuf idle_mem_buf_map_;
void clear() {
mem_block_list_.clear();
idle_mem_buf_map_.clear();
}
};
using MemStatusManagerPtr = std::shared_ptr<MemStatusManager>;
// The main class of dynamic memory pool.
class DynamicMemPoolBestFit {
public:
DynamicMemPoolBestFit() = default;
DynamicMemPoolBestFit()
: persistent_mem_(std::make_shared<MemStatusManager>()), common_mem_(std::make_shared<MemStatusManager>()) {}
virtual ~DynamicMemPoolBestFit();
// The main program entry of memory alloc.
DeviceMemPtr AllocTensorMem(size_t size);
DeviceMemPtr AllocTensorMem(size_t size, bool from_persistent_mem = false);
// The main program entry of continuous memory alloc.
std::vector<DeviceMemPtr> AllocContinuousTensorMem(size_t total_size, std::vector<size_t> size_list);
std::vector<DeviceMemPtr> AllocContinuousTensorMem(size_t total_size, const std::vector<size_t> &size_list);
// The main program entry of memory free.
void FreeTensorMem(const DeviceMemPtr &device_addr);
@ -94,21 +118,22 @@ class DynamicMemPoolBestFit {
void ReleaseDeviceRes();
// Display the information of memory block and memory buf.
void DumpDynamicMemPoolInfo();
// Get the map of global idle mem buf and size.
SizeMapMemBuf global_idle_mem_buf_map() {
std::lock_guard<std::mutex> locker(mutex_);
return global_idle_mem_buf_map_;
}
// Get the minimum memory unit size used for dynamic extension.
size_t mem_alloc_unit_size() const { return mem_alloc_unit_size_; }
size_t MemAllocUnitSize(bool from_persistent_mem = false) const;
// Set the minimum memory unit size used for dynamic extension.
void set_mem_alloc_unit_size(const size_t &size) { mem_alloc_unit_size_ = size; }
// Get the related memory statistics information.
size_t total_mem_statistics() const { return total_mem_statistics_; }
size_t used_mem_statistics() const { return total_used_mem_statistics_; }
size_t used_mem_peak_statistics() const { return used_mem_peak_statistics_; }
void SetMemAllocUintSize(size_t size);
// Set the memory pool block size in PyNative mode
void SetMempoolBlockSize(size_t device_mem_size);
size_t TotalMemStatistics() const {
return common_mem_->mps_.total_mem_size_ + persistent_mem_->mps_.total_mem_size_;
}
size_t TotalUsedMemStatistics() const {
return common_mem_->mps_.total_used_mem_size_ + persistent_mem_->mps_.total_used_mem_size_;
}
size_t UsedMemPeakStatistics() const {
return common_mem_->mps_.used_mem_peak_size_ + persistent_mem_->mps_.used_mem_peak_size_;
}
// The related interface of device memory real operation, needs override by device type.
virtual size_t AllocDeviceMem(size_t size, DeviceMemPtr *addr) = 0;
@ -116,43 +141,35 @@ class DynamicMemPoolBestFit {
virtual size_t free_mem_size() = 0;
protected:
MemStatusManagerPtr &common_mem() { return common_mem_; }
MemStatusManagerPtr &persistent_mem() { return persistent_mem_; }
// The real size by memory alloc aligned.
virtual size_t AlignMemorySize(size_t size) const;
// Calculate memory block required alloc size when adding the memory block.
virtual size_t CalMemBlockAllocSize(size_t size);
virtual size_t CalMemBlockAllocSize(size_t size, bool from_persistent_mem);
private:
// Find the idle memory buf by aligned size when memory alloc.
DeviceMemPtr FindIdleMemBuf(size_t size);
DeviceMemPtr FindIdleMemBuf(size_t size, bool from_persistent_mem);
// Add the memory block and memory buf when memory alloc not find the idle memory buf.
DeviceMemPtr AddMemBlockAndMemBuf(size_t size);
// Judge whether need divide the memory buf by alloc size and memory buf size.
bool IsDivide(size_t tensor_size, size_t mem_buf_size) const;
// Divide the memory buf by alloc size.
void DivideMemBuf(size_t size, const DynamicMemBufPtr &mem_buf);
DeviceMemPtr AddMemBlockAndMemBuf(size_t size, bool from_persistent_mem);
// Judge whether the memory buf needs to be split, based on the alloc size and memory buf size.
bool IsSplit(size_t tensor_size, size_t mem_buf_size) const;
// Split the memory buf by alloc size.
void SplitMemBuf(size_t size, const DynamicMemBufPtr &mem_buf, const MemStatusManagerPtr &mem_mng);
// Find the memory block by device address.
DynamicMemBlockPtr FindMemBlock(const DeviceMemPtr &device_addr);
DynamicMemBlockPtr FindMemBlock(const DeviceMemPtr &device_addr, const MemStatusManagerPtr &mem_mgr);
// The Comparator of memory block by device address, because memory blocks are arranged in order by device address.
static bool CmpMemBlock(const DeviceMemPtr &device_addr, const DynamicMemBlockPtr &mem_block);
// Combine the memory buf when memory free, to avoid the memory fragmentation.
void CombineMemBuf(const DynamicMemBlockPtr &mem_block, const DeviceMemPtr &device_addr);
void CombineMemBuf(const DynamicMemBlockPtr &mem_block, const DeviceMemPtr &device_addr,
const MemStatusManagerPtr &mem_mng);
// Erase the idle memory buf by size and device address when idle memory buf is combined.
void EraseIdleMemBuf(size_t size, const DeviceMemPtr &device_addr);
// The global memory block list which is arranged in order by base device address of memory block.
std::vector<DynamicMemBlockPtr> global_mem_block_list_;
// The map of all idle memory buf by size.
SizeMapMemBuf global_idle_mem_buf_map_;
// The related memory statistics information.
size_t total_mem_statistics_{0};
size_t total_used_mem_statistics_{0};
size_t used_mem_peak_statistics_{0};
// The minimum memory unit size.
size_t mem_alloc_unit_size_{DYNAMIC_MEM_ALLOC_UNIT_SIZE};
void EraseIdleMemBuf(size_t size, const DeviceMemPtr &device_addr, const MemStatusManagerPtr &mem_mng);
MemStatusManagerPtr persistent_mem_{nullptr};
MemStatusManagerPtr common_mem_{nullptr};
// Support multi-thread.
std::mutex mutex_;
};
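
The header makes the central data-structure choice visible: each MemStatusManager keeps its idle buffers in a multimap keyed by size (SizeMapMemBuf), so a best-fit lookup is a lower_bound followed by an optional split whenever the leftover is at least one alignment unit. The toy sketch below is my own simplification, using plain offsets instead of device pointers and a Buf struct that stands in for DynamicMemBuf; it only illustrates that lookup-and-split step.

#include <cstddef>
#include <iostream>
#include <map>
#include <memory>

namespace {
constexpr size_t kAlignSize = 512;  // mirrors DYNAMIC_MEM_ALIGN_SIZE

struct Buf {
  size_t offset;  // stand-in for the device address
  size_t size;
};
using BufPtr = std::shared_ptr<Buf>;

// Idle buffers keyed by size, as in SizeMapMemBuf.
using IdleMap = std::multimap<size_t, BufPtr>;

// Best-fit: take the smallest idle buffer that is >= size; split off the tail
// if the remainder is at least one alignment unit.
BufPtr FindIdleBuf(IdleMap *idle, size_t size) {
  auto iter = idle->lower_bound(size);
  if (iter == idle->end()) {
    return nullptr;  // caller would allocate a new memory block here
  }
  BufPtr buf = iter->second;
  idle->erase(iter);
  if (buf->size - size >= kAlignSize) {
    auto rest = std::make_shared<Buf>(Buf{buf->offset + size, buf->size - size});
    idle->emplace(rest->size, rest);
    buf->size = size;
  }
  return buf;
}
}  // namespace

int main() {
  IdleMap idle;
  idle.emplace(4096, std::make_shared<Buf>(Buf{0, 4096}));

  // Request 1024 bytes: the 4096-byte buffer is split into 1024 used + 3072 idle.
  if (BufPtr buf = FindIdleBuf(&idle, 1024)) {
    std::cout << "allocated offset " << buf->offset << ", size " << buf->size << "\n";
  }
  std::cout << "remaining idle buffers: " << idle.size() << ", largest idle size: " << idle.rbegin()->first << "\n";
  return 0;
}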

View File

@ -864,7 +864,7 @@ void AscendSession::RunOpImplOrigin(const GraphInfo &graph_info, OpRunInfo *op_r
// malloc mem
RunOpRemoveNopNode(graph);
RunOpMemoryAlloc(*input_tensors, graph.get());
RunOpMemoryAlloc(*input_tensors, graph.get(), op_run_info->is_gradient_out);
RunOpGenKernelEvent(graph.get());
AnfAlgo::CacheAddrForGraph(graph);
// Build dynamic kernel
@ -998,7 +998,7 @@ void AscendSession::BuildOpsInGraph(const GraphId &graph_id, const std::map<AnfN
GetOpInputStubTensors(kernel, parameter_index, graph_inputs, op_output_info, &input_tensor_info);
// Get OpRunInfo and GraphInfo
const GraphInfo &graph_info = GetSingleOpGraphInfo(kernel, input_tensor_info.input_tensors);
OpRunInfo op_run_info = GetSingleOpRunInfo(kernel, graph_info, input_tensor_info);
OpRunInfo op_run_info = GetSingleOpRunInfo(kernel, graph_info, input_tensor_info, nullptr);
if (op_run_info.is_dynamic_shape) {
MS_LOG(INFO) << "BuildOpsInGraph stop, op " << op_run_info.op_name << " is dynamic shape.";
break;
@ -1318,19 +1318,19 @@ void AscendSession::MemoryAlloc(KernelGraph *kernel_graph) const {
MS_LOG(INFO) << "Status record: end memory alloc. graph id: " << kernel_graph->graph_id()
<< ", Memory Statistics:" << device::ascend::AscendMemAdapter::GetInstance().DevMemStatistics();
MS_LOG(INFO) << "The dynamic memory pool total size is "
<< device::ascend::AscendMemoryPool::GetInstance().total_mem_statistics() / kMBToByte
<< device::ascend::AscendMemoryPool::GetInstance().TotalMemStatistics() / kMBToByte
<< "M, total used size is "
<< device::ascend::AscendMemoryPool::GetInstance().used_mem_statistics() / kMBToByte
<< device::ascend::AscendMemoryPool::GetInstance().TotalUsedMemStatistics() / kMBToByte
<< "M, used peak size is "
<< device::ascend::AscendMemoryPool::GetInstance().used_mem_peak_statistics() / kMBToByte << "M.";
<< device::ascend::AscendMemoryPool::GetInstance().UsedMemPeakStatistics() / kMBToByte << "M.";
}
void AscendSession::RunOpMemoryAlloc(const std::vector<tensor::TensorPtr> &input_tensors,
KernelGraph *kernel_graph) const {
void AscendSession::RunOpMemoryAlloc(const std::vector<tensor::TensorPtr> &input_tensors, KernelGraph *kernel_graph,
bool is_gradient_out) const {
MS_EXCEPTION_IF_NULL(kernel_graph);
auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
runtime_instance->RunOpAssignMemory(input_tensors, *kernel_graph);
runtime_instance->RunOpAssignMemory(input_tensors, *kernel_graph, is_gradient_out);
}
void AscendSession::RunOpMemoryAllocNew(const std::vector<tensor::TensorPtr> &input_tensors,
@ -1338,7 +1338,7 @@ void AscendSession::RunOpMemoryAllocNew(const std::vector<tensor::TensorPtr> &in
const KernelGraph &kernel_graph) const {
auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
runtime_instance->RunOpAssignMemory(input_tensors, kernel_graph, tensor_to_node);
runtime_instance->RunOpAssignMemory(input_tensors, kernel_graph, false, tensor_to_node);
}
void AscendSession::RunOpGenKernelEvent(const KernelGraph *graph) const {

View File

@ -103,7 +103,8 @@ class AscendSession : public SessionBasic {
static void BuildKernel(const std::vector<CNodePtr> &kernels);
void BuildDynamicKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const;
void MemoryAlloc(KernelGraph *kernel_graph) const;
void RunOpMemoryAlloc(const std::vector<tensor::TensorPtr> &input_tensors, KernelGraph *kernel_graph) const;
void RunOpMemoryAlloc(const std::vector<tensor::TensorPtr> &input_tensors, KernelGraph *kernel_graph,
bool is_gradient_out) const;
void RunOpMemoryAllocNew(const std::vector<tensor::TensorPtr> &input_tensors,
const std::map<tensor::TensorPtr, session::KernelWithIndex> &tensor_to_node,
const KernelGraph &kernel_graph) const;

View File

@ -255,11 +255,11 @@ void GPUSession::AllocateMemory(const KernelGraph *kernel_graph) const {
}
void GPUSession::RunOpAllocateMemory(const std::vector<tensor::TensorPtr> &input_tensors,
const KernelGraph *kernel_graph) const {
const KernelGraph *kernel_graph, bool is_gradient_out) const {
MS_EXCEPTION_IF_NULL(kernel_graph);
auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
runtime_instance->RunOpAssignMemory(input_tensors, *kernel_graph);
runtime_instance->RunOpAssignMemory(input_tensors, *kernel_graph, is_gradient_out);
}
void GPUSession::RunOpGenKernelEvent(const KernelGraph *graph) const {
@ -701,7 +701,7 @@ void GPUSession::RunOpImpl(const GraphInfo &graph_info, OpRunInfo *op_run_info,
// run op
MS_EXCEPTION_IF_NULL(kernel_graph);
RunOpRemoveNopNode(kernel_graph);
RunOpAllocateMemory(*input_tensors, kernel_graph.get());
RunOpAllocateMemory(*input_tensors, kernel_graph.get(), op_run_info->is_gradient_out);
RunOpGenKernelEvent(kernel_graph.get());
// Execute the computation
LoadInputData(kernel_graph, *input_tensors);

View File

@ -82,7 +82,8 @@ class GPUSession : public SessionBasic {
void AllocateMemory(const KernelGraph *kernel_graph) const;
void RunOpAllocateMemory(const std::vector<tensor::TensorPtr> &input_tensors, const KernelGraph *kernel_graph) const;
void RunOpAllocateMemory(const std::vector<tensor::TensorPtr> &input_tensors, const KernelGraph *kernel_graph,
bool is_gradient_out) const;
void RunOpClearMemory(const KernelGraph *kernel_graph) const;

View File

@ -54,6 +54,7 @@
#endif
#include "backend/session/session_factory.h"
#include "backend/session/pynative_task_manager.h"
#include "pipeline/pynative/pynative_execute.h"
namespace mindspore {
namespace session {
@ -1220,7 +1221,8 @@ GraphInfo SessionBasic::GetSingleOpGraphInfo(const CNodePtr &kernel,
}
OpRunInfo SessionBasic::GetSingleOpRunInfo(const CNodePtr &cnode, const GraphInfo &graph_info,
const InputTensorInfo &tensor_info) {
const InputTensorInfo &tensor_info,
GraphOutputInfo *const graph_output_info) {
MS_EXCEPTION_IF_NULL(cnode);
auto primitive = AnfAlgo::GetCNodePrimitive(cnode);
const auto &abstract = cnode->abstract();
@ -1230,7 +1232,14 @@ OpRunInfo SessionBasic::GetSingleOpRunInfo(const CNodePtr &cnode, const GraphInf
const auto &shape = abstract->BuildShape();
MS_EXCEPTION_IF_NULL(shape);
OpRunInfo op_run_info = {.op_name = primitive->name(),
bool is_gradient_out =
graph_output_info != nullptr &&
std::any_of(graph_output_info->output_indexes.begin(), graph_output_info->output_indexes.end(),
[cnode](const std::pair<KernelWithIndex, std::vector<std::vector<size_t>>> &output_index) {
return output_index.first.first == cnode;
});
OpRunInfo op_run_info = {.is_gradient_out = is_gradient_out,
.op_name = primitive->name(),
.primitive = primitive.get(),
.abstract = abstract,
.is_dynamic_shape = shape->IsDynamic(),
@ -1304,16 +1313,63 @@ void SessionBasic::GetRefCount(const KernelGraph *graph, std::map<KernelWithInde
}
}
void SessionBasic::GetForwardOutputRefCount(const KernelGraph *graph,
std::map<AnfNodePtr, size_t> *forward_output_refcount) {
if (!pynative::PynativeExecutor::GetInstance()->grad_flag()) {
return;
}
const auto &forward_value_nodes_id = pynative::PynativeExecutor::GetInstance()->grad_executor()->forward_outputs_id();
for (const auto &kernel : graph->execution_order()) {
for (size_t i = 1; i < kernel->inputs().size(); i += 1) {
const auto &input = kernel->input(i);
if (!input->isa<ValueNode>()) {
continue;
}
auto value_node = input->cast<ValueNodePtr>();
auto value = GetValueNode(value_node);
MS_EXCEPTION_IF_NULL(value);
if (value->isa<tensor::Tensor>()) {
auto tensor = value->cast<tensor::TensorPtr>();
MS_EXCEPTION_IF_NULL(tensor);
if (forward_value_nodes_id.find(tensor->id()) != forward_value_nodes_id.end()) {
(*forward_output_refcount)[input] += 1;
}
}
}
}
}
void SessionBasic::HandleOpInputs(const std::set<KernelWithIndex> &input_kernel,
std::map<KernelWithIndex, size_t> *ref_count,
std::map<AnfNodePtr, size_t> *forward_output_refcount,
std::map<KernelWithIndex, tensor::TensorPtr> *op_output_map) {
MS_EXCEPTION_IF_NULL(ref_count);
MS_EXCEPTION_IF_NULL(forward_output_refcount);
MS_EXCEPTION_IF_NULL(op_output_map);
for (auto &kernel_with_index : input_kernel) {
MS_EXCEPTION_IF_NULL(kernel_with_index.first);
if (!kernel_with_index.first->isa<CNode>()) {
continue;
}
// Release forward output
auto cnode = kernel_with_index.first->cast<CNodePtr>();
for (size_t i = 1; i < cnode->inputs().size(); i += 1) {
const auto &input = cnode->input(i);
auto it = forward_output_refcount->find(input);
if (it != forward_output_refcount->end()) {
if (--(it->second) == 0) {
auto value_node = input->cast<ValueNodePtr>();
auto value = GetValueNode(value_node);
MS_EXCEPTION_IF_NULL(value);
auto tensor = value->cast<tensor::TensorPtr>();
MS_EXCEPTION_IF_NULL(tensor);
tensor->set_device_address(nullptr);
forward_output_refcount->erase(it);
}
}
}
// Release previous output
auto ref_iter = ref_count->find(kernel_with_index);
if (ref_iter == ref_count->end()) {
MS_LOG(EXCEPTION) << "Can not find input KernelWithIndex in cnode reference count map, input cnode = "
@ -2329,7 +2385,9 @@ void SessionBasic::RunOpsInGraphImpl(const GraphId &graph_id, const std::vector<
graph_output_info.graph_outputs = outputs;
CreateOutputPlaceholder(kernel_graph, inputs, graph_output_info.graph_outputs, &graph_output_info.output_indexes);
std::map<KernelWithIndex, size_t> cnode_refcount;
std::map<AnfNodePtr, size_t> forward_output_refcount;
GetRefCount(kernel_graph.get(), &cnode_refcount);
GetForwardOutputRefCount(kernel_graph.get(), &forward_output_refcount);
BuildOpsInGraph(graph_id, parameter_index, inputs, cnode_refcount);
// Clear bucket resources every step
@ -2346,14 +2404,14 @@ void SessionBasic::RunOpsInGraphImpl(const GraphId &graph_id, const std::vector<
VectorRef op_outputs;
// Get OpRunInfo and GraphInfo
GraphInfo graph_info = GetSingleOpGraphInfo(kernel, input_tensor_info.input_tensors);
OpRunInfo run_info = GetSingleOpRunInfo(kernel, graph_info, input_tensor_info);
OpRunInfo run_info = GetSingleOpRunInfo(kernel, graph_info, input_tensor_info, &graph_output_info);
// Build and run current single op
RunOpImplOrigin(graph_info, &run_info, &input_tensor_info.input_tensors, &op_outputs,
input_tensor_info.input_tensors_mask);
graph_output_info.graph_output_tensors.clear();
// Handle inputs and outputs of current op
HandleOpInputs(input_tensor_info.input_kernel, &cnode_refcount, &op_output_map);
HandleOpInputs(input_tensor_info.input_kernel, &cnode_refcount, &forward_output_refcount, &op_output_map);
HandleOpOutputs(kernel, op_outputs, cnode_refcount, &op_output_map, &graph_output_info);
// Save grad node to Bucket
if (kernel_graph->is_bprop()) {
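
The session changes above amount to a small reference-counting scheme for forward outputs reused in the backward graph: GetForwardOutputRefCount counts how many ops consume each forward-output value node, and HandleOpInputs decrements that count after every executed op and drops the tensor's device address once it reaches zero. Below is a minimal sketch of that idea; the string ids, FakeTensor, CountUses and ReleaseAfterOp are stand-ins of my own, not the session API.

#include <iostream>
#include <map>
#include <string>
#include <vector>

namespace {
// Stand-in for a forward-output tensor whose device memory can be released.
struct FakeTensor {
  bool has_device_address = true;
};

// Count how many ops consume each forward output (the GetForwardOutputRefCount part).
std::map<std::string, size_t> CountUses(const std::vector<std::vector<std::string>> &op_inputs) {
  std::map<std::string, size_t> refcount;
  for (const auto &inputs : op_inputs) {
    for (const auto &id : inputs) {
      ++refcount[id];
    }
  }
  return refcount;
}

// After an op runs, decrement its inputs' counts and free tensors that are no
// longer needed (the HandleOpInputs part).
void ReleaseAfterOp(const std::vector<std::string> &inputs, std::map<std::string, size_t> *refcount,
                    std::map<std::string, FakeTensor> *tensors) {
  for (const auto &id : inputs) {
    auto it = refcount->find(id);
    if (it != refcount->end() && --(it->second) == 0) {
      (*tensors)[id].has_device_address = false;  // like tensor->set_device_address(nullptr)
      refcount->erase(it);
      std::cout << "released device memory of " << id << "\n";
    }
  }
}
}  // namespace

int main() {
  // Two ops both read forward output "fwd0"; only the second use releases it.
  std::vector<std::vector<std::string>> op_inputs = {{"fwd0"}, {"fwd0", "fwd1"}};
  auto refcount = CountUses(op_inputs);
  std::map<std::string, FakeTensor> tensors = {{"fwd0", {}}, {"fwd1", {}}};

  ReleaseAfterOp(op_inputs[0], &refcount, &tensors);  // fwd0 still referenced
  ReleaseAfterOp(op_inputs[1], &refcount, &tensors);  // releases fwd0 and fwd1
  return 0;
}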

View File

@ -57,6 +57,7 @@ using AnyList = std::vector<Any>;
using AnyListPtr = std::shared_ptr<AnyList>;
struct OpRunInfo {
bool is_gradient_out = false;
std::string op_name;
Primitive *primitive;
AbstractBasePtr abstract;
@ -193,7 +194,9 @@ class SessionBasic : public std::enable_shared_from_this<SessionBasic> {
VectorRef *const outputs,
std::map<KernelWithIndex, std::vector<std::vector<size_t>>> *output_indexes);
void GetRefCount(const KernelGraph *graph, std::map<KernelWithIndex, size_t> *ref_count);
void GetForwardOutputRefCount(const KernelGraph *graph, std::map<AnfNodePtr, size_t> *forward_output_refcount);
void HandleOpInputs(const std::set<KernelWithIndex> &input_kernel, std::map<KernelWithIndex, size_t> *ref_count,
std::map<AnfNodePtr, size_t> *forward_output_refcount,
std::map<KernelWithIndex, tensor::TensorPtr> *op_output_map);
void HandleOpOutputs(const AnfNodePtr &kernel, const VectorRef &op_outputs,
@ -280,7 +283,8 @@ class SessionBasic : public std::enable_shared_from_this<SessionBasic> {
CNodePtr ConstructOutput(const AnfNodePtrList &outputs, const std::shared_ptr<KernelGraph> &graph);
// Generate graph info for a single op graph
GraphInfo GetSingleOpGraphInfo(const CNodePtr &kernel, const std::vector<tensor::TensorPtr> &input_tensors);
OpRunInfo GetSingleOpRunInfo(const CNodePtr &cnode, const GraphInfo &graph_info, const InputTensorInfo &tensor_info);
OpRunInfo GetSingleOpRunInfo(const CNodePtr &cnode, const GraphInfo &graph_info, const InputTensorInfo &tensor_info,
GraphOutputInfo *const graph_output_info);
tensor::TensorPtr GetValueNodeOutputTensor(const AnfNodePtr &node, size_t output_index);
tensor::TensorPtr GetParameterOutputTensor(const AnfNodePtr &node,
const std::map<AnfNodePtr, size_t> &parameter_index,

View File

@ -50,6 +50,7 @@ enum PynativeStatusCode {
enum RunOpArgsEnum { PY_PRIM = 0, PY_NAME, PY_INPUTS, PY_ARGS_NUM };
struct OpExecInfo {
bool is_nop_prim = false;
bool is_dynamic_shape = false;
bool is_mixed_precision_cast = false;
size_t next_input_index = 0;

View File

@ -880,6 +880,37 @@ py::object GetDstType(const TypeId &type_id) {
bool IsPyObjTypeInvalid(const py::object &obj) {
return !py::isinstance<tensor::Tensor>(obj) && !py::isinstance<py::int_>(obj) && !py::isinstance<py::float_>(obj);
}
inline bool IsNopPrim(const std::string &op_name) {
static std::set<std::string> nop_prim = {prim::kPrimReshape->name(), kExpandDimsOpName, prim::kPrimSqueeze->name(),
prim::kPrimFlatten->name(), kFlattenGradOpName, prim::kPrimReformat->name()};
return nop_prim.find(op_name) != nop_prim.end();
}
// Shallow Copy Value and change shape
ValuePtr ShallowCopyValue(const OpExecInfoPtr &op_exec_info, const ValuePtr &value) {
MS_EXCEPTION_IF_NULL(op_exec_info);
MS_EXCEPTION_IF_NULL(value);
auto tensor_abs = op_exec_info->abstract;
if (tensor_abs->isa<abstract::AbstractRef>()) {
tensor_abs = tensor_abs->cast<abstract::AbstractRefPtr>()->CloneAsTensor();
}
auto new_shape = tensor_abs->BuildShape()->cast<abstract::ShapePtr>();
MS_EXCEPTION_IF_NULL(new_shape);
if (value->isa<mindspore::tensor::Tensor>()) {
auto tensor_value = value->cast<mindspore::tensor::TensorPtr>();
return std::make_shared<mindspore::tensor::Tensor>(tensor_value->data_type(), new_shape->shape(),
tensor_value->data_c(), tensor_value->Size());
} else if (value->isa<ValueTuple>()) {
std::vector<ValuePtr> values;
auto value_tuple = value->cast<ValueTuplePtr>();
(void)std::transform(value_tuple->value().begin(), value_tuple->value().end(), std::back_inserter(values),
[op_exec_info](const ValuePtr &elem) { return ShallowCopyValue(op_exec_info, elem); });
return std::make_shared<ValueTuple>(values);
} else {
return value;
}
}
} // namespace
py::object RealRunOp(const py::args &args) {
@ -957,6 +988,7 @@ void TopCellInfo::Clear() {
k_pynative_cell_ptr_ = nullptr;
graph_info_map_.clear();
sub_cell_list_.clear();
outputs_id_.clear();
op_info_with_tensor_id_.clear();
tensor_id_with_tensor_object_.clear();
op_info_with_ms_func_forward_tensors_.clear();
@ -992,6 +1024,7 @@ OpExecInfoPtr ForwardExecutor::GenerateOpExecInfo(const py::args &args) {
const auto &op_exec_info = std::make_shared<OpExecInfo>();
const auto &op_name = py::cast<std::string>(args[PY_NAME]);
op_exec_info->op_name = op_name;
op_exec_info->is_nop_prim = false; // IsNopPrim(op_exec_info->op_name);
const auto &adapter = py::cast<PrimitivePyAdapterPtr>(args[PY_PRIM]);
MS_EXCEPTION_IF_NULL(adapter);
@ -1013,7 +1046,7 @@ OpExecInfoPtr ForwardExecutor::GenerateOpExecInfo(const py::args &args) {
void ForwardExecutor::SetCastForInputs(const OpExecInfoPtr &op_exec_info) {
MS_EXCEPTION_IF_NULL(op_exec_info);
// No need cast self
if (op_exec_info->op_name == prim::kPrimCast->name()) {
if (op_exec_info->op_name == prim::kPrimCast->name() || op_exec_info->is_nop_prim) {
return;
}
@ -1167,6 +1200,21 @@ void ForwardExecutor::GetOpOutputAbstract(const OpExecInfoPtr &op_exec_info,
}
}
void ForwardExecutor::DoNopOutput(const OpExecInfoPtr &op_exec_info, ValuePtr *out_real_value) {
MS_EXCEPTION_IF_NULL(op_exec_info);
// Get First input
if (op_exec_info->op_inputs.empty()) {
MS_LOG(EXCEPTION) << "Inputs of " << op_exec_info->op_name << " is empty";
}
const auto &obj = op_exec_info->op_inputs[0];
if (!py::isinstance<tensor::Tensor>(obj)) {
MS_LOG(EXCEPTION) << "First input of " << op_exec_info->op_name << " must be a tensor";
}
const auto &tensor_ptr = py::cast<tensor::TensorPtr>(obj);
*out_real_value = ShallowCopyValue(op_exec_info, tensor_ptr);
MS_LOG(DEBUG) << "New copy value is " << (*out_real_value)->ToString();
}
void ForwardExecutor::GetOpOutput(const OpExecInfoPtr &op_exec_info,
const abstract::AbstractBasePtrList &args_spec_list, const CNodePtr &cnode,
bool prim_cache_hit, py::object *ret) {
@ -1194,25 +1242,32 @@ void ForwardExecutor::GetOpOutput(const OpExecInfoPtr &op_exec_info,
out[args_spec_list].abs = op_exec_info->abstract;
out[args_spec_list].attrs = prim->evaluate_added_attrs();
}
// run op with selected backend
auto result = RunOpWithInitBackendPolicy(op_exec_info);
py::object out_real = result;
if (result.size() == 1 && op_exec_info->abstract != nullptr &&
!op_exec_info->abstract->isa<abstract::AbstractSequence>()) {
out_real = result[0];
}
// get output value
// Run op with the selected backend; a nop prim does not need to run on the backend
ValuePtr out_real_value = nullptr;
if (grad()->grad_flag()) {
out_real_value = PyObjToValue(out_real);
if (op_exec_info->is_nop_prim) {
DoNopOutput(op_exec_info, &out_real_value);
*ret = BaseRefToPyData(out_real_value);
} else {
auto result = RunOpWithInitBackendPolicy(op_exec_info);
py::object out_real = result;
if (result.size() == 1 && op_exec_info->abstract != nullptr &&
!op_exec_info->abstract->isa<abstract::AbstractSequence>()) {
out_real = result[0];
}
// get output value
if (grad()->grad_flag()) {
out_real_value = PyObjToValue(out_real);
}
*ret = out_real;
}
// Save cnode info and build grad graph
if (grad()->need_construct_graph() && !grad()->in_cell_with_custom_bprop_()) {
MS_EXCEPTION_IF_NULL(cnode);
const auto &obj_id = GetId(out_real);
const auto &obj_id = GetId(*ret);
cnode->set_abstract(op_exec_info->abstract);
node_abs_map_[obj_id] = op_exec_info->abstract;
grad()->SaveOutputNodeMap(obj_id, out_real, cnode);
grad()->SaveOutputNodeMap(obj_id, *ret, cnode);
grad()->DoOpGrad(op_exec_info, cnode, out_real_value);
} else {
node_abs_map_.clear();
@ -1220,7 +1275,6 @@ void ForwardExecutor::GetOpOutput(const OpExecInfoPtr &op_exec_info,
// Record op info for judge whether the construct of cell has been changed
grad()->RecordGradOpInfo(op_exec_info, out_real_value);
grad()->UpdateForwardTensorInfoInBpropGraph(op_exec_info, out_real_value);
*ret = out_real;
}
py::object ForwardExecutor::DoAutoCast(const py::object &arg, const TypeId &type_id, const std::string &op_name,
@ -1701,7 +1755,7 @@ void GradExecutor::SaveOutputNodeMap(const std::string &obj_id, const py::object
return;
}
MS_EXCEPTION_IF_NULL(cnode);
MS_LOG(DEBUG) << "Cnode is " << cnode->DebugString() << " id " << obj_id;
MS_LOG(DEBUG) << "Cnode is " << cnode->DebugString() << ", out value id " << obj_id;
if (py::isinstance<py::tuple>(out_real)) {
auto value = py::cast<py::tuple>(out_real);
auto size = static_cast<int64_t>(value.size());
@ -1925,6 +1979,7 @@ void GradExecutor::SaveForwardTensorInfoInBpropGraph(const pipeline::ResourcePtr
continue;
}
tensor_id_with_tensor_object[tensor->id()].emplace_back(tensor);
top_cell()->outputs_id().insert(tensor->id());
MS_LOG(DEBUG) << "Save forward tensor " << tensor.get() << " id " << tensor->id()
<< " device address: " << tensor->device_address() << " shape and dtype "
<< tensor->GetShapeAndDataTypeInfo();
@ -2090,7 +2145,8 @@ py::object ForwardExecutor::RunOpInMs(const OpExecInfoPtr &op_exec_info, Pynativ
// get graph info for checking whether it already exists in the cache
GetSingleOpGraphInfo(op_exec_info, input_tensors, tensors_mask, &graph_info);
#if defined(__APPLE__)
session::OpRunInfo op_run_info = {op_exec_info->op_name,
session::OpRunInfo op_run_info = {false,
op_exec_info->op_name,
op_exec_info->py_primitive.get(),
op_exec_info->abstract,
op_exec_info->is_dynamic_shape,
@ -2102,7 +2158,8 @@ py::object ForwardExecutor::RunOpInMs(const OpExecInfoPtr &op_exec_info, Pynativ
tensors_mask,
input_tensors};
#else
session::OpRunInfo op_run_info = {op_exec_info->op_name,
session::OpRunInfo op_run_info = {false,
op_exec_info->op_name,
op_exec_info->py_primitive.get(),
op_exec_info->abstract,
op_exec_info->is_dynamic_shape,

View File

@ -100,6 +100,7 @@ class TopCellInfo {
const std::string &grad_operation() const { return grad_operation_; }
void set_grad_operation(const std::string &grad_operation) { grad_operation_ = grad_operation; }
mindspore::HashSet<std::string> &sub_cell_list() { return sub_cell_list_; }
std::set<std::string> &outputs_id() { return outputs_id_; }
bool IsSubCell(const std::string &cell_id) const;
OrderedMap<FuncGraphPtr, GraphInfoPtr> &graph_info_map() { return graph_info_map_; }
OpInfoWithTensorId &op_info_with_tensor_id() { return op_info_with_tensor_id_; }
@ -139,6 +140,8 @@ class TopCellInfo {
std::string grad_operation_;
OrderedMap<FuncGraphPtr, GraphInfoPtr> graph_info_map_;
mindspore::HashSet<std::string> sub_cell_list_;
// Record op output tensor id
std::set<std::string> outputs_id_;
OpInfoWithTensorId op_info_with_tensor_id_;
TensorIdWithTensorObject tensor_id_with_tensor_object_;
OpInfoWithMsFuncForwardTensors op_info_with_ms_func_forward_tensors_;
@ -195,6 +198,7 @@ class GradExecutor {
void set_grad_flag(bool flag) { grad_flag_ = flag; }
void set_graph_phase(const std::string &graph_phase) { graph_phase_ = graph_phase; }
bool in_cell_with_custom_bprop_() const { return custom_bprop_cell_count_ > 0; }
std::set<std::string> &forward_outputs_id() const { return top_cell()->outputs_id(); }
AnfNodePtr GetInput(const py::object &obj, bool op_mask);
std::string GetCellId(const py::object &obj, const py::args &args);
void RecordGradOpInfo(const OpExecInfoPtr &op_exec_info, const ValuePtr &op_out);
@ -347,6 +351,7 @@ class ForwardExecutor {
bool *prim_cache_hit);
void GetOpOutput(const OpExecInfoPtr &op_exec_info, const abstract::AbstractBasePtrList &args_spec_list,
const CNodePtr &cnode, bool prim_cache_hit, py::object *ret);
void DoNopOutput(const OpExecInfoPtr &op_exec_info, ValuePtr *out_real_value);
// Mix precision and Implicit transform
void SetCastForInputs(const OpExecInfoPtr &op_exec_info);
void SetTensorMixPrecisionCast(const OpExecInfoPtr &op_exec_info);

View File

@ -84,6 +84,7 @@ REGISTER_PYBIND_DEFINE(MsContextPy, ([](const py::module *m) {
.value("save_graphs", MsCtxParam::MS_CTX_SAVE_GRAPHS_FLAG)
.value("enable_parallel_split", MsCtxParam::MS_CTX_ENABLE_PARALLEL_SPLIT)
.value("max_device_memory", MsCtxParam::MS_CTX_MAX_DEVICE_MEMORY)
.value("mempool_block_size", MsCtxParam::MS_CTX_MEMPOOL_BLOCK_SIZE)
.value("mode", MsCtxParam::MS_CTX_EXECUTION_MODE)
.value("device_target", MsCtxParam::MS_CTX_DEVICE_TARGET)
.value("_graph_memory_max_size", MsCtxParam::MS_CTX_GRAPH_MEMORY_MAX_SIZE)

View File

@ -77,6 +77,7 @@ bool AscendMemAdapter::Initialize() {
static_mem_offset_ = ms_used_hbm_size_;
cur_dynamic_mem_offset_ = 0;
max_dynamic_mem_offset_ = 0;
AscendMemoryPool::GetInstance().SetMempoolBlockSize(ms_used_hbm_size_);
MS_LOG(INFO) << "Ascend Memory Adapter initialize success, Memory Statistics:" << DevMemStatistics();
initialized_ = true;
return true;
@ -109,7 +110,7 @@ bool AscendMemAdapter::DeInitialize() {
return ret;
}
uint8_t *AscendMemAdapter::MallocStaticDevMem(size_t size, std::string tag) {
uint8_t *AscendMemAdapter::MallocStaticDevMem(size_t size, const std::string &tag) {
std::lock_guard<std::mutex> locker(mutex_);
auto new_static_offset = static_mem_offset_ - size;
if (new_static_offset < max_dynamic_mem_offset_) {
@ -122,11 +123,10 @@ uint8_t *AscendMemAdapter::MallocStaticDevMem(size_t size, std::string tag) {
auto memory_block_ptr = device_mem_base_addr_ + new_static_offset;
static_mem_offset_ = new_static_offset;
static_memory_block_list_.push_back(std::make_shared<MemoryBlock>(memory_block_ptr, size, tag));
return memory_block_ptr;
}
uint8_t *AscendMemAdapter::MallocDynamicDevMem(size_t size, std::string tag) {
uint8_t *AscendMemAdapter::MallocDynamicDevMem(size_t size, const std::string &tag) {
std::lock_guard<std::mutex> locker(mutex_);
auto new_dynamic_offset = cur_dynamic_mem_offset_ + size;
if (new_dynamic_offset > static_mem_offset_) {

View File

@ -22,6 +22,7 @@
#include <memory>
#include <vector>
#include "utils/ms_context.h"
#include "runtime/device/ascend/ascend_memory_pool.h"
namespace mindspore {
namespace device {
@ -37,8 +38,8 @@ class AscendMemAdapter {
bool Initialize();
bool DeInitialize();
uint8_t *MallocStaticDevMem(size_t size, std::string tag = "");
uint8_t *MallocDynamicDevMem(size_t size, std::string tag = "");
uint8_t *MallocStaticDevMem(size_t size, const std::string &tag = "");
uint8_t *MallocDynamicDevMem(size_t size, const std::string &tag = "");
bool FreeStaticDevMem(void *devPtr) { return true; }
void ResetDynamicMemory();
@ -76,12 +77,13 @@ class AscendMemAdapter {
uint8_t *device_mem_base_addr_{nullptr};
uint64_t ms_used_hbm_size_{0};
// dynamic memory info
// dynamic memory info, from a low address to a high address
uint64_t cur_dynamic_mem_offset_{0};
// Maximum dynamic memory already allocated; dynamically updated
uint64_t max_dynamic_mem_offset_{0};
std::vector<std::shared_ptr<MemoryBlock>> dynamic_memory_block_list_;
// static memory info
// static memory info, from a high address to a low address
uint64_t static_mem_offset_{0};
std::vector<std::shared_ptr<MemoryBlock>> static_memory_block_list_;
};

View File

@ -15,7 +15,6 @@
*/
#include <string>
#include "runtime/device/ascend/ascend_memory_manager.h"
#include "runtime/device/ascend/ascend_memory_pool.h"
#include "runtime/device/ascend/ascend_memory_adapter.h"
#include "utils/ms_context.h"
#include "runtime/mem.h"
@ -49,9 +48,9 @@ void *AscendMemoryManager::MallocDevice(size_t size) {
return AscendMemoryPool::GetInstance().AllocTensorMem(align_size);
}
void *AscendMemoryManager::MallocMemFromMemPool(size_t size) {
void *AscendMemoryManager::MallocMemFromMemPool(size_t size, bool from_persistent_mem) {
auto align_size = GetCommonAlignSize(size);
const auto device_addr = AscendMemoryPool::GetInstance().AllocTensorMem(align_size);
const auto device_addr = AscendMemoryPool::GetInstance().AllocTensorMem(align_size, from_persistent_mem);
if (device_addr == nullptr) {
MS_LOG(EXCEPTION) << "Fail to alloc memory, size: " << align_size
<< ", memory statistics:" << AscendMemAdapter::GetInstance().DevMemStatistics();
@ -132,8 +131,8 @@ uint8_t *AscendMemoryManager::MallocCommunicationMemFromMemPool(size_t size) {
size_t AscendMemoryManager::GetAvailableMemSize() {
auto available_mem_size = AscendMemoryPool::GetInstance().free_mem_size() +
AscendMemoryPool::GetInstance().total_mem_statistics() -
AscendMemoryPool::GetInstance().used_mem_statistics();
AscendMemoryPool::GetInstance().TotalMemStatistics() -
AscendMemoryPool::GetInstance().TotalUsedMemStatistics();
return available_mem_size;
}

View File

@ -32,7 +32,7 @@ class AscendMemoryManager : public MemoryManager {
void FreeDeviceMemory() override;
void ResetDynamicMemory() override;
void ClearGlobalIdleMem() override;
void *MallocMemFromMemPool(size_t size) override;
void *MallocMemFromMemPool(size_t size, bool from_persistent_mem) override;
void *MallocDevice(size_t size) override;
void FreeMemFromMemPool(void *device_ptr) override;
uint64_t GetMsMaxMemSize();

View File

@ -24,37 +24,37 @@
namespace mindspore {
namespace device {
namespace ascend {
// The minimum unit size (256MB) of memory block used for dynamic extend.
static const size_t ASCEND_DYNAMIC_MEM_ALLOC_UNIT_SIZE = 256 << 20;
// The minimum unit size (8MB) of memory block used for dynamic extend in graph mode.
static const size_t ASCEND_DYNAMIC_MEM_ALLOC_UNIT_SIZE_FOR_GRAPH = 8 << 20;
size_t AscendMemoryPool::CalMemBlockAllocSize(size_t size) {
size_t AscendMemoryPool::CalMemBlockAllocSize(size_t size, bool from_persistent_mem) {
auto device_free_mem_size = free_mem_size();
if (device_free_mem_size < size) {
MS_LOG(WARNING) << "The dynamic memory pool total size is "
<< device::ascend::AscendMemoryPool::GetInstance().total_mem_statistics() / kMBToByte
<< device::ascend::AscendMemoryPool::GetInstance().TotalMemStatistics() / kMBToByte
<< "M, total used size is "
<< device::ascend::AscendMemoryPool::GetInstance().used_mem_statistics() / kMBToByte
<< device::ascend::AscendMemoryPool::GetInstance().TotalUsedMemStatistics() / kMBToByte
<< "M, used peak size is "
<< device::ascend::AscendMemoryPool::GetInstance().used_mem_peak_statistics() / kMBToByte << "M.";
MS_LOG(WARNING) << "Out of Memory. Request memory size: " << size
<< device::ascend::AscendMemoryPool::GetInstance().UsedMemPeakStatistics() / kMBToByte << "M.";
MS_LOG(WARNING) << "Out of Memory. Request memory size: " << size << ", device free size " << device_free_mem_size
<< ", Memory Statistic:" << AscendMemAdapter::GetInstance().DevMemStatistics()
<< "Please try to reduce 'batch_size' or check whether exists extra large shape. More "
"details can be found in MindSpore's FAQ with keyword 'Out of Memory'.";
return 0;
}
auto alloc_mem_size = ASCEND_DYNAMIC_MEM_ALLOC_UNIT_SIZE;
size_t alloc_mem_size = MemAllocUnitSize(from_persistent_mem);
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
const bool pynative_mode = (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode);
if (pynative_mode) {
// Grow by doubling the alloc size until it covers the request
MS_LOG(DEBUG) << "Get unit block size " << alloc_mem_size;
constexpr size_t kDouble = 2;
while (alloc_mem_size < size) {
alloc_mem_size = alloc_mem_size * kDouble;
}
} else {
// Graph mode controls its memory growth independently
alloc_mem_size = ASCEND_DYNAMIC_MEM_ALLOC_UNIT_SIZE_FOR_GRAPH;
while (alloc_mem_size < size) {
alloc_mem_size = alloc_mem_size + ASCEND_DYNAMIC_MEM_ALLOC_UNIT_SIZE_FOR_GRAPH;
@ -69,7 +69,6 @@ size_t AscendMemoryPool::AllocDeviceMem(size_t size, DeviceMemPtr *addr) {
if (size == 0) {
MS_LOG(EXCEPTION) << "Failed to alloc memory pool resource, the size is zero!";
}
*addr = AscendMemAdapter::GetInstance().MallocStaticDevMem(size);
if (*addr == nullptr) {
MS_LOG(EXCEPTION) << "Alloc device memory pool address is nullptr, failed to alloc memory pool resource!";
@ -83,11 +82,17 @@ bool AscendMemoryPool::FreeDeviceMem(const DeviceMemPtr &addr) {
}
void AscendMemoryPool::ResetIdleMemBuf() {
auto idle_mem_buf_map = DynamicMemPoolBestFit::global_idle_mem_buf_map();
for (auto &it : idle_mem_buf_map) {
MS_EXCEPTION_IF_NULL(it.second);
(void)rtMemset(it.second->device_addr_, it.first, 0, it.first);
}
auto fn = [this](const MemStatusManagerPtr &mem_mng) {
if (mem_mng->mem_block_list_.empty()) {
return;
}
for (auto &it : mem_mng->idle_mem_buf_map_) {
MS_EXCEPTION_IF_NULL(it.second);
(void)rtMemset(it.second->device_addr_, it.first, 0, it.first);
}
};
fn(persistent_mem());
fn(common_mem());
}
size_t AscendMemoryPool::free_mem_size() { return AscendMemAdapter::GetInstance().FreeDevMemSize(); }
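As a rough sketch of the growth policy in CalMemBlockAllocSize above (not the MindSpore source; the 1GB PyNative unit is an assumption matching the mempool_block_size default), PyNative mode doubles the unit until the request fits, while graph mode grows linearly in 8MB steps:

GB = 1024 ** 3
MB = 1024 ** 2

def pynative_block_size(request, unit=1 * GB):
    # Double the unit size until it covers the requested size.
    size = unit
    while size < request:
        size *= 2
    return size

def graph_block_size(request, unit=8 * MB):
    # Grow one 8MB unit at a time in graph mode.
    size = unit
    while size < request:
        size += unit
    return size

print(pynative_block_size(3 * GB) // GB)  # 4  (1GB -> 2GB -> 4GB)
print(graph_block_size(20 * MB) // MB)    # 24 (8MB -> 16MB -> 24MB)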

View File

@ -42,7 +42,7 @@ class AscendMemoryPool : public DynamicMemPoolBestFit {
protected:
// Calculate memory block required alloc size when adding the memory block.
size_t CalMemBlockAllocSize(size_t size) override;
size_t CalMemBlockAllocSize(size_t size, bool from_persistent_mem) override;
private:
AscendMemoryPool() = default;

View File

@ -112,6 +112,7 @@ void CPUKernelRuntime::AssignValueNodeAddress(session::KernelGraph *kernel_graph
size_t tensor_size = std::accumulate(data_shape.begin(), data_shape.end(), type_size, std::multiplies<size_t>());
DeviceAddressPtr address = nullptr;
address = CreateDeviceAddress(nullptr, tensor_size, kOpFormat_DEFAULT, output_type_id);
address->set_from_persistent_mem(tensor->is_parameter());
MS_EXCEPTION_IF_NULL(address);
if (tensor->data_type() == output_type_id) {
address->ptr_ = tensor->data_c();
@ -146,6 +147,7 @@ void CPUKernelRuntime::AssignInputNodeAddress(const session::KernelGraph *kernel
: std::accumulate(fmt_shape.begin(), fmt_shape.end(), type_size, std::multiplies<size_t>());
auto format = AnfAlgo::GetOutputFormat(item, index);
auto address = CreateDeviceAddress(nullptr, tensor_size, format, output_type_id);
address->set_from_persistent_mem(true);
AnfAlgo::SetOutputAddr(address, index, item.get());
}
}

View File

@ -46,7 +46,9 @@ class CPUMemoryManager : public MemoryManager {
void IncreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs);
void DecreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs);
void *MallocMemFromMemPool(size_t size) override { return CPUMemoryPool::GetInstance().AllocTensorMem(size); }
void *MallocMemFromMemPool(size_t size, bool from_persistent_mem) override {
return CPUMemoryPool::GetInstance().AllocTensorMem(size, from_persistent_mem);
}
void FreeMemFromMemPool(void *device_ptr) override { CPUMemoryPool::GetInstance().FreeTensorMem(device_ptr); }
std::vector<void *> MallocContinuousMemFromMemPool(size_t total_size, std::vector<size_t> size_list) override {
return CPUMemoryPool::GetInstance().AllocContinuousTensorMem(total_size, size_list);

View File

@ -101,6 +101,8 @@ class DeviceAddress : public mindspore::DeviceSync {
bool is_ptr_persisted() const { return is_ptr_persisted_; }
void set_is_ptr_persisted(bool is_ptr_persisted) { is_ptr_persisted_ = is_ptr_persisted; }
void set_host_shape(const ShapeVector &shape) { host_shape_ = shape; }
bool from_persistent_mem() const { return from_persistent_mem_; }
void set_from_persistent_mem(bool from_persistent_mem) { from_persistent_mem_ = from_persistent_mem; }
virtual void set_status(DeviceAddressStatus status) {}
virtual DeviceAddressStatus status() const { return DeviceAddressStatus::kInDevice; }
virtual DeviceAddressType DeviceType() const { return DeviceAddressType::kUnknown; }
@ -145,6 +147,7 @@ class DeviceAddress : public mindspore::DeviceSync {
// The key of device context.
std::string device_name_{""};
uint32_t device_id_{0};
bool from_persistent_mem_{false};
friend class KernelRuntime;
friend class MemoryManager;

View File

@ -422,7 +422,7 @@ void GPUKernelRuntime::FetchMemUnitSize(const session::KernelGraph *graph) {
current_sum_size = 0;
}
}
if (max_sum_size > GPUMemoryAllocator::GetInstance().mem_alloc_unit_size()) {
if (max_sum_size > GPUMemoryAllocator::GetInstance().MemAllocUnitSize()) {
size_t unit_size = (max_sum_size / DYNAMIC_MEM_ALLOC_UNIT_SIZE + 1) * DYNAMIC_MEM_ALLOC_UNIT_SIZE;
if (unit_size < DYNAMIC_MEM_ALLOC_UNIT_SIZE) {
MS_LOG(WARNING) << "Current memory unit size [" << unit_size << "] is too small.";
@ -432,7 +432,7 @@ void GPUKernelRuntime::FetchMemUnitSize(const session::KernelGraph *graph) {
constexpr float kValidMemoryRatio = 0.9;
free_mem_size = kValidMemoryRatio * free_mem_size;
unit_size = std::min(unit_size, free_mem_size);
GPUMemoryAllocator::GetInstance().set_mem_alloc_unit_size(unit_size);
GPUMemoryAllocator::GetInstance().SetMemAllocUintSize(unit_size);
}
}

View File

@ -25,6 +25,7 @@ namespace mindspore {
namespace device {
namespace gpu {
const size_t kGBToByte = 1024 << 20;
constexpr float kReservedMemoryRatio = 0.0625; // 1/16
bool GPUMemoryAllocator::Init() {
size_t total_size = CudaDriver::total_mem_size();
@ -42,6 +43,13 @@ bool GPUMemoryAllocator::Init() {
<< total_size << ", current free memory size " << free_size << ", set max available memory size "
<< available_device_memory_ << ".";
}
// In GPU mode, reserve 1/16 of total memory for other CUDA functions
if (available_device_memory_ > total_size) {
size_t recommend_mem_size_for_others = FloatToSize(total_size * kReservedMemoryRatio);
SetMempoolBlockSize(std::min(available_device_memory_, total_size - recommend_mem_size_for_others));
} else {
SetMempoolBlockSize(std::min(available_device_memory_, total_size));
}
return true;
}
@ -71,7 +79,7 @@ bool GPUMemoryAllocator::AllocBufferQueueMem(size_t size, DeviceMemPtr *addr) {
auto alloc_size = AllocDeviceMem(size, addr);
buffer_q_addr_ = *addr;
// Buffer queue needs to ensure that the alloc_size and size are equal.
return (alloc_size == size) ? true : false;
return alloc_size == size;
}
size_t GPUMemoryAllocator::AllocDeviceMem(size_t size, DeviceMemPtr *addr) {
@ -90,8 +98,9 @@ size_t GPUMemoryAllocator::AllocDeviceMem(size_t size, DeviceMemPtr *addr) {
}
total_used_device_memory_ += alloc_size;
available_device_memory_ -= alloc_size;
MS_LOG(INFO) << "Current free memory size[" << free_size - alloc_size << "], current alloc size[" << alloc_size
<< "], total used size[" << total_used_device_memory_ << "].";
MS_LOG(INFO) << "Cuda current free memory size[" << free_size << "], alloc size[" << alloc_size
<< "], left free memory size[" << free_size - alloc_size << "]"
<< ".Total used size[" << total_used_device_memory_ << "].";
return alloc_size;
}
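To make the 1/16 reservation above concrete, a back-of-the-envelope sketch (the 16GB device and the default 1024GB max_device_memory are assumed values, not measurements):

GB = 1024 ** 3
total_size = 16 * GB                 # assumed physical device memory
available_device_memory = 1024 * GB  # default max_device_memory (1024GB)
reserved_for_others = int(total_size * 0.0625)  # kReservedMemoryRatio = 1/16
if available_device_memory > total_size:
    block_cap = min(available_device_memory, total_size - reserved_for_others)
else:
    block_cap = min(available_device_memory, total_size)
print(block_cap / GB)  # 15.0 -> the memory pool block size is capped at 15GB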

View File

@ -24,8 +24,8 @@
namespace mindspore {
namespace device {
namespace gpu {
void *GPUMemoryManager::MallocMemFromMemPool(size_t size) {
return GPUMemoryAllocator::GetInstance().AllocTensorMem(size);
void *GPUMemoryManager::MallocMemFromMemPool(size_t size, bool from_persistent_mem) {
return GPUMemoryAllocator::GetInstance().AllocTensorMem(size, from_persistent_mem);
}
void GPUMemoryManager::FreeMemFromMemPool(void *device_ptr) {
@ -39,7 +39,7 @@ std::vector<void *> GPUMemoryManager::MallocContinuousMemFromMemPool(size_t tota
bool GPUMemoryManager::MallocContinuousMemFromMemPool(const DeviceAddressPtrList &addr_list, size_t total_size,
std::vector<size_t> size_list) {
auto device_ptr_list = MallocContinuousMemFromMemPool(total_size, size_list);
if (device_ptr_list.size() == 0) {
if (device_ptr_list.empty()) {
return false;
}
if (addr_list.size() != device_ptr_list.size()) {
@ -76,7 +76,7 @@ void GPUMemoryManager::MallocDeviceMemory() {
if (ps::ps_cache_instance.initialized_ps_cache()) {
return;
}
auto device_addr = MallocMemFromMemPool(1);
auto device_addr = MallocMemFromMemPool(1, false);
if (!device_addr) {
MS_LOG(EXCEPTION) << "Dynamic memory pool init error.";
}
@ -87,7 +87,7 @@ void GPUMemoryManager::FreeDeviceMemory() { GPUMemoryAllocator::GetInstance().Re
uint8_t *GPUMemoryManager::MallocStaticMem(size_t size, bool, uint32_t) {
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
auto device_ptr = MallocMemFromMemPool(size);
auto device_ptr = MallocMemFromMemPool(size, false);
if (device_ptr == nullptr) {
MS_LOG(EXCEPTION) << "Device memory isn't enough and alloc failed, alloc size:" << size;
}

View File

@ -29,7 +29,7 @@ class GPUMemoryManager : public MemoryManager {
void MallocDeviceMemory() override;
void FreeDeviceMemory() override;
void *MallocMemFromMemPool(size_t size) override;
void *MallocMemFromMemPool(size_t size, bool from_persistent_mem) override;
void FreeMemFromMemPool(void *device_ptr) override;
std::vector<void *> MallocContinuousMemFromMemPool(size_t total_size, std::vector<size_t> size_list) override;
bool MallocContinuousMemFromMemPool(const DeviceAddressPtrList &addr_list, size_t total_size,

View File

@ -258,6 +258,7 @@ void KernelRuntime::RunOpMallocPre(const session::KernelGraph &graph,
auto output_format = op_runtime_info->output_format(index);
auto device_address =
CreateDeviceAddress(nullptr, output_tensor_size, output_format, output_type_id, {item, index});
device_address->set_from_persistent_mem(current_tensor->is_parameter());
AnfAlgo::SetOutputAddr(device_address, index, item.get());
current_tensor->set_device_address(device_address);
current_tensor->set_sync_status(kNeedSyncHostToDevice);
@ -287,6 +288,7 @@ void KernelRuntime::ResetNodeAddress(const session::KernelGraph &kernel_graph) {
auto tensor_size = AnfAlgo::GetOutputTensorMemSize(input_node, index);
auto device_address = CreateDeviceAddress(nullptr, tensor_size, AnfAlgo::GetOutputFormat(input_node, index),
output_type_id, {input_node, index});
device_address->set_from_persistent_mem(input_node->isa<Parameter>());
AnfAlgo::SetOutputAddr(device_address, index, input_node.get());
}
@ -306,7 +308,7 @@ void KernelRuntime::ResetNodeAddress(const session::KernelGraph &kernel_graph) {
}
void KernelRuntime::RunOpAssignMemory(const std::vector<tensor::TensorPtr> &input_tensors,
const session::KernelGraph &graph,
const session::KernelGraph &graph, bool is_gradient_out,
const std::map<tensor::TensorPtr, session::KernelWithIndex> &tensor_to_node) {
MS_EXCEPTION_IF_NULL(mem_manager_);
mem_manager_->ResetDynamicMemory();
@ -319,7 +321,7 @@ void KernelRuntime::RunOpAssignMemory(const std::vector<tensor::TensorPtr> &inpu
RunOpAssignInputMemory(input_tensors, graph);
AssignStaticMemoryValueNode(graph);
for (const auto &node : graph.execution_order()) {
RunOpAssignOutputMemory(node, tensor_to_node);
RunOpAssignOutputMemory(node, tensor_to_node, is_gradient_out);
RunOpAssignWorkSpaceMemory(node);
}
UpdateRefNodeOutputMem(graph);
@ -398,6 +400,7 @@ void KernelRuntime::RunOpAssignInputMemory(const std::vector<tensor::TensorPtr>
auto current_tensor = input_tensors[input_index];
MS_EXCEPTION_IF_NULL(current_tensor);
auto output_address = std::dynamic_pointer_cast<device::DeviceAddress>(current_tensor->device_address());
// Device address has already been created
if (output_address != nullptr && output_address->DeviceType() == GetTargetDeviceAddressType()) {
if (output_address->ptr_ == nullptr) {
if (!mem_manager_->MallocMemFromMemPool(output_address, output_address->size())) {
@ -413,10 +416,12 @@ void KernelRuntime::RunOpAssignInputMemory(const std::vector<tensor::TensorPtr>
output_type_id = AnfAlgo::GetOutputInferDataType(item, index);
}
auto tensor_size = AnfAlgo::GetOutputTensorMemSize(item, index);
// Create a new device address
auto device_address =
CreateDeviceAddress(nullptr, tensor_size, AnfAlgo::GetOutputFormat(item, index), output_type_id, {item, index});
MS_EXCEPTION_IF_NULL(device_address);
MS_EXCEPTION_IF_NULL(mem_manager_);
device_address->set_from_persistent_mem(true);
auto ret = mem_manager_->MallocMemFromMemPool(device_address, tensor_size);
if (!ret) {
MS_LOG(EXCEPTION) << "Device memory isn't enough and alloc failed, alloc size:" << tensor_size;
@ -426,8 +431,9 @@ void KernelRuntime::RunOpAssignInputMemory(const std::vector<tensor::TensorPtr>
}
}
void KernelRuntime::RunOpAssignOutputMemory(
const AnfNodePtr &kernel, const std::map<tensor::TensorPtr, session::KernelWithIndex> &tensor_to_node) {
void KernelRuntime::RunOpAssignOutputMemory(const AnfNodePtr &kernel,
const std::map<tensor::TensorPtr, session::KernelWithIndex> &tensor_to_node,
bool is_gradient_out) {
MS_EXCEPTION_IF_NULL(kernel);
MS_EXCEPTION_IF_NULL(mem_manager_);
auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
@ -464,8 +470,11 @@ void KernelRuntime::RunOpAssignOutputMemory(
std::string output_format = AnfAlgo::GetOutputFormat(kernel, i);
auto output_type = AnfAlgo::GetOutputDeviceDataType(kernel, i);
auto device_address = CreateDeviceAddress(nullptr, output_sizes[i], output_format, output_type, {kernel, i});
device_address->set_host_shape(trans::GetRuntimePaddingShape(kernel, i));
MS_EXCEPTION_IF_NULL(device_address);
device_address->set_host_shape(trans::GetRuntimePaddingShape(kernel, i));
if (is_gradient_out) {
device_address->set_from_persistent_mem(true);
}
auto ret = mem_manager_->MallocMemFromMemPool(device_address, output_sizes[i]);
if (!ret) {
MS_LOG(EXCEPTION) << "Device memory isn't enough and alloc failed, alloc size:" << output_sizes[i];
@ -917,6 +926,7 @@ void KernelRuntime::AssignValueNodeTensor(const ValueNodePtr &value_node, const
auto output_format = AnfAlgo::GetOutputFormat(value_node, output_idx);
DeviceAddressPtr address =
CreateDeviceAddress(nullptr, node_size, output_format, output_type_id, {value_node, output_idx});
address->set_from_persistent_mem(true);
MS_EXCEPTION_IF_NULL(address);
if (ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER) &&
!mem_manager_->MallocMemFromMemPool(address, node_size)) {
@ -979,6 +989,7 @@ void KernelRuntime::AssignStaticMemoryValueNode(const session::KernelGraph &grap
ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode;
auto address = CreateDeviceAddressForStringValue(node_value, use_mem_from_memory_pool, graph.graph_id());
MS_EXCEPTION_IF_NULL(address);
address->set_from_persistent_mem(true);
AnfAlgo::SetOutputAddr(address, 0, value_node.get());
}
}
@ -991,6 +1002,7 @@ DeviceAddressPtr KernelRuntime::CreateDeviceAddressForStringValue(const ValuePtr
size_t tensor_size = value_string.size();
DeviceAddressPtr address = CreateDeviceAddress(nullptr, tensor_size, kOpFormat_DEFAULT, kNumberTypeUInt8);
MS_EXCEPTION_IF_NULL(address);
address->set_from_persistent_mem(true);
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
if (use_mem_pool && !mem_manager_->MallocMemFromMemPool(address, tensor_size)) {

View File

@ -56,6 +56,7 @@ class KernelRuntime {
virtual bool Init() = 0;
virtual void AssignMemory(const session::KernelGraph &graph);
void RunOpAssignMemory(const std::vector<tensor::TensorPtr> &input_tensors, const session::KernelGraph &graph,
bool is_gradient_out,
const std::map<tensor::TensorPtr, session::KernelWithIndex> &tensor_to_node = {});
void AssignCommunicationOutputFromMemoryPool(const AnfNodePtr &node) const;
void AssignCommunicationInputFromMemoryPool(const AnfNodePtr &node) const;
@ -173,7 +174,8 @@ class KernelRuntime {
const std::shared_ptr<MemScheduler> &mem_schedule = nullptr);
void RunOpAssignInputMemory(const std::vector<tensor::TensorPtr> &input_tensors, const session::KernelGraph &graph);
void RunOpAssignOutputMemory(const AnfNodePtr &kernel,
const std::map<tensor::TensorPtr, session::KernelWithIndex> &tensor_to_node = {});
const std::map<tensor::TensorPtr, session::KernelWithIndex> &tensor_to_node,
bool is_gradient_out);
void RunOpAssignWorkSpaceMemory(const AnfNodePtr &kernel);
void RunOpAssignOutputNodeMemory(const ValuePtr &pre_output_value, const session::KernelGraph &graph);
void AssignValueNodeTensor(const ValueNodePtr &value_node, const ValuePtr &node_value, size_t output_idx);

View File

@ -141,9 +141,9 @@ uint8_t *MemoryManager::MallocDynamicMem(size_t size, bool communication_mem) {
return nullptr;
}
bool MemoryManager::MallocMemFromMemPool(const DeviceAddressPtr address, size_t size) {
bool MemoryManager::MallocMemFromMemPool(const DeviceAddressPtr &address, size_t size) {
MS_EXCEPTION_IF_NULL(address);
auto device_ptr = MallocMemFromMemPool(size);
auto device_ptr = MallocMemFromMemPool(size, address->from_persistent_mem_);
if (!device_ptr) {
return false;
}
@ -154,7 +154,7 @@ bool MemoryManager::MallocMemFromMemPool(const DeviceAddressPtr address, size_t
return true;
}
void *MemoryManager::MallocMemFromMemPool(size_t size) {
void *MemoryManager::MallocMemFromMemPool(size_t size, bool from_persistent_mem) {
if (size == 0) {
MS_LOG(ERROR) << "MallocMemFromMemPool size is 0.";
}

View File

@ -49,8 +49,10 @@ class MemoryManager : public MemHandler {
virtual uint8_t *MallocMem(MemType type, size_t size, const DeviceAddressPtr &address,
uint32_t graph_id = kInvalidGraphId);
virtual bool MallocMemFromMemPool(const DeviceAddressPtr address, size_t size);
virtual void *MallocMemFromMemPool(size_t size);
// param address is the device address of the corresponding device type
// param from_persistent_mem indicates whether the tensor is a parameter in PyNative mode
virtual bool MallocMemFromMemPool(const DeviceAddressPtr &address, size_t size);
virtual void *MallocMemFromMemPool(size_t size, bool from_persistent_mem);
virtual uint8_t *MallocCommunicationMemFromMemPool(size_t size) { return nullptr; }
virtual void FreeMemFromMemPool(const DeviceAddressPtr address);
virtual void FreeMemFromMemPool(void *device_ptr);
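The comments above introduce the from_persistent_mem flag. A simplified routing sketch (assumed names, not the MindSpore allocator) of how such a flag selects between a persistent pool (parameters and gradient outputs in PyNative mode) and the common pool:

class TwoPoolAllocator:
    """Toy model: route allocations to a persistent or common pool by a flag."""
    def __init__(self):
        self.persistent_pool = []  # parameters and gradient outputs in PyNative mode
        self.common_pool = []      # everything else

    def malloc_mem_from_mem_pool(self, size, from_persistent_mem=False):
        pool = self.persistent_pool if from_persistent_mem else self.common_pool
        pool.append(size)  # stand-in for the real best-fit search and block growth
        return (id(pool), len(pool) - 1)

allocator = TwoPoolAllocator()
allocator.malloc_mem_from_mem_pool(4096, from_persistent_mem=True)  # weight / gradient output
allocator.malloc_mem_from_mem_pool(8192)                            # ordinary activation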
@ -62,7 +64,7 @@ class MemoryManager : public MemHandler {
static size_t GetCommunicationAlignSize(size_t input_size);
// swap manager interface
void *MallocDevice(size_t mem_size) override { return MallocMemFromMemPool(mem_size); }
void *MallocDevice(size_t mem_size) override { return MallocMemFromMemPool(mem_size, false); }
void FreeDevice(void *ptr) override {
MS_EXCEPTION_IF_NULL(ptr);
FreeMemFromMemPool(ptr);

View File

@ -607,6 +607,7 @@ void DataPrepareActor::PrepareDataForWeightNode(const AnfNodePtr &backend_node,
(device_tensor->DeviceType() != device_context->GetDeviceAddressType())) {
host_tensor_address = device_context->CreateDeviceAddress(nullptr, device_tensor->GetSize(),
device_tensor->format(), device_tensor->type_id());
host_tensor_address->set_from_persistent_mem(tensor->is_parameter());
} else {
host_tensor_address = device_tensor;
}

View File

@ -100,6 +100,7 @@ void CreateParameterDeviceAddress(const DeviceContext *device_context, const Ker
size_t tensor_size = AnfAlgo::GetOutputTensorMemSize(item, index);
auto device_address = device_context->CreateDeviceAddress(nullptr, tensor_size,
AnfAlgo::GetOutputFormat(item, index), output_type_id);
device_address->set_from_persistent_mem(item->isa<Parameter>());
MS_LOG(DEBUG) << "Create addr for node:" << AnfAlgo::GetNodeDebugString(item) << " addr:" << device_address;
AnfAlgo::SetOutputAddr(device_address, index, item.get());
}
@ -143,6 +144,7 @@ void CreateDeviceAddressForTensorValue(const DeviceContext *device_context, cons
device_context->CreateDeviceAddress(nullptr, tensor_size, output_format, output_type_id);
MS_LOG(DEBUG) << "Create addr for node:" << AnfAlgo::GetNodeDebugString(value_node) << " addr:" << address;
MS_EXCEPTION_IF_NULL(address);
address->set_from_persistent_mem(true);
AnfAlgo::SetOutputAddr(address, output_idx++, value_node.get());
}
}
@ -165,6 +167,7 @@ void CreateValueNodeDeviceAddress(const DeviceContext *device_context, const Ker
size_t tensor_size = value.size();
auto address = device_context->CreateDeviceAddress(nullptr, tensor_size, kOpFormat_DEFAULT, kNumberTypeUInt8);
MS_EXCEPTION_IF_NULL(address);
address->set_from_persistent_mem(true);
MS_LOG(DEBUG) << "Create addr for node:" << AnfAlgo::GetNodeDebugString(value_node) << " addr:" << address;
AnfAlgo::SetOutputAddr(address, 0, value_node.get());
@ -172,7 +175,8 @@ void CreateValueNodeDeviceAddress(const DeviceContext *device_context, const Ker
}
}
void CreateKernelOutputDeviceAddress(const DeviceContext *device_context, const KernelGraphPtr &graph) {
void CreateKernelOutputDeviceAddress(const DeviceContext *device_context, const KernelGraphPtr &graph,
bool is_gradient_out) {
MS_EXCEPTION_IF_NULL(device_context);
MS_EXCEPTION_IF_NULL(graph);
const std::vector<CNodePtr> &kernels = graph->execution_order();
@ -191,6 +195,9 @@ void CreateKernelOutputDeviceAddress(const DeviceContext *device_context, const
auto output_type = AnfAlgo::GetOutputDeviceDataType(kernel, i);
auto address_size = AnfAlgo::GetOutputTensorMemSize(kernel, i);
auto device_address = device_context->CreateDeviceAddress(nullptr, address_size, output_format, output_type);
if (is_gradient_out) {
device_address->set_from_persistent_mem(true);
}
MS_LOG(DEBUG) << "Create addr for node:" << AnfAlgo::GetNodeDebugString(kernel) << " addr:" << device_address;
AnfAlgo::SetOutputAddr(device_address, i, kernel.get());
}
@ -434,7 +441,7 @@ GraphId GraphCompiler::CompileGraphImpl(const KernelGraphPtr &graph, const Devic
device_context->PreprocessBeforeRunGraph(graph);
// Create device address for all anf nodes of graph.
CreateDeviceAddress(graph, device_context);
CreateDeviceAddress(graph, device_context, false);
graph->set_is_all_nop_node(opt::IsAllNopNode(graph.get()));
@ -498,7 +505,7 @@ GraphId GraphCompiler::CompileGraph(const session::OpRunInfo &op_run_info, bool
device_context->OptimizeSingleOpGraph(graph);
// Create device address for all anf nodes of graph.
CreateDeviceAddressWithoutWorkspace(graph, device_context);
CreateDeviceAddressWithoutWorkspace(graph, device_context, op_run_info.is_gradient_out);
graph->set_is_all_nop_node(opt::IsAllNopNode(graph.get()));
run_op_graphs_[op_run_info.graph_info] = graph;
@ -546,20 +553,22 @@ KernelGraphPtr GraphCompiler::Fetch(const GraphInfo &graph_info) const {
return iter->second;
}
void GraphCompiler::CreateDeviceAddress(const KernelGraphPtr &graph, const DeviceContext *device_context) const {
void GraphCompiler::CreateDeviceAddress(const KernelGraphPtr &graph, const DeviceContext *device_context,
bool is_gradient_out) const {
CreateParameterDeviceAddress(device_context, graph);
CreateValueNodeDeviceAddress(device_context, graph);
CreateKernelOutputDeviceAddress(device_context, graph);
CreateKernelOutputDeviceAddress(device_context, graph, is_gradient_out);
CreateKernelWorkspaceDeviceAddress(device_context, graph);
UpdateDeviceAddressForInplaceNode(graph);
UpdateDeviceAddressForRefNode(graph);
}
void GraphCompiler::CreateDeviceAddressWithoutWorkspace(const KernelGraphPtr &graph,
const DeviceContext *device_context) const {
const DeviceContext *device_context,
bool is_gradient_out) const {
CreateParameterDeviceAddress(device_context, graph);
CreateValueNodeDeviceAddress(device_context, graph);
CreateKernelOutputDeviceAddress(device_context, graph);
CreateKernelOutputDeviceAddress(device_context, graph, is_gradient_out);
UpdateDeviceAddressForInplaceNode(graph);
UpdateDeviceAddressForRefNode(graph);
}
@ -593,24 +602,27 @@ TensorPtr GraphCompiler::GetSingleOpInputTensorByIndex(const CNodePtr &kernel,
}
void GraphCompiler::GetSingleOpRunInfoAndGraphInfo(const CNodePtr &kernel, const InputTensorInfo &tensor_info,
OpRunInfo *run_info, GraphInfo *graph_info) {
OpRunInfo *run_info, GraphInfo *graph_info,
GraphOutputInfo *const graph_output_info) {
MS_EXCEPTION_IF_NULL(session_);
MS_EXCEPTION_IF_NULL(graph_info);
*graph_info = session_->GetSingleOpGraphInfo(kernel, tensor_info.input_tensors);
*run_info = session_->GetSingleOpRunInfo(kernel, *graph_info, tensor_info);
*run_info = session_->GetSingleOpRunInfo(kernel, *graph_info, tensor_info, graph_output_info);
}
void GraphCompiler::CalculateRefCount(const KernelGraphPtr &graph, std::map<KernelWithIndex, size_t> *ref_count) const {
void GraphCompiler::CalculateRefCount(const KernelGraphPtr &graph, std::map<KernelWithIndex, size_t> *ref_count,
std::map<AnfNodePtr, size_t> *forward_output_refcount) const {
MS_EXCEPTION_IF_NULL(session_);
session_->GetRefCount(graph.get(), ref_count);
session_->GetForwardOutputRefCount(graph.get(), forward_output_refcount);
}
void GraphCompiler::UpdateRefCount(const std::set<KernelWithIndex> &input_kernels_with_index,
std::map<KernelWithIndex, size_t> *ref_count,
std::map<AnfNodePtr, size_t> *forward_output_refcount,
std::map<KernelWithIndex, tensor::TensorPtr> *op_output_map) const {
MS_EXCEPTION_IF_NULL(session_);
session_->HandleOpInputs(input_kernels_with_index, ref_count, op_output_map);
session_->HandleOpInputs(input_kernels_with_index, ref_count, forward_output_refcount, op_output_map);
}
void GraphCompiler::RecoverGraphOutput(const AnfNodePtr &kernel, const VectorRef &op_outputs,

View File

@ -139,14 +139,16 @@ class GraphCompiler {
// Get OpRunInfo and GraphInfo for single op compile and run.
void GetSingleOpRunInfoAndGraphInfo(const CNodePtr &kernel, const InputTensorInfo &tensor_info, OpRunInfo *run_info,
GraphInfo *graph_info);
GraphInfo *graph_info, GraphOutputInfo *const graph_output_info);
// Calculate ref count of PyNative back propagation operators.
void CalculateRefCount(const KernelGraphPtr &graph, std::map<KernelWithIndex, size_t> *ref_count) const;
void CalculateRefCount(const KernelGraphPtr &graph, std::map<KernelWithIndex, size_t> *ref_count,
std::map<AnfNodePtr, size_t> *forward_output_refcount) const;
// Update ref count of PyNative back propagation operators.
void UpdateRefCount(const std::set<KernelWithIndex> &input_kernels_with_index,
std::map<KernelWithIndex, size_t> *ref_count,
std::map<AnfNodePtr, size_t> *forward_output_refcount,
std::map<KernelWithIndex, tensor::TensorPtr> *op_output_map) const;
// Handle single op output tensor and recover output of original complete kernel graph.
@ -182,10 +184,12 @@ class GraphCompiler {
GraphId CompileGraphImpl(const KernelGraphPtr &graph, const DeviceContext *device_context) const;
// Create device address for all anf nodes of graph.
void CreateDeviceAddress(const KernelGraphPtr &graph, const DeviceContext *device_context) const;
void CreateDeviceAddress(const KernelGraphPtr &graph, const DeviceContext *device_context,
bool is_gradient_out) const;
// Create device address for input and output of ops.
void CreateDeviceAddressWithoutWorkspace(const KernelGraphPtr &graph, const DeviceContext *device_context) const;
void CreateDeviceAddressWithoutWorkspace(const KernelGraphPtr &graph, const DeviceContext *device_context,
bool is_gradient_out) const;
// Set Graph's dependencies for pre_graph and post_graph.
void SetGraphDependency(const KernelGraphPtr &graph, const GraphSegmentPtr &segment) const;

View File

@ -1763,6 +1763,7 @@ void GraphScheduler::PersistDeviceTensor(const GraphCompilerInfo &graph_compiler
auto other_type_device_tensor = device_context->CreateDeviceAddress(
nullptr, device_tensor->GetSize(), device_tensor->format(), device_tensor->type_id());
other_type_device_tensor->SetNodeIndex(input_node, 0);
other_type_device_tensor->set_from_persistent_mem(input_node->isa<Parameter>());
AddDeviceTensorStore(front_node.get(), other_type_device_tensor);
}
}

View File

@ -511,7 +511,7 @@ bool AscendDeviceContext::AllocateMemory(DeviceAddress *const &address, size_t s
MS_EXCEPTION_IF_NULL(address);
MS_EXCEPTION_IF_NULL(runtime_instance_);
runtime_instance_->SetContext();
auto device_ptr = mem_manager_->MallocMemFromMemPool(size);
auto device_ptr = mem_manager_->MallocMemFromMemPool(size, address->from_persistent_mem_);
if (!device_ptr) {
return false;
}

View File

@ -89,7 +89,7 @@ bool CPUDeviceContext::AllocateMemory(DeviceAddress *const &address, size_t size
MS_LOG(EXCEPTION) << "The device address type is wrong: " << address->DeviceType();
}
auto device_ptr = mem_manager_->MallocMemFromMemPool(size);
auto device_ptr = mem_manager_->MallocMemFromMemPool(size, false);
if (!device_ptr) {
return false;
}

View File

@ -173,7 +173,7 @@ bool GPUDeviceContext::AllocateMemory(DeviceAddress *const &address, size_t size
if (!BindDeviceToCurrentThread()) {
return false;
}
auto device_ptr = mem_manager_->MallocMemFromMemPool(size);
auto device_ptr = mem_manager_->MallocMemFromMemPool(size, address->from_persistent_mem_);
if (!device_ptr) {
return false;
}

View File

@ -234,7 +234,7 @@ void UpdateOutput(const std::vector<session::KernelWithIndex> &output_nodes, Vec
}
}
void ClearGraphDeviceAddress(const KernelGraphPtr &graph, const DeviceContext *device_context) {
void ClearGraphDeviceAddress(const KernelGraphPtr &graph, const DeviceContext *device_context, bool is_gradient_out) {
MS_EXCEPTION_IF_NULL(graph);
for (const auto &node : graph->execution_order()) {
auto output_address_num = AnfAlgo::GetOutputAddressNum(node);
@ -252,6 +252,9 @@ void ClearGraphDeviceAddress(const KernelGraphPtr &graph, const DeviceContext *d
MS_EXCEPTION_IF_NULL(new_device_address);
new_device_address->set_original_ref_count(device_address->original_ref_count());
new_device_address->ResetRefCount();
if (is_gradient_out) {
new_device_address->set_from_persistent_mem(true);
}
AnfAlgo::SetOutputAddr(new_device_address, i, node.get());
}
}
@ -800,9 +803,10 @@ void MindRTBackend::RunGraphBySingleOp(const std::vector<KernelGraphPtr> &graphs
&graph_output_info.output_indexes);
std::map<KernelWithIndex, size_t> cnode_ref_count;
std::map<AnfNodePtr, size_t> forward_output_refcount;
auto iter = cnode_ref_counts_.find(graph->graph_id());
if (iter == cnode_ref_counts_.end()) {
graph_compiler_->CalculateRefCount(graph, &cnode_ref_count);
graph_compiler_->CalculateRefCount(graph, &cnode_ref_count, &forward_output_refcount);
(void)cnode_ref_counts_.emplace(graph->graph_id(), cnode_ref_count);
} else {
cnode_ref_count = iter->second;
@ -822,7 +826,8 @@ void MindRTBackend::RunGraphBySingleOp(const std::vector<KernelGraphPtr> &graphs
GraphInfo graph_info;
graph_compiler_->GetSingleOpInputTensors(kernel, op_output_map, parameter_index, inputs[graph_index],
&input_tensor_info);
graph_compiler_->GetSingleOpRunInfoAndGraphInfo(kernel, input_tensor_info, &op_run_info, &graph_info);
graph_compiler_->GetSingleOpRunInfoAndGraphInfo(kernel, input_tensor_info, &op_run_info, &graph_info,
&graph_output_info);
RunOp(&op_run_info, &op_outputs);
} else {
@ -832,7 +837,8 @@ void MindRTBackend::RunGraphBySingleOp(const std::vector<KernelGraphPtr> &graphs
runtime::OpLazyBuilder::GetInstance().ExecuteRemainingTasks();
}
graph_compiler_->UpdateRefCount(input_tensor_info.input_kernel, &cnode_ref_count, &op_output_map);
graph_compiler_->UpdateRefCount(input_tensor_info.input_kernel, &cnode_ref_count, &forward_output_refcount,
&op_output_map);
graph_output_info.graph_output_tensors.clear();
graph_compiler_->RecoverGraphOutput(kernel, op_outputs, cnode_ref_count, &op_output_map, &graph_output_info);
@ -1246,7 +1252,8 @@ void MindRTBackend::LazyExecuteTaskCallback() {
const auto &context = op_run_task->context();
RunSingleOpGraph(context->graph(), context->output_nodes(), context->op_run_info(),
context->graph_compiler_info(), context->device_context());
ClearGraphDeviceAddress(context->graph(), context->device_context());
ClearGraphDeviceAddress(context->graph(), context->device_context(), false);
UpdateInputDeviceAddress(context->graph());
op_lazy_builder.PopOpRunTask();
@ -1304,7 +1311,7 @@ void MindRTBackend::RunOpInternal(bool single_op_cache_hit, GraphCompilerInfo *g
}
RunSingleOpGraph(graph, output_nodes, *op_run_info, graph_compiler_info, device_context);
UpdateOutput(output_nodes, outputs);
ClearGraphDeviceAddress(graph, device_context);
ClearGraphDeviceAddress(graph, device_context, false);
UpdateInputDeviceAddress(graph);
if (op_run_info->is_dynamic_shape) {
UpdateOutputAbstract(graph, op_run_info);

View File

@ -273,6 +273,14 @@ class _Context:
raise ValueError("For 'context.set_context', the argument 'max_device_memory' should not be \"0GB\".")
self.set_param(ms_ctx_param.max_device_memory, max_device_memory_value)
def set_mempool_block_size(self, mempool_block_size):
if not Validator.check_str_by_regular(mempool_block_size, _re_pattern):
raise ValueError("Context param mempool_block_size should be in correct format! Such as \"10GB\"")
mempool_block_size_value = float(mempool_block_size[:-2])
if mempool_block_size_value < 1.0:
raise ValueError("Context param mempool_block_size should be greater or equal to \"1GB\"")
self.set_param(ms_ctx_param.mempool_block_size, mempool_block_size_value)
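A standalone sketch of how the validation above is expected to behave (the regular expression is a hypothetical stand-in for _re_pattern; this is not the MindSpore source):

import re

ASSUMED_PATTERN = r"[1-9][0-9]*(\.)?[0-9]*GB|0\.[0-9]*GB"  # assumed stand-in for _re_pattern

def check_mempool_block_size(value):
    if not re.match(ASSUMED_PATTERN, value):
        raise ValueError("Context param mempool_block_size should be in correct format, such as \"10GB\".")
    size_in_gb = float(value[:-2])
    if size_in_gb < 1.0:
        raise ValueError("Context param mempool_block_size should be greater than or equal to \"1GB\".")
    return size_in_gb

print(check_mempool_block_size("2GB"))  # 2.0
# check_mempool_block_size("0.5GB")     # would raise ValueError: below the 1GB minimum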
def set_print_file_path(self, file_path):
"""Add timestamp suffix to file name. Sets print file path."""
print_file_path = os.path.realpath(file_path)
@ -317,6 +325,7 @@ class _Context:
'profiling_options': set_profiling_options,
'variable_memory_max_size': set_variable_memory_max_size,
'max_device_memory': set_max_device_memory,
'mempool_block_size': set_mempool_block_size,
'print_file_path': set_print_file_path,
'env_config_path': set_env_config_path
}
@ -570,7 +579,8 @@ def _check_target_specific_cfgs(device, arg_key):
'print_file_path': ['Ascend'],
'variable_memory_max_size': ['Ascend'],
'auto_tune_mode': ['Ascend'],
'max_device_memory': ['GPU']
'max_device_memory': ['GPU'],
'mempool_block_size': ['GPU', 'Ascend']
}
# configs not in map device_cfgs are supposed to be suitable for all devices
if not arg_key in device_cfgs:
@ -583,15 +593,15 @@ def _check_target_specific_cfgs(device, arg_key):
return False
@args_unreset_check(device_id=int, variable_memory_max_size=str, max_device_memory=str)
@args_unreset_check(device_id=int, variable_memory_max_size=str, max_device_memory=str, mempool_block_size=str)
@args_type_check(mode=int, precompile_only=bool, device_target=str, device_id=int, save_graphs=bool,
save_graphs_path=str, enable_dump=bool, auto_tune_mode=str,
save_dump_path=str, enable_reduce_precision=bool, variable_memory_max_size=str,
enable_profiling=bool, profiling_options=str, enable_auto_mixed_precision=bool,
enable_graph_kernel=bool, reserve_class_name_in_scope=bool, check_bprop=bool,
max_device_memory=str, print_file_path=str, enable_sparse=bool, max_call_depth=int,
env_config_path=str, graph_kernel_flags=str, enable_compile_cache=bool,
compile_cache_path=str, grad_for_scalar=bool, pynative_synchronize=bool)
env_config_path=str, graph_kernel_flags=str, save_compile_cache=bool,
load_compile_cache=bool, grad_for_scalar=bool, pynative_synchronize=bool, mempool_block_size=str)
def set_context(**kwargs):
"""
Set context for running environment.
@ -616,6 +626,8 @@ def set_context(**kwargs):
| | max_device_memory | GPU |
| +------------------------------+----------------------------+
| | variable_memory_max_size | Ascend |
| +------------------------------+----------------------------+
| | mempool_block_size | GPU/Ascend |
+-------------------------+------------------------------+----------------------------+
| Debug Configuration | save_graphs | CPU/GPU/Ascend |
| +------------------------------+----------------------------+
@ -672,6 +684,9 @@ def set_context(**kwargs):
The actual used memory size is the minimum of the available memory of the device and max_device_memory.
variable_memory_max_size (str): Set the maximum size of the variable memory max size. Default: "30GB".
After this parameter is set, the maximum memory used by the framework is restricted to the configured value.
mempool_block_size (str): Set the size of the memory pool block for the device in PyNative mode.
The format is "xxGB". Default: "1GB". The minimum size is "1GB". The actual memory pool block size used is
the minimum of the available memory of the device and mempool_block_size.
save_graphs (bool): Whether to save graphs. Default: False.
When the `save_graphs` attribute is set as True, attribute of `save_graphs_path` is used to set the
intermediate compilation graph storage path. By default, the graphs are saved in the current directory.
@ -813,6 +828,7 @@ def set_context(**kwargs):
... profiling_options='{"output":"/home/data/output","training_trace":"on"}')
>>> context.set_context(check_bprop=True)
>>> context.set_context(max_device_memory="3.5GB")
>>> context.set_context(mempool_block_size="1GB")
>>> context.set_context(print_file_path="print.pb")
>>> context.set_context(enable_sparse=True)
>>> context.set_context(max_call_depth=80)

View File

@ -86,6 +86,7 @@ MsContext::MsContext(const std::string &policy, const std::string &target) {
set_param<std::string>(MS_CTX_PROFILING_OPTIONS, "training_trace");
set_param<bool>(MS_CTX_CHECK_BPROP_FLAG, false);
set_param<float>(MS_CTX_MAX_DEVICE_MEMORY, kDefaultMaxDeviceMemory);
set_param<float>(MS_CTX_MEMPOOL_BLOCK_SIZE, kDefaultMempoolBlockSize);
set_param<std::string>(MS_CTX_PRINT_FILE_PATH, "");
set_param<bool>(MS_CTX_ENABLE_GRAPH_KERNEL, false);
set_param<bool>(MS_CTX_ENABLE_SPARSE, false);
@ -189,5 +190,4 @@ bool MsContext::enable_dump_ir() const {
return false;
#endif
}
} // namespace mindspore

View File

@ -60,6 +60,8 @@ const unsigned int MAX_CALL_DEPTH_DEFAULT = 1000;
const std::set<std::string> kTargetSet = {kCPUDevice, kGPUDevice, kAscendDevice, kDavinciDevice};
// The default max available device memory is 1024GB.
const float kDefaultMaxDeviceMemory = 1024;
// The default memory pool block size is 1.0GB.
const float kDefaultMempoolBlockSize = 1.0;
// enum definition for MindSpore Context Parameter
enum MsCtxParam : unsigned {
@ -109,6 +111,7 @@ enum MsCtxParam : unsigned {
// parameter of type float
MS_CTX_TYPE_FLOAT_BEGIN = MS_CTX_TYPE_UINT32_END,
MS_CTX_MAX_DEVICE_MEMORY = MS_CTX_TYPE_FLOAT_BEGIN,
MS_CTX_MEMPOOL_BLOCK_SIZE,
MS_CTX_TYPE_FLOAT_END,
// parameter of type string