diff --git a/docs/api/api_python/mindspore.context.rst b/docs/api/api_python/mindspore.context.rst index d84a25e89c9..0589fdf75cc 100644 --- a/docs/api/api_python/mindspore.context.rst +++ b/docs/api/api_python/mindspore.context.rst @@ -24,6 +24,8 @@ MindSpore上下文,用于配置当前执行环境,包括执行模式、执 | | max_device_memory | GPU | | +------------------------------+----------------------------+ | | variable_memory_max_size | Ascend | + | +------------------------------+----------------------------+ + | | mempool_block_size | GPU/Ascend | +-------------------------+------------------------------+----------------------------+ | 调试配置 | save_graphs | CPU/GPU/Ascend | | +------------------------------+----------------------------+ @@ -76,6 +78,7 @@ MindSpore上下文,用于配置当前执行环境,包括执行模式、执 - **device_target** (str) - 表示待运行的目标设备,支持Ascend、GPU和CPU。如果未设置设备目标,则使用MindSpore包的版本。 - **max_device_memory** (str) - 设置设备可用的最大内存。目前,仅在GPU上支持。格式为“xxGB”。默认值:1024GB。实际使用的内存大小是设备的可用内存和 `max_device_memory` 值中的最小值。 - **variable_memory_max_size** (str) - 设置可变内存的最大值。默认值:30GB。设置此参数后,框架使用的最大内存受配置值的限制。 + - **mempool_block_size** (str) - 设置PyNative模式下设备内存池的块大小。格式为“xxGB”。默认值:1GB。最小值是1GB。实际使用的内存池块大小是设备的可用内存和 `mempool_block_size` 值中的最小值。 - **save_graphs** (bool) - 表示是否保存图形。默认值:False。当 `save_graphs` 属性设为True时, `save_graphs_path` 属性用于设置中间编译图的存储路径。默认情况下,图形保存在当前目录下。 - **save_graphs_path** (str) - 表示保存图形的路径。默认值:"."。如果指定的目录不存在,系统将自动创建该目录。在分布式训练中,图形将被保存到 `save_graphs_path/rank_${rank_id}/` 目录下。 `rank_id` 为集群中当前设备的ID。 - **enable_dump** (bool) - 此参数已弃用,将在下一版本中删除。 @@ -155,6 +158,7 @@ MindSpore上下文,用于配置当前执行环境,包括执行模式、执 ... profiling_options='{"output":"/home/data/output","training_trace":"on"}') >>> context.set_context(check_bprop=True) >>> context.set_context(max_device_memory="3.5GB") + >>> context.set_context(mempool_block_size="1GB") >>> context.set_context(print_file_path="print.pb") >>> context.set_context(enable_sparse=True) >>> context.set_context(max_call_depth=80) diff --git a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_dynamic_allocator.cc b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_dynamic_allocator.cc index 8551e59f098..0af2b61815b 100644 --- a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_dynamic_allocator.cc +++ b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_dynamic_allocator.cc @@ -15,39 +15,51 @@ */ #include "backend/optimizer/mem_reuse/mem_dynamic_allocator.h" +#include #include "utils/ms_utils.h" #include "utils/convert_utils.h" #include "utils/log_adapter.h" +#include "utils/ms_context.h" namespace mindspore { namespace device { -DynamicMemPoolBestFit::~DynamicMemPoolBestFit() { - global_mem_block_list_.clear(); - global_idle_mem_buf_map_.clear(); +static const char kPynativeParamMem[] = "Pynative unique mem"; +static const char kCommonMem[] = "Common mem"; +const size_t kGBToByte = 1024 << 20; + +static bool IsPynativeMode() { + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + return ms_context->get_param(MS_CTX_EXECUTION_MODE) == kPynativeMode; } -DeviceMemPtr DynamicMemPoolBestFit::AllocTensorMem(size_t size) { +DynamicMemPoolBestFit::~DynamicMemPoolBestFit() { + persistent_mem_->clear(); + common_mem_->clear(); +} + +DeviceMemPtr DynamicMemPoolBestFit::AllocTensorMem(size_t size, bool from_persistent_mem) { size_t align_size = AlignMemorySize(size); std::lock_guard locker(mutex_); // Find the idle memory buf by tensor size, if not find, then add new memory block and memory buf. 
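// [Editor's illustration, not part of this patch] The step described in the comment above is a
// best-fit lookup: idle buffers are keyed by size in a std::multimap, so lower_bound(size)
// returns the smallest idle buffer that still fits the request, and only when none fits does the
// pool add a new memory block. DemoBuf, IdleMap and BestFitTake are invented names for this sketch.
#include <cstddef>
#include <map>

struct DemoBuf {
  void *addr;
  std::size_t size;
};
using IdleMap = std::multimap<std::size_t, DemoBuf *>;

DemoBuf *BestFitTake(IdleMap *idle, std::size_t size) {
  auto it = idle->lower_bound(size);  // smallest idle buffer whose size >= request
  if (it == idle->end()) {
    return nullptr;  // caller falls back to adding a new memory block
  }
  DemoBuf *buf = it->second;
  idle->erase(it);  // the buffer leaves the idle map while it is in use
  return buf;
}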
- DeviceMemPtr device_addr = FindIdleMemBuf(align_size); + DeviceMemPtr device_addr = FindIdleMemBuf(align_size, from_persistent_mem); if (!device_addr) { - device_addr = AddMemBlockAndMemBuf(align_size); + device_addr = AddMemBlockAndMemBuf(align_size, from_persistent_mem); } return device_addr; } std::vector DynamicMemPoolBestFit::AllocContinuousTensorMem(size_t total_size, - std::vector size_list) { + const std::vector &size_list) { std::vector device_addr_list; // Pre-alloc the one whole piece memory. - auto device_addr = AllocTensorMem(total_size); + auto device_addr = AllocTensorMem(total_size, false); if (!device_addr) { return device_addr_list; } std::lock_guard locker(mutex_); // Remove the pre-alloc memory. - const auto &mem_block = FindMemBlock(device_addr); + const auto &mem_block = FindMemBlock(device_addr, common_mem_); MS_EXCEPTION_IF_NULL(mem_block); const auto &iter = mem_block->block_all_mem_buf_map_.find(device_addr); if (iter == mem_block->block_all_mem_buf_map_.end()) { @@ -63,11 +75,11 @@ std::vector DynamicMemPoolBestFit::AllocContinuousTensorMem(size_t // Split the pre-alloc memory into continuous memory by the size list. DynamicMemBufPtr continuous_mem_buf; auto buf_addr = device_addr; - for (size_t i = 0; i < size_list.size(); i++) { - continuous_mem_buf = std::make_shared(buf_addr, kMemBufUsed, size_list[i]); + for (size_t i : size_list) { + continuous_mem_buf = std::make_shared(buf_addr, kMemBufUsed, i); (void)mem_block->block_all_mem_buf_map_.emplace(buf_addr, continuous_mem_buf); device_addr_list.emplace_back(buf_addr); - buf_addr = AddressOffset(buf_addr, size_list[i]); + buf_addr = AddressOffset(buf_addr, i); } // Update the size of the last memory buf. continuous_mem_buf->size_ += rest_size; @@ -81,9 +93,14 @@ size_t DynamicMemPoolBestFit::AlignMemorySize(size_t size) const { return ((size + DYNAMIC_MEM_ALIGN_SIZE - 1) / DYNAMIC_MEM_ALIGN_SIZE) * DYNAMIC_MEM_ALIGN_SIZE; } -DeviceMemPtr DynamicMemPoolBestFit::FindIdleMemBuf(size_t size) { - const auto &iter = global_idle_mem_buf_map_.lower_bound(size); - if (iter != global_idle_mem_buf_map_.end()) { +DeviceMemPtr DynamicMemPoolBestFit::FindIdleMemBuf(size_t size, bool from_persistent_mem) { + auto mem_mng = common_mem_; + if (from_persistent_mem) { + mem_mng = persistent_mem_; + } + MS_EXCEPTION_IF_NULL(mem_mng); + const auto &iter = mem_mng->idle_mem_buf_map_.lower_bound(size); + if (iter != mem_mng->idle_mem_buf_map_.end()) { auto mem_buf = iter->second; MS_EXCEPTION_IF_NULL(mem_buf); if (mem_buf->status_ != kMemBufIdle) { @@ -92,24 +109,69 @@ DeviceMemPtr DynamicMemPoolBestFit::FindIdleMemBuf(size_t size) { } mem_buf->status_ = kMemBufUsed; // Remove map of old idle memory buf - (void)global_idle_mem_buf_map_.erase(iter); + (void)mem_mng->idle_mem_buf_map_.erase(iter); // Divide memory buf - if (IsDivide(size, mem_buf->size_)) { - DivideMemBuf(size, mem_buf); + if (IsSplit(size, mem_buf->size_)) { + SplitMemBuf(size, mem_buf, mem_mng); } // Memory statistics - total_used_mem_statistics_ += mem_buf->size_; - if (total_used_mem_statistics_ > used_mem_peak_statistics_) { - used_mem_peak_statistics_ = total_used_mem_statistics_; + mem_mng->mps_.total_used_mem_size_ += mem_buf->size_; + if (mem_mng->mps_.total_used_mem_size_ > mem_mng->mps_.used_mem_peak_size_) { + mem_mng->mps_.used_mem_peak_size_ = mem_mng->mps_.total_used_mem_size_; } return mem_buf->device_addr_; } return nullptr; } -DeviceMemPtr DynamicMemPoolBestFit::AddMemBlockAndMemBuf(size_t size) { - size_t alloc_mem_size = 
CalMemBlockAllocSize(size); +size_t DynamicMemPoolBestFit::MemAllocUnitSize(bool from_persistent_mem) const { + return from_persistent_mem ? persistent_mem_->unit_size_ : common_mem_->unit_size_; +} + +void DynamicMemPoolBestFit::SetMemAllocUintSize(size_t size) { + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + persistent_mem_->unit_size_ = DYNAMIC_MEM_ALLOC_UNIT_SIZE; + common_mem_->unit_size_ = size - persistent_mem_->unit_size_; + MS_LOG(INFO) << "Set mem alloc unit size " << size; +} + +void DynamicMemPoolBestFit::SetMempoolBlockSize(size_t device_mem_size) { + if (!IsPynativeMode()) { + return; + } + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + float mem_block_size = ms_context->get_param<float>(MS_CTX_MEMPOOL_BLOCK_SIZE); + if (mem_block_size == kDefaultMempoolBlockSize) { + return; + } + size_t config_size = FloatToSize(mem_block_size * kGBToByte); + size_t real_block_size = std::min(config_size, device_mem_size); + if (config_size > device_mem_size) { + MS_LOG(WARNING) << "Memory pool block size " << config_size << " is bigger than the currently available maximum memory " + << device_mem_size << ", and the actual effective value will be " << device_mem_size; + } + SetMemAllocUintSize(real_block_size); +} + +DeviceMemPtr DynamicMemPoolBestFit::AddMemBlockAndMemBuf(size_t size, bool from_persistent_mem) { + // Pynative unique mem is not enough, find an idle mem buf from the common pool + if (from_persistent_mem && !persistent_mem_->mem_block_list_.empty()) { + auto mem_addr = FindIdleMemBuf(size, false); + if (mem_addr != nullptr) { + return mem_addr; + } + from_persistent_mem = false; + } + size_t alloc_mem_size = CalMemBlockAllocSize(size, from_persistent_mem); if (alloc_mem_size == 0) { + MS_LOG(DEBUG) << "Try to find an idle mem buf in the other pool"; + auto mem_addr = FindIdleMemBuf(size, !from_persistent_mem); + if (mem_addr != nullptr) { + return mem_addr; + } + DumpDynamicMemPoolInfo(); return nullptr; } // Add new memory block @@ -118,40 +180,50 @@ DeviceMemPtr DynamicMemPoolBestFit::AddMemBlockAndMemBuf(size_t size) { if (real_alloc_size < size) { MS_LOG(WARNING) << "Memory not enough: alloc size[" << real_alloc_size << "] is smaller than required size[" << size << "]."; + DumpDynamicMemPoolInfo(); return nullptr; } - mem_alloc_unit_size_ = DYNAMIC_MEM_ALLOC_UNIT_SIZE; + // In graph mode, unit_size is set once using an estimated memory size, and subsequent memory requests use the + // default size + if (!IsPynativeMode()) { + common_mem_->unit_size_ = DYNAMIC_MEM_ALLOC_UNIT_SIZE; + } + auto mem_mng = common_mem_; + if (from_persistent_mem) { + mem_mng = persistent_mem_; + } + MS_EXCEPTION_IF_NULL(mem_mng); auto mem_block = std::make_shared<DynamicMemBlock>(device_addr, real_alloc_size); MS_EXCEPTION_IF_NULL(mem_block); const auto &iter = - std::upper_bound(global_mem_block_list_.begin(), global_mem_block_list_.end(), device_addr, CmpMemBlock); - (void)global_mem_block_list_.insert(iter, mem_block); + std::upper_bound(mem_mng->mem_block_list_.begin(), mem_mng->mem_block_list_.end(), device_addr, CmpMemBlock); + (void)mem_mng->mem_block_list_.insert(iter, mem_block); // Add new memory buf auto mem_buf = std::make_shared<DynamicMemBuf>(device_addr, kMemBufUsed, real_alloc_size); MS_EXCEPTION_IF_NULL(mem_buf); // Add map of new memory buf in the block (void)mem_block->block_all_mem_buf_map_.emplace(device_addr, mem_buf); - // Divide memory buf - if (IsDivide(size, mem_buf->size_)) { - DivideMemBuf(size, mem_buf); + // Split memory buf + if (IsSplit(size, mem_buf->size_)) { + 
SplitMemBuf(size, mem_buf, mem_mng); } // Memory statistics - total_mem_statistics_ += real_alloc_size; - total_used_mem_statistics_ += mem_buf->size_; - if (total_used_mem_statistics_ > used_mem_peak_statistics_) { - used_mem_peak_statistics_ = total_used_mem_statistics_; + mem_mng->mps_.total_mem_size_ += real_alloc_size; + mem_mng->mps_.total_used_mem_size_ += mem_buf->size_; + if (mem_mng->mps_.total_used_mem_size_ > mem_mng->mps_.used_mem_peak_size_) { + mem_mng->mps_.used_mem_peak_size_ = mem_mng->mps_.total_used_mem_size_; } return mem_buf->device_addr_; } -size_t DynamicMemPoolBestFit::CalMemBlockAllocSize(size_t size) { +size_t DynamicMemPoolBestFit::CalMemBlockAllocSize(size_t size, bool from_persistent_mem) { auto device_free_mem_size = free_mem_size(); if (device_free_mem_size < size) { MS_LOG(WARNING) << "Memory not enough: current free memory size[" << device_free_mem_size << "] is smaller than required size[" << size << "]."; return 0; } - auto alloc_mem_size = mem_alloc_unit_size(); + auto alloc_mem_size = MemAllocUnitSize(from_persistent_mem); // Growing at twice of alloc size constexpr size_t kDouble = 2; while (alloc_mem_size < size) { @@ -161,13 +233,14 @@ size_t DynamicMemPoolBestFit::CalMemBlockAllocSize(size_t size) { return alloc_mem_size; } -bool DynamicMemPoolBestFit::IsDivide(size_t tensor_size, size_t mem_buf_size) const { +bool DynamicMemPoolBestFit::IsSplit(size_t tensor_size, size_t mem_buf_size) const { return mem_buf_size - tensor_size >= DYNAMIC_MEM_ALIGN_SIZE; } -void DynamicMemPoolBestFit::DivideMemBuf(size_t size, const DynamicMemBufPtr &mem_buf) { +void DynamicMemPoolBestFit::SplitMemBuf(size_t size, const DynamicMemBufPtr &mem_buf, + const MemStatusManagerPtr &mem_mng) { MS_EXCEPTION_IF_NULL(mem_buf); - const auto &mem_block = FindMemBlock(mem_buf->device_addr_); + const auto &mem_block = FindMemBlock(mem_buf->device_addr_, mem_mng); MS_EXCEPTION_IF_NULL(mem_block); // Divide new memory buf if (mem_buf->size_ < size) { @@ -180,7 +253,7 @@ void DynamicMemPoolBestFit::DivideMemBuf(size_t size, const DynamicMemBufPtr &me // Add map of new memory buf in the block (void)mem_block->block_all_mem_buf_map_.emplace(newbuf_addr, new_mem_buf); // Add map of new idle memory buf - (void)global_idle_mem_buf_map_.emplace(newbuf_size, new_mem_buf); + (void)mem_mng->idle_mem_buf_map_.emplace(newbuf_size, new_mem_buf); } bool DynamicMemPoolBestFit::CmpMemBlock(const DeviceMemPtr &device_addr, const DynamicMemBlockPtr &mem_block) { @@ -189,11 +262,12 @@ bool DynamicMemPoolBestFit::CmpMemBlock(const DeviceMemPtr &device_addr, const D return device_addr < mem_block->device_addr(); } -DynamicMemBlockPtr DynamicMemPoolBestFit::FindMemBlock(const DeviceMemPtr &device_addr) { +DynamicMemBlockPtr DynamicMemPoolBestFit::FindMemBlock(const DeviceMemPtr &device_addr, + const MemStatusManagerPtr &mem_mng) { MS_EXCEPTION_IF_NULL(device_addr); auto &&iter = - std::upper_bound(global_mem_block_list_.begin(), global_mem_block_list_.end(), device_addr, CmpMemBlock); - if (iter != global_mem_block_list_.begin()) { + std::upper_bound(mem_mng->mem_block_list_.begin(), mem_mng->mem_block_list_.end(), device_addr, CmpMemBlock); + if (iter != mem_mng->mem_block_list_.begin()) { return *(--iter); } return nullptr; @@ -202,16 +276,32 @@ DynamicMemBlockPtr DynamicMemPoolBestFit::FindMemBlock(const DeviceMemPtr &devic void DynamicMemPoolBestFit::FreeTensorMem(const DeviceMemPtr &device_addr) { MS_EXCEPTION_IF_NULL(device_addr); std::lock_guard locker(mutex_); - const auto &mem_block = 
FindMemBlock(device_addr); + auto fn = [this](const MemStatusManagerPtr &mem_mng, const DeviceMemPtr &device_addr) -> DynamicMemBlockPtr { + auto mem_block = FindMemBlock(device_addr, mem_mng); + if (mem_block != nullptr) { + const auto &iter = mem_block->block_all_mem_buf_map_.find(device_addr); + if (iter != mem_block->block_all_mem_buf_map_.end()) { + return mem_block; + } + } + return nullptr; + }; + auto mem_block = fn(common_mem_, device_addr); if (mem_block == nullptr) { - // May be destroy the memory pool first, then destroy the address, so this is normal case. - MS_LOG(DEBUG) << "Can't find the mem_block of the device address[" << device_addr << "]."; - return; + mem_block = fn(persistent_mem_, device_addr); + if (mem_block == nullptr) { + // May be destroy the memory pool first, then destroy the address, so this is normal case. + MS_LOG(DEBUG) << "Can't find the mem_block of the device address[" << device_addr << "]."; + return; + } + CombineMemBuf(mem_block, device_addr, persistent_mem_); + } else { + CombineMemBuf(mem_block, device_addr, common_mem_); } - CombineMemBuf(mem_block, device_addr); } -void DynamicMemPoolBestFit::CombineMemBuf(const DynamicMemBlockPtr &mem_block, const DeviceMemPtr &device_addr) { +void DynamicMemPoolBestFit::CombineMemBuf(const DynamicMemBlockPtr &mem_block, const DeviceMemPtr &device_addr, + const MemStatusManagerPtr &mem_mng) { MS_EXCEPTION_IF_NULL(mem_block); MS_EXCEPTION_IF_NULL(device_addr); const auto &iter = mem_block->block_all_mem_buf_map_.find(device_addr); @@ -224,10 +314,10 @@ void DynamicMemPoolBestFit::CombineMemBuf(const DynamicMemBlockPtr &mem_block, c MS_LOG(EXCEPTION) << "Find the mem_buf is not used, mem_buf_address[" << mem_buf->device_addr_ << "]."; } mem_buf->status_ = kMemBufIdle; - if (total_used_mem_statistics_ < mem_buf->size_) { + if (mem_mng->mps_.total_used_mem_size_ < mem_buf->size_) { MS_LOG(EXCEPTION) << "The total used mem size is less than the size of membuf."; } - total_used_mem_statistics_ -= mem_buf->size_; + mem_mng->mps_.total_used_mem_size_ -= mem_buf->size_; // Combine backward(combine the next_mem_buf to mem_buf) auto next_iter = iter; (void)next_iter++; @@ -236,7 +326,7 @@ void DynamicMemPoolBestFit::CombineMemBuf(const DynamicMemBlockPtr &mem_block, c MS_EXCEPTION_IF_NULL(next_mem_buf); if (next_mem_buf->status_ == kMemBufIdle) { mem_buf->size_ += next_mem_buf->size_; - EraseIdleMemBuf(next_mem_buf->size_, next_mem_buf->device_addr_); + EraseIdleMemBuf(next_mem_buf->size_, next_mem_buf->device_addr_, mem_mng); (void)mem_block->block_all_mem_buf_map_.erase(next_iter); } } @@ -249,7 +339,7 @@ void DynamicMemPoolBestFit::CombineMemBuf(const DynamicMemBlockPtr &mem_block, c prev_mem_buf = prev_iter->second; MS_EXCEPTION_IF_NULL(prev_mem_buf); if (prev_mem_buf->status_ == kMemBufIdle) { - EraseIdleMemBuf(prev_mem_buf->size_, prev_mem_buf->device_addr_); + EraseIdleMemBuf(prev_mem_buf->size_, prev_mem_buf->device_addr_, mem_mng); prev_mem_buf->size_ += mem_buf->size_; (void)mem_block->block_all_mem_buf_map_.erase(iter); forward_combine = true; @@ -257,20 +347,21 @@ void DynamicMemPoolBestFit::CombineMemBuf(const DynamicMemBlockPtr &mem_block, c } // Add map of new idle memory if (forward_combine) { - (void)global_idle_mem_buf_map_.emplace(prev_mem_buf->size_, prev_mem_buf); + (void)mem_mng->idle_mem_buf_map_.emplace(prev_mem_buf->size_, prev_mem_buf); } else { - (void)global_idle_mem_buf_map_.emplace(mem_buf->size_, mem_buf); + (void)mem_mng->idle_mem_buf_map_.emplace(mem_buf->size_, mem_buf); } } -void 
DynamicMemPoolBestFit::EraseIdleMemBuf(size_t size, const DeviceMemPtr &device_addr) { +void DynamicMemPoolBestFit::EraseIdleMemBuf(size_t size, const DeviceMemPtr &device_addr, + const MemStatusManagerPtr &mem_mng) { MS_EXCEPTION_IF_NULL(device_addr); - auto &&iter = global_idle_mem_buf_map_.equal_range(size); + auto &&iter = mem_mng->idle_mem_buf_map_.equal_range(size); while (iter.first != iter.second) { MS_EXCEPTION_IF_NULL(iter.first->second); // Remove map of the idle memory buf by size and device address if (iter.first->second->device_addr_ == device_addr) { - (void)global_idle_mem_buf_map_.erase(iter.first); + (void)mem_mng->idle_mem_buf_map_.erase(iter.first); return; } (void)iter.first++; @@ -280,70 +371,48 @@ void DynamicMemPoolBestFit::EraseIdleMemBuf(size_t size, const DeviceMemPtr &dev void DynamicMemPoolBestFit::ReleaseDeviceRes() { std::lock_guard locker(mutex_); - MS_LOG(INFO) << "The dynamic memory pool total size is " << total_mem_statistics_ << ", total used size is " - << total_used_mem_statistics_ << ", used peak size is " << used_mem_peak_statistics_ << "."; - for (auto iter = global_mem_block_list_.begin(); iter != global_mem_block_list_.end(); ++iter) { - auto &device_addr = (*iter)->device_addr_base_; - if (device_addr != nullptr) { - if (!FreeDeviceMem(device_addr)) { - MS_LOG(EXCEPTION) << "Free device memory[" << device_addr << "] error."; + auto fn = [this](const MemStatusManagerPtr &mem_mng) { + for (auto &iter : mem_mng->mem_block_list_) { + auto &device_addr = iter->device_addr_base_; + if (device_addr != nullptr) { + if (!FreeDeviceMem(device_addr)) { + MS_LOG(EXCEPTION) << "Free device memory[" << device_addr << "] error."; + } + device_addr = nullptr; } - device_addr = nullptr; } - } - - global_mem_block_list_.clear(); - global_idle_mem_buf_map_.clear(); + mem_mng->mem_block_list_.clear(); + mem_mng->idle_mem_buf_map_.clear(); + }; + fn(common_mem_); + fn(persistent_mem_); } void DynamicMemPoolBestFit::DumpDynamicMemPoolInfo() { - std::lock_guard locker(mutex_); - MS_LOG(INFO) << "Start dump dynamic memory pool info."; - DeviceAddrMapMemBuf mem_block_map; - DynamicMemBufPtr mem_buf; - size_t total_mem = 0; - size_t total_used_mem = 0; - size_t total_idle_mem1 = 0; - size_t total_idle_mem2 = 0; - // Dump the memory block info and memory buf info - MS_LOG(INFO) << "Dump all mem_block info: counts[" << global_mem_block_list_.size() << "]."; - for (auto iter = global_mem_block_list_.begin(); iter != global_mem_block_list_.end(); ++iter) { - total_mem += (*iter)->size(); - mem_block_map = (*iter)->block_all_mem_buf_map_; - MS_LOG(INFO) << "MemBlock info: number[" << iter - global_mem_block_list_.begin() << "] mem_buf_counts[" - << mem_block_map.size() << "] base_address[" << (*iter)->device_addr() << "] block_size[" - << (*iter)->size() << "]."; - for (auto iter_mem_buf = mem_block_map.begin(); iter_mem_buf != mem_block_map.end(); ++iter_mem_buf) { - mem_buf = iter_mem_buf->second; - MS_EXCEPTION_IF_NULL(mem_buf); - if (mem_buf->status_ == kMemBufIdle) { - total_idle_mem1 += mem_buf->size_; - } else { - total_used_mem += mem_buf->size_; - } - MS_LOG(INFO) << "MemBuf info: address[" << mem_buf->device_addr_ << "] size[" << mem_buf->size_ << "] status[" - << mem_buf->status_ << "]."; + auto fn = [](const MemStatusManagerPtr &mem_mng, const std::string &mem_type) { + if (mem_mng->mem_block_list_.empty()) { + return; } - } - // Dump all the idle memory buf info - MS_LOG(INFO) << "Dump all idle mem_buf info: counts[" << global_idle_mem_buf_map_.size() << "]."; - 
for (auto iter_idle = global_idle_mem_buf_map_.begin(); iter_idle != global_idle_mem_buf_map_.end(); ++iter_idle) { - mem_buf = iter_idle->second; - MS_EXCEPTION_IF_NULL(mem_buf); - total_idle_mem2 += mem_buf->size_; - MS_LOG(INFO) << "Idle mem_buf info: size[" << mem_buf->size_ << "] address[" << mem_buf->device_addr_ << "] status[" - << mem_buf->status_ << "]."; - } - // Dump the memory statistical info - MS_LOG(INFO) << "Total allocated memory[" << total_mem << "], used memory[" << total_used_mem << "], idle memory[" - << total_idle_mem1 << "]."; - if (total_idle_mem1 != total_idle_mem2) { - MS_LOG(ERROR) << "Check error: the idle memory in the mem_block is not equal the global idle memory."; - } - if (total_mem != total_used_mem + total_idle_mem1) { - MS_LOG(ERROR) << "Check error: the the total memory is not equal the sum of used memory and idle memory."; - } - MS_LOG(INFO) << "Finish dump dynamic memory pool info."; + std::ostringstream buf; + for (size_t i = 0; i < mem_mng->mem_block_list_.size(); ++i) { + size_t idle_size = 0; + for (auto mb = mem_mng->mem_block_list_[i]->block_all_mem_buf_map_.begin(); + mb != mem_mng->mem_block_list_[i]->block_all_mem_buf_map_.end(); ++mb) { + if (mb->second->status_ == kMemBufIdle) { + idle_size += mb->second->size_; + } + } + buf << ", block[" << i << "] idle size " << idle_size; + } + // Dump all the memory buf info + MS_LOG(WARNING) << mem_type << "pool info: block size " << mem_mng->unit_size_ << ", block counts " + << mem_mng->mem_block_list_.size() << buf.str() << ". Total allocated mem " + << mem_mng->mps_.total_mem_size_ << ", peak used mem " << mem_mng->mps_.used_mem_peak_size_ + << ", in used mem " << mem_mng->mps_.total_used_mem_size_ << ", total idle mem " + << mem_mng->mps_.total_mem_size_ - mem_mng->mps_.total_used_mem_size_; + }; + fn(common_mem_, std::string(kCommonMem)); + fn(persistent_mem_, std::string(kPynativeParamMem)); } } // namespace device } // namespace mindspore diff --git a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_dynamic_allocator.h b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_dynamic_allocator.h index b1023f6195d..0dc387bfb6c 100644 --- a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_dynamic_allocator.h +++ b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_dynamic_allocator.h @@ -36,7 +36,7 @@ enum DynamicMemBufStatus : int { kMemBufIdle, kMemBufUsed }; static const size_t DYNAMIC_MEM_ALIGN_SIZE = 512; // The minimum unit size (1G) of memory block used for dynamic extend. -static const size_t DYNAMIC_MEM_ALLOC_UNIT_SIZE = 1024 << 20; +static const size_t DYNAMIC_MEM_ALLOC_UNIT_SIZE = 1073741824; // The Comparator of device address from small to large. struct DeviceAddrCmp { @@ -77,16 +77,40 @@ class DynamicMemBlock { }; using DynamicMemBlockPtr = std::shared_ptr<DynamicMemBlock>; +struct DeviceState { + // Memory allocated from device + size_t total_mem_size_{0}; + // Memory in use + size_t total_used_mem_size_{0}; + // Maximum peak memory usage + size_t used_mem_peak_size_{0}; +}; + +struct MemStatusManager { + size_t unit_size_{DYNAMIC_MEM_ALLOC_UNIT_SIZE}; + // Mempool state + DeviceState mps_; + std::vector<DynamicMemBlockPtr> mem_block_list_; + // The map of all idle memory buf by size. + SizeMapMemBuf idle_mem_buf_map_; + void clear() { + mem_block_list_.clear(); + idle_mem_buf_map_.clear(); + } +}; +using MemStatusManagerPtr = std::shared_ptr<MemStatusManager>; + // The main class of dynamic memory pool. 
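// [Editor's sketch, simplified types; not from the patch] The reworked DumpDynamicMemPoolInfo
// above prints one consolidated warning per pool: it walks every block's buffer map, sums the
// idle bytes, and appends a "block[i] idle size" entry to an ostringstream before logging.
// DemoMemBuf, DemoBlock and SummarizeIdlePerBlock are invented names for this example.
#include <cstddef>
#include <map>
#include <sstream>
#include <string>
#include <vector>

enum DemoStatus { kDemoIdle, kDemoUsed };
struct DemoMemBuf {
  DemoStatus status;
  std::size_t size;
};
using DemoBlock = std::map<const void *, DemoMemBuf>;

std::string SummarizeIdlePerBlock(const std::vector<DemoBlock> &blocks) {
  std::ostringstream out;
  for (std::size_t i = 0; i < blocks.size(); ++i) {
    std::size_t idle = 0;
    for (const auto &kv : blocks[i]) {
      if (kv.second.status == kDemoIdle) {
        idle += kv.second.size;  // accumulate idle bytes inside this block
      }
    }
    out << ", block[" << i << "] idle size " << idle;
  }
  return out.str();
}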
class DynamicMemPoolBestFit { public: - DynamicMemPoolBestFit() = default; + DynamicMemPoolBestFit() + : persistent_mem_(std::make_shared<MemStatusManager>()), common_mem_(std::make_shared<MemStatusManager>()) {} virtual ~DynamicMemPoolBestFit(); // The main program entry of memory alloc. - DeviceMemPtr AllocTensorMem(size_t size); + DeviceMemPtr AllocTensorMem(size_t size, bool from_persistent_mem = false); // The main program entry of continuous memory alloc. - std::vector<DeviceMemPtr> AllocContinuousTensorMem(size_t total_size, std::vector<size_t> size_list); + std::vector<DeviceMemPtr> AllocContinuousTensorMem(size_t total_size, const std::vector<size_t> &size_list); // The main program entry of memory free. void FreeTensorMem(const DeviceMemPtr &device_addr); @@ -94,21 +118,22 @@ class DynamicMemPoolBestFit { void ReleaseDeviceRes(); // Display the information of memory block and memory buf. void DumpDynamicMemPoolInfo(); - // Get the map of global idle mem buf and size. - SizeMapMemBuf global_idle_mem_buf_map() { - std::lock_guard<std::mutex> locker(mutex_); - return global_idle_mem_buf_map_; - } // Get the minimum memory unit size using for dynamic extend. - size_t mem_alloc_unit_size() const { return mem_alloc_unit_size_; } + size_t MemAllocUnitSize(bool from_persistent_mem = false) const; // Set the minimum memory unit size using for dynamic extend. - void set_mem_alloc_unit_size(const size_t &size) { mem_alloc_unit_size_ = size; } - - // Get the related memory statistics information. - size_t total_mem_statistics() const { return total_mem_statistics_; } - size_t used_mem_statistics() const { return total_used_mem_statistics_; } - size_t used_mem_peak_statistics() const { return used_mem_peak_statistics_; } + void SetMemAllocUintSize(size_t size); + // Set the memory pool block size used in PyNative mode + void SetMempoolBlockSize(size_t device_mem_size); + size_t TotalMemStatistics() const { + return common_mem_->mps_.total_mem_size_ + persistent_mem_->mps_.total_mem_size_; + } + size_t TotalUsedMemStatistics() const { + return common_mem_->mps_.total_used_mem_size_ + persistent_mem_->mps_.total_used_mem_size_; + } + size_t UsedMemPeakStatistics() const { + return common_mem_->mps_.used_mem_peak_size_ + persistent_mem_->mps_.used_mem_peak_size_; + } // The related interface of device memory real operation, needs override by device type. virtual size_t AllocDeviceMem(size_t size, DeviceMemPtr *addr) = 0; @@ -116,43 +141,35 @@ class DynamicMemPoolBestFit { virtual size_t free_mem_size() = 0; protected: + MemStatusManagerPtr &common_mem() { return common_mem_; } + MemStatusManagerPtr &persistent_mem() { return persistent_mem_; } // The real size by memory alloc aligned. virtual size_t AlignMemorySize(size_t size) const; // Calculate memory block required alloc size when adding the memory block. - virtual size_t CalMemBlockAllocSize(size_t size); + virtual size_t CalMemBlockAllocSize(size_t size, bool from_persistent_mem); private: // Find the idle memory buf by aligned size when memory alloc. - DeviceMemPtr FindIdleMemBuf(size_t size); + DeviceMemPtr FindIdleMemBuf(size_t size, bool from_persistent_mem); // Add the memory block and memory buf when memory alloc not find the idle memory buf. - DeviceMemPtr AddMemBlockAndMemBuf(size_t size); - // Judge whether need divide the memory buf by alloc size and memory buf size. - bool IsDivide(size_t tensor_size, size_t mem_buf_size) const; - // Divide the memory buf by alloc size. 
- void DivideMemBuf(size_t size, const DynamicMemBufPtr &mem_buf); + DeviceMemPtr AddMemBlockAndMemBuf(size_t size, bool from_persistent_mem); + // Judge whether need split the memory buf by alloc size and memory buf size. + bool IsSplit(size_t tensor_size, size_t mem_buf_size) const; + // Split the memory buf by alloc size. + void SplitMemBuf(size_t size, const DynamicMemBufPtr &mem_buf, const MemStatusManagerPtr &mem_mng); // Find the memory block by device address. - DynamicMemBlockPtr FindMemBlock(const DeviceMemPtr &device_addr); + DynamicMemBlockPtr FindMemBlock(const DeviceMemPtr &device_addr, const MemStatusManagerPtr &mem_mgr); // The Comparator of memory block by device address, because memory blocks are arranged in order by device address. static bool CmpMemBlock(const DeviceMemPtr &device_addr, const DynamicMemBlockPtr &mem_block); // Combine the memory buf when memory free, to avoid the memory fragmentation. - void CombineMemBuf(const DynamicMemBlockPtr &mem_block, const DeviceMemPtr &device_addr); + void CombineMemBuf(const DynamicMemBlockPtr &mem_block, const DeviceMemPtr &device_addr, + const MemStatusManagerPtr &mem_mng); // Erase the idle memory buf by size and device address when idle memory buf is combined. - void EraseIdleMemBuf(size_t size, const DeviceMemPtr &device_addr); - - // The global memory block list which is arranged in order by base device address of memory block. - std::vector global_mem_block_list_; - // The map of all idle memory buf by size. - SizeMapMemBuf global_idle_mem_buf_map_; - - // The related memory statistics information. - size_t total_mem_statistics_{0}; - size_t total_used_mem_statistics_{0}; - size_t used_mem_peak_statistics_{0}; - - // The minimum memory unit size. - size_t mem_alloc_unit_size_{DYNAMIC_MEM_ALLOC_UNIT_SIZE}; + void EraseIdleMemBuf(size_t size, const DeviceMemPtr &device_addr, const MemStatusManagerPtr &mem_mng); + MemStatusManagerPtr persistent_mem_{nullptr}; + MemStatusManagerPtr common_mem_{nullptr}; // Support multi-thread. 
std::mutex mutex_; }; diff --git a/mindspore/ccsrc/backend/session/ascend_session.cc b/mindspore/ccsrc/backend/session/ascend_session.cc index 3a0b52f8129..0f51636f116 100644 --- a/mindspore/ccsrc/backend/session/ascend_session.cc +++ b/mindspore/ccsrc/backend/session/ascend_session.cc @@ -864,7 +864,7 @@ void AscendSession::RunOpImplOrigin(const GraphInfo &graph_info, OpRunInfo *op_r // malloc mem RunOpRemoveNopNode(graph); - RunOpMemoryAlloc(*input_tensors, graph.get()); + RunOpMemoryAlloc(*input_tensors, graph.get(), op_run_info->is_gradient_out); RunOpGenKernelEvent(graph.get()); AnfAlgo::CacheAddrForGraph(graph); // Build dynamic kernel @@ -998,7 +998,7 @@ void AscendSession::BuildOpsInGraph(const GraphId &graph_id, const std::mapgraph_id() << ", Memory Statistics:" << device::ascend::AscendMemAdapter::GetInstance().DevMemStatistics(); MS_LOG(INFO) << "The dynamic memory pool total size is " - << device::ascend::AscendMemoryPool::GetInstance().total_mem_statistics() / kMBToByte + << device::ascend::AscendMemoryPool::GetInstance().TotalMemStatistics() / kMBToByte << "M, total used size is " - << device::ascend::AscendMemoryPool::GetInstance().used_mem_statistics() / kMBToByte + << device::ascend::AscendMemoryPool::GetInstance().TotalUsedMemStatistics() / kMBToByte << "M, used peak size is " - << device::ascend::AscendMemoryPool::GetInstance().used_mem_peak_statistics() / kMBToByte << "M."; + << device::ascend::AscendMemoryPool::GetInstance().UsedMemPeakStatistics() / kMBToByte << "M."; } -void AscendSession::RunOpMemoryAlloc(const std::vector &input_tensors, - KernelGraph *kernel_graph) const { +void AscendSession::RunOpMemoryAlloc(const std::vector &input_tensors, KernelGraph *kernel_graph, + bool is_gradient_out) const { MS_EXCEPTION_IF_NULL(kernel_graph); auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_); MS_EXCEPTION_IF_NULL(runtime_instance); - runtime_instance->RunOpAssignMemory(input_tensors, *kernel_graph); + runtime_instance->RunOpAssignMemory(input_tensors, *kernel_graph, is_gradient_out); } void AscendSession::RunOpMemoryAllocNew(const std::vector &input_tensors, @@ -1338,7 +1338,7 @@ void AscendSession::RunOpMemoryAllocNew(const std::vector &in const KernelGraph &kernel_graph) const { auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_); MS_EXCEPTION_IF_NULL(runtime_instance); - runtime_instance->RunOpAssignMemory(input_tensors, kernel_graph, tensor_to_node); + runtime_instance->RunOpAssignMemory(input_tensors, kernel_graph, false, tensor_to_node); } void AscendSession::RunOpGenKernelEvent(const KernelGraph *graph) const { diff --git a/mindspore/ccsrc/backend/session/ascend_session.h b/mindspore/ccsrc/backend/session/ascend_session.h index 040acf413db..1b8b5ba1abb 100644 --- a/mindspore/ccsrc/backend/session/ascend_session.h +++ b/mindspore/ccsrc/backend/session/ascend_session.h @@ -103,7 +103,8 @@ class AscendSession : public SessionBasic { static void BuildKernel(const std::vector &kernels); void BuildDynamicKernel(const std::shared_ptr &kernel_graph) const; void MemoryAlloc(KernelGraph *kernel_graph) const; - void RunOpMemoryAlloc(const std::vector &input_tensors, KernelGraph *kernel_graph) const; + void RunOpMemoryAlloc(const std::vector &input_tensors, KernelGraph *kernel_graph, + bool is_gradient_out) const; void RunOpMemoryAllocNew(const std::vector &input_tensors, const std::map &tensor_to_node, const KernelGraph &kernel_graph) const; diff --git 
a/mindspore/ccsrc/backend/session/gpu_session.cc b/mindspore/ccsrc/backend/session/gpu_session.cc index 12337297a8c..9fcd53bdef1 100644 --- a/mindspore/ccsrc/backend/session/gpu_session.cc +++ b/mindspore/ccsrc/backend/session/gpu_session.cc @@ -255,11 +255,11 @@ void GPUSession::AllocateMemory(const KernelGraph *kernel_graph) const { } void GPUSession::RunOpAllocateMemory(const std::vector &input_tensors, - const KernelGraph *kernel_graph) const { + const KernelGraph *kernel_graph, bool is_gradient_out) const { MS_EXCEPTION_IF_NULL(kernel_graph); auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_); MS_EXCEPTION_IF_NULL(runtime_instance); - runtime_instance->RunOpAssignMemory(input_tensors, *kernel_graph); + runtime_instance->RunOpAssignMemory(input_tensors, *kernel_graph, is_gradient_out); } void GPUSession::RunOpGenKernelEvent(const KernelGraph *graph) const { @@ -701,7 +701,7 @@ void GPUSession::RunOpImpl(const GraphInfo &graph_info, OpRunInfo *op_run_info, // run op MS_EXCEPTION_IF_NULL(kernel_graph); RunOpRemoveNopNode(kernel_graph); - RunOpAllocateMemory(*input_tensors, kernel_graph.get()); + RunOpAllocateMemory(*input_tensors, kernel_graph.get(), op_run_info->is_gradient_out); RunOpGenKernelEvent(kernel_graph.get()); // Execute the computation LoadInputData(kernel_graph, *input_tensors); diff --git a/mindspore/ccsrc/backend/session/gpu_session.h b/mindspore/ccsrc/backend/session/gpu_session.h index 44997d3ccc5..4b722fc8444 100644 --- a/mindspore/ccsrc/backend/session/gpu_session.h +++ b/mindspore/ccsrc/backend/session/gpu_session.h @@ -82,7 +82,8 @@ class GPUSession : public SessionBasic { void AllocateMemory(const KernelGraph *kernel_graph) const; - void RunOpAllocateMemory(const std::vector &input_tensors, const KernelGraph *kernel_graph) const; + void RunOpAllocateMemory(const std::vector &input_tensors, const KernelGraph *kernel_graph, + bool is_gradient_out) const; void RunOpClearMemory(const KernelGraph *kernel_graph) const; diff --git a/mindspore/ccsrc/backend/session/session_basic.cc b/mindspore/ccsrc/backend/session/session_basic.cc index 87fc92e6361..615a49c8b11 100644 --- a/mindspore/ccsrc/backend/session/session_basic.cc +++ b/mindspore/ccsrc/backend/session/session_basic.cc @@ -54,6 +54,7 @@ #endif #include "backend/session/session_factory.h" #include "backend/session/pynative_task_manager.h" +#include "pipeline/pynative/pynative_execute.h" namespace mindspore { namespace session { @@ -1220,7 +1221,8 @@ GraphInfo SessionBasic::GetSingleOpGraphInfo(const CNodePtr &kernel, } OpRunInfo SessionBasic::GetSingleOpRunInfo(const CNodePtr &cnode, const GraphInfo &graph_info, - const InputTensorInfo &tensor_info) { + const InputTensorInfo &tensor_info, + GraphOutputInfo *const graph_output_info) { MS_EXCEPTION_IF_NULL(cnode); auto primitive = AnfAlgo::GetCNodePrimitive(cnode); const auto &abstract = cnode->abstract(); @@ -1230,7 +1232,14 @@ OpRunInfo SessionBasic::GetSingleOpRunInfo(const CNodePtr &cnode, const GraphInf const auto &shape = abstract->BuildShape(); MS_EXCEPTION_IF_NULL(shape); - OpRunInfo op_run_info = {.op_name = primitive->name(), + bool is_gradient_out = + graph_output_info != nullptr && + std::any_of(graph_output_info->output_indexes.begin(), graph_output_info->output_indexes.end(), + [cnode](const std::pair>> &output_index) { + return output_index.first.first == cnode; + }); + OpRunInfo op_run_info = {.is_gradient_out = is_gradient_out, + .op_name = primitive->name(), .primitive = primitive.get(), 
.abstract = abstract, .is_dynamic_shape = shape->IsDynamic(), @@ -1304,16 +1313,63 @@ void SessionBasic::GetRefCount(const KernelGraph *graph, std::map *forward_output_refcount) { + if (!pynative::PynativeExecutor::GetInstance()->grad_flag()) { + return; + } + const auto &forward_value_nodes_id = pynative::PynativeExecutor::GetInstance()->grad_executor()->forward_outputs_id(); + for (const auto &kernel : graph->execution_order()) { + for (size_t i = 1; i < kernel->inputs().size(); i += 1) { + const auto &input = kernel->input(i); + if (!input->isa()) { + continue; + } + auto value_node = input->cast(); + auto value = GetValueNode(value_node); + MS_EXCEPTION_IF_NULL(value); + if (value->isa()) { + auto tensor = value->cast(); + MS_EXCEPTION_IF_NULL(tensor); + if (forward_value_nodes_id.find(tensor->id()) != forward_value_nodes_id.end()) { + (*forward_output_refcount)[input] += 1; + } + } + } + } +} + void SessionBasic::HandleOpInputs(const std::set &input_kernel, std::map *ref_count, + std::map *forward_output_refcount, std::map *op_output_map) { MS_EXCEPTION_IF_NULL(ref_count); + MS_EXCEPTION_IF_NULL(forward_output_refcount); MS_EXCEPTION_IF_NULL(op_output_map); for (auto &kernel_with_index : input_kernel) { - MS_EXCEPTION_IF_NULL(kernel_with_index.first); if (!kernel_with_index.first->isa()) { continue; } + + // Release forward output + auto cnode = kernel_with_index.first->cast(); + for (size_t i = 1; i < cnode->inputs().size(); i += 1) { + const auto &input = cnode->input(i); + auto it = forward_output_refcount->find(input); + if (it != forward_output_refcount->end()) { + if (--(it->second) == 0) { + auto value_node = input->cast(); + auto value = GetValueNode(value_node); + MS_EXCEPTION_IF_NULL(value); + auto tensor = value->cast(); + MS_EXCEPTION_IF_NULL(tensor); + tensor->set_device_address(nullptr); + forward_output_refcount->erase(it); + } + } + } + + // Release previous output auto ref_iter = ref_count->find(kernel_with_index); if (ref_iter == ref_count->end()) { MS_LOG(EXCEPTION) << "Can not find input KernelWithIndex in cnode reference count map, input cnode = " @@ -2329,7 +2385,9 @@ void SessionBasic::RunOpsInGraphImpl(const GraphId &graph_id, const std::vector< graph_output_info.graph_outputs = outputs; CreateOutputPlaceholder(kernel_graph, inputs, graph_output_info.graph_outputs, &graph_output_info.output_indexes); std::map cnode_refcount; + std::map forward_output_refcount; GetRefCount(kernel_graph.get(), &cnode_refcount); + GetForwardOutputRefCount(kernel_graph.get(), &forward_output_refcount); BuildOpsInGraph(graph_id, parameter_index, inputs, cnode_refcount); // Clear bucket resources every step @@ -2346,14 +2404,14 @@ void SessionBasic::RunOpsInGraphImpl(const GraphId &graph_id, const std::vector< VectorRef op_outputs; // Get OpRunInfo and GraphInfo GraphInfo graph_info = GetSingleOpGraphInfo(kernel, input_tensor_info.input_tensors); - OpRunInfo run_info = GetSingleOpRunInfo(kernel, graph_info, input_tensor_info); + OpRunInfo run_info = GetSingleOpRunInfo(kernel, graph_info, input_tensor_info, &graph_output_info); // Build and run current single op RunOpImplOrigin(graph_info, &run_info, &input_tensor_info.input_tensors, &op_outputs, input_tensor_info.input_tensors_mask); graph_output_info.graph_output_tensors.clear(); // Handle inputs and outputs of current op - HandleOpInputs(input_tensor_info.input_kernel, &cnode_refcount, &op_output_map); + HandleOpInputs(input_tensor_info.input_kernel, &cnode_refcount, &forward_output_refcount, &op_output_map); 
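// [Editor's sketch, simplified types; not part of the patch] The forward-output release in
// HandleOpInputs above follows a plain reference-count pattern: every use of a forward output
// decrements its count, and when the count reaches zero the tensor's device address is detached
// so the memory pool can reuse it. DemoTensor and ReleaseIfLastUse are invented names.
#include <cstddef>
#include <map>
#include <memory>

struct DemoTensor {
  std::shared_ptr<void> device_address;
};

void ReleaseIfLastUse(std::map<DemoTensor *, std::size_t> *refcount, DemoTensor *tensor) {
  auto it = refcount->find(tensor);
  if (it == refcount->end()) {
    return;  // not a tracked forward output, nothing to release
  }
  if (--(it->second) == 0) {
    tensor->device_address = nullptr;  // drop the device memory so the pool can reuse it
    refcount->erase(it);
  }
}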
HandleOpOutputs(kernel, op_outputs, cnode_refcount, &op_output_map, &graph_output_info); // Save grad node to Bucket if (kernel_graph->is_bprop()) { diff --git a/mindspore/ccsrc/backend/session/session_basic.h b/mindspore/ccsrc/backend/session/session_basic.h index 59cee05b729..ecd2279f598 100644 --- a/mindspore/ccsrc/backend/session/session_basic.h +++ b/mindspore/ccsrc/backend/session/session_basic.h @@ -57,6 +57,7 @@ using AnyList = std::vector; using AnyListPtr = std::shared_ptr; struct OpRunInfo { + bool is_gradient_out = false; std::string op_name; Primitive *primitive; AbstractBasePtr abstract; @@ -193,7 +194,9 @@ class SessionBasic : public std::enable_shared_from_this { VectorRef *const outputs, std::map>> *output_indexes); void GetRefCount(const KernelGraph *graph, std::map *ref_count); + void GetForwardOutputRefCount(const KernelGraph *graph, std::map *forward_output_refcount); void HandleOpInputs(const std::set &input_kernel, std::map *ref_count, + std::map *forward_output_refcount, std::map *op_output_map); void HandleOpOutputs(const AnfNodePtr &kernel, const VectorRef &op_outputs, @@ -280,7 +283,8 @@ class SessionBasic : public std::enable_shared_from_this { CNodePtr ConstructOutput(const AnfNodePtrList &outputs, const std::shared_ptr &graph); // Generate graph info for a single op graph GraphInfo GetSingleOpGraphInfo(const CNodePtr &kernel, const std::vector &input_tensors); - OpRunInfo GetSingleOpRunInfo(const CNodePtr &cnode, const GraphInfo &graph_info, const InputTensorInfo &tensor_info); + OpRunInfo GetSingleOpRunInfo(const CNodePtr &cnode, const GraphInfo &graph_info, const InputTensorInfo &tensor_info, + GraphOutputInfo *const graph_output_info); tensor::TensorPtr GetValueNodeOutputTensor(const AnfNodePtr &node, size_t output_index); tensor::TensorPtr GetParameterOutputTensor(const AnfNodePtr &node, const std::map ¶meter_index, diff --git a/mindspore/ccsrc/pipeline/pynative/base.h b/mindspore/ccsrc/pipeline/pynative/base.h index 8f9b5e6ff1b..63ff5648714 100644 --- a/mindspore/ccsrc/pipeline/pynative/base.h +++ b/mindspore/ccsrc/pipeline/pynative/base.h @@ -50,6 +50,7 @@ enum PynativeStatusCode { enum RunOpArgsEnum { PY_PRIM = 0, PY_NAME, PY_INPUTS, PY_ARGS_NUM }; struct OpExecInfo { + bool is_nop_prim = false; bool is_dynamic_shape = false; bool is_mixed_precision_cast = false; size_t next_input_index = 0; diff --git a/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc b/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc index fce6b0ec97b..1f4c5030ba2 100644 --- a/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc +++ b/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc @@ -880,6 +880,37 @@ py::object GetDstType(const TypeId &type_id) { bool IsPyObjTypeInvalid(const py::object &obj) { return !py::isinstance(obj) && !py::isinstance(obj) && !py::isinstance(obj); } + +inline bool IsNopPrim(const std::string &op_name) { + static std::set nop_prim = {prim::kPrimReshape->name(), kExpandDimsOpName, prim::kPrimSqueeze->name(), + prim::kPrimFlatten->name(), kFlattenGradOpName, prim::kPrimReformat->name()}; + return nop_prim.find(op_name) != nop_prim.end(); +} + +// Shallow Copy Value and change shape +ValuePtr ShallowCopyValue(const OpExecInfoPtr &op_exec_info, const ValuePtr &value) { + MS_EXCEPTION_IF_NULL(op_exec_info); + MS_EXCEPTION_IF_NULL(value); + auto tensor_abs = op_exec_info->abstract; + if (tensor_abs->isa()) { + tensor_abs = tensor_abs->cast()->CloneAsTensor(); + } + auto new_shape = tensor_abs->BuildShape()->cast(); + MS_EXCEPTION_IF_NULL(new_shape); 
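// [Editor's illustration, under the assumption that the "shallow copy" here means a new tensor
// header over the same payload] For nop primitives (Reshape/ExpandDims/Squeeze/Flatten/
// FlattenGrad/Reformat) the output only needs the shape inferred for the op, not a device-side
// copy, which is the idea behind ShallowCopyValue above. MiniTensor and ShallowReshape are
// invented names for this sketch.
#include <cstdint>
#include <memory>
#include <utility>
#include <vector>

struct MiniTensor {
  std::shared_ptr<void> data;       // shared payload, no device-side copy
  std::vector<std::int64_t> shape;  // the only thing a nop op changes
};

MiniTensor ShallowReshape(const MiniTensor &src, std::vector<std::int64_t> new_shape) {
  return MiniTensor{src.data, std::move(new_shape)};  // same buffer, new view
}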
+ if (value->isa()) { + auto tensor_value = value->cast(); + return std::make_shared(tensor_value->data_type(), new_shape->shape(), + tensor_value->data_c(), tensor_value->Size()); + } else if (value->isa()) { + std::vector values; + auto value_tuple = value->cast(); + (void)std::transform(value_tuple->value().begin(), value_tuple->value().end(), std::back_inserter(values), + [op_exec_info](const ValuePtr &elem) { return ShallowCopyValue(op_exec_info, elem); }); + return std::make_shared(values); + } else { + return value; + } +} } // namespace py::object RealRunOp(const py::args &args) { @@ -957,6 +988,7 @@ void TopCellInfo::Clear() { k_pynative_cell_ptr_ = nullptr; graph_info_map_.clear(); sub_cell_list_.clear(); + outputs_id_.clear(); op_info_with_tensor_id_.clear(); tensor_id_with_tensor_object_.clear(); op_info_with_ms_func_forward_tensors_.clear(); @@ -992,6 +1024,7 @@ OpExecInfoPtr ForwardExecutor::GenerateOpExecInfo(const py::args &args) { const auto &op_exec_info = std::make_shared(); const auto &op_name = py::cast(args[PY_NAME]); op_exec_info->op_name = op_name; + op_exec_info->is_nop_prim = false; // IsNopPrim(op_exec_info->op_name); const auto &adapter = py::cast(args[PY_PRIM]); MS_EXCEPTION_IF_NULL(adapter); @@ -1013,7 +1046,7 @@ OpExecInfoPtr ForwardExecutor::GenerateOpExecInfo(const py::args &args) { void ForwardExecutor::SetCastForInputs(const OpExecInfoPtr &op_exec_info) { MS_EXCEPTION_IF_NULL(op_exec_info); // No need cast self - if (op_exec_info->op_name == prim::kPrimCast->name()) { + if (op_exec_info->op_name == prim::kPrimCast->name() || op_exec_info->is_nop_prim) { return; } @@ -1167,6 +1200,21 @@ void ForwardExecutor::GetOpOutputAbstract(const OpExecInfoPtr &op_exec_info, } } +void ForwardExecutor::DoNopOutput(const OpExecInfoPtr &op_exec_info, ValuePtr *out_real_value) { + MS_EXCEPTION_IF_NULL(op_exec_info); + // Get First input + if (op_exec_info->op_inputs.empty()) { + MS_LOG(EXCEPTION) << "Inputs of " << op_exec_info->op_name << " is empty"; + } + const auto &obj = op_exec_info->op_inputs[0]; + if (!py::isinstance(obj)) { + MS_LOG(EXCEPTION) << "First input of " << op_exec_info->op_name << " must be a tensor"; + } + const auto &tensor_ptr = py::cast(obj); + *out_real_value = ShallowCopyValue(op_exec_info, tensor_ptr); + MS_LOG(DEBUG) << "New copy value is " << (*out_real_value)->ToString(); +} + void ForwardExecutor::GetOpOutput(const OpExecInfoPtr &op_exec_info, const abstract::AbstractBasePtrList &args_spec_list, const CNodePtr &cnode, bool prim_cache_hit, py::object *ret) { @@ -1194,25 +1242,32 @@ void ForwardExecutor::GetOpOutput(const OpExecInfoPtr &op_exec_info, out[args_spec_list].abs = op_exec_info->abstract; out[args_spec_list].attrs = prim->evaluate_added_attrs(); } - // run op with selected backend - auto result = RunOpWithInitBackendPolicy(op_exec_info); - py::object out_real = result; - if (result.size() == 1 && op_exec_info->abstract != nullptr && - !op_exec_info->abstract->isa()) { - out_real = result[0]; - } - // get output value + + // Run op with selected backend, nop is no need run backend ValuePtr out_real_value = nullptr; - if (grad()->grad_flag()) { - out_real_value = PyObjToValue(out_real); + if (op_exec_info->is_nop_prim) { + DoNopOutput(op_exec_info, &out_real_value); + *ret = BaseRefToPyData(out_real_value); + } else { + auto result = RunOpWithInitBackendPolicy(op_exec_info); + py::object out_real = result; + if (result.size() == 1 && op_exec_info->abstract != nullptr && + !op_exec_info->abstract->isa()) { + out_real = result[0]; + } + 
// get output value + if (grad()->grad_flag()) { + out_real_value = PyObjToValue(out_real); + } + *ret = out_real; } - // Save cnode info and build grad graph + if (grad()->need_construct_graph() && !grad()->in_cell_with_custom_bprop_()) { MS_EXCEPTION_IF_NULL(cnode); - const auto &obj_id = GetId(out_real); + const auto &obj_id = GetId(*ret); cnode->set_abstract(op_exec_info->abstract); node_abs_map_[obj_id] = op_exec_info->abstract; - grad()->SaveOutputNodeMap(obj_id, out_real, cnode); + grad()->SaveOutputNodeMap(obj_id, *ret, cnode); grad()->DoOpGrad(op_exec_info, cnode, out_real_value); } else { node_abs_map_.clear(); @@ -1220,7 +1275,6 @@ void ForwardExecutor::GetOpOutput(const OpExecInfoPtr &op_exec_info, // Record op info for judge whether the construct of cell has been changed grad()->RecordGradOpInfo(op_exec_info, out_real_value); grad()->UpdateForwardTensorInfoInBpropGraph(op_exec_info, out_real_value); - *ret = out_real; } py::object ForwardExecutor::DoAutoCast(const py::object &arg, const TypeId &type_id, const std::string &op_name, @@ -1701,7 +1755,7 @@ void GradExecutor::SaveOutputNodeMap(const std::string &obj_id, const py::object return; } MS_EXCEPTION_IF_NULL(cnode); - MS_LOG(DEBUG) << "Cnode is " << cnode->DebugString() << " id " << obj_id; + MS_LOG(DEBUG) << "Cnode is " << cnode->DebugString() << ", out value id " << obj_id; if (py::isinstance(out_real)) { auto value = py::cast(out_real); auto size = static_cast(value.size()); @@ -1925,6 +1979,7 @@ void GradExecutor::SaveForwardTensorInfoInBpropGraph(const pipeline::ResourcePtr continue; } tensor_id_with_tensor_object[tensor->id()].emplace_back(tensor); + top_cell()->outputs_id().insert(tensor->id()); MS_LOG(DEBUG) << "Save forward tensor " << tensor.get() << " id " << tensor->id() << " device address: " << tensor->device_address() << " shape and dtype " << tensor->GetShapeAndDataTypeInfo(); @@ -2090,7 +2145,8 @@ py::object ForwardExecutor::RunOpInMs(const OpExecInfoPtr &op_exec_info, Pynativ // get graph info for checking it whether existing in the cache GetSingleOpGraphInfo(op_exec_info, input_tensors, tensors_mask, &graph_info); #if defined(__APPLE__) - session::OpRunInfo op_run_info = {op_exec_info->op_name, + session::OpRunInfo op_run_info = {false, + op_exec_info->op_name, op_exec_info->py_primitive.get(), op_exec_info->abstract, op_exec_info->is_dynamic_shape, @@ -2102,7 +2158,8 @@ py::object ForwardExecutor::RunOpInMs(const OpExecInfoPtr &op_exec_info, Pynativ tensors_mask, input_tensors}; #else - session::OpRunInfo op_run_info = {op_exec_info->op_name, + session::OpRunInfo op_run_info = {false, + op_exec_info->op_name, op_exec_info->py_primitive.get(), op_exec_info->abstract, op_exec_info->is_dynamic_shape, diff --git a/mindspore/ccsrc/pipeline/pynative/pynative_execute.h b/mindspore/ccsrc/pipeline/pynative/pynative_execute.h index 1a050834b1a..5a43cf5fd60 100644 --- a/mindspore/ccsrc/pipeline/pynative/pynative_execute.h +++ b/mindspore/ccsrc/pipeline/pynative/pynative_execute.h @@ -100,6 +100,7 @@ class TopCellInfo { const std::string &grad_operation() const { return grad_operation_; } void set_grad_operation(const std::string &grad_operation) { grad_operation_ = grad_operation; } mindspore::HashSet &sub_cell_list() { return sub_cell_list_; } + std::set &outputs_id() { return outputs_id_; } bool IsSubCell(const std::string &cell_id) const; OrderedMap &graph_info_map() { return graph_info_map_; } OpInfoWithTensorId &op_info_with_tensor_id() { return op_info_with_tensor_id_; } @@ -139,6 +140,8 @@ class TopCellInfo 
{ std::string grad_operation_; OrderedMap graph_info_map_; mindspore::HashSet sub_cell_list_; + // Record op output tensor id + std::set outputs_id_; OpInfoWithTensorId op_info_with_tensor_id_; TensorIdWithTensorObject tensor_id_with_tensor_object_; OpInfoWithMsFuncForwardTensors op_info_with_ms_func_forward_tensors_; @@ -195,6 +198,7 @@ class GradExecutor { void set_grad_flag(bool flag) { grad_flag_ = flag; } void set_graph_phase(const std::string &graph_phase) { graph_phase_ = graph_phase; } bool in_cell_with_custom_bprop_() const { return custom_bprop_cell_count_ > 0; } + std::set &forward_outputs_id() const { return top_cell()->outputs_id(); } AnfNodePtr GetInput(const py::object &obj, bool op_mask); std::string GetCellId(const py::object &obj, const py::args &args); void RecordGradOpInfo(const OpExecInfoPtr &op_exec_info, const ValuePtr &op_out); @@ -347,6 +351,7 @@ class ForwardExecutor { bool *prim_cache_hit); void GetOpOutput(const OpExecInfoPtr &op_exec_info, const abstract::AbstractBasePtrList &args_spec_list, const CNodePtr &cnode, bool prim_cache_hit, py::object *ret); + void DoNopOutput(const OpExecInfoPtr &op_exec_info, ValuePtr *out_real_value); // Mix precision and Implicit transform void SetCastForInputs(const OpExecInfoPtr &op_exec_info); void SetTensorMixPrecisionCast(const OpExecInfoPtr &op_exec_info); diff --git a/mindspore/ccsrc/pybind_api/utils/ms_context_py.cc b/mindspore/ccsrc/pybind_api/utils/ms_context_py.cc index 66921ef65cc..abffad0f7a7 100644 --- a/mindspore/ccsrc/pybind_api/utils/ms_context_py.cc +++ b/mindspore/ccsrc/pybind_api/utils/ms_context_py.cc @@ -84,6 +84,7 @@ REGISTER_PYBIND_DEFINE(MsContextPy, ([](const py::module *m) { .value("save_graphs", MsCtxParam::MS_CTX_SAVE_GRAPHS_FLAG) .value("enable_parallel_split", MsCtxParam::MS_CTX_ENABLE_PARALLEL_SPLIT) .value("max_device_memory", MsCtxParam::MS_CTX_MAX_DEVICE_MEMORY) + .value("mempool_block_size", MsCtxParam::MS_CTX_MEMPOOL_BLOCK_SIZE) .value("mode", MsCtxParam::MS_CTX_EXECUTION_MODE) .value("device_target", MsCtxParam::MS_CTX_DEVICE_TARGET) .value("_graph_memory_max_size", MsCtxParam::MS_CTX_GRAPH_MEMORY_MAX_SIZE) diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_memory_adapter.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_memory_adapter.cc index 7f12e25f910..dd3853dcaa2 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_memory_adapter.cc +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_memory_adapter.cc @@ -77,6 +77,7 @@ bool AscendMemAdapter::Initialize() { static_mem_offset_ = ms_used_hbm_size_; cur_dynamic_mem_offset_ = 0; max_dynamic_mem_offset_ = 0; + AscendMemoryPool::GetInstance().SetMempoolBlockSize(ms_used_hbm_size_); MS_LOG(INFO) << "Ascend Memory Adapter initialize success, Memory Statistics:" << DevMemStatistics(); initialized_ = true; return true; @@ -109,7 +110,7 @@ bool AscendMemAdapter::DeInitialize() { return ret; } -uint8_t *AscendMemAdapter::MallocStaticDevMem(size_t size, std::string tag) { +uint8_t *AscendMemAdapter::MallocStaticDevMem(size_t size, const std::string &tag) { std::lock_guard locker(mutex_); auto new_static_offset = static_mem_offset_ - size; if (new_static_offset < max_dynamic_mem_offset_) { @@ -122,11 +123,10 @@ uint8_t *AscendMemAdapter::MallocStaticDevMem(size_t size, std::string tag) { auto memory_block_ptr = device_mem_base_addr_ + new_static_offset; static_mem_offset_ = new_static_offset; static_memory_block_list_.push_back(std::make_shared(memory_block_ptr, size, tag)); - return memory_block_ptr; } -uint8_t 
*AscendMemAdapter::MallocDynamicDevMem(size_t size, std::string tag) { +uint8_t *AscendMemAdapter::MallocDynamicDevMem(size_t size, const std::string &tag) { std::lock_guard locker(mutex_); auto new_dynamic_offset = cur_dynamic_mem_offset_ + size; if (new_dynamic_offset > static_mem_offset_) { diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_memory_adapter.h b/mindspore/ccsrc/runtime/device/ascend/ascend_memory_adapter.h index e864bc802f6..b9e09087053 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_memory_adapter.h +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_memory_adapter.h @@ -22,6 +22,7 @@ #include #include #include "utils/ms_context.h" +#include "runtime/device/ascend/ascend_memory_pool.h" namespace mindspore { namespace device { @@ -37,8 +38,8 @@ class AscendMemAdapter { bool Initialize(); bool DeInitialize(); - uint8_t *MallocStaticDevMem(size_t size, std::string tag = ""); - uint8_t *MallocDynamicDevMem(size_t size, std::string tag = ""); + uint8_t *MallocStaticDevMem(size_t size, const std::string &tag = ""); + uint8_t *MallocDynamicDevMem(size_t size, const std::string &tag = ""); bool FreeStaticDevMem(void *devPtr) { return true; } void ResetDynamicMemory(); @@ -76,12 +77,13 @@ class AscendMemAdapter { uint8_t *device_mem_base_addr_{nullptr}; uint64_t ms_used_hbm_size_{0}; - // dynamic memory info + // dynamic memory info, from a low address to a high address uint64_t cur_dynamic_mem_offset_{0}; + // Maximum dynamic memory have already allocated, dynamically updated uint64_t max_dynamic_mem_offset_{0}; std::vector> dynamic_memory_block_list_; - // static memory info + // static memory info, from a high address to a low address uint64_t static_mem_offset_{0}; std::vector> static_memory_block_list_; }; diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_memory_manager.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_memory_manager.cc index 5892142453e..9b6afd64cf7 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_memory_manager.cc +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_memory_manager.cc @@ -15,7 +15,6 @@ */ #include #include "runtime/device/ascend/ascend_memory_manager.h" -#include "runtime/device/ascend/ascend_memory_pool.h" #include "runtime/device/ascend/ascend_memory_adapter.h" #include "utils/ms_context.h" #include "runtime/mem.h" @@ -49,9 +48,9 @@ void *AscendMemoryManager::MallocDevice(size_t size) { return AscendMemoryPool::GetInstance().AllocTensorMem(align_size); } -void *AscendMemoryManager::MallocMemFromMemPool(size_t size) { +void *AscendMemoryManager::MallocMemFromMemPool(size_t size, bool from_persistent_mem) { auto align_size = GetCommonAlignSize(size); - const auto device_addr = AscendMemoryPool::GetInstance().AllocTensorMem(align_size); + const auto device_addr = AscendMemoryPool::GetInstance().AllocTensorMem(align_size, from_persistent_mem); if (device_addr == nullptr) { MS_LOG(EXCEPTION) << "Fail to alloc memory, size: " << align_size << ", memory statistics:" << AscendMemAdapter::GetInstance().DevMemStatistics(); @@ -132,8 +131,8 @@ uint8_t *AscendMemoryManager::MallocCommunicationMemFromMemPool(size_t size) { size_t AscendMemoryManager::GetAvailableMemSize() { auto available_mem_size = AscendMemoryPool::GetInstance().free_mem_size() + - AscendMemoryPool::GetInstance().total_mem_statistics() - - AscendMemoryPool::GetInstance().used_mem_statistics(); + AscendMemoryPool::GetInstance().TotalMemStatistics() - + AscendMemoryPool::GetInstance().TotalUsedMemStatistics(); return available_mem_size; } diff --git 
a/mindspore/ccsrc/runtime/device/ascend/ascend_memory_manager.h b/mindspore/ccsrc/runtime/device/ascend/ascend_memory_manager.h index 7a0ec3ad2d1..314b62639af 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_memory_manager.h +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_memory_manager.h @@ -32,7 +32,7 @@ class AscendMemoryManager : public MemoryManager { void FreeDeviceMemory() override; void ResetDynamicMemory() override; void ClearGlobalIdleMem() override; - void *MallocMemFromMemPool(size_t size) override; + void *MallocMemFromMemPool(size_t size, bool from_persistent_mem) override; void *MallocDevice(size_t size) override; void FreeMemFromMemPool(void *device_ptr) override; uint64_t GetMsMaxMemSize(); diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_memory_pool.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_memory_pool.cc index 5f0d7537ce4..511d6d8abaa 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_memory_pool.cc +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_memory_pool.cc @@ -24,37 +24,37 @@ namespace mindspore { namespace device { namespace ascend { -// The minimum unit size (256MB) of memory block used for dynamic extend. -static const size_t ASCEND_DYNAMIC_MEM_ALLOC_UNIT_SIZE = 256 << 20; // The minimum unit size (8MB) of memory block used for dynamic extend in graph mode. static const size_t ASCEND_DYNAMIC_MEM_ALLOC_UNIT_SIZE_FOR_GRAPH = 8 << 20; -size_t AscendMemoryPool::CalMemBlockAllocSize(size_t size) { +size_t AscendMemoryPool::CalMemBlockAllocSize(size_t size, bool from_persistent_mem) { auto device_free_mem_size = free_mem_size(); if (device_free_mem_size < size) { MS_LOG(WARNING) << "The dynamic memory pool total size is " - << device::ascend::AscendMemoryPool::GetInstance().total_mem_statistics() / kMBToByte + << device::ascend::AscendMemoryPool::GetInstance().TotalMemStatistics() / kMBToByte << "M, total used size is " - << device::ascend::AscendMemoryPool::GetInstance().used_mem_statistics() / kMBToByte + << device::ascend::AscendMemoryPool::GetInstance().TotalUsedMemStatistics() / kMBToByte << "M, used peak size is " - << device::ascend::AscendMemoryPool::GetInstance().used_mem_peak_statistics() / kMBToByte << "M."; - MS_LOG(WARNING) << "Out of Memory. Request memory size: " << size + << device::ascend::AscendMemoryPool::GetInstance().UsedMemPeakStatistics() / kMBToByte << "M."; + MS_LOG(WARNING) << "Out of Memory. Request memory size: " << size << ", device free size " << device_free_mem_size << ", Memory Statistic:" << AscendMemAdapter::GetInstance().DevMemStatistics() << "Please try to reduce 'batch_size' or check whether exists extra large shape. 
More " "details can be found in MindSpore's FAQ with keyword 'Out of Memory'."; return 0; } - auto alloc_mem_size = ASCEND_DYNAMIC_MEM_ALLOC_UNIT_SIZE; + size_t alloc_mem_size = MemAllocUnitSize(from_persistent_mem); auto ms_context = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(ms_context); const bool pynative_mode = (ms_context->get_param(MS_CTX_EXECUTION_MODE) == kPynativeMode); if (pynative_mode) { // Growing at twice of alloc size + MS_LOG(DEBUG) << "Get unit block size " << alloc_mem_size; constexpr size_t kDouble = 2; while (alloc_mem_size < size) { alloc_mem_size = alloc_mem_size * kDouble; } } else { + // The graph mode controls itself independently alloc_mem_size = ASCEND_DYNAMIC_MEM_ALLOC_UNIT_SIZE_FOR_GRAPH; while (alloc_mem_size < size) { alloc_mem_size = alloc_mem_size + ASCEND_DYNAMIC_MEM_ALLOC_UNIT_SIZE_FOR_GRAPH; @@ -69,7 +69,6 @@ size_t AscendMemoryPool::AllocDeviceMem(size_t size, DeviceMemPtr *addr) { if (size == 0) { MS_LOG(EXCEPTION) << "Failed to alloc memory pool resource, the size is zero!"; } - *addr = AscendMemAdapter::GetInstance().MallocStaticDevMem(size); if (*addr == nullptr) { MS_LOG(EXCEPTION) << "Alloc device memory pool address is nullptr, failed to alloc memory pool resource!"; @@ -83,11 +82,17 @@ bool AscendMemoryPool::FreeDeviceMem(const DeviceMemPtr &addr) { } void AscendMemoryPool::ResetIdleMemBuf() { - auto idle_mem_buf_map = DynamicMemPoolBestFit::global_idle_mem_buf_map(); - for (auto &it : idle_mem_buf_map) { - MS_EXCEPTION_IF_NULL(it.second); - (void)rtMemset(it.second->device_addr_, it.first, 0, it.first); - } + auto fn = [this](const MemStatusManagerPtr &mem_mng) { + if (mem_mng->mem_block_list_.empty()) { + return; + } + for (auto &it : mem_mng->idle_mem_buf_map_) { + MS_EXCEPTION_IF_NULL(it.second); + (void)rtMemset(it.second->device_addr_, it.first, 0, it.first); + } + }; + fn(persistent_mem()); + fn(common_mem()); } size_t AscendMemoryPool::free_mem_size() { return AscendMemAdapter::GetInstance().FreeDevMemSize(); } diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_memory_pool.h b/mindspore/ccsrc/runtime/device/ascend/ascend_memory_pool.h index 04f704e8292..25358d54a7e 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_memory_pool.h +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_memory_pool.h @@ -42,7 +42,7 @@ class AscendMemoryPool : public DynamicMemPoolBestFit { protected: // Calculate memory block required alloc size when adding the memory block. 
- size_t CalMemBlockAllocSize(size_t size) override; + size_t CalMemBlockAllocSize(size_t size, bool from_persistent_mem) override; private: AscendMemoryPool() = default; diff --git a/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc index 8fbeac4c4c9..98569e72175 100644 --- a/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc @@ -112,6 +112,7 @@ void CPUKernelRuntime::AssignValueNodeAddress(session::KernelGraph *kernel_graph size_t tensor_size = std::accumulate(data_shape.begin(), data_shape.end(), type_size, std::multiplies()); DeviceAddressPtr address = nullptr; address = CreateDeviceAddress(nullptr, tensor_size, kOpFormat_DEFAULT, output_type_id); + address->set_from_persistent_mem(tensor->is_parameter()); MS_EXCEPTION_IF_NULL(address); if (tensor->data_type() == output_type_id) { address->ptr_ = tensor->data_c(); @@ -146,6 +147,7 @@ void CPUKernelRuntime::AssignInputNodeAddress(const session::KernelGraph *kernel : std::accumulate(fmt_shape.begin(), fmt_shape.end(), type_size, std::multiplies()); auto format = AnfAlgo::GetOutputFormat(item, index); auto address = CreateDeviceAddress(nullptr, tensor_size, format, output_type_id); + address->set_from_persistent_mem(true); AnfAlgo::SetOutputAddr(address, index, item.get()); } } diff --git a/mindspore/ccsrc/runtime/device/cpu/cpu_memory_manager.h b/mindspore/ccsrc/runtime/device/cpu/cpu_memory_manager.h index e9959e9fd34..fcca577e307 100644 --- a/mindspore/ccsrc/runtime/device/cpu/cpu_memory_manager.h +++ b/mindspore/ccsrc/runtime/device/cpu/cpu_memory_manager.h @@ -46,7 +46,9 @@ class CPUMemoryManager : public MemoryManager { void IncreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs); void DecreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs); - void *MallocMemFromMemPool(size_t size) override { return CPUMemoryPool::GetInstance().AllocTensorMem(size); } + void *MallocMemFromMemPool(size_t size, bool from_persistent_mem) override { + return CPUMemoryPool::GetInstance().AllocTensorMem(size, from_persistent_mem); + } void FreeMemFromMemPool(void *device_ptr) override { CPUMemoryPool::GetInstance().FreeTensorMem(device_ptr); } std::vector MallocContinuousMemFromMemPool(size_t total_size, std::vector size_list) override { return CPUMemoryPool::GetInstance().AllocContinuousTensorMem(total_size, size_list); diff --git a/mindspore/ccsrc/runtime/device/device_address.h b/mindspore/ccsrc/runtime/device/device_address.h index 7bd6e1ae79b..b230159f270 100644 --- a/mindspore/ccsrc/runtime/device/device_address.h +++ b/mindspore/ccsrc/runtime/device/device_address.h @@ -101,6 +101,8 @@ class DeviceAddress : public mindspore::DeviceSync { bool is_ptr_persisted() const { return is_ptr_persisted_; } void set_is_ptr_persisted(bool is_ptr_persisted) { is_ptr_persisted_ = is_ptr_persisted; } void set_host_shape(const ShapeVector &shape) { host_shape_ = shape; } + bool from_persistent_mem() const { return from_persistent_mem_; } + void set_from_persistent_mem(bool from_persistent_mem) { from_persistent_mem_ = from_persistent_mem; } virtual void set_status(DeviceAddressStatus status) {} virtual DeviceAddressStatus status() const { return DeviceAddressStatus::kInDevice; } virtual DeviceAddressType DeviceType() const { return DeviceAddressType::kUnknown; } @@ -145,6 +147,7 @@ class DeviceAddress : public mindspore::DeviceSync { // The key of device context. 
std::string device_name_{""}; uint32_t device_id_{0}; + bool from_persistent_mem_{false}; friend class KernelRuntime; friend class MemoryManager; diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc index 835263c738b..40e11be5ac5 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc @@ -422,7 +422,7 @@ void GPUKernelRuntime::FetchMemUnitSize(const session::KernelGraph *graph) { current_sum_size = 0; } } - if (max_sum_size > GPUMemoryAllocator::GetInstance().mem_alloc_unit_size()) { + if (max_sum_size > GPUMemoryAllocator::GetInstance().MemAllocUnitSize()) { size_t unit_size = (max_sum_size / DYNAMIC_MEM_ALLOC_UNIT_SIZE + 1) * DYNAMIC_MEM_ALLOC_UNIT_SIZE; if (unit_size < DYNAMIC_MEM_ALLOC_UNIT_SIZE) { MS_LOG(WARNING) << "Current memory unit size [" << unit_size << "] is too small."; @@ -432,7 +432,7 @@ void GPUKernelRuntime::FetchMemUnitSize(const session::KernelGraph *graph) { constexpr float kValidMemoryRatio = 0.9; free_mem_size = kValidMemoryRatio * free_mem_size; unit_size = std::min(unit_size, free_mem_size); - GPUMemoryAllocator::GetInstance().set_mem_alloc_unit_size(unit_size); + GPUMemoryAllocator::GetInstance().SetMemAllocUintSize(unit_size); } } diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_memory_allocator.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_memory_allocator.cc index 2726dd64bb5..24762e51d6b 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_memory_allocator.cc +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_memory_allocator.cc @@ -25,6 +25,7 @@ namespace mindspore { namespace device { namespace gpu { const size_t kGBToByte = 1024 << 20; +constexpr float kReservedMemoryRatio = 0.0625; // 1/16 bool GPUMemoryAllocator::Init() { size_t total_size = CudaDriver::total_mem_size(); @@ -42,6 +43,13 @@ bool GPUMemoryAllocator::Init() { << total_size << ", current free memory size " << free_size << ", set max available memory size " << available_device_memory_ << "."; } + // In gpu mode, recommend 1/16 reserved for other cuda functions + if (available_device_memory_ > total_size) { + size_t recommend_mem_size_for_others = FloatToSize(total_size * kReservedMemoryRatio); + SetMempoolBlockSize(std::min(available_device_memory_, total_size - recommend_mem_size_for_others)); + } else { + SetMempoolBlockSize(std::min(available_device_memory_, total_size)); + } return true; } @@ -71,7 +79,7 @@ bool GPUMemoryAllocator::AllocBufferQueueMem(size_t size, DeviceMemPtr *addr) { auto alloc_size = AllocDeviceMem(size, addr); buffer_q_addr_ = *addr; // Buffer queue needs to ensure that the alloc_size and size is equal. - return (alloc_size == size) ? 
true : false; + return alloc_size == size; } size_t GPUMemoryAllocator::AllocDeviceMem(size_t size, DeviceMemPtr *addr) { @@ -90,8 +98,9 @@ size_t GPUMemoryAllocator::AllocDeviceMem(size_t size, DeviceMemPtr *addr) { } total_used_device_memory_ += alloc_size; available_device_memory_ -= alloc_size; - MS_LOG(INFO) << "Current free memory size[" << free_size - alloc_size << "], current alloc size[" << alloc_size - << "], total used size[" << total_used_device_memory_ << "]."; + MS_LOG(INFO) << "Cuda current free memory size[" << free_size << "], alloc size[" << alloc_size + << "], left free memory size[" << free_size - alloc_size << "]" + << ".Total used size[" << total_used_device_memory_ << "]."; return alloc_size; } diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_memory_manager.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_memory_manager.cc index aad89ec8e48..10afad6a1b2 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_memory_manager.cc +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_memory_manager.cc @@ -24,8 +24,8 @@ namespace mindspore { namespace device { namespace gpu { -void *GPUMemoryManager::MallocMemFromMemPool(size_t size) { - return GPUMemoryAllocator::GetInstance().AllocTensorMem(size); +void *GPUMemoryManager::MallocMemFromMemPool(size_t size, bool from_persistent_mem) { + return GPUMemoryAllocator::GetInstance().AllocTensorMem(size, from_persistent_mem); } void GPUMemoryManager::FreeMemFromMemPool(void *device_ptr) { @@ -39,7 +39,7 @@ std::vector GPUMemoryManager::MallocContinuousMemFromMemPool(size_t tota bool GPUMemoryManager::MallocContinuousMemFromMemPool(const DeviceAddressPtrList &addr_list, size_t total_size, std::vector size_list) { auto device_ptr_list = MallocContinuousMemFromMemPool(total_size, size_list); - if (device_ptr_list.size() == 0) { + if (device_ptr_list.empty()) { return false; } if (addr_list.size() != device_ptr_list.size()) { @@ -76,7 +76,7 @@ void GPUMemoryManager::MallocDeviceMemory() { if (ps::ps_cache_instance.initialized_ps_cache()) { return; } - auto device_addr = MallocMemFromMemPool(1); + auto device_addr = MallocMemFromMemPool(1, false); if (!device_addr) { MS_LOG(EXCEPTION) << "Dynamic memory pool init error."; } @@ -87,7 +87,7 @@ void GPUMemoryManager::FreeDeviceMemory() { GPUMemoryAllocator::GetInstance().Re uint8_t *GPUMemoryManager::MallocStaticMem(size_t size, bool, uint32_t) { auto context_ptr = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context_ptr); - auto device_ptr = MallocMemFromMemPool(size); + auto device_ptr = MallocMemFromMemPool(size, false); if (device_ptr == nullptr) { MS_LOG(EXCEPTION) << "Device memory isn't enough and alloc failed, alloc size:" << size; } diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_memory_manager.h b/mindspore/ccsrc/runtime/device/gpu/gpu_memory_manager.h index 6c1afd0d6f2..e3d3189a448 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_memory_manager.h +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_memory_manager.h @@ -29,7 +29,7 @@ class GPUMemoryManager : public MemoryManager { void MallocDeviceMemory() override; void FreeDeviceMemory() override; - void *MallocMemFromMemPool(size_t size) override; + void *MallocMemFromMemPool(size_t size, bool from_persistent_mem) override; void FreeMemFromMemPool(void *device_ptr) override; std::vector MallocContinuousMemFromMemPool(size_t total_size, std::vector size_list) override; bool MallocContinuousMemFromMemPool(const DeviceAddressPtrList &addr_list, size_t total_size, diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.cc 
b/mindspore/ccsrc/runtime/device/kernel_runtime.cc index 607fbcc78d5..99ef0777ebd 100644 --- a/mindspore/ccsrc/runtime/device/kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/kernel_runtime.cc @@ -258,6 +258,7 @@ void KernelRuntime::RunOpMallocPre(const session::KernelGraph &graph, auto output_format = op_runtime_info->output_format(index); auto device_address = CreateDeviceAddress(nullptr, output_tensor_size, output_format, output_type_id, {item, index}); + device_address->set_from_persistent_mem(current_tensor->is_parameter()); AnfAlgo::SetOutputAddr(device_address, index, item.get()); current_tensor->set_device_address(device_address); current_tensor->set_sync_status(kNeedSyncHostToDevice); @@ -287,6 +288,7 @@ void KernelRuntime::ResetNodeAddress(const session::KernelGraph &kernel_graph) { auto tensor_size = AnfAlgo::GetOutputTensorMemSize(input_node, index); auto device_address = CreateDeviceAddress(nullptr, tensor_size, AnfAlgo::GetOutputFormat(input_node, index), output_type_id, {input_node, index}); + device_address->set_from_persistent_mem(input_node->isa()); AnfAlgo::SetOutputAddr(device_address, index, input_node.get()); } @@ -306,7 +308,7 @@ void KernelRuntime::ResetNodeAddress(const session::KernelGraph &kernel_graph) { } void KernelRuntime::RunOpAssignMemory(const std::vector &input_tensors, - const session::KernelGraph &graph, + const session::KernelGraph &graph, bool is_gradient_out, const std::map &tensor_to_node) { MS_EXCEPTION_IF_NULL(mem_manager_); mem_manager_->ResetDynamicMemory(); @@ -319,7 +321,7 @@ void KernelRuntime::RunOpAssignMemory(const std::vector &inpu RunOpAssignInputMemory(input_tensors, graph); AssignStaticMemoryValueNode(graph); for (const auto &node : graph.execution_order()) { - RunOpAssignOutputMemory(node, tensor_to_node); + RunOpAssignOutputMemory(node, tensor_to_node, is_gradient_out); RunOpAssignWorkSpaceMemory(node); } UpdateRefNodeOutputMem(graph); @@ -398,6 +400,7 @@ void KernelRuntime::RunOpAssignInputMemory(const std::vector auto current_tensor = input_tensors[input_index]; MS_EXCEPTION_IF_NULL(current_tensor); auto output_address = std::dynamic_pointer_cast(current_tensor->device_address()); + // Device address has already been created if (output_address != nullptr && output_address->DeviceType() == GetTargetDeviceAddressType()) { if (output_address->ptr_ == nullptr) { if (!mem_manager_->MallocMemFromMemPool(output_address, output_address->size())) { @@ -413,10 +416,12 @@ void KernelRuntime::RunOpAssignInputMemory(const std::vector output_type_id = AnfAlgo::GetOutputInferDataType(item, index); } auto tensor_size = AnfAlgo::GetOutputTensorMemSize(item, index); + // Create a new device address auto device_address = CreateDeviceAddress(nullptr, tensor_size, AnfAlgo::GetOutputFormat(item, index), output_type_id, {item, index}); MS_EXCEPTION_IF_NULL(device_address); MS_EXCEPTION_IF_NULL(mem_manager_); + device_address->set_from_persistent_mem(true); auto ret = mem_manager_->MallocMemFromMemPool(device_address, tensor_size); if (!ret) { MS_LOG(EXCEPTION) << "Device memory isn't enough and alloc failed, alloc size:" << tensor_size; @@ -426,8 +431,9 @@ void KernelRuntime::RunOpAssignInputMemory(const std::vector } } -void KernelRuntime::RunOpAssignOutputMemory( - const AnfNodePtr &kernel, const std::map &tensor_to_node) { +void KernelRuntime::RunOpAssignOutputMemory(const AnfNodePtr &kernel, + const std::map &tensor_to_node, + bool is_gradient_out) { MS_EXCEPTION_IF_NULL(kernel); MS_EXCEPTION_IF_NULL(mem_manager_); auto kernel_mod = 
AnfAlgo::GetKernelMod(kernel); @@ -464,8 +470,11 @@ void KernelRuntime::RunOpAssignOutputMemory( std::string output_format = AnfAlgo::GetOutputFormat(kernel, i); auto output_type = AnfAlgo::GetOutputDeviceDataType(kernel, i); auto device_address = CreateDeviceAddress(nullptr, output_sizes[i], output_format, output_type, {kernel, i}); - device_address->set_host_shape(trans::GetRuntimePaddingShape(kernel, i)); MS_EXCEPTION_IF_NULL(device_address); + device_address->set_host_shape(trans::GetRuntimePaddingShape(kernel, i)); + if (is_gradient_out) { + device_address->set_from_persistent_mem(true); + } auto ret = mem_manager_->MallocMemFromMemPool(device_address, output_sizes[i]); if (!ret) { MS_LOG(EXCEPTION) << "Device memory isn't enough and alloc failed, alloc size:" << output_sizes[i]; @@ -917,6 +926,7 @@ void KernelRuntime::AssignValueNodeTensor(const ValueNodePtr &value_node, const auto output_format = AnfAlgo::GetOutputFormat(value_node, output_idx); DeviceAddressPtr address = CreateDeviceAddress(nullptr, node_size, output_format, output_type_id, {value_node, output_idx}); + address->set_from_persistent_mem(true); MS_EXCEPTION_IF_NULL(address); if (ms_context->get_param(MS_CTX_ENABLE_PYNATIVE_INFER) && !mem_manager_->MallocMemFromMemPool(address, node_size)) { @@ -979,6 +989,7 @@ void KernelRuntime::AssignStaticMemoryValueNode(const session::KernelGraph &grap ms_context->get_param(MS_CTX_EXECUTION_MODE) == kPynativeMode; auto address = CreateDeviceAddressForStringValue(node_value, use_mem_from_memory_pool, graph.graph_id()); MS_EXCEPTION_IF_NULL(address); + address->set_from_persistent_mem(true); AnfAlgo::SetOutputAddr(address, 0, value_node.get()); } } @@ -991,6 +1002,7 @@ DeviceAddressPtr KernelRuntime::CreateDeviceAddressForStringValue(const ValuePtr size_t tensor_size = value_string.size(); DeviceAddressPtr address = CreateDeviceAddress(nullptr, tensor_size, kOpFormat_DEFAULT, kNumberTypeUInt8); MS_EXCEPTION_IF_NULL(address); + address->set_from_persistent_mem(true); auto ms_context = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(ms_context); if (use_mem_pool && !mem_manager_->MallocMemFromMemPool(address, tensor_size)) { diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.h b/mindspore/ccsrc/runtime/device/kernel_runtime.h index 89ea06a165d..0b6928e9ec0 100644 --- a/mindspore/ccsrc/runtime/device/kernel_runtime.h +++ b/mindspore/ccsrc/runtime/device/kernel_runtime.h @@ -56,6 +56,7 @@ class KernelRuntime { virtual bool Init() = 0; virtual void AssignMemory(const session::KernelGraph &graph); void RunOpAssignMemory(const std::vector &input_tensors, const session::KernelGraph &graph, + bool is_gradient_out, const std::map &tensor_to_node = {}); void AssignCommunicationOutputFromMemoryPool(const AnfNodePtr &node) const; void AssignCommunicationInputFromMemoryPool(const AnfNodePtr &node) const; @@ -173,7 +174,8 @@ class KernelRuntime { const std::shared_ptr &mem_schedule = nullptr); void RunOpAssignInputMemory(const std::vector &input_tensors, const session::KernelGraph &graph); void RunOpAssignOutputMemory(const AnfNodePtr &kernel, - const std::map &tensor_to_node = {}); + const std::map &tensor_to_node, + bool is_gradient_out); void RunOpAssignWorkSpaceMemory(const AnfNodePtr &kernel); void RunOpAssignOutputNodeMemory(const ValuePtr &pre_output_value, const session::KernelGraph &graph); void AssignValueNodeTensor(const ValueNodePtr &value_node, const ValuePtr &node_value, size_t output_idx); diff --git a/mindspore/ccsrc/runtime/device/memory_manager.cc 
b/mindspore/ccsrc/runtime/device/memory_manager.cc index 823437ff235..f61951a5e42 100644 --- a/mindspore/ccsrc/runtime/device/memory_manager.cc +++ b/mindspore/ccsrc/runtime/device/memory_manager.cc @@ -141,9 +141,9 @@ uint8_t *MemoryManager::MallocDynamicMem(size_t size, bool communication_mem) { return nullptr; } -bool MemoryManager::MallocMemFromMemPool(const DeviceAddressPtr address, size_t size) { +bool MemoryManager::MallocMemFromMemPool(const DeviceAddressPtr &address, size_t size) { MS_EXCEPTION_IF_NULL(address); - auto device_ptr = MallocMemFromMemPool(size); + auto device_ptr = MallocMemFromMemPool(size, address->from_persistent_mem_); if (!device_ptr) { return false; } @@ -154,7 +154,7 @@ bool MemoryManager::MallocMemFromMemPool(const DeviceAddressPtr address, size_t return true; } -void *MemoryManager::MallocMemFromMemPool(size_t size) { +void *MemoryManager::MallocMemFromMemPool(size_t size, bool from_persistent_mem) { if (size == 0) { MS_LOG(ERROR) << "MallocMemFromMemPool size is 0."; } diff --git a/mindspore/ccsrc/runtime/device/memory_manager.h b/mindspore/ccsrc/runtime/device/memory_manager.h index 6eb0314ef54..d31fa9b95a9 100644 --- a/mindspore/ccsrc/runtime/device/memory_manager.h +++ b/mindspore/ccsrc/runtime/device/memory_manager.h @@ -49,8 +49,10 @@ class MemoryManager : public MemHandler { virtual uint8_t *MallocMem(MemType type, size_t size, const DeviceAddressPtr &address, uint32_t graph_id = kInvalidGraphId); - virtual bool MallocMemFromMemPool(const DeviceAddressPtr address, size_t size); - virtual void *MallocMemFromMemPool(size_t size); + // param address is the address type of each device + // param from_persistent_mem shows whether the tensor is a parameter in Pynative mode + virtual bool MallocMemFromMemPool(const DeviceAddressPtr &address, size_t size); + virtual void *MallocMemFromMemPool(size_t size, bool from_persistent_mem); virtual uint8_t *MallocCommunicationMemFromMemPool(size_t size) { return nullptr; } virtual void FreeMemFromMemPool(const DeviceAddressPtr address); virtual void FreeMemFromMemPool(void *device_ptr); @@ -62,7 +64,7 @@ class MemoryManager : public MemHandler { static size_t GetCommunicationAlignSize(size_t input_size); // swap manager interface - void *MallocDevice(size_t mem_size) override { return MallocMemFromMemPool(mem_size); } + void *MallocDevice(size_t mem_size) override { return MallocMemFromMemPool(mem_size, false); } void FreeDevice(void *ptr) override { MS_EXCEPTION_IF_NULL(ptr); FreeMemFromMemPool(ptr); diff --git a/mindspore/ccsrc/runtime/framework/actor/data_prepare_actor.cc b/mindspore/ccsrc/runtime/framework/actor/data_prepare_actor.cc index 669228bdfdf..31f952217b6 100644 --- a/mindspore/ccsrc/runtime/framework/actor/data_prepare_actor.cc +++ b/mindspore/ccsrc/runtime/framework/actor/data_prepare_actor.cc @@ -607,6 +607,7 @@ void DataPrepareActor::PrepareDataForWeightNode(const AnfNodePtr &backend_node, (device_tensor->DeviceType() != device_context->GetDeviceAddressType())) { host_tensor_address = device_context->CreateDeviceAddress(nullptr, device_tensor->GetSize(), device_tensor->format(), device_tensor->type_id()); + host_tensor_address->set_from_persistent_mem(tensor->is_parameter()); } else { host_tensor_address = device_tensor; } diff --git a/mindspore/ccsrc/runtime/framework/graph_compiler.cc b/mindspore/ccsrc/runtime/framework/graph_compiler.cc index 423a331dfe1..c4c467e2b20 100644 --- a/mindspore/ccsrc/runtime/framework/graph_compiler.cc +++ b/mindspore/ccsrc/runtime/framework/graph_compiler.cc @@ -100,6 
+100,7 @@ void CreateParameterDeviceAddress(const DeviceContext *device_context, const Ker size_t tensor_size = AnfAlgo::GetOutputTensorMemSize(item, index); auto device_address = device_context->CreateDeviceAddress(nullptr, tensor_size, AnfAlgo::GetOutputFormat(item, index), output_type_id); + device_address->set_from_persistent_mem(item->isa()); MS_LOG(DEBUG) << "Create addr for node:" << AnfAlgo::GetNodeDebugString(item) << " addr:" << device_address; AnfAlgo::SetOutputAddr(device_address, index, item.get()); } @@ -143,6 +144,7 @@ void CreateDeviceAddressForTensorValue(const DeviceContext *device_context, cons device_context->CreateDeviceAddress(nullptr, tensor_size, output_format, output_type_id); MS_LOG(DEBUG) << "Create addr for node:" << AnfAlgo::GetNodeDebugString(value_node) << " addr:" << address; MS_EXCEPTION_IF_NULL(address); + address->set_from_persistent_mem(true); AnfAlgo::SetOutputAddr(address, output_idx++, value_node.get()); } } @@ -165,6 +167,7 @@ void CreateValueNodeDeviceAddress(const DeviceContext *device_context, const Ker size_t tensor_size = value.size(); auto address = device_context->CreateDeviceAddress(nullptr, tensor_size, kOpFormat_DEFAULT, kNumberTypeUInt8); MS_EXCEPTION_IF_NULL(address); + address->set_from_persistent_mem(true); MS_LOG(DEBUG) << "Create addr for node:" << AnfAlgo::GetNodeDebugString(value_node) << " addr:" << address; AnfAlgo::SetOutputAddr(address, 0, value_node.get()); @@ -172,7 +175,8 @@ void CreateValueNodeDeviceAddress(const DeviceContext *device_context, const Ker } } -void CreateKernelOutputDeviceAddress(const DeviceContext *device_context, const KernelGraphPtr &graph) { +void CreateKernelOutputDeviceAddress(const DeviceContext *device_context, const KernelGraphPtr &graph, + bool is_gradient_out) { MS_EXCEPTION_IF_NULL(device_context); MS_EXCEPTION_IF_NULL(graph); const std::vector &kernels = graph->execution_order(); @@ -191,6 +195,9 @@ void CreateKernelOutputDeviceAddress(const DeviceContext *device_context, const auto output_type = AnfAlgo::GetOutputDeviceDataType(kernel, i); auto address_size = AnfAlgo::GetOutputTensorMemSize(kernel, i); auto device_address = device_context->CreateDeviceAddress(nullptr, address_size, output_format, output_type); + if (is_gradient_out) { + device_address->set_from_persistent_mem(true); + } MS_LOG(DEBUG) << "Create addr for node:" << AnfAlgo::GetNodeDebugString(kernel) << " addr:" << device_address; AnfAlgo::SetOutputAddr(device_address, i, kernel.get()); } @@ -434,7 +441,7 @@ GraphId GraphCompiler::CompileGraphImpl(const KernelGraphPtr &graph, const Devic device_context->PreprocessBeforeRunGraph(graph); // Create device address for all anf nodes of graph. - CreateDeviceAddress(graph, device_context); + CreateDeviceAddress(graph, device_context, false); graph->set_is_all_nop_node(opt::IsAllNopNode(graph.get())); @@ -498,7 +505,7 @@ GraphId GraphCompiler::CompileGraph(const session::OpRunInfo &op_run_info, bool device_context->OptimizeSingleOpGraph(graph); // Create device address for all anf nodes of graph. 
- CreateDeviceAddressWithoutWorkspace(graph, device_context); + CreateDeviceAddressWithoutWorkspace(graph, device_context, op_run_info.is_gradient_out); graph->set_is_all_nop_node(opt::IsAllNopNode(graph.get())); run_op_graphs_[op_run_info.graph_info] = graph; @@ -546,20 +553,22 @@ KernelGraphPtr GraphCompiler::Fetch(const GraphInfo &graph_info) const { return iter->second; } -void GraphCompiler::CreateDeviceAddress(const KernelGraphPtr &graph, const DeviceContext *device_context) const { +void GraphCompiler::CreateDeviceAddress(const KernelGraphPtr &graph, const DeviceContext *device_context, + bool is_gradient_out) const { CreateParameterDeviceAddress(device_context, graph); CreateValueNodeDeviceAddress(device_context, graph); - CreateKernelOutputDeviceAddress(device_context, graph); + CreateKernelOutputDeviceAddress(device_context, graph, is_gradient_out); CreateKernelWorkspaceDeviceAddress(device_context, graph); UpdateDeviceAddressForInplaceNode(graph); UpdateDeviceAddressForRefNode(graph); } void GraphCompiler::CreateDeviceAddressWithoutWorkspace(const KernelGraphPtr &graph, - const DeviceContext *device_context) const { + const DeviceContext *device_context, + bool is_gradient_out) const { CreateParameterDeviceAddress(device_context, graph); CreateValueNodeDeviceAddress(device_context, graph); - CreateKernelOutputDeviceAddress(device_context, graph); + CreateKernelOutputDeviceAddress(device_context, graph, is_gradient_out); UpdateDeviceAddressForInplaceNode(graph); UpdateDeviceAddressForRefNode(graph); } @@ -593,24 +602,27 @@ TensorPtr GraphCompiler::GetSingleOpInputTensorByIndex(const CNodePtr &kernel, } void GraphCompiler::GetSingleOpRunInfoAndGraphInfo(const CNodePtr &kernel, const InputTensorInfo &tensor_info, - OpRunInfo *run_info, GraphInfo *graph_info) { + OpRunInfo *run_info, GraphInfo *graph_info, + GraphOutputInfo *const graph_output_info) { MS_EXCEPTION_IF_NULL(session_); MS_EXCEPTION_IF_NULL(graph_info); - *graph_info = session_->GetSingleOpGraphInfo(kernel, tensor_info.input_tensors); - *run_info = session_->GetSingleOpRunInfo(kernel, *graph_info, tensor_info); + *run_info = session_->GetSingleOpRunInfo(kernel, *graph_info, tensor_info, graph_output_info); } -void GraphCompiler::CalculateRefCount(const KernelGraphPtr &graph, std::map *ref_count) const { +void GraphCompiler::CalculateRefCount(const KernelGraphPtr &graph, std::map *ref_count, + std::map *forward_output_refcount) const { MS_EXCEPTION_IF_NULL(session_); session_->GetRefCount(graph.get(), ref_count); + session_->GetForwardOutputRefCount(graph.get(), forward_output_refcount); } void GraphCompiler::UpdateRefCount(const std::set &input_kernels_with_index, std::map *ref_count, + std::map *forward_output_refcount, std::map *op_output_map) const { MS_EXCEPTION_IF_NULL(session_); - session_->HandleOpInputs(input_kernels_with_index, ref_count, op_output_map); + session_->HandleOpInputs(input_kernels_with_index, ref_count, forward_output_refcount, op_output_map); } void GraphCompiler::RecoverGraphOutput(const AnfNodePtr &kernel, const VectorRef &op_outputs, diff --git a/mindspore/ccsrc/runtime/framework/graph_compiler.h b/mindspore/ccsrc/runtime/framework/graph_compiler.h index f5f6485a1f8..b0eff9a9ad8 100644 --- a/mindspore/ccsrc/runtime/framework/graph_compiler.h +++ b/mindspore/ccsrc/runtime/framework/graph_compiler.h @@ -139,14 +139,16 @@ class GraphCompiler { // Get OpRunInfo and GraphInfo for single op compile and run. 
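Taken together, the set_from_persistent_mem calls in this change amount to a simple placement rule for PyNative allocations. The Python below merely restates that rule for illustration; the predicate names are invented and are not MindSpore APIs:

    # Illustrative restatement of the persistent-pool placement rule (names are invented).
    def uses_persistent_pool(is_parameter, is_value_node, is_gradient_out):
        """Parameters, value nodes and gradient outputs are allocated from the
        persistent pool; every other allocation is served from the common pool."""
        return is_parameter or is_value_node or is_gradient_out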
void GetSingleOpRunInfoAndGraphInfo(const CNodePtr &kernel, const InputTensorInfo &tensor_info, OpRunInfo *run_info, - GraphInfo *graph_info); + GraphInfo *graph_info, GraphOutputInfo *const graph_output_info); // Calculate ref count of PyNative back propagation operators. - void CalculateRefCount(const KernelGraphPtr &graph, std::map *ref_count) const; + void CalculateRefCount(const KernelGraphPtr &graph, std::map *ref_count, + std::map *forward_output_refcount) const; // Update ref count of PyNative back propagation operators. void UpdateRefCount(const std::set &input_kernels_with_index, std::map *ref_count, + std::map *forward_output_refcount, std::map *op_output_map) const; // Handle single op output tensor and recover output of original complete kernel graph. @@ -182,10 +184,12 @@ class GraphCompiler { GraphId CompileGraphImpl(const KernelGraphPtr &graph, const DeviceContext *device_context) const; // Create device address for all anf nodes of graph. - void CreateDeviceAddress(const KernelGraphPtr &graph, const DeviceContext *device_context) const; + void CreateDeviceAddress(const KernelGraphPtr &graph, const DeviceContext *device_context, + bool is_gradient_out) const; // Create device address for input and output of ops. - void CreateDeviceAddressWithoutWorkspace(const KernelGraphPtr &graph, const DeviceContext *device_context) const; + void CreateDeviceAddressWithoutWorkspace(const KernelGraphPtr &graph, const DeviceContext *device_context, + bool is_gradient_out) const; // Set Graph's dependencies for pre_graph and post_graph. void SetGraphDependency(const KernelGraphPtr &graph, const GraphSegmentPtr &segment) const; diff --git a/mindspore/ccsrc/runtime/framework/graph_scheduler.cc b/mindspore/ccsrc/runtime/framework/graph_scheduler.cc index b23bc8990d8..0e0bc03521a 100644 --- a/mindspore/ccsrc/runtime/framework/graph_scheduler.cc +++ b/mindspore/ccsrc/runtime/framework/graph_scheduler.cc @@ -1790,6 +1790,7 @@ void GraphScheduler::PersistDeviceTensor(const GraphCompilerInfo &graph_compiler auto other_type_device_tensor = device_context->CreateDeviceAddress( nullptr, device_tensor->GetSize(), device_tensor->format(), device_tensor->type_id()); other_type_device_tensor->SetNodeIndex(input_node, 0); + other_type_device_tensor->set_from_persistent_mem(input_node->isa()); AddDeviceTensorStore(front_node.get(), other_type_device_tensor); } } diff --git a/mindspore/ccsrc/runtime/hardware/ascend/ascend_device_context.cc b/mindspore/ccsrc/runtime/hardware/ascend/ascend_device_context.cc index deb65c7f0a4..6aebe46746f 100644 --- a/mindspore/ccsrc/runtime/hardware/ascend/ascend_device_context.cc +++ b/mindspore/ccsrc/runtime/hardware/ascend/ascend_device_context.cc @@ -511,7 +511,7 @@ bool AscendDeviceContext::AllocateMemory(DeviceAddress *const &address, size_t s MS_EXCEPTION_IF_NULL(address); MS_EXCEPTION_IF_NULL(runtime_instance_); runtime_instance_->SetContext(); - auto device_ptr = mem_manager_->MallocMemFromMemPool(size); + auto device_ptr = mem_manager_->MallocMemFromMemPool(size, address->from_persistent_mem_); if (!device_ptr) { return false; } diff --git a/mindspore/ccsrc/runtime/hardware/cpu/cpu_device_context.cc b/mindspore/ccsrc/runtime/hardware/cpu/cpu_device_context.cc index 73c17978260..9a3d100f4cc 100644 --- a/mindspore/ccsrc/runtime/hardware/cpu/cpu_device_context.cc +++ b/mindspore/ccsrc/runtime/hardware/cpu/cpu_device_context.cc @@ -89,7 +89,7 @@ bool CPUDeviceContext::AllocateMemory(DeviceAddress *const &address, size_t size MS_LOG(EXCEPTION) << "The device address type 
is wrong: " << address->DeviceType(); } - auto device_ptr = mem_manager_->MallocMemFromMemPool(size); + auto device_ptr = mem_manager_->MallocMemFromMemPool(size, 0); if (!device_ptr) { return false; } diff --git a/mindspore/ccsrc/runtime/hardware/gpu/gpu_device_context.cc b/mindspore/ccsrc/runtime/hardware/gpu/gpu_device_context.cc index 2de96ac640e..4780422ad4d 100644 --- a/mindspore/ccsrc/runtime/hardware/gpu/gpu_device_context.cc +++ b/mindspore/ccsrc/runtime/hardware/gpu/gpu_device_context.cc @@ -173,7 +173,7 @@ bool GPUDeviceContext::AllocateMemory(DeviceAddress *const &address, size_t size if (!BindDeviceToCurrentThread()) { return false; } - auto device_ptr = mem_manager_->MallocMemFromMemPool(size); + auto device_ptr = mem_manager_->MallocMemFromMemPool(size, address->from_persistent_mem_); if (!device_ptr) { return false; } diff --git a/mindspore/ccsrc/vm/backend.cc b/mindspore/ccsrc/vm/backend.cc index 789c36b5391..b926a577fd4 100644 --- a/mindspore/ccsrc/vm/backend.cc +++ b/mindspore/ccsrc/vm/backend.cc @@ -234,7 +234,7 @@ void UpdateOutput(const std::vector &output_nodes, Vec } } -void ClearGraphDeviceAddress(const KernelGraphPtr &graph, const DeviceContext *device_context) { +void ClearGraphDeviceAddress(const KernelGraphPtr &graph, const DeviceContext *device_context, bool is_gradient_out) { MS_EXCEPTION_IF_NULL(graph); for (const auto &node : graph->execution_order()) { auto output_address_num = AnfAlgo::GetOutputAddressNum(node); @@ -252,6 +252,9 @@ void ClearGraphDeviceAddress(const KernelGraphPtr &graph, const DeviceContext *d MS_EXCEPTION_IF_NULL(new_device_address); new_device_address->set_original_ref_count(device_address->original_ref_count()); new_device_address->ResetRefCount(); + if (is_gradient_out) { + new_device_address->set_from_persistent_mem(true); + } AnfAlgo::SetOutputAddr(new_device_address, i, node.get()); } } @@ -800,9 +803,10 @@ void MindRTBackend::RunGraphBySingleOp(const std::vector &graphs &graph_output_info.output_indexes); std::map cnode_ref_count; + std::map forward_output_refcount; auto iter = cnode_ref_counts_.find(graph->graph_id()); if (iter == cnode_ref_counts_.end()) { - graph_compiler_->CalculateRefCount(graph, &cnode_ref_count); + graph_compiler_->CalculateRefCount(graph, &cnode_ref_count, &forward_output_refcount); (void)cnode_ref_counts_.emplace(graph->graph_id(), cnode_ref_count); } else { cnode_ref_count = iter->second; @@ -822,7 +826,8 @@ void MindRTBackend::RunGraphBySingleOp(const std::vector &graphs GraphInfo graph_info; graph_compiler_->GetSingleOpInputTensors(kernel, op_output_map, parameter_index, inputs[graph_index], &input_tensor_info); - graph_compiler_->GetSingleOpRunInfoAndGraphInfo(kernel, input_tensor_info, &op_run_info, &graph_info); + graph_compiler_->GetSingleOpRunInfoAndGraphInfo(kernel, input_tensor_info, &op_run_info, &graph_info, + &graph_output_info); RunOp(&op_run_info, &op_outputs); } else { @@ -832,7 +837,8 @@ void MindRTBackend::RunGraphBySingleOp(const std::vector &graphs runtime::OpLazyBuilder::GetInstance().ExecuteRemainingTasks(); } - graph_compiler_->UpdateRefCount(input_tensor_info.input_kernel, &cnode_ref_count, &op_output_map); + graph_compiler_->UpdateRefCount(input_tensor_info.input_kernel, &cnode_ref_count, &forward_output_refcount, + &op_output_map); graph_output_info.graph_output_tensors.clear(); graph_compiler_->RecoverGraphOutput(kernel, op_outputs, cnode_ref_count, &op_output_map, &graph_output_info); @@ -1246,7 +1252,8 @@ void MindRTBackend::LazyExecuteTaskCallback() { const auto &context = 
op_run_task->context(); RunSingleOpGraph(context->graph(), context->output_nodes(), context->op_run_info(), context->graph_compiler_info(), context->device_context()); - ClearGraphDeviceAddress(context->graph(), context->device_context()); + ClearGraphDeviceAddress(context->graph(), context->device_context(), false); + UpdateInputDeviceAddress(context->graph()); op_lazy_builder.PopOpRunTask(); @@ -1304,7 +1311,7 @@ void MindRTBackend::RunOpInternal(bool single_op_cache_hit, GraphCompilerInfo *g } RunSingleOpGraph(graph, output_nodes, *op_run_info, graph_compiler_info, device_context); UpdateOutput(output_nodes, outputs); - ClearGraphDeviceAddress(graph, device_context); + ClearGraphDeviceAddress(graph, device_context, false); UpdateInputDeviceAddress(graph); if (op_run_info->is_dynamic_shape) { UpdateOutputAbstract(graph, op_run_info); diff --git a/mindspore/context.py b/mindspore/context.py index e91dbe3361b..f5e7a1db6b9 100644 --- a/mindspore/context.py +++ b/mindspore/context.py @@ -273,6 +273,14 @@ class _Context: raise ValueError("For 'context.set_context', the argument 'max_device_memory' should not be \"0GB\".") self.set_param(ms_ctx_param.max_device_memory, max_device_memory_value) + def set_mempool_block_size(self, mempool_block_size): + if not Validator.check_str_by_regular(mempool_block_size, _re_pattern): + raise ValueError("Context param mempool_block_size should be in the correct format, such as \"10GB\"") + mempool_block_size_value = float(mempool_block_size[:-2]) + if mempool_block_size_value < 1.0: + raise ValueError("Context param mempool_block_size should be greater than or equal to \"1GB\"") + self.set_param(ms_ctx_param.mempool_block_size, mempool_block_size_value) + def set_print_file_path(self, file_path): """Add timestamp suffix to file name. 
Sets print file path.""" print_file_path = os.path.realpath(file_path) @@ -317,6 +325,7 @@ class _Context: 'profiling_options': set_profiling_options, 'variable_memory_max_size': set_variable_memory_max_size, 'max_device_memory': set_max_device_memory, + 'mempool_block_size': set_mempool_block_size, 'print_file_path': set_print_file_path, 'env_config_path': set_env_config_path } @@ -570,7 +579,8 @@ def _check_target_specific_cfgs(device, arg_key): 'print_file_path': ['Ascend'], 'variable_memory_max_size': ['Ascend'], 'auto_tune_mode': ['Ascend'], - 'max_device_memory': ['GPU'] + 'max_device_memory': ['GPU'], + 'mempool_block_size': ['GPU', 'Ascend'] } # configs not in map device_cfgs are supposed to be suitable for all devices if not arg_key in device_cfgs: @@ -583,15 +593,15 @@ def _check_target_specific_cfgs(device, arg_key): return False -@args_unreset_check(device_id=int, variable_memory_max_size=str, max_device_memory=str) +@args_unreset_check(device_id=int, variable_memory_max_size=str, max_device_memory=str, mempool_block_size=str) @args_type_check(mode=int, precompile_only=bool, device_target=str, device_id=int, save_graphs=bool, save_graphs_path=str, enable_dump=bool, auto_tune_mode=str, save_dump_path=str, enable_reduce_precision=bool, variable_memory_max_size=str, enable_profiling=bool, profiling_options=str, enable_auto_mixed_precision=bool, enable_graph_kernel=bool, reserve_class_name_in_scope=bool, check_bprop=bool, max_device_memory=str, print_file_path=str, enable_sparse=bool, max_call_depth=int, - env_config_path=str, graph_kernel_flags=str, enable_compile_cache=bool, - compile_cache_path=str, grad_for_scalar=bool, pynative_synchronize=bool) + env_config_path=str, graph_kernel_flags=str, save_compile_cache=bool, + load_compile_cache=bool, grad_for_scalar=bool, pynative_synchronize=bool, mempool_block_size=str) def set_context(**kwargs): """ Set context for running environment. @@ -616,6 +626,8 @@ def set_context(**kwargs): | | max_device_memory | GPU | | +------------------------------+----------------------------+ | | variable_memory_max_size | Ascend | + | +------------------------------+----------------------------+ + | | mempool_block_size | GPU/Ascend | +-------------------------+------------------------------+----------------------------+ | Debug Configuration | save_graphs | CPU/GPU/Ascend | | +------------------------------+----------------------------+ @@ -672,6 +684,9 @@ def set_context(**kwargs): The actual used memory size is the minimum of the available memory of the device and max_device_memory. variable_memory_max_size (str): Set the maximum size of the variable memory max size. Default: "30GB". After this parameter is set, the maximum memory used by the framework is restricted to the configured value. + mempool_block_size (str): Set the size of the memory pool block in PyNative mode for devices. + The format is "xxGB". Default: "1GB". The minimum size is "1GB". The actual used memory block size is the minimum + of the available memory of the device and mempool_block_size. save_graphs (bool): Whether to save graphs. Default: False. When the `save_graphs` attribute is set as True, attribute of `save_graphs_path` is used to set the intermediate compilation graph storage path. By default, the graphs are saved in the current directory. @@ -813,6 +828,7 @@ def set_context(**kwargs): ... 
profiling_options='{"output":"/home/data/output","training_trace":"on"}') >>> context.set_context(check_bprop=True) >>> context.set_context(max_device_memory="3.5GB") + >>> context.set_context(mempool_block_size="1GB") >>> context.set_context(print_file_path="print.pb") >>> context.set_context(enable_sparse=True) >>> context.set_context(max_call_depth=80) diff --git a/mindspore/core/utils/ms_context.cc b/mindspore/core/utils/ms_context.cc index 81dcecd9d94..fd8045bc107 100644 --- a/mindspore/core/utils/ms_context.cc +++ b/mindspore/core/utils/ms_context.cc @@ -86,6 +86,7 @@ MsContext::MsContext(const std::string &policy, const std::string &target) { set_param(MS_CTX_PROFILING_OPTIONS, "training_trace"); set_param(MS_CTX_CHECK_BPROP_FLAG, false); set_param(MS_CTX_MAX_DEVICE_MEMORY, kDefaultMaxDeviceMemory); + set_param(MS_CTX_MEMPOOL_BLOCK_SIZE, kDefaultMempoolBlockSize); set_param(MS_CTX_PRINT_FILE_PATH, ""); set_param(MS_CTX_ENABLE_GRAPH_KERNEL, false); set_param(MS_CTX_ENABLE_SPARSE, false); @@ -189,5 +190,4 @@ bool MsContext::enable_dump_ir() const { return false; #endif } - } // namespace mindspore diff --git a/mindspore/core/utils/ms_context.h b/mindspore/core/utils/ms_context.h index 827525dbb23..a8a87ee23f8 100644 --- a/mindspore/core/utils/ms_context.h +++ b/mindspore/core/utils/ms_context.h @@ -60,6 +60,8 @@ const unsigned int MAX_CALL_DEPTH_DEFAULT = 1000; const std::set kTargetSet = {kCPUDevice, kGPUDevice, kAscendDevice, kDavinciDevice}; // The default max available device memory is 1024GB. const float kDefaultMaxDeviceMemory = 1024; +// The default memory pool block size is 1.0GB. +const float kDefaultMempoolBlockSize = 1.0; // enum definition for MindSpore Context Parameter enum MsCtxParam : unsigned { @@ -109,6 +111,7 @@ enum MsCtxParam : unsigned { // parameter of type float MS_CTX_TYPE_FLOAT_BEGIN = MS_CTX_TYPE_UINT32_END, MS_CTX_MAX_DEVICE_MEMORY = MS_CTX_TYPE_FLOAT_BEGIN, + MS_CTX_MEMPOOL_BLOCK_SIZE, MS_CTX_TYPE_FLOAT_END, // parameter of type string
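As a usage note, the new option is driven entirely through context.set_context. The snippet below is a hypothetical example consistent with the validation added in context.py above (format "xxGB", minimum "1GB", GPU and Ascend only):

    # Hypothetical usage of the new mempool_block_size context parameter.
    from mindspore import context

    context.set_context(mode=context.PYNATIVE_MODE, device_target="GPU")
    context.set_context(mempool_block_size="2GB")  # accepted: matches "xxGB" and is >= 1GB
    # context.set_context(mempool_block_size="0.5GB") would raise ValueError (below the 1GB minimum)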