From de0f6edf5adac863fb402dc896dc5f9fca80229b Mon Sep 17 00:00:00 2001
From: laiyongqiang
Date: Fri, 16 Apr 2021 17:18:27 +0800
Subject: [PATCH] Support HBM profiling

---
 .../mem_reuse/mem_dynamic_allocator.h         |  8 +--
 .../ccsrc/backend/optimizer/somas/somas.cc    | 55 +++++++++++---
 .../device/ascend/ascend_memory_pool.cc      | 40 ++++++++++-
 .../device/ascend/ascend_memory_pool.h       |  4 +-
 .../ccsrc/runtime/device/kernel_runtime.cc   |  3 +-
 .../ccsrc/runtime/device/memory_manager.cc   | 71 ++-----------------
 .../ccsrc/runtime/device/memory_manager.h    |  2 +-
 tests/st/ops/ascend/test_tbe_ops/test_relu.py |  8 ++-
 8 files changed, 104 insertions(+), 87 deletions(-)

diff --git a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_dynamic_allocator.h b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_dynamic_allocator.h
index 62dfc0aee85..022626d58c5 100644
--- a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_dynamic_allocator.h
+++ b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_dynamic_allocator.h
@@ -109,16 +109,16 @@ class DynamicMemPoolBestFit {
  protected:
   // The real size by memory alloc aligned.
   virtual size_t AlignMemorySize(size_t size) const;
-  // Get the minimum memory unit size using for dynamic extend.
-  virtual size_t mem_alloc_unit_size() const { return DYNAMIC_MEM_ALLOC_UNIT_SIZE; }
+  // Calculate the allocation size required when adding a memory block.
+  virtual size_t CalMemBlockAllocSize(size_t size);

  private:
+  // Get the minimum memory unit size used for dynamic extension.
+  size_t mem_alloc_unit_size() const { return DYNAMIC_MEM_ALLOC_UNIT_SIZE; }
   // Find the idle memory buf by aligned size when memory alloc.
   DeviceMemPtr FindIdleMemBuf(size_t size);
   // Add the memory block and memory buf when memory alloc not find the idle memory buf.
   DeviceMemPtr AddMemBlockAndMemBuf(size_t size);
-  // Calculate memory block required alloc size when adding the memory block.
-  size_t CalMemBlockAllocSize(size_t size);
   // Judge whether need divide the memory buf by alloc size and memory buf size.
   bool IsDivide(size_t tensor_size, size_t mem_buf_size) const;
   // Divide the memory buf by alloc size.
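The hunk above inverts the allocator's customization point: a subclass no longer tunes just the minimum unit returned by mem_alloc_unit_size(), it overrides the whole block-size calculation, CalMemBlockAllocSize(), which moves from private to protected virtual. A minimal compilable sketch of that hook pattern, with illustrative names (PoolBase, DoublingPool) rather than the MindSpore classes:

    #include <cstddef>
    #include <cstdlib>

    // The base pool asks a virtual CalBlockSize() how large the next memory
    // block should be, so a derived pool can swap in its own growth policy.
    class PoolBase {
     public:
      virtual ~PoolBase() = default;
      void *AllocBlock(size_t size) { return std::malloc(CalBlockSize(size)); }

     protected:
      // Default policy: round the request up to one allocation unit.
      virtual size_t CalBlockSize(size_t size) const {
        return size < kUnit ? kUnit : size;
      }
      static constexpr size_t kUnit = 1 << 20;  // 1MB default unit
    };

    class DoublingPool : public PoolBase {
     protected:
      // Override: grow the block geometrically until it covers the request.
      size_t CalBlockSize(size_t size) const override {
        size_t block = kUnit;
        while (block < size) block *= 2;
        return block;
      }
    };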
diff --git a/mindspore/ccsrc/backend/optimizer/somas/somas.cc b/mindspore/ccsrc/backend/optimizer/somas/somas.cc
index 7702211380f..90e75e9a565 100644
--- a/mindspore/ccsrc/backend/optimizer/somas/somas.cc
+++ b/mindspore/ccsrc/backend/optimizer/somas/somas.cc
@@ -447,6 +447,7 @@ void Somas::InitSomasOutputAndWorkspaceTensors(const session::KernelGraph *graph
     auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
     MS_EXCEPTION_IF_NULL(kernel_mod);
     auto output_sizes = kernel_mod->GetOutputSizeList();
+    auto index = 0;
     for (const auto &size : output_sizes) {
       auto output_tensor_index = tensor_index;
       tensor_index++;
@@ -455,15 +456,21 @@ void Somas::InitSomasOutputAndWorkspaceTensors(const session::KernelGraph *graph
       tensor->lifetime_.start_ = node->GetId();
       tensor->lifetime_.end_ = node->GetId();
       tensor->type_ = kOutputOnly;
+      if (AnfAlgo::OutputAddrExist(kernel, index)) {
+        tensor->aligned_size_ = 0;
+      }
+
       tensors_list_.push_back(tensor);
       tensors_map_[output_tensor_index] = tensor;
       stream->tensors_.push_back(tensor);
       node->tensors_.insert(tensor);
       node->output_tensors_.push_back(tensor);
+      index++;
     }

     // WorkSpace Tensor
     auto workspace_sizes = kernel_mod->GetWorkspaceSizeList();
+    index = 0;
     for (const auto &size : workspace_sizes) {
       auto workspace_tensor_index = tensor_index;
       tensor_index++;
@@ -471,11 +478,15 @@ void Somas::InitSomasOutputAndWorkspaceTensors(const session::KernelGraph *graph
       tensor->type_ = kWorkspace;
       tensor->lifetime_.start_ = node->GetId();
       tensor->lifetime_.end_ = node->GetId();
+      if (AnfAlgo::WorkspaceAddrExist(kernel, index)) {
+        tensor->aligned_size_ = 0;
+      }
       tensors_list_.push_back(tensor);
       tensors_map_[workspace_tensor_index] = tensor;
       stream->tensors_.push_back(tensor);
       node->tensors_.insert(tensor);
       node->workspace_tensors_.push_back(tensor);
+      index++;
    }
  }
}
@@ -874,8 +885,12 @@ void Somas::GenContiguousList(const session::KernelGraph *graph) {

    // Contiguous input
    if ((!node->input_tensors_.empty()) && (!node->input_tensors_[0]->contiguous_)) {
-      node->input_tensors_[0]->aligned_size_ += kGapSize;
-      node->input_tensors_[node->input_tensors_.size() - 1]->aligned_size_ += kGapSize;
+      if (node->input_tensors_[0]->aligned_size_) {
+        node->input_tensors_[0]->aligned_size_ += kGapSize;
+      }
+      if (node->input_tensors_[node->input_tensors_.size() - 1]->aligned_size_) {
+        node->input_tensors_[node->input_tensors_.size() - 1]->aligned_size_ += kGapSize;
+      }
      std::vector<size_t> inputs;
      for (const auto &input_tensor : node->input_tensors_) {
        comm_input_total_size_ += input_tensor->aligned_size_;
@@ -887,8 +902,12 @@ void Somas::GenContiguousList(const session::KernelGraph *graph) {

    // Contiguous output
    if ((!node->output_tensors_.empty()) && (!node->output_tensors_[0]->contiguous_)) {
-      node->output_tensors_[0]->aligned_size_ += kGapSize;
-      node->output_tensors_[node->output_tensors_.size() - 1]->aligned_size_ += kGapSize;
+      if (node->output_tensors_[0]->aligned_size_) {
+        node->output_tensors_[0]->aligned_size_ += kGapSize;
+      }
+      if (node->output_tensors_[node->output_tensors_.size() - 1]->aligned_size_) {
+        node->output_tensors_[node->output_tensors_.size() - 1]->aligned_size_ += kGapSize;
+      }
      std::vector<size_t> outputs;
      for (const auto &output_tensor : node->output_tensors_) {
        comm_output_total_size_ += output_tensor->aligned_size_;
@@ -1097,17 +1116,33 @@ bool Somas::Assign(const session::KernelGraph *graph) {
   // Ref Node Preprocessing
   UpdateRefTensorsConflict();
   std::map<size_t, size_t> contiguous_list_with_ref_index_map = GetContiguousListContainRefTensor();
-  vector<vector<size_t>> contiguous_tensors_list_removed_ref = contiguous_tensors_list_;
+  vector<vector<size_t>> contiguous_tensors_list_removed = contiguous_tensors_list_;
   std::set<vector<size_t>> contiguous_tensors_list_to_remove;
   for (auto ref_list_pair : contiguous_list_with_ref_index_map) {
     contiguous_tensors_list_to_remove.insert(contiguous_tensors_list_[ref_list_pair.second]);
   }
+  // Remove the contiguous lists in which every tensor's aligned size is 0
+  for (auto contiguous_list : contiguous_tensors_list_) {
+    bool all_outputs = true;
+    for (auto tensor_id : contiguous_list) {
+      auto tensor = tensors_list_[tensor_id];
+      if (tensor->aligned_size_ != 0) {
+        all_outputs = false;
+        break;
+      }
+    }
+
+    if (all_outputs) {
+      contiguous_tensors_list_to_remove.insert(contiguous_list);
+    }
+  }
+
   for (auto contiguous_list : contiguous_tensors_list_to_remove) {
-    auto iterator = std::find(contiguous_tensors_list_removed_ref.begin(), contiguous_tensors_list_removed_ref.end(),
-                              contiguous_list);
-    if (iterator != contiguous_tensors_list_removed_ref.end()) {
-      contiguous_tensors_list_removed_ref.erase(iterator);
+    auto iterator =
+      std::find(contiguous_tensors_list_removed.begin(), contiguous_tensors_list_removed.end(), contiguous_list);
+    if (iterator != contiguous_tensors_list_removed.end()) {
+      contiguous_tensors_list_removed.erase(iterator);
     } else {
       MS_LOG(WARNING) << "Could not find contiguous list to remove for ref";
     }
@@ -1142,7 +1177,7 @@ bool Somas::Assign(const session::KernelGraph *graph) {

   somas_solver_ = std::make_shared<SomasSolverPre>();
   auto status =
-    somas_solver_->Solving(graph, &solver_tensor_desc_map_, &reuse_matrix_, contiguous_tensors_list_removed_ref, false);
+    somas_solver_->Solving(graph, &solver_tensor_desc_map_, &reuse_matrix_, contiguous_tensors_list_removed, false);
   MS_LOG(INFO) << "End Solving";
   if (status != SUCCESS) {
     GenGraphStatisticInfo();
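Net effect of the somas.cc hunks: a tensor whose device address already exists gets aligned_size_ = 0, gap bytes are only added to tensors that still occupy solver space, and a contiguous list whose members are all zero-sized is dropped before solving. The list-filtering step is equivalent to this self-contained sketch (simplified types and names, not the Somas API):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Each tensor is reduced to its aligned size; a contiguous list whose
    // tensors all have size 0 contributes nothing to the solver and is removed.
    std::vector<std::vector<size_t>> FilterContiguousLists(
        const std::vector<std::vector<size_t>> &lists,
        const std::vector<size_t> &aligned_sizes) {
      std::vector<std::vector<size_t>> kept;
      for (const auto &list : lists) {
        bool all_zero = std::all_of(list.begin(), list.end(),
                                    [&](size_t id) { return aligned_sizes[id] == 0; });
        if (!all_zero) kept.push_back(list);
      }
      return kept;
    }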
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_memory_pool.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_memory_pool.cc
index 40e875f703e..861e45f1583 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_memory_pool.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_memory_pool.cc
@@ -14,6 +14,7 @@
  * limitations under the License.
  */

+#include <algorithm>
 #include "runtime/device/ascend/ascend_memory_pool.h"
 #include "runtime/device/ascend/ascend_kernel_runtime.h"
 #include "utils/log_adapter.h"
@@ -21,6 +22,9 @@
 namespace mindspore {
 namespace device {
 namespace ascend {
+// The minimum unit size (256MB) of memory block used for dynamic extension.
+static const size_t ASCEND_DYNAMIC_MEM_ALLOC_UNIT_SIZE = 256 << 20;
+
 void AscendMemoryPool::Init(uint8_t *device_mem_base, uint64_t device_mem_size, uint64_t dynamic_mem_offset) {
   static bool initialized = false;
   if (initialized) {
@@ -40,11 +44,43 @@ void AscendMemoryPool::Init(uint8_t *device_mem_base, uint64_t device_mem_size,
   initialized = true;
 }

+size_t AscendMemoryPool::CalMemBlockAllocSize(size_t size) {
+  auto device_free_mem_size = free_mem_size();
+  if (device_free_mem_size < size) {
+    MS_LOG(EXCEPTION) << "Memory not enough: current free memory size[" << device_free_mem_size
+                      << "] is smaller than required size[" << size << "], dynamic offset[" << graph_dynamic_mem_offset_
+                      << "] memory pool offset[" << device_mem_size_ - device_mem_pool_offset_ << "]";
+    return 0;
+  }
+  auto alloc_mem_size = ASCEND_DYNAMIC_MEM_ALLOC_UNIT_SIZE;
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  const bool pynative_mode = (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode);
+  if (pynative_mode) {
+    // Grow at twice the alloc size in PyNative mode
+    while (alloc_mem_size < size) {
+      alloc_mem_size = alloc_mem_size * 2;
+    }
+  } else {
+    while (alloc_mem_size < size) {
+      alloc_mem_size = alloc_mem_size + ASCEND_DYNAMIC_MEM_ALLOC_UNIT_SIZE;
+    }
+  }
+  alloc_mem_size = std::min(alloc_mem_size, device_free_mem_size);
+  return alloc_mem_size;
+}
+
 size_t AscendMemoryPool::AllocDeviceMem(size_t size, DeviceMemPtr *addr) {
+  MS_LOG(INFO) << "Malloc Memory: Pool, total[" << device_mem_size_ << "] (dynamic[" << graph_dynamic_mem_offset_
+               << "] memory pool[" << device_mem_size_ - device_mem_pool_offset_ << "])"
+               << " malloc [" << size << "]";
+
   if (size == 0) {
     MS_LOG(EXCEPTION) << "Failed to alloc memory pool resource, the size is zero!";
   }
-  if (device_mem_pool_offset_ - size <= graph_dynamic_mem_offset_) {
+
+  if (device_mem_pool_offset_ - size < graph_dynamic_mem_offset_) {
     MS_LOG(EXCEPTION) << "Failed to alloc memory pool memory, the current device_mem_pool_offset_ ["
                       << device_mem_pool_offset_ << "], current graph_dynamic_mem_offset_ ["
                       << graph_dynamic_mem_offset_ << "], need memory size [" << size << "]";
@@ -76,8 +112,6 @@ size_t AscendMemoryPool::AlignMemorySize(size_t size) const {
   return size;
 }

-size_t AscendMemoryPool::mem_alloc_unit_size() const { return DYNAMIC_MEM_ALLOC_UNIT_SIZE / 4; }
-
 void AscendMemoryPool::set_device_mem_pool_base(uint8_t *device_mem_pool_base) {
   MS_EXCEPTION_IF_NULL(device_mem_pool_base);
   device_mem_pool_base_ = device_mem_pool_base;
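CalMemBlockAllocSize() above grows the new block geometrically in PyNative mode and linearly in 256MB steps in graph mode, then clamps the result to the free device memory. For a 600MB request that yields a 1024MB block in PyNative mode (256 -> 512 -> 1024) and a 768MB block in graph mode (256 -> 512 -> 768). A standalone sketch of the same arithmetic (illustrative names, not the pool's API):

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    constexpr size_t kUnit = 256ULL << 20;  // 256MB allocation unit

    // Mirrors the growth policy: double in PyNative mode, step by one unit in
    // graph mode, and never exceed the free device memory.
    size_t CalBlockSize(size_t size, bool pynative, size_t free_mem) {
      size_t block = kUnit;
      while (block < size) block = pynative ? block * 2 : block + kUnit;
      return std::min(block, free_mem);
    }

    int main() {
      const size_t request = 600ULL << 20;  // 600MB request
      const size_t free_mem = 30ULL << 30;  // assume 30GB free HBM
      std::printf("pynative: %zuMB\n", CalBlockSize(request, true, free_mem) >> 20);   // 1024MB
      std::printf("graph:    %zuMB\n", CalBlockSize(request, false, free_mem) >> 20);  // 768MB
    }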
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_memory_pool.h b/mindspore/ccsrc/runtime/device/ascend/ascend_memory_pool.h
index dd67968db35..65162815b31 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_memory_pool.h
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_memory_pool.h
@@ -50,8 +50,8 @@ class AscendMemoryPool : public DynamicMemPoolBestFit {
  protected:
   // The real size by memory alloc aligned.
   size_t AlignMemorySize(size_t size) const override;
-  // Get the minimum memory unit size using for dynamic extend.
-  size_t mem_alloc_unit_size() const override;
+  // Calculate the allocation size required when adding a memory block.
+  size_t CalMemBlockAllocSize(size_t size) override;

  private:
  AscendMemoryPool() = default;
diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.cc b/mindspore/ccsrc/runtime/device/kernel_runtime.cc
index add0d7426d7..78dacf3f08f 100644
--- a/mindspore/ccsrc/runtime/device/kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/kernel_runtime.cc
@@ -340,7 +340,8 @@ void KernelRuntime::AssignStaticMemoryInput(const session::KernelGraph *graph) {
 #endif
     auto tensor_size = AnfAlgo::GetOutputTensorMemSize(item, index);
     device_address = CreateDeviceAddress(nullptr, tensor_size, AnfAlgo::GetOutputFormat(item, index), output_type_id);
-    MS_LOG(DEBUG) << "Malloc static memory for " << item->fullname_with_scope();
+    MS_LOG(INFO) << "Malloc Input for graph " << graph->graph_id() << ", node: " << item->fullname_with_scope()
+                 << " index: " << index << " size: " << tensor_size;
     if (mem_manager_->MallocMem(kStaticMem, tensor_size, device_address, graph->graph_id()) == nullptr) {
       MS_LOG(EXCEPTION) << "Cannot alloc address when flag is: " << kStaticMem << ", tensor size is: " << tensor_size;
     }
diff --git a/mindspore/ccsrc/runtime/device/memory_manager.cc b/mindspore/ccsrc/runtime/device/memory_manager.cc
index 587749c46c9..29919eab688 100644
--- a/mindspore/ccsrc/runtime/device/memory_manager.cc
+++ b/mindspore/ccsrc/runtime/device/memory_manager.cc
@@ -65,10 +65,12 @@ void MemoryManager::MallocSomasDynamicMem(const session::KernelGraph *graph) {
   size_t total_allocated_size = somas_reuse_util_ptr->GetTotalMemSize();
   MS_LOG(INFO) << "Graph " << graph->graph_id() << ": TotalSomasReuseDynamicSize [" << total_allocated_size << "]";
-  auto base_ptr = MallocDynamicMem(total_allocated_size, false);
-  MS_LOG(INFO) << "Somas Reuse Memory Base Address [" << static_cast<void *>(base_ptr) << "], End Address ["
-               << static_cast<void *>(base_ptr + total_allocated_size) << "]";
-  somas_reuse_util_ptr->set_mem_base_addr(base_ptr);
+  if (total_allocated_size > 0) {
+    auto base_ptr = MallocDynamicMem(total_allocated_size, false);
+    MS_LOG(INFO) << "Somas Reuse Memory Base Address [" << static_cast<void *>(base_ptr) << "], End Address ["
+                 << static_cast<void *>(base_ptr + total_allocated_size) << "]";
+    somas_reuse_util_ptr->set_mem_base_addr(base_ptr);
+  }

   auto context_ptr = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(context_ptr);
@@ -166,66 +168,7 @@ uint8_t *MemoryManager::MallocMem(MemType type, size_t size, const DeviceAddress
   return ptr;
 }

-uint8_t *MemoryManager::MallocStaticMem(size_t size, bool communication_mem, uint32_t graph_id) {
-  size_t align_size = 0;
-  if (communication_mem) {
-    align_size = GetCommunicationAlignSize(size);
-  } else {
-    align_size = GetCommonAlignSize(size);
-  }
-
-  MS_LOG(INFO) << "Malloc Memory for Static: total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_
-               << "] static[" << total_static_size_ << "])"
-               << " malloc [" << align_size << "] communication_mem: " << communication_mem;
-
-  if (static_mem_offset_ < align_size) {
-    MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_
-                      << "] static[" << total_static_size_ << "])"
-                      << " malloc [" << align_size << "] failed!";
-  }
-  total_static_size_ += align_size;
-  auto offset = static_mem_offset_ - align_size;
-  if (dynamic_mem_offset_ > offset) {
total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_ - << "] static[" << total_static_size_ << "])" - << " malloc [" << align_size << "] failed!"; - } - static_mem_offset_ = offset; - if (communication_mem) { - return device_mem_base_ + offset + kMemAlignSize; - } else { - return device_mem_base_ + offset; - } -} - -uint8_t *MemoryManager::MallocDynamicMem(size_t size, bool communication_mem) { - size_t align_size = 0; - if (communication_mem) { - align_size = GetCommunicationAlignSize(size); - } else { - align_size = GetCommonAlignSize(size); - } - - MS_LOG(INFO) << "Malloc Memory for Dynamic: total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_ - << "] static[" << total_static_size_ << "])" - << " malloc [" << align_size << "] communication_mem: " << communication_mem; - - uint64_t offset = dynamic_mem_offset_; - auto new_offset = dynamic_mem_offset_ + align_size; - if (new_offset > static_mem_offset_) { - MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_ - << "] static[" << total_static_size_ << "])" - << " malloc [" << align_size << "] failed!"; - } - total_dynamic_size_ += align_size; - dynamic_mem_offset_ = new_offset; - - if (communication_mem) { - return device_mem_base_ + offset + kMemAlignSize; - } else { - return device_mem_base_ + offset; - } -} +uint8_t *MemoryManager::MallocDynamicMem(size_t size, bool communication_mem) { return nullptr; } bool MemoryManager::MallocMemFromMemPool(const DeviceAddressPtr address, size_t size) { auto device_ptr = MallocMemFromMemPool(size); diff --git a/mindspore/ccsrc/runtime/device/memory_manager.h b/mindspore/ccsrc/runtime/device/memory_manager.h index 5dd77378762..d1e2c7a0031 100644 --- a/mindspore/ccsrc/runtime/device/memory_manager.h +++ b/mindspore/ccsrc/runtime/device/memory_manager.h @@ -64,7 +64,7 @@ class MemoryManager { size_t GetCommunicationAlignSize(size_t input_size) const; protected: - virtual uint8_t *MallocStaticMem(size_t size, bool communication_mem, uint32_t graph_id = kInvalidGraphId); + virtual uint8_t *MallocStaticMem(size_t size, bool communication_mem, uint32_t graph_id = kInvalidGraphId) = 0; virtual uint8_t *MallocDynamicMem(size_t size, bool communication_mem); uint8_t *device_mem_base_{nullptr}; uint64_t device_mem_size_{0}; diff --git a/tests/st/ops/ascend/test_tbe_ops/test_relu.py b/tests/st/ops/ascend/test_tbe_ops/test_relu.py index 77e898c08bf..b2ac007eb7f 100644 --- a/tests/st/ops/ascend/test_tbe_ops/test_relu.py +++ b/tests/st/ops/ascend/test_tbe_ops/test_relu.py @@ -20,7 +20,7 @@ from mindspore import Tensor from mindspore.common.api import ms_function from mindspore.ops import operations as P -context.set_context(device_target="Ascend") +context.set_context(device_target="Ascend", mode=context.GRAPH_MODE, variable_memory_max_size="31GB") class Net(nn.Cell): @@ -34,8 +34,12 @@ class Net(nn.Cell): def test_net(): - x = np.random.randn(2, 3, 3, 4).astype(np.float32) + # size (31GB/2/-512)s/ize(float32) 4160749440 + x = np.random.randn(16, 120, 2167057).astype(np.float32) relu = Net() output = relu(Tensor(x)) + expect = 1 * (x > 0) * x print(x) print(output.asnumpy()) + print(expect) + assert (output.asnumpy() == expect).all()