!10294 Add support for CPU memory reuse

From: @ythuang
Reviewed-by: 
Signed-off-by:
This commit is contained in:
mindspore-ci-bot 2020-12-29 11:26:58 +08:00 committed by Gitee
commit 05ec9352f3
6 changed files with 161 additions and 47 deletions

View File

@@ -81,6 +81,14 @@ GraphId CPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtr
#endif
MS_LOG(INFO) << "Build kernel";
BuildKernel(graph.get());
// Set graph execution order before memory alloc, ensure that memory alloc is according to the reorder graph
auto execution_order = graph->execution_order();
Reorder(&execution_order);
graph->set_execution_order(execution_order);
// runtime init
if (!runtime_.Init()) {
MS_LOG(EXCEPTION) << "Kernel runtime init error.";
}
MS_LOG(INFO) << "Assign kernel address";
runtime_.AssignKernelAddress(graph.get());
return graph_id;
@@ -116,11 +124,8 @@ void CPUSession::RunGraphImpl(const GraphId &graph_id, const std::vector<tensor:
#endif
MS_LOG(INFO) << "Run graph start";
auto execution_order = kernel_graph->execution_order();
Reorder(&execution_order);
bool enable_summary = summary_callback_ != nullptr;
kernel_graph->set_execution_order(execution_order);
NamedSummaryOutputs summary_outputs;
if (enable_summary) {
SetSummaryNodes(kernel_graph.get());
@@ -181,16 +186,21 @@ void CPUSession::RunOpImpl(const GraphInfo &graph_info, OpRunInfo *op_run_info,
auto kernel_graph = run_op_graphs_[graph_info];
MS_EXCEPTION_IF_NULL(kernel_graph);
// Set graph execution order before memory alloc, ensure that memory alloc is according to the reorder graph
auto execution_order = kernel_graph->execution_order();
Reorder(&execution_order);
kernel_graph->set_execution_order(execution_order);
// runtime init
if (!runtime_.Init()) {
MS_LOG(EXCEPTION) << "Kernel runtime init error.";
}
runtime_.AssignKernelAddress(kernel_graph.get());
std::map<tensor::TensorPtr, session::KernelWithIndex> tensor_to_node;
runtime_.CreateOutputTensors(kernel_graph.get(), *input_tensors, outputs, &tensor_to_node);
runtime_.BindInputOutput(kernel_graph.get(), *input_tensors, outputs);
MS_LOG(INFO) << "Run Op start";
auto execution_order = kernel_graph->execution_order();
Reorder(&execution_order);
kernel_graph->set_execution_order(execution_order);
bool ret = runtime_.Run(kernel_graph.get(), false);
if (!ret) {

View File

@@ -24,6 +24,7 @@
#include <exception>
#include "backend/kernel_compiler/kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include "runtime/device/cpu/cpu_memory_manager.h"
#include "utils/ms_context.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "backend/session/session_basic.h"
@@ -31,16 +32,47 @@
#include "utils/shape_utils.h"
#include "utils/profile.h"
#include "utils/trace_base.h"
#ifdef MEM_REUSE_DEBUG
#include "backend/optimizer/mem_reuse/mem_reuse_checker.h"
#endif
namespace mindspore {
namespace device {
namespace cpu {
// Lazily initialize the runtime: create the CPU memory manager exactly once.
// Subsequent calls are no-ops and always report success.
bool CPUKernelRuntime::Init() {
  if (!initialized_) {
    mem_manager_ = std::make_shared<CPUMemoryManager>();
    MS_EXCEPTION_IF_NULL(mem_manager_);
    initialized_ = true;
  }
  return true;
}
const size_t INIT_NODE_REF = 1;
void CPUKernelRuntime::AssignKernelAddress(session::KernelGraph *kernel_graph) {
AssignValueNodeAddress(kernel_graph);
AssignInputNodeAddress(kernel_graph);
AssignKernelOutputAddress(kernel_graph);
resource_manager_.AssignMemory(kernel_graph);
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
bool is_enable_mem_reuse = context_ptr->get_param<bool>(MS_CTX_ENABLE_MEM_REUSE);
if (context_ptr->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode) {
// disable mem reuse for kPynativeMode
is_enable_mem_reuse = false;
}
if (is_enable_mem_reuse) {
MS_EXCEPTION_IF_NULL(mem_manager_);
mem_manager_->ResetDynamicMemory();
AssignDynamicMemory(kernel_graph);
#ifdef MEM_REUSE_DEBUG
// Get normal graph ir for memreuse
mindspore::memreuse::MemReuseChecker::GetInstance().CheckNormalIR(kernel_graph);
#endif
} else {
AssignKernelOutputAddress(kernel_graph);
static_cast<CPUMemoryManager *>(mem_manager_.get())->AssignMemory(kernel_graph);
}
}
void CPUKernelRuntime::AssignValueNodeAddress(session::KernelGraph *kernel_graph) {
@ -75,7 +107,7 @@ void CPUKernelRuntime::AssignValueNodeAddress(session::KernelGraph *kernel_graph
if (tensor->data_type() == output_type_id) {
address->ptr_ = tensor->data_c();
} else {
address->ptr_ = resource_manager_.MemMalloc(tensor_size);
address->ptr_ = static_cast<CPUMemoryManager *>(mem_manager_.get())->StaticMemMalloc(tensor_size);
if (!address->SyncHostToDevice(data_shape, LongToSize(tensor->data().nbytes()), tensor->data_type(),
tensor->data_c())) {
MS_LOG(EXCEPTION) << "Value node sync host to device failed!";
@ -169,7 +201,7 @@ tensor::TensorPtr CPUKernelRuntime::CreatTensorForOutput(
size_t type_size = GetTypeByte(TypeIdToType(device_type_id));
ShapeVector data_shape = tensor->shape();
size_t tensor_size = std::accumulate(data_shape.begin(), data_shape.end(), type_size, std::multiplies<size_t>());
address->ptr_ = resource_manager_.MemMalloc(tensor_size);
address->ptr_ = static_cast<CPUMemoryManager *>(mem_manager_.get())->StaticMemMalloc(tensor_size);
tensor->set_sync_status(kNeedSyncDeviceToHostImmediately);
} else {
tensor->set_sync_status(kNoNeedSync);
@ -269,7 +301,7 @@ void CPUKernelRuntime::BindInputTensorAddressPtr(const session::KernelGraph &ker
ShapeVector data_shape = tensor->shape();
size_t tensor_size = std::accumulate(data_shape.begin(), data_shape.end(),
GetTypeByte(TypeIdToType(address->type_id_)), std::multiplies<size_t>());
address->ptr_ = resource_manager_.MemMalloc(tensor_size);
address->ptr_ = static_cast<CPUMemoryManager *>(mem_manager_.get())->StaticMemMalloc(tensor_size);
if (!address->SyncHostToDevice(data_shape, LongToSize(tensor->data().nbytes()), tensor->data_type(),
tensor->data_c())) {
MS_LOG(EXCEPTION) << "Parameter node sync host to device failed!";
@ -323,7 +355,7 @@ void CPUKernelRuntime::AddRuntimeAddress(DeviceAddress *address, std::vector<ker
kernel::AddressPtr input = std::make_shared<kernel::Address>();
MS_EXCEPTION_IF_NULL(input);
if (address->ptr_ == nullptr) {
address->ptr_ = resource_manager_.MemMalloc(address->size_);
address->ptr_ = static_cast<CPUMemoryManager *>(mem_manager_.get())->StaticMemMalloc(address->size_);
}
MS_EXCEPTION_IF_NULL(address->ptr_);
input->addr = address->ptr_;
@ -332,16 +364,16 @@ void CPUKernelRuntime::AddRuntimeAddress(DeviceAddress *address, std::vector<ker
}
void CPUKernelRuntime::IncreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs) {
resource_manager_.IncreaseSummaryRefCount(summary_outputs);
static_cast<CPUMemoryManager *>(mem_manager_.get())->IncreaseSummaryRefCount(summary_outputs);
}
void CPUKernelRuntime::DecreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs) {
resource_manager_.DecreaseSummaryRefCount(summary_outputs);
static_cast<CPUMemoryManager *>(mem_manager_.get())->DecreaseSummaryRefCount(summary_outputs);
}
bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, bool is_task_sink) {
MS_EXCEPTION_IF_NULL(kernel_graph);
resource_manager_.IncreaseAddressRefCount(kernel_graph);
static_cast<CPUMemoryManager *>(mem_manager_.get())->IncreaseAddressRefCount(kernel_graph);
auto kernels = kernel_graph->execution_order();
for (const auto &kernel : kernels) {
@ -382,7 +414,7 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, bool is_task_sink
if (!ret) {
MS_LOG(EXCEPTION) << "Launch kernel failed. Trace:" << trace::DumpSourceLines(kernel);
}
resource_manager_.DecreaseAddressRefCount(kernel);
static_cast<CPUMemoryManager *>(mem_manager_.get())->DecreaseAddressRefCount(kernel);
#ifdef ENABLE_PROFILE
double cost_time = GetTime() - start_time;
MS_LOG(INFO) << "cpu kernel: " << kernel->fullname_with_scope() << " costs " << cost_time * 1e6 << " us";

View File

@ -24,7 +24,6 @@
#include "runtime/device/kernel_runtime.h"
#include "backend/session/kernel_graph.h"
#include "backend/session/session_basic.h"
#include "runtime/device/cpu/cpu_resource_manager.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "utils/any.h"
namespace mindspore {
@ -35,7 +34,7 @@ class CPUKernelRuntime : public KernelRuntime {
CPUKernelRuntime() = default;
~CPUKernelRuntime() override = default;
bool Init() override { return true; }
bool Init();
bool Run(session::KernelGraph *graph, bool is_task_sink) override;
void AssignKernelAddress(session::KernelGraph *kernel_graph);
void CreateOutputTensors(session::KernelGraph *kernel_graph, const std::vector<tensor::TensorPtr> &inputs,
@ -63,9 +62,9 @@ class CPUKernelRuntime : public KernelRuntime {
void AssignInputNodeAddress(const session::KernelGraph *kernel_graph);
void AssignKernelOutputAddress(const session::KernelGraph *kernel_graph);
void AddRuntimeAddress(DeviceAddress *address, std::vector<kernel::AddressPtr> *input_list);
CPUResourceManager resource_manager_;
std::set<DeviceAddressPtr> bound_addresses_;
std::map<AnfNodePtr, tensor::TensorPtr> input_param_tensor_map_;
bool initialized_{false};
};
} // namespace cpu
} // namespace device

View File

@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -13,28 +13,90 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "runtime/device/cpu/cpu_resource_manager.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "runtime/device/cpu/cpu_memory_manager.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "utils/ms_context.h"
#include "utils/convert_utils.h"
namespace mindspore {
namespace device {
namespace cpu {
CPUResourceManager::~CPUResourceManager() { MemFree(); }
void CPUResourceManager::MemFree() {
// Allocate a zero-filled static (graph-lifetime) block of `size` bytes and
// record it in static_mem_ so MemFree() can release it later.
// Throws via MS_LOG(EXCEPTION) on allocation or zero-fill failure.
uint8_t *CPUMemoryManager::MallocStaticMem(size_t size, bool) {
  void *ptr = malloc(size);
  if (ptr == nullptr) {
    MS_LOG(EXCEPTION) << "Malloc memory failed: size " << size;
  }
  // Check the secure memset result: a failure would hand kernels
  // uninitialized memory, so treat it as fatal (and avoid leaking `ptr`).
  if (memset_s(ptr, size, 0, size) != 0) {
    free(ptr);
    MS_LOG(EXCEPTION) << "Memset memory failed: size " << size;
  }
  static_mem_[ptr] = size;
  return reinterpret_cast<uint8_t *>(ptr);
}
// Allocate a zero-filled dynamic block of `size` bytes, preferring to reuse a
// cached block (best fit: the smallest cached block >= size). Falls back to a
// fresh malloc when no cached block fits. The chosen block is moved from
// cached_mem_ to dynamic_mem_ under its real capacity so ResetDynamicMemory()
// caches it correctly again. Throws via MS_LOG(EXCEPTION) on failure.
uint8_t *CPUMemoryManager::MallocDynamicMem(size_t size, bool) {
  // Best-fit search over previously used (cached) blocks.
  void *ptr = nullptr;
  size_t picked_size = 0;
  for (auto &&iter : cached_mem_) {
    if (iter.second >= size && (ptr == nullptr || iter.second < picked_size)) {
      ptr = iter.first;
      picked_size = iter.second;
      if (picked_size == size) {
        break;  // exact fit: no smaller candidate can exist
      }
    }
  }
  if (ptr != nullptr) {
    // Zero only the requested bytes; record the block's real capacity.
    if (memset_s(ptr, size, 0, size) != 0) {
      MS_LOG(EXCEPTION) << "Memset memory failed: size " << size;
    }
    dynamic_mem_[ptr] = picked_size;
    (void)cached_mem_.erase(ptr);
    return reinterpret_cast<uint8_t *>(ptr);
  }
  // No cached block fits; allocate a fresh one.
  ptr = malloc(size);
  if (ptr == nullptr) {
    MS_LOG(EXCEPTION) << "Malloc memory failed: size " << size;
  }
  if (memset_s(ptr, size, 0, size) != 0) {
    free(ptr);
    MS_LOG(EXCEPTION) << "Memset memory failed: size " << size;
  }
  dynamic_mem_[ptr] = size;
  return reinterpret_cast<uint8_t *>(ptr);
}
// Return all in-use dynamic blocks to the cache without freeing them, so a
// later graph execution can reuse the same allocations.
void CPUMemoryManager::ResetDynamicMemory() {
  // Intentionally no free(): blocks survive across graphs for reuse.
  for (const auto &block : dynamic_mem_) {
    cached_mem_[block.first] = block.second;
  }
  dynamic_mem_.clear();
}
// The destructor releases everything the manager still owns.
CPUMemoryManager::~CPUMemoryManager() { MemFree(); }

// Free the planned arena (mem_ptr_) and every block tracked in the static,
// dynamic, and cached pools, then clear the bookkeeping maps.
void CPUMemoryManager::MemFree() {
  if (mem_ptr_ != nullptr) {
    free(mem_ptr_);
    mem_ptr_ = nullptr;
    mem_size_ = 0;
  }
  auto release_pool = [](std::map<void *, size_t> *pool) {
    for (const auto &entry : *pool) {
      free(entry.first);
    }
    pool->clear();
  };
  release_pool(&static_mem_);
  release_pool(&dynamic_mem_);
  release_pool(&cached_mem_);
}
void CPUResourceManager::AssignMemory(const session::KernelGraph *graph) {
void CPUMemoryManager::AssignMemory(const session::KernelGraph *graph) {
size_t graph_mem_size = mem_plan_.MemPlan(graph);
if (graph_mem_size > mem_size_) {
if (mem_size_ > 0) {
@ -43,6 +105,7 @@ void CPUResourceManager::AssignMemory(const session::KernelGraph *graph) {
}
mem_ptr_ = reinterpret_cast<uint8_t *>(malloc(graph_mem_size));
if (mem_ptr_ != nullptr) {
MS_LOG(INFO) << "Simple MemPlan GraphMemSize [" << graph_mem_size << "]";
mem_size_ = graph_mem_size;
dynamic_malloc_ = false;
} else {
@ -56,26 +119,26 @@ void CPUResourceManager::AssignMemory(const session::KernelGraph *graph) {
mem_plan_.MemAssign(graph, mem_ptr_);
}
void *CPUResourceManager::MemMalloc(size_t mem_size) {
void *CPUMemoryManager::StaticMemMalloc(size_t mem_size) {
void *ptr = malloc(mem_size);
if (ptr != nullptr) {
memset_s(ptr, mem_size, 0, mem_size);
dynamic_mem_[ptr] = mem_size;
static_mem_[ptr] = mem_size;
return ptr;
} else {
MS_LOG(EXCEPTION) << "Malloc memory failed: size " << mem_size;
}
}
void CPUResourceManager::MemFree(void *ptr) {
auto iter = dynamic_mem_.find(ptr);
if (iter != dynamic_mem_.end()) {
(void)dynamic_mem_.erase(iter);
void CPUMemoryManager::MemFree(void *ptr) {
auto iter = static_mem_.find(ptr);
if (iter != static_mem_.end()) {
(void)static_mem_.erase(iter);
free(ptr);
}
}
void CPUResourceManager::IncreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs) {
void CPUMemoryManager::IncreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs) {
if (!dynamic_malloc_) {
return;
}
@ -93,7 +156,7 @@ void CPUResourceManager::IncreaseSummaryRefCount(const session::NamedSummaryOutp
}
}
void CPUResourceManager::DecreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs) {
void CPUMemoryManager::DecreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs) {
if (!dynamic_malloc_) {
return;
}
@ -115,7 +178,7 @@ void CPUResourceManager::DecreaseSummaryRefCount(const session::NamedSummaryOutp
}
}
void CPUResourceManager::IncreaseAddressRefCount(const session::KernelGraph *graph) {
void CPUMemoryManager::IncreaseAddressRefCount(const session::KernelGraph *graph) {
if (!dynamic_malloc_) {
return;
}
@ -140,7 +203,7 @@ void CPUResourceManager::IncreaseAddressRefCount(const session::KernelGraph *gra
}
}
void CPUResourceManager::DecreaseAddressRefCount(const AnfNodePtr &kernel) {
void CPUMemoryManager::DecreaseAddressRefCount(const AnfNodePtr &kernel) {
if (!dynamic_malloc_) {
return;
}

View File

@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -13,31 +13,40 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_CPU_CPU_RESOURCE_MANAGER_H_
#define MINDSPORE_CCSRC_RUNTIME_DEVICE_CPU_CPU_RESOURCE_MANAGER_H_
#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_CPU_CPU_MEMORY_MANAGER_H_
#define MINDSPORE_CCSRC_RUNTIME_DEVICE_CPU_CPU_MEMORY_MANAGER_H_
#include <vector>
#include <map>
#include "backend/session/kernel_graph.h"
#include "backend/session/session_basic.h"
#include "runtime/device/device_address.h"
#include "runtime/device/memory_manager.h"
#include "runtime/device/cpu/cpu_simple_mem_plan.h"
namespace mindspore {
namespace device {
namespace cpu {
class CPUResourceManager {
class CPUMemoryManager : public MemoryManager {
public:
CPUResourceManager() = default;
~CPUResourceManager();
CPUMemoryManager() = default;
virtual ~CPUMemoryManager();
void MallocDeviceMemory() override {}
void FreeDeviceMemory() override {}
void ResetDynamicMemory() override;
void AssignMemory(const session::KernelGraph *graph);
void IncreaseAddressRefCount(const session::KernelGraph *graph);
void DecreaseAddressRefCount(const AnfNodePtr &kernel);
void *MemMalloc(size_t mem_size);
void *StaticMemMalloc(size_t mem_size);
void MemFree(void *ptr);
void IncreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs);
void DecreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs);
protected:
uint8_t *MallocStaticMem(size_t size, bool communication_mem) override;
uint8_t *MallocDynamicMem(size_t size, bool communication_mem) override;
private:
void MemFree();
CPUSimpleMemPlan mem_plan_;
@ -46,9 +55,10 @@ class CPUResourceManager {
uint8_t *mem_ptr_{nullptr};
bool dynamic_malloc_{false};
std::map<void *, size_t> dynamic_mem_;
std::map<void *, size_t> static_mem_;
std::map<void *, size_t> cached_mem_;
};
} // namespace cpu
} // namespace device
} // namespace mindspore
#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_CPU_CPU_RESOURCE_MANAGER_H_
#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_CPU_CPU_MEMORY_MANAGER_H_

View File

@ -28,7 +28,7 @@ namespace mindspore {
namespace device {
namespace cpu {
class CPUSimpleMemPlan;
class CPUResourceManager;
class CPUMemoryManager;
class CPUKernelRuntime;
} // namespace cpu
namespace ascend {
@ -93,7 +93,7 @@ class DeviceAddress : public mindspore::DeviceSync {
friend class MemoryManager;
friend class mindspore::device::ascend::tasksink::TaskGenerator;
friend class mindspore::device::cpu::CPUSimpleMemPlan;
friend class mindspore::device::cpu::CPUResourceManager;
friend class mindspore::device::cpu::CPUMemoryManager;
friend class mindspore::device::cpu::CPUKernelRuntime;
friend class mindspore::device::gpu::GPUKernelRuntime;
friend class mindspore::device::gpu::GPUMemoryManager;